Mercurial > core / lisp/lib/nlp/tokenize.lisp
changeset 698: 96958d3eb5b0
parent: 7120877e0453
author: Richard Westhaver <ellis@rwest.io>
date: Fri, 04 Oct 2024 22:04:59 -0400
permissions: -rw-r--r--
description: fixes
(defpackage :nlp/tokenize
  (:use :cl :std :cl-ppcre :nlp/data :nlp/stem/porter)
  (:export :word-tokenize :sentence-tokenize))

(in-package :nlp/tokenize)

;; NOTE(review): the source for this file was garbled in extraction; the
;; fall-through arms of the LET* bindings below were reconstructed from the
;; visible (if FLAG transformed ...) pattern — confirm against upstream.
(defun word-tokenize (string &key (remove-stop-words t) (stem nil) (down-case t) (alphabetic t))
  "Split STRING into a list of word tokens.

Tokens are produced by splitting on single spaces after whitespace
collapsing, then optionally filtered/transformed:
  REMOVE-STOP-WORDS -- drop tokens found (case-insensitively) in the
                       stop-word table of *LANGUAGE-DATA*.
  STEM              -- map each token through STEM (Porter stemmer).
  DOWN-CASE         -- lowercase every token.
  ALPHABETIC        -- keep only tokens matching ^[A-Za-z]*$.
Returns the resulting list of strings."
  (let* ((tokens (split " " (collapse-whitespaces string)))
         ;; Stop-word lookup is case-insensitive: keys are stored downcased.
         (tokens (if remove-stop-words
                     (delete-if (lambda (x)
                                  (gethash (string-downcase x)
                                           (stop-words-lookup *language-data*)))
                                tokens)
                     tokens))
         ;; STEM the keyword arg shadows only the variable namespace;
         ;; #'stem still names the stemmer function (CL is a Lisp-2).
         (tokens (if stem
                     (mapcar #'stem tokens)
                     tokens))
         (tokens (if down-case
                     (mapcar #'string-downcase tokens)
                     tokens))
         ;; NOTE(review): the * quantifier also admits the empty string;
         ;; presumably intentional since splitting collapsed whitespace
         ;; rarely yields empty tokens — confirm before changing to +.
         (tokens (if alphabetic
                     (delete-if-not (lambda (x) (cl-ppcre:scan "^[A-Za-z]*$" x))
                                    tokens)
                     tokens)))
    tokens))

(defun sentence-tokenize (string)
  "Split STRING into a list of sentences.

Sentences are delimited by '.', '!' or '?'; each piece is trimmed and
empty results are dropped, so trailing punctuation yields no empty
sentence. Delimiters themselves are not kept."
  (remove "" (mapcar #'std:trim (cl-ppcre:split "[.!?]" string)) :test #'equal))