53
|
1
|
(defpackage :nlp/tokenize |
96
|
2
|
(:use :cl :std :cl-ppcre :nlp/data :nlp/stem/porter) |
53
|
3
|
(:export :word-tokenize :sentence-tokenize)) |
|
4
|
|
|
5
|
(in-package :nlp/tokenize) |
|
6
|
|
|
7
|
(defun word-tokenize (string &key (remove-stop-words t) (stem nil) (down-case t) (alphabetic t))
  "Split STRING into a list of word tokens.

STRING is passed through COLLAPSE-WHITESPACES and split on single spaces;
empty tokens are discarded.  The remaining tokens then go through the
following optional steps, in this order:

  REMOVE-STOP-WORDS (default T)   drop tokens whose down-cased form is a key
                                  of (STOP-WORDS-LOOKUP *LANGUAGE-DATA*).
  STEM              (default NIL) replace each token with (STEM token).
  DOWN-CASE         (default T)   down-case each token.
  ALPHABETIC        (default T)   keep only tokens that consist of one or
                                  more ASCII letters and nothing else.

Returns a list of strings."
  (let* ((tokens (split " " (collapse-whitespaces string)))
         ;; Splitting on a space yields an empty first token when the
         ;; collapsed string starts with a space.  An empty string is never
         ;; a word, so drop such tokens regardless of the options below.
         (tokens (remove-if (lambda (token) (zerop (length token))) tokens))
         ;; The stop-word lookup is done on the down-cased token so that the
         ;; match does not depend on the token's original case.
         (tokens (if remove-stop-words
                     (let ((stop-words (stop-words-lookup *language-data*)))
                       (remove-if (lambda (token)
                                    (gethash (string-downcase token) stop-words))
                                  tokens))
                     tokens))
         ;; NOTE(review): stemming runs before down-casing, so STEM sees the
         ;; original case -- confirm the stemmer is case-insensitive.
         (tokens (if stem
                     (mapcar #'stem tokens)
                     tokens))
         (tokens (if down-case
                     (mapcar #'string-downcase tokens)
                     tokens))
         ;; `+' rather than `*': the pattern must not accept the empty string.
         (tokens (if alphabetic
                     (remove-if-not (lambda (token)
                                      (cl-ppcre:scan "^[A-Za-z]+$" token))
                                    tokens)
                     tokens)))
    tokens))
|
23
|
|
|
24
|
(defun sentence-tokenize (string)
  "Return the sentences of STRING as a list of strings.

STRING is cut at every period, exclamation mark and question mark (the
delimiters themselves are dropped).  Each piece is passed through STD:TRIM,
and pieces that come back as the empty string are left out."
  (loop for fragment in (cl-ppcre:split "[.!?]" string)
        for sentence = (std:trim fragment)
        unless (equal "" sentence)
          collect sentence))