changelog shortlog graph tags branches changeset files file revisions raw help

Mercurial > core / annotate lisp/lib/nlp/tokenize.lisp

changeset 698: 96958d3eb5b0
parent: 7120877e0453
author: Richard Westhaver <ellis@rwest.io>
date: Fri, 04 Oct 2024 22:04:59 -0400
permissions: -rw-r--r--
description: fixes
53
daad2b8bb63f init nlp
ellis <ellis@rwest.io>
parents:
diff changeset
1
 (defpackage :nlp/tokenize
96
301fd45bbe73 big refactor of lisp code
ellis <ellis@rwest.io>
parents: 54
diff changeset
2
   (:use :cl :std :cl-ppcre :nlp/data :nlp/stem/porter)
53
daad2b8bb63f init nlp
ellis <ellis@rwest.io>
parents:
diff changeset
3
   (:export :word-tokenize :sentence-tokenize))
daad2b8bb63f init nlp
ellis <ellis@rwest.io>
parents:
diff changeset
4
 
daad2b8bb63f init nlp
ellis <ellis@rwest.io>
parents:
diff changeset
5
 (in-package :nlp/tokenize)
daad2b8bb63f init nlp
ellis <ellis@rwest.io>
parents:
diff changeset
6
 
daad2b8bb63f init nlp
ellis <ellis@rwest.io>
parents:
diff changeset
7
 (defun word-tokenize (string &key (remove-stop-words t) (stem nil) (down-case t) (alphabetic t))
daad2b8bb63f init nlp
ellis <ellis@rwest.io>
parents:
diff changeset
8
   "Split a string into a list of words."
54
83f6c62bf2a8 cleanup
ellis <ellis@rwest.io>
parents: 53
diff changeset
9
   (let* ((tokens (split " " (collapse-whitespaces string)))
53
daad2b8bb63f init nlp
ellis <ellis@rwest.io>
parents:
diff changeset
10
          (tokens (if remove-stop-words
daad2b8bb63f init nlp
ellis <ellis@rwest.io>
parents:
diff changeset
11
                      (delete-if (lambda (x) (gethash (string-downcase  x) (stop-words-lookup *language-data*))) tokens)
daad2b8bb63f init nlp
ellis <ellis@rwest.io>
parents:
diff changeset
12
                      tokens))
daad2b8bb63f init nlp
ellis <ellis@rwest.io>
parents:
diff changeset
13
          (tokens (if stem
daad2b8bb63f init nlp
ellis <ellis@rwest.io>
parents:
diff changeset
14
                      (mapcar #'stem tokens)
daad2b8bb63f init nlp
ellis <ellis@rwest.io>
parents:
diff changeset
15
                      tokens))
daad2b8bb63f init nlp
ellis <ellis@rwest.io>
parents:
diff changeset
16
          (tokens (if down-case
daad2b8bb63f init nlp
ellis <ellis@rwest.io>
parents:
diff changeset
17
                      (mapcar #'string-downcase tokens)
daad2b8bb63f init nlp
ellis <ellis@rwest.io>
parents:
diff changeset
18
                      tokens))
daad2b8bb63f init nlp
ellis <ellis@rwest.io>
parents:
diff changeset
19
          (tokens (if alphabetic
54
83f6c62bf2a8 cleanup
ellis <ellis@rwest.io>
parents: 53
diff changeset
20
                      (delete-if-not (lambda (x) (cl-ppcre:scan "^[A-Za-z]*$" x)) tokens)
53
daad2b8bb63f init nlp
ellis <ellis@rwest.io>
parents:
diff changeset
21
                      tokens)))
daad2b8bb63f init nlp
ellis <ellis@rwest.io>
parents:
diff changeset
22
     tokens))
daad2b8bb63f init nlp
ellis <ellis@rwest.io>
parents:
diff changeset
23
 
daad2b8bb63f init nlp
ellis <ellis@rwest.io>
parents:
diff changeset
24
 (defun sentence-tokenize (string)
daad2b8bb63f init nlp
ellis <ellis@rwest.io>
parents:
diff changeset
25
   "Split a string into a list of sentences."
96
301fd45bbe73 big refactor of lisp code
ellis <ellis@rwest.io>
parents: 54
diff changeset
26
   (remove "" (mapcar #'std:trim (cl-ppcre:split "[.!?]" string)) :test #'equal))