changelog shortlog graph tags branches changeset files revisions annotate raw help

Mercurial > core / lisp/lib/nlp/tokenize.lisp

changeset 698: 96958d3eb5b0
parent: 7120877e0453
author: Richard Westhaver <ellis@rwest.io>
date: Fri, 04 Oct 2024 22:04:59 -0400
permissions: -rw-r--r--
description: fixes
1 (defpackage :nlp/tokenize
2  (:use :cl :std :cl-ppcre :nlp/data :nlp/stem/porter)
3  (:export :word-tokenize :sentence-tokenize))
4 
5 (in-package :nlp/tokenize)
6 
(defun word-tokenize (string &key (remove-stop-words t) (stem nil) (down-case t) (alphabetic t))
  "Split STRING into a list of word tokens.

STRING is passed through COLLAPSE-WHITESPACES and then split on single
spaces.  The resulting tokens go through these optional stages, in order:

  REMOVE-STOP-WORDS  drop tokens whose downcased form is present in the
                     table returned by (STOP-WORDS-LOOKUP *LANGUAGE-DATA*).
                     Default T.
  STEM               replace each token with the result of calling STEM on
                     it.  Default NIL.  Note this runs before DOWN-CASE.
  DOWN-CASE          replace each token with its STRING-DOWNCASE.  Default T.
  ALPHABETIC         keep only tokens matching ^[A-Za-z]+$, i.e. at least
                     one ASCII letter and nothing else.  Default T.

Returns the list of tokens that survive all enabled stages."
  (let* ((tokens (split " " (collapse-whitespaces string)))
         (tokens (if remove-stop-words
                     ;; The lookup is case-insensitive: tokens are downcased
                     ;; only for the membership test, not modified here.
                     (delete-if (lambda (x)
                                  (gethash (string-downcase x)
                                           (stop-words-lookup *language-data*)))
                                tokens)
                     tokens))
         (tokens (if stem
                     (mapcar #'stem tokens)
                     tokens))
         (tokens (if down-case
                     (mapcar #'string-downcase tokens)
                     tokens))
         (tokens (if alphabetic
                     ;; `+' rather than `*': SPLIT can yield an empty leading
                     ;; token (e.g. when the collapsed string starts with a
                     ;; space), and an empty string must not count as a word.
                     (delete-if-not (lambda (x) (cl-ppcre:scan "^[A-Za-z]+$" x))
                                    tokens)
                     tokens)))
    tokens))
23 
(defun sentence-tokenize (string)
  "Break STRING into a list of sentences.

STRING is split at every `.', `!' or `?' character (the delimiters themselves
are not part of the result), each piece is passed through STD:TRIM, and any
piece that ends up as the empty string is discarded."
  (let* ((fragments (cl-ppcre:split "[.!?]" string))
         (trimmed (mapcar #'std:trim fragments)))
    ;; Drop the empty leftovers, e.g. from consecutive delimiters.
    (remove-if (lambda (sentence) (equal "" sentence)) trimmed)))