core:lisp/lib/nlp/tokenize.lisp

changelog shortlog graph tags branches changeset files revisions annotate raw help

Mercurial > core / lisp/lib/nlp/tokenize.lisp

changeset 698:	96958d3eb5b0
parent:	7120877e0453
author:	Richard Westhaver <ellis@rwest.io>
date:	Fri, 04 Oct 2024 22:04:59 -0400
permissions:	-rw-r--r--
description:	fixes

     1 (defpackage :nlp/tokenize
     2   (:use :cl :std :cl-ppcre :nlp/data :nlp/stem/porter)
     3   (:export :word-tokenize :sentence-tokenize))
     4 
     5 (in-package :nlp/tokenize)
     6 
     (defun word-tokenize (string &key (remove-stop-words t) (stem nil) (down-case t) (alphabetic t))
       "Split STRING into a list of word tokens.

     STRING is passed through COLLAPSE-WHITESPACES and then split on single
     spaces.  The keyword arguments enable optional passes, applied in this
     order:
       REMOVE-STOP-WORDS - drop every token whose down-cased form is a key in
                           the table returned by STOP-WORDS-LOOKUP for
                           *LANGUAGE-DATA*.
       STEM              - replace every token with the result of calling STEM
                           on it.
       DOWN-CASE         - down-case every token.
       ALPHABETIC        - keep only non-empty tokens that consist solely of
                           the ASCII letters A-Z and a-z.
     Returns the resulting list of strings."
       (let ((tokens (split " " (collapse-whitespaces string))))
         ;; The token list is freshly consed by SPLIT (and later by MAPCAR), so
         ;; the destructive DELETE-IF / DELETE-IF-NOT calls below cannot damage
         ;; caller-owned data.
         (when (and remove-stop-words tokens)
           ;; Fetch the stop-word table once instead of once per token.
           ;; NOTE(review): assumes STOP-WORDS-LOOKUP returns the same table on
           ;; every call for a given *LANGUAGE-DATA* -- confirm.
           (let ((stop-words (stop-words-lookup *language-data*)))
             (setf tokens
                   (delete-if (lambda (token)
                                (gethash (string-downcase token) stop-words))
                              tokens))))
         (when stem
           (setf tokens (mapcar #'stem tokens)))
         (when down-case
           (setf tokens (mapcar #'string-downcase tokens)))
         (when alphabetic
           ;; `+' rather than `*': the empty string is not an alphabetic word.
           ;; SPLIT keeps a leading empty token when the string it is given
           ;; starts with the separator, and `*' used to let that token through.
           (setf tokens
                 (delete-if-not (lambda (token)
                                  (cl-ppcre:scan "^[A-Za-z]+$" token))
                                tokens)))
         tokens))
    23 
    (defun sentence-tokenize (string)
      "Break STRING at every `.', `!' or `?' and return the pieces as a list.
    Each piece is passed through STD:TRIM; pieces that come back as the empty
    string are left out of the result."
      (loop for fragment in (cl-ppcre:split "[.!?]" string)
            for sentence = (std:trim fragment)
            unless (equal "" sentence)
              collect sentence))