core: lisp/lib/nlp/tokenize.lisp annotate

changelog shortlog graph tags branches changeset files file revisions raw help

Mercurial > core / annotate lisp/lib/nlp/tokenize.lisp

changeset 698:	96958d3eb5b0
parent:	7120877e0453
author:	Richard Westhaver <ellis@rwest.io>
date:	Fri, 04 Oct 2024 22:04:59 -0400
permissions:	-rw-r--r--
description:	fixes

53 daad2b8bb63f init nlp ellis <ellis@rwest.io> parents: diff changeset	1	(defpackage :nlp/tokenize
96 301fd45bbe73 big refactor of lisp code ellis <ellis@rwest.io> parents: 54 diff changeset	2	(:use :cl :std :cl-ppcre :nlp/data :nlp/stem/porter)
53 daad2b8bb63f init nlp ellis <ellis@rwest.io> parents: diff changeset	3	(:export :word-tokenize :sentence-tokenize))
daad2b8bb63f init nlp ellis <ellis@rwest.io> parents: diff changeset	4
daad2b8bb63f init nlp ellis <ellis@rwest.io> parents: diff changeset	5	(in-package :nlp/tokenize)
daad2b8bb63f init nlp ellis <ellis@rwest.io> parents: diff changeset	6
(defun word-tokenize (string &key (remove-stop-words t) (stem nil) (down-case t) (alphabetic t))
  "Split STRING into a list of word tokens.

STRING is passed through COLLAPSE-WHITESPACES and then split on single
spaces.  The keyword arguments switch on optional passes, applied in
this order:

  REMOVE-STOP-WORDS  drop every token whose down-cased form is a key in
                     (stop-words-lookup language-data).
  STEM               replace every token with the result of calling the
                     function STEM on it.
  DOWN-CASE          down-case every token.
  ALPHABETIC         keep only non-empty tokens consisting solely of the
                     ASCII letters A-Z and a-z.

Returns the resulting list of strings."
  ;; NOTE(review): LANGUAGE-DATA is not bound anywhere in this function;
  ;; presumably it is a global supplied by :nlp/data -- confirm.
  (let* ((tokens (split " " (collapse-whitespaces string)))
         (tokens (if remove-stop-words
                     ;; Stop words are looked up by their down-cased form,
                     ;; whatever the case of the token itself.
                     (delete-if (lambda (x)
                                  (gethash (string-downcase x)
                                           (stop-words-lookup language-data)))
                                tokens)
                     tokens))
         ;; The keyword parameter STEM lives in the variable namespace, so
         ;; #'STEM below still names the stemming function.
         (tokens (if stem
                     (mapcar #'stem tokens)
                     tokens))
         (tokens (if down-case
                     (mapcar #'string-downcase tokens)
                     tokens))
         (tokens (if alphabetic
                     ;; `+' instead of `*': the old pattern also matched the
                     ;; empty string, so empty tokens (e.g. the one produced
                     ;; by a leading space) slipped through this filter.
                     (delete-if-not (lambda (x) (cl-ppcre:scan "^[A-Za-z]+$" x)) tokens)
                     tokens)))
    tokens))
daad2b8bb63f init nlp ellis <ellis@rwest.io> parents: diff changeset	23
(defun sentence-tokenize (string)
  "Return the sentences found in STRING as a list of strings.

STRING is cut at every `.', `!' or `?'.  Each piece is passed through
STD:TRIM, and pieces that are empty afterwards are left out."
  (loop for fragment in (cl-ppcre:split "[.!?]" string)
        for sentence = (std:trim fragment)
        unless (equal "" sentence)
          collect sentence))