changelog shortlog graph tags branches files raw help

Mercurial > core / changeset: nlp fixes

changeset 380: 16bb4464adcb
parent 379: 45889d307d7f
child 381: 386d51cf61ca
author: Richard Westhaver <ellis@rwest.io>
date: Tue, 28 May 2024 17:55:30 -0400
files: lisp/lib/nlp/dbscan.lisp lisp/lib/nlp/doc.lisp lisp/lib/nlp/nlp.asd lisp/lib/nlp/readme.org lisp/lib/nlp/tests.lisp lisp/lib/nlp/textrank.lisp lisp/std/seq.lisp
description: nlp fixes
     1.1--- a/lisp/lib/nlp/dbscan.lisp	Tue May 28 16:44:54 2024 -0400
     1.2+++ b/lisp/lib/nlp/dbscan.lisp	Tue May 28 17:55:30 2024 -0400
     1.3@@ -2,10 +2,11 @@
     1.4 
     1.5 ;;; Code:
     1.6 (defpackage :nlp/dbscan
     1.7-  (:use :cl :std :nlp/doc)
     1.8+  (:use :cl :std :nlp/doc :nlp/textrank :nlp/tokenize)
     1.9   (:export 
    1.10    :document-cluster :clusters :get-cluster :distance
    1.11-   :generate-document-distance-vectors 
    1.12+   :generate-document-distance-vectors
    1.13+   :cluster :neighbors :clusters
    1.14    :dbscan))
    1.15 
    1.16 (in-package :nlp/dbscan)
     2.1--- a/lisp/lib/nlp/doc.lisp	Tue May 28 16:44:54 2024 -0400
     2.2+++ b/lisp/lib/nlp/doc.lisp	Tue May 28 17:55:30 2024 -0400
     2.3@@ -9,6 +9,15 @@
     2.4    :add-document
     2.5    :document-collection
     2.6    :keywords
     2.7+   :dictionary
     2.8+   :term-count
     2.9+   :document-frequency
    2.10+   :inverse-document-frequency
    2.11+   :tf-idf-vectorize-documents
    2.12+   :termp
    2.13+   :string-contents
    2.14+   :rank
    2.15+   :term-frequency
    2.16    :extract-keywords
    2.17    :tf-vectorize-documents
    2.18    :vector-data))
     3.1--- a/lisp/lib/nlp/nlp.asd	Tue May 28 16:44:54 2024 -0400
     3.2+++ b/lisp/lib/nlp/nlp.asd	Tue May 28 17:55:30 2024 -0400
     3.3@@ -6,5 +6,20 @@
     3.4   :class :package-inferred-system
     3.5   :defsystem-depends-on (:asdf-package-system)
     3.6   :depends-on (:std :rdb :cl-ppcre :parse :nlp/pkg)
     3.7-  :in-order-to ((test-op (test-op :nlp/tests)))
     3.8-  :perform (test-op (op c) (uiop:symbol-call '#:rt '#:do-tests :nlp)))
     3.9+  :components ((:file "pkg")
    3.10+               (:file "data")
    3.11+               (:file "tokenize")
    3.12+               (:file "doc")
    3.13+               (:module "stem"
    3.14+                :components
    3.15+                ((:file "porter")))
    3.16+               (:file "textrank")
    3.17+               (:file "dbscan")
    3.18+               (:file "section"))
    3.19+  :in-order-to ((test-op (test-op :nlp/tests))))
    3.20+
    3.21+
    3.22+(defsystem :nlp/tests ; test system; reached through (asdf:test-system :nlp)
    3.23+  :depends-on (:nlp :std :rt)
    3.24+  :components ((:file "tests"))
    3.25+  :perform (test-op (op c) (uiop:symbol-call '#:rt '#:do-tests :nlp))) ; RT symbol is looked up at run time, after :depends-on has loaded it
     4.1--- a/lisp/lib/nlp/readme.org	Tue May 28 16:44:54 2024 -0400
     4.2+++ b/lisp/lib/nlp/readme.org	Tue May 28 17:55:30 2024 -0400
     4.3@@ -3,6 +3,8 @@
     4.4 This library is a small set of algorithms and data processing
     4.5 utilities for [[https://en.wikipedia.org/wiki/Natural_language][Natural Languages]].
     4.6 
     4.7+Much of this code comes from the [[https://github.com/atlas-engineer/nyxt/tree/master/libraries/analysis][Nyxt analysis library]].
     4.8+
     4.9 - Features
    4.10   - tokenization
    4.11   - stop-words
     5.1--- a/lisp/lib/nlp/tests.lisp	Tue May 28 16:44:54 2024 -0400
     5.2+++ b/lisp/lib/nlp/tests.lisp	Tue May 28 17:55:30 2024 -0400
     5.3@@ -6,12 +6,26 @@
     5.4 (defsuite :nlp)
     5.5 (in-suite :nlp)
     5.6 
     5.7-(defvar %docs (make-instance 'document-collection))
     5.8+(defvar *test-docs* (make-instance 'document-collection))
     5.9+
    5.10+(deftest tokenize () ; token counts: three words, then two sentences split at the period
    5.11+  (is (= 3 (length (word-tokenize "foo bar baz"))))
    5.12+  (is (= 2 (length (sentence-tokenize "This is the first second. Now the second")))))
    5.13+
    5.14+(deftest sections () ; only checks the result type of EXTRACT-SECTIONS, not its contents
    5.15+  (is (typep (extract-sections "Testing 1 2 3") 'document-collection)))
    5.16 
    5.17 (deftest porter-stem ()
    5.18   (is (string= (stem "hacking") "hack")))
    5.19 
    5.20-(deftest dbscan ())
    5.21+(deftest docs () ; a document made of one repeated word: count is 3, frequency is 1.0
    5.22+  (let ((doc (make-instance 'document :string-contents "test test test")))
    5.23+    (is (= 3 (nlp/doc:term-count
    5.24+              doc "test")))
    5.25+    (is (= 1.0 (nlp/doc:term-frequency
    5.26+                doc "test")))))
    5.27 
    5.28-(deftest textrank ())
    5.29-
    5.30+(deftest textrank () ; smoke test: SUMMARIZE-TEXT must return a list for a three-sentence input
    5.31+  (is
    5.32+   (typep (summarize-text "This is a test which will be summarized by the 'SUMMARIZE-TEXT' function. Yada yada. Test 1 2 3.")
    5.33+          'list)))
     6.1--- a/lisp/lib/nlp/textrank.lisp	Tue May 28 16:44:54 2024 -0400
     6.2+++ b/lisp/lib/nlp/textrank.lisp	Tue May 28 17:55:30 2024 -0400
     6.3@@ -4,9 +4,9 @@
     6.4 
     6.5 ;;; Code:
     6.6 (defpackage :nlp/textrank
     6.7-  (:use :cl :std :nlp/doc)
     6.8+  (:use :cl :std :nlp/doc :nlp/tokenize)
     6.9   (:export 
    6.10-   :summarize-text))
    6.11+   :summarize-text :edges :document-vertex))
    6.12 
    6.13 (in-package :nlp/textrank)
    6.14 
     7.1--- a/lisp/std/seq.lisp	Tue May 28 16:44:54 2024 -0400
     7.2+++ b/lisp/std/seq.lisp	Tue May 28 17:55:30 2024 -0400
     7.3@@ -5,6 +5,11 @@
     7.4 ;;; Code:
     7.5 (in-package :std/seq)
     7.6 
     7.7+;; FIRSTN is taken from the Serapeum library: the first N elements of LIST as a new list.
     7.8+(declaim (inline firstn))
     7.9+(defun firstn (n list)
    7.10+  (loop repeat n for x in list collect x)) ; stops after N items or when LIST runs out, whichever is first
    7.11+
    7.12 (defun take (n seq)
    7.13   "Return, at most, the first N elements of SEQ, as a *new* sequence
    7.14 of the same type as SEQ.
    7.15@@ -13,9 +18,8 @@
    7.16 
    7.17 If N is negative, then |N| elements are taken (in their original
    7.18 order) from the end of SEQ."
    7.19-  #+sbcl (declare (sb-ext:muffle-conditions style-warning))
    7.20   (declare (type signed-array-length n))
    7.21-  (seq-dispatch seq
    7.22+  (sb-impl::seq-dispatch seq
    7.23     (if (minusp n)
    7.24         (last seq (abs n))
    7.25         (firstn n seq))