1.1--- a/lisp/lib/nlp/dbscan.lisp Tue May 28 16:44:54 2024 -0400
1.2+++ b/lisp/lib/nlp/dbscan.lisp Tue May 28 17:55:30 2024 -0400
1.3@@ -2,10 +2,11 @@
1.4
1.5 ;;; Code:
1.6 (defpackage :nlp/dbscan
1.7- (:use :cl :std :nlp/doc)
1.8+ (:use :cl :std :nlp/doc :nlp/textrank :nlp/tokenize)
1.9 (:export
1.10 :document-cluster :clusters :get-cluster :distance
1.11- :generate-document-distance-vectors
1.12+ :generate-document-distance-vectors
1.13+ :cluster :neighbors ; :clusters is already exported on the first line of this list
1.14 :dbscan))
1.15
1.16 (in-package :nlp/dbscan)
2.1--- a/lisp/lib/nlp/doc.lisp Tue May 28 16:44:54 2024 -0400
2.2+++ b/lisp/lib/nlp/doc.lisp Tue May 28 17:55:30 2024 -0400
2.3@@ -9,6 +9,15 @@
2.4 :add-document
2.5 :document-collection
2.6 :keywords
2.7+ :dictionary
2.8+ :term-count
2.9+ :document-frequency
2.10+ :inverse-document-frequency
2.11+ :tf-idf-vectorize-documents
2.12+ :termp
2.13+ :string-contents
2.14+ :rank
2.15+ :term-frequency
2.16 :extract-keywords
2.17 :tf-vectorize-documents
2.18 :vector-data))
3.1--- a/lisp/lib/nlp/nlp.asd Tue May 28 16:44:54 2024 -0400
3.2+++ b/lisp/lib/nlp/nlp.asd Tue May 28 17:55:30 2024 -0400
3.3@@ -6,5 +6,20 @@
3.4 :class :package-inferred-system
3.5 :defsystem-depends-on (:asdf-package-system)
3.6 :depends-on (:std :rdb :cl-ppcre :parse :nlp/pkg)
3.7- :in-order-to ((test-op (test-op :nlp/tests)))
3.8- :perform (test-op (op c) (uiop:symbol-call '#:rt '#:do-tests :nlp)))
3.9+ :components ((:file "pkg")
3.10+ (:file "data")
3.11+ (:file "tokenize")
3.12+ (:file "doc")
3.13+ (:module "stem"
3.14+ :components
3.15+ ((:file "porter")))
3.16+ (:file "textrank")
3.17+ (:file "dbscan")
3.18+ (:file "section"))
3.19+ :in-order-to ((test-op (test-op :nlp/tests))))
3.20+
3.21+
3.22+(defsystem :nlp/tests ; test system for :nlp; run with (asdf:test-system :nlp)
3.23+ :depends-on (:nlp :std :rt)
3.24+ :components ((:file "tests"))
3.25+ :perform (test-op (op c) (uiop:symbol-call '#:rt '#:do-tests :nlp))) ; symbol-call defers RT lookup until run time
4.1--- a/lisp/lib/nlp/readme.org Tue May 28 16:44:54 2024 -0400
4.2+++ b/lisp/lib/nlp/readme.org Tue May 28 17:55:30 2024 -0400
4.3@@ -3,6 +3,8 @@
4.4 This library is a small set of algorithms and data processing
4.5 utilities for [[https://en.wikipedia.org/wiki/Natural_language][Natural Languages]].
4.6
4.7+Much of this code is adapted from the [[https://github.com/atlas-engineer/nyxt/tree/master/libraries/analysis][Nyxt analysis library]].
4.8+
4.9 - Features
4.10 - tokenization
4.11 - stop-words
5.1--- a/lisp/lib/nlp/tests.lisp Tue May 28 16:44:54 2024 -0400
5.2+++ b/lisp/lib/nlp/tests.lisp Tue May 28 17:55:30 2024 -0400
5.3@@ -6,12 +6,26 @@
5.4 (defsuite :nlp)
5.5 (in-suite :nlp)
5.6
5.7-(defvar %docs (make-instance 'document-collection))
5.8+(defvar *test-docs* (make-instance 'document-collection))
5.9+
5.10+(deftest tokenize () ; WORD-TOKENIZE result must have length 3, SENTENCE-TOKENIZE result length 2
5.11+ (is (= 3 (length (word-tokenize "foo bar baz"))))
5.12+ (is (= 2 (length (sentence-tokenize "This is the first second. Now the second")))))
5.13+
5.14+(deftest sections () ; EXTRACT-SECTIONS on a plain string must return a DOCUMENT-COLLECTION
5.15+ (is (typep (extract-sections "Testing 1 2 3") 'document-collection)))
5.16
5.17 (deftest porter-stem ()
5.18 (is (string= (stem "hacking") "hack")))
5.19
5.20-(deftest dbscan ())
5.21+(deftest docs () ; a document containing only "test" x3: TERM-COUNT is 3 and TERM-FREQUENCY is 1.0
5.22+ (let ((doc (make-instance 'document :string-contents "test test test")))
5.23+ (is (= 3 (nlp/doc:term-count
5.24+ doc "test")))
5.25+ (is (= 1.0 (nlp/doc:term-frequency
5.26+ doc "test")))))
5.27
5.28-(deftest textrank ())
5.29-
5.30+(deftest textrank () ; smoke test: SUMMARIZE-TEXT must return a LIST (note NIL also satisfies this)
5.31+ (is
5.32+ (typep (summarize-text "This is a test which will be summarized by the 'SUMMARIZE-TEXT' function. Yada yada. Test 1 2 3.")
5.33+ 'list)))
6.1--- a/lisp/lib/nlp/textrank.lisp Tue May 28 16:44:54 2024 -0400
6.2+++ b/lisp/lib/nlp/textrank.lisp Tue May 28 17:55:30 2024 -0400
6.3@@ -4,9 +4,9 @@
6.4
6.5 ;;; Code:
6.6 (defpackage :nlp/textrank
6.7- (:use :cl :std :nlp/doc)
6.8+ (:use :cl :std :nlp/doc :nlp/tokenize)
6.9 (:export
6.10- :summarize-text))
6.11+ :summarize-text :edges :document-vertex))
6.12
6.13 (in-package :nlp/textrank)
6.14
7.1--- a/lisp/std/seq.lisp Tue May 28 16:44:54 2024 -0400
7.2+++ b/lisp/std/seq.lisp Tue May 28 17:55:30 2024 -0400
7.3@@ -5,6 +5,11 @@
7.4 ;;; Code:
7.5 (in-package :std/seq)
7.6
7.7+;; FIRSTN (from serapeum): collect at most the first N elements of LIST into a fresh list.
7.8+(declaim (inline firstn))
7.9+(defun firstn (n list)
7.10+ (loop repeat n for x in list collect x))
7.11+
7.12 (defun take (n seq)
7.13 "Return, at most, the first N elements of SEQ, as a *new* sequence
7.14 of the same type as SEQ.
7.15@@ -13,9 +18,8 @@
7.16
7.17 If N is negative, then |N| elements are taken (in their original
7.18 order) from the end of SEQ."
7.19- #+sbcl (declare (sb-ext:muffle-conditions style-warning))
7.20 (declare (type signed-array-length n))
7.21- (seq-dispatch seq
7.22+ (sb-impl::seq-dispatch seq
7.23 (if (minusp n)
7.24 (last seq (abs n))
7.25 (firstn n seq))