changelog shortlog graph tags branches changeset files revisions annotate raw help

Mercurial > core / lisp/lib/nlp/doc.lisp

changeset 53: daad2b8bb63f
child: 83f6c62bf2a8
author: ellis <ellis@rwest.io>
date: Fri, 24 Nov 2023 18:11:20 -0500
permissions: -rw-r--r--
description: init nlp
1 ;;; doc.lisp --- Text Documents
2 
3 ;;; Code:
4 (defpackage :nlp/doc
5  (:use :cl :std/fu :std/list :nlp/tokenize)
6  (:export
7  :document
8  :documents
9  :add-document
10  :document-collection
11  :keywords
12  :extract-keywords
13  :tf-vectorize-documents
14  :vector-data))
15 
16 (in-package :nlp/doc)
17 
18 (defclass document ()
19  ((source :accessor source :initarg :source
20  :documentation "The source object for the document.")
21  (string-contents :initarg :string-contents :accessor string-contents)
22  (term-count-table :initform (make-hash-table :test #'equal)
23  :documentation "Contains a mapping of term ->
24 amount of times word appears in the document.")
25  (vector-data :accessor vector-data
26  :documentation "Vector representation of the document.")
27  (rank :accessor rank :documentation "Rank used for sorting.")
28  (tokens :accessor tokens)
29  (token-count :accessor token-count))
30  (:documentation "The document class represents a document. After
31 creating a document, you can perform several operations on it, some
32 examples:
33 
34 + term count: how many times does a term appear in a document?
35 + term frequency: how many times does a term appear divided by the
36  total number of words in the document?"))
37 
38 (defclass document-collection ()
39  ((documents :initform () :initarg :documents :accessor documents))
40  (:documentation "The document collection class represents a
41 collection of documents. As with a document, there are several
42 operations available, some examples:
43 
44 + dictionary: which words appear in the document collection?
45 + keywords: what are the important keywords in this document
46  collection?"))
47 
48 (defmethod initialize-instance :after ((document document) &key)
49  (setf (tokens document) (word-tokenize (string-contents document)))
50  (setf (token-count document) (length (tokens document)))
51  (loop for token in (tokens document) do
52  (incf (gethash token (slot-value document 'term-count-table) 0))))
53 
54 (defmethod term-count ((document document) term)
55  (gethash term (slot-value document 'term-count-table) 0))
56 
57 (defmethod term-frequency ((document document) term)
58  "How often does the word exist in the document?"
59  (/ (term-count document term)
60  ;; prevent division by zero for malformed documents
61  (max 1 (token-count document))))
62 
63 (defmethod termp ((document document) term)
64  "Does the term exist in the document?"
65  (> (term-count document term) 0))
66 
67 (defmethod add-document ((document-collection document-collection) document)
68  "Add a document to the document collection."
69  (push document (documents document-collection)))
70 
71 (defun match-term (term)
72  (lambda (document)
73  (termp document term)))
74 
75 (defmethod document-frequency ((document-collection document-collection) term)
76  (/ (count-if (match-term term) (documents document-collection))
77  (length (documents document-collection))))
78 
79 (defmethod inverse-document-frequency ((document-collection document-collection) term)
80  (log (/ (length (documents document-collection))
81  (count-if (match-term term) (documents document-collection)))))
82 
83 (defmethod term-frequency-inverse-document-frequency ((document document)
84  (document-collection document-collection)
85  term)
86  (* (term-frequency document term) (inverse-document-frequency document-collection term)))
87 
88 (defmethod dictionary ((document document))
89  "Return a list of all of the words that appear in a document."
90  (loop for key being the hash-keys of (slot-value document 'term-count-table)
91  collect key))
92 
93 (defmethod dictionary ((document-collection document-collection))
94  "Return a list of all of the words that appear in a document collection."
95  (let ((words (list)))
96  (loop for document in (documents document-collection)
97  do (appendf words (tokens document)))
98  (remove-duplicates words :test #'equalp)))
99 
100 (defmethod keywords ((document document) &optional document-collection)
101  (if document-collection
102  (sort (loop for word in (dictionary document)
103  collect (cons word (term-frequency-inverse-document-frequency
104  document document-collection word)))
105  #'>
106  :key #'rest)
107  (sort (loop for word in (dictionary document)
108  collect (cons word (term-frequency document word)))
109  #'>
110  :key #'rest)))
111 
112 (defun extract-keywords (text &key (limit 5))
113  "Extract keywords from a string of text."
114  (take limit (keywords (make-instance 'document
115  :string-contents text))))
116 
117 ;;; Doc Vector
118 
119 (defmethod word-count-vectorize ((document document) dictionary)
120  "Transform a document into a vector using word counts."
121  (let ((vector-data (make-array (length dictionary) :initial-element 0)))
122  (loop for word in dictionary
123  for index from 0 below (length vector-data)
124  do (setf (aref vector-data index) (term-count document word)))
125  (setf (vector-data document) vector-data)))
126 
127 (defmethod tf-idf-vectorize ((document document) (collection document-collection) dictionary)
128  "Transform a document into a vector using tf-idf.
129 Definition: tf-idf: term frequency, inverse document frequency. How
130 often does a term a appear in a document as compared to all other
131 documents?"
132  (let ((vector-data (make-array (length dictionary) :initial-element 0)))
133  (loop for word in dictionary
134  for index from 0 below (length vector-data)
135  do (setf (aref vector-data index)
136  (term-frequency-inverse-document-frequency document collection word)))
137  (setf (vector-data document) vector-data)))
138 
139 (defmethod tf-vectorize ((document document) dictionary)
140  "Transform a document into a vector using tf.
141 Definition: tf: term frequency. How often does a term appear in a
142 document?"
143  (let ((vector-data (make-array (length dictionary) :initial-element 0)))
144  (loop for word in dictionary
145  for index from 0 below (length vector-data)
146  do (setf (aref vector-data index)
147  (term-frequency document word)))
148  (setf (vector-data document) vector-data)))
149 
150 (defmethod vectorize-documents ((document-collection document-collection) operation)
151  (let ((dictionary (dictionary document-collection)))
152  (loop for document in (documents document-collection)
153  do (funcall operation document dictionary))))
154 
155 (defmethod word-count-vectorize-documents ((document-collection document-collection))
156  (vectorize-documents document-collection #'word-count-vectorize))
157 
158 (defmethod tf-vectorize-documents ((document-collection document-collection))
159  "Definition: tf: term frequency. How often does a term appear in a
160 document?"
161  (vectorize-documents document-collection #'tf-vectorize))
162 
163 (defmethod tf-idf-vectorize-documents ((document-collection document-collection))
164  "Definition: tf-idf: term frequency, inverse document frequency. How
165 often does a term appear in a document as compared to all other
166 documents?"
167  (vectorize-documents document-collection (lambda (document dictionary)
168  (tf-idf-vectorize document document-collection dictionary))))
169