changelog shortlog graph tags branches changeset files revisions annotate raw help

Mercurial > core / lisp/lib/nlp/doc.lisp

changeset 698: 96958d3eb5b0
parent: 16bb4464adcb
author: Richard Westhaver <ellis@rwest.io>
date: Fri, 04 Oct 2024 22:04:59 -0400
permissions: -rw-r--r--
description: fixes
1 ;;; doc.lisp --- Text Documents
2 
3 ;;; Code:
4 (defpackage :nlp/doc
5  (:use :cl :std :nlp/data :nlp/tokenize)
6  (:export
7  :document
8  :documents
9  :add-document
10  :document-collection
11  :keywords
12  :dictionary
13  :term-count
14  :document-frequency
15  :inverse-document-frequency
16  :tf-idf-vectorize-documents
17  :termp
18  :string-contents
19  :rank
20  :term-frequency
21  :extract-keywords
22  :tf-vectorize-documents
23  :vector-data))
24 
25 (in-package :nlp/doc)
26 
(defclass document ()
  (;; Origin of the text. Has no initform, so it is unbound unless :source
   ;; is passed to MAKE-INSTANCE.
   (source
    :initarg :source
    :accessor source
    :documentation "The source object for the document.")
   ;; Raw text of the document; also has no initform.
   (string-contents
    :initarg :string-contents
    :accessor string-contents)
   ;; Keys are compared with EQUAL. No accessor is defined, so the rest of
   ;; the file reaches this table through SLOT-VALUE.
   (term-count-table
    :initform (make-hash-table :test 'equal)
    :documentation "Maps each term to the number of times it occurs in
the document.")
   (vector-data
    :accessor vector-data
    :documentation "Vector representation of the document.")
   (rank
    :accessor rank
    :documentation "Rank used for sorting.")
   (tokens
    :accessor tokens)
   (token-count
    :accessor token-count))
  (:documentation "A single text document.

Once a document exists it can answer questions such as:

+ term count: how many times does a term occur in the document?
+ term frequency: the term count divided by the total number of
  tokens in the document."))
46 
(defclass document-collection ()
  (;; Starts out as the empty list unless :documents is supplied.
   (documents
    :initarg :documents
    :initform '()
    :accessor documents))
  (:documentation "A collection of documents.

Like a single document, a collection supports several queries, e.g.:

+ dictionary: which words occur anywhere in the collection?
+ keywords: which terms are the important ones for the collection?"))
56 
(defmethod initialize-instance :after ((document document) &key)
  "Tokenize the document's text and populate its derived slots.
Stores the token list, its length, and a per-token occurrence count."
  (let ((words (word-tokenize (string-contents document)))
        (counts (slot-value document 'term-count-table)))
    (setf (tokens document) words
          (token-count document) (length words))
    ;; Tokens not yet in the table start from the GETHASH default of 0.
    (dolist (word words)
      (incf (gethash word counts 0)))))
62 
(defmethod term-count ((document document) term)
  "Return how many times TERM occurs in DOCUMENT, or 0 if it never does."
  (with-slots ((counts term-count-table)) document
    (gethash term counts 0)))
65 
(defmethod term-frequency ((document document) term)
  "Return the count of TERM in DOCUMENT divided by the document's token count."
  (let ((occurrences (term-count document term))
        ;; Clamp the denominator to 1 so a document with no tokens does
        ;; not cause a division by zero.
        (total (max 1 (token-count document))))
    (/ occurrences total)))
71 
(defmethod termp ((document document) term)
  "Return true when TERM occurs at least once in DOCUMENT."
  (plusp (term-count document term)))
75 
(defmethod add-document ((document-collection document-collection) document)
  "Prepend DOCUMENT to the collection's document list.
Returns the updated list of documents."
  (setf (documents document-collection)
        (cons document (documents document-collection))))
79 
(defun match-term (term)
  "Return a one-argument predicate that tests whether a document contains TERM."
  #'(lambda (candidate)
      (termp candidate term)))
83 
(defmethod document-frequency ((document-collection document-collection) term)
  "Return the fraction of documents in DOCUMENT-COLLECTION that contain TERM.
The result is a rational between 0 and 1. An empty collection yields 0
instead of signalling DIVISION-BY-ZERO."
  (let ((docs (documents document-collection)))
    (/ (count-if (match-term term) docs)
       ;; Clamp the denominator to 1, mirroring the guard in
       ;; TERM-FREQUENCY: with no documents the numerator is 0 as well,
       ;; so the result is simply 0.
       (max 1 (length docs)))))
87 
(defmethod inverse-document-frequency ((document-collection document-collection) term)
  "Return log(N / df): N is the number of documents in DOCUMENT-COLLECTION
and df is the number of those documents that contain TERM.

Two degenerate cases are handled rather than signalling an arithmetic
error:

+ an empty collection yields 0.0;
+ a term that occurs in no document is treated as if it occurred in
  one, so it receives the largest value the collection can produce,
  log(N).  This case arises when scoring a document that is not itself
  a member of the collection."
  (let* ((docs (documents document-collection))
         (total (length docs))
         (matching (count-if (match-term term) docs)))
    (if (zerop total)
        0.0
        ;; MATCHING is clamped to 1 so the division is always defined;
        ;; results for terms that do occur are unchanged.
        (log (/ total (max 1 matching))))))
91 
(defmethod term-frequency-inverse-document-frequency ((document document)
                                                      (document-collection document-collection)
                                                      term)
  "Return the tf-idf score of TERM: its term frequency in DOCUMENT
multiplied by its inverse document frequency in DOCUMENT-COLLECTION."
  (let ((tf (term-frequency document term))
        (idf (inverse-document-frequency document-collection term)))
    (* tf idf)))
96 
(defmethod dictionary ((document document))
  "Return a list of the distinct terms recorded for DOCUMENT."
  (let ((terms '()))
    (maphash (lambda (term count)
               (declare (ignore count))
               (push term terms))
             (slot-value document 'term-count-table))
    ;; PUSH accumulates in reverse; restore the table's traversal order.
    (nreverse terms)))
101 
(defmethod dictionary ((document-collection document-collection))
  "Return a list of the distinct words that appear anywhere in
DOCUMENT-COLLECTION.

Words are compared with EQUAL, the same test used by each document's
term-count table, so every entry returned here can be looked up with
TERM-COUNT and no case variant of a word is silently dropped."
  (remove-duplicates
   ;; Gather all tokens in one pass instead of re-copying the
   ;; accumulated list for every document.
   (loop for document in (documents document-collection)
         append (tokens document))
   :test #'equal))
108 
(defmethod keywords ((document document) &optional document-collection)
  "Return an alist of (word . score) for every word in DOCUMENT, sorted
by descending score.  With DOCUMENT-COLLECTION the score is tf-idf
against that collection; without it the score is plain term frequency."
  (flet ((score (word)
           (if document-collection
               (term-frequency-inverse-document-frequency
                document document-collection word)
               (term-frequency document word))))
    (sort (mapcar (lambda (word) (cons word (score word)))
                  (dictionary document))
          #'>
          :key #'cdr)))
120 
(defun extract-keywords (text &key (limit 5))
  "Build a document from the string TEXT and return its keywords,
passed through TAKE with LIMIT (default 5)."
  (let* ((doc (make-instance 'document :string-contents text))
         (ranked (keywords doc)))
    ;; NOTE(review): TAKE is defined outside this file; presumably it
    ;; keeps the first LIMIT entries -- confirm.
    (take limit ranked)))
125 
126 ;;; Doc Vector
127 
(defmethod word-count-vectorize ((document document) dictionary)
  "Store in DOCUMENT a vector of raw word counts, one element per word of
the list DICTIONARY and in the same order.  Returns the new vector."
  (setf (vector-data document)
        (map 'vector
             (lambda (word)
               ;; VALUES keeps only the count, dropping GETHASH's
               ;; second (found-p) value.
               (values (term-count document word)))
             dictionary)))
135 
(defmethod tf-idf-vectorize ((document document) (collection document-collection) dictionary)
  "Store in DOCUMENT a vector of tf-idf scores, one element per word of
the list DICTIONARY and in the same order.  Returns the new vector.
Definition: tf-idf: term frequency, inverse document frequency. How
often does a term appear in a document as compared to all other
documents?"
  (setf (vector-data document)
        (map 'vector
             (lambda (word)
               (term-frequency-inverse-document-frequency document collection word))
             dictionary)))
147 
(defmethod tf-vectorize ((document document) dictionary)
  "Store in DOCUMENT a vector of term frequencies, one element per word of
the list DICTIONARY and in the same order.  Returns the new vector.
Definition: tf: term frequency. How often does a term appear in a
document?"
  (setf (vector-data document)
        (map 'vector
             (lambda (word)
               (term-frequency document word))
             dictionary)))
158 
(defmethod vectorize-documents ((document-collection document-collection) operation)
  "Call OPERATION on each document of DOCUMENT-COLLECTION together with
the collection's dictionary.  The dictionary is computed once, so every
document is vectorized against the same word list.  Returns NIL."
  (let ((vocabulary (dictionary document-collection)))
    (dolist (doc (documents document-collection))
      (funcall operation doc vocabulary))))
163 
(defmethod word-count-vectorize-documents ((document-collection document-collection))
  "Vectorize every document in DOCUMENT-COLLECTION using raw word counts."
  (let ((operation #'word-count-vectorize))
    (vectorize-documents document-collection operation)))
166 
(defmethod tf-vectorize-documents ((document-collection document-collection))
  "Vectorize every document in DOCUMENT-COLLECTION using term frequency.
Definition: tf: term frequency. How often does a term appear in a
document?"
  (let ((operation #'tf-vectorize))
    (vectorize-documents document-collection operation)))
171 
(defmethod tf-idf-vectorize-documents ((document-collection document-collection))
  "Vectorize every document in DOCUMENT-COLLECTION using tf-idf.
Definition: tf-idf: term frequency, inverse document frequency. How
often does a term appear in a document as compared to all other
documents?"
  ;; TF-IDF-VECTORIZE also needs the collection itself, so close over it
  ;; to get the two-argument shape VECTORIZE-DOCUMENTS calls.
  (flet ((vectorize-one (doc vocabulary)
           (tf-idf-vectorize doc document-collection vocabulary)))
    (vectorize-documents document-collection #'vectorize-one)))
178