Mercurial > core / lisp/lib/nlp/doc.lisp
changeset 380: |
16bb4464adcb |
parent: |
301fd45bbe73
|
author: |
Richard Westhaver <ellis@rwest.io> |
date: |
Tue, 28 May 2024 17:55:30 -0400 |
permissions: |
-rw-r--r-- |
description: |
nlp fixes |
;;; doc.lisp --- Text Documents

;; Documents, document collections, and term statistics over them
;; (term count, term frequency, tf-idf), plus vectorization of
;; documents against a shared dictionary.

;; NOTE(review): the listing this was restored from omitted several
;; original lines (package header, most of the export list, a few
;; closing lines of forms).  Reconstructed spots are marked with
;; NOTE(review) below -- confirm them against the repository copy.

;;; Code:
(defpackage :nlp/doc ; NOTE(review): package name not visible in the listing -- confirm
  (:use :cl :std :nlp/data :nlp/tokenize)
  (:export
   ;; NOTE(review): only these three exports were visible; the original
   ;; export list is longer and the remaining symbols must be kept.
   :inverse-document-frequency
   :tf-idf-vectorize-documents
   :tf-vectorize-documents))

(in-package :nlp/doc)

(defclass document ()
  ((source :accessor source :initarg :source
           :documentation "The source object for the document.")
   (string-contents :initarg :string-contents :accessor string-contents)
   (term-count-table :initform (make-hash-table :test #'equal)
                     :documentation "Contains a mapping of term ->
amount of times word appears in the document.")
   (vector-data :accessor vector-data
                :documentation "Vector representation of the document.")
   (rank :accessor rank :documentation "Rank used for sorting.")
   (tokens :accessor tokens)
   (token-count :accessor token-count))
  (:documentation "The document class represents a document. After
creating a document, you can perform several operations on it, some
examples:

+ term count: how many times does a term appear in a document?
+ term frequency: how many times does a term appear divided by the
total number of words in the document?"))

(defclass document-collection ()
  ((documents :initform () :initarg :documents :accessor documents))
  (:documentation "The document collection class represents a
collection of documents. As with a document, there are several
operations available, some examples:

+ dictionary: which words appear in the document collection?
+ keywords: what are the important keywords in this document
collection?"))

(defmethod initialize-instance :after ((document document) &key)
  "Tokenize STRING-CONTENTS and cache the tokens, their number, and the
per-term counts on DOCUMENT."
  (setf (tokens document) (word-tokenize (string-contents document)))
  (setf (token-count document) (length (tokens document)))
  (loop for token in (tokens document) do
    (incf (gethash token (slot-value document 'term-count-table) 0))))

(defmethod term-count ((document document) term)
  "How many times does TERM appear in DOCUMENT?  0 when it is absent."
  (gethash term (slot-value document 'term-count-table) 0))

(defmethod term-frequency ((document document) term)
  "How often does the word exist in the document?
Returns the term count divided by the number of tokens in DOCUMENT."
  (/ (term-count document term)
     ;; prevent division by zero for malformed documents
     (max 1 (token-count document))))

(defmethod termp ((document document) term)
  "Does the term exist in the document?"
  (> (term-count document term) 0))

(defmethod add-document ((document-collection document-collection) document)
  "Add a document to the document collection."
  (push document (documents document-collection)))

(defun match-term (term)
  "Return a one-argument predicate that is true for documents containing TERM."
  (lambda (document)
    (termp document term)))

(defmethod document-frequency ((document-collection document-collection) term)
  "Fraction of the documents in DOCUMENT-COLLECTION that contain TERM.
Returns 0 for an empty collection instead of dividing by zero."
  (let ((documents (documents document-collection)))
    (if (null documents)
        0
        (/ (count-if (match-term term) documents)
           (length documents)))))

(defmethod inverse-document-frequency ((document-collection document-collection) term)
  "Natural log of (number of documents / number of documents containing TERM).
Returns 0.0 when no document contains TERM (which includes the empty
collection), so that scoring a word against a collection that never
saw it yields a zero weight instead of a DIVISION-BY-ZERO error."
  (let* ((documents (documents document-collection))
         (matches (count-if (match-term term) documents)))
    (if (zerop matches)
        0.0
        (log (/ (length documents) matches)))))

(defmethod term-frequency-inverse-document-frequency ((document document)
                                                      (document-collection document-collection)
                                                      term)
  "tf-idf weight of TERM: its frequency in DOCUMENT times its inverse
document frequency in DOCUMENT-COLLECTION."
  (* (term-frequency document term)
     (inverse-document-frequency document-collection term)))

(defmethod dictionary ((document document))
  "Return a list of all of the words that appear in a document."
  (loop for key being the hash-keys of (slot-value document 'term-count-table)
        collect key))

(defmethod dictionary ((document-collection document-collection))
  "Return a list of all of the words that appear in a document collection."
  ;; Compare with EQUAL, the same test TERM-COUNT-TABLE uses; a
  ;; case-insensitive test here could drop a case variant from the
  ;; dictionary and lose its counts when vectorizing.
  ;; LOOP ... APPEND also avoids re-copying the accumulated list on
  ;; every iteration.
  (remove-duplicates (loop for document in (documents document-collection)
                           append (tokens document))
                     :test #'equal))

(defmethod keywords ((document document) &optional document-collection)
  "Return an alist of (word . score) for every word of DOCUMENT, sorted
by score.  The score is tf-idf against DOCUMENT-COLLECTION when one is
given, plain term frequency otherwise."
  (flet ((score (word)
           (if document-collection
               (term-frequency-inverse-document-frequency
                document document-collection word)
               (term-frequency document word))))
    ;; NOTE(review): the sort predicate lines were not visible in the
    ;; listing; highest score first is assumed -- confirm.
    (sort (loop for word in (dictionary document)
                collect (cons word (score word)))
          #'>
          :key #'rest)))

(defun extract-keywords (text &key (limit 5))
  "Extract keywords from a string of text.
Returns the first LIMIT entries of the keyword alist built from TEXT."
  (take limit (keywords (make-instance 'document
                                       :string-contents text))))

(defun %vectorize-with (document dictionary score)
  "Store in DOCUMENT a fresh vector with (funcall SCORE word) for each
word of DICTIONARY, in dictionary order, and return that vector."
  (setf (vector-data document) (map 'vector score dictionary)))

(defmethod word-count-vectorize ((document document) dictionary)
  "Transform a document into a vector using word counts."
  (%vectorize-with document dictionary
                   (lambda (word) (term-count document word))))

(defmethod tf-idf-vectorize ((document document) (collection document-collection) dictionary)
  "Transform a document into a vector using tf-idf.
Definition: tf-idf: term frequency, inverse document frequency. How
often does a term appear in a document as compared to all other
documents?"
  (%vectorize-with document dictionary
                   (lambda (word)
                     (term-frequency-inverse-document-frequency
                      document collection word))))

(defmethod tf-vectorize ((document document) dictionary)
  "Transform a document into a vector using tf.
Definition: tf: term frequency. How often does a term appear in a
document?"
  (%vectorize-with document dictionary
                   (lambda (word) (term-frequency document word))))

(defmethod vectorize-documents ((document-collection document-collection) operation)
  "Call OPERATION with each document of DOCUMENT-COLLECTION and the
collection-wide dictionary, so all documents share the same dimensions."
  (let ((dictionary (dictionary document-collection)))
    (loop for document in (documents document-collection)
          do (funcall operation document dictionary))))

(defmethod word-count-vectorize-documents ((document-collection document-collection))
  "Vectorize every document of DOCUMENT-COLLECTION using word counts."
  (vectorize-documents document-collection #'word-count-vectorize))

(defmethod tf-vectorize-documents ((document-collection document-collection))
  "Definition: tf: term frequency. How often does a term appear in a
document?"
  (vectorize-documents document-collection #'tf-vectorize))

(defmethod tf-idf-vectorize-documents ((document-collection document-collection))
  "Definition: tf-idf: term frequency, inverse document frequency. How
often does a term appear in a document as compared to all other
documents?"
  (vectorize-documents document-collection
                       (lambda (document dictionary)
                         (tf-idf-vectorize document document-collection dictionary))))