Mercurial > core / lisp/lib/nlp/doc.lisp
changeset 53: |
daad2b8bb63f |
child: |
83f6c62bf2a8 |
author: |
ellis <ellis@rwest.io> |
date: |
Fri, 24 Nov 2023 18:11:20 -0500 |
permissions: |
-rw-r--r-- |
description: |
init nlp |
1 ;;; doc.lisp --- Text Documents 5 (:use :cl :std/fu :std/list :nlp/tokenize) 13 :tf-vectorize-documents 19 ((source :accessor source :initarg :source 20 :documentation "The source object for the document.") 21 (string-contents :initarg :string-contents :accessor string-contents) 22 (term-count-table :initform (make-hash-table :test #'equal) 23 :documentation "Contains a mapping of term -> 24 amount of times word appears in the document.") 25 (vector-data :accessor vector-data 26 :documentation "Vector representation of the document.") 27 (rank :accessor rank :documentation "Rank used for sorting.") 28 (tokens :accessor tokens) 29 (token-count :accessor token-count)) 30 (:documentation "The document class represents a document. After 31 creating a document, you can perform several operations on it, some 34 + term count: how many times does a term appear in a document? 35 + term frequency: how many times does a term appear divided by the 36 total number of words in the document?")) 38 (defclass document-collection () 39 ((documents :initform () :initarg :documents :accessor documents)) 40 (:documentation "The document collection class represents a 41 collection of documents. As with a document, there are several 42 operations available, some examples: 44 + dictionary: which words appear in the document collection? 45 + keywords: what are the important keywords in this document 48 (defmethod initialize-instance :after ((document document) &key) 49 (setf (tokens document) (word-tokenize (string-contents document))) 50 (setf (token-count document) (length (tokens document))) 51 (loop for token in (tokens document) do 52 (incf (gethash token (slot-value document 'term-count-table) 0)))) 54 (defmethod term-count ((document document) term) 55 (gethash term (slot-value document 'term-count-table) 0)) 57 (defmethod term-frequency ((document document) term) 58 "How often does the word exist in the document?" 59 (/ (term-count document term) 60 ;; prevent division by zero for malformed documents 61 (max 1 (token-count document)))) 63 (defmethod termp ((document document) term) 64 "Does the term exist in the document?" 65 (> (term-count document term) 0)) 67 (defmethod add-document ((document-collection document-collection) document) 68 "Add a document to the document collection." 69 (push document (documents document-collection))) 71 (defun match-term (term) 73 (termp document term))) 75 (defmethod document-frequency ((document-collection document-collection) term) 76 (/ (count-if (match-term term) (documents document-collection)) 77 (length (documents document-collection)))) 79 (defmethod inverse-document-frequency ((document-collection document-collection) term) 80 (log (/ (length (documents document-collection)) 81 (count-if (match-term term) (documents document-collection))))) 83 (defmethod term-frequency-inverse-document-frequency ((document document) 84 (document-collection document-collection) 86 (* (term-frequency document term) (inverse-document-frequency document-collection term))) 88 (defmethod dictionary ((document document)) 89 "Return a list of all of the words that appear in a document." 90 (loop for key being the hash-keys of (slot-value document 'term-count-table) 93 (defmethod dictionary ((document-collection document-collection)) 94 "Return a list of all of the words that appear in a document collection." 96 (loop for document in (documents document-collection) 97 do (appendf words (tokens document))) 98 (remove-duplicates words :test #'equalp))) 100 (defmethod keywords ((document document) &optional document-collection) 101 (if document-collection 102 (sort (loop for word in (dictionary document) 103 collect (cons word (term-frequency-inverse-document-frequency 104 document document-collection word))) 107 (sort (loop for word in (dictionary document) 108 collect (cons word (term-frequency document word))) 112 (defun extract-keywords (text &key (limit 5)) 113 "Extract keywords from a string of text." 114 (take limit (keywords (make-instance 'document 115 :string-contents text)))) 119 (defmethod word-count-vectorize ((document document) dictionary) 120 "Transform a document into a vector using word counts." 121 (let ((vector-data (make-array (length dictionary) :initial-element 0))) 122 (loop for word in dictionary 123 for index from 0 below (length vector-data) 124 do (setf (aref vector-data index) (term-count document word))) 125 (setf (vector-data document) vector-data))) 127 (defmethod tf-idf-vectorize ((document document) (collection document-collection) dictionary) 128 "Transform a document into a vector using tf-idf. 129 Definition: tf-idf: term frequency, inverse document frequency. How 130 often does a term a appear in a document as compared to all other 132 (let ((vector-data (make-array (length dictionary) :initial-element 0))) 133 (loop for word in dictionary 134 for index from 0 below (length vector-data) 135 do (setf (aref vector-data index) 136 (term-frequency-inverse-document-frequency document collection word))) 137 (setf (vector-data document) vector-data))) 139 (defmethod tf-vectorize ((document document) dictionary) 140 "Transform a document into a vector using tf. 141 Definition: tf: term frequency. How often does a term appear in a 143 (let ((vector-data (make-array (length dictionary) :initial-element 0))) 144 (loop for word in dictionary 145 for index from 0 below (length vector-data) 146 do (setf (aref vector-data index) 147 (term-frequency document word))) 148 (setf (vector-data document) vector-data))) 150 (defmethod vectorize-documents ((document-collection document-collection) operation) 151 (let ((dictionary (dictionary document-collection))) 152 (loop for document in (documents document-collection) 153 do (funcall operation document dictionary)))) 155 (defmethod word-count-vectorize-documents ((document-collection document-collection)) 156 (vectorize-documents document-collection #'word-count-vectorize)) 158 (defmethod tf-vectorize-documents ((document-collection document-collection)) 159 "Definition: tf: term frequency. How often does a term appear in a 161 (vectorize-documents document-collection #'tf-vectorize)) 163 (defmethod tf-idf-vectorize-documents ((document-collection document-collection)) 164 "Definition: tf-idf: term frequency, inverse document frequency. How 165 often does a term appear in a document as compared to all other 167 (vectorize-documents document-collection (lambda (document dictionary) 168 (tf-idf-vectorize document document-collection dictionary))))