Mercurial > core / lisp/lib/nlp/textrank.lisp
changeset 698: |
96958d3eb5b0 |
parent: |
16bb4464adcb
|
author: |
Richard Westhaver <ellis@rwest.io> |
date: |
Fri, 04 Oct 2024 22:04:59 -0400 |
permissions: |
-rw-r--r-- |
description: |
fixes |
1 ;;; textrank.lisp --- TextRank 3 ;; based on https://web.eecs.umich.edu/~mihalcea/papers/mihalcea.emnlp04.pdf 6 (defpackage :nlp/textrank 7 (:use :cl :std :nlp/doc :nlp/tokenize) 9 :summarize-text :edges :document-vertex)) 11 (in-package :nlp/textrank) 13 (defclass document-vertex (document) 14 ((edges :accessor edges :initform (make-hash-table) 15 :documentation "The keys of the hash table represent the 16 edges, the values of the hash table represent the edge 18 (:documentation "The document vertex class represents a document 19 that is part of a graph. The edges slot of the document vertex class 20 is used to store edges of that particular vertex. The keys in the 21 edges slot hash table are the actual vertexes, and the values are the 24 (defmethod cosine-similarity ((document-a document) (document-b document)) 25 "Calculate the cosine similarity between two vectors." 26 (flet ((vector-product (document-a document-b) 27 (loop for a across (vector-data document-a) 28 for b across (vector-data document-b) 30 (vector-sum-root (document) 31 (sqrt (loop for i across (vector-data document) 33 (vector-zero-p (document) 34 (every #'zerop (vector-data document)))) 35 (if (or (vector-zero-p document-a) (vector-zero-p document-b)) 36 0 ; if either vector is completely zero, they are dissimilar 37 (/ (vector-product document-a document-b) 38 (* (vector-sum-root document-a) (vector-sum-root document-b)))))) 40 (defmethod generate-document-similarity-vectors ((collection document-collection)) 41 "Set the edge weights for all document neighbors (graph is fully connected)." 42 (with-accessors ((documents documents)) collection 43 (loop for document-a in documents 44 do (loop for document-b in documents 45 do (setf (gethash document-b (edges document-a)) 46 (cosine-similarity document-a document-b)))))) 48 (defmethod textrank ((collection document-collection) &key (epsilon 0.001) 51 (iteration-limit 100)) 52 "This method is used to calculate the text rankings for a document 53 collection. The `epsilon' is the maximum delta for a given node 54 rank change during an iteration to be considered convergent. The 55 `damping' is a factor utilized to normalize the data. The 56 `initial-rank' is the rank given to nodes before any 57 iterations. The `iteration-limit' is the amount of times the 58 algorithm may traverse the graph before giving up (if the algorithm 60 (with-accessors ((documents documents)) collection 61 (unless (zerop (length documents)) 62 (labels ((set-initial-rank () 63 "Set the initial rank of all documents to a supplied 64 value OR 1/length of the documents." 65 (let ((initial-rank (or initial-rank (/ 1 (length documents))))) 66 (mapcar (lambda (document) (setf (rank document) initial-rank)) documents))) 67 (graph-neighbors (document) 68 "Return a list of neighbors. In a fully connected graph, 69 all nodes are a neighbor except for the node itself." 70 (remove document documents)) 71 (graph-neighbor-edge-sum (document) 72 "Add up the edges of all neighbors of a given node." 73 (let ((sum (- (reduce #'+ (hash-table-values (edges document))) 1))) 74 (if (> sum 0) sum 1))) 75 (document-similarity (document-a document-b) 76 (gethash document-b (edges document-a) 0)) 77 (convergedp (previous-score current-score) 78 "Check if a delta qualifies for convergence." 79 (<= (abs (- previous-score current-score)) epsilon)) 80 (calculate-rank (document) 81 "Calculate the rank of a document." 82 (loop for neighbor in (graph-neighbors document) 83 sum (/ (* damping (rank neighbor) (document-similarity document neighbor)) 84 (graph-neighbor-edge-sum neighbor))))) 86 (loop with converged = nil 87 for iteration from 0 to iteration-limit until converged 89 (loop for document in documents 90 for old-rank = (rank document) 91 for new-rank = (calculate-rank document) 92 do (setf (rank document) new-rank) 93 unless (convergedp old-rank new-rank) 94 do (setf converged nil))))))) 96 (defun summarize-text (text &key (summary-length 3) (show-rank-p nil)) 97 (let ((collection (make-instance 'document-collection))) 98 (loop for sentence in (sentence-tokenize text) 99 do (add-document collection 100 (make-instance 'document-vertex 101 :string-contents sentence))) 102 (tf-idf-vectorize-documents collection) 103 (generate-document-similarity-vectors collection) 104 (textrank collection :iteration-limit 100) 106 (mapcar (if show-rank-p 107 (lambda (i) (cons (rank i) (string-contents i))) 109 (sort (documents collection) #'> :key #'rank)))))