changelog shortlog graph tags branches changeset files file revisions raw help

Mercurial > core / annotate lisp/lib/nlp/section.lisp

changeset 698: 96958d3eb5b0
parent: daad2b8bb63f
author: Richard Westhaver <ellis@rwest.io>
date: Fri, 04 Oct 2024 22:04:59 -0400
permissions: -rw-r--r--
description: fixes
53
daad2b8bb63f init nlp
ellis <ellis@rwest.io>
parents:
diff changeset
1
 (defpackage :nlp/section
daad2b8bb63f init nlp
ellis <ellis@rwest.io>
parents:
diff changeset
2
   (:use :cl :std :nlp/doc :nlp/dbscan :nlp/tokenize)
daad2b8bb63f init nlp
ellis <ellis@rwest.io>
parents:
diff changeset
3
   (:export :extract-sections))
daad2b8bb63f init nlp
ellis <ellis@rwest.io>
parents:
diff changeset
4
 
daad2b8bb63f init nlp
ellis <ellis@rwest.io>
parents:
diff changeset
5
 (in-package :nlp/section)
daad2b8bb63f init nlp
ellis <ellis@rwest.io>
parents:
diff changeset
6
 
daad2b8bb63f init nlp
ellis <ellis@rwest.io>
parents:
diff changeset
7
 (defun extract-sections (text &key (epsilon 0.5))
daad2b8bb63f init nlp
ellis <ellis@rwest.io>
parents:
diff changeset
8
   "Extract the sections from a string of text. Epsilon refers to the
daad2b8bb63f init nlp
ellis <ellis@rwest.io>
parents:
diff changeset
9
    distance between two points for them to be considered related."
daad2b8bb63f init nlp
ellis <ellis@rwest.io>
parents:
diff changeset
10
   (labels ((average-distance (point points)
daad2b8bb63f init nlp
ellis <ellis@rwest.io>
parents:
diff changeset
11
              (/ (reduce #'+ points
daad2b8bb63f init nlp
ellis <ellis@rwest.io>
parents:
diff changeset
12
                         :key (lambda (i) (distance (vector-data i)
daad2b8bb63f init nlp
ellis <ellis@rwest.io>
parents:
diff changeset
13
                                                    (vector-data point))))
daad2b8bb63f init nlp
ellis <ellis@rwest.io>
parents:
diff changeset
14
                 (length points))))
daad2b8bb63f init nlp
ellis <ellis@rwest.io>
parents:
diff changeset
15
     (let ((collection (make-instance 'document-collection)))
daad2b8bb63f init nlp
ellis <ellis@rwest.io>
parents:
diff changeset
16
       (loop for sentence in (sentence-tokenize text)
daad2b8bb63f init nlp
ellis <ellis@rwest.io>
parents:
diff changeset
17
             do (add-document collection
daad2b8bb63f init nlp
ellis <ellis@rwest.io>
parents:
diff changeset
18
                              (make-instance 'document-cluster
daad2b8bb63f init nlp
ellis <ellis@rwest.io>
parents:
diff changeset
19
                                             :string-contents sentence)))
daad2b8bb63f init nlp
ellis <ellis@rwest.io>
parents:
diff changeset
20
       (tf-vectorize-documents collection)
daad2b8bb63f init nlp
ellis <ellis@rwest.io>
parents:
diff changeset
21
       (loop for document in (documents collection)
daad2b8bb63f init nlp
ellis <ellis@rwest.io>
parents:
diff changeset
22
             with cluster-index = 0
daad2b8bb63f init nlp
ellis <ellis@rwest.io>
parents:
diff changeset
23
             for cluster = (get-cluster cluster-index (documents collection))
daad2b8bb63f init nlp
ellis <ellis@rwest.io>
parents:
diff changeset
24
             do (if (and cluster (>= epsilon (average-distance document cluster)))
daad2b8bb63f init nlp
ellis <ellis@rwest.io>
parents:
diff changeset
25
                    (setf (cluster document) cluster-index)
daad2b8bb63f init nlp
ellis <ellis@rwest.io>
parents:
diff changeset
26
                    (setf (cluster document) (incf cluster-index))))
daad2b8bb63f init nlp
ellis <ellis@rwest.io>
parents:
diff changeset
27
       collection)))