Mercurial > core / lisp/lib/nlp/section.lisp
changeset 698: |
96958d3eb5b0 |
parent: |
daad2b8bb63f
|
author: |
Richard Westhaver <ellis@rwest.io> |
date: |
Fri, 04 Oct 2024 22:04:59 -0400 |
permissions: |
-rw-r--r-- |
description: |
fixes |
1 (defpackage :nlp/section 2 (:use :cl :std :nlp/doc :nlp/dbscan :nlp/tokenize) 3 (:export :extract-sections)) 5 (in-package :nlp/section) 7 (defun extract-sections (text &key (epsilon 0.5)) 8 "Extract the sections from a string of text. Epsilon refers to the 9 distance between two points for them to be considered related." 10 (labels ((average-distance (point points) 12 :key (lambda (i) (distance (vector-data i) 13 (vector-data point)))) 15 (let ((collection (make-instance 'document-collection))) 16 (loop for sentence in (sentence-tokenize text) 17 do (add-document collection 18 (make-instance 'document-cluster 19 :string-contents sentence))) 20 (tf-vectorize-documents collection) 21 (loop for document in (documents collection) 22 with cluster-index = 0 23 for cluster = (get-cluster cluster-index (documents collection)) 24 do (if (and cluster (>= epsilon (average-distance document cluster))) 25 (setf (cluster document) cluster-index) 26 (setf (cluster document) (incf cluster-index))))