changelog shortlog graph tags branches changeset files revisions annotate raw help

Mercurial > core / lisp/lib/nlp/section.lisp

changeset 698: 96958d3eb5b0
parent: daad2b8bb63f
author: Richard Westhaver <ellis@rwest.io>
date: Fri, 04 Oct 2024 22:04:59 -0400
permissions: -rw-r--r--
description: fixes
1 (defpackage :nlp/section
2  (:use :cl :std :nlp/doc :nlp/dbscan :nlp/tokenize)
3  (:export :extract-sections))
4 
5 (in-package :nlp/section)
6 
(defun extract-sections (text &key (epsilon 0.5))
  "Group the sentences of TEXT into consecutive sections.

TEXT is split with SENTENCE-TOKENIZE; every sentence is wrapped in a
DOCUMENT-CLUSTER and added to a fresh DOCUMENT-COLLECTION, which is then
vectorized with TF-VECTORIZE-DOCUMENTS.  The documents are walked in
order: a document joins the current cluster when its mean DISTANCE to
the members of that cluster is at most EPSILON, otherwise the cluster
index is advanced and the document is assigned to the new index.

Returns the DOCUMENT-COLLECTION with the cluster of each document set."
  (let ((corpus (make-instance 'document-collection)))
    (flet ((mean-distance (doc members)
             ;; Arithmetic mean of the distances from DOC to each of MEMBERS.
             (let ((total (reduce #'+ members
                                  :key (lambda (other)
                                         (distance (vector-data other)
                                                   (vector-data doc))))))
               (/ total (length members)))))
      ;; One document per sentence.
      (dolist (sentence (sentence-tokenize text))
        (add-document corpus
                      (make-instance 'document-cluster
                                     :string-contents sentence)))
      (tf-vectorize-documents corpus)
      ;; Sequential pass: only the most recently opened cluster is ever
      ;; considered as a home for the next document.
      (let ((current 0))
        (dolist (doc (documents corpus))
          ;; MEMBERS is tested for NIL before averaging, which also guards
          ;; the division in MEAN-DISTANCE.
          ;; NOTE(review): presumably GET-CLUSTER returns NIL when no
          ;; document carries index CURRENT yet -- confirm in nlp/dbscan.
          (let ((members (get-cluster current (documents corpus))))
            (setf (cluster doc)
                  (if (and members
                           (<= (mean-distance doc members) epsilon))
                      current
                      (incf current))))))
      corpus)))