;;;; nlp/section -- extract sections from a string of text.
;;; Package definition for the section extractor.  It uses the document,
;;; DBSCAN and tokenizer packages and exports a single entry point,
;;; EXTRACT-SECTIONS.
(defpackage #:nlp/section
  (:use #:cl
        #:std
        #:nlp/doc
        #:nlp/dbscan
        #:nlp/tokenize)
  (:export #:extract-sections))

(in-package #:nlp/section)

;;; Exported API

(defun extract-sections (text &key (epsilon 0.5))
  "Group the sentences of TEXT into sections and return the collection.

Each sentence produced by SENTENCE-TOKENIZE is stored as a DOCUMENT-CLUSTER
in a fresh DOCUMENT-COLLECTION, which is then handed to
TF-VECTORIZE-DOCUMENTS.  The documents are then visited in order: a
document is given the current cluster index when GET-CLUSTER returns a
non-empty cluster for that index and the document's mean DISTANCE to the
members of that cluster is at most EPSILON (default 0.5).  Otherwise the
cluster index is incremented and the document is given the new index."
  (flet ((mean-distance (doc members)
           ;; Sum the distances from DOC to every member of MEMBERS, then
           ;; divide by the member count.  Callers must pass a non-empty
           ;; MEMBERS, otherwise the division signals an error.
           (let ((total (reduce #'+ members
                                :key (lambda (other)
                                       (distance (vector-data other)
                                                 (vector-data doc))))))
             (/ total (length members)))))
    (let ((collection (make-instance 'document-collection))
          (current-index 0))
      ;; One document per sentence.
      (dolist (sentence (sentence-tokenize text))
        (add-document collection
                      (make-instance 'document-cluster
                                     :string-contents sentence)))
      (tf-vectorize-documents collection)
      ;; Walk the documents in order, re-reading the current cluster on
      ;; every step so that earlier assignments are taken into account.
      (dolist (doc (documents collection))
        (let ((members (get-cluster current-index (documents collection))))
          (setf (cluster doc)
                (if (and members
                         (>= epsilon (mean-distance doc members)))
                    current-index
                    (incf current-index)))))
      collection)))