changelog shortlog graph tags branches changeset files revisions annotate raw help

Mercurial > core / lisp/lib/nlp/dbscan.lisp

changeset 698: 96958d3eb5b0
parent: 16bb4464adcb
author: Richard Westhaver <ellis@rwest.io>
date: Fri, 04 Oct 2024 22:04:59 -0400
permissions: -rw-r--r--
description: fixes
1 ;;; dbscan --- Density-based spacial clustering of applications with noise (DBSCAN) algorithm
2 
3 ;;; Code:
4 (defpackage :nlp/dbscan
5  (:use :cl :std :nlp/doc :nlp/textrank :nlp/tokenize)
6  (:export
7  :document-cluster :clusters :get-cluster :distance
8  :generate-document-distance-vectors
9  :cluster :neighbors :clusters
10  :dbscan))
11 
12 (in-package :nlp/dbscan)
13 
14 (defclass document-cluster (document-vertex)
15  ((cluster :accessor cluster :initform :noise)
16  (neighbors :accessor neighbors))
17  (:documentation "The document cluster class represents a document
18 that is part of a graph which will be clustered. It extends the
19 documenet-vertex class and adds support for a cluster tag and a list
20 of neighbors. These slots are useful for clustering algorithms."))
21 
22 (defmethod clusters ((collection document-collection))
23  "Return a list of clusters. Each hash key represents a cluster, and
24  the hash value is the list of elements in that cluster.
25 
26  Please note: this function is not responsible for computing the
27  clusters, only for returning the list of pre-tagged documents in
28  cluster lists."
29  (let ((result (make-hash-table)))
30  (loop for document in (documents collection)
31  do (push document (gethash (cluster document) result (list))))
32  result))
33 
34 (defun get-cluster (cluster-label points)
35  "Return all matching points for a given cluster label."
36  (remove-if-not (lambda (i) (eq (cluster i) cluster-label)) points))
37 
38 (defmethod distance ((vector-1 t) (vector-2 t))
39  "Return the Euclidean distance between two vectors."
40  (sqrt (loop for i across vector-1
41  for j across vector-2
42  sum (expt (- i j) 2))))
43 
44 (defmethod distance ((document-a document-cluster) (document-b document-cluster))
45  (distance (vector-data document-a) (vector-data document-b)))
46 
47 (defmethod generate-document-distance-vectors ((collection document-collection))
48  "Set the edge weights for all document neighbors (graph is fully connected)."
49  (with-accessors ((documents documents)) collection
50  (loop for document-a in documents
51  do (loop for document-b in documents
52  do (setf (gethash document-b (edges document-a))
53  (distance document-a document-b))))))
54 
55 (defmethod dbscan ((collection document-collection) &key (minimum-points 3)
56  (epsilon 0.5))
57  "Minimum points refers to the minimum amount of points that must
58  exist in the neighborhood of a point for it to be considered a
59  core-point in a cluster. Epsilon refers to the distance between
60  two points for them to be considered neighbors."
61  (labels ((range-query (document)
62  "Return all points that have a distance less than epsilon."
63  (loop for vertex being the hash-keys of (edges document)
64  when (and (<= (gethash vertex (edges document)) epsilon)
65  (not (eq vertex document)))
66  collect vertex))
67  (core-point-p (point)
68  "Is a point a core-point?"
69  (<= minimum-points (length (range-query point))))
70  (cluster-match-p (point cluster)
71  "Check if a core point belongs to a cluster."
72  (intersection cluster (range-query point))))
73  ;;; identify core points
74  (let* ((core-points (remove-if-not #'core-point-p (documents collection)))
75  (non-core-points (set-difference (documents collection) core-points)))
76  ;;; assign labels to core points
77  (loop for point in core-points
78  with cluster-count = 0
79  do (loop named cluster-set
80  for i from 0 to cluster-count
81  ;; point found cluster match, setf and break
82  when (cluster-match-p point (get-cluster i core-points))
83  do (setf (cluster point) i)
84  (return-from cluster-set)
85  ;; point found no cluster-match, create new cluster
86  finally (setf (cluster point) (incf cluster-count))))
87  ;;; assign labels to non-core points
88  (loop for point in non-core-points
89  for intersection = (intersection core-points (range-query point))
90  when intersection
91  do (setf (cluster point) (cluster (first intersection)))))))