Mercurial > core / lisp/lib/nlp/dbscan.lisp
changeset 698: |
96958d3eb5b0 |
parent: |
16bb4464adcb
|
author: |
Richard Westhaver <ellis@rwest.io> |
date: |
Fri, 04 Oct 2024 22:04:59 -0400 |
permissions: |
-rw-r--r-- |
description: |
fixes |
1 ;;; dbscan --- Density-based spacial clustering of applications with noise (DBSCAN) algorithm 4 (defpackage :nlp/dbscan 5 (:use :cl :std :nlp/doc :nlp/textrank :nlp/tokenize) 7 :document-cluster :clusters :get-cluster :distance 8 :generate-document-distance-vectors 9 :cluster :neighbors :clusters 12 (in-package :nlp/dbscan) 14 (defclass document-cluster (document-vertex) 15 ((cluster :accessor cluster :initform :noise) 16 (neighbors :accessor neighbors)) 17 (:documentation "The document cluster class represents a document 18 that is part of a graph which will be clustered. It extends the 19 documenet-vertex class and adds support for a cluster tag and a list 20 of neighbors. These slots are useful for clustering algorithms.")) 22 (defmethod clusters ((collection document-collection)) 23 "Return a list of clusters. Each hash key represents a cluster, and 24 the hash value is the list of elements in that cluster. 26 Please note: this function is not responsible for computing the 27 clusters, only for returning the list of pre-tagged documents in 29 (let ((result (make-hash-table))) 30 (loop for document in (documents collection) 31 do (push document (gethash (cluster document) result (list)))) 34 (defun get-cluster (cluster-label points) 35 "Return all matching points for a given cluster label." 36 (remove-if-not (lambda (i) (eq (cluster i) cluster-label)) points)) 38 (defmethod distance ((vector-1 t) (vector-2 t)) 39 "Return the Euclidean distance between two vectors." 40 (sqrt (loop for i across vector-1 42 sum (expt (- i j) 2)))) 44 (defmethod distance ((document-a document-cluster) (document-b document-cluster)) 45 (distance (vector-data document-a) (vector-data document-b))) 47 (defmethod generate-document-distance-vectors ((collection document-collection)) 48 "Set the edge weights for all document neighbors (graph is fully connected)." 49 (with-accessors ((documents documents)) collection 50 (loop for document-a in documents 51 do (loop for document-b in documents 52 do (setf (gethash document-b (edges document-a)) 53 (distance document-a document-b)))))) 55 (defmethod dbscan ((collection document-collection) &key (minimum-points 3) 57 "Minimum points refers to the minimum amount of points that must 58 exist in the neighborhood of a point for it to be considered a 59 core-point in a cluster. Epsilon refers to the distance between 60 two points for them to be considered neighbors." 61 (labels ((range-query (document) 62 "Return all points that have a distance less than epsilon." 63 (loop for vertex being the hash-keys of (edges document) 64 when (and (<= (gethash vertex (edges document)) epsilon) 65 (not (eq vertex document))) 68 "Is a point a core-point?" 69 (<= minimum-points (length (range-query point)))) 70 (cluster-match-p (point cluster) 71 "Check if a core point belongs to a cluster." 72 (intersection cluster (range-query point)))) 73 ;;; identify core points 74 (let* ((core-points (remove-if-not #'core-point-p (documents collection))) 75 (non-core-points (set-difference (documents collection) core-points))) 76 ;;; assign labels to core points 77 (loop for point in core-points 78 with cluster-count = 0 79 do (loop named cluster-set 80 for i from 0 to cluster-count 81 ;; point found cluster match, setf and break 82 when (cluster-match-p point (get-cluster i core-points)) 83 do (setf (cluster point) i) 84 (return-from cluster-set) 85 ;; point found no cluster-match, create new cluster 86 finally (setf (cluster point) (incf cluster-count)))) 87 ;;; assign labels to non-core points 88 (loop for point in non-core-points 89 for intersection = (intersection core-points (range-query point)) 91 do (setf (cluster point) (cluster (first intersection)))))))