Mercurial > core / lisp/lib/dat/arff.lisp
changeset 698: |
96958d3eb5b0 |
parent: |
a4ed30cbe083
|
author: |
Richard Westhaver <ellis@rwest.io> |
date: |
Fri, 04 Oct 2024 22:04:59 -0400 |
permissions: |
-rw-r--r-- |
description: |
fixes |
1 ;;; lib/dat/arff.lisp --- ARFF file format 3 ;; Attribute-Relation File Format 5 ;; based on https://github.com/pieterw/cl-arff-parser 7 ;; ref: https://waikato.github.io/weka-wiki/formats_and_processing/arff_stable/ 11 % 1. Title: Iris Plants Database 14 % (a) Creator: R.A. Fisher 15 % (b) Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov) 16 % (c) Date: July, 1988 20 @ATTRIBUTE sepallength NUMERIC 21 @ATTRIBUTE sepalwidth NUMERIC 22 @ATTRIBUTE petallength NUMERIC 23 @ATTRIBUTE petalwidth NUMERIC 24 @ATTRIBUTE class {Iris-setosa,Iris-versicolor,Iris-virginica} 30 5.1,3.5,1.4,0.2,Iris-setosa 31 4.9,3.0,1.4,0.2,Iris-setosa 32 4.7,3.2,1.3,0.2,Iris-setosa 33 4.6,3.1,1.5,0.2,Iris-setosa 34 5.0,3.6,1.4,0.2,Iris-setosa 35 5.4,3.9,1.7,0.4,Iris-setosa 36 4.6,3.4,1.4,0.3,Iris-setosa 37 5.0,3.4,1.5,0.2,Iris-setosa 38 4.4,2.9,1.4,0.2,Iris-setosa 39 4.9,3.1,1.5,0.1,Iris-setosa 44 https://storm.cis.fordham.edu/~gweiss/data-mining/datasets.html 47 (in-package :dat/arff) 50 ((arff-path :accessor arff-path 53 :documentation "A string to the path of the arff 54 file. e.g. /home/user/myData/foo.arff") 55 (arff-relation :accessor arff-relation 56 :initarg :arff-reltation 58 :documentation "The string after @relation. This is 59 essentially the name of the arff.") 60 (arff-attributes :accessor arff-attributes 61 :initarg :arff-attributes 64 :documentation "The attributes as specified in the 65 header. Each attribute is a list that looks as 66 follows: (\"attribute-name\" (\"type\")). In case of a 67 nominal attribute it looks like 68 this: (\"attribute-name\" (\"nominal\" . values)). ") 69 (arff-data :accessor arff-data 73 :documentation "All the data. The bulk of the file.")) 74 (:documentation "An arff object contains all the data found in a 77 (defmethod print-object ((arff arff) stream) 79 (pprint-logical-block (stream nil) 80 (format stream "<arff ~a:~%attributes: ~{~%~a~^,~}~%data: ~{~%~a~}>" 81 (arff-relation arff) (arff-attributes arff) (arff-data arff))) 82 (format stream "<arff ~a>" (arff-relation arff)))) 84 (defun trim-comments-and-spaces (string &optional (comment-marker "%")) 85 (string-trim (list (code-char 9)) ;; tabs 86 (string-trim " " (subseq string 0 (search comment-marker string))))) 88 (defun csv->list (string &optional (separator ",")) 89 "Given a string like '1,2,3, 6, foo' will return list ('1' '2' '3' 95 for end = (search separator string :start2 start) 97 do (setf continue nil) 98 (setf end (length string)) 99 collect (string-trim " " (subseq string start end)) 100 do (setf start (+ end 1)))) 102 (defun string-replace (str1 sub1 sub2) 103 "Nondestructively replaces all occurences of sub1 in str1 by sub2" 104 (let ((str1 (string str1)) 110 if (string-equal str1 sub1 112 :end1 (min (length str1) 113 (+ index1 (length sub1)))) 114 do (setq str2 (concatenate 'string str2 sub2)) 115 (incf index1 (length sub1)) 117 (setq str2 (concatenate 'string str2 118 (subseq str1 index1 (1+ index1)))) 120 unless (< index1 (length str1)) 123 (defun search-space-or-tab (line) 124 (or (search " " line) 125 (search (list (code-char 9)) line))) 127 (defun parse-attribute-name (line) 128 "Assumes the beginning of this line is the attribute-name. If spaces 129 are to be included in the name then the entire name must be quoted. As 130 second return value it also returns the rest of the line which should 132 (setf line (string-replace line (string (code-char 9)) " ")) 133 (if (and (search "'" line) ;; attribute name contains ' 134 (or (not (search "{" line)) 135 (< (search "'" line) (search "{" line)))) 136 (values (string-replace (subseq line 1 (search "'" line :start2 1)) " " "-") 137 (trim-comments-and-spaces (subseq line (1+ (search "'" line :start2 1))))) 138 (values (subseq line 0 (search-space-or-tab line)) 139 (trim-comments-and-spaces (subseq line (search-space-or-tab line)))))) 141 (defun parse-datatype (line) 142 "Assumes that the line starts with the datatype.Look at 143 http://www.cs.waikato.ac.nz/~ml/weka/arff.html for information about 144 the datatype. There is no support for the date datatype." 145 (cond ((equal 0 (search "real" line :test #'string-equal)) 147 ((equal 0 (search "integer" line :test #'string-equal)) 149 ((equal 0 (search "numeric" line :test #'string-equal)) 151 ((equal 0 (search "string" line :test #'string-equal)) 153 ((search "{" line) ;; nominal 156 (string-trim " " (subseq line (1+ (search "{" line)) (search "}" line)))))) 158 (error "datatype ~a not real, integer or nominal" line)))) 161 (defun parse-@attribute (line) 162 "@attribute <attribute-name> <datatype>. Returns a list containing 163 the attribute-name and then a list containing datatype information as 164 parsed by parse-datatype." 165 (let (attribute data-type) 166 (multiple-value-setq (attribute data-type) 167 (parse-attribute-name (subseq line (1+ (search " " line))))) 168 (list attribute (parse-datatype data-type)))) 170 (defun parse-data (line) 174 ;; ----------------------------------------- 175 ;; Main function used to parse an arff file. 176 ;; ----------------------------------------- 177 (defun parse-arff-stream (stream &optional path) 178 (let ((arff (make-instance 'arff :arff-path path))) 180 with data-mode-p = nil ;; true when parsing data 181 for line = (read-line stream nil) 183 for trimmed-line = (trim-comments-and-spaces line) 184 when (and data-mode-p 185 (not (equalp trimmed-line ""))) 186 collect (parse-data line) into data 187 when (not data-mode-p) 188 do (cond ((equalp trimmed-line "")) ;; skip empty and commented lines 189 ((search "@relation" (string-downcase trimmed-line)) 190 (setf (arff-relation arff) 191 (subseq trimmed-line (1+ (search " " trimmed-line))))) 192 ((search "@attribute" (string-downcase trimmed-line)) 193 (setf (arff-attributes arff) 194 (append (arff-attributes arff) 195 (list (parse-@attribute trimmed-line))))) 196 ((search "@data" (string-downcase trimmed-line)) 197 (setf data-mode-p t))) 198 finally (setf (arff-data arff) data)) 201 (defun parse-arff-string (string &optional path) 202 (with-input-from-string (s string) 203 (parse-arff-stream s path))) 205 (defun parse-arff (arff-path) 206 "The arff-path should be a string pointing to an arff-file." 207 (parse-arff-stream (open arff-path) arff-path)) 210 (defgeneric remove-attribute-by-name (arff name) 211 (:documentation "Removes the feature with the given name from the 212 arff object (not from the actual file). It will remove it both from 213 that @attributes and the @data.")) 215 (defmethod remove-attribute-by-name ((arff arff) (name string)) 216 (let ((position (position name (arff-attributes arff) :key #'first :test #'string-equal))) 218 (setf (arff-attributes arff) 219 (delete name (arff-attributes arff) :key #'first :test #'string-equal)) 220 (setf (arff-data arff) 221 (loop for instance in (arff-data arff) 222 collect (delete-if (nth position instance) instance)))))) 224 (defmethod remove-attribute-by-name ((arff arff) name) 225 (remove-attribute-by-name arff (format nil "~a" name)))