changelog shortlog graph tags branches changeset files revisions annotate raw help

Mercurial > core / lisp/lib/dat/arff.lisp

changeset 698: 96958d3eb5b0
parent: a4ed30cbe083
author: Richard Westhaver <ellis@rwest.io>
date: Fri, 04 Oct 2024 22:04:59 -0400
permissions: -rw-r--r--
description: fixes
1 ;;; lib/dat/arff.lisp --- ARFF file format
2 
3 ;; Attribute-Relation File Format
4 
5 ;; based on https://github.com/pieterw/cl-arff-parser
6 
7 ;; ref: https://waikato.github.io/weka-wiki/formats_and_processing/arff_stable/
8 
9 ;; example header:
10 #|
11 % 1. Title: Iris Plants Database
12 %
13 % 2. Sources:
14 % (a) Creator: R.A. Fisher
15 % (b) Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
16 % (c) Date: July, 1988
17 %
18 @RELATION iris
19 
20 @ATTRIBUTE sepallength NUMERIC
21 @ATTRIBUTE sepalwidth NUMERIC
22 @ATTRIBUTE petallength NUMERIC
23 @ATTRIBUTE petalwidth NUMERIC
24 @ATTRIBUTE class {Iris-setosa,Iris-versicolor,Iris-virginica}
25 |#
26 
27 ;; example data:
28 #|
29 @DATA
30 5.1,3.5,1.4,0.2,Iris-setosa
31 4.9,3.0,1.4,0.2,Iris-setosa
32 4.7,3.2,1.3,0.2,Iris-setosa
33 4.6,3.1,1.5,0.2,Iris-setosa
34 5.0,3.6,1.4,0.2,Iris-setosa
35 5.4,3.9,1.7,0.4,Iris-setosa
36 4.6,3.4,1.4,0.3,Iris-setosa
37 5.0,3.4,1.5,0.2,Iris-setosa
38 4.4,2.9,1.4,0.2,Iris-setosa
39 4.9,3.1,1.5,0.1,Iris-setosa
40 |#
41 
42 ;; example datasets:
43 #|
44 https://storm.cis.fordham.edu/~gweiss/data-mining/datasets.html
45 |#
46 ;;; Code:
47 (in-package :dat/arff)
48 
49 (defclass arff ()
50  ((arff-path :accessor arff-path
51  :initarg :arff-path
52  :initform "~/"
53  :documentation "A string to the path of the arff
54  file. e.g. /home/user/myData/foo.arff")
55  (arff-relation :accessor arff-relation
56  :initarg :arff-reltation
57  :initform ""
58  :documentation "The string after @relation. This is
59  essentially the name of the arff.")
60  (arff-attributes :accessor arff-attributes
61  :initarg :arff-attributes
62  :initform nil
63  :type list
64  :documentation "The attributes as specified in the
65  header. Each attribute is a list that looks as
66  follows: (\"attribute-name\" (\"type\")). In case of a
67  nominal attribute it looks like
68  this: (\"attribute-name\" (\"nominal\" . values)). ")
69  (arff-data :accessor arff-data
70  :initarg :arff-data
71  :initform nil
72  :type list
73  :documentation "All the data. The bulk of the file."))
74  (:documentation "An arff object contains all the data found in a
75  parsed arff file."))
76 
77 (defmethod print-object ((arff arff) stream)
78  (if *print-pretty*
79  (pprint-logical-block (stream nil)
80  (format stream "<arff ~a:~%attributes: ~{~%~a~^,~}~%data: ~{~%~a~}>"
81  (arff-relation arff) (arff-attributes arff) (arff-data arff)))
82  (format stream "<arff ~a>" (arff-relation arff))))
83 
84 (defun trim-comments-and-spaces (string &optional (comment-marker "%"))
85  (string-trim (list (code-char 9)) ;; tabs
86  (string-trim " " (subseq string 0 (search comment-marker string)))))
87 
88 (defun csv->list (string &optional (separator ","))
89  "Given a string like '1,2,3, 6, foo' will return list ('1' '2' '3'
90 '6' 'foo')"
91  (loop
92  with continue = t
93  with start = 0
94  while continue
95  for end = (search separator string :start2 start)
96  unless end
97  do (setf continue nil)
98  (setf end (length string))
99  collect (string-trim " " (subseq string start end))
100  do (setf start (+ end 1))))
101 
102 (defun string-replace (str1 sub1 sub2)
103  "Nondestructively replaces all occurences of sub1 in str1 by sub2"
104  (let ((str1 (string str1))
105  (str2 "")
106  (sub1 (string sub1))
107  (sub2 (string sub2))
108  (index1 0))
109  (loop
110  if (string-equal str1 sub1
111  :start1 index1
112  :end1 (min (length str1)
113  (+ index1 (length sub1))))
114  do (setq str2 (concatenate 'string str2 sub2))
115  (incf index1 (length sub1))
116  else do
117  (setq str2 (concatenate 'string str2
118  (subseq str1 index1 (1+ index1))))
119  (incf index1)
120  unless (< index1 (length str1))
121  return str2)))
122 
123 (defun search-space-or-tab (line)
124  (or (search " " line)
125  (search (list (code-char 9)) line)))
126 
127 (defun parse-attribute-name (line)
128  "Assumes the beginning of this line is the attribute-name. If spaces
129 are to be included in the name then the entire name must be quoted. As
130 second return value it also returns the rest of the line which should
131 be the datatype."
132  (setf line (string-replace line (string (code-char 9)) " "))
133  (if (and (search "'" line) ;; attribute name contains '
134  (or (not (search "{" line))
135  (< (search "'" line) (search "{" line))))
136  (values (string-replace (subseq line 1 (search "'" line :start2 1)) " " "-")
137  (trim-comments-and-spaces (subseq line (1+ (search "'" line :start2 1)))))
138  (values (subseq line 0 (search-space-or-tab line))
139  (trim-comments-and-spaces (subseq line (search-space-or-tab line))))))
140 
141 (defun parse-datatype (line)
142  "Assumes that the line starts with the datatype.Look at
143 http://www.cs.waikato.ac.nz/~ml/weka/arff.html for information about
144 the datatype. There is no support for the date datatype."
145  (cond ((equal 0 (search "real" line :test #'string-equal))
146  (list "real"))
147  ((equal 0 (search "integer" line :test #'string-equal))
148  (list "integer"))
149  ((equal 0 (search "numeric" line :test #'string-equal))
150  (list "numeric"))
151  ((equal 0 (search "string" line :test #'string-equal))
152  (list "string"))
153  ((search "{" line) ;; nominal
154  (cons "nominal"
155  (csv->list
156  (string-trim " " (subseq line (1+ (search "{" line)) (search "}" line))))))
157  (t
158  (error "datatype ~a not real, integer or nominal" line))))
159 
160 
161 (defun parse-@attribute (line)
162  "@attribute <attribute-name> <datatype>. Returns a list containing
163 the attribute-name and then a list containing datatype information as
164 parsed by parse-datatype."
165  (let (attribute data-type)
166  (multiple-value-setq (attribute data-type)
167  (parse-attribute-name (subseq line (1+ (search " " line)))))
168  (list attribute (parse-datatype data-type))))
169 
170 (defun parse-data (line)
171  (csv->list line))
172 
173 
174 ;; -----------------------------------------
175 ;; Main function used to parse an arff file.
176 ;; -----------------------------------------
177 (defun parse-arff-stream (stream &optional path)
178  (let ((arff (make-instance 'arff :arff-path path)))
179  (loop
180  with data-mode-p = nil ;; true when parsing data
181  for line = (read-line stream nil)
182  while line
183  for trimmed-line = (trim-comments-and-spaces line)
184  when (and data-mode-p
185  (not (equalp trimmed-line "")))
186  collect (parse-data line) into data
187  when (not data-mode-p)
188  do (cond ((equalp trimmed-line "")) ;; skip empty and commented lines
189  ((search "@relation" (string-downcase trimmed-line))
190  (setf (arff-relation arff)
191  (subseq trimmed-line (1+ (search " " trimmed-line)))))
192  ((search "@attribute" (string-downcase trimmed-line))
193  (setf (arff-attributes arff)
194  (append (arff-attributes arff)
195  (list (parse-@attribute trimmed-line)))))
196  ((search "@data" (string-downcase trimmed-line))
197  (setf data-mode-p t)))
198  finally (setf (arff-data arff) data))
199  arff))
200 
201 (defun parse-arff-string (string &optional path)
202  (with-input-from-string (s string)
203  (parse-arff-stream s path)))
204 
205 (defun parse-arff (arff-path)
206  "The arff-path should be a string pointing to an arff-file."
207  (parse-arff-stream (open arff-path) arff-path))
208 
209 
210 (defgeneric remove-attribute-by-name (arff name)
211  (:documentation "Removes the feature with the given name from the
212  arff object (not from the actual file). It will remove it both from
213  that @attributes and the @data."))
214 
215 (defmethod remove-attribute-by-name ((arff arff) (name string))
216  (let ((position (position name (arff-attributes arff) :key #'first :test #'string-equal)))
217  (when position
218  (setf (arff-attributes arff)
219  (delete name (arff-attributes arff) :key #'first :test #'string-equal))
220  (setf (arff-data arff)
221  (loop for instance in (arff-data arff)
222  collect (delete-if (nth position instance) instance))))))
223 
224 (defmethod remove-attribute-by-name ((arff arff) name)
225  (remove-attribute-by-name arff (format nil "~a" name)))