changelog shortlog graph tags branches changeset files revisions annotate raw help

Mercurial > core / lisp/lib/dat/csv.lisp

changeset 698: 96958d3eb5b0
parent: d3e2829521a3
author: Richard Westhaver <ellis@rwest.io>
date: Fri, 04 Oct 2024 22:04:59 -0400
permissions: -rw-r--r--
description: fixes
1 ;;; lib/dat/csv.lisp --- CSV Data Format
2 
3 ;; Comma Separated Values (or tabs or whatever)
4 
5 ;;; Commentary:
6 
7 ;; This package prioritizes flexibility. If you want speed, convert to
8 ;; parquet.
9 
10 ;; Still, efficiency is worth pursuing here and there are some obvious gaps to
11 ;; remedy.
12 
13 ;; - remove sequence functions
14 ;; - research optimized access patterns used in other langs/state of art
15 ;; - buffered reads
16 ;; - multithreading
17 
18 ;; ref: https://datatracker.ietf.org/doc/html/rfc4180
19 
20 ;;; Code:
21 (in-package :dat/csv)
22 
23 (defun parse-number-no-error (string &optional default)
24  (let ((result
25  (ignore-errors
26  (parse-number string))))
27  (if result
28  result
29  default)))
30 
31 (defparameter *csv-separator* #\,)
32 (defparameter *csv-quote* #\")
33 (defparameter *csv-print-quote-p* t "print \" when the element is a string?")
34 (defparameter *csv-default-external-format* :utf-8)
35 
36 (defun write-csv-line (record &key stream (delimiter *csv-separator*))
37  "Accept a record and print it in one line as a csv record.
38 
39 A record is a sequence of element. A element can be of any type.
40 If record is nil, nothing will be printed.
41 If stream is nil (default case), it will return a string, otherwise it will return nil.
42 For efficiency reason, no intermediate string will be constructed. "
43  (let ((result
44  (with-output-to-string (s)
45  (let ((*standard-output* s)
46  (record-size (length record)))
47  (loop for e across record
48  for i from 0
49  do (typecase e
50  (string (progn
51  (if *csv-print-quote-p*
52  (progn
53  (write-char *csv-quote*)
54  (write-string e)
55  (write-char *csv-quote*))
56  (write-string e))))
57  (t (princ e)))
58  (when (< i (1- record-size))
59  (write-char delimiter)))))))
60  (format stream "~&~a" result)))
61 
62 (defun write-csv-stream (stream table &key (delimiter *csv-separator*))
63  "Accept a stream and a table and output the table as csv form to the stream.
64 
65 A table is a sequence of lines. A line is a sequence of elements.
66 Elements can be any types"
67  (loop for l across table
68  do (write-csv-line l :stream stream :delimiter delimiter))
69  (write-char #\newline stream)
70  nil)
71 
72 (defun write-csv-file (filename table &key (external-format *csv-default-external-format*) (delimiter *csv-separator*))
73  "Accept a filename and a table and output the table as csv form to the file.
74 
75 A table is a sequence of lines. A line is a sequence of elements.
76 Elements can be any types"
77  (with-open-file (f filename :direction :output
78  :if-does-not-exist :create
79  :if-exists :supersede
80  :external-format external-format)
81  (write-csv-stream f table :delimiter delimiter)))
82 
83 (defun write-csv-string (table)
84  (with-output-to-string (str)
85  (write-csv-stream str table)))
86 
87 (defun parse-csv-string (str &key (delimiter *csv-separator*)) ;; refer RFC4180
88  (coerce
89  ;; (regexp:split-re "," str)
90  (let ((q-count (count *csv-quote* str :test #'char-equal)))
91  (when (oddp q-count) (warn 'simple-warning :format-control "odd number of #\" in a line (~A)"
92  :format-arguments (list q-count)))
93  (if (zerop q-count)
94  (cl-ppcre:split delimiter str) ;(cl-ppcre:split *csv-separator* str)
95  (macrolet ((push-f (fld flds) `(push (coerce (reverse ,fld) 'string) ,flds)))
96  (loop with state = :at-first ;; :at-first | :data-nq | :data-q | :q-in-nq | q-in-q
97  with field with fields
98  for chr of-type character across str
99  do (cond ((eq state :at-first)
100  (setf field nil)
101  (cond ((char-equal chr *csv-quote*) (setf state :data-q))
102  ((char-equal chr delimiter) (push "" fields))
103  (t (setf state :data-nq) (push chr field))))
104  ((eq state :data-nq)
105  (cond ((char-equal chr *csv-quote*) (setf state :q-in-nq))
106  ((char-equal chr delimiter)
107  (push-f field fields)
108  (setf state :at-first))
109  (t (push chr field))))
110  ((eq state :q-in-nq)
111  (cond ((char-equal chr *csv-quote*) (setf state :q-in-q))
112  ((char-equal chr delimiter)
113  (push-f field fields)
114  (setf state :at-first))
115  (t (setf state :data-nq) (push chr field))))
116  ((eq state :data-q)
117  (cond ((char-equal chr *csv-quote*) (setf state :q-in-q))
118  ((char-equal chr delimiter) (push-f field fields) (setf state :at-first))
119  (t (push chr field))))
120  ((eq state :q-in-q)
121  (cond ((char-equal chr *csv-quote*) (setf state :data-q))
122  ;; this should only be done conditionally - early escapes quotes
123  ((char-equal chr delimiter)
124  (push-f field fields)
125  (setf state :at-first))
126  (t
127  ;; (error "illegal value ( ~A ) after quotation" chr)
128  (push chr field)
129  ))))
130  finally (return
131  (progn (push-f field fields) (reverse fields)))))))
132  'vector))
133 
134 
135 (defun read-csv-line (stream &key type-conv-fns map-fns (delimiter *csv-separator*) (start 0) end)
136  "Read one line from stream and return a csv record.
137 
138 A CSV record is a vector of elements.
139 
140 type-conv-fns should be a list of functions.
141 If type-conv-fns is nil (the default case), then all will be treated
142 as string.
143 
144 map-fns is a list of functions of one argument and output one result.
145 each function in it will be applied to the parsed element.
146 If map-fns is nil, then nothing will be applied.
147 
148 start and end specifies how many elements per record will be included.
149 If start or end is negative, it counts from the end. -1 is the last element.
150 "
151  (declare (type (or (simple-array function *) null) type-conv-fns map-fns))
152  (let* ((rline (read-line stream nil nil)))
153  (when rline
154  (let* ((line (string-trim '(#\Space #\Newline #\Return) rline))
155  (strs (parse-csv-string line :delimiter delimiter))
156  (strs-size (length strs)))
157  (when (< start 0)
158  (setf start (+ start strs-size)))
159  (when (and end (< end 0))
160  (setf end (+ end strs-size)))
161  (setf strs (subseq strs start end))
162  (when type-conv-fns
163  (unless (= (length strs) (length type-conv-fns))
164  (error "Number of type specifier (~a) does not match the number of elements (~a)."
165  (length type-conv-fns) (length strs))))
166  (when map-fns
167  (unless (= (length strs) (length map-fns))
168  (error "Number of mapping functions (~a) does not match the number of elements (~a)."
169  (length map-fns) (length strs))))
170  (let ((result strs))
171  ;; strs is not needed so we simply overwrite it
172  (when type-conv-fns
173  (setf result
174  (map 'vector #'funcall type-conv-fns result)))
175  (when map-fns
176  (setf result
177  (map 'vector #'funcall map-fns result)))
178  result)))))
179 
180 (defun read-csv-stream (stream &key (header t) type-spec map-fns (delimiter *csv-separator*) (start 0) end)
181  "Read from stream until eof and return a csv table.
182 
183 A csv table is a vector of csv records.
184 A csv record is a vector of elements.
185 
186 Type spec should be a list of type specifier (symbols).
187 If the type specifier is nil or t, it will be treated as string.
188 If type-spec is nil (the default case), then all will be treated
189 as string.
190 
191 map-fns is a list of functions of one argument and output one result.
192 each function in it will be applied to the parsed element.
193 If any function in the list is nil or t, it equals to #'identity.
194 If map-fns is nil, then nothing will be applied.
195 
196 start and end specifies how many elements per record will be included.
197 If start or end is negative, it counts from the end. -1 is the last element.
198 "
199  (let ((type-conv-fns
200  (when type-spec
201  (macrolet ((make-num-specifier (specifier)
202  `(lambda (s) (let ((s (parse-number-no-error s s)))
203  (if (numberp s) (funcall ,specifier s) s)))))
204  (map 'vector
205  (lambda (type)
206  (ecase type
207  ((t nil string) #'identity)
208  (number #'(lambda (s) (parse-number-no-error s s)))
209  (float (make-num-specifier #'float))
210  (single-float (make-num-specifier #'(lambda (s) (coerce s 'single-float))))
211  (double-float (make-num-specifier #'(lambda (s) (coerce s 'double-float))))
212  (integer (make-num-specifier #'round))
213  (pathname #'pathname)
214  (symbol #'intern)
215  (keyword (lambda (s) (intern s :keyword)))))
216  type-spec))))
217  (map-fns
218  (when map-fns
219  (map 'vector
220  (lambda (fn)
221  (cond ((or (eq fn t)
222  (eq fn nil))
223  #'identity)
224  ((functionp fn)
225  fn)
226  ((and (symbolp fn)
227  (not (keywordp fn)))
228  (symbol-function fn))
229  (t (error "~a is not a valid function specifier." fn))))
230  map-fns)))
231  (header
232  (etypecase header
233  (cons (coerce header 'vector))
234  (boolean (when header
235  (read-csv-line stream :delimiter delimiter))))))
236  (loop for rec = (read-csv-line stream :type-conv-fns type-conv-fns :map-fns map-fns :delimiter delimiter
237  :start start :end end)
238  while rec
239  collect rec into result
240  finally (return
241  (values
242  (coerce result 'vector)
243  header)))))
244 
245 (defun read-csv-file (filename &key (header t)
246  type-spec
247  map-fns
248  (delimiter *csv-separator*)
249  (external-format *csv-default-external-format*)
250  (start 0)
251  end)
252  "Read from stream until eof and return a csv table.
253 
254 A csv table is a vector of csv records.
255 A csv record is a vector of elements.
256 
257 Type spec should be a list of type specifier (symbols).
258 If the type specifier is nil or t, it will be treated as string.
259 If type-spec is nil (the default case), then all will be treated
260 as string.
261 
262 map-fns is a list of functions of one argument and output one result.
263 each function in it will be applied to the parsed element.
264 If any function in the list is nil or t, it equals to #'identity.
265 If map-fns is nil, then nothing will be applied.
266 
267 external-format (default is :UTF-8)
268 
269 start and end specifies how many elements per record will be included.
270 If start or end is negative, it counts from the end. -1 is the last element.
271 "
272  (with-open-file (f filename :external-format external-format)
273  (read-csv-stream f :type-spec type-spec :map-fns map-fns
274  :delimiter delimiter
275  :start start :end end
276  :header header)))
277 
278 
279 (defun read-csv-file-and-sort (filename sort-order &key (header t) (order :ascend) type-spec map-fns (delimiter *csv-separator*) (external-format *csv-default-external-format*))
280  (let ((table (read-csv-file filename
281  :header header
282  :type-spec type-spec
283  :map-fns map-fns
284  :delimiter delimiter
285  :external-format external-format)))
286  (loop for i in (reverse sort-order)
287  do (setf table
288  (stable-sort table (ecase order (:ascend #'string<=) (:descend #'string>=))
289  :key (lambda (rec) (aref rec i))))
290  finally (return table))))
291 
292 (defclass csv-file-data (file-data-source) ())
293 
294 ;; TODO 2024-08-05:
295 (defmethod scan-data ((self csv-file-data) (projection sequence))
296  (if (null projection)
297  (read-csv-file (file-data-path self))
298  (nyi!)))