changelog shortlog graph tags branches changeset files revisions annotate raw help

Mercurial > core / lisp/lib/dat/csv.lisp

changeset 580: 571685ae64f1
parent: c0fc6b87557f
child: d3e2829521a3
author: Richard Westhaver <ellis@rwest.io>
date: Mon, 05 Aug 2024 21:57:13 -0400
permissions: -rw-r--r--
description: queries, cli fixes, dat/csv, emacs org-columns
1 ;;; lib/dat/csv.lisp --- CSV Data Format
2 
3 ;; Comma Separated Values (or tabs or whatever)
4 
5 ;;; Commentary:
6 
7 ;; This package prioritizes flexibility. If you want speed, convert to
8 ;; parquet.
9 
10 ;; Still, efficiency is worth pursuing here and there are some obvious gaps to
11 ;; remedy.
12 
13 ;; - remove sequence functions
14 ;; - research optimized access patterns used in other langs/state of art
15 ;; - buffered reads
16 ;; - multithreading
17 
18 ;;; Code:
19 (in-package :dat/csv)
20 
21 (defun parse-number-no-error (string &optional default)
22  (let ((result
23  (ignore-errors
24  (parse-number string))))
25  (if result
26  result
27  default)))
28 
29 (defparameter *csv-separator* #\,)
30 (defparameter *csv-quote* #\")
31 (defparameter *csv-print-quote-p* t "print \" when the element is a string?")
32 (defparameter *csv-default-external-format* :utf-8)
33 
34 (defun write-csv-line (record &key stream (delimiter *csv-separator*))
35  "Accept a record and print it in one line as a csv record.
36 
37 A record is a sequence of element. A element can be of any type.
38 If record is nil, nothing will be printed.
39 If stream is nil (default case), it will return a string, otherwise it will return nil.
40 For efficiency reason, no intermediate string will be constructed. "
41  (let ((result
42  (with-output-to-string (s)
43  (let ((*standard-output* s)
44  (record-size (length record)))
45  (loop for e across record
46  for i from 0
47  do (typecase e
48  (string (progn
49  (if *csv-print-quote-p*
50  (progn
51  (write-char *csv-quote*)
52  (write-string e)
53  (write-char *csv-quote*))
54  (write-string e))))
55  (t (princ e)))
56  (when (< i (1- record-size))
57  (write-char delimiter)))))))
58  (format stream "~&~a" result)))
59 
60 (defun write-csv-stream (stream table &key (delimiter *csv-separator*))
61  "Accept a stream and a table and output the table as csv form to the stream.
62 
63 A table is a sequence of lines. A line is a sequence of elements.
64 Elements can be any types"
65  (loop for l across table
66  do (write-csv-line l :stream stream :delimiter delimiter))
67  (write-char #\newline stream)
68  nil)
69 
70 (defun write-csv-file (filename table &key (external-format *csv-default-external-format*) (delimiter *csv-separator*))
71  "Accept a filename and a table and output the table as csv form to the file.
72 
73 A table is a sequence of lines. A line is a sequence of elements.
74 Elements can be any types"
75  (with-open-file (f filename :direction :output
76  :if-does-not-exist :create
77  :if-exists :supersede
78  :external-format external-format)
79  (write-csv-stream f table :delimiter delimiter)))
80 
81 (defun parse-csv-string (str &key (delimiter *csv-separator*)) ;; refer RFC4180
82  (coerce
83  ;; (regexp:split-re "," str)
84  (let ((q-count (count *csv-quote* str :test #'char-equal)))
85  (when (oddp q-count) (warn 'simple-warning :format-control "odd number of #\" in a line (~A)"
86  :format-arguments (list q-count)))
87  (if (zerop q-count)
88  (cl-ppcre:split delimiter str) ;(cl-ppcre:split *csv-separator* str)
89  (macrolet ((push-f (fld flds) `(push (coerce (reverse ,fld) 'string) ,flds)))
90  (loop with state = :at-first ;; :at-first | :data-nq | :data-q | :q-in-nq | q-in-q
91  with field with fields
92  for chr of-type character across str
93  do (cond ((eq state :at-first)
94  (setf field nil)
95  (cond ((char-equal chr *csv-quote*) (setf state :data-q))
96  ((char-equal chr delimiter) (push "" fields))
97  (t (setf state :data-nq) (push chr field))))
98  ((eq state :data-nq)
99  (cond ((char-equal chr *csv-quote*) (setf state :q-in-nq))
100  ((char-equal chr delimiter)
101  (push-f field fields)
102  (setf state :at-first))
103  (t (push chr field))))
104  ((eq state :q-in-nq)
105  (cond ((char-equal chr *csv-quote*) (setf state :q-in-q))
106  ((char-equal chr delimiter)
107  (push-f field fields)
108  (setf state :at-first))
109  (t (setf state :data-nq) (push chr field))))
110  ((eq state :data-q)
111  (cond ((char-equal chr *csv-quote*) (setf state :q-in-q))
112  ((char-equal chr delimiter) (push-f field fields) (setf state :at-first))
113  (t (push chr field))))
114  ((eq state :q-in-q)
115  (cond ((char-equal chr *csv-quote*) (setf state :data-q))
116  ;; this should only be done conditionally - early escapes quotes
117  ((char-equal chr delimiter)
118  (push-f field fields)
119  (setf state :at-first))
120  (t
121  ;; (error "illegal value ( ~A ) after quotation" chr)
122  (push chr field)
123  ))))
124  finally (return
125  (progn (push-f field fields) (reverse fields)))))))
126  'vector))
127 
128 
129 (defun read-csv-line (stream &key type-conv-fns map-fns (delimiter *csv-separator*) (start 0) end)
130  "Read one line from stream and return a csv record.
131 
132 A CSV record is a vector of elements.
133 
134 type-conv-fns should be a list of functions.
135 If type-conv-fns is nil (the default case), then all will be treated
136 as string.
137 
138 map-fns is a list of functions of one argument and output one result.
139 each function in it will be applied to the parsed element.
140 If map-fns is nil, then nothing will be applied.
141 
142 start and end specifies how many elements per record will be included.
143 If start or end is negative, it counts from the end. -1 is the last element.
144 "
145  (declare (type (or (simple-array function *) null) type-conv-fns map-fns))
146  (let* ((rline (read-line stream nil nil)))
147  (when rline
148  (let* ((line (string-trim '(#\Space #\Newline #\Return) rline))
149  (strs (parse-csv-string line :delimiter delimiter))
150  (strs-size (length strs)))
151  (when (< start 0)
152  (setf start (+ start strs-size)))
153  (when (and end (< end 0))
154  (setf end (+ end strs-size)))
155  (setf strs (subseq strs start end))
156  (when type-conv-fns
157  (unless (= (length strs) (length type-conv-fns))
158  (error "Number of type specifier (~a) does not match the number of elements (~a)."
159  (length type-conv-fns) (length strs))))
160  (when map-fns
161  (unless (= (length strs) (length map-fns))
162  (error "Number of mapping functions (~a) does not match the number of elements (~a)."
163  (length map-fns) (length strs))))
164  (let ((result strs))
165  ;; strs is not needed so we simply overwrite it
166  (when type-conv-fns
167  (setf result
168  (map 'vector #'funcall type-conv-fns result)))
169  (when map-fns
170  (setf result
171  (map 'vector #'funcall map-fns result)))
172  result)))))
173 
174 (defun read-csv-stream (stream &key (header t) type-spec map-fns (delimiter *csv-separator*) (start 0) end)
175  "Read from stream until eof and return a csv table.
176 
177 A csv table is a vector of csv records.
178 A csv record is a vector of elements.
179 
180 Type spec should be a list of type specifier (symbols).
181 If the type specifier is nil or t, it will be treated as string.
182 If type-spec is nil (the default case), then all will be treated
183 as string.
184 
185 map-fns is a list of functions of one argument and output one result.
186 each function in it will be applied to the parsed element.
187 If any function in the list is nil or t, it equals to #'identity.
188 If map-fns is nil, then nothing will be applied.
189 
190 start and end specifies how many elements per record will be included.
191 If start or end is negative, it counts from the end. -1 is the last element.
192 "
193  (let ((type-conv-fns
194  (when type-spec
195  (macrolet ((make-num-specifier (specifier)
196  `(lambda (s) (let ((s (parse-number-no-error s s)))
197  (if (numberp s) (funcall ,specifier s) s)))))
198  (map 'vector
199  (lambda (type)
200  (ecase type
201  ((t nil string) #'identity)
202  (number #'(lambda (s) (parse-number-no-error s s)))
203  (float (make-num-specifier #'float))
204  (single-float (make-num-specifier #'(lambda (s) (coerce s 'single-float))))
205  (double-float (make-num-specifier #'(lambda (s) (coerce s 'double-float))))
206  (integer (make-num-specifier #'round))
207  (pathname #'pathname)
208  (symbol #'intern)
209  (keyword (lambda (s) (intern s :keyword)))))
210  type-spec))))
211  (map-fns
212  (when map-fns
213  (map 'vector
214  (lambda (fn)
215  (cond ((or (eq fn t)
216  (eq fn nil))
217  #'identity)
218  ((functionp fn)
219  fn)
220  ((and (symbolp fn)
221  (not (keywordp fn)))
222  (symbol-function fn))
223  (t (error "~a is not a valid function specifier." fn))))
224  map-fns)))
225  (header
226  (etypecase header
227  (cons (coerce header 'vector))
228  (boolean (when header
229  (read-csv-line stream :delimiter delimiter))))))
230  (loop for rec = (read-csv-line stream :type-conv-fns type-conv-fns :map-fns map-fns :delimiter delimiter
231  :start start :end end)
232  while rec
233  collect rec into result
234  finally (return
235  (values
236  (coerce result 'vector)
237  header)))))
238 
239 (defun read-csv-file (filename &key (header t)
240  type-spec
241  map-fns
242  (delimiter *csv-separator*)
243  (external-format *csv-default-external-format*)
244  (start 0)
245  end)
246  "Read from stream until eof and return a csv table.
247 
248 A csv table is a vector of csv records.
249 A csv record is a vector of elements.
250 
251 Type spec should be a list of type specifier (symbols).
252 If the type specifier is nil or t, it will be treated as string.
253 If type-spec is nil (the default case), then all will be treated
254 as string.
255 
256 map-fns is a list of functions of one argument and output one result.
257 each function in it will be applied to the parsed element.
258 If any function in the list is nil or t, it equals to #'identity.
259 If map-fns is nil, then nothing will be applied.
260 
261 external-format (default is :UTF-8)
262 
263 start and end specifies how many elements per record will be included.
264 If start or end is negative, it counts from the end. -1 is the last element.
265 "
266  (with-open-file (f filename :external-format external-format)
267  (read-csv-stream f :type-spec type-spec :map-fns map-fns
268  :delimiter delimiter
269  :start start :end end
270  :header header)))
271 
272 
273 (defun read-csv-file-and-sort (filename sort-order &key (header t) (order :ascend) type-spec map-fns (delimiter *csv-separator*) (external-format *csv-default-external-format*))
274  (let ((table (read-csv-file filename
275  :header header
276  :type-spec type-spec
277  :map-fns map-fns
278  :delimiter delimiter
279  :external-format external-format)))
280  (loop for i in (reverse sort-order)
281  do (setf table
282  (stable-sort table (ecase order (:ascend #'string<=) (:descend #'string>=))
283  :key (lambda (rec) (aref rec i))))
284  finally (return table))))
285 
286 (defclass csv-file-data (file-data-source) ())
287 
288 ;; TODO 2024-08-05:
289 (defmethod scan-data ((self csv-file-data) (projection sequence))
290  (if (null projection)
291  (read-csv-file (file-data-path self))
292  (nyi!)))