forked from tpapp/data-omnivore
-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathdelimited-text.lisp
More file actions
111 lines (99 loc) · 4.85 KB
/
delimited-text.lisp
File metadata and controls
111 lines (99 loc) · 4.85 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
;;; -*- Mode: LISP; Syntax: Ansi-Common-Lisp; Base: 10; Package: DFIO -*-
;;; Copyright (c) 2021-2022,2026 Symbolics Pte. Ltd. All rights reserved.
(in-package #:dfio)
(defun csv-to-data-columns (source skip-first-row? &key map-alist)
  "Read CSV rows from SOURCE and accumulate the values column-wise in DATA-COLUMNs.

SOURCE is opened with WITH-INPUT-STREAM and read one row at a time with FARE-CSV:READ-CSV-LINE.  One DATA-COLUMN, created with MAP-ALIST, is made for each field of the first row read; every later row is asserted to have the same number of fields.

When SKIP-FIRST-ROW? is NIL, the first row is not treated as data: it is set aside and returned as the second value (a list of strings).  When SKIP-FIRST-ROW? is non-NIL, every row, including the first, is added to the columns and the second value is NIL.

Reading continues until end of input.  A NIL row (which FARE-CSV:READ-CSV-LINE is expected to return for a blank line) is skipped rather than taken as the end of the data.

Returns two values: the list of DATA-COLUMNs, and the first row or NIL."
  (let ((data-columns nil)
        ;; Doubles as a flag: while it is NIL, the next row read is the
        ;; header row.  Starting it non-NIL makes the first row count as data.
        (first-row skip-first-row?))
    (with-input-stream (s source)
      ;; Detect end of input explicitly instead of relying on a NIL row, so
      ;; that a blank line in the middle of the input cannot silently drop
      ;; all of the rows that follow it.
      (loop while (peek-char nil s nil nil)
            do (let ((row (fare-csv:read-csv-line s)))
                 (when row
                   (if data-columns
                       (assert (length= data-columns row))
                       ;; First non-empty row fixes the number of columns.
                       (setf data-columns
                             (loop repeat (length row)
                                   collect (data-column :map-alist map-alist))))
                   (if first-row
                       (mapc #'data-column-add data-columns row)
                       (setf first-row row))))))
    (values data-columns (unless skip-first-row? first-row))))
(defun read-csv (source
                 &key
                   (skip-first-row? nil)
                   (column-keys-or-function #'string-to-symbol)
                   (package nil)
                   (map-alist '(("" . :na)
                                ("NA" . :na))))
  "Read CSV data from SOURCE into a DATA-FRAME, which is returned.

SOURCE is handed to CSV-TO-DATA-COLUMNS together with SKIP-FIRST-ROW? and MAP-ALIST.

When SKIP-FIRST-ROW? is NIL (the default), the first row is held back as a header row; if COLUMN-KEYS-OR-FUNCTION is a function, it is called on each header string to form the column keys.  When SKIP-FIRST-ROW? is non-NIL, the first row is treated as data, so COLUMN-KEYS-OR-FUNCTION must be a sequence of keys.

When COLUMN-KEYS-OR-FUNCTION is a sequence (list or vector), its elements are used as the column keys regardless of the value of SKIP-FIRST-ROW?; its length must equal the number of columns.

PACKAGE indicates the package that is current (bound to *PACKAGE*) while column keys are generated.  It may be NIL (keep the current package), a package object, or a string designator; a designator is upcased and the package is created with MAKE-PACKAGE if it does not exist yet.

MAP-ALIST maps values during the import.  This is useful if you want special mappings for missing values, though the mechanism is general.

Signals an error when the number of keys does not match the number of columns, or when no column keys can be generated."
  (let+ (((&values data-columns first-row)
          (csv-to-data-columns source skip-first-row? :map-alist map-alist))
         (*package* (cond
                      ((not package) *package*)
                      ;; STRING-UPCASE rejects package objects; use them as is.
                      ((packagep package) package)
                      (t (let ((name (string-upcase package)))
                           (or (find-package name)
                               (make-package name))))))
         (column-keys (cond
                        ((and first-row (functionp column-keys-or-function))
                         (mapcar column-keys-or-function first-row))
                        ((typep column-keys-or-function 'sequence)
                         (assert (length= data-columns column-keys-or-function) ()
                                 "The length of column keys ~A does not match the number of columns ~A."
                                 column-keys-or-function (length data-columns))
                         ;; MAPCAR below accepts lists only, so a vector of
                         ;; keys has to be converted first.
                         (coerce column-keys-or-function 'list))
                        (t (error "Could not generate column keys.")))))
    (data-frame:alist-df
     (mapcar (lambda (column-key data-column)
               (cons column-key (data-column-vector data-column)))
             column-keys data-columns))))
(defun write-csv (df stream
                  &key
                    (add-first-row nil)
                    ((:separator separator) fare-csv:*separator*)
                    ((:quote quote) fare-csv:*quote*)
                    ((:eol eol) fare-csv:+LF+))
  "Write the data frame DF as CSV through WITH-CSV-OUTPUT-STREAM on STREAM.

STREAM is intended to be one of:
  - a stream (written to directly)
  - a pathname (opened for output)
  - NIL (the output is collected and a string is returned)

ADD-FIRST-ROW, when non-NIL, writes the column keys of DF as a header row before the data.  SEPARATOR, QUOTE and EOL are bound to FARE-CSV:*SEPARATOR*, FARE-CSV:*QUOTE* and FARE-CSV:*EOL* for the duration of the call.

Rows are written one at a time; no 2-D array of the data is built.  The number of rows is the LENGTH of the first column, so a fill-pointer on an adjustable vector is honoured, and every column is asserted to have that same length."
  (let ((fare-csv:*separator* separator)
        (fare-csv:*quote* quote)
        (fare-csv:*eol* eol))
    (with-csv-output-stream (s stream)
      (let* ((column-names (df:keys df)) ; indexed with AREF below
             (column-count (length column-names))
             (columns (make-array column-count))
             (row-count 0))
        ;; Look every column up once and keep it for the row loop.
        (dotimes (index column-count)
          (setf (aref columns index)
                (df:column df (aref column-names index))))
        ;; The first column decides how many rows are written.
        (unless (zerop column-count)
          (setf row-count (length (aref columns 0))))
        ;; Refuse ragged data before anything is written.
        (dotimes (index column-count)
          (assert (= (length (aref columns index)) row-count) ()
                  "Columns don't have the same effective length."))
        ;; Header row, on request.
        (when add-first-row
          (fare-csv:write-csv-line (coerce column-names 'list) s))
        ;; One CSV line per row, built from the cached columns.
        (dotimes (row-index row-count)
          (fare-csv:write-csv-line
           (map 'list
                (lambda (column)
                  (if (arrayp column)
                      (aref column row-index)
                      (elt column row-index)))
                columns)
           s)))
      ;; With no destination given, hand back the collected text.
      (unless stream
        (get-output-stream-string s)))))