-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathcsv_model.py
283 lines (227 loc) · 8.83 KB
/
csv_model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
import csv
import itertools
import datetime
import dateutil.parser
class CSVRow(object):
    """Immutable, positionally indexed view of one CSV record."""

    def __init__(self, row):
        # Freeze the cells so the row cannot be mutated after construction.
        self.data = tuple(row)

    def __iter__(self):
        """Iterate over the cells in column order."""
        return iter(self.data)

    def __eq__(self, other):
        """Compare cell-by-cell against any iterable; non-iterables are unequal."""
        if not hasattr(other, '__iter__'):
            return False
        return self.data == tuple(other)

    def __len__(self):
        """Return the number of cells in the row."""
        return len(self.data)

    def _getslice(self, start, end):
        """Build a slice over this row, replacing ``None`` bounds with 0 / len."""
        lower = 0 if start is None else start
        upper = len(self) if end is None else end
        return slice(lower, upper)

    def __getitem__(self, idx):
        """Return the cell (int index) or tuple of cells (slice) at ``idx``.

        Raises:
            TypeError: if ``idx`` is neither an int nor a slice.
            IndexError: if an int index is out of range.
        """
        if not isinstance(idx, (int, slice)):
            raise TypeError('CSVRow can only be indexed with integers or slices, not {}'.format(type(idx).__name__))
        try:
            return self.data[idx]
        except IndexError:
            raise IndexError('CSVRow index out of range')

    def cast(self, filters):
        """Return a new row with one cast applied per cell.

        An empty or ``None`` ``filters`` returns this same row object.
        """
        return CSVRow(self._cast_row(filters)) if filters else self

    def _cast_row(self, filters):
        """Apply ``filters[i]`` to cell ``i`` and return the results as a list.

        Raises:
            ValueError: if the number of filters differs from the row length.
        """
        if len(filters) != len(self):
            raise ValueError('number of filters does not match number of CSV columns; {} != {}'.format(len(filters), len(self)))
        converted = []
        for convert, cell in zip(filters, self):
            converted.append(convert(cell))
        return converted

    def __str__(self):
        """Render the row as the string form of its underlying tuple."""
        return str(self.data)
class CSVDictRow(CSVRow):
    """A CSV row whose cells can be addressed by position or by field name.

    Slices may mix integers and field names. A field name used as the
    *stop* of a slice is inclusive: ``row['a':'c']`` includes column ``c``.
    """

    def __init__(self, fieldnames, row):
        """Bind ``fieldnames`` to the cells of ``row``.

        Raises:
            ValueError: if ``fieldnames`` and ``row`` differ in length.
        """
        if len(fieldnames) != len(row):
            raise ValueError("number of fields should match number of columns")
        super().__init__(row)
        # Field name -> column position; a repeated name maps to its last
        # occurrence.
        self._idx_map = {field: idx for idx, field in enumerate(fieldnames)}
        self.fieldnames = fieldnames

    def _getindex(self, i):
        """Translate a field name to its column position; ints pass through.

        Raises:
            KeyError: if ``i`` is not an int and not a known field name.
        """
        if isinstance(i, int):
            return i
        return self._idx_map[i]

    def _getslice(self, start, end, step=None):
        """Resolve field names in slice bounds and return a positional slice.

        For a forward (or step-less) slice, omitted bounds become ``0`` and
        ``len(self)``. For a negative step they are left as ``None`` so that
        the usual "from the far end" defaults apply and ``row[::-1]``
        reverses the row instead of returning nothing.

        Raises:
            KeyError: if ``start`` or ``end`` is an unknown field name.
        """
        descending = isinstance(step, int) and step < 0
        if start is None:
            start = None if descending else 0
        else:
            start = self._getindex(start)
        if end is None:
            end = None if descending else len(self)
        elif isinstance(end, str):
            # A named stop is inclusive. Look it up through _getindex so an
            # unknown name raises KeyError, exactly like an unknown start.
            # NOTE(review): the +1 adjustment assumes a forward slice.
            end = self._getindex(end) + 1
        return slice(start, end, step)

    def __getitem__(self, idx):
        """Return a cell by int index or field name, or a tuple for a slice.

        Raises:
            KeyError: if a field name is unknown.
            IndexError: if an int index is out of range.
            TypeError: if ``idx`` is of an unsupported type, or a slice step
                is neither ``None`` nor an int.
        """
        if isinstance(idx, str):
            idx = self._getindex(idx)
        if isinstance(idx, int):
            try:
                return self.data[idx]
            except IndexError:
                raise IndexError('CSVDictRow index out of range')
        if not isinstance(idx, slice):
            msg = ('CSVDictRow can only be indexed with integers, fieldnames, '
                   'or slices, not {}').format(type(idx).__name__)
            raise TypeError(msg)
        if idx.step is not None and not isinstance(idx.step, int):
            raise TypeError('slice indices must be integers or None or have an __index__ method')
        return self.data[self._getslice(idx.start, idx.stop, idx.step)]

    def cast(self, filters):
        """Return a new CSVDictRow with one cast applied per cell.

        An empty or ``None`` ``filters`` returns this same row object.
        """
        if not filters:
            return self
        return CSVDictRow(self.fieldnames, self._cast_row(filters))
class CSVColumn(object):
    """Immutable view of one CSV column, optionally labelled with a field name."""

    def __init__(self, col, name=None):
        # Freeze the values so the column cannot be mutated after construction.
        self.data = tuple(col)
        self.fieldname = name

    def __getitem__(self, idx):
        """Return the value (int index) or tuple of values (slice) at ``idx``."""
        return self.data[idx]

    def __iter__(self):
        """Iterate over the values from the first row to the last."""
        return iter(self.data)

    def __eq__(self, other):
        """Compare value-by-value against any iterable; non-iterables are unequal."""
        return hasattr(other, '__iter__') and self.data == tuple(other)

    def __len__(self):
        """Return the number of values in the column."""
        return len(self.data)

    def cast(self, filter_func):
        """Return a new column with ``filter_func`` applied to every value.

        The field name is carried over. The cast is applied with ``map`` so
        that every value is converted and kept; the builtin ``filter`` would
        instead drop values whose cast result is falsy and leave the rest
        unconverted.
        """
        return CSVColumn(map(filter_func, self.data), name=self.fieldname)

    def __str__(self):
        """Render as ``'<fieldname>: <tuple>'``, or just the tuple when unnamed."""
        if self.fieldname:
            return '{}: {}'.format(self.fieldname, self.data)
        return str(self.data)
def cast_to_bool(s=None):
    """Convert ``s`` to a bool.

    ``None`` (or no argument) yields ``False``. Strings are matched
    case-insensitively against ``true/yes/y`` and ``false/no/n``. Any other
    type is converted with ``bool()``.

    Raises:
        ValueError: if ``s`` is a string that is not one of the accepted words.
    """
    if s is None:
        return False
    if not isinstance(s, str):
        return bool(s)
    lowered = s.lower()
    if lowered in ('true', 'yes', 'y'):
        return True
    if lowered in ('false', 'no', 'n'):
        return False
    raise ValueError()
def cast_to_date(d=None, parserinfo=None, **kwargs):
    """Convert ``d`` to a date/datetime.

    * ``None`` (or no argument) is treated as the integer timestamp ``0``.
    * ``datetime.date`` instances (which include ``datetime.datetime``) are
      returned unchanged.
    * Strings are handed to ``dateutil.parser.parse`` together with
      ``parserinfo`` and any extra keyword arguments.
    * Integers are interpreted via ``datetime.datetime.fromtimestamp``.

    Raises:
        ValueError: if ``d`` is of any other type.
    """
    value = 0 if d is None else d
    if isinstance(value, datetime.date):
        return value
    if isinstance(value, str):
        return dateutil.parser.parse(value, parserinfo=parserinfo, **kwargs)
    if not isinstance(value, int):
        raise ValueError()
    return datetime.datetime.fromtimestamp(value)
class CSVModel:
    """In-memory table of typed CSV rows with row- and column-wise access.

    Every row is padded to the width of the longest row and each cell is
    converted with the cast registered for its column in ``types``.
    """

    def __init__(self, rows, types=None):
        """Build the model from an iterable of rows.

        Args:
            rows: iterable of sized, indexable rows (they may differ in length).
            types: optional sequence with one cast callable per column. When
                omitted, a cast is inferred per column (see ``_infer_types``).

        Raises:
            ValueError: if ``types`` is given and its length differs from the
                number of columns.
        """
        rows = tuple(rows)
        # default=0 lets an empty input build an empty model instead of
        # failing inside max().
        max_len = max(map(len, rows), default=0)
        if not rows and types is not None:
            # With no rows to measure, the supplied types define the width.
            max_len = len(types)
        if types is None:
            types = self._infer_types(rows, max_len)
        elif len(types) != max_len:
            raise ValueError(('number of given types ({}) should match '
                              'number of columns ({})!').format(len(types), max_len))
        # Short rows are padded with each cast's zero-argument default.
        self._rows = tuple(
            self._init_row([t(row[i]) if i < len(row) else t()
                            for i, t in enumerate(types)])
            for row in rows
        )
        # Columns are derived from the converted rows, so _rows must be set
        # before this line.
        self._cols = tuple(self._init_col(i) for i in range(max_len))
        self.num_cols = max_len
        self.num_rows = len(rows)
        self.types = types

    @staticmethod
    def _infer_types(rows, num_cols):
        """Pick, per column, the first cast that accepts every present value.

        Candidates are tried in the order ``int``, ``float``,
        ``cast_to_bool``, ``cast_to_date``; ``str`` is the fallback.
        """
        candidates = (int, float, cast_to_bool, cast_to_date)
        types = [str] * num_cols
        for i in range(num_cols):
            column = [row[i] for row in rows if i < len(row)]
            for candidate in candidates:
                try:
                    for value in column:
                        candidate(value)
                except (ValueError, TypeError, OverflowError):
                    # TypeError/OverflowError occur when values are already
                    # typed (e.g. int(datetime) or int(float('inf'))); treat
                    # them as "this cast does not fit" like ValueError.
                    continue
                types[i] = candidate
                break
        return types

    def _init_row(self, row):
        """Wrap a list of converted cells in the row type of this model."""
        return CSVRow(row)

    def _init_col(self, col_num):
        """Build the column object for position ``col_num`` from the stored rows."""
        return CSVColumn(row[col_num] for row in self._rows)

    def cast(self, filters):
        """Return a new CSVModel with ``filters`` applied as the column types."""
        return CSVModel(self._rows, types=tuple(filters))

    def cast_range(self, filters, start=None, end=None):
        """Re-cast only the columns in ``[start:end]``, keeping the other types.

        Returns ``None`` when the model has no rows.

        Raises:
            ValueError: if the number of filters differs from the number of
                columns selected by the range.
        """
        if not self._rows:
            return
        filterlen = len(filters)
        # The first row is used to resolve the range to concrete positions.
        csvrow = self._rows[0]
        rangelen = len(csvrow[start:end])
        if rangelen != filterlen:
            raise ValueError('Number of filters ({}) should match number of columns ({})!'.format(filterlen, rangelen))
        new_filters = list(self.types)
        new_filters[csvrow._getslice(start, end)] = filters
        return self.cast(new_filters)

    def __iter__(self):
        """Iterate over the rows."""
        return iter(self._rows)

    def __len__(self):
        """Return the number of rows."""
        return self.num_rows

    def __reversed__(self):
        """Iterate over the rows from last to first."""
        return reversed(self._rows)

    def __str__(self):
        """Render one row per line."""
        return '\n'.join(map(str, self._rows))

    def rows(self):
        """Return the tuple of rows."""
        return self._rows

    def row_slice(self, start=None, end=None):
        """Return a new CSVModel holding rows ``[start:end]`` with the same types."""
        return CSVModel(self._rows[start:end], types=self.types)

    def iterrows(self):
        """Iterate over the rows."""
        return iter(self._rows)

    def cols(self):
        """Return the tuple of columns."""
        return self._cols

    def col_slice(self, start=None, end=None):
        """Return a new CSVModel holding columns ``[start:end]`` and their types."""
        return CSVModel((row[start:end] for row in self._rows), types=self.types[start:end])

    def itercols(self):
        """Iterate over the columns."""
        return iter(self._cols)

    @classmethod
    def from_file(cls, filename, types=None):
        """Read ``filename`` as CSV and build a model from its records.

        The raw strings are passed straight to the constructor, which applies
        ``types`` (or infers them) exactly once and validates their count.
        """
        # newline='' is what the csv module requires so that newlines embedded
        # in quoted fields are parsed correctly.
        with open(filename, newline='') as csvfile:
            # The constructor materialises the reader, so it must run while
            # the file is still open.
            return cls(csv.reader(csvfile), types=types)
class CSVDictModel(CSVModel):
    """A CSVModel whose columns are labelled with field names."""

    def __init__(self, fieldnames, rows, types=None):
        """Build the model from ``fieldnames`` and an iterable of rows.

        Raises:
            ValueError: if ``rows`` is empty.
        """
        # fieldnames must be set before the base constructor runs because
        # _init_row and _init_col read it.
        self.fieldnames = tuple(fieldnames)
        rows = tuple(rows)
        # Check up front so the caller gets this message rather than a failure
        # from inside the base constructor.
        if not rows:
            raise ValueError('rows cannot be empty!')
        super().__init__(rows, types=types)

    def __str__(self):
        """Render the field names followed by one row per line."""
        return '\n'.join(map(str, itertools.chain([self.fieldnames], self._rows)))

    def _init_row(self, row):
        """Wrap a list of converted cells in a CSVDictRow bound to the field names."""
        return CSVDictRow(self.fieldnames, row)

    def _init_col(self, col_num):
        """Build the column for position ``col_num``, named after its field."""
        return CSVColumn((row[col_num] for row in self._rows), name=self.fieldnames[col_num])

    def cast(self, filters):
        """Return a new CSVDictModel with ``filters`` applied as the column types."""
        return CSVDictModel(self.fieldnames, self._rows, types=tuple(filters))

    def row_slice(self, start=None, end=None):
        """Return a new CSVDictModel holding rows ``[start:end]``.

        The defaults mirror ``CSVModel.row_slice`` so both classes can be
        called the same way.
        """
        return CSVDictModel(self.fieldnames, self._rows[start:end], types=self.types)

    def col_slice(self, start=None, end=None):
        """Return a new CSVDictModel holding the columns selected by the range.

        ``start`` and ``end`` are resolved by the first row's ``_getslice``.
        The defaults mirror ``CSVModel.col_slice``.
        """
        s = self._rows[0]._getslice(start, end)
        return CSVDictModel(self.fieldnames[s], (row[s] for row in self._rows),
                            types=self.types[s])

    @classmethod
    def from_file(cls, filename, types=None):
        """Read ``filename`` as CSV, using its first record as the field names.

        The raw strings are passed straight to the constructor, which applies
        ``types`` (or infers them) exactly once and validates their count.

        Raises:
            ValueError: if the file has no data rows.
        """
        # newline='' is what the csv module requires so that newlines embedded
        # in quoted fields are parsed correctly.
        with open(filename, newline='') as csvfile:
            reader = csv.DictReader(csvfile)
            # fieldnames is None for an empty file; fall back to () so the
            # constructor reports the empty input with a ValueError.
            fieldnames = reader.fieldnames or ()
            rows = [tuple(row[field] for field in fieldnames) for row in reader]
        return cls(fieldnames, rows, types=types)