-
Notifications
You must be signed in to change notification settings - Fork 10
/
Copy path__init__.py
2039 lines (1655 loc) · 70.2 KB
/
__init__.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
# =================================================================
#
# Terms and Conditions of Use
#
# Unless otherwise noted, computer program source code of this
# distribution is covered under Crown Copyright, Government of
# Canada, and is distributed under the MIT License.
#
# The Canada wordmark and related graphics associated with this
# distribution are protected under trademark law and copyright law.
# No permission is granted to use them outside the parameters of
# the Government of Canada's corporate identity program. For
# more information, see
# http://www.tbs-sct.gc.ca/fip-pcim/index-eng.asp
#
# Copyright title to all 3rd party software distributed with this
# software is held by the respective copyright holders as noted in
# those files. Users are asked to read the 3rd Party Licenses
# referenced with those assets.
#
# Copyright (c) 2025 Government of Canada
#
# Permission is hereby granted, free of charge, to any person
# obtaining a copy of this software and associated documentation
# files (the "Software"), to deal in the Software without
# restriction, including without limitation the rights to use,
# copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following
# conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
# OTHER DEALINGS IN THE SOFTWARE.
#
# =================================================================
import csv
import json
import io
import os
import yaml
import re
import jsonschema
import logging
from io import StringIO
from datetime import datetime, time
from collections import OrderedDict
from woudc_extcsv.util import (parse_integer_range, _table_index,
non_content_line)
__version__ = '0.6.0'
# Paths to resource files shipped alongside this module.
__dirpath = os.path.dirname(os.path.realpath(__file__))
WDR_TABLE_SCHEMA = os.path.join(__dirpath, 'tables-schema.json')
WDR_TABLE_CONFIG = os.path.join(__dirpath, 'tables-backfilling.yml')
WDR_ERROR_CONFIG = os.path.join(__dirpath, 'errors-backfilling.csv')
LOGGER = logging.getLogger(__name__)
# JSON Schema used below to validate the table definition file.
with open(WDR_TABLE_SCHEMA) as table_schema_file:
    table_schema = json.load(table_schema_file)
# Table definitions describing the expected Extended CSV tables/fields.
with open(WDR_TABLE_CONFIG) as table_definitions:
    DOMAINS = yaml.safe_load(table_definitions)
# Error definitions: maps numeric code -> [severity, message template]
# (severity string 'Error' marks a fatal message; see _add_to_report).
with open(WDR_ERROR_CONFIG) as error_definitions:
    reader = csv.reader(error_definitions, escapechar='\\')
    next(reader)  # Skip header line.
    ERRORS = OrderedDict()
    for row in reader:
        error_code = int(row[0])
        ERRORS[error_code] = row[1:3]
# Fail fast at import time if the table definitions are unusable.
try:
    jsonschema.validate(DOMAINS, table_schema)
except jsonschema.SchemaError as err:
    LOGGER.critical('Failed to read table definition schema:'
                    ' cannot process incoming files')
    raise err
except jsonschema.ValidationError as err:
    LOGGER.critical('Failed to read table definition file:'
                    ' cannot process incoming files')
    raise err
def dump(extcsv_obj, filename):
    """
    Dump Reader Extended CSV object to file

    :param extcsv_obj: Reader Extended CSV object
    :param filename: filename
    :returns: void, writes file to disk
    :raises RuntimeError: propagated from _dump if the object fails
        validation or cannot be serialized
    """
    LOGGER.info('Dumping Extended CSV object to file: %s' % filename)
    # Serialize before opening the file: the original opened (and thus
    # truncated/created) the output file first, leaving an empty file
    # behind whenever validation or serialization failed.
    serialized = _dump(extcsv_obj)
    with open(filename, 'w') as ff:
        ff.write(serialized)
def dumps(extcsv_obj):
    """
    Dump Writer Extended CSV object to string representation

    :param extcsv_obj: Writer Extended CSV object
    :returns: string representation of the Extended CSV
    :raises RuntimeError: propagated from _dump on validation or
        serialization failure
    """
    return _dump(extcsv_obj)
def _dump(extcsv_obj):
    """
    Internal helper function to dump Extended CSV object to
    string representation

    :param extcsv_obj: Extended CSV object
    :returns: string representation of Extended CSV
    :raises RuntimeError: if validation fails or serialization errors occur
    """
    validate = [extcsv_obj.metadata_validator(),
                extcsv_obj.dataset_validator()]
    # NOTE(review): 'bad' is truthy only when BOTH validator results are
    # truthy, and the error message joins validate[1] as a list of
    # violations. Confirm the validators' return convention -- if they
    # return True on success, this condition looks inverted.
    bad = validate[0] and validate[1]
    if bad:  # validation errors found
        msg = 'Could not serialize object to string. Violations found: %s' % \
              ','.join(validate[1])
        LOGGER.error(msg)
        raise RuntimeError(msg)
    # object is good, dump to string
    try:
        LOGGER.info('Serializing object to string')
        return extcsv_obj.serialize().getvalue()
    except Exception as err:
        msg = 'Extended CSV cannot be serialized %s' % err
        LOGGER.error(msg)
        raise RuntimeError(msg)
def load(filename, reader=True):
    """
    Load Extended CSV from file

    :param filename: filename
    :param reader: if True (default) return a `Reader`, otherwise an
        `ExtendedCSV`
    :returns: Extended CSV data structure
    """
    # Keep the try body minimal: only the read can raise UnicodeError.
    # The original duplicated the object construction in both the utf-8
    # and latin1 branches.
    try:
        with io.open(filename, encoding='utf-8') as ff:
            content = ff.read()
    except UnicodeError as err:
        LOGGER.warning(err)
        msg = 'Unable to read {} with utf8 encoding. Attempting to read' \
              ' with latin1 encoding.'.format(filename)
        LOGGER.info(msg)
        with io.open(filename, encoding='latin1') as ff:
            content = ff.read()
    if not reader:
        return ExtendedCSV(content)
    return Reader(content)
def loads(strbuf, reader=True):
    """
    Load Extended CSV from string

    :param strbuf: string representation of Extended CSV
    :param reader: if True (default) return a `Reader`, otherwise an
        `ExtendedCSV` -- mirrors the `reader` flag of `load` for
        consistency; default preserves the original behavior
    :returns: Extended CSV data structure
    """
    if not reader:
        return ExtendedCSV(strbuf)
    return Reader(strbuf)
class ExtendedCSV(object):
"""
WOUDC Extended CSV parser
https://guide.woudc.org/en/#chapter-3-standard-data-format
"""
    def __init__(self, content, reporter=None):
        """
        Read WOUDC Extended CSV file

        :param content: buffer of Extended CSV data
        :param reporter: optional report generator receiving warnings and
            errors (see _add_to_report); may be None
        :returns: `ExtendedCSV` object
        :raises NonStandardDataError: if unrecoverable parsing errors occur
        """
        self.extcsv = {}
        self._raw = None
        self._table_count = {}  # occurrences of each table name seen so far
        self._line_num = {}     # table name -> line number where it started
        self.file_comments = []
        self.warnings = []
        self.errors = []
        self.reports = reporter
        self._noncore_table_schema = None
        self._observations_table = None
        LOGGER.debug('Reading into csv')
        self._raw = content
        # Strip a leading UTF-8 byte-order mark, if present, before parsing.
        self._raw = content.lstrip('\ufeff')
        reader = csv.reader(StringIO(self._raw))
        LOGGER.debug('Parsing object model')
        parent_table = None
        lines = enumerate(reader, 1)
        success = True
        for line_num, row in lines:
            # Detect non-comma delimiters in the row, replace them with
            # commas and re-parse, reporting each bad separator (104).
            separators = []
            for bad_sep in ['::', ';', '$', '%', '|', '\\']:
                if not non_content_line(row) and bad_sep in row[0]:
                    separators.append(bad_sep)
            for separator in separators:
                comma_separated = row[0].replace(separator, ',')
                row = next(csv.reader(StringIO(comma_separated)))
                if not self._add_to_report(104, line_num, separator=separator):
                    success = False
            if len(row) == 1 and row[0].startswith('#'):  # table name
                parent_table = ''.join(row).lstrip('#').strip()
                try:
                    LOGGER.debug('Found new table {}'.format(parent_table))
                    # The first non-blank line after a table header holds
                    # the table's field names; blank lines in between are
                    # reported (103).
                    ln, fields = next(lines)
                    while non_content_line(fields):
                        if not self._add_to_report(103, ln):
                            success = False
                        ln, fields = next(lines)
                    parent_table = self.init_table(parent_table, fields,
                                                   line_num)
                except StopIteration:
                    # File ended before the table's field row (206).
                    if not self._add_to_report(206, line_num,
                                               table=parent_table):
                        success = False
            elif len(row) > 0 and row[0].startswith('*'):  # comment
                LOGGER.debug('Found comment')
                self.file_comments.append(row)
                continue
            elif non_content_line(row):  # blank line
                LOGGER.debug('Found blank line')
                continue
            elif parent_table is not None and not non_content_line(row):
                # Data row: append values to the current table's columns.
                if not self.add_values_to_table(parent_table, row, line_num):
                    success = False
            else:
                # Data encountered before any table header (211).
                if not self._add_to_report(211, line_num, row=','.join(row)):
                    success = False
        if not success:
            raise NonStandardDataError(self.errors)
def _add_to_report(self, error_code, line=None, **kwargs):
"""
Submit a warning or error of code <error_code> to the report generator,
with was found at line <line> in the input file. Uses keyword arguments
to detail the warning/error message.
:returns: False iff the error is serious enough to abort parsing,
i.e. True iff the file can continue parsing.
"""
if self.reports is not None:
message, severe = self.reports.add_message(error_code, line,
**kwargs)
else:
severe = ERRORS[error_code][0] == 'Error'
message = ERRORS[error_code][1]
while '{' in message:
for i in range(0, len(message), 1):
if message[i] == '{':
key_start = i+1
elif message[i] == '}':
curr_key = message[key_start:i]
message = message[0:key_start-1] + \
str(kwargs[curr_key]) + \
message[i+1:len(message)]
break
if severe:
LOGGER.error(message)
self.errors.append(message)
else:
LOGGER.warning(message)
self.warnings.append(message)
return not severe
    def add_comment(self, comment):
        """
        Add file-level comments

        :param comment: file level comment to be added.
        :returns: void
        """
        self.file_comments.append(comment)
        LOGGER.info('added file-level comment')
    def add_table_comment(self, table, comment, index=1):
        """
        Add table-level comments

        :param table: table name
        :param comment: table level comment to be added.
        :param index: table index or grouping
        :returns: void
        :raises KeyError: if the indexed table does not exist
        """
        table_n = _table_index(table, index)
        self.extcsv[table_n]['comments'].append(comment)
        LOGGER.info('added table-level comment')
def line_num(self, table):
"""
Returns the line in the source file at which <table> started.
If there is no table in the file named <table>, returns None instead.
:param table: name of an Extended CSV table.
:returns: line number where the table occurs, or None if
the table never occurs.
"""
return self._line_num.get(table, None)
    def update_line_num(self):
        """
        Return the dictionary mapping each table name in the source file
        to the line number where that table started.

        NOTE(review): despite its name, this method does not modify
        anything -- it only exposes the internal mapping.

        :returns: dict of table name to starting line number.
        """
        return self._line_num
def table_count(self, table_type):
"""
Returns the number of tables named <table_type> in the source file.
:param table_type: name of an Extended CSV table without suffixes.
:returns: number of tables named <table_type> in the input file.
"""
return self._table_count.get(table_type, 0)
def init_table(self, table_name, fields, line_num):
"""
Record an empty Extended CSV table named <table_name> with
fields given in the list <fields>, which starts at line <line_num>.
May change the name of the table if a table named <table_name>
already exists. Returns the table name that ends up being used.
:param table_name: name of the new table
:param fields: list of column names in the new table
:param line_num: line number of the table's header (its name)
:returns: final name for the new table
"""
if table_name not in self._table_count:
self._table_count[table_name] = 1
else:
updated_count = self._table_count[table_name] + 1
self._table_count[table_name] = updated_count
table_name += '_' + str(updated_count)
self.extcsv[table_name] = OrderedDict()
self._line_num[table_name] = line_num
self.extcsv[table_name]['comments'] = []
for field in fields:
self.extcsv[table_name][field.strip()] = []
msg = 'added table {}'.format(table_name)
LOGGER.info(msg)
return table_name
    def get_value(self, table, field=None, index=None):
        """
        Get value from table field.

        :param table: table name
        :param field: field name; None returns the whole table
        :param index: when 1, return only the first value of the field
        :returns: value in the extcsv file.
        :raises KeyError: if the table or field does not exist
        """
        if table not in self.extcsv:
            raise KeyError(f"Table '{table}' not found in extcsv.")
        if field is not None and field not in self.extcsv[table]:
            raise KeyError(f"Field '{field}' not found in table '{table}'.")
        # Metadata header tables hold a single row, so their one-element
        # field lists are flattened to scalars below.
        headers = [
            'CONTENT', 'DATA_GENERATION', 'PLATFORM',
            'INSTRUMENT', 'LOCATION', 'TIMESTAMP'
        ]
        if index == 1:
            # NOTE(review): reached even when field is None, which would
            # fail the lookup -- callers appear to always pass a field
            # together with index=1; confirm.
            return self.extcsv[table][field][0]
        if table in headers and field is None:
            # NOTE(review): this flattens the stored lists IN PLACE, i.e.
            # a getter mutating self.extcsv as a side effect -- confirm
            # this is intended before relying on it.
            data = self.extcsv[table]
            for field in list(data.keys()):
                if field != 'comments' and isinstance(data[field], list):
                    data[field] = data[field][0]
            return data
        elif table not in headers and field is None:
            return self.extcsv[table]
        return self.extcsv[table][field]
def add_field_to_table(self, table_name, fields, index=1):
"""
Record an empty column(s) in Extended CSV table named <table_name>
with field names given in the list <fields>.
May reject a field in <fields> if the field already exists in
<table_name>. Returns the field names that ends up being used.
:param table_name: name of the table
:param fields: list of column names in the new table
:param line_num: line number of the table's header (its name)
:returns: field names added to the table
"""
added_fields = []
_table_name = _table_index(table_name, index)
if _table_name not in self._line_num:
return added_fields
for field in fields:
if field not in self.extcsv[_table_name]:
self.extcsv[_table_name][field.strip()] = []
added_fields += [field]
msg = 'field {} added to table {} index {}' \
.format(field, _table_name, index)
LOGGER.info(msg)
else:
msg = 'field {} already exists in table {} index {}' \
.format(field, _table_name, index)
LOGGER.error(msg)
return added_fields
def add_values_to_table(self, table_name, values, line_num, fields=None,
index=1, horizontal=True):
"""
Add the raw strings in <values> to the bottom of the columns
in the tabled named <table_name>.
Returns whether the operation was successful (no errors occurred).
:param table_name: name of the table the values fall under
:param values: list of values from one row in the table
:param line_num: line number the row occurs at
:param horizontal: True if horizontal insert, false if vertical
:returns: `bool` of whether the operation was successful
"""
success = True
_table_name = _table_index(table_name, index)
if horizontal: # horizontal insert
if fields is not None:
all_fields = list(self.extcsv[_table_name].keys())[1:]
for f in all_fields:
if f not in fields:
fields.append(f)
values.append('')
else:
fields = list(self.extcsv[_table_name].keys())[1:]
fillins = len(fields) - len(values)
if fillins < 0:
if not self._add_to_report(
212, line_num, table=_table_name):
success = False
values.extend([''] * fillins)
values = values[:len(fields)]
for field, value in zip(fields, values):
self.extcsv[table_name][field].append(value.strip())
else: # vertical insert
if len(fields) == 1:
for value in values:
self.extcsv[table_name][fields[0]].append(value)
else:
for (field, value) in zip(fields, values):
self.extcsv[table_name][field].append(value)
return success
def remove_table(self, table_type, index=1):
"""
Remove a table from the memory of this Extended CSV instance.
Does not alter the source file in any way.
:param table_name: name of the table to delete.
:returns: void
"""
table_name = _table_index(table_type, index)
self.extcsv.pop(table_name)
self._line_num.pop(table_name)
if self._table_count[table_type] > 1:
self._table_count[table_type] -= 1
else:
self._table_count.pop(table_type)
msg = 'removed table {}'.format(table_name)
LOGGER.info(msg)
def remove_field(self, table, field, index=1):
"""
Remove field (and data) from table
:param table: table name
:param field: field name
:param index: table index or grouping
"""
table_n = _table_index(table, index)
try:
del self.extcsv[table_n][field]
msg = 'removed field %s table %s index %s' % (field, table, index)
LOGGER.info(msg)
except Exception as err:
msg = 'unable to remove field %s' % err
LOGGER.error(msg)
def remove_values_from_table(self, table, field, data=None, index=1,
d_index=None, all_occurences=False):
"""
Remove data from Extended CSV table field
:param table: table name
:param field: field name
:param data: data to be removed
:param index: table index or grouping
:param d_index: index of data in a multi value field
(i.e. profile field)
:param all_occurences: remove all occurences of matching data
from a table field (default is False).
"""
table_n = _table_index(table, index)
if all([d_index is None, data is not None]): # remove first occurence
try:
self.extcsv[table_n][field].remove(data)
msg = 'data %s field %s table %s index %s removed' % \
(data, field, table, index)
LOGGER.info(msg)
except ValueError:
msg = 'value %s not found' % data
LOGGER.error(msg)
if d_index is not None: # remove by index
try:
self.extcsv[table_n][field].pop(d_index)
msg = 'data at index %s field %s table %s index %s removed' % \
(d_index, field, table, index)
LOGGER.info(msg)
except IndexError:
msg = 'no data found pos %s field %s table %s index %s' % \
(d_index, field, table, index)
LOGGER.error(msg)
if all([data is not None, all_occurences]): # remove all
LOGGER.info('removing all occurences')
val = filter(lambda a: a != data, self.extcsv[table_n][field])
self.extcsv[table_n][field] = list(val)
msg = 'data %s field %s table %s index %s removed' % \
(data, field, table, index)
LOGGER.info(msg)
def clear_file(self):
"""
Remove all tables from Extended CSV Writer
"""
try:
self.extcsv.clear()
LOGGER.info('Extended CSV cleared')
except Exception as err:
msg = 'Could not clear Extended CSV: %s' % err
LOGGER.error(msg)
def clear_table(self, table, index=1):
"""
Clear table (all fields except table comments)
:param table: table name
:param index: index name
"""
table_n = _table_index(table, index)
try:
# back up comments
t_comments = self.extcsv[table_n]['comments']
self.extcsv[table_n].clear()
# put back commenets
self.extcsv[table_n]['comments'] = t_comments
msg = 'table %s index %s cleared' % (table, index)
LOGGER.info(msg)
except Exception as err:
msg = 'could not clear table %s' % err
LOGGER.error(msg)
def clear_field(self, table, field, index=1):
"""
Clear all values from field
:param table: table name
:param field: field name
:param index: index name
"""
table_n = _table_index(table, index)
try:
self.extcsv[table_n][field] = []
msg = 'field %s table %s index %s cleared' % (field, table, index)
LOGGER.info(msg)
except Exception as err:
msg = 'could not clear field %s' % err
LOGGER.error(msg)
def typecast_value(self, table, field, value, line_num):
"""
Returns a copy of the string <value> converted to the expected type
for a column named <field> in table <table>, if possible, or returns
the original string otherwise.
:param table: name of the table where the value was found
:param field: name of the column
:param value: string containing a value
:param line_num: line number where the value was found
:returns: value cast to the appropriate type for its column
"""
if value == '': # Empty CSV cell
return None
lowered_field = field.lower()
try:
if lowered_field == 'time':
return self.parse_timestamp(table, value, line_num)
elif lowered_field == 'date':
return self.parse_datestamp(table, value, line_num)
elif lowered_field == 'utcoffset':
return self.parse_utcoffset(table, value, line_num)
except Exception as err:
self._add_to_report(335, line_num, table=table, field=field,
reason=err)
return value
try:
if '.' in value: # Check float conversion
return float(value)
elif len(value) > 1 and value.startswith('0'):
return value
else: # Check integer conversion
return int(value)
except Exception: # Default type to string
return value
    def parse_timestamp(self, table, timestamp, line_num):
        """
        Return a time object representing the time contained in string
        <timestamp> according to the expected HH:mm:SS format with optional
        'am' or 'pm' designation.

        Corrects common formatting errors and performs very simple validation
        checks. Raises ValueError if the string cannot be parsed.

        The other parameters are used for error reporting.

        :param table: name of table the value was found under.
        :param timestamp: string value taken from a Time column.
        :param line_num: line number where the value was found.
        :returns: the timestamp converted to a time object.
        :raises ValueError: if the value cannot be repaired into a valid
            time of day.
        """
        success = True
        # Peel off a trailing am/pm marker for 12-hour handling below.
        if timestamp[-2:] in ['am', 'pm']:
            noon_indicator = timestamp[-2:]
            timestamp = timestamp[:-2].strip()
        else:
            noon_indicator = None
        # Normalize any non-colon separator to ':' (109).
        separators = re.findall(r'[^\w\d]', timestamp)
        bad_seps = set(separators) - set(':')
        for separator in bad_seps:
            if not self._add_to_report(109, line_num, table=table,
                                       separator=separator):
                success = False
            timestamp = timestamp.replace(separator, ':')
        tokens = timestamp.split(':')
        # Missing or empty components default to '00'.
        hour = tokens[0] or '00'
        minute = tokens[1] or '00' if len(tokens) > 1 else '00'
        second = tokens[2] or '00' if len(tokens) > 2 else '00'
        hour_numeric = minute_numeric = second_numeric = None
        try:
            hour_numeric = int(hour)
        except ValueError:
            if not self._add_to_report(301, line_num, table=table,
                                       component='hour'):
                success = False
        try:
            minute_numeric = int(minute)
        except ValueError:
            if not self._add_to_report(301, line_num, table=table,
                                       component='minute'):
                success = False
        try:
            second_numeric = int(second)
        except ValueError:
            if not self._add_to_report(301, line_num, table=table,
                                       component='second'):
                success = False
        if not success:
            raise ValueError('Parsing errors encountered in #{}.Time'
                             .format(table))
        # Convert 12-hour clock readings to 24-hour time (110).
        if noon_indicator == 'am' and hour_numeric == 12:
            if not self._add_to_report(110, line_num, table=table):
                success = False
            hour_numeric = 0
        elif noon_indicator == 'pm' and hour_numeric not in [12, None]:
            if not self._add_to_report(110, line_num, table=table):
                success = False
            hour_numeric += 12
        # Carry out-of-range seconds/minutes into the next larger unit (340).
        if second_numeric is not None and second_numeric not in range(0, 60):
            if not self._add_to_report(340, line_num, table=table,
                                       component='second',
                                       lower='00', upper='59'):
                success = False
            while second_numeric >= 60 and minute_numeric is not None:
                second_numeric -= 60
                minute_numeric += 1
        if minute_numeric is not None and minute_numeric not in range(0, 60):
            if not self._add_to_report(340, line_num, table=table,
                                       component='minute',
                                       lower='00', upper='59'):
                success = False
            while minute_numeric >= 60 and hour_numeric is not None:
                minute_numeric -= 60
                hour_numeric += 1
        # An out-of-range hour cannot be repaired (340).
        if hour_numeric is not None and hour_numeric not in range(0, 24):
            if not self._add_to_report(340, line_num, table=table,
                                       component='hour',
                                       lower='00', upper='23'):
                success = False
        if not success:
            raise ValueError('Parsing errors encountered in #{}.Time'
                             .format(table))
        else:
            return time(hour_numeric, minute_numeric, second_numeric)
    def parse_datestamp(self, table, datestamp, line_num):
        """
        Return a date object representing the date contained in string
        <datestamp> according to the expected YYYY-MM-DD format.

        Corrects common formatting errors and performs very simple validation
        checks. Raises ValueError if the string cannot be parsed.

        The other parameters are used for error reporting.

        :param table: name of table the value was found under.
        :param datestamp: string value taken from a Date column.
        :param line_num: line number where the value was found.
        :returns: datestamp converted to a datetime object.
        :raises ValueError: if the value cannot be repaired into a valid
            date (including impossible dates rejected by strptime).
        """
        success = True
        # Normalize any non-dash separator to '-' (111).
        separators = re.findall(r'[^\w\d]', datestamp)
        bad_seps = set(separators) - set('-')
        for separator in bad_seps:
            if not self._add_to_report(111, line_num, table=table,
                                       separator=separator):
                success = False
            datestamp = datestamp.replace(separator, '-')
        # A date must have exactly three components (112, 113, 114).
        tokens = datestamp.split('-')
        if len(tokens) == 1:
            if not self._add_to_report(112, line_num, table=table):
                success = False
        if len(tokens) < 3:
            if not self._add_to_report(113, line_num, table=table):
                success = False
        elif len(tokens) > 3:
            if not self._add_to_report(114, line_num, table=table):
                success = False
        if not success:
            raise ValueError('Parsing errors encountered in #{}.Date'
                             .format(table))
        year = month = day = None
        try:
            year = int(tokens[0])
        except ValueError:
            if not self._add_to_report(302, line_num, table=table,
                                       component='year'):
                success = False
        try:
            month = int(tokens[1])
        except ValueError:
            if not self._add_to_report(302, line_num, table=table,
                                       component='month'):
                success = False
        try:
            day = int(tokens[2])
        except ValueError:
            if not self._add_to_report(302, line_num, table=table,
                                       component='day'):
                success = False
        # Range-check each component (303, 304).
        present_year = datetime.now().year
        if year is not None and year not in range(1924, present_year + 1):
            if not self._add_to_report(303, line_num, table=table,
                                       component='year',
                                       lower='1924', upper='PRESENT'):
                success = False
        if month is not None and month not in range(1, 12 + 1):
            if not self._add_to_report(303, line_num, table=table,
                                       component='month',
                                       lower='01', upper='12'):
                success = False
        if day is not None and day not in range(1, 31 + 1):
            if not self._add_to_report(304, line_num, table=table,
                                       lower='01', upper='31'):
                success = False
        if not success:
            raise ValueError('Parsing errors encountered in #{}.Date'
                             .format(table))
        else:
            # strptime re-parses the repaired string; it raises ValueError
            # for impossible dates (e.g. Feb 30) that pass the simple
            # per-component range checks above.
            return datetime.strptime(datestamp, '%Y-%m-%d').date()
    def parse_utcoffset(self, table, utcoffset, line_num):
        """
        Validates the raw string <utcoffset>, converting it to the expected
        format defined by the regular expression (+|-)\\d\\d:\\d\\d:\\d\\d if
        possible. Returns the converted value or else raises a ValueError.

        The other parameters are used for error reporting.

        :param table: name of table the value was found under.
        :param utcoffset: string value taken from a UTCOffset column.
        :param line_num: line number where the value was found.
        :returns: value converted to expected UTCOffset format.
        :raises ValueError: if the value cannot be repaired into the
            expected format.
        """
        success = True
        # Normalize any separator other than ':' to ':' -- sign characters
        # ('+'/'-') are excluded from the separator scan (115).
        separators = re.findall(r'[^-\+\w\d]', utcoffset)
        bad_seps = set(separators) - set(':')
        for separator in bad_seps:
            if not self._add_to_report(115, line_num, table=table,
                                       separator=separator):
                success = False
            utcoffset = utcoffset.replace(separator, ':')
        # Pattern: optional sign, 1-2 digit hour, then optional minute and
        # second groups each preceded by a delimiter character.
        sign = r'(\+|-|\+-)?'
        delim = r'[^-\+\w\d]'
        mandatory_place = r'([\d]{1,2})'
        optional_place = '(' + delim + r'([\d]{0,2}))?'
        template = '^{sign}{mandatory}{optional}{optional}$' \
            .format(sign=sign, mandatory=mandatory_place,
                    optional=optional_place)
        match = re.findall(template, utcoffset)
        if len(match) == 1:
            sign, hour, _, minute, _, second = match[0]
            # Left-pad short components and fill in missing ones (116, 117).
            if len(hour) < 2:
                if not self._add_to_report(116, line_num, table=table,
                                           component='hour'):
                    success = False
                hour = hour.rjust(2, '0')
            if not minute:
                if not self._add_to_report(117, line_num, table=table,
                                           component='minute'):
                    success = False
                minute = '00'
            elif len(minute) < 2:
                if not self._add_to_report(116, line_num, table=table,
                                           component='minute'):
                    success = False
                minute = minute.rjust(2, '0')
            if not second:
                if not self._add_to_report(117, line_num, table=table,
                                           component='second'):
                    success = False
                second = '00'
            elif len(second) < 2:
                if not self._add_to_report(116, line_num, table=table,
                                           component='second'):
                    success = False
                second = second.rjust(2, '0')
            # A zero offset must carry a '+' sign; otherwise a missing sign
            # defaults to '+' and the ambiguous '+-' collapses to '-'
            # (118, 119).
            if all([hour == '00', minute == '00', second == '00']):
                if sign != '+':
                    if not self._add_to_report(119, line_num, table=table,
                                               sign='+'):
                        success = False
                    sign = '+'
            elif not sign:
                if not self._add_to_report(118, line_num, table=table):
                    success = False
                sign = '+'
            elif sign == '+-':
                if not self._add_to_report(119, line_num, table=table,
                                           sign='-'):
                    success = False
                sign = '-'
            if not success:
                raise ValueError('Parsing errors encountered in #{}.UTCOffset'
                                 .format(table))
            try:
                magnitude = time(int(hour), int(minute), int(second))
                return '{}{}'.format(sign, magnitude)
            except (ValueError, TypeError) as err:
                self._add_to_report(305, line_num, table=table)
                raise err
        # Fallback: a string of only zeroes and delimiters is interpreted
        # as a zero offset (120).
        template = '^{sign}[0]+{delim}?[0]*{delim}?[0]*$' \
            .format(sign=sign, delim=delim)
        match = re.findall(template, utcoffset)
        if len(match) == 1:
            if not self._add_to_report(120, line_num, table=table):
                raise ValueError('Parsing errors encountered in #{}.UTCOffset'
                                 .format(table))
            else:
                return '+00:00:00'
        self._add_to_report(305, line_num, table=table)
        raise ValueError('Parsing errors encountered in #{}.UTCOffset'
                         .format(table))
def gen_woudc_filename(self):
"""generate WOUDC filename convention"""
timestamp = self.extcsv['TIMESTAMP']['Date'].strftime('%Y%m%d')
instrument_name = self.extcsv['INSTRUMENT']['Name']
instrument_model = self.extcsv['INSTRUMENT']['Model']
extcsv_serial = self.extcsv['INSTRUMENT'].get('Number', None)
instrument_number = extcsv_serial or 'na'