Merge pull request #28 from korotkevics/feature/infer_mode

Infer 'REQUIRED' mode with a flag for consistently filled in values
bxparks · Mar 4, 2019 · 6c52455 · 6c52455
2 parents 66cf730 + 479f7c5
commit 6c52455
Show file tree

Hide file tree

Showing 3 changed files with 87 additions and 1 deletion.
diff --git a/bigquery_schema_generator/generate_schema.py b/bigquery_schema_generator/generate_schema.py
@@ -75,11 +75,13 @@ class SchemaGenerator:
 
     def __init__(self,
                  input_format='json',
+                 infer_mode=False,
                  keep_nulls=False,
                  quoted_values_are_strings=False,
                  debugging_interval=1000,
                  debugging_map=False):
         self.input_format = input_format
+        self.infer_mode = infer_mode
         self.keep_nulls = keep_nulls
         self.quoted_values_are_strings = quoted_values_are_strings
         self.debugging_interval = debugging_interval
@@ -198,6 +200,14 @@ def merge_schema_entry(self, old_schema_entry, new_schema_entry):
         old_status = old_schema_entry['status']
         new_status = new_schema_entry['status']
 
+        if old_status == 'soft' or new_status == 'soft':
+            update = {'is_always_filled_in': 'no'}
+            old_schema_entry.update(update)
+            new_schema_entry.update(update)
+
+        if old_schema_entry.get('is_always_filled_in') == 'undetermined' and new_status == 'hard':
+            new_schema_entry.update({'is_always_filled_in': 'yes'})
+
         # new 'soft' does not clobber old 'hard'
         if old_status == 'hard' and new_status == 'soft':
             return old_schema_entry
@@ -258,12 +268,16 @@ def merge_schema_entry(self, old_schema_entry, new_schema_entry):
         # might seem reasonable to allow a NULLABLE {primitive_type} to be
         # upgraded to a REPEATED {primitive_type}, but currently 'bq load' does
         # not support that so we must also follow that rule.
-        if old_mode != new_mode:
+        if old_mode != new_mode and not self.infer_mode:
             raise Exception(('Mismatched mode for non-RECORD: '
                              'old=(%s,%s,%s,%s); new=(%s,%s,%s,%s)') %
                             (old_status, old_name, old_mode, old_type,
                              new_status, new_name, new_mode, new_type))
 
+        if new_schema_entry.get('info').get('mode') == 'NULLABLE':
+            if new_schema_entry.get('is_always_filled_in') == 'yes' and self.infer_mode:
+                    new_schema_entry.get('info').update({'mode': 'REQUIRED'})
+
         candidate_type = convert_type(old_type, new_type)
         if not candidate_type:
             raise Exception(
@@ -327,6 +341,7 @@ def get_schema_entry(self, key, value):
             else:
                 status = 'hard'
             schema_entry = OrderedDict([('status', status),
+                                        ('is_always_filled_in', 'undetermined'),
                                         ('info', OrderedDict([
                                             ('mode', value_mode),
                                             ('name', key),
@@ -638,6 +653,11 @@ def main():
         '--quoted_values_are_strings',
         help='Quoted values should be interpreted as strings',
         action="store_true")
+    parser.add_argument(
+        '--infer_mode',
+        help="Infer 'REQUIRED' mode for keys that consistently have non-null values (off unless this flag is given).",
+        action='store_true'  # opt-in flag: absent -> False, present -> True (store_false inverted this)
+    )
     parser.add_argument(
         '--debugging_interval',
         help='Number of lines between heartbeat debugging messages',
@@ -655,6 +675,7 @@ def main():
 
     generator = SchemaGenerator(
         input_format=args.input_format,
+        infer_mode=args.infer_mode,
         keep_nulls=args.keep_nulls,
         quoted_values_are_strings=args.quoted_values_are_strings,
         debugging_interval=args.debugging_interval,

diff --git a/tests/test_generate_schema.py b/tests/test_generate_schema.py
@@ -411,6 +411,7 @@ def verify_data_chunk(self, chunk_count, chunk):
         data_flags = chunk['data_flags']
         input_format = 'csv' if ('csv' in data_flags) else 'json'
         keep_nulls = ('keep_nulls' in data_flags)
+        infer_mode = ('infer_mode' in data_flags)
         quoted_values_are_strings = ('quoted_values_are_strings' in data_flags)
         records = chunk['records']
         expected_errors = chunk['errors']
@@ -422,6 +423,7 @@ def verify_data_chunk(self, chunk_count, chunk):
         # Generate schema.
         generator = SchemaGenerator(
             input_format=input_format,
+            infer_mode=infer_mode,
             keep_nulls=keep_nulls,
             quoted_values_are_strings=quoted_values_are_strings)
         schema_map, error_logs = generator.deduce_schema(records)

diff --git a/tests/testdata.txt b/tests/testdata.txt
@@ -831,3 +831,66 @@ SCHEMA
   }
 ]
 END
+
+# Infer 'REQUIRED' mode for a consistently filled in value - simple
+DATA csv infer_mode
+a,b,c,d,e
+,ho,hi,true
+3,hu,he,
+SCHEMA
+[
+  {
+    "mode": "NULLABLE",
+    "name": "a",
+    "type": "INTEGER"
+  },
+  {
+    "mode": "REQUIRED",
+    "name": "b",
+    "type": "STRING"
+  },
+  {
+    "mode": "REQUIRED",
+    "name": "c",
+    "type": "STRING"
+  },
+  {
+    "mode": "NULLABLE",
+    "name": "d",
+    "type": "BOOLEAN"
+  },
+  {
+    "mode": "NULLABLE",
+    "name": "e",
+    "type": "STRING"
+  }
+]
+END
+
+# Infer 'REQUIRED' mode for a consistently filled in value - complex
+DATA csv infer_mode
+name,surname,age
+John
+Michael,,
+Maria,Smith,30
+Joanna,Anders,21
+SCHEMA
+[
+  {
+    "mode": "REQUIRED",
+    "name": "name",
+    "type": "STRING"
+  },
+  {
+    "mode": "NULLABLE",
+    "name": "surname",
+    "type": "STRING"
+  },
+  {
+    "mode": "NULLABLE",
+    "name": "age",
+    "type": "INTEGER"
+  }
+]
+END
+