Skip to content

Commit

Permalink
Loading $uuid as UUID representation
Browse files Browse the repository at this point in the history
BSON binary subtype 3 is a legacy UUID format. By default, JSON is deserialized as subtype 4. With this fix, $uuid fields will be deserialized as binary subtype 3.
See Issue - e-mission/e-mission-docs#856 (comment)
  • Loading branch information
swastis10 committed Mar 9, 2023
1 parent 6ac02a2 commit 3b456e7
Show file tree
Hide file tree
Showing 3 changed files with 51 additions and 60 deletions.
49 changes: 24 additions & 25 deletions emission/tests/analysisTests/intakeTests/TestPipelineRealData.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
import logging
import json
import bson.json_util as bju
from bson.binary import UuidRepresentation
import attrdict as ad
import arrow
import numpy as np
Expand Down Expand Up @@ -200,8 +201,8 @@ def persistGroundTruthIfNeeded(self, api_result, dataFile, ld, cacheKey):

def standardMatchDataGroundTruth(self, dataFile, ld, cacheKey):
with open(dataFile+".ground_truth") as gfp:
ground_truth = json.load(gfp, object_hook=bju.object_hook)

ground_truth = bju.loads(gfp.read(), json_options = bju.LEGACY_JSON_OPTIONS.with_options(uuid_representation= UuidRepresentation.PYTHON_LEGACY))
etc.setupRealExample(self, dataFile)
etc.runIntakePipeline(self.testUUID)
# runIntakePipeline does not run the common trips, habitica or store views to cache
Expand Down Expand Up @@ -321,7 +322,7 @@ def testJumpSmoothingSectionsStraddle(self):
start_ld = ecwl.LocalDate({'year': 2016, 'month': 8, 'day': 15})
cacheKey = "diary/trips-2016-08-15"
with open("emission/tests/data/real_examples/shankari_2016-independence_day.ground_truth") as gfp:
ground_truth = json.load(gfp, object_hook=bju.object_hook)
ground_truth = bju.loads(gfp.read(), json_options = bju.LEGACY_JSON_OPTIONS.with_options(uuid_representation= UuidRepresentation.PYTHON_LEGACY))

etc.setupRealExample(self, dataFile)
etc.runIntakePipeline(self.testUUID)
Expand All @@ -337,7 +338,7 @@ def testJumpSmoothingSectionStart(self):
start_ld = ecwl.LocalDate({'year': 2016, 'month': 8, 'day': 15})
cacheKey = "diary/trips-2016-08-15"
with open("emission/tests/data/real_examples/shankari_2016-independence_day.ground_truth") as gfp:
ground_truth = json.load(gfp, object_hook=bju.object_hook)
ground_truth = bju.loads(gfp.read(), json_options = bju.LEGACY_JSON_OPTIONS.with_options(uuid_representation= UuidRepresentation.PYTHON_LEGACY))

etc.setupRealExample(self, dataFile)
etc.runIntakePipeline(self.testUUID)
Expand Down Expand Up @@ -385,13 +386,12 @@ def testAug10MultiSyncEndDetected(self):
end_ld = ecwl.LocalDate({'year': 2016, 'month': 8, 'day': 10})
cacheKey = "diary/trips-2016-08-10"
with open("emission/tests/data/real_examples/shankari_2016-08-910.ground_truth") as gtf:
ground_truth = json.load(gtf,
object_hook=bju.object_hook)
ground_truth = bju.loads(gtf.read(), json_options = bju.LEGACY_JSON_OPTIONS.with_options(uuid_representation= UuidRepresentation.PYTHON_LEGACY))

logging.info("Before loading, timeseries db size = %s" % edb.get_timeseries_db().estimated_document_count())
all_entries = None
with open(dataFile) as secondfp:
all_entries = json.load(secondfp, object_hook = bju.object_hook)
all_entries = bju.loads(secondfp.read(), json_options = bju.LEGACY_JSON_OPTIONS.with_options(uuid_representation= UuidRepresentation.PYTHON_LEGACY))
ts_1030 = arrow.get("2016-08-10T10:30:00-07:00").int_timestamp
logging.debug("ts_1030 = %s, converted back = %s" % (ts_1030, arrow.get(ts_1030).to("America/Los_Angeles")))
before_1030_entries = [e for e in all_entries if ad.AttrDict(e).metadata.write_ts <= ts_1030]
Expand Down Expand Up @@ -428,11 +428,11 @@ def testFeb22MultiSyncEndDetected(self):
end_ld = ecwl.LocalDate({'year': 2016, 'month': 2, 'day': 22})
cacheKey = "diary/trips-2016-02-22"
with open(dataFile+".ground_truth") as gtf:
ground_truth = json.load(gtf, object_hook=bju.object_hook)
ground_truth = bju.loads(gtf.read(), json_options = bju.LEGACY_JSON_OPTIONS.with_options(uuid_representation= UuidRepresentation.PYTHON_LEGACY))

logging.info("Before loading, timeseries db size = %s" % edb.get_timeseries_db().estimated_document_count())
with open(dataFile) as df:
all_entries = json.load(df, object_hook = bju.object_hook)
all_entries = bju.loads(df.read(), json_options = bju.LEGACY_JSON_OPTIONS.with_options(uuid_representation= UuidRepresentation.PYTHON_LEGACY))
# 18:01 because the transition was at 2016-02-22T18:00:09.623404-08:00, so right after
# 18:00
ts_1800 = arrow.get("2016-02-22T18:00:30-08:00").int_timestamp
Expand Down Expand Up @@ -467,12 +467,11 @@ def testAug10MultiSyncEndNotDetected(self):
end_ld = ecwl.LocalDate({'year': 2016, 'month': 8, 'day': 10})
cacheKey = "diary/trips-2016-08-10"
with open("emission/tests/data/real_examples/shankari_2016-08-910.ground_truth") as gtf:
ground_truth = json.load(gtf,
object_hook=bju.object_hook)
ground_truth = bju.loads(gtf.read(), json_options = bju.LEGACY_JSON_OPTIONS.with_options(uuid_representation= UuidRepresentation.PYTHON_LEGACY))

logging.info("Before loading, timeseries db size = %s" % edb.get_timeseries_db().estimated_document_count())
with open(dataFile) as df:
all_entries = json.load(df, object_hook = bju.object_hook)
all_entries = bju.loads(df.read(), json_options = bju.LEGACY_JSON_OPTIONS.with_options(uuid_representation= UuidRepresentation.PYTHON_LEGACY))
ts_1030 = arrow.get("2016-08-10T10:30:00-07:00").int_timestamp
logging.debug("ts_1030 = %s, converted back = %s" % (ts_1030, arrow.get(ts_1030).to("America/Los_Angeles")))
before_1030_entries = [e for e in all_entries if ad.AttrDict(e).metadata.write_ts <= ts_1030]
Expand Down Expand Up @@ -509,14 +508,14 @@ def testJul22SplitAroundReboot(self):
cacheKey_1 = "diary/trips-2016-07-22"
cacheKey_2 = "diary/trips-2016-07-25"
with open(dataFile_1+".ground_truth") as gtf1:
ground_truth_1 = json.load(gtf1, object_hook=bju.object_hook)
ground_truth_1 = bju.loads(gtf1.read(), json_options = bju.LEGACY_JSON_OPTIONS.with_options(uuid_representation= UuidRepresentation.PYTHON_LEGACY))
with open(dataFile_2+".ground_truth") as gtf2:
ground_truth_2 = json.load(gtf2, object_hook=bju.object_hook)
ground_truth_2 = bju.loads(gtf2.read(), json_options = bju.LEGACY_JSON_OPTIONS.with_options(uuid_representation= UuidRepresentation.PYTHON_LEGACY))

etc.setupRealExample(self, dataFile_1)
etc.runIntakePipeline(self.testUUID)
with open(dataFile_2) as df2:
self.entries = json.load(df2, object_hook = bju.object_hook)
self.entries = bju.loads(df2.read(), json_options = bju.LEGACY_JSON_OPTIONS.with_options(uuid_representation= UuidRepresentation.PYTHON_LEGACY))
etc.setupRealExampleWithEntries(self)
etc.runIntakePipeline(self.testUUID)

Expand All @@ -541,11 +540,11 @@ def testFeb22MultiSyncEndNotDetected(self):
end_ld = ecwl.LocalDate({'year': 2016, 'month': 2, 'day': 22})
cacheKey = "diary/trips-2016-02-22"
with open(dataFile+".ground_truth") as gtf:
ground_truth = json.load(gtf, object_hook=bju.object_hook)
ground_truth = bju.loads(gtf.read(), json_options = bju.LEGACY_JSON_OPTIONS.with_options(uuid_representation= UuidRepresentation.PYTHON_LEGACY))

logging.info("Before loading, timeseries db size = %s" % edb.get_timeseries_db().estimated_document_count())
with open(dataFile) as df:
all_entries = json.load(df, object_hook = bju.object_hook)
all_entries = bju.loads(df.read(), json_options = bju.LEGACY_JSON_OPTIONS.with_options(uuid_representation= UuidRepresentation.PYTHON_LEGACY))
# 18:01 because the transition was at 2016-02-22T18:00:09.623404-08:00, so right after
# 18:00
ts_1800 = arrow.get("2016-02-22T18:00:30-08:00").int_timestamp
Expand Down Expand Up @@ -581,11 +580,11 @@ def testOct07MultiSyncSpuriousEndDetected(self):
end_ld = ecwl.LocalDate({'year': 2016, 'month': 10, 'day': 0o7})
cacheKey = "diary/trips-2016-10-07"
with open(dataFile+".ground_truth") as gtf:
ground_truth = json.load(gtf, object_hook=bju.object_hook)
ground_truth = bju.loads(gtf.read(), json_options = bju.LEGACY_JSON_OPTIONS.with_options(uuid_representation= UuidRepresentation.PYTHON_LEGACY))

logging.info("Before loading, timeseries db size = %s" % edb.get_timeseries_db().estimated_document_count())
with open(dataFile) as df:
all_entries = json.load(df, object_hook = bju.object_hook)
all_entries = bju.loads(df.read(), json_options = bju.LEGACY_JSON_OPTIONS.with_options(uuid_representation= UuidRepresentation.PYTHON_LEGACY))
# 18:01 because the transition was at 2016-02-22T18:00:09.623404-08:00, so right after
# 18:00
ts_1800 = arrow.get("2016-10-07T18:33:11-07:00").int_timestamp
Expand Down Expand Up @@ -621,13 +620,13 @@ def testZeroDurationPlaceInterpolationSingleSync(self):
cacheKey_1 = "diary/trips-2016-01-12"
cacheKey_2 = "diary/trips-2016-01-13"
with open(dataFile_1+".ground_truth") as gtf1:
ground_truth_1 = json.load(gtf1, object_hook=bju.object_hook)
ground_truth_1 = bju.loads(gtf1.read(), json_options = bju.LEGACY_JSON_OPTIONS.with_options(uuid_representation= UuidRepresentation.PYTHON_LEGACY))
with open(dataFile_2+".ground_truth") as gtf2:
ground_truth_2 = json.load(gtf2, object_hook=bju.object_hook)
ground_truth_2 = bju.loads(gtf2.read(), json_options = bju.LEGACY_JSON_OPTIONS.with_options(uuid_representation= UuidRepresentation.PYTHON_LEGACY))

etc.setupRealExample(self, dataFile_1)
with open(dataFile_2) as df2:
self.entries = json.load(df2, object_hook = bju.object_hook)
self.entries = bju.loads(df2.read(), json_options = bju.LEGACY_JSON_OPTIONS.with_options(uuid_representation= UuidRepresentation.PYTHON_LEGACY))
etc.setupRealExampleWithEntries(self)
etc.runIntakePipeline(self.testUUID)

Expand All @@ -653,14 +652,14 @@ def testZeroDurationPlaceInterpolationMultiSync(self):
cacheKey_1 = "diary/trips-2016-01-12"
cacheKey_2 = "diary/trips-2016-01-13"
with open(dataFile_1+".ground_truth") as gtf1:
ground_truth_1 = json.load(gtf1, object_hook=bju.object_hook)
ground_truth_1 = bju.loads(gtf1.read(), json_options = bju.LEGACY_JSON_OPTIONS.with_options(uuid_representation= UuidRepresentation.PYTHON_LEGACY))
with open(dataFile_2+".ground_truth") as gtf2:
ground_truth_2 = json.load(gtf2, object_hook=bju.object_hook)
ground_truth_2 = bju.loads(gtf2.read(), json_options = bju.LEGACY_JSON_OPTIONS.with_options(uuid_representation= UuidRepresentation.PYTHON_LEGACY))

etc.setupRealExample(self, dataFile_1)
etc.runIntakePipeline(self.testUUID)
with open(dataFile_2) as df2:
self.entries = json.load(df2, object_hook = bju.object_hook)
self.entries = bju.loads(df2.read(), json_options = bju.LEGACY_JSON_OPTIONS.with_options(uuid_representation= UuidRepresentation.PYTHON_LEGACY))
etc.setupRealExampleWithEntries(self)
etc.runIntakePipeline(self.testUUID)

Expand Down
13 changes: 2 additions & 11 deletions emission/tests/modellingTests/TestRunGreedyIncrementalModel.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,7 @@
import time
import pandas as pd
import bson.json_util as bju
import bson
from bson.json_util import loads
from itertools import chain
from bson.binary import Binary, UuidRepresentation
from bson.binary import UuidRepresentation

import emission.analysis.modelling.trip_model.model_storage as eamums
import emission.analysis.modelling.trip_model.model_type as eamumt
Expand All @@ -35,13 +32,7 @@ def setUp(self):

# emission/tests/data/real_examples/shankari_2016-06-20.expected_confirmed_trips
self.user_id = uuid.UUID('aa9fdec9-2944-446c-8ee2-50d79b3044d3')
# self.user_id = Binary.from_uuid(uuid.UUID('aa9fdec9-2944-446c-8ee2-50d79b3044d3'), UuidRepresentation.PYTHON_LEGACY)
print("USER IDDDDD", self.user_id)
self.ts = esta.TimeSeries.get_time_series(self.user_id)
print("TSSSS", self.ts.find_entries([esdatq.CONFIRMED_TRIP_KEY]))
# for ent in self.ts.find_entries([esdatq.CONFIRMED_TRIP_KEY]):
# res = list(chain(self.ts.find_entries([esdatq.CONFIRMED_TRIP_KEY])))
# print("RES", res)
self.new_trips_per_invocation = 3
self.model_type = eamumt.ModelType.GREEDY_SIMILARITY_BINNING
self.model_storage = eamums.ModelStorage.DOCUMENT_DATABASE
Expand All @@ -60,7 +51,7 @@ def setUp(self):
# load in trips from a test file source
input_file = 'emission/tests/data/real_examples/shankari_2016-06-20.expected_confirmed_trips'
with open(input_file, 'r') as f:
trips_json = bju.loads(f.read(), json_options = bju.LEGACY_JSON_OPTIONS.with_options(strict_uuid= False, uuid_representation= UuidRepresentation.PYTHON_LEGACY))
trips_json = bju.loads(f.read(), json_options = bju.LEGACY_JSON_OPTIONS.with_options(uuid_representation= UuidRepresentation.PYTHON_LEGACY))
trips = [ecwe.Entry(r) for r in trips_json]
logging.debug(f'loaded {len(trips)} trips from {input_file}')
self.ts.bulk_insert(trips)
Expand Down
Loading

0 comments on commit 3b456e7

Please sign in to comment.