Skip to content

Commit

Permalink
Loading $uuid as UUID representation
Browse files Browse the repository at this point in the history
BSON binary subtype 3 is a legacy UUID format. By default, JSON is deserialized as subtype 4. With this fix, $uuid fields will be deserialized as binary subtype 3.
See Issue - e-mission/e-mission-docs#856 (comment)
  • Loading branch information
swastis10 committed Mar 9, 2023
1 parent 6ac02a2 commit 3b456e7
Show file tree
Hide file tree
Showing 3 changed files with 51 additions and 60 deletions.
49 changes: 24 additions & 25 deletions emission/tests/analysisTests/intakeTests/TestPipelineRealData.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
import logging
import json
import bson.json_util as bju
from bson.binary import UuidRepresentation
import attrdict as ad
import arrow
import numpy as np
Expand Down Expand Up @@ -200,8 +201,8 @@ def persistGroundTruthIfNeeded(self, api_result, dataFile, ld, cacheKey):

def standardMatchDataGroundTruth(self, dataFile, ld, cacheKey):
with open(dataFile+".ground_truth") as gfp:
ground_truth = json.load(gfp, object_hook=bju.object_hook)

ground_truth = bju.loads(gfp.read(), json_options = bju.LEGACY_JSON_OPTIONS.with_options(uuid_representation= UuidRepresentation.PYTHON_LEGACY))
etc.setupRealExample(self, dataFile)
etc.runIntakePipeline(self.testUUID)
# runIntakePipeline does not run the common trips, habitica or store views to cache
Expand Down Expand Up @@ -321,7 +322,7 @@ def testJumpSmoothingSectionsStraddle(self):
start_ld = ecwl.LocalDate({'year': 2016, 'month': 8, 'day': 15})
cacheKey = "diary/trips-2016-08-15"
with open("emission/tests/data/real_examples/shankari_2016-independence_day.ground_truth") as gfp:
ground_truth = json.load(gfp, object_hook=bju.object_hook)
ground_truth = bju.loads(gfp.read(), json_options = bju.LEGACY_JSON_OPTIONS.with_options(uuid_representation= UuidRepresentation.PYTHON_LEGACY))

etc.setupRealExample(self, dataFile)
etc.runIntakePipeline(self.testUUID)
Expand All @@ -337,7 +338,7 @@ def testJumpSmoothingSectionStart(self):
start_ld = ecwl.LocalDate({'year': 2016, 'month': 8, 'day': 15})
cacheKey = "diary/trips-2016-08-15"
with open("emission/tests/data/real_examples/shankari_2016-independence_day.ground_truth") as gfp:
ground_truth = json.load(gfp, object_hook=bju.object_hook)
ground_truth = bju.loads(gfp.read(), json_options = bju.LEGACY_JSON_OPTIONS.with_options(uuid_representation= UuidRepresentation.PYTHON_LEGACY))

etc.setupRealExample(self, dataFile)
etc.runIntakePipeline(self.testUUID)
Expand Down Expand Up @@ -385,13 +386,12 @@ def testAug10MultiSyncEndDetected(self):
end_ld = ecwl.LocalDate({'year': 2016, 'month': 8, 'day': 10})
cacheKey = "diary/trips-2016-08-10"
with open("emission/tests/data/real_examples/shankari_2016-08-910.ground_truth") as gtf:
ground_truth = json.load(gtf,
object_hook=bju.object_hook)
ground_truth = bju.loads(gtf.read(), json_options = bju.LEGACY_JSON_OPTIONS.with_options(uuid_representation= UuidRepresentation.PYTHON_LEGACY))

logging.info("Before loading, timeseries db size = %s" % edb.get_timeseries_db().estimated_document_count())
all_entries = None
with open(dataFile) as secondfp:
all_entries = json.load(secondfp, object_hook = bju.object_hook)
all_entries = bju.loads(secondfp.read(), json_options = bju.LEGACY_JSON_OPTIONS.with_options(uuid_representation= UuidRepresentation.PYTHON_LEGACY))
ts_1030 = arrow.get("2016-08-10T10:30:00-07:00").int_timestamp
logging.debug("ts_1030 = %s, converted back = %s" % (ts_1030, arrow.get(ts_1030).to("America/Los_Angeles")))
before_1030_entries = [e for e in all_entries if ad.AttrDict(e).metadata.write_ts <= ts_1030]
Expand Down Expand Up @@ -428,11 +428,11 @@ def testFeb22MultiSyncEndDetected(self):
end_ld = ecwl.LocalDate({'year': 2016, 'month': 2, 'day': 22})
cacheKey = "diary/trips-2016-02-22"
with open(dataFile+".ground_truth") as gtf:
ground_truth = json.load(gtf, object_hook=bju.object_hook)
ground_truth = bju.loads(gtf.read(), json_options = bju.LEGACY_JSON_OPTIONS.with_options(uuid_representation= UuidRepresentation.PYTHON_LEGACY))

logging.info("Before loading, timeseries db size = %s" % edb.get_timeseries_db().estimated_document_count())
with open(dataFile) as df:
all_entries = json.load(df, object_hook = bju.object_hook)
all_entries = bju.loads(df.read(), json_options = bju.LEGACY_JSON_OPTIONS.with_options(uuid_representation= UuidRepresentation.PYTHON_LEGACY))
# 18:01 because the transition was at 2016-02-22T18:00:09.623404-08:00, so right after
# 18:00
ts_1800 = arrow.get("2016-02-22T18:00:30-08:00").int_timestamp
Expand Down Expand Up @@ -467,12 +467,11 @@ def testAug10MultiSyncEndNotDetected(self):
end_ld = ecwl.LocalDate({'year': 2016, 'month': 8, 'day': 10})
cacheKey = "diary/trips-2016-08-10"
with open("emission/tests/data/real_examples/shankari_2016-08-910.ground_truth") as gtf:
ground_truth = json.load(gtf,
object_hook=bju.object_hook)
ground_truth = bju.loads(gtf.read(), json_options = bju.LEGACY_JSON_OPTIONS.with_options(uuid_representation= UuidRepresentation.PYTHON_LEGACY))

logging.info("Before loading, timeseries db size = %s" % edb.get_timeseries_db().estimated_document_count())
with open(dataFile) as df:
all_entries = json.load(df, object_hook = bju.object_hook)
all_entries = bju.loads(df.read(), json_options = bju.LEGACY_JSON_OPTIONS.with_options(uuid_representation= UuidRepresentation.PYTHON_LEGACY))
ts_1030 = arrow.get("2016-08-10T10:30:00-07:00").int_timestamp
logging.debug("ts_1030 = %s, converted back = %s" % (ts_1030, arrow.get(ts_1030).to("America/Los_Angeles")))
before_1030_entries = [e for e in all_entries if ad.AttrDict(e).metadata.write_ts <= ts_1030]
Expand Down Expand Up @@ -509,14 +508,14 @@ def testJul22SplitAroundReboot(self):
cacheKey_1 = "diary/trips-2016-07-22"
cacheKey_2 = "diary/trips-2016-07-25"
with open(dataFile_1+".ground_truth") as gtf1:
ground_truth_1 = json.load(gtf1, object_hook=bju.object_hook)
ground_truth_1 = bju.loads(gtf1.read(), json_options = bju.LEGACY_JSON_OPTIONS.with_options(uuid_representation= UuidRepresentation.PYTHON_LEGACY))
with open(dataFile_2+".ground_truth") as gtf2:
ground_truth_2 = json.load(gtf2, object_hook=bju.object_hook)
ground_truth_2 = bju.loads(gtf2.read(), json_options = bju.LEGACY_JSON_OPTIONS.with_options(uuid_representation= UuidRepresentation.PYTHON_LEGACY))

etc.setupRealExample(self, dataFile_1)
etc.runIntakePipeline(self.testUUID)
with open(dataFile_2) as df2:
self.entries = json.load(df2, object_hook = bju.object_hook)
self.entries = bju.loads(df2.read(), json_options = bju.LEGACY_JSON_OPTIONS.with_options(uuid_representation= UuidRepresentation.PYTHON_LEGACY))
etc.setupRealExampleWithEntries(self)
etc.runIntakePipeline(self.testUUID)

Expand All @@ -541,11 +540,11 @@ def testFeb22MultiSyncEndNotDetected(self):
end_ld = ecwl.LocalDate({'year': 2016, 'month': 2, 'day': 22})
cacheKey = "diary/trips-2016-02-22"
with open(dataFile+".ground_truth") as gtf:
ground_truth = json.load(gtf, object_hook=bju.object_hook)
ground_truth = bju.loads(gtf.read(), json_options = bju.LEGACY_JSON_OPTIONS.with_options(uuid_representation= UuidRepresentation.PYTHON_LEGACY))

logging.info("Before loading, timeseries db size = %s" % edb.get_timeseries_db().estimated_document_count())
with open(dataFile) as df:
all_entries = json.load(df, object_hook = bju.object_hook)
all_entries = bju.loads(df.read(), json_options = bju.LEGACY_JSON_OPTIONS.with_options(uuid_representation= UuidRepresentation.PYTHON_LEGACY))
# 18:01 because the transition was at 2016-02-22T18:00:09.623404-08:00, so right after
# 18:00
ts_1800 = arrow.get("2016-02-22T18:00:30-08:00").int_timestamp
Expand Down Expand Up @@ -581,11 +580,11 @@ def testOct07MultiSyncSpuriousEndDetected(self):
end_ld = ecwl.LocalDate({'year': 2016, 'month': 10, 'day': 0o7})
cacheKey = "diary/trips-2016-10-07"
with open(dataFile+".ground_truth") as gtf:
ground_truth = json.load(gtf, object_hook=bju.object_hook)
ground_truth = bju.loads(gtf.read(), json_options = bju.LEGACY_JSON_OPTIONS.with_options(uuid_representation= UuidRepresentation.PYTHON_LEGACY))

logging.info("Before loading, timeseries db size = %s" % edb.get_timeseries_db().estimated_document_count())
with open(dataFile) as df:
all_entries = json.load(df, object_hook = bju.object_hook)
all_entries = bju.loads(df.read(), json_options = bju.LEGACY_JSON_OPTIONS.with_options(uuid_representation= UuidRepresentation.PYTHON_LEGACY))
# 18:01 because the transition was at 2016-02-22T18:00:09.623404-08:00, so right after
# 18:00
ts_1800 = arrow.get("2016-10-07T18:33:11-07:00").int_timestamp
Expand Down Expand Up @@ -621,13 +620,13 @@ def testZeroDurationPlaceInterpolationSingleSync(self):
cacheKey_1 = "diary/trips-2016-01-12"
cacheKey_2 = "diary/trips-2016-01-13"
with open(dataFile_1+".ground_truth") as gtf1:
ground_truth_1 = json.load(gtf1, object_hook=bju.object_hook)
ground_truth_1 = bju.loads(gtf1.read(), json_options = bju.LEGACY_JSON_OPTIONS.with_options(uuid_representation= UuidRepresentation.PYTHON_LEGACY))
with open(dataFile_2+".ground_truth") as gtf2:
ground_truth_2 = json.load(gtf2, object_hook=bju.object_hook)
ground_truth_2 = bju.loads(gtf2.read(), json_options = bju.LEGACY_JSON_OPTIONS.with_options(uuid_representation= UuidRepresentation.PYTHON_LEGACY))

etc.setupRealExample(self, dataFile_1)
with open(dataFile_2) as df2:
self.entries = json.load(df2, object_hook = bju.object_hook)
self.entries = bju.loads(df2.read(), json_options = bju.LEGACY_JSON_OPTIONS.with_options(uuid_representation= UuidRepresentation.PYTHON_LEGACY))
etc.setupRealExampleWithEntries(self)
etc.runIntakePipeline(self.testUUID)

Expand All @@ -653,14 +652,14 @@ def testZeroDurationPlaceInterpolationMultiSync(self):
cacheKey_1 = "diary/trips-2016-01-12"
cacheKey_2 = "diary/trips-2016-01-13"
with open(dataFile_1+".ground_truth") as gtf1:
ground_truth_1 = json.load(gtf1, object_hook=bju.object_hook)
ground_truth_1 = bju.loads(gtf1.read(), json_options = bju.LEGACY_JSON_OPTIONS.with_options(uuid_representation= UuidRepresentation.PYTHON_LEGACY))
with open(dataFile_2+".ground_truth") as gtf2:
ground_truth_2 = json.load(gtf2, object_hook=bju.object_hook)
ground_truth_2 = bju.loads(gtf2.read(), json_options = bju.LEGACY_JSON_OPTIONS.with_options(uuid_representation= UuidRepresentation.PYTHON_LEGACY))

etc.setupRealExample(self, dataFile_1)
etc.runIntakePipeline(self.testUUID)
with open(dataFile_2) as df2:
self.entries = json.load(df2, object_hook = bju.object_hook)
self.entries = bju.loads(df2.read(), json_options = bju.LEGACY_JSON_OPTIONS.with_options(uuid_representation= UuidRepresentation.PYTHON_LEGACY))
etc.setupRealExampleWithEntries(self)
etc.runIntakePipeline(self.testUUID)

Expand Down
13 changes: 2 additions & 11 deletions emission/tests/modellingTests/TestRunGreedyIncrementalModel.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,7 @@
import time
import pandas as pd
import bson.json_util as bju
import bson
from bson.json_util import loads
from itertools import chain
from bson.binary import Binary, UuidRepresentation
from bson.binary import UuidRepresentation

import emission.analysis.modelling.trip_model.model_storage as eamums
import emission.analysis.modelling.trip_model.model_type as eamumt
Expand All @@ -35,13 +32,7 @@ def setUp(self):

# emission/tests/data/real_examples/shankari_2016-06-20.expected_confirmed_trips
self.user_id = uuid.UUID('aa9fdec9-2944-446c-8ee2-50d79b3044d3')
# self.user_id = Binary.from_uuid(uuid.UUID('aa9fdec9-2944-446c-8ee2-50d79b3044d3'), UuidRepresentation.PYTHON_LEGACY)
print("USER IDDDDD", self.user_id)
self.ts = esta.TimeSeries.get_time_series(self.user_id)
print("TSSSS", self.ts.find_entries([esdatq.CONFIRMED_TRIP_KEY]))
# for ent in self.ts.find_entries([esdatq.CONFIRMED_TRIP_KEY]):
# res = list(chain(self.ts.find_entries([esdatq.CONFIRMED_TRIP_KEY])))
# print("RES", res)
self.new_trips_per_invocation = 3
self.model_type = eamumt.ModelType.GREEDY_SIMILARITY_BINNING
self.model_storage = eamums.ModelStorage.DOCUMENT_DATABASE
Expand All @@ -60,7 +51,7 @@ def setUp(self):
# load in trips from a test file source
input_file = 'emission/tests/data/real_examples/shankari_2016-06-20.expected_confirmed_trips'
with open(input_file, 'r') as f:
trips_json = bju.loads(f.read(), json_options = bju.LEGACY_JSON_OPTIONS.with_options(strict_uuid= False, uuid_representation= UuidRepresentation.PYTHON_LEGACY))
trips_json = bju.loads(f.read(), json_options = bju.LEGACY_JSON_OPTIONS.with_options(uuid_representation= UuidRepresentation.PYTHON_LEGACY))
trips = [ecwe.Entry(r) for r in trips_json]
logging.debug(f'loaded {len(trips)} trips from {input_file}')
self.ts.bulk_insert(trips)
Expand Down
Loading

0 comments on commit 3b456e7

Please sign in to comment.