Commit e38c8d1

refactor: change queue_emu_changes to always do one day's worth of data
We only ever use the only_one parameter, so we might as well make it the default behaviour.
1 parent 5988e57 commit e38c8d1
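
In short, the method signature changes as sketched below (a summary of the diff that follows, not the full implementation):

# before: process one or all pending dump sets, returning the dates queued
def queue_emu_changes(self, only_one: bool = False) -> List[date]: ...

# after: always queue at most one day's worth of dumps, returning the queued
# date, or None when there is nothing left to queue
def queue_emu_changes(self) -> Optional[date]: ...

Callers that previously passed only_one=True now simply call the method repeatedly until it returns None, which is exactly what the auto and queue CLI commands below do.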

File tree

3 files changed: +45 -58 lines changed

dataimporter/cli/emu.py
+6 -7

@@ -36,12 +36,12 @@ def auto(config: Config, one: bool = False, delay_sync: bool = False):
     with DataImporter(config) as importer:
         while True:
             console.log("Queuing next dump set")
-            dates_queued = importer.queue_emu_changes(only_one=True)
-            if not dates_queued:
+            date_queued = importer.queue_emu_changes()
+            if date_queued is None:
                 console.log("No more dumps to import, done")
                 break

-            console.log(f"Date queued: {dates_queued[0].isoformat()}")
+            console.log(f"Date queued: {date_queued.isoformat()}")

             for name in VIEW_NAMES:
                 console.log(f"Adding changes from {name} view to mongo")
@@ -92,13 +92,12 @@ def queue(amount: str, config: Config):

         while amount > 0:
             console.log("Queuing next dump set")
-            dates_queued = importer.queue_emu_changes(only_one=True)
-            if not dates_queued:
+            date_queued = importer.queue_emu_changes()
+            if date_queued is None:
                 console.log("No data to queue")
                 break
             else:
-                date_queued = dates_queued[0].isoformat()
-                console.log(f"Date queued: {date_queued}")
+                console.log(f"Date queued: {date_queued.isoformat()}")
                 amount -= 1


dataimporter/importer.py
+30 -40

@@ -170,55 +170,45 @@ def queue_changes(self, records: Iterable[SourceRecord], store_name: str):
             for view in views:
                 view.queue(batch)

-    def queue_emu_changes(self, only_one: bool = False) -> List[date]:
+    def queue_emu_changes(self) -> Optional[date]:
         """
-        Look for new EMu dumps, upsert the records into the appropriate DataDB and then
-        queue the changes into the derived views.
+        Look for new EMu dumps and if any are found beyond the date of the last queued
+        EMu import, add the next day's data to the stores and view queues.

-        :param only_one: if True, only process the first set of dumps and then return,
-                         otherwise, process them all (default: False)
-        :return the dates that were queued
+        :return the date that was queued or None if no dumps were found
         """
         last_queued = self.emu_status.get()
         dump_sets = find_emu_dumps(self.config.dumps_path, after=last_queued)
         if not dump_sets:
-            return []
+            return None

-        if only_one:
-            dump_sets = dump_sets[:1]
+        next_day_dump_set = dump_sets[0]

         store_names = {store.name for store in self.stores if store.name != "gbif"}
-        dates_queued = []
-        for dump_set in dump_sets:
-            for dump in dump_set.dumps:
-                # normal tables are immediately processable, but if the dump is from
-                # the eaudit table we need to do some additional work because each
-                # audit record refers to a potentially different table from which it
-                # is deleting a record
-                if dump.table != "eaudit":
-                    self.queue_changes(dump.read(), dump.table)
-                else:
-                    # wrap the dump stream in a filter to only allow through records
-                    # we want to process
-                    filtered_dump = filter(
-                        partial(is_valid_eaudit_record, tables=store_names),
-                        dump.read(),
-                    )
-                    # queue the changes to each table's database in turn
-                    for table, records in groupby(
-                        filtered_dump, key=lambda record: record.data["AudTable"]
-                    ):
-                        # convert the raw audit records into delete records as we
-                        # queue them
-                        self.queue_changes(
-                            map(convert_eaudit_to_delete, records), table
-                        )
-            # we've handled all the dumps from this date, update the last date stored on
-            # disk in case we fail later to avoid redoing work
-            self.emu_status.update(dump_set.date)
-            dates_queued.append(dump_set.date)
-
-        return dates_queued
+        for dump in next_day_dump_set.dumps:
+            # normal tables are immediately processable, but if the dump is from
+            # the eaudit table we need to do some additional work because each
+            # audit record refers to a potentially different table from which it
+            # is deleting a record
+            if dump.table != "eaudit":
+                self.queue_changes(dump.read(), dump.table)
+            else:
+                # wrap the dump stream in a filter to only allow through records
+                # we want to process
+                filtered_dump = filter(
+                    partial(is_valid_eaudit_record, tables=store_names),
+                    dump.read(),
+                )
+                # queue the changes to each table's database in turn
+                for table, records in groupby(
+                    filtered_dump, key=lambda record: record.data["AudTable"]
+                ):
+                    # convert the raw audit records into delete records as we
+                    # queue them
+                    self.queue_changes(map(convert_eaudit_to_delete, records), table)
+
+        self.emu_status.update(next_day_dump_set.date)
+        return next_day_dump_set.date

     def queue_gbif_changes(self):
         """

tests/test_importer.py
+9 -11

@@ -118,7 +118,7 @@ def test_queue_emu_changes(self, importer: DataImporter, config: Config):
             create_etaxonomy("2"),
         )

-        importer.queue_emu_changes()
+        assert importer.queue_emu_changes() == first_dump_date

         assert importer.get_store("ecatalogue").size() == 4
         assert importer.get_store("emultimedia").size() == 3
@@ -148,7 +148,7 @@ def test_queue_emu_changes(self, importer: DataImporter, config: Config):
             create_eaudit("1", "etaxonomy"),
         )

-        importer.queue_emu_changes()
+        assert importer.queue_emu_changes() == second_dump_date

         # these have all lost 1 to reflect the newly deleted records
         assert importer.get_store("ecatalogue").size() == 3
@@ -183,7 +183,7 @@ def test_queue_emu_changes(self, importer: DataImporter, config: Config):
             create_emultimedia("4"),
         )

-        importer.queue_emu_changes()
+        assert importer.queue_emu_changes() == third_dump_date

         assert importer.get_store("ecatalogue").size() == 3
         # there's a new emultimedia record now
@@ -217,13 +217,6 @@ def test_queue_emu_changes_only_one(self, config: Config):
             config.dumps_path, "etaxonomy", first_dump_date, create_etaxonomy("1")
         )

-        importer.queue_emu_changes(only_one=True)
-
-        assert importer.emu_status.get() == first_dump_date
-        assert importer.get_store("ecatalogue").size() == 1
-        assert importer.get_store("emultimedia").size() == 1
-        assert importer.get_store("etaxonomy").size() == 1
-
         second_dump_date = date(2023, 10, 4)
         create_dump(
             config.dumps_path,
@@ -238,8 +231,13 @@ def test_queue_emu_changes_only_one(self, config: Config):
             config.dumps_path, "etaxonomy", second_dump_date, create_etaxonomy("2")
         )

-        importer.queue_emu_changes(only_one=True)
+        assert importer.queue_emu_changes() == first_dump_date
+        assert importer.emu_status.get() == first_dump_date
+        assert importer.get_store("ecatalogue").size() == 1
+        assert importer.get_store("emultimedia").size() == 1
+        assert importer.get_store("etaxonomy").size() == 1

+        assert importer.queue_emu_changes() == second_dump_date
         assert importer.emu_status.get() == second_dump_date
         assert importer.get_store("ecatalogue").size() == 2
         assert importer.get_store("emultimedia").size() == 2