# load_json_dump_to_db.py
import json
from wb_nlp.dir_manager import get_data_dir
from wb_nlp.interfaces import mongodb
from wb_nlp.types import metadata
# Load a JSON-lines dump (one JSON document per line) into two MongoDB
# collections:
#   * every raw record is inserted as-is into the metadata collection;
#   * every record that converts cleanly through
#     `metadata.make_metadata_model_from_nlp_schema` is inserted into the
#     docs_metadata collection, keyed by the model's `id`.
# Both collections are emptied first, so a run replaces their contents.
#
# NOTE(review): the collections are emptied before the dump file is opened, so
# a missing or unreadable dump leaves both collections empty. Confirm that is
# acceptable before running this against data that cannot be regenerated.

# Save the validated data to `docs_metadata` collection
collection = mongodb.get_docs_metadata_collection()
print(f"Emptying collection {collection} before loading dump...")
collection.delete_many({})

meta_collection = mongodb.get_metadata_collection()
print(f"Emptying collection {meta_collection} before loading dump...")
meta_collection.delete_many({})

# Dump file can be generated by running:
# mongoexport --collection=metadata --db=nlp --out=<outfilename>.json
DUMP_FILE = get_data_dir('raw', 'nlp-metadata-wbes2474-20201007.json')

print(
    f"Start loading data dump {DUMP_FILE} to collection: {collection} and {meta_collection}")

dump_data = []       # converted records destined for `collection`
meta_dump_data = []  # raw records destined for `meta_collection`
unloaded_data = []   # `_id` of each raw record whose conversion raised KeyError

# Explicit encoding so the result does not depend on the machine's locale.
with open(DUMP_FILE, "r", encoding="utf-8") as open_file:
    for line in open_file:
        # Tolerate blank lines (e.g. a trailing newline at the end of the dump).
        if not line.strip():
            continue

        body = json.loads(line)
        meta_dump_data.append(body)

        try:
            dm = json.loads(
                metadata.make_metadata_model_from_nlp_schema(body).json())
            # Reuse the model's own id as the MongoDB primary key.
            dm["_id"] = dm["id"]
        except KeyError:
            # `.get` so a record lacking `_id` cannot raise a second KeyError
            # from inside this handler and abort the whole load.
            unloaded_data.append(body.get("_id"))
        else:
            dump_data.append(dm)

print("Loading data to db...")
# `insert_many` raises TypeError on an empty list, so skip the call when
# nothing converted successfully.
if dump_data:
    collection.insert_many(dump_data)

print(
    f"Finished loading {len(dump_data)} data dump to collection: {collection} with {len(unloaded_data)} unloaded data...")

print("Loading data to db...")
if meta_dump_data:
    meta_collection.insert_many(meta_dump_data)

print(
    f"Finished loading {len(meta_dump_data)} data dump to collection: {meta_collection} with {len(unloaded_data)} unloaded data...")