-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpos_tag_json.2021.py
47 lines (36 loc) · 1.12 KB
/
pos_tag_json.2021.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
#Iris, Jan 2021
# script to read in the huge json file,
# extract the relevant items from json per document: title and doc_body
#
import json
import sys
# Base path that is prepended to the file name given on the command line.
DIRPATH = "pathtodir"


def extract_documents(injson, outdirname, listname):
    """Split a line-delimited JSON file into per-document text files.

    Each non-blank line of *injson* is parsed as one JSON object.  For every
    object, the value under ``body`` is written to
    ``<outdirname><_id>.body.txt`` and the value under ``title`` to
    ``<outdirname><_id>.title.txt``.  The ``_id`` of every document is
    appended, one per line, to *listname*.

    Args:
        injson: Path of the input file (one JSON document per line, UTF-8).
        outdirname: Prefix for the output files; it is concatenated directly
            with the document id, so it should end with a path separator.
        listname: Path of the file that receives the list of document ids.

    Returns:
        The number of documents written.

    Raises:
        OSError: If a file cannot be opened (the output directory is not
            created here).
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a document lacks ``_id``, ``title`` or ``body``.
    """
    count = 0
    # All files are opened explicitly as UTF-8: relying on the locale default
    # for the output is what leads to encoding errors on non-ASCII text.
    with open(injson, 'r', encoding='utf-8') as jfile, \
            open(listname, 'w', encoding='utf-8') as outlist:
        # Iterate lazily; the input is huge, so readlines() would hold the
        # whole file in memory.
        for line in jfile:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            document = json.loads(line)
            doc_ident = str(document['_id'])
            doc_title = document['title']
            doc_body = document['body']

            file_bname = outdirname + doc_ident + ".body.txt"
            file_tname = outdirname + doc_ident + ".title.txt"
            # 'with' guarantees the files are flushed and closed; the earlier
            # bare `f.close` (no parentheses) never actually closed them.
            with open(file_bname, 'w', encoding='utf-8') as f:
                f.write(doc_body)
            with open(file_tname, 'w', encoding='utf-8') as f:
                f.write(doc_title)

            outlist.write(doc_ident + "\n")
            count += 1
    return count


def main():
    """Read the JSON file named on the command line and split it."""
    if len(sys.argv) < 2:
        sys.exit("usage: " + sys.argv[0] + " JSONFILE")
    # we read in the original json file
    jsonfile = sys.argv[1]
    # we write separate text files to the outdirname
    # NOTE(review): outdirname is the same path as injson plus a trailing
    # "/", so the input file and the output directory collide as written --
    # presumably DIRPATH/these paths are edited before use; confirm.
    outdirname = DIRPATH + jsonfile + "/"
    # we also extract the IDs and write them to a list file
    listname = outdirname + "/" + jsonfile + ".ids"
    injson = DIRPATH + jsonfile
    extract_documents(injson, outdirname, listname)


if __name__ == "__main__":
    main()