-
Notifications
You must be signed in to change notification settings - Fork 2
/
parse.py
39 lines (31 loc) · 987 Bytes
/
parse.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
# -*- coding:utf-8 -*-
__author__ = '[email protected]'
import re
class CorpusParser:
    """Parse a corpus file into a mapping of document id to token list.

    The file is read as a sequence of documents, each introduced by a
    ``#`` character. The first whitespace-separated token after the ``#``
    becomes the document id and the remaining tokens become its body.
    """

    def __init__(self, filename):
        """Store the corpus path; nothing is read until ``parse()`` runs.

        Args:
            filename: Path of the corpus file to read.
        """
        self.filename = filename
        # Pattern for a document header such as "# 12". parse() does not
        # use it; it stays as an attribute in case other code reads it.
        # Raw string so that \s and \d are not treated as string escapes.
        self.regex = re.compile(r'^#\s*\d+')
        self.corpus = dict()

    def parse(self):
        """Read the corpus file and fill ``self.corpus``.

        Text before the first ``#`` is ignored. A ``#`` that is followed
        only by whitespace carries no document id and is skipped instead
        of raising ``IndexError``. If a document id occurs more than
        once, the last occurrence wins.
        """
        with open(self.filename) as f:
            contents = f.read()
        # Element 0 of the split is whatever precedes the first '#'.
        for blob in contents.split('#')[1:]:
            tokens = blob.split()
            if not tokens:
                # Stray or trailing '#': no id to key the document on.
                continue
            self.corpus[tokens[0]] = tokens[1:]

    def get_corpus(self):
        """Return the dict mapping document id (str) to its list of tokens."""
        return self.corpus
class QueryParser:
    """Parse a query file that holds one whitespace-separated query per line."""

    def __init__(self, filename):
        """Store the query file path; nothing is read until ``parse()`` runs.

        Args:
            filename: Path of the query file to read.
        """
        self.filename = filename
        self.queries = []

    def parse(self):
        """Read the query file and replace ``self.queries`` with its contents.

        Each line becomes one list of tokens. A blank line yields an empty
        list, so the position of every query in the result matches its
        line position in the file.
        """
        with open(self.filename) as f:
            # Iterating the file yields every line, including a final line
            # that lacks a trailing newline (which slicing off the last
            # element of a '\n' split would silently drop).
            self.queries = [line.split() for line in f]

    def get_queries(self):
        """Return the parsed queries as a list of token lists."""
        return self.queries
if __name__ == '__main__':
    # Manual smoke test: load the sample query file and show the result.
    qp = QueryParser('./data/queries.txt')
    # Without parse() the query list is still the empty list set in __init__.
    qp.parse()
    # Parenthesised single-argument form prints the same on Python 2 and 3.
    print(qp.get_queries())