-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathparse.py
120 lines (95 loc) · 4.25 KB
/
parse.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
#!/bin/env python3
import readidx
import os
from exceptions import *
import re
class Parser:
    """Convert a StarDict dictionary (.ifo/.idx/.dict) into a delimited text file.

    ``dictFormat`` describes one output line: the characters at even
    positions select record fields and ``dictFormat[1]`` is the delimiter
    written between them.  Field letters known to the writer are:

    * ``k`` - the headword,
    * ``r`` - the raw definition text,
    * ``m``/``h``/``t``/``g`` - the definition, but only when it matches the
      dictionary's ``sametypesequence`` (empty otherwise),
    * ``b``/``i``/``s`` - text found inside ``<b>``, ``<i>`` and ``<small>``
      tags (only for ``sametypesequence=g``).
    """

    # Class-level fallback kept for backward compatibility; every instance
    # gets its own dict in __init__ so parsed values are never shared.
    properties = {}

    def __init__(self, dictFormat, header):
        """Store the output format.

        Args:
            dictFormat: format string; its second character is the delimiter.
            header: when truthy, a ``#DICTFORMAT:`` line is written first.
        """
        self.dictFormat = dictFormat
        self.delimiter = dictFormat[1]
        self.header = header
        # Per-instance storage: parseIfoLine() mutates this dict, so it must
        # not be the dict shared through the class attribute.
        self.properties = {}

    def parseIfoLine(self, line):
        """Parse one ``key=value`` line and record it in ``self.properties``.

        Only the first ``=`` separates key from value, so values may contain
        ``=`` themselves (e.g. URLs with a query string).

        Returns:
            The two-element list ``[key, value]``.

        Raises:
            BadFormatException: if the line contains no ``=``.
        """
        parts = line.rstrip().split("=", 1)
        if len(parts) != 2:
            raise BadFormatException("czech-cizi.ifo inappropriately formatted")
        self.properties[parts[0]] = parts[1]
        return parts

    def parse(self, ifo, dictF, idx, txt):
        """Read the dictionary and write it as text to ``txt``.

        If ``txt`` already exists, a numbered sibling (``name2.ext``,
        ``name3.ext``, ...) is written instead of overwriting it.

        Args:
            ifo: path of the .ifo metadata file.
            dictF: path of the (uncompressed) .dict data file.
            idx: path of the .idx index file, handed to ``readidx``.
            txt: desired path of the output text file.

        Raises:
            BadFormatException: malformed .ifo, unsupported version or bad
                ``idxoffsetbits`` value.
            NotSupportedFormat: missing or unsupported ``sametypesequence``.
        """
        self.properties = {}
        self._readIfo(ifo)
        with open(dictF, 'rb') as data:
            readidx.startRead(idx, self.idxoffsetbits // 8)  # width in bytes
            try:
                txt = self._uniqueOutputPath(txt)
                # Definitions are decoded as UTF-8, so write them as UTF-8
                # regardless of the locale's default encoding.
                with open(txt, 'w', encoding='utf-8') as wDict:
                    if self.header:
                        wDict.write('#DICTFORMAT:%s\n' % (self.dictFormat,))
                    while True:
                        try:
                            word, offset, size = readidx.nextRecord()
                        except StopIteration:
                            # readidx signals the end of the index this way.
                            break
                        # Honour the offset from the index instead of assuming
                        # the records are laid out back to back in the data
                        # file (assumes offset is an int byte position).
                        data.seek(offset)
                        text = data.read(size).decode('utf-8')
                        wDict.write(self._formatRecord(word, text))
            finally:
                # Always release the index reader, even if writing failed.
                readidx.stopRead()

    def _readIfo(self, ifo):
        """Load and validate the .ifo file into properties/idxoffsetbits."""
        with open(ifo, "r", encoding="utf-8") as r:
            r.readline()  # first line is just intro msg
            verN, ver = self.parseIfoLine(r.readline())
            if verN != "version":
                raise BadFormatException("czech-cizi.ifo inappropriately formatted")
            if ver not in ("2.4.2", "3.0.0"):
                raise BadFormatException("Only stardict version 2.4.2 and 3.0.0 supported")
            for x in r:
                if x.strip():  # tolerate blank lines
                    self.parseIfoLine(x)
        if 'sametypesequence' not in self.properties:
            raise NotSupportedFormat("Only dictionaries with specified sametypesequence are supported.")
        supported = ['m', 'g', 'h', 't']
        if self.properties['sametypesequence'] not in supported:
            raise NotSupportedFormat("Dictionaries with sametypesequence=%s are not supported." % (self.properties['sametypesequence']))
        self.idxoffsetbits = 32
        if 'idxoffsetbits' in self.properties:
            if self.properties['idxoffsetbits'] not in ('32', '64'):
                raise BadFormatException('Bad value in field idxoffsetbits')
            self.idxoffsetbits = int(self.properties['idxoffsetbits'])  # safe, can be 32, 64

    @staticmethod
    def _uniqueOutputPath(txt):
        """Return ``txt``, or a numbered variant of it if ``txt`` is taken."""
        if not os.path.isfile(txt):
            return txt
        # splitext keeps names without a ".txt" suffix intact, unlike
        # blindly chopping the last four characters.
        root, ext = os.path.splitext(txt)
        number = 2
        while os.path.isfile('%s%d%s' % (root, number, ext)):
            number += 1
        return '%s%d%s' % (root, number, ext)

    def _formatRecord(self, word, text):
        """Build one output line (including the newline) for a record."""
        seqType = self.properties['sametypesequence']
        record = {'r': text, 'k': word}
        # sametypesequence=m,h,t has no other information
        for kind in ('h', 'm', 't', 'g'):
            record[kind] = text if seqType == kind else ''
        record['b'] = self.parseTag('b', text)
        record['i'] = self.parseTag('i', text)
        record['s'] = self.parseTag('small', text)

        fmt = self.dictFormat
        pieces = []
        for pos in range(0, len(fmt), 2):
            if fmt[pos] in record:
                # Escape newlines and the delimiter so each record stays on
                # one line with a fixed number of columns.
                pieces.append(record[fmt[pos]]
                              .replace('\n', '\\n')
                              .replace(self.delimiter, '\\' + self.delimiter))
            if pos + 1 != len(fmt):
                pieces.append(self.delimiter)
        pieces.append('\n')
        return ''.join(pieces)

    def parseTag(self, tag, text):
        """Return the concatenated contents of every ``<tag>...</tag>`` in text.

        Only tags whose content holds no nested ``<`` are matched.  Returns
        an empty string unless ``sametypesequence`` is ``g``.
        """
        if self.properties['sametypesequence'] != 'g':
            return ''
        # The content group cannot contain '<', so capturing it directly is
        # the same as matching the whole element and stripping its tags.
        return ''.join(re.findall('<%s>([^<]*)</%s>' % (tag, tag), text))