Skip to content

Commit

Permalink
improve dicts
Browse files Browse the repository at this point in the history
  • Loading branch information
cdhigh committed Nov 24, 2024
1 parent de1620c commit 8e2164d
Show file tree
Hide file tree
Showing 6 changed files with 58 additions and 34 deletions.
75 changes: 49 additions & 26 deletions application/lib/dictionary/babylon/bgl_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
RESOURCE = 2
CHARSET = {
0x41: "cp1252", #Default
0x42: "ISO-8859-1", #Latin
0x42: "utf-8", #Latin, 原来是 ISO-8859-1,但是部分词典乱码
0x43: "ISO-8859-2", #Eastern European
0x44: "cp1251", #Cyriilic
0x45: "cp932",#Japanese
Expand Down Expand Up @@ -103,17 +103,21 @@ def unpack_term11(data: bytes, wordOnly=True) -> (bytes, bytes):
definition = definition[:sepIdx]
return word, definition

def resilient_decode(data: bytes, encoding: str, fallback: str='latin1') -> str:
    """Decode ``data`` with ``encoding``, retrying with ``fallback`` on failure.

    The buffer is always decoded with a single codec: ``encoding`` if it can
    decode every byte, otherwise ``fallback`` for the whole buffer.

    Args:
        data: Raw bytes to decode.
        encoding: Preferred codec name.
        fallback: Codec used for the whole buffer when ``encoding`` raises
            ``UnicodeDecodeError``.

    Returns:
        The decoded text. Bytes that even ``fallback`` cannot decode are
        replaced with U+FFFD instead of raising.
    """
    try:
        return data.decode(encoding)
    except UnicodeDecodeError:
        # The default latin1 maps every byte value, so 'replace' never kicks in
        # for it; it only matters when a caller passes a stricter fallback.
        return data.decode(fallback, errors='replace')

#封装类文件对象,读取和移动文件指针都添加一个偏移
class OffsetFileWrapper:
Expand Down Expand Up @@ -220,31 +224,45 @@ def __init__(self, fileName, resetBeforeParse=False):

#查询一个单词的释义
def query(self, word, default=''):
    """Look up ``word`` and return its post-processed definition(s).

    The word is tried as typed, then lower-cased, capitalized and
    upper-cased; the first variant present in the trie wins. The trie value
    is a run of 4-byte big-endian record offsets, one per definition. All
    readable records are decoded and joined with ``<br/>``.

    Args:
        word: The headword to look up.
        default: Value returned when the word is not indexed or none of its
            records can be read.

    Returns:
        The result of ``justifyDefinition`` on the merged definitions, or
        ``default`` when nothing was found.
    """
    candidates = (word, word.lower(), word.capitalize(), word.upper())
    key = next((wd for wd in candidates if wd in self.trie), None)
    if key is None:
        return default

    offsets = self.trie[key][0]
    if self.resetBeforeParse:
        self.bglFile.reset()

    definitions = []
    for i in range(0, len(offsets), 4):  # one 4-byte offset per definition
        pos = int.from_bytes(offsets[i: i + 4], byteorder='big')
        self.bglFile.seek(pos)
        self.gzipPos = pos
        _, recType, data = self.readRecord()
        if recType is not None:
            _, definition = self.decodeWordRecord(recType, data, wordOnly=False)
            definitions.append(definition)

    # No record could be read: honor the caller's default instead of
    # returning an empty string.
    if not definitions:
        return default
    return self.justifyDefinition('<br/>'.join(definitions))

#分析索引数据,构建前缀树
def buildTrie(self):
encoding = self.encoding
records = [(word, pos.to_bytes(4, byteorder='big')) for word, pos in self.wordList()]
self.trie = marisa_trie.BytesTrie(records) #type:ignore
records = {}
for word, pos in self.wordList():
if word in records: #每4个字节一个释义,最大支持 2**32 字节的文件
records[word] += pos.to_bytes(4, byteorder='big')
else:
records[word] = pos.to_bytes(4, byteorder='big')

self.trie = marisa_trie.BytesTrie((word, pos) for word, pos in records.items()) #type:ignore
self.trie.save(self.trieFileName)

#测试使用 TODO
#with open(self.trieFileName + '.json', 'w', encoding='utf-8') as f:
# import json, binascii
# json.dump({word: f'0x{binascii.hexlify(pos).decode("ascii")}' for word, pos in records.items()},
# f, ensure_ascii=False, indent=2)

#同时保存内容编码
with open(self.encFileName, 'w', encoding='utf-8') as f:
f.write(self.encoding)
Expand Down Expand Up @@ -273,7 +291,6 @@ def wordList(self):
word = self.justifyWord(word)
if word:
yield (word, pos)
#bglFile.close()

#当前只分析单词块,返回 word, definition
#wordOnly: =True - 不解码释义部分,节省时间
Expand Down Expand Up @@ -350,9 +367,10 @@ def justifyWord(self, word: str):
word = re.sub(r'&#(\d+);', lambda match: chr(int(match.group(1))), word)
word = re.sub(r'&#x([0-9A-Fa-f]+);', lambda match: chr(int(match.group(1), 16)), word)
word = re.sub(r'\s+', ' ', word)
return word
return word.strip()

def justifyDefinition(self, definition):
#print(definition) #TODO
if not definition:
return ''

Expand All @@ -366,7 +384,12 @@ def justifyDefinition(self, definition):
for a in soup.find_all('a', href=True):
href = a['href']
if href.startswith('bword://'):
a['href'] = f'https://kindleear/entry/{href[8:].strip()}'
bword1 = href[8:].strip()
bword2 = (a.string or bword1).strip()
if bword1 in self.trie:
a['href'] = f'https://kindleear/entry/{bword1}'
else:
a['href'] = f'https://kindleear/entry/{bword2}'
else:
a.extract()

Expand Down
2 changes: 1 addition & 1 deletion application/lib/dictionary/lingvo/dsl_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ def openDslFile(self):

#查词接口
def get(self, word, default=''): #type:ignore
for wd in [word, word.lower(), word.capitalize()]:
for wd in [word, word.lower(), word.capitalize(), word.upper()]:
if wd in self.trie:
break
else:
Expand Down
2 changes: 1 addition & 1 deletion application/lib/dictionary/mdict/mdict.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ def get(self, word):
if word == ':about':
return self.dict_html_info()

for wd in [word, word.lower(), word.capitalize()]:
# Try the word as typed, then its case variants. Fetch the entry with the
# variant that actually matched: indexing with the original spelling raises
# KeyError when only a variant is present in the trie.
for wd in [word, word.lower(), word.capitalize(), word.upper()]:
    if wd in self.trie:
        indexes = self.trie[wd]
        break
Expand Down
2 changes: 1 addition & 1 deletion application/lib/dictionary/stardict/pystardict.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ def __repr__(self):
return f'{self.__class__} {self.ifo.bookname}'

def get(self, word, default=''): #type:ignore
for wd in [word, word.lower(), word.capitalize()]:
for wd in [word, word.lower(), word.capitalize(), word.upper()]:
if wd in self.idx:
return self[wd]
else:
Expand Down
5 changes: 3 additions & 2 deletions docs/Chinese/reader.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,10 +45,11 @@ KindleEar支持邮件推送和在线阅读,内置一个为电子墨水屏进

### 安装词典
1. KindleEar支持在线词典 [dict.org](https://dict.org/), [dict.cc](https://www.dict.cc/), [dict.cn](http://dict.cn/), [韦氏词典](https://www.merriam-webster.com/), [牛津词典](https://www.oxfordlearnersdictionaries.com/), 这几个词典不需要安装,开箱即用。
2. 在线词典很方便,但是避免有时候因为网络原因不是太稳定,所以如果要稳定使用,最好还是使用离线词典,为此,KindleEar同时支持 mdict/stardict/lingvo 格式词典,下载对应的词典后,解压到 `data/dict` 目录(可以使用子目录整理不同的词典)。
2. 在线词典很方便,但有时会因为网络原因不太稳定,所以如果要稳定使用,最好还是使用离线词典。为此,KindleEar同时支持 mdict/stardict/lingvo/babylon 格式词典,下载对应的词典后,解压到 `data/dict` 目录(可以使用子目录整理不同的词典)。
* mdict: 只需要 mdx 文件,如果有css,则需要位于同样目录
* stardict: 需要 ifo, idx, dict 或 dict.dz
* lingvo: 只需要 dsl 文件,不支持 dsl.dz,需要将 dsl.dz 解压缩为 dsl
* lingvo: 只需要 dsl 文件,不支持 dsl.dz,需要将 dsl.dz 解压缩为 dsl (使用gzip/7zip/winrar等软件)
* babylon: 只需要 bgl 文件 (如果查词乱码,可以修改 \*.enc 文件为准确的编码)
3. 离线词典第一次查词会比较慢,因为要创建索引文件(后缀为trie),之后就很快了。
如果要使用大型词典(比如几百兆以上),在生成索引的过程中会消耗比较多的内存,如你的服务器内存比较小,可能会创建索引失败,你可以在你的本地机器先使用对应词典查一次单词,待本地生成trie文件后,拷贝到服务器对应目录即可。
4. 已经默认支持美式英语的构词法规则,可以查询单词时态语态复数等变形,如果需要支持其他语种的构词法,请下载对应的hunspell格式的文件(.dic/.aff),然后拷贝到 `data/dict/morphology` (请直接创建此目录) ,注意不要存放到子目录下,KindleEar会自动使用和书本语言相匹配的构词法规则。
Expand Down
6 changes: 3 additions & 3 deletions docs/English/reader.md
Original file line number Diff line number Diff line change
Expand Up @@ -50,11 +50,11 @@ The extracted word is sent to your deployed KindleEar site for translation, and
### Installing Dictionaries
1. KindleEar supports online dictionaries such as [dict.org](https://dict.org/), [dict.cc](https://www.dict.cc/), [dict.cn](http://dict.cn/), [Merriam-Webster](https://www.merriam-webster.com/), [Oxford](https://www.oxfordlearnersdictionaries.com/). These dictionaries require no installation and are ready to use out of the box.

2. KindleEar also supports offline dictionaries in the mdict/stardict/lingvo format. After downloading the corresponding dictionary, unzip it into the `data/dict` directory. You can organize different dictionaries into subdirectories. Then, restart the KindleEar service to refresh the dictionary list.
2. KindleEar also supports offline dictionaries in the mdict/stardict/lingvo/babylon formats. After downloading the corresponding dictionary, unzip it into the `data/dict` directory. You can organize different dictionaries into subdirectories. Then, restart the KindleEar service to refresh the dictionary list.
* mdict: Requires `.mdx` file only. If there is a `.css`, it must be in the same directory.
* stardict: Requires `.ifo`, `.idx`, and `.dict` (or `.dict.dz`).
* lingvo: Only the `.dsl`. requires extracting it into `dsl` if you have `dsl.dz`.

* lingvo: Requires the `.dsl` file only. `dsl.dz` is not supported; if you have a `dsl.dz` file, decompress it to `.dsl` first (using gzip/7zip/WinRAR, etc.).
* babylon: Requires the `.bgl` file only. (If lookup results are garbled, set the correct encoding in the corresponding \*.enc file.)
3. The first time you look up a word in an offline dictionary, it may be slow because an index file (suffix: trie) has to be created. After that, lookups will be much faster.
If you are using a large dictionary (for example, above several hundred megabytes), the indexing process will consume a significant amount of memory. If the server has limited memory, the indexing might fail. You can first use the dictionary on your local machine to look up a word and generate the "trie" file, then copy it to the corresponding directory on the server.

Expand Down

0 comments on commit 8e2164d

Please sign in to comment.