improve dicts

cdhigh · Nov 24, 2024 · 8e2164d · 8e2164d
1 parent de1620c
commit 8e2164d
Show file tree

Hide file tree

Showing 6 changed files with 58 additions and 34 deletions.
diff --git a/application/lib/dictionary/babylon/bgl_reader.py b/application/lib/dictionary/babylon/bgl_reader.py
@@ -20,7 +20,7 @@
 RESOURCE = 2
 CHARSET = {
     0x41: "cp1252", #Default
-    0x42: "ISO-8859-1", #Latin
+    0x42: "utf-8", #Latin, 原来是 ISO-8859-1，但是部分词典乱码
     0x43: "ISO-8859-2", #Eastern European
     0x44: "cp1251", #Cyrillic
     0x45: "cp932",#Japanese
@@ -103,17 +103,21 @@ def unpack_term11(data: bytes, wordOnly=True) -> (bytes, bytes):
         definition = definition[:sepIdx]
     return word, definition
 
-def resilient_decode(data: bytes, encoding: str, fallback: str = 'latin1') -> str:
+def resilient_decode(data: bytes, encoding: str, fallback: str='latin1') -> str:
     """decode data to string with encoding, and try fallback when errors occur"""
-    ret = ''
-    while len(data) > 0:
-        try:
-            ret += data.decode(encoding)
-            break
-        except UnicodeDecodeError as e:
-            ret += data[e.start:e.end].decode(fallback)
-            data = data[e.end:]
-    return ret
+    #ret = ''
+    #while len(data) > 0:
+    #    try:
+    #        ret += data.decode(encoding)
+    #        break
+    #    except UnicodeDecodeError as e:
+    #        ret += data[e.start:e.end].decode(fallback)
+    #        data = data[e.end:]
+    #return ret
+    try:
+        return data.decode(encoding)
+    except UnicodeDecodeError:
+        return data.decode(fallback)
 
 #封装类文件对象，读取和移动文件指针都添加一个偏移
 class OffsetFileWrapper:
@@ -220,31 +224,45 @@ def __init__(self, fileName, resetBeforeParse=False):
 
     #查询一个单词的释义
     def query(self, word, default=''):
-        for wd in [word, word.lower(), word.capitalize()]:
+        for wd in [word, word.lower(), word.capitalize(), word.upper()]:
             if wd in self.trie:
                 break
         else:
             return default
 
-        pos = int.from_bytes(self.trie[wd][0], byteorder='big')
+        definitions = []
+        elems = self.trie[wd][0]
         if self.resetBeforeParse:
             self.bglFile.reset()
-        self.bglFile.seek(pos)
-        self.gzipPos = pos
-        pos, recType, data = self.readRecord()
-        if recType is None:
-            return default
-
-        _, definition = self.decodeWordRecord(recType, data, wordOnly=False)
-        return self.justifyDefinition(definition)
+        for i in range(0, len(elems), 4): #每4字节一个释义，合并所有释义
+            pos = int.from_bytes(elems[i: i + 4], byteorder='big')
+            self.bglFile.seek(pos)
+            self.gzipPos = pos
+            _, recType, data = self.readRecord()
+            if recType is not None:
+                _, definition = self.decodeWordRecord(recType, data, wordOnly=False)
+                definitions.append(definition)
+        return self.justifyDefinition('<br/>'.join(definitions))
 
     #分析索引数据，构建前缀树
     def buildTrie(self):
         encoding = self.encoding
-        records = [(word, pos.to_bytes(4, byteorder='big')) for word, pos in self.wordList()]
-        self.trie = marisa_trie.BytesTrie(records) #type:ignore
+        records = {}
+        for word, pos in self.wordList():
+            if word in records: #每4个字节一个释义，最大支持 2**32 字节的文件
+                records[word] += pos.to_bytes(4, byteorder='big')
+            else:
+                records[word] = pos.to_bytes(4, byteorder='big')
+
+        self.trie = marisa_trie.BytesTrie((word, pos) for word, pos in records.items()) #type:ignore
         self.trie.save(self.trieFileName)
 
+        #测试使用 TODO
+        #with open(self.trieFileName + '.json', 'w', encoding='utf-8') as f:
+        #    import json, binascii
+        #    json.dump({word: f'0x{binascii.hexlify(pos).decode("ascii")}' for word, pos in records.items()}, 
+        #        f, ensure_ascii=False, indent=2)
+
         #同时保存内容编码
         with open(self.encFileName, 'w', encoding='utf-8') as f:
             f.write(self.encoding)
@@ -273,7 +291,6 @@ def wordList(self):
             word = self.justifyWord(word)
             if word:
                 yield (word, pos)
-        #bglFile.close()
 
     #当前只分析单词块，返回 word, definition
     #wordOnly: =True - 不解码释义部分，节省时间
@@ -350,9 +367,10 @@ def justifyWord(self, word: str):
         word = re.sub(r'&#(\d+);', lambda match: chr(int(match.group(1))), word)
         word = re.sub(r'&#x([0-9A-Fa-f]+);', lambda match: chr(int(match.group(1), 16)), word)
         word = re.sub(r'\s+', ' ', word)
-        return word
+        return word.strip()
 
     def justifyDefinition(self, definition):
+        #print(definition) #TODO
         if not definition:
             return ''
 
@@ -366,7 +384,12 @@ def justifyDefinition(self, definition):
         for a in soup.find_all('a', href=True):
             href = a['href']
             if href.startswith('bword://'):
-                a['href'] = f'https://kindleear/entry/{href[8:].strip()}'
+                bword1 = href[8:].strip()
+                bword2 = (a.string or bword1).strip()
+                if bword1 in self.trie:
+                    a['href'] = f'https://kindleear/entry/{bword1}'
+                else:
+                    a['href'] = f'https://kindleear/entry/{bword2}'
             else:
                 a.extract()
 

diff --git a/application/lib/dictionary/lingvo/dsl_reader.py b/application/lib/dictionary/lingvo/dsl_reader.py
@@ -112,7 +112,7 @@ def openDslFile(self):
 
     #查词接口
     def get(self, word, default=''): #type:ignore
-        for wd in [word, word.lower(), word.capitalize()]:
+        for wd in [word, word.lower(), word.capitalize(), word.upper()]:
             if wd in self.trie:
                 break
         else:

diff --git a/application/lib/dictionary/mdict/mdict.py b/application/lib/dictionary/mdict/mdict.py
@@ -105,7 +105,7 @@ def get(self, word):
         if word == ':about':
             return self.dict_html_info()
 
-        for wd in [word, word.lower(), word.capitalize()]:
+        for wd in [word, word.lower(), word.capitalize(), word.upper()]:
             if wd in self.trie:
                 indexes = self.trie[word]
                 break

diff --git a/application/lib/dictionary/stardict/pystardict.py b/application/lib/dictionary/stardict/pystardict.py
@@ -50,7 +50,7 @@ def __repr__(self):
         return f'{self.__class__} {self.ifo.bookname}'
 
     def get(self, word, default=''): #type:ignore
-        for wd in [word, word.lower(), word.capitalize()]:
+        for wd in [word, word.lower(), word.capitalize(), word.upper()]:
             if wd in self.idx:
                 return self[wd]
         else:

diff --git a/docs/Chinese/reader.md b/docs/Chinese/reader.md
@@ -45,10 +45,11 @@ KindleEar支持邮件推送和在线阅读，内置一个为电子墨水屏进
 
 ### 安装词典
 1. KindleEar支持在线词典 [dict.org](https://dict.org/), [dict.cc](https://www.dict.cc/), [dict.cn](http://dict.cn/), [韦氏词典](https://www.merriam-webster.com/)，[牛津词典](https://www.oxfordlearnersdictionaries.com/)， 这几个词典不需要安装，开箱即用。    
-2. 在线词典很方便，但是避免有时候因为网络原因不是太稳定，所以如果要稳定使用，最好还是使用离线词典，为此，KindleEar同时支持 mdict/stardict/lingvo 格式词典，下载对应的词典后，解压到 `data/dict` 目录（可以使用子目录整理不同的词典）。   
+2. 在线词典很方便，但有时候会因为网络原因不太稳定，所以如果要稳定使用，最好还是使用离线词典，为此，KindleEar同时支持 mdict/stardict/lingvo/babylon 格式词典，下载对应的词典后，解压到 `data/dict` 目录（可以使用子目录整理不同的词典）。   
   * mdict: 只需要 mdx 文件，如果有css，则需要位于同样目录   
   * stardict: 需要 ifo, idx, dict 或 dict.dz   
-  * lingvo: 只需要 dsl 文件，不支持 dsl.dz，需要将 dsl.dz 解压缩为 dsl     
+  * lingvo: 只需要 dsl 文件，不支持 dsl.dz，需要将 dsl.dz 解压缩为 dsl (使用gzip/7zip/winrar等软件)     
+  * babylon: 只需要 bgl 文件 （如果查词乱码，可以修改 \*.enc 文件为准确的编码）    
 3. 离线词典第一次查词会比较慢，因为要创建索引文件(后缀为trie)，之后就很快了。 
 如果要使用大型词典（比如几百兆以上），在生成索引的过程中会消耗比较多的内存，如你的服务器内存比较小，可能会创建索引失败，你可以在你的本地机器先使用对应词典查一次单词，待本地生成trie文件后，拷贝到服务器对应目录即可。   
 4. 已经默认支持美式英语的构词法规则，可以查询单词时态语态复数等变形，如果需要支持其他语种的构词法，请下载对应的hunspell格式的文件（.dic/.aff），然后拷贝到 `data/dict/morphology` (请直接创建此目录) ，注意不要存放到子目录下，KindleEar会自动使用和书本语言相匹配的构词法规则。   

diff --git a/docs/English/reader.md b/docs/English/reader.md
@@ -50,11 +50,11 @@ The extracted word is sent to your deployed KindleEar site for translation, and
 ### Installing Dictionaries
 1. KindleEar supports online dictionaries such as [dict.org](https://dict.org/), [dict.cc](https://www.dict.cc/), [dict.cn](http://dict.cn/), [Merriam-Webster](https://www.merriam-webster.com/), [Oxford](https://www.oxfordlearnersdictionaries.com/). These dictionaries require no installation and are ready to use out of the box.    
 
-2. KindleEar also supports offline dictionaries in the mdict/stardict/lingvo format. After downloading the corresponding dictionary, unzip it into the `data/dict` directory. You can organize different dictionaries into subdirectories. Then, restart the KindleEar service to refresh the dictionary list.    
+2. KindleEar also supports offline dictionaries in the mdict/stardict/lingvo/babylon formats. After downloading the corresponding dictionary, unzip it into the `data/dict` directory. You can organize different dictionaries into subdirectories. Then, restart the KindleEar service to refresh the dictionary list.    
   * mdict: Requires `.mdx` file only. If there is a `.css`, it must be in the same directory.    
   * stardict: Requires `.ifo`, `.idx`, and `.dict` (or `.dict.dz`).   
-  * lingvo: Only the `.dsl`. requires extracting it into `dsl` if you have `dsl.dz`.  
-
+  * lingvo: Requires the `.dsl` file only. `.dsl.dz` is not supported; if you have a `.dsl.dz`, extract it to `.dsl` first (using gzip/7zip/WinRAR, etc.).   
+  * babylon: Requires the `.bgl` file only. (If lookups return garbled text, set the correct encoding in the \*.enc file.)       
 3. The first time you look up a word in an offline dictionary, it may be slow because an index file (suffix: trie) has to be created. After that, lookups will be much faster. 
 If you are using a large dictionary (for example, above several hundred megabytes), the indexing process will consume a significant amount of memory. If the server has limited memory, the indexing might fail. You can first use the dictionary on your local machine to look up a word and generate the "trie" file, then copy it to the corresponding directory on the server.