Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

动态监测语言实现 #3

Merged
merged 1 commit into from
Jan 27, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 0 additions & 21 deletions api_tesseractocr.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,41 +11,20 @@
# Directory of bundled dependency packages ("site-packages" under CurrentDir).
SitePackages = os.path.join(CurrentDir, "site-packages")

# Tesseract language codes accepted as custom language short codes.
# Each entry is the stem of a "<code>.traineddata" model file. Script and
# variant models keep their full suffixed code (e.g. "chi_sim", "aze_cyrl");
# truncating them to three letters would make the real codes look unsupported.
TESSERACT_SUPPORTED = [
    'afr', 'amh', 'ara', 'asm', 'aze', 'aze_cyrl', 'bel', 'ben', 'bod', 'bos',
    'bre', 'bul', 'cat', 'ceb', 'ces', 'chi_sim', 'chi_tra', 'chr', 'cos', 'cym',
    'dan', 'deu', 'div', 'dzo', 'ell', 'eng', 'enm', 'epo', 'equ', 'est',
    'eus', 'fas', 'fao', 'fil', 'fin', 'fra', 'frk', 'frm', 'fry', 'gla',
    'gle', 'glg', 'grc', 'guj', 'hat', 'heb', 'hin', 'hrv', 'hun', 'hye',
    'iku', 'ind', 'isl', 'ita', 'ita_old', 'jav', 'jpn', 'kan', 'kat', 'kat_old',
    'kaz', 'khm', 'kir', 'kmr', 'kor', 'kor_vert', 'lao', 'lat', 'lav', 'lit',
    'ltz', 'mal', 'mar', 'mkd', 'mlt', 'mon', 'mri', 'msa', 'mya', 'nep',
    'nld', 'nor', 'oci', 'ori', 'osd', 'pan', 'pol', 'por', 'pus', 'que',
    'ron', 'rus', 'san', 'sin', 'slk', 'slv', 'snd', 'spa', 'spa_old', 'sqi',
    'srp', 'srp_latn', 'sun', 'swa', 'swe', 'syr', 'tam', 'tat', 'tel', 'tgk',
    'tha', 'tir', 'ton', 'tur', 'uig', 'ukr', 'urd', 'uzb', 'uzb_cyrl', 'vie',
    'yid', 'yor',
]
# Directory of the "<code>.traineddata" model files. The trailing slash is
# significant: Api.check builds file paths by plain string concatenation.
ModelDir = os.path.join(CurrentDir,"engine/tessdata/")

class Api:
def __init__(self, globalArgd):
self.tesseractOcr = None

def check(self,languages:list) -> (list,list):
    """Split language codes into unsupported and not-installed groups.

    A code is "unsupported" when it is not in TESSERACT_SUPPORTED, and
    "uninstalled" when it is supported but ModelDir has no matching
    "<code>.traineddata" file. Input order is preserved within each group.

    Returns:
        A tuple ``(unsupported, uninstalled)`` of two lists.
    """
    missing_support = []
    missing_model = []
    for code in languages:
        if code in TESSERACT_SUPPORTED:
            # Supported code: only the model file on disk can still be missing.
            model_file = ModelDir + code + ".traineddata"
            if not os.path.exists(model_file):
                missing_model.append(code)
            continue
        missing_support.append(code)
    return (missing_support, missing_model)

def get_select_languages(self,argd) -> list:
selects = []
for k,flag in argd.items():
if k == "language.~enabledOther" or k == 'language.~other':
continue
if k.startswith("language.") and flag:
language = k[9:]
if (language == 'chi_sim' or language == "chi_tra") and argd['vert']:
selects.append(language+"_vert")
selects.append(language)
if argd['language.~enabledOther']:
otherLangs = argd['language.~other'].split(" ")
unsupported,uninstalled = self.check(otherLangs)
if len(unsupported):
raise Exception(f"Unsupported languages: {unsupported}")
if len(uninstalled):
raise Exception(f"Uninstalled languages: {uninstalled}")
selects += otherLangs
return selects

# 获取两个连续单词的分隔符。letter1为单词1结尾字母,letter2为单词2结尾字母
Expand Down
31 changes: 23 additions & 8 deletions tesseractocr_config.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,25 @@
from plugin_i18n import Translator
import os

# Translation helper used for user-facing option titles (see tr(...) in globalOptions).
# NOTE(review): presumably loads the "i18n.csv" next to this file — confirm in plugin_i18n.
tr = Translator(__file__, "i18n.csv")

# Maps a Tesseract language code (the stem of "<code>.traineddata") to the
# title shown for options generated from installed models.
TESSERACT_SUPPORTED = {
    'afr': 'Afrikaans', 'amh': 'Amharic', 'ara': 'Arabic', 'asm': 'Assamese',
    'aze': 'Azerbaijani', 'aze_cyrl': 'Azerbaijani - Cyrilic',
    'bel': 'Belarusian', 'ben': 'Bengali', 'bod': 'Tibetan', 'bos': 'Bosnian',
    'bre': 'Breton', 'bul': 'Bulgarian', 'cat': 'Catalan; Valencian',
    'ceb': 'Cebuano', 'ces': 'Czech', 'chr': 'Cherokee', 'cos': 'Corsican',
    'cym': 'Welsh', 'dan': 'Danish', 'deu': 'German', 'div': 'Dhivehi',
    'dzo': 'Dzongkha', 'ell': 'Greek, Modern, 1453-',
    'enm': 'English, Middle, 1100-1500', 'epo': 'Esperanto',
    'equ': 'Math / equation detection module', 'est': 'Estonian',
    'eus': 'Basque', 'fas': 'Persian', 'fao': 'Faroese', 'fil': 'Filipino',
    'fin': 'Finnish', 'fra': 'French', 'frk': 'Frankish',
    'frm': 'French, Middle, ca.1400-1600', 'fry': 'West Frisian',
    'gla': 'Scottish Gaelic', 'gle': 'Irish', 'glg': 'Galician',
    'grc': 'Greek, Ancient, to 1453', 'guj': 'Gujarati',
    'hat': 'Haitian; Haitian Creole', 'heb': 'Hebrew', 'hin': 'Hindi',
    'hrv': 'Croatian', 'hun': 'Hungarian', 'hye': 'Armenian',
    'iku': 'Inuktitut', 'ind': 'Indonesian', 'isl': 'Icelandic',
    'ita': 'Italian', 'ita_old': 'Italian - Old', 'jav': 'Javanese',
    'kan': 'Kannada', 'kat': 'Georgian', 'kat_old': 'Georgian - Old',
    'kaz': 'Kazakh', 'khm': 'Central Khmer', 'kir': 'Kirghiz; Kyrgyz',
    'kmr': 'Kurdish Kurmanji', 'kor': 'Korean', 'kor_vert': 'Korean vertical',
    'lao': 'Lao', 'lat': 'Latin', 'lav': 'Latvian', 'lit': 'Lithuanian',
    'ltz': 'Luxembourgish', 'mal': 'Malayalam', 'mar': 'Marathi',
    'mkd': 'Macedonian', 'mlt': 'Maltese', 'mon': 'Mongolian', 'mri': 'Maori',
    'msa': 'Malay', 'mya': 'Burmese', 'nep': 'Nepali', 'nld': 'Dutch; Flemish',
    'nor': 'Norwegian', 'oci': 'Occitan post 1500', 'ori': 'Oriya',
    'pan': 'Panjabi; Punjabi', 'pol': 'Polish', 'por': 'Portuguese',
    'pus': 'Pushto; Pashto', 'que': 'Quechua',
    'ron': 'Romanian; Moldavian; Moldovan', 'rus': 'Russian',
    'san': 'Sanskrit', 'sin': 'Sinhala; Sinhalese', 'slk': 'Slovak',
    'slv': 'Slovenian', 'snd': 'Sindhi', 'spa': 'Spanish; Castilian',
    'spa_old': 'Spanish; Castilian - Old', 'sqi': 'Albanian',
    'srp': 'Serbian', 'srp_latn': 'Serbian - Latin', 'sun': 'Sundanese',
    'swa': 'Swahili', 'swe': 'Swedish', 'syr': 'Syriac', 'tam': 'Tamil',
    'tat': 'Tatar', 'tel': 'Telugu', 'tgk': 'Tajik', 'tha': 'Thai',
    'tir': 'Tigrinya', 'ton': 'Tonga', 'tur': 'Turkish',
    'uig': 'Uighur; Uyghur', 'ukr': 'Ukrainian', 'urd': 'Urdu',
    'uzb': 'Uzbek',
    # 'vie' used to be fused into the 'uzb_cyrl' title, which left Vietnamese
    # without a key of its own; the two entries are separated here.
    'uzb_cyrl': 'Uzbek - Cyrilic', 'vie': 'Vietnamese',
    'yid': 'Yiddish', 'yor': 'Yoruba',
}

def _dymanicLangList():
    """Register installed Tesseract models as selectable language options.

    Scans the ``engine/tessdata`` directory next to this file for
    ``*.traineddata`` files, skipping those ending in ``vert.traineddata``.
    Each matching file name is written to ``modelsInstalled.txt`` in that
    directory. Every model whose code is a key of TESSERACT_SUPPORTED and
    is not already in ``localOptions['language']`` is added there as an
    option that is off by default.

    Raises:
        OSError: If the model directory cannot be listed or the list file
            cannot be written.
    """
    modelsPath = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), "engine", "tessdata"
    )
    suffix = ".traineddata"
    # os.listdir order is arbitrary; sorting keeps the written list and the
    # order of the generated options stable between runs.
    installed = sorted(
        name
        for name in os.listdir(modelsPath)
        if name.endswith(suffix) and not name.endswith("vert" + suffix)
    )
    # "w", not "a": this runs on every import, and append mode made the file
    # grow with duplicate entries each time instead of reflecting the
    # current state of the directory.
    with open(os.path.join(modelsPath, "modelsInstalled.txt"), "w") as f:
        f.writelines(name + "\n" for name in installed)
    languageOptions = localOptions['language']
    for fileName in installed:
        modelName = fileName.split(".")[0]
        if modelName not in languageOptions and modelName in TESSERACT_SUPPORTED:
            languageOptions[modelName] = {
                "title": TESSERACT_SUPPORTED[modelName],
                "default": False,
            }

globalOptions = {
"title": tr("TesseractOCR(本地)"),
"type": "group",
Expand Down Expand Up @@ -32,15 +50,10 @@
"title": "日文",
"default": False,
},
"~enabledOther": {
"title": "启用自定义语言短码",
"equ": {
"title": "启用数学识别",
"default": False,
},
"~other": {
"title": "自定义语言短码",
"toolTip": "请查看tesseract官方文档,使用空格对所选语言进行分割。",
"default": "",
},
},
"psm":{
"title": "自动识别排版",
Expand All @@ -57,4 +70,6 @@
"toolTip": "识别数据中低于该置信度的内容将会被丢弃(输入范围:0~100)",
"default": "60",
}
}
}

# Run the model scan once when this module is loaded, so installed models are
# added to localOptions['language'].
_dymanicLangList()