Skip to content

Commit

Permalink
Merge pull request #3 from qwedc001/feat.dymanicLang
Browse files Browse the repository at this point in the history
动态监测语言实现
  • Loading branch information
qwedc001 authored Jan 27, 2024
2 parents 6c6e0ca + 83710db commit ea336a3
Show file tree
Hide file tree
Showing 2 changed files with 23 additions and 29 deletions.
21 changes: 0 additions & 21 deletions api_tesseractocr.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,41 +11,20 @@
# Dependency package directory
SitePackages = os.path.join(CurrentDir, "site-packages")

# Language codes (traineddata base names) accepted as custom languages.
# The earlier list was built by cutting each documentation row to three
# characters, which dropped suffixes such as "_sim"/"_cyrl"/"_old" and
# injected fragments of language descriptions ("Mod", "145", "ca.", ...).
# The full codes are restored here, one entry per model, without duplicates.
TESSERACT_SUPPORTED = [
    'afr', 'amh', 'ara', 'asm', 'aze', 'aze_cyrl', 'bel', 'ben', 'bod',
    'bos', 'bre', 'bul', 'cat', 'ceb', 'ces', 'chi_sim', 'chi_tra', 'chr',
    'cos', 'cym', 'dan', 'deu', 'div', 'dzo', 'ell', 'eng', 'enm', 'epo',
    'equ', 'est', 'eus', 'fas', 'fao', 'fil', 'fin', 'fra', 'frk', 'frm',
    'fry', 'gla', 'gle', 'glg', 'grc', 'guj', 'hat', 'heb', 'hin', 'hrv',
    'hun', 'hye', 'iku', 'ind', 'isl', 'ita', 'ita_old', 'jav', 'jpn',
    'kan', 'kat', 'kat_old', 'kaz', 'khm', 'kir', 'kmr', 'kor', 'kor_vert',
    'lao', 'lat', 'lav', 'lit', 'ltz', 'mal', 'mar', 'mkd', 'mlt', 'mon',
    'mri', 'msa', 'mya', 'nep', 'nld', 'nor', 'oci', 'ori', 'osd', 'pan',
    'pol', 'por', 'pus', 'que', 'ron', 'rus', 'san', 'sin', 'slk', 'slv',
    'snd', 'spa', 'spa_old', 'sqi', 'srp', 'srp_latn', 'sun', 'swa', 'swe',
    'syr', 'tam', 'tat', 'tel', 'tgk', 'tha', 'tir', 'ton', 'tur', 'uig',
    'ukr', 'urd', 'uzb', 'uzb_cyrl', 'vie', 'yid', 'yor',
]
# Directory searched for "<language>.traineddata" model files. The trailing
# slash matters: the file name is appended by plain string concatenation.
ModelDir = os.path.join(CurrentDir,"engine/tessdata/")

class Api:
# Constructor. `globalArgd` is accepted but not read in this body; the
# engine handle starts out as None (presumably created later — not visible here).
def __init__(self, globalArgd):
self.tesseractOcr = None

def check(self, languages: list) -> (list, list):
    """Sort the requested language codes into problem categories.

    Args:
        languages: language codes to validate.

    Returns:
        A ``(unsupported, uninstalled)`` tuple. ``unsupported`` holds the
        codes missing from ``TESSERACT_SUPPORTED``; ``uninstalled`` holds
        the supported codes that have no ``<code>.traineddata`` file under
        ``ModelDir``. Both lists keep the order of ``languages``.
    """
    def _model_missing(code):
        # ModelDir ends with a slash, so plain concatenation yields the path.
        return not os.path.exists(ModelDir + code + ".traineddata")

    unsupported = [code for code in languages if code not in TESSERACT_SUPPORTED]
    uninstalled = [
        code
        for code in languages
        if code in TESSERACT_SUPPORTED and _model_missing(code)
    ]
    return (unsupported, uninstalled)

def get_select_languages(self, argd) -> list:
    """Collect the Tesseract language codes selected in the option dict.

    Every truthy ``language.<code>`` entry contributes ``<code>``. For
    ``chi_sim`` and ``chi_tra`` the matching ``<code>_vert`` model is added
    in front of it when ``argd['vert']`` is truthy. When
    ``language.~enabledOther`` is truthy, the whitespace-separated codes in
    ``language.~other`` are validated with ``self.check`` and appended.

    Args:
        argd: mapping of option keys to values.

    Returns:
        The selected language codes, in the iteration order of ``argd``,
        followed by any custom codes.

    Raises:
        Exception: a custom code is unsupported or its model is not installed.
    """
    prefix = "language."
    enabledKey = prefix + "~enabledOther"
    otherKey = prefix + "~other"

    selects = []
    for k, flag in argd.items():
        # The two "~" entries configure the custom list; they are not codes.
        if k == enabledKey or k == otherKey:
            continue
        if k.startswith(prefix) and flag:
            language = k[len(prefix):]
            # Missing 'vert' is treated as "horizontal only" instead of raising.
            if language in ("chi_sim", "chi_tra") and argd.get("vert"):
                selects.append(language + "_vert")
            selects.append(language)

    if argd.get(enabledKey):
        # split() without an argument drops the empty strings that
        # split(" ") produced for blank input or repeated spaces; those
        # were then rejected as "unsupported" languages.
        otherLangs = (argd.get(otherKey) or "").split()
        if otherLangs:
            unsupported, uninstalled = self.check(otherLangs)
            if unsupported:
                raise Exception(f"Unsupported languages: {unsupported}")
            if uninstalled:
                raise Exception(f"Uninstalled languages: {uninstalled}")
            selects += otherLangs
    return selects

# 获取两个连续单词的分隔符。letter1为单词1结尾字母,letter2为单词2结尾字母
Expand Down
31 changes: 23 additions & 8 deletions tesseractocr_config.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,25 @@
from plugin_i18n import Translator
import os

# Translation callable used for user-facing option titles; presumably loads
# the "i18n.csv" table located next to this file — verify against plugin_i18n.
tr = Translator(__file__, "i18n.csv")

# Maps a Tesseract model code (the traineddata base name) to the title shown
# for its dynamically generated language option. 'vie' used to be fused into
# the 'uzb_cyrl' title, so Vietnamese models were never offered; it is a
# separate entry now.
TESSERACT_SUPPORTED = {
    'afr': 'Afrikaans',
    'amh': 'Amharic',
    'ara': 'Arabic',
    'asm': 'Assamese',
    'aze': 'Azerbaijani',
    'aze_cyrl': 'Azerbaijani - Cyrilic',
    'bel': 'Belarusian',
    'ben': 'Bengali',
    'bod': 'Tibetan',
    'bos': 'Bosnian',
    'bre': 'Breton',
    'bul': 'Bulgarian',
    'cat': 'Catalan; Valencian',
    'ceb': 'Cebuano',
    'ces': 'Czech',
    'chr': 'Cherokee',
    'cos': 'Corsican',
    'cym': 'Welsh',
    'dan': 'Danish',
    'deu': 'German',
    'div': 'Dhivehi',
    'dzo': 'Dzongkha',
    'ell': 'Greek, Modern, 1453-',
    'enm': 'English, Middle, 1100-1500',
    'epo': 'Esperanto',
    'equ': 'Math / equation detection module',
    'est': 'Estonian',
    'eus': 'Basque',
    'fas': 'Persian',
    'fao': 'Faroese',
    'fil': 'Filipino',
    'fin': 'Finnish',
    'fra': 'French',
    'frk': 'Frankish',
    'frm': 'French, Middle, ca.1400-1600',
    'fry': 'West Frisian',
    'gla': 'Scottish Gaelic',
    'gle': 'Irish',
    'glg': 'Galician',
    'grc': 'Greek, Ancient, to 1453',
    'guj': 'Gujarati',
    'hat': 'Haitian; Haitian Creole',
    'heb': 'Hebrew',
    'hin': 'Hindi',
    'hrv': 'Croatian',
    'hun': 'Hungarian',
    'hye': 'Armenian',
    'iku': 'Inuktitut',
    'ind': 'Indonesian',
    'isl': 'Icelandic',
    'ita': 'Italian',
    'ita_old': 'Italian - Old',
    'jav': 'Javanese',
    'kan': 'Kannada',
    'kat': 'Georgian',
    'kat_old': 'Georgian - Old',
    'kaz': 'Kazakh',
    'khm': 'Central Khmer',
    'kir': 'Kirghiz; Kyrgyz',
    'kmr': 'Kurdish Kurmanji',
    'kor': 'Korean',
    'kor_vert': 'Korean vertical',
    'lao': 'Lao',
    'lat': 'Latin',
    'lav': 'Latvian',
    'lit': 'Lithuanian',
    'ltz': 'Luxembourgish',
    'mal': 'Malayalam',
    'mar': 'Marathi',
    'mkd': 'Macedonian',
    'mlt': 'Maltese',
    'mon': 'Mongolian',
    'mri': 'Maori',
    'msa': 'Malay',
    'mya': 'Burmese',
    'nep': 'Nepali',
    'nld': 'Dutch; Flemish',
    'nor': 'Norwegian',
    'oci': 'Occitan post 1500',
    'ori': 'Oriya',
    'pan': 'Panjabi; Punjabi',
    'pol': 'Polish',
    'por': 'Portuguese',
    'pus': 'Pushto; Pashto',
    'que': 'Quechua',
    'ron': 'Romanian; Moldavian; Moldovan',
    'rus': 'Russian',
    'san': 'Sanskrit',
    'sin': 'Sinhala; Sinhalese',
    'slk': 'Slovak',
    'slv': 'Slovenian',
    'snd': 'Sindhi',
    'spa': 'Spanish; Castilian',
    'spa_old': 'Spanish; Castilian - Old',
    'sqi': 'Albanian',
    'srp': 'Serbian',
    'srp_latn': 'Serbian - Latin',
    'sun': 'Sundanese',
    'swa': 'Swahili',
    'swe': 'Swedish',
    'syr': 'Syriac',
    'tam': 'Tamil',
    'tat': 'Tatar',
    'tel': 'Telugu',
    'tgk': 'Tajik',
    'tha': 'Thai',
    'tir': 'Tigrinya',
    'ton': 'Tonga',
    'tur': 'Turkish',
    'uig': 'Uighur; Uyghur',
    'ukr': 'Ukrainian',
    'urd': 'Urdu',
    'uzb': 'Uzbek',
    'uzb_cyrl': 'Uzbek - Cyrilic',
    'vie': 'Vietnamese',
    'yid': 'Yiddish',
    'yor': 'Yoruba',
}

def _dymanicLangList():
    """Register installed Tesseract models as extra language options.

    Scans ``engine/tessdata`` next to this file for ``*.traineddata`` files,
    ignoring the ``*vert.traineddata`` variants. The file names found are
    written to ``modelsInstalled.txt`` in that directory, and every model
    whose code is a key of ``TESSERACT_SUPPORTED`` and is not yet present in
    ``localOptions['language']`` is added there as a disabled-by-default
    option titled with its language name.

    Raises:
        OSError: the tessdata directory cannot be listed or the manifest
            cannot be written.
    """
    modelsPath = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), "engine", "tessdata"
    )
    # Sorted so the manifest and the generated options have a stable order;
    # os.listdir returns entries in arbitrary order.
    modelFiles = sorted(
        fileName
        for fileName in os.listdir(modelsPath)
        if fileName.endswith(".traineddata")
        and not fileName.endswith("vert.traineddata")
    )

    # "w" rather than "a": the manifest is a snapshot of what is installed
    # now. Appending made the file grow with duplicates on every import.
    manifestPath = os.path.join(modelsPath, "modelsInstalled.txt")
    with open(manifestPath, "w", encoding="utf-8") as f:
        f.writelines(fileName + "\n" for fileName in modelFiles)

    languageOptions = localOptions['language']
    for fileName in modelFiles:
        modelName = fileName.split(".")[0]
        # Statically declared options win; unknown models are not exposed.
        if modelName not in languageOptions and modelName in TESSERACT_SUPPORTED:
            languageOptions[modelName] = {
                "title": TESSERACT_SUPPORTED[modelName],
                "default": False,
            }

globalOptions = {
"title": tr("TesseractOCR(本地)"),
"type": "group",
Expand Down Expand Up @@ -32,15 +50,10 @@
"title": "日文",
"default": False,
},
"~enabledOther": {
"title": "启用自定义语言短码",
"equ": {
"title": "启用数学识别",
"default": False,
},
"~other": {
"title": "自定义语言短码",
"toolTip": "请查看tesseract官方文档,使用空格对所选语言进行分割。",
"default": "",
},
},
"psm":{
"title": "自动识别排版",
Expand All @@ -57,4 +70,6 @@
"toolTip": "识别数据中低于该置信度的内容将会被丢弃(输入范围:0~100)",
"default": "60",
}
}
}

# Executed at import time. Placed at the end of the module, presumably so
# that localOptions already exists when the function adds entries to it.
_dymanicLangList()

0 comments on commit ea336a3

Please sign in to comment.