From 83710db231be8a86e9a9a7cb581cbbc0d5d275a4 Mon Sep 17 00:00:00 2001
From: Eric Guo <2364319479@qq.com>
Date: Sat, 27 Jan 2024 16:37:12 +0800
Subject: [PATCH] =?UTF-8?q?=E5=8A=A8=E6=80=81=E7=9B=91=E6=B5=8B=E8=AF=AD?=
 =?UTF-8?q?=E8=A8=80=E5=AE=9E=E7=8E=B0?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
Review notes (this section is commentary, not part of the commit message):
 * The subject decodes to a Chinese title meaning "implement dynamic
   language detection".
 * api_tesseractocr.py: removes the TESSERACT_SUPPORTED list, Api.check()
   and the handling of the "language.~enabledOther" / "language.~other"
   options in get_select_languages().
 * tesseractocr_config.py: adds a TESSERACT_SUPPORTED code-to-title dict and
   _dymanicLangList(), which lists engine/tessdata next to the module and
   adds a localOptions['language'] entry for every *.traineddata file that
   is not a *vert.traineddata file, is not already listed, and is a key of
   TESSERACT_SUPPORTED. The function is called once at the end of the module.
 * Fixed in this copy: the 'uzb_cyrl' value had the following 'vie' row
   fused into it ('Uzbek - Cyrilic vie Vietnamese'), which dropped the 'vie'
   key. It is now two separate entries on the same added line.
 * NOTE(review): modelsInstalled.txt is opened in append mode on every module
   load, so its contents accumulate; confirm this is intended. The explicit
   f.close() is redundant because the with-statement already closes f.
 * NOTE(review): line breaks and indentation were reconstructed from a
   whitespace-collapsed copy of this patch. Line counts match the hunk
   headers and the diffstat, but verify leading whitespace of context lines
   against the target files (or apply with --ignore-whitespace).

 api_tesseractocr.py    | 21 ---------------------
 tesseractocr_config.py | 31 +++++++++++++++++++++++--------
 2 files changed, 23 insertions(+), 29 deletions(-)

diff --git a/api_tesseractocr.py b/api_tesseractocr.py
index 6ff8efb..a0bf7ec 100644
--- a/api_tesseractocr.py
+++ b/api_tesseractocr.py
@@ -11,41 +11,20 @@
 # 依赖包目录
 SitePackages = os.path.join(CurrentDir, "site-packages")
 
-TESSERACT_SUPPORTED = ['afr', 'amh', 'ara', 'asm', 'aze', 'aze', 'bel', 'ben', 'bod', 'bos', 'bre', 'bul', 'cat', 'ceb', 'ces', 'chi', 'chi', 'chr', 'cos', 'cym', 'dan', 'deu', 'div', 'dzo', 'ell', 'Mod', '145', 'eng', 'enm', 'Mid', '110', 'epo', 'equ', 'est', 'eus', 'fas', 'fao', 'fil', 'fin', 'fra', 'frk', 'frm', 'Mid', 'ca.', 'fry', 'gla', 'gle', 'glg', 'grc', 'Anc', 'to ', 'guj', 'hat', 'heb', 'hin', 'hrv', 'hun', 'hye', 'iku', 'ind', 'isl', 'ita', 'ita', 'jav', 'jpn', 'kan', 'kat', 'kat', 'kaz', 'khm', 'kir', 'kmr', 'kor', 'kor', 'lao', 'lat', 'lav', 'lit', 'ltz', 'mal', 'mar', 'mkd', 'mlt', 'mon', 'mri', 'msa', 'mya', 'nep', 'nld', 'nor', 'oci', 'ori', 'osd', 'pan', 'pol', 'por', 'pus', 'que', 'ron', 'rus', 'san', 'sin', 'slk', 'slv', 'snd', 'spa', 'spa', 'sqi', 'srp', 'srp', 'sun', 'swa', 'swe', 'syr', 'tam', 'tat', 'tel', 'tgk', 'tha', 'tir', 'ton', 'tur', 'uig', 'ukr', 'urd', 'uzb', 'uzb', 'vie', 'yid', 'yor']
 ModelDir = os.path.join(CurrentDir,"engine/tessdata/")
 
 class Api:
     def __init__(self, globalArgd):
         self.tesseractOcr = None
-
-    def check(self,languages:list) -> (list,list):
-        unsupported = []
-        uninstalled = []
-        for language in languages:
-            if language not in TESSERACT_SUPPORTED:
-                unsupported.append(language)
-            elif not os.path.exists(ModelDir + language + ".traineddata"):
-                uninstalled.append(language)
-        return (unsupported,uninstalled)
 
     def get_select_languages(self,argd) -> list:
         selects = []
         for k,flag in argd.items():
-            if k == "language.~enabledOther" or k == 'language.~other':
-                continue
             if k.startswith("language.") and flag:
                 language = k[9:]
                 if (language == 'chi_sim' or language == "chi_tra") and argd['vert']:
                     selects.append(language+"_vert")
                 selects.append(language)
-        if argd['language.~enabledOther']:
-            otherLangs = argd['language.~other'].split(" ")
-            unsupported,uninstalled = self.check(otherLangs)
-            if len(unsupported):
-                raise Exception(f"Unsupported languages: {unsupported}")
-            if len(uninstalled):
-                raise Exception(f"Uninstalled languages: {uninstalled}")
-            selects += otherLangs
         return selects
 
     # 获取两个连续单词的分隔符。letter1为单词1结尾字母,letter2为单词2结尾字母
diff --git a/tesseractocr_config.py b/tesseractocr_config.py
index d12f653..5fb4989 100644
--- a/tesseractocr_config.py
+++ b/tesseractocr_config.py
@@ -1,7 +1,25 @@
 from plugin_i18n import Translator
+import os
 
 tr = Translator(__file__, "i18n.csv")
 
+TESSERACT_SUPPORTED = {'afr': 'Afrikaans', 'amh': 'Amharic', 'ara': 'Arabic', 'asm': 'Assamese', 'aze': 'Azerbaijani', 'aze_cyrl': 'Azerbaijani - Cyrilic', 'bel': 'Belarusian', 'ben': 'Bengali', 'bod': 'Tibetan', 'bos': 'Bosnian', 'bre': 'Breton', 'bul': 'Bulgarian', 'cat': 'Catalan; Valencian', 'ceb': 'Cebuano', 'ces': 'Czech', 'chr': 'Cherokee', 'cos': 'Corsican', 'cym': 'Welsh', 'dan': 'Danish', 'deu': 'German', 'div': 'Dhivehi', 'dzo': 'Dzongkha', 'ell': 'Greek, Modern, 1453-', 'enm': 'English, Middle, 1100-1500', 'epo': 'Esperanto', 'equ': 'Math / equation detection module', 'est': 'Estonian', 'eus': 'Basque', 'fas': 'Persian', 'fao': 'Faroese', 'fil': 'Filipino', 'fin': 'Finnish', 'fra': 'French', 'frk': 'Frankish', 'frm': 'French, Middle, ca.1400-1600', 'fry': 'West Frisian', 'gla': 'Scottish Gaelic', 'gle': 'Irish', 'glg': 'Galician', 'grc': 'Greek, Ancient, to 1453', 'guj': 'Gujarati', 'hat': 'Haitian; Haitian Creole', 'heb': 'Hebrew', 'hin': 'Hindi', 'hrv': 'Croatian', 'hun': 'Hungarian', 'hye': 'Armenian', 'iku': 'Inuktitut', 'ind': 'Indonesian', 'isl': 'Icelandic', 'ita': 'Italian', 'ita_old': 'Italian - Old', 'jav': 'Javanese', 'kan': 'Kannada', 'kat': 'Georgian', 'kat_old': 'Georgian - Old', 'kaz': 'Kazakh', 'khm': 'Central Khmer', 'kir': 'Kirghiz; Kyrgyz', 'kmr': 'Kurdish Kurmanji', 'kor': 'Korean', 'kor_vert': 'Korean vertical', 'lao': 'Lao', 'lat': 'Latin', 'lav': 'Latvian', 'lit': 'Lithuanian', 'ltz': 'Luxembourgish', 'mal': 'Malayalam', 'mar': 'Marathi', 'mkd': 'Macedonian', 'mlt': 'Maltese', 'mon': 'Mongolian', 'mri': 'Maori', 'msa': 'Malay', 'mya': 'Burmese', 'nep': 'Nepali', 'nld': 'Dutch; Flemish', 'nor': 'Norwegian', 'oci': 'Occitan post 1500', 'ori': 'Oriya', 'pan': 'Panjabi; Punjabi', 'pol': 'Polish', 'por': 'Portuguese', 'pus': 'Pushto; Pashto', 'que': 'Quechua', 'ron': 'Romanian; Moldavian; Moldovan', 'rus': 'Russian', 'san': 'Sanskrit', 'sin': 'Sinhala; Sinhalese', 'slk': 'Slovak', 'slv': 'Slovenian', 'snd': 'Sindhi', 'spa': 'Spanish; Castilian', 'spa_old': 'Spanish; Castilian - Old', 'sqi': 'Albanian', 'srp': 'Serbian', 'srp_latn': 'Serbian - Latin', 'sun': 'Sundanese', 'swa': 'Swahili', 'swe': 'Swedish', 'syr': 'Syriac', 'tam': 'Tamil', 'tat': 'Tatar', 'tel': 'Telugu', 'tgk': 'Tajik', 'tha': 'Thai', 'tir': 'Tigrinya', 'ton': 'Tonga', 'tur': 'Turkish', 'uig': 'Uighur; Uyghur', 'ukr': 'Ukrainian', 'urd': 'Urdu', 'uzb': 'Uzbek', 'uzb_cyrl': 'Uzbek - Cyrilic', 'vie': 'Vietnamese', 'yid': 'Yiddish', 'yor': 'Yoruba'}
+
+def _dymanicLangList():
+    modelsPath = os.path.dirname(os.path.abspath(__file__)) + "/engine/tessdata"
+    files = os.listdir(modelsPath)
+    with open(modelsPath+"/modelsInstalled.txt",'a') as f:
+        for fileName in files:
+            if fileName.endswith(".traineddata") and not fileName.endswith("vert.traineddata"):
+                f.write(fileName+"\n")
+                modelName = fileName.split(".")[0]
+                if not modelName in localOptions['language'] and modelName in TESSERACT_SUPPORTED:
+                    localOptions['language'][modelName] = {
+                        "title":TESSERACT_SUPPORTED[modelName],
+                        "default": False,
+                    }
+    f.close()
+
 globalOptions = {
     "title": tr("TesseractOCR(本地)"),
     "type": "group",
@@ -32,15 +50,10 @@
             "title": "日文",
             "default": False,
         },
-        "~enabledOther": {
-            "title": "启用自定义语言短码",
+        "equ": {
+            "title": "启用数学识别",
             "default": False,
         },
-        "~other": {
-            "title": "自定义语言短码",
-            "toolTip": "请查看tesseract官方文档,使用空格对所选语言进行分割。",
-            "default": "",
-        },
     },
     "psm":{
         "title": "自动识别排版",
@@ -57,4 +70,6 @@
         "toolTip": "识别数据中低于该置信度的内容将会被丢弃(输入范围:0~100)",
         "default": "60",
     }
-}
\ No newline at end of file
+}
+
+_dymanicLangList()
\ No newline at end of file