Skip to content

Commit

Permalink
Add alphabets for more languages
Browse files: browse the repository at this point in the history
  • Loading branch information
eu9ene committed Jun 13, 2024
1 parent 27c66a6 commit 7aefb5b
Showing 1 changed file with 5 additions and 0 deletions.
5 changes: 5 additions & 0 deletions opuscleaner/filters/clean_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
'ar': r'[\u0600-\u06FF]', # This is not entirely right, as it also includes farsi symbols and whatnot
'bg': r'[АаБбВвГгДддЕеЖжЗзИиЙйКкkasЛлМмНнОоПпРрСсТтУуФфХхЦцЧчШшЩщЪъЬьЮюЯя]',
'bn': r'[\u0980-\u09FF]', # bangla
'bs': r'[a-zćĆčČđĐšŠžŽЉљЊњЋћЏџа-я]',
'ca': r'[a-zÀàÈèÉéÍíÒòÓóÚúÇç]',
'cs': r'[a-zÁáČčĎďÉéěÍíŇňÓóŘřŠšŤťÚúůÝýŽž]',
'da': r'[a-zÆæØøÅå]',
Expand All @@ -22,6 +23,7 @@
'hr': r'[abcčČćĆdđĐefghijklmnoprsšŠtuvzžŽ]',
'hu': r'[a-zÁáÉéÍíÓóÖöŐőŰű]',
'hy': r'[\u0530-\u058F]',
'id': r'[a-z]',
'is': r'[abdefghijklmnoprstuvxyÁáðÐÉéÍíÓóÚúÝýÞþÆæÖö]',
'it': r'[a-zàÀèÈéÉìÌíÍîÎòÒóÓùÙúÚ]',
'ko': r'[\uac00-\ud7af]|[\u1100-\u11ff]|[\u3130-\u318f]|[\ua960-\ua97f]|[\ud7b0-\ud7ff]',
Expand All @@ -38,7 +40,10 @@
'ru': r'[а-я]',
'sk': r'[a-záäÁÄčČďĎžéÉíÍĺĹľĽňŇóÓôÔŕŔšŠťŤúÚýÝžŽ]',
'sl': r'[abcčČdđĐefghijklmnoprsšŠtuvzžŽ]',
'sr': r'[a-zčČćĆđĐšŠžŽЉљЊњЋћЏџа-я]',
'sv': r'[a-zÅåÄäÖö]',
'tr': r'[a-zçÇğĞıİöÖşŞüÜ]',
'uk': r'[А-ЩЬЮЯҐЄІЇа-щьюяґєії\'`’ʼ]',
'vi': r'[a-zàáảãạâầấẩẫậăằắẳẵặèéẻẽẹêềếểễệìíỉĩịòóỏõọôồốổỗộơờớởỡợùúủũụưừứửữựỳýỷỹỵÀÁẢÃẠÂẦẤẨẪẬĂẰẮẲẴẶÈÉẺẼẸÊỀẾỂỄỆÌÍỈĨỊÒÓỎÕỌÔỒỐỔỖỘƠỜỚỞỠỢÙÚỦŨỤƯỪỨỬỮỰỲÝỶỸỴ]',
'zh': r'[\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff]',
}

0 comments on commit 7aefb5b

Please sign in to comment.