-
Notifications
You must be signed in to change notification settings - Fork 0
/
utils.py
102 lines (84 loc) · 3.02 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
from sudachipy import Tokenizer, dictionary, tokenizer
from jp_news_scraper_pipeline.configure_logging import configure_logging_with_file
# Logger shared by every helper in this module. It is created at import time
# through the project's configure_logging_with_file helper, using the log file
# name 'main.log', the logger name 'main' and the level 'INFO'.
# NOTE(review): presumably this also attaches a file handler for 'main.log' --
# confirm against configure_logging_with_file.
logger = configure_logging_with_file(log_file='main.log', logger_name='main', level='INFO')
def get_tokenizer() -> Tokenizer:
    """
    Build a SudachiPy tokenizer.

    A new ``dictionary.Dictionary`` is constructed on every call and the
    result of its ``create()`` method is handed back.

    :return: The tokenizer produced by ``dictionary.Dictionary().create()``.
    """
    logger.info("Get SudachiPys's tokenizer.")
    sudachi_dictionary = dictionary.Dictionary()
    sudachi_tokenizer = sudachi_dictionary.create()
    return sudachi_tokenizer
def get_tokenizer_mode() -> Tokenizer.SplitMode:
    """
    Return the split mode to use with the SudachiPy tokenizer.

    :return: ``tokenizer.Tokenizer.SplitMode.C``.
    """
    logger.info("Get SudachiPys's tokenizer's Mode C.")
    split_modes = tokenizer.Tokenizer.SplitMode
    return split_modes.C
def get_jp_pos_dict():
    """
    Provide the mapping from Japanese part-of-speech labels to English names.

    A new dictionary is built on every call, so mutating the returned object
    does not affect later calls.

    :return: Dictionary keyed by the Japanese part-of-speech label, with the
        English name as the value.
    """
    logger.info("Get the Japanese Part of Speech dictionary.")
    # Ordered (Japanese label, English name) pairs; dict() keeps this order.
    jp_to_en_pairs = (
        ("代名詞", "Pronoun"),
        ("副詞", "Adverb"),
        ("助動詞", "Auxiliary Verb"),
        ("助詞", "Particle"),
        ("動詞", "Verb"),
        ("名詞", "Noun"),
        ("形容詞", "Adjective"),
        ("形状詞", "Adjectival Noun"),
        ("感動詞", "Interjection"),
        ("接尾辞", "Suffix"),
        ("接続詞", "Conjunction"),
        ("接頭辞", "Prefix"),
        ("空白", "Whitespace"),
        ("補助記号", "Supplementary Symbol"),
        ("連体詞", "Adnominal"),
        ("記号", "Symbol"),
    )
    return dict(jp_to_en_pairs)
def get_excluded_jp_pos():
    """
    Provide the Japanese parts of speech that are to be excluded.

    A new dictionary is built on every call.

    :return: Dictionary keyed by the excluded Japanese part-of-speech label,
        with the English name as the value.
    """
    logger.info("Get the Japanese Part of Speech that needs to be excluded as a dictionary")
    # Ordered (Japanese label, English name) pairs; dict() keeps this order.
    excluded_pairs = (
        ("空白", "Whitespace"),
        ("補助記号", "Supplementary Symbol"),
        ("連体詞", "Adnominal"),
        ("記号", "Symbol"),
    )
    return dict(excluded_pairs)
def check_if_all_list_len_is_equal(*args) -> bool:
    """
    Check if all list lengths are equal.

    Every positional argument takes part in the comparison, however many are
    passed. With zero or one argument there is nothing to disagree with, so
    the result is True.

    :param args: Target lists (any objects accepted by ``len()``). The first
        three are logged as the Kanji, Part of Speech and Translated Part of
        Speech lists; any further ones are logged by position.
    :return: True if all list lengths are equal, False otherwise.
    """
    logger.info("Check if all list lengths are equal.")
    list_len: tuple = check_list_len(*args)

    # Labels used for the first three lists in the debug output; arguments
    # beyond those are identified by their 1-based position instead.
    labels = ("Kanji", "Part of Speech", "Translated Part of Speech")
    for position, length in enumerate(list_len):
        if position < len(labels):
            label = f'{labels[position]} list'
        else:
            label = f'List #{position + 1}'
        logger.debug(f'{label} length: {length}')

    # The lengths all agree exactly when they collapse to at most one distinct
    # value. This covers any number of arguments instead of indexing [0..2],
    # which raised IndexError for fewer than three lists and ignored extras.
    if len(set(list_len)) <= 1:
        logger.info("All list lengths are equal.")
        return True

    logger.info("Not all list lengths are equal.")
    return False
def check_list_len(*args) -> tuple:
    """
    Measure the length of each target list.

    :param args: Target lists.
    :return: Tuple holding the length of every argument, in the order given.
    """
    logger.info("Checking length of target lists...")
    return tuple(map(len, args))
if __name__ == "__main__":
    # Intentionally a no-op: running this file directly performs no action
    # beyond the module-level setup above.
    pass