-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathclean_text.py
79 lines (62 loc) · 2.12 KB
/
clean_text.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
"""
Modified version of
https://github.com/getalp/Flaubert/blob/master/tools/clean_text.py
# Original copyright is appended below.
# Copyright 2019 Hang Le
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
#
"""
import sys
import unicodedata
import six
import string, re
import argparse
import unicodedata
import time
def normalize_unicode(text):
    """
    Return `text` in Unicode Normalization Form C (NFC).

    NFC composes base characters with their combining marks where a
    precomposed code point exists, so visually identical strings end up
    with the same underlying code points.
    """
    return unicodedata.normalize("NFC", text)
def convert_to_unicode(text):
    """
    Converts `text` to Unicode (if it's not already), assuming UTF-8 input.

    Args:
        text: a `bytes` or `str` object.

    Returns:
        `text` itself when it is already a `str`; otherwise the bytes
        decoded as UTF-8. Bytes that are not valid UTF-8 are silently
        dropped (the decode uses errors="ignore").

    Raises:
        TypeError: if `text` is neither `bytes` nor `str`.
    """
    # Same contract as six.ensure_text (https://github.com/benjaminp/six),
    # written against the builtin types: on Python 3 six.binary_type is
    # `bytes` and six.text_type is `str`, so no third-party shim is needed.
    if isinstance(text, bytes):
        return text.decode("utf-8", "ignore")
    if isinstance(text, str):
        return text
    raise TypeError("not expecting type '%s'" % type(text))
def rm_spaces(text):
    """
    Tidy the spacing of `text` around words and punctuation.

    Three substitutions are applied in order:
      1. every run of two or more space characters becomes a single space;
      2. one whitespace character standing before `?`, `.`, `!` or `"` is
         removed when that mark is itself followed by whitespace or by the
         end of the string (the space after the mark is kept);
      3. a space is inserted between one of `. ! ; : , ·` and a directly
         following character from the uppercase Greek character class.
    """
    # 1. squeeze runs of plain spaces (tabs and newlines are not touched here)
    squeezed = re.sub(r'( ){2,}', r' ', text)
    # 2. drop the whitespace character in front of closing punctuation
    tightened = re.sub(r'\s([?.!"](?:\s|$))', r'\1', squeezed)
    # 3. zero-width match: add the missing space after punctuation
    return re.sub(r'(?<=[.!;:,·])(?=[Α-ΩΊΪΪΌΆΈΎΫΫ́ΉΏ])', ' ', tightened)
def cleaner(text, rm_new_lines=False, do_lower=False):
    """
    Clean up an input text

    Args:
        text: raw input; handed to `convert_to_unicode` first.
        rm_new_lines: when true, newline and tab characters are deleted in
            addition to the characters that are always handled.
        do_lower: when true, the cleaned text is lowercased before it is
            returned.

    Returns:
        The cleaned text.
    """
    # Convert and normalize the unicode underlying representation
    text = normalize_unicode(convert_to_unicode(text))

    # Normalize whitespace characters and remove carriage return:
    # a form feed becomes a space, a carriage return is always deleted.
    remap = {ord('\f'): ' ', ord('\r'): ''}
    if rm_new_lines:
        # Newlines and tabs are deleted outright (not replaced by a space),
        # so the text on either side of them is joined together.
        remap.update({ord('\n'): '', ord('\t'): ''})
    text = text.translate(remap)

    # remove multiple spaces in text
    text = rm_spaces(text)

    # Lowercase last: rm_spaces keys one of its rules on uppercase letters
    # following punctuation, so it has to see the original casing.
    if do_lower:
        text = text.lower()
    return text