-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtext_utils.py
executable file
·90 lines (70 loc) · 2.61 KB
/
text_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
from __future__ import print_function
from __future__ import unicode_literals
import codecs
import re
def is_emoji(cp):
    '''
    Tell whether a unicode codepoint lies in one of the emoji blocks
    recognised by this module.

    Only the four ranges listed below are checked; emoji living in other
    Unicode blocks are not recognised.

    :param cp: unicode codepoint as an integer (a full codepoint, not a
        UTF-16 surrogate half)
    :return: True if the codepoint is inside one of the ranges, else False
    '''
    emoji_ranges = (
        (0x1F300, 0x1F5FF),  # Miscellaneous Symbols and Pictographs
        # Emoticons. The upper bound used to be written 0x1F640F, which is
        # larger than the highest valid codepoint (0x10FFFF) and therefore
        # matched everything from 0x1F600 upwards.
        (0x1F600, 0x1F64F),
        (0x2600, 0x26FF),    # Miscellaneous Symbols
        (0x2700, 0x27BF),    # Dingbats
    )
    # Always hand back a real boolean instead of falling through to None.
    return any(low <= cp <= high for low, high in emoji_ranges)
def get_uniord(char):
    '''
    Return the codepoint number for a character that may be given either as
    a single unit or as a two-unit UTF-16 surrogate pair.

    :param char: a one-element string, or a two-element string holding a
        high surrogate followed by a low surrogate
    :return: ``ord(char)`` when the input is not two units long, otherwise
        the codepoint obtained by combining the two units as a surrogate pair
    '''
    if len(char) == 2:
        lead, trail = char
        # Each half carries 10 bits of the offset above the BMP.
        offset = ((ord(lead) - 0xD800) << 10) + (ord(trail) - 0xDC00)
        return 0x10000 + offset
    return ord(char)
def to_char(ch0, ch1):
    '''
    Combine the two halves of a surrogate pair into one unicode codepoint.

    :param ch0: first (high) surrogate character
    :param ch1: second (low) surrogate character
    :return: the combined codepoint as an integer
    '''
    high_bits = (ord(ch0) - 0xD800) * 0x400
    low_bits = ord(ch1) - 0xDC00
    return 0x10000 + high_bits + low_bits
def get_surrogates(ascii_chrs):
    r'''
    Collect the astral-plane characters present in a piece of text.

    Two spellings of the input are supported:

    * text that still contains ASCII-escaped surrogate pairs, as found in
      raw twitter JSON.  Ex: the twelve characters ``\ud83d\ude31`` stand
      for FACE SCREAMING IN FEAR (U+1F631);
    * text that already holds the decoded characters themselves.

    The second form is only looked for when no escaped pair is found.

    :param ascii_chrs: text with ASCII-escaped surrogate pairs, or an already
        decoded unicode string
    :return: a list with one string per match, in order of appearance (empty
        when nothing is found).  Entries coming from escaped pairs are the
        decoded escapes; the others are the characters taken from the text.
    '''
    # An escaped pair is a high surrogate (D800-DBFF) directly followed by a
    # low surrogate (DC00-DFFF), each spelled as a lowercase "\u" plus four
    # hex digits.  Matching hex digits only (and not an uppercase "\U")
    # guarantees that the decode below cannot fail on malformed input.
    escaped_pairs = re.findall(
        r'\\u[dD][89abAB][0-9a-fA-F]{2}\\u[dD][c-fC-F][0-9a-fA-F]{2}',
        ascii_chrs)
    if escaped_pairs:
        return [codecs.decode(pair, 'unicode_escape') for pair in escaped_pairs]

    # No escaped pairs in the text: escape it ourselves, which yields bytes
    # where astral characters show up as "\U" plus eight hex digits, match
    # against those bytes and decode every hit back to text.
    escaped_text = ascii_chrs.encode('unicode_escape')
    astral = re.findall(br'\\U000[0-9a-f]{5}', escaped_text)
    return [hit.decode('unicode_escape') for hit in astral]
def get_all_emojis(ascii_chrs):
    '''
    List the emojis found in a text.

    Every candidate returned by get_surrogates is converted to its codepoint
    with get_uniord and kept only when is_emoji accepts that codepoint.

    :param ascii_chrs: the text to scan, passed unchanged to get_surrogates
    :return: a list with the accepted candidates, in the order they were found
    '''
    emojis = []
    for candidate in get_surrogates(ascii_chrs):
        codepoint = get_uniord(candidate)
        if is_emoji(codepoint):
            emojis.append(candidate)
    return emojis
def sanitize(w):
    '''
    Trim every leading and trailing non-alphanumeric character from a word.

    Characters in the middle of the word are left untouched.

    :param w: the word to clean
    :return: the word without its non-alphanumeric prefix and suffix (empty
        when the word has no alphanumeric character at all)
    '''
    start, stop = 0, len(w)
    # Move past the punctuation at the front
    while start < stop and not w[start].isalnum():
        start += 1
    # Move back over the punctuation at the end
    while stop > start and not w[stop - 1].isalnum():
        stop -= 1
    return w[start:stop]