-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathutils.py
65 lines (57 loc) · 2.04 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
# -*- coding: utf-8 -*-
"""
@Time : 2020/4/2 上午11:48
@FileName: utils.py
@author: 王炳宁
@contact: [email protected]
"""
import re
import numpy as np
# Translation table from full-width (DBC) code points to their half-width
# (SBC) ASCII counterparts: U+FF01..U+FF5E -> U+0021..U+007E, plus the
# ideographic space U+3000 -> the ASCII space U+0020.
_FULLWIDTH_TO_HALFWIDTH = {code: code - 0xfee0 for code in range(0xff01, 0xff5f)}
_FULLWIDTH_TO_HALFWIDTH[0x3000] = 0x0020


def DBC2SBC(ustring):
    """Convert full-width characters in ``ustring`` to half-width ASCII.

    Full-width forms U+FF01..U+FF5E are mapped to the printable ASCII range
    U+0021..U+007E, and the ideographic space U+3000 is mapped to a regular
    space. Every other character is returned unchanged.

    :param ustring: text to normalize
    :return: a new string with full-width characters replaced
    """
    # A single C-level pass; also avoids quadratic string concatenation.
    return ustring.translate(_FULLWIDTH_TO_HALFWIDTH)
def cleanhtmltag(raw_html):
    """Remove HTML-style tags from ``raw_html``.

    Every non-greedy ``<...>`` span is deleted. The pattern does not cross
    line breaks, so a tag split over several lines is left in place.

    :param raw_html: text that may contain markup tags
    :return: the text with all matched tags removed
    """
    tag_pattern = re.compile('<.*?>')
    return tag_pattern.sub('', raw_html)
def clean(txt):
    """Normalize ``txt`` for comparison or matching.

    Steps, in order: convert full-width characters with ``DBC2SBC``,
    lowercase, delete all whitespace, then strip HTML-style tags with
    ``cleanhtmltag``.

    :param txt: raw text
    :return: the normalized text
    """
    txt = DBC2SBC(txt).lower()
    # Raw string: '\s' in a plain literal is an invalid escape sequence.
    # Whitespace goes first so that tags broken across lines collapse onto
    # one line and are matched by the tag pattern afterwards.
    txt = re.sub(r'\s+', '', txt)
    return cleanhtmltag(txt)
def padding(sequence, pads=0, max_len=None, dtype='int32', return_matrix_for_size=False):
    """Pad (and truncate) a batch of variable-length rows into a 2-D array.

    ``sequence`` is always treated as a sequence of rows; each row must
    support ``len()`` and slicing.

    :param sequence: the rows to pad
    :param pads: fill value used for positions past the end of a row
    :param max_len: maximum row width; rows longer than this are truncated.
        When ``None`` or larger than the longest row, the longest row's
        length is used instead.
    :param dtype: dtype of the padded array (and of the mask, if requested)
    :param return_matrix_for_size: when true, the second return value is a
        0/1 mask with the same shape as the padded array (1 marks a real
        element, 0 marks padding) instead of the per-row lengths
    :return: ``(padded, lengths)`` where ``lengths`` is an int32 vector of
        the (truncated) row lengths, or ``(padded, mask)`` when
        ``return_matrix_for_size`` is true
    """
    lengths = [len(row) for row in sequence]
    # default=0 lets an empty batch yield empty arrays instead of raising.
    longest = max(lengths, default=0)
    if max_len is None or max_len > longest:
        max_len = longest
    lengths = [min(n, max_len) for n in lengths]

    padded = np.full((len(sequence), max_len), pads, dtype=dtype)
    for idx, row in enumerate(sequence):
        kept = row[:max_len]
        padded[idx, :len(kept)] = kept

    if return_matrix_for_size:
        # Broadcast column positions against each row's length; building the
        # mask from lazy map objects does not work on Python 3.
        row_lengths = np.asarray(lengths, dtype='int64')[:, None]
        mask = np.arange(max_len) < row_lengths
        return padded, mask.astype(dtype)
    return padded, np.asarray(lengths, dtype='int32')