# model.py
import numpy as np
from numba import jit
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
import requests
from bs4 import BeautifulSoup
import scipy.spatial as sp
# The USE class below uses TF1-style APIs (tf.placeholder, hub.Module),
# hence the compat.v1 import.
import tensorflow.compat.v1 as tf
import tensorflow_hub as hub
from utils import Utils as Ut  # local helper module shipped with this repo

# Tokens that mark ad/boilerplate text; currently unused in this module.
ad_bag = ['https']

@jit(nopython=True)
def cosine_similarity_numba(u: np.ndarray, v: np.ndarray):
    """Cosine similarity of two 1-D vectors, JIT-compiled with numba."""
    assert u.shape[0] == v.shape[0]
    uv = 0.0
    uu = 0.0
    vv = 0.0
    for i in range(u.shape[0]):
        uv += u[i] * v[i]
        uu += u[i] * u[i]
        vv += v[i] * v[i]
    cos_theta = 1.0
    if uu != 0 and vv != 0:
        cos_theta = uv / np.sqrt(uu * vv)
    return cos_theta
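
# A minimal usage sketch (illustrative values, not part of the original file):
#   a = np.array([1.0, 2.0, 3.0])
#   b = np.array([3.0, 2.0, 1.0])
#   cosine_similarity_numba(a, b)  # -> ~0.714; the first call also pays
#                                  #    numba's one-time JIT compilation cost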

class USE:
    """Wrapper around a Universal Sentence Encoder TF-Hub module (TF1 API)."""

    def __init__(self):
        def embed_useT(module):
            # Build the graph once and keep a session open; the returned
            # closure embeds a list of strings in a single run() call.
            with tf.Graph().as_default():
                sentences = tf.placeholder(tf.string)
                embed = hub.Module(module)
                embeddings = embed(sentences)
                session = tf.train.MonitoredSession()
            return lambda x: session.run(embeddings, {sentences: x})
        self.model = embed_useT("./universal-sentence-encoder-lite_2")

    def one_to_one(self, s1, s2):
        """Cosine similarity between the embeddings of two strings."""
        embs = self.model([s1, s2])
        return cosine_similarity_numba(embs[0], embs[1])

    def cluster(self, content_list):
        """Split texts into main web content vs. noise via 2-means clustering."""
        corpus_embeddings = self.model(content_list)
        # k-means over the embeddings with two clusters
        clustering_model = KMeans(n_clusters=2)
        clustering_model.fit(corpus_embeddings)
        cluster_assignment = clustering_model.labels_
        cluster_0 = ''
        cluster_1 = ''
        for indx, text in enumerate(content_list):
            if cluster_assignment[indx] == 0:
                cluster_0 += text
            if cluster_assignment[indx] == 1:
                cluster_1 += text
        # Heuristic: the cluster with more text is taken to be the content
        if len(cluster_1) > len(cluster_0):
            web_content = cluster_1
            noise = cluster_0
        else:
            web_content = cluster_0
            noise = cluster_1
        return web_content, noise
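
# Usage sketch for USE, assuming the TF-Hub module saved at
# ./universal-sentence-encoder-lite_2 accepts raw strings as the wrapper
# expects (the lite variant normally needs SentencePiece preprocessing, so
# this path may require the full USE module instead):
#   use = USE()
#   use.one_to_one("a cat sat", "a dog ran")        # scalar similarity
#   content, noise = use.cluster(paragraph_list)    # needs >= 2 texts for k=2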

class SBERT:
    """Wrapper around a Sentence-BERT model from sentence-transformers."""

    def __init__(self):
        self.model = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')

    def one_to_one(self, s1, s2):
        """Cosine similarity between the embeddings of two strings."""
        embs = self.model.encode([s1, s2])
        return cosine_similarity_numba(embs[0], embs[1])

    def cluster(self, content_list):
        """Split texts into main web content vs. noise via 2-means clustering."""
        corpus_embeddings = self.model.encode(content_list)
        clustering_model = KMeans(n_clusters=2)
        clustering_model.fit(corpus_embeddings)
        cluster_assignment = clustering_model.labels_
        cluster_0 = ''
        cluster_1 = ''
        for indx, text in enumerate(content_list):
            if cluster_assignment[indx] == 0:
                cluster_0 += text
            if cluster_assignment[indx] == 1:
                cluster_1 += text
        # Heuristic: the larger cluster wins; a cluster with many 'css'
        # mentions is treated as scraping noise regardless of size.
        if len(cluster_1) > len(cluster_0) or cluster_0.count('css') > 3:
            web_content = cluster_1
            noise = cluster_0
        else:
            web_content = cluster_0
            noise = cluster_1
        return web_content, noise

    def sentence_movers_distance(self, text_1, text_2):
        """Document similarity from the strongest sentence-pair matches."""
        sentence_list_1 = Ut.split_into_sentences(text_1)
        sentence_list_2 = Ut.split_into_sentences(text_2)
        matrix1 = self.model.encode(sentence_list_1)
        matrix2 = self.model.encode(sentence_list_2)
        # Pairwise cosine similarity between every sentence of each text
        score = 1 - sp.distance.cdist(matrix1, matrix2, 'cosine')
        # Average only the top 15% of pairwise similarities, so strongly
        # matching sentence pairs dominate the document-level score
        top_count = int(np.shape(score)[0] * np.shape(score)[1] * 0.15)
        high_scores = score[Ut.largest_indices(score, top_count)]
        return np.average(high_scores)
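
# Usage sketch for SBERT (model weights download on first instantiation):
#   sbert = SBERT()
#   sbert.one_to_one("a cat sat", "a feline sat")   # close to 1.0
#   content, noise = sbert.cluster(paragraph_list)  # needs >= 2 texts for k=2
#   sbert.sentence_movers_distance(doc_a, doc_b)    # top-15% pairwise average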

class Scrap:
    @staticmethod
    def get_web_content(link):
        """Fetch a page and return its title plus all <p> texts as a list."""
        # verify=False skips TLS certificate checks; convenient for scraping,
        # but insecure for anything sensitive.
        article = requests.get(link, verify=False)
        soup = BeautifulSoup(article.content, 'html5lib')
        content_list = []
        try:
            titleTag = soup.html.head.title.text  # page title, if present
            content_list.append(titleTag)
        except AttributeError:
            pass
        for p in soup.find_all('p'):
            content_list.append(p.text)
        return content_list
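

if __name__ == "__main__":
    # End-to-end sketch, not part of the original module: scrape two pages,
    # strip noise with SBERT clustering, then compare the surviving content.
    # The URLs are placeholders; network access and model downloads required.
    url_a = "https://example.com/article-a"
    url_b = "https://example.com/article-b"
    paragraphs_a = Scrap.get_web_content(url_a)
    paragraphs_b = Scrap.get_web_content(url_b)
    sbert = SBERT()
    content_a, _ = sbert.cluster(paragraphs_a)
    content_b, _ = sbert.cluster(paragraphs_b)
    print("similarity:", sbert.sentence_movers_distance(content_a, content_b))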