forked from cognitivecomputations/SystemChat
-
Notifications
You must be signed in to change notification settings - Fork 0
/
utils.py
92 lines (81 loc) · 3.1 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import logging
from sentence_transformers import SentenceTransformer
import torch
import warnings
from tqdm import tqdm
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
def setup_logging():
    """Call ``logging.basicConfig`` with INFO level and a timestamped format.

    Each record is rendered as ``<time> - <level> - <message>``.
    """
    log_settings = {
        "level": logging.INFO,
        "format": "%(asctime)s - %(levelname)s - %(message)s",
    }
    logging.basicConfig(**log_settings)
def get_device():
    """Pick the torch device to run on.

    Preference order is Apple MPS, then CUDA, then CPU; CUDA is only probed
    when MPS reports itself unavailable.

    Returns:
        A ``torch.device`` for the first available backend.
    """
    if torch.backends.mps.is_available():
        backend_name = "mps"
    else:
        backend_name = "cuda" if torch.cuda.is_available() else "cpu"
    return torch.device(backend_name)
def load_model(model_name, device):
    """Instantiate a ``SentenceTransformer`` by name and move it to ``device``.

    Two specific warnings (a model-version ``UserWarning`` and the
    ``resume_download`` ``FutureWarning``) are ignored while the model is
    being constructed; the filters are scoped to this call only.

    Args:
        model_name: Model identifier handed to ``SentenceTransformer``.
        device: Target passed to the model's ``.to()``.

    Returns:
        The loaded model after ``.to(device)``.

    Raises:
        Exception: Whatever the load raised, re-raised after being logged.
    """
    # (category, message-prefix) pairs, registered in this order.
    silenced = (
        (UserWarning, "You try to use a model that was created with version"),
        (FutureWarning, "`resume_download` is deprecated and will be removed in version 1.0.0"),
    )
    try:
        with warnings.catch_warnings():
            for warning_type, pattern in silenced:
                warnings.filterwarnings("ignore", category=warning_type, message=pattern)
            return SentenceTransformer(model_name).to(device)
    except Exception as err:
        logging.error(f"Error loading model {model_name}: {err}")
        raise
def dedupe(strings, model_name="Snowflake/snowflake-arctic-embed-xs", threshold=0.97):
    """Remove exact and near-duplicate strings using sentence embeddings.

    Every string is stripped of surrounding whitespace and exact duplicates
    are dropped. The survivors are embedded with the model ``model_name`` and
    compared by cosine similarity. A string is dropped when its similarity to
    an already-kept string is at or above ``threshold``, so the first member
    of each near-duplicate group is retained (rather than the whole group
    being discarded).

    Args:
        strings: Iterable of strings to deduplicate.
        model_name: Model identifier passed to ``load_model``.
        threshold: Cosine similarity at or above which two strings are
            treated as duplicates of each other.

    Returns:
        The kept (stripped) strings in first-occurrence order. If any stage
        fails, the error is logged and an empty list is returned.
    """
    logging.info("Starting deduplication")

    # Eliminate exact duplicates
    try:
        stripped_strings = [s.strip() for s in strings]
        # dict.fromkeys preserves first-occurrence order, unlike set(), so the
        # result is deterministic from run to run.
        unique_strings = list(dict.fromkeys(stripped_strings))
        logging.info(f"Removed exact duplicates, {len(stripped_strings) - len(unique_strings)} duplicates found.")
    except Exception as e:
        logging.error(f"Error eliminating exact duplicates: {e}")
        return []

    # With fewer than two candidates there is nothing to compare, so skip
    # loading the model and encoding altogether.
    if len(unique_strings) < 2:
        logging.info("Deduplication complete")
        return unique_strings

    try:
        device = get_device()
        model = load_model(model_name, device)
    except Exception as e:
        logging.error(f"Initialization error: {e}")
        return []

    # Encode strings
    try:
        embeddings = model.encode(unique_strings, convert_to_tensor=True, device=device).cpu().numpy()
    except Exception as e:
        logging.error(f"Error encoding strings: {e}")
        return []

    # Compute cosine similarities
    try:
        similarities = cosine_similarity(embeddings)
        kept_indices = []
        for i in tqdm(range(len(unique_strings)), desc="Checking similarities"):
            # Compare only against strings already kept: the first member of a
            # near-duplicate group survives and later members are dropped.
            if not kept_indices or similarities[i, kept_indices].max() < threshold:
                kept_indices.append(i)
        unique = [unique_strings[i] for i in kept_indices]
    except Exception as e:
        logging.error(f"Error computing cosine similarities: {e}")
        return []

    logging.info("Deduplication complete")
    return unique
def read_file(file_path):
    """Read a UTF-8 text file into a list of lines.

    Trailing whitespace (including the newline) is removed from every line.
    The encoding is pinned to UTF-8 so the result does not depend on the
    platform's default locale encoding.

    Args:
        file_path: Path of the file to read.

    Returns:
        A list with one right-stripped string per line of the file.

    Raises:
        Exception: Any error from opening or decoding the file, re-raised
            after being logged.
    """
    try:
        with open(file_path, encoding="utf-8") as file:
            return [line.rstrip() for line in file]
    except Exception as e:
        logging.error(f"Error reading file {file_path}: {e}")
        raise
def write_file(file_path, lines):
    """Write strings to a UTF-8 text file, one per line.

    The file is created or truncated. The encoding is pinned to UTF-8 so
    non-ASCII text is written the same way on every platform.

    Args:
        file_path: Path of the file to write.
        lines: Iterable of strings; a newline is appended to each.

    Raises:
        Exception: Any error from opening or writing the file, re-raised
            after being logged.
    """
    try:
        with open(file_path, "w", encoding="utf-8") as file:
            # One batched call instead of a write() per line.
            file.writelines(line + "\n" for line in lines)
        logging.info(f"Output written to {file_path}")
    except Exception as e:
        logging.error(f"Error writing to file {file_path}: {e}")
        raise