Skip to content

Commit ff3fb5f

Browse files
committed
python generating dataset with sentences to be used
1 parent 3e659c9 commit ff3fb5f

File tree

4 files changed

+43
-15
lines changed

4 files changed

+43
-15
lines changed

embeddings.py

+34-13
Original file line numberDiff line numberDiff line change
@@ -2,25 +2,42 @@
22

33
from sentence_transformers import SentenceTransformer
44

5-
from numpy import ndarray
65
import numpy as np
7-
from typing import Type
6+
import csv
87

98

109
def generate_test_data(sentences: list[str], filename: str):
    """Embed *sentences* and write each one with its vector to a CSV file.

    Every output row holds the sentence text in the first column followed by
    the components of its embedding, one value per column. This is the layout
    that ``load_data_from_file`` reads back.

    Args:
        sentences: Texts to embed, in output row order.
        filename: Path of the CSV file to create or overwrite.
    """
    model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
    embeddings = model.encode(sentences, convert_to_numpy=True)

    # Pair every sentence with its own vector so the text travels with the
    # numbers instead of relying on row order across separate files.
    rows = [
        [sentence, *embedding]
        for sentence, embedding in zip(sentences, embeddings)
    ]

    # newline="" lets the csv module control line endings itself.
    with open(filename, "w", encoding="utf-8", newline="") as f:
        csv.writer(f).writerows(rows)
22+
23+
def load_data_from_file(filename: str) -> list[list]:
    """Read sentence/embedding rows from a CSV file.

    Each non-empty row is expected to hold a sentence in the first column and
    numeric values in every remaining column.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        One list per non-empty row: the sentence string followed by the
        remaining columns converted to ``float``.

    Raises:
        ValueError: If a value after the first column is not a valid float.
    """
    with open(filename, "r", encoding="utf-8", newline="") as f:
        return [
            [row[0], *map(float, row[1:])]
            for row in csv.reader(f)
            # csv.reader yields [] for blank lines; indexing row[0] on one
            # would raise IndexError, so skip them.
            if row
        ]
1633

1734

1835
def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
    """Return the cosine of the angle between vectors *a* and *b*.

    Both arguments are coerced with ``np.asarray``, so plain sequences of
    numbers are accepted as well as NumPy arrays.

    Args:
        a: First vector.
        b: Second vector, the same length as *a*.

    Returns:
        ``dot(a, b) / (|a| * |b|)``, or ``0.0`` when either vector has zero
        norm.
    """
    a = np.asarray(a, dtype=np.float64)
    b = np.asarray(b, dtype=np.float64)

    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    # A zero vector has no direction; dividing would yield nan (plus a
    # RuntimeWarning), and nan makes any later sort on the score unreliable.
    if denominator == 0.0:
        return 0.0
    return float(np.dot(a, b) / denominator)
2037

2138

2239
if __name__ == "__main__":
23-
dataset = [
40+
text_sentences = [
2441
"Chandrayaan-3 is the third Indian lunar exploration mission under the Indian Space Research Organisation's (ISRO) Chandrayaan programme.",
2542
"Chandrayaan-3 was launched on 14 July 2023.",
2643
"On 22 July 2019, ISRO launched Chandrayaan-2",
@@ -29,21 +46,25 @@ def cosine_similarity(a: np.array, b: np.array):
2946
"Confirming the existence of the project, ISRO's former chairman K. Sivan stated that the estimated cost would be around ₹615 crore (equivalent to ₹721 crore or US$90 million in 2023).",
3047
]
3148

32-
questions = ["Name of the space programme", "Number of wheels of vehicle"]
49+
text_questions = ["Name of the space programme", "Number of wheels of vehicle"]
3350

34-
# generate_test_data(dataset, "dataset")
51+
# sentences = generate_test_data(text_sentences, "sentences.csv")
52+
# questions = generate_test_data(text_questions, "questions.csv")
3553

36-
dataset = np.loadtxt("dataset.csv", dtype=np.float64, delimiter=",")
37-
questions = np.loadtxt("questions.csv", dtype=np.float64, delimiter=",")
54+
sentences = load_data_from_file("sentences.csv")
55+
questions = load_data_from_file("questions.csv")
3856

3957
rankings = []
4058
for question in questions:
4159
temp = []
42-
for sentence in dataset:
43-
temp.append(cosine_similarity(question, sentence))
60+
for sentence in sentences:
61+
temp.append([sentence[0], cosine_similarity(question[1:], sentence[1:])])
4462
rankings.append(temp)
4563

46-
print(rankings)
64+
for rank in rankings:
65+
data = sorted(rank, key=lambda x: x[1], reverse=True)
66+
print(data[0])
67+
4768

4869
""" Test Match using consine run
4970
1.

0 commit comments

Comments
 (0)