2
2
3
3
from sentence_transformers import SentenceTransformer
4
4
5
- from numpy import ndarray
6
5
import numpy as np
7
- from typing import Type
6
+ import csv
8
7
9
8
10
9
def generate_test_data(sentences: list[str], filename: str):
    """Embed *sentences* and write them, with their vectors, to a CSV file.

    Each output row is the sentence text followed by the components of its
    embedding, i.e. ``[sentence, v0, v1, ...]``.

    Args:
        sentences: Texts to embed.
        filename: Path of the CSV file to create; an existing file is
            overwritten.

    Returns:
        The rows that were written, one ``[sentence, *components]`` list per
        input sentence.
    """
    model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
    embeddings = model.encode(sentences, convert_to_numpy=True)
    rows = [
        [sentence, *embedding] for sentence, embedding in zip(sentences, embeddings)
    ]

    # newline="" leaves line-ending handling to the csv module.
    with open(filename, "w", encoding="utf-8", newline="") as f:
        csv.writer(f).writerows(rows)

    return rows
22
+
23
def load_data_from_file(filename: str):
    """Read a CSV of labelled vectors back into plain Python lists.

    Every row is expected to hold a text label in its first column followed
    by numeric columns.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list with one ``[label, float, float, ...]`` list per non-empty
        row, in file order.

    Raises:
        ValueError: If a column after the first cannot be parsed as a float.
    """
    with open(filename, "r", encoding="utf-8", newline="") as f:
        # csv.reader yields [] for blank lines; indexing row[0] on those
        # would raise IndexError, so they are skipped.
        return [
            [row[0], *map(float, row[1:])]
            for row in csv.reader(f)
            if row
        ]
16
33
17
34
18
35
def cosine_similarity(a: np.ndarray, b: np.ndarray):
    """Return the cosine of the angle between vectors *a* and *b*.

    Args:
        a: First vector (array or any numeric sequence NumPy accepts).
        b: Second vector, the same length as *a*.

    Returns:
        ``dot(a, b) / (norm(a) * norm(b))``, or ``0.0`` when either vector
        has zero norm, for which the cosine is undefined.
    """
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    # Guard the division: a zero-norm vector would otherwise yield nan (and a
    # RuntimeWarning), which cannot be ordered reliably when ranking scores.
    if norm_product == 0:
        return 0.0
    return np.dot(a, b) / norm_product
20
37
21
38
22
39
if __name__ == "__main__" :
23
- dataset = [
40
+ text_sentences = [
24
41
"Chandrayaan-3 is the third Indian lunar exploration mission under the Indian Space Research Organisation's (ISRO) Chandrayaan programme." ,
25
42
"Chandrayaan-3 was launched on 14 July 2023." ,
26
43
"On 22 July 2019, ISRO launched Chandrayaan-2" ,
@@ -29,21 +46,25 @@ def cosine_similarity(a: np.array, b: np.array):
29
46
"Confirming the existence of the project, ISRO's former chairman K. Sivan stated that the estimated cost would be around ₹615 crore (equivalent to ₹721 crore or US$90 million in 2023)." ,
30
47
]
31
48
32
- questions = ["Name of the space programme" , "Number of wheels of vehicle" ]
49
+ text_questions = ["Name of the space programme" , "Number of wheels of vehicle" ]
33
50
34
- # generate_test_data(dataset, "dataset")
51
+ # sentences = generate_test_data(text_sentences, "sentences.csv")
52
+ # questions = generate_test_data(text_questions, "questions.csv")
35
53
36
- dataset = np . loadtxt ( "dataset .csv" , dtype = np . float64 , delimiter = ", " )
37
- questions = np . loadtxt ("questions.csv" , dtype = np . float64 , delimiter = ", " )
54
+ sentences = load_data_from_file ( "sentences .csv" )
55
+ questions = load_data_from_file ("questions.csv" )
38
56
39
57
rankings = []
40
58
for question in questions :
41
59
temp = []
42
- for sentence in dataset :
43
- temp .append (cosine_similarity (question , sentence ) )
60
+ for sentence in sentences :
61
+ temp .append ([ sentence [ 0 ], cosine_similarity (question [ 1 :] , sentence [ 1 :])] )
44
62
rankings .append (temp )
45
63
46
- print (rankings )
64
+ for rank in rankings :
65
+ data = sorted (rank , key = lambda x : x [1 ], reverse = True )
66
+ print (data [0 ])
67
+
47
68
48
69
""" Test Match using consine run
49
70
1.
0 commit comments