From b360040767d4b81f3c2bfff37313506d577c28ec Mon Sep 17 00:00:00 2001
From: Konstantinos Nikoletos <nikoletos.konstantinos99@gmail.com>
Date: Fri, 19 Jul 2024 13:12:52 +0300
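Subject: [PATCH] Fixed issues #22 and #23

* Bump the package version from 0.1.7 to 0.1.8.
* datamodel: fill NaN values before casting the datasets to str, so that
  missing values become empty strings instead of the literal "nan".
* matching: add an optional save_figure_path argument to the plotting
  methods, drop the ground-truth requirement from export_to_df, and change
  the default similarity_threshold from 0.5 to 0.0.
* vector_based_blocking: remove leftover debug prints and only require
  embeddings for the second dataset when the task is not dirty ER.
* workflow: keep track of the final step's method and export the final
  pairs through its export_to_df.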

---
 pyproject.toml                       |  2 +-
 src/pyjedai/_version.py              |  2 +-
 src/pyjedai/datamodel.py             |  6 +++---
 src/pyjedai/matching.py              | 21 +++++++++++++--------
 src/pyjedai/vector_based_blocking.py |  6 ++----
 src/pyjedai/workflow.py              | 15 +++++++++++++--
 6 files changed, 33 insertions(+), 19 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 2d4ae88..55f088c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -6,7 +6,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "pyjedai"
-version = "0.1.7"
+version = "0.1.8"
 description = "An open-source library that builds powerful end-to-end Entity Resolution workflows."
 readme = "README.md"
 authors = [
diff --git a/src/pyjedai/_version.py b/src/pyjedai/_version.py
index f1380ee..9cb17e7 100644
--- a/src/pyjedai/_version.py
+++ b/src/pyjedai/_version.py
@@ -1 +1 @@
-__version__ = "0.1.7"
+__version__ = "0.1.8"
diff --git a/src/pyjedai/datamodel.py b/src/pyjedai/datamodel.py
index d7bd273..2eea542 100644
--- a/src/pyjedai/datamodel.py
+++ b/src/pyjedai/datamodel.py
@@ -123,12 +123,12 @@ def __init__(
         self.dataset_name_2 = dataset_name_2
         
         # Fill NaN values with empty string
-        self.dataset_1 = self.dataset_1.astype(str)
         self.dataset_1.fillna("", inplace=True)
+        self.dataset_1 = self.dataset_1.astype(str)
         if not self.is_dirty_er:
-            self.dataset_2 = self.dataset_2.astype(str)
             self.dataset_2.fillna("", inplace=True)
-
+            self.dataset_2 = self.dataset_2.astype(str)
+            
         # Attributes
         if attributes_1 is None:
             if dataset_1.columns.values.tolist():
diff --git a/src/pyjedai/matching.py b/src/pyjedai/matching.py
index 9955fec..8a1c741 100644
--- a/src/pyjedai/matching.py
+++ b/src/pyjedai/matching.py
@@ -155,7 +155,7 @@ def get_weights_median(self) -> float:
     def get_weights_standard_deviation(self) -> float:
         return statistics.stdev([w for _, _, w in self.pairs.edges(data='weight')])
     
-    def plot_distribution_of_all_weights(self) -> None:
+    def plot_distribution_of_all_weights(self, save_figure_path=None) -> None:
         title = "Distribution of scores with " + self.metric + " metric in graph from entity matching"
         plt.figure(figsize=(10, 6))
         all_weights = [w for _, _, w in self.pairs.edges(data='weight')]
@@ -168,9 +168,11 @@ def plot_distribution_of_all_weights(self) -> None:
         plt.axvline(x = self.get_weights_median(), color = 'black', label = 'Median weight')
         plt.axvline(x = self.get_weights_avg()+self.get_weights_standard_deviation(), color = 'green', label = 'Average + SD weight')
         plt.legend()
+        if save_figure_path:
+            plt.savefig(save_figure_path)
         plt.show()
 
-    def plot_distribution_of_all_weights_2d(self) -> None:
+    def plot_distribution_of_all_weights_2d(self, save_figure_path=None) -> None:
         title = "Distribution of scores with " + self.metric + " metric in graph from entity matching"
         plt.figure(figsize=(10, 6))
         all_weights = [w for _, _, w in self.pairs.edges(data='weight')]
@@ -182,9 +184,11 @@ def plot_distribution_of_all_weights_2d(self) -> None:
         plt.axvline(x = self.get_weights_median(), color = 'black', label = 'Median weight')
         plt.axvline(x = self.get_weights_avg()+self.get_weights_standard_deviation(), color = 'green', label = 'Average + SD weight')
         plt.legend()
+        if save_figure_path:
+            plt.savefig(save_figure_path)
         plt.show()
 
-    def plot_distribution_of_scores(self) -> None:
+    def plot_distribution_of_scores(self, save_figure_path=None) -> None:
         title = "Distribution of scores with " + self.metric + " metric in graph from entity matching"
         def weight_distribution(G):
             bins = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
@@ -221,9 +225,11 @@ def weight_distribution(G):
         plt.axvline(x = self.get_weights_median()*10, color = 'black', label = 'Median weight')
         plt.axvline(x = self.get_weights_avg()*10+self.get_weights_standard_deviation()*10, color = 'green', label = 'Average + SD weight')
         plt.legend()
+        if save_figure_path:
+            plt.savefig(save_figure_path)
         plt.show()
 
-    def plot_gt_distribution_of_scores(self) -> None:
+    def plot_gt_distribution_of_scores(self, save_figure_path=None) -> None:
         title = "Distribution of scores with " + self.metric + " metric on ground truth pairs"
         def weight_distribution():
             bins = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
@@ -257,6 +263,8 @@ def weight_distribution():
         ax.set_title(title)
         ax.set_xlabel('Similarity score range')
         fig.tight_layout()
+        if save_figure_path:
+            plt.savefig(save_figure_path)
         plt.show()
 
     def evaluate(self,
@@ -319,9 +327,6 @@ def export_to_df(self, prediction: Graph) -> pd.DataFrame:
         Returns:
             pd.DataFrame: Dataframe with the predicted pairs
         """
-        if self.data.ground_truth is None:
-            raise AttributeError("Can not proceed to evaluation without a ground-truth file. \
-                Data object mush have initialized with the ground-truth file")
         pairs_df = pd.DataFrame(columns=['id1', 'id2'])
         for edge in prediction.edges:
             id1 = self.data._gt_to_ids_reversed_1[edge[0]]
@@ -345,7 +350,7 @@ def __init__(
             tokenizer: str = 'white_space_tokenizer',
             vectorizer : str = None,
             qgram : int = 1,
-            similarity_threshold: float = 0.5,
+            similarity_threshold: float = 0.0,
             tokenizer_return_unique_values = False, # unique values or not,
             attributes: any = None,
         ) -> None:
diff --git a/src/pyjedai/vector_based_blocking.py b/src/pyjedai/vector_based_blocking.py
index 3584966..55db2bf 100644
--- a/src/pyjedai/vector_based_blocking.py
+++ b/src/pyjedai/vector_based_blocking.py
@@ -138,7 +138,6 @@ def build_blocks(self,
         if self.similarity_search != 'faiss':
             raise AttributeError("Only FAISS is available for now.")
         
-        print('Building blocks via Embeddings-NN Block Building [' + self.vectorizer + ', ' + self.similarity_search + ']')
         _start_time = time()
         self.blocks = dict()
         self.verbose = verbose
@@ -226,7 +225,7 @@ def build_blocks(self,
                     if verbose: print(f"{p2} -> Loaded Successfully")
                 else:
                     if verbose: print("Embeddings not found for D2. Creating new ones.")
-        if not self._d1_loaded or not self._d2_loaded:
+        if not self._d1_loaded or (not data.is_dirty_er and not self._d2_loaded):
             if self.vectorizer in ['word2vec', 'fasttext', 'doc2vec', 'glove']:
                 self.vectors_1, self.vectors_2 = self._create_gensim_embeddings()
             elif self.vectorizer in ['bert', 'distilbert', 'roberta', 'xlnet', 'albert']:
@@ -368,7 +367,6 @@ def _create_pretrained_sentence_embeddings(self):
         vectors_2 = []
         if not self.data.is_dirty_er and not self._d2_loaded:            
             for e2 in self._entities_d2:
-                # print("e2: ", e2)
                 vector = model.encode(e2)
                 vectors_2.append(vector)
                 self._progress_bar.update(1)
@@ -420,7 +418,7 @@ def _similarity_search_with_FAISS(self):
         self.blocks = dict()
         if self.verbose:
             print("Building blocks...")
-        print("disable", not self.verbose)
+
         for _entity in tqdm(range(0, self.neighbors.shape[0]), desc="Building blocks", disable=not self.verbose):
             
             _entity_id = self._si.d1_retained_ids[_entity] if self.data.is_dirty_er else self._si.d2_retained_ids[_entity]
diff --git a/src/pyjedai/workflow.py b/src/pyjedai/workflow.py
index b2a9f3e..4a0d393 100644
--- a/src/pyjedai/workflow.py
+++ b/src/pyjedai/workflow.py
@@ -192,7 +192,7 @@ def export_pairs(self) -> pd.DataFrame:
         Returns:
             pd.DataFrame: pairs as a DataFrame
         """
-        return write(self.final_pairs, self.data)
+        return self.final_step_method.export_to_df(self.final_pairs)
 
     def _save_step(self, results: dict, configuration: dict) -> None:
         self.f1.append(results['F1 %'])
@@ -580,6 +580,7 @@ def run(self,
                                                                 if "attributes_2" in self.block_building else None,
                                                 tqdm_disable=workflow_step_tqdm_disable)
         self.final_pairs = block_building_blocks
+        self.final_step_method = block_building_method
         if data.ground_truth is not None:
             res = block_building_method.evaluate(block_building_blocks,
                                                 export_to_dict=True,
@@ -604,6 +605,7 @@ def run(self,
                                                                       tqdm_disable=workflow_step_tqdm_disable)
                 
                 self.final_pairs = bblocks = block_cleaning_blocks
+                # self.final_pairs = block_cleaning_method.export_to_df(self.final_pairs)
                 if data.ground_truth is not None:
                     res = block_cleaning_method.evaluate(bblocks,
                                                         export_to_dict=True,
@@ -625,6 +627,8 @@ def run(self,
                                                     else block_building_blocks,
                                                 data,
                                                 tqdm_disable=workflow_step_tqdm_disable)
+            self.final_step_method = comparison_cleaning_method
+
             if data.ground_truth is not None:
                 res = comparison_cleaning_method.evaluate(comparison_cleaning_blocks,
                                                         export_to_dict=True,
@@ -653,6 +657,8 @@ def run(self,
                 tqdm_disable=workflow_step_tqdm_disable,
                 **self.entity_matching["exec_params"])
 
+        self.final_step_method = entity_matching_method
+
         if data.ground_truth is not None:
             res = entity_matching_method.evaluate(em_graph,
                                                     export_to_dict=True,
@@ -671,7 +677,8 @@ def run(self,
                 self.final_pairs = components = clustering_method.process(em_graph, data)
             else:
                 self.final_pairs = components = clustering_method.process(em_graph, data, **self.clustering["exec_params"])
-            
+
+            self.final_step_method = clustering_method
             self.clusters = components
             if data.ground_truth is not None:
                 res = clustering_method.evaluate(components,
@@ -859,6 +866,8 @@ def run(self,
                                                 **self.block_building["exec_params"])                
 
         self.final_pairs = block_building_blocks
+        self.final_pairs = block_building_method.export_to_df(self.final_pairs)
+
         if data.ground_truth is not None:
             res = block_building_method.evaluate(block_building_blocks,
                                                 export_to_dict=True,
@@ -874,6 +883,8 @@ def run(self,
                                             if "params" in self.clustering \
                                             else self.clustering['method']()
             self.final_pairs = components = clustering_method.process(em_graph, data)
+            self.final_pairs = clustering_method.export_to_df(self.final_pairs)
+
             if data.ground_truth is not None:
                 res = clustering_method.evaluate(components,
                                                 export_to_dict=True,