diff --git a/pyproject.toml b/pyproject.toml index 2d4ae88..55f088c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,7 @@ build-backend = "setuptools.build_meta" [project] name = "pyjedai" -version = "0.1.7" +version = "0.1.8" description = "An open-source library that builds powerful end-to-end Entity Resolution workflows." readme = "README.md" authors = [ diff --git a/src/pyjedai/_version.py b/src/pyjedai/_version.py index f1380ee..9cb17e7 100644 --- a/src/pyjedai/_version.py +++ b/src/pyjedai/_version.py @@ -1 +1 @@ -__version__ = "0.1.7" +__version__ = "0.1.8" diff --git a/src/pyjedai/datamodel.py b/src/pyjedai/datamodel.py index d7bd273..2eea542 100644 --- a/src/pyjedai/datamodel.py +++ b/src/pyjedai/datamodel.py @@ -123,12 +123,12 @@ def __init__( self.dataset_name_2 = dataset_name_2 # Fill NaN values with empty string - self.dataset_1 = self.dataset_1.astype(str) self.dataset_1.fillna("", inplace=True) + self.dataset_1 = self.dataset_1.astype(str) if not self.is_dirty_er: - self.dataset_2 = self.dataset_2.astype(str) self.dataset_2.fillna("", inplace=True) - + self.dataset_2 = self.dataset_2.astype(str) + # Attributes if attributes_1 is None: if dataset_1.columns.values.tolist(): diff --git a/src/pyjedai/matching.py b/src/pyjedai/matching.py index 9955fec..8a1c741 100644 --- a/src/pyjedai/matching.py +++ b/src/pyjedai/matching.py @@ -155,7 +155,7 @@ def get_weights_median(self) -> float: def get_weights_standard_deviation(self) -> float: return statistics.stdev([w for _, _, w in self.pairs.edges(data='weight')]) - def plot_distribution_of_all_weights(self) -> None: + def plot_distribution_of_all_weights(self, save_figure_path=None) -> None: title = "Distribution of scores with " + self.metric + " metric in graph from entity matching" plt.figure(figsize=(10, 6)) all_weights = [w for _, _, w in self.pairs.edges(data='weight')] @@ -168,9 +168,11 @@ def plot_distribution_of_all_weights(self) -> None: plt.axvline(x = self.get_weights_median(), color = 'black', label = 'Median weight') plt.axvline(x = self.get_weights_avg()+self.get_weights_standard_deviation(), color = 'green', label = 'Average + SD weight') plt.legend() + if save_figure_path: + plt.savefig(save_figure_path) plt.show() - def plot_distribution_of_all_weights_2d(self) -> None: + def plot_distribution_of_all_weights_2d(self, save_figure_path=None) -> None: title = "Distribution of scores with " + self.metric + " metric in graph from entity matching" plt.figure(figsize=(10, 6)) all_weights = [w for _, _, w in self.pairs.edges(data='weight')] @@ -182,9 +184,11 @@ def plot_distribution_of_all_weights_2d(self) -> None: plt.axvline(x = self.get_weights_median(), color = 'black', label = 'Median weight') plt.axvline(x = self.get_weights_avg()+self.get_weights_standard_deviation(), color = 'green', label = 'Average + SD weight') plt.legend() + if save_figure_path: + plt.savefig(save_figure_path) plt.show() - def plot_distribution_of_scores(self) -> None: + def plot_distribution_of_scores(self, save_figure_path=None) -> None: title = "Distribution of scores with " + self.metric + " metric in graph from entity matching" def weight_distribution(G): bins = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0] @@ -221,9 +225,11 @@ def weight_distribution(G): plt.axvline(x = self.get_weights_median()*10, color = 'black', label = 'Median weight') plt.axvline(x = self.get_weights_avg()*10+self.get_weights_standard_deviation()*10, color = 'green', label = 'Average + SD weight') plt.legend() + if save_figure_path: + plt.savefig(save_figure_path) plt.show() - def plot_gt_distribution_of_scores(self) -> None: + def plot_gt_distribution_of_scores(self, save_figure_path=None) -> None: title = "Distribution of scores with " + self.metric + " metric on ground truth pairs" def weight_distribution(): bins = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0] @@ -257,6 +263,8 @@ def weight_distribution(): ax.set_title(title) ax.set_xlabel('Similarity score range') fig.tight_layout() + if save_figure_path: + plt.savefig(save_figure_path) plt.show() def evaluate(self, @@ -319,9 +327,6 @@ def export_to_df(self, prediction: Graph) -> pd.DataFrame: Returns: pd.DataFrame: Dataframe with the predicted pairs """ - if self.data.ground_truth is None: - raise AttributeError("Can not proceed to evaluation without a ground-truth file. \ - Data object mush have initialized with the ground-truth file") pairs_df = pd.DataFrame(columns=['id1', 'id2']) for edge in prediction.edges: id1 = self.data._gt_to_ids_reversed_1[edge[0]] @@ -345,7 +350,7 @@ def __init__( tokenizer: str = 'white_space_tokenizer', vectorizer : str = None, qgram : int = 1, - similarity_threshold: float = 0.5, + similarity_threshold: float = 0.0, tokenizer_return_unique_values = False, # unique values or not, attributes: any = None, ) -> None: diff --git a/src/pyjedai/vector_based_blocking.py b/src/pyjedai/vector_based_blocking.py index 3584966..55db2bf 100644 --- a/src/pyjedai/vector_based_blocking.py +++ b/src/pyjedai/vector_based_blocking.py @@ -138,7 +138,6 @@ def build_blocks(self, if self.similarity_search != 'faiss': raise AttributeError("Only FAISS is available for now.") - print('Building blocks via Embeddings-NN Block Building [' + self.vectorizer + ', ' + self.similarity_search + ']') _start_time = time() self.blocks = dict() self.verbose = verbose @@ -226,7 +225,7 @@ def build_blocks(self, if verbose: print(f"{p2} -> Loaded Successfully") else: if verbose: print("Embeddings not found for D2. Creating new ones.") - if not self._d1_loaded or not self._d2_loaded: + if not self._d1_loaded or (not data.is_dirty_er and not self._d2_loaded): if self.vectorizer in ['word2vec', 'fasttext', 'doc2vec', 'glove']: self.vectors_1, self.vectors_2 = self._create_gensim_embeddings() elif self.vectorizer in ['bert', 'distilbert', 'roberta', 'xlnet', 'albert']: @@ -368,7 +367,6 @@ def _create_pretrained_sentence_embeddings(self): vectors_2 = [] if not self.data.is_dirty_er and not self._d2_loaded: for e2 in self._entities_d2: - # print("e2: ", e2) vector = model.encode(e2) vectors_2.append(vector) self._progress_bar.update(1) @@ -420,7 +418,7 @@ def _similarity_search_with_FAISS(self): self.blocks = dict() if self.verbose: print("Building blocks...") - print("disable", not self.verbose) + for _entity in tqdm(range(0, self.neighbors.shape[0]), desc="Building blocks", disable=not self.verbose): _entity_id = self._si.d1_retained_ids[_entity] if self.data.is_dirty_er else self._si.d2_retained_ids[_entity] diff --git a/src/pyjedai/workflow.py b/src/pyjedai/workflow.py index b2a9f3e..4a0d393 100644 --- a/src/pyjedai/workflow.py +++ b/src/pyjedai/workflow.py @@ -192,7 +192,7 @@ def export_pairs(self) -> pd.DataFrame: Returns: pd.DataFrame: pairs as a DataFrame """ - return write(self.final_pairs, self.data) + return self.final_step_method.export_to_df(self.final_pairs) def _save_step(self, results: dict, configuration: dict) -> None: self.f1.append(results['F1 %']) @@ -580,6 +580,7 @@ def run(self, if "attributes_2" in self.block_building else None, tqdm_disable=workflow_step_tqdm_disable) self.final_pairs = block_building_blocks + self.final_step_method = block_building_method if data.ground_truth is not None: res = block_building_method.evaluate(block_building_blocks, export_to_dict=True, @@ -604,6 +605,7 @@ def run(self, tqdm_disable=workflow_step_tqdm_disable) self.final_pairs = bblocks = block_cleaning_blocks + # self.final_pairs = block_cleaning_method.export_to_df(self.final_pairs) if data.ground_truth is not None: res = block_cleaning_method.evaluate(bblocks, export_to_dict=True, @@ -625,6 +627,8 @@ def run(self, else block_building_blocks, data, tqdm_disable=workflow_step_tqdm_disable) + self.final_step_method = comparison_cleaning_method + if data.ground_truth is not None: res = comparison_cleaning_method.evaluate(comparison_cleaning_blocks, export_to_dict=True, @@ -653,6 +657,8 @@ def run(self, tqdm_disable=workflow_step_tqdm_disable, **self.entity_matching["exec_params"]) + self.final_step_method = entity_matching_method + if data.ground_truth is not None: res = entity_matching_method.evaluate(em_graph, export_to_dict=True, @@ -671,7 +677,8 @@ def run(self, self.final_pairs = components = clustering_method.process(em_graph, data) else: self.final_pairs = components = clustering_method.process(em_graph, data, **self.clustering["exec_params"]) - + + self.final_step_method = clustering_method self.clusters = components if data.ground_truth is not None: res = clustering_method.evaluate(components, @@ -859,6 +866,8 @@ def run(self, **self.block_building["exec_params"]) self.final_pairs = block_building_blocks + self.final_pairs = block_building_method.export_to_df(self.final_pairs) + if data.ground_truth is not None: res = block_building_method.evaluate(block_building_blocks, export_to_dict=True, @@ -874,6 +883,8 @@ def run(self, if "params" in self.clustering \ else self.clustering['method']() self.final_pairs = components = clustering_method.process(em_graph, data) + self.final_pairs = clustering_method.export_to_df(self.final_pairs) + if data.ground_truth is not None: res = clustering_method.evaluate(components, export_to_dict=True,