
Commit b360040
Fixed issues #22 and #23
Nikoletos-K committed Jul 19, 2024
1 parent a668c98 commit b360040
Showing 6 changed files with 33 additions and 19 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -6,7 +6,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "pyjedai"
-version = "0.1.7"
+version = "0.1.8"
 description = "An open-source library that builds powerful end-to-end Entity Resolution workflows."
 readme = "README.md"
 authors = [
2 changes: 1 addition & 1 deletion src/pyjedai/_version.py
@@ -1 +1 @@
-__version__ = "0.1.7"
+__version__ = "0.1.8"
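
The version bump is mirrored in both packaging locations, so these fixes ship as pyjedai 0.1.8. A quick sanity check after upgrading (the module path is taken directly from src/pyjedai/_version.py above):

# Verify the installed build picks up this release:
from pyjedai._version import __version__

assert __version__ == "0.1.8"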
6 changes: 3 additions & 3 deletions src/pyjedai/datamodel.py
@@ -123,12 +123,12 @@ def __init__(
         self.dataset_name_2 = dataset_name_2
 
         # Fill NaN values with empty string
-        self.dataset_1 = self.dataset_1.astype(str)
         self.dataset_1.fillna("", inplace=True)
+        self.dataset_1 = self.dataset_1.astype(str)
         if not self.is_dirty_er:
-            self.dataset_2 = self.dataset_2.astype(str)
             self.dataset_2.fillna("", inplace=True)
+            self.dataset_2 = self.dataset_2.astype(str)
 
         # Attributes
         if attributes_1 is None:
             if dataset_1.columns.values.tolist():
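
The reordering above is the substance of this fix: pandas' astype(str) converts NaN into the literal string "nan", so filling afterwards was a no-op. A minimal, self-contained sketch of the difference (plain pandas; the DataFrame is illustrative):

import numpy as np
import pandas as pd

df = pd.DataFrame({"name": ["alice", np.nan]})

# Old order: casting first turns NaN into the string "nan",
# so the subsequent fillna("") finds nothing to replace.
broken = df.astype(str)
broken.fillna("", inplace=True)
print(broken["name"].tolist())  # ['alice', 'nan']

# New order: fill missing values first, then cast to string.
fixed = df.copy()
fixed.fillna("", inplace=True)
fixed = fixed.astype(str)
print(fixed["name"].tolist())   # ['alice', '']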
21 changes: 13 additions & 8 deletions src/pyjedai/matching.py
@@ -155,7 +155,7 @@ def get_weights_median(self) -> float:
     def get_weights_standard_deviation(self) -> float:
         return statistics.stdev([w for _, _, w in self.pairs.edges(data='weight')])
 
-    def plot_distribution_of_all_weights(self) -> None:
+    def plot_distribution_of_all_weights(self, save_figure_path=None) -> None:
         title = "Distribution of scores with " + self.metric + " metric in graph from entity matching"
         plt.figure(figsize=(10, 6))
         all_weights = [w for _, _, w in self.pairs.edges(data='weight')]
@@ -168,9 +168,11 @@ def plot_distribution_of_all_weights(self) -> None:
         plt.axvline(x = self.get_weights_median(), color = 'black', label = 'Median weight')
         plt.axvline(x = self.get_weights_avg()+self.get_weights_standard_deviation(), color = 'green', label = 'Average + SD weight')
         plt.legend()
+        if save_figure_path:
+            plt.savefig(save_figure_path)
         plt.show()
 
-    def plot_distribution_of_all_weights_2d(self) -> None:
+    def plot_distribution_of_all_weights_2d(self, save_figure_path=None) -> None:
         title = "Distribution of scores with " + self.metric + " metric in graph from entity matching"
         plt.figure(figsize=(10, 6))
         all_weights = [w for _, _, w in self.pairs.edges(data='weight')]
@@ -182,9 +184,11 @@ def plot_distribution_of_all_weights_2d(self) -> None:
         plt.axvline(x = self.get_weights_median(), color = 'black', label = 'Median weight')
         plt.axvline(x = self.get_weights_avg()+self.get_weights_standard_deviation(), color = 'green', label = 'Average + SD weight')
         plt.legend()
+        if save_figure_path:
+            plt.savefig(save_figure_path)
         plt.show()
 
-    def plot_distribution_of_scores(self) -> None:
+    def plot_distribution_of_scores(self, save_figure_path=None) -> None:
         title = "Distribution of scores with " + self.metric + " metric in graph from entity matching"
         def weight_distribution(G):
             bins = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
@@ -221,9 +225,11 @@ def weight_distribution(G):
         plt.axvline(x = self.get_weights_median()*10, color = 'black', label = 'Median weight')
         plt.axvline(x = self.get_weights_avg()*10+self.get_weights_standard_deviation()*10, color = 'green', label = 'Average + SD weight')
         plt.legend()
+        if save_figure_path:
+            plt.savefig(save_figure_path)
         plt.show()
 
-    def plot_gt_distribution_of_scores(self) -> None:
+    def plot_gt_distribution_of_scores(self, save_figure_path=None) -> None:
         title = "Distribution of scores with " + self.metric + " metric on ground truth pairs"
         def weight_distribution():
             bins = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
@@ -257,6 +263,8 @@ def weight_distribution():
         ax.set_title(title)
         ax.set_xlabel('Similarity score range')
         fig.tight_layout()
+        if save_figure_path:
+            plt.savefig(save_figure_path)
         plt.show()
 
     def evaluate(self,
@@ -319,9 +327,6 @@ def export_to_df(self, prediction: Graph) -> pd.DataFrame:
         Returns:
             pd.DataFrame: Dataframe with the predicted pairs
         """
-        if self.data.ground_truth is None:
-            raise AttributeError("Can not proceed to evaluation without a ground-truth file. \
-                Data object mush have initialized with the ground-truth file")
         pairs_df = pd.DataFrame(columns=['id1', 'id2'])
         for edge in prediction.edges:
             id1 = self.data._gt_to_ids_reversed_1[edge[0]]
@@ -345,7 +350,7 @@ def __init__(
         tokenizer: str = 'white_space_tokenizer',
         vectorizer : str = None,
         qgram : int = 1,
-        similarity_threshold: float = 0.5,
+        similarity_threshold: float = 0.0,
         tokenizer_return_unique_values = False, # unique values or not,
         attributes: any = None,
     ) -> None:
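
Two behavioral notes on this file: the four plotting helpers gain an optional save_figure_path, and the matcher's similarity_threshold now defaults to 0.0, so no scored pair is discarded unless the caller asks for it; export_to_df also no longer demands a ground-truth file, since exporting predictions only needs the id mappings. A hedged usage sketch (EntityMatching and predict follow pyjedai's documented API; blocks and data stand in for outputs of earlier workflow steps):

from pyjedai.matching import EntityMatching

em = EntityMatching(
    metric='cosine',
    tokenizer='white_space_tokenizer',
    similarity_threshold=0.0,  # new default: keep every scored pair in the graph
)
# blocks and data are assumed to come from earlier blocking/cleaning steps.
pairs_graph = em.predict(blocks, data)

# The plotting helpers now optionally persist the figure before showing it:
em.plot_distribution_of_all_weights(save_figure_path="weights_distribution.png")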
6 changes: 2 additions & 4 deletions src/pyjedai/vector_based_blocking.py
@@ -138,7 +138,6 @@ def build_blocks(self,
         if self.similarity_search != 'faiss':
             raise AttributeError("Only FAISS is available for now.")
 
-        print('Building blocks via Embeddings-NN Block Building [' + self.vectorizer + ', ' + self.similarity_search + ']')
         _start_time = time()
         self.blocks = dict()
         self.verbose = verbose
@@ -226,7 +225,7 @@ def build_blocks(self,
                 if verbose: print(f"{p2} -> Loaded Successfully")
             else:
                 if verbose: print("Embeddings not found for D2. Creating new ones.")
-        if not self._d1_loaded or not self._d2_loaded:
+        if not self._d1_loaded or (not data.is_dirty_er and not self._d2_loaded):
             if self.vectorizer in ['word2vec', 'fasttext', 'doc2vec', 'glove']:
                 self.vectors_1, self.vectors_2 = self._create_gensim_embeddings()
             elif self.vectorizer in ['bert', 'distilbert', 'roberta', 'xlnet', 'albert']:
@@ -368,7 +367,6 @@ def _create_pretrained_sentence_embeddings(self):
         vectors_2 = []
         if not self.data.is_dirty_er and not self._d2_loaded:
             for e2 in self._entities_d2:
-                # print("e2: ", e2)
                 vector = model.encode(e2)
                 vectors_2.append(vector)
                 self._progress_bar.update(1)
@@ -420,7 +418,7 @@ def _similarity_search_with_FAISS(self):
         self.blocks = dict()
         if self.verbose:
             print("Building blocks...")
-            print("disable", not self.verbose)
+
         for _entity in tqdm(range(0, self.neighbors.shape[0]), desc="Building blocks", disable=not self.verbose):
 
             _entity_id = self._si.d1_retained_ids[_entity] if self.data.is_dirty_er else self._si.d2_retained_ids[_entity]
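
Besides dropping stray debug prints, the widened condition fixes cached-embedding reuse for Dirty ER: with a single dataset there is no D2 to load, so _d2_loaded stays False and the old check forced recomputation even when D1's embeddings were already on disk. A minimal standalone sketch of the guard's truth table (function and argument names are hypothetical, mirroring the diff):

def needs_embedding_computation(d1_loaded: bool, d2_loaded: bool, is_dirty_er: bool) -> bool:
    # Old guard: `not d1_loaded or not d2_loaded`
    # New guard: only consult d2_loaded when a second dataset exists.
    return not d1_loaded or (not is_dirty_er and not d2_loaded)

# Dirty ER with cached D1 embeddings: no recomputation any more.
assert needs_embedding_computation(True, False, is_dirty_er=True) is False
# Clean-Clean ER still recomputes when D2 embeddings are missing.
assert needs_embedding_computation(True, False, is_dirty_er=False) is True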
15 changes: 13 additions & 2 deletions src/pyjedai/workflow.py
@@ -192,7 +192,7 @@ def export_pairs(self) -> pd.DataFrame:
         Returns:
             pd.DataFrame: pairs as a DataFrame
         """
-        return write(self.final_pairs, self.data)
+        return self.final_step_method.export_to_df(self.final_pairs)
 
     def _save_step(self, results: dict, configuration: dict) -> None:
         self.f1.append(results['F1 %'])
@@ -580,6 +580,7 @@ def run(self,
                                    if "attributes_2" in self.block_building else None,
                                    tqdm_disable=workflow_step_tqdm_disable)
         self.final_pairs = block_building_blocks
+        self.final_step_method = block_building_method
         if data.ground_truth is not None:
             res = block_building_method.evaluate(block_building_blocks,
                                                  export_to_dict=True,
@@ -604,6 +605,7 @@ def run(self,
                                                  tqdm_disable=workflow_step_tqdm_disable)
 
         self.final_pairs = bblocks = block_cleaning_blocks
+        # self.final_pairs = block_cleaning_method.export_to_df(self.final_pairs)
         if data.ground_truth is not None:
             res = block_cleaning_method.evaluate(bblocks,
                                                  export_to_dict=True,
@@ -625,6 +627,8 @@ def run(self,
                                    else block_building_blocks,
                                    data,
                                    tqdm_disable=workflow_step_tqdm_disable)
+        self.final_step_method = comparison_cleaning_method
+
         if data.ground_truth is not None:
             res = comparison_cleaning_method.evaluate(comparison_cleaning_blocks,
                                                       export_to_dict=True,
@@ -653,6 +657,8 @@ def run(self,
                                    tqdm_disable=workflow_step_tqdm_disable,
                                    **self.entity_matching["exec_params"])
 
+        self.final_step_method = entity_matching_method
+
         if data.ground_truth is not None:
             res = entity_matching_method.evaluate(em_graph,
                                                   export_to_dict=True,
@@ -671,7 +677,8 @@ def run(self,
             self.final_pairs = components = clustering_method.process(em_graph, data)
         else:
             self.final_pairs = components = clustering_method.process(em_graph, data, **self.clustering["exec_params"])
-
+
+        self.final_step_method = clustering_method
         self.clusters = components
         if data.ground_truth is not None:
             res = clustering_method.evaluate(components,
@@ -859,6 +866,8 @@ def run(self,
                                    **self.block_building["exec_params"])
 
         self.final_pairs = block_building_blocks
+        self.final_pairs = block_building_method.export_to_df(self.final_pairs)
+
         if data.ground_truth is not None:
             res = block_building_method.evaluate(block_building_blocks,
                                                  export_to_dict=True,
@@ -874,6 +883,8 @@ def run(self,
                                    if "params" in self.clustering \
                                    else self.clustering['method']()
         self.final_pairs = components = clustering_method.process(em_graph, data)
+        self.final_pairs = clustering_method.export_to_df(self.final_pairs)
+
         if data.ground_truth is not None:
             res = clustering_method.evaluate(components,
                                              export_to_dict=True,
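
Because run() now records final_step_method after each executed stage, export_pairs() delegates to that step's export_to_df regardless of where the workflow stopped, replacing the old write helper. A hedged end-to-end sketch (the dict-based schema mirrors how run() reads self.block_building and friends above; the class names follow pyjedai's documented modules, and the input file is a placeholder):

import pandas as pd
from pyjedai.datamodel import Data
from pyjedai.block_building import StandardBlocking
from pyjedai.matching import EntityMatching
from pyjedai.clustering import ConnectedComponentsClustering
from pyjedai.workflow import BlockingBasedWorkFlow

# Placeholder input; any CSV with an id column works here.
d1 = pd.read_csv("dataset_1.csv")
data = Data(dataset_1=d1, id_column_name_1="id")

wf = BlockingBasedWorkFlow(
    block_building=dict(method=StandardBlocking),
    entity_matching=dict(method=EntityMatching,
                         params=dict(metric="jaccard", similarity_threshold=0.0)),
    clustering=dict(method=ConnectedComponentsClustering),
)
wf.run(data)

# export_pairs() now routes through final_step_method.export_to_df,
# here the clustering step's exporter.
pairs_df = wf.export_pairs()
print(pairs_df.head())  # columns: id1, id2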
