diff --git a/docs/pyjedai/_version.py b/docs/pyjedai/_version.py index a73339b..ae73625 100644 --- a/docs/pyjedai/_version.py +++ b/docs/pyjedai/_version.py @@ -1 +1 @@ -__version__ = "0.0.8" +__version__ = "0.1.3" diff --git a/docs/pyjedai/block_building.py b/docs/pyjedai/block_building.py index fd29790..36e7414 100644 --- a/docs/pyjedai/block_building.py +++ b/docs/pyjedai/block_building.py @@ -13,7 +13,7 @@ from tqdm.auto import tqdm from .datamodel import Block, Data, PYJEDAIFeature -from .utils import (are_matching, drop_big_blocks_by_size, +from .utils import (are_matching, drop_big_blocks_by_size, create_entity_index, drop_single_entity_blocks, get_blocks_cardinality) from .evaluation import Evaluation @@ -93,39 +93,148 @@ def evaluate(self, self.stats(eval_blocks) return eval_result - def stats(self, blocks: dict) -> None: - self.list_of_sizes = [] + def stats(self, blocks: dict, verbose: bool = True) -> dict: + + # Atomic features + self.portion_of_singleton_entites = 0 + self.portion_of_duplicate_blocks = 0 # contain the same entities + self.num_of_block_assignments = 0 + self.num_of_minimal_blocks = 0 # one-comparison blocks + self.num_of_blocks_per_entity = 0 + self.average_number_of_block_assignments_per_comparison = 0 + self.optimality_distance = 0 self.entities_in_blocks = set() + self.size_per_block = [] + self.cardinalities = [] + self.num_of_blocks = len(blocks) for block in blocks.values(): self.sum_of_sizes += block.get_size() self.min_block_size = min(self.min_block_size, block.get_size()) if self.min_block_size else block.get_size() self.max_block_size = max(self.max_block_size, block.get_size()) if self.max_block_size else block.get_size() self.min_block_comparisons = min(self.min_block_comparisons, block.get_cardinality(self.data.is_dirty_er)) if self.min_block_comparisons else block.get_cardinality(self.data.is_dirty_er) self.max_block_comparisons = max(self.max_block_comparisons, block.get_cardinality(self.data.is_dirty_er)) if self.max_block_comparisons else block.get_cardinality(self.data.is_dirty_er) - self.list_of_sizes.append(block.get_size()) + self.size_per_block.append(block.get_size()) self.entities_in_blocks = self.entities_in_blocks.union(block.entities_D1) if not self.data.is_dirty_er: self.entities_in_blocks = self.entities_in_blocks.union(block.entities_D2) - self.total_num_of_comparisons += block.get_cardinality(self.data.is_dirty_er) - - self.num_of_blocks = len(blocks) + cardinality = block.get_cardinality(self.data.is_dirty_er) + self.cardinalities.append(cardinality) + if cardinality == 1: + self.num_of_minimal_blocks += 1 + + self.num_of_minimal_blocks /= self.num_of_blocks + self.num_of_entities_in_blocks = len(self.entities_in_blocks) + self.num_of_block_assignments = self.total_num_of_comparisons = sum(self.cardinalities) self.average_block_size = int(self.sum_of_sizes / self.num_of_blocks) - self.list_of_sizes = sorted(self.list_of_sizes) - median = self.list_of_sizes[int(len(self.list_of_sizes)/2)] - print( - "Statistics:" + - "\n\tNumber of blocks: " + str(self.num_of_blocks) + - "\n\tAverage block size: " + str(self.average_block_size) + - "\n\tMedian block size: " + str(median) + - "\n\tMax block size: " + str(self.max_block_size) + - "\n\tMin block size: " + str(self.min_block_size) + - "\n\tNumber of blocks dropped: " + str(self.num_of_blocks_dropped) + - "\n\tNumber of comparisons: " + str(self.total_num_of_comparisons) + - "\n\tMax comparisons per block: " + str(self.max_block_comparisons) + - "\n\tMin comparisons per block: " + 
str(self.min_block_comparisons) + - "\n\tEntities in blocks: " + str(len(self.entities_in_blocks)) - ) - print(u'\u2500' * 123) + self.size_per_block = sorted(self.size_per_block) + self.num_of_blocks_per_entity = self.num_of_blocks / self.num_of_entities_in_blocks + self.average_number_of_block_assignments_per_comparison = self.num_of_block_assignments / (2*self.total_num_of_comparisons) + median = self.size_per_block[int(len(self.size_per_block)/2)] + + entity_index = create_entity_index(blocks, self.data.is_dirty_er) + + # Distributional features + self.blocks_frequency = [] + self.relative_block_frequency = [] + self.comparison_frequency = [] + self.relative_comparison_frequency = [] + + for entity in entity_index: + if len(entity_index[entity]) == 1: + self.portion_of_singleton_entites += 1 + self.blocks_frequency.append(len(entity_index[entity])) + self.relative_block_frequency.append(len(entity_index[entity]) / self.num_of_blocks) + self.comparison_frequency.append(sum([blocks[block_key].get_cardinality(self.data.is_dirty_er) for block_key in entity_index[entity]])) + self.relative_comparison_frequency.append(sum([blocks[block_key].get_cardinality(self.data.is_dirty_er) for block_key in entity_index[entity]]) / self.total_num_of_comparisons) + + self.portion_of_singleton_entites /= self.num_of_entities_in_blocks + self.portion_of_minimal_blocks = self.num_of_minimal_blocks / self.num_of_blocks + + # Distributional features + self.average_blocks_per_entity = np.mean(self.blocks_frequency) + self.average_number_of_block_assignments_per_entity = np.mean(self.relative_block_frequency) + self.average_comparison_per_entity = np.mean(self.comparison_frequency) + self.average_relative_number_of_comparisons_per_entity = np.mean(self.relative_comparison_frequency) + + self.entropy_of_blocks_per_entity = -np.sum([p * np.log2(p) for p in self.blocks_frequency]) + self.entropy_of_comparison_per_entity = -np.sum([p * np.log2(p) for p in self.comparison_frequency]) + + self.kurtosis_of_blocks_per_entity = np.sum([(p - self.average_blocks_per_entity)**4 for p in self.blocks_frequency]) /\ + (self.num_of_blocks * self.average_blocks_per_entity**4) + self.kurtosis_of_comparison_per_entity = np.sum([(p - self.average_comparison_per_entity)**4 for p in self.comparison_frequency]) /\ + (self.num_of_blocks * self.average_comparison_per_entity**4) + + self.skewness_of_blocks_per_entity = np.sum([(p - self.average_blocks_per_entity)**3 for p in self.blocks_frequency]) /\ + (self.num_of_blocks * self.average_blocks_per_entity**3) + self.skewness_of_comparison_per_entity = np.sum([(p - self.average_comparison_per_entity)**3 for p in self.comparison_frequency]) /\ + (self.num_of_blocks * self.average_comparison_per_entity**3) + + + if verbose: + print( + "Statistics:" + + "\n\tNumber of blocks: " + str(self.num_of_blocks) + + "\n\tAverage block size: " + str(self.average_block_size) + + "\n\tMedian block size: " + str(median) + + "\n\tMax block size: " + str(self.max_block_size) + + "\n\tMin block size: " + str(self.min_block_size) + + "\n\tNumber of blocks dropped: " + str(self.num_of_blocks_dropped) + + "\n\tNumber of comparisons: " + str(self.total_num_of_comparisons) + + "\n\tMax comparisons per block: " + str(self.max_block_comparisons) + + "\n\tMin comparisons per block: " + str(self.min_block_comparisons) + + "\n\tEntities in blocks: " + str(len(self.entities_in_blocks)) + ) + print(u'\u2500' * 123) + print( + "\tAtomic feautures" + + "\n\t\tNumber of entities in blocks: " + 
str(self.num_of_entities_in_blocks) + + "\n\t\tNumber of blocks: " + str(self.num_of_blocks) + + "\n\t\tPortion of singleton entities: " + str(self.portion_of_singleton_entites) + + "\n\t\tTotal number of comparisons: " + str(self.total_num_of_comparisons) + + "\n\t\tNumber of blocks: " + str(self.num_of_blocks) + + "\n\t\tNumber of block assignments: " + str(self.num_of_block_assignments) + + "\n\t\tPortion of minimal blocks: " + str(self.portion_of_minimal_blocks) + + "\n\t\tNumber of blocks per entity: " + str(self.num_of_blocks_per_entity) + + "\n\t\tAverage number of block assignments per comparison: " + str(self.average_number_of_block_assignments_per_comparison) + ) + print(u'\u2500' * 123) + print( + "\tDistributional feautures" + + "\n\t\tAverage blocks per entity: " + str(self.average_blocks_per_entity) + + "\n\t\tAverage number of block assignments per entity: " + str(self.average_number_of_block_assignments_per_entity) + + "\n\t\tAverage comparison per entity: " + str(self.average_comparison_per_entity) + + "\n\t\tAverage relative number of comparisons per entity: " + str(self.average_relative_number_of_comparisons_per_entity) + + "\n\t\tEntropy of blocks per entity: " + str(self.entropy_of_blocks_per_entity) + + "\n\t\tEntropy of comparison per entity: " + str(self.entropy_of_comparison_per_entity) + + "\n\t\tKurtosis of blocks per entity: " + str(self.kurtosis_of_blocks_per_entity) + + "\n\t\tKurtosis of comparison per entity: " + str(self.kurtosis_of_comparison_per_entity) + + "\n\t\tSkewness of blocks per entity: " + str(self.skewness_of_blocks_per_entity) + + "\n\t\tSkewness of comparison per entity: " + str(self.skewness_of_comparison_per_entity) + ) + print(u'\u2500' * 123) + + return { + 'num_of_blocks': self.num_of_blocks, + 'average_block_size': self.average_block_size, + 'median_block_size': median, + 'max_block_size': self.max_block_size, + 'min_block_size': self.min_block_size, + 'num_of_blocks_dropped': self.num_of_blocks_dropped, + 'total_num_of_comparisons': self.total_num_of_comparisons, + 'max_block_comparisons': self.max_block_comparisons, + 'min_block_comparisons': self.min_block_comparisons, + 'entities_in_blocks': len(self.entities_in_blocks), + 'average_blocks_per_entity': self.average_blocks_per_entity, + 'average_number_of_block_assignments_per_entity': self.average_number_of_block_assignments_per_entity, + 'average_comparison_per_entity': self.average_comparison_per_entity, + 'average_relative_number_of_comparisons_per_entity': self.average_relative_number_of_comparisons_per_entity, + 'entropy_of_blocks_per_entity': self.entropy_of_blocks_per_entity, + 'entropy_of_comparison_per_entity': self.entropy_of_comparison_per_entity, + 'kurtosis_of_blocks_per_entity': self.kurtosis_of_blocks_per_entity, + 'kurtosis_of_comparison_per_entity': self.kurtosis_of_comparison_per_entity, + 'skewness_of_blocks_per_entity': self.skewness_of_blocks_per_entity, + 'skewness_of_comparison_per_entity': self.skewness_of_comparison_per_entity + } def export_to_df( self, @@ -140,9 +249,6 @@ def export_to_df( Returns: pd.DataFrame: Dataframe predicted pairs (can be exported to csv) """ - if self.data.ground_truth is None: - raise AttributeError("Can not proceed to evaluation without a ground-truth file. 
\ - Data object mush have initialized with the ground-truth file") pairs_df = pd.DataFrame(columns=['id1', 'id2']) for _, block in blocks.items(): if self.data.is_dirty_er: diff --git a/docs/pyjedai/clustering.py b/docs/pyjedai/clustering.py index d1c8c14..4f84470 100644 --- a/docs/pyjedai/clustering.py +++ b/docs/pyjedai/clustering.py @@ -2,14 +2,322 @@ from time import time import pandas as pd -from networkx import Graph, connected_components +from networkx import Graph, connected_components, gomory_hu_tree from tqdm.autonotebook import tqdm +from ordered_set import OrderedSet +import numpy as np +from scipy.sparse import csr_matrix, lil_matrix from .datamodel import Data, PYJEDAIFeature from .evaluation import Evaluation from .utils import are_matching +from collections import defaultdict +import random +from ordered_set import OrderedSet +RANDOM_SEED = 42 + +class EquivalenceCluster(PYJEDAIFeature): + + def __init__(self, data : Data) -> None: + super().__init__() + self.data : Data = data + self.d1_entities = OrderedSet() + self.d2_entities = OrderedSet() + + def __init__(self, data : Data, flattened_cluster : list) -> None: + super().__init__() + self.data : Data = data + self.d1_entities = set() + self.d2_entities = set() + self.add_entities(flattened_cluster) + + def get_entity_dataset(self, entity : int) -> set: + return self.d1_entities \ + if(entity < self.data.dataset_limit) \ + else self.d2_entities + + def add_entity(self, entity : int) -> None: + target_dataset_entities = self.get_entity_dataset(entity) + target_dataset_entities.add(entity) + + def add_entities(self, entities : list) -> None: + for entity in entities: + self.add_entity(entity) + + def get_entities(self) -> list: + return list((self.get_D1_entities() | self.get_D2_entities())) + + def get_D1_entities(self) -> set: + return self.d1_entities + + def get_D2_entities(self) -> set: + return self.d2_entities + + def has_entities(self) -> bool: + return self.has_D1_entities() or self.has_D2_entities() + + def has_D1_entities(self) -> bool: + return (len(self.d1_entities) > 0) + + def has_D2_entities(self) -> bool: + return (len(self.d1_entities) > 0) + + def has_entity(self, entity : int) -> bool: + target_dataset_entities = self.get_entity_dataset(entity) + return (entity in target_dataset_entities) + + def remove_entity(self, entity: int) -> None: + target_dataset_entities = self.get_entity_dataset(entity) + target_dataset_entities.remove(entity) + + def remove_entities(self, entities: list) -> None: + for entity in entities: + self.remove_entity(entity) + + def flatten(self) -> list: + flattened_cluster : list = [] + + for d1_entity in self.d1_entities: + flattened_cluster.append(d1_entity) + for d2_entity in self.d2_entities: + flattened_cluster.append(d2_entity) + + return flattened_cluster + + def evaluate(self, + prediction=None, + export_to_df: bool = False, + export_to_dict: bool = False, + with_classification_report: bool = False, + verbose: bool = True) -> any: + pass + + def _configuration(self) -> dict: + pass + + def stats(self) -> None: + pass + +class ExtendedSimilarityEdge(PYJEDAIFeature): + def __init__(self, + left_node : int, + right_node : int, + similarity : float, + active : bool = True) -> None: + super().__init__() + self.set_left_node(left_node=left_node) + self.set_right_node(right_node=right_node) + self.set_similarity(similarity=similarity) + self.set_active(active=active) + + def set_left_node(self, left_node : int): + self.left_node : int = left_node + + def set_right_node(self, 
right_node : int): + self.right_node : int = right_node + + def set_similarity(self, similarity : float): + self.similarity : float = similarity + + def set_active(self, active : bool): + self.active : bool = active + + def is_active(self): + return self.active + + def __lt__(self, other): + return self.similarity < other.similarity + + def __le__(self, other): + return self.similarity <= other.similarity + + def __eq__(self, other): + return self.similarity == other.similarity + + def __ne__(self, other): + return self.similarity != other.similarity + + def __gt__(self, other): + return self.similarity > other.similarity + + def __ge__(self, other): + return self.similarity >= other.similarity + + def evaluate(self, + prediction=None, + export_to_df: bool = False, + export_to_dict: bool = False, + with_classification_report: bool = False, + verbose: bool = True) -> any: + pass + + def _configuration(self) -> dict: + pass + + def stats(self) -> None: + pass + +class Vertex(PYJEDAIFeature): + def __init__(self, + identifier : int, + edges : list = None) -> None: + super().__init__() + self.set_identifier(identifier=identifier) + self.set_attached_edges(attached_edges=0) + self.set_weight_sum(weight_sum=0) + self.set_edges(edges={}) + if(edges is not None): self.insert_edges(edges=edges) + + def set_identifier(self, identifier : int) -> None: + self.identifier : int = identifier + + def set_attached_edges(self, attached_edges : int) -> None: + self.attached_edges : int = attached_edges + + def set_weight_sum(self, weight_sum : float) -> None: + self.weight_sum : float = weight_sum + + def set_edges(self, edges : dict) -> None: + self.edges : dict = edges + + def set_average_weight(self, average_weight : float) -> None: + self.average_weight : float = average_weight + + def insert_edges(self, edges : list) -> None: + for edge in edges: + self.insert_edge(edge=edge) + + def insert_edge(self, edge : tuple) -> None: + vertex, weight = edge + self.update_weight_sum_by(update_value=weight) + self.update_attached_edges_by(update_value=1) + self.edges[vertex] = weight + self.update_average_weight() + + def remove_edges(self, edges : list) -> None: + for edge in edges: + self.remove_edge(edge=edge) + + def remove_edge(self, edge : int) -> None: + weight = self.edges.pop(edge, None) + if(weight is not None): + self.update_attached_edges_by(update_value=-1) + self.update_weight_sum_by(update_value=-weight) + self.update_average_weight() + + def get_attached_edges(self) -> int: + return self.attached_edges + + def get_weight_sum(self) -> float: + return self.weight_sum + + def get_edges(self) -> list: + return self.edges + + def get_identifier(self) -> int: + return self.identifier + + def get_similarity_with(self, entity : int) -> float: + return self.edges[entity] if entity in self.edges else 0.0 + + def update_weight_sum_by(self, update_value : float) -> None: + self.set_weight_sum(self.get_weight_sum() + update_value) + + def update_attached_edges_by(self, update_value : float) -> None: + self.set_attached_edges(self.get_attached_edges() + update_value) + + def update_average_weight(self, negative = True) -> None: + _average_weight : float = (self.get_weight_sum() / self.get_attached_edges()) + _average_weight = -_average_weight if negative else _average_weight + self.set_average_weight(average_weight=_average_weight) + + def has_edges(self): + return (self.get_attached_edges() > 0) + + def __lt__(self, other): + return self.average_weight < other.average_weight + + def __le__(self, other): + return 
self.average_weight <= other.average_weight + + def __eq__(self, other): + return self.average_weight == other.average_weight + + def __ne__(self, other): + return self.average_weight != other.average_weight + + def __gt__(self, other): + return self.average_weight > other.average_weight + + def __ge__(self, other): + return self.average_weight >= other.average_weight + + def evaluate(self, + prediction=None, + export_to_df: bool = False, + export_to_dict: bool = False, + with_classification_report: bool = False, + verbose: bool = True) -> any: + pass + + def _configuration(self) -> dict: + pass + + def stats(self) -> None: + pass + +class RicochetCluster(PYJEDAIFeature): + def __init__(self, + center : int, + members : []) -> None: + super().__init__() + self.set_center(center=center) + self.set_members(members=set()) + self.add_members(new_members=members) + + def set_center(self, center : int) -> None: + self.center : int = center + + def set_members(self, members : set) -> None: + self.members : set = members + + def add_members(self, new_members : list) -> None: + for new_member in new_members: + self.add_member(new_member) + + def add_member(self, new_member: int) -> None: + self.members.add(new_member) + + def remove_member(self, member : int) -> None: + self.members.remove(member) + + def get_members(self) -> list: + return self.members + + def get_center(self) -> int: + return self.center + + def change_center(self, new_center : int): + self.remove_member(member=self.get_center()) + self.add_member(new_member=new_center) + self.set_center(center=new_center) + + def evaluate(self, + prediction=None, + export_to_df: bool = False, + export_to_dict: bool = False, + with_classification_report: bool = False, + verbose: bool = True) -> any: + pass + + def _configuration(self) -> dict: + pass + + def stats(self) -> None: + pass + class AbstractClustering(PYJEDAIFeature): def __init__(self) -> None: @@ -68,9 +376,6 @@ def export_to_df(self, prediction: list) -> pd.DataFrame: Returns: pd.DataFrame: Dataframe containg evaluation scores and stats """ - if self.data.ground_truth is None: - raise AttributeError("Can not proceed to evaluation without a ground-truth file. \ - Data object mush have initialized with the ground-truth file") pairs_df = pd.DataFrame(columns=['id1', 'id2']) for cluster in prediction: lcluster = list(cluster) @@ -88,9 +393,17 @@ def export_to_df(self, prediction: list) -> pd.DataFrame: ) return pairs_df - - + def sorted_indicators(self, first_indicator : int, second_indicator : int): + return (first_indicator, second_indicator) if (first_indicator < second_indicator) else (second_indicator, first_indicator) + def id_to_index(self, identifier : int): + return identifier \ + if identifier < self.data.dataset_limit \ + else (identifier - self.data.dataset_limit) + + def index_to_id(self, index : int, left_dataset : True): + return index if left_dataset else index + self.data.dataset_limit + class ConnectedComponentsClustering(AbstractClustering): """Creates the connected components of the graph. \ Applied to graph created from entity matching. 
\ @@ -174,7 +487,7 @@ def process(self, graph: Graph, data: Data, similarity_threshold: float = 0.1) - self.similarity_threshold: float = similarity_threshold start_time = time() - matched_entities = set() + matched_entities = OrderedSet() self.data = data new_graph = Graph() priority_queue = PriorityQueue(maxsize = graph.number_of_edges()*2) @@ -221,3 +534,892 @@ def process(self, graph: Graph, data: Data) -> list: def _configuration(self) -> dict: return {} + +class CenterClustering(AbstractClustering): + """Implements the Center Clustering algorithm. Input comparisons (graph edges) are sorted in descending order of similarity. Pairs of entities connected by these edges form the basis of the updated graph. Entities are evaluated to determine if they will serve + as a center of a future cluster or as its member. This evaluation is based on a comparison of their cumulative edge weights in the graph, + normalized by the number of edges in which they are involved. Finally, the algorithm identifies connected components within the graph, + using the previously defined centers as the focal points for forming clusters. + """ + + + _method_name: str = "Center Clustering" + _method_short_name: str = "CC" + _method_info: str = "Implements the Center Clustering algorithm," + \ + "In essence, it decides whether each node of an edge becomes a cluster center or a cluster member" + \ + " by comparing the nodes' edge-weight sums, normalized by the number of edges they participate in" + def __init__(self) -> None: + super().__init__() + self.similarity_threshold: float + + def process(self, graph: Graph, data: Data, similarity_threshold: float = 0.5) -> list: + + start_time = time() + self.similarity_threshold : float = similarity_threshold + self.data = data + edges_weight = defaultdict(float) + edges_attached = defaultdict(int) + comparisons = PriorityQueue(maxsize = graph.number_of_edges()*2) + + for (v1, v2, data) in graph.edges(data=True): + similarity_score = data.get('weight', 0) + if similarity_score > self.similarity_threshold: + comparisons.put((-similarity_score, v1, v2)) + edges_weight[v1] = edges_weight[v1] + similarity_score + edges_weight[v2] = edges_weight[v2] + similarity_score + + edges_attached[v1] = edges_attached[v1] + 1 + edges_attached[v2] = edges_attached[v2] + 1 + + new_graph = Graph() + cluster_centers = set() + cluster_members = set() + + while not comparisons.empty(): + similarity_score, v1, v2 = comparisons.get() + v1_is_center : bool = v1 in cluster_centers + v2_is_center : bool = v2 in cluster_centers + v1_is_member : bool = v1 in cluster_members + v2_is_member : bool = v2 in cluster_members + + if(not(v1_is_center or v2_is_center or v1_is_member or v2_is_member)): + w1 = edges_weight[v1] / edges_attached[v1] + w2 = edges_weight[v2] / edges_attached[v2] + + cluster_centers.add(v1 if w1 > w2 else v2) + cluster_members.add(v1 if w1 <= w2 else v2) + new_graph.add_edge(v1, v2, weight=-similarity_score) + elif ((v1_is_center and v2_is_center) or (v1_is_member and v2_is_member)): + continue + elif (v1_is_center and not v2_is_member): + cluster_members.add(v2) + new_graph.add_edge(v1, v2, weight=-similarity_score) + elif (v2_is_center and not v1_is_member): + cluster_members.add(v1) + new_graph.add_edge(v1, v2, weight=-similarity_score) + + clusters = list(connected_components(new_graph)) + self.execution_time = time() - start_time + return clusters + + def _configuration(self) -> dict: + return {} + +class BestMatchClustering(AbstractClustering): + """Implements the Best Match Clustering algorithm. 
Based on the supplied order, it traverses the entities of either the left (inorder) + or the right (reverse) dataset. For each entity, it retrieves all of its candidate pairs and stores them in descending similarity order. + For each source entity, only the best candidate is kept (only the highest-similarity edge is kept in the new graph). + """ + + _method_name: str = "Best Match Clustering" + _method_short_name: str = "BMC" + _method_info: str = "Implements the Best Match Clustering algorithm," + \ + "In essence, it keeps the best candidate for each entity of the source dataset (defined through ordering)" + def __init__(self) -> None: + super().__init__() + self.similarity_threshold: float + + def process(self, graph: Graph, data: Data, similarity_threshold: float = 0.5, order : str = "inorder") -> list: + + start_time = time() + self.data = data + self.similarity_threshold : float = similarity_threshold + self.order : str = order + + if(self.order != "inorder" and self.order != "reverse"): + raise ValueError(f"Best Match Clustering doesn't support {self.order} ordering - Use inorder/reverse.") + + number_of_comparisons = len(graph.edges(data=True)) + matched_entities = set() + new_graph = Graph() + candidates_of = {} + clusters = [] + + if(number_of_comparisons == 0): + return clusters + + if self.data.is_dirty_er: + raise ValueError(f"Best Match Clustering doesn't support Dirty ER.") + + source_entities_num = self.data.num_of_entities_1 \ + if(self.order == "inorder") else \ + self.data.num_of_entities_2 + + candidates_of = [PriorityQueue() for _ in range(source_entities_num)] + + for (v1, v2, data) in graph.edges(data=True): + similarity_score = data.get('weight', 0) + original_d1_entity, original_d2_entity = (v1, v2) if (v1 < v2) else (v2, v1) + + source_entity, target_entity = (original_d1_entity, original_d2_entity) \ + if(self.order == "inorder") else \ + (original_d2_entity, original_d1_entity) + + source_index = source_entity \ + if(self.order == "inorder") else \ + source_entity - self.data.dataset_limit + + if similarity_score > self.similarity_threshold: + candidates_of[source_index].put((-similarity_score, target_entity)) + + for source_index, source_candidates in enumerate(candidates_of): + while not source_candidates.empty(): + similarity, target_entity = source_candidates.get() + + if target_entity in matched_entities: + continue + + source_entity = source_index \ + if(self.order == "inorder") else \ + source_index + self.data.dataset_limit + + e1, e2 = (source_entity, target_entity) \ + if(self.order == "inorder") else \ + (target_entity, source_entity) + new_graph.add_edge(e1, e2, weight=-similarity) + matched_entities.add(source_entity) + matched_entities.add(target_entity) + break + + clusters = list(connected_components(new_graph)) + self.execution_time = time() - start_time + return clusters + + def _configuration(self) -> dict: + return {} + + def set_order(self, order : str) -> None: + self.order : str = order + + +class MergeCenterClustering(AbstractClustering): + """Implements the Merge Center Clustering algorithm. It is a simplified version of the Center Clustering algorithm, + where the two entities of a pair are not assigned the roles of cluster center and member based on their cumulative, normalized + weight in the original graph. Rather, entities of the left dataset are set as centers and their right dataset candidates + are set as members of the corresponding clusters. 
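+
+     A minimal usage sketch (illustrative only, not part of the original patch; it assumes `graph` is a
+     networkx Graph whose edges carry a 'weight' similarity score, e.g. produced by an entity-matching
+     step, and `data` is an initialized pyjedai Data object):
+
+         clusters = MergeCenterClustering().process(graph, data, similarity_threshold=0.5)
+         # `clusters` is a list of sets of entity ids (connected components of the pruned graph)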
+ """ + + + _method_name: str = "Merge Center Clustering" + _method_short_name: str = "MCC" + _method_info: str = "Ιmplements the Merge Center Clustering algorithm," + \ + "In essence, it implements Center Clustering without the cumulative, " + \ + "normalized weight calculation. Left dataset entities are set as candidate cluster centers." + def __init__(self) -> None: + super().__init__() + self.similarity_threshold: float + + def process(self, graph: Graph, data: Data, similarity_threshold: float = 0.5) -> list: + + start_time = time() + self.similarity_threshold : float = similarity_threshold + self.data = data + comparisons = PriorityQueue(maxsize = graph.number_of_edges()*2) + + for (v1, v2, data) in graph.edges(data=True): + similarity_score = data.get('weight', 0) + d1_id, d2_id = self.sorted_indicators(v1, v2) + if similarity_score > self.similarity_threshold: + comparisons.put((-similarity_score, d1_id, d2_id)) + + new_graph = Graph() + cluster_centers = set() + cluster_members = set() + + while not comparisons.empty(): + similarity_score, v1, v2 = comparisons.get() + v1_is_center : bool = v1 in cluster_centers + v2_is_center : bool = v2 in cluster_centers + v1_is_member : bool = v1 in cluster_members + v2_is_member : bool = v2 in cluster_members + + if(not(v1_is_center or v2_is_center or v1_is_member or v2_is_member)): + cluster_centers.add(v1) + cluster_members.add(v2) + new_graph.add_edge(v1, v2, weight=-similarity_score) + elif ((v1_is_center and v2_is_center) or (v1_is_member and v2_is_member)): + continue + elif (v1_is_center): + cluster_members.add(v2) + new_graph.add_edge(v1, v2, weight=-similarity_score) + elif (v2_is_center): + cluster_members.add(v1) + new_graph.add_edge(v1, v2, weight=-similarity_score) + + clusters = list(connected_components(new_graph)) + self.execution_time = time() - start_time + return clusters + + def _configuration(self) -> dict: + return {} + + +class CorrelationClustering(AbstractClustering): + """Implements the Correlation Clustering algorithm. Candidate pairs are mapped into a graph, whose connected components + act as our initial clusters. We iteratively choose one of the 3 possible moves (change, merge, break up cluster) and + we apply them on randomly chosen entities. We decide whether we should conduct the move or not, based on an objective function, + which quantifies the quality of our clusters (contain similar entities, seperate disimilar ones) + """ + + + _method_name: str = "Correlation Clustering" + _method_short_name: str = "CC" + _method_info: str = "Ιmplements the Correlation Clustering algorithm," + \ + "In essence, it implements iterative clustering, " + \ + "reassigning clusters to randomly chosen entities based on the reassignment's effect on our objective function " + \ + "that evaluates the quality of the newly defined clusters." 
+ def __init__(self) -> None: + super().__init__() + self.similarity_threshold: float + self.initial_threshold : float + self.similarity_threshold : float + self.non_similarity_threshold : float + self.move_limit : int + self.lsi_iterations: int + def process(self, + graph: Graph, + data: Data, + initial_threshold: float = 0.5, + similarity_threshold: float = 0.8, + non_similarity_threshold: float = 0.2, + move_limit: int = 1, + lsi_iterations: int = 10000) -> list: + + start_time = time() + self.data : Data = data + self.initial_threshold : float = initial_threshold + self.similarity_threshold : float = similarity_threshold + self.non_similarity_threshold : float = non_similarity_threshold + self.move_limit : int = move_limit + self.lsi_iterations: int = lsi_iterations + self.similarity = lil_matrix((self.data.num_of_entities_1, self.data.num_of_entities_2), dtype=float) + new_graph = graph.copy() + + for (v1, v2, data) in graph.edges(data=True): + d1_id, d2_id = self.sorted_indicators(v1, v2) + d1_index, d2_index = (self.id_to_index(d1_id), self.id_to_index(d2_id)) + similarity_score = data['weight'] + self.similarity[d1_index, d2_index] = similarity_score + + if similarity_score < self.initial_threshold: + new_graph.remove_edge(v1, v2) + + initial_clusters = [list(connected_component) for connected_component in connected_components(new_graph)] + + print(len(initial_clusters)) + self.clusters = [EquivalenceCluster(data=self.data, flattened_cluster=cluster) for cluster in initial_clusters] + self.initial_clusters_num = len(initial_clusters) + self.max_clusters_num = self.initial_clusters_num + 10 + self.entity_cluster_index = [0] * self.data.num_of_entities + self.valid_entities = set() + + for cluster_index, cluster in enumerate(self.clusters): + for entity in range(self.data.num_of_entities): + if(cluster.has_entity(entity=entity)): + self.valid_entities.add(entity) + self.entity_cluster_index[entity] = cluster_index + self.valid_entities = list(self.valid_entities) + + self.similar = lil_matrix((self.data.num_of_entities_1, self.data.num_of_entities_2), dtype=bool) + self.not_similar = lil_matrix((self.data.num_of_entities_1, self.data.num_of_entities_2), dtype=bool) + + for d1_index in range(self.data.num_of_entities_1): + for d2_index in range(d1_index, self.data.num_of_entities_2): + self.not_similar[d1_index, d2_index] = self.similarity[d1_index, d2_index] < self.non_similarity_threshold + self.similar[d1_index, d2_index] = self.similarity[d1_index, d2_index] > self.similarity_threshold + + random.seed(RANDOM_SEED) + previous_OF : int = self.calculate_OF() + + for iteration in range(self.lsi_iterations): + move_index : int = random.randint(0, self.move_limit - 1) + current_OF : int = self.move(move_index, previous_OF) + previous_OF = current_OF + + final_clusters : list = [] + for cluster in self.clusters: + if(cluster.has_entities()): + final_clusters.append(set(cluster.flatten())) + self.execution_time = time() - start_time + return final_clusters + + def calculate_OF(self) -> int: + OF : int = 0 + + for d1_index in range(self.data.num_of_entities_1): + for d2_index in range(d1_index, self.data.num_of_entities_2): + d1_entity = self.index_to_id(index=d1_index, left_dataset=True) + d2_entity = self.index_to_id(index=d2_index, left_dataset=False) + + similar_and_cluster_match = self.similar[d1_index, d2_index] and \ + (self.entity_cluster_index[d1_entity] == self.entity_cluster_index[d2_entity]) + dissimilar_and_cluster_missmatch = self.not_similar[d1_index, d2_index] and \ + 
(self.entity_cluster_index[d1_entity] != self.entity_cluster_index[d2_entity]) + + if(similar_and_cluster_match or dissimilar_and_cluster_missmatch): + OF += 1 + + return OF + + def move(self, move_index : int, previous_OF : int): + print(f"Move[{move_index}] OF[{previous_OF}]") + if(move_index == 0): + random_entity = random.choice(self.valid_entities) + random_cluster = random.randint(0, self.initial_clusters_num - 1) + while(not self.clusters[random_cluster].has_entities()): + random_cluster = random.randint(0, self.initial_clusters_num - 1) + return self.change_entity_cluster(previous_OF, random_entity, random_cluster) + elif(move_index == 1): + previous_cluster = random.randint(0, self.initial_clusters_num - 1) + while(not self.clusters[previous_cluster].has_entities()): + previous_cluster = random.randint(0, self.initial_clusters_num - 1) + + new_cluster = random.randint(0, self.initial_clusters_num - 1) + while((previous_cluster == new_cluster) or (not self.clusters[new_cluster].has_entities())): + new_cluster = random.randint(0, self.initial_clusters_num - 1) + + return self.unify_clusters(previous_OF, previous_cluster, new_cluster) + + elif(move_index == 2): + previous_cluster = random.randint(0, self.initial_clusters_num - 1) + while(not self.clusters[previous_cluster].has_entities()): + previous_cluster = random.randint(0, self.initial_clusters_num - 1) + return self.seperate_clusters(previous_OF, previous_cluster) + else: + raise ValueError(f"Invalid Move Index \"{move_index}\": Choose 0->2") + return float("inf") + + + def change_entity_cluster(self, previous_OF : int, entity : int, new_cluster : int): + previous_cluster = self.entity_cluster_index[entity] + self.entity_cluster_index[entity] = new_cluster + + new_OF = self.calculate_OF() + if(new_OF > previous_OF): + self.clusters[previous_cluster].remove_entity(entity) + self.clusters[new_cluster].add_entity(entity) + return new_OF + else: + self.entity_cluster_index[entity] = previous_cluster + return previous_OF + + def unify_clusters(self, previous_OF : int, previous_cluster_index : int, new_cluster_index : int): + previous_cluster = self.clusters[previous_cluster_index] + new_cluster = self.clusters[new_cluster_index] + to_be_removed_entities = [] + previous_cluster_entities = previous_cluster.get_entities() + + for entity in previous_cluster_entities: + to_be_removed_entities.append(entity) + self.entity_cluster_index[entity] = new_cluster_index + + new_OF : int = self.calculate_OF() + + if(new_OF > previous_OF): + previous_cluster.remove_entities(previous_cluster_entities) + new_cluster.add_entities(previous_cluster_entities) + return new_OF + + for to_be_removed_entity in to_be_removed_entities: + self.entity_cluster_index[to_be_removed_entity] = previous_cluster_index + + return previous_OF + + def seperate_clusters(self, previous_OF, previous_cluster_index): + previous_cluster = self.clusters[previous_cluster_index] + previous_cluster_entities = previous_cluster.get_entities() + to_be_removed_entities = [] + new_cluster_index = self.initial_clusters_num + + for index in range(0, len(previous_cluster_entities), 2): + to_be_removed_entity = previous_cluster_entities[index] + to_be_removed_entities.append(to_be_removed_entity) + self.entity_cluster_index[to_be_removed_entity] = new_cluster_index + + new_OF : int = self.calculate_OF() + + if(new_OF > previous_OF): + self.clusters.append(EquivalenceCluster(data=self.data, flattened_cluster=to_be_removed_entities)) + self.initial_clusters_num += 1 + 
previous_cluster.remove_entities(to_be_removed_entities) + return new_OF + + for to_be_removed_entity in to_be_removed_entities: + self.entity_cluster_index[to_be_removed_entity] = previous_cluster_index + + return previous_OF + + def _configuration(self) -> dict: + return {} + +class CutClustering(AbstractClustering): + """Implements the Cut Clustering algorithm. Retains the candidate pairs whose similarity is over the specified threshold. + Those pairs are mapped into graph edges. Using the newly defined graph, we retrieve its Gomory Hu Tree representation + using the Edmonds Karp flow function, while edges' capacity is considered to be infinite. We return the connected components + of the resulting minimum s-t cuts for the pairs in the original, trimmed graph. + """ + + _method_name: str = "Cut Clustering" + _method_short_name: str = "CTC" + _method_info: str = "Implements the Cut Clustering algorithm," + \ + "In essence, it calculates the Gomory Hu Tree of the graph resulting from input similarity pairs. " + \ + "We retain the connected components of this tree." + def __init__(self) -> None: + super().__init__() + self.similarity_threshold: float + + def process(self, graph: Graph, data: Data, similarity_threshold: float = 0.5, alpha: float = 0.2) -> list: + + start_time = time() + self.similarity_threshold : float = similarity_threshold + self.data = data + threshold_trimmed_graph : Graph = Graph() + + for (v1, v2, data) in graph.edges(data=True): + similarity_score = data.get('weight', 0) + d1_id, d2_id = self.sorted_indicators(v1, v2) + if similarity_score > self.similarity_threshold: + threshold_trimmed_graph.add_edge(d1_id, d2_id, weight=similarity_score) + + sink_node : int = self.data.num_of_entities + threshold_trimmed_graph.add_node(sink_node) + for node in graph.nodes(): + if node != sink_node: + threshold_trimmed_graph.add_edge(sink_node, node, weight=alpha) + + final_gomory_hu_tree = gomory_hu_tree(G=threshold_trimmed_graph, capacity='weight') + final_gomory_hu_tree.remove_node(sink_node) + clusters = list(connected_components(final_gomory_hu_tree)) + + print(len(clusters)) + self.execution_time = time() - start_time + return clusters + + def _configuration(self) -> dict: + return {} + +class MarkovClustering(AbstractClustering): + """Implements the Markov Clustering algorithm. It simulates random walks on an (n x n) matrix used as the adjacency matrix + of a weighted similarity graph. It alternates an expansion step and an inflation step until an equilibrium state is reached. + Entries with similarity above the threshold are inserted into the final graph, whose connected components we retain. + """ + + _method_name: str = "Markov Clustering" + _method_short_name: str = "MCL" + _method_info: str = "Implements the Markov Clustering algorithm," + \ + "In essence, it simulates random walks on an (n x n) matrix as the adjacency " + \ + "matrix of a graph. It alternates an expansion step and an inflation step " + \ + "until an equilibrium state is reached. We retain the connected components " + \ + "of the graph resulting from final similarity matrix entries valued over threshold." 
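+
+     # Illustrative usage sketch (added for documentation; parameter names and defaults mirror
+     # the process() signature below):
+     #
+     #     clusters = MarkovClustering().process(
+     #         graph, data,
+     #         similarity_threshold=0.5,             # edges kept in the initial similarity matrix
+     #         cluster_threshold=0.001,              # final matrix entries above this become edges of the output graph
+     #         matrix_similarity_threshold=0.00001,  # per-entry convergence tolerance between iterations
+     #         similarity_checks_limit=10)           # maximum number of expansion/inflation rounds
+     #     # returns the connected components of the resulting graph as a list of sets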
+ def __init__(self) -> None: + super().__init__() + self.similarity_threshold : float + self.cluster_threshold : float + self.matrix_similarity_threshold : float + self.similarity_checks_limit : int + + def process(self, graph: Graph, + data: Data, + similarity_threshold: float = 0.5, + cluster_threshold: float = 0.001, + matrix_similarity_threshold: float = 0.00001, + similarity_checks_limit : int = 10) -> list: + + start_time = time() + self.similarity_threshold : float = similarity_threshold + self.cluster_threshold : float = cluster_threshold + self.matrix_similarity_threshold : float = matrix_similarity_threshold + self.similarity_checks_limit : int = similarity_checks_limit + self.data = data + self.current_similarity = lil_matrix((self.data.num_of_entities, data.num_of_entities), dtype=float) + new_graph : Graph = Graph() + + + for (v1, v2, data) in graph.edges(data=True): + d1_id, d2_id = self.sorted_indicators(v1, v2) + similarity_score = data.get('weight', 0) + + if(similarity_score > self.similarity_threshold): + self.current_similarity[d1_id, d2_id] = similarity_score + self.current_similarity[d2_id, d1_id] = similarity_score + + self.set_node_loop(similarity = 1.0) + self.normalize() + + for check in range(self.similarity_checks_limit): + self.previous_similarity = self.current_similarity.copy() + self.inflate() + self.normalize() + self.expand() + self.normalize() + print(check+1) + if(self.equilibrium()): + break + + edges_populated = self.get_existing_indices(matrix=self.current_similarity) + for edge in edges_populated: + row, column = edge + new_similarity = self.current_similarity[row, column] + final_row, final_column = self.sorted_indicators(row, column) + + if(new_graph.has_edge(final_row, final_column)): + existing_similarity = new_graph[final_row][final_column]["weight"] + if(new_similarity > existing_similarity): + new_graph[final_row][final_column]["weight"] = new_similarity + elif(new_similarity > self.cluster_threshold): + new_graph.add_edge(final_row, final_column, weight=new_similarity) + + clusters = list(connected_components(new_graph)) + self.execution_time = time() - start_time + return clusters + + def set_node_loop(self, similarity : float = 1.0) -> None: + rows : int = self.current_similarity.shape[0] + print(rows) + for row in range(rows): + self.current_similarity[row, row] = similarity + + def normalize(self) -> None: + column_sums = self.current_similarity.sum(axis=0) + column_sums[column_sums == 0] = 1 + self.current_similarity = self.current_similarity.multiply(1. 
/ column_sums) + + def expand(self) -> None: + self.current_similarity = self.current_similarity.power(2) + + def inflate(self) -> None: + self.current_similarity = self.current_similarity.dot(self.current_similarity) + + def equilibrium(self) -> None: + self.current_similarity = self.current_similarity.tocsr() + self.previous_similarity = self.previous_similarity.tocsr() + + current_indices = self.get_existing_indices(matrix=self.current_similarity) + previous_indices = self.get_existing_indices(matrix=self.previous_similarity) + shared_indices = current_indices & previous_indices + + for indices in shared_indices: + row, column = indices + if(abs(self.current_similarity[row, column] - self.previous_similarity[row, column]) > self.matrix_similarity_threshold): + return False + + return True + + def get_existing_indices(self, matrix): + return set([indices for indices in zip(*matrix.nonzero())]) + + def _configuration(self) -> dict: + return {} + +class KiralyMSMApproximateClustering(AbstractClustering): + """Implements the Kiraly MSM Approximate Clustering algorithm. Implements the so-called "New Algorithm" + by Zoltan Kiraly 2013, which is a 3/2-approximation to the Maximum Stable Marriage (MSM) problem. + The pairs resulting from the approximation of the stable relationships are translated into a graph, + whose connected components we retain. + """ + + _method_name: str = "Kiraly MSM Approximate Clustering" + _method_short_name: str = "KMAC" + _method_info: str = "Ιmplements the Kiraly MSM Approximate Clustering algorithm," + \ + "In essence, it is a 3/2-approximation to the Maximum Stable Marriage (MSM) problem." + def __init__(self) -> None: + super().__init__() + self.similarity_threshold : float + + def process(self, + graph: Graph, + data: Data, + similarity_threshold: float = 0.1) -> list: + + start_time = time() + self.similarity_threshold : float = similarity_threshold + self.data = data + number_of_comparisons : int = len(graph.edges(data=True)) + clusters : list = [] + + if(number_of_comparisons == 0): + return clusters + + if self.data.is_dirty_er: + raise ValueError(f"Kiraly MSM Approximate Clustering doesn't support Dirty ER.") + + new_graph : Graph = Graph() + men : set = set() + self.men_candidates : dict = defaultdict(list) + self.women_candidates : dict = defaultdict(list) + + for (v1, v2, data) in graph.edges(data=True): + man, woman = self.sorted_indicators(v1, v2) + similarity = data.get('weight', 0) + if similarity > self.similarity_threshold: + self.men_candidates[man].append(ExtendedSimilarityEdge(left_node=man, + right_node=woman, + similarity=similarity)) + self.women_candidates[woman].append(ExtendedSimilarityEdge(left_node=woman, + right_node=man, + similarity=similarity)) + men.add(man) + + for man, candidates in self.men_candidates.items(): + self.men_candidates[man] = sorted(candidates, reverse=True) + for woman, candidates in self.women_candidates.items(): + self.women_candidates[woman] = sorted(candidates, reverse=True) + + self.is_bachelor : list = [False] * self.data.num_of_entities_1 + self.is_uncertain : list = [False] * self.data.num_of_entities_1 + self.fiances : list = [-1] * self.data.num_of_entities_2 + self.current_matches : dict = {} + self.free_men : list = list(men) + + while(len(self.free_men) > 0): + man = self.free_men.pop(0) + woman = self.get_first_active_candidate(entity=man) + + if(woman == -1): + if(not self.is_bachelor[man]): + self.is_bachelor[man] = True + if(not self.has_candidates(entity=man)): + self.free_men.append(man) + 
self.activate_candidates_of(entity=man) + else: + continue + else: + fiance = self.get_woman_fiance(woman=woman) + if(fiance == -1): + self.add_match(man=man, woman=woman, similarity=0.0) + self.set_woman_fiance(woman=woman, fiance=man) + else: + if(self.accepts_proposal(woman=woman, + man=man)): + self.remove_match(man=fiance, woman=woman) + self.add_match(man=man, woman=woman, similarity=0.0) + self.set_woman_fiance(woman=woman, fiance=man) + if(not self.is_uncertain[fiance]): + self.deactivate_candidate(entity=fiance, candidate=woman) + else: + self.deactivate_candidate(entity=man, candidate=woman) + + for _, edges in self.current_matches.items(): + for edge in edges: + man, woman, similarity = edge.left_node, edge.right_node, edge.similarity + new_graph.add_edge(man, woman, weight=similarity) + + clusters = list(connected_components(new_graph)) + self.execution_time = time() - start_time + return clusters + + def is_male(self, entity: int) -> bool: + return entity < self.data.dataset_limit + + def get_entity_candidates(self, entity : int) -> PriorityQueue: + candidates = self.men_candidates if self.is_male(entity) else self.women_candidates + return candidates[entity] + + def has_candidates(self, entity : int) -> bool: + return len(self.get_entity_candidates(entity=entity)) > 0 + + def activate_candidates_of(self, entity : int) -> None: + candidates = self.get_entity_candidates(entity=entity) + for candidate in candidates: + candidate.set_active(active=True) + + def get_first_active_candidate(self, entity : int) -> int: + candidates = self.get_entity_candidates(entity=entity) + for candidate in candidates: + if(candidate.is_active()): + return candidate.right_node + return -1 + + def add_match(self, man : int, woman : int, similarity : float) -> None: + if man not in self.current_matches: + self.current_matches[man] = [] + self.current_matches[man].append(ExtendedSimilarityEdge(left_node=man, + right_node=woman, + similarity=similarity)) + def remove_match(self, man : int, woman : int) -> None: + self.current_matches[man] = [match for match in self.current_matches[man] \ + if (match.left_node != man or match.right_node != woman)] + + def get_woman_fiance(self, woman : int) -> int: + return self.fiances[woman - self.data.dataset_limit] + + def set_woman_fiance(self, woman : int, fiance : int) -> None: + self.fiances[woman - self.data.dataset_limit] = fiance + + def deactivate_candidate(self, entity : int, candidate : int) -> bool: + entity_candidates = self.get_entity_candidates(entity=entity) + for entity_candidate in entity_candidates: + if(entity_candidate.right_node == candidate): + entity_candidate.set_active(active=False) + return True + return False + + def accepts_proposal(self, woman : int, man : int): + current_fiance : int = self.get_woman_fiance(woman=woman) + + if(current_fiance == -1): + return True + if(self.is_uncertain[current_fiance]): + return True + + man_score : float = 0.0 + current_fiance_score : float = 0.0 + + woman_candidates : list = self.get_entity_candidates(entity=woman) + + for comparison in woman_candidates: + candidate : int = comparison.right_node + if(candidate == man): + man_score = comparison.similarity + elif(candidate == current_fiance): + current_fiance_score = comparison.similarity + + return (man_score > current_fiance_score) + + def _configuration(self) -> dict: + return {} + +class RicochetSRClustering(AbstractClustering): + """Implements the Ricochet SR Clustering algorithm. 
Vertices are sorted in descending order of the average weight of their edges and are processed sequentially. + Each processed vertex becomes a new cluster center if at least one of its neighbors is more similar to it than to its current center; + the affected members are re-assigned, centers that are left with a single entity are merged into the most similar remaining cluster, + and entities that remain unassigned in the end form singleton clusters. + """ + + _method_name: str = "Ricochet SR Clustering" + _method_short_name: str = "RSRC" + _method_info: str = "Implements the Ricochet SR Clustering algorithm," + \ + "In essence, it assigns each entity to the cluster center it is most similar to, processing vertices in descending order of average edge weight." + def __init__(self) -> None: + super().__init__() + self.similarity_threshold : float + + def process(self, + graph: Graph, + data: Data, + similarity_threshold: float = 0.5) -> list: + + start_time = time() + self.similarity_threshold : float = similarity_threshold + self.data = data + clusters : list = [] + self.vertices : dict = {} + self.sorted_vertices = PriorityQueue(maxsize = self.data.num_of_entities) + + for (v1, v2, data) in graph.edges(data=True): + d1_id, d2_id = self.sorted_indicators(v1, v2) + similarity = data.get('weight', 0) + if similarity > self.similarity_threshold: + if d1_id not in self.vertices: self.vertices[d1_id] = Vertex(identifier=d1_id) + if d2_id not in self.vertices: self.vertices[d2_id] = Vertex(identifier=d2_id) + self.vertices[d1_id].insert_edge(edge=(d2_id, similarity)) + self.vertices[d2_id].insert_edge(edge=(d1_id, similarity)) + + for _, vertex in self.vertices.items(): + if(vertex.has_edges()): + self.sorted_vertices.put(vertex) + + if(self.sorted_vertices.empty()): + return clusters + + self.centers : set = set() + self.members : set = set() + self.center_of : dict = {} + self.similarity_with_center : dict = defaultdict(float) + self.current_clusters : dict = defaultdict(set) + + top_vertex : Vertex = self.sorted_vertices.get() + vertex_id : int = top_vertex.get_identifier() + self.centers.add(vertex_id) + self.center_of[vertex_id] = vertex_id + self.current_clusters[vertex_id].add(vertex_id) + self.similarity_with_center[vertex_id] = 1.0 + + top_vertex_neighbor = list(top_vertex.edges.keys())[0] + self.members.add(top_vertex_neighbor) + self.center_of[top_vertex_neighbor] = vertex_id + self.current_clusters[vertex_id].add(top_vertex_neighbor) + self.similarity_with_center[top_vertex_neighbor] = top_vertex.get_similarity_with(top_vertex_neighbor) + + while(not self.sorted_vertices.empty()): + vertex = self.sorted_vertices.get() + vertex_id = vertex.get_identifier() + to_reassign : set = set() + centers_to_reassign : set = set() + + for neighbor, similarity in vertex.edges.items(): + if(neighbor in self.centers): + continue + previous_similarity = self.similarity_with_center[neighbor] + if(previous_similarity >= similarity): + continue + to_reassign.add(neighbor) + break + + if(to_reassign): + if(vertex_id in self.members): + self.members.remove(vertex_id) + previous_center = self.center_of[vertex_id] + self.current_clusters[previous_center].remove(vertex_id) + if(len(self.current_clusters[previous_center]) < 2): + centers_to_reassign.add(previous_center) + to_reassign.add(vertex_id) + for assignee in to_reassign: + self.current_clusters[vertex_id].add(assignee) + self.centers.add(vertex_id) + + for reassign in to_reassign: + if(reassign != vertex_id): + if(reassign in self.members): + reassign_previous_center = self.center_of[reassign] + self.current_clusters[reassign_previous_center].remove(reassign) + + if(len(self.current_clusters[reassign_previous_center]) < 2): + centers_to_reassign.add(reassign_previous_center) + 
self.members.add(reassign) + self.center_of[reassign] = vertex_id + self.similarity_with_center[reassign] = vertex.get_similarity_with(reassign) + + for center_to_reassign in centers_to_reassign: + if(len(self.current_clusters[center_to_reassign]) > 1): + continue + self.centers.remove(center_to_reassign) + _ = self.current_clusters.pop(center_to_reassign, None) + + max_similarity : float = 0.0 + new_center : int = vertex_id + + for center in self.centers: + new_similarity : float = self.vertices[center].get_similarity_with(center_to_reassign) + if(new_similarity > 0.0): + if(len(self.current_clusters[center]) > 1): + continue + if(new_similarity > max_similarity): + max_similarity = new_similarity + new_center = center + if(len(self.current_clusters[new_center]) > 1): + continue + self.current_clusters[new_center].add(center_to_reassign) + self.members.add(center_to_reassign) + self.center_of[center_to_reassign]= new_center + self.similarity_with_center[center_to_reassign] = max_similarity + + for entity in range(self.data.num_of_entities): + if(entity not in self.members and entity not in self.centers): + self.centers.add(entity) + self.center_of[entity] = entity + self.current_clusters[entity].add(entity) + self.similarity_with_center[entity] = 1.0 + + clusters = [] + for center, members in self.current_clusters.items(): + center_equivalence_cluster = EquivalenceCluster(data=self.data, + flattened_cluster=list(members)) + clusters.append(set(center_equivalence_cluster.flatten())) + + self.execution_time = time() - start_time + return clusters + + def _configuration(self) -> dict: + return {} \ No newline at end of file diff --git a/docs/pyjedai/comparison_cleaning.py b/docs/pyjedai/comparison_cleaning.py index b2608e8..4d32086 100644 --- a/docs/pyjedai/comparison_cleaning.py +++ b/docs/pyjedai/comparison_cleaning.py @@ -157,13 +157,9 @@ def export_to_df(self, prediction) -> pd.DataFrame: Returns: pd.DataFrame: Dataframe with the predicted pairs """ - if self.data.ground_truth is None: - raise AttributeError("Can not proceed to evaluation without a ground-truth file. 
\ - Data object mush have initialized with the ground-truth file") pairs_df = pd.DataFrame(columns=['id1', 'id2']) - for entity_id, candidates in prediction.items(): - id1 = self.data._gt_to_ids_reversed_1[entity_id] + id1 = self.data._gt_to_ids_reversed_1[entity_id] for candiadate_id in candidates: id2 = self.data._gt_to_ids_reversed_1[candiadate_id] if self.data.is_dirty_er \ else self.data._gt_to_ids_reversed_2[candiadate_id] @@ -196,19 +192,28 @@ def __init__(self) -> None: def _apply_main_processing(self) -> dict: self._counters = np.empty([self.data.num_of_entities], dtype=float) self._flags = np.empty([self.data.num_of_entities], dtype=int) - if self.weighting_scheme == 'EJS': + if(self._comparisons_per_entity_required()): self._set_statistics() self._set_threshold() return self._prune_edges() + def _comparisons_per_entity_required(self): + return (self.weighting_scheme == 'EJS' or + self.weighting_scheme == 'CNC' or + self.weighting_scheme == 'SNC' or + self.weighting_scheme == 'SND' or + self.weighting_scheme == 'CND' or + self.weighting_scheme == 'CNJ' or + self.weighting_scheme == 'SNJ') + def _get_weight(self, entity_id: int, neighbor_id: int) -> float: ws = self.weighting_scheme - if ws == 'ARCS' or ws == 'CBS': + if ws == 'CN-CBS' or ws == 'CBS' or ws == 'SN-CBS': return self._counters[neighbor_id] # CARDINALITY_NORM_COSINE, SIZE_NORM_COSINE elif ws == 'CNC' or ws == 'SNC': - return self._counters[neighbor_id] / float(sqrt(len(self._comparisons_per_entity[entity_id]) * self._comparisons_per_entity[neighbor_id])) + return self._counters[neighbor_id] / float(sqrt(self._comparisons_per_entity[entity_id] * self._comparisons_per_entity[neighbor_id])) # SIZE_NORM_DICE, CARDINALITY_NORM_DICE elif ws == 'SND' or ws == 'CND': return 2 * self._counters[neighbor_id] / float(self._comparisons_per_entity[entity_id] + self._comparisons_per_entity[neighbor_id]) @@ -372,8 +377,10 @@ def _process_entity(self, entity_id: int) -> None: if self._flags[neighbor_id] != entity_id: self._counters[neighbor_id] = 0 self._flags[neighbor_id] = entity_id - if self.weighting_scheme == 'ARCS': + if self.weighting_scheme == 'CN-CBS' or self.weighting_scheme == 'CNC' or self.weighting_scheme == 'CND' or self.weighting_scheme == 'CNJ': self._counters[neighbor_id] += 1 / self._blocks[block_id].get_cardinality(self.data.is_dirty_er) + if self.weighting_scheme == 'SN-CBS' or self.weighting_scheme == 'SNC' or self.weighting_scheme == 'SND' or self.weighting_scheme == 'SNJ': + self._counters[neighbor_id] += 1 / self._blocks[block_id].get_size() else: self._counters[neighbor_id] += 1 self._valid_entities.add(neighbor_id) @@ -480,6 +487,7 @@ def __init__(self, weighting_scheme: str = 'CBS') -> None: self._nearest_entities: dict self._node_centric = True self._top_k_edges: PriorityQueue + self._number_of_nearest_neighbors : int = None def _prune_edges(self) -> dict: self._nearest_entities = dict() @@ -508,10 +516,13 @@ def _is_valid_comparison(self, entity_id: int, neighbor_id: int) -> bool: return True def _set_threshold(self) -> None: - block_assignments = 0 - for block in self._blocks.values(): - block_assignments += block.get_size() - self._threshold = max(1, block_assignments / self.data.num_of_entities) + if(self._number_of_nearest_neighbors is None): + block_assignments = 0 + for block in self._blocks.values(): + block_assignments += block.get_size() + self._threshold = max(1, block_assignments / self.data.num_of_entities) + else: + self._threshold = self._number_of_nearest_neighbors def 
_verify_valid_entities(self, entity_id: int) -> None: if entity_id not in self._entity_index: @@ -546,7 +557,7 @@ class ReciprocalCardinalityNodePruning(CardinalityNodePruning): "that correspond to edges in the blocking graph that are among " + \ "the top-k weighted ones for both adjacent entities/nodes." - def __init__(self, weighting_scheme: str = 'ARCS') -> None: + def __init__(self, weighting_scheme: str = 'CN-CBS') -> None: super().__init__(weighting_scheme) def _is_valid_comparison(self, entity_id: int, neighbor_id: int) -> bool: @@ -645,7 +656,7 @@ class ReciprocalWeightedNodePruning(WeightedNodePruning): "that correspond to edges in the blocking graph that are " + \ "exceed the average edge weight in both adjacent node neighborhoods." - def __init__(self, weighting_scheme: str = 'ARCS') -> None: + def __init__(self, weighting_scheme: str = 'CN-CBS') -> None: super().__init__(weighting_scheme) def _get_valid_weight(self, entity_id: int, neighbor_id: int) -> float: @@ -665,6 +676,7 @@ def _set_threshold(self) -> None: def process(self, blocks: dict, data: Data, tqdm_disable: bool = False, store_weights: bool = True, cc: AbstractMetablocking = None, emit_all_tps_stop : bool = False) -> dict: self._emit_all_tps_stop : bool = emit_all_tps_stop + self._budget = self._budget if not self._emit_all_tps_stop else float('inf') if(cc is None): return super().process(blocks, data, tqdm_disable, store_weights) else: @@ -697,20 +709,22 @@ def __init__(self, weighting_scheme: str = 'CBS', budget: int = 0) -> None: self._budget = budget def _set_threshold(self) -> None: - self._threshold = max(1, 2 * self._budget / self.data.num_of_entities) if not self._emit_all_tps_stop else 2 * self._budget + self._threshold = self._number_of_nearest_neighbors def process(self, blocks: dict, data: Data, + number_of_nearest_neighbors : int = 10, tqdm_disable: bool = False, store_weights: bool = True, cc: AbstractMetablocking = None, emit_all_tps_stop : bool = False) -> dict: self._emit_all_tps_stop : bool = emit_all_tps_stop + self._number_of_nearest_neighbors : int = number_of_nearest_neighbors if(cc is None): - return super().process(blocks, data, tqdm_disable, store_weights) + return super().process(blocks=blocks, data=data, tqdm_disable=tqdm_disable, store_weights=store_weights) else: - self._threshold = max(1, 2 * self._budget / data.num_of_entities) if not self._emit_all_tps_stop else 2 * self._budget + self._threshold = self._number_of_nearest_neighbors self.trimmed_blocks : dict = defaultdict(set) for entity_id, neighbors in blocks.items(): @@ -753,9 +767,10 @@ def process( self, blocks: dict, data: Data, + window_size : int = 10, tqdm_disable: bool = False, emit_all_tps_stop : bool = False - ) -> PriorityQueue: + ) -> List[Tuple[float, int, int]]: """Calculates top comparisons for Progressive Matching Args: @@ -778,6 +793,7 @@ def process( self._emit_all_tps_stop : bool = emit_all_tps_stop self._num_of_blocks = len(blocks) self._blocks: dict = blocks + self._max_window_size : int = window_size self._sorted_entity_ids = get_sorted_blocks_shuffled_entities(self.data.is_dirty_er, self._blocks) self._total_sorted_entities = len(self._sorted_entity_ids) @@ -787,7 +803,7 @@ def process( self._flags = np.empty([self.data.num_of_entities], dtype=int) self._counters[:] = 0 self._flags[:] = -1 - self._pairs = self._apply_main_processing() + self._pairs : List[Tuple[float, int, int]]= self._apply_main_processing() self.execution_time = time() - start_time self._progress_bar.close() @@ -801,8 +817,12 @@ def 
_get_weight(self, entity_id: int, neighbor_id: int) -> float: return self._counters[neighbor_id] / denominator elif ws == 'ACF' or ws == 'ID': return self._counters[neighbor_id] + elif ws == 'COSINE': + return self._counters[neighbor_id] / float(sqrt(len(self._position_index.get_positions(entity_id)) * len(self._position_index.get_positions(neighbor_id)))) + elif ws == 'DICE': + return 2 * self._counters[neighbor_id] / float(len(self._position_index.get_positions(entity_id)) + len(self._position_index.get_positions(neighbor_id))) else: - raise ValueError("This weighting scheme does not exist") + raise ValueError("This weighting scheme does not exist") def valid_entity_neighbor_index(self, entity: int, neighbor_index: int) -> bool: """Verifies if the neighbor identifier at the specified index is valid for candidate (the pair hasn't been considered previously) @@ -837,43 +857,35 @@ class GlobalProgressiveSortedNeighborhood(ProgressiveSortedNeighborhood): def __init__(self, weighting_scheme: str = 'ACF', budget: int = 0) -> None: super().__init__(weighting_scheme, budget) - def _apply_main_processing(self) -> PriorityQueue: - self._max_window = 2 if self.data.num_of_entities <= 100 else int(2 ** (math.log10(self.data.num_of_entities) + 1) + 1) + def _apply_main_processing(self) -> List[Tuple[float, int, int]]: # TO DO: budget taken as argument in prediction, not algorithm constructor self._budget = float('inf') if self._emit_all_tps_stop else self._budget - self._top_pairs : PriorityQueue = PriorityQueue(2 * int(self._budget)) if not self._emit_all_tps_stop else PriorityQueue() - _top_unsorted_pairs: PriorityQueue = PriorityQueue(2 * int(self._budget)) if not self._emit_all_tps_stop else PriorityQueue() + self._top_pairs : List[Tuple[float, int, int]] = [] + default_weight = 0.0 + self._pair_weight : dict = defaultdict(lambda: default_weight) for entity in range(self.data.dataset_limit): entity_positions = self._position_index.get_positions(entity) self._neighbors.clear() - for current_window in range(1,self._max_window): + for current_window in range(1,self._max_window_size + 1): for entity_position in entity_positions: right_neighbor = entity_position + current_window left_neighbor = entity_position - current_window - + if(right_neighbor < self._total_sorted_entities): if(self.valid_entity_neighbor_index(entity, right_neighbor)): self._update_local_weight(current_window, entity, self._sorted_entity_ids[right_neighbor]) if(left_neighbor >= 0): if(self.valid_entity_neighbor_index(entity, left_neighbor)): self._update_local_weight(current_window, entity, self._sorted_entity_ids[left_neighbor]) - - current_minimum_weight = -1 + for neighbor in self._neighbors: self._flags[neighbor] = -1 - pair_weight = self._get_weight(entity, neighbor) + self._pair_weight[(entity, neighbor)] = max(self._pair_weight[(entity, neighbor)], self._get_weight(entity, neighbor)) - if(pair_weight >= current_minimum_weight): - _top_unsorted_pairs.put( - (pair_weight, entity, neighbor) - ) - if self._budget < _top_unsorted_pairs.qsize(): - current_minimum_weight = _top_unsorted_pairs.get()[0] - - while(not _top_unsorted_pairs.empty()): - _score, _entity, _neighbor = _top_unsorted_pairs.get() - self._top_pairs.put((-_score, _entity, _neighbor)) + for pair in self._pair_weight: + id1, id2 = pair + self._top_pairs.append((self._pair_weight[(id1, id2)], id1, id2)) return self._top_pairs @@ -915,17 +927,17 @@ def _has_next(self) -> bool: Returns: bool: Another pair can be emitted """ - return self._emitted_comparisons < 
self._budget and self._current_window < self._total_sorted_entities + return self._current_window <= self._max_window_size - def _apply_main_processing(self) -> List[Tuple[int, int]]: - self._emitted_comparisons = 0 + def _apply_main_processing(self) -> List[Tuple[float, int, int]]: self._current_window = 1 - self._top_pairs: List[Tuple[int, int]] = [] + self._top_pairs: List[Tuple[float, int, int]] = [] + default_weight = 0.0 + self._pair_weight : dict = defaultdict(lambda: default_weight) # TO DO: budget taken as argument in prediction, not algorithm constructor self._budget = float('inf') if self._emit_all_tps_stop else self._budget while(self._has_next()): - _window_top_pairs = PriorityQueue() for entity in range(self.data.dataset_limit): entity_positions = self._position_index.get_positions(entity) self._neighbors.clear() @@ -940,21 +952,16 @@ def _apply_main_processing(self) -> List[Tuple[int, int]]: if(left_neighbor >= 0): if(self.valid_entity_neighbor_index(entity, left_neighbor)): self._update_counters(entity, self._sorted_entity_ids[left_neighbor]) - + for neighbor in self._neighbors: self._flags[neighbor] = -1 - pair_weight = self._get_weight(entity, neighbor) - - _window_top_pairs.put( - (-pair_weight, entity, neighbor) - ) - - while(len(self._top_pairs) < self._budget and not _window_top_pairs.empty()): - _, _entity, _neighbor = _window_top_pairs.get() - self._top_pairs.append((_entity, _neighbor)) - self._emitted_comparisons += 1 + self._pair_weight[(entity, neighbor)] = max(self._pair_weight[(entity, neighbor)], self._get_weight(entity, neighbor)) self._current_window += 1 + + for pair in self._pair_weight: + id1, id2 = pair + self._top_pairs.append((self._pair_weight[(id1, id2)], id1, id2)) return self._top_pairs @@ -1008,113 +1015,40 @@ def _process_entity(self, entity_id: int) -> None: if self._flags[neighbor_id] != entity_id: self._counters[neighbor_id] = 0 self._flags[neighbor_id] = entity_id - if self.weighting_scheme == 'ARCS': + if self.weighting_scheme == 'CN-CBS' or self.weighting_scheme == 'CNC' or self.weighting_scheme == 'CND' or self.weighting_scheme == 'CNJ': self._counters[neighbor_id] += 1 / self._blocks[block_id].get_cardinality(self.data.is_dirty_er) + if self.weighting_scheme == 'SN-CBS' or self.weighting_scheme == 'SNC' or self.weighting_scheme == 'SND' or self.weighting_scheme == 'SNJ': + self._counters[neighbor_id] += 1 / self._blocks[block_id].get_size() else: self._counters[neighbor_id] += 1 self._valid_entities.add(neighbor_id) - + for valid_entity_id in self._valid_entities: - _current_neighbor_weight = self._get_weight(entity_id, valid_entity_id) - self._sorted_neighbors[entity_id].put((-_current_neighbor_weight, valid_entity_id)) - if(self.store_weights): - self._stored_weights[canonical_swap(entity_id, valid_entity_id)] = _current_neighbor_weight + _current_neighbor_weight = self._get_weight(entity_id, valid_entity_id) + if(self.store_weights): + self._stored_weights[canonical_swap(entity_id, valid_entity_id)] = _current_neighbor_weight - if(self.method == 'HB' and not self._sorted_neighbors[entity_id].empty()): - _top_entity_weight, _top_entity_neighbor = self._sorted_neighbors[entity_id].get() - self._to_emit_pairs.append((-_top_entity_weight, entity_id, _top_entity_neighbor)) - + self._to_emit_pairs.append((_current_neighbor_weight, entity_id, valid_entity_id)) self.blocks[entity_id] = self._valid_entities.copy() def _prune_edges(self) -> dict: return None def process_raw_blocks(self, blocks: dict): + self._average_weight = 
np.zeros(self._limit, dtype=float) self._entity_index = create_entity_index(blocks, self.data.is_dirty_er) self._apply_main_processing() def process_prunned_blocks(self, blocks : dict, cc : AbstractMetablocking): - self._average_weight = np.zeros(self._limit, dtype=float) self.blocks = blocks for entity in sorted(blocks.keys()): neighbors = blocks[entity] - _neighbors_weigth_sum : float = 0.0 for neighbor in neighbors: _current_neighbor_weigth = cc.get_precalculated_weight(entity, neighbor) - _neighbors_weigth_sum += _current_neighbor_weigth - self._sorted_neighbors[entity].put((-_current_neighbor_weigth, neighbor)) - - self._average_weight[entity] = _neighbors_weigth_sum / len(neighbors) if len(neighbors) else 0.0 - if(self.method == 'HB' and not self._sorted_neighbors[entity].empty()): - _top_entity_weight, _top_entity_neighbor = self._sorted_neighbors[entity].get() - self._to_emit_pairs.append((-_top_entity_weight, entity, _top_entity_neighbor)) + self._to_emit_pairs.append((_current_neighbor_weigth, entity, neighbor)) - def successful_emission(self, pair : tuple) -> bool: - """Attempts to emit given pair, returns True / False on Success / Fail - In the case of full emission, it always emits given pair - - Args: - pair (tuple): Tuple in the form (score, entity1, entity2) - Returns: - bool: Successful / Failed Emission - """ - _weigth, _entity, _neighbor = pair - - _budget = float('inf') if self._emit_all_tps_stop else self._budget - - if(self._emitted_comparisons < _budget): - self.pairs.append((_entity, _neighbor)) - self._emitted_comparisons += 1 - self._progress_bar.update(1) - return True - else: - self.execution_time = time() - self.start_time - self._progress_bar.close() - return False - - - def produce_pairs(self) -> List[Tuple[int, int]]: - """Emits the top pair for each entity in decreasing average weigth order. 
- Traverses the entities in decreasing average weigth order and emits its - pairs in decreasing weight order - - Returns: - List[Tuple[float, int, int]]: List of emitted pairs - """ - self._emitted_comparisons = 0 - checked_entity = np.zeros(self._limit, dtype=bool) - self.pairs = [] - - for pair in self._to_emit_pairs: - if(not self.successful_emission(pair)): - return self.pairs - - if(self.method == 'HB' or self.method == 'DFS'): - for entity in self._avg_weight_sorted_entities: - checked_entity[entity] = True - while(not self._sorted_neighbors[entity].empty()): - weight, neighbor = self._sorted_neighbors[entity].get() - pair = -weight, entity, neighbor - if(not checked_entity[neighbor]): - if(not self.successful_emission(pair)): - return self.pairs - else: - _available_emissions = True - while(_available_emissions): - _available_emissions = False - for entity in self._avg_weight_sorted_entities: - if(not self._sorted_neighbors[entity].empty()): - weight, neighbor = self._sorted_neighbors[entity].get() - pair = -weight, entity, neighbor - if canonical_swap(entity, neighbor) not in self._checked_pairs: - if(not self.successful_emission(pair)): return self.pairs - self._checked_pairs.add(canonical_swap(entity, neighbor)) - _available_emissions = True - - return self.pairs - - def process(self, blocks: dict, data: Data, tqdm_disable: bool = False, store_weigths : bool = True, cc: AbstractMetablocking = None, method : str = 'HB', emit_all_tps_stop : bool = False) -> None: + def process(self, blocks: dict, data: Data, tqdm_disable: bool = False, store_weigths : bool = True, cc: AbstractMetablocking = None, method : str = 'HB', emit_all_tps_stop : bool = False) -> List[Tuple[float, int, int]]: """Calculates the weights between entities, stores them in descending order of their average weight, stores the top comparison per entity @@ -1143,15 +1077,13 @@ def process(self, blocks: dict, data: Data, tqdm_disable: bool = False, store_we self._blocks: dict = blocks self._stored_weights : dict = defaultdict(float) self._to_emit_pairs = [] - self._sorted_neighbors = [PriorityQueue() for _ in range(self._limit)] - if(self.method == 'BFS'): self._checked_pairs = set() - + if(cc is None): self.process_raw_blocks(blocks) else: self.process_prunned_blocks(blocks, cc) - self._avg_weight_sorted_entities = sorted_enumerate(self._average_weight) + return self._to_emit_pairs def get_meta_blocking_approach(acronym: str, w_scheme: str, budget: int = 0) -> any: """Return method by acronym diff --git a/docs/pyjedai/datamodel.py b/docs/pyjedai/datamodel.py index 92d725e..debbb51 100644 --- a/docs/pyjedai/datamodel.py +++ b/docs/pyjedai/datamodel.py @@ -79,7 +79,6 @@ def __init__( id_column_name_2: str = None, dataset_name_2: str = None, ground_truth: DataFrame = None, - inorder_gt: bool = True ) -> None: # Original Datasets as pd.DataFrame if isinstance(dataset_1, pd.DataFrame): @@ -105,7 +104,6 @@ def __init__( self.entities: DataFrame # Datasets specs - self.inorder_gt = inorder_gt self.is_dirty_er = dataset_2 is None self.dataset_limit = self.num_of_entities_1 = len(dataset_1) self.num_of_entities_2: int = len(dataset_2) if dataset_2 is not None else 0 @@ -160,6 +158,8 @@ def __init__( self._gt_to_ids_reversed_1: dict self._ids_mapping_2: dict self._gt_to_ids_reversed_2: dict + else: + self.ground_truth = None self.entities = self.dataset_1 = self.dataset_1.astype(str) @@ -172,23 +172,48 @@ def __init__( self.entities = pd.concat([self.dataset_1, self.dataset_2], ignore_index=True) + self._create_gt_mapping() if 
ground_truth is not None: - self._create_gt_mapping() self._store_pairs() else: self.ground_truth = None + # def _store_pairs(self) -> None: + # """Creates a mapping: + # - pairs_of : ids of first dataset to ids of true matches from second dataset""" + + # self.pairs_of = defaultdict(set) + # d1_col_index, d2_col_index = (0, 1) if self.inorder_gt else (1,0) + + # for _, row in self.ground_truth.iterrows(): + # id1, id2 = (row[d1_col_index], row[d2_col_index]) + # if id1 in self.pairs_of: self.pairs_of[id1].append(id2) + # else: self.pairs_of[id1] = [id2] + + def _store_pairs(self) -> None: """Creates a mapping: - pairs_of : ids of first dataset to ids of true matches from second dataset""" - self.pairs_of = defaultdict(set) - d1_col_index, d2_col_index = (0, 1) if self.inorder_gt else (1,0) + self.duplicate_of = defaultdict(set) for _, row in self.ground_truth.iterrows(): - id1, id2 = (row[d1_col_index], row[d2_col_index]) - if id1 in self.pairs_of: self.pairs_of[id1].append(id2) - else: self.pairs_of[id1] = [id2] + id1, id2 = (row[0], row[1]) + if id1 in self.duplicate_of: self.duplicate_of[id1].add(id2) + else: self.duplicate_of[id1] = {id2} + + def _are_true_positives(self, id1 : int, id2 : int): + """Checks if the given pair of identifiers represents a duplicate. + Identifiers must be given in order: the first belongs to the first dataset and the second to the second dataset. + + Args: + id1 (int): Identifier from the first dataframe. + id2 (int): Identifier from the second dataframe. + + Returns: + bool: True if the pair is a true duplicate according to the ground truth + """ + return id1 in self.duplicate_of and id2 in self.duplicate_of[id1] def _create_gt_mapping(self) -> None: """Creates two mappings: @@ -198,8 +223,8 @@ def _create_gt_mapping(self) -> None: """ if self.ground_truth is not None: self.ground_truth = self.ground_truth.astype(str) - else: - return + # else: + # return self._ids_mapping_1 = dict( zip( @@ -312,7 +337,6 @@ def stats_about_data(self) -> None: return stats_df - class Block: """The main module used for storing entities in the blocking steps of pyjedai module. \ Consists of 2 sets of profile entities 1 for Dirty ER and 2 for Clean-Clean ER. 
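The datamodel.py hunk above replaces the order-sensitive pairs_of mapping with a duplicate_of index keyed by first-dataset IDs, which is what the new _are_true_positives check consults. A minimal standalone sketch of that idea follows, assuming a two-column ground-truth DataFrame (column 0 holds D1 IDs, column 1 holds D2 IDs); the names build_duplicate_of and are_true_positives are illustrative helpers and not part of pyJedAI.

from collections import defaultdict

import pandas as pd


def build_duplicate_of(ground_truth: pd.DataFrame) -> dict:
    # Map every first-dataset ID to the set of its true matches in the second dataset,
    # mirroring the duplicate_of structure built by the new _store_pairs above.
    duplicate_of = defaultdict(set)
    for _, row in ground_truth.iterrows():
        duplicate_of[row[0]].add(row[1])
    return duplicate_of


def are_true_positives(duplicate_of: dict, id1, id2) -> bool:
    # A candidate pair is a true positive iff id2 is registered as a duplicate of id1.
    return id1 in duplicate_of and id2 in duplicate_of[id1]


# Hypothetical usage with made-up identifiers:
gt = pd.DataFrame([("a1", "b7"), ("a2", "b3")])
dup = build_duplicate_of(gt)
assert are_true_positives(dup, "a1", "b7")
assert not are_true_positives(dup, "a2", "b7")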
@@ -361,3 +385,4 @@ def verbose(self, key: any, is_dirty_er: bool) -> None: print("Clean dataset 2: " + "[\033[1;34m" + str(len(self.entities_D2)) + \ " entities\033[0m]") print(self.entities_D2) + diff --git a/docs/pyjedai/evaluation.py b/docs/pyjedai/evaluation.py index 02a8a59..fd0f922 100644 --- a/docs/pyjedai/evaluation.py +++ b/docs/pyjedai/evaluation.py @@ -18,6 +18,7 @@ from .utils import canonical_swap from math import inf from .utils import PredictionData +from .utils import generate_unique_identifier import random import matplotlib.pyplot as plt @@ -196,12 +197,19 @@ def confusion_matrix(self): plt.ylabel("Real matching pairs", fontsize=10, fontweight='bold') plt.show() - def visualize_roc(method_names : List[str], methods_data : List[Tuple[str, float, List[float]]], proportional : bool =True) -> None: + def visualize_roc(self, methods_data : List[dict], proportional : bool =True, drop_tp_indices=True) -> None: fig, ax = plt.subplots(figsize=(10, 6)) # set the size of the plot colors = [] normalized_aucs = [] # for each method layout its plot - for method_name, normalized_auc, cumulative_recall in methods_data: + for method_data in methods_data: + cumulative_recall, normalized_auc = self._generate_auc_data(total_candidates=method_data['total_emissions'], tp_positions=method_data['tp_idx']) + if(drop_tp_indices): + del(method_data['tp_idx']) + method_name=method_data['name'] + method_data['auc'] = normalized_auc + method_data['recall'] = cumulative_recall[-1] if len(cumulative_recall) != 0 else 0.0 + x_values = range(len(cumulative_recall)) color = "#{:06x}".format(random.randint(0, 0xFFFFFF)) colors.append(color) @@ -262,14 +270,14 @@ def _till_full_tps_emission(self) -> bool: Returns: bool: Stop emission on all TPs found / Emit all pairs """ - return self._true_positive_checked is not None + return self._duplicate_emitted is not None def _all_tps_emitted(self) -> bool: """Checks if all TPs have been emitted (Defaults to False in the case of all pairs emission approach) Returns: bool: All TPs emitted / not emitted """ - if(self._till_full_tps_emission()): return self._tps_found >= len(self._true_positive_checked) + if(self._till_full_tps_emission()): return self._tps_found >= len(self._duplicate_emitted) else: False def _update_true_positive_entry(self, entity : int, candidate : int) -> None: @@ -280,75 +288,113 @@ def _update_true_positive_entry(self, entity : int, candidate : int) -> None: candidate (int): Candidate ID """ if(self._till_full_tps_emission()): - if(not self._true_positive_checked[canonical_swap(entity, candidate)]): - self._true_positive_checked[canonical_swap(entity, candidate)] = True + if(not self._duplicate_emitted[(entity, candidate)]): + self._duplicate_emitted[(entity, candidate)] = True self._tps_found += 1 return - def calculate_roc_auc_data(self, data: Data, pairs, batch_size : int = 1, true_positive_checked : dict = None) -> List[Tuple[int, int]]: - """Progressively calculates total recall, AUC for each batch of candidate pairs + def calculate_tps_indices(self, pairs : List[Tuple[float, int, int]], duplicate_of : dict = None, duplicate_emitted : dict = None, batch_size : int = 1) -> Tuple[List[int], int]: + """ Args: - data (Data): Data Module - pairs: List containing pairs in form (entity1 id, entity2 id, score) + pairs (List[float, int, int]): Candidate pairs to emit in the form [similarity, first dataframe entity ID, second dataframe entity ID] + duplicate_of (dict, optional): Dictionary of the form [entity ID] -> [IDs of duplicate entities]. 
Defaults to None. + duplicate_emitted (dict, optional): Dictionary of the form [true positive pair] -> [emission status: emitted/not]. Defaults to None. + batch_size (int, optional): Recall update emission rate. Defaults to 1. + Raises: - AttributeError: Ground Truth file hasn't been supplied, cannot calculate ROC AUC + AttributeError: No ground truth has been given Returns: - List[Tuple[int, int]]: List of ROC graph points information (recall up to e, normalized auc up to e) + Tuple[List[int], int]: Indices of true positive duplicates within the candidates list and the total emissions """ - if(true_positive_checked is not None): - for pair in true_positive_checked.keys(): - true_positive_checked[pair] = False + if(duplicate_emitted is not None): + for pair in duplicate_emitted.keys(): + duplicate_emitted[pair] = False - if(data.ground_truth is None): + if(duplicate_of is None): raise AttributeError("Can calculate ROC AUC without a ground-truth file. \ Data object mush have initialized with the ground-truth file") - - if(len(data.ground_truth) == 0): - raise AttributeError("Cannot calculate AUC score, number of true duplicates is equal to 0.") - _true_positives: int = 0 - _normalized_auc: int = 0 - _current_recall: int = 0 - _new_recall: int = 0 self._tps_found : int = 0 - self._true_positive_checked : dict = true_positive_checked - self.num_of_true_duplicates = len(data.ground_truth) - _recall_progress = [0] - + self._duplicate_emitted : dict = duplicate_emitted + self._tps_indices : List[int] = [] + batches = batch_pairs(pairs, batch_size) # ideal_auc = self.calculate_ideal_auc(len(pairs), self.num_of_true_duplicates) - self._total_emissions : int = 0 + self.total_emissions : int = 0 for batch in batches: - _current_batch_size : int = 0 - for entity, candidate in batch: - if(self._all_tps_emitted()): break - entity_id = data._gt_to_ids_reversed_1[entity] if entity < data.dataset_limit else data._gt_to_ids_reversed_2[entity] - candidate_id = data._gt_to_ids_reversed_1[candidate] if candidate < data.dataset_limit else data._gt_to_ids_reversed_2[candidate] - _d1_entity, _d2_entity = (entity_id, candidate_id) if entity < data.dataset_limit else (candidate_id, entity_id) - - if _d2_entity in self.data.pairs_of[_d1_entity]: - self._update_true_positive_entry(entity_id, candidate_id) - _true_positives += 1 - _current_batch_size += 1 - self._total_emissions += 1 - _new_recall = _true_positives / self.num_of_true_duplicates + for score, entity, candidate in batch: + if(self._all_tps_emitted()): break + if candidate in duplicate_of[entity]: + self._update_true_positive_entry(entity, candidate) + self._tps_indices.append(self.total_emissions) + + self.total_emissions += 1 # _normalized_auc += ((_new_recall + _current_recall) / 2) * (_current_batch_size / self.num_of_true_duplicates) - _current_recall = _new_recall - _recall_progress.append(_current_recall) if(self._all_tps_emitted()): break - # _normalized_auc = 0 if(ideal_auc == 0) else _normalized_auc / ideal_auc - _normalized_auc = sum(_recall_progress) / (len(pairs) + 1.0) - return _recall_progress, _normalized_auc + return self._tps_indices, self.total_emissions + + + def _generate_auc_data(self, total_candidates : int, tp_positions : List[int]) -> Tuple[List[float], float]: + """Generates the recall axis containing the recall value for each emission and calculates the normalized AUC + + Args: + total_candidates (int): Total number of pairs emitted + tp_positions (List[int]): Indices of true positives within the candidate pairs list + + 
Returns: + Tuple[List[float], float]: Recall axis and the normalized AUC + """ + + _recall_axis : List[float] = [] + _recall : float = 0.0 + _tp_index : int = 0 + _dataset_total_tps : int = len(self.data.ground_truth) + _total_found_tps : int = len(tp_positions) + + for recall_index in range(total_candidates): + if(_tp_index < _total_found_tps): + if(recall_index == tp_positions[_tp_index]): + _recall = (_tp_index + 1.0) / _dataset_total_tps + _tp_index += 1 + _recall_axis.append(_recall) + + _normalized_auc : float = sum(_recall_axis) / (total_candidates + 1.0) + + return _recall_axis, _normalized_auc + + + def visualize_results_roc(self, results : dict, drop_tp_indices=True) -> None: + """For each of the executed workflows, calculates the cumulative recall and normalized AUC based upon true positive indices. + Finally, displays the ROC for all of the workflows with proper annotation (each workflow gains a unique identifier). + Args: + results (dict): Nested dictionary of the form [dataset] -> [matcher] -> [executed workflows and their info] / [model] -> [executed -//-] + """ + + workflows_info : List[Tuple[dict]] = [] + + for dataset in results: + matchers = results[dataset] + for matcher in matchers: + matcher_info = matchers[matcher] + if(isinstance(matcher_info, list)): + for workflow_info in matcher_info: + workflows_info.append((workflow_info)) + else: + for model in matcher_info: + for workflow_info in matcher_info[model]: + workflows_info.append((workflow_info)) + + self.visualize_roc(workflows_info, drop_tp_indices=drop_tp_indices) - def evaluate_auc_roc(self, matchers_data : List[Tuple], batch_size : int = 1, proportional : bool = True) -> None: + + def evaluate_auc_roc(self, matchers : List, batch_size : int = 1, proportional : bool = True, drop_tp_indices=True) -> None: """For each matcher, takes its prediction data, calculates cumulative recall and auc, plots the corresponding ROC curve, populates prediction data with performance info Args: - matchers_data List[Tuple[str, ProgressiveMatching]]: Progressive Matchers and their names - data (Data) : Data Module + matchers List[ProgressiveMatching]: Progressive Matchers batch_size (int, optional): Emitted pairs step at which cumulative recall is recalculated. Defaults to 1. proportional (bool) : Proportional Visualization Raises: @@ -363,22 +409,18 @@ def evaluate_auc_roc(self, matchers_data : List[Tuple], batch_size : int = 1, pr raise AttributeError("Can not proceed to AUC ROC evaluation without a ground-truth file. 
" + "Data object has not been initialized with the ground-truth file") - self._matchers_auc_roc_data = [] - - for matcher_data in matchers_data: - - matcher_name, progressive_matcher = matcher_data - matcher_prediction_data : PredictionData = PredictionData(matcher_name, progressive_matcher.pairs, progressive_matcher.true_pair_checked) - - matcher_predictions = matcher_prediction_data.get_predictions() - matcher_tps_checked = matcher_prediction_data.get_tps_checked() + self.matchers_info = [] + + for matcher in matchers: + _tp_indices, _total_emissions = self.calculate_tps_indices(pairs=matcher.pairs, duplicate_of=matcher.duplicate_of, duplicate_emitted=matcher.duplicate_emitted) + matcher_info = {} + matcher_info['name'] = generate_unique_identifier() + matcher_info['total_emissions'] = _total_emissions + matcher_info['tp_idx'] = _tp_indices + matcher_info['time'] = matcher.execution_time - cumulative_recall, normalized_auc = self.calculate_roc_auc_data(self.data, matcher_predictions, batch_size, matcher_tps_checked) - - self._matchers_auc_roc_data.append((matcher_name, normalized_auc, cumulative_recall)) - matcher_prediction_data.set_total_emissions(self._total_emissions) - matcher_prediction_data.set_normalized_auc(normalized_auc) - matcher_prediction_data.set_cumulative_recall(cumulative_recall[-1]) - progressive_matcher.set_prediction_data(matcher_prediction_data) + matcher_prediction_data : PredictionData = PredictionData(matcher=matcher, matcher_info=matcher_info) + matcher.set_prediction_data(matcher_prediction_data) + self.matchers_info.append(matcher_info) - self.visualize_roc(methods_data = self._matchers_auc_roc_data, proportional = proportional) + self.visualize_roc(methods_data=self.matchers_info, drop_tp_indices=drop_tp_indices) diff --git a/docs/pyjedai/joins.py b/docs/pyjedai/joins.py index 196c7cc..23a9ade 100644 --- a/docs/pyjedai/joins.py +++ b/docs/pyjedai/joins.py @@ -14,6 +14,7 @@ from .datamodel import Data, PYJEDAIFeature from .evaluation import Evaluation +from .utils import FrequencyEvaluator class AbstractJoin(PYJEDAIFeature): """Abstract class of Joins module @@ -44,23 +45,94 @@ def __init__( self.attributes_2: list self._flags: np.array self.pairs: networkx.Graph + self.vectorizer = None + def vectorizer_based(self) -> bool: + """ + Checks whether current instance of Joins algorithm is using a frequency vectorizer + + Returns: + bool: Candidate scores are being calculated through frequency vectorizer + """ + return (self.vectorizer is not None) + + def dirty_indexing(self): + """Applies Dirty Indexing - Evaluates the similarity of all the entities of the target dataset + """ + eid = 0 + for entity in self.indexed_entities: + candidates = set() + for token in entity: + if token in self.entity_index: + current_candidates = self.entity_index[token] + for candidate_id in current_candidates: + if(not self.vectorizer_based()): + if self._flags[candidate_id] != eid: + self._counters[candidate_id] = 0 + self._flags[candidate_id] = eid + self._counters[candidate_id] += 1 + candidates.add(candidate_id) + self._process_candidates(candidates, eid, len(entity)) + self._progress_bar.update(1) + eid += 1 + + def get_id_from_index(self, index : int): + return (i if self.reverse_order else (index+self.data.dataset_limit)) + + + def clean_indexing(self): + """Applies Dirty Indexing - One of the datasets (depends on the order of indexing) is set as the indexer. + For each entry of that dataset, its similarity scores are being calculated with each entity of the target dataset. 
+ The top-K best results for each source entity are chosen. + """ + for i in range(0, self.indexed_entities_count): + candidates = set() + record = self.indexed_entities[i] + entity_id = self.get_id_from_index(i) + for token in record: + if token in self.entity_index: + current_candidates = self.entity_index[token] + for candidate_id in current_candidates: + if(not self.vectorizer_based()): + if self._flags[candidate_id] != entity_id: + self._counters[candidate_id] = 0 + self._flags[candidate_id] = entity_id + self._counters[candidate_id] += 1 + candidates.add(candidate_id) + if 0 < len(candidates): + self._process_candidates(candidates, entity_id, len(record)) + self._progress_bar.update(1) + + def setup_indexing(self): + """Defines the indexed and target entities, as well as their total count + + """ + self.indexed_entities, self.indexed_entities_count = (self._entities_d1, self.data.num_of_entities_1) if (self.reverse_order or self.data.is_dirty_er) \ + else (self._entities_d2, self.data.num_of_entities_2) + + self.target_entities, self.target_entities_count = (self._entities_d1, self.data.num_of_entities_1) if (not self.reverse_order or self.data.is_dirty_er) \ + else (self._entities_d2, self.data.num_of_entities_2) + def fit(self, data: Data, + vectorizer: FrequencyEvaluator = None, reverse_order: bool = False, attributes_1: list = None, attributes_2: list = None, - tqdm_disable: bool = False + tqdm_disable: bool = False, + store_neighborhoods : bool = False ) -> networkx.Graph: """Joins main method Args: data (Data): dataset module + vectorizer (FrequencyEvaluator, optional): Vectorizer will be used for similarity evaluation reverse_order (bool, optional): _description_. Defaults to False. attributes_1 (list, optional): _description_. Defaults to None. attributes_2 (list, optional): _description_. Defaults to None. tqdm_disable (bool, optional): _description_. Defaults to False. 
- + save_to_json (bool, optional): Store indexed dataset neighborhoods in a dictionary of form + [indexed dataset entity id] -> [ids of top-k neighbors in target dataset] Returns: networkx.Graph: graph containg nodes as entities and edges as similarity score """ @@ -68,8 +140,8 @@ def fit(self, raise ValueError("Can't have reverse order in Dirty Entity Resolution") start_time = time() - self.tqdm_disable, self.reverse_order, self.attributes_1, self.attributes_2, self.data = \ - tqdm_disable, reverse_order, attributes_1, attributes_2, data + self.tqdm_disable, self.reverse_order, self.attributes_1, self.attributes_2, self.data, self.vectorizer, self.store_neighborhoods = \ + tqdm_disable, reverse_order, attributes_1, attributes_2, data, vectorizer, store_neighborhoods self._entities_d1 = data.dataset_1[attributes_1 if attributes_1 else data.attributes_1] \ .apply(" ".join, axis=1) \ @@ -82,74 +154,36 @@ def fit(self, .apply(self._tokenize_entity) \ .values.tolist() - num_of_entities = self.data.num_of_entities_2 if reverse_order else self.data.num_of_entities_1 - + self.neighborhoods = defaultdict(list) if self.store_neighborhoods else None + self.setup_indexing() + self._progress_bar = tqdm( - total=self.data.num_of_entities if not self.data.is_dirty_er else num_of_entities*2, + total=self.indexed_entities_count, desc=self._method_name+" ("+self.metric+")", disable=self.tqdm_disable ) - - self._flags, \ - self._counters, \ - self._sims, \ - self._source_frequency, \ - self.pairs = np.empty([num_of_entities]), \ - np.zeros([num_of_entities]), \ - np.empty([self.data.num_of_entities_1*self.data.num_of_entities_2]), \ - np.empty([num_of_entities]), \ - networkx.Graph() - self._flags[:] = -1 - entity_index = self._create_entity_index( - self._entities_d2 if reverse_order else self._entities_d1 - ) + + self._flags = np.empty([self.target_entities_count]) if (not self.vectorizer_based()) else None + self._counters = np.zeros([self.target_entities_count]) if (not self.vectorizer_based()) else None + self._source_frequency = np.empty([self.target_entities_count]) if (not self.vectorizer_based()) else None + if(not self.vectorizer_based()) : self._flags[:] = -1 + self.pairs = networkx.Graph() + self.entity_index = self._create_entity_index() if self.data.is_dirty_er: - eid = 0 - for entity in self._entities_d1: - candidates = set() - for token in entity: - if token in entity_index: - current_candidates = entity_index[token] - for candidate_id in current_candidates: - if self._flags[candidate_id] != eid: - self._counters[candidate_id] = 0 - self._flags[candidate_id] = eid - self._counters[candidate_id] += 1 - candidates.add(candidate_id) - self._process_candidates(candidates, eid, len(entity)) - self._progress_bar.update(1) - eid += 1 + self.dirty_indexing() else: - if reverse_order: - entities = self._entities_d1 - num_of_entities = self.data.num_of_entities_1 - else: - entities = self._entities_d2 - num_of_entities = self.data.num_of_entities_2 - - for i in range(0, num_of_entities): - candidates = set() - record = entities[i] - entity_id = i if reverse_order else i+self.data.dataset_limit - for token in record: - if token in entity_index: - current_candidates = entity_index[token] - for candidate_id in current_candidates: - if self._flags[candidate_id] != entity_id: - self._counters[candidate_id] = 0 - self._flags[candidate_id] = entity_id - self._counters[candidate_id] += 1 - candidates.add(candidate_id) - if 0 < len(candidates): - self._process_candidates(candidates, entity_id, len(record)) - 
self._progress_bar.update(1) + self.clean_indexing() + + if(self.store_neighborhoods): self._process_neighborhoods() + self._progress_bar.close() self.execution_time = time() - start_time - return self.pairs def _tokenize_entity(self, entity: str) -> set: - if self.tokenization == 'qgrams': + if self.vectorizer is not None: + return entity.lower() + elif self.tokenization == 'qgrams': return set([' '.join(grams) for grams in nltk.ngrams(entity.lower(), n=self.qgrams)]) elif self.tokenization == 'standard': return set(filter(None, re.split('[\\W_]', entity.lower()))) @@ -194,47 +228,46 @@ def _calc_similarity( return 2 * common_tokens / (source_frequency+tokens_size) elif self.metric == 'jaccard': return common_tokens / (source_frequency+tokens_size-common_tokens) + + def _calc_vector_similarity(self, id1 : int, id2 : int) -> float: + """Vector based similarity score + + Args: + id1 (int): D1 entity ID + id2 (int): D2 entity ID - def _create_entity_index(self, entities: list) -> dict: + Returns: + float: vector based similarity + """ + return self.vectorizer.predict(id1=id1, id2=id2) + + def _create_entity_index(self) -> dict: entity_index = defaultdict(set) - entity_id = itertools.count() - for entity in entities: - eid = next(entity_id) + for eid, entity in enumerate(self.target_entities): for token in entity: entity_index[token].add(eid) - self._source_frequency[eid] = len(entity) + + if(not self.vectorizer_based()): + self._source_frequency[eid] = len(entity) self._progress_bar.update(1) - return entity_index - -# def _similarity(self, entity_id1: int, entity_id2: int, attributes: any=None) -> float: -# similarity: float = 0.0 -# if isinstance(attributes, dict): -# for attribute, weight in self.attributes.items(): -# similarity += weight*self._metric( -# self.data.entities.iloc[entity_id1][attribute], -# self.data.entities.iloc[entity_id2][attribute] -# ) -# if isinstance(attributes, list): -# for attribute in self.attributes: -# similarity += self._metric( -# self.data.entities.iloc[entity_id1][attribute], -# self.data.entities.iloc[entity_id2][attribute] -# ) -# similarity /= len(self.attributes) -# else: -# # print(self.data.entities.iloc[entity_id1].str.cat(sep=' '), -# # self.data.entities.iloc[entity_id2].str.cat(sep=' ')) -# # concatenated row string -# similarity = self._metric( -# self.data.entities.iloc[entity_id1].str.cat(sep=' '), -# self.data.entities.iloc[entity_id2].str.cat(sep=' ') -# ) -# return similarity + return entity_index def _insert_to_graph(self, entity_id1, entity_id2, similarity): if self.similarity_threshold <= similarity: self.pairs.add_edge(entity_id1, entity_id2, weight=similarity) + + def _store_neighborhood(self, entity_id1, entity_id2, similarity): + if self.similarity_threshold <= similarity: + self.neighborhoods[entity_id2].append((similarity, entity_id1)) + + def _process_neighborhoods(self): + """Sorts the candidates of each indexed entity's neighborhood in descending order + of similarity. 
+ """ + for d1_id, d2_ids in self.neighborhoods.items(): + self.neighborhoods[d1_id] = sorted(d2_ids, key=lambda x: (-x[0], x[1])) + def evaluate(self, prediction=None, export_to_df: bool = False, export_to_dict: bool = False, with_classification_report: bool = False, @@ -296,9 +329,6 @@ def export_to_df(self, prediction) -> pd.DataFrame: pairs_df = pd.concat([pairs_df, pd.DataFrame([{'id1':id1, 'id2':id2}], index=[0])], ignore_index=True) return pairs_df - - - class EJoin(AbstractJoin): """ @@ -319,17 +349,17 @@ def __init__( def _process_candidates(self, candidates: set, entity_id: int, tokens_size: int) -> None: for candidate_id in candidates: - self._insert_to_graph( - candidate_id+self.data.dataset_limit if self.reverse_order \ - and not self.data.is_dirty_er \ - else candidate_id, - entity_id, - self._calc_similarity( - self._counters[candidate_id], - self._source_frequency[candidate_id], - tokens_size + sim = self._calc_similarity( + self._counters[candidate_id], + self._source_frequency[candidate_id], + tokens_size ) - ) + d1_id = candidate_id+self.data.dataset_limit if (self.reverse_order \ + and not self.data.is_dirty_er) \ + else candidate_id + d2_id = entity_id + self._insert_to_graph(d1_id, d2_id, sim) + if(self.store_neighborhoods): self._store_neighborhood(d1_id, d2_id, sim) class TopKJoin(AbstractJoin): """Top-K Join algorithm @@ -351,27 +381,149 @@ def __init__(self, def _process_candidates(self, candidates: set, entity_id: int, tokens_size: int) -> None: minimum_weight=0 pq = PriorityQueue() - for candidate_id in candidates: - sim = self._calc_similarity( - self._counters[candidate_id], self._source_frequency[candidate_id], tokens_size - ) + pq.put(minimum_weight) + for index, candidate_id in enumerate(candidates): + if(self.vectorizer is None): + sim = self._calc_similarity(self._counters[candidate_id], self._source_frequency[candidate_id], tokens_size) + else: + sim = self._calc_vector_similarity(((candidate_id + self.data.dataset_limit) if self.reverse_order else candidate_id), entity_id) if minimum_weight < sim: pq.put(sim) if self.K < pq.qsize(): minimum_weight = pq.get() minimum_weight = pq.get() - for candidate_id in candidates: + for index, candidate_id in enumerate(candidates): self.similarity_threshold = minimum_weight + if(self.vectorizer is None): + sim = self._calc_similarity(self._counters[candidate_id], self._source_frequency[candidate_id], tokens_size) + else: + sim = self._calc_vector_similarity(((candidate_id + self.data.dataset_limit) if self.reverse_order else candidate_id), entity_id) self._insert_to_graph( candidate_id + self.data.dataset_limit if self.reverse_order else candidate_id, entity_id, - self._calc_similarity( - self._counters[candidate_id], - self._source_frequency[candidate_id], - tokens_size - ) + sim ) + if(self.store_neighborhoods): self._store_neighborhood(candidate_id + self.data.dataset_limit if self.reverse_order else candidate_id, \ + entity_id, \ + sim) + + def _configuration(self) -> dict: + return { + "similarity_threshold" : self.similarity_threshold, + "K" : self.K, + "metric" : self.metric, + "tokenization" : self.tokenization, + "qgrams": self.qgrams + } + + +class PETopKJoin(TopKJoin): + """Progressive Entity Resolution Top-K class of Joins module + """ + _method_name = "Progressive Top-K Join" + _method_info = "Progressive Top-K Join algorithm" + _method_short_name = "PETopKJ" + + def __init__( + self, + K: int, + metric: str, + tokenization: str, + qgrams: int = 2 + ) -> None: + """AbstractJoin Constructor + + Args: + K 
(int): Number of candidates per entity + metric (str): String similarity metric + tokenization (str): Tokenizer + qgrams (int, optional): For Jaccard metric. Defaults to 2. + """ + super().__init__(K=K, + metric=metric, + tokenization=tokenization, + qgrams=qgrams) + + + def _get_similarity(self, target_id : int, indexed_id : int, tokens_size : int): + return self._calc_similarity(self._counters[target_id], self._source_frequency[target_id], tokens_size) \ + if (self.vectorizer is None) else \ + self._calc_vector_similarity(target_id , indexed_id) + + def _process_candidates(self, candidates: set, entity_id: int, tokens_size: int) -> None: + minimum_weight=0 + pq = PriorityQueue() + for index, candidate_id in enumerate(candidates): + + _target_id = candidate_id + _indexed_id = entity_id + self.data.dataset_limit + + sim : float = self._get_similarity(target_id=_target_id, + indexed_id=_indexed_id, + tokens_size=tokens_size) + + # target dataset entity id set to negative + # so higher identifier kicked out first (simulating descending order with ascending PQ) + _pair = (sim, -_target_id, _indexed_id) + + if minimum_weight <= sim: + pq.put(_pair) + if self.K < pq.qsize(): + minimum_weight, _, _ = pq.get() + + if(self.store_neighborhoods): + _first_element = True + while(not pq.empty()): + _sim, _target_id, _indexed_id = pq.get() + if _first_element: + self.similarity_threshold = _sim + _first_element = False + + self._store_neighborhood(entity_id1= -_target_id, + entity_id2= _indexed_id, + similarity= _sim) + self._insert_to_graph(entity_id1=-_target_id, + entity_id2=_indexed_id, + similarity=_sim) + else: + self.similarity_threshold, _, _ = pq.get() + for index, candidate_id in enumerate(candidates): + _target_id = candidate_id + _indexed_id = entity_id + self.data.dataset_limit + self._insert_to_graph(entity_id1=_target_id, + entity_id2=_indexed_id, + similarity=self._get_similarity(target_id=_target_id, + indexed_id=_indexed_id, + tokens_size=tokens_size)) + + def _process_neighborhoods(self, strict_top_k : bool = True): + """Sorts the candidates of each indexed entity's neighborhood in descending order + of similarity. If strict top-K instance is chosen, it retains max K best candidates + per entity. 
+ Args: + strict_top_k (bool, optional): Retain strictly (max) top-K candidates per entity + """ + for d1_id, d2_ids in self.neighborhoods.items(): + _sorted_neighborhood = sorted(d2_ids, key=lambda x: (-x[0], x[1])) + self.neighborhoods[d1_id] = _sorted_neighborhood[:self.K] if strict_top_k else \ + _sorted_neighborhood + + def setup_indexing(self): + """Defines the indexed and target entities, as well as their total count + + """ + # self.indexed_entities, self.indexed_entities_count = (self._entities_d2, self.data.num_of_entities_2) if (self.reverse_order) \ + # else (self._entities_d1, self.data.num_of_entities_1) + + # self.target_entities, self.target_entities_count = (self._entities_d1, self.data.num_of_entities_1) if (self.reverse_order or self.data.is_dirty_er) \ + # else (self._entities_d2, self.data.num_of_entities_2) + self.indexed_entities, self.indexed_entities_count = (self._entities_d2, self.data.num_of_entities_2) + + self.target_entities, self.target_entities_count = (self._entities_d1, self.data.num_of_entities_1) + + def get_id_from_index(self, index : int): + return index def _configuration(self) -> dict: return { @@ -381,3 +533,12 @@ def _configuration(self) -> dict: "tokenization" : self.tokenization, "qgrams": self.qgrams } + + + + + + + + + diff --git a/docs/pyjedai/matching.py b/docs/pyjedai/matching.py index 00e6951..6880ab4 100644 --- a/docs/pyjedai/matching.py +++ b/docs/pyjedai/matching.py @@ -19,13 +19,11 @@ from py_stringmatching.tokenizer.qgram_tokenizer import QgramTokenizer from py_stringmatching.tokenizer.whitespace_tokenizer import \ WhitespaceTokenizer -from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer -from sklearn.metrics.pairwise import pairwise_distances from tqdm.autonotebook import tqdm from .datamodel import Data, PYJEDAIFeature from .evaluation import Evaluation -from .utils import WordQgramTokenizer, cosine, get_qgram_from_tokenizer_name +from .utils import WordQgramTokenizer, cosine, get_qgram_from_tokenizer_name, FrequencyEvaluator metrics_mapping = { @@ -51,36 +49,44 @@ ] vector_metrics = [ - 'cosine', 'dice', 'jaccard' + 'cosine', 'dice', 'jaccard', 'sqeuclidean' ] whoosh_index_metrics = [ 'TF-IDF', 'Frequency', 'PL2', 'BM25F' ] +faiss_metrics = [ + 'cosine', 'euclidean' +] + magellan_metrics = string_metrics + set_metrics -available_metrics = magellan_metrics + vector_metrics + whoosh_index_metrics +available_metrics = magellan_metrics + vector_metrics + whoosh_index_metrics + faiss_metrics # # Tokenizers # -char_qgram_tokenizers = { 'char_'+ str(i) + 'gram':i for i in range(1, 7) } -word_qgram_tokenizers = { 'word_'+ str(i) + 'gram':i for i in range(1, 7) } +# char_qgram_tokenizers = { 'char_'+ str(i) + 'gram':i for i in range(1, 7) } +# word_qgram_tokenizers = { 'word_'+ str(i) + 'gram':i for i in range(1, 7) } +char_qgram_tokenizers = ['char_tokenizer'] +word_qgram_tokenizers = ['word_tokenizer'] magellan_tokenizers = ['white_space_tokenizer'] +joins_tokenizers = ["qgrams", "standard", "standard_multiset", "qgrams_multiset"] -tfidf_tokenizers = [ 'tfidf_' + cq for cq in char_qgram_tokenizers.keys() ] + \ - [ 'tfidf_' + wq for wq in word_qgram_tokenizers.keys() ] +# tfidf_tokenizers = [ 'tfidf_' + cq for cq in char_qgram_tokenizers.keys() ] + \ +# [ 'tfidf_' + wq for wq in word_qgram_tokenizers.keys() ] -tf_tokenizers = [ 'tf_' + cq for cq in char_qgram_tokenizers.keys() ] + \ - [ 'tf_' + wq for wq in word_qgram_tokenizers.keys() ] +# tf_tokenizers = [ 'tf_' + cq for cq in char_qgram_tokenizers.keys() ] + \ +# 
[ 'tf_' + wq for wq in word_qgram_tokenizers.keys() ] -boolean_tokenizers = [ 'boolean_' + cq for cq in char_qgram_tokenizers.keys() ] + \ - [ 'boolean_' + wq for wq in word_qgram_tokenizers.keys() ] - -vector_tokenizers = tfidf_tokenizers + tf_tokenizers + boolean_tokenizers +# boolean_tokenizers = [ 'boolean_' + cq for cq in char_qgram_tokenizers.keys() ] + \ +# [ 'boolean_' + wq for wq in word_qgram_tokenizers.keys() ] -available_tokenizers = [key for key in char_qgram_tokenizers] + [key for key in word_qgram_tokenizers] + magellan_tokenizers + vector_tokenizers +# vector_tokenizers = tfidf_tokenizers + tf_tokenizers + boolean_tokenizers +# available_tokenizers = [key for key in char_qgram_tokenizers] + [key for key in word_qgram_tokenizers] + magellan_tokenizers + vector_tokenizers +available_tokenizers = char_qgram_tokenizers + word_qgram_tokenizers + magellan_tokenizers + joins_tokenizers +available_vectorizers = ['tfidf', 'tf', 'boolean'] class AbstractEntityMatching(PYJEDAIFeature): """Calculates similarity from 0.0 to 1.0 @@ -337,6 +343,8 @@ def __init__( self, metric: str = 'dice', tokenizer: str = 'white_space_tokenizer', + vectorizer : str = None, + qgram : int = 1, similarity_threshold: float = 0.5, tokenizer_return_unique_values = False, # unique values or not, attributes: any = None, @@ -348,7 +356,7 @@ def __init__( self.similarity_threshold = similarity_threshold self.tokenizer = tokenizer self.execution_time = 0 - self._input_type = None + self.vectorizer = vectorizer self.qgram: int = -1 # # Selecting tokenizer @@ -362,36 +370,30 @@ def __init__( else: self._metric = metric - if metric in set_metrics: - self.tokenizer_return_set = True - else: - self.tokenizer_return_set = tokenizer_return_unique_values - - if 'gram' in tokenizer: - self.qgram = get_qgram_from_tokenizer_name(tokenizer) + self.tokenizer_return_set = (metric in set_metrics) or tokenizer_return_unique_values + self.qgram : int = qgram - if tokenizer == 'white_space_tokenizer': - self._input_type = 'white_space' - self._tokenizer = WhitespaceTokenizer(return_set=self.tokenizer_return_set) - elif tokenizer in char_qgram_tokenizers.keys(): - self._input_type = 'char_qgram' - self._tokenizer = QgramTokenizer(qval=self.qgram, - return_set=self.tokenizer_return_set) - elif tokenizer in word_qgram_tokenizers.keys(): - self._input_type = 'word_qgram' - self._tokenizer = WordQgramTokenizer(q=self.qgram) - elif 'tfidf' in tokenizer: - self._input_type = 'tfidf' - elif 'tf' in tokenizer: - self._input_type = 'tf' - elif 'boolean' in tokenizer: - self._input_type = 'boolean' - else: - raise AttributeError( - 'Tokenizer ({}) does not exist. Please select one of the available. ({})'.format( - tokenizer, available_tokenizers + if(vectorizer is not None): + if self.vectorizer not in available_vectorizers: + raise AttributeError( + 'Weighting Scheme ({}) does not exist. Please select one of the available. ({})'.format( + vectorizer, available_vectorizers + ) + ) + elif(tokenizer is not None): + if tokenizer == 'white_space_tokenizer': + self._tokenizer = WhitespaceTokenizer(return_set=self.tokenizer_return_set) + elif tokenizer == 'char_tokenizer': + self._tokenizer = QgramTokenizer(qval=self.qgram, + return_set=self.tokenizer_return_set) + elif tokenizer == 'word_tokenizer': + self._tokenizer = WordQgramTokenizer(q=self.qgram) + elif tokenizer not in available_tokenizers: + raise AttributeError( + 'Tokenizer ({}) does not exist. Please select one of the available. 
({})'.format( + tokenizer, available_tokenizers + ) ) - ) def predict(self, blocks: dict, @@ -420,8 +422,8 @@ def predict(self, desc=self._method_name+" ("+self.metric+ ", " + str(self.tokenizer) + ")", disable=self.tqdm_disable) - if self._input_type in ['tfidf', 'tf', 'boolean']: - self._calculate_tf_tfidf() + if self.vectorizer is not None: + self.initialize_vectorizer() if 'Block' in str(type(all_blocks[0])): self._predict_raw_blocks(blocks) @@ -459,52 +461,40 @@ def _predict_raw_blocks(self, blocks: dict) -> None: self._insert_to_graph(entity_id1, entity_id2, similarity) self._progress_bar.update(1) - def _calculate_tf_tfidf(self) -> None: - - analyzer = 'char' if 'char' in self.tokenizer else 'word' - + def initialize_vectorizer(self) -> None: + self.frequency_evaluator : FrequencyEvaluator = FrequencyEvaluator(vectorizer=self.vectorizer, + tokenizer=self.tokenizer, + qgram=self.qgram) d1 = self.data.dataset_1[self.attributes] if self.attributes else self.data.dataset_1 self._entities_d1 = d1 \ .apply(" ".join, axis=1) \ .apply(lambda x: x.lower()) \ .values.tolist() - d2 = self.data.dataset_2[self.attributes] if self.attributes and not self.data.is_dirty_er else self.data.dataset_2 + d2 = None + if(not self.data.is_dirty_er): + d2 = self.data.dataset_2 + if self.attributes: + d2 = d2[self.attributes] + self._entities_d2 = d2 \ .apply(" ".join, axis=1) \ .apply(lambda x: x.lower()) \ - .values.tolist() if not self.data.is_dirty_er else None - - if self._input_type == 'tfidf' or self._input_type == 'boolean': - vectorizer = TfidfVectorizer(analyzer='') if self.qgram is None else \ - TfidfVectorizer(analyzer=analyzer, ngram_range=(self.qgram, self.qgram)) - elif self._input_type == 'tf': - vectorizer = CountVectorizer(analyzer=analyzer) if self.qgram is None else \ - CountVectorizer(analyzer=analyzer, ngram_range=(self.qgram, self.qgram)) + .values.tolist() if not self.data.is_dirty_er else self._entities_d1 - self._calculate_tf_and_tfidf_similarities(vectorizer) - - def _calculate_tf_and_tfidf_similarities(self, vectorizer) -> None: - if self.data.is_dirty_er: - raise NotImplementedError("TFIDF for dirty ER is not implemented yet") - else: - self.corpus = self._entities_d1 + self._entities_d2 - self.corpus_as_matrix = vectorizer.fit_transform(self.corpus) - if self._input_type == 'boolean': - # transform to boolean if value is positive to 1 and negative to 0 - self.similarity_matrix = self.corpus_as_matrix.astype(bool).astype(int) - - self.similarity_matrix = 1 - pairwise_distances(self.corpus_as_matrix.toarray(), - metric=self.metric) - - def _calculate_vector_similarity(self, entity_id1: int, entity_id2: int) -> float: - return self.similarity_matrix[entity_id1][entity_id2] + + _dataset_identifier : str = ('_'.join([self.data.dataset_name_1, self.data.dataset_name_2])) if(self.data.dataset_name_1 is not None and self.data.dataset_name_2 is not None) else ("dataset") + self.frequency_evaluator.fit(metric=self.metric, + dataset_identifier=_dataset_identifier, + indexing='inorder', + d1_entities=self._entities_d1, + d2_entities=self._entities_d2) def _similarity(self, entity_id1: int, entity_id2: int) -> float: similarity: float = 0.0 - if self._input_type in ['tfidf', 'tf', 'boolean']: - return self._calculate_vector_similarity(entity_id1, entity_id2) + if self.vectorizer is not None: + return self.frequency_evaluator.predict(id1=entity_id1, id2=entity_id2) elif isinstance(self.attributes, dict): for attribute, weight in self.attributes.items(): e1 = 
self.data.entities.iloc[entity_id1][attribute].lower() @@ -537,7 +527,9 @@ def _configuration(self) -> dict: "Metric" : self.metric, "Attributes" : self.attributes, "Similarity threshold" : self.similarity_threshold, - "Tokenizer" : self.tokenizer + "Tokenizer" : self.tokenizer, + "Vectorizer" : self.vectorizer if self.vectorizer is not None else "None", + "Qgrams" : self.qgram } class VectorBasedMatching(AbstractEntityMatching): diff --git a/docs/pyjedai/prioritization.py b/docs/pyjedai/prioritization.py index 4ce6f62..db5bc63 100644 --- a/docs/pyjedai/prioritization.py +++ b/docs/pyjedai/prioritization.py @@ -11,132 +11,115 @@ GlobalProgressiveSortedNeighborhood, LocalProgressiveSortedNeighborhood, ProgressiveEntityScheduling) +from .joins import PETopKJoin from .vector_based_blocking import EmbeddingsNNBlockBuilding -from sklearn.metrics.pairwise import ( - cosine_similarity -) + from networkx import Graph -from py_stringmatching.similarity_measure.affine import Affine -from py_stringmatching.similarity_measure.bag_distance import BagDistance from py_stringmatching.similarity_measure.cosine import Cosine from py_stringmatching.similarity_measure.dice import Dice -from py_stringmatching.similarity_measure.editex import Editex from py_stringmatching.similarity_measure.generalized_jaccard import \ GeneralizedJaccard -from py_stringmatching.similarity_measure.hamming_distance import \ - HammingDistance from py_stringmatching.similarity_measure.jaccard import Jaccard from py_stringmatching.similarity_measure.jaro import Jaro -from py_stringmatching.similarity_measure.jaro_winkler import JaroWinkler from py_stringmatching.similarity_measure.levenshtein import Levenshtein -from py_stringmatching.similarity_measure.monge_elkan import MongeElkan -from py_stringmatching.similarity_measure.needleman_wunsch import \ - NeedlemanWunsch from py_stringmatching.similarity_measure.overlap_coefficient import \ OverlapCoefficient -from py_stringmatching.similarity_measure.partial_ratio import PartialRatio -from py_stringmatching.similarity_measure.token_sort import TokenSort -from py_stringmatching.similarity_measure.partial_token_sort import \ - PartialTokenSort -from py_stringmatching.similarity_measure.ratio import Ratio -from py_stringmatching.similarity_measure.smith_waterman import SmithWaterman -from py_stringmatching.similarity_measure.soundex import Soundex -from py_stringmatching.similarity_measure.tfidf import TfIdf -from py_stringmatching.similarity_measure.tversky_index import TverskyIndex -from py_stringmatching.tokenizer.alphabetic_tokenizer import \ - AlphabeticTokenizer -from py_stringmatching.tokenizer.alphanumeric_tokenizer import \ - AlphanumericTokenizer -from py_stringmatching.tokenizer.delimiter_tokenizer import DelimiterTokenizer from py_stringmatching.tokenizer.qgram_tokenizer import QgramTokenizer from py_stringmatching.tokenizer.whitespace_tokenizer import \ WhitespaceTokenizer +from sklearn.metrics.pairwise import pairwise_distances from tqdm.autonotebook import tqdm -from .evaluation import Evaluation from .datamodel import Data, PYJEDAIFeature +from .evaluation import Evaluation from .matching import EntityMatching from .comparison_cleaning import AbstractMetablocking from queue import PriorityQueue from random import sample -from .utils import sorted_enumerate, canonical_swap from abc import abstractmethod from typing import Tuple, List -from .utils import SubsetIndexer, WhooshDataset, WhooshNeighborhood, is_infinite, PredictionData +from .utils import ( + SubsetIndexer, + 
DatasetScheduler, + EntityScheduler, + is_infinite, + PredictionData, + reverse_data_indexing, + reverse_blocks_entity_indexing, + sorted_enumerate, + canonical_swap, + WordQgramTokenizer, + cosine, + get_qgram_from_tokenizer_name, + FrequencyEvaluator) import pandas as pd import os -from whoosh.fields import TEXT, Schema, ID -from whoosh.index import create_in -from whoosh import qparser -from whoosh.scoring import TF_IDF, Frequency, PL2, BM25F +from collections import defaultdict +import sys +from faiss import METRIC_INNER_PRODUCT, METRIC_L2 +import json +import re # Directory where the whoosh index is stored INDEXER_DIR='.indexer' -# Package import from https://anhaidgroup.github.io/py_stringmatching/v0.4.2/index.html - -available_tokenizers = [ - 'white_space_tokenizer', 'qgram_tokenizer', 'delimiter_tokenizer', - 'alphabetic_tokenizer', 'alphanumeric_tokenizer' -] - metrics_mapping = { - 'levenshtein' : Levenshtein(), 'edit_distance': Levenshtein(), - 'jaro_winkler' : JaroWinkler(), - 'bag_distance' : BagDistance(), - 'editex' : Editex(), 'cosine' : Cosine(), 'jaro' : Jaro(), - 'soundex' : Soundex(), - 'tfidf' : TfIdf(), - 'tversky_index':TverskyIndex(), - 'ratio' : Ratio(), - 'partial_token_sort' : PartialTokenSort(), - 'partial_ratio' : PartialRatio(), - 'hamming_distance' : HammingDistance(), 'jaccard' : Jaccard(), 'generalized_jaccard' : GeneralizedJaccard(), 'dice': Dice(), 'overlap_coefficient' : OverlapCoefficient(), - 'token_sort': TokenSort(), - 'cosine_vector_similarity': cosine_similarity, - 'TF-IDF' : TF_IDF(), - 'Frequency' : Frequency(), - 'PL2' : PL2(), - 'BM25F' : BM25F() } -whoosh_similarity_function = { - 'TF-IDF' : TF_IDF(), - 'Frequency' : Frequency(), - 'PL2' : PL2(), - 'BM25F' : BM25F() +vector_metrics_mapping = { + 'cosine': cosine } string_metrics = [ - 'bag_distance', 'editex', 'hamming_distance', 'jaro', 'jaro_winkler', 'levenshtein', - 'edit_distance', 'partial_ratio', 'partial_token_sort', 'ratio', 'soundex', 'token_sort' + 'jaro', 'edit_distance' ] set_metrics = [ - 'cosine', 'dice', 'generalized_jaccard', 'jaccard', 'overlap_coefficient', 'tversky_index' + 'cosine', 'dice', 'generalized_jaccard', 'jaccard', 'overlap_coefficient' ] -bag_metrics = [ - 'tfidf' +vector_metrics = [ + 'cosine', 'dice', 'jaccard' ] -index_metrics = [ +whoosh_index_metrics = [ 'TF-IDF', 'Frequency', 'PL2', 'BM25F' ] -vector_metrics = [ - 'cosine_vector_similarity' +faiss_metrics = [ + 'cosine', 'euclidean' ] -available_metrics = string_metrics + set_metrics + bag_metrics + vector_metrics + index_metrics +magellan_metrics = string_metrics + set_metrics +available_metrics = magellan_metrics + vector_metrics + whoosh_index_metrics + faiss_metrics + +# +# Tokenizers +# +char_qgram_tokenizers = { 'char_'+ str(i) + 'gram':i for i in range(1, 7) } +word_qgram_tokenizers = { 'word_'+ str(i) + 'gram':i for i in range(1, 7) } +magellan_tokenizers = ['white_space_tokenizer'] + +tfidf_tokenizers = [ 'tfidf_' + cq for cq in char_qgram_tokenizers.keys() ] + \ + [ 'tfidf_' + wq for wq in word_qgram_tokenizers.keys() ] + +tf_tokenizers = [ 'tf_' + cq for cq in char_qgram_tokenizers.keys() ] + \ + [ 'tf_' + wq for wq in word_qgram_tokenizers.keys() ] + +boolean_tokenizers = [ 'boolean_' + cq for cq in char_qgram_tokenizers.keys() ] + \ + [ 'boolean_' + wq for wq in word_qgram_tokenizers.keys() ] + +vector_tokenizers = tfidf_tokenizers + tf_tokenizers + boolean_tokenizers + +available_tokenizers = [key for key in char_qgram_tokenizers] + [key for key in word_qgram_tokenizers] + magellan_tokenizers 
+ vector_tokenizers class ProgressiveMatching(EntityMatching): """Applies the matching process to a subset of available pairs progressively @@ -144,34 +127,39 @@ class ProgressiveMatching(EntityMatching): _method_name: str = "Progressive Matching" _method_info: str = "Applies the matching process to a subset of available pairs progressively " - def __init__( self, - budget: int = 0, - metric: str = 'dice', + similarity_function: str = 'dice', tokenizer: str = 'white_space_tokenizer', - similarity_threshold: float = 0.5, - qgram: int = 2, # for jaccard - tokenizer_return_set = True, # unique values or not + vectorizer : str = None, + qgram : int = 1, + similarity_threshold: float = 0.0, + tokenizer_return_unique_values = True, # unique values or not attributes: any = None, - delim_set: list = None, # DelimiterTokenizer - padding: bool = True, # QgramTokenizer - prefix_pad: str = '#', # QgramTokenizer (if padding=True) - suffix_pad: str = '$' # QgramTokenizer (if padding=True) ) -> None: - super().__init__(metric, tokenizer, similarity_threshold, qgram, tokenizer_return_set, attributes, delim_set, padding, prefix_pad, suffix_pad) - self._budget : int = budget - + super().__init__(metric=similarity_function, + tokenizer=tokenizer, + vectorizer=vectorizer, + qgram=qgram, + similarity_threshold=similarity_threshold, + tokenizer_return_unique_values=tokenizer_return_unique_values, + attributes=attributes) + self.similarity_function : str = similarity_function + self.dataset_identifier : str = None + def predict(self, - blocks: dict, data: Data, + blocks: dict, + dataset_identifier: str = "dataset", + budget: int = 0, + algorithm : str = 'HB', + indexing : str = 'inorder', comparison_cleaner: AbstractMetablocking = None, tqdm_disable: bool = False, - method : str = 'HB', - emit_all_tps_stop : bool = False) -> Graph: - """Main method of progressive entity matching. Inputs a set of blocks and outputs a graph \ - that contains of the entity ids (nodes) and the similarity scores between them (edges). + emit_all_tps_stop : bool = False) -> List[Tuple[float, int, int]]: + """Main method of progressive entity matching. Inputs a set of blocks and outputs a list \ + that contains duplets of ids corresponding to candidate pairs to emit. 
Args: blocks (dict): blocks of entities data (Data): dataset module @@ -183,32 +171,165 @@ def predict(self, """ start_time = time() self.tqdm_disable = tqdm_disable + self._budget : int = budget + self._indexing : str = indexing self._comparison_cleaner: AbstractMetablocking = comparison_cleaner - self._method = method - self._emit_all_tps_stop = emit_all_tps_stop - self.true_pair_checked = None + self._algorithm : str= algorithm + self._emit_all_tps_stop : bool = emit_all_tps_stop + self.duplicate_emitted : dict = None if not self._emit_all_tps_stop else {} self._prediction_data : PredictionData = None + self.data : Data = data + self.duplicate_of = data.duplicate_of + self.scheduler : DatasetScheduler = None + self.dataset_identifier : str = dataset_identifier if not blocks: raise ValueError("Empty blocks structure") - self.data = data - self.pairs = Graph() + + if self.data.is_dirty_er and self._indexing == 'bilateral': + raise ValueError("Cannot apply bilateral indexing to dirty Entity Resolution (single dataset)") + + _inorder_blocks = blocks + self._pairs_top_score : dict = defaultdict(lambda: -1) all_blocks = list(blocks.values()) self._progress_bar = tqdm(total=len(blocks), - desc=self._method_name+" ("+self.metric+")", - disable=self.tqdm_disable) - if 'Block' in str(type(all_blocks[0])): - self._predict_raw_blocks(blocks) - elif isinstance(all_blocks[0], set): - if(self._comparison_cleaner == None): - raise AttributeError("No precalculated weights were given from the CC step") - self._predict_prunned_blocks(blocks) - else: - raise AttributeError("Wrong type of Blocks") + desc=self._method_name, + disable=self.tqdm_disable) + + if(indexing == 'bilateral'): self._indexing = 'inorder' + if(self._indexing == 'inorder'): + if 'Block' in str(type(all_blocks[0])): + self._predict_raw_blocks(blocks) + elif isinstance(all_blocks[0], set): + if(self._comparison_cleaner == None): + raise AttributeError("No precalculated weights were given from the CC step") + self._predict_prunned_blocks(blocks) + else: + raise AttributeError("Wrong type of Blocks") + self._schedule_candidates() + + + if(indexing == 'bilateral'): self._indexing = 'reverse' + if(self._indexing == 'reverse'): + _reverse_blocks = reverse_blocks_entity_indexing(_inorder_blocks, self.data) + self.data = reverse_data_indexing(self.data) + if 'Block' in str(type(all_blocks[0])): + self._predict_raw_blocks(_reverse_blocks) + elif isinstance(all_blocks[0], set): + if(self._comparison_cleaner == None): + raise AttributeError("No precalculated weights were given from the CC step") + self._predict_prunned_blocks(_reverse_blocks) + else: + raise AttributeError("Wrong type of Blocks") + self._schedule_candidates() + + self._gather_top_pairs() self.execution_time = time() - start_time self._progress_bar.close() - + return self.pairs + + + def _store_id_mappings(self) -> None: + """Stores the mapping [Workflow ID -> Dataframe ID] for the current indexing phase + """ + if(self._indexing == "inorder"): + self._inorder_d1_id = self.data._gt_to_ids_reversed_1 + self._inorder_d2_id = self.data._gt_to_ids_reversed_2 + if(self._indexing == "reverse"): + self._reverse_d1_id = self.data._gt_to_ids_reversed_1 + self._reverse_d2_id = self.data._gt_to_ids_reversed_2 + + def _schedule_candidates(self) -> None: + """Translates the workflow identifiers back into dataframe identifiers + Populates the dataset scheduler with the candidate pairs of the current indexing stage + """ + self.scheduler = DatasetScheduler(budget=float('inf') if 
self._emit_all_tps_stop else self._budget, global_top=(self._algorithm=="TOP")) if self.scheduler == None else self.scheduler + self._store_id_mappings() + + for score, entity, candidate in self.pairs: + # entities of first and second dataframe in the context of the current indexing + d1_entity, d2_entity = (entity, candidate) if(entity < candidate) else (candidate, entity) + d1_map, d2_map = (self._inorder_d1_id, self._inorder_d2_id) if (self._indexing == 'inorder') else (self._reverse_d1_id, self._reverse_d2_id) + + # print(f"#############################################################") + # print(f"Score: {score}") + # print(f"---------------Workflow IDs [{self._indexing}]---------------") + # print(f"Entity: {entity}") + # print(f"Candidate: {candidate}") + # print(f"---------------Workflow IDs [D1 context Ent First]---------------") + # print(f"D1 Entity: {d1_entity}") + # print(f"D2 Entity: {d2_entity}") + + # the dataframe ids of the entities from first and second dataset in the context of indexing + d1_entity_df_id, d2_entity_df_id = (d1_map[d1_entity], d2_map[d2_entity]) + _inorder_d1_entity_df_id, _inorder_d2_entity_df_id = (d1_entity_df_id, d2_entity_df_id) if (self._indexing == 'inorder') else (d2_entity_df_id, d1_entity_df_id) + if(self._emit_all_tps_stop and _inorder_d2_entity_df_id in self.duplicate_of[_inorder_d1_entity_df_id]): + self.duplicate_emitted[(_inorder_d1_entity_df_id, _inorder_d2_entity_df_id)] = False + + # in the case of reverse indexing stage, adjust the workflow identifiers of the entities so we can differ them from inorder entity ids + d1_entity = d1_entity if(self._indexing == 'inorder') else d1_entity + self.data.num_of_entities + d2_entity = d2_entity if(self._indexing == 'inorder') else d2_entity + self.data.num_of_entities + + # print(f"---------------Dataframe IDs [{self._indexing}]---------------") + # print(f"D1 Entity DF ID: {d1_entity_df_id}") + # print(f"D2 Entity DF ID: {d2_entity_df_id}") + # print(f"---------------Inorder Dataframe IDs [{self._indexing}]---------------") + # print(f"Inorder D1 Entity DF ID: {_inorder_d1_entity_df_id}") + # print(f"Inorder D2 Entity DF ID: {_inorder_d2_entity_df_id}") + # print(f"---------------Scheduler IDs [D1 context Ent First]---------------") + # print(f"D1 Entity: {d1_entity}") + # print(f"D2 Entity: {d2_entity}") + # if(_inorder_d2_entity_df_id in self.duplicate_of[_inorder_d1_entity_df_id]): + # print("^ THIS IS A TRUE POSITIVE ^") + # we want entities to be inserted in D1 -> D2 order (current context e.x. reverse) which translates to D2 -> D1 order (reverse context e.x. 
inorder) + self.scheduler._insert_entity_neighbor(d1_entity, d2_entity, score) + + def _inorder_phase_entity(self, id : int) -> bool: + """Given identifier corresponds to an entity proposed in the inorder indexing phase + + Args: + id (int): Identifier + + Returns: + bool: Identifier proposed in the inorder phase + """ + return id < self.data.num_of_entities + + def _retrieve_entity_df_id(self, id : int) -> int: + """Returns the corresponding id in the dataframe of the given entity id in the context of its indexing phase + + Args: + id (int): Workflow Identifier + + Returns: + int: Dataframe Identifier + """ + _workflow_id : int + _df_id_of : dict + if(self._inorder_phase_entity(id)): + _workflow_id = id + _df_id_of = self._inorder_d1_id if (_workflow_id < len(self._inorder_d1_id)) else self._inorder_d2_id + else: + _workflow_id = id - self.data.num_of_entities + _df_id_of = self._reverse_d1_id if (_workflow_id < len(self._reverse_d1_id)) else self._reverse_d2_id + + return _df_id_of[_workflow_id] + + def _gather_top_pairs(self) -> None: + """Emits the pairs from the scheduler based on the defined algorithm + """ + self.scheduler._sort_neighborhoods_by_avg_weight() + self.pairs = self.scheduler._emit_pairs(method=self._algorithm, data=self.data) + + _identified_pairs = [] + for score, entity, candidate in self.pairs: + _inorder_entities : bool = self._inorder_phase_entity(entity) + entity, candidate = (self._retrieve_entity_df_id(entity), self._retrieve_entity_df_id(candidate)) + entity, candidate = (entity, candidate) if _inorder_entities else (candidate, entity) + _identified_pairs.append((score, entity, candidate)) + + self.pairs = _identified_pairs def evaluate(self, prediction, @@ -242,22 +363,6 @@ def evaluate(self, export_to_dict, with_classification_report, verbose) - - def get_true_pair_checked(self): - if(self.true_pair_checked is None): - raise AttributeError("True positive pairs not defined in specified workflow.") - else: return self.true_pair_checked - - - @abstractmethod - def extract_tps_checked(self, **kwargs) -> dict: - """Constructs a dictionary of the form [true positive pair] -> emitted status, - containing all the true positive pairs that are emittable from the current subset of the dataset - - Returns: - dict: Dictionary that shows whether a TP pair (key) has been emitted (value) - """ - pass def get_prediction_data(self) -> PredictionData: if(self._prediction_data is None): @@ -275,46 +380,137 @@ def get_normalized_auc(self) -> float: def set_prediction_data(self, prediction_data : PredictionData): self._prediction_data : PredictionData = prediction_data - - -class HashBasedProgressiveMatching(ProgressiveMatching): - """Applies hash based candidate graph prunning, sorts retained comparisons and applies Progressive Matching +class BlockIndependentPM(ProgressiveMatching): + """Applies the matching process to a subset of available pairs progressively """ - _method_name: str = "Hash Based Progressive Matching" - _method_info: str = "Applies hash based candidate graph prunning, sorts retained comparisons and applies Progressive Matching" + _method_name: str = "Progressive Matching" + _method_info: str = "Applies the matching process to a subset of available pairs progressively " def __init__( self, - budget: int = 0, - w_scheme: str = 'X2', - metric: str = 'dice', + similarity_function: str = 'dice', tokenizer: str = 'white_space_tokenizer', - similarity_threshold: float = 0.5, - qgram: int = 2, # for jaccard - tokenizer_return_set = True, # unique values or not + 
vectorizer : str = None, + qgram : int = 1, + similarity_threshold: float = 0.0, + tokenizer_return_unique_values = True, # unique values or not attributes: any = None, - delim_set: list = None, # DelimiterTokenizer - padding: bool = True, # QgramTokenizer - prefix_pad: str = '#', # QgramTokenizer (if padding=True) - suffix_pad: str = '$' # QgramTokenizer (if padding=True) ) -> None: - super().__init__(budget, metric, tokenizer, similarity_threshold, qgram, tokenizer_return_set, attributes, delim_set, padding, prefix_pad, suffix_pad) - self._w_scheme : str = w_scheme + super().__init__(similarity_function=similarity_function, + tokenizer=tokenizer, + vectorizer=vectorizer, + qgram=qgram, + similarity_threshold=similarity_threshold, + tokenizer_return_unique_values=tokenizer_return_unique_values, + attributes=attributes) - def extract_tps_checked(self, **kwargs) -> dict: - _tps_checked = dict() - for entity, neighbors in self.blocks.items(): - for neighbor in neighbors: - entity_id = self.data._gt_to_ids_reversed_1[entity] if entity < self.data.dataset_limit else self.data._gt_to_ids_reversed_2[entity] - neighbor_id = self.data._gt_to_ids_reversed_1[neighbor] if neighbor < self.data.dataset_limit else self.data._gt_to_ids_reversed_2[neighbor] - _d1_entity, _d2_entity = (entity_id, neighbor_id) if entity < self.data.dataset_limit else (neighbor_id, entity_id) + def predict(self, + data: Data, + blocks: dict, + dataset_identifier: str = "dataset", + budget: int = 0, + algorithm : str = 'HB', + indexing : str = 'inorder', + comparison_cleaner: AbstractMetablocking = None, + tqdm_disable: bool = False, + emit_all_tps_stop : bool = False) -> List[Tuple[float, int, int]]: + """Main method of progressive entity matching. Inputs a set of blocks and outputs a list \ + that contains duplets of ids corresponding to candidate pairs to emit. + Args: + blocks (dict): blocks of entities + data (Data): dataset module + tqdm_disable (bool, optional): Disables progress bar. Defaults to False. 
+ method (str) : DFS/BFS/Hybrid approach for specified algorithm + emit_all_tps_stop (bool) : Stop emission once all true positives are found + Returns: + networkx.Graph: entity ids (nodes) and similarity scores between them (edges) + """ + start_time = time() + self.tqdm_disable = tqdm_disable + self._budget : int = budget + self._indexing : str = indexing + self._comparison_cleaner: AbstractMetablocking = comparison_cleaner + self._algorithm : str= algorithm + self._emit_all_tps_stop : bool = emit_all_tps_stop + self.duplicate_emitted : dict = None + self._prediction_data : PredictionData = None + self.data : Data = data + self.duplicate_of = data.duplicate_of + self.scheduler : DatasetScheduler = None + self.dataset_identifier : str = dataset_identifier + + if self.data.is_dirty_er and self._indexing == 'bilateral': + raise ValueError("Cannot apply bilateral indexing to dirty Entity Resolution (single dataset)") - if _d2_entity in self.data.pairs_of[_d1_entity]: - _tps_checked[canonical_swap(_d1_entity, _d2_entity)] = False - return _tps_checked + _inorder_blocks = blocks + self._pairs_top_score : dict = defaultdict(lambda: -1) + all_blocks = list(blocks.values()) if blocks is not None else None + self._progress_bar = tqdm(total=len(blocks) if blocks is not None else 0, + desc=self._method_name, + disable=self.tqdm_disable) + if(indexing == 'bilateral'): self._indexing = 'inorder' + if(self._indexing == 'inorder'): + if all_blocks is None or 'Block' in str(type(all_blocks[0])): + self._predict_raw_blocks(blocks) + elif isinstance(all_blocks[0], set): + if(self._comparison_cleaner == None): + raise AttributeError("No precalculated weights were given from the CC step") + self._predict_prunned_blocks(blocks) + else: + raise AttributeError("Wrong type of Blocks") + self._schedule_candidates() + + + if(indexing == 'bilateral'): self._indexing = 'reverse' + if(self._indexing == 'reverse'): + _reverse_blocks = reverse_blocks_entity_indexing(_inorder_blocks, self.data) + self.data = reverse_data_indexing(self.data) + if all_blocks is None or 'Block' in str(type(all_blocks[0])): + self._predict_raw_blocks(_reverse_blocks) + elif isinstance(all_blocks[0], set): + if(self._comparison_cleaner == None): + raise AttributeError("No precalculated weights were given from the CC step") + self._predict_prunned_blocks(_reverse_blocks) + else: + raise AttributeError("Wrong type of Blocks") + self._schedule_candidates() + + self._gather_top_pairs() + self.execution_time = time() - start_time + self._progress_bar.close() + + return self.pairs + +class HashBasedProgressiveMatching(ProgressiveMatching): + """Applies hash based candidate graph prunning, sorts retained comparisons and applies Progressive Matching + """ + + _method_name: str = "Hash Based Progressive Matching" + _method_info: str = "Applies hash based candidate graph prunning, sorts retained comparisons and applies Progressive Matching" + + def __init__( + self, + weighting_scheme: str = 'X2', + similarity_function: str = 'dice', + tokenizer: str = 'white_space_tokenizer', + vectorizer : str = None, + qgram : int = 1, + similarity_threshold: float = 0.0, + tokenizer_return_unique_values = True, # unique values or not + attributes: any = None, + ) -> None: + + super().__init__(similarity_function=similarity_function, + tokenizer=tokenizer, + vectorizer=vectorizer, + qgram=qgram, + similarity_threshold=similarity_threshold, + tokenizer_return_unique_values=tokenizer_return_unique_values, + attributes=attributes) + self._weighting_scheme : str = 
weighting_scheme class GlobalTopPM(HashBasedProgressiveMatching): """Applies Progressive CEP, sorts retained comparisons and applies Progressive Matching @@ -324,49 +520,54 @@ class GlobalTopPM(HashBasedProgressiveMatching): _method_info: str = "Applies Progressive CEP, sorts retained comparisons and applies Progressive Matching" def __init__( - self, - budget: int = 0, - w_scheme: str = 'X2', - metric: str = 'dice', - tokenizer: str = 'white_space_tokenizer', - similarity_threshold: float = 0.5, - qgram: int = 2, # for jaccard - tokenizer_return_set = True, # unique values or not - attributes: any = None, - delim_set: list = None, # DelimiterTokenizer - padding: bool = True, # QgramTokenizer - prefix_pad: str = '#', # QgramTokenizer (if padding=True) - suffix_pad: str = '$' # QgramTokenizer (if padding=True) - ) -> None: - - super().__init__(budget, w_scheme, metric, tokenizer, similarity_threshold, qgram, tokenizer_return_set, attributes, delim_set, padding, prefix_pad, suffix_pad) - - def _predict_raw_blocks(self, blocks: dict) -> None: - pcep : ProgressiveCardinalityEdgePruning = ProgressiveCardinalityEdgePruning(self._w_scheme, self._budget) + self, + weighting_scheme: str = 'X2', + similarity_function: str = 'dice', + tokenizer: str = 'white_space_tokenizer', + vectorizer : str = None, + qgram : int = 1, + similarity_threshold: float = 0.0, + tokenizer_return_unique_values = True, # unique values or not + attributes: any = None, + ) -> None: + + super().__init__(weighting_scheme=weighting_scheme, + similarity_function=similarity_function, + tokenizer=tokenizer, + vectorizer=vectorizer, + qgram=qgram, + similarity_threshold=similarity_threshold, + tokenizer_return_unique_values=tokenizer_return_unique_values, + attributes=attributes) + + def _predict_raw_blocks(self, blocks: dict) -> List[Tuple[int, int]]: + self.pairs = Graph() + pcep : ProgressiveCardinalityEdgePruning = ProgressiveCardinalityEdgePruning(self._weighting_scheme, self._budget) candidates : dict = pcep.process(blocks=blocks, data=self.data, tqdm_disable=True, cc=None, emit_all_tps_stop=self._emit_all_tps_stop) self.blocks = candidates - if(self._emit_all_tps_stop): self.true_pair_checked = self.extract_tps_checked() for entity_id, candidate_ids in candidates.items(): for candidate_id in candidate_ids: self._insert_to_graph(entity_id, candidate_id, pcep.get_precalculated_weight(entity_id, candidate_id)) - self.pairs.edges = sorted(self.pairs.edges(data=True), key=lambda x: x[2]['weight'], reverse=True) - return self.pairs.edges + self.pairs.edges = sorted(self.pairs.edges(data=True), key=lambda x: x[2]['weight'], reverse=True) + self.pairs = [(edge[2]['weight'], edge[0], edge[1]) for edge in self.pairs.edges] + return self.pairs - def _predict_prunned_blocks(self, blocks: dict) -> None: - pcep : ProgressiveCardinalityEdgePruning = ProgressiveCardinalityEdgePruning(self._w_scheme, self._budget) + def _predict_prunned_blocks(self, blocks: dict) -> List[Tuple[int, int]]: + self.pairs = Graph() + pcep : ProgressiveCardinalityEdgePruning = ProgressiveCardinalityEdgePruning(self._weighting_scheme, self._budget) candidates : dict = pcep.process(blocks=blocks, data=self.data, tqdm_disable=True, cc=self._comparison_cleaner, emit_all_tps_stop=self._emit_all_tps_stop) self.blocks = candidates - if(self._emit_all_tps_stop): self.true_pair_checked = self.extract_tps_checked() for entity_id, candidate_ids in candidates.items(): for candidate_id in candidate_ids: self._insert_to_graph(entity_id, candidate_id, 
self._comparison_cleaner.get_precalculated_weight(entity_id, candidate_id)) self.pairs.edges = sorted(self.pairs.edges(data=True), key=lambda x: x[2]['weight'], reverse=True) - return self.pairs.edges + self.pairs = [(edge[2]['weight'], edge[0], edge[1]) for edge in self.pairs.edges] + return self.pairs class LocalTopPM(HashBasedProgressiveMatching): """Applies Progressive CNP, sorts retained comparisons and applies Progressive Matching @@ -376,53 +577,58 @@ class LocalTopPM(HashBasedProgressiveMatching): _method_info: str = "Applies Progressive CNP, sorts retained comparisons and applies Progressive Matching" def __init__( - self, - budget: int = 0, - w_scheme: str = 'X2', - metric: str = 'dice', - tokenizer: str = 'white_space_tokenizer', - similarity_threshold: float = 0.5, - qgram: int = 2, # for jaccard - tokenizer_return_set = True, # unique values or not - attributes: any = None, - delim_set: list = None, # DelimiterTokenizer - padding: bool = True, # QgramTokenizer - prefix_pad: str = '#', # QgramTokenizer (if padding=True) - suffix_pad: str = '$' # QgramTokenizer (if padding=True) - ) -> None: - - super().__init__(budget, w_scheme, metric, tokenizer, similarity_threshold, qgram, tokenizer_return_set, attributes, delim_set, padding, prefix_pad, suffix_pad) - - - def _predict_raw_blocks(self, blocks: dict) -> None: - pcnp : ProgressiveCardinalityNodePruning = ProgressiveCardinalityNodePruning(self._w_scheme, self._budget) - candidates : dict = pcnp.process(blocks=blocks, data=self.data, tqdm_disable=True, cc=None, emit_all_tps_stop=self._emit_all_tps_stop) + self, + weighting_scheme: str = 'X2', + similarity_function: str = 'dice', + number_of_nearest_neighbors: int = 10, + tokenizer: str = 'white_space_tokenizer', + vectorizer : str = None, + qgram : int = 1, + similarity_threshold: float = 0.0, + tokenizer_return_unique_values = True, # unique values or not + attributes: any = None, + ) -> None: + + super().__init__(weighting_scheme=weighting_scheme, + similarity_function=similarity_function, + tokenizer=tokenizer, + vectorizer=vectorizer, + qgram=qgram, + similarity_threshold=similarity_threshold, + tokenizer_return_unique_values=tokenizer_return_unique_values, + attributes=attributes) + self._number_of_nearest_neighbors : int = number_of_nearest_neighbors + + def _predict_raw_blocks(self, blocks: dict) -> List[Tuple[int, int]]: + self.pairs = Graph() + pcnp : ProgressiveCardinalityNodePruning = ProgressiveCardinalityNodePruning(weighting_scheme=self._weighting_scheme, budget=self._budget) + candidates : dict = pcnp.process(blocks=blocks, data=self.data, number_of_nearest_neighbors=self._number_of_nearest_neighbors, tqdm_disable=True, cc=None, emit_all_tps_stop=self._emit_all_tps_stop) self.blocks = candidates - if(self._emit_all_tps_stop): self.true_pair_checked = self.extract_tps_checked() for entity_id, candidate_ids in candidates.items(): for candidate_id in candidate_ids: self._insert_to_graph(entity_id, candidate_id, pcnp.get_precalculated_weight(entity_id, candidate_id)) - self.pairs.edges = sorted(self.pairs.edges(data=True), key=lambda x: x[2]['weight'], reverse=True) - return self.pairs.edges - - def _predict_prunned_blocks(self, blocks: dict) -> None: + self.pairs.edges = sorted(self.pairs.edges(data=True), key=lambda x: x[2]['weight'], reverse=True) + self.pairs = [(edge[2]['weight'], edge[0], edge[1]) for edge in self.pairs.edges] + return self.pairs - pcnp : ProgressiveCardinalityNodePruning = ProgressiveCardinalityNodePruning(self._w_scheme, self._budget) - 
candidates : dict = pcnp.process(blocks=blocks, data=self.data, tqdm_disable=True, cc=self._comparison_cleaner, emit_all_tps_stop=self._emit_all_tps_stop) + def _predict_prunned_blocks(self, blocks: dict) -> List[Tuple[int, int]]: + self.pairs = Graph() + pcnp : ProgressiveCardinalityNodePruning = ProgressiveCardinalityNodePruning(self._weighting_scheme, self._budget) + candidates : dict = pcnp.process(blocks=blocks, data=self.data, number_of_nearest_neighbors=self._number_of_nearest_neighbors, tqdm_disable=True, cc=self._comparison_cleaner, emit_all_tps_stop=self._emit_all_tps_stop) self.blocks = candidates - if(self._emit_all_tps_stop): self.true_pair_checked = self.extract_tps_checked() for entity_id, candidate_ids in candidates.items(): for candidate_id in candidate_ids: self._insert_to_graph(entity_id, candidate_id, self._comparison_cleaner.get_precalculated_weight(entity_id, candidate_id)) self.pairs.edges = sorted(self.pairs.edges(data=True), key=lambda x: x[2]['weight'], reverse=True) - return self.pairs.edges + self.pairs = [(edge[2]['weight'], edge[0], edge[1]) for edge in self.pairs.edges] + return self.pairs -class EmbeddingsNNBPM(ProgressiveMatching): +class EmbeddingsNNBPM(BlockIndependentPM): """Utilizes/Creates entity embeddings, constructs neighborhoods via NN Approach and applies Progressive Matching """ @@ -431,78 +637,33 @@ class EmbeddingsNNBPM(ProgressiveMatching): def __init__( self, - budget: int = 0, - vectorizer: str = 'bert', + language_model: str = 'bert', + number_of_nearest_neighbors: int = 10, similarity_search: str = 'faiss', - vector_size: int = 200, + vector_size: int = 300, num_of_clusters: int = 5, - metric: str = 'dice', + similarity_function: str = 'cosine', tokenizer: str = 'white_space_tokenizer', - similarity_threshold: float = 0.5, - qgram: int = 2, # for jaccard - tokenizer_return_set = True, # unique values or not - attributes: any = None, - delim_set: list = None, # DelimiterTokenizer - padding: bool = True, # QgramTokenizer - prefix_pad: str = '#', # QgramTokenizer (if padding=True) - suffix_pad: str = '$' # QgramTokenizer (if padding=True) + vectorizer : str = None, + qgram : int = 1, + similarity_threshold: float = 0.0, + tokenizer_return_unique_values = True, # unique values or not + attributes: any = None ) -> None: - super().__init__(budget, metric, tokenizer, similarity_threshold, qgram, tokenizer_return_set, attributes, delim_set, padding, prefix_pad, suffix_pad) - self._vectorizer = vectorizer - self._similarity_search = similarity_search - self._vector_size = vector_size - self._num_of_clusters = num_of_clusters - + super().__init__(similarity_function=similarity_function, + tokenizer=tokenizer, + vectorizer=vectorizer, + qgram=qgram, + similarity_threshold=similarity_threshold, + tokenizer_return_unique_values=tokenizer_return_unique_values, + attributes=attributes) - def predict(self, - data: Data, - blocks: dict = None, - comparison_cleaner: AbstractMetablocking = None, - tqdm_disable: bool = False, - method : str = 'HB', - emit_all_tps_stop : bool = False) -> Graph: - """Main method of progressive entity matching. Inputs a set of blocks and outputs a graph \ - that contains of the entity ids (nodes) and the similarity scores between them (edges). - Args: - blocks (dict): blocks of entities - data (Data): dataset module - tqdm_disable (bool, optional): Disables progress bar. Defaults to False. 
- method (str) : DFS/BFS/Hybrid approach for specified algorithm - emit_all_tps_stop (bool) : Stop emission once all true positives are found - Returns: - networkx.Graph: entity ids (nodes) and similarity scores between them (edges) - """ - start_time = time() - self.tqdm_disable = tqdm_disable - self._comparison_cleaner: AbstractMetablocking = comparison_cleaner - self._method = method - self._emit_all_tps_stop = emit_all_tps_stop - self.true_pair_checked = None - self._prediction_data : PredictionData = None - self.data = data - self.pairs = Graph() - - if blocks is None: - # applying the process to the whole dataset - self._predict_raw_blocks(blocks) - else: - all_blocks = list(blocks.values()) - self._progress_bar = tqdm(total=len(blocks), - desc=self._method_name+" ("+self.metric+")", - disable=self.tqdm_disable) - if 'Block' in str(type(all_blocks[0])): - self._predict_raw_blocks(blocks) - elif isinstance(all_blocks[0], set): - if(self._comparison_cleaner == None): - raise AttributeError("No precalculated weights were given from the CC step") - self._predict_prunned_blocks(blocks) - else: - raise AttributeError("Wrong type of Blocks") - self._progress_bar.close() - - self.execution_time = time() - start_time - return self.pairs + self._language_model : str = language_model + self._number_of_nearest_neighbors : int = number_of_nearest_neighbors + self._similarity_search : str = similarity_search + self._vector_size : int = vector_size + self._num_of_clusters : int = num_of_clusters def _top_pair_emission(self) -> None: """Applies global sorting to all entity pairs produced by NN, @@ -518,8 +679,7 @@ def _top_pair_emission(self) -> None: candidate_id = self.ennbb._si.d1_retained_ids[self.neighbors[i][j]] self.pairs.append((entity_id, candidate_id, self.scores[i][j])) - self.pairs = sorted(self.pairs, key=lambda x: x[2], reverse=True) - self.pairs = [(x[0], x[1]) for x in self.pairs] + self.pairs = [(x[2], x[0], x[1]) for x in self.pairs] def _dfs_pair_emission(self) -> None: """Sorts NN neighborhoods in ascending average distance from their query entity, @@ -542,7 +702,7 @@ def _dfs_pair_emission(self) -> None: neighbor_id = self.ennbb._si.d1_retained_ids[neighbor] self.pairs.append((entity_id, neighbor_id, neighbor_scores[neighbor_index])) - self.pairs = [(x[0], x[1]) for x in self.pairs] + self.pairs = [(x[2], x[0], x[1]) for x in self.pairs] def _hb_pair_emission(self) -> None: """Sorts NN neighborhoods in ascending average distance from their query entity, @@ -568,7 +728,7 @@ def _hb_pair_emission(self) -> None: _current_emissions = _remaining_emissions if neighbor_index else _first_emissions _current_emissions.append((entity_id, neighbor_id, neighbor_scores[neighbor_index])) - self.pairs = [(x[0], x[1]) for x in _first_emissions] + [(x[0], x[1]) for x in _remaining_emissions] + self.pairs = [(x[2], x[0], x[1]) for x in _first_emissions] + [(x[2], x[0], x[1]) for x in _remaining_emissions] def _bfs_pair_emission(self) -> None: """Sorts NN neighborhoods in ascending average distance from their query entity, @@ -589,68 +749,106 @@ def _bfs_pair_emission(self) -> None: else self.ennbb._si.d2_retained_ids[sorted_neighborhood] self.pairs.append((entity_id, neighbor_id, self.scores[sorted_neighborhood][current_emission_per_pair])) - self.pairs = [(x[0], x[1]) for x in self.pairs] + self.pairs = [(x[2], x[0], x[1]) for x in self.pairs] def _produce_pairs(self): """Calls pairs emission based on the requested approach Raises: AttributeError: Given emission technique hasn't been defined """ 
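# --- Editorial example (not part of the diff) ----------------------------------
# A minimal, standalone sketch of the four emission orderings (TOP / DFS / BFS /
# HB) that the *_pair_emission methods above implement, shown on toy numpy
# arrays. The function and variable names here are hypothetical and not part of
# the pyJedAI API; "scores" stand in for similarities, so higher is better.
import numpy as np

def emission_order(neighbors: np.ndarray, scores: np.ndarray, method: str):
    """Return (score, query_entity, neighbor) triples in the requested order."""
    n_queries, k = neighbors.shape
    # neighborhoods ranked by descending average similarity to their query entity
    ranked = np.argsort(-scores.mean(axis=1)).tolist()
    if method == 'TOP':    # one global sort over all candidate pairs
        triples = [(float(scores[q][j]), q, int(neighbors[q][j]))
                   for q in range(n_queries) for j in range(k)]
        return sorted(triples, reverse=True)
    if method == 'DFS':    # exhaust each neighborhood before moving to the next
        return [(float(scores[q][j]), q, int(neighbors[q][j]))
                for q in ranked for j in range(k)]
    if method == 'BFS':    # round-robin: j-th best of every neighborhood, then (j+1)-th
        return [(float(scores[q][j]), q, int(neighbors[q][j]))
                for j in range(k) for q in ranked]
    if method == 'HB':     # hybrid: best pair of every neighborhood first, then the rest
        first = [(float(scores[q][0]), q, int(neighbors[q][0])) for q in ranked]
        rest = [(float(scores[q][j]), q, int(neighbors[q][j]))
                for q in ranked for j in range(1, k)]
        return first + rest
    raise AttributeError(method + ' emission technique is undefined!')

# toy data: two query entities with their top-2 neighbors and similarity scores
neighbors = np.array([[7, 3], [5, 9]])
scores = np.array([[0.9, 0.2], [0.8, 0.7]])
print(emission_order(neighbors, scores, 'HB'))
# -> [(0.8, 1, 5), (0.9, 0, 7), (0.7, 1, 9), (0.2, 0, 3)]
# --------------------------------------------------------------------------------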
- if(self._method == 'DFS'): - self._dfs_pair_emission() - elif(self._method == 'HB'): - self._hb_pair_emission() - elif(self._method == 'BFS'): - self._bfs_pair_emission() - elif(self._method == 'TOP'): - self._top_pair_emission() + # currently first phase algorithms are in charge of gathering the subset of the original dataset + # that will be used to initialize the scheduler, we simply retrieve all the pairs and their scores + self._top_pair_emission() + + def save_datasets_embeddings(self, vectors_1: np.array, vectors_2: np.array) -> None: + """Stores the non-precalculated (not loaded) embeddings in corresponding dataset paths + """ + + if(self._d1_emb_load_path is None): + try: + print(f"Saving D1 Embeddings -> {self._d1_emb_save_path}") + np.save(self._d1_emb_save_path, vectors_1) + pass + except FileNotFoundError: + print(f"Unable to save Embeddings -> {self._d1_emb_save_path}") + + if(self._d2_emb_load_path is None): + try: + print(f"Saving D2 Embeddings -> {self._d2_emb_save_path}") + np.save(self._d2_emb_save_path, vectors_2) + pass + except FileNotFoundError: + print(f"Unable to save Embeddings -> {self._d2_emb_save_path}") + + def retrieve_embeddings_file_paths(self): + return(self.retrieve_dataset_embeddings_file_path(first_dataset=True), self.retrieve_dataset_embeddings_file_path(first_dataset=False)) + + def retrieve_dataset_embeddings_file_path(self, first_dataset : bool = True) -> str: + """Attemps to retrieve the precalculated embeddings of first/second dataset from disk for current experiment + Returns: + str: Precalculated Embeddings file path (None if doesn't exist) + """ + + _requested_indexing, _opposite_indexing = ("reverse", "inorder") if (self._indexing == "reverse") \ + else ("inorder", "reverse") + _requested_dataset, _opposite_dataset = ("1","2") if(first_dataset) \ + else ("2", "1") + + _requested_indexing_file_name = '_'.join([_requested_indexing, self.dataset_identifier, self._language_model, _requested_dataset + ".npy"]) + _opposite_indexing_file_name = '_'.join([_opposite_indexing, self.dataset_identifier, self._language_model, _opposite_dataset + ".npy"]) + + hidden_directory_path = os.path.join(os.getcwd(), ".embs") + os.makedirs(hidden_directory_path, exist_ok=True) + + + _available_file_path : str = None + _requested_indexing_file_path = os.path.join(hidden_directory_path, _requested_indexing_file_name) + _opposite_indexing_file_path = os.path.join(hidden_directory_path, _opposite_indexing_file_name) + + if(os.path.exists(_requested_indexing_file_path) and os.path.isfile(_requested_indexing_file_path)): + _available_file_path = _requested_indexing_file_path + elif(os.path.exists(_opposite_indexing_file_path) and os.path.isfile(_opposite_indexing_file_path)): + _available_file_path = _opposite_indexing_file_path + + if(first_dataset): + self._d1_emb_load_path = _available_file_path + self._d1_emb_save_path = _requested_indexing_file_path else: - raise AttributeError(self._method + ' emission technique is undefined!') - - def _predict_raw_blocks(self, blocks: dict = None) -> None: - self.ennbb : EmbeddingsNNBlockBuilding = EmbeddingsNNBlockBuilding(self._vectorizer, self._similarity_search) - self.final_blocks = self.ennbb.build_blocks(data = self.data, - num_of_clusters = self._num_of_clusters, - top_k = int(max(1, int(self._budget / self.data.num_of_entities) + (self._budget % self.data.num_of_entities > 0))) - if not self._emit_all_tps_stop else self._budget, - return_vectors = False, - tqdm_disable = False, - save_embeddings = True, - 
load_embeddings_if_exist = True, - with_entity_matching = False, - input_cleaned_blocks = blocks) + self._d2_emb_load_path = _available_file_path + self._d2_emb_save_path = _requested_indexing_file_path + + return _available_file_path + def _predict_raw_blocks(self, blocks: dict = None) -> List[Tuple[int, int]]: + self.ennbb : EmbeddingsNNBlockBuilding = EmbeddingsNNBlockBuilding(self._language_model, self._similarity_search) + + + load_path_d1, load_path_d2 = self.retrieve_embeddings_file_paths() + + self.final_blocks = self.ennbb.build_blocks(data=self.data, + vector_size=self._vector_size, + num_of_clusters=self._num_of_clusters, + top_k=self._number_of_nearest_neighbors, + return_vectors=False, + tqdm_disable=False, + save_embeddings=False, + load_embeddings_if_exist=True, + load_path_d1=load_path_d1, + load_path_d2=load_path_d2, + with_entity_matching=False, + input_cleaned_blocks=blocks, + similarity_distance=self.similarity_function) + + self.save_datasets_embeddings(vectors_1=self.ennbb.vectors_1, vectors_2=self.ennbb.vectors_2) self.scores = self.ennbb.distances self.neighbors = self.ennbb.neighbors self.final_vectors = (self.ennbb.vectors_1, self.ennbb.vectors_2) - self._produce_pairs() - if(self._emit_all_tps_stop): - self.true_pair_checked = self.extract_tps_checked() return self.pairs - def _predict_prunned_blocks(self, blocks: dict = None) -> None: + def _predict_prunned_blocks(self, blocks: dict = None) -> List[Tuple[int, int]]: return self._predict_raw_blocks(blocks) - def extract_tps_checked(self, **kwargs) -> dict: - _tps_checked = dict() - _neighbors = self.neighbors - - for row in range(_neighbors.shape[0]): - entity = self.ennbb._si.d1_retained_ids[row] \ - if self.data.is_dirty_er \ - else self.ennbb._si.d2_retained_ids[row] - entity_id = self.data._gt_to_ids_reversed_1[entity] if entity < self.data.dataset_limit else self.data._gt_to_ids_reversed_2[entity] - for column in range(_neighbors.shape[1]): - if(_neighbors[row][column] != -1): - neighbor = self.ennbb._si.d1_retained_ids[_neighbors[row][column]] - neighbor_id = self.data._gt_to_ids_reversed_1[neighbor] if neighbor < self.data.dataset_limit else self.data._gt_to_ids_reversed_2[neighbor] - _d1_entity, _d2_entity = (entity_id, neighbor_id) if entity < self.data.dataset_limit else (neighbor_id, entity_id) - if _d2_entity in self.data.pairs_of[_d1_entity]: - _tps_checked[canonical_swap(_d1_entity, _d2_entity)] = False - - return _tps_checked - class SimilarityBasedProgressiveMatching(ProgressiveMatching): """Applies similarity based candidate graph prunning, sorts retained comparisons and applies Progressive Matching """ @@ -660,25 +858,25 @@ class SimilarityBasedProgressiveMatching(ProgressiveMatching): def __init__( self, - budget: int = 0, - pwScheme: str = 'ACF', - metric: str = 'dice', + weighting_scheme: str = 'ACF', + window_size: int = 10, + similarity_function: str = 'dice', tokenizer: str = 'white_space_tokenizer', - similarity_threshold: float = 0.5, - qgram: int = 2, # for jaccard - tokenizer_return_set = True, # unique values or not - attributes: any = None, - delim_set: list = None, # DelimiterTokenizer - padding: bool = True, # QgramTokenizer - prefix_pad: str = '#', # QgramTokenizer (if padding=True) - suffix_pad: str = '$' # QgramTokenizer (if padding=True) + vectorizer : str = None, + qgram : int = 1, + similarity_threshold: float = 0.0, + tokenizer_return_unique_values = True, # unique values or not + attributes: any = None ) -> None: - - super().__init__(budget, metric, tokenizer, 
similarity_threshold, qgram, tokenizer_return_set, attributes, delim_set, padding, prefix_pad, suffix_pad) - self._pwScheme : str = pwScheme - - def extract_tps_checked(self, **kwargs) -> dict: - pass + super().__init__(similarity_function=similarity_function, + tokenizer=tokenizer, + vectorizer=vectorizer, + qgram=qgram, + similarity_threshold=similarity_threshold, + tokenizer_return_unique_values=tokenizer_return_unique_values, + attributes=attributes) + self._weighting_scheme : str = weighting_scheme + self._window_size : int = window_size class GlobalPSNM(SimilarityBasedProgressiveMatching): """Applies Global Progressive Sorted Neighborhood Matching @@ -691,49 +889,35 @@ class GlobalPSNM(SimilarityBasedProgressiveMatching): def __init__( self, - budget: int = 0, - pwScheme: str = 'ACF', - metric: str = 'dice', + weighting_scheme: str = 'ACF', + window_size: int = 10, + similarity_function: str = 'dice', tokenizer: str = 'white_space_tokenizer', - similarity_threshold: float = 0.5, - qgram: int = 2, # for jaccard - tokenizer_return_set = True, # unique values or not - attributes: any = None, - delim_set: list = None, # DelimiterTokenizer - padding: bool = True, # QgramTokenizer - prefix_pad: str = '#', # QgramTokenizer (if padding=True) - suffix_pad: str = '$' # QgramTokenizer (if padding=True) + vectorizer : str = None, + qgram : int = 1, + similarity_threshold: float = 0.0, + tokenizer_return_unique_values = True, # unique values or not + attributes: any = None ) -> None: - super().__init__(budget, pwScheme, metric, tokenizer, similarity_threshold, qgram, tokenizer_return_set, attributes, delim_set, padding, prefix_pad, suffix_pad) - - def _predict_raw_blocks(self, blocks: dict): - gpsn : GlobalProgressiveSortedNeighborhood = GlobalProgressiveSortedNeighborhood(self._pwScheme, self._budget) - candidates : PriorityQueue = gpsn.process(blocks=blocks, data=self.data, tqdm_disable=True, emit_all_tps_stop=self._emit_all_tps_stop) - self.pairs = [] - while(not candidates.empty()): - _, entity_id, candidate_id = candidates.get() - self.pairs.append((entity_id, candidate_id)) - if(self._emit_all_tps_stop): self.true_pair_checked = self.extract_tps_checked(entity=entity_id, neighbor=candidate_id) - + super().__init__(weighting_scheme=weighting_scheme, + window_size=window_size, + similarity_function=similarity_function, + tokenizer=tokenizer, + vectorizer=vectorizer, + qgram=qgram, + similarity_threshold=similarity_threshold, + tokenizer_return_unique_values=tokenizer_return_unique_values, + attributes=attributes) + + def _predict_raw_blocks(self, blocks: dict) -> List[Tuple[float, int, int]]: + gpsn : GlobalProgressiveSortedNeighborhood = GlobalProgressiveSortedNeighborhood(self._weighting_scheme, self._budget) + self.pairs : List[Tuple[float, int, int]] = gpsn.process(blocks=blocks, data=self.data, window_size=self._window_size, tqdm_disable=True, emit_all_tps_stop=self._emit_all_tps_stop) return self.pairs - def _predict_prunned_blocks(self, blocks: dict): + def _predict_prunned_blocks(self, blocks: dict) -> List[Tuple[float, int, int]]: raise NotImplementedError("Sorter Neighborhood Algorithms don't support prunned blocks") - def extract_tps_checked(self, **kwargs) -> dict: - self.true_pair_checked = dict() if self.true_pair_checked is None else self.true_pair_checked - entity = kwargs['entity'] - neighbor = kwargs['neighbor'] - - entity_id = self.data._gt_to_ids_reversed_1[entity] if entity < self.data.dataset_limit else self.data._gt_to_ids_reversed_2[entity] - neighbor_id = 
self.data._gt_to_ids_reversed_1[neighbor] if neighbor < self.data.dataset_limit else self.data._gt_to_ids_reversed_2[neighbor] - _d1_entity, _d2_entity = (entity_id, neighbor_id) if entity < self.data.dataset_limit else (neighbor_id, entity_id) - if _d2_entity in self.data.pairs_of[_d1_entity]: - self.true_pair_checked[canonical_swap(_d1_entity, _d2_entity)] = False - - return self.true_pair_checked - class LocalPSNM(SimilarityBasedProgressiveMatching): """Applies Local Progressive Sorted Neighborhood Matching """ @@ -745,44 +929,35 @@ class LocalPSNM(SimilarityBasedProgressiveMatching): def __init__( self, - budget: int = 0, - pwScheme: str = 'ACF', - metric: str = 'dice', + weighting_scheme: str = 'ACF', + window_size: int = 10, + similarity_function: str = 'dice', tokenizer: str = 'white_space_tokenizer', - similarity_threshold: float = 0.5, - qgram: int = 2, # for jaccard - tokenizer_return_set = True, # unique values or not - attributes: any = None, - delim_set: list = None, # DelimiterTokenizer - padding: bool = True, # QgramTokenizer - prefix_pad: str = '#', # QgramTokenizer (if padding=True) - suffix_pad: str = '$' # QgramTokenizer (if padding=True) + vectorizer : str = None, + qgram : int = 1, + similarity_threshold: float = 0.0, + tokenizer_return_unique_values = True, # unique values or not + attributes: any = None ) -> None: - super().__init__(budget, pwScheme, metric, tokenizer, similarity_threshold, qgram, tokenizer_return_set, attributes, delim_set, padding, prefix_pad, suffix_pad) - - def _predict_raw_blocks(self, blocks: dict): - lpsn : LocalProgressiveSortedNeighborhood = LocalProgressiveSortedNeighborhood(self._pwScheme, self._budget) - candidates : list = lpsn.process(blocks=blocks, data=self.data, tqdm_disable=True, emit_all_tps_stop=self._emit_all_tps_stop) - self.pairs = candidates - if(self._emit_all_tps_stop): self.true_pair_checked = self.extract_tps_checked(candidates=candidates) + super().__init__(weighting_scheme=weighting_scheme, + window_size=window_size, + similarity_function=similarity_function, + tokenizer=tokenizer, + vectorizer=vectorizer, + qgram=qgram, + similarity_threshold=similarity_threshold, + tokenizer_return_unique_values=tokenizer_return_unique_values, + attributes=attributes) + + def _predict_raw_blocks(self, blocks: dict) -> List[Tuple[float, int, int]]: + lpsn : LocalProgressiveSortedNeighborhood = LocalProgressiveSortedNeighborhood(self._weighting_scheme, self._budget) + self.pairs : List[Tuple[float, int, int]] = lpsn.process(blocks=blocks, data=self.data, window_size=self._window_size, tqdm_disable=True, emit_all_tps_stop=self._emit_all_tps_stop) return self.pairs - def _predict_prunned_blocks(self, blocks: dict): + def _predict_prunned_blocks(self, blocks: dict) -> List[Tuple[float, int, int]]: raise NotImplementedError("Sorter Neighborhood Algorithms don't support prunned blocks " + \ "(pre comparison-cleaning entities per block distribution required") - - def extract_tps_checked(self, **kwargs) -> dict: - _tps_checked = dict() - _candidates = kwargs['candidates'] - - for entity, neighbor in _candidates: - entity_id = self.data._gt_to_ids_reversed_1[entity] if entity < self.data.dataset_limit else self.data._gt_to_ids_reversed_2[entity] - neighbor_id = self.data._gt_to_ids_reversed_1[neighbor] if neighbor < self.data.dataset_limit else self.data._gt_to_ids_reversed_2[neighbor] - _d1_entity, _d2_entity = (entity_id, neighbor_id) if entity < self.data.dataset_limit else (neighbor_id, entity_id) - if _d2_entity in 
self.data.pairs_of[_d1_entity]: - _tps_checked[canonical_swap(_d1_entity, _d2_entity)] = False - return _tps_checked class RandomPM(ProgressiveMatching): """Picks a number of random comparisons equal to the available budget """ @@ -792,12 +967,11 @@ class RandomPM(ProgressiveMatching): def __init__( self, - budget: int = 0, - metric: str = 'dice', + similarity_function: str = 'dice', tokenizer: str = 'white_space_tokenizer', - similarity_threshold: float = 0.5, + similarity_threshold: float = 0.0, qgram: int = 2, # for jaccard - tokenizer_return_set = True, # unique values or not + tokenizer_return_unique_values = True, # unique values or not attributes: any = None, delim_set: list = None, # DelimiterTokenizer padding: bool = True, # QgramTokenizer @@ -805,32 +979,19 @@ def __init__( suffix_pad: str = '$' # QgramTokenizer (if padding=True) ) -> None: - super().__init__(budget, metric, tokenizer, similarity_threshold, qgram, tokenizer_return_set, attributes, delim_set, padding, prefix_pad, suffix_pad) + super().__init__(similarity_function, tokenizer, similarity_threshold, qgram, tokenizer_return_unique_values, attributes, delim_set, padding, prefix_pad, suffix_pad) - def _predict_raw_blocks(self, blocks: dict) -> None: + def _predict_raw_blocks(self, blocks: dict) -> List[Tuple[int, int]]: cp : ComparisonPropagation = ComparisonPropagation() cleaned_blocks = cp.process(blocks=blocks, data=self.data, tqdm_disable=True) self._predict_prunned_blocks(cleaned_blocks) - def _predict_prunned_blocks(self, blocks: dict) -> None: + def _predict_prunned_blocks(self, blocks: dict) -> List[Tuple[int, int]]: _all_pairs = [(id1, id2) for id1 in blocks for id2 in blocks[id1]] _total_pairs = len(_all_pairs) random_pairs = sample(_all_pairs, self._budget) if self._budget <= _total_pairs and not self._emit_all_tps_stop else _all_pairs - if(self._emit_all_tps_stop): self.true_pair_checked = self.extract_tps_checked(candidates=random_pairs) self.pairs.add_edges_from(random_pairs) - def extract_tps_checked(self, **kwargs) -> dict: - _tps_checked = dict() - _candidates = kwargs['candidates'] - - for entity, neighbor in _candidates: - entity_id = self.data._gt_to_ids_reversed_1[entity] if entity < self.data.dataset_limit else self.data._gt_to_ids_reversed_2[entity] - neighbor_id = self.data._gt_to_ids_reversed_1[neighbor] if neighbor < self.data.dataset_limit else self.data._gt_to_ids_reversed_2[neighbor] - _d1_entity, _d2_entity = (entity_id, neighbor_id) if entity < self.data.dataset_limit else (neighbor_id, entity_id) - if _d2_entity in self.data.pairs_of[_d1_entity]: - _tps_checked[canonical_swap(_d1_entity, _d2_entity)] = False - return _tps_checked - class PESM(HashBasedProgressiveMatching): """Applies Progressive Entity Scheduling Matching """ @@ -840,189 +1001,369 @@ class PESM(HashBasedProgressiveMatching): "emits the top pair per entity. Finally, traverses the sorted " + \ "entities and emits their comparisons in descending weight order " + \ "within specified budget." 
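# --- Editorial example (not part of the diff) ----------------------------------
# A rough, self-contained sketch of the scheduling idea behind PESM and the
# DatasetScheduler used throughout this module: neighborhoods are ranked by
# average weight, the single best candidate of every entity is emitted first,
# and the remaining comparisons follow until the budget is exhausted. The input
# format (a plain dict of per-entity candidate weights) and the function name
# are hypothetical; the actual behaviour lives in ProgressiveEntityScheduling
# and DatasetScheduler and may differ in detail.
def schedule_emissions(neighborhoods: dict, budget: int) -> list:
    """Return up to `budget` (weight, entity, candidate) triples."""
    # entities in descending order of their neighborhood's average weight
    ranked = sorted(neighborhoods,
                    key=lambda e: sum(neighborhoods[e].values()) / len(neighborhoods[e]),
                    reverse=True)
    emissions = []
    # phase 1: the single best candidate of every entity, in entity rank order
    for entity in ranked:
        best = max(neighborhoods[entity], key=neighborhoods[entity].get)
        emissions.append((neighborhoods[entity][best], entity, best))
    # phase 2: revisit entities in the same order and emit their remaining
    # comparisons in descending weight order
    for entity in ranked:
        rest = sorted(((w, c) for c, w in neighborhoods[entity].items()), reverse=True)[1:]
        emissions.extend((w, entity, c) for w, c in rest)
    return emissions[:budget]

# toy usage: two entities with weighted candidates and a budget of 3 emissions
print(schedule_emissions({0: {5: 0.9, 6: 0.1}, 1: {5: 0.7, 7: 0.6}}, budget=3))
# -> [(0.7, 1, 5), (0.9, 0, 5), (0.6, 1, 7)]
# --------------------------------------------------------------------------------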
+ def __init__( self, - budget: int = 0, - w_scheme: str = 'X2', - metric: str = 'dice', + weighting_scheme: str = 'CBS', + similarity_function: str = 'dice', tokenizer: str = 'white_space_tokenizer', - similarity_threshold: float = 0.5, - qgram: int = 2, # for jaccard - tokenizer_return_set = True, # unique values or not + vectorizer : str = None, + qgram : int = 1, + similarity_threshold: float = 0.0, + tokenizer_return_unique_values = True, # unique values or not attributes: any = None, - delim_set: list = None, # DelimiterTokenizer - padding: bool = True, # QgramTokenizer - prefix_pad: str = '#', # QgramTokenizer (if padding=True) - suffix_pad: str = '$' # QgramTokenizer (if padding=True) ) -> None: - - super().__init__(budget, w_scheme, metric, tokenizer, similarity_threshold, qgram, tokenizer_return_set, attributes, delim_set, padding, prefix_pad, suffix_pad) + super().__init__(weighting_scheme=weighting_scheme, + similarity_function=similarity_function, + tokenizer=tokenizer, + vectorizer=vectorizer, + qgram=qgram, + similarity_threshold=similarity_threshold, + tokenizer_return_unique_values=tokenizer_return_unique_values, + attributes=attributes) - def _predict_raw_blocks(self, blocks: dict) -> None: + def _predict_raw_blocks(self, blocks: dict) -> List[Tuple[int, int]]: - pes : ProgressiveEntityScheduling = ProgressiveEntityScheduling(self._w_scheme, self._budget) - pes.process(blocks=blocks, data=self.data, tqdm_disable=True, cc=None, method=self._method, emit_all_tps_stop=self._emit_all_tps_stop) - self.pairs = pes.produce_pairs() - if(self._emit_all_tps_stop): self.true_pair_checked = self.extract_tps_checked(candidates=self.pairs) - - def _predict_prunned_blocks(self, blocks: dict): + pes : ProgressiveEntityScheduling = ProgressiveEntityScheduling(self._weighting_scheme, self._budget) + self.pairs = pes.process(blocks=blocks, data=self.data, tqdm_disable=True, cc=None, method=self._algorithm, emit_all_tps_stop=self._emit_all_tps_stop) + return self.pairs + + def _predict_prunned_blocks(self, blocks: dict) -> List[Tuple[int, int]]: return self._predict_raw_blocks(blocks) # raise NotImplementedError("Sorter Neighborhood Algorithms doesn't support prunned blocks (lack of precalculated weights)") + +# class WhooshPM(BlockIndependentPM): +# """Applies progressive index based matching using whoosh library +# """ + +# _method_name: str = "Whoosh Progressive Matching" +# _method_info: str = "Applies Whoosh Progressive Matching - Indexes the entities of the second dataset, " + \ +# "stores their specified attributes, " + \ +# "defines a query for each entity of the first dataset, " + \ +# "and retrieves its pair candidates from the indexer within specified budget" + +# def __init__( +# self, +# similarity_function: str = 'WH-TF-IDF', +# number_of_nearest_neighbors: int = 10, +# tokenizer: str = 'white_space_tokenizer', +# similarity_threshold: float = 0.0, +# qgram: int = 2, # for jaccard +# tokenizer_return_unique_values = True, # unique values or not +# attributes: any = None, +# delim_set: list = None, # DelimiterTokenizer +# padding: bool = True, # QgramTokenizer +# prefix_pad: str = '#', # QgramTokenizer (if padding=True) +# suffix_pad: str = '$' # QgramTokenizer (if padding=True) +# ) -> None: +# # budget set to float('inf') implies unlimited budget +# super().__init__(similarity_function, tokenizer, similarity_threshold, qgram, tokenizer_return_unique_values, attributes, delim_set, padding, prefix_pad, suffix_pad) +# self._number_of_nearest_neighbors : int = 
number_of_nearest_neighbors + +# def _set_whoosh_datasets(self) -> None: +# """Saves the rows of both datasets corresponding to the indices of the entities that have been retained after comparison cleaning +# """ - def extract_tps_checked(self, **kwargs) -> dict: - _tps_checked = dict() - _candidates = kwargs['candidates'] +# self._whoosh_d1 = self.data.dataset_1[self.attributes + [self.data.id_column_name_1]] if self.attributes else self.data.dataset_1 +# self._whoosh_d1 = self._whoosh_d1[self._whoosh_d1[self.data.id_column_name_1].isin(self._whoosh_d1_retained_index)] +# if(not self.data.is_dirty_er): +# self._whoosh_d2 = self.data.dataset_2[self.attributes + [self.data.id_column_name_2]] if self.attributes else self.data.dataset_2 +# self._whoosh_d2 = self._whoosh_d2[self._whoosh_d2[self.data.id_column_name_2].isin(self._whoosh_d2_retained_index)] - for entity, neighbor in _candidates: - entity_id = self.data._gt_to_ids_reversed_1[entity] if entity < self.data.dataset_limit else self.data._gt_to_ids_reversed_2[entity] - neighbor_id = self.data._gt_to_ids_reversed_1[neighbor] if neighbor < self.data.dataset_limit else self.data._gt_to_ids_reversed_2[neighbor] - _d1_entity, _d2_entity = (entity_id, neighbor_id) if entity < self.data.dataset_limit else (neighbor_id, entity_id) - if _d2_entity in self.data.pairs_of[_d1_entity]: - _tps_checked[canonical_swap(_d1_entity, _d2_entity)] = False - return _tps_checked + +# def _set_retained_entries(self) -> None: +# """Saves the indices of entities of both datasets that have been retained after comparison cleaning +# """ +# self._whoosh_d1_retained_index = pd.Index([self.data._gt_to_ids_reversed_1[id] +# for id in self._si.d1_retained_ids]) + +# if(not self.data.is_dirty_er): +# self._whoosh_d2_retained_index = pd.Index([self.data._gt_to_ids_reversed_2[id] +# for id in self._si.d2_retained_ids]) -class WhooshPM(ProgressiveMatching): - """Applies progressive index based matching using whoosh library - """ +# def _initialize_index_path(self): +# """Creates index directory if non-existent, constructs the absolute path to the current whoosh index +# """ +# global INDEXER_DIR +# INDEXER_DIR = os.path.abspath(INDEXER_DIR) +# _d1_name = self.data.dataset_name_1 if self.data.dataset_name_1 is not None else 'd3' +# self._index_path = os.path.join(INDEXER_DIR, _d1_name if self.data.is_dirty_er else (_d1_name + (self.data.dataset_name_2 if self.data.dataset_name_2 is not None else 'd4'))) +# if not os.path.exists(self._index_path): +# print('Created index directory at: ' + self._index_path) +# os.makedirs(self._index_path, exist_ok=True) + + +# def _create_index(self): +# """Defines the schema [ID, CONTENT], creates the index in the defined path +# and populates it with all the entities of the target dataset (first - Dirty ER, second - Clean ER) +# """ +# self._schema = Schema(ID=ID(stored=True), content=TEXT(stored=True)) +# self._index = create_in(self._index_path, self._schema) +# writer = self._index.writer() + +# _target_dataset = self._whoosh_d1 if self.data.is_dirty_er else self._whoosh_d2 +# _id_column_name = self.data.id_column_name_1 if self.data.is_dirty_er else self.data.id_column_name_2 + +# for _, entity in _target_dataset.iterrows(): +# entity_values = [str(entity[column]) for column in _target_dataset.columns if column != _id_column_name] +# writer.add_document(ID=entity[_id_column_name], content=' '.join(entity_values)) +# writer.commit() + +# def _populate_whoosh_dataset(self) -> None: +# """For each retained entity in the first dataset, 
construct a query with its text content, +# parses it to the indexers, retrieves best candidates and stores them in entity's neighborhood. +# Populates a list with all the retrieved pairs. +# """ +# # None value for budget implies unlimited budget in whoosh +# _query_budget = self._number_of_nearest_neighbors + +# if(self.similarity_function not in whoosh_similarity_function): +# print(f'{self.similarity_function} Similarity Function is Undefined') +# self.similarity_function = 'Frequency' +# print(f'Applying {self.similarity_function} Similarity Function') +# _scorer = whoosh_similarity_function[self.similarity_function] + +# with self._index.searcher(weighting=_scorer) as searcher: +# self._parser = qparser.QueryParser('content', schema=self._index.schema, group=qparser.OrGroup) +# for _, entity in self._whoosh_d1.iterrows(): +# entity_values = [str(entity[column]) for column in self._whoosh_d1.columns if column != self.data.id_column_name_1] +# entity_string = ' '.join(entity_values) +# entity_id = entity[self.data.id_column_name_1] +# entity_query = self._parser.parse(entity_string) +# query_results = searcher.search(entity_query, limit = _query_budget) + +# for neighbor in query_results: +# _score = neighbor.score +# _neighbor_id = neighbor['ID'] +# self.pairs.append((_score, self.data._ids_mapping_1[entity], self.data._ids_mapping_2[_neighbor_id])) + +# def _predict_raw_blocks(self, blocks: dict) -> List[Tuple[int, int]]: +# self._start_time = time() +# self._si = SubsetIndexer(blocks=blocks, data=self.data, subset=False) +# self._set_retained_entries() +# self._set_whoosh_datasets() +# self._initialize_index_path() +# self._create_index() +# self.pairs : List[Tuple[float, int, int]] = [] +# self._budget = float('inf') if self._emit_all_tps_stop else self._budget +# self._populate_whoosh_dataset() +# self.execution_time = time() - self._start_time +# return self.pairs + +# def _predict_prunned_blocks(self, blocks: dict) -> List[Tuple[int, int]]: +# self._predict_raw_blocks(blocks) - _method_name: str = "Whoosh Progressive Matching" - _method_info: str = "Applies Whoosh Progressive Matching - Indexes the entities of the second dataset, " + \ - "stores their specified attributes, " + \ - "defines a query for each entity of the first dataset, " + \ - "and retrieves its pair candidates from the indexer within specified budget" +class TopKJoinPM(ProgressiveMatching): + """Applies index based matching for ES, emits candidate pairs using defined budget/emission technique + """ + _method_name: str = "Top-K Join Progressive Matching" + _method_info: str = "Applies index based matching for ES, emits candidate pairs using defined budget/emission technique" def __init__( self, - budget: int = 0, - metric: str = 'TF-IDF', - tokenizer: str = 'white_space_tokenizer', - similarity_threshold: float = 0.5, - qgram: int = 2, # for jaccard - tokenizer_return_set = True, # unique values or not + similarity_function: str = 'dice', + number_of_nearest_neighbors : int = 10, + tokenizer: str = None, + weighting_scheme : str = None, + qgram : int = 1, + similarity_threshold: float = 0.0, + tokenizer_return_unique_values = True, # unique values or not attributes: any = None, - delim_set: list = None, # DelimiterTokenizer - padding: bool = True, # QgramTokenizer - prefix_pad: str = '#', # QgramTokenizer (if padding=True) - suffix_pad: str = '$' # QgramTokenizer (if padding=True) ) -> None: - # budget set to float('inf') implies unlimited budget - super().__init__(budget, metric, tokenizer, similarity_threshold, 
qgram, tokenizer_return_set, attributes, delim_set, padding, prefix_pad, suffix_pad) - - def _set_whoosh_datasets(self) -> None: - """Saves the rows of both datasets corresponding to the indices of the entities that have been retained after comparison cleaning - """ + + super().__init__(similarity_function=similarity_function, + tokenizer=tokenizer, + vectorizer=weighting_scheme, + qgram=qgram, + similarity_threshold=similarity_threshold, + tokenizer_return_unique_values=tokenizer_return_unique_values, + attributes=attributes) - self._whoosh_d1 = self.data.dataset_1[self.attributes + [self.data.id_column_name_1]] if self.attributes else self.data.dataset_1 - self._whoosh_d1 = self._whoosh_d1[self._whoosh_d1[self.data.id_column_name_1].isin(self._whoosh_d1_retained_index)] - if(not self.data.is_dirty_er): - self._whoosh_d2 = self.data.dataset_2[self.attributes + [self.data.id_column_name_2]] if self.attributes else self.data.dataset_2 - self._whoosh_d2 = self._whoosh_d2[self._whoosh_d2[self.data.id_column_name_2].isin(self._whoosh_d2_retained_index)] + self.similarity_function : str = similarity_function + self.number_of_nearest_neighbors : int = number_of_nearest_neighbors + self.weighting_scheme : str = weighting_scheme + self.qgram : int = qgram - - def _set_retained_entries(self) -> None: - """Saves the indices of entities of both datasets that have been retained after comparison cleaning - """ - self._whoosh_d1_retained_index = pd.Index([self.data._gt_to_ids_reversed_1[id] - for id in self._si.d1_retained_ids]) + def _predict_raw_blocks(self, blocks: dict, load_neighborhoods : bool = True) -> List[Tuple[int, int]]: - if(not self.data.is_dirty_er): - self._whoosh_d2_retained_index = pd.Index([self.data._gt_to_ids_reversed_2[id] - for id in self._si.d2_retained_ids]) - - - def _initialize_index_path(self): - """Creates index directory if non-existent, constructs the absolute path to the current whoosh index - """ - global INDEXER_DIR - INDEXER_DIR = os.path.abspath(INDEXER_DIR) - _d1_name = self.data.dataset_name_1 if self.data.dataset_name_1 is not None else 'd3' - self._index_path = os.path.join(INDEXER_DIR, _d1_name if self.data.is_dirty_er else (_d1_name + (self.data.dataset_name_2 if self.data.dataset_name_2 is not None else 'd4'))) - if not os.path.exists(self._index_path): - print('Created index directory at: ' + self._index_path) - os.makedirs(self._index_path, exist_ok=True) + _store_neighborhoods : bool = load_neighborhoods + _loaded_neighborhoods : dict[List[Tuple[float, int]]] - - def _create_index(self): - """Defines the schema [ID, CONTENT], creates the index in the defined path - and populates it with all the entities of the target dataset (first - Dirty ER, second - Clean ER) - """ - self._schema = Schema(ID=ID(stored=True), content=TEXT(stored=True)) - self._index = create_in(self._index_path, self._schema) - writer = self._index.writer() + if(load_neighborhoods): + print("Neighborhood Retrieval Enabled...") + _loaded_neighborhoods = self.retrieve_neighborhoods_from_disk() + else: + print("Neighborhood Retrieval Disabled...") + _loaded_neighborhoods = None - _target_dataset = self._whoosh_d1 if self.data.is_dirty_er else self._whoosh_d2 - _id_column_name = self.data.id_column_name_1 if self.data.is_dirty_er else self.data.id_column_name_2 + if(_loaded_neighborhoods is None): + ptkj : PETopKJoin = PETopKJoin(K=self.number_of_nearest_neighbors, + metric=self.similarity_function, + tokenization=self.tokenizer, + qgrams=self.qgram) + + _pet_vectorizer = 
self.initialize_vectorizer() if (self.weighting_scheme is not None) else None + self.pairs = ptkj.fit(data=self.data, + reverse_order=True, + attributes_1=self.data.attributes_1, + attributes_2=self.data.attributes_2, + vectorizer=_pet_vectorizer, + store_neighborhoods=_store_neighborhoods) + + if(_store_neighborhoods): + self.pairs = self.neighborhoods_to_pairs(neighborhoods=ptkj.neighborhoods, strict_top_k=True) + self.neighborhoods_to_json(neighborhoods=ptkj.neighborhoods) + else: + self.pairs = [(edge[2]['weight'], edge[0], edge[1]) for edge in self.pairs.edges(data=True)] + else: + self.pairs = self.neighborhoods_to_pairs(neighborhoods=_loaded_neighborhoods, strict_top_k=True) + + return self.pairs + + def _predict_prunned_blocks(self, blocks: dict) -> List[Tuple[int, int]]: + raise NotImplementedError("Progressive TopKJoin PM for prunned blocks - Not implemented yet!") - for _, entity in _target_dataset.iterrows(): - entity_values = [str(entity[column]) for column in _target_dataset.columns if column != _id_column_name] - writer.add_document(ID=entity[_id_column_name], content=' '.join(entity_values)) - writer.commit() - def _populate_whoosh_dataset(self) -> None: - """For each retained entity in the first dataset, construct a query with its text content, - parses it to the indexers, retrieves best candidates and stores them in entity's neighborhood. - Finally, neighborhoods are sorted in descending order of their average weight + def neighborhoods_to_pairs(self, neighborhoods : dict[List[Tuple[float, int]]], strict_top_k : bool = False) -> List[Tuple[float, int, int]]: + previous_weight = None + _pairs : List[Tuple[float, int, int]] = [] + for d1_id, d2_ids in neighborhoods.items(): + distinct_weights = 0 + _d1_id = int(d1_id) + for current_weight, d2_id in d2_ids: + if(strict_top_k or current_weight != previous_weight): + previous_weight = current_weight + distinct_weights += 1 + if distinct_weights <= self.number_of_nearest_neighbors: + _pairs.append((current_weight, d2_id, _d1_id)) + else: + break + return _pairs + + def neighborhoods_to_json(self, neighborhoods : dict[List[Tuple[float, int]]]) -> None: + """Stores the neighborhood in the corresponding experiment's neighborhoods json file within the hidden .ngbs directory + Args: + neighborhoods (dict[List[Tuple[float, int]]]): Neighborhoods of indexed entities of current experiment, dictionary in the form + [indexed entity id] -> [sorted target dataset neighbors in descending similarity order] """ - # None value for budget implies unlimited budget in whoosh - _query_budget = None if is_infinite(self._budget) else max(1, 2 * self._budget / len(self._whoosh_d1)) - if(self.metric not in whoosh_similarity_function): - print(f'{self.metric} Similarity Function is Undefined') - self.metric = 'Frequency' - print(f'Applying {self.metric} Similarity Function') - _scorer = whoosh_similarity_function[self.metric] + _json_file_name = '_'.join(self._requested_file_components) - with self._index.searcher(weighting=_scorer) as searcher: - self._parser = qparser.QueryParser('content', schema=self._index.schema, group=qparser.OrGroup) - for _, entity in self._whoosh_d1.iterrows(): - entity_values = [str(entity[column]) for column in self._whoosh_d1.columns if column != self.data.id_column_name_1] - entity_string = ' '.join(entity_values) - entity_id = entity[self.data.id_column_name_1] - entity_query = self._parser.parse(entity_string) - query_results = searcher.search(entity_query, limit = _query_budget) - - for neighbor in query_results: - 
_score = neighbor.score
-                    _neighbor_id = neighbor['ID']
-                    self._sorted_dataset._insert_entity_neighbor(entity=entity_id, neighbor=_neighbor_id, weight=_score)
+        neighborhoods_directory_path = os.path.join(os.getcwd(), ".ngbs")
+        os.makedirs(neighborhoods_directory_path, exist_ok=True)
-        self._sorted_dataset._sort_neighborhoods_by_avg_weight()
+        _json_store_path = os.path.join(neighborhoods_directory_path, _json_file_name)
+        print(f"Storing Neighborhood Json in -> {_json_store_path}")
+        with open(_json_store_path, 'w') as json_file:
+            json.dump(neighborhoods, json_file, indent=4)
+
+    def matching_file_components(self,
+                                 source_components : List[str],
+                                 target_components : List[str],
+                                 variable_component_index : int = 6) -> bool:
+        """Takes as input lists containing the components of the source and target file names (strings connected by underscores).
+           Checks whether those components match (files are equivalent). The variable component (number of nearest neighbors) must be less than or equal
+           to the target component.
+        Args:
+            source_components (List[str]): Components (substrings separated by underscore) that constitute source file name
+            target_components (List[str]): Components (substrings separated by underscore) that constitute target file name
+            variable_component_index (int, optional): Index in file name's components list where the variable component is placed (number of nearest neighbors)
+        Returns:
+            bool: Source and target file name components are equivalent (target file can be loaded for source file request)
+        """
+        number_pattern = r"[-+]?\d*\.\d+|\d+"
+        zipped_components = list(zip(source_components, target_components))
+        matching_components = True
-    def _emit_pairs(self) -> None:
-        """Returns a list of candidate pairs that have been emitted following the requested method"""
-        self.pairs = self._sorted_dataset._emit_pairs(method=self._method, data=self.data)
-
-    def _predict_raw_blocks(self, blocks: dict) -> None:
-        self._start_time = time()
-        self._si = SubsetIndexer(blocks=blocks, data=self.data, subset=False)
-        self._set_retained_entries()
-        self._set_whoosh_datasets()
-        self._initialize_index_path()
-        self._create_index()
-        self._to_emit_pairs : List[Tuple[int, int]] = []
-        self._budget = float('inf') if self._emit_all_tps_stop else self._budget
-        self._sorted_dataset = WhooshDataset(list(self._whoosh_d1_retained_index), self._budget)
-        self._populate_whoosh_dataset()
-        self._emit_pairs()
-        self.execution_time = time() - self._start_time
-        if(self._emit_all_tps_stop): self.true_pair_checked = self.extract_tps_checked(candidates=self.pairs)
+        for index, components in enumerate(zipped_components):
+            source_component, target_component = components
+
+            if(index == variable_component_index):
+                source_nns = int((re.findall(number_pattern, source_component))[0])
+                target_nns = int((re.findall(number_pattern, target_component))[0])
+                if(source_nns > target_nns):
+                    matching_components = False
+                    break
+            else:
+                if(source_component != target_component):
+                    matching_components = False
+                    break
+        return matching_components
+
+    def retrieve_neighborhoods_from_disk(self) -> dict[List[Tuple[float, int]]]:
+        """Attempts to retrieve precalculated neighborhoods for the indexed entities of the current experiment
+        Returns:
+            dict[List[Tuple[float, int]]]: Dictionary of neighborhoods for each indexed entity containing a sorted list of neighbors in descending similarity order
+        """
+        self._requested_file_components = [self._indexing,
+                                           self.dataset_identifier,
+                                           self.weighting_scheme,
+                                           
self.tokenizer.split('_')[0], + self.similarity_function, + "q" + str(self.qgram), + "n" + str(self.number_of_nearest_neighbors) + ".json"] - def _predict_prunned_blocks(self, blocks: dict) -> None: - self._predict_raw_blocks(blocks) + _neighbors_count_index : int = len(self._requested_file_components) - 1 + neighborhoods_directory_path : str = os.path.join(os.getcwd(), ".ngbs") + _matching_neighborhood_file_name : str = None + _matching_neighborhood : dict[List[Tuple[float, int]]] = None - def extract_tps_checked(self, **kwargs) -> dict: - _tps_checked = dict() - _candidates = kwargs['candidates'] + os.makedirs(neighborhoods_directory_path, exist_ok=True) + print(f"Searching for matching neighborhood file in -> {neighborhoods_directory_path}") - for entity, neighbor in _candidates: - entity_id = self.data._gt_to_ids_reversed_1[entity] if entity < self.data.dataset_limit else self.data._gt_to_ids_reversed_2[entity] - neighbor_id = self.data._gt_to_ids_reversed_1[neighbor] if neighbor < self.data.dataset_limit else self.data._gt_to_ids_reversed_2[neighbor] - _d1_entity, _d2_entity = (entity_id, neighbor_id) if entity < self.data.dataset_limit else (neighbor_id, entity_id) - if _d2_entity in self.data.pairs_of[_d1_entity]: - _tps_checked[canonical_swap(_d1_entity, _d2_entity)] = False - return _tps_checked + if os.path.isdir(neighborhoods_directory_path): + neighborhoods_file_names = os.listdir(neighborhoods_directory_path) + + for neighborhood_file_name in neighborhoods_file_names: + _neighborhood_file_components = neighborhood_file_name.split('_') + if(self.matching_file_components(source_components=self._requested_file_components, + target_components=_neighborhood_file_components, + variable_component_index=_neighbors_count_index)): + _matching_neighborhood_file_name = neighborhood_file_name + break + if(_matching_neighborhood_file_name is not None): + _matching_neighborhood_file_path = os.path.join(neighborhoods_directory_path, _matching_neighborhood_file_name) + if(os.path.exists(_matching_neighborhood_file_path) and os.path.isfile(_matching_neighborhood_file_path)): + with open(_matching_neighborhood_file_path, 'r') as neighborhood_file: + _matching_neighborhood = json.load(neighborhood_file) + print(f"Retrieved matching neighborhood from -> {_matching_neighborhood_file_path}!") + else: + print(f"Matching Neighborhood File not found - Executing Joins Algorithm!") + + return _matching_neighborhood + + def initialize_vectorizer(self) -> FrequencyEvaluator: + self.vectorizer : FrequencyEvaluator = FrequencyEvaluator(vectorizer=self.weighting_scheme, + tokenizer=self.tokenizer, + qgram=self.qgram) + d1 = self.data.dataset_1[self.data.attributes_1] if self.data.attributes_1 is not None else self.data.dataset_1 + self._entities_d1 = d1 \ + .apply(" ".join, axis=1) \ + .apply(lambda x: x.lower()) \ + .values.tolist() + d2 = self.data.dataset_2[self.data.attributes_2] if self.data.attributes_2 is not None else self.data.dataset_2 + self._entities_d2 = d2 \ + .apply(" ".join, axis=1) \ + .apply(lambda x: x.lower()) \ + .values.tolist() if not self.data.is_dirty_er else None + self.vectorizer.fit(metric=self.similarity_function, + dataset_identifier=self.dataset_identifier, + indexing=self._indexing, + d1_entities=self._entities_d1, + d2_entities=self._entities_d2) + return self.vectorizer + +class_references = { + 'GlobalTopPM' : GlobalTopPM, + 'LocalTopPM' : LocalTopPM, + 'GlobalPSNM' : GlobalPSNM, + 'LocalPSNM' : LocalPSNM, + 'PESM' : PESM, + 'EmbeddingsNNBPM' : EmbeddingsNNBPM, + 
'TopKJoinPM' : TopKJoinPM +} diff --git a/docs/pyjedai/schema_matching.py b/docs/pyjedai/schema_matching.py new file mode 100644 index 0000000..9459821 --- /dev/null +++ b/docs/pyjedai/schema_matching.py @@ -0,0 +1,148 @@ +"""Schema Matching methods +""" +import pandas as pd +import valentine +from valentine.algorithms.base_matcher import BaseMatcher +from valentine.algorithms.coma.coma import Coma +from valentine.algorithms.cupid.cupid_model import Cupid +from valentine.algorithms.distribution_based.distribution_based import DistributionBased +from valentine.algorithms.jaccard_levenshtein.jaccard_leven import JaccardLevenMatcher +from valentine.algorithms.similarity_flooding.similarity_flooding import SimilarityFlooding +import valentine.metrics as valentine_metrics +from pandas import DataFrame, concat + +from .datamodel import Block, Data, PYJEDAIFeature +from .evaluation import Evaluation +from abc import abstractmethod + +class AbstractSchemaMatching(PYJEDAIFeature): + """Abstract class for schema matching methods + """ + + @abstractmethod + def evaluate(self, + prediction=None, + export_to_df: bool = False, + export_to_dict: bool = False, + with_classification_report: bool = False, + verbose: bool = True) -> any: + pass + + @abstractmethod + def _configuration(self) -> dict: + pass + + @abstractmethod + def stats(self) -> None: + pass + + @abstractmethod + def process(self, + data: Data, + ) -> list: + pass + + @abstractmethod + def process_sm_weighted(self, + data: Data): + pass + + def __init__(self): + super().__init__() + + +class ValentineMethodBuilder(PYJEDAIFeature): + """Class to provide valentine matching methods + """ + + def evaluate(self, + prediction=None, + export_to_df: bool = False, + export_to_dict: bool = False, + with_classification_report: bool = False, + verbose: bool = True) -> any: + pass + + def _configuration(self) -> dict: + pass + + def __init__(self): + super().__init__() + + @staticmethod + def coma_matcher(max_n: int = 0, + strategy: str = "COMA_OPT" + ) -> Coma: + return Coma(max_n, strategy) + + @staticmethod + def cupid_matcher(w_struct: float = 0.2, + leaf_w_struct: float = 0.2, + th_accept: float = 0.7 + ) -> Cupid: + return Cupid(w_struct, leaf_w_struct, th_accept) + + @staticmethod + def distribution_based_matcher(threshold1: float = 0.15, + threshold2: float = 0.15 + ) -> DistributionBased: + return DistributionBased(threshold1, threshold2) + + @staticmethod + def jaccard_leven_matcher(threshold_leven: float = 0.8) -> JaccardLevenMatcher: + return JaccardLevenMatcher(threshold_leven) + + @staticmethod + def similarity_flooding_mathcer(coeff_policy: str = "inverse_average", + formula: str = "formula_c") -> SimilarityFlooding: + return SimilarityFlooding(coeff_policy, formula) + +class ValentineSchemaMatching(AbstractSchemaMatching): + """Class for schema matching methods provided by Valentine + """ + + def __init__(self, matcher: BaseMatcher): + super().__init__() + self.data: Data = None + self.matcher: BaseMatcher = matcher + self.matches = None + self.top_columns: list = [] + + def process(self, + data: Data, + ) -> list: + self.data = data + df1 = self.data.dataset_1 + df2 = self.data.dataset_2 + self.matches = valentine.valentine_match(df1, df2, self.matcher) + self.top_columns = [[x[0][1] for x in self.matches.keys()], [x[1][1] for x in self.matches.keys()]] + return self.top_columns + + def process_sm_weighted(self, data: Data): + pass + + def print_matches(self): + print(self.matches) + + def evaluate(self, + prediction=None, + 
export_to_df: bool = False, + export_to_dict: bool = False, + with_classification_report: bool = False, + verbose: bool = True) -> any: + + if self.data is None: + raise AttributeError("Can not proceed to evaluation without data object.") + + if self.data.ground_truth is None: + raise AttributeError("Can not proceed to evaluation without a ground-truth file. " + + "Data object has not been initialized with the ground-truth file") + + return valentine_metrics.all_metrics(self.matches, self.data.ground_truth.to_records(index=False).tolist()) + + def _configuration(self) -> dict: + pass + + def stats(self) -> None: + pass + diff --git a/docs/pyjedai/utils.py b/docs/pyjedai/utils.py index 4c89912..6e69d81 100644 --- a/docs/pyjedai/utils.py +++ b/docs/pyjedai/utils.py @@ -1,6 +1,7 @@ from abc import ABC, abstractmethod from collections import defaultdict - +from sklearn.metrics.pairwise import pairwise_distances +from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer import numpy as np import re from nltk import ngrams @@ -13,8 +14,14 @@ import sys from time import time from networkx import Graph +import inspect from ordered_set import OrderedSet +import uuid +import os +import json +import copy from math import floor +import pandas as pd # ----------------------- # # Constants # ----------------------- # @@ -317,7 +324,7 @@ def __init__(self, num_of_entities: int, sorted_entities: List[int]) -> None: def get_positions(self, entity: int): return self._entity_positions[entity] -class WhooshNeighborhood(ABC): +class EntityScheduler(ABC): """Stores information about the neighborhood of a given entity ID: - ID : The identifier of the entity as it is defined within the original dataframe - Total Weight : The total weight of entity's neighbors @@ -329,50 +336,30 @@ class WhooshNeighborhood(ABC): ABC (ABC): ABC Module """ - def __init__(self, id : int, budget : float) -> None: + def __init__(self, id : int) -> None: self._id : int = id - self._budget : float = budget - self._neighbors : PriorityQueue = PriorityQueue(self._budget) if not is_infinite(self._budget) else PriorityQueue() - self._insert_stage : bool = True - self._minimum_weight : float = sys.float_info.min + self._neighbors : PriorityQueue = PriorityQueue() self._neighbors_num : int = 0 self._total_weight : float = 0.0 self._average_weight : float = None def _insert(self, neighbor_id: int, weight : float) -> None: - if(not self._insert_stage): self._change_state() - - if weight >= self._minimum_weight: - self._neighbors.put((weight, neighbor_id)) - if self._neighbors.qsize() > self._budget: - self._minimum_weight = self._neighbors.get()[0] - + self._neighbors.put((-weight, neighbor_id)) self._update_neighbors_counter_by(1) self._update_total_weight_by(weight) - def _pop(self) -> None: - if(self._insert_stage): self._change_state() - + def _pop(self) -> Tuple[float, int]: if(self._empty()): raise ValueError("No neighbors to pop!") _weight, _neighbor_id = self._neighbors.get() + self._update_neighbors_counter_by(-1) + self._update_total_weight_by(_weight) + return -_weight, _neighbor_id def _empty(self) -> bool: return self._neighbors.empty() - - def _change_state(self) -> None: - "Neighborhood can either be accepting or emitting neighbors" + \ - "Accepting Stage - Neighbors stored in ascending weight order" + \ - "Emitting Stage - Neighbors stored in descending weight order" - _neighbors_resorted : PriorityQueue = PriorityQueue(int(self._budget)) if not is_infinite(self._budget) else PriorityQueue() - while(not 
self._neighbors.empty()): - _weight, _neighbor_id = self._neighbors.get() - _neighbors_resorted.put((-_weight, _neighbor_id)) - - self._neighbors = _neighbors_resorted - self._insert_stage = not self._insert_stage def _update_total_weight_by(self, weight) -> None: self._total_weight = self._total_weight + weight @@ -394,31 +381,31 @@ def _get_average_weight(self) -> float: return self._average_weight def __eq__(self, other): - if isinstance(other, WhooshNeighborhood): + if isinstance(other, EntityScheduler): return self._get_average_weight() == other._get_average_weight() return NotImplemented def __lt__(self, other): - if isinstance(other, WhooshNeighborhood): + if isinstance(other, EntityScheduler): return self._get_average_weight() < other._get_average_weight() return NotImplemented def __gt__(self, other): - if isinstance(other, WhooshNeighborhood): + if isinstance(other, EntityScheduler): return self._get_average_weight() > other._get_average_weight() return NotImplemented def __le__(self, other): - if isinstance(other, WhooshNeighborhood): + if isinstance(other, EntityScheduler): return self._get_average_weight() <= other._get_average_weight() return NotImplemented def __ge__(self, other): - if isinstance(other, WhooshNeighborhood): + if isinstance(other, EntityScheduler): return self._get_average_weight() >= other._get_average_weight() return NotImplemented -class WhooshDataset(ABC): +class DatasetScheduler(ABC): """Stores a dictionary [Entity -> Entity's Neighborhood Data (Whoosh Neighborhood)] Supplies auxiliarry functions for information retrieval from the sorted dataset @@ -426,26 +413,36 @@ class WhooshDataset(ABC): ABC (ABC): ABC Module """ - def __init__(self, entity_ids : List[int], budget : float) -> None: + def __init__(self, budget : float = float('inf'), entity_ids : List[int] = [], global_top : bool = False) -> None: self._budget : float = budget self._total_entities : int = len(entity_ids) - self._entity_budget : float = budget if is_infinite(self._budget) else max(1, 2 * self._budget / self._total_entities) self._neighborhoods : dict = {} + # global emission case + self._global_top : bool = global_top + self._all_candidates = PriorityQueue() if self._global_top else None for entity_id in entity_ids: - self._neighborhoods[entity_id] = WhooshNeighborhood(id=entity_id, budget=self._entity_budget) + self._neighborhoods[entity_id] = EntityScheduler(id=entity_id) # used in defining proper emission strategy self._sorted_entities : List[int] = None self._current_neighborhood_index : int = 0 self._current_entity : int = None - self._current_neighborhood : WhooshNeighborhood = None + self._current_neighborhood : EntityScheduler = None def _insert_entity_neighbor(self, entity : int, neighbor : int, weight : float) -> None: - self._neighborhoods[entity]._insert(neighbor, weight) + if(not self._global_top): + if(entity not in self._neighborhoods): + _new_neighborhood : EntityScheduler = EntityScheduler(entity) + _new_neighborhood._insert(neighbor, weight) + self._neighborhoods[entity] = _new_neighborhood + else: + self._neighborhoods[entity]._insert(neighbor, weight) + else: + self._all_candidates.put((-weight, entity, neighbor)) def _pop_entity_neighbor(self, entity : int) -> Tuple[float, int]: return self._neighborhoods[entity]._pop() - def _get_entity_neighborhood(self, entity : int) -> WhooshNeighborhood: + def _get_entity_neighborhood(self, entity : int) -> EntityScheduler: return self._neighborhoods[entity] def _entity_has_neighbors(self, entity : int) -> bool: @@ -455,7 
+452,7 @@ def _sort_neighborhoods_by_avg_weight(self) -> None: """Store a list of entity ids sorted in descending order of the average weight of their corresponding neighborhood""" self._sorted_entities : List = sorted(self._neighborhoods, key=lambda entity: self._neighborhoods[entity]._get_average_weight(), reverse=True) - def _get_current_neighborhood(self) -> WhooshNeighborhood: + def _get_current_neighborhood(self) -> EntityScheduler: return self._neighborhoods[self._current_entity] def _enter_next_neighborhood(self) -> None: @@ -466,67 +463,126 @@ def _enter_next_neighborhood(self) -> None: self._current_entity = self._sorted_entities[self._current_neighborhood_index] self._current_neighborhood = self._neighborhoods[self._current_entity] - def _successful_emission(self, pair : Tuple[int, int]) -> bool: - - _entity, _neighbor = pair - _entity_id = self._data._ids_mapping_1[_entity] - _neighbor_id = self._data._ids_mapping_1[_neighbor] if self._data.is_dirty_er else self._data._ids_mapping_2[_neighbor] + def _successful_emission(self, pair : Tuple[float, int, int]) -> bool: + _score, _entity, _neighbor = pair if(self._emitted_comparisons < self._budget): - self._emitted_pairs.append((_entity_id, _neighbor_id)) + self._emitted_pairs.append((_score, _entity, _neighbor)) self._emitted_comparisons += 1 return True else: return False - def _emit_pairs(self, method : str, data : Data) -> List[Tuple[int, int]]: + def _print_info(self): + _n_ids : int + if(self._sorted_entities is None): + print("Neighborhood Status - Not Sorted by average weight") + _n_ids = self._neighborhoods.keys() + else: + print("Neighborhood Status - Sorted by average weight") + _n_ids = self._sorted_entities + for _n_id in _n_ids: + _current_neighborhood = self._neighborhoods[_n_id] + print("#############################") + print(f"Neighborhood[{_n_id}]") + print(f"Total Neighbords[{_current_neighborhood._get_neighbors_num()}]") + print(f"Average Weight[{_current_neighborhood._get_average_weight()}]") + + def _checked_pair(self, entity : int, candidate : int) -> bool: + """Checks if the given pair has been checked previously in the scheduling process. + In the case the given pair has been constructed in the reverse indexing context, + proper translation to inorder indexing identification is done for correct checking. + Finally, if the pair has not been checked in the past, it is added to the checked pool. + Args: + entity (int): Entity ID + candidate (int): Candidate ID + + Returns: + bool: Given pair has already been checked in the scheduling process + """ + _d1_inorder_entity, _d2_inorder_entity = self._get_inorder_representation(entity, candidate) + + if((_d1_inorder_entity, _d2_inorder_entity) not in self._checked_entities): + self._checked_entities.add((_d1_inorder_entity, _d2_inorder_entity)) + return False + else: + return True + + def _get_inorder_representation(self, entity : int, candidate : int) -> Tuple[int, int]: + """Takes as input the ID of the entity of the first and second dataset in its schedule indexing context (in that order!). 
+ Returns the ids of given entities in the inorder context, + in the following order (id of the entity of the first dataset in the inorder context, -//- second -//-) + Args: + entity (int): Entity ID + candidate (int): Candidate ID + + Returns: + Tuple[int, int]: (id of entity of first dataframe, id of entity of second dataframe) in inorder context + """ + if(entity < self._data.num_of_entities): return entity, candidate + + # reverse context case + # - number of entities (to transfer the IDs from Scheduler -> Workflow ID representation) + # + / - dataset limit in order to express (D1 in reverse context == D2 in inorder context, and the reverse) + entity = entity - self._data.num_of_entities + self._data.num_of_entities_2 + candidate = candidate - self._data.num_of_entities - self._data.num_of_entities_1 + + return candidate, entity + + + def _emit_pairs(self, method : str, data : Data) -> List[Tuple[float, int, int]]: """Emits candidate pairs according to specified method Args: method (str): Emission Method - data (Data): Dataset Module + data (Data): Dataset Module of the Returns: List[Tuple[int, int]]: List of candidate pairs """ self._method : str = method - self._data : Data = data - self._emitted_pairs = [] self._emitted_comparisons = 0 + self._checked_entities = set() + self._data : Data = data + + if(self._method == 'TOP'): + while(not self._all_candidates.empty()): + score, sorted_entity, neighbor = self._all_candidates.get() + if(not self._checked_pair(sorted_entity, neighbor)): + if(not self._successful_emission(pair=(-score, sorted_entity, neighbor))): + return self._emitted_pairs + + return self._emitted_pairs + if(self._method == 'HB'): for sorted_entity in self._sorted_entities: if(self._entity_has_neighbors(sorted_entity)): - _, neighbor = self._pop_entity_neighbor(sorted_entity) - if(not self._successful_emission(pair=(sorted_entity, neighbor))): - return self._emitted_pairs + score, neighbor = self._pop_entity_neighbor(sorted_entity) + if(not self._checked_pair(sorted_entity, neighbor)): + if(not self._successful_emission(pair=(score, sorted_entity, neighbor))): + return self._emitted_pairs - if(self._method == 'HB' or self._method == 'DFS'): - _checked_entity = np.zeros(self._total_entities, dtype=bool) - _sorted_entity_to_index = dict(zip(self._sorted_entities, range(0, self._total_entities))) - - for index, sorted_entity in enumerate(self._sorted_entities): - _checked_entity[index] = True + if(self._method == 'HB' or self._method == 'DFS'): + for sorted_entity in self._sorted_entities: while(self._entity_has_neighbors(sorted_entity)): - _, neighbor = self._pop_entity_neighbor(sorted_entity) - if(neighbor not in _sorted_entity_to_index or _checked_entity[_sorted_entity_to_index[neighbor]]): - if(not self._successful_emission(pair=(sorted_entity, neighbor))): + score, neighbor = self._pop_entity_neighbor(sorted_entity) + if(not self._checked_pair(sorted_entity, neighbor)): + if(not self._successful_emission(pair=(score, sorted_entity, neighbor))): return self._emitted_pairs else: _emissions_left = True - _checked_entities = set() while(_emissions_left): _emissions_left = False for sorted_entity in self._sorted_entities: if(self._entity_has_neighbors(sorted_entity)): - _, neighbor = self._pop_entity_neighbor(sorted_entity) - if(canonical_swap(sorted_entity, neighbor) not in _checked_entities): - if(not self._successful_emission(pair=(sorted_entity, neighbor))): + score, neighbor = self._pop_entity_neighbor(sorted_entity) + if(not self._checked_pair(sorted_entity, 
neighbor)): + if(not self._successful_emission(pair=(score, sorted_entity, neighbor))): return self._emitted_pairs - _checked_entities.add(canonical_swap(sorted_entity, neighbor)) - _emissions_left = True + _emissions_left = True return self._emitted_pairs class PredictionData(ABC): @@ -536,14 +592,10 @@ class PredictionData(ABC): Args: ABC (ABC): ABC Module """ - def __init__(self, name : str, predictions, tps_checked = dict) -> None: - self.set_name(name) - self.set_tps_checked(tps_checked) - self.set_predictions(self._format_predictions(predictions)) - # Pairs have not been emitted yet - Data Module has not been populated with performance data - self.set_total_emissions(None) - self.set_normalized_auc(None) - self.set_cumulative_recall(None) + def __init__(self, matcher, matcher_info : dict) -> None: + self.set_matcher_info(matcher_info) + self.set_duplicate_emitted(matcher.duplicate_emitted) + self.set_candidate_pairs(self._format_predictions(matcher.pairs)) def _format_predictions(self, predictions) -> List[Tuple[int, int]]: """Transforms given predictions into a list of duplets (candidate pairs) @@ -555,48 +607,65 @@ def _format_predictions(self, predictions) -> List[Tuple[int, int]]: Returns: List[Tuple[int, int]]: Formatted Predictions """ - return [edge[:2] for edge in predictions.edges] if isinstance(predictions, Graph) else predictions + return [edge[:3] for edge in predictions.edges] if isinstance(predictions, Graph) else predictions def get_name(self) -> str: - return self._name + _matcher_info : dict = self.get_matcher_info() + if('name' not in _matcher_info): raise ValueError("Matcher doesn't have a name - Make sure its execution data has been calculated") + return _matcher_info['name'] - def get_predictions(self) -> List[Tuple[int, int]]: - return self._predictions + def get_candidate_pairs(self) -> List[Tuple[float, int, int]]: + if(self._candidate_pairs is None): raise ValueError("Pairs not scheduled yet - Cannot retrieve candidate pairs") + return self._candidate_pairs - def get_tps_checked(self) -> dict: - return self._tps_checked + def get_duplicate_emitted(self) -> dict: + if(self._duplicate_emitted is None): raise ValueError("No information about the status of true positives' emission") + return self._duplicate_emitted def get_total_emissions(self) -> int: - if(self._total_emissions is None): raise ValueError("Pairs not emitted yet - Total Emissions are undefined") - return self._total_emissions + _matcher_info : dict = self.get_matcher_info() + if('total_emissions' not in _matcher_info): raise ValueError("Pairs not emitted yet - Total Emissions are undefined") + return _matcher_info['total_emissions'] def get_normalized_auc(self) -> float: - if(self._normalized_auc is None): raise ValueError("Pairs not emitted yet - Normalized AUC is undefined") - return self._normalized_auc + _matcher_info : dict = self.get_matcher_info() + if('auc' not in _matcher_info): raise ValueError("Pairs not emitted yet - Normalized AUC is undefined") + return _matcher_info['auc'] def get_cumulative_recall(self) -> float: - if(self._cumulative_recall is None): raise ValueError("Pairs not emitted yet - Cumulative Recall is undefined") - return self._cumulative_recall + _matcher_info : dict = self.get_matcher_info() + if('recall' not in _matcher_info): raise ValueError("Pairs not emitted yet - Cumulative Recall is undefined") + return _matcher_info['recall'] + + def get_matcher_info(self) -> dict: + if(self._matcher_info is None): raise ValueError("Pairs not emitted yet - Matcher Info is 
undefined") + return self._matcher_info + + def set_matcher_info(self, matcher_info : dict) -> None: + self._matcher_info : dict = matcher_info def set_name(self, name : str): - self._name : str = name + _matcher_info : dict = self.get_matcher_info() + _matcher_info['name'] = name - def set_predictions(self, predictions : List[Tuple[int, int]]) -> None: - self._predictions : List[Tuple[int, int]] = predictions + def set_candidate_pairs(self, candidate_pairs : List[Tuple[float, int, int]]) -> None: + self._candidate_pairs : List[Tuple[float, int, int]] = candidate_pairs - def set_tps_checked(self, tps_checked : dict) -> None: - self._tps_checked : dict = tps_checked + def set_duplicate_emitted(self, duplicate_emitted : dict) -> None: + self._duplicate_emitted : dict = duplicate_emitted def set_total_emissions(self, total_emissions : int) -> None: - self._total_emissions : int = total_emissions + _matcher_info : dict = self.get_matcher_info() + _matcher_info['total_emissions'] = total_emissions def set_normalized_auc(self, normalized_auc : float) -> None: - self._normalized_auc : float = normalized_auc + _matcher_info : dict = self.get_matcher_info() + _matcher_info['auc'] = normalized_auc def set_cumulative_recall(self, cumulative_recall : float) -> None: - self._cumulative_recall : float = cumulative_recall - - + _matcher_info : dict = self.get_matcher_info() + _matcher_info['recall'] = cumulative_recall + def canonical_swap(id1: int, id2: int) -> Tuple[int, int]: """Returns the identifiers in canonical order @@ -619,14 +688,574 @@ def sorted_enumerate(seq, reverse=True): def is_infinite(value : float): return math.isinf(value) and value > 0 +def reverse_data_indexing(data : Data) -> Data: + """Returns a new data model based upon the given data model with reversed indexing of the datasets + Args: + data (Data): input dat a model + + Returns: + Data : New Data Module with reversed indexing + """ + return Data(dataset_1 = data.dataset_2, + id_column_name_1 = data.id_column_name_2, + attributes_1 = data.attributes_2, + dataset_name_1 = data.dataset_name_2, + dataset_2 = data.dataset_1, + attributes_2 = data.attributes_1, + id_column_name_2 = data.id_column_name_1, + dataset_name_2 = data.dataset_name_1, + ground_truth = data.ground_truth) + +def get_class_function_arguments(class_reference, function_name : str) -> List[str]: + """Returns a list of argument names for requested function of the given class + Args: + class_reference: Reference to a class + function_name (str): Name of the requested function + + Returns: + List[str] : List of requested function's arguments' names + """ + if not inspect.isclass(class_reference): + raise ValueError(f"{class_reference.__name__} class reference is not valid.") + if not hasattr(class_reference, function_name): + raise ValueError(f"The class {class_reference.__name__} does not have a function named {function_name}.") + function_obj = getattr(class_reference, function_name) + if not inspect.isfunction(function_obj): + raise ValueError(f"The provided name {function_name} does not correspond to a function in class '{class_reference.__name__}'.") + function_signature = inspect.signature(function_obj) + argument_names = list(function_signature.parameters.keys())[1:] + + return argument_names + +def new_dictionary_from_keys(dictionary : dict, keys : list) -> dict: + """Returns a subset of the given dictionary including only the given keys. + Unrecognized keys are not included. 
+ Args: + dictionary (dict): Target dictionary + keys (list): Keys to keep + + Returns: + dict : Subset of the given dictionary including only the requested keys + """ + new_dictionary : dict = {key: dictionary[key] for key in keys if key in dictionary} + return new_dictionary + + +def has_duplicate_pairs(pairs : List[Tuple[float, int, int]]): + seen_pairs = set() + for pair in pairs: + entity : int = pair[1] + candidate : int = pair[2] + if (entity, candidate) in seen_pairs: + return True + seen_pairs.add((entity, candidate)) + return False + +def reverse_blocks_entity_indexing(blocks : dict, data : Data) -> dict: + """Returns a new instance of blocks containing the entity IDs of the given blocks translated into the reverse indexing system + Args: + blocks (dict): blocks as defined in the previous indexing + data (Data): Previous data module used to define the reversed ids based on previous dataset limit and dataset sizes + + Returns: + dict : New block instance with identifiers defined in the context of the reverse indexing + """ + if(blocks is None): return None + all_blocks = list(blocks.values()) + if 'Block' in str(type(all_blocks[0])): + return reverse_raw_blocks_entity_indexing(blocks, data) + elif isinstance(all_blocks[0], set): + return reverse_prunned_blocks_entity_indexing(blocks, data) + +def reverse_prunned_blocks_entity_indexing(blocks : dict, data : Data) -> dict: + _reversed_blocks : dict = dict() + _reversed_block : set + + for entity in blocks: + _updated_entity : int = get_reverse_indexing_id(entity, data) + _reversed_block = set() + block : set = blocks[entity] + for candidate in block: + _reversed_block.add(get_reverse_indexing_id(candidate, data)) + _reversed_blocks[_updated_entity] = _reversed_block + + return _reversed_blocks + +def reverse_raw_blocks_entity_indexing(blocks : dict, data : Data) -> dict: + _reversed_blocks : dict = dict() + _reversed_block : Block + + for token in blocks: + _current_block : Block = blocks[token] + _updated_D1_entities = OrderedSet() + _updated_D2_entities = OrderedSet() + + for d1_entity in _current_block.entities_D1: + _updated_D2_entities.add(get_reverse_indexing_id(d1_entity, data)) - + for d2_entity in _current_block.entities_D2: + _updated_D1_entities.add(get_reverse_indexing_id(d2_entity, data)) + + _reversed_block = Block() + _reversed_block.entities_D1 = _updated_D1_entities + _reversed_block.entities_D2 = _updated_D2_entities + _reversed_blocks[token] = _reversed_block + + return _reversed_blocks + +def get_reverse_indexing_id(id : int, data : Data) -> int: + return (id + data.num_of_entities_2) if (id < data.num_of_entities_1) else (id - data.num_of_entities_1) + + +# Progressive Workflow Grid Search Utility Functions + +def values_given(configuration: dict, parameter: str) -> bool: + """Values for requested parameters have been supplied by the user in the configuration file + + Args: + configuration (dict): Configuration File + parameter (str): Requested parameter name + + Returns: + bool: Values for requested parameter supplied + """ + return (parameter in configuration) and (isinstance(configuration[parameter], list)) and (len(configuration[parameter]) > 0) + +def get_multiples(num : int, n : int) -> list: + """Returns a list of multiples of the requested number up to n * number + + Args: + num (int): Number + n (int): Multiplier + + Returns: + list: Multiplies of num up to n * num + """ + multiples = [] + for i in range(1, n+1): + multiples.append(num * i) + return multiples + +def necessary_dfs_supplied(configuration 
: dict) -> bool:
+    """Configuration file contains values for source, target and ground truth dataframes
+
+    Args:
+        configuration (dict): Configuration file
+
+    Raises:
+        ValueError: Zero values supplied for one or more paths
+
+    Returns:
+        bool: The source, target and ground-truth path lists have the same length
+    """
+    for path in ['source_dataset_path', 'target_dataset_path', 'ground_truth_path']:
+        if(not values_given(configuration, path)):
+            raise ValueError(f"{path}: No values given")
+    return len(configuration['source_dataset_path']) == len(configuration['target_dataset_path']) == len(configuration['ground_truth_path'])
+
+def generate_unique_identifier() -> str:
+    """Returns a unique identifier which is used to cross-reference workflows stored in the json file and their performance graphs
+
+    Returns:
+        str: Unique identifier
+    """
+    return str(uuid.uuid4())
+
+
+def to_path(path : str):
+    return os.path.expanduser(path)
+
+def clear_json_file(path : str):
+    if os.path.exists(path):
+        if os.path.getsize(path) > 0:
+            open(path, 'w').close()
+
+
+def purge_id_column(columns : list):
+    non_id_columns : list = []
+    for column in columns:
+        if(column != 'id'):
+            non_id_columns.append(column)
+
+    return non_id_columns
+
+def common_elements(elements1 : list, elements2 : list) -> list:
+    """Returns the elements common to both lists, in the order they appear in the first list
+
+    Args:
+        elements1 (list): Source list of elements
+        elements2 (list): Target list of elements
+
+    Returns:
+        list : The elements common to both lists, in the order they appear in the first list
+    """
+    _common_elements : list = []
+
+    for element in elements1:
+        if element in elements2:
+            _common_elements.append(element)
+    return _common_elements
+
+def matching_arguments(workflow : dict, arguments : dict) -> bool:
+    """Checks if the given workflow's arguments that are shared with the target arguments have values that appear in those arguments
+
+    Args:
+        workflow (dict): Dictionary of argument -> value for the given workflow
+        arguments (dict): Dictionary of argument -> lists of values that are valid for the workflow in order for it to be matching
+
+    Returns:
+        bool : The given workflow's arguments that are shared with the target arguments have values that appear in those arguments
+    """
+    for argument, value in workflow.items():
+        if argument in arguments and value not in arguments[argument]:
+            return False
+    return True
+
+def update_top_results(results : dict, new_workflow : dict, metric : str, keep_top_budget : bool) -> dict:
+    """Based on its performance, sets the new workflow as the top one in
+       its budget/global category (don't / only keep the budget with top performance)
+
+    Args:
+        results (dict): Budget -> Best workflow for given budget
+        new_workflow (dict): Arguments -> values for given workflow
+        metric (str) : Metric upon which workflows are being compared
+        keep_top_budget (bool): Keep only the workflow corresponding to the budget with the best performance
+
+    Returns:
+        dict : Updated Results Dictionary
+    """
+
+    _budget : int = new_workflow['budget']
+    _current_top_workflow = (None if not results else results[next(iter(results))]) if keep_top_budget \
+                            else (None if _budget not in results else results[_budget])
+
+    if(_current_top_workflow is None or _current_top_workflow[metric] < new_workflow[metric]):
+        if(keep_top_budget):
+            return {_budget : new_workflow}
+        else:
+            results[_budget] = new_workflow
+            return results
+    return results
+
+def retrieve_top_workflows(workflows : dict = None,
+                           workflows_path : str = None,
+                           store_path : str = None,
+                           metric : str = 'auc',
+                           top_budget : bool = False,
+                           **arguments):
+    """Takes a workflow dictionary or retrieves it from the given path.
+       Gathers the best workflows for the specified comparison metric and argument values.
+       Stores the best workflows in the given storage path.
+
+    Args:
+        workflows (dict): Dictionary containing the workflows (Defaults to None)
+        workflows_path (str): Path from which the program will attempt to retrieve the workflows (Defaults to None)
+        store_path (str) : Path in which the best workflows will be stored in json format (Defaults to None)
+        metric (str): Metric used to compare workflows (Defaults to 'auc')
+        top_budget (bool): Store only the workflow for the budget with the best performance (Defaults to False)
+        arguments (dict): Arguments and the corresponding values that workflows have to possess in order to be considered
+
+    Returns:
+        dict : Updated Results Dictionary
+    """
+
+    retrievable_metrics = ['time', 'auc', 'recall']
+
+    if(workflows is not None):
+        _workflows = workflows
+    elif(workflows_path is not None):
+        with open(workflows_path) as file:
+            _workflows = json.load(file)
+    else:
+        raise ValueError("Please provide workflows dictionary / json file path.")
+
+    if metric not in ['time', 'auc', 'recall']:
+        raise AttributeError(
+            'Metric ({}) does not exist. Please select one of the available. ({})'.format(
+                metric, retrievable_metrics
+            )
+        )
+
+    _results : dict = {}
+    # datasets, matchers and language models
+    # for which we want to find the top workflows
+    datasets : List[str] = None if 'dataset' not in arguments else arguments['dataset']
+    matchers : List[str] = None if 'matcher' not in arguments else arguments['matcher']
+    lms : List[str] = None if 'language_model' not in arguments else arguments['language_model']
+
+    _dataset_names : List[str] = _workflows.keys() if datasets is None else common_elements(datasets, workflows.keys())
+    _current_workflows : List[dict] = []
+
+    for _dataset_name in _dataset_names:
+        _dataset_info : dict = _workflows[_dataset_name]
+        _matcher_names = _dataset_info.keys() if matchers is None else common_elements(matchers, _dataset_info.keys())
+        for _matcher_name in _matcher_names:
+            _matcher_info : dict = _dataset_info[_matcher_name]
+            if _matcher_name == 'EmbeddingsNNBPM':
+                _lm_names = _matcher_info.keys() if lms is None else common_elements(lms, _matcher_info.keys())
+                for _lm_workflows in _matcher_info[_lm_names]:
+                    _current_workflows += _lm_workflows
+            else:
+                _current_workflows += _matcher_info
+            for _current_workflow in _current_workflows:
+                if(matching_arguments(workflow=_current_workflow, arguments=arguments)):
+                    _results = update_top_results(results=_results,
+                                                  new_workflow=_current_workflow,
+                                                  metric=metric,
+                                                  keep_top_budget=top_budget)
+
+    print(_results)
+    if (store_path is not None):
+        with open(store_path, 'w', encoding="utf-8") as file:
+            json.dump(_results, file, indent=4)
+
+
+def add_entry(workflow : dict, dataframe_dictionary : dict) -> None:
+    """Retrieves features and their values from the given workflow dictionary,
+       and stores them in the to-be-constructed dataframe dictionary
+
+    Args:
+        workflow (dict): Dictionary containing workflow's arguments and their values
+        dataframe_dictionary (dict): Dictionary that stores workflow arguments and their values -
+                                     to be transformed into columns
+    """
+    for feature, value in workflow.items():
+        if(feature != 'tp_idx'):
+            if feature not in dataframe_dictionary:
+                dataframe_dictionary[feature] = []
+            dataframe_dictionary[feature].append(value)
+
+def workflows_to_dataframe(workflows : dict = None,
+                           workflows_path : str = None,
+                           store_path : str = None) -> pd.DataFrame:
+    """Takes a workflow dictionary or retrieves it from the given path.
+       Stores all of its entries in a dataframe.
+       Stores the dataframe in the specified path if provided.
+
+    Args:
+        workflows (dict): Dictionary containing the workflows (Defaults to None)
+        workflows_path (str): Path from which the program will attempt to retrieve the workflows (Defaults to None)
+        store_path (str) : Path in which the dataframe will be stored in json format (Defaults to None)
+
+    Returns:
+        pd.DataFrame : Dataframe containing the workflow entries in the given workflows dictionary
+    """
+    if(workflows is not None):
+        _workflows = workflows
+    elif(workflows_path is not None):
+        with open(workflows_path) as file:
+            _workflows = json.load(file)
+    else:
+        raise ValueError("Please provide workflows dictionary / json file path.")
+
+    dataframe_dictionary : dict = {}
+    workflows_dataframe : pd.DataFrame
+
+    for dataset in _workflows:
+        dataset_info : dict = _workflows[dataset]
+        for matcher in dataset_info:
+            matcher_info : dict = dataset_info[matcher]
+            current_workflows : list = []
+            if(matcher == 'EmbeddingsNNBPM'):
+                for lm in matcher_info:
+                    current_workflows += matcher_info[lm]
+            else:
+                current_workflows += matcher_info
+
+            for current_workflow in current_workflows:
+                add_entry(current_workflow, dataframe_dictionary)
+
+    workflows_dataframe = pd.DataFrame(dataframe_dictionary)
+    if(store_path is not None):
+        workflows_dataframe.to_csv(store_path, index=False)
+
+    return workflows_dataframe
-
\ No newline at end of file
+# Frequency based Vectorization/Similarity evaluation Module
+class FrequencyEvaluator(ABC):
+    def __init__(self, vectorizer : str, tokenizer : str, qgram : int) -> None:
+        super().__init__()
+        self.vectorizer_name : str = vectorizer
+        self.tokenizer : str = tokenizer
+        self.qgram : int = qgram
+        self.analyzer = 'char' if 'char' in self.tokenizer else 'word'
+
+        if self.vectorizer_name == 'tfidf' or self.vectorizer_name == 'boolean':
+            self.vectorizer = TfidfVectorizer(analyzer=self.analyzer) if self.qgram is None else \
+                              TfidfVectorizer(analyzer=self.analyzer, ngram_range=(self.qgram, self.qgram))
+        elif self.vectorizer_name == 'tf':
+            self.vectorizer = CountVectorizer(analyzer=self.analyzer) if self.qgram is None else \
+                              CountVectorizer(analyzer=self.analyzer, ngram_range=(self.qgram, self.qgram))
+        else:
+            raise ValueError(f"{self.vectorizer_name}: Invalid Frequency Evaluator Model Name")
+
+        self.dataset_identifier : str = None
+        self.indexing : str = None
+        self.distance_matrix : np.ndarray = None
+        self.distance_matrix_loaded : bool = False
+        self.distance_matrix_indexing : str = None
+
+    def save_distance_matrix(self) -> None:
+        """Store the distance matrix of the frequency evaluator in the hidden .dm directory within the execution path.
+           The name of the file contains the vectorizer, tokenizer, dataset and metric, so it can be retrieved and
+           used as a precalculated distance matrix.
+ """ + distance_matrix_file_name = '_'.join([self.indexing, self.dataset_identifier, self.vectorizer_name, self.tokenizer.split('_')[0], self.metric, "q" + str(self.qgram) + ".npy"]) + + hidden_directory_path = os.path.join(os.getcwd(), ".dm") + os.makedirs(hidden_directory_path, exist_ok=True) + distance_matrix_file_path = os.path.join(hidden_directory_path, distance_matrix_file_name) + try: + print(f"Saving Distance Matrix -> {distance_matrix_file_path}") + np.save(distance_matrix_file_path, self.distance_matrix) + pass + except FileNotFoundError: + print(f"Unable to save distance matrix -> {distance_matrix_file_path}") + + + def load_distance_matrix_from_path(self, path : str) -> np.ndarray: + """Load the precalculated distance matrix for current execution's arguments combination. + Args: + path (str): Path to the distance matrix file + Returns: + np.ndarray: Precalculated distance matrix for current execution parameters combination + """ + try: + print(f"Loading Distance Matrix from: {path}") + return np.load(path) + pass + except FileNotFoundError: + print(f"Unable to load distance matrix -> {path}") + + def retrieve_distance_matrix_file_path(self) -> Tuple[str, str]: + """Attemps to retrieve a precalculated DM from disk for current experiment + Returns: + str: Precalculated DM file path (None if doesn't exist) + """ + + _requested_indexing : str = self.indexing + _opposite_indexing : str = "inorder" if (self.indexing == "reverse") else "reverse" + _requested_indexing_file_name = '_'.join([_requested_indexing, self.dataset_identifier, self.vectorizer_name, self.tokenizer.split('_')[0], self.metric, "q" + str(self.qgram) + ".npy"]) + _opposite_indexing_file_name = '_'.join([_opposite_indexing, self.dataset_identifier, self.vectorizer_name, self.tokenizer.split('_')[0], self.metric, "q" + str(self.qgram) + ".npy"]) + + hidden_directory_path = os.path.join(os.getcwd(), ".dm") + os.makedirs(hidden_directory_path, exist_ok=True) + + + _available_indexing : str = None + _available_file_path : str = None + _requested_indexing_file_path = os.path.join(hidden_directory_path, _requested_indexing_file_name) + _opposite_indexing_file_path = os.path.join(hidden_directory_path, _opposite_indexing_file_name) + + + if(os.path.exists(_requested_indexing_file_path) and os.path.isfile(_requested_indexing_file_path)): + _available_indexing = _requested_indexing + _available_file_path = _requested_indexing_file_path + elif(os.path.exists(_opposite_indexing_file_path) and os.path.isfile(_opposite_indexing_file_path)): + _available_indexing = _opposite_indexing + _available_file_path = _opposite_indexing_file_path + + return (_available_indexing, _available_file_path) + + + def distance_to_similarity_matrix(self, distance_matrix : np.ndarray) -> np.ndarray: + """Transforms the input distance matrix into similarity matrix + Args: + distance_matrix (np.ndarray): Input pairwise distance matrix + Returns: + np.ndarray: Pairwise similarity matrix + """ + + if(self.metric == 'sqeuclidean'): + return 1.0 / (1.0 + (distance_matrix ** 2)) + elif('cosine' in self.metric): + return 1.0 - distance_matrix + else: + return distance_matrix + + + def _get_sparse_matrix_method(self, metric : str) -> str: + if(metric == 'sqeuclidean'): + return 'euclidean' + else: + return metric + + def fit(self, + metric : str, + dataset_identifier : str, + indexing : str, + d1_entities : list = None, + d2_entities : list = None, + save_dm : bool = True) -> None: + """Initializes the entities' corpus, and constructs the similarity matrix 
+ Args: + metric (str): Distance metric for entity strings + dataset_identifier (str): Name of the dataset we are conducting our experiment on + indexing (str): Indexing that the candidate entities follow + d1_entities (list): List of D1 entities' string representations + d2_entities (list): List of D2 entities' string representations + save_dm (bool): Save the distance matrix in hidden directory on disk + """ + if(d1_entities is None or d2_entities is None): + raise NotImplementedError(f"{self.vectorizer_name} Frequency Evaluator Model - Dirty ER is not implemented yet") + else: + self.metric : str = metric + self._entities_d1 : list = d1_entities + self._entities_d2 : list = d2_entities + self._entities_d1_num : int = len(self._entities_d1) + self._entities_d2_num : int = len(self._entities_d2) + self.save_dm : bool = save_dm + self.dataset_identifier : str = dataset_identifier + self.indexing : str = indexing + + _dm_indexing, _dm_path = self.retrieve_distance_matrix_file_path() + if(_dm_path is not None): + self.distance_matrix : np.ndarray = self.load_distance_matrix_from_path(path=_dm_path) + self.distance_matrix_loaded : bool = True + self.distance_matrix_indexing : str = _dm_indexing + else: + self.corpus = self._entities_d1 + self._entities_d2 + self._tf_limit = len(self._entities_d1) + self.corpus_as_matrix = self.vectorizer.fit_transform(self.corpus) + if self.vectorizer_name == 'boolean': + self.corpus_as_matrix = self.corpus_as_matrix.astype(bool).astype(int) + + self.distance_matrix : np.ndarray = self.distance_to_similarity_matrix( + distance_matrix=pairwise_distances( + self.corpus_as_matrix, + metric=self._get_sparse_matrix_method(metric=self.metric))) + + self.distance_matrix_loaded : bool = False + self.distance_matrix_indexing : str = self.indexing + + if(self.save_dm): + self.save_distance_matrix() + + + def predict(self, id1 : int, id2 : int) -> float: + """Returns the predicted similarity score for the given entities + Args: + id1 (int): id of an entity of the 1nd dataset within experiment context (not necessarily preloaded matrix) + id2 (int): id of an entity of the 2nd dataset within experiment context (not necessarily preloaded matrix) + Returns: + float: Similarity score of entities with specified IDs + """ + # candidates = np.vstack((self.corpus_as_matrix[id1], self.corpus_as_matrix[id2])) + # distances = pairwise_distances(candidates, metric=self.metric) + # return 1.0 - distances[0][1] + if(self.indexing == self.distance_matrix_indexing): + return self.distance_matrix[id1][id2] + # _id1 = (id1 + self._entities_d2_num) if (self.indexing == "inorder") else (id1 + self._entities_d1_num) + # _id2 = (id2 - self._entities_d1_num) if (self.indexing == "inorder") else (id2 - self._entities_d2_num) + _id1 = (id1 + self._entities_d2_num) + _id2 = (id2 - self._entities_d1_num) + + return self.distance_matrix[_id1][_id2] + \ No newline at end of file diff --git a/docs/pyjedai/vector_based_blocking.py b/docs/pyjedai/vector_based_blocking.py index 0c3c032..ca4259f 100644 --- a/docs/pyjedai/vector_based_blocking.py +++ b/docs/pyjedai/vector_based_blocking.py @@ -31,7 +31,7 @@ from .evaluation import Evaluation from .utils import SubsetIndexer -EMBEDDINGS_DIR = '.embeddings' +EMBEDDINGS_DIR = '.embs' if not os.path.exists(EMBEDDINGS_DIR): os.makedirs(EMBEDDINGS_DIR) EMBEDDINGS_DIR = os.path.abspath(EMBEDDINGS_DIR) @@ -105,6 +105,8 @@ def build_blocks(self, tqdm_disable: bool = False, save_embeddings: bool = True, load_embeddings_if_exist: bool = False, + load_path_d1: str = 
None, + load_path_d2: str = None, with_entity_matching: bool = False, input_cleaned_blocks: dict = None, similarity_distance: str = 'cosine' @@ -142,9 +144,10 @@ def build_blocks(self, self.with_entity_matching = with_entity_matching self.save_embeddings, self.load_embeddings_if_exist = save_embeddings, load_embeddings_if_exist self.max_word_embeddings_size = max_word_embeddings_size - self.simiarity_distance = similarity_distance + self.similarity_distance = similarity_distance self.data, self.attributes_1, self.attributes_2, self.vector_size, self.num_of_clusters, self.top_k, self.input_cleaned_blocks \ = data, attributes_1, attributes_2, vector_size, num_of_clusters, top_k, input_cleaned_blocks + self.load_path_d1, self.load_path_d2 = load_path_d1, load_path_d2 self._progress_bar = tqdm(total=data.num_of_entities, desc=(self._method_name + ' [' + self.vectorizer + ', ' + self.similarity_search + ']'), disable=tqdm_disable) @@ -163,7 +166,7 @@ def build_blocks(self, self._si = SubsetIndexer(self.input_cleaned_blocks, self.data, self._applied_to_subset) self._d1_valid_indices: list[int] = self._si.d1_retained_ids - self._d2_valid_indices: list[int] = [x - self.data.dataset_limit for x in self._si.d2_retained_ids] + self._d2_valid_indices: list[int] = [x - self.data.dataset_limit for x in self._si.d2_retained_ids] if not data.is_dirty_er else None self._entities_d1 = data.dataset_1[attributes_1 if attributes_1 else data.attributes_1] \ .apply(" ".join, axis=1) \ @@ -188,29 +191,35 @@ def build_blocks(self, self._d2_loaded : bool = False if load_embeddings_if_exist: print("Loading embeddings from file...") - - p1 = os.path.join(EMBEDDINGS_DIR, self.vectorizer + '_' + (self.data.dataset_name_1 \ - if self.data.dataset_name_1 is not None else "d1") +'.npy') - print("Loading file: ", p1) + if(self.load_path_d1 is not None): + p1 = self.load_path_d1 + else: + p1 = os.path.join(EMBEDDINGS_DIR, self.vectorizer + '_' + (self.data.dataset_name_1 \ + if self.data.dataset_name_1 is not None else "d1") +'.npy') + print("Attempting to load D1 embeddings...") if os.path.exists(p1): self.vectors_1 = vectors_1 = np.load(p1) self.vectors_1 = vectors_1 = vectors_1[self._d1_valid_indices] self._progress_bar.update(data.num_of_entities_1) self._d1_loaded = True + print(f"{p1} -> Loaded Successfully") else: print("Embeddings not found. Creating new ones.") - p2 = os.path.join(EMBEDDINGS_DIR, self.vectorizer + '_' + (self.data.dataset_name_2 \ - if self.data.dataset_name_2 is not None else "d2") +'.npy') - print("Loading file: ", p2) + if(self.load_path_d2 is not None): + p2 = self.load_path_d2 + else: + p2 = os.path.join(EMBEDDINGS_DIR, self.vectorizer + '_' + (self.data.dataset_name_2 \ + if self.data.dataset_name_2 is not None else "d2") +'.npy') + print("Attempting to load D2 embeddings...") if os.path.exists(p2): self.vectors_2 = vectors_2 = np.load(p2) self.vectors_2 = vectors_2 = vectors_2[self._d2_valid_indices] self._progress_bar.update(data.num_of_entities_2) self._d2_loaded = True + print(f"{p2} -> Loaded Successfully") else: print("Embeddings not found. 
Creating new ones.") - print("Loading embeddings from file finished") if not self._d1_loaded or not self._d2_loaded: if self.vectorizer in ['word2vec', 'fasttext', 'doc2vec', 'glove']: self.vectors_1, self.vectors_2 = self._create_gensim_embeddings() @@ -368,33 +377,33 @@ def _create_pretrained_sentence_embeddings(self): def _similarity_search_with_FAISS(self): index = faiss.IndexFlatL2(self.vectors_1.shape[1]) - if self.simiarity_distance == 'cosine' or self.simiarity_distance == 'cosine_without_normalization': + if self.similarity_distance == 'cosine' or self.similarity_distance == 'cosine_without_normalization': index.metric_type = faiss.METRIC_INNER_PRODUCT - elif self.simiarity_distance == 'euclidean': + elif self.similarity_distance == 'euclidean': index.metric_type = faiss.METRIC_L2 else: - raise ValueError("Invalid similarity distance: ", self.simiarity_distance) + raise ValueError("Invalid similarity distance: ", self.similarity_distance) - if self.simiarity_distance == 'cosine': + if self.similarity_distance == 'cosine': faiss.normalize_L2(self.vectors_1) - faiss.normalize_L2(self.vectors_2) + if not self.data.is_dirty_er: faiss.normalize_L2(self.vectors_2) index.train(self.vectors_1) # train on the vectors of dataset 1 - if self.simiarity_distance == 'cosine': + if self.similarity_distance == 'cosine': faiss.normalize_L2(self.vectors_1) - faiss.normalize_L2(self.vectors_2) + if not self.data.is_dirty_er: faiss.normalize_L2(self.vectors_2) index.add(self.vectors_1) # add the vectors and update the index - if self.simiarity_distance == 'cosine': + if self.similarity_distance == 'cosine': faiss.normalize_L2(self.vectors_1) - faiss.normalize_L2(self.vectors_2) + if not self.data.is_dirty_er: faiss.normalize_L2(self.vectors_2) self.distances, self.neighbors = index.search(self.vectors_1 if self.data.is_dirty_er else self.vectors_2, self.top_k) - if self.simiarity_distance == 'euclidean': + if self.similarity_distance == 'euclidean': self.distances = 1/(1 + self.distances) self.blocks = dict() @@ -516,12 +525,9 @@ def export_to_df(self, prediction) -> pd.DataFrame: Returns: pd.DataFrame: Dataframe with the predicted pairs """ - if self.data.ground_truth is None: - raise AttributeError("Can not proceed to evaluation without a ground-truth file. 
\
-                Data object mush have initialized with the ground-truth file")
         pairs_df = pd.DataFrame(columns=['id1', 'id2'])
         for entity_id, candidates in prediction:
-            id1 = self.data._gt_to_ids_reversed_1[entity_id]
+            id1 = self.data._gt_to_ids_reversed_1[entity_id]
             for candiadate_id in candidates:
                 id2 = self.data._gt_to_ids_reversed_1[candiadate_id] if self.data.is_dirty_er \
                     else self.data._gt_to_ids_reversed_2[candiadate_id]
diff --git a/docs/pyjedai/visualization.py b/docs/pyjedai/visualization.py
index 8075458..d01db52 100644
--- a/docs/pyjedai/visualization.py
+++ b/docs/pyjedai/visualization.py
@@ -1,6 +1,9 @@
 import itertools
 import matplotlib.pyplot as plt
 import numpy as np
+import pandas as pd
+import os
+from typing import List, Tuple
 
 # Function that creates a confusion matrix
 def create_confusion_matrix(confusion_matrix, title):
@@ -25,4 +28,510 @@ def create_confusion_matrix(confusion_matrix, title):
     plt.xlabel('Predicted label')
     plt.tight_layout()
     plt.ylim([1.5, -.5])
-    plt.show()
\ No newline at end of file
+    plt.show()
+
+def plot_feature_progress_per_attribute_group(method_name : str,
+                                              dataset_name : str,
+                                              feature : str,
+                                              attributes : list,
+                                              df : pd.DataFrame = None,
+                                              load_path : str = None,
+                                              grid : bool = True,
+                                              save : bool = True,
+                                              verbose : bool = True,
+                                              in_plot_directory : bool = True
+                                              ) -> None:
+    """Plots the progress of the requested feature's value per budget for experiments grouped by the given attributes.
+       Saves the plot as an image in the requested path.
+
+    Args:
+        method_name (str): Name of the method used in the dataframe's experiments
+        dataset_name (str): Name of the dataset on which the dataframe's experiments have been applied
+        feature (str): The feature whose per budget progress we want to plot (e.g. auc)
+        attributes (list): Group of experiments' arguments, each distinct combination of which constitutes a separate curve
+        df (pd.DataFrame): Dataframe containing the information about progressive PER experiments (Defaults to None)
+        load_path (str): Path from which the dataframe should be loaded (Defaults to None)
+        grid (bool): Grid to be displayed in the plot (Defaults to True)
+        save (bool) : Save the plot as an image on disk (Defaults to True)
+        verbose (bool) : Show the produced plot (Defaults to True)
+        in_plot_directory (bool) : Plot to be saved in an experiment directory -
+                                   created in the target dataframe's / current directory if non-existent (Defaults to True)
+    """
+
+    experiments : pd.DataFrame
+    if(df is not None):
+        experiments = df
+    elif(load_path is not None):
+        experiments = pd.read_csv(load_path)
+    else:
+        raise ValueError("No dataframe or csv file given - Cannot plot the experiments.")
+
+    experiments = experiments.groupby(attributes)
+
+
+    fig = plt.figure(figsize=(16, 12))
+    ax = plt.subplot(111)
+
+    for attributes_unique_values, attributes_experiment_group in experiments:
+        group_name = '-'.join([str(attribute) for attribute in attributes_unique_values])
+        attributes_experiment_group_per_budget = attributes_experiment_group.sort_values(by='budget').groupby('budget')
+        budgets = []
+        average_feature_values = []
+        for _, current_budget_attributes_experiment_group in attributes_experiment_group_per_budget:
+            budgets.append(current_budget_attributes_experiment_group['budget'].mean())
+            average_feature_values.append(current_budget_attributes_experiment_group[feature].mean())
+
+        ax.plot(budgets, average_feature_values, label=str(group_name), marker='o', linestyle='-')
+
+    # Customize the plot
+
ax.set_title(f'{method_name.capitalize()}/{dataset_name.capitalize()} - Average {feature.capitalize()} vs. Budget Curves') + ax.set_xlabel('Budget') + ax.set_ylabel(f'Average {feature.capitalize()}') + + pos = ax.get_position() + ax.set_position([pos.x0, pos.y0, pos.width * 0.9, pos.height]) + ax.legend(title=attributes, fontsize="9", loc='center right', bbox_to_anchor=(1.23, 0.5)) + + ax.grid(grid) + + if(save): + file_name = '_'.join([dataset_name, method_name, feature, 'for', '_'.join(attributes)]) + '.png' + dataframe_directory = os.path.dirname(load_path) if load_path is not None else './' + store_directory = dataframe_directory if not in_plot_directory else os.path.join(dataframe_directory, 'plots/') + + if in_plot_directory and not os.path.exists(store_directory): + os.makedirs(store_directory) + + plt.savefig(os.path.join(store_directory, file_name)) + + plt.show() + + + +def plot_attribute_group_avg_ranking(method_name : str, + feature : str, + attributes : list, + dfs : List[pd.DataFrame] = None, + load_paths : List[str] = None, + grid : bool = True, + save : bool = True, + verbose : bool = True, + in_plot_directory : bool = True + ) -> None: + """For each unique combination of given attributes calculates its average feature value across datasets for each budget. + Plots the corresponding results and stores them as an image if it is requested. + + Args: + method_name (str): The name of the PER method whose experiments we are evaluating + feature (str): The feature that we want to evaluate the average ranking of the attribute group for + attributes (list): Group of experiments' arguments whose each distinct combination constitutes a seperate curve + dfs (List[pd.DataFrame]): Dataframes containing the information about progressive PER experiments (Defaults to None) + load_paths (List[str]): Paths from which the dataframe should be loaded from (Defaults to None) + grid (bool): Grid to be displayed in the plot (Defaults to True) + save (bool) : Save the plot as an image on disk (Defaults to True) + verbose (bool) : Show the produced plot + in_plot_directory (bool) : Plot to be saved in an experiment directory - + created in the target dataframe's / current directory if non-existent (Defaults to True) + """ + + if(dfs is None and load_paths is None): + raise ValueError("No dataframes or csv files given - Cannot calculate and plot average combinations rankings.") + + total_datasets = len(dfs) if dfs is not None else len(load_paths) + attributes_combinations = {} + attributes_combinations_budget_scores : List[Tuple[float, str]] + + for current_dataset in range(total_datasets): + if(dfs is not None): + experiments = dfs[current_dataset] + else: + current_dataset_path = load_paths[current_dataset] + experiments = pd.read_csv(current_dataset_path) + + budgets_experiments = experiments.sort_values(by='budget').groupby('budget') + + for current_budget, current_budget_experiments in budgets_experiments: + current_budget_attributes_combinations = current_budget_experiments.groupby(attributes[0] if len(attributes) == 1 else attributes) + attributes_combinations_budget_scores = [] + + for attributes_combination, current_budget_attributes_combination in current_budget_attributes_combinations: + attributes_combination_budget_feature_value = current_budget_attributes_combination[feature].mean() + attributes_combinations_budget_scores.append((attributes_combination_budget_feature_value, attributes_combination)) + + for ranking, attributes_combinations_budget_score in 
enumerate(sorted(attributes_combinations_budget_scores, reverse=True)): + attributes_combination_budget_feature_value, attributes_combination = attributes_combinations_budget_score + if attributes_combination not in attributes_combinations: + attributes_combinations[attributes_combination] = {} + + if current_budget not in attributes_combinations[attributes_combination]: + attributes_combinations[attributes_combination][current_budget] = [] + + attributes_combinations[attributes_combination][current_budget].append(ranking+1) + + fig = plt.figure(figsize=(16, 12)) + ax = plt.subplot(111) + + for attributes_combination, attributes_combination_budgets in attributes_combinations.items(): + + attributes_combination_average_rankings = [] + sorted_budgets = sorted(attributes_combination_budgets.keys(), reverse=False) + for budget in sorted_budgets: + attributes_combination_average_rankings.append(sum(attributes_combination_budgets[budget]) / len(attributes_combination_budgets[budget])) + + ax.plot(sorted_budgets, attributes_combination_average_rankings, label=str(attributes_combination), marker='o', linestyle='-') + + + # Customize the plot + ax.set_title(f'{method_name.capitalize()} - Average {feature.capitalize()} Ranking vs. Budget Curves') + ax.set_xlabel('Budget') + ax.set_ylabel('Average Ranking') + + pos = ax.get_position() + ax.set_position([pos.x0, pos.y0, pos.width * 0.9, pos.height]) + ax.legend(title=attributes, fontsize="9", loc='center right', bbox_to_anchor=(1.23, 0.5)) + + ax.grid(grid) + + if(save): + file_name = '_'.join([method_name, 'for', '_'.join(attributes), 'avg_rankings', feature]) + '.png' + dataframe_directory = os.path.dirname(load_paths[0]) if load_paths is not None else './' + store_directory = dataframe_directory if not in_plot_directory else os.path.join(dataframe_directory, 'avg_rankings/') + + if in_plot_directory and not os.path.exists(store_directory): + os.makedirs(store_directory) + + plt.savefig(os.path.join(store_directory, file_name)) + + plt.show() + + + +def plot_attribute_group_avg_top_distance(method_name : str, + feature : str, + attributes : list, + dfs : List[pd.DataFrame] = None, + load_paths : List[str] = None, + grid : bool = True, + save : bool = True, + verbose : bool = True, + in_plot_directory : bool = True + ) -> None: + """For each unique combination of given attributes calculates its feature's value average difference from the best value across datasets for each budget. + Plots the corresponding results and stores them as an image if it is requested. 
+ + Args: + method_name (str): The name of the PER method whose experiments we are evaluating + feature (str): The feature that we want to evaluate the average ranking of the attribute group for + attributes (list): Group of experiments' arguments whose each distinct combination constitutes a seperate curve + dfs (List[pd.DataFrame]): Dataframes containing the information about progressive PER experiments (Defaults to None) + load_paths (List[str]): Paths from which the dataframe should be loaded from (Defaults to None) + grid (bool): Grid to be displayed in the plot (Defaults to True) + save (bool) : Save the plot as an image on disk (Defaults to True) + verbose (bool) : Show the produced plot + in_plot_directory (bool) : Plot to be saved in an experiment directory - + created in the target dataframe's / current directory if non-existent (Defaults to True) + """ + + if(dfs is None and load_paths is None): + raise ValueError("No dataframes or csv files given - Cannot calculate and plot average combinations rankings.") + + total_datasets = len(dfs) if dfs is not None else len(load_paths) + attributes_combinations = {} + attributes_combinations_budget_scores : List[Tuple[float, str]] + + for current_dataset in range(total_datasets): + if(dfs is not None): + experiments = dfs[current_dataset] + else: + current_dataset_path = load_paths[current_dataset] + experiments = pd.read_csv(current_dataset_path) + + budgets_experiments = experiments.sort_values(by='budget').groupby('budget') + + for current_budget, current_budget_experiments in budgets_experiments: + current_budget_attributes_combinations = current_budget_experiments.groupby(attributes[0] if len(attributes) == 1 else attributes) + attributes_combinations_budget_scores = [] + + for attributes_combination, current_budget_attributes_combination in current_budget_attributes_combinations: + attributes_combination_budget_feature_value = current_budget_attributes_combination[feature].mean() + attributes_combinations_budget_scores.append((attributes_combination_budget_feature_value, attributes_combination)) + + attributes_combinations_budget_scores = sorted(attributes_combinations_budget_scores, reverse=True) + budget_highest_feature_value = attributes_combinations_budget_scores[0][0] + + for attributes_combinations_budget_score in attributes_combinations_budget_scores: + attributes_combination_budget_feature_value, attributes_combination = attributes_combinations_budget_score + if attributes_combination not in attributes_combinations: + attributes_combinations[attributes_combination] = {} + + if current_budget not in attributes_combinations[attributes_combination]: + attributes_combinations[attributes_combination][current_budget] = [] + + attributes_combinations[attributes_combination][current_budget].append(budget_highest_feature_value - attributes_combination_budget_feature_value) + + fig = plt.figure(figsize=(16, 12)) + ax = plt.subplot(111) + + for attributes_combination, attributes_combination_budgets in attributes_combinations.items(): + + attributes_combination_average_rankings = [] + sorted_budgets = sorted(attributes_combination_budgets.keys(), reverse=False) + for budget in sorted_budgets: + attributes_combination_average_rankings.append(sum(attributes_combination_budgets[budget]) / len(attributes_combination_budgets[budget])) + + ax.plot(sorted_budgets, attributes_combination_average_rankings, label=str(attributes_combination), marker='o', linestyle='-') + + + # Customize the plot + ax.set_title(f'{method_name.capitalize()} - 
Average {feature.capitalize()} Distance from Top vs. Budget Curves') + ax.set_xlabel('Budget') + ax.set_ylabel('Average Distance from Top') + + pos = ax.get_position() + ax.set_position([pos.x0, pos.y0, pos.width * 0.9, pos.height]) + ax.legend(title=attributes, fontsize="9", loc='center right', bbox_to_anchor=(1.23, 0.5)) + + ax.grid(grid) + + if(save): + file_name = '_'.join([method_name, 'for', '_'.join(attributes), 'avg_distances', feature]) + '.png' + dataframe_directory = os.path.dirname(load_paths[0]) if load_paths is not None else './' + store_directory = dataframe_directory if not in_plot_directory else os.path.join(dataframe_directory, 'avg_distances/') + + if in_plot_directory and not os.path.exists(store_directory): + os.makedirs(store_directory) + + plt.savefig(os.path.join(store_directory, file_name)) + + plt.show() + + +def plot_attributes_performance_for_budget(method_name : str, + feature : str, + attributes : list, + dfs : List[pd.DataFrame] = None, + load_paths : List[str] = None, + calculate_distance : bool = False, + grid : bool = True, + save : bool = True, + verbose : bool = True, + in_plot_directory : bool = True + ) -> pd.DataFrame: + """For each unique combination of given attributes calculates its feature value's average distance from best / ranking per budget. + Then calculates the same values for each combination of budget and dataset. Combination rows are sorted by the average of the averages + of the feature value's distance from best / ranking per budget. + + Args: + method_name (str): The name of the PER method whose experiments we are evaluating + feature (str): The feature that we want to evaluate the average ranking of the attribute group for + attributes (list): Group of experiments' arguments whose each distinct combination constitutes a seperate curve + dfs (List[pd.DataFrame]): Dataframes containing the information about progressive PER experiments (Defaults to None) + load_paths (List[str]): Paths from which the dataframe should be loaded from (Defaults to None) + calculate_distance (bool): Calculate distance for the feature from top within dataset (Defaults to False) + grid (bool): Grid to be displayed in the plot (Defaults to True) + save (bool) : Save the plot as an image on disk (Defaults to True) + verbose (bool) : Show the produced plot + in_plot_directory (bool) : Plot to be saved in an experiment directory - + created in the target dataframe's / current directory if non-existent (Defaults to True) + Returns: + pd.DataFrame: Dataframe containing the performance of the feature for each attributes' value combination across all datasets + for the requested budget order (e.x. 
first budget for each dataset) + """ + + if(dfs is None and load_paths is None): + raise ValueError("No dataframes or csv files given - Cannot calculate and plot average combinations rankings.") + + total_datasets : int = len(dfs) if dfs is not None else len(load_paths) + attributes_combinations : dict = {} + budget_dataset_best_feature_value : dict = {} + + attributes_column : str = ' + '.join([' '.join([word.capitalize() for word in attribute.split('_')]) for attribute in attributes]) + budget_dataframe : dict = {attributes_column : []} + + for current_dataset in range(total_datasets): + if(dfs is not None): + experiments = dfs[current_dataset] + else: + current_dataset_path = load_paths[current_dataset] + experiments = pd.read_csv(current_dataset_path) + + current_dataset_name : str = "D" + str(current_dataset+1) + budgets_experiments = experiments.sort_values(by='budget').groupby('budget') + + current_budget = 0 + for _, current_budget_experiments in budgets_experiments: + current_budget += 1 + current_budget_attributes_combinations = current_budget_experiments.groupby(attributes[0] if len(attributes) == 1 else attributes) + + for attributes_combination, current_budget_attributes_combination in current_budget_attributes_combinations: + + if attributes_combination not in attributes_combinations: + attributes_combinations[attributes_combination] = {} + + if current_budget not in attributes_combinations[attributes_combination]: + attributes_combinations[attributes_combination][current_budget] = {} + current_budget_attributes_combination_feature_value = current_budget_attributes_combination[feature].mean() + attributes_combinations[attributes_combination][current_budget][current_dataset_name] = current_budget_attributes_combination_feature_value + + if current_budget not in budget_dataset_best_feature_value: + budget_dataset_best_feature_value[current_budget] = {} + + if current_dataset_name not in budget_dataset_best_feature_value[current_budget]: + budget_dataset_best_feature_value[current_budget][current_dataset_name] = 0.0 + + if(current_budget_attributes_combination_feature_value > budget_dataset_best_feature_value[current_budget][current_dataset_name]): + budget_dataset_best_feature_value[current_budget][current_dataset_name] = current_budget_attributes_combination_feature_value + + if calculate_distance: + # we want to calculate each combination's performance distance from best performance per dataset + for attributes_combination, current_budget_attributes_combination in current_budget_attributes_combinations: + attributes_combinations[attributes_combination][current_budget][current_dataset_name] = budget_dataset_best_feature_value[current_budget][current_dataset_name] - attributes_combinations[attributes_combination][current_budget][current_dataset_name] + else: + # we want to calculate each combination's ranking per dataset + combinations_performance : List[Tuple[float, str]] = [] + for attributes_combination, current_budget_attributes_combination in current_budget_attributes_combinations: + combinations_performance.append((attributes_combinations[attributes_combination][current_budget][current_dataset_name], attributes_combination)) + combinations_performance = sorted(combinations_performance, reverse=True) + + for ranking, combination_performance in enumerate(combinations_performance): + performance, combination = combination_performance + attributes_combinations[combination][current_budget][current_dataset_name] = ranking + 1 + + + for attributes_combination, 
budgets_attributes_combination in attributes_combinations.items(): + budget_dataframe[attributes_column].append(attributes_combination) + for budget in budgets_attributes_combination: + + budget_attribute_combination = budgets_attributes_combination[budget] + budget_name = "B" + str(budget) + budget_feature_avg_value = 0.0 + + for dataset, dataset_budget_attribute_combination in budget_attribute_combination.items(): + + budget_dataset_column = '_'.join([str(budget_name),str(dataset)]) + + if(budget_dataset_column not in budget_dataframe): + budget_dataframe[budget_dataset_column] = [] + + budget_dataset_feature_value = attributes_combinations[attributes_combination][budget][dataset] + budget_feature_avg_value += budget_dataset_feature_value + budget_dataframe[budget_dataset_column].append(budget_dataset_feature_value) + + budget_average_column = '_'.join(["AVERAGE",budget_name]) + if(budget_average_column not in budget_dataframe): + budget_dataframe[budget_average_column] = [] + + budget_dataframe[budget_average_column].append(budget_feature_avg_value / len(budget_attribute_combination)) + + budget_dataframe = pd.DataFrame(budget_dataframe) + # Sort Attributes Combinations rows based on the average of the averages of their per budget performances + average_budget_performance_columns = ['_'.join(["AVERAGE", "B" + str(index+1)]) for index in range(len(budgets_experiments))] + budget_dataframe['AA_BS'] = budget_dataframe[average_budget_performance_columns].mean(axis=1) + budget_dataframe = budget_dataframe.sort_values(by='AA_BS', ascending=True) + + if(save): + metric = "distance" if calculate_distance else "ranking" + file_name = '_'.join([feature, metric, 'for', method_name, 'with', '_'.join(attributes)]) + '.csv' + + dataframe_directory = os.path.dirname(load_paths[0]) if load_paths is not None else './' + store_directory = dataframe_directory if not in_plot_directory else os.path.join(dataframe_directory, metric + '-analytical-performances/') + + if in_plot_directory and not os.path.exists(store_directory): + os.makedirs(store_directory) + + budget_dataframe.to_csv(os.path.join(store_directory, file_name), index=False) + + return budget_dataframe + + # total_datasets : int = len(dfs) if dfs is not None else len(load_paths) + # attributes_combinations : dict = {} + # budget_dataset_best_feature_value : dict = {} + # attributes_column : str = '_'.join([attribute.capitalize() for attribute in attributes]) + + # budget_dataframe : dict = {attributes_column : [], "AVERAGE" : []} + # budget_dataframe_column_data_types = {"AVERAGE" : float} + + + # for current_dataset in range(total_datasets): + # if(dfs is not None): + # experiments = dfs[current_dataset] + # else: + # current_dataset_path = load_paths[current_dataset] + # experiments = pd.read_csv(current_dataset_path) + + # current_dataset_name : str = "D" + str(current_dataset+1) + # budget_dataframe_column_data_types[current_dataset_name] = float if calculate_distance else int + + # budget_dataset_best_feature_value[current_dataset_name] = {} + # budget_dataframe[current_dataset_name] = [] + + + # budgets_experiments = experiments.sort_values(by='budget').groupby('budget') + + # for current_budget, current_budget_experiments in budgets_experiments: + + # current_budget_experiments = budgets_experiments.get_group(list(budgets_experiments.groups.keys())[budget_order]) + # current_budget_attributes_combinations = current_budget_experiments.groupby(attributes[0] if len(attributes) == 1 else attributes) + + # for attributes_combination, 
current_budget_attributes_combination in current_budget_attributes_combinations: + + # if attributes_combination not in attributes_combinations: + # attributes_combinations[attributes_combination] = {} + + # if current_budget not in attributes_combinations[attributes_combination]: + # attributes_combinations[attributes_combination][current_budget] = {} + + # current_budget_attributes_combination_feature_value = current_budget_attributes_combination[feature].mean() + # attributes_combinations[attributes_combination][current_budget][current_dataset] = current_budget_attributes_combination_feature_value + + + + # if current_budget_attributes_combination_feature_value > dataset_best_feature_value[current_dataset_name]: + # dataset_best_feature_value[current_dataset_name] = current_budget_attributes_combination_feature_value + + # if calculate_distance: + # # we want to calculate each combination's performance distance from best performance per dataset + # for attributes_combination, current_budget_attributes_combination in current_budget_attributes_combinations: + # attributes_combinations[attributes_combination][current_dataset_name] = dataset_best_feature_value[current_dataset_name] - attributes_combinations[attributes_combination][current_dataset_name] + # else: + # # we want to calculate each combination's ranking per dataset + # combinations_performance : List[Tuple[float, str]] = [] + # for attributes_combination, current_budget_attributes_combination in current_budget_attributes_combinations: + # combinations_performance.append((attributes_combinations[attributes_combination][current_dataset_name], attributes_combination)) + # combinations_performance = sorted(combinations_performance, reverse=True) + + # for ranking, combination_performance in enumerate(combinations_performance): + # performance, combination = combination_performance + # attributes_combinations[combination][current_dataset_name] = ranking + 1 + + # for attributes_combination, attributes_combination_datasets_performance in attributes_combinations.items(): + + # budget_dataframe[attributes_column].append(attributes_combination) + # average_attributes_combination_performance = 0.0 + + # for dataset, performance in attributes_combination_datasets_performance.items(): + # # print(performance) + # budget_dataframe[dataset].append(performance) + # average_attributes_combination_performance += performance + + # average_attributes_combination_performance /= len(attributes_combination_datasets_performance) + # budget_dataframe["AVERAGE"].append(average_attributes_combination_performance) + + # budget_dataframe = pd.DataFrame(budget_dataframe) + # budget_dataframe = budget_dataframe.astype(budget_dataframe_column_data_types) + # budget_dataframe = budget_dataframe.sort_values(by='AVERAGE', ascending=True) + + # if(save): + # metric = "distance" if calculate_distance else "ranking" + # budget_index = "b" + str((budget_order + 1)) + # file_name = '_'.join([budget_index, feature, metric, 'for', method_name, 'with', '_'.join(attributes)]) + '.csv' + + # dataframe_directory = os.path.dirname(load_paths[0]) if load_paths is not None else './' + # store_directory = dataframe_directory if not in_plot_directory else os.path.join(dataframe_directory, metric + '-performances/') + + # if in_plot_directory and not os.path.exists(store_directory): + # os.makedirs(store_directory) + + # budget_dataframe.to_csv(os.path.join(store_directory, file_name), index=False) + + # return budget_dataframe diff --git a/docs/pyjedai/workflow.py 
b/docs/pyjedai/workflow.py
index 7b7ceed..185a10d 100644
--- a/docs/pyjedai/workflow.py
+++ b/docs/pyjedai/workflow.py
@@ -4,6 +4,8 @@ from typing import Callable, List, Tuple
 import matplotlib.pyplot as plt
+import os
+import json
 import optuna
 import pandas as pd
 from networkx import Graph
@@ -19,7 +21,10 @@ from .vector_based_blocking import EmbeddingsNNBlockBuilding
 from .joins import EJoin, TopKJoin
 
-plt.style.use('seaborn-whitegrid')
+from .prioritization import ProgressiveMatching, BlockIndependentPM, class_references
+from .utils import new_dictionary_from_keys, get_class_function_arguments, generate_unique_identifier
+
+
 class PYJEDAIWorkFlow(ABC):
     """Main module of the pyjedAI and the simplest way to create an end-to-end ER workflow.
@@ -204,6 +209,270 @@ def get_final_scores(self) -> Tuple[float, float, float]:
             Tuple[float, float, float]: F-Measure, Precision, Recall.
         """
         return self.f1[-1], self.precision[-1], self.recall[-1]
+class ProgressiveWorkFlow(PYJEDAIWorkFlow):
+    """Main module of the pyjedAI and the simplest way to create an end-to-end PER workflow.
+    """
+
+    def __init__(
+            self,
+            name: str = None
+    ) -> None:
+        self.f1: list = []
+        self.recall: list = []
+        self.precision: list = []
+        self.runtime: list = []
+        self.configurations: list = []
+        self.workflow_exec_time: float
+        self._id: int = next(self._id)
+        self.name: str = name if name else "Workflow-" + str(self._id)
+        self._workflow_bar: tqdm
+        self.final_pairs = None
+
+    def run(self,
+            data: Data,
+            verbose: bool = False,
+            with_classification_report: bool = False,
+            workflow_step_tqdm_disable: bool = True,
+            workflow_tqdm_enable: bool = False,
+            block_building : dict = None,
+            block_purging : dict = None,
+            block_filtering : dict = None,
+            **matcher_arguments
+            ) -> None:
+        """Main function for creating a Progressive ER workflow.
+
+        Args:
+            data (Data): Dataset module, used to derive the schema-awareness status
+            verbose (bool, optional): Print detailed report for each step. Defaults to False.
+            with_classification_report (bool, optional): Print pairs counts. Defaults to False.
+            workflow_step_tqdm_disable (bool, optional): Tqdm progress bar in each step. Defaults to True.
+            workflow_tqdm_enable (bool, optional): Overall progress bar. Defaults to False.
+            number_of_nearest_neighbors (int, optional): Number of nearest neighbours in cardinality-based algorithms. Defaults to None.
+            indexing (str, optional): Inorder/Reverse/Bilateral indexing of datasets. Defaults to None.
+            similarity_function (str, optional): Function used to evaluate the similarity of two vector-based representations of entities. Defaults to None.
+            language_model (str, optional): Language model used to vectorize the entities. Defaults to None.
+            tokenizer (str, optional): Text tokenizer used. Defaults to None.
+            weighting_scheme (str, optional): Scheme used to evaluate the weight between nodes of the intermediate representation graph. Defaults to None.
+            block_building (dict, optional): Algorithm and its parameters used to construct the blocks. Defaults to None.
+            block_purging (dict, optional): Algorithm and its parameters used to delete obsolete blocks. Defaults to None.
+            block_filtering (dict, optional): Algorithm and its parameters used to lower the cardinality of blocks. Defaults to None.
+            window_size (int, optional): Window size in the Sorted Neighborhood Progressive ER workflows. Defaults to None.
+ """ + self.block_building, self.block_purging, self.block_filtering, self.algorithm = \ + block_building, block_purging, block_filtering, matcher_arguments['algorithm'] + steps = [self.block_building, self.block_purging, self.block_filtering, self.algorithm] + num_of_steps = sum(x is not None for x in steps) + self._workflow_bar = tqdm(total=num_of_steps, + desc=self.name, + disable=not workflow_tqdm_enable) + + self.data : Data = data + self._init_experiment() + start_time = time() + self.matcher_arguments = matcher_arguments + self.matcher_name = self.matcher_arguments['matcher'] + self.dataset_name = self.matcher_arguments['dataset'] + matcher = class_references[matcher_arguments['matcher']] + self.constructor_arguments = new_dictionary_from_keys(dictionary=self.matcher_arguments, keys=get_class_function_arguments(class_reference=matcher, function_name='__init__')) + self.predictor_arguments = new_dictionary_from_keys(dictionary=self.matcher_arguments, keys=get_class_function_arguments(class_reference=matcher, function_name='predict')) + print(self.constructor_arguments) + print(self.predictor_arguments) + + progressive_matcher : ProgressiveMatching = matcher(**self.constructor_arguments) + self.progressive_matcher : ProgressiveMatching = progressive_matcher + # + # Block Building step: Only one algorithm can be performed + # + block_building_method = (self.block_building['method'](**self.block_building["params"]) \ + if "params" in self.block_building \ + else self.block_building['method']()) if self.block_building \ + else (None if not self._blocks_required() else StandardBlocking()) + + bblocks = None + block_building_blocks = None + if block_building_method: + block_building_blocks = \ + block_building_method.build_blocks(data, + attributes_1=self.block_building["attributes_1"] \ + if(self.block_building is not None and "attributes_1" in self.block_building) else None, + attributes_2=self.block_building["attributes_2"] \ + if(self.block_building is not None and "attributes_2" in self.block_building) else None, + tqdm_disable=workflow_step_tqdm_disable) + self.final_pairs = bblocks = block_building_blocks + res = block_building_method.evaluate(block_building_blocks, + export_to_dict=True, + with_classification_report=with_classification_report, + verbose=verbose) + self._save_step(res, block_building_method.method_configuration()) + self._workflow_bar.update(1) + + if(block_building_blocks is not None): + # + # Block Purging step [optional] + # + bblocks = block_building_blocks + block_purging_blocks = None + if(self.block_purging is not None): + block_purging_method = self.block_purging['method'](**self.block_purging["params"]) \ + if "params" in self.block_purging \ + else self.block_purging['method']() + block_purging_blocks = block_purging_method.process(bblocks, + data, + tqdm_disable=workflow_step_tqdm_disable) + self.final_pairs = bblocks = block_purging_blocks + res = block_purging_method.evaluate(bblocks, + export_to_dict=True, + with_classification_report=with_classification_report, + verbose=verbose) + self._save_step(res, block_purging_method.method_configuration()) + self._workflow_bar.update(1) + # + # Block Filtering step [optional] + # + block_filtering_blocks = None + if(self.block_filtering is not None): + block_filtering_method = self.block_filtering['method'](**self.block_filtering["params"]) \ + if "params" in self.block_filtering \ + else self.block_filtering['method']() + block_filtering_blocks = block_filtering_method.process(bblocks, + data, + 
tqdm_disable=workflow_step_tqdm_disable) + self.final_pairs = bblocks = block_filtering_blocks + res = block_filtering_method.evaluate(bblocks, + export_to_dict=True, + with_classification_report=with_classification_report, + verbose=verbose) + self._save_step(res, block_filtering_method.method_configuration()) + self._workflow_bar.update(1) + + # + # Progressive Matching step + # + self.final_pairs : List[Tuple[float, int, int]] = progressive_matcher.predict(data=data, blocks=bblocks, dataset_identifier=self.dataset_name, **self.predictor_arguments) + evaluator = Evaluation(self.data) + self.tp_indices, self.total_emissions = evaluator.calculate_tps_indices(pairs=self.final_pairs,duplicate_of=progressive_matcher.duplicate_of, duplicate_emitted=progressive_matcher.duplicate_emitted) + self.total_candidates = len(self.final_pairs) + self._workflow_bar.update(1) + self.workflow_exec_time = time() - start_time + + def _blocks_required(self): + return not isinstance(self.progressive_matcher, BlockIndependentPM) + + def _init_experiment(self) -> None: + self.f1: list = [] + self.recall: list = [] + self.precision: list = [] + self.runtime: list = [] + self.configurations: list = [] + self.workflow_exec_time: float + + def visualize( + self, + f1: bool = True, + recall: bool = True, + precision: bool = True, + separate: bool = False + ) -> None: + pass + + def to_df(self) -> pd.DataFrame: + pass + + def export_pairs(self) -> pd.DataFrame: + pass + + def _save_step(self, results: dict, configuration: dict) -> None: + pass + + def get_final_scores(self) -> Tuple[float, float, float]: + pass + + def retrieve_matcher_workflows(self, workflows : dict, arguments : dict) -> list: + """Retrieves the list of already executed workflows for the matcher/model of current workflow + + Args: + workflows (dict): Dictionary of script's executed workflows' information + arguments (dict): Arguments that have been supplied for current workflow execution + + Returns: + list: List of already executed workflows for given workflow's arguments' matcher/model + """ + dataset : str = self.dataset_name + matcher : str = self.matcher_name + + workflows[dataset] = workflows[dataset] if dataset in workflows else dict() + matcher_results = workflows[dataset] + matcher_results[matcher] = matcher_results[matcher] if matcher in matcher_results \ + else ([] if('language_model' not in arguments) else {}) + + matcher_info = matcher_results[matcher] + workflows_info = matcher_info + if(isinstance(matcher_info, dict)): + lm_name = arguments['language_model'] + matcher_info[lm_name] = matcher_info[lm_name] if lm_name in matcher_info else [] + workflows_info = matcher_info[lm_name] + + return workflows_info + + + + def save(self, arguments : dict, path : str = None, results = None) -> dict: + """Stores argument / execution information for current workflow within a workflows dictionary. 
+
+        Args:
+            arguments (dict): Arguments that have been supplied for the current workflow execution
+            path (str): Path where the workflows' results are stored (Defaults to None)
+            results (dict): A dictionary of workflows' results in which we want to store the current workflow's arguments/info (Defaults to None)
+        Returns:
+            dict: Dictionary containing the information about the given workflow
+        """
+        if(path is None and results is None):
+            raise ValueError("No dictionary path or workflows dictionary given - Cannot save workflow.")
+
+        if(results is not None):
+            workflows = results
+        elif(not os.path.exists(path) or os.path.getsize(path) == 0):
+            workflows = {}
+        else:
+            with open(path, 'r', encoding="utf-8") as file:
+                workflows = json.load(file)
+
+        category_workflows = self.retrieve_matcher_workflows(workflows=workflows, arguments=arguments)
+        self.save_workflow_info(arguments=arguments)
+        category_workflows.append(self.info)
+
+        if(path is not None):
+            with open(path, 'w', encoding="utf-8") as file:
+                json.dump(workflows, file, indent=4)
+
+        return self.info
+
+    def save_workflow_info(self, arguments : dict) -> None:
+        """Stores the current workflow's argument values and execution-related data (such as execution time and total emissions)
+
+        Args:
+            arguments (dict): Arguments that were passed to the progressive workflow at hand
+        """
+
+        workflow_info : dict = {k: v for k, v in arguments.items()}
+        workflow_info['total_candidates'] = self.total_candidates
+        workflow_info['total_emissions'] = self.total_emissions
+        workflow_info['time'] = self.workflow_exec_time
+        workflow_info['name'] = generate_unique_identifier()
+        workflow_info['tp_idx'] = self.tp_indices
+        workflow_info['dataset'] = self.dataset_name
+        workflow_info['matcher'] = self.matcher_name
+
+        self.info = workflow_info
+
+    def print_info(self, info : dict):
+        for attribute in info:
+            value = info[attribute]
+            if(attribute != 'tp_idx'):
+                print(f"{attribute} : {value}")
+            else:
+                print(f"true_positives : {len(value)}")
+
 def compare_workflows(workflows: List[PYJEDAIWorkFlow], with_visualization=True) -> pd.DataFrame:
     """Compares workflows by creating multiple plots and tables with results.
@@ -526,8 +795,10 @@ def best_blocking_workflow_ccer(self) -> None:
         self.comparison_cleaning = dict(method=WeightedEdgePruning, params=dict(weighting_scheme='EJS'))
         self.entity_matching = dict(method=EntityMatching,
                                     params=dict(metric='cosine',
-                                                tokenizer='tfidf_char_3gram',
-                                                similarity_threshold=0.0))
+                                                tokenizer='char_tokenizer',
+                                                vectorizer='tfidf',
+                                                qgram=3,
+                                                similarity_threshold=0.0))
         self.clustering = dict(method=UniqueMappingClustering, exec_params=dict(similarity_threshold=0.17))
         self.name="best-ccer-workflow"
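The FrequencyEvaluator changes above encode two small rules that are easy to miss in the diff: how a pairwise distance is mapped to a similarity for each metric (distance_to_similarity_matrix), and how entity ids are shifted in predict() when the preloaded matrix was built with the opposite indexing. The standalone Python sketch below restates just those two rules; the helper names to_similarity and translate_ids and the toy dataset sizes are invented for illustration and are not part of the pyJedAI API.

import numpy as np

def to_similarity(distance_matrix: np.ndarray, metric: str) -> np.ndarray:
    # Mirrors distance_to_similarity_matrix(): squared-euclidean distances are squashed
    # into (0, 1], cosine distance becomes cosine similarity, anything else passes through.
    if metric == 'sqeuclidean':
        return 1.0 / (1.0 + (distance_matrix ** 2))
    if 'cosine' in metric:
        return 1.0 - distance_matrix
    return distance_matrix

def translate_ids(id1: int, id2: int, d1_size: int, d2_size: int) -> tuple:
    # Mirrors predict() when self.indexing != self.distance_matrix_indexing:
    # the D1-side id is shifted past the other dataset's block and the D2-side id
    # is shifted back before it, so the lookup lands in the matrix built the other way round.
    return id1 + d2_size, id2 - d1_size

cosine_distances = np.array([[0.0, 0.4],
                             [0.4, 0.0]])
print(to_similarity(cosine_distances, 'cosine'))   # [[1.0 0.6] [0.6 1.0]]
print(translate_ids(0, 3, d1_size=3, d2_size=2))   # (2, 0)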
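Similarly, the new plotting helpers in visualization.py are easiest to understand from a call site. The sketch below builds a tiny results dataframe and draws one curve per language_model/weighting_scheme combination averaged per budget; the column names, the toy values and the pyjedai.visualization import path are assumptions made for this example, not fixtures of the library.

import pandas as pd
from pyjedai.visualization import plot_feature_progress_per_attribute_group

# Toy progressive-ER results: one row per (run, budget) with the tracked feature ('auc').
runs = pd.DataFrame({
    'budget':           [1000, 5000, 1000, 5000, 1000, 5000],
    'auc':              [0.61, 0.72, 0.64, 0.75, 0.58, 0.70],
    'language_model':   ['st5', 'st5', 'st5', 'st5', 'bert', 'bert'],
    'weighting_scheme': ['X2',  'X2',  'JS',  'JS',  'X2',  'X2'],
})

# One curve per distinct (language_model, weighting_scheme) pair, averaged per budget.
plot_feature_progress_per_attribute_group(method_name='progressive_matching',
                                          dataset_name='toy',
                                          feature='auc',
                                          attributes=['language_model', 'weighting_scheme'],
                                          df=runs,
                                          save=False)  # only display; nothing is written to disk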