diff --git a/docs/pyjedai/_version.py b/docs/pyjedai/_version.py index a73339b..ae73625 100644 --- a/docs/pyjedai/_version.py +++ b/docs/pyjedai/_version.py @@ -1 +1 @@ -__version__ = "0.0.8" +__version__ = "0.1.3" diff --git a/docs/pyjedai/block_building.py b/docs/pyjedai/block_building.py index fd29790..36e7414 100644 --- a/docs/pyjedai/block_building.py +++ b/docs/pyjedai/block_building.py @@ -13,7 +13,7 @@ from tqdm.auto import tqdm from .datamodel import Block, Data, PYJEDAIFeature -from .utils import (are_matching, drop_big_blocks_by_size, +from .utils import (are_matching, drop_big_blocks_by_size, create_entity_index, drop_single_entity_blocks, get_blocks_cardinality) from .evaluation import Evaluation @@ -93,39 +93,148 @@ def evaluate(self, self.stats(eval_blocks) return eval_result - def stats(self, blocks: dict) -> None: - self.list_of_sizes = [] + def stats(self, blocks: dict, verbose: bool = True) -> dict: + + # Atomic features + self.portion_of_singleton_entites = 0 + self.portion_of_duplicate_blocks = 0 # contain the same entities + self.num_of_block_assignments = 0 + self.num_of_minimal_blocks = 0 # one-comparison blocks + self.num_of_blocks_per_entity = 0 + self.average_number_of_block_assignments_per_comparison = 0 + self.optimality_distance = 0 self.entities_in_blocks = set() + self.size_per_block = [] + self.cardinalities = [] + self.num_of_blocks = len(blocks) for block in blocks.values(): self.sum_of_sizes += block.get_size() self.min_block_size = min(self.min_block_size, block.get_size()) if self.min_block_size else block.get_size() self.max_block_size = max(self.max_block_size, block.get_size()) if self.max_block_size else block.get_size() self.min_block_comparisons = min(self.min_block_comparisons, block.get_cardinality(self.data.is_dirty_er)) if self.min_block_comparisons else block.get_cardinality(self.data.is_dirty_er) self.max_block_comparisons = max(self.max_block_comparisons, block.get_cardinality(self.data.is_dirty_er)) if self.max_block_comparisons else block.get_cardinality(self.data.is_dirty_er) - self.list_of_sizes.append(block.get_size()) + self.size_per_block.append(block.get_size()) self.entities_in_blocks = self.entities_in_blocks.union(block.entities_D1) if not self.data.is_dirty_er: self.entities_in_blocks = self.entities_in_blocks.union(block.entities_D2) - self.total_num_of_comparisons += block.get_cardinality(self.data.is_dirty_er) - - self.num_of_blocks = len(blocks) + cardinality = block.get_cardinality(self.data.is_dirty_er) + self.cardinalities.append(cardinality) + if cardinality == 1: + self.num_of_minimal_blocks += 1 + + self.num_of_minimal_blocks /= self.num_of_blocks + self.num_of_entities_in_blocks = len(self.entities_in_blocks) + self.num_of_block_assignments = self.total_num_of_comparisons = sum(self.cardinalities) self.average_block_size = int(self.sum_of_sizes / self.num_of_blocks) - self.list_of_sizes = sorted(self.list_of_sizes) - median = self.list_of_sizes[int(len(self.list_of_sizes)/2)] - print( - "Statistics:" + - "\n\tNumber of blocks: " + str(self.num_of_blocks) + - "\n\tAverage block size: " + str(self.average_block_size) + - "\n\tMedian block size: " + str(median) + - "\n\tMax block size: " + str(self.max_block_size) + - "\n\tMin block size: " + str(self.min_block_size) + - "\n\tNumber of blocks dropped: " + str(self.num_of_blocks_dropped) + - "\n\tNumber of comparisons: " + str(self.total_num_of_comparisons) + - "\n\tMax comparisons per block: " + str(self.max_block_comparisons) + - "\n\tMin comparisons per block: " + 
str(self.min_block_comparisons) + - "\n\tEntities in blocks: " + str(len(self.entities_in_blocks)) - ) - print(u'\u2500' * 123) + self.size_per_block = sorted(self.size_per_block) + self.num_of_blocks_per_entity = self.num_of_blocks / self.num_of_entities_in_blocks + self.average_number_of_block_assignments_per_comparison = self.num_of_block_assignments / (2*self.total_num_of_comparisons) + median = self.size_per_block[int(len(self.size_per_block)/2)] + + entity_index = create_entity_index(blocks, self.data.is_dirty_er) + + # Distributional features + self.blocks_frequency = [] + self.relative_block_frequency = [] + self.comparison_frequency = [] + self.relative_comparison_frequency = [] + + for entity in entity_index: + if len(entity_index[entity]) == 1: + self.portion_of_singleton_entites += 1 + self.blocks_frequency.append(len(entity_index[entity])) + self.relative_block_frequency.append(len(entity_index[entity]) / self.num_of_blocks) + self.comparison_frequency.append(sum([blocks[block_key].get_cardinality(self.data.is_dirty_er) for block_key in entity_index[entity]])) + self.relative_comparison_frequency.append(sum([blocks[block_key].get_cardinality(self.data.is_dirty_er) for block_key in entity_index[entity]]) / self.total_num_of_comparisons) + + self.portion_of_singleton_entites /= self.num_of_entities_in_blocks + self.portion_of_minimal_blocks = self.num_of_minimal_blocks / self.num_of_blocks + + # Distributional features + self.average_blocks_per_entity = np.mean(self.blocks_frequency) + self.average_number_of_block_assignments_per_entity = np.mean(self.relative_block_frequency) + self.average_comparison_per_entity = np.mean(self.comparison_frequency) + self.average_relative_number_of_comparisons_per_entity = np.mean(self.relative_comparison_frequency) + + self.entropy_of_blocks_per_entity = -np.sum([p * np.log2(p) for p in self.blocks_frequency]) + self.entropy_of_comparison_per_entity = -np.sum([p * np.log2(p) for p in self.comparison_frequency]) + + self.kurtosis_of_blocks_per_entity = np.sum([(p - self.average_blocks_per_entity)**4 for p in self.blocks_frequency]) /\ + (self.num_of_blocks * self.average_blocks_per_entity**4) + self.kurtosis_of_comparison_per_entity = np.sum([(p - self.average_comparison_per_entity)**4 for p in self.comparison_frequency]) /\ + (self.num_of_blocks * self.average_comparison_per_entity**4) + + self.skewness_of_blocks_per_entity = np.sum([(p - self.average_blocks_per_entity)**3 for p in self.blocks_frequency]) /\ + (self.num_of_blocks * self.average_blocks_per_entity**3) + self.skewness_of_comparison_per_entity = np.sum([(p - self.average_comparison_per_entity)**3 for p in self.comparison_frequency]) /\ + (self.num_of_blocks * self.average_comparison_per_entity**3) + + + if verbose: + print( + "Statistics:" + + "\n\tNumber of blocks: " + str(self.num_of_blocks) + + "\n\tAverage block size: " + str(self.average_block_size) + + "\n\tMedian block size: " + str(median) + + "\n\tMax block size: " + str(self.max_block_size) + + "\n\tMin block size: " + str(self.min_block_size) + + "\n\tNumber of blocks dropped: " + str(self.num_of_blocks_dropped) + + "\n\tNumber of comparisons: " + str(self.total_num_of_comparisons) + + "\n\tMax comparisons per block: " + str(self.max_block_comparisons) + + "\n\tMin comparisons per block: " + str(self.min_block_comparisons) + + "\n\tEntities in blocks: " + str(len(self.entities_in_blocks)) + ) + print(u'\u2500' * 123) + print( + "\tAtomic feautures" + + "\n\t\tNumber of entities in blocks: " + 
str(self.num_of_entities_in_blocks) + + "\n\t\tNumber of blocks: " + str(self.num_of_blocks) + + "\n\t\tPortion of singleton entities: " + str(self.portion_of_singleton_entites) + + "\n\t\tTotal number of comparisons: " + str(self.total_num_of_comparisons) + + "\n\t\tNumber of blocks: " + str(self.num_of_blocks) + + "\n\t\tNumber of block assignments: " + str(self.num_of_block_assignments) + + "\n\t\tPortion of minimal blocks: " + str(self.portion_of_minimal_blocks) + + "\n\t\tNumber of blocks per entity: " + str(self.num_of_blocks_per_entity) + + "\n\t\tAverage number of block assignments per comparison: " + str(self.average_number_of_block_assignments_per_comparison) + ) + print(u'\u2500' * 123) + print( + "\tDistributional feautures" + + "\n\t\tAverage blocks per entity: " + str(self.average_blocks_per_entity) + + "\n\t\tAverage number of block assignments per entity: " + str(self.average_number_of_block_assignments_per_entity) + + "\n\t\tAverage comparison per entity: " + str(self.average_comparison_per_entity) + + "\n\t\tAverage relative number of comparisons per entity: " + str(self.average_relative_number_of_comparisons_per_entity) + + "\n\t\tEntropy of blocks per entity: " + str(self.entropy_of_blocks_per_entity) + + "\n\t\tEntropy of comparison per entity: " + str(self.entropy_of_comparison_per_entity) + + "\n\t\tKurtosis of blocks per entity: " + str(self.kurtosis_of_blocks_per_entity) + + "\n\t\tKurtosis of comparison per entity: " + str(self.kurtosis_of_comparison_per_entity) + + "\n\t\tSkewness of blocks per entity: " + str(self.skewness_of_blocks_per_entity) + + "\n\t\tSkewness of comparison per entity: " + str(self.skewness_of_comparison_per_entity) + ) + print(u'\u2500' * 123) + + return { + 'num_of_blocks': self.num_of_blocks, + 'average_block_size': self.average_block_size, + 'median_block_size': median, + 'max_block_size': self.max_block_size, + 'min_block_size': self.min_block_size, + 'num_of_blocks_dropped': self.num_of_blocks_dropped, + 'total_num_of_comparisons': self.total_num_of_comparisons, + 'max_block_comparisons': self.max_block_comparisons, + 'min_block_comparisons': self.min_block_comparisons, + 'entities_in_blocks': len(self.entities_in_blocks), + 'average_blocks_per_entity': self.average_blocks_per_entity, + 'average_number_of_block_assignments_per_entity': self.average_number_of_block_assignments_per_entity, + 'average_comparison_per_entity': self.average_comparison_per_entity, + 'average_relative_number_of_comparisons_per_entity': self.average_relative_number_of_comparisons_per_entity, + 'entropy_of_blocks_per_entity': self.entropy_of_blocks_per_entity, + 'entropy_of_comparison_per_entity': self.entropy_of_comparison_per_entity, + 'kurtosis_of_blocks_per_entity': self.kurtosis_of_blocks_per_entity, + 'kurtosis_of_comparison_per_entity': self.kurtosis_of_comparison_per_entity, + 'skewness_of_blocks_per_entity': self.skewness_of_blocks_per_entity, + 'skewness_of_comparison_per_entity': self.skewness_of_comparison_per_entity + } def export_to_df( self, @@ -140,9 +249,6 @@ def export_to_df( Returns: pd.DataFrame: Dataframe predicted pairs (can be exported to csv) """ - if self.data.ground_truth is None: - raise AttributeError("Can not proceed to evaluation without a ground-truth file. 
\ - Data object mush have initialized with the ground-truth file") pairs_df = pd.DataFrame(columns=['id1', 'id2']) for _, block in blocks.items(): if self.data.is_dirty_er: diff --git a/docs/pyjedai/clustering.py b/docs/pyjedai/clustering.py index d1c8c14..4f84470 100644 --- a/docs/pyjedai/clustering.py +++ b/docs/pyjedai/clustering.py @@ -2,14 +2,322 @@ from time import time import pandas as pd -from networkx import Graph, connected_components +from networkx import Graph, connected_components, gomory_hu_tree from tqdm.autonotebook import tqdm +from ordered_set import OrderedSet +import numpy as np +from scipy.sparse import csr_matrix, lil_matrix from .datamodel import Data, PYJEDAIFeature from .evaluation import Evaluation from .utils import are_matching +from collections import defaultdict +import random +from ordered_set import OrderedSet +RANDOM_SEED = 42 + +class EquivalenceCluster(PYJEDAIFeature): + + def __init__(self, data : Data) -> None: + super().__init__() + self.data : Data = data + self.d1_entities = OrderedSet() + self.d2_entities = OrderedSet() + + def __init__(self, data : Data, flattened_cluster : list) -> None: + super().__init__() + self.data : Data = data + self.d1_entities = set() + self.d2_entities = set() + self.add_entities(flattened_cluster) + + def get_entity_dataset(self, entity : int) -> set: + return self.d1_entities \ + if(entity < self.data.dataset_limit) \ + else self.d2_entities + + def add_entity(self, entity : int) -> None: + target_dataset_entities = self.get_entity_dataset(entity) + target_dataset_entities.add(entity) + + def add_entities(self, entities : list) -> None: + for entity in entities: + self.add_entity(entity) + + def get_entities(self) -> list: + return list((self.get_D1_entities() | self.get_D2_entities())) + + def get_D1_entities(self) -> set: + return self.d1_entities + + def get_D2_entities(self) -> set: + return self.d2_entities + + def has_entities(self) -> bool: + return self.has_D1_entities() or self.has_D2_entities() + + def has_D1_entities(self) -> bool: + return (len(self.d1_entities) > 0) + + def has_D2_entities(self) -> bool: + return (len(self.d1_entities) > 0) + + def has_entity(self, entity : int) -> bool: + target_dataset_entities = self.get_entity_dataset(entity) + return (entity in target_dataset_entities) + + def remove_entity(self, entity: int) -> None: + target_dataset_entities = self.get_entity_dataset(entity) + target_dataset_entities.remove(entity) + + def remove_entities(self, entities: list) -> None: + for entity in entities: + self.remove_entity(entity) + + def flatten(self) -> list: + flattened_cluster : list = [] + + for d1_entity in self.d1_entities: + flattened_cluster.append(d1_entity) + for d2_entity in self.d2_entities: + flattened_cluster.append(d2_entity) + + return flattened_cluster + + def evaluate(self, + prediction=None, + export_to_df: bool = False, + export_to_dict: bool = False, + with_classification_report: bool = False, + verbose: bool = True) -> any: + pass + + def _configuration(self) -> dict: + pass + + def stats(self) -> None: + pass + +class ExtendedSimilarityEdge(PYJEDAIFeature): + def __init__(self, + left_node : int, + right_node : int, + similarity : float, + active : bool = True) -> None: + super().__init__() + self.set_left_node(left_node=left_node) + self.set_right_node(right_node=right_node) + self.set_similarity(similarity=similarity) + self.set_active(active=active) + + def set_left_node(self, left_node : int): + self.left_node : int = left_node + + def set_right_node(self, 
right_node : int): + self.right_node : int = right_node + + def set_similarity(self, similarity : float): + self.similarity : float = similarity + + def set_active(self, active : bool): + self.active : bool = active + + def is_active(self): + return self.active + + def __lt__(self, other): + return self.similarity < other.similarity + + def __le__(self, other): + return self.similarity <= other.similarity + + def __eq__(self, other): + return self.similarity == other.similarity + + def __ne__(self, other): + return self.similarity != other.similarity + + def __gt__(self, other): + return self.similarity > other.similarity + + def __ge__(self, other): + return self.similarity >= other.similarity + + def evaluate(self, + prediction=None, + export_to_df: bool = False, + export_to_dict: bool = False, + with_classification_report: bool = False, + verbose: bool = True) -> any: + pass + + def _configuration(self) -> dict: + pass + + def stats(self) -> None: + pass + +class Vertex(PYJEDAIFeature): + def __init__(self, + identifier : int, + edges : list = None) -> None: + super().__init__() + self.set_identifier(identifier=identifier) + self.set_attached_edges(attached_edges=0) + self.set_weight_sum(weight_sum=0) + self.set_edges(edges={}) + if(edges is not None): self.insert_edges(edges=edges) + + def set_identifier(self, identifier : int) -> None: + self.identifier : int = identifier + + def set_attached_edges(self, attached_edges : int) -> None: + self.attached_edges : int = attached_edges + + def set_weight_sum(self, weight_sum : float) -> None: + self.weight_sum : float = weight_sum + + def set_edges(self, edges : dict) -> None: + self.edges : dict = edges + + def set_average_weight(self, average_weight : float) -> None: + self.average_weight : float = average_weight + + def insert_edges(self, edges : list) -> None: + for edge in edges: + self.insert_edge(edge=edge) + + def insert_edge(self, edge : tuple) -> None: + vertex, weight = edge + self.update_weight_sum_by(update_value=weight) + self.update_attached_edges_by(update_value=1) + self.edges[vertex] = weight + self.update_average_weight() + + def remove_edges(self, edges : list) -> None: + for edge in edges: + self.remove_edge(edge=edge) + + def remove_edge(self, edge : int) -> None: + weight = self.edges.pop(edge, None) + if(weight is not None): + self.update_attached_edges_by(update_value=-1) + self.update_weight_sum_by(update_value=-weight) + self.update_average_weight() + + def get_attached_edges(self) -> int: + return self.attached_edges + + def get_weight_sum(self) -> float: + return self.weight_sum + + def get_edges(self) -> list: + return self.edges + + def get_identifier(self) -> int: + return self.identifier + + def get_similarity_with(self, entity : int) -> float: + return self.edges[entity] if entity in self.edges else 0.0 + + def update_weight_sum_by(self, update_value : float) -> None: + self.set_weight_sum(self.get_weight_sum() + update_value) + + def update_attached_edges_by(self, update_value : float) -> None: + self.set_attached_edges(self.get_attached_edges() + update_value) + + def update_average_weight(self, negative = True) -> None: + _average_weight : float = (self.get_weight_sum() / self.get_attached_edges()) + _average_weight = -_average_weight if negative else _average_weight + self.set_average_weight(average_weight=_average_weight) + + def has_edges(self): + return (self.get_attached_edges() > 0) + + def __lt__(self, other): + return self.average_weight < other.average_weight + + def __le__(self, other): + return 
self.average_weight <= other.average_weight + + def __eq__(self, other): + return self.average_weight == other.average_weight + + def __ne__(self, other): + return self.average_weight != other.average_weight + + def __gt__(self, other): + return self.average_weight > other.average_weight + + def __ge__(self, other): + return self.average_weight >= other.average_weight + + def evaluate(self, + prediction=None, + export_to_df: bool = False, + export_to_dict: bool = False, + with_classification_report: bool = False, + verbose: bool = True) -> any: + pass + + def _configuration(self) -> dict: + pass + + def stats(self) -> None: + pass + +class RicochetCluster(PYJEDAIFeature): + def __init__(self, + center : int, + members : []) -> None: + super().__init__() + self.set_center(center=center) + self.set_members(members=set()) + self.add_members(new_members=members) + + def set_center(self, center : int) -> None: + self.center : int = center + + def set_members(self, members : set) -> None: + self.members : set = members + + def add_members(self, new_members : list) -> None: + for new_member in new_members: + self.add_member(new_member) + + def add_member(self, new_member: int) -> None: + self.members.add(new_member) + + def remove_member(self, member : int) -> None: + self.members.remove(member) + + def get_members(self) -> list: + return self.members + + def get_center(self) -> int: + return self.center + + def change_center(self, new_center : int): + self.remove_member(member=self.get_center()) + self.add_member(new_member=new_center) + self.set_center(center=new_center) + + def evaluate(self, + prediction=None, + export_to_df: bool = False, + export_to_dict: bool = False, + with_classification_report: bool = False, + verbose: bool = True) -> any: + pass + + def _configuration(self) -> dict: + pass + + def stats(self) -> None: + pass + class AbstractClustering(PYJEDAIFeature): def __init__(self) -> None: @@ -68,9 +376,6 @@ def export_to_df(self, prediction: list) -> pd.DataFrame: Returns: pd.DataFrame: Dataframe containg evaluation scores and stats """ - if self.data.ground_truth is None: - raise AttributeError("Can not proceed to evaluation without a ground-truth file. \ - Data object mush have initialized with the ground-truth file") pairs_df = pd.DataFrame(columns=['id1', 'id2']) for cluster in prediction: lcluster = list(cluster) @@ -88,9 +393,17 @@ def export_to_df(self, prediction: list) -> pd.DataFrame: ) return pairs_df - - + def sorted_indicators(self, first_indicator : int, second_indicator : int): + return (first_indicator, second_indicator) if (first_indicator < second_indicator) else (second_indicator, first_indicator) + def id_to_index(self, identifier : int): + return identifier \ + if identifier < self.data.dataset_limit \ + else (identifier - self.data.dataset_limit) + + def index_to_id(self, index : int, left_dataset : True): + return index if left_dataset else index + self.data.dataset_limit + class ConnectedComponentsClustering(AbstractClustering): """Creates the connected components of the graph. \ Applied to graph created from entity matching. 
\ @@ -174,7 +487,7 @@ def process(self, graph: Graph, data: Data, similarity_threshold: float = 0.1) - self.similarity_threshold: float = similarity_threshold start_time = time() - matched_entities = set() + matched_entities = OrderedSet() self.data = data new_graph = Graph() priority_queue = PriorityQueue(maxsize = graph.number_of_edges()*2) @@ -221,3 +534,892 @@ def process(self, graph: Graph, data: Data) -> list: def _configuration(self) -> dict: return {} + +class CenterClustering(AbstractClustering): + """Implements the Center Clustering algorithm. Input comparisons (graph edges) are sorted in descending order of similarity. Pairs of entities connected by these edges form the basis of the updated graph. Entities are evaluated to determine if they will serve + as a center of a future cluster or as its member. This evaluation is based on a comparison of their cumulative edge weights in the graph, + normalized by the number of edges in which they are involved. Finally, the algorithm identifies connected components within the graph, + using the previously defined centers as the focal points for forming clusters. + """ + + + _method_name: str = "Center Clustering" + _method_short_name: str = "CC" + _method_info: str = "Implements the Center Clustering algorithm," + \ + "In essence, it decides whether each node of an edge becomes a cluster center or a cluster member" + \ + " by comparing the nodes' edge-weight sums, normalized by the number of edges they participate in" + def __init__(self) -> None: + super().__init__() + self.similarity_threshold: float + + def process(self, graph: Graph, data: Data, similarity_threshold: float = 0.5) -> list: + + start_time = time() + self.similarity_threshold : float = similarity_threshold + self.data = data + edges_weight = defaultdict(float) + edges_attached = defaultdict(int) + comparisons = PriorityQueue(maxsize = graph.number_of_edges()*2) + + for (v1, v2, data) in graph.edges(data=True): + similarity_score = data.get('weight', 0) + if similarity_score > self.similarity_threshold: + comparisons.put((-similarity_score, v1, v2)) + edges_weight[v1] = edges_weight[v1] + similarity_score + edges_weight[v2] = edges_weight[v2] + similarity_score + + edges_attached[v1] = edges_attached[v1] + 1 + edges_attached[v2] = edges_attached[v2] + 1 + + new_graph = Graph() + cluster_centers = set() + cluster_members = set() + + while not comparisons.empty(): + similarity_score, v1, v2 = comparisons.get() + v1_is_center : bool = v1 in cluster_centers + v2_is_center : bool = v2 in cluster_centers + v1_is_member : bool = v1 in cluster_members + v2_is_member : bool = v2 in cluster_members + + if(not(v1_is_center or v2_is_center or v1_is_member or v2_is_member)): + w1 = edges_weight[v1] / edges_attached[v1] + w2 = edges_weight[v2] / edges_attached[v2] + + cluster_centers.add(v1 if w1 > w2 else v2) + cluster_members.add(v1 if w1 <= w2 else v2) + new_graph.add_edge(v1, v2, weight=-similarity_score) + elif ((v1_is_center and v2_is_center) or (v1_is_member and v2_is_member)): + continue + elif (v1_is_center and not v2_is_member): + cluster_members.add(v2) + new_graph.add_edge(v1, v2, weight=-similarity_score) + elif (v2_is_center and not v1_is_member): + cluster_members.add(v1) + new_graph.add_edge(v1, v2, weight=-similarity_score) + + clusters = list(connected_components(new_graph)) + self.execution_time = time() - start_time + return clusters + + def _configuration(self) -> dict: + return {} + +class BestMatchClustering(AbstractClustering): + """Implements the Best Match Clustering algorithm. 
Based on the supplied order, it traverses the entities of either the left (inorder) + or the right (reverse) dataset. For each entity, it retrieves all of its candidate pairs and stores them in descending similarity order. + For each source entity, only the best candidate is kept (only the highest-similarity edge is kept in the new graph). + """ + + _method_name: str = "Best Match Clustering" + _method_short_name: str = "BMC" + _method_info: str = "Implements the Best Match Clustering algorithm," + \ + "In essence, it keeps the best candidate for each entity of the source dataset (defined through ordering)" + def __init__(self) -> None: + super().__init__() + self.similarity_threshold: float + + def process(self, graph: Graph, data: Data, similarity_threshold: float = 0.5, order : str = "inorder") -> list: + + start_time = time() + self.data = data + self.similarity_threshold : float = similarity_threshold + self.order : str = order + + if(self.order != "inorder" and self.order != "reverse"): + raise ValueError(f"Best Match Clustering doesn't support {self.order} ordering - Use inorder/reverse.") + + number_of_comparisons = len(graph.edges(data=True)) + matched_entities = set() + new_graph = Graph() + candidates_of = {} + clusters = [] + + if(number_of_comparisons == 0): + return clusters + + if self.data.is_dirty_er: + raise ValueError(f"Best Match Clustering doesn't support Dirty ER.") + + source_entities_num = self.data.num_of_entities_1 \ + if(self.order == "inorder") else \ + self.data.num_of_entities_2 + + candidates_of = [PriorityQueue() for _ in range(source_entities_num)] + + for (v1, v2, data) in graph.edges(data=True): + similarity_score = data.get('weight', 0) + original_d1_entity, original_d2_entity = (v1, v2) if (v1 < v2) else (v2, v1) + + source_entity, target_entity = (original_d1_entity, original_d2_entity) \ + if(self.order == "inorder") else \ + (original_d2_entity, original_d1_entity) + + source_index = source_entity \ + if(self.order == "inorder") else \ + source_entity - self.data.dataset_limit + + if similarity_score > self.similarity_threshold: + candidates_of[source_index].put((-similarity_score, target_entity)) + + for source_index, source_candidates in enumerate(candidates_of): + while not source_candidates.empty(): + similarity, target_entity = source_candidates.get() + + if target_entity in matched_entities: + continue + + source_entity = source_index \ + if(self.order == "inorder") else \ + source_index + self.data.dataset_limit + + e1, e2 = (source_entity, target_entity) \ + if(self.order == "inorder") else \ + (target_entity, source_entity) + new_graph.add_edge(e1, e2, weight=-similarity) + matched_entities.add(source_entity) + matched_entities.add(target_entity) + break + + clusters = list(connected_components(new_graph)) + self.execution_time = time() - start_time + return clusters + + def _configuration(self) -> dict: + return {} + + def set_order(self, order : str) -> None: + self.order : str = order + + +class MergeCenterClustering(AbstractClustering): + """Implements the Merge Center Clustering algorithm. It is a simplified version of the Center Clustering algorithm, + where the two entities of a pair are not assigned the roles of cluster center and member based on their cumulative, normalized + weight in the original graph. Rather, entities of the left dataset are set as centers and their right dataset candidates + are set as members of the corresponding clusters. 
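+
+     A minimal usage sketch (illustrative only, not part of the original patch; it assumes `graph` is a
+     networkx Graph whose edges carry a 'weight' similarity score, e.g. produced by an entity-matching
+     step, and `data` is an initialized pyjedai Data object):
+
+         clusters = MergeCenterClustering().process(graph, data, similarity_threshold=0.5)
+         # `clusters` is a list of sets of entity ids (connected components of the pruned graph)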
+ """ + + + _method_name: str = "Merge Center Clustering" + _method_short_name: str = "MCC" + _method_info: str = "Ιmplements the Merge Center Clustering algorithm," + \ + "In essence, it implements Center Clustering without the cumulative, " + \ + "normalized weight calculation. Left dataset entities are set as candidate cluster centers." + def __init__(self) -> None: + super().__init__() + self.similarity_threshold: float + + def process(self, graph: Graph, data: Data, similarity_threshold: float = 0.5) -> list: + + start_time = time() + self.similarity_threshold : float = similarity_threshold + self.data = data + comparisons = PriorityQueue(maxsize = graph.number_of_edges()*2) + + for (v1, v2, data) in graph.edges(data=True): + similarity_score = data.get('weight', 0) + d1_id, d2_id = self.sorted_indicators(v1, v2) + if similarity_score > self.similarity_threshold: + comparisons.put((-similarity_score, d1_id, d2_id)) + + new_graph = Graph() + cluster_centers = set() + cluster_members = set() + + while not comparisons.empty(): + similarity_score, v1, v2 = comparisons.get() + v1_is_center : bool = v1 in cluster_centers + v2_is_center : bool = v2 in cluster_centers + v1_is_member : bool = v1 in cluster_members + v2_is_member : bool = v2 in cluster_members + + if(not(v1_is_center or v2_is_center or v1_is_member or v2_is_member)): + cluster_centers.add(v1) + cluster_members.add(v2) + new_graph.add_edge(v1, v2, weight=-similarity_score) + elif ((v1_is_center and v2_is_center) or (v1_is_member and v2_is_member)): + continue + elif (v1_is_center): + cluster_members.add(v2) + new_graph.add_edge(v1, v2, weight=-similarity_score) + elif (v2_is_center): + cluster_members.add(v1) + new_graph.add_edge(v1, v2, weight=-similarity_score) + + clusters = list(connected_components(new_graph)) + self.execution_time = time() - start_time + return clusters + + def _configuration(self) -> dict: + return {} + + +class CorrelationClustering(AbstractClustering): + """Implements the Correlation Clustering algorithm. Candidate pairs are mapped into a graph, whose connected components + act as our initial clusters. We iteratively choose one of the 3 possible moves (change, merge, break up cluster) and + we apply them on randomly chosen entities. We decide whether we should conduct the move or not, based on an objective function, + which quantifies the quality of our clusters (contain similar entities, seperate disimilar ones) + """ + + + _method_name: str = "Correlation Clustering" + _method_short_name: str = "CC" + _method_info: str = "Ιmplements the Correlation Clustering algorithm," + \ + "In essence, it implements iterative clustering, " + \ + "reassigning clusters to randomly chosen entities based on the reassignment's effect on our objective function " + \ + "that evaluates the quality of the newly defined clusters." 
+ def __init__(self) -> None: + super().__init__() + self.similarity_threshold: float + self.initial_threshold : float + self.similarity_threshold : float + self.non_similarity_threshold : float + self.move_limit : int + self.lsi_iterations: int + def process(self, + graph: Graph, + data: Data, + initial_threshold: float = 0.5, + similarity_threshold: float = 0.8, + non_similarity_threshold: float = 0.2, + move_limit: int = 1, + lsi_iterations: int = 10000) -> list: + + start_time = time() + self.data : Data = data + self.initial_threshold : float = initial_threshold + self.similarity_threshold : float = similarity_threshold + self.non_similarity_threshold : float = non_similarity_threshold + self.move_limit : int = move_limit + self.lsi_iterations: int = lsi_iterations + self.similarity = lil_matrix((self.data.num_of_entities_1, self.data.num_of_entities_2), dtype=float) + new_graph = graph.copy() + + for (v1, v2, data) in graph.edges(data=True): + d1_id, d2_id = self.sorted_indicators(v1, v2) + d1_index, d2_index = (self.id_to_index(d1_id), self.id_to_index(d2_id)) + similarity_score = data['weight'] + self.similarity[d1_index, d2_index] = similarity_score + + if similarity_score < self.initial_threshold: + new_graph.remove_edge(v1, v2) + + initial_clusters = [list(connected_component) for connected_component in connected_components(new_graph)] + + print(len(initial_clusters)) + self.clusters = [EquivalenceCluster(data=self.data, flattened_cluster=cluster) for cluster in initial_clusters] + self.initial_clusters_num = len(initial_clusters) + self.max_clusters_num = self.initial_clusters_num + 10 + self.entity_cluster_index = [0] * self.data.num_of_entities + self.valid_entities = set() + + for cluster_index, cluster in enumerate(self.clusters): + for entity in range(self.data.num_of_entities): + if(cluster.has_entity(entity=entity)): + self.valid_entities.add(entity) + self.entity_cluster_index[entity] = cluster_index + self.valid_entities = list(self.valid_entities) + + self.similar = lil_matrix((self.data.num_of_entities_1, self.data.num_of_entities_2), dtype=bool) + self.not_similar = lil_matrix((self.data.num_of_entities_1, self.data.num_of_entities_2), dtype=bool) + + for d1_index in range(self.data.num_of_entities_1): + for d2_index in range(d1_index, self.data.num_of_entities_2): + self.not_similar[d1_index, d2_index] = self.similarity[d1_index, d2_index] < self.non_similarity_threshold + self.similar[d1_index, d2_index] = self.similarity[d1_index, d2_index] > self.similarity_threshold + + random.seed(RANDOM_SEED) + previous_OF : int = self.calculate_OF() + + for iteration in range(self.lsi_iterations): + move_index : int = random.randint(0, self.move_limit - 1) + current_OF : int = self.move(move_index, previous_OF) + previous_OF = current_OF + + final_clusters : list = [] + for cluster in self.clusters: + if(cluster.has_entities()): + final_clusters.append(set(cluster.flatten())) + self.execution_time = time() - start_time + return final_clusters + + def calculate_OF(self) -> int: + OF : int = 0 + + for d1_index in range(self.data.num_of_entities_1): + for d2_index in range(d1_index, self.data.num_of_entities_2): + d1_entity = self.index_to_id(index=d1_index, left_dataset=True) + d2_entity = self.index_to_id(index=d2_index, left_dataset=False) + + similar_and_cluster_match = self.similar[d1_index, d2_index] and \ + (self.entity_cluster_index[d1_entity] == self.entity_cluster_index[d2_entity]) + dissimilar_and_cluster_missmatch = self.not_similar[d1_index, d2_index] and \ + 
(self.entity_cluster_index[d1_entity] != self.entity_cluster_index[d2_entity]) + + if(similar_and_cluster_match or dissimilar_and_cluster_missmatch): + OF += 1 + + return OF + + def move(self, move_index : int, previous_OF : int): + print(f"Move[{move_index}] OF[{previous_OF}]") + if(move_index == 0): + random_entity = random.choice(self.valid_entities) + random_cluster = random.randint(0, self.initial_clusters_num - 1) + while(not self.clusters[random_cluster].has_entities()): + random_cluster = random.randint(0, self.initial_clusters_num - 1) + return self.change_entity_cluster(previous_OF, random_entity, random_cluster) + elif(move_index == 1): + previous_cluster = random.randint(0, self.initial_clusters_num - 1) + while(not self.clusters[previous_cluster].has_entities()): + previous_cluster = random.randint(0, self.initial_clusters_num - 1) + + new_cluster = random.randint(0, self.initial_clusters_num - 1) + while((previous_cluster == new_cluster) or (not self.clusters[new_cluster].has_entities())): + new_cluster = random.randint(0, self.initial_clusters_num - 1) + + return self.unify_clusters(previous_OF, previous_cluster, new_cluster) + + elif(move_index == 2): + previous_cluster = random.randint(0, self.initial_clusters_num - 1) + while(not self.clusters[previous_cluster].has_entities()): + previous_cluster = random.randint(0, self.initial_clusters_num - 1) + return self.seperate_clusters(previous_OF, previous_cluster) + else: + raise ValueError(f"Invalid Move Index \"{move_index}\": Choose 0->2") + return float("inf") + + + def change_entity_cluster(self, previous_OF : int, entity : int, new_cluster : int): + previous_cluster = self.entity_cluster_index[entity] + self.entity_cluster_index[entity] = new_cluster + + new_OF = self.calculate_OF() + if(new_OF > previous_OF): + self.clusters[previous_cluster].remove_entity(entity) + self.clusters[new_cluster].add_entity(entity) + return new_OF + else: + self.entity_cluster_index[entity] = previous_cluster + return previous_OF + + def unify_clusters(self, previous_OF : int, previous_cluster_index : int, new_cluster_index : int): + previous_cluster = self.clusters[previous_cluster_index] + new_cluster = self.clusters[new_cluster_index] + to_be_removed_entities = [] + previous_cluster_entities = previous_cluster.get_entities() + + for entity in previous_cluster_entities: + to_be_removed_entities.append(entity) + self.entity_cluster_index[entity] = new_cluster_index + + new_OF : int = self.calculate_OF() + + if(new_OF > previous_OF): + previous_cluster.remove_entities(previous_cluster_entities) + new_cluster.add_entities(previous_cluster_entities) + return new_OF + + for to_be_removed_entity in to_be_removed_entities: + self.entity_cluster_index[to_be_removed_entity] = previous_cluster_index + + return previous_OF + + def seperate_clusters(self, previous_OF, previous_cluster_index): + previous_cluster = self.clusters[previous_cluster_index] + previous_cluster_entities = previous_cluster.get_entities() + to_be_removed_entities = [] + new_cluster_index = self.initial_clusters_num + + for index in range(0, len(previous_cluster_entities), 2): + to_be_removed_entity = previous_cluster_entities[index] + to_be_removed_entities.append(to_be_removed_entity) + self.entity_cluster_index[to_be_removed_entity] = new_cluster_index + + new_OF : int = self.calculate_OF() + + if(new_OF > previous_OF): + self.clusters.append(EquivalenceCluster(data=self.data, flattened_cluster=to_be_removed_entities)) + self.initial_clusters_num += 1 + 
previous_cluster.remove_entities(to_be_removed_entities) + return new_OF + + for to_be_removed_entity in to_be_removed_entities: + self.entity_cluster_index[to_be_removed_entity] = previous_cluster_index + + return previous_OF + + def _configuration(self) -> dict: + return {} + +class CutClustering(AbstractClustering): + """Implements the Cut Clustering algorithm. Retains the candidate pairs whose similarity is over the specified threshold. + Those pairs are mapped into graph edges. Using the newly defined graph, we retrieve its Gomory Hu Tree representation + using the Edmonds Karp flow function, while edges' capacity is considered to be infinite. We return the connected components + of the resulting minimum s-t cuts for the pairs in the original, trimmed graph. + """ + + _method_name: str = "Cut Clustering" + _method_short_name: str = "CTC" + _method_info: str = "Implements the Cut Clustering algorithm," + \ + "In essence, it calculates the Gomory Hu Tree of the graph resulting from input similarity pairs. " + \ + "We retain the connected components of this tree." + def __init__(self) -> None: + super().__init__() + self.similarity_threshold: float + + def process(self, graph: Graph, data: Data, similarity_threshold: float = 0.5, alpha: float = 0.2) -> list: + + start_time = time() + self.similarity_threshold : float = similarity_threshold + self.data = data + threshold_trimmed_graph : Graph = Graph() + + for (v1, v2, data) in graph.edges(data=True): + similarity_score = data.get('weight', 0) + d1_id, d2_id = self.sorted_indicators(v1, v2) + if similarity_score > self.similarity_threshold: + threshold_trimmed_graph.add_edge(d1_id, d2_id, weight=similarity_score) + + sink_node : int = self.data.num_of_entities + threshold_trimmed_graph.add_node(sink_node) + for node in graph.nodes(): + if node != sink_node: + threshold_trimmed_graph.add_edge(sink_node, node, weight=alpha) + + final_gomory_hu_tree = gomory_hu_tree(G=threshold_trimmed_graph, capacity='weight') + final_gomory_hu_tree.remove_node(sink_node) + clusters = list(connected_components(final_gomory_hu_tree)) + + print(len(clusters)) + self.execution_time = time() - start_time + return clusters + + def _configuration(self) -> dict: + return {} + +class MarkovClustering(AbstractClustering): + """Implements the Markov Clustering algorithm. It simulates random walks on an (n x n) matrix used as the adjacency matrix + of a weighted similarity graph. It alternates an expansion step and an inflation step until an equilibrium state is reached. + Entries with similarity above the threshold are inserted into the final graph, whose connected components we retain. + """ + + _method_name: str = "Markov Clustering" + _method_short_name: str = "MCL" + _method_info: str = "Implements the Markov Clustering algorithm," + \ + "In essence, it simulates random walks on an (n x n) matrix as the adjacency " + \ + "matrix of a graph. It alternates an expansion step and an inflation step " + \ + "until an equilibrium state is reached. We retain the connected components " + \ + "of the graph resulting from final similarity matrix entries valued over threshold." 
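+
+     # Illustrative usage sketch (added for documentation; parameter names and defaults mirror
+     # the process() signature below):
+     #
+     #     clusters = MarkovClustering().process(
+     #         graph, data,
+     #         similarity_threshold=0.5,             # edges kept in the initial similarity matrix
+     #         cluster_threshold=0.001,              # final matrix entries above this become edges of the output graph
+     #         matrix_similarity_threshold=0.00001,  # per-entry convergence tolerance between iterations
+     #         similarity_checks_limit=10)           # maximum number of expansion/inflation rounds
+     #     # returns the connected components of the resulting graph as a list of sets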
+ def __init__(self) -> None: + super().__init__() + self.similarity_threshold : float + self.cluster_threshold : float + self.matrix_similarity_threshold : float + self.similarity_checks_limit : int + + def process(self, graph: Graph, + data: Data, + similarity_threshold: float = 0.5, + cluster_threshold: float = 0.001, + matrix_similarity_threshold: float = 0.00001, + similarity_checks_limit : int = 10) -> list: + + start_time = time() + self.similarity_threshold : float = similarity_threshold + self.cluster_threshold : float = cluster_threshold + self.matrix_similarity_threshold : float = matrix_similarity_threshold + self.similarity_checks_limit : int = similarity_checks_limit + self.data = data + self.current_similarity = lil_matrix((self.data.num_of_entities, data.num_of_entities), dtype=float) + new_graph : Graph = Graph() + + + for (v1, v2, data) in graph.edges(data=True): + d1_id, d2_id = self.sorted_indicators(v1, v2) + similarity_score = data.get('weight', 0) + + if(similarity_score > self.similarity_threshold): + self.current_similarity[d1_id, d2_id] = similarity_score + self.current_similarity[d2_id, d1_id] = similarity_score + + self.set_node_loop(similarity = 1.0) + self.normalize() + + for check in range(self.similarity_checks_limit): + self.previous_similarity = self.current_similarity.copy() + self.inflate() + self.normalize() + self.expand() + self.normalize() + print(check+1) + if(self.equilibrium()): + break + + edges_populated = self.get_existing_indices(matrix=self.current_similarity) + for edge in edges_populated: + row, column = edge + new_similarity = self.current_similarity[row, column] + final_row, final_column = self.sorted_indicators(row, column) + + if(new_graph.has_edge(final_row, final_column)): + existing_similarity = new_graph[final_row][final_column]["weight"] + if(new_similarity > existing_similarity): + new_graph[final_row][final_column]["weight"] = new_similarity + elif(new_similarity > self.cluster_threshold): + new_graph.add_edge(final_row, final_column, weight=new_similarity) + + clusters = list(connected_components(new_graph)) + self.execution_time = time() - start_time + return clusters + + def set_node_loop(self, similarity : float = 1.0) -> None: + rows : int = self.current_similarity.shape[0] + print(rows) + for row in range(rows): + self.current_similarity[row, row] = similarity + + def normalize(self) -> None: + column_sums = self.current_similarity.sum(axis=0) + column_sums[column_sums == 0] = 1 + self.current_similarity = self.current_similarity.multiply(1. 
/ column_sums) + + def expand(self) -> None: + self.current_similarity = self.current_similarity.power(2) + + def inflate(self) -> None: + self.current_similarity = self.current_similarity.dot(self.current_similarity) + + def equilibrium(self) -> None: + self.current_similarity = self.current_similarity.tocsr() + self.previous_similarity = self.previous_similarity.tocsr() + + current_indices = self.get_existing_indices(matrix=self.current_similarity) + previous_indices = self.get_existing_indices(matrix=self.previous_similarity) + shared_indices = current_indices & previous_indices + + for indices in shared_indices: + row, column = indices + if(abs(self.current_similarity[row, column] - self.previous_similarity[row, column]) > self.matrix_similarity_threshold): + return False + + return True + + def get_existing_indices(self, matrix): + return set([indices for indices in zip(*matrix.nonzero())]) + + def _configuration(self) -> dict: + return {} + +class KiralyMSMApproximateClustering(AbstractClustering): + """Implements the Kiraly MSM Approximate Clustering algorithm. Implements the so-called "New Algorithm" + by Zoltan Kiraly 2013, which is a 3/2-approximation to the Maximum Stable Marriage (MSM) problem. + The pairs resulting from the approximation of the stable relationships are translated into a graph, + whose connected components we retain. + """ + + _method_name: str = "Kiraly MSM Approximate Clustering" + _method_short_name: str = "KMAC" + _method_info: str = "Ιmplements the Kiraly MSM Approximate Clustering algorithm," + \ + "In essence, it is a 3/2-approximation to the Maximum Stable Marriage (MSM) problem." + def __init__(self) -> None: + super().__init__() + self.similarity_threshold : float + + def process(self, + graph: Graph, + data: Data, + similarity_threshold: float = 0.1) -> list: + + start_time = time() + self.similarity_threshold : float = similarity_threshold + self.data = data + number_of_comparisons : int = len(graph.edges(data=True)) + clusters : list = [] + + if(number_of_comparisons == 0): + return clusters + + if self.data.is_dirty_er: + raise ValueError(f"Kiraly MSM Approximate Clustering doesn't support Dirty ER.") + + new_graph : Graph = Graph() + men : set = set() + self.men_candidates : dict = defaultdict(list) + self.women_candidates : dict = defaultdict(list) + + for (v1, v2, data) in graph.edges(data=True): + man, woman = self.sorted_indicators(v1, v2) + similarity = data.get('weight', 0) + if similarity > self.similarity_threshold: + self.men_candidates[man].append(ExtendedSimilarityEdge(left_node=man, + right_node=woman, + similarity=similarity)) + self.women_candidates[woman].append(ExtendedSimilarityEdge(left_node=woman, + right_node=man, + similarity=similarity)) + men.add(man) + + for man, candidates in self.men_candidates.items(): + self.men_candidates[man] = sorted(candidates, reverse=True) + for woman, candidates in self.women_candidates.items(): + self.women_candidates[woman] = sorted(candidates, reverse=True) + + self.is_bachelor : list = [False] * self.data.num_of_entities_1 + self.is_uncertain : list = [False] * self.data.num_of_entities_1 + self.fiances : list = [-1] * self.data.num_of_entities_2 + self.current_matches : dict = {} + self.free_men : list = list(men) + + while(len(self.free_men) > 0): + man = self.free_men.pop(0) + woman = self.get_first_active_candidate(entity=man) + + if(woman == -1): + if(not self.is_bachelor[man]): + self.is_bachelor[man] = True + if(not self.has_candidates(entity=man)): + self.free_men.append(man) + 
self.activate_candidates_of(entity=man) + else: + continue + else: + fiance = self.get_woman_fiance(woman=woman) + if(fiance == -1): + self.add_match(man=man, woman=woman, similarity=0.0) + self.set_woman_fiance(woman=woman, fiance=man) + else: + if(self.accepts_proposal(woman=woman, + man=man)): + self.remove_match(man=fiance, woman=woman) + self.add_match(man=man, woman=woman, similarity=0.0) + self.set_woman_fiance(woman=woman, fiance=man) + if(not self.is_uncertain[fiance]): + self.deactivate_candidate(entity=fiance, candidate=woman) + else: + self.deactivate_candidate(entity=man, candidate=woman) + + for _, edges in self.current_matches.items(): + for edge in edges: + man, woman, similarity = edge.left_node, edge.right_node, edge.similarity + new_graph.add_edge(man, woman, weight=similarity) + + clusters = list(connected_components(new_graph)) + self.execution_time = time() - start_time + return clusters + + def is_male(self, entity: int) -> bool: + return entity < self.data.dataset_limit + + def get_entity_candidates(self, entity : int) -> PriorityQueue: + candidates = self.men_candidates if self.is_male(entity) else self.women_candidates + return candidates[entity] + + def has_candidates(self, entity : int) -> bool: + return len(self.get_entity_candidates(entity=entity)) > 0 + + def activate_candidates_of(self, entity : int) -> None: + candidates = self.get_entity_candidates(entity=entity) + for candidate in candidates: + candidate.set_active(active=True) + + def get_first_active_candidate(self, entity : int) -> int: + candidates = self.get_entity_candidates(entity=entity) + for candidate in candidates: + if(candidate.is_active()): + return candidate.right_node + return -1 + + def add_match(self, man : int, woman : int, similarity : float) -> None: + if man not in self.current_matches: + self.current_matches[man] = [] + self.current_matches[man].append(ExtendedSimilarityEdge(left_node=man, + right_node=woman, + similarity=similarity)) + def remove_match(self, man : int, woman : int) -> None: + self.current_matches[man] = [match for match in self.current_matches[man] \ + if (match.left_node != man or match.right_node != woman)] + + def get_woman_fiance(self, woman : int) -> int: + return self.fiances[woman - self.data.dataset_limit] + + def set_woman_fiance(self, woman : int, fiance : int) -> None: + self.fiances[woman - self.data.dataset_limit] = fiance + + def deactivate_candidate(self, entity : int, candidate : int) -> bool: + entity_candidates = self.get_entity_candidates(entity=entity) + for entity_candidate in entity_candidates: + if(entity_candidate.right_node == candidate): + entity_candidate.set_active(active=False) + return True + return False + + def accepts_proposal(self, woman : int, man : int): + current_fiance : int = self.get_woman_fiance(woman=woman) + + if(current_fiance == -1): + return True + if(self.is_uncertain[current_fiance]): + return True + + man_score : float = 0.0 + current_fiance_score : float = 0.0 + + woman_candidates : list = self.get_entity_candidates(entity=woman) + + for comparison in woman_candidates: + candidate : int = comparison.right_node + if(candidate == man): + man_score = comparison.similarity + elif(candidate == current_fiance): + current_fiance_score = comparison.similarity + + return (man_score > current_fiance_score) + + def _configuration(self) -> dict: + return {} + +class RicochetSRClustering(AbstractClustering): + """Implements the Ricochet SR Clustering algorithm. 
Vertices are sorted in descending order of the average weight of their edges and are processed sequentially. + Each processed vertex becomes a new cluster center if at least one of its neighbors is more similar to it than to its current center; + the affected members are re-assigned, centers that are left with a single entity are merged into the most similar remaining cluster, + and entities that remain unassigned in the end form singleton clusters. + """ + + _method_name: str = "Ricochet SR Clustering" + _method_short_name: str = "RSRC" + _method_info: str = "Implements the Ricochet SR Clustering algorithm," + \ + "In essence, it assigns each entity to the cluster center it is most similar to, processing vertices in descending order of average edge weight." + def __init__(self) -> None: + super().__init__() + self.similarity_threshold : float + + def process(self, + graph: Graph, + data: Data, + similarity_threshold: float = 0.5) -> list: + + start_time = time() + self.similarity_threshold : float = similarity_threshold + self.data = data + clusters : list = [] + self.vertices : dict = {} + self.sorted_vertices = PriorityQueue(maxsize = self.data.num_of_entities) + + for (v1, v2, data) in graph.edges(data=True): + d1_id, d2_id = self.sorted_indicators(v1, v2) + similarity = data.get('weight', 0) + if similarity > self.similarity_threshold: + if d1_id not in self.vertices: self.vertices[d1_id] = Vertex(identifier=d1_id) + if d2_id not in self.vertices: self.vertices[d2_id] = Vertex(identifier=d2_id) + self.vertices[d1_id].insert_edge(edge=(d2_id, similarity)) + self.vertices[d2_id].insert_edge(edge=(d1_id, similarity)) + + for _, vertex in self.vertices.items(): + if(vertex.has_edges()): + self.sorted_vertices.put(vertex) + + if(self.sorted_vertices.empty()): + return clusters + + self.centers : set = set() + self.members : set = set() + self.center_of : dict = {} + self.similarity_with_center : dict = defaultdict(float) + self.current_clusters : dict = defaultdict(set) + + top_vertex : Vertex = self.sorted_vertices.get() + vertex_id : int = top_vertex.get_identifier() + self.centers.add(vertex_id) + self.center_of[vertex_id] = vertex_id + self.current_clusters[vertex_id].add(vertex_id) + self.similarity_with_center[vertex_id] = 1.0 + + top_vertex_neighbor = list(top_vertex.edges.keys())[0] + self.members.add(top_vertex_neighbor) + self.center_of[top_vertex_neighbor] = vertex_id + self.current_clusters[vertex_id].add(top_vertex_neighbor) + self.similarity_with_center[top_vertex_neighbor] = top_vertex.get_similarity_with(top_vertex_neighbor) + + while(not self.sorted_vertices.empty()): + vertex = self.sorted_vertices.get() + vertex_id = vertex.get_identifier() + to_reassign : set = set() + centers_to_reassign : set = set() + + for neighbor, similarity in vertex.edges.items(): + if(neighbor in self.centers): + continue + previous_similarity = self.similarity_with_center[neighbor] + if(previous_similarity >= similarity): + continue + to_reassign.add(neighbor) + break + + if(to_reassign): + if(vertex_id in self.members): + self.members.remove(vertex_id) + previous_center = self.center_of[vertex_id] + self.current_clusters[previous_center].remove(vertex_id) + if(len(self.current_clusters[previous_center]) < 2): + centers_to_reassign.add(previous_center) + to_reassign.add(vertex_id) + for assignee in to_reassign: + self.current_clusters[vertex_id].add(assignee) + self.centers.add(vertex_id) + + for reassign in to_reassign: + if(reassign != vertex_id): + if(reassign in self.members): + reassign_previous_center = self.center_of[reassign] + self.current_clusters[reassign_previous_center].remove(reassign) + + if(len(self.current_clusters[reassign_previous_center]) < 2): + centers_to_reassign.add(reassign_previous_center) + 
self.members.add(reassign) + self.center_of[reassign] = vertex_id + self.similarity_with_center[reassign] = vertex.get_similarity_with(reassign) + + for center_to_reassign in centers_to_reassign: + if(len(self.current_clusters[center_to_reassign]) > 1): + continue + self.centers.remove(center_to_reassign) + _ = self.current_clusters.pop(center_to_reassign, None) + + max_similarity : float = 0.0 + new_center : int = vertex_id + + for center in self.centers: + new_similarity : float = self.vertices[center].get_similarity_with(center_to_reassign) + if(new_similarity > 0.0): + if(len(self.current_clusters[center]) > 1): + continue + if(new_similarity > max_similarity): + max_similarity = new_similarity + new_center = center + if(len(self.current_clusters[new_center]) > 1): + continue + self.current_clusters[new_center].add(center_to_reassign) + self.members.add(center_to_reassign) + self.center_of[center_to_reassign]= new_center + self.similarity_with_center[center_to_reassign] = max_similarity + + for entity in range(self.data.num_of_entities): + if(entity not in self.members and entity not in self.centers): + self.centers.add(entity) + self.center_of[entity] = entity + self.current_clusters[entity].add(entity) + self.similarity_with_center[entity] = 1.0 + + clusters = [] + for center, members in self.current_clusters.items(): + center_equivalence_cluster = EquivalenceCluster(data=self.data, + flattened_cluster=list(members)) + clusters.append(set(center_equivalence_cluster.flatten())) + + self.execution_time = time() - start_time + return clusters + + def _configuration(self) -> dict: + return {} \ No newline at end of file diff --git a/docs/pyjedai/comparison_cleaning.py b/docs/pyjedai/comparison_cleaning.py index b2608e8..4d32086 100644 --- a/docs/pyjedai/comparison_cleaning.py +++ b/docs/pyjedai/comparison_cleaning.py @@ -157,13 +157,9 @@ def export_to_df(self, prediction) -> pd.DataFrame: Returns: pd.DataFrame: Dataframe with the predicted pairs """ - if self.data.ground_truth is None: - raise AttributeError("Can not proceed to evaluation without a ground-truth file. 
\ - Data object mush have initialized with the ground-truth file") pairs_df = pd.DataFrame(columns=['id1', 'id2']) - for entity_id, candidates in prediction.items(): - id1 = self.data._gt_to_ids_reversed_1[entity_id] + id1 = self.data._gt_to_ids_reversed_1[entity_id] for candiadate_id in candidates: id2 = self.data._gt_to_ids_reversed_1[candiadate_id] if self.data.is_dirty_er \ else self.data._gt_to_ids_reversed_2[candiadate_id] @@ -196,19 +192,28 @@ def __init__(self) -> None: def _apply_main_processing(self) -> dict: self._counters = np.empty([self.data.num_of_entities], dtype=float) self._flags = np.empty([self.data.num_of_entities], dtype=int) - if self.weighting_scheme == 'EJS': + if(self._comparisons_per_entity_required()): self._set_statistics() self._set_threshold() return self._prune_edges() + def _comparisons_per_entity_required(self): + return (self.weighting_scheme == 'EJS' or + self.weighting_scheme == 'CNC' or + self.weighting_scheme == 'SNC' or + self.weighting_scheme == 'SND' or + self.weighting_scheme == 'CND' or + self.weighting_scheme == 'CNJ' or + self.weighting_scheme == 'SNJ') + def _get_weight(self, entity_id: int, neighbor_id: int) -> float: ws = self.weighting_scheme - if ws == 'ARCS' or ws == 'CBS': + if ws == 'CN-CBS' or ws == 'CBS' or ws == 'SN-CBS': return self._counters[neighbor_id] # CARDINALITY_NORM_COSINE, SIZE_NORM_COSINE elif ws == 'CNC' or ws == 'SNC': - return self._counters[neighbor_id] / float(sqrt(len(self._comparisons_per_entity[entity_id]) * self._comparisons_per_entity[neighbor_id])) + return self._counters[neighbor_id] / float(sqrt(self._comparisons_per_entity[entity_id] * self._comparisons_per_entity[neighbor_id])) # SIZE_NORM_DICE, CARDINALITY_NORM_DICE elif ws == 'SND' or ws == 'CND': return 2 * self._counters[neighbor_id] / float(self._comparisons_per_entity[entity_id] + self._comparisons_per_entity[neighbor_id]) @@ -372,8 +377,10 @@ def _process_entity(self, entity_id: int) -> None: if self._flags[neighbor_id] != entity_id: self._counters[neighbor_id] = 0 self._flags[neighbor_id] = entity_id - if self.weighting_scheme == 'ARCS': + if self.weighting_scheme == 'CN-CBS' or self.weighting_scheme == 'CNC' or self.weighting_scheme == 'CND' or self.weighting_scheme == 'CNJ': self._counters[neighbor_id] += 1 / self._blocks[block_id].get_cardinality(self.data.is_dirty_er) + if self.weighting_scheme == 'SN-CBS' or self.weighting_scheme == 'SNC' or self.weighting_scheme == 'SND' or self.weighting_scheme == 'SNJ': + self._counters[neighbor_id] += 1 / self._blocks[block_id].get_size() else: self._counters[neighbor_id] += 1 self._valid_entities.add(neighbor_id) @@ -480,6 +487,7 @@ def __init__(self, weighting_scheme: str = 'CBS') -> None: self._nearest_entities: dict self._node_centric = True self._top_k_edges: PriorityQueue + self._number_of_nearest_neighbors : int = None def _prune_edges(self) -> dict: self._nearest_entities = dict() @@ -508,10 +516,13 @@ def _is_valid_comparison(self, entity_id: int, neighbor_id: int) -> bool: return True def _set_threshold(self) -> None: - block_assignments = 0 - for block in self._blocks.values(): - block_assignments += block.get_size() - self._threshold = max(1, block_assignments / self.data.num_of_entities) + if(self._number_of_nearest_neighbors is None): + block_assignments = 0 + for block in self._blocks.values(): + block_assignments += block.get_size() + self._threshold = max(1, block_assignments / self.data.num_of_entities) + else: + self._threshold = self._number_of_nearest_neighbors def 
_verify_valid_entities(self, entity_id: int) -> None: if entity_id not in self._entity_index: @@ -546,7 +557,7 @@ class ReciprocalCardinalityNodePruning(CardinalityNodePruning): "that correspond to edges in the blocking graph that are among " + \ "the top-k weighted ones for both adjacent entities/nodes." - def __init__(self, weighting_scheme: str = 'ARCS') -> None: + def __init__(self, weighting_scheme: str = 'CN-CBS') -> None: super().__init__(weighting_scheme) def _is_valid_comparison(self, entity_id: int, neighbor_id: int) -> bool: @@ -645,7 +656,7 @@ class ReciprocalWeightedNodePruning(WeightedNodePruning): "that correspond to edges in the blocking graph that are " + \ "exceed the average edge weight in both adjacent node neighborhoods." - def __init__(self, weighting_scheme: str = 'ARCS') -> None: + def __init__(self, weighting_scheme: str = 'CN-CBS') -> None: super().__init__(weighting_scheme) def _get_valid_weight(self, entity_id: int, neighbor_id: int) -> float: @@ -665,6 +676,7 @@ def _set_threshold(self) -> None: def process(self, blocks: dict, data: Data, tqdm_disable: bool = False, store_weights: bool = True, cc: AbstractMetablocking = None, emit_all_tps_stop : bool = False) -> dict: self._emit_all_tps_stop : bool = emit_all_tps_stop + self._budget = self._budget if not self._emit_all_tps_stop else float('inf') if(cc is None): return super().process(blocks, data, tqdm_disable, store_weights) else: @@ -697,20 +709,22 @@ def __init__(self, weighting_scheme: str = 'CBS', budget: int = 0) -> None: self._budget = budget def _set_threshold(self) -> None: - self._threshold = max(1, 2 * self._budget / self.data.num_of_entities) if not self._emit_all_tps_stop else 2 * self._budget + self._threshold = self._number_of_nearest_neighbors def process(self, blocks: dict, data: Data, + number_of_nearest_neighbors : int = 10, tqdm_disable: bool = False, store_weights: bool = True, cc: AbstractMetablocking = None, emit_all_tps_stop : bool = False) -> dict: self._emit_all_tps_stop : bool = emit_all_tps_stop + self._number_of_nearest_neighbors : int = number_of_nearest_neighbors if(cc is None): - return super().process(blocks, data, tqdm_disable, store_weights) + return super().process(blocks=blocks, data=data, tqdm_disable=tqdm_disable, store_weights=store_weights) else: - self._threshold = max(1, 2 * self._budget / data.num_of_entities) if not self._emit_all_tps_stop else 2 * self._budget + self._threshold = self._number_of_nearest_neighbors self.trimmed_blocks : dict = defaultdict(set) for entity_id, neighbors in blocks.items(): @@ -753,9 +767,10 @@ def process( self, blocks: dict, data: Data, + window_size : int = 10, tqdm_disable: bool = False, emit_all_tps_stop : bool = False - ) -> PriorityQueue: + ) -> List[Tuple[float, int, int]]: """Calculates top comparisons for Progressive Matching Args: @@ -778,6 +793,7 @@ def process( self._emit_all_tps_stop : bool = emit_all_tps_stop self._num_of_blocks = len(blocks) self._blocks: dict = blocks + self._max_window_size : int = window_size self._sorted_entity_ids = get_sorted_blocks_shuffled_entities(self.data.is_dirty_er, self._blocks) self._total_sorted_entities = len(self._sorted_entity_ids) @@ -787,7 +803,7 @@ def process( self._flags = np.empty([self.data.num_of_entities], dtype=int) self._counters[:] = 0 self._flags[:] = -1 - self._pairs = self._apply_main_processing() + self._pairs : List[Tuple[float, int, int]]= self._apply_main_processing() self.execution_time = time() - start_time self._progress_bar.close() @@ -801,8 +817,12 @@ def 
_get_weight(self, entity_id: int, neighbor_id: int) -> float: return self._counters[neighbor_id] / denominator elif ws == 'ACF' or ws == 'ID': return self._counters[neighbor_id] + elif ws == 'COSINE': + return self._counters[neighbor_id] / float(sqrt(len(self._position_index.get_positions(entity_id)) * len(self._position_index.get_positions(neighbor_id)))) + elif ws == 'DICE': + return 2 * self._counters[neighbor_id] / float(len(self._position_index.get_positions(entity_id)) + len(self._position_index.get_positions(neighbor_id))) else: - raise ValueError("This weighting scheme does not exist") + raise ValueError("This weighting scheme does not exist") def valid_entity_neighbor_index(self, entity: int, neighbor_index: int) -> bool: """Verifies if the neighbor identifier at the specified index is valid for candidate (the pair hasn't been considered previously) @@ -837,43 +857,35 @@ class GlobalProgressiveSortedNeighborhood(ProgressiveSortedNeighborhood): def __init__(self, weighting_scheme: str = 'ACF', budget: int = 0) -> None: super().__init__(weighting_scheme, budget) - def _apply_main_processing(self) -> PriorityQueue: - self._max_window = 2 if self.data.num_of_entities <= 100 else int(2 ** (math.log10(self.data.num_of_entities) + 1) + 1) + def _apply_main_processing(self) -> List[Tuple[float, int, int]]: # TO DO: budget taken as argument in prediction, not algorithm constructor self._budget = float('inf') if self._emit_all_tps_stop else self._budget - self._top_pairs : PriorityQueue = PriorityQueue(2 * int(self._budget)) if not self._emit_all_tps_stop else PriorityQueue() - _top_unsorted_pairs: PriorityQueue = PriorityQueue(2 * int(self._budget)) if not self._emit_all_tps_stop else PriorityQueue() + self._top_pairs : List[Tuple[float, int, int]] = [] + default_weight = 0.0 + self._pair_weight : dict = defaultdict(lambda: default_weight) for entity in range(self.data.dataset_limit): entity_positions = self._position_index.get_positions(entity) self._neighbors.clear() - for current_window in range(1,self._max_window): + for current_window in range(1,self._max_window_size + 1): for entity_position in entity_positions: right_neighbor = entity_position + current_window left_neighbor = entity_position - current_window - + if(right_neighbor < self._total_sorted_entities): if(self.valid_entity_neighbor_index(entity, right_neighbor)): self._update_local_weight(current_window, entity, self._sorted_entity_ids[right_neighbor]) if(left_neighbor >= 0): if(self.valid_entity_neighbor_index(entity, left_neighbor)): self._update_local_weight(current_window, entity, self._sorted_entity_ids[left_neighbor]) - - current_minimum_weight = -1 + for neighbor in self._neighbors: self._flags[neighbor] = -1 - pair_weight = self._get_weight(entity, neighbor) + self._pair_weight[(entity, neighbor)] = max(self._pair_weight[(entity, neighbor)], self._get_weight(entity, neighbor)) - if(pair_weight >= current_minimum_weight): - _top_unsorted_pairs.put( - (pair_weight, entity, neighbor) - ) - if self._budget < _top_unsorted_pairs.qsize(): - current_minimum_weight = _top_unsorted_pairs.get()[0] - - while(not _top_unsorted_pairs.empty()): - _score, _entity, _neighbor = _top_unsorted_pairs.get() - self._top_pairs.put((-_score, _entity, _neighbor)) + for pair in self._pair_weight: + id1, id2 = pair + self._top_pairs.append((self._pair_weight[(id1, id2)], id1, id2)) return self._top_pairs @@ -915,17 +927,17 @@ def _has_next(self) -> bool: Returns: bool: Another pair can be emitted """ - return self._emitted_comparisons < 
self._budget and self._current_window < self._total_sorted_entities + return self._current_window <= self._max_window_size - def _apply_main_processing(self) -> List[Tuple[int, int]]: - self._emitted_comparisons = 0 + def _apply_main_processing(self) -> List[Tuple[float, int, int]]: self._current_window = 1 - self._top_pairs: List[Tuple[int, int]] = [] + self._top_pairs: List[Tuple[float, int, int]] = [] + default_weight = 0.0 + self._pair_weight : dict = defaultdict(lambda: default_weight) # TO DO: budget taken as argument in prediction, not algorithm constructor self._budget = float('inf') if self._emit_all_tps_stop else self._budget while(self._has_next()): - _window_top_pairs = PriorityQueue() for entity in range(self.data.dataset_limit): entity_positions = self._position_index.get_positions(entity) self._neighbors.clear() @@ -940,21 +952,16 @@ def _apply_main_processing(self) -> List[Tuple[int, int]]: if(left_neighbor >= 0): if(self.valid_entity_neighbor_index(entity, left_neighbor)): self._update_counters(entity, self._sorted_entity_ids[left_neighbor]) - + for neighbor in self._neighbors: self._flags[neighbor] = -1 - pair_weight = self._get_weight(entity, neighbor) - - _window_top_pairs.put( - (-pair_weight, entity, neighbor) - ) - - while(len(self._top_pairs) < self._budget and not _window_top_pairs.empty()): - _, _entity, _neighbor = _window_top_pairs.get() - self._top_pairs.append((_entity, _neighbor)) - self._emitted_comparisons += 1 + self._pair_weight[(entity, neighbor)] = max(self._pair_weight[(entity, neighbor)], self._get_weight(entity, neighbor)) self._current_window += 1 + + for pair in self._pair_weight: + id1, id2 = pair + self._top_pairs.append((self._pair_weight[(id1, id2)], id1, id2)) return self._top_pairs @@ -1008,113 +1015,40 @@ def _process_entity(self, entity_id: int) -> None: if self._flags[neighbor_id] != entity_id: self._counters[neighbor_id] = 0 self._flags[neighbor_id] = entity_id - if self.weighting_scheme == 'ARCS': + if self.weighting_scheme == 'CN-CBS' or self.weighting_scheme == 'CNC' or self.weighting_scheme == 'CND' or self.weighting_scheme == 'CNJ': self._counters[neighbor_id] += 1 / self._blocks[block_id].get_cardinality(self.data.is_dirty_er) + if self.weighting_scheme == 'SN-CBS' or self.weighting_scheme == 'SNC' or self.weighting_scheme == 'SND' or self.weighting_scheme == 'SNJ': + self._counters[neighbor_id] += 1 / self._blocks[block_id].get_size() else: self._counters[neighbor_id] += 1 self._valid_entities.add(neighbor_id) - + for valid_entity_id in self._valid_entities: - _current_neighbor_weight = self._get_weight(entity_id, valid_entity_id) - self._sorted_neighbors[entity_id].put((-_current_neighbor_weight, valid_entity_id)) - if(self.store_weights): - self._stored_weights[canonical_swap(entity_id, valid_entity_id)] = _current_neighbor_weight + _current_neighbor_weight = self._get_weight(entity_id, valid_entity_id) + if(self.store_weights): + self._stored_weights[canonical_swap(entity_id, valid_entity_id)] = _current_neighbor_weight - if(self.method == 'HB' and not self._sorted_neighbors[entity_id].empty()): - _top_entity_weight, _top_entity_neighbor = self._sorted_neighbors[entity_id].get() - self._to_emit_pairs.append((-_top_entity_weight, entity_id, _top_entity_neighbor)) - + self._to_emit_pairs.append((_current_neighbor_weight, entity_id, valid_entity_id)) self.blocks[entity_id] = self._valid_entities.copy() def _prune_edges(self) -> dict: return None def process_raw_blocks(self, blocks: dict): + self._average_weight = 
np.zeros(self._limit, dtype=float) self._entity_index = create_entity_index(blocks, self.data.is_dirty_er) self._apply_main_processing() def process_prunned_blocks(self, blocks : dict, cc : AbstractMetablocking): - self._average_weight = np.zeros(self._limit, dtype=float) self.blocks = blocks for entity in sorted(blocks.keys()): neighbors = blocks[entity] - _neighbors_weigth_sum : float = 0.0 for neighbor in neighbors: _current_neighbor_weigth = cc.get_precalculated_weight(entity, neighbor) - _neighbors_weigth_sum += _current_neighbor_weigth - self._sorted_neighbors[entity].put((-_current_neighbor_weigth, neighbor)) - - self._average_weight[entity] = _neighbors_weigth_sum / len(neighbors) if len(neighbors) else 0.0 - if(self.method == 'HB' and not self._sorted_neighbors[entity].empty()): - _top_entity_weight, _top_entity_neighbor = self._sorted_neighbors[entity].get() - self._to_emit_pairs.append((-_top_entity_weight, entity, _top_entity_neighbor)) + self._to_emit_pairs.append((_current_neighbor_weigth, entity, neighbor)) - def successful_emission(self, pair : tuple) -> bool: - """Attempts to emit given pair, returns True / False on Success / Fail - In the case of full emission, it always emits given pair - - Args: - pair (tuple): Tuple in the form (score, entity1, entity2) - Returns: - bool: Successful / Failed Emission - """ - _weigth, _entity, _neighbor = pair - - _budget = float('inf') if self._emit_all_tps_stop else self._budget - - if(self._emitted_comparisons < _budget): - self.pairs.append((_entity, _neighbor)) - self._emitted_comparisons += 1 - self._progress_bar.update(1) - return True - else: - self.execution_time = time() - self.start_time - self._progress_bar.close() - return False - - - def produce_pairs(self) -> List[Tuple[int, int]]: - """Emits the top pair for each entity in decreasing average weigth order. 
- Traverses the entities in decreasing average weigth order and emits its - pairs in decreasing weight order - - Returns: - List[Tuple[float, int, int]]: List of emitted pairs - """ - self._emitted_comparisons = 0 - checked_entity = np.zeros(self._limit, dtype=bool) - self.pairs = [] - - for pair in self._to_emit_pairs: - if(not self.successful_emission(pair)): - return self.pairs - - if(self.method == 'HB' or self.method == 'DFS'): - for entity in self._avg_weight_sorted_entities: - checked_entity[entity] = True - while(not self._sorted_neighbors[entity].empty()): - weight, neighbor = self._sorted_neighbors[entity].get() - pair = -weight, entity, neighbor - if(not checked_entity[neighbor]): - if(not self.successful_emission(pair)): - return self.pairs - else: - _available_emissions = True - while(_available_emissions): - _available_emissions = False - for entity in self._avg_weight_sorted_entities: - if(not self._sorted_neighbors[entity].empty()): - weight, neighbor = self._sorted_neighbors[entity].get() - pair = -weight, entity, neighbor - if canonical_swap(entity, neighbor) not in self._checked_pairs: - if(not self.successful_emission(pair)): return self.pairs - self._checked_pairs.add(canonical_swap(entity, neighbor)) - _available_emissions = True - - return self.pairs - - def process(self, blocks: dict, data: Data, tqdm_disable: bool = False, store_weigths : bool = True, cc: AbstractMetablocking = None, method : str = 'HB', emit_all_tps_stop : bool = False) -> None: + def process(self, blocks: dict, data: Data, tqdm_disable: bool = False, store_weigths : bool = True, cc: AbstractMetablocking = None, method : str = 'HB', emit_all_tps_stop : bool = False) -> List[Tuple[float, int, int]]: """Calculates the weights between entities, stores them in descending order of their average weight, stores the top comparison per entity @@ -1143,15 +1077,13 @@ def process(self, blocks: dict, data: Data, tqdm_disable: bool = False, store_we self._blocks: dict = blocks self._stored_weights : dict = defaultdict(float) self._to_emit_pairs = [] - self._sorted_neighbors = [PriorityQueue() for _ in range(self._limit)] - if(self.method == 'BFS'): self._checked_pairs = set() - + if(cc is None): self.process_raw_blocks(blocks) else: self.process_prunned_blocks(blocks, cc) - self._avg_weight_sorted_entities = sorted_enumerate(self._average_weight) + return self._to_emit_pairs def get_meta_blocking_approach(acronym: str, w_scheme: str, budget: int = 0) -> any: """Return method by acronym diff --git a/docs/pyjedai/datamodel.py b/docs/pyjedai/datamodel.py index 92d725e..debbb51 100644 --- a/docs/pyjedai/datamodel.py +++ b/docs/pyjedai/datamodel.py @@ -79,7 +79,6 @@ def __init__( id_column_name_2: str = None, dataset_name_2: str = None, ground_truth: DataFrame = None, - inorder_gt: bool = True ) -> None: # Original Datasets as pd.DataFrame if isinstance(dataset_1, pd.DataFrame): @@ -105,7 +104,6 @@ def __init__( self.entities: DataFrame # Datasets specs - self.inorder_gt = inorder_gt self.is_dirty_er = dataset_2 is None self.dataset_limit = self.num_of_entities_1 = len(dataset_1) self.num_of_entities_2: int = len(dataset_2) if dataset_2 is not None else 0 @@ -160,6 +158,8 @@ def __init__( self._gt_to_ids_reversed_1: dict self._ids_mapping_2: dict self._gt_to_ids_reversed_2: dict + else: + self.ground_truth = None self.entities = self.dataset_1 = self.dataset_1.astype(str) @@ -172,23 +172,48 @@ def __init__( self.entities = pd.concat([self.dataset_1, self.dataset_2], ignore_index=True) + self._create_gt_mapping() if 
ground_truth is not None: - self._create_gt_mapping() self._store_pairs() else: self.ground_truth = None + # def _store_pairs(self) -> None: + # """Creates a mapping: + # - pairs_of : ids of first dataset to ids of true matches from second dataset""" + + # self.pairs_of = defaultdict(set) + # d1_col_index, d2_col_index = (0, 1) if self.inorder_gt else (1,0) + + # for _, row in self.ground_truth.iterrows(): + # id1, id2 = (row[d1_col_index], row[d2_col_index]) + # if id1 in self.pairs_of: self.pairs_of[id1].append(id2) + # else: self.pairs_of[id1] = [id2] + + def _store_pairs(self) -> None: """Creates a mapping: - pairs_of : ids of first dataset to ids of true matches from second dataset""" - self.pairs_of = defaultdict(set) - d1_col_index, d2_col_index = (0, 1) if self.inorder_gt else (1,0) + self.duplicate_of = defaultdict(set) for _, row in self.ground_truth.iterrows(): - id1, id2 = (row[d1_col_index], row[d2_col_index]) - if id1 in self.pairs_of: self.pairs_of[id1].append(id2) - else: self.pairs_of[id1] = [id2] + id1, id2 = (row[0], row[1]) + if id1 in self.duplicate_of: self.duplicate_of[id1].add(id2) + else: self.duplicate_of[id1] = {id2} + + def _are_true_positives(self, id1 : int, id2 : int): + """Checks if the given pair of identifiers represents a duplicate. + Identifiers must be given in order: the first belongs to the first dataset and the second to the second dataset. + + Args: + id1 (int): Identifier from the first dataframe. + id2 (int): Identifier from the second dataframe. + + Returns: + bool: True if the pair is a true duplicate according to the ground truth + """ + return id1 in self.duplicate_of and id2 in self.duplicate_of[id1] def _create_gt_mapping(self) -> None: """Creates two mappings: @@ -198,8 +223,8 @@ def _create_gt_mapping(self) -> None: """ if self.ground_truth is not None: self.ground_truth = self.ground_truth.astype(str) - else: - return + # else: + # return self._ids_mapping_1 = dict( zip( @@ -312,7 +337,6 @@ def stats_about_data(self) -> None: return stats_df - class Block: """The main module used for storing entities in the blocking steps of pyjedai module. \ Consists of 2 sets of profile entities 1 for Dirty ER and 2 for Clean-Clean ER. 
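The datamodel.py hunk above replaces the order-sensitive pairs_of mapping with a duplicate_of index keyed by first-dataset IDs, which is what the new _are_true_positives check consults. A minimal standalone sketch of that idea follows, assuming a two-column ground-truth DataFrame (column 0 holds D1 IDs, column 1 holds D2 IDs); the names build_duplicate_of and are_true_positives are illustrative helpers and not part of pyJedAI.

from collections import defaultdict

import pandas as pd


def build_duplicate_of(ground_truth: pd.DataFrame) -> dict:
    # Map every first-dataset ID to the set of its true matches in the second dataset,
    # mirroring the duplicate_of structure built by the new _store_pairs above.
    duplicate_of = defaultdict(set)
    for _, row in ground_truth.iterrows():
        duplicate_of[row[0]].add(row[1])
    return duplicate_of


def are_true_positives(duplicate_of: dict, id1, id2) -> bool:
    # A candidate pair is a true positive iff id2 is registered as a duplicate of id1.
    return id1 in duplicate_of and id2 in duplicate_of[id1]


# Hypothetical usage with made-up identifiers:
gt = pd.DataFrame([("a1", "b7"), ("a2", "b3")])
dup = build_duplicate_of(gt)
assert are_true_positives(dup, "a1", "b7")
assert not are_true_positives(dup, "a2", "b7")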
@@ -361,3 +385,4 @@ def verbose(self, key: any, is_dirty_er: bool) -> None: print("Clean dataset 2: " + "[\033[1;34m" + str(len(self.entities_D2)) + \ " entities\033[0m]") print(self.entities_D2) + diff --git a/docs/pyjedai/evaluation.py b/docs/pyjedai/evaluation.py index 02a8a59..fd0f922 100644 --- a/docs/pyjedai/evaluation.py +++ b/docs/pyjedai/evaluation.py @@ -18,6 +18,7 @@ from .utils import canonical_swap from math import inf from .utils import PredictionData +from .utils import generate_unique_identifier import random import matplotlib.pyplot as plt @@ -196,12 +197,19 @@ def confusion_matrix(self): plt.ylabel("Real matching pairs", fontsize=10, fontweight='bold') plt.show() - def visualize_roc(method_names : List[str], methods_data : List[Tuple[str, float, List[float]]], proportional : bool =True) -> None: + def visualize_roc(self, methods_data : List[dict], proportional : bool =True, drop_tp_indices=True) -> None: fig, ax = plt.subplots(figsize=(10, 6)) # set the size of the plot colors = [] normalized_aucs = [] # for each method layout its plot - for method_name, normalized_auc, cumulative_recall in methods_data: + for method_data in methods_data: + cumulative_recall, normalized_auc = self._generate_auc_data(total_candidates=method_data['total_emissions'], tp_positions=method_data['tp_idx']) + if(drop_tp_indices): + del(method_data['tp_idx']) + method_name=method_data['name'] + method_data['auc'] = normalized_auc + method_data['recall'] = cumulative_recall[-1] if len(cumulative_recall) != 0 else 0.0 + x_values = range(len(cumulative_recall)) color = "#{:06x}".format(random.randint(0, 0xFFFFFF)) colors.append(color) @@ -262,14 +270,14 @@ def _till_full_tps_emission(self) -> bool: Returns: bool: Stop emission on all TPs found / Emit all pairs """ - return self._true_positive_checked is not None + return self._duplicate_emitted is not None def _all_tps_emitted(self) -> bool: """Checks if all TPs have been emitted (Defaults to False in the case of all pairs emission approach) Returns: bool: All TPs emitted / not emitted """ - if(self._till_full_tps_emission()): return self._tps_found >= len(self._true_positive_checked) + if(self._till_full_tps_emission()): return self._tps_found >= len(self._duplicate_emitted) else: False def _update_true_positive_entry(self, entity : int, candidate : int) -> None: @@ -280,75 +288,113 @@ def _update_true_positive_entry(self, entity : int, candidate : int) -> None: candidate (int): Candidate ID """ if(self._till_full_tps_emission()): - if(not self._true_positive_checked[canonical_swap(entity, candidate)]): - self._true_positive_checked[canonical_swap(entity, candidate)] = True + if(not self._duplicate_emitted[(entity, candidate)]): + self._duplicate_emitted[(entity, candidate)] = True self._tps_found += 1 return - def calculate_roc_auc_data(self, data: Data, pairs, batch_size : int = 1, true_positive_checked : dict = None) -> List[Tuple[int, int]]: - """Progressively calculates total recall, AUC for each batch of candidate pairs + def calculate_tps_indices(self, pairs : List[Tuple[float, int, int]], duplicate_of : dict = None, duplicate_emitted : dict = None, batch_size : int = 1) -> Tuple[List[int], int]: + """ Args: - data (Data): Data Module - pairs: List containing pairs in form (entity1 id, entity2 id, score) + pairs (List[float, int, int]): Candidate pairs to emit in the form [similarity, first dataframe entity ID, second dataframe entity ID] + duplicate_of (dict, optional): Dictionary of the form [entity ID] -> [IDs of duplicate entities]. 
Defaults to None. + duplicate_emitted (dict, optional): Dictionary of the form [true positive pair] -> [emission status: emitted/not]. Defaults to None. + batch_size (int, optional): Recall update emission rate. Defaults to 1. + Raises: - AttributeError: Ground Truth file hasn't been supplied, cannot calculate ROC AUC + AttributeError: No ground truth has been given Returns: - List[Tuple[int, int]]: List of ROC graph points information (recall up to e, normalized auc up to e) + Tuple[List[int], int]: Indices of true positive duplicates within the candidates list and the total emissions """ - if(true_positive_checked is not None): - for pair in true_positive_checked.keys(): - true_positive_checked[pair] = False + if(duplicate_emitted is not None): + for pair in duplicate_emitted.keys(): + duplicate_emitted[pair] = False - if(data.ground_truth is None): + if(duplicate_of is None): raise AttributeError("Can calculate ROC AUC without a ground-truth file. \ Data object mush have initialized with the ground-truth file") - - if(len(data.ground_truth) == 0): - raise AttributeError("Cannot calculate AUC score, number of true duplicates is equal to 0.") - _true_positives: int = 0 - _normalized_auc: int = 0 - _current_recall: int = 0 - _new_recall: int = 0 self._tps_found : int = 0 - self._true_positive_checked : dict = true_positive_checked - self.num_of_true_duplicates = len(data.ground_truth) - _recall_progress = [0] - + self._duplicate_emitted : dict = duplicate_emitted + self._tps_indices : List[int] = [] + batches = batch_pairs(pairs, batch_size) # ideal_auc = self.calculate_ideal_auc(len(pairs), self.num_of_true_duplicates) - self._total_emissions : int = 0 + self.total_emissions : int = 0 for batch in batches: - _current_batch_size : int = 0 - for entity, candidate in batch: - if(self._all_tps_emitted()): break - entity_id = data._gt_to_ids_reversed_1[entity] if entity < data.dataset_limit else data._gt_to_ids_reversed_2[entity] - candidate_id = data._gt_to_ids_reversed_1[candidate] if candidate < data.dataset_limit else data._gt_to_ids_reversed_2[candidate] - _d1_entity, _d2_entity = (entity_id, candidate_id) if entity < data.dataset_limit else (candidate_id, entity_id) - - if _d2_entity in self.data.pairs_of[_d1_entity]: - self._update_true_positive_entry(entity_id, candidate_id) - _true_positives += 1 - _current_batch_size += 1 - self._total_emissions += 1 - _new_recall = _true_positives / self.num_of_true_duplicates + for score, entity, candidate in batch: + if(self._all_tps_emitted()): break + if candidate in duplicate_of[entity]: + self._update_true_positive_entry(entity, candidate) + self._tps_indices.append(self.total_emissions) + + self.total_emissions += 1 # _normalized_auc += ((_new_recall + _current_recall) / 2) * (_current_batch_size / self.num_of_true_duplicates) - _current_recall = _new_recall - _recall_progress.append(_current_recall) if(self._all_tps_emitted()): break - # _normalized_auc = 0 if(ideal_auc == 0) else _normalized_auc / ideal_auc - _normalized_auc = sum(_recall_progress) / (len(pairs) + 1.0) - return _recall_progress, _normalized_auc + return self._tps_indices, self.total_emissions + + + def _generate_auc_data(self, total_candidates : int, tp_positions : List[int]) -> Tuple[List[float], float]: + """Generates the recall axis containing the recall value for each emission and calculates the normalized AUC + + Args: + total_candidates (int): Total number of pairs emitted + tp_positions (List[int]): Indices of true positives within the candidate pairs list + + 
Returns: + Tuple[List[float], float]: Recall axis and the normalized AUC + """ + + _recall_axis : List[float] = [] + _recall : float = 0.0 + _tp_index : int = 0 + _dataset_total_tps : int = len(self.data.ground_truth) + _total_found_tps : int = len(tp_positions) + + for recall_index in range(total_candidates): + if(_tp_index < _total_found_tps): + if(recall_index == tp_positions[_tp_index]): + _recall = (_tp_index + 1.0) / _dataset_total_tps + _tp_index += 1 + _recall_axis.append(_recall) + + _normalized_auc : float = sum(_recall_axis) / (total_candidates + 1.0) + + return _recall_axis, _normalized_auc + + + def visualize_results_roc(self, results : dict, drop_tp_indices=True) -> None: + """For each of the executed workflows, calculates the cumulative recall and normalized AUC based upon true positive indices. + Finally, displays the ROC for all of the workflows with proper annotation (each workflow gains a unique identifier). + Args: + results (dict): Nested dictionary of the form [dataset] -> [matcher] -> [executed workflows and their info] / [model] -> [executed -//-] + """ + + workflows_info : List[Tuple[dict]] = [] + + for dataset in results: + matchers = results[dataset] + for matcher in matchers: + matcher_info = matchers[matcher] + if(isinstance(matcher_info, list)): + for workflow_info in matcher_info: + workflows_info.append((workflow_info)) + else: + for model in matcher_info: + for workflow_info in matcher_info[model]: + workflows_info.append((workflow_info)) + + self.visualize_roc(workflows_info, drop_tp_indices=drop_tp_indices) - def evaluate_auc_roc(self, matchers_data : List[Tuple], batch_size : int = 1, proportional : bool = True) -> None: + + def evaluate_auc_roc(self, matchers : List, batch_size : int = 1, proportional : bool = True, drop_tp_indices=True) -> None: """For each matcher, takes its prediction data, calculates cumulative recall and auc, plots the corresponding ROC curve, populates prediction data with performance info Args: - matchers_data List[Tuple[str, ProgressiveMatching]]: Progressive Matchers and their names - data (Data) : Data Module + matchers List[ProgressiveMatching]: Progressive Matchers batch_size (int, optional): Emitted pairs step at which cumulative recall is recalculated. Defaults to 1. proportional (bool) : Proportional Visualization Raises: @@ -363,22 +409,18 @@ def evaluate_auc_roc(self, matchers_data : List[Tuple], batch_size : int = 1, pr raise AttributeError("Can not proceed to AUC ROC evaluation without a ground-truth file. 
" + "Data object has not been initialized with the ground-truth file") - self._matchers_auc_roc_data = [] - - for matcher_data in matchers_data: - - matcher_name, progressive_matcher = matcher_data - matcher_prediction_data : PredictionData = PredictionData(matcher_name, progressive_matcher.pairs, progressive_matcher.true_pair_checked) - - matcher_predictions = matcher_prediction_data.get_predictions() - matcher_tps_checked = matcher_prediction_data.get_tps_checked() + self.matchers_info = [] + + for matcher in matchers: + _tp_indices, _total_emissions = self.calculate_tps_indices(pairs=matcher.pairs, duplicate_of=matcher.duplicate_of, duplicate_emitted=matcher.duplicate_emitted) + matcher_info = {} + matcher_info['name'] = generate_unique_identifier() + matcher_info['total_emissions'] = _total_emissions + matcher_info['tp_idx'] = _tp_indices + matcher_info['time'] = matcher.execution_time - cumulative_recall, normalized_auc = self.calculate_roc_auc_data(self.data, matcher_predictions, batch_size, matcher_tps_checked) - - self._matchers_auc_roc_data.append((matcher_name, normalized_auc, cumulative_recall)) - matcher_prediction_data.set_total_emissions(self._total_emissions) - matcher_prediction_data.set_normalized_auc(normalized_auc) - matcher_prediction_data.set_cumulative_recall(cumulative_recall[-1]) - progressive_matcher.set_prediction_data(matcher_prediction_data) + matcher_prediction_data : PredictionData = PredictionData(matcher=matcher, matcher_info=matcher_info) + matcher.set_prediction_data(matcher_prediction_data) + self.matchers_info.append(matcher_info) - self.visualize_roc(methods_data = self._matchers_auc_roc_data, proportional = proportional) + self.visualize_roc(methods_data=self.matchers_info, drop_tp_indices=drop_tp_indices) diff --git a/docs/pyjedai/joins.py b/docs/pyjedai/joins.py index 196c7cc..23a9ade 100644 --- a/docs/pyjedai/joins.py +++ b/docs/pyjedai/joins.py @@ -14,6 +14,7 @@ from .datamodel import Data, PYJEDAIFeature from .evaluation import Evaluation +from .utils import FrequencyEvaluator class AbstractJoin(PYJEDAIFeature): """Abstract class of Joins module @@ -44,23 +45,94 @@ def __init__( self.attributes_2: list self._flags: np.array self.pairs: networkx.Graph + self.vectorizer = None + def vectorizer_based(self) -> bool: + """ + Checks whether current instance of Joins algorithm is using a frequency vectorizer + + Returns: + bool: Candidate scores are being calculated through frequency vectorizer + """ + return (self.vectorizer is not None) + + def dirty_indexing(self): + """Applies Dirty Indexing - Evaluates the similarity of all the entities of the target dataset + """ + eid = 0 + for entity in self.indexed_entities: + candidates = set() + for token in entity: + if token in self.entity_index: + current_candidates = self.entity_index[token] + for candidate_id in current_candidates: + if(not self.vectorizer_based()): + if self._flags[candidate_id] != eid: + self._counters[candidate_id] = 0 + self._flags[candidate_id] = eid + self._counters[candidate_id] += 1 + candidates.add(candidate_id) + self._process_candidates(candidates, eid, len(entity)) + self._progress_bar.update(1) + eid += 1 + + def get_id_from_index(self, index : int): + return (i if self.reverse_order else (index+self.data.dataset_limit)) + + + def clean_indexing(self): + """Applies Dirty Indexing - One of the datasets (depends on the order of indexing) is set as the indexer. + For each entry of that dataset, its similarity scores are being calculated with each entity of the target dataset. 
+ The top-K best results for each source entity are chosen. + """ + for i in range(0, self.indexed_entities_count): + candidates = set() + record = self.indexed_entities[i] + entity_id = self.get_id_from_index(i) + for token in record: + if token in self.entity_index: + current_candidates = self.entity_index[token] + for candidate_id in current_candidates: + if(not self.vectorizer_based()): + if self._flags[candidate_id] != entity_id: + self._counters[candidate_id] = 0 + self._flags[candidate_id] = entity_id + self._counters[candidate_id] += 1 + candidates.add(candidate_id) + if 0 < len(candidates): + self._process_candidates(candidates, entity_id, len(record)) + self._progress_bar.update(1) + + def setup_indexing(self): + """Defines the indexed and target entities, as well as their total count + + """ + self.indexed_entities, self.indexed_entities_count = (self._entities_d1, self.data.num_of_entities_1) if (self.reverse_order or self.data.is_dirty_er) \ + else (self._entities_d2, self.data.num_of_entities_2) + + self.target_entities, self.target_entities_count = (self._entities_d1, self.data.num_of_entities_1) if (not self.reverse_order or self.data.is_dirty_er) \ + else (self._entities_d2, self.data.num_of_entities_2) + def fit(self, data: Data, + vectorizer: FrequencyEvaluator = None, reverse_order: bool = False, attributes_1: list = None, attributes_2: list = None, - tqdm_disable: bool = False + tqdm_disable: bool = False, + store_neighborhoods : bool = False ) -> networkx.Graph: """Joins main method Args: data (Data): dataset module + vectorizer (FrequencyEvaluator, optional): Vectorizer will be used for similarity evaluation reverse_order (bool, optional): _description_. Defaults to False. attributes_1 (list, optional): _description_. Defaults to None. attributes_2 (list, optional): _description_. Defaults to None. tqdm_disable (bool, optional): _description_. Defaults to False. 
- + save_to_json (bool, optional): Store indexed dataset neighborhoods in a dictionary of form + [indexed dataset entity id] -> [ids of top-k neighbors in target dataset] Returns: networkx.Graph: graph containg nodes as entities and edges as similarity score """ @@ -68,8 +140,8 @@ def fit(self, raise ValueError("Can't have reverse order in Dirty Entity Resolution") start_time = time() - self.tqdm_disable, self.reverse_order, self.attributes_1, self.attributes_2, self.data = \ - tqdm_disable, reverse_order, attributes_1, attributes_2, data + self.tqdm_disable, self.reverse_order, self.attributes_1, self.attributes_2, self.data, self.vectorizer, self.store_neighborhoods = \ + tqdm_disable, reverse_order, attributes_1, attributes_2, data, vectorizer, store_neighborhoods self._entities_d1 = data.dataset_1[attributes_1 if attributes_1 else data.attributes_1] \ .apply(" ".join, axis=1) \ @@ -82,74 +154,36 @@ def fit(self, .apply(self._tokenize_entity) \ .values.tolist() - num_of_entities = self.data.num_of_entities_2 if reverse_order else self.data.num_of_entities_1 - + self.neighborhoods = defaultdict(list) if self.store_neighborhoods else None + self.setup_indexing() + self._progress_bar = tqdm( - total=self.data.num_of_entities if not self.data.is_dirty_er else num_of_entities*2, + total=self.indexed_entities_count, desc=self._method_name+" ("+self.metric+")", disable=self.tqdm_disable ) - - self._flags, \ - self._counters, \ - self._sims, \ - self._source_frequency, \ - self.pairs = np.empty([num_of_entities]), \ - np.zeros([num_of_entities]), \ - np.empty([self.data.num_of_entities_1*self.data.num_of_entities_2]), \ - np.empty([num_of_entities]), \ - networkx.Graph() - self._flags[:] = -1 - entity_index = self._create_entity_index( - self._entities_d2 if reverse_order else self._entities_d1 - ) + + self._flags = np.empty([self.target_entities_count]) if (not self.vectorizer_based()) else None + self._counters = np.zeros([self.target_entities_count]) if (not self.vectorizer_based()) else None + self._source_frequency = np.empty([self.target_entities_count]) if (not self.vectorizer_based()) else None + if(not self.vectorizer_based()) : self._flags[:] = -1 + self.pairs = networkx.Graph() + self.entity_index = self._create_entity_index() if self.data.is_dirty_er: - eid = 0 - for entity in self._entities_d1: - candidates = set() - for token in entity: - if token in entity_index: - current_candidates = entity_index[token] - for candidate_id in current_candidates: - if self._flags[candidate_id] != eid: - self._counters[candidate_id] = 0 - self._flags[candidate_id] = eid - self._counters[candidate_id] += 1 - candidates.add(candidate_id) - self._process_candidates(candidates, eid, len(entity)) - self._progress_bar.update(1) - eid += 1 + self.dirty_indexing() else: - if reverse_order: - entities = self._entities_d1 - num_of_entities = self.data.num_of_entities_1 - else: - entities = self._entities_d2 - num_of_entities = self.data.num_of_entities_2 - - for i in range(0, num_of_entities): - candidates = set() - record = entities[i] - entity_id = i if reverse_order else i+self.data.dataset_limit - for token in record: - if token in entity_index: - current_candidates = entity_index[token] - for candidate_id in current_candidates: - if self._flags[candidate_id] != entity_id: - self._counters[candidate_id] = 0 - self._flags[candidate_id] = entity_id - self._counters[candidate_id] += 1 - candidates.add(candidate_id) - if 0 < len(candidates): - self._process_candidates(candidates, entity_id, len(record)) - 
self._progress_bar.update(1) + self.clean_indexing() + + if(self.store_neighborhoods): self._process_neighborhoods() + self._progress_bar.close() self.execution_time = time() - start_time - return self.pairs def _tokenize_entity(self, entity: str) -> set: - if self.tokenization == 'qgrams': + if self.vectorizer is not None: + return entity.lower() + elif self.tokenization == 'qgrams': return set([' '.join(grams) for grams in nltk.ngrams(entity.lower(), n=self.qgrams)]) elif self.tokenization == 'standard': return set(filter(None, re.split('[\\W_]', entity.lower()))) @@ -194,47 +228,46 @@ def _calc_similarity( return 2 * common_tokens / (source_frequency+tokens_size) elif self.metric == 'jaccard': return common_tokens / (source_frequency+tokens_size-common_tokens) + + def _calc_vector_similarity(self, id1 : int, id2 : int) -> float: + """Vector based similarity score + + Args: + id1 (int): D1 entity ID + id2 (int): D2 entity ID - def _create_entity_index(self, entities: list) -> dict: + Returns: + float: vector based similarity + """ + return self.vectorizer.predict(id1=id1, id2=id2) + + def _create_entity_index(self) -> dict: entity_index = defaultdict(set) - entity_id = itertools.count() - for entity in entities: - eid = next(entity_id) + for eid, entity in enumerate(self.target_entities): for token in entity: entity_index[token].add(eid) - self._source_frequency[eid] = len(entity) + + if(not self.vectorizer_based()): + self._source_frequency[eid] = len(entity) self._progress_bar.update(1) - return entity_index - -# def _similarity(self, entity_id1: int, entity_id2: int, attributes: any=None) -> float: -# similarity: float = 0.0 -# if isinstance(attributes, dict): -# for attribute, weight in self.attributes.items(): -# similarity += weight*self._metric( -# self.data.entities.iloc[entity_id1][attribute], -# self.data.entities.iloc[entity_id2][attribute] -# ) -# if isinstance(attributes, list): -# for attribute in self.attributes: -# similarity += self._metric( -# self.data.entities.iloc[entity_id1][attribute], -# self.data.entities.iloc[entity_id2][attribute] -# ) -# similarity /= len(self.attributes) -# else: -# # print(self.data.entities.iloc[entity_id1].str.cat(sep=' '), -# # self.data.entities.iloc[entity_id2].str.cat(sep=' ')) -# # concatenated row string -# similarity = self._metric( -# self.data.entities.iloc[entity_id1].str.cat(sep=' '), -# self.data.entities.iloc[entity_id2].str.cat(sep=' ') -# ) -# return similarity + return entity_index def _insert_to_graph(self, entity_id1, entity_id2, similarity): if self.similarity_threshold <= similarity: self.pairs.add_edge(entity_id1, entity_id2, weight=similarity) + + def _store_neighborhood(self, entity_id1, entity_id2, similarity): + if self.similarity_threshold <= similarity: + self.neighborhoods[entity_id2].append((similarity, entity_id1)) + + def _process_neighborhoods(self): + """Sorts the candidates of each indexed entity's neighborhood in descending order + of similarity. 
+ """ + for d1_id, d2_ids in self.neighborhoods.items(): + self.neighborhoods[d1_id] = sorted(d2_ids, key=lambda x: (-x[0], x[1])) + def evaluate(self, prediction=None, export_to_df: bool = False, export_to_dict: bool = False, with_classification_report: bool = False, @@ -296,9 +329,6 @@ def export_to_df(self, prediction) -> pd.DataFrame: pairs_df = pd.concat([pairs_df, pd.DataFrame([{'id1':id1, 'id2':id2}], index=[0])], ignore_index=True) return pairs_df - - - class EJoin(AbstractJoin): """ @@ -319,17 +349,17 @@ def __init__( def _process_candidates(self, candidates: set, entity_id: int, tokens_size: int) -> None: for candidate_id in candidates: - self._insert_to_graph( - candidate_id+self.data.dataset_limit if self.reverse_order \ - and not self.data.is_dirty_er \ - else candidate_id, - entity_id, - self._calc_similarity( - self._counters[candidate_id], - self._source_frequency[candidate_id], - tokens_size + sim = self._calc_similarity( + self._counters[candidate_id], + self._source_frequency[candidate_id], + tokens_size ) - ) + d1_id = candidate_id+self.data.dataset_limit if (self.reverse_order \ + and not self.data.is_dirty_er) \ + else candidate_id + d2_id = entity_id + self._insert_to_graph(d1_id, d2_id, sim) + if(self.store_neighborhoods): self._store_neighborhood(d1_id, d2_id, sim) class TopKJoin(AbstractJoin): """Top-K Join algorithm @@ -351,27 +381,149 @@ def __init__(self, def _process_candidates(self, candidates: set, entity_id: int, tokens_size: int) -> None: minimum_weight=0 pq = PriorityQueue() - for candidate_id in candidates: - sim = self._calc_similarity( - self._counters[candidate_id], self._source_frequency[candidate_id], tokens_size - ) + pq.put(minimum_weight) + for index, candidate_id in enumerate(candidates): + if(self.vectorizer is None): + sim = self._calc_similarity(self._counters[candidate_id], self._source_frequency[candidate_id], tokens_size) + else: + sim = self._calc_vector_similarity(((candidate_id + self.data.dataset_limit) if self.reverse_order else candidate_id), entity_id) if minimum_weight < sim: pq.put(sim) if self.K < pq.qsize(): minimum_weight = pq.get() minimum_weight = pq.get() - for candidate_id in candidates: + for index, candidate_id in enumerate(candidates): self.similarity_threshold = minimum_weight + if(self.vectorizer is None): + sim = self._calc_similarity(self._counters[candidate_id], self._source_frequency[candidate_id], tokens_size) + else: + sim = self._calc_vector_similarity(((candidate_id + self.data.dataset_limit) if self.reverse_order else candidate_id), entity_id) self._insert_to_graph( candidate_id + self.data.dataset_limit if self.reverse_order else candidate_id, entity_id, - self._calc_similarity( - self._counters[candidate_id], - self._source_frequency[candidate_id], - tokens_size - ) + sim ) + if(self.store_neighborhoods): self._store_neighborhood(candidate_id + self.data.dataset_limit if self.reverse_order else candidate_id, \ + entity_id, \ + sim) + + def _configuration(self) -> dict: + return { + "similarity_threshold" : self.similarity_threshold, + "K" : self.K, + "metric" : self.metric, + "tokenization" : self.tokenization, + "qgrams": self.qgrams + } + + +class PETopKJoin(TopKJoin): + """Progressive Entity Resolution Top-K class of Joins module + """ + _method_name = "Progressive Top-K Join" + _method_info = "Progressive Top-K Join algorithm" + _method_short_name = "PETopKJ" + + def __init__( + self, + K: int, + metric: str, + tokenization: str, + qgrams: int = 2 + ) -> None: + """AbstractJoin Constructor + + Args: + K 
(int): Number of candidates per entity + metric (str): String similarity metric + tokenization (str): Tokenizer + qgrams (int, optional): For Jaccard metric. Defaults to 2. + """ + super().__init__(K=K, + metric=metric, + tokenization=tokenization, + qgrams=qgrams) + + + def _get_similarity(self, target_id : int, indexed_id : int, tokens_size : int): + return self._calc_similarity(self._counters[target_id], self._source_frequency[target_id], tokens_size) \ + if (self.vectorizer is None) else \ + self._calc_vector_similarity(target_id , indexed_id) + + def _process_candidates(self, candidates: set, entity_id: int, tokens_size: int) -> None: + minimum_weight=0 + pq = PriorityQueue() + for index, candidate_id in enumerate(candidates): + + _target_id = candidate_id + _indexed_id = entity_id + self.data.dataset_limit + + sim : float = self._get_similarity(target_id=_target_id, + indexed_id=_indexed_id, + tokens_size=tokens_size) + + # target dataset entity id set to negative + # so higher identifier kicked out first (simulating descending order with ascending PQ) + _pair = (sim, -_target_id, _indexed_id) + + if minimum_weight <= sim: + pq.put(_pair) + if self.K < pq.qsize(): + minimum_weight, _, _ = pq.get() + + if(self.store_neighborhoods): + _first_element = True + while(not pq.empty()): + _sim, _target_id, _indexed_id = pq.get() + if _first_element: + self.similarity_threshold = _sim + _first_element = False + + self._store_neighborhood(entity_id1= -_target_id, + entity_id2= _indexed_id, + similarity= _sim) + self._insert_to_graph(entity_id1=-_target_id, + entity_id2=_indexed_id, + similarity=_sim) + else: + self.similarity_threshold, _, _ = pq.get() + for index, candidate_id in enumerate(candidates): + _target_id = candidate_id + _indexed_id = entity_id + self.data.dataset_limit + self._insert_to_graph(entity_id1=_target_id, + entity_id2=_indexed_id, + similarity=self._get_similarity(target_id=_target_id, + indexed_id=_indexed_id, + tokens_size=tokens_size)) + + def _process_neighborhoods(self, strict_top_k : bool = True): + """Sorts the candidates of each indexed entity's neighborhood in descending order + of similarity. If strict top-K instance is chosen, it retains max K best candidates + per entity. 
+ Args: + strict_top_k (bool, optional): Retain strictly (max) top-K candidates per entity + """ + for d1_id, d2_ids in self.neighborhoods.items(): + _sorted_neighborhood = sorted(d2_ids, key=lambda x: (-x[0], x[1])) + self.neighborhoods[d1_id] = _sorted_neighborhood[:self.K] if strict_top_k else \ + _sorted_neighborhood + + def setup_indexing(self): + """Defines the indexed and target entities, as well as their total count + + """ + # self.indexed_entities, self.indexed_entities_count = (self._entities_d2, self.data.num_of_entities_2) if (self.reverse_order) \ + # else (self._entities_d1, self.data.num_of_entities_1) + + # self.target_entities, self.target_entities_count = (self._entities_d1, self.data.num_of_entities_1) if (self.reverse_order or self.data.is_dirty_er) \ + # else (self._entities_d2, self.data.num_of_entities_2) + self.indexed_entities, self.indexed_entities_count = (self._entities_d2, self.data.num_of_entities_2) + + self.target_entities, self.target_entities_count = (self._entities_d1, self.data.num_of_entities_1) + + def get_id_from_index(self, index : int): + return index def _configuration(self) -> dict: return { @@ -381,3 +533,12 @@ def _configuration(self) -> dict: "tokenization" : self.tokenization, "qgrams": self.qgrams } + + + + + + + + + diff --git a/docs/pyjedai/matching.py b/docs/pyjedai/matching.py index 00e6951..6880ab4 100644 --- a/docs/pyjedai/matching.py +++ b/docs/pyjedai/matching.py @@ -19,13 +19,11 @@ from py_stringmatching.tokenizer.qgram_tokenizer import QgramTokenizer from py_stringmatching.tokenizer.whitespace_tokenizer import \ WhitespaceTokenizer -from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer -from sklearn.metrics.pairwise import pairwise_distances from tqdm.autonotebook import tqdm from .datamodel import Data, PYJEDAIFeature from .evaluation import Evaluation -from .utils import WordQgramTokenizer, cosine, get_qgram_from_tokenizer_name +from .utils import WordQgramTokenizer, cosine, get_qgram_from_tokenizer_name, FrequencyEvaluator metrics_mapping = { @@ -51,36 +49,44 @@ ] vector_metrics = [ - 'cosine', 'dice', 'jaccard' + 'cosine', 'dice', 'jaccard', 'sqeuclidean' ] whoosh_index_metrics = [ 'TF-IDF', 'Frequency', 'PL2', 'BM25F' ] +faiss_metrics = [ + 'cosine', 'euclidean' +] + magellan_metrics = string_metrics + set_metrics -available_metrics = magellan_metrics + vector_metrics + whoosh_index_metrics +available_metrics = magellan_metrics + vector_metrics + whoosh_index_metrics + faiss_metrics # # Tokenizers # -char_qgram_tokenizers = { 'char_'+ str(i) + 'gram':i for i in range(1, 7) } -word_qgram_tokenizers = { 'word_'+ str(i) + 'gram':i for i in range(1, 7) } +# char_qgram_tokenizers = { 'char_'+ str(i) + 'gram':i for i in range(1, 7) } +# word_qgram_tokenizers = { 'word_'+ str(i) + 'gram':i for i in range(1, 7) } +char_qgram_tokenizers = ['char_tokenizer'] +word_qgram_tokenizers = ['word_tokenizer'] magellan_tokenizers = ['white_space_tokenizer'] +joins_tokenizers = ["qgrams", "standard", "standard_multiset", "qgrams_multiset"] -tfidf_tokenizers = [ 'tfidf_' + cq for cq in char_qgram_tokenizers.keys() ] + \ - [ 'tfidf_' + wq for wq in word_qgram_tokenizers.keys() ] +# tfidf_tokenizers = [ 'tfidf_' + cq for cq in char_qgram_tokenizers.keys() ] + \ +# [ 'tfidf_' + wq for wq in word_qgram_tokenizers.keys() ] -tf_tokenizers = [ 'tf_' + cq for cq in char_qgram_tokenizers.keys() ] + \ - [ 'tf_' + wq for wq in word_qgram_tokenizers.keys() ] +# tf_tokenizers = [ 'tf_' + cq for cq in char_qgram_tokenizers.keys() ] + \ +# 
[ 'tf_' + wq for wq in word_qgram_tokenizers.keys() ] -boolean_tokenizers = [ 'boolean_' + cq for cq in char_qgram_tokenizers.keys() ] + \ - [ 'boolean_' + wq for wq in word_qgram_tokenizers.keys() ] - -vector_tokenizers = tfidf_tokenizers + tf_tokenizers + boolean_tokenizers +# boolean_tokenizers = [ 'boolean_' + cq for cq in char_qgram_tokenizers.keys() ] + \ +# [ 'boolean_' + wq for wq in word_qgram_tokenizers.keys() ] -available_tokenizers = [key for key in char_qgram_tokenizers] + [key for key in word_qgram_tokenizers] + magellan_tokenizers + vector_tokenizers +# vector_tokenizers = tfidf_tokenizers + tf_tokenizers + boolean_tokenizers +# available_tokenizers = [key for key in char_qgram_tokenizers] + [key for key in word_qgram_tokenizers] + magellan_tokenizers + vector_tokenizers +available_tokenizers = char_qgram_tokenizers + word_qgram_tokenizers + magellan_tokenizers + joins_tokenizers +available_vectorizers = ['tfidf', 'tf', 'boolean'] class AbstractEntityMatching(PYJEDAIFeature): """Calculates similarity from 0.0 to 1.0 @@ -337,6 +343,8 @@ def __init__( self, metric: str = 'dice', tokenizer: str = 'white_space_tokenizer', + vectorizer : str = None, + qgram : int = 1, similarity_threshold: float = 0.5, tokenizer_return_unique_values = False, # unique values or not, attributes: any = None, @@ -348,7 +356,7 @@ def __init__( self.similarity_threshold = similarity_threshold self.tokenizer = tokenizer self.execution_time = 0 - self._input_type = None + self.vectorizer = vectorizer self.qgram: int = -1 # # Selecting tokenizer @@ -362,36 +370,30 @@ def __init__( else: self._metric = metric - if metric in set_metrics: - self.tokenizer_return_set = True - else: - self.tokenizer_return_set = tokenizer_return_unique_values - - if 'gram' in tokenizer: - self.qgram = get_qgram_from_tokenizer_name(tokenizer) + self.tokenizer_return_set = (metric in set_metrics) or tokenizer_return_unique_values + self.qgram : int = qgram - if tokenizer == 'white_space_tokenizer': - self._input_type = 'white_space' - self._tokenizer = WhitespaceTokenizer(return_set=self.tokenizer_return_set) - elif tokenizer in char_qgram_tokenizers.keys(): - self._input_type = 'char_qgram' - self._tokenizer = QgramTokenizer(qval=self.qgram, - return_set=self.tokenizer_return_set) - elif tokenizer in word_qgram_tokenizers.keys(): - self._input_type = 'word_qgram' - self._tokenizer = WordQgramTokenizer(q=self.qgram) - elif 'tfidf' in tokenizer: - self._input_type = 'tfidf' - elif 'tf' in tokenizer: - self._input_type = 'tf' - elif 'boolean' in tokenizer: - self._input_type = 'boolean' - else: - raise AttributeError( - 'Tokenizer ({}) does not exist. Please select one of the available. ({})'.format( - tokenizer, available_tokenizers + if(vectorizer is not None): + if self.vectorizer not in available_vectorizers: + raise AttributeError( + 'Weighting Scheme ({}) does not exist. Please select one of the available. ({})'.format( + vectorizer, available_vectorizers + ) + ) + elif(tokenizer is not None): + if tokenizer == 'white_space_tokenizer': + self._tokenizer = WhitespaceTokenizer(return_set=self.tokenizer_return_set) + elif tokenizer == 'char_tokenizer': + self._tokenizer = QgramTokenizer(qval=self.qgram, + return_set=self.tokenizer_return_set) + elif tokenizer == 'word_tokenizer': + self._tokenizer = WordQgramTokenizer(q=self.qgram) + elif tokenizer not in available_tokenizers: + raise AttributeError( + 'Tokenizer ({}) does not exist. Please select one of the available. 
({})'.format( + tokenizer, available_tokenizers + ) ) - ) def predict(self, blocks: dict, @@ -420,8 +422,8 @@ def predict(self, desc=self._method_name+" ("+self.metric+ ", " + str(self.tokenizer) + ")", disable=self.tqdm_disable) - if self._input_type in ['tfidf', 'tf', 'boolean']: - self._calculate_tf_tfidf() + if self.vectorizer is not None: + self.initialize_vectorizer() if 'Block' in str(type(all_blocks[0])): self._predict_raw_blocks(blocks) @@ -459,52 +461,40 @@ def _predict_raw_blocks(self, blocks: dict) -> None: self._insert_to_graph(entity_id1, entity_id2, similarity) self._progress_bar.update(1) - def _calculate_tf_tfidf(self) -> None: - - analyzer = 'char' if 'char' in self.tokenizer else 'word' - + def initialize_vectorizer(self) -> None: + self.frequency_evaluator : FrequencyEvaluator = FrequencyEvaluator(vectorizer=self.vectorizer, + tokenizer=self.tokenizer, + qgram=self.qgram) d1 = self.data.dataset_1[self.attributes] if self.attributes else self.data.dataset_1 self._entities_d1 = d1 \ .apply(" ".join, axis=1) \ .apply(lambda x: x.lower()) \ .values.tolist() - d2 = self.data.dataset_2[self.attributes] if self.attributes and not self.data.is_dirty_er else self.data.dataset_2 + d2 = None + if(not self.data.is_dirty_er): + d2 = self.data.dataset_2 + if self.attributes: + d2 = d2[self.attributes] + self._entities_d2 = d2 \ .apply(" ".join, axis=1) \ .apply(lambda x: x.lower()) \ - .values.tolist() if not self.data.is_dirty_er else None - - if self._input_type == 'tfidf' or self._input_type == 'boolean': - vectorizer = TfidfVectorizer(analyzer='') if self.qgram is None else \ - TfidfVectorizer(analyzer=analyzer, ngram_range=(self.qgram, self.qgram)) - elif self._input_type == 'tf': - vectorizer = CountVectorizer(analyzer=analyzer) if self.qgram is None else \ - CountVectorizer(analyzer=analyzer, ngram_range=(self.qgram, self.qgram)) + .values.tolist() if not self.data.is_dirty_er else self._entities_d1 - self._calculate_tf_and_tfidf_similarities(vectorizer) - - def _calculate_tf_and_tfidf_similarities(self, vectorizer) -> None: - if self.data.is_dirty_er: - raise NotImplementedError("TFIDF for dirty ER is not implemented yet") - else: - self.corpus = self._entities_d1 + self._entities_d2 - self.corpus_as_matrix = vectorizer.fit_transform(self.corpus) - if self._input_type == 'boolean': - # transform to boolean if value is positive to 1 and negative to 0 - self.similarity_matrix = self.corpus_as_matrix.astype(bool).astype(int) - - self.similarity_matrix = 1 - pairwise_distances(self.corpus_as_matrix.toarray(), - metric=self.metric) - - def _calculate_vector_similarity(self, entity_id1: int, entity_id2: int) -> float: - return self.similarity_matrix[entity_id1][entity_id2] + + _dataset_identifier : str = ('_'.join([self.data.dataset_name_1, self.data.dataset_name_2])) if(self.data.dataset_name_1 is not None and self.data.dataset_name_2 is not None) else ("dataset") + self.frequency_evaluator.fit(metric=self.metric, + dataset_identifier=_dataset_identifier, + indexing='inorder', + d1_entities=self._entities_d1, + d2_entities=self._entities_d2) def _similarity(self, entity_id1: int, entity_id2: int) -> float: similarity: float = 0.0 - if self._input_type in ['tfidf', 'tf', 'boolean']: - return self._calculate_vector_similarity(entity_id1, entity_id2) + if self.vectorizer is not None: + return self.frequency_evaluator.predict(id1=entity_id1, id2=entity_id2) elif isinstance(self.attributes, dict): for attribute, weight in self.attributes.items(): e1 = 
self.data.entities.iloc[entity_id1][attribute].lower() @@ -537,7 +527,9 @@ def _configuration(self) -> dict: "Metric" : self.metric, "Attributes" : self.attributes, "Similarity threshold" : self.similarity_threshold, - "Tokenizer" : self.tokenizer + "Tokenizer" : self.tokenizer, + "Vectorizer" : self.vectorizer if self.vectorizer is not None else "None", + "Qgrams" : self.qgram } class VectorBasedMatching(AbstractEntityMatching): diff --git a/docs/pyjedai/prioritization.py b/docs/pyjedai/prioritization.py index 4ce6f62..db5bc63 100644 --- a/docs/pyjedai/prioritization.py +++ b/docs/pyjedai/prioritization.py @@ -11,132 +11,115 @@ GlobalProgressiveSortedNeighborhood, LocalProgressiveSortedNeighborhood, ProgressiveEntityScheduling) +from .joins import PETopKJoin from .vector_based_blocking import EmbeddingsNNBlockBuilding -from sklearn.metrics.pairwise import ( - cosine_similarity -) + from networkx import Graph -from py_stringmatching.similarity_measure.affine import Affine -from py_stringmatching.similarity_measure.bag_distance import BagDistance from py_stringmatching.similarity_measure.cosine import Cosine from py_stringmatching.similarity_measure.dice import Dice -from py_stringmatching.similarity_measure.editex import Editex from py_stringmatching.similarity_measure.generalized_jaccard import \ GeneralizedJaccard -from py_stringmatching.similarity_measure.hamming_distance import \ - HammingDistance from py_stringmatching.similarity_measure.jaccard import Jaccard from py_stringmatching.similarity_measure.jaro import Jaro -from py_stringmatching.similarity_measure.jaro_winkler import JaroWinkler from py_stringmatching.similarity_measure.levenshtein import Levenshtein -from py_stringmatching.similarity_measure.monge_elkan import MongeElkan -from py_stringmatching.similarity_measure.needleman_wunsch import \ - NeedlemanWunsch from py_stringmatching.similarity_measure.overlap_coefficient import \ OverlapCoefficient -from py_stringmatching.similarity_measure.partial_ratio import PartialRatio -from py_stringmatching.similarity_measure.token_sort import TokenSort -from py_stringmatching.similarity_measure.partial_token_sort import \ - PartialTokenSort -from py_stringmatching.similarity_measure.ratio import Ratio -from py_stringmatching.similarity_measure.smith_waterman import SmithWaterman -from py_stringmatching.similarity_measure.soundex import Soundex -from py_stringmatching.similarity_measure.tfidf import TfIdf -from py_stringmatching.similarity_measure.tversky_index import TverskyIndex -from py_stringmatching.tokenizer.alphabetic_tokenizer import \ - AlphabeticTokenizer -from py_stringmatching.tokenizer.alphanumeric_tokenizer import \ - AlphanumericTokenizer -from py_stringmatching.tokenizer.delimiter_tokenizer import DelimiterTokenizer from py_stringmatching.tokenizer.qgram_tokenizer import QgramTokenizer from py_stringmatching.tokenizer.whitespace_tokenizer import \ WhitespaceTokenizer +from sklearn.metrics.pairwise import pairwise_distances from tqdm.autonotebook import tqdm -from .evaluation import Evaluation from .datamodel import Data, PYJEDAIFeature +from .evaluation import Evaluation from .matching import EntityMatching from .comparison_cleaning import AbstractMetablocking from queue import PriorityQueue from random import sample -from .utils import sorted_enumerate, canonical_swap from abc import abstractmethod from typing import Tuple, List -from .utils import SubsetIndexer, WhooshDataset, WhooshNeighborhood, is_infinite, PredictionData +from .utils import ( + SubsetIndexer, + 
DatasetScheduler, + EntityScheduler, + is_infinite, + PredictionData, + reverse_data_indexing, + reverse_blocks_entity_indexing, + sorted_enumerate, + canonical_swap, + WordQgramTokenizer, + cosine, + get_qgram_from_tokenizer_name, + FrequencyEvaluator) import pandas as pd import os -from whoosh.fields import TEXT, Schema, ID -from whoosh.index import create_in -from whoosh import qparser -from whoosh.scoring import TF_IDF, Frequency, PL2, BM25F +from collections import defaultdict +import sys +from faiss import METRIC_INNER_PRODUCT, METRIC_L2 +import json +import re # Directory where the whoosh index is stored INDEXER_DIR='.indexer' -# Package import from https://anhaidgroup.github.io/py_stringmatching/v0.4.2/index.html - -available_tokenizers = [ - 'white_space_tokenizer', 'qgram_tokenizer', 'delimiter_tokenizer', - 'alphabetic_tokenizer', 'alphanumeric_tokenizer' -] - metrics_mapping = { - 'levenshtein' : Levenshtein(), 'edit_distance': Levenshtein(), - 'jaro_winkler' : JaroWinkler(), - 'bag_distance' : BagDistance(), - 'editex' : Editex(), 'cosine' : Cosine(), 'jaro' : Jaro(), - 'soundex' : Soundex(), - 'tfidf' : TfIdf(), - 'tversky_index':TverskyIndex(), - 'ratio' : Ratio(), - 'partial_token_sort' : PartialTokenSort(), - 'partial_ratio' : PartialRatio(), - 'hamming_distance' : HammingDistance(), 'jaccard' : Jaccard(), 'generalized_jaccard' : GeneralizedJaccard(), 'dice': Dice(), 'overlap_coefficient' : OverlapCoefficient(), - 'token_sort': TokenSort(), - 'cosine_vector_similarity': cosine_similarity, - 'TF-IDF' : TF_IDF(), - 'Frequency' : Frequency(), - 'PL2' : PL2(), - 'BM25F' : BM25F() } -whoosh_similarity_function = { - 'TF-IDF' : TF_IDF(), - 'Frequency' : Frequency(), - 'PL2' : PL2(), - 'BM25F' : BM25F() +vector_metrics_mapping = { + 'cosine': cosine } string_metrics = [ - 'bag_distance', 'editex', 'hamming_distance', 'jaro', 'jaro_winkler', 'levenshtein', - 'edit_distance', 'partial_ratio', 'partial_token_sort', 'ratio', 'soundex', 'token_sort' + 'jaro', 'edit_distance' ] set_metrics = [ - 'cosine', 'dice', 'generalized_jaccard', 'jaccard', 'overlap_coefficient', 'tversky_index' + 'cosine', 'dice', 'generalized_jaccard', 'jaccard', 'overlap_coefficient' ] -bag_metrics = [ - 'tfidf' +vector_metrics = [ + 'cosine', 'dice', 'jaccard' ] -index_metrics = [ +whoosh_index_metrics = [ 'TF-IDF', 'Frequency', 'PL2', 'BM25F' ] -vector_metrics = [ - 'cosine_vector_similarity' +faiss_metrics = [ + 'cosine', 'euclidean' ] -available_metrics = string_metrics + set_metrics + bag_metrics + vector_metrics + index_metrics +magellan_metrics = string_metrics + set_metrics +available_metrics = magellan_metrics + vector_metrics + whoosh_index_metrics + faiss_metrics + +# +# Tokenizers +# +char_qgram_tokenizers = { 'char_'+ str(i) + 'gram':i for i in range(1, 7) } +word_qgram_tokenizers = { 'word_'+ str(i) + 'gram':i for i in range(1, 7) } +magellan_tokenizers = ['white_space_tokenizer'] + +tfidf_tokenizers = [ 'tfidf_' + cq for cq in char_qgram_tokenizers.keys() ] + \ + [ 'tfidf_' + wq for wq in word_qgram_tokenizers.keys() ] + +tf_tokenizers = [ 'tf_' + cq for cq in char_qgram_tokenizers.keys() ] + \ + [ 'tf_' + wq for wq in word_qgram_tokenizers.keys() ] + +boolean_tokenizers = [ 'boolean_' + cq for cq in char_qgram_tokenizers.keys() ] + \ + [ 'boolean_' + wq for wq in word_qgram_tokenizers.keys() ] + +vector_tokenizers = tfidf_tokenizers + tf_tokenizers + boolean_tokenizers + +available_tokenizers = [key for key in char_qgram_tokenizers] + [key for key in word_qgram_tokenizers] + magellan_tokenizers 
+ vector_tokenizers class ProgressiveMatching(EntityMatching): """Applies the matching process to a subset of available pairs progressively @@ -144,34 +127,39 @@ class ProgressiveMatching(EntityMatching): _method_name: str = "Progressive Matching" _method_info: str = "Applies the matching process to a subset of available pairs progressively " - def __init__( self, - budget: int = 0, - metric: str = 'dice', + similarity_function: str = 'dice', tokenizer: str = 'white_space_tokenizer', - similarity_threshold: float = 0.5, - qgram: int = 2, # for jaccard - tokenizer_return_set = True, # unique values or not + vectorizer : str = None, + qgram : int = 1, + similarity_threshold: float = 0.0, + tokenizer_return_unique_values = True, # unique values or not attributes: any = None, - delim_set: list = None, # DelimiterTokenizer - padding: bool = True, # QgramTokenizer - prefix_pad: str = '#', # QgramTokenizer (if padding=True) - suffix_pad: str = '$' # QgramTokenizer (if padding=True) ) -> None: - super().__init__(metric, tokenizer, similarity_threshold, qgram, tokenizer_return_set, attributes, delim_set, padding, prefix_pad, suffix_pad) - self._budget : int = budget - + super().__init__(metric=similarity_function, + tokenizer=tokenizer, + vectorizer=vectorizer, + qgram=qgram, + similarity_threshold=similarity_threshold, + tokenizer_return_unique_values=tokenizer_return_unique_values, + attributes=attributes) + self.similarity_function : str = similarity_function + self.dataset_identifier : str = None + def predict(self, - blocks: dict, data: Data, + blocks: dict, + dataset_identifier: str = "dataset", + budget: int = 0, + algorithm : str = 'HB', + indexing : str = 'inorder', comparison_cleaner: AbstractMetablocking = None, tqdm_disable: bool = False, - method : str = 'HB', - emit_all_tps_stop : bool = False) -> Graph: - """Main method of progressive entity matching. Inputs a set of blocks and outputs a graph \ - that contains of the entity ids (nodes) and the similarity scores between them (edges). + emit_all_tps_stop : bool = False) -> List[Tuple[float, int, int]]: + """Main method of progressive entity matching. Inputs a set of blocks and outputs a list \ + that contains duplets of ids corresponding to candidate pairs to emit. 
Args: blocks (dict): blocks of entities data (Data): dataset module @@ -183,32 +171,165 @@ def predict(self, """ start_time = time() self.tqdm_disable = tqdm_disable + self._budget : int = budget + self._indexing : str = indexing self._comparison_cleaner: AbstractMetablocking = comparison_cleaner - self._method = method - self._emit_all_tps_stop = emit_all_tps_stop - self.true_pair_checked = None + self._algorithm : str= algorithm + self._emit_all_tps_stop : bool = emit_all_tps_stop + self.duplicate_emitted : dict = None if not self._emit_all_tps_stop else {} self._prediction_data : PredictionData = None + self.data : Data = data + self.duplicate_of = data.duplicate_of + self.scheduler : DatasetScheduler = None + self.dataset_identifier : str = dataset_identifier if not blocks: raise ValueError("Empty blocks structure") - self.data = data - self.pairs = Graph() + + if self.data.is_dirty_er and self._indexing == 'bilateral': + raise ValueError("Cannot apply bilateral indexing to dirty Entity Resolution (single dataset)") + + _inorder_blocks = blocks + self._pairs_top_score : dict = defaultdict(lambda: -1) all_blocks = list(blocks.values()) self._progress_bar = tqdm(total=len(blocks), - desc=self._method_name+" ("+self.metric+")", - disable=self.tqdm_disable) - if 'Block' in str(type(all_blocks[0])): - self._predict_raw_blocks(blocks) - elif isinstance(all_blocks[0], set): - if(self._comparison_cleaner == None): - raise AttributeError("No precalculated weights were given from the CC step") - self._predict_prunned_blocks(blocks) - else: - raise AttributeError("Wrong type of Blocks") + desc=self._method_name, + disable=self.tqdm_disable) + + if(indexing == 'bilateral'): self._indexing = 'inorder' + if(self._indexing == 'inorder'): + if 'Block' in str(type(all_blocks[0])): + self._predict_raw_blocks(blocks) + elif isinstance(all_blocks[0], set): + if(self._comparison_cleaner == None): + raise AttributeError("No precalculated weights were given from the CC step") + self._predict_prunned_blocks(blocks) + else: + raise AttributeError("Wrong type of Blocks") + self._schedule_candidates() + + + if(indexing == 'bilateral'): self._indexing = 'reverse' + if(self._indexing == 'reverse'): + _reverse_blocks = reverse_blocks_entity_indexing(_inorder_blocks, self.data) + self.data = reverse_data_indexing(self.data) + if 'Block' in str(type(all_blocks[0])): + self._predict_raw_blocks(_reverse_blocks) + elif isinstance(all_blocks[0], set): + if(self._comparison_cleaner == None): + raise AttributeError("No precalculated weights were given from the CC step") + self._predict_prunned_blocks(_reverse_blocks) + else: + raise AttributeError("Wrong type of Blocks") + self._schedule_candidates() + + self._gather_top_pairs() self.execution_time = time() - start_time self._progress_bar.close() - + return self.pairs + + + def _store_id_mappings(self) -> None: + """Stores the mapping [Workflow ID -> Dataframe ID] for the current indexing phase + """ + if(self._indexing == "inorder"): + self._inorder_d1_id = self.data._gt_to_ids_reversed_1 + self._inorder_d2_id = self.data._gt_to_ids_reversed_2 + if(self._indexing == "reverse"): + self._reverse_d1_id = self.data._gt_to_ids_reversed_1 + self._reverse_d2_id = self.data._gt_to_ids_reversed_2 + + def _schedule_candidates(self) -> None: + """Translates the workflow identifiers back into dataframe identifiers + Populates the dataset scheduler with the candidate pairs of the current indexing stage + """ + self.scheduler = DatasetScheduler(budget=float('inf') if 
self._emit_all_tps_stop else self._budget, global_top=(self._algorithm=="TOP")) if self.scheduler == None else self.scheduler + self._store_id_mappings() + + for score, entity, candidate in self.pairs: + # entities of first and second dataframe in the context of the current indexing + d1_entity, d2_entity = (entity, candidate) if(entity < candidate) else (candidate, entity) + d1_map, d2_map = (self._inorder_d1_id, self._inorder_d2_id) if (self._indexing == 'inorder') else (self._reverse_d1_id, self._reverse_d2_id) + + # print(f"#############################################################") + # print(f"Score: {score}") + # print(f"---------------Workflow IDs [{self._indexing}]---------------") + # print(f"Entity: {entity}") + # print(f"Candidate: {candidate}") + # print(f"---------------Workflow IDs [D1 context Ent First]---------------") + # print(f"D1 Entity: {d1_entity}") + # print(f"D2 Entity: {d2_entity}") + + # the dataframe ids of the entities from first and second dataset in the context of indexing + d1_entity_df_id, d2_entity_df_id = (d1_map[d1_entity], d2_map[d2_entity]) + _inorder_d1_entity_df_id, _inorder_d2_entity_df_id = (d1_entity_df_id, d2_entity_df_id) if (self._indexing == 'inorder') else (d2_entity_df_id, d1_entity_df_id) + if(self._emit_all_tps_stop and _inorder_d2_entity_df_id in self.duplicate_of[_inorder_d1_entity_df_id]): + self.duplicate_emitted[(_inorder_d1_entity_df_id, _inorder_d2_entity_df_id)] = False + + # in the case of reverse indexing stage, adjust the workflow identifiers of the entities so we can differ them from inorder entity ids + d1_entity = d1_entity if(self._indexing == 'inorder') else d1_entity + self.data.num_of_entities + d2_entity = d2_entity if(self._indexing == 'inorder') else d2_entity + self.data.num_of_entities + + # print(f"---------------Dataframe IDs [{self._indexing}]---------------") + # print(f"D1 Entity DF ID: {d1_entity_df_id}") + # print(f"D2 Entity DF ID: {d2_entity_df_id}") + # print(f"---------------Inorder Dataframe IDs [{self._indexing}]---------------") + # print(f"Inorder D1 Entity DF ID: {_inorder_d1_entity_df_id}") + # print(f"Inorder D2 Entity DF ID: {_inorder_d2_entity_df_id}") + # print(f"---------------Scheduler IDs [D1 context Ent First]---------------") + # print(f"D1 Entity: {d1_entity}") + # print(f"D2 Entity: {d2_entity}") + # if(_inorder_d2_entity_df_id in self.duplicate_of[_inorder_d1_entity_df_id]): + # print("^ THIS IS A TRUE POSITIVE ^") + # we want entities to be inserted in D1 -> D2 order (current context e.x. reverse) which translates to D2 -> D1 order (reverse context e.x. 
inorder) + self.scheduler._insert_entity_neighbor(d1_entity, d2_entity, score) + + def _inorder_phase_entity(self, id : int) -> bool: + """Given identifier corresponds to an entity proposed in the inorder indexing phase + + Args: + id (int): Identifier + + Returns: + bool: Identifier proposed in the inorder phase + """ + return id < self.data.num_of_entities + + def _retrieve_entity_df_id(self, id : int) -> int: + """Returns the corresponding id in the dataframe of the given entity id in the context of its indexing phase + + Args: + id (int): Workflow Identifier + + Returns: + int: Dataframe Identifier + """ + _workflow_id : int + _df_id_of : dict + if(self._inorder_phase_entity(id)): + _workflow_id = id + _df_id_of = self._inorder_d1_id if (_workflow_id < len(self._inorder_d1_id)) else self._inorder_d2_id + else: + _workflow_id = id - self.data.num_of_entities + _df_id_of = self._reverse_d1_id if (_workflow_id < len(self._reverse_d1_id)) else self._reverse_d2_id + + return _df_id_of[_workflow_id] + + def _gather_top_pairs(self) -> None: + """Emits the pairs from the scheduler based on the defined algorithm + """ + self.scheduler._sort_neighborhoods_by_avg_weight() + self.pairs = self.scheduler._emit_pairs(method=self._algorithm, data=self.data) + + _identified_pairs = [] + for score, entity, candidate in self.pairs: + _inorder_entities : bool = self._inorder_phase_entity(entity) + entity, candidate = (self._retrieve_entity_df_id(entity), self._retrieve_entity_df_id(candidate)) + entity, candidate = (entity, candidate) if _inorder_entities else (candidate, entity) + _identified_pairs.append((score, entity, candidate)) + + self.pairs = _identified_pairs def evaluate(self, prediction, @@ -242,22 +363,6 @@ def evaluate(self, export_to_dict, with_classification_report, verbose) - - def get_true_pair_checked(self): - if(self.true_pair_checked is None): - raise AttributeError("True positive pairs not defined in specified workflow.") - else: return self.true_pair_checked - - - @abstractmethod - def extract_tps_checked(self, **kwargs) -> dict: - """Constructs a dictionary of the form [true positive pair] -> emitted status, - containing all the true positive pairs that are emittable from the current subset of the dataset - - Returns: - dict: Dictionary that shows whether a TP pair (key) has been emitted (value) - """ - pass def get_prediction_data(self) -> PredictionData: if(self._prediction_data is None): @@ -275,46 +380,137 @@ def get_normalized_auc(self) -> float: def set_prediction_data(self, prediction_data : PredictionData): self._prediction_data : PredictionData = prediction_data - - -class HashBasedProgressiveMatching(ProgressiveMatching): - """Applies hash based candidate graph prunning, sorts retained comparisons and applies Progressive Matching +class BlockIndependentPM(ProgressiveMatching): + """Applies the matching process to a subset of available pairs progressively """ - _method_name: str = "Hash Based Progressive Matching" - _method_info: str = "Applies hash based candidate graph prunning, sorts retained comparisons and applies Progressive Matching" + _method_name: str = "Progressive Matching" + _method_info: str = "Applies the matching process to a subset of available pairs progressively " def __init__( self, - budget: int = 0, - w_scheme: str = 'X2', - metric: str = 'dice', + similarity_function: str = 'dice', tokenizer: str = 'white_space_tokenizer', - similarity_threshold: float = 0.5, - qgram: int = 2, # for jaccard - tokenizer_return_set = True, # unique values or not + 
vectorizer : str = None, + qgram : int = 1, + similarity_threshold: float = 0.0, + tokenizer_return_unique_values = True, # unique values or not attributes: any = None, - delim_set: list = None, # DelimiterTokenizer - padding: bool = True, # QgramTokenizer - prefix_pad: str = '#', # QgramTokenizer (if padding=True) - suffix_pad: str = '$' # QgramTokenizer (if padding=True) ) -> None: - super().__init__(budget, metric, tokenizer, similarity_threshold, qgram, tokenizer_return_set, attributes, delim_set, padding, prefix_pad, suffix_pad) - self._w_scheme : str = w_scheme + super().__init__(similarity_function=similarity_function, + tokenizer=tokenizer, + vectorizer=vectorizer, + qgram=qgram, + similarity_threshold=similarity_threshold, + tokenizer_return_unique_values=tokenizer_return_unique_values, + attributes=attributes) - def extract_tps_checked(self, **kwargs) -> dict: - _tps_checked = dict() - for entity, neighbors in self.blocks.items(): - for neighbor in neighbors: - entity_id = self.data._gt_to_ids_reversed_1[entity] if entity < self.data.dataset_limit else self.data._gt_to_ids_reversed_2[entity] - neighbor_id = self.data._gt_to_ids_reversed_1[neighbor] if neighbor < self.data.dataset_limit else self.data._gt_to_ids_reversed_2[neighbor] - _d1_entity, _d2_entity = (entity_id, neighbor_id) if entity < self.data.dataset_limit else (neighbor_id, entity_id) + def predict(self, + data: Data, + blocks: dict, + dataset_identifier: str = "dataset", + budget: int = 0, + algorithm : str = 'HB', + indexing : str = 'inorder', + comparison_cleaner: AbstractMetablocking = None, + tqdm_disable: bool = False, + emit_all_tps_stop : bool = False) -> List[Tuple[float, int, int]]: + """Main method of progressive entity matching. Inputs a set of blocks and outputs a list \ + that contains duplets of ids corresponding to candidate pairs to emit. + Args: + blocks (dict): blocks of entities + data (Data): dataset module + tqdm_disable (bool, optional): Disables progress bar. Defaults to False. 
+ method (str) : DFS/BFS/Hybrid approach for specified algorithm + emit_all_tps_stop (bool) : Stop emission once all true positives are found + Returns: + networkx.Graph: entity ids (nodes) and similarity scores between them (edges) + """ + start_time = time() + self.tqdm_disable = tqdm_disable + self._budget : int = budget + self._indexing : str = indexing + self._comparison_cleaner: AbstractMetablocking = comparison_cleaner + self._algorithm : str= algorithm + self._emit_all_tps_stop : bool = emit_all_tps_stop + self.duplicate_emitted : dict = None + self._prediction_data : PredictionData = None + self.data : Data = data + self.duplicate_of = data.duplicate_of + self.scheduler : DatasetScheduler = None + self.dataset_identifier : str = dataset_identifier + + if self.data.is_dirty_er and self._indexing == 'bilateral': + raise ValueError("Cannot apply bilateral indexing to dirty Entity Resolution (single dataset)") - if _d2_entity in self.data.pairs_of[_d1_entity]: - _tps_checked[canonical_swap(_d1_entity, _d2_entity)] = False - return _tps_checked + _inorder_blocks = blocks + self._pairs_top_score : dict = defaultdict(lambda: -1) + all_blocks = list(blocks.values()) if blocks is not None else None + self._progress_bar = tqdm(total=len(blocks) if blocks is not None else 0, + desc=self._method_name, + disable=self.tqdm_disable) + if(indexing == 'bilateral'): self._indexing = 'inorder' + if(self._indexing == 'inorder'): + if all_blocks is None or 'Block' in str(type(all_blocks[0])): + self._predict_raw_blocks(blocks) + elif isinstance(all_blocks[0], set): + if(self._comparison_cleaner == None): + raise AttributeError("No precalculated weights were given from the CC step") + self._predict_prunned_blocks(blocks) + else: + raise AttributeError("Wrong type of Blocks") + self._schedule_candidates() + + + if(indexing == 'bilateral'): self._indexing = 'reverse' + if(self._indexing == 'reverse'): + _reverse_blocks = reverse_blocks_entity_indexing(_inorder_blocks, self.data) + self.data = reverse_data_indexing(self.data) + if all_blocks is None or 'Block' in str(type(all_blocks[0])): + self._predict_raw_blocks(_reverse_blocks) + elif isinstance(all_blocks[0], set): + if(self._comparison_cleaner == None): + raise AttributeError("No precalculated weights were given from the CC step") + self._predict_prunned_blocks(_reverse_blocks) + else: + raise AttributeError("Wrong type of Blocks") + self._schedule_candidates() + + self._gather_top_pairs() + self.execution_time = time() - start_time + self._progress_bar.close() + + return self.pairs + +class HashBasedProgressiveMatching(ProgressiveMatching): + """Applies hash based candidate graph prunning, sorts retained comparisons and applies Progressive Matching + """ + + _method_name: str = "Hash Based Progressive Matching" + _method_info: str = "Applies hash based candidate graph prunning, sorts retained comparisons and applies Progressive Matching" + + def __init__( + self, + weighting_scheme: str = 'X2', + similarity_function: str = 'dice', + tokenizer: str = 'white_space_tokenizer', + vectorizer : str = None, + qgram : int = 1, + similarity_threshold: float = 0.0, + tokenizer_return_unique_values = True, # unique values or not + attributes: any = None, + ) -> None: + + super().__init__(similarity_function=similarity_function, + tokenizer=tokenizer, + vectorizer=vectorizer, + qgram=qgram, + similarity_threshold=similarity_threshold, + tokenizer_return_unique_values=tokenizer_return_unique_values, + attributes=attributes) + self._weighting_scheme : str = 
weighting_scheme class GlobalTopPM(HashBasedProgressiveMatching): """Applies Progressive CEP, sorts retained comparisons and applies Progressive Matching @@ -324,49 +520,54 @@ class GlobalTopPM(HashBasedProgressiveMatching): _method_info: str = "Applies Progressive CEP, sorts retained comparisons and applies Progressive Matching" def __init__( - self, - budget: int = 0, - w_scheme: str = 'X2', - metric: str = 'dice', - tokenizer: str = 'white_space_tokenizer', - similarity_threshold: float = 0.5, - qgram: int = 2, # for jaccard - tokenizer_return_set = True, # unique values or not - attributes: any = None, - delim_set: list = None, # DelimiterTokenizer - padding: bool = True, # QgramTokenizer - prefix_pad: str = '#', # QgramTokenizer (if padding=True) - suffix_pad: str = '$' # QgramTokenizer (if padding=True) - ) -> None: - - super().__init__(budget, w_scheme, metric, tokenizer, similarity_threshold, qgram, tokenizer_return_set, attributes, delim_set, padding, prefix_pad, suffix_pad) - - def _predict_raw_blocks(self, blocks: dict) -> None: - pcep : ProgressiveCardinalityEdgePruning = ProgressiveCardinalityEdgePruning(self._w_scheme, self._budget) + self, + weighting_scheme: str = 'X2', + similarity_function: str = 'dice', + tokenizer: str = 'white_space_tokenizer', + vectorizer : str = None, + qgram : int = 1, + similarity_threshold: float = 0.0, + tokenizer_return_unique_values = True, # unique values or not + attributes: any = None, + ) -> None: + + super().__init__(weighting_scheme=weighting_scheme, + similarity_function=similarity_function, + tokenizer=tokenizer, + vectorizer=vectorizer, + qgram=qgram, + similarity_threshold=similarity_threshold, + tokenizer_return_unique_values=tokenizer_return_unique_values, + attributes=attributes) + + def _predict_raw_blocks(self, blocks: dict) -> List[Tuple[int, int]]: + self.pairs = Graph() + pcep : ProgressiveCardinalityEdgePruning = ProgressiveCardinalityEdgePruning(self._weighting_scheme, self._budget) candidates : dict = pcep.process(blocks=blocks, data=self.data, tqdm_disable=True, cc=None, emit_all_tps_stop=self._emit_all_tps_stop) self.blocks = candidates - if(self._emit_all_tps_stop): self.true_pair_checked = self.extract_tps_checked() for entity_id, candidate_ids in candidates.items(): for candidate_id in candidate_ids: self._insert_to_graph(entity_id, candidate_id, pcep.get_precalculated_weight(entity_id, candidate_id)) - self.pairs.edges = sorted(self.pairs.edges(data=True), key=lambda x: x[2]['weight'], reverse=True) - return self.pairs.edges + self.pairs.edges = sorted(self.pairs.edges(data=True), key=lambda x: x[2]['weight'], reverse=True) + self.pairs = [(edge[2]['weight'], edge[0], edge[1]) for edge in self.pairs.edges] + return self.pairs - def _predict_prunned_blocks(self, blocks: dict) -> None: - pcep : ProgressiveCardinalityEdgePruning = ProgressiveCardinalityEdgePruning(self._w_scheme, self._budget) + def _predict_prunned_blocks(self, blocks: dict) -> List[Tuple[int, int]]: + self.pairs = Graph() + pcep : ProgressiveCardinalityEdgePruning = ProgressiveCardinalityEdgePruning(self._weighting_scheme, self._budget) candidates : dict = pcep.process(blocks=blocks, data=self.data, tqdm_disable=True, cc=self._comparison_cleaner, emit_all_tps_stop=self._emit_all_tps_stop) self.blocks = candidates - if(self._emit_all_tps_stop): self.true_pair_checked = self.extract_tps_checked() for entity_id, candidate_ids in candidates.items(): for candidate_id in candidate_ids: self._insert_to_graph(entity_id, candidate_id, 
self._comparison_cleaner.get_precalculated_weight(entity_id, candidate_id)) self.pairs.edges = sorted(self.pairs.edges(data=True), key=lambda x: x[2]['weight'], reverse=True) - return self.pairs.edges + self.pairs = [(edge[2]['weight'], edge[0], edge[1]) for edge in self.pairs.edges] + return self.pairs class LocalTopPM(HashBasedProgressiveMatching): """Applies Progressive CNP, sorts retained comparisons and applies Progressive Matching @@ -376,53 +577,58 @@ class LocalTopPM(HashBasedProgressiveMatching): _method_info: str = "Applies Progressive CNP, sorts retained comparisons and applies Progressive Matching" def __init__( - self, - budget: int = 0, - w_scheme: str = 'X2', - metric: str = 'dice', - tokenizer: str = 'white_space_tokenizer', - similarity_threshold: float = 0.5, - qgram: int = 2, # for jaccard - tokenizer_return_set = True, # unique values or not - attributes: any = None, - delim_set: list = None, # DelimiterTokenizer - padding: bool = True, # QgramTokenizer - prefix_pad: str = '#', # QgramTokenizer (if padding=True) - suffix_pad: str = '$' # QgramTokenizer (if padding=True) - ) -> None: - - super().__init__(budget, w_scheme, metric, tokenizer, similarity_threshold, qgram, tokenizer_return_set, attributes, delim_set, padding, prefix_pad, suffix_pad) - - - def _predict_raw_blocks(self, blocks: dict) -> None: - pcnp : ProgressiveCardinalityNodePruning = ProgressiveCardinalityNodePruning(self._w_scheme, self._budget) - candidates : dict = pcnp.process(blocks=blocks, data=self.data, tqdm_disable=True, cc=None, emit_all_tps_stop=self._emit_all_tps_stop) + self, + weighting_scheme: str = 'X2', + similarity_function: str = 'dice', + number_of_nearest_neighbors: int = 10, + tokenizer: str = 'white_space_tokenizer', + vectorizer : str = None, + qgram : int = 1, + similarity_threshold: float = 0.0, + tokenizer_return_unique_values = True, # unique values or not + attributes: any = None, + ) -> None: + + super().__init__(weighting_scheme=weighting_scheme, + similarity_function=similarity_function, + tokenizer=tokenizer, + vectorizer=vectorizer, + qgram=qgram, + similarity_threshold=similarity_threshold, + tokenizer_return_unique_values=tokenizer_return_unique_values, + attributes=attributes) + self._number_of_nearest_neighbors : int = number_of_nearest_neighbors + + def _predict_raw_blocks(self, blocks: dict) -> List[Tuple[int, int]]: + self.pairs = Graph() + pcnp : ProgressiveCardinalityNodePruning = ProgressiveCardinalityNodePruning(weighting_scheme=self._weighting_scheme, budget=self._budget) + candidates : dict = pcnp.process(blocks=blocks, data=self.data, number_of_nearest_neighbors=self._number_of_nearest_neighbors, tqdm_disable=True, cc=None, emit_all_tps_stop=self._emit_all_tps_stop) self.blocks = candidates - if(self._emit_all_tps_stop): self.true_pair_checked = self.extract_tps_checked() for entity_id, candidate_ids in candidates.items(): for candidate_id in candidate_ids: self._insert_to_graph(entity_id, candidate_id, pcnp.get_precalculated_weight(entity_id, candidate_id)) - self.pairs.edges = sorted(self.pairs.edges(data=True), key=lambda x: x[2]['weight'], reverse=True) - return self.pairs.edges - - def _predict_prunned_blocks(self, blocks: dict) -> None: + self.pairs.edges = sorted(self.pairs.edges(data=True), key=lambda x: x[2]['weight'], reverse=True) + self.pairs = [(edge[2]['weight'], edge[0], edge[1]) for edge in self.pairs.edges] + return self.pairs - pcnp : ProgressiveCardinalityNodePruning = ProgressiveCardinalityNodePruning(self._w_scheme, self._budget) - 
candidates : dict = pcnp.process(blocks=blocks, data=self.data, tqdm_disable=True, cc=self._comparison_cleaner, emit_all_tps_stop=self._emit_all_tps_stop) + def _predict_prunned_blocks(self, blocks: dict) -> List[Tuple[int, int]]: + self.pairs = Graph() + pcnp : ProgressiveCardinalityNodePruning = ProgressiveCardinalityNodePruning(self._weighting_scheme, self._budget) + candidates : dict = pcnp.process(blocks=blocks, data=self.data, number_of_nearest_neighbors=self._number_of_nearest_neighbors, tqdm_disable=True, cc=self._comparison_cleaner, emit_all_tps_stop=self._emit_all_tps_stop) self.blocks = candidates - if(self._emit_all_tps_stop): self.true_pair_checked = self.extract_tps_checked() for entity_id, candidate_ids in candidates.items(): for candidate_id in candidate_ids: self._insert_to_graph(entity_id, candidate_id, self._comparison_cleaner.get_precalculated_weight(entity_id, candidate_id)) self.pairs.edges = sorted(self.pairs.edges(data=True), key=lambda x: x[2]['weight'], reverse=True) - return self.pairs.edges + self.pairs = [(edge[2]['weight'], edge[0], edge[1]) for edge in self.pairs.edges] + return self.pairs -class EmbeddingsNNBPM(ProgressiveMatching): +class EmbeddingsNNBPM(BlockIndependentPM): """Utilizes/Creates entity embeddings, constructs neighborhoods via NN Approach and applies Progressive Matching """ @@ -431,78 +637,33 @@ class EmbeddingsNNBPM(ProgressiveMatching): def __init__( self, - budget: int = 0, - vectorizer: str = 'bert', + language_model: str = 'bert', + number_of_nearest_neighbors: int = 10, similarity_search: str = 'faiss', - vector_size: int = 200, + vector_size: int = 300, num_of_clusters: int = 5, - metric: str = 'dice', + similarity_function: str = 'cosine', tokenizer: str = 'white_space_tokenizer', - similarity_threshold: float = 0.5, - qgram: int = 2, # for jaccard - tokenizer_return_set = True, # unique values or not - attributes: any = None, - delim_set: list = None, # DelimiterTokenizer - padding: bool = True, # QgramTokenizer - prefix_pad: str = '#', # QgramTokenizer (if padding=True) - suffix_pad: str = '$' # QgramTokenizer (if padding=True) + vectorizer : str = None, + qgram : int = 1, + similarity_threshold: float = 0.0, + tokenizer_return_unique_values = True, # unique values or not + attributes: any = None ) -> None: - super().__init__(budget, metric, tokenizer, similarity_threshold, qgram, tokenizer_return_set, attributes, delim_set, padding, prefix_pad, suffix_pad) - self._vectorizer = vectorizer - self._similarity_search = similarity_search - self._vector_size = vector_size - self._num_of_clusters = num_of_clusters - + super().__init__(similarity_function=similarity_function, + tokenizer=tokenizer, + vectorizer=vectorizer, + qgram=qgram, + similarity_threshold=similarity_threshold, + tokenizer_return_unique_values=tokenizer_return_unique_values, + attributes=attributes) - def predict(self, - data: Data, - blocks: dict = None, - comparison_cleaner: AbstractMetablocking = None, - tqdm_disable: bool = False, - method : str = 'HB', - emit_all_tps_stop : bool = False) -> Graph: - """Main method of progressive entity matching. Inputs a set of blocks and outputs a graph \ - that contains of the entity ids (nodes) and the similarity scores between them (edges). - Args: - blocks (dict): blocks of entities - data (Data): dataset module - tqdm_disable (bool, optional): Disables progress bar. Defaults to False. 
- method (str) : DFS/BFS/Hybrid approach for specified algorithm - emit_all_tps_stop (bool) : Stop emission once all true positives are found - Returns: - networkx.Graph: entity ids (nodes) and similarity scores between them (edges) - """ - start_time = time() - self.tqdm_disable = tqdm_disable - self._comparison_cleaner: AbstractMetablocking = comparison_cleaner - self._method = method - self._emit_all_tps_stop = emit_all_tps_stop - self.true_pair_checked = None - self._prediction_data : PredictionData = None - self.data = data - self.pairs = Graph() - - if blocks is None: - # applying the process to the whole dataset - self._predict_raw_blocks(blocks) - else: - all_blocks = list(blocks.values()) - self._progress_bar = tqdm(total=len(blocks), - desc=self._method_name+" ("+self.metric+")", - disable=self.tqdm_disable) - if 'Block' in str(type(all_blocks[0])): - self._predict_raw_blocks(blocks) - elif isinstance(all_blocks[0], set): - if(self._comparison_cleaner == None): - raise AttributeError("No precalculated weights were given from the CC step") - self._predict_prunned_blocks(blocks) - else: - raise AttributeError("Wrong type of Blocks") - self._progress_bar.close() - - self.execution_time = time() - start_time - return self.pairs + self._language_model : str = language_model + self._number_of_nearest_neighbors : int = number_of_nearest_neighbors + self._similarity_search : str = similarity_search + self._vector_size : int = vector_size + self._num_of_clusters : int = num_of_clusters def _top_pair_emission(self) -> None: """Applies global sorting to all entity pairs produced by NN, @@ -518,8 +679,7 @@ def _top_pair_emission(self) -> None: candidate_id = self.ennbb._si.d1_retained_ids[self.neighbors[i][j]] self.pairs.append((entity_id, candidate_id, self.scores[i][j])) - self.pairs = sorted(self.pairs, key=lambda x: x[2], reverse=True) - self.pairs = [(x[0], x[1]) for x in self.pairs] + self.pairs = [(x[2], x[0], x[1]) for x in self.pairs] def _dfs_pair_emission(self) -> None: """Sorts NN neighborhoods in ascending average distance from their query entity, @@ -542,7 +702,7 @@ def _dfs_pair_emission(self) -> None: neighbor_id = self.ennbb._si.d1_retained_ids[neighbor] self.pairs.append((entity_id, neighbor_id, neighbor_scores[neighbor_index])) - self.pairs = [(x[0], x[1]) for x in self.pairs] + self.pairs = [(x[2], x[0], x[1]) for x in self.pairs] def _hb_pair_emission(self) -> None: """Sorts NN neighborhoods in ascending average distance from their query entity, @@ -568,7 +728,7 @@ def _hb_pair_emission(self) -> None: _current_emissions = _remaining_emissions if neighbor_index else _first_emissions _current_emissions.append((entity_id, neighbor_id, neighbor_scores[neighbor_index])) - self.pairs = [(x[0], x[1]) for x in _first_emissions] + [(x[0], x[1]) for x in _remaining_emissions] + self.pairs = [(x[2], x[0], x[1]) for x in _first_emissions] + [(x[2], x[0], x[1]) for x in _remaining_emissions] def _bfs_pair_emission(self) -> None: """Sorts NN neighborhoods in ascending average distance from their query entity, @@ -589,68 +749,106 @@ def _bfs_pair_emission(self) -> None: else self.ennbb._si.d2_retained_ids[sorted_neighborhood] self.pairs.append((entity_id, neighbor_id, self.scores[sorted_neighborhood][current_emission_per_pair])) - self.pairs = [(x[0], x[1]) for x in self.pairs] + self.pairs = [(x[2], x[0], x[1]) for x in self.pairs] def _produce_pairs(self): """Calls pairs emission based on the requested approach Raises: AttributeError: Given emission technique hasn't been defined """ 
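# --- Editorial example (not part of the diff) ----------------------------------
# A minimal, standalone sketch of the four emission orderings (TOP / DFS / BFS /
# HB) that the *_pair_emission methods above implement, shown on toy numpy
# arrays. The function and variable names here are hypothetical and not part of
# the pyJedAI API; "scores" stand in for similarities, so higher is better.
import numpy as np

def emission_order(neighbors: np.ndarray, scores: np.ndarray, method: str):
    """Return (score, query_entity, neighbor) triples in the requested order."""
    n_queries, k = neighbors.shape
    # neighborhoods ranked by descending average similarity to their query entity
    ranked = np.argsort(-scores.mean(axis=1)).tolist()
    if method == 'TOP':    # one global sort over all candidate pairs
        triples = [(float(scores[q][j]), q, int(neighbors[q][j]))
                   for q in range(n_queries) for j in range(k)]
        return sorted(triples, reverse=True)
    if method == 'DFS':    # exhaust each neighborhood before moving to the next
        return [(float(scores[q][j]), q, int(neighbors[q][j]))
                for q in ranked for j in range(k)]
    if method == 'BFS':    # round-robin: j-th best of every neighborhood, then (j+1)-th
        return [(float(scores[q][j]), q, int(neighbors[q][j]))
                for j in range(k) for q in ranked]
    if method == 'HB':     # hybrid: best pair of every neighborhood first, then the rest
        first = [(float(scores[q][0]), q, int(neighbors[q][0])) for q in ranked]
        rest = [(float(scores[q][j]), q, int(neighbors[q][j]))
                for q in ranked for j in range(1, k)]
        return first + rest
    raise AttributeError(method + ' emission technique is undefined!')

# toy data: two query entities with their top-2 neighbors and similarity scores
neighbors = np.array([[7, 3], [5, 9]])
scores = np.array([[0.9, 0.2], [0.8, 0.7]])
print(emission_order(neighbors, scores, 'HB'))
# -> [(0.8, 1, 5), (0.9, 0, 7), (0.7, 1, 9), (0.2, 0, 3)]
# --------------------------------------------------------------------------------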
- if(self._method == 'DFS'): - self._dfs_pair_emission() - elif(self._method == 'HB'): - self._hb_pair_emission() - elif(self._method == 'BFS'): - self._bfs_pair_emission() - elif(self._method == 'TOP'): - self._top_pair_emission() + # currently first phase algorithms are in charge of gathering the subset of the original dataset + # that will be used to initialize the scheduler, we simply retrieve all the pairs and their scores + self._top_pair_emission() + + def save_datasets_embeddings(self, vectors_1: np.array, vectors_2: np.array) -> None: + """Stores the non-precalculated (not loaded) embeddings in corresponding dataset paths + """ + + if(self._d1_emb_load_path is None): + try: + print(f"Saving D1 Embeddings -> {self._d1_emb_save_path}") + np.save(self._d1_emb_save_path, vectors_1) + pass + except FileNotFoundError: + print(f"Unable to save Embeddings -> {self._d1_emb_save_path}") + + if(self._d2_emb_load_path is None): + try: + print(f"Saving D2 Embeddings -> {self._d2_emb_save_path}") + np.save(self._d2_emb_save_path, vectors_2) + pass + except FileNotFoundError: + print(f"Unable to save Embeddings -> {self._d2_emb_save_path}") + + def retrieve_embeddings_file_paths(self): + return(self.retrieve_dataset_embeddings_file_path(first_dataset=True), self.retrieve_dataset_embeddings_file_path(first_dataset=False)) + + def retrieve_dataset_embeddings_file_path(self, first_dataset : bool = True) -> str: + """Attemps to retrieve the precalculated embeddings of first/second dataset from disk for current experiment + Returns: + str: Precalculated Embeddings file path (None if doesn't exist) + """ + + _requested_indexing, _opposite_indexing = ("reverse", "inorder") if (self._indexing == "reverse") \ + else ("inorder", "reverse") + _requested_dataset, _opposite_dataset = ("1","2") if(first_dataset) \ + else ("2", "1") + + _requested_indexing_file_name = '_'.join([_requested_indexing, self.dataset_identifier, self._language_model, _requested_dataset + ".npy"]) + _opposite_indexing_file_name = '_'.join([_opposite_indexing, self.dataset_identifier, self._language_model, _opposite_dataset + ".npy"]) + + hidden_directory_path = os.path.join(os.getcwd(), ".embs") + os.makedirs(hidden_directory_path, exist_ok=True) + + + _available_file_path : str = None + _requested_indexing_file_path = os.path.join(hidden_directory_path, _requested_indexing_file_name) + _opposite_indexing_file_path = os.path.join(hidden_directory_path, _opposite_indexing_file_name) + + if(os.path.exists(_requested_indexing_file_path) and os.path.isfile(_requested_indexing_file_path)): + _available_file_path = _requested_indexing_file_path + elif(os.path.exists(_opposite_indexing_file_path) and os.path.isfile(_opposite_indexing_file_path)): + _available_file_path = _opposite_indexing_file_path + + if(first_dataset): + self._d1_emb_load_path = _available_file_path + self._d1_emb_save_path = _requested_indexing_file_path else: - raise AttributeError(self._method + ' emission technique is undefined!') - - def _predict_raw_blocks(self, blocks: dict = None) -> None: - self.ennbb : EmbeddingsNNBlockBuilding = EmbeddingsNNBlockBuilding(self._vectorizer, self._similarity_search) - self.final_blocks = self.ennbb.build_blocks(data = self.data, - num_of_clusters = self._num_of_clusters, - top_k = int(max(1, int(self._budget / self.data.num_of_entities) + (self._budget % self.data.num_of_entities > 0))) - if not self._emit_all_tps_stop else self._budget, - return_vectors = False, - tqdm_disable = False, - save_embeddings = True, - 
load_embeddings_if_exist = True, - with_entity_matching = False, - input_cleaned_blocks = blocks) + self._d2_emb_load_path = _available_file_path + self._d2_emb_save_path = _requested_indexing_file_path + + return _available_file_path + def _predict_raw_blocks(self, blocks: dict = None) -> List[Tuple[int, int]]: + self.ennbb : EmbeddingsNNBlockBuilding = EmbeddingsNNBlockBuilding(self._language_model, self._similarity_search) + + + load_path_d1, load_path_d2 = self.retrieve_embeddings_file_paths() + + self.final_blocks = self.ennbb.build_blocks(data=self.data, + vector_size=self._vector_size, + num_of_clusters=self._num_of_clusters, + top_k=self._number_of_nearest_neighbors, + return_vectors=False, + tqdm_disable=False, + save_embeddings=False, + load_embeddings_if_exist=True, + load_path_d1=load_path_d1, + load_path_d2=load_path_d2, + with_entity_matching=False, + input_cleaned_blocks=blocks, + similarity_distance=self.similarity_function) + + self.save_datasets_embeddings(vectors_1=self.ennbb.vectors_1, vectors_2=self.ennbb.vectors_2) self.scores = self.ennbb.distances self.neighbors = self.ennbb.neighbors self.final_vectors = (self.ennbb.vectors_1, self.ennbb.vectors_2) - self._produce_pairs() - if(self._emit_all_tps_stop): - self.true_pair_checked = self.extract_tps_checked() return self.pairs - def _predict_prunned_blocks(self, blocks: dict = None) -> None: + def _predict_prunned_blocks(self, blocks: dict = None) -> List[Tuple[int, int]]: return self._predict_raw_blocks(blocks) - def extract_tps_checked(self, **kwargs) -> dict: - _tps_checked = dict() - _neighbors = self.neighbors - - for row in range(_neighbors.shape[0]): - entity = self.ennbb._si.d1_retained_ids[row] \ - if self.data.is_dirty_er \ - else self.ennbb._si.d2_retained_ids[row] - entity_id = self.data._gt_to_ids_reversed_1[entity] if entity < self.data.dataset_limit else self.data._gt_to_ids_reversed_2[entity] - for column in range(_neighbors.shape[1]): - if(_neighbors[row][column] != -1): - neighbor = self.ennbb._si.d1_retained_ids[_neighbors[row][column]] - neighbor_id = self.data._gt_to_ids_reversed_1[neighbor] if neighbor < self.data.dataset_limit else self.data._gt_to_ids_reversed_2[neighbor] - _d1_entity, _d2_entity = (entity_id, neighbor_id) if entity < self.data.dataset_limit else (neighbor_id, entity_id) - if _d2_entity in self.data.pairs_of[_d1_entity]: - _tps_checked[canonical_swap(_d1_entity, _d2_entity)] = False - - return _tps_checked - class SimilarityBasedProgressiveMatching(ProgressiveMatching): """Applies similarity based candidate graph prunning, sorts retained comparisons and applies Progressive Matching """ @@ -660,25 +858,25 @@ class SimilarityBasedProgressiveMatching(ProgressiveMatching): def __init__( self, - budget: int = 0, - pwScheme: str = 'ACF', - metric: str = 'dice', + weighting_scheme: str = 'ACF', + window_size: int = 10, + similarity_function: str = 'dice', tokenizer: str = 'white_space_tokenizer', - similarity_threshold: float = 0.5, - qgram: int = 2, # for jaccard - tokenizer_return_set = True, # unique values or not - attributes: any = None, - delim_set: list = None, # DelimiterTokenizer - padding: bool = True, # QgramTokenizer - prefix_pad: str = '#', # QgramTokenizer (if padding=True) - suffix_pad: str = '$' # QgramTokenizer (if padding=True) + vectorizer : str = None, + qgram : int = 1, + similarity_threshold: float = 0.0, + tokenizer_return_unique_values = True, # unique values or not + attributes: any = None ) -> None: - - super().__init__(budget, metric, tokenizer, 
similarity_threshold, qgram, tokenizer_return_set, attributes, delim_set, padding, prefix_pad, suffix_pad) - self._pwScheme : str = pwScheme - - def extract_tps_checked(self, **kwargs) -> dict: - pass + super().__init__(similarity_function=similarity_function, + tokenizer=tokenizer, + vectorizer=vectorizer, + qgram=qgram, + similarity_threshold=similarity_threshold, + tokenizer_return_unique_values=tokenizer_return_unique_values, + attributes=attributes) + self._weighting_scheme : str = weighting_scheme + self._window_size : int = window_size class GlobalPSNM(SimilarityBasedProgressiveMatching): """Applies Global Progressive Sorted Neighborhood Matching @@ -691,49 +889,35 @@ class GlobalPSNM(SimilarityBasedProgressiveMatching): def __init__( self, - budget: int = 0, - pwScheme: str = 'ACF', - metric: str = 'dice', + weighting_scheme: str = 'ACF', + window_size: int = 10, + similarity_function: str = 'dice', tokenizer: str = 'white_space_tokenizer', - similarity_threshold: float = 0.5, - qgram: int = 2, # for jaccard - tokenizer_return_set = True, # unique values or not - attributes: any = None, - delim_set: list = None, # DelimiterTokenizer - padding: bool = True, # QgramTokenizer - prefix_pad: str = '#', # QgramTokenizer (if padding=True) - suffix_pad: str = '$' # QgramTokenizer (if padding=True) + vectorizer : str = None, + qgram : int = 1, + similarity_threshold: float = 0.0, + tokenizer_return_unique_values = True, # unique values or not + attributes: any = None ) -> None: - super().__init__(budget, pwScheme, metric, tokenizer, similarity_threshold, qgram, tokenizer_return_set, attributes, delim_set, padding, prefix_pad, suffix_pad) - - def _predict_raw_blocks(self, blocks: dict): - gpsn : GlobalProgressiveSortedNeighborhood = GlobalProgressiveSortedNeighborhood(self._pwScheme, self._budget) - candidates : PriorityQueue = gpsn.process(blocks=blocks, data=self.data, tqdm_disable=True, emit_all_tps_stop=self._emit_all_tps_stop) - self.pairs = [] - while(not candidates.empty()): - _, entity_id, candidate_id = candidates.get() - self.pairs.append((entity_id, candidate_id)) - if(self._emit_all_tps_stop): self.true_pair_checked = self.extract_tps_checked(entity=entity_id, neighbor=candidate_id) - + super().__init__(weighting_scheme=weighting_scheme, + window_size=window_size, + similarity_function=similarity_function, + tokenizer=tokenizer, + vectorizer=vectorizer, + qgram=qgram, + similarity_threshold=similarity_threshold, + tokenizer_return_unique_values=tokenizer_return_unique_values, + attributes=attributes) + + def _predict_raw_blocks(self, blocks: dict) -> List[Tuple[float, int, int]]: + gpsn : GlobalProgressiveSortedNeighborhood = GlobalProgressiveSortedNeighborhood(self._weighting_scheme, self._budget) + self.pairs : List[Tuple[float, int, int]] = gpsn.process(blocks=blocks, data=self.data, window_size=self._window_size, tqdm_disable=True, emit_all_tps_stop=self._emit_all_tps_stop) return self.pairs - def _predict_prunned_blocks(self, blocks: dict): + def _predict_prunned_blocks(self, blocks: dict) -> List[Tuple[float, int, int]]: raise NotImplementedError("Sorter Neighborhood Algorithms don't support prunned blocks") - def extract_tps_checked(self, **kwargs) -> dict: - self.true_pair_checked = dict() if self.true_pair_checked is None else self.true_pair_checked - entity = kwargs['entity'] - neighbor = kwargs['neighbor'] - - entity_id = self.data._gt_to_ids_reversed_1[entity] if entity < self.data.dataset_limit else self.data._gt_to_ids_reversed_2[entity] - neighbor_id = 
self.data._gt_to_ids_reversed_1[neighbor] if neighbor < self.data.dataset_limit else self.data._gt_to_ids_reversed_2[neighbor] - _d1_entity, _d2_entity = (entity_id, neighbor_id) if entity < self.data.dataset_limit else (neighbor_id, entity_id) - if _d2_entity in self.data.pairs_of[_d1_entity]: - self.true_pair_checked[canonical_swap(_d1_entity, _d2_entity)] = False - - return self.true_pair_checked - class LocalPSNM(SimilarityBasedProgressiveMatching): """Applies Local Progressive Sorted Neighborhood Matching """ @@ -745,44 +929,35 @@ class LocalPSNM(SimilarityBasedProgressiveMatching): def __init__( self, - budget: int = 0, - pwScheme: str = 'ACF', - metric: str = 'dice', + weighting_scheme: str = 'ACF', + window_size: int = 10, + similarity_function: str = 'dice', tokenizer: str = 'white_space_tokenizer', - similarity_threshold: float = 0.5, - qgram: int = 2, # for jaccard - tokenizer_return_set = True, # unique values or not - attributes: any = None, - delim_set: list = None, # DelimiterTokenizer - padding: bool = True, # QgramTokenizer - prefix_pad: str = '#', # QgramTokenizer (if padding=True) - suffix_pad: str = '$' # QgramTokenizer (if padding=True) + vectorizer : str = None, + qgram : int = 1, + similarity_threshold: float = 0.0, + tokenizer_return_unique_values = True, # unique values or not + attributes: any = None ) -> None: - super().__init__(budget, pwScheme, metric, tokenizer, similarity_threshold, qgram, tokenizer_return_set, attributes, delim_set, padding, prefix_pad, suffix_pad) - - def _predict_raw_blocks(self, blocks: dict): - lpsn : LocalProgressiveSortedNeighborhood = LocalProgressiveSortedNeighborhood(self._pwScheme, self._budget) - candidates : list = lpsn.process(blocks=blocks, data=self.data, tqdm_disable=True, emit_all_tps_stop=self._emit_all_tps_stop) - self.pairs = candidates - if(self._emit_all_tps_stop): self.true_pair_checked = self.extract_tps_checked(candidates=candidates) + super().__init__(weighting_scheme=weighting_scheme, + window_size=window_size, + similarity_function=similarity_function, + tokenizer=tokenizer, + vectorizer=vectorizer, + qgram=qgram, + similarity_threshold=similarity_threshold, + tokenizer_return_unique_values=tokenizer_return_unique_values, + attributes=attributes) + + def _predict_raw_blocks(self, blocks: dict) -> List[Tuple[float, int, int]]: + lpsn : LocalProgressiveSortedNeighborhood = LocalProgressiveSortedNeighborhood(self._weighting_scheme, self._budget) + self.pairs : List[Tuple[float, int, int]] = lpsn.process(blocks=blocks, data=self.data, window_size=self._window_size, tqdm_disable=True, emit_all_tps_stop=self._emit_all_tps_stop) return self.pairs - def _predict_prunned_blocks(self, blocks: dict): + def _predict_prunned_blocks(self, blocks: dict) -> List[Tuple[float, int, int]]: raise NotImplementedError("Sorter Neighborhood Algorithms don't support prunned blocks " + \ "(pre comparison-cleaning entities per block distribution required") - - def extract_tps_checked(self, **kwargs) -> dict: - _tps_checked = dict() - _candidates = kwargs['candidates'] - - for entity, neighbor in _candidates: - entity_id = self.data._gt_to_ids_reversed_1[entity] if entity < self.data.dataset_limit else self.data._gt_to_ids_reversed_2[entity] - neighbor_id = self.data._gt_to_ids_reversed_1[neighbor] if neighbor < self.data.dataset_limit else self.data._gt_to_ids_reversed_2[neighbor] - _d1_entity, _d2_entity = (entity_id, neighbor_id) if entity < self.data.dataset_limit else (neighbor_id, entity_id) - if _d2_entity in 
self.data.pairs_of[_d1_entity]: - _tps_checked[canonical_swap(_d1_entity, _d2_entity)] = False - return _tps_checked class RandomPM(ProgressiveMatching): """Picks a number of random comparisons equal to the available budget """ @@ -792,12 +967,11 @@ class RandomPM(ProgressiveMatching): def __init__( self, - budget: int = 0, - metric: str = 'dice', + similarity_function: str = 'dice', tokenizer: str = 'white_space_tokenizer', - similarity_threshold: float = 0.5, + similarity_threshold: float = 0.0, qgram: int = 2, # for jaccard - tokenizer_return_set = True, # unique values or not + tokenizer_return_unique_values = True, # unique values or not attributes: any = None, delim_set: list = None, # DelimiterTokenizer padding: bool = True, # QgramTokenizer @@ -805,32 +979,19 @@ def __init__( suffix_pad: str = '$' # QgramTokenizer (if padding=True) ) -> None: - super().__init__(budget, metric, tokenizer, similarity_threshold, qgram, tokenizer_return_set, attributes, delim_set, padding, prefix_pad, suffix_pad) + super().__init__(similarity_function, tokenizer, similarity_threshold, qgram, tokenizer_return_unique_values, attributes, delim_set, padding, prefix_pad, suffix_pad) - def _predict_raw_blocks(self, blocks: dict) -> None: + def _predict_raw_blocks(self, blocks: dict) -> List[Tuple[int, int]]: cp : ComparisonPropagation = ComparisonPropagation() cleaned_blocks = cp.process(blocks=blocks, data=self.data, tqdm_disable=True) self._predict_prunned_blocks(cleaned_blocks) - def _predict_prunned_blocks(self, blocks: dict) -> None: + def _predict_prunned_blocks(self, blocks: dict) -> List[Tuple[int, int]]: _all_pairs = [(id1, id2) for id1 in blocks for id2 in blocks[id1]] _total_pairs = len(_all_pairs) random_pairs = sample(_all_pairs, self._budget) if self._budget <= _total_pairs and not self._emit_all_tps_stop else _all_pairs - if(self._emit_all_tps_stop): self.true_pair_checked = self.extract_tps_checked(candidates=random_pairs) self.pairs.add_edges_from(random_pairs) - def extract_tps_checked(self, **kwargs) -> dict: - _tps_checked = dict() - _candidates = kwargs['candidates'] - - for entity, neighbor in _candidates: - entity_id = self.data._gt_to_ids_reversed_1[entity] if entity < self.data.dataset_limit else self.data._gt_to_ids_reversed_2[entity] - neighbor_id = self.data._gt_to_ids_reversed_1[neighbor] if neighbor < self.data.dataset_limit else self.data._gt_to_ids_reversed_2[neighbor] - _d1_entity, _d2_entity = (entity_id, neighbor_id) if entity < self.data.dataset_limit else (neighbor_id, entity_id) - if _d2_entity in self.data.pairs_of[_d1_entity]: - _tps_checked[canonical_swap(_d1_entity, _d2_entity)] = False - return _tps_checked - class PESM(HashBasedProgressiveMatching): """Applies Progressive Entity Scheduling Matching """ @@ -840,189 +1001,369 @@ class PESM(HashBasedProgressiveMatching): "emits the top pair per entity. Finally, traverses the sorted " + \ "entities and emits their comparisons in descending weight order " + \ "within specified budget." 
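# --- Editorial example (not part of the diff) ----------------------------------
# A rough, self-contained sketch of the scheduling idea behind PESM and the
# DatasetScheduler used throughout this module: neighborhoods are ranked by
# average weight, the single best candidate of every entity is emitted first,
# and the remaining comparisons follow until the budget is exhausted. The input
# format (a plain dict of per-entity candidate weights) and the function name
# are hypothetical; the actual behaviour lives in ProgressiveEntityScheduling
# and DatasetScheduler and may differ in detail.
def schedule_emissions(neighborhoods: dict, budget: int) -> list:
    """Return up to `budget` (weight, entity, candidate) triples."""
    # entities in descending order of their neighborhood's average weight
    ranked = sorted(neighborhoods,
                    key=lambda e: sum(neighborhoods[e].values()) / len(neighborhoods[e]),
                    reverse=True)
    emissions = []
    # phase 1: the single best candidate of every entity, in entity rank order
    for entity in ranked:
        best = max(neighborhoods[entity], key=neighborhoods[entity].get)
        emissions.append((neighborhoods[entity][best], entity, best))
    # phase 2: revisit entities in the same order and emit their remaining
    # comparisons in descending weight order
    for entity in ranked:
        rest = sorted(((w, c) for c, w in neighborhoods[entity].items()), reverse=True)[1:]
        emissions.extend((w, entity, c) for w, c in rest)
    return emissions[:budget]

# toy usage: two entities with weighted candidates and a budget of 3 emissions
print(schedule_emissions({0: {5: 0.9, 6: 0.1}, 1: {5: 0.7, 7: 0.6}}, budget=3))
# -> [(0.7, 1, 5), (0.9, 0, 5), (0.6, 1, 7)]
# --------------------------------------------------------------------------------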
+ def __init__( self, - budget: int = 0, - w_scheme: str = 'X2', - metric: str = 'dice', + weighting_scheme: str = 'CBS', + similarity_function: str = 'dice', tokenizer: str = 'white_space_tokenizer', - similarity_threshold: float = 0.5, - qgram: int = 2, # for jaccard - tokenizer_return_set = True, # unique values or not + vectorizer : str = None, + qgram : int = 1, + similarity_threshold: float = 0.0, + tokenizer_return_unique_values = True, # unique values or not attributes: any = None, - delim_set: list = None, # DelimiterTokenizer - padding: bool = True, # QgramTokenizer - prefix_pad: str = '#', # QgramTokenizer (if padding=True) - suffix_pad: str = '$' # QgramTokenizer (if padding=True) ) -> None: - - super().__init__(budget, w_scheme, metric, tokenizer, similarity_threshold, qgram, tokenizer_return_set, attributes, delim_set, padding, prefix_pad, suffix_pad) + super().__init__(weighting_scheme=weighting_scheme, + similarity_function=similarity_function, + tokenizer=tokenizer, + vectorizer=vectorizer, + qgram=qgram, + similarity_threshold=similarity_threshold, + tokenizer_return_unique_values=tokenizer_return_unique_values, + attributes=attributes) - def _predict_raw_blocks(self, blocks: dict) -> None: + def _predict_raw_blocks(self, blocks: dict) -> List[Tuple[int, int]]: - pes : ProgressiveEntityScheduling = ProgressiveEntityScheduling(self._w_scheme, self._budget) - pes.process(blocks=blocks, data=self.data, tqdm_disable=True, cc=None, method=self._method, emit_all_tps_stop=self._emit_all_tps_stop) - self.pairs = pes.produce_pairs() - if(self._emit_all_tps_stop): self.true_pair_checked = self.extract_tps_checked(candidates=self.pairs) - - def _predict_prunned_blocks(self, blocks: dict): + pes : ProgressiveEntityScheduling = ProgressiveEntityScheduling(self._weighting_scheme, self._budget) + self.pairs = pes.process(blocks=blocks, data=self.data, tqdm_disable=True, cc=None, method=self._algorithm, emit_all_tps_stop=self._emit_all_tps_stop) + return self.pairs + + def _predict_prunned_blocks(self, blocks: dict) -> List[Tuple[int, int]]: return self._predict_raw_blocks(blocks) # raise NotImplementedError("Sorter Neighborhood Algorithms doesn't support prunned blocks (lack of precalculated weights)") + +# class WhooshPM(BlockIndependentPM): +# """Applies progressive index based matching using whoosh library +# """ + +# _method_name: str = "Whoosh Progressive Matching" +# _method_info: str = "Applies Whoosh Progressive Matching - Indexes the entities of the second dataset, " + \ +# "stores their specified attributes, " + \ +# "defines a query for each entity of the first dataset, " + \ +# "and retrieves its pair candidates from the indexer within specified budget" + +# def __init__( +# self, +# similarity_function: str = 'WH-TF-IDF', +# number_of_nearest_neighbors: int = 10, +# tokenizer: str = 'white_space_tokenizer', +# similarity_threshold: float = 0.0, +# qgram: int = 2, # for jaccard +# tokenizer_return_unique_values = True, # unique values or not +# attributes: any = None, +# delim_set: list = None, # DelimiterTokenizer +# padding: bool = True, # QgramTokenizer +# prefix_pad: str = '#', # QgramTokenizer (if padding=True) +# suffix_pad: str = '$' # QgramTokenizer (if padding=True) +# ) -> None: +# # budget set to float('inf') implies unlimited budget +# super().__init__(similarity_function, tokenizer, similarity_threshold, qgram, tokenizer_return_unique_values, attributes, delim_set, padding, prefix_pad, suffix_pad) +# self._number_of_nearest_neighbors : int = 
number_of_nearest_neighbors + +# def _set_whoosh_datasets(self) -> None: +# """Saves the rows of both datasets corresponding to the indices of the entities that have been retained after comparison cleaning +# """ - def extract_tps_checked(self, **kwargs) -> dict: - _tps_checked = dict() - _candidates = kwargs['candidates'] +# self._whoosh_d1 = self.data.dataset_1[self.attributes + [self.data.id_column_name_1]] if self.attributes else self.data.dataset_1 +# self._whoosh_d1 = self._whoosh_d1[self._whoosh_d1[self.data.id_column_name_1].isin(self._whoosh_d1_retained_index)] +# if(not self.data.is_dirty_er): +# self._whoosh_d2 = self.data.dataset_2[self.attributes + [self.data.id_column_name_2]] if self.attributes else self.data.dataset_2 +# self._whoosh_d2 = self._whoosh_d2[self._whoosh_d2[self.data.id_column_name_2].isin(self._whoosh_d2_retained_index)] - for entity, neighbor in _candidates: - entity_id = self.data._gt_to_ids_reversed_1[entity] if entity < self.data.dataset_limit else self.data._gt_to_ids_reversed_2[entity] - neighbor_id = self.data._gt_to_ids_reversed_1[neighbor] if neighbor < self.data.dataset_limit else self.data._gt_to_ids_reversed_2[neighbor] - _d1_entity, _d2_entity = (entity_id, neighbor_id) if entity < self.data.dataset_limit else (neighbor_id, entity_id) - if _d2_entity in self.data.pairs_of[_d1_entity]: - _tps_checked[canonical_swap(_d1_entity, _d2_entity)] = False - return _tps_checked + +# def _set_retained_entries(self) -> None: +# """Saves the indices of entities of both datasets that have been retained after comparison cleaning +# """ +# self._whoosh_d1_retained_index = pd.Index([self.data._gt_to_ids_reversed_1[id] +# for id in self._si.d1_retained_ids]) + +# if(not self.data.is_dirty_er): +# self._whoosh_d2_retained_index = pd.Index([self.data._gt_to_ids_reversed_2[id] +# for id in self._si.d2_retained_ids]) -class WhooshPM(ProgressiveMatching): - """Applies progressive index based matching using whoosh library - """ +# def _initialize_index_path(self): +# """Creates index directory if non-existent, constructs the absolute path to the current whoosh index +# """ +# global INDEXER_DIR +# INDEXER_DIR = os.path.abspath(INDEXER_DIR) +# _d1_name = self.data.dataset_name_1 if self.data.dataset_name_1 is not None else 'd3' +# self._index_path = os.path.join(INDEXER_DIR, _d1_name if self.data.is_dirty_er else (_d1_name + (self.data.dataset_name_2 if self.data.dataset_name_2 is not None else 'd4'))) +# if not os.path.exists(self._index_path): +# print('Created index directory at: ' + self._index_path) +# os.makedirs(self._index_path, exist_ok=True) + + +# def _create_index(self): +# """Defines the schema [ID, CONTENT], creates the index in the defined path +# and populates it with all the entities of the target dataset (first - Dirty ER, second - Clean ER) +# """ +# self._schema = Schema(ID=ID(stored=True), content=TEXT(stored=True)) +# self._index = create_in(self._index_path, self._schema) +# writer = self._index.writer() + +# _target_dataset = self._whoosh_d1 if self.data.is_dirty_er else self._whoosh_d2 +# _id_column_name = self.data.id_column_name_1 if self.data.is_dirty_er else self.data.id_column_name_2 + +# for _, entity in _target_dataset.iterrows(): +# entity_values = [str(entity[column]) for column in _target_dataset.columns if column != _id_column_name] +# writer.add_document(ID=entity[_id_column_name], content=' '.join(entity_values)) +# writer.commit() + +# def _populate_whoosh_dataset(self) -> None: +# """For each retained entity in the first dataset, 
construct a query with its text content, +# parses it to the indexers, retrieves best candidates and stores them in entity's neighborhood. +# Populates a list with all the retrieved pairs. +# """ +# # None value for budget implies unlimited budget in whoosh +# _query_budget = self._number_of_nearest_neighbors + +# if(self.similarity_function not in whoosh_similarity_function): +# print(f'{self.similarity_function} Similarity Function is Undefined') +# self.similarity_function = 'Frequency' +# print(f'Applying {self.similarity_function} Similarity Function') +# _scorer = whoosh_similarity_function[self.similarity_function] + +# with self._index.searcher(weighting=_scorer) as searcher: +# self._parser = qparser.QueryParser('content', schema=self._index.schema, group=qparser.OrGroup) +# for _, entity in self._whoosh_d1.iterrows(): +# entity_values = [str(entity[column]) for column in self._whoosh_d1.columns if column != self.data.id_column_name_1] +# entity_string = ' '.join(entity_values) +# entity_id = entity[self.data.id_column_name_1] +# entity_query = self._parser.parse(entity_string) +# query_results = searcher.search(entity_query, limit = _query_budget) + +# for neighbor in query_results: +# _score = neighbor.score +# _neighbor_id = neighbor['ID'] +# self.pairs.append((_score, self.data._ids_mapping_1[entity], self.data._ids_mapping_2[_neighbor_id])) + +# def _predict_raw_blocks(self, blocks: dict) -> List[Tuple[int, int]]: +# self._start_time = time() +# self._si = SubsetIndexer(blocks=blocks, data=self.data, subset=False) +# self._set_retained_entries() +# self._set_whoosh_datasets() +# self._initialize_index_path() +# self._create_index() +# self.pairs : List[Tuple[float, int, int]] = [] +# self._budget = float('inf') if self._emit_all_tps_stop else self._budget +# self._populate_whoosh_dataset() +# self.execution_time = time() - self._start_time +# return self.pairs + +# def _predict_prunned_blocks(self, blocks: dict) -> List[Tuple[int, int]]: +# self._predict_raw_blocks(blocks) - _method_name: str = "Whoosh Progressive Matching" - _method_info: str = "Applies Whoosh Progressive Matching - Indexes the entities of the second dataset, " + \ - "stores their specified attributes, " + \ - "defines a query for each entity of the first dataset, " + \ - "and retrieves its pair candidates from the indexer within specified budget" +class TopKJoinPM(ProgressiveMatching): + """Applies index based matching for ES, emits candidate pairs using defined budget/emission technique + """ + _method_name: str = "Top-K Join Progressive Matching" + _method_info: str = "Applies index based matching for ES, emits candidate pairs using defined budget/emission technique" def __init__( self, - budget: int = 0, - metric: str = 'TF-IDF', - tokenizer: str = 'white_space_tokenizer', - similarity_threshold: float = 0.5, - qgram: int = 2, # for jaccard - tokenizer_return_set = True, # unique values or not + similarity_function: str = 'dice', + number_of_nearest_neighbors : int = 10, + tokenizer: str = None, + weighting_scheme : str = None, + qgram : int = 1, + similarity_threshold: float = 0.0, + tokenizer_return_unique_values = True, # unique values or not attributes: any = None, - delim_set: list = None, # DelimiterTokenizer - padding: bool = True, # QgramTokenizer - prefix_pad: str = '#', # QgramTokenizer (if padding=True) - suffix_pad: str = '$' # QgramTokenizer (if padding=True) ) -> None: - # budget set to float('inf') implies unlimited budget - super().__init__(budget, metric, tokenizer, similarity_threshold, 
qgram, tokenizer_return_set, attributes, delim_set, padding, prefix_pad, suffix_pad) - - def _set_whoosh_datasets(self) -> None: - """Saves the rows of both datasets corresponding to the indices of the entities that have been retained after comparison cleaning - """ + + super().__init__(similarity_function=similarity_function, + tokenizer=tokenizer, + vectorizer=weighting_scheme, + qgram=qgram, + similarity_threshold=similarity_threshold, + tokenizer_return_unique_values=tokenizer_return_unique_values, + attributes=attributes) - self._whoosh_d1 = self.data.dataset_1[self.attributes + [self.data.id_column_name_1]] if self.attributes else self.data.dataset_1 - self._whoosh_d1 = self._whoosh_d1[self._whoosh_d1[self.data.id_column_name_1].isin(self._whoosh_d1_retained_index)] - if(not self.data.is_dirty_er): - self._whoosh_d2 = self.data.dataset_2[self.attributes + [self.data.id_column_name_2]] if self.attributes else self.data.dataset_2 - self._whoosh_d2 = self._whoosh_d2[self._whoosh_d2[self.data.id_column_name_2].isin(self._whoosh_d2_retained_index)] + self.similarity_function : str = similarity_function + self.number_of_nearest_neighbors : int = number_of_nearest_neighbors + self.weighting_scheme : str = weighting_scheme + self.qgram : int = qgram - - def _set_retained_entries(self) -> None: - """Saves the indices of entities of both datasets that have been retained after comparison cleaning - """ - self._whoosh_d1_retained_index = pd.Index([self.data._gt_to_ids_reversed_1[id] - for id in self._si.d1_retained_ids]) + def _predict_raw_blocks(self, blocks: dict, load_neighborhoods : bool = True) -> List[Tuple[int, int]]: - if(not self.data.is_dirty_er): - self._whoosh_d2_retained_index = pd.Index([self.data._gt_to_ids_reversed_2[id] - for id in self._si.d2_retained_ids]) - - - def _initialize_index_path(self): - """Creates index directory if non-existent, constructs the absolute path to the current whoosh index - """ - global INDEXER_DIR - INDEXER_DIR = os.path.abspath(INDEXER_DIR) - _d1_name = self.data.dataset_name_1 if self.data.dataset_name_1 is not None else 'd3' - self._index_path = os.path.join(INDEXER_DIR, _d1_name if self.data.is_dirty_er else (_d1_name + (self.data.dataset_name_2 if self.data.dataset_name_2 is not None else 'd4'))) - if not os.path.exists(self._index_path): - print('Created index directory at: ' + self._index_path) - os.makedirs(self._index_path, exist_ok=True) + _store_neighborhoods : bool = load_neighborhoods + _loaded_neighborhoods : dict[List[Tuple[float, int]]] - - def _create_index(self): - """Defines the schema [ID, CONTENT], creates the index in the defined path - and populates it with all the entities of the target dataset (first - Dirty ER, second - Clean ER) - """ - self._schema = Schema(ID=ID(stored=True), content=TEXT(stored=True)) - self._index = create_in(self._index_path, self._schema) - writer = self._index.writer() + if(load_neighborhoods): + print("Neighborhood Retrieval Enabled...") + _loaded_neighborhoods = self.retrieve_neighborhoods_from_disk() + else: + print("Neighborhood Retrieval Disabled...") + _loaded_neighborhoods = None - _target_dataset = self._whoosh_d1 if self.data.is_dirty_er else self._whoosh_d2 - _id_column_name = self.data.id_column_name_1 if self.data.is_dirty_er else self.data.id_column_name_2 + if(_loaded_neighborhoods is None): + ptkj : PETopKJoin = PETopKJoin(K=self.number_of_nearest_neighbors, + metric=self.similarity_function, + tokenization=self.tokenizer, + qgrams=self.qgram) + + _pet_vectorizer = 
self.initialize_vectorizer() if (self.weighting_scheme is not None) else None + self.pairs = ptkj.fit(data=self.data, + reverse_order=True, + attributes_1=self.data.attributes_1, + attributes_2=self.data.attributes_2, + vectorizer=_pet_vectorizer, + store_neighborhoods=_store_neighborhoods) + + if(_store_neighborhoods): + self.pairs = self.neighborhoods_to_pairs(neighborhoods=ptkj.neighborhoods, strict_top_k=True) + self.neighborhoods_to_json(neighborhoods=ptkj.neighborhoods) + else: + self.pairs = [(edge[2]['weight'], edge[0], edge[1]) for edge in self.pairs.edges(data=True)] + else: + self.pairs = self.neighborhoods_to_pairs(neighborhoods=_loaded_neighborhoods, strict_top_k=True) + + return self.pairs + + def _predict_prunned_blocks(self, blocks: dict) -> List[Tuple[int, int]]: + raise NotImplementedError("Progressive TopKJoin PM for prunned blocks - Not implemented yet!") - for _, entity in _target_dataset.iterrows(): - entity_values = [str(entity[column]) for column in _target_dataset.columns if column != _id_column_name] - writer.add_document(ID=entity[_id_column_name], content=' '.join(entity_values)) - writer.commit() - def _populate_whoosh_dataset(self) -> None: - """For each retained entity in the first dataset, construct a query with its text content, - parses it to the indexers, retrieves best candidates and stores them in entity's neighborhood. - Finally, neighborhoods are sorted in descending order of their average weight + def neighborhoods_to_pairs(self, neighborhoods : dict[List[Tuple[float, int]]], strict_top_k : bool = False) -> List[Tuple[float, int, int]]: + previous_weight = None + _pairs : List[Tuple[float, int, int]] = [] + for d1_id, d2_ids in neighborhoods.items(): + distinct_weights = 0 + _d1_id = int(d1_id) + for current_weight, d2_id in d2_ids: + if(strict_top_k or current_weight != previous_weight): + previous_weight = current_weight + distinct_weights += 1 + if distinct_weights <= self.number_of_nearest_neighbors: + _pairs.append((current_weight, d2_id, _d1_id)) + else: + break + return _pairs + + def neighborhoods_to_json(self, neighborhoods : dict[List[Tuple[float, int]]]) -> None: + """Stores the neighborhood in the corresponding experiment's neighborhoods json file within the hidden .ngbs directory + Args: + neighborhoods (dict[List[Tuple[float, int]]]): Neighborhoods of indexed entities of current experiment, dictionary in the form + [indexed entity id] -> [sorted target dataset neighbors in descending similarity order] """ - # None value for budget implies unlimited budget in whoosh - _query_budget = None if is_infinite(self._budget) else max(1, 2 * self._budget / len(self._whoosh_d1)) - if(self.metric not in whoosh_similarity_function): - print(f'{self.metric} Similarity Function is Undefined') - self.metric = 'Frequency' - print(f'Applying {self.metric} Similarity Function') - _scorer = whoosh_similarity_function[self.metric] + _json_file_name = '_'.join(self._requested_file_components) - with self._index.searcher(weighting=_scorer) as searcher: - self._parser = qparser.QueryParser('content', schema=self._index.schema, group=qparser.OrGroup) - for _, entity in self._whoosh_d1.iterrows(): - entity_values = [str(entity[column]) for column in self._whoosh_d1.columns if column != self.data.id_column_name_1] - entity_string = ' '.join(entity_values) - entity_id = entity[self.data.id_column_name_1] - entity_query = self._parser.parse(entity_string) - query_results = searcher.search(entity_query, limit = _query_budget) - - for neighbor in query_results: - 
_score = neighbor.score
-                    _neighbor_id = neighbor['ID']
-                    self._sorted_dataset._insert_entity_neighbor(entity=entity_id, neighbor=_neighbor_id, weight=_score)
+        neighborhoods_directory_path = os.path.join(os.getcwd(), ".ngbs")
+        os.makedirs(neighborhoods_directory_path, exist_ok=True)
-        self._sorted_dataset._sort_neighborhoods_by_avg_weight()
+        _json_store_path = os.path.join(neighborhoods_directory_path, _json_file_name)
+        print(f"Storing Neighborhood Json in -> {_json_store_path}")
+        with open(_json_store_path, 'w') as json_file:
+            json.dump(neighborhoods, json_file, indent=4)
+
+    def matching_file_components(self,
+                                 source_components : List[str],
+                                 target_components : List[str],
+                                 variable_component_index : int = 6) -> bool:
+        """Takes as input lists containing the components of the source and target file names (strings connected by underscores).
+           Checks whether those components match (files are equivalent). The variable component (number of nearest neighbors) must be less than or equal
+           to the target component.
+        Args:
+            source_components (List[str]): Components (substrings separated by underscore) that constitute source file name
+            target_components (List[str]): Components (substrings separated by underscore) that constitute target file name
+            variable_component_index (int, optional): Index in file name's components list where the variable component is placed (number of nearest neighbors)
+        Returns:
+            bool: Source and target file name components are equivalent (target file can be loaded for source file request)
+        """
+        number_pattern = r"[-+]?\d*\.\d+|\d+"
+        zipped_components = list(zip(source_components, target_components))
+        matching_components = True
-    def _emit_pairs(self) -> None:
-        """Returns a list of candidate pairs that have been emitted following the requested method"""
-        self.pairs = self._sorted_dataset._emit_pairs(method=self._method, data=self.data)
-
-    def _predict_raw_blocks(self, blocks: dict) -> None:
-        self._start_time = time()
-        self._si = SubsetIndexer(blocks=blocks, data=self.data, subset=False)
-        self._set_retained_entries()
-        self._set_whoosh_datasets()
-        self._initialize_index_path()
-        self._create_index()
-        self._to_emit_pairs : List[Tuple[int, int]] = []
-        self._budget = float('inf') if self._emit_all_tps_stop else self._budget
-        self._sorted_dataset = WhooshDataset(list(self._whoosh_d1_retained_index), self._budget)
-        self._populate_whoosh_dataset()
-        self._emit_pairs()
-        self.execution_time = time() - self._start_time
-        if(self._emit_all_tps_stop): self.true_pair_checked = self.extract_tps_checked(candidates=self.pairs)
+        for index, components in enumerate(zipped_components):
+            source_component, target_component = components
+
+            if(index == variable_component_index):
+                source_nns = int((re.findall(number_pattern, source_component))[0])
+                target_nns = int((re.findall(number_pattern, target_component))[0])
+                if(source_nns > target_nns):
+                    matching_components = False
+                    break
+            else:
+                if(source_component != target_component):
+                    matching_components = False
+                    break
+        return matching_components
+
+    def retrieve_neighborhoods_from_disk(self) -> dict[List[Tuple[float, int]]]:
+        """Attempts to retrieve precalculated neighborhoods for the indexed entities of the current experiment
+        Returns:
+            dict[List[Tuple[float, int]]]: Dictionary of neighborhoods for each indexed entity containing a sorted list of neighbors in descending similarity order
+        """
+        self._requested_file_components = [self._indexing,
+                                           self.dataset_identifier,
+                                           self.weighting_scheme,
+                                           
self.tokenizer.split('_')[0], + self.similarity_function, + "q" + str(self.qgram), + "n" + str(self.number_of_nearest_neighbors) + ".json"] - def _predict_prunned_blocks(self, blocks: dict) -> None: - self._predict_raw_blocks(blocks) + _neighbors_count_index : int = len(self._requested_file_components) - 1 + neighborhoods_directory_path : str = os.path.join(os.getcwd(), ".ngbs") + _matching_neighborhood_file_name : str = None + _matching_neighborhood : dict[List[Tuple[float, int]]] = None - def extract_tps_checked(self, **kwargs) -> dict: - _tps_checked = dict() - _candidates = kwargs['candidates'] + os.makedirs(neighborhoods_directory_path, exist_ok=True) + print(f"Searching for matching neighborhood file in -> {neighborhoods_directory_path}") - for entity, neighbor in _candidates: - entity_id = self.data._gt_to_ids_reversed_1[entity] if entity < self.data.dataset_limit else self.data._gt_to_ids_reversed_2[entity] - neighbor_id = self.data._gt_to_ids_reversed_1[neighbor] if neighbor < self.data.dataset_limit else self.data._gt_to_ids_reversed_2[neighbor] - _d1_entity, _d2_entity = (entity_id, neighbor_id) if entity < self.data.dataset_limit else (neighbor_id, entity_id) - if _d2_entity in self.data.pairs_of[_d1_entity]: - _tps_checked[canonical_swap(_d1_entity, _d2_entity)] = False - return _tps_checked + if os.path.isdir(neighborhoods_directory_path): + neighborhoods_file_names = os.listdir(neighborhoods_directory_path) + + for neighborhood_file_name in neighborhoods_file_names: + _neighborhood_file_components = neighborhood_file_name.split('_') + if(self.matching_file_components(source_components=self._requested_file_components, + target_components=_neighborhood_file_components, + variable_component_index=_neighbors_count_index)): + _matching_neighborhood_file_name = neighborhood_file_name + break + if(_matching_neighborhood_file_name is not None): + _matching_neighborhood_file_path = os.path.join(neighborhoods_directory_path, _matching_neighborhood_file_name) + if(os.path.exists(_matching_neighborhood_file_path) and os.path.isfile(_matching_neighborhood_file_path)): + with open(_matching_neighborhood_file_path, 'r') as neighborhood_file: + _matching_neighborhood = json.load(neighborhood_file) + print(f"Retrieved matching neighborhood from -> {_matching_neighborhood_file_path}!") + else: + print(f"Matching Neighborhood File not found - Executing Joins Algorithm!") + + return _matching_neighborhood + + def initialize_vectorizer(self) -> FrequencyEvaluator: + self.vectorizer : FrequencyEvaluator = FrequencyEvaluator(vectorizer=self.weighting_scheme, + tokenizer=self.tokenizer, + qgram=self.qgram) + d1 = self.data.dataset_1[self.data.attributes_1] if self.data.attributes_1 is not None else self.data.dataset_1 + self._entities_d1 = d1 \ + .apply(" ".join, axis=1) \ + .apply(lambda x: x.lower()) \ + .values.tolist() + d2 = self.data.dataset_2[self.data.attributes_2] if self.data.attributes_2 is not None else self.data.dataset_2 + self._entities_d2 = d2 \ + .apply(" ".join, axis=1) \ + .apply(lambda x: x.lower()) \ + .values.tolist() if not self.data.is_dirty_er else None + self.vectorizer.fit(metric=self.similarity_function, + dataset_identifier=self.dataset_identifier, + indexing=self._indexing, + d1_entities=self._entities_d1, + d2_entities=self._entities_d2) + return self.vectorizer + +class_references = { + 'GlobalTopPM' : GlobalTopPM, + 'LocalTopPM' : LocalTopPM, + 'GlobalPSNM' : GlobalPSNM, + 'LocalPSNM' : LocalPSNM, + 'PESM' : PESM, + 'EmbeddingsNNBPM' : EmbeddingsNNBPM, + 
'TopKJoinPM' : TopKJoinPM +} diff --git a/docs/pyjedai/schema_matching.py b/docs/pyjedai/schema_matching.py new file mode 100644 index 0000000..9459821 --- /dev/null +++ b/docs/pyjedai/schema_matching.py @@ -0,0 +1,148 @@ +"""Schema Matching methods +""" +import pandas as pd +import valentine +from valentine.algorithms.base_matcher import BaseMatcher +from valentine.algorithms.coma.coma import Coma +from valentine.algorithms.cupid.cupid_model import Cupid +from valentine.algorithms.distribution_based.distribution_based import DistributionBased +from valentine.algorithms.jaccard_levenshtein.jaccard_leven import JaccardLevenMatcher +from valentine.algorithms.similarity_flooding.similarity_flooding import SimilarityFlooding +import valentine.metrics as valentine_metrics +from pandas import DataFrame, concat + +from .datamodel import Block, Data, PYJEDAIFeature +from .evaluation import Evaluation +from abc import abstractmethod + +class AbstractSchemaMatching(PYJEDAIFeature): + """Abstract class for schema matching methods + """ + + @abstractmethod + def evaluate(self, + prediction=None, + export_to_df: bool = False, + export_to_dict: bool = False, + with_classification_report: bool = False, + verbose: bool = True) -> any: + pass + + @abstractmethod + def _configuration(self) -> dict: + pass + + @abstractmethod + def stats(self) -> None: + pass + + @abstractmethod + def process(self, + data: Data, + ) -> list: + pass + + @abstractmethod + def process_sm_weighted(self, + data: Data): + pass + + def __init__(self): + super().__init__() + + +class ValentineMethodBuilder(PYJEDAIFeature): + """Class to provide valentine matching methods + """ + + def evaluate(self, + prediction=None, + export_to_df: bool = False, + export_to_dict: bool = False, + with_classification_report: bool = False, + verbose: bool = True) -> any: + pass + + def _configuration(self) -> dict: + pass + + def __init__(self): + super().__init__() + + @staticmethod + def coma_matcher(max_n: int = 0, + strategy: str = "COMA_OPT" + ) -> Coma: + return Coma(max_n, strategy) + + @staticmethod + def cupid_matcher(w_struct: float = 0.2, + leaf_w_struct: float = 0.2, + th_accept: float = 0.7 + ) -> Cupid: + return Cupid(w_struct, leaf_w_struct, th_accept) + + @staticmethod + def distribution_based_matcher(threshold1: float = 0.15, + threshold2: float = 0.15 + ) -> DistributionBased: + return DistributionBased(threshold1, threshold2) + + @staticmethod + def jaccard_leven_matcher(threshold_leven: float = 0.8) -> JaccardLevenMatcher: + return JaccardLevenMatcher(threshold_leven) + + @staticmethod + def similarity_flooding_mathcer(coeff_policy: str = "inverse_average", + formula: str = "formula_c") -> SimilarityFlooding: + return SimilarityFlooding(coeff_policy, formula) + +class ValentineSchemaMatching(AbstractSchemaMatching): + """Class for schema matching methods provided by Valentine + """ + + def __init__(self, matcher: BaseMatcher): + super().__init__() + self.data: Data = None + self.matcher: BaseMatcher = matcher + self.matches = None + self.top_columns: list = [] + + def process(self, + data: Data, + ) -> list: + self.data = data + df1 = self.data.dataset_1 + df2 = self.data.dataset_2 + self.matches = valentine.valentine_match(df1, df2, self.matcher) + self.top_columns = [[x[0][1] for x in self.matches.keys()], [x[1][1] for x in self.matches.keys()]] + return self.top_columns + + def process_sm_weighted(self, data: Data): + pass + + def print_matches(self): + print(self.matches) + + def evaluate(self, + prediction=None, + 
export_to_df: bool = False, + export_to_dict: bool = False, + with_classification_report: bool = False, + verbose: bool = True) -> any: + + if self.data is None: + raise AttributeError("Can not proceed to evaluation without data object.") + + if self.data.ground_truth is None: + raise AttributeError("Can not proceed to evaluation without a ground-truth file. " + + "Data object has not been initialized with the ground-truth file") + + return valentine_metrics.all_metrics(self.matches, self.data.ground_truth.to_records(index=False).tolist()) + + def _configuration(self) -> dict: + pass + + def stats(self) -> None: + pass + diff --git a/docs/pyjedai/utils.py b/docs/pyjedai/utils.py index 4c89912..6e69d81 100644 --- a/docs/pyjedai/utils.py +++ b/docs/pyjedai/utils.py @@ -1,6 +1,7 @@ from abc import ABC, abstractmethod from collections import defaultdict - +from sklearn.metrics.pairwise import pairwise_distances +from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer import numpy as np import re from nltk import ngrams @@ -13,8 +14,14 @@ import sys from time import time from networkx import Graph +import inspect from ordered_set import OrderedSet +import uuid +import os +import json +import copy from math import floor +import pandas as pd # ----------------------- # # Constants # ----------------------- # @@ -317,7 +324,7 @@ def __init__(self, num_of_entities: int, sorted_entities: List[int]) -> None: def get_positions(self, entity: int): return self._entity_positions[entity] -class WhooshNeighborhood(ABC): +class EntityScheduler(ABC): """Stores information about the neighborhood of a given entity ID: - ID : The identifier of the entity as it is defined within the original dataframe - Total Weight : The total weight of entity's neighbors @@ -329,50 +336,30 @@ class WhooshNeighborhood(ABC): ABC (ABC): ABC Module """ - def __init__(self, id : int, budget : float) -> None: + def __init__(self, id : int) -> None: self._id : int = id - self._budget : float = budget - self._neighbors : PriorityQueue = PriorityQueue(self._budget) if not is_infinite(self._budget) else PriorityQueue() - self._insert_stage : bool = True - self._minimum_weight : float = sys.float_info.min + self._neighbors : PriorityQueue = PriorityQueue() self._neighbors_num : int = 0 self._total_weight : float = 0.0 self._average_weight : float = None def _insert(self, neighbor_id: int, weight : float) -> None: - if(not self._insert_stage): self._change_state() - - if weight >= self._minimum_weight: - self._neighbors.put((weight, neighbor_id)) - if self._neighbors.qsize() > self._budget: - self._minimum_weight = self._neighbors.get()[0] - + self._neighbors.put((-weight, neighbor_id)) self._update_neighbors_counter_by(1) self._update_total_weight_by(weight) - def _pop(self) -> None: - if(self._insert_stage): self._change_state() - + def _pop(self) -> Tuple[float, int]: if(self._empty()): raise ValueError("No neighbors to pop!") _weight, _neighbor_id = self._neighbors.get() + self._update_neighbors_counter_by(-1) + self._update_total_weight_by(_weight) + return -_weight, _neighbor_id def _empty(self) -> bool: return self._neighbors.empty() - - def _change_state(self) -> None: - "Neighborhood can either be accepting or emitting neighbors" + \ - "Accepting Stage - Neighbors stored in ascending weight order" + \ - "Emitting Stage - Neighbors stored in descending weight order" - _neighbors_resorted : PriorityQueue = PriorityQueue(int(self._budget)) if not is_infinite(self._budget) else PriorityQueue() - while(not 
self._neighbors.empty()): - _weight, _neighbor_id = self._neighbors.get() - _neighbors_resorted.put((-_weight, _neighbor_id)) - - self._neighbors = _neighbors_resorted - self._insert_stage = not self._insert_stage def _update_total_weight_by(self, weight) -> None: self._total_weight = self._total_weight + weight @@ -394,31 +381,31 @@ def _get_average_weight(self) -> float: return self._average_weight def __eq__(self, other): - if isinstance(other, WhooshNeighborhood): + if isinstance(other, EntityScheduler): return self._get_average_weight() == other._get_average_weight() return NotImplemented def __lt__(self, other): - if isinstance(other, WhooshNeighborhood): + if isinstance(other, EntityScheduler): return self._get_average_weight() < other._get_average_weight() return NotImplemented def __gt__(self, other): - if isinstance(other, WhooshNeighborhood): + if isinstance(other, EntityScheduler): return self._get_average_weight() > other._get_average_weight() return NotImplemented def __le__(self, other): - if isinstance(other, WhooshNeighborhood): + if isinstance(other, EntityScheduler): return self._get_average_weight() <= other._get_average_weight() return NotImplemented def __ge__(self, other): - if isinstance(other, WhooshNeighborhood): + if isinstance(other, EntityScheduler): return self._get_average_weight() >= other._get_average_weight() return NotImplemented -class WhooshDataset(ABC): +class DatasetScheduler(ABC): """Stores a dictionary [Entity -> Entity's Neighborhood Data (Whoosh Neighborhood)] Supplies auxiliarry functions for information retrieval from the sorted dataset @@ -426,26 +413,36 @@ class WhooshDataset(ABC): ABC (ABC): ABC Module """ - def __init__(self, entity_ids : List[int], budget : float) -> None: + def __init__(self, budget : float = float('inf'), entity_ids : List[int] = [], global_top : bool = False) -> None: self._budget : float = budget self._total_entities : int = len(entity_ids) - self._entity_budget : float = budget if is_infinite(self._budget) else max(1, 2 * self._budget / self._total_entities) self._neighborhoods : dict = {} + # global emission case + self._global_top : bool = global_top + self._all_candidates = PriorityQueue() if self._global_top else None for entity_id in entity_ids: - self._neighborhoods[entity_id] = WhooshNeighborhood(id=entity_id, budget=self._entity_budget) + self._neighborhoods[entity_id] = EntityScheduler(id=entity_id) # used in defining proper emission strategy self._sorted_entities : List[int] = None self._current_neighborhood_index : int = 0 self._current_entity : int = None - self._current_neighborhood : WhooshNeighborhood = None + self._current_neighborhood : EntityScheduler = None def _insert_entity_neighbor(self, entity : int, neighbor : int, weight : float) -> None: - self._neighborhoods[entity]._insert(neighbor, weight) + if(not self._global_top): + if(entity not in self._neighborhoods): + _new_neighborhood : EntityScheduler = EntityScheduler(entity) + _new_neighborhood._insert(neighbor, weight) + self._neighborhoods[entity] = _new_neighborhood + else: + self._neighborhoods[entity]._insert(neighbor, weight) + else: + self._all_candidates.put((-weight, entity, neighbor)) def _pop_entity_neighbor(self, entity : int) -> Tuple[float, int]: return self._neighborhoods[entity]._pop() - def _get_entity_neighborhood(self, entity : int) -> WhooshNeighborhood: + def _get_entity_neighborhood(self, entity : int) -> EntityScheduler: return self._neighborhoods[entity] def _entity_has_neighbors(self, entity : int) -> bool: @@ -455,7 
+452,7 @@ def _sort_neighborhoods_by_avg_weight(self) -> None: """Store a list of entity ids sorted in descending order of the average weight of their corresponding neighborhood""" self._sorted_entities : List = sorted(self._neighborhoods, key=lambda entity: self._neighborhoods[entity]._get_average_weight(), reverse=True) - def _get_current_neighborhood(self) -> WhooshNeighborhood: + def _get_current_neighborhood(self) -> EntityScheduler: return self._neighborhoods[self._current_entity] def _enter_next_neighborhood(self) -> None: @@ -466,67 +463,126 @@ def _enter_next_neighborhood(self) -> None: self._current_entity = self._sorted_entities[self._current_neighborhood_index] self._current_neighborhood = self._neighborhoods[self._current_entity] - def _successful_emission(self, pair : Tuple[int, int]) -> bool: - - _entity, _neighbor = pair - _entity_id = self._data._ids_mapping_1[_entity] - _neighbor_id = self._data._ids_mapping_1[_neighbor] if self._data.is_dirty_er else self._data._ids_mapping_2[_neighbor] + def _successful_emission(self, pair : Tuple[float, int, int]) -> bool: + _score, _entity, _neighbor = pair if(self._emitted_comparisons < self._budget): - self._emitted_pairs.append((_entity_id, _neighbor_id)) + self._emitted_pairs.append((_score, _entity, _neighbor)) self._emitted_comparisons += 1 return True else: return False - def _emit_pairs(self, method : str, data : Data) -> List[Tuple[int, int]]: + def _print_info(self): + _n_ids : int + if(self._sorted_entities is None): + print("Neighborhood Status - Not Sorted by average weight") + _n_ids = self._neighborhoods.keys() + else: + print("Neighborhood Status - Sorted by average weight") + _n_ids = self._sorted_entities + for _n_id in _n_ids: + _current_neighborhood = self._neighborhoods[_n_id] + print("#############################") + print(f"Neighborhood[{_n_id}]") + print(f"Total Neighbords[{_current_neighborhood._get_neighbors_num()}]") + print(f"Average Weight[{_current_neighborhood._get_average_weight()}]") + + def _checked_pair(self, entity : int, candidate : int) -> bool: + """Checks if the given pair has been checked previously in the scheduling process. + In the case the given pair has been constructed in the reverse indexing context, + proper translation to inorder indexing identification is done for correct checking. + Finally, if the pair has not been checked in the past, it is added to the checked pool. + Args: + entity (int): Entity ID + candidate (int): Candidate ID + + Returns: + bool: Given pair has already been checked in the scheduling process + """ + _d1_inorder_entity, _d2_inorder_entity = self._get_inorder_representation(entity, candidate) + + if((_d1_inorder_entity, _d2_inorder_entity) not in self._checked_entities): + self._checked_entities.add((_d1_inorder_entity, _d2_inorder_entity)) + return False + else: + return True + + def _get_inorder_representation(self, entity : int, candidate : int) -> Tuple[int, int]: + """Takes as input the ID of the entity of the first and second dataset in its schedule indexing context (in that order!). 
+ Returns the ids of given entities in the inorder context, + in the following order (id of the entity of the first dataset in the inorder context, -//- second -//-) + Args: + entity (int): Entity ID + candidate (int): Candidate ID + + Returns: + Tuple[int, int]: (id of entity of first dataframe, id of entity of second dataframe) in inorder context + """ + if(entity < self._data.num_of_entities): return entity, candidate + + # reverse context case + # - number of entities (to transfer the IDs from Scheduler -> Workflow ID representation) + # + / - dataset limit in order to express (D1 in reverse context == D2 in inorder context, and the reverse) + entity = entity - self._data.num_of_entities + self._data.num_of_entities_2 + candidate = candidate - self._data.num_of_entities - self._data.num_of_entities_1 + + return candidate, entity + + + def _emit_pairs(self, method : str, data : Data) -> List[Tuple[float, int, int]]: """Emits candidate pairs according to specified method Args: method (str): Emission Method - data (Data): Dataset Module + data (Data): Dataset Module of the Returns: List[Tuple[int, int]]: List of candidate pairs """ self._method : str = method - self._data : Data = data - self._emitted_pairs = [] self._emitted_comparisons = 0 + self._checked_entities = set() + self._data : Data = data + + if(self._method == 'TOP'): + while(not self._all_candidates.empty()): + score, sorted_entity, neighbor = self._all_candidates.get() + if(not self._checked_pair(sorted_entity, neighbor)): + if(not self._successful_emission(pair=(-score, sorted_entity, neighbor))): + return self._emitted_pairs + + return self._emitted_pairs + if(self._method == 'HB'): for sorted_entity in self._sorted_entities: if(self._entity_has_neighbors(sorted_entity)): - _, neighbor = self._pop_entity_neighbor(sorted_entity) - if(not self._successful_emission(pair=(sorted_entity, neighbor))): - return self._emitted_pairs + score, neighbor = self._pop_entity_neighbor(sorted_entity) + if(not self._checked_pair(sorted_entity, neighbor)): + if(not self._successful_emission(pair=(score, sorted_entity, neighbor))): + return self._emitted_pairs - if(self._method == 'HB' or self._method == 'DFS'): - _checked_entity = np.zeros(self._total_entities, dtype=bool) - _sorted_entity_to_index = dict(zip(self._sorted_entities, range(0, self._total_entities))) - - for index, sorted_entity in enumerate(self._sorted_entities): - _checked_entity[index] = True + if(self._method == 'HB' or self._method == 'DFS'): + for sorted_entity in self._sorted_entities: while(self._entity_has_neighbors(sorted_entity)): - _, neighbor = self._pop_entity_neighbor(sorted_entity) - if(neighbor not in _sorted_entity_to_index or _checked_entity[_sorted_entity_to_index[neighbor]]): - if(not self._successful_emission(pair=(sorted_entity, neighbor))): + score, neighbor = self._pop_entity_neighbor(sorted_entity) + if(not self._checked_pair(sorted_entity, neighbor)): + if(not self._successful_emission(pair=(score, sorted_entity, neighbor))): return self._emitted_pairs else: _emissions_left = True - _checked_entities = set() while(_emissions_left): _emissions_left = False for sorted_entity in self._sorted_entities: if(self._entity_has_neighbors(sorted_entity)): - _, neighbor = self._pop_entity_neighbor(sorted_entity) - if(canonical_swap(sorted_entity, neighbor) not in _checked_entities): - if(not self._successful_emission(pair=(sorted_entity, neighbor))): + score, neighbor = self._pop_entity_neighbor(sorted_entity) + if(not self._checked_pair(sorted_entity, 
neighbor)): + if(not self._successful_emission(pair=(score, sorted_entity, neighbor))): return self._emitted_pairs - _checked_entities.add(canonical_swap(sorted_entity, neighbor)) - _emissions_left = True + _emissions_left = True return self._emitted_pairs class PredictionData(ABC): @@ -536,14 +592,10 @@ class PredictionData(ABC): Args: ABC (ABC): ABC Module """ - def __init__(self, name : str, predictions, tps_checked = dict) -> None: - self.set_name(name) - self.set_tps_checked(tps_checked) - self.set_predictions(self._format_predictions(predictions)) - # Pairs have not been emitted yet - Data Module has not been populated with performance data - self.set_total_emissions(None) - self.set_normalized_auc(None) - self.set_cumulative_recall(None) + def __init__(self, matcher, matcher_info : dict) -> None: + self.set_matcher_info(matcher_info) + self.set_duplicate_emitted(matcher.duplicate_emitted) + self.set_candidate_pairs(self._format_predictions(matcher.pairs)) def _format_predictions(self, predictions) -> List[Tuple[int, int]]: """Transforms given predictions into a list of duplets (candidate pairs) @@ -555,48 +607,65 @@ def _format_predictions(self, predictions) -> List[Tuple[int, int]]: Returns: List[Tuple[int, int]]: Formatted Predictions """ - return [edge[:2] for edge in predictions.edges] if isinstance(predictions, Graph) else predictions + return [edge[:3] for edge in predictions.edges] if isinstance(predictions, Graph) else predictions def get_name(self) -> str: - return self._name + _matcher_info : dict = self.get_matcher_info() + if('name' not in _matcher_info): raise ValueError("Matcher doesn't have a name - Make sure its execution data has been calculated") + return _matcher_info['name'] - def get_predictions(self) -> List[Tuple[int, int]]: - return self._predictions + def get_candidate_pairs(self) -> List[Tuple[float, int, int]]: + if(self._candidate_pairs is None): raise ValueError("Pairs not scheduled yet - Cannot retrieve candidate pairs") + return self._candidate_pairs - def get_tps_checked(self) -> dict: - return self._tps_checked + def get_duplicate_emitted(self) -> dict: + if(self._duplicate_emitted is None): raise ValueError("No information about the status of true positives' emission") + return self._duplicate_emitted def get_total_emissions(self) -> int: - if(self._total_emissions is None): raise ValueError("Pairs not emitted yet - Total Emissions are undefined") - return self._total_emissions + _matcher_info : dict = self.get_matcher_info() + if('total_emissions' not in _matcher_info): raise ValueError("Pairs not emitted yet - Total Emissions are undefined") + return _matcher_info['total_emissions'] def get_normalized_auc(self) -> float: - if(self._normalized_auc is None): raise ValueError("Pairs not emitted yet - Normalized AUC is undefined") - return self._normalized_auc + _matcher_info : dict = self.get_matcher_info() + if('auc' not in _matcher_info): raise ValueError("Pairs not emitted yet - Normalized AUC is undefined") + return _matcher_info['auc'] def get_cumulative_recall(self) -> float: - if(self._cumulative_recall is None): raise ValueError("Pairs not emitted yet - Cumulative Recall is undefined") - return self._cumulative_recall + _matcher_info : dict = self.get_matcher_info() + if('recall' not in _matcher_info): raise ValueError("Pairs not emitted yet - Cumulative Recall is undefined") + return _matcher_info['recall'] + + def get_matcher_info(self) -> dict: + if(self._matcher_info is None): raise ValueError("Pairs not emitted yet - Matcher Info is 
undefined") + return self._matcher_info + + def set_matcher_info(self, matcher_info : dict) -> None: + self._matcher_info : dict = matcher_info def set_name(self, name : str): - self._name : str = name + _matcher_info : dict = self.get_matcher_info() + _matcher_info['name'] = name - def set_predictions(self, predictions : List[Tuple[int, int]]) -> None: - self._predictions : List[Tuple[int, int]] = predictions + def set_candidate_pairs(self, candidate_pairs : List[Tuple[float, int, int]]) -> None: + self._candidate_pairs : List[Tuple[float, int, int]] = candidate_pairs - def set_tps_checked(self, tps_checked : dict) -> None: - self._tps_checked : dict = tps_checked + def set_duplicate_emitted(self, duplicate_emitted : dict) -> None: + self._duplicate_emitted : dict = duplicate_emitted def set_total_emissions(self, total_emissions : int) -> None: - self._total_emissions : int = total_emissions + _matcher_info : dict = self.get_matcher_info() + _matcher_info['total_emissions'] = total_emissions def set_normalized_auc(self, normalized_auc : float) -> None: - self._normalized_auc : float = normalized_auc + _matcher_info : dict = self.get_matcher_info() + _matcher_info['auc'] = normalized_auc def set_cumulative_recall(self, cumulative_recall : float) -> None: - self._cumulative_recall : float = cumulative_recall - - + _matcher_info : dict = self.get_matcher_info() + _matcher_info['recall'] = cumulative_recall + def canonical_swap(id1: int, id2: int) -> Tuple[int, int]: """Returns the identifiers in canonical order @@ -619,14 +688,574 @@ def sorted_enumerate(seq, reverse=True): def is_infinite(value : float): return math.isinf(value) and value > 0 +def reverse_data_indexing(data : Data) -> Data: + """Returns a new data model based upon the given data model with reversed indexing of the datasets + Args: + data (Data): input dat a model + + Returns: + Data : New Data Module with reversed indexing + """ + return Data(dataset_1 = data.dataset_2, + id_column_name_1 = data.id_column_name_2, + attributes_1 = data.attributes_2, + dataset_name_1 = data.dataset_name_2, + dataset_2 = data.dataset_1, + attributes_2 = data.attributes_1, + id_column_name_2 = data.id_column_name_1, + dataset_name_2 = data.dataset_name_1, + ground_truth = data.ground_truth) + +def get_class_function_arguments(class_reference, function_name : str) -> List[str]: + """Returns a list of argument names for requested function of the given class + Args: + class_reference: Reference to a class + function_name (str): Name of the requested function + + Returns: + List[str] : List of requested function's arguments' names + """ + if not inspect.isclass(class_reference): + raise ValueError(f"{class_reference.__name__} class reference is not valid.") + if not hasattr(class_reference, function_name): + raise ValueError(f"The class {class_reference.__name__} does not have a function named {function_name}.") + function_obj = getattr(class_reference, function_name) + if not inspect.isfunction(function_obj): + raise ValueError(f"The provided name {function_name} does not correspond to a function in class '{class_reference.__name__}'.") + function_signature = inspect.signature(function_obj) + argument_names = list(function_signature.parameters.keys())[1:] + + return argument_names + +def new_dictionary_from_keys(dictionary : dict, keys : list) -> dict: + """Returns a subset of the given dictionary including only the given keys. + Unrecognized keys are not included. 
+ Args: + dictionary (dict): Target dictionary + keys (list): Keys to keep + + Returns: + dict : Subset of the given dictionary including only the requested keys + """ + new_dictionary : dict = {key: dictionary[key] for key in keys if key in dictionary} + return new_dictionary + + +def has_duplicate_pairs(pairs : List[Tuple[float, int, int]]): + seen_pairs = set() + for pair in pairs: + entity : int = pair[1] + candidate : int = pair[2] + if (entity, candidate) in seen_pairs: + return True + seen_pairs.add((entity, candidate)) + return False + +def reverse_blocks_entity_indexing(blocks : dict, data : Data) -> dict: + """Returns a new instance of blocks containing the entity IDs of the given blocks translated into the reverse indexing system + Args: + blocks (dict): blocks as defined in the previous indexing + data (Data): Previous data module used to define the reversed ids based on previous dataset limit and dataset sizes + + Returns: + dict : New block instance with identifiers defined in the context of the reverse indexing + """ + if(blocks is None): return None + all_blocks = list(blocks.values()) + if 'Block' in str(type(all_blocks[0])): + return reverse_raw_blocks_entity_indexing(blocks, data) + elif isinstance(all_blocks[0], set): + return reverse_prunned_blocks_entity_indexing(blocks, data) + +def reverse_prunned_blocks_entity_indexing(blocks : dict, data : Data) -> dict: + _reversed_blocks : dict = dict() + _reversed_block : set + + for entity in blocks: + _updated_entity : int = get_reverse_indexing_id(entity, data) + _reversed_block = set() + block : set = blocks[entity] + for candidate in block: + _reversed_block.add(get_reverse_indexing_id(candidate, data)) + _reversed_blocks[_updated_entity] = _reversed_block + + return _reversed_blocks + +def reverse_raw_blocks_entity_indexing(blocks : dict, data : Data) -> dict: + _reversed_blocks : dict = dict() + _reversed_block : Block + + for token in blocks: + _current_block : Block = blocks[token] + _updated_D1_entities = OrderedSet() + _updated_D2_entities = OrderedSet() + + for d1_entity in _current_block.entities_D1: + _updated_D2_entities.add(get_reverse_indexing_id(d1_entity, data)) - + for d2_entity in _current_block.entities_D2: + _updated_D1_entities.add(get_reverse_indexing_id(d2_entity, data)) + + _reversed_block = Block() + _reversed_block.entities_D1 = _updated_D1_entities + _reversed_block.entities_D2 = _updated_D2_entities + _reversed_blocks[token] = _reversed_block + + return _reversed_blocks + +def get_reverse_indexing_id(id : int, data : Data) -> int: + return (id + data.num_of_entities_2) if (id < data.num_of_entities_1) else (id - data.num_of_entities_1) + + +# Progressive Workflow Grid Search Utility Functions + +def values_given(configuration: dict, parameter: str) -> bool: + """Values for requested parameters have been supplied by the user in the configuration file + + Args: + configuration (dict): Configuration File + parameter (str): Requested parameter name + + Returns: + bool: Values for requested parameter supplied + """ + return (parameter in configuration) and (isinstance(configuration[parameter], list)) and (len(configuration[parameter]) > 0) + +def get_multiples(num : int, n : int) -> list: + """Returns a list of multiples of the requested number up to n * number + + Args: + num (int): Number + n (int): Multiplier + + Returns: + list: Multiplies of num up to n * num + """ + multiples = [] + for i in range(1, n+1): + multiples.append(num * i) + return multiples + +def necessary_dfs_supplied(configuration 
: dict) -> bool:
+    """Configuration file contains values for source, target and ground truth dataframes
+
+    Args:
+        configuration (dict): Configuration file
+
+    Raises:
+        ValueError: Zero values supplied for one or more paths
+
+    Returns:
+        bool: The source, target and ground-truth path lists have the same length
+    """
+    for path in ['source_dataset_path', 'target_dataset_path', 'ground_truth_path']:
+        if(not values_given(configuration, path)):
+            raise ValueError(f"{path}: No values given")
+    return len(configuration['source_dataset_path']) == len(configuration['target_dataset_path']) == len(configuration['ground_truth_path'])
+
+def generate_unique_identifier() -> str:
+    """Returns a unique identifier which is used to cross-reference workflows stored in the json file and their performance graphs
+
+    Returns:
+        str: Unique identifier
+    """
+    return str(uuid.uuid4())
+
+
+def to_path(path : str):
+    return os.path.expanduser(path)
+
+def clear_json_file(path : str):
+    if os.path.exists(path):
+        if os.path.getsize(path) > 0:
+            open(path, 'w').close()
+
+
+def purge_id_column(columns : list):
+    non_id_columns : list = []
+    for column in columns:
+        if(column != 'id'):
+            non_id_columns.append(column)
+
+    return non_id_columns
+
+def common_elements(elements1 : list, elements2 : list) -> list:
+    """Returns the elements common to both lists, in the order they appear in the first list
+
+    Args:
+        elements1 (list): Source list of elements
+        elements2 (list): Target list of elements
+
+    Returns:
+        list : The elements common to both lists, in the order they appear in the first list
+    """
+    _common_elements : list = []
+
+    for element in elements1:
+        if element in elements2:
+            _common_elements.append(element)
+    return _common_elements
+
+def matching_arguments(workflow : dict, arguments : dict) -> bool:
+    """Checks if the given workflow's arguments that are shared with the target arguments have values that appear in those arguments
+
+    Args:
+        workflow (dict): Dictionary of argument -> value for the given workflow
+        arguments (dict): Dictionary of argument -> lists of values that are valid for the workflow in order for it to be matching
+
+    Returns:
+        bool : The given workflow's arguments that are shared with the target arguments have values that appear in those arguments
+    """
+    for argument, value in workflow.items():
+        if argument in arguments and value not in arguments[argument]:
+            return False
+    return True
+
+def update_top_results(results : dict, new_workflow : dict, metric : str, keep_top_budget : bool) -> dict:
+    """Based on its performance, sets the new workflow as the top one in
+       its budget/global category (don't / only keep the budget with top performance)
+
+    Args:
+        results (dict): Budget -> Best workflow for given budget
+        new_workflow (dict): Arguments -> values for given workflow
+        metric (str) : Metric upon which workflows are being compared
+        keep_top_budget (bool): Keep only the workflow corresponding to the budget with the best performance
+
+    Returns:
+        dict : Updated Results Dictionary
+    """
+
+    _budget : int = new_workflow['budget']
+    _current_top_workflow = (None if not results else results[next(iter(results))]) if keep_top_budget \
+                            else (None if _budget not in results else results[_budget])
+
+    if(_current_top_workflow is None or _current_top_workflow[metric] < new_workflow[metric]):
+        if(keep_top_budget):
+            return {_budget : new_workflow}
+        else:
+            results[_budget] = new_workflow
+            return results
+    return results
+
+def retrieve_top_workflows(workflows : dict = None,
+                           workflows_path : str = None,
+                           store_path : str = None,
+                           metric : str = 'auc',
+                           top_budget : bool = False,
+                           **arguments):
+    """Takes a workflow dictionary or retrieves it from the given path.
+       Gathers the best workflows for the specified comparison metric and argument values.
+       Stores the best workflows in the given storage path.
+
+    Args:
+        workflows (dict): Dictionary containing the workflows (Defaults to None)
+        workflows_path (str): Path from which the program will attempt to retrieve the workflows (Defaults to None)
+        store_path (str) : Path in which the best workflows will be stored in json format (Defaults to None)
+        metric (str): Metric used to compare workflows (Defaults to 'auc')
+        top_budget (bool): Store only the workflow for the budget with the best performance (Defaults to False)
+        arguments (dict): Arguments and the corresponding values that workflows have to possess in order to be considered
+
+    Returns:
+        dict : Updated Results Dictionary
+    """
+
+    retrievable_metrics = ['time', 'auc', 'recall']
+
+    if(workflows is not None):
+        _workflows = workflows
+    elif(workflows_path is not None):
+        with open(workflows_path) as file:
+            _workflows = json.load(file)
+    else:
+        raise ValueError("Please provide workflows dictionary / json file path.")
+
+    if metric not in ['time', 'auc', 'recall']:
+        raise AttributeError(
+            'Metric ({}) does not exist. Please select one of the available. ({})'.format(
+                metric, retrievable_metrics
+            )
+        )
+
+    _results : dict = {}
+    # datasets, matchers and language models
+    # for which we want to find the top workflows
+    datasets : List[str] = None if 'dataset' not in arguments else arguments['dataset']
+    matchers : List[str] = None if 'matcher' not in arguments else arguments['matcher']
+    lms : List[str] = None if 'language_model' not in arguments else arguments['language_model']
+
+    _dataset_names : List[str] = _workflows.keys() if datasets is None else common_elements(datasets, workflows.keys())
+    _current_workflows : List[dict] = []
+
+    for _dataset_name in _dataset_names:
+        _dataset_info : dict = _workflows[_dataset_name]
+        _matcher_names = _dataset_info.keys() if matchers is None else common_elements(matchers, _dataset_info.keys())
+        for _matcher_name in _matcher_names:
+            _matcher_info : dict = _dataset_info[_matcher_name]
+            if _matcher_name == 'EmbeddingsNNBPM':
+                _lm_names = _matcher_info.keys() if lms is None else common_elements(lms, _matcher_info.keys())
+                for _lm_workflows in _matcher_info[_lm_names]:
+                    _current_workflows += _lm_workflows
+            else:
+                _current_workflows += _matcher_info
+            for _current_workflow in _current_workflows:
+                if(matching_arguments(workflow=_current_workflow, arguments=arguments)):
+                    _results = update_top_results(results=_results,
+                                                  new_workflow=_current_workflow,
+                                                  metric=metric,
+                                                  keep_top_budget=top_budget)
+
+    print(_results)
+    if (store_path is not None):
+        with open(store_path, 'w', encoding="utf-8") as file:
+            json.dump(_results, file, indent=4)
+
+
+def add_entry(workflow : dict, dataframe_dictionary : dict) -> None:
+    """Retrieves features and their values from the given workflow dictionary,
+       and stores them in the to-be-constructed dataframe dictionary
+
+    Args:
+        workflow (dict): Dictionary containing workflow's arguments and their values
+        dataframe_dictionary (dict): Dictionary that stores workflow arguments and their values -
+                                     to be transformed into columns
+    """
+    for feature, value in workflow.items():
+        if(feature != 'tp_idx'):
+            if feature not in dataframe_dictionary:
+                dataframe_dictionary[feature] = []
+            dataframe_dictionary[feature].append(value)
+
+def workflows_to_dataframe(workflows : dict = None,
+                           workflows_path : str = None,
+                           store_path : str = None) -> pd.DataFrame:
+    """Takes a workflow dictionary or retrieves it from the given path.
+       Stores all of its entries in a dataframe.
+       Stores the dataframe in the specified path if provided.
+
+    Args:
+        workflows (dict): Dictionary containing the workflows (Defaults to None)
+        workflows_path (str): Path from which the program will attempt to retrieve the workflows (Defaults to None)
+        store_path (str) : Path in which the dataframe will be stored in json format (Defaults to None)
+
+    Returns:
+        pd.DataFrame : Dataframe containing the workflow entries in the given workflows dictionary
+    """
+    if(workflows is not None):
+        _workflows = workflows
+    elif(workflows_path is not None):
+        with open(workflows_path) as file:
+            _workflows = json.load(file)
+    else:
+        raise ValueError("Please provide workflows dictionary / json file path.")
+
+    dataframe_dictionary : dict = {}
+    workflows_dataframe : pd.DataFrame
+
+    for dataset in _workflows:
+        dataset_info : dict = _workflows[dataset]
+        for matcher in dataset_info:
+            matcher_info : dict = dataset_info[matcher]
+            current_workflows : list = []
+            if(matcher == 'EmbeddingsNNBPM'):
+                for lm in matcher_info:
+                    current_workflows += matcher_info[lm]
+            else:
+                current_workflows += matcher_info
+
+            for current_workflow in current_workflows:
+                add_entry(current_workflow, dataframe_dictionary)
+
+    workflows_dataframe = pd.DataFrame(dataframe_dictionary)
+    if(store_path is not None):
+        workflows_dataframe.to_csv(store_path, index=False)
+
+    return workflows_dataframe
-
\ No newline at end of file
+# Frequency based Vectorization/Similarity evaluation Module
+class FrequencyEvaluator(ABC):
+    def __init__(self, vectorizer : str, tokenizer : str, qgram : int) -> None:
+        super().__init__()
+        self.vectorizer_name : str = vectorizer
+        self.tokenizer : str = tokenizer
+        self.qgram : int = qgram
+        self.analyzer = 'char' if 'char' in self.tokenizer else 'word'
+
+        if self.vectorizer_name == 'tfidf' or self.vectorizer_name == 'boolean':
+            self.vectorizer = TfidfVectorizer(analyzer=self.analyzer) if self.qgram is None else \
+                              TfidfVectorizer(analyzer=self.analyzer, ngram_range=(self.qgram, self.qgram))
+        elif self.vectorizer_name == 'tf':
+            self.vectorizer = CountVectorizer(analyzer=self.analyzer) if self.qgram is None else \
+                              CountVectorizer(analyzer=self.analyzer, ngram_range=(self.qgram, self.qgram))
+        else:
+            raise ValueError(f"{self.vectorizer_name}: Invalid Frequency Evaluator Model Name")
+
+        self.dataset_identifier : str = None
+        self.indexing : str = None
+        self.distance_matrix : np.ndarray = None
+        self.distance_matrix_loaded : bool = False
+        self.distance_matrix_indexing : str = None
+
+    def save_distance_matrix(self) -> None:
+        """Store the distance matrix of the frequency evaluator in the hidden .dm directory within the execution path.
+           The name of the file contains the vectorizer, tokenizer, dataset and metric, so it can be retrieved and
+           used as a precalculated distance matrix.
+ """ + distance_matrix_file_name = '_'.join([self.indexing, self.dataset_identifier, self.vectorizer_name, self.tokenizer.split('_')[0], self.metric, "q" + str(self.qgram) + ".npy"]) + + hidden_directory_path = os.path.join(os.getcwd(), ".dm") + os.makedirs(hidden_directory_path, exist_ok=True) + distance_matrix_file_path = os.path.join(hidden_directory_path, distance_matrix_file_name) + try: + print(f"Saving Distance Matrix -> {distance_matrix_file_path}") + np.save(distance_matrix_file_path, self.distance_matrix) + pass + except FileNotFoundError: + print(f"Unable to save distance matrix -> {distance_matrix_file_path}") + + + def load_distance_matrix_from_path(self, path : str) -> np.ndarray: + """Load the precalculated distance matrix for current execution's arguments combination. + Args: + path (str): Path to the distance matrix file + Returns: + np.ndarray: Precalculated distance matrix for current execution parameters combination + """ + try: + print(f"Loading Distance Matrix from: {path}") + return np.load(path) + pass + except FileNotFoundError: + print(f"Unable to load distance matrix -> {path}") + + def retrieve_distance_matrix_file_path(self) -> Tuple[str, str]: + """Attemps to retrieve a precalculated DM from disk for current experiment + Returns: + str: Precalculated DM file path (None if doesn't exist) + """ + + _requested_indexing : str = self.indexing + _opposite_indexing : str = "inorder" if (self.indexing == "reverse") else "reverse" + _requested_indexing_file_name = '_'.join([_requested_indexing, self.dataset_identifier, self.vectorizer_name, self.tokenizer.split('_')[0], self.metric, "q" + str(self.qgram) + ".npy"]) + _opposite_indexing_file_name = '_'.join([_opposite_indexing, self.dataset_identifier, self.vectorizer_name, self.tokenizer.split('_')[0], self.metric, "q" + str(self.qgram) + ".npy"]) + + hidden_directory_path = os.path.join(os.getcwd(), ".dm") + os.makedirs(hidden_directory_path, exist_ok=True) + + + _available_indexing : str = None + _available_file_path : str = None + _requested_indexing_file_path = os.path.join(hidden_directory_path, _requested_indexing_file_name) + _opposite_indexing_file_path = os.path.join(hidden_directory_path, _opposite_indexing_file_name) + + + if(os.path.exists(_requested_indexing_file_path) and os.path.isfile(_requested_indexing_file_path)): + _available_indexing = _requested_indexing + _available_file_path = _requested_indexing_file_path + elif(os.path.exists(_opposite_indexing_file_path) and os.path.isfile(_opposite_indexing_file_path)): + _available_indexing = _opposite_indexing + _available_file_path = _opposite_indexing_file_path + + return (_available_indexing, _available_file_path) + + + def distance_to_similarity_matrix(self, distance_matrix : np.ndarray) -> np.ndarray: + """Transforms the input distance matrix into similarity matrix + Args: + distance_matrix (np.ndarray): Input pairwise distance matrix + Returns: + np.ndarray: Pairwise similarity matrix + """ + + if(self.metric == 'sqeuclidean'): + return 1.0 / (1.0 + (distance_matrix ** 2)) + elif('cosine' in self.metric): + return 1.0 - distance_matrix + else: + return distance_matrix + + + def _get_sparse_matrix_method(self, metric : str) -> str: + if(metric == 'sqeuclidean'): + return 'euclidean' + else: + return metric + + def fit(self, + metric : str, + dataset_identifier : str, + indexing : str, + d1_entities : list = None, + d2_entities : list = None, + save_dm : bool = True) -> None: + """Initializes the entities' corpus, and constructs the similarity matrix 
+ Args: + metric (str): Distance metric for entity strings + dataset_identifier (str): Name of the dataset we are conducting our experiment on + indexing (str): Indexing that the candidate entities follow + d1_entities (list): List of D1 entities' string representations + d2_entities (list): List of D2 entities' string representations + save_dm (bool): Save the distance matrix in hidden directory on disk + """ + if(d1_entities is None or d2_entities is None): + raise NotImplementedError(f"{self.vectorizer_name} Frequency Evaluator Model - Dirty ER is not implemented yet") + else: + self.metric : str = metric + self._entities_d1 : list = d1_entities + self._entities_d2 : list = d2_entities + self._entities_d1_num : int = len(self._entities_d1) + self._entities_d2_num : int = len(self._entities_d2) + self.save_dm : bool = save_dm + self.dataset_identifier : str = dataset_identifier + self.indexing : str = indexing + + _dm_indexing, _dm_path = self.retrieve_distance_matrix_file_path() + if(_dm_path is not None): + self.distance_matrix : np.ndarray = self.load_distance_matrix_from_path(path=_dm_path) + self.distance_matrix_loaded : bool = True + self.distance_matrix_indexing : str = _dm_indexing + else: + self.corpus = self._entities_d1 + self._entities_d2 + self._tf_limit = len(self._entities_d1) + self.corpus_as_matrix = self.vectorizer.fit_transform(self.corpus) + if self.vectorizer_name == 'boolean': + self.corpus_as_matrix = self.corpus_as_matrix.astype(bool).astype(int) + + self.distance_matrix : np.ndarray = self.distance_to_similarity_matrix( + distance_matrix=pairwise_distances( + self.corpus_as_matrix, + metric=self._get_sparse_matrix_method(metric=self.metric))) + + self.distance_matrix_loaded : bool = False + self.distance_matrix_indexing : str = self.indexing + + if(self.save_dm): + self.save_distance_matrix() + + + def predict(self, id1 : int, id2 : int) -> float: + """Returns the predicted similarity score for the given entities + Args: + id1 (int): id of an entity of the 1nd dataset within experiment context (not necessarily preloaded matrix) + id2 (int): id of an entity of the 2nd dataset within experiment context (not necessarily preloaded matrix) + Returns: + float: Similarity score of entities with specified IDs + """ + # candidates = np.vstack((self.corpus_as_matrix[id1], self.corpus_as_matrix[id2])) + # distances = pairwise_distances(candidates, metric=self.metric) + # return 1.0 - distances[0][1] + if(self.indexing == self.distance_matrix_indexing): + return self.distance_matrix[id1][id2] + # _id1 = (id1 + self._entities_d2_num) if (self.indexing == "inorder") else (id1 + self._entities_d1_num) + # _id2 = (id2 - self._entities_d1_num) if (self.indexing == "inorder") else (id2 - self._entities_d2_num) + _id1 = (id1 + self._entities_d2_num) + _id2 = (id2 - self._entities_d1_num) + + return self.distance_matrix[_id1][_id2] + \ No newline at end of file diff --git a/docs/pyjedai/vector_based_blocking.py b/docs/pyjedai/vector_based_blocking.py index 0c3c032..ca4259f 100644 --- a/docs/pyjedai/vector_based_blocking.py +++ b/docs/pyjedai/vector_based_blocking.py @@ -31,7 +31,7 @@ from .evaluation import Evaluation from .utils import SubsetIndexer -EMBEDDINGS_DIR = '.embeddings' +EMBEDDINGS_DIR = '.embs' if not os.path.exists(EMBEDDINGS_DIR): os.makedirs(EMBEDDINGS_DIR) EMBEDDINGS_DIR = os.path.abspath(EMBEDDINGS_DIR) @@ -105,6 +105,8 @@ def build_blocks(self, tqdm_disable: bool = False, save_embeddings: bool = True, load_embeddings_if_exist: bool = False, + load_path_d1: str = 
None, + load_path_d2: str = None, with_entity_matching: bool = False, input_cleaned_blocks: dict = None, similarity_distance: str = 'cosine' @@ -142,9 +144,10 @@ def build_blocks(self, self.with_entity_matching = with_entity_matching self.save_embeddings, self.load_embeddings_if_exist = save_embeddings, load_embeddings_if_exist self.max_word_embeddings_size = max_word_embeddings_size - self.simiarity_distance = similarity_distance + self.similarity_distance = similarity_distance self.data, self.attributes_1, self.attributes_2, self.vector_size, self.num_of_clusters, self.top_k, self.input_cleaned_blocks \ = data, attributes_1, attributes_2, vector_size, num_of_clusters, top_k, input_cleaned_blocks + self.load_path_d1, self.load_path_d2 = load_path_d1, load_path_d2 self._progress_bar = tqdm(total=data.num_of_entities, desc=(self._method_name + ' [' + self.vectorizer + ', ' + self.similarity_search + ']'), disable=tqdm_disable) @@ -163,7 +166,7 @@ def build_blocks(self, self._si = SubsetIndexer(self.input_cleaned_blocks, self.data, self._applied_to_subset) self._d1_valid_indices: list[int] = self._si.d1_retained_ids - self._d2_valid_indices: list[int] = [x - self.data.dataset_limit for x in self._si.d2_retained_ids] + self._d2_valid_indices: list[int] = [x - self.data.dataset_limit for x in self._si.d2_retained_ids] if not data.is_dirty_er else None self._entities_d1 = data.dataset_1[attributes_1 if attributes_1 else data.attributes_1] \ .apply(" ".join, axis=1) \ @@ -188,29 +191,35 @@ def build_blocks(self, self._d2_loaded : bool = False if load_embeddings_if_exist: print("Loading embeddings from file...") - - p1 = os.path.join(EMBEDDINGS_DIR, self.vectorizer + '_' + (self.data.dataset_name_1 \ - if self.data.dataset_name_1 is not None else "d1") +'.npy') - print("Loading file: ", p1) + if(self.load_path_d1 is not None): + p1 = self.load_path_d1 + else: + p1 = os.path.join(EMBEDDINGS_DIR, self.vectorizer + '_' + (self.data.dataset_name_1 \ + if self.data.dataset_name_1 is not None else "d1") +'.npy') + print("Attempting to load D1 embeddings...") if os.path.exists(p1): self.vectors_1 = vectors_1 = np.load(p1) self.vectors_1 = vectors_1 = vectors_1[self._d1_valid_indices] self._progress_bar.update(data.num_of_entities_1) self._d1_loaded = True + print(f"{p1} -> Loaded Successfully") else: print("Embeddings not found. Creating new ones.") - p2 = os.path.join(EMBEDDINGS_DIR, self.vectorizer + '_' + (self.data.dataset_name_2 \ - if self.data.dataset_name_2 is not None else "d2") +'.npy') - print("Loading file: ", p2) + if(self.load_path_d2 is not None): + p2 = self.load_path_d2 + else: + p2 = os.path.join(EMBEDDINGS_DIR, self.vectorizer + '_' + (self.data.dataset_name_2 \ + if self.data.dataset_name_2 is not None else "d2") +'.npy') + print("Attempting to load D2 embeddings...") if os.path.exists(p2): self.vectors_2 = vectors_2 = np.load(p2) self.vectors_2 = vectors_2 = vectors_2[self._d2_valid_indices] self._progress_bar.update(data.num_of_entities_2) self._d2_loaded = True + print(f"{p2} -> Loaded Successfully") else: print("Embeddings not found. 
Creating new ones.") - print("Loading embeddings from file finished") if not self._d1_loaded or not self._d2_loaded: if self.vectorizer in ['word2vec', 'fasttext', 'doc2vec', 'glove']: self.vectors_1, self.vectors_2 = self._create_gensim_embeddings() @@ -368,33 +377,33 @@ def _create_pretrained_sentence_embeddings(self): def _similarity_search_with_FAISS(self): index = faiss.IndexFlatL2(self.vectors_1.shape[1]) - if self.simiarity_distance == 'cosine' or self.simiarity_distance == 'cosine_without_normalization': + if self.similarity_distance == 'cosine' or self.similarity_distance == 'cosine_without_normalization': index.metric_type = faiss.METRIC_INNER_PRODUCT - elif self.simiarity_distance == 'euclidean': + elif self.similarity_distance == 'euclidean': index.metric_type = faiss.METRIC_L2 else: - raise ValueError("Invalid similarity distance: ", self.simiarity_distance) + raise ValueError("Invalid similarity distance: ", self.similarity_distance) - if self.simiarity_distance == 'cosine': + if self.similarity_distance == 'cosine': faiss.normalize_L2(self.vectors_1) - faiss.normalize_L2(self.vectors_2) + if not self.data.is_dirty_er: faiss.normalize_L2(self.vectors_2) index.train(self.vectors_1) # train on the vectors of dataset 1 - if self.simiarity_distance == 'cosine': + if self.similarity_distance == 'cosine': faiss.normalize_L2(self.vectors_1) - faiss.normalize_L2(self.vectors_2) + if not self.data.is_dirty_er: faiss.normalize_L2(self.vectors_2) index.add(self.vectors_1) # add the vectors and update the index - if self.simiarity_distance == 'cosine': + if self.similarity_distance == 'cosine': faiss.normalize_L2(self.vectors_1) - faiss.normalize_L2(self.vectors_2) + if not self.data.is_dirty_er: faiss.normalize_L2(self.vectors_2) self.distances, self.neighbors = index.search(self.vectors_1 if self.data.is_dirty_er else self.vectors_2, self.top_k) - if self.simiarity_distance == 'euclidean': + if self.similarity_distance == 'euclidean': self.distances = 1/(1 + self.distances) self.blocks = dict() @@ -516,12 +525,9 @@ def export_to_df(self, prediction) -> pd.DataFrame: Returns: pd.DataFrame: Dataframe with the predicted pairs """ - if self.data.ground_truth is None: - raise AttributeError("Can not proceed to evaluation without a ground-truth file. 
\
-                Data object mush have initialized with the ground-truth file")
         pairs_df = pd.DataFrame(columns=['id1', 'id2'])
         for entity_id, candidates in prediction:
-            id1 = self.data._gt_to_ids_reversed_1[entity_id]
+            id1 = self.data._gt_to_ids_reversed_1[entity_id]
             for candiadate_id in candidates:
                 id2 = self.data._gt_to_ids_reversed_1[candiadate_id] if self.data.is_dirty_er \
                     else self.data._gt_to_ids_reversed_2[candiadate_id]
diff --git a/docs/pyjedai/visualization.py b/docs/pyjedai/visualization.py
index 8075458..d01db52 100644
--- a/docs/pyjedai/visualization.py
+++ b/docs/pyjedai/visualization.py
@@ -1,6 +1,9 @@
 import itertools
 import matplotlib.pyplot as plt
 import numpy as np
+import pandas as pd
+import os
+from typing import List, Tuple
 
 # Function that creates a confusion matrix
 def create_confusion_matrix(confusion_matrix, title):
@@ -25,4 +28,510 @@ def create_confusion_matrix(confusion_matrix, title):
     plt.xlabel('Predicted label')
     plt.tight_layout()
     plt.ylim([1.5, -.5])
-    plt.show()
\ No newline at end of file
+    plt.show()
+
+def plot_feature_progress_per_attribute_group(method_name : str,
+                                              dataset_name : str,
+                                              feature : str,
+                                              attributes : list,
+                                              df : pd.DataFrame = None,
+                                              load_path : str = None,
+                                              grid : bool = True,
+                                              save : bool = True,
+                                              verbose : bool = True,
+                                              in_plot_directory : bool = True
+                                              ) -> None:
+    """Plots the progress of the requested feature's value per budget for experiments grouped by the given attributes.
+       Saves the plot as an image in the requested path.
+
+    Args:
+        method_name (str): Name of the method used in the dataframe's experiments
+        dataset_name (str): Name of the dataset on which the dataframe's experiments have been applied
+        feature (str): The feature whose per budget progress we want to plot (e.g. auc)
+        attributes (list): Group of experiments' arguments, each distinct combination of which constitutes a separate curve
+        df (pd.DataFrame): Dataframe containing the information about progressive PER experiments (Defaults to None)
+        load_path (str): Path from which the dataframe should be loaded (Defaults to None)
+        grid (bool): Grid to be displayed in the plot (Defaults to True)
+        save (bool) : Save the plot as an image on disk (Defaults to True)
+        verbose (bool) : Show the produced plot (Defaults to True)
+        in_plot_directory (bool) : Plot to be saved in an experiment directory -
+                                   created in the target dataframe's / current directory if non-existent (Defaults to True)
+    """
+
+    experiments : pd.DataFrame
+    if(df is not None):
+        experiments = df
+    elif(load_path is not None):
+        experiments = pd.read_csv(load_path)
+    else:
+        raise ValueError("No dataframe or csv file given - Cannot plot the experiments.")
+
+    experiments = experiments.groupby(attributes)
+
+
+    fig = plt.figure(figsize=(16, 12))
+    ax = plt.subplot(111)
+
+    for attributes_unique_values, attributes_experiment_group in experiments:
+        group_name = '-'.join([str(attribute) for attribute in attributes_unique_values])
+        attributes_experiment_group_per_budget = attributes_experiment_group.sort_values(by='budget').groupby('budget')
+        budgets = []
+        average_feature_values = []
+        for _, current_budget_attributes_experiment_group in attributes_experiment_group_per_budget:
+            budgets.append(current_budget_attributes_experiment_group['budget'].mean())
+            average_feature_values.append(current_budget_attributes_experiment_group[feature].mean())
+
+        ax.plot(budgets, average_feature_values, label=str(group_name), marker='o', linestyle='-')
+
+    # Customize the plot
+
ax.set_title(f'{method_name.capitalize()}/{dataset_name.capitalize()} - Average {feature.capitalize()} vs. Budget Curves') + ax.set_xlabel('Budget') + ax.set_ylabel(f'Average {feature.capitalize()}') + + pos = ax.get_position() + ax.set_position([pos.x0, pos.y0, pos.width * 0.9, pos.height]) + ax.legend(title=attributes, fontsize="9", loc='center right', bbox_to_anchor=(1.23, 0.5)) + + ax.grid(grid) + + if(save): + file_name = '_'.join([dataset_name, method_name, feature, 'for', '_'.join(attributes)]) + '.png' + dataframe_directory = os.path.dirname(load_path) if load_path is not None else './' + store_directory = dataframe_directory if not in_plot_directory else os.path.join(dataframe_directory, 'plots/') + + if in_plot_directory and not os.path.exists(store_directory): + os.makedirs(store_directory) + + plt.savefig(os.path.join(store_directory, file_name)) + + plt.show() + + + +def plot_attribute_group_avg_ranking(method_name : str, + feature : str, + attributes : list, + dfs : List[pd.DataFrame] = None, + load_paths : List[str] = None, + grid : bool = True, + save : bool = True, + verbose : bool = True, + in_plot_directory : bool = True + ) -> None: + """For each unique combination of given attributes calculates its average feature value across datasets for each budget. + Plots the corresponding results and stores them as an image if it is requested. + + Args: + method_name (str): The name of the PER method whose experiments we are evaluating + feature (str): The feature that we want to evaluate the average ranking of the attribute group for + attributes (list): Group of experiments' arguments whose each distinct combination constitutes a seperate curve + dfs (List[pd.DataFrame]): Dataframes containing the information about progressive PER experiments (Defaults to None) + load_paths (List[str]): Paths from which the dataframe should be loaded from (Defaults to None) + grid (bool): Grid to be displayed in the plot (Defaults to True) + save (bool) : Save the plot as an image on disk (Defaults to True) + verbose (bool) : Show the produced plot + in_plot_directory (bool) : Plot to be saved in an experiment directory - + created in the target dataframe's / current directory if non-existent (Defaults to True) + """ + + if(dfs is None and load_paths is None): + raise ValueError("No dataframes or csv files given - Cannot calculate and plot average combinations rankings.") + + total_datasets = len(dfs) if dfs is not None else len(load_paths) + attributes_combinations = {} + attributes_combinations_budget_scores : List[Tuple[float, str]] + + for current_dataset in range(total_datasets): + if(dfs is not None): + experiments = dfs[current_dataset] + else: + current_dataset_path = load_paths[current_dataset] + experiments = pd.read_csv(current_dataset_path) + + budgets_experiments = experiments.sort_values(by='budget').groupby('budget') + + for current_budget, current_budget_experiments in budgets_experiments: + current_budget_attributes_combinations = current_budget_experiments.groupby(attributes[0] if len(attributes) == 1 else attributes) + attributes_combinations_budget_scores = [] + + for attributes_combination, current_budget_attributes_combination in current_budget_attributes_combinations: + attributes_combination_budget_feature_value = current_budget_attributes_combination[feature].mean() + attributes_combinations_budget_scores.append((attributes_combination_budget_feature_value, attributes_combination)) + + for ranking, attributes_combinations_budget_score in 
enumerate(sorted(attributes_combinations_budget_scores, reverse=True)): + attributes_combination_budget_feature_value, attributes_combination = attributes_combinations_budget_score + if attributes_combination not in attributes_combinations: + attributes_combinations[attributes_combination] = {} + + if current_budget not in attributes_combinations[attributes_combination]: + attributes_combinations[attributes_combination][current_budget] = [] + + attributes_combinations[attributes_combination][current_budget].append(ranking+1) + + fig = plt.figure(figsize=(16, 12)) + ax = plt.subplot(111) + + for attributes_combination, attributes_combination_budgets in attributes_combinations.items(): + + attributes_combination_average_rankings = [] + sorted_budgets = sorted(attributes_combination_budgets.keys(), reverse=False) + for budget in sorted_budgets: + attributes_combination_average_rankings.append(sum(attributes_combination_budgets[budget]) / len(attributes_combination_budgets[budget])) + + ax.plot(sorted_budgets, attributes_combination_average_rankings, label=str(attributes_combination), marker='o', linestyle='-') + + + # Customize the plot + ax.set_title(f'{method_name.capitalize()} - Average {feature.capitalize()} Ranking vs. Budget Curves') + ax.set_xlabel('Budget') + ax.set_ylabel('Average Ranking') + + pos = ax.get_position() + ax.set_position([pos.x0, pos.y0, pos.width * 0.9, pos.height]) + ax.legend(title=attributes, fontsize="9", loc='center right', bbox_to_anchor=(1.23, 0.5)) + + ax.grid(grid) + + if(save): + file_name = '_'.join([method_name, 'for', '_'.join(attributes), 'avg_rankings', feature]) + '.png' + dataframe_directory = os.path.dirname(load_paths[0]) if load_paths is not None else './' + store_directory = dataframe_directory if not in_plot_directory else os.path.join(dataframe_directory, 'avg_rankings/') + + if in_plot_directory and not os.path.exists(store_directory): + os.makedirs(store_directory) + + plt.savefig(os.path.join(store_directory, file_name)) + + plt.show() + + + +def plot_attribute_group_avg_top_distance(method_name : str, + feature : str, + attributes : list, + dfs : List[pd.DataFrame] = None, + load_paths : List[str] = None, + grid : bool = True, + save : bool = True, + verbose : bool = True, + in_plot_directory : bool = True + ) -> None: + """For each unique combination of given attributes calculates its feature's value average difference from the best value across datasets for each budget. + Plots the corresponding results and stores them as an image if it is requested. 
+ + Args: + method_name (str): The name of the PER method whose experiments we are evaluating + feature (str): The feature that we want to evaluate the average ranking of the attribute group for + attributes (list): Group of experiments' arguments whose each distinct combination constitutes a seperate curve + dfs (List[pd.DataFrame]): Dataframes containing the information about progressive PER experiments (Defaults to None) + load_paths (List[str]): Paths from which the dataframe should be loaded from (Defaults to None) + grid (bool): Grid to be displayed in the plot (Defaults to True) + save (bool) : Save the plot as an image on disk (Defaults to True) + verbose (bool) : Show the produced plot + in_plot_directory (bool) : Plot to be saved in an experiment directory - + created in the target dataframe's / current directory if non-existent (Defaults to True) + """ + + if(dfs is None and load_paths is None): + raise ValueError("No dataframes or csv files given - Cannot calculate and plot average combinations rankings.") + + total_datasets = len(dfs) if dfs is not None else len(load_paths) + attributes_combinations = {} + attributes_combinations_budget_scores : List[Tuple[float, str]] + + for current_dataset in range(total_datasets): + if(dfs is not None): + experiments = dfs[current_dataset] + else: + current_dataset_path = load_paths[current_dataset] + experiments = pd.read_csv(current_dataset_path) + + budgets_experiments = experiments.sort_values(by='budget').groupby('budget') + + for current_budget, current_budget_experiments in budgets_experiments: + current_budget_attributes_combinations = current_budget_experiments.groupby(attributes[0] if len(attributes) == 1 else attributes) + attributes_combinations_budget_scores = [] + + for attributes_combination, current_budget_attributes_combination in current_budget_attributes_combinations: + attributes_combination_budget_feature_value = current_budget_attributes_combination[feature].mean() + attributes_combinations_budget_scores.append((attributes_combination_budget_feature_value, attributes_combination)) + + attributes_combinations_budget_scores = sorted(attributes_combinations_budget_scores, reverse=True) + budget_highest_feature_value = attributes_combinations_budget_scores[0][0] + + for attributes_combinations_budget_score in attributes_combinations_budget_scores: + attributes_combination_budget_feature_value, attributes_combination = attributes_combinations_budget_score + if attributes_combination not in attributes_combinations: + attributes_combinations[attributes_combination] = {} + + if current_budget not in attributes_combinations[attributes_combination]: + attributes_combinations[attributes_combination][current_budget] = [] + + attributes_combinations[attributes_combination][current_budget].append(budget_highest_feature_value - attributes_combination_budget_feature_value) + + fig = plt.figure(figsize=(16, 12)) + ax = plt.subplot(111) + + for attributes_combination, attributes_combination_budgets in attributes_combinations.items(): + + attributes_combination_average_rankings = [] + sorted_budgets = sorted(attributes_combination_budgets.keys(), reverse=False) + for budget in sorted_budgets: + attributes_combination_average_rankings.append(sum(attributes_combination_budgets[budget]) / len(attributes_combination_budgets[budget])) + + ax.plot(sorted_budgets, attributes_combination_average_rankings, label=str(attributes_combination), marker='o', linestyle='-') + + + # Customize the plot + ax.set_title(f'{method_name.capitalize()} - 
Average {feature.capitalize()} Distance from Top vs. Budget Curves') + ax.set_xlabel('Budget') + ax.set_ylabel('Average Distance from Top') + + pos = ax.get_position() + ax.set_position([pos.x0, pos.y0, pos.width * 0.9, pos.height]) + ax.legend(title=attributes, fontsize="9", loc='center right', bbox_to_anchor=(1.23, 0.5)) + + ax.grid(grid) + + if(save): + file_name = '_'.join([method_name, 'for', '_'.join(attributes), 'avg_distances', feature]) + '.png' + dataframe_directory = os.path.dirname(load_paths[0]) if load_paths is not None else './' + store_directory = dataframe_directory if not in_plot_directory else os.path.join(dataframe_directory, 'avg_distances/') + + if in_plot_directory and not os.path.exists(store_directory): + os.makedirs(store_directory) + + plt.savefig(os.path.join(store_directory, file_name)) + + plt.show() + + +def plot_attributes_performance_for_budget(method_name : str, + feature : str, + attributes : list, + dfs : List[pd.DataFrame] = None, + load_paths : List[str] = None, + calculate_distance : bool = False, + grid : bool = True, + save : bool = True, + verbose : bool = True, + in_plot_directory : bool = True + ) -> pd.DataFrame: + """For each unique combination of given attributes calculates its feature value's average distance from best / ranking per budget. + Then calculates the same values for each combination of budget and dataset. Combination rows are sorted by the average of the averages + of the feature value's distance from best / ranking per budget. + + Args: + method_name (str): The name of the PER method whose experiments we are evaluating + feature (str): The feature that we want to evaluate the average ranking of the attribute group for + attributes (list): Group of experiments' arguments whose each distinct combination constitutes a seperate curve + dfs (List[pd.DataFrame]): Dataframes containing the information about progressive PER experiments (Defaults to None) + load_paths (List[str]): Paths from which the dataframe should be loaded from (Defaults to None) + calculate_distance (bool): Calculate distance for the feature from top within dataset (Defaults to False) + grid (bool): Grid to be displayed in the plot (Defaults to True) + save (bool) : Save the plot as an image on disk (Defaults to True) + verbose (bool) : Show the produced plot + in_plot_directory (bool) : Plot to be saved in an experiment directory - + created in the target dataframe's / current directory if non-existent (Defaults to True) + Returns: + pd.DataFrame: Dataframe containing the performance of the feature for each attributes' value combination across all datasets + for the requested budget order (e.x. 
first budget for each dataset) + """ + + if(dfs is None and load_paths is None): + raise ValueError("No dataframes or csv files given - Cannot calculate and plot average combinations rankings.") + + total_datasets : int = len(dfs) if dfs is not None else len(load_paths) + attributes_combinations : dict = {} + budget_dataset_best_feature_value : dict = {} + + attributes_column : str = ' + '.join([' '.join([word.capitalize() for word in attribute.split('_')]) for attribute in attributes]) + budget_dataframe : dict = {attributes_column : []} + + for current_dataset in range(total_datasets): + if(dfs is not None): + experiments = dfs[current_dataset] + else: + current_dataset_path = load_paths[current_dataset] + experiments = pd.read_csv(current_dataset_path) + + current_dataset_name : str = "D" + str(current_dataset+1) + budgets_experiments = experiments.sort_values(by='budget').groupby('budget') + + current_budget = 0 + for _, current_budget_experiments in budgets_experiments: + current_budget += 1 + current_budget_attributes_combinations = current_budget_experiments.groupby(attributes[0] if len(attributes) == 1 else attributes) + + for attributes_combination, current_budget_attributes_combination in current_budget_attributes_combinations: + + if attributes_combination not in attributes_combinations: + attributes_combinations[attributes_combination] = {} + + if current_budget not in attributes_combinations[attributes_combination]: + attributes_combinations[attributes_combination][current_budget] = {} + current_budget_attributes_combination_feature_value = current_budget_attributes_combination[feature].mean() + attributes_combinations[attributes_combination][current_budget][current_dataset_name] = current_budget_attributes_combination_feature_value + + if current_budget not in budget_dataset_best_feature_value: + budget_dataset_best_feature_value[current_budget] = {} + + if current_dataset_name not in budget_dataset_best_feature_value[current_budget]: + budget_dataset_best_feature_value[current_budget][current_dataset_name] = 0.0 + + if(current_budget_attributes_combination_feature_value > budget_dataset_best_feature_value[current_budget][current_dataset_name]): + budget_dataset_best_feature_value[current_budget][current_dataset_name] = current_budget_attributes_combination_feature_value + + if calculate_distance: + # we want to calculate each combination's performance distance from best performance per dataset + for attributes_combination, current_budget_attributes_combination in current_budget_attributes_combinations: + attributes_combinations[attributes_combination][current_budget][current_dataset_name] = budget_dataset_best_feature_value[current_budget][current_dataset_name] - attributes_combinations[attributes_combination][current_budget][current_dataset_name] + else: + # we want to calculate each combination's ranking per dataset + combinations_performance : List[Tuple[float, str]] = [] + for attributes_combination, current_budget_attributes_combination in current_budget_attributes_combinations: + combinations_performance.append((attributes_combinations[attributes_combination][current_budget][current_dataset_name], attributes_combination)) + combinations_performance = sorted(combinations_performance, reverse=True) + + for ranking, combination_performance in enumerate(combinations_performance): + performance, combination = combination_performance + attributes_combinations[combination][current_budget][current_dataset_name] = ranking + 1 + + + for attributes_combination, 
budgets_attributes_combination in attributes_combinations.items(): + budget_dataframe[attributes_column].append(attributes_combination) + for budget in budgets_attributes_combination: + + budget_attribute_combination = budgets_attributes_combination[budget] + budget_name = "B" + str(budget) + budget_feature_avg_value = 0.0 + + for dataset, dataset_budget_attribute_combination in budget_attribute_combination.items(): + + budget_dataset_column = '_'.join([str(budget_name),str(dataset)]) + + if(budget_dataset_column not in budget_dataframe): + budget_dataframe[budget_dataset_column] = [] + + budget_dataset_feature_value = attributes_combinations[attributes_combination][budget][dataset] + budget_feature_avg_value += budget_dataset_feature_value + budget_dataframe[budget_dataset_column].append(budget_dataset_feature_value) + + budget_average_column = '_'.join(["AVERAGE",budget_name]) + if(budget_average_column not in budget_dataframe): + budget_dataframe[budget_average_column] = [] + + budget_dataframe[budget_average_column].append(budget_feature_avg_value / len(budget_attribute_combination)) + + budget_dataframe = pd.DataFrame(budget_dataframe) + # Sort Attributes Combinations rows based on the average of the averages of their per budget performances + average_budget_performance_columns = ['_'.join(["AVERAGE", "B" + str(index+1)]) for index in range(len(budgets_experiments))] + budget_dataframe['AA_BS'] = budget_dataframe[average_budget_performance_columns].mean(axis=1) + budget_dataframe = budget_dataframe.sort_values(by='AA_BS', ascending=True) + + if(save): + metric = "distance" if calculate_distance else "ranking" + file_name = '_'.join([feature, metric, 'for', method_name, 'with', '_'.join(attributes)]) + '.csv' + + dataframe_directory = os.path.dirname(load_paths[0]) if load_paths is not None else './' + store_directory = dataframe_directory if not in_plot_directory else os.path.join(dataframe_directory, metric + '-analytical-performances/') + + if in_plot_directory and not os.path.exists(store_directory): + os.makedirs(store_directory) + + budget_dataframe.to_csv(os.path.join(store_directory, file_name), index=False) + + return budget_dataframe + + # total_datasets : int = len(dfs) if dfs is not None else len(load_paths) + # attributes_combinations : dict = {} + # budget_dataset_best_feature_value : dict = {} + # attributes_column : str = '_'.join([attribute.capitalize() for attribute in attributes]) + + # budget_dataframe : dict = {attributes_column : [], "AVERAGE" : []} + # budget_dataframe_column_data_types = {"AVERAGE" : float} + + + # for current_dataset in range(total_datasets): + # if(dfs is not None): + # experiments = dfs[current_dataset] + # else: + # current_dataset_path = load_paths[current_dataset] + # experiments = pd.read_csv(current_dataset_path) + + # current_dataset_name : str = "D" + str(current_dataset+1) + # budget_dataframe_column_data_types[current_dataset_name] = float if calculate_distance else int + + # budget_dataset_best_feature_value[current_dataset_name] = {} + # budget_dataframe[current_dataset_name] = [] + + + # budgets_experiments = experiments.sort_values(by='budget').groupby('budget') + + # for current_budget, current_budget_experiments in budgets_experiments: + + # current_budget_experiments = budgets_experiments.get_group(list(budgets_experiments.groups.keys())[budget_order]) + # current_budget_attributes_combinations = current_budget_experiments.groupby(attributes[0] if len(attributes) == 1 else attributes) + + # for attributes_combination, 
current_budget_attributes_combination in current_budget_attributes_combinations: + + # if attributes_combination not in attributes_combinations: + # attributes_combinations[attributes_combination] = {} + + # if current_budget not in attributes_combinations[attributes_combination]: + # attributes_combinations[attributes_combination][current_budget] = {} + + # current_budget_attributes_combination_feature_value = current_budget_attributes_combination[feature].mean() + # attributes_combinations[attributes_combination][current_budget][current_dataset] = current_budget_attributes_combination_feature_value + + + + # if current_budget_attributes_combination_feature_value > dataset_best_feature_value[current_dataset_name]: + # dataset_best_feature_value[current_dataset_name] = current_budget_attributes_combination_feature_value + + # if calculate_distance: + # # we want to calculate each combination's performance distance from best performance per dataset + # for attributes_combination, current_budget_attributes_combination in current_budget_attributes_combinations: + # attributes_combinations[attributes_combination][current_dataset_name] = dataset_best_feature_value[current_dataset_name] - attributes_combinations[attributes_combination][current_dataset_name] + # else: + # # we want to calculate each combination's ranking per dataset + # combinations_performance : List[Tuple[float, str]] = [] + # for attributes_combination, current_budget_attributes_combination in current_budget_attributes_combinations: + # combinations_performance.append((attributes_combinations[attributes_combination][current_dataset_name], attributes_combination)) + # combinations_performance = sorted(combinations_performance, reverse=True) + + # for ranking, combination_performance in enumerate(combinations_performance): + # performance, combination = combination_performance + # attributes_combinations[combination][current_dataset_name] = ranking + 1 + + # for attributes_combination, attributes_combination_datasets_performance in attributes_combinations.items(): + + # budget_dataframe[attributes_column].append(attributes_combination) + # average_attributes_combination_performance = 0.0 + + # for dataset, performance in attributes_combination_datasets_performance.items(): + # # print(performance) + # budget_dataframe[dataset].append(performance) + # average_attributes_combination_performance += performance + + # average_attributes_combination_performance /= len(attributes_combination_datasets_performance) + # budget_dataframe["AVERAGE"].append(average_attributes_combination_performance) + + # budget_dataframe = pd.DataFrame(budget_dataframe) + # budget_dataframe = budget_dataframe.astype(budget_dataframe_column_data_types) + # budget_dataframe = budget_dataframe.sort_values(by='AVERAGE', ascending=True) + + # if(save): + # metric = "distance" if calculate_distance else "ranking" + # budget_index = "b" + str((budget_order + 1)) + # file_name = '_'.join([budget_index, feature, metric, 'for', method_name, 'with', '_'.join(attributes)]) + '.csv' + + # dataframe_directory = os.path.dirname(load_paths[0]) if load_paths is not None else './' + # store_directory = dataframe_directory if not in_plot_directory else os.path.join(dataframe_directory, metric + '-performances/') + + # if in_plot_directory and not os.path.exists(store_directory): + # os.makedirs(store_directory) + + # budget_dataframe.to_csv(os.path.join(store_directory, file_name), index=False) + + # return budget_dataframe diff --git a/docs/pyjedai/workflow.py 
b/docs/pyjedai/workflow.py
index 7b7ceed..185a10d 100644
--- a/docs/pyjedai/workflow.py
+++ b/docs/pyjedai/workflow.py
@@ -4,6 +4,8 @@ from typing import Callable, List, Tuple
 import matplotlib.pyplot as plt
+import os
+import json
 import optuna
 import pandas as pd
 from networkx import Graph
@@ -19,7 +21,10 @@ from .vector_based_blocking import EmbeddingsNNBlockBuilding
 from .joins import EJoin, TopKJoin
 
-plt.style.use('seaborn-whitegrid')
+from .prioritization import ProgressiveMatching, BlockIndependentPM, class_references
+from .utils import new_dictionary_from_keys, get_class_function_arguments, generate_unique_identifier
+
+
 class PYJEDAIWorkFlow(ABC):
     """Main module of the pyjedAI and the simplest way to create an end-to-end ER workflow.
@@ -204,6 +209,270 @@ def get_final_scores(self) -> Tuple[float, float, float]:
             Tuple[float, float, float]: F-Measure, Precision, Recall.
         """
         return self.f1[-1], self.precision[-1], self.recall[-1]
+class ProgressiveWorkFlow(PYJEDAIWorkFlow):
+    """Main module of the pyjedAI and the simplest way to create an end-to-end PER workflow.
+    """
+
+    def __init__(
+            self,
+            name: str = None
+    ) -> None:
+        self.f1: list = []
+        self.recall: list = []
+        self.precision: list = []
+        self.runtime: list = []
+        self.configurations: list = []
+        self.workflow_exec_time: float
+        self._id: int = next(self._id)
+        self.name: str = name if name else "Workflow-" + str(self._id)
+        self._workflow_bar: tqdm
+        self.final_pairs = None
+
+    def run(self,
+            data: Data,
+            verbose: bool = False,
+            with_classification_report: bool = False,
+            workflow_step_tqdm_disable: bool = True,
+            workflow_tqdm_enable: bool = False,
+            block_building : dict = None,
+            block_purging : dict = None,
+            block_filtering : dict = None,
+            **matcher_arguments
+            ) -> None:
+        """Main function for creating a Progressive ER workflow.
+
+        Args:
+            data (Data): Dataset module, used to derive the schema-awareness status
+            verbose (bool, optional): Print detailed report for each step. Defaults to False.
+            with_classification_report (bool, optional): Print pairs counts. Defaults to False.
+            workflow_step_tqdm_disable (bool, optional): Tqdm progress bar in each step. Defaults to True.
+            workflow_tqdm_enable (bool, optional): Overall progress bar. Defaults to False.
+            number_of_nearest_neighbors (int, optional): Number of nearest neighbours in cardinality-based algorithms. Defaults to None.
+            indexing (str, optional): Inorder/Reverse/Bilateral indexing of datasets. Defaults to None.
+            similarity_function (str, optional): Function used to evaluate the similarity of two vector-based representations of entities. Defaults to None.
+            language_model (str, optional): Language model used to vectorize the entities. Defaults to None.
+            tokenizer (str, optional): Text tokenizer used. Defaults to None.
+            weighting_scheme (str, optional): Scheme used to evaluate the weight between nodes of the intermediate representation graph. Defaults to None.
+            block_building (dict, optional): Algorithm and its parameters used to construct the blocks. Defaults to None.
+            block_purging (dict, optional): Algorithm and its parameters used to delete obsolete blocks. Defaults to None.
+            block_filtering (dict, optional): Algorithm and its parameters used to lower the cardinality of blocks. Defaults to None.
+            window_size (int, optional): Window size in the Sorted Neighborhood Progressive ER workflows. Defaults to None.
+ """ + self.block_building, self.block_purging, self.block_filtering, self.algorithm = \ + block_building, block_purging, block_filtering, matcher_arguments['algorithm'] + steps = [self.block_building, self.block_purging, self.block_filtering, self.algorithm] + num_of_steps = sum(x is not None for x in steps) + self._workflow_bar = tqdm(total=num_of_steps, + desc=self.name, + disable=not workflow_tqdm_enable) + + self.data : Data = data + self._init_experiment() + start_time = time() + self.matcher_arguments = matcher_arguments + self.matcher_name = self.matcher_arguments['matcher'] + self.dataset_name = self.matcher_arguments['dataset'] + matcher = class_references[matcher_arguments['matcher']] + self.constructor_arguments = new_dictionary_from_keys(dictionary=self.matcher_arguments, keys=get_class_function_arguments(class_reference=matcher, function_name='__init__')) + self.predictor_arguments = new_dictionary_from_keys(dictionary=self.matcher_arguments, keys=get_class_function_arguments(class_reference=matcher, function_name='predict')) + print(self.constructor_arguments) + print(self.predictor_arguments) + + progressive_matcher : ProgressiveMatching = matcher(**self.constructor_arguments) + self.progressive_matcher : ProgressiveMatching = progressive_matcher + # + # Block Building step: Only one algorithm can be performed + # + block_building_method = (self.block_building['method'](**self.block_building["params"]) \ + if "params" in self.block_building \ + else self.block_building['method']()) if self.block_building \ + else (None if not self._blocks_required() else StandardBlocking()) + + bblocks = None + block_building_blocks = None + if block_building_method: + block_building_blocks = \ + block_building_method.build_blocks(data, + attributes_1=self.block_building["attributes_1"] \ + if(self.block_building is not None and "attributes_1" in self.block_building) else None, + attributes_2=self.block_building["attributes_2"] \ + if(self.block_building is not None and "attributes_2" in self.block_building) else None, + tqdm_disable=workflow_step_tqdm_disable) + self.final_pairs = bblocks = block_building_blocks + res = block_building_method.evaluate(block_building_blocks, + export_to_dict=True, + with_classification_report=with_classification_report, + verbose=verbose) + self._save_step(res, block_building_method.method_configuration()) + self._workflow_bar.update(1) + + if(block_building_blocks is not None): + # + # Block Purging step [optional] + # + bblocks = block_building_blocks + block_purging_blocks = None + if(self.block_purging is not None): + block_purging_method = self.block_purging['method'](**self.block_purging["params"]) \ + if "params" in self.block_purging \ + else self.block_purging['method']() + block_purging_blocks = block_purging_method.process(bblocks, + data, + tqdm_disable=workflow_step_tqdm_disable) + self.final_pairs = bblocks = block_purging_blocks + res = block_purging_method.evaluate(bblocks, + export_to_dict=True, + with_classification_report=with_classification_report, + verbose=verbose) + self._save_step(res, block_purging_method.method_configuration()) + self._workflow_bar.update(1) + # + # Block Filtering step [optional] + # + block_filtering_blocks = None + if(self.block_filtering is not None): + block_filtering_method = self.block_filtering['method'](**self.block_filtering["params"]) \ + if "params" in self.block_filtering \ + else self.block_filtering['method']() + block_filtering_blocks = block_filtering_method.process(bblocks, + data, + 
tqdm_disable=workflow_step_tqdm_disable) + self.final_pairs = bblocks = block_filtering_blocks + res = block_filtering_method.evaluate(bblocks, + export_to_dict=True, + with_classification_report=with_classification_report, + verbose=verbose) + self._save_step(res, block_filtering_method.method_configuration()) + self._workflow_bar.update(1) + + # + # Progressive Matching step + # + self.final_pairs : List[Tuple[float, int, int]] = progressive_matcher.predict(data=data, blocks=bblocks, dataset_identifier=self.dataset_name, **self.predictor_arguments) + evaluator = Evaluation(self.data) + self.tp_indices, self.total_emissions = evaluator.calculate_tps_indices(pairs=self.final_pairs,duplicate_of=progressive_matcher.duplicate_of, duplicate_emitted=progressive_matcher.duplicate_emitted) + self.total_candidates = len(self.final_pairs) + self._workflow_bar.update(1) + self.workflow_exec_time = time() - start_time + + def _blocks_required(self): + return not isinstance(self.progressive_matcher, BlockIndependentPM) + + def _init_experiment(self) -> None: + self.f1: list = [] + self.recall: list = [] + self.precision: list = [] + self.runtime: list = [] + self.configurations: list = [] + self.workflow_exec_time: float + + def visualize( + self, + f1: bool = True, + recall: bool = True, + precision: bool = True, + separate: bool = False + ) -> None: + pass + + def to_df(self) -> pd.DataFrame: + pass + + def export_pairs(self) -> pd.DataFrame: + pass + + def _save_step(self, results: dict, configuration: dict) -> None: + pass + + def get_final_scores(self) -> Tuple[float, float, float]: + pass + + def retrieve_matcher_workflows(self, workflows : dict, arguments : dict) -> list: + """Retrieves the list of already executed workflows for the matcher/model of current workflow + + Args: + workflows (dict): Dictionary of script's executed workflows' information + arguments (dict): Arguments that have been supplied for current workflow execution + + Returns: + list: List of already executed workflows for given workflow's arguments' matcher/model + """ + dataset : str = self.dataset_name + matcher : str = self.matcher_name + + workflows[dataset] = workflows[dataset] if dataset in workflows else dict() + matcher_results = workflows[dataset] + matcher_results[matcher] = matcher_results[matcher] if matcher in matcher_results \ + else ([] if('language_model' not in arguments) else {}) + + matcher_info = matcher_results[matcher] + workflows_info = matcher_info + if(isinstance(matcher_info, dict)): + lm_name = arguments['language_model'] + matcher_info[lm_name] = matcher_info[lm_name] if lm_name in matcher_info else [] + workflows_info = matcher_info[lm_name] + + return workflows_info + + + + def save(self, arguments : dict, path : str = None, results = None) -> dict: + """Stores argument / execution information for current workflow within a workflows dictionary. 
+
+        Args:
+            arguments (dict): Arguments that have been supplied for the current workflow execution
+            path (str): Path where the workflows' results are stored (Defaults to None)
+            results (dict): A dictionary of workflows' results in which we want to store the current workflow's arguments/info (Defaults to None)
+        Returns:
+            dict: Dictionary containing the information about the given workflow
+        """
+        if(path is None and results is None):
+            raise ValueError("No dictionary path or workflows dictionary given - Cannot save workflow.")
+
+        if(results is not None):
+            workflows = results
+        elif(not os.path.exists(path) or os.path.getsize(path) == 0):
+            workflows = {}
+        else:
+            with open(path, 'r', encoding="utf-8") as file:
+                workflows = json.load(file)
+
+        category_workflows = self.retrieve_matcher_workflows(workflows=workflows, arguments=arguments)
+        self.save_workflow_info(arguments=arguments)
+        category_workflows.append(self.info)
+
+        if(path is not None):
+            with open(path, 'w', encoding="utf-8") as file:
+                json.dump(workflows, file, indent=4)
+
+        return self.info
+
+    def save_workflow_info(self, arguments : dict) -> None:
+        """Stores the current workflow's argument values and execution-related data (such as execution time and total emissions)
+
+        Args:
+            arguments (dict): Arguments that were passed to the progressive workflow at hand
+        """
+
+        workflow_info : dict = {k: v for k, v in arguments.items()}
+        workflow_info['total_candidates'] = self.total_candidates
+        workflow_info['total_emissions'] = self.total_emissions
+        workflow_info['time'] = self.workflow_exec_time
+        workflow_info['name'] = generate_unique_identifier()
+        workflow_info['tp_idx'] = self.tp_indices
+        workflow_info['dataset'] = self.dataset_name
+        workflow_info['matcher'] = self.matcher_name
+
+        self.info = workflow_info
+
+    def print_info(self, info : dict):
+        for attribute in info:
+            value = info[attribute]
+            if(attribute != 'tp_idx'):
+                print(f"{attribute} : {value}")
+            else:
+                print(f"true_positives : {len(value)}")
+
 def compare_workflows(workflows: List[PYJEDAIWorkFlow], with_visualization=True) -> pd.DataFrame:
     """Compares workflows by creating multiple plots and tables with results.
@@ -526,8 +795,10 @@ def best_blocking_workflow_ccer(self) -> None:
         self.comparison_cleaning = dict(method=WeightedEdgePruning, params=dict(weighting_scheme='EJS'))
         self.entity_matching = dict(method=EntityMatching,
                                     params=dict(metric='cosine',
-                                                tokenizer='tfidf_char_3gram',
-                                                similarity_threshold=0.0))
+                                                tokenizer='char_tokenizer',
+                                                vectorizer='tfidf',
+                                                qgram=3,
+                                                similarity_threshold=0.0))
         self.clustering = dict(method=UniqueMappingClustering, exec_params=dict(similarity_threshold=0.17))
         self.name="best-ccer-workflow"
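The FrequencyEvaluator changes above encode two small rules that are easy to miss in the diff: how a pairwise distance is mapped to a similarity for each metric (distance_to_similarity_matrix), and how entity ids are shifted in predict() when the preloaded matrix was built with the opposite indexing. The standalone Python sketch below restates just those two rules; the helper names to_similarity and translate_ids and the toy dataset sizes are invented for illustration and are not part of the pyJedAI API.

import numpy as np

def to_similarity(distance_matrix: np.ndarray, metric: str) -> np.ndarray:
    # Mirrors distance_to_similarity_matrix(): squared-euclidean distances are squashed
    # into (0, 1], cosine distance becomes cosine similarity, anything else passes through.
    if metric == 'sqeuclidean':
        return 1.0 / (1.0 + (distance_matrix ** 2))
    if 'cosine' in metric:
        return 1.0 - distance_matrix
    return distance_matrix

def translate_ids(id1: int, id2: int, d1_size: int, d2_size: int) -> tuple:
    # Mirrors predict() when self.indexing != self.distance_matrix_indexing:
    # the D1-side id is shifted past the other dataset's block and the D2-side id
    # is shifted back before it, so the lookup lands in the matrix built the other way round.
    return id1 + d2_size, id2 - d1_size

cosine_distances = np.array([[0.0, 0.4],
                             [0.4, 0.0]])
print(to_similarity(cosine_distances, 'cosine'))   # [[1.0 0.6] [0.6 1.0]]
print(translate_ids(0, 3, d1_size=3, d2_size=2))   # (2, 0)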
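Similarly, the new plotting helpers in visualization.py are easiest to understand from a call site. The sketch below builds a tiny results dataframe and draws one curve per language_model/weighting_scheme combination averaged per budget; the column names, the toy values and the pyjedai.visualization import path are assumptions made for this example, not fixtures of the library.

import pandas as pd
from pyjedai.visualization import plot_feature_progress_per_attribute_group

# Toy progressive-ER results: one row per (run, budget) with the tracked feature ('auc').
runs = pd.DataFrame({
    'budget':           [1000, 5000, 1000, 5000, 1000, 5000],
    'auc':              [0.61, 0.72, 0.64, 0.75, 0.58, 0.70],
    'language_model':   ['st5', 'st5', 'st5', 'st5', 'bert', 'bert'],
    'weighting_scheme': ['X2',  'X2',  'JS',  'JS',  'X2',  'X2'],
})

# One curve per distinct (language_model, weighting_scheme) pair, averaged per budget.
plot_feature_progress_per_attribute_group(method_name='progressive_matching',
                                          dataset_name='toy',
                                          feature='auc',
                                          attributes=['language_model', 'weighting_scheme'],
                                          df=runs,
                                          save=False)  # only display; nothing is written to disk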