Skip to content

Commit

Permalink
updated code for release 0.0.7
Browse files: browse the repository at this point in the history
  • Loading branch information
Nikoletos-K committed Jun 30, 2023
1 parent fd5f177 commit 39a4371
Show file tree
Hide file tree
Showing 10 changed files with 2,376 additions and 349 deletions.
23 changes: 11 additions & 12 deletions src/pyjedai/block_building.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@

from .datamodel import Block, Data, PYJEDAIFeature
from .utils import (are_matching, drop_big_blocks_by_size,
drop_single_entity_blocks)
drop_single_entity_blocks, get_blocks_cardinality)
from .evaluation import Evaluation

class AbstractBlockProcessing(PYJEDAIFeature):
Expand All @@ -27,7 +27,13 @@ def __init__(self):
self.attributes_2: list
self.num_of_blocks_dropped: int
self.original_num_of_blocks: int

self.sum_of_sizes: int = 0
self.total_num_of_comparisons: int = 0
self.min_block_size: int = None
self.max_block_size: int = None
self.min_block_comparisons: int = None
self.max_block_comparisons: int = None

def report(self) -> None:
"""Prints Block Building method configuration
"""
Expand Down Expand Up @@ -75,7 +81,8 @@ def evaluate(self,
id2 in entity_index and are_matching(entity_index, id1, id2):
true_positives += 1

eval_obj.calculate_scores(true_positives=true_positives)
total_matching_pairs = get_blocks_cardinality(eval_blocks, self.data.is_dirty_er)
eval_obj.calculate_scores(true_positives=true_positives, total_matching_pairs=total_matching_pairs)
eval_result = eval_obj.report(self.method_configuration(),
export_to_df,
export_to_dict,
Expand All @@ -84,8 +91,7 @@ def evaluate(self,
if with_stats:
self.stats(eval_blocks)
return eval_result



def stats(self, blocks: dict) -> None:
self.list_of_sizes = []
self.entities_in_blocks = set()
Expand Down Expand Up @@ -120,7 +126,6 @@ def stats(self, blocks: dict) -> None:
)
print(u'\u2500' * 123)


class AbstractBlockBuilding(AbstractBlockProcessing):
"""Abstract class for the block building method
"""
Expand All @@ -137,13 +142,7 @@ def __init__(self):
self.attributes_2: list
self.execution_time: float
self.data: Data
self.sum_of_sizes: int = 0
self.list_of_sizes: list = []
self.total_num_of_comparisons: int = 0
self.min_block_size: int = None
self.max_block_size: int = None
self.min_block_comparisons: int = None
self.max_block_comparisons: int = None

def build_blocks(
self,
Expand Down
15 changes: 7 additions & 8 deletions src/pyjedai/block_cleaning.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@

from .block_building import AbstractBlockProcessing
from .datamodel import Block, Data
from .utils import create_entity_index, drop_single_entity_blocks
from .utils import create_entity_index, drop_single_entity_blocks, java_math_round

class AbstractBlockCleaning(AbstractBlockProcessing):

Expand Down Expand Up @@ -81,14 +81,13 @@ def process(
)
sorted_blocks = _sort_blocks_cardinality(blocks, self.data.is_dirty_er)
self._progress_bar.update(1)
entity_index = create_entity_index(sorted_blocks, self.data.is_dirty_er)
self.entity_index = create_entity_index(sorted_blocks, self.data.is_dirty_er)
self._progress_bar.update(1)
filtered_blocks = {}
for entity_id, block_keys in entity_index.items():
for entity_id, block_keys in self.entity_index.items():
# Create new blocks from the entity index
# print(list(block_keys[:int(round(self.ratio*len(block_keys)))]))
block_keys = list(block_keys)
for key in list(block_keys[:int(round(self.ratio*len(block_keys)))]):
for key in list(block_keys[:java_math_round(self.ratio*float(len(block_keys)))]):
filtered_blocks.setdefault(key, Block())
# Entity ids run from 0 ... n-1 for the 1st dataset
# and from n ... m for the 2nd dataset
Expand All @@ -97,17 +96,17 @@ def process(
self._progress_bar.update(1)
new_blocks = drop_single_entity_blocks(filtered_blocks, self.data.is_dirty_er)
self._progress_bar.close()
self.num_of_blocks_dropped = len(blocks) - len(new_blocks)
self.execution_time = time() - start_time
self.blocks = new_blocks
return new_blocks

return self.blocks

def _configuration(self) -> dict:
    """Return the method-specific configuration as a dictionary.

    Returns:
        dict: A single-entry mapping from the label ``"Ratio"`` to the
            current value of ``self.ratio``.
    """
    # NOTE(review): presumably the same ratio that process() uses to
    # truncate each entity's block-key list — confirm against the class.
    return {
        "Ratio": self.ratio
    }


class BlockPurging(AbstractBlockCleaning):
"""Discards the blocks exceeding a certain number of comparisons.
"""
Expand Down
Loading

0 comments on commit 39a4371

Please sign in to comment.