Skip to content

Commit db0ba8f

Browse files
committed
refactor: Move schema matchers to a higher directory level for better organization
1 parent 3ae3632 commit db0ba8f

17 files changed

+221
-287
lines changed

bdikit/api.py

+11-10
Original file line numberDiff line numberDiff line change
@@ -7,10 +7,11 @@
77
import panel as pn
88
from IPython.display import display, Markdown
99

10-
from bdikit.schema_matching.one2one.base import BaseSchemaMatcher
11-
from bdikit.schema_matching.one2one.matcher_factory import SchemaMatchers
12-
from bdikit.schema_matching.topk.base import BaseTopkSchemaMatcher
13-
from bdikit.schema_matching.topk.matcher_factory import TopkMatchers
10+
from bdikit.schema_matching.base import BaseOne2oneSchemaMatcher, BaseTopkSchemaMatcher
11+
from bdikit.schema_matching.matcher_factory import (
12+
get_one2one_schema_matcher,
13+
get_topk_schema_matcher,
14+
)
1415
from bdikit.value_matching.base import BaseValueMatcher, ValueMatch, ValueMatchingResult
1516
from bdikit.value_matching.matcher_factory import ValueMatchers
1617
from bdikit.standards.standard_factory import Standards
@@ -43,7 +44,7 @@
4344
def match_schema(
4445
source: pd.DataFrame,
4546
target: Union[str, pd.DataFrame] = "gdc",
46-
method: Union[str, BaseSchemaMatcher] = DEFAULT_SCHEMA_MATCHING_METHOD,
47+
method: Union[str, BaseOne2oneSchemaMatcher] = DEFAULT_SCHEMA_MATCHING_METHOD,
4748
method_args: Optional[Dict[str, Any]] = None,
4849
standard_args: Optional[Dict[str, Any]] = None,
4950
) -> pd.DataFrame:
@@ -74,15 +75,15 @@ def match_schema(
7475
if isinstance(method, str):
7576
if method_args is None:
7677
method_args = {}
77-
matcher_instance = SchemaMatchers.get_matcher(method, **method_args)
78-
elif isinstance(method, BaseSchemaMatcher):
78+
matcher_instance = get_one2one_schema_matcher(method, **method_args)
79+
elif isinstance(method, BaseOne2oneSchemaMatcher):
7980
matcher_instance = method
8081
else:
8182
raise ValueError(
8283
"The method must be a string or an instance of BaseColumnMappingAlgorithm"
8384
)
8485

85-
matches = matcher_instance.map(source, target_table)
86+
matches = matcher_instance.get_one2one_match(source, target_table)
8687

8788
return pd.DataFrame(matches.items(), columns=["source", "target"])
8889

@@ -138,15 +139,15 @@ def top_matches(
138139
if isinstance(method, str):
139140
if method_args is None:
140141
method_args = {}
141-
topk_matcher = TopkMatchers.get_matcher(method, **method_args)
142+
topk_matcher = get_topk_schema_matcher(method, **method_args)
142143
elif isinstance(method, BaseTopkSchemaMatcher):
143144
topk_matcher = method
144145
else:
145146
raise ValueError(
146147
"The method must be a string or an instance of BaseTopkColumnMatcher"
147148
)
148149

149-
top_k_matches = topk_matcher.get_recommendations(
150+
top_k_matches = topk_matcher.get_topk_matches(
150151
selected_columns, target=target_table, top_k=top_k
151152
)
152153

bdikit/schema_matching/topk/base.py bdikit/schema_matching/base.py

+19-5
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,22 @@
1-
from bdikit.schema_matching.one2one.base import BaseSchemaMatcher
21
from typing import List, NamedTuple, TypedDict, Dict
32
import pandas as pd
43

54

5+
class BaseOne2oneSchemaMatcher:
    """Base class for schema matchers that produce a one-to-one column mapping.

    Subclasses are expected to override ``get_one2one_match``; the version
    defined here only raises ``NotImplementedError``.
    """

    def get_one2one_match(
        self, source: pd.DataFrame, target: pd.DataFrame
    ) -> Dict[str, str]:
        """Compute the column mapping between ``source`` and ``target``.

        Args:
            source: The table whose columns should be matched.
            target: The table providing the candidate columns.

        Returns:
            A dictionary of string keys to string values describing the match.

        Raises:
            NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError("Subclasses must implement this method")

    def _fill_missing_matches(
        self, dataset: pd.DataFrame, matches: Dict[str, str]
    ) -> Dict[str, str]:
        """Give every column of ``dataset`` an entry in ``matches``.

        Columns that are not already keys of ``matches`` are added with an
        empty string as their value. ``matches`` is updated in place and the
        same dictionary object is returned.
        """
        for name in dataset.columns:
            # Existing entries are left untouched; only absent keys get "".
            matches.setdefault(name, "")
        return matches
18+
19+
620
class ColumnScore(NamedTuple):
721
column_name: str
822
score: float
@@ -13,19 +27,19 @@ class TopkMatching(TypedDict):
1327
top_k_columns: List[ColumnScore]
1428

1529

16-
class BaseTopkSchemaMatcher(BaseSchemaMatcher):
30+
class BaseTopkSchemaMatcher(BaseOne2oneSchemaMatcher):
1731

18-
def get_recommendations(
32+
def get_topk_matches(
1933
self, source: pd.DataFrame, target: pd.DataFrame, top_k: int
2034
) -> List[TopkMatching]:
2135
raise NotImplementedError("Subclasses must implement this method")
2236

23-
def map(
37+
def get_one2one_match(
2438
self,
2539
source: pd.DataFrame,
2640
target: pd.DataFrame,
2741
) -> Dict[str, str]:
28-
top_matches = self.get_recommendations(source, target, 1)
42+
top_matches = self.get_topk_matches(source, target, 1)
2943
matches = {}
3044

3145
for top_match in top_matches:

bdikit/schema_matching/topk/contrastivelearning.py bdikit/schema_matching/contrastivelearning.py

+4-4
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import pandas as pd
22
import numpy as np
33
from typing import List
4-
from bdikit.schema_matching.topk.base import (
4+
from bdikit.schema_matching.base import (
55
ColumnScore,
66
TopkMatching,
77
BaseTopkSchemaMatcher,
@@ -14,12 +14,12 @@
1414
from bdikit.models import ColumnEmbedder
1515

1616

17-
class EmbeddingSimilarityTopkSchemaMatcher(BaseTopkSchemaMatcher):
17+
class EmbeddingSimilarity(BaseTopkSchemaMatcher):
1818
def __init__(self, column_embedder: ColumnEmbedder, metric: str = "cosine"):
1919
self.api = column_embedder
2020
self.metric = metric
2121

22-
def get_recommendations(
22+
def get_topk_matches(
2323
self, source: pd.DataFrame, target: pd.DataFrame, top_k: int = 10
2424
) -> List[TopkMatching]:
2525
"""
@@ -54,7 +54,7 @@ def get_recommendations(
5454
return top_k_results
5555

5656

57-
class CLTopkSchemaMatcher(EmbeddingSimilarityTopkSchemaMatcher):
57+
class ContrastiveLearning(EmbeddingSimilarity):
5858
def __init__(self, model_name: str = DEFAULT_CL_MODEL, metric: str = "cosine"):
5959
super().__init__(
6060
column_embedder=ContrastiveLearningAPI(model_name=model_name), metric=metric

bdikit/schema_matching/one2one/gpt.py bdikit/schema_matching/gpt.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,13 @@
11
import pandas as pd
22
from openai import OpenAI
3-
from bdikit.schema_matching.one2one.base import BaseSchemaMatcher
3+
from bdikit.schema_matching.base import BaseOne2oneSchemaMatcher
44

55

6-
class GPTSchemaMatcher(BaseSchemaMatcher):
6+
class GPT(BaseOne2oneSchemaMatcher):
77
def __init__(self):
88
self.client = OpenAI()
99

10-
def map(self, source: pd.DataFrame, target: pd.DataFrame):
10+
def get_one2one_match(self, source: pd.DataFrame, target: pd.DataFrame):
1111
target_columns = target.columns
1212
labels = ", ".join(target_columns)
1313
candidate_columns = source.columns

bdikit/schema_matching/topk/magneto.py bdikit/schema_matching/magneto.py

+2-7
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,8 @@
11
import pandas as pd
22
from typing import Dict, Any, List
33
from magneto import Magneto as Magneto_Lib
4-
from bdikit.schema_matching.one2one.base import BaseSchemaMatcher
54
from bdikit.download import get_cached_model_or_download
6-
from bdikit.schema_matching.topk.base import (
7-
ColumnScore,
8-
TopkMatching,
9-
BaseTopkSchemaMatcher,
10-
)
5+
from bdikit.schema_matching.base import ColumnScore, TopkMatching, BaseTopkSchemaMatcher
116

127
DEFAULT_MAGNETO_MODEL = "magneto-gdc-v0.1"
138

@@ -18,7 +13,7 @@ def __init__(self, kwargs: Dict[str, Any] = None):
1813
kwargs = {}
1914
self.magneto = Magneto_Lib(**kwargs)
2015

21-
def get_recommendations(
16+
def get_topk_matches(
2217
self, source: pd.DataFrame, target: pd.DataFrame, top_k: int
2318
) -> List[TopkMatching]:
2419
self.magneto.params["topk"] = (
+114
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,114 @@
1+
import importlib
2+
from enum import Enum
3+
from typing import Mapping, Dict, Any
4+
from bdikit.schema_matching.base import BaseOne2oneSchemaMatcher, BaseTopkSchemaMatcher
5+
6+
7+
class One2oneSchemaMatchers(Enum):
    """Registry of the one-to-one schema matchers known to the factory.

    Each member's value is a ``(matcher_name, matcher_path)`` pair, where
    ``matcher_path`` is the dotted ``module.ClassName`` string of the
    implementation. Both parts are exposed as attributes on the member.
    """

    def __init__(self, matcher_name: str, matcher_path: str):
        # Enum passes the unpacked tuple value of each member to __init__.
        self.matcher_name = matcher_name
        self.matcher_path = matcher_path

    SIMFLOOD = ("similarity_flooding", "bdikit.schema_matching.valentine.SimFlood")
    COMA = ("coma", "bdikit.schema_matching.valentine.Coma")
    CUPID = ("cupid", "bdikit.schema_matching.valentine.Cupid")
    DISTRIBUTION_BASED = (
        "distribution_based",
        "bdikit.schema_matching.valentine.DistributionBased",
    )
    JACCARD_DISTANCE = (
        "jaccard_distance",
        "bdikit.schema_matching.valentine.Jaccard",
    )
    GPT = ("gpt", "bdikit.schema_matching.gpt.GPT")
    TWO_PHASE = ("two_phase", "bdikit.schema_matching.twophase.TwoPhase")
38+
39+
40+
class TopkSchemaMatchers(Enum):
    """Registry of the top-k schema matchers known to the factory.

    Each member's value is a ``(matcher_name, matcher_path)`` pair, where
    ``matcher_path`` is the dotted ``module.ClassName`` string that the
    factory imports dynamically. Both parts are exposed as attributes on
    the member.
    """

    CT_LEARNING = (
        "ct_learning",
        "bdikit.schema_matching.contrastivelearning.ContrastiveLearning",
    )

    # The maxvalsim module now lives directly under bdikit.schema_matching
    # (it was moved out of the old ``topk`` sub-package), so the path must not
    # contain ``.topk.`` or the dynamic import fails.
    MAX_VAL_SIM = (
        "max_val_sim",
        "bdikit.schema_matching.maxvalsim.MaxValSim",
    )

    MAGNETO_ZS_BP = (
        "magneto_zs_bp",
        "bdikit.schema_matching.magneto.MagnetoZSBP",
    )

    MAGNETO_FT_BP = (
        "magneto_ft_bp",
        "bdikit.schema_matching.magneto.MagnetoFTBP",
    )

    MAGNETO_ZS_LLM = (
        "magneto_zs_llm",
        "bdikit.schema_matching.magneto.MagnetoZSLLM",
    )

    MAGNETO_FT_LLM = (
        "magneto_ft_llm",
        "bdikit.schema_matching.magneto.MagnetoFTLLM",
    )

    def __init__(self, matcher_name: str, matcher_path: str):
        # Enum passes the unpacked tuple value of each member to __init__.
        self.matcher_name = matcher_name
        self.matcher_path = matcher_path
74+
75+
76+
# Lookup tables from matcher name to dotted class path, derived from the enums.
topk_schema_matchers = {
    member.matcher_name: member.matcher_path for member in TopkSchemaMatchers
}
# The one-to-one table also contains every top-k entry, so top-k matcher names
# are accepted wherever a one-to-one matcher is requested. On a name clash the
# top-k path takes precedence.
one2one_schema_matchers = {
    **{member.matcher_name: member.matcher_path for member in One2oneSchemaMatchers},
    **topk_schema_matchers,
}
83+
84+
85+
def create_matcher(
    matcher_name: str,
    available_matchers: Dict[str, str],
    **matcher_kwargs: Any,
):
    """Instantiate the matcher registered under ``matcher_name``.

    Args:
        matcher_name: Key to look up in ``available_matchers``.
        available_matchers: Mapping from matcher name to the dotted
            ``module.ClassName`` path of its implementation.
        **matcher_kwargs: Keyword arguments forwarded unchanged to the
            matcher class constructor.

    Returns:
        A new instance of the class named by the registered path.

    Raises:
        ValueError: If ``matcher_name`` is not a key of ``available_matchers``.
    """
    if matcher_name not in available_matchers:
        names = ", ".join(available_matchers)
        raise ValueError(
            f"The {matcher_name} algorithm is not supported. "
            f"Supported algorithms are: {names}"
        )
    # Load the class dynamically: the module is imported only when the matcher
    # is actually requested.
    module_path, class_name = available_matchers[matcher_name].rsplit(".", 1)
    module = importlib.import_module(module_path)

    return getattr(module, class_name)(**matcher_kwargs)
101+
102+
103+
def get_one2one_schema_matcher(
    matcher_name: str, **matcher_kwargs: Any
) -> BaseOne2oneSchemaMatcher:
    """Build the matcher registered as ``matcher_name`` in the one-to-one table.

    The lookup uses ``one2one_schema_matchers``, which also includes the
    top-k matcher names.

    Args:
        matcher_name: Registered name of the matcher to create.
        **matcher_kwargs: Keyword arguments forwarded to the matcher constructor.

    Raises:
        ValueError: If ``matcher_name`` is not a registered matcher name.
    """
    return create_matcher(matcher_name, one2one_schema_matchers, **matcher_kwargs)
108+
109+
110+
def get_topk_schema_matcher(
    matcher_name: str, **matcher_kwargs: Any
) -> BaseTopkSchemaMatcher:
    """Build the matcher registered as ``matcher_name`` in the top-k table.

    The lookup uses ``topk_schema_matchers`` only, so one-to-one-only matcher
    names are rejected.

    Args:
        matcher_name: Registered name of the top-k matcher to create.
        **matcher_kwargs: Keyword arguments forwarded to the matcher constructor.

    Raises:
        ValueError: If ``matcher_name`` is not a registered top-k matcher name.
    """
    return create_matcher(matcher_name, topk_schema_matchers, **matcher_kwargs)

bdikit/schema_matching/topk/maxvalsim.py bdikit/schema_matching/maxvalsim.py

+6-6
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,17 @@
11
import pandas as pd
22
from typing import Optional, List
33
from bdikit.models.contrastive_learning.cl_api import DEFAULT_CL_MODEL
4-
from bdikit.schema_matching.topk.base import (
4+
from bdikit.schema_matching.base import (
55
BaseTopkSchemaMatcher,
66
TopkMatching,
77
ColumnScore,
88
)
9-
from bdikit.schema_matching.topk.contrastivelearning import CLTopkSchemaMatcher
9+
from bdikit.schema_matching.contrastivelearning import ContrastiveLearning
1010
from bdikit.value_matching.polyfuzz import TFIDFValueMatcher
1111
from bdikit.value_matching.base import BaseValueMatcher
1212

1313

14-
class MaxValSimSchemaMatcher(BaseTopkSchemaMatcher):
14+
class MaxValSim(BaseTopkSchemaMatcher):
1515
def __init__(
1616
self,
1717
top_k: int = 20,
@@ -20,7 +20,7 @@ def __init__(
2020
value_matcher: Optional[BaseValueMatcher] = None,
2121
):
2222
if top_k_matcher is None:
23-
self.api = CLTopkSchemaMatcher(DEFAULT_CL_MODEL)
23+
self.api = ContrastiveLearning(DEFAULT_CL_MODEL)
2424
elif isinstance(top_k_matcher, BaseTopkSchemaMatcher):
2525
self.api = top_k_matcher
2626
else:
@@ -49,13 +49,13 @@ def unique_string_values(self, column: pd.Series) -> pd.Series:
4949
else:
5050
return pd.Series(column.unique().astype(str), name=column.name)
5151

52-
def get_recommendations(
52+
def get_topk_matches(
5353
self, source: pd.DataFrame, target: pd.DataFrame, top_k: int
5454
) -> List[TopkMatching]:
5555
max_topk = max(
5656
top_k, self.top_k
5757
) # If self.top_k (method param) is smaller than the requested top_k, use top_k
58-
topk_column_matches = self.api.get_recommendations(source, target, max_topk)
58+
topk_column_matches = self.api.get_topk_matches(source, target, max_topk)
5959
matches = {}
6060
top_k_results = []
6161

bdikit/schema_matching/one2one/__init__.py

Whitespace-only changes.

bdikit/schema_matching/one2one/base.py

-15
This file was deleted.

bdikit/schema_matching/one2one/contrastivelearning.py

-18
This file was deleted.

0 commit comments

Comments
 (0)