VIDA-NYU
diff --git a/‎CONTRIBUTING.md
+2-2 b/‎CONTRIBUTING.md
+2-2
diff --git a/‎bdikit/api.py
+49-39 b/‎bdikit/api.py
+49-39
diff --git a/‎bdikit/schema_matching/topk/base.py ‎bdikit/schema_matching/base.py
+19-5 b/‎bdikit/schema_matching/topk/base.py ‎bdikit/schema_matching/base.py
+19-5
diff --git a/‎bdikit/schema_matching/topk/contrastivelearning.py ‎bdikit/schema_matching/contrastivelearning.py
+4-4 b/‎bdikit/schema_matching/topk/contrastivelearning.py ‎bdikit/schema_matching/contrastivelearning.py
+4-4
diff --git a/‎bdikit/schema_matching/one2one/gpt.py ‎bdikit/schema_matching/gpt.py
+3-3 b/‎bdikit/schema_matching/one2one/gpt.py ‎bdikit/schema_matching/gpt.py
+3-3
diff --git a/‎bdikit/schema_matching/topk/magneto.py ‎bdikit/schema_matching/magneto.py
+2-7 b/‎bdikit/schema_matching/topk/magneto.py ‎bdikit/schema_matching/magneto.py
+2-7
@@ -37,9 +37,9 @@ Contributors can add new methods for schema and value matching by following thes
 
 1. Create a Python module inside the "task folder" folder (e.g., `bdikit/value_matching`).
 
-2. Define a class in the module that implements either `BaseValueMatcher` (for value matching) or `BaseSchemaMatcher` (for schema matching).
+2. Define a class in the module that extends the appropriate base class: `BaseOne2oneValueMatcher` or `BaseTopkValueMatcher` for value matching, or `BaseOne2oneSchemaMatcher` or `BaseTopkSchemaMatcher` for schema matching.
 
-3. Add a new entry to the Enum class (e.g. `ValueMatchers`) in `matcher_factory.py` (e.g., `bdikit/value_matching/matcher_factory.py`). 
+3. Add a new entry to the corresponding Enum class (e.g., `One2OneValueMatchers`) in `matcher_factory.py` (e.g., `bdikit/value_matching/matcher_factory.py`).
 Make sure to add the correct import path for your module to ensure it can be accessed without errors.
 
 
 
@@ -7,12 +7,21 @@
 import panel as pn
 from IPython.display import display, Markdown
 
-from bdikit.schema_matching.one2one.base import BaseSchemaMatcher
-from bdikit.schema_matching.one2one.matcher_factory import SchemaMatchers
-from bdikit.schema_matching.topk.base import BaseTopkSchemaMatcher
-from bdikit.schema_matching.topk.matcher_factory import TopkMatchers
-from bdikit.value_matching.base import BaseValueMatcher, ValueMatch, ValueMatchingResult
-from bdikit.value_matching.matcher_factory import ValueMatchers
+from bdikit.schema_matching.base import BaseOne2oneSchemaMatcher, BaseTopkSchemaMatcher
+from bdikit.schema_matching.matcher_factory import (
+    get_one2one_schema_matcher,
+    get_topk_schema_matcher,
+)
+from bdikit.value_matching.base import (
+    BaseOne2oneValueMatcher,
+    BaseTopkValueMatcher,
+    ValueMatch,
+    ValueMatchingResult,
+)
+from bdikit.value_matching.matcher_factory import (
+    get_one2one_value_matcher,
+    get_topk_value_matcher,
+)
 from bdikit.standards.standard_factory import Standards
 
 from bdikit.mapping_functions import (
@@ -43,7 +52,7 @@
 def match_schema(
     source: pd.DataFrame,
     target: Union[str, pd.DataFrame] = "gdc",
-    method: Union[str, BaseSchemaMatcher] = DEFAULT_SCHEMA_MATCHING_METHOD,
+    method: Union[str, BaseOne2oneSchemaMatcher] = DEFAULT_SCHEMA_MATCHING_METHOD,
     method_args: Optional[Dict[str, Any]] = None,
     standard_args: Optional[Dict[str, Any]] = None,
 ) -> pd.DataFrame:
@@ -74,23 +83,22 @@ def match_schema(
     if isinstance(method, str):
         if method_args is None:
             method_args = {}
-        matcher_instance = SchemaMatchers.get_matcher(method, **method_args)
-    elif isinstance(method, BaseSchemaMatcher):
+        matcher_instance = get_one2one_schema_matcher(method, **method_args)
+    elif isinstance(method, BaseOne2oneSchemaMatcher):
         matcher_instance = method
     else:
         raise ValueError(
             "The method must be a string or an instance of BaseColumnMappingAlgorithm"
         )
 
-    matches = matcher_instance.map(source, target_table)
+    matches = matcher_instance.get_one2one_match(source, target_table)
 
     return pd.DataFrame(matches.items(), columns=["source", "target"])
 
 
 def _load_table_for_standard(name: str, standard_args: Dict[str, Any]) -> pd.DataFrame:
     """
-    Load the table for the given standard data vocabulary. Currently, only the
-    GDC standard is supported.
+    Load the table for the given standard data vocabulary.
     """
     if standard_args is None:
         standard_args = {}
@@ -138,15 +146,15 @@ def top_matches(
     if isinstance(method, str):
         if method_args is None:
             method_args = {}
-        topk_matcher = TopkMatchers.get_matcher(method, **method_args)
+        topk_matcher = get_topk_schema_matcher(method, **method_args)
     elif isinstance(method, BaseTopkSchemaMatcher):
         topk_matcher = method
     else:
         raise ValueError(
             "The method must be a string or an instance of BaseTopkColumnMatcher"
         )
 
-    top_k_matches = topk_matcher.get_recommendations(
+    top_k_matches = topk_matcher.get_topk_matches(
         selected_columns, target=target_table, top_k=top_k
     )
 
@@ -164,7 +172,7 @@ def match_values(
     source: pd.DataFrame,
     target: Union[str, pd.DataFrame],
     column_mapping: Union[Tuple[str, str], pd.DataFrame],
-    method: Union[str, BaseValueMatcher] = DEFAULT_VALUE_MATCHING_METHOD,
+    method: Union[str, BaseOne2oneValueMatcher] = DEFAULT_VALUE_MATCHING_METHOD,
     method_args: Optional[Dict[str, Any]] = None,
     standard_args: Optional[Dict[str, Any]] = None,
 ) -> Union[pd.DataFrame, List[pd.DataFrame]]:
@@ -206,20 +214,19 @@ def match_values(
         ValueError: If the target is neither a DataFrame nor a standard vocabulary name.
         ValueError: If the source column is not present in the source dataset.
     """
-    if method_args is None:
-        method_args = {}
 
     if standard_args is None:
         standard_args = {}
 
-    if "top_k" in method_args and method_args["top_k"] > 1:
-        logger.warning(
-            f"Ignoring 'top_k' argument, use the 'top_value_matches()' method to get top-k value matches."
-        )
-        method_args["top_k"] = 1
+    if isinstance(method, str):
+        if method_args is None:
+            method_args = {}
+        matcher_instance = get_one2one_value_matcher(method, **method_args)
+    elif isinstance(method, BaseOne2oneValueMatcher):
+        matcher_instance = method
 
     matches = _match_values(
-        source, target, column_mapping, method, method_args, standard_args
+        source, target, column_mapping, matcher_instance, standard_args
     )
 
     if isinstance(column_mapping, tuple):
@@ -240,7 +247,7 @@ def top_value_matches(
     target: Union[str, pd.DataFrame],
     column_mapping: Union[Tuple[str, str], pd.DataFrame],
     top_k: int = 5,
-    method: str = DEFAULT_VALUE_MATCHING_METHOD,
+    method: Union[str, BaseTopkValueMatcher] = DEFAULT_VALUE_MATCHING_METHOD,
     method_args: Optional[Dict[str, Any]] = None,
     standard_args: Optional[Dict[str, Any]] = None,
 ) -> List[pd.DataFrame]:
@@ -283,21 +290,19 @@ def top_value_matches(
         ValueError: If the target is neither a DataFrame nor a standard vocabulary name.
         ValueError: If the source column is not present in the source dataset.
     """
-    if method_args is None:
-        method_args = {}
 
     if standard_args is None:
         standard_args = {}
 
-    if "top_k" in method_args:
-        logger.warning(
-            f"Ignoring 'top_k' argument, using top_k argument instead (top_k={top_k})"
-        )
-
-    method_args["top_k"] = top_k
+    if isinstance(method, str):
+        if method_args is None:
+            method_args = {}
+        matcher_instance = get_topk_value_matcher(method, **method_args)
+    elif isinstance(method, BaseTopkValueMatcher):
+        matcher_instance = method
 
     matches = _match_values(
-        source, target, column_mapping, method, method_args, standard_args
+        source, target, column_mapping, matcher_instance, standard_args, top_k
     )
 
     match_list = []
@@ -358,15 +363,15 @@ def _match_values(
     source: pd.DataFrame,
     target: Union[str, pd.DataFrame],
     column_mapping: Union[Tuple[str, str], pd.DataFrame],
-    method: str,
-    method_args: Dict[str, Any],
+    value_matcher: Union[BaseOne2oneValueMatcher, BaseTopkValueMatcher],
     standard_args: Dict[str, Any],
+    top_k: int = 1,
 ) -> List[pd.DataFrame]:
 
     target_domain, column_mapping_list = _format_value_matching_input(
         source, target, column_mapping, standard_args
     )
-    value_matcher = ValueMatchers.get_matcher(method, **method_args)
+
     mapping_results: List[ValueMatchingResult] = []
 
     for mapping in column_mapping_list:
@@ -388,9 +393,14 @@ def _match_values(
         }
 
         # 3. Apply the value matcher to create value mapping dictionaries
-        raw_matches = value_matcher.match(
-            list(source_values_dict.keys()), list(target_values_dict.keys())
-        )
+        if isinstance(value_matcher, BaseTopkValueMatcher):
+            raw_matches = value_matcher.get_topk_matches(
+                list(source_values_dict.keys()), list(target_values_dict.keys()), top_k
+            )
+        else:
+            raw_matches = value_matcher.get_one2one_match(
+                list(source_values_dict.keys()), list(target_values_dict.keys())
+            )
 
         # 4. Transform the matches to the original
         matches: List[ValueMatch] = []
 
@@ -1,8 +1,22 @@
-from bdikit.schema_matching.one2one.base import BaseSchemaMatcher
 from typing import List, NamedTuple, TypedDict, Dict
 import pandas as pd
 
 
+class BaseOne2oneSchemaMatcher:
+    def get_one2one_match(
+        self, source: pd.DataFrame, target: pd.DataFrame
+    ) -> Dict[str, str]:
+        raise NotImplementedError("Subclasses must implement this method")
+
+    def _fill_missing_matches(
+        self, dataset: pd.DataFrame, matches: Dict[str, str]
+    ) -> Dict[str, str]:
+        for column in dataset.columns:
+            if column not in matches:
+                matches[column] = ""
+        return matches
+
+
 class ColumnScore(NamedTuple):
     column_name: str
     score: float
@@ -13,19 +27,19 @@ class TopkMatching(TypedDict):
     top_k_columns: List[ColumnScore]
 
 
-class BaseTopkSchemaMatcher(BaseSchemaMatcher):
+class BaseTopkSchemaMatcher(BaseOne2oneSchemaMatcher):
 
-    def get_recommendations(
+    def get_topk_matches(
         self, source: pd.DataFrame, target: pd.DataFrame, top_k: int
     ) -> List[TopkMatching]:
         raise NotImplementedError("Subclasses must implement this method")
 
-    def map(
+    def get_one2one_match(
         self,
         source: pd.DataFrame,
         target: pd.DataFrame,
     ) -> Dict[str, str]:
-        top_matches = self.get_recommendations(source, target, 1)
+        top_matches = self.get_topk_matches(source, target, 1)
         matches = {}
 
         for top_match in top_matches:
 
@@ -1,7 +1,7 @@
 import pandas as pd
 import numpy as np
 from typing import List
-from bdikit.schema_matching.topk.base import (
+from bdikit.schema_matching.base import (
     ColumnScore,
     TopkMatching,
     BaseTopkSchemaMatcher,
@@ -14,12 +14,12 @@
 from bdikit.models import ColumnEmbedder
 
 
-class EmbeddingSimilarityTopkSchemaMatcher(BaseTopkSchemaMatcher):
+class EmbeddingSimilarity(BaseTopkSchemaMatcher):
     def __init__(self, column_embedder: ColumnEmbedder, metric: str = "cosine"):
         self.api = column_embedder
         self.metric = metric
 
-    def get_recommendations(
+    def get_topk_matches(
         self, source: pd.DataFrame, target: pd.DataFrame, top_k: int = 10
     ) -> List[TopkMatching]:
         """
@@ -54,7 +54,7 @@ def get_recommendations(
         return top_k_results
 
 
-class CLTopkSchemaMatcher(EmbeddingSimilarityTopkSchemaMatcher):
+class ContrastiveLearning(EmbeddingSimilarity):
     def __init__(self, model_name: str = DEFAULT_CL_MODEL, metric: str = "cosine"):
         super().__init__(
             column_embedder=ContrastiveLearningAPI(model_name=model_name), metric=metric
 
@@ -1,13 +1,13 @@
 import pandas as pd
 from openai import OpenAI
-from bdikit.schema_matching.one2one.base import BaseSchemaMatcher
+from bdikit.schema_matching.base import BaseOne2oneSchemaMatcher
 
 
-class GPTSchemaMatcher(BaseSchemaMatcher):
+class GPT(BaseOne2oneSchemaMatcher):
     def __init__(self):
         self.client = OpenAI()
 
-    def map(self, source: pd.DataFrame, target: pd.DataFrame):
+    def get_one2one_match(self, source: pd.DataFrame, target: pd.DataFrame):
         target_columns = target.columns
         labels = ", ".join(target_columns)
         candidate_columns = source.columns
 
@@ -1,13 +1,8 @@
 import pandas as pd
 from typing import Dict, Any, List
 from magneto import Magneto as Magneto_Lib
-from bdikit.schema_matching.one2one.base import BaseSchemaMatcher
 from bdikit.download import get_cached_model_or_download
-from bdikit.schema_matching.topk.base import (
-    ColumnScore,
-    TopkMatching,
-    BaseTopkSchemaMatcher,
-)
+from bdikit.schema_matching.base import ColumnScore, TopkMatching, BaseTopkSchemaMatcher
 
 DEFAULT_MAGNETO_MODEL = "magneto-gdc-v0.1"
 
@@ -18,7 +13,7 @@ def __init__(self, kwargs: Dict[str, Any] = None):
             kwargs = {}
         self.magneto = Magneto_Lib(**kwargs)
 
-    def get_recommendations(
+    def get_topk_matches(
         self, source: pd.DataFrame, target: pd.DataFrame, top_k: int
     ) -> List[TopkMatching]:
         self.magneto.params["topk"] = (