VIDA-NYU
diff --git a/‎CONTRIBUTING.md
+2-2 b/‎CONTRIBUTING.md
+2-2
diff --git a/‎bdikit/api.py
+38-29 b/‎bdikit/api.py
+38-29
diff --git a/‎bdikit/schema_matching/matcher_factory.py
+2-20 b/‎bdikit/schema_matching/matcher_factory.py
+2-20
diff --git a/‎bdikit/schema_matching/maxvalsim.py
+9-7 b/‎bdikit/schema_matching/maxvalsim.py
+9-7
diff --git a/‎bdikit/utils.py
+20 b/‎bdikit/utils.py
+20
diff --git a/‎bdikit/value_matching/base.py
+16-2 b/‎bdikit/value_matching/base.py
+16-2
diff --git a/‎bdikit/value_matching/gpt.py
+3-3 b/‎bdikit/value_matching/gpt.py
+3-3
@@ -37,9 +37,9 @@ Contributors can add new methods for schema and value matching by following thes
 
 1. Create a Python module inside the "task folder" folder (e.g., `bdikit/value_matching`).
 
-2. Define a class in the module that implements either `BaseValueMatcher` (for value matching) or `BaseSchemaMatcher` (for schema matching).
+2. Define a class in the module that extends one of the base classes: `BaseOne2oneValueMatcher` or `BaseTopkValueMatcher` for value matching, or `BaseOne2oneSchemaMatcher` or `BaseTopkSchemaMatcher` for schema matching.
 
-3. Add a new entry to the Enum class (e.g. `ValueMatchers`) in `matcher_factory.py` (e.g., `bdikit/value_matching/matcher_factory.py`). 
+3. Add a new entry to the Enum class (e.g. `One2OneValueMatchers`) in `matcher_factory.py` (e.g., `bdikit/value_matching/matcher_factory.py`). 
 Make sure to add the correct import path for your module to ensure it can be accessed without errors.
 
 
 
@@ -12,8 +12,16 @@
     get_one2one_schema_matcher,
     get_topk_schema_matcher,
 )
-from bdikit.value_matching.base import BaseValueMatcher, ValueMatch, ValueMatchingResult
-from bdikit.value_matching.matcher_factory import ValueMatchers
+from bdikit.value_matching.base import (
+    BaseOne2oneValueMatcher,
+    BaseTopkValueMatcher,
+    ValueMatch,
+    ValueMatchingResult,
+)
+from bdikit.value_matching.matcher_factory import (
+    get_one2one_value_matcher,
+    get_topk_value_matcher,
+)
 from bdikit.standards.standard_factory import Standards
 
 from bdikit.mapping_functions import (
@@ -90,8 +98,7 @@ def match_schema(
 
 def _load_table_for_standard(name: str, standard_args: Dict[str, Any]) -> pd.DataFrame:
     """
-    Load the table for the given standard data vocabulary. Currently, only the
-    GDC standard is supported.
+    Load the table for the given standard data vocabulary.
     """
     if standard_args is None:
         standard_args = {}
@@ -165,7 +172,7 @@ def match_values(
     source: pd.DataFrame,
     target: Union[str, pd.DataFrame],
     column_mapping: Union[Tuple[str, str], pd.DataFrame],
-    method: Union[str, BaseValueMatcher] = DEFAULT_VALUE_MATCHING_METHOD,
+    method: Union[str, BaseOne2oneValueMatcher] = DEFAULT_VALUE_MATCHING_METHOD,
     method_args: Optional[Dict[str, Any]] = None,
     standard_args: Optional[Dict[str, Any]] = None,
 ) -> Union[pd.DataFrame, List[pd.DataFrame]]:
@@ -207,20 +214,19 @@ def match_values(
         ValueError: If the target is neither a DataFrame nor a standard vocabulary name.
         ValueError: If the source column is not present in the source dataset.
     """
-    if method_args is None:
-        method_args = {}
 
     if standard_args is None:
         standard_args = {}
 
-    if "top_k" in method_args and method_args["top_k"] > 1:
-        logger.warning(
-            f"Ignoring 'top_k' argument, use the 'top_value_matches()' method to get top-k value matches."
-        )
-        method_args["top_k"] = 1
+    # Resolve `method` to a matcher instance: a string is handed to the
+    # `get_one2one_value_matcher` factory together with `method_args`; an
+    # existing BaseOne2oneValueMatcher instance is used as-is.
+    if isinstance(method, str):
+        if method_args is None:
+            method_args = {}
+        matcher_instance = get_one2one_value_matcher(method, **method_args)
+    elif isinstance(method, BaseOne2oneValueMatcher):
+        matcher_instance = method
+    else:
+        # Without this branch `matcher_instance` would be unbound below and
+        # the caller would get an UnboundLocalError instead of a clear message.
+        raise ValueError(
+            "The method must be a string or an instance of "
+            f"BaseOne2oneValueMatcher, but got {type(method).__name__}"
+        )
 
     matches = _match_values(
-        source, target, column_mapping, method, method_args, standard_args
+        source, target, column_mapping, matcher_instance, standard_args
     )
 
     if isinstance(column_mapping, tuple):
@@ -241,7 +247,7 @@ def top_value_matches(
     target: Union[str, pd.DataFrame],
     column_mapping: Union[Tuple[str, str], pd.DataFrame],
     top_k: int = 5,
-    method: str = DEFAULT_VALUE_MATCHING_METHOD,
+    method: Union[str, BaseTopkValueMatcher] = DEFAULT_VALUE_MATCHING_METHOD,
     method_args: Optional[Dict[str, Any]] = None,
     standard_args: Optional[Dict[str, Any]] = None,
 ) -> List[pd.DataFrame]:
@@ -284,21 +290,19 @@ def top_value_matches(
         ValueError: If the target is neither a DataFrame nor a standard vocabulary name.
         ValueError: If the source column is not present in the source dataset.
     """
-    if method_args is None:
-        method_args = {}
 
     if standard_args is None:
         standard_args = {}
 
-    if "top_k" in method_args:
-        logger.warning(
-            f"Ignoring 'top_k' argument, using top_k argument instead (top_k={top_k})"
-        )
-
-    method_args["top_k"] = top_k
+    # Resolve `method` to a matcher instance: a string is handed to the
+    # `get_topk_value_matcher` factory together with `method_args`; an
+    # existing BaseTopkValueMatcher instance is used as-is.
+    if isinstance(method, str):
+        if method_args is None:
+            method_args = {}
+        matcher_instance = get_topk_value_matcher(method, **method_args)
+    elif isinstance(method, BaseTopkValueMatcher):
+        matcher_instance = method
+    else:
+        # Without this branch `matcher_instance` would be unbound below and
+        # the caller would get an UnboundLocalError instead of a clear message.
+        raise ValueError(
+            "The method must be a string or an instance of "
+            f"BaseTopkValueMatcher, but got {type(method).__name__}"
+        )
 
     matches = _match_values(
-        source, target, column_mapping, method, method_args, standard_args
+        source, target, column_mapping, matcher_instance, standard_args, top_k
     )
 
     match_list = []
@@ -359,15 +363,15 @@ def _match_values(
     source: pd.DataFrame,
     target: Union[str, pd.DataFrame],
     column_mapping: Union[Tuple[str, str], pd.DataFrame],
-    method: str,
-    method_args: Dict[str, Any],
+    value_matcher: Union[BaseOne2oneValueMatcher, BaseTopkValueMatcher],
     standard_args: Dict[str, Any],
+    top_k: int = 1,
 ) -> List[pd.DataFrame]:
 
     target_domain, column_mapping_list = _format_value_matching_input(
         source, target, column_mapping, standard_args
     )
-    value_matcher = ValueMatchers.get_matcher(method, **method_args)
+
     mapping_results: List[ValueMatchingResult] = []
 
     for mapping in column_mapping_list:
@@ -389,9 +393,14 @@ def _match_values(
         }
 
         # 3. Apply the value matcher to create value mapping dictionaries
-        raw_matches = value_matcher.match(
-            list(source_values_dict.keys()), list(target_values_dict.keys())
-        )
+        if isinstance(value_matcher, BaseTopkValueMatcher):
+            raw_matches = value_matcher.get_topk_matches(
+                list(source_values_dict.keys()), list(target_values_dict.keys()), top_k
+            )
+        else:
+            raw_matches = value_matcher.get_one2one_match(
+                list(source_values_dict.keys()), list(target_values_dict.keys())
+            )
 
         # 4. Transform the matches to the original
         matches: List[ValueMatch] = []
 
@@ -1,7 +1,7 @@
-import importlib
 from enum import Enum
-from typing import Mapping, Dict, Any
+from typing import Mapping, Any
 from bdikit.schema_matching.base import BaseOne2oneSchemaMatcher, BaseTopkSchemaMatcher
+from bdikit.utils import create_matcher
 
 
 class One2oneSchemaMatchers(Enum):
@@ -82,24 +82,6 @@ def __init__(self, matcher_name: str, matcher_path: str):
 one2one_schema_matchers.update(topk_schema_matchers)
 
 
-def create_matcher(
-    matcher_name: str,
-    available_matchers: Dict[str, str],
-    **matcher_kwargs: Mapping[str, Any],
-):
-    if matcher_name not in available_matchers:
-        names = ", ".join(list(available_matchers.keys()))
-        raise ValueError(
-            f"The {matcher_name} algorithm is not supported. "
-            f"Supported algorithms are: {names}"
-        )
-    # Load the class dynamically
-    module_path, class_name = available_matchers[matcher_name].rsplit(".", 1)
-    module = importlib.import_module(module_path)
-
-    return getattr(module, class_name)(**matcher_kwargs)
-
-
 def get_one2one_schema_matcher(
     matcher_name: str, **matcher_kwargs: Mapping[str, Any]
 ) -> BaseOne2oneSchemaMatcher:
 
@@ -7,8 +7,8 @@
     ColumnScore,
 )
 from bdikit.schema_matching.contrastivelearning import ContrastiveLearning
-from bdikit.value_matching.polyfuzz import TFIDFValueMatcher
-from bdikit.value_matching.base import BaseValueMatcher
+from bdikit.value_matching.polyfuzz import TFIDF
+from bdikit.value_matching.base import BaseOne2oneValueMatcher
 
 
 class MaxValSim(BaseTopkSchemaMatcher):
@@ -17,7 +17,7 @@ def __init__(
         top_k: int = 20,
         contribution_factor: float = 0.5,
         top_k_matcher: Optional[BaseTopkSchemaMatcher] = None,
-        value_matcher: Optional[BaseValueMatcher] = None,
+        value_matcher: Optional[BaseOne2oneValueMatcher] = None,
     ):
         if top_k_matcher is None:
             self.api = ContrastiveLearning(DEFAULT_CL_MODEL)
@@ -30,13 +30,13 @@ def __init__(
             )
 
         if value_matcher is None:
-            self.value_matcher = TFIDFValueMatcher()
-        elif isinstance(value_matcher, BaseValueMatcher):
+            self.value_matcher = TFIDF()
+        elif isinstance(value_matcher, BaseOne2oneValueMatcher):
             self.value_matcher = value_matcher
         else:
             raise ValueError(
                 f"Invalid value_matcher type: {type(value_matcher)}. "
-                "Must be a subclass of {BaseValueMatcher.__name__}"
+                # Each concatenated fragment needs its own f-prefix; without it
+                # the braces and the expression are emitted literally.
+                f"Must be a subclass of {BaseOne2oneValueMatcher.__name__}"
             )
 
         self.top_k = top_k
@@ -76,7 +76,9 @@ def get_topk_matches(
                 target_column_name = top_column.column_name
                 target_column = target[target_column_name]
                 target_values = self.unique_string_values(target_column).to_list()
-                value_matches = self.value_matcher.match(source_values, target_values)
+                value_matches = self.value_matcher.get_one2one_match(
+                    source_values, target_values
+                )
                 if len(target_values) == 0:
                     value_score = 0.0
                 else:
 
@@ -1,7 +1,9 @@
 import os
 import hashlib
+import importlib
 import pandas as pd
 from os.path import join, dirname, isfile
+from typing import Mapping, Dict, Any
 from bdikit.download import BDIKIT_EMBEDDINGS_CACHE_DIR
 
 
@@ -58,3 +60,21 @@ def check_embedding_cache(table: pd.DataFrame, model_path: str):
                 embeddings = None
 
     return embedding_file, embeddings
+
+
+def create_matcher(
+    matcher_name: str,
+    available_matchers: Dict[str, str],
+    **matcher_kwargs: Any,
+) -> Any:
+    """
+    Instantiate the matcher registered under `matcher_name`.
+
+    Args:
+        matcher_name: Key to look up in `available_matchers`.
+        available_matchers: Maps each supported name to the dotted path of
+            its class (module path and class name separated by the last ".").
+        **matcher_kwargs: Keyword arguments forwarded to the class constructor.
+
+    Returns:
+        A new instance of the class found at the registered path.
+
+    Raises:
+        ValueError: If `matcher_name` is not a key of `available_matchers`.
+    """
+    if matcher_name not in available_matchers:
+        # Iterating a dict yields its keys, so no intermediate list is needed.
+        names = ", ".join(available_matchers)
+        raise ValueError(
+            f"The {matcher_name} algorithm is not supported. "
+            f"Supported algorithms are: {names}"
+        )
+    # Import the module at call time and look the class up by name.
+    module_path, class_name = available_matchers[matcher_name].rsplit(".", 1)
+    module = importlib.import_module(module_path)
+
+    return getattr(module, class_name)(**matcher_kwargs)
@@ -25,13 +25,27 @@ class ValueMatchingResult(TypedDict):
     unmatch_values: Set[str]
 
 
-class BaseValueMatcher:
+class BaseOne2oneValueMatcher:
     """
     Base class for value matching algorithms, i.e., algorithms that match
     values from a source domain to values from a target domain.
     """
 
-    def match(
+    def get_one2one_match(
         self, source_values: List[str], target_values: List[str]
     ) -> List[ValueMatch]:
+        """
+        Match `source_values` against `target_values` and return the matches.
+
+        This base implementation always raises NotImplementedError; subclasses
+        must override it.
+        """
         raise NotImplementedError("Subclasses must implement this method")
+
+
+class BaseTopkValueMatcher(BaseOne2oneValueMatcher):
+    """
+    Base class for value matchers that expose `get_topk_matches`.
+
+    `get_one2one_match` is derived from it by calling `get_topk_matches`
+    with `top_k` set to 1, so subclasses only need to implement the top-k
+    method.
+    """
+
+    def get_topk_matches(
+        self, source_values: List[str], target_values: List[str], top_k: int
+    ) -> List[ValueMatch]:
+        """Abstract hook: this base implementation always raises."""
+        raise NotImplementedError("Subclasses must implement this method")
+
+    def get_one2one_match(
+        self, source_values: List[str], target_values: List[str]
+    ) -> List[ValueMatch]:
+        """Delegate to `get_topk_matches` with `top_k` fixed to 1."""
+        return self.get_topk_matches(source_values, target_values, 1)
@@ -1,19 +1,19 @@
 import ast
 from typing import List
 from openai import OpenAI
-from bdikit.value_matching.base import BaseValueMatcher, ValueMatch
+from bdikit.value_matching.base import BaseOne2oneValueMatcher, ValueMatch
 from bdikit.config import VALUE_MATCHING_THRESHOLD
 
 
-class GPTValueMatcher(BaseValueMatcher):
+class GPT(BaseOne2oneValueMatcher):
     def __init__(
         self,
         threshold: float = VALUE_MATCHING_THRESHOLD,
     ):
         self.client = OpenAI()
         self.threshold = threshold
 
-    def match(
+    def get_one2one_match(
         self,
         source_values: List[str],
         target_values: List[str],