7
7
import panel as pn
8
8
from IPython .display import display , Markdown
9
9
10
- from bdikit .schema_matching .one2one .base import BaseSchemaMatcher
11
- from bdikit .schema_matching .one2one .matcher_factory import SchemaMatchers
12
- from bdikit .schema_matching .topk .base import BaseTopkSchemaMatcher
13
- from bdikit .schema_matching .topk .matcher_factory import TopkMatchers
14
- from bdikit .value_matching .base import BaseValueMatcher , ValueMatch , ValueMatchingResult
15
- from bdikit .value_matching .matcher_factory import ValueMatchers
10
+ from bdikit .schema_matching .base import BaseOne2oneSchemaMatcher , BaseTopkSchemaMatcher
11
+ from bdikit .schema_matching .matcher_factory import (
12
+ get_one2one_schema_matcher ,
13
+ get_topk_schema_matcher ,
14
+ )
15
+ from bdikit .value_matching .base import (
16
+ BaseOne2oneValueMatcher ,
17
+ BaseTopkValueMatcher ,
18
+ ValueMatch ,
19
+ ValueMatchingResult ,
20
+ )
21
+ from bdikit .value_matching .matcher_factory import (
22
+ get_one2one_value_matcher ,
23
+ get_topk_value_matcher ,
24
+ )
16
25
from bdikit .standards .standard_factory import Standards
17
26
18
27
from bdikit .mapping_functions import (
43
52
def match_schema (
44
53
source : pd .DataFrame ,
45
54
target : Union [str , pd .DataFrame ] = "gdc" ,
46
- method : Union [str , BaseSchemaMatcher ] = DEFAULT_SCHEMA_MATCHING_METHOD ,
55
+ method : Union [str , BaseOne2oneSchemaMatcher ] = DEFAULT_SCHEMA_MATCHING_METHOD ,
47
56
method_args : Optional [Dict [str , Any ]] = None ,
48
57
standard_args : Optional [Dict [str , Any ]] = None ,
49
58
) -> pd .DataFrame :
@@ -74,23 +83,22 @@ def match_schema(
74
83
if isinstance (method , str ):
75
84
if method_args is None :
76
85
method_args = {}
77
- matcher_instance = SchemaMatchers . get_matcher (method , ** method_args )
78
- elif isinstance (method , BaseSchemaMatcher ):
86
+ matcher_instance = get_one2one_schema_matcher (method , ** method_args )
87
+ elif isinstance (method , BaseOne2oneSchemaMatcher ):
79
88
matcher_instance = method
80
89
else :
81
90
raise ValueError (
82
91
"The method must be a string or an instance of BaseColumnMappingAlgorithm"
83
92
)
84
93
85
- matches = matcher_instance .map (source , target_table )
94
+ matches = matcher_instance .get_one2one_match (source , target_table )
86
95
87
96
return pd .DataFrame (matches .items (), columns = ["source" , "target" ])
88
97
89
98
90
99
def _load_table_for_standard (name : str , standard_args : Dict [str , Any ]) -> pd .DataFrame :
91
100
"""
92
- Load the table for the given standard data vocabulary. Currently, only the
93
- GDC standard is supported.
101
+ Load the table for the given standard data vocabulary.
94
102
"""
95
103
if standard_args is None :
96
104
standard_args = {}
@@ -138,15 +146,15 @@ def top_matches(
138
146
if isinstance (method , str ):
139
147
if method_args is None :
140
148
method_args = {}
141
- topk_matcher = TopkMatchers . get_matcher (method , ** method_args )
149
+ topk_matcher = get_topk_schema_matcher (method , ** method_args )
142
150
elif isinstance (method , BaseTopkSchemaMatcher ):
143
151
topk_matcher = method
144
152
else :
145
153
raise ValueError (
146
154
"The method must be a string or an instance of BaseTopkColumnMatcher"
147
155
)
148
156
149
- top_k_matches = topk_matcher .get_recommendations (
157
+ top_k_matches = topk_matcher .get_topk_matches (
150
158
selected_columns , target = target_table , top_k = top_k
151
159
)
152
160
@@ -164,7 +172,7 @@ def match_values(
164
172
source : pd .DataFrame ,
165
173
target : Union [str , pd .DataFrame ],
166
174
column_mapping : Union [Tuple [str , str ], pd .DataFrame ],
167
- method : Union [str , BaseValueMatcher ] = DEFAULT_VALUE_MATCHING_METHOD ,
175
+ method : Union [str , BaseOne2oneValueMatcher ] = DEFAULT_VALUE_MATCHING_METHOD ,
168
176
method_args : Optional [Dict [str , Any ]] = None ,
169
177
standard_args : Optional [Dict [str , Any ]] = None ,
170
178
) -> Union [pd .DataFrame , List [pd .DataFrame ]]:
@@ -206,20 +214,19 @@ def match_values(
206
214
ValueError: If the target is neither a DataFrame nor a standard vocabulary name.
207
215
ValueError: If the source column is not present in the source dataset.
208
216
"""
209
- if method_args is None :
210
- method_args = {}
211
217
212
218
if standard_args is None :
213
219
standard_args = {}
214
220
215
- if "top_k" in method_args and method_args ["top_k" ] > 1 :
216
- logger .warning (
217
- f"Ignoring 'top_k' argument, use the 'top_value_matches()' method to get top-k value matches."
218
- )
219
- method_args ["top_k" ] = 1
221
+ if isinstance (method , str ):
222
+ if method_args is None :
223
+ method_args = {}
224
+ matcher_instance = get_one2one_value_matcher (method , ** method_args )
225
+ elif isinstance (method , BaseOne2oneValueMatcher ):
226
+ matcher_instance = method
220
227
221
228
matches = _match_values (
222
- source , target , column_mapping , method , method_args , standard_args
229
+ source , target , column_mapping , matcher_instance , standard_args
223
230
)
224
231
225
232
if isinstance (column_mapping , tuple ):
@@ -240,7 +247,7 @@ def top_value_matches(
240
247
target : Union [str , pd .DataFrame ],
241
248
column_mapping : Union [Tuple [str , str ], pd .DataFrame ],
242
249
top_k : int = 5 ,
243
- method : str = DEFAULT_VALUE_MATCHING_METHOD ,
250
+ method : Union [ str , BaseTopkValueMatcher ] = DEFAULT_VALUE_MATCHING_METHOD ,
244
251
method_args : Optional [Dict [str , Any ]] = None ,
245
252
standard_args : Optional [Dict [str , Any ]] = None ,
246
253
) -> List [pd .DataFrame ]:
@@ -283,21 +290,19 @@ def top_value_matches(
283
290
ValueError: If the target is neither a DataFrame nor a standard vocabulary name.
284
291
ValueError: If the source column is not present in the source dataset.
285
292
"""
286
- if method_args is None :
287
- method_args = {}
288
293
289
294
if standard_args is None :
290
295
standard_args = {}
291
296
292
- if "top_k" in method_args :
293
- logger . warning (
294
- f"Ignoring 'top_k' argument, using top_k argument instead (top_k= { top_k } )"
295
- )
296
-
297
- method_args [ "top_k" ] = top_k
297
+ if isinstance ( method , str ) :
298
+ if method_args is None :
299
+ method_args = {}
300
+ matcher_instance = get_topk_value_matcher ( method , ** method_args )
301
+ elif isinstance ( method , BaseTopkValueMatcher ):
302
+ matcher_instance = method
298
303
299
304
matches = _match_values (
300
- source , target , column_mapping , method , method_args , standard_args
305
+ source , target , column_mapping , matcher_instance , standard_args , top_k
301
306
)
302
307
303
308
match_list = []
@@ -358,15 +363,15 @@ def _match_values(
358
363
source : pd .DataFrame ,
359
364
target : Union [str , pd .DataFrame ],
360
365
column_mapping : Union [Tuple [str , str ], pd .DataFrame ],
361
- method : str ,
362
- method_args : Dict [str , Any ],
366
+ value_matcher : Union [BaseOne2oneValueMatcher , BaseTopkValueMatcher ],
363
367
standard_args : Dict [str , Any ],
368
+ top_k : int = 1 ,
364
369
) -> List [pd .DataFrame ]:
365
370
366
371
target_domain , column_mapping_list = _format_value_matching_input (
367
372
source , target , column_mapping , standard_args
368
373
)
369
- value_matcher = ValueMatchers . get_matcher ( method , ** method_args )
374
+
370
375
mapping_results : List [ValueMatchingResult ] = []
371
376
372
377
for mapping in column_mapping_list :
@@ -388,9 +393,14 @@ def _match_values(
388
393
}
389
394
390
395
# 3. Apply the value matcher to create value mapping dictionaries
391
- raw_matches = value_matcher .match (
392
- list (source_values_dict .keys ()), list (target_values_dict .keys ())
393
- )
396
+ if isinstance (value_matcher , BaseTopkValueMatcher ):
397
+ raw_matches = value_matcher .get_topk_matches (
398
+ list (source_values_dict .keys ()), list (target_values_dict .keys ()), top_k
399
+ )
400
+ else :
401
+ raw_matches = value_matcher .get_one2one_match (
402
+ list (source_values_dict .keys ()), list (target_values_dict .keys ())
403
+ )
394
404
395
405
# 4. Transform the matches to the original
396
406
matches : List [ValueMatch ] = []
0 commit comments