@@ -103,8 +103,8 @@ def concat(objs: List[IdaDataFrame], axis: int=0, join: str='outer', keys: List[
103
103
return idadf
104
104
105
105
106
- def merge (left : IdaDataFrame , right : IdaDataFrame , how : str = 'inner' , on : str = None ,
107
- left_on : str = None , right_on : str = None , left_index : bool = False ,
106
+ def merge (left : IdaDataFrame , right : IdaDataFrame , how : str = 'inner' , on = None ,
107
+ left_on = None , right_on = None , left_index : bool = False ,
108
108
right_index : bool = False , suffixes : List [str ]= ["_x" , "_y" ],
109
109
indicator : bool = False ):
110
110
"""
@@ -150,7 +150,7 @@ def merge(left: IdaDataFrame, right: IdaDataFrame, how: str='inner', on: str=Non
150
150
>>> iris2['id'] = iris2['id'] + 50
151
151
>>> iris1_ida = idadb.as_idadataframe(iris, 'IRIS_TEST1', indexer='index')
152
152
>>> iris2_ida = idadb.as_idadataframe(iris2, 'IRIS_TEST2', indexer='id')
153
- >>> iris_merge = nzpyida.merge(iris1_ida, iris2_ida)
153
+ >>> iris_merge = nzpyida.merge(iris1_ida, iris2_ida, left_on='index', right_on='id' )
154
154
>>> iris_merge.tail()
155
155
index sepal_length_x sepal_width_x petal_length_x petal_width species id sepal_length_y sepal_width_y petal_length_y PETAL_WIDTH CLASS
156
156
95 145 6.7 3.0 5.2 2.3 virginica 145 5.7 3.0 4.2 1.2 versicolor
@@ -164,11 +164,11 @@ def merge(left: IdaDataFrame, right: IdaDataFrame, how: str='inner', on: str=Non
164
164
how='outer', indicator=True, suffixes=("_a", "_b"))
165
165
>>> iris_merge.head()
166
166
index sepal_length_a sepal_width_a petal_length_a petal_width species id sepal_length_b sepal_width_b petal_length_b PETAL_WIDTH CLASS INDICATOR
167
- 0 None None None None None None 152 7.1 3.0 5.9 2.1 virginica only right
168
- 1 None None None None None None 151 5.8 2.7 5.1 1.9 virginica only right
169
- 2 None None None None None None 154 6.5 3.0 5.8 2.2 virginica only right
170
- 3 None None None None None None 153 6.3 2.9 5.6 1.8 virginica only right
171
- 4 None None None None None None 150 6.3 3.3 6.0 2.5 virginica only right
167
+ 0 None None None None None None 152 7.1 3.0 5.9 2.1 virginica right_only
168
+ 1 None None None None None None 151 5.8 2.7 5.1 1.9 virginica right_only
169
+ 2 None None None None None None 154 6.5 3.0 5.8 2.2 virginica right_only
170
+ 3 None None None None None None 153 6.3 2.9 5.6 1.8 virginica right_only
171
+ 4 None None None None None None 150 6.3 3.3 6.0 2.5 virginica right_only
172
172
"""
173
173
idadb = left ._idadb
174
174
suffixes = [suffix if suffix else "" for suffix in suffixes ]
@@ -181,23 +181,61 @@ def merge(left: IdaDataFrame, right: IdaDataFrame, how: str='inner', on: str=Non
181
181
if how not in available_join_types :
182
182
raise ValueError (f"Invalid value in 'how', should be one of: { available_join_types } " )
183
183
184
+
185
+ if not on and not any ([left_on , right_on , left_index , right_index ]):
186
+ if how != "cross" :
187
+ on = common_columns
188
+ if not on :
189
+ raise ValueError ("No common columns to perform merge on." ,
190
+ "Merge options: left_on=None, right_on=None," ,
191
+ "left_index=False, right_index=False" )
184
192
if on :
185
- if on and not on in left .columns :
186
- raise KeyError (f"No column { on } in { left .name } dataframe" )
187
- if on and not on in right .columns :
188
- raise KeyError (f"No column { on } in { right .name } dataframe" )
189
- on_query = f" using ({ q (on )} )"
190
- common_columns .remove (on )
191
- left_indexer = on
192
- right_indexer = on
193
+ if isinstance (on , str ):
194
+ if not on in left .columns :
195
+ raise KeyError (f"No column { on } in { left .name } dataframe" )
196
+ if not on in right .columns :
197
+ raise KeyError (f"No column { on } in { right .name } dataframe" )
198
+ on_query = f" using ({ q (on )} )"
199
+ left_indexer = on
200
+ right_indexer = on
201
+ on = [on ]
202
+ else :
203
+ if not all (on_col in left .columns for on_col in on ):
204
+ raise KeyError (f"Not all on columns { on } in { left .name } dataframe" )
205
+ if not all (on_col in right .columns for on_col in on ):
206
+ raise KeyError (f"Not all on columns { on } in { right .name } dataframe" )
207
+ if len (on ) == 1 :
208
+ on_query = f" using ({ q (on [0 ])} )"
209
+ else :
210
+ on_queries = [f"left_table.{ q (on_col )} = right_table.{ q (on_col )} "
211
+ for on_col in on ]
212
+ on_query = " on " + " and " .join (on_queries )
213
+
214
+
193
215
194
216
elif any ([left_on , right_on , left_index , right_index ]):
195
217
if how == "cross" :
196
218
raise ValueError ("Can not pass on, right_on, left_on or set right_index=True or left_index=True" )
197
- if left_on and not left_on in left .columns :
198
- raise KeyError (f"No column { left_on } in { left .name } dataframe" )
199
- if right_on and not right_on in right .columns :
200
- raise KeyError (f"No column { right_on } in { right .name } dataframe" )
219
+ if left_on :
220
+ if isinstance (left_on , str ):
221
+ if not left_on in left .columns :
222
+ raise KeyError (f"No column { left_on } in { left .name } dataframe" )
223
+ left_on = [left_on ]
224
+ elif not all (left_on_col in left .columns for left_on_col in left_on ):
225
+ raise KeyError (f"Not all columns { left_on } in { left .name } dataframe" )
226
+ if right_on :
227
+ if isinstance (right_on , str ):
228
+ if not right_on in right .columns :
229
+ raise KeyError (f"No column { right_on } in { right .name } dataframe" )
230
+ right_on = [right_on ]
231
+ elif not all (right_on_col in right .columns for right_on_col in right_on ):
232
+ raise KeyError (f"Not all columns { right_on } in { right .name } dataframe" )
233
+ if left_on and len (left_on ) > 1 :
234
+ if not right_on or len (right_on ) != len (left_on ):
235
+ raise ValueError ("len(right_on) must equal len(left_on)" )
236
+ if right_on and len (right_on ) > 1 :
237
+ if not left_on or len (left_on ) != len (right_on ):
238
+ raise ValueError ("len(right_on) must equal len(left_on)" )
201
239
if (left_on or left_index ) and not (right_on or right_index ):
202
240
raise ValueError ('Must pass "right_on" OR "right_index"' )
203
241
if (right_on or right_index ) and not (left_on or left_index ):
@@ -210,38 +248,46 @@ def merge(left: IdaDataFrame, right: IdaDataFrame, how: str='inner', on: str=Non
210
248
raise ValueError (f"'left_index' set to True, but { left .name } has no indexer" )
211
249
if right_index and not right .indexer :
212
250
raise ValueError (f"'right_index' set to True, but { right .name } has no indexer" )
213
-
214
- left_indexer = left_on if left_on else left .indexer
215
- right_indexer = right_on if right_on else right .indexer
216
- if left_indexer == right_indexer :
217
- on = left_indexer
218
- on_query = f" using ({ q (on )} )"
219
- common_columns .remove (on )
251
+ if not left_on or len (left_on ) == 1 :
252
+ left_indexer = left_on [0 ] if left_on else left .indexer
253
+ right_indexer = right_on [0 ] if right_on else right .indexer
254
+ if left_indexer == right_indexer :
255
+ on = left_indexer
256
+ on_query = f" using ({ q (on )} )"
257
+ on = [on ]
258
+ else :
259
+ on_query = f" on left_table.{ q (left_indexer )} = right_table.{ q (right_indexer )} "
220
260
else :
221
- on_query = f" on { left .name } .{ q (left_indexer )} = { right .name } .{ q (right_indexer )} "
222
- else :
223
- if how != "cross" :
224
- on_columns = [f"{ left .name } .{ q (col )} = { right .name } .{ q (col )} "
225
- for col in common_columns ]
226
- on_query = f" on { ' AND ' .join (on_columns )} "
261
+ on_queries = [f"left_table.{ q (left_on [i ])} = right_table.{ q (right_on [i ])} "
262
+ for i in range (len (left_on ))]
263
+ on_query = " on " + " and " .join (on_queries )
227
264
228
- if common_columns :
229
- lcols = [f"{ left .name } .{ q (lcol )} " if lcol not in common_columns
230
- else f"{ left .name } .{ q (lcol )} AS { q (lcol + suffixes [0 ])} "
231
- for lcol in left .columns ]
232
- rcols = [f"{ right .name } .{ q (rcol )} " if rcol not in common_columns
233
- else f"{ right .name } .{ q (rcol )} AS { q (rcol + suffixes [1 ])} "
234
- for rcol in right .columns ]
265
+ lcols = [f"left_table.{ q (lcol )} " if lcol not in common_columns
266
+ else f"left_table.{ q (lcol )} AS { q (lcol + suffixes [0 ])} "
267
+ for lcol in left .columns ]
268
+ rcols = [f"right_table.{ q (rcol )} " if rcol not in common_columns
269
+ else f"right_table.{ q (rcol )} AS { q (rcol + suffixes [1 ])} "
270
+ for rcol in right .columns ]
271
+ nvl_statement = ""
272
+ if on and len (on ) == 1 :
273
+ lcols .remove (f"left_table.{ q (on [0 ])} AS { q (on [0 ] + suffixes [0 ])} " )
274
+ rcols .remove (f"right_table.{ q (on [0 ])} AS { q (on [0 ] + suffixes [1 ])} " )
275
+ all_cols = [q (on [0 ])] + lcols + rcols
276
+ else :
235
277
if on :
236
- lcols .remove (f"{ left .name } .{ q (on )} " )
237
- rcols .remove (f"{ right .name } .{ q (on )} " )
238
- all_cols = [q (on )] + lcols + rcols
239
- else :
240
- all_cols = lcols + rcols
241
- cols = ", " .join (all_cols )
242
- else :
243
- cols = "*"
244
-
278
+ for on_col in on :
279
+ lcols .remove (f"left_table.{ q (on_col )} AS { q (on_col + suffixes [0 ])} " )
280
+ rcols .remove (f"right_table.{ q (on_col )} AS { q (on_col + suffixes [1 ])} " )
281
+ nvl_statement += f" nvl(left_table.{ q (on_col )} ,right_table.{ q (on_col )} ) AS { q (on_col )} , "
282
+ elif left_on and len (left_on ) > 1 :
283
+ for i in range (len (left_on )):
284
+ if left_on [i ] == right_on [i ]:
285
+ lcols .remove (f"left_table.{ q (left_on [i ])} AS { q (left_on [i ] + suffixes [0 ])} " )
286
+ rcols .remove (f"right_table.{ q (right_on [i ])} AS { q (right_on [i ] + suffixes [1 ])} " )
287
+ nvl_statement += f" nvl(left_table.{ q (left_on [i ])} ,right_table.{ q (right_on [i ])} ) AS { q (left_on [i ])} , "
288
+ all_cols = lcols + rcols
289
+ cols = ", " .join (all_cols )
290
+
245
291
join_type = {
246
292
"inner" : "inner" ,
247
293
"left" : "left outer" ,
@@ -250,19 +296,22 @@ def merge(left: IdaDataFrame, right: IdaDataFrame, how: str='inner', on: str=Non
250
296
"cross" : "cross"
251
297
}
252
298
if indicator :
253
- case_statement = f", case when { left .name } .{ q (left_indexer )} is not null " + \
254
- f"and { right .name } .{ q (right_indexer )} is not null " + \
255
- f"then 'both' " + \
256
- f"when { left .name } .{ q (left_indexer )} is not null " + \
257
- f"then 'only left' " + \
258
- f"when { right .name } .{ q (right_indexer )} is not null " + \
259
- f"then 'only right' " + \
260
- f"else null end as indicator"
261
-
299
+ case_statement = ", case when t1=1 and t2=1 then 'both' when t1=1 then 'left_only'" + \
300
+ "else 'right_only' end as indicator"
301
+ select_statement1 = "(select 1 as t1, * from "
302
+ select_statement2 = "(select 1 as t2, * from "
303
+ as_statement1 = ") as left_table "
304
+ as_statement2 = ") as right_table "
262
305
else :
263
306
case_statement = ""
264
- query = f"select { cols } { case_statement } from { left .internal_state .current_state } " + \
265
- f" { join_type [how ]} join { right .internal_state .current_state } " + on_query
307
+ select_statement1 = ""
308
+ select_statement2 = ""
309
+ as_statement1 = " as left_table"
310
+ as_statement2 = " as right_table"
311
+ query = f"select { nvl_statement } { cols } { case_statement } from" + \
312
+ f"{ select_statement1 } { left .internal_state .current_state } { as_statement1 } " + \
313
+ f" { join_type [how ]} join { select_statement2 } " + \
314
+ f"{ right .internal_state .current_state } { as_statement2 } " + on_query
266
315
267
316
if how == 'right' :
268
317
idx = right_indexer
0 commit comments