Skip to content

Commit 8b0dd77

Browse files
authored
Merge pull request #92 from IBM/merge-on-few-columns
add_merging_on_multiple_columns
2 parents faf883b + 08958a5 commit 8b0dd77

File tree

5 files changed

+232
-68
lines changed

5 files changed

+232
-68
lines changed

docs/source/frame.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -193,7 +193,7 @@ join
193193

194194
merge
195195
-----
196-
.. automethod:: nzpyida.join_tables.merge
196+
.. automethod:: IdaDataFrame.merge
197197

198198
concat
199199
------

nzpyida/base.py

-4
Original file line numberDiff line numberDiff line change
@@ -1666,10 +1666,6 @@ def append(self, idadf, df, maxnrow=None):
16661666

16671667
idadf._reset_attributes(['shape', 'axes', 'dtypes', 'index'])
16681668

1669-
def merge(self, idadf, other, key):
1670-
# TODO:
1671-
pass
1672-
16731669
###############################################################################
16741670
#### Connection management
16751671
###############################################################################

nzpyida/frame.py

+69
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
import warnings
2727
from numbers import Number
2828
from collections import OrderedDict
29+
from typing import List
2930

3031
import numpy as np
3132
import pandas as pd
@@ -1300,6 +1301,74 @@ def join(self, other, on: str=None, how: str='left', lsuffix: str='_x', rsuffix:
13001301
return nzpyida.join_tables.merge(self, other, left_index=left_index,
13011302
left_on=on, right_index=True,
13021303
how=how, suffixes=(lsuffix, rsuffix))
1304+
1305+
def merge(self, right, how: str='inner', on=None,
1306+
left_on=None, right_on=None, left_index: bool=False,
1307+
right_index: bool=False, suffixes: List[str]=["_x", "_y"],
1308+
indicator: bool=False):
1309+
"""
1310+
Implement pandas-like interface to merge IdaDataFrames
1311+
1312+
Parameters
1313+
----------
1314+
right : IdaDataFrame
1315+
right IdaDataFrame to merge
1316+
how : str, optional
1317+
type of join, possible types - 'outer', 'inner', 'right', 'left', 'cross'
1318+
on : str or list of str, optional
1319+
name of column in both IdaDataFrames to do join on
1320+
left_on : str or list of str, optional
1321+
name of column in left IdaDataFrame to do join on
1322+
right_on : str or list of str, optional
1323+
name of column in right IdaDataFrame to do join on
1324+
left_index : bool, optional
1325+
whether to join on indexer of left IdaDataFrame
1326+
right_index : bool, optional
1327+
whether to join on indexer of right IdaDataFrame
1328+
suffixes: List[str], optional
1329+
list of suffixes to add to columns that are present in both IdaDataFrames
1330+
indicator: bool, optional
1331+
whether to add to output IdaDataFrame, column with information
1332+
about the source of a given row
1333+
1334+
Returns
1335+
-------
1336+
IdaDataFrame
1337+
result of merging IdaDataFrames
1338+
1339+
Examples
1340+
--------
1341+
>>> from nzpyida.sampledata.iris import iris
1342+
>>> iris2 = iris.copy()
1343+
>>> iris.reset_index()
1344+
>>> iris2.reset_index()
1345+
>>> iris2.columns = ['id', 'sepal_length', 'sepal_width', 'petal_length', \
1346+
'PETAL_WIDTH', 'CLASS']
1347+
>>> iris2['id'] = iris2['id'] + 50
1348+
>>> iris1_ida = idadb.as_idadataframe(iris, 'IRIS_TEST1', indexer='index')
1349+
>>> iris2_ida = idadb.as_idadataframe(iris2, 'IRIS_TEST2', indexer='id')
1350+
>>> iris_merge = iris1_ida.merge(iris2_ida, left_on='index', right_on='id')
1351+
>>> iris_merge.tail()
1352+
index sepal_length_x sepal_width_x petal_length_x petal_width species id sepal_length_y sepal_width_y petal_length_y PETAL_WIDTH CLASS
1353+
95 145 6.7 3.0 5.2 2.3 virginica 145 5.7 3.0 4.2 1.2 versicolor
1354+
96 146 6.3 2.5 5.0 1.9 virginica 146 5.7 2.9 4.2 1.3 versicolor
1355+
97 147 6.5 3.0 5.2 2.0 virginica 147 6.2 2.9 4.3 1.3 versicolor
1356+
98 148 6.2 3.4 5.4 2.3 virginica 148 5.1 2.5 3.0 1.1 versicolor
1357+
99 149 5.9 3.0 5.1 1.8 virginica 149 5.7 2.8 4.1 1.3 versicolor
1358+
>>> len(iris_merge)
1359+
100
1360+
>>> iris_merge = iris1_ida.merge(iris2_ida, left_on='index', right_index=True,
1361+
how='outer', indicator=True, suffixes=("_a", "_b"))
1362+
>>> iris_merge.head()
1363+
index sepal_length_a sepal_width_a petal_length_a petal_width species id sepal_length_b sepal_width_b petal_length_b PETAL_WIDTH CLASS INDICATOR
1364+
0 None None None None None None 152 7.1 3.0 5.9 2.1 virginica right_only
1365+
1 None None None None None None 151 5.8 2.7 5.1 1.9 virginica right_only
1366+
2 None None None None None None 154 6.5 3.0 5.8 2.2 virginica right_only
1367+
3 None None None None None None 153 6.3 2.9 5.6 1.8 virginica right_only
1368+
4 None None None None None None 150 6.3 3.3 6.0 2.5 virginica right_only
1369+
"""
1370+
return nzpyida.merge(self, right, how, on, left_on, right_on,
1371+
left_index, right_index, suffixes, indicator)
13031372

13041373

13051374
# TODO : implement NULL FIRST option

nzpyida/join_tables.py

+109-60
Original file line numberDiff line numberDiff line change
@@ -103,8 +103,8 @@ def concat(objs: List[IdaDataFrame], axis: int=0, join: str='outer', keys: List[
103103
return idadf
104104

105105

106-
def merge(left: IdaDataFrame, right: IdaDataFrame, how: str='inner', on: str=None,
107-
left_on: str=None, right_on: str=None, left_index: bool=False,
106+
def merge(left: IdaDataFrame, right: IdaDataFrame, how: str='inner', on=None,
107+
left_on=None, right_on=None, left_index: bool=False,
108108
right_index: bool=False, suffixes: List[str]=["_x", "_y"],
109109
indicator: bool=False):
110110
"""
@@ -150,7 +150,7 @@ def merge(left: IdaDataFrame, right: IdaDataFrame, how: str='inner', on: str=Non
150150
>>> iris2['id'] = iris2['id'] + 50
151151
>>> iris1_ida = idadb.as_idadataframe(iris, 'IRIS_TEST1', indexer='index')
152152
>>> iris2_ida = idadb.as_idadataframe(iris2, 'IRIS_TEST2', indexer='id')
153-
>>> iris_merge = nzpyida.merge(iris1_ida, iris2_ida)
153+
>>> iris_merge = nzpyida.merge(iris1_ida, iris2_ida, left_on='index', right_on='id')
154154
>>> iris_merge.tail()
155155
index sepal_length_x sepal_width_x petal_length_x petal_width species id sepal_length_y sepal_width_y petal_length_y PETAL_WIDTH CLASS
156156
95 145 6.7 3.0 5.2 2.3 virginica 145 5.7 3.0 4.2 1.2 versicolor
@@ -164,11 +164,11 @@ def merge(left: IdaDataFrame, right: IdaDataFrame, how: str='inner', on: str=Non
164164
how='outer', indicator=True, suffixes=("_a", "_b"))
165165
>>> iris_merge.head()
166166
index sepal_length_a sepal_width_a petal_length_a petal_width species id sepal_length_b sepal_width_b petal_length_b PETAL_WIDTH CLASS INDICATOR
167-
0 None None None None None None 152 7.1 3.0 5.9 2.1 virginica only right
168-
1 None None None None None None 151 5.8 2.7 5.1 1.9 virginica only right
169-
2 None None None None None None 154 6.5 3.0 5.8 2.2 virginica only right
170-
3 None None None None None None 153 6.3 2.9 5.6 1.8 virginica only right
171-
4 None None None None None None 150 6.3 3.3 6.0 2.5 virginica only right
167+
0 None None None None None None 152 7.1 3.0 5.9 2.1 virginica right_only
168+
1 None None None None None None 151 5.8 2.7 5.1 1.9 virginica right_only
169+
2 None None None None None None 154 6.5 3.0 5.8 2.2 virginica right_only
170+
3 None None None None None None 153 6.3 2.9 5.6 1.8 virginica right_only
171+
4 None None None None None None 150 6.3 3.3 6.0 2.5 virginica right_only
172172
"""
173173
idadb = left._idadb
174174
suffixes = [suffix if suffix else "" for suffix in suffixes ]
@@ -181,23 +181,61 @@ def merge(left: IdaDataFrame, right: IdaDataFrame, how: str='inner', on: str=Non
181181
if how not in available_join_types:
182182
raise ValueError(f"Invalid value in 'how', should be one of: {available_join_types}")
183183

184+
185+
if not on and not any([left_on, right_on, left_index, right_index]):
186+
if how != "cross":
187+
on = common_columns
188+
if not on:
189+
raise ValueError("No common columns to perform merge on.",
190+
"Merge options: left_on=None, right_on=None,",
191+
"left_index=False, right_index=False")
184192
if on:
185-
if on and not on in left.columns:
186-
raise KeyError(f"No column {on} in {left.name} dataframe")
187-
if on and not on in right.columns:
188-
raise KeyError(f"No column {on} in {right.name} dataframe")
189-
on_query = f" using ({q(on)})"
190-
common_columns.remove(on)
191-
left_indexer = on
192-
right_indexer = on
193+
if isinstance(on, str):
194+
if not on in left.columns:
195+
raise KeyError(f"No column {on} in {left.name} dataframe")
196+
if not on in right.columns:
197+
raise KeyError(f"No column {on} in {right.name} dataframe")
198+
on_query = f" using ({q(on)})"
199+
left_indexer = on
200+
right_indexer = on
201+
on = [on]
202+
else:
203+
if not all(on_col in left.columns for on_col in on):
204+
raise KeyError(f"Not all on columns {on} in {left.name} dataframe")
205+
if not all(on_col in right.columns for on_col in on):
206+
raise KeyError(f"Not all on columns {on} in {right.name} dataframe")
207+
if len(on) == 1:
208+
on_query = f" using ({q(on[0])})"
209+
else:
210+
on_queries = [f"left_table.{q(on_col)} = right_table.{q(on_col)}"
211+
for on_col in on]
212+
on_query = " on " + " and ".join(on_queries)
213+
214+
193215

194216
elif any([left_on, right_on, left_index, right_index]):
195217
if how == "cross":
196218
raise ValueError("Can not pass on, right_on, left_on or set right_index=True or left_index=True")
197-
if left_on and not left_on in left.columns:
198-
raise KeyError(f"No column {left_on} in {left.name} dataframe")
199-
if right_on and not right_on in right.columns:
200-
raise KeyError(f"No column {right_on} in {right.name} dataframe")
219+
if left_on:
220+
if isinstance(left_on, str):
221+
if not left_on in left.columns:
222+
raise KeyError(f"No column {left_on} in {left.name} dataframe")
223+
left_on = [left_on]
224+
elif not all(left_on_col in left.columns for left_on_col in left_on):
225+
raise KeyError(f"Not all columns {left_on} in {left.name} dataframe")
226+
if right_on:
227+
if isinstance(right_on, str):
228+
if not right_on in right.columns:
229+
raise KeyError(f"No column {right_on} in {right.name} dataframe")
230+
right_on = [right_on]
231+
elif not all(right_on_col in right.columns for right_on_col in right_on):
232+
raise KeyError(f"Not all columns {right_on} in {right.name} dataframe")
233+
if left_on and len(left_on) > 1:
234+
if not right_on or len(right_on) != len(left_on):
235+
raise ValueError("len(right_on) must equal len(left_on)")
236+
if right_on and len(right_on) > 1:
237+
if not left_on or len(left_on) != len(right_on):
238+
raise ValueError("len(right_on) must equal len(left_on)")
201239
if (left_on or left_index) and not (right_on or right_index):
202240
raise ValueError('Must pass "right_on" OR "right_index"')
203241
if (right_on or right_index) and not (left_on or left_index):
@@ -210,38 +248,46 @@ def merge(left: IdaDataFrame, right: IdaDataFrame, how: str='inner', on: str=Non
210248
raise ValueError(f"'left_index' set to True, but {left.name} has no indexer")
211249
if right_index and not right.indexer:
212250
raise ValueError(f"'right_index' set to True, but {right.name} has no indexer")
213-
214-
left_indexer = left_on if left_on else left.indexer
215-
right_indexer = right_on if right_on else right.indexer
216-
if left_indexer == right_indexer:
217-
on = left_indexer
218-
on_query = f" using ({q(on)})"
219-
common_columns.remove(on)
251+
if not left_on or len(left_on) == 1:
252+
left_indexer = left_on[0] if left_on else left.indexer
253+
right_indexer = right_on[0] if right_on else right.indexer
254+
if left_indexer == right_indexer:
255+
on = left_indexer
256+
on_query = f" using ({q(on)})"
257+
on = [on]
258+
else:
259+
on_query = f" on left_table.{q(left_indexer)} = right_table.{q(right_indexer)}"
220260
else:
221-
on_query = f" on {left.name}.{q(left_indexer)} = {right.name}.{q(right_indexer)}"
222-
else:
223-
if how != "cross":
224-
on_columns = [f"{left.name}.{q(col)} = {right.name}.{q(col)}"
225-
for col in common_columns]
226-
on_query = f" on {' AND '.join(on_columns)}"
261+
on_queries = [f"left_table.{q(left_on[i])} = right_table.{q(right_on[i])}"
262+
for i in range(len(left_on))]
263+
on_query = " on " + " and ".join(on_queries)
227264

228-
if common_columns:
229-
lcols = [f"{left.name}.{q(lcol)}" if lcol not in common_columns
230-
else f"{left.name}.{q(lcol)} AS {q(lcol + suffixes[0])}"
231-
for lcol in left.columns]
232-
rcols = [f"{right.name}.{q(rcol)}" if rcol not in common_columns
233-
else f"{right.name}.{q(rcol)} AS {q(rcol + suffixes[1])}"
234-
for rcol in right.columns]
265+
lcols = [f"left_table.{q(lcol)}" if lcol not in common_columns
266+
else f"left_table.{q(lcol)} AS {q(lcol + suffixes[0])}"
267+
for lcol in left.columns]
268+
rcols = [f"right_table.{q(rcol)}" if rcol not in common_columns
269+
else f"right_table.{q(rcol)} AS {q(rcol + suffixes[1])}"
270+
for rcol in right.columns]
271+
nvl_statement = ""
272+
if on and len(on) == 1:
273+
lcols.remove(f"left_table.{q(on[0])} AS {q(on[0] + suffixes[0])}")
274+
rcols.remove(f"right_table.{q(on[0])} AS {q(on[0] + suffixes[1])}")
275+
all_cols = [q(on[0])] + lcols + rcols
276+
else:
235277
if on:
236-
lcols.remove(f"{left.name}.{q(on)}")
237-
rcols.remove(f"{right.name}.{q(on)}")
238-
all_cols = [q(on)] + lcols + rcols
239-
else:
240-
all_cols = lcols + rcols
241-
cols = ", ".join(all_cols)
242-
else:
243-
cols = "*"
244-
278+
for on_col in on:
279+
lcols.remove(f"left_table.{q(on_col)} AS {q(on_col + suffixes[0])}")
280+
rcols.remove(f"right_table.{q(on_col)} AS {q(on_col + suffixes[1])}")
281+
nvl_statement += f" nvl(left_table.{q(on_col)},right_table.{q(on_col)}) AS {q(on_col)}, "
282+
elif left_on and len(left_on) > 1:
283+
for i in range(len(left_on)):
284+
if left_on[i] == right_on[i]:
285+
lcols.remove(f"left_table.{q(left_on[i])} AS {q(left_on[i] + suffixes[0])}")
286+
rcols.remove(f"right_table.{q(right_on[i])} AS {q(right_on[i] + suffixes[1])}")
287+
nvl_statement += f" nvl(left_table.{q(left_on[i])},right_table.{q(right_on[i])}) AS {q(left_on[i])}, "
288+
all_cols = lcols + rcols
289+
cols = ", ".join(all_cols)
290+
245291
join_type = {
246292
"inner": "inner",
247293
"left": "left outer",
@@ -250,19 +296,22 @@ def merge(left: IdaDataFrame, right: IdaDataFrame, how: str='inner', on: str=Non
250296
"cross": "cross"
251297
}
252298
if indicator:
253-
case_statement = f", case when {left.name}.{q(left_indexer)} is not null " + \
254-
f"and {right.name}.{q(right_indexer)} is not null " + \
255-
f"then 'both' " + \
256-
f"when {left.name}.{q(left_indexer)} is not null " + \
257-
f"then 'only left' " + \
258-
f"when {right.name}.{q(right_indexer)} is not null " + \
259-
f"then 'only right' " + \
260-
f"else null end as indicator"
261-
299+
case_statement = ", case when t1=1 and t2=1 then 'both' when t1=1 then 'left_only'" + \
300+
"else 'right_only' end as indicator"
301+
select_statement1 = "(select 1 as t1, * from "
302+
select_statement2 = "(select 1 as t2, * from "
303+
as_statement1 = ") as left_table "
304+
as_statement2 = ") as right_table "
262305
else:
263306
case_statement = ""
264-
query = f"select {cols} {case_statement} from {left.internal_state.current_state}" + \
265-
f" {join_type[how]} join {right.internal_state.current_state}" + on_query
307+
select_statement1 = ""
308+
select_statement2 = ""
309+
as_statement1 = " as left_table"
310+
as_statement2 = " as right_table"
311+
query = f"select {nvl_statement} {cols} {case_statement} from" + \
312+
f"{select_statement1} {left.internal_state.current_state}{as_statement1}" + \
313+
f" {join_type[how]} join {select_statement2}" + \
314+
f"{right.internal_state.current_state}{as_statement2}" + on_query
266315

267316
if how == 'right':
268317
idx = right_indexer

0 commit comments

Comments
 (0)