@@ -35,8 +35,8 @@ def combine_frames(this, *args, how="full"):
35
35
This method combines `this` DataFrame with a different `that` DataFrame or
36
36
Series from a different DataFrame.
37
37
38
- It returns a dataframe that has prefix, `this_` and `that_` to distinct
39
- the columns names.
38
+ It returns a DataFrame that has the prefixes `this_` and `that_` to distinguish
39
+ the column names from both DataFrames.
40
40
41
41
It internally performs a join operation which can be expensive in general.
42
42
So, if `OPS_ON_DIFF_FRAMES` environment variable is not set,
@@ -70,11 +70,12 @@ def combine_frames(this, *args, how="full"):
70
70
join_scols = []
71
71
merged_index_scols = []
72
72
73
+ # If the same named index is found, that's used.
73
74
for this_column , this_name in this_index_map :
74
75
for that_col , that_name in that_index_map :
75
76
if this_name == that_name :
76
- # We should map the actual Spark columns even if
77
- # the index names are the name .
77
+ # We should merge the Spark columns into one
78
+ # to mimic pandas' behavior .
78
79
this_scol = this ._internal .scol_for (this_column )
79
80
that_scol = that ._internal .scol_for (that_col )
80
81
join_scol = this_scol == that_scol
@@ -85,7 +86,7 @@ def combine_frames(this, *args, how="full"):
85
86
).otherwise (that_scol ).alias (this_column ))
86
87
break
87
88
else :
88
- raise ValueError ("Index names must be matched." )
89
+ raise ValueError ("Index names must be exactly matched currently ." )
89
90
90
91
assert len (join_scols ) > 0 , "cannot join with no overlapping index names"
91
92
@@ -106,13 +107,13 @@ def combine_frames(this, *args, how="full"):
106
107
"it comes from a different dataframe" )
107
108
108
109
109
- def align_diff_frames (func , this , that , fillna = True , how = "full" , include_all_that_columns = False ):
110
+ def align_diff_frames (resolve_func , this , that , fillna = True , how = "full" ):
110
111
"""
111
112
This method aligns two different DataFrames with a given `resolve_func`. Columns are resolved and
112
113
handled within the given `resolve_func`.
113
114
To use this, `OPS_ON_DIFF_FRAMES` environment variable should be enabled, for now.
114
115
115
- :param func : Takes aligned (joined) DataFrame, the column of the current DataFrame, and
116
+ :param resolve_func : Takes aligned (joined) DataFrame, the column of the current DataFrame, and
116
117
the column of another DataFrame. It returns an iterable that produces Series.
117
118
118
119
>>> import os
@@ -152,15 +153,19 @@ def align_diff_frames(func, this, that, fillna=True, how="full", include_all_tha
152
153
:param that: another DataFrame to align
153
154
:param fillna: If True, it fills missing values in non-common columns in both `this` and `that`.
154
155
Otherwise, it returns them as they are.
155
- :param how: join way.
156
- :param include_all_that_columns: If True, all non-common columns from `that` are added into
157
- `that_columns` into `func`, and they are excluded in non-common columns group
158
- (controlled by `fillna`). Otherwise, `this_columns` and `that_columns` will always in the
159
- same common column group.
156
+ :param how: join way. In addition, it affects how `resolve_func` resolves the column conflict.
157
+ - full: `resolve_func` should resolve only common columns from 'this' and 'that' DataFrames.
158
+ For instance, if 'this' has columns A, B, C and that has B, C, D, `this_columns` and
159
+ `that_columns` in this function are B, C and B, C, respectively.
160
+ - left: `resolve_func` should resolve the common columns plus the remaining columns of 'that'.
161
+ For instance, if 'this' has columns A, B, C and that has B, C, D, `this_columns` is
162
+ B, C but `that_columns` are B, C, D.
160
163
:return: Aligned DataFrame
161
164
"""
162
165
from databricks .koalas import DataFrame
163
166
167
+ assert how == "full" or how == "left"
168
+
164
169
this_data_columns = this ._internal .data_columns
165
170
that_data_columns = that ._internal .data_columns
166
171
common_columns = set (this_data_columns ).intersection (that_data_columns )
@@ -185,11 +190,12 @@ def align_diff_frames(func, this, that, fillna=True, how="full", include_all_tha
185
190
that_columns_to_apply .append (combined_column )
186
191
break
187
192
else :
188
- if include_all_that_columns and \
193
+ if how == "left" and \
189
194
combined_column in ["__that_%s" % c for c in that_data_columns ]:
190
- # In this case, we will drop that columns in columns to keep but passes it later
191
- # to `func`. Note that adding this into a separate list is intentional so that
192
- # `this_columns` and `that_columns` can be paired.
195
+ # In this case, we will drop `that_columns` in `columns_to_keep` but pass
196
+ # it later to `resolve_func`, which should resolve it.
197
+ # Note that adding this into a separate list (`additional_that_columns`)
198
+ # is intentional so that `this_columns` and `that_columns` can be paired.
193
199
additional_that_columns .append (combined_column )
194
200
elif fillna :
195
201
columns_to_keep .append (F .lit (None ).cast (FloatType ()).alias (combined_column ))
@@ -200,7 +206,7 @@ def align_diff_frames(func, this, that, fillna=True, how="full", include_all_tha
200
206
201
207
# Should extract columns to apply and do it in a batch in case
202
208
# it adds new columns for example.
203
- kser_set = list (func (combined , this_columns_to_apply , that_columns_to_apply ))
209
+ kser_set = list (resolve_func (combined , this_columns_to_apply , that_columns_to_apply ))
204
210
columns_applied = [c ._scol for c in kser_set ]
205
211
206
212
sdf = combined ._sdf .select (
0 commit comments