Skip to content

Commit a553ad6

Browse files
authored
Fix DataFrame.mad to work properly (#1749)
`DataFrame.mad()` has not been working properly, as shown below.

```python
>>> pdf
   A  B  C
0  3  3  a
1  4  4  b
2  5  5  c
3  6  6  d
4  7  7  e
>>> pdf.mad()
A    1.2
B    1.2
dtype: float64
>>> ks.from_pandas(pdf).mad()
A    1.2
B    1.2
C    NaN  # It should not have been here
dtype: float64
```

This PR fixes it and also fixes the related tests.

```python
>>> pdf.mad()
A    1.2
B    1.2
dtype: float64
>>> ks.from_pandas(pdf).mad()
A    1.2
B    1.2
dtype: float64
```
1 parent ef192c6 commit a553ad6

File tree

2 files changed

+20
-5
lines changed

2 files changed

+20
-5
lines changed

databricks/koalas/frame.py

+12-3
Original file line numberDiff line numberDiff line change
@@ -9940,18 +9940,27 @@ def get_spark_column(kdf, label):
99409940

99419941
return scol
99429942

9943+
new_column_labels = []
9944+
for label in self._internal.column_labels:
9945+
# Keep only the columns of numeric or boolean type.
9946+
dtype = self._kser_for(label).spark.data_type
9947+
if isinstance(dtype, (NumericType, BooleanType)):
9948+
new_column_labels.append(label)
9949+
99439950
new_columns = [
99449951
F.avg(get_spark_column(self, label)).alias(name_like_string(label))
9945-
for label in self._internal.column_labels
9952+
for label in new_column_labels
99469953
]
9954+
99479955
mean_data = self._internal.spark_frame.select(new_columns).first()
99489956

99499957
new_columns = [
99509958
F.avg(
99519959
F.abs(get_spark_column(self, label) - mean_data[name_like_string(label)])
99529960
).alias(name_like_string(label))
9953-
for label in self._internal.column_labels
9961+
for label in new_column_labels
99549962
]
9963+
99559964
sdf = self._internal.spark_frame.select(
99569965
[F.lit(None).cast(StringType()).alias(SPARK_DEFAULT_INDEX_NAME)] + new_columns
99579966
)
@@ -9960,7 +9969,7 @@ def get_spark_column(kdf, label):
99609969
internal = InternalFrame(
99619970
spark_frame=sdf,
99629971
index_map=OrderedDict([(SPARK_DEFAULT_INDEX_NAME, None)]),
9963-
column_labels=self._internal.column_labels,
9972+
column_labels=new_column_labels,
99649973
column_label_names=self._internal.column_label_names,
99659974
)
99669975

databricks/koalas/tests/test_dataframe.py

+8-2
Original file line numberDiff line numberDiff line change
@@ -3814,7 +3814,13 @@ def test_explain_hint(self):
38143814
sys.stdout = prev
38153815

38163816
def test_mad(self):
3817-
pdf = pd.DataFrame({"A": [1, 2, None, 4, np.nan], "B": [-0.1, 0.2, -0.3, np.nan, 0.5]})
3817+
pdf = pd.DataFrame(
3818+
{
3819+
"A": [1, 2, None, 4, np.nan],
3820+
"B": [-0.1, 0.2, -0.3, np.nan, 0.5],
3821+
"C": ["a", "b", "c", "d", "e"],
3822+
}
3823+
)
38183824
kdf = ks.from_pandas(pdf)
38193825

38203826
self.assert_eq(kdf.mad(), pdf.mad())
@@ -3824,7 +3830,7 @@ def test_mad(self):
38243830
kdf.mad(axis=2)
38253831

38263832
# MultiIndex columns
3827-
columns = pd.MultiIndex.from_tuples([("A", "X"), ("A", "Y")])
3833+
columns = pd.MultiIndex.from_tuples([("A", "X"), ("A", "Y"), ("A", "Z")])
38283834
pdf.columns = columns
38293835
kdf.columns = columns
38303836

0 commit comments

Comments
 (0)