@@ -219,8 +219,8 @@ def _spark_groupby(kdf, func, groupkeys):
         else:
             index_map = None
         return _InternalFrame(sdf=sdf,
-                              data_columns=data_columns,
                               column_index=column_index,
+                              column_scols=[scol_for(sdf, col) for col in data_columns],
                               index_map=index_map)
 
     def count(self):
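Every hunk in this patch applies the same mechanical change: the string-based `data_columns=` argument to `_InternalFrame` (and to `_internal.copy`) is replaced by `column_scols=`, a list of Spark `Column` objects resolved against the new `sdf` with `scol_for`. For context, a rough sketch of that helper as it lives in `databricks/koalas/utils.py` (shown from memory as an aid to reading the diff, not part of this change):

    def scol_for(sdf, column_name):
        """Return the Spark Column of `sdf` for the given column name.

        Backtick quoting makes a name containing dots resolve as one
        top-level column rather than as a struct field access.
        """
        return sdf['`{}`'.format(column_name)]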
@@ -493,10 +493,10 @@ def size(self):
         else:
             name = 'count'
         internal = _InternalFrame(sdf=sdf,
-                                  data_columns=[name],
                                   index_map=[(SPARK_INDEX_NAME_FORMAT(i),
                                               s._internal.column_index[0])
-                                             for i, s in enumerate(groupkeys)])
+                                             for i, s in enumerate(groupkeys)],
+                                  column_scols=[scol_for(sdf, name)])
         return _col(DataFrame(internal))
 
     def diff(self, periods=1):
@@ -893,7 +893,9 @@ def apply(self, func):
 
         if should_infer_schema:
             # If schema is inferred, we can restore indexes too.
-            internal = kdf._internal.copy(sdf=sdf)
+            internal = kdf._internal.copy(sdf=sdf,
+                                          column_scols=[scol_for(sdf, col)
+                                                        for col in kdf._internal.data_columns])
         else:
             # Otherwise, it loses index.
             internal = _InternalFrame(sdf=sdf)
@@ -945,7 +947,9 @@ def pandas_filter(pdf):
 
         sdf = self._spark_group_map_apply(
             pandas_filter, data_schema, retain_index=True)
-        return DataFrame(self._kdf._internal.copy(sdf=sdf))
+        return DataFrame(self._kdf._internal.copy(
+            sdf=sdf,
+            column_scols=[scol_for(sdf, col) for col in self._kdf._internal.data_columns]))
 
     def _spark_group_map_apply(self, func, return_schema, retain_index):
         index_columns = self._kdf._internal.index_columns
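Why re-resolve by name at all? `_spark_group_map_apply` returns a brand-new Spark DataFrame, so any `Column` objects cached on the old `_InternalFrame` are still bound to the previous query plan and cannot be selected from the new frame. A minimal, hypothetical PySpark illustration of the failure mode (names are invented; not part of the patch):

    from pyspark.sql import SparkSession, functions as F

    spark = SparkSession.builder.getOrCreate()
    old_sdf = spark.range(10).withColumn('x', F.rand())
    old_col = old_sdf['x']  # bound to old_sdf's plan

    new_sdf = old_sdf.groupby().agg(F.sum('x').alias('x'))

    # new_sdf.select(old_col) would typically fail with an
    # AnalysisException ("resolved attribute(s) ... missing");
    # re-resolving by name against the new frame is safe:
    new_sdf.select(new_sdf['`x`']).show()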
@@ -1153,13 +1157,13 @@ def idxmax(self, skipna=True):
             stat_exprs.append(F.max(scol_for(sdf, name)).alias(name))
         sdf = sdf.groupby(*groupkey_cols).agg(*stat_exprs)
         internal = _InternalFrame(sdf=sdf,
-                                  data_columns=[ks._internal.data_columns[0]
-                                                for ks in self._agg_columns],
-                                  column_index=[ks._internal.column_index[0]
-                                                for ks in self._agg_columns],
                                   index_map=[(SPARK_INDEX_NAME_FORMAT(i),
                                               s._internal.column_index[0])
-                                             for i, s in enumerate(groupkeys)])
+                                             for i, s in enumerate(groupkeys)],
+                                  column_index=[ks._internal.column_index[0]
+                                                for ks in self._agg_columns],
+                                  column_scols=[scol_for(sdf, ks._internal.data_columns[0])
+                                                for ks in self._agg_columns])
         return DataFrame(internal)
 
     # TODO: add axis parameter
@@ -1223,13 +1227,13 @@ def idxmin(self, skipna=True):
             stat_exprs.append(F.max(scol_for(sdf, name)).alias(name))
         sdf = sdf.groupby(*groupkey_cols).agg(*stat_exprs)
         internal = _InternalFrame(sdf=sdf,
-                                  data_columns=[ks._internal.data_columns[0]
-                                                for ks in self._agg_columns],
-                                  column_index=[ks._internal.column_index[0]
-                                                for ks in self._agg_columns],
                                   index_map=[(SPARK_INDEX_NAME_FORMAT(i),
                                               s._internal.column_index[0])
-                                             for i, s in enumerate(groupkeys)])
+                                             for i, s in enumerate(groupkeys)],
+                                  column_index=[ks._internal.column_index[0]
+                                                for ks in self._agg_columns],
+                                  column_scols=[scol_for(sdf, ks._internal.data_columns[0])
+                                                for ks in self._agg_columns])
         return DataFrame(internal)
 
     def fillna(self, value=None, method=None, axis=None, inplace=False, limit=None):
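A side note on the context line `stat_exprs.append(F.max(...))` appearing in `idxmin` as well as `idxmax`: as I read the surrounding code, this is not a typo. The window marks only the top-ranked row of each group with its index value and leaves nulls elsewhere, so `F.max` merely extracts the lone non-null marker; the min-versus-max direction lives entirely in the window ordering. A hypothetical standalone sketch of that trick (column names `g`, `v`, `idx` are invented):

    from pyspark.sql import SparkSession, Window, functions as F

    spark = SparkSession.builder.getOrCreate()
    sdf = spark.createDataFrame(
        [('a', 3.0, 0), ('a', 1.0, 1), ('b', 2.0, 2)], ['g', 'v', 'idx'])

    # Mark only the top-ranked row per group with its index value.
    w = Window.partitionBy('g').orderBy(F.col('v').asc_nulls_last())
    marked = sdf.withColumn(
        'idx_of_min', F.when(F.row_number().over(w) == 1, F.col('idx')))

    # F.max() extracts the lone non-null marker per group, so the same
    # aggregate serves idxmin and idxmax; only the ordering differs.
    marked.groupby('g').agg(F.max('idx_of_min').alias('idx_of_min')).show()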
@@ -1581,7 +1585,9 @@ def pandas_transform(pdf):
             sdf = self._spark_group_map_apply(
                 pandas_transform, return_schema, retain_index=True)
             # If schema is inferred, we can restore indexes too.
-            internal = kdf._internal.copy(sdf=sdf)
+            internal = kdf._internal.copy(sdf=sdf,
+                                          column_scols=[scol_for(sdf, col)
+                                                        for col in kdf._internal.data_columns])
         else:
             return_type = _infer_return_type(func).tpe
             data_columns = self._kdf._internal.data_columns
@@ -1708,8 +1714,8 @@ def _reduce_for_stat_function(self, sfun, only_numeric):
                                   index_map=[(SPARK_INDEX_NAME_FORMAT(i),
                                               s._internal.column_index[0])
                                              for i, s in enumerate(groupkeys)],
-                                  data_columns=data_columns,
                                   column_index=column_index,
+                                  column_scols=[scol_for(sdf, col) for col in data_columns],
                                   column_index_names=self._kdf._internal.column_index_names)
         kdf = DataFrame(internal)
         if not self._as_index:
@@ -1767,8 +1773,9 @@ def _diff(self, *args, **kwargs):
 
         sdf = kdf._sdf.select(kdf._internal.index_scols + [c._scol for c in applied])
         internal = kdf._internal.copy(sdf=sdf,
-                                      data_columns=[c._internal.data_columns[0] for c in applied],
-                                      column_index=[c._internal.column_index[0] for c in applied])
+                                      column_index=[c._internal.column_index[0] for c in applied],
+                                      column_scols=[scol_for(sdf, c._internal.data_columns[0])
+                                                    for c in applied])
         return DataFrame(internal)
 
     def _rank(self, *args, **kwargs):
@@ -1781,8 +1788,9 @@ def _rank(self, *args, **kwargs):
 
         sdf = kdf._sdf.select(kdf._internal.index_scols + [c._scol for c in applied])
         internal = kdf._internal.copy(sdf=sdf,
-                                      data_columns=[c._internal.data_columns[0] for c in applied],
-                                      column_index=[c._internal.column_index[0] for c in applied])
+                                      column_index=[c._internal.column_index[0] for c in applied],
+                                      column_scols=[scol_for(sdf, c._internal.data_columns[0])
+                                                    for c in applied])
         return DataFrame(internal)
 
     def _cum(self, func):
@@ -1806,8 +1814,9 @@ def _cum(self, func):
         sdf = kdf._sdf.select(
             kdf._internal.index_scols + [c._scol for c in applied])
         internal = kdf._internal.copy(sdf=sdf,
-                                      data_columns=[c._internal.data_columns[0] for c in applied],
-                                      column_index=[c._internal.column_index[0] for c in applied])
+                                      column_index=[c._internal.column_index[0] for c in applied],
+                                      column_scols=[scol_for(sdf, c._internal.data_columns[0])
+                                                    for c in applied])
         return DataFrame(internal)
 
     def _fillna(self, *args, **kwargs):
@@ -1820,8 +1829,9 @@ def _fillna(self, *args, **kwargs):
 
         sdf = kdf._sdf.select(kdf._internal.index_scols + [c._scol for c in applied])
         internal = kdf._internal.copy(sdf=sdf,
-                                      data_columns=[c._internal.data_columns[0] for c in applied],
-                                      column_index=[c._internal.column_index[0] for c in applied])
+                                      column_index=[c._internal.column_index[0] for c in applied],
+                                      column_scols=[scol_for(sdf, c._internal.data_columns[0])
+                                                    for c in applied])
         return DataFrame(internal)
 
     def _shift(self, periods, fill_value):
@@ -1833,8 +1843,9 @@ def _shift(self, periods, fill_value):
 
         sdf = kdf._sdf.select(kdf._internal.index_scols + [c._scol for c in applied])
         internal = kdf._internal.copy(sdf=sdf,
-                                      data_columns=[c._internal.data_columns[0] for c in applied],
-                                      column_index=[c._internal.column_index[0] for c in applied])
+                                      column_index=[c._internal.column_index[0] for c in applied],
+                                      column_scols=[scol_for(sdf, c._internal.data_columns[0])
+                                                    for c in applied])
         return DataFrame(internal)
 
 
@@ -1956,11 +1967,11 @@ def nsmallest(self, n=5):
         window = Window.partitionBy([s._scol for s in groupkeys]).orderBy(F.col(name))
         sdf = sdf.withColumn('rank', F.row_number().over(window)).filter(F.col('rank') <= n)
         internal = _InternalFrame(sdf=sdf,
-                                  data_columns=[name],
                                   index_map=([(s._internal.data_columns[0],
                                                s._internal.column_index[0])
                                               for s in self._groupkeys]
-                                             + self._kdf._internal.index_map))
+                                             + self._kdf._internal.index_map),
+                                  column_scols=[scol_for(sdf, name)])
         return _col(DataFrame(internal))
 
     # TODO: add keep parameter
@@ -2002,11 +2013,11 @@ def nlargest(self, n=5):
         window = Window.partitionBy([s._scol for s in groupkeys]).orderBy(F.col(name).desc())
         sdf = sdf.withColumn('rank', F.row_number().over(window)).filter(F.col('rank') <= n)
         internal = _InternalFrame(sdf=sdf,
-                                  data_columns=[name],
                                   index_map=([(s._internal.data_columns[0],
                                                s._internal.column_index[0])
                                               for s in self._groupkeys]
-                                             + self._kdf._internal.index_map))
+                                             + self._kdf._internal.index_map),
+                                  column_scols=[scol_for(sdf, name)])
         return _col(DataFrame(internal))
 
     # TODO: add bins, normalize parameter
@@ -2064,10 +2075,10 @@ def value_counts(self, sort=None, ascending=None, dropna=True):
         sdf = sdf.orderBy(F.col(agg_column).desc())
 
         internal = _InternalFrame(sdf=sdf,
-                                  data_columns=[agg_column],
                                   index_map=[(SPARK_INDEX_NAME_FORMAT(i),
                                               s._internal.column_index[0])
-                                             for i, s in enumerate(groupkeys)])
+                                             for i, s in enumerate(groupkeys)],
+                                  column_scols=[scol_for(sdf, agg_column)])
         return _col(DataFrame(internal))
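The `value_counts` hunk above is representative of the whole patch: aggregate into a fresh `sdf`, then hand the internal frame re-resolved `Column`s instead of bare names. A small runnable approximation in plain PySpark (a toy frame stands in for the koalas internals):

    from pyspark.sql import SparkSession, functions as F

    spark = SparkSession.builder.getOrCreate()
    sdf = spark.createDataFrame([('a',), ('a',), ('b',)], ['k'])

    agg_column = 'count'
    sdf = sdf.groupby('k').agg(F.count(F.lit(1)).alias(agg_column))
    sdf = sdf.orderBy(F.col(agg_column).desc())

    # The aggregated frame is new, so the count column is re-resolved
    # by name -- the plain-PySpark analogue of scol_for(sdf, agg_column).
    count_scol = sdf['`{}`'.format(agg_column)]
    sdf.select('k', count_scol).show()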