|
71 | 71 | from databricks.koalas.spark.utils import as_nullable_spark_type, force_decimal_precision_scale
|
72 | 72 | from databricks.koalas.window import RollingGroupby, ExpandingGroupby
|
73 | 73 | from databricks.koalas.exceptions import DataError
|
| 74 | +from databricks.koalas.spark import functions as SF |
74 | 75 |
|
75 | 76 | # to keep it the same as pandas
|
76 | 77 | NamedAgg = namedtuple("NamedAgg", ["column", "aggfunc"])
|
@@ -2343,6 +2344,73 @@ def get_group(self, name) -> Union[DataFrame, Series]:
|
2343 | 2344 |
|
2344 | 2345 | return DataFrame(internal)
|
2345 | 2346 |
|
def median(self, numeric_only=True, accuracy=10000) -> Union[DataFrame, Series]:
    """
    Compute median of groups, excluding missing values.

    For multiple groupings, the result index will be a MultiIndex.

    .. note:: Unlike pandas, the median in Koalas is an approximated median based upon
        approximate percentile computation because computing median across a large dataset
        is extremely expensive.

    Parameters
    ----------
    numeric_only : bool, default True
        Include only float, int, boolean columns. False is not supported. This parameter
        is mainly for pandas compatibility.
    accuracy : int, default 10000
        Accuracy forwarded to the approximate percentile computation. It must be an
        integer; booleans are rejected even though ``bool`` is a subclass of ``int``.

    Returns
    -------
    Series or DataFrame
        Median of values within each group.

    Raises
    ------
    ValueError
        If ``accuracy`` is not an integer.

    Examples
    --------
    >>> kdf = ks.DataFrame({'a': [1., 1., 1., 1., 2., 2., 2., 3., 3., 3.],
    ...                     'b': [2., 3., 1., 4., 6., 9., 8., 10., 7., 5.],
    ...                     'c': [3., 5., 2., 5., 1., 2., 6., 4., 3., 6.]},
    ...                    columns=['a', 'b', 'c'],
    ...                    index=[7, 2, 4, 1, 3, 4, 9, 10, 5, 6])
    >>> kdf
          a     b    c
    7   1.0   2.0  3.0
    2   1.0   3.0  5.0
    4   1.0   1.0  2.0
    1   1.0   4.0  5.0
    3   2.0   6.0  1.0
    4   2.0   9.0  2.0
    9   2.0   8.0  6.0
    10  3.0  10.0  4.0
    5   3.0   7.0  3.0
    6   3.0   5.0  6.0

    DataFrameGroupBy

    >>> kdf.groupby('a').median().sort_index()  # doctest: +NORMALIZE_WHITESPACE
           b    c
    a
    1.0  2.0  3.0
    2.0  8.0  2.0
    3.0  7.0  4.0

    SeriesGroupBy

    >>> kdf.groupby('a')['b'].median().sort_index()
    a
    1.0    2.0
    2.0    8.0
    3.0    7.0
    Name: b, dtype: float64
    """
    # bool is a subclass of int, so it has to be excluded explicitly; otherwise
    # ``accuracy=True`` would silently be used as an accuracy of 1.
    if isinstance(accuracy, bool) or not isinstance(accuracy, int):
        raise ValueError(
            "accuracy must be an integer; however, got [%s]" % type(accuracy).__name__
        )

    def approximate_median(col):
        # The median is the 50th percentile of the column.
        return SF.percentile_approx(col, 0.5, accuracy)

    return self._reduce_for_stat_function(approximate_median, only_numeric=numeric_only)
| 2413 | + |
2346 | 2414 | def _reduce_for_stat_function(self, sfun, only_numeric):
|
2347 | 2415 | agg_columns = self._agg_columns
|
2348 | 2416 | agg_columns_scols = self._agg_columns_scols
|
|
0 commit comments