Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 13 additions & 1 deletion datafusion/functions-aggregate-common/src/tdigest.rs
Original file line number Diff line number Diff line change
Expand Up @@ -233,7 +233,7 @@ impl TDigest {
}

fn clamp(v: f64, lo: f64, hi: f64) -> f64 {
if lo.is_nan() && hi.is_nan() {
if lo.is_nan() || hi.is_nan() {
return v;
}
v.clamp(lo, hi)
Expand Down Expand Up @@ -539,6 +539,18 @@ impl TDigest {
let value = self.centroids[pos].mean()
+ ((rank - t) / self.centroids[pos].weight() - 0.5) * delta;

// In `merge_digests()`: `min` is initialized to Inf, `max` is initialized to -Inf
// and gets updated according to different `TDigest`s
// However, `min`/`max` won't get updated if there is only one `NaN` within `TDigest`
// The following two checks is for such edge case
if !min.is_finite() && min.is_sign_positive() {
min = f64::NEG_INFINITY;
}

if !max.is_finite() && max.is_sign_negative() {
max = f64::INFINITY;
}

Self::clamp(value, min, max)
}

Expand Down
40 changes: 39 additions & 1 deletion datafusion/sqllogictest/test_files/aggregate.slt
Original file line number Diff line number Diff line change
Expand Up @@ -1249,6 +1249,44 @@ SELECT APPROX_PERCENTILE_CONT(v, 0.5) FROM (VALUES (CAST(NULL as INT))) as t (v)
----
NULL

#
# percentile_cont edge cases
#

statement ok
CREATE TABLE tmp_percentile_cont(v1 INT, v2 DOUBLE);

statement ok
INSERT INTO tmp_percentile_cont VALUES (1, 'NaN'::Double), (2, 'NaN'::Double), (3, 'NaN'::Double);

# ISSUE: https://github.com/apache/datafusion/issues/11871
# Note `approx_median()` is using the same implementation as `approx_percentile_cont()`
query R
select APPROX_MEDIAN(v2) from tmp_percentile_cont WHERE v1 = 1;
----
NaN

# ISSUE: https://github.com/apache/datafusion/issues/11870
query R
select APPROX_PERCENTILE_CONT(v2, 0.8) from tmp_percentile_cont;
----
NaN

# ISSUE: https://github.com/apache/datafusion/issues/11869
# Note: `approx_percentile_cont_with_weight()` uses the same implementation as `approx_percentile_cont()`
query R
SELECT APPROX_PERCENTILE_CONT_WITH_WEIGHT(
v2,
'+Inf'::Double,
0.9
)
FROM tmp_percentile_cont;
----
NaN

statement ok
DROP TABLE tmp_percentile_cont;

# csv_query_cube_avg
query TIR
SELECT c1, c2, AVG(c3) FROM aggregate_test_100 GROUP BY CUBE (c1, c2) ORDER BY c1, c2
Expand Down Expand Up @@ -5553,4 +5591,4 @@ drop table employee_csv;
query I??III?T
select count(null), min(null), max(null), bit_and(NULL), bit_or(NULL), bit_xor(NULL), nth_value(NULL, 1), string_agg(NULL, ',');
----
0 NULL NULL NULL NULL NULL NULL NULL
0 NULL NULL NULL NULL NULL NULL NULL