apache · Dandandan · Aug 11, 2024 · Aug 11, 2024
diff --git a/datafusion/functions-aggregate-common/src/tdigest.rs b/datafusion/functions-aggregate-common/src/tdigest.rs
@@ -233,7 +233,7 @@ impl TDigest {
     }
 
     fn clamp(v: f64, lo: f64, hi: f64) -> f64 {
-        if lo.is_nan() && hi.is_nan() {
+        if lo.is_nan() || hi.is_nan() {
             return v;
         }
         v.clamp(lo, hi)
@@ -539,6 +539,18 @@ impl TDigest {
         let value = self.centroids[pos].mean()
             + ((rank - t) / self.centroids[pos].weight() - 0.5) * delta;
 
+        // In `merge_digests()`: `min` is initialized to Inf, `max` is initialized to -Inf
+        // and gets updated according to different `TDigest`s
+        // However, `min`/`max` won't get updated if there is only one `NaN` within `TDigest`
+        // The following two checks is for such edge case
+        if !min.is_finite() && min.is_sign_positive() {
+            min = f64::NEG_INFINITY;
+        }
+
+        if !max.is_finite() && max.is_sign_negative() {
+            max = f64::INFINITY;
+        }
+
         Self::clamp(value, min, max)
     }
 

diff --git a/datafusion/sqllogictest/test_files/aggregate.slt b/datafusion/sqllogictest/test_files/aggregate.slt
@@ -1249,6 +1249,44 @@ SELECT APPROX_PERCENTILE_CONT(v, 0.5) FROM (VALUES (CAST(NULL as INT))) as t (v)
 ----
 NULL
 
+#
+# percentile_cont edge cases
+#
+
+statement ok
+CREATE TABLE tmp_percentile_cont(v1 INT, v2 DOUBLE);
+
+statement ok
+INSERT INTO tmp_percentile_cont VALUES (1, 'NaN'::Double), (2, 'NaN'::Double), (3, 'NaN'::Double);
+
+# ISSUE: https://github.com/apache/datafusion/issues/11871
+# Note `approx_median()` is using the same implementation as `approx_percentile_cont()`
+query R
+select APPROX_MEDIAN(v2) from tmp_percentile_cont WHERE v1 = 1;
+----
+NaN
+
+# ISSUE: https://github.com/apache/datafusion/issues/11870
+query R
+select APPROX_PERCENTILE_CONT(v2, 0.8) from tmp_percentile_cont;
+----
+NaN
+
+# ISSUE: https://github.com/apache/datafusion/issues/11869
+# Note: `approx_percentile_cont_with_weight()` uses the same implementation as `approx_percentile_cont()`
+query R
+SELECT APPROX_PERCENTILE_CONT_WITH_WEIGHT(
+    v2,
+    '+Inf'::Double,
+    0.9
+)
+FROM tmp_percentile_cont;
+----
+NaN
+
+statement ok
+DROP TABLE tmp_percentile_cont;
+
 # csv_query_cube_avg
 query TIR
 SELECT c1, c2, AVG(c3) FROM aggregate_test_100 GROUP BY CUBE (c1, c2) ORDER BY c1, c2
@@ -5553,4 +5591,4 @@ drop table employee_csv;
 query I??III?T
 select count(null), min(null), max(null), bit_and(NULL), bit_or(NULL), bit_xor(NULL), nth_value(NULL, 1), string_agg(NULL, ',');
 ----
-0 NULL NULL NULL NULL NULL NULL NULL
+0 NULL NULL NULL NULL NULL NULL NULL