From 16a3557325eb8f949f4a87ab90c0a0b174dc8d86 Mon Sep 17 00:00:00 2001 From: Marco Neumann Date: Tue, 6 Aug 2024 15:42:00 +0200 Subject: [PATCH] fix: `collect_columns` quadratic complexity (#11843) Fix accidental $O(n^2)$ in `collect_columns`. There are the following ways to insert a clone into a hash set: - **clone before check:** Basically `set.insert(x.clone())`. That's rather expensive if you have a lot of duplicates. - **iter & clone:** That's what we do right now, but that's in $O(n^2)$. - **check & insert:** Basically `if !set.contains(x) {set.insert(x.clone())}` That requires two hash probes though. - **entry-based API:** Sadly the stdlib doesn't really offer any handle/entry-based APIs yet (see https://github.com/rust-lang/rust/issues/60896 ), but `hashbrown` does, so we can use the nice `set.get_or_insert_owned(x)` which will only clone the reference if it doesn't exists yet and only hashes once. We now use the last approach. --- datafusion/physical-expr/src/utils/mod.rs | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/datafusion/physical-expr/src/utils/mod.rs b/datafusion/physical-expr/src/utils/mod.rs index 6c4791b158c8..4c37db4849a7 100644 --- a/datafusion/physical-expr/src/utils/mod.rs +++ b/datafusion/physical-expr/src/utils/mod.rs @@ -17,9 +17,10 @@ mod guarantee; pub use guarantee::{Guarantee, LiteralGuarantee}; +use hashbrown::HashSet; use std::borrow::Borrow; -use std::collections::{HashMap, HashSet}; +use std::collections::HashMap; use std::sync::Arc; use crate::expressions::{BinaryExpr, Column}; @@ -204,9 +205,7 @@ pub fn collect_columns(expr: &Arc) -> HashSet { let mut columns = HashSet::::new(); expr.apply(|expr| { if let Some(column) = expr.as_any().downcast_ref::() { - if !columns.iter().any(|c| c.eq(column)) { - columns.insert(column.clone()); - } + columns.get_or_insert_owned(column); } Ok(TreeNodeRecursion::Continue) })