Skip to content

Commit

Permalink
fix: column not found in analyze (#17321)
Browse files Browse the repository at this point in the history
* fix: Column not found in analyze

* fix
  • Loading branch information
zhyass authored Jan 18, 2025
1 parent 3a32c18 commit 07034df
Show file tree
Hide file tree
Showing 2 changed files with 77 additions and 37 deletions.
70 changes: 33 additions & 37 deletions src/query/service/src/interpreters/interpreter_table_analyze.rs
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,6 @@ impl Interpreter for AnalyzeTableInterpreter {

if let Some(snapshot) = snapshot_opt {
// plan sql
let schema = table.schema();
let _table_info = table.get_table_info();

let table_statistics = table
Expand Down Expand Up @@ -165,22 +164,20 @@ impl Interpreter for AnalyzeTableInterpreter {
.get_settings()
.get_sql_dialect()?
.default_ident_quote();
let index_cols: Vec<(u32, String)> = schema
.fields()
.iter()
.filter(|f| RangeIndex::supported_type(&f.data_type().into()))
.map(|f| (f.column_id(), format!("{quote}{}{quote}", f.name)))
.collect();

// 0.01625 --> 12 buckets --> 4K size per column
// 1.04 / math.sqrt(1<<12) --> 0.01625
const DISTINCT_ERROR_RATE: f64 = 0.01625;
let ndv_select_expr = index_cols
let ndv_select_expr = snapshot
.schema
.fields()
.iter()
.map(|c| {
.filter(|f| RangeIndex::supported_type(&f.data_type().into()))
.map(|f| {
format!(
"approx_count_distinct_state({DISTINCT_ERROR_RATE})({}) as ndv_{}",
c.1, c.0
"approx_count_distinct_state({DISTINCT_ERROR_RATE})({quote}{}{quote}) as ndv_{}",
f.name,
f.column_id()
)
})
.join(", ");
Expand All @@ -190,7 +187,7 @@ impl Interpreter for AnalyzeTableInterpreter {
plan.database, plan.table,
);

info!("Analyze via sql {:?}", sql);
info!("Analyze via sql: {sql}");

let (physical_plan, bind_context) = self.plan_sql(sql).await?;
let mut build_res =
Expand All @@ -200,34 +197,33 @@ impl Interpreter for AnalyzeTableInterpreter {
// We add a setting `enable_analyze_histogram` to control whether to compute histogram(default is closed).
let mut histogram_info_receivers = HashMap::new();
if self.ctx.get_settings().get_enable_analyze_histogram()? {
let histogram_sqls = index_cols
let histogram_sqls = table
.schema()
.fields()
.iter()
.map(|c| {
format!(
"SELECT quantile,
COUNT(DISTINCT {}) AS ndv,
MAX({}) AS max_value,
MIN({}) AS min_value,
COUNT() as count
FROM (
SELECT {}, NTILE({}) OVER (ORDER BY {}) AS quantile
FROM {}.{} WHERE {} IS DISTINCT FROM NULL
)
GROUP BY quantile ORDER BY quantile \n",
c.1,
c.1,
c.1,
c.1,
DEFAULT_HISTOGRAM_BUCKETS,
c.1,
plan.database,
plan.table,
c.1,
.filter(|f| RangeIndex::supported_type(&f.data_type().into()))
.map(|f| {
let col_name = format!("{quote}{}{quote}", f.name);
(
format!(
"SELECT quantile, \
COUNT(DISTINCT {col_name}) AS ndv, \
MAX({col_name}) AS max_value, \
MIN({col_name}) AS min_value, \
COUNT() as count \
FROM ( \
SELECT {col_name}, NTILE({}) OVER (ORDER BY {col_name}) AS quantile \
FROM {}.{} WHERE {col_name} IS DISTINCT FROM NULL \
) \
GROUP BY quantile ORDER BY quantile",
DEFAULT_HISTOGRAM_BUCKETS, plan.database, plan.table,
),
f.column_id(),
)
})
.collect::<Vec<_>>();
for (sql, (col_id, _)) in histogram_sqls.into_iter().zip(index_cols.iter()) {
info!("Analyze histogram via sql {:?}", sql);
for (sql, col_id) in histogram_sqls.into_iter() {
info!("Analyze histogram via sql: {sql}");
let (mut histogram_plan, bind_context) = self.plan_sql(sql).await?;
if !self.ctx.get_cluster().is_empty() {
histogram_plan = remove_exchange(histogram_plan);
Expand All @@ -253,7 +249,7 @@ impl Interpreter for AnalyzeTableInterpreter {
build_res
.sources_pipelines
.extend(histogram_build_res.sources_pipelines);
histogram_info_receivers.insert(*col_id, rx);
histogram_info_receivers.insert(col_id, rx);
}
}
FuseTable::do_analyze(
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# Regression test for "column not found in analyze" (issue #17314, fixed by PR #17321):
# ANALYZE built its per-column SQL from the table's CURRENT schema but matched
# columns against older snapshots, so renaming a column between inserts and
# ANALYZE runs made the generated statistics queries reference a missing name.
statement ok
create or replace database issue_17314;

statement ok
use issue_17314

# Histogram collection is off by default; enable it so ANALYZE exercises the
# per-column histogram SQL generation that triggered the bug.
statement ok
set enable_analyze_histogram=1;

statement ok
create or replace table t1(a string, biz_date1 string);

statement ok
insert into t1 values('1', '11');

# First rename (case-insensitive match on biz_date1) before the first ANALYZE:
# the snapshot written above still carries the old column name.
statement ok
alter table t1 rename BIZ_date1 to BIZ_DATE;

statement ok
analyze table t1;

statement ok
insert into t1 values('2', '22');

statement ok
insert into t1 values('3', '33');

# Second rename after ANALYZE has already stored statistics under the previous
# name; the follow-up ANALYZE below must still resolve every column correctly.
statement ok
alter table t1 rename BIZ_DATE to b;

statement ok
analyze table t1;

# Verify the collected statistics: both columns report ndv = 3 and one
# histogram bucket per distinct value, keyed by the FINAL column names (a, b).
query IIT
select * from fuse_statistic('issue_17314', 't1') order by column_name;
----
a 3 [bucket id: 0, min: "1", max: "1", ndv: 1.0, count: 1.0], [bucket id: 1, min: "2", max: "2", ndv: 1.0, count: 1.0], [bucket id: 2, min: "3", max: "3", ndv: 1.0, count: 1.0]
b 3 [bucket id: 0, min: "11", max: "11", ndv: 1.0, count: 1.0], [bucket id: 1, min: "22", max: "22", ndv: 1.0, count: 1.0], [bucket id: 2, min: "33", max: "33", ndv: 1.0, count: 1.0]

statement ok
drop table t1 all;

statement ok
drop database issue_17314;

0 comments on commit 07034df

Please sign in to comment.