fix: column not found in analyze #17321

Merged 2 commits on Jan 18, 2025
70 changes: 33 additions & 37 deletions src/query/service/src/interpreters/interpreter_table_analyze.rs
@@ -117,7 +117,6 @@ impl Interpreter for AnalyzeTableInterpreter {

         if let Some(snapshot) = snapshot_opt {
             // plan sql
-            let schema = table.schema();
             let _table_info = table.get_table_info();

             let table_statistics = table
@@ -165,22 +164,20 @@ impl Interpreter for AnalyzeTableInterpreter {
                 .get_settings()
                 .get_sql_dialect()?
                 .default_ident_quote();
-            let index_cols: Vec<(u32, String)> = schema
-                .fields()
-                .iter()
-                .filter(|f| RangeIndex::supported_type(&f.data_type().into()))
-                .map(|f| (f.column_id(), format!("{quote}{}{quote}", f.name)))
-                .collect();

             // 0.01625 --> 12 buckets --> 4K size per column
             // 1.04 / math.sqrt(1<<12) --> 0.01625
             const DISTINCT_ERROR_RATE: f64 = 0.01625;
-            let ndv_select_expr = index_cols
+            let ndv_select_expr = snapshot
+                .schema
                 .fields()
                 .iter()
-                .map(|c| {
+                .filter(|f| RangeIndex::supported_type(&f.data_type().into()))
+                .map(|f| {
                     format!(
-                        "approx_count_distinct_state({DISTINCT_ERROR_RATE})({}) as ndv_{}",
-                        c.1, c.0
+                        "approx_count_distinct_state({DISTINCT_ERROR_RATE})({quote}{}{quote}) as ndv_{}",
+                        f.name,
+                        f.column_id()
                     )
                 })
                 .join(", ");
@@ -190,7 +187,7 @@ impl Interpreter for AnalyzeTableInterpreter {
                 plan.database, plan.table,
             );

-            info!("Analyze via sql {:?}", sql);
+            info!("Analyze via sql: {sql}");

             let (physical_plan, bind_context) = self.plan_sql(sql).await?;
             let mut build_res =
@@ -200,34 +197,33 @@ impl Interpreter for AnalyzeTableInterpreter {
             // We add a setting `enable_analyze_histogram` to control whether to compute histogram(default is closed).
             let mut histogram_info_receivers = HashMap::new();
             if self.ctx.get_settings().get_enable_analyze_histogram()? {
-                let histogram_sqls = index_cols
+                let histogram_sqls = table
+                    .schema()
+                    .fields()
                     .iter()
-                    .map(|c| {
-                        format!(
-                            "SELECT quantile,
-                            COUNT(DISTINCT {}) AS ndv,
-                            MAX({}) AS max_value,
-                            MIN({}) AS min_value,
-                            COUNT() as count
-                            FROM (
-                            SELECT {}, NTILE({}) OVER (ORDER BY {}) AS quantile
-                            FROM {}.{} WHERE {} IS DISTINCT FROM NULL
-                            )
-                            GROUP BY quantile ORDER BY quantile \n",
-                            c.1,
-                            c.1,
-                            c.1,
-                            c.1,
-                            DEFAULT_HISTOGRAM_BUCKETS,
-                            c.1,
-                            plan.database,
-                            plan.table,
-                            c.1,
+                    .filter(|f| RangeIndex::supported_type(&f.data_type().into()))
+                    .map(|f| {
+                        let col_name = format!("{quote}{}{quote}", f.name);
+                        (
+                            format!(
+                                "SELECT quantile, \
+                                COUNT(DISTINCT {col_name}) AS ndv, \
+                                MAX({col_name}) AS max_value, \
+                                MIN({col_name}) AS min_value, \
+                                COUNT() as count \
+                                FROM ( \
+                                SELECT {col_name}, NTILE({}) OVER (ORDER BY {col_name}) AS quantile \
+                                FROM {}.{} WHERE {col_name} IS DISTINCT FROM NULL \
+                                ) \
+                                GROUP BY quantile ORDER BY quantile",
+                                DEFAULT_HISTOGRAM_BUCKETS, plan.database, plan.table,
+                            ),
+                            f.column_id(),
                         )
                     })
                     .collect::<Vec<_>>();
-                for (sql, (col_id, _)) in histogram_sqls.into_iter().zip(index_cols.iter()) {
-                    info!("Analyze histogram via sql {:?}", sql);
+                for (sql, col_id) in histogram_sqls.into_iter() {
+                    info!("Analyze histogram via sql: {sql}");
                     let (mut histogram_plan, bind_context) = self.plan_sql(sql).await?;
                     if !self.ctx.get_cluster().is_empty() {
                         histogram_plan = remove_exchange(histogram_plan);
@@ -253,7 +249,7 @@ impl Interpreter for AnalyzeTableInterpreter {
                     build_res
                         .sources_pipelines
                         .extend(histogram_build_res.sources_pipelines);
-                    histogram_info_receivers.insert(*col_id, rx);
+                    histogram_info_receivers.insert(col_id, rx);
                 }
             }
             FuseTable::do_analyze(
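The shape of the fix: rather than precomputing an index_cols list from table.schema() and zipping it against generated SQL, each query is now built directly from the schema it describes (snapshot.schema for NDV, table.schema() for histograms), with the column name quoted at the point of use and the matching column_id carried alongside each generated statement. Below is a minimal, self-contained sketch of the NDV expression builder; Field is a stand-in for Databend's schema field type, and the RangeIndex type filter and session settings are omitted, so treat names here as illustrative only.

    struct Field {
        column_id: u32,
        name: String,
    }

    const DISTINCT_ERROR_RATE: f64 = 0.01625;

    fn ndv_select_expr(fields: &[Field], quote: char) -> String {
        fields
            .iter()
            .map(|f| {
                // Quoting with the dialect's ident quote keeps mixed-case
                // names such as BIZ_DATE resolvable in the generated SQL.
                format!(
                    "approx_count_distinct_state({DISTINCT_ERROR_RATE})({quote}{}{quote}) as ndv_{}",
                    f.name, f.column_id
                )
            })
            .collect::<Vec<_>>()
            .join(", ")
    }

    fn main() {
        // 1.04 / sqrt(1 << 12) = 1.04 / 64 = 0.01625: the HyperLogLog error
        // bound from the comment in the diff (2^12 registers, ~4K per column).
        assert!((1.04 / f64::from(1u32 << 12).sqrt() - DISTINCT_ERROR_RATE).abs() < 1e-9);

        let fields = [
            Field { column_id: 0, name: "a".to_string() },
            Field { column_id: 1, name: "BIZ_DATE".to_string() },
        ];
        // approx_count_distinct_state(0.01625)("a") as ndv_0,
        // approx_count_distinct_state(0.01625)("BIZ_DATE") as ndv_1
        println!("{}", ndv_select_expr(&fields, '"'));
    }

The test file below exercises exactly this scenario: a column renamed to a mixed-case name, then analyzed.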
New sqllogictest file: 44 additions
@@ -0,0 +1,44 @@
statement ok
create or replace database issue_17314;

statement ok
use issue_17314

statement ok
set enable_analyze_histogram=1;

statement ok
create or replace table t1(a string, biz_date1 string);

statement ok
insert into t1 values('1', '11');

statement ok
alter table t1 rename BIZ_date1 to BIZ_DATE;

statement ok
analyze table t1;

statement ok
insert into t1 values('2', '22');

statement ok
insert into t1 values('3', '33');

statement ok
alter table t1 rename BIZ_DATE to b;

statement ok
analyze table t1;

query IIT
select * from fuse_statistic('issue_17314', 't1') order by column_name;
----
a 3 [bucket id: 0, min: "1", max: "1", ndv: 1.0, count: 1.0], [bucket id: 1, min: "2", max: "2", ndv: 1.0, count: 1.0], [bucket id: 2, min: "3", max: "3", ndv: 1.0, count: 1.0]
b 3 [bucket id: 0, min: "11", max: "11", ndv: 1.0, count: 1.0], [bucket id: 1, min: "22", max: "22", ndv: 1.0, count: 1.0], [bucket id: 2, min: "33", max: "33", ndv: 1.0, count: 1.0]

statement ok
drop table t1 all;

statement ok
drop database issue_17314;
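The expected fuse_statistic output follows from NTILE: the histogram SQL assigns quantiles with NTILE(DEFAULT_HISTOGRAM_BUCKETS), and with only three rows each row falls into its own quantile, giving three single-row buckets. A small sketch of the standard NTILE(k) assignment rule, assuming Databend follows the usual SQL semantics (the constant's actual value is not shown in the diff):

    // Standard NTILE(k) over n ordered rows: the first n % k buckets receive
    // ceil(n / k) rows, the remainder floor(n / k). SQL numbers buckets from 1;
    // the fuse_statistic output above reports bucket ids from 0.
    fn ntile(n: usize, k: usize) -> Vec<usize> {
        let base = n / k;
        let extra = n % k; // these leading buckets take one extra row
        let mut assignment = Vec::with_capacity(n);
        for bucket in 1..=k {
            let size = base + usize::from(bucket <= extra);
            assignment.extend(std::iter::repeat(bucket).take(size));
        }
        assignment
    }

    fn main() {
        // 100 stands in for DEFAULT_HISTOGRAM_BUCKETS (an assumed value);
        // whenever n <= k, every row gets its own quantile.
        assert_eq!(ntile(3, 100), vec![1, 2, 3]);
    }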