Skip to content

Commit 17396a5

Browse files
Lordwormsalamb
andauthored
feat: add hint for missing fields (#14521)
* feat: add hint for missing fields * set threshold to include 0.5 * fix failed test * add diagnose * fix clippy * fix bugs fix bugs * add test * fix test * fix clippy --------- Co-authored-by: Andrew Lamb <[email protected]>
1 parent b3ee833 commit 17396a5

File tree

9 files changed

+153
-31
lines changed

9 files changed

+153
-31
lines changed

datafusion/common/src/column.rs

Lines changed: 6 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717

1818
//! Column
1919
20-
use crate::error::_schema_err;
20+
use crate::error::{_schema_err, add_possible_columns_to_diag};
2121
use crate::utils::{parse_identifiers_normalized, quote_identifier};
2222
use crate::{DFSchema, Diagnostic, Result, SchemaError, Spans, TableReference};
2323
use arrow_schema::{Field, FieldRef};
@@ -273,18 +273,11 @@ impl Column {
273273
// user which columns are candidates, or which table
274274
// they come from. For now, let's list the table names
275275
// only.
276-
for qualified_field in qualified_fields {
277-
let (Some(table), _) = qualified_field else {
278-
continue;
279-
};
280-
diagnostic.add_note(
281-
format!(
282-
"possible reference to '{}' in table '{}'",
283-
&self.name, table
284-
),
285-
None,
286-
);
287-
}
276+
add_possible_columns_to_diag(
277+
&mut diagnostic,
278+
&Column::new_unqualified(&self.name),
279+
&columns,
280+
);
288281
err.with_diagnostic(diagnostic)
289282
});
290283
}

datafusion/common/src/dfschema.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1069,7 +1069,7 @@ mod tests {
10691069
Column names are case sensitive. \
10701070
You can use double quotes to refer to the \"\"t1.c0\"\" column \
10711071
or set the datafusion.sql_parser.enable_ident_normalization configuration. \
1072-
Valid fields are t1.c0, t1.c1.";
1072+
Did you mean 't1.c0'?.";
10731073
assert_eq!(err.strip_backtrace(), expected);
10741074
Ok(())
10751075
}

datafusion/common/src/error.rs

Lines changed: 36 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ use std::io;
2727
use std::result;
2828
use std::sync::Arc;
2929

30+
use crate::utils::datafusion_strsim::normalized_levenshtein;
3031
use crate::utils::quote_identifier;
3132
use crate::{Column, DFSchema, Diagnostic, TableReference};
3233
#[cfg(feature = "avro")]
@@ -190,6 +191,11 @@ impl Display for SchemaError {
190191
.iter()
191192
.map(|column| column.flat_name().to_lowercase())
192193
.collect::<Vec<String>>();
194+
195+
let valid_fields_names = valid_fields
196+
.iter()
197+
.map(|column| column.flat_name())
198+
.collect::<Vec<String>>();
193199
if lower_valid_fields.contains(&field.flat_name().to_lowercase()) {
194200
write!(
195201
f,
@@ -198,7 +204,15 @@ impl Display for SchemaError {
198204
field.quoted_flat_name()
199205
)?;
200206
}
201-
if !valid_fields.is_empty() {
207+
let field_name = field.name();
208+
if let Some(matched) = valid_fields_names
209+
.iter()
210+
.filter(|str| normalized_levenshtein(str, field_name) >= 0.5)
211+
.collect::<Vec<&String>>()
212+
.first()
213+
{
214+
write!(f, ". Did you mean '{matched}'?")?;
215+
} else if !valid_fields.is_empty() {
202216
write!(
203217
f,
204218
". Valid fields are {}",
@@ -827,6 +841,27 @@ pub fn unqualified_field_not_found(name: &str, schema: &DFSchema) -> DataFusionE
827841
})
828842
}
829843

844+
pub fn add_possible_columns_to_diag(
845+
diagnostic: &mut Diagnostic,
846+
field: &Column,
847+
valid_fields: &[Column],
848+
) {
849+
let field_names: Vec<String> = valid_fields
850+
.iter()
851+
.filter_map(|f| {
852+
if normalized_levenshtein(f.name(), field.name()) >= 0.5 {
853+
Some(f.flat_name())
854+
} else {
855+
None
856+
}
857+
})
858+
.collect();
859+
860+
for name in field_names {
861+
diagnostic.add_note(format!("possible column {}", name), None);
862+
}
863+
}
864+
830865
#[cfg(test)]
831866
mod test {
832867
use std::sync::Arc;

datafusion/common/src/utils/mod.rs

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -735,6 +735,27 @@ pub mod datafusion_strsim {
735735
pub fn levenshtein(a: &str, b: &str) -> usize {
736736
generic_levenshtein(&StringWrapper(a), &StringWrapper(b))
737737
}
738+
739+
/// Calculates the normalized Levenshtein distance between two strings.
740+
/// The normalized distance is a value between 0.0 and 1.0, where 1.0 indicates
741+
/// that the strings are identical and 0.0 indicates no similarity.
742+
///
743+
/// ```
744+
/// use datafusion_common::utils::datafusion_strsim::normalized_levenshtein;
745+
///
746+
/// assert!((normalized_levenshtein("kitten", "sitting") - 0.57142).abs() < 0.00001);
747+
///
748+
/// assert!(normalized_levenshtein("", "second").abs() < 0.00001);
749+
///
750+
/// assert!((normalized_levenshtein("kitten", "sitten") - 0.833).abs() < 0.001);
751+
/// ```
752+
pub fn normalized_levenshtein(a: &str, b: &str) -> f64 {
753+
if a.is_empty() && b.is_empty() {
754+
return 1.0;
755+
}
756+
1.0 - (levenshtein(a, b) as f64)
757+
/ (a.chars().count().max(b.chars().count()) as f64)
758+
}
738759
}
739760

740761
/// Merges collections `first` and `second`, removes duplicates and sorts the

datafusion/sql/src/planner.rs

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ use std::sync::Arc;
2121
use std::vec;
2222

2323
use arrow_schema::*;
24+
use datafusion_common::error::add_possible_columns_to_diag;
2425
use datafusion_common::{
2526
field_not_found, internal_err, plan_datafusion_err, DFSchemaRef, Diagnostic,
2627
SchemaError,
@@ -368,10 +369,13 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> {
368369
}
369370
.map_err(|err: DataFusionError| match &err {
370371
DataFusionError::SchemaError(
371-
SchemaError::FieldNotFound { .. },
372+
SchemaError::FieldNotFound {
373+
field,
374+
valid_fields,
375+
},
372376
_,
373377
) => {
374-
let diagnostic = if let Some(relation) = &col.relation {
378+
let mut diagnostic = if let Some(relation) = &col.relation {
375379
Diagnostic::new_error(
376380
format!(
377381
"column '{}' not found in '{}'",
@@ -385,6 +389,11 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> {
385389
col.spans().first(),
386390
)
387391
};
392+
add_possible_columns_to_diag(
393+
&mut diagnostic,
394+
field,
395+
valid_fields,
396+
);
388397
err.with_diagnostic(diagnostic)
389398
}
390399
_ => err,

datafusion/sql/tests/cases/diagnostic.rs

Lines changed: 57 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ fn do_query(sql: &'static str) -> Diagnostic {
3535
collect_spans: true,
3636
..ParserOptions::default()
3737
};
38+
3839
let state = MockSessionState::default();
3940
let context = MockContextProvider { state };
4041
let sql_to_rel = SqlToRel::new_with_options(&context, options);
@@ -200,14 +201,8 @@ fn test_ambiguous_reference() -> Result<()> {
200201
let diag = do_query(query);
201202
assert_eq!(diag.message, "column 'first_name' is ambiguous");
202203
assert_eq!(diag.span, Some(spans["a"]));
203-
assert_eq!(
204-
diag.notes[0].message,
205-
"possible reference to 'first_name' in table 'a'"
206-
);
207-
assert_eq!(
208-
diag.notes[1].message,
209-
"possible reference to 'first_name' in table 'b'"
210-
);
204+
assert_eq!(diag.notes[0].message, "possible column a.first_name");
205+
assert_eq!(diag.notes[1].message, "possible column b.first_name");
211206
Ok(())
212207
}
213208

@@ -225,3 +220,57 @@ fn test_incompatible_types_binary_arithmetic() -> Result<()> {
225220
assert_eq!(diag.notes[1].span, Some(spans["right"]));
226221
Ok(())
227222
}
223+
224+
#[test]
225+
fn test_field_not_found_suggestion() -> Result<()> {
226+
let query = "SELECT /*whole*/first_na/*whole*/ FROM person";
227+
let spans = get_spans(query);
228+
let diag = do_query(query);
229+
assert_eq!(diag.message, "column 'first_na' not found");
230+
assert_eq!(diag.span, Some(spans["whole"]));
231+
assert_eq!(diag.notes.len(), 1);
232+
233+
let mut suggested_fields: Vec<String> = diag
234+
.notes
235+
.iter()
236+
.filter_map(|note| {
237+
if note.message.starts_with("possible column") {
238+
Some(note.message.replace("possible column ", ""))
239+
} else {
240+
None
241+
}
242+
})
243+
.collect();
244+
suggested_fields.sort();
245+
assert_eq!(suggested_fields[0], "person.first_name");
246+
Ok(())
247+
}
248+
249+
#[test]
250+
fn test_ambiguous_column_suggestion() -> Result<()> {
251+
let query = "SELECT /*whole*/id/*whole*/ FROM test_decimal, person";
252+
let spans = get_spans(query);
253+
let diag = do_query(query);
254+
255+
assert_eq!(diag.message, "column 'id' is ambiguous");
256+
assert_eq!(diag.span, Some(spans["whole"]));
257+
258+
assert_eq!(diag.notes.len(), 2);
259+
260+
let mut suggested_fields: Vec<String> = diag
261+
.notes
262+
.iter()
263+
.filter_map(|note| {
264+
if note.message.starts_with("possible column") {
265+
Some(note.message.replace("possible column ", ""))
266+
} else {
267+
None
268+
}
269+
})
270+
.collect();
271+
272+
suggested_fields.sort();
273+
assert_eq!(suggested_fields, vec!["person.id", "test_decimal.id"]);
274+
275+
Ok(())
276+
}

datafusion/sqllogictest/test_files/errors.slt

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -169,4 +169,19 @@ statement ok
169169
INSERT INTO tab0 VALUES(83,0,38);
170170

171171
query error DataFusion error: Arrow error: Divide by zero error
172-
SELECT DISTINCT - 84 FROM tab0 AS cor0 WHERE NOT + 96 / + col1 <= NULL GROUP BY col1, col0;
172+
SELECT DISTINCT - 84 FROM tab0 AS cor0 WHERE NOT + 96 / + col1 <= NULL GROUP BY col1, col0;
173+
174+
statement ok
175+
create table a(timestamp int, birthday int, ts int, tokens int, amp int, staamp int);
176+
177+
query error DataFusion error: Schema error: No field named timetamp\. Did you mean 'a\.timestamp'\?\.
178+
select timetamp from a;
179+
180+
query error DataFusion error: Schema error: No field named dadsada\. Valid fields are a\.timestamp, a\.birthday, a\.ts, a\.tokens, a\.amp, a\.staamp\.
181+
select dadsada from a;
182+
183+
query error DataFusion error: Schema error: No field named ammp\. Did you mean 'a\.amp'\?\.
184+
select ammp from a;
185+
186+
statement ok
187+
drop table a;

datafusion/sqllogictest/test_files/identifiers.slt

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -90,16 +90,16 @@ drop table case_insensitive_test
9090
statement ok
9191
CREATE TABLE test("Column1" string) AS VALUES ('content1');
9292

93-
statement error DataFusion error: Schema error: No field named column1\. Valid fields are test\."Column1"\.
93+
statement error DataFusion error: Schema error: No field named column1\. Did you mean 'test\.Column1'\?\.
9494
SELECT COLumn1 from test
9595

96-
statement error DataFusion error: Schema error: No field named column1\. Valid fields are test\."Column1"\.
96+
statement error DataFusion error: Schema error: No field named column1\. Did you mean 'test\.Column1'\?\.
9797
SELECT Column1 from test
9898

99-
statement error DataFusion error: Schema error: No field named column1\. Valid fields are test\."Column1"\.
99+
statement error DataFusion error: Schema error: No field named column1\. Did you mean 'test\.Column1'\?\.
100100
SELECT column1 from test
101101

102-
statement error DataFusion error: Schema error: No field named column1\. Valid fields are test\."Column1"\.
102+
statement error DataFusion error: Schema error: No field named column1\. Did you mean 'test\.Column1'\?\.
103103
SELECT "column1" from test
104104

105105
statement ok

datafusion/sqllogictest/test_files/join.slt.part

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,7 @@ statement ok
9494
set datafusion.execution.batch_size = 4096;
9595

9696
# left semi with wrong where clause
97-
query error DataFusion error: Schema error: No field named t2\.t2_id\. Valid fields are t1\.t1_id, t1\.t1_name, t1\.t1_int\.
97+
query error DataFusion error: Schema error: No field named t2\.t2_id\. Did you mean 't1\.t1_id'\?\.
9898
SELECT t1.t1_id, t1.t1_name, t1.t1_int
9999
FROM t1
100100
LEFT SEMI JOIN t2 ON t1.t1_id = t2.t2_id

0 commit comments

Comments
 (0)