From f446a28f5e550a41f4944d02079fced1aae8211e Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Thu, 30 Apr 2026 18:25:30 -0500 Subject: [PATCH 1/3] fix: error on CREATE EXTERNAL TABLE with no files and no explicit schema Pointing CREATE EXTERNAL TABLE at an empty (or non-existent) location without an explicit column list previously produced a 0-column table. Subsequent queries against that table failed with a confusing "column not found" error far from the real cause. Now ListingOptions::infer_schema returns a clear Plan error when the location yields no files, instructing the user to either add data files or declare an explicit schema. The existing behavior of pre-declaring an empty table with an explicit schema (for later INSERT) still works. Co-Authored-By: Claude Opus 4.7 (1M context) --- datafusion/catalog-listing/src/options.rs | 16 ++++++++- datafusion/sqllogictest/test_files/ddl.slt | 41 ++++++++++++++++++++++ 2 files changed, 56 insertions(+), 1 deletion(-) diff --git a/datafusion/catalog-listing/src/options.rs b/datafusion/catalog-listing/src/options.rs index 146f98d62335e..c817debe941aa 100644 --- a/datafusion/catalog-listing/src/options.rs +++ b/datafusion/catalog-listing/src/options.rs @@ -263,7 +263,12 @@ impl ListingOptions { /// Infer the schema of the files at the given path on the provided object store. /// /// If the table_path contains one or more files (i.e. it is a directory / - /// prefix of files) their schema is merged by calling [`FileFormat::infer_schema`] + /// prefix of files) their schema is merged by calling [`FileFormat::infer_schema`]. + /// + /// Returns a `Plan` error if no files are found at `table_path`, since an + /// inferred schema with zero columns produces confusing errors at query time. + /// Callers that need to support empty locations must declare an explicit + /// schema instead of relying on inference. /// /// Note: The inferred schema does not include any partitioning columns. /// @@ -283,6 +288,15 @@ impl ListingOptions { .try_collect() .await?; + if files.is_empty() { + return plan_err!( + "No files found at {}. \ + Cannot infer schema from an empty location; either add data files \ + or declare an explicit schema for the table.", + table_path + ); + } + let schema = self.format.infer_schema(state, &store, &files).await?; Ok(schema) diff --git a/datafusion/sqllogictest/test_files/ddl.slt b/datafusion/sqllogictest/test_files/ddl.slt index 0579659832feb..977d2d03a1d07 100644 --- a/datafusion/sqllogictest/test_files/ddl.slt +++ b/datafusion/sqllogictest/test_files/ddl.slt @@ -711,6 +711,47 @@ c1 Null 0 c2 Null 1 c3 Null 2 +# Creating an external table over a location with no files and without an +# explicit schema should error rather than producing a 0-column table that +# fails with a confusing "column not found" error at query time. +statement error DataFusion error: Error during planning: No files found at .*\. Cannot infer schema from an empty location; either add data files or declare an explicit schema for the table\. +CREATE EXTERNAL TABLE empty_dir_parquet STORED AS PARQUET LOCATION 'test_files/scratch/ddl/empty_dir/'; + +statement error DataFusion error: Error during planning: No files found at .*\. Cannot infer schema from an empty location; either add data files or declare an explicit schema for the table\. +CREATE EXTERNAL TABLE empty_dir_csv STORED AS CSV LOCATION 'test_files/scratch/ddl/empty_dir/' OPTIONS ('format.has_header' 'true'); + +statement error DataFusion error: Error during planning: No files found at .*\. Cannot infer schema from an empty location; either add data files or declare an explicit schema for the table\. +CREATE EXTERNAL TABLE empty_dir_json STORED AS JSON LOCATION 'test_files/scratch/ddl/empty_dir/'; + +# Providing an explicit schema for the same empty location is still allowed, +# so users can pre-declare a table to be populated later via INSERT. +statement ok +CREATE EXTERNAL TABLE empty_dir_with_schema(x int) STORED AS PARQUET LOCATION 'test_files/scratch/ddl/empty_dir/'; + +query I +select * from empty_dir_with_schema; +---- + +statement ok +drop table empty_dir_with_schema; + +# Once a file is written to the directory, schema inference works as before. +statement ok +COPY (values (1), (2), (3)) TO 'test_files/scratch/ddl/empty_dir_filled/' STORED AS PARQUET; + +statement ok +CREATE EXTERNAL TABLE filled_dir_inferred STORED AS PARQUET LOCATION 'test_files/scratch/ddl/empty_dir_filled/'; + +query I rowsort +select * from filled_dir_inferred; +---- +1 +2 +3 + +statement ok +drop table filled_dir_inferred; + ## should allow any type of exprs as values statement ok From f484851eb85a1bfe34fc9b37c8717a712752d631 Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Thu, 30 Apr 2026 18:45:21 -0500 Subject: [PATCH 2/3] fix: only error on empty location when no files are listed at all Narrows the schema-inference error to the case the user actually encounters confusion in: an empty or non-existent directory that returns zero files from list_all_files. Locations that contain files which all happen to be 0-byte continue to produce an empty inferred schema as before, preserving the "0-byte files don't crash reads" behavior that several existing tests depend on. Also updates a few tests in datafusion/core that previously relied on empty fixture directories producing a 0-column table: - listing_table_factory tests now write a 0-byte placeholder file matching the format extension so the glob/extension assertions still exercise the inference code path. - read_dummy_folder and the empty-folder branch of read_from_different_file_extension now assert the new error. Co-Authored-By: Claude Opus 4.7 (1M context) --- datafusion/catalog-listing/src/options.rs | 23 +++++++---- .../src/datasource/listing_table_factory.rs | 9 ++++ .../core/src/execution/context/parquet.rs | 41 ++++++++++--------- 3 files changed, 45 insertions(+), 28 deletions(-) diff --git a/datafusion/catalog-listing/src/options.rs b/datafusion/catalog-listing/src/options.rs index c817debe941aa..0ab15e05abba1 100644 --- a/datafusion/catalog-listing/src/options.rs +++ b/datafusion/catalog-listing/src/options.rs @@ -23,7 +23,7 @@ use datafusion_datasource::file_format::FileFormat; use datafusion_execution::config::SessionConfig; use datafusion_expr::SortExpr; use futures::StreamExt; -use futures::{TryStreamExt, future}; +use futures::TryStreamExt; use itertools::Itertools; use std::sync::Arc; @@ -265,10 +265,13 @@ impl ListingOptions { /// If the table_path contains one or more files (i.e. it is a directory / /// prefix of files) their schema is merged by calling [`FileFormat::infer_schema`]. /// - /// Returns a `Plan` error if no files are found at `table_path`, since an - /// inferred schema with zero columns produces confusing errors at query time. + /// Returns a `Plan` error if `table_path` contains no files at all (e.g. an + /// empty or non-existent directory), since an inferred schema with zero + /// columns produces confusing "column not found" errors at query time. /// Callers that need to support empty locations must declare an explicit - /// schema instead of relying on inference. + /// schema instead of relying on inference. Locations that contain files + /// which all happen to be 0-byte are still accepted — the empty files are + /// filtered out before format-specific inference runs. /// /// Note: The inferred schema does not include any partitioning columns. /// @@ -280,15 +283,13 @@ impl ListingOptions { ) -> datafusion_common::Result { let store = state.runtime_env().object_store(table_path)?; - let files: Vec<_> = table_path + let all_files: Vec<_> = table_path .list_all_files(state, store.as_ref(), &self.file_extension) .await? - // Empty files cannot affect schema but may throw when trying to read for it - .try_filter(|object_meta| future::ready(object_meta.size > 0)) .try_collect() .await?; - if files.is_empty() { + if all_files.is_empty() { return plan_err!( "No files found at {}. \ Cannot infer schema from an empty location; either add data files \ @@ -297,6 +298,12 @@ impl ListingOptions { ); } + // Empty files cannot affect schema but may throw when trying to read for it + let files: Vec<_> = all_files + .into_iter() + .filter(|object_meta| object_meta.size > 0) + .collect(); + let schema = self.format.infer_schema(state, &store, &files).await?; Ok(schema) diff --git a/datafusion/core/src/datasource/listing_table_factory.rs b/datafusion/core/src/datasource/listing_table_factory.rs index a5139346752a9..f9ab87b528088 100644 --- a/datafusion/core/src/datasource/listing_table_factory.rs +++ b/datafusion/core/src/datasource/listing_table_factory.rs @@ -309,6 +309,10 @@ mod tests { #[tokio::test] async fn test_create_using_folder_with_compression() { let dir = tempfile::tempdir().unwrap(); + // Schema inference now requires at least one file at the location. + // The file itself can be 0-byte — it will be filtered out before the + // format-specific inference runs, leaving an empty inferred schema. + fs::File::create_new(dir.path().join("placeholder.csv.gz")).unwrap(); let factory = ListingTableFactory::new(); let context = SessionContext::new(); @@ -351,6 +355,9 @@ mod tests { #[tokio::test] async fn test_create_using_folder_without_compression() { let dir = tempfile::tempdir().unwrap(); + // See `test_create_using_folder_with_compression` — a placeholder file + // is required so schema inference does not error on an empty location. + fs::File::create_new(dir.path().join("placeholder.csv")).unwrap(); let factory = ListingTableFactory::new(); let context = SessionContext::new(); @@ -387,6 +394,8 @@ mod tests { let mut path = PathBuf::from(dir.path()); path.extend(["odd.v1", "odd.v2"]); fs::create_dir_all(&path).unwrap(); + // Placeholder so schema inference does not error on an empty location. + fs::File::create_new(path.join("placeholder.parquet")).unwrap(); let factory = ListingTableFactory::new(); let context = SessionContext::new(); diff --git a/datafusion/core/src/execution/context/parquet.rs b/datafusion/core/src/execution/context/parquet.rs index 823dc946ea732..3c750352f199a 100644 --- a/datafusion/core/src/execution/context/parquet.rs +++ b/datafusion/core/src/execution/context/parquet.rs @@ -108,9 +108,7 @@ mod tests { use arrow::util::pretty::pretty_format_batches; use datafusion_common::config::TableParquetOptions; - use datafusion_common::{ - assert_batches_eq, assert_batches_sorted_eq, assert_contains, - }; + use datafusion_common::{assert_batches_sorted_eq, assert_contains}; use datafusion_execution::config::SessionConfig; use tempfile::{TempDir, tempdir}; @@ -374,20 +372,22 @@ mod tests { let total_rows: usize = results.iter().map(|rb| rb.num_rows()).sum(); assert_eq!(total_rows, 5); - // Read the dataframe from 'output4/' + // Read the dataframe from 'output4/' — an empty folder. Inference now + // errors on an empty location instead of producing a 0-column table. std::fs::create_dir(&path4)?; - let read_df = ctx + let err = ctx .read_parquet( &path4, ParquetReadOptions { ..Default::default() }, ) - .await?; - - let results = read_df.collect().await?; - let total_rows: usize = results.iter().map(|rb| rb.num_rows()).sum(); - assert_eq!(total_rows, 0); + .await + .expect_err("read_parquet on an empty folder should error"); + assert!( + err.strip_backtrace().contains("No files found at"), + "unexpected error: {err}" + ); // Read the dataframe from double dot folder; let read_df = ctx @@ -510,17 +510,18 @@ mod tests { let ctx = SessionContext::new(); let test_path = "/foo/"; - let actual = ctx + // Reading from a non-existent / empty location now errors at planning + // time rather than producing a 0-column table that surfaces a confusing + // "column not found" error at query time. + let err = ctx .read_parquet(test_path, ParquetReadOptions::default()) - .await? - .collect() - .await?; - - #[cfg_attr(any(), rustfmt::skip)] - assert_batches_eq!(&[ - "++", - "++", - ], &actual); + .await + .expect_err("read_parquet on an empty location should error"); + let msg = err.strip_backtrace(); + assert!( + msg.contains("No files found at") && msg.contains(test_path), + "unexpected error: {msg}" + ); Ok(()) } From 8e72ad8388b00735cd01594409a8e64b875d078f Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Wed, 6 May 2026 10:38:12 -0500 Subject: [PATCH 3/3] test: import std::fs::File to drop fs::File:: prefix in tests Addresses review feedback on PR #21965. Co-Authored-By: Claude Opus 4.7 (1M context) --- datafusion/core/src/datasource/listing_table_factory.rs | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/datafusion/core/src/datasource/listing_table_factory.rs b/datafusion/core/src/datasource/listing_table_factory.rs index f9ab87b528088..7fbf5696de4a9 100644 --- a/datafusion/core/src/datasource/listing_table_factory.rs +++ b/datafusion/core/src/datasource/listing_table_factory.rs @@ -237,6 +237,7 @@ mod tests { use glob::Pattern; use std::collections::HashMap; use std::fs; + use std::fs::File; use std::path::PathBuf; use datafusion_common::parsers::CompressionTypeVariant; @@ -312,7 +313,7 @@ mod tests { // Schema inference now requires at least one file at the location. // The file itself can be 0-byte — it will be filtered out before the // format-specific inference runs, leaving an empty inferred schema. - fs::File::create_new(dir.path().join("placeholder.csv.gz")).unwrap(); + File::create_new(dir.path().join("placeholder.csv.gz")).unwrap(); let factory = ListingTableFactory::new(); let context = SessionContext::new(); @@ -357,7 +358,7 @@ mod tests { let dir = tempfile::tempdir().unwrap(); // See `test_create_using_folder_with_compression` — a placeholder file // is required so schema inference does not error on an empty location. - fs::File::create_new(dir.path().join("placeholder.csv")).unwrap(); + File::create_new(dir.path().join("placeholder.csv")).unwrap(); let factory = ListingTableFactory::new(); let context = SessionContext::new(); @@ -395,7 +396,7 @@ mod tests { path.extend(["odd.v1", "odd.v2"]); fs::create_dir_all(&path).unwrap(); // Placeholder so schema inference does not error on an empty location. - fs::File::create_new(path.join("placeholder.parquet")).unwrap(); + File::create_new(path.join("placeholder.parquet")).unwrap(); let factory = ListingTableFactory::new(); let context = SessionContext::new(); @@ -423,7 +424,7 @@ mod tests { path.extend(["key1=value1", "key2=value2"]); fs::create_dir_all(&path).unwrap(); path.push("data.parquet"); - fs::File::create_new(&path).unwrap(); + File::create_new(&path).unwrap(); let factory = ListingTableFactory::new(); let context = SessionContext::new();