Skip to content

Commit

Permalink
Extract catalog API to separate crate
Browse files Browse the repository at this point in the history
This moves `CatalogProvider`, `TableProvider`, and `SchemaProvider` to a new
`datafusion-catalog` crate.  The circular dependency between core
`SessionState` and implementations is broken up by introducing the
`CatalogSession` dyn trait.  Implementations of `TableProvider` that
reside under core currently have access to `CatalogSession` by
downcasting. This is supposed to be an intermediate step.
  • Loading branch information
findepi committed Jul 25, 2024
1 parent fab7e23 commit dbb8905
Show file tree
Hide file tree
Showing 45 changed files with 872 additions and 556 deletions.
2 changes: 2 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ exclude = ["datafusion-cli", "dev/depcheck"]
members = [
"datafusion/common",
"datafusion/common-runtime",
"datafusion/catalog",
"datafusion/core",
"datafusion/expr",
"datafusion/execution",
Expand Down Expand Up @@ -88,6 +89,7 @@ chrono = { version = "0.4.34", default-features = false }
ctor = "0.2.0"
dashmap = "6.0.1"
datafusion = { path = "datafusion/core", version = "40.0.0", default-features = false }
datafusion-catalog = { path = "datafusion/catalog", version = "40.0.0" }
datafusion-common = { path = "datafusion/common", version = "40.0.0", default-features = false }
datafusion-common-runtime = { path = "datafusion/common-runtime", version = "40.0.0" }
datafusion-execution = { path = "datafusion/execution", version = "40.0.0" }
Expand Down
13 changes: 13 additions & 0 deletions datafusion-cli/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 2 additions & 3 deletions datafusion-cli/src/catalog.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,7 @@ use std::sync::{Arc, Weak};

use crate::object_storage::{get_object_store, AwsOptions, GcpOptions};

use datafusion::catalog::schema::SchemaProvider;
use datafusion::catalog::{CatalogProvider, CatalogProviderList};
use datafusion::catalog::{CatalogProvider, CatalogProviderList, SchemaProvider};
use datafusion::common::plan_datafusion_err;
use datafusion::datasource::listing::{
ListingTable, ListingTableConfig, ListingTableUrl,
Expand Down Expand Up @@ -237,7 +236,7 @@ fn substitute_tilde(cur: String) -> String {
mod tests {
use super::*;

use datafusion::catalog::schema::SchemaProvider;
use datafusion::catalog::SchemaProvider;
use datafusion::prelude::SessionContext;

fn setup_context() -> (SessionContext, Arc<dyn SchemaProvider>) {
Expand Down
4 changes: 2 additions & 2 deletions datafusion-cli/src/functions.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,11 @@ use arrow::record_batch::RecordBatch;
use arrow::util::pretty::pretty_format_batches;
use async_trait::async_trait;

use datafusion::catalog::Session;
use datafusion::common::{plan_err, Column};
use datafusion::datasource::function::TableFunctionImpl;
use datafusion::datasource::TableProvider;
use datafusion::error::Result;
use datafusion::execution::context::SessionState;
use datafusion::logical_expr::Expr;
use datafusion::physical_plan::memory::MemoryExec;
use datafusion::physical_plan::ExecutionPlan;
Expand Down Expand Up @@ -234,7 +234,7 @@ impl TableProvider for ParquetMetadataTable {

async fn scan(
&self,
_state: &SessionState,
_state: &dyn Session,
projection: Option<&Vec<usize>>,
_filters: &[Expr],
_limit: Option<usize>,
Expand Down
6 changes: 3 additions & 3 deletions datafusion-examples/examples/advanced_parquet_index.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ use arrow::array::{ArrayRef, Int32Array, RecordBatch, StringArray};
use arrow_schema::SchemaRef;
use async_trait::async_trait;
use bytes::Bytes;
use datafusion::catalog::Session;
use datafusion::datasource::listing::PartitionedFile;
use datafusion::datasource::physical_plan::parquet::{
ParquetAccessPlan, ParquetExecBuilder,
Expand All @@ -27,7 +28,6 @@ use datafusion::datasource::physical_plan::{
parquet::ParquetFileReaderFactory, FileMeta, FileScanConfig,
};
use datafusion::datasource::TableProvider;
use datafusion::execution::context::SessionState;
use datafusion::execution::object_store::ObjectStoreUrl;
use datafusion::parquet::arrow::arrow_reader::{
ArrowReaderOptions, ParquetRecordBatchReaderBuilder, RowSelection, RowSelector,
Expand Down Expand Up @@ -271,7 +271,7 @@ impl IndexTableProvider {
/// to a single predicate like `a = 1 AND b = 2` suitable for execution
fn filters_to_predicate(
&self,
state: &SessionState,
state: &dyn Session,
filters: &[Expr],
) -> Result<Arc<dyn PhysicalExpr>> {
let df_schema = DFSchema::try_from(self.schema())?;
Expand Down Expand Up @@ -463,7 +463,7 @@ impl TableProvider for IndexTableProvider {

async fn scan(
&self,
state: &SessionState,
state: &dyn Session,
projection: Option<&Vec<usize>>,
filters: &[Expr],
limit: Option<usize>,
Expand Down
5 changes: 1 addition & 4 deletions datafusion-examples/examples/catalog.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,7 @@
use async_trait::async_trait;
use datafusion::{
arrow::util::pretty,
catalog::{
schema::SchemaProvider,
{CatalogProvider, CatalogProviderList},
},
catalog::{CatalogProvider, CatalogProviderList, SchemaProvider},
datasource::{
file_format::{csv::CsvFormat, FileFormat},
listing::{ListingOptions, ListingTable, ListingTableConfig, ListingTableUrl},
Expand Down
5 changes: 3 additions & 2 deletions datafusion-examples/examples/custom_datasource.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ use datafusion::arrow::datatypes::{DataType, Field, Schema, SchemaRef};
use datafusion::arrow::record_batch::RecordBatch;
use datafusion::datasource::{provider_as_source, TableProvider, TableType};
use datafusion::error::Result;
use datafusion::execution::context::{SessionState, TaskContext};
use datafusion::execution::context::TaskContext;
use datafusion::physical_plan::memory::MemoryStream;
use datafusion::physical_plan::{
project_schema, DisplayAs, DisplayFormatType, ExecutionMode, ExecutionPlan,
Expand All @@ -37,6 +37,7 @@ use datafusion_expr::LogicalPlanBuilder;
use datafusion_physical_expr::EquivalenceProperties;

use async_trait::async_trait;
use datafusion::catalog::Session;
use tokio::time::timeout;

/// This example demonstrates executing a simple query against a custom datasource
Expand Down Expand Up @@ -175,7 +176,7 @@ impl TableProvider for CustomDataSource {

async fn scan(
&self,
_state: &SessionState,
_state: &dyn Session,
projection: Option<&Vec<usize>>,
// filters and limit can be used here to inject some push-down operations if needed
_filters: &[Expr],
Expand Down
4 changes: 2 additions & 2 deletions datafusion-examples/examples/parquet_index.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,13 +23,13 @@ use arrow::datatypes::Int32Type;
use arrow::util::pretty::pretty_format_batches;
use arrow_schema::SchemaRef;
use async_trait::async_trait;
use datafusion::catalog::Session;
use datafusion::datasource::listing::PartitionedFile;
use datafusion::datasource::physical_plan::{
parquet::StatisticsConverter,
{FileScanConfig, ParquetExec},
};
use datafusion::datasource::TableProvider;
use datafusion::execution::context::SessionState;
use datafusion::execution::object_store::ObjectStoreUrl;
use datafusion::parquet::arrow::{
arrow_reader::ParquetRecordBatchReaderBuilder, ArrowWriter,
Expand Down Expand Up @@ -222,7 +222,7 @@ impl TableProvider for IndexTableProvider {

async fn scan(
&self,
state: &SessionState,
state: &dyn Session,
projection: Option<&Vec<usize>>,
filters: &[Expr],
limit: Option<usize>,
Expand Down
6 changes: 3 additions & 3 deletions datafusion-examples/examples/simple_udtf.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,11 @@ use arrow::csv::ReaderBuilder;
use async_trait::async_trait;
use datafusion::arrow::datatypes::SchemaRef;
use datafusion::arrow::record_batch::RecordBatch;
use datafusion::catalog::Session;
use datafusion::datasource::function::TableFunctionImpl;
use datafusion::datasource::TableProvider;
use datafusion::error::Result;
use datafusion::execution::context::{ExecutionProps, SessionState};
use datafusion::execution::context::ExecutionProps;
use datafusion::physical_plan::memory::MemoryExec;
use datafusion::physical_plan::ExecutionPlan;
use datafusion::prelude::SessionContext;
Expand All @@ -35,7 +36,6 @@ use std::fs::File;
use std::io::Seek;
use std::path::Path;
use std::sync::Arc;

// To define your own table function, you only need to do the following 3 things:
// 1. Implement your own [`TableProvider`]
// 2. Implement your own [`TableFunctionImpl`] and return your [`TableProvider`]
Expand Down Expand Up @@ -95,7 +95,7 @@ impl TableProvider for LocalCsvTable {

async fn scan(
&self,
_state: &SessionState,
_state: &dyn Session,
projection: Option<&Vec<usize>>,
_filters: &[Expr],
_limit: Option<usize>,
Expand Down
38 changes: 38 additions & 0 deletions datafusion/catalog/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

[package]
name = "datafusion-catalog"
authors.workspace = true
edition.workspace = true
homepage.workspace = true
license.workspace = true
readme.workspace = true
repository.workspace = true
rust-version.workspace = true
version.workspace = true

[dependencies]
arrow-schema = { workspace = true }
async-trait = "0.1.41"
datafusion-common = { workspace = true }
datafusion-execution = { workspace = true }
datafusion-expr = { workspace = true }
datafusion-physical-plan = { workspace = true }

[lints]
workspace = true
Loading

0 comments on commit dbb8905

Please sign in to comment.