diff --git a/datafusion-postgres/src/pg_catalog.rs b/datafusion-postgres/src/pg_catalog.rs index 4e0a841..b4a46b3 100644 --- a/datafusion-postgres/src/pg_catalog.rs +++ b/datafusion-postgres/src/pg_catalog.rs @@ -1,15 +1,16 @@ use std::collections::HashMap; -use std::sync::atomic::{AtomicU32, Ordering}; +use std::sync::atomic::AtomicU32; use std::sync::Arc; use async_trait::async_trait; use datafusion::arrow::array::{ - as_boolean_array, ArrayRef, BooleanArray, BooleanBuilder, Float32Array, Float64Array, - Int16Array, Int32Array, RecordBatch, StringArray, StringBuilder, + as_boolean_array, ArrayRef, BooleanArray, BooleanBuilder, RecordBatch, StringArray, + StringBuilder, }; -use datafusion::arrow::datatypes::{DataType, Field, Schema, SchemaRef}; +use datafusion::arrow::datatypes::{DataType, Field, SchemaRef}; +use datafusion::arrow::ipc::reader::FileReader; use datafusion::catalog::streaming::StreamingTable; -use datafusion::catalog::{CatalogProviderList, MemTable, SchemaProvider}; +use datafusion::catalog::{CatalogProviderList, SchemaProvider}; use datafusion::common::utils::SingleRowListArrayBuilder; use datafusion::datasource::{TableProvider, ViewTable}; use datafusion::error::{DataFusionError, Result}; @@ -21,16 +22,72 @@ use datafusion::prelude::{create_udf, SessionContext}; use postgres_types::Oid; use tokio::sync::RwLock; +mod pg_attribute; +mod pg_class; +mod pg_database; +mod pg_namespace; + +const PG_CATALOG_TABLE_PG_AGGREGATE: &str = "pg_aggregate"; +const PG_CATALOG_TABLE_PG_AM: &str = "pg_am"; +const PG_CATALOG_TABLE_PG_AMOP: &str = "pg_amop"; +const PG_CATALOG_TABLE_PG_AMPROC: &str = "pg_amproc"; +const PG_CATALOG_TABLE_PG_CAST: &str = "pg_cast"; +const PG_CATALOG_TABLE_PG_COLLATION: &str = "pg_collation"; +const PG_CATALOG_TABLE_PG_CONVERSION: &str = "pg_conversion"; +const PG_CATALOG_TABLE_PG_LANGUAGE: &str = "pg_language"; +const PG_CATALOG_TABLE_PG_OPCLASS: &str = "pg_opclass"; +const PG_CATALOG_TABLE_PG_OPERATOR: &str = "pg_operator"; +const PG_CATALOG_TABLE_PG_OPFAMILY: &str = "pg_opfamily"; +const PG_CATALOG_TABLE_PG_PROC: &str = "pg_proc"; +const PG_CATALOG_TABLE_PG_RANGE: &str = "pg_range"; +const PG_CATALOG_TABLE_PG_TS_CONFIG: &str = "pg_ts_config"; +const PG_CATALOG_TABLE_PG_TS_DICT: &str = "pg_ts_dict"; +const PG_CATALOG_TABLE_PG_TS_PARSER: &str = "pg_ts_parser"; +const PG_CATALOG_TABLE_PG_TS_TEMPLATE: &str = "pg_ts_template"; const PG_CATALOG_TABLE_PG_TYPE: &str = "pg_type"; -const PG_CATALOG_TABLE_PG_CLASS: &str = "pg_class"; const PG_CATALOG_TABLE_PG_ATTRIBUTE: &str = "pg_attribute"; -const PG_CATALOG_TABLE_PG_NAMESPACE: &str = "pg_namespace"; -const PG_CATALOG_TABLE_PG_PROC: &str = "pg_proc"; +const PG_CATALOG_TABLE_PG_ATTRDEF: &str = "pg_attrdef"; +const PG_CATALOG_TABLE_PG_AUTH_MEMBERS: &str = "pg_auth_members"; +const PG_CATALOG_TABLE_PG_AUTHID: &str = "pg_authid"; +const PG_CATALOG_TABLE_PG_CLASS: &str = "pg_class"; +const PG_CATALOG_TABLE_PG_CONSTRAINT: &str = "pg_constraint"; const PG_CATALOG_TABLE_PG_DATABASE: &str = "pg_database"; -const PG_CATALOG_TABLE_PG_AM: &str = "pg_am"; -const PG_CATALOG_TABLE_PG_RANGE: &str = "pg_range"; -const PG_CATALOG_TABLE_PG_ENUM: &str = "pg_enum"; +const PG_CATALOG_TABLE_PG_DB_ROLE_SETTING: &str = "pg_db_role_setting"; +const PG_CATALOG_TABLE_PG_DEFAULT_ACL: &str = "pg_default_acl"; +const PG_CATALOG_TABLE_PG_DEPEND: &str = "pg_depend"; const PG_CATALOG_TABLE_PG_DESCRIPTION: &str = "pg_description"; +const PG_CATALOG_TABLE_PG_ENUM: &str = "pg_enum"; +const PG_CATALOG_TABLE_PG_EVENT_TRIGGER: &str = "pg_event_trigger"; 
+const PG_CATALOG_TABLE_PG_EXTENSION: &str = "pg_extension"; +const PG_CATALOG_TABLE_PG_FOREIGN_DATA_WRAPPER: &str = "pg_foreign_data_wrapper"; +const PG_CATALOG_TABLE_PG_FOREIGN_SERVER: &str = "pg_foreign_server"; +const PG_CATALOG_TABLE_PG_FOREIGN_TABLE: &str = "pg_foreign_table"; +const PG_CATALOG_TABLE_PG_INDEX: &str = "pg_index"; +const PG_CATALOG_TABLE_PG_INHERITS: &str = "pg_inherits"; +const PG_CATALOG_TABLE_PG_INIT_PRIVS: &str = "pg_init_privs"; +const PG_CATALOG_TABLE_PG_LARGEOBJECT: &str = "pg_largeobject"; +const PG_CATALOG_TABLE_PG_LARGEOBJECT_METADATA: &str = "pg_largeobject_metadata"; +const PG_CATALOG_TABLE_PG_NAMESPACE: &str = "pg_namespace"; +const PG_CATALOG_TABLE_PG_PARTITIONED_TABLE: &str = "pg_partitioned_table"; +const PG_CATALOG_TABLE_PG_POLICY: &str = "pg_policy"; +const PG_CATALOG_TABLE_PG_PUBLICATION: &str = "pg_publication"; +const PG_CATALOG_TABLE_PG_PUBLICATION_NAMESPACE: &str = "pg_publication_namespace"; +const PG_CATALOG_TABLE_PG_PUBLICATION_REL: &str = "pg_publication_rel"; +const PG_CATALOG_TABLE_PG_REPLICATION_ORIGIN: &str = "pg_replication_origin"; +const PG_CATALOG_TABLE_PG_REWRITE: &str = "pg_rewrite"; +const PG_CATALOG_TABLE_PG_SECLABEL: &str = "pg_seclabel"; +const PG_CATALOG_TABLE_PG_SEQUENCE: &str = "pg_sequence"; +const PG_CATALOG_TABLE_PG_SHDEPEND: &str = "pg_shdepend"; +const PG_CATALOG_TABLE_PG_SHDESCRIPTION: &str = "pg_shdescription"; +const PG_CATALOG_TABLE_PG_SHSECLABEL: &str = "pg_shseclabel"; +const PG_CATALOG_TABLE_PG_STATISTIC: &str = "pg_statistic"; +const PG_CATALOG_TABLE_PG_STATISTIC_EXT: &str = "pg_statistic_ext"; +const PG_CATALOG_TABLE_PG_STATISTIC_EXT_DATA: &str = "pg_statistic_ext_data"; +const PG_CATALOG_TABLE_PG_SUBSCRIPTION: &str = "pg_subscription"; +const PG_CATALOG_TABLE_PG_SUBSCRIPTION_REL: &str = "pg_subscription_rel"; +const PG_CATALOG_TABLE_PG_TABLESPACE: &str = "pg_tablespace"; +const PG_CATALOG_TABLE_PG_TRIGGER: &str = "pg_trigger"; +const PG_CATALOG_TABLE_PG_USER_MAPPING: &str = "pg_user_mapping"; /// Determine PostgreSQL table type (relkind) from DataFusion TableProvider fn get_table_type(table: &Arc) -> &'static str { @@ -64,156 +121,69 @@ fn get_table_type_with_name( } pub const PG_CATALOG_TABLES: &[&str] = &[ + PG_CATALOG_TABLE_PG_AGGREGATE, + PG_CATALOG_TABLE_PG_AM, + PG_CATALOG_TABLE_PG_AMOP, + PG_CATALOG_TABLE_PG_AMPROC, + PG_CATALOG_TABLE_PG_CAST, + PG_CATALOG_TABLE_PG_COLLATION, + PG_CATALOG_TABLE_PG_CONVERSION, + PG_CATALOG_TABLE_PG_LANGUAGE, + PG_CATALOG_TABLE_PG_OPCLASS, + PG_CATALOG_TABLE_PG_OPERATOR, + PG_CATALOG_TABLE_PG_OPFAMILY, + PG_CATALOG_TABLE_PG_PROC, + PG_CATALOG_TABLE_PG_RANGE, + PG_CATALOG_TABLE_PG_TS_CONFIG, + PG_CATALOG_TABLE_PG_TS_DICT, + PG_CATALOG_TABLE_PG_TS_PARSER, + PG_CATALOG_TABLE_PG_TS_TEMPLATE, PG_CATALOG_TABLE_PG_TYPE, - PG_CATALOG_TABLE_PG_CLASS, PG_CATALOG_TABLE_PG_ATTRIBUTE, - PG_CATALOG_TABLE_PG_NAMESPACE, - PG_CATALOG_TABLE_PG_PROC, + PG_CATALOG_TABLE_PG_ATTRDEF, + PG_CATALOG_TABLE_PG_AUTH_MEMBERS, + PG_CATALOG_TABLE_PG_AUTHID, + PG_CATALOG_TABLE_PG_CLASS, + PG_CATALOG_TABLE_PG_CONSTRAINT, PG_CATALOG_TABLE_PG_DATABASE, - PG_CATALOG_TABLE_PG_AM, - PG_CATALOG_TABLE_PG_RANGE, - PG_CATALOG_TABLE_PG_ENUM, + PG_CATALOG_TABLE_PG_DB_ROLE_SETTING, + PG_CATALOG_TABLE_PG_DEFAULT_ACL, + PG_CATALOG_TABLE_PG_DEPEND, PG_CATALOG_TABLE_PG_DESCRIPTION, + PG_CATALOG_TABLE_PG_ENUM, + PG_CATALOG_TABLE_PG_EVENT_TRIGGER, + PG_CATALOG_TABLE_PG_EXTENSION, + PG_CATALOG_TABLE_PG_FOREIGN_DATA_WRAPPER, + PG_CATALOG_TABLE_PG_FOREIGN_SERVER, + PG_CATALOG_TABLE_PG_FOREIGN_TABLE, + 
PG_CATALOG_TABLE_PG_INDEX, + PG_CATALOG_TABLE_PG_INHERITS, + PG_CATALOG_TABLE_PG_INIT_PRIVS, + PG_CATALOG_TABLE_PG_LARGEOBJECT, + PG_CATALOG_TABLE_PG_LARGEOBJECT_METADATA, + PG_CATALOG_TABLE_PG_NAMESPACE, + PG_CATALOG_TABLE_PG_PARTITIONED_TABLE, + PG_CATALOG_TABLE_PG_POLICY, + PG_CATALOG_TABLE_PG_PUBLICATION, + PG_CATALOG_TABLE_PG_PUBLICATION_NAMESPACE, + PG_CATALOG_TABLE_PG_PUBLICATION_REL, + PG_CATALOG_TABLE_PG_REPLICATION_ORIGIN, + PG_CATALOG_TABLE_PG_REWRITE, + PG_CATALOG_TABLE_PG_SECLABEL, + PG_CATALOG_TABLE_PG_SEQUENCE, + PG_CATALOG_TABLE_PG_SHDEPEND, + PG_CATALOG_TABLE_PG_SHDESCRIPTION, + PG_CATALOG_TABLE_PG_SHSECLABEL, + PG_CATALOG_TABLE_PG_STATISTIC, + PG_CATALOG_TABLE_PG_STATISTIC_EXT, + PG_CATALOG_TABLE_PG_STATISTIC_EXT_DATA, + PG_CATALOG_TABLE_PG_SUBSCRIPTION, + PG_CATALOG_TABLE_PG_SUBSCRIPTION_REL, + PG_CATALOG_TABLE_PG_TABLESPACE, + PG_CATALOG_TABLE_PG_TRIGGER, + PG_CATALOG_TABLE_PG_USER_MAPPING, ]; -// Data structure to hold pg_type table data -#[derive(Debug)] -struct PgTypesData { - oids: Vec, - typnames: Vec, - typnamespaces: Vec, - typowners: Vec, - typlens: Vec, - typbyvals: Vec, - typtypes: Vec, - typcategories: Vec, - typispreferreds: Vec, - typisdefineds: Vec, - typdelims: Vec, - typrelids: Vec, - typelems: Vec, - typarrays: Vec, - typinputs: Vec, - typoutputs: Vec, - typreceives: Vec, - typsends: Vec, - typmodins: Vec, - typmodouts: Vec, - typanalyzes: Vec, - typaligns: Vec, - typstorages: Vec, - typnotnulls: Vec, - typbasetypes: Vec, - typtymods: Vec, - typndimss: Vec, - typcollations: Vec, - typdefaultbins: Vec>, - typdefaults: Vec>, -} - -impl PgTypesData { - fn new() -> Self { - Self { - oids: Vec::new(), - typnames: Vec::new(), - typnamespaces: Vec::new(), - typowners: Vec::new(), - typlens: Vec::new(), - typbyvals: Vec::new(), - typtypes: Vec::new(), - typcategories: Vec::new(), - typispreferreds: Vec::new(), - typisdefineds: Vec::new(), - typdelims: Vec::new(), - typrelids: Vec::new(), - typelems: Vec::new(), - typarrays: Vec::new(), - typinputs: Vec::new(), - typoutputs: Vec::new(), - typreceives: Vec::new(), - typsends: Vec::new(), - typmodins: Vec::new(), - typmodouts: Vec::new(), - typanalyzes: Vec::new(), - typaligns: Vec::new(), - typstorages: Vec::new(), - typnotnulls: Vec::new(), - typbasetypes: Vec::new(), - typtymods: Vec::new(), - typndimss: Vec::new(), - typcollations: Vec::new(), - typdefaultbins: Vec::new(), - typdefaults: Vec::new(), - } - } - - #[allow(clippy::too_many_arguments)] - fn add_type( - &mut self, - oid: i32, - typname: &str, - typnamespace: i32, - typowner: i32, - typlen: i16, - typbyval: bool, - typtype: &str, - typcategory: &str, - typispreferred: bool, - typisdefined: bool, - typdelim: &str, - typrelid: i32, - typelem: i32, - typarray: i32, - typinput: &str, - typoutput: &str, - typreceive: &str, - typsend: &str, - typmodin: &str, - typmodout: &str, - typanalyze: &str, - typalign: &str, - typstorage: &str, - typnotnull: bool, - typbasetype: i32, - typtypmod: i32, - typndims: i32, - typcollation: i32, - typdefaultbin: Option, - typdefault: Option, - ) { - self.oids.push(oid); - self.typnames.push(typname.to_string()); - self.typnamespaces.push(typnamespace); - self.typowners.push(typowner); - self.typlens.push(typlen); - self.typbyvals.push(typbyval); - self.typtypes.push(typtype.to_string()); - self.typcategories.push(typcategory.to_string()); - self.typispreferreds.push(typispreferred); - self.typisdefineds.push(typisdefined); - self.typdelims.push(typdelim.to_string()); - self.typrelids.push(typrelid); - 
self.typelems.push(typelem); - self.typarrays.push(typarray); - self.typinputs.push(typinput.to_string()); - self.typoutputs.push(typoutput.to_string()); - self.typreceives.push(typreceive.to_string()); - self.typsends.push(typsend.to_string()); - self.typmodins.push(typmodin.to_string()); - self.typmodouts.push(typmodout.to_string()); - self.typanalyzes.push(typanalyze.to_string()); - self.typaligns.push(typalign.to_string()); - self.typstorages.push(typstorage.to_string()); - self.typnotnulls.push(typnotnull); - self.typbasetypes.push(typbasetype); - self.typtymods.push(typtypmod); - self.typndimss.push(typndims); - self.typcollations.push(typcollation); - self.typdefaultbins.push(typdefaultbin); - self.typdefaults.push(typdefault); - } -} - #[derive(Debug, Hash, Eq, PartialEq, PartialOrd, Ord)] enum OidCacheKey { Catalog(String), @@ -228,6 +198,7 @@ pub struct PgCatalogSchemaProvider { catalog_list: Arc<dyn CatalogProviderList>, oid_counter: Arc<AtomicU32>, oid_cache: Arc<RwLock<HashMap<OidCacheKey, Oid>>>, + static_tables: PgCatalogStaticTables, } #[async_trait] @@ -242,10 +213,102 @@ impl SchemaProvider for PgCatalogSchemaProvider { async fn table(&self, name: &str) -> Result<Option<Arc<dyn TableProvider>>> { match name.to_ascii_lowercase().as_str() { - PG_CATALOG_TABLE_PG_TYPE => Ok(Some(self.create_pg_type_table())), - PG_CATALOG_TABLE_PG_AM => Ok(Some(self.create_pg_am_table())), - PG_CATALOG_TABLE_PG_CLASS => { - let table = Arc::new(PgClassTable::new( + PG_CATALOG_TABLE_PG_AGGREGATE => Ok(Some(self.static_tables.pg_aggregate.clone())), + PG_CATALOG_TABLE_PG_AM => Ok(Some(self.static_tables.pg_am.clone())), + PG_CATALOG_TABLE_PG_AMOP => Ok(Some(self.static_tables.pg_amop.clone())), + PG_CATALOG_TABLE_PG_AMPROC => Ok(Some(self.static_tables.pg_amproc.clone())), + PG_CATALOG_TABLE_PG_CAST => Ok(Some(self.static_tables.pg_cast.clone())), + PG_CATALOG_TABLE_PG_COLLATION => Ok(Some(self.static_tables.pg_collation.clone())), + PG_CATALOG_TABLE_PG_CONVERSION => Ok(Some(self.static_tables.pg_conversion.clone())), + PG_CATALOG_TABLE_PG_LANGUAGE => Ok(Some(self.static_tables.pg_language.clone())), + PG_CATALOG_TABLE_PG_OPCLASS => Ok(Some(self.static_tables.pg_opclass.clone())), + PG_CATALOG_TABLE_PG_OPERATOR => Ok(Some(self.static_tables.pg_operator.clone())), + PG_CATALOG_TABLE_PG_OPFAMILY => Ok(Some(self.static_tables.pg_opfamily.clone())), + PG_CATALOG_TABLE_PG_PROC => Ok(Some(self.static_tables.pg_proc.clone())), + PG_CATALOG_TABLE_PG_RANGE => Ok(Some(self.static_tables.pg_range.clone())), + PG_CATALOG_TABLE_PG_TS_CONFIG => Ok(Some(self.static_tables.pg_ts_config.clone())), + PG_CATALOG_TABLE_PG_TS_DICT => Ok(Some(self.static_tables.pg_ts_dict.clone())), + PG_CATALOG_TABLE_PG_TS_PARSER => Ok(Some(self.static_tables.pg_ts_parser.clone())), + PG_CATALOG_TABLE_PG_TS_TEMPLATE => Ok(Some(self.static_tables.pg_ts_template.clone())), + PG_CATALOG_TABLE_PG_TYPE => Ok(Some(self.static_tables.pg_type.clone())), + PG_CATALOG_TABLE_PG_ATTRDEF => Ok(Some(self.static_tables.pg_attrdef.clone())), + PG_CATALOG_TABLE_PG_AUTH_MEMBERS => { + Ok(Some(self.static_tables.pg_auth_members.clone())) + } + PG_CATALOG_TABLE_PG_AUTHID => Ok(Some(self.static_tables.pg_authid.clone())), + + PG_CATALOG_TABLE_PG_CONSTRAINT => Ok(Some(self.static_tables.pg_constraint.clone())), + + PG_CATALOG_TABLE_PG_DB_ROLE_SETTING => { + Ok(Some(self.static_tables.pg_db_role_setting.clone())) + } + PG_CATALOG_TABLE_PG_DEFAULT_ACL => Ok(Some(self.static_tables.pg_default_acl.clone())), + PG_CATALOG_TABLE_PG_DEPEND => Ok(Some(self.static_tables.pg_depend.clone())), + PG_CATALOG_TABLE_PG_DESCRIPTION =>
Ok(Some(self.static_tables.pg_description.clone())), + PG_CATALOG_TABLE_PG_ENUM => Ok(Some(self.static_tables.pg_enum.clone())), + PG_CATALOG_TABLE_PG_EVENT_TRIGGER => { + Ok(Some(self.static_tables.pg_event_trigger.clone())) + } + PG_CATALOG_TABLE_PG_EXTENSION => Ok(Some(self.static_tables.pg_extension.clone())), + PG_CATALOG_TABLE_PG_FOREIGN_DATA_WRAPPER => { + Ok(Some(self.static_tables.pg_foreign_data_wrapper.clone())) + } + PG_CATALOG_TABLE_PG_FOREIGN_SERVER => { + Ok(Some(self.static_tables.pg_foreign_server.clone())) + } + PG_CATALOG_TABLE_PG_FOREIGN_TABLE => { + Ok(Some(self.static_tables.pg_foreign_table.clone())) + } + PG_CATALOG_TABLE_PG_INDEX => Ok(Some(self.static_tables.pg_index.clone())), + PG_CATALOG_TABLE_PG_INHERITS => Ok(Some(self.static_tables.pg_inherits.clone())), + PG_CATALOG_TABLE_PG_INIT_PRIVS => Ok(Some(self.static_tables.pg_init_privs.clone())), + PG_CATALOG_TABLE_PG_LARGEOBJECT => Ok(Some(self.static_tables.pg_largeobject.clone())), + PG_CATALOG_TABLE_PG_LARGEOBJECT_METADATA => { + Ok(Some(self.static_tables.pg_largeobject_metadata.clone())) + } + PG_CATALOG_TABLE_PG_PARTITIONED_TABLE => { + Ok(Some(self.static_tables.pg_partitioned_table.clone())) + } + PG_CATALOG_TABLE_PG_POLICY => Ok(Some(self.static_tables.pg_policy.clone())), + PG_CATALOG_TABLE_PG_PUBLICATION => Ok(Some(self.static_tables.pg_publication.clone())), + PG_CATALOG_TABLE_PG_PUBLICATION_NAMESPACE => { + Ok(Some(self.static_tables.pg_publication_namespace.clone())) + } + PG_CATALOG_TABLE_PG_PUBLICATION_REL => { + Ok(Some(self.static_tables.pg_publication_rel.clone())) + } + PG_CATALOG_TABLE_PG_REPLICATION_ORIGIN => { + Ok(Some(self.static_tables.pg_replication_origin.clone())) + } + PG_CATALOG_TABLE_PG_REWRITE => Ok(Some(self.static_tables.pg_rewrite.clone())), + PG_CATALOG_TABLE_PG_SECLABEL => Ok(Some(self.static_tables.pg_seclabel.clone())), + PG_CATALOG_TABLE_PG_SEQUENCE => Ok(Some(self.static_tables.pg_sequence.clone())), + PG_CATALOG_TABLE_PG_SHDEPEND => Ok(Some(self.static_tables.pg_shdepend.clone())), + PG_CATALOG_TABLE_PG_SHDESCRIPTION => { + Ok(Some(self.static_tables.pg_shdescription.clone())) + } + PG_CATALOG_TABLE_PG_SHSECLABEL => Ok(Some(self.static_tables.pg_shseclabel.clone())), + PG_CATALOG_TABLE_PG_STATISTIC => Ok(Some(self.static_tables.pg_statistic.clone())), + PG_CATALOG_TABLE_PG_STATISTIC_EXT => { + Ok(Some(self.static_tables.pg_statistic_ext.clone())) + } + PG_CATALOG_TABLE_PG_STATISTIC_EXT_DATA => { + Ok(Some(self.static_tables.pg_statistic_ext_data.clone())) + } + PG_CATALOG_TABLE_PG_SUBSCRIPTION => { + Ok(Some(self.static_tables.pg_subscription.clone())) + } + PG_CATALOG_TABLE_PG_SUBSCRIPTION_REL => { + Ok(Some(self.static_tables.pg_subscription_rel.clone())) + } + PG_CATALOG_TABLE_PG_TABLESPACE => Ok(Some(self.static_tables.pg_tablespace.clone())), + PG_CATALOG_TABLE_PG_TRIGGER => Ok(Some(self.static_tables.pg_trigger.clone())), + PG_CATALOG_TABLE_PG_USER_MAPPING => { + Ok(Some(self.static_tables.pg_user_mapping.clone())) + } + + PG_CATALOG_TABLE_PG_ATTRIBUTE => { + let table = Arc::new(pg_attribute::PgAttributeTable::new( self.catalog_list.clone(), self.oid_counter.clone(), self.oid_cache.clone(), @@ -254,8 +317,8 @@ impl SchemaProvider for PgCatalogSchemaProvider { StreamingTable::try_new(Arc::clone(table.schema()), vec![table]).unwrap(), ))) } - PG_CATALOG_TABLE_PG_NAMESPACE => { - let table = Arc::new(PgNamespaceTable::new( + PG_CATALOG_TABLE_PG_CLASS => { + let table = Arc::new(pg_class::PgClassTable::new( self.catalog_list.clone(), self.oid_counter.clone(), 
self.oid_cache.clone(), @@ -265,7 +328,7 @@ impl SchemaProvider for PgCatalogSchemaProvider { ))) } PG_CATALOG_TABLE_PG_DATABASE => { - let table = Arc::new(PgDatabaseTable::new( + let table = Arc::new(pg_database::PgDatabaseTable::new( self.catalog_list.clone(), self.oid_counter.clone(), self.oid_cache.clone(), @@ -274,16 +337,17 @@ impl SchemaProvider for PgCatalogSchemaProvider { StreamingTable::try_new(Arc::clone(table.schema()), vec![table]).unwrap(), ))) } - PG_CATALOG_TABLE_PG_ATTRIBUTE => { - let table = Arc::new(PgAttributeTable::new(self.catalog_list.clone())); + PG_CATALOG_TABLE_PG_NAMESPACE => { + let table = Arc::new(pg_namespace::PgNamespaceTable::new( + self.catalog_list.clone(), + self.oid_counter.clone(), + self.oid_cache.clone(), + )); Ok(Some(Arc::new( StreamingTable::try_new(Arc::clone(table.schema()), vec![table]).unwrap(), ))) } - PG_CATALOG_TABLE_PG_PROC => Ok(Some(self.create_pg_proc_table())), - PG_CATALOG_TABLE_PG_RANGE => Ok(Some(self.create_pg_range_table())), - PG_CATALOG_TABLE_PG_ENUM => Ok(Some(self.create_pg_enum_table())), - PG_CATALOG_TABLE_PG_DESCRIPTION => Ok(Some(self.create_pg_description_table())), + _ => Ok(None), } } @@ -294,1518 +358,314 @@ impl SchemaProvider for PgCatalogSchemaProvider { } impl PgCatalogSchemaProvider { - pub fn new(catalog_list: Arc<dyn CatalogProviderList>) -> PgCatalogSchemaProvider { - Self { + pub fn try_new(catalog_list: Arc<dyn CatalogProviderList>) -> Result<PgCatalogSchemaProvider> { + Ok(Self { catalog_list, oid_counter: Arc::new(AtomicU32::new(16384)), oid_cache: Arc::new(RwLock::new(HashMap::new())), - } - } - - /// Create a populated pg_type table with standard PostgreSQL data types - fn create_pg_type_table(&self) -> Arc<dyn TableProvider> { - // Define complete schema for pg_type (matching PostgreSQL) - let schema = Arc::new(Schema::new(vec![ - Field::new("oid", DataType::Int32, false), - Field::new("typname", DataType::Utf8, false), - Field::new("typnamespace", DataType::Int32, false), - Field::new("typowner", DataType::Int32, false), - Field::new("typlen", DataType::Int16, false), - Field::new("typbyval", DataType::Boolean, false), - Field::new("typtype", DataType::Utf8, false), - Field::new("typcategory", DataType::Utf8, false), - Field::new("typispreferred", DataType::Boolean, false), - Field::new("typisdefined", DataType::Boolean, false), - Field::new("typdelim", DataType::Utf8, false), - Field::new("typrelid", DataType::Int32, false), - Field::new("typelem", DataType::Int32, false), - Field::new("typarray", DataType::Int32, false), - Field::new("typinput", DataType::Utf8, false), - Field::new("typoutput", DataType::Utf8, false), - Field::new("typreceive", DataType::Utf8, false), - Field::new("typsend", DataType::Utf8, false), - Field::new("typmodin", DataType::Utf8, false), - Field::new("typmodout", DataType::Utf8, false), - Field::new("typanalyze", DataType::Utf8, false), - Field::new("typalign", DataType::Utf8, false), - Field::new("typstorage", DataType::Utf8, false), - Field::new("typnotnull", DataType::Boolean, false), - Field::new("typbasetype", DataType::Int32, false), - Field::new("typtypmod", DataType::Int32, false), - Field::new("typndims", DataType::Int32, false), - Field::new("typcollation", DataType::Int32, false), - Field::new("typdefaultbin", DataType::Utf8, true), - Field::new("typdefault", DataType::Utf8, true), - ])); - - // Create standard PostgreSQL data types - let pg_types_data = Self::get_standard_pg_types(); - - // Create RecordBatch from the data - let arrays: Vec<ArrayRef> = vec![ - Arc::new(Int32Array::from(pg_types_data.oids)), - Arc::new(StringArray::from(pg_types_data.typnames)), -
Arc::new(Int32Array::from(pg_types_data.typnamespaces)), - Arc::new(Int32Array::from(pg_types_data.typowners)), - Arc::new(Int16Array::from(pg_types_data.typlens)), - Arc::new(BooleanArray::from(pg_types_data.typbyvals)), - Arc::new(StringArray::from(pg_types_data.typtypes)), - Arc::new(StringArray::from(pg_types_data.typcategories)), - Arc::new(BooleanArray::from(pg_types_data.typispreferreds)), - Arc::new(BooleanArray::from(pg_types_data.typisdefineds)), - Arc::new(StringArray::from(pg_types_data.typdelims)), - Arc::new(Int32Array::from(pg_types_data.typrelids)), - Arc::new(Int32Array::from(pg_types_data.typelems)), - Arc::new(Int32Array::from(pg_types_data.typarrays)), - Arc::new(StringArray::from(pg_types_data.typinputs)), - Arc::new(StringArray::from(pg_types_data.typoutputs)), - Arc::new(StringArray::from(pg_types_data.typreceives)), - Arc::new(StringArray::from(pg_types_data.typsends)), - Arc::new(StringArray::from(pg_types_data.typmodins)), - Arc::new(StringArray::from(pg_types_data.typmodouts)), - Arc::new(StringArray::from(pg_types_data.typanalyzes)), - Arc::new(StringArray::from(pg_types_data.typaligns)), - Arc::new(StringArray::from(pg_types_data.typstorages)), - Arc::new(BooleanArray::from(pg_types_data.typnotnulls)), - Arc::new(Int32Array::from(pg_types_data.typbasetypes)), - Arc::new(Int32Array::from(pg_types_data.typtymods)), - Arc::new(Int32Array::from(pg_types_data.typndimss)), - Arc::new(Int32Array::from(pg_types_data.typcollations)), - Arc::new(StringArray::from_iter( - pg_types_data.typdefaultbins.into_iter(), - )), - Arc::new(StringArray::from_iter( - pg_types_data.typdefaults.into_iter(), - )), - ]; - - let batch = RecordBatch::try_new(schema.clone(), arrays).unwrap(); - - // Create memory table with populated data - let provider = MemTable::try_new(schema, vec![vec![batch]]).unwrap(); - - Arc::new(provider) - } - - /// Generate standard PostgreSQL data types for pg_type table - fn get_standard_pg_types() -> PgTypesData { - let mut data = PgTypesData::new(); - - // Basic data types commonly used - data.add_type( - 16, "bool", 11, 10, 1, true, "b", "B", true, true, ",", 0, 0, 1000, "boolin", - "boolout", "boolrecv", "boolsend", "-", "-", "-", "c", "p", false, 0, -1, 0, 0, None, - None, - ); - data.add_type( - 17, - "bytea", - 11, - 10, - -1, - false, - "b", - "U", - false, - true, - ",", - 0, - 0, - 1001, - "byteain", - "byteaout", - "bytearecv", - "byteasend", - "-", - "-", - "-", - "i", - "x", - false, - 0, - -1, - 0, - 0, - None, - None, - ); - data.add_type( - 18, "char", 11, 10, 1, true, "b", "S", false, true, ",", 0, 0, 1002, "charin", - "charout", "charrecv", "charsend", "-", "-", "-", "c", "p", false, 0, -1, 0, 0, None, - None, - ); - data.add_type( - 19, "name", 11, 10, 64, false, "b", "S", false, true, ",", 0, 0, 1003, "namein", - "nameout", "namerecv", "namesend", "-", "-", "-", "i", "p", false, 0, -1, 0, 0, None, - None, - ); - data.add_type( - 20, "int8", 11, 10, 8, true, "b", "N", false, true, ",", 0, 0, 1016, "int8in", - "int8out", "int8recv", "int8send", "-", "-", "-", "d", "p", false, 0, -1, 0, 0, None, - None, - ); - data.add_type( - 21, "int2", 11, 10, 2, true, "b", "N", false, true, ",", 0, 0, 1005, "int2in", - "int2out", "int2recv", "int2send", "-", "-", "-", "s", "p", false, 0, -1, 0, 0, None, - None, - ); - data.add_type( - 23, "int4", 11, 10, 4, true, "b", "N", true, true, ",", 0, 0, 1007, "int4in", - "int4out", "int4recv", "int4send", "-", "-", "-", "i", "p", false, 0, -1, 0, 0, None, - None, - ); - data.add_type( - 25, "text", 11, 10, -1, 
false, "b", "S", true, true, ",", 0, 0, 1009, "textin", - "textout", "textrecv", "textsend", "-", "-", "-", "i", "x", false, 0, -1, 0, 100, None, - None, - ); - data.add_type( - 700, - "float4", - 11, - 10, - 4, - true, - "b", - "N", - false, - true, - ",", - 0, - 0, - 1021, - "float4in", - "float4out", - "float4recv", - "float4send", - "-", - "-", - "-", - "i", - "p", - false, - 0, - -1, - 0, - 0, - None, - None, - ); - data.add_type( - 701, - "float8", - 11, - 10, - 8, - true, - "b", - "N", - true, - true, - ",", - 0, - 0, - 1022, - "float8in", - "float8out", - "float8recv", - "float8send", - "-", - "-", - "-", - "d", - "p", - false, - 0, - -1, - 0, - 0, - None, - None, - ); - data.add_type( - 1043, - "varchar", - 11, - 10, - -1, - false, - "b", - "S", - false, - true, - ",", - 0, - 0, - 1015, - "varcharin", - "varcharout", - "varcharrecv", - "varcharsend", - "varchartypmodin", - "varchartypmodout", - "-", - "i", - "x", - false, - 0, - -1, - 0, - 100, - None, - None, - ); - data.add_type( - 1082, - "date", - 11, - 10, - 4, - true, - "b", - "D", - false, - true, - ",", - 0, - 0, - 1182, - "date_in", - "date_out", - "date_recv", - "date_send", - "-", - "-", - "-", - "i", - "p", - false, - 0, - -1, - 0, - 0, - None, - None, - ); - data.add_type( - 1083, - "time", - 11, - 10, - 8, - true, - "b", - "D", - false, - true, - ",", - 0, - 0, - 1183, - "time_in", - "time_out", - "time_recv", - "time_send", - "timetypmodin", - "timetypmodout", - "-", - "d", - "p", - false, - 0, - -1, - 0, - 0, - None, - None, - ); - data.add_type( - 1114, - "timestamp", - 11, - 10, - 8, - true, - "b", - "D", - false, - true, - ",", - 0, - 0, - 1115, - "timestamp_in", - "timestamp_out", - "timestamp_recv", - "timestamp_send", - "timestamptypmodin", - "timestamptypmodout", - "-", - "d", - "p", - false, - 0, - -1, - 0, - 0, - None, - None, - ); - data.add_type( - 1184, - "timestamptz", - 11, - 10, - 8, - true, - "b", - "D", - true, - true, - ",", - 0, - 0, - 1185, - "timestamptz_in", - "timestamptz_out", - "timestamptz_recv", - "timestamptz_send", - "timestamptztypmodin", - "timestamptztypmodout", - "-", - "d", - "p", - false, - 0, - -1, - 0, - 0, - None, - None, - ); - data.add_type( - 1700, - "numeric", - 11, - 10, - -1, - false, - "b", - "N", - false, - true, - ",", - 0, - 0, - 1231, - "numeric_in", - "numeric_out", - "numeric_recv", - "numeric_send", - "numerictypmodin", - "numerictypmodout", - "-", - "i", - "m", - false, - 0, - -1, - 0, - 0, - None, - None, - ); - - data - } - - /// Create a mock empty table for pg_am - fn create_pg_am_table(&self) -> Arc { - // Define the schema for pg_am - // This matches PostgreSQL's pg_am table columns - let schema = Arc::new(Schema::new(vec![ - Field::new("oid", DataType::Int32, false), // Object identifier - Field::new("amname", DataType::Utf8, false), // Name of the access method - Field::new("amhandler", DataType::Int32, false), // OID of handler function - Field::new("amtype", DataType::Utf8, false), // Type of access method (i=index, t=table) - Field::new("amstrategies", DataType::Int32, false), // Number of operator strategies - Field::new("amsupport", DataType::Int32, false), // Number of support routines - Field::new("amcanorder", DataType::Boolean, false), // Does AM support ordered scans? - Field::new("amcanorderbyop", DataType::Boolean, false), // Does AM support order by operator result? - Field::new("amcanbackward", DataType::Boolean, false), // Does AM support backward scanning? 
- Field::new("amcanunique", DataType::Boolean, false), // Does AM support unique indexes? - Field::new("amcanmulticol", DataType::Boolean, false), // Does AM support multi-column indexes? - Field::new("amoptionalkey", DataType::Boolean, false), // Can first index column be omitted in search? - Field::new("amsearcharray", DataType::Boolean, false), // Does AM support ScalarArrayOpExpr searches? - Field::new("amsearchnulls", DataType::Boolean, false), // Does AM support searching for NULL/NOT NULL? - Field::new("amstorage", DataType::Boolean, false), // Can storage type differ from column type? - Field::new("amclusterable", DataType::Boolean, false), // Can index be clustered on? - Field::new("ampredlocks", DataType::Boolean, false), // Does AM manage fine-grained predicate locks? - Field::new("amcanparallel", DataType::Boolean, false), // Does AM support parallel scan? - Field::new("amcanbeginscan", DataType::Boolean, false), // Does AM support BRIN index scans? - Field::new("amcanmarkpos", DataType::Boolean, false), // Does AM support mark/restore positions? - Field::new("amcanfetch", DataType::Boolean, false), // Does AM support fetching specific tuples? - Field::new("amkeytype", DataType::Int32, false), // Type of data in index - ])); - - // Create memory table with schema - let provider = MemTable::try_new(schema, vec![vec![]]).unwrap(); - - Arc::new(provider) - } - - /// Create a mock empty table for pg_range - fn create_pg_range_table(&self) -> Arc { - // Define the schema for pg_range - // This matches PostgreSQL's pg_range table columns - let schema = Arc::new(Schema::new(vec![ - Field::new("rngtypid", DataType::Int32, false), // OID of the range type - Field::new("rngsubtype", DataType::Int32, false), // OID of the element type (subtype) of this range type - Field::new("rngmultitypid", DataType::Int32, false), // OID of the multirange type for this range type - Field::new("rngcollation", DataType::Int32, false), // OID of the collation used for range comparisons, or zero if none - Field::new("rngsubopc", DataType::Int32, false), // OID of the subtype's operator class used for range comparisons - Field::new("rngcanonical", DataType::Int32, false), // OID of the function to convert a range value into canonical form, or zero if none - Field::new("rngsubdiff", DataType::Int32, false), // OID of the function to return the difference between two element values as double precision, or zero if none - ])); - - // Create memory table with schema - let provider = MemTable::try_new(schema, vec![vec![]]).unwrap(); - Arc::new(provider) - } - - /// Create a mock empty table for pg_enum - fn create_pg_enum_table(&self) -> Arc { - let schema = Arc::new(Schema::new(vec![ - Field::new("oid", DataType::Int32, false), // Row identifier - Field::new("enumtypid", DataType::Int32, false), // The OID of the pg_type entry owning this enum value - Field::new("enumsortorder", DataType::Float32, false), // The sort position of this enum value within its enum type - Field::new("enumlabel", DataType::Utf8, false), // The textual label for this enum value - ])); - let provider = MemTable::try_new(schema, vec![vec![]]).unwrap(); - Arc::new(provider) - } - - /// Create a mock empty table for pg_description - fn create_pg_description_table(&self) -> Arc { - let schema = Arc::new(Schema::new(vec![ - Field::new("objoid", DataType::Int32, false), // Oid - Field::new("classoid", DataType::Int32, false), // Oid of the obj class - Field::new("objsubid", DataType::Int32, false), // subid - Field::new("description", 
DataType::Utf8, false), - ])); - let provider = MemTable::try_new(schema, vec![vec![]]).unwrap(); - Arc::new(provider) - } - - /// Create a populated pg_proc table with standard PostgreSQL functions - fn create_pg_proc_table(&self) -> Arc { - // Define complete schema for pg_proc (matching PostgreSQL) - let schema = Arc::new(Schema::new(vec![ - Field::new("oid", DataType::Int32, false), // Object identifier - Field::new("proname", DataType::Utf8, false), // Function name - Field::new("pronamespace", DataType::Int32, false), // OID of namespace containing function - Field::new("proowner", DataType::Int32, false), // Owner of the function - Field::new("prolang", DataType::Int32, false), // Implementation language - Field::new("procost", DataType::Float32, false), // Estimated execution cost - Field::new("prorows", DataType::Float32, false), // Estimated result size for set-returning functions - Field::new("provariadic", DataType::Int32, false), // Element type of variadic array - Field::new("prosupport", DataType::Int32, false), // Support function OID - Field::new("prokind", DataType::Utf8, false), // f=function, p=procedure, a=aggregate, w=window - Field::new("prosecdef", DataType::Boolean, false), // Security definer flag - Field::new("proleakproof", DataType::Boolean, false), // Leak-proof flag - Field::new("proisstrict", DataType::Boolean, false), // Returns null if any argument is null - Field::new("proretset", DataType::Boolean, false), // Returns a set (vs scalar) - Field::new("provolatile", DataType::Utf8, false), // i=immutable, s=stable, v=volatile - Field::new("proparallel", DataType::Utf8, false), // s=safe, r=restricted, u=unsafe - Field::new("pronargs", DataType::Int16, false), // Number of input arguments - Field::new("pronargdefaults", DataType::Int16, false), // Number of arguments with defaults - Field::new("prorettype", DataType::Int32, false), // OID of return type - Field::new("proargtypes", DataType::Utf8, false), // Array of argument type OIDs - Field::new("proallargtypes", DataType::Utf8, true), // Array of all argument type OIDs - Field::new("proargmodes", DataType::Utf8, true), // Array of argument modes - Field::new("proargnames", DataType::Utf8, true), // Array of argument names - Field::new("proargdefaults", DataType::Utf8, true), // Expression for argument defaults - Field::new("protrftypes", DataType::Utf8, true), // Transform types - Field::new("prosrc", DataType::Utf8, false), // Function source code - Field::new("probin", DataType::Utf8, true), // Binary file containing function - Field::new("prosqlbody", DataType::Utf8, true), // SQL function body - Field::new("proconfig", DataType::Utf8, true), // Configuration variables - Field::new("proacl", DataType::Utf8, true), // Access privileges - ])); - - // Create standard PostgreSQL functions - let pg_proc_data = Self::get_standard_pg_functions(); - - // Create RecordBatch from the data - let arrays: Vec = vec![ - Arc::new(Int32Array::from(pg_proc_data.oids)), - Arc::new(StringArray::from(pg_proc_data.pronames)), - Arc::new(Int32Array::from(pg_proc_data.pronamespaces)), - Arc::new(Int32Array::from(pg_proc_data.proowners)), - Arc::new(Int32Array::from(pg_proc_data.prolangs)), - Arc::new(Float32Array::from(pg_proc_data.procosts)), - Arc::new(Float32Array::from(pg_proc_data.prorows)), - Arc::new(Int32Array::from(pg_proc_data.provariadics)), - Arc::new(Int32Array::from(pg_proc_data.prosupports)), - Arc::new(StringArray::from(pg_proc_data.prokinds)), - Arc::new(BooleanArray::from(pg_proc_data.prosecdefs)), - 
Arc::new(BooleanArray::from(pg_proc_data.proleakproofs)), - Arc::new(BooleanArray::from(pg_proc_data.proisstricts)), - Arc::new(BooleanArray::from(pg_proc_data.proretsets)), - Arc::new(StringArray::from(pg_proc_data.provolatiles)), - Arc::new(StringArray::from(pg_proc_data.proparallels)), - Arc::new(Int16Array::from(pg_proc_data.pronargs)), - Arc::new(Int16Array::from(pg_proc_data.pronargdefaults)), - Arc::new(Int32Array::from(pg_proc_data.prorettypes)), - Arc::new(StringArray::from(pg_proc_data.proargtypes)), - Arc::new(StringArray::from_iter( - pg_proc_data.proallargtypes.into_iter(), - )), - Arc::new(StringArray::from_iter(pg_proc_data.proargmodes.into_iter())), - Arc::new(StringArray::from_iter(pg_proc_data.proargnames.into_iter())), - Arc::new(StringArray::from_iter( - pg_proc_data.proargdefaults.into_iter(), - )), - Arc::new(StringArray::from_iter(pg_proc_data.protrftypes.into_iter())), - Arc::new(StringArray::from(pg_proc_data.prosrcs)), - Arc::new(StringArray::from_iter(pg_proc_data.probins.into_iter())), - Arc::new(StringArray::from_iter(pg_proc_data.prosqlbodys.into_iter())), - Arc::new(StringArray::from_iter(pg_proc_data.proconfigs.into_iter())), - Arc::new(StringArray::from_iter(pg_proc_data.proacls.into_iter())), - ]; - - let batch = RecordBatch::try_new(schema.clone(), arrays).unwrap(); - - // Create memory table with populated data - let provider = MemTable::try_new(schema, vec![vec![batch]]).unwrap(); - - Arc::new(provider) - } - - /// Generate standard PostgreSQL functions for pg_proc table - fn get_standard_pg_functions() -> PgProcData { - let mut data = PgProcData::new(); - - // Essential PostgreSQL functions that many tools expect - data.add_function( - 1242, "boolin", 11, 10, 12, 1.0, 0.0, 0, 0, "f", false, true, true, false, "i", "s", 1, - 0, 16, "2275", None, None, None, None, None, "boolin", None, None, None, None, - ); - data.add_function( - 1243, "boolout", 11, 10, 12, 1.0, 0.0, 0, 0, "f", false, true, true, false, "i", "s", - 1, 0, 2275, "16", None, None, None, None, None, "boolout", None, None, None, None, - ); - data.add_function( - 1564, "textin", 11, 10, 12, 1.0, 0.0, 0, 0, "f", false, true, true, false, "i", "s", 1, - 0, 25, "2275", None, None, None, None, None, "textin", None, None, None, None, - ); - data.add_function( - 1565, "textout", 11, 10, 12, 1.0, 0.0, 0, 0, "f", false, true, true, false, "i", "s", - 1, 0, 2275, "25", None, None, None, None, None, "textout", None, None, None, None, - ); - data.add_function( - 1242, - "version", - 11, - 10, - 12, - 1.0, - 0.0, - 0, - 0, - "f", - false, - true, - false, - false, - "s", - "s", - 0, - 0, - 25, - "", - None, - None, - None, - None, - None, - "SELECT 'DataFusion PostgreSQL 48.0.0 on x86_64-pc-linux-gnu'", - None, - None, - None, - None, - ); - - data - } -} - -// Data structure to hold pg_proc table data -#[derive(Debug)] -struct PgProcData { - oids: Vec, - pronames: Vec, - pronamespaces: Vec, - proowners: Vec, - prolangs: Vec, - procosts: Vec, - prorows: Vec, - provariadics: Vec, - prosupports: Vec, - prokinds: Vec, - prosecdefs: Vec, - proleakproofs: Vec, - proisstricts: Vec, - proretsets: Vec, - provolatiles: Vec, - proparallels: Vec, - pronargs: Vec, - pronargdefaults: Vec, - prorettypes: Vec, - proargtypes: Vec, - proallargtypes: Vec>, - proargmodes: Vec>, - proargnames: Vec>, - proargdefaults: Vec>, - protrftypes: Vec>, - prosrcs: Vec, - probins: Vec>, - prosqlbodys: Vec>, - proconfigs: Vec>, - proacls: Vec>, -} - -impl PgProcData { - fn new() -> Self { - Self { - oids: Vec::new(), - pronames: 
Vec::new(), - pronamespaces: Vec::new(), - proowners: Vec::new(), - prolangs: Vec::new(), - procosts: Vec::new(), - prorows: Vec::new(), - provariadics: Vec::new(), - prosupports: Vec::new(), - prokinds: Vec::new(), - prosecdefs: Vec::new(), - proleakproofs: Vec::new(), - proisstricts: Vec::new(), - proretsets: Vec::new(), - provolatiles: Vec::new(), - proparallels: Vec::new(), - pronargs: Vec::new(), - pronargdefaults: Vec::new(), - prorettypes: Vec::new(), - proargtypes: Vec::new(), - proallargtypes: Vec::new(), - proargmodes: Vec::new(), - proargnames: Vec::new(), - proargdefaults: Vec::new(), - protrftypes: Vec::new(), - prosrcs: Vec::new(), - probins: Vec::new(), - prosqlbodys: Vec::new(), - proconfigs: Vec::new(), - proacls: Vec::new(), - } - } - - #[allow(clippy::too_many_arguments)] - fn add_function( - &mut self, - oid: i32, - proname: &str, - pronamespace: i32, - proowner: i32, - prolang: i32, - procost: f32, - prorows: f32, - provariadic: i32, - prosupport: i32, - prokind: &str, - prosecdef: bool, - proleakproof: bool, - proisstrict: bool, - proretset: bool, - provolatile: &str, - proparallel: &str, - pronargs: i16, - pronargdefaults: i16, - prorettype: i32, - proargtypes: &str, - proallargtypes: Option<String>, - proargmodes: Option<String>, - proargnames: Option<String>, - proargdefaults: Option<String>, - protrftypes: Option<String>, - prosrc: &str, - probin: Option<String>, - prosqlbody: Option<String>, - proconfig: Option<String>, - proacl: Option<String>, - ) { - self.oids.push(oid); - self.pronames.push(proname.to_string()); - self.pronamespaces.push(pronamespace); - self.proowners.push(proowner); - self.prolangs.push(prolang); - self.procosts.push(procost); - self.prorows.push(prorows); - self.provariadics.push(provariadic); - self.prosupports.push(prosupport); - self.prokinds.push(prokind.to_string()); - self.prosecdefs.push(prosecdef); - self.proleakproofs.push(proleakproof); - self.proisstricts.push(proisstrict); - self.proretsets.push(proretset); - self.provolatiles.push(provolatile.to_string()); - self.proparallels.push(proparallel.to_string()); - self.pronargs.push(pronargs); - self.pronargdefaults.push(pronargdefaults); - self.prorettypes.push(prorettype); - self.proargtypes.push(proargtypes.to_string()); - self.proallargtypes.push(proallargtypes); - self.proargmodes.push(proargmodes); - self.proargnames.push(proargnames); - self.proargdefaults.push(proargdefaults); - self.protrftypes.push(protrftypes); - self.prosrcs.push(prosrc.to_string()); - self.probins.push(probin); - self.prosqlbodys.push(prosqlbody); - self.proconfigs.push(proconfig); - self.proacls.push(proacl); + static_tables: PgCatalogStaticTables::try_new()?, + }) } } +/// A table that serves record batches decoded from Arrow IPC bytes #[derive(Debug, Clone)] -struct PgClassTable { +struct ArrowTable { schema: SchemaRef, - catalog_list: Arc<dyn CatalogProviderList>, - oid_counter: Arc<AtomicU32>, - oid_cache: Arc<RwLock<HashMap<OidCacheKey, Oid>>>, + data: Vec<RecordBatch>, } -impl PgClassTable { - fn new( - catalog_list: Arc<dyn CatalogProviderList>, - oid_counter: Arc<AtomicU32>, - oid_cache: Arc<RwLock<HashMap<OidCacheKey, Oid>>>, - ) -> PgClassTable { - // Define the schema for pg_class - // This matches key columns from PostgreSQL's pg_class - let schema = Arc::new(Schema::new(vec![ - Field::new("oid", DataType::Int32, false), // Object identifier - Field::new("relname", DataType::Utf8, false), // Name of the table, index, view, etc.
- Field::new("relnamespace", DataType::Int32, false), // OID of the namespace that contains this relation - Field::new("reltype", DataType::Int32, false), // OID of the data type (composite type) this table describes - Field::new("reloftype", DataType::Int32, true), // OID of the composite type for typed table, 0 otherwise - Field::new("relowner", DataType::Int32, false), // Owner of the relation - Field::new("relam", DataType::Int32, false), // If this is an index, the access method used - Field::new("relfilenode", DataType::Int32, false), // Name of the on-disk file of this relation - Field::new("reltablespace", DataType::Int32, false), // Tablespace OID for this relation - Field::new("relpages", DataType::Int32, false), // Size of the on-disk representation in pages - Field::new("reltuples", DataType::Float64, false), // Number of tuples - Field::new("relallvisible", DataType::Int32, false), // Number of all-visible pages - Field::new("reltoastrelid", DataType::Int32, false), // OID of the TOAST table - Field::new("relhasindex", DataType::Boolean, false), // True if this is a table and it has (or recently had) any indexes - Field::new("relisshared", DataType::Boolean, false), // True if this table is shared across all databases - Field::new("relpersistence", DataType::Utf8, false), // p=permanent table, u=unlogged table, t=temporary table - Field::new("relkind", DataType::Utf8, false), // r=ordinary table, i=index, S=sequence, v=view, etc. - Field::new("relnatts", DataType::Int16, false), // Number of user columns - Field::new("relchecks", DataType::Int16, false), // Number of CHECK constraints - Field::new("relhasrules", DataType::Boolean, false), // True if table has (or once had) rules - Field::new("relhastriggers", DataType::Boolean, false), // True if table has (or once had) triggers - Field::new("relhassubclass", DataType::Boolean, false), // True if table or index has (or once had) any inheritance children - Field::new("relrowsecurity", DataType::Boolean, false), // True if row security is enabled - Field::new("relforcerowsecurity", DataType::Boolean, false), // True if row security forced for owners - Field::new("relispopulated", DataType::Boolean, false), // True if relation is populated (not true for some materialized views) - Field::new("relreplident", DataType::Utf8, false), // Columns used to form "replica identity" for rows - Field::new("relispartition", DataType::Boolean, false), // True if table is a partition - Field::new("relrewrite", DataType::Int32, true), // OID of a rule that rewrites this relation - Field::new("relfrozenxid", DataType::Int32, false), // All transaction IDs before this have been replaced with a permanent ("frozen") transaction ID - Field::new("relminmxid", DataType::Int32, false), // All Multixact IDs before this have been replaced with a transaction ID - ])); - - Self { - schema, - catalog_list, - oid_counter, - oid_cache, - } - } - - /// Generate record batches based on the current state of the catalog - async fn get_data(this: PgClassTable) -> Result { - // Vectors to store column data - let mut oids = Vec::new(); - let mut relnames = Vec::new(); - let mut relnamespaces = Vec::new(); - let mut reltypes = Vec::new(); - let mut reloftypes = Vec::new(); - let mut relowners = Vec::new(); - let mut relams = Vec::new(); - let mut relfilenodes = Vec::new(); - let mut reltablespaces = Vec::new(); - let mut relpages = Vec::new(); - let mut reltuples = Vec::new(); - let mut relallvisibles = Vec::new(); - let mut reltoastrelids = Vec::new(); - let mut 
relhasindexes = Vec::new(); - let mut relisshareds = Vec::new(); - let mut relpersistences = Vec::new(); - let mut relkinds = Vec::new(); - let mut relnattses = Vec::new(); - let mut relcheckses = Vec::new(); - let mut relhasruleses = Vec::new(); - let mut relhastriggersses = Vec::new(); - let mut relhassubclasses = Vec::new(); - let mut relrowsecurities = Vec::new(); - let mut relforcerowsecurities = Vec::new(); - let mut relispopulateds = Vec::new(); - let mut relreplidents = Vec::new(); - let mut relispartitions = Vec::new(); - let mut relrewrites = Vec::new(); - let mut relfrozenxids = Vec::new(); - let mut relminmxids = Vec::new(); - - let mut oid_cache = this.oid_cache.write().await; - // Every time when call pg_catalog we generate a new cache and drop the - // original one in case that schemas or tables were dropped. - let mut swap_cache = HashMap::new(); - - // Iterate through all catalogs and schemas - for catalog_name in this.catalog_list.catalog_names() { - let cache_key = OidCacheKey::Catalog(catalog_name.clone()); - let catalog_oid = if let Some(oid) = oid_cache.get(&cache_key) { - *oid - } else { - this.oid_counter.fetch_add(1, Ordering::Relaxed) - }; - swap_cache.insert(cache_key, catalog_oid); - - if let Some(catalog) = this.catalog_list.catalog(&catalog_name) { - for schema_name in catalog.schema_names() { - if let Some(schema) = catalog.schema(&schema_name) { - let cache_key = - OidCacheKey::Schema(catalog_name.clone(), schema_name.clone()); - let schema_oid = if let Some(oid) = oid_cache.get(&cache_key) { - *oid - } else { - this.oid_counter.fetch_add(1, Ordering::Relaxed) - }; - swap_cache.insert(cache_key, schema_oid); - - // Add an entry for the schema itself (as a namespace) - // (In a full implementation, this would go in pg_namespace) - - // Now process all tables in this schema - for table_name in schema.table_names() { - let cache_key = OidCacheKey::Table( - catalog_name.clone(), - schema_name.clone(), - table_name.clone(), - ); - let table_oid = if let Some(oid) = oid_cache.get(&cache_key) { - *oid - } else { - this.oid_counter.fetch_add(1, Ordering::Relaxed) - }; - swap_cache.insert(cache_key, table_oid); - - if let Some(table) = schema.table(&table_name).await? 
{ - // Determine the correct table type based on the table provider and context - let table_type = - get_table_type_with_name(&table, &table_name, &schema_name); - - // Get column count from schema - let column_count = table.schema().fields().len() as i16; - - // Add table entry - oids.push(table_oid as i32); - relnames.push(table_name.clone()); - relnamespaces.push(schema_oid as i32); - reltypes.push(0); // Simplified: we're not tracking data types - reloftypes.push(None); - relowners.push(0); // Simplified: no owner tracking - relams.push(0); // Default access method - relfilenodes.push(table_oid as i32); // Use OID as filenode - reltablespaces.push(0); // Default tablespace - relpages.push(1); // Default page count - reltuples.push(0.0); // No row count stats - relallvisibles.push(0); - reltoastrelids.push(0); - relhasindexes.push(false); - relisshareds.push(false); - relpersistences.push("p".to_string()); // Permanent - relkinds.push(table_type.to_string()); - relnattses.push(column_count); - relcheckses.push(0); - relhasruleses.push(false); - relhastriggersses.push(false); - relhassubclasses.push(false); - relrowsecurities.push(false); - relforcerowsecurities.push(false); - relispopulateds.push(true); - relreplidents.push("d".to_string()); // Default - relispartitions.push(false); - relrewrites.push(None); - relfrozenxids.push(0); - relminmxids.push(0); - } - } - } - } - } - } +impl ArrowTable { + /// Create a new ArrowTable from Arrow IPC file bytes + pub fn from_ipc_data(data: Vec<u8>) -> Result<Self> { + let cursor = std::io::Cursor::new(data); + let reader = FileReader::try_new(cursor, None)?; - *oid_cache = swap_cache; - - // Create Arrow arrays from the collected data - let arrays: Vec<ArrayRef> = vec![ - Arc::new(Int32Array::from(oids)), - Arc::new(StringArray::from(relnames)), - Arc::new(Int32Array::from(relnamespaces)), - Arc::new(Int32Array::from(reltypes)), - Arc::new(Int32Array::from_iter(reloftypes.into_iter())), - Arc::new(Int32Array::from(relowners)), - Arc::new(Int32Array::from(relams)), - Arc::new(Int32Array::from(relfilenodes)), - Arc::new(Int32Array::from(reltablespaces)), - Arc::new(Int32Array::from(relpages)), - Arc::new(Float64Array::from_iter(reltuples.into_iter())), - Arc::new(Int32Array::from(relallvisibles)), - Arc::new(Int32Array::from(reltoastrelids)), - Arc::new(BooleanArray::from(relhasindexes)), - Arc::new(BooleanArray::from(relisshareds)), - Arc::new(StringArray::from(relpersistences)), - Arc::new(StringArray::from(relkinds)), - Arc::new(Int16Array::from(relnattses)), - Arc::new(Int16Array::from(relcheckses)), - Arc::new(BooleanArray::from(relhasruleses)), - Arc::new(BooleanArray::from(relhastriggersses)), - Arc::new(BooleanArray::from(relhassubclasses)), - Arc::new(BooleanArray::from(relrowsecurities)), - Arc::new(BooleanArray::from(relforcerowsecurities)), - Arc::new(BooleanArray::from(relispopulateds)), - Arc::new(StringArray::from(relreplidents)), - Arc::new(BooleanArray::from(relispartitions)), - Arc::new(Int32Array::from_iter(relrewrites.into_iter())), - Arc::new(Int32Array::from(relfrozenxids)), - Arc::new(Int32Array::from(relminmxids)), - ]; - - // Create a record batch - let batch = RecordBatch::try_new(this.schema.clone(), arrays)?; - - Ok(batch) - } -} + let schema = reader.schema(); + let mut batches = Vec::new(); -impl PartitionStream for PgClassTable { - fn schema(&self) -> &SchemaRef { - &self.schema - } - - fn execute(&self, _ctx: Arc<TaskContext>) -> SendableRecordBatchStream { - let this = self.clone(); - Box::pin(RecordBatchStreamAdapter::new( - this.schema.clone(), -
futures::stream::once(async move { PgClassTable::get_data(this).await }), - )) - } -} - -#[derive(Debug, Clone)] -struct PgNamespaceTable { - schema: SchemaRef, - catalog_list: Arc, - oid_counter: Arc, - oid_cache: Arc>>, -} - -impl PgNamespaceTable { - pub fn new( - catalog_list: Arc, - oid_counter: Arc, - oid_cache: Arc>>, - ) -> Self { - // Define the schema for pg_namespace - // This matches the columns from PostgreSQL's pg_namespace - let schema = Arc::new(Schema::new(vec![ - Field::new("oid", DataType::Int32, false), // Object identifier - Field::new("nspname", DataType::Utf8, false), // Name of the namespace (schema) - Field::new("nspowner", DataType::Int32, false), // Owner of the namespace - Field::new("nspacl", DataType::Utf8, true), // Access privileges - Field::new("options", DataType::Utf8, true), // Schema-level options - ])); - - Self { - schema, - catalog_list, - oid_counter, - oid_cache, - } - } - - /// Generate record batches based on the current state of the catalog - async fn get_data(this: PgNamespaceTable) -> Result { - // Vectors to store column data - let mut oids = Vec::new(); - let mut nspnames = Vec::new(); - let mut nspowners = Vec::new(); - let mut nspacls: Vec> = Vec::new(); - let mut options: Vec> = Vec::new(); - - // to store all schema-oid mapping temporarily before adding to global oid cache - let mut schema_oid_cache = HashMap::new(); - - let mut oid_cache = this.oid_cache.write().await; - - // Now add all schemas from DataFusion catalogs - for catalog_name in this.catalog_list.catalog_names() { - if let Some(catalog) = this.catalog_list.catalog(&catalog_name) { - for schema_name in catalog.schema_names() { - let cache_key = OidCacheKey::Schema(catalog_name.clone(), schema_name.clone()); - let schema_oid = if let Some(oid) = oid_cache.get(&cache_key) { - *oid - } else { - this.oid_counter.fetch_add(1, Ordering::Relaxed) - }; - schema_oid_cache.insert(cache_key, schema_oid); - - oids.push(schema_oid as i32); - nspnames.push(schema_name.clone()); - nspowners.push(10); // Default owner - nspacls.push(None); - options.push(None); - } - } + // Read all record batches from the IPC stream + for batch in reader { + batches.push(batch?); } - // remove all schema cache and table of the schema which is no longer exists - oid_cache.retain(|key, _| match key { - OidCacheKey::Catalog(..) => true, - OidCacheKey::Schema(..) 
=> false, - OidCacheKey::Table(catalog, schema_name, _) => schema_oid_cache - .contains_key(&OidCacheKey::Schema(catalog.clone(), schema_name.clone())), - }); - // add new schema cache - oid_cache.extend(schema_oid_cache); - - // Create Arrow arrays from the collected data - let arrays: Vec = vec![ - Arc::new(Int32Array::from(oids)), - Arc::new(StringArray::from(nspnames)), - Arc::new(Int32Array::from(nspowners)), - Arc::new(StringArray::from_iter(nspacls.into_iter())), - Arc::new(StringArray::from_iter(options.into_iter())), - ]; - - // Create a full record batch - let batch = RecordBatch::try_new(this.schema.clone(), arrays)?; - - Ok(batch) - } -} - -impl PartitionStream for PgNamespaceTable { - fn schema(&self) -> &SchemaRef { - &self.schema - } - - fn execute(&self, _ctx: Arc) -> SendableRecordBatchStream { - let this = self.clone(); - Box::pin(RecordBatchStreamAdapter::new( - this.schema.clone(), - futures::stream::once(async move { Self::get_data(this).await }), - )) - } -} - -#[derive(Debug, Clone)] -struct PgDatabaseTable { - schema: SchemaRef, - catalog_list: Arc, - oid_counter: Arc, - oid_cache: Arc>>, -} - -impl PgDatabaseTable { - pub fn new( - catalog_list: Arc, - oid_counter: Arc, - oid_cache: Arc>>, - ) -> Self { - // Define the schema for pg_database - // This matches PostgreSQL's pg_database table columns - let schema = Arc::new(Schema::new(vec![ - Field::new("oid", DataType::Int32, false), // Object identifier - Field::new("datname", DataType::Utf8, false), // Database name - Field::new("datdba", DataType::Int32, false), // Database owner's user ID - Field::new("encoding", DataType::Int32, false), // Character encoding - Field::new("datcollate", DataType::Utf8, false), // LC_COLLATE for this database - Field::new("datctype", DataType::Utf8, false), // LC_CTYPE for this database - Field::new("datistemplate", DataType::Boolean, false), // If true, database can be used as a template - Field::new("datallowconn", DataType::Boolean, false), // If false, no one can connect to this database - Field::new("datconnlimit", DataType::Int32, false), // Max number of concurrent connections (-1=no limit) - Field::new("datlastsysoid", DataType::Int32, false), // Last system OID in database - Field::new("datfrozenxid", DataType::Int32, false), // Frozen XID for this database - Field::new("datminmxid", DataType::Int32, false), // Minimum multixact ID - Field::new("dattablespace", DataType::Int32, false), // Default tablespace for this database - Field::new("datacl", DataType::Utf8, true), // Access privileges - ])); - - Self { + Ok(Self { schema, - catalog_list, - oid_counter, - oid_cache, - } - } - - /// Generate record batches based on the current state of the catalog - async fn get_data(this: PgDatabaseTable) -> Result { - // Vectors to store column data - let mut oids = Vec::new(); - let mut datnames = Vec::new(); - let mut datdbas = Vec::new(); - let mut encodings = Vec::new(); - let mut datcollates = Vec::new(); - let mut datctypes = Vec::new(); - let mut datistemplates = Vec::new(); - let mut datallowconns = Vec::new(); - let mut datconnlimits = Vec::new(); - let mut datlastsysoids = Vec::new(); - let mut datfrozenxids = Vec::new(); - let mut datminmxids = Vec::new(); - let mut dattablespaces = Vec::new(); - let mut datacles: Vec> = Vec::new(); - - // to store all schema-oid mapping temporarily before adding to global oid cache - let mut catalog_oid_cache = HashMap::new(); - - let mut oid_cache = this.oid_cache.write().await; - - // Add a record for each catalog (treating catalogs as 
"databases") - for catalog_name in this.catalog_list.catalog_names() { - let cache_key = OidCacheKey::Catalog(catalog_name.clone()); - let catalog_oid = if let Some(oid) = oid_cache.get(&cache_key) { - *oid - } else { - this.oid_counter.fetch_add(1, Ordering::Relaxed) - }; - catalog_oid_cache.insert(cache_key, catalog_oid); - - oids.push(catalog_oid as i32); - datnames.push(catalog_name.clone()); - datdbas.push(10); // Default owner (assuming 10 = postgres user) - encodings.push(6); // 6 = UTF8 in PostgreSQL - datcollates.push("en_US.UTF-8".to_string()); // Default collation - datctypes.push("en_US.UTF-8".to_string()); // Default ctype - datistemplates.push(false); - datallowconns.push(true); - datconnlimits.push(-1); // No connection limit - datlastsysoids.push(100000); // Arbitrary last system OID - datfrozenxids.push(1); // Simplified transaction ID - datminmxids.push(1); // Simplified multixact ID - dattablespaces.push(1663); // Default tablespace (1663 = pg_default in PostgreSQL) - datacles.push(None); // No specific ACLs - } - - // Always include a "postgres" database entry if not already present - // (This is for compatibility with tools that expect it) - let default_datname = "postgres".to_string(); - if !datnames.contains(&default_datname) { - let cache_key = OidCacheKey::Catalog(default_datname.clone()); - let catalog_oid = if let Some(oid) = oid_cache.get(&cache_key) { - *oid - } else { - this.oid_counter.fetch_add(1, Ordering::Relaxed) - }; - catalog_oid_cache.insert(cache_key, catalog_oid); - - oids.push(catalog_oid as i32); - datnames.push(default_datname); - datdbas.push(10); - encodings.push(6); - datcollates.push("en_US.UTF-8".to_string()); - datctypes.push("en_US.UTF-8".to_string()); - datistemplates.push(false); - datallowconns.push(true); - datconnlimits.push(-1); - datlastsysoids.push(100000); - datfrozenxids.push(1); - datminmxids.push(1); - dattablespaces.push(1663); - datacles.push(None); - } - - // Create Arrow arrays from the collected data - let arrays: Vec = vec![ - Arc::new(Int32Array::from(oids)), - Arc::new(StringArray::from(datnames)), - Arc::new(Int32Array::from(datdbas)), - Arc::new(Int32Array::from(encodings)), - Arc::new(StringArray::from(datcollates)), - Arc::new(StringArray::from(datctypes)), - Arc::new(BooleanArray::from(datistemplates)), - Arc::new(BooleanArray::from(datallowconns)), - Arc::new(Int32Array::from(datconnlimits)), - Arc::new(Int32Array::from(datlastsysoids)), - Arc::new(Int32Array::from(datfrozenxids)), - Arc::new(Int32Array::from(datminmxids)), - Arc::new(Int32Array::from(dattablespaces)), - Arc::new(StringArray::from_iter(datacles.into_iter())), - ]; - - // Create a full record batch - let full_batch = RecordBatch::try_new(this.schema.clone(), arrays)?; - - // update cache - // remove all schema cache and table of the schema which is no longer exists - oid_cache.retain(|key, _| match key { - OidCacheKey::Catalog(..) => false, - OidCacheKey::Schema(catalog, ..) => { - catalog_oid_cache.contains_key(&OidCacheKey::Catalog(catalog.clone())) - } - OidCacheKey::Table(catalog, ..) 
=> {
-                catalog_oid_cache.contains_key(&OidCacheKey::Catalog(catalog.clone()))
-            }
-        });
-        // add new schema cache
-        oid_cache.extend(catalog_oid_cache);
-
-        Ok(full_batch)
+            data: batches,
+        })
     }
 }

-impl PartitionStream for PgDatabaseTable {
+impl PartitionStream for ArrowTable {
     fn schema(&self) -> &SchemaRef {
         &self.schema
     }

     fn execute(&self, _ctx: Arc<TaskContext>) -> SendableRecordBatchStream {
-        let this = self.clone();
+        let data = self.data.clone();
         Box::pin(RecordBatchStreamAdapter::new(
-            this.schema.clone(),
-            futures::stream::once(async move { Self::get_data(this).await }),
+            self.schema.clone(),
+            futures::stream::iter(data.into_iter().map(Ok)),
         ))
     }
 }

+/// Static pg_catalog tables exposed as DataFusion table providers.
+///
+/// This collection contains only the tables whose contents are fixed at build time.
 #[derive(Debug)]
-struct PgAttributeTable {
-    schema: SchemaRef,
-    catalog_list: Arc<dyn CatalogProviderList>,
+pub struct PgCatalogStaticTables {
+    pub pg_aggregate: Arc<StreamingTable>,
+    pub pg_am: Arc<StreamingTable>,
+    pub pg_amop: Arc<StreamingTable>,
+    pub pg_amproc: Arc<StreamingTable>,
+    pub pg_cast: Arc<StreamingTable>,
+    pub pg_collation: Arc<StreamingTable>,
+    pub pg_conversion: Arc<StreamingTable>,
+    pub pg_language: Arc<StreamingTable>,
+    pub pg_opclass: Arc<StreamingTable>,
+    pub pg_operator: Arc<StreamingTable>,
+    pub pg_opfamily: Arc<StreamingTable>,
+    pub pg_proc: Arc<StreamingTable>,
+    pub pg_range: Arc<StreamingTable>,
+    pub pg_ts_config: Arc<StreamingTable>,
+    pub pg_ts_dict: Arc<StreamingTable>,
+    pub pg_ts_parser: Arc<StreamingTable>,
+    pub pg_ts_template: Arc<StreamingTable>,
+    pub pg_type: Arc<StreamingTable>,
+    pub pg_attrdef: Arc<StreamingTable>,
+    pub pg_auth_members: Arc<StreamingTable>,
+    pub pg_authid: Arc<StreamingTable>,
+    pub pg_constraint: Arc<StreamingTable>,
+    pub pg_db_role_setting: Arc<StreamingTable>,
+    pub pg_default_acl: Arc<StreamingTable>,
+    pub pg_depend: Arc<StreamingTable>,
+    pub pg_description: Arc<StreamingTable>,
+    pub pg_enum: Arc<StreamingTable>,
+    pub pg_event_trigger: Arc<StreamingTable>,
+    pub pg_extension: Arc<StreamingTable>,
+    pub pg_foreign_data_wrapper: Arc<StreamingTable>,
+    pub pg_foreign_server: Arc<StreamingTable>,
+    pub pg_foreign_table: Arc<StreamingTable>,
+    pub pg_index: Arc<StreamingTable>,
+    pub pg_inherits: Arc<StreamingTable>,
+    pub pg_init_privs: Arc<StreamingTable>,
+    pub pg_largeobject: Arc<StreamingTable>,
+    pub pg_largeobject_metadata: Arc<StreamingTable>,
+    pub pg_partitioned_table: Arc<StreamingTable>,
+    pub pg_policy: Arc<StreamingTable>,
+    pub pg_publication: Arc<StreamingTable>,
+    pub pg_publication_namespace: Arc<StreamingTable>,
+    pub pg_publication_rel: Arc<StreamingTable>,
+    pub pg_replication_origin: Arc<StreamingTable>,
+    pub pg_rewrite: Arc<StreamingTable>,
+    pub pg_seclabel: Arc<StreamingTable>,
+    pub pg_sequence: Arc<StreamingTable>,
+    pub pg_shdepend: Arc<StreamingTable>,
+    pub pg_shdescription: Arc<StreamingTable>,
+    pub pg_shseclabel: Arc<StreamingTable>,
+    pub pg_statistic: Arc<StreamingTable>,
+    pub pg_statistic_ext: Arc<StreamingTable>,
+    pub pg_statistic_ext_data: Arc<StreamingTable>,
+    pub pg_subscription: Arc<StreamingTable>,
+    pub pg_subscription_rel: Arc<StreamingTable>,
+    pub pg_tablespace: Arc<StreamingTable>,
+    pub pg_trigger: Arc<StreamingTable>,
+    pub pg_user_mapping: Arc<StreamingTable>,
 }

-impl PgAttributeTable {
-    pub fn new(catalog_list: Arc<dyn CatalogProviderList>) -> Self {
-        // Define the schema for pg_attribute
-        // This matches PostgreSQL's pg_attribute table columns
-        let schema = Arc::new(Schema::new(vec![
-            Field::new("attrelid", DataType::Int32, false), // OID of the relation this column belongs to
-            Field::new("attname", DataType::Utf8, false), // Column name
-            Field::new("atttypid", DataType::Int32, false), // OID of the column data type
-            Field::new("attstattarget", DataType::Int32, false), // Statistics target
-            Field::new("attlen", DataType::Int16, false), // Length of the type
-            Field::new("attnum", DataType::Int16, false), // Column number (positive for regular columns)
-            Field::new("attndims", DataType::Int32, false), // Number of dimensions for array types
-            Field::new("attcacheoff", DataType::Int32, false), // Cache offset
-            Field::new("atttypmod", DataType::Int32, false), // Type-specific modifier
-            Field::new("attbyval", DataType::Boolean, false), // True if the type is pass-by-value
-            Field::new("attalign", DataType::Utf8, false), // Type alignment
-            Field::new("attstorage", DataType::Utf8, false), // Storage type
-            Field::new("attcompression",
DataType::Utf8, true), // Compression method - Field::new("attnotnull", DataType::Boolean, false), // True if column cannot be null - Field::new("atthasdef", DataType::Boolean, false), // True if column has a default value - Field::new("atthasmissing", DataType::Boolean, false), // True if column has missing values - Field::new("attidentity", DataType::Utf8, false), // Identity column type - Field::new("attgenerated", DataType::Utf8, false), // Generated column type - Field::new("attisdropped", DataType::Boolean, false), // True if column has been dropped - Field::new("attislocal", DataType::Boolean, false), // True if column is local to this relation - Field::new("attinhcount", DataType::Int32, false), // Number of direct inheritance ancestors - Field::new("attcollation", DataType::Int32, false), // OID of collation - Field::new("attacl", DataType::Utf8, true), // Access privileges - Field::new("attoptions", DataType::Utf8, true), // Attribute-level options - Field::new("attfdwoptions", DataType::Utf8, true), // Foreign data wrapper options - Field::new("attmissingval", DataType::Utf8, true), // Missing value for added columns - ])); - - Self { - schema, - catalog_list, - } - } - - /// Generate record batches based on the current state of the catalog - async fn get_data( - schema: SchemaRef, - catalog_list: Arc, - ) -> Result { - // Vectors to store column data - let mut attrelids = Vec::new(); - let mut attnames = Vec::new(); - let mut atttypids = Vec::new(); - let mut attstattargets = Vec::new(); - let mut attlens = Vec::new(); - let mut attnums = Vec::new(); - let mut attndimss = Vec::new(); - let mut attcacheoffs = Vec::new(); - let mut atttymods = Vec::new(); - let mut attbyvals = Vec::new(); - let mut attaligns = Vec::new(); - let mut attstorages = Vec::new(); - let mut attcompressions: Vec> = Vec::new(); - let mut attnotnulls = Vec::new(); - let mut atthasdefs = Vec::new(); - let mut atthasmissings = Vec::new(); - let mut attidentitys = Vec::new(); - let mut attgenerateds = Vec::new(); - let mut attisdroppeds = Vec::new(); - let mut attislocals = Vec::new(); - let mut attinhcounts = Vec::new(); - let mut attcollations = Vec::new(); - let mut attacls: Vec> = Vec::new(); - let mut attoptions: Vec> = Vec::new(); - let mut attfdwoptions: Vec> = Vec::new(); - let mut attmissingvals: Vec> = Vec::new(); - - // Start OID counter (should be consistent with pg_class) - let mut next_oid = 10000; - - // Iterate through all catalogs and schemas - for catalog_name in catalog_list.catalog_names() { - if let Some(catalog) = catalog_list.catalog(&catalog_name) { - for schema_name in catalog.schema_names() { - if let Some(schema_provider) = catalog.schema(&schema_name) { - // Process all tables in this schema - for table_name in schema_provider.table_names() { - let table_oid = next_oid; - next_oid += 1; - - if let Some(table) = schema_provider.table(&table_name).await? 
{ - let table_schema = table.schema(); - - // Add column entries for this table - for (column_idx, field) in table_schema.fields().iter().enumerate() - { - let attnum = (column_idx + 1) as i16; // PostgreSQL column numbers start at 1 - let (pg_type_oid, type_len, by_val, align, storage) = - Self::datafusion_to_pg_type(field.data_type()); - - attrelids.push(table_oid); - attnames.push(field.name().clone()); - atttypids.push(pg_type_oid); - attstattargets.push(-1); // Default statistics target - attlens.push(type_len); - attnums.push(attnum); - attndimss.push(0); // No array support for now - attcacheoffs.push(-1); // Not cached - atttymods.push(-1); // No type modifiers - attbyvals.push(by_val); - attaligns.push(align.to_string()); - attstorages.push(storage.to_string()); - attcompressions.push(None); // No compression - attnotnulls.push(!field.is_nullable()); - atthasdefs.push(false); // No default values - atthasmissings.push(false); // No missing values - attidentitys.push("".to_string()); // No identity columns - attgenerateds.push("".to_string()); // No generated columns - attisdroppeds.push(false); // Not dropped - attislocals.push(true); // Local to this relation - attinhcounts.push(0); // No inheritance - attcollations.push(0); // Default collation - attacls.push(None); // No ACLs - attoptions.push(None); // No options - attfdwoptions.push(None); // No FDW options - attmissingvals.push(None); // No missing values - } - } - } - } - } - } - } - - // Create Arrow arrays from the collected data - let arrays: Vec = vec![ - Arc::new(Int32Array::from(attrelids)), - Arc::new(StringArray::from(attnames)), - Arc::new(Int32Array::from(atttypids)), - Arc::new(Int32Array::from(attstattargets)), - Arc::new(Int16Array::from(attlens)), - Arc::new(Int16Array::from(attnums)), - Arc::new(Int32Array::from(attndimss)), - Arc::new(Int32Array::from(attcacheoffs)), - Arc::new(Int32Array::from(atttymods)), - Arc::new(BooleanArray::from(attbyvals)), - Arc::new(StringArray::from(attaligns)), - Arc::new(StringArray::from(attstorages)), - Arc::new(StringArray::from_iter(attcompressions.into_iter())), - Arc::new(BooleanArray::from(attnotnulls)), - Arc::new(BooleanArray::from(atthasdefs)), - Arc::new(BooleanArray::from(atthasmissings)), - Arc::new(StringArray::from(attidentitys)), - Arc::new(StringArray::from(attgenerateds)), - Arc::new(BooleanArray::from(attisdroppeds)), - Arc::new(BooleanArray::from(attislocals)), - Arc::new(Int32Array::from(attinhcounts)), - Arc::new(Int32Array::from(attcollations)), - Arc::new(StringArray::from_iter(attacls.into_iter())), - Arc::new(StringArray::from_iter(attoptions.into_iter())), - Arc::new(StringArray::from_iter(attfdwoptions.into_iter())), - Arc::new(StringArray::from_iter(attmissingvals.into_iter())), - ]; - - // Create a record batch - let batch = RecordBatch::try_new(schema.clone(), arrays)?; - Ok(batch) - } - - /// Map DataFusion data types to PostgreSQL type information - fn datafusion_to_pg_type(data_type: &DataType) -> (i32, i16, bool, &'static str, &'static str) { - match data_type { - DataType::Boolean => (16, 1, true, "c", "p"), // bool - DataType::Int8 => (18, 1, true, "c", "p"), // char - DataType::Int16 => (21, 2, true, "s", "p"), // int2 - DataType::Int32 => (23, 4, true, "i", "p"), // int4 - DataType::Int64 => (20, 8, true, "d", "p"), // int8 - DataType::UInt8 => (21, 2, true, "s", "p"), // Treat as int2 - DataType::UInt16 => (23, 4, true, "i", "p"), // Treat as int4 - DataType::UInt32 => (20, 8, true, "d", "p"), // Treat as int8 - DataType::UInt64 => (1700, 
-1, false, "i", "m"), // Treat as numeric - DataType::Float32 => (700, 4, true, "i", "p"), // float4 - DataType::Float64 => (701, 8, true, "d", "p"), // float8 - DataType::Utf8 => (25, -1, false, "i", "x"), // text - DataType::LargeUtf8 => (25, -1, false, "i", "x"), // text - DataType::Binary => (17, -1, false, "i", "x"), // bytea - DataType::LargeBinary => (17, -1, false, "i", "x"), // bytea - DataType::Date32 => (1082, 4, true, "i", "p"), // date - DataType::Date64 => (1082, 4, true, "i", "p"), // date - DataType::Time32(_) => (1083, 8, true, "d", "p"), // time - DataType::Time64(_) => (1083, 8, true, "d", "p"), // time - DataType::Timestamp(_, _) => (1114, 8, true, "d", "p"), // timestamp - DataType::Decimal128(_, _) => (1700, -1, false, "i", "m"), // numeric - DataType::Decimal256(_, _) => (1700, -1, false, "i", "m"), // numeric - _ => (25, -1, false, "i", "x"), // Default to text for unknown types - } +impl PgCatalogStaticTables { + pub fn try_new() -> Result { + Ok(Self { + pg_aggregate: Self::create_arrow_table( + include_bytes!("../../pg_catalog_arrow_exports/pg_aggregate.feather").to_vec(), + )?, + pg_am: Self::create_arrow_table( + include_bytes!("../../pg_catalog_arrow_exports/pg_am.feather").to_vec(), + )?, + pg_amop: Self::create_arrow_table( + include_bytes!("../../pg_catalog_arrow_exports/pg_amop.feather").to_vec(), + )?, + pg_amproc: Self::create_arrow_table( + include_bytes!("../../pg_catalog_arrow_exports/pg_amproc.feather").to_vec(), + )?, + pg_cast: Self::create_arrow_table( + include_bytes!("../../pg_catalog_arrow_exports/pg_cast.feather").to_vec(), + )?, + pg_collation: Self::create_arrow_table( + include_bytes!("../../pg_catalog_arrow_exports/pg_collation.feather").to_vec(), + )?, + pg_conversion: Self::create_arrow_table( + include_bytes!("../../pg_catalog_arrow_exports/pg_conversion.feather").to_vec(), + )?, + pg_language: Self::create_arrow_table( + include_bytes!("../../pg_catalog_arrow_exports/pg_language.feather").to_vec(), + )?, + pg_opclass: Self::create_arrow_table( + include_bytes!("../../pg_catalog_arrow_exports/pg_opclass.feather").to_vec(), + )?, + pg_operator: Self::create_arrow_table( + include_bytes!("../../pg_catalog_arrow_exports/pg_operator.feather").to_vec(), + )?, + pg_opfamily: Self::create_arrow_table( + include_bytes!("../../pg_catalog_arrow_exports/pg_opfamily.feather").to_vec(), + )?, + pg_proc: Self::create_arrow_table( + include_bytes!("../../pg_catalog_arrow_exports/pg_proc.feather").to_vec(), + )?, + pg_range: Self::create_arrow_table( + include_bytes!("../../pg_catalog_arrow_exports/pg_range.feather").to_vec(), + )?, + pg_ts_config: Self::create_arrow_table( + include_bytes!("../../pg_catalog_arrow_exports/pg_ts_config.feather").to_vec(), + )?, + pg_ts_dict: Self::create_arrow_table( + include_bytes!("../../pg_catalog_arrow_exports/pg_ts_dict.feather").to_vec(), + )?, + pg_ts_parser: Self::create_arrow_table( + include_bytes!("../../pg_catalog_arrow_exports/pg_ts_parser.feather").to_vec(), + )?, + pg_ts_template: Self::create_arrow_table( + include_bytes!("../../pg_catalog_arrow_exports/pg_ts_template.feather").to_vec(), + )?, + pg_type: Self::create_arrow_table( + include_bytes!("../../pg_catalog_arrow_exports/pg_type.feather").to_vec(), + )?, + pg_attrdef: Self::create_arrow_table( + include_bytes!("../../pg_catalog_arrow_exports/pg_attrdef.feather").to_vec(), + )?, + pg_auth_members: Self::create_arrow_table( + include_bytes!("../../pg_catalog_arrow_exports/pg_auth_members.feather").to_vec(), + )?, + pg_authid: 
Self::create_arrow_table( + include_bytes!("../../pg_catalog_arrow_exports/pg_authid.feather").to_vec(), + )?, + pg_constraint: Self::create_arrow_table( + include_bytes!("../../pg_catalog_arrow_exports/pg_constraint.feather").to_vec(), + )?, + pg_db_role_setting: Self::create_arrow_table( + include_bytes!("../../pg_catalog_arrow_exports/pg_db_role_setting.feather") + .to_vec(), + )?, + pg_default_acl: Self::create_arrow_table( + include_bytes!("../../pg_catalog_arrow_exports/pg_default_acl.feather").to_vec(), + )?, + pg_depend: Self::create_arrow_table( + include_bytes!("../../pg_catalog_arrow_exports/pg_depend.feather").to_vec(), + )?, + pg_description: Self::create_arrow_table( + include_bytes!("../../pg_catalog_arrow_exports/pg_description.feather").to_vec(), + )?, + pg_enum: Self::create_arrow_table( + include_bytes!("../../pg_catalog_arrow_exports/pg_enum.feather").to_vec(), + )?, + pg_event_trigger: Self::create_arrow_table( + include_bytes!("../../pg_catalog_arrow_exports/pg_event_trigger.feather").to_vec(), + )?, + pg_extension: Self::create_arrow_table( + include_bytes!("../../pg_catalog_arrow_exports/pg_extension.feather").to_vec(), + )?, + pg_foreign_data_wrapper: Self::create_arrow_table( + include_bytes!("../../pg_catalog_arrow_exports/pg_foreign_data_wrapper.feather") + .to_vec(), + )?, + pg_foreign_server: Self::create_arrow_table( + include_bytes!("../../pg_catalog_arrow_exports/pg_foreign_server.feather").to_vec(), + )?, + pg_foreign_table: Self::create_arrow_table( + include_bytes!("../../pg_catalog_arrow_exports/pg_foreign_table.feather").to_vec(), + )?, + pg_index: Self::create_arrow_table( + include_bytes!("../../pg_catalog_arrow_exports/pg_index.feather").to_vec(), + )?, + pg_inherits: Self::create_arrow_table( + include_bytes!("../../pg_catalog_arrow_exports/pg_inherits.feather").to_vec(), + )?, + pg_init_privs: Self::create_arrow_table( + include_bytes!("../../pg_catalog_arrow_exports/pg_init_privs.feather").to_vec(), + )?, + pg_largeobject: Self::create_arrow_table( + include_bytes!("../../pg_catalog_arrow_exports/pg_largeobject.feather").to_vec(), + )?, + pg_largeobject_metadata: Self::create_arrow_table( + include_bytes!("../../pg_catalog_arrow_exports/pg_largeobject_metadata.feather") + .to_vec(), + )?, + + pg_partitioned_table: Self::create_arrow_table( + include_bytes!("../../pg_catalog_arrow_exports/pg_partitioned_table.feather") + .to_vec(), + )?, + pg_policy: Self::create_arrow_table( + include_bytes!("../../pg_catalog_arrow_exports/pg_policy.feather").to_vec(), + )?, + pg_publication: Self::create_arrow_table( + include_bytes!("../../pg_catalog_arrow_exports/pg_publication.feather").to_vec(), + )?, + pg_publication_namespace: Self::create_arrow_table( + include_bytes!("../../pg_catalog_arrow_exports/pg_publication_namespace.feather") + .to_vec(), + )?, + pg_publication_rel: Self::create_arrow_table( + include_bytes!("../../pg_catalog_arrow_exports/pg_publication_rel.feather") + .to_vec(), + )?, + pg_replication_origin: Self::create_arrow_table( + include_bytes!("../../pg_catalog_arrow_exports/pg_replication_origin.feather") + .to_vec(), + )?, + pg_rewrite: Self::create_arrow_table( + include_bytes!("../../pg_catalog_arrow_exports/pg_rewrite.feather").to_vec(), + )?, + pg_seclabel: Self::create_arrow_table( + include_bytes!("../../pg_catalog_arrow_exports/pg_seclabel.feather").to_vec(), + )?, + pg_sequence: Self::create_arrow_table( + include_bytes!("../../pg_catalog_arrow_exports/pg_sequence.feather").to_vec(), + )?, + pg_shdepend: 
Self::create_arrow_table(
+                include_bytes!("../../pg_catalog_arrow_exports/pg_shdepend.feather").to_vec(),
+            )?,
+            pg_shdescription: Self::create_arrow_table(
+                include_bytes!("../../pg_catalog_arrow_exports/pg_shdescription.feather").to_vec(),
+            )?,
+            pg_shseclabel: Self::create_arrow_table(
+                include_bytes!("../../pg_catalog_arrow_exports/pg_shseclabel.feather").to_vec(),
+            )?,
+            pg_statistic: Self::create_arrow_table(
+                include_bytes!("../../pg_catalog_arrow_exports/pg_statistic.feather").to_vec(),
+            )?,
+            pg_statistic_ext: Self::create_arrow_table(
+                include_bytes!("../../pg_catalog_arrow_exports/pg_statistic_ext.feather").to_vec(),
+            )?,
+            pg_statistic_ext_data: Self::create_arrow_table(
+                include_bytes!("../../pg_catalog_arrow_exports/pg_statistic_ext_data.feather")
+                    .to_vec(),
+            )?,
+            pg_subscription: Self::create_arrow_table(
+                include_bytes!("../../pg_catalog_arrow_exports/pg_subscription.feather").to_vec(),
+            )?,
+            pg_subscription_rel: Self::create_arrow_table(
+                include_bytes!("../../pg_catalog_arrow_exports/pg_subscription_rel.feather")
+                    .to_vec(),
+            )?,
+            pg_tablespace: Self::create_arrow_table(
+                include_bytes!("../../pg_catalog_arrow_exports/pg_tablespace.feather").to_vec(),
+            )?,
+            pg_trigger: Self::create_arrow_table(
+                include_bytes!("../../pg_catalog_arrow_exports/pg_trigger.feather").to_vec(),
+            )?,
+            pg_user_mapping: Self::create_arrow_table(
+                include_bytes!("../../pg_catalog_arrow_exports/pg_user_mapping.feather").to_vec(),
+            )?,
+        })
     }

-impl PartitionStream for PgAttributeTable {
-    fn schema(&self) -> &SchemaRef {
-        &self.schema
-    }
-
-    fn execute(&self, _ctx: Arc<TaskContext>) -> SendableRecordBatchStream {
-        let catalog_list = self.catalog_list.clone();
-        let schema = Arc::clone(&self.schema);
-        Box::pin(RecordBatchStreamAdapter::new(
-            schema.clone(),
-            futures::stream::once(async move { Self::get_data(schema, catalog_list).await }),
-        ))
+    /// Create table from dumped arrow data
+    fn create_arrow_table(data_bytes: Vec<u8>) -> Result<Arc<StreamingTable>> {
+        let table = ArrowTable::from_ipc_data(data_bytes)?;
+        let streaming_table = StreamingTable::try_new(table.schema.clone(), vec![Arc::new(table)])?;
+        Ok(Arc::new(streaming_table))
     }
 }
@@ -2025,7 +885,8 @@ pub fn setup_pg_catalog(
     session_context: &SessionContext,
     catalog_name: &str,
 ) -> Result<(), Box<dyn std::error::Error>> {
-    let pg_catalog = PgCatalogSchemaProvider::new(session_context.state().catalog_list().clone());
+    let pg_catalog =
+        PgCatalogSchemaProvider::try_new(session_context.state().catalog_list().clone())?;
     session_context
         .catalog(catalog_name)
         .ok_or_else(|| {
@@ -2045,3 +906,255 @@ pub fn setup_pg_catalog(
     Ok(())
 }
+
+#[cfg(test)]
+mod test {
+    use super::*;
+
+    #[test]
+    fn test_load_arrow_data() {
+        let table = ArrowTable::from_ipc_data(
+            include_bytes!("../../pg_catalog_arrow_exports/pg_aggregate.feather").to_vec(),
+        )
+        .expect("Failed to load ipc data");
+
+        assert_eq!(table.schema.fields.len(), 22);
+        assert_eq!(table.data.len(), 1);
+
+        // Every exported dump must round-trip through the IPC reader; a macro keeps
+        // the check to one entry per catalog table.
+        macro_rules! check_load {
+            ($($name:literal),+ $(,)?) => {$(
+                ArrowTable::from_ipc_data(
+                    include_bytes!(concat!("../../pg_catalog_arrow_exports/", $name, ".feather"))
+                        .to_vec(),
+                )
+                .unwrap_or_else(|e| panic!("Failed to load ipc data for {}: {e}", $name));
+            )+};
+        }
+        check_load!(
+            "pg_aggregate", "pg_am", "pg_amop", "pg_amproc", "pg_cast", "pg_collation",
+            "pg_conversion", "pg_language", "pg_opclass", "pg_operator", "pg_opfamily",
+            "pg_proc", "pg_range", "pg_ts_config", "pg_ts_dict", "pg_ts_parser",
+            "pg_ts_template", "pg_type", "pg_attrdef", "pg_auth_members", "pg_authid",
+            "pg_constraint", "pg_db_role_setting", "pg_default_acl", "pg_depend",
+            "pg_description", "pg_enum", "pg_event_trigger", "pg_extension",
+            "pg_foreign_data_wrapper", "pg_foreign_server", "pg_foreign_table",
+            "pg_index", "pg_inherits", "pg_init_privs", "pg_largeobject",
+            "pg_largeobject_metadata", "pg_partitioned_table", "pg_policy",
+            "pg_publication", "pg_publication_namespace", "pg_publication_rel",
+            "pg_replication_origin", "pg_rewrite", "pg_seclabel", "pg_sequence",
+            "pg_shdepend", "pg_shdescription", "pg_shseclabel", "pg_statistic",
+            "pg_statistic_ext", "pg_statistic_ext_data", "pg_subscription",
+            "pg_subscription_rel", "pg_tablespace", "pg_trigger", "pg_user_mapping",
+        );
+    }
+}
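For reference, here is a minimal sketch (not part of this change) of how such a .feather export can be produced and read back with Arrow's IPC file format; the `FileReader` import added at the top of this diff suggests `ArrowTable::from_ipc_data` reads the same layout from the embedded bytes. The schema and values below are illustrative only:

use std::io::Cursor;
use std::sync::Arc;

use datafusion::arrow::array::{Int32Array, RecordBatch};
use datafusion::arrow::datatypes::{DataType, Field, Schema};
use datafusion::arrow::ipc::reader::FileReader;
use datafusion::arrow::ipc::writer::FileWriter;

fn main() -> datafusion::error::Result<()> {
    // Build a single-column batch standing in for a catalog dump.
    let schema = Arc::new(Schema::new(vec![Field::new("oid", DataType::Int32, false)]));
    let batch =
        RecordBatch::try_new(schema.clone(), vec![Arc::new(Int32Array::from(vec![1, 2]))])?;

    // Write it in the IPC file format (the .feather layout used by these exports).
    let mut writer = FileWriter::try_new(Vec::new(), &schema)?;
    writer.write(&batch)?;
    writer.finish()?;
    let bytes = writer.into_inner()?;

    // Read the bytes back, batch by batch.
    let reader = FileReader::try_new(Cursor::new(bytes), None)?;
    let batches = reader.collect::<Result<Vec<_>, _>>()?;
    assert_eq!(batches[0].num_rows(), 2);
    Ok(())
}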
include_bytes!("../../pg_catalog_arrow_exports/pg_shdescription.feather").to_vec(), + ) + .expect("Failed to load ipc data"); + let _ = ArrowTable::from_ipc_data( + include_bytes!("../../pg_catalog_arrow_exports/pg_shseclabel.feather").to_vec(), + ) + .expect("Failed to load ipc data"); + let _ = ArrowTable::from_ipc_data( + include_bytes!("../../pg_catalog_arrow_exports/pg_statistic.feather").to_vec(), + ) + .expect("Failed to load ipc data"); + let _ = ArrowTable::from_ipc_data( + include_bytes!("../../pg_catalog_arrow_exports/pg_statistic_ext.feather").to_vec(), + ) + .expect("Failed to load ipc data"); + let _ = ArrowTable::from_ipc_data( + include_bytes!("../../pg_catalog_arrow_exports/pg_statistic_ext_data.feather").to_vec(), + ) + .expect("Failed to load ipc data"); + let _ = ArrowTable::from_ipc_data( + include_bytes!("../../pg_catalog_arrow_exports/pg_subscription.feather").to_vec(), + ) + .expect("Failed to load ipc data"); + let _ = ArrowTable::from_ipc_data( + include_bytes!("../../pg_catalog_arrow_exports/pg_subscription_rel.feather").to_vec(), + ) + .expect("Failed to load ipc data"); + let _ = ArrowTable::from_ipc_data( + include_bytes!("../../pg_catalog_arrow_exports/pg_tablespace.feather").to_vec(), + ) + .expect("Failed to load ipc data"); + let _ = ArrowTable::from_ipc_data( + include_bytes!("../../pg_catalog_arrow_exports/pg_trigger.feather").to_vec(), + ) + .expect("Failed to load ipc data"); + let _ = ArrowTable::from_ipc_data( + include_bytes!("../../pg_catalog_arrow_exports/pg_user_mapping.feather").to_vec(), + ) + .expect("Failed to load ipc data"); + } +} diff --git a/datafusion-postgres/src/pg_catalog/pg_attribute.rs b/datafusion-postgres/src/pg_catalog/pg_attribute.rs new file mode 100644 index 0000000..1f6e596 --- /dev/null +++ b/datafusion-postgres/src/pg_catalog/pg_attribute.rs @@ -0,0 +1,248 @@ +use std::collections::HashMap; +use std::sync::atomic::{AtomicU32, Ordering}; +use std::sync::Arc; + +use datafusion::arrow::array::{ + ArrayRef, BooleanArray, Int16Array, Int32Array, RecordBatch, StringArray, +}; +use datafusion::arrow::datatypes::{DataType, Field, Schema, SchemaRef}; +use datafusion::catalog::CatalogProviderList; +use datafusion::error::Result; +use datafusion::execution::{SendableRecordBatchStream, TaskContext}; +use datafusion::physical_plan::stream::RecordBatchStreamAdapter; +use datafusion::physical_plan::streaming::PartitionStream; +use postgres_types::Oid; +use tokio::sync::RwLock; + +use super::OidCacheKey; + +#[derive(Debug, Clone)] +pub(crate) struct PgAttributeTable { + schema: SchemaRef, + catalog_list: Arc, + oid_counter: Arc, + oid_cache: Arc>>, +} + +impl PgAttributeTable { + pub(crate) fn new( + catalog_list: Arc, + oid_counter: Arc, + oid_cache: Arc>>, + ) -> Self { + // Define the schema for pg_attribute + // This matches PostgreSQL's pg_attribute table columns + let schema = Arc::new(Schema::new(vec![ + Field::new("attrelid", DataType::Int32, false), // OID of the relation this column belongs to + Field::new("attname", DataType::Utf8, false), // Column name + Field::new("atttypid", DataType::Int32, false), // OID of the column data type + Field::new("attstattarget", DataType::Int32, false), // Statistics target + Field::new("attlen", DataType::Int16, false), // Length of the type + Field::new("attnum", DataType::Int16, false), // Column number (positive for regular columns) + Field::new("attndims", DataType::Int32, false), // Number of dimensions for array types + Field::new("attcacheoff", DataType::Int32, false), // Cache offset 
+ Field::new("atttypmod", DataType::Int32, false), // Type-specific modifier + Field::new("attbyval", DataType::Boolean, false), // True if the type is pass-by-value + Field::new("attalign", DataType::Utf8, false), // Type alignment + Field::new("attstorage", DataType::Utf8, false), // Storage type + Field::new("attcompression", DataType::Utf8, true), // Compression method + Field::new("attnotnull", DataType::Boolean, false), // True if column cannot be null + Field::new("atthasdef", DataType::Boolean, false), // True if column has a default value + Field::new("atthasmissing", DataType::Boolean, false), // True if column has missing values + Field::new("attidentity", DataType::Utf8, false), // Identity column type + Field::new("attgenerated", DataType::Utf8, false), // Generated column type + Field::new("attisdropped", DataType::Boolean, false), // True if column has been dropped + Field::new("attislocal", DataType::Boolean, false), // True if column is local to this relation + Field::new("attinhcount", DataType::Int32, false), // Number of direct inheritance ancestors + Field::new("attcollation", DataType::Int32, false), // OID of collation + Field::new("attacl", DataType::Utf8, true), // Access privileges + Field::new("attoptions", DataType::Utf8, true), // Attribute-level options + Field::new("attfdwoptions", DataType::Utf8, true), // Foreign data wrapper options + Field::new("attmissingval", DataType::Utf8, true), // Missing value for added columns + ])); + + Self { + schema, + catalog_list, + oid_counter, + oid_cache, + } + } + + /// Generate record batches based on the current state of the catalog + async fn get_data(this: Self) -> Result { + // Vectors to store column data + let mut attrelids = Vec::new(); + let mut attnames = Vec::new(); + let mut atttypids = Vec::new(); + let mut attstattargets = Vec::new(); + let mut attlens = Vec::new(); + let mut attnums = Vec::new(); + let mut attndimss = Vec::new(); + let mut attcacheoffs = Vec::new(); + let mut atttymods = Vec::new(); + let mut attbyvals = Vec::new(); + let mut attaligns = Vec::new(); + let mut attstorages = Vec::new(); + let mut attcompressions: Vec> = Vec::new(); + let mut attnotnulls = Vec::new(); + let mut atthasdefs = Vec::new(); + let mut atthasmissings = Vec::new(); + let mut attidentitys = Vec::new(); + let mut attgenerateds = Vec::new(); + let mut attisdroppeds = Vec::new(); + let mut attislocals = Vec::new(); + let mut attinhcounts = Vec::new(); + let mut attcollations = Vec::new(); + let mut attacls: Vec> = Vec::new(); + let mut attoptions: Vec> = Vec::new(); + let mut attfdwoptions: Vec> = Vec::new(); + let mut attmissingvals: Vec> = Vec::new(); + + let mut oid_cache = this.oid_cache.write().await; + // Every time when call pg_catalog we generate a new cache and drop the + // original one in case that schemas or tables were dropped. 
+ let mut swap_cache = HashMap::new(); + + for catalog_name in this.catalog_list.catalog_names() { + if let Some(catalog) = this.catalog_list.catalog(&catalog_name) { + for schema_name in catalog.schema_names() { + if let Some(schema_provider) = catalog.schema(&schema_name) { + // Process all tables in this schema + for table_name in schema_provider.table_names() { + let cache_key = OidCacheKey::Table( + catalog_name.clone(), + schema_name.clone(), + table_name.clone(), + ); + let table_oid = if let Some(oid) = oid_cache.get(&cache_key) { + *oid + } else { + this.oid_counter.fetch_add(1, Ordering::Relaxed) + }; + swap_cache.insert(cache_key, table_oid); + + if let Some(table) = schema_provider.table(&table_name).await? { + let table_schema = table.schema(); + + // Add column entries for this table + for (column_idx, field) in table_schema.fields().iter().enumerate() + { + let attnum = (column_idx + 1) as i16; // PostgreSQL column numbers start at 1 + let (pg_type_oid, type_len, by_val, align, storage) = + Self::datafusion_to_pg_type(field.data_type()); + + attrelids.push(table_oid as i32); + attnames.push(field.name().clone()); + atttypids.push(pg_type_oid); + attstattargets.push(-1); // Default statistics target + attlens.push(type_len); + attnums.push(attnum); + attndimss.push(0); // No array support for now + attcacheoffs.push(-1); // Not cached + atttymods.push(-1); // No type modifiers + attbyvals.push(by_val); + attaligns.push(align.to_string()); + attstorages.push(storage.to_string()); + attcompressions.push(None); // No compression + attnotnulls.push(!field.is_nullable()); + atthasdefs.push(false); // No default values + atthasmissings.push(false); // No missing values + attidentitys.push("".to_string()); // No identity columns + attgenerateds.push("".to_string()); // No generated columns + attisdroppeds.push(false); // Not dropped + attislocals.push(true); // Local to this relation + attinhcounts.push(0); // No inheritance + attcollations.push(0); // Default collation + attacls.push(None); // No ACLs + attoptions.push(None); // No options + attfdwoptions.push(None); // No FDW options + attmissingvals.push(None); // No missing values + } + } + } + } + } + } + } + + *oid_cache = swap_cache; + + // Create Arrow arrays from the collected data + let arrays: Vec = vec![ + Arc::new(Int32Array::from(attrelids)), + Arc::new(StringArray::from(attnames)), + Arc::new(Int32Array::from(atttypids)), + Arc::new(Int32Array::from(attstattargets)), + Arc::new(Int16Array::from(attlens)), + Arc::new(Int16Array::from(attnums)), + Arc::new(Int32Array::from(attndimss)), + Arc::new(Int32Array::from(attcacheoffs)), + Arc::new(Int32Array::from(atttymods)), + Arc::new(BooleanArray::from(attbyvals)), + Arc::new(StringArray::from(attaligns)), + Arc::new(StringArray::from(attstorages)), + Arc::new(StringArray::from_iter(attcompressions.into_iter())), + Arc::new(BooleanArray::from(attnotnulls)), + Arc::new(BooleanArray::from(atthasdefs)), + Arc::new(BooleanArray::from(atthasmissings)), + Arc::new(StringArray::from(attidentitys)), + Arc::new(StringArray::from(attgenerateds)), + Arc::new(BooleanArray::from(attisdroppeds)), + Arc::new(BooleanArray::from(attislocals)), + Arc::new(Int32Array::from(attinhcounts)), + Arc::new(Int32Array::from(attcollations)), + Arc::new(StringArray::from_iter(attacls.into_iter())), + Arc::new(StringArray::from_iter(attoptions.into_iter())), + Arc::new(StringArray::from_iter(attfdwoptions.into_iter())), + Arc::new(StringArray::from_iter(attmissingvals.into_iter())), + ]; + + // Create a 
record batch + let batch = RecordBatch::try_new(this.schema.clone(), arrays)?; + Ok(batch) + } + + /// Map DataFusion data types to PostgreSQL type information + fn datafusion_to_pg_type(data_type: &DataType) -> (i32, i16, bool, &'static str, &'static str) { + match data_type { + DataType::Boolean => (16, 1, true, "c", "p"), // bool + DataType::Int8 => (18, 1, true, "c", "p"), // char + DataType::Int16 => (21, 2, true, "s", "p"), // int2 + DataType::Int32 => (23, 4, true, "i", "p"), // int4 + DataType::Int64 => (20, 8, true, "d", "p"), // int8 + DataType::UInt8 => (21, 2, true, "s", "p"), // Treat as int2 + DataType::UInt16 => (23, 4, true, "i", "p"), // Treat as int4 + DataType::UInt32 => (20, 8, true, "d", "p"), // Treat as int8 + DataType::UInt64 => (1700, -1, false, "i", "m"), // Treat as numeric + DataType::Float32 => (700, 4, true, "i", "p"), // float4 + DataType::Float64 => (701, 8, true, "d", "p"), // float8 + DataType::Utf8 => (25, -1, false, "i", "x"), // text + DataType::LargeUtf8 => (25, -1, false, "i", "x"), // text + DataType::Binary => (17, -1, false, "i", "x"), // bytea + DataType::LargeBinary => (17, -1, false, "i", "x"), // bytea + DataType::Date32 => (1082, 4, true, "i", "p"), // date + DataType::Date64 => (1082, 4, true, "i", "p"), // date + DataType::Time32(_) => (1083, 8, true, "d", "p"), // time + DataType::Time64(_) => (1083, 8, true, "d", "p"), // time + DataType::Timestamp(_, _) => (1114, 8, true, "d", "p"), // timestamp + DataType::Decimal128(_, _) => (1700, -1, false, "i", "m"), // numeric + DataType::Decimal256(_, _) => (1700, -1, false, "i", "m"), // numeric + _ => (25, -1, false, "i", "x"), // Default to text for unknown types + } + } +} + +impl PartitionStream for PgAttributeTable { + fn schema(&self) -> &SchemaRef { + &self.schema + } + + fn execute(&self, _ctx: Arc) -> SendableRecordBatchStream { + let this = self.clone(); + Box::pin(RecordBatchStreamAdapter::new( + this.schema.clone(), + futures::stream::once(async move { Self::get_data(this).await }), + )) + } +} diff --git a/datafusion-postgres/src/pg_catalog/pg_class.rs b/datafusion-postgres/src/pg_catalog/pg_class.rs new file mode 100644 index 0000000..72f2211 --- /dev/null +++ b/datafusion-postgres/src/pg_catalog/pg_class.rs @@ -0,0 +1,255 @@ +use std::collections::HashMap; +use std::sync::atomic::{AtomicU32, Ordering}; +use std::sync::Arc; + +use datafusion::arrow::array::{ + ArrayRef, BooleanArray, Float64Array, Int16Array, Int32Array, RecordBatch, StringArray, +}; +use datafusion::arrow::datatypes::{DataType, Field, Schema, SchemaRef}; +use datafusion::catalog::CatalogProviderList; +use datafusion::error::Result; +use datafusion::execution::{SendableRecordBatchStream, TaskContext}; +use datafusion::physical_plan::stream::RecordBatchStreamAdapter; +use datafusion::physical_plan::streaming::PartitionStream; +use postgres_types::Oid; +use tokio::sync::RwLock; + +use super::{get_table_type_with_name, OidCacheKey}; + +#[derive(Debug, Clone)] +pub(crate) struct PgClassTable { + schema: SchemaRef, + catalog_list: Arc, + oid_counter: Arc, + oid_cache: Arc>>, +} + +impl PgClassTable { + pub(crate) fn new( + catalog_list: Arc, + oid_counter: Arc, + oid_cache: Arc>>, + ) -> PgClassTable { + // Define the schema for pg_class + // This matches key columns from PostgreSQL's pg_class + let schema = Arc::new(Schema::new(vec![ + Field::new("oid", DataType::Int32, false), // Object identifier + Field::new("relname", DataType::Utf8, false), // Name of the table, index, view, etc. 
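+            // relnamespace holds the OID this crate mints for the containing schema;
+            // clients resolve it by joining against pg_namespace.oid.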
+ Field::new("relnamespace", DataType::Int32, false), // OID of the namespace that contains this relation + Field::new("reltype", DataType::Int32, false), // OID of the data type (composite type) this table describes + Field::new("reloftype", DataType::Int32, true), // OID of the composite type for typed table, 0 otherwise + Field::new("relowner", DataType::Int32, false), // Owner of the relation + Field::new("relam", DataType::Int32, false), // If this is an index, the access method used + Field::new("relfilenode", DataType::Int32, false), // Name of the on-disk file of this relation + Field::new("reltablespace", DataType::Int32, false), // Tablespace OID for this relation + Field::new("relpages", DataType::Int32, false), // Size of the on-disk representation in pages + Field::new("reltuples", DataType::Float64, false), // Number of tuples + Field::new("relallvisible", DataType::Int32, false), // Number of all-visible pages + Field::new("reltoastrelid", DataType::Int32, false), // OID of the TOAST table + Field::new("relhasindex", DataType::Boolean, false), // True if this is a table and it has (or recently had) any indexes + Field::new("relisshared", DataType::Boolean, false), // True if this table is shared across all databases + Field::new("relpersistence", DataType::Utf8, false), // p=permanent table, u=unlogged table, t=temporary table + Field::new("relkind", DataType::Utf8, false), // r=ordinary table, i=index, S=sequence, v=view, etc. + Field::new("relnatts", DataType::Int16, false), // Number of user columns + Field::new("relchecks", DataType::Int16, false), // Number of CHECK constraints + Field::new("relhasrules", DataType::Boolean, false), // True if table has (or once had) rules + Field::new("relhastriggers", DataType::Boolean, false), // True if table has (or once had) triggers + Field::new("relhassubclass", DataType::Boolean, false), // True if table or index has (or once had) any inheritance children + Field::new("relrowsecurity", DataType::Boolean, false), // True if row security is enabled + Field::new("relforcerowsecurity", DataType::Boolean, false), // True if row security forced for owners + Field::new("relispopulated", DataType::Boolean, false), // True if relation is populated (not true for some materialized views) + Field::new("relreplident", DataType::Utf8, false), // Columns used to form "replica identity" for rows + Field::new("relispartition", DataType::Boolean, false), // True if table is a partition + Field::new("relrewrite", DataType::Int32, true), // OID of a rule that rewrites this relation + Field::new("relfrozenxid", DataType::Int32, false), // All transaction IDs before this have been replaced with a permanent ("frozen") transaction ID + Field::new("relminmxid", DataType::Int32, false), // All Multixact IDs before this have been replaced with a transaction ID + ])); + + Self { + schema, + catalog_list, + oid_counter, + oid_cache, + } + } + + /// Generate record batches based on the current state of the catalog + async fn get_data(this: PgClassTable) -> Result { + // Vectors to store column data + let mut oids = Vec::new(); + let mut relnames = Vec::new(); + let mut relnamespaces = Vec::new(); + let mut reltypes = Vec::new(); + let mut reloftypes = Vec::new(); + let mut relowners = Vec::new(); + let mut relams = Vec::new(); + let mut relfilenodes = Vec::new(); + let mut reltablespaces = Vec::new(); + let mut relpages = Vec::new(); + let mut reltuples = Vec::new(); + let mut relallvisibles = Vec::new(); + let mut reltoastrelids = Vec::new(); + let mut 
relhasindexes = Vec::new();
+        let mut relisshareds = Vec::new();
+        let mut relpersistences = Vec::new();
+        let mut relkinds = Vec::new();
+        let mut relnattses = Vec::new();
+        let mut relcheckses = Vec::new();
+        let mut relhasruleses = Vec::new();
+        let mut relhastriggersses = Vec::new();
+        let mut relhassubclasses = Vec::new();
+        let mut relrowsecurities = Vec::new();
+        let mut relforcerowsecurities = Vec::new();
+        let mut relispopulateds = Vec::new();
+        let mut relreplidents = Vec::new();
+        let mut relispartitions = Vec::new();
+        let mut relrewrites = Vec::new();
+        let mut relfrozenxids = Vec::new();
+        let mut relminmxids = Vec::new();
+
+        let mut oid_cache = this.oid_cache.write().await;
+        // Each call rebuilds the OID cache from scratch and drops the old one, so
+        // entries for schemas or tables that have since been dropped do not linger.
+        let mut swap_cache = HashMap::new();
+
+        // Iterate through all catalogs and schemas
+        for catalog_name in this.catalog_list.catalog_names() {
+            let cache_key = OidCacheKey::Catalog(catalog_name.clone());
+            let catalog_oid = if let Some(oid) = oid_cache.get(&cache_key) {
+                *oid
+            } else {
+                this.oid_counter.fetch_add(1, Ordering::Relaxed)
+            };
+            swap_cache.insert(cache_key, catalog_oid);
+
+            if let Some(catalog) = this.catalog_list.catalog(&catalog_name) {
+                for schema_name in catalog.schema_names() {
+                    if let Some(schema) = catalog.schema(&schema_name) {
+                        let cache_key =
+                            OidCacheKey::Schema(catalog_name.clone(), schema_name.clone());
+                        let schema_oid = if let Some(oid) = oid_cache.get(&cache_key) {
+                            *oid
+                        } else {
+                            this.oid_counter.fetch_add(1, Ordering::Relaxed)
+                        };
+                        swap_cache.insert(cache_key, schema_oid);
+
+                        // The schema itself is not emitted here; pg_namespace lists it.
+                        // We only mint its OID so the rows below can reference it.
+
+                        // Now process all tables in this schema
+                        for table_name in schema.table_names() {
+                            let cache_key = OidCacheKey::Table(
+                                catalog_name.clone(),
+                                schema_name.clone(),
+                                table_name.clone(),
+                            );
+                            let table_oid = if let Some(oid) = oid_cache.get(&cache_key) {
+                                *oid
+                            } else {
+                                this.oid_counter.fetch_add(1, Ordering::Relaxed)
+                            };
+                            swap_cache.insert(cache_key, table_oid);
+
+                            if let Some(table) = schema.table(&table_name).await?
{ + // Determine the correct table type based on the table provider and context + let table_type = + get_table_type_with_name(&table, &table_name, &schema_name); + + // Get column count from schema + let column_count = table.schema().fields().len() as i16; + + // Add table entry + oids.push(table_oid as i32); + relnames.push(table_name.clone()); + relnamespaces.push(schema_oid as i32); + reltypes.push(0); // Simplified: we're not tracking data types + reloftypes.push(None); + relowners.push(0); // Simplified: no owner tracking + relams.push(0); // Default access method + relfilenodes.push(table_oid as i32); // Use OID as filenode + reltablespaces.push(0); // Default tablespace + relpages.push(1); // Default page count + reltuples.push(0.0); // No row count stats + relallvisibles.push(0); + reltoastrelids.push(0); + relhasindexes.push(false); + relisshareds.push(false); + relpersistences.push("p".to_string()); // Permanent + relkinds.push(table_type.to_string()); + relnattses.push(column_count); + relcheckses.push(0); + relhasruleses.push(false); + relhastriggersses.push(false); + relhassubclasses.push(false); + relrowsecurities.push(false); + relforcerowsecurities.push(false); + relispopulateds.push(true); + relreplidents.push("d".to_string()); // Default + relispartitions.push(false); + relrewrites.push(None); + relfrozenxids.push(0); + relminmxids.push(0); + } + } + } + } + } + } + + *oid_cache = swap_cache; + + // Create Arrow arrays from the collected data + let arrays: Vec = vec![ + Arc::new(Int32Array::from(oids)), + Arc::new(StringArray::from(relnames)), + Arc::new(Int32Array::from(relnamespaces)), + Arc::new(Int32Array::from(reltypes)), + Arc::new(Int32Array::from_iter(reloftypes.into_iter())), + Arc::new(Int32Array::from(relowners)), + Arc::new(Int32Array::from(relams)), + Arc::new(Int32Array::from(relfilenodes)), + Arc::new(Int32Array::from(reltablespaces)), + Arc::new(Int32Array::from(relpages)), + Arc::new(Float64Array::from_iter(reltuples.into_iter())), + Arc::new(Int32Array::from(relallvisibles)), + Arc::new(Int32Array::from(reltoastrelids)), + Arc::new(BooleanArray::from(relhasindexes)), + Arc::new(BooleanArray::from(relisshareds)), + Arc::new(StringArray::from(relpersistences)), + Arc::new(StringArray::from(relkinds)), + Arc::new(Int16Array::from(relnattses)), + Arc::new(Int16Array::from(relcheckses)), + Arc::new(BooleanArray::from(relhasruleses)), + Arc::new(BooleanArray::from(relhastriggersses)), + Arc::new(BooleanArray::from(relhassubclasses)), + Arc::new(BooleanArray::from(relrowsecurities)), + Arc::new(BooleanArray::from(relforcerowsecurities)), + Arc::new(BooleanArray::from(relispopulateds)), + Arc::new(StringArray::from(relreplidents)), + Arc::new(BooleanArray::from(relispartitions)), + Arc::new(Int32Array::from_iter(relrewrites.into_iter())), + Arc::new(Int32Array::from(relfrozenxids)), + Arc::new(Int32Array::from(relminmxids)), + ]; + + // Create a record batch + let batch = RecordBatch::try_new(this.schema.clone(), arrays)?; + + Ok(batch) + } +} + +impl PartitionStream for PgClassTable { + fn schema(&self) -> &SchemaRef { + &self.schema + } + + fn execute(&self, _ctx: Arc) -> SendableRecordBatchStream { + let this = self.clone(); + Box::pin(RecordBatchStreamAdapter::new( + this.schema.clone(), + futures::stream::once(async move { PgClassTable::get_data(this).await }), + )) + } +} diff --git a/datafusion-postgres/src/pg_catalog/pg_database.rs b/datafusion-postgres/src/pg_catalog/pg_database.rs new file mode 100644 index 0000000..5959977 --- /dev/null +++ 
b/datafusion-postgres/src/pg_catalog/pg_database.rs @@ -0,0 +1,186 @@ +use std::collections::HashMap; +use std::sync::atomic::{AtomicU32, Ordering}; +use std::sync::Arc; + +use datafusion::arrow::array::{ArrayRef, BooleanArray, Int32Array, RecordBatch, StringArray}; +use datafusion::arrow::datatypes::{DataType, Field, Schema, SchemaRef}; +use datafusion::catalog::CatalogProviderList; +use datafusion::error::Result; +use datafusion::execution::{SendableRecordBatchStream, TaskContext}; +use datafusion::physical_plan::stream::RecordBatchStreamAdapter; +use datafusion::physical_plan::streaming::PartitionStream; +use postgres_types::Oid; +use tokio::sync::RwLock; + +use super::OidCacheKey; + +#[derive(Debug, Clone)] +pub(crate) struct PgDatabaseTable { + schema: SchemaRef, + catalog_list: Arc, + oid_counter: Arc, + oid_cache: Arc>>, +} + +impl PgDatabaseTable { + pub(crate) fn new( + catalog_list: Arc, + oid_counter: Arc, + oid_cache: Arc>>, + ) -> Self { + // Define the schema for pg_database + // This matches PostgreSQL's pg_database table columns + let schema = Arc::new(Schema::new(vec![ + Field::new("oid", DataType::Int32, false), // Object identifier + Field::new("datname", DataType::Utf8, false), // Database name + Field::new("datdba", DataType::Int32, false), // Database owner's user ID + Field::new("encoding", DataType::Int32, false), // Character encoding + Field::new("datcollate", DataType::Utf8, false), // LC_COLLATE for this database + Field::new("datctype", DataType::Utf8, false), // LC_CTYPE for this database + Field::new("datistemplate", DataType::Boolean, false), // If true, database can be used as a template + Field::new("datallowconn", DataType::Boolean, false), // If false, no one can connect to this database + Field::new("datconnlimit", DataType::Int32, false), // Max number of concurrent connections (-1=no limit) + Field::new("datlastsysoid", DataType::Int32, false), // Last system OID in database + Field::new("datfrozenxid", DataType::Int32, false), // Frozen XID for this database + Field::new("datminmxid", DataType::Int32, false), // Minimum multixact ID + Field::new("dattablespace", DataType::Int32, false), // Default tablespace for this database + Field::new("datacl", DataType::Utf8, true), // Access privileges + ])); + + Self { + schema, + catalog_list, + oid_counter, + oid_cache, + } + } + + /// Generate record batches based on the current state of the catalog + async fn get_data(this: PgDatabaseTable) -> Result { + // Vectors to store column data + let mut oids = Vec::new(); + let mut datnames = Vec::new(); + let mut datdbas = Vec::new(); + let mut encodings = Vec::new(); + let mut datcollates = Vec::new(); + let mut datctypes = Vec::new(); + let mut datistemplates = Vec::new(); + let mut datallowconns = Vec::new(); + let mut datconnlimits = Vec::new(); + let mut datlastsysoids = Vec::new(); + let mut datfrozenxids = Vec::new(); + let mut datminmxids = Vec::new(); + let mut dattablespaces = Vec::new(); + let mut datacles: Vec> = Vec::new(); + + // to store all schema-oid mapping temporarily before adding to global oid cache + let mut catalog_oid_cache = HashMap::new(); + + let mut oid_cache = this.oid_cache.write().await; + + // Add a record for each catalog (treating catalogs as "databases") + for catalog_name in this.catalog_list.catalog_names() { + let cache_key = OidCacheKey::Catalog(catalog_name.clone()); + let catalog_oid = if let Some(oid) = oid_cache.get(&cache_key) { + *oid + } else { + this.oid_counter.fetch_add(1, Ordering::Relaxed) + }; + 
+            catalog_oid_cache.insert(cache_key, catalog_oid);
+
+            oids.push(catalog_oid as i32);
+            datnames.push(catalog_name.clone());
+            datdbas.push(10); // Default owner (assuming 10 = postgres user)
+            encodings.push(6); // 6 = UTF8 in PostgreSQL
+            datcollates.push("en_US.UTF-8".to_string()); // Default collation
+            datctypes.push("en_US.UTF-8".to_string()); // Default ctype
+            datistemplates.push(false);
+            datallowconns.push(true);
+            datconnlimits.push(-1); // No connection limit
+            datlastsysoids.push(100000); // Arbitrary last system OID
+            datfrozenxids.push(1); // Simplified transaction ID
+            datminmxids.push(1); // Simplified multixact ID
+            dattablespaces.push(1663); // Default tablespace (1663 = pg_default in PostgreSQL)
+            datacles.push(None); // No specific ACLs
+        }
+
+        // Always include a "postgres" database entry if not already present
+        // (This is for compatibility with tools that expect it)
+        let default_datname = "postgres".to_string();
+        if !datnames.contains(&default_datname) {
+            let cache_key = OidCacheKey::Catalog(default_datname.clone());
+            let catalog_oid = if let Some(oid) = oid_cache.get(&cache_key) {
+                *oid
+            } else {
+                this.oid_counter.fetch_add(1, Ordering::Relaxed)
+            };
+            catalog_oid_cache.insert(cache_key, catalog_oid);
+
+            oids.push(catalog_oid as i32);
+            datnames.push(default_datname);
+            datdbas.push(10);
+            encodings.push(6);
+            datcollates.push("en_US.UTF-8".to_string());
+            datctypes.push("en_US.UTF-8".to_string());
+            datistemplates.push(false);
+            datallowconns.push(true);
+            datconnlimits.push(-1);
+            datlastsysoids.push(100000);
+            datfrozenxids.push(1);
+            datminmxids.push(1);
+            dattablespaces.push(1663);
+            datacles.push(None);
+        }
+
+        // Create Arrow arrays from the collected data
+        let arrays: Vec<ArrayRef> = vec![
+            Arc::new(Int32Array::from(oids)),
+            Arc::new(StringArray::from(datnames)),
+            Arc::new(Int32Array::from(datdbas)),
+            Arc::new(Int32Array::from(encodings)),
+            Arc::new(StringArray::from(datcollates)),
+            Arc::new(StringArray::from(datctypes)),
+            Arc::new(BooleanArray::from(datistemplates)),
+            Arc::new(BooleanArray::from(datallowconns)),
+            Arc::new(Int32Array::from(datconnlimits)),
+            Arc::new(Int32Array::from(datlastsysoids)),
+            Arc::new(Int32Array::from(datfrozenxids)),
+            Arc::new(Int32Array::from(datminmxids)),
+            Arc::new(Int32Array::from(dattablespaces)),
+            Arc::new(StringArray::from_iter(datacles.into_iter())),
+        ];
+
+        // Create a full record batch
+        let full_batch = RecordBatch::try_new(this.schema.clone(), arrays)?;
+
+        // Update the cache: drop all catalog entries (they are re-added below),
+        // and drop schema/table entries whose catalog no longer exists
+        oid_cache.retain(|key, _| match key {
+            OidCacheKey::Catalog(..) => false,
+            OidCacheKey::Schema(catalog, ..) => {
+                catalog_oid_cache.contains_key(&OidCacheKey::Catalog(catalog.clone()))
+            }
+            OidCacheKey::Table(catalog, ..) => {
+                catalog_oid_cache.contains_key(&OidCacheKey::Catalog(catalog.clone()))
+            }
+        });
+        // Add the fresh catalog OIDs collected during this scan
+        oid_cache.extend(catalog_oid_cache);
+
+        Ok(full_batch)
+    }
+}
+
+impl PartitionStream for PgDatabaseTable {
+    fn schema(&self) -> &SchemaRef {
+        &self.schema
+    }
+
+    fn execute(&self, _ctx: Arc<TaskContext>) -> SendableRecordBatchStream {
+        let this = self.clone();
+        Box::pin(RecordBatchStreamAdapter::new(
+            this.schema.clone(),
+            futures::stream::once(async move { Self::get_data(this).await }),
+        ))
+    }
+}
diff --git a/datafusion-postgres/src/pg_catalog/pg_namespace.rs b/datafusion-postgres/src/pg_catalog/pg_namespace.rs
new file mode 100644
index 0000000..060a996
--- /dev/null
+++ b/datafusion-postgres/src/pg_catalog/pg_namespace.rs
@@ -0,0 +1,122 @@
+use std::collections::HashMap;
+use std::sync::atomic::{AtomicU32, Ordering};
+use std::sync::Arc;
+
+use datafusion::arrow::array::{ArrayRef, Int32Array, RecordBatch, StringArray};
+use datafusion::arrow::datatypes::{DataType, Field, Schema, SchemaRef};
+use datafusion::catalog::CatalogProviderList;
+use datafusion::error::Result;
+use datafusion::execution::{SendableRecordBatchStream, TaskContext};
+use datafusion::physical_plan::stream::RecordBatchStreamAdapter;
+use datafusion::physical_plan::streaming::PartitionStream;
+use postgres_types::Oid;
+use tokio::sync::RwLock;
+
+use super::OidCacheKey;
+
+#[derive(Debug, Clone)]
+pub(crate) struct PgNamespaceTable {
+    schema: SchemaRef,
+    catalog_list: Arc<dyn CatalogProviderList>,
+    oid_counter: Arc<AtomicU32>,
+    oid_cache: Arc<RwLock<HashMap<OidCacheKey, Oid>>>,
+}
+
+impl PgNamespaceTable {
+    pub(crate) fn new(
+        catalog_list: Arc<dyn CatalogProviderList>,
+        oid_counter: Arc<AtomicU32>,
+        oid_cache: Arc<RwLock<HashMap<OidCacheKey, Oid>>>,
+    ) -> Self {
+        // Define the schema for pg_namespace
+        // This matches the columns from PostgreSQL's pg_namespace
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("oid", DataType::Int32, false), // Object identifier
+            Field::new("nspname", DataType::Utf8, false), // Name of the namespace (schema)
+            Field::new("nspowner", DataType::Int32, false), // Owner of the namespace
+            Field::new("nspacl", DataType::Utf8, true), // Access privileges
+            Field::new("options", DataType::Utf8, true), // Schema-level options
+        ]));
+
+        Self {
+            schema,
+            catalog_list,
+            oid_counter,
+            oid_cache,
+        }
+    }
+
+    /// Generate record batches based on the current state of the catalog
+    async fn get_data(this: PgNamespaceTable) -> Result<RecordBatch> {
+        // Vectors to store column data
+        let mut oids = Vec::new();
+        let mut nspnames = Vec::new();
+        let mut nspowners = Vec::new();
+        let mut nspacls: Vec<Option<String>> = Vec::new();
+        let mut options: Vec<Option<String>> = Vec::new();
+
+        // Stores the schema-to-OID mapping temporarily before it is merged into the global OID cache
+        let mut schema_oid_cache = HashMap::new();
+
+        let mut oid_cache = this.oid_cache.write().await;
+
+        // Add all schemas from the DataFusion catalogs
+        for catalog_name in this.catalog_list.catalog_names() {
+            if let Some(catalog) = this.catalog_list.catalog(&catalog_name) {
+                for schema_name in catalog.schema_names() {
+                    let cache_key = OidCacheKey::Schema(catalog_name.clone(), schema_name.clone());
+                    let schema_oid = if let Some(oid) = oid_cache.get(&cache_key) {
+                        *oid
+                    } else {
+                        this.oid_counter.fetch_add(1, Ordering::Relaxed)
+                    };
+                    schema_oid_cache.insert(cache_key, schema_oid);
+
+                    oids.push(schema_oid as i32);
+                    nspnames.push(schema_name.clone());
+                    nspowners.push(10); // Default owner
+                    nspacls.push(None);
+                    options.push(None);
+                }
+            }
+        }
+
+        // Update the cache: drop all schema entries (they are re-added below),
+        // and drop table entries whose schema no longer exists
+        oid_cache.retain(|key, _| match key {
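+            // Catalog entries are kept as-is (the pg_database scan maintains them);
+            // schema entries are replaced wholesale by the set collected above;
+            // table entries survive only while their parent schema still exists.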
+            OidCacheKey::Catalog(..) => true,
+            OidCacheKey::Schema(..) => false,
+            OidCacheKey::Table(catalog, schema_name, _) => schema_oid_cache
+                .contains_key(&OidCacheKey::Schema(catalog.clone(), schema_name.clone())),
+        });
+        // Add the fresh schema OIDs collected during this scan
+        oid_cache.extend(schema_oid_cache);
+
+        // Create Arrow arrays from the collected data
+        let arrays: Vec<ArrayRef> = vec![
+            Arc::new(Int32Array::from(oids)),
+            Arc::new(StringArray::from(nspnames)),
+            Arc::new(Int32Array::from(nspowners)),
+            Arc::new(StringArray::from_iter(nspacls.into_iter())),
+            Arc::new(StringArray::from_iter(options.into_iter())),
+        ];
+
+        // Create a full record batch
+        let batch = RecordBatch::try_new(this.schema.clone(), arrays)?;
+
+        Ok(batch)
+    }
+}
+
+impl PartitionStream for PgNamespaceTable {
+    fn schema(&self) -> &SchemaRef {
+        &self.schema
+    }
+
+    fn execute(&self, _ctx: Arc<TaskContext>) -> SendableRecordBatchStream {
+        let this = self.clone();
+        Box::pin(RecordBatchStreamAdapter::new(
+            this.schema.clone(),
+            futures::stream::once(async move { Self::get_data(this).await }),
+        ))
+    }
+}
diff --git a/export_pg_catalog_arrow.sh b/export_pg_catalog_arrow.sh
new file mode 100755
index 0000000..10ac861
--- /dev/null
+++ b/export_pg_catalog_arrow.sh
@@ -0,0 +1,514 @@
+#!/bin/bash
+
+# Exit on error
+set -e
+
+# Configuration
+CONTAINER_NAME="postgres-arrow-export"
+DB_NAME="postgres"
+DB_USER="postgres"
+DB_PASSWORD="postgres"
+EXPORT_DIR="./pg_catalog_arrow_exports"
+POSTGRES_PORT="5432"
+
+# Export mode: "static" or "all"
+EXPORT_MODE="${1:-static}"
+
+# Static tables whitelist - these contain only built-in PostgreSQL data
+STATIC_TABLES=(
+    "pg_aggregate"
+    "pg_am"
+    "pg_amop"
+    "pg_amproc"
+    "pg_cast"
+    "pg_collation"
+    "pg_conversion"
+    "pg_language"
+    "pg_opclass"
+    "pg_operator"
+    "pg_opfamily"
+    "pg_proc"
+    "pg_range"
+    "pg_ts_config"
+    "pg_ts_dict"
+    "pg_ts_parser"
+    "pg_ts_template"
+    "pg_type"
+)
+
+# Dynamic tables blacklist - these contain user/database-specific data
+# (documentation only: the export filter below is driven by the whitelist)
+DYNAMIC_TABLES=(
+    "pg_attribute"
+    "pg_attrdef"
+    "pg_auth_members"
+    "pg_authid"
+    "pg_class"
+    "pg_constraint"
+    "pg_database"
+    "pg_db_role_setting"
+    "pg_default_acl"
+    "pg_depend"
+    "pg_description"
+    "pg_enum"
+    "pg_event_trigger"
+    "pg_extension"
+    "pg_foreign_data_wrapper"
+    "pg_foreign_server"
+    "pg_foreign_table"
+    "pg_index"
+    "pg_inherits"
+    "pg_init_privs"
+    "pg_largeobject"
+    "pg_largeobject_metadata"
+    "pg_namespace"
+    "pg_partitioned_table"
+    "pg_policy"
+    "pg_publication"
+    "pg_publication_namespace"
+    "pg_publication_rel"
+    "pg_replication_origin"
+    "pg_rewrite"
+    "pg_seclabel"
+    "pg_sequence"
+    "pg_shdepend"
+    "pg_shdescription"
+    "pg_shseclabel"
+    "pg_statistic"
+    "pg_statistic_ext"
+    "pg_statistic_ext_data"
+    "pg_subscription"
+    "pg_subscription_rel"
+    "pg_tablespace"
+    "pg_trigger"
+    "pg_user_mapping"
+)
+
+echo "=== PostgreSQL pg_catalog to Arrow IPC Export Script ==="
+echo "Export mode: $EXPORT_MODE"
+if [ "$EXPORT_MODE" = "static" ]; then
+    echo "Will export only static tables (built-in PostgreSQL data)"
+else
+    echo "Will export all tables (including user-specific data)"
+fi
+echo ""
+
+# Clean up any existing container
+echo "Cleaning up existing container if any..."
+docker rm -f $CONTAINER_NAME 2>/dev/null || true
+
+# Create export directory
+echo "Creating export directory..."
+mkdir -p "$EXPORT_DIR"
+
+# Start PostgreSQL container
+echo "Starting PostgreSQL container..."
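+# The container is disposable: it only lives long enough to dump the catalog
+# tables, and is stopped and removed again at the end of this script.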
+docker run -d \
+    --name $CONTAINER_NAME \
+    -e POSTGRES_PASSWORD=$DB_PASSWORD \
+    -e POSTGRES_USER=$DB_USER \
+    -e POSTGRES_DB=$DB_NAME \
+    -p $POSTGRES_PORT:5432 \
+    postgres:17.6
+
+# Wait for PostgreSQL to be ready
+echo "Waiting for PostgreSQL to be ready..."
+for i in {1..30}; do
+    if docker exec $CONTAINER_NAME pg_isready -U $DB_USER >/dev/null 2>&1; then
+        echo "PostgreSQL is ready!"
+        break
+    fi
+    if [ $i -eq 30 ]; then
+        echo "Timeout waiting for PostgreSQL to start"
+        exit 1
+    fi
+    echo -n "."
+    sleep 1
+done
+
+# Install required tools in container
+echo "Installing required tools in container..."
+docker exec $CONTAINER_NAME apt-get update
+docker exec $CONTAINER_NAME apt-get install -y python3 python3-pip python3-venv
+
+# Create Python virtual environment and install dependencies
+echo "Setting up Python environment..."
+docker exec $CONTAINER_NAME python3 -m venv /opt/venv
+docker exec $CONTAINER_NAME /opt/venv/bin/pip install psycopg2-binary pyarrow pandas numpy
+
+# Pass table lists to Python script
+if [ "$EXPORT_MODE" = "static" ]; then
+    WHITELIST_STR=$(printf '"%s",' "${STATIC_TABLES[@]}")
+    WHITELIST_STR="[${WHITELIST_STR%,}]"
+    TABLE_FILTER="whitelist"
+else
+    WHITELIST_STR="[]"
+    TABLE_FILTER="all"
+fi
+
+# Create Python script for exporting to Arrow IPC
+echo "Creating export script..."
+cat << 'EOF' > export_to_arrow.py
+import psycopg2
+import pyarrow as pa
+import pyarrow.feather as feather
+import pandas as pd
+import os
+import sys
+from datetime import datetime, date, time
+from decimal import Decimal
+import json
+import numpy as np
+
+def pg_type_to_arrow_type(pg_type, is_nullable=True):
+    """Map PostgreSQL types to Arrow types"""
+    type_mapping = {
+        'bigint': pa.int64(),
+        'integer': pa.int32(),
+        'smallint': pa.int16(),
+        'numeric': pa.decimal128(38, 10),  # Max precision for decimal128
+        'real': pa.float32(),
+        'double precision': pa.float64(),
+        'boolean': pa.bool_(),
+        'text': pa.string(),
+        'character varying': pa.string(),
+        'character': pa.string(),
+        'name': pa.string(),
+        'oid': pa.uint32(),  # OIDs are unsigned
+        'regproc': pa.string(),
+        'regtype': pa.string(),
+        'regclass': pa.string(),
+        'timestamp': pa.timestamp('us'),
+        'timestamp without time zone': pa.timestamp('us'),
+        'timestamp with time zone': pa.timestamp('us', tz='UTC'),
+        'date': pa.date32(),
+        'time': pa.time64('us'),
+        'time without time zone': pa.time64('us'),
+        'interval': pa.string(),  # Store as string for now
+        'bytea': pa.binary(),
+        'json': pa.string(),
+        'jsonb': pa.string(),
+        'uuid': pa.string(),
+        'inet': pa.string(),
+        'cidr': pa.string(),
+        'macaddr': pa.string(),
+        'money': pa.decimal128(19, 2),
+        'char': pa.string(),
+        'bpchar': pa.string(),
+        'aclitem': pa.string(),
+        'pg_node_tree': pa.string(),
+        'pg_ndistinct': pa.string(),
+        'pg_dependencies': pa.string(),
+        'pg_mcv_list': pa.string(),
+        'anyarray': pa.string(),  # Store as string representation
+        'oidvector': pa.string(),  # Store as string
+        'int2vector': pa.string(),  # Store as string
+    }
+
+    arrow_type = type_mapping.get(pg_type, pa.string())
+    return arrow_type
+
+def convert_value(value, arrow_type):
+    """Convert PostgreSQL value to Arrow-compatible value"""
+    if value is None:
+        return None
+
+    # Handle array types - convert to string representation
+    if isinstance(value, (list, tuple)):
+        return str(value)
+
+    # Handle decimal types
+    if isinstance(arrow_type, pa.lib.Decimal128Type) and isinstance(value, Decimal):
+        return value
+
+    # Handle datetime types
+    if isinstance(value, datetime):
+        return value
+    elif isinstance(value, date):
+        return value
+    elif isinstance(value, time):
+        # Convert time to microseconds since midnight
+        return (value.hour * 3600 + value.minute * 60 + value.second) * 1000000 + value.microsecond
+
+    # Handle bytea
+    if isinstance(value, (bytes, memoryview)):
+        return bytes(value)
+
+    # Default: return as-is
+    return value
+
+def get_table_info(conn, schema_name, table_name):
+    """Get detailed table information"""
+    cur = conn.cursor()
+
+    # Get table comment/description
+    cur.execute("""
+        SELECT obj_description(c.oid, 'pg_class') as table_comment
+        FROM pg_class c
+        JOIN pg_namespace n ON n.oid = c.relnamespace
+        WHERE n.nspname = %s AND c.relname = %s
+    """, (schema_name, table_name))
+
+    result = cur.fetchone()
+    table_comment = result[0] if result and result[0] else "No description available"
+
+    # Get row count
+    cur.execute(f"SELECT COUNT(*) FROM {schema_name}.{table_name}")
+    row_count = cur.fetchone()[0]
+
+    # Get table size
+    cur.execute("""
+        SELECT pg_size_pretty(pg_total_relation_size(c.oid)) as size
+        FROM pg_class c
+        JOIN pg_namespace n ON n.oid = c.relnamespace
+        WHERE n.nspname = %s AND c.relname = %s
+    """, (schema_name, table_name))
+
+    result = cur.fetchone()
+    table_size = result[0] if result else "Unknown"
+
+    cur.close()
+    return table_comment, row_count, table_size
+
+def export_table_to_arrow(conn, schema_name, table_name, output_dir):
+    """Export a single table to Arrow IPC format"""
+    try:
+        cur = conn.cursor()
+
+        # Get table information for display
+        table_comment, row_count, table_size = get_table_info(conn, schema_name, table_name)
+
+        # Determine if table is static or dynamic
+        static_tables = {
+            'pg_aggregate', 'pg_am', 'pg_amop', 'pg_amproc', 'pg_cast',
+            'pg_collation', 'pg_conversion', 'pg_language', 'pg_opclass',
+            'pg_operator', 'pg_opfamily', 'pg_proc', 'pg_range',
+            'pg_ts_config', 'pg_ts_dict', 'pg_ts_parser', 'pg_ts_template', 'pg_type'
+        }
+
+        table_type = "STATIC (Built-in data)" if table_name in static_tables else "DYNAMIC (User/DB-specific data)"
+
+        print(f"\n{'='*80}")
+        print(f"Table: {schema_name}.{table_name}")
+        print(f"Type: {table_type}")
+        print(f"Description: {table_comment}")
+        print(f"Row Count: {row_count:,}")
+        print(f"Size: {table_size}")
+        print(f"{'='*80}")
+
+        # Get column information
+        cur.execute("""
+            SELECT
+                c.column_name,
+                c.data_type,
+                c.is_nullable,
+                c.column_default,
+                pgd.description as column_comment
+            FROM information_schema.columns c
+            LEFT JOIN pg_catalog.pg_description pgd ON
+                pgd.objoid = (
+                    SELECT oid FROM pg_class
+                    WHERE relname = c.table_name AND relnamespace = (
+                        SELECT oid FROM pg_namespace WHERE nspname = c.table_schema
+                    )
+                ) AND pgd.objsubid = c.ordinal_position
+            WHERE c.table_schema = %s AND c.table_name = %s
+            ORDER BY c.ordinal_position
+        """, (schema_name, table_name))
+
+        columns = cur.fetchall()
+        if not columns:
+            print(f"No columns found for {schema_name}.{table_name}")
+            return False
+
+        # Display column information
+        print("\nColumns:")
+        print(f"{'Column Name':<30} {'Type':<20} {'Nullable':<10} {'Description':<40}")
+        print("-" * 100)
+
+        # Build Arrow schema
+        arrow_fields = []
+        for col_name, data_type, is_nullable, default, comment in columns:
+            nullable = "YES" if is_nullable == 'YES' else "NO"
+            desc = (comment[:37] + '...') if comment and len(comment) > 40 else (comment or '')
+            print(f"{col_name:<30} {data_type:<20} {nullable:<10} {desc:<40}")
+
+            # Create Arrow field
+            arrow_type = pg_type_to_arrow_type(data_type, is_nullable == 'YES')
+            field = pa.field(col_name, arrow_type, nullable=(is_nullable == 'YES'))
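+            # Types with no dedicated mapping fall back to utf8 above, so the
+            # schema is always complete even for exotic catalog types.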
+            arrow_fields.append(field)
+
+        arrow_schema = pa.schema(arrow_fields)
+
+        # Fetch all data
+        cur.execute(f"SELECT * FROM {schema_name}.{table_name}")
+
+        # Process data in batches to handle large tables
+        batch_size = 10000
+        all_batches = []
+
+        while True:
+            rows = cur.fetchmany(batch_size)
+            if not rows:
+                break
+
+            # Convert rows to columnar format
+            columns_data = {}
+            for i, field in enumerate(arrow_fields):
+                col_values = []
+                for row in rows:
+                    value = convert_value(row[i], field.type)
+                    col_values.append(value)
+
+                # Create Arrow array
+                try:
+                    # pa.array handles every mapped type, including decimal128
+                    arr = pa.array(col_values, type=field.type)
+                    columns_data[field.name] = arr
+                except Exception as e:
+                    print(f"Warning: Error converting column {field.name}: {e}")
+                    # Fallback to string
+                    str_values = [str(v) if v is not None else None for v in col_values]
+                    columns_data[field.name] = pa.array(str_values, type=pa.string())
+
+            # Create record batch
+            batch = pa.RecordBatch.from_pydict(columns_data, schema=arrow_schema)
+            all_batches.append(batch)
+
+        if not all_batches:
+            # Create empty table with schema
+            table = pa.Table.from_batches([], schema=arrow_schema)
+        else:
+            # Combine all batches into a table
+            table = pa.Table.from_batches(all_batches, schema=arrow_schema)
+
+        # Save in Feather v2 format (the Arrow IPC file format)
+        feather_file = os.path.join(output_dir, f'{table_name}.feather')
+        feather.write_feather(table, feather_file)
+        print(f"Saved as Feather format: {feather_file}")
+
+        cur.close()
+        return True
+
+    except Exception as e:
+        print(f"\nError exporting {schema_name}.{table_name}: {str(e)}")
+        import traceback
+        traceback.print_exc()
+        return False
+
+def main():
+    # Database connection parameters
+    conn_params = {
+        'host': 'localhost',
+        'database': os.environ.get('DB_NAME', 'postgres'),
+        'user': os.environ.get('DB_USER', 'postgres'),
+        'password': os.environ.get('DB_PASSWORD', 'postgres')
+    }
+
+    # Get whitelist from environment variable
+    whitelist_str = os.environ.get('WHITELIST_TABLES', '[]')
+    whitelist_tables = json.loads(whitelist_str)
+    table_filter = os.environ.get('TABLE_FILTER', 'all')
+
+    output_dir = '/exports'
+    os.makedirs(output_dir, exist_ok=True)
+
+    try:
+        # Connect to database
+        conn = psycopg2.connect(**conn_params)
+        cur = conn.cursor()
+
+        # Get all tables in pg_catalog schema
+        cur.execute("""
+            SELECT tablename
+            FROM pg_tables
+            WHERE schemaname = 'pg_catalog'
+            ORDER BY tablename
+        """)
+
+        all_tables = [row[0] for row in cur.fetchall()]
+        print(f"\nFound {len(all_tables)} tables in pg_catalog schema")
+
+        # Filter tables based on mode
+        if table_filter == 'whitelist' and whitelist_tables:
+            tables = [t for t in all_tables if t in whitelist_tables]
+            print(f"Filtering to {len(tables)} static tables only")
+            print(f"Static tables: {', '.join(sorted(tables))}")
+        else:
+            tables = all_tables
+            print(f"Exporting all {len(tables)} tables")
+
+        print(f"\nWill export {len(tables)} tables")
+        print("="*80)
+
+        # Export each table
+        success_count = 0
+        failed_tables = []
+
+        for i, table in enumerate(sorted(tables), 1):
+            print(f"\n[{i}/{len(tables)}] Processing {table}...")
+
+            if export_table_to_arrow(conn, 'pg_catalog', table, output_dir):
+                success_count += 1
+            else:
+                failed_tables.append(table)
+
+        # Summary
+        print("\n" + "="*80)
+        print("EXPORT SUMMARY")
+        print("="*80)
+        print(f"Export mode: {table_filter}")
{table_filter}") + print(f"Total tables found: {len(all_tables)}") + print(f"Tables selected for export: {len(tables)}") + print(f"Successfully exported: {success_count}") + print(f"Failed to export: {len(failed_tables)}") + + if failed_tables: + print(f"\nFailed tables: {', '.join(failed_tables)}") + + print("\n" + "="*80) + + cur.close() + conn.close() + + except Exception as e: + print(f"Database connection error: {str(e)}") + sys.exit(1) + +if __name__ == '__main__': + main() +EOF + +# Copy the Python script to container +docker cp export_to_arrow.py $CONTAINER_NAME:/export_to_arrow.py + +# Run the export script with appropriate filter +echo "Running export script..." +docker exec -e DB_NAME=$DB_NAME -e DB_USER=$DB_USER -e DB_PASSWORD=$DB_PASSWORD \ + -e WHITELIST_TABLES="$WHITELIST_STR" -e TABLE_FILTER="$TABLE_FILTER" \ + $CONTAINER_NAME /opt/venv/bin/python /export_to_arrow.py + +# Copy exported files from container to host +echo "Copying exported files to host..." +docker cp $CONTAINER_NAME:/exports/. "$EXPORT_DIR/" + +# Clean up +echo "Cleaning up..." +rm -f export_to_arrow.py +docker stop $CONTAINER_NAME +docker rm $CONTAINER_NAME + +echo "=== Export completed! ===" +echo "Arrow IPC files are available in: $EXPORT_DIR" +echo "" +echo "Files exported: $(ls -1 "$EXPORT_DIR"/*.arrow 2>/dev/null | wc -l)" +echo "Total size: $(du -sh "$EXPORT_DIR" 2>/dev/null | cut -f1)" +echo "" +echo "To view the exported files:" +echo "ls -la $EXPORT_DIR/" +echo "" +echo "Usage:" +echo " ./export_pg_catalog_arrow.sh # Export only static tables (default)" +echo " ./export_pg_catalog_arrow.sh all # Export all tables" diff --git a/pg_catalog_arrow_exports/pg_aggregate.feather b/pg_catalog_arrow_exports/pg_aggregate.feather new file mode 100644 index 0000000..7e69410 Binary files /dev/null and b/pg_catalog_arrow_exports/pg_aggregate.feather differ diff --git a/pg_catalog_arrow_exports/pg_am.feather b/pg_catalog_arrow_exports/pg_am.feather new file mode 100644 index 0000000..372445a Binary files /dev/null and b/pg_catalog_arrow_exports/pg_am.feather differ diff --git a/pg_catalog_arrow_exports/pg_amop.feather b/pg_catalog_arrow_exports/pg_amop.feather new file mode 100644 index 0000000..8919bc5 Binary files /dev/null and b/pg_catalog_arrow_exports/pg_amop.feather differ diff --git a/pg_catalog_arrow_exports/pg_amproc.feather b/pg_catalog_arrow_exports/pg_amproc.feather new file mode 100644 index 0000000..f9149d4 Binary files /dev/null and b/pg_catalog_arrow_exports/pg_amproc.feather differ diff --git a/pg_catalog_arrow_exports/pg_attrdef.feather b/pg_catalog_arrow_exports/pg_attrdef.feather new file mode 100644 index 0000000..4470857 Binary files /dev/null and b/pg_catalog_arrow_exports/pg_attrdef.feather differ diff --git a/pg_catalog_arrow_exports/pg_attribute.feather b/pg_catalog_arrow_exports/pg_attribute.feather new file mode 100644 index 0000000..ee0af97 Binary files /dev/null and b/pg_catalog_arrow_exports/pg_attribute.feather differ diff --git a/pg_catalog_arrow_exports/pg_auth_members.feather b/pg_catalog_arrow_exports/pg_auth_members.feather new file mode 100644 index 0000000..5e5dde1 Binary files /dev/null and b/pg_catalog_arrow_exports/pg_auth_members.feather differ diff --git a/pg_catalog_arrow_exports/pg_authid.feather b/pg_catalog_arrow_exports/pg_authid.feather new file mode 100644 index 0000000..87d11d1 Binary files /dev/null and b/pg_catalog_arrow_exports/pg_authid.feather differ diff --git a/pg_catalog_arrow_exports/pg_cast.feather b/pg_catalog_arrow_exports/pg_cast.feather new file 
new file mode 100644
index 0000000..60d6285
Binary files /dev/null and b/pg_catalog_arrow_exports/pg_cast.feather differ
diff --git a/pg_catalog_arrow_exports/pg_class.feather b/pg_catalog_arrow_exports/pg_class.feather
new file mode 100644
index 0000000..ee79ccf
Binary files /dev/null and b/pg_catalog_arrow_exports/pg_class.feather differ
diff --git a/pg_catalog_arrow_exports/pg_collation.feather b/pg_catalog_arrow_exports/pg_collation.feather
new file mode 100644
index 0000000..c42fcee
Binary files /dev/null and b/pg_catalog_arrow_exports/pg_collation.feather differ
diff --git a/pg_catalog_arrow_exports/pg_constraint.feather b/pg_catalog_arrow_exports/pg_constraint.feather
new file mode 100644
index 0000000..a618337
Binary files /dev/null and b/pg_catalog_arrow_exports/pg_constraint.feather differ
diff --git a/pg_catalog_arrow_exports/pg_conversion.feather b/pg_catalog_arrow_exports/pg_conversion.feather
new file mode 100644
index 0000000..308f777
Binary files /dev/null and b/pg_catalog_arrow_exports/pg_conversion.feather differ
diff --git a/pg_catalog_arrow_exports/pg_database.feather b/pg_catalog_arrow_exports/pg_database.feather
new file mode 100644
index 0000000..a43468f
Binary files /dev/null and b/pg_catalog_arrow_exports/pg_database.feather differ
diff --git a/pg_catalog_arrow_exports/pg_db_role_setting.feather b/pg_catalog_arrow_exports/pg_db_role_setting.feather
new file mode 100644
index 0000000..0d2577d
Binary files /dev/null and b/pg_catalog_arrow_exports/pg_db_role_setting.feather differ
diff --git a/pg_catalog_arrow_exports/pg_default_acl.feather b/pg_catalog_arrow_exports/pg_default_acl.feather
new file mode 100644
index 0000000..097878e
Binary files /dev/null and b/pg_catalog_arrow_exports/pg_default_acl.feather differ
diff --git a/pg_catalog_arrow_exports/pg_depend.feather b/pg_catalog_arrow_exports/pg_depend.feather
new file mode 100644
index 0000000..88178cc
Binary files /dev/null and b/pg_catalog_arrow_exports/pg_depend.feather differ
diff --git a/pg_catalog_arrow_exports/pg_description.feather b/pg_catalog_arrow_exports/pg_description.feather
new file mode 100644
index 0000000..b0d71b9
Binary files /dev/null and b/pg_catalog_arrow_exports/pg_description.feather differ
diff --git a/pg_catalog_arrow_exports/pg_enum.feather b/pg_catalog_arrow_exports/pg_enum.feather
new file mode 100644
index 0000000..dbad2b7
Binary files /dev/null and b/pg_catalog_arrow_exports/pg_enum.feather differ
diff --git a/pg_catalog_arrow_exports/pg_event_trigger.feather b/pg_catalog_arrow_exports/pg_event_trigger.feather
new file mode 100644
index 0000000..e63feda
Binary files /dev/null and b/pg_catalog_arrow_exports/pg_event_trigger.feather differ
diff --git a/pg_catalog_arrow_exports/pg_extension.feather b/pg_catalog_arrow_exports/pg_extension.feather
new file mode 100644
index 0000000..c3a438b
Binary files /dev/null and b/pg_catalog_arrow_exports/pg_extension.feather differ
diff --git a/pg_catalog_arrow_exports/pg_foreign_data_wrapper.feather b/pg_catalog_arrow_exports/pg_foreign_data_wrapper.feather
new file mode 100644
index 0000000..61899a5
Binary files /dev/null and b/pg_catalog_arrow_exports/pg_foreign_data_wrapper.feather differ
diff --git a/pg_catalog_arrow_exports/pg_foreign_server.feather b/pg_catalog_arrow_exports/pg_foreign_server.feather
new file mode 100644
index 0000000..8d22dc6
Binary files /dev/null and b/pg_catalog_arrow_exports/pg_foreign_server.feather differ
diff --git a/pg_catalog_arrow_exports/pg_foreign_table.feather b/pg_catalog_arrow_exports/pg_foreign_table.feather
new file mode 100644
index 0000000..47079d4
Binary files /dev/null and b/pg_catalog_arrow_exports/pg_foreign_table.feather differ
diff --git a/pg_catalog_arrow_exports/pg_index.feather b/pg_catalog_arrow_exports/pg_index.feather
new file mode 100644
index 0000000..1e9b990
Binary files /dev/null and b/pg_catalog_arrow_exports/pg_index.feather differ
diff --git a/pg_catalog_arrow_exports/pg_inherits.feather b/pg_catalog_arrow_exports/pg_inherits.feather
new file mode 100644
index 0000000..ea4e2ea
Binary files /dev/null and b/pg_catalog_arrow_exports/pg_inherits.feather differ
diff --git a/pg_catalog_arrow_exports/pg_init_privs.feather b/pg_catalog_arrow_exports/pg_init_privs.feather
new file mode 100644
index 0000000..892210a
Binary files /dev/null and b/pg_catalog_arrow_exports/pg_init_privs.feather differ
diff --git a/pg_catalog_arrow_exports/pg_language.feather b/pg_catalog_arrow_exports/pg_language.feather
new file mode 100644
index 0000000..aad59be
Binary files /dev/null and b/pg_catalog_arrow_exports/pg_language.feather differ
diff --git a/pg_catalog_arrow_exports/pg_largeobject.feather b/pg_catalog_arrow_exports/pg_largeobject.feather
new file mode 100644
index 0000000..296772f
Binary files /dev/null and b/pg_catalog_arrow_exports/pg_largeobject.feather differ
diff --git a/pg_catalog_arrow_exports/pg_largeobject_metadata.feather b/pg_catalog_arrow_exports/pg_largeobject_metadata.feather
new file mode 100644
index 0000000..193c4c8
Binary files /dev/null and b/pg_catalog_arrow_exports/pg_largeobject_metadata.feather differ
diff --git a/pg_catalog_arrow_exports/pg_namespace.feather b/pg_catalog_arrow_exports/pg_namespace.feather
new file mode 100644
index 0000000..ebe0609
Binary files /dev/null and b/pg_catalog_arrow_exports/pg_namespace.feather differ
diff --git a/pg_catalog_arrow_exports/pg_opclass.feather b/pg_catalog_arrow_exports/pg_opclass.feather
new file mode 100644
index 0000000..c353565
Binary files /dev/null and b/pg_catalog_arrow_exports/pg_opclass.feather differ
diff --git a/pg_catalog_arrow_exports/pg_operator.feather b/pg_catalog_arrow_exports/pg_operator.feather
new file mode 100644
index 0000000..245fc95
Binary files /dev/null and b/pg_catalog_arrow_exports/pg_operator.feather differ
diff --git a/pg_catalog_arrow_exports/pg_opfamily.feather b/pg_catalog_arrow_exports/pg_opfamily.feather
new file mode 100644
index 0000000..05c8806
Binary files /dev/null and b/pg_catalog_arrow_exports/pg_opfamily.feather differ
diff --git a/pg_catalog_arrow_exports/pg_parameter_acl.feather b/pg_catalog_arrow_exports/pg_parameter_acl.feather
new file mode 100644
index 0000000..365964c
Binary files /dev/null and b/pg_catalog_arrow_exports/pg_parameter_acl.feather differ
diff --git a/pg_catalog_arrow_exports/pg_partitioned_table.feather b/pg_catalog_arrow_exports/pg_partitioned_table.feather
new file mode 100644
index 0000000..da6e42d
Binary files /dev/null and b/pg_catalog_arrow_exports/pg_partitioned_table.feather differ
diff --git a/pg_catalog_arrow_exports/pg_policy.feather b/pg_catalog_arrow_exports/pg_policy.feather
new file mode 100644
index 0000000..ac03a61
Binary files /dev/null and b/pg_catalog_arrow_exports/pg_policy.feather differ
diff --git a/pg_catalog_arrow_exports/pg_proc.feather b/pg_catalog_arrow_exports/pg_proc.feather
new file mode 100644
index 0000000..d6bbfdf
Binary files /dev/null and b/pg_catalog_arrow_exports/pg_proc.feather differ
diff --git a/pg_catalog_arrow_exports/pg_publication.feather b/pg_catalog_arrow_exports/pg_publication.feather
new file mode 100644
index 0000000..f659f1c
Binary files /dev/null and b/pg_catalog_arrow_exports/pg_publication.feather differ
diff --git a/pg_catalog_arrow_exports/pg_publication_namespace.feather b/pg_catalog_arrow_exports/pg_publication_namespace.feather
new file mode 100644
index 0000000..8bb2eeb
Binary files /dev/null and b/pg_catalog_arrow_exports/pg_publication_namespace.feather differ
diff --git a/pg_catalog_arrow_exports/pg_publication_rel.feather b/pg_catalog_arrow_exports/pg_publication_rel.feather
new file mode 100644
index 0000000..ea8ee46
Binary files /dev/null and b/pg_catalog_arrow_exports/pg_publication_rel.feather differ
diff --git a/pg_catalog_arrow_exports/pg_range.feather b/pg_catalog_arrow_exports/pg_range.feather
new file mode 100644
index 0000000..c7ad9f1
Binary files /dev/null and b/pg_catalog_arrow_exports/pg_range.feather differ
diff --git a/pg_catalog_arrow_exports/pg_replication_origin.feather b/pg_catalog_arrow_exports/pg_replication_origin.feather
new file mode 100644
index 0000000..7b0298a
Binary files /dev/null and b/pg_catalog_arrow_exports/pg_replication_origin.feather differ
diff --git a/pg_catalog_arrow_exports/pg_rewrite.feather b/pg_catalog_arrow_exports/pg_rewrite.feather
new file mode 100644
index 0000000..dc08887
Binary files /dev/null and b/pg_catalog_arrow_exports/pg_rewrite.feather differ
diff --git a/pg_catalog_arrow_exports/pg_seclabel.feather b/pg_catalog_arrow_exports/pg_seclabel.feather
new file mode 100644
index 0000000..a327f32
Binary files /dev/null and b/pg_catalog_arrow_exports/pg_seclabel.feather differ
diff --git a/pg_catalog_arrow_exports/pg_sequence.feather b/pg_catalog_arrow_exports/pg_sequence.feather
new file mode 100644
index 0000000..ca46efd
Binary files /dev/null and b/pg_catalog_arrow_exports/pg_sequence.feather differ
diff --git a/pg_catalog_arrow_exports/pg_shdepend.feather b/pg_catalog_arrow_exports/pg_shdepend.feather
new file mode 100644
index 0000000..cbdb81e
Binary files /dev/null and b/pg_catalog_arrow_exports/pg_shdepend.feather differ
diff --git a/pg_catalog_arrow_exports/pg_shdescription.feather b/pg_catalog_arrow_exports/pg_shdescription.feather
new file mode 100644
index 0000000..d7f039c
Binary files /dev/null and b/pg_catalog_arrow_exports/pg_shdescription.feather differ
diff --git a/pg_catalog_arrow_exports/pg_shseclabel.feather b/pg_catalog_arrow_exports/pg_shseclabel.feather
new file mode 100644
index 0000000..2907e5d
Binary files /dev/null and b/pg_catalog_arrow_exports/pg_shseclabel.feather differ
diff --git a/pg_catalog_arrow_exports/pg_statistic.feather b/pg_catalog_arrow_exports/pg_statistic.feather
new file mode 100644
index 0000000..3c1ea37
Binary files /dev/null and b/pg_catalog_arrow_exports/pg_statistic.feather differ
diff --git a/pg_catalog_arrow_exports/pg_statistic_ext.feather b/pg_catalog_arrow_exports/pg_statistic_ext.feather
new file mode 100644
index 0000000..b1a5a70
Binary files /dev/null and b/pg_catalog_arrow_exports/pg_statistic_ext.feather differ
diff --git a/pg_catalog_arrow_exports/pg_statistic_ext_data.feather b/pg_catalog_arrow_exports/pg_statistic_ext_data.feather
new file mode 100644
index 0000000..3eaee81
Binary files /dev/null and b/pg_catalog_arrow_exports/pg_statistic_ext_data.feather differ
diff --git a/pg_catalog_arrow_exports/pg_subscription.feather b/pg_catalog_arrow_exports/pg_subscription.feather
new file mode 100644
index 0000000..2bf7478
Binary files /dev/null and b/pg_catalog_arrow_exports/pg_subscription.feather differ
diff --git a/pg_catalog_arrow_exports/pg_subscription_rel.feather b/pg_catalog_arrow_exports/pg_subscription_rel.feather
new file mode 100644
index 0000000..4b7c393
Binary files /dev/null and b/pg_catalog_arrow_exports/pg_subscription_rel.feather differ
diff --git a/pg_catalog_arrow_exports/pg_tablespace.feather b/pg_catalog_arrow_exports/pg_tablespace.feather
new file mode 100644
index 0000000..f2035f8
Binary files /dev/null and b/pg_catalog_arrow_exports/pg_tablespace.feather differ
diff --git a/pg_catalog_arrow_exports/pg_transform.feather b/pg_catalog_arrow_exports/pg_transform.feather
new file mode 100644
index 0000000..8cf564a
Binary files /dev/null and b/pg_catalog_arrow_exports/pg_transform.feather differ
diff --git a/pg_catalog_arrow_exports/pg_trigger.feather b/pg_catalog_arrow_exports/pg_trigger.feather
new file mode 100644
index 0000000..b8524c9
Binary files /dev/null and b/pg_catalog_arrow_exports/pg_trigger.feather differ
diff --git a/pg_catalog_arrow_exports/pg_ts_config.feather b/pg_catalog_arrow_exports/pg_ts_config.feather
new file mode 100644
index 0000000..bb99403
Binary files /dev/null and b/pg_catalog_arrow_exports/pg_ts_config.feather differ
diff --git a/pg_catalog_arrow_exports/pg_ts_config_map.feather b/pg_catalog_arrow_exports/pg_ts_config_map.feather
new file mode 100644
index 0000000..d4688f2
Binary files /dev/null and b/pg_catalog_arrow_exports/pg_ts_config_map.feather differ
diff --git a/pg_catalog_arrow_exports/pg_ts_dict.feather b/pg_catalog_arrow_exports/pg_ts_dict.feather
new file mode 100644
index 0000000..2cd8bbf
Binary files /dev/null and b/pg_catalog_arrow_exports/pg_ts_dict.feather differ
diff --git a/pg_catalog_arrow_exports/pg_ts_parser.feather b/pg_catalog_arrow_exports/pg_ts_parser.feather
new file mode 100644
index 0000000..1253c92
Binary files /dev/null and b/pg_catalog_arrow_exports/pg_ts_parser.feather differ
diff --git a/pg_catalog_arrow_exports/pg_ts_template.feather b/pg_catalog_arrow_exports/pg_ts_template.feather
new file mode 100644
index 0000000..6526971
Binary files /dev/null and b/pg_catalog_arrow_exports/pg_ts_template.feather differ
diff --git a/pg_catalog_arrow_exports/pg_type.feather b/pg_catalog_arrow_exports/pg_type.feather
new file mode 100644
index 0000000..d2a78b2
Binary files /dev/null and b/pg_catalog_arrow_exports/pg_type.feather differ
diff --git a/pg_catalog_arrow_exports/pg_user_mapping.feather b/pg_catalog_arrow_exports/pg_user_mapping.feather
new file mode 100644
index 0000000..cde913b
Binary files /dev/null and b/pg_catalog_arrow_exports/pg_user_mapping.feather differ
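
Note: a minimal read-back sketch (not part of this patch) showing how one of the exported files can be loaded from Rust. It assumes the Feather v2 output produced above: Feather v2 is the Arrow IPC file format, so the arrow FileReader re-exported by DataFusion can open it directly. pyarrow typically LZ4-compresses Feather v2 by default, so the arrow crate's lz4 feature is assumed to be enabled (or pass compression='uncompressed' to write_feather). The path and table name below are illustrative.

use std::fs::File;

use datafusion::arrow::ipc::reader::FileReader;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Open one of the files produced by export_pg_catalog_arrow.sh.
    let file = File::open("pg_catalog_arrow_exports/pg_type.feather")?;
    // No column projection: read every column of the table.
    let reader = FileReader::try_new(file, None)?;
    println!("schema: {:?}", reader.schema());
    // FileReader iterates over Result<RecordBatch, ArrowError>.
    for batch in reader {
        let batch = batch?;
        println!("{} rows x {} columns", batch.num_rows(), batch.num_columns());
    }
    Ok(())
}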