@@ -949,8 +949,8 @@ impl PhysicalPlanner {
                 ))
             }
             OpStruct::NativeScan(scan) => {
-                let data_schema = parse_message_type(&*scan.data_schema).unwrap();
-                let required_schema = parse_message_type(&*scan.required_schema).unwrap();
+                let data_schema = parse_message_type(&scan.data_schema).unwrap();
+                let required_schema = parse_message_type(&scan.required_schema).unwrap();
 
                 let data_schema_descriptor =
                     parquet::schema::types::SchemaDescriptor::new(Arc::new(data_schema));
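The `&*` to `&` change works because `parse_message_type` takes `&str`, and Rust's deref coercion converts `&String` to `&str` automatically at the call site, so the explicit deref-reborrow is redundant. A minimal sketch of the coercion, with a hypothetical `takes_str` standing in for `parse_message_type`:

```rust
// `&String` coerces to `&str` wherever a `&str` is expected, so the
// explicit deref-reborrow `&*owned` and the plain borrow `&owned`
// produce identical calls.
fn takes_str(s: &str) -> usize {
    s.len()
}

fn main() {
    let owned = String::from("message spark_schema { required int32 id; }");
    assert_eq!(takes_str(&*owned), takes_str(&owned));
}
```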
@@ -968,16 +968,6 @@ impl PhysicalPlanner {
                     )
                     .unwrap(),
                 );
-                assert!(!required_schema_arrow.fields.is_empty());
-
-                let mut projection_vector: Vec<usize> =
-                    Vec::with_capacity(required_schema_arrow.fields.len());
-                // TODO: could be faster with a hashmap rather than iterating over data_schema_arrow with index_of.
-                required_schema_arrow.fields.iter().for_each(|field| {
-                    projection_vector.push(data_schema_arrow.index_of(field.name()).unwrap());
-                });
-
-                assert_eq!(projection_vector.len(), required_schema_arrow.fields.len());
 
                 // Convert the Spark expressions to Physical expressions
                 let data_filters: Result<Vec<Arc<dyn PhysicalExpr>>, ExecutionError> = scan
@@ -997,39 +987,56 @@ impl PhysicalPlanner {
                     ))
                 });
 
-                let object_store_url = ObjectStoreUrl::local_filesystem();
-                let paths: Vec<Url> = scan
-                    .path
-                    .iter()
-                    .map(|path| Url::parse(path).unwrap())
-                    .collect();
-
                 let object_store = object_store::local::LocalFileSystem::new();
                 // register the object store with the runtime environment
                 let url = Url::try_from("file://").unwrap();
                 self.session_ctx
                     .runtime_env()
                     .register_object_store(&url, Arc::new(object_store));
 
-                let files: Vec<PartitionedFile> = paths
-                    .iter()
-                    .map(|path| PartitionedFile::from_path(path.path().to_string()).unwrap())
-                    .collect();
-
-                // partition the files
-                // TODO really should partition the row groups
-
-                let mut file_groups = vec![vec![]; partition_count];
-                files.iter().enumerate().for_each(|(idx, file)| {
-                    file_groups[idx % partition_count].push(file.clone());
+                // Generate file groups
+                let mut file_groups: Vec<Vec<PartitionedFile>> =
+                    Vec::with_capacity(partition_count);
+                scan.file_partitions.iter().for_each(|partition| {
+                    let mut files = Vec::with_capacity(partition.partitioned_file.len());
+                    partition.partitioned_file.iter().for_each(|file| {
+                        assert!(file.start + file.length <= file.file_size);
+                        files.push(PartitionedFile::new_with_range(
+                            Url::parse(file.file_path.as_ref())
+                                .unwrap()
+                                .path()
+                                .to_string(),
+                            file.file_size as u64,
+                            file.start,
+                            file.start + file.length,
+                        ));
+                    });
+                    file_groups.push(files);
                 });
 
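This hunk drops the round-robin distribution of whole files (and its "TODO really should partition the row groups") in favor of byte-range splits that arrive pre-computed in `scan.file_partitions` and are mapped onto DataFusion's `PartitionedFile::new_with_range`. The `assert!` enforces the invariant the range constructor expects: a split must end at or before the end of the file. As a rough sketch of what such splits look like, here is a hypothetical `split_file` helper that carves one file into `n` even byte ranges; in the patch itself Spark computes the splits and ships them over in the protobuf:

```rust
use datafusion::datasource::listing::PartitionedFile;

// Hypothetical helper, not part of the patch: split one file of
// `file_size` bytes into `n` contiguous byte-range scans. DataFusion's
// Parquet reader later resolves each byte range to whole row groups.
fn split_file(path: &str, file_size: u64, n: u64) -> Vec<PartitionedFile> {
    let chunk = (file_size + n - 1) / n; // ceiling division
    (0..n)
        .map(|i| {
            let start = (i * chunk).min(file_size) as i64;
            let end = ((i + 1) * chunk).min(file_size) as i64;
            PartitionedFile::new_with_range(path.to_string(), file_size, start, end)
        })
        .collect()
}
```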
-                let file_scan_config =
+                // TODO: I think we can remove partition_count in the future, but leave for testing.
+                assert_eq!(file_groups.len(), partition_count);
+
+                let object_store_url = ObjectStoreUrl::local_filesystem();
+                let mut file_scan_config =
                     FileScanConfig::new(object_store_url, Arc::clone(&data_schema_arrow))
-                        .with_file_groups(file_groups)
-                        .with_projection(Some(projection_vector));
+                        .with_file_groups(file_groups);
+
+                // Check for projection, if so generate the vector and add to FileScanConfig.
+                if !required_schema_arrow.fields.is_empty() {
+                    let mut projection_vector: Vec<usize> =
+                        Vec::with_capacity(required_schema_arrow.fields.len());
+                    // TODO: could be faster with a hashmap rather than iterating over data_schema_arrow with index_of.
+                    required_schema_arrow.fields.iter().for_each(|field| {
+                        projection_vector.push(data_schema_arrow.index_of(field.name()).unwrap());
+                    });
+
+                    assert_eq!(projection_vector.len(), required_schema_arrow.fields.len());
+                    file_scan_config = file_scan_config.with_projection(Some(projection_vector));
+                }
 
                 let mut table_parquet_options = TableParquetOptions::new();
+                // TODO: Maybe these are configs?
                 table_parquet_options.global.pushdown_filters = true;
                 table_parquet_options.global.reorder_filters = true;
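Building the projection vector is now guarded by the emptiness check instead of the removed `assert!`, and `file_scan_config` only gains a projection when one exists. The in-code TODO about replacing the per-field `index_of` linear scan with a hashmap could look roughly like the sketch below, assuming field names are unique as they are in a Parquet schema (a hypothetical `projection_indices`, not part of the patch):

```rust
use std::collections::HashMap;

use arrow::datatypes::Schema;

// Sketch of the TODO: build a name -> index map once, then resolve each
// required field with an O(1) lookup instead of an O(n) `index_of` scan,
// turning the whole loop from O(n * m) into O(n + m).
fn projection_indices(data_schema: &Schema, required_schema: &Schema) -> Vec<usize> {
    let index_by_name: HashMap<&str, usize> = data_schema
        .fields()
        .iter()
        .enumerate()
        .map(|(i, field)| (field.name().as_str(), i))
        .collect();
    required_schema
        .fields()
        .iter()
        .map(|field| index_by_name[field.name().as_str()])
        .collect()
}
```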
@@ -1041,7 +1048,7 @@ impl PhysicalPlanner {
                 }
 
                 let scan = builder.build();
-                return Ok((vec![], Arc::new(scan)));
+                Ok((vec![], Arc::new(scan)))
             }
             OpStruct::Scan(scan) => {
                 let data_types = scan.fields.iter().map(to_arrow_datatype).collect_vec();
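The last hunk is the `clippy::needless_return` fix: a Rust block evaluates to its final expression, so the trailing `return ...;` adds nothing. A minimal illustration with hypothetical functions that compile to the same thing:

```rust
fn with_return() -> Result<i32, String> {
    return Ok(42); // flagged by clippy::needless_return
}

fn with_tail_expression() -> Result<i32, String> {
    Ok(42) // idiomatic: the block's last expression is its value
}
```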