diff --git a/rust/arrow/src/csv/reader.rs b/rust/arrow/src/csv/reader.rs index c6f90ae4633..985c88b4978 100644 --- a/rust/arrow/src/csv/reader.rs +++ b/rust/arrow/src/csv/reader.rs @@ -98,8 +98,32 @@ fn infer_field_schema(string: &str) -> DataType { /// /// If `max_read_records` is not set, the whole file is read to infer its schema. /// +/// Return inferred schema and number of records used for inference. This function does not change +/// reader cursor offset. +pub fn infer_file_schema( + reader: &mut R, + delimiter: u8, + max_read_records: Option, + has_header: bool, +) -> Result<(Schema, usize)> { + let saved_offset = reader.seek(SeekFrom::Current(0))?; + + let (schema, records_count) = + infer_reader_schema(reader, delimiter, max_read_records, has_header)?; + + // restore the reader to the offset it had on entry + reader.seek(SeekFrom::Start(saved_offset))?; + + Ok((schema, records_count)) +} + +/// Infer schema of CSV records provided by a struct that implements the `Read` trait. +/// +/// `max_read_records` controls the maximum number of records to read. If `max_read_records` is +/// not set, all records are read to infer the schema. +/// /// Return infered schema and number of records used for inference. 
-fn infer_file_schema( +pub fn infer_reader_schema( reader: &mut R, delimiter: u8, max_read_records: Option, @@ -121,18 +145,12 @@ fn infer_file_schema( .collect() }; - // save the csv reader position after reading headers - let position = csv_reader.position().clone(); - let header_length = headers.len(); // keep track of inferred field types let mut column_types: Vec> = vec![HashSet::new(); header_length]; // keep track of columns with nulls let mut nulls: Vec = vec![false; header_length]; - // return csv reader position to after headers - csv_reader.seek(position)?; - let mut records_count = 0; let mut fields = vec![]; @@ -184,9 +202,6 @@ fn infer_file_schema( } } - // return the reader seek back to the start - csv_reader.into_inner().seek(SeekFrom::Start(0))?; - Ok((Schema::new(fields), records_count)) }