Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 16 additions & 1 deletion arrow-schema/src/datatype.rs
Original file line number Diff line number Diff line change
Expand Up @@ -458,7 +458,22 @@ pub enum UnionMode {

impl fmt::Display for DataType {
    /// Formats the data type for display.
    ///
    /// `DataType::Struct` is rendered as `Struct(name1 type1, name2 type2)`,
    /// where each field's type is itself written through `Display` (so nested
    /// structs use the same compact form). An empty struct renders as
    /// `Struct()`. Every other variant falls back to its `Debug` output.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self {
            DataType::Struct(fields) => {
                write!(f, "Struct(")?;
                // Stream each field straight into the formatter rather than
                // building an intermediate Vec<String> and joined String.
                for (idx, field) in fields.iter().enumerate() {
                    if idx > 0 {
                        write!(f, ", ")?;
                    }
                    write!(f, "{} {}", field.name(), field.data_type())?;
                }
                write!(f, ")")
            }
            _ => write!(f, "{self:?}"),
        }
    }
}

Expand Down
102 changes: 92 additions & 10 deletions arrow-schema/src/datatype_parse.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

use std::{fmt::Display, iter::Peekable, str::Chars, sync::Arc};

use crate::{ArrowError, DataType, Field, IntervalUnit, TimeUnit};
use crate::{ArrowError, DataType, Field, Fields, IntervalUnit, TimeUnit};

pub(crate) fn parse_data_type(val: &str) -> ArrowResult<DataType> {
Parser::new(val).parse()
Expand Down Expand Up @@ -78,6 +78,11 @@ impl<'a> Parser<'a> {
Token::List => self.parse_list(),
Token::LargeList => self.parse_large_list(),
Token::FixedSizeList => self.parse_fixed_size_list(),
Token::Struct => self.parse_struct(),
Token::FieldName(word) => Err(make_error(
self.val,
&format!("unrecognized word: {}", word),
)),
tok => Err(make_error(
self.val,
&format!("finding next type, got unexpected '{tok}'"),
Expand Down Expand Up @@ -150,6 +155,10 @@ impl<'a> Parser<'a> {
fn parse_double_quoted_string(&mut self, context: &str) -> ArrowResult<String> {
match self.next_token()? {
Token::DoubleQuotedString(s) => Ok(s),
Token::FieldName(word) => Err(make_error(
self.val,
&format!("unrecognized word: {}", word),
)),
tok => Err(make_error(
self.val,
&format!("finding double quoted string for {context}, got '{tok}'"),
Expand Down Expand Up @@ -291,6 +300,46 @@ impl<'a> Parser<'a> {
Box::new(value_type),
))
}
/// Parses the parenthesised body that follows the `Struct` keyword.
///
/// Accepts `(` followed by zero or more comma separated `name type` pairs
/// and a closing `)`. The name may be an unrecognized word
/// (`Token::FieldName`) or a simple type keyword (`Token::SimpleType`);
/// the type is read with `parse_next_type`. Each field is built with
/// `Field::new(name, type, true)`.
///
/// # Errors
///
/// Returns an error when a `)` directly follows a comma (trailing comma),
/// when a field name is some other token, or when a field is followed by
/// anything other than `,` or `)`. Errors from `expect_token`,
/// `next_token` and `parse_next_type` are propagated unchanged.
fn parse_struct(&mut self) -> ArrowResult<DataType> {
    self.expect_token(Token::LParen)?;

    let mut members = Vec::new();
    loop {
        let name = match self.next_token()? {
            Token::FieldName(word) => word,
            // A type keyword is still a legal field name
            Token::SimpleType(keyword) => keyword.to_string(),
            // `Struct()` - nothing parsed yet, so this is simply an empty struct
            Token::RParen if members.is_empty() => break,
            // A `)` after at least one field can only follow a comma here
            Token::RParen => {
                return Err(make_error(
                    self.val,
                    "Unexpected token while parsing Struct fields. Expected a word for the name of Struct, but got trailing comma",
                ));
            }
            tok => {
                return Err(make_error(
                    self.val,
                    &format!("Expected a word for the name of Struct, but got {tok}"),
                ));
            }
        };

        let member_type = self.parse_next_type()?;
        members.push(Arc::new(Field::new(name, member_type, true)));

        match self.next_token()? {
            // Another field follows; go around again
            Token::Comma => {}
            Token::RParen => break,
            tok => {
                return Err(make_error(
                    self.val,
                    &format!("Unexpected token while parsing Struct fields. Expected ',' or ')', but got '{tok}'"),
                ));
            }
        }
    }

    Ok(DataType::Struct(Fields::from(members)))
}

/// return the next token, or an error if there are none left
fn next_token(&mut self) -> ArrowResult<Token> {
Expand Down Expand Up @@ -479,12 +528,9 @@ impl<'a> Tokenizer<'a> {
"Some" => Token::Some,
"None" => Token::None,

_ => {
return Err(make_error(
self.val,
&format!("unrecognized word: {}", self.word),
))
}
"Struct" => Token::Struct,
// If we don't recognize the word, treat it as a field name
word => Token::FieldName(word.to_string()),
};
Ok(token)
}
Expand Down Expand Up @@ -546,6 +592,8 @@ enum Token {
List,
LargeList,
FixedSizeList,
Struct,
FieldName(String),
}

impl Display for Token {
Expand Down Expand Up @@ -573,6 +621,8 @@ impl Display for Token {
Token::Dictionary => write!(f, "Dictionary"),
Token::Integer(v) => write!(f, "Integer({v})"),
Token::DoubleQuotedString(s) => write!(f, "DoubleQuotedString({s})"),
Token::Struct => write!(f, "Struct"),
Token::FieldName(s) => write!(f, "FieldName({s})"),
}
}
}
Expand Down Expand Up @@ -680,7 +730,37 @@ mod test {
DataType::Dictionary(Box::new(DataType::Int8), Box::new(DataType::Utf8)),
),
),
// TODO support more structured types (List, LargeList, Struct, Union, Map, RunEndEncoded, etc)
DataType::Struct(Fields::from(vec![
Field::new("f1", DataType::Int64, true),
Field::new("f2", DataType::Float64, true),
Field::new(
"f3",
DataType::Timestamp(TimeUnit::Second, Some("+08:00".into())),
true,
),
Field::new(
"f4",
DataType::Dictionary(
Box::new(DataType::Int8),
Box::new(DataType::FixedSizeBinary(23)),
),
true,
),
])),
DataType::Struct(Fields::from(vec![
Field::new("Int64", DataType::Int64, true),
Field::new("Float64", DataType::Float64, true),
])),
DataType::Struct(Fields::from(vec![
Field::new("f1", DataType::Int64, true),
Field::new(
"nested_struct",
DataType::Struct(Fields::from(vec![Field::new("n1", DataType::Int64, true)])),
true,
),
])),
DataType::Struct(Fields::empty()),
// TODO support more structured types (List, LargeList, Union, Map, RunEndEncoded, etc)
]
}

Expand Down Expand Up @@ -754,11 +834,13 @@ mod test {
("Decimal256(-3, 5)", "Error converting -3 into u8 for Decimal256: out of range integral type conversion attempted"),
("Decimal128(3, 500)", "Error converting 500 into i8 for Decimal128: out of range integral type conversion attempted"),
("Decimal256(3, 500)", "Error converting 500 into i8 for Decimal256: out of range integral type conversion attempted"),

("Struct(f1, Int64)", "Error finding next type, got unexpected ','"),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

❤️

("Struct(f1 Int64,)", "Expected a word for the name of Struct, but got trailing comma"),
("Struct(f1)", "Error finding next type, got unexpected ')'"),
];

for (data_type_string, expected_message) in cases {
print!("Parsing '{data_type_string}', expecting '{expected_message}'");
println!("Parsing '{data_type_string}', expecting '{expected_message}'");
match parse_data_type(data_type_string) {
Ok(d) => panic!("Expected error while parsing '{data_type_string}', but got '{d}'"),
Err(e) => {
Expand Down
Loading