Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 10 additions & 8 deletions arrow-array/src/record_batch.rs
Original file line number Diff line number Diff line change
Expand Up @@ -445,14 +445,16 @@ impl RecordBatch {
})
.collect::<Result<Vec<_>, _>>()?;

RecordBatch::try_new_with_options(
SchemaRef::new(projected_schema),
batch_fields,
&RecordBatchOptions {
match_field_names: true,
row_count: Some(self.row_count),
},
)
unsafe {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I double checked the checks that happen as part of creating a RecordBatch and I agree they are entirely redundant when projecting an already valid RecordBatch

// check that number of fields in schema match column length
if schema.fields().len() != columns.len() {
return Err(ArrowError::InvalidArgumentError(format!(
"number of columns({}) must match number of fields({}) in schema",
columns.len(),
schema.fields().len(),
)));
}
let row_count = options
.row_count
.or_else(|| columns.first().map(|col| col.len()))
.ok_or_else(|| {
ArrowError::InvalidArgumentError(
"must either specify a row count or at least one column".to_string(),
)
})?;
for (c, f) in columns.iter().zip(&schema.fields) {
if !f.is_nullable() && c.null_count() > 0 {
return Err(ArrowError::InvalidArgumentError(format!(
"Column '{}' is declared as non-nullable but contains null values",
f.name()
)));
}
}
// check that all columns have the same row count
if columns.iter().any(|c| c.len() != row_count) {
let err = match options.row_count {
Some(_) => "all columns in a record batch must have the specified row count",
None => "all columns in a record batch must have the same length",
};
return Err(ArrowError::InvalidArgumentError(err.to_string()));
}
// function for comparing column type and field type
// return true if 2 types are not matched
let type_not_match = if options.match_field_names {
|(_, (col_type, field_type)): &(usize, (&DataType, &DataType))| col_type != field_type
} else {
|(_, (col_type, field_type)): &(usize, (&DataType, &DataType))| {
!col_type.equals_datatype(field_type)
}
};
// check that all columns match the schema
let not_match = columns
.iter()
.zip(schema.fields().iter())
.map(|(col, field)| (col.data_type(), field.data_type()))
.enumerate()
.find(type_not_match);
if let Some((i, (col_type, field_type))) = not_match {
return Err(ArrowError::InvalidArgumentError(format!(
"column types must match schema types, expected {field_type} but found {col_type} at column index {i}"
)));
}

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for verifying!

// Since we're starting from a valid RecordBatch and project
// creates a strict subset of the original, there's no need to
// redo the validation checks in `try_new_with_options`.
Ok(RecordBatch::new_unchecked(
SchemaRef::new(projected_schema),
batch_fields,
self.row_count,
))
}
}

/// Normalize a semi-structured [`RecordBatch`] into a flat table.
Expand Down
Loading