diff --git a/src/lib.rs b/src/lib.rs index 7f9c753e..ad28469f 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -69,6 +69,7 @@ mod ods; mod xls; mod xlsb; mod xlsx; +mod xlsx_iter; mod de; mod errors; diff --git a/src/xlsx.rs b/src/xlsx.rs index 9421a748..d49c854e 100644 --- a/src/xlsx.rs +++ b/src/xlsx.rs @@ -17,12 +17,13 @@ use crate::formats::{ builtin_format_by_id, detect_custom_number_format, format_excel_f64_ref, CellFormat, }; use crate::vba::VbaProject; +use crate::xlsx_iter::XlsxCellReader; use crate::{ Cell, CellErrorType, CellType, DataType, Metadata, Range, Reader, Sheet, SheetType, SheetVisible, Table, }; -type XlReader<'a> = XmlReader>>; +pub(crate) type XlReader<'a> = XmlReader>>; /// Maximum number of rows allowed in an xlsx file pub const MAX_ROWS: u32 = 1_048_576; @@ -49,7 +50,6 @@ pub enum XlsxError { ParseFloat(std::num::ParseFloatError), /// ParseInt error ParseInt(std::num::ParseIntError), - /// Unexpected end of xml XmlEof(&'static str), /// Unexpected node @@ -81,6 +81,8 @@ pub enum XlsxError { }, /// Cell error CellError(String), + /// Worksheet not found + WorksheetNotFound(String), } from_err!(std::io::Error, XlsxError, Io); @@ -94,40 +96,40 @@ from_err!(std::num::ParseIntError, XlsxError, ParseInt); impl std::fmt::Display for XlsxError { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { - XlsxError::Io(e) => write!(f, "I/O error: {}", e), - XlsxError::Zip(e) => write!(f, "Zip error: {}", e), - XlsxError::Xml(e) => write!(f, "Xml error: {}", e), - XlsxError::XmlAttr(e) => write!(f, "Xml attribute error: {}", e), - XlsxError::Vba(e) => write!(f, "Vba error: {}", e), - XlsxError::Parse(e) => write!(f, "Parse string error: {}", e), - XlsxError::ParseInt(e) => write!(f, "Parse integer error: {}", e), - XlsxError::ParseFloat(e) => write!(f, "Parse float error: {}", e), + XlsxError::Io(e) => write!(f, "I/O error: {e}"), + XlsxError::Zip(e) => write!(f, "Zip error: {e}"), + XlsxError::Xml(e) => write!(f, "Xml error: {e}"), + XlsxError::XmlAttr(e) => write!(f, "Xml attribute error: {e}"), + XlsxError::Vba(e) => write!(f, "Vba error: {e}"), + XlsxError::Parse(e) => write!(f, "Parse string error: {e}"), + XlsxError::ParseInt(e) => write!(f, "Parse integer error: {e}"), + XlsxError::ParseFloat(e) => write!(f, "Parse float error: {e}"), - XlsxError::XmlEof(e) => write!(f, "Unexpected end of xml, expecting ''", e), - XlsxError::UnexpectedNode(e) => write!(f, "Expecting '{}' node", e), - XlsxError::FileNotFound(e) => write!(f, "File not found '{}'", e), + XlsxError::XmlEof(e) => write!(f, "Unexpected end of xml, expecting ''"), + XlsxError::UnexpectedNode(e) => write!(f, "Expecting '{e}' node"), + XlsxError::FileNotFound(e) => write!(f, "File not found '{e}'"), XlsxError::RelationshipNotFound => write!(f, "Relationship not found"), XlsxError::Alphanumeric(e) => { - write!(f, "Expecting alphanumeric character, got {:X}", e) + write!(f, "Expecting alphanumeric character, got {e:X}") } XlsxError::NumericColumn(e) => write!( f, - "Numeric character is not allowed for column name, got {}", - e + "Numeric character is not allowed for column name, got {e}", ), XlsxError::DimensionCount(e) => { - write!(f, "Range dimension must be lower than 2. Got {}", e) + write!(f, "Range dimension must be lower than 2. Got {e}") } - XlsxError::CellTAttribute(e) => write!(f, "Unknown cell 't' attribute: {:?}", e), + XlsxError::CellTAttribute(e) => write!(f, "Unknown cell 't' attribute: {e:?}"), XlsxError::RangeWithoutColumnComponent => { write!(f, "Range is missing the expected column component.") } XlsxError::RangeWithoutRowComponent => { write!(f, "Range is missing the expected row component.") } - XlsxError::Unexpected(e) => write!(f, "{}", e), - XlsxError::Unrecognized { typ, val } => write!(f, "Unrecognized {}: {}", typ, val), - XlsxError::CellError(e) => write!(f, "Unsupported cell error value '{}'", e), + XlsxError::Unexpected(e) => write!(f, "{e}"), + XlsxError::Unrecognized { typ, val } => write!(f, "Unrecognized {typ}: {val}"), + XlsxError::CellError(e) => write!(f, "Unsupported cell error value '{e}'"), + XlsxError::WorksheetNotFound(n) => write!(f, "Worksheet '{n}' not found"), } } } @@ -770,21 +772,51 @@ where } impl Xlsx { - /// Get worksheet range where shared string values are only borrowed - pub fn worksheet_range_ref<'a>( + /// Get a worksheet cell reader + pub fn worksheet_cells_reader<'a>( &'a mut self, name: &str, - ) -> Option>, XlsxError>> { - let (_, path) = self.sheets.iter().find(|&&(ref n, _)| n == name)?; - let xml = xml_reader(&mut self.zip, path); + ) -> Result, XlsxError> { + let (_, path) = self + .sheets + .iter() + .find(|&&(ref n, _)| n == name) + .ok_or_else(|| XlsxError::WorksheetNotFound(name.into()))?; + let xml = xml_reader(&mut self.zip, path) + .ok_or_else(|| XlsxError::WorksheetNotFound(name.into()))??; let is_1904 = self.is_1904; let strings = &self.strings; let formats = &self.formats; - xml.map(|xml| { - worksheet(strings, formats, xml?, &mut |s, f, xml, cells| { - read_sheet_data(xml, s, f, cells, is_1904) - }) - }) + XlsxCellReader::new(xml, strings, formats, is_1904) + } + + /// Get worksheet range where shared string values are only borrowed + pub fn worksheet_range_ref<'a>( + &'a mut self, + name: &str, + ) -> Option>, XlsxError>> { + let mut cell_reader = match self.worksheet_cells_reader(name) { + Ok(reader) => reader, + Err(XlsxError::WorksheetNotFound(_)) => return None, + Err(e) => return Some(Err(e)), + }; + let len = cell_reader.dimensions().len(); + let mut cells = Vec::new(); + if len < 100_000 { + cells.reserve(len as usize); + } + loop { + match cell_reader.next_cell() { + Ok(Some(Cell { + val: DataTypeRef::Empty, + .. + })) => (), + Ok(Some(cell)) => cells.push(cell), + Ok(None) => break, + Err(e) => return Some(Err(e)), + } + } + Some(Ok(Range::from_sparse(cells))) } } @@ -924,7 +956,10 @@ fn xml_reader<'a, RS: Read + Seek>( } /// search through an Element's attributes for the named one -fn get_attribute<'a>(atts: Attributes<'a>, n: QName) -> Result, XlsxError> { +pub(crate) fn get_attribute<'a>( + atts: Attributes<'a>, + n: QName, +) -> Result, XlsxError> { for a in atts { match a { Ok(Attribute { @@ -1126,14 +1161,14 @@ fn read_sheet_data<'s>( }) } -#[derive(Debug, PartialEq)] -struct Dimensions { +#[derive(Debug, PartialEq, Default, Clone, Copy)] +pub(crate) struct Dimensions { start: (u32, u32), end: (u32, u32), } impl Dimensions { - fn len(&self) -> u64 { + pub fn len(&self) -> u64 { (self.end.0 - self.start.0 + 1) as u64 * (self.end.1 - self.start.1 + 1) as u64 } } @@ -1141,7 +1176,7 @@ impl Dimensions { /// converts a text representation (e.g. "A6:G67") of a dimension into integers /// - top left (row, column), /// - bottom right (row, column) -fn get_dimension(dimension: &[u8]) -> Result { +pub(crate) fn get_dimension(dimension: &[u8]) -> Result { let parts: Vec<_> = dimension .split(|c| *c == b':') .map(get_row_column) @@ -1179,7 +1214,7 @@ fn get_dimension(dimension: &[u8]) -> Result { /// Converts a text range name into its position (row, column) (0 based index). /// If the row or column component in the range is missing, an Error is returned. -fn get_row_column(range: &[u8]) -> Result<(u32, u32), XlsxError> { +pub(crate) fn get_row_column(range: &[u8]) -> Result<(u32, u32), XlsxError> { let (row, col) = get_row_and_optional_column(range)?; let col = col.ok_or(XlsxError::RangeWithoutColumnComponent)?; Ok((row, col)) @@ -1188,7 +1223,7 @@ fn get_row_column(range: &[u8]) -> Result<(u32, u32), XlsxError> { /// Converts a text row name into its position (0 based index). /// If the row component in the range is missing, an Error is returned. /// If the text row name also contains a column component, it is ignored. -fn get_row(range: &[u8]) -> Result { +pub(crate) fn get_row(range: &[u8]) -> Result { get_row_and_optional_column(range).map(|(row, _)| row) } @@ -1235,7 +1270,10 @@ fn get_row_and_optional_column(range: &[u8]) -> Result<(u32, Option), XlsxE } /// attempts to read either a simple or richtext string -fn read_string(xml: &mut XlReader<'_>, QName(closing): QName) -> Result, XlsxError> { +pub(crate) fn read_string( + xml: &mut XlReader<'_>, + QName(closing): QName, +) -> Result, XlsxError> { let mut buf = Vec::with_capacity(1024); let mut val_buf = Vec::with_capacity(1024); let mut rich_buffer: Option = None; diff --git a/src/xlsx_iter.rs b/src/xlsx_iter.rs new file mode 100644 index 00000000..6e31b8b7 --- /dev/null +++ b/src/xlsx_iter.rs @@ -0,0 +1,251 @@ +use quick_xml::{ + events::{attributes::Attribute, BytesStart, Event}, + name::QName, +}; + +use crate::{ + datatype::DataTypeRef, + formats::{format_excel_f64_ref, CellFormat}, + xlsx::{ + get_attribute, get_dimension, get_row, get_row_column, read_string, Dimensions, XlReader, + }, + Cell, XlsxError, +}; + +/// An xlsx Cell Iterator +pub struct XlsxCellReader<'a> { + xml: XlReader<'a>, + strings: &'a [String], + formats: &'a [CellFormat], + is_1904: bool, + dimensions: Dimensions, + row_index: u32, + col_index: u32, + buf: Vec, + cell_buf: Vec, +} + +impl<'a> XlsxCellReader<'a> { + pub fn new( + mut xml: XlReader<'a>, + strings: &'a [String], + formats: &'a [CellFormat], + is_1904: bool, + ) -> Result { + let mut buf = Vec::with_capacity(1024); + let mut dimensions = Dimensions::default(); + 'xml: loop { + buf.clear(); + match xml.read_event_into(&mut buf).map_err(XlsxError::Xml)? { + Event::Start(ref e) => match e.local_name().as_ref() { + b"dimension" => { + for a in e.attributes() { + if let Attribute { + key: QName(b"ref"), + value: rdim, + } = a.map_err(XlsxError::XmlAttr)? + { + dimensions = get_dimension(&rdim)?; + continue 'xml; + } + } + return Err(XlsxError::UnexpectedNode("dimension")); + } + b"sheetData" => break, + _ => (), + }, + Event::Eof => return Err(XlsxError::XmlEof("sheetData")), + _ => (), + } + } + Ok(Self { + xml, + strings, + formats, + is_1904, + dimensions, + row_index: 0, + col_index: 0, + buf: Vec::with_capacity(1024), + cell_buf: Vec::with_capacity(1024), + }) + } + + pub(crate) fn dimensions(&self) -> Dimensions { + self.dimensions + } + + pub fn next_cell(&mut self) -> Result>>, XlsxError> { + loop { + self.buf.clear(); + match self.xml.read_event_into(&mut self.buf) { + Ok(Event::Start(ref row_element)) + if row_element.local_name().as_ref() == b"row" => + { + let attribute = get_attribute(row_element.attributes(), QName(b"r"))?; + if let Some(range) = attribute { + let row = get_row(range)?; + self.row_index = row; + } + } + Ok(Event::End(ref row_element)) if row_element.local_name().as_ref() == b"row" => { + self.row_index += 1; + self.col_index = 0; + } + Ok(Event::Start(ref c_element)) if c_element.local_name().as_ref() == b"c" => { + let attribute = get_attribute(c_element.attributes(), QName(b"r"))?; + let pos = if let Some(range) = attribute { + let (row, col) = get_row_column(range)?; + self.col_index = col; + (row, col) + } else { + (self.row_index, self.col_index) + }; + let mut value = DataTypeRef::Empty; + loop { + self.cell_buf.clear(); + match self.xml.read_event_into(&mut self.cell_buf) { + Ok(Event::Start(ref e)) => { + value = read_value( + self.strings, + self.formats, + self.is_1904, + &mut self.xml, + e, + c_element, + )? + } + Ok(Event::End(ref e)) if e.local_name().as_ref() == b"c" => break, + Ok(Event::Eof) => return Err(XlsxError::XmlEof("c")), + Err(e) => return Err(XlsxError::Xml(e)), + _ => (), + } + } + self.col_index += 1; + return Ok(Some(Cell::new(pos, value))); + } + Ok(Event::End(ref e)) if e.local_name().as_ref() == b"sheetData" => { + return Ok(None); + } + Ok(Event::Eof) => return Err(XlsxError::XmlEof("sheetData")), + Err(e) => return Err(XlsxError::Xml(e)), + _ => (), + } + } + } +} + +fn read_value<'s>( + strings: &'s [String], + formats: &[CellFormat], + is_1904: bool, + xml: &mut XlReader<'_>, + e: &BytesStart<'_>, + c_element: &BytesStart<'_>, +) -> Result, XlsxError> { + Ok(match e.local_name().as_ref() { + b"is" => { + // inlineStr + read_string(xml, e.name())?.map_or(DataTypeRef::Empty, DataTypeRef::String) + } + b"v" => { + // value + let mut v = String::new(); + let mut v_buf = Vec::new(); + loop { + v_buf.clear(); + match xml.read_event_into(&mut v_buf)? { + Event::Text(t) => v.push_str(&t.unescape()?), + Event::End(end) if end.name() == e.name() => break, + Event::Eof => return Err(XlsxError::XmlEof("v")), + _ => (), + } + } + read_v(v, strings, formats, c_element, is_1904)? + } + b"f" => { + xml.read_to_end_into(e.name(), &mut Vec::new())?; + DataTypeRef::Empty + } + _n => return Err(XlsxError::UnexpectedNode("v, f, or is")), + }) +} + +/// read the contents of a cell +fn read_v<'s>( + v: String, + strings: &'s [String], + formats: &[CellFormat], + c_element: &BytesStart<'_>, + is_1904: bool, +) -> Result, XlsxError> { + let cell_format = match get_attribute(c_element.attributes(), QName(b"s")) { + Ok(Some(style)) => { + let id: usize = std::str::from_utf8(style).unwrap_or("0").parse()?; + formats.get(id) + } + _ => Some(&CellFormat::Other), + }; + match get_attribute(c_element.attributes(), QName(b"t"))? { + Some(b"s") => { + // shared string + let idx: usize = v.parse()?; + Ok(DataTypeRef::SharedString(&strings[idx])) + } + Some(b"b") => { + // boolean + Ok(DataTypeRef::Bool(v != "0")) + } + Some(b"e") => { + // error + Ok(DataTypeRef::Error(v.parse()?)) + } + Some(b"d") => { + // date + Ok(DataTypeRef::DateTimeIso(v)) + } + Some(b"str") => { + // see http://officeopenxml.com/SScontentOverview.php + // str - refers to formula cells + // * indicates calculated value (this case) + // * to the formula string (ignored case + // TODO: Fully support a DataType::Formula representing both Formula string & + // last calculated value? + // + // NB: the result of a formula may not be a numeric value (=A3&" "&A4). + // We do try an initial parse as Float for utility, but fall back to a string + // representation if that fails + v.parse() + .map(DataTypeRef::Float) + .or(Ok(DataTypeRef::String(v))) + } + Some(b"n") => { + // n - number + if v.is_empty() { + Ok(DataTypeRef::Empty) + } else { + v.parse() + .map(|n| format_excel_f64_ref(n, cell_format, is_1904)) + .map_err(XlsxError::ParseFloat) + } + } + None => { + // If type is not known, we try to parse as Float for utility, but fall back to + // String if this fails. + v.parse() + .map(|n| format_excel_f64_ref(n, cell_format, is_1904)) + .or(Ok(DataTypeRef::String(v))) + } + Some(b"is") => { + // this case should be handled in outer loop over cell elements, in which + // case read_inline_str is called instead. Case included here for completeness. + Err(XlsxError::Unexpected( + "called read_value on a cell of type inlineStr", + )) + } + Some(t) => { + let t = std::str::from_utf8(t).unwrap_or("").to_string(); + Err(XlsxError::CellTAttribute(t)) + } + } +}