From 4b5bf6d619c48e5e9e6eea0a05a857b98ed429bf Mon Sep 17 00:00:00 2001 From: Eric Jolibois Date: Sat, 20 Jul 2024 17:48:26 +0200 Subject: [PATCH 01/14] feat: add option to keep first empty rows for xlsx --- Cargo.toml | 1 + src/lib.rs | 2 +- src/xlsx/mod.rs | 46 +++++++++++++++++++++++++++++++ tests/keep-empty-rows.xlsx | Bin 0 -> 9753 bytes tests/temperature-in-middle.xlsx | Bin 0 -> 5837 bytes tests/test.rs | 39 +++++++++++++++++++++++++- 6 files changed, 86 insertions(+), 2 deletions(-) create mode 100644 tests/keep-empty-rows.xlsx create mode 100644 tests/temperature-in-middle.xlsx diff --git a/Cargo.toml b/Cargo.toml index 675ed428..7662e2e2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -28,6 +28,7 @@ chrono = { version = "0.4", features = [ [dev-dependencies] glob = "0.3" env_logger = "0.11" +rstest = { version = "0.21.0", default-features = false } serde_derive = "1.0" sha256 = "1.3" diff --git a/src/lib.rs b/src/lib.rs index c2896edf..bb9ece36 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -89,7 +89,7 @@ pub use crate::errors::Error; pub use crate::ods::{Ods, OdsError}; pub use crate::xls::{Xls, XlsError, XlsOptions}; pub use crate::xlsb::{Xlsb, XlsbError}; -pub use crate::xlsx::{Xlsx, XlsxError}; +pub use crate::xlsx::{Xlsx, XlsxError, XlsxOptions}; use crate::vba::VbaProject; diff --git a/src/xlsx/mod.rs b/src/xlsx/mod.rs index 452dcaa0..a38e8ff4 100644 --- a/src/xlsx/mod.rs +++ b/src/xlsx/mod.rs @@ -197,6 +197,29 @@ pub struct Xlsx { pictures: Option)>>, /// Merged Regions: Name, Sheet, Merged Dimensions merged_regions: Option>, + /// Reader options + options: XlsxOptions, +} + +/// Xlsx reader options +#[derive(Debug, Default)] +pub struct XlsxOptions { + /// By default, calamine skips empty rows until a nonempty row is found + pub keep_first_empty_rows: bool, +} + +impl XlsxOptions { + /// Create a new XlsxOptions + pub fn new() -> Self { + Self::default() + } + + /// Avoid skipping first empty rows + pub fn with_keep_first_empty_rows(self, keep_first_empty_rows: bool) -> Self { + Self { + keep_first_empty_rows, + } + } } impl Xlsx { @@ -850,6 +873,14 @@ impl Xlsx { } } +impl Xlsx { + /// Set reader options + pub fn with_options(mut self, options: XlsxOptions) -> Self { + self.options = options; + self + } +} + impl Reader for Xlsx { type Error = XlsxError; @@ -867,6 +898,7 @@ impl Reader for Xlsx { #[cfg(feature = "picture")] pictures: None, merged_regions: None, + options: XlsxOptions::default(), }; xlsx.read_shared_strings()?; xlsx.read_styles()?; @@ -947,6 +979,7 @@ impl Reader for Xlsx { impl ReaderRef for Xlsx { fn worksheet_range_ref<'a>(&'a mut self, name: &str) -> Result>, XlsxError> { + let keep_first_empty_rows = self.options.keep_first_empty_rows; let mut cell_reader = match self.worksheet_cells_reader(name) { Ok(reader) => reader, Err(XlsxError::NotAWorksheet(typ)) => { @@ -971,6 +1004,19 @@ impl ReaderRef for Xlsx { Err(e) => return Err(e), } } + + // If the first cell doesn't start at row 0, we add an empty cell + // at row 0 but still at the same column as the first cell + if keep_first_empty_rows && cells.first().map_or(false, |c| c.pos.0 != 0) { + cells.insert( + 0, + Cell { + pos: (0, cells.first().expect("cells should not be empty").pos.1), + val: DataRef::Empty, + }, + ); + } + Ok(Range::from_sparse(cells)) } } diff --git a/tests/keep-empty-rows.xlsx b/tests/keep-empty-rows.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..c2c48a7340d11bbed2c7c89ca1228bd19c33359b GIT binary patch literal 9753 zcmeHN^;?u%*CwUAr9q^-m6jSh97?)-fEk9697zRfkd{T}afO&Y_HnZToA7wMI5Xs;p|HJ1LP7lh-|;`Z10{)rt{r@Y3g4412;ypi zmYcan94IfOlDT!ZFzUXA7@V<;$gCXfP`)kVKxdrkQjq|U40nm<_0OeQq7!}a5~kKe z)eA~6br3C|?cFVCvO|w|Q=vR~ZA&d9LS<$aZJUl{knb8U>=fC7y21A3Jqjm%TR~=z z5+~bYk2rxGPZ!PEp1E6Zffw3Km2IIJ?-R6WgS#AA(_2R$0cg)kycfu!C25g_QOU4) z-4>5+f`SF2qRCXJ_fyEAc)XscSc~x6Ad}@Mchx7cpTCPZ;c%{OpAHK=uRA*l(55Bd za^FfU2WrS(Xpu`FTO*uKM=P#w&M!Nfs1)AQ+?O#_O6458Nh#vY$F3Tm0 zqsY%uwyDNQzMx~SWrF#>c@^*Oj@tTx;9Pmb_O5?h;%m$|nilkNWsSH1OeTjjEWonG zC1c@9rX+OpYpG1;K(gvR&`1x+#`lxN?NHA$0$aDYs7P9WAt;Gyiwz2bpvwsU;UEZV z=3(Ou=H>bE{Fj*j!)*L%=_T>%s;K;gVF$2V#-Zki^Bhd$&b$hn>ekoD{HfRGqN%a@ z4cFflWg$5=biNtoURge=r?!=O-g$&ip~T4wSg;J+iAKE+)$6)C459=N#pqq-j$kI| zqMhPX!Dx>4oS9Rp=OwGTAv`R%L6n|KdfhO|dSeGMgCq}1qZ1W^A{OJMb=t7yyIR9S z-Ed{hhhWrPsk5f@CJ|Bfa)&`*P=5h4XKbN7Jl60mc2;itSU6WNr{G@u`+$M1h=EHc zD^xmGqwDO=1E(<^9YvPVG}O#5=euI))lK;dV3NRvR7JlHz9s>JHwLb}{kh-yDQr!Roc3ixA!2Qdo6uEkhc+?fPx~qxo-l zvSB{-W5-lj00}%Fq_NFOM$WT#g>RT&o33otec(>*c^A0rG{vfPMZ%JPVGxG!J|+4< z_IU7mZfYa&U3i5>gkUDk4B&0Rfqx!BPSo zbIlVY^*3|4MLyDqZYPAPC=tp$c}EL$C|vX}GIo0oY%{boWJ&_)lsr zr9BDJ%7QT_tD3%lOzPih^Z`Atq!4@4i_FZE*+@47Wm@Zep}a&+h%?B0O<2Di^{s=D zg7^6(Hh|tKwz)m4BJS{XCEUe6!=2UO!CcWwC~2lakzkPGlX0bsifF!@SlF;TzQ!Xa zB}iEv5qxKiuN{%nSD)LbSZTjo!aJfffP7S zymfd}8PBPw_ysM@a#%{OFCgipDGcb`NSVHA3EH4`NOT5V8gaMvF-xFb+)N%p?ol$A zktJ{go$ni=OD$|&07vj9H{#pdQ38YY>>Y^4d-ubN1~zSIQ#YSrtZK(( zzN4?BoXUOu0r))k(Loa+hM*(Rjg`CF8ZE81spdBDBf*Ka*n>kw0P2ldp}tn=lM58# z-~rUql|Iw${6+#7#z$9&A1E&^2^qNk+*nTVu&HBLSx68$vscMP-qG5E&`aAcnX z_XbLHm=k1NVv@_YY>nHYGsjz;!dfFHYQ&d*f_7K`q?^ zyevQM?wqb(bj=hyAGu{`-=#5Qb@21`lgRE3^D$*KUD zpgBsKrx+oNV|dQ@oosbZmK!A_*!GV6W4yefU=8Lgt+-dSD~_`agkJSAevrD`8()7V zLcE1=enIi3*iSvJTqO}5O*ZagZ#C$9lts0$D~Cvq7HB*GvKt036$|BZQi7Ey?3dDP zU34O%M%M?n-J-R`vp7D~?v~1frfPjkA}ySveVqdK+NE!gzJXHgvWn#Bh1z67vj$D{ zU{fZ;SF>h_CH;NDhnNN@1&9=;a|dxe`(wcnA7`82R+)g)Y|NMO}krcIT-oYH0qumyyxap18Pe1pk^!}Qe$HRJhBMZ8Bf>1uD0s&C>5YF z38mx*Za*(p4{MboYsY2Yd0X^$^tp}q2vTw1nPS!o#`LE5vE3reoJ=cIARE>i_CAUzYtxJh zSXiH5%so6-$;aY%?rruH({w7$P$+OsLgf#B3u<3A+&U;a%x(fl4Cr2>?!86Qi~?jO z8`(djQO3x8(WIo^9~SY1<803Dpyb_}rF7jTxOpyItUu?w)N^mT@a_&qo~}sXE>;k; zrmR$N6o>gU{BIL2!6(@FU4phg-w-wPL+HEvH<7kTzHAw#W8c4Ah{@yBL@%Z#6!xn+ zEuO%;D*FI_9gwA$;aZ*Bks=V3fy?&BT2A6(>sHUq+)G%elq2iiH`KElg;~e&P$i=Ibs_RH#Q_>OpHqNrc^dsvtXKC+(+;=1G55AjQG?^=uu29|h z)L!wKEt0wO;iOcVdkn{$XL0Gv3e=mF>2*s)brLr$hd9nw>EK(mUOrl;{cUndntpp2Va8((=NDbrYR?%=`VE*^#mG zV~3x-zxH>y_oL}6mA=G|Ze}2MzSTh!=a7X5-Z=;TBY|f)F!10AJa;3c2Fl;@{3FML z?QLuzVBX(8_A8W|pLyT-jX$Spd*ILjQ%r~@-VAwXQ_H2> zidQNyjXrv@p1h6Z9;EDb>34tlD-v$|{riRC_^m=cj{8SbuBkiRhrY)7WG_11282>* zn+ZKGF*}8#Nwugdrbo|lx?rnx*NuAL_EjcUi88)GYY$K@ATbfqkH@o>*jS9#%IDz* zE!s2R*;VQpSYPNzd3hvJ^&TWvlYSJNIKzsc?LPLu)ZC49cV}MweoJ^?B{lRJjmRM$ zS>TXP;-bM3HP<$7#AgNBxs~F@9mX*z=sjlN;FD(X%*UIw!u_Mv^jTWn%_@NPy z&mqYDp(z+_vhQHYREk$4Y%mt=_vsy%7ezI6B-c~%cwSm}PoI2jQ2+X)-N09znid7zdNQGOQMe#GtYu6*vM^)4=HzUuN?x@%D+O-viLx*N(#PUh>H+ zx6B(RI;8&**nUGpB|BVMo!cG21*ooKAK=wzs{{(@>9FxA4{_E21*!$Kb>8v`0n}<# z-a!D|8XCi#6+nSeL7mDfz#VqaJ4NWvNm0*jpoJnAHc*j^ei>XLEY}Q{qaJaTsg7vD zFlvmN0W3o}5)e*h2qy%>(GB64jU;$@%5Kcc{0njS?O&p)z+foDcyKL?kW+KsdYMon zsZg{GDJBiX?EajivYO_pI+x1G5T{-rK>hbQ=wIpvBHRitv3zCr=2s{Zn=w{r}I2A^PCRN6(6K?z4d z36E*(`8|uxL`;Cs99GdZ2Qt%Eh^E7X12lr(JRMFMofXJ)Dop^Z0TI9jriRQ}POn0N8M<{e#eSz)`0A|4Y7MT&z-yMn;WT>R(I+U8BpikSPuW^@WIz%}aWoIBK)LKLKzLlY6^3lG5&2xCxJ+mzo zOD{XLtCBf+wPNwo)h2X@yo-3^y|4m?x-r{~A}kbi^Yw59I!XFYq-;+Nv>-AY{P$K+ z-WYed73fg`w-5Di!@eX$@+>jW#D5fl0X>|Q=fl?IM?<&KYGs|wWlg05Z&qhDd9$uB zyD7|?yubNN)oQQz?R|#04qksNFTcHv85s)fJTJOeJW(;+`r4Vq7k8zsG=)5u;Y#>wHmgOL4f zV=k8k)F@4V@OSrZH19fNq&=PNfdDlc!2`hz4hdFvZRNmTP0XW`eO(pWR=V-XPw^w9 z7m2xag{CP{TNMe)p`%jj3_P6K!Gw;zqTw-GF@noQ7;BRX9W`x3R#&h1dZTL(bym9G zctnxdKD4h#rpr{I`6Vmp)JV1!756j2CF2to9vwE{)d`;< z@!B}lfbkgj%ELLeiJ6`NCQSwf{)Ukxf`Gn7+T3hLe6dBUeWgUPl>kX12{exn>Kw%K zfmNhg21`r}i!q!con5^GCUIiBb7Yh0w9V@afIU<>N~um=xzSEd3G$P(jw2t7lRZiU zo9r{<9T6xPDE&sB*xE6}<+;S`alkCIz{bN}apO$pEHVueE~rBbj<%N$g@!n9(vnpI z3AJ~MTqNfCs^@?t)`OBYy8Urq=^prOL3V+(nSMv)vcfmY-N9Z^1FvjYQw1*JWvIVk z&rs@XnDw!sUm4Rh#v!R!$$-<~jY6m1@fEw|gqN4gDN0_ufu*!+oY)U}?!w``B^V3UT z-VbWpEiyK`5I?nB)_Sc=J4LF8I-H95`CCW;@5a=Z=A+f`)RsLa?izO;@`EJA>+0JS z?_ibP$N~LGKciPi78O>Ar_^uP%j59mJhcu}-$}}taPid|T#MDL!nglW)R>w2kVUWC zn`JKQQLl>4ref9Rw##I?)YbyTa{7+_tSEm++K_PWK7VdKT}Z@L7Llz@ilxH5m4a%q zS-2x@cQj{Pwff^vr_}ALPgx^6N~&<9#o4CbJkT&q@S!!K%}|X#;jt5Vi28cL)J}$I ztER?tN+~d|Kub2srxvIA2)Gs>H)Ry(^oZ29uCUe$7V9W@>D~w9@ZHPU>tM zb-@K9y6;IdJzJl~+4vT&OKoW@m| z)#_OY2NPi)fOT|m*&0<9FPw6|NSGmn;A5T0=s>tF4ca1}K>0D^P#<9|F)Lncb6A=Mo-~P!>dCO1BL9g z00IBN@Fp9ed&C1hn@N|Ya+`L~J>hBDTwl$|>5K*nF+#41=lwJj)!p+#CE^e1J=Gh# zU~OOJt-_Z@>_VnVKu^CuTR1P7!su$~5sCzr&FrOv0a23p; zf~jM7oa0zI%86D;jIighq>AMi-%s1R7$Qc*9m0!U4NaaGB`V0@zQMA5jZM2fjtrRXwQR{Jzs zr*VlksYA9L==hzwdHnSl2w(A8z@%eL)}77Cpib9jdXcbFoxHh&lnzA&-hwF};ZCDc zYLUiLiy8m46roR{Q7Rs-LB-M0qBn??`}bs=<9waijL6dmzxTe7twB~=9w0X`uazel z0&@AI_w}zph^SMl6X%~Jj+z-H7cz%6rUgZqrqP zx+kZiO#VF2v#|4;Ui?HC1og0DDIlVFaOv=LUvj_ciX{O5@y1=l98fW&-l*VSH^hX? zT|O6`XBB1dpjO7D%g_p;C@pIFWvPSzP|jl1E_ zQDGfFX-EU$IbJk6Zb08!maaFP53F!~)wqIb<1Lx80;;*qkLuNF33}p=A)1&tik5(; zq0AS>5DJ*rTfxCBlp;?^Pc9rA3~nE z3<&lq%ksZqY{yb%FJsQSp#4mw8&Ff}a1$J@zA7tr;0~H#^H!gb`}E*5DQm8ln0|c+ zhn;|~d{>C3VEK|>Z~efy{l+kOtnNjXNoL4ZsZB?*?nvN2mY^M`hM2Jk)KeqmPC|q_ z{$JEvxVilcdBm3eF*4%SLGygLVF#!e^tg-E;x5{_!6lZ;`X}!#k>jCu8iiIlmG>e_ z;&!JsYsMyLp>xsnyrxpQZ};&tIepcvpOtrcXg^13#`rzKsv( zQQM|d3Ge5AEwz`$S+~>G__m+&QWp7b%}ZOlHcOD<<&&Bp^JQ{YR;3tOkcz$S{*cLrwv(AgYd~0&GZvG6Xjf&>6LG9RQ;1zSe;I$s{Re0s2 zizR1aq|RE-EFidC~clwgL0z74@6V^waVa8m|>P%H?Sv)5r7V z#2?QwGhdqi9Ji%`gr;ks%@HbZe}OUB3)*5i;%zm9=mnp zW#n}cD7Hrk3PeAn*b3xf^Dm7517aj3zj!m%PCnA^W%SlZ{Z%GuP_Rff?b}@i3>aB= zZR~C*H0Q}7$gyE94WW2a}r;EO}$1n zm~UPXADhceSv|nROKhrJCM4kid{%=_ITDwj)1)!BglP9WpV4vH+QJz#>wpcGA0{dV@;<>+ZZhz_yp;zO-n_ieItWD9m&U)r@e6~=!O;OoQhLOku(=7`&EU5bMz?)!S z&NTqWdipJlMG*QK-pnntKv&fcU9ZnVyZuO8GoNes?9iBJ4&<(5n%8*g7hTDR$t)dl z%?Iy~SLDab{SLh{3{0=$ef%imHWDoT>e4zj;Z=MP4>`#yR;*|J$gScxgbMpR5|L53 z5gPPA-=X<)>wo|D4>xMGRR8MWuRZ>s9poT_^iLiCUk(4->HOJn62Y4Pzwi00pI_Tw ze|o~e`dc&XSL0uW`9F<|aDOxYn@sg88i=1(smgi3>$z^@|BuMYlNO8#k%gmiTm3F*Hn_RqrdR|kL1%0D~!O8b+8 pKXUZ1)_;wLKU>!^{AB%~*r=t7j)2FH`y@C>?T7#>$Nb~h{{e@EbB6!` literal 0 HcmV?d00001 diff --git a/tests/temperature-in-middle.xlsx b/tests/temperature-in-middle.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..9db479f0b8b243503ea4543c927c83e8ac03055f GIT binary patch literal 5837 zcmaJ_1z418x1FJ-yGsxOX$d6-iII+>LqbXzVi>vwq`Ra`rMp8~a1d#cln&{ZhCAnY z?tg{zd%tK-9udnxLs;35w)ym)#V8~%^VNHi+&*}K1v(v-+mXUMk8jp^iXCX`NdBt+fD>8Tt zYqa!JK>r>Rrtsxq0(3^RCED1Ew`&BoPF`3SaoJO-U>g0mr*i_j=`%!E5jeoE$Bby~ zJf%c$1j0V)&jotK%LrF~s_f4UDU9aVC!M@{9^BZk=SlB@OTHXS*6%lz-BfC-)@bf{ zkSWvqvF?jVJJsIFQ$Fv8wgvRbQFN6XaY4EJjieEwoeBs55Fp}WYWG6L!OkAW^}^1< zjMLQy^53Fk6RBt&AdWA!bA_4QRBKC*qH~l8CPk7pDwCWVGET?K6{hgf#%UB*H+u~RAh2zG;krmSQlO^A(53`*JYBY3DYlnKT zP%q}db@PnxT^{-tM>=__mGfo+!GUxvttM4_t@(E^Pv2~hu$d|!by3|^lb>Jxbs3?= z5%DSU5Ncc@TrPGF)+Tm#)_)Qcq1hZRdM=e+xgYD5ba($d$JgFX&sR-Eb|6ny@h5L zva~IUsHf$1XMtbRpdEg-mUMhT6hVV$V@)Lzu+p17a8beHEMijjFkP-2_l~XY^`4xh z)cZJzFQu(If@Y#{cex{DsKP$2L(Zmq?eid2fY)Kw97hba+oGr{J18^5#bpcC#^Pa! znJQ_US>JWiK;4_q9~@ZboI0qCs+@LP$xJGUY8Tgt&*H8hKg{OcZSiRE@N;U`ZeB3biHzPMVoknd z+Qyq_QPSf^%|XIr9Gxz(GE@nlUNikZ@@$t4DD;2LX8dfk`4IPBn{yevv94= zjy=)$Pr2lbJZs67Q#;GcOJuC2sfq#q}<{o1x0@VV@O20{}v`005$2p>nh|voYiP z`^fXBLA9knWjpm4-)puE|8VAWCDSL|6yk>BA=Ju}7}JV@YAL-!RyldK2Cqxpf+i;B zmm}2^U5|Z5??suz=jWZWJWpKTaBl=+QsiC0ooAyUVf*wrFIogEuN$9#NS&aI@;MjB z)sP~x0;cpzI+%jw%z`2rk>U6Ey;4+h%{J8UNlBVpREAIsrG7Bq`;%eE!?$ z6E%pYg@X2@0GbKLgNpP;JS7~X!YI8YigJOoQU*z3CmqtBAl9>--It+%(k`}%?QF9YepmJ!}JQLyAU)^k1#Lw4O=wB_m~u*kru7%kJ1^#f7> zcRJSLQexx4j}#Q&bEyd&4XAkW0AJoDuHd_+JL2?osE>sO;*zc}F!==1YdylNVw+oZ zKZP3^P}0mG%L$2H);lMg7RaDRPU9Z|)U~(sZjytdhxf zkPC#TpdZoGj8`Rdp!71R;Wp`5e8T;F2!}Jl7PR*As`kR6Qkj zTEScwvF2AQI@9T$ha8@z1}9cAd!&g{Hb+)n0u9fjnkLuqRP$=?@)yid@ECZZPOX-I z@6(B4_y5oryC-ueG>zw;E`Q~xrW#2aoA)!*Fs$@8 z@oX(^3gqw;$_r64m@Qa&1G5EW*xw*!C&{iK`J-oD!vl{V4M=?mQ5V&%jMIM~Rnr=00ZBn~2j2+0jE1r|tScMceFgiDK2$QG8&8kavY78C;^=wz`afm588RQhzDFWG> ztvx`uGME*7?RlgUc6O-AES8r>sT!jpO|{#`RBx@}NE}1_Gq20aX8g-MfxUrr)`C`u ztGv@hLJU1R^^X+qFGZkQH$vvMu?5*uJ5TKk@hAK(!?Plz#da|FYx2$MNk^ma(CCfL z%ATgxCEnO#DFrlKczSGqPd%7nCvKi0Z{L)OQ{H@r`;mv4P~)>tK%xE_N}48O)(}+( z({O5uNNlL-w7sd{ud8AEkCKt}puooq{M36fjPRJdktaByTsXlJ93Pa61Hz5|`x!MK zb@E+MG@~SSGCy@PA?-w4!TyuS z8Yb01R=**%B@7v=Xhw1y@kpa;px})M^dRm@Uuw7Bp+F&c)^eb>cXjq_ITPvE zT?_9#M=VSi*^o~@16m&XmSN8^ynb^6C;0FYgwVQ6NGIs`zqfNtVh$)=2<5IJQPl17 z7dx#VxpubGudXQzujq#M?5=Os>QQU%ABTuKZGHy;ApgIP24-ny<_P2Z`}D^ul#r+h zOXbE7+&Lw|4ocJYuZT!TiIaWBm!2EKtTTOYI`|EivD8{f-dSDK`@Llaxa`TDjXJ*40*3d zn40SisBQ+SMvXgpiR&(bAt;xd_t%M9k6mm&%JKfF4^o%~1&s5h;L&q-({pk!EMN>t z1aHMl3$G|p*W)t?M35o}Z|7hCFp)V9h6a*Pv6j;6nTTwXH6Xk8s=CUD2mSh+Wk9a>xcB9UTF@nkoae7hfTRb+3f%=(%H5^w*OF#~ z$1Bw+1r?|I85(Kdy1vgG&`9Y;;$7B z>Y$f0L1kS514C>sRy)xTpoy>zQqMc{`(2BA(7TSOv#XtB--XdC#P4U8C!9CNAeAfo ze=10bI=692OfTRg2=lwrm`lNQV2e^KmVc)yv~rq5e1x0xUZ~I4nm3o4ql`y6bOqa) z9=(kxUveZ(y_)v@&{Bjs(I%0fCZzhT1CNXf6H*Z*-ZGd7+IX0SJa8>j@<#Cpc(X@+ z7VSj>LZk>9EC2xKACZo35VJq38`adaOXVi^-YF};XrUaowPwy*e_%Ex&tFhdt%Z*J zF_sJdm|&tTlMDYD@mKYscPOkC1Ixmz!ssx1H%nBI^&k%TVb`8h00CT|L_w3)+aukl zu@i4XV>k&X=na#v1Sx$gF;_%EE5Y!RJbrb9CjT>Q)&$b8+NxUb3it|{75p7)V!O@17piD`ld#rGJuPR}5D)e7MNt&Iuk%#MH8EZeSJf7(Vk&z; zR>E2bWn9f*E9En(^d*H%3+KhDi|@L9Ee@kB_aHFe0iE3O#0gv`7VX5$mHeKja9`+E zxS@V8`RY_+pFM&>F#^c3sww{c$_YHxBG1&sJ+nizBR8TbaxQ{&(zRm1VlWOC=%-fK z>B#0I&C;oHEK=o%%Djj#PxTv;ll^P}r{$4h@ZkeP60a<`YF1~!P;Udj7t_senH|FG zoHo{I7imTC#zt)*O%8JxU8v{aZISyn4cEbNZ~2WBy9_N8p!+smnuM?xa8~wbBzEnN z*fRYS2TfVvi#EFw?4@qyV0dg+a~Ku%cKL*THoKv*-efBjn4PA|WXP2t8?_fc5M%LJ zLn&C<*7oB1+3bu@u#=~Eu^nfiljHomfiur&+v$kzlYabJs~8Vx-pK^d3VDs0;bbVZ z(=;0HnlhOrEsU*Z$M0y|+UHz}ZHP%a*zB7jq^gD0D(xJ%P#J!)3^-4;r>Rvi&VgUz z>ZRE{7$Ej%4Ifd~$*ouQISt%C3OH;t|X4dc7BSPLk1s=6rXKJ(s%nr#Nxyu!2R(hBskP~XfoyaB_8MH;ml*Dl_A~?NaiOurH}z ztansuB{ZobM5!qlrss)cW6bXaf$7)l_%Z@d$2@Hk3wrEQ`8MXu2ZvGHi8L zd{TdEaz4vlXov-+8~T`b^f@TLWoGvsco+Pljq}jNfhO z7xX)N5rMwt4BZCbv`Bsj-$D5s{0GD2HqTA};CCKA)W3Ody9u`mZuWn_6VxEm{XZ)H z)eYWexmgE)XF0(Ao8@0s@ixQFBJ(@LINmRY|CF5D(3_dzcj$9Mz< = wb(fixture_path); + assert_eq!( + excel.sheets_metadata(), + &[Sheet { + name: "Sheet1".to_string(), + typ: SheetType::WorkSheet, + visible: SheetVisible::Visible + },] + ); + + // By default empty cells are skipped so the first row is skipped + let range = excel + .with_options(XlsxOptions::new().with_keep_first_empty_rows(keep_first_empty_rows)) + .worksheet_range("Sheet1") + .unwrap(); + assert_eq!(range.start(), Some(expected_start)); + assert_eq!(range.end(), Some(expected_end)); + assert_eq!(range.rows().next().unwrap(), expected_first_row,); + assert_eq!(range.cells().count(), expected_total_cells); +} From b27b0aa60811ff60b9cdec8a1bde67d30ef9a496 Mon Sep 17 00:00:00 2001 From: Eric Jolibois Date: Tue, 17 Sep 2024 19:00:21 +0200 Subject: [PATCH 02/14] refactor: switch to `header_row` --- src/xlsx/mod.rs | 21 +++++++++--------- .../{keep-empty-rows.xlsx => header-row.xlsx} | Bin tests/test.rs | 18 +++++++-------- 3 files changed, 19 insertions(+), 20 deletions(-) rename tests/{keep-empty-rows.xlsx => header-row.xlsx} (100%) diff --git a/src/xlsx/mod.rs b/src/xlsx/mod.rs index e28d59d3..8371d36e 100644 --- a/src/xlsx/mod.rs +++ b/src/xlsx/mod.rs @@ -204,8 +204,9 @@ pub struct Xlsx { /// Xlsx reader options #[derive(Debug, Default)] pub struct XlsxOptions { - /// By default, calamine skips empty rows until a nonempty row is found - pub keep_first_empty_rows: bool, + /// Index of the header row + /// If not set, the first non-empty row is considered the header row + pub header_row: Option, } impl XlsxOptions { @@ -214,11 +215,9 @@ impl XlsxOptions { Self::default() } - /// Avoid skipping first empty rows - pub fn with_keep_first_empty_rows(self, keep_first_empty_rows: bool) -> Self { - Self { - keep_first_empty_rows, - } + /// Set the header row index + pub fn with_header_row(self, header_row: Option) -> Self { + Self { header_row } } } @@ -976,7 +975,7 @@ impl Reader for Xlsx { impl ReaderRef for Xlsx { fn worksheet_range_ref<'a>(&'a mut self, name: &str) -> Result>, XlsxError> { - let keep_first_empty_rows = self.options.keep_first_empty_rows; + let header_row = self.options.header_row; let mut cell_reader = match self.worksheet_cells_reader(name) { Ok(reader) => reader, Err(XlsxError::NotAWorksheet(typ)) => { @@ -1004,7 +1003,7 @@ impl ReaderRef for Xlsx { // If the first cell doesn't start at row 0, we add an empty cell // at row 0 but still at the same column as the first cell - if keep_first_empty_rows && cells.first().map_or(false, |c| c.pos.0 != 0) { + if header_row == Some(0) && cells.first().map_or(false, |c| c.pos.0 != 0) { cells.insert( 0, Cell { @@ -1315,7 +1314,7 @@ fn replace_cell_names(s: &str, offset: (i64, i64)) -> Result } } -/// Convert the integer to Excelsheet column title. +/// Convert the integer to Excelsheet column title. /// If the column number not in 1~16384, an Error is returned. pub(crate) fn column_number_to_name(num: u32) -> Result, XlsxError> { if num >= MAX_COLUMNS { @@ -1332,7 +1331,7 @@ pub(crate) fn column_number_to_name(num: u32) -> Result, XlsxError> { Ok(col) } -/// Convert a cell coordinate to Excelsheet cell name. +/// Convert a cell coordinate to Excelsheet cell name. /// If the column number not in 1~16384, an Error is returned. pub(crate) fn coordinate_to_name(cell: (u32, u32)) -> Result, XlsxError> { let cell = &[ diff --git a/tests/keep-empty-rows.xlsx b/tests/header-row.xlsx similarity index 100% rename from tests/keep-empty-rows.xlsx rename to tests/header-row.xlsx diff --git a/tests/test.rs b/tests/test.rs index c108028a..65afe4bc 100644 --- a/tests/test.rs +++ b/tests/test.rs @@ -1798,15 +1798,15 @@ fn test_ref_xlsb() { } #[rstest] -#[case("keep-empty-rows.xlsx", false, (2, 0), (9, 3), &[Empty, Empty, String("Note 1".to_string()), Empty], 32)] -#[case("keep-empty-rows.xlsx", true, (0, 0), (9, 3), &[Empty, Empty, Empty, Empty], 40)] -#[case("temperature.xlsx", false, (0, 0), (2, 1), &[String("label".to_string()), String("value".to_string())], 6)] -#[case("temperature.xlsx", true, (0, 0), (2, 1), &[String("label".to_string()), String("value".to_string())], 6)] -#[case("temperature-in-middle.xlsx", false, (3, 1), (5, 2), &[String("label".to_string()), String("value".to_string())], 6)] -#[case("temperature-in-middle.xlsx", true, (0, 1), (5, 2), &[Empty, Empty], 12)] -fn keep_first_empty_rows_xlsx( +#[case("header-row.xlsx", None, (2, 0), (9, 3), &[Empty, Empty, String("Note 1".to_string()), Empty], 32)] +#[case("header-row.xlsx", Some(0), (0, 0), (9, 3), &[Empty, Empty, Empty, Empty], 40)] +#[case("temperature.xlsx", None, (0, 0), (2, 1), &[String("label".to_string()), String("value".to_string())], 6)] +#[case("temperature.xlsx", Some(0), (0, 0), (2, 1), &[String("label".to_string()), String("value".to_string())], 6)] +#[case("temperature-in-middle.xlsx", None, (3, 1), (5, 2), &[String("label".to_string()), String("value".to_string())], 6)] +#[case("temperature-in-middle.xlsx", Some(0), (0, 1), (5, 2), &[Empty, Empty], 12)] +fn header_row_xlsx( #[case] fixture_path: &str, - #[case] keep_first_empty_rows: bool, + #[case] header_row: Option, #[case] expected_start: (u32, u32), #[case] expected_end: (u32, u32), #[case] expected_first_row: &[Data], @@ -1824,7 +1824,7 @@ fn keep_first_empty_rows_xlsx( // By default empty cells are skipped so the first row is skipped let range = excel - .with_options(XlsxOptions::new().with_keep_first_empty_rows(keep_first_empty_rows)) + .with_options(XlsxOptions::new().with_header_row(header_row)) .worksheet_range("Sheet1") .unwrap(); assert_eq!(range.start(), Some(expected_start)); From dba5b60aea363561b5a4645c03d378d862707490 Mon Sep 17 00:00:00 2001 From: Eric Jolibois Date: Tue, 17 Sep 2024 19:41:04 +0200 Subject: [PATCH 03/14] refactor: implement properly header_row for xlsx --- src/xlsx/mod.rs | 66 ++++++++++++++++++++++++++++++++++--------------- tests/test.rs | 3 ++- 2 files changed, 48 insertions(+), 21 deletions(-) diff --git a/src/xlsx/mod.rs b/src/xlsx/mod.rs index 8371d36e..432d1d8a 100644 --- a/src/xlsx/mod.rs +++ b/src/xlsx/mod.rs @@ -989,28 +989,54 @@ impl ReaderRef for Xlsx { if len < 100_000 { cells.reserve(len as usize); } - loop { - match cell_reader.next_cell() { - Ok(Some(Cell { - val: DataRef::Empty, - .. - })) => (), - Ok(Some(cell)) => cells.push(cell), - Ok(None) => break, - Err(e) => return Err(e), + + // If `header_row` is set, we only add non-empty cells after the `header_row`. + if let Some(header_row) = header_row { + loop { + match cell_reader.next_cell() { + Ok(Some(Cell { + val: DataRef::Empty, + .. + })) => (), + Ok(Some(cell)) => { + if cell.pos.0 >= header_row { + cells.push(cell); + } + } + Ok(None) => break, + Err(e) => return Err(e), + } } - } - // If the first cell doesn't start at row 0, we add an empty cell - // at row 0 but still at the same column as the first cell - if header_row == Some(0) && cells.first().map_or(false, |c| c.pos.0 != 0) { - cells.insert( - 0, - Cell { - pos: (0, cells.first().expect("cells should not be empty").pos.1), - val: DataRef::Empty, - }, - ); + // If `header_row` is set and the first non-empty cell is not at the `header_row`, we add + // an empty cell at the beginning with row `header_row` and same column as the first non-empty cell. + if cells.first().map_or(false, |c| c.pos.0 != header_row) { + cells.insert( + header_row as usize, + Cell { + pos: ( + header_row, + cells.first().expect("cells should not be empty").pos.1, + ), + val: DataRef::Empty, + }, + ); + } + // If `header_row` is not specified (default), the header row is the row of the first non-empty cell. + } else { + loop { + match cell_reader.next_cell() { + Ok(Some(Cell { + val: DataRef::Empty, + .. + })) => (), + Ok(Some(cell)) => { + cells.push(cell); + } + Ok(None) => break, + Err(e) => return Err(e), + } + } } Ok(Range::from_sparse(cells)) diff --git a/tests/test.rs b/tests/test.rs index 65afe4bc..685e8007 100644 --- a/tests/test.rs +++ b/tests/test.rs @@ -1800,6 +1800,7 @@ fn test_ref_xlsb() { #[rstest] #[case("header-row.xlsx", None, (2, 0), (9, 3), &[Empty, Empty, String("Note 1".to_string()), Empty], 32)] #[case("header-row.xlsx", Some(0), (0, 0), (9, 3), &[Empty, Empty, Empty, Empty], 40)] +#[case("header-row.xlsx", Some(8), (8, 0), (9, 3), &[String("Columns".to_string()), String("Column A".to_string()), String("Column B".to_string()), String("Column C".to_string())], 8)] #[case("temperature.xlsx", None, (0, 0), (2, 1), &[String("label".to_string()), String("value".to_string())], 6)] #[case("temperature.xlsx", Some(0), (0, 0), (2, 1), &[String("label".to_string()), String("value".to_string())], 6)] #[case("temperature-in-middle.xlsx", None, (3, 1), (5, 2), &[String("label".to_string()), String("value".to_string())], 6)] @@ -1829,7 +1830,7 @@ fn header_row_xlsx( .unwrap(); assert_eq!(range.start(), Some(expected_start)); assert_eq!(range.end(), Some(expected_end)); - assert_eq!(range.rows().next().unwrap(), expected_first_row,); + assert_eq!(range.rows().next().unwrap(), expected_first_row); assert_eq!(range.cells().count(), expected_total_cells); } From 8b1a77ff4994d4285e616b1756f91058c8a7fe3d Mon Sep 17 00:00:00 2001 From: Eric Jolibois Date: Wed, 18 Sep 2024 13:38:05 +0200 Subject: [PATCH 04/14] feat: add header_row to ods file --- src/lib.rs | 2 +- src/ods.rs | 49 ++++++++++++++++++++++++++++++++++++++++++++++--- tests/test.rs | 39 +++++++++++++++++++++++++++++++++++---- 3 files changed, 82 insertions(+), 8 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 719ca838..a6e71539 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -86,7 +86,7 @@ pub use crate::auto::{open_workbook_auto, open_workbook_auto_from_rs, Sheets}; pub use crate::datatype::{Data, DataRef, DataType, ExcelDateTime, ExcelDateTimeType}; pub use crate::de::{DeError, RangeDeserializer, RangeDeserializerBuilder, ToCellDeserializer}; pub use crate::errors::Error; -pub use crate::ods::{Ods, OdsError}; +pub use crate::ods::{Ods, OdsError, OdsOptions}; pub use crate::xls::{Xls, XlsError, XlsOptions}; pub use crate::xlsb::{Xlsb, XlsbError}; pub use crate::xlsx::{Xlsx, XlsxError, XlsxOptions}; diff --git a/src/ods.rs b/src/ods.rs index 19459e9b..da748b61 100644 --- a/src/ods.rs +++ b/src/ods.rs @@ -62,6 +62,34 @@ pub enum OdsError { WorksheetNotFound(String), } +/// Ods reader options +#[derive(Debug, Default)] +pub struct OdsOptions { + /// Index of the header row + /// If not set, the first non-empty row is considered the header row + pub header_row: Option, +} + +impl OdsOptions { + /// Create a new XlsxOptions + pub fn new() -> Self { + Self::default() + } + + /// Set the header row index + pub fn with_header_row(self, header_row: Option) -> Self { + Self { header_row } + } +} + +impl Ods { + /// Set reader options + pub fn with_options(mut self, options: OdsOptions) -> Self { + self.options = options; + self + } +} + from_err!(std::io::Error, OdsError, Io); from_err!(zip::result::ZipError, OdsError, Zip); from_err!(quick_xml::Error, OdsError, Xml); @@ -116,6 +144,8 @@ pub struct Ods { marker: PhantomData, #[cfg(feature = "picture")] pictures: Option)>>, + /// Reader options + options: OdsOptions, } impl Reader for Ods @@ -161,6 +191,7 @@ where sheets, #[cfg(feature = "picture")] pictures, + options: OdsOptions::default(), }) } @@ -176,10 +207,22 @@ where /// Read worksheet data in corresponding worksheet path fn worksheet_range(&mut self, name: &str) -> Result, OdsError> { - self.sheets + let sheet = self + .sheets .get(name) - .ok_or_else(|| OdsError::WorksheetNotFound(name.into())) - .map(|r| r.0.to_owned()) + .ok_or_else(|| OdsError::WorksheetNotFound(name.into()))? + .0 + .to_owned(); + + // If a header_row is defined, adjust the range + if let Some(header_row) = self.options.header_row { + if let (Some(start), Some(end)) = (sheet.start(), sheet.end()) { + return Ok(sheet.range((header_row, start.1), end)); + } + } + + // Return the original range if no header row is set + Ok(sheet) } fn worksheets(&mut self) -> Vec<(String, Range)> { diff --git a/tests/test.rs b/tests/test.rs index 685e8007..f72e74ac 100644 --- a/tests/test.rs +++ b/tests/test.rs @@ -1,8 +1,8 @@ use calamine::Data::{Bool, DateTime, DateTimeIso, DurationIso, Empty, Error, Float, String}; use calamine::{ open_workbook, open_workbook_auto, DataRef, DataType, Dimensions, ExcelDateTime, - ExcelDateTimeType, Ods, Range, Reader, ReaderRef, Sheet, SheetType, SheetVisible, Xls, Xlsb, - Xlsx, XlsxOptions, + ExcelDateTimeType, Ods, OdsOptions, Range, Reader, ReaderRef, Sheet, SheetType, SheetVisible, + Xls, Xlsb, Xlsx, XlsxOptions, }; use calamine::{CellErrorType::*, Data}; use rstest::rstest; @@ -1805,7 +1805,7 @@ fn test_ref_xlsb() { #[case("temperature.xlsx", Some(0), (0, 0), (2, 1), &[String("label".to_string()), String("value".to_string())], 6)] #[case("temperature-in-middle.xlsx", None, (3, 1), (5, 2), &[String("label".to_string()), String("value".to_string())], 6)] #[case("temperature-in-middle.xlsx", Some(0), (0, 1), (5, 2), &[Empty, Empty], 12)] -fn header_row_xlsx( +fn test_header_row_xlsx( #[case] fixture_path: &str, #[case] header_row: Option, #[case] expected_start: (u32, u32), @@ -1823,7 +1823,6 @@ fn header_row_xlsx( },] ); - // By default empty cells are skipped so the first row is skipped let range = excel .with_options(XlsxOptions::new().with_header_row(header_row)) .worksheet_range("Sheet1") @@ -1834,6 +1833,38 @@ fn header_row_xlsx( assert_eq!(range.cells().count(), expected_total_cells); } +#[rstest] +fn test_header_row_ods() { + let mut ods: Ods<_> = wb("date.ods"); + assert_eq!( + ods.sheets_metadata(), + &[Sheet { + name: "Sheet1".to_string(), + typ: SheetType::WorkSheet, + visible: SheetVisible::Visible + },] + ); + + let range = ods.worksheet_range("Sheet1").unwrap(); + assert_eq!(range.start(), Some((0, 0))); + assert_eq!(range.end(), Some((3, 1))); + assert_eq!( + range.rows().next().unwrap(), + &[DateTimeIso("2021-01-01".to_string()), Float(15.0)] + ); + + let range = ods + .with_options(OdsOptions::new().with_header_row(Some(2))) + .worksheet_range("Sheet1") + .unwrap(); + assert_eq!(range.start(), Some((2, 0))); + assert_eq!(range.end(), Some((3, 1))); + assert_eq!( + range.rows().next().unwrap(), + &[DurationIso("PT10H10M10S".to_string()), Float(17.0)] + ); +} + #[rstest] #[case("single-empty.ods")] #[case("multi-empty.ods")] From 25635b9dd4725a83971e7e06a78398935f49df06 Mon Sep 17 00:00:00 2001 From: Eric Jolibois Date: Thu, 26 Sep 2024 00:53:47 +0200 Subject: [PATCH 05/14] chore: simplify --- src/ods.rs | 11 ++++------- src/xlsx/mod.rs | 11 ++++------- tests/test.rs | 4 ++-- 3 files changed, 10 insertions(+), 16 deletions(-) diff --git a/src/ods.rs b/src/ods.rs index da748b61..cd04800c 100644 --- a/src/ods.rs +++ b/src/ods.rs @@ -71,14 +71,11 @@ pub struct OdsOptions { } impl OdsOptions { - /// Create a new XlsxOptions - pub fn new() -> Self { - Self::default() - } - /// Set the header row index - pub fn with_header_row(self, header_row: Option) -> Self { - Self { header_row } + pub fn with_header_row(self, header_row: u32) -> Self { + Self { + header_row: Some(header_row), + } } } diff --git a/src/xlsx/mod.rs b/src/xlsx/mod.rs index 432d1d8a..93d448d8 100644 --- a/src/xlsx/mod.rs +++ b/src/xlsx/mod.rs @@ -210,14 +210,11 @@ pub struct XlsxOptions { } impl XlsxOptions { - /// Create a new XlsxOptions - pub fn new() -> Self { - Self::default() - } - /// Set the header row index - pub fn with_header_row(self, header_row: Option) -> Self { - Self { header_row } + pub fn with_header_row(self, header_row: u32) -> Self { + Self { + header_row: Some(header_row), + } } } diff --git a/tests/test.rs b/tests/test.rs index f72e74ac..d1a65d6b 100644 --- a/tests/test.rs +++ b/tests/test.rs @@ -1824,7 +1824,7 @@ fn test_header_row_xlsx( ); let range = excel - .with_options(XlsxOptions::new().with_header_row(header_row)) + .with_options(XlsxOptions { header_row }) .worksheet_range("Sheet1") .unwrap(); assert_eq!(range.start(), Some(expected_start)); @@ -1854,7 +1854,7 @@ fn test_header_row_ods() { ); let range = ods - .with_options(OdsOptions::new().with_header_row(Some(2))) + .with_options(OdsOptions::default().with_header_row(2)) .worksheet_range("Sheet1") .unwrap(); assert_eq!(range.start(), Some((2, 0))); From ff35b2ccf63131791ef459b8f43bafbc89ff8bd9 Mon Sep 17 00:00:00 2001 From: Eric Jolibois Date: Thu, 26 Sep 2024 01:30:58 +0200 Subject: [PATCH 06/14] refactor: impl with associated type --- src/auto.rs | 45 +++++++++++++++++++- src/lib.rs | 20 ++++++++- src/ods.rs | 22 +++++----- src/xls.rs | 47 +++++++++++++++++++-- src/xlsb/mod.rs | 98 ++++++++++++++++++++++++++++++++++-------- src/xlsx/mod.rs | 17 +++++--- tests/test.rs | 110 ++++++++++++++++++++++++++++++++++++++++++------ 7 files changed, 305 insertions(+), 54 deletions(-) diff --git a/src/auto.rs b/src/auto.rs index 6250d35b..412fe4d5 100644 --- a/src/auto.rs +++ b/src/auto.rs @@ -2,9 +2,10 @@ use crate::errors::Error; use crate::vba::VbaProject; +use crate::xlsb::XlsbOptions; use crate::{ - open_workbook, open_workbook_from_rs, Data, DataRef, Metadata, Ods, Range, Reader, ReaderRef, - Xls, Xlsb, Xlsx, + open_workbook, open_workbook_from_rs, Data, DataRef, Metadata, Ods, OdsOptions, Range, Reader, + ReaderOptions, ReaderRef, Xls, XlsOptions, Xlsb, Xlsx, XlsxOptions, }; use std::borrow::Cow; use std::fs::File; @@ -74,17 +75,57 @@ where } } +pub enum AutoReaderOptions { + Xls(XlsOptions), + Xlsx(XlsxOptions), + Xlsb(XlsbOptions), + Ods(OdsOptions), +} + +impl ReaderOptions for AutoReaderOptions { + fn with_header_row(self, header_row: u32) -> Self { + match self { + AutoReaderOptions::Xls(e) => AutoReaderOptions::Xls(e.with_header_row(header_row)), + AutoReaderOptions::Xlsx(e) => AutoReaderOptions::Xlsx(e.with_header_row(header_row)), + AutoReaderOptions::Xlsb(e) => AutoReaderOptions::Xlsb(e.with_header_row(header_row)), + AutoReaderOptions::Ods(e) => AutoReaderOptions::Ods(e.with_header_row(header_row)), + } + } +} + impl Reader for Sheets where RS: std::io::Read + std::io::Seek, { type Error = Error; + type Options = AutoReaderOptions; /// Creates a new instance. fn new(_reader: RS) -> Result { Err(Error::Msg("Sheets must be created from a Path")) } + fn set_options(&mut self, options: Self::Options) { + match *self { + Sheets::Xls(ref mut e) => match options { + AutoReaderOptions::Xls(opts) => e.set_options(opts), + _ => unreachable!(), + }, + Sheets::Xlsx(ref mut e) => match options { + AutoReaderOptions::Xlsx(opts) => e.set_options(opts), + _ => unreachable!(), + }, + Sheets::Xlsb(ref mut e) => match options { + AutoReaderOptions::Xlsb(opts) => e.set_options(opts), + _ => unreachable!(), + }, + Sheets::Ods(ref mut e) => match options { + AutoReaderOptions::Ods(opts) => e.set_options(opts), + _ => unreachable!(), + }, + } + } + /// Gets `VbaProject` fn vba_project(&mut self) -> Option, Self::Error>> { match *self { diff --git a/src/lib.rs b/src/lib.rs index a6e71539..3663040d 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -88,7 +88,7 @@ pub use crate::de::{DeError, RangeDeserializer, RangeDeserializerBuilder, ToCell pub use crate::errors::Error; pub use crate::ods::{Ods, OdsError, OdsOptions}; pub use crate::xls::{Xls, XlsError, XlsOptions}; -pub use crate::xlsb::{Xlsb, XlsbError}; +pub use crate::xlsb::{Xlsb, XlsbError, XlsbOptions}; pub use crate::xlsx::{Xlsx, XlsxError, XlsxOptions}; use crate::vba::VbaProject; @@ -215,6 +215,12 @@ pub struct Sheet { pub visible: SheetVisible, } +/// A trait to share reader options across different `FileType`s +pub trait ReaderOptions: Sized { + /// Set the header row + fn with_header_row(self, _header_row: u32) -> Self; +} + // FIXME `Reader` must only be seek `Seek` for `Xls::xls`. Because of the present API this limits // the kinds of readers (other) data in formats can be read from. /// A trait to share spreadsheets reader functions across different `FileType`s @@ -225,9 +231,21 @@ where /// Error specific to file type type Error: std::fmt::Debug + From; + /// Options specific to file type + type Options: ReaderOptions; + /// Creates a new instance. fn new(reader: RS) -> Result; + /// Set options + fn set_options(&mut self, options: Self::Options); + + /// Set options and return the reader + fn with_options(mut self, options: Self::Options) -> Self { + self.set_options(options); + self + } + /// Gets `VbaProject` fn vba_project(&mut self) -> Option, Self::Error>>; diff --git a/src/ods.rs b/src/ods.rs index cd04800c..7c1c5f44 100644 --- a/src/ods.rs +++ b/src/ods.rs @@ -16,7 +16,9 @@ use zip::read::{ZipArchive, ZipFile}; use zip::result::ZipError; use crate::vba::VbaProject; -use crate::{Data, DataType, Metadata, Range, Reader, Sheet, SheetType, SheetVisible}; +use crate::{ + Data, DataType, Metadata, Range, Reader, ReaderOptions, Sheet, SheetType, SheetVisible, +}; use std::marker::PhantomData; const MIMETYPE: &[u8] = b"application/vnd.oasis.opendocument.spreadsheet"; @@ -70,23 +72,15 @@ pub struct OdsOptions { pub header_row: Option, } -impl OdsOptions { +impl ReaderOptions for OdsOptions { /// Set the header row index - pub fn with_header_row(self, header_row: u32) -> Self { + fn with_header_row(self, header_row: u32) -> Self { Self { header_row: Some(header_row), } } } -impl Ods { - /// Set reader options - pub fn with_options(mut self, options: OdsOptions) -> Self { - self.options = options; - self - } -} - from_err!(std::io::Error, OdsError, Io); from_err!(zip::result::ZipError, OdsError, Zip); from_err!(quick_xml::Error, OdsError, Xml); @@ -150,6 +144,7 @@ where RS: Read + Seek, { type Error = OdsError; + type Options = OdsOptions; fn new(reader: RS) -> Result { let mut zip = ZipArchive::new(reader)?; @@ -192,6 +187,11 @@ where }) } + /// Set options + fn set_options(&mut self, options: Self::Options) { + self.options = options; + } + /// Gets `VbaProject` fn vba_project(&mut self) -> Option, OdsError>> { None diff --git a/src/xls.rs b/src/xls.rs index fe19c354..239a0cc8 100644 --- a/src/xls.rs +++ b/src/xls.rs @@ -17,7 +17,8 @@ use crate::utils::read_usize; use crate::utils::{push_column, read_f64, read_i16, read_i32, read_u16, read_u32}; use crate::vba::VbaProject; use crate::{ - Cell, CellErrorType, Data, Dimensions, Metadata, Range, Reader, Sheet, SheetType, SheetVisible, + Cell, CellErrorType, Data, Dimensions, Metadata, Range, Reader, ReaderOptions, Sheet, + SheetType, SheetVisible, }; #[derive(Debug)] @@ -136,6 +137,30 @@ pub struct XlsOptions { /// /// [code page]: https://docs.microsoft.com/en-us/windows/win32/intl/code-page-identifiers pub force_codepage: Option, + /// Index of the header row + /// If not set, the first non-empty row is considered the header row + pub header_row: Option, +} + +impl XlsOptions { + #[allow(dead_code)] + /// Set the code page + fn with_codepage(self, codepage: u16) -> Self { + Self { + force_codepage: Some(codepage), + ..self + } + } +} + +impl ReaderOptions for XlsOptions { + /// Set the header row index + fn with_header_row(self, header_row: u32) -> Self { + Self { + header_row: Some(header_row), + ..self + } + } } struct SheetData { @@ -226,11 +251,16 @@ impl Xls { impl Reader for Xls { type Error = XlsError; + type Options = XlsOptions; fn new(reader: RS) -> Result { Self::new_with_options(reader, XlsOptions::default()) } + fn set_options(&mut self, options: Self::Options) { + self.options = options; + } + fn vba_project(&mut self) -> Option, XlsError>> { self.vba.as_ref().map(|vba| Ok(Cow::Borrowed(vba))) } @@ -241,10 +271,21 @@ impl Reader for Xls { } fn worksheet_range(&mut self, name: &str) -> Result, XlsError> { - self.sheets + let sheet = self + .sheets .get(name) .map(|r| r.range.clone()) - .ok_or_else(|| XlsError::WorksheetNotFound(name.into())) + .ok_or_else(|| XlsError::WorksheetNotFound(name.into()))?; + + // If a header_row is defined, adjust the range + if let Some(header_row) = self.options.header_row { + if let (Some(start), Some(end)) = (sheet.start(), sheet.end()) { + return Ok(sheet.range((header_row, start.1), end)); + } + } + + // Return the original range if no header row is set + Ok(sheet) } fn worksheets(&mut self) -> Vec<(String, Range)> { diff --git a/src/xlsb/mod.rs b/src/xlsb/mod.rs index 6d6e0667..ac95bd60 100644 --- a/src/xlsb/mod.rs +++ b/src/xlsb/mod.rs @@ -20,7 +20,9 @@ use crate::datatype::DataRef; use crate::formats::{builtin_format_by_code, detect_custom_number_format, CellFormat}; use crate::utils::{push_column, read_f64, read_i32, read_u16, read_u32, read_usize}; use crate::vba::VbaProject; -use crate::{Cell, Data, Metadata, Range, Reader, ReaderRef, Sheet, SheetType, SheetVisible}; +use crate::{ + Cell, Data, Metadata, Range, Reader, ReaderOptions, ReaderRef, Sheet, SheetType, SheetVisible, +}; /// A Xlsb specific error #[derive(Debug)] @@ -128,6 +130,23 @@ impl std::error::Error for XlsbError { } } +/// Xlsb reader options +#[derive(Debug, Default)] +pub struct XlsbOptions { + /// Index of the header row + /// If not set, the first non-empty row is considered the header row + pub header_row: Option, +} + +impl ReaderOptions for XlsbOptions { + /// Set the header row index + fn with_header_row(self, header_row: u32) -> Self { + Self { + header_row: Some(header_row), + } + } +} + /// A Xlsb reader pub struct Xlsb { zip: ZipArchive, @@ -140,6 +159,7 @@ pub struct Xlsb { metadata: Metadata, #[cfg(feature = "picture")] pictures: Option)>>, + options: XlsbOptions, } impl Xlsb { @@ -435,6 +455,7 @@ impl Xlsb { impl Reader for Xlsb { type Error = XlsbError; + type Options = XlsbOptions; fn new(mut reader: RS) -> Result { check_for_password_protected(&mut reader)?; @@ -449,6 +470,7 @@ impl Reader for Xlsb { metadata: Metadata::default(), #[cfg(feature = "picture")] pictures: None, + options: XlsbOptions::default(), }; xlsb.read_shared_strings()?; xlsb.read_styles()?; @@ -460,6 +482,10 @@ impl Reader for Xlsb { Ok(xlsb) } + fn set_options(&mut self, options: Self::Options) { + self.options = options; + } + fn vba_project(&mut self) -> Option, XlsbError>> { self.zip.by_name("xl/vbaProject.bin").ok().map(|mut f| { let len = f.size() as usize; @@ -475,14 +501,13 @@ impl Reader for Xlsb { /// MS-XLSB 2.1.7.62 fn worksheet_range(&mut self, name: &str) -> Result, XlsbError> { - let mut cells_reader = self.worksheet_cells_reader(name)?; - let mut cells = Vec::with_capacity(cells_reader.dimensions().len().min(1_000_000) as _); - while let Some(cell) = cells_reader.next_cell()? { - if cell.val != DataRef::Empty { - cells.push(Cell::new(cell.pos, Data::from(cell.val))); - } - } - Ok(Range::from_sparse(cells)) + let rge = self.worksheet_range_ref(name)?; + let inner = rge.inner.into_iter().map(|v| v.into()).collect(); + Ok(Range { + start: rge.start, + end: rge.end, + inner, + }) } /// MS-XLSB 2.1.7.62 @@ -521,21 +546,58 @@ impl Reader for Xlsb { impl ReaderRef for Xlsb { fn worksheet_range_ref<'a>(&'a mut self, name: &str) -> Result>, XlsbError> { + let header_row = self.options.header_row; let mut cell_reader = self.worksheet_cells_reader(name)?; let len = cell_reader.dimensions().len(); let mut cells = Vec::new(); if len < 100_000 { cells.reserve(len as usize); } - loop { - match cell_reader.next_cell() { - Ok(Some(Cell { - val: DataRef::Empty, - .. - })) => (), - Ok(Some(cell)) => cells.push(cell), - Ok(None) => break, - Err(e) => return Err(e), + + // If `header_row` is set, we only add non-empty cells after the `header_row`. + if let Some(header_row) = header_row { + loop { + match cell_reader.next_cell() { + Ok(Some(Cell { + val: DataRef::Empty, + .. + })) => (), + Ok(Some(cell)) => { + if cell.pos.0 >= header_row { + cells.push(cell); + } + } + Ok(None) => break, + Err(e) => return Err(e), + } + } + + // If `header_row` is set and the first non-empty cell is not at the `header_row`, we add + // an empty cell at the beginning with row `header_row` and same column as the first non-empty cell. + if cells.first().map_or(false, |c| c.pos.0 != header_row) { + cells.insert( + header_row as usize, + Cell { + pos: ( + header_row, + cells.first().expect("cells should not be empty").pos.1, + ), + val: DataRef::Empty, + }, + ); + } + // If `header_row` is not specified (default), the header row is the row of the first non-empty cell. + } else { + loop { + match cell_reader.next_cell() { + Ok(Some(Cell { + val: DataRef::Empty, + .. + })) => (), + Ok(Some(cell)) => cells.push(cell), + Ok(None) => break, + Err(e) => return Err(e), + } } } Ok(Range::from_sparse(cells)) diff --git a/src/xlsx/mod.rs b/src/xlsx/mod.rs index 93d448d8..1ebb3a3e 100644 --- a/src/xlsx/mod.rs +++ b/src/xlsx/mod.rs @@ -18,8 +18,8 @@ use crate::datatype::DataRef; use crate::formats::{builtin_format_by_id, detect_custom_number_format, CellFormat}; use crate::vba::VbaProject; use crate::{ - Cell, CellErrorType, Data, Dimensions, Metadata, Range, Reader, ReaderRef, Sheet, SheetType, - SheetVisible, Table, + Cell, CellErrorType, Data, Dimensions, Metadata, Range, Reader, ReaderOptions, ReaderRef, + Sheet, SheetType, SheetVisible, Table, }; pub use cells_reader::XlsxCellReader; @@ -209,9 +209,9 @@ pub struct XlsxOptions { pub header_row: Option, } -impl XlsxOptions { +impl ReaderOptions for XlsxOptions { /// Set the header row index - pub fn with_header_row(self, header_row: u32) -> Self { + fn with_header_row(self, header_row: u32) -> Self { Self { header_row: Some(header_row), } @@ -876,6 +876,7 @@ impl Xlsx { impl Reader for Xlsx { type Error = XlsxError; + type Options = XlsxOptions; fn new(mut reader: RS) -> Result { check_for_password_protected(&mut reader)?; @@ -903,6 +904,10 @@ impl Reader for Xlsx { Ok(xlsx) } + fn set_options(&mut self, options: Self::Options) { + self.options = options; + } + fn vba_project(&mut self) -> Option, XlsxError>> { let mut f = self.zip.by_name("xl/vbaProject.bin").ok()?; let len = f.size() as usize; @@ -1027,9 +1032,7 @@ impl ReaderRef for Xlsx { val: DataRef::Empty, .. })) => (), - Ok(Some(cell)) => { - cells.push(cell); - } + Ok(Some(cell)) => cells.push(cell), Ok(None) => break, Err(e) => return Err(e), } diff --git a/tests/test.rs b/tests/test.rs index d1a65d6b..aff6e038 100644 --- a/tests/test.rs +++ b/tests/test.rs @@ -1,8 +1,8 @@ -use calamine::Data::{Bool, DateTime, DateTimeIso, DurationIso, Empty, Error, Float, String}; +use calamine::Data::{Bool, DateTime, DateTimeIso, DurationIso, Empty, Error, Float, Int, String}; use calamine::{ open_workbook, open_workbook_auto, DataRef, DataType, Dimensions, ExcelDateTime, - ExcelDateTimeType, Ods, OdsOptions, Range, Reader, ReaderRef, Sheet, SheetType, SheetVisible, - Xls, Xlsb, Xlsx, XlsxOptions, + ExcelDateTimeType, Ods, OdsOptions, Range, Reader, ReaderOptions, ReaderRef, Sheet, SheetType, + SheetVisible, Xls, XlsOptions, Xlsb, XlsbOptions, Xlsx, XlsxOptions, }; use calamine::{CellErrorType::*, Data}; use rstest::rstest; @@ -1833,6 +1833,94 @@ fn test_header_row_xlsx( assert_eq!(range.cells().count(), expected_total_cells); } +#[rstest] +fn test_header_row_xlsb() { + let mut xlsb: Xlsb<_> = wb("date.xlsb"); + assert_eq!( + xlsb.sheets_metadata(), + &[Sheet { + name: "Sheet1".to_string(), + typ: SheetType::WorkSheet, + visible: SheetVisible::Visible + }] + ); + + let first_line = [ + DateTime(ExcelDateTime::new( + 44197.0, + ExcelDateTimeType::DateTime, + false, + )), + Float(15.0), + ]; + let second_line = [ + DateTime(ExcelDateTime::new( + 44198.0, + ExcelDateTimeType::DateTime, + false, + )), + Float(16.0), + ]; + + let range = xlsb.worksheet_range("Sheet1").unwrap(); + assert_eq!(range.start(), Some((0, 0))); + assert_eq!(range.end(), Some((2, 1))); + assert_eq!(range.rows().next().unwrap(), &first_line); + assert_eq!(range.rows().nth(1).unwrap(), &second_line); + + let range = xlsb + .with_options(XlsbOptions::default().with_header_row(1)) + .worksheet_range("Sheet1") + .unwrap(); + assert_eq!(range.start(), Some((1, 0))); + assert_eq!(range.end(), Some((2, 1))); + assert_eq!(range.rows().next().unwrap(), &second_line); +} + +#[rstest] +fn test_header_row_xls() { + let mut xls: Xls<_> = wb("date.xls"); + assert_eq!( + xls.sheets_metadata(), + &[Sheet { + name: "Sheet1".to_string(), + typ: SheetType::WorkSheet, + visible: SheetVisible::Visible + }] + ); + + let first_line = [ + DateTime(ExcelDateTime::new( + 44197.0, + ExcelDateTimeType::DateTime, + false, + )), + Int(15), + ]; + let second_line = [ + DateTime(ExcelDateTime::new( + 44198.0, + ExcelDateTimeType::DateTime, + false, + )), + Int(16), + ]; + + let range = xls.worksheet_range("Sheet1").unwrap(); + assert_eq!(range.start(), Some((0, 0))); + assert_eq!(range.end(), Some((2, 1))); + assert_eq!(range.rows().next().unwrap(), &first_line); + assert_eq!(range.rows().nth(1).unwrap(), &second_line); + + let range = xls + .with_options(XlsOptions::default().with_header_row(1)) + .worksheet_range("Sheet1") + .unwrap(); + assert_eq!(range.start(), Some((1, 0))); + assert_eq!(range.end(), Some((2, 1))); + assert_eq!(range.rows().next().unwrap(), &second_line); +} + #[rstest] fn test_header_row_ods() { let mut ods: Ods<_> = wb("date.ods"); @@ -1842,16 +1930,17 @@ fn test_header_row_ods() { name: "Sheet1".to_string(), typ: SheetType::WorkSheet, visible: SheetVisible::Visible - },] + }] ); + let first_line = [DateTimeIso("2021-01-01".to_string()), Float(15.0)]; + let third_line = [DurationIso("PT10H10M10S".to_string()), Float(17.0)]; + let range = ods.worksheet_range("Sheet1").unwrap(); assert_eq!(range.start(), Some((0, 0))); assert_eq!(range.end(), Some((3, 1))); - assert_eq!( - range.rows().next().unwrap(), - &[DateTimeIso("2021-01-01".to_string()), Float(15.0)] - ); + assert_eq!(range.rows().next().unwrap(), &first_line); + assert_eq!(range.rows().nth(2).unwrap(), &third_line); let range = ods .with_options(OdsOptions::default().with_header_row(2)) @@ -1859,10 +1948,7 @@ fn test_header_row_ods() { .unwrap(); assert_eq!(range.start(), Some((2, 0))); assert_eq!(range.end(), Some((3, 1))); - assert_eq!( - range.rows().next().unwrap(), - &[DurationIso("PT10H10M10S".to_string()), Float(17.0)] - ); + assert_eq!(range.rows().next().unwrap(), &third_line); } #[rstest] From dd33174899447bdb02d34b9eee622eeafd0eb9bd Mon Sep 17 00:00:00 2001 From: Eric Jolibois Date: Fri, 4 Oct 2024 14:46:42 +0200 Subject: [PATCH 07/14] refactor: switch to matching on reference --- src/auto.rs | 37 +++++++++++++------------------------ 1 file changed, 13 insertions(+), 24 deletions(-) diff --git a/src/auto.rs b/src/auto.rs index 412fe4d5..3c57ee35 100644 --- a/src/auto.rs +++ b/src/auto.rs @@ -106,29 +106,18 @@ where } fn set_options(&mut self, options: Self::Options) { - match *self { - Sheets::Xls(ref mut e) => match options { - AutoReaderOptions::Xls(opts) => e.set_options(opts), - _ => unreachable!(), - }, - Sheets::Xlsx(ref mut e) => match options { - AutoReaderOptions::Xlsx(opts) => e.set_options(opts), - _ => unreachable!(), - }, - Sheets::Xlsb(ref mut e) => match options { - AutoReaderOptions::Xlsb(opts) => e.set_options(opts), - _ => unreachable!(), - }, - Sheets::Ods(ref mut e) => match options { - AutoReaderOptions::Ods(opts) => e.set_options(opts), - _ => unreachable!(), - }, + match (self, options) { + (Sheets::Xls(ref mut e), AutoReaderOptions::Xls(opts)) => e.set_options(opts), + (Sheets::Xlsx(ref mut e), AutoReaderOptions::Xlsx(opts)) => e.set_options(opts), + (Sheets::Xlsb(ref mut e), AutoReaderOptions::Xlsb(opts)) => e.set_options(opts), + (Sheets::Ods(ref mut e), AutoReaderOptions::Ods(opts)) => e.set_options(opts), + _ => unreachable!(), } } /// Gets `VbaProject` fn vba_project(&mut self) -> Option, Self::Error>> { - match *self { + match self { Sheets::Xls(ref mut e) => e.vba_project().map(|vba| vba.map_err(Error::Xls)), Sheets::Xlsx(ref mut e) => e.vba_project().map(|vba| vba.map_err(Error::Xlsx)), Sheets::Xlsb(ref mut e) => e.vba_project().map(|vba| vba.map_err(Error::Xlsb)), @@ -138,7 +127,7 @@ where /// Initialize fn metadata(&self) -> &Metadata { - match *self { + match self { Sheets::Xls(ref e) => e.metadata(), Sheets::Xlsx(ref e) => e.metadata(), Sheets::Xlsb(ref e) => e.metadata(), @@ -148,7 +137,7 @@ where /// Read worksheet data in corresponding worksheet path fn worksheet_range(&mut self, name: &str) -> Result, Self::Error> { - match *self { + match self { Sheets::Xls(ref mut e) => e.worksheet_range(name).map_err(Error::Xls), Sheets::Xlsx(ref mut e) => e.worksheet_range(name).map_err(Error::Xlsx), Sheets::Xlsb(ref mut e) => e.worksheet_range(name).map_err(Error::Xlsb), @@ -158,7 +147,7 @@ where /// Read worksheet formula in corresponding worksheet path fn worksheet_formula(&mut self, name: &str) -> Result, Self::Error> { - match *self { + match self { Sheets::Xls(ref mut e) => e.worksheet_formula(name).map_err(Error::Xls), Sheets::Xlsx(ref mut e) => e.worksheet_formula(name).map_err(Error::Xlsx), Sheets::Xlsb(ref mut e) => e.worksheet_formula(name).map_err(Error::Xlsb), @@ -167,7 +156,7 @@ where } fn worksheets(&mut self) -> Vec<(String, Range)> { - match *self { + match self { Sheets::Xls(ref mut e) => e.worksheets(), Sheets::Xlsx(ref mut e) => e.worksheets(), Sheets::Xlsb(ref mut e) => e.worksheets(), @@ -177,7 +166,7 @@ where #[cfg(feature = "picture")] fn pictures(&self) -> Option)>> { - match *self { + match self { Sheets::Xls(ref e) => e.pictures(), Sheets::Xlsx(ref e) => e.pictures(), Sheets::Xlsb(ref e) => e.pictures(), @@ -194,7 +183,7 @@ where &'a mut self, name: &str, ) -> Result>, Self::Error> { - match *self { + match self { Sheets::Xlsx(ref mut e) => e.worksheet_range_ref(name).map_err(Error::Xlsx), Sheets::Xlsb(ref mut e) => e.worksheet_range_ref(name).map_err(Error::Xlsb), Sheets::Xls(_) => unimplemented!(), From 65d4c2d01e1a6c2886227747a569ed4ce32b4dd2 Mon Sep 17 00:00:00 2001 From: Eric Jolibois Date: Sat, 5 Oct 2024 10:50:26 +0200 Subject: [PATCH 08/14] refactor: use mutable ref --- src/lib.rs | 2 +- src/xlsx/mod.rs | 8 -------- tests/test.rs | 21 +++++++++++++++++---- 3 files changed, 18 insertions(+), 13 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 3663040d..046190f4 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -241,7 +241,7 @@ where fn set_options(&mut self, options: Self::Options); /// Set options and return the reader - fn with_options(mut self, options: Self::Options) -> Self { + fn with_options(&mut self, options: Self::Options) -> &mut Self { self.set_options(options); self } diff --git a/src/xlsx/mod.rs b/src/xlsx/mod.rs index 1ebb3a3e..e5f608d6 100644 --- a/src/xlsx/mod.rs +++ b/src/xlsx/mod.rs @@ -866,14 +866,6 @@ impl Xlsx { } } -impl Xlsx { - /// Set reader options - pub fn with_options(mut self, options: XlsxOptions) -> Self { - self.options = options; - self - } -} - impl Reader for Xlsx { type Error = XlsxError; type Options = XlsxOptions; diff --git a/tests/test.rs b/tests/test.rs index aff6e038..59d40a17 100644 --- a/tests/test.rs +++ b/tests/test.rs @@ -1813,7 +1813,7 @@ fn test_header_row_xlsx( #[case] expected_first_row: &[Data], #[case] expected_total_cells: usize, ) { - let excel: Xlsx<_> = wb(fixture_path); + let mut excel: Xlsx<_> = wb(fixture_path); assert_eq!( excel.sheets_metadata(), &[Sheet { @@ -1833,7 +1833,20 @@ fn test_header_row_xlsx( assert_eq!(range.cells().count(), expected_total_cells); } -#[rstest] +#[test] +fn test_read_twice_with_different_header_rows() { + let mut xlsx: Xlsx<_> = wb("any_sheets.xlsx"); + let _ = xlsx + .with_options(XlsxOptions::default().with_header_row(2)) + .worksheet_range("Visible") + .unwrap(); + let _ = xlsx + .with_options(XlsxOptions::default().with_header_row(1)) + .worksheet_range("Visible") + .unwrap(); +} + +#[test] fn test_header_row_xlsb() { let mut xlsb: Xlsb<_> = wb("date.xlsb"); assert_eq!( @@ -1877,7 +1890,7 @@ fn test_header_row_xlsb() { assert_eq!(range.rows().next().unwrap(), &second_line); } -#[rstest] +#[test] fn test_header_row_xls() { let mut xls: Xls<_> = wb("date.xls"); assert_eq!( @@ -1921,7 +1934,7 @@ fn test_header_row_xls() { assert_eq!(range.rows().next().unwrap(), &second_line); } -#[rstest] +#[test] fn test_header_row_ods() { let mut ods: Ods<_> = wb("date.ods"); assert_eq!( From 72e74b0012dac56778365edcd40287216d306c51 Mon Sep 17 00:00:00 2001 From: Eric Jolibois Date: Sat, 5 Oct 2024 11:08:20 +0200 Subject: [PATCH 09/14] add example in README --- README.md | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 2f398aeb..29a9aeda 100644 --- a/README.md +++ b/README.md @@ -103,6 +103,25 @@ if let Some(Ok(r)) = excel.worksheet_range("Sheet1") { } ``` +### Reader: With options + +```rs +use calamine::{Reader, Xlsx, XlsxOptions, open_workbook}; + +let mut excel: Xlsx<_> = open_workbook("file.xlsx").unwrap(); + +let sheet1 = excel + .with_options(XlsxOptions::default().with_header_row(3)) + .worksheet_range("Sheet1") + .unwrap(); +``` + +Keep in mind that `xlsx` and `xlsb` files support lazy loading, +meaning the specified options are applied immediately when reading a sheet range. +However, for `xls` and `ods` files, all sheets are loaded at once when +opening the workbook with default settings, so the options are only applied +afterward, offering no performance advantages. + ### Reader: More complex Let's assume @@ -190,7 +209,7 @@ The programs are all structured to follow the same constructs: use calamine::{open_workbook, Reader, Xlsx}; fn main() { - // Open workbook + // Open workbook let mut excel: Xlsx<_> = open_workbook("NYC_311_SR_2010-2020-sample-1M.xlsx").expect("failed to find file"); From 23f4c26077f3bb8d145643731608fec1c06c902f Mon Sep 17 00:00:00 2001 From: Eric Jolibois Date: Sun, 6 Oct 2024 21:53:32 +0200 Subject: [PATCH 10/14] feat: add helper method --- src/auto.rs | 18 ++++++++++++++++++ src/lib.rs | 3 +++ src/ods.rs | 5 +++++ src/xls.rs | 5 +++++ src/xlsb/mod.rs | 5 +++++ src/xlsx/mod.rs | 5 +++++ tests/test.rs | 16 ++++++++-------- 7 files changed, 49 insertions(+), 8 deletions(-) diff --git a/src/auto.rs b/src/auto.rs index 3c57ee35..576a5961 100644 --- a/src/auto.rs +++ b/src/auto.rs @@ -115,6 +115,24 @@ where } } + fn with_header_row(&mut self, header_row: Option) -> &mut Self { + match self { + Sheets::Xls(ref mut e) => { + e.with_header_row(header_row); + } + Sheets::Xlsx(ref mut e) => { + e.with_header_row(header_row); + } + Sheets::Xlsb(ref mut e) => { + e.with_header_row(header_row); + } + Sheets::Ods(ref mut e) => { + e.with_header_row(header_row); + } + } + self + } + /// Gets `VbaProject` fn vba_project(&mut self) -> Option, Self::Error>> { match self { diff --git a/src/lib.rs b/src/lib.rs index 046190f4..039fe003 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -246,6 +246,9 @@ where self } + /// Set current header row + fn with_header_row(&mut self, header_row: Option) -> &mut Self; + /// Gets `VbaProject` fn vba_project(&mut self) -> Option, Self::Error>>; diff --git a/src/ods.rs b/src/ods.rs index 7c1c5f44..10ef581f 100644 --- a/src/ods.rs +++ b/src/ods.rs @@ -192,6 +192,11 @@ where self.options = options; } + fn with_header_row(&mut self, header_row: Option) -> &mut Self { + self.options.header_row = header_row; + self + } + /// Gets `VbaProject` fn vba_project(&mut self) -> Option, OdsError>> { None diff --git a/src/xls.rs b/src/xls.rs index 239a0cc8..49f9a6cf 100644 --- a/src/xls.rs +++ b/src/xls.rs @@ -261,6 +261,11 @@ impl Reader for Xls { self.options = options; } + fn with_header_row(&mut self, header_row: Option) -> &mut Self { + self.options.header_row = header_row; + self + } + fn vba_project(&mut self) -> Option, XlsError>> { self.vba.as_ref().map(|vba| Ok(Cow::Borrowed(vba))) } diff --git a/src/xlsb/mod.rs b/src/xlsb/mod.rs index ac95bd60..d79e4b08 100644 --- a/src/xlsb/mod.rs +++ b/src/xlsb/mod.rs @@ -486,6 +486,11 @@ impl Reader for Xlsb { self.options = options; } + fn with_header_row(&mut self, header_row: Option) -> &mut Self { + self.options.header_row = header_row; + self + } + fn vba_project(&mut self) -> Option, XlsbError>> { self.zip.by_name("xl/vbaProject.bin").ok().map(|mut f| { let len = f.size() as usize; diff --git a/src/xlsx/mod.rs b/src/xlsx/mod.rs index e5f608d6..6b5c6e92 100644 --- a/src/xlsx/mod.rs +++ b/src/xlsx/mod.rs @@ -900,6 +900,11 @@ impl Reader for Xlsx { self.options = options; } + fn with_header_row(&mut self, header_row: Option) -> &mut Self { + self.options.header_row = header_row; + self + } + fn vba_project(&mut self) -> Option, XlsxError>> { let mut f = self.zip.by_name("xl/vbaProject.bin").ok()?; let len = f.size() as usize; diff --git a/tests/test.rs b/tests/test.rs index 59d40a17..fb69cc9a 100644 --- a/tests/test.rs +++ b/tests/test.rs @@ -1,8 +1,8 @@ use calamine::Data::{Bool, DateTime, DateTimeIso, DurationIso, Empty, Error, Float, Int, String}; use calamine::{ open_workbook, open_workbook_auto, DataRef, DataType, Dimensions, ExcelDateTime, - ExcelDateTimeType, Ods, OdsOptions, Range, Reader, ReaderOptions, ReaderRef, Sheet, SheetType, - SheetVisible, Xls, XlsOptions, Xlsb, XlsbOptions, Xlsx, XlsxOptions, + ExcelDateTimeType, Ods, Range, Reader, ReaderRef, Sheet, SheetType, SheetVisible, Xls, Xlsb, + Xlsx, }; use calamine::{CellErrorType::*, Data}; use rstest::rstest; @@ -1824,7 +1824,7 @@ fn test_header_row_xlsx( ); let range = excel - .with_options(XlsxOptions { header_row }) + .with_header_row(header_row) .worksheet_range("Sheet1") .unwrap(); assert_eq!(range.start(), Some(expected_start)); @@ -1837,11 +1837,11 @@ fn test_header_row_xlsx( fn test_read_twice_with_different_header_rows() { let mut xlsx: Xlsx<_> = wb("any_sheets.xlsx"); let _ = xlsx - .with_options(XlsxOptions::default().with_header_row(2)) + .with_header_row(Some(2)) .worksheet_range("Visible") .unwrap(); let _ = xlsx - .with_options(XlsxOptions::default().with_header_row(1)) + .with_header_row(Some(1)) .worksheet_range("Visible") .unwrap(); } @@ -1882,7 +1882,7 @@ fn test_header_row_xlsb() { assert_eq!(range.rows().nth(1).unwrap(), &second_line); let range = xlsb - .with_options(XlsbOptions::default().with_header_row(1)) + .with_header_row(Some(1)) .worksheet_range("Sheet1") .unwrap(); assert_eq!(range.start(), Some((1, 0))); @@ -1926,7 +1926,7 @@ fn test_header_row_xls() { assert_eq!(range.rows().nth(1).unwrap(), &second_line); let range = xls - .with_options(XlsOptions::default().with_header_row(1)) + .with_header_row(Some(1)) .worksheet_range("Sheet1") .unwrap(); assert_eq!(range.start(), Some((1, 0))); @@ -1956,7 +1956,7 @@ fn test_header_row_ods() { assert_eq!(range.rows().nth(2).unwrap(), &third_line); let range = ods - .with_options(OdsOptions::default().with_header_row(2)) + .with_header_row(Some(2)) .worksheet_range("Sheet1") .unwrap(); assert_eq!(range.start(), Some((2, 0))); From d56c7f9a67088123323217a75c70adf6a67f27f6 Mon Sep 17 00:00:00 2001 From: Eric Jolibois Date: Sun, 6 Oct 2024 22:01:43 +0200 Subject: [PATCH 11/14] clean --- README.md | 17 +++++++++-------- src/auto.rs | 34 ++-------------------------------- src/lib.rs | 24 +++--------------------- src/ods.rs | 19 +------------------ src/xls.rs | 29 +---------------------------- src/xlsb/mod.rs | 18 +----------------- src/xlsx/mod.rs | 18 ++---------------- 7 files changed, 19 insertions(+), 140 deletions(-) diff --git a/README.md b/README.md index 29a9aeda..530eed50 100644 --- a/README.md +++ b/README.md @@ -103,24 +103,25 @@ if let Some(Ok(r)) = excel.worksheet_range("Sheet1") { } ``` -### Reader: With options +### Reader: With header row ```rs -use calamine::{Reader, Xlsx, XlsxOptions, open_workbook}; +use calamine::{Reader, Xlsx, open_workbook}; let mut excel: Xlsx<_> = open_workbook("file.xlsx").unwrap(); let sheet1 = excel - .with_options(XlsxOptions::default().with_header_row(3)) + .with_header_row(Some(3)) .worksheet_range("Sheet1") .unwrap(); ``` -Keep in mind that `xlsx` and `xlsb` files support lazy loading, -meaning the specified options are applied immediately when reading a sheet range. -However, for `xls` and `ods` files, all sheets are loaded at once when -opening the workbook with default settings, so the options are only applied -afterward, offering no performance advantages. +Note that `xlsx` and `xlsb` files support lazy loading, so specifying a +header row takes effect immediately when reading a sheet range. +In contrast, for `xls` and `ods` files, all sheets are loaded at once when +opening the workbook with default settings. +As a result, setting the header row only applies afterward and does not +provide any performance benefits. ### Reader: More complex diff --git a/src/auto.rs b/src/auto.rs index 576a5961..b1d28fd7 100644 --- a/src/auto.rs +++ b/src/auto.rs @@ -2,10 +2,9 @@ use crate::errors::Error; use crate::vba::VbaProject; -use crate::xlsb::XlsbOptions; use crate::{ - open_workbook, open_workbook_from_rs, Data, DataRef, Metadata, Ods, OdsOptions, Range, Reader, - ReaderOptions, ReaderRef, Xls, XlsOptions, Xlsb, Xlsx, XlsxOptions, + open_workbook, open_workbook_from_rs, Data, DataRef, Metadata, Ods, Range, Reader, ReaderRef, + Xls, Xlsb, Xlsx, }; use std::borrow::Cow; use std::fs::File; @@ -75,46 +74,17 @@ where } } -pub enum AutoReaderOptions { - Xls(XlsOptions), - Xlsx(XlsxOptions), - Xlsb(XlsbOptions), - Ods(OdsOptions), -} - -impl ReaderOptions for AutoReaderOptions { - fn with_header_row(self, header_row: u32) -> Self { - match self { - AutoReaderOptions::Xls(e) => AutoReaderOptions::Xls(e.with_header_row(header_row)), - AutoReaderOptions::Xlsx(e) => AutoReaderOptions::Xlsx(e.with_header_row(header_row)), - AutoReaderOptions::Xlsb(e) => AutoReaderOptions::Xlsb(e.with_header_row(header_row)), - AutoReaderOptions::Ods(e) => AutoReaderOptions::Ods(e.with_header_row(header_row)), - } - } -} - impl Reader for Sheets where RS: std::io::Read + std::io::Seek, { type Error = Error; - type Options = AutoReaderOptions; /// Creates a new instance. fn new(_reader: RS) -> Result { Err(Error::Msg("Sheets must be created from a Path")) } - fn set_options(&mut self, options: Self::Options) { - match (self, options) { - (Sheets::Xls(ref mut e), AutoReaderOptions::Xls(opts)) => e.set_options(opts), - (Sheets::Xlsx(ref mut e), AutoReaderOptions::Xlsx(opts)) => e.set_options(opts), - (Sheets::Xlsb(ref mut e), AutoReaderOptions::Xlsb(opts)) => e.set_options(opts), - (Sheets::Ods(ref mut e), AutoReaderOptions::Ods(opts)) => e.set_options(opts), - _ => unreachable!(), - } - } - fn with_header_row(&mut self, header_row: Option) -> &mut Self { match self { Sheets::Xls(ref mut e) => { diff --git a/src/lib.rs b/src/lib.rs index 039fe003..be971ffe 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -86,10 +86,10 @@ pub use crate::auto::{open_workbook_auto, open_workbook_auto_from_rs, Sheets}; pub use crate::datatype::{Data, DataRef, DataType, ExcelDateTime, ExcelDateTimeType}; pub use crate::de::{DeError, RangeDeserializer, RangeDeserializerBuilder, ToCellDeserializer}; pub use crate::errors::Error; -pub use crate::ods::{Ods, OdsError, OdsOptions}; +pub use crate::ods::{Ods, OdsError}; pub use crate::xls::{Xls, XlsError, XlsOptions}; -pub use crate::xlsb::{Xlsb, XlsbError, XlsbOptions}; -pub use crate::xlsx::{Xlsx, XlsxError, XlsxOptions}; +pub use crate::xlsb::{Xlsb, XlsbError}; +pub use crate::xlsx::{Xlsx, XlsxError}; use crate::vba::VbaProject; @@ -215,12 +215,6 @@ pub struct Sheet { pub visible: SheetVisible, } -/// A trait to share reader options across different `FileType`s -pub trait ReaderOptions: Sized { - /// Set the header row - fn with_header_row(self, _header_row: u32) -> Self; -} - // FIXME `Reader` must only be seek `Seek` for `Xls::xls`. Because of the present API this limits // the kinds of readers (other) data in formats can be read from. /// A trait to share spreadsheets reader functions across different `FileType`s @@ -231,21 +225,9 @@ where /// Error specific to file type type Error: std::fmt::Debug + From; - /// Options specific to file type - type Options: ReaderOptions; - /// Creates a new instance. fn new(reader: RS) -> Result; - /// Set options - fn set_options(&mut self, options: Self::Options); - - /// Set options and return the reader - fn with_options(&mut self, options: Self::Options) -> &mut Self { - self.set_options(options); - self - } - /// Set current header row fn with_header_row(&mut self, header_row: Option) -> &mut Self; diff --git a/src/ods.rs b/src/ods.rs index 10ef581f..003bcef4 100644 --- a/src/ods.rs +++ b/src/ods.rs @@ -16,9 +16,7 @@ use zip::read::{ZipArchive, ZipFile}; use zip::result::ZipError; use crate::vba::VbaProject; -use crate::{ - Data, DataType, Metadata, Range, Reader, ReaderOptions, Sheet, SheetType, SheetVisible, -}; +use crate::{Data, DataType, Metadata, Range, Reader, Sheet, SheetType, SheetVisible}; use std::marker::PhantomData; const MIMETYPE: &[u8] = b"application/vnd.oasis.opendocument.spreadsheet"; @@ -72,15 +70,6 @@ pub struct OdsOptions { pub header_row: Option, } -impl ReaderOptions for OdsOptions { - /// Set the header row index - fn with_header_row(self, header_row: u32) -> Self { - Self { - header_row: Some(header_row), - } - } -} - from_err!(std::io::Error, OdsError, Io); from_err!(zip::result::ZipError, OdsError, Zip); from_err!(quick_xml::Error, OdsError, Xml); @@ -144,7 +133,6 @@ where RS: Read + Seek, { type Error = OdsError; - type Options = OdsOptions; fn new(reader: RS) -> Result { let mut zip = ZipArchive::new(reader)?; @@ -187,11 +175,6 @@ where }) } - /// Set options - fn set_options(&mut self, options: Self::Options) { - self.options = options; - } - fn with_header_row(&mut self, header_row: Option) -> &mut Self { self.options.header_row = header_row; self diff --git a/src/xls.rs b/src/xls.rs index 49f9a6cf..761b6cc5 100644 --- a/src/xls.rs +++ b/src/xls.rs @@ -17,8 +17,7 @@ use crate::utils::read_usize; use crate::utils::{push_column, read_f64, read_i16, read_i32, read_u16, read_u32}; use crate::vba::VbaProject; use crate::{ - Cell, CellErrorType, Data, Dimensions, Metadata, Range, Reader, ReaderOptions, Sheet, - SheetType, SheetVisible, + Cell, CellErrorType, Data, Dimensions, Metadata, Range, Reader, Sheet, SheetType, SheetVisible, }; #[derive(Debug)] @@ -142,27 +141,6 @@ pub struct XlsOptions { pub header_row: Option, } -impl XlsOptions { - #[allow(dead_code)] - /// Set the code page - fn with_codepage(self, codepage: u16) -> Self { - Self { - force_codepage: Some(codepage), - ..self - } - } -} - -impl ReaderOptions for XlsOptions { - /// Set the header row index - fn with_header_row(self, header_row: u32) -> Self { - Self { - header_row: Some(header_row), - ..self - } - } -} - struct SheetData { range: Range, formula: Range, @@ -251,16 +229,11 @@ impl Xls { impl Reader for Xls { type Error = XlsError; - type Options = XlsOptions; fn new(reader: RS) -> Result { Self::new_with_options(reader, XlsOptions::default()) } - fn set_options(&mut self, options: Self::Options) { - self.options = options; - } - fn with_header_row(&mut self, header_row: Option) -> &mut Self { self.options.header_row = header_row; self diff --git a/src/xlsb/mod.rs b/src/xlsb/mod.rs index d79e4b08..9bf33474 100644 --- a/src/xlsb/mod.rs +++ b/src/xlsb/mod.rs @@ -20,9 +20,7 @@ use crate::datatype::DataRef; use crate::formats::{builtin_format_by_code, detect_custom_number_format, CellFormat}; use crate::utils::{push_column, read_f64, read_i32, read_u16, read_u32, read_usize}; use crate::vba::VbaProject; -use crate::{ - Cell, Data, Metadata, Range, Reader, ReaderOptions, ReaderRef, Sheet, SheetType, SheetVisible, -}; +use crate::{Cell, Data, Metadata, Range, Reader, ReaderRef, Sheet, SheetType, SheetVisible}; /// A Xlsb specific error #[derive(Debug)] @@ -138,15 +136,6 @@ pub struct XlsbOptions { pub header_row: Option, } -impl ReaderOptions for XlsbOptions { - /// Set the header row index - fn with_header_row(self, header_row: u32) -> Self { - Self { - header_row: Some(header_row), - } - } -} - /// A Xlsb reader pub struct Xlsb { zip: ZipArchive, @@ -455,7 +444,6 @@ impl Xlsb { impl Reader for Xlsb { type Error = XlsbError; - type Options = XlsbOptions; fn new(mut reader: RS) -> Result { check_for_password_protected(&mut reader)?; @@ -482,10 +470,6 @@ impl Reader for Xlsb { Ok(xlsb) } - fn set_options(&mut self, options: Self::Options) { - self.options = options; - } - fn with_header_row(&mut self, header_row: Option) -> &mut Self { self.options.header_row = header_row; self diff --git a/src/xlsx/mod.rs b/src/xlsx/mod.rs index 6b5c6e92..1e5436c9 100644 --- a/src/xlsx/mod.rs +++ b/src/xlsx/mod.rs @@ -18,8 +18,8 @@ use crate::datatype::DataRef; use crate::formats::{builtin_format_by_id, detect_custom_number_format, CellFormat}; use crate::vba::VbaProject; use crate::{ - Cell, CellErrorType, Data, Dimensions, Metadata, Range, Reader, ReaderOptions, ReaderRef, - Sheet, SheetType, SheetVisible, Table, + Cell, CellErrorType, Data, Dimensions, Metadata, Range, Reader, ReaderRef, Sheet, SheetType, + SheetVisible, Table, }; pub use cells_reader::XlsxCellReader; @@ -209,15 +209,6 @@ pub struct XlsxOptions { pub header_row: Option, } -impl ReaderOptions for XlsxOptions { - /// Set the header row index - fn with_header_row(self, header_row: u32) -> Self { - Self { - header_row: Some(header_row), - } - } -} - impl Xlsx { fn read_shared_strings(&mut self) -> Result<(), XlsxError> { let mut xml = match xml_reader(&mut self.zip, "xl/sharedStrings.xml") { @@ -868,7 +859,6 @@ impl Xlsx { impl Reader for Xlsx { type Error = XlsxError; - type Options = XlsxOptions; fn new(mut reader: RS) -> Result { check_for_password_protected(&mut reader)?; @@ -896,10 +886,6 @@ impl Reader for Xlsx { Ok(xlsx) } - fn set_options(&mut self, options: Self::Options) { - self.options = options; - } - fn with_header_row(&mut self, header_row: Option) -> &mut Self { self.options.header_row = header_row; self From d4df81a0439a930dd88a71d5e3384e72fc49460e Mon Sep 17 00:00:00 2001 From: Eric Jolibois Date: Sun, 6 Oct 2024 22:28:24 +0200 Subject: [PATCH 12/14] fix: doc --- src/lib.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/lib.rs b/src/lib.rs index be971ffe..7dc815c8 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -228,7 +228,8 @@ where /// Creates a new instance. fn new(reader: RS) -> Result; - /// Set current header row + /// Set header row (i.e. first row to be read) + /// If `header_row` is `None`, the first non-empty row will be used as header row fn with_header_row(&mut self, header_row: Option) -> &mut Self; /// Gets `VbaProject` From 0d63b39a16c9c5c514af184f5d436233520ba4b1 Mon Sep 17 00:00:00 2001 From: Eric Jolibois Date: Mon, 7 Oct 2024 11:35:29 +0200 Subject: [PATCH 13/14] chore: options are not exported anymore --- src/ods.rs | 4 +--- src/xlsb/mod.rs | 4 +--- src/xlsx/mod.rs | 4 +--- 3 files changed, 3 insertions(+), 9 deletions(-) diff --git a/src/ods.rs b/src/ods.rs index 003bcef4..d18c4624 100644 --- a/src/ods.rs +++ b/src/ods.rs @@ -64,9 +64,7 @@ pub enum OdsError { /// Ods reader options #[derive(Debug, Default)] -pub struct OdsOptions { - /// Index of the header row - /// If not set, the first non-empty row is considered the header row +struct OdsOptions { pub header_row: Option, } diff --git a/src/xlsb/mod.rs b/src/xlsb/mod.rs index 9bf33474..492ea29f 100644 --- a/src/xlsb/mod.rs +++ b/src/xlsb/mod.rs @@ -130,9 +130,7 @@ impl std::error::Error for XlsbError { /// Xlsb reader options #[derive(Debug, Default)] -pub struct XlsbOptions { - /// Index of the header row - /// If not set, the first non-empty row is considered the header row +struct XlsbOptions { pub header_row: Option, } diff --git a/src/xlsx/mod.rs b/src/xlsx/mod.rs index 1e5436c9..7d5d484a 100644 --- a/src/xlsx/mod.rs +++ b/src/xlsx/mod.rs @@ -203,9 +203,7 @@ pub struct Xlsx { /// Xlsx reader options #[derive(Debug, Default)] -pub struct XlsxOptions { - /// Index of the header row - /// If not set, the first non-empty row is considered the header row +struct XlsxOptions { pub header_row: Option, } From b6f91e026a94bb94bdbd4961dd23a6e6cf4a7e33 Mon Sep 17 00:00:00 2001 From: Eric Jolibois Date: Mon, 7 Oct 2024 11:50:27 +0200 Subject: [PATCH 14/14] refactor: switch to enum for header row --- src/auto.rs | 6 ++-- src/lib.rs | 19 ++++++++++- src/ods.rs | 23 +++++++------ src/xls.rs | 26 +++++++------- src/xlsb/mod.rs | 91 ++++++++++++++++++++++++++----------------------- src/xlsx/mod.rs | 90 +++++++++++++++++++++++++----------------------- tests/test.rs | 30 ++++++++-------- 7 files changed, 159 insertions(+), 126 deletions(-) diff --git a/src/auto.rs b/src/auto.rs index b1d28fd7..824abf16 100644 --- a/src/auto.rs +++ b/src/auto.rs @@ -3,8 +3,8 @@ use crate::errors::Error; use crate::vba::VbaProject; use crate::{ - open_workbook, open_workbook_from_rs, Data, DataRef, Metadata, Ods, Range, Reader, ReaderRef, - Xls, Xlsb, Xlsx, + open_workbook, open_workbook_from_rs, Data, DataRef, HeaderRow, Metadata, Ods, Range, Reader, + ReaderRef, Xls, Xlsb, Xlsx, }; use std::borrow::Cow; use std::fs::File; @@ -85,7 +85,7 @@ where Err(Error::Msg("Sheets must be created from a Path")) } - fn with_header_row(&mut self, header_row: Option) -> &mut Self { + fn with_header_row(&mut self, header_row: HeaderRow) -> &mut Self { match self { Sheets::Xls(ref mut e) => { e.with_header_row(header_row); diff --git a/src/lib.rs b/src/lib.rs index 7dc815c8..f8849326 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -215,6 +215,23 @@ pub struct Sheet { pub visible: SheetVisible, } +/// Row to use as header +/// By default, the first non-empty row is used as header +#[derive(Debug, Clone, Copy)] +#[non_exhaustive] +pub enum HeaderRow { + /// First non-empty row + FirstNonEmptyRow, + /// Index of the header row + Row(u32), +} + +impl Default for HeaderRow { + fn default() -> Self { + HeaderRow::FirstNonEmptyRow + } +} + // FIXME `Reader` must only be seek `Seek` for `Xls::xls`. Because of the present API this limits // the kinds of readers (other) data in formats can be read from. /// A trait to share spreadsheets reader functions across different `FileType`s @@ -230,7 +247,7 @@ where /// Set header row (i.e. first row to be read) /// If `header_row` is `None`, the first non-empty row will be used as header row - fn with_header_row(&mut self, header_row: Option) -> &mut Self; + fn with_header_row(&mut self, header_row: HeaderRow) -> &mut Self; /// Gets `VbaProject` fn vba_project(&mut self) -> Option, Self::Error>>; diff --git a/src/ods.rs b/src/ods.rs index d18c4624..aa34a9b8 100644 --- a/src/ods.rs +++ b/src/ods.rs @@ -16,7 +16,7 @@ use zip::read::{ZipArchive, ZipFile}; use zip::result::ZipError; use crate::vba::VbaProject; -use crate::{Data, DataType, Metadata, Range, Reader, Sheet, SheetType, SheetVisible}; +use crate::{Data, DataType, HeaderRow, Metadata, Range, Reader, Sheet, SheetType, SheetVisible}; use std::marker::PhantomData; const MIMETYPE: &[u8] = b"application/vnd.oasis.opendocument.spreadsheet"; @@ -64,8 +64,9 @@ pub enum OdsError { /// Ods reader options #[derive(Debug, Default)] +#[non_exhaustive] struct OdsOptions { - pub header_row: Option, + pub header_row: HeaderRow, } from_err!(std::io::Error, OdsError, Io); @@ -173,7 +174,7 @@ where }) } - fn with_header_row(&mut self, header_row: Option) -> &mut Self { + fn with_header_row(&mut self, header_row: HeaderRow) -> &mut Self { self.options.header_row = header_row; self } @@ -197,15 +198,17 @@ where .0 .to_owned(); - // If a header_row is defined, adjust the range - if let Some(header_row) = self.options.header_row { - if let (Some(start), Some(end)) = (sheet.start(), sheet.end()) { - return Ok(sheet.range((header_row, start.1), end)); + match self.options.header_row { + HeaderRow::FirstNonEmptyRow => Ok(sheet), + HeaderRow::Row(header_row_idx) => { + // If `header_row` is a row index, adjust the range + if let (Some(start), Some(end)) = (sheet.start(), sheet.end()) { + Ok(sheet.range((header_row_idx, start.1), end)) + } else { + Ok(sheet) + } } } - - // Return the original range if no header row is set - Ok(sheet) } fn worksheets(&mut self) -> Vec<(String, Range)> { diff --git a/src/xls.rs b/src/xls.rs index 761b6cc5..a04fcbe1 100644 --- a/src/xls.rs +++ b/src/xls.rs @@ -17,7 +17,8 @@ use crate::utils::read_usize; use crate::utils::{push_column, read_f64, read_i16, read_i32, read_u16, read_u32}; use crate::vba::VbaProject; use crate::{ - Cell, CellErrorType, Data, Dimensions, Metadata, Range, Reader, Sheet, SheetType, SheetVisible, + Cell, CellErrorType, Data, Dimensions, HeaderRow, Metadata, Range, Reader, Sheet, SheetType, + SheetVisible, }; #[derive(Debug)] @@ -136,9 +137,8 @@ pub struct XlsOptions { /// /// [code page]: https://docs.microsoft.com/en-us/windows/win32/intl/code-page-identifiers pub force_codepage: Option, - /// Index of the header row - /// If not set, the first non-empty row is considered the header row - pub header_row: Option, + /// Row to use as header + pub header_row: HeaderRow, } struct SheetData { @@ -234,7 +234,7 @@ impl Reader for Xls { Self::new_with_options(reader, XlsOptions::default()) } - fn with_header_row(&mut self, header_row: Option) -> &mut Self { + fn with_header_row(&mut self, header_row: HeaderRow) -> &mut Self { self.options.header_row = header_row; self } @@ -255,15 +255,17 @@ impl Reader for Xls { .map(|r| r.range.clone()) .ok_or_else(|| XlsError::WorksheetNotFound(name.into()))?; - // If a header_row is defined, adjust the range - if let Some(header_row) = self.options.header_row { - if let (Some(start), Some(end)) = (sheet.start(), sheet.end()) { - return Ok(sheet.range((header_row, start.1), end)); + match self.options.header_row { + HeaderRow::FirstNonEmptyRow => Ok(sheet), + HeaderRow::Row(header_row_idx) => { + // If `header_row` is a row index, adjust the range + if let (Some(start), Some(end)) = (sheet.start(), sheet.end()) { + Ok(sheet.range((header_row_idx, start.1), end)) + } else { + Ok(sheet) + } } } - - // Return the original range if no header row is set - Ok(sheet) } fn worksheets(&mut self) -> Vec<(String, Range)> { diff --git a/src/xlsb/mod.rs b/src/xlsb/mod.rs index 492ea29f..3de08a45 100644 --- a/src/xlsb/mod.rs +++ b/src/xlsb/mod.rs @@ -20,7 +20,9 @@ use crate::datatype::DataRef; use crate::formats::{builtin_format_by_code, detect_custom_number_format, CellFormat}; use crate::utils::{push_column, read_f64, read_i32, read_u16, read_u32, read_usize}; use crate::vba::VbaProject; -use crate::{Cell, Data, Metadata, Range, Reader, ReaderRef, Sheet, SheetType, SheetVisible}; +use crate::{ + Cell, Data, HeaderRow, Metadata, Range, Reader, ReaderRef, Sheet, SheetType, SheetVisible, +}; /// A Xlsb specific error #[derive(Debug)] @@ -130,8 +132,9 @@ impl std::error::Error for XlsbError { /// Xlsb reader options #[derive(Debug, Default)] +#[non_exhaustive] struct XlsbOptions { - pub header_row: Option, + pub header_row: HeaderRow, } /// A Xlsb reader @@ -468,7 +471,7 @@ impl Reader for Xlsb { Ok(xlsb) } - fn with_header_row(&mut self, header_row: Option) -> &mut Self { + fn with_header_row(&mut self, header_row: HeaderRow) -> &mut Self { self.options.header_row = header_row; self } @@ -541,52 +544,56 @@ impl ReaderRef for Xlsb { cells.reserve(len as usize); } - // If `header_row` is set, we only add non-empty cells after the `header_row`. - if let Some(header_row) = header_row { - loop { - match cell_reader.next_cell() { - Ok(Some(Cell { - val: DataRef::Empty, - .. - })) => (), - Ok(Some(cell)) => { - if cell.pos.0 >= header_row { - cells.push(cell); - } + match header_row { + HeaderRow::FirstNonEmptyRow => { + // the header row is the row of the first non-empty cell + loop { + match cell_reader.next_cell() { + Ok(Some(Cell { + val: DataRef::Empty, + .. + })) => (), + Ok(Some(cell)) => cells.push(cell), + Ok(None) => break, + Err(e) => return Err(e), } - Ok(None) => break, - Err(e) => return Err(e), } } + HeaderRow::Row(header_row_idx) => { + // If `header_row` is a row index, we only add non-empty cells after this index. + loop { + match cell_reader.next_cell() { + Ok(Some(Cell { + val: DataRef::Empty, + .. + })) => (), + Ok(Some(cell)) => { + if cell.pos.0 >= header_row_idx { + cells.push(cell); + } + } + Ok(None) => break, + Err(e) => return Err(e), + } + } - // If `header_row` is set and the first non-empty cell is not at the `header_row`, we add - // an empty cell at the beginning with row `header_row` and same column as the first non-empty cell. - if cells.first().map_or(false, |c| c.pos.0 != header_row) { - cells.insert( - header_row as usize, - Cell { - pos: ( - header_row, - cells.first().expect("cells should not be empty").pos.1, - ), - val: DataRef::Empty, - }, - ); - } - // If `header_row` is not specified (default), the header row is the row of the first non-empty cell. - } else { - loop { - match cell_reader.next_cell() { - Ok(Some(Cell { - val: DataRef::Empty, - .. - })) => (), - Ok(Some(cell)) => cells.push(cell), - Ok(None) => break, - Err(e) => return Err(e), + // If `header_row` is set and the first non-empty cell is not at the `header_row`, we add + // an empty cell at the beginning with row `header_row` and same column as the first non-empty cell. + if cells.first().map_or(false, |c| c.pos.0 != header_row_idx) { + cells.insert( + header_row_idx as usize, + Cell { + pos: ( + header_row_idx, + cells.first().expect("cells should not be empty").pos.1, + ), + val: DataRef::Empty, + }, + ); } } } + Ok(Range::from_sparse(cells)) } } diff --git a/src/xlsx/mod.rs b/src/xlsx/mod.rs index 7d5d484a..78d1ae9e 100644 --- a/src/xlsx/mod.rs +++ b/src/xlsx/mod.rs @@ -18,8 +18,8 @@ use crate::datatype::DataRef; use crate::formats::{builtin_format_by_id, detect_custom_number_format, CellFormat}; use crate::vba::VbaProject; use crate::{ - Cell, CellErrorType, Data, Dimensions, Metadata, Range, Reader, ReaderRef, Sheet, SheetType, - SheetVisible, Table, + Cell, CellErrorType, Data, Dimensions, HeaderRow, Metadata, Range, Reader, ReaderRef, Sheet, + SheetType, SheetVisible, Table, }; pub use cells_reader::XlsxCellReader; @@ -203,8 +203,9 @@ pub struct Xlsx { /// Xlsx reader options #[derive(Debug, Default)] +#[non_exhaustive] struct XlsxOptions { - pub header_row: Option, + pub header_row: HeaderRow, } impl Xlsx { @@ -884,7 +885,7 @@ impl Reader for Xlsx { Ok(xlsx) } - fn with_header_row(&mut self, header_row: Option) -> &mut Self { + fn with_header_row(&mut self, header_row: HeaderRow) -> &mut Self { self.options.header_row = header_row; self } @@ -973,49 +974,52 @@ impl ReaderRef for Xlsx { cells.reserve(len as usize); } - // If `header_row` is set, we only add non-empty cells after the `header_row`. - if let Some(header_row) = header_row { - loop { - match cell_reader.next_cell() { - Ok(Some(Cell { - val: DataRef::Empty, - .. - })) => (), - Ok(Some(cell)) => { - if cell.pos.0 >= header_row { - cells.push(cell); - } + match header_row { + HeaderRow::FirstNonEmptyRow => { + // the header row is the row of the first non-empty cell + loop { + match cell_reader.next_cell() { + Ok(Some(Cell { + val: DataRef::Empty, + .. + })) => (), + Ok(Some(cell)) => cells.push(cell), + Ok(None) => break, + Err(e) => return Err(e), } - Ok(None) => break, - Err(e) => return Err(e), } } + HeaderRow::Row(header_row_idx) => { + // If `header_row` is a row index, we only add non-empty cells after this index. + loop { + match cell_reader.next_cell() { + Ok(Some(Cell { + val: DataRef::Empty, + .. + })) => (), + Ok(Some(cell)) => { + if cell.pos.0 >= header_row_idx { + cells.push(cell); + } + } + Ok(None) => break, + Err(e) => return Err(e), + } + } - // If `header_row` is set and the first non-empty cell is not at the `header_row`, we add - // an empty cell at the beginning with row `header_row` and same column as the first non-empty cell. - if cells.first().map_or(false, |c| c.pos.0 != header_row) { - cells.insert( - header_row as usize, - Cell { - pos: ( - header_row, - cells.first().expect("cells should not be empty").pos.1, - ), - val: DataRef::Empty, - }, - ); - } - // If `header_row` is not specified (default), the header row is the row of the first non-empty cell. - } else { - loop { - match cell_reader.next_cell() { - Ok(Some(Cell { - val: DataRef::Empty, - .. - })) => (), - Ok(Some(cell)) => cells.push(cell), - Ok(None) => break, - Err(e) => return Err(e), + // If `header_row` is set and the first non-empty cell is not at the `header_row`, we add + // an empty cell at the beginning with row `header_row` and same column as the first non-empty cell. + if cells.first().map_or(false, |c| c.pos.0 != header_row_idx) { + cells.insert( + header_row_idx as usize, + Cell { + pos: ( + header_row_idx, + cells.first().expect("cells should not be empty").pos.1, + ), + val: DataRef::Empty, + }, + ); } } } diff --git a/tests/test.rs b/tests/test.rs index fb69cc9a..95a4643c 100644 --- a/tests/test.rs +++ b/tests/test.rs @@ -1,8 +1,8 @@ use calamine::Data::{Bool, DateTime, DateTimeIso, DurationIso, Empty, Error, Float, Int, String}; use calamine::{ open_workbook, open_workbook_auto, DataRef, DataType, Dimensions, ExcelDateTime, - ExcelDateTimeType, Ods, Range, Reader, ReaderRef, Sheet, SheetType, SheetVisible, Xls, Xlsb, - Xlsx, + ExcelDateTimeType, HeaderRow, Ods, Range, Reader, ReaderRef, Sheet, SheetType, SheetVisible, + Xls, Xlsb, Xlsx, }; use calamine::{CellErrorType::*, Data}; use rstest::rstest; @@ -1798,16 +1798,16 @@ fn test_ref_xlsb() { } #[rstest] -#[case("header-row.xlsx", None, (2, 0), (9, 3), &[Empty, Empty, String("Note 1".to_string()), Empty], 32)] -#[case("header-row.xlsx", Some(0), (0, 0), (9, 3), &[Empty, Empty, Empty, Empty], 40)] -#[case("header-row.xlsx", Some(8), (8, 0), (9, 3), &[String("Columns".to_string()), String("Column A".to_string()), String("Column B".to_string()), String("Column C".to_string())], 8)] -#[case("temperature.xlsx", None, (0, 0), (2, 1), &[String("label".to_string()), String("value".to_string())], 6)] -#[case("temperature.xlsx", Some(0), (0, 0), (2, 1), &[String("label".to_string()), String("value".to_string())], 6)] -#[case("temperature-in-middle.xlsx", None, (3, 1), (5, 2), &[String("label".to_string()), String("value".to_string())], 6)] -#[case("temperature-in-middle.xlsx", Some(0), (0, 1), (5, 2), &[Empty, Empty], 12)] +#[case("header-row.xlsx", HeaderRow::FirstNonEmptyRow, (2, 0), (9, 3), &[Empty, Empty, String("Note 1".to_string()), Empty], 32)] +#[case("header-row.xlsx", HeaderRow::Row(0), (0, 0), (9, 3), &[Empty, Empty, Empty, Empty], 40)] +#[case("header-row.xlsx", HeaderRow::Row(8), (8, 0), (9, 3), &[String("Columns".to_string()), String("Column A".to_string()), String("Column B".to_string()), String("Column C".to_string())], 8)] +#[case("temperature.xlsx", HeaderRow::FirstNonEmptyRow, (0, 0), (2, 1), &[String("label".to_string()), String("value".to_string())], 6)] +#[case("temperature.xlsx", HeaderRow::Row(0), (0, 0), (2, 1), &[String("label".to_string()), String("value".to_string())], 6)] +#[case("temperature-in-middle.xlsx", HeaderRow::FirstNonEmptyRow, (3, 1), (5, 2), &[String("label".to_string()), String("value".to_string())], 6)] +#[case("temperature-in-middle.xlsx", HeaderRow::Row(0), (0, 1), (5, 2), &[Empty, Empty], 12)] fn test_header_row_xlsx( #[case] fixture_path: &str, - #[case] header_row: Option, + #[case] header_row: HeaderRow, #[case] expected_start: (u32, u32), #[case] expected_end: (u32, u32), #[case] expected_first_row: &[Data], @@ -1837,11 +1837,11 @@ fn test_header_row_xlsx( fn test_read_twice_with_different_header_rows() { let mut xlsx: Xlsx<_> = wb("any_sheets.xlsx"); let _ = xlsx - .with_header_row(Some(2)) + .with_header_row(HeaderRow::Row(2)) .worksheet_range("Visible") .unwrap(); let _ = xlsx - .with_header_row(Some(1)) + .with_header_row(HeaderRow::Row(1)) .worksheet_range("Visible") .unwrap(); } @@ -1882,7 +1882,7 @@ fn test_header_row_xlsb() { assert_eq!(range.rows().nth(1).unwrap(), &second_line); let range = xlsb - .with_header_row(Some(1)) + .with_header_row(HeaderRow::Row(1)) .worksheet_range("Sheet1") .unwrap(); assert_eq!(range.start(), Some((1, 0))); @@ -1926,7 +1926,7 @@ fn test_header_row_xls() { assert_eq!(range.rows().nth(1).unwrap(), &second_line); let range = xls - .with_header_row(Some(1)) + .with_header_row(HeaderRow::Row(1)) .worksheet_range("Sheet1") .unwrap(); assert_eq!(range.start(), Some((1, 0))); @@ -1956,7 +1956,7 @@ fn test_header_row_ods() { assert_eq!(range.rows().nth(2).unwrap(), &third_line); let range = ods - .with_header_row(Some(2)) + .with_header_row(HeaderRow::Row(2)) .worksheet_range("Sheet1") .unwrap(); assert_eq!(range.start(), Some((2, 0)));