diff --git a/benchmarks/NPOI.Benchmarks/LargeExcelFileBenchmark.cs b/benchmarks/NPOI.Benchmarks/LargeExcelFileBenchmark.cs index 4a63ac15f0..65a4e0160c 100644 --- a/benchmarks/NPOI.Benchmarks/LargeExcelFileBenchmark.cs +++ b/benchmarks/NPOI.Benchmarks/LargeExcelFileBenchmark.cs @@ -14,6 +14,11 @@ public class LargeExcelFileBenchmark private static MemoryStream _memoryStream; private string _filePath; + // 36 MB .xlsx sourced from https://github.com/mini-software/MiniExcel/tree/master/benchmarks/MiniExcel.Benchmarks + // 1,000,000 rows × 10 cols, all cells are shared strings → uniqueCount=1,000,000 + // Uncompressed sharedStrings.xml is ~31 MB, making SST the dominant parse cost. + private string _largeFileWithSstPath; + [GlobalSetup] public void GlobalSetup() { @@ -37,6 +42,8 @@ public void GlobalSetup() _loadedWorkBook = new XSSFWorkbook(copyPath); _memoryStream = new MemoryStream(); + + _largeFileWithSstPath = Path.Combine("data", "Test1000000x10_SharingStrings.xlsx"); } [Benchmark] @@ -46,6 +53,35 @@ public void XSSFWorkbookLoad() workbook.Dispose(); } + /// + /// Opens a 36 MB workbook whose sharedStrings.xml decompresses to ~31 MB + /// (1,000,000 unique strings) and immediately disposes without reading any cells. + /// With lazy SST loading the shared strings table is never parsed, so this + /// benchmark represents the minimum overhead of opening the workbook. + /// Compare with which forces + /// SST parsing to be able to read cell values. + /// + [Benchmark] + public void XSSFWorkbookLargeSstOpenDispose() + { + using var workbook = new XSSFWorkbook(_largeFileWithSstPath, true); + } + + /// + /// Opens the same 36 MB workbook and explicitly forces the SST to load by + /// accessing . + /// This is the baseline that shows how expensive eager DOM-based SST parsing + /// would be; with lazy loading + streaming parser the cost is deferred and + /// reduced in allocation compared to the old DOM path. + /// + [Benchmark] + public void XSSFWorkbookLargeSstLoadStrings() + { + using var workbook = new XSSFWorkbook(_largeFileWithSstPath, true); + // Force SST parse + _ = workbook.GetSharedStringSource().Count; + } + [Benchmark] public void XSSFReaderLoad() { diff --git a/benchmarks/NPOI.Benchmarks/NPOI.Benchmarks.csproj b/benchmarks/NPOI.Benchmarks/NPOI.Benchmarks.csproj index ad3b78fece..0cc19034c8 100644 --- a/benchmarks/NPOI.Benchmarks/NPOI.Benchmarks.csproj +++ b/benchmarks/NPOI.Benchmarks/NPOI.Benchmarks.csproj @@ -19,6 +19,9 @@ Always + + Always + diff --git a/benchmarks/NPOI.Benchmarks/data/Test1000000x10_SharingStrings.xlsx b/benchmarks/NPOI.Benchmarks/data/Test1000000x10_SharingStrings.xlsx new file mode 100644 index 0000000000..1675417f73 Binary files /dev/null and b/benchmarks/NPOI.Benchmarks/data/Test1000000x10_SharingStrings.xlsx differ diff --git a/ooxml/XSSF/Model/SharedStringsTable.cs b/ooxml/XSSF/Model/SharedStringsTable.cs index 581453bc9b..8470f3e292 100644 --- a/ooxml/XSSF/Model/SharedStringsTable.cs +++ b/ooxml/XSSF/Model/SharedStringsTable.cs @@ -18,14 +18,14 @@ limitations under the License. namespace NPOI.XSSF.Model { + using System.Buffers; using System.Collections.Generic; + using System.Globalization; using OpenXmlFormats.Spreadsheet; using System; using System.IO; using NPOI.OpenXml4Net.OPC; using System.Xml; - using System.Security; - using System.Text.RegularExpressions; using System.Text; /** @@ -61,7 +61,8 @@ public class SharedStringsTable : POIXMLDocumentPart private readonly List strings = new List(); /** - * Maps strings and their indexes in the strings arrays + * Maps strings and their indexes in the strings arrays. + * Built lazily on the first call to AddEntry(). */ private readonly Dictionary stmap = new Dictionary(); @@ -80,46 +81,437 @@ public class SharedStringsTable : POIXMLDocumentPart private SstDocument _sstDoc; + // Lazy-loading state + private PackagePart _loadPart; + private bool _loaded; + private bool _dirty; + private bool _stmapBuilt; + public SharedStringsTable() : base() { - _sstDoc = new SstDocument(); _sstDoc.AddNewSst(); + _loaded = true; + _stmapBuilt = true; } public SharedStringsTable(PackagePart part) : base(part) { - ReadFrom(part.GetInputStream()); + // Defer full parsing until first access (lazy loading). + // However, perform an early security scan to detect DOCTYPE entity + // expansion attacks (e.g. "billion laughs") in the SST, matching + // the behaviour of ConvertStreamToXml / LoadXmlSafe. + _loadPart = part; + ValidateSstSecurity(part); } [Obsolete("deprecated in POI 3.14, scheduled for removal in POI 3.16")] public SharedStringsTable(PackagePart part, PackageRelationship rel) : this(part) { + } + /// + /// Performs a lightweight security scan of the SST part to reject files that + /// contain DOCTYPE entity declarations (e.g. "billion laughs" / XML bomb attacks). + /// This is a fast, forward-only scan that does not build any object model. + /// + private static void ValidateSstSecurity(PackagePart part) + { + Stream stream = null; + try + { + stream = part.GetInputStream(); + if (stream == null || (stream.CanSeek && stream.Length == 0)) + return; + + var settings = new XmlReaderSettings + { + DtdProcessing = DtdProcessing.Prohibit, + XmlResolver = null, + IgnoreWhitespace = true, + IgnoreComments = true, + IgnoreProcessingInstructions = true, + }; + using var reader = XmlReader.Create(stream, settings); + // Reading up to the first Element node is sufficient: + // DtdProcessing.Prohibit throws when + /// Ensures stmap is consistent with the strings list. Called lazily before AddEntry. + /// + private void EnsureStmapBuilt() + { + if (_stmapBuilt) return; + _stmapBuilt = true; + for (int i = 0; i < strings.Count; i++) + { + string key = GetKey(strings[i]); + if (key != null && !stmap.ContainsKey(key)) + stmap[key] = i; + } + } + + /// + /// Streaming XmlReader-based parser for sharedStrings.xml. Replaces the + /// DOM-based ConvertStreamToXml + SstDocument.Parse approach to reduce + /// allocations. Uses ArrayPool<char> for text buffering. + /// + private const int TextReadBufferSize = 1024; + + private void ReadFromStream(Stream stream) { + _sstDoc = new SstDocument(); + _sstDoc.AddNewSst(); + CT_Sst sst = _sstDoc.GetSst(); + + var settings = new XmlReaderSettings + { + DtdProcessing = DtdProcessing.Prohibit, + XmlResolver = null, + IgnoreWhitespace = false, + IgnoreComments = true, + IgnoreProcessingInstructions = true, + }; + + char[] readBuf = ArrayPool.Shared.Rent(TextReadBufferSize); try { - int cnt = 0; - XmlDocument xml = ConvertStreamToXml(is1); - _sstDoc = SstDocument.Parse(xml, NamespaceManager); - CT_Sst sst = _sstDoc.GetSst(); - count = (int)sst.count; - uniqueCount = (int)sst.uniqueCount; - foreach (CT_Rst st in sst.si) + using var reader = XmlReader.Create(stream, settings); + + CT_Rst currentSi = null; + CT_RElt currentR = null; + CT_RPrElt currentRPr = null; + CT_PhoneticRun currentRPh = null; + StringBuilder textBuf = null; + bool inSiT = false, inRT = false, inRPhT = false; + + while (reader.Read()) { - string key = GetKey(st); - if (key != null && !stmap.ContainsKey(key)) - stmap.Add(key, cnt); - strings.Add(st); - cnt++; + switch (reader.NodeType) + { + case XmlNodeType.Element: + { + string localName = reader.LocalName; + bool isEmpty = reader.IsEmptyElement; + + // rPr children are all self-closing attribute-only elements + if (currentRPr != null && localName != "rPr") + { + ParseRPrChild(reader, localName, currentRPr); + break; + } + + switch (localName) + { + case "sst": + { + string cStr = reader.GetAttribute("count"); + if (cStr != null && int.TryParse(cStr, NumberStyles.None, CultureInfo.InvariantCulture, out int c)) + count = c; + string ucStr = reader.GetAttribute("uniqueCount"); + if (ucStr != null && int.TryParse(ucStr, NumberStyles.None, CultureInfo.InvariantCulture, out int uc)) + uniqueCount = uc; + break; + } + case "si": + currentSi = new CT_Rst(); + break; + case "t": + if (isEmpty) + { + if (currentR != null) currentR.t = string.Empty; + else if (currentRPh != null) currentRPh.t = string.Empty; + else if (currentSi != null) currentSi.t = string.Empty; + } + else + { + textBuf = new StringBuilder(); + if (currentR != null) inRT = true; + else if (currentRPh != null) inRPhT = true; + else if (currentSi != null) inSiT = true; + } + break; + case "r": + currentR = new CT_RElt(); + if (currentSi != null) + { + if (currentSi.r == null) currentSi.r = new List(); + currentSi.r.Add(currentR); + } + if (isEmpty) currentR = null; + break; + case "rPr": + currentRPr = new CT_RPrElt(); + if (currentR != null) currentR.rPr = currentRPr; + if (isEmpty) currentRPr = null; + break; + case "rPh": + { + currentRPh = new CT_PhoneticRun(); + string sbStr = reader.GetAttribute("sb"); + string ebStr = reader.GetAttribute("eb"); + if (sbStr != null && uint.TryParse(sbStr, out uint sbVal)) currentRPh.sb = sbVal; + if (ebStr != null && uint.TryParse(ebStr, out uint ebVal)) currentRPh.eb = ebVal; + if (currentSi != null) + { + if (currentSi.rPh == null) currentSi.rPh = new List(); + currentSi.rPh.Add(currentRPh); + } + if (isEmpty) currentRPh = null; + break; + } + case "phoneticPr": + if (currentSi != null) + currentSi.phoneticPr = ParsePhoneticPrAttributes(reader); + break; + } + break; + } + case XmlNodeType.Text: + case XmlNodeType.SignificantWhitespace: + case XmlNodeType.Whitespace: + { + if ((inSiT || inRT || inRPhT) && textBuf != null) + { + int charsRead; + while ((charsRead = reader.ReadValueChunk(readBuf, 0, readBuf.Length)) > 0) + textBuf.Append(readBuf, 0, charsRead); + } + break; + } + case XmlNodeType.EndElement: + { + switch (reader.LocalName) + { + case "si": + if (currentSi != null) + { + sst.si.Add(currentSi); + strings.Add(currentSi); + } + currentSi = null; + break; + case "t": + { + string text = textBuf?.ToString() ?? string.Empty; + textBuf = null; + if (inSiT && currentSi != null) { currentSi.t = text; inSiT = false; } + else if (inRT && currentR != null) { currentR.t = text; inRT = false; } + else if (inRPhT && currentRPh != null) { currentRPh.t = text; inRPhT = false; } + break; + } + case "r": + currentR = null; + break; + case "rPr": + currentRPr = null; + break; + case "rPh": + currentRPh = null; + break; + } + break; + } + } } } + finally + { + ArrayPool.Shared.Return(readBuf); + } + } + + private static CT_PhoneticPr ParsePhoneticPrAttributes(XmlReader reader) + { + var pr = new CT_PhoneticPr(); + string fontId = reader.GetAttribute("fontId"); + if (fontId != null && uint.TryParse(fontId, out uint fid)) pr.fontId = fid; + string type = reader.GetAttribute("type"); + if (type != null && Enum.TryParse(type, out ST_PhoneticType pt)) pr.type = pt; + string alignment = reader.GetAttribute("alignment"); + if (alignment != null && Enum.TryParse(alignment, out ST_PhoneticAlignment pa)) pr.alignment = pa; + return pr; + } + + private static void ParseRPrChild(XmlReader reader, string localName, CT_RPrElt rPr) + { + switch (localName) + { + case "sz": + { + string val = reader.GetAttribute("val"); + if (val != null && double.TryParse(val, NumberStyles.Any, CultureInfo.InvariantCulture, out double sz)) + rPr.sz = new CT_FontSize { val = sz }; + break; + } + case "color": + rPr.color = ParseColorAttributes(reader); + break; + case "rFont": + { + string val = reader.GetAttribute("val"); + if (val != null) rPr.rFont = new CT_FontName { val = val }; + break; + } + case "family": + { + string val = reader.GetAttribute("val"); + if (val != null && int.TryParse(val, out int fam)) + rPr.family = new CT_IntProperty { val = fam }; + break; + } + case "charset": + { + string val = reader.GetAttribute("val"); + if (val != null && int.TryParse(val, out int cs)) + rPr.charset = new CT_IntProperty { val = cs }; + break; + } + case "b": + rPr.b = ParseBoolProp(reader); + break; + case "i": + rPr.i = ParseBoolProp(reader); + break; + case "strike": + rPr.strike = ParseBoolProp(reader); + break; + case "outline": + rPr.outline = ParseBoolProp(reader); + break; + case "shadow": + rPr.shadow = ParseBoolProp(reader); + break; + case "condense": + rPr.condense = ParseBoolProp(reader); + break; + case "extend": + rPr.extend = ParseBoolProp(reader); + break; + case "u": + { + string val = reader.GetAttribute("val") ?? "single"; + if (Enum.TryParse(val, out ST_UnderlineValues uv)) + rPr.u = new CT_UnderlineProperty { val = uv }; + break; + } + case "vertAlign": + { + string val = reader.GetAttribute("val"); + if (val != null && Enum.TryParse(val, out ST_VerticalAlignRun va)) + rPr.vertAlign = new CT_VerticalAlignFontProperty { val = va }; + break; + } + case "scheme": + { + string val = reader.GetAttribute("val"); + if (val != null && Enum.TryParse(val, out ST_FontScheme fs)) + rPr.scheme = new CT_FontScheme { val = fs }; + break; + } + } + } + + private static CT_BooleanProperty ParseBoolProp(XmlReader reader) + { + string val = reader.GetAttribute("val"); + // Default for boolean property is true when attribute is absent + bool v = val == null || (val != "0" && !val.Equals("false", StringComparison.OrdinalIgnoreCase)); + return new CT_BooleanProperty { val = v }; + } + + private static CT_Color ParseColorAttributes(XmlReader reader) + { + var color = new CT_Color(); + string auto = reader.GetAttribute("auto"); + if (auto != null) + color.auto = auto != "0" && !auto.Equals("false", StringComparison.OrdinalIgnoreCase); + string indexed = reader.GetAttribute("indexed"); + if (indexed != null && uint.TryParse(indexed, out uint idx)) color.indexed = idx; + string rgb = reader.GetAttribute("rgb"); + if (rgb != null && rgb.Length >= 2) + { + byte[] bytes = new byte[rgb.Length / 2]; + for (int i = 0; i < bytes.Length; i++) + bytes[i] = Convert.ToByte(rgb.Substring(i * 2, 2), 16); + color.rgb = bytes; + } + string theme = reader.GetAttribute("theme"); + if (theme != null && uint.TryParse(theme, out uint th)) color.theme = th; + string tint = reader.GetAttribute("tint"); + if (tint != null && double.TryParse(tint, NumberStyles.Any, CultureInfo.InvariantCulture, out double t)) color.tint = t; + return color; + } + + /// + /// Read shared strings from a stream. Kept for backward compatibility; internally + /// delegates to the streaming parser. + /// + public void ReadFrom(Stream is1) + { + try + { + ReadFromStream(is1); + _loaded = true; + _stmapBuilt = false; + } catch (XmlException e) { throw new IOException("unable to parse shared strings table", e); @@ -139,6 +531,7 @@ private static String GetKey(CT_Rst st) */ public CT_Rst GetEntryAt(int idx) { + EnsureLoaded(); return strings[idx]; } @@ -152,6 +545,7 @@ public int Count { get { + EnsureLoaded(); return count; } } @@ -167,16 +561,17 @@ public int UniqueCount { get { + EnsureLoaded(); return uniqueCount; } } /** - * Add an entry to this Shared String table (a new value is appened to the end). + * Add an entry to this Shared String table (a new value is appended to the end). * *

* If the Shared String table already Contains this CT_Rst bean, its index is returned. - * Otherwise a new entry is aded. + * Otherwise a new entry is added. *

* * @param st the entry to add @@ -184,6 +579,8 @@ public int UniqueCount */ public int AddEntry(CT_Rst st) { + EnsureLoaded(); + EnsureStmapBuilt(); String s = GetKey(st); count++; if (stmap.TryGetValue(s, out int entry)) @@ -199,8 +596,10 @@ public int AddEntry(CT_Rst st) int idx = strings.Count; stmap[s] = idx; strings.Add(newSt); + _dirty = true; return idx; } + /** * Provide low-level access to the underlying array of CT_Rst beans * @@ -210,36 +609,50 @@ public IList Items { get { + EnsureLoaded(); return strings.AsReadOnly(); } } /** - * - * this table out as XML. - * + * Write this table out as XML. + * * @param out The stream to write to. * @throws IOException if an error occurs while writing. */ public void WriteTo(Stream out1) { - // the following two lines turn off writing CDATA - // see Bugzilla 48936 - //options.SetSaveCDataLengthThreshold(1000000); - //options.SetSaveCDataEntityCountThreshold(-1); + EnsureLoaded(); CT_Sst sst = _sstDoc.GetSst(); sst.count = count; - sst.uniqueCount = uniqueCount; - - //re-create the sst table every time saving a workbook - _sstDoc.Save(out1); + sst.uniqueCount = uniqueCount; + _sstDoc.Save(out1); } + /// + /// Returns true if the SST has been parsed from its backing part. + /// Used in tests to verify lazy-load behaviour. + /// + internal bool IsLoaded => _loaded; + + /// + /// Prepares the part for commit. No-op when SST has not been modified, + /// preserving the original part bytes. + /// + protected internal override void PrepareForCommit() + { + if (_dirty) + base.PrepareForCommit(); + } + /// + /// Commits the SST to the package. No-op when SST has not been modified, + /// so the original sharedStrings.xml bytes are preserved without parsing. + /// protected internal override void Commit() { + if (!_dirty) return; PackagePart part = GetPackagePart(); - //Stream out1 = part.GetInputStream(); Stream out1 = part.GetOutputStream(); WriteTo(out1); out1.Close(); diff --git a/testcases/ooxml/XSSF/Model/TestSharedStringsTable.cs b/testcases/ooxml/XSSF/Model/TestSharedStringsTable.cs index f93a0996b8..9d596a265c 100644 --- a/testcases/ooxml/XSSF/Model/TestSharedStringsTable.cs +++ b/testcases/ooxml/XSSF/Model/TestSharedStringsTable.cs @@ -191,6 +191,116 @@ private List ReadStrings(String filename) return strs; } + /// + /// Verify that opening a workbook and writing it without accessing shared + /// strings does not cause the SST to be parsed or rewritten. + /// + [Test] + public void TestLazyLoadNotTriggeredByWrite() + { + XSSFWorkbook wb = XSSFTestDataSamples.OpenSampleWorkbook("sample.xlsx"); + SharedStringsTable sst = wb.GetSharedStringSource(); + + // SST should NOT be loaded yet + ClassicAssert.IsFalse(sst.IsLoaded, "SST should not be loaded before any access"); + + // Write the workbook without touching any string cells + byte[] writtenBytes; + using (MemoryStream ms = new MemoryStream()) + { + wb.Write(ms, false); + writtenBytes = ms.ToArray(); + } + + // SST should still be unloaded (not dirty, not accessed) + ClassicAssert.IsFalse(sst.IsLoaded, "SST should still not be loaded after Write() without string access"); + + // The written workbook should still contain the correct SST + XSSFWorkbook wb2 = new XSSFWorkbook(new MemoryStream(writtenBytes)); + SharedStringsTable sst2 = wb2.GetSharedStringSource(); + + // Now access to force load + ClassicAssert.IsTrue(sst2.Count > 0, "Written workbook should have preserved the SST"); + ClassicAssert.IsTrue(sst2.Items.Count > 0); + + wb.Close(); + wb2.Close(); + } + + /// + /// Verify that reading SST content marks it loaded but not dirty, + /// and that saving preserves the original SST data without re-serializing. + /// + [Test] + public void TestReadSstNotDirtyAfterAccess() + { + XSSFWorkbook wb1 = XSSFTestDataSamples.OpenSampleWorkbook("sample.xlsx"); + SharedStringsTable sst1 = wb1.GetSharedStringSource(); + + // Access SST – this should load but not dirty it + int origCount = sst1.Count; + int origUnique = sst1.UniqueCount; + IList origItems = sst1.Items; + ClassicAssert.IsTrue(sst1.IsLoaded, "SST should be loaded after Count access"); + + // Round-trip: write + read back + XSSFWorkbook wb2 = XSSFTestDataSamples.WriteOutAndReadBack(wb1); + SharedStringsTable sst2 = wb2.GetSharedStringSource(); + + ClassicAssert.AreEqual(origCount, sst2.Count); + ClassicAssert.AreEqual(origUnique, sst2.UniqueCount); + ClassicAssert.AreEqual(origItems.Count, sst2.Items.Count); + for (int i = 0; i < origItems.Count; i++) + ClassicAssert.AreEqual(origItems[i].ToString(), sst2.Items[i].ToString()); + + wb1.Close(); + wb2.Close(); + } + + /// + /// Verify that rich text runs and phonetic runs in 51519.xlsx are parsed + /// correctly by the streaming parser and survive a round-trip write + read. + /// + [Test] + public void TestPhoneticAndRichTextFidelity() + { + POIDataSamples ssTests = POIDataSamples.GetSpreadSheetInstance(); + XSSFWorkbook wb = new XSSFWorkbook(ssTests.OpenResourceAsStream("51519.xlsx")); + SharedStringsTable sst = wb.GetSharedStringSource(); + + ClassicAssert.AreEqual(49, sst.Items.Count, "Expected 49 shared strings in 51519.xlsx"); + + // Entry 0: plain Japanese text (no rich runs) + CT_Rst entry0 = sst.GetEntryAt(0); + ClassicAssert.AreEqual("\u30B3\u30E1\u30F3\u30C8", + new XSSFRichTextString(entry0).ToString(), + "Entry 0 text mismatch"); + + // Entry 3: should have phonetic runs (rPh elements) + CT_Rst entry3 = sst.GetEntryAt(3); + ClassicAssert.IsNotNull(entry3.rPh, "Entry 3 should have phonetic runs"); + ClassicAssert.IsTrue(entry3.rPh.Count > 0, "Entry 3 should have at least one phonetic run"); + + // Round-trip: write + read back + XSSFWorkbook wb2 = XSSFTestDataSamples.WriteOutAndReadBack(wb); + SharedStringsTable sst2 = wb2.GetSharedStringSource(); + + ClassicAssert.AreEqual(49, sst2.Items.Count, "Round-tripped SST should still have 49 entries"); + + CT_Rst entry0rt = sst2.GetEntryAt(0); + ClassicAssert.AreEqual("\u30B3\u30E1\u30F3\u30C8", + new XSSFRichTextString(entry0rt).ToString(), + "Entry 0 text mismatch after round-trip"); + + CT_Rst entry3rt = sst2.GetEntryAt(3); + ClassicAssert.IsNotNull(entry3rt.rPh, "Entry 3 should have phonetic runs after round-trip"); + ClassicAssert.AreEqual(entry3.rPh.Count, entry3rt.rPh.Count, + "Phonetic run count should match after round-trip"); + + wb.Close(); + wb2.Close(); + } + } }