From da98732a40bd225ac86ad17c4795d6ece17faa35 Mon Sep 17 00:00:00 2001 From: ricflams Date: Fri, 26 Sep 2025 20:36:43 +0200 Subject: [PATCH 1/5] Add xref-streams tied to any parts, not just the first On a large sample of pdf-files PdfPig failed to read the correct StructTree-object for about 1% of them. The StructTree object was simply missing in the CrossReferenceTable.CrossReferenceTable. It turned out that the constructed CrossReferenceTable could miss Stream-parts if there were multiple Table-parts because a stream will only be added if it's associated with the very first Table-part. The remedy would seem to be to check for and add streams that are associated with any of the Table-parts, not just the first one. On a sample of 72 files where this failed, this change fixed the StructTree for all of them. --- .../CrossReference/CrossReferenceTableBuilder.cs | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/UglyToad.PdfPig/CrossReference/CrossReferenceTableBuilder.cs b/src/UglyToad.PdfPig/CrossReference/CrossReferenceTableBuilder.cs index 8ea75530e..7f32fb66a 100644 --- a/src/UglyToad.PdfPig/CrossReference/CrossReferenceTableBuilder.cs +++ b/src/UglyToad.PdfPig/CrossReference/CrossReferenceTableBuilder.cs @@ -56,16 +56,16 @@ public CrossReferenceTable Build(long firstCrossReferenceOffset, long offsetCorr // add this and follow chain defined by 'Prev' keys xrefPartToBytePositionOrder.Add(firstCrossReferenceOffset); - // Get any streams that are tied to this table. - var activePart = currentPart; - var dependents = parts.Where(x => x.TiedToXrefAtOffset == activePart.Offset); - foreach (var dependent in dependents) - { - xrefPartToBytePositionOrder.Add(dependent.Offset); - } - while (currentPart.Dictionary != null) { + // Get any streams that are tied to this table. 
+ var activePart = currentPart; + var dependents = parts.Where(x => x.TiedToXrefAtOffset == activePart.Offset); + foreach (var dependent in dependents) + { + xrefPartToBytePositionOrder.Add(dependent.Offset); + } + long prevBytePos = currentPart.GetPreviousOffset(); if (prevBytePos == -1) { From b8b8c7b2abb6de6fed24d543fdcb1b45f5d0f53a Mon Sep 17 00:00:00 2001 From: ricflams Date: Mon, 29 Sep 2025 17:46:52 +0200 Subject: [PATCH 2/5] Add table-xref's associated stream-xrefs - If an XrefTable has an associated stream, as indicated via the XrefStm-property, then read and add that XrefStream - Any table can have 0 or 1 such associated streams - A caveat: such an associated stream might also theoretically be part of the Parts-sequence in which case it would be encountered both by looping through all those parts along with all the regular tables and now also by association to any of those tables. It doesn't seem harmful since the offsets are flattened eventually anyway and stored by their offset-key into a mapping-table. 
--- .../Parser/FileStructure/FirstPassParser.cs | 17 +++++++++++++++++ .../Parser/FileStructure/XrefTable.cs | 10 ++++++++++ 2 files changed, 27 insertions(+) diff --git a/src/UglyToad.PdfPig/Parser/FileStructure/FirstPassParser.cs b/src/UglyToad.PdfPig/Parser/FileStructure/FirstPassParser.cs index 078023517..a0b847e37 100644 --- a/src/UglyToad.PdfPig/Parser/FileStructure/FirstPassParser.cs +++ b/src/UglyToad.PdfPig/Parser/FileStructure/FirstPassParser.cs @@ -153,6 +153,23 @@ private static IReadOnlyList GetXrefPartsDirectly( { results.Add(table); nextLocation = table.GetPrevious(); + + // Also add any optional associated Stream + var xRefStm = table.GetXRefStm(); + if (xRefStm is long xRefStmValue) + { + var stream = GetXrefStreamOrTable( + offset, + input, + scanner, + xRefStmValue, + log); + + if (stream != null) + { + results.Add(stream); + } + } } else if (streamOrTable is XrefStream stream) { diff --git a/src/UglyToad.PdfPig/Parser/FileStructure/XrefTable.cs b/src/UglyToad.PdfPig/Parser/FileStructure/XrefTable.cs index c02a1aa8d..011b25ba4 100644 --- a/src/UglyToad.PdfPig/Parser/FileStructure/XrefTable.cs +++ b/src/UglyToad.PdfPig/Parser/FileStructure/XrefTable.cs @@ -44,4 +44,14 @@ public XrefTable( return null; } + + public long? GetXRefStm() + { + if (Dictionary != null && Dictionary.TryGet(NameToken.XrefStm, out NumericToken xRefStm)) + { + return xRefStm.Long; + } + + return null; + } } \ No newline at end of file From 8015e13564e75d052e6251044c8d18de1a5bdcbf Mon Sep 17 00:00:00 2001 From: Richard Flamsholt Date: Tue, 30 Sep 2025 15:54:09 +0200 Subject: [PATCH 3/5] Update test Issue874: Now finds more text on page 1 With the fix for including associated streams, this test now finds more text on the first page. I've verified using Aspose.PDF and by viewing the ErcotFacts.pdf file being tested that yes, it was indeed missing part of the text before. 
--- src/UglyToad.PdfPig.Tests/Integration/GithubIssuesTests.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/UglyToad.PdfPig.Tests/Integration/GithubIssuesTests.cs b/src/UglyToad.PdfPig.Tests/Integration/GithubIssuesTests.cs index 04b3b60a2..9f2cebdd9 100644 --- a/src/UglyToad.PdfPig.Tests/Integration/GithubIssuesTests.cs +++ b/src/UglyToad.PdfPig.Tests/Integration/GithubIssuesTests.cs @@ -475,7 +475,7 @@ public void Issue874() using (var document = PdfDocument.Open(doc, new ParsingOptions() { UseLenientParsing = true, SkipMissingFonts = true })) { var page1 = document.GetPage(1); - Assert.Equal(1788, page1.Letters.Count); + Assert.Equal(1939, page1.Letters.Count); var page2 = document.GetPage(2); Assert.Equal(2430, page2.Letters.Count); From 78d80fa59afb3f818f77ed8c78fa8a7ed75b1b8e Mon Sep 17 00:00:00 2001 From: Richard Flamsholt Date: Tue, 30 Sep 2025 16:18:07 +0200 Subject: [PATCH 4/5] Update test Issue874: Also more text on page 2 Page two has had four more characters added, which is now detected by this xref-stream fix --- src/UglyToad.PdfPig.Tests/Integration/GithubIssuesTests.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/UglyToad.PdfPig.Tests/Integration/GithubIssuesTests.cs b/src/UglyToad.PdfPig.Tests/Integration/GithubIssuesTests.cs index 9f2cebdd9..0b9f4be95 100644 --- a/src/UglyToad.PdfPig.Tests/Integration/GithubIssuesTests.cs +++ b/src/UglyToad.PdfPig.Tests/Integration/GithubIssuesTests.cs @@ -478,7 +478,7 @@ public void Issue874() Assert.Equal(1939, page1.Letters.Count); var page2 = document.GetPage(2); - Assert.Equal(2430, page2.Letters.Count); + Assert.Equal(2434, page2.Letters.Count); } using (var document = PdfDocument.Open(doc, new ParsingOptions() { UseLenientParsing = true, SkipMissingFonts = false })) From 4a4ede79a1793a8698ca0125c44fc86188750c5b Mon Sep 17 00:00:00 2001 From: Richard Flamsholt Date: Tue, 30 Sep 2025 18:00:47 +0200 Subject: [PATCH 5/5] Update test Issue874: No longer missing a 
font Including the stream-xref means that the formerly missing font is no longer missing, so simply run the two test-cases under the (stricter) assumption of SkipMissingFonts=false. --- .../Integration/GithubIssuesTests.cs | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/src/UglyToad.PdfPig.Tests/Integration/GithubIssuesTests.cs b/src/UglyToad.PdfPig.Tests/Integration/GithubIssuesTests.cs index 0b9f4be95..8ec946158 100644 --- a/src/UglyToad.PdfPig.Tests/Integration/GithubIssuesTests.cs +++ b/src/UglyToad.PdfPig.Tests/Integration/GithubIssuesTests.cs @@ -472,7 +472,7 @@ public void Issue874() { var doc = IntegrationHelpers.GetDocumentPath("ErcotFacts.pdf"); - using (var document = PdfDocument.Open(doc, new ParsingOptions() { UseLenientParsing = true, SkipMissingFonts = true })) + using (var document = PdfDocument.Open(doc, new ParsingOptions() { UseLenientParsing = true, SkipMissingFonts = false })) { var page1 = document.GetPage(1); Assert.Equal(1939, page1.Letters.Count); @@ -480,12 +480,6 @@ public void Issue874() var page2 = document.GetPage(2); Assert.Equal(2434, page2.Letters.Count); } - - using (var document = PdfDocument.Open(doc, new ParsingOptions() { UseLenientParsing = true, SkipMissingFonts = false })) - { - var ex = Assert.Throws(() => document.GetPage(1)); - Assert.StartsWith("Value cannot be null.", ex.Message); - } } [Fact]