diff --git a/.github/workflows/run_common_crawl_tests.yml b/.github/workflows/run_common_crawl_tests.yml index c92ee6d87..f8a923c83 100644 --- a/.github/workflows/run_common_crawl_tests.yml +++ b/.github/workflows/run_common_crawl_tests.yml @@ -12,7 +12,7 @@ jobs: strategy: fail-fast: false matrix: - pair: ["0000-0001", "0002-0003", "0004-0005", "0006-0007"] + pair: ["0000-0001", "0002-0003", "0004-0005", "0006-0007", "0008-0009"] steps: - uses: actions/checkout@v2 diff --git a/src/UglyToad.PdfPig.Core/XrefEntryType.cs b/src/UglyToad.PdfPig.Core/XrefEntryType.cs new file mode 100644 index 000000000..8f954163e --- /dev/null +++ b/src/UglyToad.PdfPig.Core/XrefEntryType.cs @@ -0,0 +1,20 @@ +namespace UglyToad.PdfPig.Core; + +/// +/// Indicates where an object is located in the Xref. +/// +public enum XrefEntryType : byte +{ + /// + /// Free object. + /// + Free = 0, + /// + /// Located as an object in the file. + /// + File = 1, + /// + /// Located in a compressed object stream. + /// + ObjectStream = 2 +} \ No newline at end of file diff --git a/src/UglyToad.PdfPig.Core/XrefLocation.cs b/src/UglyToad.PdfPig.Core/XrefLocation.cs new file mode 100644 index 000000000..1f966ae56 --- /dev/null +++ b/src/UglyToad.PdfPig.Core/XrefLocation.cs @@ -0,0 +1,42 @@ +namespace UglyToad.PdfPig.Core; + +/// +/// Information about where an object is located in the file according to the Xref (or brute force parsing). +/// +public readonly struct XrefLocation +{ + /// + /// Which type of location is indicated. + /// + public readonly XrefEntryType Type; + + /// + /// If <see cref="Type"/> is <see cref="XrefEntryType.File"/> then byte offset, otherwise this is the stream number. + /// + public readonly long Value1; + + /// + /// If <see cref="Type"/> is <see cref="XrefEntryType.ObjectStream"/> then the index of the object in the stream. + /// + public readonly int Value2; // only used for ObjectStream + + private XrefLocation(XrefEntryType type, long value1, int value2) + { + Type = type; + Value1 = value1; + Value2 = value2; + } + + /// + /// Create a location mapped to a byte offset in the file. 
+ /// + public static XrefLocation File(long offset) + => new XrefLocation(XrefEntryType.File, offset, 0); + + /// + /// Create a location mapped to an index inside an object stream. + /// + public static XrefLocation Stream(long objStream, int index) + => new XrefLocation(XrefEntryType.ObjectStream, objStream, index); + +} \ No newline at end of file diff --git a/src/UglyToad.PdfPig.Tests/Filters/FlateFilterTests.cs b/src/UglyToad.PdfPig.Tests/Filters/FlateFilterTests.cs index 68289e876..4f1e0da81 100644 --- a/src/UglyToad.PdfPig.Tests/Filters/FlateFilterTests.cs +++ b/src/UglyToad.PdfPig.Tests/Filters/FlateFilterTests.cs @@ -1,5 +1,6 @@ namespace UglyToad.PdfPig.Tests.Filters { + using PdfPig.Core; using PdfPig.Filters; using PdfPig.Tokens; @@ -11,15 +12,32 @@ public class FlateFilterTests public void EncodeAndDecodePreservesInput() { var parameters = new DictionaryToken(new Dictionary()); - var input = new byte[] {67, 69, 69, 10, 4, 20, 6, 19, 120, 64, 64, 64, 32}; + var input = new byte[] { 67, 69, 69, 10, 4, 20, 6, 19, 120, 64, 64, 64, 32 }; using (var inputStream = new MemoryStream(input)) { inputStream.Seek(0, SeekOrigin.Begin); - var result = filter.Encode(inputStream, parameters, 0); + var result = filter.Encode(inputStream, parameters); var decoded = filter.Decode(result, parameters, TestFilterProvider.Instance, 0); Assert.Equal(input, decoded.ToArray()); } } + + [Fact] + public void CanDecodeCorruptedInputIssue1235() + { + const string hexStr = + "789C958D5D0AC2400C844FB077980B74BB7FD9D982F820B43E8B7B03C542C187EAFDC1F84B7D1164200999E49BD9044C6653D10E1E443DA1AF6636ED76EF315E7572968E1ECDAB7FB7506C4C59C0AEB3912EE270366AAAF4E36D364BF7911450DC274A5112B1AC9751D77A58680B51A4D8AE433D62953C037396E0F290FBE098B267A43051725AA34E77E44EF50B1B52B42C90E4ADF83FB94FDD0000000000"; + + var hex = new HexToken(hexStr.AsSpan()); + + var parameters = new DictionaryToken(new Dictionary()); + + var result = filter.Decode(hex.Bytes.ToArray(), parameters, 
TestFilterProvider.Instance, 0); + + var text = OtherEncodings.BytesAsLatin1String(result.ToArray()); + + Assert.StartsWith("q", text); + } } } diff --git a/src/UglyToad.PdfPig.Tests/Integration/GithubIssuesTests.cs b/src/UglyToad.PdfPig.Tests/Integration/GithubIssuesTests.cs index f8c464540..4c5a17b27 100644 --- a/src/UglyToad.PdfPig.Tests/Integration/GithubIssuesTests.cs +++ b/src/UglyToad.PdfPig.Tests/Integration/GithubIssuesTests.cs @@ -327,7 +327,7 @@ public void Issue1122() var path = IntegrationHelpers.GetSpecificTestDocumentPath("StackOverflow_Issue_1122.pdf"); var ex = Assert.Throws(() => PdfDocument.Open(path, new ParsingOptions() { UseLenientParsing = true })); - Assert.Equal("The root object in the trailer did not resolve to a readable dictionary.", ex.Message); + Assert.StartsWith("Circular reference encountered when looking", ex.Message); } [Fact] @@ -386,7 +386,7 @@ public void Issue1050() { var path = IntegrationHelpers.GetSpecificTestDocumentPath("SpookyPass.pdf"); var ex = Assert.Throws(() => PdfDocument.Open(path, new ParsingOptions() { UseLenientParsing = true })); - Assert.Equal("The root object in the trailer did not resolve to a readable dictionary.", ex.Message); + Assert.StartsWith("Object stream cannot contain itself", ex.Message); } [Fact] @@ -552,7 +552,7 @@ public void Issue953_IntOverflow() { var page = document.GetPage(13); // This used to fail with an overflow exception when we failed to validate the zlib encoded data - Assert.NotNull(DocstrumBoundingBoxes.Instance.GetBlocks(page.GetWords())); + Assert.Throws(() => DocstrumBoundingBoxes.Instance.GetBlocks(page.GetWords())); } } diff --git a/src/UglyToad.PdfPig.Tests/Parser/FileStructure/FirstPassParserTests.cs b/src/UglyToad.PdfPig.Tests/Parser/FileStructure/FirstPassParserTests.cs index d47cec01d..a7d3eef0a 100644 --- a/src/UglyToad.PdfPig.Tests/Parser/FileStructure/FirstPassParserTests.cs +++ b/src/UglyToad.PdfPig.Tests/Parser/FileStructure/FirstPassParserTests.cs @@ -47,7 +47,7 @@ 
0000000576 00000 n Assert.Equal(2, results.Parts.Count); Assert.NotNull(results.Trailer); - Assert.Equal(results.XrefOffsets[new IndirectReference(8, 0)], 500); + Assert.Equal(results.XrefOffsets[new IndirectReference(8, 0)].Value1, 500); } [Fact] diff --git a/src/UglyToad.PdfPig.Tests/Parser/FileStructure/XrefTableParserTests.cs b/src/UglyToad.PdfPig.Tests/Parser/FileStructure/XrefTableParserTests.cs index afb653a8f..74454b14b 100644 --- a/src/UglyToad.PdfPig.Tests/Parser/FileStructure/XrefTableParserTests.cs +++ b/src/UglyToad.PdfPig.Tests/Parser/FileStructure/XrefTableParserTests.cs @@ -589,7 +589,7 @@ private static void AssertObjectsMatch( { Assert.True(table.ObjectOffsets.TryGetValue(offset.Key, out var actual)); - Assert.Equal(offset.Value, actual); + Assert.Equal(offset.Value, actual.Value1); } } diff --git a/src/UglyToad.PdfPig.Tests/Parser/PageContentParserTests.cs b/src/UglyToad.PdfPig.Tests/Parser/PageContentParserTests.cs index b88d169e4..a4fd04853 100644 --- a/src/UglyToad.PdfPig.Tests/Parser/PageContentParserTests.cs +++ b/src/UglyToad.PdfPig.Tests/Parser/PageContentParserTests.cs @@ -216,6 +216,68 @@ public void CorrectlyHandlesFile0007511CorruptInlineImage() Assert.NotEmpty(result); } + [Fact] + public void HandlesIssue953_IntOverflowContent() + { + // After ( + ) Tj operator the content stream becomes corrupt, our current parser therefore reads wrong + // values for operations and this results in a problem when applying the show text operations, we should safely discard or recover on BT/ET boundaries. 
+ const string s = + """ + BT + /TT6 1 Tf + 12.007 0 0 12.007 163.2j + -0.19950 Tc + 0 Tw + (x)Tj + -0.1949 1.4142 TD + (H)Tj + /TT7 1 Tf + 12.031 0 0 12.031 157.38 85.2 Tm + <0077>Tj + -0.1945 1.4114 TD + <0077>Tj + /TT4 1 Tf + 12.007 0 0 12.007 174.42 94.5601 Tm + 0.0004 Tc + -0.0005 Tw + ( + )Tj + E9 478l)]T862.68E9 478E9 484.54 9 155l)]T862.6av9 478E9 15.2( + ET + 154.386( i92 m + 171.6 97.62 l + S + BT + /TT6 28 Tf + 12.03128 T2002.0307 163.2j + -0.19950 DAc + 0 Tw853Tj + 0.1945 1.4142 om)873j + -0.574142 om)68.80 + -0.5797 0 TD + (f)Tj + /TT( )7Tf + 0.31945 1.5341 TD371.4j + 2.82 + 8.2652 0 5.724 TD + 0 Tc + -0.0001 2748.3( = 091ity )-27483 + [(te27483 + [(te27483 + [(te27483 + [(te27483 + [(te27483 + [(Eq.)52 \(2.1 + ( + """; + + var input = StringBytesTestConverter.Convert(s, false); + + var lenientParser = new PageContentParser(ReflectionGraphicsStateOperationFactory.Instance, new StackDepthGuard(256), true); + var result = lenientParser.Parse(1, input.Bytes, log); + + Assert.NotEmpty(result); + } + private static string LineEndingsToWhiteSpace(string str) { return str.Replace("\r\n", " ").Replace('\n', ' ').Replace('\r', ' '); diff --git a/src/UglyToad.PdfPig.Tests/Parser/Parts/BruteForceSearcherTests.cs b/src/UglyToad.PdfPig.Tests/Parser/Parts/BruteForceSearcherTests.cs index d275ad418..8cc206b1a 100644 --- a/src/UglyToad.PdfPig.Tests/Parser/Parts/BruteForceSearcherTests.cs +++ b/src/UglyToad.PdfPig.Tests/Parser/Parts/BruteForceSearcherTests.cs @@ -59,7 +59,7 @@ public void SearcherFindsCorrectObjects() Assert.Equal(4, locations.Count); - Assert.Equal(TestDataOffsets, locations.Values); + Assert.Equal(TestDataOffsets, locations.Values.Select(x => x.Value1)); } [Fact] @@ -111,7 +111,7 @@ 11 0 obj s.IndexOf("11 0 obj", StringComparison.OrdinalIgnoreCase) }; - Assert.Equal(expectedLocations, locations.Values); + Assert.Equal(expectedLocations, locations.Values.Select(x => x.Value1)); } [Fact] @@ -142,7 +142,7 @@ 5 0 obj s.IndexOf("5 0 obj", 
StringComparison.OrdinalIgnoreCase) }; - Assert.Equal(expectedLocations, locations.Values); + Assert.Equal(expectedLocations, locations.Values.Select(x => x.Value1)); } [Fact] @@ -156,17 +156,17 @@ public void BruteForceSearcherFileOffsetsCorrect() Assert.Equal(13, locations.Count); - Assert.Equal(6183, locations[new IndirectReference(1, 0)]); - Assert.Equal(244, locations[new IndirectReference(2, 0)]); - Assert.Equal(15, locations[new IndirectReference(3, 0)]); - Assert.Equal(222, locations[new IndirectReference(4, 0)]); - Assert.Equal(5766, locations[new IndirectReference(5, 0)]); - Assert.Equal(353, locations[new IndirectReference(6, 0)]); - Assert.Equal(581, locations[new IndirectReference(7, 0)]); - Assert.Equal(5068, locations[new IndirectReference(8, 0)]); - Assert.Equal(5091, locations[new IndirectReference(9, 0)]); - - var s = GetStringAt(bytes, locations[new IndirectReference(3, 0)]); + Assert.Equal(6183, locations[new IndirectReference(1, 0)].Value1); + Assert.Equal(244, locations[new IndirectReference(2, 0)].Value1); + Assert.Equal(15, locations[new IndirectReference(3, 0)].Value1); + Assert.Equal(222, locations[new IndirectReference(4, 0)].Value1); + Assert.Equal(5766, locations[new IndirectReference(5, 0)].Value1); + Assert.Equal(353, locations[new IndirectReference(6, 0)].Value1); + Assert.Equal(581, locations[new IndirectReference(7, 0)].Value1); + Assert.Equal(5068, locations[new IndirectReference(8, 0)].Value1); + Assert.Equal(5091, locations[new IndirectReference(9, 0)].Value1); + + var s = GetStringAt(bytes, locations[new IndirectReference(3, 0)].Value1); Assert.StartsWith("3 0 obj", s); } } @@ -180,17 +180,17 @@ public void BruteForceSearcherBytesFileOffsetsCorrect() Assert.Equal(13, locations.Count); - Assert.Equal(6183, locations[new IndirectReference(1, 0)]); - Assert.Equal(244, locations[new IndirectReference(2, 0)]); - Assert.Equal(15, locations[new IndirectReference(3, 0)]); - Assert.Equal(222, locations[new IndirectReference(4, 0)]); - 
Assert.Equal(5766, locations[new IndirectReference(5, 0)]); - Assert.Equal(353, locations[new IndirectReference(6, 0)]); - Assert.Equal(581, locations[new IndirectReference(7, 0)]); - Assert.Equal(5068, locations[new IndirectReference(8, 0)]); - Assert.Equal(5091, locations[new IndirectReference(9, 0)]); - - var s = GetStringAt(bytes, locations[new IndirectReference(3, 0)]); + Assert.Equal(6183, locations[new IndirectReference(1, 0)].Value1); + Assert.Equal(244, locations[new IndirectReference(2, 0)].Value1); + Assert.Equal(15, locations[new IndirectReference(3, 0)].Value1); + Assert.Equal(222, locations[new IndirectReference(4, 0)].Value1); + Assert.Equal(5766, locations[new IndirectReference(5, 0)].Value1); + Assert.Equal(353, locations[new IndirectReference(6, 0)].Value1); + Assert.Equal(581, locations[new IndirectReference(7, 0)].Value1); + Assert.Equal(5068, locations[new IndirectReference(8, 0)].Value1); + Assert.Equal(5091, locations[new IndirectReference(9, 0)].Value1); + + var s = GetStringAt(bytes, locations[new IndirectReference(3, 0)].Value1); Assert.StartsWith("3 0 obj", s); } @@ -203,21 +203,21 @@ public void BruteForceSearcherFileOffsetsCorrectOpenOffice() Assert.Equal(13, locations.Count); - Assert.Equal(17, locations[new IndirectReference(1, 0)]); - Assert.Equal(249, locations[new IndirectReference(2, 0)]); - Assert.Equal(14291, locations[new IndirectReference(3, 0)]); - Assert.Equal(275, locations[new IndirectReference(4, 0)]); - Assert.Equal(382, locations[new IndirectReference(5, 0)]); - Assert.Equal(13283, locations[new IndirectReference(6, 0)]); - Assert.Equal(13309, locations[new IndirectReference(7, 0)]); - Assert.Equal(13556, locations[new IndirectReference(8, 0)]); - Assert.Equal(13926, locations[new IndirectReference(9, 0)]); - Assert.Equal(14183, locations[new IndirectReference(10, 0)]); - Assert.Equal(14224, locations[new IndirectReference(11, 0)]); - Assert.Equal(14428, locations[new IndirectReference(12, 0)]); - Assert.Equal(14488, 
locations[new IndirectReference(13, 0)]); - - var s = GetStringAt(bytes, locations[new IndirectReference(12, 0)]); + Assert.Equal(17, locations[new IndirectReference(1, 0)].Value1); + Assert.Equal(249, locations[new IndirectReference(2, 0)].Value1); + Assert.Equal(14291, locations[new IndirectReference(3, 0)].Value1); + Assert.Equal(275, locations[new IndirectReference(4, 0)].Value1); + Assert.Equal(382, locations[new IndirectReference(5, 0)].Value1); + Assert.Equal(13283, locations[new IndirectReference(6, 0)].Value1); + Assert.Equal(13309, locations[new IndirectReference(7, 0)].Value1); + Assert.Equal(13556, locations[new IndirectReference(8, 0)].Value1); + Assert.Equal(13926, locations[new IndirectReference(9, 0)].Value1); + Assert.Equal(14183, locations[new IndirectReference(10, 0)].Value1); + Assert.Equal(14224, locations[new IndirectReference(11, 0)].Value1); + Assert.Equal(14428, locations[new IndirectReference(12, 0)].Value1); + Assert.Equal(14488, locations[new IndirectReference(13, 0)].Value1); + + var s = GetStringAt(bytes, locations[new IndirectReference(12, 0)].Value1); Assert.StartsWith("12 0 obj", s); } @@ -230,7 +230,7 @@ public void BruteForceSearcherCorrectlyFindsAllObjectsWhenOffset() var locations = BruteForceSearcher.GetObjectLocations(input); - Assert.Equal(TestDataOffsets, locations.Values); + Assert.Equal(TestDataOffsets, locations.Values.Select(x => x.Value1)); } [Fact] @@ -265,7 +265,7 @@ 11 0 obj s.IndexOf("11 0 obj", StringComparison.OrdinalIgnoreCase) }; - Assert.Equal(expectedLocations, locations.Values); + Assert.Equal(expectedLocations, locations.Values.Select(x => x.Value1)); } private static string GetStringAt(IInputBytes bytes, long location) diff --git a/src/UglyToad.PdfPig.Tests/Parser/Parts/DirectObjectFinderTests.cs b/src/UglyToad.PdfPig.Tests/Parser/Parts/DirectObjectFinderTests.cs index ca84a8823..d91d275ec 100644 --- a/src/UglyToad.PdfPig.Tests/Parser/Parts/DirectObjectFinderTests.cs +++ 
b/src/UglyToad.PdfPig.Tests/Parser/Parts/DirectObjectFinderTests.cs @@ -15,8 +15,8 @@ public void TryGetCanFollowMultipleReferenceLinks() var reference1 = new IndirectReference(7, 0); var reference2 = new IndirectReference(9, 0); - scanner.Objects[reference1] = new ObjectToken(10, reference1, new IndirectReferenceToken(reference2)); - scanner.Objects[reference2] = new ObjectToken(12, reference2, new NumericToken(69)); + scanner.Objects[reference1] = new ObjectToken(XrefLocation.File(10), reference1, new IndirectReferenceToken(reference2)); + scanner.Objects[reference2] = new ObjectToken(XrefLocation.File(12), reference2, new NumericToken(69)); Assert.True(DirectObjectFinder.TryGet(new IndirectReferenceToken(reference1), scanner, out NumericToken result)); @@ -29,8 +29,8 @@ public void GetCanFollowMultipleReferenceLinks() var reference1 = new IndirectReference(7, 0); var reference2 = new IndirectReference(9, 0); - scanner.Objects[reference1] = new ObjectToken(10, reference1, new IndirectReferenceToken(reference2)); - scanner.Objects[reference2] = new ObjectToken(12, reference2, new NumericToken(69)); + scanner.Objects[reference1] = new ObjectToken(XrefLocation.File(10), reference1, new IndirectReferenceToken(reference2)); + scanner.Objects[reference2] = new ObjectToken(XrefLocation.File(12), reference2, new NumericToken(69)); var result = DirectObjectFinder.Get(reference1, scanner); @@ -43,8 +43,8 @@ public void GetTokenCanFollowMultipleReferenceLinks() var reference1 = new IndirectReference(7, 0); var reference2 = new IndirectReference(9, 0); - scanner.Objects[reference1] = new ObjectToken(10, reference1, new IndirectReferenceToken(reference2)); - scanner.Objects[reference2] = new ObjectToken(12, reference2, new NumericToken(69)); + scanner.Objects[reference1] = new ObjectToken(XrefLocation.File(10), reference1, new IndirectReferenceToken(reference2)); + scanner.Objects[reference2] = new ObjectToken(XrefLocation.File(12), reference2, new NumericToken(69)); var 
result = DirectObjectFinder.Get(new IndirectReferenceToken(reference1), scanner); @@ -57,7 +57,7 @@ public void GetReturnsSingleItemFromArray() var reference = new IndirectReference(10, 0); const string expected = "Goopy"; - scanner.Objects[reference] = new ObjectToken(10, reference, new ArrayToken(new [] + scanner.Objects[reference] = new ObjectToken(XrefLocation.File(10), reference, new ArrayToken(new [] { new StringToken(expected) })); @@ -74,12 +74,12 @@ public void GetFollowsSingleIndirectReferenceFromArray() var reference2 = new IndirectReference(69, 0); const string expected = "Goopy"; - scanner.Objects[reference] = new ObjectToken(10, reference, new ArrayToken(new[] + scanner.Objects[reference] = new ObjectToken(XrefLocation.File(10), reference, new ArrayToken(new[] { new IndirectReferenceToken(reference2) })); - scanner.Objects[reference2] = new ObjectToken(69, reference2, new StringToken(expected)); + scanner.Objects[reference2] = new ObjectToken(XrefLocation.File(69), reference2, new StringToken(expected)); var result = DirectObjectFinder.Get(reference, scanner); @@ -91,7 +91,7 @@ public void GetThrowsOnInvalidArray() { var reference = new IndirectReference(10, 0); - scanner.Objects[reference] = new ObjectToken(10, reference, new ArrayToken(new[] + scanner.Objects[reference] = new ObjectToken(XrefLocation.File(10), reference, new ArrayToken(new[] { new NumericToken(5), new NumericToken(6), new NumericToken(0) })); diff --git a/src/UglyToad.PdfPig.Tests/TestObjectLocationProvider.cs b/src/UglyToad.PdfPig.Tests/TestObjectLocationProvider.cs index 54e568a38..51ca7116d 100644 --- a/src/UglyToad.PdfPig.Tests/TestObjectLocationProvider.cs +++ b/src/UglyToad.PdfPig.Tests/TestObjectLocationProvider.cs @@ -6,14 +6,14 @@ internal class TestObjectLocationProvider : IObjectLocationProvider { - public Dictionary Offsets { get; } = new Dictionary(); + public Dictionary Offsets { get; } = new Dictionary(); - public bool TryGetOffset(IndirectReference reference, out 
long offset) + public bool TryGetOffset(IndirectReference reference, out XrefLocation offset) { return Offsets.TryGetValue(reference, out offset); } - public void UpdateOffset(IndirectReference reference, long offset) + public void UpdateOffset(IndirectReference reference, XrefLocation offset) { Offsets[reference] = offset; } diff --git a/src/UglyToad.PdfPig.Tests/Tokenization/Scanner/PdfTokenScannerTests.cs b/src/UglyToad.PdfPig.Tests/Tokenization/Scanner/PdfTokenScannerTests.cs index 3fa902b5b..637f2477f 100644 --- a/src/UglyToad.PdfPig.Tests/Tokenization/Scanner/PdfTokenScannerTests.cs +++ b/src/UglyToad.PdfPig.Tests/Tokenization/Scanner/PdfTokenScannerTests.cs @@ -1,752 +1,752 @@ -namespace UglyToad.PdfPig.Tests.Tokenization.Scanner -{ - using System.Text; - using PdfPig.Core; - using PdfPig.Encryption; - using PdfPig.Parser.FileStructure; - using PdfPig.Tokenization.Scanner; - using PdfPig.Tokens; - - public class PdfTokenScannerTests - { - [Fact] - public void ReadsSimpleObject() - { - const string s = @"294 0 obj -/WDKAAR+CMBX12 -endobj"; - - var pdfScanner = GetScanner(s); - - pdfScanner.MoveNext(); - - var objectToken = Assert.IsType(pdfScanner.CurrentToken); - - var name = Assert.IsType(objectToken.Data); - - Assert.Equal(294, objectToken.Number.ObjectNumber); - Assert.Equal(0, objectToken.Number.Generation); - - Assert.Equal("WDKAAR+CMBX12", name.Data); - - Assert.StartsWith("294 0 obj", s.Substring((int)objectToken.Position)); - } - - [Fact] - public void ReadsIndirectReferenceInObject() - { - const string s = @" -15 0 obj -12 7 R -endobj"; - - var scanner = GetScanner(s); - - var token = ReadToEnd(scanner)[0]; - - var reference = Assert.IsType(token.Data); - - Assert.Equal(new IndirectReference(12, 7), reference.Data); - } - - [Fact] - public void ReadsObjectWithUndefinedIndirectReference() - { - const string s = @" -5 0 obj -<< -/XObject << -/Pic1 7 0 R ->> -/ProcSet [/PDF /Text /ImageC ] -/Font << -/F0 8 0 R -/F1 9 0 R -/F2 10 0 R -/F3 0 0 R ->> ->> 
-endobj"; - - var scanner = GetScanner(s); - - ReadToEnd(scanner); - - var token = scanner.Get(new IndirectReference(5, 0)); - Assert.NotNull(token); - - token = scanner.Get(new IndirectReference(0, 0)); - Assert.Null(token); - } - - [Fact] - public void ReadsNumericObjectWithComment() - { - const string s = @"%PDF-1.2 - -% I commented here too, tee hee -10383384 2 obj -%and here, I just love comments - -45 - -endobj - -%%EOF"; - - var pdfScanner = GetScanner(s); - - pdfScanner.MoveNext(); - - var obj = Assert.IsType(pdfScanner.CurrentToken); - - var num = Assert.IsType(obj.Data); - - Assert.Equal(45, num.Int); - - Assert.Equal(10383384, obj.Number.ObjectNumber); - Assert.Equal(2, obj.Number.Generation); - - Assert.StartsWith("10383384 2 obj", s.Substring((int)obj.Position)); - - Assert.False(pdfScanner.MoveNext()); - } - - [Fact] - public void ReadsArrayObject() - { - const string s = @" -endobj - -295 0 obj -[ -676 938 875 787 750 880 813 875 813 875 813 656 625 625 938 938 313 -344 563 563 563 563 563 850 500 574 813 875 563 1019 1144 875 313 -] -endobj"; - - var pdfScanner = GetScanner(s); - - pdfScanner.MoveNext(); - - var obj = Assert.IsType(pdfScanner.CurrentToken); - - var array = Assert.IsType(obj.Data); - - Assert.Equal(676, ((NumericToken)array.Data[0]).Int); - - Assert.Equal(33, array.Data.Count); - - Assert.Equal(295, obj.Number.ObjectNumber); - Assert.Equal(0, obj.Number.Generation); - - Assert.StartsWith("295 0 obj", s.Substring((int)obj.Position)); - - Assert.False(pdfScanner.MoveNext()); - } - - [Fact] - public void ReadsDictionaryObjectThenNameThenDictionary() - { - const string s = @" - -274 0 obj -<< -/Type /Pages -/Count 2 -/Parent 275 0 R -/Kids [ 121 0 R 125 0 R ] ->> -endobj - -%Other parts... 
- -310 0 obj -/WPXNWT+CMR9 -endobj 311 0 obj -<< -/Type /Font -/Subtype /Type1 -/FirstChar 0 -/LastChar 127 -/Widths 313 0 R -/BaseFont 310 0 R /FontDescriptor 312 0 R ->> -endobj"; - - var scanner = GetScanner(s); - - var tokens = ReadToEnd(scanner); - - var dictionary = Assert.IsType(tokens[0].Data); - - Assert.Equal(4, dictionary.Data.Count); - Assert.Equal(274, tokens[0].Number.ObjectNumber); - Assert.StartsWith("274 0 obj", s.Substring((int)tokens[0].Position)); - - var nameObject = Assert.IsType(tokens[1].Data); - - Assert.Equal("WPXNWT+CMR9", nameObject.Data); - Assert.Equal(310, tokens[1].Number.ObjectNumber); - Assert.StartsWith("310 0 obj", s.Substring((int)tokens[1].Position)); - - dictionary = Assert.IsType(tokens[2].Data); - - Assert.Equal(7, dictionary.Data.Count); - Assert.Equal(311, tokens[2].Number.ObjectNumber); - Assert.StartsWith("311 0 obj", s.Substring((int)tokens[2].Position)); - } - - [Fact] - public void ReadsStringObject() - { - const string s = @" - -58949797283757 0 obj (An object begins with obj and ends with endobj...) 
endobj -"; - - var scanner = GetScanner(s); - - var token = ReadToEnd(scanner)[0]; - - Assert.Equal(58949797283757L, token.Number.ObjectNumber); - Assert.Equal("An object begins with obj and ends with endobj...", Assert.IsType(token.Data).Data); - - Assert.StartsWith("58949797283757 0 obj", s.Substring((int)token.Position)); - } - - [Fact] - public void ReadsStreamObject() - { - const string s = @" -352 0 obj -<< /S 1273 /Filter /FlateDecode /Length 353 0 R >> -stream -H‰œUkLSgþÚh¹IÝÅlK(%[ÈÅ©+ ƒåꩊèæÇtnZ)Z¹¨Oå~9ŠÊµo”[éiK)÷B¹´ -É² ©¸˜ n±º×dKöcÏ÷ãœç{ßï}¾÷ÍÉs  Ô;€ -À»—ÀF`ÇF@ƒ 4 ˜ï @¥T¨³fY: žw̵;’’Îq®]cƒÿdp¨ÛI3F#G©#œ)TÇqW£NÚѬgOKbü‡µ#á¡£Þaîtƒƒ›ß– -¾“S>}µuÕõ5M±¢ª†»øÞû•q÷îÜ~¬PòžÞ~•¬ëɃGÅ-Ñ­ím·°gêêb,/,£P§õ^ v¾ãÁô¿¿ŠTE]²±{šuwÔ`LG³DªìTÈ -A¡¬àð‰É©ˆ°‘¼›‚%¥×s³®í»š}%§X{{tøNåÝž¶ö¢ÖÞ¾–~´¼¬°À“Éððr¥8»P£ØêÁi½®Û(éhŽ‘ú;x#dÃÄ$m -+) -)†…±n -9ùyŽA·n\ï»t!=3£½¡:®­µåâ¹Ô³ø¼ËiûSÎsë;•Dt—ö$WÉ4U‘¢ºÚšñá1íÐèÔó‚svõ(/(+D²#mZÏ6êüÝ7x‡—†”‡E„²‚|ê«êªDµ5q°šR¦RÈ£ n¾[è~“}ýƒÝ½SꞦ'æQŽzÝ‚mæ -óF+Õ%ù‡ƒß9SˆŒÓãšH¶~L-#T]êîÁ©ÎkbjÒp½¸$¤´(4<,""øfvΕ< VЍ«#4'2l'Ð1ñðn?sìûãI'OŸøñçŸN5(äÊ'âÎѾÞþíðƒQmu}]Õ£‡c›©.Œòµ9zz0Ѳ‚B¢«#š-3ªàŸŸ¦Pà8®Ó…¼æ¢BaÅÐkëÊŠukÈÊÖL£­ivvv…k2=µZMØ|Úl(ŠZ­V›ÍbI>Ÿl¹œ(â±Äb­ø”Uª ñeü©U*‹’“Oð,„E+¶Êà>ŽU”ÎÌõçlºFÃ_ÃÙl?¶=>>!>þC¿-×à©©©x¾€¢ŠÊåòtÃ0‹Æôz“‰ NÊ,¬‚kÀ°F‚XÛ4&“ÉfÃñÅæûæy=ÆãIðE _¾Èårår/XÞ„/·qò›m¶ìÖ|†óx8Wð¹hºÜÂÕalÎü’˜Ã0^Òòòü¼yÞ¶´´DX - )¨ÇM8lüM…Oúý| 1Ïãk»:t<…ÂÚl¶e¾†” éKÜl6c¹¸É„› ”)‰'3¤œ\–™ËN–™ÿe^Ё² y÷ð¹f`3ëž´ ¸“$d:e†)!%2ºdvË@½N¼ªŠ Ùná¹ ¼¿@ €Ã.èšs ì÷ûM€2(E4_ | FÑ.@v@÷¤ÃÅ0È Pž~,€:»H¤k¾ hT Œ € êÇV:Ô…©@@oH¯(3T‰{""C½SñŠœþtz3€•ƒ ñf.¬SЍøzWþ*$9gj=~Ì·QD E6o¥Ûi/Â`1ígGMq,;}޼sÔ×®kDü˜J{e5‚²ìɐ~Y)}fA>:˜ù–""Yò ç¹=ù²yÛ¡¿i aœ‘ØÏºþÇoäO ôkÆ) - endstream - endobj - 353 0 obj - 1479 - endobj"; - - var locationProvider = new TestObjectLocationProvider(); - // Mark location of "353 0 obj" - locationProvider.Offsets[new IndirectReference(353, 0)] = 1643; - - var scanner = GetScanner(s, locationProvider); - - var tokens = ReadToEnd(scanner); - - Assert.Equal(2, tokens.Count); - - var stream = 
Assert.IsType(tokens[0].Data); - - var str = Encoding.UTF8.GetString(stream.Data.ToArray()); - - Assert.StartsWith("H‰œUkLSgþÚh¹IÝÅl", str); - - Assert.Equal(2, locationProvider.Offsets[new IndirectReference(352, 0)]); - } - - [Fact] - public void ReadsStreamObjectWithInvalidLength() - { - string invalidLengthStream = "ABCD" + new string('e', 3996); - - string s = $@" -352 0 obj -<< /S 1273 /Filter /FlateDecode /Length 353 0 R >> -stream -{invalidLengthStream} -endstream -endobj -353 0 obj -1479 -endobj"; - - var locationProvider = new TestObjectLocationProvider(); - // Mark location of "353 0 obj" - locationProvider.Offsets[new IndirectReference(353, 0)] = 1643; - - var scanner = GetScanner(s, locationProvider); - - var tokens = ReadToEnd(scanner); - - Assert.Equal(2, tokens.Count); - - var stream = Assert.IsType(tokens[0].Data); - - var data = stream.Data.ToArray(); - - var str = Encoding.UTF8.GetString(data); - - Assert.Equal(data.Length, invalidLengthStream.Length); - Assert.StartsWith("ABCDeeeee", str); - - Assert.Equal(2, locationProvider.Offsets[new IndirectReference(352, 0)]); - } - - [Fact] - public void ReadsSimpleStreamObject() - { - // Length of the bytes as found by Encoding.UTF8.GetBytes is 45 - const string s = @" -574387 0 obj -<< /Length 45 >> -stream -À“Éððr¥8»P£ØêÁi½®Û(éhŽ‘ú -endstream -endobj"; - - var scanner = GetScanner(s); - - var token = ReadToEnd(scanner)[0]; - - var stream = Assert.IsType(token.Data); - - var bytes = stream.Data.ToArray(); - Assert.Equal(45, bytes.Length); - - var outputString = Encoding.UTF8.GetString(bytes); - - Assert.Equal("À“Éððr¥8»P£ØêÁi½®Û(éhŽ‘ú", outputString); - } - - [Fact] - public void ReadsStreamWithIndirectLength() - { - const string s = @"5 0 obj 52 endobj - - - -12 0 obj - -<< /Length 5 0 R /S 1245 >> - -stream -%¥×³®í»š}%§X{{tøNåÝž¶ö¢ÖÞ¾–~´¼ -endstream -endobj"; - var locationProvider = new TestObjectLocationProvider(); - - locationProvider.Offsets[new IndirectReference(5, 0)] = 0; - - var scanner = 
GetScanner(s, locationProvider); - - var token = ReadToEnd(scanner)[1]; - - var stream = Assert.IsType(token.Data); - - var bytes = stream.Data.ToArray(); - Assert.Equal(52, bytes.Length); - - var outputString = Encoding.UTF8.GetString(bytes); - - Assert.Equal("%¥×³®í»š}%§X{{tøNåÝž¶ö¢ÖÞ¾–~´¼", outputString); - } - - [Fact] - public void ReadsStreamWithMissingLength() - { - const string s = @" -12655 0 obj - -<< /S 1245 >> - -stream -%¥×³®í»š}%§X{{tøNåÝž¶ö¢ÖÞgrehtyyy$&%&£$££(*¾–~´¼ -endstream -endobj"; - - var scanner = GetScanner(s); - - var token = ReadToEnd(scanner)[0]; - - Assert.Equal(12655, token.Number.ObjectNumber); - - var stream = Assert.IsType(token.Data); - - Assert.Equal("1245", stream.StreamDictionary.Data["S"].ToString()); - - Assert.Equal("%¥×³®í»š}%§X{{tøNåÝž¶ö¢ÖÞgrehtyyy$&%&£$££(*¾–~´¼", Encoding.UTF8.GetString(stream.Data.ToArray())); - } - - [Fact] - public void ReadsStreamWithoutBreakBeforeEndstream() - { - const string s = @" -1 0 obj -12 -endobj - -7 0 obj -<< /Length 288 - /Filter /FlateDecode >> -stream -xœ]‘ËjÃ0E÷ÿÃ,ÓEð#NÒ€1¤N^ôA~€-]A- YYøï+Ï4¡t#qfîFWQY*­Dïv5:è”–§ñjB‹½Òa¤ •p7¤K  ƒÈûëyr8Tº!Ïà úð‚ÉÙVG9¶ø@Å7+Ñ*ÝÃ곬¹T_ùƵƒ8 Š$vË̗Ƽ6BDöu%½B¹yí$—Ù ¤\Hx71JœL#Ð6ºÇ0È㸀ü|. µüßõÏ""WÛ‰¯Æ.êÄ«ã8; ¤iL°!Ø %É`K°ßì¸ÃöÜáÜ)  [‚#CFðİ#(yƒg^ÿ¶æò -ÿž“¸Zë#¢?¢h–P”Æû?šÑï÷ø¯‰Šendstream -endobj - -9 0 obj -16 -endobj"; - - var scanner = GetScanner(s); - - var token = ReadToEnd(scanner)[1]; - - Assert.Equal(7, token.Number.ObjectNumber); - } - - [Fact] - public void ReadsStringsWithMissingEndBracket() - { - const string input = @"5 0 obj -<< -/Kids [4 0 R 12 0 R 17 0 R 20 0 R 25 0 R 28 0 R ] -/Count 6 -/Type /Pages -/MediaBox [ 0 0 612 792 ] ->> -endobj -1 0 obj -<< -/Creator (Corel WordPerfect - [D:\Wpdocs\WEBSITE\PROC&POL.WP6 (unmodified) -/CreationDate (D:19980224130723) -/Title (Proc&Pol.pdf) -/Author (J. L. 
Swezey) -/Producer (Acrobat PDFWriter 3.03 for Windows NT) -/Keywords (Budapest Treaty; Patent deposits; IDA) -/Subject (Patent Collection Procedures and Policies) ->> -endobj -3 0 obj -<< -/Pages 5 0 R -/Type /Catalog ->> -endobj"; - - var scanner = GetScanner(input); - - var tokens = ReadToEnd(scanner); - - Assert.Equal(3, tokens.Count); - - var first = tokens[0]; - Assert.Equal(5, first.Number.ObjectNumber); - - var second = tokens[1]; - Assert.Equal(1, second.Number.ObjectNumber); - - var third = tokens[2]; - Assert.Equal(3, third.Number.ObjectNumber); - } - - [Fact] - public void ReadsDictionaryContainingNull() - { - const string input = @"14224 0 obj -<> -endobj"; - - var scanner = GetScanner(input); - - var tokens = ReadToEnd(scanner); - - var dictionaryToken = tokens[0].Data as DictionaryToken; - - Assert.NotNull(dictionaryToken); - - var encryptValue = dictionaryToken.Data["Encrypt"]; - - Assert.IsType(encryptValue); - } - - [Fact] - public void ReadMultipleNestedDictionary() - { - const string input = - @" - 4 0 obj - << /Type /Font /Subtype /Type1 /Name /AF1F040+Arial /BaseFont /Arial /FirstChar 32 /LastChar 255 - /Encoding - << - /Type /Encoding /BaseEncoding /WinAnsiEncoding - /Differences [128 /Euro 130 /quotesinglbase /florin /quotedblbase /ellipsis /dagger /daggerdbl /circumflex /perthousand /Scaron /guilsinglleft /OE 142 /Zcaron 145 - /quoteleft /quoteright /quotedblleft /quotedblright /bullet /endash /emdash /tilde /trademark /scaron /guilsinglright /oe 158 /zcaron /Ydieresis /space /exclamdown - /cent /sterling /currency /yen /brokenbar /section /dieresis /copyright /ordfeminine /guillemotleft /logicalnot /hyphen /registered /macron /degree /plusminus - /twosuperior /threesuperior /acute /mu /paragraph /periodcentered /cedilla /onesuperior /ordmasculine /guillemotright /onequarter /onehalf /threequarters - /questiondown /Agrave /Aacute /Acircumflex /Atilde /Adieresis /Aring /AE /Ccedilla /Egrave /Eacute /Ecircumflex /Edieresis /Igrave /Iacute 
/Icircumflex /Idieresis - /Eth /Ntilde /Ograve /Oacute /Ocircumflex /Otilde /Odieresis /multiply /Oslash /Ugrave /Uacute /Ucircumflex /Udieresis /Yacute /Thorn /germandbls /agrave /aacute - /acircumflex /atilde /adieresis /aring /ae /ccedilla /egrave /eacute /ecircumflex /edieresis /igrave /iacute /icircumflex /idieresis /eth /ntilde /ograve /oacute - /ocircumflex /otilde /odieresis /divide /oslash /ugrave /uacute /ucircumflex /udieresis /yacute /thorn /ydieresis ] - >> - /Widths [278 278 355 556 556 889 667 191 333 333 389 584 278 333 278 278 - 556 556 556 556 556 556 556 556 556 556 278 278 584 584 584 556 - 1015 667 667 722 722 667 611 778 722 278 500 667 556 833 722 778 - 667 778 722 667 611 722 667 944 667 667 611 278 278 278 469 556 - 333 556 556 500 556 556 278 556 556 222 222 500 222 833 556 556 - 556 556 333 500 278 556 500 722 500 500 500 334 260 334 584 750 - 556 750 222 556 333 1000 556 556 333 1000 667 333 1000 750 611 750 - 750 222 222 333 333 350 556 1000 333 1000 500 333 944 750 500 667 - 278 333 556 556 556 556 260 556 333 737 370 556 584 333 737 552 - 400 549 333 333 333 576 537 278 333 333 365 556 834 834 834 611 - 667 667 667 667 667 667 1000 722 667 667 667 667 278 278 278 278 - 722 722 778 778 778 778 778 584 778 722 722 722 722 667 667 611 - 556 556 556 556 556 556 889 500 556 556 556 556 278 278 278 278 - 556 556 556 556 556 556 556 549 611 556 556 556 556 500 556 500 - ] - >> - >> - endobj - "; - - var scanner = GetScanner(input); - - var tokens = ReadToEnd(scanner); - - var dictionaryToken = tokens[0].Data as DictionaryToken; - - Assert.NotNull(dictionaryToken); - } - - [Fact] - public void ReadsDictionaryWithoutEndObjBeforeNextObject() - { - const string input = @"1 0 obj -<> -2 0 obj -<> -endobj"; - - var scanner = GetScanner(input); - - var tokens = ReadToEnd(scanner); - - Assert.Equal(2, tokens.Count); - - var dictionaryToken = Assert.IsType(tokens[0].Data); - var typeValue = dictionaryToken.Data["Type"]; - Assert.IsType(typeValue); - 
- dictionaryToken = tokens[1].Data as DictionaryToken; - Assert.NotNull(dictionaryToken); - typeValue = dictionaryToken.Data["Length"]; - Assert.IsType(typeValue); - } - - [Fact] - public void ReadsStreamWithoutEndObjBeforeNextObject() - { - const string input = @"1 0 obj -<> -stream -aaaa -endstream -2 0 obj -<> -endobj"; - - var scanner = GetScanner(input); - - var tokens = ReadToEnd(scanner); - - Assert.Equal(2, tokens.Count); - - Assert.IsType(tokens[0].Data); - - var dictionaryToken = Assert.IsType(tokens[1].Data); - var typeValue = dictionaryToken.Data["Length"]; - Assert.IsType(typeValue); - } - - [Theory] - [InlineData("startxref")] - [InlineData("xref")] - public void ReadsStreamWithoutEndObjBeforeToken(string token) - { - string input = @$"1 0 obj -<> -stream -aaaa -endstream -{token}"; - - var scanner = GetScanner(input); - - var tokens = ReadToEnd(scanner); - - Assert.Single(tokens); - - Assert.IsType(tokens[0].Data); - } - - [Theory] - [InlineData("startxref")] - [InlineData("xref")] - public void ReadsDictionaryWithoutEndObjBeforeToken(string token) - { - string input = @$"1 0 obj -<> -{token}"; - - var scanner = GetScanner(input); - - var tokens = ReadToEnd(scanner); - - Assert.Single(tokens); - - var dictionaryToken = Assert.IsType(tokens[0].Data); - var typeValue = dictionaryToken.Data["Type"]; - Assert.IsType(typeValue); - } - - [Fact] - public void ReadsStreamWithoutEndStreamBeforeEndObj() - { - const string input = @"1 0 obj -<> -stream -aaaa -endobj -2 0 obj -<> -endobj"; - - var scanner = GetScanner(input); - - var tokens = ReadToEnd(scanner); - - Assert.Equal(2, tokens.Count); - - Assert.IsType(tokens[0].Data); - - var dictionaryToken = Assert.IsType(tokens[1].Data); - var lengthValue = dictionaryToken.Data["Length"]; - Assert.IsType(lengthValue); - } - - [Theory] - [InlineData(">>")] - [InlineData("randomstring")] - public void ReadsIndirectObjectsDictionaryWithContentBeforeEndObj(string addedContent) - { - string input = @$"1 0 obj -<> 
-{addedContent}endobj -2 0 obj -<> -endobj"; - - var strictScanner = GetScanner(input); - - var tokens = ReadToEnd(strictScanner); - Assert.Empty(tokens); - - - var lenientScanner = GetScanner(input, useLenientParsing: true); - tokens = ReadToEnd(lenientScanner); - - Assert.Equal(2, tokens.Count); - - var dictionaryToken = Assert.IsType(tokens[0].Data); - var typeValue = dictionaryToken.Data["Type"]; - Assert.IsType(typeValue); - - dictionaryToken = Assert.IsType(tokens[1].Data); - var lengthValue = dictionaryToken.Data["Length"]; - Assert.IsType(lengthValue); - } - - [Theory] - [InlineData(">>")] - [InlineData("randomstring")] - public void ReadsIndirectObjectsStreamWithAddedContentBeforeStream(string addedContent) - { - string input = @$"1 0 obj -<> -{addedContent}stream -aaaa -endstream -endobj -2 0 obj -<> -endobj"; - - var strictScanner = GetScanner(input); - - var tokens = ReadToEnd(strictScanner); - Assert.Equal(2, tokens.Count); - // this is linked to the parsing choosing the last token parsed in obj. - // It can probably be challenged against taking the first one. - var operatorToken = Assert.IsType(tokens[0].Data); - Assert.Equal("endstream", operatorToken.Data); - - var dictionaryToken = Assert.IsType(tokens[1].Data); - var lengthValue = dictionaryToken.Data["Length"]; - Assert.IsType(lengthValue); - - var lenientScanner = GetScanner(input, useLenientParsing:true); - tokens = ReadToEnd(lenientScanner); - - Assert.Equal(2, tokens.Count); - - Assert.IsType(tokens[0].Data); - - dictionaryToken = Assert.IsType(tokens[1].Data); - lengthValue = dictionaryToken.Data["Length"]; - Assert.IsType(lengthValue); - } - - private static PdfTokenScanner GetScanner(string s, TestObjectLocationProvider locationProvider = null, bool useLenientParsing = false) - { - var input = StringBytesTestConverter.Convert(s, false); - - return new PdfTokenScanner(input.Bytes, - locationProvider ?? 
new TestObjectLocationProvider(), - new TestFilterProvider(), - NoOpEncryptionHandler.Instance, - new FileHeaderOffset(0), - useLenientParsing ? new ParsingOptions() : ParsingOptions.LenientParsingOff, - new StackDepthGuard(256)); - } - - private static IReadOnlyList ReadToEnd(PdfTokenScanner scanner) - { - var result = new List(); - - while (scanner.MoveNext()) - { - if (scanner.CurrentToken is ObjectToken obj) - { - result.Add(obj); - } - else - { - throw new InvalidOperationException($"Pdf token scanner produced token which was not an object token: {scanner.CurrentToken}."); - } - } - - return result; - } - } -} +namespace UglyToad.PdfPig.Tests.Tokenization.Scanner +{ + using System.Text; + using PdfPig.Core; + using PdfPig.Encryption; + using PdfPig.Parser.FileStructure; + using PdfPig.Tokenization.Scanner; + using PdfPig.Tokens; + + public class PdfTokenScannerTests + { + [Fact] + public void ReadsSimpleObject() + { + var s = @"294 0 obj +/WDKAAR+CMBX12 +endobj".Replace("\r\n", "\n").Replace("\n", "\r\n"); + + var pdfScanner = GetScanner(s); + + pdfScanner.MoveNext(); + + var objectToken = Assert.IsType(pdfScanner.CurrentToken); + + var name = Assert.IsType(objectToken.Data); + + Assert.Equal(294, objectToken.Number.ObjectNumber); + Assert.Equal(0, objectToken.Number.Generation); + + Assert.Equal("WDKAAR+CMBX12", name.Data); + + Assert.StartsWith("294 0 obj", s.Substring((int)objectToken.Position.Value1)); + } + + [Fact] + public void ReadsIndirectReferenceInObject() + { + var s = @" +15 0 obj +12 7 R +endobj".Replace("\r\n", "\n").Replace("\n", "\r\n"); + + var scanner = GetScanner(s); + + var token = ReadToEnd(scanner)[0]; + + var reference = Assert.IsType(token.Data); + + Assert.Equal(new IndirectReference(12, 7), reference.Data); + } + + [Fact] + public void ReadsObjectWithUndefinedIndirectReference() + { + var s = @" +5 0 obj +<< +/XObject << +/Pic1 7 0 R +>> +/ProcSet [/PDF /Text /ImageC ] +/Font << +/F0 8 0 R +/F1 9 0 R +/F2 10 0 R +/F3 0 0 R +>> +>> 
+endobj".Replace("\r\n", "\n").Replace("\n", "\r\n"); + + var scanner = GetScanner(s); + + ReadToEnd(scanner); + + var token = scanner.Get(new IndirectReference(5, 0)); + Assert.NotNull(token); + + token = scanner.Get(new IndirectReference(0, 0)); + Assert.Null(token); + } + + [Fact] + public void ReadsNumericObjectWithComment() + { + var s = @"%PDF-1.2 + +% I commented here too, tee hee +10383384 2 obj +%and here, I just love comments + +45 + +endobj + +%%EOF".Replace("\r\n", "\n").Replace("\n", "\r\n"); + + var pdfScanner = GetScanner(s); + + pdfScanner.MoveNext(); + + var obj = Assert.IsType(pdfScanner.CurrentToken); + + var num = Assert.IsType(obj.Data); + + Assert.Equal(45, num.Int); + + Assert.Equal(10383384, obj.Number.ObjectNumber); + Assert.Equal(2, obj.Number.Generation); + + Assert.StartsWith("10383384 2 obj", s.Substring((int)obj.Position.Value1)); + + Assert.False(pdfScanner.MoveNext()); + } + + [Fact] + public void ReadsArrayObject() + { + var s = @" +endobj + +295 0 obj +[ +676 938 875 787 750 880 813 875 813 875 813 656 625 625 938 938 313 +344 563 563 563 563 563 850 500 574 813 875 563 1019 1144 875 313 +] +endobj".Replace("\r\n", "\n").Replace("\n", "\r\n"); + + var pdfScanner = GetScanner(s); + + pdfScanner.MoveNext(); + + var obj = Assert.IsType(pdfScanner.CurrentToken); + + var array = Assert.IsType(obj.Data); + + Assert.Equal(676, ((NumericToken)array.Data[0]).Int); + + Assert.Equal(33, array.Data.Count); + + Assert.Equal(295, obj.Number.ObjectNumber); + Assert.Equal(0, obj.Number.Generation); + + Assert.StartsWith("295 0 obj", s.Substring((int)obj.Position.Value1)); + + Assert.False(pdfScanner.MoveNext()); + } + + [Fact] + public void ReadsDictionaryObjectThenNameThenDictionary() + { + var s = @" + +274 0 obj +<< +/Type /Pages +/Count 2 +/Parent 275 0 R +/Kids [ 121 0 R 125 0 R ] +>> +endobj + +%Other parts... 
+ +310 0 obj +/WPXNWT+CMR9 +endobj 311 0 obj +<< +/Type /Font +/Subtype /Type1 +/FirstChar 0 +/LastChar 127 +/Widths 313 0 R +/BaseFont 310 0 R /FontDescriptor 312 0 R +>> +endobj".Replace("\r\n", "\n").Replace("\n", "\r\n"); + + var scanner = GetScanner(s); + + var tokens = ReadToEnd(scanner); + + var dictionary = Assert.IsType(tokens[0].Data); + + Assert.Equal(4, dictionary.Data.Count); + Assert.Equal(274, tokens[0].Number.ObjectNumber); + Assert.StartsWith("274 0 obj", s.Substring((int)tokens[0].Position.Value1)); + + var nameObject = Assert.IsType(tokens[1].Data); + + Assert.Equal("WPXNWT+CMR9", nameObject.Data); + Assert.Equal(310, tokens[1].Number.ObjectNumber); + Assert.StartsWith("310 0 obj", s.Substring((int)tokens[1].Position.Value1)); + + dictionary = Assert.IsType(tokens[2].Data); + + Assert.Equal(7, dictionary.Data.Count); + Assert.Equal(311, tokens[2].Number.ObjectNumber); + Assert.StartsWith("311 0 obj", s.Substring((int)tokens[2].Position.Value1)); + } + + [Fact] + public void ReadsStringObject() + { + var s = @" + +58949797283757 0 obj (An object begins with obj and ends with endobj...) 
endobj +".Replace("\r\n", "\n").Replace("\n", "\r\n"); + + var scanner = GetScanner(s); + + var token = ReadToEnd(scanner)[0]; + + Assert.Equal(58949797283757L, token.Number.ObjectNumber); + Assert.Equal("An object begins with obj and ends with endobj...", Assert.IsType(token.Data).Data); + + Assert.StartsWith("58949797283757 0 obj", s.Substring((int)token.Position.Value1)); + } + + [Fact] + public void ReadsStreamObject() + { + var s = @" +352 0 obj +<< /S 1273 /Filter /FlateDecode /Length 353 0 R >> +stream +H‰œUkLSgþÚh¹IÝÅlK(%[ÈÅ©+ ƒåꩊèæÇtnZ)Z¹¨Oå~9ŠÊµo”[éiK)÷B¹´ +É² ©¸˜ n±º×dKöcÏ÷ãœç{ßï}¾÷ÍÉs  Ô;€ +À»—ÀF`ÇF@ƒ 4 ˜ï @¥T¨³fY: žw̵;’’Îq®]cƒÿdp¨ÛI3F#G©#œ)TÇqW£NÚѬgOKbü‡µ#á¡£Þaîtƒƒ›ß– +¾“S>}µuÕõ5M±¢ª†»øÞû•q÷îÜ~¬PòžÞ~•¬ëɃGÅ-Ñ­ím·°gêêb,/,£P§õ^ v¾ãÁô¿¿ŠTE]²±{šuwÔ`LG³DªìTÈ +A¡¬àð‰É©ˆ°‘¼›‚%¥×s³®í»š}%§X{{tøNåÝž¶ö¢ÖÞ¾–~´¼¬°À“Éððr¥8»P£ØêÁi½®Û(éhŽ‘ú;x#dÃÄ$m ++) +)†…±n +9ùyŽA·n\ï»t!=3£½¡:®­µåâ¹Ô³ø¼ËiûSÎsë;•Dt—ö$WÉ4U‘¢ºÚšñá1íÐèÔó‚svõ(/(+D²#mZÏ6êüÝ7x‡—†”‡E„²‚|ê«êªDµ5q°šR¦RÈ£ n¾[è~“}ýƒÝ½SꞦ'æQŽzÝ‚mæ +óF+Õ%ù‡ƒß9SˆŒÓãšH¶~L-#T]êîÁ©ÎkbjÒp½¸$¤´(4<,""øfvΕ< VЍ«#4'2l'Ð1ñðn?sìûãI'OŸøñçŸN5(äÊ'âÎѾÞþíðƒQmu}]Õ£‡c›©.Œòµ9zz0Ѳ‚B¢«#š-3ªàŸŸ¦Pà8®Ó…¼æ¢BaÅÐkëÊŠukÈÊÖL£­ivvv…k2=µZMØ|Úl(ŠZ­V›ÍbI>Ÿl¹œ(â±Äb­ø”Uª ñeü©U*‹’“Oð,„E+¶Êà>ŽU”ÎÌõçlºFÃ_ÃÙl?¶=>>!>þC¿-×à©©©x¾€¢ŠÊåòtÃ0‹Æôz“‰ NÊ,¬‚kÀ°F‚XÛ4&“ÉfÃñÅæûæy=ÆãIðE _¾Èårår/XÞ„/·qò›m¶ìÖ|†óx8Wð¹hºÜÂÕalÎü’˜Ã0^Òòòü¼yÞ¶´´DX + )¨ÇM8lüM…Oúý| 1Ïãk»:t<…ÂÚl¶e¾†” éKÜl6c¹¸É„› ”)‰'3¤œ\–™ËN–™ÿe^Ё² y÷ð¹f`3ëž´ ¸“$d:e†)!%2ºdvË@½N¼ªŠ Ùná¹ ¼¿@ €Ã.èšs ì÷ûM€2(E4_ | FÑ.@v@÷¤ÃÅ0È Pž~,€:»H¤k¾ hT Œ € êÇV:Ô…©@@oH¯(3T‰{""C½SñŠœþtz3€•ƒ ñf.¬SЍøzWþ*$9gj=~Ì·QD E6o¥Ûi/Â`1ígGMq,;}޼sÔ×®kDü˜J{e5‚²ìɐ~Y)}fA>:˜ù–""Yò ç¹=ù²yÛ¡¿i aœ‘ØÏºþÇoäO ôkÆ) + endstream + endobj + 353 0 obj + 1479 + endobj".Replace("\r\n", "\n").Replace("\n", "\r\n"); + + var locationProvider = new TestObjectLocationProvider(); + // Mark location of "353 0 obj" + locationProvider.Offsets[new IndirectReference(353, 0)] = XrefLocation.File(1643); + + var scanner = GetScanner(s, locationProvider); + + var 
tokens = ReadToEnd(scanner); + + Assert.Equal(2, tokens.Count); + + var stream = Assert.IsType(tokens[0].Data); + + var str = Encoding.UTF8.GetString(stream.Data.ToArray()); + + Assert.StartsWith("H‰œUkLSgþÚh¹IÝÅl", str); + + Assert.Equal(2, locationProvider.Offsets[new IndirectReference(352, 0)].Value1); + } + + [Fact] + public void ReadsStreamObjectWithInvalidLength() + { + string invalidLengthStream = "ABCD" + new string('e', 3996); + + var s = $@" +352 0 obj +<< /S 1273 /Filter /FlateDecode /Length 353 0 R >> +stream +{invalidLengthStream} +endstream +endobj +353 0 obj +1479 +endobj".Replace("\r\n", "\n").Replace("\n", "\r\n"); + + var locationProvider = new TestObjectLocationProvider(); + // Mark location of "353 0 obj" + locationProvider.Offsets[new IndirectReference(353, 0)] = XrefLocation.File(1643); + + var scanner = GetScanner(s, locationProvider); + + var tokens = ReadToEnd(scanner); + + Assert.Equal(2, tokens.Count); + + var stream = Assert.IsType(tokens[0].Data); + + var data = stream.Data.ToArray(); + + var str = Encoding.UTF8.GetString(data); + + Assert.Equal(data.Length, invalidLengthStream.Length); + Assert.StartsWith("ABCDeeeee", str); + + Assert.Equal(2, locationProvider.Offsets[new IndirectReference(352, 0)].Value1); + } + + [Fact] + public void ReadsSimpleStreamObject() + { + // Length of the bytes as found by Encoding.UTF8.GetBytes is 45 + var s = @" +574387 0 obj +<< /Length 45 >> +stream +À“Éððr¥8»P£ØêÁi½®Û(éhŽ‘ú +endstream +endobj".Replace("\r\n", "\n").Replace("\n", "\r\n"); + + var scanner = GetScanner(s); + + var token = ReadToEnd(scanner)[0]; + + var stream = Assert.IsType(token.Data); + + var bytes = stream.Data.ToArray(); + Assert.Equal(45, bytes.Length); + + var outputString = Encoding.UTF8.GetString(bytes); + + Assert.Equal("À“Éððr¥8»P£ØêÁi½®Û(éhŽ‘ú", outputString); + } + + [Fact] + public void ReadsStreamWithIndirectLength() + { + var s = @"5 0 obj 52 endobj + + + +12 0 obj + +<< /Length 5 0 R /S 1245 >> + +stream 
+%¥×³®í»š}%§X{{tøNåÝž¶ö¢ÖÞ¾–~´¼ +endstream +endobj".Replace("\r\n", "\n").Replace("\n", "\r\n"); + var locationProvider = new TestObjectLocationProvider(); + + locationProvider.Offsets[new IndirectReference(5, 0)] = XrefLocation.File(0); + + var scanner = GetScanner(s, locationProvider); + + var token = ReadToEnd(scanner)[1]; + + var stream = Assert.IsType(token.Data); + + var bytes = stream.Data.ToArray(); + Assert.Equal(52, bytes.Length); + + var outputString = Encoding.UTF8.GetString(bytes); + + Assert.Equal("%¥×³®í»š}%§X{{tøNåÝž¶ö¢ÖÞ¾–~´¼", outputString); + } + + [Fact] + public void ReadsStreamWithMissingLength() + { + var s = @" +12655 0 obj + +<< /S 1245 >> + +stream +%¥×³®í»š}%§X{{tøNåÝž¶ö¢ÖÞgrehtyyy$&%&£$££(*¾–~´¼ +endstream +endobj".Replace("\r\n", "\n").Replace("\n", "\r\n"); + + var scanner = GetScanner(s); + + var token = ReadToEnd(scanner)[0]; + + Assert.Equal(12655, token.Number.ObjectNumber); + + var stream = Assert.IsType(token.Data); + + Assert.Equal("1245", stream.StreamDictionary.Data["S"].ToString()); + + Assert.Equal("%¥×³®í»š}%§X{{tøNåÝž¶ö¢ÖÞgrehtyyy$&%&£$££(*¾–~´¼", Encoding.UTF8.GetString(stream.Data.ToArray())); + } + + [Fact] + public void ReadsStreamWithoutBreakBeforeEndstream() + { + var s = @" +1 0 obj +12 +endobj + +7 0 obj +<< /Length 288 + /Filter /FlateDecode >> +stream +xœ]‘ËjÃ0E÷ÿÃ,ÓEð#NÒ€1¤N^ôA~€-]A- YYøï+Ï4¡t#qfîFWQY*­Dïv5:è”–§ñjB‹½Òa¤ •p7¤K  ƒÈûëyr8Tº!Ïà úð‚ÉÙVG9¶ø@Å7+Ñ*ÝÃ곬¹T_ùƵƒ8 Š$vË̗Ƽ6BDöu%½B¹yí$—Ù ¤\Hx71JœL#Ð6ºÇ0È㸀ü|. 
µüßõÏ""WÛ‰¯Æ.êÄ«ã8; ¤iL°!Ø %É`K°ßì¸ÃöÜáÜ)  [‚#CFðİ#(yƒg^ÿ¶æò +ÿž“¸Zë#¢?¢h–P”Æû?šÑï÷ø¯‰Šendstream +endobj + +9 0 obj +16 +endobj".Replace("\r\n", "\n").Replace("\n", "\r\n"); + + var scanner = GetScanner(s); + + var token = ReadToEnd(scanner)[1]; + + Assert.Equal(7, token.Number.ObjectNumber); + } + + [Fact] + public void ReadsStringsWithMissingEndBracket() + { + var input = @"5 0 obj +<< +/Kids [4 0 R 12 0 R 17 0 R 20 0 R 25 0 R 28 0 R ] +/Count 6 +/Type /Pages +/MediaBox [ 0 0 612 792 ] +>> +endobj +1 0 obj +<< +/Creator (Corel WordPerfect - [D:\Wpdocs\WEBSITE\PROC&POL.WP6 (unmodified) +/CreationDate (D:19980224130723) +/Title (Proc&Pol.pdf) +/Author (J. L. Swezey) +/Producer (Acrobat PDFWriter 3.03 for Windows NT) +/Keywords (Budapest Treaty; Patent deposits; IDA) +/Subject (Patent Collection Procedures and Policies) +>> +endobj +3 0 obj +<< +/Pages 5 0 R +/Type /Catalog +>> +endobj".Replace("\r\n", "\n").Replace("\n", "\r\n"); + + var scanner = GetScanner(input); + + var tokens = ReadToEnd(scanner); + + Assert.Equal(3, tokens.Count); + + var first = tokens[0]; + Assert.Equal(5, first.Number.ObjectNumber); + + var second = tokens[1]; + Assert.Equal(1, second.Number.ObjectNumber); + + var third = tokens[2]; + Assert.Equal(3, third.Number.ObjectNumber); + } + + [Fact] + public void ReadsDictionaryContainingNull() + { + var input = @"14224 0 obj +<> +endobj".Replace("\r\n", "\n").Replace("\n", "\r\n"); + + var scanner = GetScanner(input); + + var tokens = ReadToEnd(scanner); + + var dictionaryToken = tokens[0].Data as DictionaryToken; + + Assert.NotNull(dictionaryToken); + + var encryptValue = dictionaryToken.Data["Encrypt"]; + + Assert.IsType(encryptValue); + } + + [Fact] + public void ReadMultipleNestedDictionary() + { + var input = + @" + 4 0 obj + << /Type /Font /Subtype /Type1 /Name /AF1F040+Arial /BaseFont /Arial /FirstChar 32 /LastChar 255 + /Encoding + << + /Type /Encoding /BaseEncoding /WinAnsiEncoding + /Differences [128 /Euro 130 /quotesinglbase /florin 
/quotedblbase /ellipsis /dagger /daggerdbl /circumflex /perthousand /Scaron /guilsinglleft /OE 142 /Zcaron 145 + /quoteleft /quoteright /quotedblleft /quotedblright /bullet /endash /emdash /tilde /trademark /scaron /guilsinglright /oe 158 /zcaron /Ydieresis /space /exclamdown + /cent /sterling /currency /yen /brokenbar /section /dieresis /copyright /ordfeminine /guillemotleft /logicalnot /hyphen /registered /macron /degree /plusminus + /twosuperior /threesuperior /acute /mu /paragraph /periodcentered /cedilla /onesuperior /ordmasculine /guillemotright /onequarter /onehalf /threequarters + /questiondown /Agrave /Aacute /Acircumflex /Atilde /Adieresis /Aring /AE /Ccedilla /Egrave /Eacute /Ecircumflex /Edieresis /Igrave /Iacute /Icircumflex /Idieresis + /Eth /Ntilde /Ograve /Oacute /Ocircumflex /Otilde /Odieresis /multiply /Oslash /Ugrave /Uacute /Ucircumflex /Udieresis /Yacute /Thorn /germandbls /agrave /aacute + /acircumflex /atilde /adieresis /aring /ae /ccedilla /egrave /eacute /ecircumflex /edieresis /igrave /iacute /icircumflex /idieresis /eth /ntilde /ograve /oacute + /ocircumflex /otilde /odieresis /divide /oslash /ugrave /uacute /ucircumflex /udieresis /yacute /thorn /ydieresis ] + >> + /Widths [278 278 355 556 556 889 667 191 333 333 389 584 278 333 278 278 + 556 556 556 556 556 556 556 556 556 556 278 278 584 584 584 556 + 1015 667 667 722 722 667 611 778 722 278 500 667 556 833 722 778 + 667 778 722 667 611 722 667 944 667 667 611 278 278 278 469 556 + 333 556 556 500 556 556 278 556 556 222 222 500 222 833 556 556 + 556 556 333 500 278 556 500 722 500 500 500 334 260 334 584 750 + 556 750 222 556 333 1000 556 556 333 1000 667 333 1000 750 611 750 + 750 222 222 333 333 350 556 1000 333 1000 500 333 944 750 500 667 + 278 333 556 556 556 556 260 556 333 737 370 556 584 333 737 552 + 400 549 333 333 333 576 537 278 333 333 365 556 834 834 834 611 + 667 667 667 667 667 667 1000 722 667 667 667 667 278 278 278 278 + 722 722 778 778 778 778 778 584 778 722 722 
722 722 667 667 611 + 556 556 556 556 556 556 889 500 556 556 556 556 278 278 278 278 + 556 556 556 556 556 556 556 549 611 556 556 556 556 500 556 500 + ] + >> + >> + endobj + ".Replace("\r\n", "\n").Replace("\n", "\r\n"); + + var scanner = GetScanner(input); + + var tokens = ReadToEnd(scanner); + + var dictionaryToken = tokens[0].Data as DictionaryToken; + + Assert.NotNull(dictionaryToken); + } + + [Fact] + public void ReadsDictionaryWithoutEndObjBeforeNextObject() + { + var input = @"1 0 obj +<> +2 0 obj +<> +endobj".Replace("\r\n", "\n").Replace("\n", "\r\n"); + + var scanner = GetScanner(input); + + var tokens = ReadToEnd(scanner); + + Assert.Equal(2, tokens.Count); + + var dictionaryToken = Assert.IsType(tokens[0].Data); + var typeValue = dictionaryToken.Data["Type"]; + Assert.IsType(typeValue); + + dictionaryToken = tokens[1].Data as DictionaryToken; + Assert.NotNull(dictionaryToken); + typeValue = dictionaryToken.Data["Length"]; + Assert.IsType(typeValue); + } + + [Fact] + public void ReadsStreamWithoutEndObjBeforeNextObject() + { + var input = @"1 0 obj +<> +stream +aaaa +endstream +2 0 obj +<> +endobj".Replace("\r\n", "\n").Replace("\n", "\r\n"); + + var scanner = GetScanner(input); + + var tokens = ReadToEnd(scanner); + + Assert.Equal(2, tokens.Count); + + Assert.IsType(tokens[0].Data); + + var dictionaryToken = Assert.IsType(tokens[1].Data); + var typeValue = dictionaryToken.Data["Length"]; + Assert.IsType(typeValue); + } + + [Theory] + [InlineData("startxref")] + [InlineData("xref")] + public void ReadsStreamWithoutEndObjBeforeToken(string token) + { + var input = @$"1 0 obj +<> +stream +aaaa +endstream +{token}".Replace("\r\n", "\n").Replace("\n", "\r\n"); + + var scanner = GetScanner(input); + + var tokens = ReadToEnd(scanner); + + Assert.Single(tokens); + + Assert.IsType(tokens[0].Data); + } + + [Theory] + [InlineData("startxref")] + [InlineData("xref")] + public void ReadsDictionaryWithoutEndObjBeforeToken(string token) + { + var input = @$"1 0 obj 
+<> +{token}".Replace("\r\n", "\n").Replace("\n", "\r\n"); + + var scanner = GetScanner(input); + + var tokens = ReadToEnd(scanner); + + Assert.Single(tokens); + + var dictionaryToken = Assert.IsType(tokens[0].Data); + var typeValue = dictionaryToken.Data["Type"]; + Assert.IsType(typeValue); + } + + [Fact] + public void ReadsStreamWithoutEndStreamBeforeEndObj() + { + var input = @"1 0 obj +<> +stream +aaaa +endobj +2 0 obj +<> +endobj".Replace("\r\n", "\n").Replace("\n", "\r\n"); + + var scanner = GetScanner(input); + + var tokens = ReadToEnd(scanner); + + Assert.Equal(2, tokens.Count); + + Assert.IsType(tokens[0].Data); + + var dictionaryToken = Assert.IsType(tokens[1].Data); + var lengthValue = dictionaryToken.Data["Length"]; + Assert.IsType(lengthValue); + } + + [Theory] + [InlineData(">>")] + [InlineData("randomstring")] + public void ReadsIndirectObjectsDictionaryWithContentBeforeEndObj(string addedContent) + { + var input = @$"1 0 obj +<> +{addedContent}endobj +2 0 obj +<> +endobj".Replace("\r\n", "\n").Replace("\n", "\r\n"); + + var strictScanner = GetScanner(input); + + var tokens = ReadToEnd(strictScanner); + Assert.Empty(tokens); + + + var lenientScanner = GetScanner(input, useLenientParsing: true); + tokens = ReadToEnd(lenientScanner); + + Assert.Equal(2, tokens.Count); + + var dictionaryToken = Assert.IsType(tokens[0].Data); + var typeValue = dictionaryToken.Data["Type"]; + Assert.IsType(typeValue); + + dictionaryToken = Assert.IsType(tokens[1].Data); + var lengthValue = dictionaryToken.Data["Length"]; + Assert.IsType(lengthValue); + } + + [Theory] + [InlineData(">>")] + [InlineData("randomstring")] + public void ReadsIndirectObjectsStreamWithAddedContentBeforeStream(string addedContent) + { + var input = @$"1 0 obj +<> +{addedContent}stream +aaaa +endstream +endobj +2 0 obj +<> +endobj".Replace("\r\n", "\n").Replace("\n", "\r\n"); + + var strictScanner = GetScanner(input); + + var tokens = ReadToEnd(strictScanner); + Assert.Equal(2, tokens.Count); + // 
this is linked to the parsing choosing the last token parsed in obj. + // It can probably be challenged against taking the first one. + var operatorToken = Assert.IsType(tokens[0].Data); + Assert.Equal("endstream", operatorToken.Data); + + var dictionaryToken = Assert.IsType(tokens[1].Data); + var lengthValue = dictionaryToken.Data["Length"]; + Assert.IsType(lengthValue); + + var lenientScanner = GetScanner(input, useLenientParsing:true); + tokens = ReadToEnd(lenientScanner); + + Assert.Equal(2, tokens.Count); + + Assert.IsType(tokens[0].Data); + + dictionaryToken = Assert.IsType(tokens[1].Data); + lengthValue = dictionaryToken.Data["Length"]; + Assert.IsType(lengthValue); + } + + private static PdfTokenScanner GetScanner(string s, TestObjectLocationProvider locationProvider = null, bool useLenientParsing = false) + { + var input = StringBytesTestConverter.Convert(s, false); + + return new PdfTokenScanner(input.Bytes, + locationProvider ?? new TestObjectLocationProvider(), + new TestFilterProvider(), + NoOpEncryptionHandler.Instance, + new FileHeaderOffset(0), + useLenientParsing ? new ParsingOptions() : ParsingOptions.LenientParsingOff, + new StackDepthGuard(256)); + } + + private static IReadOnlyList ReadToEnd(PdfTokenScanner scanner) + { + var result = new List(); + + while (scanner.MoveNext()) + { + if (scanner.CurrentToken is ObjectToken obj) + { + result.Add(obj); + } + else + { + throw new InvalidOperationException($"Pdf token scanner produced token which was not an object token: {scanner.CurrentToken}."); + } + } + + return result; + } + } +} diff --git a/src/UglyToad.PdfPig.Tokens/ObjectToken.cs b/src/UglyToad.PdfPig.Tokens/ObjectToken.cs index 7417429a6..606434645 100644 --- a/src/UglyToad.PdfPig.Tokens/ObjectToken.cs +++ b/src/UglyToad.PdfPig.Tokens/ObjectToken.cs @@ -12,7 +12,7 @@ public class ObjectToken : IDataToken /// /// The offset to the start of the object number from the start of the file in bytes. 
/// - public long Position { get; } + public XrefLocation Position { get; } /// /// The object and generation number of the object. @@ -30,7 +30,7 @@ public class ObjectToken : IDataToken /// The offset in bytes from the start of the file for this object. /// The identifier for this object. /// The data contained in this object. - public ObjectToken(long position, IndirectReference number, IToken data) + public ObjectToken(XrefLocation position, IndirectReference number, IToken data) { Position = position; Number = number; diff --git a/src/UglyToad.PdfPig/AcroForms/AcroFormFactory.cs b/src/UglyToad.PdfPig/AcroForms/AcroFormFactory.cs index a9ee9ef09..875b45447 100644 --- a/src/UglyToad.PdfPig/AcroForms/AcroFormFactory.cs +++ b/src/UglyToad.PdfPig/AcroForms/AcroFormFactory.cs @@ -28,12 +28,12 @@ internal class AcroFormFactory private readonly IPdfTokenScanner tokenScanner; private readonly ILookupFilterProvider filterProvider; - private readonly IReadOnlyDictionary objectOffsets; + private readonly IReadOnlyDictionary objectOffsets; public AcroFormFactory( IPdfTokenScanner tokenScanner, ILookupFilterProvider filterProvider, - IReadOnlyDictionary objectOffsets) + IReadOnlyDictionary objectOffsets) { this.tokenScanner = tokenScanner ?? throw new ArgumentNullException(nameof(tokenScanner)); this.filterProvider = filterProvider ?? 
throw new ArgumentNullException(nameof(filterProvider)); diff --git a/src/UglyToad.PdfPig/Filters/FlateFilter.cs b/src/UglyToad.PdfPig/Filters/FlateFilter.cs index 73428907d..09fdad062 100644 --- a/src/UglyToad.PdfPig/Filters/FlateFilter.cs +++ b/src/UglyToad.PdfPig/Filters/FlateFilter.cs @@ -2,11 +2,10 @@ { using Fonts; using System; - using System.Buffers.Binary; using System.IO; using System.IO.Compression; using Tokens; - using UglyToad.PdfPig.Core; + using Core; using Util; /// @@ -55,89 +54,41 @@ public Memory Decode(Memory input, DictionaryToken streamDictionary, return input; } - private static Memory Decompress(Memory input, int predictor, int colors, int bitsPerComponent, int columns) + private static Memory Decompress(Memory input, + int predictor, + int colors, + int bitsPerComponent, + int columns) { -#if NET using var memoryStream = MemoryHelper.AsReadOnlyMemoryStream(input); - try - { - using (var zlib = new ZLibStream(memoryStream, CompressionMode.Decompress)) - using (var output = new MemoryStream((int)(input.Length * 1.5))) - using (var f = PngPredictor.WrapPredictor(output, predictor, colors, bitsPerComponent, columns)) - { - zlib.CopyTo(f); - f.Flush(); - - return output.AsMemory(); - } - } - catch (InvalidDataException ex) - { - throw new CorruptCompressedDataException("Invalid Flate compressed stream encountered", ex); - } -#else - // Ideally we would like to use the ZLibStream class but that is only available in .NET 5+. - // We look at the raw data now - // * First we have 2 bytes, specifying the type of compression - // * Then we have the deflated data - // * Then we have a 4 byte checksum (Adler32) - - // Would be so nice to have zlib do the framing here... but the deflate stream already reads data from the stream that we need. - - using var memoryStream = MemoryHelper.AsReadOnlyMemoryStream(input.Slice(2, input.Length - 2 /* Header */ - 4 /* Checksum */)); - // The first 2 bytes are the header which DeflateStream can't handle. 
After the s - var adlerBytes = input.Slice(input.Length - 4, 4).Span; - uint expected = BinaryPrimitives.ReadUInt32BigEndian(adlerBytes); - uint altExpected = expected; - - // Sometimes the data ends with "\r\n", "\r" or "\n" and we don't know if it is part of the zlib - // Ideally this would have been removed by the caller from the provided length... - if (adlerBytes[3] == '\n' || adlerBytes[3] == '\r') - { - if (adlerBytes[3] == '\n' && adlerBytes[2] == '\r') - { - // Now we don't know which value is the good one. The value could be ok, or padding. - // Lets allow both values for now. Allowing two out of 2^32 is much better than allowing everything - adlerBytes = input.Slice(input.Length - 6, 4).Span; - } - else - { - // Same but now for just '\n' or '\r' instead of '\r\n' - adlerBytes = input.Slice(input.Length - 5, 4).Span; - } - - altExpected = BinaryPrimitives.ReadUInt32BigEndian(adlerBytes); - } - + // The first 2 bytes are the header which DeflateStream does not support. + memoryStream.ReadByte(); + memoryStream.ReadByte(); try { - using (var deflate = new DeflateStream(memoryStream, CompressionMode.Decompress)) - using (var adlerStream = new Adler32ChecksumStream(deflate)) - using (var output = new MemoryStream((int)(input.Length * 1.5))) - using (var f = PngPredictor.WrapPredictor(output, predictor, colors, bitsPerComponent, columns)) - { - adlerStream.CopyTo(f); - f.Flush(); - - uint actual = adlerStream.Checksum; - if (expected != actual && altExpected != actual) - { - throw new CorruptCompressedDataException("Flate stream has invalid checksum"); - } - - return output.AsMemory(); - } + using var deflate = new DeflateStream(memoryStream, CompressionMode.Decompress); + using var output = new MemoryStream((int)(input.Length * 1.5)); + using var f = PngPredictor.WrapPredictor(output, predictor, colors, bitsPerComponent, columns); + + deflate.CopyTo(f); + f.Flush(); + + return output.AsMemory(); } catch (InvalidDataException ex) { throw new 
CorruptCompressedDataException("Invalid Flate compressed stream encountered", ex); } -#endif } - /// - public byte[] Encode(Stream input, DictionaryToken streamDictionary, int index) + /// + /// Convert a decoded data stream back to the encoded version. + /// + /// The decoded data. + /// The stream dictionary with the parameters to use. + /// The Flate encoded data. + public byte[] Encode(Stream input, DictionaryToken streamDictionary) { const int headerLength = 2; const int checksumLength = 4; diff --git a/src/UglyToad.PdfPig/Parser/FileStructure/FirstPassParser.cs b/src/UglyToad.PdfPig/Parser/FileStructure/FirstPassParser.cs index 4553b8550..35d8e2c6e 100644 --- a/src/UglyToad.PdfPig/Parser/FileStructure/FirstPassParser.cs +++ b/src/UglyToad.PdfPig/Parser/FileStructure/FirstPassParser.cs @@ -16,7 +16,7 @@ public static FirstPassResults Parse( { log ??= new NoOpLog(); - IReadOnlyDictionary? bruteForceOffsets = null; + IReadOnlyDictionary? bruteForceOffsets = null; var didBruteForce = false; DictionaryToken? bruteForceTrailer = null; @@ -92,7 +92,7 @@ public static FirstPassResults Parse( } DictionaryToken? lastTrailer = null; - var flattenedOffsets = new Dictionary(); + var flattenedOffsets = new Dictionary(); foreach (var xrefPart in orderedXrefs) { if (xrefPart.Dictionary != null) @@ -230,12 +230,12 @@ internal class FirstPassResults /// /// All offsets found if a brute-force search was applied. /// - public IReadOnlyDictionary? BruteForceOffsets { get; } + public IReadOnlyDictionary? BruteForceOffsets { get; } /// /// All offsets found from the leaf xref. /// - public IReadOnlyDictionary XrefOffsets { get; } + public IReadOnlyDictionary XrefOffsets { get; } /// /// The trailer dictionary of the leaf xref if we found any. @@ -244,8 +244,8 @@ internal class FirstPassResults public FirstPassResults( IReadOnlyList parts, - IReadOnlyDictionary? bruteForceOffsets, - IReadOnlyDictionary xrefOffsets, + IReadOnlyDictionary? 
bruteForceOffsets, + IReadOnlyDictionary xrefOffsets, DictionaryToken? trailer) { Parts = parts; diff --git a/src/UglyToad.PdfPig/Parser/FileStructure/IXrefSection.cs b/src/UglyToad.PdfPig/Parser/FileStructure/IXrefSection.cs index 470722935..8e56f5821 100644 --- a/src/UglyToad.PdfPig/Parser/FileStructure/IXrefSection.cs +++ b/src/UglyToad.PdfPig/Parser/FileStructure/IXrefSection.cs @@ -14,7 +14,7 @@ internal interface IXrefSection /// /// The bytes offsets of the objects in this xref. /// - public IReadOnlyDictionary ObjectOffsets { get; } + public IReadOnlyDictionary ObjectOffsets { get; } /// /// The dictionary for this xref, for the trailer xref this is the trailer dictionary, for streams the stream dictionary. diff --git a/src/UglyToad.PdfPig/Parser/FileStructure/XrefBruteForcer.cs b/src/UglyToad.PdfPig/Parser/FileStructure/XrefBruteForcer.cs index b038f1625..998d0f5ee 100644 --- a/src/UglyToad.PdfPig/Parser/FileStructure/XrefBruteForcer.cs +++ b/src/UglyToad.PdfPig/Parser/FileStructure/XrefBruteForcer.cs @@ -19,7 +19,7 @@ public static Result FindAllXrefsInFileOrder( // Guard against circular references; only read xref at each offset once var xrefOffsetSeen = new HashSet(); - var bruteForceObjPositions = new Dictionary(); + var bruteForceObjPositions = new Dictionary(); DictionaryToken? trailer = null; @@ -123,7 +123,7 @@ void AddQueues(long num) if (buffer.EndsWith(" obj") && numericsQueue[0] > 0) { - bruteForceObjPositions[new IndirectReference(numericsQueue[0], (int)numericsQueue[1])] = positionsQueue[0]; + bruteForceObjPositions[new IndirectReference(numericsQueue[0], (int)numericsQueue[1])] = XrefLocation.File(positionsQueue[0]); lastObjPosition = positionsQueue[0]; @@ -208,12 +208,12 @@ void AddQueues(long num) public class Result( IReadOnlyList xRefParts, - IReadOnlyDictionary objectOffsets, + IReadOnlyDictionary objectOffsets, DictionaryToken? 
lastTrailer) { public IReadOnlyList XRefParts { get; } = xRefParts; - public IReadOnlyDictionary ObjectOffsets { get; } = objectOffsets; + public IReadOnlyDictionary ObjectOffsets { get; } = objectOffsets; public DictionaryToken? LastTrailer { get; } = lastTrailer; } diff --git a/src/UglyToad.PdfPig/Parser/FileStructure/XrefStream.cs b/src/UglyToad.PdfPig/Parser/FileStructure/XrefStream.cs index 1a8f3f99d..353288425 100644 --- a/src/UglyToad.PdfPig/Parser/FileStructure/XrefStream.cs +++ b/src/UglyToad.PdfPig/Parser/FileStructure/XrefStream.cs @@ -10,7 +10,7 @@ internal sealed class XrefStream : IXrefSection /// /// The corresponding byte offset for each keyed object in this document. /// - public IReadOnlyDictionary ObjectOffsets { get; } + public IReadOnlyDictionary ObjectOffsets { get; } public DictionaryToken Dictionary { get; } @@ -20,7 +20,7 @@ internal sealed class XrefStream : IXrefSection public XrefStream( long offset, - IReadOnlyDictionary objectOffsets, + IReadOnlyDictionary objectOffsets, DictionaryToken streamDictionary, XrefOffsetCorrection correctionType, long offsetCorrection) diff --git a/src/UglyToad.PdfPig/Parser/FileStructure/XrefStreamParser.cs b/src/UglyToad.PdfPig/Parser/FileStructure/XrefStreamParser.cs index 3f5b70029..6bf1b5f1b 100644 --- a/src/UglyToad.PdfPig/Parser/FileStructure/XrefStreamParser.cs +++ b/src/UglyToad.PdfPig/Parser/FileStructure/XrefStreamParser.cs @@ -98,7 +98,7 @@ internal static class XrefStreamParser ? 
stackalloc byte[fieldSizes.LineLength] : new byte[fieldSizes.LineLength]; - var numbers = new List<(long obj, int gen, int off)>(); + var numbers = new List<(long obj, int gen, XrefLocation location)>(); foreach (var objectNumber in objectNumbers) { @@ -136,7 +136,7 @@ internal static class XrefStreamParser return new XrefStream( xrefOffset, - numbers.ToDictionary(x => new IndirectReference(x.obj, x.gen), x => (long)x.off), + numbers.ToDictionary(x => new IndirectReference(x.obj, x.gen), x => x.location), dictToken, offsetCorrectionType, offsetCorrection); @@ -175,7 +175,7 @@ private static void ReadNextStreamObject( int type, long objectNumber, XrefFieldSize fieldSizes, - List<(long, int, int)> results, + List<(long, int, XrefLocation)> results, ReadOnlySpan lineBuffer) { switch (type) @@ -184,19 +184,23 @@ private static void ReadNextStreamObject( // Ignore free objects. break; case 1: - // Non object stream entries. - var offset = 0; - for (var i = 0; i < fieldSizes.Field2Size; i++) - { - offset += (lineBuffer[i + fieldSizes.Field1Size] & 0x00ff) << ((fieldSizes.Field2Size - i - 1) * 8); - } - var genNum = 0; - for (var i = 0; i < fieldSizes.Field3Size; i++) - { - genNum += (lineBuffer[i + fieldSizes.Field1Size + fieldSizes.Field2Size] & 0x00ff) << ((fieldSizes.Field3Size - i - 1) * 8); + var offset = ReadUnsigned( + lineBuffer, + fieldSizes.Field1Size, + fieldSizes.Field2Size); + + var genNum = ReadUnsigned( + lineBuffer, + fieldSizes.Field1Size + fieldSizes.Field2Size, + fieldSizes.Field3Size); + + if (offset < 0) + { + throw new PdfDocumentFormatException( + $"Location with negative offset {offset} found for object {objectNumber}"); } - results.Add((objectNumber, genNum, offset)); + results.Add((objectNumber, (int)genNum, XrefLocation.File(offset))); break; case 2: @@ -205,28 +209,49 @@ private static void ReadNextStreamObject( * 2nd argument is object number of object stream * 3rd argument is index of object within object stream * - * For sequential 
PDFParser we do not need this information - * because - * These objects are handled by the dereferenceObjects() method - * since they're only pointing to object numbers - * - * However for XRef aware parsers we have to know which objects contain - * object streams. We will store this information in normal xref mapping - * table but add object stream number with minus sign in order to - * distinguish from file offsets */ - var objstmObjNr = 0; - for (var i = 0; i < fieldSizes.Field2Size; i++) + + var objectStreamNumber = ReadUnsigned( + lineBuffer, + fieldSizes.Field1Size, + fieldSizes.Field2Size); + + var streamIndex = ReadUnsigned( + lineBuffer, + fieldSizes.Field1Size + fieldSizes.Field2Size, + fieldSizes.Field3Size); + + if (objectStreamNumber < 0) + { + throw new PdfDocumentFormatException( + $"Location with negative or zero object stream number {objectStreamNumber} found for object {objectNumber}"); + } + + if (streamIndex < 0) { - objstmObjNr += (lineBuffer[i + fieldSizes.Field1Size] & 0x00ff) << ((fieldSizes.Field2Size - i - 1) * 8); + throw new PdfDocumentFormatException( + $"Location with negative stream index {streamIndex} found for object {objectNumber} in stream {objectStreamNumber}"); } - results.Add((objectNumber, 0, -objstmObjNr)); + results.Add((objectNumber, 0, XrefLocation.Stream(objectStreamNumber, (int)streamIndex))); break; } } + private static long ReadUnsigned(ReadOnlySpan buffer, int start, int width) + { + long value = 0; + + for (int i = 0; i < width; i++) + { + value <<= 8; + value |= buffer[start + i]; + } + + return value; + } + private static (long from, long? 
to) ReadStreamTolerant(IInputBytes bytes) { var buffer = new CircularByteBuffer("endstream ".Length); diff --git a/src/UglyToad.PdfPig/Parser/FileStructure/XrefTable.cs b/src/UglyToad.PdfPig/Parser/FileStructure/XrefTable.cs index 011b25ba4..0370c37f6 100644 --- a/src/UglyToad.PdfPig/Parser/FileStructure/XrefTable.cs +++ b/src/UglyToad.PdfPig/Parser/FileStructure/XrefTable.cs @@ -13,7 +13,7 @@ internal sealed class XrefTable : IXrefSection /// /// The corresponding byte offset for each keyed object in this document. /// - public IReadOnlyDictionary ObjectOffsets { get; } + public IReadOnlyDictionary ObjectOffsets { get; } public DictionaryToken? Dictionary { get; } @@ -23,7 +23,7 @@ internal sealed class XrefTable : IXrefSection public XrefTable( long offset, - IReadOnlyDictionary objectOffsets, + IReadOnlyDictionary objectOffsets, DictionaryToken? trailer, XrefOffsetCorrection correctionType, long offsetCorrection) diff --git a/src/UglyToad.PdfPig/Parser/FileStructure/XrefTableParser.cs b/src/UglyToad.PdfPig/Parser/FileStructure/XrefTableParser.cs index 44f643c3e..99db0af7c 100644 --- a/src/UglyToad.PdfPig/Parser/FileStructure/XrefTableParser.cs +++ b/src/UglyToad.PdfPig/Parser/FileStructure/XrefTableParser.cs @@ -152,7 +152,7 @@ internal static class XrefTableParser } } - var offsets = new Dictionary(); + var offsets = new Dictionary(); if (readNums.Count == 0) { if (trailer != null) @@ -233,7 +233,7 @@ bool TryReadBuff(int len) if (type == occupiedSentinel) { var indirectRef = new IndirectReference(objNum, (int)gen); - offsets[indirectRef] = objOffset; + offsets[indirectRef] = XrefLocation.File(objOffset); } objNum++; diff --git a/src/UglyToad.PdfPig/Parser/Parts/BruteForceSearcher.cs b/src/UglyToad.PdfPig/Parser/Parts/BruteForceSearcher.cs index 0ecde8965..ceecff647 100644 --- a/src/UglyToad.PdfPig/Parser/Parts/BruteForceSearcher.cs +++ b/src/UglyToad.PdfPig/Parser/Parts/BruteForceSearcher.cs @@ -18,7 +18,7 @@ internal static class BruteForceSearcher /// /// 
The bytes of the document. /// The object keys and offsets for the objects in this document. - public static IReadOnlyDictionary GetObjectLocations(IInputBytes bytes) + public static IReadOnlyDictionary GetObjectLocations(IInputBytes bytes) { if (bytes is null) { @@ -29,7 +29,7 @@ public static IReadOnlyDictionary GetObjectLocations(II var lastEndOfFile = GetLastEndOfFileMarker(bytes); - var results = new Dictionary(); + var results = new Dictionary(); var generationBytes = new StringBuilder(); var objectNumberBytes = new StringBuilder(); @@ -174,7 +174,7 @@ public static IReadOnlyDictionary GetObjectLocations(II var obj = long.Parse(objectNumberBytes.ToString(), CultureInfo.InvariantCulture); var generation = int.Parse(generationBytes.ToString(), CultureInfo.InvariantCulture); - results[new IndirectReference(obj, generation)] = bytes.CurrentOffset; + results[new IndirectReference(obj, generation)] = XrefLocation.File(bytes.CurrentOffset); generationBytes.Clear(); objectNumberBytes.Clear(); diff --git a/src/UglyToad.PdfPig/Tokenization/Scanner/IObjectLocationProvider.cs b/src/UglyToad.PdfPig/Tokenization/Scanner/IObjectLocationProvider.cs index c84d937fa..972969eae 100644 --- a/src/UglyToad.PdfPig/Tokenization/Scanner/IObjectLocationProvider.cs +++ b/src/UglyToad.PdfPig/Tokenization/Scanner/IObjectLocationProvider.cs @@ -6,9 +6,9 @@ internal interface IObjectLocationProvider { - bool TryGetOffset(IndirectReference reference, out long offset); + bool TryGetOffset(IndirectReference reference, out XrefLocation offset); - void UpdateOffset(IndirectReference reference, long offset); + void UpdateOffset(IndirectReference reference, XrefLocation offset); bool TryGetCached(IndirectReference reference, [NotNullWhen(true)] out ObjectToken? 
objectToken); diff --git a/src/UglyToad.PdfPig/Tokenization/Scanner/ObjectLocationProvider.cs b/src/UglyToad.PdfPig/Tokenization/Scanner/ObjectLocationProvider.cs index 8f0a8e9c5..c15d55fc0 100644 --- a/src/UglyToad.PdfPig/Tokenization/Scanner/ObjectLocationProvider.cs +++ b/src/UglyToad.PdfPig/Tokenization/Scanner/ObjectLocationProvider.cs @@ -13,16 +13,16 @@ internal class ObjectLocationProvider : IObjectLocationProvider private readonly IInputBytes bytes; - private IReadOnlyDictionary? bruteForcedOffsets; + private IReadOnlyDictionary? bruteForcedOffsets; - private readonly Dictionary offsets; + private readonly Dictionary offsets; public ObjectLocationProvider( - IReadOnlyDictionary xrefOffsets, - IReadOnlyDictionary? bruteForcedOffsets, + IReadOnlyDictionary xrefOffsets, + IReadOnlyDictionary? bruteForcedOffsets, IInputBytes bytes) { - offsets = new Dictionary(); + offsets = new Dictionary(); foreach (var xrefOffset in xrefOffsets) { offsets[xrefOffset.Key] = xrefOffset.Value; @@ -32,7 +32,7 @@ public ObjectLocationProvider( this.bytes = bytes; } - public bool TryGetOffset(IndirectReference reference, out long offset) + public bool TryGetOffset(IndirectReference reference, out XrefLocation offset) { if (bruteForcedOffsets != null && bruteForcedOffsets.TryGetValue(reference, out var bfOffset)) { @@ -42,16 +42,6 @@ public bool TryGetOffset(IndirectReference reference, out long offset) if (offsets.TryGetValue(reference, out offset)) { - if (offset + reference.ObjectNumber == 0) - { - // We have a case where 'offset' and - // 'reference.ObjectNumber' have the same value - // and opposite signs. 
- // This results in an infinite recursion in - // PdfTokenScanner.GetObjectFromStream() where - // `var streamObjectNumber = offset * -1;` - throw new PdfDocumentFormatException("Avoiding infinite recursion in ObjectLocationProvider.TryGetOffset() as 'offset' and 'reference.ObjectNumber' have the same value and opposite signs."); - } return true; } @@ -63,7 +53,7 @@ public bool TryGetOffset(IndirectReference reference, out long offset) return bruteForcedOffsets.TryGetValue(reference, out offset); } - public void UpdateOffset(IndirectReference reference, long offset) + public void UpdateOffset(IndirectReference reference, XrefLocation offset) { offsets[reference] = offset; } @@ -81,8 +71,9 @@ public void Cache(ObjectToken objectToken, bool force = false) } // Don't cache incorrect locations. - if (!force && offsets.TryGetValue(objectToken.Number, out var expected) - && objectToken.Position != expected) + if (!force + && offsets.TryGetValue(objectToken.Number, out var expected) + && (objectToken.Position.Type != expected.Type || objectToken.Position.Value1 != expected.Value1)) { return; } diff --git a/src/UglyToad.PdfPig/Tokenization/Scanner/PdfTokenScanner.cs b/src/UglyToad.PdfPig/Tokenization/Scanner/PdfTokenScanner.cs index 4794e6d49..8435573f6 100644 --- a/src/UglyToad.PdfPig/Tokenization/Scanner/PdfTokenScanner.cs +++ b/src/UglyToad.PdfPig/Tokenization/Scanner/PdfTokenScanner.cs @@ -1,5 +1,8 @@ namespace UglyToad.PdfPig.Tokenization.Scanner { + using Core; + using Encryption; + using Filters; using System; using System.Collections.Generic; using System.Diagnostics; @@ -7,9 +10,6 @@ using System.Globalization; using System.Linq; using System.Text.RegularExpressions; - using Core; - using Encryption; - using Filters; using Tokens; using UglyToad.PdfPig.Parser.FileStructure; @@ -164,7 +164,7 @@ public bool MoveNext() var actualReference = new IndirectReference(objectNumber.Int, generation.Int); var actualToken = encryptionHandler.Decrypt(actualReference, 
readTokens[0]); - CurrentToken = new ObjectToken(startPosition, actualReference, actualToken); + CurrentToken = new ObjectToken(XrefLocation.File(startPosition), actualReference, actualToken); readTokens.Clear(); coreTokenScanner.Seek(previousTokenPositions[0]); @@ -191,7 +191,7 @@ public bool MoveNext() var actualReference = new IndirectReference(objectNumber.Int, generation.Int); var actualToken = encryptionHandler.Decrypt(actualReference, readTokens[0]); - CurrentToken = new ObjectToken(startPosition, actualReference, actualToken); + CurrentToken = new ObjectToken(XrefLocation.File(startPosition), actualReference, actualToken); readTokens.Clear(); coreTokenScanner.Seek(previousTokenPositions[2]); @@ -291,9 +291,9 @@ public bool MoveNext() token = encryptionHandler.Decrypt(reference, token); - CurrentToken = new ObjectToken(startPosition, reference, token); + CurrentToken = new ObjectToken(XrefLocation.File(startPosition), reference, token); - objectLocationProvider.UpdateOffset(reference, startPosition); + objectLocationProvider.UpdateOffset(reference, XrefLocation.File(startPosition)); readTokens.Clear(); return true; @@ -626,10 +626,10 @@ private DictionaryToken GetStreamDictionary() // We can only find it if we know where it is. if (objectLocationProvider.TryGetOffset(lengthReference.Data, out var offset)) { - if (offset < 0) + if (offset.Type == XrefEntryType.ObjectStream) { - ushort searchDepth = 0; - var result = GetObjectFromStream(lengthReference.Data, offset, ref searchDepth); + Span stack = stackalloc int[7]; + var result = GetObjectFromStream(lengthReference.Data, offset, stack, 0); if (!(result.Data is NumericToken streamLengthToken)) { @@ -639,8 +639,9 @@ private DictionaryToken GetStreamDictionary() return streamLengthToken.Long; } + // Move to the length object and read it. - Seek(offset); + Seek(offset.Value1); // Keep a copy of the read tokens here since this list must be empty prior to move next. 
var oldData = new List(readTokens); @@ -721,19 +722,31 @@ public void DeregisterCustomTokenizer(ITokenizer tokenizer) public ObjectToken? Get(IndirectReference reference) { - ushort searchDepth = 0; - return Get(reference, ref searchDepth); + Span stack = stackalloc int[7]; + return Get(reference, stack, 0); } - private ObjectToken? Get(IndirectReference reference, ref ushort searchDepth) + private ObjectToken? Get(IndirectReference reference, Span navSet, byte depth) { - if (searchDepth > 100) + if (depth >= navSet.Length) { - throw new PdfDocumentFormatException("Reached maximum search depth while getting indirect reference."); + var chain = string.Join(", ", navSet.ToArray()); + throw new PdfDocumentFormatException($"Deep object chain detected when looking for {reference}: {chain}."); } - searchDepth++; + // Cycle detection (linear scan, but depth is tiny) + for (var i = 0; i < depth; i++) + { + if (navSet[i] == reference.ObjectNumber) + { + var chain = string.Join(", ", navSet.ToArray()); + throw new PdfDocumentFormatException( + $"Circular reference encountered when looking for object {reference}. Involved objects were: {chain}"); + } + } + navSet[depth] = (int)reference.ObjectNumber; + depth++; if (isDisposed) { @@ -756,20 +769,20 @@ public void DeregisterCustomTokenizer(ITokenizer tokenizer) } // Negative offsets refer to a stream with that number. 
- if (offset < 0) + if (offset.Type == XrefEntryType.ObjectStream) { - var result = GetObjectFromStream(reference, offset, ref searchDepth); + if (offset.Value1 == reference.ObjectNumber) + { + throw new PdfDocumentFormatException( + $"Object stream cannot contain itself, looking for object {reference} in {offset.Value1}"); + } - return result; - } + var result = GetObjectFromStream(reference, offset, navSet, depth); - if (offset == 0 && reference.Generation > ushort.MaxValue) - { - // TODO - To remove as should not happen anymore - return new ObjectToken(offset, reference, NullToken.Instance); + return result; } - Seek(offset); + Seek(offset.Value1); if (!MoveNext()) { @@ -793,7 +806,7 @@ public void ReplaceToken(IndirectReference reference, IToken token) { // Using 0 position as it isn't written to stream and this value doesn't // seem to be used by any callers. In future may need to revisit this. - overwrittenTokens[reference] = new ObjectToken(0, reference, token); + overwrittenTokens[reference] = new ObjectToken(XrefLocation.File(0), reference, token); } private bool TryBruteForceFileToFindReference(IndirectReference reference, [NotNullWhen(true)] out ObjectToken? 
result) @@ -826,11 +839,11 @@ private bool TryBruteForceFileToFindReference(IndirectReference reference, [NotN } } - private ObjectToken GetObjectFromStream(IndirectReference reference, long offset, ref ushort searchDepth) + private ObjectToken GetObjectFromStream(IndirectReference reference, XrefLocation offset, Span navSet, byte depth) { - var streamObjectNumber = offset * -1; + var streamObjectNumber = offset.Value1; - var streamObject = Get(new IndirectReference(streamObjectNumber, 0), ref searchDepth); + var streamObject = Get(new IndirectReference(streamObjectNumber, 0), navSet, depth); if (!(streamObject?.Data is StreamToken stream)) { @@ -853,7 +866,7 @@ private ObjectToken GetObjectFromStream(IndirectReference reference, long offset return result; } - private IReadOnlyList ParseObjectStream(StreamToken stream, long offset) + private IReadOnlyList ParseObjectStream(StreamToken stream, XrefLocation offset) { if (!stream.StreamDictionary.TryGet(NameToken.N, out var numberToken) || !(numberToken is NumericToken numberOfObjects)) diff --git a/src/UglyToad.PdfPig/Writer/Colors/OutputIntentsFactory.cs b/src/UglyToad.PdfPig/Writer/Colors/OutputIntentsFactory.cs index f2c48955b..3b92c13eb 100644 --- a/src/UglyToad.PdfPig/Writer/Colors/OutputIntentsFactory.cs +++ b/src/UglyToad.PdfPig/Writer/Colors/OutputIntentsFactory.cs @@ -15,7 +15,7 @@ public static ArrayToken GetOutputIntentsArray(Func { diff --git a/src/UglyToad.PdfPig/Writer/DataCompresser.cs b/src/UglyToad.PdfPig/Writer/DataCompressor.cs similarity index 96% rename from src/UglyToad.PdfPig/Writer/DataCompresser.cs rename to src/UglyToad.PdfPig/Writer/DataCompressor.cs index e143ab3ba..b47f397ff 100644 --- a/src/UglyToad.PdfPig/Writer/DataCompresser.cs +++ b/src/UglyToad.PdfPig/Writer/DataCompressor.cs @@ -6,7 +6,7 @@ using Filters; using Tokens; - internal static class DataCompresser + internal static class DataCompressor { public static byte[] CompressBytes(IReadOnlyList bytes) => 
CompressBytes(bytes.ToArray()); public static byte[] CompressBytes(byte[] bytes) @@ -15,7 +15,7 @@ public static byte[] CompressBytes(byte[] bytes) { var parameters = new DictionaryToken(new Dictionary()); var flater = new FlateFilter(); - var result = flater.Encode(memoryStream, parameters, 0); + var result = flater.Encode(memoryStream, parameters); return result; } } diff --git a/src/UglyToad.PdfPig/Writer/Fonts/TrueTypeWritingFont.cs b/src/UglyToad.PdfPig/Writer/Fonts/TrueTypeWritingFont.cs index 5b7941fc9..fce7184fa 100644 --- a/src/UglyToad.PdfPig/Writer/Fonts/TrueTypeWritingFont.cs +++ b/src/UglyToad.PdfPig/Writer/Fonts/TrueTypeWritingFont.cs @@ -51,7 +51,7 @@ public IndirectReferenceToken WriteFont(IPdfStreamWriter writer, IndirectRefere var newEncoding = new TrueTypeSubsetEncoding(characterMapping.Keys.ToList()); var subsetBytes = TrueTypeSubsetter.Subset(fontFileBytes.ToArray(), newEncoding); - var embeddedFile = DataCompresser.CompressToStream(subsetBytes); + var embeddedFile = DataCompressor.CompressToStream(subsetBytes); var fileRef = writer.WriteToken(embeddedFile); @@ -110,7 +110,7 @@ public IndirectReferenceToken WriteFont(IPdfStreamWriter writer, IndirectRefere var descriptor = writer.WriteToken(new DictionaryToken(descriptorDictionary)); var toUnicodeCMap = ToUnicodeCMapBuilder.ConvertToCMapStream(characterMapping); - var toUnicodeStream = DataCompresser.CompressToStream(toUnicodeCMap); + var toUnicodeStream = DataCompressor.CompressToStream(toUnicodeCMap); var toUnicode = writer.WriteToken(toUnicodeStream); var dictionary = new Dictionary diff --git a/src/UglyToad.PdfPig/Writer/NoTextTokenWriter.cs b/src/UglyToad.PdfPig/Writer/NoTextTokenWriter.cs index 05edada13..bb6364f12 100644 --- a/src/UglyToad.PdfPig/Writer/NoTextTokenWriter.cs +++ b/src/UglyToad.PdfPig/Writer/NoTextTokenWriter.cs @@ -107,7 +107,7 @@ private bool TryGetStreamWithoutText(StreamToken streamToken, [NotNullWhen(true) } outputStreamT.Seek(0, SeekOrigin.Begin); - var 
compressedBytes = DataCompresser.CompressBytes(outputStreamT.ToArray()); + var compressedBytes = DataCompressor.CompressBytes(outputStreamT.ToArray()); var outputStreamDictionary = new Dictionary() { { NameToken.Length, new NumericToken(compressedBytes.Length) }, diff --git a/src/UglyToad.PdfPig/Writer/PdfPageBuilder.cs b/src/UglyToad.PdfPig/Writer/PdfPageBuilder.cs index c3bacccdc..643429334 100644 --- a/src/UglyToad.PdfPig/Writer/PdfPageBuilder.cs +++ b/src/UglyToad.PdfPig/Writer/PdfPageBuilder.cs @@ -767,7 +767,7 @@ public AddedImage AddPng(Stream pngStream, PdfRectangle placementRectangle = def } } - var compressedSmask = DataCompresser.CompressBytes(smaskData); + var compressedSmask = DataCompressor.CompressBytes(smaskData); // Create a soft-mask. var smaskDictionary = new Dictionary @@ -786,7 +786,7 @@ public AddedImage AddPng(Stream pngStream, PdfRectangle placementRectangle = def smaskReference = documentBuilder.AddImage(new DictionaryToken(smaskDictionary), compressedSmask); } - var compressed = DataCompresser.CompressBytes(data); + var compressed = DataCompressor.CompressBytes(data); var imgDictionary = new Dictionary { @@ -1218,7 +1218,7 @@ public IndirectReferenceToken Write(IPdfStreamWriter writer) var bytes = memoryStream.ToArray(); - var stream = DataCompresser.CompressToStream(bytes); + var stream = DataCompressor.CompressToStream(bytes); return writer.WriteToken(stream); } diff --git a/src/UglyToad.PdfPig/Writer/PdfStreamWriter.cs b/src/UglyToad.PdfPig/Writer/PdfStreamWriter.cs index c1c360643..cd912ba9a 100644 --- a/src/UglyToad.PdfPig/Writer/PdfStreamWriter.cs +++ b/src/UglyToad.PdfPig/Writer/PdfStreamWriter.cs @@ -58,7 +58,7 @@ public virtual IndirectReferenceToken WriteToken(IToken token) var ir = ReserveObjectNumber(); offsets.Add(ir.Data, Stream.Position); - var obj = new ObjectToken(Stream.Position, ir.Data, token); + var obj = new ObjectToken(XrefLocation.File(Stream.Position), ir.Data, token); TokenWriter.WriteToken(obj, Stream); return 
ir; } @@ -71,7 +71,7 @@ public virtual IndirectReferenceToken WriteToken(IToken token, IndirectReference } offsets.Add(indirectReference.Data, Stream.Position); - var obj = new ObjectToken(Stream.Position, indirectReference.Data, token); + var obj = new ObjectToken(XrefLocation.File(Stream.Position), indirectReference.Data, token); TokenWriter.WriteToken(obj, Stream); return indirectReference; } @@ -98,7 +98,6 @@ public void CompletePdf(IndirectReferenceToken catalogReference, IndirectReferen TokenWriter.WriteCrossReferenceTable(offsets, catalogReference.Data, Stream, documentInformationReference?.Data); } - public void Dispose() { if (DisposeStream)