diff --git a/.github/workflows/run_common_crawl_tests.yml b/.github/workflows/run_common_crawl_tests.yml
index c92ee6d87..f8a923c83 100644
--- a/.github/workflows/run_common_crawl_tests.yml
+++ b/.github/workflows/run_common_crawl_tests.yml
@@ -12,7 +12,7 @@ jobs:
strategy:
fail-fast: false
matrix:
- pair: ["0000-0001", "0002-0003", "0004-0005", "0006-0007"]
+ pair: ["0000-0001", "0002-0003", "0004-0005", "0006-0007", "0008-0009"]
steps:
- uses: actions/checkout@v2
diff --git a/src/UglyToad.PdfPig.Core/XrefEntryType.cs b/src/UglyToad.PdfPig.Core/XrefEntryType.cs
new file mode 100644
index 000000000..8f954163e
--- /dev/null
+++ b/src/UglyToad.PdfPig.Core/XrefEntryType.cs
@@ -0,0 +1,20 @@
+namespace UglyToad.PdfPig.Core;
+
+/// <summary>
+/// Indicates where an object is located in the Xref.
+/// </summary>
+public enum XrefEntryType : byte
+{
+ /// <summary>
+ /// Free object.
+ /// </summary>
+ Free = 0,
+ /// <summary>
+ /// Located as an object in the file.
+ /// </summary>
+ File = 1,
+ /// <summary>
+ /// Located in a compressed object stream.
+ /// </summary>
+ ObjectStream = 2
+}
\ No newline at end of file
diff --git a/src/UglyToad.PdfPig.Core/XrefLocation.cs b/src/UglyToad.PdfPig.Core/XrefLocation.cs
new file mode 100644
index 000000000..1f966ae56
--- /dev/null
+++ b/src/UglyToad.PdfPig.Core/XrefLocation.cs
@@ -0,0 +1,42 @@
+namespace UglyToad.PdfPig.Core;
+
+/// <summary>
+/// Information about where an object is located in the file according to the Xref (or brute force parsing).
+/// </summary>
+public readonly struct XrefLocation
+{
+ /// <summary>
+ /// Which type of location is indicated.
+ /// </summary>
+ public readonly XrefEntryType Type;
+
+ /// <summary>
+ /// If <see cref="Type"/> is <see cref="XrefEntryType.File"/> then the byte offset, otherwise this is the object stream number.
+ /// </summary>
+ public readonly long Value1;
+
+ /// <summary>
+ /// If <see cref="Type"/> is <see cref="XrefEntryType.ObjectStream"/> then the index of the object in the stream.
+ /// </summary>
+ public readonly int Value2; // only used for ObjectStream; the File factory stores 0 here
+
+ private XrefLocation(XrefEntryType type, long value1, int value2)
+ {
+ Type = type;
+ Value1 = value1;
+ Value2 = value2;
+ }
+
+ /// <summary>
+ /// Create a location mapped to a byte offset in the file.
+ /// </summary>
+ public static XrefLocation File(long offset)
+ => new XrefLocation(XrefEntryType.File, offset, 0);
+
+ /// <summary>
+ /// Create a location mapped to an index inside an object stream.
+ /// </summary>
+ public static XrefLocation Stream(long objStream, int index)
+ => new XrefLocation(XrefEntryType.ObjectStream, objStream, index);
+
+}
\ No newline at end of file
diff --git a/src/UglyToad.PdfPig.Tests/Filters/FlateFilterTests.cs b/src/UglyToad.PdfPig.Tests/Filters/FlateFilterTests.cs
index 68289e876..4f1e0da81 100644
--- a/src/UglyToad.PdfPig.Tests/Filters/FlateFilterTests.cs
+++ b/src/UglyToad.PdfPig.Tests/Filters/FlateFilterTests.cs
@@ -1,5 +1,6 @@
namespace UglyToad.PdfPig.Tests.Filters
{
+ using PdfPig.Core;
using PdfPig.Filters;
using PdfPig.Tokens;
@@ -11,15 +12,32 @@ public class FlateFilterTests
public void EncodeAndDecodePreservesInput()
{
var parameters = new DictionaryToken(new Dictionary());
- var input = new byte[] {67, 69, 69, 10, 4, 20, 6, 19, 120, 64, 64, 64, 32};
+ var input = new byte[] { 67, 69, 69, 10, 4, 20, 6, 19, 120, 64, 64, 64, 32 };
using (var inputStream = new MemoryStream(input))
{
inputStream.Seek(0, SeekOrigin.Begin);
- var result = filter.Encode(inputStream, parameters, 0);
+ var result = filter.Encode(inputStream, parameters);
var decoded = filter.Decode(result, parameters, TestFilterProvider.Instance, 0);
Assert.Equal(input, decoded.ToArray());
}
}
+
+ [Fact]
+ public void CanDecodeCorruptedInputIssue1235()
+ {
+ const string hexStr =
+ "789C958D5D0AC2400C844FB077980B74BB7FD9D982F820B43E8B7B03C542C187EAFDC1F84B7D1164200999E49BD9044C6653D10E1E443DA1AF6636ED76EF315E7572968E1ECDAB7FB7506C4C59C0AEB3912EE270366AAAF4E36D364BF7911450DC274A5112B1AC9751D77A58680B51A4D8AE433D62953C037396E0F290FBE098B267A43051725AA34E77E44EF50B1B52B42C90E4ADF83FB94FDD0000000000";
+
+ var hex = new HexToken(hexStr.AsSpan());
+
+ var parameters = new DictionaryToken(new Dictionary());
+
+ var result = filter.Decode(hex.Bytes.ToArray(), parameters, TestFilterProvider.Instance, 0);
+
+ var text = OtherEncodings.BytesAsLatin1String(result.ToArray());
+
+ Assert.StartsWith("q", text);
+ }
}
}
diff --git a/src/UglyToad.PdfPig.Tests/Integration/GithubIssuesTests.cs b/src/UglyToad.PdfPig.Tests/Integration/GithubIssuesTests.cs
index f8c464540..4c5a17b27 100644
--- a/src/UglyToad.PdfPig.Tests/Integration/GithubIssuesTests.cs
+++ b/src/UglyToad.PdfPig.Tests/Integration/GithubIssuesTests.cs
@@ -327,7 +327,7 @@ public void Issue1122()
var path = IntegrationHelpers.GetSpecificTestDocumentPath("StackOverflow_Issue_1122.pdf");
var ex = Assert.Throws(() => PdfDocument.Open(path, new ParsingOptions() { UseLenientParsing = true }));
- Assert.Equal("The root object in the trailer did not resolve to a readable dictionary.", ex.Message);
+ Assert.StartsWith("Circular reference encountered when looking", ex.Message);
}
[Fact]
@@ -386,7 +386,7 @@ public void Issue1050()
{
var path = IntegrationHelpers.GetSpecificTestDocumentPath("SpookyPass.pdf");
var ex = Assert.Throws(() => PdfDocument.Open(path, new ParsingOptions() { UseLenientParsing = true }));
- Assert.Equal("The root object in the trailer did not resolve to a readable dictionary.", ex.Message);
+ Assert.StartsWith("Object stream cannot contain itself", ex.Message);
}
[Fact]
@@ -552,7 +552,7 @@ public void Issue953_IntOverflow()
{
var page = document.GetPage(13);
// This used to fail with an overflow exception when we failed to validate the zlib encoded data
- Assert.NotNull(DocstrumBoundingBoxes.Instance.GetBlocks(page.GetWords()));
+ Assert.Throws(() => DocstrumBoundingBoxes.Instance.GetBlocks(page.GetWords()));
}
}
diff --git a/src/UglyToad.PdfPig.Tests/Parser/FileStructure/FirstPassParserTests.cs b/src/UglyToad.PdfPig.Tests/Parser/FileStructure/FirstPassParserTests.cs
index d47cec01d..a7d3eef0a 100644
--- a/src/UglyToad.PdfPig.Tests/Parser/FileStructure/FirstPassParserTests.cs
+++ b/src/UglyToad.PdfPig.Tests/Parser/FileStructure/FirstPassParserTests.cs
@@ -47,7 +47,7 @@ 0000000576 00000 n
Assert.Equal(2, results.Parts.Count);
Assert.NotNull(results.Trailer);
- Assert.Equal(results.XrefOffsets[new IndirectReference(8, 0)], 500);
+ Assert.Equal(500, results.XrefOffsets[new IndirectReference(8, 0)].Value1); // expected value first, per xUnit's (expected, actual) order
}
[Fact]
diff --git a/src/UglyToad.PdfPig.Tests/Parser/FileStructure/XrefTableParserTests.cs b/src/UglyToad.PdfPig.Tests/Parser/FileStructure/XrefTableParserTests.cs
index afb653a8f..74454b14b 100644
--- a/src/UglyToad.PdfPig.Tests/Parser/FileStructure/XrefTableParserTests.cs
+++ b/src/UglyToad.PdfPig.Tests/Parser/FileStructure/XrefTableParserTests.cs
@@ -589,7 +589,7 @@ private static void AssertObjectsMatch(
{
Assert.True(table.ObjectOffsets.TryGetValue(offset.Key, out var actual));
- Assert.Equal(offset.Value, actual);
+ Assert.Equal(offset.Value, actual.Value1);
}
}
diff --git a/src/UglyToad.PdfPig.Tests/Parser/PageContentParserTests.cs b/src/UglyToad.PdfPig.Tests/Parser/PageContentParserTests.cs
index b88d169e4..a4fd04853 100644
--- a/src/UglyToad.PdfPig.Tests/Parser/PageContentParserTests.cs
+++ b/src/UglyToad.PdfPig.Tests/Parser/PageContentParserTests.cs
@@ -216,6 +216,68 @@ public void CorrectlyHandlesFile0007511CorruptInlineImage()
Assert.NotEmpty(result);
}
+ [Fact]
+ public void HandlesIssue953_IntOverflowContent()
+ {
+ // After the "( + ) Tj" operator the content stream becomes corrupt, so our current parser reads wrong operand values
+ // for the following operations, which causes a problem when applying the show text operations. We should safely discard or recover on BT/ET boundaries.
+ const string s =
+ """
+ BT
+ /TT6 1 Tf
+ 12.007 0 0 12.007 163.2j
+ -0.19950 Tc
+ 0 Tw
+ (x)Tj
+ -0.1949 1.4142 TD
+ (H)Tj
+ /TT7 1 Tf
+ 12.031 0 0 12.031 157.38 85.2 Tm
+ <0077>Tj
+ -0.1945 1.4114 TD
+ <0077>Tj
+ /TT4 1 Tf
+ 12.007 0 0 12.007 174.42 94.5601 Tm
+ 0.0004 Tc
+ -0.0005 Tw
+ ( + )Tj
+ E9 478l)]T862.68E9 478E9 484.54 9 155l)]T862.6av9 478E9 15.2(
+ ET
+ 154.386( i92 m
+ 171.6 97.62 l
+ S
+ BT
+ /TT6 28 Tf
+ 12.03128 T2002.0307 163.2j
+ -0.19950 DAc
+ 0 Tw853Tj
+ 0.1945 1.4142 om)873j
+ -0.574142 om)68.80
+ -0.5797 0 TD
+ (f)Tj
+ /TT( )7Tf
+ 0.31945 1.5341 TD371.4j
+ 2.82
+ 8.2652 0 5.724 TD
+ 0 Tc
+ -0.0001 2748.3( = 091ity )-27483
+ [(te27483
+ [(te27483
+ [(te27483
+ [(te27483
+ [(te27483
+ [(Eq.)52 \(2.1
+ (
+ """;
+
+ var input = StringBytesTestConverter.Convert(s, false);
+
+ var lenientParser = new PageContentParser(ReflectionGraphicsStateOperationFactory.Instance, new StackDepthGuard(256), true);
+ var result = lenientParser.Parse(1, input.Bytes, log);
+
+ Assert.NotEmpty(result);
+ }
+
private static string LineEndingsToWhiteSpace(string str)
{
return str.Replace("\r\n", " ").Replace('\n', ' ').Replace('\r', ' ');
diff --git a/src/UglyToad.PdfPig.Tests/Parser/Parts/BruteForceSearcherTests.cs b/src/UglyToad.PdfPig.Tests/Parser/Parts/BruteForceSearcherTests.cs
index d275ad418..8cc206b1a 100644
--- a/src/UglyToad.PdfPig.Tests/Parser/Parts/BruteForceSearcherTests.cs
+++ b/src/UglyToad.PdfPig.Tests/Parser/Parts/BruteForceSearcherTests.cs
@@ -59,7 +59,7 @@ public void SearcherFindsCorrectObjects()
Assert.Equal(4, locations.Count);
- Assert.Equal(TestDataOffsets, locations.Values);
+ Assert.Equal(TestDataOffsets, locations.Values.Select(x => x.Value1));
}
[Fact]
@@ -111,7 +111,7 @@ 11 0 obj
s.IndexOf("11 0 obj", StringComparison.OrdinalIgnoreCase)
};
- Assert.Equal(expectedLocations, locations.Values);
+ Assert.Equal(expectedLocations, locations.Values.Select(x => x.Value1));
}
[Fact]
@@ -142,7 +142,7 @@ 5 0 obj
s.IndexOf("5 0 obj", StringComparison.OrdinalIgnoreCase)
};
- Assert.Equal(expectedLocations, locations.Values);
+ Assert.Equal(expectedLocations, locations.Values.Select(x => x.Value1));
}
[Fact]
@@ -156,17 +156,17 @@ public void BruteForceSearcherFileOffsetsCorrect()
Assert.Equal(13, locations.Count);
- Assert.Equal(6183, locations[new IndirectReference(1, 0)]);
- Assert.Equal(244, locations[new IndirectReference(2, 0)]);
- Assert.Equal(15, locations[new IndirectReference(3, 0)]);
- Assert.Equal(222, locations[new IndirectReference(4, 0)]);
- Assert.Equal(5766, locations[new IndirectReference(5, 0)]);
- Assert.Equal(353, locations[new IndirectReference(6, 0)]);
- Assert.Equal(581, locations[new IndirectReference(7, 0)]);
- Assert.Equal(5068, locations[new IndirectReference(8, 0)]);
- Assert.Equal(5091, locations[new IndirectReference(9, 0)]);
-
- var s = GetStringAt(bytes, locations[new IndirectReference(3, 0)]);
+ Assert.Equal(6183, locations[new IndirectReference(1, 0)].Value1);
+ Assert.Equal(244, locations[new IndirectReference(2, 0)].Value1);
+ Assert.Equal(15, locations[new IndirectReference(3, 0)].Value1);
+ Assert.Equal(222, locations[new IndirectReference(4, 0)].Value1);
+ Assert.Equal(5766, locations[new IndirectReference(5, 0)].Value1);
+ Assert.Equal(353, locations[new IndirectReference(6, 0)].Value1);
+ Assert.Equal(581, locations[new IndirectReference(7, 0)].Value1);
+ Assert.Equal(5068, locations[new IndirectReference(8, 0)].Value1);
+ Assert.Equal(5091, locations[new IndirectReference(9, 0)].Value1);
+
+ var s = GetStringAt(bytes, locations[new IndirectReference(3, 0)].Value1);
Assert.StartsWith("3 0 obj", s);
}
}
@@ -180,17 +180,17 @@ public void BruteForceSearcherBytesFileOffsetsCorrect()
Assert.Equal(13, locations.Count);
- Assert.Equal(6183, locations[new IndirectReference(1, 0)]);
- Assert.Equal(244, locations[new IndirectReference(2, 0)]);
- Assert.Equal(15, locations[new IndirectReference(3, 0)]);
- Assert.Equal(222, locations[new IndirectReference(4, 0)]);
- Assert.Equal(5766, locations[new IndirectReference(5, 0)]);
- Assert.Equal(353, locations[new IndirectReference(6, 0)]);
- Assert.Equal(581, locations[new IndirectReference(7, 0)]);
- Assert.Equal(5068, locations[new IndirectReference(8, 0)]);
- Assert.Equal(5091, locations[new IndirectReference(9, 0)]);
-
- var s = GetStringAt(bytes, locations[new IndirectReference(3, 0)]);
+ Assert.Equal(6183, locations[new IndirectReference(1, 0)].Value1);
+ Assert.Equal(244, locations[new IndirectReference(2, 0)].Value1);
+ Assert.Equal(15, locations[new IndirectReference(3, 0)].Value1);
+ Assert.Equal(222, locations[new IndirectReference(4, 0)].Value1);
+ Assert.Equal(5766, locations[new IndirectReference(5, 0)].Value1);
+ Assert.Equal(353, locations[new IndirectReference(6, 0)].Value1);
+ Assert.Equal(581, locations[new IndirectReference(7, 0)].Value1);
+ Assert.Equal(5068, locations[new IndirectReference(8, 0)].Value1);
+ Assert.Equal(5091, locations[new IndirectReference(9, 0)].Value1);
+
+ var s = GetStringAt(bytes, locations[new IndirectReference(3, 0)].Value1);
Assert.StartsWith("3 0 obj", s);
}
@@ -203,21 +203,21 @@ public void BruteForceSearcherFileOffsetsCorrectOpenOffice()
Assert.Equal(13, locations.Count);
- Assert.Equal(17, locations[new IndirectReference(1, 0)]);
- Assert.Equal(249, locations[new IndirectReference(2, 0)]);
- Assert.Equal(14291, locations[new IndirectReference(3, 0)]);
- Assert.Equal(275, locations[new IndirectReference(4, 0)]);
- Assert.Equal(382, locations[new IndirectReference(5, 0)]);
- Assert.Equal(13283, locations[new IndirectReference(6, 0)]);
- Assert.Equal(13309, locations[new IndirectReference(7, 0)]);
- Assert.Equal(13556, locations[new IndirectReference(8, 0)]);
- Assert.Equal(13926, locations[new IndirectReference(9, 0)]);
- Assert.Equal(14183, locations[new IndirectReference(10, 0)]);
- Assert.Equal(14224, locations[new IndirectReference(11, 0)]);
- Assert.Equal(14428, locations[new IndirectReference(12, 0)]);
- Assert.Equal(14488, locations[new IndirectReference(13, 0)]);
-
- var s = GetStringAt(bytes, locations[new IndirectReference(12, 0)]);
+ Assert.Equal(17, locations[new IndirectReference(1, 0)].Value1);
+ Assert.Equal(249, locations[new IndirectReference(2, 0)].Value1);
+ Assert.Equal(14291, locations[new IndirectReference(3, 0)].Value1);
+ Assert.Equal(275, locations[new IndirectReference(4, 0)].Value1);
+ Assert.Equal(382, locations[new IndirectReference(5, 0)].Value1);
+ Assert.Equal(13283, locations[new IndirectReference(6, 0)].Value1);
+ Assert.Equal(13309, locations[new IndirectReference(7, 0)].Value1);
+ Assert.Equal(13556, locations[new IndirectReference(8, 0)].Value1);
+ Assert.Equal(13926, locations[new IndirectReference(9, 0)].Value1);
+ Assert.Equal(14183, locations[new IndirectReference(10, 0)].Value1);
+ Assert.Equal(14224, locations[new IndirectReference(11, 0)].Value1);
+ Assert.Equal(14428, locations[new IndirectReference(12, 0)].Value1);
+ Assert.Equal(14488, locations[new IndirectReference(13, 0)].Value1);
+
+ var s = GetStringAt(bytes, locations[new IndirectReference(12, 0)].Value1);
Assert.StartsWith("12 0 obj", s);
}
@@ -230,7 +230,7 @@ public void BruteForceSearcherCorrectlyFindsAllObjectsWhenOffset()
var locations = BruteForceSearcher.GetObjectLocations(input);
- Assert.Equal(TestDataOffsets, locations.Values);
+ Assert.Equal(TestDataOffsets, locations.Values.Select(x => x.Value1));
}
[Fact]
@@ -265,7 +265,7 @@ 11 0 obj
s.IndexOf("11 0 obj", StringComparison.OrdinalIgnoreCase)
};
- Assert.Equal(expectedLocations, locations.Values);
+ Assert.Equal(expectedLocations, locations.Values.Select(x => x.Value1));
}
private static string GetStringAt(IInputBytes bytes, long location)
diff --git a/src/UglyToad.PdfPig.Tests/Parser/Parts/DirectObjectFinderTests.cs b/src/UglyToad.PdfPig.Tests/Parser/Parts/DirectObjectFinderTests.cs
index ca84a8823..d91d275ec 100644
--- a/src/UglyToad.PdfPig.Tests/Parser/Parts/DirectObjectFinderTests.cs
+++ b/src/UglyToad.PdfPig.Tests/Parser/Parts/DirectObjectFinderTests.cs
@@ -15,8 +15,8 @@ public void TryGetCanFollowMultipleReferenceLinks()
var reference1 = new IndirectReference(7, 0);
var reference2 = new IndirectReference(9, 0);
- scanner.Objects[reference1] = new ObjectToken(10, reference1, new IndirectReferenceToken(reference2));
- scanner.Objects[reference2] = new ObjectToken(12, reference2, new NumericToken(69));
+ scanner.Objects[reference1] = new ObjectToken(XrefLocation.File(10), reference1, new IndirectReferenceToken(reference2));
+ scanner.Objects[reference2] = new ObjectToken(XrefLocation.File(12), reference2, new NumericToken(69));
Assert.True(DirectObjectFinder.TryGet(new IndirectReferenceToken(reference1), scanner, out NumericToken result));
@@ -29,8 +29,8 @@ public void GetCanFollowMultipleReferenceLinks()
var reference1 = new IndirectReference(7, 0);
var reference2 = new IndirectReference(9, 0);
- scanner.Objects[reference1] = new ObjectToken(10, reference1, new IndirectReferenceToken(reference2));
- scanner.Objects[reference2] = new ObjectToken(12, reference2, new NumericToken(69));
+ scanner.Objects[reference1] = new ObjectToken(XrefLocation.File(10), reference1, new IndirectReferenceToken(reference2));
+ scanner.Objects[reference2] = new ObjectToken(XrefLocation.File(12), reference2, new NumericToken(69));
var result = DirectObjectFinder.Get(reference1, scanner);
@@ -43,8 +43,8 @@ public void GetTokenCanFollowMultipleReferenceLinks()
var reference1 = new IndirectReference(7, 0);
var reference2 = new IndirectReference(9, 0);
- scanner.Objects[reference1] = new ObjectToken(10, reference1, new IndirectReferenceToken(reference2));
- scanner.Objects[reference2] = new ObjectToken(12, reference2, new NumericToken(69));
+ scanner.Objects[reference1] = new ObjectToken(XrefLocation.File(10), reference1, new IndirectReferenceToken(reference2));
+ scanner.Objects[reference2] = new ObjectToken(XrefLocation.File(12), reference2, new NumericToken(69));
var result = DirectObjectFinder.Get(new IndirectReferenceToken(reference1), scanner);
@@ -57,7 +57,7 @@ public void GetReturnsSingleItemFromArray()
var reference = new IndirectReference(10, 0);
const string expected = "Goopy";
- scanner.Objects[reference] = new ObjectToken(10, reference, new ArrayToken(new []
+ scanner.Objects[reference] = new ObjectToken(XrefLocation.File(10), reference, new ArrayToken(new []
{
new StringToken(expected)
}));
@@ -74,12 +74,12 @@ public void GetFollowsSingleIndirectReferenceFromArray()
var reference2 = new IndirectReference(69, 0);
const string expected = "Goopy";
- scanner.Objects[reference] = new ObjectToken(10, reference, new ArrayToken(new[]
+ scanner.Objects[reference] = new ObjectToken(XrefLocation.File(10), reference, new ArrayToken(new[]
{
new IndirectReferenceToken(reference2)
}));
- scanner.Objects[reference2] = new ObjectToken(69, reference2, new StringToken(expected));
+ scanner.Objects[reference2] = new ObjectToken(XrefLocation.File(69), reference2, new StringToken(expected));
var result = DirectObjectFinder.Get(reference, scanner);
@@ -91,7 +91,7 @@ public void GetThrowsOnInvalidArray()
{
var reference = new IndirectReference(10, 0);
- scanner.Objects[reference] = new ObjectToken(10, reference, new ArrayToken(new[]
+ scanner.Objects[reference] = new ObjectToken(XrefLocation.File(10), reference, new ArrayToken(new[]
{
new NumericToken(5), new NumericToken(6), new NumericToken(0)
}));
diff --git a/src/UglyToad.PdfPig.Tests/TestObjectLocationProvider.cs b/src/UglyToad.PdfPig.Tests/TestObjectLocationProvider.cs
index 54e568a38..51ca7116d 100644
--- a/src/UglyToad.PdfPig.Tests/TestObjectLocationProvider.cs
+++ b/src/UglyToad.PdfPig.Tests/TestObjectLocationProvider.cs
@@ -6,14 +6,14 @@
internal class TestObjectLocationProvider : IObjectLocationProvider
{
- public Dictionary Offsets { get; } = new Dictionary();
+ public Dictionary Offsets { get; } = new Dictionary();
- public bool TryGetOffset(IndirectReference reference, out long offset)
+ public bool TryGetOffset(IndirectReference reference, out XrefLocation offset)
{
return Offsets.TryGetValue(reference, out offset);
}
- public void UpdateOffset(IndirectReference reference, long offset)
+ public void UpdateOffset(IndirectReference reference, XrefLocation offset)
{
Offsets[reference] = offset;
}
diff --git a/src/UglyToad.PdfPig.Tests/Tokenization/Scanner/PdfTokenScannerTests.cs b/src/UglyToad.PdfPig.Tests/Tokenization/Scanner/PdfTokenScannerTests.cs
index 3fa902b5b..637f2477f 100644
--- a/src/UglyToad.PdfPig.Tests/Tokenization/Scanner/PdfTokenScannerTests.cs
+++ b/src/UglyToad.PdfPig.Tests/Tokenization/Scanner/PdfTokenScannerTests.cs
@@ -1,752 +1,752 @@
-namespace UglyToad.PdfPig.Tests.Tokenization.Scanner
-{
- using System.Text;
- using PdfPig.Core;
- using PdfPig.Encryption;
- using PdfPig.Parser.FileStructure;
- using PdfPig.Tokenization.Scanner;
- using PdfPig.Tokens;
-
- public class PdfTokenScannerTests
- {
- [Fact]
- public void ReadsSimpleObject()
- {
- const string s = @"294 0 obj
-/WDKAAR+CMBX12
-endobj";
-
- var pdfScanner = GetScanner(s);
-
- pdfScanner.MoveNext();
-
- var objectToken = Assert.IsType(pdfScanner.CurrentToken);
-
- var name = Assert.IsType(objectToken.Data);
-
- Assert.Equal(294, objectToken.Number.ObjectNumber);
- Assert.Equal(0, objectToken.Number.Generation);
-
- Assert.Equal("WDKAAR+CMBX12", name.Data);
-
- Assert.StartsWith("294 0 obj", s.Substring((int)objectToken.Position));
- }
-
- [Fact]
- public void ReadsIndirectReferenceInObject()
- {
- const string s = @"
-15 0 obj
-12 7 R
-endobj";
-
- var scanner = GetScanner(s);
-
- var token = ReadToEnd(scanner)[0];
-
- var reference = Assert.IsType(token.Data);
-
- Assert.Equal(new IndirectReference(12, 7), reference.Data);
- }
-
- [Fact]
- public void ReadsObjectWithUndefinedIndirectReference()
- {
- const string s = @"
-5 0 obj
-<<
-/XObject <<
-/Pic1 7 0 R
->>
-/ProcSet [/PDF /Text /ImageC ]
-/Font <<
-/F0 8 0 R
-/F1 9 0 R
-/F2 10 0 R
-/F3 0 0 R
->>
->>
-endobj";
-
- var scanner = GetScanner(s);
-
- ReadToEnd(scanner);
-
- var token = scanner.Get(new IndirectReference(5, 0));
- Assert.NotNull(token);
-
- token = scanner.Get(new IndirectReference(0, 0));
- Assert.Null(token);
- }
-
- [Fact]
- public void ReadsNumericObjectWithComment()
- {
- const string s = @"%PDF-1.2
-
-% I commented here too, tee hee
-10383384 2 obj
-%and here, I just love comments
-
-45
-
-endobj
-
-%%EOF";
-
- var pdfScanner = GetScanner(s);
-
- pdfScanner.MoveNext();
-
- var obj = Assert.IsType(pdfScanner.CurrentToken);
-
- var num = Assert.IsType(obj.Data);
-
- Assert.Equal(45, num.Int);
-
- Assert.Equal(10383384, obj.Number.ObjectNumber);
- Assert.Equal(2, obj.Number.Generation);
-
- Assert.StartsWith("10383384 2 obj", s.Substring((int)obj.Position));
-
- Assert.False(pdfScanner.MoveNext());
- }
-
- [Fact]
- public void ReadsArrayObject()
- {
- const string s = @"
-endobj
-
-295 0 obj
-[
-676 938 875 787 750 880 813 875 813 875 813 656 625 625 938 938 313
-344 563 563 563 563 563 850 500 574 813 875 563 1019 1144 875 313
-]
-endobj";
-
- var pdfScanner = GetScanner(s);
-
- pdfScanner.MoveNext();
-
- var obj = Assert.IsType(pdfScanner.CurrentToken);
-
- var array = Assert.IsType(obj.Data);
-
- Assert.Equal(676, ((NumericToken)array.Data[0]).Int);
-
- Assert.Equal(33, array.Data.Count);
-
- Assert.Equal(295, obj.Number.ObjectNumber);
- Assert.Equal(0, obj.Number.Generation);
-
- Assert.StartsWith("295 0 obj", s.Substring((int)obj.Position));
-
- Assert.False(pdfScanner.MoveNext());
- }
-
- [Fact]
- public void ReadsDictionaryObjectThenNameThenDictionary()
- {
- const string s = @"
-
-274 0 obj
-<<
-/Type /Pages
-/Count 2
-/Parent 275 0 R
-/Kids [ 121 0 R 125 0 R ]
->>
-endobj
-
-%Other parts...
-
-310 0 obj
-/WPXNWT+CMR9
-endobj 311 0 obj
-<<
-/Type /Font
-/Subtype /Type1
-/FirstChar 0
-/LastChar 127
-/Widths 313 0 R
-/BaseFont 310 0 R /FontDescriptor 312 0 R
->>
-endobj";
-
- var scanner = GetScanner(s);
-
- var tokens = ReadToEnd(scanner);
-
- var dictionary = Assert.IsType(tokens[0].Data);
-
- Assert.Equal(4, dictionary.Data.Count);
- Assert.Equal(274, tokens[0].Number.ObjectNumber);
- Assert.StartsWith("274 0 obj", s.Substring((int)tokens[0].Position));
-
- var nameObject = Assert.IsType(tokens[1].Data);
-
- Assert.Equal("WPXNWT+CMR9", nameObject.Data);
- Assert.Equal(310, tokens[1].Number.ObjectNumber);
- Assert.StartsWith("310 0 obj", s.Substring((int)tokens[1].Position));
-
- dictionary = Assert.IsType(tokens[2].Data);
-
- Assert.Equal(7, dictionary.Data.Count);
- Assert.Equal(311, tokens[2].Number.ObjectNumber);
- Assert.StartsWith("311 0 obj", s.Substring((int)tokens[2].Position));
- }
-
- [Fact]
- public void ReadsStringObject()
- {
- const string s = @"
-
-58949797283757 0 obj (An object begins with obj and ends with endobj...) endobj
-";
-
- var scanner = GetScanner(s);
-
- var token = ReadToEnd(scanner)[0];
-
- Assert.Equal(58949797283757L, token.Number.ObjectNumber);
- Assert.Equal("An object begins with obj and ends with endobj...", Assert.IsType(token.Data).Data);
-
- Assert.StartsWith("58949797283757 0 obj", s.Substring((int)token.Position));
- }
-
- [Fact]
- public void ReadsStreamObject()
- {
- const string s = @"
-352 0 obj
-<< /S 1273 /Filter /FlateDecode /Length 353 0 R >>
-stream
-H‰œUkLSgþÚh¹IÝÅlK(%[ÈÅ©+ƒåꩊèæÇtnZ)Z¹¨Oå~9ŠÊµo”[éiK)÷B¹´
-ɲ ©¸˜ n±º×dKöcÏ÷ãœç{ßï}¾÷ÍÉs Ô;€
-À»—ÀF`ÇF@ƒ4˜ï @¥T¨³fY: žw̵;’’Îq®]cƒÿdp¨ÛI3F#G©#œ)TÇqW£NÚѬgOKbü‡µ#á¡£Þaîtƒƒ›ß–
-¾“S>}µuÕõ5M±¢ª†»øÞû•q÷îÜ~¬PòžÞ~•¬ëɃGÅ-Ñím·°gêêb,/,£P§õ^v¾ãÁô¿¿ŠTE]²±{šuwÔ`LG³DªìTÈ
-A¡¬àð‰É©ˆ°‘¼›‚%¥×s³®í»š}%§X{{tøNåÝž¶ö¢ÖÞ¾–~´¼¬°À“Éððr¥8»P£ØêÁi½®Û(éhŽ‘ú;x#dÃÄ$m
-+)
-)†…±n
-9ùyŽA·n\ï»t!=3£½¡:®µåâ¹Ô³ø¼ËiûSÎsë;•Dt—ö$WÉ4U‘¢ºÚšñá1íÐèÔó‚svõ(/(+D²#mZÏ6êüÝ7x‡—†”‡E„²‚|ê«êªDµ5q°šR¦RÈ£n¾[è~“}ýƒÝ½SꞦ'æQŽzÝ‚mæ
-óF+Õ%ù‡ƒß9SˆŒÓãšH¶~L-#T]êîÁ©ÎkbjÒp½¸$¤´(4<,""øfvΕ< VЫ#4'2l'Ð1ñðn?sìûãI'OŸøñçŸN5(äÊ'âÎѾÞþíðƒQmu}]Õ£‡c›©.Œòµ9zz0Ѳ‚B¢«#š-3ªàŸŸ¦Pà8®Ó…¼æ¢BaÅÐkëÊŠukÈÊÖL£ivvv…k2=µZMØ|Úl(ŠZV›ÍbI>Ÿl¹œ(â±Äbø”Uªñeü©U*‹’“Oð,„E+¶Êà>ŽU”ÎÌõçlºFÃ_ÃÙl?¶=>>!>þC¿-×à©©©x¾€¢ŠÊåòtÃ0‹Æôz“‰ NÊ,¬‚kÀ°F‚XÛ4&“ÉfÃñÅæûæy=ÆãIðE_¾Èårår/XÞ„/·qò›m¶ìÖ|†óx8Wð¹hºÜÂÕalÎü’˜Ã0^Òòòü¼yÞ¶´´DX
- )¨ÇM8lüM…Oúý| 1Ïãk»:t<…ÂÚl¶e¾†” éKÜl6c¹¸É„› ”)‰'3¤œ\–™ËN–™ÿe^в y÷ð¹f`3ëž´ ¸“$d:e†)!%2ºdvË@½N¼ªŠ Ùná¹ ¼¿@€Ã.èšs ì÷ûM€2(E4_ | FÑ.@v@÷¤ÃÅ0È Pž~,€:»H¤k¾hT Œ € êÇV:Ô…©@@oH¯(3T‰{""C½SñŠœþtz3€•ƒ ñf.¬SÐøzWþ*$9gj=~Ì·QD E6o¥Ûi/Â`1ígGMq,;}޼sÔ×®kDü˜J{e5‚²ìÉ~Y)}fA>:˜ù–""Yò ç¹=ù²yÛ¡¿i aœ‘ØÏºþÇoäO ôkÆ)
- endstream
- endobj
- 353 0 obj
- 1479
- endobj";
-
- var locationProvider = new TestObjectLocationProvider();
- // Mark location of "353 0 obj"
- locationProvider.Offsets[new IndirectReference(353, 0)] = 1643;
-
- var scanner = GetScanner(s, locationProvider);
-
- var tokens = ReadToEnd(scanner);
-
- Assert.Equal(2, tokens.Count);
-
- var stream = Assert.IsType(tokens[0].Data);
-
- var str = Encoding.UTF8.GetString(stream.Data.ToArray());
-
- Assert.StartsWith("H‰œUkLSgþÚh¹IÝÅl", str);
-
- Assert.Equal(2, locationProvider.Offsets[new IndirectReference(352, 0)]);
- }
-
- [Fact]
- public void ReadsStreamObjectWithInvalidLength()
- {
- string invalidLengthStream = "ABCD" + new string('e', 3996);
-
- string s = $@"
-352 0 obj
-<< /S 1273 /Filter /FlateDecode /Length 353 0 R >>
-stream
-{invalidLengthStream}
-endstream
-endobj
-353 0 obj
-1479
-endobj";
-
- var locationProvider = new TestObjectLocationProvider();
- // Mark location of "353 0 obj"
- locationProvider.Offsets[new IndirectReference(353, 0)] = 1643;
-
- var scanner = GetScanner(s, locationProvider);
-
- var tokens = ReadToEnd(scanner);
-
- Assert.Equal(2, tokens.Count);
-
- var stream = Assert.IsType(tokens[0].Data);
-
- var data = stream.Data.ToArray();
-
- var str = Encoding.UTF8.GetString(data);
-
- Assert.Equal(data.Length, invalidLengthStream.Length);
- Assert.StartsWith("ABCDeeeee", str);
-
- Assert.Equal(2, locationProvider.Offsets[new IndirectReference(352, 0)]);
- }
-
- [Fact]
- public void ReadsSimpleStreamObject()
- {
- // Length of the bytes as found by Encoding.UTF8.GetBytes is 45
- const string s = @"
-574387 0 obj
-<< /Length 45 >>
-stream
-À“Éððr¥8»P£ØêÁi½®Û(éhŽ‘ú
-endstream
-endobj";
-
- var scanner = GetScanner(s);
-
- var token = ReadToEnd(scanner)[0];
-
- var stream = Assert.IsType(token.Data);
-
- var bytes = stream.Data.ToArray();
- Assert.Equal(45, bytes.Length);
-
- var outputString = Encoding.UTF8.GetString(bytes);
-
- Assert.Equal("À“Éððr¥8»P£ØêÁi½®Û(éhŽ‘ú", outputString);
- }
-
- [Fact]
- public void ReadsStreamWithIndirectLength()
- {
- const string s = @"5 0 obj 52 endobj
-
-
-
-12 0 obj
-
-<< /Length 5 0 R /S 1245 >>
-
-stream
-%¥×³®í»š}%§X{{tøNåÝž¶ö¢ÖÞ¾–~´¼
-endstream
-endobj";
- var locationProvider = new TestObjectLocationProvider();
-
- locationProvider.Offsets[new IndirectReference(5, 0)] = 0;
-
- var scanner = GetScanner(s, locationProvider);
-
- var token = ReadToEnd(scanner)[1];
-
- var stream = Assert.IsType(token.Data);
-
- var bytes = stream.Data.ToArray();
- Assert.Equal(52, bytes.Length);
-
- var outputString = Encoding.UTF8.GetString(bytes);
-
- Assert.Equal("%¥×³®í»š}%§X{{tøNåÝž¶ö¢ÖÞ¾–~´¼", outputString);
- }
-
- [Fact]
- public void ReadsStreamWithMissingLength()
- {
- const string s = @"
-12655 0 obj
-
-<< /S 1245 >>
-
-stream
-%¥×³®í»š}%§X{{tøNåÝž¶ö¢ÖÞgrehtyyy$&%&£$££(*¾–~´¼
-endstream
-endobj";
-
- var scanner = GetScanner(s);
-
- var token = ReadToEnd(scanner)[0];
-
- Assert.Equal(12655, token.Number.ObjectNumber);
-
- var stream = Assert.IsType(token.Data);
-
- Assert.Equal("1245", stream.StreamDictionary.Data["S"].ToString());
-
- Assert.Equal("%¥×³®í»š}%§X{{tøNåÝž¶ö¢ÖÞgrehtyyy$&%&£$££(*¾–~´¼", Encoding.UTF8.GetString(stream.Data.ToArray()));
- }
-
- [Fact]
- public void ReadsStreamWithoutBreakBeforeEndstream()
- {
- const string s = @"
-1 0 obj
-12
-endobj
-
-7 0 obj
-<< /Length 288
- /Filter /FlateDecode >>
-stream
-xœ]‘ËjÃ0E÷ÿÃ,ÓEð#NÒ€1¤N^ôA~€-]A-YYøï+Ï4¡t#qfîFWQY*Dïv5:è”–§ñjB‹½Òa¤ •p7¤K ƒÈûëyr8Tº!Ïà úð‚ÉÙVG9¶ø@Å7+Ñ*ÝÃ곬¹T_ùƵƒ8Š$vË̗Ƽ6BDöu%½B¹yí$—Ù ¤\Hx71JœL#Ð6ºÇ0È㸀ü|. µüßõÏ""WÛ‰¯Æ.êÄ«ã8;¤iL°!Ø %É`K°ßì¸ÃöÜáÜ) [‚#CFðİ#(yƒg^ÿ¶æò
-ÿž“¸Zë#¢?¢h–P”Æû?šÑï÷ø¯‰Šendstream
-endobj
-
-9 0 obj
-16
-endobj";
-
- var scanner = GetScanner(s);
-
- var token = ReadToEnd(scanner)[1];
-
- Assert.Equal(7, token.Number.ObjectNumber);
- }
-
- [Fact]
- public void ReadsStringsWithMissingEndBracket()
- {
- const string input = @"5 0 obj
-<<
-/Kids [4 0 R 12 0 R 17 0 R 20 0 R 25 0 R 28 0 R ]
-/Count 6
-/Type /Pages
-/MediaBox [ 0 0 612 792 ]
->>
-endobj
-1 0 obj
-<<
-/Creator (Corel WordPerfect - [D:\Wpdocs\WEBSITE\PROC&POL.WP6 (unmodified)
-/CreationDate (D:19980224130723)
-/Title (Proc&Pol.pdf)
-/Author (J. L. Swezey)
-/Producer (Acrobat PDFWriter 3.03 for Windows NT)
-/Keywords (Budapest Treaty; Patent deposits; IDA)
-/Subject (Patent Collection Procedures and Policies)
->>
-endobj
-3 0 obj
-<<
-/Pages 5 0 R
-/Type /Catalog
->>
-endobj";
-
- var scanner = GetScanner(input);
-
- var tokens = ReadToEnd(scanner);
-
- Assert.Equal(3, tokens.Count);
-
- var first = tokens[0];
- Assert.Equal(5, first.Number.ObjectNumber);
-
- var second = tokens[1];
- Assert.Equal(1, second.Number.ObjectNumber);
-
- var third = tokens[2];
- Assert.Equal(3, third.Number.ObjectNumber);
- }
-
- [Fact]
- public void ReadsDictionaryContainingNull()
- {
- const string input = @"14224 0 obj
-<>
-endobj";
-
- var scanner = GetScanner(input);
-
- var tokens = ReadToEnd(scanner);
-
- var dictionaryToken = tokens[0].Data as DictionaryToken;
-
- Assert.NotNull(dictionaryToken);
-
- var encryptValue = dictionaryToken.Data["Encrypt"];
-
- Assert.IsType(encryptValue);
- }
-
- [Fact]
- public void ReadMultipleNestedDictionary()
- {
- const string input =
- @"
- 4 0 obj
- << /Type /Font /Subtype /Type1 /Name /AF1F040+Arial /BaseFont /Arial /FirstChar 32 /LastChar 255
- /Encoding
- <<
- /Type /Encoding /BaseEncoding /WinAnsiEncoding
- /Differences [128 /Euro 130 /quotesinglbase /florin /quotedblbase /ellipsis /dagger /daggerdbl /circumflex /perthousand /Scaron /guilsinglleft /OE 142 /Zcaron 145
- /quoteleft /quoteright /quotedblleft /quotedblright /bullet /endash /emdash /tilde /trademark /scaron /guilsinglright /oe 158 /zcaron /Ydieresis /space /exclamdown
- /cent /sterling /currency /yen /brokenbar /section /dieresis /copyright /ordfeminine /guillemotleft /logicalnot /hyphen /registered /macron /degree /plusminus
- /twosuperior /threesuperior /acute /mu /paragraph /periodcentered /cedilla /onesuperior /ordmasculine /guillemotright /onequarter /onehalf /threequarters
- /questiondown /Agrave /Aacute /Acircumflex /Atilde /Adieresis /Aring /AE /Ccedilla /Egrave /Eacute /Ecircumflex /Edieresis /Igrave /Iacute /Icircumflex /Idieresis
- /Eth /Ntilde /Ograve /Oacute /Ocircumflex /Otilde /Odieresis /multiply /Oslash /Ugrave /Uacute /Ucircumflex /Udieresis /Yacute /Thorn /germandbls /agrave /aacute
- /acircumflex /atilde /adieresis /aring /ae /ccedilla /egrave /eacute /ecircumflex /edieresis /igrave /iacute /icircumflex /idieresis /eth /ntilde /ograve /oacute
- /ocircumflex /otilde /odieresis /divide /oslash /ugrave /uacute /ucircumflex /udieresis /yacute /thorn /ydieresis ]
- >>
- /Widths [278 278 355 556 556 889 667 191 333 333 389 584 278 333 278 278
- 556 556 556 556 556 556 556 556 556 556 278 278 584 584 584 556
- 1015 667 667 722 722 667 611 778 722 278 500 667 556 833 722 778
- 667 778 722 667 611 722 667 944 667 667 611 278 278 278 469 556
- 333 556 556 500 556 556 278 556 556 222 222 500 222 833 556 556
- 556 556 333 500 278 556 500 722 500 500 500 334 260 334 584 750
- 556 750 222 556 333 1000 556 556 333 1000 667 333 1000 750 611 750
- 750 222 222 333 333 350 556 1000 333 1000 500 333 944 750 500 667
- 278 333 556 556 556 556 260 556 333 737 370 556 584 333 737 552
- 400 549 333 333 333 576 537 278 333 333 365 556 834 834 834 611
- 667 667 667 667 667 667 1000 722 667 667 667 667 278 278 278 278
- 722 722 778 778 778 778 778 584 778 722 722 722 722 667 667 611
- 556 556 556 556 556 556 889 500 556 556 556 556 278 278 278 278
- 556 556 556 556 556 556 556 549 611 556 556 556 556 500 556 500
- ]
- >>
- >>
- endobj
- ";
-
- var scanner = GetScanner(input);
-
- var tokens = ReadToEnd(scanner);
-
- var dictionaryToken = tokens[0].Data as DictionaryToken;
-
- Assert.NotNull(dictionaryToken);
- }
-
- [Fact]
- public void ReadsDictionaryWithoutEndObjBeforeNextObject()
- {
- const string input = @"1 0 obj
-<>
-2 0 obj
-<>
-endobj";
-
- var scanner = GetScanner(input);
-
- var tokens = ReadToEnd(scanner);
-
- Assert.Equal(2, tokens.Count);
-
- var dictionaryToken = Assert.IsType(tokens[0].Data);
- var typeValue = dictionaryToken.Data["Type"];
- Assert.IsType(typeValue);
-
- dictionaryToken = tokens[1].Data as DictionaryToken;
- Assert.NotNull(dictionaryToken);
- typeValue = dictionaryToken.Data["Length"];
- Assert.IsType(typeValue);
- }
-
- [Fact]
- public void ReadsStreamWithoutEndObjBeforeNextObject()
- {
- const string input = @"1 0 obj
-<>
-stream
-aaaa
-endstream
-2 0 obj
-<>
-endobj";
-
- var scanner = GetScanner(input);
-
- var tokens = ReadToEnd(scanner);
-
- Assert.Equal(2, tokens.Count);
-
- Assert.IsType(tokens[0].Data);
-
- var dictionaryToken = Assert.IsType(tokens[1].Data);
- var typeValue = dictionaryToken.Data["Length"];
- Assert.IsType(typeValue);
- }
-
- [Theory]
- [InlineData("startxref")]
- [InlineData("xref")]
- public void ReadsStreamWithoutEndObjBeforeToken(string token)
- {
- string input = @$"1 0 obj
-<>
-stream
-aaaa
-endstream
-{token}";
-
- var scanner = GetScanner(input);
-
- var tokens = ReadToEnd(scanner);
-
- Assert.Single(tokens);
-
- Assert.IsType(tokens[0].Data);
- }
-
- [Theory]
- [InlineData("startxref")]
- [InlineData("xref")]
- public void ReadsDictionaryWithoutEndObjBeforeToken(string token)
- {
- string input = @$"1 0 obj
-<>
-{token}";
-
- var scanner = GetScanner(input);
-
- var tokens = ReadToEnd(scanner);
-
- Assert.Single(tokens);
-
- var dictionaryToken = Assert.IsType(tokens[0].Data);
- var typeValue = dictionaryToken.Data["Type"];
- Assert.IsType(typeValue);
- }
-
- [Fact]
- public void ReadsStreamWithoutEndStreamBeforeEndObj()
- {
- const string input = @"1 0 obj
-<>
-stream
-aaaa
-endobj
-2 0 obj
-<>
-endobj";
-
- var scanner = GetScanner(input);
-
- var tokens = ReadToEnd(scanner);
-
- Assert.Equal(2, tokens.Count);
-
- Assert.IsType(tokens[0].Data);
-
- var dictionaryToken = Assert.IsType(tokens[1].Data);
- var lengthValue = dictionaryToken.Data["Length"];
- Assert.IsType(lengthValue);
- }
-
- [Theory]
- [InlineData(">>")]
- [InlineData("randomstring")]
- public void ReadsIndirectObjectsDictionaryWithContentBeforeEndObj(string addedContent)
- {
- string input = @$"1 0 obj
-<>
-{addedContent}endobj
-2 0 obj
-<>
-endobj";
-
- var strictScanner = GetScanner(input);
-
- var tokens = ReadToEnd(strictScanner);
- Assert.Empty(tokens);
-
-
- var lenientScanner = GetScanner(input, useLenientParsing: true);
- tokens = ReadToEnd(lenientScanner);
-
- Assert.Equal(2, tokens.Count);
-
- var dictionaryToken = Assert.IsType(tokens[0].Data);
- var typeValue = dictionaryToken.Data["Type"];
- Assert.IsType(typeValue);
-
- dictionaryToken = Assert.IsType(tokens[1].Data);
- var lengthValue = dictionaryToken.Data["Length"];
- Assert.IsType(lengthValue);
- }
-
- [Theory]
- [InlineData(">>")]
- [InlineData("randomstring")]
- public void ReadsIndirectObjectsStreamWithAddedContentBeforeStream(string addedContent)
- {
- string input = @$"1 0 obj
-<>
-{addedContent}stream
-aaaa
-endstream
-endobj
-2 0 obj
-<>
-endobj";
-
- var strictScanner = GetScanner(input);
-
- var tokens = ReadToEnd(strictScanner);
- Assert.Equal(2, tokens.Count);
- // this is linked to the parsing choosing the last token parsed in obj.
- // It can probably be challenged against taking the first one.
- var operatorToken = Assert.IsType(tokens[0].Data);
- Assert.Equal("endstream", operatorToken.Data);
-
- var dictionaryToken = Assert.IsType(tokens[1].Data);
- var lengthValue = dictionaryToken.Data["Length"];
- Assert.IsType(lengthValue);
-
- var lenientScanner = GetScanner(input, useLenientParsing:true);
- tokens = ReadToEnd(lenientScanner);
-
- Assert.Equal(2, tokens.Count);
-
- Assert.IsType(tokens[0].Data);
-
- dictionaryToken = Assert.IsType(tokens[1].Data);
- lengthValue = dictionaryToken.Data["Length"];
- Assert.IsType(lengthValue);
- }
-
- private static PdfTokenScanner GetScanner(string s, TestObjectLocationProvider locationProvider = null, bool useLenientParsing = false)
- {
- var input = StringBytesTestConverter.Convert(s, false);
-
- return new PdfTokenScanner(input.Bytes,
- locationProvider ?? new TestObjectLocationProvider(),
- new TestFilterProvider(),
- NoOpEncryptionHandler.Instance,
- new FileHeaderOffset(0),
- useLenientParsing ? new ParsingOptions() : ParsingOptions.LenientParsingOff,
- new StackDepthGuard(256));
- }
-
- private static IReadOnlyList ReadToEnd(PdfTokenScanner scanner)
- {
- var result = new List();
-
- while (scanner.MoveNext())
- {
- if (scanner.CurrentToken is ObjectToken obj)
- {
- result.Add(obj);
- }
- else
- {
- throw new InvalidOperationException($"Pdf token scanner produced token which was not an object token: {scanner.CurrentToken}.");
- }
- }
-
- return result;
- }
- }
-}
+namespace UglyToad.PdfPig.Tests.Tokenization.Scanner
+{
+ using System.Text;
+ using PdfPig.Core;
+ using PdfPig.Encryption;
+ using PdfPig.Parser.FileStructure;
+ using PdfPig.Tokenization.Scanner;
+ using PdfPig.Tokens;
+
+ public class PdfTokenScannerTests
+ {
+ [Fact]
+ public void ReadsSimpleObject()
+ {
+ var s = @"294 0 obj
+/WDKAAR+CMBX12
+endobj".Replace("\r\n", "\n").Replace("\n", "\r\n");
+
+ var pdfScanner = GetScanner(s);
+
+ pdfScanner.MoveNext();
+
+ var objectToken = Assert.IsType(pdfScanner.CurrentToken);
+
+ var name = Assert.IsType(objectToken.Data);
+
+ Assert.Equal(294, objectToken.Number.ObjectNumber);
+ Assert.Equal(0, objectToken.Number.Generation);
+
+ Assert.Equal("WDKAAR+CMBX12", name.Data);
+
+ Assert.StartsWith("294 0 obj", s.Substring((int)objectToken.Position.Value1));
+ }
+
+ [Fact]
+ public void ReadsIndirectReferenceInObject()
+ {
+ var s = @"
+15 0 obj
+12 7 R
+endobj".Replace("\r\n", "\n").Replace("\n", "\r\n");
+
+ var scanner = GetScanner(s);
+
+ var token = ReadToEnd(scanner)[0];
+
+ var reference = Assert.IsType(token.Data);
+
+ Assert.Equal(new IndirectReference(12, 7), reference.Data);
+ }
+
+ [Fact]
+ public void ReadsObjectWithUndefinedIndirectReference()
+ {
+     var pdf = @"
+5 0 obj
+<<
+/XObject <<
+/Pic1 7 0 R
+>>
+/ProcSet [/PDF /Text /ImageC ]
+/Font <<
+/F0 8 0 R
+/F1 9 0 R
+/F2 10 0 R
+/F3 0 0 R
+>>
+>>
+endobj".Replace("\r\n", "\n").Replace("\n", "\r\n");
+     var scanner = GetScanner(pdf);
+
+     // Consume the whole input before looking anything up.
+     ReadToEnd(scanner);
+
+     // Object 5 is present in the input, so the lookup must succeed.
+     Assert.NotNull(scanner.Get(new IndirectReference(5, 0)));
+
+     // /F3 refers to 0 0 R, which the input never defines, so the lookup must yield null.
+     Assert.Null(scanner.Get(new IndirectReference(0, 0)));
+ }
+
+ [Fact]
+ public void ReadsNumericObjectWithComment()
+ {
+ var s = @"%PDF-1.2
+
+% I commented here too, tee hee
+10383384 2 obj
+%and here, I just love comments
+
+45
+
+endobj
+
+%%EOF".Replace("\r\n", "\n").Replace("\n", "\r\n");
+
+ var pdfScanner = GetScanner(s);
+
+ pdfScanner.MoveNext();
+
+ var obj = Assert.IsType(pdfScanner.CurrentToken);
+
+ var num = Assert.IsType(obj.Data);
+
+ Assert.Equal(45, num.Int);
+
+ Assert.Equal(10383384, obj.Number.ObjectNumber);
+ Assert.Equal(2, obj.Number.Generation);
+
+ Assert.StartsWith("10383384 2 obj", s.Substring((int)obj.Position.Value1));
+
+ Assert.False(pdfScanner.MoveNext());
+ }
+
+ [Fact]
+ public void ReadsArrayObject()
+ {
+ var s = @"
+endobj
+
+295 0 obj
+[
+676 938 875 787 750 880 813 875 813 875 813 656 625 625 938 938 313
+344 563 563 563 563 563 850 500 574 813 875 563 1019 1144 875 313
+]
+endobj".Replace("\r\n", "\n").Replace("\n", "\r\n");
+
+ var pdfScanner = GetScanner(s);
+
+ pdfScanner.MoveNext();
+
+ var obj = Assert.IsType(pdfScanner.CurrentToken);
+
+ var array = Assert.IsType(obj.Data);
+
+ Assert.Equal(676, ((NumericToken)array.Data[0]).Int);
+
+ Assert.Equal(33, array.Data.Count);
+
+ Assert.Equal(295, obj.Number.ObjectNumber);
+ Assert.Equal(0, obj.Number.Generation);
+
+ Assert.StartsWith("295 0 obj", s.Substring((int)obj.Position.Value1));
+
+ Assert.False(pdfScanner.MoveNext());
+ }
+
+ [Fact]
+ public void ReadsDictionaryObjectThenNameThenDictionary()
+ {
+ var s = @"
+
+274 0 obj
+<<
+/Type /Pages
+/Count 2
+/Parent 275 0 R
+/Kids [ 121 0 R 125 0 R ]
+>>
+endobj
+
+%Other parts...
+
+310 0 obj
+/WPXNWT+CMR9
+endobj 311 0 obj
+<<
+/Type /Font
+/Subtype /Type1
+/FirstChar 0
+/LastChar 127
+/Widths 313 0 R
+/BaseFont 310 0 R /FontDescriptor 312 0 R
+>>
+endobj".Replace("\r\n", "\n").Replace("\n", "\r\n");
+
+ var scanner = GetScanner(s);
+
+ var tokens = ReadToEnd(scanner);
+
+ var dictionary = Assert.IsType(tokens[0].Data);
+
+ Assert.Equal(4, dictionary.Data.Count);
+ Assert.Equal(274, tokens[0].Number.ObjectNumber);
+ Assert.StartsWith("274 0 obj", s.Substring((int)tokens[0].Position.Value1));
+
+ var nameObject = Assert.IsType(tokens[1].Data);
+
+ Assert.Equal("WPXNWT+CMR9", nameObject.Data);
+ Assert.Equal(310, tokens[1].Number.ObjectNumber);
+ Assert.StartsWith("310 0 obj", s.Substring((int)tokens[1].Position.Value1));
+
+ dictionary = Assert.IsType(tokens[2].Data);
+
+ Assert.Equal(7, dictionary.Data.Count);
+ Assert.Equal(311, tokens[2].Number.ObjectNumber);
+ Assert.StartsWith("311 0 obj", s.Substring((int)tokens[2].Position.Value1));
+ }
+
+ [Fact]
+ public void ReadsStringObject()
+ {
+ var s = @"
+
+58949797283757 0 obj (An object begins with obj and ends with endobj...) endobj
+".Replace("\r\n", "\n").Replace("\n", "\r\n");
+
+ var scanner = GetScanner(s);
+
+ var token = ReadToEnd(scanner)[0];
+
+ Assert.Equal(58949797283757L, token.Number.ObjectNumber);
+ Assert.Equal("An object begins with obj and ends with endobj...", Assert.IsType(token.Data).Data);
+
+ Assert.StartsWith("58949797283757 0 obj", s.Substring((int)token.Position.Value1));
+ }
+
+ [Fact]
+ public void ReadsStreamObject()
+ {
+ var s = @"
+352 0 obj
+<< /S 1273 /Filter /FlateDecode /Length 353 0 R >>
+stream
+H‰œUkLSgþÚh¹IÝÅlK(%[ÈÅ©+ƒåꩊèæÇtnZ)Z¹¨Oå~9ŠÊµo”[éiK)÷B¹´
+ɲ ©¸˜ n±º×dKöcÏ÷ãœç{ßï}¾÷ÍÉs Ô;€
+À»—ÀF`ÇF@ƒ4˜ï @¥T¨³fY: žw̵;’’Îq®]cƒÿdp¨ÛI3F#G©#œ)TÇqW£NÚѬgOKbü‡µ#á¡£Þaîtƒƒ›ß–
+¾“S>}µuÕõ5M±¢ª†»øÞû•q÷îÜ~¬PòžÞ~•¬ëɃGÅ-Ñím·°gêêb,/,£P§õ^v¾ãÁô¿¿ŠTE]²±{šuwÔ`LG³DªìTÈ
+A¡¬àð‰É©ˆ°‘¼›‚%¥×s³®í»š}%§X{{tøNåÝž¶ö¢ÖÞ¾–~´¼¬°À“Éððr¥8»P£ØêÁi½®Û(éhŽ‘ú;x#dÃÄ$m
++)
+)†…±n
+9ùyŽA·n\ï»t!=3£½¡:®µåâ¹Ô³ø¼ËiûSÎsë;•Dt—ö$WÉ4U‘¢ºÚšñá1íÐèÔó‚svõ(/(+D²#mZÏ6êüÝ7x‡—†”‡E„²‚|ê«êªDµ5q°šR¦RÈ£n¾[è~“}ýƒÝ½SꞦ'æQŽzÝ‚mæ
+óF+Õ%ù‡ƒß9SˆŒÓãšH¶~L-#T]êîÁ©ÎkbjÒp½¸$¤´(4<,""øfvΕ< VЫ#4'2l'Ð1ñðn?sìûãI'OŸøñçŸN5(äÊ'âÎѾÞþíðƒQmu}]Õ£‡c›©.Œòµ9zz0Ѳ‚B¢«#š-3ªàŸŸ¦Pà8®Ó…¼æ¢BaÅÐkëÊŠukÈÊÖL£ivvv…k2=µZMØ|Úl(ŠZV›ÍbI>Ÿl¹œ(â±Äbø”Uªñeü©U*‹’“Oð,„E+¶Êà>ŽU”ÎÌõçlºFÃ_ÃÙl?¶=>>!>þC¿-×à©©©x¾€¢ŠÊåòtÃ0‹Æôz“‰ NÊ,¬‚kÀ°F‚XÛ4&“ÉfÃñÅæûæy=ÆãIðE_¾Èårår/XÞ„/·qò›m¶ìÖ|†óx8Wð¹hºÜÂÕalÎü’˜Ã0^Òòòü¼yÞ¶´´DX
+ )¨ÇM8lüM…Oúý| 1Ïãk»:t<…ÂÚl¶e¾†” éKÜl6c¹¸É„› ”)‰'3¤œ\–™ËN–™ÿe^в y÷ð¹f`3ëž´ ¸“$d:e†)!%2ºdvË@½N¼ªŠ Ùná¹ ¼¿@€Ã.èšs ì÷ûM€2(E4_ | FÑ.@v@÷¤ÃÅ0È Pž~,€:»H¤k¾hT Œ € êÇV:Ô…©@@oH¯(3T‰{""C½SñŠœþtz3€•ƒ ñf.¬SÐøzWþ*$9gj=~Ì·QD E6o¥Ûi/Â`1ígGMq,;}޼sÔ×®kDü˜J{e5‚²ìÉ~Y)}fA>:˜ù–""Yò ç¹=ù²yÛ¡¿i aœ‘ØÏºþÇoäO ôkÆ)
+ endstream
+ endobj
+ 353 0 obj
+ 1479
+ endobj".Replace("\r\n", "\n").Replace("\n", "\r\n");
+
+ var locationProvider = new TestObjectLocationProvider();
+ // Mark location of "353 0 obj"
+ locationProvider.Offsets[new IndirectReference(353, 0)] = XrefLocation.File(1643);
+
+ var scanner = GetScanner(s, locationProvider);
+
+ var tokens = ReadToEnd(scanner);
+
+ Assert.Equal(2, tokens.Count);
+
+ var stream = Assert.IsType(tokens[0].Data);
+
+ var str = Encoding.UTF8.GetString(stream.Data.ToArray());
+
+ Assert.StartsWith("H‰œUkLSgþÚh¹IÝÅl", str);
+
+ Assert.Equal(2, locationProvider.Offsets[new IndirectReference(352, 0)].Value1);
+ }
+
+ [Fact]
+ public void ReadsStreamObjectWithInvalidLength()
+ {
+     // The payload is 4000 characters, while /Length points at object 353, whose value is 1479.
+     string invalidLengthStream = "ABCD" + new string('e', 3996);
+     var s = $@"
+352 0 obj
+<< /S 1273 /Filter /FlateDecode /Length 353 0 R >>
+stream
+{invalidLengthStream}
+endstream
+endobj
+353 0 obj
+1479
+endobj".Replace("\r\n", "\n").Replace("\n", "\r\n");
+
+     var locationProvider = new TestObjectLocationProvider();
+     // NOTE(review): 1643 is the same offset ReadsStreamObject registers; with the 4000-character payload above it lands inside the stream data, not on "353 0 obj" - confirm this is intended.
+     locationProvider.Offsets[new IndirectReference(353, 0)] = XrefLocation.File(1643);
+
+     var scanner = GetScanner(s, locationProvider);
+
+     var tokens = ReadToEnd(scanner);
+
+     Assert.Equal(2, tokens.Count);
+
+     var stream = Assert.IsType(tokens[0].Data);
+
+     var data = stream.Data.ToArray();
+
+     var str = Encoding.UTF8.GetString(data);
+     // xUnit's Assert.Equal takes the expected value first and the actual value second.
+     Assert.Equal(invalidLengthStream.Length, data.Length);
+     Assert.StartsWith("ABCDeeeee", str);
+     // "352 0 obj" begins right after the leading CRLF of the input, i.e. at offset 2.
+     Assert.Equal(2, locationProvider.Offsets[new IndirectReference(352, 0)].Value1);
+ }
+
+ [Fact]
+ public void ReadsSimpleStreamObject()
+ {
+ // Length of the bytes as found by Encoding.UTF8.GetBytes is 45
+ var s = @"
+574387 0 obj
+<< /Length 45 >>
+stream
+À“Éððr¥8»P£ØêÁi½®Û(éhŽ‘ú
+endstream
+endobj".Replace("\r\n", "\n").Replace("\n", "\r\n");
+
+ var scanner = GetScanner(s);
+
+ var token = ReadToEnd(scanner)[0];
+
+ var stream = Assert.IsType(token.Data);
+
+ var bytes = stream.Data.ToArray();
+ Assert.Equal(45, bytes.Length);
+
+ var outputString = Encoding.UTF8.GetString(bytes);
+
+ Assert.Equal("À“Éððr¥8»P£ØêÁi½®Û(éhŽ‘ú", outputString);
+ }
+
+ [Fact]
+ public void ReadsStreamWithIndirectLength()
+ {
+ var s = @"5 0 obj 52 endobj
+
+
+
+12 0 obj
+
+<< /Length 5 0 R /S 1245 >>
+
+stream
+%¥×³®í»š}%§X{{tøNåÝž¶ö¢ÖÞ¾–~´¼
+endstream
+endobj".Replace("\r\n", "\n").Replace("\n", "\r\n");
+ var locationProvider = new TestObjectLocationProvider();
+
+ locationProvider.Offsets[new IndirectReference(5, 0)] = XrefLocation.File(0);
+
+ var scanner = GetScanner(s, locationProvider);
+
+ var token = ReadToEnd(scanner)[1];
+
+ var stream = Assert.IsType(token.Data);
+
+ var bytes = stream.Data.ToArray();
+ Assert.Equal(52, bytes.Length);
+
+ var outputString = Encoding.UTF8.GetString(bytes);
+
+ Assert.Equal("%¥×³®í»š}%§X{{tøNåÝž¶ö¢ÖÞ¾–~´¼", outputString);
+ }
+
+ [Fact]
+ public void ReadsStreamWithMissingLength()
+ {
+ var s = @"
+12655 0 obj
+
+<< /S 1245 >>
+
+stream
+%¥×³®í»š}%§X{{tøNåÝž¶ö¢ÖÞgrehtyyy$&%&£$££(*¾–~´¼
+endstream
+endobj".Replace("\r\n", "\n").Replace("\n", "\r\n");
+
+ var scanner = GetScanner(s);
+
+ var token = ReadToEnd(scanner)[0];
+
+ Assert.Equal(12655, token.Number.ObjectNumber);
+
+ var stream = Assert.IsType(token.Data);
+
+ Assert.Equal("1245", stream.StreamDictionary.Data["S"].ToString());
+
+ Assert.Equal("%¥×³®í»š}%§X{{tøNåÝž¶ö¢ÖÞgrehtyyy$&%&£$££(*¾–~´¼", Encoding.UTF8.GetString(stream.Data.ToArray()));
+ }
+
+ [Fact]
+ public void ReadsStreamWithoutBreakBeforeEndstream()
+ {
+ var s = @"
+1 0 obj
+12
+endobj
+
+7 0 obj
+<< /Length 288
+ /Filter /FlateDecode >>
+stream
+xœ]‘ËjÃ0E÷ÿÃ,ÓEð#NÒ€1¤N^ôA~€-]A-YYøï+Ï4¡t#qfîFWQY*Dïv5:è”–§ñjB‹½Òa¤ •p7¤K ƒÈûëyr8Tº!Ïà úð‚ÉÙVG9¶ø@Å7+Ñ*ÝÃ곬¹T_ùƵƒ8Š$vË̗Ƽ6BDöu%½B¹yí$—Ù ¤\Hx71JœL#Ð6ºÇ0È㸀ü|. µüßõÏ""WÛ‰¯Æ.êÄ«ã8;¤iL°!Ø %É`K°ßì¸ÃöÜáÜ) [‚#CFðİ#(yƒg^ÿ¶æò
+ÿž“¸Zë#¢?¢h–P”Æû?šÑï÷ø¯‰Šendstream
+endobj
+
+9 0 obj
+16
+endobj".Replace("\r\n", "\n").Replace("\n", "\r\n");
+
+ var scanner = GetScanner(s);
+
+ var token = ReadToEnd(scanner)[1];
+
+ Assert.Equal(7, token.Number.ObjectNumber);
+ }
+
+ [Fact]
+ public void ReadsStringsWithMissingEndBracket()
+ {
+     var pdf = @"5 0 obj
+<<
+/Kids [4 0 R 12 0 R 17 0 R 20 0 R 25 0 R 28 0 R ]
+/Count 6
+/Type /Pages
+/MediaBox [ 0 0 612 792 ]
+>>
+endobj
+1 0 obj
+<<
+/Creator (Corel WordPerfect - [D:\Wpdocs\WEBSITE\PROC&POL.WP6 (unmodified)
+/CreationDate (D:19980224130723)
+/Title (Proc&Pol.pdf)
+/Author (J. L. Swezey)
+/Producer (Acrobat PDFWriter 3.03 for Windows NT)
+/Keywords (Budapest Treaty; Patent deposits; IDA)
+/Subject (Patent Collection Procedures and Policies)
+>>
+endobj
+3 0 obj
+<<
+/Pages 5 0 R
+/Type /Catalog
+>>
+endobj".Replace("\r\n", "\n").Replace("\n", "\r\n");
+
+     // The /Creator string in object 1 opens a parenthesis that is never closed; all three
+     // objects are still expected to be produced, in the order they appear in the input.
+     var tokens = ReadToEnd(GetScanner(pdf));
+
+     Assert.Equal(3, tokens.Count);
+
+     // "5 0 obj" comes first in the input.
+     Assert.Equal(5, tokens[0].Number.ObjectNumber);
+
+     // "1 0 obj" is the object holding the unterminated string.
+     Assert.Equal(1, tokens[1].Number.ObjectNumber);
+
+     // "3 0 obj" is the last object in the input.
+     Assert.Equal(3, tokens[2].Number.ObjectNumber);
+ }
+
+ [Fact]
+ public void ReadsDictionaryContainingNull()
+ {
+ var input = @"14224 0 obj
+<>
+endobj".Replace("\r\n", "\n").Replace("\n", "\r\n");
+
+ var scanner = GetScanner(input);
+
+ var tokens = ReadToEnd(scanner);
+
+ var dictionaryToken = tokens[0].Data as DictionaryToken;
+
+ Assert.NotNull(dictionaryToken);
+
+ var encryptValue = dictionaryToken.Data["Encrypt"];
+
+ Assert.IsType(encryptValue);
+ }
+
+ [Fact]
+ public void ReadMultipleNestedDictionary()
+ {
+ var input =
+ @"
+ 4 0 obj
+ << /Type /Font /Subtype /Type1 /Name /AF1F040+Arial /BaseFont /Arial /FirstChar 32 /LastChar 255
+ /Encoding
+ <<
+ /Type /Encoding /BaseEncoding /WinAnsiEncoding
+ /Differences [128 /Euro 130 /quotesinglbase /florin /quotedblbase /ellipsis /dagger /daggerdbl /circumflex /perthousand /Scaron /guilsinglleft /OE 142 /Zcaron 145
+ /quoteleft /quoteright /quotedblleft /quotedblright /bullet /endash /emdash /tilde /trademark /scaron /guilsinglright /oe 158 /zcaron /Ydieresis /space /exclamdown
+ /cent /sterling /currency /yen /brokenbar /section /dieresis /copyright /ordfeminine /guillemotleft /logicalnot /hyphen /registered /macron /degree /plusminus
+ /twosuperior /threesuperior /acute /mu /paragraph /periodcentered /cedilla /onesuperior /ordmasculine /guillemotright /onequarter /onehalf /threequarters
+ /questiondown /Agrave /Aacute /Acircumflex /Atilde /Adieresis /Aring /AE /Ccedilla /Egrave /Eacute /Ecircumflex /Edieresis /Igrave /Iacute /Icircumflex /Idieresis
+ /Eth /Ntilde /Ograve /Oacute /Ocircumflex /Otilde /Odieresis /multiply /Oslash /Ugrave /Uacute /Ucircumflex /Udieresis /Yacute /Thorn /germandbls /agrave /aacute
+ /acircumflex /atilde /adieresis /aring /ae /ccedilla /egrave /eacute /ecircumflex /edieresis /igrave /iacute /icircumflex /idieresis /eth /ntilde /ograve /oacute
+ /ocircumflex /otilde /odieresis /divide /oslash /ugrave /uacute /ucircumflex /udieresis /yacute /thorn /ydieresis ]
+ >>
+ /Widths [278 278 355 556 556 889 667 191 333 333 389 584 278 333 278 278
+ 556 556 556 556 556 556 556 556 556 556 278 278 584 584 584 556
+ 1015 667 667 722 722 667 611 778 722 278 500 667 556 833 722 778
+ 667 778 722 667 611 722 667 944 667 667 611 278 278 278 469 556
+ 333 556 556 500 556 556 278 556 556 222 222 500 222 833 556 556
+ 556 556 333 500 278 556 500 722 500 500 500 334 260 334 584 750
+ 556 750 222 556 333 1000 556 556 333 1000 667 333 1000 750 611 750
+ 750 222 222 333 333 350 556 1000 333 1000 500 333 944 750 500 667
+ 278 333 556 556 556 556 260 556 333 737 370 556 584 333 737 552
+ 400 549 333 333 333 576 537 278 333 333 365 556 834 834 834 611
+ 667 667 667 667 667 667 1000 722 667 667 667 667 278 278 278 278
+ 722 722 778 778 778 778 778 584 778 722 722 722 722 667 667 611
+ 556 556 556 556 556 556 889 500 556 556 556 556 278 278 278 278
+ 556 556 556 556 556 556 556 549 611 556 556 556 556 500 556 500
+ ]
+ >>
+ >>
+ endobj
+ ".Replace("\r\n", "\n").Replace("\n", "\r\n");
+
+ var scanner = GetScanner(input);
+
+ var tokens = ReadToEnd(scanner);
+
+ var dictionaryToken = tokens[0].Data as DictionaryToken;
+
+ Assert.NotNull(dictionaryToken);
+ }
+
+ [Fact]
+ public void ReadsDictionaryWithoutEndObjBeforeNextObject()
+ {
+ var input = @"1 0 obj
+<>
+2 0 obj
+<>
+endobj".Replace("\r\n", "\n").Replace("\n", "\r\n");
+
+ var scanner = GetScanner(input);
+
+ var tokens = ReadToEnd(scanner);
+
+ Assert.Equal(2, tokens.Count);
+
+ var dictionaryToken = Assert.IsType(tokens[0].Data);
+ var typeValue = dictionaryToken.Data["Type"];
+ Assert.IsType(typeValue);
+
+ dictionaryToken = tokens[1].Data as DictionaryToken;
+ Assert.NotNull(dictionaryToken);
+ typeValue = dictionaryToken.Data["Length"];
+ Assert.IsType(typeValue);
+ }
+
+ [Fact]
+ public void ReadsStreamWithoutEndObjBeforeNextObject()
+ {
+ var input = @"1 0 obj
+<>
+stream
+aaaa
+endstream
+2 0 obj
+<>
+endobj".Replace("\r\n", "\n").Replace("\n", "\r\n");
+
+ var scanner = GetScanner(input);
+
+ var tokens = ReadToEnd(scanner);
+
+ Assert.Equal(2, tokens.Count);
+
+ Assert.IsType(tokens[0].Data);
+
+ var dictionaryToken = Assert.IsType(tokens[1].Data);
+ var typeValue = dictionaryToken.Data["Length"];
+ Assert.IsType(typeValue);
+ }
+
+ [Theory]
+ [InlineData("startxref")]
+ [InlineData("xref")]
+ public void ReadsStreamWithoutEndObjBeforeToken(string token)
+ {
+ var input = @$"1 0 obj
+<>
+stream
+aaaa
+endstream
+{token}".Replace("\r\n", "\n").Replace("\n", "\r\n");
+
+ var scanner = GetScanner(input);
+
+ var tokens = ReadToEnd(scanner);
+
+ Assert.Single(tokens);
+
+ Assert.IsType(tokens[0].Data);
+ }
+
+ [Theory]
+ [InlineData("startxref")]
+ [InlineData("xref")]
+ public void ReadsDictionaryWithoutEndObjBeforeToken(string token)
+ {
+ var input = @$"1 0 obj
+<>
+{token}".Replace("\r\n", "\n").Replace("\n", "\r\n");
+
+ var scanner = GetScanner(input);
+
+ var tokens = ReadToEnd(scanner);
+
+ Assert.Single(tokens);
+
+ var dictionaryToken = Assert.IsType(tokens[0].Data);
+ var typeValue = dictionaryToken.Data["Type"];
+ Assert.IsType(typeValue);
+ }
+
+ [Fact]
+ public void ReadsStreamWithoutEndStreamBeforeEndObj()
+ {
+ var input = @"1 0 obj
+<>
+stream
+aaaa
+endobj
+2 0 obj
+<>
+endobj".Replace("\r\n", "\n").Replace("\n", "\r\n");
+
+ var scanner = GetScanner(input);
+
+ var tokens = ReadToEnd(scanner);
+
+ Assert.Equal(2, tokens.Count);
+
+ Assert.IsType(tokens[0].Data);
+
+ var dictionaryToken = Assert.IsType(tokens[1].Data);
+ var lengthValue = dictionaryToken.Data["Length"];
+ Assert.IsType(lengthValue);
+ }
+
+ [Theory]
+ [InlineData(">>")]
+ [InlineData("randomstring")]
+ public void ReadsIndirectObjectsDictionaryWithContentBeforeEndObj(string addedContent)
+ {
+ var input = @$"1 0 obj
+<>
+{addedContent}endobj
+2 0 obj
+<>
+endobj".Replace("\r\n", "\n").Replace("\n", "\r\n");
+
+ var strictScanner = GetScanner(input);
+
+ var tokens = ReadToEnd(strictScanner);
+ Assert.Empty(tokens);
+
+
+ var lenientScanner = GetScanner(input, useLenientParsing: true);
+ tokens = ReadToEnd(lenientScanner);
+
+ Assert.Equal(2, tokens.Count);
+
+ var dictionaryToken = Assert.IsType(tokens[0].Data);
+ var typeValue = dictionaryToken.Data["Type"];
+ Assert.IsType(typeValue);
+
+ dictionaryToken = Assert.IsType(tokens[1].Data);
+ var lengthValue = dictionaryToken.Data["Length"];
+ Assert.IsType(lengthValue);
+ }
+
+ [Theory]
+ [InlineData(">>")]
+ [InlineData("randomstring")]
+ public void ReadsIndirectObjectsStreamWithAddedContentBeforeStream(string addedContent)
+ {
+ var input = @$"1 0 obj
+<>
+{addedContent}stream
+aaaa
+endstream
+endobj
+2 0 obj
+<>
+endobj".Replace("\r\n", "\n").Replace("\n", "\r\n");
+
+ var strictScanner = GetScanner(input);
+
+ var tokens = ReadToEnd(strictScanner);
+ Assert.Equal(2, tokens.Count);
+ // this is linked to the parsing choosing the last token parsed in obj.
+ // It can probably be challenged against taking the first one.
+ var operatorToken = Assert.IsType(tokens[0].Data);
+ Assert.Equal("endstream", operatorToken.Data);
+
+ var dictionaryToken = Assert.IsType(tokens[1].Data);
+ var lengthValue = dictionaryToken.Data["Length"];
+ Assert.IsType(lengthValue);
+
+ var lenientScanner = GetScanner(input, useLenientParsing:true);
+ tokens = ReadToEnd(lenientScanner);
+
+ Assert.Equal(2, tokens.Count);
+
+ Assert.IsType(tokens[0].Data);
+
+ dictionaryToken = Assert.IsType(tokens[1].Data);
+ lengthValue = dictionaryToken.Data["Length"];
+ Assert.IsType(lengthValue);
+ }
+
+ private static PdfTokenScanner GetScanner(string s, TestObjectLocationProvider locationProvider = null, bool useLenientParsing = false)
+ {
+     // Builds a scanner over the bytes of s. A fresh location provider is used when none is given,
+     // and parsing runs with LenientParsingOff unless useLenientParsing asks for default options.
+     var bytes = StringBytesTestConverter.Convert(s, false).Bytes;
+     var offsets = locationProvider ?? new TestObjectLocationProvider();
+     var filters = new TestFilterProvider();
+     var options = useLenientParsing ? new ParsingOptions() : ParsingOptions.LenientParsingOff;
+
+     return new PdfTokenScanner(bytes, offsets, filters, NoOpEncryptionHandler.Instance,
+         new FileHeaderOffset(0), options, new StackDepthGuard(256));
+ }
+
+ private static IReadOnlyList ReadToEnd(PdfTokenScanner scanner)
+ {
+ var result = new List();
+ // Drain the scanner; any token that is not an ObjectToken aborts the test with an exception.
+ while (scanner.MoveNext())
+ {
+ if (scanner.CurrentToken is ObjectToken obj)
+ {
+ result.Add(obj);
+ }
+ else
+ {
+ throw new InvalidOperationException($"Pdf token scanner produced token which was not an object token: {scanner.CurrentToken}.");
+ }
+ }
+ // Tokens are returned in the order the scanner produced them.
+ return result;
+ }
+ }
+}
diff --git a/src/UglyToad.PdfPig.Tokens/ObjectToken.cs b/src/UglyToad.PdfPig.Tokens/ObjectToken.cs
index 7417429a6..606434645 100644
--- a/src/UglyToad.PdfPig.Tokens/ObjectToken.cs
+++ b/src/UglyToad.PdfPig.Tokens/ObjectToken.cs
@@ -12,7 +12,7 @@ public class ObjectToken : IDataToken
///
/// The offset to the start of the object number from the start of the file in bytes.
///
- public long Position { get; }
+ public XrefLocation Position { get; }
///
/// The object and generation number of the object.
@@ -30,7 +30,7 @@ public class ObjectToken : IDataToken
/// The offset in bytes from the start of the file for this object.
/// The identifier for this object.
/// The data contained in this object.
- public ObjectToken(long position, IndirectReference number, IToken data)
+ public ObjectToken(XrefLocation position, IndirectReference number, IToken data)
{
Position = position;
Number = number;
diff --git a/src/UglyToad.PdfPig/AcroForms/AcroFormFactory.cs b/src/UglyToad.PdfPig/AcroForms/AcroFormFactory.cs
index a9ee9ef09..875b45447 100644
--- a/src/UglyToad.PdfPig/AcroForms/AcroFormFactory.cs
+++ b/src/UglyToad.PdfPig/AcroForms/AcroFormFactory.cs
@@ -28,12 +28,12 @@ internal class AcroFormFactory
private readonly IPdfTokenScanner tokenScanner;
private readonly ILookupFilterProvider filterProvider;
- private readonly IReadOnlyDictionary objectOffsets;
+ private readonly IReadOnlyDictionary objectOffsets;
public AcroFormFactory(
IPdfTokenScanner tokenScanner,
ILookupFilterProvider filterProvider,
- IReadOnlyDictionary objectOffsets)
+ IReadOnlyDictionary objectOffsets)
{
this.tokenScanner = tokenScanner ?? throw new ArgumentNullException(nameof(tokenScanner));
this.filterProvider = filterProvider ?? throw new ArgumentNullException(nameof(filterProvider));
diff --git a/src/UglyToad.PdfPig/Filters/FlateFilter.cs b/src/UglyToad.PdfPig/Filters/FlateFilter.cs
index 73428907d..09fdad062 100644
--- a/src/UglyToad.PdfPig/Filters/FlateFilter.cs
+++ b/src/UglyToad.PdfPig/Filters/FlateFilter.cs
@@ -2,11 +2,10 @@
{
using Fonts;
using System;
- using System.Buffers.Binary;
using System.IO;
using System.IO.Compression;
using Tokens;
- using UglyToad.PdfPig.Core;
+ using Core;
using Util;
///
@@ -55,89 +54,41 @@ public Memory Decode(Memory input, DictionaryToken streamDictionary,
return input;
}
- private static Memory Decompress(Memory input, int predictor, int colors, int bitsPerComponent, int columns)
+ private static Memory Decompress(Memory input,
+ int predictor,
+ int colors,
+ int bitsPerComponent,
+ int columns)
{
-#if NET
using var memoryStream = MemoryHelper.AsReadOnlyMemoryStream(input);
- try
- {
- using (var zlib = new ZLibStream(memoryStream, CompressionMode.Decompress))
- using (var output = new MemoryStream((int)(input.Length * 1.5)))
- using (var f = PngPredictor.WrapPredictor(output, predictor, colors, bitsPerComponent, columns))
- {
- zlib.CopyTo(f);
- f.Flush();
-
- return output.AsMemory();
- }
- }
- catch (InvalidDataException ex)
- {
- throw new CorruptCompressedDataException("Invalid Flate compressed stream encountered", ex);
- }
-#else
- // Ideally we would like to use the ZLibStream class but that is only available in .NET 5+.
- // We look at the raw data now
- // * First we have 2 bytes, specifying the type of compression
- // * Then we have the deflated data
- // * Then we have a 4 byte checksum (Adler32)
-
- // Would be so nice to have zlib do the framing here... but the deflate stream already reads data from the stream that we need.
-
- using var memoryStream = MemoryHelper.AsReadOnlyMemoryStream(input.Slice(2, input.Length - 2 /* Header */ - 4 /* Checksum */));
- // The first 2 bytes are the header which DeflateStream can't handle. After the s
- var adlerBytes = input.Slice(input.Length - 4, 4).Span;
- uint expected = BinaryPrimitives.ReadUInt32BigEndian(adlerBytes);
- uint altExpected = expected;
-
- // Sometimes the data ends with "\r\n", "\r" or "\n" and we don't know if it is part of the zlib
- // Ideally this would have been removed by the caller from the provided length...
- if (adlerBytes[3] == '\n' || adlerBytes[3] == '\r')
- {
- if (adlerBytes[3] == '\n' && adlerBytes[2] == '\r')
- {
- // Now we don't know which value is the good one. The value could be ok, or padding.
- // Lets allow both values for now. Allowing two out of 2^32 is much better than allowing everything
- adlerBytes = input.Slice(input.Length - 6, 4).Span;
- }
- else
- {
- // Same but now for just '\n' or '\r' instead of '\r\n'
- adlerBytes = input.Slice(input.Length - 5, 4).Span;
- }
-
- altExpected = BinaryPrimitives.ReadUInt32BigEndian(adlerBytes);
- }
-
+ // The first 2 bytes are the header which DeflateStream does not support.
+ memoryStream.ReadByte();
+ memoryStream.ReadByte();
try
{
- using (var deflate = new DeflateStream(memoryStream, CompressionMode.Decompress))
- using (var adlerStream = new Adler32ChecksumStream(deflate))
- using (var output = new MemoryStream((int)(input.Length * 1.5)))
- using (var f = PngPredictor.WrapPredictor(output, predictor, colors, bitsPerComponent, columns))
- {
- adlerStream.CopyTo(f);
- f.Flush();
-
- uint actual = adlerStream.Checksum;
- if (expected != actual && altExpected != actual)
- {
- throw new CorruptCompressedDataException("Flate stream has invalid checksum");
- }
-
- return output.AsMemory();
- }
+ using var deflate = new DeflateStream(memoryStream, CompressionMode.Decompress);
+ using var output = new MemoryStream((int)(input.Length * 1.5));
+ using var f = PngPredictor.WrapPredictor(output, predictor, colors, bitsPerComponent, columns);
+
+ deflate.CopyTo(f);
+ f.Flush();
+
+ return output.AsMemory();
}
catch (InvalidDataException ex)
{
throw new CorruptCompressedDataException("Invalid Flate compressed stream encountered", ex);
}
-#endif
}
- ///
- public byte[] Encode(Stream input, DictionaryToken streamDictionary, int index)
+ ///
+ /// Convert a decoded data stream back to the encoded version.
+ ///
+ /// The decoded data.
+ /// The stream dictionary with the parameters to use.
+ /// The Flate encoded data.
+ public byte[] Encode(Stream input, DictionaryToken streamDictionary)
{
const int headerLength = 2;
const int checksumLength = 4;
diff --git a/src/UglyToad.PdfPig/Parser/FileStructure/FirstPassParser.cs b/src/UglyToad.PdfPig/Parser/FileStructure/FirstPassParser.cs
index 4553b8550..35d8e2c6e 100644
--- a/src/UglyToad.PdfPig/Parser/FileStructure/FirstPassParser.cs
+++ b/src/UglyToad.PdfPig/Parser/FileStructure/FirstPassParser.cs
@@ -16,7 +16,7 @@ public static FirstPassResults Parse(
{
log ??= new NoOpLog();
- IReadOnlyDictionary<IndirectReference, long>? bruteForceOffsets = null;
+ IReadOnlyDictionary<IndirectReference, XrefLocation>? bruteForceOffsets = null;
var didBruteForce = false;
DictionaryToken? bruteForceTrailer = null;
@@ -92,7 +92,7 @@ public static FirstPassResults Parse(
}
DictionaryToken? lastTrailer = null;
- var flattenedOffsets = new Dictionary<IndirectReference, long>();
+ var flattenedOffsets = new Dictionary<IndirectReference, XrefLocation>();
foreach (var xrefPart in orderedXrefs)
{
if (xrefPart.Dictionary != null)
@@ -230,12 +230,12 @@ internal class FirstPassResults
/// <summary>
/// All offsets found if a brute-force search was applied.
/// </summary>
- public IReadOnlyDictionary<IndirectReference, long>? BruteForceOffsets { get; }
+ public IReadOnlyDictionary<IndirectReference, XrefLocation>? BruteForceOffsets { get; }
/// <summary>
/// All offsets found from the leaf xref.
/// </summary>
- public IReadOnlyDictionary<IndirectReference, long> XrefOffsets { get; }
+ public IReadOnlyDictionary<IndirectReference, XrefLocation> XrefOffsets { get; }
/// <summary>
/// The trailer dictionary of the leaf xref if we found any.
@@ -244,8 +244,8 @@ internal class FirstPassResults
public FirstPassResults(
IReadOnlyList parts,
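- IReadOnlyDictionary<IndirectReference, long>? bruteForceOffsets,
- IReadOnlyDictionary<IndirectReference, long> xrefOffsets,
+ IReadOnlyDictionary<IndirectReference, XrefLocation>? bruteForceOffsets,
+ IReadOnlyDictionary<IndirectReference, XrefLocation> xrefOffsets,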
DictionaryToken? trailer)
{
Parts = parts;
diff --git a/src/UglyToad.PdfPig/Parser/FileStructure/IXrefSection.cs b/src/UglyToad.PdfPig/Parser/FileStructure/IXrefSection.cs
index 470722935..8e56f5821 100644
--- a/src/UglyToad.PdfPig/Parser/FileStructure/IXrefSection.cs
+++ b/src/UglyToad.PdfPig/Parser/FileStructure/IXrefSection.cs
@@ -14,7 +14,7 @@ internal interface IXrefSection
/// <summary>
/// The bytes offsets of the objects in this xref.
/// </summary>
- public IReadOnlyDictionary<IndirectReference, long> ObjectOffsets { get; }
+ public IReadOnlyDictionary<IndirectReference, XrefLocation> ObjectOffsets { get; }
/// <summary>
/// The dictionary for this xref, for the trailer xref this is the trailer dictionary, for streams the stream dictionary.
diff --git a/src/UglyToad.PdfPig/Parser/FileStructure/XrefBruteForcer.cs b/src/UglyToad.PdfPig/Parser/FileStructure/XrefBruteForcer.cs
index b038f1625..998d0f5ee 100644
--- a/src/UglyToad.PdfPig/Parser/FileStructure/XrefBruteForcer.cs
+++ b/src/UglyToad.PdfPig/Parser/FileStructure/XrefBruteForcer.cs
@@ -19,7 +19,7 @@ public static Result FindAllXrefsInFileOrder(
// Guard against circular references; only read xref at each offset once
var xrefOffsetSeen = new HashSet();
- var bruteForceObjPositions = new Dictionary<IndirectReference, long>();
+ var bruteForceObjPositions = new Dictionary<IndirectReference, XrefLocation>();
DictionaryToken? trailer = null;
@@ -123,7 +123,7 @@ void AddQueues(long num)
if (buffer.EndsWith(" obj") && numericsQueue[0] > 0)
{
- bruteForceObjPositions[new IndirectReference(numericsQueue[0], (int)numericsQueue[1])] = positionsQueue[0];
+ bruteForceObjPositions[new IndirectReference(numericsQueue[0], (int)numericsQueue[1])] = XrefLocation.File(positionsQueue[0]);
lastObjPosition = positionsQueue[0];
@@ -208,12 +208,12 @@ void AddQueues(long num)
public class Result(
IReadOnlyList xRefParts,
- IReadOnlyDictionary<IndirectReference, long> objectOffsets,
+ IReadOnlyDictionary<IndirectReference, XrefLocation> objectOffsets,
DictionaryToken? lastTrailer)
{
public IReadOnlyList XRefParts { get; } = xRefParts;
- public IReadOnlyDictionary<IndirectReference, long> ObjectOffsets { get; } = objectOffsets;
+ public IReadOnlyDictionary<IndirectReference, XrefLocation> ObjectOffsets { get; } = objectOffsets;
public DictionaryToken? LastTrailer { get; } = lastTrailer;
}
diff --git a/src/UglyToad.PdfPig/Parser/FileStructure/XrefStream.cs b/src/UglyToad.PdfPig/Parser/FileStructure/XrefStream.cs
index 1a8f3f99d..353288425 100644
--- a/src/UglyToad.PdfPig/Parser/FileStructure/XrefStream.cs
+++ b/src/UglyToad.PdfPig/Parser/FileStructure/XrefStream.cs
@@ -10,7 +10,7 @@ internal sealed class XrefStream : IXrefSection
/// <summary>
/// The corresponding byte offset for each keyed object in this document.
/// </summary>
- public IReadOnlyDictionary<IndirectReference, long> ObjectOffsets { get; }
+ public IReadOnlyDictionary<IndirectReference, XrefLocation> ObjectOffsets { get; }
public DictionaryToken Dictionary { get; }
@@ -20,7 +20,7 @@ internal sealed class XrefStream : IXrefSection
public XrefStream(
long offset,
- IReadOnlyDictionary<IndirectReference, long> objectOffsets,
+ IReadOnlyDictionary<IndirectReference, XrefLocation> objectOffsets,
DictionaryToken streamDictionary,
XrefOffsetCorrection correctionType,
long offsetCorrection)
diff --git a/src/UglyToad.PdfPig/Parser/FileStructure/XrefStreamParser.cs b/src/UglyToad.PdfPig/Parser/FileStructure/XrefStreamParser.cs
index 3f5b70029..6bf1b5f1b 100644
--- a/src/UglyToad.PdfPig/Parser/FileStructure/XrefStreamParser.cs
+++ b/src/UglyToad.PdfPig/Parser/FileStructure/XrefStreamParser.cs
@@ -98,7 +98,7 @@ internal static class XrefStreamParser
? stackalloc byte[fieldSizes.LineLength]
: new byte[fieldSizes.LineLength];
- var numbers = new List<(long obj, int gen, int off)>();
+ var numbers = new List<(long obj, int gen, XrefLocation location)>();
foreach (var objectNumber in objectNumbers)
{
@@ -136,7 +136,7 @@ internal static class XrefStreamParser
return new XrefStream(
xrefOffset,
- numbers.ToDictionary(x => new IndirectReference(x.obj, x.gen), x => (long)x.off),
+ numbers.ToDictionary(x => new IndirectReference(x.obj, x.gen), x => x.location),
dictToken,
offsetCorrectionType,
offsetCorrection);
@@ -175,7 +175,7 @@ private static void ReadNextStreamObject(
int type,
long objectNumber,
XrefFieldSize fieldSizes,
- List<(long, int, int)> results,
+ List<(long, int, XrefLocation)> results,
ReadOnlySpan<byte> lineBuffer)
{
switch (type)
@@ -184,19 +184,23 @@ private static void ReadNextStreamObject(
// Ignore free objects.
break;
case 1:
- // Non object stream entries.
- var offset = 0;
- for (var i = 0; i < fieldSizes.Field2Size; i++)
- {
- offset += (lineBuffer[i + fieldSizes.Field1Size] & 0x00ff) << ((fieldSizes.Field2Size - i - 1) * 8);
- }
- var genNum = 0;
- for (var i = 0; i < fieldSizes.Field3Size; i++)
- {
- genNum += (lineBuffer[i + fieldSizes.Field1Size + fieldSizes.Field2Size] & 0x00ff) << ((fieldSizes.Field3Size - i - 1) * 8);
+ var offset = ReadUnsigned(
+ lineBuffer,
+ fieldSizes.Field1Size,
+ fieldSizes.Field2Size);
+
+ var genNum = ReadUnsigned(
+ lineBuffer,
+ fieldSizes.Field1Size + fieldSizes.Field2Size,
+ fieldSizes.Field3Size);
+
+ if (offset < 0)
+ {
+ throw new PdfDocumentFormatException(
+ $"Location with negative offset {offset} found for object {objectNumber}");
}
- results.Add((objectNumber, genNum, offset));
+ results.Add((objectNumber, (int)genNum, XrefLocation.File(offset)));
break;
case 2:
@@ -205,28 +209,49 @@ private static void ReadNextStreamObject(
* 2nd argument is object number of object stream
* 3rd argument is index of object within object stream
*
- * For sequential PDFParser we do not need this information
- * because
- * These objects are handled by the dereferenceObjects() method
- * since they're only pointing to object numbers
- *
- * However for XRef aware parsers we have to know which objects contain
- * object streams. We will store this information in normal xref mapping
- * table but add object stream number with minus sign in order to
- * distinguish from file offsets
*/
- var objstmObjNr = 0;
- for (var i = 0; i < fieldSizes.Field2Size; i++)
+
+ var objectStreamNumber = ReadUnsigned(
+ lineBuffer,
+ fieldSizes.Field1Size,
+ fieldSizes.Field2Size);
+
+ var streamIndex = ReadUnsigned(
+ lineBuffer,
+ fieldSizes.Field1Size + fieldSizes.Field2Size,
+ fieldSizes.Field3Size);
+
+ if (objectStreamNumber < 0)
+ {
+ throw new PdfDocumentFormatException(
+ $"Location with negative or zero object stream number {objectStreamNumber} found for object {objectNumber}");
+ }
+
+ if (streamIndex < 0)
{
- objstmObjNr += (lineBuffer[i + fieldSizes.Field1Size] & 0x00ff) << ((fieldSizes.Field2Size - i - 1) * 8);
+ throw new PdfDocumentFormatException(
+ $"Location with negative stream index {streamIndex} found for object {objectNumber} in stream {objectStreamNumber}");
}
- results.Add((objectNumber, 0, -objstmObjNr));
+ results.Add((objectNumber, 0, XrefLocation.Stream(objectStreamNumber, (int)streamIndex)));
break;
}
}
+ private static long ReadUnsigned(ReadOnlySpan<byte> buffer, int start, int width)
+ {
+ long value = 0;
+
+ for (int i = 0; i < width; i++)
+ {
+ value <<= 8;
+ value |= buffer[start + i];
+ }
+
+ return value;
+ }
+
private static (long from, long? to) ReadStreamTolerant(IInputBytes bytes)
{
var buffer = new CircularByteBuffer("endstream ".Length);
diff --git a/src/UglyToad.PdfPig/Parser/FileStructure/XrefTable.cs b/src/UglyToad.PdfPig/Parser/FileStructure/XrefTable.cs
index 011b25ba4..0370c37f6 100644
--- a/src/UglyToad.PdfPig/Parser/FileStructure/XrefTable.cs
+++ b/src/UglyToad.PdfPig/Parser/FileStructure/XrefTable.cs
@@ -13,7 +13,7 @@ internal sealed class XrefTable : IXrefSection
/// <summary>
/// The corresponding byte offset for each keyed object in this document.
/// </summary>
- public IReadOnlyDictionary<IndirectReference, long> ObjectOffsets { get; }
+ public IReadOnlyDictionary<IndirectReference, XrefLocation> ObjectOffsets { get; }
public DictionaryToken? Dictionary { get; }
@@ -23,7 +23,7 @@ internal sealed class XrefTable : IXrefSection
public XrefTable(
long offset,
- IReadOnlyDictionary<IndirectReference, long> objectOffsets,
+ IReadOnlyDictionary<IndirectReference, XrefLocation> objectOffsets,
DictionaryToken? trailer,
XrefOffsetCorrection correctionType,
long offsetCorrection)
diff --git a/src/UglyToad.PdfPig/Parser/FileStructure/XrefTableParser.cs b/src/UglyToad.PdfPig/Parser/FileStructure/XrefTableParser.cs
index 44f643c3e..99db0af7c 100644
--- a/src/UglyToad.PdfPig/Parser/FileStructure/XrefTableParser.cs
+++ b/src/UglyToad.PdfPig/Parser/FileStructure/XrefTableParser.cs
@@ -152,7 +152,7 @@ internal static class XrefTableParser
}
}
- var offsets = new Dictionary<IndirectReference, long>();
+ var offsets = new Dictionary<IndirectReference, XrefLocation>();
if (readNums.Count == 0)
{
if (trailer != null)
@@ -233,7 +233,7 @@ bool TryReadBuff(int len)
if (type == occupiedSentinel)
{
var indirectRef = new IndirectReference(objNum, (int)gen);
- offsets[indirectRef] = objOffset;
+ offsets[indirectRef] = XrefLocation.File(objOffset);
}
objNum++;
diff --git a/src/UglyToad.PdfPig/Parser/Parts/BruteForceSearcher.cs b/src/UglyToad.PdfPig/Parser/Parts/BruteForceSearcher.cs
index 0ecde8965..ceecff647 100644
--- a/src/UglyToad.PdfPig/Parser/Parts/BruteForceSearcher.cs
+++ b/src/UglyToad.PdfPig/Parser/Parts/BruteForceSearcher.cs
@@ -18,7 +18,7 @@ internal static class BruteForceSearcher
/// </summary>
/// <param name="bytes">The bytes of the document.</param>
/// <returns>The object keys and offsets for the objects in this document.</returns>
- public static IReadOnlyDictionary<IndirectReference, long> GetObjectLocations(IInputBytes bytes)
+ public static IReadOnlyDictionary<IndirectReference, XrefLocation> GetObjectLocations(IInputBytes bytes)
{
if (bytes is null)
{
@@ -29,7 +29,7 @@ public static IReadOnlyDictionary GetObjectLocations(II
var lastEndOfFile = GetLastEndOfFileMarker(bytes);
- var results = new Dictionary<IndirectReference, long>();
+ var results = new Dictionary<IndirectReference, XrefLocation>();
var generationBytes = new StringBuilder();
var objectNumberBytes = new StringBuilder();
@@ -174,7 +174,7 @@ public static IReadOnlyDictionary GetObjectLocations(II
var obj = long.Parse(objectNumberBytes.ToString(), CultureInfo.InvariantCulture);
var generation = int.Parse(generationBytes.ToString(), CultureInfo.InvariantCulture);
- results[new IndirectReference(obj, generation)] = bytes.CurrentOffset;
+ results[new IndirectReference(obj, generation)] = XrefLocation.File(bytes.CurrentOffset);
generationBytes.Clear();
objectNumberBytes.Clear();
diff --git a/src/UglyToad.PdfPig/Tokenization/Scanner/IObjectLocationProvider.cs b/src/UglyToad.PdfPig/Tokenization/Scanner/IObjectLocationProvider.cs
index c84d937fa..972969eae 100644
--- a/src/UglyToad.PdfPig/Tokenization/Scanner/IObjectLocationProvider.cs
+++ b/src/UglyToad.PdfPig/Tokenization/Scanner/IObjectLocationProvider.cs
@@ -6,9 +6,9 @@
internal interface IObjectLocationProvider
{
- bool TryGetOffset(IndirectReference reference, out long offset);
+ bool TryGetOffset(IndirectReference reference, out XrefLocation offset);
- void UpdateOffset(IndirectReference reference, long offset);
+ void UpdateOffset(IndirectReference reference, XrefLocation offset);
bool TryGetCached(IndirectReference reference, [NotNullWhen(true)] out ObjectToken? objectToken);
diff --git a/src/UglyToad.PdfPig/Tokenization/Scanner/ObjectLocationProvider.cs b/src/UglyToad.PdfPig/Tokenization/Scanner/ObjectLocationProvider.cs
index 8f0a8e9c5..c15d55fc0 100644
--- a/src/UglyToad.PdfPig/Tokenization/Scanner/ObjectLocationProvider.cs
+++ b/src/UglyToad.PdfPig/Tokenization/Scanner/ObjectLocationProvider.cs
@@ -13,16 +13,16 @@ internal class ObjectLocationProvider : IObjectLocationProvider
private readonly IInputBytes bytes;
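- private IReadOnlyDictionary<IndirectReference, long>? bruteForcedOffsets;
+ private IReadOnlyDictionary<IndirectReference, XrefLocation>? bruteForcedOffsets;
- private readonly Dictionary<IndirectReference, long> offsets;
+ private readonly Dictionary<IndirectReference, XrefLocation> offsets;
public ObjectLocationProvider(
- IReadOnlyDictionary<IndirectReference, long> xrefOffsets,
- IReadOnlyDictionary<IndirectReference, long>? bruteForcedOffsets,
+ IReadOnlyDictionary<IndirectReference, XrefLocation> xrefOffsets,
+ IReadOnlyDictionary<IndirectReference, XrefLocation>? bruteForcedOffsets,
IInputBytes bytes)
{
- offsets = new Dictionary<IndirectReference, long>();
+ offsets = new Dictionary<IndirectReference, XrefLocation>();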
foreach (var xrefOffset in xrefOffsets)
{
offsets[xrefOffset.Key] = xrefOffset.Value;
@@ -32,7 +32,7 @@ public ObjectLocationProvider(
this.bytes = bytes;
}
- public bool TryGetOffset(IndirectReference reference, out long offset)
+ public bool TryGetOffset(IndirectReference reference, out XrefLocation offset)
{
if (bruteForcedOffsets != null && bruteForcedOffsets.TryGetValue(reference, out var bfOffset))
{
@@ -42,16 +42,6 @@ public bool TryGetOffset(IndirectReference reference, out long offset)
if (offsets.TryGetValue(reference, out offset))
{
- if (offset + reference.ObjectNumber == 0)
- {
- // We have a case where 'offset' and
- // 'reference.ObjectNumber' have the same value
- // and opposite signs.
- // This results in an infinite recursion in
- // PdfTokenScanner.GetObjectFromStream() where
- // `var streamObjectNumber = offset * -1;`
- throw new PdfDocumentFormatException("Avoiding infinite recursion in ObjectLocationProvider.TryGetOffset() as 'offset' and 'reference.ObjectNumber' have the same value and opposite signs.");
- }
return true;
}
@@ -63,7 +53,7 @@ public bool TryGetOffset(IndirectReference reference, out long offset)
return bruteForcedOffsets.TryGetValue(reference, out offset);
}
- public void UpdateOffset(IndirectReference reference, long offset)
+ public void UpdateOffset(IndirectReference reference, XrefLocation offset)
{
offsets[reference] = offset;
}
@@ -81,8 +71,9 @@ public void Cache(ObjectToken objectToken, bool force = false)
}
// Don't cache incorrect locations.
- if (!force && offsets.TryGetValue(objectToken.Number, out var expected)
- && objectToken.Position != expected)
+ if (!force
+ && offsets.TryGetValue(objectToken.Number, out var expected)
+ && (objectToken.Position.Type != expected.Type || objectToken.Position.Value1 != expected.Value1))
{
return;
}
diff --git a/src/UglyToad.PdfPig/Tokenization/Scanner/PdfTokenScanner.cs b/src/UglyToad.PdfPig/Tokenization/Scanner/PdfTokenScanner.cs
index 4794e6d49..8435573f6 100644
--- a/src/UglyToad.PdfPig/Tokenization/Scanner/PdfTokenScanner.cs
+++ b/src/UglyToad.PdfPig/Tokenization/Scanner/PdfTokenScanner.cs
@@ -1,5 +1,8 @@
namespace UglyToad.PdfPig.Tokenization.Scanner
{
+ using Core;
+ using Encryption;
+ using Filters;
using System;
using System.Collections.Generic;
using System.Diagnostics;
@@ -7,9 +10,6 @@
using System.Globalization;
using System.Linq;
using System.Text.RegularExpressions;
- using Core;
- using Encryption;
- using Filters;
using Tokens;
using UglyToad.PdfPig.Parser.FileStructure;
@@ -164,7 +164,7 @@ public bool MoveNext()
var actualReference = new IndirectReference(objectNumber.Int, generation.Int);
var actualToken = encryptionHandler.Decrypt(actualReference, readTokens[0]);
- CurrentToken = new ObjectToken(startPosition, actualReference, actualToken);
+ CurrentToken = new ObjectToken(XrefLocation.File(startPosition), actualReference, actualToken);
readTokens.Clear();
coreTokenScanner.Seek(previousTokenPositions[0]);
@@ -191,7 +191,7 @@ public bool MoveNext()
var actualReference = new IndirectReference(objectNumber.Int, generation.Int);
var actualToken = encryptionHandler.Decrypt(actualReference, readTokens[0]);
- CurrentToken = new ObjectToken(startPosition, actualReference, actualToken);
+ CurrentToken = new ObjectToken(XrefLocation.File(startPosition), actualReference, actualToken);
readTokens.Clear();
coreTokenScanner.Seek(previousTokenPositions[2]);
@@ -291,9 +291,9 @@ public bool MoveNext()
token = encryptionHandler.Decrypt(reference, token);
- CurrentToken = new ObjectToken(startPosition, reference, token);
+ CurrentToken = new ObjectToken(XrefLocation.File(startPosition), reference, token);
- objectLocationProvider.UpdateOffset(reference, startPosition);
+ objectLocationProvider.UpdateOffset(reference, XrefLocation.File(startPosition));
readTokens.Clear();
return true;
@@ -626,10 +626,10 @@ private DictionaryToken GetStreamDictionary()
// We can only find it if we know where it is.
if (objectLocationProvider.TryGetOffset(lengthReference.Data, out var offset))
{
- if (offset < 0)
+ if (offset.Type == XrefEntryType.ObjectStream)
{
- ushort searchDepth = 0;
- var result = GetObjectFromStream(lengthReference.Data, offset, ref searchDepth);
+ Span<int> stack = stackalloc int[7];
+ var result = GetObjectFromStream(lengthReference.Data, offset, stack, 0);
if (!(result.Data is NumericToken streamLengthToken))
{
@@ -639,8 +639,9 @@ private DictionaryToken GetStreamDictionary()
return streamLengthToken.Long;
}
+
// Move to the length object and read it.
- Seek(offset);
+ Seek(offset.Value1);
// Keep a copy of the read tokens here since this list must be empty prior to move next.
var oldData = new List(readTokens);
@@ -721,19 +722,31 @@ public void DeregisterCustomTokenizer(ITokenizer tokenizer)
public ObjectToken? Get(IndirectReference reference)
{
- ushort searchDepth = 0;
- return Get(reference, ref searchDepth);
+ Span<int> stack = stackalloc int[7];
+ return Get(reference, stack, 0);
}
- private ObjectToken? Get(IndirectReference reference, ref ushort searchDepth)
+ private ObjectToken? Get(IndirectReference reference, Span<int> navSet, byte depth)
{
- if (searchDepth > 100)
+ if (depth >= navSet.Length)
{
- throw new PdfDocumentFormatException("Reached maximum search depth while getting indirect reference.");
+ var chain = string.Join(", ", navSet.ToArray());
+ throw new PdfDocumentFormatException($"Deep object chain detected when looking for {reference}: {chain}.");
}
- searchDepth++;
+ // Cycle detection (linear scan, but depth is tiny)
+ for (var i = 0; i < depth; i++)
+ {
+ if (navSet[i] == reference.ObjectNumber)
+ {
+ var chain = string.Join(", ", navSet.ToArray());
+ throw new PdfDocumentFormatException(
+ $"Circular reference encountered when looking for object {reference}. Involved objects were: {chain}");
+ }
+ }
+ navSet[depth] = (int)reference.ObjectNumber;
+ depth++;
if (isDisposed)
{
@@ -756,20 +769,20 @@ public void DeregisterCustomTokenizer(ITokenizer tokenizer)
}
// Negative offsets refer to a stream with that number.
- if (offset < 0)
+ if (offset.Type == XrefEntryType.ObjectStream)
{
- var result = GetObjectFromStream(reference, offset, ref searchDepth);
+ if (offset.Value1 == reference.ObjectNumber)
+ {
+ throw new PdfDocumentFormatException(
+ $"Object stream cannot contain itself, looking for object {reference} in {offset.Value1}");
+ }
- return result;
- }
+ var result = GetObjectFromStream(reference, offset, navSet, depth);
- if (offset == 0 && reference.Generation > ushort.MaxValue)
- {
- // TODO - To remove as should not happen anymore
- return new ObjectToken(offset, reference, NullToken.Instance);
+ return result;
}
- Seek(offset);
+ Seek(offset.Value1);
if (!MoveNext())
{
@@ -793,7 +806,7 @@ public void ReplaceToken(IndirectReference reference, IToken token)
{
// Using 0 position as it isn't written to stream and this value doesn't
// seem to be used by any callers. In future may need to revisit this.
- overwrittenTokens[reference] = new ObjectToken(0, reference, token);
+ overwrittenTokens[reference] = new ObjectToken(XrefLocation.File(0), reference, token);
}
private bool TryBruteForceFileToFindReference(IndirectReference reference, [NotNullWhen(true)] out ObjectToken? result)
@@ -826,11 +839,11 @@ private bool TryBruteForceFileToFindReference(IndirectReference reference, [NotN
}
}
- private ObjectToken GetObjectFromStream(IndirectReference reference, long offset, ref ushort searchDepth)
+ private ObjectToken GetObjectFromStream(IndirectReference reference, XrefLocation offset, Span<int> navSet, byte depth)
{
- var streamObjectNumber = offset * -1;
+ var streamObjectNumber = offset.Value1;
- var streamObject = Get(new IndirectReference(streamObjectNumber, 0), ref searchDepth);
+ var streamObject = Get(new IndirectReference(streamObjectNumber, 0), navSet, depth);
if (!(streamObject?.Data is StreamToken stream))
{
@@ -853,7 +866,7 @@ private ObjectToken GetObjectFromStream(IndirectReference reference, long offset
return result;
}
- private IReadOnlyList ParseObjectStream(StreamToken stream, long offset)
+ private IReadOnlyList ParseObjectStream(StreamToken stream, XrefLocation offset)
{
if (!stream.StreamDictionary.TryGet(NameToken.N, out var numberToken)
|| !(numberToken is NumericToken numberOfObjects))
diff --git a/src/UglyToad.PdfPig/Writer/Colors/OutputIntentsFactory.cs b/src/UglyToad.PdfPig/Writer/Colors/OutputIntentsFactory.cs
index f2c48955b..3b92c13eb 100644
--- a/src/UglyToad.PdfPig/Writer/Colors/OutputIntentsFactory.cs
+++ b/src/UglyToad.PdfPig/Writer/Colors/OutputIntentsFactory.cs
@@ -15,7 +15,7 @@ public static ArrayToken GetOutputIntentsArray(Func
{
diff --git a/src/UglyToad.PdfPig/Writer/DataCompresser.cs b/src/UglyToad.PdfPig/Writer/DataCompressor.cs
similarity index 96%
rename from src/UglyToad.PdfPig/Writer/DataCompresser.cs
rename to src/UglyToad.PdfPig/Writer/DataCompressor.cs
index e143ab3ba..b47f397ff 100644
--- a/src/UglyToad.PdfPig/Writer/DataCompresser.cs
+++ b/src/UglyToad.PdfPig/Writer/DataCompressor.cs
@@ -6,7 +6,7 @@
using Filters;
using Tokens;
- internal static class DataCompresser
+ internal static class DataCompressor
{
public static byte[] CompressBytes(IReadOnlyList bytes) => CompressBytes(bytes.ToArray());
public static byte[] CompressBytes(byte[] bytes)
@@ -15,7 +15,7 @@ public static byte[] CompressBytes(byte[] bytes)
{
var parameters = new DictionaryToken(new Dictionary());
var flater = new FlateFilter();
- var result = flater.Encode(memoryStream, parameters, 0);
+ var result = flater.Encode(memoryStream, parameters);
return result;
}
}
diff --git a/src/UglyToad.PdfPig/Writer/Fonts/TrueTypeWritingFont.cs b/src/UglyToad.PdfPig/Writer/Fonts/TrueTypeWritingFont.cs
index 5b7941fc9..fce7184fa 100644
--- a/src/UglyToad.PdfPig/Writer/Fonts/TrueTypeWritingFont.cs
+++ b/src/UglyToad.PdfPig/Writer/Fonts/TrueTypeWritingFont.cs
@@ -51,7 +51,7 @@ public IndirectReferenceToken WriteFont(IPdfStreamWriter writer, IndirectRefere
var newEncoding = new TrueTypeSubsetEncoding(characterMapping.Keys.ToList());
var subsetBytes = TrueTypeSubsetter.Subset(fontFileBytes.ToArray(), newEncoding);
- var embeddedFile = DataCompresser.CompressToStream(subsetBytes);
+ var embeddedFile = DataCompressor.CompressToStream(subsetBytes);
var fileRef = writer.WriteToken(embeddedFile);
@@ -110,7 +110,7 @@ public IndirectReferenceToken WriteFont(IPdfStreamWriter writer, IndirectRefere
var descriptor = writer.WriteToken(new DictionaryToken(descriptorDictionary));
var toUnicodeCMap = ToUnicodeCMapBuilder.ConvertToCMapStream(characterMapping);
- var toUnicodeStream = DataCompresser.CompressToStream(toUnicodeCMap);
+ var toUnicodeStream = DataCompressor.CompressToStream(toUnicodeCMap);
var toUnicode = writer.WriteToken(toUnicodeStream);
var dictionary = new Dictionary
diff --git a/src/UglyToad.PdfPig/Writer/NoTextTokenWriter.cs b/src/UglyToad.PdfPig/Writer/NoTextTokenWriter.cs
index 05edada13..bb6364f12 100644
--- a/src/UglyToad.PdfPig/Writer/NoTextTokenWriter.cs
+++ b/src/UglyToad.PdfPig/Writer/NoTextTokenWriter.cs
@@ -107,7 +107,7 @@ private bool TryGetStreamWithoutText(StreamToken streamToken, [NotNullWhen(true)
}
outputStreamT.Seek(0, SeekOrigin.Begin);
- var compressedBytes = DataCompresser.CompressBytes(outputStreamT.ToArray());
+ var compressedBytes = DataCompressor.CompressBytes(outputStreamT.ToArray());
var outputStreamDictionary = new Dictionary()
{
{ NameToken.Length, new NumericToken(compressedBytes.Length) },
diff --git a/src/UglyToad.PdfPig/Writer/PdfPageBuilder.cs b/src/UglyToad.PdfPig/Writer/PdfPageBuilder.cs
index c3bacccdc..643429334 100644
--- a/src/UglyToad.PdfPig/Writer/PdfPageBuilder.cs
+++ b/src/UglyToad.PdfPig/Writer/PdfPageBuilder.cs
@@ -767,7 +767,7 @@ public AddedImage AddPng(Stream pngStream, PdfRectangle placementRectangle = def
}
}
- var compressedSmask = DataCompresser.CompressBytes(smaskData);
+ var compressedSmask = DataCompressor.CompressBytes(smaskData);
// Create a soft-mask.
var smaskDictionary = new Dictionary
@@ -786,7 +786,7 @@ public AddedImage AddPng(Stream pngStream, PdfRectangle placementRectangle = def
smaskReference = documentBuilder.AddImage(new DictionaryToken(smaskDictionary), compressedSmask);
}
- var compressed = DataCompresser.CompressBytes(data);
+ var compressed = DataCompressor.CompressBytes(data);
var imgDictionary = new Dictionary
{
@@ -1218,7 +1218,7 @@ public IndirectReferenceToken Write(IPdfStreamWriter writer)
var bytes = memoryStream.ToArray();
- var stream = DataCompresser.CompressToStream(bytes);
+ var stream = DataCompressor.CompressToStream(bytes);
return writer.WriteToken(stream);
}
diff --git a/src/UglyToad.PdfPig/Writer/PdfStreamWriter.cs b/src/UglyToad.PdfPig/Writer/PdfStreamWriter.cs
index c1c360643..cd912ba9a 100644
--- a/src/UglyToad.PdfPig/Writer/PdfStreamWriter.cs
+++ b/src/UglyToad.PdfPig/Writer/PdfStreamWriter.cs
@@ -58,7 +58,7 @@ public virtual IndirectReferenceToken WriteToken(IToken token)
var ir = ReserveObjectNumber();
offsets.Add(ir.Data, Stream.Position);
- var obj = new ObjectToken(Stream.Position, ir.Data, token);
+ var obj = new ObjectToken(XrefLocation.File(Stream.Position), ir.Data, token);
TokenWriter.WriteToken(obj, Stream);
return ir;
}
@@ -71,7 +71,7 @@ public virtual IndirectReferenceToken WriteToken(IToken token, IndirectReference
}
offsets.Add(indirectReference.Data, Stream.Position);
- var obj = new ObjectToken(Stream.Position, indirectReference.Data, token);
+ var obj = new ObjectToken(XrefLocation.File(Stream.Position), indirectReference.Data, token);
TokenWriter.WriteToken(obj, Stream);
return indirectReference;
}
@@ -98,7 +98,6 @@ public void CompletePdf(IndirectReferenceToken catalogReference, IndirectReferen
TokenWriter.WriteCrossReferenceTable(offsets, catalogReference.Data, Stream, documentInformationReference?.Data);
}
-
public void Dispose()
{
if (DisposeStream)