Skip to content
2 changes: 1 addition & 1 deletion .github/workflows/run_common_crawl_tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ jobs:
strategy:
fail-fast: false
matrix:
pair: ["0000-0001", "0002-0003", "0004-0005", "0006-0007"]
pair: ["0000-0001", "0002-0003", "0004-0005", "0006-0007", "0008-0009"]

steps:
- uses: actions/checkout@v2
Expand Down
20 changes: 20 additions & 0 deletions src/UglyToad.PdfPig.Core/XrefEntryType.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
namespace UglyToad.PdfPig.Core;

/// <summary>
/// Indicates where an object is located in the Xref.
/// Each member has an explicit numeric value and the enum is stored as a single <see cref="byte"/>.
/// </summary>
// NOTE(review): the values 0/1/2 look like they mirror the entry "type" field of a PDF
// cross-reference stream - confirm against the xref parser before relying on that mapping.
public enum XrefEntryType : byte
{
/// <summary>
/// Free object. This is the zero value, so it is also what an uninitialized <see cref="XrefEntryType"/> holds.
/// </summary>
Free = 0,
/// <summary>
/// Located as an object in the file.
/// </summary>
File = 1,
/// <summary>
/// Located in a compressed object stream.
/// </summary>
ObjectStream = 2
}
42 changes: 42 additions & 0 deletions src/UglyToad.PdfPig.Core/XrefLocation.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
namespace UglyToad.PdfPig.Core;

/// <summary>
/// Information about where an object is located in the file according to the Xref (or brute force parsing).
/// </summary>
/// <remarks>
/// Instances are created through <see cref="File(long)"/> or <see cref="Stream(long, int)"/>.
/// The default value of this struct has <see cref="Type"/> equal to <see cref="XrefEntryType.Free"/>
/// and both values set to zero.
/// </remarks>
public readonly struct XrefLocation : System.IEquatable<XrefLocation>
{
    /// <summary>
    /// Which type of location is indicated.
    /// </summary>
    public readonly XrefEntryType Type;

    /// <summary>
    /// If <see cref="Type"/> is <see cref="XrefEntryType.File"/> this is the byte offset in the file,
    /// if <see cref="Type"/> is <see cref="XrefEntryType.ObjectStream"/> this is the stream number.
    /// </summary>
    public readonly long Value1;

    /// <summary>
    /// If <see cref="Type"/> is <see cref="XrefEntryType.ObjectStream"/> this is the index of the object in the stream.
    /// It is set to zero by <see cref="File(long)"/>.
    /// </summary>
    public readonly int Value2; // only used for ObjectStream

    private XrefLocation(XrefEntryType type, long value1, int value2)
    {
        Type = type;
        Value1 = value1;
        Value2 = value2;
    }

    /// <summary>
    /// Create a location mapped to a byte offset in the file.
    /// </summary>
    /// <param name="offset">The byte offset, stored in <see cref="Value1"/>.</param>
    /// <returns>A location with <see cref="Type"/> set to <see cref="XrefEntryType.File"/>.</returns>
    public static XrefLocation File(long offset)
        => new XrefLocation(XrefEntryType.File, offset, 0);

    /// <summary>
    /// Create a location mapped to an index inside an object stream.
    /// </summary>
    /// <param name="objStream">The stream number, stored in <see cref="Value1"/>.</param>
    /// <param name="index">The index of the object in the stream, stored in <see cref="Value2"/>.</param>
    /// <returns>A location with <see cref="Type"/> set to <see cref="XrefEntryType.ObjectStream"/>.</returns>
    public static XrefLocation Stream(long objStream, int index)
        => new XrefLocation(XrefEntryType.ObjectStream, objStream, index);

    /// <summary>
    /// Whether this location has the same <see cref="Type"/>, <see cref="Value1"/> and <see cref="Value2"/> as <paramref name="other"/>.
    /// </summary>
    /// <param name="other">The location to compare with.</param>
    /// <returns><see langword="true"/> if all three fields match.</returns>
    public bool Equals(XrefLocation other)
        => Type == other.Type && Value1 == other.Value1 && Value2 == other.Value2;

    /// <inheritdoc />
    public override bool Equals(object? obj)
        => obj is XrefLocation other && Equals(other);

    /// <inheritdoc />
    public override int GetHashCode()
    {
        // Hand-rolled combination so no additional types are required; overflow is intentional.
        unchecked
        {
            var hash = (int)Type;
            hash = (hash * 397) ^ Value1.GetHashCode();
            hash = (hash * 397) ^ Value2;
            return hash;
        }
    }

    /// <summary>
    /// Whether two locations are equal, see <see cref="Equals(XrefLocation)"/>.
    /// </summary>
    public static bool operator ==(XrefLocation left, XrefLocation right)
        => left.Equals(right);

    /// <summary>
    /// Whether two locations differ, see <see cref="Equals(XrefLocation)"/>.
    /// </summary>
    public static bool operator !=(XrefLocation left, XrefLocation right)
        => !left.Equals(right);
}
22 changes: 20 additions & 2 deletions src/UglyToad.PdfPig.Tests/Filters/FlateFilterTests.cs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
namespace UglyToad.PdfPig.Tests.Filters
{
using PdfPig.Core;
using PdfPig.Filters;
using PdfPig.Tokens;

Expand All @@ -11,15 +12,32 @@ public class FlateFilterTests
public void EncodeAndDecodePreservesInput()
{
var parameters = new DictionaryToken(new Dictionary<NameToken, IToken>());
var input = new byte[] {67, 69, 69, 10, 4, 20, 6, 19, 120, 64, 64, 64, 32};
var input = new byte[] { 67, 69, 69, 10, 4, 20, 6, 19, 120, 64, 64, 64, 32 };

using (var inputStream = new MemoryStream(input))
{
inputStream.Seek(0, SeekOrigin.Begin);
var result = filter.Encode(inputStream, parameters, 0);
var result = filter.Encode(inputStream, parameters);
var decoded = filter.Decode(result, parameters, TestFilterProvider.Instance, 0);
Assert.Equal(input, decoded.ToArray());
}
}

[Fact]
public void CanDecodeCorruptedInputIssue1235()
{
    // Hex encoded input for issue 1235 (corrupted, per the test name); note the run of trailing zero bytes.
    const string corruptedHex =
        "789C958D5D0AC2400C844FB077980B74BB7FD9D982F820B43E8B7B03C542C187EAFDC1F84B7D1164200999E49BD9044C6653D10E1E443DA1AF6636ED76EF315E7572968E1ECDAB7FB7506C4C59C0AEB3912EE270366AAAF4E36D364BF7911450DC274A5112B1AC9751D77A58680B51A4D8AE433D62953C037396E0F290FBE098B267A43051725AA34E77E44EF50B1B52B42C90E4ADF83FB94FDD0000000000";

    var hexToken = new HexToken(corruptedHex.AsSpan());
    var emptyParameters = new DictionaryToken(new Dictionary<NameToken, IToken>());
    var encoded = hexToken.Bytes.ToArray();

    var decoded = filter.Decode(encoded, emptyParameters, TestFilterProvider.Instance, 0);

    // Only the first character of the decoded output is checked.
    var decodedText = OtherEncodings.BytesAsLatin1String(decoded.ToArray());
    Assert.StartsWith("q", decodedText);
}
}
}
6 changes: 3 additions & 3 deletions src/UglyToad.PdfPig.Tests/Integration/GithubIssuesTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -327,7 +327,7 @@ public void Issue1122()
var path = IntegrationHelpers.GetSpecificTestDocumentPath("StackOverflow_Issue_1122.pdf");

var ex = Assert.Throws<PdfDocumentFormatException>(() => PdfDocument.Open(path, new ParsingOptions() { UseLenientParsing = true }));
Assert.Equal("The root object in the trailer did not resolve to a readable dictionary.", ex.Message);
Assert.StartsWith("Circular reference encountered when looking", ex.Message);
}

[Fact]
Expand Down Expand Up @@ -386,7 +386,7 @@ public void Issue1050()
{
var path = IntegrationHelpers.GetSpecificTestDocumentPath("SpookyPass.pdf");
var ex = Assert.Throws<PdfDocumentFormatException>(() => PdfDocument.Open(path, new ParsingOptions() { UseLenientParsing = true }));
Assert.Equal("The root object in the trailer did not resolve to a readable dictionary.", ex.Message);
Assert.StartsWith("Object stream cannot contain itself", ex.Message);
}

[Fact]
Expand Down Expand Up @@ -552,7 +552,7 @@ public void Issue953_IntOverflow()
{
var page = document.GetPage(13);
// This used to fail with an overflow exception when we failed to validate the zlib encoded data
Assert.NotNull(DocstrumBoundingBoxes.Instance.GetBlocks(page.GetWords()));
Assert.Throws<OverflowException>(() => DocstrumBoundingBoxes.Instance.GetBlocks(page.GetWords()));
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ 0000000576 00000 n
Assert.Equal(2, results.Parts.Count);
Assert.NotNull(results.Trailer);

Assert.Equal(results.XrefOffsets[new IndirectReference(8, 0)], 500);
Assert.Equal(results.XrefOffsets[new IndirectReference(8, 0)].Value1, 500);
}

[Fact]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -589,7 +589,7 @@ private static void AssertObjectsMatch(
{
Assert.True(table.ObjectOffsets.TryGetValue(offset.Key, out var actual));

Assert.Equal(offset.Value, actual);
Assert.Equal(offset.Value, actual.Value1);
}
}

Expand Down
62 changes: 62 additions & 0 deletions src/UglyToad.PdfPig.Tests/Parser/PageContentParserTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -216,6 +216,68 @@ public void CorrectlyHandlesFile0007511CorruptInlineImage()
Assert.NotEmpty(result);
}

[Fact]
public void HandlesIssue953_IntOverflowContent()
{
// Content stream fixture for issue 953 (int overflow).
// After the "( + ) Tj" operator the content stream becomes corrupt. Our current parser therefore reads wrong
// values for operations, and this results in a problem when applying the show text operations.
// We should safely discard or recover on BT/ET boundaries.
// The text below is the fixture itself and must not be reformatted.
const string s =
"""
BT
/TT6 1 Tf
12.007 0 0 12.007 163.2j
-0.19950 Tc
0 Tw
(x)Tj
-0.1949 1.4142 TD
(H)Tj
/TT7 1 Tf
12.031 0 0 12.031 157.38 85.2 Tm
<0077>Tj
-0.1945 1.4114 TD
<0077>Tj
/TT4 1 Tf
12.007 0 0 12.007 174.42 94.5601 Tm
0.0004 Tc
-0.0005 Tw
( + )Tj
E9 478l)]T862.68E9 478E9 484.54 9 155l)]T862.6av9 478E9 15.2(
ET
154.386( i92 m
171.6 97.62 l
S
BT
/TT6 28 Tf
12.03128 T2002.0307 163.2j
-0.19950 DAc
0 Tw853Tj
0.1945 1.4142 om)873j
-0.574142 om)68.80
-0.5797 0 TD
(f)Tj
/TT( )7Tf
0.31945 1.5341 TD371.4j
2.82
8.2652 0 5.724 TD
0 Tc
-0.0001 2748.3( = 091ity )-27483
[(te27483
[(te27483
[(te27483
[(te27483
[(te27483
[(Eq.)52 \(2.1
(
""";

var input = StringBytesTestConverter.Convert(s, false);

// NOTE(review): the final 'true' argument presumably switches on lenient parsing (going by the local's name) - confirm against PageContentParser.
var lenientParser = new PageContentParser(ReflectionGraphicsStateOperationFactory.Instance, new StackDepthGuard(256), true);
// NOTE(review): the first argument (1) is presumably the page number - confirm against PageContentParser.Parse.
var result = lenientParser.Parse(1, input.Bytes, log);

// The test only requires that parsing completes and yields at least one result.
Assert.NotEmpty(result);
}

private static string LineEndingsToWhiteSpace(string str)
{
return str.Replace("\r\n", " ").Replace('\n', ' ').Replace('\r', ' ');
Expand Down
84 changes: 42 additions & 42 deletions src/UglyToad.PdfPig.Tests/Parser/Parts/BruteForceSearcherTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ public void SearcherFindsCorrectObjects()

Assert.Equal(4, locations.Count);

Assert.Equal(TestDataOffsets, locations.Values);
Assert.Equal(TestDataOffsets, locations.Values.Select(x => x.Value1));
}

[Fact]
Expand Down Expand Up @@ -111,7 +111,7 @@ 11 0 obj
s.IndexOf("11 0 obj", StringComparison.OrdinalIgnoreCase)
};

Assert.Equal(expectedLocations, locations.Values);
Assert.Equal(expectedLocations, locations.Values.Select(x => x.Value1));
}

[Fact]
Expand Down Expand Up @@ -142,7 +142,7 @@ 5 0 obj
s.IndexOf("5 0 obj", StringComparison.OrdinalIgnoreCase)
};

Assert.Equal(expectedLocations, locations.Values);
Assert.Equal(expectedLocations, locations.Values.Select(x => x.Value1));
}

[Fact]
Expand All @@ -156,17 +156,17 @@ public void BruteForceSearcherFileOffsetsCorrect()

Assert.Equal(13, locations.Count);

Assert.Equal(6183, locations[new IndirectReference(1, 0)]);
Assert.Equal(244, locations[new IndirectReference(2, 0)]);
Assert.Equal(15, locations[new IndirectReference(3, 0)]);
Assert.Equal(222, locations[new IndirectReference(4, 0)]);
Assert.Equal(5766, locations[new IndirectReference(5, 0)]);
Assert.Equal(353, locations[new IndirectReference(6, 0)]);
Assert.Equal(581, locations[new IndirectReference(7, 0)]);
Assert.Equal(5068, locations[new IndirectReference(8, 0)]);
Assert.Equal(5091, locations[new IndirectReference(9, 0)]);

var s = GetStringAt(bytes, locations[new IndirectReference(3, 0)]);
Assert.Equal(6183, locations[new IndirectReference(1, 0)].Value1);
Assert.Equal(244, locations[new IndirectReference(2, 0)].Value1);
Assert.Equal(15, locations[new IndirectReference(3, 0)].Value1);
Assert.Equal(222, locations[new IndirectReference(4, 0)].Value1);
Assert.Equal(5766, locations[new IndirectReference(5, 0)].Value1);
Assert.Equal(353, locations[new IndirectReference(6, 0)].Value1);
Assert.Equal(581, locations[new IndirectReference(7, 0)].Value1);
Assert.Equal(5068, locations[new IndirectReference(8, 0)].Value1);
Assert.Equal(5091, locations[new IndirectReference(9, 0)].Value1);

var s = GetStringAt(bytes, locations[new IndirectReference(3, 0)].Value1);
Assert.StartsWith("3 0 obj", s);
}
}
Expand All @@ -180,17 +180,17 @@ public void BruteForceSearcherBytesFileOffsetsCorrect()

Assert.Equal(13, locations.Count);

Assert.Equal(6183, locations[new IndirectReference(1, 0)]);
Assert.Equal(244, locations[new IndirectReference(2, 0)]);
Assert.Equal(15, locations[new IndirectReference(3, 0)]);
Assert.Equal(222, locations[new IndirectReference(4, 0)]);
Assert.Equal(5766, locations[new IndirectReference(5, 0)]);
Assert.Equal(353, locations[new IndirectReference(6, 0)]);
Assert.Equal(581, locations[new IndirectReference(7, 0)]);
Assert.Equal(5068, locations[new IndirectReference(8, 0)]);
Assert.Equal(5091, locations[new IndirectReference(9, 0)]);

var s = GetStringAt(bytes, locations[new IndirectReference(3, 0)]);
Assert.Equal(6183, locations[new IndirectReference(1, 0)].Value1);
Assert.Equal(244, locations[new IndirectReference(2, 0)].Value1);
Assert.Equal(15, locations[new IndirectReference(3, 0)].Value1);
Assert.Equal(222, locations[new IndirectReference(4, 0)].Value1);
Assert.Equal(5766, locations[new IndirectReference(5, 0)].Value1);
Assert.Equal(353, locations[new IndirectReference(6, 0)].Value1);
Assert.Equal(581, locations[new IndirectReference(7, 0)].Value1);
Assert.Equal(5068, locations[new IndirectReference(8, 0)].Value1);
Assert.Equal(5091, locations[new IndirectReference(9, 0)].Value1);

var s = GetStringAt(bytes, locations[new IndirectReference(3, 0)].Value1);
Assert.StartsWith("3 0 obj", s);
}

Expand All @@ -203,21 +203,21 @@ public void BruteForceSearcherFileOffsetsCorrectOpenOffice()

Assert.Equal(13, locations.Count);

Assert.Equal(17, locations[new IndirectReference(1, 0)]);
Assert.Equal(249, locations[new IndirectReference(2, 0)]);
Assert.Equal(14291, locations[new IndirectReference(3, 0)]);
Assert.Equal(275, locations[new IndirectReference(4, 0)]);
Assert.Equal(382, locations[new IndirectReference(5, 0)]);
Assert.Equal(13283, locations[new IndirectReference(6, 0)]);
Assert.Equal(13309, locations[new IndirectReference(7, 0)]);
Assert.Equal(13556, locations[new IndirectReference(8, 0)]);
Assert.Equal(13926, locations[new IndirectReference(9, 0)]);
Assert.Equal(14183, locations[new IndirectReference(10, 0)]);
Assert.Equal(14224, locations[new IndirectReference(11, 0)]);
Assert.Equal(14428, locations[new IndirectReference(12, 0)]);
Assert.Equal(14488, locations[new IndirectReference(13, 0)]);

var s = GetStringAt(bytes, locations[new IndirectReference(12, 0)]);
Assert.Equal(17, locations[new IndirectReference(1, 0)].Value1);
Assert.Equal(249, locations[new IndirectReference(2, 0)].Value1);
Assert.Equal(14291, locations[new IndirectReference(3, 0)].Value1);
Assert.Equal(275, locations[new IndirectReference(4, 0)].Value1);
Assert.Equal(382, locations[new IndirectReference(5, 0)].Value1);
Assert.Equal(13283, locations[new IndirectReference(6, 0)].Value1);
Assert.Equal(13309, locations[new IndirectReference(7, 0)].Value1);
Assert.Equal(13556, locations[new IndirectReference(8, 0)].Value1);
Assert.Equal(13926, locations[new IndirectReference(9, 0)].Value1);
Assert.Equal(14183, locations[new IndirectReference(10, 0)].Value1);
Assert.Equal(14224, locations[new IndirectReference(11, 0)].Value1);
Assert.Equal(14428, locations[new IndirectReference(12, 0)].Value1);
Assert.Equal(14488, locations[new IndirectReference(13, 0)].Value1);

var s = GetStringAt(bytes, locations[new IndirectReference(12, 0)].Value1);
Assert.StartsWith("12 0 obj", s);
}

Expand All @@ -230,7 +230,7 @@ public void BruteForceSearcherCorrectlyFindsAllObjectsWhenOffset()

var locations = BruteForceSearcher.GetObjectLocations(input);

Assert.Equal(TestDataOffsets, locations.Values);
Assert.Equal(TestDataOffsets, locations.Values.Select(x => x.Value1));
}

[Fact]
Expand Down Expand Up @@ -265,7 +265,7 @@ 11 0 obj
s.IndexOf("11 0 obj", StringComparison.OrdinalIgnoreCase)
};

Assert.Equal(expectedLocations, locations.Values);
Assert.Equal(expectedLocations, locations.Values.Select(x => x.Value1));
}

private static string GetStringAt(IInputBytes bytes, long location)
Expand Down
Loading
Loading