diff --git a/.github/workflows/run_common_crawl_tests.yml b/.github/workflows/run_common_crawl_tests.yml index f8a923c83..6a7e4db06 100644 --- a/.github/workflows/run_common_crawl_tests.yml +++ b/.github/workflows/run_common_crawl_tests.yml @@ -12,7 +12,7 @@ jobs: strategy: fail-fast: false matrix: - pair: ["0000-0001", "0002-0003", "0004-0005", "0006-0007", "0008-0009"] + pair: ["0000-0001", "0002-0003", "0004-0005", "0006-0007", "0008-0009", "0010-0011", "0012-0013"] steps: - uses: actions/checkout@v2 diff --git a/src/UglyToad.PdfPig.Tests/Tokenization/ArrayTokenizerTests.cs b/src/UglyToad.PdfPig.Tests/Tokenization/ArrayTokenizerTests.cs index 93478cff4..a0f4362c2 100644 --- a/src/UglyToad.PdfPig.Tests/Tokenization/ArrayTokenizerTests.cs +++ b/src/UglyToad.PdfPig.Tests/Tokenization/ArrayTokenizerTests.cs @@ -6,7 +6,7 @@ public class ArrayTokenizerTests { - private readonly ArrayTokenizer tokenizer = new ArrayTokenizer(true, new StackDepthGuard(256)); + private readonly ArrayTokenizer tokenizer = new ArrayTokenizer(true, new StackDepthGuard(256), false); [Theory] [InlineData("]")] diff --git a/src/UglyToad.PdfPig.Tokenization/ArrayTokenizer.cs b/src/UglyToad.PdfPig.Tokenization/ArrayTokenizer.cs index c054d0e24..1520528af 100644 --- a/src/UglyToad.PdfPig.Tokenization/ArrayTokenizer.cs +++ b/src/UglyToad.PdfPig.Tokenization/ArrayTokenizer.cs @@ -9,13 +9,15 @@ internal sealed class ArrayTokenizer : ITokenizer { private readonly bool usePdfDocEncoding; private readonly StackDepthGuard stackDepthGuard; + private readonly bool useLenientParsing; public bool ReadsNextByte { get; } = false; - public ArrayTokenizer(bool usePdfDocEncoding, StackDepthGuard stackDepthGuard) + public ArrayTokenizer(bool usePdfDocEncoding, StackDepthGuard stackDepthGuard, bool useLenientParsing) { this.usePdfDocEncoding = usePdfDocEncoding; this.stackDepthGuard = stackDepthGuard; + this.useLenientParsing = useLenientParsing; } public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken token) @@ -27,7 +29,7 @@ public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken tok return false; } - var scanner = new CoreTokenScanner(inputBytes, usePdfDocEncoding, stackDepthGuard, ScannerScope.Array); + var scanner = new CoreTokenScanner(inputBytes, usePdfDocEncoding, stackDepthGuard, ScannerScope.Array, useLenientParsing: useLenientParsing); var contents = new List(); diff --git a/src/UglyToad.PdfPig.Tokenization/Scanner/CoreTokenScanner.cs b/src/UglyToad.PdfPig.Tokenization/Scanner/CoreTokenScanner.cs index f1f4e9eac..4f51bf3a7 100644 --- a/src/UglyToad.PdfPig.Tokenization/Scanner/CoreTokenScanner.cs +++ b/src/UglyToad.PdfPig.Tokenization/Scanner/CoreTokenScanner.cs @@ -70,7 +70,7 @@ public CoreTokenScanner( this.usePdfDocEncoding = usePdfDocEncoding; this.stackDepthGuard = stackDepthGuard; this.stringTokenizer = new StringTokenizer(usePdfDocEncoding); - this.arrayTokenizer = new ArrayTokenizer(usePdfDocEncoding, this.stackDepthGuard); + this.arrayTokenizer = new ArrayTokenizer(usePdfDocEncoding, this.stackDepthGuard, useLenientParsing); this.dictionaryTokenizer = new DictionaryTokenizer(usePdfDocEncoding, this.stackDepthGuard, useLenientParsing: useLenientParsing); this.scope = scope; this.namedDictionaryRequiredKeys = namedDictionaryRequiredKeys; diff --git a/tools/UglyToad.PdfPig.ConsoleRunner/Program.cs b/tools/UglyToad.PdfPig.ConsoleRunner/Program.cs index 1fe36cdcb..ec6d2049b 100644 --- a/tools/UglyToad.PdfPig.ConsoleRunner/Program.cs +++ b/tools/UglyToad.PdfPig.ConsoleRunner/Program.cs @@ -185,7 +185,11 @@ public static int Main(string[] args) sw.Reset(); sw.Start(); - using (var pdfDocument = PdfDocument.Open(file)) + using (var pdfDocument = PdfDocument.Open(file, new ParsingOptions + { + UseLenientParsing = true, + SkipMissingFonts = true, + })) { sw.Stop(); diff --git a/tools/common-crawl-ignore.txt b/tools/common-crawl-ignore.txt index 5d7f95b3c..0c8523ffb 100644 --- a/tools/common-crawl-ignore.txt +++ b/tools/common-crawl-ignore.txt @@ -46,4 +46,27 @@ 0009309.pdf 0009464.pdf 0009706.pdf -0009944.pdf \ No newline at end of file +0009944.pdf +0010114.pdf +0010117.pdf +0010216.pdf +0010472.pdf +0010697.pdf +0010902.pdf +0010950.pdf +0011041.pdf +0011171.pdf +0011398.pdf +0011450.pdf +0011758.pdf +0011989.pdf +0012117.pdf +0012684.pdf +0012730.pdf +0013051.pdf +0013178.pdf +0013338.pdf +0013425.pdf +0013587.pdf +0013721.pdf +0013822.pdf \ No newline at end of file