Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/run_common_crawl_tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ jobs:
strategy:
fail-fast: false
matrix:
pair: ["0000-0001", "0002-0003", "0004-0005", "0006-0007", "0008-0009"]
pair: ["0000-0001", "0002-0003", "0004-0005", "0006-0007", "0008-0009", "0010-0011", "0012-0013"]

steps:
- uses: actions/checkout@v2
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

public class ArrayTokenizerTests
{
private readonly ArrayTokenizer tokenizer = new ArrayTokenizer(true, new StackDepthGuard(256));
private readonly ArrayTokenizer tokenizer = new ArrayTokenizer(true, new StackDepthGuard(256), false);

[Theory]
[InlineData("]")]
Expand Down
6 changes: 4 additions & 2 deletions src/UglyToad.PdfPig.Tokenization/ArrayTokenizer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,15 @@ internal sealed class ArrayTokenizer : ITokenizer
{
private readonly bool usePdfDocEncoding;
private readonly StackDepthGuard stackDepthGuard;
private readonly bool useLenientParsing;

public bool ReadsNextByte { get; } = false;

public ArrayTokenizer(bool usePdfDocEncoding, StackDepthGuard stackDepthGuard)
public ArrayTokenizer(bool usePdfDocEncoding, StackDepthGuard stackDepthGuard, bool useLenientParsing)
{
this.usePdfDocEncoding = usePdfDocEncoding;
this.stackDepthGuard = stackDepthGuard;
this.useLenientParsing = useLenientParsing;
}

public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken token)
Expand All @@ -27,7 +29,7 @@ public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken tok
return false;
}

var scanner = new CoreTokenScanner(inputBytes, usePdfDocEncoding, stackDepthGuard, ScannerScope.Array);
var scanner = new CoreTokenScanner(inputBytes, usePdfDocEncoding, stackDepthGuard, ScannerScope.Array, useLenientParsing: useLenientParsing);

var contents = new List<IToken>();

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ public CoreTokenScanner(
this.usePdfDocEncoding = usePdfDocEncoding;
this.stackDepthGuard = stackDepthGuard;
this.stringTokenizer = new StringTokenizer(usePdfDocEncoding);
this.arrayTokenizer = new ArrayTokenizer(usePdfDocEncoding, this.stackDepthGuard);
this.arrayTokenizer = new ArrayTokenizer(usePdfDocEncoding, this.stackDepthGuard, useLenientParsing);
this.dictionaryTokenizer = new DictionaryTokenizer(usePdfDocEncoding, this.stackDepthGuard, useLenientParsing: useLenientParsing);
this.scope = scope;
this.namedDictionaryRequiredKeys = namedDictionaryRequiredKeys;
Expand Down
6 changes: 5 additions & 1 deletion tools/UglyToad.PdfPig.ConsoleRunner/Program.cs
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,11 @@ public static int Main(string[] args)
sw.Reset();
sw.Start();

using (var pdfDocument = PdfDocument.Open(file))
using (var pdfDocument = PdfDocument.Open(file, new ParsingOptions
{
UseLenientParsing = true,
SkipMissingFonts = true,
}))
{
sw.Stop();

Expand Down
25 changes: 24 additions & 1 deletion tools/common-crawl-ignore.txt
Original file line number Diff line number Diff line change
Expand Up @@ -46,4 +46,27 @@
0009309.pdf
0009464.pdf
0009706.pdf
0009944.pdf
0009944.pdf
0010114.pdf
0010117.pdf
0010216.pdf
0010472.pdf
0010697.pdf
0010902.pdf
0010950.pdf
0011041.pdf
0011171.pdf
0011398.pdf
0011450.pdf
0011758.pdf
0011989.pdf
0012117.pdf
0012684.pdf
0012730.pdf
0013051.pdf
0013178.pdf
0013338.pdf
0013425.pdf
0013587.pdf
0013721.pdf
0013822.pdf
Loading