From dd505465596f375ac537a7c72a10e4c2b9715bef Mon Sep 17 00:00:00 2001 From: Bert Huijben Date: Thu, 16 Oct 2025 11:36:49 +0200 Subject: [PATCH] Avoid a lot of seeks by making most tokenizers no longer read too far by using seek. Optimize the FirstPassParser to just fetch a final chunk before doing things char-by-char backwards. --- src/UglyToad.PdfPig.Core/ReadHelper.cs | 7 +- src/UglyToad.PdfPig.Core/StreamInputBytes.cs | 11 +++ .../Type1/Parser/Type1ArrayTokenizer.cs | 2 +- .../Type1/Parser/Type1FontParser.cs | 5 ++ .../Type1/Parser/Type1Tokenizer.cs | 65 +++++++++++------ .../Tokenization/NumericTokenizerTests.cs | 5 +- .../ArrayTokenizer.cs | 2 +- .../CommentTokenizer.cs | 7 +- .../DictionaryTokenizer.cs | 2 +- .../EndOfLineTokenizer.cs | 2 +- .../HexTokenizer.cs | 2 +- .../NameTokenizer.cs | 17 ++--- .../NumericTokenizer.cs | 71 +++++++++---------- .../PlainTokenizer.cs | 17 ++--- .../Scanner/CoreTokenScanner.cs | 7 +- .../Parser/FileStructure/FileHeaderParser.cs | 3 +- .../FirstPassParser.StartXref.cs | 43 +++++++++++ .../Parser/Parts/BruteForceSearcher.cs | 2 +- .../Tokenization/Scanner/PdfTokenScanner.cs | 2 +- 19 files changed, 177 insertions(+), 95 deletions(-) diff --git a/src/UglyToad.PdfPig.Core/ReadHelper.cs b/src/UglyToad.PdfPig.Core/ReadHelper.cs index 184489fc1..4d6e9842f 100644 --- a/src/UglyToad.PdfPig.Core/ReadHelper.cs +++ b/src/UglyToad.PdfPig.Core/ReadHelper.cs @@ -24,12 +24,17 @@ public static class ReadHelper /// public const byte AsciiCarriageReturn = 13; + /// + /// The tab '\t' character. 
+ /// + public const byte AsciiTab = 9; + private static readonly HashSet EndOfNameCharacters = [ ' ', AsciiCarriageReturn, AsciiLineFeed, - 9, + AsciiTab, '>', '<', '[', diff --git a/src/UglyToad.PdfPig.Core/StreamInputBytes.cs b/src/UglyToad.PdfPig.Core/StreamInputBytes.cs index d29868741..21f215b8f 100644 --- a/src/UglyToad.PdfPig.Core/StreamInputBytes.cs +++ b/src/UglyToad.PdfPig.Core/StreamInputBytes.cs @@ -96,6 +96,17 @@ public bool IsAtEnd() /// public void Seek(long position) { + var current = CurrentOffset; + if (position == current) + { + return; + } + else if (peekByte.HasValue && position == current + 1) + { + MoveNext(); + return; + } + isAtEnd = false; peekByte = null; diff --git a/src/UglyToad.PdfPig.Fonts/Type1/Parser/Type1ArrayTokenizer.cs b/src/UglyToad.PdfPig.Fonts/Type1/Parser/Type1ArrayTokenizer.cs index eb709f04f..e1d6e30c5 100644 --- a/src/UglyToad.PdfPig.Fonts/Type1/Parser/Type1ArrayTokenizer.cs +++ b/src/UglyToad.PdfPig.Fonts/Type1/Parser/Type1ArrayTokenizer.cs @@ -14,7 +14,7 @@ public sealed class Type1ArrayTokenizer : ITokenizer /// public bool ReadsNextByte { get; } = false; - private static readonly string[] Space = [" "]; + private static readonly char[] Space = [' ']; /// public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken token) diff --git a/src/UglyToad.PdfPig.Fonts/Type1/Parser/Type1FontParser.cs b/src/UglyToad.PdfPig.Fonts/Type1/Parser/Type1FontParser.cs index 606725be1..6b3e26e4a 100644 --- a/src/UglyToad.PdfPig.Fonts/Type1/Parser/Type1FontParser.cs +++ b/src/UglyToad.PdfPig.Fonts/Type1/Parser/Type1FontParser.cs @@ -88,6 +88,11 @@ public static Type1Font Parse(IInputBytes inputBytes, int length1, int length2) { int offset = 0; + while (inputBytes.Peek() is { } b && ReadHelper.IsWhitespace(b)) + { + inputBytes.MoveNext(); + } + while (inputBytes.MoveNext()) { if (inputBytes.CurrentByte == (byte)ClearToMark[offset]) diff --git a/src/UglyToad.PdfPig.Fonts/Type1/Parser/Type1Tokenizer.cs 
b/src/UglyToad.PdfPig.Fonts/Type1/Parser/Type1Tokenizer.cs index 333f008ed..aac323f3f 100644 --- a/src/UglyToad.PdfPig.Fonts/Type1/Parser/Type1Tokenizer.cs +++ b/src/UglyToad.PdfPig.Fonts/Type1/Parser/Type1Tokenizer.cs @@ -2,6 +2,7 @@ { using System; using System.Collections.Generic; + using System.Diagnostics; using System.Globalization; using System.Text; using Core; @@ -41,35 +42,43 @@ private Type1Token ReadNextToken() do { skip = false; - while (bytes.MoveNext()) + while (bytes.Peek() is { } b) { - var b = bytes.CurrentByte; var c = (char)b; switch (c) { case '%': + bytes.MoveNext(); comments.Add(ReadComment()); break; case '(': + bytes.MoveNext(); return ReadString(); case ')': throw new InvalidOperationException("Encountered an end of string ')' outside of string."); case '[': + bytes.MoveNext(); return new Type1Token(c, Type1Token.TokenType.StartArray); case ']': + bytes.MoveNext(); return new Type1Token(c, Type1Token.TokenType.EndArray); case '{': + bytes.MoveNext(); return new Type1Token(c, Type1Token.TokenType.StartProc); case '}': + bytes.MoveNext(); return new Type1Token(c, Type1Token.TokenType.EndProc); case '/': { - var name = ReadLiteral(); + bytes.MoveNext(); + TryReadLiteral(out var name); + Debug.Assert(name != null); return new Type1Token(name, Type1Token.TokenType.Literal); } case '<': { + bytes.MoveNext(); var following = bytes.Peek(); if (following == '<') { @@ -81,6 +90,7 @@ private Type1Token ReadNextToken() } case '>': { + bytes.MoveNext(); var following = bytes.Peek(); if (following == '>') { @@ -94,23 +104,24 @@ private Type1Token ReadNextToken() { if (ReadHelper.IsWhitespace(b)) { + bytes.MoveNext(); skip = true; break; } if (b == 0) { + bytes.MoveNext(); skip = true; break; } - if (TryReadNumber(c, out var number)) + if (TryReadNumber(out var number)) { return number; } - var name = ReadLiteral(c); - if (name == null) + if (!TryReadLiteral(out var name)) { throw new InvalidOperationException($"The binary portion of the type 1 font was 
invalid at position {bytes.CurrentOffset}."); } @@ -197,12 +208,21 @@ char GetNext() return null; } - private bool TryReadNumber(char c, out Type1Token numberToken) + private bool TryReadNumber(out Type1Token numberToken) { char GetNext() { bytes.MoveNext(); - return (char)bytes.CurrentByte; + return (char)(bytes.Peek() ?? 0); + } + + char c = (char)(bytes.Peek() ?? 0); + + if (!((c >= '0' && c <= '9') || c is '+' or '-')) + { + // Easy out. Not a valid number + numberToken = null; + return false; } numberToken = null; @@ -251,8 +271,6 @@ char GetNext() else { // integer - bytes.Seek(bytes.CurrentOffset - 1); - numberToken = new Type1Token(sb.ToString(), Type1Token.TokenType.Integer); return true; } @@ -309,7 +327,6 @@ char GetNext() } } - bytes.Seek(bytes.CurrentOffset - 1); if (radix != null) { var number = Convert.ToInt32(sb.ToString(), int.Parse(radix.ToString(), CultureInfo.InvariantCulture)); @@ -323,14 +340,9 @@ char GetNext() return true; } - private string ReadLiteral(char? previousCharacter = null) + private bool TryReadLiteral(out string? value) { literalBuffer.Clear(); - if (previousCharacter.HasValue) - { - literalBuffer.Append(previousCharacter); - } - do { var b = bytes.Peek(); @@ -350,8 +362,16 @@ private string ReadLiteral(char? previousCharacter = null) literalBuffer.Append(c); } while (bytes.MoveNext()); - var literal = literalBuffer.ToString(); - return literal.Length == 0 ? null : literal; + if (literalBuffer.Length > 0) + { + value = literalBuffer.ToString(); + return true; + } + else + { + value = null; + return false; + } } private string ReadComment() @@ -375,9 +395,10 @@ private string ReadComment() private Type1DataToken ReadCharString(int length) { // Skip preceding space. 
- bytes.MoveNext(); - // TODO: may be wrong - // bytes.MoveNext(); + if (bytes.Peek() is { } ws && ReadHelper.IsWhitespace(ws)) + { + bytes.MoveNext(); + } byte[] data = new byte[length]; for (int i = 0; i < length; i++) diff --git a/src/UglyToad.PdfPig.Tests/Tokenization/NumericTokenizerTests.cs b/src/UglyToad.PdfPig.Tests/Tokenization/NumericTokenizerTests.cs index be8a19a25..fa9654ff9 100644 --- a/src/UglyToad.PdfPig.Tests/Tokenization/NumericTokenizerTests.cs +++ b/src/UglyToad.PdfPig.Tests/Tokenization/NumericTokenizerTests.cs @@ -91,7 +91,10 @@ public void OnlyParsesNumberPart() Assert.True(result); Assert.Equal(135.6654, AssertNumericToken(token).Data); - Assert.Equal('/', (char)input.Bytes.CurrentByte); + if (tokenizer.ReadsNextByte) + Assert.Equal('/', (char)input.Bytes.CurrentByte); + else + Assert.Equal('4', (char)input.Bytes.CurrentByte); } [Fact] diff --git a/src/UglyToad.PdfPig.Tokenization/ArrayTokenizer.cs b/src/UglyToad.PdfPig.Tokenization/ArrayTokenizer.cs index 8c95e41ff..da1f468dd 100644 --- a/src/UglyToad.PdfPig.Tokenization/ArrayTokenizer.cs +++ b/src/UglyToad.PdfPig.Tokenization/ArrayTokenizer.cs @@ -9,7 +9,7 @@ internal sealed class ArrayTokenizer : ITokenizer { private readonly bool usePdfDocEncoding; - public bool ReadsNextByte { get; } = false; + public bool ReadsNextByte => false; public ArrayTokenizer(bool usePdfDocEncoding) { diff --git a/src/UglyToad.PdfPig.Tokenization/CommentTokenizer.cs b/src/UglyToad.PdfPig.Tokenization/CommentTokenizer.cs index cd51a0e23..25c588952 100644 --- a/src/UglyToad.PdfPig.Tokenization/CommentTokenizer.cs +++ b/src/UglyToad.PdfPig.Tokenization/CommentTokenizer.cs @@ -6,7 +6,7 @@ internal sealed class CommentTokenizer : ITokenizer { - public bool ReadsNextByte { get; } = true; + public bool ReadsNextByte => false; public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken token) { @@ -17,10 +17,11 @@ public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken tok return 
false; } - using var builder = new ValueStringBuilder(); + using var builder = new ValueStringBuilder(stackalloc char[32]); - while (inputBytes.MoveNext() && !ReadHelper.IsEndOfLine(inputBytes.CurrentByte)) + while (inputBytes.Peek() is { } c && !ReadHelper.IsEndOfLine(c)) { + inputBytes.MoveNext(); builder.Append((char) inputBytes.CurrentByte); } diff --git a/src/UglyToad.PdfPig.Tokenization/DictionaryTokenizer.cs b/src/UglyToad.PdfPig.Tokenization/DictionaryTokenizer.cs index 213fdcfc2..3b75ffcc6 100644 --- a/src/UglyToad.PdfPig.Tokenization/DictionaryTokenizer.cs +++ b/src/UglyToad.PdfPig.Tokenization/DictionaryTokenizer.cs @@ -11,7 +11,7 @@ internal class DictionaryTokenizer : ITokenizer private readonly IReadOnlyList requiredKeys; private readonly bool useLenientParsing; - public bool ReadsNextByte { get; } = false; + public bool ReadsNextByte => false; /// /// Create a new . diff --git a/src/UglyToad.PdfPig.Tokenization/EndOfLineTokenizer.cs b/src/UglyToad.PdfPig.Tokenization/EndOfLineTokenizer.cs index 7fb7d2c7a..66597f646 100644 --- a/src/UglyToad.PdfPig.Tokenization/EndOfLineTokenizer.cs +++ b/src/UglyToad.PdfPig.Tokenization/EndOfLineTokenizer.cs @@ -9,7 +9,7 @@ public sealed class EndOfLineTokenizer : ITokenizer { /// - public bool ReadsNextByte { get; } = false; + public bool ReadsNextByte => false; /// public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken token) diff --git a/src/UglyToad.PdfPig.Tokenization/HexTokenizer.cs b/src/UglyToad.PdfPig.Tokenization/HexTokenizer.cs index 72fe06772..92bc16309 100644 --- a/src/UglyToad.PdfPig.Tokenization/HexTokenizer.cs +++ b/src/UglyToad.PdfPig.Tokenization/HexTokenizer.cs @@ -5,7 +5,7 @@ internal sealed class HexTokenizer : ITokenizer { - public bool ReadsNextByte { get; } = false; + public bool ReadsNextByte => false; public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken token) { diff --git a/src/UglyToad.PdfPig.Tokenization/NameTokenizer.cs 
b/src/UglyToad.PdfPig.Tokenization/NameTokenizer.cs index 1aa624471..16974167a 100644 --- a/src/UglyToad.PdfPig.Tokenization/NameTokenizer.cs +++ b/src/UglyToad.PdfPig.Tokenization/NameTokenizer.cs @@ -11,14 +11,14 @@ internal sealed class NameTokenizer : ITokenizer { +#if NET static NameTokenizer() { -#if NET Encoding.RegisterProvider(CodePagesEncodingProvider.Instance); -#endif } +#endif - public bool ReadsNextByte { get; } = true; + public bool ReadsNextByte => false; public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken token) { @@ -35,10 +35,8 @@ public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken tok int postEscapeRead = 0; Span escapedChars = stackalloc char[2]; - while (inputBytes.MoveNext()) + while (inputBytes.Peek() is { } b) { - var b = inputBytes.CurrentByte; - if (b == '#') { escapeActive = true; @@ -52,8 +50,9 @@ public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken tok if (postEscapeRead == 2) { - int high = escapedChars[0] <= '9' ? escapedChars[0] - '0' : char.ToUpper(escapedChars[0]) - 'A' + 10; - int low = escapedChars[1] <= '9' ? escapedChars[1] - '0' : char.ToUpper(escapedChars[1]) - 'A' + 10; + // We validated that the char is hex. So assume ASCII rules apply and shortcut hex decoding + int high = escapedChars[0] <= '9' ? escapedChars[0] - '0' : ((escapedChars[0] & 0xF) + 9); + int low = escapedChars[1] <= '9' ? 
escapedChars[1] - '0' : ((escapedChars[1] & 0xF) + 9); byte characterToWrite = (byte)(high * 16 + low); @@ -100,6 +99,8 @@ public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken tok { bytes.Write(b); } + + inputBytes.MoveNext(); } #if NET8_0_OR_GREATER diff --git a/src/UglyToad.PdfPig.Tokenization/NumericTokenizer.cs b/src/UglyToad.PdfPig.Tokenization/NumericTokenizer.cs index e5123eb47..55413f0d7 100644 --- a/src/UglyToad.PdfPig.Tokenization/NumericTokenizer.cs +++ b/src/UglyToad.PdfPig.Tokenization/NumericTokenizer.cs @@ -7,15 +7,7 @@ namespace UglyToad.PdfPig.Tokenization; internal sealed class NumericTokenizer : ITokenizer { - private const byte Zero = 48; - private const byte Nine = 57; - private const byte Negative = (byte)'-'; - private const byte Positive = (byte)'+'; - private const byte Period = (byte)'.'; - private const byte ExponentLower = (byte)'e'; - private const byte ExponentUpper = (byte)'E'; - - public bool ReadsNextByte => true; + public bool ReadsNextByte => false; public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken? token) { @@ -37,30 +29,50 @@ public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken? to var isExponentNegative = false; var exponentPart = 0; - do + byte? firstByte = currentByte; + bool noRead = true; + bool acceptSign = true; + while (!inputBytes.IsAtEnd() || firstByte is { }) { - var b = inputBytes.CurrentByte; - if (b >= Zero && b <= Nine) + if (firstByte is { } b) + { + firstByte = null; + } + else if (noRead) + { + noRead = false; + b = inputBytes.Peek() ?? 0; + } + else { + inputBytes.MoveNext(); + b = inputBytes.Peek() ?? 
0; + } + + if (b >= '0' && b <= '9') + { + var value = b - '0'; if (hasExponent) { - exponentPart = (exponentPart * 10) + (b - Zero); + exponentPart = (exponentPart * 10) + value; } else if (hasFraction) { - fractionalPart = (fractionalPart * 10) + (b - Zero); + fractionalPart = (fractionalPart * 10) + value; fractionalCount++; } else { - integerPart = (integerPart * 10) + (b - Zero); + integerPart = (integerPart * 10) + value; } + acceptSign = false; } - else if (b == Positive) + else if (b == '+' && acceptSign) { // Has no impact + acceptSign = false; } - else if (b == Negative) + else if (b == '-' && acceptSign) { if (hasExponent) { @@ -70,30 +82,17 @@ public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken? to { isNegative = true; } + // acceptSign = false; // Somehow we have a test that expects to support "--21.72" to return -21.72 } - else if (b == Period) + else if (b == '.' && !hasExponent && !hasFraction) { - if (hasExponent || hasFraction) - { - return false; - } - hasFraction = true; + acceptSign = false; } - else if (b == ExponentLower || b == ExponentUpper) + else if ((b == 'e' || b == 'E') && readBytes > 0 && !hasExponent) { - // Don't allow leading exponent. - if (readBytes == 0) - { - return false; - } - - if (hasExponent) - { - return false; - } - hasExponent = true; + acceptSign = true; } else { @@ -107,7 +106,7 @@ public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken? 
to } readBytes++; - } while (inputBytes.MoveNext()); + } if (hasExponent && !isExponentNegative) { diff --git a/src/UglyToad.PdfPig.Tokenization/PlainTokenizer.cs b/src/UglyToad.PdfPig.Tokenization/PlainTokenizer.cs index 5019eda8c..c637d7aa2 100644 --- a/src/UglyToad.PdfPig.Tokenization/PlainTokenizer.cs +++ b/src/UglyToad.PdfPig.Tokenization/PlainTokenizer.cs @@ -6,7 +6,7 @@ internal sealed class PlainTokenizer : ITokenizer { - public bool ReadsNextByte { get; } = true; + public bool ReadsNextByte => false; public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken token) { @@ -21,18 +21,11 @@ public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken tok builder.Append((char)currentByte); - while (inputBytes.MoveNext()) + while (inputBytes.Peek() is { } b + && !ReadHelper.IsWhitespace(b) + && (char)b is not '<' and not '[' and not '/' and not ']' and not '>' and not '(' and not ')') { - if (ReadHelper.IsWhitespace(inputBytes.CurrentByte)) - { - break; - } - - if (inputBytes.CurrentByte is (byte)'<' or (byte)'[' or (byte)'/' or (byte)']' or (byte)'>' or (byte)'(' or (byte)')') - { - break; - } - + inputBytes.MoveNext(); builder.Append((char) inputBytes.CurrentByte); } diff --git a/src/UglyToad.PdfPig.Tokenization/Scanner/CoreTokenScanner.cs b/src/UglyToad.PdfPig.Tokenization/Scanner/CoreTokenScanner.cs index 9d74ba031..228862fbd 100644 --- a/src/UglyToad.PdfPig.Tokenization/Scanner/CoreTokenScanner.cs +++ b/src/UglyToad.PdfPig.Tokenization/Scanner/CoreTokenScanner.cs @@ -246,7 +246,7 @@ public bool MoveNext() /* * Some tokenizers need to read the symbol of the next token to know if they have ended - * so we don't want to move on to the next byte, we would lose a byte, e.g.: /NameOne/NameTwo or /Name(string) + * so we don't want to move on to the next byte, we would lose a byte, e.g.: /NameOne/NameTwo or /Name(string) */ hasBytePreRead = tokenizer.ReadsNextByte; @@ -317,12 +317,13 @@ private List ReadInlineImageData() { // The 
ID operator should be followed by a single white-space character, and the next character is interpreted // as the first byte of image data. - if (!ReadHelper.IsWhitespace(inputBytes.CurrentByte)) + if (inputBytes.Peek() is { } c + && !ReadHelper.IsWhitespace(c)) { throw new PdfDocumentFormatException($"No whitespace character following the image data (ID) operator. Position: {inputBytes.CurrentOffset}."); } - var startsAt = inputBytes.CurrentOffset - 2; + var startsAt = inputBytes.CurrentOffset - 1; return ReadUntilEndImage(startsAt); } diff --git a/src/UglyToad.PdfPig/Parser/FileStructure/FileHeaderParser.cs b/src/UglyToad.PdfPig/Parser/FileStructure/FileHeaderParser.cs index 237515bc1..8775244cb 100644 --- a/src/UglyToad.PdfPig/Parser/FileStructure/FileHeaderParser.cs +++ b/src/UglyToad.PdfPig/Parser/FileStructure/FileHeaderParser.cs @@ -80,9 +80,8 @@ private static HeaderVersion GetHeaderVersionAndResetScanner(CommentToken commen } var atEnd = scanner.CurrentPosition == scanner.Length; - var rewind = atEnd ? 1 : 2; - var commentOffset = scanner.CurrentPosition - comment.Data.Length - rewind; + var commentOffset = scanner.CurrentPosition - comment.Data.Length - 1; scanner.Seek(0); diff --git a/src/UglyToad.PdfPig/Parser/FileStructure/FirstPassParser.StartXref.cs b/src/UglyToad.PdfPig/Parser/FileStructure/FirstPassParser.StartXref.cs index 43b24b722..984083236 100644 --- a/src/UglyToad.PdfPig/Parser/FileStructure/FirstPassParser.StartXref.cs +++ b/src/UglyToad.PdfPig/Parser/FileStructure/FirstPassParser.StartXref.cs @@ -10,13 +10,56 @@ internal static partial class FirstPassParser { private static ReadOnlySpan StartXRefBytes => "startxref"u8; + public const long EndOfFileBufferSize = 1024; + public static StartXRefLocation GetFirstCrossReferenceOffset( IInputBytes bytes, ISeekableTokenScanner scanner, ILog log) { + // We used to read backward through the file, but this is quite expensive for streams that directly wrap OS files. 
+ // Instead we fetch the last 1024 bytes of the file and do a memory search, as cheap first attempt. This is significantly faster + // in practice, if there is no in-process caching of the file involved + // + // If that fails (in practice it should never) we fall back to the old method of reading backwards. var fileLength = bytes.Length; + { + var fetchFrom = Math.Max(bytes.Length - EndOfFileBufferSize, 0L); + + bytes.Seek(fetchFrom); + + Span byteBuffer = new byte[bytes.Length - fetchFrom]; // TODO: Maybe use PoolArray? + + int n = bytes.Read(byteBuffer); + + if (n == byteBuffer.Length) + { + int lx = byteBuffer.LastIndexOf("startxref"u8); + + if (lx < 0) + { + // See old code. We also try a mangled version + lx = byteBuffer.LastIndexOf("startref"u8); + } + + if (lx >= 0) + { + scanner.Seek(fetchFrom + lx); + + if (scanner.TryReadToken(out OperatorToken startXrefOp) && (startXrefOp.Data == "startxref" || startXrefOp.Data == "startref")) + { + var pos = GetNumericTokenFollowingCurrent(scanner); + + log.Debug($"Found startxref at {pos}"); + + return new StartXRefLocation(fetchFrom + lx, pos); + } + } + + } + } + // Now fall through in the old code var buffer = new CircularByteBuffer(StartXRefBytes.Length); // Start from the end of the file diff --git a/src/UglyToad.PdfPig/Parser/Parts/BruteForceSearcher.cs b/src/UglyToad.PdfPig/Parser/Parts/BruteForceSearcher.cs index 0ecde8965..e26ba0e08 100644 --- a/src/UglyToad.PdfPig/Parser/Parts/BruteForceSearcher.cs +++ b/src/UglyToad.PdfPig/Parser/Parts/BruteForceSearcher.cs @@ -57,7 +57,7 @@ public static IReadOnlyDictionary GetObjectLocations(II { var next = bytes.Peek(); - if (next.HasValue && next == 'n') + if (next == 'n') { if (ReadHelper.IsString(bytes, "endobj")) { diff --git a/src/UglyToad.PdfPig/Tokenization/Scanner/PdfTokenScanner.cs b/src/UglyToad.PdfPig/Tokenization/Scanner/PdfTokenScanner.cs index 8fbcf4180..c5c9fcd42 100644 --- a/src/UglyToad.PdfPig/Tokenization/Scanner/PdfTokenScanner.cs +++ 
b/src/UglyToad.PdfPig/Tokenization/Scanner/PdfTokenScanner.cs @@ -465,7 +465,7 @@ private bool TryReadStream(long startStreamTokenOffset, bool getLength, [NotNull read++; } - long streamDataEnd = inputBytes.CurrentOffset + 1; + long streamDataEnd = inputBytes.CurrentOffset; if (possibleEndLocation == null) return false;