diff --git a/README.md b/README.md index 81ed5df1d..5c18d079a 100644 --- a/README.md +++ b/README.md @@ -42,7 +42,7 @@ New in v0.0.5 - To create documents use the class ```PdfDocumentBuilder```. Thou byte[] documentBytes = builder.Build(); - File.WriteAllBytes(@"C:\git\newPdf.pdf"); + File.WriteAllBytes(@"C:\git\newPdf.pdf", documentBytes); Each font must be registered with the PdfDocumentBuilder prior to use enable pages to share the font resources. Currently only Standard 14 fonts and TrueType fonts (.ttf) are supported. diff --git a/src/UglyToad.PdfPig.Tests/Graphics/Operations/GraphicsStateOperationTests.cs b/src/UglyToad.PdfPig.Tests/Graphics/Operations/GraphicsStateOperationTests.cs index ca3100630..1aacdee0e 100644 --- a/src/UglyToad.PdfPig.Tests/Graphics/Operations/GraphicsStateOperationTests.cs +++ b/src/UglyToad.PdfPig.Tests/Graphics/Operations/GraphicsStateOperationTests.cs @@ -6,6 +6,7 @@ using System.Linq; using System.Reflection; using PdfPig.Graphics.Operations; + using PdfPig.Graphics.Operations.InlineImages; using PdfPig.Tokens; using Xunit; @@ -41,6 +42,10 @@ public void AllOperationsCanBeWritten() operation = (IGraphicsStateOperation)field.GetValue(null); } + else if (operationType == typeof(EndInlineImage)) + { + operation = new EndInlineImage(new List(), new List()); + } else { var constructor = constructors[0]; diff --git a/src/UglyToad.PdfPig.Tests/IO/InputBytesTests.cs b/src/UglyToad.PdfPig.Tests/IO/InputBytesTests.cs index 18a1666b9..e94f70071 100644 --- a/src/UglyToad.PdfPig.Tests/IO/InputBytesTests.cs +++ b/src/UglyToad.PdfPig.Tests/IO/InputBytesTests.cs @@ -64,6 +64,12 @@ public void ArrayAndStreamBehaveTheSame() Assert.True(stream.IsAtEnd()); Assert.True(array.IsAtEnd()); + + stream.Seek(0); + array.Seek(0); + + Assert.False(stream.IsAtEnd()); + Assert.False(array.IsAtEnd()); } } } diff --git a/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs b/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs index 1c3c5e96f..865a6240b 100644 --- 
a/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs +++ b/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs @@ -65,6 +65,8 @@ public void OnlyExposedApiIsPublic() "UglyToad.PdfPig.DocumentLayoutAnalysis.RecursiveXYCut", "UglyToad.PdfPig.DocumentLayoutAnalysis.XYNode", "UglyToad.PdfPig.DocumentLayoutAnalysis.XYLeaf", + "UglyToad.PdfPig.DocumentLayoutAnalysis.TextEdgesExtractor", + "UglyToad.PdfPig.DocumentLayoutAnalysis.EdgeType", "UglyToad.PdfPig.Exceptions.PdfDocumentEncryptedException", "UglyToad.PdfPig.Exceptions.PdfDocumentFormatException", "UglyToad.PdfPig.Fonts.DescriptorFontFile", @@ -174,6 +176,7 @@ public void OnlyExposedApiIsPublic() "UglyToad.PdfPig.Tokens.HexToken", "UglyToad.PdfPig.Tokens.IDataToken`1", "UglyToad.PdfPig.Tokens.IndirectReferenceToken", + "UglyToad.PdfPig.Tokens.InlineImageDataToken", "UglyToad.PdfPig.Tokens.IToken", "UglyToad.PdfPig.Tokens.NameToken", "UglyToad.PdfPig.Tokens.NullToken", diff --git a/src/UglyToad.PdfPig/Content/Page.cs b/src/UglyToad.PdfPig/Content/Page.cs index f28c1834a..6ce7b9ff0 100644 --- a/src/UglyToad.PdfPig/Content/Page.cs +++ b/src/UglyToad.PdfPig/Content/Page.cs @@ -9,7 +9,7 @@ using Util; using Util.JetBrains.Annotations; using XObjects; - using UglyToad.PdfPig.Geometry; + using Geometry; /// /// Contains the content and provides access to methods of a single page in the . @@ -30,23 +30,18 @@ public class Page internal CropBox CropBox { get; } + internal PageContent Content { get; } + /// /// The rotation of the page in degrees (clockwise). Valid values are 0, 90, 180 and 270. /// public PageRotationDegrees Rotation { get; } - internal PageContent Content { get; } - /// /// The set of s drawn by the PDF content. /// public IReadOnlyList Letters => Content?.Letters ?? new Letter[0]; - - /// - /// The set of s drawn by the PDF content. - /// - public IReadOnlyList Paths => Content?.Paths ?? new List(); - + /// /// The full text of all characters on the page in the order they are presented in the PDF content. 
/// @@ -136,6 +131,11 @@ public class Experimental private readonly Page page; private readonly AnnotationProvider annotationProvider; + /// + /// The set of s drawn by the PDF content. + /// + public IReadOnlyList Paths => page.Content?.Paths ?? new List(); + internal Experimental(Page page, AnnotationProvider annotationProvider) { this.page = page; diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/RecursiveXYCut.cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/RecursiveXYCut.cs index a961ff171..ecaa610b9 100644 --- a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/RecursiveXYCut.cs +++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/RecursiveXYCut.cs @@ -11,7 +11,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis /// https://en.wikipedia.org/wiki/Recursive_X-Y_cut /// See 'Recursive X-Y Cut using Bounding Boxes of Connected Components' by Jaekyu Ha, Robert M.Haralick and Ihsin T. Phillips /// - public class RecursiveXYCut + public static class RecursiveXYCut { /// /// Get the blocks. diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/TextEdgesExtractor.cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/TextEdgesExtractor.cs new file mode 100644 index 000000000..83fc76616 --- /dev/null +++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/TextEdgesExtractor.cs @@ -0,0 +1,109 @@ +using System; +using System.Collections.Concurrent; +using System.Collections.Generic; +using System.Linq; +using System.Threading.Tasks; +using UglyToad.PdfPig.Content; +using UglyToad.PdfPig.Geometry; + +namespace UglyToad.PdfPig.DocumentLayoutAnalysis +{ + /// + /// Text edges extractor. Text edges are where words have either their BoundingBox's left, right or mid coordinates aligned on the same vertical line. + /// Useful to detect text columns, tables, justified text, lists, etc. + /// + public static class TextEdgesExtractor + { + /// + /// Functions used to define left, middle and right edges. 
+ /// + private static readonly Tuple>[] edgesFuncs = new Tuple>[] + { + Tuple.Create>(EdgeType.Left, x => Math.Round(x.Left, 0)), // use BoundingBox's left coordinate + Tuple.Create>(EdgeType.Mid, x => Math.Round(x.Left + x.Width / 2, 0)), // use BoundingBox's mid coordinate + Tuple.Create>(EdgeType.Right, x => Math.Round(x.Right, 0)) // use BoundingBox's right coordinate + }; + + /// + /// Get the text edges. + /// + /// The words in the page. + /// The minimum number of elements to define a text edge. + public static IReadOnlyDictionary> GetEdges(IEnumerable pageWords, int minimumElements = 4) + { + if (minimumElements < 0) + { + throw new ArgumentException("TextEdgesExtractor.GetEdges(): The minimum number of elements should be positive.", "minimumElements"); + } + + var cleanWords = pageWords.Where(x => !string.IsNullOrWhiteSpace(x.Text.Trim())); + + ConcurrentDictionary> dictionary = new ConcurrentDictionary>(); + + Parallel.ForEach(edgesFuncs, f => + { + dictionary.TryAdd(f.Item1, GetVerticalEdges(cleanWords, f.Item2, minimumElements)); + }); + return dictionary.ToDictionary(x => x.Key, x => x.Value); + } + + private static List GetVerticalEdges(IEnumerable pageWords, Func func, int minimumElements) + { + Dictionary> edges = pageWords.GroupBy(x => func(x.BoundingBox)) + .Where(x => x.Count() >= minimumElements).ToDictionary(gdc => gdc.Key, gdc => gdc.ToList()); + Dictionary>> cleanEdges = new Dictionary>>(); + + foreach (var edge in edges) + { + var sortedEdges = edge.Value.OrderBy(x => x.BoundingBox.Bottom).ToList(); + cleanEdges.Add(edge.Key, new List>()); + + var cuttings = pageWords.Except(edge.Value) // remove selected words + // words that cut the vertical line + .Where(x => x.BoundingBox.Left < edge.Key && x.BoundingBox.Right > edge.Key) + // and that are within the boundaries of the edge + .Where(k => k.BoundingBox.Bottom > edge.Value.Min(z => z.BoundingBox.Bottom) + && k.BoundingBox.Top < edge.Value.Max(z => z.BoundingBox.Top)) + .OrderBy(x => 
x.BoundingBox.Bottom).ToList(); + + if (cuttings.Count > 0) + { + foreach (var cut in cuttings) + { + var group1 = sortedEdges.Where(x => x.BoundingBox.Top < cut.BoundingBox.Bottom).ToList(); + if (group1.Count >= minimumElements) cleanEdges[edge.Key].Add(group1); + sortedEdges = sortedEdges.Except(group1).ToList(); + } + if (sortedEdges.Count >= minimumElements) cleanEdges[edge.Key].Add(sortedEdges); + } + else + { + cleanEdges[edge.Key].Add(sortedEdges); + } + } + + return cleanEdges.SelectMany(x => x.Value.Select(y => new PdfLine(x.Key, y.Min(w => w.BoundingBox.Bottom), x.Key, y.Max(w => w.BoundingBox.Top)))).ToList(); + } + } + + /// + /// The type of text edge. + /// + public enum EdgeType + { + /// + /// Text edges where words have their BoundingBox's left coordinate aligned on the same vertical line. + /// + Left = 0, + + /// + /// Text edges where words have their BoundingBox's mid coordinate aligned on the same vertical line. + /// + Mid = 1, + + /// + /// Text edges where words have their BoundingBox's right coordinate aligned on the same vertical line. 
+ /// + Right = 2 + } +} diff --git a/src/UglyToad.PdfPig/Fonts/CidFonts/Type2CidFont.cs b/src/UglyToad.PdfPig/Fonts/CidFonts/Type2CidFont.cs index 8689c4920..3e08fad6e 100644 --- a/src/UglyToad.PdfPig/Fonts/CidFonts/Type2CidFont.cs +++ b/src/UglyToad.PdfPig/Fonts/CidFonts/Type2CidFont.cs @@ -48,6 +48,11 @@ public Type2CidFont(NameToken type, NameToken subType, NameToken baseFont, Chara public decimal GetWidthFromFont(int characterIdentifier) { + if (fontProgram == null) + { + return GetWidthFromDictionary(characterIdentifier); + } + if (fontProgram.TryGetBoundingAdvancedWidth(characterIdentifier, cidToGid.GetGlyphIndex, out var width)) { return width; diff --git a/src/UglyToad.PdfPig/Geometry/PdfPath.cs b/src/UglyToad.PdfPig/Geometry/PdfPath.cs index 5d2616922..7b4061ed6 100644 --- a/src/UglyToad.PdfPig/Geometry/PdfPath.cs +++ b/src/UglyToad.PdfPig/Geometry/PdfPath.cs @@ -5,19 +5,28 @@ namespace UglyToad.PdfPig.Geometry using System.Collections.Generic; using System.Linq; using System.Text; - using UglyToad.PdfPig.Core; + using Core; /// - /// A path in a PDF document, used by glyphs and page content. + /// A path in a PDF document, used by glyphs and page content. Can contain multiple sub-paths. /// public class PdfPath { private readonly List commands = new List(); + + /// + /// The sequence of sub-paths which form this . + /// public IReadOnlyList Commands => commands; private PdfPoint? currentPosition; - private TransformationMatrix currentTransformationMatrix = TransformationMatrix.Identity; + private readonly TransformationMatrix currentTransformationMatrix; + + /// + /// Create a new . + /// + /// The transformation to apply to all points in this path. public PdfPath(TransformationMatrix transformationMatrix) { currentTransformationMatrix = transformationMatrix; @@ -162,79 +171,140 @@ string BboxToRect(PdfRectangle box, string stroke) return result; } + /// + /// A command in a . 
+ /// public interface IPathCommand { + /// + /// Returns the smallest rectangle which contains the path region given by this command. + /// + /// PdfRectangle? GetBoundingRectangle(); + /// + /// Converts from the path command to an SVG string representing the path operation. + /// void WriteSvg(StringBuilder builder); } - private class Close : IPathCommand + /// + /// Close the current . + /// + public class Close : IPathCommand { + /// public PdfRectangle? GetBoundingRectangle() { return null; } + /// public void WriteSvg(StringBuilder builder) { builder.Append("Z "); } } + /// + /// Move drawing of the current to the specified location. + /// public class Move : IPathCommand { + /// + /// The location to move to. + /// public PdfPoint Location { get; } + /// + /// Create a new path command. + /// + /// public Move(PdfPoint location) { Location = location; } + /// + /// Returns since this generates no visible path. + /// public PdfRectangle? GetBoundingRectangle() { return null; } + /// public void WriteSvg(StringBuilder builder) { builder.Append("M ").Append(Location.X).Append(' ').Append(Location.Y).Append(' '); } } + /// + /// Draw a straight line between two points. + /// public class Line : IPathCommand { + /// + /// The start of the line. + /// public PdfPoint From { get; } + /// + /// The end of the line. + /// public PdfPoint To { get; } + /// + /// Create a new . + /// public Line(PdfPoint from, PdfPoint to) { From = from; To = to; } + /// public PdfRectangle? GetBoundingRectangle() { return new PdfRectangle(From, To); } + /// public void WriteSvg(StringBuilder builder) { builder.AppendFormat("L {0} {1} ", To.X, To.Y); } } + /// + /// Draw a Bezier curve given by the start, control and end points. + /// public class BezierCurve : IPathCommand { + /// + /// The start point of the Bezier curve. + /// public PdfPoint StartPoint { get; } + /// + /// The first control point of the curve. 
+ /// public PdfPoint FirstControlPoint { get; } + /// + /// The second control point of the curve. + /// public PdfPoint SecondControlPoint { get; } + /// + /// The end point of the curve. + /// public PdfPoint EndPoint { get; } + /// + /// Create a Bezier curve at the provided points. + /// public BezierCurve(PdfPoint startPoint, PdfPoint firstControlPoint, PdfPoint secondControlPoint, PdfPoint endPoint) { StartPoint = startPoint; @@ -243,6 +313,7 @@ public BezierCurve(PdfPoint startPoint, PdfPoint firstControlPoint, PdfPoint sec EndPoint = endPoint; } + /// public PdfRectangle? GetBoundingRectangle() { // Optimised @@ -287,6 +358,13 @@ public BezierCurve(PdfPoint startPoint, PdfPoint firstControlPoint, PdfPoint sec return new PdfRectangle((decimal)minX, (decimal)minY, (decimal)maxX, (decimal)maxY); } + /// + public void WriteSvg(StringBuilder builder) + { + builder.AppendFormat("C {0} {1}, {2} {3}, {4} {5} ", FirstControlPoint.X, FirstControlPoint.Y, SecondControlPoint.X, SecondControlPoint.Y, + EndPoint.X, EndPoint.Y); + } + private bool TrySolveQuadratic(bool isX, double currentMin, double currentMax, out (double min, double max) solutions) { @@ -378,12 +456,6 @@ private static double ValueWithT(double p1, double p2, double p3, double p4, dou return p; } - - public void WriteSvg(StringBuilder builder) - { - builder.AppendFormat("C {0} {1}, {2} {3}, {4} {5} ", FirstControlPoint.X, FirstControlPoint.Y, SecondControlPoint.X, SecondControlPoint.Y, - EndPoint.X, EndPoint.Y); - } } internal void Rectangle(decimal x, decimal y, decimal width, decimal height) diff --git a/src/UglyToad.PdfPig/Geometry/PdfPoint.cs b/src/UglyToad.PdfPig/Geometry/PdfPoint.cs index 260c5d669..8ddc1bdf5 100644 --- a/src/UglyToad.PdfPig/Geometry/PdfPoint.cs +++ b/src/UglyToad.PdfPig/Geometry/PdfPoint.cs @@ -82,6 +82,27 @@ internal PdfVector ToVector() return new PdfVector(X, Y); } + /// + /// Returns a value indicating whether this is equal to a specified . 
+ /// + /// + public override bool Equals(object obj) + { + if (obj is PdfPoint point) + { + return point.X == this.X && point.Y == this.Y; + } + return false; + } + + /// + /// Returns the hash code for this . + /// + public override int GetHashCode() + { + return (X, Y).GetHashCode(); + } + /// /// Get a string representation of this point. /// diff --git a/src/UglyToad.PdfPig/Geometry/PdfRectangle.cs b/src/UglyToad.PdfPig/Geometry/PdfRectangle.cs index df8e985b4..d658653a7 100644 --- a/src/UglyToad.PdfPig/Geometry/PdfRectangle.cs +++ b/src/UglyToad.PdfPig/Geometry/PdfRectangle.cs @@ -30,6 +30,11 @@ public struct PdfRectangle /// public PdfPoint BottomLeft { get; } + /// + /// Centroid point of the rectangle. + /// + public PdfPoint Centroid { get; } + /// /// Width of the rectangle. /// @@ -105,15 +110,14 @@ public PdfRectangle(decimal x1, decimal y1, decimal x2, decimal y2) BottomLeft = new PdfPoint(left, bottom); BottomRight = new PdfPoint(right, bottom); + + Centroid = new PdfPoint(left + (right - left) / 2, bottom + (top - bottom) / 2); } internal PdfRectangle(PdfVector topLeft, PdfVector topRight, PdfVector bottomLeft, PdfVector bottomRight) + : this(topLeft.ToPoint(), topRight.ToPoint(), bottomLeft.ToPoint(), bottomRight.ToPoint()) { - TopLeft = topLeft.ToPoint(); - TopRight = topRight.ToPoint(); - BottomLeft = bottomLeft.ToPoint(); - BottomRight = bottomRight.ToPoint(); } internal PdfRectangle(PdfPoint topLeft, PdfPoint topRight, PdfPoint bottomLeft, PdfPoint bottomRight) @@ -123,6 +127,8 @@ internal PdfRectangle(PdfPoint topLeft, PdfPoint topRight, PdfPoint bottomLeft, BottomLeft = bottomLeft; BottomRight = bottomRight; + + Centroid = new PdfPoint(topLeft.X + (topRight.X - topLeft.X) / 2, bottomLeft.Y + (topLeft.Y - bottomLeft.Y) / 2); } /// diff --git a/src/UglyToad.PdfPig/Graphics/Operations/InlineImages/EndInlineImage.cs b/src/UglyToad.PdfPig/Graphics/Operations/InlineImages/EndInlineImage.cs index 87eacfec9..f6b7ceb3f 100644 --- 
a/src/UglyToad.PdfPig/Graphics/Operations/InlineImages/EndInlineImage.cs +++ b/src/UglyToad.PdfPig/Graphics/Operations/InlineImages/EndInlineImage.cs @@ -1,6 +1,9 @@ namespace UglyToad.PdfPig.Graphics.Operations.InlineImages { + using System; + using System.Collections.Generic; using System.IO; + using Tokens; /// /// @@ -14,15 +17,27 @@ public class EndInlineImage : IGraphicsStateOperation public const string Symbol = "EI"; /// - /// The instance of the operation. + /// The tokens declared in order for this inline image object. /// - public static readonly EndInlineImage Value = new EndInlineImage(); + public IReadOnlyList ImageTokens { get; } + /// + /// The raw data for the inline image which should be interpreted according to the . + /// + public IReadOnlyList ImageData { get; } + /// public string Operator => Symbol; - private EndInlineImage() + /// + /// Create a new operation. + /// + /// The tokens which were set during the declaration of this image. + /// The raw byte data of this image. + public EndInlineImage(IReadOnlyList imageTokens, IReadOnlyList imageData) { + ImageTokens = imageTokens ?? throw new ArgumentNullException(nameof(imageTokens)); + ImageData = imageData ?? 
throw new ArgumentNullException(nameof(imageData)); } /// diff --git a/src/UglyToad.PdfPig/IO/StreamInputBytes.cs b/src/UglyToad.PdfPig/IO/StreamInputBytes.cs index 2ce50bcfe..729c7b72a 100644 --- a/src/UglyToad.PdfPig/IO/StreamInputBytes.cs +++ b/src/UglyToad.PdfPig/IO/StreamInputBytes.cs @@ -75,6 +75,8 @@ public bool IsAtEnd() public void Seek(long position) { + isAtEnd = false; + if (position == 0) { stream.Seek(0, SeekOrigin.Begin); diff --git a/src/UglyToad.PdfPig/Parser/PageContentParser.cs b/src/UglyToad.PdfPig/Parser/PageContentParser.cs index 4c3712c8d..a777a0be4 100644 --- a/src/UglyToad.PdfPig/Parser/PageContentParser.cs +++ b/src/UglyToad.PdfPig/Parser/PageContentParser.cs @@ -3,6 +3,7 @@ using System.Collections.Generic; using Graphics; using Graphics.Operations; + using Graphics.Operations.InlineImages; using IO; using Tokenization.Scanner; using Tokens; @@ -27,7 +28,13 @@ public IReadOnlyList Parse(IInputBytes inputBytes) { var token = scanner.CurrentToken; - if (token is OperatorToken op) + if (token is InlineImageDataToken inlineImageData) + { + graphicsStateOperations.Add(BeginInlineImageData.Value); + graphicsStateOperations.Add(new EndInlineImage(precedingTokens, inlineImageData.Data)); + precedingTokens.Clear(); + } + else if (token is OperatorToken op) { var operation = operationFactory.Create(op, precedingTokens); diff --git a/src/UglyToad.PdfPig/Tokenization/Scanner/CoreTokenScanner.cs b/src/UglyToad.PdfPig/Tokenization/Scanner/CoreTokenScanner.cs index 07563135d..2d7925924 100644 --- a/src/UglyToad.PdfPig/Tokenization/Scanner/CoreTokenScanner.cs +++ b/src/UglyToad.PdfPig/Tokenization/Scanner/CoreTokenScanner.cs @@ -20,11 +20,12 @@ internal class CoreTokenScanner : ISeekableTokenScanner private readonly ScannerScope scope; private readonly IInputBytes inputBytes; - private readonly List currentBuffer = new List(); private readonly List<(byte firstByte, ITokenizer tokenizer)> customTokenizers = new List<(byte, ITokenizer)>(); internal long 
CurrentTokenStart { get; private set; } + public IToken CurrentToken { get; private set; } + public bool TryReadToken(out T token) where T : class, IToken { token = default(T); @@ -51,6 +52,7 @@ public void Seek(long position) public long CurrentPosition => inputBytes.CurrentOffset; private bool hasBytePreRead; + private bool isInInlineImage; internal CoreTokenScanner(IInputBytes inputBytes, ScannerScope scope = ScannerScope.None) { @@ -60,8 +62,6 @@ internal CoreTokenScanner(IInputBytes inputBytes, ScannerScope scope = ScannerSc public bool MoveNext() { - currentBuffer.Clear(); - var endAngleBracesRead = 0; bool isSkippingSymbol = false; @@ -89,7 +89,6 @@ public bool MoveNext() continue; } - // If we failed to read the symbol for whatever reason we pass over it. if (isSkippingSymbol && c != '>') { @@ -161,6 +160,23 @@ public bool MoveNext() continue; } + if (token is OperatorToken op) + { + if (op.Data == "BI") + { + isInInlineImage = true; + } + else if (isInInlineImage && op.Data == "ID") + { + // Special case handling for inline images. + var imageData = ReadInlineImageData(); + isInInlineImage = false; + CurrentToken = new InlineImageDataToken(imageData); + hasBytePreRead = false; + return true; + } + } + CurrentToken = token; /* @@ -190,6 +206,35 @@ public void DeregisterCustomTokenizer(ITokenizer tokenizer) customTokenizers.RemoveAll(x => ReferenceEquals(x.tokenizer, tokenizer)); } + private IReadOnlyList ReadInlineImageData() + { + // The ID operator should be followed by a single white-space character, and the next character is interpreted + // as the first byte of image data. + if (inputBytes.CurrentByte != ' ') + { + throw new PdfDocumentFormatException($"No whitespace character following the image data (ID) operator. 
Position: {inputBytes.CurrentOffset}."); + } + + var startsAt = inputBytes.CurrentOffset - 2; + + var imageData = new List(); + byte prevByte = 0; + while (inputBytes.MoveNext()) + { + if (inputBytes.CurrentByte == 'I' && prevByte == 'E') + { + imageData.RemoveAt(imageData.Count - 1); + return imageData; + } + + imageData.Add(inputBytes.CurrentByte); + + prevByte = inputBytes.CurrentByte; + } + + throw new PdfDocumentFormatException($"No end of inline image data (EI) was found for image data at position {startsAt}."); + } + private static bool IsEmpty(byte b) { return b == ' ' || b == '\r' || b == '\n' || b == 0; diff --git a/src/UglyToad.PdfPig/Tokens/InlineImageDataToken.cs b/src/UglyToad.PdfPig/Tokens/InlineImageDataToken.cs new file mode 100644 index 000000000..19b3083e2 --- /dev/null +++ b/src/UglyToad.PdfPig/Tokens/InlineImageDataToken.cs @@ -0,0 +1,22 @@ +namespace UglyToad.PdfPig.Tokens +{ + using System.Collections.Generic; + + /// + /// Inline image data is used to embed images in PDF content streams. The content is wrapped by ID and EI tags in a BI operation. + /// + public class InlineImageDataToken : IDataToken> + { + /// + public IReadOnlyList Data { get; } + + /// + /// Create a new . + /// + /// + public InlineImageDataToken(IReadOnlyList data) + { + Data = data; + } + } +} \ No newline at end of file diff --git a/src/UglyToad.PdfPig/UglyToad.PdfPig.csproj b/src/UglyToad.PdfPig/UglyToad.PdfPig.csproj index 716e04dc3..8e78c4aae 100644 --- a/src/UglyToad.PdfPig/UglyToad.PdfPig.csproj +++ b/src/UglyToad.PdfPig/UglyToad.PdfPig.csproj @@ -6,15 +6,15 @@ full UglyToad PdfPig - Reads text content from PDF documents and supports document creation. + Reads text content from PDF documents and supports document creation. Apache 2.0 licensed.
https://raw.githubusercontent.com/UglyToad/PdfPig/master/LICENSE https://github.com/UglyToad/PdfPig PDF;Reader;Document;Adobe;PDFBox;PdfPig;pdf-extract https://github.com/UglyToad/PdfPig true - 0.0.6 - 0.0.6.0 - 0.0.6.0 + 0.0.7 + 0.0.7.0 + 0.0.7.0 https://raw.githubusercontent.com/UglyToad/PdfPig/master/documentation/pdfpig.png PdfPig true