Skip to content
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
177 changes: 177 additions & 0 deletions examples/AdvancedMerge.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,177 @@
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using UglyToad.PdfPig;
using UglyToad.PdfPig.Core;
using UglyToad.PdfPig.Tokens;
using UglyToad.PdfPig.Writer;

namespace UglyToad.Examples;

/// <summary>
/// Example that walks the low-level token structure of a single-page PDF, writes its
/// non-structural objects to an output stream under freshly allocated object numbers
/// (rewriting every indirect reference that points at a renumbered object) and finally
/// writes a cross-reference table covering both the original and the new objects.
/// </summary>
public class AdvancedMerge
{
    /// <summary>
    /// Upper bound on the number of chained indirect references followed by
    /// <see cref="ResolveIndirect(PdfDocument, IToken)"/> before it gives up.
    /// </summary>
    private const int MaxIndirectResolutionDepth = 32;

    /// <summary>
    /// Reads the PDF in <paramref name="input"/> and writes renumbered copies of its objects,
    /// followed by a cross-reference table, to <paramref name="output"/>.
    /// </summary>
    /// <param name="input">Stream containing the source PDF document.</param>
    /// <param name="output">
    /// Stream receiving the written objects. Its current <see cref="Stream.Position"/> is used
    /// as the offset recorded for each object written.
    /// </param>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="input"/> or <paramref name="output"/> is <c>null</c>.
    /// </exception>
    /// <exception cref="ArgumentException">
    /// The catalog has no indirect Pages entry, the page tree does not contain exactly one kid,
    /// or that kid is not an indirect reference to a dictionary.
    /// </exception>
    public static void Run(Stream input, Stream output)
    {
        if (input is null)
        {
            throw new ArgumentNullException(nameof(input));
        }

        if (output is null)
        {
            throw new ArgumentNullException(nameof(output));
        }

        using var pdf = PdfDocument.Open(input);

        // Mutable copy of the source offsets; entries for the newly written objects are added below.
        var pdfObjects = pdf.Structure
            .CrossReferenceTable
            .ObjectOffsets
            .ToDictionary(kvp => kvp.Key, kvp => kvp.Value);

        if (!pdf.Structure.Catalog.CatalogDictionary.TryGet<IndirectReferenceToken>(NameToken.Pages, out var pages))
        {
            throw new ArgumentException("No pages reference were found");
        }

        if (ResolveIndirect(pdf, pages) is not DictionaryToken pagesObj)
        {
            throw new ArgumentException("No pages object were found");
        }

        // This example only supports documents whose page tree root has exactly one kid.
        if (!pagesObj.TryGet(NameToken.Kids, out ArrayToken kids) || kids.Length != 1)
        {
            throw new ArgumentException("Invalid catalog dictionary");
        }

        // The single kid must be an indirect reference: its object number is needed below
        // so that the page object can be excluded from the renumbering pass.
        if (kids.Data[0] is not IndirectReferenceToken kidReference)
        {
            throw new ArgumentException("Invalid catalog dictionary");
        }

        if (ResolveIndirect(pdf, kidReference) is not DictionaryToken pageObj)
        {
            throw new ArgumentException("Invalid catalog dictionary");
        }

        // Here you can extract the page content and bind it into the output page object.
        if (pageObj.TryGet<IndirectReferenceToken>(NameToken.Contents, out var contentObj))
        {
            // NOTE(review): the page copy is written as object "0 0" and is not registered in
            // pdfObjects, so the xref table written below has no entry for it - confirm this
            // placeholder is intended for the example.
            var reference = new IndirectReference(0, 0);
            var xrefLocation = XrefLocation.File(output.Position);
            var newPageObject = new ObjectToken(xrefLocation, reference, pageObj.With(NameToken.Contents, contentObj));
            TokenWriter.Instance.WriteToken(newPageObject, output);
        }

        // Skip all PDF meta structure objects.
        var skippedRefs = new HashSet<IndirectReference>
        {
            pages.Data, // Pages
            kidReference.Data, // Page
            pdf.Structure.Trailer.Root, // Catalog
        };

        // Every remaining object, ordered by object number.
        var oldRefs = pdf.Structure.CrossReferenceTable.ObjectOffsets.Keys
            .Where(k => !skippedRefs.Contains(k))
            .OrderBy(k => k.ObjectNumber)
            .ToList();

        // Build the complete old -> new map first: an object may reference another object that
        // is written later, so all new numbers must be known before any object is rewritten.
        var refMap = new Dictionary<IndirectReference, IndirectReference>();
        var currentObjectNumber = pdf.Structure.Trailer.Size;
        foreach (var oldRef in oldRefs)
        {
            var newRef = new IndirectReference(currentObjectNumber++, 0);
            refMap[oldRef] = newRef;
        }

        foreach (var oldRef in oldRefs)
        {
            var newObjRef = refMap[oldRef];
            var newXref = XrefLocation.File(output.Position);
            var token = ResolveIndirect(pdf, oldRef);
            var updatedToken = ReplaceReferences(token, refMap);

            pdfObjects[newObjRef] = newXref;
            TokenWriter.Instance.WriteToken(new ObjectToken(newXref, newObjRef, updatedToken), output);
        }

        // Write the new xref table.
        TokenWriter.Instance.WriteCrossReferenceTable(
            pdfObjects.ToDictionary(kvp => kvp.Key, kvp => kvp.Value.Value1),
            pdf.Structure.Trailer.Root,
            output,
            (pdf.Structure.Trailer.Info as IndirectReferenceToken)?.Data,
            pdf.Structure.Trailer.PreviousCrossReferenceOffset);
    }

    /// <summary>
    /// Recursively replaces IndirectReferenceToken in the token tree according to the map.
    /// </summary>
    /// <param name="token">Root of the token tree to rewrite.</param>
    /// <param name="mapping">Map from old object references to their replacements.</param>
    /// <returns>
    /// A rebuilt token for references found in <paramref name="mapping"/>, dictionaries, arrays
    /// and streams; the original <paramref name="token"/> instance for every other token,
    /// including references that are not present in the map.
    /// </returns>
    private static IToken ReplaceReferences(IToken token, Dictionary<IndirectReference, IndirectReference> mapping)
    {
        return token switch
        {
            IndirectReferenceToken irt => mapping.TryGetValue(irt.Data, out var newRef) ? new IndirectReferenceToken(newRef) : token,
            DictionaryToken dict => ReplaceDictionary(dict, mapping),
            ArrayToken arr => ReplaceArray(arr, mapping),
            StreamToken stream => ReplaceStream(stream, mapping),
            _ => token
        };
    }

    /// <summary>
    /// Builds a new dictionary token with the same keys whose values have been passed through
    /// <see cref="ReplaceReferences"/>.
    /// </summary>
    private static DictionaryToken ReplaceDictionary(DictionaryToken original, Dictionary<IndirectReference, IndirectReference> mapping)
    {
        var newDict = new Dictionary<NameToken, IToken>(original.Data.Count);
        foreach (var kvp in original.Data)
        {
            newDict[NameToken.Create(kvp.Key)] = ReplaceReferences(kvp.Value, mapping);
        }

        return new DictionaryToken(newDict);
    }

    /// <summary>
    /// Builds a new array token of the same length whose elements have been passed through
    /// <see cref="ReplaceReferences"/>.
    /// </summary>
    private static ArrayToken ReplaceArray(ArrayToken original, Dictionary<IndirectReference, IndirectReference> mapping)
    {
        var newData = new IToken[original.Length];
        for (var i = 0; i < original.Length; i++)
        {
            newData[i] = ReplaceReferences(original.Data[i], mapping);
        }

        return new ArrayToken(newData);
    }

    /// <summary>
    /// Builds a new stream token whose dictionary has had its references replaced; the stream
    /// data of <paramref name="original"/> is passed through unchanged.
    /// </summary>
    private static StreamToken ReplaceStream(StreamToken original, Dictionary<IndirectReference, IndirectReference> mapping)
    {
        var updatedDict = ReplaceDictionary(original.StreamDictionary, mapping);
        return new StreamToken(updatedDict, original.Data);
    }

    /// <summary>
    /// Finds the last leaf page reachable from the given page tree node.
    /// </summary>
    /// <exception cref="ArgumentException">The page tree is malformed.</exception>
    private static IndirectReferenceToken FindLastPage(PdfDocument pdf, DictionaryToken pageTree)
    {
        return FindLastPage(pdf, GetKids(pdf, pageTree));
    }

    /// <summary>
    /// Follows the last entry of each Kids array, starting from <paramref name="pageTree"/>,
    /// until a node whose Type is Page is reached, and returns the reference to that node.
    /// </summary>
    /// <exception cref="ArgumentException">
    /// A Kids array is empty or missing, a node is not a dictionary or has no name-valued Type
    /// entry, the page tree refers back to a node already visited, or the leaf page is not
    /// referenced indirectly.
    /// </exception>
    private static IndirectReferenceToken FindLastPage(PdfDocument pdf, ArrayToken pageTree)
    {
        // Nodes already descended into; guards against looping forever on a cyclic page tree.
        var visited = new HashSet<IndirectReference>();

        while (true)
        {
            if (pageTree.Length == 0)
            {
                throw new ArgumentException("No leaf in page tree");
            }

            var root = pageTree.Data[pageTree.Length - 1];
            if (root is IndirectReferenceToken rootReference && !visited.Add(rootReference.Data))
            {
                throw new ArgumentException("Cyclic page tree");
            }

            if (ResolveIndirect(pdf, root) is not DictionaryToken newRoot)
            {
                throw new ArgumentException("Indirect page tree");
            }

            // TryGet instead of the indexer so that a missing Type entry is reported as a
            // malformed page tree rather than as a KeyNotFoundException.
            if (!newRoot.TryGet<NameToken>(NameToken.Type, out var type))
            {
                throw new ArgumentException("Indirect page tree");
            }

            if (type.Data == NameToken.Page)
            {
                return root as IndirectReferenceToken
                       ?? throw new ArgumentException("Leaf page is not an indirect reference");
            }

            pageTree = GetKids(pdf, newRoot);
        }
    }

    /// <summary>
    /// Returns the Kids array of a page tree node, resolving it first when it is stored as an
    /// indirect reference.
    /// </summary>
    /// <exception cref="ArgumentException">The node has no Kids entry that resolves to an array.</exception>
    private static ArrayToken GetKids(PdfDocument pdf, DictionaryToken node)
    {
        if (!node.TryGet<IToken>(NameToken.Kids, out var kidsToken)
            || ResolveIndirect(pdf, kidsToken) is not ArrayToken kids)
        {
            throw new ArgumentException("Page tree node has no Kids array");
        }

        return kids;
    }

    /// <summary>
    /// Resolves the object identified by <paramref name="reference"/>.
    /// </summary>
    private static IToken ResolveIndirect(PdfDocument doc, IndirectReference reference)
    {
        return ResolveIndirect(doc, new IndirectReferenceToken(reference));
    }

    /// <summary>
    /// Follows <paramref name="token"/> through any chain of indirect references and returns
    /// the first token that is not itself an indirect reference.
    /// </summary>
    /// <param name="doc">Document used to look up referenced objects.</param>
    /// <param name="token">Token to resolve; returned unchanged when it is not an indirect reference.</param>
    /// <exception cref="ArgumentException">
    /// More than <see cref="MaxIndirectResolutionDepth"/> references had to be followed, or a
    /// referenced object contained no data.
    /// </exception>
    private static IToken ResolveIndirect(PdfDocument doc, IToken token)
    {
        var depth = 0;
        while (token is IndirectReferenceToken ir)
        {
            if (++depth > MaxIndirectResolutionDepth)
            {
                throw new ArgumentException(
                    "Cyclic or excessively deep indirect reference chain in PDF document.");
            }

            var obj = doc.Structure.GetObject(ir.Data);
            token = obj.Data ?? throw new ArgumentException($"Failed to resolve indirect object {ir.Data}: the object contains no data.");
        }

        return token;
    }
}
Binary file modified src/UglyToad.PdfPig.Tests/Integration/Documents/capas.pdf
Binary file not shown.
Original file line number Diff line number Diff line change
Expand Up @@ -92,13 +92,13 @@ public void CanTokenizeAllAccessibleObjects(string documentName)
{
Assert.NotNull(document.Structure.Catalog);

//Assert.True(document.Structure.CrossReferenceTable.ObjectOffsets.Count > 0, "Cross reference table was empty.");
//foreach (var objectOffset in document.Structure.CrossReferenceTable.ObjectOffsets)
//{
// var token = document.Structure.GetObject(objectOffset.Key);
Assert.True(document.Structure.CrossReferenceTable.ObjectOffsets.Count > 0, "Cross reference table was empty.");
foreach (var objectOffset in document.Structure.CrossReferenceTable.ObjectOffsets)
{
var token = document.Structure.GetObject(objectOffset.Key);

// Assert.NotNull(token);
//}
Assert.NotNull(token);
}
}
}

Expand Down
1 change: 1 addition & 0 deletions src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@ public void OnlyExposedApiIsPublic()
"UglyToad.PdfPig.Content.TextOrientation",
"UglyToad.PdfPig.Content.XmpMetadata",
"UglyToad.PdfPig.CrossReference.CrossReferenceTable",
"UglyToad.PdfPig.CrossReference.CrossReferenceTablePart",
"UglyToad.PdfPig.CrossReference.CrossReferenceType",
"UglyToad.PdfPig.CrossReference.TrailerDictionary",
"UglyToad.PdfPig.Exceptions.PdfDocumentEncryptedException",
Expand Down
26 changes: 14 additions & 12 deletions src/UglyToad.PdfPig.Tests/Writer/PdfDocumentBuilderTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -174,12 +174,12 @@ public void CanReadSingleBlankPage()

Assert.NotNull(document.Structure.Catalog);

//foreach (var offset in document.Structure.CrossReferenceTable.ObjectOffsets)
//{
// var obj = document.Structure.GetObject(offset.Key);

// Assert.NotNull(obj);
//}
foreach (var offset in document.Structure.CrossReferenceTable.ObjectOffsets)
{
var obj = document.Structure.GetObject(offset.Key);
Assert.NotNull(obj);
}
}
}

Expand Down Expand Up @@ -988,8 +988,8 @@ public void CanDedupObjectsFromSameDoc_Builder()
using (var document = PdfDocument.Open(result, ParsingOptions.LenientParsingOff))
{
Assert.Equal(2, document.NumberOfPages);
// Assert.True(document.Structure.CrossReferenceTable.ObjectOffsets.Count <= 29,
// "Expected object count to be lower than 30"); // 45 objects with duplicates, 29 with correct re-use
Assert.True(document.Structure.CrossReferenceTable.ObjectOffsets.Count <= 29,
"Expected object count to be lower than 30"); // 45 objects with duplicates, 29 with correct re-use
}
}
}
Expand All @@ -1010,8 +1010,8 @@ public void CanDedupObjectsFromDifferentDoc_HashBuilder()
using (var document = PdfDocument.Open(result, ParsingOptions.LenientParsingOff))
{
Assert.Equal(2, document.NumberOfPages);
// Assert.True(document.Structure.CrossReferenceTable.ObjectOffsets.Count <= 29,
// "Expected object count to be lower than 30"); // 45 objects with duplicates, 29 with correct re-use
Assert.True(document.Structure.CrossReferenceTable.ObjectOffsets.Count <= 29,
"Expected object count to be lower than 30"); // 45 objects with duplicates, 29 with correct re-use
}
}
}
Expand Down Expand Up @@ -1404,10 +1404,12 @@ public void WriteObject(long objectNumber, int generation, byte[] data, Stream o
Objects++;
}

public void WriteCrossReferenceTable(IReadOnlyDictionary<IndirectReference, long> objectOffsets,
public void WriteCrossReferenceTable(
IReadOnlyDictionary<IndirectReference, long> objectOffsets,
IndirectReference catalogToken,
Stream outputStream,
IndirectReference? documentInformationReference)
IndirectReference? documentInformationReference,
long? prevTableLocation)
{
WroteCrossReferenceTable = true;
}
Expand Down
Loading