Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,8 @@ public void OnlyExposedApiIsPublic()
"UglyToad.PdfPig.DocumentLayoutAnalysis.RecursiveXYCut",
"UglyToad.PdfPig.DocumentLayoutAnalysis.XYNode",
"UglyToad.PdfPig.DocumentLayoutAnalysis.XYLeaf",
"UglyToad.PdfPig.DocumentLayoutAnalysis.TextEdgesExtractor",
"UglyToad.PdfPig.DocumentLayoutAnalysis.EdgeType",
"UglyToad.PdfPig.Exceptions.PdfDocumentEncryptedException",
"UglyToad.PdfPig.Exceptions.PdfDocumentFormatException",
"UglyToad.PdfPig.Fonts.DescriptorFontFile",
Expand Down
10 changes: 10 additions & 0 deletions src/UglyToad.PdfPig/DocumentLayoutAnalysis/Docstrum.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
using System;
using System.Collections.Generic;
using System.Text;

namespace UglyToad.PdfPig.DocumentLayoutAnalysis
{
    /// <summary>
    /// Empty placeholder type: it currently declares no members.
    /// </summary>
    internal class Docstrum
    {
    }
}
109 changes: 109 additions & 0 deletions src/UglyToad.PdfPig/DocumentLayoutAnalysis/TextEdgesExtractor.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
using System;
using System.Collections.Concurrent;
using System.Collections.Generic;
using System.Linq;
using System.Threading.Tasks;
using UglyToad.PdfPig.Content;
using UglyToad.PdfPig.Geometry;

namespace UglyToad.PdfPig.DocumentLayoutAnalysis
{
/// <summary>
/// Text edges extractor. Text edges are where words have either their BoundingBox's left, right or mid coordinates aligned on the same vertical line.
/// <para>Useful to detect text columns, tables, justified text, lists, etc.</para>
/// </summary>
public static class TextEdgesExtractor
{
    /// <summary>
    /// Functions used to define left, middle and right edges. Each one maps a bounding box to the
    /// x-coordinate (rounded to zero decimal places) that is used to group words on a vertical line.
    /// </summary>
    private static readonly Tuple<EdgeType, Func<PdfRectangle, decimal>>[] edgesFuncs = new Tuple<EdgeType, Func<PdfRectangle, decimal>>[]
    {
        Tuple.Create<EdgeType, Func<PdfRectangle, decimal>>(EdgeType.Left, x => Math.Round(x.Left, 0)),                 // use BoundingBox's left coordinate
        Tuple.Create<EdgeType, Func<PdfRectangle, decimal>>(EdgeType.Mid, x => Math.Round(x.Left + x.Width / 2, 0)),    // use BoundingBox's mid coordinate
        Tuple.Create<EdgeType, Func<PdfRectangle, decimal>>(EdgeType.Right, x => Math.Round(x.Right, 0))                // use BoundingBox's right coordinate
    };

    /// <summary>
    /// Get the text edges.
    /// </summary>
    /// <param name="pageWords">The words in the page. Words whose text is null or white space are ignored.</param>
    /// <param name="minimumElements">The minimum number of elements to define a text edge.</param>
    /// <returns>
    /// For each <see cref="EdgeType"/>, the vertical lines found. Each line runs from the lowest
    /// BoundingBox bottom to the highest BoundingBox top of the words forming the edge.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="pageWords"/> is null.</exception>
    /// <exception cref="ArgumentException"><paramref name="minimumElements"/> is negative.</exception>
    public static IReadOnlyDictionary<EdgeType, List<PdfLine>> GetEdges(IEnumerable<Word> pageWords, int minimumElements = 4)
    {
        if (pageWords == null)
        {
            throw new ArgumentNullException(nameof(pageWords));
        }

        if (minimumElements < 0)
        {
            throw new ArgumentException("TextEdgesExtractor.GetEdges(): The minimum number of elements should be positive.", nameof(minimumElements));
        }

        // Materialise once: the words are read several times by each of the parallel workers below,
        // so a deferred query would re-run (and be enumerated concurrently) for every pass.
        List<Word> cleanWords = pageWords.Where(x => !string.IsNullOrWhiteSpace(x.Text)).ToList();

        ConcurrentDictionary<EdgeType, List<PdfLine>> dictionary = new ConcurrentDictionary<EdgeType, List<PdfLine>>();

        // The three edge types are independent of each other, so they are computed in parallel.
        Parallel.ForEach(edgesFuncs, f =>
        {
            dictionary.TryAdd(f.Item1, GetVerticalEdges(cleanWords, f.Item2, minimumElements));
        });

        return dictionary.ToDictionary(x => x.Key, x => x.Value);
    }

    /// <summary>
    /// Finds the vertical edges for a single edge definition.
    /// Words are grouped by the x-coordinate returned by <paramref name="func"/>; each group with enough
    /// words is then split wherever another word crosses the vertical line within the group's extent.
    /// </summary>
    /// <param name="pageWords">The words to analyse.</param>
    /// <param name="func">Maps a word's BoundingBox to the x-coordinate used for grouping.</param>
    /// <param name="minimumElements">The minimum number of words required to form an edge.</param>
    /// <returns>One vertical line per edge segment that has at least <paramref name="minimumElements"/> words.</returns>
    private static List<PdfLine> GetVerticalEdges(IEnumerable<Word> pageWords, Func<PdfRectangle, decimal> func, int minimumElements)
    {
        List<PdfLine> lines = new List<PdfLine>();

        var candidates = pageWords.GroupBy(x => func(x.BoundingBox))
                                  .Select(g => new { X = g.Key, Words = g.ToList() })
                                  .Where(c => c.Words.Count >= minimumElements)
                                  .ToList();

        foreach (var candidate in candidates)
        {
            decimal edgeX = candidate.X;
            List<Word> edgeWords = candidate.Words;
            List<Word> remaining = edgeWords.OrderBy(x => x.BoundingBox.Bottom).ToList();

            // Vertical extent of the candidate edge, computed once instead of once per tested word.
            decimal edgeBottom = edgeWords.Min(z => z.BoundingBox.Bottom);
            decimal edgeTop = edgeWords.Max(z => z.BoundingBox.Top);

            var cuttings = pageWords.Except(edgeWords) // remove selected words
                // words that cut the vertical line
                .Where(x => x.BoundingBox.Left < edgeX && x.BoundingBox.Right > edgeX)
                // and that are within the boundaries of the edge
                .Where(k => k.BoundingBox.Bottom > edgeBottom && k.BoundingBox.Top < edgeTop)
                .OrderBy(x => x.BoundingBox.Bottom).ToList();

            // Walk the cutting words from bottom to top; the edge words lying entirely below each cut
            // form one segment. With no cutting words the whole group is emitted as a single segment.
            foreach (var cut in cuttings)
            {
                List<Word> below = remaining.Where(x => x.BoundingBox.Top < cut.BoundingBox.Bottom).ToList();
                AddEdgeLine(lines, edgeX, below, minimumElements);
                remaining = remaining.Except(below).ToList();
            }

            AddEdgeLine(lines, edgeX, remaining, minimumElements);
        }

        return lines;
    }

    /// <summary>
    /// Appends the vertical line spanned by <paramref name="words"/> at <paramref name="x"/>, provided the
    /// segment is not empty and has at least <paramref name="minimumElements"/> words.
    /// </summary>
    private static void AddEdgeLine(List<PdfLine> lines, decimal x, List<Word> words, int minimumElements)
    {
        // An empty segment has no extent (Min/Max would throw), so it is skipped even when minimumElements is 0.
        if (words.Count == 0 || words.Count < minimumElements)
        {
            return;
        }

        lines.Add(new PdfLine(x, words.Min(w => w.BoundingBox.Bottom), x, words.Max(w => w.BoundingBox.Top)));
    }
}

/// <summary>
/// Identifies which BoundingBox coordinate the words of a text edge share.
/// </summary>
public enum EdgeType
{
    /// <summary>
    /// The words are aligned on the left coordinate of their BoundingBox.
    /// </summary>
    Left = 0,

    /// <summary>
    /// The words are aligned on the mid coordinate of their BoundingBox.
    /// </summary>
    Mid = 1,

    /// <summary>
    /// The words are aligned on the right coordinate of their BoundingBox.
    /// </summary>
    Right = 2
}
}