From 83889cfb52b1780da7e685b87f248eaa20018134 Mon Sep 17 00:00:00 2001 From: BobLd Date: Tue, 6 Aug 2019 15:24:16 +0100 Subject: [PATCH 1/6] Document Layout Analysis - Text edges extractor Text edges are where words have either their BoundingBox's left, right or mid coordinate aligned on the same vertical line. Useful to detect tables, justified text, lists, etc. --- .../PublicApiScannerTests.cs | 1 + .../TextEdgesExtractor.cs | 83 +++++++++++++++++++ 2 files changed, 84 insertions(+) create mode 100644 src/UglyToad.PdfPig/DocumentLayoutAnalysis/TextEdgesExtractor.cs diff --git a/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs b/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs index 2b26f5164..54a98e37e 100644 --- a/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs +++ b/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs @@ -65,6 +65,7 @@ public void OnlyExposedApiIsPublic() "UglyToad.PdfPig.DocumentLayoutAnalysis.RecursiveXYCut", "UglyToad.PdfPig.DocumentLayoutAnalysis.XYNode", "UglyToad.PdfPig.DocumentLayoutAnalysis.XYLeaf", + "UglyToad.PdfPig.DocumentLayoutAnalysis.TextEdgesExtractor", "UglyToad.PdfPig.Exceptions.PdfDocumentEncryptedException", "UglyToad.PdfPig.Exceptions.PdfDocumentFormatException", "UglyToad.PdfPig.Fonts.DescriptorFontFile", diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/TextEdgesExtractor.cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/TextEdgesExtractor.cs new file mode 100644 index 000000000..d07eab708 --- /dev/null +++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/TextEdgesExtractor.cs @@ -0,0 +1,83 @@ +using System; +using System.Collections.Concurrent; +using System.Collections.Generic; +using System.Linq; +using System.Threading.Tasks; +using UglyToad.PdfPig.Content; +using UglyToad.PdfPig.Geometry; + +namespace UglyToad.PdfPig.DocumentLayoutAnalysis +{ + /// + /// Text edges extractor. Text edges are where words have either there BoundingBox's left, right or mid coordinate aligned on the same vertical line. 
+ /// Useful to detect tables, justified text, lists, etc. + /// + public class TextEdgesExtractor + { + /// + /// Functions used to define left, middle and right edges. + /// + private static readonly Tuple>[] edgesFuncs = new Tuple>[] + { + Tuple.Create>("left", x => Math.Round(x.Left, 0)), // use BoundingBox's left coordinate + Tuple.Create>("mid", x => Math.Round(x.Left + x.Width / 2, 0)), // use BoundingBox's mid coordinate + Tuple.Create>("right", x => Math.Round(x.Right, 0)) // use BoundingBox's right coordinate + }; + + /// + /// Get the text edges. + /// + /// The words in the page. + /// The minimum number of elements to define a text edge. + public static Dictionary> GetEdges(IEnumerable pageWords, int minimumElements = 4) + { + var cleanWords = pageWords.Where(x => !string.IsNullOrWhiteSpace(x.Text.Trim())); + + ConcurrentDictionary> dictionary = new ConcurrentDictionary>(); + + Parallel.ForEach(edgesFuncs, f => + { + dictionary.TryAdd(f.Item1, GetVerticalEdges(cleanWords, f.Item2, minimumElements)); + }); + return dictionary.ToDictionary(x => x.Key, x => x.Value); + } + + private static List GetVerticalEdges(IEnumerable pageWords, Func func, int minimumElements) + { + Dictionary> edges = pageWords.GroupBy(x => func(x.BoundingBox)) + .Where(x => x.Count() >= minimumElements).ToDictionary(gdc => gdc.Key, gdc => gdc.ToList()); + Dictionary>> cleanEdges = new Dictionary>>(); + + foreach (var edge in edges) + { + var sortedEdges = edge.Value.OrderBy(x => x.BoundingBox.Bottom).ToList(); + cleanEdges.Add(edge.Key, new List>()); + + var cuttings = pageWords.Except(edge.Value) // remove selected words + // words that cut the vertical line + .Where(x => x.BoundingBox.Left < edge.Key && x.BoundingBox.Right > edge.Key) + // and that are within the boundaries of the edge + .Where(k => k.BoundingBox.Bottom > edge.Value.Min(z => z.BoundingBox.Bottom) + && k.BoundingBox.Top < edge.Value.Max(z => z.BoundingBox.Top)) + .OrderBy(x => x.BoundingBox.Bottom).ToList(); + + 
if (cuttings.Count > 0) + { + foreach (var cut in cuttings) + { + var group1 = sortedEdges.Where(x => x.BoundingBox.Top < cut.BoundingBox.Bottom).ToList(); + if (group1.Count >= minimumElements) cleanEdges[edge.Key].Add(group1); + sortedEdges = sortedEdges.Except(group1).ToList(); + } + if (sortedEdges.Count >= minimumElements) cleanEdges[edge.Key].Add(sortedEdges); + } + else + { + cleanEdges[edge.Key].Add(sortedEdges); + } + } + + return cleanEdges.SelectMany(x => x.Value.Select(y => new PdfLine(x.Key, y.Min(w => w.BoundingBox.Bottom), x.Key, y.Max(w => w.BoundingBox.Top)))).ToList(); + } + } +} From 9694b1f8e89ea6fce751ef43a020ec0d3edac967 Mon Sep 17 00:00:00 2001 From: BobLd <38405645+BobLd@users.noreply.github.com> Date: Tue, 6 Aug 2019 15:27:16 +0100 Subject: [PATCH 2/6] Update TextEdgesExtractor.cs --- .../DocumentLayoutAnalysis/TextEdgesExtractor.cs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/TextEdgesExtractor.cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/TextEdgesExtractor.cs index d07eab708..957e6acf8 100644 --- a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/TextEdgesExtractor.cs +++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/TextEdgesExtractor.cs @@ -9,8 +9,8 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis { /// - /// Text edges extractor. Text edges are where words have either there BoundingBox's left, right or mid coordinate aligned on the same vertical line. - /// Useful to detect tables, justified text, lists, etc. + /// Text edges extractor. Text edges are where words have either there BoundingBox's left, right or mid coordinates aligned on the same vertical line. + /// Useful to detect text columns, tables, justified text, lists, etc. 
/// public class TextEdgesExtractor { From 85d5bb7c7e09aa0e932d0f994781780d6755b4ad Mon Sep 17 00:00:00 2001 From: BobLd Date: Wed, 7 Aug 2019 13:45:57 +0100 Subject: [PATCH 3/6] Adding enum EdgeType --- .../TextEdgesExtractor.cs | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/TextEdgesExtractor.cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/TextEdgesExtractor.cs index 957e6acf8..20f84ae41 100644 --- a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/TextEdgesExtractor.cs +++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/TextEdgesExtractor.cs @@ -80,4 +80,25 @@ private static List GetVerticalEdges(IEnumerable pageWords, Func< return cleanEdges.SelectMany(x => x.Value.Select(y => new PdfLine(x.Key, y.Min(w => w.BoundingBox.Bottom), x.Key, y.Max(w => w.BoundingBox.Top)))).ToList(); } } + + /// + /// The type of edge. + /// + public enum EdgeType + { + /// + /// Text edges where words have their BoundingBox's left coordinate aligned on the same vertical line. + /// + Left = 0, + + /// + /// Text edges where words have their BoundingBox's mid coordinate aligned on the same vertical line. + /// + Mid = 1, + + /// + /// Text edges where words have their BoundingBox's right coordinate aligned on the same vertical line. 
+ /// + Right = 2 + } } From e19b03035ef56e7e7e506b639af1dc262addcbec Mon Sep 17 00:00:00 2001 From: BobLd Date: Wed, 7 Aug 2019 13:49:05 +0100 Subject: [PATCH 4/6] Updating with comments --- .../TextEdgesExtractor.cs | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/TextEdgesExtractor.cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/TextEdgesExtractor.cs index 20f84ae41..c22071843 100644 --- a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/TextEdgesExtractor.cs +++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/TextEdgesExtractor.cs @@ -9,7 +9,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis { /// - /// Text edges extractor. Text edges are where words have either there BoundingBox's left, right or mid coordinates aligned on the same vertical line. + /// Text edges extractor. Text edges are where words have either their BoundingBox's left, right or mid coordinates aligned on the same vertical line. /// Useful to detect text columns, tables, justified text, lists, etc. /// public class TextEdgesExtractor @@ -17,11 +17,11 @@ public class TextEdgesExtractor /// /// Functions used to define left, middle and right edges. 
/// - private static readonly Tuple>[] edgesFuncs = new Tuple>[] + private static readonly Tuple>[] edgesFuncs = new Tuple>[] { - Tuple.Create>("left", x => Math.Round(x.Left, 0)), // use BoundingBox's left coordinate - Tuple.Create>("mid", x => Math.Round(x.Left + x.Width / 2, 0)), // use BoundingBox's mid coordinate - Tuple.Create>("right", x => Math.Round(x.Right, 0)) // use BoundingBox's right coordinate + Tuple.Create>(EdgeType.Left, x => Math.Round(x.Left, 0)), // use BoundingBox's left coordinate + Tuple.Create>(EdgeType.Mid, x => Math.Round(x.Left + x.Width / 2, 0)), // use BoundingBox's mid coordinate + Tuple.Create>(EdgeType.Right, x => Math.Round(x.Right, 0)) // use BoundingBox's right coordinate }; /// @@ -29,11 +29,16 @@ public class TextEdgesExtractor /// /// The words in the page. /// The minimum number of elements to define a text edge. - public static Dictionary> GetEdges(IEnumerable pageWords, int minimumElements = 4) + public static IReadOnlyDictionary> GetEdges(IEnumerable pageWords, int minimumElements = 4) { + if (minimumElements < 0) + { + throw new ArgumentException("TextEdgesExtractor.GetEdges(): The minimum number of elements should be positive.", "minimumElements"); + } + var cleanWords = pageWords.Where(x => !string.IsNullOrWhiteSpace(x.Text.Trim())); - ConcurrentDictionary> dictionary = new ConcurrentDictionary>(); + ConcurrentDictionary> dictionary = new ConcurrentDictionary>(); Parallel.ForEach(edgesFuncs, f => { From 7de6de3780a231b1cff1736ceed315a2c2721c1b Mon Sep 17 00:00:00 2001 From: BobLd Date: Wed, 7 Aug 2019 13:50:07 +0100 Subject: [PATCH 5/6] Updating with comments --- .../DocumentLayoutAnalysis/TextEdgesExtractor.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/TextEdgesExtractor.cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/TextEdgesExtractor.cs index c22071843..2cd6175f6 100644 --- a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/TextEdgesExtractor.cs +++ 
b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/TextEdgesExtractor.cs @@ -12,7 +12,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis /// Text edges extractor. Text edges are where words have either their BoundingBox's left, right or mid coordinates aligned on the same vertical line. /// Useful to detect text columns, tables, justified text, lists, etc. /// - public class TextEdgesExtractor + public static class TextEdgesExtractor { /// /// Functions used to define left, middle and right edges. From 801ea3ba7f9710e9d63e6ebf74dc799b7335bbbf Mon Sep 17 00:00:00 2001 From: BobLd Date: Wed, 7 Aug 2019 14:22:39 +0100 Subject: [PATCH 6/6] Modified PublicApiScannerTests --- src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs | 1 + src/UglyToad.PdfPig/DocumentLayoutAnalysis/Docstrum.cs | 10 ++++++++++ .../DocumentLayoutAnalysis/TextEdgesExtractor.cs | 2 +- 3 files changed, 12 insertions(+), 1 deletion(-) create mode 100644 src/UglyToad.PdfPig/DocumentLayoutAnalysis/Docstrum.cs diff --git a/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs b/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs index 54a98e37e..865a6240b 100644 --- a/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs +++ b/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs @@ -66,6 +66,7 @@ public void OnlyExposedApiIsPublic() "UglyToad.PdfPig.DocumentLayoutAnalysis.XYNode", "UglyToad.PdfPig.DocumentLayoutAnalysis.XYLeaf", "UglyToad.PdfPig.DocumentLayoutAnalysis.TextEdgesExtractor", + "UglyToad.PdfPig.DocumentLayoutAnalysis.EdgeType", "UglyToad.PdfPig.Exceptions.PdfDocumentEncryptedException", "UglyToad.PdfPig.Exceptions.PdfDocumentFormatException", "UglyToad.PdfPig.Fonts.DescriptorFontFile", diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/Docstrum.cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/Docstrum.cs new file mode 100644 index 000000000..ffb821bef --- /dev/null +++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/Docstrum.cs @@ -0,0 +1,10 @@ +using System; +using System.Collections.Generic; +using 
System.Text; + +namespace UglyToad.PdfPig.DocumentLayoutAnalysis +{ + class Docstrum + { + } +} diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/TextEdgesExtractor.cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/TextEdgesExtractor.cs index 2cd6175f6..83fc76616 100644 --- a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/TextEdgesExtractor.cs +++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/TextEdgesExtractor.cs @@ -87,7 +87,7 @@ private static List GetVerticalEdges(IEnumerable pageWords, Func< } /// - /// The type of edge. + /// The type of text edge. /// public enum EdgeType {