diff --git a/src/UglyToad.PdfPig.Core/UglyToad.PdfPig.Core.csproj b/src/UglyToad.PdfPig.Core/UglyToad.PdfPig.Core.csproj index 4b0a23c5e..51cac1a1e 100644 --- a/src/UglyToad.PdfPig.Core/UglyToad.PdfPig.Core.csproj +++ b/src/UglyToad.PdfPig.Core/UglyToad.PdfPig.Core.csproj @@ -1,6 +1,6 @@ - netstandard2.0;net462;net471;net6.0;net8.0 + netstandard2.0;net462;net471;net6.0;net8.0;net9.0 12 0.1.12-alpha001 False diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/DocstrumBoundingBoxes.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/DocstrumBoundingBoxes.cs index f07dcbd1b..56bbef6eb 100644 --- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/DocstrumBoundingBoxes.cs +++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/DocstrumBoundingBoxes.cs @@ -48,12 +48,19 @@ public DocstrumBoundingBoxes(DocstrumBoundingBoxesOptions options) /// The s generated by the document spectrum method. public IReadOnlyList GetBlocks(IEnumerable words) { - if (words?.Any() != true) + if (words is null) { return Array.Empty(); } - return GetBlocks(words.ToList(), + // Avoid multiple enumeration and unnecessary ToArray() if already a list + var wordList = words as IReadOnlyList ?? 
words.ToArray(); + if (wordList.Count == 0) + { + return Array.Empty(); + } + + return GetBlocks(wordList, options.WithinLineBounds, options.WithinLineMultiplier, options.WithinLineBinSize, options.BetweenLineBounds, options.BetweenLineMultiplier, options.BetweenLineBinSize, options.AngularDifferenceBounds, diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/UglyToad.PdfPig.DocumentLayoutAnalysis.csproj b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/UglyToad.PdfPig.DocumentLayoutAnalysis.csproj index 1a750f4cf..6878b7e1e 100644 --- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/UglyToad.PdfPig.DocumentLayoutAnalysis.csproj +++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/UglyToad.PdfPig.DocumentLayoutAnalysis.csproj @@ -1,6 +1,6 @@ - netstandard2.0;net462;net471;net6.0;net8.0 + netstandard2.0;net462;net471;net6.0;net8.0;net9.0 12 0.1.12-alpha001 False diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/WordExtractor/NearestNeighbourWordExtractor.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/WordExtractor/NearestNeighbourWordExtractor.cs index d5c48b208..4e0ce710c 100644 --- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/WordExtractor/NearestNeighbourWordExtractor.cs +++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/WordExtractor/NearestNeighbourWordExtractor.cs @@ -51,34 +51,49 @@ public IEnumerable GetWords(IReadOnlyList letters) if (options.GroupByOrientation) { - // axis aligned - List words = GetWords( - letters.Where(l => l.TextOrientation == TextOrientation.Horizontal).ToList(), - options.MaximumDistance, options.DistanceMeasureAA, options.FilterPivot, - options.Filter, options.MaxDegreeOfParallelism); - - words.AddRange(GetWords( - letters.Where(l => l.TextOrientation == TextOrientation.Rotate270).ToList(), - options.MaximumDistance, options.DistanceMeasureAA, options.FilterPivot, - options.Filter, options.MaxDegreeOfParallelism)); + var buckets = new List[5]; + for (int i = 0; i < buckets.Length; i++) buckets[i] = new List(); - words.AddRange(GetWords( - 
letters.Where(l => l.TextOrientation == TextOrientation.Rotate180).ToList(), - options.MaximumDistance, options.DistanceMeasureAA, options.FilterPivot, - options.Filter, options.MaxDegreeOfParallelism)); - - words.AddRange(GetWords( - letters.Where(l => l.TextOrientation == TextOrientation.Rotate90).ToList(), - options.MaximumDistance, options.DistanceMeasureAA, options.FilterPivot, - options.Filter, options.MaxDegreeOfParallelism)); + foreach (var l in letters) + { + switch (l.TextOrientation) + { + case TextOrientation.Horizontal: buckets[0].Add(l); break; + case TextOrientation.Rotate270: buckets[1].Add(l); break; + case TextOrientation.Rotate180: buckets[2].Add(l); break; + case TextOrientation.Rotate90: buckets[3].Add(l); break; + default: buckets[4].Add(l); break; + } + } - // not axis aligned - words.AddRange(GetWords( - letters.Where(l => l.TextOrientation == TextOrientation.Other).ToList(), - options.MaximumDistance, options.DistanceMeasure, options.FilterPivot, - options.Filter, options.MaxDegreeOfParallelism)); + // Shared result list; List is not thread-safe, so every AddRange into it is done under lock (results). + var results = new List(letters.Count); // Pre-allocate for performance - return words; + // Cap the outer loop at options.MaxDegreeOfParallelism, or the processor count when that value is not positive. NOTE(review): the same limit is also passed to each inner GetWords call, so total concurrency may exceed it - confirm. + var parallelOptions = new System.Threading.Tasks.ParallelOptions + { + MaxDegreeOfParallelism = options.MaxDegreeOfParallelism > 0 ? options.MaxDegreeOfParallelism : Environment.ProcessorCount + }; + + // Range-partition the bucket indices across workers and merge under a lock instead of a ConcurrentBag. NOTE(review): buckets are appended in completion order, not the previous fixed Horizontal/Rotate270/Rotate180/Rotate90/Other order - confirm callers do not depend on word order. + System.Threading.Tasks.Parallel.ForEach( + System.Collections.Concurrent.Partitioner.Create(0, buckets.Length), + parallelOptions, + range => + { + for (int i = range.Item1; i < range.Item2; i++) + { + if (buckets[i].Count == 0) continue; + var measure = (i == 4) ? 
options.DistanceMeasure : options.DistanceMeasureAA; + var words = GetWords(buckets[i], options.MaximumDistance, measure, options.FilterPivot, options.Filter, options.MaxDegreeOfParallelism); + lock (results) + { + results.AddRange(words); + } + } + }); + results.TrimExcess(); + return results; } else { diff --git a/src/UglyToad.PdfPig.Fonts/UglyToad.PdfPig.Fonts.csproj b/src/UglyToad.PdfPig.Fonts/UglyToad.PdfPig.Fonts.csproj index 8c8b2075e..774c47c28 100644 --- a/src/UglyToad.PdfPig.Fonts/UglyToad.PdfPig.Fonts.csproj +++ b/src/UglyToad.PdfPig.Fonts/UglyToad.PdfPig.Fonts.csproj @@ -1,6 +1,6 @@ - netstandard2.0;net462;net471;net6.0;net8.0 + netstandard2.0;net462;net471;net6.0;net8.0;net9.0 12 0.1.12-alpha001 False diff --git a/src/UglyToad.PdfPig.Tests/UglyToad.PdfPig.Tests.csproj b/src/UglyToad.PdfPig.Tests/UglyToad.PdfPig.Tests.csproj index 2d8aff568..e3e298c8c 100644 --- a/src/UglyToad.PdfPig.Tests/UglyToad.PdfPig.Tests.csproj +++ b/src/UglyToad.PdfPig.Tests/UglyToad.PdfPig.Tests.csproj @@ -1,7 +1,7 @@  - net471;net8.0 + net471;net8.0;net9.0 true false full diff --git a/src/UglyToad.PdfPig.Tokenization/UglyToad.PdfPig.Tokenization.csproj b/src/UglyToad.PdfPig.Tokenization/UglyToad.PdfPig.Tokenization.csproj index 88731a0b3..2f3e0d293 100644 --- a/src/UglyToad.PdfPig.Tokenization/UglyToad.PdfPig.Tokenization.csproj +++ b/src/UglyToad.PdfPig.Tokenization/UglyToad.PdfPig.Tokenization.csproj @@ -1,6 +1,6 @@ - netstandard2.0;net462;net471;net6.0;net8.0 + netstandard2.0;net462;net471;net6.0;net8.0;net9.0 12 0.1.12-alpha001 False diff --git a/src/UglyToad.PdfPig.Tokens/UglyToad.PdfPig.Tokens.csproj b/src/UglyToad.PdfPig.Tokens/UglyToad.PdfPig.Tokens.csproj index 4c95062f3..069167812 100644 --- a/src/UglyToad.PdfPig.Tokens/UglyToad.PdfPig.Tokens.csproj +++ b/src/UglyToad.PdfPig.Tokens/UglyToad.PdfPig.Tokens.csproj @@ -1,6 +1,6 @@ - netstandard2.0;net462;net471;net6.0;net8.0 + netstandard2.0;net462;net471;net6.0;net8.0;net9.0 12 0.1.12-alpha001 False diff --git 
a/src/UglyToad.PdfPig/UglyToad.PdfPig.csproj b/src/UglyToad.PdfPig/UglyToad.PdfPig.csproj index 2fb282e6e..797c83107 100644 --- a/src/UglyToad.PdfPig/UglyToad.PdfPig.csproj +++ b/src/UglyToad.PdfPig/UglyToad.PdfPig.csproj @@ -1,6 +1,6 @@ - netstandard2.0;net462;net471;net6.0;net8.0 + netstandard2.0;net462;net471;net6.0;net8.0;net9.0 12 0.1.12-alpha001 False