// Post-change state of GroupIndexes / DfsIterative from
// src/UglyToad.PdfPig.DocumentLayoutAnalysis/Clustering.cs.

/// <summary>
/// Group elements using Depth-first search.
/// <para>https://en.wikipedia.org/wiki/Depth-first_search</para>
/// </summary>
/// <param name="edges">
/// The graph. edges[i] = j indicates that there is an edge between i and j;
/// edges[i] = -1 indicates that i has no edge of its own.
/// </param>
/// <returns>
/// A list of lists containing the grouped indexes. Every index of <paramref name="edges"/>
/// appears in exactly one inner list.
/// </returns>
/// <exception cref="System.ArgumentNullException"><paramref name="edges"/> is null.</exception>
internal static List<List<int>> GroupIndexes(int[] edges)
{
    if (edges is null)
    {
        throw new System.ArgumentNullException(nameof(edges));
    }

    // Improved thanks to https://github.com/UglyToad/PdfPig/issues/1178
    var adjacency = new List<int>[edges.Length];
    for (int i = 0; i < edges.Length; i++)
    {
        adjacency[i] = new List<int>();
    }

    // Build the undirected adjacency lists in a single pass over the edges.
    for (int i = 0; i < edges.Length; i++)
    {
        int j = edges[i];
        if (j != -1)
        {
            // i <-> j
            // A reciprocal pair (edges[i] == j and edges[j] == i) is recorded twice.
            // That is harmless: DfsIterative marks a node as done when it is pushed,
            // so a duplicate neighbour is never visited a second time.
            adjacency[i].Add(j);
            adjacency[j].Add(i);
        }
    }

    List<List<int>> groupedIndexes = new List<List<int>>();
    bool[] isDone = new bool[edges.Length];

    for (int p = 0; p < edges.Length; p++)
    {
        if (isDone[p])
        {
            // Already collected as part of an earlier group.
            continue;
        }

        groupedIndexes.Add(DfsIterative(p, adjacency, ref isDone));
    }

    return groupedIndexes;
}

/// <summary>
/// Depth-first search
/// <para>https://en.wikipedia.org/wiki/Depth-first_search</para>
/// </summary>
/// <param name="s">The index the traversal starts from.</param>
/// <param name="adj">The adjacency lists: adj[u] holds the neighbours of u.</param>
/// <param name="isDone">
/// Visited flags, indexed like <paramref name="adj"/>. Every index reached by this call is set to true.
/// </param>
/// <returns>The indexes reachable from <paramref name="s"/> that were not already done, in visiting order.</returns>
private static List<int> DfsIterative(int s, List<int>[] adj, ref bool[] isDone)
{
    List<int> group = new List<int>();
    Stack<int> stack = new Stack<int>(4);
    stack.Push(s);

    // Nodes are flagged when pushed (not when popped) so each index enters the stack at most once.
    isDone[s] = true;

    while (stack.Count > 0)
    {
        var u = stack.Pop();
        group.Add(u);

#if NET
        // The adjacency lists are not modified during the traversal, so viewing one as a span is safe.
        var currentAdj = System.Runtime.InteropServices.CollectionsMarshal.AsSpan(adj[u]);
        int count = currentAdj.Length;
#else
        var currentAdj = adj[u];
        int count = currentAdj.Count;
#endif
        for (int i = 0; i < count; ++i)
        {
            var v = currentAdj[i];
            ref bool done = ref isDone[v];
            if (!done)
            {
                stack.Push(v);
                done = true;
            }
        }
    }

    // NOTE(review): the reviewed excerpt ends right after the while loop; returning `group`
    // is assumed from the declared return type - confirm against the full file.
    return group;
}