Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
91 changes: 34 additions & 57 deletions src/UglyToad.PdfPig.DocumentLayoutAnalysis/Clustering.cs
Original file line number Diff line number Diff line change
Expand Up @@ -216,71 +216,37 @@ public static IEnumerable<IReadOnlyList<T>> NearestNeighbours<T>(IReadOnlyList<T
yield return group.Select(i => elements[i]).ToList();
}
}

/// <summary>
/// Group elements using Depth-first search.
/// <para>https://en.wikipedia.org/wiki/Depth-first_search</para>
/// </summary>
/// <param name="edges">The graph. edges[i] = j indicates that there is an edge between i and j.</param>
/// <returns>A List of HashSets containing the grouped indexes.</returns>
internal static List<HashSet<int>> GroupIndexes(int[] edges)

internal static List<List<int>> GroupIndexes(int[] edges)
{
int[][] adjacency = new int[edges.Length][];
// Improved thanks to https://github.com/UglyToad/PdfPig/issues/1178
var adjacency = new List<int>[edges.Length];
for (int i = 0; i < edges.Length; i++)
{
HashSet<int> matches = new HashSet<int>();
if (edges[i] != -1) matches.Add(edges[i]);
for (int j = 0; j < edges.Length; j++)
{
if (edges[j] == i) matches.Add(j);
}
adjacency[i] = matches.ToArray();
adjacency[i] = new List<int>();
}

List<HashSet<int>> groupedIndexes = new List<HashSet<int>>();
bool[] isDone = new bool[edges.Length];

for (int p = 0; p < edges.Length; p++)
{
if (isDone[p]) continue;
groupedIndexes.Add(DfsIterative(p, adjacency, ref isDone));
}
return groupedIndexes;
}

/// <summary>
/// Group elements using Depth-first search.
/// <para>https://en.wikipedia.org/wiki/Depth-first_search</para>
/// </summary>
/// <param name="edges">The graph. edges[i] = [j, k, l, ...] indicates that there is an edge between i and each element j, k, l, ...</param>
/// <returns>A List of HashSets containing the grouped indexes.</returns>
internal static List<HashSet<int>> GroupIndexes(int[][] edges)
{
int[][] adjacency = new int[edges.Length][];
// one pass O(n)
for (int i = 0; i < edges.Length; i++)
{
HashSet<int> matches = new HashSet<int>();
for (int j = 0; j < edges[i].Length; j++)
int j = edges[i];
if (j != -1)
{
if (edges[i][j] != -1) matches.Add(edges[i][j]);
// i <-> j
adjacency[i].Add(j);
adjacency[j].Add(i);
}

for (int j = 0; j < edges.Length; j++)
{
for (int k = 0; k < edges[j].Length; k++)
{
if (edges[j][k] == i) matches.Add(j);
}
}
adjacency[i] = matches.ToArray();
}

List<HashSet<int>> groupedIndexes = new List<HashSet<int>>();
List<List<int>> groupedIndexes = new List<List<int>>();
bool[] isDone = new bool[edges.Length];

for (int p = 0; p < edges.Length; p++)
{
if (isDone[p]) continue;
if (isDone[p])
{
continue;
}
groupedIndexes.Add(DfsIterative(p, adjacency, ref isDone));
}
return groupedIndexes;
Expand All @@ -290,22 +256,33 @@ internal static List<HashSet<int>> GroupIndexes(int[][] edges)
/// Depth-first search
/// <para>https://en.wikipedia.org/wiki/Depth-first_search</para>
/// </summary>
private static HashSet<int> DfsIterative(int s, int[][] adj, ref bool[] isDone)
private static List<int> DfsIterative(int s, List<int>[] adj, ref bool[] isDone)
{
HashSet<int> group = new HashSet<int>();
Stack<int> S = new Stack<int>();
List<int> group = new List<int>();
Stack<int> S = new Stack<int>(4);
S.Push(s);

isDone[s] = true;
while (S.Count > 0)
{
var u = S.Pop();
if (!isDone[u])
group.Add(u);

#if NET
var currentAdj = System.Runtime.InteropServices.CollectionsMarshal.AsSpan(adj[u]);
int count = currentAdj.Length;
#else
var currentAdj = adj[u];
int count = currentAdj.Count;
#endif
for (int i = 0; i < count; ++i)
{
group.Add(u);
isDone[u] = true;
foreach (var v in adj[u])
var v = currentAdj[i];
ref bool done = ref isDone[v];
if (!done)
{
S.Push(v);
done = true;
}
}
}
Expand Down
Loading