Add code for explorer cluster history #17

martin-ueding · Dec 3, 2023 · 122606d · 122606d
1 parent 2d1f8e0
commit 122606d
Showing 1 changed file with 152 additions and 0 deletions.
diff --git a/geo_activity_playground/explorer/clusters.py b/geo_activity_playground/explorer/clusters.py
@@ -0,0 +1,152 @@
+import json
+import pathlib
+from typing import Iterator
+
+import altair as alt
+import pandas as pd
+
+
+def adjacent_to(tile: tuple[int, int]) -> Iterator[tuple[int, int]]:
+    """Yield the four tiles that share an edge with ``tile``.
+
+    The tiles are produced in the order ``x + 1``, ``x - 1``, ``y + 1``,
+    ``y - 1``. Diagonal tiles are not included.
+    """
+    col, line = tile
+    for d_col, d_line in ((1, 0), (-1, 0), (0, 1), (0, -1)):
+        yield (col + d_col, line + d_line)
+
+
+class ExplorerClusterState:
+    """Persisted state of the incremental explorer cluster computation.
+
+    One instance covers a single zoom level. The state is stored in two
+    files below ``Cache/``: a JSON file with the neighbor counts, cluster
+    memberships and the resume position, and a parquet file with the
+    cluster evolution table.
+    """
+
+    def __init__(self, zoom: int) -> None:
+        # For every visited tile: how many of its four edge-adjacent tiles
+        # have been visited as well.
+        self.num_neighbors: dict[tuple[int, int], int] = {}
+        # Maps every cluster tile to the member list of its cluster. All
+        # tiles of one cluster refer to the *same* list object; code that
+        # merges clusters compares these lists by identity.
+        self.cluster_tiles: dict[tuple[int, int], list[tuple[int, int]]] = {}
+        # Table with the columns "time" and "max_cluster_size".
+        self.cluster_evolution = pd.DataFrame()
+        # Number of rows of the tile table that have already been processed.
+        self.start = 0
+
+        self._state_path = pathlib.Path(f"Cache/explorer_cluster_{zoom}_state.json")
+        self._cluster_evolution_path = pathlib.Path(
+            f"Cache/explorer_cluster_{zoom}_evolution.parquet"
+        )
+
+    def load(self) -> None:
+        """Restore the state from the cache files, where they exist.
+
+        Files that do not exist are skipped and leave the corresponding
+        attributes untouched.
+        """
+        if self._state_path.exists():
+            with open(self._state_path) as f:
+                data = json.load(f)
+            self.num_neighbors = {
+                tuple(map(int, key.split("/"))): value
+                for key, value in data["num_neighbors"].items()
+            }
+            # Build every cluster's member list exactly once so that all
+            # tiles of a cluster share one list object again. Creating a
+            # new list per tile would make identity comparisons between
+            # members of the same cluster fail after a reload.
+            clusters = {
+                cluster_id: [tuple(member) for member in members]
+                for cluster_id, members in data["clusters"].items()
+            }
+            self.cluster_tiles = {
+                # JSON object keys are always strings, hence the ``str``.
+                tuple(map(int, key.split("/"))): clusters[str(cluster_id)]
+                for key, cluster_id in data["memberships"].items()
+            }
+            self.start = data["start"]
+
+        if self._cluster_evolution_path.exists():
+            self.cluster_evolution = pd.read_parquet(self._cluster_evolution_path)
+
+    def save(self) -> None:
+        """Write the state to the cache files, creating the directory if needed."""
+        data = {
+            "num_neighbors": {
+                f"{x}/{y}": count for (x, y), count in self.num_neighbors.items()
+            },
+            # ``id`` of the shared member list serves as the cluster's
+            # identifier; it only needs to be unique within this one file.
+            "memberships": {
+                f"{x}/{y}": id(members)
+                for (x, y), members in self.cluster_tiles.items()
+            },
+            "clusters": {
+                id(members): members for members in self.cluster_tiles.values()
+            },
+            "start": self.start,
+        }
+        self._state_path.parent.mkdir(parents=True, exist_ok=True)
+        with open(self._state_path, "w") as f:
+            json.dump(data, f)
+
+        self.cluster_evolution.to_parquet(self._cluster_evolution_path)
+
+
+def get_explorer_cluster_evolution(zoom: int) -> ExplorerClusterState:
+    """Update and return the explorer cluster state for the given zoom level.
+
+    Reads ``Cache/first_time_per_tile_{zoom}.parquet``, orders the tiles by
+    ``first_time`` and processes every row from the persisted resume
+    position onwards. A tile becomes a cluster tile once all four of its
+    edge-adjacent tiles have been visited; adjacent cluster tiles are merged
+    into one cluster. Each time a merge makes the largest cluster grow, a
+    row with ``time`` and ``max_cluster_size`` is appended to the cluster
+    evolution. The updated state is saved before it is returned.
+    """
+    tiles = pd.read_parquet(pathlib.Path(f"Cache/first_time_per_tile_{zoom}.parquet"))
+    tiles = tiles.sort_values("first_time")
+
+    state = ExplorerClusterState(zoom)
+    state.load()
+
+    # Continue from the largest cluster size that has been recorded so far.
+    if len(state.cluster_evolution) > 0:
+        max_cluster_so_far = state.cluster_evolution["max_cluster_size"].iloc[-1]
+    else:
+        max_cluster_so_far = 0
+
+    rows = []
+    for _, row in tiles.iloc[state.start :].iterrows():
+        clusters_merged = False
+        # Current tile. Coerce to plain ``int`` so that the coordinates can
+        # be written to JSON and parsed back regardless of the column dtype.
+        tile = (int(row["tile_x"]), int(row["tile_y"]))
+
+        # This tile is new, therefore it doesn't have an entry in the
+        # neighbor list yet.
+        state.num_neighbors[tile] = 0
+
+        # Go through the adjacent tiles and check whether they are visited.
+        for other in adjacent_to(tile):
+            if other in state.num_neighbors:
+                # The other tile is already visited. That means that the
+                # current tile has a neighbor.
+                state.num_neighbors[tile] += 1
+                # Also the other tile has gained a neighbor.
+                state.num_neighbors[other] += 1
+
+        # Every tile among the current one and its neighbors that now has
+        # all four neighbors starts out as its own single-tile cluster.
+        this_and_neighbors = [tile, *adjacent_to(tile)]
+        for other in this_and_neighbors:
+            if state.num_neighbors.get(other, 0) == 4:
+                state.cluster_tiles[other] = [other]
+
+        for candidate in this_and_neighbors:
+            if candidate not in state.cluster_tiles:
+                continue
+            # The candidate is a cluster tile. See whether any of its
+            # neighbors are cluster tiles of a different cluster; those
+            # clusters need to be joined.
+            for other in adjacent_to(candidate):
+                if other not in state.cluster_tiles:
+                    continue
+                # Tiles of the same cluster share one list object, so an
+                # identity check tells whether they are already joined.
+                if state.cluster_tiles[candidate] is state.cluster_tiles[other]:
+                    continue
+                # The two clusters differ: add the other cluster's tiles to
+                # the candidate's cluster.
+                state.cluster_tiles[candidate].extend(state.cluster_tiles[other])
+                # Point all tiles of the other cluster (including ``other``
+                # itself) to the joined cluster.
+                for member in state.cluster_tiles[other]:
+                    state.cluster_tiles[member] = state.cluster_tiles[candidate]
+                clusters_merged = True
+
+        # NOTE(review): only merges trigger a new data point, so the very
+        # first single-tile cluster is not recorded on its own.
+        if clusters_merged:
+            max_cluster_size = max(
+                (len(members) for members in state.cluster_tiles.values()),
+                default=0,
+            )
+            if max_cluster_size > max_cluster_so_far:
+                rows.append(
+                    {
+                        "time": row["first_time"],
+                        "max_cluster_size": max_cluster_size,
+                    }
+                )
+                # Remember the new maximum so that later merges only add a
+                # row when the largest cluster has grown again.
+                max_cluster_so_far = max_cluster_size
+
+    if rows:
+        state.cluster_evolution = pd.concat(
+            [state.cluster_evolution, pd.DataFrame(rows)], ignore_index=True
+        )
+    state.start = len(tiles)
+    state.save()
+    return state
+
+
+def plot_cluster_evolution(cluster_evolution: pd.DataFrame) -> str:
+    """Render the cluster evolution as an interactive step chart.
+
+    The ``time`` column is put on the x axis and ``max_cluster_size`` on
+    the y axis. The chart is returned as a JSON string in "vega" format.
+    """
+    step_line = alt.Chart(cluster_evolution).mark_line(interpolate="step-after")
+    encoded = step_line.encode(alt.X("time"), alt.Y("max_cluster_size"))
+    interactive_chart = encoded.interactive()
+    return interactive_chart.to_json(format="vega")