Try speeding up CPU ops

neverix · neverix · commit 457136fb97ee · 2025-03-25T20:05:00.000Z
diff --git a/data_analyzer.py b/data_analyzer.py
@@ -231,3 +231,4 @@ def hadamard_matrix(n):
 plt.yscale("log")
 plt.legend()
 # %%
+
diff --git a/download_stuff.py b/download_stuff.py
@@ -1,9 +1,10 @@
 from huggingface_hub import HfApi, hf_hub_download
 from tqdm.auto import tqdm
 import os
+import argparse
+from pathlib import Path
 
 
-# Thanks Deepseek
 def download_subdirectory(repo_id, subdir, local_dir, repo_type="model"):
     print(f"Downloading {subdir} from {repo_id} to {local_dir}")
     # Initialize HfApi
@@ -32,13 +33,27 @@ def download_subdirectory(repo_id, subdir, local_dir, repo_type="model"):
         )
     print(f"Downloaded {len(target_files)} files to {local_dir}")
 
-# Usage
-
-repo_id = "nev/flux1-saes"
-for directory_name in "sae_double_l18_img".split():
-    download_subdirectory(
-        repo_id=repo_id,
-        subdir=directory_name,
-        local_dir="somewhere"
-    )
+def main():
+    """Parse --repo/--dirs/--output and download each requested directory."""
+    cli = argparse.ArgumentParser(description="Download directories from Hugging Face")
+    cli.add_argument("--repo", type=str, default="dmitriihook/flux1-saes",
+                     help="HuggingFace repository ID")
+    cli.add_argument("--dirs", type=str, nargs="+", help="Directories to download",
+                     default=["maxacts_itda_50k_256/itda_new_data", "maxacts_itda_50k_256"])
+    cli.add_argument("--output", type=str, default="somewhere",
+                     help="Local directory to save files")
+    args = cli.parse_args()
+    output_root = Path(args.output)
     
+    for subdir in args.dirs:
+        # Pre-create the parent of the mirrored path; the download call itself
+        # is still rooted at the --output directory.
+        (output_root / subdir).parent.mkdir(parents=True, exist_ok=True)
+        download_subdirectory(
+            repo_id=args.repo,
+            subdir=subdir,
+            local_dir=args.output,
+        )
+
+if __name__ == "__main__":
+    main()
diff --git a/main.py b/main.py
@@ -41,6 +41,81 @@
         break
 app, rt = fast_app(hdrs=plotly_headers)
 
+
+# Add a function to compute spatial metrics for a feature
+def compute_spatial_metrics(feature_id):
+    """Average per-image spatial statistics of one feature's activations.
+
+    Rows from ``scored_storage.get_rows(feature_id)`` are scattered into one
+    HEIGHT x WIDTH grid per image index. Images whose grid sums to zero or
+    has no positive cell are skipped.
+
+    Args:
+        feature_id: Key passed to ``scored_storage.get_rows``.
+
+    Returns:
+        A dict holding the per-image means of ``spatial_spread``
+        (activation-weighted mean distance from the centre of mass),
+        ``concentration_ratio`` (share of the total activation held by the
+        top quarter of positive cells) and ``activation_area`` (fraction of
+        grid cells that are positive), plus ``num_images``. ``None`` when
+        no image qualifies.
+    """
+    # One dense grid per image; a repeated (idx, h, w) keeps the last score.
+    grids = {}
+    for (idx, h, w), score in scored_storage.get_rows(feature_id):
+        if idx not in grids:
+            grids[idx] = np.zeros((HEIGHT, WIDTH), dtype=float)
+        grids[idx][h, w] = score
+
+    # Coordinate grids are identical for every image, so build them once.
+    h_indices, w_indices = np.indices((HEIGHT, WIDTH))
+    n_cells = HEIGHT * WIDTH
+
+    spreads, concentrations, areas = [], [], []
+    for grid in grids.values():
+        total_activation = grid.sum()
+        if total_activation == 0:
+            continue
+        # Only strictly positive cells count as active.
+        active_values = grid[grid > 0]
+        if active_values.size == 0:
+            continue
+
+        if total_activation > 0:
+            # Activation-weighted centre of mass.
+            center_h = np.sum(h_indices * grid) / total_activation
+            center_w = np.sum(w_indices * grid) / total_activation
+            # Spatial spread: weighted mean distance from that centre.
+            distances = np.sqrt((h_indices - center_h)**2 + (w_indices - center_w)**2)
+            avg_distance = np.sum(distances * grid) / total_activation
+            # Share of the total carried by the top 25% of active cells
+            # (always at least one cell).
+            quarter_point = max(1, active_values.size // 4)
+            top_values = np.sort(active_values)[::-1][:quarter_point]
+            concentration_ratio = np.sum(top_values) / total_activation
+        else:
+            # Negative scores outweigh the positive ones; keep reporting zeros
+            # for both ratios in that case.
+            avg_distance = concentration_ratio = 0.0
+
+        spreads.append(float(avg_distance))
+        concentrations.append(float(concentration_ratio))
+        areas.append(active_values.size / n_cells)
+
+    if not spreads:
+        return None
+    # Aggregate across the images that had activations.
+    return {
+        "spatial_spread": float(np.mean(spreads)),
+        "concentration_ratio": float(np.mean(concentrations)),
+        "activation_area": float(np.mean(areas)),
+        "num_images": len(spreads),
+    }
+
+# Cache for spatial metrics to avoid recomputation
+spatial_metrics_cache = {}
+
 @rt("/cached_image/{image_id}")
 def cached_image(image_id: int):
     img_path = image_cache_dir / f"{image_id}.jpg"
@@ -79,6 +154,8 @@ def top_features():
         Br(),
         H1(f"Spatial sparsity: {spatial_sparsity():.3f}"),
         Br(),
+        P(A("View Spatial Metrics", href="/spatial_metrics")),
+        Br(),
         *[Card(
             P(f"Feature {i}, Frequency: {frequencies[i]:.5f}, Max: {maxima[i]}"),
             A("View Max Acts", href=f"/maxacts/{i}")
@@ -133,6 +210,15 @@ def maxacts(feature_id: int):
         # Add score to the corresponding location in the grid
         grouped_rows[key][h, w] = score
 
+    # Compute spatial metrics for this feature if not already cached
+    if feature_id not in spatial_metrics_cache:
+        spatial_metrics_cache[feature_id] = compute_spatial_metrics(feature_id)
+    
+    metrics = spatial_metrics_cache[feature_id]
+    metrics_display = ""
+    if metrics:
+        metrics_display = f"Spatial Spread: {metrics['spatial_spread']:.3f}, Concentration: {metrics['concentration_ratio']:.3f}, Active Area: {metrics['activation_area']:.3f}"
+    
     # Prepare images and cards
     imgs = []
     for idx, grid in sorted(grouped_rows.items(), key=lambda x: x[1].max(), reverse=True)[:20]:
@@ -191,10 +277,73 @@ def maxacts(feature_id: int):
 
     return Div(
         P(A("<- Go back", href="/top_features")),
+        H2(f"Feature {feature_id} Spatial Metrics: {metrics_display}"),
         Div(*imgs, style="display: flex; flex-wrap: wrap; gap: 20px; justify-content: center"),
         style="padding: 20px"
     )
 
+# Add a new endpoint to view spatial metrics for all features
+@rt("/spatial_metrics")
+def spatial_metrics_view():
+    """Render the overview page of per-feature spatial metrics.
+
+    Features whose maximum score exceeds 4 get their metrics computed (memoised
+    in ``spatial_metrics_cache``), plotted as activation area against
+    concentration ratio, and the 50 with the smallest area are shown as cards.
+    """
+    maxima = scored_storage.key_maxima()
+
+    # Filter features with significant activations
+    cond = maxima > 4
+    features = np.arange(len(scored_storage))[cond]
+
+    # Compute metrics for all features (with caching)
+    all_metrics = []
+    for feature_id in features:
+        if feature_id not in spatial_metrics_cache:
+            spatial_metrics_cache[feature_id] = compute_spatial_metrics(feature_id)
+
+        metrics = spatial_metrics_cache[feature_id]
+        if metrics:
+            all_metrics.append({"feature_id": int(feature_id), **metrics})
+
+    # Sort by activation area (from most concentrated to most dispersed)
+    all_metrics.sort(key=lambda x: x["activation_area"])
+
+    # Scatter plot of activation area vs concentration ratio. activation_area is
+    # a 0-1 fraction, so scale it by 100 to match the "% of image" axis label.
+    scatter_plot = plotly2fasthtml(px.scatter(
+        x=[100 * m["activation_area"] for m in all_metrics],
+        y=[m["concentration_ratio"] for m in all_metrics],
+        hover_name=[f"Feature {m['feature_id']}" for m in all_metrics],
+        labels={"x": "Activation Area (% of image)", "y": "Concentration Ratio"},
+        title="Spatial Concentration Analysis"
+    ))
+
+    # Create cards for features
+    feature_cards = [
+        Card(
+            P(f"Feature {m['feature_id']}"),
+            P(f"Concentration: {m['concentration_ratio']:.3f}"),
+            # ".3%" multiplies the fraction by 100 and appends the percent sign.
+            P(f"Active Area: {m['activation_area']:.3%}"),
+            P(f"Spatial Spread: {m['spatial_spread']:.3f}"),
+            A("View Max Acts", href=f"/maxacts/{m['feature_id']}"),
+            style="width: 200px; margin: 10px;"
+        ) for m in all_metrics[:50]  # Show top 50 most concentrated features
+    ]
+
+    return Div(
+        H1("Spatial Metrics Analysis"),
+        P(A("<- Go back", href="/top_features")),
+        Br(),
+        scatter_plot,
+        Br(),
+        H2("Most Concentrated Features (Lowest Activation Area)"),
+        Div(*feature_cards, style="display: flex; flex-wrap: wrap; justify-content: center;"),
+        style="padding: 20px;"
+    )
+
 NUM_PROMPTS = 4
 
 @rt("/gen_image", methods=["GET"])
@@ -248,8 +397,9 @@ def home():
         H1("fae"),
         H2("SAE"),
         P(A("Top features", href="/top_features")),
+        P(A("Spatial Metrics", href="/spatial_metrics")),
         P(A("Generator", href="/gen_image")),
         style="padding: 5em"
     )
 
-serve()
+serve()
diff --git a/run_all.sh b/run_all.sh
@@ -16,5 +16,35 @@ set -e
 
 # 768 = 512 + (256 = 16*16 = 256/16 * 256/16)
 # 1536 = 512 + (1024 = 32*32 = 512/16 * 512/16)
+# Run with default settings
+# uv run python run_fae.py
 
-uv run python -m src.fae.sae_trainer --train_mode=False -seq_len=1536 --batch_size=4 --restore_from=somewhere/sae_double_l18_img
+# Specify a different cache path
+# uv run python run_fae.py --cache-path="somewhere/other_cache_dir"
+
+# Change image dimensions
+# uv run python run_fae.py --width=768 --height=768
+
+# Use a different port
+# uv run python run_fae.py --port=5002
+
+# Only compute and output metrics without starting the server
+# uv run python run_fae.py --metrics-only
+
+# Clear the image cache before starting
+# uv run python run_fae.py --clear-image-cache
+
+# uv run python -m src.fae.sae_trainer --train_mode=False -seq_len=1536 --batch_size=4 --restore_from=somewhere/sae_double_l18_img
+# uv run python -m src.fae.sae_trainer --train_mode=True -seq_len=1536 --timesteps=4 --batch_size=4 --restore_from=somewhere/sae_double_l18_img
+# uv run python -m src.fae.sae_trainer
+# uv run python -m src.fae.sae_trainer --train_mode=False
+
+# uv run python -m src.fae.sae_trainer --layer=18 --block_type=double --train_mode=False --restore_from=somewhere/sae_double_l18_img
+# uv run python -m src.fae.sae_trainer --layer=12 --block_type=double --sae_train_every=1
+
+# uv run python -m src.fae.sae_trainer --layer=3 --block_type=double
+# uv run python -m src.fae.sae_trainer --layer=3 --block_type=double --train_mode=False
+# uv run python -m src.fae.sae_trainer --layer=6 --block_type=double
+# uv run python -m src.fae.sae_trainer --layer=6 --block_type=double --train_mode=False
+uv run python -m src.fae.sae_trainer --layer=15 --block_type=double
+uv run python -m src.fae.sae_trainer --layer=15 --block_type=double --train_mode=False
diff --git a/src/fae/sae_common.py b/src/fae/sae_common.py
@@ -1,4 +1,5 @@
 import jax
+import equinox as eqx
 import jax.numpy as jnp
 from dataclasses import dataclass
 from jaxtyping import Array, Float, UInt
@@ -143,6 +144,7 @@ def width_and_height(self):
         width_and_height = math.isqrt(width_height_product)
         return width_and_height, width_and_height
 
+    @eqx.filter_jit
     def cut_up(
         self, training_data: Float[Array, "*batch seq_len d_model"]
     ) -> Float[Array, "full_batch_size d_model"]:
diff --git a/src/fae/sae_trainer.py b/src/fae/sae_trainer.py
@@ -631,6 +631,7 @@ def main(*, restore: bool = False,
         **(dict(sae_train_every=1,
         sae_batch_size_multiplier=1)
         if not train_mode else {}),
+        site = (block_type, layer),
         **extra_config_items,
     )
     if not train_mode and not restore:
@@ -660,6 +661,17 @@ def main(*, restore: bool = False,
     cycle_detected = False
     activation_cache = []
     # normalization_hack = None
+    
+    @jax.jit
+    def process_reaped(reaped):
+        # Join txt+img tokens (double blocks), merge the two leading axes, cut up.
+        if block_type != "double":
+            stacked = reaped["single.resid"]
+        else:
+            parts = (reaped["double.resid.txt"], reaped["double.resid.img"])
+            stacked = jnp.concatenate(parts, axis=-2)
+        return config.cut_up(stacked.reshape(-1, *stacked.shape[2:]))
+    
     for step, prompts in zip(range(config.n_steps), chunked(prompts_iterator, config.batch_size)):
         if len(prompts) < config.batch_size:
             logger.warning("End of dataset")
@@ -686,46 +698,47 @@ def main(*, restore: bool = False,
         reaped = outputs[1].reaped  # (timesteps, batch, sequence_length, d_model)
         assert isinstance(images, jnp.ndarray)  # to silence mypy
         logger.add(sys.stderr, level="INFO")
-        if block_type == "double":
-            training_data = jnp.concatenate((reaped[f"double.resid.txt"], reaped[f"double.resid.img"]), axis=-2)
-        else:
-            training_data = reaped[f"single.resid"]
-        training_data = training_data.reshape(-1, *training_data.shape[2:])  # (timesteps, sequence_length, d_model)
-        training_data = config.cut_up(training_data)
+        training_data = process_reaped(reaped)
         take_data = int(len(training_data) * config.use_data_fraction)
         if config.transfer_to_cpu:
-            training_data = np.asarray(training_data)
+            training_data = np.array(training_data)
             np.random.shuffle(training_data)
             training_data = training_data[:take_data]
         else:
             training_data = jax.random.permutation(key, training_data)[:take_data]
-            training_data = jax.device_put(
-                training_data,
-                jax.sharding.NamedSharding(sae_trainer.mesh, jax.sharding.PartitionSpec("dp", *((None,) * (training_data.ndim - 1)))))
         activation_cache.append(training_data)
         if len(activation_cache) >= config.sae_train_every:
             assert config.sae_train_every % config.sae_batch_size_multiplier == 0
             for inner_step in range(config.sae_train_every // config.sae_batch_size_multiplier):
                 if train_mode:
-                    cache_data = np.concatenate(activation_cache, axis=0)
-                    if config.transfer_to_cpu and inner_step == 0:
-                        np.random.shuffle(cache_data)
+                    if config.transfer_to_cpu:
+                        cache_data = np.concatenate(activation_cache, axis=0)
+                        if inner_step == 0:
+                            np.random.shuffle(cache_data)
+                    else:
+                        if len(activation_cache) == 1:
+                            cache_data = activation_cache[0]
+                        else:
+                            cache_data = jnp.concatenate(activation_cache, axis=0)
+                        # cache_data = jax.jit(lambda x: jnp.concatenate(x, axis=0))(activation_cache)
                     if len(cache_data) == config.train_batch_size:
                         training_data, activation_cache = cache_data, []
                     else:
                         activation_cache = [cache_data[config.train_batch_size:]]
                         training_data = cache_data[:config.train_batch_size]
+                    if config.transfer_to_cpu:
+                        training_data = jnp.asarray(training_data)
                 else:
                     assert config.sae_batch_size_multiplier == config.sae_train_every == 1
-                if int(sae_trainer.sae_trainer.sae_logic.info.n_steps) == 0 and config.use_pca:
+                if int(sae_trainer.sae_trainer.sae_logic.info.n_steps) == 0 and config.use_pca and config.do_update:
                     sae_trainer.sae_trainer = eqx.tree_at(
                         lambda x: x.sae_logic.info.whitening_matrix,
                         sae_trainer.sae_trainer,
                         compute_whitening(training_data)
                     )
                 sae_outputs = sae_trainer.step(
                     jax.device_put(
-                        jnp.asarray(training_data),
+                        training_data,
                         jax.sharding.NamedSharding(sae_trainer.mesh, jax.sharding.PartitionSpec("dp", None))))
                 if not train_mode:
                     sae_weights, sae_indices = map(