3 changes: 2 additions & 1 deletion .github/workflows/run_microbenchmarks.yml
@@ -12,6 +12,7 @@ on:
jobs:
benchmark:
runs-on: linux.aws.h100
timeout-minutes: 480 # 8 hours (increased from default 6 hours)
strategy:
matrix:
torch-spec:
@@ -22,7 +23,7 @@ jobs:
- name: Setup miniconda
uses: pytorch/test-infra/.github/actions/setup-miniconda@main
with:
python-version: "3.9"
python-version: "3.10"

- name: Run benchmark
shell: bash
@@ -17,4 +17,3 @@ model_params:
torch_compile_mode: "max-autotune"
device: "cuda"
model_type: "linear"
enable_memory_profiler: true
9 changes: 5 additions & 4 deletions benchmarks/microbenchmarks/README.md
@@ -52,17 +52,18 @@ model_params:
device: "cuda" # Options: "cuda", "mps", "xpu", "cpu"
model_type: "linear" # Options: "linear", "ln_linear_sigmoid"
enable_profiler: true # Enable standard profiling
enable_memory_profiler: true # Enable CUDA memory profiling
# enable_memory_visualizer: true # Enable HTML memory visualization (slow)
```

## Configuration Options

### Profiling Options
- `enable_profiler`: Enable standard PyTorch profiling (default: false)
- `enable_memory_profiler`: Enable CUDA memory profiling (default: false)
- `enable_memory_visualizer`: Enable HTML memory visualization (default: false)
- Memory profiling (pickle snapshots + peak stats) ALWAYS runs automatically
- Only works when device is set to "cuda"
- Generates memory snapshots before and after inference
- Creates visualizations of memory usage
- Generates HTML visualizations from memory snapshots (can be slow for large models)
- Set to `true` only when debugging memory issues
- Outputs are saved in the `memory_profiler` subdirectory (see the snapshot-inspection sketch below)
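
Because the pickle snapshots are always written, they can be inspected offline without the HTML step. A minimal sketch follows; the snapshot layout is `torch.cuda.memory._dump_snapshot`'s private format, so the field names may vary across PyTorch versions, and the file path is illustrative:

```python
# Minimal sketch: summarize a saved memory snapshot offline.
# The pickle layout follows torch.cuda.memory._dump_snapshot (a private
# format; keys like "segments"/"total_size" may differ across versions).
import pickle

snapshot_file = "results/memory_profiler/pickle/demo_memory_profile.pickle"  # illustrative path
with open(snapshot_file, "rb") as f:
    snapshot = pickle.load(f)

# "segments" lists the caching allocator's reserved CUDA memory regions.
reserved = sum(seg["total_size"] for seg in snapshot["segments"])
print(f"{len(snapshot['segments'])} segments, {reserved / 1024**2:.1f} MB reserved")
```

Snapshots can also be dragged into the interactive viewer at https://pytorch.org/memory_viz, which avoids generating HTML locally.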

### Quantization Methods
62 changes: 31 additions & 31 deletions benchmarks/microbenchmarks/benchmark_inference.py
@@ -213,7 +213,7 @@ def run(config: BenchmarkConfig) -> BenchmarkResult:
ndigits=2,
)

# Run profiler if enabled
# Run performance profiler if enabled
if config.enable_profiler:
print("Running profiler...")
try:
@@ -230,42 +230,42 @@ def run(config: BenchmarkConfig) -> BenchmarkResult:
except Exception as e:
print(f"Error running profiler: {e}")

# Run memory profiler if enabled
if config.enable_memory_profiler:
print("Running memory profiler...")
try:
# Create memory profiler directory if it doesn't exist
memory_profiler_dir = os.path.join(
config.output_dir, "memory_profiler/pickle"
)
os.makedirs(memory_profiler_dir, exist_ok=True)

# Save memory profile with .pickle extension
result.memory_profile_path, result.memory_stats = (
generate_memory_profile(
model=m_copy,
input_data=input_data,
profile_file_path=os.path.join(
memory_profiler_dir,
f"{config._file_name}_memory_profile.pickle",
),
)
)
# Always run memory profiler to get peak stats and save pickle snapshot (fast)
print("Running memory profiler...")
try:
# Create memory profiler directory if it doesn't exist
memory_profiler_dir = os.path.join(
config.output_dir, "memory_profiler/pickle"
)
os.makedirs(memory_profiler_dir, exist_ok=True)

# Save memory profile with .pickle extension
result.memory_profile_path, result.memory_stats = generate_memory_profile(
model=m_copy,
input_data=input_data,
profile_file_path=os.path.join(
memory_profiler_dir,
f"{config._file_name}_memory_profile.pickle",
),
)

# Generate HTML visualization ONLY if explicitly enabled (slow: minutes to hours)
if config.enable_memory_visualizer:
print("Generating HTML visualization (this may take a while)...")
if result.memory_profile_path:
result.memory_visualization_path = visualize_memory_profile(
result.memory_profile_path
)
except ValueError as e:
if "not enough values to unpack" in str(e):
print(
"Memory profile generation failed due to a known bug; re-run the benchmark to regenerate it. Please raise an issue if the failure persists."
)
except Exception as e:
print(f"Error running memory profiler: {e}")
import traceback
except ValueError as e:
if "not enough values to unpack" in str(e):
print(
"Memory profile generation failed due to a known bug; re-run the benchmark to regenerate it. Please raise an issue if the failure persists."
)
except Exception as e:
print(f"Error running memory profiler: {e}")
import traceback

traceback.print_exc()
traceback.print_exc()

return result
except Exception as e:
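For reference, here is a standalone sketch of the two-step flow this diff implements: the snapshot and peak-stats pass always runs, while HTML rendering stays opt-in. The import path and the toy model are assumptions for illustration, not part of the PR:

```python
# Hedged sketch of the always-snapshot / opt-in-visualize flow above.
import os

import torch

from benchmarks.microbenchmarks.profiler import (  # assumed import path
    generate_memory_profile,
    visualize_memory_profile,
)

model = torch.nn.Linear(1024, 1024).to("cuda")
input_data = torch.randn(16, 1024, device="cuda")

out_dir = "out/memory_profiler/pickle"  # illustrative output location
os.makedirs(out_dir, exist_ok=True)

# Fast path: always runs, writes the pickle snapshot, returns peak stats.
snapshot_path, memory_stats = generate_memory_profile(
    model=model,
    input_data=input_data,
    profile_file_path=os.path.join(out_dir, "demo_memory_profile.pickle"),
)
print(memory_stats)

# Slow path: HTML rendering, gated behind enable_memory_visualizer.
if snapshot_path:
    html_path = visualize_memory_profile(snapshot_path)
```
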
12 changes: 8 additions & 4 deletions benchmarks/microbenchmarks/profiler.py
@@ -73,19 +73,23 @@ def generate_model_profile(model, input_data, profile_file_path):


def generate_memory_profile(model, input_data, profile_file_path):
"""Function to generate CUDA memory profile.
"""Generate CUDA memory profile with snapshot and peak statistics.

This function generates a memory snapshot pickle file and collects peak
memory statistics. HTML visualization is done separately via visualize_memory_profile().

Args:
model: The model to profile
input_data: Input data for the model
profile_file_path: Path to save the memory profile (.pickle)

Returns:
str: Path to the saved profile file.
tuple: (profile_file_path, memory_stats) where memory_stats contains
peak memory usage in MB
"""
if not torch.cuda.is_available():
print("Warning: CUDA is not available. Memory profiling requires CUDA.")
return None
return None, {}
if model is None or input_data is None:
raise ValueError("Model and input_data must not be None.")

@@ -120,7 +124,7 @@ def generate_memory_profile(model, input_data, profile_file_path):
torch.cuda.synchronize()

# Take memory snapshot after inference and save it to the pickle file
torch.cuda.memory._dump_snapshot(profile_file_path)

if _validate_pickle_file(profile_file_path):
print(f"Saved memory profile to {profile_file_path}")
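Under the hood, this helper builds on PyTorch's allocator-history hooks. A minimal sketch of that mechanism in isolation (these are private `torch.cuda.memory` APIs, present in recent releases but not a stable contract):

```python
# Sketch of the underlying snapshot mechanism the helper relies on.
import torch

torch.cuda.memory._record_memory_history(max_entries=100_000)  # start tracking allocations
torch.cuda.reset_peak_memory_stats()

a = torch.randn(4096, 4096, device="cuda")
b = a @ a  # some CUDA work for the history to capture
torch.cuda.synchronize()

torch.cuda.memory._dump_snapshot("snapshot.pickle")  # what the helper persists
torch.cuda.memory._record_memory_history(enabled=None)  # stop tracking

peak_mb = torch.cuda.max_memory_allocated() / (1024**2)
print(f"peak allocated: {peak_mb:.1f} MB")  # the kind of stat returned in memory_stats
```
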
8 changes: 4 additions & 4 deletions benchmarks/microbenchmarks/test/benchmark_config.yml
@@ -17,7 +17,7 @@ model_params:
device: "cuda"
model_type: "linear"
enable_profiler: true # Enable profiling for this model
enable_memory_profiler: true # Enable memory profiling for this model
enable_memory_visualizer: true # Enable memory visualization for this model

- name: "ln_linear_sigmoid_cuda"
matrix_shapes:
@@ -30,7 +30,7 @@ model_params:
device: "cuda"
model_type: "ln_linear_sigmoid"
enable_profiler: true
enable_memory_profiler: true
enable_memory_visualizer: true

- name: "bf16_transformer_block"
matrix_shapes:
@@ -43,7 +43,7 @@ model_params:
device: "cuda"
model_type: "transformer_block" # TODO: Add a custom model (Figure out how to do this, maybe pass a .py file with model definition)
enable_profiler: true
enable_memory_profiler: true
enable_memory_visualizer: true

- name: "large_bf16_ln_linear"
matrix_shapes:
@@ -59,4 +59,4 @@ model_params:
device: "cuda"
model_type: "linear"
enable_profiler: true
enable_memory_profiler: true
enable_memory_visualizer: true
3 changes: 0 additions & 3 deletions benchmarks/microbenchmarks/test/test_benchmark_profiler.py
@@ -162,7 +162,6 @@ def test_memory_profiler_enabled(self):
quantization=None,
sparsity=None,
params={
"enable_memory_profiler": True,
"device": "cuda",
},
shape_name="test",
@@ -201,7 +200,6 @@ def test_memory_profiler_visualization(self):
quantization=None,
sparsity=None,
params={
"enable_memory_profiler": True,
"device": "cuda",
},
shape_name="test",
@@ -255,7 +253,6 @@ def test_memory_profiler_cuda_unavailable(self):
quantization=None,
sparsity=None,
params={
"enable_memory_profiler": True,
"device": "cpu", # Force CPU to test CUDA unavailable case
},
shape_name="test",
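With the `enable_memory_profiler` knob gone, the CUDA-unavailable path is covered by the helper's new return contract. A hedged test sketch against the `(None, {})` fallback shown in profiler.py above (the import path is an assumption):

```python
# Hedged sketch: verify the (None, {}) fallback when CUDA is unavailable.
import unittest
from unittest import mock

from benchmarks.microbenchmarks.profiler import generate_memory_profile  # assumed path


class TestMemoryProfilerFallback(unittest.TestCase):
    def test_returns_none_and_empty_stats_without_cuda(self):
        # Force the CUDA-unavailable branch regardless of the host machine.
        with mock.patch("torch.cuda.is_available", return_value=False):
            path, stats = generate_memory_profile(
                model=object(), input_data=object(), profile_file_path="x.pickle"
            )
        self.assertIsNone(path)
        self.assertEqual(stats, {})


if __name__ == "__main__":
    unittest.main()
```
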
6 changes: 4 additions & 2 deletions benchmarks/microbenchmarks/utils.py
@@ -82,7 +82,9 @@ def __init__(
f"benchmark_{self.quantization}_{self.model_type}_m{self.m}_k{self.k}_n{self.n}{'_compile'}",
)
self.enable_profiler = bool(params.get("enable_profiler", False))
self.enable_memory_profiler = bool(params.get("enable_memory_profiler", False))
self.enable_memory_visualizer = bool(
params.get("enable_memory_visualizer", False)
)
# Create profiler directory path without leading slash
profiler_dir = os.path.join(self.output_dir, "profiler")
os.makedirs(profiler_dir, exist_ok=True)
@@ -108,7 +110,7 @@ def to_dict(self) -> Dict[str, Any]:
"model_type": self.model_type,
"output_dir": self.output_dir,
"enable_profiler": self.enable_profiler,
"enable_memory_profiler": self.enable_memory_profiler,
"enable_memory_visualizer": self.enable_memory_visualizer,
}


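A quick sketch of how the new flag flows out of a params dict, mirroring the `params.get(...)` pattern above; the dict contents are illustrative:

```python
# Illustrative params dict -> flag parsing, mirroring BenchmarkConfig above.
params = {
    "enable_profiler": True,
    # "enable_memory_visualizer" left out: HTML rendering defaults to off
}

enable_profiler = bool(params.get("enable_profiler", False))  # -> True
enable_memory_visualizer = bool(params.get("enable_memory_visualizer", False))  # -> False
```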