configurable explore 2d option

nv-lschneider · nv-lschneider · commit ca6dd60d5a4e · 2025-11-05T16:56:53.000-06:00
Signed-off-by: Ludwig Schneider &lt;lschneider@nvidia.com&gt;
diff --git a/tests/microbenchmarks/all_reduce.py b/tests/microbenchmarks/all_reduce.py
@@ -126,10 +126,12 @@ def allreduce_benchmark(
     strategy: str = None,
     inner_loop: int = 200,
     outer_loop: int = 10,
+    tokens_range: str = "1,16384,2",
+    hidden_sizes_range: str = "128,8192,2",
 ):
     """
     Benchmark AllReduce operations.
-    
+
     Args:
         dtype: Data type for benchmarking
         test_range: Range specification (min,max,ratio)
@@ -139,6 +141,8 @@ def allreduce_benchmark(
         strategy: Specific strategy to test (if None, tests default set: NCCL, NCCL_SYMMETRIC, NCCL_DEVICE, MNNVL)
         inner_loop: Number of iterations per timing measurement (default: 200)
         outer_loop: Number of timing measurements to take (default: 10)
+        tokens_range: Range for number of tokens in 2D mode (min,max,ratio) (default: "1,16384,2")
+        hidden_sizes_range: Range for hidden sizes in 2D mode (min,max,ratio) (default: "128,8192,2")
     """
     tllm.logger.set_level('error')
     world_size = tllm.mpi_world_size()
@@ -166,15 +170,36 @@ def allreduce_benchmark(
 
     # Parse test range
     min_size, max_size, ratio = [int(i) for i in test_range.split(",")]
-    
+
     # generate shape list
     shape_list = []
 
     if explore_2d:
-        num_seqs_list = [
-            1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384
+        # Parse tokens range
+        min_tokens, max_tokens, tokens_ratio = [
+            int(i) for i in tokens_range.split(",")
+        ]
+
+        # Parse hidden sizes range
+        min_hidden, max_hidden, hidden_ratio = [
+            int(i) for i in hidden_sizes_range.split(",")
         ]
-        hidden_size_list = [128, 256, 512, 1024, 2048, 4096, 8192]
+
+        # Generate token counts list
+        num_seqs_list = []
+        current = min_tokens
+        while current <= max_tokens:
+            num_seqs_list.append(current)
+            current *= tokens_ratio
+
+        # Generate hidden sizes list
+        hidden_size_list = []
+        current = min_hidden
+        while current <= max_hidden:
+            hidden_size_list.append(current)
+            current *= hidden_ratio
+
+        # Create all combinations
         for num_tokens, hidden_size in product(num_seqs_list, hidden_size_list):
             shape_list.append((num_tokens, hidden_size))
     else:
@@ -196,7 +221,7 @@ def allreduce_benchmark(
         AllReduceFusionOp.RESIDUAL_RMS_NORM_QUANT_FP8,
         AllReduceFusionOp.RESIDUAL_RMS_NORM_QUANT_NVFP4,
     ]
-    
+
     # Map strategy names to enum values
     strategy_map = {
         "NCCL": AllReduceStrategy.NCCL,
@@ -209,12 +234,14 @@ def allreduce_benchmark(
         "TWOSHOT": AllReduceStrategy.TWOSHOT,
         "AUTO": AllReduceStrategy.AUTO,
     }
-    
+
     # Select strategies based on input
     if strategy:
         # Single strategy specified
         if strategy.upper() not in strategy_map:
-            raise ValueError(f"Unknown strategy: {strategy}. Available: {', '.join(strategy_map.keys())}")
+            raise ValueError(
+                f"Unknown strategy: {strategy}. Available: {', '.join(strategy_map.keys())}"
+            )
         strategies = [strategy_map[strategy.upper()]]
     else:
         # Default: test main strategies
@@ -224,38 +251,44 @@ def allreduce_benchmark(
             AllReduceStrategy.NCCL_DEVICE,
             AllReduceStrategy.MNNVL,
         ]
-    
+
     # Validate strategy compatibility for user buffer initialization
     # NCCL_SYMMETRIC and NCCL_DEVICE need UB with use_multicast=True
     # UB strategy needs UB with use_multicast=False
     # These two groups cannot be mixed in a single run
-    ub_multicast_strategies = {AllReduceStrategy.NCCL_SYMMETRIC, AllReduceStrategy.NCCL_DEVICE}
+    ub_multicast_strategies = {
+        AllReduceStrategy.NCCL_SYMMETRIC, AllReduceStrategy.NCCL_DEVICE
+    }
     ub_no_multicast_strategies = {AllReduceStrategy.UB}
-    
-    has_multicast_strategies = any(s in ub_multicast_strategies for s in strategies)
-    has_no_multicast_strategies = any(s in ub_no_multicast_strategies for s in strategies)
-    
+
+    has_multicast_strategies = any(s in ub_multicast_strategies
+                                   for s in strategies)
+    has_no_multicast_strategies = any(s in ub_no_multicast_strategies
+                                      for s in strategies)
+
     # Error out if incompatible strategies are mixed
     if has_multicast_strategies and has_no_multicast_strategies:
-        multicast_strats = [s.name for s in strategies if s in ub_multicast_strategies]
-        no_multicast_strats = [s.name for s in strategies if s in ub_no_multicast_strategies]
+        multicast_strats = [
+            s.name for s in strategies if s in ub_multicast_strategies
+        ]
+        no_multicast_strats = [
+            s.name for s in strategies if s in ub_no_multicast_strategies
+        ]
         raise ValueError(
             f"Incompatible strategies selected: {multicast_strats} require use_multicast=True "
             f"while {no_multicast_strats} require use_multicast=False. "
-            f"Please run these strategies separately using --strategy."
-        )
-    
+            f"Please run these strategies separately using --strategy.")
+
     # Initialize user buffers if any strategy needs it
     needs_ub = has_multicast_strategies or has_no_multicast_strategies
-    
+
     if needs_ub:
         max_bytes = max_size * dtype_size_bytes
         use_multicast = has_multicast_strategies  # True for NCCL_SYMMETRIC/NCCL_DEVICE, False for UB
-        
-        ub.initialize_userbuffers_manager(
-            world_size, 1, 1, rank,
-            torch.cuda.device_count(), max_bytes, use_multicast
-        )
+
+        ub.initialize_userbuffers_manager(world_size, 1, 1, rank,
+                                          torch.cuda.device_count(), max_bytes,
+                                          use_multicast)
 
     df = pd.DataFrame()
     for (num_tokens, hidden_size) in shape_list:
@@ -285,6 +318,10 @@ def allreduce_benchmark(
             if fusion == AllReduceFusionOp.RESIDUAL_RMS_NORM_QUANT_NVFP4 and sm_version < 100:
                 continue
 
+            # UB strategy doesn't support NONE fusion
+            if strategy == AllReduceStrategy.UB and fusion == AllReduceFusionOp.NONE:
+                continue
+
             median_ms = profile_allreduce(
                 mapping=mapping,
                 enable_cudagraph=enable_cudagraph,
@@ -327,26 +364,59 @@ def allreduce_benchmark(
 
 if __name__ == "__main__":
     parser = ArgumentParser()
-    parser.add_argument("--dtype", "-t", default="bfloat16",
+    parser.add_argument("--dtype",
+                        "-t",
+                        default="bfloat16",
                         help="Data type for benchmarking")
     parser.add_argument(
         "--range",
         "-r",
         default="256,256000000,4",  # 256 to 256M
         help="min_size,max_size,multiplicative_ratio")
-    parser.add_argument("--explore_2d", action="store_true", default=False,
-                        help="Explore 2D parameter space (num_tokens x hidden_size)")
-    parser.add_argument("--enable_cudagraph", action="store_true",
+    parser.add_argument(
+        "--explore_2d",
+        action="store_true",
+        default=False,
+        help="Explore 2D parameter space (num_tokens x hidden_size)")
+    parser.add_argument("--enable_cudagraph",
+                        action="store_true",
                         help="Enable CUDA graph capture")
-    parser.add_argument("--save_csv", type=str, default=None,
+    parser.add_argument("--save_csv",
+                        type=str,
+                        default=None,
                         help="Path to save CSV results")
-    parser.add_argument("--strategy", type=str, default=None,
-                        help="Test specific strategy. If not specified, defaults to: NCCL, NCCL_SYMMETRIC, NCCL_DEVICE, MNNVL. "
-                             "Available: NCCL, NCCL_SYMMETRIC, NCCL_DEVICE, MNNVL, MIN_LATENCY, UB, ONESHOT, TWOSHOT, AUTO")
-    parser.add_argument("--inner_loop", type=int, default=200,
-                        help="Number of iterations per timing measurement (default: 200)")
-    parser.add_argument("--outer_loop", type=int, default=10,
-                        help="Number of timing measurements to take (default: 10)")
+    parser.add_argument(
+        "--strategy",
+        type=str,
+        default=None,
+        help=
+        "Test specific strategy. If not specified, defaults to: NCCL, NCCL_SYMMETRIC, NCCL_DEVICE, MNNVL. "
+        "Available: NCCL, NCCL_SYMMETRIC, NCCL_DEVICE, MNNVL, MIN_LATENCY, UB, ONESHOT, TWOSHOT, AUTO"
+    )
+    parser.add_argument(
+        "--inner_loop",
+        type=int,
+        default=200,
+        help="Number of iterations per timing measurement (default: 200)")
+    parser.add_argument(
+        "--outer_loop",
+        type=int,
+        default=10,
+        help="Number of timing measurements to take (default: 10)")
+    parser.add_argument(
+        "--tokens_range",
+        type=str,
+        default="1,16384,2",
+        help=
+        "Range for number of tokens in 2D mode: min,max,ratio (default: 1,16384,2)"
+    )
+    parser.add_argument(
+        "--hidden_sizes_range",
+        type=str,
+        default="128,8192,2",
+        help=
+        "Range for hidden sizes in 2D mode: min,max,ratio (default: 128,8192,2)"
+    )
 
     args = parser.parse_args()
 
@@ -359,4 +429,6 @@ def allreduce_benchmark(
         strategy=args.strategy,
         inner_loop=args.inner_loop,
         outer_loop=args.outer_loop,
+        tokens_range=args.tokens_range,
+        hidden_sizes_range=args.hidden_sizes_range,
     )