@@ -41,19 +41,28 @@ def input_fn():
             # step.
             dp_degree = parallel_dims.dp_replicate * parallel_dims.dp_shard
             global_batch_size = job_config.training.local_batch_size * dp_degree
-        return torch.randint(
-            0,
-            # job_config.training.vocab_size,
-            model.vocab_size,
-            (global_batch_size, job_config.training.seq_len),
-            device=torch.device("cuda"),
-        ),
+        return (
+            torch.randint(
+                0,
+                # job_config.training.vocab_size,
+                model.vocab_size,
+                (global_batch_size, job_config.training.seq_len),
+                device=torch.device("cuda"),
+            ),
+        )
 
     # TODO make autop work correctly with different combinations of DP, DP+TP, TP, and support DDP / HSDP
     assert parallel_dims.dp_replicate_enabled is False, "DDP not supported yet"
     assert parallel_dims.cp_enabled is False, "CP not supported yet"
     assert parallel_dims.pp_enabled is False, "PP not supported yet"
 
+    torch._inductor.config.bucket_all_gathers_fx_bucket_size_determinator = (
+        lambda bucket_idx: 500 / parallel_dims.tp
+    )
+    torch._inductor.config.bucket_reduce_scatters_fx_bucket_size_determinator = (
+        lambda bucket_idx: 1000 / parallel_dims.tp
+    )
+
     # bail out
     # model = model_fn()
     # return model
@@ -64,7 +73,13 @@ def input_fn():
     param_dtype = TORCH_DTYPE_MAP[job_config.training.mixed_precision_param]
     reduce_dtype = TORCH_DTYPE_MAP[job_config.training.mixed_precision_reduce]
     mp_policy = MixedPrecisionPolicy(param_dtype=param_dtype, reduce_dtype=reduce_dtype)
-    with AutoParallel(model, input_fn, world_mesh, mp_policy=mp_policy, compile=job_config.training.compile) as autop:
+    with AutoParallel(
+        model,
+        input_fn,
+        world_mesh,
+        mp_policy=mp_policy,
+        compile=job_config.training.compile,
+    ) as autop:
         autop.add_parameter_memory_constraint(low=None, high=None)
 
         possible_input_shardings = {
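For reference, a minimal standalone sketch of the pattern this diff moves to, under two assumptions not stated in the diff itself: that AutoParallel expects input_fn to return a tuple of example inputs (hence the explicit one-element tuple), and that the inductor bucket-size determinators are called once per bucket index and return a size budget that should shrink as the TP degree grows. The values vocab_size, global_batch_size, seq_len, and tp_degree below are illustrative placeholders, and the torch._inductor.config knobs require a PyTorch build that exposes them plus a CUDA device.

import torch

# Illustrative placeholders standing in for job_config / parallel_dims fields.
vocab_size, global_batch_size, seq_len, tp_degree = 32768, 8, 2048, 2


def input_fn():
    # Return a one-element tuple of example inputs rather than a bare tensor,
    # mirroring the change in the hunk above.
    return (
        torch.randint(
            0,
            vocab_size,
            (global_batch_size, seq_len),
            device=torch.device("cuda"),
        ),
    )


# Scale per-bucket communication sizes inversely with the TP degree, as the
# added config lines do (assumed semantics: the lambda is invoked once per
# bucket index and returns a bucket size budget).
torch._inductor.config.bucket_all_gathers_fx_bucket_size_determinator = (
    lambda bucket_idx: 500 / tp_degree
)
torch._inductor.config.bucket_reduce_scatters_fx_bucket_size_determinator = (
    lambda bucket_idx: 1000 / tp_degree
)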