2 changes: 1 addition & 1 deletion run_train.sh
@@ -22,7 +22,7 @@ TORCHFT_LIGHTHOUSE=${TORCHFT_LIGHTHOUSE:-"http://localhost:29510"}
if [ "$DRY_RUN" = "1" ]; then
# Dry run mode: validate configuration without GPU/distributed setup
echo "Running in DRY RUN mode - configuration validation only"
-python scripts/dry_run.py --job.config_file ${CONFIG_FILE} "$@"
+NGPU="${NGPU}" LOCAL_RANK=0 python3 -m "${TRAIN_FILE}" --job.config_file "${CONFIG_FILE}" "$@" --comm.local_tensor_mode
else
# Normal training with torchrun
PYTORCH_ALLOC_CONF="expandable_segments:True" \
159 changes: 0 additions & 159 deletions scripts/dry_run.py

This file was deleted.

3 changes: 3 additions & 0 deletions torchtitan/config/job_config.py
@@ -791,6 +791,9 @@ class Comm:
save_traces_file_prefix: str = "rank_"
"""Flight recorder trace files prefix"""

local_tensor_mode: bool = False
"""Local tensor mode, for debugging purposes. This is an experimental feature."""


@dataclass
class MemoryEstimation:
38 changes: 37 additions & 1 deletion torchtitan/distributed/utils.py
@@ -14,6 +14,7 @@
import torch.distributed._functional_collectives as funcol
import torch.distributed.distributed_c10d as c10d
from torch import distributed as dist
from torch.distributed import _local_tensor
from torch.distributed.device_mesh import DeviceMesh
from torch.distributed.tensor import DTensor

@@ -258,12 +259,45 @@ def maybe_enable_amp(
)


def init_local_tensor_mode(world_size: int) -> int:
"""Initialize local tensor mode for debugging purposes.

Args:
world_size: The number of GPUs to simulate

Returns:
The world size
"""
torch.distributed.init_process_group(
"fake",
rank=0,
world_size=world_size,
)
lm = _local_tensor.LocalTensorMode(world_size)
lm.__enter__()
return world_size


def init_distributed(
comm_config: CommConfig,
enable_cpu_backend: bool = False,
base_folder: str = "",
ranks: list[int] | None = None,
-):
+) -> int:
if comm_config.local_tensor_mode:
ngpu_str = os.environ.get("NGPU")
if ngpu_str is None:
raise ValueError(
"NGPU environment variable must be set when using local_tensor_mode"
)
try:
world_size = int(ngpu_str)
except ValueError as e:
raise ValueError(
f"NGPU environment variable must be a valid integer, got: {ngpu_str}"
) from e
return init_local_tensor_mode(world_size)

def _warn_overwrite_env(env, val):
if env in os.environ:
logger.warning(
@@ -309,6 +343,8 @@ def _get_distributed_backend(enable_cpu_backend):
_ranks=ranks if ranks is not None else [],
)

return torch.distributed.get_world_size()


def set_pg_timeouts(timeout, world_mesh):
"""
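For orientation, the local tensor path added above boils down to two calls: initialize a fake process group, then enter LocalTensorMode. Below is a minimal, self-contained sketch of that pattern using only APIs that appear in this diff; LocalTensorMode lives in the private torch.distributed._local_tensor module and is experimental, so the import path and constructor may change.

import torch.distributed as dist
from torch.distributed import _local_tensor

# Stand-in for the value that init_distributed() reads from the NGPU env var.
world_size = 4

# A single process pretends to be a world of `world_size` ranks; no GPUs or
# torchrun launcher are needed.
dist.init_process_group("fake", rank=0, world_size=world_size)

# Per-rank computation and collectives are simulated under this mode.
lm = _local_tensor.LocalTensorMode(world_size)
lm.__enter__()

# Downstream code (e.g. ParallelDims) can query the simulated world size as usual.
print(dist.get_world_size())  # 4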
16 changes: 14 additions & 2 deletions torchtitan/train.py
@@ -208,6 +208,12 @@ def __init__(self, job_config: JobConfig):
self.loss_fn, self.gradient_accumulation_steps
)

# TODO(local_tensor): Remove this early return once LocalTensor supports
Contributor:
pytorch/pytorch#166540 is merged, shall we remove this early return?

Contributor Author:
There are still some gaps. I updated the comment.

# init_weights(). Currently skipping parallelism setup and model initialization
# in local tensor mode.
if job_config.comm.local_tensor_mode:
return

# apply parallelisms and initialization
if parallel_dims.pp_enabled:
if not self.train_spec.pipelining_fn:
@@ -360,13 +366,12 @@ def __init__(self, job_config: JobConfig):

def init_distributed(self) -> ParallelDims:
job_config = self.job_config
-dist_utils.init_distributed(
+world_size = dist_utils.init_distributed(
job_config.comm,
enable_cpu_backend=job_config.training.enable_cpu_offload,
base_folder=job_config.job.dump_folder,
)

-world_size = int(os.environ["WORLD_SIZE"])
parallelism_config = job_config.parallelism

return ParallelDims(
@@ -718,6 +723,13 @@ def main(trainer_class: type[Trainer]) -> None:
try:
trainer = trainer_class(config)

# TODO(local_tensor): Remove this special case once LocalTensor supports
Contributor:
similarly, can we remove this now?

Contributor Author:
There are still some gaps. I updated the comment.

# init_weights(). In local tensor mode, skip training/checkpointing as the
# model is not fully initialized
if config.comm.local_tensor_mode:
Contributor:
Probably a naive question: what's the advantage of LocalTensor over fake tensor if we're just running the parallelization setup code?

LocalTensor would do the actual compute and give correct results, but it would run (more slowly) on a single GPU since it simulates each rank's operations. Fake tensor would skip all the compute and run (more quickly, I think) on a single CPU.

Do we intend to run with numerics or just smoke-test that we don't get API errors along the way to setup?

Contributor Author (@fegin, Nov 18, 2025):
> Do we intend to run with numerics or just smoke-test that we don't get API errors along the way to setup?

For dry-run mode that validates the configurations, the fake backend should be enough. But I also want to use this infra to enable some tests that do not require GPUs: run one step and verify the output. We are putting all the different parallelism tests into integration tests, which require an H100 machine, and the queuing time is going to become longer.

Also, DeviceMesh uses tensor operations; if we want to verify DeviceMesh operations on all ranks, we will need LocalTensor. The fake backend only allows you to verify rank0's DeviceMesh behavior, though this should not be a big deal since we mostly do SPMD.
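As an illustration of the rank0 point (a sketch, not code from this PR): with only the fake backend, a DeviceMesh can be built inside a single process, but only rank 0's view of it is observable. The "cpu" device type and the 2x2 mesh shape below are assumptions made for the example.

import torch.distributed as dist
from torch.distributed.device_mesh import init_device_mesh

# Single process masquerading as 4 ranks via the fake backend; no real comms occur.
dist.init_process_group("fake", rank=0, world_size=4)

# Mesh construction goes through, but this process only ever sees rank 0's slice of it.
mesh = init_device_mesh("cpu", (2, 2), mesh_dim_names=("dp", "tp"))
print(mesh.get_coordinate())  # rank 0's (dp, tp) coordinate, i.e. [0, 0]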

Contributor Author (@fegin, Nov 19, 2025):
I rethought your comment. I should not couple dry-run mode with local tensor mode; my intention for local tensor mode is to use it for lightweight integration tests.

I added another option to enable ONLY fake PG mode. Local tensor mode depends on fake PG mode, and dry run only requires fake PG mode.

Contributor:
OK, that makes sense! I like the idea of using LocalTensor mode for running actual numerics validation; it just seemed like overkill if used for dry-run.

Contributor:
I think local tensor mode might help to some extent with debugging numerics; however, I have hit numeric issues caused by DTensor or by missing communication. So we would recommend users debug in the following order: dry-run mode -> local tensor mode -> parallelism, right?

Contributor Author:
IMO, dry-run mode is purely for debugging the setup phase of the trainer: the trainer configurations, the DeviceMesh setup, and the parallelism configurations. For an end user of TorchTitan, dry-run mode is mostly useful to detect configuration errors before launching a large-scale training run. For TorchTitan developers like us, dry-run mode is useful as an early debugging signal when developing the trainer, components, and parallelisms (e.g., parallelize.py).

LocalTensor, on the other hand, is useful for actually checking what happens during the forward and backward passes. While fake tensor can also help, it doesn't actually perform the communication and computation, which may hide issues, and when the computation involves data-dependent ops, it will fail. For example, you won't be able to debug CP load balancing with fake tensor.

Contributor:
I'm confused by the options here.

IIUC, fake backend doesn't mean fake tensor, right? What happens if local_tensor_mode=False but fake_backend=True?

Since this is user-facing, I feel it might be clearer to organize the options based on user intent, e.g. comm_mode = "dry" / "local", instead of providing multiple knobs that only function when combined properly.

Contributor Author:
fake_backend=True just means that we use the fake backend for the communication. It doesn't have to be used with comm_mode if we don't care about accuracy: the computation will be done locally on rank0, but the collectives will not. With local_tensor_mode, all the computation (rank0 through rankN-1) will be done on rank0, and the collectives will be simulated as well.

I'm okay with combining the two.

Contributor:
With fake_backend=True, is rank0 the only device that participates in computation? Or does each rank compute its own stuff without actual comms?

Contributor Author:
@tianyu-l All ranks will perform the computation locally, but it actually doesn't matter because the ranks don't talk to each other. For dry-run mode, we always launch with one rank only (but fake it as if there are N ranks).
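To make the fake-backend behavior described here concrete, a small sketch (the silent no-op semantics of the collectives are inferred from this thread, not guaranteed by the PR): the collective call is accepted, but no data actually moves between ranks.

import torch
import torch.distributed as dist

# One real process pretending to be a world of 8 ranks.
dist.init_process_group("fake", rank=0, world_size=8)

t = torch.ones(4)
dist.all_reduce(t)  # accepted by the fake backend, but no communication is performed
print(t)            # stays all ones; a real 8-rank all_reduce of ones would yield 8s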

logger.info("Local tensor mode enabled - skipping training execution")
return

if config.checkpoint.create_seed_checkpoint:
assert (
int(os.environ["WORLD_SIZE"]) == 1