17 changes: 17 additions & 0 deletions torchtitan/config/job_config.py
@@ -416,6 +416,23 @@ class Parallelism:
Note that this is still an experimental feature.
"""

expert_parallel_comm_backend: Literal["standard", "deepep"] = "standard"
"""
Expert-parallel communication backend. No effect for non-MoE models or when ep = 1.

- "standard": Uses PyTorch all-to-all collectives (default)
- "deepep": Uses DeepEP custom kernels for more efficient communication

DeepEP requires installation:
https://github.com/deepseek-ai/DeepEP.
"""

deepep_use_alignment_padding: bool = False
"""
Whether to use alignment padding for DeepEP token dispatch.
Only applies when expert_parallel_comm_backend="deepep".
"""

Contributor: Why is this optional? IIUC this is a must in order to use torch._grouped_mm.


@dataclass
class Checkpoint:
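To make the two new Parallelism knobs concrete, here is a minimal sketch of setting them directly on the dataclass edited above. The field names and allowed values come from this diff; the import path simply mirrors the edited file, and the assumption that the remaining Parallelism fields all have defaults is mine, so treat this as illustrative rather than part of the PR.

```python
# Illustrative sketch, not part of this PR: select the DeepEP backend on the
# Parallelism config. Assumes the other Parallelism fields all have defaults.
from torchtitan.config.job_config import Parallelism

parallelism = Parallelism(
    expert_parallel_comm_backend="deepep",  # default "standard" keeps PyTorch all-to-all
    deepep_use_alignment_padding=True,      # per the docstring, only honored with the DeepEP backend
)
# Per the docstring above, the backend choice has no effect for non-MoE models
# or when the expert-parallel degree is 1.
```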
7 changes: 6 additions & 1 deletion torchtitan/distributed/__init__.py
@@ -13,9 +13,14 @@
from torch.distributed.tensor.placement_types import Placement

from torchtitan.distributed.parallel_dims import ParallelDims
from torchtitan.distributed.expert_parallel import DeepEPExpertParallel


__all__ = ["ParallelDims", "NoParallel"]
__all__ = [
"ParallelDims",
"NoParallel",
"DeepEPExpertParallel",
]

Contributor: let's not expose this here for now


# NOTE: This is to achieve replicate computation on the gate module in the MoE router.
19 changes: 19 additions & 0 deletions torchtitan/distributed/deepep/__init__.py
@@ -0,0 +1,19 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

"""DeepEP distributed communication primitives for MoE."""

from .deepep import (
dispatch_tokens,
combine_tokens,
DispatchState,
)

__all__ = [
"dispatch_tokens",
"combine_tokens",
"DispatchState",
]
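For orientation, a hypothetical sketch of how these three exports are presumably meant to be used in an expert-parallel MoE forward pass: dispatch tokens to the ranks that own their experts, run the expert computation, then combine the outputs back into the original order. The argument names, return values, and DispatchState contents below are assumptions for illustration only; the real signatures live in torchtitan/distributed/deepep/deepep.py.

```python
# Hypothetical usage sketch -- signatures are assumed, not taken from deepep.py.
import torch

from torchtitan.distributed.deepep import DispatchState, combine_tokens, dispatch_tokens


def moe_forward(tokens: torch.Tensor, top_k_indices: torch.Tensor, experts) -> torch.Tensor:
    # 1) Dispatch: route each token to the EP rank(s) that own its selected experts.
    #    DispatchState is assumed to hold the bookkeeping (DeepEP handle, original
    #    token order) that the combine step needs.
    local_tokens, state = dispatch_tokens(tokens, top_k_indices)  # assumed signature
    assert isinstance(state, DispatchState)

    # 2) Run the local experts on the tokens this rank received.
    expert_out = experts(local_tokens)

    # 3) Combine: return expert outputs to their source ranks and restore ordering.
    return combine_tokens(expert_out, state)  # assumed signature
```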