pytorch · elfiegg · Dec 4, 2025 · tianyu-l · Dec 4, 2025 · elfiegg
@@ -124,6 +124,12 @@ class Model:
     which can be found here: https://github.com/pytorch/ao
     """
 
+    use_flex_attn: bool | None = None
+    """
+    Whether to use FlexAttention. If None, the model's own default is used.
+    Set to False when using DeepEP: FlexAttention compilation fails with DeepEP (OOM).
+    """
+
     print_after_conversion: bool = False
     """
     If true, model definition will be printed to stdout after all model

diff --git a/torchtitan/experiments/__init__.py b/torchtitan/experiments/__init__.py
@@ -9,6 +9,7 @@
         "gpt_oss",
         "simple_fsdp.llama3",
         "simple_fsdp.deepseek_v3",
+        "deepep.deepseek_v3",  # DeepEP + DeepSeek-V3
         "vlm",
         "compiler_toolkit.deepseek_v3",
         "compiler_toolkit.llama3",

diff --git a/torchtitan/experiments/deepep/__init__.py b/torchtitan/experiments/deepep/__init__.py
@@ -0,0 +1,14 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+from .moe_deepep import MoEWithDeepEP, get_deepep_buffer, get_hidden_bytes
+from .expert_parallel import DeepEPExpertParallel
+
+__all__ = [
+    "MoEWithDeepEP",
+    "get_deepep_buffer",
+    "get_hidden_bytes",
+    "DeepEPExpertParallel",
+]
+
+__version__ = "1.0.0"
diff --git a/torchtitan/experiments/deepep/deepseek_v3/__init__.py b/torchtitan/experiments/deepep/deepseek_v3/__init__.py
@@ -0,0 +1,47 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from torchtitan.components.loss import build_cross_entropy_loss
+from torchtitan.components.lr_scheduler import build_lr_schedulers
+from torchtitan.components.optimizer import build_optimizers_with_moe_load_balancing
+from torchtitan.components.tokenizer import build_hf_tokenizer
+from torchtitan.distributed.pipeline_parallel import pipeline_llm
+from torchtitan.hf_datasets.text_datasets import build_text_dataloader
+from torchtitan.models.deepseek_v3 import deepseekv3_args, DeepSeekV3StateDictAdapter
+from torchtitan.protocols.train_spec import TrainSpec
+
+from .model import DeepEPDeepSeekV3Model
+from .parallelize import parallelize_deepseekv3
+
+
+def get_train_spec() -> TrainSpec:
+    """
+    Build the TrainSpec for the DeepEP variant of DeepSeek-V3.
+
+    Returns:
+        TrainSpec: pairs DeepEPDeepSeekV3Model and this package's parallelize_fn
+            with the stock deepseekv3_args, state-dict adapter and shared builders.
+    """
+    return TrainSpec(
+        model_cls=DeepEPDeepSeekV3Model,
+        model_args=deepseekv3_args,
+        parallelize_fn=parallelize_deepseekv3,
+        pipelining_fn=pipeline_llm,
+        build_optimizers_fn=build_optimizers_with_moe_load_balancing,
+        build_lr_schedulers_fn=build_lr_schedulers,
+        build_dataloader_fn=build_text_dataloader,
+        build_tokenizer_fn=build_hf_tokenizer,
+        build_loss_fn=build_cross_entropy_loss,
+        state_dict_adapter=DeepSeekV3StateDictAdapter,
+    )
+
+
+__all__ = [
+    "get_train_spec",
+    "DeepEPDeepSeekV3Model",
+    "parallelize_deepseekv3",
+]
+
diff --git a/torchtitan/experiments/deepep/deepseek_v3/model.py b/torchtitan/experiments/deepep/deepseek_v3/model.py
@@ -0,0 +1,34 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""
+DeepSeek-V3 model wrapper for DeepEP experiments.
+
+This module provides a DeepSeekV3 model class that is compatible with
+DeepEP's MoE parallelization strategy.
+"""
+
+from torchtitan.models.deepseek_v3 import DeepSeekV3Model, DeepSeekV3ModelArgs
+
+
+class DeepEPDeepSeekV3Model(DeepSeekV3Model):
+    """
+    DeepSeekV3Model subclass intended for the DeepEP experiment.
+
+    The only behaviour added here is that ``__init__`` calls ``init_weights()``
+    right after the base constructor. Per the original note, MoE layers are
+    expected to be swapped for DeepEP versions during parallelization.
+    NOTE(review): that swap is not performed in this class -- confirm it there.
+    """
+
+    def __init__(self, model_args: DeepSeekV3ModelArgs):
+        super().__init__(model_args)
+        self.init_weights()  # NOTE(review): runs at construction; confirm intended
+
+    def init_weights(self, *args, **kwargs):
+        """Forward all arguments unchanged to ``DeepSeekV3Model.init_weights``."""
+        super().init_weights(*args, **kwargs)
+