tests/unit_tests/test_job_config.py (73 changes: 39 additions & 34 deletions)
@@ -52,73 +52,78 @@ def test_job_config_file_cmd_overrides(self):
         )
         assert config.job.dump_folder == "/tmp/test_tt/"
 
-    def test_parse_pp_split_points(self):
-        toml_splits = ["layers.2", "layers.4", "layers.6"]
-        cmdline_splits = ["layers.1", "layers.3", "layers.5"]
-        # no split points specified
-        config_manager = ConfigManager()
-        config = config_manager.parse_args(
-            [
-                "--job.config_file",
-                "./torchtitan/models/llama3/train_configs/debug_model.toml",
-            ]
-        )
-        assert config.parallelism.pipeline_parallel_split_points == []
+    def test_parse_module_fqns_per_model_part(self):
+        toml_chunks = [
+            ["tok_embeddings", "layers.0"],
+            ["layers.1", "layers.2"],
+            ["layers.3", "norm", "output"],
+        ]
+        cmdline_chunks = [
+            ["tok_embeddings", "layers.0", "layers.1"],
+            ["layers.2", "layers.3", "norm", "output"],
+        ]
 
-        # toml has no split points, but cmdline splits are specified
+        # no module names specified
         config_manager = ConfigManager()
         config = config_manager.parse_args(
             [
                 "--job.config_file",
                 "./torchtitan/models/llama3/train_configs/debug_model.toml",
-                "--parallelism.pipeline_parallel_split_points",
-                ",".join(cmdline_splits),
             ]
         )
-        assert (
-            config.parallelism.pipeline_parallel_split_points == cmdline_splits
-        ), config.parallelism.pipeline_parallel_split_points
+        assert config.parallelism.module_fqns_per_model_part is None
 
-        # toml has split points, cmdline does not
+        # toml has module names, cmdline does not
         with tempfile.NamedTemporaryFile() as fp:
             with open(fp.name, "wb") as f:
                 tomli_w.dump(
                     {
                         "parallelism": {
-                            "pipeline_parallel_split_points": toml_splits,
+                            "module_fqns_per_model_part": toml_chunks,
                         }
                     },
                     f,
                 )
             config_manager = ConfigManager()
             config = config_manager.parse_args(["--job.config_file", fp.name])
             assert (
-                config.parallelism.pipeline_parallel_split_points == toml_splits
-            ), config.parallelism.pipeline_parallel_split_points
+                config.parallelism.module_fqns_per_model_part == toml_chunks
+            ), config.parallelism.module_fqns_per_model_part
 
-        # toml has split points, cmdline overrides them
+        # test that the field accepts list of lists structure
         with tempfile.NamedTemporaryFile() as fp:
             with open(fp.name, "wb") as f:
                 tomli_w.dump(
                     {
                         "parallelism": {
-                            "pipeline_parallel_split_points": toml_splits,
+                            "module_fqns_per_model_part": cmdline_chunks,
                         }
                     },
                     f,
                 )
             config_manager = ConfigManager()
-            config = config_manager.parse_args(
-                [
-                    "--job.config_file",
-                    fp.name,
-                    "--parallelism.pipeline_parallel_split_points",
-                    ",".join(cmdline_splits),
-                ]
-            )
+            config = config_manager.parse_args(["--job.config_file", fp.name])
+            assert (
+                config.parallelism.module_fqns_per_model_part == cmdline_chunks
+            ), config.parallelism.module_fqns_per_model_part
+
+        # test empty chunks are handled correctly
+        empty_chunks = [[], ["tok_embeddings"], []]
+        with tempfile.NamedTemporaryFile() as fp:
+            with open(fp.name, "wb") as f:
+                tomli_w.dump(
+                    {
+                        "parallelism": {
+                            "module_fqns_per_model_part": empty_chunks,
+                        }
+                    },
+                    f,
+                )
+            config_manager = ConfigManager()
+            config = config_manager.parse_args(["--job.config_file", fp.name])
             assert (
-                config.parallelism.pipeline_parallel_split_points == cmdline_splits
-            ), config.parallelism.pipeline_parallel_split_points
+                config.parallelism.module_fqns_per_model_part == empty_chunks
+            ), config.parallelism.module_fqns_per_model_part
 
     def test_parse_exclude_from_loading(self):
         toml_splits = ["optimizer", "dataloader"]
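For reference, the nested dict the new test passes to tomli_w.dump corresponds to an array-of-arrays under the [parallelism] table. A minimal hand-written TOML sketch mirroring toml_chunks (the layout is illustrative; tomli_w may format it differently):

[parallelism]
module_fqns_per_model_part = [
    ["tok_embeddings", "layers.0"],
    ["layers.1", "layers.2"],
    ["layers.3", "norm", "output"],
]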
torchtitan/config/job_config.py (25 changes: 24 additions & 1 deletion)
@@ -290,6 +290,7 @@ class Parallelism:
 
     pipeline_parallel_split_points: list[str] = field(default_factory=list)
     """
+    DEPRECATED: Use module_fqns_per_model_part instead.
     Specify comma-separated names of modules to use as the beginning of a split point.
     e.g. "layers.0,layers.2" will cause the model to be split into 3 stages,
     the first containing all the layers up to layers.0,
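To make the deprecation concrete: under the docstring's rule that each split point begins a new stage, a legacy split-point config maps onto the new field roughly as sketched below, assuming a hypothetical 4-layer model whose non-layer modules are tok_embeddings, norm, and output (module placement for the non-layer modules is an assumption).

# deprecated form: two split points yield 3 stages
pipeline_parallel_split_points = ["layers.0", "layers.2"]

# equivalent explicit form with the new field
module_fqns_per_model_part = [
    ["tok_embeddings"],
    ["layers.0", "layers.1"],
    ["layers.2", "layers.3", "norm", "output"],
]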
@@ -299,9 +300,31 @@
     but currently the split points must be specified manually.
     """
 
+    module_fqns_per_model_part: list[list[str]] | None = None
+    """
+    Specify a list of lists containing the FQNs (Fully Qualified Names) of modules for each model chunk.
+    Each inner list represents one model chunk and contains the module names that belong to that chunk.
+    e.g. [['tok_embeddings', 'layers.0'], ['layers.1', 'layers.2'], ['layers.3', 'layers.4']]
+    will create 3 chunks: the first containing tok_embeddings and layers.0,
+    the second containing layers.1 and layers.2, and the third containing layers.3 and layers.4.
+    This provides more explicit control over which modules belong to each chunk compared to split points.
+    """
+
+    pipeline_parallel_first_stage_less_layers: int = 1
+    """
+    The number of layers to reduce in the first stage of pipeline parallelism. This is because
+    the first stage has the extra overhead of the embedding layer, which is not present in the other stages.
+    """
+
+    pipeline_parallel_last_stage_less_layers: int = 1
+    """
+    The number of layers to reduce in the last stage of pipeline parallelism. This is because
+    the last stage has the extra overhead of the output layer, which is not present in the other stages.
+    """
+
     pipeline_parallel_layers_per_stage: int | None = None
     """
-    The number of layers per (virtual) pipeline stage. If specified, the split points will be
+    The number of layers per (virtual) pipeline stage. If specified, the module_fqns_per_model_part will be
     calculated from the number of layers and pipeline_parallel_degree. If not specified, the
     layers per stage will be inferred from the model, schedule, and pipeline_parallel_degree.
     """