Skip to content

Commit aa939c2

Browse files
committed
add CI to guard compiler optimization passes
1 parent 605a9a1 commit aa939c2

File tree

3 files changed

+252
-27
lines changed

3 files changed

+252
-27
lines changed

.github/workflows/integration_test_8gpu_simple_fsdp.yaml

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,11 @@ jobs:
5050
python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu126
5151
5252
mkdir artifacts-to-be-uploaded
53-
python -m torchtitan.experiments.simple_fsdp.tests.integration_tests artifacts-to-be-uploaded --ngpu 8
53+
# Run front-end integration tests of SimpleFSDP
54+
python -m torchtitan.experiments.simple_fsdp.tests.frontend_integration_tests artifacts-to-be-uploaded --ngpu 8
55+
56+
# Run backend pass integration tests of SimpleFSDP
57+
python -m torchtitan.experiments.simple_fsdp.tests.compiler_pass_integration_tests artifacts-to-be-uploaded --ngpu 8
5458
5559
# Run the numerics unit tests of SimpleFSDP
5660
torchrun --nproc-per-node=8 -m pytest torchtitan/experiments/simple_fsdp/tests/test_numerics.py -v
Lines changed: 247 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,247 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
2+
# All rights reserved.
3+
#
4+
# This source code is licensed under the BSD-style license found in the
5+
# LICENSE file in the root directory of this source tree.
6+
7+
import argparse
8+
import os
9+
10+
from tests.integration_tests import OverrideDefinitions
11+
from tests.integration_tests.run_tests import run_tests
12+
13+
14+
def build_simple_fsdp_test_list() -> list[OverrideDefinitions]:
    """
    Build the SimpleFSDP compiler-pass integration test cases.

    Returns a flat list of OverrideDefinitions. Every entry carries exactly one
    list of command-line override flags, followed by two strings (presumably
    the test description and the test name -- confirm against
    OverrideDefinitions). Each parallelism layout is paired with the
    `auto_bucketing` and/or `transformer_block_bucketing` graph pass, all on
    the `aot_eager` compile backend.
    """
    # Flags shared by every case in this suite.
    job_config = "--job.custom_config_module=torchtitan.experiments.simple_fsdp.job_config"
    backend = "--compile.backend aot_eager"

    # The two graph passes under test.
    auto_pass = "--compile.graph_passes auto_bucketing"
    block_pass = "--compile.graph_passes transformer_block_bucketing"

    # Parallelism flags reused by several layouts.
    tp2 = "--parallelism.tensor_parallel_degree 2"
    cp2 = "--parallelism.context_parallel_degree 2"

    def llama3_case(parallelism, graph_pass, descr, name):
        # llama3 cases pass --compile.enable and leave ngpu at the
        # OverrideDefinitions default.
        flags = [
            "--model.name simple_fsdp.llama3",
            "--model.flavor 8B",
            *parallelism,
            "--compile.enable",
            job_config,
            backend,
            graph_pass,
        ]
        return OverrideDefinitions([flags], descr, name)

    def deepseek_v3_case(parallelism, graph_pass, descr, name):
        # deepseek_v3 cases do not pass --compile.enable and pin ngpu=4.
        flags = [
            "--model.name simple_fsdp.deepseek_v3",
            "--model.flavor 16B",
            *parallelism,
            job_config,
            backend,
            graph_pass,
        ]
        return OverrideDefinitions([flags], descr, name, ngpu=4)

    fsdp_ep = [
        "--parallelism.data_parallel_shard_degree 4",
        "--parallelism.expert_parallel_degree 2",
    ]
    fsdp_tp_ep = [
        "--parallelism.data_parallel_shard_degree 2",
        "--parallelism.tensor_parallel_degree 2",
        "--parallelism.expert_parallel_degree 4",
        "--parallelism.expert_tensor_parallel_degree 1",
    ]

    return [
        llama3_case([], auto_pass, "1D+autobucketing", "1d_autobucketing"),
        llama3_case(
            [],
            block_pass,
            "1D+transformer_block_bucketing",
            "1d_transformer_block_bucketing",
        ),
        llama3_case([tp2], auto_pass, "2D+autobucketing", "2d_autobucketing"),
        llama3_case(
            [tp2],
            block_pass,
            "2D+transformer_block_bucketing",
            "2d_transformer_block_bucketing",
        ),
        # TODO(ruisizhang123): add back after passes + PP is supported
        # llama3_case(
        #     [tp2, "--parallelism.pipeline_parallel_degree 2"],
        #     auto_pass,
        #     "3D+autobucketing",
        #     "3d_autobucketing",
        # ),
        # llama3_case(
        #     [tp2, "--parallelism.pipeline_parallel_degree 2"],
        #     block_pass,
        #     "3D+transformer_block_bucketing",
        #     "3d_transformer_block_bucketing",
        # ),
        # llama3_case(
        #     [tp2, cp2],
        #     auto_pass,
        #     "FSDP+TP+CP+autobucketing",
        #     "fsdp+tp+cp_autobucketing",
        # ),
        llama3_case(
            [tp2, cp2],
            block_pass,
            "FSDP+TP+CP+transformer_block_bucketing",
            "fsdp+tp+cp_transformer_block_bucketing",
        ),
        deepseek_v3_case(
            fsdp_ep,
            auto_pass,
            "FSDP+EP+autobucketing",
            "fsdp+ep_autobucketing",
        ),
        deepseek_v3_case(
            fsdp_ep,
            block_pass,
            "FSDP+EP+transformer_block_bucketing",
            "fsdp+ep_transformer_block_bucketing",
        ),
        deepseek_v3_case(
            fsdp_tp_ep,
            auto_pass,
            "FSDP+TP+EP+autobucketing",
            "fsdp+tp+ep_autobucketing",
        ),
        deepseek_v3_case(
            fsdp_tp_ep,
            block_pass,
            "FSDP+TP+EP+transformer_block_bucketing",
            "fsdp+tp+ep_transformer_block_bucketing",
        ),
    ]
214+
215+
216+
# Maps a test-suite name to the zero-argument builder that returns its test list.
_TEST_SUITES_FUNCTION = {"simple_fsdp": build_simple_fsdp_test_list}
219+
220+
221+
def main():
    """
    Parse the command line and run the SimpleFSDP compiler-pass test suite.

    Command-line arguments:
        output_dir: directory handed to the test runner; created if missing
            and required to be empty.
        --config_path: base config file shared by all tests.
        --test_name: which test to run (default: "all").
        --ngpu: integer GPU count (default: 8).

    Raises:
        RuntimeError: if the output directory already contains entries.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("output_dir")
    parser.add_argument(
        "--config_path",
        default="./tests/integration_tests/base_config.toml",
        help="Base config path for integration tests. This is the config that will be used as a base for all tests.",
    )
    parser.add_argument(
        "--test_name",
        default="all",
        help="test to run, acceptable values: `test_name` in `build_simple_fsdp_test_list` (default: all)",
    )
    parser.add_argument("--ngpu", default=8, type=int)
    args = parser.parse_args()

    # exist_ok=True creates the directory when it is missing and is a no-op
    # when it already exists, avoiding a separate check-then-create step.
    os.makedirs(args.output_dir, exist_ok=True)
    if os.listdir(args.output_dir):
        raise RuntimeError("Please provide an empty output directory.")

    # The parsed arguments are forwarded unchanged; how run_tests consumes
    # them is defined in tests.integration_tests.run_tests.
    test_list = _TEST_SUITES_FUNCTION["simple_fsdp"]()
    run_tests(args, test_list)
244+
245+
246+
# Run the suite only when this module is executed as a script (e.g. `python -m ...`).
if __name__ == "__main__":
    main()

torchtitan/experiments/simple_fsdp/tests/integration_tests.py renamed to torchtitan/experiments/simple_fsdp/tests/frontend_integration_tests.py

Lines changed: 0 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -29,32 +29,6 @@ def build_simple_fsdp_test_list() -> list[OverrideDefinitions]:
2929
"1D",
3030
"1d",
3131
),
32-
OverrideDefinitions(
33-
[
34-
[
35-
"--model.name simple_fsdp.llama3",
36-
"--compile.enable",
37-
"--job.custom_config_module=torchtitan.experiments.simple_fsdp.job_config",
38-
"--compile.backend aot_eager",
39-
"--compile.graph_passes auto_bucketing",
40-
],
41-
],
42-
"1D+autobucketing",
43-
"1d_autobucketing",
44-
),
45-
OverrideDefinitions(
46-
[
47-
[
48-
"--model.name simple_fsdp.llama3",
49-
"--compile.enable",
50-
"--job.custom_config_module=torchtitan.experiments.simple_fsdp.job_config",
51-
"--compile.backend aot_eager",
52-
"--compile.graph_passes transformer_block_bucketing",
53-
],
54-
],
55-
"1D+transformer_block_bucketing",
56-
"1d_transformer_block_bucketing",
57-
),
5832
OverrideDefinitions(
5933
[
6034
[

0 commit comments

Comments
 (0)