@@ -164,6 +164,75 @@ jobs:
         ls -al "${RUNNER_ARTIFACT_DIR}"
         echo "::endgroup::"
 
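+  # Exports google/gemma-3-4b-it to ExecuTorch CUDA artifacts (model.pte plus
+  # aoti_cuda_blob.ptd) via optimum-executorch, and uploads them for the
+  # downstream benchmark and e2e jobs.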
+  export-gemma3-cuda-artifact:
+    name: export-gemma3-cuda-${{ matrix.quant.name }}
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    permissions:
+      id-token: write
+      contents: read
+    secrets: inherit
+    strategy:
+      fail-fast: false
+      matrix:
+        quant:
+          - name: "non-quantized"
+            artifact: "gemma3-cuda-export"
+            extra_args: ""
+          # TODO: enable gemma3 quantization
+          # - name: "quantized-int4-tile-packed"
+          #   artifact: "gemma3-cuda-quantized-int4-tile-packed"
+          #   extra_args: "--qlinear 4w --qlinear_encoder 4w --qlinear_packing_format tile_packed_to_4d --qlinear_encoder_packing_format tile_packed_to_4d"
+          # - name: "quantized-int4-weight-only"
+          #   artifact: "gemma3-cuda-quantized-int4-weight-only"
+          #   # TODO: adding "--qlinear 4w" produces invalid results. Need further investigation.
+          #   extra_args: "--qlinear_encoder 4w"
+    with:
+      timeout: 90
+      secrets-env: EXECUTORCH_HF_TOKEN
+      runner: linux.g5.4xlarge.nvidia.gpu
+      gpu-arch-type: cuda
+      gpu-arch-version: 12.6
+      use-custom-docker-registry: false
+      submodules: recursive
+      upload-artifact: gemma3-cuda-export
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      script: |
+        set -eux
+
+        echo "::group::Setup ExecuTorch"
+        ./install_executorch.sh
+        echo "::endgroup::"
+
+        echo "::group::Setup Huggingface"
+        pip install -U "huggingface_hub[cli]" accelerate
+        huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
+        OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt)
+        pip install git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION}
+        pip list
+        echo "::endgroup::"
+
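+        # Export the multimodal Gemma3 model with the CUDA (AOTInductor)
+        # recipe. The export emits the ExecuTorch program (model.pte) plus a
+        # separate CUDA weight blob (aoti_cuda_blob.ptd).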
+        echo "::group::Export Gemma3 (${{ matrix.quant.name }})"
+        EXTRA_ARGS="${{ matrix.quant.extra_args }}"
+        optimum-cli export executorch \
+            --model "google/gemma-3-4b-it" \
+            --task "multimodal-text-to-text" \
+            --recipe "cuda" \
+            --dtype bfloat16 \
+            --device cuda \
+            --max_seq_len 64 \
+            ${EXTRA_ARGS} \
+            --output_dir ./
+
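+        # Fail fast if the export did not produce both expected files.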
+        test -f model.pte
+        test -f aoti_cuda_blob.ptd
+        echo "::endgroup::"
+
+        echo "::group::Store Gemma3 Artifacts (${{ matrix.quant.name }})"
+        mkdir -p "${RUNNER_ARTIFACT_DIR}/"
+        cp model.pte "${RUNNER_ARTIFACT_DIR}/"
+        cp aoti_cuda_blob.ptd "${RUNNER_ARTIFACT_DIR}/"
+        ls -al "${RUNNER_ARTIFACT_DIR}/"
+        echo "::endgroup::"
+
   benchmark-voxtral-cuda:
     name: benchmark-voxtral-cuda
     needs: export-voxtral-cuda-artifact
@@ -204,13 +273,63 @@ jobs:
               -DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \
               -DEXECUTORCH_BUILD_TESTS=ON \
               -Bcmake-out .
-        cmake --build cmake-out -j$(( $(nproc) - 1 )) --target voxtral_runner
+        cmake --build cmake-out -j$(( $(nproc) - 1 )) --target multimodal_benchmark
         echo "::endgroup::"
 
         echo "::group::Run Voxtral Benchmark"
 
         export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH
-        cmake-out/backends/cuda/voxtral_runner model.pte aoti_cuda_blob.ptd
+        cmake-out/backends/cuda/multimodal_benchmark voxtral model.pte aoti_cuda_blob.ptd
+
+        echo "::endgroup::"
+
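+  # Runs the shared multimodal_benchmark binary (also used by the Voxtral job
+  # above) against the exported Gemma3 CUDA artifacts.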
+  benchmark-gemma3-cuda:
+    name: benchmark-gemma3-cuda
+    needs: export-gemma3-cuda-artifact
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    permissions:
+      id-token: write
+      contents: read
+    strategy:
+      fail-fast: false
+    with:
+      timeout: 90
+      runner: linux.g5.4xlarge.nvidia.gpu
+      gpu-arch-type: cuda
+      gpu-arch-version: 12.6
+      use-custom-docker-registry: false
+      submodules: recursive
+      download-artifact: gemma3-cuda-export
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      script: |
+        set -eux
+
+        echo "::group::Setup ExecuTorch Requirements"
+        CMAKE_ARGS="-DEXECUTORCH_BUILD_CUDA=ON" ./install_requirements.sh
+        pip list
+        echo "::endgroup::"
+
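+        # The downloaded artifacts land in ${RUNNER_ARTIFACT_DIR}; copy them
+        # into the workspace where the benchmark expects them.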
+        echo "::group::Prepare Gemma3 Artifacts"
+        cp "${RUNNER_ARTIFACT_DIR}/model.pte" .
+        cp "${RUNNER_ARTIFACT_DIR}/aoti_cuda_blob.ptd" .
+        ls -al model.pte aoti_cuda_blob.ptd
+        echo "::endgroup::"
+
+        echo "::group::Build Gemma3 Benchmark"
+        cmake -DCMAKE_BUILD_TYPE=Release \
+              -DEXECUTORCH_BUILD_CUDA=ON \
+              -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
+              -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
+              -DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \
+              -DEXECUTORCH_BUILD_TESTS=ON \
+              -Bcmake-out .
+        cmake --build cmake-out -j$(( $(nproc) - 1 )) --target multimodal_benchmark
+        echo "::endgroup::"
+
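+        # The first positional argument selects the model family (here gemma3).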
+        echo "::group::Run Gemma3 Benchmark"
+
+        export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH
+        cmake-out/backends/cuda/multimodal_benchmark gemma3 model.pte aoti_cuda_blob.ptd
 
         echo "::endgroup::"
 
@@ -302,3 +421,87 @@ jobs:
           exit $EXIT_CODE
         fi
         echo "::endgroup::"
+
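+  # End-to-end test: builds the gemma3 example runner, feeds it the ExecuTorch
+  # logo image, and checks that the generated description mentions "chip".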
+  test-gemma3-cuda-e2e:
+    name: test-gemma3-cuda-e2e-${{ matrix.format.name }}
+    needs: export-gemma3-cuda-artifact
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    permissions:
+      id-token: write
+      contents: read
+    strategy:
+      fail-fast: false
+      matrix:
+        format:
+          - name: "non-quantized"
+            artifact: "gemma3-cuda-export"
+          # TODO: enable quantized gemma3.
+          # - name: "quantized-int4-tile-packed"
+          #   artifact: "gemma3-cuda-quantized-int4-tile-packed"
+          # - name: "quantized-int4-weight-only"
+          #   artifact: "gemma3-cuda-quantized-int4-weight-only"
+    with:
+      timeout: 90
+      runner: linux.g5.4xlarge.nvidia.gpu
+      gpu-arch-type: cuda
+      gpu-arch-version: 12.6
+      use-custom-docker-registry: false
+      submodules: recursive
+      download-artifact: ${{ matrix.format.artifact }}
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      script: |
+        set -eux
+
+        echo "::group::Setup ExecuTorch Requirements"
+        ./install_requirements.sh
+        pip list
+        echo "::endgroup::"
+
+        echo "::group::Prepare Gemma3 Artifacts (${{ matrix.format.name }})"
+        cp "${RUNNER_ARTIFACT_DIR}/model.pte" .
+        cp "${RUNNER_ARTIFACT_DIR}/aoti_cuda_blob.ptd" .
+        TOKENIZER_URL="https://huggingface.co/unsloth/gemma-3-1b-it/resolve/main/tokenizer.json"
+        curl -L $TOKENIZER_URL -o tokenizer.json
+        ls -al model.pte aoti_cuda_blob.ptd tokenizer.json
+        IMAGE_PATH="docs/source/_static/img/et-logo.png"
+        echo "::endgroup::"
+
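+        # Two-stage build: install the core ExecuTorch libraries via the llm
+        # preset, then build the standalone gemma3 example runner against them.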
+        echo "::group::Build Gemma3 Runner"
+        cmake --preset llm \
+              -DEXECUTORCH_BUILD_CUDA=ON \
+              -DCMAKE_INSTALL_PREFIX=cmake-out \
+              -DCMAKE_BUILD_TYPE=Release \
+              -Bcmake-out -S.
+        cmake --build cmake-out -j$(( $(nproc) - 1 )) --target install --config Release
+
+        cmake -DEXECUTORCH_BUILD_CUDA=ON \
+              -DCMAKE_BUILD_TYPE=Release \
+              -Sexamples/models/gemma3 \
+              -Bcmake-out/examples/models/gemma3/
+        cmake --build cmake-out/examples/models/gemma3 --target gemma3_e2e_runner --config Release
+        echo "::endgroup::"
+
+        echo "::group::Run Gemma3 Runner (${{ matrix.format.name }})"
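+        # Temporarily disable exit-on-error so both the output and the exit
+        # code of the runner can be captured and checked separately.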
+        set +e
+        export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH
+        OUTPUT=$(cmake-out/examples/models/gemma3/gemma3_e2e_runner \
+              --model_path model.pte \
+              --data_path aoti_cuda_blob.ptd \
+              --tokenizer_path tokenizer.json \
+              --image_path $IMAGE_PATH \
+              --temperature 0 2>&1)
+        EXIT_CODE=$?
+        set -e
+
+        echo "$OUTPUT"
+
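+        # The runner describes the ExecuTorch logo image; a sane generation is
+        # expected to mention "chip" somewhere in its output.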
+        if ! echo "$OUTPUT" | grep -iq "chip"; then
+          echo "Expected output 'chip' not found in output"
+          exit 1
+        fi
+
+        if [ $EXIT_CODE -ne 0 ]; then
+          echo "Unexpected exit code: $EXIT_CODE"
+          exit $EXIT_CODE
+        fi
+        echo "::endgroup::"