Skip to content

Commit d70c10d

Browse files
committed
gemma3 e2e runner on cuda
Pull Request resolved: #15282 This diff introduces an e2e runner for the gemma3 model on CUDA, delegated via the AOTI library and guarded by CI. It also includes the other infrastructure updates necessary for building and running the `gemma3 e2e runner` on CUDA devices. ghstack-source-id: 317561282 Differential Revision: [D85087532](https://our.internmc.facebook.com/intern/diff/D85087532/)
1 parent 223473a commit d70c10d

File tree

4 files changed

+482
-3
lines changed

4 files changed

+482
-3
lines changed

.github/workflows/cuda.yml

Lines changed: 101 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -165,14 +165,27 @@ jobs:
165165
echo "::endgroup::"
166166
167167
export-gemma3-cuda-artifact:
168-
name: export-gemma3-cuda-artifact
168+
name: export-gemma3-cuda-${{ matrix.quant.name }}
169169
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
170170
permissions:
171171
id-token: write
172172
contents: read
173173
secrets: inherit
174174
strategy:
175175
fail-fast: false
176+
matrix:
177+
quant:
178+
- name: "non-quantized"
179+
artifact: "gemma3-cuda-export"
180+
extra_args: ""
181+
# TODO: enable gemma3 quantization
182+
# - name: "quantized-int4-tile-packed"
183+
# artifact: "gemma3-cuda-quantized-int4-tile-packed"
184+
# extra_args: "--qlinear 4w --qlinear_encoder 4w --qlinear_packing_format tile_packed_to_4d --qlinear_encoder_packing_format tile_packed_to_4d"
185+
# - name: "quantized-int4-weight-only"
186+
# artifact: "gemma3-cuda-quantized-int4-weight-only"
187+
# # TODO: adding "--qlinear 4w" produces invalid results. Need further investigation.
188+
# extra_args: "--qlinear_encoder 4w"
176189
with:
177190
timeout: 90
178191
secrets-env: EXECUTORCH_HF_TOKEN
@@ -198,7 +211,8 @@ jobs:
198211
pip list
199212
echo "::endgroup::"
200213
201-
echo "::group::Export Gemma3"
214+
echo "::group::Export Gemma3 (${{ matrix.quant.name }})"
215+
EXTRA_ARGS="${{ matrix.quant.extra_args }}"
202216
optimum-cli export executorch \
203217
--model "google/gemma-3-4b-it" \
204218
--task "multimodal-text-to-text" \
@@ -212,7 +226,7 @@ jobs:
212226
test -f aoti_cuda_blob.ptd
213227
echo "::endgroup::"
214228
215-
echo "::group::Store Gemma3 Artifacts"
229+
echo "::group::Store Gemma3 Artifacts (${{ matrix.quant.name }})"
216230
mkdir -p "${RUNNER_ARTIFACT_DIR}/"
217231
cp model.pte "${RUNNER_ARTIFACT_DIR}/"
218232
cp aoti_cuda_blob.ptd "${RUNNER_ARTIFACT_DIR}/"
@@ -407,3 +421,87 @@ jobs:
407421
exit $EXIT_CODE
408422
fi
409423
echo "::endgroup::"
424+
425+
test-gemma3-cuda-e2e:
426+
name: test-gemma3-cuda-e2e-${{ matrix.format.name }}
427+
needs: export-gemma3-cuda-artifact
428+
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
429+
permissions:
430+
id-token: write
431+
contents: read
432+
strategy:
433+
fail-fast: false
434+
matrix:
435+
format:
436+
- name: "non-quantized"
437+
artifact: "gemma3-cuda-export"
438+
# TODO: enable quantized gemma3.
439+
# - name: "quantized-int4-tile-packed"
440+
# artifact: "gemma3-cuda-quantized-int4-tile-packed"
441+
# - name: "quantized-int4-weight-only"
442+
# artifact: "gemma3-cuda-quantized-int4-weight-only"
443+
with:
444+
timeout: 90
445+
runner: linux.g5.4xlarge.nvidia.gpu
446+
gpu-arch-type: cuda
447+
gpu-arch-version: 12.6
448+
use-custom-docker-registry: false
449+
submodules: recursive
450+
download-artifact: ${{ matrix.format.artifact }}
451+
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
452+
script: |
453+
set -eux
454+
455+
echo "::group::Setup ExecuTorch Requirements"
456+
./install_requirements.sh
457+
pip list
458+
echo "::endgroup::"
459+
460+
echo "::group::Prepare Gemma3 Artifacts (${{ matrix.format.name }})"
461+
cp "${RUNNER_ARTIFACT_DIR}/model.pte" .
462+
cp "${RUNNER_ARTIFACT_DIR}/aoti_cuda_blob.ptd" .
463+
TOKENIZER_URL="https://huggingface.co/unsloth/gemma-3-1b-it/resolve/main/tokenizer.json"
464+
curl -L $TOKENIZER_URL -o tokenizer.json
465+
ls -al model.pte aoti_cuda_blob.ptd tokenizer.json
466+
IMAGE_PATH="docs/source/_static/img/et-logo.png"
467+
echo "::endgroup::"
468+
469+
echo "::group::Build Gemma3 Runner"
470+
cmake --preset llm \
471+
-DEXECUTORCH_BUILD_CUDA=ON \
472+
-DCMAKE_INSTALL_PREFIX=cmake-out \
473+
-DCMAKE_BUILD_TYPE=Release \
474+
-Bcmake-out -S.
475+
cmake --build cmake-out -j$(( $(nproc) - 1 )) --target install --config Release
476+
477+
cmake -DEXECUTORCH_BUILD_CUDA=ON \
478+
-DCMAKE_BUILD_TYPE=Release \
479+
-Sexamples/models/gemma3 \
480+
-Bcmake-out/examples/models/gemma3/
481+
cmake --build cmake-out/examples/models/gemma3 --target gemma3_e2e_runner --config Release
482+
echo "::endgroup::"
483+
484+
echo "::group::Run Gemma3 Runner (${{ matrix.format.name }})"
485+
set +e
486+
export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH
487+
OUTPUT=$(cmake-out/examples/models/gemma3/gemma3_e2e_runner \
488+
--model_path model.pte \
489+
--data_path aoti_cuda_blob.ptd \
490+
--tokenizer_path tokenizer.json \
491+
--image_path $IMAGE_PATH \
492+
--temperature 0 2>&1)
493+
EXIT_CODE=$?
494+
set -e
495+
496+
echo "$OUTPUT"
497+
498+
if ! echo "$OUTPUT" | grep -iq "chip"; then
499+
echo "Expected output 'chip' not found in output"
500+
exit 1
501+
fi
502+
503+
if [ $EXIT_CODE -ne 0 ]; then
504+
echo "Unexpected exit code: $EXIT_CODE"
505+
exit $EXIT_CODE
506+
fi
507+
echo "::endgroup::"
Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,124 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

#
# Simple CMake build system for the gemma3 e2e runner.
#
# This builds a standalone `gemma3_e2e_runner` executable against an
# already-installed ExecuTorch tree (see find_package(executorch) below).
#
cmake_minimum_required(VERSION 3.24)
# Only CXX is compiled here; CUDA support comes in through the prebuilt
# aoti_cuda backend library, not through CUDA sources in this project.
project(gemma3 LANGUAGES CXX)

set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../..)

include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)

# Detect an iOS toolchain so downstream logic can adjust accordingly.
if(CMAKE_TOOLCHAIN_FILE MATCHES ".*(iOS|ios\.toolchain)\.cmake$")
  set(CMAKE_TOOLCHAIN_IOS ON)
else()
  set(CMAKE_TOOLCHAIN_IOS OFF)
endif()

# Default to C++17 but respect a standard already chosen by the caller.
if(NOT CMAKE_CXX_STANDARD)
  set(CMAKE_CXX_STANDARD 17)
  set(CMAKE_CXX_STANDARD_REQUIRED ON)
endif()

# Let files say "include <executorch/path/to/header.h>"
set(_common_include_directories ${EXECUTORCH_ROOT}/..)

# gflags is consumed from the ExecuTorch build tree; point find_package at it.
set(gflags_DIR ${CMAKE_CURRENT_BINARY_DIR}/../../../third-party/gflags)
find_package(gflags REQUIRED)

# Find `executorch` libraries, same as for gflags
list(APPEND CMAKE_FIND_ROOT_PATH ${CMAKE_CURRENT_BINARY_DIR}/../../..)
find_package(executorch CONFIG REQUIRED FIND_ROOT_PATH_BOTH)
# Force whole-archive linking so statically-registered kernels/backends
# are not dropped by the linker.
executorch_target_link_options_shared_lib(executorch)

set(link_libraries executorch gflags)
set(_srcs e2e_runner.cpp)

# Portable CPU kernel libraries used by the runner.
list(
  APPEND
  link_libraries
  optimized_native_cpu_ops_lib
  quantized_ops_lib
  custom_ops
  cpublas
  eigen_blas
)
executorch_target_link_options_shared_lib(optimized_native_cpu_ops_lib)
executorch_target_link_options_shared_lib(quantized_ops_lib)
executorch_target_link_options_shared_lib(custom_ops)

# XNNPACK backend (only when the installed ExecuTorch was built with it).
if(TARGET xnnpack_backend)
  set(xnnpack_backend_libs xnnpack_backend XNNPACK xnnpack-microkernels-prod)
  if(TARGET kleidiai)
    list(APPEND xnnpack_backend_libs kleidiai)
  endif()
  list(APPEND link_libraries ${xnnpack_backend_libs})
  executorch_target_link_options_shared_lib(xnnpack_backend)
endif()

# The multimodal LLM runner extension is mandatory for this example; fail
# early with an actionable message instead of a cryptic link error.
if(NOT TARGET extension_llm_runner)
  message(
    FATAL_ERROR
      "ExecuTorch must be installed with EXECUTORCH_BUILD_EXTENSION_LLM_RUNNER enabled."
  )
endif()

# Needed for cpuinfo where it uses android specific log lib
if(ANDROID)
  list(APPEND link_libraries log)
endif()

# stb_image: a lightweight header-only library used to load/resize images.
include(FetchContent)
FetchContent_Declare(
  stb
  GIT_REPOSITORY https://github.com/nothings/stb.git
  GIT_TAG f0569113c93ad095470c54bf34a17b36646bbbb5
)
FetchContent_MakeAvailable(stb)
# Add deprecated/ to use stb_image_resize.h for internal compatibility
list(APPEND _common_include_directories ${stb_SOURCE_DIR}
     ${stb_SOURCE_DIR}/deprecated
)

# Add the required ExecuTorch extensions for the multimodal LLM runner.
list(
  APPEND
  link_libraries
  extension_llm_runner
  extension_module
  extension_data_loader
  extension_tensor
  extension_flat_tensor
)

# CUDA (AOTI) backend — opt-in via -DEXECUTORCH_BUILD_CUDA=ON.
if(EXECUTORCH_BUILD_CUDA)
  find_package(CUDAToolkit REQUIRED)
  list(APPEND link_libraries aoti_cuda)
  executorch_target_link_options_shared_lib(aoti_cuda)
endif()

# Add tokenizers
list(APPEND link_libraries tokenizers::tokenizers)

add_executable(gemma3_e2e_runner ${_srcs})
if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug")
  # Garbage-collect unused sections and strip symbols for release builds.
  target_link_options_gc_sections(gemma3_e2e_runner)
  if(NOT APPLE)
    target_link_options(gemma3_e2e_runner PRIVATE "LINKER:-s")
  endif()
endif()

target_include_directories(
  gemma3_e2e_runner PUBLIC ${_common_include_directories}
)
target_link_libraries(gemma3_e2e_runner PUBLIC ${link_libraries})
target_compile_options(gemma3_e2e_runner PUBLIC ${_common_compile_options})

0 commit comments

Comments
 (0)