         export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH
         PYTHON_EXECUTABLE=python source .ci/scripts/test_model.sh "${{ matrix.model }}" cmake cuda

-  test-voxtral-cuda-e2e:
-    name: test-voxtral-cuda-e2e
+  export-voxtral-cuda-artifact:
+    name: export-voxtral-cuda-artifact
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     permissions:
       id-token: write
@@ -104,6 +104,7 @@ jobs:
       gpu-arch-version: 12.6
       use-custom-docker-registry: false
       submodules: recursive
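+      # Stage the exported files as a workflow artifact so the benchmark and e2e jobs below can download them.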
+      upload-artifact: voxtral-cuda-export
       ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
       script: |
         set -eux
@@ -118,6 +119,7 @@ jobs:
         OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt)
         pip install git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION}
         pip install mistral-common librosa
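+        # Record the resolved package set in the job log to make dependency issues easier to debug.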
+        pip list
         echo "::endgroup::"

         echo "::group::Export Voxtral"
@@ -129,43 +131,152 @@ jobs:
             --device cuda \
             --max_seq_len 1024 \
             --output_dir ./
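+        # Export the mel-spectrogram audio preprocessor as its own .pte (128 mel features, stacked output, max_audio_len 300).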
+        python -m executorch.extension.audio.mel_spectrogram \
+            --feature_size 128 \
+            --stack_output \
+            --max_audio_len 300 \
+            --output_file voxtral_preprocessor.pte
+
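+        # Fail fast if any expected export output is missing.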
+        test -f model.pte
+        test -f aoti_cuda_blob.ptd
+        test -f voxtral_preprocessor.pte
         echo "::endgroup::"

-        echo "::group::Build Voxtral Runner"
+        echo "::group::Store Voxtral Artifacts"
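+        # Files placed in RUNNER_ARTIFACT_DIR are uploaded as the "voxtral-cuda-export" artifact declared above.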
+        mkdir -p "${RUNNER_ARTIFACT_DIR}"
+        cp model.pte "${RUNNER_ARTIFACT_DIR}/"
+        cp aoti_cuda_blob.ptd "${RUNNER_ARTIFACT_DIR}/"
+        cp voxtral_preprocessor.pte "${RUNNER_ARTIFACT_DIR}/"
+        ls -al "${RUNNER_ARTIFACT_DIR}"
+        echo "::endgroup::"
+
+  benchmark-voxtral-cuda:
+    name: benchmark-voxtral-cuda
+    needs: export-voxtral-cuda-artifact
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    permissions:
+      id-token: write
+      contents: read
+    strategy:
+      fail-fast: false
+    with:
+      timeout: 90
+      runner: linux.g5.4xlarge.nvidia.gpu
+      gpu-arch-type: cuda
+      gpu-arch-version: 12.6
+      use-custom-docker-registry: false
+      submodules: recursive
+      download-artifact: voxtral-cuda-export
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      script: |
+        set -eux
+
+        echo "::group::Setup ExecuTorch Requirements"
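+        # Install ExecuTorch requirements with the CUDA backend enabled.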
+        CMAKE_ARGS="-DEXECUTORCH_BUILD_CUDA=ON" ./install_requirements.sh
+        pip list
+        echo "::endgroup::"
+
+        echo "::group::Prepare Voxtral Artifacts"
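+        # The "voxtral-cuda-export" artifact from the export job is made available under RUNNER_ARTIFACT_DIR.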
+        cp "${RUNNER_ARTIFACT_DIR}/model.pte" .
+        cp "${RUNNER_ARTIFACT_DIR}/aoti_cuda_blob.ptd" .
+        ls -al model.pte aoti_cuda_blob.ptd
+        echo "::endgroup::"
+
+        echo "::group::Build Voxtral Benchmark"
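+        # The separate .ptd weight blob is loaded through the named-data-map extension, so it is enabled below.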
         cmake -DCMAKE_BUILD_TYPE=Release \
               -DEXECUTORCH_BUILD_CUDA=ON \
               -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
               -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
+              -DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \
               -DEXECUTORCH_BUILD_TESTS=ON \
               -Bcmake-out .
         cmake --build cmake-out -j$(( $(nproc) - 1 )) --target voxtral_runner
         echo "::endgroup::"

+        echo "::group::Run Voxtral Benchmark"
+
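+        # Benchmark the exported program together with its CUDA weight blob; the runner reports run latency.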
+        export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH
+        cmake-out/backends/cuda/voxtral_runner model.pte aoti_cuda_blob.ptd
+
+        echo "::endgroup::"
+
+  test-voxtral-cuda-e2e:
+    name: test-voxtral-cuda-e2e
+    needs: export-voxtral-cuda-artifact
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    permissions:
+      id-token: write
+      contents: read
+    strategy:
+      fail-fast: false
+    with:
+      timeout: 90
+      runner: linux.g5.4xlarge.nvidia.gpu
+      gpu-arch-type: cuda
+      gpu-arch-version: 12.6
+      use-custom-docker-registry: false
+      submodules: recursive
+      download-artifact: voxtral-cuda-export
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      script: |
+        set -eux
+
+        echo "::group::Setup ExecuTorch Requirements"
+        CMAKE_ARGS="-DEXECUTORCH_BUILD_CUDA=ON" ./install_requirements.sh
+        pip list
+        echo "::endgroup::"
+
+        echo "::group::Prepare Voxtral Artifacts"
+        cp "${RUNNER_ARTIFACT_DIR}/model.pte" .
+        cp "${RUNNER_ARTIFACT_DIR}/aoti_cuda_blob.ptd" .
+        cp "${RUNNER_ARTIFACT_DIR}/voxtral_preprocessor.pte" .
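+        # Fetch the Tekken tokenizer that ships with the Voxtral-Mini-3B-2507 checkpoint.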
+        TOKENIZER_URL="https://huggingface.co/mistralai/Voxtral-Mini-3B-2507/resolve/main/tekken.json"
+        curl -L $TOKENIZER_URL -o tekken.json
+        ls -al model.pte aoti_cuda_blob.ptd voxtral_preprocessor.pte tekken.json
+        echo "::endgroup::"
+
+        echo "::group::Download Test Audio File"
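+        # 20-second, 16 kHz speech sample; the generated response is expected to mention "poem" (checked below).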
+        AUDIO_URL="https://github.com/voxserv/audio_quality_testing_samples/raw/refs/heads/master/testaudio/16000/test01_20s.wav"
+        curl -L $AUDIO_URL -o poem.wav
+        echo "::endgroup::"
+
+        echo "::group::Build Voxtral Runner"
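+        # Two-step build: install the ExecuTorch LLM preset with CUDA enabled, then build the Voxtral example runner against it.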
+        cmake --preset llm \
+              -DEXECUTORCH_BUILD_CUDA=ON \
+              -DCMAKE_INSTALL_PREFIX=cmake-out \
+              -DCMAKE_BUILD_TYPE=Release \
+              -Bcmake-out -S.
+        cmake --build cmake-out -j$(( $(nproc) - 1 )) --target install --config Release
+
+        cmake -DEXECUTORCH_BUILD_CUDA=ON \
+              -DCMAKE_BUILD_TYPE=Release \
+              -Sexamples/models/voxtral \
+              -Bcmake-out/examples/models/voxtral/
+        cmake --build cmake-out/examples/models/voxtral --target voxtral_runner --config Release
+        echo "::endgroup::"
+
         echo "::group::Run Voxtral Runner"
-        # Capture output and allow exit code 139 if we have the expected printout
         set +e
         export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH
-        OUTPUT=$(cmake-out/backends/cuda/voxtral_runner model.pte aoti_cuda_blob.ptd 2>&1)
+        OUTPUT=$(cmake-out/examples/models/voxtral/voxtral_runner \
+              --model_path model.pte \
+              --data_path aoti_cuda_blob.ptd \
+              --tokenizer_path tekken.json \
+              --audio_path poem.wav \
+              --processor_path voxtral_preprocessor.pte \
+              --temperature 0 2>&1)
         EXIT_CODE=$?
         set -e

         echo "$OUTPUT"

-        # Check if the output contains "Run latency (ms):"
-        if echo "$OUTPUT" | grep -q "Run latency (ms):"; then
-          echo "Found expected output: 'Run latency (ms):'"
-          if [ $EXIT_CODE -eq 139 ]; then
-            echo "Exit code 139 (segfault) detected, but passing since we have the expected output"
-            exit 0
-          elif [ $EXIT_CODE -ne 0 ]; then
-            echo "Unexpected exit code: $EXIT_CODE"
-            exit $EXIT_CODE
-          else
-            echo "Command succeeded with exit code 0"
-            exit 0
-          fi
-        else
-          echo "Expected output 'Run latency (ms):' not found in output"
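+        # The response must contain "poem" and the runner must exit with code 0.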
+        if ! echo "$OUTPUT" | grep -iq "poem"; then
+          echo "Expected output 'poem' not found in output"
           exit 1
         fi
+
+        if [ $EXIT_CODE -ne 0 ]; then
+          echo "Unexpected exit code: $EXIT_CODE"
+          exit $EXIT_CODE
+        fi
         echo "::endgroup::"