Skip to content

Commit 0bdb35c

Browse files
authored
Merge branch 'main' into int16_linear_support
2 parents 18c9985 + 0329a8a commit 0bdb35c

Some content is hidden

Large commits have some content hidden by default. Use the search box below for content that may be hidden.

54 files changed

+580
-297
lines changed

.ci/scripts/setup-samsung-linux-deps.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ set -ex
1111

1212
download_ai_lite_core() {
1313
API_BASE="https://soc-developer.semiconductor.samsung.com/api/v1/resource/ai-litecore/download"
14-
API_KEY="kn10SoSY3hkC-9Qny5TqD2mnqVrlupv3krnjLeBt5cY"
14+
API_KEY=$SAMSUNG_AI_LITECORE_KEY
1515

1616
VERSION="0.5"
1717
OS_NAME="Ubuntu 22.04"

.ci/scripts/test-cuda-build.sh

Lines changed: 16 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -9,14 +9,14 @@ set -exu
99

1010
CUDA_VERSION=${1:-"12.6"}
1111

12-
echo "=== Testing ExecutorTorch CUDA ${CUDA_VERSION} Build ==="
12+
echo "=== Testing ExecuTorch CUDA ${CUDA_VERSION} Build ==="
1313

14-
# Function to build and test ExecutorTorch with CUDA support
14+
# Function to build and test ExecuTorch with CUDA support
1515
test_executorch_cuda_build() {
1616
local cuda_version=$1
1717

18-
echo "Building ExecutorTorch with CUDA ${cuda_version} support..."
19-
echo "ExecutorTorch will automatically detect CUDA and install appropriate PyTorch wheel"
18+
echo "Building ExecuTorch with CUDA ${cuda_version} support..."
19+
echo "ExecuTorch will automatically detect CUDA and install appropriate PyTorch wheel"
2020

2121
# Check available resources before starting
2222
echo "=== System Information ==="
@@ -27,11 +27,11 @@ test_executorch_cuda_build() {
2727
nvcc --version || echo "nvcc not found"
2828
nvidia-smi || echo "nvidia-smi not found"
2929

30-
# Set CMAKE_ARGS to enable CUDA build - ExecutorTorch will handle PyTorch installation automatically
30+
# Set CMAKE_ARGS to enable CUDA build - ExecuTorch will handle PyTorch installation automatically
3131
export CMAKE_ARGS="-DEXECUTORCH_BUILD_CUDA=ON"
3232

33-
echo "=== Starting ExecutorTorch Installation ==="
34-
# Install ExecutorTorch with CUDA support with timeout and error handling
33+
echo "=== Starting ExecuTorch Installation ==="
34+
# Install ExecuTorch with CUDA support with timeout and error handling
3535
timeout 5400 ./install_executorch.sh || {
3636
local exit_code=$?
3737
echo "ERROR: install_executorch.sh failed with exit code: $exit_code"
@@ -41,15 +41,15 @@ test_executorch_cuda_build() {
4141
exit $exit_code
4242
}
4343

44-
echo "SUCCESS: ExecutorTorch CUDA build completed"
44+
echo "SUCCESS: ExecuTorch CUDA build completed"
4545

4646
# Verify the installation
47-
echo "=== Verifying ExecutorTorch CUDA Installation ==="
47+
echo "=== Verifying ExecuTorch CUDA Installation ==="
4848

49-
# Test that ExecutorTorch was built successfully
49+
# Test that ExecuTorch was built successfully
5050
python -c "
5151
import executorch
52-
print('SUCCESS: ExecutorTorch imported successfully')
52+
print('SUCCESS: ExecuTorch imported successfully')
5353
"
5454

5555
# Test CUDA availability and show details
@@ -60,7 +60,7 @@ try:
6060
print('INFO: CUDA available:', torch.cuda.is_available())
6161
6262
if torch.cuda.is_available():
63-
print('SUCCESS: CUDA is available for ExecutorTorch')
63+
print('SUCCESS: CUDA is available for ExecuTorch')
6464
print('INFO: CUDA version:', torch.version.cuda)
6565
print('INFO: GPU device count:', torch.cuda.device_count())
6666
print('INFO: Current GPU device:', torch.cuda.current_device())
@@ -74,16 +74,16 @@ try:
7474
print('SUCCESS: CUDA tensor operation completed on device:', z.device)
7575
print('INFO: Result tensor shape:', z.shape)
7676
77-
print('SUCCESS: ExecutorTorch CUDA integration verified')
77+
print('SUCCESS: ExecuTorch CUDA integration verified')
7878
else:
79-
print('WARNING: CUDA not detected, but ExecutorTorch built successfully')
79+
print('WARNING: CUDA not detected, but ExecuTorch built successfully')
8080
exit(1)
8181
except Exception as e:
82-
print('ERROR: ExecutorTorch CUDA test failed:', e)
82+
print('ERROR: ExecuTorch CUDA test failed:', e)
8383
exit(1)
8484
"
8585

86-
echo "SUCCESS: ExecutorTorch CUDA ${cuda_version} build and verification completed successfully"
86+
echo "SUCCESS: ExecuTorch CUDA ${cuda_version} build and verification completed successfully"
8787
}
8888

8989
# Main execution

.github/scripts/propose_ghstack_orig_pr.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,17 @@ def get_pr_stack_from_number(ref: str, repo: Repository) -> List[int]:
8686
return pr_stack
8787

8888

89+
def get_differential_revision(pr, repo: Repository) -> str:
90+
body = repo.get_pull(pr.number).body
91+
matches = re.findall(r"Differential Revision: .*", body)
92+
count = len(matches)
93+
if count == 1:
94+
        # Exactly one match: return it. (With zero or multiple
95+
        # "Differential Revision" matches we fall through and return
        # empty, so that we can disambiguate manually.)
96+
return matches[0]
97+
return ""
98+
99+
89100
def create_prs_for_orig_branch(pr_stack: List[int], repo: Repository):
90101
# For the first PR, we want to merge to `main` branch, and we will update
91102
# as we go through the stack
@@ -100,13 +111,15 @@ def create_prs_for_orig_branch(pr_stack: List[int], repo: Repository):
100111
# The PR we want to create is then "branch_to_merge" <- gh/user/x/orig
101112
# gh/user/x/orig is the clean diff between gh/user/x/base <- gh/user/x/head
102113
orig_branch_merge_head = pr.base.ref.replace("base", "orig")
114+
differential_revision_text = get_differential_revision(pr, repo)
103115
bot_metadata = f"""This PR was created by the merge bot to help merge the original PR into the main branch.
104116
ghstack PR number: https://github.com/pytorch/executorch/pull/{pr.number} by @{pr.user.login}
105117
^ Please use this as the source of truth for the PR details, comments, and reviews
106118
ghstack PR base: https://github.com/pytorch/executorch/tree/{pr.base.ref}
107119
ghstack PR head: https://github.com/pytorch/executorch/tree/{pr.head.ref}
108120
Merge bot PR base: https://github.com/pytorch/executorch/tree/{orig_branch_merge_base}
109121
Merge bot PR head: https://github.com/pytorch/executorch/tree/{orig_branch_merge_head}
122+
{differential_revision_text}
110123
@diff-train-skip-merge"""
111124

112125
existing_orig_pr = repo.get_pulls(

.github/workflows/pull.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -900,12 +900,14 @@ jobs:
900900
permissions:
901901
id-token: write
902902
contents: read
903+
secrets: inherit
903904
with:
904905
runner: linux.2xlarge
905906
docker-image: ci-image:executorch-ubuntu-22.04-clang12-android
906907
submodules: 'recursive'
907908
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
908909
timeout: 90
910+
secrets-env: SAMSUNG_AI_LITECORE_KEY
909911
script: |
910912
set -ex
911913
@@ -917,6 +919,7 @@ jobs:
917919
PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh --build-tool "cmake"
918920
919921
# Setup Samsung SDK (AI Lite Core) and install enn backend
922+
export SAMSUNG_AI_LITECORE_KEY=$SECRET_SAMSUNG_AI_LITECORE_KEY
920923
source .ci/scripts/setup-samsung-linux-deps.sh
921924
922925
# Test models serially

.github/workflows/test-cuda-builds.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ jobs:
2424
strategy:
2525
fail-fast: false
2626
matrix:
27-
cuda-version: ["12.6", "12.8", "12.9"]
27+
cuda-version: ["12.6", "12.8", "13.0"]
2828

2929
name: test-executorch-cuda-build-${{ matrix.cuda-version }}
3030
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main

CONTRIBUTING.md

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -199,8 +199,7 @@ We use [`lintrunner`](https://pypi.org/project/lintrunner/) to help make sure th
199199
code follows our standards. Set it up with:
200200

201201
```
202-
pip install lintrunner==0.12.7
203-
pip install lintrunner-adapters==0.12.4
202+
./install_requirements.sh # (automatically run by install_executorch.sh)
204203
lintrunner init
205204
```
206205

backends/cortex_m/CMakeLists.txt

Lines changed: 2 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ set(CMSIS_NN_LOCAL_PATH
3636
# library is downloaded via FetchContent in the default/regular case.
3737
if(CMSIS_NN_LOCAL_PATH AND EXISTS "${CMSIS_NN_LOCAL_PATH}")
3838
message(STATUS "Using CMSIS-NN from specified path: ${CMSIS_NN_LOCAL_PATH}")
39-
add_subdirectory(${CMSIS_NN_LOCAL_PATH} cmsis_nn_build)
39+
add_subdirectory(${CMSIS_NN_LOCAL_PATH} _deps/cmsis_nn-build)
4040
else()
4141
# Use FetchContent with automatic fallback
4242
message(STATUS "Using CMSIS-NN via FetchContent")
@@ -48,23 +48,7 @@ else()
4848
GIT_SHALLOW TRUE
4949
)
5050

51-
FetchContent_GetProperties(cmsis_nn)
52-
if(NOT cmsis_nn_POPULATED)
53-
FetchContent_Populate(cmsis_nn)
54-
add_subdirectory(${cmsis_nn_SOURCE_DIR} ${cmsis_nn_BINARY_DIR})
55-
endif()
56-
endif()
57-
58-
# Add MVEI define to cmsis-nn target
59-
if(TARGET cmsis-nn)
60-
target_compile_definitions(cmsis-nn PUBLIC ARM_MATH_MVEI=1)
61-
get_target_property(CMSIS_NN_INCLUDES cmsis-nn INTERFACE_INCLUDE_DIRECTORIES)
62-
message(STATUS "CMSIS-NN include dirs: ${CMSIS_NN_INCLUDES}")
63-
else()
64-
message(
65-
FATAL_ERROR
66-
"CMSIS-NN target not found. Check your CMSIS_NN_LOCAL_PATH or network connection."
67-
)
51+
FetchContent_MakeAvailable(cmsis_nn)
6852
endif()
6953

7054
# Cortex-M ops kernel sources

backends/cortex_m/test/test_quantize_op_fusion_pass.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -313,7 +313,7 @@ def forward(self, x, y):
313313
# Apply passes
314314
transformed_program = self._apply_passes(edge_program)
315315

316-
# Generate ExecutorTorch program
316+
# Generate ExecuTorch program
317317
executorch_program = transformed_program.to_executorch()
318318

319319
# Verify the program contains the expected fused operator

backends/nxp/backend/edge_program_converter.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,7 @@ def _process_nodes(self, nodes: list[Node], conversion_context: ConversionContex
134134

135135
qdq_related_functions = [
136136
exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default,
137+
exir_ops.edge.quantized_decomposed.dequantize_per_channel.default,
137138
exir_ops.edge.quantized_decomposed.quantize_per_tensor.default,
138139
]
139140

@@ -203,7 +204,8 @@ def _convert_qdq_cluster_q_dq_nodes(
203204
:param conversion_context: ConversionContext instance.
204205
"""
205206
qdq_q_ops_converters = {
206-
exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default: QDQDequantizeConverter, # noqa F405
207+
exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default: QDQPerTensorDequantizeConverter, # noqa F405
208+
exir_ops.edge.quantized_decomposed.dequantize_per_channel.default: QDQPerChannelDequantizeConverter, # noqa F405
207209
exir_ops.edge.quantized_decomposed.quantize_per_tensor.default: QDQQuantizeConverter, # noqa F405
208210
}
209211

backends/nxp/backend/ir/converter/node_converters/ops_converters/__init__.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,8 @@
4141
PermuteCopyConverter,
4242
)
4343
from executorch.backends.nxp.backend.ir.converter.node_converters.ops_converters.qdq_dequantize_converter import (
44-
QDQDequantizeConverter,
44+
QDQPerChannelDequantizeConverter,
45+
QDQPerTensorDequantizeConverter,
4546
)
4647
from executorch.backends.nxp.backend.ir.converter.node_converters.ops_converters.qdq_quantize_converter import (
4748
QDQQuantizeConverter,
@@ -70,7 +71,8 @@
7071
"PermuteCopyConverter",
7172
"SoftmaxConverter",
7273
"ViewCopyConverter",
73-
"QDQDequantizeConverter",
74+
"QDQPerTensorDequantizeConverter",
75+
"QDQPerChannelDequantizeConverter",
7476
"QDQQuantizeConverter",
7577
"ConstantPadNDConverter",
7678
"ReLUConverter",

0 commit comments

Comments (0)