Merge branch 'main' into nlp/cache-start-pos

Jack-Khuu · web-flow · commit 36dc78b245ff · 2025-01-23T10:39:57.000-08:00
diff --git a/.github/workflows/run-readme-pr-linuxaarch64.yml b/.github/workflows/run-readme-pr-linuxaarch64.yml
@@ -9,22 +9,20 @@ on:
 
 jobs:
   test-readme-cpu:
-    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    permissions:
+      id-token: write
+      contents: read
     with:
-      runner: linux-aarch64
-      gpu-arch-type: cuda
-      gpu-arch-version: "12.1"
+      runner: linux.arm64.2xlarge
+      docker-image: "pytorch/manylinux2_28_aarch64-builder:cpu-aarch64-main"
+      gpu-arch-type: cpu-aarch64
       timeout: 60
       script: |
         echo "::group::Print machine info"
         uname -a
         echo "::endgroup::"
 
-        echo "::group::Install newer objcopy that supports --set-section-alignment"
-        yum install -y  devtoolset-10-binutils
-        export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH
-        echo "::endgroup::"
-
         TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs readme
 
         echo "::group::Completion"
@@ -33,41 +31,37 @@ jobs:
         echo "::endgroup::"
 
   test-quantization-cpu:
-    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    permissions:
+      id-token: write
+      contents: read
     with:
-      runner: linux-aarch64
-      gpu-arch-type: cuda
-      gpu-arch-version: "12.1"
+      runner: linux.arm64.2xlarge
+      docker-image: "pytorch/manylinux2_28_aarch64-builder:cpu-aarch64-main"
+      gpu-arch-type: cpu-aarch64
       timeout: 60
       script: |
         echo "::group::Print machine info"
         uname -a
         echo "::endgroup::"
 
-        echo "::group::Install newer objcopy that supports --set-section-alignment"
-        yum install -y  devtoolset-10-binutils
-        export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH
-        echo "::endgroup::"
-
         TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs quantization
 
   test-gguf-cpu:
     uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    permissions:
+      id-token: write
+      contents: read
     with:
-      runner: linux-aarch64
-      gpu-arch-type: cuda
-      gpu-arch-version: "12.1"
+      runner: linux.arm64.2xlarge
+      docker-image: "pytorch/manylinux2_28_aarch64-builder:cpu-aarch64-main"
+      gpu-arch-type: cpu-aarch64
       timeout: 60
       script: |
         echo "::group::Print machine info"
         uname -a
         echo "::endgroup::"
 
-        echo "::group::Install newer objcopy that supports --set-section-alignment"
-        yum install -y  devtoolset-10-binutils
-        export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH
-        echo "::endgroup::"
-
         TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs gguf
 
         echo "::group::Completion"
@@ -77,21 +71,19 @@ jobs:
 
   test-advanced-cpu:
     uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    permissions:
+      id-token: write
+      contents: read
     with:
-      runner: linux-aarch64
-      gpu-arch-type: cuda
-      gpu-arch-version: "12.1"
+      runner: linux.arm64.2xlarge
+      docker-image: "pytorch/manylinux2_28_aarch64-builder:cpu-aarch64-main"
+      gpu-arch-type: cpu-aarch64
       timeout: 60
       script: |
         echo "::group::Print machine info"
         uname -a
         echo "::endgroup::"
 
-        echo "::group::Install newer objcopy that supports --set-section-alignment"
-        yum install -y  devtoolset-10-binutils
-        export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH
-        echo "::endgroup::"
-
         TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs advanced
 
         echo "::group::Completion"
@@ -101,21 +93,19 @@ jobs:
 
   test-evaluation-cpu:
     uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    permissions:
+      id-token: write
+      contents: read
     with:
-      runner: linux-aarch64
-      gpu-arch-type: cuda
-      gpu-arch-version: "12.1"
+      runner: linux.arm64.2xlarge
+      docker-image: "pytorch/manylinux2_28_aarch64-builder:cpu-aarch64-main"
+      gpu-arch-type: cpu-aarch64
       timeout: 60
       script: |
         echo "::group::Print machine info"
         uname -a
         echo "::endgroup::"
 
-        echo "::group::Install newer objcopy that supports --set-section-alignment"
-        yum install -y  devtoolset-10-binutils
-        export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH
-        echo "::endgroup::"
-
         TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs evaluation
 
         echo "::group::Completion"
diff --git a/torchchat/generate.py b/torchchat/generate.py
@@ -685,7 +685,7 @@ def generate(
         sequential_prefill=True,
         callback=lambda x: x,
         max_seq_length: int,
-        attention_backend: str = "math",
+        attention_backend: SDPBackend = torch.nn.attention.SDPBackend.MATH,
         seed: Optional[int] = None,
         **sampling_kwargs,
     ) -> torch.Tensor:
@@ -1126,7 +1126,7 @@ def chat(
                     messages_to_encode.append(
                         {"role": "system", "content": self.system_prompt}
                     )
-                messages_to_encode.append({"role": "system", "content": prompt})
+                messages_to_encode.append({"role": "user", "content": prompt})
                 encoded = self.chat_formatter.encode_dialog_prompt(
                     messages_to_encode, add_generation_prompt=True,
                 )