Commit 961ba6e

Merge branch 'main' into patch-42
2 parents 618e72e + 025d412 commit 961ba6e

8 files changed (+163 -13 lines)

Diff for: .github/workflows/run-readme-pr-linuxaarch64.yml

+124
@@ -0,0 +1,124 @@
+name: Run the README instructions - with stories - on Linux aarch64
+
+on:
+  pull_request:
+  push:
+    branches:
+      - main
+  workflow_dispatch:
+
+jobs:
+  test-readme-cpu:
+    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    with:
+      runner: linux-aarch64
+      gpu-arch-type: cuda
+      gpu-arch-version: "12.1"
+      timeout: 60
+      script: |
+        echo "::group::Print machine info"
+        uname -a
+        echo "::endgroup::"
+
+        echo "::group::Install newer objcopy that supports --set-section-alignment"
+        yum install -y devtoolset-10-binutils
+        export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH
+        echo "::endgroup::"
+
+        TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs readme
+
+        echo "::group::Completion"
+        echo "tests complete"
+        echo "*******************************************"
+        echo "::endgroup::"
+
+  test-quantization-cpu:
+    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    with:
+      runner: linux-aarch64
+      gpu-arch-type: cuda
+      gpu-arch-version: "12.1"
+      timeout: 60
+      script: |
+        echo "::group::Print machine info"
+        uname -a
+        echo "::endgroup::"
+
+        echo "::group::Install newer objcopy that supports --set-section-alignment"
+        yum install -y devtoolset-10-binutils
+        export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH
+        echo "::endgroup::"
+
+        TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs quantization
+
+  test-gguf-cpu:
+    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    with:
+      runner: linux-aarch64
+      gpu-arch-type: cuda
+      gpu-arch-version: "12.1"
+      timeout: 60
+      script: |
+        echo "::group::Print machine info"
+        uname -a
+        echo "::endgroup::"
+
+        echo "::group::Install newer objcopy that supports --set-section-alignment"
+        yum install -y devtoolset-10-binutils
+        export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH
+        echo "::endgroup::"
+
+        TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs gguf
+
+        echo "::group::Completion"
+        echo "tests complete"
+        echo "*******************************************"
+        echo "::endgroup::"
+
+  test-advanced-cpu:
+    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    with:
+      runner: linux-aarch64
+      gpu-arch-type: cuda
+      gpu-arch-version: "12.1"
+      timeout: 60
+      script: |
+        echo "::group::Print machine info"
+        uname -a
+        echo "::endgroup::"
+
+        echo "::group::Install newer objcopy that supports --set-section-alignment"
+        yum install -y devtoolset-10-binutils
+        export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH
+        echo "::endgroup::"
+
+        TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs advanced
+
+        echo "::group::Completion"
+        echo "tests complete"
+        echo "*******************************************"
+        echo "::endgroup::"
+
+  test-evaluation-cpu:
+    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    with:
+      runner: linux-aarch64
+      gpu-arch-type: cuda
+      gpu-arch-version: "12.1"
+      timeout: 60
+      script: |
+        echo "::group::Print machine info"
+        uname -a
+        echo "::endgroup::"
+
+        echo "::group::Install newer objcopy that supports --set-section-alignment"
+        yum install -y devtoolset-10-binutils
+        export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH
+        echo "::endgroup::"
+
+        TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs evaluation
+
+        echo "::group::Completion"
+        echo "tests complete"
+        echo "*******************************************"
+        echo "::endgroup::"

Diff for: .github/workflows/run-readme-pr-mps.yml

+1 -1

@@ -10,7 +10,7 @@ jobs:
     uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
     with:
       runner: macos-m1-14
-      timeout: 50
+      timeout: 60
       script: |
         conda create -y -n test-readme-mps-macos python=3.10.11 llvm-openmp
         conda activate test-readme-mps-macos

Diff for: docs/ADVANCED-USERS.md

+2 -2

@@ -479,7 +479,7 @@ in a Python-free environment with AOT Inductor and ExecuTorch.
 | Hardware | OS | Eager | Eager + Compile | AOT Compile | ET Runtime |
 |-----|------|-----|-----|-----|-----|
 | x86 | Linux |||||
-| aarch64 | Linux | n/t | n/t | n/t | n/t |
+| aarch64 | Linux | | | | n/t |
 | aarch64 | macOS |||||
 | AMD GPU | Linux |||||
 | Nvidia GPU | Linux |||||
@@ -490,7 +490,7 @@ in a Python-free environment with AOT Inductor and ExecuTorch.
 | Mobile GPU (Vulkan) | Android |||||
 | CoreML | iOS |||||
 | Hexagon DSP | Android |||||
-| Raspberry Pi 4/5 | Raspbian | n/t | n/t | n/t ||
+| Raspberry Pi 4/5 | Raspbian | | | ||
 | Raspberry Pi 4/5 | Android |||| n/t |
 | ARM 32b (up to v7) | any |||||

Diff for: install/install_requirements.sh

+7 -7

@@ -51,13 +51,13 @@ echo "Using pip executable: $PIP_EXECUTABLE"
 # NOTE: If a newly-fetched version of the executorch repo changes the value of
 # PYTORCH_NIGHTLY_VERSION, you should re-run this script to install the necessary
 # package versions.
-PYTORCH_NIGHTLY_VERSION=dev20241218
+PYTORCH_NIGHTLY_VERSION=dev20250119

 # Nightly version for torchvision
-VISION_NIGHTLY_VERSION=dev20241218
+VISION_NIGHTLY_VERSION=dev20250119

 # Nightly version for torchtune
-TUNE_NIGHTLY_VERSION=dev20241218
+TUNE_NIGHTLY_VERSION=dev20250119

 # The pip repository that hosts nightly torch packages. cpu by default.
 # If cuda is available, based on presence of nvidia-smi, install the pytorch nightly
@@ -79,15 +79,15 @@ fi
 if [[ -x "$(command -v xpu-smi)" ]];
 then
   REQUIREMENTS_TO_INSTALL=(
-    torch=="2.6.0.${PYTORCH_NIGHTLY_VERSION}"
+    torch=="2.7.0.${PYTORCH_NIGHTLY_VERSION}"
     torchvision=="0.22.0.${VISION_NIGHTLY_VERSION}"
-    torchtune=="0.5.0"
+    torchtune=="0.6.0"
   )
 else
   REQUIREMENTS_TO_INSTALL=(
-    torch=="2.6.0.${PYTORCH_NIGHTLY_VERSION}"
+    torch=="2.7.0.${PYTORCH_NIGHTLY_VERSION}"
     torchvision=="0.22.0.${VISION_NIGHTLY_VERSION}"
-    torchtune=="0.5.0.${TUNE_NIGHTLY_VERSION}"
+    torchtune=="0.6.0.${TUNE_NIGHTLY_VERSION}"
   )
 fi

Diff for: torchchat/cli/builder.py

+13
@@ -69,6 +69,7 @@ class BuilderArgs:
     prefill_possible: bool = False
     dynamic_shapes: bool = False
     max_seq_length: Optional[int] = None
+    attention_backend: str = "math"

     def __post_init__(self):
         if self.device is None:
@@ -183,6 +184,17 @@ def from_args(cls, args: argparse.Namespace) -> "BuilderArgs":
         pp = getattr(args, "pp", 1)
         tp = getattr(args, "tp", 1)
         chpt_from = getattr(args, "chpt_from", "hf")
+        sdp_backend_dict = {
+            'math': torch.nn.attention.SDPBackend.MATH,
+            'flash_attention': torch.nn.attention.SDPBackend.FLASH_ATTENTION,
+            'efficient_attention': torch.nn.attention.SDPBackend.EFFICIENT_ATTENTION,
+            'cudnn_attention': torch.nn.attention.SDPBackend.CUDNN_ATTENTION,
+        }
+        attention_backend = sdp_backend_dict[args.attention_backend]
+        if args.device == "cpu" and (args.attention_backend == "efficient_attention"
+                                     or args.attention_backend == "cudnn_attention"):
+            print(f"Warning: {args.attention_backend} is not supported on CPU. Using math instead.")
+            attention_backend = torch.nn.attention.SDPBackend.MATH
         return cls(
             checkpoint_dir=checkpoint_dir,
             checkpoint_path=checkpoint_path,
@@ -207,6 +219,7 @@ def from_args(cls, args: argparse.Namespace) -> "BuilderArgs":
             is_chat_model=is_chat_model,
             dynamic_shapes=getattr(args, "dynamic_shapes", False),
             max_seq_length=getattr(args, "max_seq_length", None),
+            attention_backend=attention_backend,
         )

     @classmethod

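In short, the builder now maps the CLI string to a torch.nn.attention.SDPBackend value and falls back to MATH on CPU for the CUDA-oriented backends. A minimal standalone sketch of that mapping (the dict and function names below are illustrative, not torchchat APIs):

import torch
from torch.nn.attention import SDPBackend

SDP_BACKENDS = {
    "math": SDPBackend.MATH,
    "flash_attention": SDPBackend.FLASH_ATTENTION,
    "efficient_attention": SDPBackend.EFFICIENT_ATTENTION,
    "cudnn_attention": SDPBackend.CUDNN_ATTENTION,
}

def resolve_attention_backend(name: str, device: str) -> SDPBackend:
    # Map the CLI string to an SDPBackend enum value.
    backend = SDP_BACKENDS[name]
    # efficient_attention and cudnn_attention target CUDA; on CPU the builder
    # warns and falls back to the portable math backend.
    if device == "cpu" and name in ("efficient_attention", "cudnn_attention"):
        print(f"Warning: {name} is not supported on CPU. Using math instead.")
        backend = SDPBackend.MATH
    return backend

print(resolve_attention_backend("cudnn_attention", "cpu"))  # SDPBackend.MATH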
Diff for: torchchat/cli/cli.py

+7
@@ -179,6 +179,13 @@ def _add_model_config_args(parser, verb: str) -> None:
         choices=["fast", "cpu", "cuda", "mps", "xpu"],
         help="Hardware device to use. Options: fast, cpu, cuda, mps, xpu",
     )
+    model_config_parser.add_argument(
+        "--attention-backend",
+        type=str,
+        default="math",
+        choices=["math", "flash_attention", "efficient_attention", "cudnn_attention"],
+        help="SDPBackend to use. Options: MATH, FLASH_ATTENTION, EFFICIENT_ATTENTION, CUDNN_ATTENTION",
+    )


 # Add CLI Args representing output paths of exported model files

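For reference, the new flag parses like any other argparse choice argument; a self-contained sketch with a standalone parser (not the torchchat wiring):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--attention-backend",
    type=str,
    default="math",
    choices=["math", "flash_attention", "efficient_attention", "cudnn_attention"],
    help="SDPBackend to use.",
)

# e.g. `torchchat.py generate ... --attention-backend flash_attention`
args = parser.parse_args(["--attention-backend", "flash_attention"])
print(args.attention_backend)  # flash_attention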
Diff for: torchchat/generate.py

+6 -1

@@ -26,6 +26,7 @@
 import torch.distributed as dist
 import torch.multiprocessing as mp
 from torch.distributed.pipelining import PipelineStage, ScheduleGPipe
+from torch._C import _SDPBackend as SDPBackend

 from PIL import Image

@@ -531,6 +532,7 @@ def decode_n_tokens(
         callback=lambda _: _,
         eos_token_id: int = 2,
         eot_id: Optional[int] = None,
+        attention_backend: SDPBackend = torch.nn.attention.SDPBackend.MATH,
         **sampling_kwargs,
     ):
         new_tokens, new_probs = [], []
@@ -539,7 +541,7 @@ def decode_n_tokens(
             num_new_tokens - 1
         ):  # -1 to save space to run an EoS if dont generate it naturally
             # Actually better for Inductor to codegen attention here
-            with torch.nn.attention.sdpa_kernel([torch.nn.attention.SDPBackend.MATH]):
+            with torch.nn.attention.sdpa_kernel([attention_backend]):

                 out_token = cur_token.clone()
                 next_token, next_prob = self.decode_one_token(
@@ -683,6 +685,7 @@ def generate(
         sequential_prefill=True,
         callback=lambda x: x,
         max_seq_length: int,
+        attention_backend: SDPBackend = torch.nn.attention.SDPBackend.MATH,
         seed: Optional[int] = None,
         **sampling_kwargs,
     ) -> torch.Tensor:
@@ -799,6 +802,7 @@ def generate(
             if self.is_llama3_model
             else None
         ),
+        attention_backend=attention_backend,
         **sampling_kwargs,
     ):
         generated_tokens.append(generated_token.view(-1))
@@ -1186,6 +1190,7 @@ def callback(x, *, done_generating=False):
             start_pos=start_pos,
             skip_cache_setup=not is_first_sample,
             max_seq_length=max_seq_length,
+            attention_backend=self.builder_args.attention_backend,
         )
         for token_tensor, metrics in generator_func:
             if token_tensor is not None:

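The practical effect is that the SDPA kernel choice is no longer hard-coded to MATH during decoding. A minimal sketch of how the backend threads into a decode step (decode_step and model are stand-ins, not the actual torchchat Generator methods):

import torch
from torch.nn.attention import SDPBackend, sdpa_kernel

def decode_step(model, cur_token, input_pos,
                attention_backend: SDPBackend = SDPBackend.MATH):
    # Every scaled_dot_product_attention call inside this context is restricted
    # to the requested backend instead of the previously hard-coded MATH.
    with sdpa_kernel([attention_backend]):
        logits = model(cur_token, input_pos)
    return torch.argmax(logits[:, -1], dim=-1)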
Diff for: torchchat/model.py

+3 -2

@@ -1025,7 +1025,7 @@ def apply_rotary_emb(x: Tensor, freqs_cis: Tensor) -> Tensor:
     # For quantized_decomposed ops
     from executorch.kernels import quantized  # no-qa
     # For llama::sdpa_with_kv_cache.out, preprocess ops
-    from executorch.extension.llm.custom_ops import sdpa_with_kv_cache  # no-qa
+    from executorch.extension.llm.custom_ops import custom_ops  # no-qa

     class PTEModel(nn.Module):
         def __init__(self, config, path) -> None:
@@ -1062,5 +1062,6 @@ def forward(self, x, input_pos):
         def setup_caches(self, max_batch_size, max_seq_length):
             pass

-except:
+except Exception as e:
+    print(f"Warning: PTEModel (ExecuTorch) not available with exception: {e}")
     pass

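The except change is the usual optional-dependency guard: catch a concrete Exception and report why the ExecuTorch path is unavailable instead of swallowing it with a bare except. A minimal sketch of the pattern (the HAS_EXECUTORCH flag is illustrative, not part of torchchat):

try:
    # ExecuTorch custom ops are optional; importing the module registers the kernels.
    from executorch.extension.llm.custom_ops import custom_ops  # noqa: F401
    HAS_EXECUTORCH = True
except Exception as e:
    # Reporting the exception makes a missing or broken ExecuTorch install
    # diagnosable instead of silently disabling the PTE model path.
    print(f"Warning: PTEModel (ExecuTorch) not available with exception: {e}")
    HAS_EXECUTORCH = False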