Commit 76d43bc

Qualcomm AI Engine Direct - GA Static Granite3.3-2b (#15808)
### Summary
Add Granite3.3-2b support.

Source model: (two screenshots omitted)

Static llama:
`python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -H mlgtw-linux -s c3b39f15 -m SM8650 --temperature 0 --model_mode kv --max_seq_len 1024 --prefill_ar_len 128 --decoder_model granite_3_3-2b_instruct --prompt "I would like to learn python, could you teach me with a simple example?" --run_lm_eval --task hellaswag --limit 10 --artifact llama_qnn --kv_updater shift_pointer`

#### Accuracy (hellaswag, limit=10)
- prepare_pt2e: {'acc_norm,none': 0.5}
- convert_pt2e: {'acc_norm,none': 0.3}
- device: {'acc_norm,none': 0.2}

#### Statistics on SM8650 (16a4w_block64)
(screenshot omitted)

#### Statistics on SM8750 (16a4w_block64)
(screenshot omitted)

### Test plan
`python backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleLLMScript.test_granite_3_3_2b_instruct --device c3b39f15 --host mlgtw-linux --model SM8650 --build_folder build-android --executorch_root . --artifact_dir ./llama_qnn --llama_artifacts llama_qnn`

cc @cccclai @cbilgin
1 parent 0c0cee5 commit 76d43bc

File tree

15 files changed (+354, -45 lines)


backends/qualcomm/tests/test_qnn_delegate.py

Lines changed: 68 additions & 1 deletion
@@ -5933,7 +5933,7 @@ def test_static_llm_model(self):
             "kv",
             "--max_seq_len",
             "1024",
-            "--eval_perplexity",
+            "--run_lm_eval",
             "--tasks",
             "wikitext",
             "--limit",
@@ -6051,6 +6051,73 @@ def test_codegen2_1b(self):
         if not self.compile_only and not self.enable_x86_64:
             self.assertGreaterEqual(msg["inference_speed"], 60)

+    def test_granite_3_3_2b_instruct(self):
+        if not self.required_envs():
+            self.skipTest("missing required envs")
+
+        prompt = "What is the meaning of life?"
+        cmds = [
+            "python",
+            f"{self.executorch_root}/examples/qualcomm/oss_scripts/llama/llama.py",
+            "--artifact",
+            self.artifact_dir,
+            "--build_folder",
+            self.build_folder,
+            "--model",
+            self.model,
+            "--ip",
+            self.ip,
+            "--port",
+            str(self.port),
+            "--prompt",
+            f"{prompt}",
+            "--temperature",
+            "0",
+            "--decoder_model",
+            "granite_3_3-2b_instruct",
+            "--model_mode",
+            "kv",
+            "--max_seq_len",
+            "1024",
+            "--run_lm_eval",
+            "--tasks",
+            "hellaswag",
+            "--limit",
+            "10",
+            "--kv_updater",
+            "shift_pointer",
+        ]
+        if self.compile_only:
+            cmds.extend(["--compile_only"])
+        elif self.device:
+            cmds.extend(["--device", self.device])
+            if self.host:
+                cmds.extend(["--host", self.host])
+        elif self.enable_x86_64:
+            cmds.extend(["--enable_x86_64"])
+        if self.pre_gen_pte:
+            cmds.extend(["--pre_gen_pte", self.pre_gen_pte])
+
+        p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL)
+        with Listener((self.ip, self.port)) as listener:
+            conn = listener.accept()
+            p.communicate()
+            msg = json.loads(conn.recv())
+            if "Error" in msg:
+                self.fail(msg["Error"])
+            else:
+                inference_speed_ref = {"SM8650": 20, "SM8750": 22}
+                if (
+                    not self.compile_only
+                    and not self.enable_x86_64
+                    and self.model in inference_speed_ref
+                ):
+                    self.assertLessEqual(msg["pte_size"], 1_600_000_000)
+                    self.assertGreaterEqual(msg["acc_norm"], 0.2)
+                    self.assertGreaterEqual(
+                        msg["inference_speed"], inference_speed_ref[self.model]
+                    )
+
     def test_llama_stories_260k(self):
         if not self.required_envs():
             self.skipTest("missing required envs")
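The new test follows the same pattern as the other LLM example tests in this file: it launches `llama.py` as a subprocess, waits for the script to connect back over a `multiprocessing.connection.Listener`, and reads a JSON message carrying the metrics it asserts on (`pte_size`, `acc_norm`, and `inference_speed`, with per-SoC speed thresholds of 20 on SM8650 and 22 on SM8750). A stripped-down sketch of that handshake is below; the sender side is an assumption about what the example script does rather than code from this commit, and the address is a placeholder.

```python
import json
from multiprocessing.connection import Client, Listener

ADDRESS = ("127.0.0.1", 12345)  # placeholder ip/port

def receive_metrics():
    # Receiver side (what the test does): block until the worker connects,
    # then parse the JSON payload it sends.
    with Listener(ADDRESS) as listener:
        conn = listener.accept()
        return json.loads(conn.recv())

def send_metrics(metrics):
    # Sender side (presumably what llama.py does at the end of a run).
    with Client(ADDRESS) as conn:
        conn.send(json.dumps(metrics))
```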
Lines changed: 16 additions & 0 deletions
@@ -0,0 +1,16 @@
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from executorch.examples.models.granite.convert_weights import convert_weights
+from executorch.examples.models.llama.model import Llama2Model
+
+
+class GraniteModel(Llama2Model):
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+
+__all__ = [
+    "GraniteModel",
+    "convert_weights",
+]
Lines changed: 19 additions & 0 deletions
@@ -0,0 +1,19 @@
+{
+  "dim": 2048,
+  "attention_qkv_bias": false,
+  "attention_multiplier": 0.015625,
+  "bos_idx": 0,
+  "embedding_scale_factor": 12.0,
+  "eos_idx": 0,
+  "act_fn": "silu",
+  "hidden_dim": 8192,
+  "n_heads": 32,
+  "n_layers": 40,
+  "n_kv_heads": 8,
+  "norm_eps": 1e-05,
+  "rope_theta": 10000000.0,
+  "vocab_size": 49159,
+  "use_hf_rope": false,
+  "residual_multiplier": 0.22,
+  "logits_scaling": 8.0
+}
Lines changed: 106 additions & 0 deletions
@@ -0,0 +1,106 @@
+import argparse
+
+import json
+import os
+from typing import Dict
+
+import torch
+from safetensors.torch import load_file
+
+from torchtune.models.convert_weights import get_mapped_key
+
+
+# Weight mappings from Granite 3's checkpoint to ExecuTorch's transformer parameters.
+_GRANITE_TO_EXECUTORCH = {
+    "model.embed_tokens.weight": "tok_embeddings.weight",
+    "model.norm.weight": "norm.weight",
+    "model.layers.{}.self_attn.k_proj.weight": "layers.{}.attention.wk.weight",
+    "model.layers.{}.self_attn.q_proj.weight": "layers.{}.attention.wq.weight",
+    "model.layers.{}.self_attn.v_proj.weight": "layers.{}.attention.wv.weight",
+    "model.layers.{}.self_attn.o_proj.weight": "layers.{}.attention.wo.weight",
+    "model.layers.{}.input_layernorm.weight": "layers.{}.attention_norm.weight",
+    "model.layers.{}.post_attention_layernorm.weight": "layers.{}.ffn_norm.weight",
+    "model.layers.{}.mlp.gate_proj.weight": "layers.{}.feed_forward.w1.weight",
+    "model.layers.{}.mlp.down_proj.weight": "layers.{}.feed_forward.w2.weight",
+    "model.layers.{}.mlp.up_proj.weight": "layers.{}.feed_forward.w3.weight",
+}
+
+
+def granite_to_executorch(
+    state_dict: Dict[str, torch.Tensor]
+) -> Dict[str, torch.Tensor]:
+    """
+    Convert the state dict so that it matches what ExecuTorch's transformer definition expects.
+    """
+    converted_state_dict = {}
+    for key, value in state_dict.items():
+        new_key = get_mapped_key(key, _GRANITE_TO_EXECUTORCH)
+        converted_state_dict[new_key] = value
+    converted_state_dict["output.weight"] = converted_state_dict[
+        "tok_embeddings.weight"
+    ]
+    return converted_state_dict
+
+
+def load_checkpoint_from_safetensors(input_dir: str) -> Dict:
+    index_path = os.path.join(input_dir, "model.safetensors.index.json")
+    if os.path.exists(index_path):
+        # Sharded checkpoint.
+        with open(index_path, "r") as f:
+            index = json.load(f)
+        weight_map = index["weight_map"]
+        checkpoint_shards = sorted(set(weight_map.values()))
+
+        # Load all the shards into memory
+        shard_to_weights = {}
+        for shard in checkpoint_shards:
+            shard_to_weights[shard] = load_file(os.path.join(input_dir, shard))
+
+        # Merge tensors into consolidated state dict.
+        merged_state_dict = {}
+        for weight_name, shard in weight_map.items():
+            tensor = shard_to_weights[shard][weight_name]
+            merged_state_dict[weight_name] = tensor
+        return merged_state_dict
+    else:
+        # Single checkpoint.
+        state_dict = load_file(os.path.join(input_dir, "model.safetensors"))
+        return state_dict
+
+
+def load_checkpoint(input_dir: str) -> Dict:
+    pytorch_path = os.path.join(input_dir, "pytorch_model.bin")
+    if os.path.exists(pytorch_path):
+        print("Loading checkpoint from PyTorch .bin file")
+        return torch.load(pytorch_path, map_location="cpu", weights_only=True)
+    print("Loading checkpoint from safetensors directory")
+    return load_checkpoint_from_safetensors(input_dir)
+
+
+def convert_weights(input_dir: str, output_file: str) -> None:
+    print("Loading checkpoint...")
+    sd = load_checkpoint(input_dir)
+    print("Converting checkpoint...")
+    sd = granite_to_executorch(sd)
+    print("Saving checkpoint...")
+    torch.save(sd, output_file)
+    print("Done.")
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Convert Granite weights to ExecuTorch transformer format."
+    )
+    parser.add_argument(
+        "input_dir",
+        type=str,
+        help="Path to directory containing safetensor checkpoint files, or PyTorch checkpoint file.",
+    )
+    parser.add_argument("output", type=str, help="Path to the output checkpoint")
+
+    args = parser.parse_args()
+    convert_weights(args.input_dir, args.output)
+
+
+if __name__ == "__main__":
+    main()
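The converter mirrors the existing per-model `convert_weights` helpers: it loads either a sharded safetensors checkpoint or a single `pytorch_model.bin`, remaps HuggingFace parameter names to ExecuTorch's transformer naming, ties `output.weight` to the token embedding, and saves a consolidated checkpoint. A minimal usage sketch (the module path follows the import added elsewhere in this commit; local paths are placeholders):

```python
# Sketch only: convert a locally downloaded ibm-granite/granite-3.3-2b-instruct
# checkpoint into the consolidated .pth the llama example scripts expect.
from executorch.examples.models.granite.convert_weights import convert_weights

convert_weights(
    input_dir="/path/to/granite-3.3-2b-instruct",        # HF checkpoint dir (placeholder)
    output_file="/path/to/granite_3_3_2b_instruct.pth",  # output path (placeholder)
)
```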

examples/models/llama/evaluate/eager_eval.py

Lines changed: 2 additions & 2 deletions
@@ -69,8 +69,8 @@ def device(self):
     def tok_encode(self, string: str, **kwargs):  # pyre-ignore
         return self._tokenizer.encode(string, bos=False, eos=False)

-    def tok_decode(self, tokens):
-        return self._tokenizer.decode(tokens)
+    def tok_decode(self, tokens, **kwargs):
+        return self._tokenizer.decode([tokens] if isinstance(tokens, int) else tokens)

     def _model_call(self, inps):
         if self._use_kv_cache:
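This change makes the eager evaluator tolerant of how the lm-eval harness may call `tok_decode`: extra keyword arguments are now accepted, and a bare token id is wrapped in a list before decoding. A tiny illustrative sketch (not the repository code; `tokenizer` is a stand-in):

```python
def tok_decode(tokenizer, tokens, **kwargs):
    # A single int id becomes [id]; lists pass through unchanged.
    return tokenizer.decode([tokens] if isinstance(tokens, int) else tokens)

# tok_decode(tok, [101, 2023, 102])               -> decodes a token list
# tok_decode(tok, 102, skip_special_tokens=True)  -> single id handled too
```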

examples/models/llama/model_args.py

Lines changed: 9 additions & 0 deletions
@@ -49,6 +49,9 @@ class ModelArgs:
     model_architecture: str = (
         "LlamaForCausalLM"  # This setting is currently only supported for the QNN backend
     )
+    attention_multiplier: Optional[float] = (
+        None  # Scaling factor 1/sqrt(d_k) in attention formula
+    )
     norm_eps: float = 1e-5
     post_attention_norm: bool = False
     post_ffn_norm: bool = False
@@ -75,6 +78,9 @@ class ModelArgs:
     # at runtime. Enable it only necessary (e.g., use perplexity tools that requires
     # logits for all input tokens.)
     generate_full_logits: bool = False
+    logits_scaling: Optional[float] = (
+        None  # Scaling factor applied to the logits of model, functioning similarly to a temperature parameter.
+    )
     enable_dynamic_shape: bool = False  # export model with dynamic shape support
     # A dictionary mapping from pruned token-id to original token-id
     input_prune_map: Optional[Dict[int, int]] = None
@@ -85,6 +91,9 @@ class ModelArgs:
     apply_output: bool = True  # Use output layer (unembedding) inside the transformer
     use_qk_norm: bool = False  # apply normalization to q and k in the attention
     qk_norm_before_rope: bool = False  # when to apply qk norm
+    residual_multiplier: Optional[float] = (
+        None  # Scaling factor applied to the residual hidden states
+    )
     use_hf_rope: bool = False  # Use HuggingFace's RoPE implementation
     no_rope_layer_interval: Optional[int] = (
         None  # Interval at which to skip RoPE. From Rope to Nope and Back Again: A New Hybrid Attention Strategy (https://huggingface.co/papers/2501.18795).
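These three fields carry Granite's architectural scaling knobs into `ModelArgs`. For the 2b config added in this commit, `attention_multiplier` is 0.015625, i.e. 1/64 with head_dim = dim / n_heads = 2048 / 32 = 64. A minimal sketch of how Granite-style multipliers are typically applied (illustration only, not the ExecuTorch transformer code; values taken from 2b_config.json):

```python
import torch

def attention_scores(q, k, attention_multiplier=0.015625):
    # Fixed multiplier replaces the usual 1/sqrt(head_dim) score scaling.
    return torch.matmul(q, k.transpose(-2, -1)) * attention_multiplier

def residual_add(hidden, block_output, residual_multiplier=0.22):
    # Block outputs are damped before being added back to the residual stream.
    return hidden + residual_multiplier * block_output

def scale_logits(logits, logits_scaling=8.0):
    # Final logits are divided by a fixed factor, acting like a built-in temperature.
    return logits / logits_scaling
```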

examples/qualcomm/oss_scripts/llama/README.md

Lines changed: 23 additions & 15 deletions
@@ -2,17 +2,19 @@

 ## Overview
 This file provides you the instructions to run LLM Decoder model with different parameters via Qualcomm HTP backend. We currently support the following models:
+<!-- numbered list will be automatically generated -->
 1. LLAMA2 Stories 110M
-2. LLAMA3.2 1B
-3. LLAMA3.2 3B
-4. Codegen2 1B
-5. Gemma 2B
-6. Gemma3 1B
-7. Phi4-mini-instruct
-8. QWEN2.5 0.5B / 1.5B
-9. QWEN3 0.6B / 1.7B
-10. SmolLM2 135M
-11. SmolLM3 3B
+1. LLAMA3.2 1B
+1. LLAMA3.2 3B
+1. Codegen2 1B
+1. Gemma 2B
+1. Gemma3 1B
+1. Granite3.3 2B
+1. Phi4-mini-instruct
+1. QWEN2.5 0.5B / 1.5B
+1. QWEN3 0.6B / 1.7B
+1. SmolLM2 135M
+1. SmolLM3 3B


 We offer the following modes to execute the model:
@@ -100,6 +102,12 @@ Default example using hybrid mode
 python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --temperature 0 --model_mode hybrid --max_seq_len 1024 --prefill_ar_len 128 --decoder_model gemma3-1b --prompt "I would like to learn python, could you teach me with a simple example?" --tasks wikitext --limit 1
 ```

+#### Granite3.3 2B
+Default example using hybrid mode
+```bash
+python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --temperature 0 --model_mode hybrid --max_seq_len 1024 --prefill_ar_len 128 --decoder_model granite_3_3-2b_instruct --prompt "I would like to learn python, could you teach me with a simple example?" --run_lm_eval --task hellaswag --limit 10
+```
+
 #### Phi4-mini-instruct
 Default example using kv mode.
 ```bash
@@ -227,24 +235,24 @@ python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL
 #### Perplexity Evaluation
 This script supports perplexity evaluation and is capable of assessing perplexity scores across 3 phases: prepare_pt2e(CPU FP), convert_pt2e(CPU QDQ), QNN on device.

-To evaluate the perplexity across all 3 phases, users should provide the `--eval_perplexity` flag and specify the evaluation task. Please notice when this flag is provided, the `--prompt ${PROMPT}` will be ignored.
+To evaluate the perplexity across all 3 phases, users should provide the `--run_lm_eval` flag and specify the evaluation task. Please notice when this flag is provided, the `--prompt ${PROMPT}` will be ignored.

 For example, using the Qwen model and 1 wikitext sample as the evaluation task, users can assess all 3 phases perplexity score in a single run by including the appropriate configuration:
 ```bash
-python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --prompt "What is 1+1?" --temperature 0 --model_mode kv --max_seq_len 1024 --decoder_model qwen2_5-0_5b --eval_perplexity --tasks wikitext --limit 1
+python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --prompt "What is 1+1?" --temperature 0 --model_mode kv --max_seq_len 1024 --decoder_model qwen2_5-0_5b --run_lm_eval --tasks wikitext --limit 1
 ```

 For the example script above, 1 wikitext sample is used to evaluate all 3 phases. However, there are cases where a user may want to use one sample for quantization calibration and multiple samples for perplexity evaluation. In this case, the process should be split into two runs. In the 1st run, the model is compiled using one sample. In the 2nd run, the user can provide a different configuration for QNN device execution.
 Example:
 ```bash
 # 1st run to compile with --limit 1
-python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --prompt "What is 1+1?" --temperature 0 --model_mode kv --max_seq_len 1024 --decoder_model qwen2_5-0_5b --eval_perplexity --tasks wikitext --limit 1 --compile_only
+python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --prompt "What is 1+1?" --temperature 0 --model_mode kv --max_seq_len 1024 --decoder_model qwen2_5-0_5b --run_lm_eval --tasks wikitext --limit 1 --compile_only
 ```
 ```bash
 # 2nd run to perform QNN device execution with --limit 3
-python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --prompt "What is 1+1?" --temperature 0 --model_mode kv --max_seq_len 1024 --decoder_model qwen2_5-0_5b --eval_perplexity --tasks wikitext --limit 3 --pre_gen_pte ${PATH_TO_ARTIFACT_IN_1ST_RUN} --quant_attrs_path ${PATH_TO_ARTIFACT_IN_1ST_RUN}/kv_llama_qnn_quant_attrs.json
+python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --prompt "What is 1+1?" --temperature 0 --model_mode kv --max_seq_len 1024 --decoder_model qwen2_5-0_5b --run_lm_eval --tasks wikitext --limit 3 --pre_gen_pte ${PATH_TO_ARTIFACT_IN_1ST_RUN} --quant_attrs_path ${PATH_TO_ARTIFACT_IN_1ST_RUN}/kv_llama_qnn_quant_attrs.json
 ```

 #### Tasks quantization calibration
 If `--tasks ${TASK}` is not provided, the program will use `--prompt ${PROMPT}` as the dataset for quantization calibration.
-Regardless of whether `--eval_perplexity` is provided, as long as `--tasks ${TASK}` is specified, the specified tasks will be used for model quantization calibration instead of the prompt.
+Regardless of whether `--run_lm_eval` is provided, as long as `--tasks ${TASK}` is specified, the specified tasks will be used for model quantization calibration instead of the prompt.

examples/qualcomm/oss_scripts/llama/__init__.py

Lines changed: 32 additions & 0 deletions
@@ -29,6 +29,9 @@

 from executorch.examples.models.gemma import convert_weights as convert_gemma_weights
 from executorch.examples.models.gemma3 import convert_weights as convert_gemma3_weights
+from executorch.examples.models.granite import (
+    convert_weights as convert_granite_weights,
+)
 from executorch.examples.models.phi_4_mini import (
     convert_weights as convert_phi_4_mini_weights,
 )
@@ -385,6 +388,35 @@ class Gemma3(LLMModelConfig):
     )


+@register_llm_model("granite_3_3-2b_instruct")
+@dataclass(init=False, frozen=True)
+class Granite_3_3_2b_Instruct(LLMModelConfig):
+    repo_id: str = "ibm-granite/granite-3.3-2b-instruct"
+    params_path: str = os.path.join(
+        BASE_DIR, "../../../models/granite/config/2b_config.json"
+    )
+    convert_weights = convert_granite_weights
+    transform_weight = False
+    instruct_model = True
+
+    num_sharding = 1
+    # quant config
+    ptq = QuantDtype.use_16a4w_block
+    group_size = 64
+    masked_softmax = True
+    seq_mse_candidates = 0
+    r1 = False
+    r2 = False
+    r3 = False
+    quantization_config_wv_sha_16a8w = get_ptq_per_channel_quant_config(
+        torch.uint16, weight_dtype=torch.int8, act_observer=MinMaxObserver
+    )
+    custom_annotation = (
+        annotate_kv_8bit,
+        partial(annotate_wv_sha, quantization_config=quantization_config_wv_sha_16a8w),
+    )
+
+
 @register_llm_model("phi_4_mini")
 @dataclass(init=False, frozen=True)
 class Phi4Mini(LLMModelConfig):

examples/qualcomm/oss_scripts/llama/decoder_constants.py

Lines changed: 1 addition & 0 deletions
@@ -16,6 +16,7 @@
     "stories110m": "llama2",
     "gemma-2b": "gemma",
     "gemma3-1b": "gemma3",
+    "granite_3_3-2b_instruct": "granite",
     "phi_4_mini": "phi_4_mini",
     "llama3_2-1b_instruct": "llama3",
    "llama3_2-3b_instruct": "llama3",
