From 4769c7c0a9098cc5c23348b94e4fa446bf521da6 Mon Sep 17 00:00:00 2001
From: "shijie.yu"
Date: Thu, 15 May 2025 19:42:47 +0000
Subject: [PATCH 1/6] Qwen 3

---
 README.md                                |  1 +
 litgpt/config.py                         | 25 ++++++++
 litgpt/prompts.py                        | 28 ++++++---
 litgpt/scripts/convert_hf_checkpoint.py  | 72 ++++++++++++++++++++++++
 litgpt/scripts/convert_lit_checkpoint.py | 51 +++++++++++++++++
 tests/test_model.py                      | 66 +++++++++++++++++++++-
 tests/test_tokenizer.py                  |  2 +-
 tutorials/download_model_weights.md      |  1 +
 8 files changed, 235 insertions(+), 11 deletions(-)

diff --git a/README.md b/README.md
index 929d78c04e..c00285e527 100644
--- a/README.md
+++ b/README.md
@@ -146,6 +146,7 @@ Every model is written from scratch to maximize performance and remove layers of
 | Qwen2.5 Math | 1.5B, 7B, 72B | Alibaba Group | [An, Yang et al. 2024](https://arxiv.org/abs/2409.12122) |
 | QwQ | 32B | Alibaba Group | [Qwen Team 2025](https://qwenlm.github.io/blog/qwq-32b/) |
 | QwQ-Preview | 32B | Alibaba Group | [Qwen Team 2024](https://qwenlm.github.io/blog/qwq-32b-preview/) |
+| Qwen3 | 8B | Alibaba Group | [Qwen Team 2025](https://arxiv.org/abs/2505.09388/) |
 | R1 Distill Llama | 8B, 70B | DeepSeek AI | [DeepSeek AI 2025](https://github.com/deepseek-ai/DeepSeek-R1/blob/main/DeepSeek_R1.pdf) |
 | SmolLM2 | 135M, 360M, 1.7B | Hugging Face | [Hugging Face 2024](https://github.com/huggingface/smollm) |
 | Salamandra | 2B, 7B | Barcelona Supercomputing Centre | [BSC-LTC 2024](https://github.com/BSC-LTC/salamandra) |
diff --git a/litgpt/config.py b/litgpt/config.py
index 70bd6079a7..147823a59a 100644
--- a/litgpt/config.py
+++ b/litgpt/config.py
@@ -2460,6 +2460,31 @@ def norm_class(self) -> Type:
 configs.extend(qwq)
 
+qwen_3 = [
+    # https://huggingface.co/Qwen/Qwen3-8B/blob/main/config.json
+    dict(
+        name="Qwen3-8B",
+        hf_config=dict(org="Qwen", name="Qwen3-8B"),
+        block_size=40960,
+        vocab_size=151936,
+        padded_vocab_size=151936,
+        n_layer=36,
+        n_head=32,
+        n_embd=4096,
+        n_query_groups=8,
+        rotary_percentage=1.0,
+        parallel_residual=False,
+        bias=False,
+        norm_class_name="RMSNorm",
+        mlp_class_name="LLaMAMLP",
+        intermediate_size=12288,
+        norm_eps=1e-6,
+        rope_base=1000000,
+        norm_qk=True,
+    ),
+]
+
+configs.extend(qwen_3)
 
 #############
 # Salamandra
diff --git a/litgpt/prompts.py b/litgpt/prompts.py
index 17fda9010e..de87c54b6a 100644
--- a/litgpt/prompts.py
+++ b/litgpt/prompts.py
@@ -4,7 +4,7 @@
 from abc import abstractmethod
 from json import dumps
 from pathlib import Path
-from typing import TYPE_CHECKING, Dict, List, Tuple, Type, Union
+from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Type, Union
 
 import yaml
 
@@ -343,22 +343,25 @@ def apply(self, prompt: str, **kwargs: str) -> str:
 
 class ChatML(PromptStyle):
-    def __init__(self, system_message: str):
+    def __init__(self, system_message: Optional[str] = None):
         self.system_message = system_message
 
     def apply(self, prompt: str, **kwargs: str) -> str:
+        if not self.system_message:
+            return f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"
         return f"<|im_start|>system\n{self.system_message}<|im_end|>\n<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"
 
-
 class Qwen2_5(ChatML):
     def __init__(self):
-        super().__init__("You are Qwen, created by Alibaba Cloud. You are a helpful assistant.")
-
+        super().__init__(
+            "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."
+        )
 
 class Qwen2_5_Math(ChatML):
     def __init__(self):
-        super().__init__("Please reason step by step, and put your final answer within \\boxed{}.")
-
+        super().__init__(
+            "Please reason step by step, and put your final answer within \\boxed{}."
+        )
 
 class QwQ(ChatML):
     def __init__(self):
@@ -366,11 +369,15 @@ def __init__(self):
             "You are a helpful and harmless assistant. You are Qwen developed by Alibaba. You should think step-by-step."
         )
 
+class Qwen3(ChatML):
+    def __init__(self):
+        super().__init__()
 
 class SmolLM2(ChatML):
     def __init__(self):
-        super().__init__("You are a helpful AI assistant named SmolLM, trained by Hugging Face")
-
+        super().__init__(
+            "You are a helpful AI assistant named SmolLM, trained by Hugging Face"
+        )
 
 class Salamandra(ChatML):
     def __init__(self):
@@ -406,6 +413,7 @@ def __init__(self):
     "qwen2.5": Qwen2_5,
     "qwen2.5-math": Qwen2_5_Math,
     "qwq": QwQ,
+    "qwen3": Qwen3,
     "smollm2": SmolLM2,
     "salamandra": Salamandra,
 }
@@ -458,6 +466,8 @@ def model_name_to_prompt_style(model_name: str) -> PromptStyle:
         return Qwen2_5()
     if re.search(r"QwQ-.*", model_name):
        return QwQ()
+    if re.search(r"Qwen3-.*", model_name):
+        return Qwen3()
     if re.search(r"SmolLM2.*-Instruct", model_name):
         return SmolLM2()
     if re.search(r"salamandra-.*-instruct", model_name):
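For reference, a minimal sketch (not part of the patch) of what the reworked prompt styles produce once this change is applied. `Qwen3` initializes `ChatML` with no system message, so `apply()` skips the system block:

```python
from litgpt.prompts import Qwen2_5, Qwen3

# Qwen2.5 keeps its default system message:
print(Qwen2_5().apply("What is 2+2?"))
# <|im_start|>system
# You are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>
# <|im_start|>user
# What is 2+2?<|im_end|>
# <|im_start|>assistant

# Qwen3 passes no system message, so only user/assistant turns are emitted:
print(Qwen3().apply("What is 2+2?"))
# <|im_start|>user
# What is 2+2?<|im_end|>
# <|im_start|>assistant
```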
diff --git a/litgpt/scripts/convert_hf_checkpoint.py b/litgpt/scripts/convert_hf_checkpoint.py
index 341e1a757d..1662ba5b10 100644
--- a/litgpt/scripts/convert_hf_checkpoint.py
+++ b/litgpt/scripts/convert_hf_checkpoint.py
@@ -532,6 +532,74 @@ def copy_weights_qwen_2_5(
         if progress_per_file is not None:
             pbar.update(progress_per_file)
 
+def copy_weights_qwen_3(
+    config: Config,
+    qkv_weights: Dict[int, List[Optional[NotYetLoadedTensor]]],
+    state_dict: Dict[str, torch.Tensor],
+    hf_weights: Dict[str, Union[torch.Tensor, NotYetLoadedTensor]],
+    saver: Optional[incremental_save] = None,
+    dtype: Optional[torch.dtype] = None,
+    pbar: Optional[tqdm] = None,
+    progress_per_file: Optional[float] = None,
+    debug_mode: Optional[bool] = False,
+) -> None:
+    weight_map = {
+        "model.embed_tokens.weight": "transformer.wte.weight",
+        "model.layers.{}.input_layernorm.weight": "transformer.h.{}.norm_1.weight",
+        "model.layers.{}.self_attn.q_proj.weight": None,
+        "model.layers.{}.self_attn.k_proj.weight": None,
+        "model.layers.{}.self_attn.v_proj.weight": None,
+        "model.layers.{}.self_attn.q_norm.weight": "transformer.h.{}.attn.q_norm.weight",
+        "model.layers.{}.self_attn.k_norm.weight": "transformer.h.{}.attn.k_norm.weight",
+        "model.layers.{}.self_attn.o_proj.weight": "transformer.h.{}.attn.proj.weight",
+        "model.layers.{}.post_attention_layernorm.weight": "transformer.h.{}.norm_2.weight",
+        "model.layers.{}.mlp.gate_proj.weight": "transformer.h.{}.mlp.fc_1.weight",
+        "model.layers.{}.mlp.up_proj.weight": "transformer.h.{}.mlp.fc_2.weight",
+        "model.layers.{}.mlp.down_proj.weight": "transformer.h.{}.mlp.proj.weight",
+        "model.norm.weight": "transformer.ln_f.weight",
+        "lm_head.weight": "lm_head.weight",
+    }
+
+    if progress_per_file is not None:
+        progress_per_file = progress_per_file / max(1, len(hf_weights) + len(qkv_weights))
+
+    for from_name, param in hf_weights.items():
+        name_template, *ids = layer_template(from_name, num_matches=2)
+        to_name = weight_map[name_template]
+        param = load_param(param, from_name, dtype, verbose=debug_mode)
+        if any(w in from_name for w in ("q_proj", "k_proj", "v_proj")):
+            qkv = qkv_weights.setdefault(ids[0], defaultdict(dict))
+            weight_name, weight_type = from_name.split(".")[-2:]
+            qkv[weight_type][weight_name] = param
+        if to_name is None:
+            continue
+        to_name = to_name.format(*ids)
+        if saver is not None:
+            param = saver.store_early(param)
+        state_dict[to_name] = param
+
+        if progress_per_file is not None:
+            pbar.update(progress_per_file)
+
+    if "lm_head.weight" not in state_dict:
+        state_dict["lm_head.weight"] = state_dict["transformer.wte.weight"]
+
+    for i in list(qkv_weights):
+        for weight_type in list(qkv_weights[i]):
+            qkv = qkv_weights[i][weight_type]
+            if len(qkv) != 3:
+                # qkv is split across different .bin files
+                continue
+            q = load_param(qkv["q_proj"], f"layer {i} q {weight_type}", dtype, verbose=debug_mode)
+            k = load_param(qkv["k_proj"], f"layer {i} k {weight_type}", dtype, verbose=debug_mode)
+            v = load_param(qkv["v_proj"], f"layer {i} v {weight_type}", dtype, verbose=debug_mode)
+            qkv = torch.cat((q, k, v))
+            state_dict[f"transformer.h.{i}.attn.qkv.{weight_type}"] = qkv
+            del qkv_weights[i][weight_type]
+
+    if progress_per_file is not None:
+        pbar.update(progress_per_file)
+
 def qkv_reassemble(
     param: Union[torch.Tensor, NotYetLoadedTensor], config: Config
@@ -624,6 +692,10 @@ def convert_hf_checkpoint(
         # holder to reconstitute the split q, k, v
         qkv_weights = {}
         copy_fn = partial(copy_weights_qwen_2_5, config, qkv_weights)
+    elif model_name.lower().startswith("qwen3"):
+        # holder to reconstitute the split q, k, v
+        qkv_weights = {}
+        copy_fn = partial(copy_weights_qwen_3, config, qkv_weights)
     elif config.mlp_class_name in ("LLaMAMLP", "GemmaMLP", "LLaMAMoE"):
         # holder to reconstitute the split q, k, v
         qkv_weights = {}
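A toy illustration (not part of the patch) of the fused layout this converter builds: with grouped-query attention, q has `n_head * head_size` rows while k and v have `n_query_groups * head_size` rows each, and the three matrices are stacked along dim 0 exactly as in the `torch.cat((q, k, v))` call above. Qwen3-8B shapes are assumed (head_size 128 follows from n_embd=4096 / n_head=32):

```python
import torch

n_embd, head_size, n_head, n_query_groups = 4096, 128, 32, 8

q = torch.randn(n_head * head_size, n_embd)          # (4096, 4096)
k = torch.randn(n_query_groups * head_size, n_embd)  # (1024, 4096)
v = torch.randn(n_query_groups * head_size, n_embd)  # (1024, 4096)

# same stacking as in the loop above
qkv = torch.cat((q, k, v))
assert qkv.shape == (6144, 4096)
```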
diff --git a/litgpt/scripts/convert_lit_checkpoint.py b/litgpt/scripts/convert_lit_checkpoint.py
index 01c4ca7785..0a89f279bb 100644
--- a/litgpt/scripts/convert_lit_checkpoint.py
+++ b/litgpt/scripts/convert_lit_checkpoint.py
@@ -393,6 +393,55 @@ def copy_weights_qwen_2_5(
         state_dict[to_name] = param
 
+def copy_weights_qwen_3(
+    config: Config,
+    state_dict: Dict[str, torch.Tensor],
+    lit_weights: Dict[str, Union[torch.Tensor, NotYetLoadedTensor]],
+    untie_weights: bool = False,
+    saver: Optional[incremental_save] = None,
+) -> None:
+    weight_map = {
+        "transformer.wte.weight": "model.embed_tokens.weight",
+        "transformer.h.{}.norm_1.weight": "model.layers.{}.input_layernorm.weight",
+        "transformer.h.{}.norm_2.weight": "model.layers.{}.post_attention_layernorm.weight",
+        "transformer.h.{}.attn.proj.weight": "model.layers.{}.self_attn.o_proj.weight",
+        "transformer.h.{}.attn.q_norm.weight": "model.layers.{}.self_attn.q_norm.weight",
+        "transformer.h.{}.attn.k_norm.weight": "model.layers.{}.self_attn.k_norm.weight",
+        "transformer.h.{}.mlp.fc_1.weight": "model.layers.{}.mlp.gate_proj.weight",
+        "transformer.h.{}.mlp.fc_2.weight": "model.layers.{}.mlp.up_proj.weight",
+        "transformer.h.{}.mlp.proj.weight": "model.layers.{}.mlp.down_proj.weight",
+        "transformer.ln_f.weight": "model.norm.weight",
+        "lm_head.weight": "lm_head.weight",
+    }
+
+    for from_name, param in lit_weights.items():
+        if from_name == "lm_head.weight" and untie_weights:
+            continue
+        name_template, *ids = layer_template(from_name, num_matches=2)
+        param = load_param(param, from_name, None)
+        if from_name.endswith(".attn.qkv.weight"):
+            weight_type = from_name.split(".")[-1]  # weight or bias
+            to_names = (
+                "model.layers.{}.self_attn.q_proj.{}".format(*ids, weight_type),
+                "model.layers.{}.self_attn.k_proj.{}".format(*ids, weight_type),
+                "model.layers.{}.self_attn.v_proj.{}".format(*ids, weight_type),
+            )
+            params = param.split(
+                (
+                    config.n_head * config.head_size,
+                    config.n_query_groups * config.head_size,
+                    config.n_query_groups * config.head_size,
+                )
+            )
+        else:
+            to_names = (weight_map[name_template].format(*ids),)
+            params = (param,)
+
+        for to_name, param in zip(to_names, params):
+            if saver is not None:
+                param = saver.store_early(param)
+            state_dict[to_name] = param
+
 def qkv_reassemble(param: Union[torch.Tensor, NotYetLoadedTensor], config: Config) -> torch.Tensor:
     """Reassemble from a normal to an interleaved placement in a QKV matrix.
     [Q, Q, ..., K, K, ..., V, V, ...] --> [Q, K, V, Q, K, V, ...]
@@ -437,6 +486,8 @@ def convert_lit_checkpoint(checkpoint_dir: Path, output_dir: Path) -> None:
         copy_fn = partial(copy_weights_phi, config)
     elif config.name.lower().startswith(("qwen2.5", "qwq")):
         copy_fn = partial(copy_weights_qwen_2_5, config)
+    elif config.name.lower().startswith("qwen3"):
+        copy_fn = partial(copy_weights_qwen_3, config)
     elif config.mlp_class_name in ("LLaMAMLP", "GemmaMLP", "LLaMAMoE"):
         untie_weights = "Gemma" in config.name
         copy_fn = partial(copy_weights_llama, config, untie_weights=untie_weights)
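The export path inverts that stacking with `param.split(...)`; a sketch under the same assumed Qwen3-8B shapes, using the size tuple the function above passes to `split()`:

```python
import torch

n_embd, head_size, n_head, n_query_groups = 4096, 128, 32, 8
qkv = torch.randn((n_head + 2 * n_query_groups) * head_size, n_embd)

# split along dim 0 back into the separate projections
q, k, v = qkv.split(
    (
        n_head * head_size,          # query rows
        n_query_groups * head_size,  # key rows
        n_query_groups * head_size,  # value rows
    )
)
assert q.shape == (4096, 4096) and k.shape == (1024, 4096) and v.shape == (1024, 4096)
```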
diff --git a/tests/test_model.py b/tests/test_model.py
index 39d946fb2d..fdd1212af5 100644
--- a/tests/test_model.py
+++ b/tests/test_model.py
@@ -30,7 +30,7 @@
 from transformers.models.mixtral import MixtralConfig, MixtralForCausalLM
 from transformers.models.olmo import OlmoConfig, OlmoForCausalLM
 from transformers.models.qwen2 import Qwen2Config, Qwen2ForCausalLM
-
+from transformers.models.qwen3 import Qwen3Config, Qwen3ForCausalLM
 import litgpt.config as config_module
 from litgpt import GPT, Config
 from litgpt.model import CausalSelfAttention, batched_index_copy_
@@ -42,6 +42,7 @@
     copy_weights_hf_llama,
     copy_weights_phi,
     copy_weights_qwen_2_5,
+    copy_weights_qwen_3,
 )
 from litgpt.scripts.convert_lit_checkpoint import qkv_reassemble as make_qkv_interleaved
 from litgpt.utils import _RunIf
@@ -1008,6 +1009,69 @@ def test_against_original_qwen_2_5(model_name, device, dtype):
     torch.testing.assert_close(ours_y, theirs_y)
 
 
+@torch.inference_mode()
+@pytest.mark.parametrize(
+    "model_name", ["Qwen3-8B"]
+)
+@pytest.mark.parametrize(
+    ("device", "dtype"),
+    [
+        (torch.device("cpu"), torch.float32),
+        pytest.param(
+            torch.device("cuda"),
+            torch.float16,
+            marks=[
+                # the reference does softmax upscaled to fp32 during attention. additionally, the final layernorm input
+                # is slightly different
+                pytest.mark.xfail(raises=AssertionError, strict=False),
+                _RunIf(min_cuda_gpus=1),
+            ],
+        ),
+    ],
+)
+def test_against_original_qwen_3(model_name, device, dtype):
+    torch.set_default_dtype(dtype)
+
+    T = 20
+    ours_config = Config.from_name(
+        model_name,
+        block_size=T,
+        n_layer=2,
+        n_head=16,
+        n_embd=32,
+        intermediate_size=86,
+    )
+    theirs_config = Qwen3Config(
+        vocab_size=ours_config.padded_vocab_size,
+        hidden_size=ours_config.n_embd,
+        head_dim=ours_config.head_size,
+        num_attention_heads=ours_config.n_head,
+        num_hidden_layers=ours_config.n_layer,
+        intermediate_size=ours_config.intermediate_size,
+        max_position_embeddings=ours_config.block_size,
+        rms_norm_eps=ours_config.norm_eps,
+        num_key_value_heads=ours_config.n_query_groups,
+        rope_theta=ours_config.rope_base,
+        tie_word_embeddings=False,
+    )
+
+    theirs_model = Qwen3ForCausalLM(theirs_config).to(device)
+    theirs_state_dict = theirs_model.state_dict()
+    # Gemma weights are shipped without `lm_head.weight`
+    theirs_state_dict.pop("lm_head.weight")
+    state_dict = {}
+    copy_weights_qwen_3(ours_config, {}, state_dict, theirs_state_dict)
+    ours_model = GPT(ours_config).to(device)
+    ours_model.load_state_dict(state_dict)
+
+    # test end to end
+    x = torch.randint(low=0, high=ours_config.padded_vocab_size, size=(T,), device=device).unsqueeze(0)
+    assert x.size(1) == T
+    ours_y = ours_model(x)
+    theirs_y = theirs_model(x)["logits"].to(dtype)  # HF converts logits to float
+    torch.testing.assert_close(ours_y, theirs_y)
+
+
 @torch.inference_mode()
 @pytest.mark.parametrize("model_name", ("salamandra-2b", "salamandra-7b"))
 @pytest.mark.parametrize(
diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py
index a94f46d710..09e1d0d2e0 100644
--- a/tests/test_tokenizer.py
+++ b/tests/test_tokenizer.py
@@ -47,7 +47,7 @@ def test_tokenizer_against_hf(config, tmp_path):
     else:
         assert ours.vocab_size == config.vocab_size
 
-    if config.name.startswith(("falcon", "stablecode", "Qwen2.5", "QwQ")):
+    if config.name.startswith(("falcon", "stablecode", "Qwen2.5", "QwQ", "Qwen3")):
         # even though their config defines it, it's set as None in HF
         assert isinstance(ours.bos_id, int)
         assert theirs.bos_token_id is None
diff --git a/tutorials/download_model_weights.md b/tutorials/download_model_weights.md
index 0a41110be4..e28e6a7a11 100644
--- a/tutorials/download_model_weights.md
+++ b/tutorials/download_model_weights.md
@@ -44,6 +44,7 @@ LitGPT supports a variety of LLM architectures with publicly available weights.
 | Qwen2.5 Math | 1.5B, 7B, 72B | Alibaba Group | [An, Yang et al. 2024](https://arxiv.org/abs/2409.12122) |
 | QwQ | 32B | Alibaba Group | [Qwen Team 2025](https://qwenlm.github.io/blog/qwq-32b/) |
 | QwQ-Preview | 32B | Alibaba Group | [Qwen Team 2024](https://qwenlm.github.io/blog/qwq-32b-preview/) |
+| Qwen3 | 8B | Alibaba Group | [Qwen Team 2025](https://arxiv.org/abs/2505.09388/) |
 | R1 Distll Llama | 8B, 70B | DeepSeek AI | [DeepSeek AI 2025](https://github.com/deepseek-ai/DeepSeek-R1/blob/main/DeepSeek_R1.pdf) |
 | RedPajama-INCITE | 3B, 7B | Together | [Together 2023](https://together.ai/blog/redpajama-models-v1) |
 | SmolLM2 | 135M, 360M, 1.7B | Hugging Face | [Hugging Face 2024](https://github.com/huggingface/smollm) |
From 2c82729bbc007bb6541a0e89d6f63ec5b5aec7ec Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 15 May 2025 19:49:54 +0000
Subject: [PATCH 2/6] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 litgpt/prompts.py                        | 18 +++++++++---------
 litgpt/scripts/convert_hf_checkpoint.py  |  1 +
 litgpt/scripts/convert_lit_checkpoint.py |  1 +
 tests/test_model.py                      |  5 ++---
 4 files changed, 13 insertions(+), 12 deletions(-)

diff --git a/litgpt/prompts.py b/litgpt/prompts.py
index de87c54b6a..369504f17f 100644
--- a/litgpt/prompts.py
+++ b/litgpt/prompts.py
@@ -351,17 +351,16 @@ def apply(self, prompt: str, **kwargs: str) -> str:
         return f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"
     return f"<|im_start|>system\n{self.system_message}<|im_end|>\n<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"
 
+
 class Qwen2_5(ChatML):
     def __init__(self):
-        super().__init__(
-            "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."
-        )
+        super().__init__("You are Qwen, created by Alibaba Cloud. You are a helpful assistant.")
+
 
 class Qwen2_5_Math(ChatML):
     def __init__(self):
-        super().__init__(
-            "Please reason step by step, and put your final answer within \\boxed{}."
-        )
+        super().__init__("Please reason step by step, and put your final answer within \\boxed{}.")
+
 
 class QwQ(ChatML):
     def __init__(self):
@@ -369,15 +368,16 @@ def __init__(self):
             "You are a helpful and harmless assistant. You are Qwen developed by Alibaba. You should think step-by-step."
         )
 
+
 class Qwen3(ChatML):
     def __init__(self):
         super().__init__()
 
+
 class SmolLM2(ChatML):
     def __init__(self):
-        super().__init__(
-            "You are a helpful AI assistant named SmolLM, trained by Hugging Face"
-        )
+        super().__init__("You are a helpful AI assistant named SmolLM, trained by Hugging Face")
+
 
 class Salamandra(ChatML):
     def __init__(self):
diff --git a/litgpt/scripts/convert_hf_checkpoint.py b/litgpt/scripts/convert_hf_checkpoint.py
index 1662ba5b10..f4010f34a4 100644
--- a/litgpt/scripts/convert_hf_checkpoint.py
+++ b/litgpt/scripts/convert_hf_checkpoint.py
@@ -532,6 +532,7 @@ def copy_weights_qwen_2_5(
         if progress_per_file is not None:
             pbar.update(progress_per_file)
 
+
 def copy_weights_qwen_3(
     config: Config,
     qkv_weights: Dict[int, List[Optional[NotYetLoadedTensor]]],
diff --git a/litgpt/scripts/convert_lit_checkpoint.py b/litgpt/scripts/convert_lit_checkpoint.py
index 0a89f279bb..1ae74d703c 100644
--- a/litgpt/scripts/convert_lit_checkpoint.py
+++ b/litgpt/scripts/convert_lit_checkpoint.py
@@ -442,6 +442,7 @@ def copy_weights_qwen_3(
             param = saver.store_early(param)
         state_dict[to_name] = param
 
+
 def qkv_reassemble(param: Union[torch.Tensor, NotYetLoadedTensor], config: Config) -> torch.Tensor:
     """Reassemble from a normal to an interleaved placement in a QKV matrix.
     [Q, Q, ..., K, K, ..., V, V, ...] --> [Q, K, V, Q, K, V, ...]
diff --git a/tests/test_model.py b/tests/test_model.py
index fdd1212af5..ce77db23c8 100644
--- a/tests/test_model.py
+++ b/tests/test_model.py
@@ -31,6 +31,7 @@
 from transformers.models.olmo import OlmoConfig, OlmoForCausalLM
 from transformers.models.qwen2 import Qwen2Config, Qwen2ForCausalLM
 from transformers.models.qwen3 import Qwen3Config, Qwen3ForCausalLM
+
 import litgpt.config as config_module
 from litgpt import GPT, Config
 from litgpt.model import CausalSelfAttention, batched_index_copy_
@@ -1010,9 +1011,7 @@ def test_against_original_qwen_2_5(model_name, device, dtype):
 
 
 @torch.inference_mode()
-@pytest.mark.parametrize(
-    "model_name", ["Qwen3-8B"]
-)
+@pytest.mark.parametrize("model_name", ["Qwen3-8B"])
 @pytest.mark.parametrize(
     ("device", "dtype"),
     [

From cd4f5a7a18b8795a434e32745973e6147f4348d7 Mon Sep 17 00:00:00 2001
From: "shijie.yu"
Date: Fri, 23 May 2025 17:33:50 +0000
Subject: [PATCH 3/6] vocab size and module fixes

---
 litgpt/config.py                         | 2 +-
 litgpt/scripts/convert_hf_checkpoint.py  | 4 ++--
 litgpt/scripts/convert_lit_checkpoint.py | 4 ++--
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/litgpt/config.py b/litgpt/config.py
index 147823a59a..0c46a1d845 100644
--- a/litgpt/config.py
+++ b/litgpt/config.py
@@ -2466,7 +2466,7 @@ def norm_class(self) -> Type:
         name="Qwen3-8B",
         hf_config=dict(org="Qwen", name="Qwen3-8B"),
         block_size=40960,
-        vocab_size=151936,
+        vocab_size=151643,
         padded_vocab_size=151936,
         n_layer=36,
         n_head=32,
diff --git a/litgpt/scripts/convert_hf_checkpoint.py b/litgpt/scripts/convert_hf_checkpoint.py
index f4010f34a4..47ebc41edf 100644
--- a/litgpt/scripts/convert_hf_checkpoint.py
+++ b/litgpt/scripts/convert_hf_checkpoint.py
@@ -550,8 +550,8 @@ def copy_weights_qwen_3(
         "model.layers.{}.self_attn.q_proj.weight": None,
         "model.layers.{}.self_attn.k_proj.weight": None,
         "model.layers.{}.self_attn.v_proj.weight": None,
-        "model.layers.{}.self_attn.q_norm.weight": "transformer.h.{}.attn.q_norm.weight",
-        "model.layers.{}.self_attn.k_norm.weight": "transformer.h.{}.attn.k_norm.weight",
+        "model.layers.{}.self_attn.q_norm.weight": "transformer.h.{}.attn.norm_q.weight",
+        "model.layers.{}.self_attn.k_norm.weight": "transformer.h.{}.attn.norm_k.weight",
"transformer.h.{}.attn.norm_k.weight", "model.layers.{}.self_attn.o_proj.weight": "transformer.h.{}.attn.proj.weight", "model.layers.{}.post_attention_layernorm.weight": "transformer.h.{}.norm_2.weight", "model.layers.{}.mlp.gate_proj.weight": "transformer.h.{}.mlp.fc_1.weight", diff --git a/litgpt/scripts/convert_lit_checkpoint.py b/litgpt/scripts/convert_lit_checkpoint.py index 1ae74d703c..232070b1fa 100644 --- a/litgpt/scripts/convert_lit_checkpoint.py +++ b/litgpt/scripts/convert_lit_checkpoint.py @@ -405,8 +405,8 @@ def copy_weights_qwen_3( "transformer.h.{}.norm_1.weight": "model.layers.{}.input_layernorm.weight", "transformer.h.{}.norm_2.weight": "model.layers.{}.post_attention_layernorm.weight", "transformer.h.{}.attn.proj.weight": "model.layers.{}.self_attn.o_proj.weight", - "transformer.h.{}.attn.q_norm.weight": "model.layers.{}.self_attn.q_norm.weight", - "transformer.h.{}.attn.k_norm.weight": "model.layers.{}.self_attn.k_norm.weight", + "transformer.h.{}.attn.norm_q.weight": "model.layers.{}.self_attn.q_norm.weight", + "transformer.h.{}.attn.norm_k.weight": "model.layers.{}.self_attn.k_norm.weight", "transformer.h.{}.mlp.fc_1.weight": "model.layers.{}.mlp.gate_proj.weight", "transformer.h.{}.mlp.fc_2.weight": "model.layers.{}.mlp.up_proj.weight", "transformer.h.{}.mlp.proj.weight": "model.layers.{}.mlp.down_proj.weight", From 0365936e7c92969dee713c37e9bf09053f507e3f Mon Sep 17 00:00:00 2001 From: "shijie.yu" Date: Fri, 23 May 2025 19:23:08 +0000 Subject: [PATCH 4/6] modified test for correctness --- tests/test_model.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/test_model.py b/tests/test_model.py index ce77db23c8..83b8212166 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -1056,8 +1056,6 @@ def test_against_original_qwen_3(model_name, device, dtype): theirs_model = Qwen3ForCausalLM(theirs_config).to(device) theirs_state_dict = theirs_model.state_dict() - # Gemma weights are shipped without `lm_head.weight` - theirs_state_dict.pop("lm_head.weight") state_dict = {} copy_weights_qwen_3(ours_config, {}, state_dict, theirs_state_dict) ours_model = GPT(ours_config).to(device) From aac04b81e023c58b8511983ff25032c5634bba8a Mon Sep 17 00:00:00 2001 From: "shijie.yu" Date: Sat, 24 May 2025 02:54:46 +0000 Subject: [PATCH 5/6] added all qwen3 dense models --- README.md | 2 +- litgpt/config.py | 119 +++++++++++++++++++++++++++- tests/test_model.py | 2 +- tutorials/download_model_weights.md | 2 +- 4 files changed, 119 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index bed5e29c69..3c69cea729 100644 --- a/README.md +++ b/README.md @@ -146,7 +146,7 @@ Every model is written from scratch to maximize performance and remove layers of | Qwen2.5 Math | 1.5B, 7B, 72B | Alibaba Group | [An, Yang et al. 
 | QwQ | 32B | Alibaba Group | [Qwen Team 2025](https://qwenlm.github.io/blog/qwq-32b/) |
 | QwQ-Preview | 32B | Alibaba Group | [Qwen Team 2024](https://qwenlm.github.io/blog/qwq-32b-preview/) |
-| Qwen3 | 8B | Alibaba Group | [Qwen Team 2025](https://arxiv.org/abs/2505.09388/) |
+| Qwen3 | 0.6B, 1.7B, 4B, 8B, 14B, 32B | Alibaba Group | [Qwen Team 2025](https://arxiv.org/abs/2505.09388/) |
 | R1 Distill Llama | 8B, 70B | DeepSeek AI | [DeepSeek AI 2025](https://github.com/deepseek-ai/DeepSeek-R1/blob/main/DeepSeek_R1.pdf) |
 | SmolLM2 | 135M, 360M, 1.7B | Hugging Face | [Hugging Face 2024](https://github.com/huggingface/smollm) |
 | Salamandra | 2B, 7B | Barcelona Supercomputing Centre | [BSC-LTC 2024](https://github.com/BSC-LTC/salamandra) |
diff --git a/litgpt/config.py b/litgpt/config.py
index 0c46a1d845..40eb03969e 100644
--- a/litgpt/config.py
+++ b/litgpt/config.py
@@ -2461,10 +2461,75 @@ def norm_class(self) -> Type:
 configs.extend(qwq)
 
 qwen_3 = [
+    # https://huggingface.co/Qwen/Qwen3-0.6B/blob/main/config.json
+    dict(
+        name="Qwen3-0.6B{}",
+        hf_config=dict(org="Qwen", name="Qwen3-0.6B{}"),
+        block_size=40960,
+        vocab_size=151643,
+        padded_vocab_size=151936,
+        n_layer=28,
+        n_head=16,
+        n_embd=1024,
+        n_query_groups=8,
+        rotary_percentage=1.0,
+        parallel_residual=False,
+        bias=False,
+        norm_class_name="RMSNorm",
+        mlp_class_name="LLaMAMLP",
+        intermediate_size=3072,
+        norm_eps=1e-6,
+        rope_base=1000000,
+        head_size=128,
+        norm_qk=True,
+    ),
+    # https://huggingface.co/Qwen/Qwen3-1.7B/blob/main/config.json
+    dict(
+        name="Qwen3-1.7B{}",
+        hf_config=dict(org="Qwen", name="Qwen3-1.7B{}"),
+        block_size=40960,
+        vocab_size=151643,
+        padded_vocab_size=151936,
+        n_layer=28,
+        n_head=16,
+        n_embd=2048,
+        n_query_groups=8,
+        rotary_percentage=1.0,
+        parallel_residual=False,
+        bias=False,
+        norm_class_name="RMSNorm",
+        mlp_class_name="LLaMAMLP",
+        intermediate_size=6144,
+        norm_eps=1e-6,
+        rope_base=1000000,
+        norm_qk=True,
+    ),
+    # https://huggingface.co/Qwen/Qwen3-4B/blob/main/config.json
+    dict(
+        name="Qwen3-4B{}",
+        hf_config=dict(org="Qwen", name="Qwen3-4B{}"),
+        block_size=40960,
+        vocab_size=151643,
+        padded_vocab_size=151936,
+        n_layer=36,
+        n_head=32,
+        n_embd=2560,
+        n_query_groups=8,
+        rotary_percentage=1.0,
+        parallel_residual=False,
+        bias=False,
+        norm_class_name="RMSNorm",
+        mlp_class_name="LLaMAMLP",
+        intermediate_size=9728,
+        norm_eps=1e-6,
+        rope_base=1000000,
+        head_size=128,
+        norm_qk=True,
+    ),
     # https://huggingface.co/Qwen/Qwen3-8B/blob/main/config.json
     dict(
-        name="Qwen3-8B",
-        hf_config=dict(org="Qwen", name="Qwen3-8B"),
+        name="Qwen3-8B{}",
+        hf_config=dict(org="Qwen", name="Qwen3-8B{}"),
         block_size=40960,
         vocab_size=151643,
         padded_vocab_size=151936,
@@ -2482,9 +2547,57 @@ def norm_class(self) -> Type:
         rope_base=1000000,
         norm_qk=True,
     ),
+    # https://huggingface.co/Qwen/Qwen3-14B/blob/main/config.json
+    dict(
+        name="Qwen3-14B{}",
+        hf_config=dict(org="Qwen", name="Qwen3-14B{}"),
+        block_size=40960,
+        vocab_size=151643,
+        padded_vocab_size=151936,
+        n_layer=40,
+        n_head=40,
+        n_embd=5120,
+        n_query_groups=8,
+        rotary_percentage=1.0,
+        parallel_residual=False,
+        bias=False,
+        norm_class_name="RMSNorm",
+        mlp_class_name="LLaMAMLP",
+        intermediate_size=17408,
+        norm_eps=1e-6,
+        rope_base=1000000,
+        norm_qk=True,
+    ),
+    # https://huggingface.co/Qwen/Qwen3-32B/blob/main/config.json
+    dict(
+        name="Qwen3-32B{}",
+        hf_config=dict(org="Qwen", name="Qwen3-32B{}"),
+        block_size=40960,
+        vocab_size=151643,
+        padded_vocab_size=151936,
+        n_layer=64,
+        n_head=64,
+        n_embd=5120,
+        n_query_groups=8,
+        rotary_percentage=1.0,
+        parallel_residual=False,
+        bias=False,
+        norm_class_name="RMSNorm",
+        mlp_class_name="LLaMAMLP",
+        intermediate_size=25600,
+        norm_eps=1e-6,
+        rope_base=1000000,
+        head_size=128,
+        norm_qk=True,
+    ),
 ]
+for c in qwen_3:
+    for kind in ("", "-Base"):
+        copy = deepcopy(c)
+        copy["name"] = c["name"].format(kind)
+        copy["hf_config"]["name"] = c["hf_config"]["name"].format(kind)
+        configs.append(copy)
-
-configs.extend(qwen_3)
 
 #############
 # Salamandra
diff --git a/tests/test_model.py b/tests/test_model.py
index 83b8212166..bb6d48fff9 100644
--- a/tests/test_model.py
+++ b/tests/test_model.py
@@ -1011,7 +1011,7 @@ def test_against_original_qwen_2_5(model_name, device, dtype):
 
 
 @torch.inference_mode()
-@pytest.mark.parametrize("model_name", ["Qwen3-8B"])
+@pytest.mark.parametrize("model_name", ["Qwen3-0.6B", "Qwen3-8B", "Qwen3-4B-Base", "Qwen3-32B-Base"])
 @pytest.mark.parametrize(
     ("device", "dtype"),
     [
diff --git a/tutorials/download_model_weights.md b/tutorials/download_model_weights.md
index e28e6a7a11..0dfaee49bf 100644
--- a/tutorials/download_model_weights.md
+++ b/tutorials/download_model_weights.md
@@ -44,7 +44,7 @@ LitGPT supports a variety of LLM architectures with publicly available weights.
 | Qwen2.5 Math | 1.5B, 7B, 72B | Alibaba Group | [An, Yang et al. 2024](https://arxiv.org/abs/2409.12122) |
 | QwQ | 32B | Alibaba Group | [Qwen Team 2025](https://qwenlm.github.io/blog/qwq-32b/) |
 | QwQ-Preview | 32B | Alibaba Group | [Qwen Team 2024](https://qwenlm.github.io/blog/qwq-32b-preview/) |
-| Qwen3 | 8B | Alibaba Group | [Qwen Team 2025](https://arxiv.org/abs/2505.09388/) |
+| Qwen3 | 0.6B, 1.7B, 4B, 8B, 14B, 32B | Alibaba Group | [Qwen Team 2025](https://arxiv.org/abs/2505.09388/) |
 | R1 Distll Llama | 8B, 70B | DeepSeek AI | [DeepSeek AI 2025](https://github.com/deepseek-ai/DeepSeek-R1/blob/main/DeepSeek_R1.pdf) |
 | RedPajama-INCITE | 3B, 7B | Together | [Together 2023](https://together.ai/blog/redpajama-models-v1) |
 | SmolLM2 | 135M, 360M, 1.7B | Hugging Face | [Hugging Face 2024](https://github.com/huggingface/smollm) |

From 0a42be3282ade0b79d8860aa3dfdc3668f057bb8 Mon Sep 17 00:00:00 2001
From: "shijie.yu"
Date: Sat, 24 May 2025 03:21:22 +0000
Subject: [PATCH 6/6] fixes

---
 litgpt/config.py    | 19 +++++++++++--------
 tests/test_model.py |  2 +-
 2 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/litgpt/config.py b/litgpt/config.py
index 40eb03969e..cae2f5f388 100644
--- a/litgpt/config.py
+++ b/litgpt/config.py
@@ -2568,10 +2568,18 @@ def norm_class(self) -> Type:
         rope_base=1000000,
         norm_qk=True,
     ),
+]
+for c in qwen_3:
+    for kind in ("", "-Base"):
+        copy = deepcopy(c)
+        copy["name"] = c["name"].format(kind)
+        copy["hf_config"]["name"] = c["hf_config"]["name"].format(kind)
+        configs.append(copy)
+qwen_3_32b = [
     # https://huggingface.co/Qwen/Qwen3-32B/blob/main/config.json
     dict(
-        name="Qwen3-32B{}",
-        hf_config=dict(org="Qwen", name="Qwen3-32B{}"),
+        name="Qwen3-32B",
+        hf_config=dict(org="Qwen", name="Qwen3-32B"),
         block_size=40960,
         vocab_size=151643,
         padded_vocab_size=151936,
@@ -2591,12 +2599,7 @@ def norm_class(self) -> Type:
         norm_qk=True,
     ),
 ]
-for c in qwen_3:
-    for kind in ("", "-Base"):
-        copy = deepcopy(c)
-        copy["name"] = c["name"].format(kind)
-        copy["hf_config"]["name"] = c["hf_config"]["name"].format(kind)
-        configs.append(copy)
+configs.extend(qwen_3_32b)
 
 
 #############
 # Salamandra
diff --git a/tests/test_model.py b/tests/test_model.py
index bb6d48fff9..c48dbd9e83 100644
--- a/tests/test_model.py
+++ b/tests/test_model.py
@@ -1011,7 +1011,7 @@ def test_against_original_qwen_2_5(model_name, device, dtype):
 
 
 @torch.inference_mode()
-@pytest.mark.parametrize("model_name", ["Qwen3-0.6B", "Qwen3-8B", "Qwen3-4B-Base", "Qwen3-32B-Base"])
+@pytest.mark.parametrize("model_name", ["Qwen3-0.6B", "Qwen3-8B", "Qwen3-4B-Base", "Qwen3-14B-Base", "Qwen3-32B"])
 @pytest.mark.parametrize(
     ("device", "dtype"),
     [
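As a closing note on PATCH 5 and 6: each templated entry in `qwen_3` expands into an instruct and a `-Base` config, while PATCH 6 pulls `Qwen3-32B` out of the loop and registers it directly, so no `Qwen3-32B-Base` config is generated (matching the updated test parametrization above). A minimal sketch of the expansion, mirroring the loop in `litgpt/config.py` with just the name fields:

```python
from copy import deepcopy

# only the templated fields are shown; the real entries carry the full config
template = {"name": "Qwen3-8B{}", "hf_config": {"org": "Qwen", "name": "Qwen3-8B{}"}}

configs = []
for kind in ("", "-Base"):
    copy = deepcopy(template)
    copy["name"] = template["name"].format(kind)
    copy["hf_config"]["name"] = template["hf_config"]["name"].format(kind)
    configs.append(copy)

assert [c["name"] for c in configs] == ["Qwen3-8B", "Qwen3-8B-Base"]
```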