README.md: 4 changes (2 additions, 2 deletions)
@@ -150,8 +150,8 @@ Every model is written from scratch to maximize performance and remove layers of
 | Qwen2.5 Math | 1.5B, 7B, 72B | Alibaba Group | [An, Yang et al. 2024](https://arxiv.org/abs/2409.12122) |
 | QwQ | 32B | Alibaba Group | [Qwen Team 2025](https://qwenlm.github.io/blog/qwq-32b/) |
 | QwQ-Preview | 32B | Alibaba Group | [Qwen Team 2024](https://qwenlm.github.io/blog/qwq-32b-preview/) |
-| Qwen3 | 0.6B, 1.7B, 4B{Hybrid, Thinking-2507}, 8B, 14B, 32B | Alibaba Group | [Qwen Team 2025](https://arxiv.org/abs/2505.09388/) |
-| Qwen3 MoE | 30B{Hybrid, Thinking-2507}, 235B{Hybrid, Thinking-2507} | Alibaba Group | [Qwen Team 2025](https://arxiv.org/abs/2505.09388/) |
+| Qwen3 | 0.6B, 1.7B, 4B{Hybrid, Thinking-2507, Instruct-2507}, 8B, 14B, 32B | Alibaba Group | [Qwen Team 2025](https://arxiv.org/abs/2505.09388/) |
+| Qwen3 MoE | 30B{Hybrid, Thinking-2507, Instruct-2507}, 235B{Hybrid, Thinking-2507, Instruct-2507} | Alibaba Group | [Qwen Team 2025](https://arxiv.org/abs/2505.09388/) |
 | R1 Distill Llama | 8B, 70B | DeepSeek AI | [DeepSeek AI 2025](https://github.com/deepseek-ai/DeepSeek-R1/blob/main/DeepSeek_R1.pdf) |
 | SmolLM2 | 135M, 360M, 1.7B | Hugging Face | [Hugging Face 2024](https://github.com/huggingface/smollm) |
 | Salamandra | 2B, 7B | Barcelona Supercomputing Centre | [BSC-LTC 2024](https://github.com/BSC-LTC/salamandra) |
litgpt/config.py: 22 changes (14 additions, 8 deletions)
@@ -2859,11 +2859,11 @@ def norm_class(self) -> Type:
 ]
 configs.extend(qwen_3_moe)
 
-qwen_3_2507_thinking = [
+qwen_3_2507_thinking_instruct = [
     # https://huggingface.co/Qwen/Qwen3-235B-A22B-Thinking-2507/blob/main/config.json
     dict(
-        name="Qwen3-235B-A22B-Thinking-2507",
-        hf_config=dict(org="Qwen", name="Qwen3-235B-A22B-Thinking-2507"),
+        name="Qwen3-235B-A22B-{}-2507",
+        hf_config=dict(org="Qwen", name="Qwen3-235B-A22B-{}-2507"),
         block_size=262144,
         head_size=128,
         vocab_size=151643,
@@ -2887,8 +2887,8 @@ def norm_class(self) -> Type:
     ),
     # https://huggingface.co/Qwen/Qwen3-30B-A3B-Thinking-2507/blob/main/config.json
     dict(
-        name="Qwen3-30B-A3B-Thinking-2507",
-        hf_config=dict(org="Qwen", name="Qwen3-30B-A3B-Thinking-2507"),
+        name="Qwen3-30B-A3B-{}-2507",
+        hf_config=dict(org="Qwen", name="Qwen3-30B-A3B-{}-2507"),
         block_size=262144,
         head_size=128,
         vocab_size=151643,
@@ -2912,8 +2912,8 @@ def norm_class(self) -> Type:
     ),
     # https://huggingface.co/Qwen/Qwen3-4B-Thinking-2507/blob/main/config.json
     dict(
-        name="Qwen3-4B-Thinking-2507",
-        hf_config=dict(org="Qwen", name="Qwen3-4B-Thinking-2507"),
+        name="Qwen3-4B-{}-2507",
+        hf_config=dict(org="Qwen", name="Qwen3-4B-{}-2507"),
         block_size=262144,
         vocab_size=151643,
         padded_vocab_size=151936,
@@ -2933,7 +2933,13 @@ def norm_class(self) -> Type:
         norm_qk=True,
     ),
 ]
-configs.extend(qwen_3_2507_thinking)
+
+for c in qwen_3_2507_thinking_instruct:
+    for kind in ("Thinking", "Instruct"):
+        copy = deepcopy(c)
+        copy["name"] = c["name"].format(kind)
+        copy["hf_config"]["name"] = c["hf_config"]["name"].format(kind)
+        configs.append(copy)
 
 #############
 # Salamandra
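The loop above replaces three near-duplicate Thinking-2507 entries with {}-templated base configs that are expanded once per release kind. A minimal standalone sketch of the same pattern, using a hypothetical trimmed-down config (the real dicts in litgpt/config.py carry many more fields such as block_size and vocab_size):

    from copy import deepcopy

    # Hypothetical, trimmed-down base config; for illustration only.
    base = dict(
        name="Qwen3-4B-{}-2507",
        hf_config=dict(org="Qwen", name="Qwen3-4B-{}-2507"),
    )

    configs = []
    for kind in ("Thinking", "Instruct"):
        expanded = deepcopy(base)  # deep copy: hf_config is nested, so a shallow copy would share it
        expanded["name"] = base["name"].format(kind)
        expanded["hf_config"]["name"] = base["hf_config"]["name"].format(kind)
        configs.append(expanded)

    print([c["name"] for c in configs])
    # ['Qwen3-4B-Thinking-2507', 'Qwen3-4B-Instruct-2507']

The deepcopy is load-bearing: with a shallow copy, both variants would share one nested hf_config dict, and the first .format() call would overwrite the template before the second kind is expanded.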
tests/test_model.py: 15 changes (13 additions, 2 deletions)
@@ -1083,7 +1083,16 @@ def test_against_original_qwen_2_5(model_name, device, dtype):
 
 @torch.inference_mode()
 @pytest.mark.parametrize(
-    "model_name", ["Qwen3-0.6B", "Qwen3-8B", "Qwen3-4B-Base", "Qwen3-14B-Base", "Qwen3-32B", "Qwen3-4B-Thinking-2507"]
+    "model_name",
+    [
+        "Qwen3-0.6B",
+        "Qwen3-8B",
+        "Qwen3-4B-Base",
+        "Qwen3-14B-Base",
+        "Qwen3-32B",
+        "Qwen3-4B-Thinking-2507",
+        "Qwen3-4B-Instruct-2507",
+    ],
 )
 @pytest.mark.parametrize(
     ("device", "dtype"),
@@ -1143,7 +1152,9 @@ def test_against_original_qwen_3(model_name, device, dtype):


 @torch.inference_mode()
-@pytest.mark.parametrize("model_name", ["Qwen3-30B-A3B", "Qwen3-235B-A22B", "Qwen3-235B-A22B-Thinking-2507"])
+@pytest.mark.parametrize(
+    "model_name", ["Qwen3-30B-A3B", "Qwen3-235B-A22B", "Qwen3-235B-A22B-Thinking-2507", "Qwen3-235B-A22B-Instruct-2507"]
+)
 @pytest.mark.parametrize(
     ("device", "dtype"),
     [
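The widened parametrizations fold the Instruct-2507 checkpoints into the existing Qwen3 equivalence tests. To run only the new cases locally, a substring filter along these lines should work, since pytest's -k option matches against the parametrized test IDs:

    pytest tests/test_model.py -k Instruct

These tests compare LitGPT's outputs against the original Hugging Face implementation, so they assume the transformers package is available.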
tutorials/download_model_weights.md: 24 changes (22 additions, 2 deletions)
@@ -48,8 +48,8 @@ LitGPT supports a variety of LLM architectures with publicly available weights.
 | Qwen2.5 Math | 1.5B, 7B, 72B | Alibaba Group | [An, Yang et al. 2024](https://arxiv.org/abs/2409.12122) |
 | QwQ | 32B | Alibaba Group | [Qwen Team 2025](https://qwenlm.github.io/blog/qwq-32b/) |
 | QwQ-Preview | 32B | Alibaba Group | [Qwen Team 2024](https://qwenlm.github.io/blog/qwq-32b-preview/) |
-| Qwen3 | 0.6B, 1.7B, 4B{Hybrid, Thinking-2507}, 8B, 14B, 32B | Alibaba Group | [Qwen Team 2025](https://arxiv.org/abs/2505.09388/) |
-| Qwen3 MoE | 30B{Hybrid, Thinking-2507}, 235B{Hybrid, Thinking-2507} | Alibaba Group | [Qwen Team 2025](https://arxiv.org/abs/2505.09388/) |
+| Qwen3 | 0.6B, 1.7B, 4B{Hybrid, Thinking-2507, Instruct-2507}, 8B, 14B, 32B | Alibaba Group | [Qwen Team 2025](https://arxiv.org/abs/2505.09388/) |
+| Qwen3 MoE | 30B{Hybrid, Thinking-2507, Instruct-2507}, 235B{Hybrid, Thinking-2507, Instruct-2507} | Alibaba Group | [Qwen Team 2025](https://arxiv.org/abs/2505.09388/) |
 | R1 Distill Llama | 8B, 70B | DeepSeek AI | [DeepSeek AI 2025](https://github.com/deepseek-ai/DeepSeek-R1/blob/main/DeepSeek_R1.pdf) |
 | RedPajama-INCITE | 3B, 7B | Together | [Together 2023](https://together.ai/blog/redpajama-models-v1) |
 | SmolLM2 | 135M, 360M, 1.7B | Hugging Face | [Hugging Face 2024](https://github.com/huggingface/smollm) |
@@ -237,6 +237,26 @@ Qwen/Qwen2.5-Math-7B
 Qwen/Qwen2.5-Math-7B-Instruct
 Qwen/Qwen2.5-Math-72B
 Qwen/Qwen2.5-Math-72B-Instruct
+Qwen/Qwen3-0.6B
+Qwen/Qwen3-0.6B-Base
+Qwen/Qwen3-1.7B
+Qwen/Qwen3-1.7B-Base
+Qwen/Qwen3-4B
+Qwen/Qwen3-4B-Base
+Qwen/Qwen3-8B
+Qwen/Qwen3-8B-Base
+Qwen/Qwen3-14B
+Qwen/Qwen3-14B-Base
+Qwen/Qwen3-32B
+Qwen/Qwen3-30B-A3B
+Qwen/Qwen3-30B-A3B-Base
+Qwen/Qwen3-235B-A22B
+Qwen/Qwen3-4B-Thinking-2507
+Qwen/Qwen3-4B-Instruct-2507
+Qwen/Qwen3-30B-A3B-Thinking-2507
+Qwen/Qwen3-30B-A3B-Instruct-2507
+Qwen/Qwen3-235B-A22B-Thinking-2507
+Qwen/Qwen3-235B-A22B-Instruct-2507
 Qwen/QwQ-32B
 Qwen/QwQ-32B-Preview
 stabilityai/FreeWilly2
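Any checkpoint in the list can then be fetched by passing its repo ID to the download command used throughout this tutorial, for example:

    litgpt download Qwen/Qwen3-4B-Instruct-2507

Gated or rate-limited repositories may additionally require a Hugging Face access token (via --access_token or the HF_TOKEN environment variable).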