diff --git a/README.md b/README.md index 2e8b6f281a..527c28bf30 100644 --- a/README.md +++ b/README.md @@ -150,8 +150,8 @@ Every model is written from scratch to maximize performance and remove layers of | Qwen2.5 Math | 1.5B, 7B, 72B | Alibaba Group | [An, Yang et al. 2024](https://arxiv.org/abs/2409.12122) | | QwQ | 32B | Alibaba Group | [Qwen Team 2025](https://qwenlm.github.io/blog/qwq-32b/) | | QwQ-Preview | 32B | Alibaba Group | [Qwen Team 2024](https://qwenlm.github.io/blog/qwq-32b-preview/) | -| Qwen3 | 0.6B, 1.7B, 4B{Hybrid, Thinking-2507}, 8B, 14B, 32B | Alibaba Group | [Qwen Team 2025](https://arxiv.org/abs/2505.09388/) | -| Qwen3 MoE | 30B{Hybrid, Thinking-2507}, 235B{Hybrid, Thinking-2507} | Alibaba Group | [Qwen Team 2025](https://arxiv.org/abs/2505.09388/) | +| Qwen3 | 0.6B, 1.7B, 4B{Hybrid, Thinking-2507, Instruct-2507}, 8B, 14B, 32B | Alibaba Group | [Qwen Team 2025](https://arxiv.org/abs/2505.09388/) | +| Qwen3 MoE | 30B{Hybrid, Thinking-2507, Instruct-2507}, 235B{Hybrid, Thinking-2507, Instruct-2507} | Alibaba Group | [Qwen Team 2025](https://arxiv.org/abs/2505.09388/) | | R1 Distill Llama | 8B, 70B | DeepSeek AI | [DeepSeek AI 2025](https://github.com/deepseek-ai/DeepSeek-R1/blob/main/DeepSeek_R1.pdf) | | SmolLM2 | 135M, 360M, 1.7B | Hugging Face | [Hugging Face 2024](https://github.com/huggingface/smollm) | | Salamandra | 2B, 7B | Barcelona Supercomputing Centre | [BSC-LTC 2024](https://github.com/BSC-LTC/salamandra) | diff --git a/litgpt/config.py b/litgpt/config.py index 062653f940..63ee4c9ee1 100644 --- a/litgpt/config.py +++ b/litgpt/config.py @@ -2859,11 +2859,11 @@ def norm_class(self) -> Type: ] configs.extend(qwen_3_moe) -qwen_3_2507_thinking = [ +qwen_3_2507_thinking_instruct = [ # https://huggingface.co/Qwen/Qwen3-235B-A22B-Thinking-2507/blob/main/config.json dict( - name="Qwen3-235B-A22B-Thinking-2507", - hf_config=dict(org="Qwen", name="Qwen3-235B-A22B-Thinking-2507"), + name="Qwen3-235B-A22B-{}-2507", + hf_config=dict(org="Qwen", name="Qwen3-235B-A22B-{}-2507"), block_size=262144, head_size=128, vocab_size=151643, @@ -2887,8 +2887,8 @@ def norm_class(self) -> Type: ), # https://huggingface.co/Qwen/Qwen3-30B-A3B-Thinking-2507/blob/main/config.json dict( - name="Qwen3-30B-A3B-Thinking-2507", - hf_config=dict(org="Qwen", name="Qwen3-30B-A3B-Thinking-2507"), + name="Qwen3-30B-A3B-{}-2507", + hf_config=dict(org="Qwen", name="Qwen3-30B-A3B-{}-2507"), block_size=262144, head_size=128, vocab_size=151643, @@ -2912,8 +2912,8 @@ def norm_class(self) -> Type: ), # https://huggingface.co/Qwen/Qwen3-4B-Thinking-2507/blob/main/config.json dict( - name="Qwen3-4B-Thinking-2507", - hf_config=dict(org="Qwen", name="Qwen3-4B-Thinking-2507"), + name="Qwen3-4B-{}-2507", + hf_config=dict(org="Qwen", name="Qwen3-4B-{}-2507"), block_size=262144, vocab_size=151643, padded_vocab_size=151936, @@ -2933,7 +2933,13 @@ def norm_class(self) -> Type: norm_qk=True, ), ] -configs.extend(qwen_3_2507_thinking) + +for c in qwen_3_2507_thinking_instruct: + for kind in ("Thinking", "Instruct"): + copy = deepcopy(c) + copy["name"] = c["name"].format(kind) + copy["hf_config"]["name"] = c["hf_config"]["name"].format(kind) + configs.append(copy) ############# # Salamandra diff --git a/tests/test_model.py b/tests/test_model.py index b6cf94a0ef..a3677d21af 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -1083,7 +1083,16 @@ def test_against_original_qwen_2_5(model_name, device, dtype): @torch.inference_mode() @pytest.mark.parametrize( - "model_name", ["Qwen3-0.6B", "Qwen3-8B", "Qwen3-4B-Base", "Qwen3-14B-Base", "Qwen3-32B", "Qwen3-4B-Thinking-2507"] + "model_name", + [ + "Qwen3-0.6B", + "Qwen3-8B", + "Qwen3-4B-Base", + "Qwen3-14B-Base", + "Qwen3-32B", + "Qwen3-4B-Thinking-2507", + "Qwen3-4B-Instruct-2507", + ], ) @pytest.mark.parametrize( ("device", "dtype"), @@ -1143,7 +1152,9 @@ def test_against_original_qwen_3(model_name, device, dtype): @torch.inference_mode() -@pytest.mark.parametrize("model_name", ["Qwen3-30B-A3B", "Qwen3-235B-A22B", "Qwen3-235B-A22B-Thinking-2507"]) +@pytest.mark.parametrize( + "model_name", ["Qwen3-30B-A3B", "Qwen3-235B-A22B", "Qwen3-235B-A22B-Thinking-2507", "Qwen3-235B-A22B-Instruct-2507"] +) @pytest.mark.parametrize( ("device", "dtype"), [ diff --git a/tutorials/download_model_weights.md b/tutorials/download_model_weights.md index 25b34acdd2..fcd3111ea6 100644 --- a/tutorials/download_model_weights.md +++ b/tutorials/download_model_weights.md @@ -48,8 +48,8 @@ LitGPT supports a variety of LLM architectures with publicly available weights. | Qwen2.5 Math | 1.5B, 7B, 72B | Alibaba Group | [An, Yang et al. 2024](https://arxiv.org/abs/2409.12122) | | QwQ | 32B | Alibaba Group | [Qwen Team 2025](https://qwenlm.github.io/blog/qwq-32b/) | | QwQ-Preview | 32B | Alibaba Group | [Qwen Team 2024](https://qwenlm.github.io/blog/qwq-32b-preview/) | -| Qwen3 | 0.6B, 1.7B, 4B{Hybrid, Thinking-2507}, 8B, 14B, 32B | Alibaba Group | [Qwen Team 2025](https://arxiv.org/abs/2505.09388/) | -| Qwen3 MoE | 30B{Hybrid, Thinking-2507}, 235B{Hybrid, Thinking-2507} | Alibaba Group | [Qwen Team 2025](https://arxiv.org/abs/2505.09388/) | +| Qwen3 | 0.6B, 1.7B, 4B{Hybrid, Thinking-2507, Instruct-2507}, 8B, 14B, 32B | Alibaba Group | [Qwen Team 2025](https://arxiv.org/abs/2505.09388/) | +| Qwen3 MoE | 30B{Hybrid, Thinking-2507, Instruct-2507}, 235B{Hybrid, Thinking-2507, Instruct-2507} | Alibaba Group | [Qwen Team 2025](https://arxiv.org/abs/2505.09388/) | | R1 Distll Llama | 8B, 70B | DeepSeek AI | [DeepSeek AI 2025](https://github.com/deepseek-ai/DeepSeek-R1/blob/main/DeepSeek_R1.pdf) | | RedPajama-INCITE | 3B, 7B | Together | [Together 2023](https://together.ai/blog/redpajama-models-v1) | | SmolLM2 | 135M, 360M, 1.7B | Hugging Face | [Hugging Face 2024](https://github.com/huggingface/smollm) | @@ -237,6 +237,26 @@ Qwen/Qwen2.5-Math-7B Qwen/Qwen2.5-Math-7B-Instruct Qwen/Qwen2.5-Math-72B Qwen/Qwen2.5-Math-72B-Instruct +Qwen/Qwen3-0.6B +Qwen/Qwen3-0.6B-Base +Qwen/Qwen3-1.7B +Qwen/Qwen3-1.7B-Base +Qwen/Qwen3-4B +Qwen/Qwen3-4B-Base +Qwen/Qwen3-8B +Qwen/Qwen3-8B-Base +Qwen/Qwen3-14B +Qwen/Qwen3-14B-Base +Qwen/Qwen3-32B +Qwen/Qwen3-30B-A3B +Qwen/Qwen3-30B-A3B-Base +Qwen/Qwen3-235B-A22B +Qwen/Qwen3-4B-Thinking-2507 +Qwen/Qwen3-4B-Instruct-2507 +Qwen/Qwen3-30B-A3B-Thinking-2507 +Qwen/Qwen3-30B-A3B-Instruct-2507 +Qwen/Qwen3-235B-A22B-Thinking-2507 +Qwen/Qwen3-235B-A22B-Instruct-2507 Qwen/QwQ-32B Qwen/QwQ-32B-Preview stabilityai/FreeWilly2