diff --git a/README.md b/README.md
index d2e0494443..6161e4d3d6 100644
--- a/README.md
+++ b/README.md
@@ -142,7 +142,8 @@ Every model is written from scratch to maximize performance and remove layers of
 | Qwen2.5 | 0.5B, 1.5B, 3B, 7B, 14B, 32B, 72B | Alibaba Group | [Qwen Team 2024](https://qwenlm.github.io/blog/qwen2.5/) |
 | Qwen2.5 Coder | 0.5B, 1.5B, 3B, 7B, 14B, 32B | Alibaba Group | [Hui, Binyuan et al. 2024](https://arxiv.org/abs/2409.12186) |
 | Qwen2.5 Math | 1.5B, 7B, 72B | Alibaba Group | [An, Yang et al. 2024](https://arxiv.org/abs/2409.12122) |
-| QwQ | 32B | Alibaba Group | [Qwen Team 2024](https://qwenlm.github.io/blog/qwq-32b-preview/) |
+| QwQ | 32B | Alibaba Group | [Qwen Team 2025](https://qwenlm.github.io/blog/qwq-32b/) |
+| QwQ-Preview | 32B | Alibaba Group | [Qwen Team 2024](https://qwenlm.github.io/blog/qwq-32b-preview/) |
 | R1 Distill Llama | 8B, 70B | DeepSeek AI | [DeepSeek AI 2025](https://github.com/deepseek-ai/DeepSeek-R1/blob/main/DeepSeek_R1.pdf) |
 | SmolLM2 | 135M, 360M, 1.7B | Hugging Face | [Hugging Face 2024](https://github.com/huggingface/smollm) |
 | Salamandra | 2B, 7B | Barcelona Supercomputing Centre | [BSC-LTC 2024](https://github.com/BSC-LTC/salamandra) |
diff --git a/litgpt/config.py b/litgpt/config.py
index f36bb11a7a..401233399d 100644
--- a/litgpt/config.py
+++ b/litgpt/config.py
@@ -2254,11 +2254,32 @@ def norm_class(self) -> Type:
         configs.append(copy)
 
 qwq = [
+    # https://huggingface.co/Qwen/QwQ-32B/blob/main/config.json
+    dict(
+        name="QwQ-32B",
+        hf_config=dict(org="Qwen", name="QwQ-32B"),
+        block_size=131072,
+        vocab_size=151643,
+        padded_vocab_size=152064,
+        n_layer=64,
+        n_head=40,
+        n_embd=5120,
+        n_query_groups=8,
+        rotary_percentage=1.0,
+        parallel_residual=False,
+        bias=False,
+        attn_bias=True,
+        norm_class_name="RMSNorm",
+        mlp_class_name="LLaMAMLP",
+        intermediate_size=27648,
+        norm_eps=1e-5,
+        rope_base=1000000,
+    ),
     # https://huggingface.co/Qwen/QwQ-32B-Preview/blob/main/config.json
     dict(
         name="QwQ-32B-Preview",
         hf_config=dict(org="Qwen", name="QwQ-32B-Preview"),
-        block_size=131072,
+        block_size=32768,
         vocab_size=151643,
         padded_vocab_size=152064,
         n_layer=64,
diff --git a/tests/convert/test_lit_checkpoint.py b/tests/convert/test_lit_checkpoint.py
index c80d3f5ec4..9259c76ddd 100644
--- a/tests/convert/test_lit_checkpoint.py
+++ b/tests/convert/test_lit_checkpoint.py
@@ -529,7 +529,9 @@ def test_check_conversion_supported_lora():
 
 
 @torch.inference_mode()
-@pytest.mark.parametrize("model_name", ("Qwen2.5-1.5B", "Qwen2.5-Coder-1.5B", "Qwen2.5-Math-1.5B", "QwQ-32B-Preview"))
+@pytest.mark.parametrize(
+    "model_name", ["Qwen2.5-1.5B", "Qwen2.5-Coder-1.5B", "Qwen2.5-Math-1.5B", "QwQ-32B-Preview", "QwQ-32B"]
+)
 @pytest.mark.parametrize(
     ("device", "dtype"),
     [
diff --git a/tests/test_model.py b/tests/test_model.py
index d0babac8f8..c1e1b1a6a4 100644
--- a/tests/test_model.py
+++ b/tests/test_model.py
@@ -800,7 +800,9 @@ def test_against_original_gemma_2(model_name, device, dtype):
 
 
 @torch.inference_mode()
-@pytest.mark.parametrize("model_name", ("Qwen2.5-1.5B", "Qwen2.5-Coder-1.5B", "Qwen2.5-Math-1.5B", "QwQ-32B-Preview"))
+@pytest.mark.parametrize(
+    "model_name", ["Qwen2.5-1.5B", "Qwen2.5-Coder-1.5B", "Qwen2.5-Math-1.5B", "QwQ-32B-Preview", "QwQ-32B"]
+)
 @pytest.mark.parametrize(
     ("device", "dtype"),
     [
diff --git a/tutorials/download_model_weights.md b/tutorials/download_model_weights.md
index 40335d949a..dbd25e72d1 100644
--- a/tutorials/download_model_weights.md
+++ b/tutorials/download_model_weights.md
@@ -40,7 +40,8 @@ LitGPT supports a variety of LLM architectures with publicly available weights.
 | Qwen2.5 | 0.5B, 1.5B, 3B, 7B, 14B, 32B, 72B | Alibaba Group | [Qwen Team 2024](https://qwenlm.github.io/blog/qwen2.5/) |
 | Qwen2.5 Coder | 0.5B, 1.5B, 3B, 7B, 14B, 32B | Alibaba Group | [Hui, Binyuan et al. 2024](https://arxiv.org/abs/2409.12186) |
 | Qwen2.5 Math | 1.5B, 7B, 72B | Alibaba Group | [An, Yang et al. 2024](https://arxiv.org/abs/2409.12122) |
-| QwQ | 32B | Alibaba Group | [Qwen Team 2024](https://qwenlm.github.io/blog/qwq-32b-preview/) |
+| QwQ | 32B | Alibaba Group | [Qwen Team 2025](https://qwenlm.github.io/blog/qwq-32b/) |
+| QwQ-Preview | 32B | Alibaba Group | [Qwen Team 2024](https://qwenlm.github.io/blog/qwq-32b-preview/) |
 | R1 Distll Llama | 8B, 70B | DeepSeek AI | [DeepSeek AI 2025](https://github.com/deepseek-ai/DeepSeek-R1/blob/main/DeepSeek_R1.pdf) |
 | RedPajama-INCITE | 3B, 7B | Together | [Together 2023](https://together.ai/blog/redpajama-models-v1) |
 | SmolLM2 | 135M, 360M, 1.7B | Hugging Face | [Hugging Face 2024](https://github.com/huggingface/smollm) |
@@ -221,6 +222,7 @@ Qwen/Qwen2.5-Math-7B
 Qwen/Qwen2.5-Math-7B-Instruct
 Qwen/Qwen2.5-Math-72B
 Qwen/Qwen2.5-Math-72B-Instruct
+Qwen/QwQ-32B
 Qwen/QwQ-32B-Preview
 stabilityai/FreeWilly2
 stabilityai/stable-code-3b
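As a quick sanity check for the new entry, the sketch below (not part of the diff) resolves the added "QwQ-32B" name through litgpt's config registry and compares a few fields against the values introduced in litgpt/config.py. It assumes Config.from_name looks up registered names the same way the existing tests do.

```python
# Sanity-check sketch: verify the new "QwQ-32B" config is registered and
# carries the values added in litgpt/config.py by this change.
# Assumes Config.from_name resolves entries by name, as in the existing tests.
from litgpt.config import Config

config = Config.from_name("QwQ-32B")

assert config.hf_config["org"] == "Qwen"
assert config.hf_config["name"] == "QwQ-32B"
assert config.block_size == 131072  # full 131k context, unlike the 32k preview
assert config.n_layer == 64
assert config.n_head == 40
assert config.n_query_groups == 8
assert config.intermediate_size == 27648

print(config.name, "resolved OK")
```

The checkpoint itself would then be fetched with the identifier added to tutorials/download_model_weights.md, e.g. `litgpt download Qwen/QwQ-32B`.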