diff --git a/.typos.toml b/.typos.toml
index c4af3a89ea..7e33273636 100644
--- a/.typos.toml
+++ b/.typos.toml
@@ -13,6 +13,7 @@ extend-ignore-identifiers-re = [
"tese",
"seperable",
"Seperable",
+ "setp",
]
[files]
diff --git a/README.md b/README.md
index 6ea9a222b0..a45355bad7 100644
--- a/README.md
+++ b/README.md
@@ -67,6 +67,22 @@ Please submit requests for new models [here](https://github.com/EricLBuehler/mis
## Quick examples
*After following installation instructions*
+
+- 🤗🤗🤗 Run the **SmolLM3** long-context hybrid-reasoning model with full tool-calling support: [documentation](docs/SMOLLM3.md)
+
+ Show command
+
+ **Default, easiest:**
+ ```bash
+ ./mistralrs-server -i --isq 8 run -m HuggingFaceTB/SmolLM3-3B
+ ```
+
+ **UQFF prequantized:**
+ ```bash
+ ./mistralrs-server -i run -m EricB/SmolLM3-3B-UQFF -f smollm33b-q4k-0.uqff
+ ```
+
+
- 🔊 Run the **Dia 1.6b** model for highly-realistic dialogue generation: [documentation](docs/DIA.md)
Show command
@@ -493,6 +509,7 @@ If you do not specify the architecture, an attempt will be made to use the model
- `deepseekv3`
- `qwen3`
- `qwen3moe`
+- `smollm3`
@@ -576,6 +593,7 @@ Please submit more benchmarks via raising an issue!
|Mistral 3| | |✅|
|Llama 4| | |✅|
|Qwen 3|✅| |✅|
+|SmolLM3| | |✅|
|Dia 1.6b| | |✅|
@@ -622,6 +640,7 @@ Please submit more benchmarks via raising an issue!
|Mistral 3| | | |
|Llama 4| | | |
|Qwen 3| | | |
+|SmolLM3|✅| | |
@@ -655,6 +674,7 @@ Please submit more benchmarks via raising an issue!
|Mistral 3|✅|
|Llama 4| |
|Qwen 3| |
+|SmolLM3|✅|
### Using derivative and adapter models
diff --git a/chat_templates/smollm3.jinja b/chat_templates/smollm3.jinja
new file mode 100644
index 0000000000..107bc71912
--- /dev/null
+++ b/chat_templates/smollm3.jinja
@@ -0,0 +1,89 @@
+{# ───── defaults ───── #}
+{%- if enable_thinking is not defined -%}
+{%- set enable_thinking = true -%}
+{%- endif -%}
+
+{# ───── reasoning mode ───── #}
+{%- if enable_thinking -%}
+ {%- set reasoning_mode = "/think" -%}
+{%- else -%}
+ {%- set reasoning_mode = "/no_think" -%}
+{%- endif -%}
+
+{# ───── header (system message) ───── #}
+{{- "<|im_start|>system\n" -}}
+
+{%- if messages[0].role == "system" -%}
+ {%- set system_message = messages[0].content -%}
+ {%- if "/no_think" in system_message -%}
+ {%- set reasoning_mode = "/no_think" -%}
+ {%- elif "/think" in system_message -%}
+ {%- set reasoning_mode = "/think" -%}
+ {%- endif -%}
+ {%- set custom_instructions = system_message.replace("/no_think", "").replace("/think", "").rstrip() -%}
+{%- endif -%}
+
+{%- if "/system_override" in system_message -%}
+ {{- custom_instructions.replace("/system_override", "").rstrip() -}}
+ {{- "<|im_end|>\n" -}}
+{%- else -%}
+ {{- "## Metadata\n\n" -}}
+ {{- "Knowledge Cutoff Date: June 2025\n" -}}
+ {%- set today = strftime_now("%d %B %Y") -%}
+ {{- "Today Date: " ~ today ~ "\n" -}}
+ {{- "Reasoning Mode: " + reasoning_mode + "\n\n" -}}
+
+ {{- "## Custom Instructions\n\n" -}}
+ {%- if custom_instructions -%}
+ {{- custom_instructions + "\n\n" -}}
+ {%- elif reasoning_mode == "/think" -%}
+ {{- "You are a helpful AI assistant named SmolLM, trained by Hugging Face. Your role as an assistant involves thoroughly exploring questions through a systematic thinking process before providing the final precise and accurate solutions. This requires engaging in a comprehensive cycle of analysis, summarizing, exploration, reassessment, reflection, backtracking, and iteration to develop well-considered thinking process. Please structure your response into two main sections: Thought and Solution using the specified format: Thought section Solution section. In the Thought section, detail your reasoning process in steps. Each step should include detailed considerations such as analysing questions, summarizing relevant findings, brainstorming new ideas, verifying the accuracy of the current steps, refining any errors, and revisiting previous steps. In the Solution section, based on various attempts, explorations, and reflections from the Thought section, systematically present the final solution that you deem correct. The Solution section should be logical, accurate, and concise and detail necessary steps needed to reach the conclusion.\n\n" -}}
+ {%- else -%}
+ {{- "You are a helpful AI assistant named SmolLM, trained by Hugging Face.\n\n" -}}
+ {%- endif -%}
+
+ {%- if xml_tools or python_tools -%}
+ {{- "### Tools\n\n" -}}
+ {%- if xml_tools -%}
+      {%- set ns = namespace(xml_tool_string="You may call one or more functions to assist with the user query.\nYou are provided with function signatures within <tools></tools> XML tags:\n\n<tools>\n") -%}
+ {%- for tool in xml_tools[:] -%} {# The slicing makes sure that xml_tools is a list #}
+ {%- set ns.xml_tool_string = ns.xml_tool_string ~ (tool | string) ~ "\n" -%}
+ {%- endfor -%}
+    {%- set xml_tool_string = ns.xml_tool_string + "</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call>" -%}
+ {{- xml_tool_string -}}
+ {%- endif -%}
+ {%- if python_tools -%}
+      {%- set ns = namespace(python_tool_string="When you send a message containing Python code between '<code>' and '</code>' tags, it will be executed in a stateful Jupyter notebook environment, and you will then be given the output to continued reasoning in an agentic loop.\n\nYou can use the following tools in your python code like regular functions:\n\n") -%}
+ {%- for tool in python_tools[:] -%} {# The slicing makes sure that python_tools is a list #}
+ {%- set ns.python_tool_string = ns.python_tool_string ~ (tool | string) ~ "\n" -%}
+ {%- endfor -%}
+ {%- set python_tool_string = ns.python_tool_string + "\n\nThe state persists between code executions: so variables that you define in one step are still available thereafter." -%}
+ {{- python_tool_string -}}
+ {%- endif -%}
+ {{- "\n\n" -}}
+  {%- endif -%}
+  {{- "<|im_end|>\n" -}}
+{%- endif -%}
+{# ───── main loop ───── #}
+{%- for message in messages -%}
+ {%- set content = message.content if message.content is string else "" -%}
+ {%- if message.role == "user" -%}
+ {{ "<|im_start|>" + message.role + "\n" + content + "<|im_end|>\n" }}
+ {%- elif message.role == "assistant" -%}
+ {%- if reasoning_mode == "/think" -%}
+ {{ "<|im_start|>assistant\n" + content.lstrip("\n") + "<|im_end|>\n" }}
+ {%- else -%}
+      {{ "<|im_start|>assistant\n" + "<think>\n\n</think>\n" + content.lstrip("\n") + "<|im_end|>\n" }}
+ {%- endif -%}
+ {%- elif message.role == "tool" -%}
+ {{ "<|im_start|>" + "user\n" + content + "<|im_end|>\n" }}
+ {%- endif -%}
+{%- endfor -%}
+{# ───── generation prompt ───── #}
+{%- if add_generation_prompt -%}
+ {%- if reasoning_mode == "/think" -%}
+ {{ "<|im_start|>assistant\n" }}
+ {%- else -%}
+    {{ "<|im_start|>assistant\n" + "<think>\n\n</think>\n" }}
+ {%- endif -%}
+{%- endif -%}
\ No newline at end of file
diff --git a/docs/ANYMOE.md b/docs/ANYMOE.md
index 12dd2accca..f3fcba2f53 100644
--- a/docs/ANYMOE.md
+++ b/docs/ANYMOE.md
@@ -126,7 +126,7 @@ runner = Runner(
res = runner.send_chat_completion_request(
ChatCompletionRequest(
- model="ignore",
+ model="default",
messages=[
{"role": "user", "content": "Tell me a story about the Rust type system."}
],
diff --git a/docs/DEEPSEEKV2.md b/docs/DEEPSEEKV2.md
index 52ad7b6ef3..e3be667471 100644
--- a/docs/DEEPSEEKV2.md
+++ b/docs/DEEPSEEKV2.md
@@ -35,7 +35,7 @@ while True:
prompt = input(">>> ")
messages.append({"role": "user", "content": prompt})
completion = client.chat.completions.create(
- model="ignore",
+ model="default",
messages=messages,
max_tokens=256,
frequency_penalty=1.0,
@@ -60,7 +60,7 @@ runner = Runner(
res = runner.send_chat_completion_request(
ChatCompletionRequest(
- model="ignore",
+ model="default",
messages=[
{"role": "user", "content": "Tell me a story about the Rust type system."}
],
diff --git a/docs/DEEPSEEKV3.md b/docs/DEEPSEEKV3.md
index ab10c03902..08e18b244e 100644
--- a/docs/DEEPSEEKV3.md
+++ b/docs/DEEPSEEKV3.md
@@ -44,7 +44,7 @@ while True:
prompt = input(">>> ")
messages.append({"role": "user", "content": prompt})
completion = client.chat.completions.create(
- model="ignore",
+ model="default",
messages=messages,
max_tokens=256,
frequency_penalty=1.0,
@@ -69,7 +69,7 @@ runner = Runner(
res = runner.send_chat_completion_request(
ChatCompletionRequest(
- model="ignore",
+ model="default",
messages=[
{"role": "user", "content": "Tell me a story about the Rust type system."}
],
diff --git a/docs/DIA.md b/docs/DIA.md
index 41e965c648..953134d005 100644
--- a/docs/DIA.md
+++ b/docs/DIA.md
@@ -29,7 +29,7 @@ client = OpenAI(api_key="foobar", base_url="http://localhost:1234/v1/")
text_to_speak = "[S1] mistral r s is a local LLM inference engine. [S2] You can run text and vision models, and also image generation and speech generation. [S1] There is agentic web search, tool calling, and a convenient Python API. [S2] Check it out on github."
response = client.audio.speech.create(
- model="ignore", voice="N/A", input=text_to_speak, response_format="wav"
+ model="default", voice="N/A", input=text_to_speak, response_format="wav"
)
output_path = Path("output.wav")
diff --git a/docs/FLUX.md b/docs/FLUX.md
index b2cb1e5893..28c998ddcd 100644
--- a/docs/FLUX.md
+++ b/docs/FLUX.md
@@ -30,7 +30,7 @@ from openai import OpenAI
client = OpenAI(api_key="foobar", base_url="http://localhost:1234/v1/")
result = client.images.generate(
- model="ignore",
+ model="default",
prompt="A vibrant sunset in the mountains, 4k, high quality.",
n=1,
)
diff --git a/docs/GEMMA2.md b/docs/GEMMA2.md
index 1362c5df3d..ce43e7f77d 100644
--- a/docs/GEMMA2.md
+++ b/docs/GEMMA2.md
@@ -19,7 +19,7 @@ while True:
prompt = input(">>> ")
messages.append({"role": "user", "content": prompt})
completion = client.chat.completions.create(
- model="ignore",
+ model="default",
messages=messages,
max_tokens=256,
frequency_penalty=1.0,
@@ -44,7 +44,7 @@ runner = Runner(
res = runner.send_chat_completion_request(
ChatCompletionRequest(
- model="ignore",
+ model="default",
messages=[
{"role": "user", "content": "Tell me a story about the Rust type system."}
],
diff --git a/docs/GEMMA3.md b/docs/GEMMA3.md
index 784cc4b25c..d10dfd3d61 100644
--- a/docs/GEMMA3.md
+++ b/docs/GEMMA3.md
@@ -63,7 +63,7 @@ client = OpenAI(api_key="foobar", base_url="http://localhost:1234/v1/")
completion = client.chat.completions.create(
- model="ignore",
+ model="default",
messages=[
{
"role": "user",
@@ -160,7 +160,7 @@ runner = Runner(
res = runner.send_chat_completion_request(
ChatCompletionRequest(
- model="ignore",
+ model="default",
messages=[
{
"role": "user",
diff --git a/docs/GLM4.md b/docs/GLM4.md
index 4a47696e7b..e8ec6a78fe 100644
--- a/docs/GLM4.md
+++ b/docs/GLM4.md
@@ -19,7 +19,7 @@ while True:
prompt = input(">>> ")
messages.append({"role": "user", "content": prompt})
completion = client.chat.completions.create(
- model="ignore",
+ model="default",
messages=messages,
max_tokens=256,
frequency_penalty=1.0,
@@ -44,7 +44,7 @@ runner = Runner(
res = runner.send_chat_completion_request(
ChatCompletionRequest(
- model="ignore",
+ model="default",
messages=[
{"role": "user", "content": "Tell me a story about the Rust type system."}
],
diff --git a/docs/HTTP.md b/docs/HTTP.md
index b9227f5218..af45e401d9 100644
--- a/docs/HTTP.md
+++ b/docs/HTTP.md
@@ -46,7 +46,7 @@ client = openai.OpenAI(
)
completion = client.chat.completions.create(
-model="ignore",
+model="default",
messages=[
{"role": "system", "content": "You are Mistral.rs, an AI assistant."},
{"role": "user", "content": "Write a story about Rust error handling."}
@@ -62,7 +62,7 @@ curl http://localhost:8080/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer EMPTY" \
-d '{
-"model": "ignore",
+"model": "default",
"messages": [
{
"role": "system",
@@ -116,7 +116,7 @@ client = openai.OpenAI(
)
completion = client.completions.create(
- model="ignore",
+ model="default",
prompt="What is Rust?",
max_tokens=256,
frequency_penalty=1.0,
@@ -133,7 +133,7 @@ curl http://localhost:8080/v1/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer EMPTY" \
-d '{
-"model": "ignore",
+"model": "default",
"prompt": "What is Rust?"
}'
```
diff --git a/docs/IDEFICS2.md b/docs/IDEFICS2.md
index 86af1fd3e9..de6c314048 100644
--- a/docs/IDEFICS2.md
+++ b/docs/IDEFICS2.md
@@ -58,7 +58,7 @@ from openai import OpenAI
client = OpenAI(api_key="foobar", base_url="http://localhost:1234/v1/")
completion = client.chat.completions.create(
- model="ignore",
+ model="default",
messages=[
{
"role": "user",
@@ -155,7 +155,7 @@ runner = Runner(
res = runner.send_chat_completion_request(
ChatCompletionRequest(
- model="ignore",
+ model="default",
messages=[
{
"role": "user",
diff --git a/docs/IDEFICS3.md b/docs/IDEFICS3.md
index d1d391bb74..012fb109db 100644
--- a/docs/IDEFICS3.md
+++ b/docs/IDEFICS3.md
@@ -120,7 +120,7 @@ from openai import OpenAI
client = OpenAI(api_key="foobar", base_url="http://localhost:1234/v1/")
completion = client.chat.completions.create(
- model="ignore",
+ model="default",
messages=[
{
"role": "user",
@@ -217,7 +217,7 @@ runner = Runner(
res = runner.send_chat_completion_request(
ChatCompletionRequest(
- model="ignore",
+ model="default",
messages=[
{
"role": "user",
diff --git a/docs/LLAMA4.md b/docs/LLAMA4.md
index 6177aaae1b..c092ee11d7 100644
--- a/docs/LLAMA4.md
+++ b/docs/LLAMA4.md
@@ -82,7 +82,7 @@ client = OpenAI(api_key="foobar", base_url="http://localhost:1234/v1/")
completion = client.chat.completions.create(
- model="ignore",
+ model="default",
messages=[
{
"role": "user",
@@ -181,7 +181,7 @@ runner = Runner(
res = runner.send_chat_completion_request(
ChatCompletionRequest(
- model="ignore",
+ model="default",
messages=[
{
"role": "user",
diff --git a/docs/LLaVA.md b/docs/LLaVA.md
index 5f9ceb18f8..b104a4cbf6 100644
--- a/docs/LLaVA.md
+++ b/docs/LLaVA.md
@@ -68,7 +68,7 @@ from openai import OpenAI
client = OpenAI(api_key="foobar", base_url="http://localhost:1234/v1/")
completion = client.chat.completions.create(
- model="ignore",
+ model="default",
messages=[
{
"role": "user",
@@ -164,7 +164,7 @@ runner = Runner(
res = runner.send_chat_completion_request(
ChatCompletionRequest(
- model="ignore",
+ model="default",
messages=[
{
"role": "user",
diff --git a/docs/MCP/README.md b/docs/MCP/README.md
index 0546c6bed9..20c61a2a6c 100644
--- a/docs/MCP/README.md
+++ b/docs/MCP/README.md
@@ -150,7 +150,7 @@ runner = mistralrs.Runner(
# Use the model - tools are automatically available
res = runner.send_chat_completion_request(
mistralrs.ChatCompletionRequest(
- model="ignore",
+ model="default",
messages=[
{"role": "user", "content": "List the files in the current directory and create a test.txt file"}
],
diff --git a/docs/MINICPMO_2_6.md b/docs/MINICPMO_2_6.md
index e22a63fc30..3768ea879d 100644
--- a/docs/MINICPMO_2_6.md
+++ b/docs/MINICPMO_2_6.md
@@ -105,7 +105,7 @@ from openai import OpenAI
client = OpenAI(api_key="foobar", base_url="http://localhost:1234/v1/")
completion = client.chat.completions.create(
- model="ignore",
+ model="default",
messages=[
{
"role": "user",
@@ -205,7 +205,7 @@ runner = Runner(
res = runner.send_chat_completion_request(
ChatCompletionRequest(
- model="ignore",
+ model="default",
messages=[
{
"role": "user",
diff --git a/docs/MISTRAL3.md b/docs/MISTRAL3.md
index a7b7da26bb..ae3dc6e5d5 100644
--- a/docs/MISTRAL3.md
+++ b/docs/MISTRAL3.md
@@ -73,7 +73,7 @@ client = OpenAI(api_key="foobar", base_url="http://localhost:1234/v1/")
completion = client.chat.completions.create(
- model="ignore",
+ model="default",
messages=[
{
"role": "user",
@@ -171,7 +171,7 @@ runner = Runner(
res = runner.send_chat_completion_request(
ChatCompletionRequest(
- model="ignore",
+ model="default",
messages=[
{
"role": "user",
diff --git a/docs/PAGED_ATTENTION.md b/docs/PAGED_ATTENTION.md
index 765d8fdc8d..83c253cd1d 100644
--- a/docs/PAGED_ATTENTION.md
+++ b/docs/PAGED_ATTENTION.md
@@ -153,7 +153,7 @@ runner = Runner(
res = runner.send_chat_completion_request(
ChatCompletionRequest(
- model="ignore",
+ model="default",
messages=[
{"role": "user", "content": "Tell me a story about the Rust type system."}
],
diff --git a/docs/PHI3.5MOE.md b/docs/PHI3.5MOE.md
index 5abaf550b5..f5f5501bf5 100644
--- a/docs/PHI3.5MOE.md
+++ b/docs/PHI3.5MOE.md
@@ -44,7 +44,7 @@ while True:
prompt = input(">>> ")
messages.append({"role": "user", "content": prompt})
completion = client.chat.completions.create(
- model="ignore",
+ model="default",
messages=messages,
max_tokens=256,
frequency_penalty=1.0,
@@ -69,7 +69,7 @@ runner = Runner(
res = runner.send_chat_completion_request(
ChatCompletionRequest(
- model="ignore",
+ model="default",
messages=[
{"role": "user", "content": "Tell me a story about the Rust type system."}
],
diff --git a/docs/PHI3V.md b/docs/PHI3V.md
index a2120a826d..47df4f9c79 100644
--- a/docs/PHI3V.md
+++ b/docs/PHI3V.md
@@ -62,7 +62,7 @@ from openai import OpenAI
client = OpenAI(api_key="foobar", base_url="http://localhost:1234/v1/")
completion = client.chat.completions.create(
- model="ignore",
+ model="default",
messages=[
{
"role": "user",
@@ -158,7 +158,7 @@ runner = Runner(
res = runner.send_chat_completion_request(
ChatCompletionRequest(
- model="ignore",
+ model="default",
messages=[
{
"role": "user",
diff --git a/docs/PHI4MM.md b/docs/PHI4MM.md
index 172acfba69..6ceb84be19 100644
--- a/docs/PHI4MM.md
+++ b/docs/PHI4MM.md
@@ -62,7 +62,7 @@ from openai import OpenAI
client = OpenAI(api_key="foobar", base_url="http://localhost:1234/v1/")
completion = client.chat.completions.create(
- model="ignore",
+ model="default",
messages=[
{
"role": "user",
@@ -159,7 +159,7 @@ runner = Runner(
res = runner.send_chat_completion_request(
ChatCompletionRequest(
- model="ignore",
+ model="default",
messages=[
{
"role": "user",
diff --git a/docs/QWEN2VL.md b/docs/QWEN2VL.md
index 88961b7adb..dc8d92aad1 100644
--- a/docs/QWEN2VL.md
+++ b/docs/QWEN2VL.md
@@ -103,7 +103,7 @@ from openai import OpenAI
client = OpenAI(api_key="foobar", base_url="http://localhost:1234/v1/")
completion = client.chat.completions.create(
- model="ignore",
+ model="default",
messages=[
{
"role": "user",
@@ -204,7 +204,7 @@ runner = Runner(
res = runner.send_chat_completion_request(
ChatCompletionRequest(
- model="ignore",
+ model="default",
messages=[
{
"role": "user",
diff --git a/docs/QWEN3.md b/docs/QWEN3.md
index 0e8806f702..b62f50b05f 100644
--- a/docs/QWEN3.md
+++ b/docs/QWEN3.md
@@ -12,7 +12,7 @@ The Qwen 3 family is a collection of hybrid reasoning MoE and non-MoE models ran
> Note: tool calling support is fully implemented for the Qwen 3 models, including agentic web search.
## Enabling thinking
-The Qwen 3 models are hybrid reasoning models which can be controlled at inference-time. By default, reasoning is enabled for these models. To dynamically control this, it is recommended to either add `/no_think` or `/think` to your prompt. Alternatively, you can specify the `enable_thinking` flag as detailed by the API-specific examples.
+The Qwen 3 models are hybrid reasoning models which can be controlled at inference-time. **By default, reasoning is enabled for these models.** To dynamically control this, it is recommended to either add `/no_think` or `/think` to your prompt. Alternatively, you can specify the `enable_thinking` flag as detailed by the API-specific examples.
## HTTP API
You can find a more detailed example demonstrating enabling/disabling thinking [here](../examples/server/qwen3.py).
@@ -34,12 +34,13 @@ while True:
prompt = input(">>> ")
messages.append({"role": "user", "content": prompt})
completion = client.chat.completions.create(
- model="ignore",
+ model="default",
messages=messages,
max_tokens=256,
frequency_penalty=1.0,
top_p=0.1,
temperature=0,
+ # enable_thinking=False,
)
resp = completion.choices[0].message.content
print(resp)
@@ -61,7 +62,7 @@ runner = Runner(
res = runner.send_chat_completion_request(
ChatCompletionRequest(
- model="ignore",
+ model="default",
messages=[
{"role": "user", "content": "Tell me a story about the Rust type system."}
],
@@ -69,6 +70,7 @@ res = runner.send_chat_completion_request(
presence_penalty=1.0,
top_p=0.1,
temperature=0.1,
+ # enable_thinking=False,
)
)
print(res.choices[0].message.content)
@@ -94,6 +96,7 @@ async fn main() -> Result<()> {
.await?;
let messages = TextMessages::new()
+ // .enable_thinking(false)
.add_message(
TextMessageRole::System,
"You are an AI agent with a specialty in programming.",
diff --git a/docs/SMOLLM3.md b/docs/SMOLLM3.md
new file mode 100644
index 0000000000..79572e58a5
--- /dev/null
+++ b/docs/SMOLLM3.md
@@ -0,0 +1,124 @@
+# SmolLM3: [`HuggingFaceTB/SmolLM3-3B`](https://huggingface.co/HuggingFaceTB/SmolLM3-3B)
+
+SmolLM3 is a 3B-parameter, long-context, hybrid-reasoning language model. It supports 6 languages, advanced reasoning, and long context, and is a fully open model that offers strong performance at the 3B–4B scale.
+
+**Default, easiest:**
+```bash
+./mistralrs-server -i --isq 8 run -m HuggingFaceTB/SmolLM3-3B
+```
+
+**UQFF prequantized:**
+```bash
+./mistralrs-server -i run -m EricB/SmolLM3-3B-UQFF -f smollm33b-q4k-0.uqff
+```
+
+> Note: tool calling support is fully implemented for the SmolLM3 models, including agentic web search.
+
+> Check out prequantized UQFF SmolLM3 here: https://huggingface.co/EricB/SmolLM3-3B-UQFF
+
+## Enabling thinking
+The SmolLM3 models are hybrid reasoning models which can be controlled at inference-time. **By default, reasoning is enabled for these models.** To dynamically control this, it is recommended to either add `/no_think` or `/think` to your prompt. Alternatively, you can specify the `enable_thinking` flag as detailed by the API-specific examples.
+
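+For example, appending the directive to a user turn switches the mode for that request only (a minimal sketch; the same message works with any of the APIs below):
+
+```py
+messages = [
+    {"role": "user", "content": "How many rs are in strawberry? /no_think"},
+]
+```
+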
+## HTTP API
+You can find a more detailed example demonstrating enabling/disabling thinking [here](../examples/server/smollm3.py).
+
+```
+./mistralrs-server --isq 8 --port 1234 plain -m HuggingFaceTB/SmolLM3-3B
+```
+
+```py
+import openai
+
+client = openai.OpenAI(api_key="foobar", base_url="http://localhost:1234/v1/")
+
+messages = []
+prompt = input("Enter system prompt >>> ")
+if len(prompt) > 0:
+ messages.append({"role": "system", "content": prompt})
+
+
+while True:
+ prompt = input(">>> ")
+ messages.append({"role": "user", "content": prompt})
+ completion = client.chat.completions.create(
+ model="default",
+ messages=messages,
+ max_tokens=256,
+ frequency_penalty=1.0,
+ top_p=0.1,
+ temperature=0,
+ # enable_thinking=False,
+ )
+ resp = completion.choices[0].message.content
+ print(resp)
+ messages.append({"role": "assistant", "content": resp})
+```
+
+## Python API
+You can find a more detailed example demonstrating enabling/disabling thinking [here](../examples/python/smollm3.py).
+
+```py
+from mistralrs import Runner, Which, ChatCompletionRequest, Architecture
+
+runner = Runner(
+ which=Which.Plain(
+ model_id="HuggingFaceTB/SmolLM3-3B",
+ arch=Architecture.SmolLm3,
+ ),
+)
+
+res = runner.send_chat_completion_request(
+ ChatCompletionRequest(
+ model="default",
+ messages=[
+ {"role": "user", "content": "Tell me a story about the Rust type system."}
+ ],
+ max_tokens=256,
+ presence_penalty=1.0,
+ top_p=0.1,
+ temperature=0.1,
+ # enable_thinking=False,
+ )
+)
+print(res.choices[0].message.content)
+print(res.usage)
+```
+
+## Rust API
+You can find a more detailed example demonstrating enabling/disabling thinking [here](../mistralrs/examples/smollm3/main.rs).
+
+```rust
+use anyhow::Result;
+use mistralrs::{
+ IsqType, PagedAttentionMetaBuilder, TextMessageRole, TextMessages, TextModelBuilder,
+};
+
+#[tokio::main]
+async fn main() -> Result<()> {
+ let model = TextModelBuilder::new("HuggingFaceTB/SmolLM3-3B")
+ .with_isq(IsqType::Q8_0)
+ .with_logging()
+ .with_paged_attn(|| PagedAttentionMetaBuilder::default().build())?
+ .build()
+ .await?;
+
+ let messages = TextMessages::new()
+ // .enable_thinking(false)
+ .add_message(
+ TextMessageRole::System,
+ "You are an AI agent with a specialty in programming.",
+ )
+ .add_message(
+ TextMessageRole::User,
+ "Hello! How are you? Please write generic binary search function in Rust.",
+ );
+
+ let response = model.send_chat_request(messages).await?;
+
+ println!("{}", response.choices[0].message.content.as_ref().unwrap());
+ dbg!(
+ response.usage.avg_prompt_tok_per_sec,
+ response.usage.avg_compl_tok_per_sec
+ );
+
+ Ok(())
+}
+```
diff --git a/docs/VLLAMA.md b/docs/VLLAMA.md
index d64338e58b..9b7ef37ad0 100644
--- a/docs/VLLAMA.md
+++ b/docs/VLLAMA.md
@@ -118,7 +118,7 @@ from openai import OpenAI
client = OpenAI(api_key="foobar", base_url="http://localhost:1234/v1/")
completion = client.chat.completions.create(
- model="ignore",
+ model="default",
messages=[
{
"role": "user",
@@ -218,7 +218,7 @@ runner = Runner(
res = runner.send_chat_completion_request(
ChatCompletionRequest(
- model="ignore",
+ model="default",
messages=[
{
"role": "user",
diff --git a/docs/WEB_SEARCH.md b/docs/WEB_SEARCH.md
index a2f7899f91..a815c3914a 100644
--- a/docs/WEB_SEARCH.md
+++ b/docs/WEB_SEARCH.md
@@ -82,7 +82,7 @@ messages = [
]
completion = client.chat.completions.create(
- model="ignore",
+ model="default",
messages=messages,
tool_choice="auto",
max_tokens=1024,
@@ -132,7 +132,7 @@ runner = Runner(
res = runner.send_chat_completion_request(
ChatCompletionRequest(
- model="ignore",
+ model="default",
messages=[
{
"role": "user",
diff --git a/examples/python/anymoe.py b/examples/python/anymoe.py
index d44b99f481..fa33839ed1 100644
--- a/examples/python/anymoe.py
+++ b/examples/python/anymoe.py
@@ -29,7 +29,7 @@
res = runner.send_chat_completion_request(
ChatCompletionRequest(
- model="ignore",
+ model="default",
messages=[
{"role": "user", "content": "Tell me a story about the Rust type system."}
],
diff --git a/examples/python/anymoe_inference.py b/examples/python/anymoe_inference.py
index 3c7aa4798a..539834222f 100644
--- a/examples/python/anymoe_inference.py
+++ b/examples/python/anymoe_inference.py
@@ -30,7 +30,7 @@
res = runner.send_chat_completion_request(
ChatCompletionRequest(
- model="ignore",
+ model="default",
messages=[
{"role": "user", "content": "Tell me a story about the Rust type system."}
],
diff --git a/examples/python/anymoe_lora.py b/examples/python/anymoe_lora.py
index e5a3281ebb..a1789672e4 100644
--- a/examples/python/anymoe_lora.py
+++ b/examples/python/anymoe_lora.py
@@ -31,7 +31,7 @@
res = runner.send_chat_completion_request(
ChatCompletionRequest(
- model="ignore",
+ model="default",
messages=[
{"role": "user", "content": "Tell me a story about the Rust type system."}
],
diff --git a/examples/python/custom_search.py b/examples/python/custom_search.py
index 260ba584a9..96007c6aaf 100644
--- a/examples/python/custom_search.py
+++ b/examples/python/custom_search.py
@@ -41,7 +41,7 @@ def local_search(query: str):
res = runner.send_chat_completion_request(
ChatCompletionRequest(
- model="ignore",
+ model="default",
messages=[{"role": "user", "content": "Where is Cargo.toml in this repo?"}],
max_tokens=64,
web_search_options=WebSearchOptions(
diff --git a/examples/python/custom_tool_call.py b/examples/python/custom_tool_call.py
index daa84a8d45..7dfc21c84a 100644
--- a/examples/python/custom_tool_call.py
+++ b/examples/python/custom_tool_call.py
@@ -61,7 +61,7 @@ def tool_cb(name: str, args: dict) -> str:
res = runner.send_chat_completion_request(
ChatCompletionRequest(
- model="ignore",
+ model="default",
messages=[{"role": "user", "content": "Where is Cargo.toml in this repo?"}],
max_tokens=64,
tool_schemas=[schema],
diff --git a/examples/python/deepseekr1.py b/examples/python/deepseekr1.py
index 45215268b4..cd465365d9 100644
--- a/examples/python/deepseekr1.py
+++ b/examples/python/deepseekr1.py
@@ -9,7 +9,7 @@
res = runner.send_chat_completion_request(
ChatCompletionRequest(
- model="ignore",
+ model="default",
messages=[
{"role": "user", "content": "Tell me a story about the Rust type system."}
],
diff --git a/examples/python/deepseekv2.py b/examples/python/deepseekv2.py
index 3ea0046b81..dc65b041b7 100644
--- a/examples/python/deepseekv2.py
+++ b/examples/python/deepseekv2.py
@@ -9,7 +9,7 @@
res = runner.send_chat_completion_request(
ChatCompletionRequest(
- model="ignore",
+ model="default",
messages=[
{"role": "user", "content": "Tell me a story about the Rust type system."}
],
diff --git a/examples/python/gemma3.py b/examples/python/gemma3.py
index 809483c518..6fb7a7f535 100644
--- a/examples/python/gemma3.py
+++ b/examples/python/gemma3.py
@@ -9,7 +9,7 @@
res = runner.send_chat_completion_request(
ChatCompletionRequest(
- model="ignore",
+ model="default",
messages=[
{
"role": "user",
diff --git a/examples/python/gguf.py b/examples/python/gguf.py
index da2394a99e..205a4b9f33 100644
--- a/examples/python/gguf.py
+++ b/examples/python/gguf.py
@@ -10,7 +10,7 @@
res = runner.send_chat_completion_request(
ChatCompletionRequest(
- model="ignore",
+ model="default",
messages=[
{"role": "user", "content": "Tell me a story about the Rust type system."}
],
diff --git a/examples/python/idefics2.py b/examples/python/idefics2.py
index 9b54f358f3..7e219cbc53 100644
--- a/examples/python/idefics2.py
+++ b/examples/python/idefics2.py
@@ -9,7 +9,7 @@
res = runner.send_chat_completion_request(
ChatCompletionRequest(
- model="ignore",
+ model="default",
messages=[
{
"role": "user",
diff --git a/examples/python/idefics3.py b/examples/python/idefics3.py
index fe8954bc9f..192ae28cbe 100644
--- a/examples/python/idefics3.py
+++ b/examples/python/idefics3.py
@@ -9,7 +9,7 @@
res = runner.send_chat_completion_request(
ChatCompletionRequest(
- model="ignore",
+ model="default",
messages=[
{
"role": "user",
diff --git a/examples/python/imatrix.py b/examples/python/imatrix.py
index 5b74ae8e80..662b8f449f 100644
--- a/examples/python/imatrix.py
+++ b/examples/python/imatrix.py
@@ -10,7 +10,7 @@
res = runner.send_chat_completion_request(
ChatCompletionRequest(
- model="ignore",
+ model="default",
messages=[
{"role": "user", "content": "Tell me a story about the Rust type system."}
],
diff --git a/examples/python/isq.py b/examples/python/isq.py
index beba1236e1..ff65ced356 100644
--- a/examples/python/isq.py
+++ b/examples/python/isq.py
@@ -9,7 +9,7 @@
res = runner.send_chat_completion_request(
ChatCompletionRequest(
- model="ignore",
+ model="default",
messages=[
{"role": "user", "content": "Tell me a story about the Rust type system."}
],
diff --git a/examples/python/json_schema.py b/examples/python/json_schema.py
index ea84f9b983..415fe6f0f3 100644
--- a/examples/python/json_schema.py
+++ b/examples/python/json_schema.py
@@ -10,7 +10,7 @@
res = runner.send_chat_completion_request(
ChatCompletionRequest(
- model="ignore",
+ model="default",
messages=[{"role": "user", "content": "Give me a sample address."}],
max_tokens=256,
temperature=0.1,
diff --git a/examples/python/lark.py b/examples/python/lark.py
index 0c88ce82f7..2bf0f60853 100644
--- a/examples/python/lark.py
+++ b/examples/python/lark.py
@@ -35,7 +35,7 @@
res = runner.send_chat_completion_request(
ChatCompletionRequest(
- model="ignore",
+ model="default",
messages=[{"role": "user", "content": "Give me a sample address."}],
max_tokens=30,
temperature=0.1,
diff --git a/examples/python/lark_llg.py b/examples/python/lark_llg.py
index adc7118c40..12a3735be1 100644
--- a/examples/python/lark_llg.py
+++ b/examples/python/lark_llg.py
@@ -25,7 +25,7 @@
res = runner.send_chat_completion_request(
ChatCompletionRequest(
- model="ignore",
+ model="default",
messages=[
{
"role": "user",
diff --git a/examples/python/llama4.py b/examples/python/llama4.py
index f37c3c96eb..795f2c29f8 100644
--- a/examples/python/llama4.py
+++ b/examples/python/llama4.py
@@ -10,7 +10,7 @@
res = runner.send_chat_completion_request(
ChatCompletionRequest(
- model="ignore",
+ model="default",
messages=[
{
"role": "user",
diff --git a/examples/python/llama_vision.py b/examples/python/llama_vision.py
index 85dc563a19..8dbb71b492 100644
--- a/examples/python/llama_vision.py
+++ b/examples/python/llama_vision.py
@@ -12,7 +12,7 @@
res = runner.send_chat_completion_request(
ChatCompletionRequest(
- model="ignore",
+ model="default",
messages=[
{
"role": "user",
diff --git a/examples/python/llava_next.py b/examples/python/llava_next.py
index b3dd0044d7..0d466de689 100644
--- a/examples/python/llava_next.py
+++ b/examples/python/llava_next.py
@@ -9,7 +9,7 @@
res = runner.send_chat_completion_request(
ChatCompletionRequest(
- model="ignore",
+ model="default",
messages=[
{
"role": "user",
diff --git a/examples/python/llguidance.py b/examples/python/llguidance.py
index 337cbc2a9c..8d26344feb 100644
--- a/examples/python/llguidance.py
+++ b/examples/python/llguidance.py
@@ -32,7 +32,7 @@
res = runner.send_chat_completion_request(
ChatCompletionRequest(
- model="ignore",
+ model="default",
messages=[
{
"role": "user",
diff --git a/examples/python/lora_activation.py b/examples/python/lora_activation.py
index 79282edb24..7ef65c9d87 100644
--- a/examples/python/lora_activation.py
+++ b/examples/python/lora_activation.py
@@ -12,7 +12,7 @@
res = runner.send_chat_completion_request(
ChatCompletionRequest(
- model="ignore",
+ model="default",
messages=[
{"role": "user", "content": "Tell me a story about the Rust type system."}
],
diff --git a/examples/python/lora_zephyr.py b/examples/python/lora_zephyr.py
index bc82337d39..377b1d39ea 100644
--- a/examples/python/lora_zephyr.py
+++ b/examples/python/lora_zephyr.py
@@ -15,7 +15,7 @@
res = runner.send_chat_completion_request(
ChatCompletionRequest(
- model="ignore",
+ model="default",
messages=[
{"role": "user", "content": "Tell me a story about the Rust type system."}
],
diff --git a/examples/python/mcp_client.py b/examples/python/mcp_client.py
index a375e9a327..ed0cd2e08c 100644
--- a/examples/python/mcp_client.py
+++ b/examples/python/mcp_client.py
@@ -95,7 +95,7 @@ async def main():
# Create a conversation that demonstrates MCP tool usage
request = mistralrs.ChatCompletionRequest(
- model="ignore",
+ model="default",
messages=[
{
"role": "system",
diff --git a/examples/python/minicpmo_2_6.py b/examples/python/minicpmo_2_6.py
index f346cdc1ce..e966adff46 100644
--- a/examples/python/minicpmo_2_6.py
+++ b/examples/python/minicpmo_2_6.py
@@ -11,7 +11,7 @@
res = runner.send_chat_completion_request(
ChatCompletionRequest(
- model="ignore",
+ model="default",
messages=[
{
"role": "user",
diff --git a/examples/python/mistral3.py b/examples/python/mistral3.py
index 4f3c4a875d..374f17bd2a 100644
--- a/examples/python/mistral3.py
+++ b/examples/python/mistral3.py
@@ -10,7 +10,7 @@
res = runner.send_chat_completion_request(
ChatCompletionRequest(
- model="ignore",
+ model="default",
messages=[
{
"role": "user",
diff --git a/examples/python/mixture_of_quant_experts.py b/examples/python/mixture_of_quant_experts.py
index 64cd1853d7..5203f1aae7 100644
--- a/examples/python/mixture_of_quant_experts.py
+++ b/examples/python/mixture_of_quant_experts.py
@@ -11,7 +11,7 @@
res = runner.send_chat_completion_request(
ChatCompletionRequest(
- model="ignore",
+ model="default",
messages=[
{"role": "user", "content": "Tell me a story about the Rust type system."}
],
diff --git a/examples/python/paged_attention.py b/examples/python/paged_attention.py
index 536a18ec6a..13acb0a2e1 100644
--- a/examples/python/paged_attention.py
+++ b/examples/python/paged_attention.py
@@ -11,7 +11,7 @@
res = runner.send_chat_completion_request(
ChatCompletionRequest(
- model="ignore",
+ model="default",
messages=[
{"role": "user", "content": "Tell me a story about the Rust type system."}
],
diff --git a/examples/python/phi3v.py b/examples/python/phi3v.py
index 0e7cefd755..ce0f8f8164 100644
--- a/examples/python/phi3v.py
+++ b/examples/python/phi3v.py
@@ -9,7 +9,7 @@
res = runner.send_chat_completion_request(
ChatCompletionRequest(
- model="ignore",
+ model="default",
messages=[
{
"role": "user",
diff --git a/examples/python/phi3v_base64.py b/examples/python/phi3v_base64.py
index 0607de3c76..513063c017 100644
--- a/examples/python/phi3v_base64.py
+++ b/examples/python/phi3v_base64.py
@@ -14,7 +14,7 @@
res = runner.send_chat_completion_request(
ChatCompletionRequest(
- model="ignore",
+ model="default",
messages=[
{
"role": "user",
diff --git a/examples/python/phi3v_local_img.py b/examples/python/phi3v_local_img.py
index 776f8f5920..cfd9b7e01b 100644
--- a/examples/python/phi3v_local_img.py
+++ b/examples/python/phi3v_local_img.py
@@ -11,7 +11,7 @@
res = runner.send_chat_completion_request(
ChatCompletionRequest(
- model="ignore",
+ model="default",
messages=[
{
"role": "user",
diff --git a/examples/python/phi4mm.py b/examples/python/phi4mm.py
index 92c26478c9..b4445e2a11 100644
--- a/examples/python/phi4mm.py
+++ b/examples/python/phi4mm.py
@@ -9,7 +9,7 @@
res = runner.send_chat_completion_request(
ChatCompletionRequest(
- model="ignore",
+ model="default",
messages=[
{
"role": "user",
diff --git a/examples/python/phi4mm_audio.py b/examples/python/phi4mm_audio.py
index 1963823f17..c45b91788b 100644
--- a/examples/python/phi4mm_audio.py
+++ b/examples/python/phi4mm_audio.py
@@ -16,7 +16,7 @@
response = runner.send_chat_completion_request(
ChatCompletionRequest(
- model="ignore",
+ model="default",
messages=[
{
"role": "user",
diff --git a/examples/python/plain.py b/examples/python/plain.py
index 978dc76fa3..142535e8f7 100644
--- a/examples/python/plain.py
+++ b/examples/python/plain.py
@@ -9,7 +9,7 @@
res = runner.send_chat_completion_request(
ChatCompletionRequest(
- model="ignore",
+ model="default",
messages=[
{"role": "user", "content": "Tell me a story about the Rust type system."}
],
diff --git a/examples/python/pydantic_schema.py b/examples/python/pydantic_schema.py
index 12305b19af..9219ca5f23 100644
--- a/examples/python/pydantic_schema.py
+++ b/examples/python/pydantic_schema.py
@@ -38,7 +38,7 @@ class Fleet(BaseModel):
res = runner.send_chat_completion_request(
ChatCompletionRequest(
- model="ignore",
+ model="default",
messages=[{"role": "user", "content": "Give me a sample address."}],
max_tokens=256,
temperature=0.1,
diff --git a/examples/python/qwen2vl.py b/examples/python/qwen2vl.py
index 515f28ccdd..269179bd6f 100644
--- a/examples/python/qwen2vl.py
+++ b/examples/python/qwen2vl.py
@@ -11,7 +11,7 @@
res = runner.send_chat_completion_request(
ChatCompletionRequest(
- model="ignore",
+ model="default",
messages=[
{
"role": "user",
diff --git a/examples/python/qwen3.py b/examples/python/qwen3.py
index f368bd2e6d..9e6e8834aa 100644
--- a/examples/python/qwen3.py
+++ b/examples/python/qwen3.py
@@ -30,7 +30,7 @@
# ------------------------------------------------------------------
completion = runner.send_chat_completion_request(
ChatCompletionRequest(
- model="ignore",
+ model="default",
messages=messages,
max_tokens=1024,
frequency_penalty=1.0,
@@ -55,7 +55,7 @@
# ------------------------------------------------------------------
completion = runner.send_chat_completion_request(
ChatCompletionRequest(
- model="ignore",
+ model="default",
messages=messages,
max_tokens=1024,
frequency_penalty=1.0,
@@ -82,7 +82,7 @@
# ------------------------------------------------------------------
completion = runner.send_chat_completion_request(
ChatCompletionRequest(
- model="ignore",
+ model="default",
messages=messages,
max_tokens=1024,
frequency_penalty=1.0,
diff --git a/examples/python/regex.py b/examples/python/regex.py
index abe1cfed22..2efa2748d6 100644
--- a/examples/python/regex.py
+++ b/examples/python/regex.py
@@ -10,7 +10,7 @@
res = runner.send_chat_completion_request(
ChatCompletionRequest(
- model="ignore",
+ model="default",
messages=[{"role": "user", "content": "Tell me a short joke."}],
max_tokens=30,
temperature=0.1,
diff --git a/examples/python/smollm3.py b/examples/python/smollm3.py
new file mode 100644
index 0000000000..557317c2f5
--- /dev/null
+++ b/examples/python/smollm3.py
@@ -0,0 +1,28 @@
+#!/usr/bin/env python
+"""
+Example of using SmolLM3 model with mistral.rs
+"""
+
+from mistralrs import Runner, Which, ChatCompletionRequest, Architecture
+
+# Create a SmolLM3 model runner
+runner = Runner(
+ which=Which.Plain(
+ model_id="HuggingFaceTB/SmolLM3-3B", # You can use any SmolLM3 model from HuggingFace
+ arch=Architecture.SmolLm3,
+ ),
+)
+
+# Send a chat completion request
+res = runner.send_chat_completion_request(
+ ChatCompletionRequest(
+        model="default",
+ messages=[{"role": "user", "content": "What is the capital of France?"}],
+ max_tokens=256,
+ temperature=0.7,
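+        # enable_thinking=False,  # optional: disable SmolLM3's reasoning mode (enabled by default), see docs/SMOLLM3.md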
+ )
+)
+
+# Print the response
+print(res.choices[0].message.content)
+print(f"\nUsage: {res.usage}")
diff --git a/examples/python/smolvlm.py b/examples/python/smolvlm.py
index 5b0478eef1..eda21aecf2 100644
--- a/examples/python/smolvlm.py
+++ b/examples/python/smolvlm.py
@@ -9,7 +9,7 @@
res = runner.send_chat_completion_request(
ChatCompletionRequest(
- model="ignore",
+ model="default",
messages=[
{
"role": "user",
diff --git a/examples/python/speculative.py b/examples/python/speculative.py
index b6e2a614bb..19b6c2e9a9 100644
--- a/examples/python/speculative.py
+++ b/examples/python/speculative.py
@@ -15,7 +15,7 @@
res = runner.send_chat_completion_request(
ChatCompletionRequest(
- model="ignore",
+ model="default",
messages=[
{"role": "user", "content": "Tell me a story about the Rust type system."}
],
diff --git a/examples/python/speculative_xlora.py b/examples/python/speculative_xlora.py
index 90b9e4e4fc..527b98729c 100644
--- a/examples/python/speculative_xlora.py
+++ b/examples/python/speculative_xlora.py
@@ -19,7 +19,7 @@
res = runner.send_chat_completion_request(
ChatCompletionRequest(
- model="ignore",
+ model="default",
messages=[
{"role": "user", "content": "Tell me a story about the Rust type system."}
],
diff --git a/examples/python/streaming.py b/examples/python/streaming.py
index 931a4b64f4..eb81277c83 100644
--- a/examples/python/streaming.py
+++ b/examples/python/streaming.py
@@ -10,7 +10,7 @@
res = runner.send_chat_completion_request(
ChatCompletionRequest(
- model="ignore",
+ model="default",
messages=[
{"role": "user", "content": "Tell me a story about the Rust type system."}
],
diff --git a/examples/python/text_auto_device_map.py b/examples/python/text_auto_device_map.py
index 872e1181ab..672a92fc91 100644
--- a/examples/python/text_auto_device_map.py
+++ b/examples/python/text_auto_device_map.py
@@ -9,7 +9,7 @@
res = runner.send_chat_completion_request(
ChatCompletionRequest(
- model="ignore",
+ model="default",
messages=[
{"role": "user", "content": "Tell me a story about the Rust type system."}
],
diff --git a/examples/python/token_source.py b/examples/python/token_source.py
index fcf2ebe671..b719886ed2 100644
--- a/examples/python/token_source.py
+++ b/examples/python/token_source.py
@@ -11,7 +11,7 @@
res = runner.send_chat_completion_request(
ChatCompletionRequest(
- model="ignore",
+ model="default",
messages=[
{"role": "user", "content": "Tell me a story about the Rust type system."}
],
diff --git a/examples/python/tool_call.py b/examples/python/tool_call.py
index 4da812e088..291867dc9f 100644
--- a/examples/python/tool_call.py
+++ b/examples/python/tool_call.py
@@ -71,7 +71,7 @@ def run_python(code: str) -> str:
res = runner.send_chat_completion_request(
ChatCompletionRequest(
- model="ignore",
+ model="default",
messages=messages,
max_tokens=256,
presence_penalty=1.0,
@@ -102,7 +102,7 @@ def run_python(code: str) -> str:
res = runner.send_chat_completion_request(
ChatCompletionRequest(
- model="ignore",
+ model="default",
messages=messages,
max_tokens=256,
presence_penalty=1.0,
diff --git a/examples/python/topology.py b/examples/python/topology.py
index 4dd6b94c78..0b3e23ca75 100644
--- a/examples/python/topology.py
+++ b/examples/python/topology.py
@@ -11,7 +11,7 @@
res = runner.send_chat_completion_request(
ChatCompletionRequest(
- model="ignore",
+ model="default",
messages=[
{"role": "user", "content": "Tell me a story about the Rust type system."}
],
diff --git a/examples/python/vision_auto_device_map.py b/examples/python/vision_auto_device_map.py
index 5f469d4e14..9374e222ef 100644
--- a/examples/python/vision_auto_device_map.py
+++ b/examples/python/vision_auto_device_map.py
@@ -21,7 +21,7 @@
res = runner.send_chat_completion_request(
ChatCompletionRequest(
- model="ignore",
+ model="default",
messages=[
{
"role": "user",
diff --git a/examples/python/web_search.py b/examples/python/web_search.py
index 62986b5d85..89de7efdfc 100644
--- a/examples/python/web_search.py
+++ b/examples/python/web_search.py
@@ -16,7 +16,7 @@
res = runner.send_chat_completion_request(
ChatCompletionRequest(
- model="ignore",
+ model="default",
messages=[
{
"role": "user",
diff --git a/examples/python/xlora_gemma.py b/examples/python/xlora_gemma.py
index c7567b7b12..097737712f 100644
--- a/examples/python/xlora_gemma.py
+++ b/examples/python/xlora_gemma.py
@@ -12,7 +12,7 @@
res = runner.send_chat_completion_request(
ChatCompletionRequest(
- model="ignore",
+ model="default",
messages=[
{"role": "user", "content": "Tell me a story about the Rust type system."}
],
diff --git a/examples/python/xlora_zephyr.py b/examples/python/xlora_zephyr.py
index 7e1e3dc825..baa83a9bf8 100644
--- a/examples/python/xlora_zephyr.py
+++ b/examples/python/xlora_zephyr.py
@@ -13,7 +13,7 @@
res = runner.send_chat_completion_request(
ChatCompletionRequest(
- model="ignore",
+ model="default",
messages=[
{"role": "user", "content": "Tell me a story about the Rust type system."}
],
diff --git a/examples/server/adapter_chat.py b/examples/server/adapter_chat.py
index 4f165e3b81..50ca54b73c 100644
--- a/examples/server/adapter_chat.py
+++ b/examples/server/adapter_chat.py
@@ -46,7 +46,7 @@ def log_response(response: httpx.Response):
adapter = input("Active adapter >>> ")
messages.append({"role": "user", "content": prompt})
completion = client.chat.completions.create(
- model="ignore",
+ model="default",
messages=messages,
max_tokens=256,
frequency_penalty=1.0,
diff --git a/examples/server/completion.py b/examples/server/completion.py
index 803810476e..7118f7d552 100644
--- a/examples/server/completion.py
+++ b/examples/server/completion.py
@@ -38,7 +38,7 @@ def log_response(response: httpx.Response):
while True:
prompt = input(">>> ")
completion = client.completions.create(
- model="ignore",
+ model="default",
prompt=prompt,
max_tokens=256,
frequency_penalty=1.0,
diff --git a/examples/server/dia.py b/examples/server/dia.py
index b44cb0103a..0ec520713f 100644
--- a/examples/server/dia.py
+++ b/examples/server/dia.py
@@ -7,7 +7,7 @@
text_to_speak = "[S1] mistral r s is a local LLM inference engine. [S2] You can run text and vision models, and also image generation and speech generation. [S1] There is agentic web search, tool calling, and a convenient Python API. [S2] Check it out on github."
response = client.audio.speech.create(
- model="ignore", voice="N/A", input=text_to_speak, response_format="wav"
+ model="default", voice="N/A", input=text_to_speak, response_format="wav"
)
output_path = Path("output.wav")
diff --git a/examples/server/flux.py b/examples/server/flux.py
index 57242df3ec..0bfc16a532 100644
--- a/examples/server/flux.py
+++ b/examples/server/flux.py
@@ -3,7 +3,7 @@
client = OpenAI(api_key="foobar", base_url="http://localhost:1234/v1/")
result = client.images.generate(
- model="ignore",
+ model="default",
prompt="A vibrant sunset in the mountains, 4k, high quality.",
n=1,
)
diff --git a/examples/server/gemma3.py b/examples/server/gemma3.py
index 3963b88f58..0b55cc0ae8 100644
--- a/examples/server/gemma3.py
+++ b/examples/server/gemma3.py
@@ -36,7 +36,7 @@ def log_response(response: httpx.Response):
# )
completion = client.chat.completions.create(
- model="ignore",
+ model="default",
messages=[
{
"role": "user",
diff --git a/examples/server/idefics2.py b/examples/server/idefics2.py
index 28f59cc2ec..3662994985 100644
--- a/examples/server/idefics2.py
+++ b/examples/server/idefics2.py
@@ -36,7 +36,7 @@ def log_response(response: httpx.Response):
# )
completion = client.chat.completions.create(
- model="ignore",
+ model="default",
messages=[
{
"role": "user",
diff --git a/examples/server/idefics3.py b/examples/server/idefics3.py
index 6a9aab56c7..a0a5e97be7 100644
--- a/examples/server/idefics3.py
+++ b/examples/server/idefics3.py
@@ -36,7 +36,7 @@ def log_response(response: httpx.Response):
# )
completion = client.chat.completions.create(
- model="ignore",
+ model="default",
messages=[
{
"role": "user",
diff --git a/examples/server/json_schema.py b/examples/server/json_schema.py
index 17d6ceb9ca..80794524a3 100644
--- a/examples/server/json_schema.py
+++ b/examples/server/json_schema.py
@@ -15,7 +15,7 @@
}
completion = client.chat.completions.create(
- model="ignore",
+ model="default",
messages=[
{
"role": "user",
diff --git a/examples/server/lark.py b/examples/server/lark.py
index e141632fa9..1b18e41cb5 100644
--- a/examples/server/lark.py
+++ b/examples/server/lark.py
@@ -47,7 +47,7 @@
"""
completion = client.chat.completions.create(
- model="ignore",
+ model="default",
messages=[
{
"role": "user",
diff --git a/examples/server/llama4.py b/examples/server/llama4.py
index 1d03307e8a..b01ba70b5a 100644
--- a/examples/server/llama4.py
+++ b/examples/server/llama4.py
@@ -36,7 +36,7 @@ def log_response(response: httpx.Response):
# )
completion = client.chat.completions.create(
- model="ignore",
+ model="default",
messages=[
{
"role": "user",
diff --git a/examples/server/llama_vision.py b/examples/server/llama_vision.py
index 86fbe86516..1047293dbc 100644
--- a/examples/server/llama_vision.py
+++ b/examples/server/llama_vision.py
@@ -36,7 +36,7 @@ def log_response(response: httpx.Response):
# )
completion = client.chat.completions.create(
- model="ignore",
+ model="default",
messages=[
{
"role": "user",
diff --git a/examples/server/llava.py b/examples/server/llava.py
index 3730e9c05a..598cdcb17f 100644
--- a/examples/server/llava.py
+++ b/examples/server/llava.py
@@ -36,7 +36,7 @@ def log_response(response: httpx.Response):
# )
completion = client.chat.completions.create(
- model="ignore",
+ model="default",
messages=[
{
"role": "user",
diff --git a/examples/server/llava_next.py b/examples/server/llava_next.py
index 3730e9c05a..598cdcb17f 100644
--- a/examples/server/llava_next.py
+++ b/examples/server/llava_next.py
@@ -36,7 +36,7 @@ def log_response(response: httpx.Response):
# )
completion = client.chat.completions.create(
- model="ignore",
+ model="default",
messages=[
{
"role": "user",
diff --git a/examples/server/llguidance.py b/examples/server/llguidance.py
index 065925a5fc..90b0667ad4 100644
--- a/examples/server/llguidance.py
+++ b/examples/server/llguidance.py
@@ -22,7 +22,7 @@
]
completion = client.chat.completions.create(
- model="ignore",
+ model="default",
messages=[
{
"role": "user",
diff --git a/examples/server/mcp_chat.py b/examples/server/mcp_chat.py
index f2c7915bec..1c99a51d51 100644
--- a/examples/server/mcp_chat.py
+++ b/examples/server/mcp_chat.py
@@ -70,7 +70,7 @@ def main():
try:
# Send the chat completion request
response = client.chat.completions.create(
- model="ignore", # This will be handled by the configured model
+ model="default", # This will be handled by the configured model
messages=messages,
max_tokens=1000,
temperature=0.1,
diff --git a/examples/server/minicpmo_2_6.py b/examples/server/minicpmo_2_6.py
index 542e29dace..ed1e5b27fa 100644
--- a/examples/server/minicpmo_2_6.py
+++ b/examples/server/minicpmo_2_6.py
@@ -36,7 +36,7 @@ def log_response(response: httpx.Response):
# )
completion = client.chat.completions.create(
- model="ignore",
+ model="default",
messages=[
{
"role": "user",
diff --git a/examples/server/openai_response_format.py b/examples/server/openai_response_format.py
index 0cbec773ad..ef1f0aa4b8 100644
--- a/examples/server/openai_response_format.py
+++ b/examples/server/openai_response_format.py
@@ -32,7 +32,7 @@ class Fleet(BaseModel):
completion = client.beta.chat.completions.parse(
- model="ignore",
+ model="default",
messages=[
{
"role": "user",
diff --git a/examples/server/phi3v.py b/examples/server/phi3v.py
index 6e90ef9c6e..665f7ff51e 100644
--- a/examples/server/phi3v.py
+++ b/examples/server/phi3v.py
@@ -36,7 +36,7 @@ def log_response(response: httpx.Response):
# )
completion = client.chat.completions.create(
- model="ignore",
+ model="default",
messages=[
{
"role": "user",
diff --git a/examples/server/phi4mm.py b/examples/server/phi4mm.py
index 6e90ef9c6e..665f7ff51e 100644
--- a/examples/server/phi4mm.py
+++ b/examples/server/phi4mm.py
@@ -36,7 +36,7 @@ def log_response(response: httpx.Response):
# )
completion = client.chat.completions.create(
- model="ignore",
+ model="default",
messages=[
{
"role": "user",
diff --git a/examples/server/phi4mm_audio.py b/examples/server/phi4mm_audio.py
index 920c0c5f36..d2c50fc292 100644
--- a/examples/server/phi4mm_audio.py
+++ b/examples/server/phi4mm_audio.py
@@ -9,7 +9,7 @@
AUDIO_URL = "https://upload.wikimedia.org/wikipedia/commons/4/42/Bird_singing.ogg"
completion = client.chat.completions.create(
- model="ignore",
+ model="default",
messages=[
{
"role": "user",
diff --git a/examples/server/qwen2vl.py b/examples/server/qwen2vl.py
index 7c8c42bdab..4f774f51af 100644
--- a/examples/server/qwen2vl.py
+++ b/examples/server/qwen2vl.py
@@ -3,7 +3,7 @@
client = OpenAI(api_key="foobar", base_url="http://localhost:1234/v1/")
completion = client.chat.completions.create(
- model="ignore",
+ model="default",
messages=[
{
"role": "user",
diff --git a/examples/server/qwen3.py b/examples/server/qwen3.py
index 7e6f1f37cd..800bbc0ea6 100644
--- a/examples/server/qwen3.py
+++ b/examples/server/qwen3.py
@@ -46,7 +46,7 @@ def log_response(response: httpx.Response):
# First question, thinking mode is enabled by default
# ------------------------------------------------------------------
completion = client.chat.completions.create(
- model="ignore",
+ model="default",
messages=messages,
max_tokens=1024,
frequency_penalty=1.0,
@@ -69,15 +69,13 @@ def log_response(response: httpx.Response):
# Second question, disable thinking mode with extra body or /no_think
# ------------------------------------------------------------------
completion = client.chat.completions.create(
- model="ignore",
+ model="default",
messages=messages,
max_tokens=1024,
frequency_penalty=1.0,
top_p=0.1,
temperature=0,
- # extra_body={
- # "enable_thinking": False
- # }
+ # enable_thinking=False,
)
resp = completion.choices[0].message.content
print(resp)
@@ -96,15 +94,13 @@ def log_response(response: httpx.Response):
# Third question, reenable thinking mode with extra body or /think
# ------------------------------------------------------------------
completion = client.chat.completions.create(
- model="ignore",
+ model="default",
messages=messages,
max_tokens=1024,
frequency_penalty=1.0,
top_p=0.1,
temperature=0,
- # extra_body={
- # "enable_thinking": True
- # }
+            # enable_thinking=True,
)
resp = completion.choices[0].message.content
print(resp)
diff --git a/examples/server/regex.py b/examples/server/regex.py
index 325ad51d72..822c7391da 100644
--- a/examples/server/regex.py
+++ b/examples/server/regex.py
@@ -5,7 +5,7 @@
BULLET_LIST_REGEX = "(- [^\n]*\n)+(- [^\n]*)(\n\n)?"
completion = client.chat.completions.create(
- model="ignore",
+ model="default",
messages=[
{
"role": "user",
@@ -28,7 +28,7 @@
# "Sure!" we guarantee a space after "Sure!" but we haven't forced which token that starts with space should be used yet.
completion = client.chat.completions.create(
- model="ignore",
+ model="default",
messages=[
{
"role": "user",
diff --git a/examples/server/smollm3.py b/examples/server/smollm3.py
new file mode 100644
index 0000000000..ee1c655b6a
--- /dev/null
+++ b/examples/server/smollm3.py
@@ -0,0 +1,110 @@
+from openai import OpenAI
+import httpx
+import textwrap
+import json
+
+
+def log_response(response: httpx.Response):
+ request = response.request
+ print(f"Request: {request.method} {request.url}")
+ print(" Headers:")
+ for key, value in request.headers.items():
+ if key.lower() == "authorization":
+ value = "[...]"
+ if key.lower() == "cookie":
+ value = value.split("=")[0] + "=..."
+ print(f" {key}: {value}")
+ print(" Body:")
+ try:
+ request_body = json.loads(request.content)
+ print(textwrap.indent(json.dumps(request_body, indent=2), " "))
+ except json.JSONDecodeError:
+ print(textwrap.indent(request.content.decode(), " "))
+ print(f"Response: status_code={response.status_code}")
+ print(" Headers:")
+ for key, value in response.headers.items():
+ if key.lower() == "set-cookie":
+ value = value.split("=")[0] + "=..."
+ print(f" {key}: {value}")
+
+
+client = OpenAI(api_key="foobar", base_url="http://localhost:1234/v1/")
+
+# Enable this to log requests and responses
+# client._client = httpx.Client(
+# event_hooks={"request": [print], "response": [log_response]}
+# )
+
+messages = [
+ {
+ "role": "user",
+ "content": "Hello! How many rs in strawberry?",
+ },
+]
+
+# ------------------------------------------------------------------
+# First question, thinking mode is enabled by default
+# ------------------------------------------------------------------
+completion = client.chat.completions.create(
+ model="default",
+ messages=messages,
+ max_tokens=1024,
+ frequency_penalty=1.0,
+ top_p=0.1,
+ temperature=0,
+)
+resp = completion.choices[0].message.content
+print(resp)
+
+messages.append({"role": "assistant", "content": completion.choices[0].message.content})
+
+messages = [
+ {
+ "role": "user",
+ "content": "How many rs in blueberry? /no_think",
+ },
+]
+
+# ------------------------------------------------------------------
+# Second question, disable thinking mode with extra body or /no_think
+# ------------------------------------------------------------------
+completion = client.chat.completions.create(
+ model="default",
+ messages=messages,
+ max_tokens=1024,
+ frequency_penalty=1.0,
+ top_p=0.1,
+ temperature=0,
+ # extra_body={
+ # "enable_thinking": False
+ # }
+)
+resp = completion.choices[0].message.content
+print(resp)
+
+
+messages.append({"role": "assistant", "content": completion.choices[0].message.content})
+
+messages = [
+ {
+ "role": "user",
+ "content": "Are you sure? /think",
+ },
+]
+
+# ------------------------------------------------------------------
+# Third question, re-enable thinking mode with extra body or /think
+# ------------------------------------------------------------------
+completion = client.chat.completions.create(
+ model="default",
+ messages=messages,
+ max_tokens=1024,
+ frequency_penalty=1.0,
+ top_p=0.1,
+ temperature=0,
+ # extra_body={
+ # "enable_thinking": True
+ # }
+)
+resp = completion.choices[0].message.content
+print(resp)
diff --git a/examples/server/stream_completion_bench.py b/examples/server/stream_completion_bench.py
index ad015d3e47..fade07d139 100644
--- a/examples/server/stream_completion_bench.py
+++ b/examples/server/stream_completion_bench.py
@@ -9,7 +9,7 @@
def request(stream: bool):
client = openai.Client(api_key="foobar", base_url=ENDPOINT)
return client.chat.completions.create(
- model="ignore",
+ model="default",
messages=[
{
"role": "user",
diff --git a/examples/server/streaming.py b/examples/server/streaming.py
index c9ef1ab468..f5c9f39d5b 100644
--- a/examples/server/streaming.py
+++ b/examples/server/streaming.py
@@ -14,7 +14,7 @@
messages.append({"role": "user", "content": prompt})
resp = ""
response = client.chat.completions.create(
- model="ignore",
+ model="default",
messages=messages,
max_tokens=256,
stream=True,
diff --git a/examples/server/streaming_completion.py b/examples/server/streaming_completion.py
index a99ee88352..6dc6ec5464 100644
--- a/examples/server/streaming_completion.py
+++ b/examples/server/streaming_completion.py
@@ -5,7 +5,7 @@
response = client.completions.create(
- model="ignore",
+ model="default",
prompt="My favorite theorem is",
max_tokens=32,
stream=True,
diff --git a/examples/server/tool_calling.py b/examples/server/tool_calling.py
index 312640cb3c..3dd53ada7c 100644
--- a/examples/server/tool_calling.py
+++ b/examples/server/tool_calling.py
@@ -83,7 +83,7 @@ def run_python(code: str) -> str:
]
completion = client.chat.completions.create(
- model="ignore",
+ model="default",
messages=messages,
tools=tools,
tool_choice="auto",
@@ -109,7 +109,7 @@ def run_python(code: str) -> str:
messages.append({"role": "tool", "content": result})
completion = client.chat.completions.create(
- model="ignore", messages=messages, tools=tools, tool_choice="auto"
+ model="default", messages=messages, tools=tools, tool_choice="auto"
)
# print(completion.usage)
print(completion.choices[0].message.content)
diff --git a/examples/server/web_search.py b/examples/server/web_search.py
index b76830ae84..2cae8a4ef3 100644
--- a/examples/server/web_search.py
+++ b/examples/server/web_search.py
@@ -10,7 +10,7 @@
]
completion = client.chat.completions.create(
- model="ignore",
+ model="default",
messages=messages,
tool_choice="auto",
max_tokens=1024,
diff --git a/mistralrs-core/src/layers.rs b/mistralrs-core/src/layers.rs
index 4cbe92c3ec..16e6c1b7e3 100644
--- a/mistralrs-core/src/layers.rs
+++ b/mistralrs-core/src/layers.rs
@@ -24,7 +24,7 @@ pub use crate::layers_utils::repeat_kv;
use crate::{
amoe::{AnyMoeTrainableLayer, MlpLayer},
gguf::Content,
- models::llama,
+ models::{llama, smollm3},
ops::SplitOp,
vision_models::{
gemma3::config::Gemma3TextConfig,
@@ -970,6 +970,139 @@ impl Llama3RotaryEmbedding {
}
}
+/// RoPE for SmolLm3
+#[derive(Debug, Clone)]
+pub struct SmolLm3RotaryEmbedding(RotaryEmbedding);
+
+#[derive(Debug, Clone, Deserialize, Serialize, Default)]
+pub enum SmolLm3RopeType {
+ #[serde(rename = "llama3")]
+ Llama3,
+ #[serde(rename = "linear")]
+ Linear,
+ #[default]
+ #[serde(rename = "default")]
+ Default,
+}
+
+#[derive(Debug, Clone, Deserialize, Serialize, Default)]
+pub struct SmolLm3RopeConfig {
+ pub factor: f32,
+ pub low_freq_factor: Option<f32>,
+ pub high_freq_factor: Option<f32>,
+ pub original_max_position_embeddings: Option<usize>,
+ pub rope_type: SmolLm3RopeType,
+}
+
+fn calculate_default_inv_freq_smollm3(cfg: &smollm3::Config) -> Vec<f32> {
+ let head_dim = cfg.hidden_size / cfg.num_attention_heads;
+ (0..head_dim)
+ .step_by(2)
+ .map(|i| 1f32 / cfg.rope_theta.powf(i as f32 / head_dim as f32))
+ .collect()
+}
+
+impl SmolLm3RotaryEmbedding {
+ pub fn new_llama3(
+ dtype: DType,
+ cfg: &smollm3::Config,
+ dev: &Device,
+ is_gpt_neox: bool,
+ ) -> Result<Self> {
+ match &cfg.rope_scaling {
+ None
+ | Some(SmolLm3RopeConfig {
+ rope_type: SmolLm3RopeType::Default,
+ ..
+ }) => Ok(Self(RotaryEmbedding::new(
+ cfg.rope_theta,
+ cfg.hidden_size / cfg.num_attention_heads,
+ cfg.max_position_embeddings,
+ dev,
+ is_gpt_neox,
+ dtype,
+ )?)),
+ Some(SmolLm3RopeConfig {
+ rope_type: SmolLm3RopeType::Llama3,
+ factor,
+ low_freq_factor,
+ high_freq_factor,
+ original_max_position_embeddings,
+ }) => {
+ let low_freq_factor = low_freq_factor.context("low_freq_factor is required")?;
+ let high_freq_factor = high_freq_factor.context("high_freq_factor is required")?;
+ let original_max_position_embeddings = original_max_position_embeddings
+ .context("original_max_position_embeddings is required")?;
+
+ let low_freq_wavelen = original_max_position_embeddings as f32 / low_freq_factor;
+ let high_freq_wavelen = original_max_position_embeddings as f32 / high_freq_factor;
+
+ let inv_freq = calculate_default_inv_freq_smollm3(cfg)
+ .into_iter()
+ .map(|freq| {
+ let wavelen = 2. * PI / freq;
+ if wavelen < high_freq_wavelen {
+ freq
+ } else if wavelen > low_freq_wavelen {
+ freq / *factor
+ } else {
+ let smooth = (original_max_position_embeddings as f32 / wavelen
+ - low_freq_factor)
+ / (high_freq_factor - low_freq_factor);
+ (1. - smooth) * freq / *factor + smooth * freq
+ }
+ })
+ .collect::<Vec<_>>();
+ let inv_freq_len = inv_freq.len();
+ let inv_freq = Tensor::from_vec(inv_freq, (1, inv_freq_len), dev)?;
+ let t = Tensor::arange(0u32, cfg.max_position_embeddings as u32, dev)?
+ .to_dtype(DType::F32)?
+ .reshape((cfg.max_position_embeddings, 1))?;
+ let freqs = t.matmul(&inv_freq)?;
+ let sin = freqs.sin()?.to_dtype(dtype)?;
+ let cos = freqs.cos()?.to_dtype(dtype)?;
+ Ok(Self(RotaryEmbedding {
+ sin,
+ cos,
+ is_gpt_neox,
+ }))
+ }
+ Some(SmolLm3RopeConfig {
+ rope_type: SmolLm3RopeType::Linear,
+ factor,
+ ..
+ }) => {
+ let inv_freq_vec = calculate_default_inv_freq_smollm3(cfg)
+ .into_iter()
+ .map(|freq| freq / *factor)
+ .collect::<Vec<_>>();
+ let inv_freq_len = inv_freq_vec.len();
+ let inv_freq = Tensor::from_vec(inv_freq_vec, (1, inv_freq_len), dev)?;
+ let t = Tensor::arange(0u32, cfg.max_position_embeddings as u32, dev)?
+ .to_dtype(DType::F32)?
+ .reshape((cfg.max_position_embeddings, 1))?;
+ let freqs = t.matmul(&inv_freq)?;
+ let sin = freqs.sin()?.to_dtype(dtype)?;
+ let cos = freqs.cos()?.to_dtype(dtype)?;
+ Ok(Self(RotaryEmbedding {
+ sin,
+ cos,
+ is_gpt_neox,
+ }))
+ }
+ }
+ }
+
+ pub fn forward(
+ &self,
+ q: &Tensor,
+ k: &Tensor,
+ seqlen_offsets: &[usize],
+ ) -> Result<(Tensor, Tensor)> {
+ self.0.forward(q, k, seqlen_offsets)
+ }
+}
+
// https://github.com/huggingface/transformers/blob/f2c388e3f946862f657acc1e21b272ec946fc66c/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py#L107
#[derive(Debug, Clone)]
pub struct Qwen2VLRotaryEmbedding {
diff --git a/mistralrs-core/src/models/mod.rs b/mistralrs-core/src/models/mod.rs
index bd8ecad1dd..51bc372af9 100644
--- a/mistralrs-core/src/models/mod.rs
+++ b/mistralrs-core/src/models/mod.rs
@@ -18,4 +18,5 @@ pub(crate) mod quantized_starcoder2;
pub(crate) mod qwen2;
pub(crate) mod qwen3;
pub(crate) mod qwen3_moe;
+pub(crate) mod smollm3;
pub(crate) mod starcoder2;
diff --git a/mistralrs-core/src/models/smollm3.rs b/mistralrs-core/src/models/smollm3.rs
new file mode 100644
index 0000000000..c9877bf46e
--- /dev/null
+++ b/mistralrs-core/src/models/smollm3.rs
@@ -0,0 +1,799 @@
+#![allow(clippy::cast_possible_truncation, clippy::cast_precision_loss)]
+
+use candle_core::{Device, Result, Tensor};
+use candle_nn::{Embedding, Module};
+use mistralrs_quant::{
+ ColumnParallelLayer, QuantMethod, QuantizedConfig, ReplicatedLayer, RowParallelLayer,
+ ShardedVarBuilder,
+};
+use serde::{Deserialize, Serialize};
+use std::{collections::HashMap, sync::Arc};
+
+use crate::{
+ amoe::{AnyMoeBaseModelMixin, AnyMoeConfig, AnyMoeExpertType, MlpLayer, MoeMlp},
+ attention::SdpaParams,
+ device_map::DeviceMapper,
+ get_delta_from_lora_ab,
+ layers::{
+ embedding, Activation, CausalMasker, MatMul, Mlp, RmsNorm, Sdpa, SmolLm3RopeConfig,
+ SmolLm3RotaryEmbedding,
+ },
+ layers_masker::PastKvLenCache,
+ paged_attention::{AttentionImplementation, ModelConfigMetadata, PagedAttention},
+ pipeline::{
+ extract_logits,
+ text_models_inputs_processor::{FlashParams, PagedAttentionInputMetadata},
+ EitherCache, IsqModel, KvCache, NormalCache, NormalLoadingMetadata, NormalModel,
+ },
+ serde_default_fn,
+ utils::{progress::NiceProgressBar, unvarbuilder::UnVarBuilder},
+};
+
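+// `tie_word_embeddings` defaults to `true`: SmolLM3 reuses the input embeddings as the LM head unless the config overrides it.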
+serde_default_fn!(bool, word_emb_default, true);
+
+#[derive(Debug, Clone, Deserialize, Serialize)]
+pub struct Config {
+ pub hidden_act: Activation,
+ pub hidden_size: usize,
+ pub intermediate_size: usize,
+ pub vocab_size: usize,
+ pub num_hidden_layers: usize,
+ pub num_attention_heads: usize,
+ pub num_key_value_heads: usize,
+ pub rms_norm_eps: f64,
+ pub rope_theta: f32,
+ pub max_position_embeddings: usize,
+ pub rope_scaling: Option<SmolLm3RopeConfig>,
+ pub quantization_config: Option<QuantizedConfig>,
+ #[serde(default = "word_emb_default")]
+ pub tie_word_embeddings: bool,
+ pub no_rope_layers: Option<Vec<usize>>,
+ pub no_rope_layer_interval: usize,
+}
+
+impl Config {
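+ /// Per-layer RoPE mask: `true` means the layer applies rotary embeddings.
+ /// When `no_rope_layers` is absent, every `no_rope_layer_interval`-th layer
+ /// is treated as a NoPE layer and skips RoPE.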
+ fn no_rope_layers(&self) -> Vec<bool> {
+ self.no_rope_layers
+ .as_ref()
+ .map(|x| x.iter().map(|&x| x != 0).collect::<Vec<_>>())
+ .clone()
+ .unwrap_or(
+ (0..self.num_hidden_layers)
+ .map(|i| (i + 1) % self.no_rope_layer_interval != 0)
+ .collect(),
+ )
+ }
+}
+
+struct CausalSelfAttention {
+ q_proj: Arc<dyn QuantMethod>,
+ k_proj: Arc<dyn QuantMethod>,
+ v_proj: Arc<dyn QuantMethod>,
+ o_proj: Arc<dyn QuantMethod>,
+ num_attention_heads: usize,
+ num_key_value_heads: usize,
+ head_dim: usize,
+ rotary_emb: Option<Arc<SmolLm3RotaryEmbedding>>,
+ max_seq_len: usize,
+ paged_attn: Option<PagedAttention>,
+ sdpa_params: SdpaParams,
+}
+
+impl CausalSelfAttention {
+ #[allow(clippy::too_many_arguments)]
+ fn forward(
+ &self,
+ x: &Tensor,
+ attention_mask: &Option<Tensor>,
+ seqlen_offsets: &[usize],
+ kv_cache: &mut KvCache,
+ metadata: Option<((Tensor, Tensor), &PagedAttentionInputMetadata)>,
+ flash_params: &FlashParams,
+ ) -> Result<Tensor> {
+ let (b_sz, seq_len, _) = x.dims3()?;
+
+ let original_dtype = x.dtype();
+ let mut x = x.clone();
+ if let Some(t) = self.q_proj.quantized_act_type() {
+ x = x.to_dtype(t)?;
+ }
+ let mut q = MatMul.qmethod_matmul(&x, &*self.q_proj)?;
+ let mut k = MatMul.qmethod_matmul(&x, &*self.k_proj)?;
+ let mut v = MatMul.qmethod_matmul(&x, &*self.v_proj)?;
+ if self.q_proj.quantized_act_type().is_some() {
+ q = q.to_dtype(original_dtype)?;
+ k = k.to_dtype(original_dtype)?;
+ v = v.to_dtype(original_dtype)?;
+ }
+
+ (q, k, v) = if seq_len != 1 {
+ let q = q
+ .reshape((b_sz, seq_len, self.num_attention_heads, self.head_dim))?
+ .transpose(1, 2)?;
+ let k = k
+ .reshape((b_sz, seq_len, self.num_key_value_heads, self.head_dim))?
+ .transpose(1, 2)?;
+ let v = v
+ .reshape((b_sz, seq_len, self.num_key_value_heads, self.head_dim))?
+ .transpose(1, 2)?;
+ (q, k, v)
+ } else {
+ let q = q.reshape((b_sz, self.num_attention_heads, seq_len, self.head_dim))?;
+ let k = k.reshape((b_sz, self.num_key_value_heads, seq_len, self.head_dim))?;
+ let v = v.reshape((b_sz, self.num_key_value_heads, seq_len, self.head_dim))?;
+ (q, k, v)
+ };
+
+ if let Some(rotary_emb) = &self.rotary_emb {
+ (q, k) = rotary_emb.forward(&q, &k, seqlen_offsets)?;
+ }
+
+ let mut y = match &self.paged_attn {
+ Some(paged_attn) => match metadata {
+ Some(((key_cache, value_cache), input_metadata)) => paged_attn.forward(
+ &q,
+ &k,
+ &v,
+ attention_mask.clone().as_ref(),
+ Some(key_cache),
+ Some(value_cache),
+ input_metadata,
+ &self.sdpa_params,
+ Some(flash_params),
+ )?,
+ None => {
+ // If we don't have metadata, we are most likely generating an imatrix so we don't want to populate that.
+ // Generating the dummy metadata with the assumption that we are not generating text (only processing prompts).
+ let input_metadata = PagedAttentionInputMetadata::dummy(q.device())?;
+ // Sanity check.
+ assert!(attention_mask.is_some());
+ paged_attn.forward(
+ &q,
+ &k,
+ &v,
+ attention_mask.clone().as_ref(),
+ None,
+ None,
+ &input_metadata,
+ &self.sdpa_params,
+ Some(flash_params),
+ )?
+ }
+ },
+ None => {
+ let (k, v) = kv_cache.append(&k, &v)?;
+
+ Sdpa.run_attention(
+ &q,
+ &k,
+ &v,
+ attention_mask.clone().as_ref(),
+ Some(flash_params),
+ &self.sdpa_params,
+ )?
+ }
+ };
+
+ if let Some(t) = self.q_proj.quantized_act_type() {
+ y = y.to_dtype(t)?;
+ }
+ y = if attention_mask.is_some() {
+ y.transpose(1, 2)?.reshape((b_sz, seq_len, ()))?
+ } else {
+ y.reshape((b_sz, seq_len, ()))?
+ };
+ let mut res = MatMul.qmethod_matmul(&y, &*self.o_proj)?;
+ if self.q_proj.quantized_act_type().is_some() {
+ res = res.to_dtype(original_dtype)?;
+ }
+ Ok(res)
+ }
+
+ fn load(
+ vb: ShardedVarBuilder,
+ cfg: &Config,
+ rope: Option<Arc<SmolLm3RotaryEmbedding>>,
+ paged_attn: Option<PagedAttention>,
+ comm: &Arc<mistralrs_quant::Comm>,
+ ) -> Result<Self> {
+ let size_in = cfg.hidden_size;
+ let size_q = (cfg.hidden_size / cfg.num_attention_heads) * cfg.num_attention_heads;
+ let size_kv = (cfg.hidden_size / cfg.num_attention_heads) * cfg.num_key_value_heads;
+ let q_proj = ColumnParallelLayer::new(
+ size_in,
+ size_q,
+ &cfg.quantization_config,
+ false,
+ comm,
+ vb.pp("q_proj"),
+ )?;
+ let kv_shard = mistralrs_quant::compute_kv_shard(
+ cfg.num_key_value_heads,
+ cfg.hidden_size / cfg.num_attention_heads,
+ comm,
+ );
+ let k_proj = ColumnParallelLayer::new_with_shard(
+ size_in,
+ size_kv,
+ &cfg.quantization_config,
+ false,
+ comm,
+ kv_shard,
+ vb.pp("k_proj"),
+ )?;
+ let v_proj = ColumnParallelLayer::new_with_shard(
+ size_in,
+ size_kv,
+ &cfg.quantization_config,
+ false,
+ comm,
+ kv_shard,
+ vb.pp("v_proj"),
+ )?;
+ let o_proj = RowParallelLayer::new(
+ size_q,
+ size_in,
+ &cfg.quantization_config,
+ false,
+ comm,
+ vb.pp("o_proj"),
+ )?;
+ Ok(Self {
+ q_proj,
+ k_proj,
+ v_proj,
+ o_proj,
+ num_attention_heads: cfg.num_attention_heads / comm.world_size(),
+ num_key_value_heads: (cfg.num_key_value_heads / comm.world_size()).max(1),
+ head_dim: cfg.hidden_size / cfg.num_attention_heads,
+ rotary_emb: rope,
+ max_seq_len: cfg.max_position_embeddings,
+ paged_attn,
+ sdpa_params: SdpaParams {
+ n_kv_groups: mistralrs_quant::compute_n_kv_groups(
+ cfg.num_key_value_heads,
+ cfg.num_attention_heads,
+ comm,
+ ),
+ softcap: None,
+ softmax_scale: 1.0 / ((cfg.hidden_size / cfg.num_attention_heads) as f32).sqrt(),
+ sliding_window: None,
+ },
+ })
+ }
+}
+
+struct Block {
+ rms_1: RmsNorm,
+ attn: CausalSelfAttention,
+ rms_2: RmsNorm,
+ mlp: Box<dyn MlpLayer>,
+}
+
+impl Block {
+ #[allow(clippy::too_many_arguments)]
+ fn forward(
+ &self,
+ x: &Tensor,
+ attention_mask: &Option<Tensor>,
+ seqlen_offsets: &[usize],
+ kv_cache: &mut KvCache,
+ metadata: Option<((Tensor, Tensor), &PagedAttentionInputMetadata)>,
+ flash_params: &FlashParams,
+ ) -> Result<Tensor> {
+ let residual = x;
+ let x = self.rms_1.forward(x)?;
+ let x = (self.attn.forward(
+ &x,
+ attention_mask,
+ seqlen_offsets,
+ kv_cache,
+ metadata,
+ flash_params,
+ )? + residual)?;
+ let residual = &x;
+ let x = (self.mlp.forward(&self.rms_2.forward(&x)?)? + residual)?;
+ Ok(x)
+ }
+
+ #[allow(clippy::too_many_arguments)]
+ fn load(
+ vb: ShardedVarBuilder,
+ cfg: &Config,
+ mapper: &dyn DeviceMapper,
+ layer_idx: usize,
+ loading_isq: bool,
+ rope: Option<Arc<SmolLm3RotaryEmbedding>>,
+ paged_attn: Option<PagedAttention>,
+ comm: &Arc<mistralrs_quant::Comm>,
+ ) -> Result<Self> {
+ let attn = CausalSelfAttention::load(
+ mapper.set_device(layer_idx, vb.pp("self_attn"), loading_isq),
+ cfg,
+ rope,
+ paged_attn,
+ comm,
+ )?;
+ let mlp = Mlp::new(
+ mapper.set_device(layer_idx, vb.pp("mlp"), loading_isq),
+ cfg.hidden_size,
+ cfg.intermediate_size,
+ &cfg.quantization_config,
+ cfg.hidden_act,
+ comm,
+ )?;
+ let rms_1 = RmsNorm::new(
+ cfg.hidden_size,
+ cfg.rms_norm_eps,
+ mapper.set_device(layer_idx, vb.pp("input_layernorm"), false),
+ )?;
+ let rms_2 = RmsNorm::new(
+ cfg.hidden_size,
+ cfg.rms_norm_eps,
+ mapper.set_device(layer_idx, vb.pp("post_attention_layernorm"), false),
+ )?;
+ Ok(Self {
+ rms_1,
+ attn,
+ rms_2,
+ mlp: Box::new(mlp),
+ })
+ }
+}
+
+pub struct SmolLm3 {
+ wte: Embedding,
+ blocks: Vec<Block>,
+ ln_f: RmsNorm,
+ lm_head: Arc<dyn QuantMethod>,
+ kv_cache: crate::pipeline::EitherCache,
+ device: Device,
+ mapper: Box<dyn DeviceMapper + Send + Sync>,
+ cfg: ModelConfigMetadata,
+}
+
+impl SmolLm3 {
+ pub fn new(
+ cfg: &Config,
+ vb: ShardedVarBuilder,
+ is_gptx: bool,
+ normal_loading_metadata: NormalLoadingMetadata,
+ attention_mechanism: AttentionImplementation,
+ ) -> Result<Self> {
+ let vb_m = vb.pp("model");
+ let vb_lm_head = vb.pp("lm_head");
+ Self::new_inner(
+ cfg,
+ vb_m,
+ vb_lm_head,
+ is_gptx,
+ normal_loading_metadata,
+ attention_mechanism,
+ )
+ }
+
+ pub fn new_inner(
+ cfg: &Config,
+ vb_m: ShardedVarBuilder,
+ vb_lm_head: ShardedVarBuilder,
+ is_gptx: bool,
+ normal_loading_metadata: NormalLoadingMetadata,
+ attention_mechanism: AttentionImplementation,
+ ) -> Result<Self> {
+ if let Some(ref quant_cfg) = &cfg.quantization_config {
+ tracing::info!(
+ "Using {} quantization: {}.",
+ quant_cfg.name(),
+ quant_cfg.get_bits_name(&vb_m)
+ );
+ }
+ let mapper = normal_loading_metadata.mapper;
+
+ let wte = embedding(
+ cfg.vocab_size,
+ cfg.hidden_size,
+ mapper.set_nm_device(vb_m.pp("embed_tokens"), false),
+ &cfg.quantization_config,
+ )?;
+ let lm_head = if !cfg.tie_word_embeddings {
+ ReplicatedLayer::new(
+ cfg.hidden_size,
+ cfg.vocab_size,
+ &cfg.quantization_config,
+ false,
+ mapper.set_nm_device(vb_lm_head, normal_loading_metadata.loading_isq),
+ )?
+ } else {
+ ReplicatedLayer::from_linear(candle_nn::Linear::new(
+ mapper.cast_nm_device(wte.embeddings(), normal_loading_metadata.loading_isq)?,
+ None,
+ ))?
+ };
+ let ln_f = RmsNorm::new(
+ cfg.hidden_size,
+ cfg.rms_norm_eps,
+ mapper.set_nm_device(vb_m.pp("norm"), false),
+ )?;
+ let head_dim = cfg.hidden_size / cfg.num_attention_heads;
+ let mut ropes = HashMap::new();
+ for i in 0..cfg.num_hidden_layers {
+ let device = mapper
+ .device_for(i, false)
+ .unwrap_or(&normal_loading_metadata.real_device);
+ ropes.insert(
+ device.location(),
+ Arc::new(SmolLm3RotaryEmbedding::new_llama3(
+ vb_m.dtype(),
+ cfg,
+ device,
+ is_gptx,
+ )?),
+ );
+ }
+ let blocks: Vec<_> = NiceProgressBar::<_, 'b'>(
+ 0..cfg.num_hidden_layers,
+ "Loading repeating layers",
+ &normal_loading_metadata.multi_progress,
+ )
+ .par_iter_if_isq(|i| {
+ let device = mapper
+ .device_for(i, false)
+ .unwrap_or(&normal_loading_metadata.real_device);
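+ // SmolLM3 interleaves layers without rotary embeddings (NoPE): a `false` flag here skips RoPE for that layer.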
+ let use_rope = cfg.no_rope_layers()[i];
+ let rotary_emb = if use_rope {
+ Some(
+ ropes
+ .get(&device.location())
+ .expect("No RoPE for device location!")
+ .clone(),
+ )
+ } else {
+ None
+ };
+ let paged_attn = match &attention_mechanism {
+ AttentionImplementation::Eager => None,
+ AttentionImplementation::PagedAttention => {
+ Some(PagedAttention::new(head_dim, device, None)?)
+ }
+ };
+ let comm = mapper.get_comm_for(i)?;
+ Block::load(
+ vb_m.pp(format!("layers.{i}")),
+ cfg,
+ &*mapper,
+ i,
+ normal_loading_metadata.loading_isq,
+ rotary_emb,
+ paged_attn,
+ &comm,
+ )
+ })?;
+
+ Ok(Self {
+ wte,
+ blocks,
+ ln_f,
+ lm_head,
+ kv_cache: EitherCache::Normal(NormalCache::new(
+ cfg.num_hidden_layers,
+ cfg.max_position_embeddings,
+ )),
+ device: normal_loading_metadata.real_device,
+ cfg: ModelConfigMetadata {
+ max_seq_len: cfg.max_position_embeddings,
+ num_layers: cfg.num_hidden_layers,
+ hidden_size: cfg.hidden_size,
+ num_kv_heads: (cfg.num_key_value_heads / mapper.get_comm_for(0)?.world_size())
+ .max(1),
+ num_attn_heads: cfg.num_attention_heads / mapper.get_comm_for(0)?.world_size(),
+ sliding_window: None,
+ k_head_dim: cfg.hidden_size / cfg.num_attention_heads,
+ v_head_dim: cfg.hidden_size / cfg.num_attention_heads,
+ },
+ mapper,
+ })
+ }
+
+ pub fn forward(
+ &self,
+ input_ids: &Tensor,
+ seqlen_offsets: &[usize],
+ context_lens: Vec<(usize, usize)>,
+ metadata: Option<(Vec<(Tensor, Tensor)>, &PagedAttentionInputMetadata)>,
+ flash_params: &FlashParams,
+ ) -> Result<Tensor> {
+ self.forward_embeds(
+ input_ids,
+ self.wte.forward(input_ids)?,
+ seqlen_offsets,
+ context_lens,
+ metadata,
+ flash_params,
+ )
+ }
+
+ #[allow(clippy::too_many_arguments)]
+ pub fn forward_embeds(
+ &self,
+ input_ids: &Tensor,
+ input_embeds: Tensor,
+ seqlen_offsets: &[usize],
+ context_lens: Vec<(usize, usize)>,
+ metadata: Option<(Vec<(Tensor, Tensor)>, &PagedAttentionInputMetadata)>,
+ flash_params: &FlashParams,
+ ) -> Result<Tensor> {
+ let mut x = input_embeds;
+ let cache = &mut self.kv_cache.normal().0;
+ let mask = CausalMasker.make_causal_mask_matrix(
+ input_ids,
+ metadata
+ .as_ref()
+ .map(|(_, _)| &seqlen_offsets as &dyn PastKvLenCache)
+ .unwrap_or(cache as &dyn PastKvLenCache),
+ x.dtype(),
+ self.blocks[0].attn.num_attention_heads,
+ )?;
+ // PagedAttention prompt chunking
+ let mask = mask.filter(|_| {
+ metadata
+ .as_ref()
+ .map(|(_, meta)| meta.is_first_prompt_chunk)
+ .unwrap_or(true)
+ });
+ for (block_idx, block) in self.blocks.iter().enumerate() {
+ x = self.mapper.map(x, block_idx)?;
+ x = block.forward(
+ &x,
+ &mask.clone().map(|m| m.to_device(x.device()).unwrap()),
+ seqlen_offsets,
+ &mut cache[block_idx],
+ metadata
+ .as_ref()
+ .map(|(kv_cache, metadata)| (kv_cache[block_idx].clone(), *metadata)),
+ flash_params,
+ )?;
+ }
+ let x = x.to_device(&self.device)?;
+ let mut x = self.ln_f.forward(&x)?;
+ if let Some(t) = self.lm_head.quantized_act_type() {
+ x = x.to_dtype(t)?;
+ }
+ let xs = MatMul.qmethod_matmul(&x, &*self.lm_head)?;
+ extract_logits(&xs, context_lens)
+ }
+
+ pub fn residual_tensors_m(&self, uvb_m: UnVarBuilder) -> Vec<(String, Tensor)> {
+ uvb_m.pp("embed_tokens").add(&self.wte);
+ uvb_m.pp("norm").add(&self.ln_f);
+
+ for (layer_idx, layer) in self.blocks.iter().enumerate() {
+ let uvb_l = uvb_m.pp("layers").pp(layer_idx);
+ uvb_l.pp("input_layernorm").add(&layer.rms_1);
+ uvb_l.pp("post_attention_layernorm").add(&layer.rms_2);
+ }
+
+ uvb_m.to_safetensors()
+ }
+}
+
+impl IsqModel for SmolLm3 {
+ fn get_layers(
+ &mut self,
+ ) -> (
+ Vec<(&mut Arc<dyn QuantMethod>, Option<usize>)>,
+ &dyn DeviceMapper,
+ ) {
+ let mut tensors = Vec::new();
+ tensors.push((&mut self.lm_head, None));
+ for (i, layer) in self.blocks.iter_mut().enumerate() {
+ tensors.push((&mut layer.attn.q_proj, Some(i)));
+ tensors.push((&mut layer.attn.k_proj, Some(i)));
+ tensors.push((&mut layer.attn.v_proj, Some(i)));
+ tensors.push((&mut layer.attn.o_proj, Some(i)));
+ tensors.extend(
+ layer
+ .mlp
+ .get_isq_layers()
+ .into_iter()
+ .map(|m| (m, Some(i)))
+ .collect::<Vec<_>>(),
+ );
+ }
+ (tensors, &*self.mapper)
+ }
+
+ fn residual_tensors(&self) -> Vec<(String, Tensor)> {
+ let uvb = UnVarBuilder::new();
+ self.residual_tensors_m(uvb.pp("model"))
+ }
+
+ fn imatrix_names(&self) -> candle_core::Result<Vec<Option<String>>> {
+ // NOTE: dependent on the exact implementation in get_layers!
+ let mut names = Vec::new();
+ // lm_head
+ names.push(None);
+ for i in 0..self.blocks.len() {
+ names.push(Some(format!("blk.{i}.attn_q.weight")));
+ names.push(Some(format!("blk.{i}.attn_k.weight")));
+ names.push(Some(format!("blk.{i}.attn_v.weight")));
+ names.push(Some(format!("blk.{i}.attn_output.weight")));
+ names.push(Some(format!("blk.{i}.ffn_gate.weight")));
+ names.push(Some(format!("blk.{i}.ffn_up.weight")));
+ names.push(Some(format!("blk.{i}.ffn_down.weight")));
+ }
+ Ok(names)
+ }
+}
+
+impl NormalModel for SmolLm3 {
+ fn forward(
+ &self,
+ input_ids: &Tensor,
+ seqlen_offsets: &[usize],
+ context_lens: Vec<(usize, usize)>,
+ _position_ids: Vec<usize>,
+ metadata: Option<(Vec<(Tensor, Tensor)>, &PagedAttentionInputMetadata)>,
+ flash_params: &FlashParams,
+ ) -> Result<Tensor> {
+ self.forward(
+ input_ids,
+ seqlen_offsets,
+ context_lens,
+ metadata,
+ flash_params,
+ )
+ }
+ fn xlora_forward(
+ &self,
+ _input_ids: &Tensor,
+ _input_ids_full: &Tensor,
+ _seqlen_offsets: &[usize],
+ _seqlen_offsets_full: &[usize],
+ _no_kv_cache: bool,
+ _non_granular_state: &Option<NonGranularState>,
+ _context_lens: Vec<(usize, usize)>,
+ _position_ids: Vec<usize>,
+ _flash_params: &FlashParams,
+ _flash_params_full: &FlashParams,
+ ) -> Result<Tensor> {
+ unimplemented!()
+ }
+ fn cache(&self) -> &crate::pipeline::EitherCache {
+ &self.kv_cache
+ }
+ fn cache_mut(&mut self) -> &mut crate::pipeline::EitherCache {
+ &mut self.kv_cache
+ }
+ fn device(&self) -> &Device {
+ &self.device
+ }
+ fn is_xlora(&self) -> bool {
+ false
+ }
+ fn max_seq_len(&self) -> usize {
+ self.blocks[0].attn.max_seq_len
+ }
+ fn config(&self) -> &ModelConfigMetadata {
+ &self.cfg
+ }
+}
+
+impl AnyMoeBaseModelMixin for SmolLm3 {
+ fn get_mlps(&self) -> Vec<&dyn MlpLayer> {
+ let mut mlps = Vec::new();
+ for layer in &self.blocks {
+ mlps.push(&*layer.mlp);
+ }
+ mlps
+ }
+ fn get_mlps_mut(&mut self) -> Vec<&mut Box<dyn MlpLayer>> {
+ let mut mlps = Vec::new();
+ for layer in &mut self.blocks {
+ mlps.push(&mut layer.mlp);
+ }
+ mlps
+ }
+ fn create_anymoe_layers(
+ &mut self,
+ additional_vbs: Vec<ShardedVarBuilder>,
+ config: AnyMoeConfig,
+ (prefix, mlp): (String, String),
+ mut layers: Vec<usize>,
+ expert_type: AnyMoeExpertType,
+ gate_vb: Option<ShardedVarBuilder>,
+ ) -> Result<()> {
+ let mut experts: Vec<Vec<Box<dyn MlpLayer>>> = Vec::new();
+ if layers.is_empty() {
+ layers = (0..self.blocks.len()).collect::<Vec<_>>();
+ }
+ for _ in 0..layers.len() {
+ experts.push(Vec::new());
+ }
+ for vb in additional_vbs {
+ let vb = vb.pp(&prefix);
+ for (layer, row) in experts.iter_mut().enumerate() {
+ if !layers.contains(&layer) {
+ continue;
+ }
+
+ let intermediate_size = self.blocks[layer].mlp.get_params()[1];
+ let hidden_size = self.blocks[layer].mlp.get_params()[0];
+ match expert_type {
+ AnyMoeExpertType::FineTuned => {
+ let (dtype, device) = self.blocks[layer].mlp.dtype_device();
+ row.push(Box::new(Mlp::replicate(
+ self.blocks[layer].mlp.get_params(),
+ vb.pp(layer).pp(&mlp).set_dtype(dtype).set_device(device),
+ self.blocks[layer].mlp.hidden_act(),
+ &self.mapper.get_comm_for(layer)?,
+ )?));
+ }
+ AnyMoeExpertType::LoraAdapter {
+ rank,
+ alpha,
+ ref target_modules,
+ } => {
+ let vb_mlp = vb.pp(layer).pp(&mlp);
+
+ let c_fc1_delta = if target_modules.contains(&"c_fc1".to_string()) {
+ Some(get_delta_from_lora_ab!(
+ vb_mlp,
+ rank,
+ alpha,
+ (hidden_size, intermediate_size),
+ "c_fc1"
+ ))
+ } else {
+ None
+ };
+ let c_fc2_delta = if target_modules.contains(&"c_fc2".to_string()) {
+ Some(get_delta_from_lora_ab!(
+ vb_mlp,
+ rank,
+ alpha,
+ (hidden_size, intermediate_size),
+ "c_fc2"
+ ))
+ } else {
+ None
+ };
+ let c_proj_delta = if target_modules.contains(&"c_proj".to_string()) {
+ Some(get_delta_from_lora_ab!(
+ vb_mlp,
+ rank,
+ alpha,
+ (intermediate_size, hidden_size),
+ "c_proj"
+ ))
+ } else {
+ None
+ };
+
+ row.push(self.blocks[layer].mlp.new_added_delta(vec![
+ c_fc1_delta,
+ c_fc2_delta,
+ c_proj_delta,
+ ])?);
+ }
+ }
+ }
+ }
+ for (layer, expert) in layers.into_iter().zip(experts) {
+ let mut experts_all = vec![self.blocks[layer].mlp.clone()];
+ experts_all.extend(expert);
+ let (dtype, device) = self.blocks[layer].mlp.dtype_device();
+ self.blocks[layer].mlp = Box::new(MoeMlp::new(
+ experts_all,
+ config.clone(),
+ dtype,
+ &device,
+ layer,
+ gate_vb.as_ref(),
+ )?);
+ }
+ Ok(())
+ }
+ fn amoe_supported(&self) -> bool {
+ true
+ }
+}
diff --git a/mistralrs-core/src/pipeline/chat_template.rs b/mistralrs-core/src/pipeline/chat_template.rs
index fbf75b7d06..cfd72e4fd0 100644
--- a/mistralrs-core/src/pipeline/chat_template.rs
+++ b/mistralrs-core/src/pipeline/chat_template.rs
@@ -309,10 +309,15 @@ pub fn apply_chat_template_to(
.into_owned();
if template.contains("{{ meta }}") {
- //fix for GLM4 models
+ // Fix for GLM4 models
template = template.replace("{%- set meta = message.get(\"metadata\", \"\") %}", "");
template = template.replace("{{ meta }}", "");
}
+ if template.contains("{% generation %}") && template.contains("{% endgeneration %}") {
+ // Strip the assistant-generation markers used by SmolLM3 chat templates
+ template = template.replace("{% generation %}", "");
+ template = template.replace("{% endgeneration %}", "");
+ }
env.add_template("chat_template", &template)?;
env.add_function("raise_exception", raise_exception);
@@ -331,7 +336,7 @@ pub fn apply_chat_template_to(
eos_token => eos_tok,
unk_token => unk_tok,
date_string => date_string,
- enable_thinking => enable_thinking,
+ enable_thinking => enable_thinking.unwrap_or(true),
})?)
} else {
Ok(tmpl.render(context! {
@@ -340,9 +345,10 @@ pub fn apply_chat_template_to(
bos_token => bos_tok,
eos_token => eos_tok,
unk_token => unk_tok,
+ xml_tools => tools.clone(), // SmolLM3
tools => tools,
date_string => date_string,
- enable_thinking => enable_thinking,
+ enable_thinking => enable_thinking.unwrap_or(true),
})?)
}
}
diff --git a/mistralrs-core/src/pipeline/isq.rs b/mistralrs-core/src/pipeline/isq.rs
index 76f574ae75..592ebb1229 100644
--- a/mistralrs-core/src/pipeline/isq.rs
+++ b/mistralrs-core/src/pipeline/isq.rs
@@ -710,6 +710,7 @@ pub trait IsqModel {
let config_out = parent.join("config.json");
let tokenizer_out = parent.join("tokenizer.json");
let tokenizer_cfg_out = parent.join("tokenizer_config.json");
+ let chat_template_jinja_out = parent.join("chat_template.jinja");
let gen_cfg_out = parent.join("generation_config.json");
let processor_out = parent.join("processor_config.json");
let preprocessor_out = parent.join("preprocessor_config.json");
@@ -741,15 +742,24 @@ pub trait IsqModel {
.map_err(candle_core::Error::msg)?;
if let Some(template_filename) = template_filename {
- info!(
- "Serializing tokenizer config to `{}`.",
- tokenizer_cfg_out.display()
- );
-
let template =
std::fs::read(template_filename).map_err(candle_core::Error::msg)?;
- std::fs::write(&tokenizer_cfg_out, template)
- .map_err(candle_core::Error::msg)?;
+
+ if template_filename.extension().map(|e| e.to_str()) == Some(Some("jinja")) {
+ info!(
+ "Serializing chat template to `{}`.",
+ chat_template_jinja_out.display()
+ );
+ std::fs::write(&chat_template_jinja_out, template)
+ .map_err(candle_core::Error::msg)?;
+ } else {
+ info!(
+ "Serializing tokenizer config to `{}`.",
+ tokenizer_cfg_out.display()
+ );
+ std::fs::write(&tokenizer_cfg_out, template)
+ .map_err(candle_core::Error::msg)?;
+ }
}
if let Some(generation_config) = generation_config {
diff --git a/mistralrs-core/src/pipeline/loaders/mod.rs b/mistralrs-core/src/pipeline/loaders/mod.rs
index b2600b7417..e8f6e5223c 100644
--- a/mistralrs-core/src/pipeline/loaders/mod.rs
+++ b/mistralrs-core/src/pipeline/loaders/mod.rs
@@ -23,7 +23,7 @@ pub use normal_loaders::{
AutoNormalLoader, DeepSeekV2Loader, DeepSeekV3Loader, GLM4Loader, Gemma2Loader, GemmaLoader,
LlamaLoader, MistralLoader, MixtralLoader, NormalLoaderType, NormalLoadingMetadata,
NormalModel, NormalModelLoader, Phi2Loader, Phi3Loader, Phi3_5MoELoader, Qwen2Loader,
- Qwen3Loader, Qwen3MoELoader, Starcoder2Loader,
+ Qwen3Loader, Qwen3MoELoader, SmolLm3Loader, Starcoder2Loader,
};
pub use vision_loaders::{
diff --git a/mistralrs-core/src/pipeline/loaders/normal_loaders.rs b/mistralrs-core/src/pipeline/loaders/normal_loaders.rs
index 4592a8f721..c15f9a5b6e 100644
--- a/mistralrs-core/src/pipeline/loaders/normal_loaders.rs
+++ b/mistralrs-core/src/pipeline/loaders/normal_loaders.rs
@@ -169,6 +169,8 @@ pub enum NormalLoaderType {
GLM4,
#[serde(rename = "qwen3moe")]
Qwen3Moe,
+ #[serde(rename = "smollm3")]
+ SmolLm3,
}
// https://github.com/huggingface/transformers/blob/cff06aac6fad28019930be03f5d467055bf62177/src/transformers/models/auto/modeling_auto.py#L448
@@ -190,6 +192,7 @@ impl NormalLoaderType {
"Qwen3ForCausalLM" => Ok(Self::Qwen3),
"Glm4ForCausalLM" => Ok(Self::GLM4),
"Qwen3MoeForCausalLM" => Ok(Self::Qwen3Moe),
+ "SmolLM3ForCausalLM" => Ok(Self::SmolLm3),
other => anyhow::bail!(
"Unsupported Hugging Face Transformers -CausalLM model class `{other}`. Please raise an issue."
),
@@ -216,7 +219,8 @@ impl FromStr for NormalLoaderType {
"qwen3" => Ok(Self::Qwen3),
"glm4" => Ok(Self::GLM4),
"qwen3moe" => Ok(Self::Qwen3Moe),
- a => Err(format!("Unknown architecture `{a}`. Possible architectures: `mistral`, `gemma`, `mixtral`, `llama`, `phi2`, `phi3`, `qwen2`, `gemma2`, `starcoder2`, `phi3.5moe`, `deepseekv2`, `deepseekv3`, `qwen3`, `glm4`, `qwen3moe`.")),
+ "smollm3" => Ok(Self::SmolLm3),
+ a => Err(format!("Unknown architecture `{a}`. Possible architectures: `mistral`, `gemma`, `mixtral`, `llama`, `phi2`, `phi3`, `qwen2`, `gemma2`, `starcoder2`, `phi3.5moe`, `deepseekv2`, `deepseekv3`, `qwen3`, `glm4`, `qwen3moe`, `smollm3`.")),
}
}
}
@@ -239,6 +243,7 @@ impl Display for NormalLoaderType {
Self::Qwen3 => write!(f, "qwen3"),
Self::GLM4 => write!(f, "glm4"),
Self::Qwen3Moe => write!(f, "qwen3moe"),
+ Self::SmolLm3 => write!(f, "smollm3"),
}
}
}
@@ -290,6 +295,7 @@ impl AutoNormalLoader {
NormalLoaderType::Qwen3 => Ok(Box::new(Qwen3Loader)),
NormalLoaderType::GLM4 => Ok(Box::new(GLM4Loader)),
NormalLoaderType::Qwen3Moe => Ok(Box::new(Qwen3MoELoader)),
+ NormalLoaderType::SmolLm3 => Ok(Box::new(SmolLm3Loader)),
}
}
}
@@ -3526,3 +3532,183 @@ impl DeviceMappedModelLoader for Qwen3MoELoader {
Ok(Box::new(cfg))
}
}
+
+// ======================== SmolLm3 loader
+
+/// [`NormalLoader`] for a SmolLm3 model.
+///
+/// [`NormalLoader`]: https://ericlbuehler.github.io/mistral.rs/mistralrs/struct.NormalLoader.html
+pub struct SmolLm3Loader;
+
+impl NormalModelLoader for SmolLm3Loader {
+ fn load(
+ &self,
+ config: &str,
+ vb: ShardedVarBuilder,
+ normal_loading_metadata: NormalLoadingMetadata,
+ attention_mechanism: AttentionImplementation,
+ ) -> Result<Box<dyn NormalModel + Send + Sync>> {
+ let cfg: crate::models::smollm3::Config = serde_json::from_str(config)?;
+
+ Ok(Box::new(models::smollm3::SmolLm3::new(
+ &cfg,
+ vb,
+ self.is_gptx(config)?,
+ normal_loading_metadata,
+ attention_mechanism,
+ )?))
+ }
+ fn load_xlora(
+ &self,
+ _config: &str,
+ _vb: ShardedVarBuilder,
+ _lora_config: &[((String, String), LoraConfig)],
+ _xlora_config: Option<XLoraConfig>,
+ _xlora_ordering: Ordering,
+ _normal_loading_metadata: NormalLoadingMetadata,
+ _preload_adapters: &Option<HashMap<String, (ShardedVarBuilder, LoraConfig)>>,
+ ) -> Result<Box<dyn NormalModel + Send + Sync>> {
+ todo!()
+ }
+ fn is_gptx(&self, _: &str) -> Result<bool> {
+ Ok(true)
+ }
+ fn get_config_repr(&self, config: &str) -> Result<Box<dyn Debug>> {
+ let cfg: crate::models::smollm3::Config = serde_json::from_str(config)?;
+ Ok(Box::new(cfg))
+ }
+}
+
+impl IsqModelLoader for SmolLm3Loader {
+ fn isq_layer_regexes(&self, _config: &str) -> Result<Vec<Regex>> {
+ Ok(vec![
+ Regex::new(r"lm_head\.(weight|bias)$")?,
+ // Attention
+ Regex::new(r"layers\.(\d+)\.self_attn\.q_proj\.(weight|bias)$")?,
+ Regex::new(r"layers\.(\d+)\.self_attn\.k_proj\.(weight|bias)$")?,
+ Regex::new(r"layers\.(\d+)\.self_attn\.v_proj\.(weight|bias)$")?,
+ Regex::new(r"layers\.(\d+)\.self_attn\.o_proj\.(weight|bias)$")?,
+ // MLP
+ Regex::new(r"layers\.(\d+)\.mlp\.gate_proj\.(weight|bias)$")?,
+ Regex::new(r"layers\.(\d+)\.mlp\.up_proj\.(weight|bias)$")?,
+ Regex::new(r"layers\.(\d+)\.mlp\.down_proj\.(weight|bias)$")?,
+ ])
+ }
+ fn immediate_isq_predicates(&self, config: &str) -> Result<Vec<Regex>> {
+ self.isq_layer_regexes(config)
+ }
+}
+
+impl DeviceMappedModelLoader for SmolLm3Loader {
+ fn mapped_max_act_size_elems(
+ &self,
+ config: &str,
+ params: &AutoDeviceMapParams,
+ prompt_chunksize: usize,
+ ) -> Result<usize> {
+ let AutoDeviceMapParams::Text {
+ max_seq_len: _,
+ max_batch_size,
+ } = params
+ else {
+ anyhow::bail!("Expected text AutoDeviceMapParams for this model!")
+ };
+
+ let cfg: crate::models::smollm3::Config = serde_json::from_str(config)?;
+
+ Ok(max_batch_size * cfg.num_attention_heads * prompt_chunksize * prompt_chunksize)
+ }
+ fn non_mapped_max_act_size_elems(
+ &self,
+ _config: &str,
+ _params: &AutoDeviceMapParams,
+ ) -> Result<usize> {
+ Ok(0)
+ }
+
+ fn non_mapped_size_in_bytes(
+ &self,
+ config: &str,
+ dtype: DType,
+ weight_pack_factor: usize,
+ ) -> Result<usize> {
+ let cfg: crate::models::smollm3::Config = serde_json::from_str(config)?;
+
+ let elems = {
+ let embed_tokens = cfg.hidden_size * cfg.vocab_size / weight_pack_factor;
+ // If embeddings are tied and no packing, reuse weights -> no separate lm_head needed
+ let lm_head = if !cfg.tie_word_embeddings || weight_pack_factor != 1 {
+ cfg.hidden_size * cfg.vocab_size / weight_pack_factor
+ } else {
+ 0
+ };
+ let norm = cfg.hidden_size;
+ embed_tokens + lm_head + norm
+ };
+ Ok(elems * dtype.size_in_bytes())
+ }
+
+ fn layer_sizes_in_bytes(
+ &self,
+ config: &str,
+ dtype: DType,
+ weight_pack_factor: usize,
+ ) -> Result<Vec<usize>> {
+ let cfg: crate::models::smollm3::Config = serde_json::from_str(config)?;
+
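+ // Per-layer parameter count: two RMSNorm vectors, the q/k/v/o attention
+ // projections, and the gate/up/down MLP projections (packed weights are
+ // divided by the weight pack factor).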
+ let per_layer_elems = {
+ let input_layernorm = cfg.hidden_size;
+ let post_attention_layernorm = cfg.hidden_size;
+
+ let size_in = cfg.hidden_size;
+ let size_q = (cfg.hidden_size / cfg.num_attention_heads) * cfg.num_attention_heads;
+ let size_kv = (cfg.hidden_size / cfg.num_attention_heads) * cfg.num_key_value_heads;
+ let q_proj = size_in * size_q / weight_pack_factor;
+ let k_proj = size_in * size_kv / weight_pack_factor;
+ let v_proj = size_in * size_kv / weight_pack_factor;
+ let o_proj = size_q * size_in / weight_pack_factor;
+
+ let h_size = cfg.hidden_size;
+ let i_size = cfg.intermediate_size;
+ let gate_proj = h_size * i_size / weight_pack_factor;
+ let up_proj = h_size * i_size / weight_pack_factor;
+ let down_proj = i_size * h_size / weight_pack_factor;
+
+ input_layernorm
+ + post_attention_layernorm
+ + q_proj
+ + k_proj
+ + v_proj
+ + o_proj
+ + gate_proj
+ + up_proj
+ + down_proj
+ };
+ Ok(vec![
+ per_layer_elems * dtype.size_in_bytes();
+ cfg.num_hidden_layers
+ ])
+ }
+
+ fn num_layers(&self, config: &str) -> Result<usize> {
+ let cfg: crate::models::smollm3::Config = serde_json::from_str(config)?;
+
+ Ok(cfg.num_hidden_layers)
+ }
+ fn model_config(&self, config: &str) -> Result<Box<dyn ModelConfigLike>> {
+ let cfg: crate::models::smollm3::Config = serde_json::from_str(config)?;
+
+ let cfg = ModelConfigMetadata {
+ max_seq_len: cfg.max_position_embeddings,
+ num_layers: cfg.num_hidden_layers,
+ hidden_size: cfg.hidden_size,
+ num_kv_heads: cfg.num_key_value_heads,
+ num_attn_heads: cfg.num_attention_heads,
+ sliding_window: None,
+ k_head_dim: cfg.hidden_size / cfg.num_attention_heads,
+ v_head_dim: cfg.hidden_size / cfg.num_attention_heads,
+ };
+
+ Ok(Box::new(cfg))
+ }
+}
diff --git a/mistralrs-core/src/pipeline/macros.rs b/mistralrs-core/src/pipeline/macros.rs
index b5dbacee96..84dafe7a79 100644
--- a/mistralrs-core/src/pipeline/macros.rs
+++ b/mistralrs-core/src/pipeline/macros.rs
@@ -195,6 +195,9 @@ macro_rules! get_paths {
let template_filename = if let Some(ref p) = $this.chat_template {
info!("Using chat template file at `{p}`");
Some(PathBuf::from_str(p)?)
+ } else if dir_list.contains(&"chat_template.jinja".to_string()) {
+ info!("Loading `chat_template.jinja` at `{}`", $this.model_id);
+ Some($crate::api_get_file!(api, "chat_template.jinja", model_id))
} else {
info!("Loading `tokenizer_config.json` at `{}`", $this.model_id);
Some($crate::api_get_file!(
@@ -299,16 +302,26 @@ macro_rules! get_paths_gguf {
));
let model_id = std::path::Path::new(&this_model_id);
+ let dir_list = $crate::api_dir_list!(api, model_id, false)
+ .collect::<Vec<String>>();
+
let chat_template = if let Some(ref p) = $this.chat_template {
- if p.ends_with(".json") {
+ if p.ends_with(".json") || p.ends_with(".jinja") {
info!("Using chat template file at `{p}`");
Some(PathBuf::from_str(p)?)
} else {
- panic!("Specified chat template file must end with .json");
+ panic!("Specified chat template file must end with .json or .jinja");
}
} else {
if $this.model_id.is_none() {
None
+ } else if dir_list.contains(&"chat_template.jinja".to_string()) {
+ info!("Loading `chat_template.jinja` at `{}`", this_model_id);
+ Some($crate::api_get_file!(
+ api,
+ "chat_template.jinja",
+ model_id
+ ))
} else {
info!("Loading `tokenizer_config.json` at `{}` because no chat template file was specified.", this_model_id);
let res = $crate::api_get_file!(
@@ -340,9 +353,6 @@ macro_rules! get_paths_gguf {
$this.xlora_order.as_ref(),
)?;
- let dir_list = $crate::api_dir_list!(api, model_id, false)
- .collect::<Vec<String>>();
-
let gen_conf = if dir_list.contains(&"generation_config.json".to_string()) {
info!("Loading `generation_config.json` at `{}`", this_model_id);
Some($crate::api_get_file!(
diff --git a/mistralrs-core/src/pipeline/mod.rs b/mistralrs-core/src/pipeline/mod.rs
index da0b186eb3..1ac4a2260a 100644
--- a/mistralrs-core/src/pipeline/mod.rs
+++ b/mistralrs-core/src/pipeline/mod.rs
@@ -43,8 +43,8 @@ pub use loaders::{
ModelPaths, NormalLoaderType, NormalLoadingMetadata, NormalModel, NormalModelLoader,
Phi2Loader, Phi3Loader, Phi3VLoader, Phi3_5MoELoader, Phi4MMLoader, PrettyName,
QuantizationKind, Qwen2Loader, Qwen2VLLoader, Qwen2_5VLLoader, Qwen3Loader, Qwen3MoELoader,
- Starcoder2Loader, TokenSource, VLlama4Loader, VLlamaLoader, VisionLoaderType, VisionModel,
- VisionModelLoader,
+ SmolLm3Loader, Starcoder2Loader, TokenSource, VLlama4Loader, VLlamaLoader, VisionLoaderType,
+ VisionModel, VisionModelLoader,
};
use mistralrs_quant::IsqType;
pub use normal::{NormalLoader, NormalLoaderBuilder, NormalSpecificConfig};
diff --git a/mistralrs-core/src/pipeline/normal.rs b/mistralrs-core/src/pipeline/normal.rs
index dc7de2e08b..289f7ac768 100644
--- a/mistralrs-core/src/pipeline/normal.rs
+++ b/mistralrs-core/src/pipeline/normal.rs
@@ -13,7 +13,7 @@ use super::{
use super::{
AutoNormalLoader, DeepSeekV2Loader, DeepSeekV3Loader, GLM4Loader, Gemma2Loader, GemmaLoader,
LlamaLoader, MistralLoader, MixtralLoader, NormalLoaderType, Phi2Loader, Phi3Loader,
- Phi3_5MoELoader, Qwen2Loader, Qwen3Loader, Qwen3MoELoader, Starcoder2Loader,
+ Phi3_5MoELoader, Qwen2Loader, Qwen3Loader, Qwen3MoELoader, SmolLm3Loader, Starcoder2Loader,
};
use crate::amoe::AnyMoeExpertType;
use crate::device_map::{self, DeviceMapper};
@@ -224,6 +224,7 @@ impl NormalLoaderBuilder {
Some(NormalLoaderType::Qwen3) => Box::new(Qwen3Loader),
Some(NormalLoaderType::GLM4) => Box::new(GLM4Loader),
Some(NormalLoaderType::Qwen3Moe) => Box::new(Qwen3MoELoader),
+ Some(NormalLoaderType::SmolLm3) => Box::new(SmolLm3Loader),
None => Box::new(AutoNormalLoader),
};
Ok(Box::new(NormalLoader {
diff --git a/mistralrs-core/src/pipeline/paths.rs b/mistralrs-core/src/pipeline/paths.rs
index 441619694e..110752eac6 100644
--- a/mistralrs-core/src/pipeline/paths.rs
+++ b/mistralrs-core/src/pipeline/paths.rs
@@ -445,7 +445,23 @@ pub(crate) fn get_chat_template(
template.chat_template = Some(ChatTemplateValue(Either::Left(chat_template)));
template
}
- None => serde_json::from_str(&template_content.as_ref().unwrap().clone()).unwrap(),
+ None => {
+ // Check if template_filename is a .jinja file
+ if let Some(template_filename) = paths.get_template_filename() {
+ if template_filename.extension().map(|e| e.to_str()) == Some(Some("jinja")) {
+ info!("Using chat template from .jinja file.");
+ let mut template = ChatTemplate::default();
+ template.chat_template = Some(ChatTemplateValue(Either::Left(
+ template_content.as_ref().unwrap().clone(),
+ )));
+ template
+ } else {
+ serde_json::from_str(&template_content.as_ref().unwrap().clone()).unwrap()
+ }
+ } else {
+ serde_json::from_str(&template_content.as_ref().unwrap().clone()).unwrap()
+ }
+ }
};
// Overwrite to use any present `chat_template.json`, only if there is not one present already.
if template.chat_template.is_none() {
@@ -539,7 +555,7 @@ pub(crate) fn get_chat_template(
}
}
None => {
- info!("No specified chat template. No chat template will be used. Only prompts will be accepted, not messages.");
+ warn!("No specified chat template. No chat template will be used. Only prompts will be accepted, not messages.");
deser.insert("chat_template".to_string(), Value::Null);
}
}
diff --git a/mistralrs-pyo3/API.md b/mistralrs-pyo3/API.md
index e75089c9f5..53c227950c 100644
--- a/mistralrs-pyo3/API.md
+++ b/mistralrs-pyo3/API.md
@@ -29,6 +29,9 @@ If you do not specify the architecture, an attempt will be made to use the model
- `Phi3_5MoE`
- `DeepseekV2`
- `DeepseekV3`
+- `Qwen3`
+- `Qwen3Moe`
+- `SmolLm3`
### ISQ Organization
- `Default`
@@ -389,7 +392,7 @@ runner = Runner(
res = runner.send_chat_completion_request(
ChatCompletionRequest(
- model="ignore",
+ model="default",
messages=[{"role":"user", "content":"Tell me a story about the Rust type system."}],
max_tokens=256,
presence_penalty=1.0,
diff --git a/mistralrs-pyo3/README.md b/mistralrs-pyo3/README.md
index dbdd9e3186..8154c76a02 100644
--- a/mistralrs-pyo3/README.md
+++ b/mistralrs-pyo3/README.md
@@ -17,7 +17,7 @@ runner = Runner(
res = runner.send_chat_completion_request(
ChatCompletionRequest(
- model="ignore",
+ model="default",
messages=[
{"role": "user", "content": "Tell me a story about the Rust type system."}
],
@@ -54,7 +54,7 @@ AUDIO_URL = "https://upload.wikimedia.org/wikipedia/commons/4/42/Bird_singing.og
response = runner.send_chat_completion_request(
ChatCompletionRequest(
- model="ignore",
+ model="default",
messages=[
{
"role": "user",
diff --git a/mistralrs-pyo3/_README.md b/mistralrs-pyo3/_README.md
index a4a82bba36..7bb2fe0351 100644
--- a/mistralrs-pyo3/_README.md
+++ b/mistralrs-pyo3/_README.md
@@ -17,7 +17,7 @@ runner = Runner(
res = runner.send_chat_completion_request(
ChatCompletionRequest(
- model="ignore",
+ model="default",
messages=[
{"role": "user", "content": "Tell me a story about the Rust type system."}
],
diff --git a/mistralrs-pyo3/mistralrs.pyi b/mistralrs-pyo3/mistralrs.pyi
index 97d7eaeaf8..4efe577d79 100644
--- a/mistralrs-pyo3/mistralrs.pyi
+++ b/mistralrs-pyo3/mistralrs.pyi
@@ -106,6 +106,7 @@ class Architecture(Enum):
Qwen3 = "qwen3"
GLM4 = "glm4"
Qwen3Moe = "qwen3moe"
+ SmolLm3 = "smollm3"
@dataclass
class VisionArchitecture(Enum):
diff --git a/mistralrs-pyo3/src/which.rs b/mistralrs-pyo3/src/which.rs
index 43c090714f..2e9230f62e 100644
--- a/mistralrs-pyo3/src/which.rs
+++ b/mistralrs-pyo3/src/which.rs
@@ -24,6 +24,7 @@ pub enum Architecture {
Qwen3,
GLM4,
Qwen3Moe,
+ SmolLm3,
}
impl From<Architecture> for NormalLoaderType {
@@ -44,6 +45,7 @@ impl From<Architecture> for NormalLoaderType {
Architecture::Qwen3 => Self::Qwen3,
Architecture::GLM4 => Self::GLM4,
Architecture::Qwen3Moe => Self::Qwen3Moe,
+ Architecture::SmolLm3 => Self::SmolLm3,
}
}
}
diff --git a/mistralrs/examples/qwen3/main.rs b/mistralrs/examples/qwen3/main.rs
index df8b059b17..c5b89f945d 100644
--- a/mistralrs/examples/qwen3/main.rs
+++ b/mistralrs/examples/qwen3/main.rs
@@ -3,7 +3,7 @@ use mistralrs::{IsqType, TextMessageRole, TextMessages, TextModelBuilder};
#[tokio::main]
async fn main() -> Result<()> {
- let model = TextModelBuilder::new("https://huggingface.co/Qwen/Qwen3-30B-A3B")
+ let model = TextModelBuilder::new("Qwen/Qwen3-30B-A3B")
.with_isq(IsqType::Q4K)
.with_logging()
.build()
diff --git a/mistralrs/examples/smollm3/main.rs b/mistralrs/examples/smollm3/main.rs
new file mode 100644
index 0000000000..80339d6b64
--- /dev/null
+++ b/mistralrs/examples/smollm3/main.rs
@@ -0,0 +1,35 @@
+use anyhow::Result;
+use mistralrs::{
+ IsqType, PagedAttentionMetaBuilder, TextMessageRole, TextMessages, TextModelBuilder,
+};
+
+#[tokio::main]
+async fn main() -> Result<()> {
+ let model = TextModelBuilder::new("HuggingFaceTB/SmolLM3-3B")
+ .with_isq(IsqType::Q8_0)
+ .with_logging()
+ .with_paged_attn(|| PagedAttentionMetaBuilder::default().build())?
+ .build()
+ .await?;
+
+ let messages = TextMessages::new()
+ // .enable_thinking(false)
+ .add_message(
+ TextMessageRole::System,
+ "You are an AI agent with a specialty in programming.",
+ )
+ .add_message(
+ TextMessageRole::User,
+ "Hello! How are you? Please write generic binary search function in Rust.",
+ );
+
+ let response = model.send_chat_request(messages).await?;
+
+ println!("{}", response.choices[0].message.content.as_ref().unwrap());
+ dbg!(
+ response.usage.avg_prompt_tok_per_sec,
+ response.usage.avg_compl_tok_per_sec
+ );
+
+ Ok(())
+}
diff --git a/mistralrs/src/messages.rs b/mistralrs/src/messages.rs
index 7094c50d22..6ccd32d51f 100644
--- a/mistralrs/src/messages.rs
+++ b/mistralrs/src/messages.rs
@@ -27,11 +27,14 @@ pub trait RequestLike {
/// No constraints, logits processors, logprobs, tools, or adapters.
///
/// Sampling is deterministic.
-pub struct TextMessages(Vec<IndexMap<String, MessageContent>>);
+pub struct TextMessages {
+ messages: Vec<IndexMap<String, MessageContent>>,
+ enable_thinking: Option<bool>,
+}
impl From<TextMessages> for Vec<IndexMap<String, MessageContent>> {
fn from(value: TextMessages) -> Self {
- value.0
+ value.messages
}
}
@@ -65,11 +68,14 @@ impl Default for TextMessages {
impl TextMessages {
pub fn new() -> Self {
- Self(Vec::new())
+ Self {
+ messages: Vec::new(),
+ enable_thinking: None,
+ }
}
pub fn add_message(mut self, role: TextMessageRole, text: impl ToString) -> Self {
- self.0.push(IndexMap::from([
+ self.messages.push(IndexMap::from([
("role".to_string(), Either::Left(role.to_string())),
("content".to_string(), Either::Left(text.to_string())),
]));
@@ -77,24 +83,29 @@ impl TextMessages {
}
pub fn clear(mut self) -> Self {
- self.0.clear();
+ self.messages.clear();
+ self
+ }
+
+ pub fn enable_thinking(mut self, enable_thinking: bool) -> Self {
+ self.enable_thinking = Some(enable_thinking);
self
}
}
impl RequestLike for TextMessages {
fn messages_ref(&self) -> &[IndexMap<String, MessageContent>] {
- &self.0
+ &self.messages
}
fn images_ref(&self) -> &[DynamicImage] {
&[]
}
fn take_messages(&mut self) -> RequestMessage {
let mut other = Vec::new();
- std::mem::swap(&mut other, &mut self.0);
+ std::mem::swap(&mut other, &mut self.messages);
RequestMessage::Chat {
messages: other,
- enable_thinking: self.enable_search(),
+ enable_thinking: self.enable_thinking,
}
}
fn enable_search(&self) -> Option<bool> {
@@ -133,6 +144,7 @@ pub struct VisionMessages {
messages: Vec<IndexMap<String, MessageContent>>,
images: Vec<DynamicImage>,
audios: Vec<AudioInput>,
+ enable_thinking: Option<bool>,
}
impl Default for VisionMessages {
@@ -147,6 +159,7 @@ impl VisionMessages {
images: Vec::new(),
messages: Vec::new(),
audios: Vec::new(),
+ enable_thinking: None,
}
}
@@ -242,6 +255,11 @@ impl VisionMessages {
self
}
+
+ pub fn enable_thinking(mut self, enable_thinking: bool) -> Self {
+ self.enable_thinking = Some(enable_thinking);
+ self
+ }
}
impl RequestLike for VisionMessages {
@@ -262,7 +280,7 @@ impl RequestLike for VisionMessages {
images: other_images,
messages: other_messages,
audios: other_audios,
- enable_thinking: self.enable_search(),
+ enable_thinking: self.enable_thinking,
}
}
fn enable_search(&self) -> Option<bool> {
@@ -325,7 +343,7 @@ impl Default for RequestBuilder {
impl From<TextMessages> for RequestBuilder {
fn from(value: TextMessages) -> Self {
Self {
- messages: value.0,
+ messages: value.messages,
images: Vec::new(),
audios: Vec::new(),
logits_processors: Vec::new(),
diff --git a/scripts/bench.py b/scripts/bench.py
index 7b15cd3856..12e62fb7c4 100644
--- a/scripts/bench.py
+++ b/scripts/bench.py
@@ -56,7 +56,7 @@ async def timed_chat(client: AsyncOpenAI, messages):
"""
start = time.perf_counter()
completion = await client.chat.completions.create(
- model="ignore",
+ model="default",
messages=messages,
max_tokens=256,
frequency_penalty=1.0,