9 changes: 3 additions & 6 deletions Cargo.lock


4 changes: 2 additions & 2 deletions Cargo.toml
@@ -125,8 +125,8 @@ schemars = "0.8.22"
serde_yaml = "0.9.34"
serde_plain = "1.0.2"
as-any = "0.3.2"
-llguidance = { version = "0.7.29", default-features = false, features = ["lark"] }
-toktrie_hf_tokenizers = "0.7.29"
+llguidance = { git = "https://github.com/guidance-ai/llguidance.git", version = "0.7.29", default-features = false, features = ["lark"], rev = "2ce5ab8" }
+toktrie_hf_tokenizers = { git = "https://github.com/guidance-ai/llguidance.git", version = "0.7.29", rev = "2ce5ab8" }
objc = { version = "0.2.7" }
serde-big-array = "0.5.1"
interprocess = "2.2.3"
6 changes: 5 additions & 1 deletion README.md
@@ -384,6 +384,7 @@ If you do not specify the architecture, an attempt will be made to use the model
- `phi3.5moe`
- `qwen2`
- `gemma2`
- `glm4`
- `starcoder2`
- `deepseekv2`
- `deepseekv3`
@@ -426,6 +427,7 @@ If you do not specify the architecture, an attempt will be made to use the model type.
- phi3
- starcoder2
- qwen2
- qwen3

**With adapters:**
- llama
@@ -456,6 +458,7 @@ Please submit more benchmarks via raising an issue!
|Phi 3 Vision| | |✅|
|Idefics 2| | |✅|
|Gemma 2| | |✅|
|GLM4| | |✅|
|Starcoder 2| |✅|✅|
|LLaVa Next| | |✅|
|LLaVa| | |✅|
@@ -469,7 +472,7 @@ Please submit more benchmarks via raising an issue!
|Gemma 3| | |✅|
|Mistral 3| | |✅|
|Llama 4| | |✅|
-|Qwen 3| | |✅|
+|Qwen 3|| |✅|
|Dia 1.6b| | |✅|
</details>

@@ -502,6 +505,7 @@ Please submit more benchmarks via raising an issue!
|Phi 3 Vision| | | |
|Idefics 2| | | |
|Gemma 2|✅| | |
|GLM4|✅| | |
|Starcoder 2|✅| | |
|LLaVa Next| | | |
|LLaVa| | | |
59 changes: 59 additions & 0 deletions docs/GLM4.md
@@ -0,0 +1,59 @@
# GLM4 Model

**[See the GLM4 model Collection](https://huggingface.co/collections/THUDM/glm-4-0414-67f3cbcb34dd9d252707cb2e)**

GLM4 is a series of open, multilingual, and multimodal large language models. The text-to-text LLM backbones in GLM4 are supported by mistral.rs.

## HTTP API

```py
import openai

# Assumes a mistralrs server is listening locally; adjust base_url to match your setup.
client = openai.OpenAI(
    base_url="http://localhost:1234/v1",
    api_key="EMPTY",
)

messages = []
prompt = input("Enter system prompt >>> ")
if len(prompt) > 0:
    messages.append({"role": "system", "content": prompt})

while True:
    prompt = input(">>> ")
    messages.append({"role": "user", "content": prompt})
    completion = client.chat.completions.create(
        model="glm4",
        messages=messages,
        max_tokens=256,
        frequency_penalty=1.0,
        top_p=0.1,
        temperature=0,
    )
    resp = completion.choices[0].message.content
    print(resp)
    messages.append({"role": "assistant", "content": resp})
```
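
Note that the `client` construction above assumes a mistralrs server already running at `http://localhost:1234/v1`; point `base_url` at wherever your server is actually listening.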

## Python API
```py
from mistralrs import Runner, Which, ChatCompletionRequest, Architecture

runner = Runner(
which=Which.Plain(
model_id="THUDM/GLM-4-9B-0414",
arch=Architecture.GLM4,
),
)

res = runner.send_chat_completion_request(
ChatCompletionRequest(
model="glm4",
messages=[
{"role": "user", "content": "Tell me a story about the Rust type system."}
],
max_tokens=256,
presence_penalty=1.0,
top_p=0.1,
temperature=0.1,
)
)
print(res.choices[0].message.content)
print(res.usage)
```
36 changes: 36 additions & 0 deletions mistralrs-core/src/layers.rs
@@ -2127,6 +2127,42 @@ impl Mlp {
        })
    }

    pub fn new_merged(
        vb: ShardedVarBuilder,
        hidden_size: usize,
        intermediate_size: usize,
        chunks: usize,
        quantization_config: &Option<QuantizedConfig>,
        hidden_act: Activation,
        comm: &Arc<mistralrs_quant::Comm>,
    ) -> Result<Self> {
        // Only the two-way gate + up fusion is supported.
        assert!(chunks == 2, "Only gate_up_proj merge is supported!");
        // Load the fused gate_up_proj weight and split it into separate
        // column-parallel gate and up projections.
        let gate_up_projs = ColumnParallelLayer::new_merged(
            hidden_size,
            intermediate_size * 2,
            2,
            quantization_config,
            false,
            comm,
            vb.pp("gate_up_proj"),
        )?;

        Ok(Self {
            gate: gate_up_projs[0].to_owned(),
            up: gate_up_projs[1].to_owned(),
            down: RowParallelLayer::new(
                intermediate_size,
                hidden_size,
                quantization_config,
                false,
                comm,
                vb.pp("down_proj"),
            )?,
            act: hidden_act,
            params: vec![hidden_size, intermediate_size],
        })
    }

    pub fn replicate(
        params: &[usize],
        vb: ShardedVarBuilder,
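
For orientation, here is a minimal sketch (not part of this diff) of how a model's layer builder might invoke `Mlp::new_merged` when a checkpoint stores the gate and up projections as one fused `gate_up_proj` tensor. The `cfg`, `vb`, and `comm` bindings are assumed to be the usual per-layer arguments available at the call site:

```rust
// Hypothetical call site for the merged MLP path; `cfg`, `vb`, and `comm`
// are assumed to come from the surrounding model-building code.
let mlp = Mlp::new_merged(
    vb.pp("mlp"),
    cfg.hidden_size,
    cfg.intermediate_size,
    2, // chunks: only the gate + up fusion is supported
    &cfg.quantization_config,
    cfg.hidden_act,
    comm,
)?;
```

Loading the two projections from a single fused tensor keeps the checkpoint layout of models like GLM4 intact while still exposing separate `gate` and `up` layers to the rest of the forward pass.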