
Commit 03f8a9d

guoqingbao authored and EricLBuehler committed

Support GLM4 model! (#1437)

* Support GLM4 model
* Mention GLM4 model in ReadMe
* glm4 type hint
* Typo fix
* Fix unsupported chat_template function
* Clippy fix
1 parent 6bededc commit 03f8a9d

File tree: 18 files changed (+1264, −31 lines)


Cargo.lock

Lines changed: 3 additions & 6 deletions
Some generated files are not rendered by default.

Cargo.toml

Lines changed: 2 additions & 2 deletions
```diff
@@ -128,8 +128,8 @@ schemars = "0.8.22"
 serde_yaml = "0.9.34"
 serde_plain = "1.0.2"
 as-any = "0.3.2"
-llguidance = { version = "0.7.29", default-features = false, features = ["lark"] }
-toktrie_hf_tokenizers = "0.7.29"
+llguidance = { git = "https://github.com/guidance-ai/llguidance.git", version = "0.7.29", default-features = false, features = ["lark"], rev = "2ce5ab8" }
+toktrie_hf_tokenizers = { git = "https://github.com/guidance-ai/llguidance.git", version = "0.7.29", rev = "2ce5ab8" }
 objc = { version = "0.2.7" }
 serde-big-array = "0.5.1"
 interprocess = "2.2.3"
```

README.md

Lines changed: 5 additions & 1 deletion
```diff
@@ -463,6 +463,7 @@ If you do not specify the architecture, an attempt will be made to use the model
 - `phi3.5moe`
 - `qwen2`
 - `gemma2`
+- `glm4`
 - `starcoder2`
 - `deepseekv2`
 - `deepseekv3`
@@ -505,6 +506,7 @@ If you do not specify the architecture, an attempt will be made to use the model
 - phi3
 - starcoder2
 - qwen2
+- qwen3
 
 **With adapters:**
 - llama
@@ -535,6 +537,7 @@ Please submit more benchmarks via raising an issue!
 |Phi 3 Vision| | |✅|
 |Idefics 2| | |✅|
 |Gemma 2| | |✅|
+|GLM4| | |✅|
 |Starcoder 2| |✅|✅|
 |LLaVa Next| | |✅|
 |LLaVa| | |✅|
@@ -548,7 +551,7 @@ Please submit more benchmarks via raising an issue!
 |Gemma 3| | |✅|
 |Mistral 3| | |✅|
 |Llama 4| | |✅|
-|Qwen 3| | |✅|
+|Qwen 3|| |✅|
 |Dia 1.6b| | |✅|
 </details>
 
@@ -581,6 +584,7 @@ Please submit more benchmarks via raising an issue!
 |Phi 3 Vision| | | |
 |Idefics 2| | | |
 |Gemma 2|✅| | |
+|GLM4|✅| | |
 |Starcoder 2|✅| | |
 |LLaVa Next| | | |
 |LLaVa| | | |
```

docs/GLM4.md

Lines changed: 59 additions & 0 deletions
# GLM4 Model

**[See the GLM4 model Collection](https://huggingface.co/collections/THUDM/glm-4-0414-67f3cbcb34dd9d252707cb2e)**

GLM4 is a series of open, multilingual, and multimodal large language models. The text-to-text LLM backbones in GLM4 are supported by mistral.rs.

## HTTP API

```py
import openai

# Assumes a mistral.rs server running locally with the OpenAI-compatible API;
# adjust the base URL and API key to your deployment.
client = openai.OpenAI(base_url="http://localhost:1234/v1", api_key="EMPTY")

messages = []
prompt = input("Enter system prompt >>> ")
if len(prompt) > 0:
    messages.append({"role": "system", "content": prompt})


while True:
    prompt = input(">>> ")
    messages.append({"role": "user", "content": prompt})
    completion = client.chat.completions.create(
        model="glm4",
        messages=messages,
        max_tokens=256,
        frequency_penalty=1.0,
        top_p=0.1,
        temperature=0,
    )
    resp = completion.choices[0].message.content
    print(resp)
    messages.append({"role": "assistant", "content": resp})
```

## Python API

```py
from mistralrs import Runner, Which, ChatCompletionRequest, Architecture

runner = Runner(
    which=Which.Plain(
        model_id="THUDM/GLM-4-9B-0414",
        arch=Architecture.GLM4,
    ),
)

res = runner.send_chat_completion_request(
    ChatCompletionRequest(
        model="glm4",
        messages=[
            {"role": "user", "content": "Tell me a story about the Rust type system."}
        ],
        max_tokens=256,
        presence_penalty=1.0,
        top_p=0.1,
        temperature=0.1,
    )
)
print(res.choices[0].message.content)
print(res.usage)
```
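The HTTP example above uses the Python `openai` client, but any OpenAI-compatible client can talk to the server. As a cross-check, here is a minimal sketch of the same chat request in plain Rust, assuming the `reqwest` crate (with the "blocking" and "json" features) and `serde_json` as dependencies; the base URL and API key are assumptions about a local deployment, not values taken from the commit:

```rust
use serde_json::{json, Value};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let client = reqwest::blocking::Client::new();
    let body = json!({
        "model": "glm4",
        "messages": [
            {"role": "user", "content": "Tell me a story about the Rust type system."}
        ],
        "max_tokens": 256,
        "temperature": 0.1,
    });
    let resp: Value = client
        .post("http://localhost:1234/v1/chat/completions")
        .bearer_auth("EMPTY")
        .json(&body)
        .send()?
        .error_for_status()?
        .json()?;
    // The response follows the OpenAI chat-completions schema, so the reply
    // text lives at choices[0].message.content.
    println!("{}", resp["choices"][0]["message"]["content"]);
    Ok(())
}
```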

mistralrs-core/src/layers.rs

Lines changed: 36 additions & 0 deletions
```diff
@@ -2185,6 +2185,42 @@ impl Mlp {
         })
     }
 
+    pub fn new_merged(
+        vb: ShardedVarBuilder,
+        hidden_size: usize,
+        intermediate_size: usize,
+        chunks: usize,
+        quantization_config: &Option<QuantizedConfig>,
+        hidden_act: Activation,
+        comm: &Arc<mistralrs_quant::Comm>,
+    ) -> Result<Self> {
+        assert!(chunks == 2, "Only gate_up_proj merge is supported!");
+        let gate_up_projs = ColumnParallelLayer::new_merged(
+            hidden_size,
+            intermediate_size * 2,
+            2,
+            quantization_config,
+            false,
+            comm,
+            vb.pp("gate_up_proj"),
+        )?;
+
+        Ok(Self {
+            gate: gate_up_projs[0].to_owned(),
+            up: gate_up_projs[1].to_owned(),
+            down: RowParallelLayer::new(
+                intermediate_size,
+                hidden_size,
+                quantization_config,
+                false,
+                comm,
+                vb.pp("down_proj"),
+            )?,
+            act: hidden_act,
+            params: vec![hidden_size, intermediate_size],
+        })
+    }
+
     pub fn replicate(
         params: &[usize],
         vb: ShardedVarBuilder,
```
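This constructor exists because GLM4 checkpoints store the gate and up projections fused into a single `gate_up_proj` tensor of width `intermediate_size * 2`, which must be split into two halves at load time (in the real code, `ColumnParallelLayer::new_merged` also handles tensor-parallel sharding and quantization). Below is a minimal sketch of just the split, assuming the `candle_core` crate; the function and variable names are illustrative, not the repository's actual API:

```rust
use candle_core::{Device, Result, Tensor};

/// Split a fused gate_up weight of shape (2 * intermediate_size, hidden_size)
/// into its gate half and its up half along the output dimension.
fn split_gate_up(w_gate_up: &Tensor, intermediate_size: usize) -> Result<(Tensor, Tensor)> {
    let gate = w_gate_up.narrow(0, 0, intermediate_size)?;
    let up = w_gate_up.narrow(0, intermediate_size, intermediate_size)?;
    Ok((gate, up))
}

fn main() -> Result<()> {
    let (hidden_size, intermediate_size) = (8, 16);
    // Stand-in for the checkpoint's fused `gate_up_proj` weight.
    let w = Tensor::randn(0f32, 1f32, (2 * intermediate_size, hidden_size), &Device::Cpu)?;
    let (gate, up) = split_gate_up(&w, intermediate_size)?;
    assert_eq!(gate.dims(), &[intermediate_size, hidden_size]);
    assert_eq!(up.dims(), &[intermediate_size, hidden_size]);
    Ok(())
}
```

This matches the `chunks == 2` assertion in the diff: only the two-way gate/up merge is supported, and the loaded halves become the `gate` and `up` fields of `Mlp` while `down_proj` is loaded as a separate row-parallel layer.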
