diff --git a/vllm_mlx/api/models.py b/vllm_mlx/api/models.py index b8ce7342..92cd38be 100644 --- a/vllm_mlx/api/models.py +++ b/vllm_mlx/api/models.py @@ -376,6 +376,9 @@ class ChatCompletionChunkDelta(BaseModel): role: Optional[str] = None content: Optional[str] = None tool_calls: Optional[List[dict]] = None + reasoning: Optional[str] = ( + None # For reasoning/thinking content (Qwen3, DeepSeek-R1) + ) class ChatCompletionChunkChoice(BaseModel): diff --git a/vllm_mlx/cli.py b/vllm_mlx/cli.py index d5718247..daca60e0 100644 --- a/vllm_mlx/cli.py +++ b/vllm_mlx/cli.py @@ -53,6 +53,12 @@ def serve_command(args): server._enable_auto_tool_choice = False server._tool_call_parser = None + # Configure generation defaults + if args.default_temperature is not None: + server._default_temperature = args.default_temperature + if args.default_top_p is not None: + server._default_top_p = args.default_top_p + # Security summary at startup print("=" * 60) print("SECURITY CONFIGURATION") @@ -511,14 +517,28 @@ def main(): "nemotron", "xlam", "functionary", + "glm47", ], help=( "Select the tool call parser for the model. Options: " "auto (auto-detect), mistral, qwen, llama, hermes, deepseek, " - "kimi, granite, nemotron, xlam, functionary. " + "kimi, granite, nemotron, xlam, functionary, glm47. " "Required for --enable-auto-tool-choice." 
), ) + # Generation defaults + serve_parser.add_argument( + "--default-temperature", + type=float, + default=None, + help="Default temperature for generation when not specified in request", + ) + serve_parser.add_argument( + "--default-top-p", + type=float, + default=None, + help="Default top_p for generation when not specified in request", + ) # Bench command bench_parser = subparsers.add_parser("bench", help="Run benchmark") diff --git a/vllm_mlx/server.py b/vllm_mlx/server.py index cd37165f..39fffcfd 100644 --- a/vllm_mlx/server.py +++ b/vllm_mlx/server.py @@ -107,6 +107,8 @@ _model_name: str | None = None _default_max_tokens: int = 32768 _default_timeout: float = 300.0 # Default request timeout in seconds (5 minutes) +_default_temperature: float | None = None # Set via --default-temperature +_default_top_p: float | None = None # Set via --default-top-p # Global MCP manager _mcp_manager = None @@ -738,8 +740,14 @@ async def create_completion(request: CompletionRequest): engine.generate( prompt=prompt, max_tokens=request.max_tokens or _default_max_tokens, - temperature=request.temperature, - top_p=request.top_p, + temperature=( + request.temperature + if request.temperature is not None + else _default_temperature + ), + top_p=( + request.top_p if request.top_p is not None else _default_top_p + ), stop=request.stop, ), timeout=timeout, @@ -856,8 +864,12 @@ async def create_chat_completion(request: ChatCompletionRequest): # Prepare kwargs chat_kwargs = { "max_tokens": request.max_tokens or _default_max_tokens, - "temperature": request.temperature, - "top_p": request.top_p, + "temperature": ( + request.temperature + if request.temperature is not None + else _default_temperature + ), + "top_p": request.top_p if request.top_p is not None else _default_top_p, } # Add multimodal content @@ -989,8 +1001,12 @@ async def stream_completion( async for output in engine.stream_generate( prompt=prompt, max_tokens=request.max_tokens or _default_max_tokens, - 
temperature=request.temperature, - top_p=request.top_p, + temperature=( + request.temperature + if request.temperature is not None + else _default_temperature + ), + top_p=request.top_p if request.top_p is not None else _default_top_p, stop=request.stop, ): data = { @@ -1020,6 +1036,8 @@ async def stream_chat_completion( **kwargs, ) -> AsyncIterator[str]: """Stream chat completion response.""" + global _tool_parser_instance + response_id = f"chatcmpl-{uuid.uuid4().hex[:8]}" # Check if we should include usage in the final chunk @@ -1046,7 +1064,7 @@ async def stream_chat_completion( if _reasoning_parser: _reasoning_parser.reset_state() - # Track accumulated text for reasoning parser + # Track accumulated text for reasoning parser and tool call parsing accumulated_text = "" # Track token counts for usage reporting @@ -1054,10 +1072,39 @@ async def stream_chat_completion( completion_tokens = 0 last_output = None + # Tool call streaming state + tool_call_enabled = ( + _enable_auto_tool_choice + and _tool_call_parser + and request.tools # Only parse if tools were provided in request + ) + tool_calls_emitted = False + + # Initialize tool parser if needed for streaming + if tool_call_enabled and _tool_parser_instance is None: + try: + parser_cls = ToolParserManager.get_tool_parser(_tool_call_parser) + tokenizer = None + if _engine is not None and hasattr(_engine, "_tokenizer"): + tokenizer = _engine._tokenizer + _tool_parser_instance = parser_cls(tokenizer) + logger.info( + f"Initialized tool call parser for streaming: {_tool_call_parser}" + ) + except Exception as e: + logger.warning(f"Failed to initialize tool parser for streaming: {e}") + tool_call_enabled = False + + # Reset tool parser state for this stream + if tool_call_enabled and _tool_parser_instance: + _tool_parser_instance.reset() + # Stream content async for output in engine.stream_chat(messages=messages, **kwargs): delta_text = output.new_text last_output = output + previous_text = accumulated_text + 
accumulated_text += delta_text # Track token counts from output (updated each chunk) if hasattr(output, "prompt_tokens") and output.prompt_tokens: @@ -1067,8 +1114,6 @@ # Use reasoning parser if enabled if _reasoning_parser and delta_text: - previous_text = accumulated_text - accumulated_text += delta_text delta_msg = _reasoning_parser.extract_reasoning_streaming( previous_text, accumulated_text, delta_text ) @@ -1092,8 +1137,62 @@ usage=get_usage(output) if output.finished else None, ) yield f"data: {chunk.model_dump_json()}\n\n" + elif tool_call_enabled and _tool_parser_instance: + # Tool call parsing path + streaming_result = _tool_parser_instance.extract_tool_calls_streaming( + previous_text=previous_text, + current_text=accumulated_text, + delta_text=delta_text, + ) + + if streaming_result is None: + # Buffering - inside tool call, don't emit yet + continue + + if "tool_calls" in streaming_result and streaming_result["tool_calls"]: + # Emit tool calls chunk + chunk = ChatCompletionChunk( + id=response_id, + model=request.model, + choices=[ + ChatCompletionChunkChoice( + delta=ChatCompletionChunkDelta( + tool_calls=streaming_result["tool_calls"], + ), + finish_reason="tool_calls", + ) + ], + usage=get_usage(output) if output.finished else None, + ) + yield f"data: {chunk.model_dump_json()}\n\n" + tool_calls_emitted = True + elif "content" in streaming_result and streaming_result["content"]: + # Emit content chunk + content = streaming_result["content"] + + # Add prefix on first content chunk for thinking models + if is_thinking_model and not think_prefix_sent and content: + content = "<think>" + content + think_prefix_sent = True + + chunk = ChatCompletionChunk( + id=response_id, + model=request.model, + choices=[ + ChatCompletionChunkChoice( + delta=ChatCompletionChunkDelta( + content=content, + ), + finish_reason=( + output.finish_reason if output.finished else None + ), + ) + ], + 
usage=get_usage(output) if output.finished else None, + ) + yield f"data: {chunk.model_dump_json()}\n\n" else: - # Standard path without reasoning parsing + # Standard path without reasoning or tool call parsing content = delta_text # Add prefix on first content chunk for thinking models @@ -1116,6 +1215,48 @@ ) yield f"data: {chunk.model_dump_json()}\n\n" + # If tool call parsing is enabled but no tool_calls were emitted during streaming, + # check final accumulated text for tool calls (handles cases where <tool_call> + # wasn't detected during streaming) + if tool_call_enabled and _tool_parser_instance and not tool_calls_emitted: + final_result = _tool_parser_instance.extract_tool_calls(accumulated_text) + if final_result.tools_called: + tool_calls_list = [ + { + "index": i, + "id": tc["id"], + "type": "function", + "function": { + "name": tc["name"], + "arguments": tc["arguments"], + }, + } + for i, tc in enumerate(final_result.tool_calls) + ] + chunk = ChatCompletionChunk( + id=response_id, + model=request.model, + choices=[ + ChatCompletionChunkChoice( + delta=ChatCompletionChunkDelta( + tool_calls=tool_calls_list, + ), + finish_reason="tool_calls", + ) + ], + usage=( + Usage( + prompt_tokens=prompt_tokens, + completion_tokens=completion_tokens, + total_tokens=prompt_tokens + completion_tokens, + ) + if last_output and last_output.finished + else None + ), + ) + yield f"data: {chunk.model_dump_json()}\n\n" + tool_calls_emitted = True + # Send final chunk with usage if requested if include_usage: usage_chunk = ChatCompletionChunk( @@ -1248,13 +1389,42 @@ choices=["qwen3", "deepseek_r1"], help="Enable reasoning content extraction with specified parser", ) + parser.add_argument( + "--default-temperature", + type=float, + default=None, + help="Default temperature for generation when not specified in request", + ) + parser.add_argument( + "--default-top-p", + type=float, + default=None, + help="Default top_p for generation when not 
specified in request", + ) + parser.add_argument( + "--enable-auto-tool-choice", + action="store_true", + help="Enable automatic tool call parsing for models that support it", + ) + parser.add_argument( + "--tool-call-parser", + type=str, + default=None, + choices=ToolParserManager.list_registered(), + help="Tool call parser to use (requires --enable-auto-tool-choice)", + ) args = parser.parse_args() # Set global configuration global _api_key, _default_timeout, _rate_limiter + global _default_temperature, _default_top_p _api_key = args.api_key _default_timeout = args.timeout + if args.default_temperature is not None: + _default_temperature = args.default_temperature + if args.default_top_p is not None: + _default_top_p = args.default_top_p # Configure rate limiter if args.rate_limit > 0: @@ -1291,6 +1461,16 @@ def main(): _reasoning_parser = parser_cls() logger.info(f"Reasoning parser enabled: {args.reasoning_parser}") + # Configure tool call parsing + global _enable_auto_tool_choice, _tool_call_parser + if args.enable_auto_tool_choice: + _enable_auto_tool_choice = True + _tool_call_parser = args.tool_call_parser + if _tool_call_parser: + logger.info(f"Tool call parsing enabled with parser: {_tool_call_parser}") + else: + logger.info("Tool call parsing enabled with auto-detection") + # Load model before starting server load_model( args.model, diff --git a/vllm_mlx/tool_parsers/__init__.py b/vllm_mlx/tool_parsers/__init__.py index 15d85f64..5a857448 100644 --- a/vllm_mlx/tool_parsers/__init__.py +++ b/vllm_mlx/tool_parsers/__init__.py @@ -17,6 +17,7 @@ - nemotron/nemotron3: NVIDIA Nemotron models - xlam: Salesforce xLAM models - functionary/meetkai: MeetKai Functionary models +- glm47/glm4: GLM-4.7 and GLM-4.7-Flash models Usage: from vllm_mlx.tool_parsers import ToolParserManager @@ -53,6 +54,7 @@ from .nemotron_tool_parser import NemotronToolParser from .qwen_tool_parser import QwenToolParser from .xlam_tool_parser import xLAMToolParser +from .glm47_tool_parser 
import Glm47ToolParser __all__ = [ # Base classes @@ -71,4 +73,5 @@ "NemotronToolParser", "xLAMToolParser", "FunctionaryToolParser", + "Glm47ToolParser", ] diff --git a/vllm_mlx/tool_parsers/glm47_tool_parser.py b/vllm_mlx/tool_parsers/glm47_tool_parser.py new file mode 100644 index 00000000..03facf59 --- /dev/null +++ b/vllm_mlx/tool_parsers/glm47_tool_parser.py @@ -0,0 +1,176 @@ +# SPDX-License-Identifier: Apache-2.0 +""" +GLM-4.7 tool call parser for vllm-mlx. + +Handles GLM-4.7-Flash style tool calling format. +Based on vLLM's glm47_moe_tool_parser.py +""" + +import json +import re +import uuid +from collections.abc import Sequence +from typing import Any + +from .abstract_tool_parser import ( + ExtractedToolCallInformation, + ToolParser, + ToolParserManager, +) + + +def generate_tool_id() -> str: + """Generate a unique tool call ID.""" + return f"call_{uuid.uuid4().hex[:8]}" + + +@ToolParserManager.register_module(["glm47", "glm4"]) +class Glm47ToolParser(ToolParser): + """ + Tool call parser for GLM-4.7 and GLM-4.7-Flash models. + + Supports GLM-4.7 tool call format: + <tool_call>function_name + <arg_key>param1</arg_key><arg_value>value1</arg_value> + <arg_key>param2</arg_key><arg_value>value2</arg_value> + </tool_call> + + Used when --enable-auto-tool-choice --tool-call-parser glm47 are set. + """ + + # Match entire tool call block + TOOL_CALL_PATTERN = re.compile(r"<tool_call>(.*?)</tool_call>", re.DOTALL) + + # Match function name and optional arguments + # GLM47 format: <tool_call>func_name\n<arg_key>...</arg_key><arg_value>...</arg_value></tool_call> 
+ FUNC_DETAIL_PATTERN = re.compile( + r"<tool_call>\s*([^\n<]+?)(?:\n|\s*)(.*?)?</tool_call>", re.DOTALL + ) + + # Match individual argument key-value pairs + ARG_PATTERN = re.compile( + r"<arg_key>\s*(.*?)\s*</arg_key><arg_value>\s*(.*?)</arg_value>", re.DOTALL + ) + + # Match thinking tags to remove from output + THINK_PATTERN = re.compile(r"<think>.*?</think>", re.DOTALL) + + def _deserialize(self, value: str) -> Any: + """Convert string value to appropriate Python type.""" + value = value.strip() + + # Try JSON first + try: + return json.loads(value) + except json.JSONDecodeError: + pass + + # Try as Python literal + try: + import ast + + return ast.literal_eval(value) + except (ValueError, SyntaxError): + pass + + # Return as string + return value + + def extract_tool_calls( + self, model_output: str, request: dict[str, Any] | None = None + ) -> ExtractedToolCallInformation: + """ + Extract tool calls from a complete GLM-4.7 model response. + """ + tool_calls = [] + cleaned_text = model_output + + # Remove thinking tags first + cleaned_text = self.THINK_PATTERN.sub("", cleaned_text) + + # Find all tool call blocks + matches = self.FUNC_DETAIL_PATTERN.findall(cleaned_text) + + for match in matches: + func_name = match[0].strip() if match[0] else "" + args_section = match[1] if len(match) > 1 and match[1] else "" + + if not func_name: + continue + + # Parse arguments + arguments = {} + if args_section: + arg_matches = self.ARG_PATTERN.findall(args_section) + for arg_key, arg_value in arg_matches: + key = arg_key.strip() + value = self._deserialize(arg_value) + if key: + arguments[key] = value + + tool_calls.append( + { + "id": generate_tool_id(), + "name": func_name, + "arguments": json.dumps(arguments, ensure_ascii=False), + } + ) + + # When tool calls are found, don't return reasoning text as content + # GLM often outputs thinking/reasoning before tool calls without tags + if tool_calls: + return ExtractedToolCallInformation( + tools_called=True, + tool_calls=tool_calls, + content=None, # Don't include reasoning text when making tool 
calls + ) + else: + # Remove thinking from final output even if no tool calls + cleaned_text = self.THINK_PATTERN.sub("", model_output).strip() + return ExtractedToolCallInformation( + tools_called=False, tool_calls=[], content=cleaned_text + ) + + def extract_tool_calls_streaming( + self, + previous_text: str, + current_text: str, + delta_text: str, + previous_token_ids: Sequence[int] | None = None, + current_token_ids: Sequence[int] | None = None, + delta_token_ids: Sequence[int] | None = None, + request: dict[str, Any] | None = None, + ) -> dict[str, Any] | None: + """ + Extract tool calls from streaming GLM-4.7 model output. + """ + # Skip thinking content in streaming + if "<think>" in current_text and "</think>" not in current_text: + return None + + if "<tool_call>" not in current_text: + # Remove thinking tags from delta + clean_delta = self.THINK_PATTERN.sub("", delta_text) + if clean_delta: + return {"content": clean_delta} + return None + + if "</tool_call>" in delta_text: + result = self.extract_tool_calls(current_text) + if result.tools_called: + return { + "tool_calls": [ + { + "index": i, + "id": tc["id"], + "type": "function", + "function": { + "name": tc["name"], + "arguments": tc["arguments"], + }, + } + for i, tc in enumerate(result.tool_calls) + ] + } + + return None