python/src/server/services/embeddings/contextual_embedding_service.py
@@ -10,7 +10,7 @@
 import openai

 from ...config.logfire_config import search_logger
-from ..llm_provider_service import get_llm_client
+from ..llm_provider_service import get_llm_client, prepare_llm_params
 from ..threading_service import get_threading_service


@@ -65,6 +65,10 @@ async def generate_contextual_embedding(
     # Get model from provider configuration
     model = await _get_model_choice(provider)

+    # Prepare compatible parameters for the API call
+    params = prepare_llm_params(provider or "openai", model,
+                                temperature=0.3, max_tokens=200)
+
Comment on lines +68 to +71
⚠️ Potential issue

🧩 Analysis chain

LGTM: Params now flow through compatibility shim

Passing params via prepare_llm_params before chat.completions.create is the right direction. This will work once prepare_llm_params gates max_tokens only for reasoning models as suggested.

To catch any lingering direct uses, run:

Also applies to: 81-81


🏁 Script executed:

#!/bin/bash
# Find chat.completions calls still passing max_tokens/temperature directly
rg -nP -C2 --type=py '\bchat\.completions\.create\('
rg -nP --type=py '\bmax_tokens\s*='
rg -nP --type=py '\btemperature\s*='

Length of output: 8943


Fix remaining direct max_tokens/temperature usages before approving

Direct calls still pass max_tokens/temperature to client.chat.completions.create — switch these to use prepare_llm_params (and gate max_tokens only for reasoning models), then re-run the verification script; a migration sketch follows the list below.

  • python/src/server/services/embeddings/contextual_embedding_service.py:70 (temperature=0.3, max_tokens=200)
  • python/src/server/services/embeddings/contextual_embedding_service.py:195 (temperature=0, max_tokens=100 * len(chunks))
  • python/src/server/services/storage/code_storage_service.py:561 (max_tokens=500, temperature=0.3)
  • python/src/server/services/provider_discovery_service.py:154 (max_tokens=50)
  • python/src/server/services/ollama/model_discovery_service.py:587 (max_tokens=1)
  • python/src/server/services/ollama/model_discovery_service.py:611 (max_tokens=10)
  • python/src/server/services/ollama/model_discovery_service.py:613 (temperature=0.1)
  • python/src/server/services/ollama/model_discovery_service.py:670 (max_tokens=1)
  • python/src/server/services/ollama/model_discovery_service.py:844 (max_tokens=50)
  • python/src/server/services/ollama/model_discovery_service.py:878 (max_tokens=100)
  • python/src/server/api_routes/ollama_api.py:883 (max_tokens=50)
  • python/src/server/api_routes/ollama_api.py:928 (max_tokens=100)
  • python/src/server/api_routes/ollama_api.py:930 (temperature=0.1)
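For example, the first listed call site could be migrated with the same pattern as the diffs above (a hypothetical sketch; the surrounding code in provider_discovery_service.py is not shown in this PR view, so the variable names here are assumptions):

# Before: direct kwargs on the API call
# response = await client.chat.completions.create(model=model, messages=messages, max_tokens=50)

# After: route through the compatibility shim first
params = prepare_llm_params(provider or "openai", model, max_tokens=50)
response = await client.chat.completions.create(model=model, messages=messages, **params)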
🤖 Prompt for AI Agents
In python/src/server/services/embeddings/contextual_embedding_service.py around
lines 68-71, the call is directly passing temperature=0.3 and max_tokens=200;
replace these direct arguments with a call to prepare_llm_params(provider or
"openai", model, ...) and merge the returned params into the client call, and
ensure max_tokens is only included when the model is a reasoning-type model
(gate it behind the existing reasoning-model check) so that temperature and
max_tokens are not passed directly to client.chat.completions.create; apply the
same pattern to the other listed locations and re-run the verification script.

     response = await client.chat.completions.create(
         model=model,
         messages=[
@@ -74,8 +78,7 @@ async def generate_contextual_embedding(
             },
             {"role": "user", "content": prompt},
         ],
-        temperature=0.3,
-        max_tokens=200,
+        **params
     )

     context = response.choices[0].message.content.strip()
@@ -122,7 +125,7 @@ async def _get_model_choice(provider: str | None = None) -> str:
     # Handle empty model case - fallback to provider-specific defaults or explicit config
     if not model:
         search_logger.warning(f"chat_model is empty for provider {provider_name}, using fallback logic")
-
+
         if provider_name == "ollama":
             # Try to get OLLAMA_CHAT_MODEL specifically
             try:
@@ -143,7 +146,7 @@ async def _get_model_choice(provider: str | None = None) -> str:
         else:
             # OpenAI or other providers
             model = "gpt-4o-mini"
-
+
     search_logger.debug(f"Using model from credential service: {model}")

     return model
@@ -187,6 +190,10 @@ async def generate_contextual_embeddings_batch(

     batch_prompt += "For each chunk, provide a short succinct context to situate it within the overall document for improving search retrieval. Format your response as:\nCHUNK 1: [context]\nCHUNK 2: [context]\netc."

+    # Prepare compatible parameters for the API call
+    params = prepare_llm_params(provider or "openai", model_choice,
+                                temperature=0, max_tokens=100 * len(chunks))
+
     # Make single API call for ALL chunks
     response = await client.chat.completions.create(
         model=model_choice,
@@ -197,8 +204,7 @@ async def generate_contextual_embeddings_batch(
             },
             {"role": "user", "content": batch_prompt},
         ],
-        temperature=0,
-        max_tokens=100 * len(chunks),  # Limit response size
+        **params
     )

     # Parse response
@@ -245,4 +251,4 @@ async def generate_contextual_embeddings_batch(
     except Exception as e:
         search_logger.error(f"Error in contextual embedding batch: {e}")
         # Return non-contextual for all chunks
-        return [(chunk, False) for chunk in chunks]
\ No newline at end of file
+        return [(chunk, False) for chunk in chunks]
44 changes: 44 additions & 0 deletions python/src/server/services/llm_provider_service.py
@@ -383,3 +383,47 @@ async def validate_provider_instance(provider: str, instance_url: str | None = N
             "error_message": str(e),
             "validation_timestamp": time.time()
         }
+
+
+def prepare_llm_params(provider: str, model: str, **kwargs) -> dict:
+    """
+    Prepare LLM API parameters with automatic compatibility handling.
+
+    Handles:
+    - OpenAI max_tokens → max_completion_tokens deprecation
+    - Reasoning model temperature exclusion (o1, gpt-5 series)
+
+    Args:
+        provider: LLM provider name (openai, ollama, google)
+        model: Model name to check for special requirements
+        **kwargs: Original API parameters
+
+    Returns:
+        dict: Compatible parameters ready for API call
+    """
+    params = kwargs.copy()
+
+    # Handle OpenAI parameter deprecation
+    if provider == "openai" and "max_tokens" in params:
+        params["max_completion_tokens"] = params.pop("max_tokens")
+
+    # Handle reasoning model restrictions
+    if model and _is_reasoning_model(model):
+        params.pop("temperature", None)
+
+    return params
+
Comment on lines +388 to +415
⚠️ Potential issue

Blocker: Don’t remap max_tokens globally; gate by reasoning models and drop unsupported params

Unconditionally converting max_tokens → max_completion_tokens risks 400s on non‑reasoning chat models. For o‑series/GPT‑5 reasoning models, use max_completion_tokens and strip unsupported params (temperature/top_p/penalties/logprobs/logit_bias/n). For non‑reasoning chat models, keep max_tokens (or translate back if only max_completion_tokens was supplied). This aligns with OpenAI guidance and observed API behavior. (community.openai.com)

Apply this diff:

-def prepare_llm_params(provider: str, model: str, **kwargs) -> dict:
+def prepare_llm_params(provider: str, model: str, **kwargs) -> dict[str, Any]:
@@
-    params = kwargs.copy()
-
-    # Handle OpenAI parameter deprecation
-    if provider == "openai" and "max_tokens" in params:
-        params["max_completion_tokens"] = params.pop("max_tokens")
-
-    # Handle reasoning model restrictions
-    if model and _is_reasoning_model(model):
-        params.pop("temperature", None)
-
-    return params
+    params = dict(kwargs)
+    is_reasoning = bool(model and _is_reasoning_model(model))
+
+    # OpenAI compatibility handling
+    if provider == "openai":
+        if is_reasoning:
+            # o‑/GPT‑5 reasoning models: require max_completion_tokens; drop unsupported knobs
+            if "max_tokens" in params and "max_completion_tokens" not in params:
+                params["max_completion_tokens"] = params.pop("max_tokens")
+            for k in (
+                "temperature",
+                "top_p",
+                "presence_penalty",
+                "frequency_penalty",
+                "logprobs",
+                "top_logprobs",
+                "logit_bias",
+                "n",
+            ):
+                params.pop(k, None)
+        else:
+            # Legacy/non‑reasoning chat models: prefer max_tokens for broad compatibility
+            if "max_completion_tokens" in params and "max_tokens" not in params:
+                params["max_tokens"] = params.pop("max_completion_tokens")
+
+    return params
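To make the gating concrete, here is a minimal sketch of the behavior the suggested diff implies (the import path and model names are illustrative, not taken from this PR):

from ..llm_provider_service import prepare_llm_params  # illustrative import path

# Non-reasoning chat model: parameters pass through unchanged.
assert prepare_llm_params("openai", "gpt-4o-mini", temperature=0.3, max_tokens=200) == {
    "temperature": 0.3,
    "max_tokens": 200,
}

# Reasoning model: max_tokens is remapped and unsupported knobs are stripped.
assert prepare_llm_params("openai", "o1-mini", temperature=0.3, max_tokens=200, top_p=0.9) == {
    "max_completion_tokens": 200,
}

# Non-OpenAI providers are left untouched.
assert prepare_llm_params("ollama", "llama3.1", temperature=0.1, max_tokens=50) == {
    "temperature": 0.1,
    "max_tokens": 50,
}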


+def _is_reasoning_model(model: str) -> bool:
+    """
+    Check if model is a reasoning model that doesn't support custom temperature.
+
+    Args:
+        model: Model name to check
+
+    Returns:
+        True if model is a reasoning model, False otherwise
+    """
+    reasoning_patterns = ["o1", "o1-preview", "o1-mini", "gpt-5"]
+    model_lower = model.lower()
+    return any(pattern in model_lower for pattern in reasoning_patterns)
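As merged, the helper remaps max_tokens for every OpenAI model, which is exactly what the review comment above flags as a risk of 400s on non-reasoning chat models. A minimal sketch of the current behavior (model names illustrative):

# Non-reasoning model: max_tokens is still remapped unconditionally.
prepare_llm_params("openai", "gpt-4o-mini", temperature=0.3, max_tokens=200)
# → {"temperature": 0.3, "max_completion_tokens": 200}

# Reasoning model: "gpt-5" matches a reasoning pattern, so temperature is dropped too.
prepare_llm_params("openai", "gpt-5-mini", temperature=0.3, max_tokens=200)
# → {"max_completion_tokens": 200}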
27 changes: 15 additions & 12 deletions python/src/server/services/storage/code_storage_service.py
@@ -507,7 +507,7 @@ def generate_code_example_summary(
         A dictionary with 'summary' and 'example_name'
     """
     import asyncio
-
+
     # Run the async version in the current thread
     return asyncio.run(_generate_code_example_summary_async(code, context_before, context_after, language, provider))

@@ -518,8 +518,8 @@ async def _generate_code_example_summary_async(
     """
     Async version of generate_code_example_summary using unified LLM provider service.
     """
-    from ..llm_provider_service import get_llm_client
+    from ..llm_provider_service import get_llm_client, prepare_llm_params

     # Get model choice from credential service (RAG setting)
     model_choice = _get_model_choice()

@@ -555,7 +555,11 @@ async def _generate_code_example_summary_async(
         search_logger.info(
             f"Generating summary for {hash(code) & 0xffffff:06x} using model: {model_choice}"
         )
-
+
+        # Prepare compatible parameters for the API call
+        params = prepare_llm_params(provider or "openai", model_choice,
+                                    max_tokens=500, temperature=0.3)
+
         response = await client.chat.completions.create(
             model=model_choice,
             messages=[
@@ -566,8 +570,7 @@ async def _generate_code_example_summary_async(
                 {"role": "user", "content": prompt},
             ],
             response_format={"type": "json_object"},
-            max_tokens=500,
-            temperature=0.3,
+            **params
         )

         response_content = response.choices[0].message.content.strip()
@@ -848,14 +851,14 @@ async def add_code_examples_to_supabase(
             # Use only successful embeddings
             valid_embeddings = result.embeddings
             successful_texts = result.texts_processed

             # Get model information for tracking
-            from ..llm_provider_service import get_embedding_model
             from ..credential_service import credential_service

+            from ..llm_provider_service import get_embedding_model
+
             # Get embedding model name
             embedding_model_name = await get_embedding_model(provider=provider)

             # Get LLM chat model (used for code summaries and contextual embeddings if enabled)
             llm_chat_model = None
             try:
@@ -908,7 +911,7 @@ async def add_code_examples_to_supabase(
             # Determine the correct embedding column based on dimension
             embedding_dim = len(embedding) if isinstance(embedding, list) else len(embedding.tolist())
             embedding_column = None
-
+
             if embedding_dim == 768:
                 embedding_column = "embedding_768"
             elif embedding_dim == 1024:
@@ -921,7 +924,7 @@ async def add_code_examples_to_supabase(
                 # Default to closest supported dimension
                 search_logger.warning(f"Unsupported embedding dimension {embedding_dim}, using embedding_1536")
                 embedding_column = "embedding_1536"
-
+
             batch_data.append({
                 "url": urls[idx],
                 "chunk_number": chunk_numbers[idx],