code rabbit fixes and additional tests

thoraxe · thoraxe · commit 8c0a1d0af7ff · 2025-08-14T13:29:21.000-04:00
diff --git a/src/app/endpoints/query.py b/src/app/endpoints/query.py
@@ -43,7 +43,7 @@
 router = APIRouter(tags=["query"])
 auth_dependency = get_auth_dependency()
 
-METADATA_PATTERN = re.compile(r"\nMetadata: (\{.+})\n")
+METADATA_PATTERN = re.compile(r"^Metadata:\s*(\{.*?\})\s*$", re.MULTILINE)
 
 
 def _process_knowledge_search_content(
@@ -78,28 +78,20 @@ def extract_referenced_documents_from_steps(steps: list) -> list[dict[str, str]]
     metadata_map: dict[str, dict[str, Any]] = {}
 
     for step in steps:
-        if step.step_type != "tool_execution" or not hasattr(step, "tool_responses"):
+        if getattr(step, "step_type", "") != "tool_execution" or not hasattr(step, "tool_responses"):
             continue
 
-        for tool_response in step.tool_responses:
-            if (
-                tool_response.tool_name != "knowledge_search"
-                or not tool_response.content
-            ):
+        for tool_response in getattr(step, "tool_responses", []) or []:
+            if getattr(tool_response, "tool_name", "") != "knowledge_search" or not getattr(tool_response, "content", []):
                 continue
 
             _process_knowledge_search_content(tool_response, metadata_map)
 
     # Extract referenced documents from metadata
     return [
-        {
-            "doc_url": v["docs_url"],
-            "doc_title": v["title"],
-        }
-        for v in filter(
-            lambda v: ("docs_url" in v) and ("title" in v),
-            metadata_map.values(),
-        )
+        {"doc_url": v["docs_url"], "doc_title": v["title"]}
+        for v in metadata_map.values()
+        if "docs_url" in v and "title" in v
     ]
 
 
@@ -480,10 +472,9 @@ async def retrieve_response(  # pylint: disable=too-many-locals
     # Check for validation errors and extract referenced documents
     steps = getattr(response, "steps", [])
     for step in steps:
-        if step.step_type == "shield_call" and step.violation:
+        if getattr(step, "step_type", "") == "shield_call" and getattr(step, "violation", False):
             # Metric for LLM validation errors
             metrics.llm_calls_validation_errors_total.inc()
-            break
 
     # Extract referenced documents from tool execution steps
     referenced_documents = extract_referenced_documents_from_steps(steps)
diff --git a/tests/unit/app/endpoints/test_query.py b/tests/unit/app/endpoints/test_query.py
@@ -22,6 +22,7 @@
     store_transcript,
     get_rag_toolgroups,
     evaluate_model_hints,
+    _process_knowledge_search_content,
 )
 
 from models.requests import QueryRequest, Attachment
@@ -1583,3 +1584,141 @@ def test_evaluate_model_hints(
 
     assert provider_id == expected_provider
     assert model_id == expected_model
+
+
+def test_process_knowledge_search_content_with_valid_metadata(mocker):
+    """Test _process_knowledge_search_content with valid metadata."""
+    # Mock tool response with valid metadata
+    text_content_item = mocker.Mock()
+    text_content_item.text = """Result 1
+Content: Test content
+Metadata: {'docs_url': 'https://example.com/doc1', 'title': 'Test Doc', 'document_id': 'doc-1'}
+"""
+    
+    tool_response = mocker.Mock()
+    tool_response.content = [text_content_item]
+    
+    metadata_map = {}
+    
+    _process_knowledge_search_content(tool_response, metadata_map)
+    
+    # Verify metadata was correctly parsed and added
+    assert "doc-1" in metadata_map
+    assert metadata_map["doc-1"]["docs_url"] == "https://example.com/doc1"
+    assert metadata_map["doc-1"]["title"] == "Test Doc"
+    assert metadata_map["doc-1"]["document_id"] == "doc-1"
+
+
+def test_process_knowledge_search_content_with_invalid_metadata_syntax_error(mocker):
+    """Test _process_knowledge_search_content handles SyntaxError from invalid metadata."""
+    mock_logger = mocker.patch("app.endpoints.query.logger")
+    
+    # Mock tool response with invalid metadata (invalid Python syntax)
+    text_content_item = mocker.Mock()
+    text_content_item.text = """Result 1
+Content: Test content
+Metadata: {'docs_url': 'https://example.com/doc1' 'title': 'Test Doc', 'document_id': 'doc-1'}
+"""  # Missing comma between 'doc1' and 'title' - will cause SyntaxError
+    
+    tool_response = mocker.Mock()
+    tool_response.content = [text_content_item]
+    
+    metadata_map = {}
+    
+    _process_knowledge_search_content(tool_response, metadata_map)
+    
+    # Verify metadata_map remains empty due to exception
+    assert len(metadata_map) == 0
+    
+    # Verify debug logging was called
+    mock_logger.debug.assert_called_once()
+    args = mock_logger.debug.call_args[0]
+    assert "An exception was thrown in processing" in args[0]
+
+
+def test_process_knowledge_search_content_with_invalid_metadata_value_error(mocker):
+    """Test _process_knowledge_search_content handles ValueError from invalid metadata."""
+    mock_logger = mocker.patch("app.endpoints.query.logger")
+    
+    # Mock tool response with invalid metadata containing complex expressions
+    text_content_item = mocker.Mock()
+    text_content_item.text = """Result 1
+Content: Test content
+Metadata: {func_call(): 'value', 'title': 'Test Doc', 'document_id': 'doc-1'}
+"""  # Function call in dict - will cause ValueError since it's not a literal
+    
+    tool_response = mocker.Mock()
+    tool_response.content = [text_content_item]
+    
+    metadata_map = {}
+    
+    _process_knowledge_search_content(tool_response, metadata_map)
+    
+    # Verify metadata_map remains empty due to exception
+    assert len(metadata_map) == 0
+    
+    # Verify debug logging was called
+    mock_logger.debug.assert_called_once()
+    args = mock_logger.debug.call_args[0]
+    assert "An exception was thrown in processing" in args[0]
+
+
+def test_process_knowledge_search_content_with_non_dict_metadata(mocker):
+    """Test _process_knowledge_search_content handles non-dict metadata gracefully."""
+    mock_logger = mocker.patch("app.endpoints.query.logger")
+    
+    # Mock tool response with metadata that's not a dict
+    text_content_item = mocker.Mock()
+    text_content_item.text = """Result 1
+Content: Test content
+Metadata: "just a string"
+"""
+    
+    tool_response = mocker.Mock()
+    tool_response.content = [text_content_item]
+    
+    metadata_map = {}
+    
+    _process_knowledge_search_content(tool_response, metadata_map)
+    
+    # Verify metadata_map remains empty (no document_id in string)
+    assert len(metadata_map) == 0
+    
+    # No exception should be logged since string is valid literal
+    mock_logger.debug.assert_not_called()
+
+
+def test_process_knowledge_search_content_with_metadata_missing_document_id(mocker):
+    """Test _process_knowledge_search_content skips metadata without document_id."""
+    # Mock tool response with valid metadata but missing document_id
+    text_content_item = mocker.Mock()
+    text_content_item.text = """Result 1
+Content: Test content
+Metadata: {'docs_url': 'https://example.com/doc1', 'title': 'Test Doc'}
+"""  # No document_id field
+    
+    tool_response = mocker.Mock()
+    tool_response.content = [text_content_item]
+    
+    metadata_map = {}
+    
+    _process_knowledge_search_content(tool_response, metadata_map)
+    
+    # Verify metadata_map remains empty since document_id is missing
+    assert len(metadata_map) == 0
+
+
+def test_process_knowledge_search_content_with_no_text_attribute(mocker):
+    """Test _process_knowledge_search_content skips content items without text attribute."""
+    # Mock tool response with content item that has no text attribute
+    text_content_item = mocker.Mock(spec=[])  # spec=[] means no attributes
+    
+    tool_response = mocker.Mock()
+    tool_response.content = [text_content_item]
+    
+    metadata_map = {}
+    
+    _process_knowledge_search_content(tool_response, metadata_map)
+    
+    # Verify metadata_map remains empty since text attribute is missing
+    assert len(metadata_map) == 0