diff --git a/packages/sample-app/sample_app/experiment/made_by_traceloop/compliance_exp.py b/packages/sample-app/sample_app/experiment/made_by_traceloop/compliance_exp.py new file mode 100644 index 0000000000..f0ebda34a0 --- /dev/null +++ b/packages/sample-app/sample_app/experiment/made_by_traceloop/compliance_exp.py @@ -0,0 +1,106 @@ +""" +Content Compliance Evaluators Experiment + +This example demonstrates Traceloop's content compliance evaluators: +- Profanity Detection: Flags inappropriate language +- Toxicity Detection: Identifies toxic or harmful content +- Sexism Detection: Identifies sexist language or bias + +These evaluators help ensure AI-generated content is compliant with community guidelines. +""" + +import asyncio +import os +from openai import AsyncOpenAI +from traceloop.sdk import Traceloop +from traceloop.sdk.evaluator import EvaluatorMadeByTraceloop + +# Initialize Traceloop +client = Traceloop.init() + + +async def generate_response(prompt: str, temperature: float = 0.7) -> str: + """Generate a response using OpenAI""" + openai_client = AsyncOpenAI(api_key=os.getenv("OPENAI_API_KEY")) + + response = await openai_client.chat.completions.create( + model="gpt-3.5-turbo", + messages=[{"role": "user", "content": prompt}], + temperature=temperature, + max_tokens=200, + ) + + return response.choices[0].message.content + + +async def content_safety_task(row): + """ + Task function that generates content to be evaluated for safety. + Returns text that will be checked for profanity and toxicity. + """ + prompt = row.get("question", "") + + # Generate response + response = await generate_response(prompt) + + # Return data for safety evaluation + return { + "text": response, + "prompt": prompt, + } + + +async def run_content_compliance_experiment(): + """ + Run experiment with content compliance evaluators. + + This experiment evaluates: + 1. Profanity Detection - Flags inappropriate language + 2. Toxicity Detection - Identifies harmful or toxic content + """ + + print("\n" + "="*80) + print("CONTENT COMPLIANCE EVALUATORS EXPERIMENT") + print("="*80 + "\n") + + print("This experiment tests content compliance:\n") + print("1. Profanity Detection - Identifies inappropriate language") + print("2. 
Toxicity Detection - Detects harmful, aggressive, or toxic content") + print("\n" + "-"*80 + "\n") + + # Configure content compliance evaluators + evaluators = [ + EvaluatorMadeByTraceloop.profanity_detector(), + EvaluatorMadeByTraceloop.toxicity_detector(threshold=0.7), + EvaluatorMadeByTraceloop.sexism_detector(threshold=0.7), + ] + + print("Running experiment with content safety evaluators:") + for evaluator in evaluators: + config_str = ", ".join(f"{k}={v}" for k, v in evaluator.config.items() if k != "description") + print(f" - {evaluator.slug}") + if config_str: + print(f" Config: {config_str}") + + print("\n" + "-"*80 + "\n") + + # Run the experiment + results, errors = await client.experiment.run( + dataset_slug="content-compliance", # Set a dataset slug that exists in the traceloop platform + dataset_version="v1", + task=content_safety_task, + evaluators=evaluators, + experiment_slug="content-compliance-exp", + stop_on_error=False, + wait_for_results=True, + ) + + print("\n" + "="*80) + print("Content compliance experiment completed!") + print("="*80 + "\n") + + +if __name__ == "__main__": + print("\nContent Compliance Evaluators Experiment\n") + + asyncio.run(run_content_compliance_experiment()) diff --git a/packages/sample-app/sample_app/experiment/made_by_traceloop/correctness_exp.py b/packages/sample-app/sample_app/experiment/made_by_traceloop/correctness_exp.py new file mode 100644 index 0000000000..b99038974e --- /dev/null +++ b/packages/sample-app/sample_app/experiment/made_by_traceloop/correctness_exp.py @@ -0,0 +1,107 @@ +""" +Quality Evaluators Experiment + +This example demonstrates Traceloop's correctness evaluators: +- Answer Relevancy: Verifies responses address the query +- Faithfulness: Detects hallucinations and verifies facts + +These evaluators help ensure your AI applications provide accurate, +relevant, and faithful responses. +""" + +import asyncio +import os +from openai import AsyncOpenAI +from traceloop.sdk import Traceloop +from traceloop.sdk.evaluator import EvaluatorMadeByTraceloop + +# Initialize Traceloop +client = Traceloop.init() + + +async def generate_response(prompt: str, context: str = None) -> str: + """Generate a response using OpenAI""" + openai_client = AsyncOpenAI(api_key=os.getenv("OPENAI_API_KEY")) + + messages = [{"role": "user", "content": prompt}] + if context: + messages.insert(0, {"role": "system", "content": f"Context: {context}"}) + + response = await openai_client.chat.completions.create( + model="gpt-3.5-turbo", + messages=messages, + temperature=0.7, + max_tokens=200, + ) + + return response.choices[0].message.content + + +async def quality_task(row): + """ + Task function that processes questions with context. + Returns data that will be evaluated for quality and faithfulness. + """ + question = row.get("question", "This is a demo question") + context = row.get("context", "This is a demo context") + # Generate response + completion = await generate_response(question, context) + + # Return data for evaluation + return { + "question": question, + "answer": completion, + "completion": completion, + "context": context, + } + + +async def run_correctness_experiment(): + """ + Run experiment with correctness evaluators. + + This experiment will evaluate responses for: + 1. Answer Relevancy - Does the answer address the question? + 2. Faithfulness - Is the answer faithful to the provided context? 
+ """ + + print("\n" + "="*80) + print("CORRECTNESS EVALUATORS EXPERIMENT") + print("="*80 + "\n") + + print("This experiment will test two critical quality evaluators:\n") + print("1. Answer Relevancy - Verifies the response addresses the query") + print("2. Faithfulness - Detects hallucinations and verifies factual accuracy") + print("\n" + "-"*80 + "\n") + + # Configure correctness evaluators + evaluators = [ + EvaluatorMadeByTraceloop.answer_relevancy(), + EvaluatorMadeByTraceloop.faithfulness(), + ] + + print("Running experiment with evaluators:") + for evaluator in evaluators: + print(f" - {evaluator.slug}") + + print("\n" + "-"*80 + "\n") + + # Run the experiment + results, errors = await client.experiment.run( + dataset_slug="correctness", # Set a dataset slug that exists in the traceloop platform + dataset_version="v1", + task=quality_task, + evaluators=evaluators, + experiment_slug="correctness-evaluators-exp", + stop_on_error=False, + wait_for_results=True, + ) + + print("\n" + "="*80) + print("Correctness experiment completed!") + print("="*80 + "\n") + +if __name__ == "__main__": + print("\nCorrectness Evaluators Experiment\n") + + asyncio.run(run_correctness_experiment()) diff --git a/packages/sample-app/sample_app/experiment/made_by_traceloop/formatting_exp.py b/packages/sample-app/sample_app/experiment/made_by_traceloop/formatting_exp.py new file mode 100644 index 0000000000..57aebc0d5d --- /dev/null +++ b/packages/sample-app/sample_app/experiment/made_by_traceloop/formatting_exp.py @@ -0,0 +1,197 @@ +""" +Validation Evaluators Experiment + +This example demonstrates Traceloop's formatting evaluators: +- JSON Validation: Validates JSON format and optional schema compliance +- SQL Validation: Validates SQL query syntax +- Regex Validation: Validates text matches regex patterns +- Placeholder Regex: Validates dynamic text with placeholders + +These evaluators are essential for structured output validation, +ensuring AI-generated code, queries, and formatted data are correct. +""" + +import asyncio +import os +from openai import AsyncOpenAI +from traceloop.sdk import Traceloop +from traceloop.sdk.evaluator import EvaluatorMadeByTraceloop + +# Initialize Traceloop +client = Traceloop.init() + + +async def generate_structured_output(prompt: str, output_format: str) -> str: + """ + Generate structured output (JSON, SQL, etc.) using OpenAI. + + Args: + prompt: The user's request + output_format: Expected format (json, sql, regex, etc.) + """ + openai_client = AsyncOpenAI(api_key=os.getenv("OPENAI_API_KEY")) + + system_prompts = { + "json": "You are a helpful assistant that responds ONLY with valid JSON. No explanation, just JSON.", + "sql": "You are a SQL expert. Generate ONLY valid SQL queries. No explanation, just the SQL query.", + "regex": "You are a regex expert. Generate ONLY the regex pattern. No explanation, just the pattern.", + "text": "You are a helpful assistant. Generate the requested text content.", + } + + response = await openai_client.chat.completions.create( + model="gpt-3.5-turbo", + messages=[ + {"role": "system", "content": system_prompts.get(output_format, system_prompts["text"])}, + {"role": "user", "content": prompt} + ], + temperature=0.3, # Lower temperature for more structured outputs + max_tokens=300, + ) + + return response.choices[0].message.content.strip() + + +async def validation_task(row): + """ + Task function that generates structured outputs for validation. + Returns different types of structured data based on the task type. 
+ """ + prompt = row.get("question", "") + task_type = row.get("task_type", "json") # json, sql, regex, text + placeholder_value = row.get("placeholder_value", "This is a demo placeholder value") + + # Generate the appropriate output + output = await generate_structured_output(prompt, task_type) + + # Return data for validation evaluators + return { + "text": output, + "placeholder_value": placeholder_value, + } + + +async def run_formatting_experiment(): + """ + Run experiment with formatting evaluators. + + This experiment validates: + 1. JSON Validation - Proper JSON format and schema + 2. SQL Validation - Valid SQL syntax + 3. Regex Validation - Pattern matching + 4. Placeholder Regex - Dynamic pattern validation + """ + + print("\n" + "="*80) + print("VALIDATION EVALUATORS EXPERIMENT") + print("="*80 + "\n") + + print("This experiment validates structured AI outputs:\n") + print("1. JSON Validation - Ensures valid JSON format") + print("2. SQL Validation - Verifies SQL query syntax") + print("3. Regex Validation - Validates pattern matching") + print("4. Placeholder Regex - Dynamic text validation") + print("\n" + "-"*80 + "\n") + + # Configure validation evaluators + json_schema = '''{ + "type": "object", + "properties": { + "name": {"type": "string"}, + "age": {"type": "number"}, + "email": {"type": "string"} + }, + "required": ["name", "email"] + }''' + + evaluators = [ + EvaluatorMadeByTraceloop.json_validator( + enable_schema_validation=True, + schema_string=json_schema + ), + EvaluatorMadeByTraceloop.sql_validator(), + EvaluatorMadeByTraceloop.regex_validator( + regex=r"^\d{3}-\d{2}-\d{4}$", # SSN format + should_match=True, + case_sensitive=True + ), + EvaluatorMadeByTraceloop.placeholder_regex( + regex=r"^user_.*", + placeholder_name="username", + should_match=True + ), + ] + + print("\n" + "-"*80 + "\n") + + # Run the experiment + results, errors = await client.experiment.run( + dataset_slug="formatting", # Set a dataset slug that exists in the traceloop platform + dataset_version="v1", + task=validation_task, + evaluators=evaluators, + experiment_slug="formatting-evaluators-exp", + stop_on_error=False, + wait_for_results=True, + ) + + print("\n" + "="*80) + print("Formatting experiment completed!") + print("="*80 + "\n") + + +async def run_validation_examples(): + """ + Run live examples showing each validator in action. 
+ """ + print("\n" + "="*80) + print("LIVE VALIDATION EXAMPLES") + print("="*80 + "\n") + + # Example 1: JSON Validation + print("Example 1: JSON Validation\n") + json_prompts = [ + "Generate a user profile JSON with name, age, and email for John Doe, age 30", + "Create a product JSON with id, title, and price", + ] + + print("Generating JSON outputs...") + for prompt in json_prompts[:1]: # Just show one for demo + output = await generate_structured_output(prompt, "json") + print(f"Prompt: {prompt}") + print(f"Output: {output}\n") + + # Example 2: SQL Validation + print("\n" + "-"*80 + "\n") + print("Example 2: SQL Validation\n") + sql_prompts = [ + "Write a SQL query to select all users older than 18", + "Create a query to count active customers", + ] + + print("Generating SQL queries...") + for prompt in sql_prompts[:1]: # Just show one for demo + output = await generate_structured_output(prompt, "sql") + print(f"Prompt: {prompt}") + print(f"Output: {output}\n") + + # Example 3: Regex patterns + print("\n" + "-"*80 + "\n") + print("Example 3: Pattern Validation\n") + + patterns = [ + ("Email validation", r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$"), + ("Phone number", r"^\(\d{3}\) \d{3}-\d{4}$"), + ("URL validation", r"^https?://[^\s/$.?#].[^\s]*$"), + ] + + print("Common validation patterns:") + for name, pattern in patterns: + print(f" {name}: {pattern}") + + print("\n" + "="*80 + "\n") + + +if __name__ == "__main__": + print("\nFormatting Evaluators Experiment\n") + + asyncio.run(run_formatting_experiment()) diff --git a/packages/sample-app/sample_app/experiment/made_by_traceloop/quality_exp.py b/packages/sample-app/sample_app/experiment/made_by_traceloop/quality_exp.py new file mode 100644 index 0000000000..6496753072 --- /dev/null +++ b/packages/sample-app/sample_app/experiment/made_by_traceloop/quality_exp.py @@ -0,0 +1,122 @@ +""" +Advanced Quality Evaluators Experiment + +This example demonstrates Traceloop's advanced quality evaluators: +- Measure Perplexity: Measure text perplexity from logprobs +- Agent Goal Accuracy: Validate agent goal achievement +- Semantic Similarity: Measure semantic similarity between texts +- Topic Adherence: Validate topic adherence + +These evaluators help analyze response quality, goal achievement, +semantic correctness, and topic consistency. +""" + +import asyncio +import os +from openai import AsyncOpenAI +from traceloop.sdk import Traceloop +from traceloop.sdk.evaluator import EvaluatorMadeByTraceloop + +# Initialize Traceloop +client = Traceloop.init() + + +async def generate_response(prompt: str, max_tokens: int = 300) -> str: + """Generate a response using OpenAI with logprobs for perplexity measurement""" + openai_client = AsyncOpenAI(api_key=os.getenv("OPENAI_API_KEY")) + + response = await openai_client.chat.completions.create( + model="gpt-3.5-turbo", + messages=[{"role": "user", "content": prompt}], + temperature=0.7, + max_tokens=max_tokens, + logprobs=True, # Enable logprobs for perplexity calculation + ) + + return response.choices[0].message.content + + +async def advanced_quality_task(row): + """ + Task function that generates responses and provides data for advanced quality evaluation. 
+ + Expected dataset fields: + - question: The input question/prompt + - expected_goal: The expected outcome or goal (for agent goal accuracy) + - reference_answer: Reference answer for semantic similarity + - topic: The expected topic for adherence checking + """ + question = row.get("question", "") + reference_answer = row.get("reference_answer", "This is a demo reference answer") + topic = row.get("topic", "general knowledge") + + # Generate response + completion = await generate_response(question) + + # Return data for evaluation + # Different evaluators expect different fields + return { + "logprobs": completion, # For perplexity + "question": question, + "completion": completion, # Standard completion field + "reference": reference_answer, # For semantic similarity comparison + "reference_topics": topic, # For topic adherence + } + + +async def run_advanced_quality_experiment(): + """ + Run experiment with advanced quality evaluators. + + This experiment measures: + 1. Perplexity - Text fluency and predictability from logprobs + 2. Agent Goal Accuracy - Whether the response achieves its intended goal + 3. Semantic Similarity - Semantic alignment with reference answer + 4. Topic Adherence - Whether the response stays on topic + """ + + print("\n" + "="*80) + print("ADVANCED QUALITY EVALUATORS EXPERIMENT") + print("="*80 + "\n") + + print("This experiment measures response quality across multiple dimensions:\n") + print("1. Perplexity - Measures text fluency and predictability") + print("2. Agent Goal Accuracy - Validates goal achievement") + print("3. Semantic Similarity - Compares semantic meaning with reference") + print("4. Topic Adherence - Ensures response stays on topic") + print("\n" + "-"*80 + "\n") + + # Configure advanced quality evaluators + evaluators = [ + EvaluatorMadeByTraceloop.perplexity(), + EvaluatorMadeByTraceloop.agent_goal_accuracy(), + EvaluatorMadeByTraceloop.semantic_similarity(), + EvaluatorMadeByTraceloop.topic_adherence(), + ] + + print("Running experiment with advanced quality evaluators:") + for evaluator in evaluators: + print(f" - {evaluator.slug}") + + print("\n" + "-"*80 + "\n") + + # Run the experiment + results, errors = await client.experiment.run( + dataset_slug="quality", # Set a dataset slug that exists in the traceloop platform + dataset_version="v1", + task=advanced_quality_task, + evaluators=evaluators, + experiment_slug="advanced-quality-exp", + stop_on_error=False, + wait_for_results=True, + ) + + print("\n" + "="*80) + print("Advanced quality experiment completed!") + print("="*80 + "\n") + + +if __name__ == "__main__": + print("\n🚀 Advanced Quality Evaluators Experiment\n") + + asyncio.run(run_advanced_quality_experiment()) diff --git a/packages/sample-app/sample_app/experiment/made_by_traceloop/security_exp.py b/packages/sample-app/sample_app/experiment/made_by_traceloop/security_exp.py new file mode 100644 index 0000000000..bf5e0eefd3 --- /dev/null +++ b/packages/sample-app/sample_app/experiment/made_by_traceloop/security_exp.py @@ -0,0 +1,103 @@ +""" +Security Evaluators Experiment + +This example demonstrates Traceloop's security evaluators: +- PII Detector: Identifies personal information exposure +- Secrets Detector: Monitors for credential and key leaks +- Prompt Injection: Detects prompt injection attempts + +These evaluators help ensure your AI applications don't leak sensitive data +or fall victim to prompt injection attacks. 
+""" + +import asyncio +import os +from openai import AsyncOpenAI +from traceloop.sdk import Traceloop +from traceloop.sdk.evaluator import EvaluatorMadeByTraceloop + +# Initialize Traceloop +client = Traceloop.init() + + +async def generate_response(prompt: str) -> str: + """Generate a response using OpenAI""" + openai_client = AsyncOpenAI(api_key=os.getenv("OPENAI_API_KEY")) + + response = await openai_client.chat.completions.create( + model="gpt-3.5-turbo", + messages=[{"role": "user", "content": prompt}], + temperature=0.7, + max_tokens=200, + ) + + return response.choices[0].message.content + + +async def security_task(row): + """ + Task function that processes user queries. + Returns text that will be evaluated for security issues. + """ + user_query = row.get("query", "") + + # Generate response + response = await generate_response(user_query) + + # Return data for evaluation + return { + "text": response, # The text to check for PII, secrets, and prompt injection + "query": user_query, + } + + +async def run_security_experiment(): + """ + Run experiment with security evaluators. + + This experiment will evaluate responses for: + 1. PII (Personal Identifiable Information) + 2. Secrets (API keys, passwords, tokens) + 3. Prompt Injection attempts + """ + + print("\n" + "="*80) + print("SECURITY EVALUATORS EXPERIMENT") + print("="*80 + "\n") + + print("This experiment will test three critical security evaluators:\n") + print("1. PII Detector - Identifies personal information (names, emails, SSN, etc.)") + print("2. Secrets Detector - Finds API keys, passwords, and credentials") + print("3. Prompt Injection - Detects attempts to manipulate the AI system") + print("\n" + "-"*80 + "\n") + + # Configure security evaluators + evaluators = [ + EvaluatorMadeByTraceloop.pii_detector(probability_threshold=0.7,), + EvaluatorMadeByTraceloop.secrets_detector(), + EvaluatorMadeByTraceloop.prompt_injection(threshold=0.6), + ] + + print("\n" + "-"*80 + "\n") + + # Run the experiment + results, errors = await client.experiment.run( + dataset_slug="security", # Set a dataset slug that exists in the traceloop platform + dataset_version="v1", + task=security_task, + evaluators=evaluators, + experiment_slug="security-evaluators-exp", + stop_on_error=False, + wait_for_results=True, + ) + + print("\n" + "="*80) + print("Security experiment completed!") + print("="*80 + "\n") + + +if __name__ == "__main__": + print("\nSecurity Evaluators Experiment\n") + + # To run with actual dataset, uncomment: + asyncio.run(run_security_experiment()) diff --git a/packages/sample-app/sample_app/experiment/made_by_traceloop/style_exp.py b/packages/sample-app/sample_app/experiment/made_by_traceloop/style_exp.py new file mode 100644 index 0000000000..ba64e74af1 --- /dev/null +++ b/packages/sample-app/sample_app/experiment/made_by_traceloop/style_exp.py @@ -0,0 +1,108 @@ +""" +Metrics Evaluators Experiment + +This example demonstrates Traceloop's style evaluators: +- Character Count: Counts the number of characters in a text +- Word Count: Counts the number of words in a text +- Character Count Ratio: Compares the length of two texts +- Word Count Ratio: Compares the number of words between two texts +""" + +import asyncio +import os +from openai import AsyncOpenAI +from traceloop.sdk import Traceloop +from traceloop.sdk.evaluator import EvaluatorMadeByTraceloop + +# Initialize Traceloop +client = Traceloop.init() + + +async def generate_response(prompt: str, max_tokens: int = 200) -> str: + """Generate a response using 
OpenAI""" + openai_client = AsyncOpenAI(api_key=os.getenv("OPENAI_API_KEY")) + + response = await openai_client.chat.completions.create( + model="gpt-3.5-turbo", + messages=[{"role": "user", "content": prompt}], + temperature=0.7, + max_tokens=max_tokens, + ) + + return response.choices[0].message.content + + +async def style_task(row): + """ + Task function that generates responses and measures their metrics. + """ + question = row.get("question", "") + reference_answer = row.get("reference_answer", "This is a demo reference answer") + + # Generate response + completion = await generate_response(question) + + # Return data for style evaluation + return { + "text": completion, + "numerator_text": completion, + "denominator_text": reference_answer, + } + + +async def run_style_experiment(): + """ + Run experiment with style evaluators. + + This experiment measures: + 1. Character Count - Total characters in response + 2. Word Count - Total words in response + 3. Character Count Ratio - Response length vs reference + 4. Word Count Ratio - Response verbosity vs reference + """ + + print("\n" + "="*80) + print("METRICS EVALUATORS EXPERIMENT") + print("="*80 + "\n") + + print("This experiment measures response length and verbosity:\n") + print("1. Character Count - Total characters in the response") + print("2. Word Count - Total words in the response") + print("3. Character Count Ratio - Response length compared to reference") + print("4. Word Count Ratio - Response verbosity compared to reference") + print("\n" + "-"*80 + "\n") + + # Configure metrics evaluators + evaluators = [ + EvaluatorMadeByTraceloop.char_count(), + EvaluatorMadeByTraceloop.word_count(), + EvaluatorMadeByTraceloop.char_count_ratio(), + EvaluatorMadeByTraceloop.word_count_ratio(), + ] + + print("Running experiment with metrics evaluators:") + for evaluator in evaluators: + print(f" - {evaluator.slug}") + + print("\n" + "-"*80 + "\n") + + # Run the experiment + results, errors = await client.experiment.run( + dataset_slug="style", # Set a dataset slug that exists in the traceloop platform + dataset_version="v1", + task=style_task, + evaluators=evaluators, + experiment_slug="style-evaluators-exp", + stop_on_error=False, + wait_for_results=True, + ) + + print("\n" + "="*80) + print("Style experiment completed!") + print("="*80 + "\n") + + +if __name__ == "__main__": + print("\nStyle Evaluators Experiment\n") + + asyncio.run(run_style_experiment()) diff --git a/packages/sample-app/sample_app/experiment/run_research_experiment.py b/packages/sample-app/sample_app/experiment/run_research_experiment.py index 974f690d34..7ee891c034 100644 --- a/packages/sample-app/sample_app/experiment/run_research_experiment.py +++ b/packages/sample-app/sample_app/experiment/run_research_experiment.py @@ -49,7 +49,7 @@ async def research_task(row): return { "completion": answer, "question": query, - "sentence": answer + "text": answer } diff --git a/packages/traceloop-sdk/tests/conftest.py b/packages/traceloop-sdk/tests/conftest.py index 0daa90813b..e2c6dc54d8 100644 --- a/packages/traceloop-sdk/tests/conftest.py +++ b/packages/traceloop-sdk/tests/conftest.py @@ -231,3 +231,9 @@ def datasets(): http = HTTPClient(base_url=base_url, api_key=api_key, version="1.0.0") return Datasets(http) + + +@pytest.fixture(scope="session") +def anyio_backend(): + """Force anyio to use only asyncio backend.""" + return "asyncio" diff --git a/packages/traceloop-sdk/tests/evaluator/test_evaluator.py b/packages/traceloop-sdk/tests/evaluator/test_evaluator.py new file 
mode 100644 index 0000000000..fd07d16e77 --- /dev/null +++ b/packages/traceloop-sdk/tests/evaluator/test_evaluator.py @@ -0,0 +1,220 @@ +import pytest +from traceloop.sdk.evaluator.evaluator import validate_task_output +from traceloop.sdk.evaluator.config import EvaluatorDetails + + +class TestValidateTaskOutput: + """Tests for validate_task_output function""" + + def test_validate_task_output_with_no_evaluators(self): + """Test that validation passes when no evaluators are provided""" + task_output = {"text": "hello"} + evaluators = [] + + # Should not raise any exception + validate_task_output(task_output, evaluators) + + def test_validate_task_output_with_evaluators_no_required_fields(self): + """Test that validation passes when evaluators have no required fields""" + task_output = {"text": "hello", "score": 0.9} + evaluators = [ + EvaluatorDetails(slug="evaluator1"), + EvaluatorDetails(slug="evaluator2", config={"threshold": 0.5}), + ] + + # Should not raise any exception + validate_task_output(task_output, evaluators) + + def test_validate_task_output_with_valid_output(self): + """Test that validation passes when all required fields are present""" + task_output = {"text": "hello", "prompt": "say hello", "response": "world"} + evaluators = [ + EvaluatorDetails(slug="pii-detector", required_input_fields=["text"]), + EvaluatorDetails( + slug="relevance-checker", + required_input_fields=["prompt", "response"], + ), + ] + + # Should not raise any exception + validate_task_output(task_output, evaluators) + + def test_validate_task_output_missing_single_field(self): + """Test that validation fails when a single required field is missing""" + task_output = {"prompt": "say hello"} + evaluators = [ + EvaluatorDetails(slug="pii-detector", required_input_fields=["text"]), + ] + + with pytest.raises(ValueError) as exc_info: + validate_task_output(task_output, evaluators) + + error_message = str(exc_info.value) + assert "Task output missing required fields for evaluators:" in error_message + assert "pii-detector requires: ['text']" in error_message + assert "Task output contains: ['prompt']" in error_message + assert ( + "Hint: Update your task function to return a dictionary " + "with the required fields." 
+ ) in error_message + + def test_validate_task_output_missing_multiple_fields_single_evaluator(self): + """Test that validation fails when multiple fields are missing for one evaluator""" + task_output = {"score": 0.9} + evaluators = [ + EvaluatorDetails( + slug="relevance-checker", + required_input_fields=["prompt", "response", "context"], + ), + ] + + with pytest.raises(ValueError) as exc_info: + validate_task_output(task_output, evaluators) + + error_message = str(exc_info.value) + assert "relevance-checker requires:" in error_message + assert "'context'" in error_message + assert "'prompt'" in error_message + assert "'response'" in error_message + + def test_validate_task_output_missing_fields_multiple_evaluators(self): + """Test that validation fails when fields are missing for multiple evaluators""" + task_output = {"score": 0.9} + evaluators = [ + EvaluatorDetails(slug="pii-detector", required_input_fields=["text"]), + EvaluatorDetails( + slug="relevance-checker", + required_input_fields=["prompt", "response"], + ), + EvaluatorDetails(slug="tone-analyzer", required_input_fields=["text"]), + ] + + with pytest.raises(ValueError) as exc_info: + validate_task_output(task_output, evaluators) + + error_message = str(exc_info.value) + assert "pii-detector requires:" in error_message + assert "relevance-checker requires:" in error_message + assert "tone-analyzer requires:" in error_message + assert "'text'" in error_message + assert "'prompt'" in error_message + assert "'response'" in error_message + + def test_validate_task_output_partial_match(self): + """Test validation when some evaluators pass and some fail""" + task_output = {"text": "hello world"} + evaluators = [ + EvaluatorDetails(slug="pii-detector", required_input_fields=["text"]), + EvaluatorDetails( + slug="relevance-checker", + required_input_fields=["prompt", "response"], + ), + ] + + with pytest.raises(ValueError) as exc_info: + validate_task_output(task_output, evaluators) + + error_message = str(exc_info.value) + # Should only mention the failing evaluator + assert "relevance-checker requires:" in error_message + assert "pii-detector requires:" not in error_message + + def test_validate_task_output_empty_task_output(self): + """Test validation with empty task output""" + task_output = {} + evaluators = [ + EvaluatorDetails(slug="pii-detector", required_input_fields=["text"]), + ] + + with pytest.raises(ValueError) as exc_info: + validate_task_output(task_output, evaluators) + + error_message = str(exc_info.value) + assert "Task output contains: []" in error_message + + def test_validate_task_output_with_extra_fields(self): + """Test that validation passes when task output has extra fields""" + task_output = { + "text": "hello", + "prompt": "say hello", + "response": "world", + "extra_field": "value", + "another_field": 123, + } + evaluators = [ + EvaluatorDetails(slug="pii-detector", required_input_fields=["text"]), + ] + + # Should not raise any exception - extra fields are allowed + validate_task_output(task_output, evaluators) + + def test_validate_task_output_case_sensitive_field_names(self): + """Test that field name matching is case-sensitive""" + task_output = {"Text": "hello"} + evaluators = [ + EvaluatorDetails(slug="pii-detector", required_input_fields=["text"]), + ] + + with pytest.raises(ValueError) as exc_info: + validate_task_output(task_output, evaluators) + + error_message = str(exc_info.value) + assert "pii-detector requires: ['text']" in error_message + assert "Task output contains: ['Text']" in 
error_message + + def test_validate_task_output_with_evaluator_config(self): + """Test validation with evaluators that have config""" + task_output = {"text": "hello world"} + evaluators = [ + EvaluatorDetails( + slug="pii-detector", + version="v2", + config={"probability_threshold": 0.8, "mode": "strict"}, + required_input_fields=["text"], + ), + ] + + # Should not raise any exception - config shouldn't affect validation + validate_task_output(task_output, evaluators) + + def test_validate_task_output_mixed_evaluators(self): + """Test validation with a mix of evaluators with and without required fields""" + task_output = {"text": "hello", "score": 0.9} + evaluators = [ + EvaluatorDetails(slug="evaluator-no-requirements"), + EvaluatorDetails(slug="pii-detector", required_input_fields=["text"]), + EvaluatorDetails(slug="another-no-requirements", config={"key": "value"}), + EvaluatorDetails( + slug="relevance-checker", + required_input_fields=["prompt"], + ), + ] + + with pytest.raises(ValueError) as exc_info: + validate_task_output(task_output, evaluators) + + error_message = str(exc_info.value) + # Should only mention failing evaluator + assert "relevance-checker requires: ['prompt']" in error_message + assert "evaluator-no-requirements" not in error_message + assert "pii-detector" not in error_message or "pii-detector requires:" not in error_message + + def test_validate_task_output_duplicate_required_fields(self): + """Test validation when multiple evaluators require the same field""" + task_output = {"score": 0.9} + evaluators = [ + EvaluatorDetails(slug="pii-detector", required_input_fields=["text"]), + EvaluatorDetails(slug="tone-analyzer", required_input_fields=["text"]), + EvaluatorDetails( + slug="sentiment-analyzer", + required_input_fields=["text", "language"], + ), + ] + + with pytest.raises(ValueError) as exc_info: + validate_task_output(task_output, evaluators) + + error_message = str(exc_info.value) + assert "pii-detector requires:" in error_message + assert "tone-analyzer requires:" in error_message + assert "sentiment-analyzer requires:" in error_message diff --git a/packages/traceloop-sdk/tests/experiment/test_experiment.py b/packages/traceloop-sdk/tests/experiment/test_experiment.py index e9927f4578..2e2ccd2aeb 100644 --- a/packages/traceloop-sdk/tests/experiment/test_experiment.py +++ b/packages/traceloop-sdk/tests/experiment/test_experiment.py @@ -1,7 +1,8 @@ import pytest -from unittest.mock import Mock +from unittest.mock import Mock, AsyncMock from traceloop.sdk.experiment.experiment import Experiment from traceloop.sdk.client.http import HTTPClient +from traceloop.sdk.evaluator.config import EvaluatorDetails @pytest.fixture @@ -34,12 +35,15 @@ def test_parse_jsonl_to_rows_valid_data(experiment): def test_parse_jsonl_to_rows_with_invalid_json_lines(experiment): """Test parsing JSONL data with some invalid JSON lines""" - jsonl_data = """{"columns":{"name":{"name":"Name","type":"string"},"age":{"name":"Age","type":"number"}}} -{"name": "John", "age": 30} -invalid json line -{"name": "Alice", "age": 25} -{broken json -{"name": "Bob", "age": 35}""" + jsonl_data = ( + '{"columns":{"name":{"name":"Name","type":"string"},' + '"age":{"name":"Age","type":"number"}}}\n' + '{"name": "John", "age": 30}\n' + 'invalid json line\n' + '{"name": "Alice", "age": 25}\n' + '{broken json\n' + '{"name": "Bob", "age": 35}' + ) result = experiment._parse_jsonl_to_rows(jsonl_data) @@ -73,12 +77,14 @@ def test_parse_jsonl_to_rows_only_header(experiment): def 
test_parse_jsonl_to_rows_with_empty_lines(experiment): """Test parsing JSONL data with empty lines""" - jsonl_data = """{"columns":{"name":{"name":"Name","type":"string"},"age":{"name":"Age","type":"number"}}} -{"name": "John", "age": 30} - -{"name": "Alice", "age": 25} - -""" + jsonl_data = ( + '{"columns":{"name":{"name":"Name","type":"string"},' + '"age":{"name":"Age","type":"number"}}}\n' + '{"name": "John", "age": 30}\n' + '\n' + '{"name": "Alice", "age": 25}\n' + '\n' + ) result = experiment._parse_jsonl_to_rows(jsonl_data) @@ -88,9 +94,14 @@ def test_parse_jsonl_to_rows_with_empty_lines(experiment): def test_parse_jsonl_to_rows_complex_json_objects(experiment): """Test parsing JSONL data with complex nested objects""" - jsonl_data = """{"columns":{"user":{"name":"User","type":"object"},"active":{"name":"Active","type":"boolean"}}} -{"user": {"name": "John", "details": {"age": 30, "location": ["NY", "US"]}}, "active": true} -{"user": {"name": "Alice", "details": {"age": 25, "location": ["CA", "US"]}}, "active": false}""" + jsonl_data = ( + '{"columns":{"user":{"name":"User","type":"object"},' + '"active":{"name":"Active","type":"boolean"}}}\n' + '{"user": {"name": "John", "details": {"age": 30, "location": ["NY", "US"]}}, ' + '"active": true}\n' + '{"user": {"name": "Alice", "details": {"age": 25, "location": ["CA", "US"]}}, ' + '"active": false}' + ) result = experiment._parse_jsonl_to_rows(jsonl_data) @@ -105,3 +116,253 @@ def test_parse_jsonl_to_rows_complex_json_objects(experiment): }, ] assert result == expected + + +class TestRunLocallyValidation: + """Tests for _run_locally to ensure validation failures are handled correctly""" + + @pytest.mark.anyio(backends=["asyncio"]) + async def test_run_locally_breaks_on_validation_failure_with_stop_on_error(self): + """Test that _run_locally stops processing when validation fails and stop_on_error=True""" + mock_http_client = Mock(spec=HTTPClient) + mock_http_client.base_url = "https://api.example.com" + mock_async_http_client = Mock() + + # Mock the _init_experiment method + mock_experiment_response = Mock() + mock_experiment_response.run.id = "run-123" + mock_experiment_response.experiment.id = "exp-456" + + experiment = Experiment(mock_http_client, mock_async_http_client, "test-exp") + experiment._init_experiment = Mock(return_value=mock_experiment_response) + + # Create a task that returns output missing required fields + async def task_missing_fields(row): + return {"wrong_field": "value"} + + # Define evaluator with required fields + evaluators = [ + EvaluatorDetails(slug="pii-detector", required_input_fields=["text"]), + ] + + # Mock dataset response with multiple rows + experiment._datasets.get_version_jsonl = Mock( + return_value=( + '{"columns":{}}\n{"input": "test1"}\n' + '{"input": "test2"}\n{"input": "test3"}' + ) + ) + + # Run with stop_on_error=True - should stop after first error + results, errors = await experiment._run_locally( + task=task_missing_fields, + evaluators=evaluators, + dataset_slug="test-dataset", + dataset_version="v1", + stop_on_error=True, + ) + + # Should have at least one error and should have broken early + assert len(errors) >= 1 + assert "Task output missing required fields for evaluators:" in errors[0] + assert "pii-detector requires: ['text']" in errors[0] + # With stop_on_error=True, it should break and not process all 3 rows + assert len(results) + len(errors) <= 3 + + @pytest.mark.anyio(backends=["asyncio"]) + async def 
test_run_locally_continues_on_validation_failure_without_stop_on_error(self): + """Test that _run_locally captures error when validation fails and stop_on_error=False""" + mock_http_client = Mock(spec=HTTPClient) + mock_http_client.base_url = "https://api.example.com" + mock_async_http_client = Mock() + + # Mock the _init_experiment method + mock_experiment_response = Mock() + mock_experiment_response.run.id = "run-123" + mock_experiment_response.experiment.id = "exp-456" + + experiment = Experiment(mock_http_client, mock_async_http_client, "test-exp") + experiment._init_experiment = Mock(return_value=mock_experiment_response) + + # Create a task that returns output missing required fields + async def task_missing_fields(row): + return {"wrong_field": "value"} + + # Define evaluator with required fields + evaluators = [ + EvaluatorDetails(slug="pii-detector", required_input_fields=["text"]), + ] + + # Mock dataset response with single row + experiment._datasets.get_version_jsonl = Mock( + return_value='{"columns":{}}\n{"input": "test"}' + ) + + # Run with stop_on_error=False - should capture error but not raise + results, errors = await experiment._run_locally( + task=task_missing_fields, + evaluators=evaluators, + dataset_slug="test-dataset", + dataset_version="v1", + stop_on_error=False, + ) + + # Should have errors, no successful results + assert len(errors) == 1 + assert len(results) == 0 + assert "Task output missing required fields for evaluators:" in errors[0] + assert "pii-detector requires: ['text']" in errors[0] + + @pytest.mark.anyio(backends=["asyncio"]) + async def test_run_locally_succeeds_with_valid_output(self): + """Test that _run_locally succeeds when task output matches required fields""" + mock_http_client = Mock(spec=HTTPClient) + mock_http_client.base_url = "https://api.example.com" + mock_async_http_client = Mock() + + # Mock the _init_experiment method + mock_experiment_response = Mock() + mock_experiment_response.run.id = "run-123" + mock_experiment_response.experiment.id = "exp-456" + + experiment = Experiment(mock_http_client, mock_async_http_client, "test-exp") + experiment._init_experiment = Mock(return_value=mock_experiment_response) + + # Mock _create_task + mock_task_response = Mock() + mock_task_response.id = "task-789" + experiment._create_task = Mock(return_value=mock_task_response) + + # Create a task that returns valid output + async def task_valid_output(row): + return {"text": "hello world", "score": 0.9} + + # Define evaluator with required fields + evaluators = [ + EvaluatorDetails(slug="pii-detector", required_input_fields=["text"]), + ] + + # Mock dataset response + experiment._datasets.get_version_jsonl = Mock( + return_value='{"columns":{}}\n{"input": "test"}' + ) + + # Mock evaluator execution + mock_eval_result = Mock() + mock_eval_result.result = {"score": 0.95} + experiment._evaluator.run_experiment_evaluator = AsyncMock( + return_value=mock_eval_result + ) + + # Run - should succeed + results, errors = await experiment._run_locally( + task=task_valid_output, + evaluators=evaluators, + dataset_slug="test-dataset", + dataset_version="v1", + ) + + # Should have successful result, no errors + assert len(results) == 1 + assert len(errors) == 0 + assert results[0].task_result == {"text": "hello world", "score": 0.9} + + @pytest.mark.anyio(backends=["asyncio"]) + async def test_run_locally_validation_with_multiple_evaluators(self): + """Test validation with multiple evaluators having different required fields""" + mock_http_client = 
Mock(spec=HTTPClient) + mock_http_client.base_url = "https://api.example.com" + mock_async_http_client = Mock() + + # Mock the _init_experiment method + mock_experiment_response = Mock() + mock_experiment_response.run.id = "run-123" + mock_experiment_response.experiment.id = "exp-456" + + experiment = Experiment(mock_http_client, mock_async_http_client, "test-exp") + experiment._init_experiment = Mock(return_value=mock_experiment_response) + + # Create a task that returns partial output + async def task_partial_output(row): + return {"text": "hello"} # Missing 'prompt' and 'response' + + # Define multiple evaluators with different required fields + evaluators = [ + EvaluatorDetails(slug="pii-detector", required_input_fields=["text"]), + EvaluatorDetails( + slug="relevance-checker", + required_input_fields=["prompt", "response"], + ), + ] + + # Mock dataset response + experiment._datasets.get_version_jsonl = Mock( + return_value='{"columns":{}}\n{"input": "test"}' + ) + + # Run with stop_on_error=True - should stop after error + results, errors = await experiment._run_locally( + task=task_partial_output, + evaluators=evaluators, + dataset_slug="test-dataset", + dataset_version="v1", + stop_on_error=True, + ) + + # Should have error with validation message + assert len(errors) >= 1 + error_message = errors[0] + assert "relevance-checker requires:" in error_message + assert "'prompt'" in error_message + assert "'response'" in error_message + + @pytest.mark.anyio(backends=["asyncio"]) + async def test_run_locally_no_validation_for_string_evaluators(self): + """Test that validation is not performed for string evaluators (only EvaluatorDetails)""" + mock_http_client = Mock(spec=HTTPClient) + mock_http_client.base_url = "https://api.example.com" + mock_async_http_client = Mock() + + # Mock the _init_experiment method + mock_experiment_response = Mock() + mock_experiment_response.run.id = "run-123" + mock_experiment_response.experiment.id = "exp-456" + + experiment = Experiment(mock_http_client, mock_async_http_client, "test-exp") + experiment._init_experiment = Mock(return_value=mock_experiment_response) + + # Mock _create_task + mock_task_response = Mock() + mock_task_response.id = "task-789" + experiment._create_task = Mock(return_value=mock_task_response) + + # Create a task that returns any output + async def task_any_output(row): + return {"random_field": "value"} + + # Use string evaluators (no validation) + evaluators = ["pii-detector", "relevance-checker"] + + # Mock dataset response + experiment._datasets.get_version_jsonl = Mock( + return_value='{"columns":{}}\n{"input": "test"}' + ) + + # Mock evaluator execution + mock_eval_result = Mock() + mock_eval_result.result = {"score": 0.95} + experiment._evaluator.run_experiment_evaluator = AsyncMock( + return_value=mock_eval_result + ) + + # Run - should succeed without validation + results, errors = await experiment._run_locally( + task=task_any_output, + evaluators=evaluators, + dataset_slug="test-dataset", + dataset_version="v1", + ) + + # Should succeed because string evaluators don't trigger validation + assert len(errors) == 0 + assert len(results) == 1 diff --git a/packages/traceloop-sdk/traceloop/sdk/client/http.py b/packages/traceloop-sdk/traceloop/sdk/client/http.py index 1e1af895f1..73fbfb2cd5 100644 --- a/packages/traceloop-sdk/traceloop/sdk/client/http.py +++ b/packages/traceloop-sdk/traceloop/sdk/client/http.py @@ -33,7 +33,8 @@ def post(self, path: str, data: Dict[str, Any]) -> Any: response.raise_for_status() return 
response.json() except requests.exceptions.RequestException as e: - print(Fore.RED + f"Error making request to {path}: {str(e)}" + Fore.RESET) + error_msg = self._format_error_message(path, e) + print(Fore.RED + error_msg + Fore.RESET) return None def get(self, path: str, params: Optional[Dict[str, Any]] = None) -> Any: @@ -54,7 +55,8 @@ def get(self, path: str, params: Optional[Dict[str, Any]] = None) -> Any: else: return response.json() except requests.exceptions.RequestException as e: - print(Fore.RED + f"Error making request to {path}: {str(e)}" + Fore.RESET) + error_msg = self._format_error_message(path, e) + print(Fore.RED + error_msg + Fore.RESET) return None def delete(self, path: str) -> bool: @@ -68,7 +70,8 @@ def delete(self, path: str) -> bool: response.raise_for_status() return response.status_code == 204 or response.status_code == 200 except requests.exceptions.RequestException as e: - print(Fore.RED + f"Error making request to {path}: {str(e)}" + Fore.RESET) + error_msg = self._format_error_message(path, e) + print(Fore.RED + error_msg + Fore.RESET) return False def put(self, path: str, data: Dict[str, Any]) -> Any: @@ -87,5 +90,41 @@ def put(self, path: str, data: Dict[str, Any]) -> Any: else: return {} except requests.exceptions.RequestException as e: - print(Fore.RED + f"Error making request to {path}: {str(e)}" + Fore.RESET) + error_msg = self._format_error_message(path, e) + print(Fore.RED + error_msg + Fore.RESET) return None + + def _format_error_message(self, path: str, exception: requests.exceptions.RequestException) -> str: + """ + Format a detailed error message including server response if available + """ + error_parts = [f"Error making request to {path}: {str(exception)}"] + + # Try to extract error details from response + if hasattr(exception, 'response') and exception.response is not None: + response = exception.response + + # Try to parse JSON error from response + try: + error_data = response.json() + if isinstance(error_data, dict): + # Check common error fields + if 'error' in error_data: + error_parts.append(f"Server error: {error_data['error']}") + elif 'message' in error_data: + error_parts.append(f"Server message: {error_data['message']}") + elif 'msg' in error_data: + error_parts.append(f"Server message: {error_data['msg']}") + else: + # Include the entire JSON if no standard field found + error_parts.append(f"Server response: {error_data}") + except (ValueError, AttributeError): + # Not JSON, try to get text + try: + error_text = response.text + if error_text and len(error_text) < 500: # Only include short error messages + error_parts.append(f"Server response: {error_text}") + except Exception: + pass + + return "\n".join(error_parts) diff --git a/packages/traceloop-sdk/traceloop/sdk/evaluator/__init__.py b/packages/traceloop-sdk/traceloop/sdk/evaluator/__init__.py index 737bb66012..b1233374b8 100644 --- a/packages/traceloop-sdk/traceloop/sdk/evaluator/__init__.py +++ b/packages/traceloop-sdk/traceloop/sdk/evaluator/__init__.py @@ -1,5 +1,9 @@ from .evaluator import Evaluator +from .config import EvaluatorDetails +from .evaluators_made_by_traceloop import EvaluatorMadeByTraceloop __all__ = [ - "Evaluator" + "Evaluator", + "EvaluatorDetails", + "EvaluatorMadeByTraceloop", ] diff --git a/packages/traceloop-sdk/traceloop/sdk/evaluator/config.py b/packages/traceloop-sdk/traceloop/sdk/evaluator/config.py new file mode 100644 index 0000000000..0523995c44 --- /dev/null +++ b/packages/traceloop-sdk/traceloop/sdk/evaluator/config.py @@ -0,0 +1,23 @@ +from 
typing import Dict, Any, Optional, List +from pydantic import BaseModel + + +class EvaluatorDetails(BaseModel): + """ + Details for configuring an evaluator. + + Args: + slug: The evaluator slug/identifier + version: Optional version of the evaluator + config: Optional configuration dictionary for the evaluator + required_input_fields: Optional list of required fields to the evaluator + input. These fields must be present in the task output. + + Example: + >>> EvaluatorDetails(slug="pii-detector", config={"probability_threshold": 0.8}, required_input_fields=["text"]) + >>> EvaluatorDetails(slug="my-custom-evaluator", version="v2") + """ + slug: str + version: Optional[str] = None + config: Optional[Dict[str, Any]] = None + required_input_fields: Optional[List[str]] = None diff --git a/packages/traceloop-sdk/traceloop/sdk/evaluator/evaluator.py b/packages/traceloop-sdk/traceloop/sdk/evaluator/evaluator.py index 64dd7933d0..799f519c72 100644 --- a/packages/traceloop-sdk/traceloop/sdk/evaluator/evaluator.py +++ b/packages/traceloop-sdk/traceloop/sdk/evaluator/evaluator.py @@ -1,5 +1,5 @@ import httpx -from typing import Dict, Optional +from typing import Dict, Optional, Any, List from .model import ( InputExtractor, @@ -9,6 +9,7 @@ ExecutionResponse, ) from .stream_client import SSEClient +from .config import EvaluatorDetails class Evaluator: @@ -28,6 +29,7 @@ def _build_evaluator_request( experiment_run_id: str, input: Dict[str, str], evaluator_version: Optional[str] = None, + evaluator_config: Optional[Dict[str, Any]] = None, ) -> ExecuteEvaluatorRequest: """Build evaluator request with common parameters""" schema_mapping = InputSchemaMapping( @@ -36,6 +38,7 @@ def _build_evaluator_request( return ExecuteEvaluatorRequest( input_schema_mapping=schema_mapping, evaluator_version=evaluator_version, + evaluator_config=evaluator_config, task_id=task_id, experiment_id=experiment_id, experiment_run_id=experiment_run_id, @@ -55,9 +58,10 @@ async def _execute_evaluator_request( full_url, json=body, timeout=httpx.Timeout(timeout_in_sec) ) if response.status_code != 200: + error_detail = _extract_error_from_response(response) raise Exception( - f"Failed to execute evaluator {evaluator_slug}: " - f"{response.status_code} – {response.text}" + f"Failed to execute evaluator '{evaluator_slug}': " + f"{response.status_code} - {error_detail}" ) result_data = response.json() return ExecuteEvaluatorResponse(**result_data) @@ -71,23 +75,26 @@ async def run_experiment_evaluator( input: Dict[str, str], timeout_in_sec: int = 120, evaluator_version: Optional[str] = None, + evaluator_config: Optional[Dict[str, Any]] = None, ) -> ExecutionResponse: """ Execute evaluator with input schema mapping and wait for result Args: evaluator_slug: Slug of the evaluator to execute + task_id: Task ID for the evaluation + experiment_id: Experiment ID + experiment_run_id: Experiment run ID input: Dict mapping evaluator input field names to their values. 
{field_name: value, ...} - client: Shared HTTP client for connection reuse (optional) - context_data: Context data to be passed to the evaluator (optional) - evaluator_version: Version of the evaluator to execute (optional) timeout_in_sec: Timeout in seconds for execution + evaluator_version: Version of the evaluator to execute (optional) + evaluator_config: Configuration for the evaluator (optional) Returns: ExecutionResponse: The evaluation result from SSE stream """ request = self._build_evaluator_request( - task_id, experiment_id, experiment_run_id, input, evaluator_version + task_id, experiment_id, experiment_run_id, input, evaluator_version, evaluator_config ) execute_response = await self._execute_evaluator_request( @@ -109,6 +116,7 @@ async def trigger_experiment_evaluator( experiment_run_id: str, input: Dict[str, str], evaluator_version: Optional[str] = None, + evaluator_config: Optional[Dict[str, Any]] = None, ) -> str: """ Trigger evaluator execution without waiting for result (fire-and-forget) @@ -120,13 +128,13 @@ async def trigger_experiment_evaluator( experiment_run_id: Experiment run ID input: Dict mapping evaluator input field names to their values evaluator_version: Version of the evaluator to execute (optional) - client: Shared HTTP client for connection reuse (optional) + evaluator_config: Configuration for the evaluator (optional) Returns: str: The execution_id that can be used to check results later """ request = self._build_evaluator_request( - task_id, experiment_id, experiment_run_id, input, evaluator_version + task_id, experiment_id, experiment_run_id, input, evaluator_version, evaluator_config ) execute_response = await self._execute_evaluator_request( @@ -135,3 +143,81 @@ async def trigger_experiment_evaluator( # Return execution_id without waiting for SSE result return execute_response.execution_id + + +def validate_task_output( + task_output: Dict[str, Any], + evaluators: List[EvaluatorDetails], +) -> None: + """ + Validate that task output contains all required fields for the given evaluators. + + Args: + task_output: The dictionary returned by the task function + evaluators: List of EvaluatorDetails to validate against + + Raises: + ValueError: If task output is missing required fields for any evaluator + """ + if not evaluators: + return + + # Collect all validation errors + missing_fields_by_evaluator: Dict[str, set[str]] = {} + + for evaluator in evaluators: + if not evaluator.required_input_fields: + continue + + missing_fields = [ + field for field in evaluator.required_input_fields + if field not in task_output + ] + + if missing_fields: + # Add to existing set or create new set + if evaluator.slug not in missing_fields_by_evaluator: + missing_fields_by_evaluator[evaluator.slug] = set() + missing_fields_by_evaluator[evaluator.slug].update(missing_fields) + + # If there are any missing fields, raise a detailed error + if missing_fields_by_evaluator: + error_lines = ["Task output missing required fields for evaluators:"] + + for slug, fields in missing_fields_by_evaluator.items(): + error_lines.append(f" - {slug} requires: {sorted(fields)}") + + error_lines.append(f"\nTask output contains: {list(task_output.keys())}") + + error_lines.append("\nHint: Update your task function to return a dictionary with the required fields.") + + raise ValueError("\n".join(error_lines)) + + +def _extract_error_from_response(response: httpx.Response) -> str: + """ + Extract error message from HTTP response. 
+ + Tries to parse JSON and extract common error fields (error, message, msg). + Falls back to response.text if JSON parsing fails. + + Args: + response: The HTTP response object + + Returns: + Extracted error message string + """ + error_detail = response.text + try: + error_json = response.json() + if isinstance(error_json, dict): + if 'error' in error_json: + error_detail = error_json['error'] + elif 'message' in error_json: + error_detail = error_json['message'] + elif 'msg' in error_json: + error_detail = error_json['msg'] + except Exception: + pass # Use response.text as fallback + + return error_detail diff --git a/packages/traceloop-sdk/traceloop/sdk/evaluator/evaluators_made_by_traceloop.py b/packages/traceloop-sdk/traceloop/sdk/evaluator/evaluators_made_by_traceloop.py new file mode 100644 index 0000000000..5ad610eec0 --- /dev/null +++ b/packages/traceloop-sdk/traceloop/sdk/evaluator/evaluators_made_by_traceloop.py @@ -0,0 +1,455 @@ +from typing import Optional, Dict, Any +from .config import EvaluatorDetails + + +class EvaluatorMadeByTraceloop: + """ + Factory class for creating made by traceloop evaluators with proper configuration. + + This class provides easy-to-use factory methods for all made by traceloop evaluators, + with type hints and documentation for their configuration options. + + Example: + >>> from traceloop.sdk.evaluator import EvaluatorMadeByTraceloop + >>> + >>> evaluators = [ + ... EvaluatorMadeByTraceloop.pii_detector(probability_threshold=0.8), + ... EvaluatorMadeByTraceloop.toxicity_detector(threshold=0.7), + ... ] + """ + + @staticmethod + def pii_detector( + probability_threshold: float = 0.5, + ) -> EvaluatorDetails: + """ + PII (Personally Identifiable Information) detector evaluator. + + Required task output fields: + - text: The text to check for PII + + Args: + probability_threshold: Minimum probability threshold for detecting PII (0.0-1.0) + + Returns: + EvaluatorDetails configured for PII detection + """ + config: Dict[str, Any] = {"probability_threshold": probability_threshold} + return EvaluatorDetails(slug="pii-detector", version=None, config=config, required_input_fields=["text"]) + + @staticmethod + def toxicity_detector( + threshold: float = 0.5, + ) -> EvaluatorDetails: + """ + Toxicity detector evaluator. + + Required task output fields: + - text: The text to check for toxicity + + Args: + threshold: Minimum toxicity threshold for flagging content (0.0-1.0) + + Returns: + EvaluatorDetails configured for toxicity detection + """ + config: Dict[str, Any] = {"threshold": threshold} + + return EvaluatorDetails(slug="toxicity-detector", version=None, config=config, required_input_fields=["text"]) + + @staticmethod + def prompt_injection( + threshold: float = 0.5, + ) -> EvaluatorDetails: + """ + Prompt injection detector evaluator. + + Required task output fields: + - prompt: The prompt to check for prompt injection attempts + + Args: + threshold: Minimum threshold for detecting prompt injection attempts (0.0-1.0) + + Returns: + EvaluatorDetails configured for prompt injection detection + """ + config: Dict[str, Any] = {"threshold": threshold} + return EvaluatorDetails(slug="prompt-injection", version=None, config=config, required_input_fields=["prompt"]) + + @staticmethod + def regex_validator( + regex: str, + should_match: bool = True, + case_sensitive: bool = True, + dot_include_nl: bool = False, + multi_line: bool = False, + ) -> EvaluatorDetails: + """ + Regular expression validator evaluator. 
+ + Required task output fields: + - text: The text to validate against the regex pattern + + Args: + regex: The regular expression pattern to match against + should_match: If True, pass when pattern matches; if False, pass when pattern doesn't match + case_sensitive: Whether the regex matching should be case-sensitive + dot_include_nl: Whether the dot (.) should match newline characters + multi_line: Whether to enable multi-line mode (^ and $ match line boundaries) + + Returns: + EvaluatorDetails configured for regex validation + """ + config: Dict[str, Any] = { + "regex": regex, + "should_match": should_match, + "case_sensitive": case_sensitive, + "dot_include_nl": dot_include_nl, + "multi_line": multi_line, + } + + return EvaluatorDetails(slug="regex-validator", version=None, config=config, required_input_fields=["text"]) + + @staticmethod + def json_validator( + enable_schema_validation: bool = False, + schema_string: Optional[str] = None, + ) -> EvaluatorDetails: + """ + JSON validator evaluator. + + Required task output fields: + - text: The JSON text to validate + + Args: + enable_schema_validation: Whether to validate against a JSON schema + schema_string: JSON schema string to validate against (required if enable_schema_validation is True) + + Returns: + EvaluatorDetails configured for JSON validation + """ + config: Dict[str, Any] = { + "enable_schema_validation": enable_schema_validation, + } + if schema_string: + config["schema_string"] = schema_string + + return EvaluatorDetails(slug="json-validator", version=None, config=config, required_input_fields=["text"]) + + @staticmethod + def placeholder_regex( + regex: str, + placeholder_name: str, + should_match: bool = True, + case_sensitive: bool = True, + dot_include_nl: bool = False, + multi_line: bool = False, + ) -> EvaluatorDetails: + """ + Placeholder regex evaluator - validates that placeholders match a regex pattern. + + Required task output fields: + - text: The text to validate against the regex pattern + - placeholder_value: The value of the placeholder to validate + + Args: + regex: The regular expression pattern to match against + placeholder_name: Name of the placeholder to validate + should_match: If True, pass when pattern matches; if False, pass when pattern doesn't match + case_sensitive: Whether the regex matching should be case-sensitive + dot_include_nl: Whether the dot (.) should match newline characters + multi_line: Whether to enable multi-line mode (^ and $ match line boundaries) + + Returns: + EvaluatorDetails configured for placeholder regex validation + """ + config: Dict[str, Any] = { + "regex": regex, + "placeholder_name": placeholder_name, + "should_match": should_match, + "case_sensitive": case_sensitive, + "dot_include_nl": dot_include_nl, + "multi_line": multi_line, + } + + return EvaluatorDetails( + slug="placeholder-regex", + version=None, + config=config, + required_input_fields=["text", "placeholder_value"], + ) + + @staticmethod + def char_count( + ) -> EvaluatorDetails: + """ + Character count evaluator - counts the number of characters in text. + + Required task output fields: + - text: The text to count characters in + + Returns: + EvaluatorDetails configured for character counting + """ + config: Dict[str, Any] = {} + + return EvaluatorDetails(slug="char-count", version=None, config=config, required_input_fields=["text"]) + + @staticmethod + def char_count_ratio( + ) -> EvaluatorDetails: + """ + Character count ratio evaluator - measures the ratio of characters between two texts. 
+ + Required task output fields: + - numerator_text: The numerator text for ratio calculation + - denominator_text: The denominator text for ratio calculation + + Returns: + EvaluatorDetails configured for character count ratio calculation + """ + config: Dict[str, Any] = {} + + return EvaluatorDetails( + slug="char-count-ratio", + version=None, + config=config, + required_input_fields=["numerator_text", "denominator_text"], + ) + + @staticmethod + def word_count() -> EvaluatorDetails: + """ + Word count evaluator - counts the number of words in text. + + Required task output fields: + - text: The text to count words in + + Returns: + EvaluatorDetails configured for word counting + """ + config: Dict[str, Any] = {} + + return EvaluatorDetails(slug="word-count", version=None, config=config, required_input_fields=["text"]) + + @staticmethod + def word_count_ratio( + ) -> EvaluatorDetails: + """ + Word count ratio evaluator - measures the ratio of words between two texts. + + Required task output fields: + - numerator_text: The numerator text for ratio calculation + - denominator_text: The denominator text for ratio calculation + + Returns: + EvaluatorDetails configured for word count ratio calculation + """ + config: Dict[str, Any] = {} + + return EvaluatorDetails( + slug="word-count-ratio", + version=None, + config=config, + required_input_fields=["numerator_text", "denominator_text"], + ) + + @staticmethod + def answer_relevancy( + ) -> EvaluatorDetails: + """ + Answer relevancy evaluator - verifies responses address the query. + + Required task output fields: + - question: The input question + - answer: The answer to evaluate + + Returns: + EvaluatorDetails configured for answer relevancy evaluation + """ + config: Dict[str, Any] = {} + + return EvaluatorDetails( + slug="answer-relevancy", + version=None, + config=config, + required_input_fields=["question", "answer"], + ) + + @staticmethod + def faithfulness( + ) -> EvaluatorDetails: + """ + Faithfulness evaluator - detects hallucinations and verifies facts. + + Required task output fields: + - question: The input question + - completion: The completion to evaluate for faithfulness + - context: The context to verify against + + Returns: + EvaluatorDetails configured for faithfulness evaluation + """ + config: Dict[str, Any] = {} + + return EvaluatorDetails( + slug="faithfulness", + version=None, + config=config, + required_input_fields=["question", "completion", "context"], + ) + + @staticmethod + def profanity_detector() -> EvaluatorDetails: + """ + Profanity detector evaluator - flags inappropriate language. + + Required task output fields: + - text: The text to check for profanity + + Returns: + EvaluatorDetails configured for profanity detection + """ + config: Dict[str, Any] = {} + + return EvaluatorDetails(slug="profanity-detector", version=None, config=config, required_input_fields=["text"]) + + @staticmethod + def sexism_detector( + threshold: float = 0.5, + ) -> EvaluatorDetails: + """ + Sexism detector evaluator - detects sexist language and bias.
+ + Required task output fields: + - text: The text to check for sexism + + Args: + threshold: Minimum threshold for detecting sexism (0.0-1.0) + + Returns: + EvaluatorDetails configured for sexism detection + """ + config: Dict[str, Any] = {"threshold": threshold} + + return EvaluatorDetails(slug="sexism-detector", version=None, config=config, required_input_fields=["text"]) + + @staticmethod + def secrets_detector( + ) -> EvaluatorDetails: + """ + Secrets detector evaluator - monitors for credential and key leaks. + + Required task output fields: + - text: The text to check for secrets + + Returns: + EvaluatorDetails configured for secrets detection + """ + config: Dict[str, Any] = {} + return EvaluatorDetails(slug="secrets-detector", version=None, config=config, required_input_fields=["text"]) + + @staticmethod + def sql_validator( + ) -> EvaluatorDetails: + """ + SQL validator evaluator - validates SQL queries. + + Required task output fields: + - text: The SQL query to validate + + Returns: + EvaluatorDetails configured for SQL validation + """ + config: Dict[str, Any] = {} + + return EvaluatorDetails(slug="sql-validator", version=None, config=config, required_input_fields=["text"]) + + @staticmethod + def semantic_similarity( + ) -> EvaluatorDetails: + """ + Semantic similarity evaluator - measures semantic similarity between texts. + + Required task output fields: + - completion: The completion text to compare + - reference: The reference text to compare against + + Returns: + EvaluatorDetails configured for semantic similarity evaluation + """ + config: Dict[str, Any] = {} + + return EvaluatorDetails( + slug="semantic-similarity", + version=None, + config=config, + required_input_fields=["completion", "reference"], + ) + + @staticmethod + def agent_goal_accuracy( + ) -> EvaluatorDetails: + """ + Agent goal accuracy evaluator - validates agent goal achievement. + + Required task output fields: + - question: The input question or goal + - completion: The agent's completion + - reference: The reference answer or goal + + Returns: + EvaluatorDetails configured for agent goal accuracy evaluation + """ + config: Dict[str, Any] = {} + + return EvaluatorDetails( + slug="agent-goal-accuracy", + version=None, + config=config, + required_input_fields=["question", "completion", "reference"], + ) + + @staticmethod + def topic_adherence( + ) -> EvaluatorDetails: + """ + Topic adherence evaluator - validates topic adherence. + + Required task output fields: + - question: The input question or goal + - completion: The completion text to evaluate + - reference_topics: The expected topic or topics + + Returns: + EvaluatorDetails configured for topic adherence evaluation + """ + config: Dict[str, Any] = {} + + return EvaluatorDetails( + slug="topic-adherence", + version=None, + config=config, + required_input_fields=["question", "completion", "reference_topics"], + ) + + @staticmethod + def perplexity( + ) -> EvaluatorDetails: + """ + Perplexity evaluator - measures text perplexity from prompt. 
+ + Required task output fields: + - prompt: The prompt to measure perplexity for + + Returns: + EvaluatorDetails configured for perplexity measurement + """ + config: Dict[str, Any] = {} + + return EvaluatorDetails( + slug="perplexity", + version=None, + config=config, + required_input_fields=["prompt"], + ) diff --git a/packages/traceloop-sdk/traceloop/sdk/evaluator/model.py b/packages/traceloop-sdk/traceloop/sdk/evaluator/model.py index 7723cc3df9..1fc4636221 100644 --- a/packages/traceloop-sdk/traceloop/sdk/evaluator/model.py +++ b/packages/traceloop-sdk/traceloop/sdk/evaluator/model.py @@ -16,6 +16,7 @@ class InputSchemaMapping(RootModel[Dict[str, InputExtractor]]): class ExecuteEvaluatorRequest(BaseModel): input_schema_mapping: InputSchemaMapping evaluator_version: Optional[str] = None + evaluator_config: Optional[Dict[str, Any]] = None task_id: str experiment_id: str experiment_run_id: str diff --git a/packages/traceloop-sdk/traceloop/sdk/experiment/experiment.py b/packages/traceloop-sdk/traceloop/sdk/experiment/experiment.py index 453c6f7e8d..23b88c758b 100644 --- a/packages/traceloop-sdk/traceloop/sdk/experiment/experiment.py +++ b/packages/traceloop-sdk/traceloop/sdk/experiment/experiment.py @@ -5,19 +5,20 @@ from typing import Any, List, Callable, Optional, Tuple, Dict, Awaitable, Union from traceloop.sdk.client.http import HTTPClient from traceloop.sdk.datasets.datasets import Datasets -from traceloop.sdk.evaluator.evaluator import Evaluator +from traceloop.sdk.evaluator.evaluator import Evaluator, validate_task_output from traceloop.sdk.experiment.model import ( InitExperimentRequest, ExperimentInitResponse, CreateTaskRequest, CreateTaskResponse, - EvaluatorDetails, + EvaluatorSpec, TaskResponse, RunInGithubRequest, RunInGithubResponse, TaskResult, GithubContext, ) +from traceloop.sdk.evaluator.config import EvaluatorDetails import httpx @@ -37,9 +38,9 @@ def __init__(self, http_client: HTTPClient, async_http_client: httpx.AsyncClient async def run( self, task: Callable[[Optional[Dict[str, Any]]], Awaitable[Dict[str, Any]]], + evaluators: List[EvaluatorSpec], dataset_slug: Optional[str] = None, dataset_version: Optional[str] = None, - evaluators: Optional[List[EvaluatorDetails]] = None, experiment_slug: Optional[str] = None, experiment_metadata: Optional[Dict[str, Any]] = None, related_ref: Optional[Dict[str, str]] = None, @@ -51,9 +52,9 @@ async def run( Args: task: Async function to run on each dataset row + evaluators: List of evaluator slugs or EvaluatorDetails objects to run dataset_slug: Slug of the dataset to use dataset_version: Version of the dataset to use - evaluators: List of evaluator slugs to run experiment_slug: Slug for this experiment run experiment_metadata: Metadata for this experiment (an experiment holds all the experiment runs) related_ref: Related reference for this experiment run @@ -76,9 +77,9 @@ async def run( else: return await self._run_locally( task=task, + evaluators=evaluators, dataset_slug=dataset_slug, dataset_version=dataset_version, - evaluators=evaluators, experiment_slug=experiment_slug, experiment_metadata=experiment_metadata, related_ref=related_ref, @@ -90,9 +91,9 @@ async def run( async def _run_locally( self, task: Callable[[Optional[Dict[str, Any]]], Awaitable[Dict[str, Any]]], + evaluators: List[EvaluatorSpec], dataset_slug: Optional[str] = None, dataset_version: Optional[str] = None, - evaluators: Optional[List[EvaluatorDetails]] = None, experiment_slug: Optional[str] = None, experiment_metadata: Optional[Dict[str, Any]] = None, 
related_ref: Optional[Dict[str, str]] = None, @@ -126,20 +127,23 @@ async def _run_locally( if value is not None } - evaluator_details = ( - [ - (evaluator, None) if isinstance(evaluator, str) else evaluator - for evaluator in evaluators - ] - if evaluators - else None - ) + # Convert evaluators to tuples of (slug, version, config) + evaluator_details: Optional[List[Tuple[str, Optional[str], Optional[Dict[str, Any]]]]] = None + if evaluators: + evaluator_details = [] + for evaluator in evaluators: + if isinstance(evaluator, str): + # Simple string slug + evaluator_details.append((evaluator, None, None)) + elif isinstance(evaluator, EvaluatorDetails): + # EvaluatorDetails object with config + evaluator_details.append((evaluator.slug, evaluator.version, evaluator.config)) experiment = self._init_experiment( experiment_slug, dataset_slug=dataset_slug, dataset_version=dataset_version, - evaluator_slugs=[slug for slug, _ in evaluator_details] + evaluator_slugs=[slug for slug, _, _ in evaluator_details] if evaluator_details else None, experiment_metadata=experiment_metadata, @@ -156,9 +160,16 @@ async def _run_locally( results: List[TaskResponse] = [] errors: List[str] = [] + evaluators_to_validate = [evaluator for evaluator in evaluators if isinstance(evaluator, EvaluatorDetails)] + async def run_single_row(row: Optional[Dict[str, Any]]) -> TaskResponse: try: task_result = await task(row) + + # Validate task output with EvaluatorDetails with required_input_fields from evaluators list + if evaluators_to_validate: + validate_task_output(task_result, evaluators_to_validate) + task_id = self._create_task( experiment_slug=experiment_slug, experiment_run_id=run_id, @@ -168,13 +179,14 @@ async def run_single_row(row: Optional[Dict[str, Any]]) -> TaskResponse: eval_results: Dict[str, Union[Dict[str, Any], str]] = {} if evaluator_details: - for evaluator_slug, evaluator_version in evaluator_details: + for evaluator_slug, evaluator_version, evaluator_config in evaluator_details: try: if wait_for_results: eval_result = ( await self._evaluator.run_experiment_evaluator( evaluator_slug=evaluator_slug, evaluator_version=evaluator_version, + evaluator_config=evaluator_config, task_id=task_id, experiment_id=experiment.experiment.id, experiment_run_id=run_id, @@ -187,6 +199,7 @@ async def run_single_row(row: Optional[Dict[str, Any]]) -> TaskResponse: await self._evaluator.trigger_experiment_evaluator( evaluator_slug=evaluator_slug, evaluator_version=evaluator_version, + evaluator_config=evaluator_config, task_id=task_id, experiment_id=experiment.experiment.id, experiment_run_id=run_id, @@ -197,7 +210,10 @@ async def run_single_row(row: Optional[Dict[str, Any]]) -> TaskResponse: eval_results[evaluator_slug] = msg except Exception as e: - eval_results[evaluator_slug] = f"Error: {str(e)}" + error_msg = f"Error: {str(e)}" + eval_results[evaluator_slug] = error_msg + # Log the error so user can see it + print(f"\033[91m❌ Evaluator '{evaluator_slug}' failed: {str(e)}\033[0m") return TaskResponse( task_result=task_result, @@ -205,6 +221,8 @@ async def run_single_row(row: Optional[Dict[str, Any]]) -> TaskResponse: ) except Exception as e: error_msg = f"Error processing row: {str(e)}" + # Print error to console so user can see it + print(f"\033[91m❌ Task execution failed: {str(e)}\033[0m") if stop_on_error: raise e return TaskResponse(error=error_msg) @@ -240,9 +258,9 @@ async def run_with_semaphore(row: Optional[Dict[str, Any]]) -> TaskResponse: async def _run_in_github( self, task: Callable[[Optional[Dict[str, Any]]], 
Awaitable[Dict[str, Any]]], + evaluators: List[EvaluatorSpec], dataset_slug: Optional[str] = None, dataset_version: Optional[str] = None, - evaluators: Optional[List[EvaluatorDetails]] = None, experiment_slug: Optional[str] = None, experiment_metadata: Optional[Dict[str, Any]] = None, related_ref: Optional[Dict[str, str]] = None, @@ -290,7 +308,7 @@ async def _run_in_github( jsonl_data = self._datasets.get_version_jsonl(dataset_slug, dataset_version) rows = self._parse_jsonl_to_rows(jsonl_data) - task_results = await self._execute_tasks(rows, task) + task_results = await self._execute_tasks(rows, task, evaluators) # Construct GitHub context repository = os.getenv("GITHUB_REPOSITORY") @@ -340,10 +358,12 @@ async def _run_in_github( # Extract evaluator slugs evaluator_slugs = None if evaluators: - evaluator_slugs = [ - slug if isinstance(slug, str) else slug[0] - for slug in evaluators - ] + evaluator_slugs = [] + for evaluator in evaluators: + if isinstance(evaluator, str): + evaluator_slugs.append(evaluator) + elif isinstance(evaluator, EvaluatorDetails): + evaluator_slugs.append(evaluator.slug) # Prepare request payload request_body = RunInGithubRequest( @@ -436,26 +456,47 @@ async def _execute_tasks( self, rows: List[Dict[str, Any]], task: Callable[[Optional[Dict[str, Any]]], Awaitable[Dict[str, Any]]], + evaluators: Optional[List[EvaluatorSpec]] = None, ) -> List[TaskResult]: """Execute tasks locally with concurrency control Args: rows: List of dataset rows to process task: Function to run on each row + evaluators: List of evaluators to validate task output against Returns: List of TaskResult objects with inputs, outputs, and errors """ task_results: List[TaskResult] = [] + # Extract EvaluatorDetails from evaluators list + evaluators_to_validate = [] + if evaluators: + for evaluator in evaluators: + if isinstance(evaluator, EvaluatorDetails): + evaluators_to_validate.append(evaluator) + async def run_single_row(row: Optional[Dict[str, Any]]) -> TaskResult: try: task_output = await task(row) + + # Validate task output schema on first execution + if evaluators_to_validate: + try: + validate_task_output(task_output, evaluators_to_validate) + except ValueError as validation_error: + print(f"\033[91m❌ Task validation failed: {str(validation_error)}\033[0m") + raise ValueError(str(validation_error)) + return TaskResult( input=row, output=task_output, ) except Exception as e: + if isinstance(e, ValueError): + raise e + print(f"\033[91m❌ Task execution error: {str(e)}\033[0m") return TaskResult( input=row, error=str(e), diff --git a/packages/traceloop-sdk/traceloop/sdk/experiment/model.py b/packages/traceloop-sdk/traceloop/sdk/experiment/model.py index f55ed0b487..00a3ce2009 100644 --- a/packages/traceloop-sdk/traceloop/sdk/experiment/model.py +++ b/packages/traceloop-sdk/traceloop/sdk/experiment/model.py @@ -1,10 +1,9 @@ from datetime import datetime -from typing import List, Dict, Any, Optional, Tuple, Union +from typing import List, Dict, Any, Optional, Union from pydantic import BaseModel, Field +from traceloop.sdk.evaluator.config import EvaluatorDetails -EvaluatorVersion = str -EvaluatorSlug = str -EvaluatorDetails = Union[EvaluatorSlug, Tuple[EvaluatorSlug, EvaluatorVersion]] +EvaluatorSpec = Union[str, EvaluatorDetails] class TaskResponse(BaseModel):
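
To make the new plumbing easier to review, here is a minimal usage sketch (not part of the patch). It shows how the pieces above fit together: factory-built EvaluatorDetails carry their settings through evaluator_config, plain string slugs are still accepted as EvaluatorSpec, and validate_task_output checks the task's return value against each evaluator's required_input_fields. The dataset slug, experiment slug, the Traceloop.init() client wiring, and the (results, errors) return shape are illustrative assumptions, not taken verbatim from this diff.

import asyncio

from traceloop.sdk import Traceloop
from traceloop.sdk.evaluator import EvaluatorMadeByTraceloop

# Assumes Traceloop.init() returns a client object exposing experiment.run
# as wired in experiment.py above.
client = Traceloop.init()


async def qa_task(row):
    # Return every field the chosen evaluators list in required_input_fields;
    # validate_task_output raises a ValueError naming any missing ones.
    row = row or {}
    return {
        "question": row.get("question", ""),
        "answer": row.get("answer", ""),
        "text": row.get("answer", ""),
    }


async def main():
    evaluators = [
        "word-count",  # plain slug: no config, no client-side field validation
        EvaluatorMadeByTraceloop.toxicity_detector(threshold=0.7),  # config is forwarded as evaluator_config
        EvaluatorMadeByTraceloop.answer_relevancy(),
    ]
    # Assumed to return (results, errors); slugs below are placeholders.
    results, errors = await client.experiment.run(
        task=qa_task,
        evaluators=evaluators,
        dataset_slug="qa-dataset",
        dataset_version="v1",
        experiment_slug="evaluator-config-demo",
        wait_for_results=True,
    )
    print(results, errors)


if __name__ == "__main__":
    asyncio.run(main())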