diff --git a/packages/sample-app/sample_app/experiment/made_by_traceloop/compliance_exp.py b/packages/sample-app/sample_app/experiment/made_by_traceloop/compliance_exp.py new file mode 100644 index 0000000000..f0ebda34a0 --- /dev/null +++ b/packages/sample-app/sample_app/experiment/made_by_traceloop/compliance_exp.py @@ -0,0 +1,106 @@ +""" +Content Compliance Evaluators Experiment + +This example demonstrates Traceloop's content compliance evaluators: +- Profanity Detection: Flags inappropriate language +- Toxicity Detection: Identifies toxic or harmful content +- Sexism Detection: Identifies sexist language or bias + +These evaluators help ensure AI-generated content is compliant with community guidelines. +""" + +import asyncio +import os +from openai import AsyncOpenAI +from traceloop.sdk import Traceloop +from traceloop.sdk.evaluator import EvaluatorMadeByTraceloop + +# Initialize Traceloop +client = Traceloop.init() + + +async def generate_response(prompt: str, temperature: float = 0.7) -> str: + """Generate a response using OpenAI""" + openai_client = AsyncOpenAI(api_key=os.getenv("OPENAI_API_KEY")) + + response = await openai_client.chat.completions.create( + model="gpt-3.5-turbo", + messages=[{"role": "user", "content": prompt}], + temperature=temperature, + max_tokens=200, + ) + + return response.choices[0].message.content + + +async def content_safety_task(row): + """ + Task function that generates content to be evaluated for safety. + Returns text that will be checked for profanity and toxicity. + """ + prompt = row.get("question", "") + + # Generate response + response = await generate_response(prompt) + + # Return data for safety evaluation + return { + "text": response, + "prompt": prompt, + } + + +async def run_content_compliance_experiment(): + """ + Run experiment with content compliance evaluators. + + This experiment evaluates: + 1. Profanity Detection - Flags inappropriate language + 2. Toxicity Detection - Identifies harmful or toxic content + """ + + print("\n" + "="*80) + print("CONTENT COMPLIANCE EVALUATORS EXPERIMENT") + print("="*80 + "\n") + + print("This experiment tests content compliance:\n") + print("1. Profanity Detection - Identifies inappropriate language") + print("2. 
Toxicity Detection - Detects harmful, aggressive, or toxic content") + print("\n" + "-"*80 + "\n") + + # Configure content compliance evaluators + evaluators = [ + EvaluatorMadeByTraceloop.profanity_detector(), + EvaluatorMadeByTraceloop.toxicity_detector(threshold=0.7), + EvaluatorMadeByTraceloop.sexism_detector(threshold=0.7), + ] + + print("Running experiment with content safety evaluators:") + for evaluator in evaluators: + config_str = ", ".join(f"{k}={v}" for k, v in evaluator.config.items() if k != "description") + print(f" - {evaluator.slug}") + if config_str: + print(f" Config: {config_str}") + + print("\n" + "-"*80 + "\n") + + # Run the experiment + results, errors = await client.experiment.run( + dataset_slug="content-compliance", # Set a dataset slug that exists in the traceloop platform + dataset_version="v1", + task=content_safety_task, + evaluators=evaluators, + experiment_slug="content-compliance-exp", + stop_on_error=False, + wait_for_results=True, + ) + + print("\n" + "="*80) + print("Content compliance experiment completed!") + print("="*80 + "\n") + + +if __name__ == "__main__": + print("\nContent Compliance Evaluators Experiment\n") + + asyncio.run(run_content_compliance_experiment()) diff --git a/packages/sample-app/sample_app/experiment/made_by_traceloop/correctness_exp.py b/packages/sample-app/sample_app/experiment/made_by_traceloop/correctness_exp.py new file mode 100644 index 0000000000..b99038974e --- /dev/null +++ b/packages/sample-app/sample_app/experiment/made_by_traceloop/correctness_exp.py @@ -0,0 +1,107 @@ +""" +Quality Evaluators Experiment + +This example demonstrates Traceloop's correctness evaluators: +- Answer Relevancy: Verifies responses address the query +- Faithfulness: Detects hallucinations and verifies facts + +These evaluators help ensure your AI applications provide accurate, +relevant, and faithful responses. +""" + +import asyncio +import os +from openai import AsyncOpenAI +from traceloop.sdk import Traceloop +from traceloop.sdk.evaluator import EvaluatorMadeByTraceloop + +# Initialize Traceloop +client = Traceloop.init() + + +async def generate_response(prompt: str, context: str = None) -> str: + """Generate a response using OpenAI""" + openai_client = AsyncOpenAI(api_key=os.getenv("OPENAI_API_KEY")) + + messages = [{"role": "user", "content": prompt}] + if context: + messages.insert(0, {"role": "system", "content": f"Context: {context}"}) + + response = await openai_client.chat.completions.create( + model="gpt-3.5-turbo", + messages=messages, + temperature=0.7, + max_tokens=200, + ) + + return response.choices[0].message.content + + +async def quality_task(row): + """ + Task function that processes questions with context. + Returns data that will be evaluated for quality and faithfulness. + """ + question = row.get("question", "This is a demo question") + context = row.get("context", "This is a demo context") + # Generate response + completion = await generate_response(question, context) + + # Return data for evaluation + return { + "question": question, + "answer": completion, + "completion": completion, + "context": context, + } + + +async def run_correctness_experiment(): + """ + Run experiment with correctness evaluators. + + This experiment will evaluate responses for: + 1. Answer Relevancy - Does the answer address the question? + 2. Faithfulness - Is the answer faithful to the provided context? 
+ """ + + print("\n" + "="*80) + print("CORRECTNESS EVALUATORS EXPERIMENT") + print("="*80 + "\n") + + print("This experiment will test two critical quality evaluators:\n") + print("1. Answer Relevancy - Verifies the response addresses the query") + print("2. Faithfulness - Detects hallucinations and verifies factual accuracy") + print("\n" + "-"*80 + "\n") + + # Configure correctness evaluators + evaluators = [ + EvaluatorMadeByTraceloop.answer_relevancy(), + EvaluatorMadeByTraceloop.faithfulness(), + ] + + print("Running experiment with evaluators:") + for evaluator in evaluators: + print(f" - {evaluator.slug}") + + print("\n" + "-"*80 + "\n") + + # Run the experiment + results, errors = await client.experiment.run( + dataset_slug="correctness", # Set a dataset slug that exists in the traceloop platform + dataset_version="v1", + task=quality_task, + evaluators=evaluators, + experiment_slug="correctness-evaluators-exp", + stop_on_error=False, + wait_for_results=True, + ) + + print("\n" + "="*80) + print("Correctness experiment completed!") + print("="*80 + "\n") + +if __name__ == "__main__": + print("\nCorrectness Evaluators Experiment\n") + + asyncio.run(run_correctness_experiment()) diff --git a/packages/sample-app/sample_app/experiment/made_by_traceloop/formatting_exp.py b/packages/sample-app/sample_app/experiment/made_by_traceloop/formatting_exp.py new file mode 100644 index 0000000000..57aebc0d5d --- /dev/null +++ b/packages/sample-app/sample_app/experiment/made_by_traceloop/formatting_exp.py @@ -0,0 +1,197 @@ +""" +Validation Evaluators Experiment + +This example demonstrates Traceloop's formatting evaluators: +- JSON Validation: Validates JSON format and optional schema compliance +- SQL Validation: Validates SQL query syntax +- Regex Validation: Validates text matches regex patterns +- Placeholder Regex: Validates dynamic text with placeholders + +These evaluators are essential for structured output validation, +ensuring AI-generated code, queries, and formatted data are correct. +""" + +import asyncio +import os +from openai import AsyncOpenAI +from traceloop.sdk import Traceloop +from traceloop.sdk.evaluator import EvaluatorMadeByTraceloop + +# Initialize Traceloop +client = Traceloop.init() + + +async def generate_structured_output(prompt: str, output_format: str) -> str: + """ + Generate structured output (JSON, SQL, etc.) using OpenAI. + + Args: + prompt: The user's request + output_format: Expected format (json, sql, regex, etc.) + """ + openai_client = AsyncOpenAI(api_key=os.getenv("OPENAI_API_KEY")) + + system_prompts = { + "json": "You are a helpful assistant that responds ONLY with valid JSON. No explanation, just JSON.", + "sql": "You are a SQL expert. Generate ONLY valid SQL queries. No explanation, just the SQL query.", + "regex": "You are a regex expert. Generate ONLY the regex pattern. No explanation, just the pattern.", + "text": "You are a helpful assistant. Generate the requested text content.", + } + + response = await openai_client.chat.completions.create( + model="gpt-3.5-turbo", + messages=[ + {"role": "system", "content": system_prompts.get(output_format, system_prompts["text"])}, + {"role": "user", "content": prompt} + ], + temperature=0.3, # Lower temperature for more structured outputs + max_tokens=300, + ) + + return response.choices[0].message.content.strip() + + +async def validation_task(row): + """ + Task function that generates structured outputs for validation. + Returns different types of structured data based on the task type. 
+ """ + prompt = row.get("question", "") + task_type = row.get("task_type", "json") # json, sql, regex, text + placeholder_value = row.get("placeholder_value", "This is a demo placeholder value") + + # Generate the appropriate output + output = await generate_structured_output(prompt, task_type) + + # Return data for validation evaluators + return { + "text": output, + "placeholder_value": placeholder_value, + } + + +async def run_formatting_experiment(): + """ + Run experiment with formatting evaluators. + + This experiment validates: + 1. JSON Validation - Proper JSON format and schema + 2. SQL Validation - Valid SQL syntax + 3. Regex Validation - Pattern matching + 4. Placeholder Regex - Dynamic pattern validation + """ + + print("\n" + "="*80) + print("VALIDATION EVALUATORS EXPERIMENT") + print("="*80 + "\n") + + print("This experiment validates structured AI outputs:\n") + print("1. JSON Validation - Ensures valid JSON format") + print("2. SQL Validation - Verifies SQL query syntax") + print("3. Regex Validation - Validates pattern matching") + print("4. Placeholder Regex - Dynamic text validation") + print("\n" + "-"*80 + "\n") + + # Configure validation evaluators + json_schema = '''{ + "type": "object", + "properties": { + "name": {"type": "string"}, + "age": {"type": "number"}, + "email": {"type": "string"} + }, + "required": ["name", "email"] + }''' + + evaluators = [ + EvaluatorMadeByTraceloop.json_validator( + enable_schema_validation=True, + schema_string=json_schema + ), + EvaluatorMadeByTraceloop.sql_validator(), + EvaluatorMadeByTraceloop.regex_validator( + regex=r"^\d{3}-\d{2}-\d{4}$", # SSN format + should_match=True, + case_sensitive=True + ), + EvaluatorMadeByTraceloop.placeholder_regex( + regex=r"^user_.*", + placeholder_name="username", + should_match=True + ), + ] + + print("\n" + "-"*80 + "\n") + + # Run the experiment + results, errors = await client.experiment.run( + dataset_slug="formatting", # Set a dataset slug that exists in the traceloop platform + dataset_version="v1", + task=validation_task, + evaluators=evaluators, + experiment_slug="formatting-evaluators-exp", + stop_on_error=False, + wait_for_results=True, + ) + + print("\n" + "="*80) + print("Formatting experiment completed!") + print("="*80 + "\n") + + +async def run_validation_examples(): + """ + Run live examples showing each validator in action. 
+ """ + print("\n" + "="*80) + print("LIVE VALIDATION EXAMPLES") + print("="*80 + "\n") + + # Example 1: JSON Validation + print("Example 1: JSON Validation\n") + json_prompts = [ + "Generate a user profile JSON with name, age, and email for John Doe, age 30", + "Create a product JSON with id, title, and price", + ] + + print("Generating JSON outputs...") + for prompt in json_prompts[:1]: # Just show one for demo + output = await generate_structured_output(prompt, "json") + print(f"Prompt: {prompt}") + print(f"Output: {output}\n") + + # Example 2: SQL Validation + print("\n" + "-"*80 + "\n") + print("Example 2: SQL Validation\n") + sql_prompts = [ + "Write a SQL query to select all users older than 18", + "Create a query to count active customers", + ] + + print("Generating SQL queries...") + for prompt in sql_prompts[:1]: # Just show one for demo + output = await generate_structured_output(prompt, "sql") + print(f"Prompt: {prompt}") + print(f"Output: {output}\n") + + # Example 3: Regex patterns + print("\n" + "-"*80 + "\n") + print("Example 3: Pattern Validation\n") + + patterns = [ + ("Email validation", r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$"), + ("Phone number", r"^\(\d{3}\) \d{3}-\d{4}$"), + ("URL validation", r"^https?://[^\s/$.?#].[^\s]*$"), + ] + + print("Common validation patterns:") + for name, pattern in patterns: + print(f" {name}: {pattern}") + + print("\n" + "="*80 + "\n") + + +if __name__ == "__main__": + print("\nFormatting Evaluators Experiment\n") + + asyncio.run(run_formatting_experiment()) diff --git a/packages/sample-app/sample_app/experiment/made_by_traceloop/quality_exp.py b/packages/sample-app/sample_app/experiment/made_by_traceloop/quality_exp.py new file mode 100644 index 0000000000..6496753072 --- /dev/null +++ b/packages/sample-app/sample_app/experiment/made_by_traceloop/quality_exp.py @@ -0,0 +1,122 @@ +""" +Advanced Quality Evaluators Experiment + +This example demonstrates Traceloop's advanced quality evaluators: +- Measure Perplexity: Measure text perplexity from logprobs +- Agent Goal Accuracy: Validate agent goal achievement +- Semantic Similarity: Measure semantic similarity between texts +- Topic Adherence: Validate topic adherence + +These evaluators help analyze response quality, goal achievement, +semantic correctness, and topic consistency. +""" + +import asyncio +import os +from openai import AsyncOpenAI +from traceloop.sdk import Traceloop +from traceloop.sdk.evaluator import EvaluatorMadeByTraceloop + +# Initialize Traceloop +client = Traceloop.init() + + +async def generate_response(prompt: str, max_tokens: int = 300) -> str: + """Generate a response using OpenAI with logprobs for perplexity measurement""" + openai_client = AsyncOpenAI(api_key=os.getenv("OPENAI_API_KEY")) + + response = await openai_client.chat.completions.create( + model="gpt-3.5-turbo", + messages=[{"role": "user", "content": prompt}], + temperature=0.7, + max_tokens=max_tokens, + logprobs=True, # Enable logprobs for perplexity calculation + ) + + return response.choices[0].message.content + + +async def advanced_quality_task(row): + """ + Task function that generates responses and provides data for advanced quality evaluation. 
+ + Expected dataset fields: + - question: The input question/prompt + - expected_goal: The expected outcome or goal (for agent goal accuracy) + - reference_answer: Reference answer for semantic similarity + - topic: The expected topic for adherence checking + """ + question = row.get("question", "") + reference_answer = row.get("reference_answer", "This is a demo reference answer") + topic = row.get("topic", "general knowledge") + + # Generate response + completion = await generate_response(question) + + # Return data for evaluation + # Different evaluators expect different fields + return { + "logprobs": completion, # For perplexity + "question": question, + "completion": completion, # Standard completion field + "reference": reference_answer, # For semantic similarity comparison + "reference_topics": topic, # For topic adherence + } + + +async def run_advanced_quality_experiment(): + """ + Run experiment with advanced quality evaluators. + + This experiment measures: + 1. Perplexity - Text fluency and predictability from logprobs + 2. Agent Goal Accuracy - Whether the response achieves its intended goal + 3. Semantic Similarity - Semantic alignment with reference answer + 4. Topic Adherence - Whether the response stays on topic + """ + + print("\n" + "="*80) + print("ADVANCED QUALITY EVALUATORS EXPERIMENT") + print("="*80 + "\n") + + print("This experiment measures response quality across multiple dimensions:\n") + print("1. Perplexity - Measures text fluency and predictability") + print("2. Agent Goal Accuracy - Validates goal achievement") + print("3. Semantic Similarity - Compares semantic meaning with reference") + print("4. Topic Adherence - Ensures response stays on topic") + print("\n" + "-"*80 + "\n") + + # Configure advanced quality evaluators + evaluators = [ + EvaluatorMadeByTraceloop.perplexity(), + EvaluatorMadeByTraceloop.agent_goal_accuracy(), + EvaluatorMadeByTraceloop.semantic_similarity(), + EvaluatorMadeByTraceloop.topic_adherence(), + ] + + print("Running experiment with advanced quality evaluators:") + for evaluator in evaluators: + print(f" - {evaluator.slug}") + + print("\n" + "-"*80 + "\n") + + # Run the experiment + results, errors = await client.experiment.run( + dataset_slug="quality", # Set a dataset slug that exists in the traceloop platform + dataset_version="v1", + task=advanced_quality_task, + evaluators=evaluators, + experiment_slug="advanced-quality-exp", + stop_on_error=False, + wait_for_results=True, + ) + + print("\n" + "="*80) + print("Advanced quality experiment completed!") + print("="*80 + "\n") + + +if __name__ == "__main__": + print("\n🚀 Advanced Quality Evaluators Experiment\n") + + asyncio.run(run_advanced_quality_experiment()) diff --git a/packages/sample-app/sample_app/experiment/made_by_traceloop/security_exp.py b/packages/sample-app/sample_app/experiment/made_by_traceloop/security_exp.py new file mode 100644 index 0000000000..bf5e0eefd3 --- /dev/null +++ b/packages/sample-app/sample_app/experiment/made_by_traceloop/security_exp.py @@ -0,0 +1,103 @@ +""" +Security Evaluators Experiment + +This example demonstrates Traceloop's security evaluators: +- PII Detector: Identifies personal information exposure +- Secrets Detector: Monitors for credential and key leaks +- Prompt Injection: Detects prompt injection attempts + +These evaluators help ensure your AI applications don't leak sensitive data +or fall victim to prompt injection attacks. 
+""" + +import asyncio +import os +from openai import AsyncOpenAI +from traceloop.sdk import Traceloop +from traceloop.sdk.evaluator import EvaluatorMadeByTraceloop + +# Initialize Traceloop +client = Traceloop.init() + + +async def generate_response(prompt: str) -> str: + """Generate a response using OpenAI""" + openai_client = AsyncOpenAI(api_key=os.getenv("OPENAI_API_KEY")) + + response = await openai_client.chat.completions.create( + model="gpt-3.5-turbo", + messages=[{"role": "user", "content": prompt}], + temperature=0.7, + max_tokens=200, + ) + + return response.choices[0].message.content + + +async def security_task(row): + """ + Task function that processes user queries. + Returns text that will be evaluated for security issues. + """ + user_query = row.get("query", "") + + # Generate response + response = await generate_response(user_query) + + # Return data for evaluation + return { + "text": response, # The text to check for PII, secrets, and prompt injection + "query": user_query, + } + + +async def run_security_experiment(): + """ + Run experiment with security evaluators. + + This experiment will evaluate responses for: + 1. PII (Personal Identifiable Information) + 2. Secrets (API keys, passwords, tokens) + 3. Prompt Injection attempts + """ + + print("\n" + "="*80) + print("SECURITY EVALUATORS EXPERIMENT") + print("="*80 + "\n") + + print("This experiment will test three critical security evaluators:\n") + print("1. PII Detector - Identifies personal information (names, emails, SSN, etc.)") + print("2. Secrets Detector - Finds API keys, passwords, and credentials") + print("3. Prompt Injection - Detects attempts to manipulate the AI system") + print("\n" + "-"*80 + "\n") + + # Configure security evaluators + evaluators = [ + EvaluatorMadeByTraceloop.pii_detector(probability_threshold=0.7,), + EvaluatorMadeByTraceloop.secrets_detector(), + EvaluatorMadeByTraceloop.prompt_injection(threshold=0.6), + ] + + print("\n" + "-"*80 + "\n") + + # Run the experiment + results, errors = await client.experiment.run( + dataset_slug="security", # Set a dataset slug that exists in the traceloop platform + dataset_version="v1", + task=security_task, + evaluators=evaluators, + experiment_slug="security-evaluators-exp", + stop_on_error=False, + wait_for_results=True, + ) + + print("\n" + "="*80) + print("Security experiment completed!") + print("="*80 + "\n") + + +if __name__ == "__main__": + print("\nSecurity Evaluators Experiment\n") + + # To run with actual dataset, uncomment: + asyncio.run(run_security_experiment()) diff --git a/packages/sample-app/sample_app/experiment/made_by_traceloop/style_exp.py b/packages/sample-app/sample_app/experiment/made_by_traceloop/style_exp.py new file mode 100644 index 0000000000..ba64e74af1 --- /dev/null +++ b/packages/sample-app/sample_app/experiment/made_by_traceloop/style_exp.py @@ -0,0 +1,108 @@ +""" +Metrics Evaluators Experiment + +This example demonstrates Traceloop's style evaluators: +- Character Count: Counts the number of characters in a text +- Word Count: Counts the number of words in a text +- Character Count Ratio: Compares the length of two texts +- Word Count Ratio: Compares the number of words between two texts +""" + +import asyncio +import os +from openai import AsyncOpenAI +from traceloop.sdk import Traceloop +from traceloop.sdk.evaluator import EvaluatorMadeByTraceloop + +# Initialize Traceloop +client = Traceloop.init() + + +async def generate_response(prompt: str, max_tokens: int = 200) -> str: + """Generate a response using 
OpenAI""" + openai_client = AsyncOpenAI(api_key=os.getenv("OPENAI_API_KEY")) + + response = await openai_client.chat.completions.create( + model="gpt-3.5-turbo", + messages=[{"role": "user", "content": prompt}], + temperature=0.7, + max_tokens=max_tokens, + ) + + return response.choices[0].message.content + + +async def style_task(row): + """ + Task function that generates responses and measures their metrics. + """ + question = row.get("question", "") + reference_answer = row.get("reference_answer", "This is a demo reference answer") + + # Generate response + completion = await generate_response(question) + + # Return data for style evaluation + return { + "text": completion, + "numerator_text": completion, + "denominator_text": reference_answer, + } + + +async def run_style_experiment(): + """ + Run experiment with style evaluators. + + This experiment measures: + 1. Character Count - Total characters in response + 2. Word Count - Total words in response + 3. Character Count Ratio - Response length vs reference + 4. Word Count Ratio - Response verbosity vs reference + """ + + print("\n" + "="*80) + print("METRICS EVALUATORS EXPERIMENT") + print("="*80 + "\n") + + print("This experiment measures response length and verbosity:\n") + print("1. Character Count - Total characters in the response") + print("2. Word Count - Total words in the response") + print("3. Character Count Ratio - Response length compared to reference") + print("4. Word Count Ratio - Response verbosity compared to reference") + print("\n" + "-"*80 + "\n") + + # Configure metrics evaluators + evaluators = [ + EvaluatorMadeByTraceloop.char_count(), + EvaluatorMadeByTraceloop.word_count(), + EvaluatorMadeByTraceloop.char_count_ratio(), + EvaluatorMadeByTraceloop.word_count_ratio(), + ] + + print("Running experiment with metrics evaluators:") + for evaluator in evaluators: + print(f" - {evaluator.slug}") + + print("\n" + "-"*80 + "\n") + + # Run the experiment + results, errors = await client.experiment.run( + dataset_slug="style", # Set a dataset slug that exists in the traceloop platform + dataset_version="v1", + task=style_task, + evaluators=evaluators, + experiment_slug="style-evaluators-exp", + stop_on_error=False, + wait_for_results=True, + ) + + print("\n" + "="*80) + print("Style experiment completed!") + print("="*80 + "\n") + + +if __name__ == "__main__": + print("\nStyle Evaluators Experiment\n") + + asyncio.run(run_style_experiment()) diff --git a/packages/sample-app/sample_app/experiment/run_research_experiment.py b/packages/sample-app/sample_app/experiment/run_research_experiment.py index 974f690d34..7ee891c034 100644 --- a/packages/sample-app/sample_app/experiment/run_research_experiment.py +++ b/packages/sample-app/sample_app/experiment/run_research_experiment.py @@ -49,7 +49,7 @@ async def research_task(row): return { "completion": answer, "question": query, - "sentence": answer + "text": answer } diff --git a/packages/traceloop-sdk/tests/conftest.py b/packages/traceloop-sdk/tests/conftest.py index 0daa90813b..e2c6dc54d8 100644 --- a/packages/traceloop-sdk/tests/conftest.py +++ b/packages/traceloop-sdk/tests/conftest.py @@ -231,3 +231,9 @@ def datasets(): http = HTTPClient(base_url=base_url, api_key=api_key, version="1.0.0") return Datasets(http) + + +@pytest.fixture(scope="session") +def anyio_backend(): + """Force anyio to use only asyncio backend.""" + return "asyncio" diff --git a/packages/traceloop-sdk/tests/evaluator/test_evaluator.py b/packages/traceloop-sdk/tests/evaluator/test_evaluator.py new file 
mode 100644 index 0000000000..fd07d16e77 --- /dev/null +++ b/packages/traceloop-sdk/tests/evaluator/test_evaluator.py @@ -0,0 +1,220 @@ +import pytest +from traceloop.sdk.evaluator.evaluator import validate_task_output +from traceloop.sdk.evaluator.config import EvaluatorDetails + + +class TestValidateTaskOutput: + """Tests for validate_task_output function""" + + def test_validate_task_output_with_no_evaluators(self): + """Test that validation passes when no evaluators are provided""" + task_output = {"text": "hello"} + evaluators = [] + + # Should not raise any exception + validate_task_output(task_output, evaluators) + + def test_validate_task_output_with_evaluators_no_required_fields(self): + """Test that validation passes when evaluators have no required fields""" + task_output = {"text": "hello", "score": 0.9} + evaluators = [ + EvaluatorDetails(slug="evaluator1"), + EvaluatorDetails(slug="evaluator2", config={"threshold": 0.5}), + ] + + # Should not raise any exception + validate_task_output(task_output, evaluators) + + def test_validate_task_output_with_valid_output(self): + """Test that validation passes when all required fields are present""" + task_output = {"text": "hello", "prompt": "say hello", "response": "world"} + evaluators = [ + EvaluatorDetails(slug="pii-detector", required_input_fields=["text"]), + EvaluatorDetails( + slug="relevance-checker", + required_input_fields=["prompt", "response"], + ), + ] + + # Should not raise any exception + validate_task_output(task_output, evaluators) + + def test_validate_task_output_missing_single_field(self): + """Test that validation fails when a single required field is missing""" + task_output = {"prompt": "say hello"} + evaluators = [ + EvaluatorDetails(slug="pii-detector", required_input_fields=["text"]), + ] + + with pytest.raises(ValueError) as exc_info: + validate_task_output(task_output, evaluators) + + error_message = str(exc_info.value) + assert "Task output missing required fields for evaluators:" in error_message + assert "pii-detector requires: ['text']" in error_message + assert "Task output contains: ['prompt']" in error_message + assert ( + "Hint: Update your task function to return a dictionary " + "with the required fields." 
+ ) in error_message + + def test_validate_task_output_missing_multiple_fields_single_evaluator(self): + """Test that validation fails when multiple fields are missing for one evaluator""" + task_output = {"score": 0.9} + evaluators = [ + EvaluatorDetails( + slug="relevance-checker", + required_input_fields=["prompt", "response", "context"], + ), + ] + + with pytest.raises(ValueError) as exc_info: + validate_task_output(task_output, evaluators) + + error_message = str(exc_info.value) + assert "relevance-checker requires:" in error_message + assert "'context'" in error_message + assert "'prompt'" in error_message + assert "'response'" in error_message + + def test_validate_task_output_missing_fields_multiple_evaluators(self): + """Test that validation fails when fields are missing for multiple evaluators""" + task_output = {"score": 0.9} + evaluators = [ + EvaluatorDetails(slug="pii-detector", required_input_fields=["text"]), + EvaluatorDetails( + slug="relevance-checker", + required_input_fields=["prompt", "response"], + ), + EvaluatorDetails(slug="tone-analyzer", required_input_fields=["text"]), + ] + + with pytest.raises(ValueError) as exc_info: + validate_task_output(task_output, evaluators) + + error_message = str(exc_info.value) + assert "pii-detector requires:" in error_message + assert "relevance-checker requires:" in error_message + assert "tone-analyzer requires:" in error_message + assert "'text'" in error_message + assert "'prompt'" in error_message + assert "'response'" in error_message + + def test_validate_task_output_partial_match(self): + """Test validation when some evaluators pass and some fail""" + task_output = {"text": "hello world"} + evaluators = [ + EvaluatorDetails(slug="pii-detector", required_input_fields=["text"]), + EvaluatorDetails( + slug="relevance-checker", + required_input_fields=["prompt", "response"], + ), + ] + + with pytest.raises(ValueError) as exc_info: + validate_task_output(task_output, evaluators) + + error_message = str(exc_info.value) + # Should only mention the failing evaluator + assert "relevance-checker requires:" in error_message + assert "pii-detector requires:" not in error_message + + def test_validate_task_output_empty_task_output(self): + """Test validation with empty task output""" + task_output = {} + evaluators = [ + EvaluatorDetails(slug="pii-detector", required_input_fields=["text"]), + ] + + with pytest.raises(ValueError) as exc_info: + validate_task_output(task_output, evaluators) + + error_message = str(exc_info.value) + assert "Task output contains: []" in error_message + + def test_validate_task_output_with_extra_fields(self): + """Test that validation passes when task output has extra fields""" + task_output = { + "text": "hello", + "prompt": "say hello", + "response": "world", + "extra_field": "value", + "another_field": 123, + } + evaluators = [ + EvaluatorDetails(slug="pii-detector", required_input_fields=["text"]), + ] + + # Should not raise any exception - extra fields are allowed + validate_task_output(task_output, evaluators) + + def test_validate_task_output_case_sensitive_field_names(self): + """Test that field name matching is case-sensitive""" + task_output = {"Text": "hello"} + evaluators = [ + EvaluatorDetails(slug="pii-detector", required_input_fields=["text"]), + ] + + with pytest.raises(ValueError) as exc_info: + validate_task_output(task_output, evaluators) + + error_message = str(exc_info.value) + assert "pii-detector requires: ['text']" in error_message + assert "Task output contains: ['Text']" in 
error_message + + def test_validate_task_output_with_evaluator_config(self): + """Test validation with evaluators that have config""" + task_output = {"text": "hello world"} + evaluators = [ + EvaluatorDetails( + slug="pii-detector", + version="v2", + config={"probability_threshold": 0.8, "mode": "strict"}, + required_input_fields=["text"], + ), + ] + + # Should not raise any exception - config shouldn't affect validation + validate_task_output(task_output, evaluators) + + def test_validate_task_output_mixed_evaluators(self): + """Test validation with a mix of evaluators with and without required fields""" + task_output = {"text": "hello", "score": 0.9} + evaluators = [ + EvaluatorDetails(slug="evaluator-no-requirements"), + EvaluatorDetails(slug="pii-detector", required_input_fields=["text"]), + EvaluatorDetails(slug="another-no-requirements", config={"key": "value"}), + EvaluatorDetails( + slug="relevance-checker", + required_input_fields=["prompt"], + ), + ] + + with pytest.raises(ValueError) as exc_info: + validate_task_output(task_output, evaluators) + + error_message = str(exc_info.value) + # Should only mention failing evaluator + assert "relevance-checker requires: ['prompt']" in error_message + assert "evaluator-no-requirements" not in error_message + assert "pii-detector" not in error_message or "pii-detector requires:" not in error_message + + def test_validate_task_output_duplicate_required_fields(self): + """Test validation when multiple evaluators require the same field""" + task_output = {"score": 0.9} + evaluators = [ + EvaluatorDetails(slug="pii-detector", required_input_fields=["text"]), + EvaluatorDetails(slug="tone-analyzer", required_input_fields=["text"]), + EvaluatorDetails( + slug="sentiment-analyzer", + required_input_fields=["text", "language"], + ), + ] + + with pytest.raises(ValueError) as exc_info: + validate_task_output(task_output, evaluators) + + error_message = str(exc_info.value) + assert "pii-detector requires:" in error_message + assert "tone-analyzer requires:" in error_message + assert "sentiment-analyzer requires:" in error_message diff --git a/packages/traceloop-sdk/tests/experiment/test_experiment.py b/packages/traceloop-sdk/tests/experiment/test_experiment.py index e9927f4578..2e2ccd2aeb 100644 --- a/packages/traceloop-sdk/tests/experiment/test_experiment.py +++ b/packages/traceloop-sdk/tests/experiment/test_experiment.py @@ -1,7 +1,8 @@ import pytest -from unittest.mock import Mock +from unittest.mock import Mock, AsyncMock from traceloop.sdk.experiment.experiment import Experiment from traceloop.sdk.client.http import HTTPClient +from traceloop.sdk.evaluator.config import EvaluatorDetails @pytest.fixture @@ -34,12 +35,15 @@ def test_parse_jsonl_to_rows_valid_data(experiment): def test_parse_jsonl_to_rows_with_invalid_json_lines(experiment): """Test parsing JSONL data with some invalid JSON lines""" - jsonl_data = """{"columns":{"name":{"name":"Name","type":"string"},"age":{"name":"Age","type":"number"}}} -{"name": "John", "age": 30} -invalid json line -{"name": "Alice", "age": 25} -{broken json -{"name": "Bob", "age": 35}""" + jsonl_data = ( + '{"columns":{"name":{"name":"Name","type":"string"},' + '"age":{"name":"Age","type":"number"}}}\n' + '{"name": "John", "age": 30}\n' + 'invalid json line\n' + '{"name": "Alice", "age": 25}\n' + '{broken json\n' + '{"name": "Bob", "age": 35}' + ) result = experiment._parse_jsonl_to_rows(jsonl_data) @@ -73,12 +77,14 @@ def test_parse_jsonl_to_rows_only_header(experiment): def 
test_parse_jsonl_to_rows_with_empty_lines(experiment): """Test parsing JSONL data with empty lines""" - jsonl_data = """{"columns":{"name":{"name":"Name","type":"string"},"age":{"name":"Age","type":"number"}}} -{"name": "John", "age": 30} - -{"name": "Alice", "age": 25} - -""" + jsonl_data = ( + '{"columns":{"name":{"name":"Name","type":"string"},' + '"age":{"name":"Age","type":"number"}}}\n' + '{"name": "John", "age": 30}\n' + '\n' + '{"name": "Alice", "age": 25}\n' + '\n' + ) result = experiment._parse_jsonl_to_rows(jsonl_data) @@ -88,9 +94,14 @@ def test_parse_jsonl_to_rows_with_empty_lines(experiment): def test_parse_jsonl_to_rows_complex_json_objects(experiment): """Test parsing JSONL data with complex nested objects""" - jsonl_data = """{"columns":{"user":{"name":"User","type":"object"},"active":{"name":"Active","type":"boolean"}}} -{"user": {"name": "John", "details": {"age": 30, "location": ["NY", "US"]}}, "active": true} -{"user": {"name": "Alice", "details": {"age": 25, "location": ["CA", "US"]}}, "active": false}""" + jsonl_data = ( + '{"columns":{"user":{"name":"User","type":"object"},' + '"active":{"name":"Active","type":"boolean"}}}\n' + '{"user": {"name": "John", "details": {"age": 30, "location": ["NY", "US"]}}, ' + '"active": true}\n' + '{"user": {"name": "Alice", "details": {"age": 25, "location": ["CA", "US"]}}, ' + '"active": false}' + ) result = experiment._parse_jsonl_to_rows(jsonl_data) @@ -105,3 +116,253 @@ def test_parse_jsonl_to_rows_complex_json_objects(experiment): }, ] assert result == expected + + +class TestRunLocallyValidation: + """Tests for _run_locally to ensure validation failures are handled correctly""" + + @pytest.mark.anyio(backends=["asyncio"]) + async def test_run_locally_breaks_on_validation_failure_with_stop_on_error(self): + """Test that _run_locally stops processing when validation fails and stop_on_error=True""" + mock_http_client = Mock(spec=HTTPClient) + mock_http_client.base_url = "https://api.example.com" + mock_async_http_client = Mock() + + # Mock the _init_experiment method + mock_experiment_response = Mock() + mock_experiment_response.run.id = "run-123" + mock_experiment_response.experiment.id = "exp-456" + + experiment = Experiment(mock_http_client, mock_async_http_client, "test-exp") + experiment._init_experiment = Mock(return_value=mock_experiment_response) + + # Create a task that returns output missing required fields + async def task_missing_fields(row): + return {"wrong_field": "value"} + + # Define evaluator with required fields + evaluators = [ + EvaluatorDetails(slug="pii-detector", required_input_fields=["text"]), + ] + + # Mock dataset response with multiple rows + experiment._datasets.get_version_jsonl = Mock( + return_value=( + '{"columns":{}}\n{"input": "test1"}\n' + '{"input": "test2"}\n{"input": "test3"}' + ) + ) + + # Run with stop_on_error=True - should stop after first error + results, errors = await experiment._run_locally( + task=task_missing_fields, + evaluators=evaluators, + dataset_slug="test-dataset", + dataset_version="v1", + stop_on_error=True, + ) + + # Should have at least one error and should have broken early + assert len(errors) >= 1 + assert "Task output missing required fields for evaluators:" in errors[0] + assert "pii-detector requires: ['text']" in errors[0] + # With stop_on_error=True, it should break and not process all 3 rows + assert len(results) + len(errors) <= 3 + + @pytest.mark.anyio(backends=["asyncio"]) + async def 
test_run_locally_continues_on_validation_failure_without_stop_on_error(self): + """Test that _run_locally captures error when validation fails and stop_on_error=False""" + mock_http_client = Mock(spec=HTTPClient) + mock_http_client.base_url = "https://api.example.com" + mock_async_http_client = Mock() + + # Mock the _init_experiment method + mock_experiment_response = Mock() + mock_experiment_response.run.id = "run-123" + mock_experiment_response.experiment.id = "exp-456" + + experiment = Experiment(mock_http_client, mock_async_http_client, "test-exp") + experiment._init_experiment = Mock(return_value=mock_experiment_response) + + # Create a task that returns output missing required fields + async def task_missing_fields(row): + return {"wrong_field": "value"} + + # Define evaluator with required fields + evaluators = [ + EvaluatorDetails(slug="pii-detector", required_input_fields=["text"]), + ] + + # Mock dataset response with single row + experiment._datasets.get_version_jsonl = Mock( + return_value='{"columns":{}}\n{"input": "test"}' + ) + + # Run with stop_on_error=False - should capture error but not raise + results, errors = await experiment._run_locally( + task=task_missing_fields, + evaluators=evaluators, + dataset_slug="test-dataset", + dataset_version="v1", + stop_on_error=False, + ) + + # Should have errors, no successful results + assert len(errors) == 1 + assert len(results) == 0 + assert "Task output missing required fields for evaluators:" in errors[0] + assert "pii-detector requires: ['text']" in errors[0] + + @pytest.mark.anyio(backends=["asyncio"]) + async def test_run_locally_succeeds_with_valid_output(self): + """Test that _run_locally succeeds when task output matches required fields""" + mock_http_client = Mock(spec=HTTPClient) + mock_http_client.base_url = "https://api.example.com" + mock_async_http_client = Mock() + + # Mock the _init_experiment method + mock_experiment_response = Mock() + mock_experiment_response.run.id = "run-123" + mock_experiment_response.experiment.id = "exp-456" + + experiment = Experiment(mock_http_client, mock_async_http_client, "test-exp") + experiment._init_experiment = Mock(return_value=mock_experiment_response) + + # Mock _create_task + mock_task_response = Mock() + mock_task_response.id = "task-789" + experiment._create_task = Mock(return_value=mock_task_response) + + # Create a task that returns valid output + async def task_valid_output(row): + return {"text": "hello world", "score": 0.9} + + # Define evaluator with required fields + evaluators = [ + EvaluatorDetails(slug="pii-detector", required_input_fields=["text"]), + ] + + # Mock dataset response + experiment._datasets.get_version_jsonl = Mock( + return_value='{"columns":{}}\n{"input": "test"}' + ) + + # Mock evaluator execution + mock_eval_result = Mock() + mock_eval_result.result = {"score": 0.95} + experiment._evaluator.run_experiment_evaluator = AsyncMock( + return_value=mock_eval_result + ) + + # Run - should succeed + results, errors = await experiment._run_locally( + task=task_valid_output, + evaluators=evaluators, + dataset_slug="test-dataset", + dataset_version="v1", + ) + + # Should have successful result, no errors + assert len(results) == 1 + assert len(errors) == 0 + assert results[0].task_result == {"text": "hello world", "score": 0.9} + + @pytest.mark.anyio(backends=["asyncio"]) + async def test_run_locally_validation_with_multiple_evaluators(self): + """Test validation with multiple evaluators having different required fields""" + mock_http_client = 
Mock(spec=HTTPClient) + mock_http_client.base_url = "https://api.example.com" + mock_async_http_client = Mock() + + # Mock the _init_experiment method + mock_experiment_response = Mock() + mock_experiment_response.run.id = "run-123" + mock_experiment_response.experiment.id = "exp-456" + + experiment = Experiment(mock_http_client, mock_async_http_client, "test-exp") + experiment._init_experiment = Mock(return_value=mock_experiment_response) + + # Create a task that returns partial output + async def task_partial_output(row): + return {"text": "hello"} # Missing 'prompt' and 'response' + + # Define multiple evaluators with different required fields + evaluators = [ + EvaluatorDetails(slug="pii-detector", required_input_fields=["text"]), + EvaluatorDetails( + slug="relevance-checker", + required_input_fields=["prompt", "response"], + ), + ] + + # Mock dataset response + experiment._datasets.get_version_jsonl = Mock( + return_value='{"columns":{}}\n{"input": "test"}' + ) + + # Run with stop_on_error=True - should stop after error + results, errors = await experiment._run_locally( + task=task_partial_output, + evaluators=evaluators, + dataset_slug="test-dataset", + dataset_version="v1", + stop_on_error=True, + ) + + # Should have error with validation message + assert len(errors) >= 1 + error_message = errors[0] + assert "relevance-checker requires:" in error_message + assert "'prompt'" in error_message + assert "'response'" in error_message + + @pytest.mark.anyio(backends=["asyncio"]) + async def test_run_locally_no_validation_for_string_evaluators(self): + """Test that validation is not performed for string evaluators (only EvaluatorDetails)""" + mock_http_client = Mock(spec=HTTPClient) + mock_http_client.base_url = "https://api.example.com" + mock_async_http_client = Mock() + + # Mock the _init_experiment method + mock_experiment_response = Mock() + mock_experiment_response.run.id = "run-123" + mock_experiment_response.experiment.id = "exp-456" + + experiment = Experiment(mock_http_client, mock_async_http_client, "test-exp") + experiment._init_experiment = Mock(return_value=mock_experiment_response) + + # Mock _create_task + mock_task_response = Mock() + mock_task_response.id = "task-789" + experiment._create_task = Mock(return_value=mock_task_response) + + # Create a task that returns any output + async def task_any_output(row): + return {"random_field": "value"} + + # Use string evaluators (no validation) + evaluators = ["pii-detector", "relevance-checker"] + + # Mock dataset response + experiment._datasets.get_version_jsonl = Mock( + return_value='{"columns":{}}\n{"input": "test"}' + ) + + # Mock evaluator execution + mock_eval_result = Mock() + mock_eval_result.result = {"score": 0.95} + experiment._evaluator.run_experiment_evaluator = AsyncMock( + return_value=mock_eval_result + ) + + # Run - should succeed without validation + results, errors = await experiment._run_locally( + task=task_any_output, + evaluators=evaluators, + dataset_slug="test-dataset", + dataset_version="v1", + ) + + # Should succeed because string evaluators don't trigger validation + assert len(errors) == 0 + assert len(results) == 1 diff --git a/packages/traceloop-sdk/traceloop/sdk/client/http.py b/packages/traceloop-sdk/traceloop/sdk/client/http.py index 1e1af895f1..73fbfb2cd5 100644 --- a/packages/traceloop-sdk/traceloop/sdk/client/http.py +++ b/packages/traceloop-sdk/traceloop/sdk/client/http.py @@ -33,7 +33,8 @@ def post(self, path: str, data: Dict[str, Any]) -> Any: response.raise_for_status() return 
response.json() except requests.exceptions.RequestException as e: - print(Fore.RED + f"Error making request to {path}: {str(e)}" + Fore.RESET) + error_msg = self._format_error_message(path, e) + print(Fore.RED + error_msg + Fore.RESET) return None def get(self, path: str, params: Optional[Dict[str, Any]] = None) -> Any: @@ -54,7 +55,8 @@ def get(self, path: str, params: Optional[Dict[str, Any]] = None) -> Any: else: return response.json() except requests.exceptions.RequestException as e: - print(Fore.RED + f"Error making request to {path}: {str(e)}" + Fore.RESET) + error_msg = self._format_error_message(path, e) + print(Fore.RED + error_msg + Fore.RESET) return None def delete(self, path: str) -> bool: @@ -68,7 +70,8 @@ def delete(self, path: str) -> bool: response.raise_for_status() return response.status_code == 204 or response.status_code == 200 except requests.exceptions.RequestException as e: - print(Fore.RED + f"Error making request to {path}: {str(e)}" + Fore.RESET) + error_msg = self._format_error_message(path, e) + print(Fore.RED + error_msg + Fore.RESET) return False def put(self, path: str, data: Dict[str, Any]) -> Any: @@ -87,5 +90,41 @@ def put(self, path: str, data: Dict[str, Any]) -> Any: else: return {} except requests.exceptions.RequestException as e: - print(Fore.RED + f"Error making request to {path}: {str(e)}" + Fore.RESET) + error_msg = self._format_error_message(path, e) + print(Fore.RED + error_msg + Fore.RESET) return None + + def _format_error_message(self, path: str, exception: requests.exceptions.RequestException) -> str: + """ + Format a detailed error message including server response if available + """ + error_parts = [f"Error making request to {path}: {str(exception)}"] + + # Try to extract error details from response + if hasattr(exception, 'response') and exception.response is not None: + response = exception.response + + # Try to parse JSON error from response + try: + error_data = response.json() + if isinstance(error_data, dict): + # Check common error fields + if 'error' in error_data: + error_parts.append(f"Server error: {error_data['error']}") + elif 'message' in error_data: + error_parts.append(f"Server message: {error_data['message']}") + elif 'msg' in error_data: + error_parts.append(f"Server message: {error_data['msg']}") + else: + # Include the entire JSON if no standard field found + error_parts.append(f"Server response: {error_data}") + except (ValueError, AttributeError): + # Not JSON, try to get text + try: + error_text = response.text + if error_text and len(error_text) < 500: # Only include short error messages + error_parts.append(f"Server response: {error_text}") + except Exception: + pass + + return "\n".join(error_parts) diff --git a/packages/traceloop-sdk/traceloop/sdk/evaluator/__init__.py b/packages/traceloop-sdk/traceloop/sdk/evaluator/__init__.py index 737bb66012..b1233374b8 100644 --- a/packages/traceloop-sdk/traceloop/sdk/evaluator/__init__.py +++ b/packages/traceloop-sdk/traceloop/sdk/evaluator/__init__.py @@ -1,5 +1,9 @@ from .evaluator import Evaluator +from .config import EvaluatorDetails +from .evaluators_made_by_traceloop import EvaluatorMadeByTraceloop __all__ = [ - "Evaluator" + "Evaluator", + "EvaluatorDetails", + "EvaluatorMadeByTraceloop", ] diff --git a/packages/traceloop-sdk/traceloop/sdk/evaluator/config.py b/packages/traceloop-sdk/traceloop/sdk/evaluator/config.py new file mode 100644 index 0000000000..0523995c44 --- /dev/null +++ b/packages/traceloop-sdk/traceloop/sdk/evaluator/config.py @@ -0,0 +1,23 @@ +from 
typing import Dict, Any, Optional, List +from pydantic import BaseModel + + +class EvaluatorDetails(BaseModel): + """ + Details for configuring an evaluator. + + Args: + slug: The evaluator slug/identifier + version: Optional version of the evaluator + config: Optional configuration dictionary for the evaluator + required_input_fields: Optional list of required fields to the evaluator + input. These fields must be present in the task output. + + Example: + >>> EvaluatorDetails(slug="pii-detector", config={"probability_threshold": 0.8}, required_input_fields=["text"]) + >>> EvaluatorDetails(slug="my-custom-evaluator", version="v2") + """ + slug: str + version: Optional[str] = None + config: Optional[Dict[str, Any]] = None + required_input_fields: Optional[List[str]] = None diff --git a/packages/traceloop-sdk/traceloop/sdk/evaluator/evaluator.py b/packages/traceloop-sdk/traceloop/sdk/evaluator/evaluator.py index 64dd7933d0..799f519c72 100644 --- a/packages/traceloop-sdk/traceloop/sdk/evaluator/evaluator.py +++ b/packages/traceloop-sdk/traceloop/sdk/evaluator/evaluator.py @@ -1,5 +1,5 @@ import httpx -from typing import Dict, Optional +from typing import Dict, Optional, Any, List from .model import ( InputExtractor, @@ -9,6 +9,7 @@ ExecutionResponse, ) from .stream_client import SSEClient +from .config import EvaluatorDetails class Evaluator: @@ -28,6 +29,7 @@ def _build_evaluator_request( experiment_run_id: str, input: Dict[str, str], evaluator_version: Optional[str] = None, + evaluator_config: Optional[Dict[str, Any]] = None, ) -> ExecuteEvaluatorRequest: """Build evaluator request with common parameters""" schema_mapping = InputSchemaMapping( @@ -36,6 +38,7 @@ def _build_evaluator_request( return ExecuteEvaluatorRequest( input_schema_mapping=schema_mapping, evaluator_version=evaluator_version, + evaluator_config=evaluator_config, task_id=task_id, experiment_id=experiment_id, experiment_run_id=experiment_run_id, @@ -55,9 +58,10 @@ async def _execute_evaluator_request( full_url, json=body, timeout=httpx.Timeout(timeout_in_sec) ) if response.status_code != 200: + error_detail = _extract_error_from_response(response) raise Exception( - f"Failed to execute evaluator {evaluator_slug}: " - f"{response.status_code} – {response.text}" + f"Failed to execute evaluator '{evaluator_slug}': " + f"{response.status_code} - {error_detail}" ) result_data = response.json() return ExecuteEvaluatorResponse(**result_data) @@ -71,23 +75,26 @@ async def run_experiment_evaluator( input: Dict[str, str], timeout_in_sec: int = 120, evaluator_version: Optional[str] = None, + evaluator_config: Optional[Dict[str, Any]] = None, ) -> ExecutionResponse: """ Execute evaluator with input schema mapping and wait for result Args: evaluator_slug: Slug of the evaluator to execute + task_id: Task ID for the evaluation + experiment_id: Experiment ID + experiment_run_id: Experiment run ID input: Dict mapping evaluator input field names to their values. 
{field_name: value, ...} - client: Shared HTTP client for connection reuse (optional) - context_data: Context data to be passed to the evaluator (optional) - evaluator_version: Version of the evaluator to execute (optional) timeout_in_sec: Timeout in seconds for execution + evaluator_version: Version of the evaluator to execute (optional) + evaluator_config: Configuration for the evaluator (optional) Returns: ExecutionResponse: The evaluation result from SSE stream """ request = self._build_evaluator_request( - task_id, experiment_id, experiment_run_id, input, evaluator_version + task_id, experiment_id, experiment_run_id, input, evaluator_version, evaluator_config ) execute_response = await self._execute_evaluator_request( @@ -109,6 +116,7 @@ async def trigger_experiment_evaluator( experiment_run_id: str, input: Dict[str, str], evaluator_version: Optional[str] = None, + evaluator_config: Optional[Dict[str, Any]] = None, ) -> str: """ Trigger evaluator execution without waiting for result (fire-and-forget) @@ -120,13 +128,13 @@ async def trigger_experiment_evaluator( experiment_run_id: Experiment run ID input: Dict mapping evaluator input field names to their values evaluator_version: Version of the evaluator to execute (optional) - client: Shared HTTP client for connection reuse (optional) + evaluator_config: Configuration for the evaluator (optional) Returns: str: The execution_id that can be used to check results later """ request = self._build_evaluator_request( - task_id, experiment_id, experiment_run_id, input, evaluator_version + task_id, experiment_id, experiment_run_id, input, evaluator_version, evaluator_config ) execute_response = await self._execute_evaluator_request( @@ -135,3 +143,81 @@ async def trigger_experiment_evaluator( # Return execution_id without waiting for SSE result return execute_response.execution_id + + +def validate_task_output( + task_output: Dict[str, Any], + evaluators: List[EvaluatorDetails], +) -> None: + """ + Validate that task output contains all required fields for the given evaluators. + + Args: + task_output: The dictionary returned by the task function + evaluators: List of EvaluatorDetails to validate against + + Raises: + ValueError: If task output is missing required fields for any evaluator + """ + if not evaluators: + return + + # Collect all validation errors + missing_fields_by_evaluator: Dict[str, set[str]] = {} + + for evaluator in evaluators: + if not evaluator.required_input_fields: + continue + + missing_fields = [ + field for field in evaluator.required_input_fields + if field not in task_output + ] + + if missing_fields: + # Add to existing set or create new set + if evaluator.slug not in missing_fields_by_evaluator: + missing_fields_by_evaluator[evaluator.slug] = set() + missing_fields_by_evaluator[evaluator.slug].update(missing_fields) + + # If there are any missing fields, raise a detailed error + if missing_fields_by_evaluator: + error_lines = ["Task output missing required fields for evaluators:"] + + for slug, fields in missing_fields_by_evaluator.items(): + error_lines.append(f" - {slug} requires: {sorted(fields)}") + + error_lines.append(f"\nTask output contains: {list(task_output.keys())}") + + error_lines.append("\nHint: Update your task function to return a dictionary with the required fields.") + + raise ValueError("\n".join(error_lines)) + + +def _extract_error_from_response(response: httpx.Response) -> str: + """ + Extract error message from HTTP response. 
+ + Tries to parse JSON and extract common error fields (error, message, msg). + Falls back to response.text if JSON parsing fails. + + Args: + response: The HTTP response object + + Returns: + Extracted error message string + """ + error_detail = response.text + try: + error_json = response.json() + if isinstance(error_json, dict): + if 'error' in error_json: + error_detail = error_json['error'] + elif 'message' in error_json: + error_detail = error_json['message'] + elif 'msg' in error_json: + error_detail = error_json['msg'] + except Exception: + pass # Use response.text as fallback + + return error_detail diff --git a/packages/traceloop-sdk/traceloop/sdk/evaluator/evaluators_made_by_traceloop.py b/packages/traceloop-sdk/traceloop/sdk/evaluator/evaluators_made_by_traceloop.py new file mode 100644 index 0000000000..5ad610eec0 --- /dev/null +++ b/packages/traceloop-sdk/traceloop/sdk/evaluator/evaluators_made_by_traceloop.py @@ -0,0 +1,455 @@ +from typing import Optional, Dict, Any +from .config import EvaluatorDetails + + +class EvaluatorMadeByTraceloop: + """ + Factory class for creating made by traceloop evaluators with proper configuration. + + This class provides easy-to-use factory methods for all made by traceloop evaluators, + with type hints and documentation for their configuration options. + + Example: + >>> from traceloop.sdk.evaluator import EvaluatorMadeByTraceloop + >>> + >>> evaluators = [ + ... EvaluatorMadeByTraceloop.pii_detector(probability_threshold=0.8), + ... EvaluatorMadeByTraceloop.toxicity_detector(threshold=0.7), + ... ] + """ + + @staticmethod + def pii_detector( + probability_threshold: float = 0.5, + ) -> EvaluatorDetails: + """ + PII (Personally Identifiable Information) detector evaluator. + + Required task output fields: + - text: The text to check for PII + + Args: + probability_threshold: Minimum probability threshold for detecting PII (0.0-1.0) + + Returns: + EvaluatorDetails configured for PII detection + """ + config: Dict[str, Any] = {"probability_threshold": probability_threshold} + return EvaluatorDetails(slug="pii-detector", version=None, config=config, required_input_fields=["text"]) + + @staticmethod + def toxicity_detector( + threshold: float = 0.5, + ) -> EvaluatorDetails: + """ + Toxicity detector evaluator. + + Required task output fields: + - text: The text to check for toxicity + + Args: + threshold: Minimum toxicity threshold for flagging content (0.0-1.0) + + Returns: + EvaluatorDetails configured for toxicity detection + """ + config: Dict[str, Any] = {"threshold": threshold} + + return EvaluatorDetails(slug="toxicity-detector", version=None, config=config, required_input_fields=["text"]) + + @staticmethod + def prompt_injection( + threshold: float = 0.5, + ) -> EvaluatorDetails: + """ + Prompt injection detector evaluator. + + Required task output fields: + - prompt: The prompt to check for prompt injection attempts + + Args: + threshold: Minimum threshold for detecting prompt injection attempts (0.0-1.0) + + Returns: + EvaluatorDetails configured for prompt injection detection + """ + config: Dict[str, Any] = {"threshold": threshold} + return EvaluatorDetails(slug="prompt-injection", version=None, config=config, required_input_fields=["prompt"]) + + @staticmethod + def regex_validator( + regex: str, + should_match: bool = True, + case_sensitive: bool = True, + dot_include_nl: bool = False, + multi_line: bool = False, + ) -> EvaluatorDetails: + """ + Regular expression validator evaluator. 
+ + Required task output fields: + - text: The text to validate against the regex pattern + + Args: + regex: The regular expression pattern to match against + should_match: If True, pass when pattern matches; if False, pass when pattern doesn't match + case_sensitive: Whether the regex matching should be case-sensitive + dot_include_nl: Whether the dot (.) should match newline characters + multi_line: Whether to enable multi-line mode (^ and $ match line boundaries) + + Returns: + EvaluatorDetails configured for regex validation + """ + config: Dict[str, Any] = { + "regex": regex, + "should_match": should_match, + "case_sensitive": case_sensitive, + "dot_include_nl": dot_include_nl, + "multi_line": multi_line, + } + + return EvaluatorDetails(slug="regex-validator", version=None, config=config, required_input_fields=["text"]) + + @staticmethod + def json_validator( + enable_schema_validation: bool = False, + schema_string: Optional[str] = None, + ) -> EvaluatorDetails: + """ + JSON validator evaluator. + + Required task output fields: + - text: The JSON text to validate + + Args: + enable_schema_validation: Whether to validate against a JSON schema + schema_string: JSON schema string to validate against (required if enable_schema_validation is True) + + Returns: + EvaluatorDetails configured for JSON validation + """ + config: Dict[str, Any] = { + "enable_schema_validation": enable_schema_validation, + } + if schema_string: + config["schema_string"] = schema_string + + return EvaluatorDetails(slug="json-validator", version=None, config=config, required_input_fields=["text"]) + + @staticmethod + def placeholder_regex( + regex: str, + placeholder_name: str, + should_match: bool = True, + case_sensitive: bool = True, + dot_include_nl: bool = False, + multi_line: bool = False, + ) -> EvaluatorDetails: + """ + Placeholder regex evaluator - validates that placeholders match a regex pattern. + + Required task output fields: + - text: The text to validate against the regex pattern + - placeholder_value: The value of the placeholder to validate + + Args: + regex: The regular expression pattern to match against + placeholder_name: Name of the placeholder to validate + should_match: If True, pass when pattern matches; if False, pass when pattern doesn't match + case_sensitive: Whether the regex matching should be case-sensitive + dot_include_nl: Whether the dot (.) should match newline characters + multi_line: Whether to enable multi-line mode (^ and $ match line boundaries) + + Returns: + EvaluatorDetails configured for placeholder regex validation + """ + config: Dict[str, Any] = { + "regex": regex, + "placeholder_name": placeholder_name, + "should_match": should_match, + "case_sensitive": case_sensitive, + "dot_include_nl": dot_include_nl, + "multi_line": multi_line, + } + + return EvaluatorDetails( + slug="placeholder-regex", + version=None, + config=config, + required_input_fields=["text", "placeholder_value"], + ) + + @staticmethod + def char_count( + ) -> EvaluatorDetails: + """ + Character count evaluator - counts the number of characters in text. + + Required task output fields: + - text: The text to count characters in + + Returns: + EvaluatorDetails configured for character counting + """ + config: Dict[str, Any] = {} + + return EvaluatorDetails(slug="char-count", version=None, config=config, required_input_fields=["text"]) + + @staticmethod + def char_count_ratio( + ) -> EvaluatorDetails: + """ + Character count ratio evaluator - measures the ratio of characters between two texts. 
+ + Required task output fields: + - numerator_text: The numerator text for ratio calculation + - denominator_text: The denominator text for ratio calculation + + Returns: + EvaluatorDetails configured for character count ratio calculation + """ + config: Dict[str, Any] = {} + + return EvaluatorDetails( + slug="char-count-ratio", + version=None, + config=config, + required_input_fields=["numerator_text", "denominator_text"], + ) + + @staticmethod + def word_count() -> EvaluatorDetails: + """ + Word count evaluator - counts the number of words in text. + + Required task output fields: + - text: The text to count words in + + Returns: + EvaluatorDetails configured for word counting + """ + config: Dict[str, Any] = {} + + return EvaluatorDetails(slug="word-count", version=None, config=config, required_input_fields=["text"]) + + @staticmethod + def word_count_ratio( + ) -> EvaluatorDetails: + """ + Word count ratio evaluator - measures the ratio of words between two texts. + + Required task output fields: + - numerator_text: The numerator text for ratio calculation + - denominator_text: The denominator text for ratio calculation + + Returns: + EvaluatorDetails configured for word count ratio calculation + """ + config: Dict[str, Any] = {} + + return EvaluatorDetails( + slug="word-count-ratio", + version=None, + config=config, + required_input_fields=["numerator_text", "denominator_text"], + ) + + @staticmethod + def answer_relevancy( + ) -> EvaluatorDetails: + """ + Answer relevancy evaluator - verifies responses address the query. + + Required task output fields: + - question: The input question + - answer: The answer to evaluate + + Returns: + EvaluatorDetails configured for answer relevancy evaluation + """ + config: Dict[str, Any] = {} + + return EvaluatorDetails( + slug="answer-relevancy", + version=None, + config=config, + required_input_fields=["question", "answer"], + ) + + @staticmethod + def faithfulness( + ) -> EvaluatorDetails: + """ + Faithfulness evaluator - detects hallucinations and verifies facts. + + Required task output fields: + - question: The input question + - completion: The completion to evaluate for faithfulness + - context: The context to verify against + + Returns: + EvaluatorDetails configured for faithfulness evaluation + """ + config: Dict[str, Any] = {} + + return EvaluatorDetails( + slug="faithfulness", + version=None, + config=config, + required_input_fields=["question", "completion", "context"], + ) + + @staticmethod + def profanity_detector() -> EvaluatorDetails: + """ + Profanity detector evaluator - flags inappropriate language. + + Required task output fields: + - text: The text to check for profanity + + Returns: + EvaluatorDetails configured for profanity detection + """ + config: Dict[str, Any] = {} + + return EvaluatorDetails(slug="profanity-detector", version=None, config=config, required_input_fields=["text"]) + + @staticmethod + def sexism_detector( + threshold: float = 0.5, + ) -> EvaluatorDetails: + """ + Sexism detector evaluator - detects sexist language and bias.
+ + Required task output fields: + - text: The text to check for sexism + + Args: + threshold: Minimum threshold for detecting sexism (0.0-1.0) + + Returns: + EvaluatorDetails configured for sexism detection + """ + config: Dict[str, Any] = {"threshold": threshold} + + return EvaluatorDetails(slug="sexism-detector", version=None, config=config, required_input_fields=["text"]) + + @staticmethod + def secrets_detector( + ) -> EvaluatorDetails: + """ + Secrets detector evaluator - monitors for credential and key leaks. + + Required task output fields: + - text: The text to check for secrets + + Returns: + EvaluatorDetails configured for secrets detection + """ + config: Dict[str, Any] = {} + return EvaluatorDetails(slug="secrets-detector", version=None, config=config, required_input_fields=["text"]) + + @staticmethod + def sql_validator( + ) -> EvaluatorDetails: + """ + SQL validator evaluator - validates SQL queries. + + Required task output fields: + - text: The SQL query to validate + + Returns: + EvaluatorDetails configured for SQL validation + """ + config: Dict[str, Any] = {} + + return EvaluatorDetails(slug="sql-validator", version=None, config=config, required_input_fields=["text"]) + + @staticmethod + def semantic_similarity( + ) -> EvaluatorDetails: + """ + Semantic similarity evaluator - measures semantic similarity between texts. + + Required task output fields: + - completion: The completion text to compare + - reference: The reference text to compare against + + Returns: + EvaluatorDetails configured for semantic similarity evaluation + """ + config: Dict[str, Any] = {} + + return EvaluatorDetails( + slug="semantic-similarity", + version=None, + config=config, + required_input_fields=["completion", "reference"], + ) + + @staticmethod + def agent_goal_accuracy( + ) -> EvaluatorDetails: + """ + Agent goal accuracy evaluator - validates agent goal achievement. + + Required task output fields: + - question: The input question or goal + - completion: The agent's completion + - reference: The reference answer or goal + + Returns: + EvaluatorDetails configured for agent goal accuracy evaluation + """ + config: Dict[str, Any] = {} + + return EvaluatorDetails( + slug="agent-goal-accuracy", + version=None, + config=config, + required_input_fields=["question", "completion", "reference"], + ) + + @staticmethod + def topic_adherence( + ) -> EvaluatorDetails: + """ + Topic adherence evaluator - validates topic adherence. + + Required task output fields: + - question: The input question or goal + - completion: The completion text to evaluate + - reference_topics: The expected topic or topics + + Returns: + EvaluatorDetails configured for topic adherence evaluation + """ + config: Dict[str, Any] = {} + + return EvaluatorDetails( + slug="topic-adherence", + version=None, + config=config, + required_input_fields=["question", "completion", "reference_topics"], + ) + + @staticmethod + def perplexity( + ) -> EvaluatorDetails: + """ + Perplexity evaluator - measures text perplexity from prompt. 
+ + Required task output fields: + - prompt: The prompt to measure perplexity for + + Returns: + EvaluatorDetails configured for perplexity measurement + """ + config: Dict[str, Any] = {} + + return EvaluatorDetails( + slug="perplexity", + version=None, + config=config, + required_input_fields=["prompt"], + ) diff --git a/packages/traceloop-sdk/traceloop/sdk/evaluator/model.py b/packages/traceloop-sdk/traceloop/sdk/evaluator/model.py index 7723cc3df9..1fc4636221 100644 --- a/packages/traceloop-sdk/traceloop/sdk/evaluator/model.py +++ b/packages/traceloop-sdk/traceloop/sdk/evaluator/model.py @@ -16,6 +16,7 @@ class InputSchemaMapping(RootModel[Dict[str, InputExtractor]]): class ExecuteEvaluatorRequest(BaseModel): input_schema_mapping: InputSchemaMapping evaluator_version: Optional[str] = None + evaluator_config: Optional[Dict[str, Any]] = None task_id: str experiment_id: str experiment_run_id: str diff --git a/packages/traceloop-sdk/traceloop/sdk/experiment/experiment.py b/packages/traceloop-sdk/traceloop/sdk/experiment/experiment.py index 453c6f7e8d..23b88c758b 100644 --- a/packages/traceloop-sdk/traceloop/sdk/experiment/experiment.py +++ b/packages/traceloop-sdk/traceloop/sdk/experiment/experiment.py @@ -5,19 +5,20 @@ from typing import Any, List, Callable, Optional, Tuple, Dict, Awaitable, Union from traceloop.sdk.client.http import HTTPClient from traceloop.sdk.datasets.datasets import Datasets -from traceloop.sdk.evaluator.evaluator import Evaluator +from traceloop.sdk.evaluator.evaluator import Evaluator, validate_task_output from traceloop.sdk.experiment.model import ( InitExperimentRequest, ExperimentInitResponse, CreateTaskRequest, CreateTaskResponse, - EvaluatorDetails, + EvaluatorSpec, TaskResponse, RunInGithubRequest, RunInGithubResponse, TaskResult, GithubContext, ) +from traceloop.sdk.evaluator.config import EvaluatorDetails import httpx @@ -37,9 +38,9 @@ def __init__(self, http_client: HTTPClient, async_http_client: httpx.AsyncClient async def run( self, task: Callable[[Optional[Dict[str, Any]]], Awaitable[Dict[str, Any]]], + evaluators: List[EvaluatorSpec], dataset_slug: Optional[str] = None, dataset_version: Optional[str] = None, - evaluators: Optional[List[EvaluatorDetails]] = None, experiment_slug: Optional[str] = None, experiment_metadata: Optional[Dict[str, Any]] = None, related_ref: Optional[Dict[str, str]] = None, @@ -51,9 +52,9 @@ async def run( Args: task: Async function to run on each dataset row + evaluators: List of evaluator slugs or EvaluatorDetails objects to run dataset_slug: Slug of the dataset to use dataset_version: Version of the dataset to use - evaluators: List of evaluator slugs to run experiment_slug: Slug for this experiment run experiment_metadata: Metadata for this experiment (an experiment holds all the experiment runs) related_ref: Related reference for this experiment run @@ -76,9 +77,9 @@ async def run( else: return await self._run_locally( task=task, + evaluators=evaluators, dataset_slug=dataset_slug, dataset_version=dataset_version, - evaluators=evaluators, experiment_slug=experiment_slug, experiment_metadata=experiment_metadata, related_ref=related_ref, @@ -90,9 +91,9 @@ async def run( async def _run_locally( self, task: Callable[[Optional[Dict[str, Any]]], Awaitable[Dict[str, Any]]], + evaluators: List[EvaluatorSpec], dataset_slug: Optional[str] = None, dataset_version: Optional[str] = None, - evaluators: Optional[List[EvaluatorDetails]] = None, experiment_slug: Optional[str] = None, experiment_metadata: Optional[Dict[str, Any]] = None, 
related_ref: Optional[Dict[str, str]] = None, @@ -126,20 +127,23 @@ async def _run_locally( if value is not None } - evaluator_details = ( - [ - (evaluator, None) if isinstance(evaluator, str) else evaluator - for evaluator in evaluators - ] - if evaluators - else None - ) + # Convert evaluators to tuples of (slug, version, config) + evaluator_details: Optional[List[Tuple[str, Optional[str], Optional[Dict[str, Any]]]]] = None + if evaluators: + evaluator_details = [] + for evaluator in evaluators: + if isinstance(evaluator, str): + # Simple string slug + evaluator_details.append((evaluator, None, None)) + elif isinstance(evaluator, EvaluatorDetails): + # EvaluatorDetails object with config + evaluator_details.append((evaluator.slug, evaluator.version, evaluator.config)) experiment = self._init_experiment( experiment_slug, dataset_slug=dataset_slug, dataset_version=dataset_version, - evaluator_slugs=[slug for slug, _ in evaluator_details] + evaluator_slugs=[slug for slug, _, _ in evaluator_details] if evaluator_details else None, experiment_metadata=experiment_metadata, @@ -156,9 +160,16 @@ async def _run_locally( results: List[TaskResponse] = [] errors: List[str] = [] + evaluators_to_validate = [evaluator for evaluator in evaluators if isinstance(evaluator, EvaluatorDetails)] + async def run_single_row(row: Optional[Dict[str, Any]]) -> TaskResponse: try: task_result = await task(row) + + # Validate task output with EvaluatorDetails with required_input_fields from evaluators list + if evaluators_to_validate: + validate_task_output(task_result, evaluators_to_validate) + task_id = self._create_task( experiment_slug=experiment_slug, experiment_run_id=run_id, @@ -168,13 +179,14 @@ async def run_single_row(row: Optional[Dict[str, Any]]) -> TaskResponse: eval_results: Dict[str, Union[Dict[str, Any], str]] = {} if evaluator_details: - for evaluator_slug, evaluator_version in evaluator_details: + for evaluator_slug, evaluator_version, evaluator_config in evaluator_details: try: if wait_for_results: eval_result = ( await self._evaluator.run_experiment_evaluator( evaluator_slug=evaluator_slug, evaluator_version=evaluator_version, + evaluator_config=evaluator_config, task_id=task_id, experiment_id=experiment.experiment.id, experiment_run_id=run_id, @@ -187,6 +199,7 @@ async def run_single_row(row: Optional[Dict[str, Any]]) -> TaskResponse: await self._evaluator.trigger_experiment_evaluator( evaluator_slug=evaluator_slug, evaluator_version=evaluator_version, + evaluator_config=evaluator_config, task_id=task_id, experiment_id=experiment.experiment.id, experiment_run_id=run_id, @@ -197,7 +210,10 @@ async def run_single_row(row: Optional[Dict[str, Any]]) -> TaskResponse: eval_results[evaluator_slug] = msg except Exception as e: - eval_results[evaluator_slug] = f"Error: {str(e)}" + error_msg = f"Error: {str(e)}" + eval_results[evaluator_slug] = error_msg + # Log the error so user can see it + print(f"\033[91m❌ Evaluator '{evaluator_slug}' failed: {str(e)}\033[0m") return TaskResponse( task_result=task_result, @@ -205,6 +221,8 @@ async def run_single_row(row: Optional[Dict[str, Any]]) -> TaskResponse: ) except Exception as e: error_msg = f"Error processing row: {str(e)}" + # Print error to console so user can see it + print(f"\033[91m❌ Task execution failed: {str(e)}\033[0m") if stop_on_error: raise e return TaskResponse(error=error_msg) @@ -240,9 +258,9 @@ async def run_with_semaphore(row: Optional[Dict[str, Any]]) -> TaskResponse: async def _run_in_github( self, task: Callable[[Optional[Dict[str, Any]]], 
Awaitable[Dict[str, Any]]], + evaluators: List[EvaluatorSpec], dataset_slug: Optional[str] = None, dataset_version: Optional[str] = None, - evaluators: Optional[List[EvaluatorDetails]] = None, experiment_slug: Optional[str] = None, experiment_metadata: Optional[Dict[str, Any]] = None, related_ref: Optional[Dict[str, str]] = None, @@ -290,7 +308,7 @@ async def _run_in_github( jsonl_data = self._datasets.get_version_jsonl(dataset_slug, dataset_version) rows = self._parse_jsonl_to_rows(jsonl_data) - task_results = await self._execute_tasks(rows, task) + task_results = await self._execute_tasks(rows, task, evaluators) # Construct GitHub context repository = os.getenv("GITHUB_REPOSITORY") @@ -340,10 +358,12 @@ async def _run_in_github( # Extract evaluator slugs evaluator_slugs = None if evaluators: - evaluator_slugs = [ - slug if isinstance(slug, str) else slug[0] - for slug in evaluators - ] + evaluator_slugs = [] + for evaluator in evaluators: + if isinstance(evaluator, str): + evaluator_slugs.append(evaluator) + elif isinstance(evaluator, EvaluatorDetails): + evaluator_slugs.append(evaluator.slug) # Prepare request payload request_body = RunInGithubRequest( @@ -436,26 +456,47 @@ async def _execute_tasks( self, rows: List[Dict[str, Any]], task: Callable[[Optional[Dict[str, Any]]], Awaitable[Dict[str, Any]]], + evaluators: Optional[List[EvaluatorSpec]] = None, ) -> List[TaskResult]: """Execute tasks locally with concurrency control Args: rows: List of dataset rows to process task: Function to run on each row + evaluators: List of evaluators to validate task output against Returns: List of TaskResult objects with inputs, outputs, and errors """ task_results: List[TaskResult] = [] + # Extract EvaluatorDetails from evaluators list + evaluators_to_validate = [] + if evaluators: + for evaluator in evaluators: + if isinstance(evaluator, EvaluatorDetails): + evaluators_to_validate.append(evaluator) + async def run_single_row(row: Optional[Dict[str, Any]]) -> TaskResult: try: task_output = await task(row) + + # Validate task output schema on first execution + if evaluators_to_validate: + try: + validate_task_output(task_output, evaluators_to_validate) + except ValueError as validation_error: + print(f"\033[91m❌ Task validation failed: {str(validation_error)}\033[0m") + raise ValueError(str(validation_error)) + return TaskResult( input=row, output=task_output, ) except Exception as e: + if isinstance(e, ValueError): + raise e + print(f"\033[91m❌ Task execution error: {str(e)}\033[0m") return TaskResult( input=row, error=str(e), diff --git a/packages/traceloop-sdk/traceloop/sdk/experiment/model.py b/packages/traceloop-sdk/traceloop/sdk/experiment/model.py index f55ed0b487..00a3ce2009 100644 --- a/packages/traceloop-sdk/traceloop/sdk/experiment/model.py +++ b/packages/traceloop-sdk/traceloop/sdk/experiment/model.py @@ -1,10 +1,9 @@ from datetime import datetime -from typing import List, Dict, Any, Optional, Tuple, Union +from typing import List, Dict, Any, Optional, Union from pydantic import BaseModel, Field +from traceloop.sdk.evaluator.config import EvaluatorDetails -EvaluatorVersion = str -EvaluatorSlug = str -EvaluatorDetails = Union[EvaluatorSlug, Tuple[EvaluatorSlug, EvaluatorVersion]] +EvaluatorSpec = Union[str, EvaluatorDetails] class TaskResponse(BaseModel):
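
To make the new plumbing easier to review, here is a minimal usage sketch (not part of the patch). It shows how the pieces above fit together: factory-built EvaluatorDetails carry their settings through evaluator_config, plain string slugs are still accepted as EvaluatorSpec, and validate_task_output checks the task's return value against each evaluator's required_input_fields. The dataset slug, experiment slug, the Traceloop.init() client wiring, and the (results, errors) return shape are illustrative assumptions, not taken verbatim from this diff.

import asyncio

from traceloop.sdk import Traceloop
from traceloop.sdk.evaluator import EvaluatorMadeByTraceloop

# Assumes Traceloop.init() returns a client object exposing experiment.run
# as wired in experiment.py above.
client = Traceloop.init()


async def qa_task(row):
    # Return every field the chosen evaluators list in required_input_fields;
    # validate_task_output raises a ValueError naming any missing ones.
    row = row or {}
    return {
        "question": row.get("question", ""),
        "answer": row.get("answer", ""),
        "text": row.get("answer", ""),
    }


async def main():
    evaluators = [
        "word-count",  # plain slug: no config, no client-side field validation
        EvaluatorMadeByTraceloop.toxicity_detector(threshold=0.7),  # config is forwarded as evaluator_config
        EvaluatorMadeByTraceloop.answer_relevancy(),
    ]
    # Assumed to return (results, errors); slugs below are placeholders.
    results, errors = await client.experiment.run(
        task=qa_task,
        evaluators=evaluators,
        dataset_slug="qa-dataset",
        dataset_version="v1",
        experiment_slug="evaluator-config-demo",
        wait_for_results=True,
    )
    print(results, errors)


if __name__ == "__main__":
    asyncio.run(main())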