diff --git a/.chloggen/genai-judgment-boundary.yaml b/.chloggen/genai-judgment-boundary.yaml new file mode 100644 index 0000000000..ff83ecb17e --- /dev/null +++ b/.chloggen/genai-judgment-boundary.yaml @@ -0,0 +1,4 @@ +change_type: enhancement +component: gen_ai +note: Add `gen_ai.evaluation.outcome` and `gen_ai.evaluation.multiple_outcomes` attributes for minimal evaluation metadata observability. +issues: [3336] diff --git a/.github/workflows/check-changes-ownership.yml b/.github/workflows/check-changes-ownership.yml index cf1c58d9c5..4d3afc2d6d 100644 --- a/.github/workflows/check-changes-ownership.yml +++ b/.github/workflows/check-changes-ownership.yml @@ -31,6 +31,7 @@ jobs: dir_names: "true" files: model/** separator: "," + base_sha: ${{ github.event.pull_request.base.sha }} validate-area-ownership: runs-on: ubuntu-latest diff --git a/docs/gen-ai/gen-ai-events.md b/docs/gen-ai/gen-ai-events.md index bb2c5c7cc1..ae41a79b25 100644 --- a/docs/gen-ai/gen-ai-events.md +++ b/docs/gen-ai/gen-ai-events.md @@ -10,6 +10,8 @@ linkTitle: Events - [Event: `gen_ai.client.inference.operation.details`](#event-gen_aiclientinferenceoperationdetails) - [Event: `gen_ai.evaluation.result`](#event-gen_aievaluationresult) + - [Evaluation Outcome Attributes](#evaluation-outcome-attributes) + - [Example](#example) @@ -254,6 +256,8 @@ This event captures the result of evaluating GenAI output for quality, accuracy, | [`gen_ai.evaluation.score.label`](/docs/registry/attributes/gen-ai.md) | ![Development](https://img.shields.io/badge/-development-blue) | `Conditionally Required` if applicable | string | Human readable label for evaluation. [2] | `relevant`; `not_relevant`; `correct`; `incorrect`; `pass`; `fail` | | [`gen_ai.evaluation.score.value`](/docs/registry/attributes/gen-ai.md) | ![Development](https://img.shields.io/badge/-development-blue) | `Conditionally Required` if applicable | double | The evaluation score returned by the evaluator. 
| `4.0` | | [`gen_ai.evaluation.explanation`](/docs/registry/attributes/gen-ai.md) | ![Development](https://img.shields.io/badge/-development-blue) | `Recommended` | string | A free-form explanation for the assigned score provided by the evaluator. | `The response is factually accurate but lacks sufficient detail to fully address the question.` | +| [`gen_ai.evaluation.multiple_outcomes`](/docs/registry/attributes/gen-ai.md) | ![Development](https://img.shields.io/badge/-development-blue) | `Recommended` | boolean | Indicates whether the evaluation process assessed multiple outcome categories or labels. | `true` | +| [`gen_ai.evaluation.outcome`](/docs/registry/attributes/gen-ai.md) | ![Development](https://img.shields.io/badge/-development-blue) | `Recommended` | string | The evaluation outcome label assigned by the evaluator. | `pass`; `fail`; `allow`; `block` | | [`gen_ai.response.id`](/docs/registry/attributes/gen-ai.md) | ![Development](https://img.shields.io/badge/-development-blue) | `Recommended` when available | string | The unique identifier for the completion. [3] | `chatcmpl-123` | **[1] `error.type`:** The `error.type` SHOULD match the error code returned by the Generative AI Evaluation provider or the client library, @@ -278,4 +282,23 @@ event with the corresponding operation when span id is not available. +### Evaluation Outcome Attributes + +The evaluation outcome attributes provide metadata about evaluations where implementations may have assessed multiple categories or outcome labels. 
+ +#### Example + +```json +{ + "name": "gen_ai.evaluation.result", + "attributes": { + "gen_ai.evaluation.name": "ContentSafety", + "gen_ai.evaluation.score.value": 0.85, + "gen_ai.evaluation.score.label": "pass", + "gen_ai.evaluation.multiple_outcomes": true, + "gen_ai.evaluation.outcome": "pass" + } +} +``` + [DocumentStatus]: https://opentelemetry.io/docs/specs/otel/document-status diff --git a/docs/registry/attributes/gen-ai.md b/docs/registry/attributes/gen-ai.md index b7b4f0926d..8efdcc5185 100644 --- a/docs/registry/attributes/gen-ai.md +++ b/docs/registry/attributes/gen-ai.md @@ -23,7 +23,9 @@ This document defines the attributes used to describe telemetry in the context o | `gen_ai.data_source.id` | ![Development](https://img.shields.io/badge/-development-blue) | string | The data source identifier. [1] | `H7STPQYOND` | | `gen_ai.embeddings.dimension.count` | ![Development](https://img.shields.io/badge/-development-blue) | int | The number of dimensions the resulting output embeddings should have. | `512`; `1024` | | `gen_ai.evaluation.explanation` | ![Development](https://img.shields.io/badge/-development-blue) | string | A free-form explanation for the assigned score provided by the evaluator. | `The response is factually accurate but lacks sufficient detail to fully address the question.` | +| `gen_ai.evaluation.multiple_outcomes` | ![Development](https://img.shields.io/badge/-development-blue) | boolean | Indicates whether the evaluation process assessed multiple outcome categories or labels. | `true` | | `gen_ai.evaluation.name` | ![Development](https://img.shields.io/badge/-development-blue) | string | The name of the evaluation metric used for the GenAI response. | `Relevance`; `IntentResolution` | +| `gen_ai.evaluation.outcome` | ![Development](https://img.shields.io/badge/-development-blue) | string | The evaluation outcome label assigned by the evaluator. 
| `pass`; `fail`; `allow`; `block` | | `gen_ai.evaluation.score.label` | ![Development](https://img.shields.io/badge/-development-blue) | string | Human readable label for evaluation. [2] | `relevant`; `not_relevant`; `correct`; `incorrect`; `pass`; `fail` | | `gen_ai.evaluation.score.value` | ![Development](https://img.shields.io/badge/-development-blue) | double | The evaluation score returned by the evaluator. | `4.0` | | `gen_ai.input.messages` | ![Development](https://img.shields.io/badge/-development-blue) | any | The chat history provided to the model as an input. [3] | [
  {<br>    "role": "user",<br>    "parts": [<br>      {<br>        "type": "text",<br>        "content": "Weather in Paris?"<br>      }<br>    ]<br>  },<br>  {<br>    "role": "assistant",<br>    "parts": [<br>      {<br>        "type": "tool_call",<br>        "id": "call_VSPygqKTWdrhaFErNvMV18Yl",<br>        "name": "get_weather",<br>        "arguments": {<br>          "location": "Paris"<br>        }<br>      }<br>    ]<br>  },<br>  {<br>    "role": "tool",<br>    "parts": [<br>      {<br>        "type": "tool_call_response",<br>        "id": " call_VSPygqKTWdrhaFErNvMV18Yl",<br>        "result": "rainy, 57°F"<br>      }<br>    ]<br>  }<br>
] | diff --git a/model/gen-ai/events.yaml b/model/gen-ai/events.yaml index ce86109a6d..61b2eed201 100644 --- a/model/gen-ai/events.yaml +++ b/model/gen-ai/events.yaml @@ -44,6 +44,10 @@ groups: The `error.type` SHOULD match the error code returned by the Generative AI Evaluation provider or the client library, the canonical name of exception that occurred, or another low-cardinality error identifier. Instrumentations SHOULD document the list of errors they report. + - ref: gen_ai.evaluation.multiple_outcomes + requirement_level: recommended + - ref: gen_ai.evaluation.outcome + requirement_level: recommended - id: event.gen_ai.client.operation.exception name: gen_ai.client.operation.exception diff --git a/model/gen-ai/registry.yaml b/model/gen-ai/registry.yaml index ddd2408b27..daae68ae88 100644 --- a/model/gen-ai/registry.yaml +++ b/model/gen-ai/registry.yaml @@ -659,6 +659,16 @@ groups: type: string brief: A free-form explanation for the assigned score provided by the evaluator. examples: ["The response is factually accurate but lacks sufficient detail to fully address the question."] + - id: gen_ai.evaluation.multiple_outcomes + stability: development + type: boolean + brief: Indicates whether the evaluation process assessed multiple outcome categories or labels. + examples: [true] + - id: gen_ai.evaluation.outcome + stability: development + type: string + brief: The evaluation outcome label assigned by the evaluator. + examples: ["pass", "fail", "allow", "block"] - id: gen_ai.prompt.name type: string brief: The name of the prompt that uniquely identifies it.