diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Cli.Evaluations/README.md b/tools/azsdk-cli/Azure.Sdk.Tools.Cli.Evaluations/README.md
index bef5a22b0bc..671b8d3d526 100644
--- a/tools/azsdk-cli/Azure.Sdk.Tools.Cli.Evaluations/README.md
+++ b/tools/azsdk-cli/Azure.Sdk.Tools.Cli.Evaluations/README.md
@@ -53,7 +53,7 @@ The framework automatically generates HTML reports in the `reports/` directory a
 
 The evaluation framework runs automatically in CI/CD pipelines when pull requests modify azsdk cli. Changes to `.github/copilot-instructions.md` or any instruction files in `eng/common/instructions/` would also be triggered but in progress. This ensures instruction or mcp changes don't negatively impact agent behavior before merging. The evaluations run alongside other PR validation tests and must pass for the PR to be merged.
 
-**Pipeline**: [release pipeline](https://dev.azure.com/azure-sdk/internal/_build?definitionId=7684) - Configuration in `eng/common/pipelines/copilot-instruction-evals.yml`
+**Pipeline**: [release pipeline](https://dev.azure.com/azure-sdk/internal/_build?definitionId=7684) - Configuration in `eng/common/pipelines/ai-evals-tests.yml`
 
 ## Walkthrough: Release Plan Creation Evaluation
 
diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Cli.Evaluations/Scenarios/AzureRestApiSpecs/Evaluate_GenerateSdk.cs b/tools/azsdk-cli/Azure.Sdk.Tools.Cli.Evaluations/Scenarios/AzureRestApiSpecs/Evaluate_GenerateSdk.cs
new file mode 100644
index 00000000000..655387342ad
--- /dev/null
+++ b/tools/azsdk-cli/Azure.Sdk.Tools.Cli.Evaluations/Scenarios/AzureRestApiSpecs/Evaluate_GenerateSdk.cs
@@ -0,0 +1,49 @@
+using Azure.Sdk.Tools.Cli.Evaluations.Evaluators;
+using Azure.Sdk.Tools.Cli.Evaluations.Helpers;
+using Azure.Sdk.Tools.Cli.Evaluations.Models;
+using Microsoft.Extensions.AI.Evaluation;
+using NUnit.Framework;
+
+namespace Azure.Sdk.Tools.Cli.Evaluations.Scenarios
+{
+    public partial class Scenario
+    {
+        /// <summary>
+        /// Runs the tool-input scenario for a prompt asking for .NET SDK generation that stops before testing,
+        /// with four expected azsdk tools, and passes the result to EvaluationHelper.ValidateToolInputsEvaluator.
+        /// </summary>
+        [Test]
+        [Category(RepositoryCategories.AzureRestApiSpecs)]
+        public async Task Evaluate_GenerateSdk()
+        {
+            const string prompt = "Do every step necessary to generate my SDK for Dotnet, up until testing. Proceed and don't ask me questions. Stop before running tests on the SDK. I'm in a public repo. My tspconfig is at: \"C:\\azure-rest-api-specs\\specification\\healthdataaiservices\\HealthDataAIServices.DeidServices\\tspconfig.yaml\", and the repo: \"C:\\azure-sdk-for-net\"";
+            string[] expectedTools =
+            [
+                "azsdk_verify_setup", "azsdk_run_typespec_validation", "azsdk_package_generate_code", "azsdk_package_build_code"
+            ];
+
+            // Build scenario data
+            var scenarioData = ChatMessageHelper.LoadScenarioFromPrompt(prompt, expectedTools);
+
+            // External construction of evaluation context
+            bool checkInputs = false;
+
+            var result = await EvaluationHelper.RunToolInputScenarioAsync(
+                scenarioName: this.ScenarioName,
+                scenarioData: scenarioData,
+                chatCompletion: s_chatCompletion!,
+                chatConfig: s_chatConfig!,
+                executionName: s_executionName,
+                reportingPath: ReportingPath,
+                toolNames: s_toolNames!,
+                evaluators: [new ExpectedToolInputEvaluator()],
+                enableResponseCaching: true,
+                additionalContexts: new EvaluationContext[]
+                {
+                    new ExpectedToolInputEvaluatorContext(scenarioData.ExpectedOutcome, s_toolNames!, checkInputs)
+                });
+
+            EvaluationHelper.ValidateToolInputsEvaluator(result);
+        }
+    }
+}
diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Cli.Evaluations/Scenarios/General/Evaluate_VerifySetup.cs b/tools/azsdk-cli/Azure.Sdk.Tools.Cli.Evaluations/Scenarios/General/Evaluate_VerifySetup.cs
new file mode 100644
index 00000000000..6ac70b57b30
--- /dev/null
+++ b/tools/azsdk-cli/Azure.Sdk.Tools.Cli.Evaluations/Scenarios/General/Evaluate_VerifySetup.cs
@@ -0,0 +1,48 @@
+using Azure.Sdk.Tools.Cli.Evaluations.Evaluators;
+using Azure.Sdk.Tools.Cli.Evaluations.Helpers;
+using Azure.Sdk.Tools.Cli.Evaluations.Models;
+using Microsoft.Extensions.AI.Evaluation;
+using NUnit.Framework;
+
+namespace Azure.Sdk.Tools.Cli.Evaluations.Scenarios
+{
+    public partial class Scenario
+    {
+        /// <summary>
+        /// Runs the tool-input scenario for a "verify my setup" prompt with azsdk_verify_setup as the only
+        /// expected tool, and passes the result to EvaluationHelper.ValidateToolInputsEvaluator.
+        /// </summary>
+        [Test]
+        public async Task Evaluate_VerifySetup()
+        {
+            const string prompt = "Verify my setup for Dotnet.";
+            string[] expectedTools =
+            [
+                "azsdk_verify_setup"
+            ];
+
+            // Build scenario data
+            var scenarioData = ChatMessageHelper.LoadScenarioFromPrompt(prompt, expectedTools);
+
+            // External construction of evaluation context
+            bool checkInputs = false;
+
+            var result = await EvaluationHelper.RunToolInputScenarioAsync(
+                scenarioName: this.ScenarioName,
+                scenarioData: scenarioData,
+                chatCompletion: s_chatCompletion!,
+                chatConfig: s_chatConfig!,
+                executionName: s_executionName,
+                reportingPath: ReportingPath,
+                toolNames: s_toolNames!,
+                evaluators: [new ExpectedToolInputEvaluator()],
+                enableResponseCaching: true,
+                additionalContexts: new EvaluationContext[]
+                {
+                    new ExpectedToolInputEvaluatorContext(scenarioData.ExpectedOutcome, s_toolNames!, checkInputs)
+                });
+
+            EvaluationHelper.ValidateToolInputsEvaluator(result);
+        }
+    }
+}
diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Cli.Evaluations/ToolMocks/PackageBuildCode.cs b/tools/azsdk-cli/Azure.Sdk.Tools.Cli.Evaluations/ToolMocks/PackageBuildCode.cs
new file mode 100644
index 00000000000..51d1e535f4d
--- /dev/null
+++ b/tools/azsdk-cli/Azure.Sdk.Tools.Cli.Evaluations/ToolMocks/PackageBuildCode.cs
@@ -0,0 +1,50 @@
+using Microsoft.Extensions.AI;
+
+namespace Azure.Sdk.Tools.Cli.Evaluations.ToolMocks
+{
+    /// <summary>
+    /// Tool mock for azsdk_package_build_code: supplies a fixed assistant tool call and a fixed tool result message.
+    /// </summary>
+    public class PackageBuildCode : IToolMock
+    {
+        public string ToolName => "azsdk_package_build_code";
+        public string CallId => "tooluse_l1vP7afmgopmnhjpjp";
+        private string ToolResult => """{"message":"Build completed successfully.","result":"succeeded","language":"DotNet","package_name":"Azure.Health.Deidentification","version":"1.1.0-beta.2","package_type":"Unknown","sdk_repo":"","next_steps":[],"operation_status":"Succeeded"}""";
+
+        /// <summary>
+        /// Wraps the canned JSON result in a tool-role message tied to the supplied call id.
+        /// </summary>
+        public ChatMessage GetMockResponse(string callid)
+        {
+            return new ChatMessage(
+                ChatRole.Tool,
+                [
+                    new FunctionResultContent(
+                        callid,
+                        ToolResult
+                    )
+                ]
+            );
+        }
+
+        /// <summary>
+        /// Builds the assistant-role message carrying the canned function call and its arguments.
+        /// </summary>
+        public ChatMessage GetMockCall()
+        {
+            return new ChatMessage(
+                ChatRole.Assistant,
+                [
+                    new FunctionCallContent(
+                        CallId,
+                        ToolName,
+                        new Dictionary<string, object?>
+                        {
+                            { "packagePath", "C:\\azure-sdk-for-net\\sdk\\healthdataaiservices\\Azure.Health.Deidentification" },
+                        }
+                    )
+                ]
+            );
+        }
+    }
+}
diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Cli.Evaluations/ToolMocks/PackageGenerateCode.cs b/tools/azsdk-cli/Azure.Sdk.Tools.Cli.Evaluations/ToolMocks/PackageGenerateCode.cs
new file mode 100644
index 00000000000..1838f320963
--- /dev/null
+++ b/tools/azsdk-cli/Azure.Sdk.Tools.Cli.Evaluations/ToolMocks/PackageGenerateCode.cs
@@ -0,0 +1,53 @@
+using Microsoft.Extensions.AI;
+
+namespace Azure.Sdk.Tools.Cli.Evaluations.ToolMocks
+{
+    /// <summary>
+    /// Tool mock for azsdk_package_generate_code: supplies a fixed assistant tool call and a fixed tool result message.
+    /// </summary>
+    public class PackageGenerateCode : IToolMock
+    {
+        public string ToolName => "azsdk_package_generate_code";
+        public string CallId => "tooluse_l1vP7afmgopmnhgmjp";
+        private string ToolResult => """{"message":"SDK generation completed successfully using tspconfig.yaml.","result":"succeeded","language":"DotNet","package_name":"","version":"","package_type":"Unknown","sdk_repo":"azure-sdk-for-net","next_steps":["If the SDK is not Python, build the code"],"operation_status":"Succeeded"}""";
+
+        /// <summary>
+        /// Wraps the canned JSON result in a tool-role message tied to the supplied call id.
+        /// </summary>
+        public ChatMessage GetMockResponse(string callid)
+        {
+            return new ChatMessage(
+                ChatRole.Tool,
+                [
+                    new FunctionResultContent(
+                        callid,
+                        ToolResult
+                    )
+                ]
+            );
+        }
+
+        /// <summary>
+        /// Builds the assistant-role message carrying the canned function call and its arguments.
+        /// </summary>
+        public ChatMessage GetMockCall()
+        {
+            return new ChatMessage(
+                ChatRole.Assistant,
+                [
+                    new FunctionCallContent(
+                        CallId,
+                        ToolName,
+                        new Dictionary<string, object?>
+                        {
+                            { "tspConfigPath", "C:\\azure-rest-api-specs\\specification\\healthdataaiservices\\HealthDataAIServices.DeidServices\\tspconfig.yaml" },
+                            { "localSdkRepoPath", "C:\\azure-sdk-for-net" },
+                            { "tspLocationPath", "" },
+                            { "emitterOptions", "" }
+                        }
+                    )
+                ]
+            );
+        }
+    }
+}
diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Cli.Evaluations/ToolMocks/ToolMocks.cs b/tools/azsdk-cli/Azure.Sdk.Tools.Cli.Evaluations/ToolMocks/ToolMocks.cs
index 26842ba8993..a9485b4db4a 100644
--- a/tools/azsdk-cli/Azure.Sdk.Tools.Cli.Evaluations/ToolMocks/ToolMocks.cs
+++ b/tools/azsdk-cli/Azure.Sdk.Tools.Cli.Evaluations/ToolMocks/ToolMocks.cs
@@ -27,6 +27,8 @@ private static void RegisterMocks()
                 new CreatePullRequest(),
                 new CreateReleasePlan(),
                 new VerifySetup(),
+                new PackageGenerateCode(),
+                new PackageBuildCode(),
             };
 
             foreach (var mock in mockInstances)