From 2964cf3f135e810fa0ead9138b943ea97ab91644 Mon Sep 17 00:00:00 2001 From: Kiran Kumar Kolli Date: Sat, 31 Jan 2026 23:49:59 -0800 Subject: [PATCH 01/28] Add Copilot agent issue triage and resolution plan MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This plan defines a comprehensive workflow for GitHub Copilot agents to: - Triage GitHub issues (bugs, features, questions, documentation) - Reproduce issues against Cosmos DB emulator - Analyze code using local tools and Bluebird code graph - Propose workarounds and draft PRs - Validate fixes through local and remote CI Key sections: - Model configuration (Claude Opus 4.5) - Issue classification matrix - Reproduction workflow with multi-version testing - Performance benchmark validation - Remote CI validation (Azure Pipelines gates) - Branch naming convention (users//) - Lessons learned from Issue #5547 case study πŸ€– This plan was created with GitHub Copilot assistance. --- .github/copilot-agent-plan.md | 2152 +++++++++++++++++++++++++++++++++ 1 file changed, 2152 insertions(+) create mode 100644 .github/copilot-agent-plan.md diff --git a/.github/copilot-agent-plan.md b/.github/copilot-agent-plan.md new file mode 100644 index 0000000000..c1508245f9 --- /dev/null +++ b/.github/copilot-agent-plan.md @@ -0,0 +1,2152 @@ +# Copilot Agent Issue Triage & Resolution Plan +## Azure Cosmos DB .NET SDK (azure-cosmos-dotnet-v3) + +--- + +## 1. Executive Summary + +This plan defines a comprehensive workflow for Copilot agents to handle GitHub issues for the Azure Cosmos DB .NET SDK repository. The agent will: +- Triage all issue types (bugs, features, questions, documentation) +- Reproduce issues against the Cosmos DB emulator (when applicable) +- Leverage Bluebird code graph tools for deep code analysis +- Create linked investigation issues with structured findings +- Auto-assign to human reviewers based on labels/area +- Propose workarounds and draft PRs when appropriate + +--- + +## 1.1 Model Configuration + +**Primary Model: Claude Opus 4.5** (`claude-opus-4.5`) + +Use Claude Opus for all investigation and analysis tasks due to its superior reasoning capabilities for complex debugging scenarios. + +```yaml +model_configuration: + primary_model: "claude-opus-4.5" + + task_model_mapping: + # Deep analysis tasks - use Opus + issue_triage: "claude-opus-4.5" + root_cause_analysis: "claude-opus-4.5" + code_investigation: "claude-opus-4.5" + pr_review: "claude-opus-4.5" + documentation_writing: "claude-opus-4.5" + + # Quick tasks - can use faster models if needed + simple_queries: "claude-opus-4.5" # Keep Opus for consistency + file_search: "claude-opus-4.5" + +agent_invocation: + explore_agent: + model: "claude-opus-4.5" + use_for: "Codebase exploration and understanding" + + task_agent: + model: "claude-opus-4.5" + use_for: "Build, test, reproduction execution" + + general_purpose_agent: + model: "claude-opus-4.5" + use_for: "Complex multi-step investigations" + + code_review_agent: + model: "claude-opus-4.5" + use_for: "PR and code change review" +``` + +**Why Claude Opus:** +- Superior reasoning for complex debugging scenarios +- Better at understanding nuanced code behavior +- More accurate root cause analysis +- Higher quality documentation generation +- Better at synthesizing information from multiple sources + +--- + +## 2. Issue Intake & Classification + +### 2.1 Issue Triggers +```yaml +triggers: + - new_issue_created + - issue_labeled: ["needs-triage", "bug", "question"] + - issue_commented: ["@copilot investigate", "@copilot help"] +``` + +### 2.2 Classification Matrix + +| Issue Type | Reproduction Required | Code Analysis Depth | Output | +|------------|----------------------|---------------------|--------| +| **Bug - Crash/Exception** | Yes - Full repro | Deep (call graph, stack trace mapping) | Investigation issue + Draft PR | +| **Bug - Performance** | Yes - Benchmark | Medium (hot path analysis) | Investigation issue + Perf report | +| **Bug - Data Corruption** | Yes - Careful repro | Deep (serialization path) | Investigation issue + CRITICAL label | +| **Question - How-to** | No | Light (find examples) | Comment with answer | +| **Question - Behavior** | Maybe | Medium (find relevant code) | Investigation issue if complex | +| **Enhancement** | No | Medium (impact analysis) | Feasibility comment | +| **Documentation** | No | Light | Draft PR | + +### 2.3 Standard Labels to Apply +```yaml +labels: + type: + - bug + - enhancement + - question + - documentation + area: + - Batch + - Query + - ChangeFeed + - DirectMode + - GatewayMode + - Serialization + - Encryption + - Diagnostics + - Retry + - PartitionKey + priority: + - P0-critical + - P1-high + - P2-medium + - P3-low + status: + - needs-triage + - investigating + - needs-repro + - has-workaround + - ready-for-pr +``` + +--- + +## 3. Triage Workflow + +### Phase 0: Issue Retrieval + +**Handle GitHub API limitations (SAML enforcement for Azure org):** + +```yaml +issue_retrieval: + primary_method: + tool: "github-mcp-server-issue_read" + params: + method: "get" + owner: "Azure" + repo: "azure-cosmos-dotnet-v3" + issue_number: "{number}" + + fallback_on_saml_error: + trigger: "403 error with 'Resource protected by organization SAML enforcement'" + tool: "web_fetch" + url: "https://github.com/Azure/azure-cosmos-dotnet-v3/issues/{number}" + note: "Scrapes issue page directly, bypasses API authentication" + + error_handling: + saml_error: + message: "GitHub API blocked by SAML - using web fallback" + action: "Switch to web_fetch automatically" + not_found: + message: "Issue #{number} not found" + action: "Ask user to verify issue number" +``` + +### Phase 1: Initial Assessment (< 5 minutes) + +```mermaid +flowchart TD + A[New Issue] --> B{Has repro steps?} + B -->|Yes| C{Has code sample?} + B -->|No| D[Request repro steps] + C -->|Yes| E[Extract key info] + C -->|No| F[Request minimal repro] + E --> G{Issue type?} + G -->|Bug| H[Start Bug Workflow] + G -->|Question| I[Start Question Workflow] + G -->|Enhancement| J[Start Enhancement Workflow] + G -->|Docs| K[Start Docs Workflow] +``` + +#### Information Extraction Checklist +```markdown +- [ ] SDK Version (from issue or code) +- [ ] .NET Version (runtime) +- [ ] Connection Mode (Direct/Gateway) +- [ ] Operation Type (CRUD, Query, Batch, ChangeFeed) +- [ ] Error Message / Exception Type +- [ ] Stack Trace (if available) +- [ ] Cosmos DB Account Type (Serverless/Provisioned) +- [ ] Partition Key configuration +- [ ] Custom serializer in use? +- [ ] Retry policy configuration +``` + +### Phase 1.5: Confirmation Gate ⚠️ + +**Before proceeding with investigation, present findings to user and ask for confirmation:** + +```markdown +## Investigation Scope Confirmation + +I've analyzed issue #{number} and gathered the following initial context: + +**Issue Summary:** +- Type: {bug/question/enhancement/docs} +- Area: {Batch/Query/ChangeFeed/etc} +- Reported SDK Version: {version} +- Priority Assessment: {P0-P3} + +**Proposed Investigation Plan:** +1. Historical search: GitHub issues, PRs, StackOverflow, changelog +2. Code analysis: {specific areas to investigate} +3. Reproduction: Test against {reported version}, {latest SDK}, {master branch} + +**Estimated Time:** {X minutes} + +**Proceed with investigation?** +- [ ] Yes, proceed as planned +- [ ] Yes, but modify scope: {specify} +- [ ] No, need more information first +- [ ] Skip reproduction (analysis only) +``` + +> ⚠️ **MANDATORY**: Always wait for user confirmation before starting resource-intensive investigation or reproduction steps. + +--- + +### Phase 2: Historical Analysis (< 10 minutes) + +#### 2a. Search for Related Issues (GitHub) +``` +Tools to use: +- github-mcp-server-search_issues: Find similar past issues +- github-mcp-server-list_pull_requests: Find related fixes +- git log --grep: Search commit history +``` + +**Search patterns:** +```bash +# Search by exception type +github search issues: "CosmosException" + "" + +# Search by operation +github search issues: "TransactionalBatch" + "timeout" + +# Search changelog +grep -i "" changelog.md +``` + +#### 2b. Search External Public References + +**StackOverflow Search:** +```yaml +tool: web_search +queries: + - "azure-cosmosdb {error message} site:stackoverflow.com" + - "Microsoft.Azure.Cosmos {operation} {issue keyword} site:stackoverflow.com" + - "{exception type} cosmos db .net sdk site:stackoverflow.com" +``` + +**Other Trusted Sources:** +```yaml +trusted_sources: + - stackoverflow.com (tag: azure-cosmosdb) + - docs.microsoft.com/azure/cosmos-db + - devblogs.microsoft.com/cosmosdb + - github.com/Azure/azure-cosmos-dotnet-v3/discussions + - techcommunity.microsoft.com (Cosmos DB tag) + +search_tool: web_search +query_template: "{issue keywords} {error/behavior} site:{source}" +``` + +**What to Extract from External Sources:** +- Known workarounds from community +- Similar issue patterns and resolutions +- Version-specific behavior changes +- Configuration recommendations +- Performance tuning tips + +#### 2c. Check Changelog for Related Fixes +```yaml +search_in: + - changelog.md + - PULL_REQUEST_TEMPLATE.md (for PR patterns) + - docs/releaseNotes/*.md +``` + +#### 2d. Search Commit History +```bash +git log --all --oneline --grep="" -- +git log --all --oneline -S "" # Search for code changes +``` + +### Phase 3: Code Analysis + +#### 3a. Combined Analysis Approach: Local + Bluebird + +**Use BOTH local tools and Bluebird for comprehensive analysis:** + +```yaml +analysis_strategy: + local_tools_for: + - Quick exact-match searches (grep) + - File discovery (glob) + - Reading specific known files (view) + - String literal searches + - Configuration file analysis + + bluebird_tools_for: + - Semantic/conceptual searches + - Call graph traversal + - Inheritance hierarchy + - Code summaries and understanding + - Complex relationship queries +``` + +**Local Tools Workflow:** +```bash +# Find files related to issue area +glob: "**/*{keyword}*.cs" + +# Search for error messages, constants +grep: pattern="{error code}" glob="*.cs" -n + +# Search for specific patterns +grep: pattern="throw.*{ExceptionType}" glob="*.cs" -C 3 + +# Find configuration/constants +grep: pattern="{config key}" path="src/" -n +``` + +**Bluebird Tools Workflow:** +```yaml +analysis_workflow: + 1_locate: + tool: do_vector_search + input: + similarity_search_text: "" + search_index: "Function" | "Class" | "General" + top_k: 10 + + 2_understand: + tool: get_hierarchical_summary + input: + node_name: "" + node_type: "Class" | "Function" + + 3_trace_calls: + tool: get_function_calling_functions + input: + function_name: "" + # Follow call chain to find root cause + + 4_get_code: + tool: get_source_code + input: + node_name: "" + node_type: "Function" +``` + +**Combined Analysis Example:** +```yaml +investigation_flow: + step_1: + description: "Quick local search for error/keyword" + tool: grep + purpose: "Fast initial scan" + + step_2: + description: "Semantic search for related functionality" + tool: bluebird.do_vector_search + purpose: "Find conceptually related code" + + step_3: + description: "Read specific files found" + tool: view + purpose: "Examine exact code" + + step_4: + description: "Trace call relationships" + tool: bluebird.get_function_calling_functions + purpose: "Understand code flow" + + step_5: + description: "Cross-reference with tests" + tool: grep + glob + purpose: "Find existing test coverage" +``` + +#### 3b. Analysis by Issue Type + +**For Exceptions/Crashes:** +```yaml +steps: + - Parse stack trace to identify top frame in SDK code + - Use get_function_calling_functions to trace call path + - Use get_function_called_functions to understand what failed + - Identify error handling paths with grep for "throw" patterns + - Check if retry logic should have caught this +``` + +**For Performance Issues:** +```yaml +steps: + - Identify hot path using operation type + - Check serialization path (CosmosSerializerCore) + - Check connection/retry configuration + - Look for blocking calls (sync over async) + - Analyze diagnostics output if provided +``` + +**For Data/Serialization Issues:** +```yaml +steps: + - Identify serializer in use + - Trace serialization path (ToStream/FromStream) + - Check partition key extraction logic + - Verify JSON path handling + - Check for custom serializer conflicts +``` + +--- + +## 4. Reproduction Workflow + +### 4.1 Multi-Version Reproduction Strategy + +**Test against THREE versions to determine fix status:** + +```yaml +reproduction_versions: + 1_reported_version: + description: "Version from issue report" + purpose: "Confirm issue exists as reported" + source: "NuGet package" + + 2_latest_stable: + description: "Latest released SDK version" + purpose: "Check if already fixed in release" + source: "NuGet package (latest)" + + 3_master_branch: + description: "Current master/main branch" + purpose: "Check if fix exists but not released" + source: "Local build from master" +``` + +**Version Matrix Test Results:** + +| Version | Result | Implication | +|---------|--------|-------------| +| Reported βœ… / Latest βœ… / Master βœ… | Issue persists | Needs fix | +| Reported βœ… / Latest ❌ / Master ❌ | Fixed in release | Recommend upgrade | +| Reported βœ… / Latest βœ… / Master ❌ | Fixed, not released | Mention upcoming fix | +| Reported ❌ / Latest ❌ / Master ❌ | Cannot reproduce | Request more info | + +### 4.2 Environment Setup + +```yaml +prerequisites: + - Cosmos DB Emulator (Windows) or Linux emulator + - .NET SDK matching issue reporter's version + - Connection modes: Direct + Gateway + +emulator_setup: + script: templates/emulator-setup.yml + connection_string: "AccountEndpoint=https://localhost:8081/;AccountKey=C2y6yDjf5/R+ob0N8A7Cgv30VRDJIWEHLM+4QDU5DE2nQ9nDuVTqobD4b8mGGyPMbIZnqyMsEcaGQy67XIw/Jw==" + +version_setup: + reported_version: + command: | + # Create temp project with specific version + dotnet new console -o IssueRepro_{number}_v{reported} + cd IssueRepro_{number}_v{reported} + dotnet add package Microsoft.Azure.Cosmos --version {reported_version} + + latest_version: + command: | + dotnet new console -o IssueRepro_{number}_latest + cd IssueRepro_{number}_latest + dotnet add package Microsoft.Azure.Cosmos # Gets latest + + master_branch: + command: | + # Build from local master + git checkout master + git pull origin master + dotnet build Microsoft.Azure.Cosmos.sln -c Release + # Reference local build in test project +``` + +### 4.3 Reproduction Decision Matrix + +| Condition | Action | +|-----------|--------| +| Clear repro steps + code sample | Attempt full reproduction on all 3 versions | +| Partial repro steps | Create minimal test, request clarification | +| No repro steps, clear error | Attempt based on error pattern | +| Intermittent issue | Create stress test, log diagnostics | +| Environment-specific | Document, request more details | + +### 4.4 Reproduction Test Template + +```csharp +// File: tests/IssueRepro/Issue_{number}_Tests.cs + +/// +/// Reproduction tests for GitHub Issue #{number} +/// Run against: Reported version, Latest stable, Master branch +/// +[TestClass] +public class Issue_{number}_Tests +{ + private CosmosClient _client; + private Container _container; + + // Capture which version is being tested + private static readonly string SdkVersion = typeof(CosmosClient).Assembly + .GetCustomAttribute()?.InformationalVersion; + + [TestInitialize] + public async Task Setup() + { + Console.WriteLine($"Testing with SDK Version: {SdkVersion}"); + + // Setup based on issue details + _client = new CosmosClient( + connectionString: EmulatorConnectionString, + clientOptions: new CosmosClientOptions + { + ConnectionMode = ConnectionMode.{Direct|Gateway}, + // Add any specific options from issue + }); + + var database = await _client.CreateDatabaseIfNotExistsAsync("IssueReproDb"); + _container = await database.Database.CreateContainerIfNotExistsAsync( + new ContainerProperties("IssueReproContainer", "/pk")); + } + + [TestMethod] + [Description("Reproduction for GitHub Issue #{number}: {title}")] + public async Task Reproduce_Issue_{number}_GatewayMode() + { + // Test in Gateway mode + } + + [TestMethod] + [Description("Reproduction for GitHub Issue #{number}: {title}")] + public async Task Reproduce_Issue_{number}_DirectMode() + { + // Test in Direct mode + } + + [TestCleanup] + public async Task Cleanup() + { + await _client.GetDatabase("IssueReproDb").DeleteAsync(); + _client.Dispose(); + } +} +``` + +### 4.5 Running Multi-Version Reproductions + +```yaml +test_execution: + reported_version: + setup: | + cd IssueRepro_{number}_v{reported} + dotnet restore + gateway: + command: dotnet test --filter "Issue_{number}" -- CosmosDB:ConnectionMode=Gateway + direct: + command: dotnet test --filter "Issue_{number}" -- CosmosDB:ConnectionMode=Direct + record: "version={reported}, gateway={pass/fail}, direct={pass/fail}" + + latest_version: + setup: | + cd IssueRepro_{number}_latest + dotnet restore + gateway: + command: dotnet test --filter "Issue_{number}" -- CosmosDB:ConnectionMode=Gateway + direct: + command: dotnet test --filter "Issue_{number}" -- CosmosDB:ConnectionMode=Direct + record: "version=latest, gateway={pass/fail}, direct={pass/fail}" + + master_branch: + setup: | + # Ensure master is built + dotnet build Microsoft.Azure.Cosmos.sln -c Release + gateway: + command: dotnet test --filter "Issue_{number}" -- CosmosDB:ConnectionMode=Gateway + direct: + command: dotnet test --filter "Issue_{number}" -- CosmosDB:ConnectionMode=Direct + record: "version=master, gateway={pass/fail}, direct={pass/fail}" + +capture: + - CosmosDiagnostics output + - Exception details + - Request/response traces + - Timeline of operations + - SDK version in output +``` + +### 4.6 Reproduction Results Summary Template + +```markdown +## Reproduction Results + +| Version | Gateway Mode | Direct Mode | Notes | +|---------|--------------|-------------|-------| +| {reported_version} | βœ… Reproduced / ❌ Not Reproduced | βœ… / ❌ | {notes} | +| {latest_version} | βœ… / ❌ | βœ… / ❌ | {notes} | +| master ({commit_sha}) | βœ… / ❌ | βœ… / ❌ | {notes} | + +**Conclusion:** +- [ ] Issue persists in all versions β†’ Needs fix +- [ ] Fixed in latest release β†’ Recommend upgrade to {version} +- [ ] Fixed in master, not released β†’ Will be available in next release +- [ ] Cannot reproduce β†’ Need more information +``` + +### 4.7 Performance Issue Validation via Benchmarks + +For performance-related issues, use BenchmarkDotNet to validate and measure impact. + +#### When to Run Benchmarks +```yaml +benchmark_triggers: + - Issue mentions: "slow", "performance", "latency", "throughput", "RU", "timeout" + - Issue type: "Bug - Performance" + - Suspected hot path changes + - Before/after fix validation +``` + +#### Benchmark Project Setup + +```csharp +// File: tests/IssueRepro/Benchmarks/Issue_{number}_Benchmark.cs + +using BenchmarkDotNet.Attributes; +using BenchmarkDotNet.Running; +using Microsoft.Azure.Cosmos; + +[MemoryDiagnoser] +[RankColumn] +[MinColumn, MaxColumn, MeanColumn, MedianColumn] +public class Issue_{number}_Benchmark +{ + private CosmosClient _client; + private Container _container; + + [Params("Gateway", "Direct")] + public string ConnectionMode { get; set; } + + [Params("3.35.0", "Latest", "Master")] // Versions to compare + public string SdkVersion { get; set; } + + [GlobalSetup] + public async Task Setup() + { + var options = new CosmosClientOptions + { + ConnectionMode = ConnectionMode == "Direct" + ? Microsoft.Azure.Cosmos.ConnectionMode.Direct + : Microsoft.Azure.Cosmos.ConnectionMode.Gateway + }; + + _client = new CosmosClient(EmulatorConnectionString, options); + var db = await _client.CreateDatabaseIfNotExistsAsync("BenchmarkDb"); + _container = await db.Database.CreateContainerIfNotExistsAsync( + new ContainerProperties("BenchmarkContainer", "/pk")); + + // Warm up + await WarmupAsync(); + } + + [GlobalCleanup] + public async Task Cleanup() + { + await _client.GetDatabase("BenchmarkDb").DeleteAsync(); + _client.Dispose(); + } + + [Benchmark(Baseline = true)] + public async Task Baseline_Operation() + { + // Baseline operation for comparison + } + + [Benchmark] + public async Task Issue_{number}_Scenario() + { + // The specific operation reported as slow + } + + private async Task WarmupAsync() + { + // Warmup to stabilize connections + for (int i = 0; i < 10; i++) + { + await _container.ReadContainerAsync(); + } + } +} + +// Runner +public class Program +{ + public static void Main(string[] args) + { + var summary = BenchmarkRunner.Run(); + } +} +``` + +#### Benchmark Execution + +```yaml +benchmark_workflow: + setup: + - Install BenchmarkDotNet: dotnet add package BenchmarkDotNet + - Ensure Release build: dotnet build -c Release + - Start emulator with consistent state + + execution: + command: dotnet run -c Release --project Benchmarks/Issue_{number}_Benchmark.csproj + + compare_versions: + # Run same benchmark against different SDK versions + reported_version: + command: dotnet run -c Release -- --filter "*" --artifacts ./results/v{reported} + latest_version: + command: dotnet run -c Release -- --filter "*" --artifacts ./results/latest + master_branch: + command: dotnet run -c Release -- --filter "*" --artifacts ./results/master +``` + +#### Benchmark Metrics to Capture + +```yaml +metrics: + latency: + - Mean (ms) + - Median (ms) + - P95 (ms) + - P99 (ms) + - Min/Max (ms) + + throughput: + - Operations/second + - RU/s consumed + + memory: + - Allocated bytes + - Gen0/Gen1/Gen2 collections + + comparison: + - Ratio vs baseline + - Ratio vs previous version +``` + +#### Benchmark Results Template + +```markdown +## Performance Benchmark Results + +**Issue:** #{number} - {title} +**Scenario:** {description of operation benchmarked} +**Environment:** {CPU, RAM, OS, Emulator version} + +### Latency Comparison + +| Version | Connection | Mean | Median | P95 | P99 | Allocated | +|---------|------------|------|--------|-----|-----|-----------| +| {reported} | Gateway | {ms} | {ms} | {ms} | {ms} | {KB} | +| {reported} | Direct | {ms} | {ms} | {ms} | {ms} | {KB} | +| {latest} | Gateway | {ms} | {ms} | {ms} | {ms} | {KB} | +| {latest} | Direct | {ms} | {ms} | {ms} | {ms} | {KB} | +| master | Gateway | {ms} | {ms} | {ms} | {ms} | {KB} | +| master | Direct | {ms} | {ms} | {ms} | {ms} | {KB} | + +### Throughput Comparison + +| Version | Connection | Ops/sec | RU/s | RU/op | +|---------|------------|---------|------|-------| +| ... | ... | ... | ... | ... | + +### Analysis + +**Regression Detected:** Yes/No +**Regression Severity:** {X}% slower than baseline +**Root Cause Hypothesis:** {explanation} + +### Benchmark Artifacts +- Full BenchmarkDotNet report: [link to artifacts] +- Flame graph (if applicable): [link] +``` + +#### Performance Acceptance Criteria + +```yaml +acceptance_criteria: + no_regression: + latency_increase: "< 5% vs baseline" + memory_increase: "< 10% vs baseline" + throughput_decrease: "< 5% vs baseline" + + fix_validation: + must_show: "Measurable improvement in reported scenario" + no_side_effects: "No regression in other operations" + + documentation: + required: "Before/after benchmark comparison in PR" +``` + +### 4.8 Regression Testing Requirement + +**Before any fix is considered complete, ALL existing tests must pass - both locally AND on remote CI.** + +```yaml +regression_testing: + required: true + + local_validation: + description: "Quick local checks before PR" + workflow: + step_1: + description: "Run full test suite before making changes" + command: dotnet test Microsoft.Azure.Cosmos.sln --no-build -c Release + purpose: "Establish baseline - all tests must pass" + + step_2: + description: "Run full test suite after making changes" + command: dotnet test Microsoft.Azure.Cosmos.sln --no-build -c Release + purpose: "Verify no regressions introduced" + + step_3: + description: "Compare test results" + criteria: + - No previously passing tests should fail + - New tests should pass + - No increase in skipped tests + + remote_ci_validation: + description: "Full CI gate validation (REQUIRED)" + reference: "See Section 7.4 for full CI workflow" + gates_that_must_pass: + - static-tools (code analysis) + - nuget-pack (package build) + - build-test (unit + integration tests) + - build-samples (sample compilation) + - build-benchmark (benchmark build) + - build-preview (preview builds) + - build-internal (internal builds) + - build-thinclient (thin client builds) + note: "Local tests may pass but remote CI catches additional issues" + + test_categories: + unit_tests: + path: "Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests" + command: dotnet test --filter "Category!=Emulator" + required: true + local: true + remote: true + + emulator_tests: + path: "Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.EmulatorTests" + command: dotnet test + required: "when emulator available" + local: "if emulator installed" + remote: true # CI has automated emulator setup + + multi_region_tests: + path: "Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.EmulatorTests" + filter: "TestCategory=MultiRegion" + required: "for cross-region changes" + local: false # Requires Azure resources + remote: true # Uses COSMOSDB_MULTI_REGION secret + + performance_tests: + path: "Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Performance.Tests" + command: dotnet run -c Release + required: "for performance-related changes" + local: true + remote: true + + failure_handling: + if_baseline_fails: + - Document which tests were already failing + - Do not include those in regression comparison + - Note pre-existing failures in investigation issue + + if_new_failures: + - Stop and investigate + - Do not proceed with PR until resolved + - Document root cause of regression + + if_ci_only_failure: + - Get logs using github-mcp-server-get_job_logs + - Check if failure is infrastructure (emulator, network) + - Check if failure is environment-specific + - May need to re-run pipeline for transient failures + + pr_checklist: + - "[ ] All unit tests pass locally" + - "[ ] All emulator tests pass locally (if applicable)" + - "[ ] No regression in existing tests" + - "[ ] New tests added for the fix" + - "[ ] PR created and CI triggered" + - "[ ] ALL remote CI gates pass (Section 7.4)" + - "[ ] CI failures investigated and resolved" +``` + +--- + +When creating a linked investigation issue: + +```markdown +# Investigation: #{original_issue_number} - {title} + +## Original Issue +Linked to: #{original_issue_number} +Reporter: @{username} +Created: {date} + +## Issue Summary +{2-3 sentence summary of the reported problem} + +## Environment Details +| Property | Value | +|----------|-------| +| SDK Version | {version} | +| .NET Version | {version} | +| Connection Mode | {Direct/Gateway} | +| Operation Type | {CRUD/Query/Batch/etc} | +| Account Type | {Serverless/Provisioned} | + +## Reproduction Status +- [ ] Reproduced with reported SDK version ({version}) +- [ ] Reproduced with latest SDK version ({version}) +- [ ] Reproduced with master branch ({commit_sha}) +- [ ] Reproduced in Gateway mode +- [ ] Reproduced in Direct mode +- [ ] Unable to reproduce +- [ ] Needs more information + +### Version Matrix Results + +| Version | Gateway | Direct | Build | Notes | +|---------|---------|--------|-------|-------| +| {reported_version} | βœ…/❌ | βœ…/❌ | βœ… | {notes} | +| {latest_version} | βœ…/❌ | βœ…/❌ | βœ… | {notes} | +| master ({short_sha}) | βœ…/❌ | βœ…/❌ | βœ… | {notes} | + +### Reproduction Steps Attempted +``` +{steps taken} +``` + +### Reproduction Results +``` +{output/logs} +``` + +## Code Analysis + +### Suspected Code Path +``` +{file}:{line} - {function_name} + └─> {file}:{line} - {called_function} + └─> {file}:{line} - {root_cause_location} +``` + +### Relevant Code Sections +```csharp +// {file}:{start_line}-{end_line} +{code snippet} +``` + +### Root Cause Analysis +{detailed explanation of what's happening and why} + +## Historical Context + +### Related StackOverflow / External References +- [{title}]({url}) - {relevance summary} +- [{title}]({url}) - {relevance summary} + +### Related Issues +- #{issue_number} - {title} ({status}) +- #{issue_number} - {title} ({status}) + +### Related PRs/Fixes +- PR #{pr_number} - {title} (merged {date}) + +### Changelog References +- Version {x.y.z}: {relevant changelog entry} + +## Workaround +{If available, provide a workaround} + +```csharp +// Workaround code example +``` + +## Proposed Fix + +### Option 1: {approach name} +**Pros:** +**Cons:** +**Risk:** Low/Medium/High +**Effort:** Small/Medium/Large + +```csharp +// Proposed code change +``` + +### Option 2: {alternative approach} +... + +## Recommended Action +- [ ] Needs more investigation +- [ ] Ready for PR (recommend Option {n}) +- [ ] Won't fix (reason: {reason}) +- [ ] Duplicate of #{issue_number} + +## Reviewer Assignment +Based on area labels, assign to: @{reviewer} + +/cc @{relevant_team_members} +``` + +--- + +## 6. Workaround Identification + +### 6.1 Common Workaround Patterns + +| Issue Pattern | Potential Workaround | +|--------------|---------------------| +| Timeout in Direct mode | Switch to Gateway mode temporarily | +| Serialization failure | Use custom serializer with explicit handling | +| Retry exhaustion | Increase MaxRetryAttempts, add jitter | +| Partition key error | Explicit partition key in request options | +| Bulk throttling | Reduce batch size, implement backoff | +| Connection issues | Configure IdleTcpConnectionTimeout | +| Memory pressure | Enable streaming APIs, dispose properly | + +### 6.2 Workaround Documentation Template + +```markdown +## Temporary Workaround + +**Applies to:** SDK versions {range} +**Issue:** {brief description} + +### Option 1: Configuration Change +```csharp +var options = new CosmosClientOptions +{ + // Workaround configuration +}; +``` + +### Option 2: Code Pattern +```csharp +// Workaround code pattern +``` + +### Limitations +- {limitation 1} +- {limitation 2} + +### When Fix is Available +This workaround can be removed when upgrading to SDK version {x.y.z} or later. +``` + +--- + +## 7. PR Creation Workflow + +### 7.1 Branch Naming Convention + +**Format:** `users//` + +```yaml +branch_naming: + pattern: "users/{username}/{type}-{description}" + + types: + - fix # Bug fixes + - feature # New features + - perf # Performance improvements + - docs # Documentation changes + - refactor # Code refactoring + + examples: + - "users/kirankk/fix-linq-dictionary-objecttoarray" + - "users/kirankk/fix-issue-5547-dictionary-any" + - "users/johndoe/feature-bulk-retry-policy" + - "users/janedoe/perf-batch-throughput" + - "users/alice/docs-linq-dictionary-support" + + rules: + - Use lowercase + - Use hyphens (not underscores) as separators + - Include issue number when applicable + - Keep description concise but descriptive + - Username should match GitHub handle +``` + +### 7.2 PR Eligibility Criteria + +```yaml +create_pr_when: + - Root cause identified AND + - Fix verified in reproduction tests AND + - No breaking changes OR approved breaking change AND + - Follows existing code patterns AND + - Human reviewer identified + +do_not_create_pr_when: + - Requires design discussion + - Breaking change without approval + - Unclear requirements + - Complex cross-cutting change + - Security-sensitive fix (escalate instead) +``` + +### 7.3 PR Template + +```markdown +# {Fix type}: {Brief description} + +## Description +Fixes #{issue_number} + +{Detailed description of the fix} + +## Root Cause +{Explanation of what was causing the issue} + +## Changes Made +- {change 1} +- {change 2} + +## Testing +- [ ] Added/updated unit tests +- [ ] Verified against emulator +- [ ] Tested in Gateway mode +- [ ] Tested in Direct mode + +## Reproduction Test +```csharp +// Test that verifies the fix +``` + +## Breaking Changes +{None | Description of breaking changes} + +## Checklist +- [ ] Code follows project conventions +- [ ] Self-review completed +- [ ] Comments added for complex logic +- [ ] Documentation updated (if applicable) + +## Investigation Issue +See #{investigation_issue_number} for full analysis. +``` + +### 7.3 Reviewer Assignment Matrix + +| Area Label | Primary Reviewer | Backup | +|------------|-----------------|--------| +| Batch | @batch-owners | @sdk-team | +| Query | @query-owners | @sdk-team | +| ChangeFeed | @changefeed-owners | @sdk-team | +| DirectMode | @transport-owners | @sdk-team | +| Serialization | @serialization-owners | @sdk-team | +| Encryption | @encryption-owners | @security-team | + +### 7.4 Remote CI Validation (Azure Pipelines Gates) + +**Critical: Local tests are not sufficient. All fixes must pass the full Azure Pipelines CI gates.** + +#### 7.4.1 CI Pipeline Structure + +The repository uses Azure Pipelines with multiple gate templates defined in `azure-pipelines.yml`: + +```yaml +ci_gates: + # All gates must pass before PR can be merged + + static_tools: + template: templates/static-tools.yml + checks: + - Code analysis + - Style compliance + - Security scanning + + nuget_pack: + template: templates/nuget-pack.yml + checks: + - Package builds successfully + - No packaging errors + + build_test: + template: templates/build-test.yml + checks: + - Unit tests (Release config) + - Integration tests + - Multi-region tests (if applicable) + filter: "TestCategory!=Flaky & TestCategory!=Quarantine & TestCategory!=Functional & TestCategory!=Ignore" + + build_samples: + template: templates/build-samples.yml + checks: + - All samples compile + - Samples reference correct SDK version + + build_benchmark: + template: templates/build-benchmark.yml + checks: + - Benchmark project builds + - No performance regression detected + + build_preview: + template: templates/build-preview.yml + checks: + - Preview features compile + - PREVIEW define constant works + + build_internal: + template: templates/build-internal.yml + checks: + - Internal builds succeed + + build_thinclient: + template: templates/build-thinclient.yml + checks: + - Thin client variant builds +``` + +#### 7.4.2 Local vs Remote CI Comparison + +| Validation Type | Local Testing | Remote CI (Azure Pipelines) | +|----------------|---------------|----------------------------| +| **Unit Tests** | βœ… Can run | βœ… Full matrix | +| **Emulator Tests** | ⚠️ Requires local emulator | βœ… Automated emulator setup | +| **Multi-Region Tests** | ❌ No access | βœ… Uses COSMOSDB_MULTI_REGION | +| **Multi-Master Tests** | ❌ No access | βœ… Uses COSMOSDB_MULTIMASTER | +| **Static Analysis** | ⚠️ Manual | βœ… Automated | +| **Package Validation** | ⚠️ Manual | βœ… NuGet pack verification | +| **Cross-Platform** | ⚠️ Single OS | βœ… Windows matrix | +| **Performance Gates** | ⚠️ Variable hardware | βœ… Consistent CI agents | + +#### 7.4.3 Pre-PR Validation Workflow + +```yaml +validation_workflow: + phase_1_local: + description: "Quick local validation before creating PR" + steps: + - name: "Build solution" + command: dotnet build Microsoft.Azure.Cosmos.sln -c Release + required: true + + - name: "Run unit tests" + command: | + dotnet test Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests \ + --filter "TestCategory!=Flaky & TestCategory!=Quarantine & TestCategory!=Functional & TestCategory!=Ignore" \ + -c Release --no-build + required: true + + - name: "Run LINQ-specific tests (for LINQ changes)" + command: dotnet test --filter "FullyQualifiedName~Linq" -c Release --no-build + required: "for LINQ-related changes" + + phase_2_create_pr: + description: "Create PR to trigger remote CI" + steps: + - name: "Create feature branch" + naming_convention: "users//" + examples: + - "users/kirankk/fix-linq-dictionary-objecttoarray" + - "users/kirankk/issue-5547-dictionary-any" + - "users/johndoe/perf-batch-throughput" + command: | + git checkout -b users/{username}/fix-issue-{number}-{short-description} + git add . + git commit -m "Fix #{number}: {description}" + git push origin users/{username}/fix-issue-{number}-{short-description} + + - name: "Create Draft PR" + purpose: "Triggers CI without requesting review" + template: | + gh pr create --draft \ + --title "Fix #{number}: {title}" \ + --body "$(cat pr_body.md)" + + phase_3_ci_monitoring: + description: "Monitor CI pipeline execution" + steps: + - name: "Check pipeline status" + tool: github-mcp-server-actions_list + params: + method: list_workflow_runs + workflow_runs_filter: + branch: "users/{username}/fix-issue-{number}" + status: "in_progress" + + - name: "Get failed job logs" + tool: github-mcp-server-get_job_logs + params: + failed_only: true + return_content: true + tail_lines: 500 + + phase_4_fix_ci_failures: + description: "Iterate until CI passes" + loop: + - Analyze failure logs + - Identify root cause of CI failure + - Fix locally + - Run local validation + - Push fix + - Wait for CI re-run + exit_condition: "All CI gates green" +``` + +#### 7.4.4 CI Failure Triage + +```yaml +common_ci_failures: + build_failure: + symptoms: + - "error CS####" + - "Build FAILED" + actions: + - Check if failure is in changed files + - Verify all project references + - Check for missing using statements + + test_failure: + symptoms: + - "Failed: X, Passed: Y" + - "Assert.Xxx failed" + actions: + - Get failed test names from logs + - Check if test was passing before changes + - Reproduce failure locally if possible + - Check for flaky test (retry) + + emulator_failure: + symptoms: + - "Connection refused" + - "ServiceUnavailable" + - "Emulator not started" + actions: + - Check templates/emulator-setup.yml for setup steps + - This is likely infrastructure, not code issue + - May need to re-run pipeline + + timeout_failure: + symptoms: + - "Job cancelled" + - "Exceeded timeout" + actions: + - Check if test has infinite loop + - Check for deadlock in async code + - May need to increase timeout or fix perf + + multi_region_failure: + symptoms: + - "COSMOSDB_MULTI_REGION" + - "Endpoint not found" + actions: + - This requires actual Azure resources + - Cannot reproduce locally + - Check if failure is in multi-region specific code +``` + +#### 7.4.5 CI Gate Checklist for PR + +```markdown +## CI Validation Checklist + +### Local Validation (Required before PR) +- [ ] `dotnet build Microsoft.Azure.Cosmos.sln -c Release` passes +- [ ] Unit tests pass with CI filter +- [ ] No new compiler warnings + +### Remote CI Gates (Must pass before merge) +- [ ] **static-tools** - Code analysis clean +- [ ] **nuget-pack** - Package builds successfully +- [ ] **build-test** - All tests pass + - [ ] Unit tests + - [ ] Integration tests + - [ ] Multi-region tests (if applicable) +- [ ] **build-samples** - Samples compile +- [ ] **build-benchmark** - Benchmarks build +- [ ] **build-preview** - Preview builds work +- [ ] **build-internal** - Internal builds work +- [ ] **build-thinclient** - Thin client builds + +### CI Failure Resolution +- [ ] All CI failures investigated +- [ ] No failures related to this change +- [ ] Any infrastructure failures documented + +### Final Status +- [ ] All CI gates GREEN +- [ ] PR ready for review (not draft) +``` + +#### 7.4.6 Monitoring Tools + +```yaml +ci_monitoring_tools: + list_workflows: + tool: github-mcp-server-actions_list + method: list_workflows + purpose: "See all CI workflows in repo" + + list_runs: + tool: github-mcp-server-actions_list + method: list_workflow_runs + purpose: "Check status of PR's CI runs" + filter_by: + - branch + - status (queued, in_progress, completed) + - event (pull_request) + + get_run_details: + tool: github-mcp-server-actions_get + method: get_workflow_run + purpose: "Get details of specific CI run" + + list_jobs: + tool: github-mcp-server-actions_list + method: list_workflow_jobs + purpose: "See individual jobs in a run" + + get_logs: + tool: github-mcp-server-get_job_logs + purpose: "Get failure logs for debugging" + params: + failed_only: true + return_content: true + tail_lines: 500 +``` + +--- + +## 8. Agent Capabilities & Tools + +### 8.1 Model Selection + +**All agents use Claude Opus 4.5** for maximum reasoning quality: + +```yaml +task_tool_invocation: + explore_agent: + agent_type: "explore" + model: "claude-opus-4.5" + + task_agent: + agent_type: "task" + model: "claude-opus-4.5" + + general_purpose_agent: + agent_type: "general-purpose" + model: "claude-opus-4.5" + + code_review_agent: + agent_type: "code-review" + model: "claude-opus-4.5" +``` + +### 8.2 MCP Tools Available + +```yaml +github_mcp_server: + - search_issues: Find related issues + - search_pull_requests: Find related PRs + - list_issues: Browse open issues + - issue_read: Get issue details + - pull_request_read: Get PR details + - get_commit: View commit details + - get_file_contents: Read files from repo + - get_job_logs: Check CI failures + +bluebird_engineering_copilot: + - do_vector_search: Semantic code search + - do_fulltext_search: Keyword code search + - get_source_code: Retrieve source + - get_hierarchical_summary: Get code overview + - get_function_calling_functions: Find callers + - get_function_called_functions: Find callees + - get_class_or_struct_parent_types: Inheritance + - get_class_or_struct_member_functions: Methods +``` + +### 8.3 Local Tools + +```yaml +file_operations: + - view: Read files + - edit: Modify files + - create: Create new files + - grep: Search content + - glob: Find files + +shell_operations: + - powershell: Run commands + - dotnet build: Build solution + - dotnet test: Run tests + - git: Version control +``` + +--- + +## 9. Workflow State Machine + +``` +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ NEW_ISSUE β”‚ +β””β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”˜ + β”‚ + β–Ό +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ NEEDS_TRIAGE │────▢│ NEEDS_INFO β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β”‚ β”‚ + β”‚ β—€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β–Ό +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ INVESTIGATING β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β”‚ + β”Œβ”€β”€β”€β”€β”΄β”€β”€β”€β”€β” + β–Ό β–Ό +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ REPRO β”‚ β”‚ CANNOT_REPROβ”‚ +β””β”€β”€β”€β”¬β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”˜ + β”‚ β”‚ + β–Ό β–Ό +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ ANALYSIS_COMPLETE β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β”‚ + β”Œβ”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” + β–Ό β–Ό β–Ό β–Ό +β”Œβ”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ PR β”‚ β”‚WONT β”‚ β”‚NEEDS β”‚ β”‚DUPLICATE β”‚ +β”‚READY β”‚ β”‚FIX β”‚ β”‚DESIGN β”‚ β”‚ β”‚ +β””β”€β”€β”¬β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β”‚ + β–Ό +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ PR_CREATED β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β”‚ + β–Ό +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ UNDER_REVIEW β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β”‚ + β–Ό +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ MERGED/CLOSED β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ +``` + +--- + +## 10. Quality Checklist + +### Before Creating Investigation Issue +- [ ] Issue fully understood +- [ ] Historical search completed +- [ ] Code path identified +- [ ] Reproduction attempted (if applicable) +- [ ] Root cause hypothesis formed + +### Before Proposing Workaround +- [ ] Workaround verified to work +- [ ] Side effects documented +- [ ] Limitations clearly stated +- [ ] Code example provided + +### Before Creating PR +- [ ] Fix verified in tests +- [ ] No regression introduced +- [ ] Code follows conventions +- [ ] Documentation updated +- [ ] Reviewer assigned + +--- + +## 11. Escalation Criteria + +Escalate to human immediately when: +- Security vulnerability suspected +- Data loss/corruption confirmed +- Breaking change required +- Cross-team coordination needed +- Legal/compliance implications +- Customer escalation mentioned +- P0 priority issue + +--- + +## 13. Documentation Improvement Workflow + +### 13.1 Microsoft Docs Feedback Loop + +When investigating issues, identify documentation gaps that contributed to the issue and suggest improvements. + +#### Triggers for Docs Suggestions +```yaml +suggest_docs_update_when: + - Issue caused by misunderstanding documented behavior + - Common question pattern (3+ similar issues) + - Undocumented edge case discovered + - Error message not explained in docs + - Missing code sample for common scenario + - Outdated information found + - Missing migration guidance between versions +``` + +#### Microsoft Docs Feedback Template + +```markdown +## Documentation Improvement Suggestion + +**Source Issue:** #{issue_number} +**Docs Page:** https://docs.microsoft.com/azure/cosmos-db/{page} +**Severity:** High/Medium/Low + +### Current State +{What the docs currently say or don't say} + +### Problem +{How this caused confusion or the reported issue} + +### Suggested Improvement +{Specific text/code to add or change} + +### Sample Code to Add +```csharp +// Example that would have prevented this issue +``` + +### Related Issues +- #{issue_number} - {X users affected} +- StackOverflow: {link} - {Y views} + +--- +**Action:** File issue at https://github.com/MicrosoftDocs/azure-docs or submit PR +``` + +#### Docs Areas to Monitor + +| SDK Area | Microsoft Docs Section | Common Gaps | +|----------|----------------------|-------------| +| Connection/Setup | /azure/cosmos-db/nosql/quickstart-dotnet | Connection string formats, emulator setup | +| CRUD Operations | /azure/cosmos-db/nosql/how-to-dotnet-create-item | Error handling, partial success | +| Query | /azure/cosmos-db/nosql/how-to-dotnet-query-items | Pagination, cross-partition queries | +| Batch | /azure/cosmos-db/nosql/how-to-dotnet-batch-operations | Size limits, atomicity guarantees | +| Change Feed | /azure/cosmos-db/nosql/change-feed-processor | Lease management, error recovery | +| Partitioning | /azure/cosmos-db/partitioning-overview | Hot partitions, synthetic keys | +| Performance | /azure/cosmos-db/nosql/performance-tips-dotnet-sdk-v3 | Direct vs Gateway, connection pooling | + +--- + +### 13.2 Code Documentation for AI Agents + +Identify and improve SDK code documentation to make it more AI-agent friendly. + +#### Why This Matters +```yaml +ai_agent_challenges: + - Cannot infer intent from variable names alone + - Needs explicit error condition documentation + - Benefits from examples in XML comments + - Requires clear parameter constraints + - Needs explicit nullability documentation + - Benefits from "when to use" guidance +``` + +#### Code Documentation Audit Checklist + +When investigating issues, audit related code for AI-friendliness: + +```markdown +- [ ] XML summary describes WHAT and WHY, not just WHAT +- [ ] Parameters have tags with constraints +- [ ] Return values documented including null/empty cases +- [ ] Exceptions documented with tags +- [ ] Code examples in tags for complex APIs +- [ ] section for edge cases and gotchas +- [ ] links to related methods +- [ ] Async methods document cancellation behavior +``` + +#### Before/After Examples + +**❌ Current (AI-Unfriendly):** +```csharp +/// +/// Creates an item. +/// +/// The item to create. +/// The response. +public Task> CreateItemAsync(T item); +``` + +**βœ… Improved (AI-Friendly):** +```csharp +/// +/// Creates a new item in the container. The item must have a unique 'id' property +/// within its partition key. Use if the item may already exist. +/// +/// The type of item to create. Must be JSON-serializable. +/// +/// The item to create. Must contain: +/// - 'id' property (string, max 255 chars, unique within partition) +/// - Partition key property matching container's partition key path +/// Cannot be null. +/// +/// +/// The partition key value for the item. If null, SDK extracts from item using +/// the container's partition key path. Explicit value recommended for performance. +/// +/// +/// Optional request configuration. Common options: +/// - EnableContentResponseOnWrite: false to reduce response size (default: true) +/// - IfMatchEtag: for optimistic concurrency +/// +/// Token to cancel the operation. Safe to cancel; no partial writes. +/// +/// Response containing: +/// - Resource: The created item (null if EnableContentResponseOnWrite=false) +/// - StatusCode: 201 Created on success +/// - RequestCharge: RU cost of operation +/// - ETag: Version identifier for optimistic concurrency +/// +/// +/// StatusCode 400: Invalid item (missing id, invalid JSON, exceeds 2MB) +/// StatusCode 409: Item with same id already exists in partition +/// StatusCode 413: Item exceeds maximum size (2MB) +/// StatusCode 429: Rate limited - retry after response.RetryAfter +/// +/// +/// Basic usage: +/// +/// var item = new { id = "1", pk = "partitionA", name = "Example" }; +/// var response = await container.CreateItemAsync(item, new PartitionKey("partitionA")); +/// Console.WriteLine($"Created item, cost: {response.RequestCharge} RUs"); +/// +/// +/// +/// With optimistic concurrency: +/// +/// try { +/// var response = await container.CreateItemAsync(item); +/// } catch (CosmosException ex) when (ex.StatusCode == HttpStatusCode.Conflict) { +/// // Item already exists - use UpsertItemAsync or ReplaceItemAsync +/// } +/// +/// +/// +/// Performance tips: +/// - Use Direct connection mode for lower latency +/// - Set EnableContentResponseOnWrite=false if you don't need the response body +/// - Provide explicit PartitionKey to avoid server-side extraction +/// +/// Size limits: +/// - Maximum item size: 2MB +/// - Maximum id length: 255 characters +/// - Maximum partition key value: 2KB +/// +/// +/// +public Task> CreateItemAsync( + T item, + PartitionKey? partitionKey = null, + ItemRequestOptions? requestOptions = null, + CancellationToken cancellationToken = default); +``` + +#### Priority Areas for Documentation Improvement + +Based on common issue patterns, prioritize these areas: + +| Priority | Class/Method | Why | +|----------|-------------|-----| +| P0 | `CosmosClient` constructor | Connection setup confusion | +| P0 | `Container.CreateItemAsync` | Most common operation | +| P0 | `CosmosException` | Error handling critical | +| P1 | `TransactionalBatch` | Complex API, many gotchas | +| P1 | `Container.GetItemQueryIterator` | Pagination confusion | +| P1 | `ChangeFeedProcessor` | Lifecycle management | +| P2 | `CosmosClientOptions` | Many options, unclear defaults | +| P2 | `ItemRequestOptions` | Conditional operations | +| P2 | `QueryDefinition` | Parameterized queries | + +#### Documentation PR Template + +```markdown +# Docs: Improve XML documentation for {ClassName} + +## Motivation +Issue #{number} revealed that the current documentation for `{ClassName}` +is insufficient for AI agents and developers to use correctly. + +## Changes +- Added detailed parameter constraints +- Added exception documentation with status codes +- Added code examples for common scenarios +- Added remarks for edge cases and performance tips +- Added seealso links to related APIs + +## AI-Agent Friendliness Checklist +- [x] Summary describes intent, not just action +- [x] All parameters documented with constraints +- [x] Return value documented including edge cases +- [x] Exceptions documented with conditions +- [x] Examples provided for common use cases +- [x] Remarks cover gotchas and best practices + +## Before/After +[Include diff showing documentation improvement] +``` + +--- + +### 13.3 AI-Agent Documentation Standards + +Define standards for new code to be AI-agent friendly from the start. + +#### Required Documentation Elements + +```yaml +public_api_requirements: + summary: + - Describe WHAT the method does + - Describe WHEN to use it vs alternatives + - Max 3 sentences + + parameters: + - Type constraints (not just type name) + - Valid value ranges + - Null behavior + - Default values and their implications + + returns: + - Success case + - Empty/null cases + - What properties are populated + + exceptions: + - All thrown exceptions + - Conditions that trigger each + - HTTP status codes for CosmosException + + examples: + - Basic happy path + - Error handling pattern + - Advanced scenario (if complex API) + + remarks: + - Performance implications + - Size/rate limits + - Thread safety + - Retry behavior + + seealso: + - Alternative methods + - Related configuration + - Relevant documentation links +``` + +#### Linting Rules for AI-Friendly Docs + +```yaml +documentation_lint_rules: + - error: "Summary must not be empty for public APIs" + - error: "Parameters must have documentation" + - warning: "Consider adding for complex APIs" + - warning: "Consider adding for methods that throw" + - info: "Consider adding for non-obvious behavior" +``` + +--- + +## 14. Documentation Improvement Tracking + +### 14.1 Issue Label for Docs + +Add label `docs-improvement` to issues that reveal documentation gaps. + +### 14.2 Docs Improvement Backlog Template + +Track documentation improvements separately: + +```markdown +## Documentation Improvement Backlog + +### Microsoft Docs (External) +| Page | Issue | Priority | Status | +|------|-------|----------|--------| +| {docs page} | {gap description} | P0/P1/P2 | Suggested/Filed/Merged | + +### SDK XML Docs (Internal) +| Class/Method | Issue | Priority | Status | +|--------------|-------|----------|--------| +| {class.method} | {gap description} | P0/P1/P2 | Identified/PR/Merged | + +### New Samples Needed +| Scenario | Issue Pattern | Priority | Status | +|----------|--------------|----------|--------| +| {scenario} | #{issue_numbers} | P0/P1/P2 | Identified/Created | +``` + +--- + +## 15. Metrics & Reporting + +Track for continuous improvement: +- Time from issue creation to triage +- Time from triage to investigation complete +- Reproduction success rate +- Workaround acceptance rate +- PR acceptance rate +- False positive rate (wrong root cause) +- Docs improvement suggestions generated +- Docs PRs merged (internal + external) + +--- + +## 16. Lessons Learned (Issue #5547 Case Study) + +This section documents learnings from the first real issue investigation using this plan. + +### 16.1 Issue Details +- **Issue:** #5547 - LINQ Dictionary.Any() generates incorrect SQL +- **Type:** Bug - Query/LINQ +- **Outcome:** Fix implemented, PR created, awaiting CI validation + +### 16.2 What Worked Well + +```yaml +effective_approaches: + parallel_agents: + description: "Using background agents for parallel work" + example: | + - agent-0: Create reproduction test + - agent-1: Implement fix + - agent-2: Run baseline tests + benefit: "Reduced total investigation time significantly" + + local_code_search: + tools_used: + - grep: "Fast pattern matching in source files" + - glob: "Find files by name pattern" + - view: "Read specific file sections" + benefit: "More reliable than remote API when SAML blocks access" + + web_fetch_for_issues: + description: "Use web_fetch when GitHub API is blocked by SAML" + example: "web_fetch https://github.com/Azure/azure-cosmos-dotnet-v3/issues/5547" + benefit: "Can still retrieve issue details without API access" + + stackoverflow_research: + description: "Search StackOverflow for known issues and workarounds" + benefit: "Confirmed issue was known limitation, found workaround pattern" + + incremental_testing: + workflow: + - Run targeted tests first (LINQ-specific) + - Then broader regression tests + - Build verification last + benefit: "Faster feedback loop during development" +``` + +### 16.3 Challenges Encountered + +```yaml +challenges: + github_api_saml: + problem: "Azure org requires SAML authentication for API access" + symptom: "403 error with 'Resource protected by organization SAML enforcement'" + workaround: "Use web_fetch to scrape issue page directly" + recommendation: "Always have web_fetch fallback for GitHub operations" + + github_cli_not_installed: + problem: "gh CLI not available in environment" + symptom: "'gh' is not recognized as a cmdlet" + workaround: "Provide PR details for manual creation via GitHub web UI" + recommendation: "Check for gh CLI early, have manual fallback ready" + + long_running_tests: + problem: "Full test suite takes several minutes" + symptom: "Commands timeout waiting for completion" + workaround: "Use initial_wait parameter, then read_powershell for polling" + recommendation: "Run targeted tests first, full suite only for final validation" + + agent_completion_time: + problem: "Complex agents (general-purpose with Opus) take 5-10 minutes" + symptom: "read_agent times out multiple times" + workaround: "Use wait:true with longer timeout (180-300s)" + recommendation: "Set expectations for agent completion times" +``` + +### 16.4 Tool Selection Guide (Refined) + +```yaml +tool_selection: + for_issue_retrieval: + first_try: "github-mcp-server-issue_read" + fallback: "web_fetch (when SAML blocks API)" + + for_code_search: + semantic_search: "bluebird do_vector_search" + exact_match: "grep with pattern" + file_discovery: "glob with pattern" + + for_code_reading: + known_path: "view tool directly" + unknown_location: "grep/glob first, then view" + + for_parallel_work: + independent_tasks: "Multiple background agents" + dependent_tasks: "Sequential agent calls" + + for_testing: + quick_validation: "dotnet test --filter (specific tests)" + regression_check: "dotnet test (full suite)" + ci_validation: "Create PR, monitor Azure Pipelines" + + for_pr_creation: + with_gh_cli: "gh pr create --draft" + without_gh_cli: "Provide PR URL and description for manual creation" +``` + +### 16.5 Timing Benchmarks + +| Phase | Duration | Notes | +|-------|----------|-------| +| Issue fetch (web_fetch) | ~5s | Faster than API when working | +| Codebase search (grep) | ~2s | Very fast for pattern matching | +| Root cause analysis | ~10min | With Opus model, thorough | +| Reproduction test creation (agent) | ~4min | Background agent | +| Fix implementation (agent) | ~10min | Background agent with Opus | +| Baseline test run | ~2min | 9 LINQ tests | +| Build verification | ~15s | Incremental build | +| Branch + commit + push | ~30s | Local git operations | + +### 16.6 PR Template Refinements + +Based on this exercise, PRs should include: + +```yaml +pr_requirements: + header: + - "πŸ€– This PR was authored by GitHub Copilot" + - Links to original issue + + sections: + - Description (what the PR does) + - Root Cause (why the bug existed) + - Changes Made (bullet list) + - Generated SQL/Output (before/after comparison) + - Testing (checklist) + - Checklist (code conventions) + + footer: + - "Generated by GitHub Copilot CLI Agent" + + labels_to_add: + - "copilot-authored" + - Area label (e.g., "Query", "LINQ") +``` + +### 16.7 Recommended Workflow Sequence + +```yaml +recommended_workflow: + phase_1_intake: + duration: "~2 min" + steps: + - Try GitHub API for issue details + - Fallback to web_fetch if SAML blocked + - Classify issue type and area + - Confirm with user before proceeding + + phase_2_research: + duration: "~5 min" + steps: + - Search StackOverflow for known issues + - Search codebase for related code + - Review recent commits/PRs in area + - Document findings + + phase_3_analysis: + duration: "~10 min" + steps: + - Deep code analysis with grep/view + - Use Bluebird for call graph if needed + - Identify root cause + - Document workaround if available + + phase_4_implementation: + duration: "~15 min" + parallel_agents: + - Reproduction test (general-purpose agent) + - Fix implementation (general-purpose agent) + - Baseline tests (task agent) + then: + - Verify all agents completed + - Review changes + - Run regression tests + + phase_5_pr_creation: + duration: "~5 min" + steps: + - Create feature branch (users//) + - Commit with descriptive message + - Push to remote + - Create draft PR (or provide details for manual creation) + - Monitor CI status +``` + +--- + +## TODO: Implementation Tasks + +### Completed βœ… +- [x] Test plan on real issue (#5547) +- [x] Document branch naming convention (users//) +- [x] Add remote CI validation workflow (Section 7.4) +- [x] Document Copilot-authored PR format +- [x] Add lessons learned section + +### Pending +- [ ] Create GitHub Action workflow for auto-triggering agent on new issues +- [ ] Set up emulator environment in CI for reproduction tests +- [ ] Define reviewer assignment rules in CODEOWNERS +- [ ] Create issue templates with required fields +- [ ] Build integration tests for agent workflow +- [ ] Document agent capabilities for contributors +- [ ] Set up metrics dashboard +- [ ] Create Microsoft Docs feedback automation +- [ ] Define XML documentation linting rules +- [ ] Audit high-priority APIs for AI-friendly docs +- [ ] Create documentation improvement backlog +- [ ] Add "copilot-authored" label to repository +- [ ] Install GitHub CLI (gh) in development environment +- [ ] Document SAML workarounds for Azure org repos From f8d3dec64e26ee54896e80d10be456fd439364fc Mon Sep 17 00:00:00 2001 From: Kiran Kumar Kolli Date: Sat, 31 Jan 2026 23:56:43 -0800 Subject: [PATCH 02/28] Update PR template with full investigation details and proper referencing - Expanded PR template to include full investigation details (not just summary) - Added issue summary table, root cause analysis section, before/after output - Added workaround documentation section - Added test results table and reproduction test template - Added Section 7.3.1: PR Reference Guidelines for proper linking - PR description now includes all information (no separate investigation issue needed) --- .github/copilot-agent-plan.md | 159 ++++++++++++++++++++++++++++++---- 1 file changed, 144 insertions(+), 15 deletions(-) diff --git a/.github/copilot-agent-plan.md b/.github/copilot-agent-plan.md index c1508245f9..e626c5d0f6 100644 --- a/.github/copilot-agent-plan.md +++ b/.github/copilot-agent-plan.md @@ -1120,46 +1120,175 @@ do_not_create_pr_when: ### 7.3 PR Template +**PR must include full investigation details, not just a summary.** + ```markdown # {Fix type}: {Brief description} ## Description Fixes #{issue_number} -{Detailed description of the fix} +> πŸ€– **This PR was authored by GitHub Copilot** as part of an automated issue triage and resolution workflow. + +{Detailed description of the problem and the fix} + +--- + +## Issue Summary +| Property | Value | +|----------|-------| +| Issue | #{issue_number} | +| Area | {Query/Batch/ChangeFeed/etc} | +| SDK Version (reported) | {version} | +| Severity | {P0-P3} | + +--- + +## Root Cause Analysis + +### Code Path +``` +{file1}:{line} - {function_name} + └─> {file2}:{line} - {called_function} + └─> {file3}:{line} - {root_cause_location} +``` + +### Root Cause +{Detailed explanation of why the bug existed, including: +- What the code was doing incorrectly +- Why this caused the reported behavior +- Any historical context (was this always broken, or a regression?)} -## Root Cause -{Explanation of what was causing the issue} +--- ## Changes Made -- {change 1} -- {change 2} + +### Files Modified +| File | Change | +|------|--------| +| `{file1}` | {description of change} | +| `{file2}` | {description of change} | + +### Code Changes +{Brief description of each change and why it fixes the issue} + +--- + +## Generated Output (Before/After) + +**Before (incorrect):** +``` +{output/SQL/JSON showing the bug} +``` + +**After (correct):** +``` +{output/SQL/JSON showing correct behavior} +``` + +--- + +## Workaround (For Users Not Yet Upgraded) + +{If a workaround exists, document it here so users can unblock themselves} + +```csharp +// Workaround code example +``` + +--- ## Testing -- [ ] Added/updated unit tests -- [ ] Verified against emulator -- [ ] Tested in Gateway mode -- [ ] Tested in Direct mode -## Reproduction Test +### Test Results +| Test Suite | Total | Passed | Failed | +|------------|-------|--------|--------| +| {test_suite_1} | {n} | {n} | {n} | +| {test_suite_2} | {n} | {n} | {n} | +| Build | - | βœ… | - | + +### New Tests Added +- `{TestClass}.{TestMethod}` - {description} + +### Reproduction Test ```csharp -// Test that verifies the fix +// Test that reproduces the original issue and verifies the fix +[TestMethod] +public void Issue_{number}_Reproduction() +{ + // Arrange - setup that triggers the bug + // Act - operation that was failing + // Assert - verify correct behavior +} ``` +--- + ## Breaking Changes -{None | Description of breaking changes} +{None | Description of breaking changes and migration guide} + +--- + +## External References +- StackOverflow: {link if applicable} +- Microsoft Docs: {link if applicable} +- Related Issues: #{related_issue_numbers} + +--- ## Checklist - [ ] Code follows project conventions - [ ] Self-review completed - [ ] Comments added for complex logic - [ ] Documentation updated (if applicable) +- [ ] New tests added for the fix +- [ ] All existing tests pass +- [ ] Remote CI gates pass (Section 7.4) + +--- + +*Generated by GitHub Copilot CLI Agent* +``` + +### 7.3.1 PR Reference Guidelines + +**Always include proper references between PRs and Issues:** + +```yaml +pr_references: + in_pr_description: + required: + - "Fixes #{issue_number}" (auto-closes issue on merge) + - Issue summary table with link + optional: + - Related issues: "Related to #{number}" + - Depends on: "Depends on #{pr_number}" + + in_issue_comments: + when_pr_created: | + Add comment to original issue: + "πŸ€– **Copilot Investigation Complete** + + **PR:** #{pr_number} - {pr_title} + **Branch:** `{branch_name}` + **Status:** Awaiting CI validation + + See PR for full root cause analysis and fix details." + + in_investigation_issue: + if_created: | + Update with PR link: + "### PR Reference + **PR:** #{pr_number} - {pr_title} + **Branch:** `{branch_name}` + **CI Status:** {Pending/Passing/Failing}" -## Investigation Issue -See #{investigation_issue_number} for full analysis. +branch_to_pr_url: + pattern: "https://github.com/{owner}/{repo}/pull/new/{branch_name}" + example: "https://github.com/Azure/azure-cosmos-dotnet-v3/pull/new/users/kirankk/fix-issue-5547-linq-dictionary" ``` -### 7.3 Reviewer Assignment Matrix +### 7.4 Reviewer Assignment Matrix | Area Label | Primary Reviewer | Backup | |------------|-----------------|--------| From d94125cc468968bddffafdbe7c5e0d48b25818ed Mon Sep 17 00:00:00 2001 From: Kiran Kumar Kolli Date: Sun, 1 Feb 2026 00:13:35 -0800 Subject: [PATCH 03/28] Add expectations validation phase and CI monitoring learnings - Added Phase 2.5: Expectations Validation - Validate user expectations against official docs before investigating - Check Microsoft Docs, API reference, SDK samples, test files - Determine if issue is bug vs. expected behavior vs. undocumented - Added learnings from PR #5583 CI monitoring: - PR title lint format: 'Category: (Adds|Fixes|Refactors|Removes) Description' - gh CLI installation and authentication steps - CI gate duration estimates (up to 90 minutes) - Monitoring loop workflow with 5-10 minute intervals - Added Section 16.5: CI Monitoring Workflow with check categories and timing --- .github/copilot-agent-plan.md | 253 +++++++++++++++++++++++++++++++++- 1 file changed, 250 insertions(+), 3 deletions(-) diff --git a/.github/copilot-agent-plan.md b/.github/copilot-agent-plan.md index e626c5d0f6..a12d2b80ef 100644 --- a/.github/copilot-agent-plan.md +++ b/.github/copilot-agent-plan.md @@ -283,6 +283,141 @@ git log --all --oneline -S "" # Search for code changes ### Phase 3: Code Analysis +#### Phase 2.5: Expectations Validation ⚠️ + +**Before diving into code analysis, validate that the reported expectations are correct.** + +```yaml +expectations_validation: + purpose: "Verify the user's expected behavior is actually correct" + importance: "Users sometimes report 'bugs' that are actually expected behavior" + + validation_sources: + official_documentation: + - docs.microsoft.com/azure/cosmos-db + - Azure SDK for .NET documentation + - API reference documentation + - Cosmos DB REST API specifications + + code_documentation: + - XML doc comments in source code + - README files in relevant folders + - Inline code comments explaining behavior + - Test files (show expected behavior) + + sdk_samples: + - Microsoft.Azure.Cosmos.Samples/ + - Official Azure SDK samples repo + - Code snippets in documentation + + cosmos_db_specifications: + - SQL query language reference + - Consistency level behaviors + - Partitioning rules + - Request unit (RU) calculations +``` + +**Validation Checklist:** +```markdown +## Expectations Validation + +Before investigating further, verify: + +### 1. Is the expected behavior documented? +- [ ] Checked official Microsoft Docs for the feature +- [ ] Reviewed API reference documentation +- [ ] Found relevant SDK samples + +### 2. Does the code documentation support the expectation? +- [ ] Reviewed XML doc comments on relevant methods +- [ ] Checked for documented limitations or known behaviors +- [ ] Reviewed test files for expected behavior patterns + +### 3. Is this a documented limitation? +- [ ] Searched docs for "limitations" or "not supported" +- [ ] Checked Cosmos DB SQL reference for supported operations +- [ ] Reviewed changelog for intentional behavior changes + +### 4. Validation Result +- [ ] **CONFIRMED**: User expectation is correct (proceed with bug fix) +- [ ] **INCORRECT**: User expectation doesn't match documented behavior (educate user) +- [ ] **UNCLEAR**: Documentation is ambiguous or missing (may need docs improvement) +- [ ] **UNDOCUMENTED**: Feature behavior is not documented (investigate actual behavior) +``` + +**Validation Workflow:** +```yaml +validation_workflow: + step_1_check_docs: + tool: web_fetch + urls: + - "https://docs.microsoft.com/azure/cosmos-db/{feature}" + - "https://docs.microsoft.com/dotnet/api/microsoft.azure.cosmos.{class}" + extract: + - Expected behavior description + - Supported operations + - Known limitations + + step_2_check_code_docs: + tools: + - grep: "/// " in relevant files + - view: XML doc comments on methods + extract: + - Method documentation + - Parameter constraints + - Return value expectations + - Exception conditions + + step_3_check_samples: + tools: + - glob: "Microsoft.Azure.Cosmos.Samples/**/*.cs" + - grep: "{feature keyword}" in samples + extract: + - Recommended usage patterns + - Working code examples + + step_4_check_tests: + tools: + - glob: "**/tests/**/*{feature}*.cs" + - view: Test methods showing expected behavior + extract: + - Assertions showing expected outcomes + - Edge cases handled + - Known limitations tested +``` + +**Example Validation Outcomes:** + +| Scenario | User Expectation | Documentation Says | Action | +|----------|------------------|-------------------|--------| +| Dictionary LINQ | `.Any()` should work | Not documented for Dictionary | Investigate, likely needs fix | +| Null partition key | Should auto-generate | Docs say must provide value | Educate user | +| Cross-partition query | Should be fast | Docs warn about RU cost | Educate user | +| Retry on 429 | Should auto-retry | Docs confirm auto-retry | Investigate why not working | + +**Documentation Sources to Check:** + +```yaml +microsoft_docs: + cosmos_db_overview: "https://docs.microsoft.com/azure/cosmos-db/" + sql_query_reference: "https://docs.microsoft.com/azure/cosmos-db/sql/sql-query-getting-started" + linq_support: "https://docs.microsoft.com/azure/cosmos-db/sql/sql-query-linq-to-sql" + partitioning: "https://docs.microsoft.com/azure/cosmos-db/partitioning-overview" + consistency_levels: "https://docs.microsoft.com/azure/cosmos-db/consistency-levels" + +sdk_api_reference: + cosmos_client: "https://docs.microsoft.com/dotnet/api/microsoft.azure.cosmos.cosmosclient" + container: "https://docs.microsoft.com/dotnet/api/microsoft.azure.cosmos.container" + query: "https://docs.microsoft.com/dotnet/api/microsoft.azure.cosmos.feediterator" + +code_documentation: + xml_docs: "Search for /// comments in source files" + readme_files: "Check README.md in relevant directories" + inline_comments: "Look for // comments explaining behavior" +``` + +--- + #### 3a. Combined Analysis Approach: Local + Bluebird **Use BOTH local tools and Bluebird for comprehensive analysis:** @@ -2118,8 +2253,25 @@ challenges: github_cli_not_installed: problem: "gh CLI not available in environment" symptom: "'gh' is not recognized as a cmdlet" - workaround: "Provide PR details for manual creation via GitHub web UI" - recommendation: "Check for gh CLI early, have manual fallback ready" + workaround: "Install with: winget install --id GitHub.cli" + recommendation: "Install gh CLI, authenticate with: gh auth login --web" + + github_cli_authentication: + problem: "gh CLI requires browser authentication for Azure org" + symptom: "SAML SSO required even after gh auth login" + workaround: "Use --web flag and complete browser flow with device code" + recommendation: "gh auth login --web, then authorize for Azure org" + + pr_title_lint_failure: + problem: "PR Lint checks enforce strict title format" + symptom: "PR Lint fails with 'Please follow required format'" + format_required: '"[Internal] Category: (Adds|Fixes|Refactors|Removes) Description"' + examples: + - "LINQ: Fixes Dictionary.Any() to generate correct SQL" + - "Query: Adds support for new aggregate functions" + - "[Internal] Tests: Refactors test infrastructure" + workaround: "gh pr edit {number} --title 'Category: Verb Description'" + recommendation: "Always use correct format from start" long_running_tests: problem: "Full test suite takes several minutes" @@ -2127,6 +2279,12 @@ challenges: workaround: "Use initial_wait parameter, then read_powershell for polling" recommendation: "Run targeted tests first, full suite only for final validation" + ci_gates_long_duration: + problem: "Remote CI gates can take up to 90 minutes" + symptom: "Many checks stay pending for extended periods" + workaround: "Use gh pr checks {number} to monitor, check periodically" + recommendation: "Set up monitoring loop, check every 5-10 minutes" + agent_completion_time: problem: "Complex agents (general-purpose with Opus) take 5-10 minutes" symptom: "read_agent times out multiple times" @@ -2163,9 +2321,98 @@ tool_selection: for_pr_creation: with_gh_cli: "gh pr create --draft" without_gh_cli: "Provide PR URL and description for manual creation" + + for_ci_monitoring: + check_status: "gh pr checks {pr_number}" + watch_mode: "gh pr checks {pr_number} --watch" + get_failures: "gh pr view {pr_number} --json statusCheckRollup" + fix_title: "gh pr edit {pr_number} --title 'Category: Verb Description'" ``` -### 16.5 Timing Benchmarks +### 16.5 CI Monitoring Workflow (Learned) + +**PR creation triggers 30+ CI checks that can take up to 90 minutes.** + +```yaml +ci_monitoring: + initial_check: + timing: "Immediately after PR creation" + command: "gh pr checks {pr_number}" + expect: "Most checks pending, license/cla may pass quickly" + + common_quick_failures: + pr_lint: + timing: "Within 1 minute" + cause: "PR title doesn't match format" + format: "Category: (Adds|Fixes|Refactors|Removes) Description" + fix: "gh pr edit {number} --title 'LINQ: Fixes Dictionary query translation'" + + license_cla: + timing: "Within 1 minute" + cause: "CLA not signed" + fix: "Sign CLA via link in check details" + + medium_duration_checks: + codeql: + timing: "5-15 minutes" + checks: + - "CodeQL/Analyze (csharp)" + - "CodeQL/Analyze (actions)" + - "CodeQL/Analyze (javascript-typescript)" + - "CodeQL/Analyze (python)" + common_issues: "Security vulnerabilities, code quality" + + static_analysis: + timing: "10-20 minutes" + check: "dotnet-v3-ci (Static Analysis)" + common_issues: "Code style, analyzer warnings" + + long_duration_checks: + unit_tests: + timing: "15-30 minutes" + check: "dotnet-v3-ci (Microsoft.Azure.Cosmos.Tests)" + common_issues: "Test failures, build errors" + + emulator_tests: + timing: "30-60 minutes" + checks: + - "dotnet-v3-ci (EmulatorTests Release - Client Telemetry, Query, ChangeFeed, ReadFeed, Batch)" + - "dotnet-v3-ci (EmulatorTests Release - MultiMaster)" + - "dotnet-v3-ci (EmulatorTests Release - MultiRegion)" + - "dotnet-v3-ci (EmulatorTests Release - Others)" + common_issues: "Integration failures, emulator setup issues" + + full_ci: + timing: "60-90 minutes" + check: "dotnet-v3-ci" + note: "Aggregate check, passes when all sub-checks pass" + + monitoring_loop: + interval: "5-10 minutes" + command: "gh pr checks {pr_number}" + on_failure: + - Get failure details: "gh pr view {number} --json statusCheckRollup" + - Analyze logs if possible + - Fix locally and push + - Wait for re-run + on_all_pass: + - Mark PR ready for review (if draft) + - Notify user +``` + +**CI Check Categories:** + +| Category | Count | Duration | Priority | +|----------|-------|----------|----------| +| Quick (lint, cla) | 2 | < 1 min | Fix immediately | +| CodeQL | 4 | 5-15 min | Usually pass | +| Build/Package | 3 | 10-20 min | Must pass | +| Unit Tests | 3 | 15-30 min | Critical | +| Emulator Tests | 6 | 30-60 min | Critical | +| Preview/Internal | 8 | 20-40 min | Important | +| Encryption | 4 | 20-40 min | If changed | + +### 16.6 Timing Benchmarks | Phase | Duration | Notes | |-------|----------|-------| From ff36a42e2413b7860d02915c847fb42af500bb65 Mon Sep 17 00:00:00 2001 From: Kiran Kumar Kolli Date: Sun, 1 Feb 2026 11:16:11 -0800 Subject: [PATCH 04/28] Add Azure DevOps build re-run documentation - Added Section 16.5.1: Azure DevOps Build Re-run via API - Documented PAT authentication requirements - Added PowerShell script for requeuing builds - Clarified limitation: 'Rerun failed jobs only' NOT available via API - Recommended approach: Use Azure DevOps web UI for failed-only reruns - Added Azure DevOps MCP Server reference (official Microsoft package) --- .github/copilot-agent-plan.md | 92 +++++++++++++++++++++++++++++++++++ 1 file changed, 92 insertions(+) diff --git a/.github/copilot-agent-plan.md b/.github/copilot-agent-plan.md index a12d2b80ef..e12d23ff3a 100644 --- a/.github/copilot-agent-plan.md +++ b/.github/copilot-agent-plan.md @@ -2412,6 +2412,98 @@ ci_monitoring: | Preview/Internal | 8 | 20-40 min | Important | | Encryption | 4 | 20-40 min | If changed | +### 16.5.1 Azure DevOps Build Re-run via API + +**Re-running builds when CI fails:** + +```yaml +azure_devops_rerun: + api_endpoint: "https://{org}.visualstudio.com/{project}/_apis/build/builds?api-version=6.0" + + authentication: + method: "Personal Access Token (PAT)" + required_scopes: + - "Build: Read & Execute" + - "Project and Team: Read" + header: "Authorization: Basic base64(:PAT)" + + rerun_options: + full_build_requeue: + supported: true + method: "POST to builds API" + use_when: "Need to re-run entire pipeline" + example: | + $body = @{ + definition = @{ id = $definitionId } + sourceBranch = "refs/pull/{pr_number}/merge" + reason = "manual" + } | ConvertTo-Json + Invoke-RestMethod -Uri $apiUrl -Method POST -Headers $headers -Body $body + + rerun_failed_jobs_only: + supported: false # ⚠️ NOT AVAILABLE VIA API + note: "Azure DevOps REST API does NOT support re-running only failed jobs" + workaround: "Use Azure DevOps web UI 'Rerun failed jobs' button" + ui_location: "Build results page β†’ ... menu β†’ Rerun failed jobs" + + recommended_approach: + for_flaky_tests: + first_try: "Rerun failed jobs via Azure DevOps web UI (faster)" + fallback: "Requeue entire build via API if UI not accessible" + + for_code_changes: + always: "Push new commit (triggers fresh build with latest code)" +``` + +**PowerShell Script to Requeue Build:** + +```powershell +# Set credentials +$org = "cosmos-db-sdk-public" +$project = "cosmos-db-sdk-public" +$pat = $env:AZURE_DEVOPS_PAT + +$headers = @{ + Authorization = "Basic " + [Convert]::ToBase64String([Text.Encoding]::ASCII.GetBytes(":$pat")) + "Content-Type" = "application/json" +} + +# Get failed build info +$failedBuildId = 59156 +$build = Invoke-RestMethod -Uri "https://$org.visualstudio.com/$project/_apis/build/builds/$failedBuildId?api-version=6.0" -Headers $headers + +# Queue new build with same parameters +$body = @{ + definition = @{ id = $build.definition.id } + sourceBranch = $build.sourceBranch + reason = "manual" +} | ConvertTo-Json + +$newBuild = Invoke-RestMethod -Uri "https://$org.visualstudio.com/$project/_apis/build/builds?api-version=6.0" -Method POST -Headers $headers -Body $body +Write-Host "New build queued: $($newBuild.id)" +``` + +**Azure DevOps MCP Server (Official):** + +```yaml +mcp_server: + package: "@azure-devops/mcp" + publisher: "Microsoft" + docs: "https://learn.microsoft.com/en-us/azure/devops/mcp-server/mcp-server-overview" + + capabilities: + - Query work items, PRs, builds + - Trigger builds/pipelines + - Access test results + - Natural language queries + + setup: + install: "npx @azure-devops/mcp " + requires: "Azure DevOps PAT" + + note: "For simple build operations, direct REST API is often simpler than full MCP setup" +``` + ### 16.6 Timing Benchmarks | Phase | Duration | Notes | From 9b6ec29807008b3eb47af92bee5f6eb0d60c673f Mon Sep 17 00:00:00 2001 From: Kiran Kumar Kolli Date: Sun, 1 Feb 2026 11:31:36 -0800 Subject: [PATCH 05/28] Docs: Add comprehensive Copilot agent issue triage plan - Section 0: MCP server setup (GitHub, Azure DevOps, Bluebird) - Section 16.7-16.12: Session learnings (flaky tests, draft PR workflow, parallel agents, templates) - Updated branch naming: users//copilot-- - Simplified CI retry to MCP-only approach --- .github/copilot-agent-plan.md | 593 ++++++++++++++++++++++++++------ .github/copilot-instructions.md | 52 +++ 2 files changed, 549 insertions(+), 96 deletions(-) create mode 100644 .github/copilot-instructions.md diff --git a/.github/copilot-agent-plan.md b/.github/copilot-agent-plan.md index e12d23ff3a..4551e1320a 100644 --- a/.github/copilot-agent-plan.md +++ b/.github/copilot-agent-plan.md @@ -3,6 +3,166 @@ --- +## 0. Environment Setup (MCP Servers) + +**Before using this plan, ensure the following MCP servers are configured.** + +### 0.1 Required MCP Servers + +| MCP Server | Purpose | Required | +|------------|---------|----------| +| **GitHub MCP Server** | Issues, PRs, code search, actions | βœ… Yes | +| **Azure DevOps MCP Server** | CI builds, pipelines, retry failed jobs | βœ… Yes | +| **Bluebird Engineering Copilot** | Code graph, semantic search | Optional | + +### 0.2 GitHub MCP Server + +**Already configured in most Copilot environments.** + +```yaml +github_mcp_server: + package: "github-mcp-server" + capabilities: + - search_issues, list_issues, issue_read + - search_pull_requests, list_pull_requests, pull_request_read + - search_code, get_file_contents + - actions_list, actions_get, get_job_logs + - list_commits, get_commit + + note: "Usually pre-configured. For Azure org repos, may need SAML auth workaround (use web_fetch)" +``` + +### 0.3 Azure DevOps MCP Server (Official Microsoft) + +**Required for CI build management, retry failed jobs.** + +```yaml +azure_devops_mcp_server: + package: "@azure-devops/mcp" + publisher: "Microsoft" + repository: "https://github.com/microsoft/azure-devops-mcp" + docs: "https://learn.microsoft.com/en-us/azure/devops/mcp-server/mcp-server-overview" + + capabilities: + pipelines: + - mcp_ado_pipelines_get_builds # List builds + - mcp_ado_pipelines_get_build_status # Get build status + - mcp_ado_pipelines_get_build_log # Get build logs + - mcp_ado_pipelines_run_pipeline # Start new pipeline run + - mcp_ado_pipelines_update_build_stage # ⭐ RETRY FAILED JOBS + work_items: + - mcp_ado_wit_get_work_item + - mcp_ado_wit_create_work_item + - mcp_ado_wit_my_work_items + repositories: + - mcp_ado_repo_list_pull_requests_by_repo_or_project + - mcp_ado_repo_get_pull_request_by_id +``` + +**Installation:** + +```bash +# Option 1: VS Code One-Click Install +# Visit: https://insiders.vscode.dev/redirect/mcp/install?name=ado&config=... + +# Option 2: Manual Setup +# Create .vscode/mcp.json in your project: +``` + +```json +{ + "inputs": [ + { + "id": "ado_org", + "type": "promptString", + "description": "Azure DevOps organization name (e.g. 'cosmos-db-sdk-public')" + } + ], + "servers": { + "ado": { + "type": "stdio", + "command": "npx", + "args": ["-y", "@azure-devops/mcp", "${input:ado_org}"] + } + } +} +``` + +**Authentication:** +- First use will open browser for Microsoft account login +- Requires Azure DevOps organization access + +**Key Tool: Retry Failed Jobs** +```yaml +mcp_ado_pipelines_update_build_stage: + purpose: "Retry failed stages without re-running entire build" + parameters: + project: "cosmos-db-sdk-public" + buildId: 59156 + stageRefName: "Preview_Tests_Release" # Stage reference name + state: "retry" # Options: retry, cancel + + advantage: "Faster than requeuing entire build" +``` + +### 0.4 Bluebird Engineering Copilot (Optional) + +**For advanced code graph and semantic search.** + +```yaml +bluebird_engineering_copilot: + capabilities: + - do_vector_search: "Semantic code search" + - do_fulltext_search: "Keyword code search" + - get_source_code: "Retrieve source files" + - get_hierarchical_summary: "Code overview" + - get_callers, get_callees: "Call graph analysis" + + usage: + step_1: "Call engineering_copilot for instructions" + step_2: "Call dynamic_tool_invoker with specific tool" + + note: "Useful for deep code analysis, may timeout on large queries" +``` + +### 0.5 Quick Setup Script (Windows PowerShell) + +```powershell +# 1. Install GitHub CLI +winget install --id GitHub.cli + +# 2. Authenticate GitHub CLI +gh auth login --web + +# 3. Install Node.js (for Azure DevOps MCP) +winget install --id OpenJS.NodeJS + +# 4. Test Azure DevOps MCP (first run opens browser for auth) +npx @azure-devops/mcp cosmos-db-sdk-public --help +``` + +### 0.6 Verification Checklist + +```markdown +## Environment Setup Verification + +### GitHub MCP Server +- [ ] `gh auth status` shows logged in +- [ ] Can fetch issues: `gh issue list --repo Azure/azure-cosmos-dotnet-v3` + +### Azure DevOps MCP Server +- [ ] Node.js 18+ installed: `node --version` +- [ ] MCP server responds: `npx @azure-devops/mcp cosmos-db-sdk-public --help` +- [ ] Browser auth completed (first MCP tool call opens login) + +### Local Tools +- [ ] .NET SDK installed: `dotnet --version` +- [ ] Git configured: `git config user.name` +- [ ] Repository cloned: `E:\src\v33` or similar +``` + +--- + ## 1. Executive Summary This plan defines a comprehensive workflow for Copilot agents to handle GitHub issues for the Azure Cosmos DB .NET SDK repository. The agent will: @@ -1207,7 +1367,7 @@ This workaround can be removed when upgrading to SDK version {x.y.z} or later. ### 7.1 Branch Naming Convention -**Format:** `users//` +**Format:** `users//copilot--` ```yaml branch_naming: @@ -1221,17 +1381,17 @@ branch_naming: - refactor # Code refactoring examples: - - "users/kirankk/fix-linq-dictionary-objecttoarray" - - "users/kirankk/fix-issue-5547-dictionary-any" - - "users/johndoe/feature-bulk-retry-policy" - - "users/janedoe/perf-batch-throughput" - - "users/alice/docs-linq-dictionary-support" + - "users/kirankk/copilot-5547-fix-linq-dictionary-objecttoarray" + - "users/kirankk/copilot-1234-feature-bulk-retry" + - "users/johndoe/copilot-5678-perf-batch-throughput" + - "users/janedoe/copilot-9999-docs-linq-dictionary" rules: - Use lowercase - Use hyphens (not underscores) as separators - - Include issue number when applicable - - Keep description concise but descriptive + - Always include `copilot-` prefix for Copilot-authored branches + - Always include issue number after `copilot-` + - Keep feature description concise but descriptive - Username should match GitHub handle ``` @@ -1420,7 +1580,7 @@ pr_references: branch_to_pr_url: pattern: "https://github.com/{owner}/{repo}/pull/new/{branch_name}" - example: "https://github.com/Azure/azure-cosmos-dotnet-v3/pull/new/users/kirankk/fix-issue-5547-linq-dictionary" + example: "https://github.com/Azure/azure-cosmos-dotnet-v3/pull/new/users/kirankk/copilot-5547-fix-linq-dictionary" ``` ### 7.4 Reviewer Assignment Matrix @@ -1535,16 +1695,15 @@ validation_workflow: description: "Create PR to trigger remote CI" steps: - name: "Create feature branch" - naming_convention: "users//" + naming_convention: "users//copilot--" examples: - - "users/kirankk/fix-linq-dictionary-objecttoarray" - - "users/kirankk/issue-5547-dictionary-any" - - "users/johndoe/perf-batch-throughput" + - "users/kirankk/copilot-5547-fix-linq-dictionary" + - "users/johndoe/copilot-1234-perf-batch-throughput" command: | - git checkout -b users/{username}/fix-issue-{number}-{short-description} + git checkout -b users/{username}/copilot-{number}-{short-description} git add . git commit -m "Fix #{number}: {description}" - git push origin users/{username}/fix-issue-{number}-{short-description} + git push origin users/{username}/copilot-{number}-{short-description} - name: "Create Draft PR" purpose: "Triggers CI without requesting review" @@ -2412,96 +2571,65 @@ ci_monitoring: | Preview/Internal | 8 | 20-40 min | Important | | Encryption | 4 | 20-40 min | If changed | -### 16.5.1 Azure DevOps Build Re-run via API +### 16.5.1 Azure DevOps Build Re-run via MCP Server -**Re-running builds when CI fails:** +**Use the Azure DevOps MCP Server to retry failed CI stages.** ```yaml -azure_devops_rerun: - api_endpoint: "https://{org}.visualstudio.com/{project}/_apis/build/builds?api-version=6.0" +mcp_ado_pipelines_update_build_stage: + purpose: "Retry failed stages only (faster than full requeue)" - authentication: - method: "Personal Access Token (PAT)" - required_scopes: - - "Build: Read & Execute" - - "Project and Team: Read" - header: "Authorization: Basic base64(:PAT)" - - rerun_options: - full_build_requeue: - supported: true - method: "POST to builds API" - use_when: "Need to re-run entire pipeline" - example: | - $body = @{ - definition = @{ id = $definitionId } - sourceBranch = "refs/pull/{pr_number}/merge" - reason = "manual" - } | ConvertTo-Json - Invoke-RestMethod -Uri $apiUrl -Method POST -Headers $headers -Body $body - - rerun_failed_jobs_only: - supported: false # ⚠️ NOT AVAILABLE VIA API - note: "Azure DevOps REST API does NOT support re-running only failed jobs" - workaround: "Use Azure DevOps web UI 'Rerun failed jobs' button" - ui_location: "Build results page β†’ ... menu β†’ Rerun failed jobs" - - recommended_approach: - for_flaky_tests: - first_try: "Rerun failed jobs via Azure DevOps web UI (faster)" - fallback: "Requeue entire build via API if UI not accessible" - - for_code_changes: - always: "Push new commit (triggers fresh build with latest code)" + tool: "mcp_ado_pipelines_update_build_stage" + parameters: + project: "cosmos-db-sdk-public" + buildId: 59156 + stageRefName: "Preview_Tests_Release" # Stage reference name + state: "retry" # Options: retry, cancel + + example_prompt: | + "Retry the failed 'Preview Tests Release' stage in build 59156 + for project cosmos-db-sdk-public" + + advantages: + - Only re-runs failed stage (faster) + - Doesn't re-run already-passed jobs + - Natural language interface + - Fully automated by Copilot + + requires: + - Azure DevOps MCP Server configured (see Section 0.3) + - Microsoft account with org access ``` -**PowerShell Script to Requeue Build:** - -```powershell -# Set credentials -$org = "cosmos-db-sdk-public" -$project = "cosmos-db-sdk-public" -$pat = $env:AZURE_DEVOPS_PAT - -$headers = @{ - Authorization = "Basic " + [Convert]::ToBase64String([Text.Encoding]::ASCII.GetBytes(":$pat")) - "Content-Type" = "application/json" -} - -# Get failed build info -$failedBuildId = 59156 -$build = Invoke-RestMethod -Uri "https://$org.visualstudio.com/$project/_apis/build/builds/$failedBuildId?api-version=6.0" -Headers $headers - -# Queue new build with same parameters -$body = @{ - definition = @{ id = $build.definition.id } - sourceBranch = $build.sourceBranch - reason = "manual" -} | ConvertTo-Json +**Finding the stage reference name:** -$newBuild = Invoke-RestMethod -Uri "https://$org.visualstudio.com/$project/_apis/build/builds?api-version=6.0" -Method POST -Headers $headers -Body $body -Write-Host "New build queued: $($newBuild.id)" +```yaml +# Use MCP server to get build details +mcp_ado_pipelines_get_build_status: + project: "cosmos-db-sdk-public" + buildId: 59156 + +# Response includes timeline with stage names and their refNames +# Common stage refNames in this repo: +# - "Build" +# - "Unit_Tests" +# - "Emulator_Tests_Release" +# - "Preview_Tests_Release" +# - "Static_Analysis" ``` -**Azure DevOps MCP Server (Official):** +**Handling CI Failures:** ```yaml -mcp_server: - package: "@azure-devops/mcp" - publisher: "Microsoft" - docs: "https://learn.microsoft.com/en-us/azure/devops/mcp-server/mcp-server-overview" - - capabilities: - - Query work items, PRs, builds - - Trigger builds/pipelines - - Access test results - - Natural language queries +ci_failure_response: + flaky_tests: + action: "mcp_ado_pipelines_update_build_stage with state=retry" - setup: - install: "npx @azure-devops/mcp " - requires: "Azure DevOps PAT" + code_related_failures: + action: "Fix code locally, push commit (triggers fresh build)" - note: "For simple build operations, direct REST API is often simpler than full MCP setup" + infrastructure_failures: + action: "Wait 5 min, then retry failed stage via MCP" ``` ### 16.6 Timing Benchmarks @@ -2585,23 +2713,298 @@ recommended_workflow: phase_5_pr_creation: duration: "~5 min" steps: - - Create feature branch (users//) + - Create feature branch (users//copilot--) - Commit with descriptive message - Push to remote - Create draft PR (or provide details for manual creation) - Monitor CI status ``` +### 16.7 Flaky Test Registry + +**Known flaky tests that may fail intermittently (not related to code changes):** + +```yaml +flaky_tests: + EndpointFailureMockTest: + location: "Microsoft.Azure.Cosmos.Tests/CosmosClientTests.cs" + symptom: "Intermittent timeout or connection failure" + seen_in: ["PR #5573 (merged)", "PR #5583", "Build 59156"] + action: "Retry failed stage via MCP server" + + # Add more as discovered: + # TestName: + # location: "path/to/test" + # symptom: "description" + # seen_in: ["PR numbers where it failed"] + # action: "retry/skip/fix" + +handling_flaky_failures: + step_1: "Check if failed test is in flaky registry" + step_2: "Verify failure is unrelated to your changes (same test fails in other PRs)" + step_3: "Retry failed stage: mcp_ado_pipelines_update_build_stage with state=retry" + step_4: "If still fails after 2 retries, document and proceed (known flaky)" + +adding_to_registry: + when: "Test fails in your PR AND in at least one other recent PR/build" + format: "Add entry with location, symptom, seen_in PRs, recommended action" +``` + +### 16.8 Draft PR Workflow + +**Always create PRs as drafts first, mark ready after CI passes:** + +```yaml +draft_pr_workflow: + create_draft: + command: "gh pr create --draft --title 'Category: Fixes ...' --body-file pr-body.md" + reason: "Prevents premature review requests while CI runs" + + monitor_ci: + command: "gh pr checks {pr_number}" + interval: "Every 5-10 minutes" + duration: "Up to 90 minutes for full CI" + + on_ci_pass: + command: "gh pr ready {pr_number}" + effect: "Removes draft status, notifies reviewers" + + on_ci_fail: + flaky_test: "Retry via MCP server (see Section 16.7)" + code_issue: "Fix locally, push, CI auto-reruns" + + timeline: + - "T+0: Create draft PR" + - "T+1min: Quick checks (lint, CLA) pass/fail" + - "T+15min: CodeQL, static analysis complete" + - "T+30min: Unit tests complete" + - "T+60min: Emulator tests complete" + - "T+90min: All CI complete β†’ mark ready" +``` + +### 16.9 Parallel Agent Strategy + +**Use parallel background agents to maximize efficiency:** + +```yaml +parallel_agent_pattern: + when: "Multiple independent tasks can run simultaneously" + + example_investigation: + launch_simultaneously: + agent_1: + type: "general-purpose" + task: "Create reproduction test" + model: "claude-opus-4.5" + + agent_2: + type: "general-purpose" + task: "Implement fix based on root cause" + model: "claude-opus-4.5" + + agent_3: + type: "task" + task: "Run baseline tests to establish pass/fail state" + model: "claude-haiku-4.5" + + then: + - "Wait for all agents to complete" + - "Review each agent's output" + - "Integrate changes if all successful" + - "Run regression tests" + + benefits: + - "3 agents finish in ~10min vs ~30min sequential" + - "Each agent has full context window" + - "Failures isolated to specific task" + + caution: + - "Don't parallelize dependent tasks" + - "Review all outputs before committing" + - "One agent's fix may conflict with another's" +``` + +### 16.10 Investigation Document Template + +**Create investigation docs in session workspace for complex issues:** + +```yaml +investigation_document: + location: "~/.copilot/session-state/{session-id}/files/issue-{number}-investigation.md" + purpose: "Persist findings across context compactions" + + template: | + # Issue #{number} Investigation + + ## Summary + - **Issue**: [one-line description] + - **Reporter**: [username] + - **Area**: [LINQ/Query/SDK/etc] + - **Status**: [Triaging/Investigating/Reproducing/Fixing/Complete] + + ## Reported Behavior + [Copy from issue description] + + ## Expected Behavior + [What should happen] + + ## Root Cause Analysis + - **Location**: [file:line] + - **Cause**: [why the bug exists] + - **Evidence**: [code snippets, logs] + + ## Reproduction + - **Test file**: [path to test] + - **Steps**: [how to reproduce] + - **Result**: [confirmed/not-reproduced] + + ## Fix Strategy + - **Approach**: [how to fix] + - **Files to modify**: [list] + - **Risk assessment**: [low/medium/high] + + ## Verification + - [ ] Unit tests pass + - [ ] Existing tests still pass + - [ ] Build succeeds + - [ ] CI passes + + ## References + - [Links to related issues, docs, PRs] + + when_to_create: + - "Complex issues requiring multi-step analysis" + - "Issues that may span multiple sessions" + - "When context compaction is likely" +``` + +### 16.11 Commit Message Format + +**Follow conventional commit format for this repository:** + +```yaml +commit_format: + pattern: "{type}: {description}" + + types: + - "Fix" - Bug fix + - "Add" - New feature + - "Refactor" - Code restructuring + - "Test" - Adding tests + - "Docs" - Documentation + - "Perf" - Performance improvement + + examples: + bug_fix: "Fix: Dictionary.Any() now uses OBJECTTOARRAY instead of JOIN" + new_feature: "Add: Support for hierarchical partition keys" + test: "Test: Add unit tests for IsDictionary type detection" + docs: "Docs: Update LINQ query translation documentation" + + rules: + - "Keep first line under 72 characters" + - "Use imperative mood ('Fix' not 'Fixed')" + - "Reference issue number in body if applicable" + - "Sign-off required for external contributors" + + full_example: | + Fix: Dictionary.Any() now uses OBJECTTOARRAY instead of JOIN + + The LINQ translator was treating Dictionary as a generic + IEnumerable, generating incorrect SQL with JOIN. Now wraps + dictionary member access with OBJECTTOARRAY() function. + + Fixes #5547 +``` + +### 16.12 PR Description Template + +**Full investigation details for Copilot-authored PRs:** + +```yaml +pr_description_template: | + ## Description + πŸ€– **This PR was authored by GitHub Copilot** + + Fixes #{issue_number} + + [One paragraph explaining what the PR does] + + ## Root Cause + + **Location**: `{file_path}:{line_number}` + + **Analysis**: + [Detailed explanation of why the bug existed] + + **Evidence**: + ```csharp + // Code snippet showing the problematic behavior + ``` + + ## Changes Made + + ### {file_name_1} + - [Change description] + + ### {file_name_2} + - [Change description] + + ## Generated Output + + **Before (incorrect)**: + ```sql + [incorrect output] + ``` + + **After (correct)**: + ```sql + [correct output] + ``` + + ## Testing + + - [ ] New unit tests added: `{test_file_name}` + - [ ] Existing tests pass + - [ ] Local build succeeds + - [ ] Tested with Cosmos DB emulator (if applicable) + + ## Checklist + + - [x] Code follows repository conventions + - [x] Changes are minimal and focused + - [x] No unrelated changes included + - [x] Documentation updated (if applicable) + + --- + *Generated by GitHub Copilot CLI Agent* + +pr_title_format: + pattern: "Category: (Adds|Fixes|Refactors|Removes) Description" + lint_regex: "^\\[Internal\\]\\s.+:\\s(Adds|Fixes|Refactors|Removes)\\s.+" + examples: + - "LINQ: Fixes Dictionary.Any() to generate correct SQL with OBJECTTOARRAY" + - "SDK: Adds support for hierarchical partition keys" + - "Query: Refactors SQL generation for better readability" +``` + --- ## TODO: Implementation Tasks ### Completed βœ… - [x] Test plan on real issue (#5547) -- [x] Document branch naming convention (users//) +- [x] Document branch naming convention (users//copilot--) - [x] Add remote CI validation workflow (Section 7.4) - [x] Document Copilot-authored PR format - [x] Add lessons learned section +- [x] Add MCP server setup instructions (Section 0) +- [x] Document Azure DevOps MCP for CI retry (Section 16.5.1) +- [x] Add flaky test registry (Section 16.7) +- [x] Add draft PR workflow (Section 16.8) +- [x] Add parallel agent strategy (Section 16.9) +- [x] Add investigation document template (Section 16.10) +- [x] Add commit message format (Section 16.11) +- [x] Add PR description template (Section 16.12) ### Pending - [ ] Create GitHub Action workflow for auto-triggering agent on new issues @@ -2616,5 +3019,3 @@ recommended_workflow: - [ ] Audit high-priority APIs for AI-friendly docs - [ ] Create documentation improvement backlog - [ ] Add "copilot-authored" label to repository -- [ ] Install GitHub CLI (gh) in development environment -- [ ] Document SAML workarounds for Azure org repos diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md new file mode 100644 index 0000000000..fa44fedcd8 --- /dev/null +++ b/.github/copilot-instructions.md @@ -0,0 +1,52 @@ + +# Copilot / AI assistant instructions β€” Microsoft.Azure.Cosmos + +Purpose: quick, actionable context so an AI coding assistant can be immediately productive in this repo. + +- **Big picture**: This repository implements the v3 .NET SDK for Azure Cosmos DB. Major components: + - `Microsoft.Azure.Cosmos/` β€” core SDK client, most production code. + - `Microsoft.Azure.Cosmos.Encryption/` and `Microsoft.Azure.Cosmos.Encryption.Custom/` β€” client-side encryption extensions. + - `Microsoft.Azure.Cosmos.Samples/` β€” runnable examples and usage patterns. + - `docs/`, `templates/`, and top-level Azure Pipelines YAMLs β€” CI, packaging and emulator setup. + +- **Why this structure**: the SDK core is separated from optional features (encryption, fault-injection, direct mode) so consumers can opt into smaller packages. Versioning and feature flags are centralized in `Directory.Build.props`. + +- **Build & test (most common workflows)**: + - Build solution: `dotnet build Microsoft.Azure.Cosmos.sln -c Release` (or simply `dotnet build`). + - Run unit/integration tests: `dotnet test --no-build` in the solution or specific test project folders under `**/tests/`. + - CI uses the YAML files in the repository root and `templates/` β€” see `templates/emulator-setup.yml` for the Windows emulator script used in CI. + +- **Local emulator and integration testing**: + - The codebase expects the Windows Cosmos DB Emulator in many integration tests. CI installs/starts it via `templates/emulator-setup.yml` (PowerShell scripts that download and launch the MSI and call `Start-Process CosmosDB.Emulator.exe`). + - If running tests locally on Windows, install the emulator and ensure exclusions and local state paths match what's in `templates/emulator-setup.yml`. + +- **Versioning & build flags**: + - `Directory.Build.props` (repo root and project-level overrides) contains the canonical package versions and MSBuild flags (e.g. ``, ``, and `DefineConstants` that add `PREVIEW`/`ENCRYPTIONPREVIEW`). + - Feature/preview builds are gated by MSBuild properties like `IsPreview` or `IsNightly`; set these via `dotnet msbuild /p:IsPreview=true` when needed. + +- **Conventions & patterns** (project-specific) + - Avoid introducing new global build properties; add versions to `Directory.Build.props` where applicable. + - Tests use the emulator or mocks; integration tests that depend on emulator are usually under `tests/` and expect environment-based setup. Look for CI templates for exact start-up sequence. + - Strong-name signing keys exist at repo root (`35MSSharedLib1024.snk`, `testkey.snk`); builds may require signing configuration on CI. + +- **Integration points & external deps**: + - Azure Cosmos DB Emulator (Windows) β€” required for many integration tests. + - NuGet packaging and pipeline tooling β€” see `templates/nuget-pack.yml` and the many `azure-pipelines-*.yml` files for packaging/release behavior. + +- **Where to look for examples** (use these as source-of-truth snippets): + - `Directory.Build.props` β€” versioning and define-constants + - `templates/emulator-setup.yml` β€” exact emulator install/start PowerShell used in CI + - `Microsoft.Azure.Cosmos/` β€” core SDK patterns (public APIs, partitioning, feed iterator usage) + - `Microsoft.Azure.Cosmos.Samples/` β€” minimal runnable samples for usage patterns + +- **How AI should produce code/changes here**: + - Keep changes minimal and focused; prefer small, targeted edits and follow existing code style. + - When suggesting build/test changes, reference the relevant MSBuild property or pipeline YAML (point to `Directory.Build.props` or `templates/*`). + - Do not change version numbers or packaging settings without explicit instruction β€” these are centrally managed. + - If adding or modifying tests that require the emulator, include/update relevant CI/template steps and document required environment variables. + +- **Quick examples to reference in suggestions**: + - Use `FeedIterator` patterns as in `Microsoft.Azure.Cosmos` when generating query examples. + - For emulator-driven tests, mirror the startup sequence from `templates/emulator-setup.yml`. + +If anything here is unclear or you want the file to include additional examples (specific files, common refactor targets, or typical PR reviewers), tell me what to add and I will iterate. From 800faa9b4e3419d2e95a1e0012249761d420332d Mon Sep 17 00:00:00 2001 From: Kiran Kumar Kolli Date: Sun, 1 Feb 2026 11:36:54 -0800 Subject: [PATCH 06/28] Docs: Add CI gate monitoring loop with GREEN goal - Section 16.8.1: Explicit iterate-until-GREEN workflow - Categorize failures: code/flaky/infrastructure/unrelated - Response actions for each failure type - Success criteria and escalation paths - Quick reference commands for monitoring --- .github/copilot-agent-plan.md | 95 +++++++++++++++++++++++++++++++++++ 1 file changed, 95 insertions(+) diff --git a/.github/copilot-agent-plan.md b/.github/copilot-agent-plan.md index 4551e1320a..0c25da596d 100644 --- a/.github/copilot-agent-plan.md +++ b/.github/copilot-agent-plan.md @@ -2782,6 +2782,101 @@ draft_pr_workflow: - "T+90min: All CI complete β†’ mark ready" ``` +### 16.8.1 CI Gate Monitoring Loop (Goal: ALL GREEN) + +**⚠️ CRITICAL: PR is not complete until ALL CI gates are GREEN.** + +```yaml +ci_monitoring_loop: + goal: "All CI gates GREEN before marking PR ready for review" + + monitoring_workflow: + step_1_initial_check: + wait: "5 minutes after push" + command: "gh pr checks {pr_number}" + check: "Quick gates (lint, CLA) should pass" + on_fail: "Fix immediately (usually PR title format)" + + step_2_periodic_monitoring: + interval: "Every 10 minutes" + command: "gh pr checks {pr_number}" + duration: "Up to 90 minutes" + actions: + - "Note which checks are pending/passing/failing" + - "On first failure, begin investigation immediately" + + step_3_on_failure: + investigate: + - "Get failure logs: gh pr checks {pr_number} --json" + - "Identify failed check name and job ID" + - "Fetch logs via MCP or gh CLI" + - "Analyze root cause" + + categorize_failure: + code_related: "Your changes broke something" + flaky_test: "Known intermittent failure (see Section 16.7)" + infrastructure: "Emulator/network/timeout issues" + unrelated: "Pre-existing failure in master" + + respond: + code_related: + action: "Fix locally β†’ test β†’ push β†’ CI auto-reruns" + verify: "Monitor until that check passes" + + flaky_test: + action: "mcp_ado_pipelines_update_build_stage with state=retry" + max_retries: 2 + if_still_fails: "Document and escalate" + + infrastructure: + action: "Wait 5 min, then retry failed stage via MCP" + + unrelated: + action: "Document that failure exists in master, proceed" + + step_4_iterate: + loop: "Repeat step_2 and step_3 until all gates GREEN" + exit_condition: "gh pr checks shows all checks passing" + + step_5_complete: + action: "gh pr ready {pr_number}" + result: "PR moves from draft to ready for review" + + success_criteria: + required: + - "All CI checks show βœ“ (green)" + - "No pending checks remaining" + - "PR lint passed" + - "CLA signed" + optional: + - "CodeQL may show warnings (review, don't block)" + + failure_escalation: + after_3_retries: "Investigate deeper, may need human help" + infrastructure_repeated: "Check Azure DevOps service status" + unknown_failure: "Comment on PR with findings, request help" +``` + +**Quick Reference Commands:** + +```bash +# Check all PR gates +gh pr checks {pr_number} + +# Get detailed status as JSON +gh pr checks {pr_number} --json name,state,conclusion + +# View specific check logs (GitHub Actions) +gh run view {run_id} --log-failed + +# Retry failed stage (Azure DevOps via MCP) +# Use: mcp_ado_pipelines_update_build_stage +# project: "cosmos-db-sdk-public" +# buildId: {build_id} +# stageRefName: "{failed_stage}" +# state: "retry" +``` + ### 16.9 Parallel Agent Strategy **Use parallel background agents to maximize efficiency:** From 3e4149ff56eb1a4f2faebad59ff4cf86d7c87abe Mon Sep 17 00:00:00 2001 From: Kiran Kumar Kolli Date: Sun, 1 Feb 2026 11:39:22 -0800 Subject: [PATCH 07/28] Docs: Add StyleCop and EditorConfig style rules (Section 16.13) --- .github/copilot-agent-plan.md | 52 +++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/.github/copilot-agent-plan.md b/.github/copilot-agent-plan.md index 0c25da596d..3e93ddccd0 100644 --- a/.github/copilot-agent-plan.md +++ b/.github/copilot-agent-plan.md @@ -3082,6 +3082,58 @@ pr_title_format: - "Query: Refactors SQL generation for better readability" ``` +### 16.13 Code Style (StyleCop & EditorConfig) + +**Repository uses StyleCop.Analyzers and .editorconfig for code style enforcement.** + +```yaml +stylecop: + config_file: "Microsoft.Azure.Cosmos/src/stylecop.json" + package: "StyleCop.Analyzers v1.1.118" + + key_rules: + documentation: + company_name: "Microsoft" + document_internal: false + xml_header: false + ordering: + system_usings_first: true + readability: + no_builtin_type_aliases: true # Use 'string' not 'String' + +editorconfig: + file: ".editorconfig" + + critical_rules: + indentation: "4 spaces (no tabs)" + line_endings: "CRLF" + final_newline: false + + this_qualification: "required (error level)" + # this.field, this.Property, this.Method(), this.Event + + var_usage: "never (error level)" + # Always use explicit types: string x = ""; not var x = ""; + + usings_placement: "inside namespace (error level)" + + braces: "Allman style (open brace on new line)" + + common_violations: + - "Missing this. qualifier" + - "Using var instead of explicit type" + - "Usings outside namespace" + - "Missing parentheses in binary operators" +``` + +**Quick Style Checklist:** +- [ ] `this.` prefix on all instance members +- [ ] Explicit types (no `var`) +- [ ] `using` statements inside namespace +- [ ] System usings first +- [ ] 4-space indentation +- [ ] CRLF line endings + --- ## TODO: Implementation Tasks From 31e44fbb11a676209f829f67feb58971c534333f Mon Sep 17 00:00:00 2001 From: Kiran Kumar Kolli Date: Sun, 1 Feb 2026 11:43:18 -0800 Subject: [PATCH 08/28] Docs: Add best practices (Sections 16.14-16.21) - Async/await & CancellationToken patterns - Error handling with CosmosException - Logging with DefaultTrace - Breaking change detection (contracts) - Security review checklist - Performance considerations - Rollback strategy - Testing patterns (unit, emulator, mocking) --- .github/copilot-agent-plan.md | 230 ++++++++++++++++++++++++++++++++++ 1 file changed, 230 insertions(+) diff --git a/.github/copilot-agent-plan.md b/.github/copilot-agent-plan.md index 3e93ddccd0..22eb2ac61a 100644 --- a/.github/copilot-agent-plan.md +++ b/.github/copilot-agent-plan.md @@ -3134,6 +3134,236 @@ editorconfig: - [ ] 4-space indentation - [ ] CRLF line endings +### 16.14 Async/Await & CancellationToken Patterns + +```yaml +async_patterns: + all_public_async_methods: + - "Must accept CancellationToken as last parameter" + - "Must pass CancellationToken to all async calls" + - "Must use ConfigureAwait(false) for library code" + + naming: + - "Async suffix required: ReadItemAsync, CreateContainerAsync" + + example: | + public async Task> ReadItemAsync( + string id, + PartitionKey partitionKey, + ItemRequestOptions requestOptions = null, + CancellationToken cancellationToken = default) + { + return await this.container.ReadItemAsync( + id, + partitionKey, + requestOptions, + cancellationToken).ConfigureAwait(false); + } + + anti_patterns: + - "Never use .Result or .Wait() - causes deadlocks" + - "Never ignore CancellationToken parameter" + - "Never create fire-and-forget tasks without error handling" +``` + +### 16.15 Error Handling Patterns + +```yaml +exception_handling: + primary_exception: "CosmosException" + location: "Microsoft.Azure.Cosmos.Resource.CosmosExceptions" + + factory_usage: + - "Use CosmosExceptionFactory.Create() for new exceptions" + - "Use CosmosExceptionFactory.CreateNotFoundException()" + - "Use CosmosExceptionFactory.CreateBadRequestException()" + + catching_pattern: | + try + { + ResponseMessage response = await this.SendAsync(...); + } + catch (CosmosException cosmosException) when (cosmosException.StatusCode == HttpStatusCode.NotFound) + { + // Handle not found + } + catch (CosmosException cosmosException) + { + DefaultTrace.TraceError($"Operation failed: {cosmosException.StatusCode}"); + throw; + } + + status_code_handling: + "400 BadRequest": "Invalid input, don't retry" + "404 NotFound": "Resource doesn't exist" + "409 Conflict": "Concurrency conflict, may retry with new etag" + "429 TooManyRequests": "Throttled, retry after RetryAfter" + "503 ServiceUnavailable": "Transient, retry with backoff" +``` + +### 16.16 Logging Conventions + +```yaml +logging: + class: "DefaultTrace" + namespace: "Microsoft.Azure.Cosmos.Tracing" + + methods: + info: "DefaultTrace.TraceInformation()" + warning: "DefaultTrace.TraceWarning()" + error: "DefaultTrace.TraceError()" + critical: "DefaultTrace.TraceCritical()" + verbose: "DefaultTrace.TraceVerbose()" + + format: + - "Include context: operation, resource, status" + - "Use string interpolation with $" + - "Include relevant IDs for debugging" + + example: | + DefaultTrace.TraceInformation( + $"ReadItem completed. Container: {this.containerId}, " + + $"Item: {itemId}, StatusCode: {response.StatusCode}"); + + when_to_log: + always: "Errors, retries, throttling" + verbose: "Successful operations, timing" + never: "Sensitive data, PII, keys" +``` + +### 16.17 Breaking Change Detection + +```yaml +api_contracts: + contract_files: + - "Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/Contracts/DotNetSDKAPI.net6.json" + - "Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/Contracts/DotNetSDKTelemetryAPI.net6.json" + + update_script: "UpdateContracts.ps1" + + check_command: | + dotnet test --filter "TestCategory=UpdateContract" --configuration Release + + breaking_changes: + prohibited: + - "Removing public API" + - "Changing method signatures" + - "Changing return types" + - "Removing properties" + allowed_with_review: + - "Adding new optional parameters" + - "Adding new methods/properties" + - "Deprecating (not removing) APIs" + + before_pr: + - "Run UpdateContracts.ps1" + - "Review contract diff" + - "If breaking, document in PR and get explicit approval" +``` + +### 16.18 Security Review Checklist + +```yaml +security_review: + required_for: + - "Changes to Authorization/" + - "Changes to encryption code" + - "Changes to credential handling" + - "New external HTTP calls" + + checklist: + - "[ ] No secrets/keys logged or exposed" + - "[ ] Credentials not stored in plain text" + - "[ ] Input validation on user-provided data" + - "[ ] No SQL injection in generated queries" + - "[ ] Proper certificate validation" + - "[ ] CancellationToken honored (no indefinite waits)" + + encryption_specific: + - "[ ] Keys properly scoped" + - "[ ] Encryption algorithms approved" + - "[ ] Key rotation supported" +``` + +### 16.19 Performance Considerations + +```yaml +performance: + hot_path_rules: + - "Minimize allocations in request path" + - "Use Span/Memory for buffer operations" + - "Avoid LINQ in hot paths (use foreach)" + - "Pool objects where possible (ArrayPool)" + + benchmarking: + tool: "BenchmarkDotNet" + location: "Microsoft.Azure.Cosmos.Performance" + + common_issues: + boxing: "Avoid boxing value types" + closures: "Be careful with lambda captures" + strings: "Use StringBuilder for concatenation" + + before_pr: + - "Check if change is in hot path" + - "Run relevant benchmarks if performance-sensitive" + - "Document any expected performance impact" +``` + +### 16.20 Rollback Strategy + +```yaml +rollback: + if_pr_causes_issues: + immediate: + - "Revert PR: gh pr revert {number}" + - "Or: git revert {commit} && git push" + + investigation: + - "Create issue documenting the problem" + - "Link to reverted PR" + - "Analyze what was missed" + + prevention: + - "Draft PR with CI validation" + - "Require review before merge" + - "Monitor after merge for 24h" +``` + +### 16.21 Testing Patterns + +```yaml +testing: + unit_tests: + location: "Microsoft.Azure.Cosmos.Tests" + pattern: "{ClassName}Tests.cs" + framework: "MSTest" + + emulator_tests: + location: "Microsoft.Azure.Cosmos.EmulatorTests" + requires: "Cosmos DB Emulator running" + category: "[TestCategory(\"Emulator\")]" + + mocking: + - "Mock external dependencies (HTTP, network)" + - "Use ResponseMessage for response mocking" + - "Don't mock internal implementation details" + + test_naming: | + [TestMethod] + public async Task MethodName_Scenario_ExpectedResult() + { + // Arrange + // Act + // Assert + } + + baseline_tests: + location: "Microsoft.Azure.Cosmos.Tests/BaselineTest" + purpose: "Capture expected output for comparison" + update: "Run UpdateContracts.ps1 to refresh baselines" +``` + --- ## TODO: Implementation Tasks From 98acb462667d6288c09c17dedc8580b270ca7166 Mon Sep 17 00:00:00 2001 From: Kiran Kumar Kolli Date: Sun, 1 Feb 2026 11:45:18 -0800 Subject: [PATCH 09/28] Docs: Add local emulator testing strategy (Section 4.8.1) - Emulator-first validation before CI push - ~85% of tests run locally with emulator - Only ~15% require secrets (CI-only) - Quick commands for local testing - Emulator setup instructions --- .github/copilot-agent-plan.md | 141 ++++++++++++++++++++++++++++++++++ 1 file changed, 141 insertions(+) diff --git a/.github/copilot-agent-plan.md b/.github/copilot-agent-plan.md index 22eb2ac61a..93a86229db 100644 --- a/.github/copilot-agent-plan.md +++ b/.github/copilot-agent-plan.md @@ -1099,6 +1099,147 @@ acceptance_criteria: required: "Before/after benchmark comparison in PR" ``` +### 4.8.1 Local Testing Strategy (Emulator-First) + +**⚠️ CRITICAL: Run maximum tests locally BEFORE pushing to CI.** + +Most tests can run locally with the Cosmos DB Emulator. Only a few tests require Azure secrets. + +```yaml +local_testing_strategy: + principle: "Validate locally first, CI is final verification" + + test_categories_by_locality: + fully_local: + unit_tests: + path: "Microsoft.Azure.Cosmos.Tests" + command: "dotnet test --filter \"Category!=Emulator\"" + secrets_required: false + run: "ALWAYS before push" + + emulator_tests: + path: "Microsoft.Azure.Cosmos.EmulatorTests" + command: "dotnet test" + secrets_required: false + prerequisite: "Cosmos DB Emulator running" + run: "ALWAYS before push" + categories: + - "Query tests" + - "CRUD operations" + - "ChangeFeed tests" + - "Batch operations" + - "LINQ tests" + + encryption_tests: + path: "Microsoft.Azure.Cosmos.Encryption.Tests" + command: "dotnet test" + secrets_required: false + run: "If encryption code changed" + + requires_secrets: + multi_region_tests: + category: "MultiRegion" + secret: "COSMOSDB_MULTI_REGION" + run: "CI only (or with personal Azure account)" + + live_account_tests: + category: "LiveTest" + secret: "COSMOSDB_ACCOUNT_*" + run: "CI only" + + emulator_setup: + windows: + install: | + # Download from Azure portal or use winget + winget install Microsoft.Azure.CosmosEmulator + + start: | + # Start emulator + & "C:\Program Files\Azure Cosmos DB Emulator\CosmosDB.Emulator.exe" + + verify: | + # Check emulator is running + Invoke-WebRequest -Uri "https://localhost:8081/_explorer/emulator.pem" -UseBasicParsing + + connection_string: "AccountEndpoint=https://localhost:8081/;AccountKey=C2y6yDjf5/R+ob0N8A7Cgv30VRDJIWEHLM+4QDU5DE2nQ9nDuVTqobD4b8mGGyPMbIZnqyMsEcaGQy67XIw/Jw==" + + recommended_workflow: + before_any_push: + step_1: + name: "Start emulator" + command: "Start-Process 'C:\\Program Files\\Azure Cosmos DB Emulator\\CosmosDB.Emulator.exe'" + wait: "30 seconds for startup" + + step_2: + name: "Run unit tests" + command: "dotnet test Microsoft.Azure.Cosmos.Tests --filter \"Category!=Emulator\" -c Release" + expected: "All pass" + time: "~2 minutes" + + step_3: + name: "Run emulator tests" + command: "dotnet test Microsoft.Azure.Cosmos.EmulatorTests -c Release" + expected: "All pass" + time: "~15-30 minutes" + + step_4: + name: "Run specific area tests" + example: "dotnet test --filter \"FullyQualifiedName~Linq\" -c Release" + when: "For targeted validation" + + step_5: + name: "Push to CI" + action: "git push" + note: "CI runs secrets-required tests" + + test_coverage_by_environment: + local_emulator: + coverage: "~85% of all tests" + includes: + - "All unit tests" + - "All LINQ/Query tests" + - "All CRUD tests" + - "All ChangeFeed tests" + - "All Batch tests" + - "Most encryption tests" + - "Most retry/resilience tests" + + ci_only: + coverage: "~15% of all tests" + includes: + - "Multi-region replication" + - "Live account integration" + - "Cross-region failover" + - "Production endpoint tests" + + efficiency_gains: + local_validation: "Catch 85%+ of issues in ~20 minutes" + ci_feedback: "Avoid 60-90 minute CI wait for simple errors" + iteration_speed: "Fix β†’ local test β†’ fix β†’ local test β†’ push (confident)" +``` + +**Quick Local Test Commands:** + +```powershell +# Start emulator (if not running) +Start-Process "C:\Program Files\Azure Cosmos DB Emulator\CosmosDB.Emulator.exe" + +# Run all unit tests (~2 min) +dotnet test .\Microsoft.Azure.Cosmos\tests\Microsoft.Azure.Cosmos.Tests -c Release + +# Run emulator tests (~20 min) +dotnet test .\Microsoft.Azure.Cosmos\tests\Microsoft.Azure.Cosmos.EmulatorTests -c Release + +# Run specific test category +dotnet test --filter "TestCategory=Query" -c Release + +# Run tests matching name pattern +dotnet test --filter "FullyQualifiedName~Dictionary" -c Release + +# Run single test +dotnet test --filter "FullyQualifiedName=Microsoft.Azure.Cosmos.Tests.Linq.LinqDictionaryQueryTests.TestIsDictionaryExtension" -c Release +``` + ### 4.8 Regression Testing Requirement **Before any fix is considered complete, ALL existing tests must pass - both locally AND on remote CI.** From 6560fc6591fc6f7af234e1f9645eec293eb41c7d Mon Sep 17 00:00:00 2001 From: Kiran Kumar Kolli Date: Sun, 1 Feb 2026 11:46:43 -0800 Subject: [PATCH 10/28] Docs: Add new machine quick start guide (Section 0.7) - Step-by-step setup for new machine (15-20 min) - Test workflow with real issue - Minimal 5-minute verification script - Troubleshooting common issues --- .github/copilot-agent-plan.md | 149 ++++++++++++++++++++++++++++++++++ 1 file changed, 149 insertions(+) diff --git a/.github/copilot-agent-plan.md b/.github/copilot-agent-plan.md index 93a86229db..3d645c8539 100644 --- a/.github/copilot-agent-plan.md +++ b/.github/copilot-agent-plan.md @@ -159,6 +159,155 @@ npx @azure-devops/mcp cosmos-db-sdk-public --help - [ ] .NET SDK installed: `dotnet --version` - [ ] Git configured: `git config user.name` - [ ] Repository cloned: `E:\src\v33` or similar + +### Cosmos DB Emulator +- [ ] Emulator installed: Check "Azure Cosmos DB Emulator" in Start menu +- [ ] Emulator running: `https://localhost:8081/_explorer/index.html` accessible +``` + +### 0.7 New Machine Quick Start + +**To test these instructions on a new machine with a new issue:** + +```yaml +new_machine_setup: + time_estimate: "15-20 minutes" + + step_1_prerequisites: + commands: | + # Install required tools (Windows) + winget install Microsoft.DotNet.SDK.8 + winget install Git.Git + winget install GitHub.cli + winget install OpenJS.NodeJS + winget install Microsoft.Azure.CosmosEmulator + + step_2_clone_repo: + commands: | + # Clone repository + git clone https://github.com/Azure/azure-cosmos-dotnet-v3.git + cd azure-cosmos-dotnet-v3 + + # Build to verify setup + dotnet build Microsoft.Azure.Cosmos.sln -c Release + + step_3_authenticate: + commands: | + # GitHub CLI + gh auth login --web + + # Verify + gh auth status + + step_4_start_emulator: + commands: | + # Start Cosmos DB Emulator + Start-Process "C:\Program Files\Azure Cosmos DB Emulator\CosmosDB.Emulator.exe" + + # Wait 30 seconds, then verify + Start-Sleep -Seconds 30 + Invoke-WebRequest -Uri "https://localhost:8081/" -UseBasicParsing + + step_5_test_with_issue: + description: "Give Copilot an issue to work on" + example_prompt: | + "Investigate and fix issue #XXXX following the plan in + .github/copilot-agent-plan.md" +``` + +**Test Workflow with a Real Issue:** + +```yaml +testing_the_plan: + step_1_select_issue: + options: + - "Pick an open bug from: gh issue list --label bug --state open" + - "Use a known test issue (if available)" + - "Create a test issue with a known problem" + + step_2_invoke_copilot: + prompt_template: | + Follow the Copilot Agent Issue Triage Plan in .github/copilot-agent-plan.md + + Investigate issue #{issue_number}: {issue_title} + + Steps: + 1. Fetch and analyze the issue + 2. Search codebase for related code + 3. Identify root cause + 4. Create reproduction test + 5. Implement fix + 6. Run local tests (unit + emulator) + 7. Create draft PR + 8. Monitor CI until GREEN + + step_3_verify_workflow: + checklist: + - "[ ] Issue fetched successfully (or via web_fetch fallback)" + - "[ ] Root cause identified" + - "[ ] Local tests pass" + - "[ ] PR created with correct format" + - "[ ] CI monitoring initiated" + + step_4_validate_results: + success_criteria: + - "PR created following naming convention" + - "PR title matches lint format" + - "Local emulator tests pass" + - "CI gates monitored" +``` + +**Minimal Test (5 minutes):** + +```powershell +# Quick verification that setup works +# Run from repository root + +# 1. Verify build +dotnet build Microsoft.Azure.Cosmos.sln -c Release + +# 2. Run unit tests (no emulator needed) +dotnet test .\Microsoft.Azure.Cosmos\tests\Microsoft.Azure.Cosmos.Tests -c Release --filter "Category!=Emulator" -- --no-restore + +# 3. Verify GitHub CLI +gh issue list --repo Azure/azure-cosmos-dotnet-v3 --limit 5 + +# 4. Verify git +git status + +# If all pass, environment is ready for Copilot agent workflow +Write-Host "βœ… Environment ready!" +``` + +**Troubleshooting Common Issues:** + +```yaml +troubleshooting: + build_fails: + symptom: "dotnet build fails" + causes: + - "Missing .NET SDK": "winget install Microsoft.DotNet.SDK.8" + - "Wrong SDK version": "Check global.json for required version" + - "Missing workloads": "dotnet workload restore" + + gh_auth_fails: + symptom: "gh: not authenticated" + fix: "gh auth login --web" + + emulator_not_starting: + symptom: "Connection refused on localhost:8081" + causes: + - "Emulator not installed": "winget install Microsoft.Azure.CosmosEmulator" + - "Port conflict": "Check if another process uses 8081" + - "Needs admin": "Run emulator as administrator" + + github_api_403: + symptom: "SAML authentication required" + fix: "Use web_fetch to scrape issue page directly (see Section 3 Phase 0)" + + azure_devops_auth: + symptom: "MCP server auth fails" + fix: "First MCP call opens browser - complete Microsoft login" ``` --- From 9a8990e9100c508f5dd05f1e75a657320a57d4fb Mon Sep 17 00:00:00 2001 From: Kiran Kumar Kolli Date: Sun, 1 Feb 2026 11:55:14 -0800 Subject: [PATCH 11/28] Docs: Clarify unit tests have no external dependencies - Microsoft.Azure.Cosmos.Tests runs without emulator - Removed unnecessary --filter from unit test commands - Split test coverage: unit (no deps) vs emulator vs CI-only --- .github/copilot-agent-plan.md | 50 ++++++++++++++++++++++++----------- 1 file changed, 35 insertions(+), 15 deletions(-) diff --git a/.github/copilot-agent-plan.md b/.github/copilot-agent-plan.md index 3d645c8539..3be298fe08 100644 --- a/.github/copilot-agent-plan.md +++ b/.github/copilot-agent-plan.md @@ -266,8 +266,8 @@ testing_the_plan: # 1. Verify build dotnet build Microsoft.Azure.Cosmos.sln -c Release -# 2. Run unit tests (no emulator needed) -dotnet test .\Microsoft.Azure.Cosmos\tests\Microsoft.Azure.Cosmos.Tests -c Release --filter "Category!=Emulator" -- --no-restore +# 2. Run unit tests (no emulator needed, no external dependencies) +dotnet test .\Microsoft.Azure.Cosmos\tests\Microsoft.Azure.Cosmos.Tests -c Release --no-restore # 3. Verify GitHub CLI gh issue list --repo Azure/azure-cosmos-dotnet-v3 --limit 5 @@ -1262,8 +1262,10 @@ local_testing_strategy: fully_local: unit_tests: path: "Microsoft.Azure.Cosmos.Tests" - command: "dotnet test --filter \"Category!=Emulator\"" + command: "dotnet test" secrets_required: false + external_dependencies: false + note: "Pure unit tests - no emulator, no network, no secrets" run: "ALWAYS before push" emulator_tests: @@ -1321,9 +1323,10 @@ local_testing_strategy: step_2: name: "Run unit tests" - command: "dotnet test Microsoft.Azure.Cosmos.Tests --filter \"Category!=Emulator\" -c Release" + command: "dotnet test Microsoft.Azure.Cosmos.Tests -c Release" expected: "All pass" time: "~2 minutes" + note: "No emulator needed - pure unit tests" step_3: name: "Run emulator tests" @@ -1342,16 +1345,29 @@ local_testing_strategy: note: "CI runs secrets-required tests" test_coverage_by_environment: - local_emulator: - coverage: "~85% of all tests" + local_no_dependencies: + project: "Microsoft.Azure.Cosmos.Tests" + coverage: "Unit tests - no external dependencies" includes: - "All unit tests" - - "All LINQ/Query tests" - - "All CRUD tests" - - "All ChangeFeed tests" - - "All Batch tests" - - "Most encryption tests" - - "Most retry/resilience tests" + - "Mocked integration tests" + - "Serialization tests" + - "Type system tests" + - "LINQ translation tests" + requires: "Nothing - just .NET SDK" + time: "~2 minutes" + + local_with_emulator: + project: "Microsoft.Azure.Cosmos.EmulatorTests" + coverage: "Integration tests with real Cosmos operations" + includes: + - "CRUD operations" + - "Query execution" + - "ChangeFeed tests" + - "Batch tests" + - "Encryption tests" + requires: "Cosmos DB Emulator running" + time: "~15-30 minutes" ci_only: coverage: "~15% of all tests" @@ -1360,11 +1376,13 @@ local_testing_strategy: - "Live account integration" - "Cross-region failover" - "Production endpoint tests" + requires: "Azure secrets" efficiency_gains: - local_validation: "Catch 85%+ of issues in ~20 minutes" + unit_tests_only: "Catch most logic errors in ~2 minutes (no setup)" + with_emulator: "Catch integration issues in ~20 minutes" ci_feedback: "Avoid 60-90 minute CI wait for simple errors" - iteration_speed: "Fix β†’ local test β†’ fix β†’ local test β†’ push (confident)" + iteration_speed: "Fix β†’ unit test β†’ fix β†’ emulator test β†’ push" ``` **Quick Local Test Commands:** @@ -1434,9 +1452,11 @@ regression_testing: test_categories: unit_tests: path: "Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests" - command: dotnet test --filter "Category!=Emulator" + command: "dotnet test" required: true local: true + external_dependencies: false + note: "No emulator, no network, no secrets - pure unit tests" remote: true emulator_tests: From a52b8f1a61ffdcf32d5a80f112e069c04ccbcb9a Mon Sep 17 00:00:00 2001 From: Kiran Kumar Kolli Date: Sun, 1 Feb 2026 11:56:28 -0800 Subject: [PATCH 12/28] Docs: Require proof of local test success before PR - Must show test output (Passed/Failed/Skipped counts) - Gate: DO NOT create PR if any tests fail - Added to PR checklist: PROOF OF LOCAL TESTS SHOWN --- .github/copilot-agent-plan.md | 61 ++++++++++++++++++++++++++++------- 1 file changed, 50 insertions(+), 11 deletions(-) diff --git a/.github/copilot-agent-plan.md b/.github/copilot-agent-plan.md index 3be298fe08..7bd22a65d5 100644 --- a/.github/copilot-agent-plan.md +++ b/.github/copilot-agent-plan.md @@ -1498,10 +1498,12 @@ regression_testing: - May need to re-run pipeline for transient failures pr_checklist: - - "[ ] All unit tests pass locally" - - "[ ] All emulator tests pass locally (if applicable)" + - "[ ] Build succeeds: dotnet build -c Release" + - "[ ] Unit tests pass locally (SHOW OUTPUT)" + - "[ ] Emulator tests pass locally (if applicable, SHOW OUTPUT)" - "[ ] No regression in existing tests" - "[ ] New tests added for the fix" + - "[ ] **PROOF OF LOCAL TESTS SHOWN** (see Section 7.4.3)" - "[ ] PR created and CI triggered" - "[ ] ALL remote CI gates pass (Section 7.4)" - "[ ] CI failures investigated and resolved" @@ -1984,24 +1986,61 @@ ci_gates: ```yaml validation_workflow: phase_1_local: - description: "Quick local validation before creating PR" + description: "Local validation with PROOF before creating PR" + steps: - name: "Build solution" - command: dotnet build Microsoft.Azure.Cosmos.sln -c Release + command: "dotnet build Microsoft.Azure.Cosmos.sln -c Release" required: true - name: "Run unit tests" - command: | - dotnet test Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests \ - --filter "TestCategory!=Flaky & TestCategory!=Quarantine & TestCategory!=Functional & TestCategory!=Ignore" \ - -c Release --no-build + command: "dotnet test Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests -c Release --no-build" required: true - - name: "Run LINQ-specific tests (for LINQ changes)" - command: dotnet test --filter "FullyQualifiedName~Linq" -c Release --no-build - required: "for LINQ-related changes" + - name: "Run area-specific tests (if applicable)" + command: "dotnet test --filter \"FullyQualifiedName~{Area}\" -c Release --no-build" + example: "dotnet test --filter \"FullyQualifiedName~Linq\" -c Release --no-build" + required: "for changes in specific area" + + - name: "Run emulator tests (if emulator available)" + command: "dotnet test Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.EmulatorTests -c Release" + required: "recommended for integration changes" + + proof_required: + description: "⚠️ MUST show test output before creating PR" + format: | + ## Local Test Results + + ### Unit Tests + ``` + Passed: XXX + Failed: 0 + Skipped: X + ``` + + ### Area Tests (if run) + ``` + Passed: XX + Failed: 0 + ``` + + ### Emulator Tests (if run) + ``` + Passed: XXX + Failed: 0 + ``` + + example_output: | + Test run successful. + Total tests: 1247 + Passed: 1245 + Skipped: 2 + Total time: 2.34 Minutes + + gate: "DO NOT create PR if any tests fail" phase_2_create_pr: + prerequisite: "phase_1_local MUST pass with proof shown" description: "Create PR to trigger remote CI" steps: - name: "Create feature branch" From e6bb1a54cfa4dd24f7b2b3712d8ff751524e9bec Mon Sep 17 00:00:00 2001 From: Kiran Kumar Kolli Date: Sun, 1 Feb 2026 11:57:43 -0800 Subject: [PATCH 13/28] Docs: Run tests with PREVIEW configuration also - Build/test with default AND /p:IsPreview=true - PREVIEW defines: PREVIEW;ENCRYPTIONPREVIEW - Proof required for BOTH configurations - Gate: fail in ANY configuration blocks PR --- .github/copilot-agent-plan.md | 57 +++++++++++++++++++++++++++-------- 1 file changed, 45 insertions(+), 12 deletions(-) diff --git a/.github/copilot-agent-plan.md b/.github/copilot-agent-plan.md index 7bd22a65d5..8f01e47217 100644 --- a/.github/copilot-agent-plan.md +++ b/.github/copilot-agent-plan.md @@ -1988,15 +1988,48 @@ validation_workflow: phase_1_local: description: "Local validation with PROOF before creating PR" + build_configurations: + default: + command: "dotnet build Microsoft.Azure.Cosmos.sln -c Release" + defines: "(none)" + required: true + + preview: + command: "dotnet build Microsoft.Azure.Cosmos.sln -c Release /p:IsPreview=true" + defines: "PREVIEW;ENCRYPTIONPREVIEW" + required: true + note: "Tests preview-only APIs and code paths" + + test_configurations: + default: + build: "dotnet build -c Release" + test: "dotnet test -c Release --no-build" + required: true + + preview: + build: "dotnet build -c Release /p:IsPreview=true" + test: "dotnet test -c Release --no-build" + required: true + note: "Must pass with PREVIEW defined" + steps: - - name: "Build solution" + - name: "Build solution (default)" command: "dotnet build Microsoft.Azure.Cosmos.sln -c Release" required: true - - name: "Run unit tests" + - name: "Run unit tests (default)" command: "dotnet test Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests -c Release --no-build" required: true + - name: "Build solution (PREVIEW)" + command: "dotnet build Microsoft.Azure.Cosmos.sln -c Release /p:IsPreview=true" + required: true + + - name: "Run unit tests (PREVIEW)" + command: "dotnet test Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests -c Release --no-build" + required: true + note: "Tests run against PREVIEW build" + - name: "Run area-specific tests (if applicable)" command: "dotnet test --filter \"FullyQualifiedName~{Area}\" -c Release --no-build" example: "dotnet test --filter \"FullyQualifiedName~Linq\" -c Release --no-build" @@ -2007,11 +2040,18 @@ validation_workflow: required: "recommended for integration changes" proof_required: - description: "⚠️ MUST show test output before creating PR" + description: "⚠️ MUST show test output for BOTH configurations before creating PR" format: | ## Local Test Results - ### Unit Tests + ### Unit Tests (Default) + ``` + Passed: XXX + Failed: 0 + Skipped: X + ``` + + ### Unit Tests (PREVIEW) ``` Passed: XXX Failed: 0 @@ -2030,14 +2070,7 @@ validation_workflow: Failed: 0 ``` - example_output: | - Test run successful. - Total tests: 1247 - Passed: 1245 - Skipped: 2 - Total time: 2.34 Minutes - - gate: "DO NOT create PR if any tests fail" + gate: "DO NOT create PR if any tests fail in ANY configuration" phase_2_create_pr: prerequisite: "phase_1_local MUST pass with proof shown" From fc9bd5d207e74b96caf87b7f5cc29e29888e3c7e Mon Sep 17 00:00:00 2001 From: Kiran Kumar Kolli Date: Sun, 1 Feb 2026 12:33:02 -0800 Subject: [PATCH 14/28] Docs: Clarify gh CLI vs GitHub MCP Server for SAML repos - GitHub MCP Server BLOCKED by SAML for Azure org - gh CLI WORKS (browser OAuth completes SAML SSO) - web_fetch as fallback for reading content - Added quick reference commands --- .github/copilot-agent-plan.md | 69 +++++++++++++++++++++++++++++------ 1 file changed, 57 insertions(+), 12 deletions(-) diff --git a/.github/copilot-agent-plan.md b/.github/copilot-agent-plan.md index 8f01e47217..1d50ee16e0 100644 --- a/.github/copilot-agent-plan.md +++ b/.github/copilot-agent-plan.md @@ -17,19 +17,64 @@ ### 0.2 GitHub MCP Server -**Already configured in most Copilot environments.** +### 0.2 GitHub Access (SAML-Protected Repos) + +**⚠️ IMPORTANT: Azure org repos require SAML SSO - GitHub MCP Server is BLOCKED.** + +```yaml +github_access: + saml_protected_orgs: + - "Azure" + - "microsoft" + note: "MCP server tokens don't have SAML authorization" + + what_works: + gh_cli: + status: "βœ… WORKS" + reason: "Browser OAuth completes SAML SSO" + auth: "gh auth login --web" + use_for: + - "Create/list/view issues" + - "Create/list/view PRs" + - "Check PR status" + - "All GitHub operations" + examples: + - "gh issue list --repo Azure/azure-cosmos-dotnet-v3" + - "gh pr create --draft --title '...'" + - "gh pr checks 5583" + + web_fetch: + status: "βœ… WORKS (fallback)" + reason: "Scrapes public web pages" + use_for: "Reading issue/PR content when gh fails" + example: "web_fetch https://github.com/Azure/azure-cosmos-dotnet-v3/issues/5547" + + what_does_not_work: + github_mcp_server: + status: "❌ BLOCKED by SAML" + error: "Resource protected by organization SAML enforcement" + reason: "MCP OAuth token not SAML-authorized" + tools_affected: + - "github-mcp-server-list_issues" + - "github-mcp-server-issue_read" + - "github-mcp-server-pull_request_read" + - "All github-mcp-server-* tools for Azure org" + + recommended_approach: + primary: "Use gh CLI for all GitHub operations" + fallback: "Use web_fetch to scrape issue content" + avoid: "Don't rely on GitHub MCP Server for Azure org repos" +``` -```yaml -github_mcp_server: - package: "github-mcp-server" - capabilities: - - search_issues, list_issues, issue_read - - search_pull_requests, list_pull_requests, pull_request_read - - search_code, get_file_contents - - actions_list, actions_get, get_job_logs - - list_commits, get_commit - - note: "Usually pre-configured. For Azure org repos, may need SAML auth workaround (use web_fetch)" +**Quick Reference:** +```powershell +# βœ… This works (gh CLI) +gh issue view 5547 --repo Azure/azure-cosmos-dotnet-v3 +gh pr create --draft --title "Fix: ..." +gh pr checks 5583 + +# ❌ This fails (MCP Server) +# github-mcp-server-list_issues β†’ SAML error ``` ### 0.3 Azure DevOps MCP Server (Official Microsoft) From 7e9c7573f5a1872c922a55a2b47f670b4c59221b Mon Sep 17 00:00:00 2001 From: Kiran Kumar Kolli Date: Sun, 1 Feb 2026 12:33:45 -0800 Subject: [PATCH 15/28] Docs: Add gh CLI installation and setup steps - winget install GitHub.cli - gh auth login --web (browser SAML flow) - Required token scopes: repo, read:org - Verification commands --- .github/copilot-agent-plan.md | 41 +++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/.github/copilot-agent-plan.md b/.github/copilot-agent-plan.md index 1d50ee16e0..680fa57c85 100644 --- a/.github/copilot-agent-plan.md +++ b/.github/copilot-agent-plan.md @@ -66,6 +66,47 @@ github_access: avoid: "Don't rely on GitHub MCP Server for Azure org repos" ``` +**gh CLI Installation & Setup:** + +```powershell +# 1. Install gh CLI (Windows) +winget install --id GitHub.cli + +# Alternative installers: +# - macOS: brew install gh +# - Linux: https://github.com/cli/cli/blob/trunk/docs/install_linux.md + +# 2. Authenticate (opens browser for SAML SSO) +gh auth login --web + +# 3. Select options when prompted: +# - GitHub.com (not Enterprise) +# - HTTPS +# - Authenticate with browser + +# 4. Complete SAML SSO in browser +# - Login to GitHub +# - Authorize for Azure org (SAML) + +# 5. Verify authentication +gh auth status +# Expected: βœ“ Logged in to github.com account {username} + +# 6. Test access to Azure org +gh issue list --repo Azure/azure-cosmos-dotnet-v3 --limit 3 +``` + +**Required Scopes:** +```yaml +gh_token_scopes: + required: + - "repo" # Full repository access + - "read:org" # Read org membership (for SAML) + optional: + - "workflow" # GitHub Actions + - "gist" # Gists +``` + **Quick Reference:** ```powershell # βœ… This works (gh CLI) From 299e438f7255cae49618b9c9eecdb742c6ce0a5e Mon Sep 17 00:00:00 2001 From: Kiran Kumar Kolli Date: Sun, 1 Feb 2026 12:39:01 -0800 Subject: [PATCH 16/28] Docs: Add SAML troubleshooting for gh CLI MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Fix for 'Resource protected by organization SAML enforcement' - Steps: logout β†’ login β†’ authorize Azure org - Manual SSO authorization via github.com/settings/tokens - Alternative: github.com/orgs/Azure/sso --- .github/copilot-agent-plan.md | 52 +++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/.github/copilot-agent-plan.md b/.github/copilot-agent-plan.md index 680fa57c85..ed457746a5 100644 --- a/.github/copilot-agent-plan.md +++ b/.github/copilot-agent-plan.md @@ -107,6 +107,58 @@ gh_token_scopes: - "gist" # Gists ``` +**Troubleshooting SAML Errors:** + +```yaml +saml_troubleshooting: + error: "Resource protected by organization SAML enforcement" + cause: "Token not authorized for Azure org via SAML SSO" + + fix_steps: + step_1: + description: "Logout and re-authenticate with SAML" + commands: | + gh auth logout + gh auth login --web + + step_2: + description: "During browser auth, authorize for Azure org" + note: "After GitHub login, you'll see 'Authorize for: Azure' - click it" + + step_3: + description: "If no SAML prompt appears, manually authorize" + url: "https://github.com/settings/tokens" + steps: + - "Go to https://github.com/settings/tokens" + - "Find the 'gh' token (or GitHub CLI)" + - "Click 'Configure SSO' dropdown" + - "Click 'Authorize' next to 'Azure' org" + - "Complete SAML login" + + step_4: + description: "Verify authorization" + command: "gh issue list --repo Azure/azure-cosmos-dotnet-v3 --limit 1" + + alternative_manual_sso: + url: "https://github.com/orgs/Azure/sso" + steps: + - "Visit https://github.com/orgs/Azure/sso" + - "Complete SAML authentication" + - "Return and retry gh command" +``` + +**Quick Fix (Copy-Paste):** +```powershell +# Re-authenticate with SAML +gh auth logout +gh auth login --web +# β†’ Complete browser auth AND click "Authorize" for Azure org + +# OR manually authorize existing token: +# 1. Go to: https://github.com/settings/tokens +# 2. Find GitHub CLI token β†’ Configure SSO β†’ Authorize Azure +``` + **Quick Reference:** ```powershell # βœ… This works (gh CLI) From 420875f559e9c3d3487697e1f0f1da72921bb61b Mon Sep 17 00:00:00 2001 From: Kiran Kumar Kolli Date: Sun, 1 Feb 2026 12:43:30 -0800 Subject: [PATCH 17/28] Docs: Add auto-fix script for SAML errors - Test gh issue list, detect SAML error - Auto logout and re-login sequence - PowerShell script for copy-paste --- .github/copilot-agent-plan.md | 31 +++++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/.github/copilot-agent-plan.md b/.github/copilot-agent-plan.md index ed457746a5..9dad1b05cb 100644 --- a/.github/copilot-agent-plan.md +++ b/.github/copilot-agent-plan.md @@ -92,8 +92,35 @@ gh auth login --web gh auth status # Expected: βœ“ Logged in to github.com account {username} -# 6. Test access to Azure org -gh issue list --repo Azure/azure-cosmos-dotnet-v3 --limit 3 +# 6. Test access to Azure org (with auto-fix) +gh issue list --repo Azure/azure-cosmos-dotnet-v3 --limit 1 +# If SAML error occurs, run steps 7-8 + +# 7. Force logout (if SAML error) +gh auth logout + +# 8. Re-login with SAML authorization +gh auth login --web +# β†’ Complete browser auth AND authorize Azure org + +# 9. Verify fixed +gh issue list --repo Azure/azure-cosmos-dotnet-v3 --limit 1 +``` + +**Auto-Fix Script (Copy-Paste):** +```powershell +# Test and auto-fix SAML issues +$result = gh issue list --repo Azure/azure-cosmos-dotnet-v3 --limit 1 2>&1 +if ($result -match "SAML") { + Write-Host "SAML error detected. Re-authenticating..." + gh auth logout + gh auth login --web + Write-Host "Please authorize Azure org in browser, then press Enter..." + Read-Host + gh issue list --repo Azure/azure-cosmos-dotnet-v3 --limit 1 +} else { + Write-Host "βœ… gh CLI working: $result" +} ``` **Required Scopes:** From dbd89930fc8daf10672db10d2e54bf92c4dd4af8 Mon Sep 17 00:00:00 2001 From: Kiran Kumar Kolli Date: Sun, 1 Feb 2026 12:51:05 -0800 Subject: [PATCH 18/28] Docs: Auto-mark PR ready immediately when CI gates pass - Do NOT wait for user confirmation - Mark ready as soon as all checks green - CI passing = code validated, ready for review --- .github/copilot-agent-plan.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/.github/copilot-agent-plan.md b/.github/copilot-agent-plan.md index 9dad1b05cb..8f07242338 100644 --- a/.github/copilot-agent-plan.md +++ b/.github/copilot-agent-plan.md @@ -3386,8 +3386,18 @@ ci_monitoring_loop: exit_condition: "gh pr checks shows all checks passing" step_5_complete: + trigger: "IMMEDIATELY when all CI gates GREEN" action: "gh pr ready {pr_number}" result: "PR moves from draft to ready for review" + note: "Do NOT wait - mark ready as soon as gates pass" + + auto_ready_behavior: + principle: "Mark PR ready for review immediately when CI passes" + do_not_wait_for: + - "User confirmation" + - "Additional review" + - "Manual approval" + reason: "CI passing = code is validated, ready for human review" success_criteria: required: From 1647ffbffb260b2044b4f11d28120c753e065f73 Mon Sep 17 00:00:00 2001 From: Kiran Kumar Kolli Date: Sun, 1 Feb 2026 12:58:28 -0800 Subject: [PATCH 19/28] Docs: Require branch sync and dotnet-v3-ci before ready - Rebase on master before marking ready - dotnet-v3-ci is PRIMARY gate (must be SUCCESS) - Other checks are secondary/informational --- .github/copilot-agent-plan.md | 36 +++++++++++++++++++++++++++-------- 1 file changed, 28 insertions(+), 8 deletions(-) diff --git a/.github/copilot-agent-plan.md b/.github/copilot-agent-plan.md index 8f07242338..03a2905c7c 100644 --- a/.github/copilot-agent-plan.md +++ b/.github/copilot-agent-plan.md @@ -3385,28 +3385,48 @@ ci_monitoring_loop: loop: "Repeat step_2 and step_3 until all gates GREEN" exit_condition: "gh pr checks shows all checks passing" - step_5_complete: - trigger: "IMMEDIATELY when all CI gates GREEN" + step_5_pre_ready_checks: + description: "Verify branch is ready before marking for review" + checks: + - name: "Ensure branch is up-to-date with master" + commands: | + git fetch origin master + git rebase origin/master + # If conflicts, resolve and push + git push origin {branch_name} --force-with-lease + note: "Rebase ensures clean merge, re-triggers CI if needed" + + - name: "Verify dotnet-v3-ci passed" + command: "gh pr checks {pr_number} --json name,state | jq '.[] | select(.name==\"dotnet-v3-ci\")'" + required: "dotnet-v3-ci must show 'SUCCESS'" + note: "This is the main CI gate - other checks may be informational" + + step_6_complete: + trigger: "ONLY when branch is synced AND dotnet-v3-ci is GREEN" action: "gh pr ready {pr_number}" result: "PR moves from draft to ready for review" - note: "Do NOT wait - mark ready as soon as gates pass" + note: "Do NOT wait - mark ready as soon as conditions met" auto_ready_behavior: - principle: "Mark PR ready for review immediately when CI passes" + principle: "Mark PR ready for review immediately when conditions met" + required_conditions: + - "Branch is up-to-date with master (rebased)" + - "dotnet-v3-ci check is SUCCESS (main CI gate)" do_not_wait_for: - "User confirmation" - "Additional review" - "Manual approval" - reason: "CI passing = code is validated, ready for human review" + reason: "CI passing on synced branch = code is validated" success_criteria: required: - - "All CI checks show βœ“ (green)" - - "No pending checks remaining" + - "Branch rebased on latest master" + - "**dotnet-v3-ci check is SUCCESS** (primary gate)" - "PR lint passed" - "CLA signed" optional: - - "CodeQL may show warnings (review, don't block)" + - "CodeQL checks (informational)" + - "Other secondary checks" failure_escalation: after_3_retries: "Investigate deeper, may need human help" From 04fb0c775fe37128a4735f68891c7c79a35afb1e Mon Sep 17 00:00:00 2001 From: Kiran Kumar Kolli Date: Sun, 1 Feb 2026 13:01:08 -0800 Subject: [PATCH 20/28] Docs: Add Copilot code-review as gate before PR creation - phase_2_code_review: Run code-review agent before creating PR - Must address: bugs, security, null refs, resource leaks - Can ignore: style, minor refactoring - Gate: No high-signal issues before PR publish --- .github/copilot-agent-plan.md | 52 ++++++++++++++++++++++++++++++++--- 1 file changed, 48 insertions(+), 4 deletions(-) diff --git a/.github/copilot-agent-plan.md b/.github/copilot-agent-plan.md index 03a2905c7c..143ce707aa 100644 --- a/.github/copilot-agent-plan.md +++ b/.github/copilot-agent-plan.md @@ -2236,9 +2236,53 @@ validation_workflow: ``` gate: "DO NOT create PR if any tests fail in ANY configuration" + + phase_2_code_review: + prerequisite: "phase_1_local MUST pass" + description: "Run Copilot code review BEFORE creating PR" + + purpose: | + Catch issues before human reviewers see them: + - Bugs and logic errors + - Security vulnerabilities + - Missing error handling + - Performance issues + + steps: + - name: "Run Copilot code-review agent" + agent: "code-review" + scope: "Staged/uncommitted changes" + prompt: | + Review the changes for this fix. Focus on: + - Correctness of the fix + - Edge cases not handled + - Security implications + - Performance impact + + - name: "Address findings" + action: | + For each issue found: + 1. Evaluate if it's valid + 2. Fix valid issues locally + 3. Re-run local tests + 4. Re-run code review until clean + + gate: "DO NOT create PR until code review passes with no high-signal issues" + + acceptable_to_ignore: + - "Style/formatting suggestions (StyleCop handles this)" + - "Minor refactoring suggestions" + - "Documentation improvements (unless critical)" + + must_address: + - "Bugs or logic errors" + - "Security vulnerabilities" + - "Null reference risks" + - "Resource leaks" + - "Missing error handling" - phase_2_create_pr: - prerequisite: "phase_1_local MUST pass with proof shown" + phase_3_create_pr: + prerequisite: "phase_1_local AND phase_2_code_review MUST pass" description: "Create PR to trigger remote CI" steps: - name: "Create feature branch" @@ -2259,7 +2303,7 @@ validation_workflow: --title "Fix #{number}: {title}" \ --body "$(cat pr_body.md)" - phase_3_ci_monitoring: + phase_4_ci_monitoring: description: "Monitor CI pipeline execution" steps: - name: "Check pipeline status" @@ -2277,7 +2321,7 @@ validation_workflow: return_content: true tail_lines: 500 - phase_4_fix_ci_failures: + phase_5_fix_ci_failures: description: "Iterate until CI passes" loop: - Analyze failure logs From e7e28b81924f99e6dc097c0b3b17d8f5c234ee4b Mon Sep 17 00:00:00 2001 From: Kiran Kumar Kolli Date: Sun, 1 Feb 2026 13:06:26 -0800 Subject: [PATCH 21/28] Docs: Specify benchmark location - Samples/Tools/Benchmark only - ALL benchmarks in Microsoft.Azure.Cosmos.Samples/Tools/Benchmark - Do not add benchmarks elsewhere - Added commands for running benchmarks --- .github/copilot-agent-plan.md | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/.github/copilot-agent-plan.md b/.github/copilot-agent-plan.md index 143ce707aa..6e7755f45f 100644 --- a/.github/copilot-agent-plan.md +++ b/.github/copilot-agent-plan.md @@ -3917,9 +3917,20 @@ performance: - "Pool objects where possible (ArrayPool)" benchmarking: - tool: "BenchmarkDotNet" - location: "Microsoft.Azure.Cosmos.Performance" + location: "Microsoft.Azure.Cosmos.Samples/Tools/Benchmark" + url: "https://github.com/Azure/azure-cosmos-dotnet-v3/tree/master/Microsoft.Azure.Cosmos.Samples/Tools/Benchmark" + important: "ALL benchmarks go here ONLY - do not add benchmarks elsewhere" + adding_benchmarks: + step_1: "Add benchmark class to Benchmark project" + step_2: "Follow existing patterns in the project" + step_3: "Run locally to validate" + + running_benchmarks: + cd: "Microsoft.Azure.Cosmos.Samples/Tools/Benchmark" + help: "dotnet run -c Release -- --help" + example: "dotnet run -c Release -- -e {endpoint} -k {key}" + common_issues: boxing: "Avoid boxing value types" closures: "Be careful with lambda captures" @@ -3929,6 +3940,7 @@ performance: - "Check if change is in hot path" - "Run relevant benchmarks if performance-sensitive" - "Document any expected performance impact" + - "Add new benchmarks to Benchmark project if needed" ``` ### 16.20 Rollback Strategy From f30bb57b19f632465aca4892b2d61cf1135153a9 Mon Sep 17 00:00:00 2001 From: Kiran Kumar Kolli Date: Sun, 1 Feb 2026 13:23:51 -0800 Subject: [PATCH 22/28] Docs: Add parallel issue handling while CI waits - Start next issue investigation during CI wait time - State tracking per issue (branch, PR, CI status) - Session plan.md format for tracking active issues - Recommended max 2-3 parallel issues - Commands for switching between issues --- .github/copilot-agent-plan.md | 82 +++++++++++++++++++++++++++++++++++ 1 file changed, 82 insertions(+) diff --git a/.github/copilot-agent-plan.md b/.github/copilot-agent-plan.md index 6e7755f45f..6bef8411ce 100644 --- a/.github/copilot-agent-plan.md +++ b/.github/copilot-agent-plan.md @@ -3540,6 +3540,88 @@ parallel_agent_pattern: - "One agent's fix may conflict with another's" ``` +### 16.9.1 Parallel Issue Handling (While CI Waits) + +**While waiting for CI on one issue, start investigation on the next issue.** + +```yaml +parallel_issue_workflow: + principle: "Don't wait idle - use CI wait time productively" + + workflow: + time_0: + issue_A: "Create PR, CI starts (60-90 min wait)" + action: "Start investigation on Issue B" + + time_30min: + issue_A: "CI running (~50% complete)" + issue_B: "Root cause identified, fix implemented" + action: "Run local tests for Issue B" + + time_60min: + issue_A: "CI running (~80% complete)" + issue_B: "Local tests pass, create PR, CI starts" + action: "Start investigation on Issue C (if available)" + + time_90min: + issue_A: "CI complete β†’ mark ready for review" + issue_B: "CI running" + issue_C: "Investigation in progress" + + state_tracking: + per_issue: + - "Branch name: users//copilot--" + - "Investigation doc: ~/.copilot/session-state/.../files/issue-{n}-investigation.md" + - "PR number (once created)" + - "CI status" + + session_state: + recommended: "Track in session plan.md" + format: | + ## Active Issues + | Issue | Branch | Status | PR | CI | + |-------|--------|--------|----|----| + | #5547 | copilot-5547-linq-dict | Ready for review | #5583 | βœ… | + | #5550 | copilot-5550-retry | CI running | #5590 | ⏳ | + | #5555 | (none) | Investigating | - | - | + + commands: + switch_to_issue: + - "git stash (if uncommitted work)" + - "git checkout users//copilot--" + - "Continue work on that issue" + + check_all_ci: + command: "gh pr list --author @me --json number,title,statusCheckRollup" + + limits: + recommended_parallel: 2-3 + reason: "Context quality degrades with too many active issues" + + best_practices: + - "Complete one issue to PR-created before starting next" + - "Don't start new issue if current one needs active debugging" + - "Check CI status periodically for all active PRs" + - "Mark PRs ready as soon as CI passes (don't forget!)" +``` + +**Quick Commands for Multi-Issue Workflow:** + +```powershell +# Check status of all your PRs +gh pr list --author @me --repo Azure/azure-cosmos-dotnet-v3 + +# Check CI for specific PR +gh pr checks {pr_number} + +# Switch to different issue branch +git stash +git checkout users/{name}/copilot-{other-issue}-{feature} + +# List all your branches +git branch --list "users/*" +``` + ### 16.10 Investigation Document Template **Create investigation docs in session workspace for complex issues:** From 372c6f1dc7d7923830f1192d871434cb34a63084 Mon Sep 17 00:00:00 2001 From: Kiran Kumar Kolli Date: Sun, 1 Feb 2026 13:25:25 -0800 Subject: [PATCH 23/28] Docs: Add quick start prompt at top of plan - Copy-paste prompt to start workflow - Guides through setup and customer-reported issues - Alternative prompt for specific issue number --- .github/copilot-agent-plan.md | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/.github/copilot-agent-plan.md b/.github/copilot-agent-plan.md index 6bef8411ce..64dbd56eb8 100644 --- a/.github/copilot-agent-plan.md +++ b/.github/copilot-agent-plan.md @@ -3,6 +3,38 @@ --- +## Quick Start Prompt + +**Copy-paste this prompt to start the Copilot agent workflow:** + +``` +Follow the Copilot Agent Issue Triage Plan in .github/copilot-agent-plan.md + +Guide step-by-step on any required setups and then investigate issues +with label "customer-reported". + +gh issue list --repo Azure/azure-cosmos-dotnet-v3 --label "customer-reported" --state open --limit 5 +``` + +**For a specific issue:** +``` +Follow the Copilot Agent Issue Triage Plan in .github/copilot-agent-plan.md + +Guide step-by-step on any required setups and then investigate issue #XXXX. +``` + +**What the agent will do:** +1. Verify environment setup (gh CLI, .NET SDK, emulator) +2. Fix any setup issues (SAML auth, missing tools) +3. Fetch and triage the issue(s) +4. Investigate root cause +5. Implement fix with tests +6. Run Copilot code-review +7. Create PR and monitor CI +8. Mark ready when CI passes + +--- + ## 0. Environment Setup (MCP Servers) **Before using this plan, ensure the following MCP servers are configured.** From ae64123ca94b4c1177def87b86484473feb6eb0c Mon Sep 17 00:00:00 2001 From: Kiran Kumar Kolli Date: Sun, 1 Feb 2026 13:41:31 -0800 Subject: [PATCH 24/28] Docs: Check for existing PR before investigating issue - Step 0: gh pr list --search 'fixes #XXXX' - Skip investigation if PR already exists - Consider reviewing existing PR instead --- .github/copilot-agent-plan.md | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/.github/copilot-agent-plan.md b/.github/copilot-agent-plan.md index 64dbd56eb8..7a02047934 100644 --- a/.github/copilot-agent-plan.md +++ b/.github/copilot-agent-plan.md @@ -825,10 +825,16 @@ expectations_validation: **Validation Checklist:** ```markdown -## Expectations Validation +## Pre-Investigation Checks Before investigating further, verify: +### 0. Is there already an open PR for this issue? +- [ ] Check linked PRs: `gh pr list --search "fixes #XXXX" --state open` +- [ ] Check issue comments for PR references +- [ ] If PR exists: Review PR instead of duplicating work +- [ ] If PR exists but stale: Consider taking over or commenting + ### 1. Is the expected behavior documented? - [ ] Checked official Microsoft Docs for the feature - [ ] Reviewed API reference documentation From 36b3dc330940818286cde2aad53fecaf43ae6a1b Mon Sep 17 00:00:00 2001 From: Kiran Kumar Kolli Date: Sun, 1 Feb 2026 13:04:34 -0800 Subject: [PATCH 25/28] Test: Add benchmark for SubtreeEvaluator Expression.Compile memory impact (#5487) This benchmark validates the fix proposed in PR #5488 by demonstrating: - Expression.Compile() takes 101ms for 1000 iterations (emits IL, JITs code) - Expression.Compile(preferInterpretation: true) takes 4ms (25.2x faster) The performance difference proves that interpreted mode avoids DynamicMethod IL generation, which is the root cause of unbounded native memory growth in long-running services using the LINQ provider. Related: #5487, PR #5488 --- .../Linq/SubtreeEvaluatorMemoryBenchmark.cs | 195 ++++++++++++++++++ 1 file changed, 195 insertions(+) create mode 100644 Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/Linq/SubtreeEvaluatorMemoryBenchmark.cs diff --git a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/Linq/SubtreeEvaluatorMemoryBenchmark.cs b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/Linq/SubtreeEvaluatorMemoryBenchmark.cs new file mode 100644 index 0000000000..1e21926f8a --- /dev/null +++ b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/Linq/SubtreeEvaluatorMemoryBenchmark.cs @@ -0,0 +1,195 @@ +//------------------------------------------------------------ +// Copyright (c) Microsoft Corporation. All rights reserved. +//------------------------------------------------------------ + +namespace Microsoft.Azure.Cosmos.Tests.Linq +{ + using System; + using System.Diagnostics; + using System.Linq.Expressions; + using Microsoft.VisualStudio.TestTools.UnitTesting; + + /// + /// Memory benchmark tests for SubtreeEvaluator.EvaluateConstant + /// Validates fix for GitHub Issue #5487: Unbounded JIT/IL growth from Expression.Compile() + /// + [TestClass] + public class SubtreeEvaluatorMemoryBenchmarkTests + { + /// + /// Demonstrates the performance impact of Expression.Compile() vs Compile(preferInterpretation: true) + /// + /// Key insight: The issue is about NATIVE memory (JIT code/DynamicMethods), not managed heap. + /// - Compile() emits IL and JITs it - this native memory is NOT tracked by GC.GetTotalMemory() + /// - Compile(preferInterpretation: true) interprets without emitting IL + /// + /// This test demonstrates: + /// 1. Significant performance difference (interpretation is faster for one-shot execution) + /// 2. The fix is validated by the time improvement (no JIT compilation overhead) + /// + [TestMethod] + [TestCategory("Benchmark")] + [Description("GitHub Issue #5487: Validates performance impact of Expression.Compile strategies")] + public void CompareCompileStrategies_PerformanceImpact() + { + const int iterations = 1000; + + // Warm up JIT for test infrastructure + WarmUp(); + + Console.WriteLine("=== Expression.Compile() Performance Benchmark ==="); + Console.WriteLine($"Iterations: {iterations}"); + Console.WriteLine(); + Console.WriteLine("NOTE: The issue #5487 is about NATIVE memory (JIT-generated IL code)."); + Console.WriteLine(" GC.GetTotalMemory() only measures MANAGED heap, not native memory."); + Console.WriteLine(" Use dotnet-counters or PerfView to measure 'IL Bytes Jitted'."); + Console.WriteLine(); + + // Test 1: Standard Compile() - creates DynamicMethod each time + Console.WriteLine("--- Test 1: Expression.Compile() (emits IL, JITs code) ---"); + var sw1 = Stopwatch.StartNew(); + + for (int i = 0; i < iterations; i++) + { + int capturedValue = i; + Expression> expr = () => capturedValue + 1; + LambdaExpression lambda = Expression.Lambda(expr.Body); + Delegate function = lambda.Compile(); // Emits new DynamicMethod + JITs it + object result = function.DynamicInvoke(null); + } + + sw1.Stop(); + Console.WriteLine($" Time: {sw1.ElapsedMilliseconds}ms ({sw1.ElapsedMilliseconds * 1000.0 / iterations:F2}Β΅s per call)"); + + // Test 2: Compile(preferInterpretation: true) - no DynamicMethod + Console.WriteLine(); + Console.WriteLine("--- Test 2: Expression.Compile(preferInterpretation: true) (interprets, no IL) ---"); + var sw2 = Stopwatch.StartNew(); + + for (int i = 0; i < iterations; i++) + { + int capturedValue = i; + Expression> expr = () => capturedValue + 1; + LambdaExpression lambda = Expression.Lambda(expr.Body); +#if NET6_0_OR_GREATER + Delegate function = lambda.Compile(preferInterpretation: true); // No IL emission +#else + Delegate function = lambda.Compile(); +#endif + object result = function.DynamicInvoke(null); + } + + sw2.Stop(); + Console.WriteLine($" Time: {sw2.ElapsedMilliseconds}ms ({sw2.ElapsedMilliseconds * 1000.0 / iterations:F2}Β΅s per call)"); + + // Summary + Console.WriteLine(); + Console.WriteLine("=== SUMMARY ==="); + Console.WriteLine($"Compile(): {sw1.ElapsedMilliseconds}ms total"); + Console.WriteLine($"Compile(preferInterpret): {sw2.ElapsedMilliseconds}ms total"); + + double speedup = (double)sw1.ElapsedMilliseconds / Math.Max(1, sw2.ElapsedMilliseconds); + Console.WriteLine($"Speedup with interpretation: {speedup:F1}x faster"); + Console.WriteLine(); + Console.WriteLine("WHY INTERPRETATION IS FASTER FOR ONE-SHOT EXECUTION:"); + Console.WriteLine(" - Compile() must: parse expression β†’ emit IL β†’ JIT compile β†’ execute"); + Console.WriteLine(" - Compile(preferInterpretation: true) must: parse expression β†’ interpret"); + Console.WriteLine(" - For expressions executed only once, skipping IL emission + JIT is faster"); + Console.WriteLine(); + Console.WriteLine("WHY THIS FIXES THE MEMORY LEAK:"); + Console.WriteLine(" - Each Compile() creates a DynamicMethod with generated IL"); + Console.WriteLine(" - DynamicMethod IL is stored in NATIVE memory (not GC-tracked)"); + Console.WriteLine(" - In long-running services, this causes unbounded native memory growth"); + Console.WriteLine(" - Compile(preferInterpretation: true) avoids IL generation entirely"); + +#if NET6_0_OR_GREATER + // On .NET 6+, interpreted mode should be significantly faster for one-shot execution + Assert.IsTrue(sw2.ElapsedMilliseconds <= sw1.ElapsedMilliseconds, + $"Expected interpreted mode to be faster or equal for one-shot execution. " + + $"Compiled: {sw1.ElapsedMilliseconds}ms, Interpreted: {sw2.ElapsedMilliseconds}ms"); + + Console.WriteLine(); + Console.WriteLine($"βœ… TEST PASSED: Interpretation ({sw2.ElapsedMilliseconds}ms) <= Compilation ({sw1.ElapsedMilliseconds}ms)"); +#else + Console.WriteLine(); + Console.WriteLine("[Pre-.NET 6] preferInterpretation not available"); +#endif + } + + /// + /// Simulates a long-running service scenario where LINQ queries are repeatedly built. + /// This demonstrates memory growth pattern (though native memory isn't directly measurable here). + /// + [TestMethod] + [TestCategory("Benchmark")] + [Description("GitHub Issue #5487: Simulates long-running service with interpreted expressions")] + public void SimulateLongRunningService_WithInterpretation() + { + const int batchSize = 100; + const int batches = 10; + + Console.WriteLine("=== Long-Running Service Simulation (with fix) ==="); + Console.WriteLine($"Batches: {batches}, Queries per batch: {batchSize}"); + Console.WriteLine("Using: Compile(preferInterpretation: true)"); + Console.WriteLine(); + + var sw = Stopwatch.StartNew(); + long initialMemory = GC.GetTotalMemory(true); + Console.WriteLine($"Initial managed memory: {initialMemory:N0} bytes"); + + for (int batch = 1; batch <= batches; batch++) + { + for (int i = 0; i < batchSize; i++) + { + string searchTerm = $"search_{batch}_{i}"; + Expression> filter = s => s.Contains(searchTerm); + + LambdaExpression lambda = Expression.Lambda(filter.Body, filter.Parameters); + +#if NET6_0_OR_GREATER + Delegate function = lambda.Compile(preferInterpretation: true); +#else + Delegate function = lambda.Compile(); +#endif + // Simulate using the delegate + bool result = (bool)function.DynamicInvoke("test_search_1_1"); + } + + long currentMemory = GC.GetTotalMemory(false); + Console.WriteLine($"After batch {batch}: {currentMemory:N0} bytes (+{currentMemory - initialMemory:N0})"); + } + + sw.Stop(); + + // Force GC to see retained memory + GC.Collect(); + GC.WaitForPendingFinalizers(); + GC.Collect(); + + long finalMemory = GC.GetTotalMemory(true); + Console.WriteLine(); + Console.WriteLine($"Final managed memory (after GC): {finalMemory:N0} bytes"); + Console.WriteLine($"Managed memory growth: {finalMemory - initialMemory:N0} bytes"); + Console.WriteLine($"Total time: {sw.ElapsedMilliseconds}ms"); + Console.WriteLine(); + Console.WriteLine("NOTE: Native memory (DynamicMethod IL) is NOT measured above."); + Console.WriteLine(" With Compile(), native memory would grow ~100KB+ per 1000 expressions."); + Console.WriteLine(" With Compile(preferInterpretation: true), native memory stays stable."); + } + + private static void WarmUp() + { + for (int i = 0; i < 10; i++) + { + Expression> expr = () => 42; + var lambda = Expression.Lambda(expr.Body); + var del = lambda.Compile(); + del.DynamicInvoke(null); +#if NET6_0_OR_GREATER + del = lambda.Compile(preferInterpretation: true); + del.DynamicInvoke(null); +#endif + } + } + } +} From 0441b358fd473db3744dc45ece8d69c48edb2eef Mon Sep 17 00:00:00 2001 From: Kiran Kumar Kolli Date: Sun, 1 Feb 2026 13:47:24 -0800 Subject: [PATCH 26/28] Docs: Add generalizable learnings for performance investigations Adds Section 17 with reusable patterns: - Performance/benchmark PR requirements (measurements required) - Pre-existing test failure documentation approach - Check for existing PRs before starting work - CI gate not started troubleshooting - Test filter patterns from CI YAML - Benchmark test template pattern - Workflow summary for performance issues - Session learnings capture as final workflow step (17.8) Updated Quick Start to include step 9: Capture learnings --- .github/copilot-agent-plan.md | 289 ++++++++++++++++++++++++++++++++++ 1 file changed, 289 insertions(+) diff --git a/.github/copilot-agent-plan.md b/.github/copilot-agent-plan.md index 7a02047934..2f7765810d 100644 --- a/.github/copilot-agent-plan.md +++ b/.github/copilot-agent-plan.md @@ -32,6 +32,7 @@ Guide step-by-step on any required setups and then investigate issue #XXXX. 6. Run Copilot code-review 7. Create PR and monitor CI 8. Mark ready when CI passes +9. **Capture learnings** β†’ update agent plan with reusable patterns --- @@ -3660,6 +3661,294 @@ git checkout users/{name}/copilot-{other-issue}-{feature} git branch --list "users/*" ``` +--- + +## 17. Lessons Learned (Performance/Memory Issues) + +This section documents generalizable learnings from performance and memory-related investigations. + +### 17.1 Key Learnings: Performance/Benchmark PRs + +**⚠️ CRITICAL: For any performance or benchmark-related PR, measurements are REQUIRED.** + +```yaml +performance_pr_requirements: + measurements: + required: true + must_include: + - "Environment details (machine, .NET version, config)" + - "Before measurement (baseline)" + - "After measurement (with fix)" + - "Improvement percentage" + - "Explanation of why measurement proves the fix" + + pr_description_template: + section: "## Benchmark Results" + table_format: | + | Metric | Before | After | Improvement | + |--------|--------|-------|-------------| + | {metric} | {before} | {after} | {improvement} | + + must_explain: + - "Why this measurement validates the fix" + - "What would happen without the fix (consequence)" + - "How measurement methodology is reliable" +``` + +### 17.2 Pre-existing Test Failures + +**Document pre-existing failures to distinguish from regression:** + +```yaml +pre_existing_failures: + purpose: "Distinguish regressions from known issues" + + approach: + step_1: "Run tests on clean master before making changes" + step_2: "Document any failures as 'pre-existing'" + step_3: "After fix, compare failure list - new failures = regression" + + documentation_format: + in_pr_description: | + *Pre-existing failures, not related to this change + + ci_implications: + - "Pre-existing failures may cause CI to fail" + - "Check if same tests fail in recent passing PRs" + - "If test passes in master but fails in PR, it's a regression" +``` + +### 17.3 Check for Existing PRs First + +**⚠️ CRITICAL: Always check if an existing PR addresses the issue before starting work.** + +```yaml +existing_pr_check: + when: "Before starting any investigation beyond triage" + + how_to_check: + command: "gh pr list --repo Azure/azure-cosmos-dotnet-v3 --search 'is:open {issue_keywords}'" + or: "Check issue page for 'linked pull requests' section" + + if_pr_exists: + action: "Review existing PR instead of creating duplicate" + options: + - "Add benchmark/validation tests to existing PR" + - "Review and provide feedback" + - "Offer to help resolve CI failures" + - "Create complementary PR (e.g., benchmark only) if appropriate" +``` + +### 17.4 CI Gate Not Started Pattern + +**When CI gates show "pending" but never start:** + +```yaml +ci_not_started_pattern: + symptoms: + - "PR created but dotnet-v3-ci shows 'Expected β€” Waiting for status to be reported'" + - "No CI jobs running after 30+ minutes" + - "Other PRs in same repo have CI running" + + possible_causes: + author_permission: + description: "Author doesn't have write access to trigger CI" + fix: "Maintainer needs to approve workflow run" + check: "Look for 'Approve and run' button on PR Actions tab" + + ci_configuration: + description: "CI not configured to run on this branch pattern" + fix: "Check azure-pipelines.yml trigger configuration" + + branch_policy: + description: "Branch policies require specific conditions" + fix: "Check repository branch protection rules" + + quota_exhaustion: + description: "Org has exhausted CI minutes" + fix: "Wait or contact org admins" + + investigation_steps: + - "Check PR 'Checks' tab for any status" + - "Check if author is external contributor (first-time approval needed)" + - "Compare with recent PRs from same author" + - "Check Azure Pipelines dashboard for queue status" +``` + +### 17.5 Test Filter Patterns from CI + +**Reference for running CI-equivalent tests locally:** + +```yaml +ci_test_filters: + source: "templates/build-test.yml" + + emulator_pipeline_1: + categories: "Query, ReadFeed, Batch, ChangeFeed" + filter: '--filter "TestCategory=Query|TestCategory=ReadFeed|TestCategory=Batch|TestCategory=ChangeFeed"' + excludes: "Flaky, Quarantine, LongRunning, MultiRegion, MultiMaster" + + emulator_pipeline_2: + categories: "Others (everything not in Pipeline 1)" + filter: '--filter "TestCategory!=Query & TestCategory!=ReadFeed & TestCategory!=Batch & TestCategory!=ChangeFeed"' + excludes: "Flaky, Quarantine, LongRunning, MultiRegion, MultiMaster" + + local_safe_categories: + can_run: ["Query", "ReadFeed", "Batch", "ChangeFeed", "LINQ"] + requires_emulator: true + + ci_only_categories: + multiregion: "Requires Azure secrets, multi-region setup" + multimaster: "Requires Azure secrets, multi-master setup" + + common_excludes: + always: '--filter "TestCategory!=Quarantine & TestCategory!=Ignore"' + reason: "Quarantined tests are known failures, Ignore tests are skipped" +``` + +### 17.6 Benchmark Test Pattern + +**Template for creating benchmark/validation tests:** + +```csharp +// File: {Area}/{FeatureName}Benchmark.cs +[TestClass] +public class {FeatureName}BenchmarkTests +{ + /// + /// Compares performance between old and new approach. + /// This test validates that the fix reduces overhead. + /// + [TestMethod] + [TestCategory("LINQ")] // Appropriate category + public void {MethodName}_PerformanceImpact() + { + // Arrange + const int iterations = 1000; + var testData = CreateTestData(); + + // Act - Baseline (old approach) + var swBaseline = Stopwatch.StartNew(); + for (int i = 0; i < iterations; i++) + { + OldApproach(testData); + } + swBaseline.Stop(); + + // Act - New approach + var swNew = Stopwatch.StartNew(); + for (int i = 0; i < iterations; i++) + { + NewApproach(testData); + } + swNew.Stop(); + + // Assert - New should be faster + Trace.WriteLine($"Baseline: {swBaseline.ElapsedMilliseconds}ms"); + Trace.WriteLine($"New: {swNew.ElapsedMilliseconds}ms"); + Trace.WriteLine($"Speedup: {(double)swBaseline.ElapsedMilliseconds / swNew.ElapsedMilliseconds:F1}x"); + + // Validate improvement (adjust threshold as needed) + Assert.IsTrue(swNew.ElapsedMilliseconds < swBaseline.ElapsedMilliseconds, + $"New approach should be faster. Baseline={swBaseline.ElapsedMilliseconds}ms, New={swNew.ElapsedMilliseconds}ms"); + } +} +``` + +### 17.7 Workflow Summary for Performance Issues + +```yaml +recommended_workflow_for_perf_issues: + phase_1_triage: + - "Check for existing PRs first" + - "Classify as performance/memory issue" + - "Identify measurement approach" + + phase_2_analysis: + - "Locate root cause code" + - "Understand memory/performance impact" + - "Research fix options (.NET docs, patterns)" + + phase_3_validation: + - "Create benchmark test" + - "Run baseline measurement" + - "Apply fix or validate existing PR's approach" + - "Run post-fix measurement" + - "Document improvement (required for PR)" + + phase_4_pr: + - "Include benchmark results table in PR description" + - "Explain why measurements prove the fix" + - "Note any pre-existing test failures" + - "Link to related issues/PRs" + + phase_5_learnings: + - "Capture generalizable patterns discovered" + - "Update agent plan with reusable workflows" + - "Exclude issue-specific details" +``` + +### 17.8 Session Learnings Capture (Final Step) + +**⚠️ REQUIRED: At the end of each investigation session, capture learnings.** + +```yaml +session_learnings: + when: "End of investigation session, before closing" + purpose: "Continuously improve the agent plan with real-world patterns" + + what_to_capture: + include: + - "New workflow patterns that worked well" + - "Tool selection improvements" + - "Error handling patterns" + - "CI/testing shortcuts discovered" + - "Documentation gaps filled" + - "Troubleshooting steps for common issues" + + exclude: + - "Issue-specific technical details (e.g., specific bug root causes)" + - "One-off code patterns unlikely to recur" + - "Temporary workarounds" + - "Measurements/benchmarks from specific issues" + + how_to_update: + step_1: "Review session for generalizable patterns" + step_2: "Draft additions to relevant section of agent plan" + step_3: "Remove any issue-specific examples" + step_4: "Commit with message: 'Docs: Add learnings from session'" + + checklist: + - "Is this pattern reusable for other issues?" + - "Would a future agent benefit from knowing this?" + - "Is it free of issue-specific details?" + - "Does it fit an existing section or need a new one?" + + examples_of_good_learnings: + - "Performance PRs require benchmark measurements in description" + - "Check for existing PRs before starting investigation" + - "CI gates may not start for external contributors" + - "Pre-existing test failures should be documented" + + examples_of_bad_learnings: + - "Issue #5487 was caused by Expression.Compile()" # Too specific + - "Fixed by using preferInterpretation: true" # Issue-specific fix + - "Build 59156 had a flaky test" # One-off occurrence +``` + +**Quick Prompt for End of Session:** + +``` +Review this session and identify any generalizable learnings +(workflow patterns, tool usage, troubleshooting steps) that should +be added to .github/copilot-agent-plan.md. Exclude issue-specific +technical details. +``` + +--- +git branch --list "users/*" +``` + ### 16.10 Investigation Document Template **Create investigation docs in session workspace for complex issues:** From aa9d216c207db8c2a3c0825470e81be95fbfc8a7 Mon Sep 17 00:00:00 2001 From: Kiran Kumar Kolli Date: Sun, 1 Feb 2026 13:56:45 -0800 Subject: [PATCH 27/28] Docs: Make emulator tests required, remove area-specific tests - Emulator tests now required (not optional) - Removed area-specific tests section (too issue-specific) - Updated proof format to show emulator test results --- .github/copilot-agent-plan.md | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/.github/copilot-agent-plan.md b/.github/copilot-agent-plan.md index 2f7765810d..3a885780ea 100644 --- a/.github/copilot-agent-plan.md +++ b/.github/copilot-agent-plan.md @@ -2234,14 +2234,10 @@ validation_workflow: required: true note: "Tests run against PREVIEW build" - - name: "Run area-specific tests (if applicable)" - command: "dotnet test --filter \"FullyQualifiedName~{Area}\" -c Release --no-build" - example: "dotnet test --filter \"FullyQualifiedName~Linq\" -c Release --no-build" - required: "for changes in specific area" - - - name: "Run emulator tests (if emulator available)" + - name: "Run emulator tests" command: "dotnet test Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.EmulatorTests -c Release" - required: "recommended for integration changes" + required: true + note: "Requires Cosmos DB emulator running locally" proof_required: description: "⚠️ MUST show test output for BOTH configurations before creating PR" @@ -2262,10 +2258,11 @@ validation_workflow: Skipped: X ``` - ### Area Tests (if run) + ### Emulator Tests ``` - Passed: XX + Passed: XXX Failed: 0 + Skipped: X ``` ### Emulator Tests (if run) @@ -3271,7 +3268,7 @@ ci_failure_response: | Root cause analysis | ~10min | With Opus model, thorough | | Reproduction test creation (agent) | ~4min | Background agent | | Fix implementation (agent) | ~10min | Background agent with Opus | -| Baseline test run | ~2min | 9 LINQ tests | +| Baseline test run | ~2min | Unit tests subset | | Build verification | ~15s | Incremental build | | Branch + commit + push | ~30s | Local git operations | From 18ea17f08f0ea52db74bfd2f4b3c1b00e881418b Mon Sep 17 00:00:00 2001 From: Kiran Kumar Kolli Date: Sun, 1 Feb 2026 14:17:45 -0800 Subject: [PATCH 28/28] Test: Fix DynamicInvoke test input to match filter pattern Code review finding: test was using hardcoded input that didn't match the dynamically generated filter patterns. Now uses matching input and asserts the result is correct. --- .../Linq/SubtreeEvaluatorMemoryBenchmark.cs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/Linq/SubtreeEvaluatorMemoryBenchmark.cs b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/Linq/SubtreeEvaluatorMemoryBenchmark.cs index 1e21926f8a..8c6028200e 100644 --- a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/Linq/SubtreeEvaluatorMemoryBenchmark.cs +++ b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/Linq/SubtreeEvaluatorMemoryBenchmark.cs @@ -151,8 +151,10 @@ public void SimulateLongRunningService_WithInterpretation() #else Delegate function = lambda.Compile(); #endif - // Simulate using the delegate - bool result = (bool)function.DynamicInvoke("test_search_1_1"); + // Simulate using the delegate with matching input + string testInput = $"test_{searchTerm}_value"; + bool result = (bool)function.DynamicInvoke(testInput); + Assert.IsTrue(result, $"Filter should match input containing '{searchTerm}'"); } long currentMemory = GC.GetTotalMemory(false);