diff --git a/.github/workflows/python-integration-tests.yml b/.github/workflows/python-integration-tests.yml index f2fb5c6448..48d15bceda 100644 --- a/.github/workflows/python-integration-tests.yml +++ b/.github/workflows/python-integration-tests.yml @@ -87,6 +87,14 @@ jobs: -n logical --dist worksteal --timeout=120 --session-timeout=900 --timeout_method thread --retries 2 --retry-delay 5 + --junitxml=pytest.xml + - name: Upload test results + if: always() + uses: actions/upload-artifact@v7 + with: + name: test-results-openai + path: ./python/pytest.xml + if-no-files-found: ignore # Azure OpenAI integration tests python-tests-azure-openai: @@ -130,6 +138,14 @@ jobs: -n logical --dist worksteal --timeout=120 --session-timeout=900 --timeout_method thread --retries 2 --retry-delay 5 + --junitxml=pytest.xml + - name: Upload test results + if: always() + uses: actions/upload-artifact@v7 + with: + name: test-results-azure-openai + path: ./python/pytest.xml + if-no-files-found: ignore # Misc integration tests (Anthropic, Hyperlight, Ollama, MCP) python-tests-misc-integration: @@ -173,6 +189,14 @@ jobs: -n logical --dist worksteal --timeout=120 --session-timeout=900 --timeout_method thread --retries 2 --retry-delay 30 + --junitxml=pytest.xml + - name: Upload test results + if: always() + uses: actions/upload-artifact@v7 + with: + name: test-results-misc + path: ./python/pytest.xml + if-no-files-found: ignore - name: Stop local MCP server if: always() shell: bash @@ -249,6 +273,14 @@ jobs: -x --timeout=360 --session-timeout=900 --timeout_method thread --retries 2 --retry-delay 5 + --junitxml=pytest.xml + - name: Upload test results + if: always() + uses: actions/upload-artifact@v7 + with: + name: test-results-functions + path: ./python/pytest.xml + if-no-files-found: ignore # Foundry integration tests python-tests-foundry: @@ -295,6 +327,14 @@ jobs: -n logical --dist worksteal --timeout=120 --session-timeout=900 --timeout_method thread --retries 2 --retry-delay 5 + --junitxml=pytest.xml + - name: Upload test results + if: always() + uses: actions/upload-artifact@v7 + with: + name: test-results-foundry + path: ./python/pytest.xml + if-no-files-found: ignore # Azure Cosmos integration tests python-tests-cosmos: @@ -339,7 +379,80 @@ jobs: echo "Cosmos DB emulator did not become ready in time." 
>&2 exit 1 - name: Test with pytest (Cosmos integration) - run: uv run --directory packages/azure-cosmos poe integration-tests -n logical --dist worksteal --timeout=120 --session-timeout=900 --timeout_method thread --retries 2 --retry-delay 5 + run: uv run --directory packages/azure-cosmos poe integration-tests -n logical --dist worksteal --timeout=120 --session-timeout=900 --timeout_method thread --retries 2 --retry-delay 5 --junitxml=${{ github.workspace }}/python/pytest.xml + - name: Upload test results + if: always() + uses: actions/upload-artifact@v7 + with: + name: test-results-cosmos + path: ./python/pytest.xml + if-no-files-found: ignore + + # Flaky test trend report (aggregates per-job JUnit XML results) + python-flaky-test-report: + name: Flaky Test Report + if: > + always() && + (contains(join(needs.*.result, ','), 'success') || + contains(join(needs.*.result, ','), 'failure')) + needs: + [ + python-tests-openai, + python-tests-azure-openai, + python-tests-misc-integration, + python-tests-functions, + python-tests-foundry, + python-tests-cosmos, + ] + runs-on: ubuntu-latest + defaults: + run: + working-directory: python + steps: + - uses: actions/checkout@v6 + with: + ref: ${{ inputs.checkout-ref }} + persist-credentials: false + - name: Set up python and install the project + uses: ./.github/actions/python-setup + with: + python-version: ${{ env.UV_PYTHON }} + os: ${{ runner.os }} + - name: Download all test results from current run + uses: actions/download-artifact@v4 + with: + pattern: test-results-* + path: test-results/ + - name: Restore flaky report history cache + uses: actions/cache/restore@v4 + with: + path: python/flaky-report-history.json + key: flaky-report-history-integration-${{ github.run_id }} + restore-keys: | + flaky-report-history-integration- + - name: Generate trend report + run: > + uv run python scripts/flaky_report/aggregate.py + ../test-results/ + flaky-report-history.json + flaky-test-report.md + - name: Post to Job Summary + if: always() + run: cat flaky-test-report.md >> $GITHUB_STEP_SUMMARY + - name: Save flaky report history cache + if: always() + uses: actions/cache/save@v4 + with: + path: python/flaky-report-history.json + key: flaky-report-history-integration-${{ github.run_id }} + - name: Upload unified trend report + if: always() + uses: actions/upload-artifact@v7 + with: + name: flaky-test-report + path: | + python/flaky-test-report.md + python/flaky-report-history.json python-integration-tests-check: if: always() diff --git a/.github/workflows/python-merge-tests.yml b/.github/workflows/python-merge-tests.yml index dd48b268df..843253e788 100644 --- a/.github/workflows/python-merge-tests.yml +++ b/.github/workflows/python-merge-tests.yml @@ -181,6 +181,13 @@ jobs: display-options: fEX fail-on-empty: false title: OpenAI integration test results + - name: Upload test results + if: always() + uses: actions/upload-artifact@v7 + with: + name: test-results-openai + path: ./python/pytest.xml + if-no-files-found: ignore # Azure OpenAI integration tests python-tests-azure-openai: @@ -244,6 +251,13 @@ jobs: display-options: fEX fail-on-empty: false title: Azure OpenAI integration test results + - name: Upload test results + if: always() + uses: actions/upload-artifact@v7 + with: + name: test-results-azure-openai + path: ./python/pytest.xml + if-no-files-found: ignore # Misc integration tests (Anthropic, Ollama, MCP) python-tests-misc-integration: @@ -321,6 +335,13 @@ jobs: display-options: fEX fail-on-empty: false title: Misc integration test results + - 
name: Upload test results + if: always() + uses: actions/upload-artifact@v7 + with: + name: test-results-misc + path: ./python/pytest.xml + if-no-files-found: ignore # Azure Functions + Durable Task integration tests python-tests-functions: @@ -392,6 +413,13 @@ jobs: display-options: fEX fail-on-empty: false title: Functions integration test results + - name: Upload test results + if: always() + uses: actions/upload-artifact@v7 + with: + name: test-results-functions + path: ./python/pytest.xml + if-no-files-found: ignore python-tests-foundry: name: Python Integration Tests - Foundry @@ -409,6 +437,10 @@ jobs: FOUNDRY_MODEL: ${{ vars.FOUNDRY_MODEL }} FOUNDRY_AGENT_NAME: ${{ vars.FOUNDRY_AGENT_NAME }} FOUNDRY_AGENT_VERSION: ${{ vars.FOUNDRY_AGENT_VERSION }} + FOUNDRY_MODELS_ENDPOINT: ${{ vars.FOUNDRY_MODELS_ENDPOINT || '' }} + FOUNDRY_MODELS_API_KEY: ${{ secrets.FOUNDRY_MODELS_API_KEY || '' }} + FOUNDRY_EMBEDDING_MODEL: ${{ vars.FOUNDRY_EMBEDDING_MODEL || '' }} + FOUNDRY_IMAGE_EMBEDDING_MODEL: ${{ vars.FOUNDRY_IMAGE_EMBEDDING_MODEL || '' }} LOCAL_MCP_URL: ${{ vars.LOCAL_MCP__URL }} defaults: run: @@ -448,6 +480,13 @@ jobs: display-options: fEX fail-on-empty: false title: Test results + - name: Upload test results + if: always() + uses: actions/upload-artifact@v7 + with: + name: test-results-foundry + path: ./python/pytest.xml + if-no-files-found: ignore # TODO: Add python-tests-lab @@ -497,7 +536,7 @@ jobs: echo "Cosmos DB emulator did not become ready in time." >&2 exit 1 - name: Test with pytest (Cosmos integration) - run: uv run --directory packages/azure-cosmos poe integration-tests -n logical --dist worksteal --timeout=120 --session-timeout=900 --timeout_method thread --retries 2 --retry-delay 5 --junitxml=pytest.xml + run: uv run --directory packages/azure-cosmos poe integration-tests -n logical --dist worksteal --timeout=120 --session-timeout=900 --timeout_method thread --retries 2 --retry-delay 5 --junitxml=${{ github.workspace }}/python/pytest.xml working-directory: ./python - name: Surface failing tests if: always() @@ -508,6 +547,76 @@ jobs: display-options: fEX fail-on-empty: false title: Cosmos integration test results + - name: Upload test results + if: always() + uses: actions/upload-artifact@v7 + with: + name: test-results-cosmos + path: ./python/pytest.xml + if-no-files-found: ignore + + # Flaky test trend report (aggregates per-job JUnit XML results) + python-flaky-test-report: + name: Flaky Test Report + if: > + always() && + (contains(join(needs.*.result, ','), 'success') || + contains(join(needs.*.result, ','), 'failure')) + needs: + [ + python-tests-openai, + python-tests-azure-openai, + python-tests-misc-integration, + python-tests-functions, + python-tests-foundry, + python-tests-cosmos, + ] + runs-on: ubuntu-latest + defaults: + run: + working-directory: python + steps: + - uses: actions/checkout@v6 + - name: Set up python and install the project + uses: ./.github/actions/python-setup + with: + python-version: ${{ env.UV_PYTHON }} + os: ${{ runner.os }} + - name: Download all test results from current run + uses: actions/download-artifact@v4 + with: + pattern: test-results-* + path: test-results/ + - name: Restore flaky report history cache + uses: actions/cache/restore@v4 + with: + path: python/flaky-report-history.json + key: flaky-report-history-merge-${{ github.run_id }} + restore-keys: | + flaky-report-history-merge- + - name: Generate trend report + run: > + uv run python scripts/flaky_report/aggregate.py + ../test-results/ + flaky-report-history.json + 
flaky-test-report.md + - name: Post to Job Summary + if: always() + run: cat flaky-test-report.md >> $GITHUB_STEP_SUMMARY + - name: Save flaky report history cache + if: always() + uses: actions/cache/save@v4 + with: + path: python/flaky-report-history.json + key: flaky-report-history-merge-${{ github.run_id }} + - name: Upload unified trend report + if: always() + uses: actions/upload-artifact@v7 + with: + name: flaky-test-report + path: | + python/flaky-test-report.md + python/flaky-report-history.json python-integration-tests-check: if: always() diff --git a/python/scripts/flaky_report/__init__.py b/python/scripts/flaky_report/__init__.py new file mode 100644 index 0000000000..e5a0eeb0ca --- /dev/null +++ b/python/scripts/flaky_report/__init__.py @@ -0,0 +1,11 @@ +# Copyright (c) Microsoft. All rights reserved. + +"""Flaky test report aggregation and trend generation. + +Parses JUnit XML (``pytest.xml``) files produced by each CI job, merges +them with historical data, and generates a markdown trend report showing +per-test status across the last N runs. + +Usage: + uv run python -m scripts.flaky_report +""" diff --git a/python/scripts/flaky_report/__main__.py b/python/scripts/flaky_report/__main__.py new file mode 100644 index 0000000000..89969baae6 --- /dev/null +++ b/python/scripts/flaky_report/__main__.py @@ -0,0 +1,20 @@ +# Copyright (c) Microsoft. All rights reserved. + +"""CLI entry point for the flaky test report tool. + +Usage: + uv run python -m scripts.flaky_report + +Example (from python/ directory): + uv run python -m scripts.flaky_report \\ + ../flaky-reports/ \\ + flaky-report-history.json \\ + flaky-test-report.md +""" + +import sys + +from scripts.flaky_report.aggregate import main + +if __name__ == "__main__": + sys.exit(main()) diff --git a/python/scripts/flaky_report/aggregate.py b/python/scripts/flaky_report/aggregate.py new file mode 100644 index 0000000000..e07a5e136a --- /dev/null +++ b/python/scripts/flaky_report/aggregate.py @@ -0,0 +1,396 @@ +# Copyright (c) Microsoft. All rights reserved. + +"""Aggregate per-provider JUnit XML test results and generate a trend report. + +Parses ``pytest.xml`` (JUnit XML) files produced by each CI job, merges them +into a single run, combines with historical data, and generates a markdown +trend table — the same pattern used by ``scripts/sample_validation/aggregate.py``. + +Usage (from CI): + python aggregate.py + +The reports directory is expected to contain subdirectories named +``test-results-/`` each containing a ``pytest.xml`` file +(created by ``actions/download-artifact``). +""" + +from __future__ import annotations + +import json +import sys +import xml.etree.ElementTree as ET +from datetime import datetime, timezone +from pathlib import Path +from typing import Any + +MAX_HISTORY = 5 + +STATUS_EMOJI = { + "passed": "✅", + "failed": "❌", + "skipped": "⏭️", + "xfailed": "⚠️", + "error": "❌", +} + + +def _format_run_label(timestamp: str) -> str: + """Format a timestamp as a compact column label (e.g. '04-16 00:57').""" + try: + dt = datetime.fromisoformat(timestamp) + return dt.strftime("%m-%d %H:%M") + except (ValueError, TypeError): + return timestamp[:16] + + +def _derive_provider(directory_name: str) -> str: + """Derive a provider label from a report directory name. 
+ + ``test-results-openai`` → ``OpenAI`` + ``test-results-azure-openai`` → ``Azure OpenAI`` + """ + raw = directory_name.replace("test-results-", "") + known = { + "openai": "OpenAI", + "azure-openai": "Azure OpenAI", + "misc": "Misc (Anthropic, Ollama, MCP)", + "functions": "Functions", + "foundry": "Foundry", + "cosmos": "Cosmos", + "unit": "Unit", + } + if raw in known: + return known[raw] + parts = raw.split("-") + return " ".join(p.capitalize() for p in parts) + + +def _parse_junit_xml(xml_path: Path) -> list[dict[str, str]]: + """Parse a JUnit XML file and return a list of test result dicts. + + Each dict has keys: ``nodeid``, ``status``, ``duration``, ``message``. + """ + results: list[dict[str, str]] = [] + try: + tree = ET.parse(xml_path) # noqa: S314 + except ET.ParseError as exc: + print(f"Warning: failed to parse JUnit XML report '{xml_path}': {exc}", file=sys.stderr) + return results + root = tree.getroot() + + # Handle both ... and ... layouts + testcases: list[ET.Element] = [] + if root.tag == "testsuites": + for suite in root.findall("testsuite"): + testcases.extend(suite.findall("testcase")) + elif root.tag == "testsuite": + testcases = list(root.findall("testcase")) + + for tc in testcases: + classname = tc.get("classname", "") + name = tc.get("name", "") + duration = tc.get("time", "0") + + # Use classname::name as a stable identifier. + # pytest writes classname as the dotted module path (possibly including + # a test class), e.g. "packages.openai.tests.openai.test_chat_client" + # or "packages.openai.tests.openai.test_chat_client.TestClass". + nodeid = f"{classname}::{name}" if classname else name + + # Extract module/file name from classname for display context. + # pytest writes classname as a dotted path. For tests inside a class + # it appends the class name, e.g.: + # "packages.foundry.tests.foundry.test_foundry_embedding_client.TestFoundryEmbeddingIntegration" + # We want the file-level module: "test_foundry_embedding_client" + if classname: + parts = classname.rsplit(".", 2) + # If the last segment starts with uppercase it's a class name — take the one before it + if len(parts) >= 2 and parts[-1][0:1].isupper(): + module = parts[-2] + else: + module = parts[-1] + else: + module = "" + + # Determine status from child elements + failure = tc.find("failure") + error = tc.find("error") + skipped = tc.find("skipped") + + if failure is not None: + status = "failed" + message = failure.get("message", "") + elif error is not None: + status = "error" + message = error.get("message", "") + elif skipped is not None: + # pytest marks xfail as + skip_type = skipped.get("type", "") + status = "xfailed" if "xfail" in skip_type else "skipped" + message = skipped.get("message", "") + else: + status = "passed" + message = "" + + results.append({ + "nodeid": nodeid, + "status": status, + "duration": duration, + "message": message, + "module": module, + }) + + return results + + +# --------------------------------------------------------------------------- +# Loading +# --------------------------------------------------------------------------- + + +def load_current_run(reports_dir: Path) -> dict[str, Any]: + """Load per-provider JUnit XML reports from the current CI run and merge. + + Args: + reports_dir: Directory containing ``test-results-/`` subdirs. + + Returns: + Merged run dict with ``timestamp``, ``summary``, ``results``. 
+ """ + combined_results: dict[str, dict[str, str]] = {} # nodeid → {status, provider} + + # actions/download-artifact creates: reports_dir/test-results-openai/pytest.xml + xml_files: list[tuple[str, Path]] = [] + if reports_dir.is_dir(): + for subdir in sorted(reports_dir.iterdir()): + if subdir.is_dir(): + xml_file = subdir / "pytest.xml" + if xml_file.exists(): + xml_files.append((subdir.name, xml_file)) + + if not xml_files: + print(f"Warning: No pytest.xml files found in {reports_dir}") + return { + "timestamp": datetime.now(timezone.utc).isoformat(), + "summary": { + "total": 0, + "passed": 0, + "failed": 0, + "skipped": 0, + }, + "results": {}, + } + + for dir_name, xml_file in xml_files: + print(f" Loading: {xml_file}") + provider = _derive_provider(dir_name) + tests = _parse_junit_xml(xml_file) + for test in tests: + combined_results[test["nodeid"]] = { + "status": test["status"], + "provider": provider, + "module": test.get("module", ""), + } + + # Build summary counts using mutually exclusive status buckets. + # Errors are folded into the failed count for display purposes. + statuses = [r["status"] for r in combined_results.values()] + summary = { + "total": len(statuses), + "passed": statuses.count("passed"), + "failed": statuses.count("failed") + statuses.count("error"), + "skipped": statuses.count("skipped"), + } + + return { + "timestamp": datetime.now(timezone.utc).isoformat(), + "summary": summary, + "results": combined_results, + } + + +def load_history(history_path: Path) -> list[dict[str, Any]]: + """Load previous run history from a cache file.""" + if history_path.exists(): + with open(history_path, encoding="utf-8") as f: + data = json.load(f) + runs = data.get("runs", []) + print(f" Loaded {len(runs)} previous run(s) from history") + return runs + print(" No previous history found") + return [] + + +def save_history(history_path: Path, runs: list[dict[str, Any]]) -> None: + """Save run history, keeping only the last ``MAX_HISTORY`` entries.""" + history_path.parent.mkdir(parents=True, exist_ok=True) + trimmed = runs[-MAX_HISTORY:] + with open(history_path, "w", encoding="utf-8") as f: + json.dump({"runs": trimmed}, f, indent=2) + print(f" Saved {len(trimmed)} run(s) to history") + + +# --------------------------------------------------------------------------- +# Report generation +# --------------------------------------------------------------------------- + + +def _short_name(nodeid: str) -> str: + """Extract a short test name from a full nodeid. 
+ + ``packages.openai.tests.openai.test_openai_chat_client::test_integration_options`` + → ``test_integration_options`` + """ + return nodeid.split("::")[-1] if "::" in nodeid else nodeid + + +def generate_trend_report(runs: list[dict[str, Any]]) -> str: + """Generate a markdown trend report from run history.""" + lines = [ + "# 🔬 Flaky Test Report", + "", + f"*Generated: {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M UTC')}*", + "", + ] + + # --- Overall status table (most recent first) --- + lines.append("## Overall Status (Last 5 Runs)") + lines.append("") + lines.append("| Run | Total | ✅ Passed | ❌ Failed | ⏭️ Skipped |") + lines.append("|-----|-------|-----------|-----------|------------|") + + for run in reversed(runs): + s = run.get("summary", {}) + total = s.get("total", 0) + label = _format_run_label(run["timestamp"]) + lines.append( + f"| {label} " + f"| {total} " + f"| {s.get('passed', 0)}/{total} " + f"| {s.get('failed', 0)}/{total} " + f"| {s.get('skipped', 0)}/{total} |" + ) + + for _ in range(MAX_HISTORY - len(runs)): + lines.append("| N/A | N/A | N/A | N/A | N/A |") + + lines.append("") + + # --- Per-test results table --- + lines.append("## Per-Test Results") + lines.append("") + + # Collect all test nodeids, providers, and modules across all runs + all_tests: dict[str, str] = {} # nodeid → provider (from most recent run) + all_modules: dict[str, str] = {} # nodeid → module (from most recent run) + for run in runs: + for nodeid, info in run.get("results", {}).items(): + provider = info.get("provider", "Unknown") if isinstance(info, dict) else "Unknown" + module = info.get("module", "") if isinstance(info, dict) else "" + all_tests[nodeid] = provider + all_modules[nodeid] = module + + if not all_tests: + lines.append("*No test results available.*") + return "\n".join(lines) + + # Build header (most recent run first) + header = "| Test | File | Provider |" + separator = "|------|------|----------|" + for run in reversed(runs): + label = _format_run_label(run["timestamp"]) + header += f" {label} |" + separator += "------------|" + for _ in range(MAX_HISTORY - len(runs)): + header += " N/A |" + separator += "-----|" + + lines.append(header) + lines.append(separator) + + # Sort by provider then test name + for nodeid in sorted(all_tests, key=lambda n: (all_tests[n], n)): + provider = all_tests[nodeid] + module = all_modules.get(nodeid, "") + short = _short_name(nodeid) + row = f"| `{short}` | `{module}` | {provider} |" + + for run in reversed(runs): + result = run.get("results", {}).get(nodeid) + if result is None: + emoji = "N/A" + else: + status = result.get("status", "N/A") if isinstance(result, dict) else result + emoji = STATUS_EMOJI.get(status, "❓") + row += f" {emoji} |" + + for _ in range(MAX_HISTORY - len(runs)): + row += " N/A |" + + lines.append(row) + + lines.append("") + lines.append("**Legend:** ✅ Passed · ❌ Failed · ⏭️ Skipped · ⚠️ Expected Failure (xfail) · N/A Not available") + lines.append("") + + return "\n".join(lines) + + +# --------------------------------------------------------------------------- +# CLI entry point +# --------------------------------------------------------------------------- + + +def main() -> int: + if len(sys.argv) != 4: + print("Usage: python aggregate.py ") + return 1 + + reports_dir = Path(sys.argv[1]) + history_path = Path(sys.argv[2]) + output_path = Path(sys.argv[3]) + + print("Aggregating test results from JUnit XML...") + + # Load current run's per-provider XML reports + print(f"\nLoading reports from {reports_dir}:") + 
current_run = load_current_run(reports_dir) + s = current_run.get("summary", {}) + total = s.get("total", 0) + print( + f" Current run: {s.get('passed', 0)} passed, " + f"{s.get('failed', 0)} failed, " + f"{s.get('skipped', 0)} skipped " + f"(total: {total})" + ) + + # Load history and append current run (skip empty runs to avoid polluting trend) + print(f"\nLoading history from {history_path}:") + runs = load_history(history_path) + if total > 0: + runs.append(current_run) + runs = runs[-MAX_HISTORY:] + else: + print(" Skipping history append (no test results in current run)") + + # Save updated history + print(f"\nSaving history to {history_path}:") + save_history(history_path, runs) + + # Generate trend report + print("\nGenerating trend report...") + report = generate_trend_report(runs) + + output_path.parent.mkdir(parents=True, exist_ok=True) + output_path.write_text(report, encoding="utf-8") + print(f"Trend report written to {output_path}") + + # Print the report to stdout for CI visibility + print("\n" + "=" * 80) + print(report) + + return 0 + + +if __name__ == "__main__": + sys.exit(main())
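
---

Reviewer note (not part of the diff): each job now writes `--junitxml=pytest.xml` into `python/` and uploads it as a `test-results-<provider>` artifact (the Cosmos jobs pass an absolute `${{ github.workspace }}/python/pytest.xml` because pytest there runs with `--directory packages/azure-cosmos`). The report job downloads them with `actions/download-artifact`, which recreates one subdirectory per artifact — the layout `load_current_run()` walks. A minimal sketch of that contract; the XML content, test names, and paths below are illustrative only:

```python
# Sketch: exercise load_current_run() against the directory layout produced by
# actions/download-artifact (one "test-results-<provider>/" subdir per job).
# Run from python/ so the scripts package is importable.
import tempfile
from pathlib import Path

from scripts.flaky_report.aggregate import load_current_run

SAMPLE_JUNIT = """<?xml version="1.0" encoding="utf-8"?>
<testsuites>
  <testsuite name="pytest" tests="2">
    <testcase classname="packages.openai.tests.test_chat_client" name="test_basic" time="0.42"/>
    <testcase classname="packages.openai.tests.test_chat_client" name="test_streaming" time="1.10">
      <failure message="assert False">traceback...</failure>
    </testcase>
  </testsuite>
</testsuites>
"""

with tempfile.TemporaryDirectory() as tmp:
    reports_dir = Path(tmp)
    # download-artifact with pattern "test-results-*" creates one subdir per artifact
    job_dir = reports_dir / "test-results-openai"
    job_dir.mkdir()
    (job_dir / "pytest.xml").write_text(SAMPLE_JUNIT, encoding="utf-8")

    run = load_current_run(reports_dir)
    assert run["summary"] == {"total": 2, "passed": 1, "failed": 1, "skipped": 0}
    assert run["results"]["packages.openai.tests.test_chat_client::test_basic"]["provider"] == "OpenAI"
```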
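The status mapping in `_parse_junit_xml()` relies on how pytest's junitxml writer encodes outcomes: failures and errors as `<failure>`/`<error>` children, skips and xfails both as `<skipped>`, with xfail distinguished by a `type` attribute containing "xfail" (assumed here to be `pytest.xfail`). A small sketch exercising all four outcomes with synthetic XML:

```python
# Sketch of the per-testcase status derivation; the XML mirrors (an assumption of)
# pytest's junitxml output, including <skipped type="pytest.xfail"> for xfail.
import tempfile
from pathlib import Path

from scripts.flaky_report.aggregate import _parse_junit_xml

XML = """<testsuite name="pytest" tests="4">
  <testcase classname="pkg.tests.test_mod" name="test_ok" time="0.1"/>
  <testcase classname="pkg.tests.test_mod" name="test_broken" time="0.1">
    <failure message="assert 1 == 2"/>
  </testcase>
  <testcase classname="pkg.tests.test_mod" name="test_skipped" time="0">
    <skipped type="pytest.skip" message="requires API key"/>
  </testcase>
  <testcase classname="pkg.tests.test_mod" name="test_known_issue" time="0">
    <skipped type="pytest.xfail" message="known flaky upstream"/>
  </testcase>
</testsuite>"""

with tempfile.TemporaryDirectory() as tmp:
    xml_path = Path(tmp) / "pytest.xml"
    xml_path.write_text(XML, encoding="utf-8")
    results = _parse_junit_xml(xml_path)
    print({r["nodeid"].split("::")[-1]: r["status"] for r in results})
    # {'test_ok': 'passed', 'test_broken': 'failed',
    #  'test_skipped': 'skipped', 'test_known_issue': 'xfailed'}
```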
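The cache restore/save pair gives the report job a rolling window: `restore-keys` prefix-matches the most recent prior history, and `save_history()` trims to `MAX_HISTORY` runs before writing under the run-scoped key, so the cached JSON never grows unboundedly. Sketch of the trimming behaviour, with fabricated timestamps:

```python
# Sketch: save_history() keeps only the last MAX_HISTORY (5) runs.
import tempfile
from pathlib import Path

from scripts.flaky_report.aggregate import MAX_HISTORY, load_history, save_history

with tempfile.TemporaryDirectory() as tmp:
    history = Path(tmp) / "flaky-report-history.json"
    runs = [
        {"timestamp": f"2025-01-0{i}T00:00:00+00:00", "summary": {"total": 1}, "results": {}}
        for i in range(1, 8)  # seven runs, two more than MAX_HISTORY
    ]
    save_history(history, runs)
    kept = load_history(history)
    assert len(kept) == MAX_HISTORY
    assert kept[0]["timestamp"].startswith("2025-01-03")  # the two oldest runs were dropped
```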
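For a sense of what lands in `$GITHUB_STEP_SUMMARY`, here is `generate_trend_report()` fed two synthetic runs (all names and timestamps are made up); columns are most-recent-first and padded with N/A up to five runs:

```python
# Sketch: the markdown tables the report job appends to the Job Summary.
from scripts.flaky_report.aggregate import generate_trend_report

runs = [
    {
        "timestamp": "2025-06-01T10:00:00+00:00",
        "summary": {"total": 2, "passed": 2, "failed": 0, "skipped": 0},
        "results": {
            "pkg.tests.test_mod::test_a": {"status": "passed", "provider": "OpenAI", "module": "test_mod"},
            "pkg.tests.test_mod::test_b": {"status": "passed", "provider": "OpenAI", "module": "test_mod"},
        },
    },
    {
        "timestamp": "2025-06-02T10:00:00+00:00",
        "summary": {"total": 2, "passed": 1, "failed": 1, "skipped": 0},
        "results": {
            "pkg.tests.test_mod::test_a": {"status": "passed", "provider": "OpenAI", "module": "test_mod"},
            "pkg.tests.test_mod::test_b": {"status": "failed", "provider": "OpenAI", "module": "test_mod"},
        },
    },
]
print(generate_trend_report(runs))
# Emits the "Overall Status (Last 5 Runs)" table plus per-test rows such as:
# | `test_b` | `test_mod` | OpenAI | ❌ | ✅ | N/A | N/A | N/A |
```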
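The CI "Generate trend report" step can be reproduced locally from `python/` with the same three positional arguments (reports directory, history JSON, output markdown). A sketch that drives `main()` directly; the paths are whatever you downloaded the `test-results-*` artifacts into:

```python
# Sketch: local equivalent of the workflow's "Generate trend report" step.
import sys

from scripts.flaky_report.aggregate import main

sys.argv = [
    "aggregate.py",
    "../test-results/",            # downloaded artifacts, one subdir per job
    "flaky-report-history.json",   # rolling history (created on first run)
    "flaky-test-report.md",        # markdown destined for the Job Summary
]
raise SystemExit(main())
```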