microsoft · katriendg · Apr 1, 2026 · Mar 30, 2026 · Mar 30, 2026 · Mar 30, 2026
diff --git a/.cspell/general-technical.txt b/.cspell/general-technical.txt
@@ -5,6 +5,7 @@ abfs
 acrolinx
 activex
 acunetix
+addfinalizer
 adobe
 adrm
 adrs
@@ -1420,7 +1421,7 @@ upskill
 upskilling
 uri
 url
-usrr
+usefixtures
 utcnow
 uuid
 uvicorn

@@ -299,6 +299,42 @@ A regression test may be omitted when:
 | Manual test documented in PR           | Only if automated test is impractical |
 | Informal local verification            | No                                    |
 
+### End-to-End Tests
+
+Optionally run the RL end-to-end suite to catch regressions. Doing so is good practice when changing submission scripts, workflow templates, MLflow wiring, checkpoint handling, or shared RL training assets. The end-to-end suite validates:
+
+- Azure ML or OSMO job submission and lifecycle transitions
+- MLflow metrics and parameter tracking for the completed run
+- Checkpoint output upload for Azure ML runs
+- Workflow task success for OSMO runs
+
+> [!CAUTION]
+> These tests submit real GPU workloads and consume Azure ML, OSMO, Kubernetes, and MLflow resources. They are intentionally excluded from default `pytest` runs and must be selected explicitly with `-m e2e`.
+
+Requirements:
+
+| Requirement | Details |
+|-------------|---------|
+| Azure CLI | `az` must be installed and authenticated. The Azure ML CLI extension must also be available. |
+| Azure subscription context | Set `AZURE_SUBSCRIPTION_ID`, or make sure `az account show` resolves to the subscription you want the test to use. |
+| Azure workspace context | Set `AZURE_RESOURCE_GROUP` and `AZUREML_WORKSPACE_NAME`, or make sure they can be resolved from `terraform output -json` or `infrastructure/terraform/terraform.tfvars`. |
+| Azure ML compute target | For Azure ML validation, the compute target must resolve from `AZUREML_COMPUTE` or Terraform naming and its provisioning state must be `Succeeded`. |
+| OSMO and Kubernetes access | For OSMO validation, `osmo` and `kubectl` must be installed and authenticated, and the target cluster must expose at least one reachable GPU node. For private clusters, connect to the VPN first. |
+| MLflow access | The Azure ML workspace used by the tests must expose a working MLflow tracking URI because both validation paths assert metrics and parameters after the run completes. |
+
+Run these commands from the repository root:
+
+```bash
+# Azure ML submission path only
+uv run pytest -vv -s -m e2e tests/e2e/test_e2e_training.py::test_aml_rl_training_e2e
+
+# OSMO submission path only
+uv run pytest -vv -s -m e2e tests/e2e/test_e2e_training.py::test_osmo_rl_training_e2e
+
+# Full RL e2e suite
+uv run pytest -vv -s -m e2e tests/e2e/test_e2e_training.py
+```
+
 #### Bug Fix PR Requirements
 
 When submitting a bug fix:

diff --git a/pyproject.toml b/pyproject.toml
@@ -25,6 +25,8 @@ dev = [
     "ipywidgets",
     "tqdm",
     "matplotlib>=3.10.8",
+    "azureml-mlflow>=1.62.0.post2",
+    "azure-ai-ml>=1.32.0",
 ]
 
 [tool.uv]
@@ -35,13 +37,21 @@ testpaths = ["tests", "training/tests", "data-management/tools/tests"]
 pythonpath = [".", "data-management/tools"]
 addopts = [
     "-ra",
+    "-m",
+    "not e2e",
     "--strict-markers",
     "--strict-config",
     "--cov=training",
     "--cov-report=term-missing",
     "--cov-report=xml",
 ]
-markers = []
+markers = [
+    "e2e: marks tests that submit real GPU training jobs (excluded by default; select with -m e2e)",
+]
+filterwarnings = [
+    "ignore::marshmallow.warnings.RemovedInMarshmallow4Warning",
+    "ignore::marshmallow.warnings.ChangedInMarshmallow4Warning",
+]
 
 [tool.hypothesis]
 max_examples = 50

diff --git a/tests/e2e/__init__.py b/tests/e2e/__init__.py
@@ -0,0 +1 @@
+"""End-to-end tests package."""