microsoft · akzaidi · Apr 30, 2026 · Apr 29, 2026 · Apr 29, 2026 · Apr 29, 2026
diff --git a/.cspell/general-technical.txt b/.cspell/general-technical.txt
@@ -1926,6 +1926,7 @@ incluster
 instancetype
 instancetypes
 jtzh
+libdav
 libgl
 libglib
 lnkmpfy

@@ -31,10 +31,10 @@ Start the dataviewer app, optionally configuring the dataset path.
 If the user provides a dataset path:
 
 1. Read `data-management/viewer/backend/.env`.
-2. Replace the `HMI_DATA_PATH=` line with the absolute path to the user's dataset directory.
+2. Replace the `DATA_DIR=` line with the absolute path to the user's dataset directory.
 3. Confirm the update.
 
-If no path is provided, use the existing `HMI_DATA_PATH` value.
+If no path is provided, use the existing `DATA_DIR` value.
 
 #### Step 2: Start the Application
 
@@ -159,7 +159,7 @@ Batch analysis across all episodes using Python scripts via the terminal for eff
 2. For bulk annotation, use a Python script with `urllib.request` to loop over all episodes.
 3. After all labels are applied, persist with `POST /api/datasets/{id}/labels/save`.
 
-Labels are stored on disk at `{HMI_DATA_PATH}/{dataset_id}/meta/episode_labels.json`. To clear all labels for a fresh start, overwrite the `episodes` key with an empty object `{}` in this file and reload the page.
+Labels are stored on disk at `{DATA_DIR}/{dataset_id}/meta/episode_labels.json`. To clear all labels for a fresh start, overwrite the `episodes` key with an empty object `{}` in this file and reload the page.
 
 #### Step 4: Verify via Playwright UI
 

@@ -14,7 +14,7 @@ argument-hint: "[datasetPath=...] [backendPort=8000] [frontendPort=5173]"
 
 ## Requirements
 
-1. If datasetPath is provided, update `HMI_DATA_PATH` in `data-management/viewer/backend/.env` to the absolute path.
+1. If datasetPath is provided, update `DATA_DIR` in `data-management/viewer/backend/.env` to the absolute path.
 2. Start the dataviewer app using `data-management/viewer/start.sh` with configured ports.
 3. Wait for the backend health check to pass.
 4. Open `http://localhost:${frontendPort}` using `open_browser_page`. If Playwright MCP tools are available, take a snapshot instead.

@@ -30,7 +30,7 @@ cd data-management/viewer && ./start.sh
 With a custom dataset path:
 
 ```bash
-cd data-management/viewer && HMI_DATA_PATH=/path/to/datasets ./start.sh
+cd data-management/viewer && DATA_DIR=/path/to/datasets ./start.sh
 ```
 
 ### Step 2 — Open SimpleBrowser
@@ -99,47 +99,47 @@ cd data-management/viewer && ./start.sh
 Start with a custom dataset path:
 
 ```bash
-cd data-management/viewer && HMI_DATA_PATH=/path/to/datasets ./start.sh
+cd data-management/viewer && DATA_DIR=/path/to/datasets ./start.sh
 ```
 
 ## Parameters Reference
 
 | Parameter | Default | Description |
 |-----------|---------|-------------|
-| `HMI_DATA_PATH` | `../../../datasets` (relative to `backend/`) | Directory containing dataset subdirectories |
+| `DATA_DIR` | `../../../datasets` (relative to `backend/`) | Directory containing dataset subdirectories |
 | `BACKEND_PORT` | `8000` | FastAPI backend port |
 | `FRONTEND_PORT` | `5173` | Vite frontend dev server port |
 | `HEALTH_TIMEOUT` | `30` | Seconds to wait for backend health check |
 
 ### Dataset Path Configuration
 
-The `HMI_DATA_PATH` environment variable controls which datasets are visible in the app. Each subdirectory under this path is treated as a separate `dataset_id`.
+The `DATA_DIR` environment variable controls which datasets are visible in the app. Each subdirectory under this path is treated as a separate `dataset_id`.
 
-**Methods to set `HMI_DATA_PATH`:**
+**Methods to set `DATA_DIR`:**
 
 1. **Environment variable override** (recommended for ad-hoc use):
 
     ```bash
-    HMI_DATA_PATH=/path/to/datasets ./start.sh
+    DATA_DIR=/path/to/datasets ./start.sh
     ```
 
 2. **Edit `backend/.env`** (persists across restarts):
 
     ```env
-    HMI_DATA_PATH=/path/to/datasets
+    DATA_DIR=/path/to/datasets
     ```
 
 3. **Export before launch** (session-scoped):
 
     ```bash
-    export HMI_DATA_PATH=/path/to/datasets
+    export DATA_DIR=/path/to/datasets
     cd data-management/viewer && ./start.sh
     ```
 
 When a dataset path is provided, update `backend/.env` so the value persists:
 
 1. Read the current `backend/.env` file.
-2. Replace the `HMI_DATA_PATH=` line with the new absolute path.
+2. Replace the `DATA_DIR=` line with the new absolute path.
 3. Start the app with `./start.sh`.
 
 ## Architecture
@@ -148,7 +148,7 @@ When a dataset path is provided, update `backend/.env` so the value persists:
 data-management/viewer/
 ├── start.sh              # Orchestrator: launches backend + frontend
 ├── backend/
-│   ├── .env              # HMI_DATA_PATH and test config
+│   ├── .env              # DATA_DIR and test config
 │   ├── pyproject.toml    # Python dependencies (uv)
 │   └── src/api/
 │       ├── main.py       # FastAPI app, CORS, router registration
@@ -222,6 +222,34 @@ data-management/viewer/
 
 Annotation combines API calls for efficiency with Playwright UI interaction for verification. Use the API for bulk operations and the UI for visual review and spot-checking.
 
+### Annotation surfaces
+
+The annotation panel exposes three structured surfaces in addition to free-form labels:
+
+| Surface | Storage | Notes |
+|---------|---------|-------|
+| Labels | `meta/episode_labels.json` | Free-form tag set with shared dataset-level options |
+| Episode annotation | `EpisodeAnnotation` JSON | Task completeness, trajectory quality, data quality, anomalies |
+| Language instruction | `EpisodeAnnotation.language_instruction` | Optional VLA payload (instruction, source, paraphrases, subtask decomposition) |
+
+### Multi-camera selection
+
+Datasets that record multiple camera streams expose a camera selector in the annotation workspace header. The default selection is `episode.cameras[0]` (or the first key of `videoUrls` when `cameras` is empty). A user selection persists for the current episode; switching to an episode that does not contain the selected camera resets the selection to that episode's `cameras[0]`. Both video playback and `/frames/{idx}` thumbnail extraction follow the active camera.
+
+### Language instruction (VLA annotation)
+
+The `LanguageInstructionWidget` writes a structured payload through `PUT /api/datasets/{id}/episodes/{idx}/annotations`:
+
+| Field | Purpose | Bounds |
+|-------|---------|--------|
+| `instruction` | Primary natural-language task description | 1–1000 chars |
+| `source` | Provenance: `human`, `template`, `llm-generated`, `retroactive` | enum |
+| `language` | BCP-47 language tag, defaults to `en` | up to 10 chars |
+| `paraphrases` | Alternative phrasings for data augmentation | up to 50 entries, 1000 chars each |
+| `subtask_instructions` | Ordered subtask decomposition for hierarchical conditioning | up to 100 entries, 1000 chars each |
+
+When a dataset task description is available, the widget seeds the instruction with `source = template`; otherwise it creates a blank instruction with `source = human`. The source dropdown allows changing the value at any time.
+
 ### Step 1 — Analyze trajectory data
 
 Fetch episode trajectory data from the API to determine labels programmatically:
@@ -303,7 +331,7 @@ curl -s -X POST "http://localhost:8000/api/datasets/{dataset_id}/labels/save"
 The save endpoint writes labels to a JSON file inside the dataset's `meta/` directory:
 
 ```text
-{HMI_DATA_PATH}/{dataset_id}/meta/episode_labels.json
+{DATA_DIR}/{dataset_id}/meta/episode_labels.json
 ```
 
 For example, with the default dataset path:
@@ -375,7 +403,7 @@ The React app has these key areas for Playwright interaction:
 |-------|----------|
 | Backend fails to start | Check `backend/.venv` exists; run `cd backend && uv venv --python 3.12 && source .venv/bin/activate && uv pip install -e ".[dev,analysis,export]"` |
 | Frontend shows "Loading..." indefinitely | Verify backend is healthy: `curl http://localhost:8000/health` |
-| No datasets visible | Check `HMI_DATA_PATH` in `backend/.env` points to a directory with dataset subdirectories |
+| No datasets visible | Check `DATA_DIR` in `backend/.env` points to a directory with dataset subdirectories |
 | Port conflict | Set `BACKEND_PORT` or `FRONTEND_PORT` environment variables |
 | CORS errors | Backend allows localhost ports 5173-5177; check the frontend port is in range |
 | Labels not persisted after restart | Call `POST /api/datasets/{id}/labels/save` after API-based annotation |

@@ -34,7 +34,7 @@ jobs:
         shell: bash
         working-directory: data-management/viewer
         env:
-          HMI_LOCAL_DATA_PATH: ./data/test-dataset
+          DATAVIEWER_HOST_DATA_DIR: ./data/test-dataset
         run: docker compose up -d --build --wait
 
       - name: Verify containers are healthy

@@ -42,8 +42,8 @@ Copy `backend/.env.example` to `backend/.env` and set values for your environmen
 ### Local File Storage (default)
 
 ```env
-HMI_STORAGE_BACKEND=local
-HMI_DATA_PATH=/path/to/your/datasets
+STORAGE_BACKEND=local
+DATA_DIR=/path/to/your/datasets
 ```
 
 ### Azure Blob Storage
@@ -54,7 +54,7 @@ which supports managed identity, workload identity, and Azure CLI credentials
 automatically — no SAS token required in AKS or Container Apps.
 
 ```env
-HMI_STORAGE_BACKEND=azure
+STORAGE_BACKEND=azure
 AZURE_STORAGE_ACCOUNT_NAME=mystorageaccount
 AZURE_STORAGE_DATASET_CONTAINER=datasets
 AZURE_STORAGE_ANNOTATION_CONTAINER=annotations
@@ -75,8 +75,8 @@ Expected blob structure:
 
 | Variable                             | Default         | Description                                                    |
 |--------------------------------------|-----------------|----------------------------------------------------------------|
-| `HMI_STORAGE_BACKEND`                | `local`         | Storage backend: `local` or `azure`                            |
-| `HMI_DATA_PATH`                      | `./data`        | Local dataset directory (local mode)                           |
+| `STORAGE_BACKEND`                    | `local`         | Storage backend: `local` or `azure`                            |
+| `DATA_DIR`                           | `./data`        | Local dataset directory (local mode)                           |
 | `AZURE_STORAGE_ACCOUNT_NAME`         | —               | Azure Storage account name (azure mode)                        |
 | `AZURE_STORAGE_DATASET_CONTAINER`    | —               | Blob container for dataset files                               |
 | `AZURE_STORAGE_ANNOTATION_CONTAINER` | —               | Blob container for annotations (defaults to dataset container) |
@@ -196,16 +196,45 @@ npm run dev
 
 The application will be available at `http://localhost:5173`.
 
+## Annotation Features
+
+The annotation workspace exposes per-episode controls grouped by panel. Persisted state is stored alongside the dataset and surfaced through the REST API.
+
+### Multi-camera viewing
+
+Datasets that record multiple camera streams (e.g. `observation.images.front`, `observation.images.wrist`) drive a camera selector in the annotation workspace header. The selector lists every camera advertised by the episode's `cameras` array, falling back to the keys of `videoUrls` when the array is empty.
+
+| Behavior | Detail |
+|----------|--------|
+| Default selection | First entry in `episode.cameras` (or `videoUrls`) |
+| Override | User selection persists for the current episode |
+| Stale fallback | When the selected camera is missing on episode change, selection resets to the new `cameras[0]` |
+| Frame extraction | The chosen camera drives both video playback and `/frames/{idx}` thumbnail requests |
+
+### Language instruction (VLA annotation)
+
+Each episode can carry a structured `LanguageInstructionAnnotation` for vision-language-action training. The widget appears in the annotation panel and writes through `PUT /api/datasets/{id}/episodes/{idx}/annotations`.
+
+| Field | Purpose |
+|-------|---------|
+| `instruction` | Primary natural-language task description (max 1000 characters) |
+| `source` | Provenance: `human`, `template`, `llm-generated`, or `retroactive` |
+| `language` | BCP-47 language tag, defaults to `en` |
+| `paraphrases` | Alternative phrasings for data augmentation (up to 50 entries, 1000 characters each) |
+| `subtask_instructions` | Ordered subtask decomposition for hierarchical conditioning (up to 100 entries, 1000 characters each) |
+
+When a dataset task description is available, the widget seeds the instruction with `source = template` via the "Use as Instruction" button. Otherwise, "Add Instruction" creates a blank instruction with `source = human`. The source can be changed at any time through the dropdown.
+
 ## Container Deployment
 
 ### Docker Compose (local)
 
 ```bash
 # Local storage mode (mount datasets directory)
-HMI_LOCAL_DATA_PATH=/path/to/datasets docker compose up --build
+DATAVIEWER_HOST_DATA_DIR=/path/to/datasets docker compose up --build
 
 # Azure Blob Storage mode
-export HMI_STORAGE_BACKEND=azure
+export STORAGE_BACKEND=azure
 export AZURE_STORAGE_ACCOUNT_NAME=mystorageaccount
 export AZURE_STORAGE_DATASET_CONTAINER=datasets
 export AZURE_STORAGE_ANNOTATION_CONTAINER=annotations
@@ -217,7 +246,7 @@ docker compose up --build
 For AKS with workload identity or Container Apps with managed identity, set:
 
 ```env
-HMI_STORAGE_BACKEND=azure
+STORAGE_BACKEND=azure
 AZURE_STORAGE_ACCOUNT_NAME=mystorageaccount
 AZURE_STORAGE_DATASET_CONTAINER=datasets
 BACKEND_HOST=0.0.0.0

@@ -8,7 +8,7 @@
 # Your Azure AD user must have Storage Blob Data Reader (or Contributor) on the account.
 # Do NOT set AZURE_CLIENT_ID locally — it overrides CLI credentials with managed identity.
 
-HMI_STORAGE_BACKEND=azure
+STORAGE_BACKEND=azure
 AZURE_STORAGE_ACCOUNT_NAME=<insert-storage-account-name>
 AZURE_STORAGE_DATASET_CONTAINER=datasets
 AZURE_STORAGE_ANNOTATION_CONTAINER=annotations
@@ -7,22 +7,22 @@
 
 # Active storage backend: local (default) | azure
 # Use 'azure' to connect directly to Azure Blob Storage via DefaultAzureCredential
-# or a SAS token. Use 'local' for local filesystem access via HMI_DATA_PATH.
-HMI_STORAGE_BACKEND=local
+# or a SAS token. Use 'local' for local filesystem access via DATA_DIR.
+STORAGE_BACKEND=local
 
 # ─────────────────────────────────────────────────────────────────────────────
-# Local storage (used when HMI_STORAGE_BACKEND=local)
+# Local storage (used when STORAGE_BACKEND=local)
 # ─────────────────────────────────────────────────────────────────────────────
 
 # Path to the directory containing datasets.
 # Each subdirectory is treated as a dataset_id.
 # Relative paths are resolved from this file's directory (backend/).
-HMI_DATA_PATH=../../../datasets
+DATA_DIR=../../../datasets
 
 # ─────────────────────────────────────────────────────────────────────────────
-# Azure Blob Storage (required when HMI_STORAGE_BACKEND=azure)
+# Azure Blob Storage (required when STORAGE_BACKEND=azure)
 # ─────────────────────────────────────────────────────────────────────────────
-# Azure Blob Storage (required when HMI_STORAGE_BACKEND=azure)
+# Azure Blob Storage (required when STORAGE_BACKEND=azure)
 # ─────────────────────────────────────────────────────────────────────────────
 # Prerequisite: run `az login` or `source infrastructure/terraform/prerequisites/az-sub-init.sh`
 # Your Azure AD user needs Storage Blob Data Reader (or Contributor) on the account.

@@ -46,8 +46,8 @@ EXPOSE 8000
 # BACKEND_HOST=0.0.0.0 binds to all interfaces (required in containers)
 ENV BACKEND_HOST=0.0.0.0 \
     BACKEND_PORT=8000 \
-    HMI_STORAGE_BACKEND=local \
-    HMI_DATA_PATH=/data
+    STORAGE_BACKEND=local \
+    DATA_DIR=/data
 
 # Health check
 HEALTHCHECK --interval=30s --timeout=5s --start-period=15s --retries=3 \

@@ -28,7 +28,7 @@ class AppConfig:
     """Active storage backend: 'local' or 'azure'."""
 
     data_path: str
-    """Local dataset directory (HMI_DATA_PATH). Used when storage_backend='local'."""
+    """Local dataset directory (DATA_DIR). Used when storage_backend='local'."""
 
     azure_account_name: str | None
     """Azure Storage account name. Required when storage_backend='azure'."""
@@ -73,8 +73,8 @@ def load_config(env_path: Path | None = None) -> AppConfig:
 
         load_dotenv(env_path)
 
-    storage_backend = os.environ.get("HMI_STORAGE_BACKEND", "local").lower()
-    data_path = os.environ.get("HMI_DATA_PATH", "./data")
+    storage_backend = os.environ.get("STORAGE_BACKEND", "local").lower()
+    data_path = os.environ.get("DATA_DIR", "./data")
 
     azure_account_name = os.environ.get("AZURE_STORAGE_ACCOUNT_NAME") or None
     azure_dataset_container = os.environ.get("AZURE_STORAGE_DATASET_CONTAINER") or None
@@ -126,12 +126,12 @@ def create_annotation_storage(config: AppConfig):
 
     if config.storage_backend == "azure":
         if not config.azure_account_name:
-            raise ValueError("AZURE_STORAGE_ACCOUNT_NAME is required when HMI_STORAGE_BACKEND=azure")
+            raise ValueError("AZURE_STORAGE_ACCOUNT_NAME is required when STORAGE_BACKEND=azure")
         annotation_container = config.azure_annotation_container or config.azure_dataset_container
         if not annotation_container:
             raise ValueError(
                 "AZURE_STORAGE_ANNOTATION_CONTAINER or AZURE_STORAGE_DATASET_CONTAINER is required "
-                "when HMI_STORAGE_BACKEND=azure"
+                "when STORAGE_BACKEND=azure"
             )
 
         from .storage.azure import AzureBlobStorageAdapter
-Original file line number
+Diff line change
@@ Expand Up / @@ -1926,6 +1926,7 @@ incluster @@
     instancetype
     instancetypes
     jtzh
+    libdav
     libgl
     libglib
     lnkmpfy
@@ Expand Down @@