seonghobae · seonghobae · Mar 28, 2026 · Mar 28, 2026 · Mar 28, 2026 · Mar 28, 2026
@@ -163,6 +163,7 @@ jobs:
       - name: Install node dependencies
         run: npm ci
       - name: Sync Python dependencies
+        if: runner.os != 'Windows' || runner.arch != 'ARM64' # llvmlite lacks wheel for Windows ARM64
         run: uv sync --project services/analysis-engine --group dev --frozen
       - name: Build frontend
         run: npm run build --workspace @bandscope/desktop

@@ -0,0 +1,59 @@
+# ML Engine Integration Plan
+
+## Overview
+Now that the basic IPC and React/Python orchestrator boundaries are proven (Issue #26 epics), the next phase is replacing the hardcoded, instantaneous mock data with real digital signal processing (DSP) and Machine Learning (ML) inference.
+
+This document outlines a MECE (mutually exclusive, collectively exhaustive) execution strategy for incrementally replacing the mock systems with real implementations.
+
+## Execution Tracks
+
+### Track 1: Temporal Foundation (#105)
+- **Goal**: Replace simple count-based anchors with a real tempo and beat grid.
+- **Tech**: Add `librosa` and `soundfile` for robust decoding and beat tracking.
+- **Output**: Real file ingestion and tempo/beat arrays.
+
+### Track 2: Spectral & Stem Separation (#106)
+- **Goal**: Deconstruct the mixed audio into isolated stems.
+- **Tech**: Integrate `demucs` (or a smaller alternative) running locally.
+- **Output**: 4 discrete stems (vocals, bass, drums, other), or 6 when the chosen model also separates guitar and piano.
+
+### Track 3: Harmonic & Pitch Pipelines (#107)
+- **Goal**: Replace hardcoded `C#m7` strings with DSP-derived chord and pitch arrays.
+- **Tech**: Chromagram extraction and Viterbi decoding for chords. YIN/pYIN for pitch ranges.
+- **Output**: Accurate harmonic sequences tied to Track 1's beat grid.
+
+### Track 4: Structural Graph Assembly (#108)
+- **Goal**: Infer boundaries (Verse, Chorus) and detect which roles (stems) are playing.
+- **Tech**: Self-similarity matrices and energy thresholding on the stems.
+- **Output**: The true `PartGraph` and `Section` payloads.
+
+### Track 5: Orchestration & UX (#109)
+- **Goal**: Handle the fact that ML takes minutes, not milliseconds.
+- **Tech**: Async progress callbacks, IPC streaming updates.
+- **Output**: Responsive UI during long-running tasks.
+
+## Security Notes
+
+### Attack Surface
+The integration of ML libraries like `librosa`, `torch`, and `demucs` exposes the desktop app to complex audio processing pipelines that parse potentially malformed user-provided audio files.
+
+### Trust Boundary
+The primary trust boundary is between the user's filesystem (audio files) and the Python local analysis engine. All input audio is untrusted.
+
+### Mitigations
+We will restrict audio ingestion through `librosa`/`soundfile` using strict format constraints. We will execute ML tasks locally, without reaching out to external networks, and run them under low privileges where possible.
+
+### Test Points
+- Loading truncated or corrupted WAV/MP3 files.
+- Providing extremely large audio files to test OOM behavior.
+- Validating that no external network calls occur during offline ML processing.
+
+### Realistic Threats
+- OOM (Out Of Memory) crashing the user's host OS during `demucs` execution.
+- Arbitrary code execution (ACE) vulnerabilities within C-level parsing dependencies of `librosa`/`soundfile`.
+
+### Remaining Risk
+Large ML dependencies carry high vulnerability footprints. We depend on upstream patching for zero-days in C-level audio codec libraries.
+
+1. **Supply Chain**: Must follow `docs/security/dependency-policy.md`. Large ML dependencies carry high vulnerability footprints.
+2. **Execution**: Must gracefully handle a missing GPU/MPS backend by falling back to chunked CPU processing, without OOM-crashing the host OS.
@@ -8,6 +8,9 @@ version = "0.1.0"
 description = "BandScope local-first analysis engine"
 requires-python = ">=3.12"
 dependencies = [
+    "librosa>=0.11.0",
+    "numba<0.63.0",
+    "soundfile>=0.13.1",
     "yt-dlp>=2026.3.17",
 ]
 

@@ -3,10 +3,15 @@
 from __future__ import annotations
 
 import json
+import logging
 import sys
 from datetime import UTC, datetime
 
-from bandscope_analysis.api import run_analysis_job
+from bandscope_analysis.api import get_analysis_status, run_analysis_job
+from bandscope_analysis.temporal import TemporalAnalyzer
+
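+# Temporary logging setup for the temporal analyzer. basicConfig attaches a stderr handler by
+# default, so log lines stay out of the JSON response that main() writes to stdout.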
+logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")
 
 
 def failed_cli_response(message: str) -> dict[str, object]:
@@ -26,23 +31,66 @@ def failed_cli_response(message: str) -> dict[str, object]:
 
 def main() -> int:
     """Read a job payload from stdin and print a structured job response to stdout."""
+    # Read all input from stdin first
+    input_data = sys.stdin.read().strip()
+
+    # Check if there are command line arguments (fallback for manual testing)
+    if len(sys.argv) > 1:
+        if sys.argv[1] == "--status":
+            json.dump(get_analysis_status(), sys.stdout)
+            return 0
+        elif sys.argv[1] == "--job" and len(sys.argv) > 2:
+            input_data = sys.argv[2]
+            if not input_data.startswith("{"):
+                try:
+                    with open(input_data, "r", encoding="utf-8") as f:
+                        input_data = f.read()
+                except Exception as e:
+                    json.dump(failed_cli_response(f"Failed to read job file: {e}"), sys.stdout)
+                    return 1
+
+    if not input_data:
+        json.dump(failed_cli_response("Empty input"), sys.stdout)
+        return 0
+
     try:
-        payload = json.load(sys.stdin)
+        payload = json.loads(input_data)
     except json.JSONDecodeError as error:
         json.dump(failed_cli_response(f"Invalid analysis job request: {error.msg}"), sys.stdout)
         return 0
+
     if not isinstance(payload, dict):
         json.dump(
             failed_cli_response("Invalid analysis job request: invalid field 'root'"), sys.stdout
         )
         return 0
+
     job_id = payload.get("jobId")
     if not isinstance(job_id, str) or not job_id.strip():
         json.dump(
             failed_cli_response("Invalid analysis job request: invalid field 'jobId'"), sys.stdout
         )
         return 0
+
     request = payload.get("request")
+
+    # Temporary: Inject temporal analyzer call if it's a local file, just to prove it works
+    # before full orchestrator integration
+    if (
+        isinstance(request, dict)
+        and request.get("sourceKind") == "local_audio"
+        and "localSource" in request
+    ):
+        audio_path = request["localSource"].get("sourcePath")
+        if audio_path:
+            logging.info(f"Extracting temporal features from {audio_path}...")
+            try:
+                temporal_analyzer = TemporalAnalyzer()
+                features = temporal_analyzer.analyze(audio_path)
+                logging.info(f"Extracted BPM: {features['bpm']}")
+            except Exception as e:
+                logging.warning(f"Temporal analysis failed, continuing with mock: {e}")
+
     requested_at = datetime.now(UTC).isoformat().replace("+00:00", "Z")
     response = run_analysis_job(job_id, request, requested_at)
     json.dump(response, sys.stdout)

@@ -0,0 +1,6 @@
+"""Temporal analysis module (audio decoding, tempo, beat tracking)."""
+
+from .analyzer import TemporalAnalyzer
+from .model import TemporalFeatures
+
+__all__ = ["TemporalAnalyzer", "TemporalFeatures"]
@@ -0,0 +1,77 @@
+"""Temporal analyzer implementation for audio ingestion and beat tracking."""
+
+from __future__ import annotations
+
+import logging
+from pathlib import Path
+from typing import Any
+
+import librosa
+import numpy as np
+from numpy.typing import NDArray
+
+from .model import TemporalFeatures
+
+logger = logging.getLogger(__name__)
+
+# Standard sample rate for BandScope analysis
+TARGET_SR = 44100
+
+
+class TemporalAnalyzer:
+    """Analyzes temporal features (BPM, beats) from audio files."""
+
+    def __init__(self) -> None:
+        """Initialize the temporal analyzer."""
+        pass
+
+    def analyze(self, audio_path: str | Path) -> TemporalFeatures:
+        """Decode audio and extract temporal features.
+
+        Args:
+            audio_path: Path to the audio file.
+
+        Returns:
+            TemporalFeatures containing BPM and beat grids.
+        """
+        path_str = str(audio_path)
+        logger.info(f"Loading and decoding audio: {path_str}")
+
+        try:
+            # Load audio, converting to mono and standardizing sample rate
+            y, sr = librosa.load(path_str, sr=TARGET_SR, mono=True)
+
+            # Ensure it's a 1D float array for librosa
+            if not isinstance(y, np.ndarray):
+                raise ValueError("Expected numpy array from librosa.load")
+
+            y_array: NDArray[np.floating[Any]] = y
+            duration = float(librosa.get_duration(y=y_array, sr=sr))
+
+            logger.info("Extracting tempo and beat tracking...")
+            # Use librosa's robust beat tracker
+            tempo, beat_frames = librosa.beat.beat_track(y=y_array, sr=sr)
+
+            # Convert frame indices to time (seconds)
+            beat_times: NDArray[np.floating[Any]] = librosa.frames_to_time(beat_frames, sr=sr)
+
+            # Extract downbeats (simple approximation: every 4th beat)
+            # A real model might use madmom or complex DBNs for precise downbeats
+            downbeat_times = [float(bt) for i, bt in enumerate(beat_times) if i % 4 == 0]
+
+            bpm_val = float(tempo[0]) if isinstance(tempo, np.ndarray) else float(tempo)
+
+            logger.info(f"Analysis complete: {bpm_val:.1f} BPM, {len(beat_times)} beats detected.")
+
+            return {
+                "bpm": bpm_val,
+                "beat_times": [float(bt) for bt in beat_times],
+                "downbeat_times": downbeat_times,
+                "duration_seconds": duration,
+                "sample_rate": int(sr),
+                "audio_path": path_str,
+            }
+
+        except Exception as e:
+            logger.error(f"Failed to analyze audio {path_str}: {e}")
+            raise ValueError(f"Temporal analysis failed: {e}") from e
@@ -0,0 +1,16 @@
+"""Data models for temporal analysis."""
+
+from __future__ import annotations
+
+from typing import TypedDict
+
+
+class TemporalFeatures(TypedDict):
+    """Features extracted during temporal analysis."""
+
+    bpm: float  # Estimated tempo in beats per minute.
+    beat_times: list[float]  # Detected beat positions, in seconds.
+    downbeat_times: list[float]  # Approximate downbeats in seconds (a subset of beat_times).
+    duration_seconds: float  # Length of the decoded audio, in seconds.
+    sample_rate: int  # Sample rate (Hz) the audio was loaded at.
+    audio_path: str  # Path of the analyzed file, as a string.