ContextualWisdomLab · seonghobae · Mar 28, 2026 · Mar 29, 2026 · Jun 10, 2026 · Jun 10, 2026
diff --git a/.github/workflows/build-baseline.yml b/.github/workflows/build-baseline.yml
@@ -222,6 +222,7 @@ jobs:
       - name: Install node dependencies
         run: npm ci
       - name: Sync Python dependencies
+        if: runner.os != 'macOS' || runner.arch != 'X64' # PyTorch lacks Python 3.12 wheel for macOS x86_64
         run: uv sync --project services/analysis-engine --group dev --frozen
       - name: Build frontend
         run: npm run build --workspace @bandscope/desktop

@@ -8,6 +8,7 @@ version = "0.1.0"
 description = "BandScope local-first analysis engine"
 requires-python = ">=3.12"
 dependencies = [
+    "demucs>=4.0.1",
     "librosa>=0.11.0",
     "numba<0.63.0",
     "soundfile>=0.13.1",

@@ -91,6 +91,20 @@ def main() -> int:
             except Exception as e:
                 logging.warning(f"Temporal analysis failed, continuing with mock: {e}")
 
+            logging.info(f"Performing stem separation on {audio_path}...")
+            try:
+                import librosa
+
+                from bandscope_analysis.separation.audio_separator import AudioStemSeparator
+
+                # Load only the first 10 seconds for the CLI proof to prevent hanging
+                y, sr = librosa.load(audio_path, sr=44100, mono=False, duration=10.0)
+                separator = AudioStemSeparator()
+                stems = separator.separate_audio(y, sample_rate=int(sr), segment_seconds=2.0)
+                logging.info(f"Successfully extracted {len(stems)} stems: {list(stems.keys())}")
+            except Exception as e:
+                logging.warning(f"Stem separation failed, continuing with mock: {e}")
+
     requested_at = datetime.now(UTC).isoformat().replace("+00:00", "Z")
     response = run_analysis_job(job_id, request, requested_at)
     json.dump(response, sys.stdout)

@@ -0,0 +1,121 @@
+"""Audio source separation using Demucs."""
+
+from __future__ import annotations
+
+import logging
+from typing import Any
+
+import numpy as np
+
+try:
+    from torch import Tensor
+except ImportError:  # pragma: no cover
+    Tensor = Any  # type: ignore
+
+logger = logging.getLogger(__name__)
+
+
+class AudioStemSeparator:
+    """Isolates standard stems from an audio mix using Demucs.
+
+    Security Notes:
+    - Trust boundary: Audio input is passed as raw numpy arrays from a prior decoding step
+      (e.g. librosa), reducing the risk of codec-based exploitation within Demucs itself.
+    - Limits: Employs chunked inference (split=True) to strictly bound peak memory (OOM avoidance).
+    - Network: Downloads model weights securely to local cache on first run. Future executions
+      should ideally be offline.
+    """
+
+    def __init__(self, model_name: str = "htdemucs") -> None:
+        """Initialize the audio stem separator.
+
+        Args:
+            model_name: The name of the pretrained Demucs model to use.
+        """
+        self.model_name = model_name
+        self._model: Any | None = None  # populated lazily by _load_model()
+
+    def _load_model(self) -> Any:
+        """Return the cached model, importing demucs and loading it on first use."""
+        if self._model is None:
+            from demucs.pretrained import get_model
+
+            logger.info("Loading demucs model '%s'...", self.model_name)
+            self._model = get_model(self.model_name)
+            if self._model is not None:
+                self._model.eval()
+        return self._model
+
+    def separate_audio(
+        self,
+        audio_data: np.ndarray,
+        sample_rate: int,
+        segment_seconds: float = 10.0,
+    ) -> dict[str, np.ndarray]:
+        """Perform source separation on the given audio array.
+
+        Args:
+            audio_data: The input audio waveform, shape (channels, samples).
+                        A mono (samples,) array is remixed to the model's channel count.
+            sample_rate: The sample rate of the input audio.
+            segment_seconds: The length of each chunk for OOM-safe processing.
+
+        Returns:
+            A dictionary mapping each name in the model's ``sources`` (e.g. 'vocals',
+            'bass', 'drums', 'other') to its separated waveform (channels, samples).
+
+        Raises:
+            ValueError: If the audio is empty or not 1-D/2-D, or a rate/segment is not positive.
+        """
+        import torch
+        from demucs.apply import apply_model
+        from demucs.audio import convert_audio
+
+        # Fail fast on bad input, before the slow (first run: downloading) model load.
+        if audio_data.ndim not in (1, 2) or audio_data.size == 0:
+            raise ValueError(f"expected non-empty 1-D or 2-D audio, got shape {audio_data.shape}")
+        if sample_rate <= 0 or segment_seconds <= 0:
+            raise ValueError("sample_rate and segment_seconds must be positive")
+
+        model = self._load_model()
+
+        # Promote mono to (1, samples); convert_audio then matches the model's channels.
+        if audio_data.ndim == 1:
+            audio_data = audio_data[np.newaxis, :]
+
+        # from_numpy rejects negatively-strided views, so force a contiguous buffer first.
+        mix = torch.from_numpy(np.ascontiguousarray(audio_data)).float()
+        mix = convert_audio(  # type: ignore
+            mix,
+            sample_rate,
+            model.samplerate,
+            model.audio_channels,
+        )
+        mix = mix.unsqueeze(0)  # add batch axis: (1, channels, samples)
+
+        # Prefer CUDA, then Apple MPS, else CPU; getattr guards torch builds without backends.mps.
+        device = "cpu"
+        mps = getattr(torch.backends, "mps", None)
+        if torch.cuda.is_available():
+            device = "cuda"
+        elif mps is not None and mps.is_available():
+            device = "mps"
+
+        model.to(device)
+        mix = mix.to(device)
+
+        logger.info("Applying model to mix using device %s...", device)
+        # Chunked inference under no_grad. NOTE(review): confirm demucs accepts this segment length.
+        with torch.no_grad():
+            stems = apply_model(
+                model,
+                mix,
+                shifts=1,
+                split=True,
+                overlap=0.25,
+                segment=segment_seconds,
+                progress=False,
+            )
+
+        stems_np: np.ndarray = stems[0].cpu().numpy()  # (sources, channels, samples)
+        return {name: stems_np[idx] for idx, name in enumerate(model.sources)}
@@ -0,0 +1,124 @@
+"""Tests for audio stem separation."""
+
+from unittest import mock
+
+import numpy as np
+import pytest
+
+from bandscope_analysis.separation.audio_separator import AudioStemSeparator
+
+
+@pytest.fixture
+def mock_demucs_model():
+    """Build a stand-in Demucs model exposing sources, samplerate and audio_channels."""
+    return mock.MagicMock(
+        sources=["drums", "bass", "other", "vocals"],
+        samplerate=44100,
+        audio_channels=2,
+    )
+
+
+@mock.patch("bandscope_analysis.separation.audio_separator.logger")
+@mock.patch("demucs.audio.convert_audio")
+@mock.patch("demucs.apply.apply_model")
+@mock.patch("demucs.pretrained.get_model")
+def test_audio_stem_separator(
+    mock_get_model, mock_apply_model, mock_convert_audio, mock_logger, mock_demucs_model
+) -> None:
+    """Check mono input yields one (2, 100) stem per model source and the model loads once."""
+    import torch
+
+    # Decorators apply bottom-up, so get_model is the first injected mock argument.
+    mock_get_model.return_value = mock_demucs_model
+
+    # Stand-in for convert_audio: ignores its inputs and returns a fixed
+    # (channels, samples) = (2, 100) tensor directly.
+    def fake_convert(wav, from_sr, to_sr, channels):
+        # same trailing (2, 100) shape as the fake apply_model output below
+        return torch.zeros((2, 100))
+
+    mock_convert_audio.side_effect = fake_convert
+
+    # fake apply_model output: (batch, sources, channels, samples), filled with ones
+    mock_apply_model.return_value = torch.ones((1, 4, 2, 100))
+
+    separator = AudioStemSeparator(model_name="fake_model")
+
+    # Mono input: a 1-D array of 100 samples
+    audio_data = np.zeros((100,))
+    result = separator.separate_audio(audio_data, sample_rate=22050, segment_seconds=2.0)
+
+    # The model was requested by name and inference ran exactly once
+    mock_get_model.assert_called_once_with("fake_model")
+    mock_apply_model.assert_called_once()
+
+    # One stem per fixture source, each carrying the ones from the fake apply_model
+    assert set(result.keys()) == {"drums", "bass", "other", "vocals"}
+    for stem_name in ["drums", "bass", "other", "vocals"]:
+        assert result[stem_name].shape == (2, 100)
+        assert np.all(result[stem_name] == 1.0)
+
+    # A second call reuses the cached model: get_model is not called again
+    separator.separate_audio(audio_data, sample_rate=22050, segment_seconds=2.0)
+    assert mock_get_model.call_count == 1
+    assert mock_apply_model.call_count == 2
+
+
+@mock.patch("bandscope_analysis.separation.audio_separator.logger")
+@mock.patch("demucs.audio.convert_audio")
+@mock.patch("demucs.apply.apply_model")
+@mock.patch("demucs.pretrained.get_model")
+@mock.patch("torch.from_numpy")
+@mock.patch("torch.cuda.is_available")
+@mock.patch("torch.backends.mps.is_available")
+def test_audio_stem_separator_device(
+    mock_mps: mock.MagicMock,
+    mock_cuda: mock.MagicMock,
+    mock_from_numpy: mock.MagicMock,
+    mock_get_model: mock.MagicMock,
+    mock_apply_model: mock.MagicMock,
+    mock_convert_audio: mock.MagicMock,
+    mock_logger: mock.MagicMock,
+    mock_demucs_model: mock.MagicMock,
+) -> None:
+    """Test that device selection picks cuda, then mps, and otherwise falls back to cpu."""
+    # torch.from_numpy and convert_audio are mocked so no real tensors are created;
+    # the .to(device) calls therefore land on a MagicMock instead of PyTorch, which
+    # would fail for "cuda" on machines whose PyTorch build lacks CUDA support.
+    # Each scenario below toggles the availability mocks and re-runs the separation.
+    mock_get_model.return_value = mock_demucs_model
+
+    mock_tensor = mock.MagicMock()
+    mock_from_numpy.return_value.float.return_value = mock_tensor
+    mock_convert_audio.return_value = mock_tensor
+    mock_tensor.unsqueeze.return_value = mock_tensor
+    mock_tensor.to.return_value = mock_tensor
+
+    # Shape the apply_model result so stems[0].cpu().numpy() yields a (4, 2, 100) array
+    mock_stems_item = mock.MagicMock()
+    mock_stems_item.cpu.return_value.numpy.return_value = np.zeros((4, 2, 100))
+    mock_stems = mock.MagicMock()
+    mock_stems.__getitem__.return_value = mock_stems_item
+    mock_apply_model.return_value = mock_stems
+
+    separator = AudioStemSeparator(model_name="fake_model")
+    audio_data = np.zeros((2, 100))  # stereo input, shape (channels, samples)
+
+    # 1. CUDA available: the most recent .to() call on the mix tensor targets "cuda"
+    mock_cuda.return_value = True
+    mock_mps.return_value = False
+    result = separator.separate_audio(audio_data, sample_rate=22050, segment_seconds=2.0)
+    assert set(result.keys()) == {"drums", "bass", "other", "vocals"}
+    mock_tensor.to.assert_called_with("cuda")
+
+    # 2. No CUDA but MPS available: the mix tensor is moved to "mps"
+    mock_cuda.return_value = False
+    mock_mps.return_value = True
+    result = separator.separate_audio(audio_data, sample_rate=22050, segment_seconds=2.0)
+    mock_tensor.to.assert_called_with("mps")
+
+    # 3. Neither accelerator available: the mix tensor stays on "cpu"
+    mock_cuda.return_value = False
+    mock_mps.return_value = False
+    result = separator.separate_audio(audio_data, sample_rate=22050, segment_seconds=2.0)
+    mock_tensor.to.assert_called_with("cpu")
@@ -372,3 +372,72 @@ def analyze(self, path):
     assert cli.main() == 0
     res = json.loads(stdout.getvalue())
     assert res["jobId"] == "job-audio-success"
+
+
+def test_cli_main_temporal_analyzer_and_separator_mock_success(monkeypatch) -> None:
+    """Run cli.main() with the analyzer, audio loader and stem separator faked.
+
+    The CLI should exit with code 0 and echo the submitted job id in its JSON output.
+    """
+    import io
+    import json
+
+    import librosa
+    import numpy as np
+
+    import bandscope_analysis.separation.audio_separator as separator_module
+    from bandscope_analysis import cli
+
+    def silent_stereo():
+        """Return a fresh (2, 100) float32 buffer of zeros."""
+        return np.zeros((2, 100), dtype=np.float32)
+
+    class FakeAnalyzerSuccess:
+        """Temporal analyzer stub that always reports 120 BPM and no beats."""
+
+        def analyze(self, path):
+            return {"bpm": 120.0, "beats": []}
+
+    class FakeAudioStemSeparator:
+        """Separator stub that returns silence for each of the four stems."""
+
+        def separate_audio(self, audio, sample_rate, segment_seconds=2.0):
+            return {stem: silent_stereo() for stem in ("vocals", "drums", "bass", "other")}
+
+    def fake_librosa_load(path, sr, mono, duration):
+        """Stand in for librosa.load: hand back silence and the requested sample rate."""
+        return silent_stereo(), sr
+
+    # Job payload the CLI reads from stdin, assembled from its nested parts.
+    local_source = {
+        "sourcePath": "/valid/path.wav",
+        "fileName": "test.wav",
+        "extension": "wav",
+        "fileSizeBytes": 100,
+    }
+    request = {
+        "sourceKind": "local_audio",
+        "projectId": "p1",
+        "sourceLabel": "test.wav",
+        "roleFocus": [],
+        "localSource": local_source,
+    }
+    payload = {
+        "jobId": "job-audio-success-sep",
+        "request": request,
+    }
+    fake_stdin = io.StringIO(json.dumps(payload))
+    fake_stdout = io.StringIO()
+
+    # Install every fake before invoking the CLI entry point.
+    monkeypatch.setattr(librosa, "load", fake_librosa_load)
+    monkeypatch.setattr(separator_module, "AudioStemSeparator", FakeAudioStemSeparator)
+    monkeypatch.setattr(cli, "TemporalAnalyzer", FakeAnalyzerSuccess)
+    monkeypatch.setattr(cli.sys, "stdin", fake_stdin)
+    monkeypatch.setattr(cli.sys, "stdout", fake_stdout)
+    monkeypatch.setattr(cli.sys, "argv", ["cli.py"])
+
+    # main() must report success before its captured stdout is parsed as JSON.
+    assert cli.main() == 0
+    response = json.loads(fake_stdout.getvalue())
+    assert response["jobId"] == "job-audio-success-sep"