Skip to content
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
129 changes: 75 additions & 54 deletions .github/workflows/benchmark-qvac-lib-infer-parakeet.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,14 @@ on:
- ctc
- eou
- sortformer
- all
max_samples:
description: 'Max samples per test (0 = unlimited)'
required: false
type: number
default: 50
schedule:
- cron: '0 4 * * 0'
Comment thread
sharmaraju352 marked this conversation as resolved.
Outdated

env:
PKG_DIR: packages/qvac-lib-infer-parakeet
Expand All @@ -34,7 +37,7 @@ jobs:
- name: Generate matrix based on configuration
id: set-matrix
run: |
MODEL_TYPE="${{ github.event.inputs.model_type || 'tdt' }}"
MODEL_TYPE="${{ github.event.inputs.model_type || 'all' }}"
MAX_SAMPLES="${{ github.event.inputs.max_samples || '50' }}"

MATRIX=$(python3 <<PYEOF
Expand All @@ -43,48 +46,59 @@ jobs:
model_type = "${MODEL_TYPE}"
max_samples = int("${MAX_SAMPLES}")

# Model directory mapping
model_dirs = {
"tdt": "parakeet-tdt-0.6b-v3-onnx",
"ctc": "parakeet-ctc-0.6b-onnx",
"eou": "parakeet-eou-120m-v1-onnx",
"sortformer": "sortformer-4spk-v2-onnx"
}

# Sortformer is a diarization model — only run English batch (no WER/CER)
if model_type == "sortformer":
test_configs = [
{"language": "english", "dataset_type": "librispeech", "use_gpu": False, "streaming": False},
{"language": "english", "dataset_type": "librispeech", "use_gpu": True, "streaming": False},
]
else:
# All test configurations to run in parallel
test_configs = [
# English LibriSpeech tests
{"language": "english", "dataset_type": "librispeech", "use_gpu": False, "streaming": False},
{"language": "english", "dataset_type": "librispeech", "use_gpu": True, "streaming": False},
{"language": "english", "dataset_type": "librispeech", "use_gpu": False, "streaming": True},
# Multi-language FLEURS tests
{"language": "french", "dataset_type": "fleurs", "use_gpu": False, "streaming": False},
{"language": "german", "dataset_type": "fleurs", "use_gpu": False, "streaming": False},
{"language": "spanish", "dataset_type": "fleurs", "use_gpu": False, "streaming": False},
]
model_types = [model_type] if model_type != "all" else ["tdt", "ctc", "eou", "sortformer"]

def get_configs_for_type(mt):
    """Return the ordered list of benchmark test configurations for model type mt.

    Every configuration is a dict with the keys "language", "dataset_type",
    "use_gpu" and "streaming".

    - "sortformer": English LibriSpeech in batch mode, on CPU and on GPU.
    - "eou": English LibriSpeech streaming on CPU and GPU, then CPU batch.
    - "ctc": English LibriSpeech batch on CPU and GPU, then CPU streaming.
    - any other value (in practice "tdt"): the same three English LibriSpeech
      runs as "ctc", followed by French, German and Spanish FLEURS batch runs
      on CPU.
    """

    def librispeech_en(use_gpu, streaming):
        # One English LibriSpeech run with the given device / mode flags.
        return {"language": "english", "dataset_type": "librispeech", "use_gpu": use_gpu, "streaming": streaming}

    def fleurs_cpu_batch(language):
        # One FLEURS run for the given language; always CPU, always batch.
        return {"language": language, "dataset_type": "fleurs", "use_gpu": False, "streaming": False}

    if mt == "sortformer":
        return [
            librispeech_en(False, False),
            librispeech_en(True, False),
        ]
    if mt == "eou":
        return [
            librispeech_en(False, True),
            librispeech_en(True, True),
            librispeech_en(False, False),
        ]
    if mt == "ctc":
        return [
            librispeech_en(False, False),
            librispeech_en(True, False),
            librispeech_en(False, True),
        ]
    # Fallback branch: used for "tdt" and for any value not matched above.
    return [
        librispeech_en(False, False),
        librispeech_en(True, False),
        librispeech_en(False, True),
        fleurs_cpu_batch("french"),
        fleurs_cpu_batch("german"),
        fleurs_cpu_batch("spanish"),
    ]

include = []
for i, cfg in enumerate(test_configs):
mode_str = "streaming" if cfg["streaming"] else "batch"
gpu_str = "gpu" if cfg["use_gpu"] else "cpu"
variant = f"{cfg['language']}-{cfg['dataset_type']}-{gpu_str}-{mode_str}"
include.append({
"model_variant": variant,
"model_dir": model_dirs[model_type],
"model_type": model_type,
"use_gpu": cfg["use_gpu"],
"language": cfg["language"],
"dataset_type": cfg["dataset_type"],
"streaming": cfg["streaming"],
"max_samples": max_samples
})
# Expand every selected model type into one matrix entry per test configuration.
for mt in model_types:
    for cfg in get_configs_for_type(mt):
        mode_str = "streaming" if cfg["streaming"] else "batch"
        gpu_str = "gpu" if cfg["use_gpu"] else "cpu"
        # The model type is part of the variant name: several model types share
        # identical language/dataset/device/mode combinations.
        variant = f"{mt}-{cfg['language']}-{cfg['dataset_type']}-{gpu_str}-{mode_str}"
        include.append({
            "model_variant": variant,
            "model_dir": model_dirs[mt],
            "model_type": mt,
            "use_gpu": cfg["use_gpu"],
            "language": cfg["language"],
            "dataset_type": cfg["dataset_type"],
            "streaming": cfg["streaming"],
            "max_samples": max_samples
        })

variants = [cfg["model_variant"] for cfg in include]
matrix = {
Expand Down Expand Up @@ -188,7 +202,7 @@ jobs:
- name: Set up Python
uses: actions/setup-python@v6
with:
python-version: '3.14'
python-version: '3.13'

- name: Install Poetry
run: |
Expand All @@ -201,6 +215,8 @@ jobs:
npm install -g bare bare-make

- name: Download prebuilds
id: prebuilds
continue-on-error: true
env:
GH_TOKEN: ${{ secrets.PAT_TOKEN }}
run: |
Expand All @@ -211,7 +227,7 @@ jobs:
RUN_ID=$(gh run list --workflow=prebuilds-qvac-lib-infer-parakeet.yml --status=success --limit 1 --json databaseId --jq '.[0].databaseId')

if [ -z "$RUN_ID" ]; then
echo "No successful prebuilds run found"
echo "::warning::No successful prebuilds run found — will fall back to published npm package"
exit 1
fi

Expand Down Expand Up @@ -257,8 +273,8 @@ jobs:
curl -L -o decoder_joint.onnx "${EOU_REPO}/decoder_joint.onnx"
curl -L -o tokenizer.json "${EOU_REPO}/tokenizer.json"
elif [ "$MODEL_TYPE" = "sortformer" ]; then
REPO_URL="https://huggingface.co/tetherto/sortformer-4spk-v2-onnx/resolve/main"
curl -L -o sortformer.onnx "${REPO_URL}/sortformer.onnx"
REPO_URL="https://huggingface.co/cgus/diar_streaming_sortformer_4spk-v2-onnx/resolve/main"
# --fail: exit non-zero on an HTTP error instead of saving the error page as sortformer.onnx
curl -fL -o sortformer.onnx "${REPO_URL}/diar_streaming_sortformer_4spk-v2.onnx"
else
echo "Model type ${MODEL_TYPE} not supported for auto-download"
exit 1
Expand All @@ -280,13 +296,15 @@ jobs:
STREAMING_CHUNK_SIZE="${{ github.event.inputs.streaming_chunk_size || '64000' }}"
MAX_SAMPLES="${{ matrix.max_samples }}"

CONFIG_FILE="config.yaml"
# Select the appropriate config file for this model type.
# ctc, eou and sortformer each map to a dedicated config-<type>.yaml;
# every other value (including tdt) falls back to config.yaml.
case "$MODEL_TYPE" in
ctc) CONFIG_FILE="config-ctc.yaml" ;;
eou) CONFIG_FILE="config-eou.yaml" ;;
sortformer) CONFIG_FILE="config-sortformer.yaml" ;;
*) CONFIG_FILE="config.yaml" ;;
esac

echo "Configuring benchmark:"
echo " Dataset: $DATASET_TYPE / $DATASET_LANGUAGE"
echo " Model: $MODEL_TYPE ($MODEL_DIR)"
echo " GPU: $USE_GPU, Streaming: $STREAMING_MODE"
echo " Max samples: $MAX_SAMPLES"
echo "Using config template: ${CONFIG_FILE}"

# Set timeout based on model type
case "$MODEL_TYPE" in
Expand Down Expand Up @@ -316,25 +334,28 @@ jobs:
sed -i "/^model:/,/^[^ ]/ s| streaming:.*| streaming: ${STREAMING_MODE}|" "$CONFIG_PATH"
sed -i "/^model:/,/^[^ ]/ s| streaming_chunk_size:.*| streaming_chunk_size: ${STREAMING_CHUNK_SIZE}|" "$CONFIG_PATH"

# Sortformer is a diarization model — disable WER/CER metrics
if [ "$MODEL_TYPE" = "sortformer" ]; then
sed -i "/^wer:/,/^[^ ]/ s| enabled:.*| enabled: false|" "$CONFIG_PATH"
sed -i "/^cer:/,/^[^ ]/ s| enabled:.*| enabled: false|" "$CONFIG_PATH"
fi

echo "=== Updated ${CONFIG_FILE} ==="
cat "$CONFIG_PATH"

# Export the config file name so later steps can read it as env.BENCHMARK_CONFIG_FILE.
# The redirection target is quoted so a runner path containing spaces cannot split it.
echo "BENCHMARK_CONFIG_FILE=${CONFIG_FILE}" >> "$GITHUB_ENV"

- name: Install main package dependencies
if: steps.prebuilds.outcome == 'success'
working-directory: ${{ env.PKG_DIR }}
run: npm install

- name: Install benchmark server dependencies
working-directory: ${{ env.PKG_DIR }}/benchmarks/server
run: |
npm install
# Install the main package from the local repo (uses current branch code)
npm install ../../
if [ "${{ steps.prebuilds.outcome }}" = "success" ]; then
echo "Installing addon from local source (prebuilds available)..."
npm install ../../
else
echo "Installing addon from npm (prebuilds not available)..."
npm install @qvac/transcription-parakeet@latest
fi

- name: Install benchmark client dependencies
working-directory: ${{ env.PKG_DIR }}/benchmarks/client
Expand All @@ -359,8 +380,8 @@ jobs:
PYTHONUNBUFFERED: "1"
HF_DATASETS_CACHE: /tmp/hf_cache
run: |
echo "Running benchmark with config: config.yaml"
poetry run python -u -m src.parakeet.main --config config/config.yaml
echo "Running benchmark with config: ${{ env.BENCHMARK_CONFIG_FILE }}"
poetry run python -u -m src.parakeet.main --config config/${{ env.BENCHMARK_CONFIG_FILE }}
rm -rf /tmp/hf_cache || true

- name: Stop benchmark server
Expand Down
51 changes: 45 additions & 6 deletions packages/qvac-lib-infer-parakeet/benchmarks/client/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -101,17 +101,41 @@ model:

## Usage

Run the benchmark with:
Run the benchmark with the default TDT config:

```bash
poetry run python -m src.parakeet.main --config config/config.yaml
```

### Per-Model Config Files

Each model type has a dedicated config file with appropriate defaults:

| Config File | Model Type | Description |
|-------------|------------|-------------|
| `config/config.yaml` | TDT | Token-and-Duration Transducer (default) |
| `config/config-ctc.yaml` | CTC | Connectionist Temporal Classification |
| `config/config-eou.yaml` | EOU | End-of-Utterance streaming model |
| `config/config-sortformer.yaml` | Sortformer | Speaker diarization (WER/CER disabled) |

Run a specific model benchmark:

```bash
# CTC benchmark
poetry run python -m src.parakeet.main --config config/config-ctc.yaml

# EOU benchmark (streaming enabled by default)
poetry run python -m src.parakeet.main --config config/config-eou.yaml

# Sortformer benchmark (diarization, no WER/CER)
poetry run python -m src.parakeet.main --config config/config-sortformer.yaml
```

The client will:

1. Load the specified dataset (LibriSpeech or FLEURS) and convert it to raw audio files
2. Send paths to audio files to the server for transcription
3. Calculate WER and CER scores
3. Calculate WER and CER scores (when enabled)
4. Report timing statistics

### Using Different Datasets and Languages
Expand All @@ -133,6 +157,21 @@ model:
model_type: "ctc" # or "tdt", "eou", "sortformer"
```

### Trigger Script

Trigger benchmarks from the command line using the script in `../../scripts/`:

```bash
# Trigger a single model type
../../scripts/trigger-benchmark.sh -t ctc

# Trigger all model types in one run
../../scripts/trigger-benchmark.sh -t all

# With custom sample count and watch mode
../../scripts/trigger-benchmark.sh -t eou -m 100 -W
```

## Output

- WER score (if enabled)
Expand All @@ -153,10 +192,10 @@ poetry run python -m pytest tests/ -v

| Type | Description | Best For |
|------|-------------|----------|
| `tdt` | Token-and-Duration Transducer | General purpose, accurate |
| `ctc` | Connectionist Temporal Classification | Faster inference |
| `eou` | End-of-Utterance | Live transcription with end detection |
| `sortformer` | Sortformer architecture | Advanced use cases |
| `tdt` | Token-and-Duration Transducer | General purpose, multilingual, accurate |
| `ctc` | Connectionist Temporal Classification | English-only, faster inference |
| `eou` | End-of-Utterance | Streaming, low latency with utterance detection |
| `sortformer` | Sortformer architecture | Speaker diarization (no WER/CER metrics) |

## Acknowledgments

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Benchmark client configuration for the CTC model (model_type: "ctc").
# NOTE(review): nesting was reconstructed from a flattened view; confirm that
# `lib` belongs under `server` and that the loader expects this layout.
server:
  url: "http://localhost:8080/run"
  timeout: 60
  batch_size: 10
  lib: "@qvac/transcription-parakeet"
dataset:
  dataset_type: "librispeech"
  speaker_group: "clean"
  language: "english"
  max_samples: 0
wer:
  enabled: true
cer:
  enabled: true
model:
  path: "./models/parakeet-ctc-0.6b-onnx"
  sample_rate: 16000
  audio_format: "s16le"
  model_type: "ctc"
  max_threads: 4
  use_gpu: false
  caption_enabled: false
  timestamps_enabled: true
  streaming: false
  streaming_chunk_size: 64000
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Benchmark client configuration for the EOU model (model_type: "eou").
# Streaming is enabled by default in this file.
# NOTE(review): nesting was reconstructed from a flattened view; confirm that
# `lib` belongs under `server` and that the loader expects this layout.
server:
  url: "http://localhost:8080/run"
  timeout: 120
  batch_size: 10
  lib: "@qvac/transcription-parakeet"
dataset:
  dataset_type: "librispeech"
  speaker_group: "clean"
  language: "english"
  max_samples: 0
wer:
  enabled: true
cer:
  enabled: true
model:
  path: "./models/parakeet-eou-120m-v1-onnx"
  sample_rate: 16000
  audio_format: "s16le"
  model_type: "eou"
  max_threads: 4
  use_gpu: false
  caption_enabled: false
  timestamps_enabled: true
  streaming: true
  streaming_chunk_size: 64000
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Benchmark client configuration for the Sortformer model
# (model_type: "sortformer"). WER and CER are both disabled in this file.
# NOTE(review): nesting was reconstructed from a flattened view; confirm that
# `lib` belongs under `server` and that the loader expects this layout.
server:
  url: "http://localhost:8080/run"
  timeout: 180
  batch_size: 10
  lib: "@qvac/transcription-parakeet"
dataset:
  dataset_type: "librispeech"
  speaker_group: "clean"
  language: "english"
  max_samples: 0
wer:
  enabled: false
cer:
  enabled: false
model:
  path: "./models/sortformer-4spk-v2-onnx"
  sample_rate: 16000
  audio_format: "s16le"
  model_type: "sortformer"
  max_threads: 4
  use_gpu: false
  caption_enabled: false
  timestamps_enabled: true
  streaming: false
  streaming_chunk_size: 64000
Loading
Loading