Skip to content
Closed
4 changes: 2 additions & 2 deletions packages/sdk/bun.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

132 changes: 132 additions & 0 deletions packages/sdk/examples/tts/chatterbox-enhanced.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
import {
loadModel,
textToSpeech,
unloadModel,
type ModelProgressUpdate,
TTS_TOKENIZER_EN_CHATTERBOX,
TTS_SPEECH_ENCODER_EN_CHATTERBOX_FP32,
TTS_EMBED_TOKENS_EN_CHATTERBOX_FP32,
TTS_CONDITIONAL_DECODER_EN_CHATTERBOX_FP32,
TTS_LANGUAGE_MODEL_EN_CHATTERBOX_FP32,
TTS_ENHANCER_BACKBONE_LAVASR_FP32,
TTS_ENHANCER_SPEC_HEAD_LAVASR_FP32,
TTS_DENOISER_LAVASR_FP32,
} from "@qvac/sdk";
import {
createWav,
playAudio,
int16ArrayToBuffer,
createWavHeader,
} from "./utils";

// Side-by-side demo: the same sentence is synthesized by Chatterbox TTS twice,
// first without and then with LavaSR neural speech enhancement. Each result is
// written to its own WAV file so the two can be compared by ear.
//
// Usage: node chatterbox-enhanced.js <referenceAudioSrc>
//
// The LavaSR models are loaded from the QVAC Registry automatically.

// First user-supplied CLI argument (argv[0] and argv[1] are the runtime and
// the script path).
const referenceAudioSrc = process.argv[2];

// A missing or empty argument is a usage error: print the hint and stop.
if (referenceAudioSrc === undefined || referenceAudioSrc === "") {
  console.error("Usage: node chatterbox-enhanced.js <referenceAudioSrc>");
  process.exit(1);
}

/**
 * Sample rate (Hz) used when saving/playing the un-enhanced pass.
 * NOTE(review): assumed to match what Chatterbox emits; the script does not verify it.
 */
const CHATTERBOX_SAMPLE_RATE = 24_000;

/**
 * Sample rate (Hz) used when saving/playing the LavaSR-enhanced pass.
 * NOTE(review): assumed to match the enhancer's output rate; the script does not verify it.
 */
const ENHANCED_SAMPLE_RATE = 48_000;

/** Sentence synthesized in both passes so the outputs are directly comparable. */
const SYNTHESIS_TEXT =
  "Hello! This sentence is synthesized twice, once at standard quality and once with LavaSR neural enhancement, so you can hear the difference.";

// Base Chatterbox model configuration shared by both passes: pass 1 uses it
// unchanged, pass 2 spreads it and adds an `enhancer` entry.
// Every `*Src` field is the `.src` of the matching SDK constant; the reference
// audio comes from the command line.
const chatterboxConfig = {
// `as const` keeps these two as literal types rather than widening to string.
ttsEngine: "chatterbox" as const,
language: "en" as const,
ttsTokenizerSrc: TTS_TOKENIZER_EN_CHATTERBOX.src,
ttsSpeechEncoderSrc: TTS_SPEECH_ENCODER_EN_CHATTERBOX_FP32.src,
ttsEmbedTokensSrc: TTS_EMBED_TOKENS_EN_CHATTERBOX_FP32.src,
ttsConditionalDecoderSrc: TTS_CONDITIONAL_DECODER_EN_CHATTERBOX_FP32.src,
ttsLanguageModelSrc: TTS_LANGUAGE_MODEL_EN_CHATTERBOX_FP32.src,
referenceAudioSrc,
};

/**
 * Progress callback passed to `loadModel`; prints every update object as-is.
 *
 * @param progress - Update object delivered by the SDK.
 */
function onProgress(progress: ModelProgressUpdate): void {
  console.log(progress);
}

/**
 * Persists the synthesized samples via `createWav`, logs the file name, then
 * assembles an in-memory WAV (header followed by the sample bytes) and hands
 * it to `playAudio`.
 *
 * @param samples - Audio samples returned by `textToSpeech`.
 * @param sampleRate - Sample rate in Hz, used for both the file and the header.
 * @param filename - Name passed to `createWav` for the saved file.
 */
function saveAndPlay(samples: number[], sampleRate: number, filename: string): void {
  createWav(samples, sampleRate, filename);
  console.log(`Saved ${filename}`);

  // The header is built from the byte length of the converted buffer,
  // not from the sample count.
  const pcmBytes = int16ArrayToBuffer(samples);
  const header = createWavHeader(pcmBytes.length, sampleRate);
  playAudio(Buffer.concat([header, pcmBytes]));
}

try {
  // ---------------------------------------------------------------------
  // Pass 1: plain Chatterbox, no enhancer in the model config.
  // ---------------------------------------------------------------------
  console.log("\n--- Pass 1: Raw Chatterbox (24 kHz) ---\n");

  const rawModelId = await loadModel({
    modelSrc: TTS_TOKENIZER_EN_CHATTERBOX.src,
    modelType: "tts",
    modelConfig: chatterboxConfig,
    onProgress,
  });

  // Non-streaming request: the whole result is awaited via `.buffer`.
  const rawSamples = await textToSpeech({
    modelId: rawModelId,
    text: SYNTHESIS_TEXT,
    inputType: "text",
    stream: false,
  }).buffer;

  console.log(`Raw TTS complete. ${rawSamples.length} samples @ ${CHATTERBOX_SAMPLE_RATE} Hz`);
  saveAndPlay(rawSamples, CHATTERBOX_SAMPLE_RATE, "tts-raw-output.wav");

  // The raw model is unloaded before the enhanced one is loaded.
  await unloadModel({ modelId: rawModelId });
  console.log("Raw model unloaded.\n");

  // ---------------------------------------------------------------------
  // Pass 2: same base config, plus the LavaSR enhancer.
  // ---------------------------------------------------------------------
  console.log("--- Pass 2: Chatterbox + LavaSR enhancement (48 kHz) ---\n");

  const enhancedModelId = await loadModel({
    modelSrc: TTS_TOKENIZER_EN_CHATTERBOX.src,
    modelType: "tts",
    // Kept inline so the `type` discriminator is checked against the SDK's
    // expected config type instead of widening to `string`.
    modelConfig: {
      ...chatterboxConfig,
      enhancer: {
        type: "lavasr",
        enhance: true,
        denoise: true,
        backboneSrc: TTS_ENHANCER_BACKBONE_LAVASR_FP32.src,
        specHeadSrc: TTS_ENHANCER_SPEC_HEAD_LAVASR_FP32.src,
        denoiserSrc: TTS_DENOISER_LAVASR_FP32.src,
      },
    },
    onProgress,
  });

  const enhancedSamples = await textToSpeech({
    modelId: enhancedModelId,
    text: SYNTHESIS_TEXT,
    inputType: "text",
    stream: false,
  }).buffer;

  console.log(`Enhanced TTS complete. ${enhancedSamples.length} samples @ ${ENHANCED_SAMPLE_RATE} Hz`);
  saveAndPlay(enhancedSamples, ENHANCED_SAMPLE_RATE, "tts-enhanced-output.wav");

  await unloadModel({ modelId: enhancedModelId });
  console.log("Enhanced model unloaded.");

  // Summary of the two files written above.
  console.log("\n--- Done ---");
  console.log("Compare the two files:");
  console.log("  tts-raw-output.wav      -- 24 kHz standard Chatterbox");
  console.log("  tts-enhanced-output.wav -- 48 kHz with LavaSR enhancement");

  process.exit(0);
} catch (err) {
  // Any failure in either pass lands here and ends the run with a non-zero code.
  console.error("❌ Error:", err);
  process.exit(1);
}
2 changes: 1 addition & 1 deletion packages/sdk/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,7 @@
"@qvac/transcription-parakeet": "^0.3.1",
"@qvac/transcription-whispercpp": "^0.6.1",
"@qvac/translation-nmtcpp": "^1.0.1",
"@qvac/tts-onnx": "^0.8.2",
"@qvac/tts-onnx": "^0.8.3",
"fast-safe-stringify": "2.1.1",
"which-runtime": "^1.3.2",
"zod": "^4.0.17"
Expand Down
30 changes: 30 additions & 0 deletions packages/sdk/schemas/text-to-speech.ts
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,33 @@ export const TTS_LANGUAGES = [

// Zod enum that restricts a language value to the entries of TTS_LANGUAGES.
const ttsLanguageSchema = z.enum(TTS_LANGUAGES);

/**
 * Runtime-level LavaSR enhancer options: the "lavasr" discriminator plus two
 * optional boolean switches. No model sources are part of this shape.
 * NOTE(review): `enhance` / `denoise` presumably toggle the corresponding
 * LavaSR stages; confirm against the @qvac/tts-onnx documentation.
 */
const lavaSREnhancerRuntimeSchema = z.object({
type: z.literal("lavasr"),
enhance: z.boolean().optional(),
denoise: z.boolean().optional(),
});

/**
 * Runtime enhancer config, discriminated on `type`.
 * LavaSR is currently the only member of the union.
 */
const ttsEnhancerRuntimeConfigSchema = z.discriminatedUnion("type", [
lavaSREnhancerRuntimeSchema,
]);

/**
 * Full LavaSR enhancer config: the runtime options extended with model
 * sources. `backboneSrc` and `specHeadSrc` are required; `denoiserSrc` is
 * optional at this level.
 */
export const lavaSREnhancerConfigSchema = lavaSREnhancerRuntimeSchema.extend({
backboneSrc: modelSrcInputSchema,
specHeadSrc: modelSrcInputSchema,
denoiserSrc: modelSrcInputSchema.optional(),
});

/**
 * Full enhancer config, discriminated on `type`, with one cross-field rule:
 * a LavaSR config that sets `denoise` to a truthy value must also provide
 * `denoiserSrc`. A violation is reported on the `denoiserSrc` path.
 */
export const ttsEnhancerConfigSchema = z
  .discriminatedUnion("type", [lavaSREnhancerConfigSchema])
  .refine(
    (enhancer) => {
      // The rule only concerns LavaSR configs.
      if (enhancer.type !== "lavasr") {
        return true;
      }
      // Denoising not requested: no denoiser model is needed.
      if (!enhancer.denoise) {
        return true;
      }
      return enhancer.denoiserSrc !== undefined;
    },
    { message: "denoiserSrc is required when denoise is true", path: ["denoiserSrc"] },
  );

/**
 * Runtime config for the Chatterbox engine: the "chatterbox" engine literal,
 * a language validated by `ttsLanguageSchema`, and an optional runtime
 * enhancer config (no model sources at this level).
 */
export const ttsChatterboxRuntimeConfigSchema = z.object({
ttsEngine: z.literal("chatterbox"),
language: ttsLanguageSchema,
enhancer: ttsEnhancerRuntimeConfigSchema.optional(),
});

export const ttsSupertonicRuntimeConfigSchema = z.object({
Expand All @@ -22,6 +46,7 @@ export const ttsSupertonicRuntimeConfigSchema = z.object({
ttsSpeed: z.number().optional(),
ttsNumInferenceSteps: z.number().optional(),
ttsSupertonicMultilingual: z.boolean().optional(),
enhancer: ttsEnhancerRuntimeConfigSchema.optional(),
});

export const ttsRuntimeConfigSchema = z.union([
Expand All @@ -36,6 +61,7 @@ export const ttsChatterboxConfigSchema = ttsChatterboxRuntimeConfigSchema.extend
ttsConditionalDecoderSrc: modelSrcInputSchema,
ttsLanguageModelSrc: modelSrcInputSchema,
referenceAudioSrc: modelSrcInputSchema,
enhancer: ttsEnhancerConfigSchema.optional(),
});

export const ttsSupertonicConfigSchema = ttsSupertonicRuntimeConfigSchema.extend({
Expand All @@ -46,6 +72,7 @@ export const ttsSupertonicConfigSchema = ttsSupertonicRuntimeConfigSchema.extend
ttsUnicodeIndexerSrc: modelSrcInputSchema,
ttsTtsConfigSrc: modelSrcInputSchema,
ttsVoiceStyleSrc: modelSrcInputSchema,
enhancer: ttsEnhancerConfigSchema.optional(),
});

export const ttsConfigSchema = z.union([
Expand Down Expand Up @@ -87,6 +114,9 @@ export type TtsSupertonicRuntimeConfig = z.infer<
typeof ttsSupertonicRuntimeConfigSchema
>;
/** Type inferred from `ttsRuntimeConfigSchema`. */
export type TtsRuntimeConfig = z.infer<typeof ttsRuntimeConfigSchema>;
/** Enhancer options without model sources; inferred from `ttsEnhancerRuntimeConfigSchema`. */
export type TtsEnhancerRuntimeConfig = z.infer<typeof ttsEnhancerRuntimeConfigSchema>;
/** Enhancer options including model sources; inferred from `ttsEnhancerConfigSchema`. */
export type TtsEnhancerConfig = z.infer<typeof ttsEnhancerConfigSchema>;
/** LavaSR-specific enhancer config; inferred from `lavaSREnhancerConfigSchema`. */
export type LavaSREnhancerConfig = z.infer<typeof lavaSREnhancerConfigSchema>;

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does this actually need to be a public export?

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It doesn't seem to be used anywhere, and I don't see an obvious use for it.

/** Type inferred from `ttsClientParamsSchema`. */
export type TtsClientParams = z.infer<typeof ttsClientParamsSchema>;
/** Type inferred from `ttsRequestSchema`. */
export type TtsRequest = z.infer<typeof ttsRequestSchema>;
/** Type inferred from `ttsResponseSchema`. */
export type TtsResponse = z.infer<typeof ttsResponseSchema>;
Expand Down
Loading
Loading