Skip to content

Commit 5705bfe

Browse files
authored
Merge pull request #35 from AssemblyAI/E07417BDFEA3614F5967B1520F8B2F61
Sync from internal repo (2024/02/15)
2 parents 24d94a8 + bfb0089 commit 5705bfe

File tree

7 files changed

+167
-6
lines changed

7 files changed

+167
-6
lines changed

CHANGELOG.md

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,19 @@
11
# Changelog
22

3+
## [4.3.0] - 2024-02-15
4+
5+
### Added
6+
7+
- Add `RealtimeTranscriber.configureEndUtteranceSilenceThreshold` function
8+
- Add `RealtimeTranscriber.forceEndUtterance` function
9+
- Add `end_utterance_silence_threshold` property to `CreateRealtimeTranscriberParams` and `RealtimeTranscriberParams` types
10+
11+
## [4.2.3] - 2024-02-13
12+
13+
### Added
14+
15+
- Add `speech_model` field to `TranscriptParams` and add `SpeechModel` type.
16+
317
## [4.2.2] - 2024-01-29
418

519
### Changed

package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"name": "assemblyai",
3-
"version": "4.2.3",
3+
"version": "4.3.0",
44
"description": "The AssemblyAI JavaScript SDK provides an easy-to-use interface for interacting with the AssemblyAI API, which supports async and real-time transcription, as well as the latest LeMUR models.",
55
"engines": {
66
"node": ">=18"

scripts/kitchensink.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ const client = new AssemblyAI({
2626
})
2727
: undefined,
2828
encoding: "pcm_s16le",
29+
end_utterance_silence_threshold: 500,
2930
};
3031
const rt = client.realtime.transcriber(serviceParams);
3132

src/services/realtime/service.ts

Lines changed: 59 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,26 @@ import {
2020
} from "../../utils/errors";
2121

2222
const defaultRealtimeUrl = "wss://api.assemblyai.com/v2/realtime/ws";
23+
const forceEndOfUtteranceMessage = `{"force_end_utterance":true}`;
24+
const terminateSessionMessage = `{"terminate_session":true}`;
25+
26+
type BufferLike =
27+
| string
28+
| Buffer
29+
| DataView
30+
| number
31+
| ArrayBufferView
32+
| Uint8Array
33+
| ArrayBuffer
34+
| SharedArrayBuffer
35+
| ReadonlyArray<unknown>
36+
| ReadonlyArray<number>
37+
| { valueOf(): ArrayBuffer }
38+
| { valueOf(): SharedArrayBuffer }
39+
| { valueOf(): Uint8Array }
40+
| { valueOf(): ReadonlyArray<number> }
41+
| { valueOf(): string }
42+
| { [Symbol.toPrimitive](hint: string): string };
2343

2444
export class RealtimeTranscriber {
2545
private realtimeUrl: string;
@@ -28,6 +48,7 @@ export class RealtimeTranscriber {
2848
private encoding?: AudioEncoding;
2949
private apiKey?: string;
3050
private token?: string;
51+
private end_utterance_silence_threshold?: number;
3152
private socket?: WebSocket;
3253
private listeners: RealtimeListeners = {};
3354
private sessionTerminatedResolve?: () => void;
@@ -37,6 +58,8 @@ export class RealtimeTranscriber {
3758
this.sampleRate = params.sampleRate ?? 16_000;
3859
this.wordBoost = params.wordBoost;
3960
this.encoding = params.encoding;
61+
this.end_utterance_silence_threshold =
62+
params.end_utterance_silence_threshold;
4063
if ("token" in params && params.token) this.token = params.token;
4164
if ("apiKey" in params && params.apiKey) this.apiKey = params.apiKey;
4265

@@ -105,6 +128,18 @@ export class RealtimeTranscriber {
105128
}
106129
this.socket.binaryType = "arraybuffer";
107130

131+
this.socket.onopen = () => {
132+
if (
133+
this.end_utterance_silence_threshold === undefined ||
134+
this.end_utterance_silence_threshold === null
135+
) {
136+
return;
137+
}
138+
this.configureEndUtteranceSilenceThreshold(
139+
this.end_utterance_silence_threshold
140+
);
141+
};
142+
108143
this.socket.onclose = ({ code, reason }: CloseEvent) => {
109144
if (!reason) {
110145
if (code in RealtimeErrorType) {
@@ -159,10 +194,7 @@ export class RealtimeTranscriber {
159194
}
160195

161196
sendAudio(audio: AudioData) {
162-
if (!this.socket || this.socket.readyState !== WebSocket.OPEN) {
163-
throw new Error("Socket is not open for communication");
164-
}
165-
this.socket.send(audio);
197+
this.send(audio);
166198
}
167199

168200
stream(): WritableStream<AudioData> {
@@ -173,10 +205,32 @@ export class RealtimeTranscriber {
173205
});
174206
}
175207

208+
/**
209+
* Manually end an utterance
210+
*/
211+
forceEndUtterance() {
212+
this.send(forceEndOfUtteranceMessage);
213+
}
214+
215+
/**
216+
* Configure the threshold for how long to wait before ending an utterance. Default is 700ms.
217+
* @param threshold The duration of the end utterance silence threshold in milliseconds
218+
* @format integer
219+
*/
220+
configureEndUtteranceSilenceThreshold(threshold: number) {
221+
this.send(`{"end_utterance_silence_threshold":${threshold}}`);
222+
}
223+
224+
private send(data: BufferLike) {
225+
if (!this.socket || this.socket.readyState !== WebSocket.OPEN) {
226+
throw new Error("Socket is not open for communication");
227+
}
228+
this.socket.send(data);
229+
}
230+
176231
async close(waitForSessionTermination = true) {
177232
if (this.socket) {
178233
if (this.socket.readyState === WebSocket.OPEN) {
179-
const terminateSessionMessage = `{"terminate_session": true}`;
180234
if (waitForSessionTermination) {
181235
const sessionTerminatedPromise = new Promise<void>((resolve) => {
182236
this.sessionTerminatedResolve = resolve;

src/types/asyncapi.generated.ts

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,12 @@ export type AudioData = ArrayBufferLike;
2828
*/
2929
export type AudioEncoding = "pcm_s16le" | "pcm_mulaw";
3030

31+
/** @description Configure the threshold for how long to wait before ending an utterance. Default is 700ms. */
32+
export type ConfigureEndUtteranceSilenceThreshold = {
33+
/** @description The duration threshold in milliseconds */
34+
end_utterance_silence_threshold: number;
35+
};
36+
3137
export type FinalTranscript = RealtimeBaseTranscript & {
3238
/**
3339
* @description Describes the type of message
@@ -40,6 +46,12 @@ export type FinalTranscript = RealtimeBaseTranscript & {
4046
text_formatted: boolean;
4147
};
4248

49+
/** @description Manually end an utterance */
50+
export type ForceEndUtterance = {
51+
/** @description A boolean value to communicate that you wish to force the end of the utterance */
52+
force_end_utterance: boolean;
53+
};
54+
4355
/** @enum {string} */
4456
export type MessageType =
4557
| "SessionBegins"

src/types/realtime/index.ts

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,15 +7,38 @@ import {
77
} from "../asyncapi.generated";
88

99
type CreateRealtimeTranscriberParams = {
10+
/**
11+
* The WebSocket URL that the RealtimeTranscriber connects to
12+
*/
1013
realtimeUrl?: string;
14+
/**
15+
* The sample rate of the streamed audio
16+
*/
1117
sampleRate?: number;
18+
/**
19+
* Add up to 2500 characters of custom vocabulary
20+
*/
1221
wordBoost?: string[];
22+
/**
23+
* The encoding of the audio data
24+
*/
1325
encoding?: AudioEncoding;
26+
/**
27+
* The duration of the end utterance silence threshold in milliseconds
28+
*/
29+
end_utterance_silence_threshold?: number;
1430
} & (
1531
| {
32+
/**
33+
* The API key used to authenticate the RealtimeTranscriber
34+
* Using an API key to authenticate the RealtimeTranscriber is not supported in the browser.
35+
*/
1636
apiKey?: string;
1737
}
1838
| {
39+
/**
40+
* The temporary token used to authenticate the RealtimeTranscriber
41+
*/
1942
token: string;
2043
}
2144
);
@@ -26,15 +49,38 @@ type CreateRealtimeTranscriberParams = {
2649
type CreateRealtimeServiceParams = CreateRealtimeTranscriberParams;
2750

2851
type RealtimeTranscriberParams = {
52+
/**
53+
* The WebSocket URL that the RealtimeTranscriber connects to
54+
*/
2955
realtimeUrl?: string;
56+
/**
57+
* The sample rate of the streamed audio
58+
*/
3059
sampleRate?: number;
60+
/**
61+
* Add up to 2500 characters of custom vocabulary
62+
*/
3163
wordBoost?: string[];
64+
/**
65+
* The encoding of the audio data
66+
*/
3267
encoding?: AudioEncoding;
68+
/**
69+
* The duration of the end utterance silence threshold in milliseconds
70+
*/
71+
end_utterance_silence_threshold?: number;
3372
} & (
3473
| {
74+
/**
75+
* The API key used to authenticate the RealtimeTranscriber.
76+
* Using an API key to authenticate the RealtimeTranscriber is not supported in the browser.
77+
*/
3578
apiKey: string;
3679
}
3780
| {
81+
/**
82+
* The temporary token used to authenticate the RealtimeTranscriber
83+
*/
3884
token: string;
3985
}
4086
);

tests/realtime.test.ts

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,12 @@ describe("realtime", () => {
5757
WS.clean();
5858
}
5959

60+
it("fails without API key and token", async () => {
61+
expect(() => new RealtimeTranscriber({ apiKey: "" })).toThrowError(
62+
"API key or temporary token is required."
63+
);
64+
});
65+
6066
it("fails on redundant connection", async () => {
6167
await expect(async () => await rt.connect()).rejects.toThrowError(
6268
"Already connected"
@@ -149,6 +155,34 @@ describe("realtime", () => {
149155
await expect(server).toReceiveMessage(data);
150156
});
151157

158+
it("creates service with EndUtteranceSilenceThreshold", async () => {
159+
const realtimeUrl = "wss://localhost:5678";
160+
const server = new WS(realtimeUrl);
161+
const aai = createClient();
162+
const rt = aai.realtime.transcriber({
163+
realtimeUrl,
164+
apiKey: "123",
165+
end_utterance_silence_threshold: 500,
166+
});
167+
await connect(rt, server);
168+
await expect(server).toReceiveMessage(
169+
`{"end_utterance_silence_threshold":500}`
170+
);
171+
await close(rt, server);
172+
});
173+
174+
it("can set EndUtteranceSilenceThreshold", async () => {
175+
rt.configureEndUtteranceSilenceThreshold(500);
176+
await expect(server).toReceiveMessage(
177+
`{"end_utterance_silence_threshold":500}`
178+
);
179+
});
180+
181+
it("can set forceEndUtterance", async () => {
182+
rt.forceEndUtterance();
183+
await expect(server).toReceiveMessage(`{"force_end_utterance":true}`);
184+
});
185+
152186
it("can receive transcript", () => {
153187
const data = {
154188
created: "2023-09-14T03:37:11.516967",

0 commit comments

Comments
 (0)