141 changes: 141 additions & 0 deletions agents/src/tokenize/blingfire/README.md
# BlingFire Tokenizer

This directory contains a TypeScript wrapper for the [BlingFire](https://github.com/microsoft/BlingFire) tokenization library using WebAssembly.

## Overview

BlingFire is a lightning-fast tokenizer developed by Microsoft that provides high-quality sentence and word segmentation. This implementation uses the WebAssembly build of BlingFire to enable fast tokenization in TypeScript/JavaScript environments.

## Files

### Auto-generated Files

The following files are **auto-generated** and should not be manually edited:

- **`blingfire.ts`** - JavaScript code generated by Emscripten, with the following header added:
```typescript
// auto generated file
/* eslint-disable */
// @ts-ignore
```
This file is copied directly from the BlingFire build output.

- **`blingfire.wasm`** - The compiled WebAssembly binary, copied 1:1 from the BlingFire build output to the resources folder.

### Source Files

- **`blingfire_wrapper.js`** - Wrapper functions for the BlingFire WASM module. Based on the wrapper provided by BlingFire's WASM implementation, with slight adaptations for this project.

- **`index.ts`** - Main entry point that implements the `SentenceTokenizer` interface following the LiveKit agents pattern.

## Building BlingFire WASM

To regenerate the `blingfire.ts` and `blingfire.wasm` files:

### 1. Clone the BlingFire Repository

```bash
git clone https://github.com/microsoft/BlingFire.git
cd BlingFire
```

### 2. Follow Initial Setup

Follow the instructions at: https://github.com/microsoft/BlingFire/blob/master/wasm/readme.md

### 3. Modify the Makefile

Change the Makefile to run the following `em++` command:

```bash
em++ ../blingfiretools/blingfiretokdll/blingfiretokdll.cpp \
../blingfiretools/blingfiretokdll/*.cxx \
../blingfireclient.library/src/*.cpp \
-s WASM=1 \
-s EXPORTED_FUNCTIONS="[_GetBlingFireTokVersion, _TextToSentences, _TextToWords, _TextToIds, _SetModel, _FreeModel, _WordHyphenationWithModel, _malloc, _free]" \
-s "EXPORTED_RUNTIME_METHODS=['lengthBytesUTF8', 'stackAlloc', 'stringToUTF8', 'UTF8ToString', 'cwrap']" \
-s ALLOW_MEMORY_GROWTH=1 \
-s DISABLE_EXCEPTION_CATCHING=0 \
-s MODULARIZE=1 \
-s EXPORT_ES6 \
-I ../blingfireclient.library/inc/ \
-I ../blingfirecompile.library/inc/ \
-DHAVE_ICONV_LIB \
-DHAVE_NO_SPECSTRINGS \
-D_VERBOSE \
-DBLING_FIRE_NOAP \
-DBLING_FIRE_NOWINDOWS \
-DNDEBUG \
-O3 \
--std=c++11 \
-o blingfire.js
```

**Key Changes:**
- Added `-s MODULARIZE=1` - Makes the output a module that can be imported
- Added `-s EXPORT_ES6` - Exports as ES6 module
- Added `_malloc` and `_free` to `EXPORTED_FUNCTIONS` so the wrapper can allocate and release WASM memory

### 4. Copy Files to LiveKit

After building, copy the generated files:

```bash
# From the BlingFire wasm build directory
cp blingfire.js /path/to/livekit/agents-js/agents/src/tokenize/blingfire/blingfire.ts
cp blingfire.wasm /path/to/livekit/agents-js/agents/src/tokenize/blingfire/blingfire.wasm
```

Then add the header comments to `blingfire.ts`:

```typescript
// auto generated file
/* eslint-disable */
// @ts-ignore
```
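If you prefer to script this step, the header can be prepended with a small shell snippet (a sketch; it assumes you are in the target directory, and the stand-in file creation is only there so the snippet runs standalone):

```shell
f=blingfire.ts

# Stand-in for the copied Emscripten output, only created when running this sketch on its own
[ -f "$f" ] || printf 'export default function createModule() {}\n' > "$f"

# Prepend the required header lines
printf '// auto generated file\n/* eslint-disable */\n// @ts-ignore\n' | cat - "$f" > "$f.tmp" && mv "$f.tmp" "$f"
```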

## Usage

```typescript
import { tokenizer } from '@livekit/agents';

// Create a tokenizer instance (renamed to avoid shadowing the imported `tokenizer` namespace)
const sentenceTokenizer = new tokenizer.blingfire.SentenceTokenizer({
  minSentenceLength: 20,
  streamContextLength: 10,
});

// Batch tokenization
const sentences = sentenceTokenizer.tokenize('This is a sentence. And another one.');
console.log(sentences);
// Output: ['This is a sentence. And another one.']

// Stream tokenization
const stream = sentenceTokenizer.stream();
stream.pushText('This is the first sentence. ');
stream.pushText('This is the second sentence.');
stream.endInput();

for await (const { token, segmentId } of stream) {
  console.log(token);
}
```

## Configuration Options

- **`minSentenceLength`** (default: 20) - Sentences shorter than this are buffered and merged with the following sentence before being emitted
- **`streamContextLength`** (default: 10) - Minimum number of characters the stream buffers before attempting sentence segmentation
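To make `minSentenceLength` concrete, here is a minimal sketch of the buffering semantics described above (an illustration only, not the library's actual implementation):

```typescript
// Hypothetical sketch: merge raw sentence splits until each emitted chunk
// reaches minSentenceLength; any remainder is emitted as-is at the end.
function mergeSentences(sentences: string[], minSentenceLength = 20): string[] {
  const out: string[] = [];
  let buf = '';
  for (const s of sentences) {
    buf = buf ? `${buf} ${s}` : s;
    if (buf.length >= minSentenceLength) {
      out.push(buf);
      buf = '';
    }
  }
  if (buf) out.push(buf);
  return out;
}

// 'Hi!' is under 20 characters, so it is merged with the next sentence
console.log(mergeSentences(['Hi!', 'This is a longer sentence.'], 20));
```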

## Features

- Lightning-fast sentence tokenization using BlingFire
- Support for batch and streaming tokenization
- Handles abbreviations (Dr., Mr., etc.) correctly
- Supports numbers with decimals
- Multi-language support (Latin, CJK characters, etc.)
- Compatible with the LiveKit agents tokenizer interface

## License

BlingFire is licensed under the MIT License by Microsoft Corporation.
See: https://github.com/microsoft/BlingFire/blob/master/LICENSE
177 changes: 177 additions & 0 deletions agents/src/tokenize/blingfire/blingfire.test.ts
// SPDX-FileCopyrightText: 2024 LiveKit, Inc.
//
// SPDX-License-Identifier: Apache-2.0
import { describe, expect, it } from 'vitest';
import { SentenceTokenizer } from './index.js';

const TEXT =
'Hi! ' +
'LiveKit is a platform for live audio and video applications and services. \n\n' +
'R.T.C stands for Real-Time Communication... again R.T.C. ' +
'Mr. Theo is testing the sentence tokenizer. ' +
'\nThis is a test. Another test. ' +
'A short sentence.\n' +
'A longer sentence that is longer than the previous sentence. ' +
'f(x) = x * 2.54 + 42. ' +
'Hey!\n Hi! Hello! ' +
'\n\n' +
'This is a sentence. 这是一个中文句子。これは日本語の文章です。' +
'你好!LiveKit是一个直播音频和视频应用程序和服务的平台。' +
'\nThis is a sentence contains consecutive spaces.';

// BlingFire may split sentences differently than the basic tokenizer
// These are the expected results when using BlingFire with minSentenceLength=20
const EXPECTED_MIN_20 = [
'Hi! LiveKit is a platform for live audio and video applications and services.',
'R.T.C stands for Real-Time Communication... again R.T.C. Mr. Theo is testing the sentence tokenizer.',
'This is a test. Another test.',
'A short sentence. A longer sentence that is longer than the previous sentence. f(x) = x * 2.54 + 42.',
'Hey! Hi! Hello! This is a sentence.',
'这是一个中文句子。これは日本語の文章です。',
'你好!LiveKit是一个直播音频和视频应用程序和服务的平台。',
'This is a sentence contains consecutive spaces.',
];

const SIMPLE_TEXT = 'This is a sentence. This is another sentence. And a third one.';

describe('blingfire tokenizer', () => {
describe('SentenceTokenizer', () => {
const tokenizer = new SentenceTokenizer();

it('should tokenize simple sentences correctly', () => {
const result = tokenizer.tokenize(SIMPLE_TEXT);
expect(result).toBeDefined();
expect(result.length).toBeGreaterThan(0);
// BlingFire should split the text into sentences
expect(result.some((s) => s.includes('This is a sentence'))).toBeTruthy();
});

it('should tokenize complex text correctly', () => {
const result = tokenizer.tokenize(TEXT);
expect(result).toBeDefined();
expect(result.length).toBeGreaterThan(0);
// Verify we get similar structure to expected
expect(result.length).toBe(EXPECTED_MIN_20.length);
});

it('should handle empty string', () => {
const result = tokenizer.tokenize('');
expect(result).toEqual([]);
});

it('should handle single sentence', () => {
const result = tokenizer.tokenize('This is a single sentence.');
expect(result).toBeDefined();
expect(result.length).toBeGreaterThan(0);
});

it('should respect minSentenceLength option', () => {
const tokenizerMin50 = new SentenceTokenizer({ minSentenceLength: 50 });
const result = tokenizerMin50.tokenize(TEXT);
expect(result).toBeDefined();
// All tokens except possibly the last should be >= 50 chars
result.slice(0, -1).forEach((token) => {
expect(token.length).toBeGreaterThanOrEqual(50);
});
});

it('should stream tokenize sentences correctly', async () => {
const pattern = [1, 2, 4];
let text = TEXT;
const chunks: string[] = [];

// Split TEXT into chunks of sizes 1, 2, 4, 1, 2, 4, ... to simulate streaming input
let i = 0;
while (text) {
const size = pattern[i++ % pattern.length]!;
chunks.push(text.slice(0, size));
text = text.slice(size);
}

const stream = tokenizer.stream();
for (const chunk of chunks) {
stream.pushText(chunk);
}
stream.endInput();

const tokens = [];
for await (const value of stream) {
tokens.push(value.token);
}

expect(tokens).toBeDefined();
expect(tokens.length).toBeGreaterThan(0);
// Should produce the same number of tokens as batch mode
expect(tokens.length).toBe(EXPECTED_MIN_20.length);
});

it('should handle flush correctly', async () => {
const stream = tokenizer.stream();
stream.pushText('This is the first part. ');
stream.flush();
stream.pushText('This is the second part.');
stream.endInput();

const tokens = [];
for await (const value of stream) {
tokens.push(value.token);
}

expect(tokens.length).toBeGreaterThan(0);
});

it('should handle multiple pushText calls', async () => {
const stream = tokenizer.stream();
stream.pushText('First sentence. ');
stream.pushText('Second sentence. ');
stream.pushText('Third sentence.');
stream.endInput();

const tokens = [];
for await (const value of stream) {
tokens.push(value.token);
}

expect(tokens.length).toBeGreaterThan(0);
});

it('should handle abbreviations correctly', () => {
const text = 'Dr. Smith went to Washington D.C. yesterday. It was nice.';
const result = tokenizer.tokenize(text);
expect(result).toBeDefined();
expect(result.length).toBeGreaterThan(0);
});

it('should handle numbers with decimals', () => {
const text = 'The value is 3.14159. Another value is 2.71828.';
const result = tokenizer.tokenize(text);
expect(result).toBeDefined();
expect(result.some((s) => s.includes('3.14159'))).toBeTruthy();
});

it('should provide segment IDs in stream mode', async () => {
const stream = tokenizer.stream();
stream.pushText('First sentence. ');
stream.flush();
stream.pushText('Second sentence after flush.');
stream.endInput();

const tokens = [];
for await (const value of stream) {
tokens.push(value);
expect(value.segmentId).toBeDefined();
expect(typeof value.segmentId).toBe('string');
}

// Tokens from different segments should have different segment IDs
if (tokens.length > 1) {
const segmentIds = new Set(tokens.map((t) => t.segmentId));
// flush() starts a new segment, so ideally this is 2; we only require at least 1 here
expect(segmentIds.size).toBeGreaterThanOrEqual(1);
}
});
});
});
5 changes: 5 additions & 0 deletions agents/src/tokenize/blingfire/blingfire.ts

Large diffs are not rendered by default.

Binary file added agents/src/tokenize/blingfire/blingfire.wasm
Binary file not shown.
35 changes: 35 additions & 0 deletions agents/src/tokenize/blingfire/blingfire_wrapper.ts
import createModule from './blingfire.js';

const Module = (await createModule()) as any;

// Breaks text into sentences; takes a JS string and returns a JS string, or null on failure
export function TextToSentences(s: string): string | null {
  const len = Module['lengthBytesUTF8'](s);

  if (!len) {
    return null;
  }

  const inUtf8 = Module['_malloc'](len + 1); // +1 or the library won't copy the last character:
  Module['stringToUTF8'](s, inUtf8, len + 1); // it always needs room for the terminating 0-char

  const maxOutLength = (len << 1) + 1; // worst case: every character is a token
  const outUtf8 = Module['_malloc'](maxOutLength);

  try {
    const actualLen = Module['_TextToSentences'](inUtf8, len, outUtf8, maxOutLength);
    if (actualLen < 0 || actualLen > maxOutLength) {
      return null;
    }
    // Read the result before the buffers are freed below; returning after the
    // `finally` block would read from already-freed WASM memory
    return Module['UTF8ToString'](outUtf8);
  } finally {
    if (inUtf8 != 0) {
      Module['_free'](inUtf8);
    }

    if (outUtf8 != 0) {
      Module['_free'](outUtf8);
    }
  }
}
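BlingFire returns the segmented text with one sentence per line. A caller can normalize that raw output with a small pure helper like the following (a sketch assuming the newline-separated convention; it is a plain function, so it can be tested without loading the WASM module):

```typescript
// Normalizes raw TextToSentences output (one sentence per line) into an array,
// trimming whitespace and dropping empty lines.
function splitSentenceOutput(raw: string | null): string[] {
  if (raw === null) return [];
  return raw
    .split('\n')
    .map((s) => s.trim())
    .filter((s) => s.length > 0);
}
```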