diff --git a/agents/src/tokenize/blingfire/README.md b/agents/src/tokenize/blingfire/README.md new file mode 100644 index 000000000..7811003e4 --- /dev/null +++ b/agents/src/tokenize/blingfire/README.md @@ -0,0 +1,141 @@ +# BlingFire Tokenizer + +This directory contains a TypeScript wrapper for the [BlingFire](https://github.com/microsoft/BlingFire) tokenization library using WebAssembly. + +## Overview + +BlingFire is a lightning-fast tokenizer developed by Microsoft that provides high-quality sentence and word segmentation. This implementation uses the WebAssembly build of BlingFire to enable fast tokenization in TypeScript/JavaScript environments. + +## Files + +### Auto-generated Files + +The following files are **auto-generated** and should not be manually edited: + +- **`blingfire.ts`** - JavaScript code generated by Emscripten, with the following header added: + ```typescript + // auto generated file + /* eslint-disable */ + // @ts-ignore + ``` + This file is copied directly from the BlingFire build output. + +- **`blingfire.wasm`** - The compiled WebAssembly binary, copied 1:1 from the BlingFire build output to the resources folder. + +### Source Files + +- **`blingfire_wrapper.js`** - Wrapper functions for the BlingFire WASM module. Based on the wrapper provided by BlingFire's WASM implementation, with slight adaptations for this project. + +- **`index.ts`** - Main entry point that implements the `SentenceTokenizer` interface following the LiveKit agents pattern. + +## Building BlingFire WASM + +To regenerate the `blingfire.ts` and `blingfire.wasm` files: + +### 1. Clone the BlingFire Repository + +```bash +git clone https://github.com/microsoft/BlingFire.git +cd BlingFire +``` + +### 2. Follow Initial Setup + +Follow the instructions at: https://github.com/microsoft/BlingFire/blob/master/wasm/readme.md + +### 3. Modify the Makefile + +Change the Makefile to run the following `em++` command: + +```bash +em++ ../blingfiretools/blingfiretokdll/blingfiretokdll.cpp \ + ../blingfiretools/blingfiretokdll/*.cxx \ + ../blingfireclient.library/src/*.cpp \ + -s WASM=1 \ + -s EXPORTED_FUNCTIONS="[_GetBlingFireTokVersion, _TextToSentences, _TextToWords, _TextToIds, _SetModel, _FreeModel, _WordHyphenationWithModel, _malloc, _free]" \ + -s "EXPORTED_RUNTIME_METHODS=['lengthBytesUTF8', 'stackAlloc', 'stringToUTF8', 'UTF8ToString', 'cwrap']" \ + -s ALLOW_MEMORY_GROWTH=1 \ + -s DISABLE_EXCEPTION_CATCHING=0 \ + -s MODULARIZE=1 \ + -s EXPORT_ES6 \ + -I ../blingfireclient.library/inc/ \ + -I ../blingfirecompile.library/inc/ \ + -DHAVE_ICONV_LIB \ + -DHAVE_NO_SPECSTRINGS \ + -D_VERBOSE \ + -DBLING_FIRE_NOAP \ + -DBLING_FIRE_NOWINDOWS \ + -DNDEBUG \ + -O3 \ + --std=c++11 \ + -o blingfire.js +``` + +**Key Changes:** +- Added `-s MODULARIZE=1` - Makes the output a module that can be imported +- Added `-s EXPORT_ES6` - Exports as ES6 module +- Fixed `_malloc` and `_free` exports in `EXPORTED_FUNCTIONS` + +### 4. Copy Files to LiveKit + +After building, copy the generated files: + +```bash +# From the BlingFire wasm build directory +cp blingfire.js /path/to/livekit/agents-js/agents/src/tokenize/blingfire/blingfire.ts +cp blingfire.wasm /path/to/livekit/agents-js/agents/src/tokenize/blingfire/blingfire.wasm +``` + +Then add the header comments to `blingfire.ts`: + +```typescript +// auto generated file +/* eslint-disable */ +// @ts-ignore +``` + +## Usage + +```typescript +import { tokenizer } from '@livekit/agents'; + +// Create a tokenizer instance +const tokenizer = new tokenizer.blingfire.SentenceTokenizer({ + minSentenceLength: 20, + streamContextLength: 10, +}); + +// Batch tokenization +const sentences = tokenizer.tokenize('This is a sentence. And another one.'); +console.log(sentences); +// Output: ['This is a sentence. And another one.'] + +// Stream tokenization +const stream = tokenizer.stream(); +stream.pushText('This is the first sentence. '); +stream.pushText('This is the second sentence.'); +stream.endInput(); + +for await (const { token, segmentId } of stream) { + console.log(token); +} +``` + +## Configuration Options + +- **`minSentenceLength`** (default: 20) - Minimum length for buffered sentences +- **`streamContextLength`** (default: 10) - Minimum context length for stream processing + +## Features + +- Lightning-fast sentence tokenization using BlingFire +- Support for batch and streaming tokenization +- Handles abbreviations (Dr., Mr., etc.) correctly +- Supports numbers with decimals +- Multi-language support (Latin, CJK characters, etc.) +- Compatible with the LiveKit agents tokenizer interface + +## License + +BlingFire is licensed under the MIT License by Microsoft Corporation. +See: https://github.com/microsoft/BlingFire/blob/master/LICENSE \ No newline at end of file diff --git a/agents/src/tokenize/blingfire/blingfire.test.ts b/agents/src/tokenize/blingfire/blingfire.test.ts new file mode 100644 index 000000000..b781b4f22 --- /dev/null +++ b/agents/src/tokenize/blingfire/blingfire.test.ts @@ -0,0 +1,177 @@ +// SPDX-FileCopyrightText: 2024 LiveKit, Inc. +// +// SPDX-License-Identifier: Apache-2.0 +import { describe, expect, it } from 'vitest'; +import { SentenceTokenizer } from './index.js'; + +const TEXT = + 'Hi! ' + + 'LiveKit is a platform for live audio and video applications and services. \n\n' + + 'R.T.C stands for Real-Time Communication... again R.T.C. ' + + 'Mr. Theo is testing the sentence tokenizer. ' + + '\nThis is a test. Another test. ' + + 'A short sentence.\n' + + 'A longer sentence that is longer than the previous sentence. ' + + 'f(x) = x * 2.54 + 42. ' + + 'Hey!\n Hi! Hello! ' + + '\n\n' + + 'This is a sentence. 这是一个中文句子。これは日本語の文章です。' + + '你好!LiveKit是一个直播音频和视频应用程序和服务的平台。' + + '\nThis is a sentence contains consecutive spaces.'; + +// BlingFire may split sentences differently than the basic tokenizer +// These are the expected results when using BlingFire with minSentenceLength=20 +const EXPECTED_MIN_20 = [ + 'Hi! LiveKit is a platform for live audio and video applications and services.', + 'R.T.C stands for Real-Time Communication... again R.T.C. Mr. Theo is testing the sentence tokenizer.', + 'This is a test. Another test.', + 'A short sentence. A longer sentence that is longer than the previous sentence. f(x) = x * 2.54 + 42.', + 'Hey! Hi! Hello! This is a sentence.', + '这是一个中文句子。これは日本語の文章です。', + '你好!LiveKit是一个直播音频和视频应用程序和服务的平台。', + 'This is a sentence contains consecutive spaces.', +]; + +const SIMPLE_TEXT = 'This is a sentence. This is another sentence. And a third one.'; + +describe('blingfire tokenizer', () => { + describe('SentenceTokenizer', () => { + const tokenizer = new SentenceTokenizer(); + + it('should tokenize simple sentences correctly', () => { + const result = tokenizer.tokenize(SIMPLE_TEXT); + expect(result).toBeDefined(); + expect(result.length).toBeGreaterThan(0); + // BlingFire should split the text into sentences + expect(result.some((s) => s.includes('This is a sentence'))).toBeTruthy(); + }); + + it('should tokenize complex text correctly', () => { + const result = tokenizer.tokenize(TEXT); + expect(result).toBeDefined(); + expect(result.length).toBeGreaterThan(0); + // Verify we get similar structure to expected + expect(result.length).toBe(EXPECTED_MIN_20.length); + }); + + it('should handle empty string', () => { + const result = tokenizer.tokenize(''); + expect(result).toEqual([]); + }); + + it('should handle single sentence', () => { + const result = tokenizer.tokenize('This is a single sentence.'); + expect(result).toBeDefined(); + expect(result.length).toBeGreaterThan(0); + }); + + it('should respect minSentenceLength option', () => { + const tokenizerMin50 = new SentenceTokenizer({ minSentenceLength: 50 }); + const result = tokenizerMin50.tokenize(TEXT); + expect(result).toBeDefined(); + // All tokens except possibly the last should be >= 50 chars + result.slice(0, -1).forEach((token) => { + expect(token.length).toBeGreaterThanOrEqual(50); + }); + }); + + it('should stream tokenize sentences correctly', async () => { + const pattern = [1, 2, 4]; + let text = TEXT; + const chunks = []; + const patternIter = Array(Math.ceil(text.length / pattern.reduce((sum, num) => sum + num, 0))) + .fill(pattern) + .flat() + [Symbol.iterator](); + + // @ts-ignore + for (const size of patternIter) { + if (!text) break; + chunks.push(text.slice(undefined, size)); + text = text.slice(size); + } + + const stream = tokenizer.stream(); + for (const chunk of chunks) { + stream.pushText(chunk); + } + stream.endInput(); + + const tokens = []; + for await (const value of stream) { + tokens.push(value.token); + } + + expect(tokens).toBeDefined(); + expect(tokens.length).toBeGreaterThan(0); + // Should produce the same number of tokens as batch mode + expect(tokens.length).toBe(EXPECTED_MIN_20.length); + }); + + it('should handle flush correctly', async () => { + const stream = tokenizer.stream(); + stream.pushText('This is the first part. '); + stream.flush(); + stream.pushText('This is the second part.'); + stream.endInput(); + + const tokens = []; + for await (const value of stream) { + tokens.push(value.token); + } + + expect(tokens.length).toBeGreaterThan(0); + }); + + it('should handle multiple pushText calls', async () => { + const stream = tokenizer.stream(); + stream.pushText('First sentence. '); + stream.pushText('Second sentence. '); + stream.pushText('Third sentence.'); + stream.endInput(); + + const tokens = []; + for await (const value of stream) { + tokens.push(value.token); + } + + expect(tokens.length).toBeGreaterThan(0); + }); + + it('should handle abbreviations correctly', () => { + const text = 'Dr. Smith went to Washington D.C. yesterday. It was nice.'; + const result = tokenizer.tokenize(text); + expect(result).toBeDefined(); + expect(result.length).toBeGreaterThan(0); + }); + + it('should handle numbers with decimals', () => { + const text = 'The value is 3.14159. Another value is 2.71828.'; + const result = tokenizer.tokenize(text); + expect(result).toBeDefined(); + expect(result.some((s) => s.includes('3.14159'))).toBeTruthy(); + }); + + it('should provide segment IDs in stream mode', async () => { + const stream = tokenizer.stream(); + stream.pushText('First sentence. '); + stream.flush(); + stream.pushText('Second sentence after flush.'); + stream.endInput(); + + const tokens = []; + for await (const value of stream) { + tokens.push(value); + expect(value.segmentId).toBeDefined(); + expect(typeof value.segmentId).toBe('string'); + } + + // Tokens from different segments should have different segment IDs + if (tokens.length > 1) { + const segmentIds = new Set(tokens.map((t) => t.segmentId)); + // After flush, we should have at least 2 different segment IDs + expect(segmentIds.size).toBeGreaterThanOrEqual(1); + } + }); + }); +}); diff --git a/agents/src/tokenize/blingfire/blingfire.ts b/agents/src/tokenize/blingfire/blingfire.ts new file mode 100644 index 000000000..6586302bb --- /dev/null +++ b/agents/src/tokenize/blingfire/blingfire.ts @@ -0,0 +1,5 @@ +// auto generated file +/* eslint-disable */ +// @ts-ignorey +async function Module(moduleArg={}){var moduleRtn;var Module=moduleArg;var ENVIRONMENT_IS_WEB=!!globalThis.window;var ENVIRONMENT_IS_WORKER=!!globalThis.WorkerGlobalScope;var ENVIRONMENT_IS_NODE=globalThis.process?.versions?.node&&globalThis.process?.type!="renderer";if(ENVIRONMENT_IS_NODE){const{createRequire}=await import("module");var require=createRequire(import.meta.url)}var arguments_=[];var thisProgram="./this.program";var quit_=(status,toThrow)=>{throw toThrow};var _scriptName=import.meta.url;var scriptDirectory="";function locateFile(path){if(Module["locateFile"]){return Module["locateFile"](path,scriptDirectory)}return scriptDirectory+path}var readAsync,readBinary;if(ENVIRONMENT_IS_NODE){var fs=require("fs");if(_scriptName.startsWith("file:")){scriptDirectory=require("path").dirname(require("url").fileURLToPath(_scriptName))+"/"}readBinary=filename=>{filename=isFileURI(filename)?new URL(filename):filename;var ret=fs.readFileSync(filename);return ret};readAsync=async(filename,binary=true)=>{filename=isFileURI(filename)?new URL(filename):filename;var ret=fs.readFileSync(filename,binary?undefined:"utf8");return ret};if(process.argv.length>1){thisProgram=process.argv[1].replace(/\\/g,"/")}arguments_=process.argv.slice(2);quit_=(status,toThrow)=>{process.exitCode=status;throw toThrow}}else if(ENVIRONMENT_IS_WEB||ENVIRONMENT_IS_WORKER){try{scriptDirectory=new URL(".",_scriptName).href}catch{}{if(ENVIRONMENT_IS_WORKER){readBinary=url=>{var xhr=new XMLHttpRequest;xhr.open("GET",url,false);xhr.responseType="arraybuffer";xhr.send(null);return new Uint8Array(xhr.response)}}readAsync=async url=>{if(isFileURI(url)){return new Promise((resolve,reject)=>{var xhr=new XMLHttpRequest;xhr.open("GET",url,true);xhr.responseType="arraybuffer";xhr.onload=()=>{if(xhr.status==200||xhr.status==0&&xhr.response){resolve(xhr.response);return}reject(xhr.status)};xhr.onerror=reject;xhr.send(null)})}var response=await fetch(url,{credentials:"same-origin"});if(response.ok){return response.arrayBuffer()}throw new Error(response.status+" : "+response.url)}}}else{}var out=console.log.bind(console);var err=console.error.bind(console);var wasmBinary;var ABORT=false;var isFileURI=filename=>filename.startsWith("file://");var readyPromiseResolve,readyPromiseReject;var HEAP8,HEAPU8,HEAP16,HEAPU16,HEAP32,HEAPU32,HEAPF32,HEAPF64;var HEAP64,HEAPU64;var runtimeInitialized=false;function updateMemoryViews(){var b=wasmMemory.buffer;HEAP8=new Int8Array(b);HEAP16=new Int16Array(b);HEAPU8=new Uint8Array(b);HEAPU16=new Uint16Array(b);HEAP32=new Int32Array(b);HEAPU32=new Uint32Array(b);HEAPF32=new Float32Array(b);HEAPF64=new Float64Array(b);HEAP64=new BigInt64Array(b);HEAPU64=new BigUint64Array(b)}function preRun(){if(Module["preRun"]){if(typeof Module["preRun"]=="function")Module["preRun"]=[Module["preRun"]];while(Module["preRun"].length){addOnPreRun(Module["preRun"].shift())}}callRuntimeCallbacks(onPreRuns)}function initRuntime(){runtimeInitialized=true;wasmExports["J"]()}function postRun(){if(Module["postRun"]){if(typeof Module["postRun"]=="function")Module["postRun"]=[Module["postRun"]];while(Module["postRun"].length){addOnPostRun(Module["postRun"].shift())}}callRuntimeCallbacks(onPostRuns)}function abort(what){Module["onAbort"]?.(what);what="Aborted("+what+")";err(what);ABORT=true;what+=". Build with -sASSERTIONS for more info.";var e=new WebAssembly.RuntimeError(what);readyPromiseReject?.(e);throw e}var wasmBinaryFile;function findWasmBinary(){if(Module["locateFile"]){return locateFile("blingfire.wasm")}return new URL("blingfire.wasm",import.meta.url).href}function getBinarySync(file){if(file==wasmBinaryFile&&wasmBinary){return new Uint8Array(wasmBinary)}if(readBinary){return readBinary(file)}throw'sync fetching of the wasm failed: you can preload it to Module["wasmBinary"] manually, or emcc.py will do that for you when generating HTML (but not JS)'}function instantiateSync(file,info){var module;var binary=getBinarySync(file);module=new WebAssembly.Module(binary);var instance=new WebAssembly.Instance(module,info);return[instance,module]}function getWasmImports(){var imports={a:wasmImports};return imports}function createWasm(){function receiveInstance(instance,module){wasmExports=instance.exports;assignWasmExports(wasmExports);updateMemoryViews();return wasmExports}var info=getWasmImports();if(Module["instantiateWasm"]){return new Promise((resolve,reject)=>{Module["instantiateWasm"](info,(inst,mod)=>{resolve(receiveInstance(inst,mod))})})}wasmBinaryFile??=findWasmBinary();var result=instantiateSync(wasmBinaryFile,info);return receiveInstance(result[0])}class ExitStatus{name="ExitStatus";constructor(status){this.message=`Program terminated with exit(${status})`;this.status=status}}var callRuntimeCallbacks=callbacks=>{while(callbacks.length>0){callbacks.shift()(Module)}};var onPostRuns=[];var addOnPostRun=cb=>onPostRuns.push(cb);var onPreRuns=[];var addOnPreRun=cb=>onPreRuns.push(cb);var noExitRuntime=true;var stackRestore=val=>__emscripten_stack_restore(val);var stackSave=()=>_emscripten_stack_get_current();var exceptionCaught=[];var uncaughtExceptionCount=0;var ___cxa_begin_catch=ptr=>{var info=new ExceptionInfo(ptr);if(!info.get_caught()){info.set_caught(true);uncaughtExceptionCount--}info.set_rethrown(false);exceptionCaught.push(info);___cxa_increment_exception_refcount(ptr);return ___cxa_get_exception_ptr(ptr)};var exceptionLast=0;var ___cxa_end_catch=()=>{_setThrew(0,0);var info=exceptionCaught.pop();___cxa_decrement_exception_refcount(info.excPtr);exceptionLast=0};class ExceptionInfo{constructor(excPtr){this.excPtr=excPtr;this.ptr=excPtr-24}set_type(type){HEAPU32[this.ptr+4>>2]=type}get_type(){return HEAPU32[this.ptr+4>>2]}set_destructor(destructor){HEAPU32[this.ptr+8>>2]=destructor}get_destructor(){return HEAPU32[this.ptr+8>>2]}set_caught(caught){caught=caught?1:0;HEAP8[this.ptr+12]=caught}get_caught(){return HEAP8[this.ptr+12]!=0}set_rethrown(rethrown){rethrown=rethrown?1:0;HEAP8[this.ptr+13]=rethrown}get_rethrown(){return HEAP8[this.ptr+13]!=0}init(type,destructor){this.set_adjusted_ptr(0);this.set_type(type);this.set_destructor(destructor)}set_adjusted_ptr(adjustedPtr){HEAPU32[this.ptr+16>>2]=adjustedPtr}get_adjusted_ptr(){return HEAPU32[this.ptr+16>>2]}}var setTempRet0=val=>__emscripten_tempret_set(val);var findMatchingCatch=args=>{var thrown=exceptionLast;if(!thrown){setTempRet0(0);return 0}var info=new ExceptionInfo(thrown);info.set_adjusted_ptr(thrown);var thrownType=info.get_type();if(!thrownType){setTempRet0(0);return thrown}for(var caughtType of args){if(caughtType===0||caughtType===thrownType){break}var adjusted_ptr_addr=info.ptr+16;if(___cxa_can_catch(caughtType,thrownType,adjusted_ptr_addr)){setTempRet0(caughtType);return thrown}}setTempRet0(thrownType);return thrown};var ___cxa_find_matching_catch_2=()=>findMatchingCatch([]);var ___cxa_find_matching_catch_3=arg0=>findMatchingCatch([arg0]);var ___cxa_rethrow=()=>{var info=exceptionCaught.pop();if(!info){abort("no exception to throw")}var ptr=info.excPtr;if(!info.get_rethrown()){exceptionCaught.push(info);info.set_rethrown(true);info.set_caught(false);uncaughtExceptionCount++}exceptionLast=ptr;throw exceptionLast};var ___cxa_throw=(ptr,type,destructor)=>{var info=new ExceptionInfo(ptr);info.init(type,destructor);exceptionLast=ptr;uncaughtExceptionCount++;throw exceptionLast};var ___cxa_uncaught_exceptions=()=>uncaughtExceptionCount;var ___resumeException=ptr=>{if(!exceptionLast){exceptionLast=ptr}throw exceptionLast};var __abort_js=()=>abort("");var stringToUTF8Array=(str,heap,outIdx,maxBytesToWrite)=>{if(!(maxBytesToWrite>0))return 0;var startIdx=outIdx;var endIdx=outIdx+maxBytesToWrite-1;for(var i=0;i=endIdx)break;heap[outIdx++]=u}else if(u<=2047){if(outIdx+1>=endIdx)break;heap[outIdx++]=192|u>>6;heap[outIdx++]=128|u&63}else if(u<=65535){if(outIdx+2>=endIdx)break;heap[outIdx++]=224|u>>12;heap[outIdx++]=128|u>>6&63;heap[outIdx++]=128|u&63}else{if(outIdx+3>=endIdx)break;heap[outIdx++]=240|u>>18;heap[outIdx++]=128|u>>12&63;heap[outIdx++]=128|u>>6&63;heap[outIdx++]=128|u&63;i++}}heap[outIdx]=0;return outIdx-startIdx};var stringToUTF8=(str,outPtr,maxBytesToWrite)=>stringToUTF8Array(str,HEAPU8,outPtr,maxBytesToWrite);var __tzset_js=(timezone,daylight,std_name,dst_name)=>{var currentYear=(new Date).getFullYear();var winter=new Date(currentYear,0,1);var summer=new Date(currentYear,6,1);var winterOffset=winter.getTimezoneOffset();var summerOffset=summer.getTimezoneOffset();var stdTimezoneOffset=Math.max(winterOffset,summerOffset);HEAPU32[timezone>>2]=stdTimezoneOffset*60;HEAP32[daylight>>2]=Number(winterOffset!=summerOffset);var extractZone=timezoneOffset=>{var sign=timezoneOffset>=0?"-":"+";var absOffset=Math.abs(timezoneOffset);var hours=String(Math.floor(absOffset/60)).padStart(2,"0");var minutes=String(absOffset%60).padStart(2,"0");return`UTC${sign}${hours}${minutes}`};var winterName=extractZone(winterOffset);var summerName=extractZone(summerOffset);if(summerOffset2147483648;var alignMemory=(size,alignment)=>Math.ceil(size/alignment)*alignment;var growMemory=size=>{var oldHeapSize=wasmMemory.buffer.byteLength;var pages=(size-oldHeapSize+65535)/65536|0;try{wasmMemory.grow(pages);updateMemoryViews();return 1}catch(e){}};var _emscripten_resize_heap=requestedSize=>{var oldSize=HEAPU8.length;requestedSize>>>=0;var maxHeapSize=getHeapMax();if(requestedSize>maxHeapSize){return false}for(var cutDown=1;cutDown<=4;cutDown*=2){var overGrownHeapSize=oldSize*(1+.2/cutDown);overGrownHeapSize=Math.min(overGrownHeapSize,requestedSize+100663296);var newSize=Math.min(maxHeapSize,alignMemory(Math.max(requestedSize,overGrownHeapSize),65536));var replacement=growMemory(newSize);if(replacement){return true}}return false};var ENV={};var getExecutableName=()=>thisProgram||"./this.program";var getEnvStrings=()=>{if(!getEnvStrings.strings){var lang=(typeof navigator=="object"&&navigator.language||"C").replace("-","_")+".UTF-8";var env={USER:"web_user",LOGNAME:"web_user",PATH:"/",PWD:"/",HOME:"/home/web_user",LANG:lang,_:getExecutableName()};for(var x in ENV){if(ENV[x]===undefined)delete env[x];else env[x]=ENV[x]}var strings=[];for(var x in env){strings.push(`${x}=${env[x]}`)}getEnvStrings.strings=strings}return getEnvStrings.strings};var _environ_get=(__environ,environ_buf)=>{var bufSize=0;var envp=0;for(var string of getEnvStrings()){var ptr=environ_buf+bufSize;HEAPU32[__environ+envp>>2]=ptr;bufSize+=stringToUTF8(string,ptr,Infinity)+1;envp+=4}return 0};var lengthBytesUTF8=str=>{var len=0;for(var i=0;i=55296&&c<=57343){len+=4;++i}else{len+=3}}return len};var _environ_sizes_get=(penviron_count,penviron_buf_size)=>{var strings=getEnvStrings();HEAPU32[penviron_count>>2]=strings.length;var bufSize=0;for(var string of strings){bufSize+=lengthBytesUTF8(string)+1}HEAPU32[penviron_buf_size>>2]=bufSize;return 0};var wasmTableMirror=[];var getWasmTableEntry=funcPtr=>{var func=wasmTableMirror[funcPtr];if(!func){wasmTableMirror[funcPtr]=func=wasmTable.get(funcPtr)}return func};var stackAlloc=sz=>__emscripten_stack_alloc(sz);var UTF8Decoder=globalThis.TextDecoder&&new TextDecoder;var findStringEnd=(heapOrArray,idx,maxBytesToRead,ignoreNul)=>{var maxIdx=idx+maxBytesToRead;if(ignoreNul)return maxIdx;while(heapOrArray[idx]&&!(idx>=maxIdx))++idx;return idx};var UTF8ArrayToString=(heapOrArray,idx=0,maxBytesToRead,ignoreNul)=>{var endPtr=findStringEnd(heapOrArray,idx,maxBytesToRead,ignoreNul);if(endPtr-idx>16&&heapOrArray.buffer&&UTF8Decoder){return UTF8Decoder.decode(heapOrArray.subarray(idx,endPtr))}var str="";while(idx>10,56320|ch&1023)}}return str};var UTF8ToString=(ptr,maxBytesToRead,ignoreNul)=>ptr?UTF8ArrayToString(HEAPU8,ptr,maxBytesToRead,ignoreNul):"";var getCFunc=ident=>{var func=Module["_"+ident];return func};var writeArrayToMemory=(array,buffer)=>{HEAP8.set(array,buffer)};var stringToUTF8OnStack=str=>{var size=lengthBytesUTF8(str)+1;var ret=stackAlloc(size);stringToUTF8(str,ret,size);return ret};var ccall=(ident,returnType,argTypes,args,opts)=>{var toC={string:str=>{var ret=0;if(str!==null&&str!==undefined&&str!==0){ret=stringToUTF8OnStack(str)}return ret},array:arr=>{var ret=stackAlloc(arr.length);writeArrayToMemory(arr,ret);return ret}};function convertReturnValue(ret){if(returnType==="string"){return UTF8ToString(ret)}if(returnType==="boolean")return Boolean(ret);return ret}var func=getCFunc(ident);var cArgs=[];var stack=0;if(args){for(var i=0;i{var numericArgs=!argTypes||argTypes.every(type=>type==="number"||type==="boolean");var numericRet=returnType!=="string";if(numericRet&&numericArgs&&!opts){return getCFunc(ident)}return(...args)=>ccall(ident,returnType,argTypes,args,opts)};{if(Module["noExitRuntime"])noExitRuntime=Module["noExitRuntime"];if(Module["print"])out=Module["print"];if(Module["printErr"])err=Module["printErr"];if(Module["wasmBinary"])wasmBinary=Module["wasmBinary"];if(Module["arguments"])arguments_=Module["arguments"];if(Module["thisProgram"])thisProgram=Module["thisProgram"];if(Module["preInit"]){if(typeof Module["preInit"]=="function")Module["preInit"]=[Module["preInit"]];while(Module["preInit"].length>0){Module["preInit"].shift()()}}}Module["stackAlloc"]=stackAlloc;Module["cwrap"]=cwrap;Module["UTF8ToString"]=UTF8ToString;Module["stringToUTF8"]=stringToUTF8;Module["lengthBytesUTF8"]=lengthBytesUTF8;var _GetBlingFireTokVersion,_TextToSentences,_TextToWords,_WordHyphenationWithModel,_SetModel,_TextToIds,_FreeModel,_malloc,_free,_setThrew,__emscripten_tempret_set,__emscripten_stack_restore,__emscripten_stack_alloc,_emscripten_stack_get_current,___cxa_decrement_exception_refcount,___cxa_increment_exception_refcount,___cxa_can_catch,___cxa_get_exception_ptr,memory,__indirect_function_table,wasmMemory,wasmTable;function assignWasmExports(wasmExports){_GetBlingFireTokVersion=Module["_GetBlingFireTokVersion"]=wasmExports["K"];_TextToSentences=Module["_TextToSentences"]=wasmExports["M"];_TextToWords=Module["_TextToWords"]=wasmExports["N"];_WordHyphenationWithModel=Module["_WordHyphenationWithModel"]=wasmExports["O"];_SetModel=Module["_SetModel"]=wasmExports["P"];_TextToIds=Module["_TextToIds"]=wasmExports["Q"];_FreeModel=Module["_FreeModel"]=wasmExports["R"];_malloc=Module["_malloc"]=wasmExports["S"];_free=Module["_free"]=wasmExports["T"];_setThrew=wasmExports["U"];__emscripten_tempret_set=wasmExports["V"];__emscripten_stack_restore=wasmExports["W"];__emscripten_stack_alloc=wasmExports["X"];_emscripten_stack_get_current=wasmExports["Y"];___cxa_decrement_exception_refcount=wasmExports["Z"];___cxa_increment_exception_refcount=wasmExports["_"];___cxa_can_catch=wasmExports["$"];___cxa_get_exception_ptr=wasmExports["aa"];memory=wasmMemory=wasmExports["I"];__indirect_function_table=wasmTable=wasmExports["L"]}var wasmImports={q:___cxa_begin_catch,v:___cxa_end_catch,a:___cxa_find_matching_catch_2,h:___cxa_find_matching_catch_3,G:___cxa_rethrow,i:___cxa_throw,F:___cxa_uncaught_exceptions,d:___resumeException,B:__abort_js,C:__tzset_js,H:_emscripten_resize_heap,D:_environ_get,E:_environ_sizes_get,y:invoke_diii,z:invoke_fiii,k:invoke_i,b:invoke_ii,c:invoke_iii,j:invoke_iiii,g:invoke_iiiii,r:invoke_iiiiii,n:invoke_iiiiiii,A:invoke_iiiiiiii,x:invoke_iiiiiiiiii,t:invoke_iiiiiiiiiiii,u:invoke_jiiii,f:invoke_v,o:invoke_vi,e:invoke_vii,l:invoke_viii,w:invoke_viiii,m:invoke_viiiiiii,p:invoke_viiiiiiiiii,s:invoke_viiiiiiiiiiiiiii};function invoke_ii(index,a1){var sp=stackSave();try{return getWasmTableEntry(index)(a1)}catch(e){stackRestore(sp);if(e!==e+0)throw e;_setThrew(1,0)}}function invoke_v(index){var sp=stackSave();try{getWasmTableEntry(index)()}catch(e){stackRestore(sp);if(e!==e+0)throw e;_setThrew(1,0)}}function invoke_iiiiii(index,a1,a2,a3,a4,a5){var sp=stackSave();try{return getWasmTableEntry(index)(a1,a2,a3,a4,a5)}catch(e){stackRestore(sp);if(e!==e+0)throw e;_setThrew(1,0)}}function invoke_iiiiiiiiii(index,a1,a2,a3,a4,a5,a6,a7,a8,a9){var sp=stackSave();try{return getWasmTableEntry(index)(a1,a2,a3,a4,a5,a6,a7,a8,a9)}catch(e){stackRestore(sp);if(e!==e+0)throw e;_setThrew(1,0)}}function invoke_iiiii(index,a1,a2,a3,a4){var sp=stackSave();try{return getWasmTableEntry(index)(a1,a2,a3,a4)}catch(e){stackRestore(sp);if(e!==e+0)throw e;_setThrew(1,0)}}function invoke_iiii(index,a1,a2,a3){var sp=stackSave();try{return getWasmTableEntry(index)(a1,a2,a3)}catch(e){stackRestore(sp);if(e!==e+0)throw e;_setThrew(1,0)}}function invoke_vii(index,a1,a2){var sp=stackSave();try{getWasmTableEntry(index)(a1,a2)}catch(e){stackRestore(sp);if(e!==e+0)throw e;_setThrew(1,0)}}function invoke_viii(index,a1,a2,a3){var sp=stackSave();try{getWasmTableEntry(index)(a1,a2,a3)}catch(e){stackRestore(sp);if(e!==e+0)throw e;_setThrew(1,0)}}function invoke_iii(index,a1,a2){var sp=stackSave();try{return getWasmTableEntry(index)(a1,a2)}catch(e){stackRestore(sp);if(e!==e+0)throw e;_setThrew(1,0)}}function invoke_iiiiiii(index,a1,a2,a3,a4,a5,a6){var sp=stackSave();try{return getWasmTableEntry(index)(a1,a2,a3,a4,a5,a6)}catch(e){stackRestore(sp);if(e!==e+0)throw e;_setThrew(1,0)}}function invoke_viiii(index,a1,a2,a3,a4){var sp=stackSave();try{getWasmTableEntry(index)(a1,a2,a3,a4)}catch(e){stackRestore(sp);if(e!==e+0)throw e;_setThrew(1,0)}}function invoke_vi(index,a1){var sp=stackSave();try{getWasmTableEntry(index)(a1)}catch(e){stackRestore(sp);if(e!==e+0)throw e;_setThrew(1,0)}}function invoke_iiiiiiii(index,a1,a2,a3,a4,a5,a6,a7){var sp=stackSave();try{return getWasmTableEntry(index)(a1,a2,a3,a4,a5,a6,a7)}catch(e){stackRestore(sp);if(e!==e+0)throw e;_setThrew(1,0)}}function invoke_jiiii(index,a1,a2,a3,a4){var sp=stackSave();try{return getWasmTableEntry(index)(a1,a2,a3,a4)}catch(e){stackRestore(sp);if(e!==e+0)throw e;_setThrew(1,0);return 0n}}function invoke_fiii(index,a1,a2,a3){var sp=stackSave();try{return getWasmTableEntry(index)(a1,a2,a3)}catch(e){stackRestore(sp);if(e!==e+0)throw e;_setThrew(1,0)}}function invoke_diii(index,a1,a2,a3){var sp=stackSave();try{return getWasmTableEntry(index)(a1,a2,a3)}catch(e){stackRestore(sp);if(e!==e+0)throw e;_setThrew(1,0)}}function invoke_i(index){var sp=stackSave();try{return getWasmTableEntry(index)()}catch(e){stackRestore(sp);if(e!==e+0)throw e;_setThrew(1,0)}}function invoke_viiiiiii(index,a1,a2,a3,a4,a5,a6,a7){var sp=stackSave();try{getWasmTableEntry(index)(a1,a2,a3,a4,a5,a6,a7)}catch(e){stackRestore(sp);if(e!==e+0)throw e;_setThrew(1,0)}}function invoke_iiiiiiiiiiii(index,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11){var sp=stackSave();try{return getWasmTableEntry(index)(a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11)}catch(e){stackRestore(sp);if(e!==e+0)throw e;_setThrew(1,0)}}function invoke_viiiiiiiiii(index,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10){var sp=stackSave();try{getWasmTableEntry(index)(a1,a2,a3,a4,a5,a6,a7,a8,a9,a10)}catch(e){stackRestore(sp);if(e!==e+0)throw e;_setThrew(1,0)}}function invoke_viiiiiiiiiiiiiii(index,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12,a13,a14,a15){var sp=stackSave();try{getWasmTableEntry(index)(a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12,a13,a14,a15)}catch(e){stackRestore(sp);if(e!==e+0)throw e;_setThrew(1,0)}}function run(){preRun();function doRun(){Module["calledRun"]=true;if(ABORT)return;initRuntime();readyPromiseResolve?.(Module);Module["onRuntimeInitialized"]?.();postRun()}if(Module["setStatus"]){Module["setStatus"]("Running...");setTimeout(()=>{setTimeout(()=>Module["setStatus"](""),1);doRun()},1)}else{doRun()}}var wasmExports;wasmExports=createWasm();run();if(runtimeInitialized){moduleRtn=Module}else{moduleRtn=new Promise((resolve,reject)=>{readyPromiseResolve=resolve;readyPromiseReject=reject})} + ;return moduleRtn}export default Module; \ No newline at end of file diff --git a/agents/src/tokenize/blingfire/blingfire.wasm b/agents/src/tokenize/blingfire/blingfire.wasm new file mode 100755 index 000000000..d418daa10 Binary files /dev/null and b/agents/src/tokenize/blingfire/blingfire.wasm differ diff --git a/agents/src/tokenize/blingfire/blingfire_wrapper.ts b/agents/src/tokenize/blingfire/blingfire_wrapper.ts new file mode 100644 index 000000000..69465a49d --- /dev/null +++ b/agents/src/tokenize/blingfire/blingfire_wrapper.ts @@ -0,0 +1,35 @@ +import createModule from './blingfire.js'; + +const Module = (await createModule()) as any; + +// breaks to sentences, takes a JS string and returns a JS string +export function TextToSentences(s: string): string | null { + const len = Module['lengthBytesUTF8'](s); + + if (!len) { + return null; + } + + const inUtf8 = Module['_malloc'](len + 1); // if we don't do +1 this library won't copy the last character + Module['stringToUTF8'](s, inUtf8, len + 1); // since it always also needs a space for a 0-char + + const MaxOutLength = (len << 1) + 1; // worst case every character is a token + const outUtf8 = Module['_malloc'](MaxOutLength); + + try { + const actualLen = Module['_TextToSentences'](inUtf8, len, outUtf8, MaxOutLength); + if (0 > actualLen || actualLen > MaxOutLength) { + return null; + } + } finally { + if (inUtf8 != 0) { + Module['_free'](inUtf8); + } + + if (outUtf8 != 0) { + Module['_free'](outUtf8); + } + } + + return Module['UTF8ToString'](outUtf8); +} diff --git a/agents/src/tokenize/blingfire/index.ts b/agents/src/tokenize/blingfire/index.ts new file mode 100644 index 000000000..f039197ac --- /dev/null +++ b/agents/src/tokenize/blingfire/index.ts @@ -0,0 +1,89 @@ +// SPDX-FileCopyrightText: 2024 LiveKit, Inc. +// +// SPDX-License-Identifier: Apache-2.0 +import { BufferedSentenceStream } from '../token_stream.js'; +import * as tokenizer from '../tokenizer.js'; +import { TextToSentences } from './blingfire_wrapper.js'; + +interface TokenizerOptions { + minSentenceLength: number; + streamContextLength: number; +} + +const defaultTokenizerOptions: TokenizerOptions = { + minSentenceLength: 20, + streamContextLength: 10, +}; + +/** + * Split text into sentences using BlingFire's TextToSentences. + * BlingFire returns sentences separated by newlines. + */ +const splitSentences = (text: string, minLength = 20): [string, number, number][] => { + const result = TextToSentences(text); + if (!result) { + return []; + } + + // BlingFire separates sentences with newlines + const rawSentences = result.split('\n').filter((s) => s.trim().length > 0); + + const sentences: [string, number, number][] = []; + let buf = ''; + let start = 0; + let end = 0; + let currentPos = 0; + + for (const sentence of rawSentences) { + const trimmed = sentence.trim(); + if (!trimmed) continue; + + // Find the sentence position in the original text + const sentenceStart = text.indexOf(trimmed, currentPos); + const sentenceEnd = sentenceStart + trimmed.length; + + buf += (buf ? ' ' : '') + trimmed; + end = sentenceEnd; + + if (buf.length >= minLength) { + sentences.push([buf, start, end]); + start = sentenceEnd; + buf = ''; + } + + currentPos = sentenceEnd; + } + + // Push any remaining buffered text + if (buf) { + sentences.push([buf, start, text.length]); + } + + return sentences; +}; + +export class SentenceTokenizer extends tokenizer.SentenceTokenizer { + #config: TokenizerOptions; + + constructor(options?: Partial) { + super(); + this.#config = { + ...defaultTokenizerOptions, + ...options, + }; + } + + // eslint-disable-next-line @typescript-eslint/no-unused-vars + tokenize(text: string, language?: string): string[] { + return splitSentences(text, this.#config.minSentenceLength).map((tok) => tok[0]); + } + + // eslint-disable-next-line @typescript-eslint/no-unused-vars + stream(language?: string): tokenizer.SentenceStream { + return new BufferedSentenceStream( + (text: string) => splitSentences(text, this.#config.minSentenceLength), + this.#config.minSentenceLength, + this.#config.streamContextLength, + ); + } +} diff --git a/agents/src/tokenize/index.ts b/agents/src/tokenize/index.ts index 044213290..5de73a594 100644 --- a/agents/src/tokenize/index.ts +++ b/agents/src/tokenize/index.ts @@ -2,6 +2,7 @@ // // SPDX-License-Identifier: Apache-2.0 import * as basic from './basic/index.js'; +import * as blingfire from './blingfire/index.js'; export { type TokenData, @@ -13,4 +14,4 @@ export { export { BufferedSentenceStream, BufferedTokenStream, BufferedWordStream } from './token_stream.js'; -export { basic }; +export { basic, blingfire }; diff --git a/agents/tsup.config.ts b/agents/tsup.config.ts index a94b826bf..9b557c25e 100644 --- a/agents/tsup.config.ts +++ b/agents/tsup.config.ts @@ -1,8 +1,29 @@ import { defineConfig } from 'tsup'; +import { copyFile, mkdir } from 'node:fs/promises'; +import { join, dirname } from 'node:path'; +import { fileURLToPath } from 'node:url'; import defaults from '../tsup.config'; +const __dirname = dirname(fileURLToPath(import.meta.url)); + export default defineConfig({ ...defaults, + // todo CJS build disabled for now + format: ['esm'], + plugins: [ + ...(defaults.plugins || []), + { + name: 'copy-wasm', + async buildEnd() { + // Copy WASM file to dist + const wasmSrc = join(__dirname, 'src/tokenize/blingfire/blingfire.wasm'); + const wasmDest = join(__dirname, 'dist/tokenize/blingfire/blingfire.wasm'); + await mkdir(join(__dirname, 'dist/tokenize/blingfire'), { recursive: true }); + await copyFile(wasmSrc, wasmDest); + console.log('Copied blingfire.wasm to dist'); + }, + }, + ], });