I'm building a dictation tool in React based on the WebGPU-whisper demo. Once the models are loaded, they periodically crash Chrome. I've tried serving the models from both my own CDN and the Hugging Face CDN, and both have the issue. I've also tried smaller models (currently whisper-tiny.en, but I also saw the issue with the base model).
My Code:
worker.ts
```ts
/* eslint no-restricted-globals: 0 */
import {
  AutoTokenizer,
  AutoProcessor,
  env,
  full,
  PreTrainedTokenizer,
  Processor,
  TextStreamer,
  WhisperForConditionalGeneration,
} from "@huggingface/transformers";

env.allowLocalModels = false;
env.remoteHost = "https://MYSUPERCOOLCDN.com/shared/dictation/models/";
env.remotePathTemplate = "{model}/";
env.backends.onnx.wasm.wasmPaths = "https://MYSUPERCOOLCDN.com/shared/dictation/";

const MAX_NEW_TOKENS = 64;

class AutomaticSpeechRecognitionPipeline {
  static model_id: string | null = null;
  static tokenizer: Promise<PreTrainedTokenizer> | null = null;
  static processor: Promise<Processor> | null = null;
  static model: Promise<any> | null = null;

  static async getInstance(progress_callback: Function | undefined = undefined) {
    this.model_id = "onnx-community/whisper-tiny.en";

    this.tokenizer ??= AutoTokenizer.from_pretrained(this.model_id, {
      progress_callback: progress_callback || (() => {}),
    });

    this.processor ??= AutoProcessor.from_pretrained(this.model_id, {
      progress_callback: progress_callback || (() => {}),
    });

    this.model ??= WhisperForConditionalGeneration.from_pretrained(this.model_id, {
      dtype: {
        encoder_model: "fp32", // 'fp16' works too
        decoder_model_merged: "q4", // or 'fp32' ('fp16' is broken)
      },
      device: "webgpu",
      progress_callback: progress_callback || (() => {}),
    });

    return Promise.all([this.tokenizer, this.processor, this.model]);
  }
}

let processing = false;

async function generate({ audio }: { audio: Float32Array }) {
  if (processing) return;
  processing = true;

  // Tell the main thread we are starting
  self.postMessage({ status: "start" });

  // Retrieve the text-generation pipeline.
  const [tokenizer, processor, model] =
    await AutomaticSpeechRecognitionPipeline.getInstance();

  let numTokens = 0;
  const callback_function = (output: string[]) => {
    self.postMessage({
      status: "update",
      output,
      numTokens,
    });
  };

  const streamer = new TextStreamer(tokenizer, {
    skip_prompt: true,
    callback_function,
  });

  const inputs = await processor(audio);

  const outputs = await model.generate({
    ...inputs,
    max_new_tokens: MAX_NEW_TOKENS,
    streamer,
  });

  const outputText = tokenizer.batch_decode(outputs, {
    skip_special_tokens: true,
  });

  // Send the output back to the main thread
  self.postMessage({ status: "complete", output: outputText });
  processing = false;
}

async function load() {
  self.postMessage({ status: "loading", data: "Loading model..." });

  // Load the pipeline and save it for future use.
  const [tokenizer, processor, model] =
    await AutomaticSpeechRecognitionPipeline.getInstance((x: any) => {
      self.postMessage(x);
    });

  self.postMessage({
    status: "loading",
    data: "Compiling shaders and warming up model...",
  });

  await model.generate({
    input_features: full([1, 80, 3000], 0.0),
    max_new_tokens: 1,
  });

  self.postMessage({ status: "ready" });
}

self.addEventListener("message", async (e) => {
  const { type, data } = e.data;
  switch (type) {
    case "load":
      load();
      break;
    case "generate":
      generate(data);
      break;
  }
});
```
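For reference, here is the worker/main-thread message protocol written out as TypeScript types. This is only a sketch of the shapes used above (the type names are illustrative, not in my actual code). The warm-up call passes `full([1, 80, 3000], 0.0)` because Whisper's encoder expects an 80-bin mel spectrogram with 3000 frames, i.e. 30 seconds of audio.

```ts
// Illustrative type names summarizing the worker <-> main-thread protocol.
// Inferred from the handlers above; not part of the actual source.
type WorkerRequest =
  | { type: "load" }
  | { type: "generate"; data: { audio: Float32Array; language: string } };

type WorkerResponse =
  | { status: "loading"; data?: string } // model download / shader-compile progress
  | { status: "ready" } // pipeline warmed up; safe to start recording
  | { status: "start" } // generation began for the current audio chunk
  | { status: "update"; output: string[]; numTokens: number } // streamed partial text
  | { status: "complete"; output: string[] }; // final batch_decode result
```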
dictationOverlay.ts
```tsx
const DictationOverlay = () => {
  const dispatch = useDispatch() as ThunkDispatch<any, any, AnyAction>;
  const features: FeatureFlags = useSelector(
    ({ sys }: ApplicationState) => sys.features
  );
  const dictationKey = useSelector(({ sys }) => sys.dictation?.key);
  const prevDictationStatus = useRef<string>("loading");
  const prevText = useRef<string>("");
  const worker = useRef<Worker | null>(null);
  const recorderRef = useRef<MediaRecorder | null>(null);
  const audioContextRef = useRef<AudioContext | null>(null);
  const [dictationStatus, setDictationStatus] = useState<string>("loading");
  const [text, setText] = useState("");
  const [displayText, setDisplayText] = useState<string>("");
  const [isProcessing, setIsProcessing] = useState(false);
  const [chunks, setChunks] = useState<any>([]);
  const [open, setOpen] = useState(false);

  // Set up the worker as soon as the component is mounted.
  useEffect(() => {
    if (browserSupported && features?.dictation) {
      if (!worker.current) {
        // Create the worker if it does not yet exist.
        worker.current = getWorker();
      }

      // Create a callback function for messages from the worker thread.
      const onMessageReceived = (e: any) => {
        switch (e.data.status) {
          case "loading":
            // Model files started loading: add a new progress item to the list.
            setDictationStatus("loading");
            break;
          case "ready":
            // Pipeline ready: the worker is ready to accept messages.
            setDictationStatus("ready");
            break;
          case "start":
            // Start generation
            setIsProcessing(true);
            // Request new data from the recorder if present.
            if (recorderRef.current?.state !== "inactive") {
              recorderRef.current?.requestData();
            }
            break;
          case "complete":
            setIsProcessing(false);
            if (recorderRef.current?.state === "inactive") {
              setText("");
              setDisplayText("");
            } else {
              setText(e.data.output);
            }
            break;
        }
      };

      // Attach the callback function as an event listener.
      worker.current.addEventListener("message", onMessageReceived);

      // Load the data model when the component is mounted.
      setTimeout(() => {
        worker.current?.postMessage({ type: "load" });
      }, 1000);

      // Define a cleanup function for when the component is unmounted.
      return () => {
        worker.current?.removeEventListener("message", onMessageReceived);
        worker.current?.terminate();
        worker.current = null;
        dispatch(setDictationKey(null));
        dispatch(setDictationText(""));
      };
    }
  }, []);

  const sliceAfterString = (stringToSlice: string, searchString: string): string => {
    const index = stringToSlice.toLowerCase().indexOf(searchString);
    // If we can't find the search string, return the original string.
    if (index === -1) {
      return stringToSlice;
    }
    // Slice the string after the search string.
    return stringToSlice
      .slice(index + searchString.length)
      .replace(/^[\s\p{P}]+/u, "");
  };

  // Update the display text when the text changes.
  // This will slice the display text based on the dictation key.
  useEffect(() => {
    if (text.length > 0) {
      const firstText = text[0];
      if (firstText !== prevText.current) {
        prevText.current = firstText;
        if (!isSuppressedString(firstText)) {
          const foundDictationKey = findDictationKey(firstText);
          if (foundDictationKey && !dictationKey) {
            dispatch(setDictationKey(foundDictationKey));
          } else if (dictationKey) {
            const trimmedText = sliceAfterString(firstText, dictationKey);
            dispatch(setDictationText(trimmedText));
            setDisplayText(trimmedText);
          }
        }
      }
    } else {
      setDisplayText("");
    }
  }, [text]);

  useEffect(() => {
    // If the dictation overlay is open and loading isn't complete,
    // once the loading is complete, start recording.
    if (
      open &&
      prevDictationStatus.current === "loading" &&
      dictationStatus === "ready"
    ) {
      record();
    }
    prevDictationStatus.current = dictationStatus;
  }, [dictationStatus]);

  const record = () => {
    setDictationStatus("recording");
    navigator.mediaDevices
      .getUserMedia({ audio: true })
      .then((stream) => {
        recorderRef.current = new MediaRecorder(stream);
        audioContextRef.current = new AudioContext({
          sampleRate: WHISPER_SAMPLING_RATE,
        });
        recorderRef.current.onstart = () => {
          setChunks([]);
        };
        recorderRef.current.ondataavailable = (e) => {
          if (e.data.size > 0) {
            setChunks((prev: any[]) => [...prev, e.data]);
          } else {
            // Empty chunk received, so we request new data after a short timeout
            setTimeout(() => {
              if (recorderRef.current?.state !== "inactive") {
                recorderRef.current?.requestData();
              }
            }, 25);
          }
        };
        recorderRef.current.onstop = () => {
          const tracks = stream.getTracks();
          // When all tracks have been stopped, the stream is no longer
          // active and releases any permissioned input.
          tracks.forEach((track) => track.stop());
        };
        recorderRef.current.start();
      })
      .catch((err) => console.error("The following error occurred: ", err));
  };

  // Clean up the recorder on unmount
  useEffect(() => {
    return () => {
      recorderRef.current?.stop();
      recorderRef.current = null;
    };
  }, []);

  // Generate audio and post to the worker
  useEffect(() => {
    if (!recorderRef.current) return;
    if (recorderRef.current?.state === "inactive") return;
    if (isProcessing) return;
    if (dictationStatus !== "recording") return;

    if (chunks.length > 0) {
      // Generate from data
      const blob = new Blob(chunks, { type: recorderRef.current.mimeType });
      const fileReader = new FileReader();
      fileReader.onloadend = async () => {
        const arrayBuffer = fileReader.result as ArrayBuffer;
        const decoded = await audioContextRef.current?.decodeAudioData(arrayBuffer);
        let audio = decoded && decoded.getChannelData(0);
        if (audio && audio.length > MAX_SAMPLES) {
          // Get last MAX_SAMPLES
          audio = audio.slice(-MAX_SAMPLES);
        }
        worker.current?.postMessage({
          type: "generate",
          data: { audio, language: "en" },
        });
      };
      fileReader.readAsArrayBuffer(blob);
    } else {
      recorderRef.current?.requestData();
    }
  }, [dictationStatus, isProcessing, chunks]);

  return (
    ...JSX
  );
};
```
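A note for anyone trying to reproduce: `getWorker`, `WHISPER_SAMPLING_RATE`, `MAX_SAMPLES`, and the dictation-key helpers are defined elsewhere in my codebase. A minimal sketch of the audio constants and the worker factory, assuming the same 16 kHz sample rate and 30-second window as the WebGPU-whisper demo:

```ts
// Assumed values: Whisper models expect 16 kHz mono audio,
// and the demo keeps at most 30 seconds of samples.
export const WHISPER_SAMPLING_RATE = 16_000;
export const MAX_AUDIO_LENGTH = 30; // seconds
export const MAX_SAMPLES = WHISPER_SAMPLING_RATE * MAX_AUDIO_LENGTH;

// Hypothetical worker factory; with a bundler such as Vite or webpack 5,
// the worker module is resolved relative to this file.
export const getWorker = (): Worker =>
  new Worker(new URL("./worker.ts", import.meta.url), { type: "module" });
```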
Reproduction
This happens immediately once the model files and wasm binary load. The only warning I get in the console is:
System Info
Transformers.js version: "@huggingface/transformers": "3.0.0-alpha.9"
Browser (if applicable): Chrome
Operating system: macOS