Following is my code. I am running it in Colab, and I copied some of the code from the online streaming ASR with microphone tutorial (https://github.com/NVIDIA/NeMo/blob/main/tutorials/asr/Online_ASR_Microphone_Demo_Cache_Aware_Streaming.ipynb). I am getting empty text from the model. I used https://huggingface.co/datasets/speechcolab/gigaspeech/viewer/xl to fake a streaming dataset.

```python
import time
import torch
import copy
import numpy as np
from datetime import datetime
from logging import Logger
from omegaconf import OmegaConf, open_dict

import nemo.collections.asr as nemo_asr
from nemo.collections.asr.models.ctc_bpe_models import EncDecCTCModelBPE
from datasets import load_dataset  # missing from the original snippet


class Listener:
    def __init__(self):
        self.init_model()
        self.init_preprocessor()
        self.run()

    def init_model(self):
        self.asr_model = nemo_asr.models.ASRModel.from_pretrained(
            model_name='stt_en_fastconformer_hybrid_large_streaming_multi'
        )
        self.lookahead_size = 80
        self.encoder_step_length = 80
        self.left_context_size = self.asr_model.encoder.att_context_size[0]
        self.asr_model.encoder.set_default_att_context_size(
            [self.left_context_size, int(self.lookahead_size / self.encoder_step_length)]
        )
        self.asr_model.change_decoding_strategy(decoder_type='rnnt')
        self.decoding_cfg = self.asr_model.cfg.decoding
        self.set_decoding_strategy()
        self.asr_model.eval()

        # initial state for cache-aware streaming
        self.cache_last_channel, self.cache_last_time, self.cache_last_channel_len = \
            self.asr_model.encoder.get_initial_cache_state(batch_size=1)
        self.previous_hypotheses = None
        self.pred_out_stream = None
        self.step_num = 0
        self.pre_encode_cache_size = self.asr_model.encoder.streaming_cfg.pre_encode_cache_size[1]
        self.num_channels = self.asr_model.cfg.preprocessor.features
        self.cache_pre_encode = torch.zeros(
            (1, self.num_channels, self.pre_encode_cache_size), device=self.asr_model.device
        )

    def init_preprocessor(self):
        cfg = copy.deepcopy(self.asr_model._cfg)
        OmegaConf.set_struct(cfg.preprocessor, False)
        # some changes for streaming scenario
        cfg.preprocessor.dither = 0.0
        cfg.preprocessor.pad_to = 0
        cfg.preprocessor.normalize = "None"
        self.preprocessor = EncDecCTCModelBPE.from_config_dict(cfg.preprocessor)
        self.preprocessor.to(self.asr_model.device)

    def set_decoding_strategy(self):
        with open_dict(self.decoding_cfg):
            self.decoding_cfg.strategy = "greedy"
            self.decoding_cfg.preserve_alignments = False
            if hasattr(self.asr_model, 'joint'):  # if an RNNT model
                self.decoding_cfg.greedy.max_symbols = 10
                self.decoding_cfg.fused_batch_size = -1
            self.asr_model.change_decoding_strategy(self.decoding_cfg)

    def preprocess_audio(self, audio):
        audio = np.frombuffer(audio, dtype=np.int16)
        audio = audio.astype(np.float32) / 32768.0
        audio = np.clip(audio, -1.0, 1.0)
        device = self.asr_model.device
        audio_signal = torch.from_numpy(audio).unsqueeze_(0).to(device)
        audio_signal_len = torch.Tensor([audio.shape[0]]).to(device)
        processed_signal, processed_signal_length = self.preprocessor(
            input_signal=audio_signal, length=audio_signal_len
        )
        return processed_signal, processed_signal_length

    def transcribe(self, audio):
        processed_signal, processed_signal_length = self.preprocess_audio(audio)
        # prepend the pre-encode cache and keep the tail for the next step
        processed_signal = torch.cat([self.cache_pre_encode, processed_signal], dim=-1)
        processed_signal_length += self.cache_pre_encode.shape[1]
        self.cache_pre_encode = processed_signal[:, :, -self.pre_encode_cache_size:]

        with torch.no_grad():
            (
                self.pred_out_stream,
                transcribed_texts,
                self.cache_last_channel,
                self.cache_last_time,
                self.cache_last_channel_len,
                self.previous_hypotheses,
            ) = self.asr_model.conformer_stream_step(
                processed_signal=processed_signal,
                processed_signal_length=processed_signal_length,
                cache_last_channel=self.cache_last_channel,
                cache_last_time=self.cache_last_time,
                cache_last_channel_len=self.cache_last_channel_len,
                keep_all_outputs=False,
                previous_hypotheses=self.previous_hypotheses,
                previous_pred_out=self.pred_out_stream,
                drop_extra_pre_encoded=None,
                return_transcription=True,
            )

        print(transcribed_texts[0].text)
        print(len(transcribed_texts))
        self.step_num += 1

    def run(self):
        from huggingface_hub import notebook_login
        notebook_login()
        gigaspeech = load_dataset(
            "speechcolab/gigaspeech", "xs", use_auth_token=True,
            token="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", streaming=True
        )
        i = 0
        while True:
            audio_bytes = next(iter(gigaspeech["test"]))['audio']['array']
            self.transcribe(audio_bytes)
            i += 1
            if i == 10:
                break
            # time.sleep(1)


Listener().run()
```

The result is `1` and an empty line.
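For reference, here is a minimal sketch (not part of the original post) of how a GigaSpeech example could be split into fixed-size chunks before being passed to `transcribe`. It assumes `example['audio']['array']` is already a float waveform at 16 kHz, and that `preprocess_audio` is adjusted to accept that float array directly instead of reinterpreting its raw bytes with `np.frombuffer(..., dtype=np.int16)`. The 160 ms chunk length mirrors `lookahead_size + encoder_step_length` from the microphone demo; the helper name `iter_chunks` is hypothetical.

```python
import numpy as np

def iter_chunks(example, chunk_seconds=0.16):
    """Yield fixed-size float32 chunks from one HF datasets audio example.

    Assumption: example['audio']['array'] is a float waveform (16 kHz for
    GigaSpeech), so it should not be re-decoded as int16 bytes before
    feature extraction.
    """
    audio = example['audio']['array'].astype(np.float32)
    sr = example['audio']['sampling_rate']
    chunk_len = int(sr * chunk_seconds)  # 2560 samples per 160 ms chunk at 16 kHz
    for start in range(0, len(audio), chunk_len):
        yield audio[start:start + chunk_len]
```

Each chunk could then be fed to a float-array variant of `transcribe` in place of the whole `audio_bytes` array used in the loop above.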