From a906a59cb881623365817367ce74daa28d90c53f Mon Sep 17 00:00:00 2001 From: Joshua Lochner Date: Sat, 14 Dec 2024 04:13:50 +0000 Subject: [PATCH] Add support for Moonshine ASR --- README.md | 1 + docs/snippets/6_supported-models.snippet | 1 + src/configs.js | 1 + src/models.js | 24 +++++++++++++++++ src/models/feature_extractors.js | 1 + .../moonshine/feature_extraction_moonshine.js | 26 +++++++++++++++++++ src/models/moonshine/processing_moonshine.js | 20 ++++++++++++++ src/models/processors.js | 1 + 8 files changed, 75 insertions(+) create mode 100644 src/models/moonshine/feature_extraction_moonshine.js create mode 100644 src/models/moonshine/processing_moonshine.js diff --git a/README.md b/README.md index e56a7faaf..0b2ebf4c8 100644 --- a/README.md +++ b/README.md @@ -366,6 +366,7 @@ You can refine your search by selecting the task you're interested in (e.g., [te 1. **[MobileViT](https://huggingface.co/docs/transformers/model_doc/mobilevit)** (from Apple) released with the paper [MobileViT: Light-weight, General-purpose, and Mobile-friendly Vision Transformer](https://arxiv.org/abs/2110.02178) by Sachin Mehta and Mohammad Rastegari. 1. **[MobileViTV2](https://huggingface.co/docs/transformers/model_doc/mobilevitv2)** (from Apple) released with the paper [Separable Self-attention for Mobile Vision Transformers](https://arxiv.org/abs/2206.02680) by Sachin Mehta and Mohammad Rastegari. 1. **Moondream1** released in the repository [moondream](https://github.com/vikhyat/moondream) by vikhyat. +1. **[Moonshine](https://huggingface.co/docs/transformers/model_doc/moonshine)** (from Useful Sensors) released with the paper [Moonshine: Speech Recognition for Live Transcription and Voice Commands](https://arxiv.org/abs/2410.15608) by Nat Jeffries, Evan King, Manjunath Kudlur, Guy Nicholson, James Wang, Pete Warden. 1. 
**[MPNet](https://huggingface.co/docs/transformers/model_doc/mpnet)** (from Microsoft Research) released with the paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu. 1. **[MPT](https://huggingface.co/docs/transformers/model_doc/mpt)** (from MosaicML) released with the repository [llm-foundry](https://github.com/mosaicml/llm-foundry/) by the MosaicML NLP Team. 1. **[MT5](https://huggingface.co/docs/transformers/model_doc/mt5)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel. diff --git a/docs/snippets/6_supported-models.snippet b/docs/snippets/6_supported-models.snippet index ad4f6cdc4..d7c149840 100644 --- a/docs/snippets/6_supported-models.snippet +++ b/docs/snippets/6_supported-models.snippet @@ -81,6 +81,7 @@ 1. **[MobileViT](https://huggingface.co/docs/transformers/model_doc/mobilevit)** (from Apple) released with the paper [MobileViT: Light-weight, General-purpose, and Mobile-friendly Vision Transformer](https://arxiv.org/abs/2110.02178) by Sachin Mehta and Mohammad Rastegari. 1. **[MobileViTV2](https://huggingface.co/docs/transformers/model_doc/mobilevitv2)** (from Apple) released with the paper [Separable Self-attention for Mobile Vision Transformers](https://arxiv.org/abs/2206.02680) by Sachin Mehta and Mohammad Rastegari. 1. **Moondream1** released in the repository [moondream](https://github.com/vikhyat/moondream) by vikhyat. +1. 
**[Moonshine](https://huggingface.co/docs/transformers/model_doc/moonshine)** (from Useful Sensors) released with the paper [Moonshine: Speech Recognition for Live Transcription and Voice Commands](https://arxiv.org/abs/2410.15608) by Nat Jeffries, Evan King, Manjunath Kudlur, Guy Nicholson, James Wang, Pete Warden. 1. **[MPNet](https://huggingface.co/docs/transformers/model_doc/mpnet)** (from Microsoft Research) released with the paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu. 1. **[MPT](https://huggingface.co/docs/transformers/model_doc/mpt)** (from MosaicML) released with the repository [llm-foundry](https://github.com/mosaicml/llm-foundry/) by the MosaicML NLP Team. 1. **[MT5](https://huggingface.co/docs/transformers/model_doc/mt5)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel. 
diff --git a/src/configs.js b/src/configs.js index a40bb59d9..86e0dcd04 100644 --- a/src/configs.js +++ b/src/configs.js @@ -185,6 +185,7 @@ function getNormalizedConfig(config) { mapping['encoder_hidden_size'] = mapping['decoder_hidden_size'] = 'd_model'; break; case 'musicgen_decoder': + case 'moonshine': mapping['num_encoder_layers'] = mapping['num_decoder_layers'] = 'num_hidden_layers'; mapping['num_encoder_heads'] = mapping['num_decoder_heads'] = 'num_attention_heads'; mapping['encoder_hidden_size'] = mapping['decoder_hidden_size'] = 'hidden_size'; diff --git a/src/models.js b/src/models.js index 93d92e8c6..eaa992df8 100644 --- a/src/models.js +++ b/src/models.js @@ -3342,6 +3342,29 @@ export class WhisperForConditionalGeneration extends WhisperPreTrainedModel { } ////////////////////////////////////////////////// + +////////////////////////////////////////////////// +// Moonshine models +export class MoonshinePreTrainedModel extends PreTrainedModel { + + requires_attention_mask = false; + main_input_name = 'input_values'; + forward_params = [ + 'input_values', + 'decoder_input_ids', + 'past_key_values', + ]; +}; + +/** + * MoonshineModel class for training Moonshine models without a language model head. 
+ */ +export class MoonshineModel extends MoonshinePreTrainedModel { } + +export class MoonshineForConditionalGeneration extends MoonshinePreTrainedModel { } +////////////////////////////////////////////////// + + ////////////////////////////////////////////////// /** * Vision Encoder-Decoder model based on OpenAI's GPT architecture for image captioning and other vision tasks @@ -6925,6 +6948,7 @@ const MODEL_MAPPING_NAMES_DECODER_ONLY = new Map([ const MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES = new Map([ ['speecht5', ['SpeechT5ForSpeechToText', SpeechT5ForSpeechToText]], ['whisper', ['WhisperForConditionalGeneration', WhisperForConditionalGeneration]], + ['moonshine', ['MoonshineForConditionalGeneration', MoonshineForConditionalGeneration]], ]); const MODEL_FOR_TEXT_TO_SPECTROGRAM_MAPPING_NAMES = new Map([ diff --git a/src/models/feature_extractors.js b/src/models/feature_extractors.js index 869c8191b..98aa61572 100644 --- a/src/models/feature_extractors.js +++ b/src/models/feature_extractors.js @@ -1,6 +1,7 @@ export * from './audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.js'; export * from './clap/feature_extraction_clap.js'; +export * from './moonshine/feature_extraction_moonshine.js'; export * from './pyannote/feature_extraction_pyannote.js'; export * from './seamless_m4t/feature_extraction_seamless_m4t.js'; export * from './speecht5/feature_extraction_speecht5.js'; diff --git a/src/models/moonshine/feature_extraction_moonshine.js b/src/models/moonshine/feature_extraction_moonshine.js new file mode 100644 index 000000000..9f01ab342 --- /dev/null +++ b/src/models/moonshine/feature_extraction_moonshine.js @@ -0,0 +1,26 @@ +import { FeatureExtractor, validate_audio_inputs } from '../../base/feature_extraction_utils.js'; +import { Tensor } from '../../utils/tensor.js'; + + +export class MoonshineFeatureExtractor extends FeatureExtractor { + /** + * Asynchronously extracts input values from a given audio using the provided 
configuration. + * @param {Float32Array|Float64Array} audio The audio data as a Float32Array/Float64Array. + * @returns {Promise<{ input_values: Tensor; }>} The extracted input values. + */ + async _call(audio) { + validate_audio_inputs(audio, 'MoonshineFeatureExtractor'); + + if (audio instanceof Float64Array) { + audio = new Float32Array(audio); + } + + const shape = [ + 1, /* batch_size */ + audio.length, /* num_samples */ + ]; + return { + input_values: new Tensor('float32', audio, shape), + }; + } +} diff --git a/src/models/moonshine/processing_moonshine.js b/src/models/moonshine/processing_moonshine.js new file mode 100644 index 000000000..e313976ec --- /dev/null +++ b/src/models/moonshine/processing_moonshine.js @@ -0,0 +1,20 @@ +import { AutoFeatureExtractor } from "../auto/feature_extraction_auto.js" +import { AutoTokenizer } from "../../tokenizers.js" +import { Processor } from "../../base/processing_utils.js" + +/** + * Processor for Moonshine models that extracts features from an audio input by delegating to its feature extractor. + */ +export class MoonshineProcessor extends Processor { + static tokenizer_class = AutoTokenizer + static feature_extractor_class = AutoFeatureExtractor + + /** + * Calls the feature_extractor function with the given audio input. + * @param {any} audio The audio input to extract features from. + * @returns {Promise<any>} A Promise that resolves with the output of the feature extractor. + */ + async _call(audio) { + return await this.feature_extractor(audio); + } +} diff --git a/src/models/processors.js b/src/models/processors.js index ee388851c..974f57039 100644 --- a/src/models/processors.js +++ b/src/models/processors.js @@ -1,5 +1,6 @@ export * from './florence2/processing_florence2.js'; export * from './mgp_str/processing_mgp_str.js'; +export * from './moonshine/processing_moonshine.js'; export * from './idefics3/processing_idefics3.js'; export * from './janus/processing_janus.js'; export * from './jina_clip/processing_jina_clip.js';