Here are some resources about Text-to-Audio and Audio-to-Text modeling, understanding, and generation in Multi-Modal LLMs
tag: Moonshine
| Useful Sensors
paper link: here
blog link: here
code link: here
citation:
@misc{jeffries2024moonshinespeechrecognitionlive,
  author        = {Jeffries, Nat and King, Evan and Kudlur, Manjunath and
                   Nicholson, Guy and Wang, James and Warden, Pete},
  title         = {Moonshine: Speech Recognition for Live Transcription and Voice Commands},
  year          = {2024},
  eprint        = {2410.15608},
  archivePrefix = {arXiv},
  primaryClass  = {cs.SD},
  url           = {https://arxiv.org/abs/2410.15608},
}
tag: Qwen2-Audio
| Alibaba Group
paper link: here
code link: here
modelhub link: here
citation:
@misc{chu2024qwen2audiotechnicalreport,
  author        = {Chu, Yunfei and Xu, Jin and Yang, Qian and Wei, Haojie and
                   Wei, Xipin and Guo, Zhifang and Leng, Yichong and Lv, Yuanjun and
                   He, Jinzheng and Lin, Junyang and Zhou, Chang and Zhou, Jingren},
  title         = {{Qwen2-Audio} Technical Report},
  year          = {2024},
  eprint        = {2407.10759},
  archivePrefix = {arXiv},
  primaryClass  = {eess.AS},
  url           = {https://arxiv.org/abs/2407.10759},
}
tag: Spirit LM
| Meta
paper link: here
code link: here
homepage link: here
citation:
@misc{nguyen2024spiritlminterleavedspoken,
  author        = {Nguyen, Tu Anh and Muller, Benjamin and Yu, Bokai and
                   Costa-jussa, Marta R. and Elbayad, Maha and Popuri, Sravya and
                   Ropers, Christophe and Duquenne, Paul-Ambroise and Algayres, Robin and
                   Mavlyutov, Ruslan and Gat, Itai and Williamson, Mary and
                   Synnaeve, Gabriel and Pino, Juan and Sagot, Benoit and
                   Dupoux, Emmanuel},
  title         = {{Spirit LM}: Interleaved Spoken and Written Language Model},
  year          = {2024},
  eprint        = {2402.05755},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  url           = {https://arxiv.org/abs/2402.05755},
}
tag: AIR-Bench
| ACL24
| Alibaba Group
| Zhejiang University
paper link: here
code link: here
citation:
@misc{yang2024airbenchbenchmarkinglargeaudiolanguage,
  author        = {Yang, Qian and Xu, Jin and Liu, Wenrui and Chu, Yunfei and
                   Jiang, Ziyue and Zhou, Xiaohuan and Leng, Yichong and Lv, Yuanjun and
                   Zhao, Zhou and Zhou, Chang and Zhou, Jingren},
  title         = {{AIR-Bench}: Benchmarking Large Audio-Language Models via Generative Comprehension},
  year          = {2024},
  eprint        = {2402.07729},
  archivePrefix = {arXiv},
  primaryClass  = {eess.AS},
  url           = {https://arxiv.org/abs/2402.07729},
}