diff --git a/README.md b/README.md index e56a7faaf..68d79e27b 100644 --- a/README.md +++ b/README.md @@ -381,7 +381,8 @@ You can refine your search by selecting the task you're interested in (e.g., [te 1. **[PatchTSMixer](https://huggingface.co/docs/transformers/main/model_doc/patchtsmixer)** (from IBM) released with the paper [TSMixer: Lightweight MLP-Mixer Model for Multivariate Time Series Forecasting](https://arxiv.org/abs/2306.09364) by Vijay Ekambaram, Arindam Jati, Nam Nguyen, Phanwadee Sinthong, Jayant Kalagnanam. 1. **[PatchTST](https://huggingface.co/docs/transformers/main/model_doc/patchtst)** (from Princeton University, IBM) released with the paper [A Time Series is Worth 64 Words: Long-term Forecasting with Transformers](https://arxiv.org/abs/2211.14730) by Yuqi Nie, Nam H. Nguyen, Phanwadee Sinthong, Jayant Kalagnanam. 1. **[Phi](https://huggingface.co/docs/transformers/main/model_doc/phi)** (from Microsoft) released with the papers - [Textbooks Are All You Need](https://arxiv.org/abs/2306.11644) by Suriya Gunasekar, Yi Zhang, Jyoti Aneja, Caio César Teodoro Mendes, Allie Del Giorno, Sivakanth Gopi, Mojan Javaheripi, Piero Kauffmann, Gustavo de Rosa, Olli Saarikivi, Adil Salim, Shital Shah, Harkirat Singh Behl, Xin Wang, Sébastien Bubeck, Ronen Eldan, Adam Tauman Kalai, Yin Tat Lee and Yuanzhi Li, [Textbooks Are All You Need II: phi-1.5 technical report](https://arxiv.org/abs/2309.05463) by Yuanzhi Li, Sébastien Bubeck, Ronen Eldan, Allie Del Giorno, Suriya Gunasekar and Yin Tat Lee. -1. **[Phi3](https://huggingface.co/docs/transformers/main/model_doc/phi3)** (from Microsoft) released with the paper [Phi-3 Technical Report: A Highly Capable Language Model Locally on Your Phone](https://arxiv.org/abs/2404.14219) by Marah Abdin, Sam Ade Jacobs, Ammar Ahmad Awan, Jyoti Aneja, Ahmed Awadallah, Hany Awadalla, Nguyen Bach, Amit Bahree, Arash Bakhtiari, Harkirat Behl, Alon Benhaim, Misha Bilenko, Johan Bjorck, Sébastien Bubeck, Martin Cai, Caio César Teodoro Mendes, Weizhu Chen, Vishrav Chaudhary, Parul Chopra, Allie Del Giorno, Gustavo de Rosa, Matthew Dixon, Ronen Eldan, Dan Iter, Amit Garg, Abhishek Goswami, Suriya Gunasekar, Emman Haider, Junheng Hao, Russell J. Hewett, Jamie Huynh, Mojan Javaheripi, Xin Jin, Piero Kauffmann, Nikos Karampatziakis, Dongwoo Kim, Mahoud Khademi, Lev Kurilenko, James R. Lee, Yin Tat Lee, Yuanzhi Li, Chen Liang, Weishung Liu, Eric Lin, Zeqi Lin, Piyush Madan, Arindam Mitra, Hardik Modi, Anh Nguyen, Brandon Norick, Barun Patra, Daniel Perez-Becker, Thomas Portet, Reid Pryzant, Heyang Qin, Marko Radmilac, Corby Rosset, Sambudha Roy, Olatunji Ruwase, Olli Saarikivi, Amin Saied, Adil Salim, Michael Santacroce, Shital Shah, Ning Shang, Hiteshi Sharma, Xia Song, Masahiro Tanaka, Xin Wang, Rachel Ward, Guanhua Wang, Philipp Witte, Michael Wyatt, Can Xu, Jiahang Xu, Sonali Yadav, Fan Yang, Ziyi Yang, Donghan Yu, Chengruidong Zhang, Cyril Zhang, Jianwen Zhang, Li Lyna Zhang, Yi Zhang, Yue Zhang, Yunan Zhang, Xiren Zhou. +1. 
**[Phi3](https://huggingface.co/docs/transformers/main/model_doc/phi3)** (from Microsoft) released with the paper [Phi-3 Technical Report: A Highly Capable Language Model Locally on Your Phone](https://arxiv.org/abs/2404.14219v2) by Marah Abdin, Sam Ade Jacobs, Ammar Ahmad Awan, Jyoti Aneja, Ahmed Awadallah, Hany Awadalla, Nguyen Bach, Amit Bahree, Arash Bakhtiari, Harkirat Behl, Alon Benhaim, Misha Bilenko, Johan Bjorck, Sébastien Bubeck, Martin Cai, Caio César Teodoro Mendes, Weizhu Chen, Vishrav Chaudhary, Parul Chopra, Allie Del Giorno, Gustavo de Rosa, Matthew Dixon, Ronen Eldan, Dan Iter, Amit Garg, Abhishek Goswami, Suriya Gunasekar, Emman Haider, Junheng Hao, Russell J. Hewett, Jamie Huynh, Mojan Javaheripi, Xin Jin, Piero Kauffmann, Nikos Karampatziakis, Dongwoo Kim, Mahoud Khademi, Lev Kurilenko, James R. Lee, Yin Tat Lee, Yuanzhi Li, Chen Liang, Weishung Liu, Eric Lin, Zeqi Lin, Piyush Madan, Arindam Mitra, Hardik Modi, Anh Nguyen, Brandon Norick, Barun Patra, Daniel Perez-Becker, Thomas Portet, Reid Pryzant, Heyang Qin, Marko Radmilac, Corby Rosset, Sambudha Roy, Olatunji Ruwase, Olli Saarikivi, Amin Saied, Adil Salim, Michael Santacroce, Shital Shah, Ning Shang, Hiteshi Sharma, Xia Song, Masahiro Tanaka, Xin Wang, Rachel Ward, Guanhua Wang, Philipp Witte, Michael Wyatt, Can Xu, Jiahang Xu, Sonali Yadav, Fan Yang, Ziyi Yang, Donghan Yu, Chengruidong Zhang, Cyril Zhang, Jianwen Zhang, Li Lyna Zhang, Yi Zhang, Yue Zhang, Yunan Zhang, Xiren Zhou. +1. **Phi3V** (from Microsoft) released with the paper [Phi-3 Technical Report: A Highly Capable Language Model Locally on Your Phone](https://arxiv.org/abs/2404.14219v4) by Marah Abdin, Jyoti Aneja, Hany Awadalla, Ahmed Awadallah, Ammar Ahmad Awan, Nguyen Bach, Amit Bahree, Arash Bakhtiari, Jianmin Bao, Harkirat Behl, Alon Benhaim, Misha Bilenko, Johan Bjorck, Sébastien Bubeck, Martin Cai, Qin Cai, Vishrav Chaudhary, Dong Chen, Dongdong Chen, Weizhu Chen, Yen-Chun Chen, Yi-Ling Chen, Hao Cheng, Parul Chopra, Xiyang Dai, Matthew Dixon, Ronen Eldan, Victor Fragoso, Jianfeng Gao, Mei Gao, Min Gao, Amit Garg, Allie Del Giorno, Abhishek Goswami, Suriya Gunasekar, Emman Haider, Junheng Hao, Russell J. Hewett, Wenxiang Hu, Jamie Huynh, Dan Iter, Sam Ade Jacobs, Mojan Javaheripi, Xin Jin, Nikos Karampatziakis, Piero Kauffmann, Mahoud Khademi, Dongwoo Kim, Young Jin Kim, Lev Kurilenko, James R. Lee, Yin Tat Lee, Yuanzhi Li, Yunsheng Li, Chen Liang, Lars Liden, Xihui Lin, Zeqi Lin, Ce Liu, Liyuan Liu, Mengchen Liu, Weishung Liu, Xiaodong Liu, Chong Luo, Piyush Madan, Ali Mahmoudzadeh, David Majercak, Matt Mazzola, Caio César Teodoro Mendes, Arindam Mitra, Hardik Modi, Anh Nguyen, Brandon Norick, Barun Patra, Daniel Perez-Becker, Thomas Portet, Reid Pryzant, Heyang Qin, Marko Radmilac, Liliang Ren, Gustavo de Rosa, Corby Rosset, Sambudha Roy, Olatunji Ruwase, Olli Saarikivi, Amin Saied, Adil Salim, Michael Santacroce, Shital Shah, Ning Shang, Hiteshi Sharma, Yelong Shen, Swadheen Shukla, Xia Song, Masahiro Tanaka, Andrea Tupini, Praneetha Vaddamanu, Chunyu Wang, Guanhua Wang, Lijuan Wang , Shuohang Wang, Xin Wang, Yu Wang, Rachel Ward, Wen Wen, Philipp Witte, Haiping Wu, Xiaoxia Wu, Michael Wyatt, Bin Xiao, Can Xu, Jiahang Xu, Weijian Xu, Jilong Xue, Sonali Yadav, Fan Yang, Jianwei Yang, Yifan Yang, Ziyi Yang, Donghan Yu, Lu Yuan, Chenruidong Zhang, Cyril Zhang, Jianwen Zhang, Li Lyna Zhang, Yi Zhang, Yue Zhang, Yunan Zhang, Xiren Zhou. 1. 
**[PVT](https://huggingface.co/docs/transformers/main/model_doc/pvt)** (from Nanjing University, The University of Hong Kong etc.) released with the paper [Pyramid Vision Transformer: A Versatile Backbone for Dense Prediction without Convolutions](https://arxiv.org/pdf/2102.12122.pdf) by Wenhai Wang, Enze Xie, Xiang Li, Deng-Ping Fan, Kaitao Song, Ding Liang, Tong Lu, Ping Luo, Ling Shao. 1. **PyAnnote** released in the repository [pyannote/pyannote-audio](https://github.com/pyannote/pyannote-audio) by Hervé Bredin. 1. **[Qwen2](https://huggingface.co/docs/transformers/model_doc/qwen2)** (from the Qwen team, Alibaba Group) released with the paper [Qwen Technical Report](https://arxiv.org/abs/2309.16609) by Jinze Bai, Shuai Bai, Yunfei Chu, Zeyu Cui, Kai Dang, Xiaodong Deng, Yang Fan, Wenbin Ge, Yu Han, Fei Huang, Binyuan Hui, Luo Ji, Mei Li, Junyang Lin, Runji Lin, Dayiheng Liu, Gao Liu, Chengqiang Lu, Keming Lu, Jianxin Ma, Rui Men, Xingzhang Ren, Xuancheng Ren, Chuanqi Tan, Sinan Tan, Jianhong Tu, Peng Wang, Shijie Wang, Wei Wang, Shengguang Wu, Benfeng Xu, Jin Xu, An Yang, Hao Yang, Jian Yang, Shusheng Yang, Yang Yao, Bowen Yu, Hongyi Yuan, Zheng Yuan, Jianwei Zhang, Xingxuan Zhang, Yichang Zhang, Zhenru Zhang, Chang Zhou, Jingren Zhou, Xiaohuan Zhou and Tianhang Zhu. diff --git a/docs/snippets/6_supported-models.snippet b/docs/snippets/6_supported-models.snippet index ad4f6cdc4..aa971793e 100644 --- a/docs/snippets/6_supported-models.snippet +++ b/docs/snippets/6_supported-models.snippet @@ -96,7 +96,8 @@ 1. **[PatchTSMixer](https://huggingface.co/docs/transformers/main/model_doc/patchtsmixer)** (from IBM) released with the paper [TSMixer: Lightweight MLP-Mixer Model for Multivariate Time Series Forecasting](https://arxiv.org/abs/2306.09364) by Vijay Ekambaram, Arindam Jati, Nam Nguyen, Phanwadee Sinthong, Jayant Kalagnanam. 1. **[PatchTST](https://huggingface.co/docs/transformers/main/model_doc/patchtst)** (from Princeton University, IBM) released with the paper [A Time Series is Worth 64 Words: Long-term Forecasting with Transformers](https://arxiv.org/abs/2211.14730) by Yuqi Nie, Nam H. Nguyen, Phanwadee Sinthong, Jayant Kalagnanam. 1. **[Phi](https://huggingface.co/docs/transformers/main/model_doc/phi)** (from Microsoft) released with the papers - [Textbooks Are All You Need](https://arxiv.org/abs/2306.11644) by Suriya Gunasekar, Yi Zhang, Jyoti Aneja, Caio César Teodoro Mendes, Allie Del Giorno, Sivakanth Gopi, Mojan Javaheripi, Piero Kauffmann, Gustavo de Rosa, Olli Saarikivi, Adil Salim, Shital Shah, Harkirat Singh Behl, Xin Wang, Sébastien Bubeck, Ronen Eldan, Adam Tauman Kalai, Yin Tat Lee and Yuanzhi Li, [Textbooks Are All You Need II: phi-1.5 technical report](https://arxiv.org/abs/2309.05463) by Yuanzhi Li, Sébastien Bubeck, Ronen Eldan, Allie Del Giorno, Suriya Gunasekar and Yin Tat Lee. -1. **[Phi3](https://huggingface.co/docs/transformers/main/model_doc/phi3)** (from Microsoft) released with the paper [Phi-3 Technical Report: A Highly Capable Language Model Locally on Your Phone](https://arxiv.org/abs/2404.14219) by Marah Abdin, Sam Ade Jacobs, Ammar Ahmad Awan, Jyoti Aneja, Ahmed Awadallah, Hany Awadalla, Nguyen Bach, Amit Bahree, Arash Bakhtiari, Harkirat Behl, Alon Benhaim, Misha Bilenko, Johan Bjorck, Sébastien Bubeck, Martin Cai, Caio César Teodoro Mendes, Weizhu Chen, Vishrav Chaudhary, Parul Chopra, Allie Del Giorno, Gustavo de Rosa, Matthew Dixon, Ronen Eldan, Dan Iter, Amit Garg, Abhishek Goswami, Suriya Gunasekar, Emman Haider, Junheng Hao, Russell J. 
Hewett, Jamie Huynh, Mojan Javaheripi, Xin Jin, Piero Kauffmann, Nikos Karampatziakis, Dongwoo Kim, Mahoud Khademi, Lev Kurilenko, James R. Lee, Yin Tat Lee, Yuanzhi Li, Chen Liang, Weishung Liu, Eric Lin, Zeqi Lin, Piyush Madan, Arindam Mitra, Hardik Modi, Anh Nguyen, Brandon Norick, Barun Patra, Daniel Perez-Becker, Thomas Portet, Reid Pryzant, Heyang Qin, Marko Radmilac, Corby Rosset, Sambudha Roy, Olatunji Ruwase, Olli Saarikivi, Amin Saied, Adil Salim, Michael Santacroce, Shital Shah, Ning Shang, Hiteshi Sharma, Xia Song, Masahiro Tanaka, Xin Wang, Rachel Ward, Guanhua Wang, Philipp Witte, Michael Wyatt, Can Xu, Jiahang Xu, Sonali Yadav, Fan Yang, Ziyi Yang, Donghan Yu, Chengruidong Zhang, Cyril Zhang, Jianwen Zhang, Li Lyna Zhang, Yi Zhang, Yue Zhang, Yunan Zhang, Xiren Zhou. +1. **[Phi3](https://huggingface.co/docs/transformers/main/model_doc/phi3)** (from Microsoft) released with the paper [Phi-3 Technical Report: A Highly Capable Language Model Locally on Your Phone](https://arxiv.org/abs/2404.14219v2) by Marah Abdin, Sam Ade Jacobs, Ammar Ahmad Awan, Jyoti Aneja, Ahmed Awadallah, Hany Awadalla, Nguyen Bach, Amit Bahree, Arash Bakhtiari, Harkirat Behl, Alon Benhaim, Misha Bilenko, Johan Bjorck, Sébastien Bubeck, Martin Cai, Caio César Teodoro Mendes, Weizhu Chen, Vishrav Chaudhary, Parul Chopra, Allie Del Giorno, Gustavo de Rosa, Matthew Dixon, Ronen Eldan, Dan Iter, Amit Garg, Abhishek Goswami, Suriya Gunasekar, Emman Haider, Junheng Hao, Russell J. Hewett, Jamie Huynh, Mojan Javaheripi, Xin Jin, Piero Kauffmann, Nikos Karampatziakis, Dongwoo Kim, Mahoud Khademi, Lev Kurilenko, James R. Lee, Yin Tat Lee, Yuanzhi Li, Chen Liang, Weishung Liu, Eric Lin, Zeqi Lin, Piyush Madan, Arindam Mitra, Hardik Modi, Anh Nguyen, Brandon Norick, Barun Patra, Daniel Perez-Becker, Thomas Portet, Reid Pryzant, Heyang Qin, Marko Radmilac, Corby Rosset, Sambudha Roy, Olatunji Ruwase, Olli Saarikivi, Amin Saied, Adil Salim, Michael Santacroce, Shital Shah, Ning Shang, Hiteshi Sharma, Xia Song, Masahiro Tanaka, Xin Wang, Rachel Ward, Guanhua Wang, Philipp Witte, Michael Wyatt, Can Xu, Jiahang Xu, Sonali Yadav, Fan Yang, Ziyi Yang, Donghan Yu, Chengruidong Zhang, Cyril Zhang, Jianwen Zhang, Li Lyna Zhang, Yi Zhang, Yue Zhang, Yunan Zhang, Xiren Zhou. +1. **Phi3V** (from Microsoft) released with the paper [Phi-3 Technical Report: A Highly Capable Language Model Locally on Your Phone](https://arxiv.org/abs/2404.14219v4) by Marah Abdin, Jyoti Aneja, Hany Awadalla, Ahmed Awadallah, Ammar Ahmad Awan, Nguyen Bach, Amit Bahree, Arash Bakhtiari, Jianmin Bao, Harkirat Behl, Alon Benhaim, Misha Bilenko, Johan Bjorck, Sébastien Bubeck, Martin Cai, Qin Cai, Vishrav Chaudhary, Dong Chen, Dongdong Chen, Weizhu Chen, Yen-Chun Chen, Yi-Ling Chen, Hao Cheng, Parul Chopra, Xiyang Dai, Matthew Dixon, Ronen Eldan, Victor Fragoso, Jianfeng Gao, Mei Gao, Min Gao, Amit Garg, Allie Del Giorno, Abhishek Goswami, Suriya Gunasekar, Emman Haider, Junheng Hao, Russell J. Hewett, Wenxiang Hu, Jamie Huynh, Dan Iter, Sam Ade Jacobs, Mojan Javaheripi, Xin Jin, Nikos Karampatziakis, Piero Kauffmann, Mahoud Khademi, Dongwoo Kim, Young Jin Kim, Lev Kurilenko, James R. 
Lee, Yin Tat Lee, Yuanzhi Li, Yunsheng Li, Chen Liang, Lars Liden, Xihui Lin, Zeqi Lin, Ce Liu, Liyuan Liu, Mengchen Liu, Weishung Liu, Xiaodong Liu, Chong Luo, Piyush Madan, Ali Mahmoudzadeh, David Majercak, Matt Mazzola, Caio César Teodoro Mendes, Arindam Mitra, Hardik Modi, Anh Nguyen, Brandon Norick, Barun Patra, Daniel Perez-Becker, Thomas Portet, Reid Pryzant, Heyang Qin, Marko Radmilac, Liliang Ren, Gustavo de Rosa, Corby Rosset, Sambudha Roy, Olatunji Ruwase, Olli Saarikivi, Amin Saied, Adil Salim, Michael Santacroce, Shital Shah, Ning Shang, Hiteshi Sharma, Yelong Shen, Swadheen Shukla, Xia Song, Masahiro Tanaka, Andrea Tupini, Praneetha Vaddamanu, Chunyu Wang, Guanhua Wang, Lijuan Wang , Shuohang Wang, Xin Wang, Yu Wang, Rachel Ward, Wen Wen, Philipp Witte, Haiping Wu, Xiaoxia Wu, Michael Wyatt, Bin Xiao, Can Xu, Jiahang Xu, Weijian Xu, Jilong Xue, Sonali Yadav, Fan Yang, Jianwei Yang, Yifan Yang, Ziyi Yang, Donghan Yu, Lu Yuan, Chenruidong Zhang, Cyril Zhang, Jianwen Zhang, Li Lyna Zhang, Yi Zhang, Yue Zhang, Yunan Zhang, Xiren Zhou. 1. **[PVT](https://huggingface.co/docs/transformers/main/model_doc/pvt)** (from Nanjing University, The University of Hong Kong etc.) released with the paper [Pyramid Vision Transformer: A Versatile Backbone for Dense Prediction without Convolutions](https://arxiv.org/pdf/2102.12122.pdf) by Wenhai Wang, Enze Xie, Xiang Li, Deng-Ping Fan, Kaitao Song, Ding Liang, Tong Lu, Ping Luo, Ling Shao. 1. **PyAnnote** released in the repository [pyannote/pyannote-audio](https://github.com/pyannote/pyannote-audio) by Hervé Bredin. 1. **[Qwen2](https://huggingface.co/docs/transformers/model_doc/qwen2)** (from the Qwen team, Alibaba Group) released with the paper [Qwen Technical Report](https://arxiv.org/abs/2309.16609) by Jinze Bai, Shuai Bai, Yunfei Chu, Zeyu Cui, Kai Dang, Xiaodong Deng, Yang Fan, Wenbin Ge, Yu Han, Fei Huang, Binyuan Hui, Luo Ji, Mei Li, Junyang Lin, Runji Lin, Dayiheng Liu, Gao Liu, Chengqiang Lu, Keming Lu, Jianxin Ma, Rui Men, Xingzhang Ren, Xuancheng Ren, Chuanqi Tan, Sinan Tan, Jianhong Tu, Peng Wang, Shijie Wang, Wei Wang, Shengguang Wu, Benfeng Xu, Jin Xu, An Yang, Hao Yang, Jian Yang, Shusheng Yang, Yang Yao, Bowen Yu, Hongyi Yuan, Zheng Yuan, Jianwei Zhang, Xingxuan Zhang, Yichang Zhang, Zhenru Zhang, Chang Zhou, Jingren Zhou, Xiaohuan Zhou and Tianhang Zhu. diff --git a/src/base/image_processors_utils.js b/src/base/image_processors_utils.js index 6788258f6..30111ad0c 100644 --- a/src/base/image_processors_utils.js +++ b/src/base/image_processors_utils.js @@ -699,7 +699,7 @@ export class ImageProcessor extends Callable { * Pad the image by a certain amount. * @param {Float32Array} pixelData The pixel data to pad. * @param {number[]} imgDims The dimensions of the image (height, width, channels). - * @param {{width:number; height:number}|number} padSize The dimensions of the padded image. + * @param {{width:number; height:number}|number|'square'} padSize The dimensions of the padded image. * @param {Object} options The options for padding. * @param {'constant'|'symmetric'} [options.mode='constant'] The type of padding to add. * @param {boolean} [options.center=false] Whether to center the image. 
@@ -717,6 +717,8 @@ export class ImageProcessor extends Callable { if (typeof padSize === 'number') { paddedImageWidth = padSize; paddedImageHeight = padSize; + } else if (padSize === 'square') { + paddedImageWidth = paddedImageHeight = Math.max(imageHeight, imageWidth); } else { paddedImageWidth = padSize.width; paddedImageHeight = padSize.height; diff --git a/src/configs.js b/src/configs.js index a40bb59d9..8964c6506 100644 --- a/src/configs.js +++ b/src/configs.js @@ -95,8 +95,6 @@ function getNormalizedConfig(config) { case 'gpt_neox': case 'stablelm': case 'opt': - case 'phi': - case 'phi3': case 'falcon': mapping['num_heads'] = 'num_attention_heads'; mapping['num_layers'] = 'num_hidden_layers'; @@ -112,6 +110,9 @@ function getNormalizedConfig(config) { case 'starcoder2': case 'qwen2': case 'qwen2_vl': + case 'phi': + case 'phi3': + case 'phi3_v': mapping['num_heads'] = 'num_key_value_heads'; mapping['num_layers'] = 'num_hidden_layers'; mapping['hidden_size'] = 'hidden_size'; diff --git a/src/models.js b/src/models.js index 93d92e8c6..1094f10c3 100644 --- a/src/models.js +++ b/src/models.js @@ -131,6 +131,7 @@ const MODEL_TYPES = { ImageTextToText: 6, Musicgen: 7, MultiModality: 8, + Phi3V: 9, } ////////////////////////////////////////////////// @@ -906,6 +907,10 @@ export class PreTrainedModel extends Callable { this._forward = imageTextToTextForward; this._prepare_inputs_for_generation = image_text_to_text_prepare_inputs_for_generation; break; + case MODEL_TYPES.Phi3V: + this.can_generate = true; + this._prepare_inputs_for_generation = image_text_to_text_prepare_inputs_for_generation; + break; case MODEL_TYPES.MultiModality: this.can_generate = true; @@ -1070,6 +1075,18 @@ export class PreTrainedModel extends Callable { }, options), ]); + } else if (modelType === MODEL_TYPES.Phi3V) { + info = await Promise.all([ + constructSessions(pretrained_model_name_or_path, { + prepare_inputs_embeds: 'prepare_inputs_embeds', + model: 'model', + vision_encoder: 'vision_encoder', + }, options), + getOptionalConfigs(pretrained_model_name_or_path, { + generation_config: 'generation_config.json', + }, options), + ]); + } else { // should be MODEL_TYPES.EncoderOnly if (modelType !== MODEL_TYPES.EncoderOnly) { const type = modelName ?? config?.model_type; @@ -3612,6 +3629,77 @@ export class Idefics3ForConditionalGeneration extends Idefics3PreTrainedModel { } ////////////////////////////////////////////////// +export class Phi3VPreTrainedModel extends PreTrainedModel { + forward_params = [ + 'input_ids', + 'inputs_embeds', + 'attention_mask', + 'position_ids', + 'pixel_values', + 'image_sizes', + 'past_key_values', + ]; +} +export class Phi3VForCausalLM extends Phi3VPreTrainedModel { + + async forward({ + // Produced by the tokenizer/processor: + input_ids = null, + attention_mask = null, + pixel_values = null, + image_sizes = null, + + // Used during generation: + position_ids = null, + inputs_embeds = null, + past_key_values = null, + + // Generic generation parameters + generation_config = null, + logits_processor = null, + + // TODO: needed? 
+ ...kwargs + }) { + if (!inputs_embeds) { + let image_features; + if (pixel_values && input_ids.dims[1] !== 1) { + if (!image_sizes) { + throw new Error('`image_sizes` must be provided when `pixel_values` is provided.'); + } + + // Encode the image + ({ image_features } = await sessionRun(this.sessions['vision_encoder'], { + pixel_values, + image_sizes, + })); + } else { + const hidden_size = this.config.normalized_config.hidden_size; + image_features = new Tensor( + 'float32', + [], + [0, hidden_size], + ); + } + + ({ inputs_embeds } = await sessionRun(this.sessions['prepare_inputs_embeds'], { + input_ids, + image_features, + })); + } + + const outputs = await decoderForward(this, { + inputs_embeds, + past_key_values, + attention_mask, + position_ids, + generation_config, + logits_processor, + }, false); + return outputs; + } +} + ////////////////////////////////////////////////// export class CLIPPreTrainedModel extends PreTrainedModel { } @@ -7014,6 +7102,9 @@ const MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = new Map([ ['falcon', ['FalconForCausalLM', FalconForCausalLM]], ['trocr', ['TrOCRForCausalLM', TrOCRForCausalLM]], ['stablelm', ['StableLmForCausalLM', StableLmForCausalLM]], + + // Also image-text-to-text + ['phi3_v', ['Phi3VForCausalLM', Phi3VForCausalLM]], ]); const MODEL_FOR_MULTIMODALITY_MAPPING_NAMES = new Map([ @@ -7251,6 +7342,7 @@ const CUSTOM_MAPPING = [ // OVERRIDE: // TODO: Refactor to allow class to specify model ['MusicgenForConditionalGeneration', MusicgenForConditionalGeneration, MODEL_TYPES.Musicgen], + ['Phi3VForCausalLM', Phi3VForCausalLM, MODEL_TYPES.Phi3V], ['CLIPTextModelWithProjection', CLIPTextModelWithProjection, MODEL_TYPES.EncoderOnly], ['SiglipTextModel', SiglipTextModel, MODEL_TYPES.EncoderOnly], diff --git a/src/models/idefics3/image_processing_idefics3.js b/src/models/idefics3/image_processing_idefics3.js index 0da6c2cc7..8864661c9 100644 --- a/src/models/idefics3/image_processing_idefics3.js +++ b/src/models/idefics3/image_processing_idefics3.js @@ -3,7 +3,7 @@ import { ImageProcessor, } from "../../base/image_processors_utils.js"; -import { cat, full, interpolate_4d, stack } from "../../utils/tensor.js"; +import { cat, full, interpolate_4d, slice, stack } from "../../utils/tensor.js"; export class Idefics3ImageProcessor extends ImageProcessor { constructor(config) { @@ -186,18 +186,29 @@ export class Idefics3ImageProcessor extends ImageProcessor { const optimal_width = Math.ceil(width / num_splits_w); // Iterate through each row and column - for (let r = 0; r < num_splits_h; r++) { - for (let c = 0; c < num_splits_w; c++) { - // Calculate the starting point of the crop - const start_x = c * optimal_width; - const start_y = r * optimal_height; - - // Calculate the ending point of the crop - const end_x = Math.min(start_x + optimal_width, width); - const end_y = Math.min(start_y + optimal_height, height); - - // Crop the image - frames.push(pixel_values.slice(null, null, [start_y, end_y], [start_x, end_x])); + for (let r = 0; r < num_splits_h; ++r) { + for (let c = 0; c < num_splits_w; ++c) { + let start_x, start_y, end_x, end_y; + if (r === num_splits_h - 1) { // At bottom + start_y = height - optimal_height; + end_y = height; + } else { + start_y = r * optimal_height; + end_y = (r + 1) * optimal_height; + } + if (c === num_splits_w - 1) { // At right + start_x = width - optimal_width; + end_x = width; + } else { + start_x = c * optimal_width; + end_x = (c + 1) * optimal_width; + } + + const starts = [start_y, start_x]; + const ends = [end_y, end_x]; + + 
const patch = await slice(pixel_values, starts, ends, [2, 3]); + frames.push(patch); } } diff --git a/src/models/image_processors.js b/src/models/image_processors.js index 02815771c..fd002c81c 100644 --- a/src/models/image_processors.js +++ b/src/models/image_processors.js @@ -24,6 +24,7 @@ export * from './mobilevit/image_processing_mobilevit.js' export * from './nougat/image_processing_nougat.js' export * from './owlv2/image_processing_owlv2.js' export * from './owlvit/image_processing_owlvit.js' +export * from './phi3_v/image_processing_phi3_v.js' export * from './pvt/image_processing_pvt.js' export * from './qwen2_vl/image_processing_qwen2_vl.js' export * from './rt_detr/image_processing_rt_detr.js' diff --git a/src/models/phi3_v/image_processing_phi3_v.js b/src/models/phi3_v/image_processing_phi3_v.js new file mode 100644 index 000000000..5e032b294 --- /dev/null +++ b/src/models/phi3_v/image_processing_phi3_v.js @@ -0,0 +1,163 @@ +import { + ImageProcessor, +} from "../../base/image_processors_utils.js"; +import { cat, interpolate_4d, slice, stack, Tensor } from "../../utils/tensor.js"; + +const IMAGE_SIZE = 336; +const SLICE_AXES = [2, 3]; // axes to slice on +const { ceil, floor, sqrt } = Math; + +export class Phi3VImageProcessor extends ImageProcessor { + constructor(config) { + super({ + ...config, + do_normalize: true, + do_pad: true, + pad_size: 'custom', + do_convert_rgb: true, + do_resize: true, // Smart resizing "hd_transform" + }); + + this._num_crops = config.num_crops; + } + calc_num_image_tokens_from_image_size(width, height) { + // @ts-expect-error + const { num_img_tokens } = this.config; + return floor(((floor((height / IMAGE_SIZE)) * floor((width / IMAGE_SIZE)) + 1) * num_img_tokens) + 1 + (floor(height / IMAGE_SIZE) + 1) * sqrt(num_img_tokens)); + } + + /** @type {ImageProcessor['get_resize_output_image_size']} */ + get_resize_output_image_size(image, size) { + const hd_num = this._num_crops; + const [width, height] = image.size + + let ratio = width / height; + let scale = 1; + + // Calculate the scaling factor + while (scale * Math.ceil(scale / ratio) <= hd_num) { + scale += 1; + } + scale -= 1; + + // Compute the new dimensions + const new_w = Math.floor(scale * 336); + const new_h = Math.floor(new_w / ratio); + + return [new_w, new_h] + } + + + /** @type {ImageProcessor['pad_image']} */ + pad_image(pixelData, imgDims, padSize, options = {}) { + // Phi3V uses a custom padding strategy: + // - Pad to a multiple of 336 + // - Pad with white pixels + const [imageHeight, imageWidth] = imgDims; + const height = IMAGE_SIZE * ceil(imageHeight / IMAGE_SIZE); + const width = IMAGE_SIZE * ceil(imageWidth / IMAGE_SIZE); + + // NOTE: Since padding is done after normalization, we need to fill with the normalized values + const constant_values = [1, 1, 1].map((x, i) => (x - this.image_mean[i]) / this.image_std[i]); + return super.pad_image(pixelData, imgDims, { width, height }, { + center: true, + constant_values, + ...options, + }); + } + + async _call(images, { + num_crops = null, + } = {}) { + // @ts-expect-error + this._num_crops = num_crops ??= this.config.num_crops; + if (num_crops < 4 || sqrt(num_crops) % 1 !== 0) { + throw new Error("num_crops must be a square number >= 4"); + } + + if (!Array.isArray(images)) { + images = [images]; + } + + const num_images = images.length; + const imageData = await Promise.all(images.map(x => this.preprocess(x))); + + const original_sizes = imageData.map(x => x.original_size); + const reshaped_input_sizes = imageData.map(x => 
x.reshaped_input_size); + + // Process each image in batch + const all_pixel_values = []; + for (const { pixel_values } of imageData) { + pixel_values.unsqueeze_(0); // Easier processing as 4D tensor + + const [height, width] = pixel_values.dims.slice(-2); + + // Global image (Tensor of shape [num_channels, height, width]) + const batch_pixel_values = await interpolate_4d(pixel_values, { + size: [IMAGE_SIZE, IMAGE_SIZE], + mode: 'bicubic', + }); + + if (num_crops > 0) { + const patches = []; + const sqrt_patches = sqrt(num_crops); + const patch_width = floor(width / sqrt_patches); + const patch_height = floor(height / sqrt_patches); + for (let y = 0; y < sqrt_patches; ++y) { + for (let x = 0; x < sqrt_patches; ++x) { + let start_x, start_y, end_x, end_y; + if (y === sqrt_patches - 1) { // At bottom + start_y = height - patch_height; + end_y = height; + } else { + start_y = y * patch_height; + end_y = (y + 1) * patch_height; + } + if (x === sqrt_patches - 1) { // At right + start_x = width - patch_width; + end_x = width; + } else { + start_x = x * patch_width; + end_x = (x + 1) * patch_width; + } + + const starts = [start_y, start_x]; + const ends = [end_y, end_x]; + const patch = await slice(pixel_values, starts, ends, SLICE_AXES); + patches.push(patch); + } + } + + const resized_tensors = await interpolate_4d(cat(patches, 0), { + size: [IMAGE_SIZE, IMAGE_SIZE], + mode: 'bicubic', + }); // [num_crops, 3, 336, 336] + + // Concatenate the global image with the patches + all_pixel_values.push(cat([batch_pixel_values, resized_tensors], 0)); + } else { + // Only use the global image + // NOTE: Not currently supported in modelling code + all_pixel_values.push(batch_pixel_values); + } + } + + // [num_images, 1 + num_crops, num_channels=3, height, width] + const pixel_values = stack(all_pixel_values, 0); + + // Calculate padded image sizes + const sizes = reshaped_input_sizes.map(x => x.map(y => IMAGE_SIZE * ceil(y / IMAGE_SIZE))); + + const image_sizes = new Tensor( + 'int64', + sizes.flat(), + [num_images, 2], + ); + + const num_img_tokens = sizes.map( + ([height, width]) => this.calc_num_image_tokens_from_image_size(width, height), + ); + + return { pixel_values, original_sizes, reshaped_input_sizes, image_sizes, num_img_tokens }; + } +} diff --git a/src/models/phi3_v/processing_phi3_v.js b/src/models/phi3_v/processing_phi3_v.js new file mode 100644 index 000000000..d07e9b176 --- /dev/null +++ b/src/models/phi3_v/processing_phi3_v.js @@ -0,0 +1,53 @@ +import { Processor } from "../../base/processing_utils.js"; +import { AutoImageProcessor } from "../auto/image_processing_auto.js"; +import { AutoTokenizer } from "../../tokenizers.js"; +import { RawImage } from "../../utils/image.js"; + +const IMAGE_TOKEN = "<|image|>"; +const IMAGE_TOKEN_PATTERN = /<\|image_\d+\|>/g; + +export class Phi3VProcessor extends Processor { + static image_processor_class = AutoImageProcessor + static tokenizer_class = AutoTokenizer + + /** + * + * @param {string|string[]} text + * @param {RawImage|RawImage[]} images + * @param {...any} args + * @returns {Promise} + */ + async _call(text, images = null, { + padding = true, + truncation = true, + num_crops = null, + } = {}) { + + if (!Array.isArray(text)) { + text = [text]; + } + + let text_inputs, image_inputs; + if (images) { + image_inputs = await this.image_processor(images, { num_crops }); + const { num_img_tokens } = image_inputs; + + // The original implementation adds a bos_token before the image tokens + // TODO: Check if this affects performance, since it looks 
like a bug in the original implementation + const prompt_chunks = text.map((t, i) => t.split(IMAGE_TOKEN_PATTERN).join(IMAGE_TOKEN.repeat(num_img_tokens[i]))); + + text_inputs = this.tokenizer(prompt_chunks, { padding, truncation }); + + // The model expects image tokens to be negative, so we negate the image token ids + const image_token_id = this.tokenizer.model.convert_tokens_to_ids([IMAGE_TOKEN])[0]; + text_inputs.input_ids.map_(id => (id == image_token_id) ? -id : id); + } else { + text_inputs = this.tokenizer(text); + } + + return { + ...text_inputs, + ...image_inputs, + } + } +} diff --git a/src/models/processors.js b/src/models/processors.js index ee388851c..d254ad118 100644 --- a/src/models/processors.js +++ b/src/models/processors.js @@ -4,6 +4,7 @@ export * from './idefics3/processing_idefics3.js'; export * from './janus/processing_janus.js'; export * from './jina_clip/processing_jina_clip.js'; export * from './owlvit/processing_owlvit.js'; +export * from './phi3_v/processing_phi3_v.js'; export * from './paligemma/processing_paligemma.js'; export * from './pyannote/processing_pyannote.js'; export * from './qwen2_vl/processing_qwen2_vl.js'; diff --git a/src/ops/registry.js b/src/ops/registry.js index 9b65fa4a8..f641fe878 100644 --- a/src/ops/registry.js +++ b/src/ops/registry.js @@ -100,4 +100,15 @@ export class TensorOpRegistry { } return this._top_k; } + + static get slice() { + if (!this._slice) { + this._slice = wrap( + [8, 7, 18, 0, 58, 96, 10, 25, 10, 1, 120, 10, 1, 115, 10, 1, 101, 10, 1, 97, 10, 1, 116, 18, 1, 121, 34, 5, 83, 108, 105, 99, 101, 18, 1, 114, 90, 9, 10, 1, 120, 18, 4, 10, 2, 8, 1, 90, 9, 10, 1, 115, 18, 4, 10, 2, 8, 7, 90, 9, 10, 1, 101, 18, 4, 10, 2, 8, 7, 90, 9, 10, 1, 97, 18, 4, 10, 2, 8, 7, 90, 9, 10, 1, 116, 18, 4, 10, 2, 8, 7, 98, 9, 10, 1, 121, 18, 4, 10, 2, 8, 1, 66, 2, 16, 13], + this.session_options, + 'y', + ) + } + return this._slice; + } } diff --git a/src/utils/tensor.js b/src/utils/tensor.js index 553e09e8f..93a3e108e 100644 --- a/src/utils/tensor.js +++ b/src/utils/tensor.js @@ -984,6 +984,29 @@ export async function topk(x, k) { }); } + +const arrayToIndexTensor = (array) => new Tensor('int64', array, [array.length]); +/** + * Slice a multidimensional float32 tensor. + * @param {Tensor} data: Tensor of data to extract slices from + * @param {number[]} starts: 1-D array of starting indices of corresponding axis in axes + * @param {number[]} ends: 1-D array of ending indices (exclusive) of corresponding axis in axes + * @param {number[]} axes: 1-D array of axes that starts and ends apply to + * @param {number[]} [steps]: 1-D array of slice step of corresponding axis in axes. + * @returns {Promise} Sliced data tensor. + */ +export async function slice(data, starts, ends, axes, steps) { + const op = await TensorOpRegistry.slice; + return await op({ + x: data, + s: arrayToIndexTensor(starts), + e: arrayToIndexTensor(ends), + a: arrayToIndexTensor(axes), + t: arrayToIndexTensor(steps ?? new Array(axes.length).fill(1)), + }); +} + + /** * Perform mean pooling of the last hidden state followed by a normalization step. 
* @param {Tensor} last_hidden_state Tensor of shape [batchSize, seqLength, embedDim] diff --git a/tests/models/phi3_v/test_image_processing_phi3_v.js b/tests/models/phi3_v/test_image_processing_phi3_v.js new file mode 100644 index 000000000..cbf21e897 --- /dev/null +++ b/tests/models/phi3_v/test_image_processing_phi3_v.js @@ -0,0 +1,93 @@ +import { AutoImageProcessor, Phi3VImageProcessor } from "../../../src/transformers.js"; + +import { load_cached_image } from "../../asset_cache.js"; +import { MAX_PROCESSOR_LOAD_TIME, MAX_TEST_EXECUTION_TIME } from "../../init.js"; + +const TARGET_IMAGE_SIZE = [3, 336, 336]; + +export default () => { + // Phi3VImageProcessor + // - custom image processing (patching) + describe("Phi3VImageProcessor", () => { + const model_id = "onnx-community/Phi-3.5-vision-instruct"; + + /** @type {Record} */ + const images = {}; + /** @type {Phi3VImageProcessor} */ + let processor; + beforeAll(async () => { + processor = await AutoImageProcessor.from_pretrained(model_id); + + // Load images + const gradient_image = await load_cached_image("gradient_1280x640"); + const white_image = await load_cached_image("white_image"); + + images.gradient_image = gradient_image; + images.white_image = white_image; + }, MAX_PROCESSOR_LOAD_TIME); + + it( + "square image (num_crops=4)", + async () => { + const num_crops = 4; + const { pixel_values, image_sizes, num_img_tokens } = await processor(images.white_image, { num_crops }); + expect(pixel_values.dims).toEqual([1, 1 + num_crops, ...TARGET_IMAGE_SIZE]); + expect(pixel_values.flatten(2).mean(2).tolist()).toBeCloseToNested([[2.050372362136841, 2.050372362136841, 2.050372362136841, 2.050372362136841, 2.050372362136841]], 1); + expect(pixel_values.mean().item()).toBeCloseTo(2.050372362136841, 1); + + expect(image_sizes.tolist()).toEqual([[672n, 672n]]); + expect(num_img_tokens).toEqual([757]); + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + "non-square image (num_crops=4)", + async () => { + const num_crops = 4; + const { pixel_values, image_sizes, num_img_tokens } = await processor(images.gradient_image, { num_crops }); + expect(pixel_values.dims).toEqual([1, 1 + num_crops, ...TARGET_IMAGE_SIZE]); + + // NOTE: We use a slightly different cropping strategy to the Python implementation, + // meaning the following tests would fail. 
+ // expect(pixel_values.flatten(2).mean(2).tolist()).toBeCloseToNested([[ + // 0.18679802119731903, -0.5585645437240601, 0.9321606755256653, 0.0, 0.0, + // ]], 1); + // expect(pixel_values.mean().item()).toBeCloseTo(0.11207880824804306, 6); + + expect(image_sizes.tolist()).toEqual([[336n, 672n]]); + expect(num_img_tokens).toEqual([457]); + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + "single image (num_crops=16)", + async () => { + const num_crops = 16; + const { pixel_values, image_sizes, num_img_tokens } = await processor(images.gradient_image, { num_crops }); + expect(pixel_values.dims).toEqual([1, 1 + num_crops, 3, 336, 336]); + expect(pixel_values.mean().item()).toBeCloseTo(0.4677375257015228, 1); + + expect(image_sizes.tolist()).toEqual([[1008n, 1680n]]); + expect(num_img_tokens).toEqual([2353]); + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + "multiple images (num_crops=4)", + async () => { + const num_crops = 4; + const { pixel_values, image_sizes, num_img_tokens } = await processor([images.gradient_image, images.white_image], { num_crops }); + expect(pixel_values.dims).toEqual([2, 1 + num_crops, ...TARGET_IMAGE_SIZE]); + expect(image_sizes.tolist()).toEqual([ + [336n, 672n], + [672n, 672n], + ]); + expect(num_img_tokens).toEqual([457, 757]); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); +}; diff --git a/tests/models/phi3_v/test_processor_phi3_v.js b/tests/models/phi3_v/test_processor_phi3_v.js new file mode 100644 index 000000000..6896046ef --- /dev/null +++ b/tests/models/phi3_v/test_processor_phi3_v.js @@ -0,0 +1,87 @@ +import { AutoProcessor, Phi3VProcessor } from "../../../src/transformers.js"; + +import { load_cached_image } from "../../asset_cache.js"; +import { MAX_PROCESSOR_LOAD_TIME, MAX_TEST_EXECUTION_TIME } from "../../init.js"; + +export default () => { + const model_id = "onnx-community/Phi-3.5-vision-instruct"; + + describe("Phi3VProcessor", () => { + /** @type {Phi3VProcessor} */ + let processor; + let images = {}; + + beforeAll(async () => { + processor = await AutoProcessor.from_pretrained(model_id, { + // Use legacy to match python version + legacy: true, + }); + images = { + white_image: await load_cached_image("white_image"), + }; + }, MAX_PROCESSOR_LOAD_TIME); + + const create_prompt = (text, images = []) => { + const placeholder = images.map((_, i) => `<|image_${i + 1}|>\n`).join(""); + const messages = [{ role: "user", content: placeholder + text }]; + const prompt = processor.tokenizer.apply_chat_template(messages, { tokenize: false, add_generation_prompt: true }); + return prompt; + }; + + it( + "Text-only", + async () => { + const prompt = create_prompt("Hi there."); + const { input_ids, pixel_values } = await processor(prompt); + expect(input_ids.dims).toEqual([1, 11]); + expect(pixel_values).toBeUndefined(); + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + "Single image & text", + async () => { + const imgs = [images.white_image]; + const prompt = create_prompt("Describe this image.", imgs); + const { input_ids, attention_mask, pixel_values, image_sizes } = await processor(prompt, imgs); + expect(input_ids.dims).toEqual([1, /* 773 */ 770]); + expect(attention_mask.dims).toEqual(input_ids.dims); + expect(pixel_values.dims).toEqual([1, 5, 3, 336, 336]); + expect(image_sizes.tolist()).toEqual([[672n, 672n]]); + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + "Single image (num_crops=16) & text", + async () => { + const imgs = [images.white_image]; + const prompt = create_prompt("Describe this image.", imgs); + const { input_ids, attention_mask, pixel_values, 
image_sizes } = await processor(prompt, imgs, { num_crops: 16 }); + expect(input_ids.dims).toEqual([1, /* 2525 */ 2522]); + expect(attention_mask.dims).toEqual(input_ids.dims); + expect(pixel_values.dims).toEqual([1, 17, 3, 336, 336]); + expect(image_sizes.tolist()).toEqual([[1344n, 1344n]]); + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + "Multiple images & text", + async () => { + const imgs = [images.white_image, images.white_image]; + const prompt = create_prompt("Describe these images.", imgs); + const { input_ids, attention_mask, pixel_values, image_sizes } = await processor(prompt, imgs); + expect(input_ids.dims).toEqual([1, /* 1533 */ 1527]); + expect(attention_mask.dims).toEqual(input_ids.dims); + expect(pixel_values.dims).toEqual([2, 5, 3, 336, 336]); + expect(image_sizes.tolist()).toEqual([ + [672n, 672n], + [672n, 672n], + ]); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); +}; diff --git a/tests/utils/tensor.test.js b/tests/utils/tensor.test.js index e2fa808e6..684ded602 100644 --- a/tests/utils/tensor.test.js +++ b/tests/utils/tensor.test.js @@ -1,6 +1,9 @@ import { Tensor, cat, mean, stack, layer_norm } from "../../src/transformers.js"; +import { init } from "../init.js"; import { compare } from "../test_utils.js"; +init(); + describe("Tensor operations", () => { describe("cat", () => { it("should concatenate on dim=0", async () => { diff --git a/tests/utils/tensor_ops.test.js b/tests/utils/tensor_ops.test.js index 3227d5f58..a0ad1f076 100644 --- a/tests/utils/tensor_ops.test.js +++ b/tests/utils/tensor_ops.test.js @@ -1,4 +1,4 @@ -import { Tensor, interpolate_4d, matmul, rfft } from "../../src/transformers.js"; +import { Tensor, interpolate_4d, matmul, rfft, slice } from "../../src/transformers.js"; import { init } from "../init.js"; // Initialise the testing environment @@ -6,7 +6,7 @@ init(); function expectToBeCloseToArray(actual, expected) { expect(actual.length).toEqual(expected.length); - actual.forEach((x, i) => expect(x).toBeCloseTo(expected[i])); + actual.forEach((x, i) => expect(x).toBeCloseTo(expected[i]), 6); } function range(start, stop = undefined, step = 1) { @@ -24,100 +24,211 @@ function range(start, stop = undefined, step = 1) { describe("Tensor operations", () => { describe("interpolate", () => { - const input = new Tensor( - "float32", - new Float32Array(2 * 3 * 4 * 5).map((_, i) => i), - [2, 3, 4, 5], - ); - - const size = [2, 3, 3, 2]; - it("bilinear", async () => { - const resized = await interpolate_4d(input, { mode: "bilinear", size }); - const target = new Float32Array( - [ + describe("downscale", () => { + const input = new Tensor( + "float32", + new Float32Array(2 * 3 * 4 * 5).map((_, i) => i), + [2, 3, 4, 5], + ); + + const size = [2, 3, 3, 2]; + it("bilinear", async () => { + const resized = await interpolate_4d(input, { mode: "bilinear", size }); + const target = new Float32Array( [ [ - [1.5833335, 4.0833335], - [8.25, 10.75], - [14.916668, 17.416668], - ], - [ - [21.583332, 24.083334], - [28.25, 30.75], - [34.916668, 37.416668], + [ + [1.5833335, 4.0833335], + [8.25, 10.75], + [14.916668, 17.416668], + ], + [ + [21.583332, 24.083334], + [28.25, 30.75], + [34.916668, 37.416668], + ], + [ + [41.583332, 44.083332], + [48.25, 50.75], + [54.916668, 57.416668], + ], ], [ - [41.583332, 44.083332], - [48.25, 50.75], - [54.916668, 57.416668], + [ + [61.583332, 64.083336], + [68.25, 70.75], + [74.916664, 77.41667], + ], + [ + [81.58333, 84.083336], + [88.25, 90.75], + [94.91667, 97.41667], + ], + [ + [101.583336, 104.08333], + [108.25, 110.75], + 
[114.916664, 117.416664], + ], ], - ], + ].flat(Infinity), + ); + + expectToBeCloseToArray(target, resized.data); + }); + + it("bicubic", async () => { + const resized = await interpolate_4d(input, { mode: "bicubic", size }); + + const target = new Float32Array( [ [ - [61.583332, 64.083336], - [68.25, 70.75], - [74.916664, 77.41667], + [ + [1.2987545, 3.9628172], + [8.167969, 10.832031], + [15.037184, 17.701244], + ], + [ + [21.298756, 23.962818], + [28.167969, 30.832031], + [35.037186, 37.701252], + ], + [ + [41.298756, 43.96282], + [48.16797, 50.83203], + [55.037193, 57.701256], + ], ], [ - [81.58333, 84.083336], - [88.25, 90.75], - [94.91667, 97.41667], + [ + [61.29875, 63.96282], + [68.16797, 70.83203], + [75.03719, 77.701256], + ], + [ + [81.29875, 83.96282], + [88.16797, 90.83203], + [95.03721, 97.70126], + ], + [ + [101.29875, 103.962814], + [108.16797, 110.83203], + [115.03721, 117.70127], + ], ], - [ - [101.583336, 104.08333], - [108.25, 110.75], - [114.916664, 117.416664], - ], - ], - ].flat(Infinity), - ); + ].flat(Infinity), + ); - expectToBeCloseToArray(target, resized.data); + expectToBeCloseToArray(target, resized.data); + }); }); + describe("upscale", () => { + const input = new Tensor( + "float32", + new Float32Array(2 * 3 * 3 * 2).map((_, i) => i), + [2, 3, 3, 2], + ); - it("bicubic", async () => { - const resized = await interpolate_4d(input, { mode: "bicubic", size }); - - const target = new Float32Array( - [ + const size = [2, 3, 4, 5]; + it("bilinear", async () => { + const resized = await interpolate_4d(input, { mode: "bilinear", size }); + const target = new Float32Array( [ [ - [1.2987545, 3.9628172], - [8.167969, 10.832031], - [15.037184, 17.701244], - ], - [ - [21.298756, 23.962818], - [28.167969, 30.832031], - [35.037186, 37.701252], + [ + [0.0, 0.1, 0.5, 0.9, 1.0], + [1.25, 1.35, 1.75, 2.15, 2.25], + [2.75, 2.85, 3.25, 3.65, 3.75], + [4.0, 4.1, 4.5, 4.9, 5.0], + ], + [ + [6.0, 6.1, 6.5, 6.9, 7.0], + [7.25, 7.35, 7.75, 8.15, 8.25], + [8.75, 8.85, 9.25, 9.65, 9.75], + [10.0, 10.1, 10.5, 10.9, 11.0], + ], + [ + [12.0, 12.1, 12.5, 12.9, 13.0], + [13.25, 13.35, 13.75, 14.15, 14.25], + [14.75, 14.85, 15.25, 15.65, 15.75], + [16.0, 16.1, 16.5, 16.9, 17.0], + ], ], [ - [41.298756, 43.96282], - [48.16797, 50.83203], - [55.037193, 57.701256], + [ + [18.0, 18.1, 18.5, 18.9, 19.0], + [19.25, 19.35, 19.75, 20.15, 20.25], + [20.75, 20.85, 21.25, 21.65, 21.75], + [22.0, 22.1, 22.5, 22.9, 23.0], + ], + [ + [24.0, 24.1, 24.5, 24.9, 25.0], + [25.25, 25.35, 25.75, 26.15, 26.25], + [26.75, 26.85, 27.25, 27.65, 27.75], + [28.0, 28.1, 28.5, 28.9, 29.0], + ], + [ + [30.0, 30.1, 30.5, 30.9, 31.0], + [31.25, 31.35, 31.75, 32.15, 32.25], + [32.75, 32.85, 33.25, 33.65, 33.75], + [34.0, 34.1, 34.5, 34.9, 35.0], + ], ], - ], + ].flat(Infinity), + ); + + expectToBeCloseToArray(target, resized.data); + }); + + it("bicubic", async () => { + const resized = await interpolate_4d(input, { mode: "bicubic", size }); + + const target = new Float32Array( [ [ - [61.29875, 63.96282], - [68.16797, 70.83203], - [75.03719, 77.701256], - ], - [ - [81.29875, 83.96282], - [88.16797, 90.83203], - [95.03721, 97.70126], + [ + [-0.253804475069046, -0.06155451014637947, 0.3564453125, 0.7744455337524414, 0.9666945934295654], + [0.9493208527565002, 1.1415706872940063, 1.5595703125, 1.977570652961731, 2.1698191165924072], + [2.8301806449890137, 3.022430181503296, 3.4404296875, 3.8584301471710205, 4.050677299499512], + [4.033306121826172, 4.225555419921875, 4.6435546875, 5.061554908752441, 5.253802299499512], + ], + 
[ + [5.746196269989014, 5.938446998596191, 6.3564453125, 6.774445533752441, 6.966691493988037], + [6.94932222366333, 7.14157247543335, 7.5595703125, 7.977570056915283, 8.169816970825195], + [8.830181121826172, 9.022432327270508, 9.4404296875, 9.858429908752441, 10.050675392150879], + [10.033307075500488, 10.225557327270508, 10.6435546875, 11.061556816101074, 11.253799438476562], + ], + [ + [11.746198654174805, 11.938446998596191, 12.3564453125, 12.774446487426758, 12.966689109802246], + [12.949322700500488, 13.141572952270508, 13.5595703125, 13.977571487426758, 14.16981315612793], + [14.830183029174805, 15.022432327270508, 15.4404296875, 15.858430862426758, 16.05067253112793], + [16.033309936523438, 16.225557327270508, 16.6435546875, 17.061555862426758, 17.25379753112793], + ], ], [ - [101.29875, 103.962814], - [108.16797, 110.83203], - [115.03721, 117.70127], + [ + [17.746200561523438, 17.938447952270508, 18.3564453125, 18.774446487426758, 18.966686248779297], + [18.949325561523438, 19.14157485961914, 19.5595703125, 19.977571487426758, 20.169809341430664], + [20.830184936523438, 21.02243423461914, 21.4404296875, 21.858430862426758, 22.050668716430664], + [22.03331184387207, 22.225557327270508, 22.6435546875, 23.061555862426758, 23.25379180908203], + ], + [ + [23.746200561523438, 23.93844985961914, 24.3564453125, 24.77444839477539, 24.96668243408203], + [24.949325561523438, 25.141576766967773, 25.5595703125, 25.977571487426758, 26.1698055267334], + [26.830184936523438, 27.022436141967773, 27.4404296875, 27.858430862426758, 28.05066680908203], + [28.033313751220703, 28.225557327270508, 28.6435546875, 29.061555862426758, 29.25379180908203], + ], + [ + [29.74620246887207, 29.93844985961914, 30.3564453125, 30.77444839477539, 30.96668243408203], + [30.949325561523438, 31.141578674316406, 31.5595703125, 31.977571487426758, 32.16980743408203], + [32.8301887512207, 33.022438049316406, 33.4404296875, 33.858428955078125, 34.050662994384766], + [34.03330993652344, 34.22556686401367, 34.6435546875, 35.06155014038086, 35.253787994384766], + ], ], - ], - ].flat(Infinity), - ); + ].flat(Infinity), + ); - expectToBeCloseToArray(target, resized.data); + expectToBeCloseToArray(target, resized.data); + }); }); }); @@ -188,4 +299,25 @@ describe("Tensor operations", () => { expectToBeCloseToArray(target, result.data); }); }); + + describe("slice", () => { + it("should slice", async () => { + const input = new Tensor("float32", [1, 2, 3, 4, 5, 6, 7, 8, 9], [3, 3]); + + const target = new Float32Array( + [ + [1, 2], + [4, 5], + ].flat(Infinity), + ); + + const starts = [0, 0]; + const ends = [2, 2]; + const axes = [0, 1]; + const steps = [1, 1]; + + const result = await slice(input, starts, ends, axes, steps); + expectToBeCloseToArray(target, result.data); + }); + }); });
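For reference, a minimal usage sketch of how the classes added in this diff (`Phi3VProcessor`, `Phi3VImageProcessor`, `Phi3VForCausalLM`) are expected to fit together, based on the tests above. The checkpoint id matches the test fixtures; the image URL, generation length and decoding step are illustrative assumptions rather than part of this change.

```js
// Hedged usage sketch, assuming the package entry point re-exports the classes
// registered in src/models.js and src/models/processors.js above.
import { AutoProcessor, AutoModelForCausalLM, RawImage } from "@huggingface/transformers";

const model_id = "onnx-community/Phi-3.5-vision-instruct"; // same checkpoint as the tests

// `legacy: true` mirrors tests/models/phi3_v/test_processor_phi3_v.js
const processor = await AutoProcessor.from_pretrained(model_id, { legacy: true });
const model = await AutoModelForCausalLM.from_pretrained(model_id); // resolves to Phi3VForCausalLM via 'phi3_v'

// Build a chat prompt with one image placeholder, as in the processor test
const image = await RawImage.fromURL("https://example.com/cat.jpg"); // hypothetical image
const messages = [{ role: "user", content: "<|image_1|>\nDescribe this image." }];
const prompt = processor.tokenizer.apply_chat_template(messages, {
  tokenize: false,
  add_generation_prompt: true,
});

// The processor expands the placeholder, negates the image-token ids, and returns
// the pixel_values / image_sizes computed by Phi3VImageProcessor.
const inputs = await processor(prompt, image, { num_crops: 4 });

// Phi3VForCausalLM runs the vision_encoder and prepare_inputs_embeds sessions inside
// forward(), so generation goes through the standard generate() path.
const output_ids = await model.generate({ ...inputs, max_new_tokens: 128 });
console.log(processor.tokenizer.batch_decode(output_ids, { skip_special_tokens: true }));
```

Note that the image processor always prepends a global 336x336 view of the image before the `num_crops` local patches (hence the `1 + num_crops` dimension asserted in the tests), and `num_crops` must be a perfect square of at least 4.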