Add mobilevitv2 #721

Merged: 4 commits, May 2, 2024
1 change: 1 addition & 0 deletions README.md
@@ -318,6 +318,7 @@ You can refine your search by selecting the task you're interested in (e.g., [te
1. **[MMS](https://huggingface.co/docs/transformers/model_doc/mms)** (from Facebook) released with the paper [Scaling Speech Technology to 1,000+ Languages](https://arxiv.org/abs/2305.13516) by Vineel Pratap, Andros Tjandra, Bowen Shi, Paden Tomasello, Arun Babu, Sayani Kundu, Ali Elkahky, Zhaoheng Ni, Apoorv Vyas, Maryam Fazel-Zarandi, Alexei Baevski, Yossi Adi, Xiaohui Zhang, Wei-Ning Hsu, Alexis Conneau, Michael Auli.
1. **[MobileBERT](https://huggingface.co/docs/transformers/model_doc/mobilebert)** (from CMU/Google Brain) released with the paper [MobileBERT: a Compact Task-Agnostic BERT for Resource-Limited Devices](https://arxiv.org/abs/2004.02984) by Zhiqing Sun, Hongkun Yu, Xiaodan Song, Renjie Liu, Yiming Yang, and Denny Zhou.
1. **[MobileViT](https://huggingface.co/docs/transformers/model_doc/mobilevit)** (from Apple) released with the paper [MobileViT: Light-weight, General-purpose, and Mobile-friendly Vision Transformer](https://arxiv.org/abs/2110.02178) by Sachin Mehta and Mohammad Rastegari.
1. **[MobileViTV2](https://huggingface.co/docs/transformers/model_doc/mobilevitv2)** (from Apple) released with the paper [Separable Self-attention for Mobile Vision Transformers](https://arxiv.org/abs/2206.02680) by Sachin Mehta and Mohammad Rastegari.
1. **[MPNet](https://huggingface.co/docs/transformers/model_doc/mpnet)** (from Microsoft Research) released with the paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu.
1. **[MPT](https://huggingface.co/docs/transformers/model_doc/mpt)** (from MosaicML) released with the repository [llm-foundry](https://github.com/mosaicml/llm-foundry/) by the MosaicML NLP Team.
1. **[MT5](https://huggingface.co/docs/transformers/model_doc/mt5)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel.
1 change: 1 addition & 0 deletions docs/snippets/6_supported-models.snippet
@@ -53,6 +53,7 @@
1. **[MMS](https://huggingface.co/docs/transformers/model_doc/mms)** (from Facebook) released with the paper [Scaling Speech Technology to 1,000+ Languages](https://arxiv.org/abs/2305.13516) by Vineel Pratap, Andros Tjandra, Bowen Shi, Paden Tomasello, Arun Babu, Sayani Kundu, Ali Elkahky, Zhaoheng Ni, Apoorv Vyas, Maryam Fazel-Zarandi, Alexei Baevski, Yossi Adi, Xiaohui Zhang, Wei-Ning Hsu, Alexis Conneau, Michael Auli.
1. **[MobileBERT](https://huggingface.co/docs/transformers/model_doc/mobilebert)** (from CMU/Google Brain) released with the paper [MobileBERT: a Compact Task-Agnostic BERT for Resource-Limited Devices](https://arxiv.org/abs/2004.02984) by Zhiqing Sun, Hongkun Yu, Xiaodan Song, Renjie Liu, Yiming Yang, and Denny Zhou.
1. **[MobileViT](https://huggingface.co/docs/transformers/model_doc/mobilevit)** (from Apple) released with the paper [MobileViT: Light-weight, General-purpose, and Mobile-friendly Vision Transformer](https://arxiv.org/abs/2110.02178) by Sachin Mehta and Mohammad Rastegari.
1. **[MobileViTV2](https://huggingface.co/docs/transformers/model_doc/mobilevitv2)** (from Apple) released with the paper [Separable Self-attention for Mobile Vision Transformers](https://arxiv.org/abs/2206.02680) by Sachin Mehta and Mohammad Rastegari.
1. **[MPNet](https://huggingface.co/docs/transformers/model_doc/mpnet)** (from Microsoft Research) released with the paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu.
1. **[MPT](https://huggingface.co/docs/transformers/model_doc/mpt)** (from MosaicML) released with the repository [llm-foundry](https://github.com/mosaicml/llm-foundry/) by the MosaicML NLP Team.
1. **[MT5](https://huggingface.co/docs/transformers/model_doc/mt5)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel.
11 changes: 11 additions & 0 deletions scripts/supported_models.py
@@ -625,6 +625,17 @@
# 'apple/deeplabv3-mobilevit-xx-small',
# ],
},
'mobilevitv2': {
# Image classification
'image-classification': [
'apple/mobilevitv2-1.0-imagenet1k-256',
],

# TODO: Image segmentation
# 'image-segmentation': [
# 'apple/mobilevitv2-1.0-voc-deeplabv3',
# ],
},
'mpt': {
# Text generation
'text-generation': [
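As an end-to-end sketch of what this new entry enables, the checkpoint listed above can be exercised through the high-level pipeline API once a converted ONNX checkpoint is available on the Hub (the `Xenova/...` repo name below is an assumption, not something added by this PR):

```js
import { pipeline } from '@xenova/transformers';

// NOTE: hypothetical converted checkpoint id; substitute whichever
// MobileViTV2 conversion you have published or converted locally.
const classifier = await pipeline('image-classification', 'Xenova/mobilevitv2-1.0-imagenet1k-256');

// Any image URL or local path works here.
const output = await classifier('https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/tiger.jpg');
// `output` is an array of { label, score } entries sorted by score.
```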
17 changes: 17 additions & 0 deletions src/models.js
@@ -3822,6 +3822,21 @@ export class MobileViTForImageClassification extends MobileViTPreTrainedModel {

//////////////////////////////////////////////////

//////////////////////////////////////////////////
export class MobileViTV2PreTrainedModel extends PreTrainedModel { }
export class MobileViTV2Model extends MobileViTV2PreTrainedModel { }
export class MobileViTV2ForImageClassification extends MobileViTV2PreTrainedModel {
/**
* @param {any} model_inputs
*/
async _call(model_inputs) {
return new SequenceClassifierOutput(await super._call(model_inputs));
}
}
// TODO: MobileViTV2ForSemanticSegmentation

//////////////////////////////////////////////////

//////////////////////////////////////////////////
export class OwlViTPreTrainedModel extends PreTrainedModel { }
export class OwlViTModel extends OwlViTPreTrainedModel { }
@@ -5535,6 +5550,7 @@ const MODEL_MAPPING_NAMES_ENCODER_ONLY = new Map([
['table-transformer', ['TableTransformerModel', TableTransformerModel]],
['vit', ['ViTModel', ViTModel]],
['mobilevit', ['MobileViTModel', MobileViTModel]],
['mobilevitv2', ['MobileViTV2Model', MobileViTV2Model]],
['owlvit', ['OwlViTModel', OwlViTModel]],
['owlv2', ['Owlv2Model', Owlv2Model]],
['beit', ['BeitModel', BeitModel]],
@@ -5718,6 +5734,7 @@ const MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES = new Map([
const MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES = new Map([
['vit', ['ViTForImageClassification', ViTForImageClassification]],
['mobilevit', ['MobileViTForImageClassification', MobileViTForImageClassification]],
['mobilevitv2', ['MobileViTV2ForImageClassification', MobileViTV2ForImageClassification]],
['beit', ['BeitForImageClassification', BeitForImageClassification]],
['deit', ['DeiTForImageClassification', DeiTForImageClassification]],
['convnext', ['ConvNextForImageClassification', ConvNextForImageClassification]],
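For completeness, a minimal sketch of how the new classes are reached through the mappings above, using the lower-level API (the checkpoint id is again an assumption; only the architecture registration is part of this PR):

```js
import { AutoProcessor, AutoModelForImageClassification, RawImage } from '@xenova/transformers';

// Hypothetical converted checkpoint id.
const model_id = 'Xenova/mobilevitv2-1.0-imagenet1k-256';

const processor = await AutoProcessor.from_pretrained(model_id);
const model = await AutoModelForImageClassification.from_pretrained(model_id);

// Load and preprocess an image.
const image = await RawImage.read('https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/tiger.jpg');
const inputs = await processor(image);

// The 'mobilevitv2' entry in MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES dispatches to
// MobileViTV2ForImageClassification, which wraps the result in a SequenceClassifierOutput.
const { logits } = await model(inputs);
console.log(logits.dims); // e.g. [1, 1000] for the ImageNet-1k head
```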
19 changes: 19 additions & 0 deletions src/processors.js
@@ -246,6 +246,8 @@ export class ImageFeatureExtractor extends FeatureExtractor {
* @param {boolean} config.do_resize Whether to resize the image.
* @param {number} config.resample What method to use for resampling.
* @param {number|Object} config.size The size to resize the image to.
* @param {boolean} [config.do_flip_channel_order=false] Whether to flip the color channels from RGB to BGR.
* Can be overridden by the `do_flip_channel_order` parameter in the `preprocess` method.
*/
constructor(config) {
super(config);
@@ -276,6 +278,8 @@ export class ImageFeatureExtractor extends FeatureExtractor {
// We infer the pad size from the resize size
this.pad_size = this.size
}

this.do_flip_channel_order = this.config.do_flip_channel_order ?? false;
}

/**
@@ -571,6 +575,7 @@ export class ImageFeatureExtractor extends FeatureExtractor {
do_pad = null,
do_convert_rgb = null,
do_convert_grayscale = null,
do_flip_channel_order = null,
} = {}) {
if (this.do_crop_margin) {
// NOTE: Specific to nougat processors. This is done before resizing,
@@ -661,6 +666,18 @@ export class ImageFeatureExtractor extends FeatureExtractor {
}
}

if (do_flip_channel_order ?? this.do_flip_channel_order) {
if (imgDims[2] !== 3) {
throw new Error('Flipping channel order is only supported for RGB images.');
}
// Convert RGB to BGR
for (let i = 0; i < pixelData.length; i += 3) {
const temp = pixelData[i];
pixelData[i] = pixelData[i + 2];
pixelData[i + 2] = temp;
}
}

const pixel_values = new Tensor('float32', pixelData, imgDims)
.permute(2, 0, 1); // convert to channel dimension format (hwc -> chw)

@@ -830,6 +847,7 @@ export class EfficientNetImageProcessor extends ImageFeatureExtractor {


export class MobileViTFeatureExtractor extends ImageFeatureExtractor { }
export class MobileViTImageProcessor extends MobileViTFeatureExtractor { } // NOTE extends MobileViTFeatureExtractor
export class OwlViTFeatureExtractor extends ImageFeatureExtractor {
/** @type {post_process_object_detection} */
post_process_object_detection(...args) {
@@ -2132,6 +2150,7 @@ export class AutoProcessor {
WhisperFeatureExtractor,
ViTFeatureExtractor,
MobileViTFeatureExtractor,
MobileViTImageProcessor,
OwlViTFeatureExtractor,
Owlv2ImageProcessor,
CLIPFeatureExtractor,
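The core preprocessing change is the RGB-to-BGR flip gated by `do_flip_channel_order`. A self-contained sketch of the same operation on interleaved HWC pixel data (the helper name is hypothetical; it mirrors the loop added to `preprocess` above):

```js
/**
 * Flip interleaved RGB pixel data to BGR in place.
 * Mirrors the `do_flip_channel_order` branch added to `ImageFeatureExtractor.preprocess`.
 * @param {Float32Array} pixelData Interleaved HWC data with 3 channels per pixel.
 */
function flipChannelOrder(pixelData) {
    for (let i = 0; i < pixelData.length; i += 3) {
        // Swap R (index i) and B (index i + 2); G (index i + 1) stays in place.
        const temp = pixelData[i];
        pixelData[i] = pixelData[i + 2];
        pixelData[i + 2] = temp;
    }
    return pixelData;
}

// Example: a single pure-red pixel [R, G, B] = [1, 0, 0] becomes [0, 0, 1] (BGR).
console.log(flipChannelOrder(new Float32Array([1, 0, 0]))); // Float32Array(3) [0, 0, 1]
```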
21 changes: 21 additions & 0 deletions tests/processors.test.js
@@ -33,6 +33,7 @@ describe('Processors', () => {
vit: 'google/vit-base-patch16-224',
mobilevit: 'apple/mobilevit-small',
mobilevit_2: 'Xenova/quickdraw-mobilevit-small',
mobilevit_3: 'apple/mobilevitv2-1.0-imagenet1k-256',
deit: 'facebook/deit-tiny-distilled-patch16-224',
beit: 'microsoft/beit-base-patch16-224-pt22k-ft22k',
detr: 'facebook/detr-resnet-50',
@@ -205,6 +206,26 @@ describe('Processors', () => {
}
}, MAX_TEST_EXECUTION_TIME);

// MobileViTImageProcessor
// - tests converting RGB to BGR (do_flip_channel_order=true)
it(MODELS.mobilevit_3, async () => {
const processor = await AutoProcessor.from_pretrained(m(MODELS.mobilevit_3))

{
const image = await load_image(TEST_IMAGES.cats);
const { pixel_values, original_sizes, reshaped_input_sizes } = await processor(image);

compare(pixel_values.dims, [1, 3, 256, 256]);
compare(avg(pixel_values.data), 0.5215385556221008);

compare(original_sizes, [[480, 640]]);
compare(reshaped_input_sizes, [[256, 256]]);

// Ensure RGB to BGR conversion
compare(pixel_values.data.slice(0, 3), [0.24313725531101227, 0.250980406999588, 0.364705890417099]);
}
}, MAX_TEST_EXECUTION_TIME);

// DeiTFeatureExtractor
it(MODELS.deit, async () => {
const processor = await AutoProcessor.from_pretrained(m(MODELS.deit))