From 63d9c3b2724e400f122217f8fcbf9c0ac3db4517 Mon Sep 17 00:00:00 2001 From: Joshua Lochner Date: Sun, 15 Dec 2024 14:55:02 +0200 Subject: [PATCH] Improve unit test coverage (#1095) * Improve unit test auto-detection * Use default model options * Add mgp_str unit tests * Add janus processing unit tests * Add jina_clip processor unit tests * Fix typo in filename * Create `rand` tensor function * Add VitPose unit test * Add sam modelling unit test * Improve pipeline unit tests * Add image utilities unit testing * Add image segmentation pipeline unit tests * Add zero-shot classification pipeline unit test * Move pipeline unit tests to subfolder * Add instanceof checks * Add image feature extraction pipeline tests * Add feature extraction pipeline unit tests * Add zero-shot object detection pipeline unit tests * Add depth estimation pipeline unit test * Add automatic speech recognition pipeline unit test * Fix typo * Add text to audio pipeline unit tests * Add image to text pipeline unit test * Add image to image pipeline unit test * Add zero-shot audio classification pipeline unit test * Fix typo * Add summarization pipeline unit test * Add text2text generation unit test * Add text2text generation pipeline unit test * Remove unused variables --- src/utils/tensor.js | 14 + tests/asset_cache.js | 27 + tests/feature_extractors.test.js | 5 + tests/image_processors.test.js | 5 + tests/init.js | 6 +- tests/models.test.js | 17 +- tests/models/all_modeling_tests.js | 33 - tests/models/all_tokenization_tests.js | 22 - ...xtraction_audio_spectrogram_transformer.js | 53 ++ .../clap/test_feature_extraction_clap.js | 74 ++ .../florence2/test_processor_florence2.js | 222 +++++ tests/models/janus/test_processor_janus.js | 47 + .../jina_clip/test_processor_jina_clip.js | 44 + tests/models/mgp_str/test_modeling_mgp_str.js | 84 ++ .../models/musicgen/test_modeling_musicgen.js | 2 +- .../paligemma/test_processor_paligemma.js | 51 ++ .../models/qwen2_vl/test_modeling_qwen2_vl.js | 5 +- .../qwen2_vl/test_processor_qwen2_vl.js | 44 + tests/models/sam/test_modeling_sam.js | 48 + .../test_feature_extraction_seamless_m4t.js | 65 ++ .../test_modeling_vision_encoder_decoder.js | 5 +- .../vitpose/test_image_processing_vitpose.js | 50 ++ ...est_feature_extraction_wespeaker_resnet.js | 56 ++ .../test_feature_extraction_whisper.js | 33 + tests/pipelines.test.js | 5 +- .../test_pipelines_audio_classification.js | 80 ++ ..._pipelines_automatic_speech_recognition.js | 129 +++ .../test_pipelines_depth_estimation.js | 57 ++ ...t_pipelines_document_question_answering.js | 41 + .../test_pipelines_feature_extraction.js | 121 +++ tests/pipelines/test_pipelines_fill_mask.js | 100 +++ .../test_pipelines_image_classification.js | 81 ++ ...test_pipelines_image_feature_extraction.js | 51 ++ .../test_pipelines_image_segmentation.js | 119 +++ .../test_pipelines_image_to_image.js | 56 ++ .../pipelines/test_pipelines_image_to_text.js | 51 ++ .../test_pipelines_object_detection.js | 131 +++ .../test_pipelines_question_answering.js | 49 + .../pipelines/test_pipelines_summarization.js | 40 + .../test_pipelines_text2text_generation.js | 40 + .../test_pipelines_text_classification.js | 107 +++ .../test_pipelines_text_generation.js | 109 +++ .../pipelines/test_pipelines_text_to_audio.js | 37 + .../test_pipelines_token_classification.js | 157 ++++ tests/pipelines/test_pipelines_translation.js | 42 + tests/pipelines/test_pipelines_zero_shot.js | 100 +++ ...ipelines_zero_shot_audio_classification.js | 58 ++ 
...ipelines_zero_shot_image_classification.js | 98 ++ ...st_pipelines_zero_shot_object_detection.js | 134 +++ tests/processors.test.js | 522 +---------- tests/test_utils.js | 68 ++ tests/tiny_random.test.js | 841 ------------------ tests/tokenizers.test.js | 25 +- tests/utils/hub.test.js | 8 +- tests/utils/image.test.js | 89 ++ 55 files changed, 3099 insertions(+), 1459 deletions(-) create mode 100644 tests/feature_extractors.test.js create mode 100644 tests/image_processors.test.js delete mode 100644 tests/models/all_modeling_tests.js delete mode 100644 tests/models/all_tokenization_tests.js create mode 100644 tests/models/audio_spectrogram_transformer/test_feature_extraction_audio_spectrogram_transformer.js create mode 100644 tests/models/clap/test_feature_extraction_clap.js create mode 100644 tests/models/florence2/test_processor_florence2.js create mode 100644 tests/models/janus/test_processor_janus.js create mode 100644 tests/models/jina_clip/test_processor_jina_clip.js create mode 100644 tests/models/mgp_str/test_modeling_mgp_str.js create mode 100644 tests/models/paligemma/test_processor_paligemma.js create mode 100644 tests/models/qwen2_vl/test_processor_qwen2_vl.js create mode 100644 tests/models/sam/test_modeling_sam.js create mode 100644 tests/models/seamless_m4t/test_feature_extraction_seamless_m4t.js create mode 100644 tests/models/vitpose/test_image_processing_vitpose.js create mode 100644 tests/models/wespeaker_resnet/test_feature_extraction_wespeaker_resnet.js create mode 100644 tests/models/whisper/test_feature_extraction_whisper.js create mode 100644 tests/pipelines/test_pipelines_audio_classification.js create mode 100644 tests/pipelines/test_pipelines_automatic_speech_recognition.js create mode 100644 tests/pipelines/test_pipelines_depth_estimation.js create mode 100644 tests/pipelines/test_pipelines_document_question_answering.js create mode 100644 tests/pipelines/test_pipelines_feature_extraction.js create mode 100644 tests/pipelines/test_pipelines_fill_mask.js create mode 100644 tests/pipelines/test_pipelines_image_classification.js create mode 100644 tests/pipelines/test_pipelines_image_feature_extraction.js create mode 100644 tests/pipelines/test_pipelines_image_segmentation.js create mode 100644 tests/pipelines/test_pipelines_image_to_image.js create mode 100644 tests/pipelines/test_pipelines_image_to_text.js create mode 100644 tests/pipelines/test_pipelines_object_detection.js create mode 100644 tests/pipelines/test_pipelines_question_answering.js create mode 100644 tests/pipelines/test_pipelines_summarization.js create mode 100644 tests/pipelines/test_pipelines_text2text_generation.js create mode 100644 tests/pipelines/test_pipelines_text_classification.js create mode 100644 tests/pipelines/test_pipelines_text_generation.js create mode 100644 tests/pipelines/test_pipelines_text_to_audio.js create mode 100644 tests/pipelines/test_pipelines_token_classification.js create mode 100644 tests/pipelines/test_pipelines_translation.js create mode 100644 tests/pipelines/test_pipelines_zero_shot.js create mode 100644 tests/pipelines/test_pipelines_zero_shot_audio_classification.js create mode 100644 tests/pipelines/test_pipelines_zero_shot_image_classification.js create mode 100644 tests/pipelines/test_pipelines_zero_shot_object_detection.js delete mode 100644 tests/tiny_random.test.js create mode 100644 tests/utils/image.test.js diff --git a/src/utils/tensor.js b/src/utils/tensor.js index 6bdfd20a3..553e09e8f 100644 --- a/src/utils/tensor.js +++ b/src/utils/tensor.js @@ 
-1430,6 +1430,20 @@ export function zeros_like(tensor) { return zeros(tensor.dims); } +/** + * Returns a tensor filled with random numbers from a uniform distribution on the interval [0, 1) + * @param {number[]} size A sequence of integers defining the shape of the output tensor. + * @returns {Tensor} The random tensor. + */ +export function rand(size) { + const length = size.reduce((a, b) => a * b, 1); + return new Tensor( + "float32", + Float32Array.from({ length }, () => Math.random()), + size, + ) +} + /** * Quantizes the embeddings tensor to binary or unsigned binary precision. * @param {Tensor} tensor The tensor to quantize. diff --git a/tests/asset_cache.js b/tests/asset_cache.js index 8d62fb6bf..9d1182014 100644 --- a/tests/asset_cache.js +++ b/tests/asset_cache.js @@ -3,6 +3,7 @@ import { RawImage } from "../src/transformers.js"; const BASE_URL = "https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/"; const TEST_IMAGES = Object.freeze({ white_image: BASE_URL + "white-image.png", + blue_image: BASE_URL + "blue-image.png", pattern_3x3: BASE_URL + "pattern_3x3.png", pattern_3x5: BASE_URL + "pattern_3x5.png", checkerboard_8x8: BASE_URL + "checkerboard_8x8.png", @@ -21,8 +22,14 @@ const TEST_IMAGES = Object.freeze({ beetle: BASE_URL + "beetle.png", book_cover: BASE_URL + "book-cover.png", + corgi: BASE_URL + "corgi.jpg", + man_on_car: BASE_URL + "young-man-standing-and-leaning-on-car.jpg", }); +const TEST_AUDIOS = { + mlk: BASE_URL + "mlk.npy", +}; + /** @type {Map} */ const IMAGE_CACHE = new Map(); const load_image = async (url) => { @@ -35,9 +42,29 @@ const load_image = async (url) => { return image; }; +/** @type {Map} */ +const AUDIO_CACHE = new Map(); +const load_audio = async (url) => { + const cached = AUDIO_CACHE.get(url); + if (cached) { + return cached; + } + const buffer = await (await fetch(url)).arrayBuffer(); + const audio = Float32Array.from(new Float64Array(buffer)); + AUDIO_CACHE.set(url, audio); + return audio; +}; + /** * Load a cached image. * @param {keyof typeof TEST_IMAGES} name The name of the image to load. * @returns {Promise} The loaded image. */ export const load_cached_image = (name) => load_image(TEST_IMAGES[name]); + +/** + * Load a cached audio. + * @param {keyof typeof TEST_AUDIOS} name The name of the audio to load. + * @returns {Promise} The loaded audio. 
+ */ +export const load_cached_audio = (name) => load_audio(TEST_AUDIOS[name]); diff --git a/tests/feature_extractors.test.js b/tests/feature_extractors.test.js new file mode 100644 index 000000000..89ac48fa5 --- /dev/null +++ b/tests/feature_extractors.test.js @@ -0,0 +1,5 @@ +import { init } from "./init.js"; +import { collect_and_execute_tests } from "./test_utils.js"; + +init(); +await collect_and_execute_tests("Feature extractors", "feature_extraction"); diff --git a/tests/image_processors.test.js b/tests/image_processors.test.js new file mode 100644 index 000000000..07e867705 --- /dev/null +++ b/tests/image_processors.test.js @@ -0,0 +1,5 @@ +import { init } from "./init.js"; +import { collect_and_execute_tests } from "./test_utils.js"; + +init(); +await collect_and_execute_tests("Image processors", "image_processing"); diff --git a/tests/init.js b/tests/init.js index 29097eb47..93321529b 100644 --- a/tests/init.js +++ b/tests/init.js @@ -57,6 +57,8 @@ export function init() { registerBackend("test", onnxruntimeBackend, Number.POSITIVE_INFINITY); } +export const MAX_TOKENIZER_LOAD_TIME = 10_000; // 10 seconds +export const MAX_FEATURE_EXTRACTOR_LOAD_TIME = 10_000; // 10 seconds export const MAX_PROCESSOR_LOAD_TIME = 10_000; // 10 seconds export const MAX_MODEL_LOAD_TIME = 15_000; // 15 seconds export const MAX_TEST_EXECUTION_TIME = 60_000; // 60 seconds @@ -64,9 +66,9 @@ export const MAX_MODEL_DISPOSE_TIME = 1_000; // 1 second export const MAX_TEST_TIME = MAX_MODEL_LOAD_TIME + MAX_TEST_EXECUTION_TIME + MAX_MODEL_DISPOSE_TIME; -export const DEFAULT_MODEL_OPTIONS = { +export const DEFAULT_MODEL_OPTIONS = Object.freeze({ dtype: "fp32", -}; +}); expect.extend({ toBeCloseToNested(received, expected, numDigits = 2) { diff --git a/tests/models.test.js b/tests/models.test.js index a668baee4..ec52fc49d 100644 --- a/tests/models.test.js +++ b/tests/models.test.js @@ -2,13 +2,9 @@ * Test that models loaded outside of the `pipeline` function work correctly (e.g., `AutoModel.from_pretrained(...)`); */ -import * as MODEL_TESTS from "./models/all_modeling_tests.js"; - import { AutoTokenizer, AutoModel, BertModel, GPT2Model, T5ForConditionalGeneration, BertTokenizer, GPT2Tokenizer, T5Tokenizer } from "../src/transformers.js"; - -import { init, MAX_TEST_EXECUTION_TIME } from "./init.js"; - -import { compare } from "./test_utils.js"; +import { init, MAX_TEST_EXECUTION_TIME, DEFAULT_MODEL_OPTIONS } from "./init.js"; +import { compare, collect_and_execute_tests } from "./test_utils.js"; // Initialise the testing environment init(); @@ -38,7 +34,7 @@ describe("Loading different architecture types", () => { async () => { // Load model and tokenizer const tokenizer = await tokenizerClassToTest.from_pretrained(model_id); - const model = await modelClassToTest.from_pretrained(model_id, { dtype: "fp32" }); + const model = await modelClassToTest.from_pretrained(model_id, DEFAULT_MODEL_OPTIONS); const tests = [ texts[0], // single @@ -65,7 +61,6 @@ describe("Loading different architecture types", () => { throw new Error("Unexpected output"); } } - await model.dispose(); }, MAX_TEST_EXECUTION_TIME, @@ -74,8 +69,4 @@ describe("Loading different architecture types", () => { } }); -describe("Model-specific tests", () => { - for (const [modelName, modelTest] of Object.entries(MODEL_TESTS)) { - describe(modelName, modelTest); - } -}); +await collect_and_execute_tests("Model-specific tests", "modeling"); diff --git a/tests/models/all_modeling_tests.js b/tests/models/all_modeling_tests.js deleted file mode 100644 index 
0f64ec581..000000000 --- a/tests/models/all_modeling_tests.js +++ /dev/null @@ -1,33 +0,0 @@ -export { default as bert } from "./bert/test_modeling_bert.js"; -export { default as bloom } from "./bloom/test_modeling_bloom.js"; -export { default as clip } from "./clip/test_modeling_clip.js"; -export { default as codegen } from "./codegen/test_modeling_codegen.js"; -export { default as cohere } from "./cohere/test_modeling_cohere.js"; -export { default as florence2 } from "./florence2/test_modeling_florence2.js"; -export { default as gemma } from "./gemma/test_modeling_gemma.js"; -export { default as gemma2 } from "./gemma2/test_modeling_gemma2.js"; -export { default as gpt2 } from "./gpt2/test_modeling_gpt2.js"; -export { default as gpt_bigcode } from "./gpt_bigcode/test_modeling_gpt_bigcode.js"; -export { default as gpt_neo } from "./gpt_neo/test_modeling_gpt_neo.js"; -export { default as gpt_neox } from "./gpt_neox/test_modeling_gpt_neox.js"; -export { default as gptj } from "./gptj/test_modeling_gptj.js"; -export { default as granite } from "./granite/test_modeling_granite.js"; -export { default as idefics3 } from "./idefics3/test_modeling_idefics3.js"; -export { default as jais } from "./jais/test_modeling_jais.js"; -export { default as llama } from "./llama/test_modeling_llama.js"; -export { default as llava } from "./llava/test_modeling_llava.js"; -export { default as marian } from "./marian/test_modeling_marian.js"; -export { default as mistral } from "./mistral/test_modeling_mistral.js"; -export { default as mpt } from "./mpt/test_modeling_mpt.js"; -export { default as musicgen } from "./musicgen/test_modeling_musicgen.js"; -export { default as olmo } from "./olmo/test_modeling_olmo.js"; -export { default as olmo2 } from "./olmo2/test_modeling_olmo2.js"; -export { default as opt } from "./opt/test_modeling_opt.js"; -export { default as paligemma } from "./paligemma/test_modeling_paligemma.js"; -export { default as patchtsmixer } from "./patchtsmixer/test_modeling_patchtsmixer.js"; -export { default as patchtst } from "./patchtst/test_modeling_patchtst.js"; -export { default as pyannote } from "./pyannote/test_modeling_pyannote.js"; -export { default as qwen2_vl } from "./qwen2_vl/test_modeling_qwen2_vl.js"; -export { default as t5 } from "./t5/test_modeling_t5.js"; -export { default as vision_encoder_decoder } from "./vision_encoder_decoder/test_modeling_vision_encoder_decoder.js"; -export { default as whisper } from "./whisper/test_modeling_whisper.js"; diff --git a/tests/models/all_tokenization_tests.js b/tests/models/all_tokenization_tests.js deleted file mode 100644 index b9bac9d1f..000000000 --- a/tests/models/all_tokenization_tests.js +++ /dev/null @@ -1,22 +0,0 @@ -export * as AlbertTokenizer from "./albert/test_tokenization_albert.js"; -export * as BertTokenizer from "./bert/test_tokenization_bert.js"; -export * as BlenderbotSmallTokenizer from "./blenderbot_small/test_tokenization_blenderbot_small.js"; -export * as BloomTokenizer from "./bloom/test_tokenization_bloom.js"; -export * as CLIPTokenizer from "./clip/test_tokenization_clip.js"; -export * as DebertaV2Tokenizer from "./deberta_v2/test_tokenization_deberta_v2.js"; -export * as DistilBertTokenizer from "./distilbert/test_tokenization_distilbert.js"; -export * as EsmTokenizer from "./esm/test_tokenization_esm.js"; -export * as FalconTokenizer from "./falcon/test_tokenization_falcon.js"; -export * as GPT2Tokenizer from "./gpt2/test_tokenization_gpt2.js"; -export * as GemmaTokenizer from 
"./gemma/test_tokenization_gemma.js"; -export * as LlamaTokenizer from "./llama/test_tokenization_llama.js"; -export * as M2M100Tokenizer from "./m2m_100/test_tokenization_m2m_100.js"; -export * as MPNetTokenizer from "./mpnet/test_tokenization_mpnet.js"; -export * as NllbTokenizer from "./nllb/test_tokenization_nllb.js"; -export * as Qwen2Tokenizer from "./qwen2/test_tokenization_qwen2.js"; -export * as RobertaTokenizer from "./roberta/test_tokenization_roberta.js"; -export * as T5Tokenizer from "./t5/test_tokenization_t5.js"; -export * as VitsTokenizer from "./vits/test_tokenization_vits.js"; -export * as Wav2Vec2CTCTokenizer from "./wav2vec2/test_tokenization_wav2vec2.js"; -export * as WhisperTokenizer from "./whisper/test_tokenization_whisper.js"; -export * as XLMRobertaTokenizer from "./xlm_roberta/test_tokenization_xlm_roberta.js"; diff --git a/tests/models/audio_spectrogram_transformer/test_feature_extraction_audio_spectrogram_transformer.js b/tests/models/audio_spectrogram_transformer/test_feature_extraction_audio_spectrogram_transformer.js new file mode 100644 index 000000000..0fced5b01 --- /dev/null +++ b/tests/models/audio_spectrogram_transformer/test_feature_extraction_audio_spectrogram_transformer.js @@ -0,0 +1,53 @@ +import { AutoFeatureExtractor, ASTFeatureExtractor } from "../../../src/transformers.js"; + +import { load_cached_audio } from "../../asset_cache.js"; +import { MAX_FEATURE_EXTRACTOR_LOAD_TIME, MAX_TEST_EXECUTION_TIME } from "../../init.js"; + +export default () => { + // ASTFeatureExtractor + describe("ASTFeatureExtractor", () => { + const model_id = "Xenova/ast-finetuned-audioset-10-10-0.4593"; + + /** @type {ASTFeatureExtractor} */ + let feature_extractor; + beforeAll(async () => { + feature_extractor = await AutoFeatureExtractor.from_pretrained(model_id); + }, MAX_FEATURE_EXTRACTOR_LOAD_TIME); + + it( + "truncation", + async () => { + const audio = await load_cached_audio("mlk"); + const { input_values } = await feature_extractor(audio); + expect(input_values.dims).toEqual([1, 1024, 128]); + + expect(input_values.mean().item()).toBeCloseTo(-0.04054912979309085); + expect(input_values.data[0]).toBeCloseTo(-0.5662586092948914); + expect(input_values.data[1]).toBeCloseTo(-1.0300861597061157); + expect(input_values.data[129]).toBeCloseTo(-1.084834098815918); + expect(input_values.data[1025]).toBeCloseTo(-1.1204065084457397); + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + "padding", + async () => { + const audio = await load_cached_audio("mlk"); + const { input_values } = await feature_extractor(audio.slice(0, 1000)); + expect(input_values.dims).toEqual([1, 1024, 128]); // [1, 4, 128] -> (padded to) -> [1, 1024, 128] + + expect(input_values.mean().item()).toBeCloseTo(0.4647964835166931); + expect(input_values.data[0]).toBeCloseTo(-0.5662586092948914); + expect(input_values.data[1]).toBeCloseTo(-1.0300861597061157); + expect(input_values.data[129]).toBeCloseTo(-1.084834098815918); + + // padded values + expect(input_values.data[1025]).toBeCloseTo(0.46703237295150757); + expect(input_values.data[2049]).toBeCloseTo(0.46703237295150757); + expect(input_values.data[10000]).toBeCloseTo(0.46703237295150757); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); +}; diff --git a/tests/models/clap/test_feature_extraction_clap.js b/tests/models/clap/test_feature_extraction_clap.js new file mode 100644 index 000000000..16991c186 --- /dev/null +++ b/tests/models/clap/test_feature_extraction_clap.js @@ -0,0 +1,74 @@ +import { AutoFeatureExtractor, ClapFeatureExtractor } from 
"../../../src/transformers.js"; + +import { load_cached_audio } from "../../asset_cache.js"; +import { MAX_FEATURE_EXTRACTOR_LOAD_TIME, MAX_TEST_EXECUTION_TIME } from "../../init.js"; + +export default () => { + // ClapFeatureExtractor + describe("ClapFeatureExtractor", () => { + const model_id = "Xenova/clap-htsat-unfused"; + + /** @type {ClapFeatureExtractor} */ + let feature_extractor; + beforeAll(async () => { + feature_extractor = await AutoFeatureExtractor.from_pretrained(model_id); + }, MAX_FEATURE_EXTRACTOR_LOAD_TIME); + + it( + "truncation", + async () => { + const audio = await load_cached_audio("mlk"); + + // Since truncation uses a random strategy, we override + // Math.random to ensure that the test is deterministic + const originalRandom = Math.random; + Math.random = () => 0.5; + + let long_audio = new Float32Array(500000); + long_audio.set(audio); + long_audio.set(audio, long_audio.length - audio.length); + + const { input_features } = await feature_extractor(long_audio); + const { dims, data } = input_features; + expect(dims).toEqual([1, 1, 1001, 64]); + + expect(input_features.mean().item()).toBeCloseTo(-37.94569396972656); + expect(data[0]).toBeCloseTo(-53.32647705078125); + expect(data[1]).toBeCloseTo(-47.76755142211914); + expect(data[65]).toBeCloseTo(-36.32261276245117); + expect(data[1002]).toBeCloseTo(-28.0314884185791); + expect(data[10000]).toBeCloseTo(-21.905902862548828); + expect(data[60000]).toBeCloseTo(-14.877863883972168); + expect(data[64062]).toBeCloseTo(-37.9784049987793); + expect(data[64063]).toBeCloseTo(-37.73963928222656); + + // Reset Math.random + Math.random = originalRandom; + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + "padding", + async () => { + const audio = await load_cached_audio("mlk"); + const { input_features } = await feature_extractor(audio); + const { data, dims } = input_features; + expect(dims).toEqual([1, 1, 1001, 64]); + + expect(input_features.mean().item()).toBeCloseTo(-34.99049377441406); + expect(data[0]).toBeCloseTo(-21.32573890686035); + expect(data[1]).toBeCloseTo(-26.168411254882812); + expect(data[65]).toBeCloseTo(-29.716018676757812); + expect(data[1002]).toBeCloseTo(-32.16273498535156); + expect(data[10000]).toBeCloseTo(-19.9283390045166); + + // padded values + expect(data[60000]).toBeCloseTo(-100.0); + expect(data[64062]).toBeCloseTo(-100.0); + expect(data[64063]).toBeCloseTo(-100.0); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); +}; diff --git a/tests/models/florence2/test_processor_florence2.js b/tests/models/florence2/test_processor_florence2.js new file mode 100644 index 000000000..5d4ff2faf --- /dev/null +++ b/tests/models/florence2/test_processor_florence2.js @@ -0,0 +1,222 @@ +import { AutoProcessor, Florence2Processor } from "../../../src/transformers.js"; +import { MAX_TEST_EXECUTION_TIME, MAX_PROCESSOR_LOAD_TIME } from "../../init.js"; +import { load_cached_image } from "../../asset_cache.js"; +export default () => { + describe("FlorenceProcessor", () => { + const model_id = "Xenova/tiny-random-Florence2ForConditionalGeneration"; + + /** @type {Florence2Processor} */ + let processor; + let images = {}; + + beforeAll(async () => { + processor = await AutoProcessor.from_pretrained(model_id); + images = { + beetle: await load_cached_image("beetle"), + book_cover: await load_cached_image("book_cover"), + }; + }, MAX_PROCESSOR_LOAD_TIME); + + describe("Prompt construction", () => { + it( + "Construct prompt", + async () => { + const text = ""; + const prompts = processor.construct_prompts(text); + const target = 
["Locate the objects with category name in the image."]; + expect(prompts).toEqual(target); + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + "Construct prompts", + async () => { + const texts = ["", "Locate the objects with category name in the image.", "cat"]; + const prompts = processor.construct_prompts(texts); + const target = ["Describe with a paragraph what is shown in the image.", "Locate the objects with category name in the image.", "Locate cat in the image."]; + expect(prompts).toEqual(target); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); + + describe("Post-process generation", () => { + const TESTS = [ + { + task: "", + generated_text: "A green car parked in front of a yellow building.", + target: { "": "A green car parked in front of a yellow building." }, + image: "beetle", + }, + { + task: "", + generated_text: "The image shows a green Volkswagen Beetle parked in front of a yellow building with two brown doors. The sky is a mix of blue and white, and there are a few green trees in the background.", + target: { "": "The image shows a green Volkswagen Beetle parked in front of a yellow building with two brown doors. The sky is a mix of blue and white, and there are a few green trees in the background." }, + image: "beetle", + }, + { + task: "", + generated_text: "The image shows a vintage Volkswagen Beetle car parked on a cobblestone street in front of a yellow building with two wooden doors. The car is painted in a bright turquoise color and has a white stripe running along the side. It has two doors on either side of the car, one on top of the other, and a small window on the front. The building appears to be old and dilapidated, with peeling paint and crumbling walls. The sky is blue and there are trees in the background.", + target: { "": "The image shows a vintage Volkswagen Beetle car parked on a cobblestone street in front of a yellow building with two wooden doors. The car is painted in a bright turquoise color and has a white stripe running along the side. It has two doors on either side of the car, one on top of the other, and a small window on the front. The building appears to be old and dilapidated, with peeling paint and crumbling walls. The sky is blue and there are trees in the background." 
}, + image: "beetle", + }, + { + task: "", + generated_text: "cardoorwheel", + target: { + "": { + bboxes: [ + [34.24, 160.08, 597.44, 371.76], + [456.0, 97.68, 580.16, 261.84], + [450.88, 276.72, 554.56, 370.8], + [95.68, 280.56, 198.72, 371.28], + ], + labels: ["car", "door", "wheel", "wheel"], + }, + }, + image: "beetle", + }, + { + task: "", + generated_text: "turquoise Volkswagen Beetlewheel", + target: { + "": { + bboxes: [ + [33.6, 160.08, 596.8, 371.76], + [450.88, 276.72, 553.28, 370.8], + [95.04, 280.56, 197.44, 371.28], + ], + labels: ["turquoise Volkswagen Beetle", "wheel", "wheel"], + }, + }, + image: "beetle", + }, + { + task: "", + generated_text: "", + target: { + "": { + bboxes: [ + [33.6, 160.08, 596.8, 371.76], + [455.36, 97.68, 579.52, 261.84], + [450.88, 276.72, 553.28, 370.8], + [95.04, 280.56, 198.08, 371.28], + [226.88, 88.56, 332.48, 164.4], + [65.6, 266.64, 86.72, 295.92], + [271.68, 241.68, 302.4, 246.96], + [408.0, 308.4, 413.76, 320.88], + ], + labels: ["", "", "", "", "", "", "", ""], + }, + }, + image: "beetle", + }, + { + task: "", + text_input: "A green car parked in front of a yellow building.", + generated_text: "A green cara yellow building", + target: { + "": { + bboxes: [ + [34.88, 158.64, 583.36, 374.64], + [0.32, 4.08, 639.04, 305.04], + ], + labels: ["A green car", "a yellow building"], + }, + }, + image: "beetle", + }, + // { + // task: "", + // text_input: "a green car", + // generated_text: "", + // target: { + // '': { + // polygons: [[[[178.88, 181.68, 180.8, 180.72, 182.72, 180.72, 187.84, 177.84, 189.76, 177.84, 192.96, 175.92, 194.88, 175.92, 198.08, 174, 200.64, 173.04, 203.84, 172.08, 207.04, 170.64, 209.6, 169.68, 214.08, 168.72, 217.92, 167.76, 221.76, 166.8, 226.24, 165.84, 230.72, 164.88, 237.12, 163.92, 244.16, 162.96, 253.12, 162, 265.28, 161.04, 311.36, 161.04, 329.28, 162, 338.24, 162.96, 345.28, 163.92, 350.4, 164.88, 354.24, 165.84, 358.72, 166.8, 362.56, 167.76, 366.4, 168.72, 370.24, 169.68, 373.44, 170.64, 375.36, 172.08, 377.28, 174, 379.2, 176.88, 380.48, 179.76, 382.4, 181.68, 384.32, 185.04, 386.24, 187.92, 387.52, 190.8, 389.44, 192.72, 390.08, 196.08, 392, 198.96, 394.56, 201.84, 396.48, 204.72, 398.4, 208.08, 403.52, 212.88, 406.08, 213.84, 409.28, 216.72, 412.48, 220.08, 431.68, 220.08, 432.32, 221.04, 442.56, 222, 456.64, 222, 465.6, 222.96, 472.64, 223.92, 478.4, 224.88, 484.8, 225.84, 489.92, 226.8, 493.76, 227.76, 497.6, 228.72, 501.44, 229.68, 504.64, 231.12, 507.84, 232.08, 510.4, 233.04, 513.6, 234, 516.8, 235.92, 518.72, 235.92, 523.84, 238.8, 525.76, 238.8, 527.68, 239.76, 529.6, 241.68, 532.8, 242.64, 536, 245.04, 538.56, 247.92, 541.76, 249.84, 545.6, 251.76, 548.8, 252.72, 550.72, 252.72, 553.92, 253.68, 556.48, 255.6, 558.4, 255.6, 564.8, 258.96, 566.72, 260.88, 568.64, 260.88, 570.56, 261.84, 572.48, 263.76, 573.76, 265.68, 574.4, 268.56, 574.4, 271.92, 573.76, 272.88, 572.48, 275.76, 572.48, 279.6, 573.76, 285.84, 574.4, 286.8, 575.68, 289.68, 576.32, 292.56, 577.6, 298.8, 577.6, 301.68, 576.32, 302.64, 575.68, 310.8, 575.68, 312.72, 576.32, 313.68, 577.6, 316.56, 577.6, 320.88, 574.4, 321.84, 568.64, 322.8, 559.68, 322.8, 553.92, 323.76, 552.64, 332.88, 552, 336.72, 550.72, 339.6, 550.08, 342.96, 548.8, 344.88, 546.88, 346.8, 545.6, 349.68, 543.68, 352.56, 541.76, 355.92, 534.72, 362.64, 531.52, 364.56, 525.76, 367.92, 522.56, 368.88, 518.72, 369.84, 495.68, 369.84, 489.92, 368.88, 486.72, 367.92, 483.52, 366.96, 479.68, 364.56, 476.48, 362.64, 472.64, 359.76, 465.6, 352.56, 463.68, 349.68, 461.76, 
346.8, 460.48, 344.88, 460.48, 342.96, 458.56, 339.6, 457.92, 336.72, 457.92, 334.8, 456.64, 332.88, 454.72, 330.96, 452.8, 331.92, 448.32, 336.72, 446.4, 337.68, 426.56, 336.72, 424.64, 336.72, 423.36, 337.68, 420.8, 338.64, 414.4, 339.6, 412.48, 339.6, 411.2, 338.64, 380.48, 337.68, 217.28, 337.68, 216, 338.64, 210.88, 339.6, 207.04, 339.6, 203.84, 338.64, 201.92, 337.68, 200, 335.76, 198.08, 334.8, 194.88, 334.8, 192.96, 336.72, 191.68, 338.64, 191.68, 340.56, 191.04, 342.96, 189.12, 344.88, 187.84, 347.76, 185.92, 349.68, 184.64, 352.56, 182.72, 355.92, 176.96, 361.68, 173.76, 363.6, 170.56, 365.52, 166.72, 367.92, 163.52, 368.88, 160.96, 369.84, 153.92, 370.8, 131.52, 370.8, 127.68, 369.84, 124.48, 368.88, 118.72, 365.52, 115.52, 363.6, 111.68, 360.72, 106.56, 355.92, 104.64, 352.56, 103.36, 349.68, 101.44, 347.76, 100.8, 345.84, 99.52, 342.96, 99.52, 339.6, 98.88, 337.68, 95.68, 334.8, 93.76, 333.84, 86.72, 333.84, 80.32, 334.8, 79.68, 335.76, 74.56, 336.72, 66.24, 336.72, 63.68, 334.8, 53.44, 334.8, 50.24, 333.84, 48.32, 331.92, 48.32, 328.56, 50.24, 326.64, 51.52, 324.72, 51.52, 322.8, 44.48, 321.84, 40.64, 320.88, 38.72, 319.92, 37.44, 317.52, 36.16, 313.68, 36.16, 306.96, 38.72, 304.56, 42.56, 303.6, 46.4, 302.64, 55.36, 301.68, 65.6, 301.68, 67.52, 300.72, 69.44, 298.8, 70.72, 296.88, 70.72, 292.56, 69.44, 291.6, 68.8, 288.72, 67.52, 284.88, 67.52, 276.72, 68.8, 273.84, 69.44, 271.92, 72.64, 268.56, 74.56, 267.6, 77.76, 266.64, 79.68, 266.64, 81.6, 264.72, 80.32, 260.88, 81.6, 258.96, 83.52, 256.56, 88.64, 256.56, 90.56, 255.6, 92.48, 253.68, 92.48, 252.72, 97.6, 246.96, 114.88, 229.68, 117.44, 226.8, 122.56, 222.96, 125.76, 221.04, 126.4, 221.04, 129.6, 219.12, 133.44, 215.76, 138.56, 211.92, 143.68, 208.08, 149.44, 201.84, 153.92, 198.96, 154.56, 198.96, 157.76, 197.04, 162.88, 192.72, 168.64, 186.96, 171.84, 185.04, 176.96, 183.12, 178.88, 180.72]]]], + // labels: [''], + // } + // }, + // image: 'beetle', + // }, + // { + // task: "", + // text_input: "", + // generated_text: "", + // target: { + // '': { + // polygons: [[[[470.08, 288.24, 473.92, 285.36, 477.12, 283.44, 479.04, 282.48, 480.96, 282.48, 484.16, 280.56, 486.72, 279.6, 489.92, 278.64, 495.04, 277.68, 512.32, 277.68, 514.88, 278.64, 518.08, 279.6, 521.28, 281.52, 523.2, 281.52, 525.12, 283.44, 528.32, 284.4, 530.88, 286.32, 534.08, 288.24, 543.04, 297.36, 544.96, 300.24, 546.88, 303.12, 550.08, 309.36, 551.36, 312.24, 552, 315.12, 553.28, 319.44, 553.28, 332.4, 552, 337.2, 551.36, 340.08, 550.08, 343.44, 548.16, 347.28, 546.24, 350.16, 544.32, 353.04, 541.12, 357.36, 537.28, 361.2, 532.16, 365.04, 528.96, 366.96, 527.04, 367.92, 523.84, 368.88, 521.28, 369.84, 516.16, 371.28, 500.8, 371.28, 491.84, 369.84, 488, 368.88, 484.8, 367.92, 479.04, 365.04, 475.84, 363.12, 472, 360.24, 464.96, 353.04, 463.04, 350.16, 461.12, 347.28, 459.84, 345.36, 459.84, 343.44, 457.92, 340.08, 456.64, 337.2, 456, 334.32, 454.72, 330.48, 454.72, 316.08, 456, 311.28, 456.64, 307.44, 457.92, 304.08, 459.84, 301.2, 459.84, 299.28, 461.12, 297.36, 463.04, 294.48]]]], + // labels: [''], + // } + // }, + // image: 'beetle', + // }, + // { + // task: "", + // text_input: "a green car", + // generated_text: "a green car", + // target: { + // '': { + // bboxes: [[34.24, 158.64, 582.72, 374.16]], + // bboxes_labels: ['a green car'], + // polygons: [], + // polygons_labels: [], + // } + // }, + // image: 'beetle', + // }, + { + task: "", + text_input: "", + generated_text: "car", + target: { "": "car" }, + image: "beetle", + }, + { + task: "", + 
text_input: "", + generated_text: "turquoise Volkswagen Beetle", + target: { "": "turquoise Volkswagen Beetle" }, + image: "beetle", + }, + { + task: "", + generated_text: "CUDAFOR ENGINEERSAn Introduction to High-PerformanceParallel ComputingDUANE STORTIMETE YURTOGLU", + target: { "": "CUDAFOR ENGINEERSAn Introduction to High-PerformanceParallel ComputingDUANE STORTIMETE YURTOGLU" }, + image: "book_cover", + }, + { + task: "", + generated_text: "CUDAFOR ENGINEERSAn Introduction to High-PerformanceParallel ComputingDUANE STORTIMETE YURTOGLU", + target: { + "": { + quad_boxes: [ + [167.0435028076172, 50.25, 375.7974853515625, 50.25, 375.7974853515625, 114.75, 167.0435028076172, 114.75], + [144.8784942626953, 120.75, 375.7974853515625, 120.75, 375.7974853515625, 149.25, 144.8784942626953, 149.25], + [115.86249542236328, 165.25, 376.6034851074219, 166.25, 376.6034851074219, 184.25, 115.86249542236328, 183.25], + [239.9864959716797, 184.25, 376.6034851074219, 186.25, 376.6034851074219, 204.25, 239.9864959716797, 202.25], + [266.1814880371094, 441.25, 376.6034851074219, 441.25, 376.6034851074219, 456.25, 266.1814880371094, 456.25], + [252.0764923095703, 460.25, 376.6034851074219, 460.25, 376.6034851074219, 475.25, 252.0764923095703, 475.25], + ], + + // NOTE: Python version has a bug here, it should be "CUDA" instead of "CUDA" + labels: [/* 'CUDA' */ "CUDA", "FOR ENGINEERS", "An Introduction to High-Performance", "Parallel Computing", "DUANE STORTI", "METE YURTOGLU"], + }, + }, + image: "book_cover", + }, + ]; + + for (const { task, generated_text, target, image } of TESTS) { + it( + task, + () => { + const result = processor.post_process_generation(generated_text, task, images[image].size); + expect(result).toBeCloseToNested(target, 4); + }, + MAX_TEST_EXECUTION_TIME, + ); + } + }); + }); +}; diff --git a/tests/models/janus/test_processor_janus.js b/tests/models/janus/test_processor_janus.js new file mode 100644 index 000000000..3092fb987 --- /dev/null +++ b/tests/models/janus/test_processor_janus.js @@ -0,0 +1,47 @@ +import { AutoProcessor, VLChatProcessor } from "../../../src/transformers.js"; + +import { MAX_PROCESSOR_LOAD_TIME, MAX_TEST_EXECUTION_TIME } from "../../init.js"; + +export default () => { + describe("VLChatProcessor", () => { + const model_id = "onnx-community/Janus-1.3B-ONNX"; + + /** @type {VLChatProcessor} */ + let processor; + beforeAll(async () => { + processor = await AutoProcessor.from_pretrained(model_id); + }, MAX_PROCESSOR_LOAD_TIME); + + it( + "Image and text", + async () => { + // Prepare inputs + const conversation = [ + { + role: "User", + content: "\nConvert the formula into latex code.", + images: ["https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/quadratic_formula.png"], + }, + ]; + + const { input_ids, attention_mask, images_seq_mask, images_emb_mask, pixel_values, original_sizes, reshaped_input_sizes } = await processor(conversation); + const num_tokens = 631; + const { num_image_tokens } = processor.config; // 576 + const { image_size } = processor.image_processor.config; // 384 + + expect(input_ids.dims).toEqual([1, num_tokens]); + expect(attention_mask.dims).toEqual([1, num_tokens]); + expect(images_seq_mask.dims).toEqual([1, num_tokens]); + expect(images_seq_mask.to("float32").mean().item()).toBeCloseTo(num_image_tokens / num_tokens, 6); + expect(images_emb_mask.dims).toEqual([1, 1, num_image_tokens]); + expect(images_emb_mask.to("float32").mean().item()).toBeCloseTo(1); + expect(pixel_values.dims).toEqual([1, 1, 3, image_size, 
image_size]); + expect(pixel_values.mean().item()).toBeCloseTo(0.5999642610549927, 6); + + expect(original_sizes).toEqual([[206, 767]]); + expect(reshaped_input_sizes).toEqual([[103, image_size]]); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); +}; diff --git a/tests/models/jina_clip/test_processor_jina_clip.js b/tests/models/jina_clip/test_processor_jina_clip.js new file mode 100644 index 000000000..47ac16fe8 --- /dev/null +++ b/tests/models/jina_clip/test_processor_jina_clip.js @@ -0,0 +1,44 @@ +import { AutoProcessor, JinaCLIPProcessor } from "../../../src/transformers.js"; +import { load_cached_image } from "../../asset_cache.js"; + +import { MAX_PROCESSOR_LOAD_TIME, MAX_TEST_EXECUTION_TIME } from "../../init.js"; + +export default () => { + describe("JinaCLIPProcessor", () => { + const model_id = "jinaai/jina-clip-v2"; + + /** @type {JinaCLIPProcessor} */ + let processor; + beforeAll(async () => { + processor = await AutoProcessor.from_pretrained(model_id); + }, MAX_PROCESSOR_LOAD_TIME); + + it( + "Image and text", + async () => { + // Prepare inputs + const images = [await load_cached_image("white_image"), await load_cached_image("blue_image")]; + const sentences = [ + "غروب جميل على الشاطئ", // Arabic + "海滩上美丽的日落", // Chinese + "Un beau coucher de soleil sur la plage", // French + "Ein wunderschöner Sonnenuntergang am Strand", // German + "Ένα όμορφο ηλιοβασίλεμα πάνω από την παραλία", // Greek + "समुद्र तट पर एक खूबसूरत सूर्यास्त", // Hindi + "Un bellissimo tramonto sulla spiaggia", // Italian + "浜辺に沈む美しい夕日", // Japanese + "해변 위로 아름다운 일몰", // Korean + ]; + + // Encode text and images + const { input_ids, attention_mask, pixel_values } = await processor(sentences, images, { padding: true, truncation: true }); + + expect(input_ids.dims).toEqual([sentences.length, 19]); + expect(attention_mask.dims).toEqual([sentences.length, 19]); + expect(pixel_values.dims).toEqual([images.length, 3, 512, 512]); + expect(pixel_values.mean().item()).toBeCloseTo(0.7857685685157776, 6); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); +}; diff --git a/tests/models/mgp_str/test_modeling_mgp_str.js b/tests/models/mgp_str/test_modeling_mgp_str.js new file mode 100644 index 000000000..b0122f03b --- /dev/null +++ b/tests/models/mgp_str/test_modeling_mgp_str.js @@ -0,0 +1,84 @@ +import { MgpstrProcessor, MgpstrForSceneTextRecognition } from "../../../src/transformers.js"; + +import { load_cached_image } from "../../asset_cache.js"; +import { MAX_MODEL_LOAD_TIME, MAX_TEST_EXECUTION_TIME, MAX_MODEL_DISPOSE_TIME, DEFAULT_MODEL_OPTIONS } from "../../init.js"; + +export default () => { + describe("MgpstrForSceneTextRecognition", () => { + const model_id = "onnx-community/tiny-random-MgpstrForSceneTextRecognition"; + /** @type {MgpstrForSceneTextRecognition} */ + let model; + /** @type {MgpstrProcessor} */ + let processor; + beforeAll(async () => { + model = await MgpstrForSceneTextRecognition.from_pretrained(model_id, DEFAULT_MODEL_OPTIONS); + processor = await MgpstrProcessor.from_pretrained(model_id); + }, MAX_MODEL_LOAD_TIME); + + const TARGETS = { + white_image: { + generated_text: ["mmmmmmmmmmmmmmmmmmmmmmmmmm"], + scores: [3.5553885547065065e-27], + char_preds: ["mmmmmmmmmmmmmmmmmmmmmmmmmm"], + bpe_preds: ["wwwwwwwwwwwwwwwwwwwwwwwwww"], + wp_preds: ["[unused65][unused65][unused65][unused65][unused65][unused65][unused65][unused65][unused65][unused65][unused65][unused65][unused65][unused65][unused65][unused65][unused65][unused65][unused65][unused65][unused65][unused65][unused65][unused65][unused65][unused65]"], + }, + 
blue_image: { + generated_text: ["11111111111111111111111111"], + scores: [9.739909092663214e-32], + char_preds: ["11111111111111111111111111"], + bpe_preds: ["22222222222222222222222222"], + wp_preds: ["[unused59][unused59][unused59][unused59][unused59][unused59][unused59][unused59][unused59][unused59][unused59][unused59][unused59][unused59][unused59][unused59][unused59][unused59][unused59][unused59][unused59][unused59][unused59][unused59][unused59][unused59]"], + }, + }; + + it( + "batch_size=1", + async () => { + const image_id = "white_image"; + const image = await load_cached_image(image_id); + + const inputs = await processor(image); + const outputs = await model(inputs); + + const { max_token_length, num_character_labels, num_bpe_labels, num_wordpiece_labels } = model.config; + expect(outputs.char_logits.dims).toEqual([1, /* 27 */ max_token_length, /* 38 */ num_character_labels]); + expect(outputs.bpe_logits.dims).toEqual([1, /* 27 */ max_token_length, /* 99 */ num_bpe_labels]); + expect(outputs.wp_logits.dims).toEqual([1, /* 27 */ max_token_length, /* 99 */ num_wordpiece_labels]); + + const decoded = processor.batch_decode(outputs.logits); + expect(decoded).toBeCloseToNested(TARGETS[image_id]); + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + "batch_size>1", + async () => { + const image_ids = ["white_image", "blue_image"]; + const images = await Promise.all(image_ids.map((image_id) => load_cached_image(image_id))); + + const inputs = await processor(images); + const outputs = await model(inputs); + + const { max_token_length, num_character_labels, num_bpe_labels, num_wordpiece_labels } = model.config; + expect(outputs.char_logits.dims).toEqual([images.length, /* 27 */ max_token_length, /* 38 */ num_character_labels]); + expect(outputs.bpe_logits.dims).toEqual([images.length, /* 27 */ max_token_length, /* 99 */ num_bpe_labels]); + expect(outputs.wp_logits.dims).toEqual([images.length, /* 27 */ max_token_length, /* 99 */ num_wordpiece_labels]); + + const decoded = processor.batch_decode(outputs.logits); + const target = image_ids.reduce((acc, image_id) => { + for (const key in TARGETS[image_id]) (acc[key] ??= []).push(...TARGETS[image_id][key]); + return acc; + }, {}); + + expect(decoded).toBeCloseToNested(target); + }, + MAX_TEST_EXECUTION_TIME, + ); + + afterAll(async () => { + await model?.dispose(); + }, MAX_MODEL_DISPOSE_TIME); + }); +}; diff --git a/tests/models/musicgen/test_modeling_musicgen.js b/tests/models/musicgen/test_modeling_musicgen.js index 7ebf808ed..e16cf022b 100644 --- a/tests/models/musicgen/test_modeling_musicgen.js +++ b/tests/models/musicgen/test_modeling_musicgen.js @@ -27,7 +27,7 @@ export default () => { const decoder_input_ids = full([inputs.input_ids.dims[0] * model.config.decoder.num_codebooks, 1], pad_token_id); const { logits } = await model({ ...inputs, decoder_input_ids }); expect(logits.dims).toEqual([8, 1, 99]); - expect(logits.mean().item()).toBeCloseTo(-0.0018370470497757196, 5); + expect(logits.mean().item()).toBeCloseTo(-0.0018370470497757196, 4); }, MAX_TEST_EXECUTION_TIME, ); diff --git a/tests/models/paligemma/test_processor_paligemma.js b/tests/models/paligemma/test_processor_paligemma.js new file mode 100644 index 000000000..4096c5f64 --- /dev/null +++ b/tests/models/paligemma/test_processor_paligemma.js @@ -0,0 +1,51 @@ +import { AutoProcessor, PaliGemmaProcessor } from "../../../src/transformers.js"; + +import { load_cached_image } from "../../asset_cache.js"; +import { MAX_PROCESSOR_LOAD_TIME, MAX_TEST_EXECUTION_TIME } from 
"../../init.js"; + +export default () => { + const model_id = "hf-internal-testing/tiny-random-PaliGemmaForConditionalGeneration"; + + describe("PaliGemmaProcessor", () => { + /** @type {PaliGemmaProcessor} */ + let processor; + let images = {}; + + beforeAll(async () => { + processor = await AutoProcessor.from_pretrained(model_id); + images = { + white_image: await load_cached_image("white_image"), + }; + }, MAX_PROCESSOR_LOAD_TIME); + + it( + "Image-only (default text)", + async () => { + const { input_ids, pixel_values } = await processor(images.white_image); + expect(input_ids.dims).toEqual([1, 258]); + expect(pixel_values.dims).toEqual([1, 3, 224, 224]); + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + "Single image & text", + async () => { + const { input_ids, pixel_values } = await processor(images.white_image, "What is on the flower?"); + expect(input_ids.dims).toEqual([1, 264]); + expect(pixel_values.dims).toEqual([1, 3, 224, 224]); + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + "Multiple images & text", + async () => { + const { input_ids, pixel_values } = await processor([images.white_image, images.white_image], "Describe the images."); + expect(input_ids.dims).toEqual([1, 518]); + expect(pixel_values.dims).toEqual([2, 3, 224, 224]); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); +}; diff --git a/tests/models/qwen2_vl/test_modeling_qwen2_vl.js b/tests/models/qwen2_vl/test_modeling_qwen2_vl.js index 887a8e092..81af16e95 100644 --- a/tests/models/qwen2_vl/test_modeling_qwen2_vl.js +++ b/tests/models/qwen2_vl/test_modeling_qwen2_vl.js @@ -29,10 +29,7 @@ export default () => { /** @type {Qwen2VLProcessor} */ let processor; beforeAll(async () => { - model = await Qwen2VLForConditionalGeneration.from_pretrained(model_id, { - // TODO move to config - ...DEFAULT_MODEL_OPTIONS, - }); + model = await Qwen2VLForConditionalGeneration.from_pretrained(model_id, DEFAULT_MODEL_OPTIONS); processor = await Qwen2VLProcessor.from_pretrained(model_id); }, MAX_MODEL_LOAD_TIME); diff --git a/tests/models/qwen2_vl/test_processor_qwen2_vl.js b/tests/models/qwen2_vl/test_processor_qwen2_vl.js new file mode 100644 index 000000000..72ccf782f --- /dev/null +++ b/tests/models/qwen2_vl/test_processor_qwen2_vl.js @@ -0,0 +1,44 @@ +import { AutoProcessor, Qwen2VLProcessor } from "../../../src/transformers.js"; + +import { load_cached_image } from "../../asset_cache.js"; +import { MAX_PROCESSOR_LOAD_TIME, MAX_TEST_EXECUTION_TIME } from "../../init.js"; + +export default () => { + describe("Qwen2VLProcessor", () => { + const model_id = "hf-internal-testing/tiny-random-Qwen2VLForConditionalGeneration"; + + /** @type {Qwen2VLProcessor} */ + let processor; + let images = {}; + + beforeAll(async () => { + processor = await AutoProcessor.from_pretrained(model_id); + images = { + white_image: await load_cached_image("white_image"), + }; + }, MAX_PROCESSOR_LOAD_TIME); + + it( + "Image and text", + async () => { + const conversation = [ + { + role: "user", + content: [{ type: "image" }, { type: "text", text: "Describe this image." 
}], + }, + ]; + + const text = processor.apply_chat_template(conversation, { + add_generation_prompt: true, + }); + const { input_ids, attention_mask, pixel_values, image_grid_thw } = await processor(text, images.white_image); + + expect(input_ids.dims).toEqual([1, 89]); + expect(attention_mask.dims).toEqual([1, 89]); + expect(pixel_values.dims).toEqual([256, 1176]); + expect(image_grid_thw.dims).toEqual([1, 3]); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); +}; diff --git a/tests/models/sam/test_modeling_sam.js b/tests/models/sam/test_modeling_sam.js new file mode 100644 index 000000000..ec7beaf6b --- /dev/null +++ b/tests/models/sam/test_modeling_sam.js @@ -0,0 +1,48 @@ +import { SamProcessor, SamModel } from "../../../src/transformers.js"; +import { load_cached_image } from "../../asset_cache.js"; + +import { MAX_MODEL_LOAD_TIME, MAX_TEST_EXECUTION_TIME, MAX_MODEL_DISPOSE_TIME, DEFAULT_MODEL_OPTIONS } from "../../init.js"; + +export default () => { + describe("SamModel", () => { + const model_id = "Xenova/slimsam-77-uniform"; + + /** @type {SamModel} */ + let model; + /** @type {SamProcessor} */ + let processor; + beforeAll(async () => { + model = await SamModel.from_pretrained(model_id, DEFAULT_MODEL_OPTIONS); + processor = await SamProcessor.from_pretrained(model_id); + }, MAX_MODEL_LOAD_TIME); + + it( + "w/ input_points", + async () => { + // Prepare image and input points + const raw_image = await load_cached_image("corgi"); + const input_points = [[[340, 250]]]; + + // Process inputs and perform mask generation + const inputs = await processor(raw_image, { input_points }); + const { pred_masks, iou_scores } = await model(inputs); + + expect(pred_masks.dims).toEqual([1, 1, 3, 256, 256]); + expect(pred_masks.mean().item()).toBeCloseTo(-5.76981782913208, 5); + expect(iou_scores.dims).toEqual([1, 1, 3]); + expect(iou_scores.tolist()).toBeCloseToNested([[[0.8583833575248718, 0.9773167967796326, 0.8511142730712891]]]); + + // Post-process masks + const masks = await processor.post_process_masks(pred_masks, inputs.original_sizes, inputs.reshaped_input_sizes); + expect(masks).toHaveLength(1); + expect(masks[0].dims).toEqual([1, 3, 410, 614]); + expect(masks[0].type).toEqual("bool"); + }, + MAX_TEST_EXECUTION_TIME, + ); + + afterAll(async () => { + await model?.dispose(); + }, MAX_MODEL_DISPOSE_TIME); + }); +}; diff --git a/tests/models/seamless_m4t/test_feature_extraction_seamless_m4t.js b/tests/models/seamless_m4t/test_feature_extraction_seamless_m4t.js new file mode 100644 index 000000000..5392657a3 --- /dev/null +++ b/tests/models/seamless_m4t/test_feature_extraction_seamless_m4t.js @@ -0,0 +1,65 @@ +import { AutoFeatureExtractor, SeamlessM4TFeatureExtractor } from "../../../src/transformers.js"; + +import { load_cached_audio } from "../../asset_cache.js"; +import { MAX_FEATURE_EXTRACTOR_LOAD_TIME, MAX_TEST_EXECUTION_TIME } from "../../init.js"; + +const sum = (array) => Number(array.reduce((a, b) => a + b, array instanceof BigInt64Array ? 
0n : 0)); + +export default () => { + // SeamlessM4TFeatureExtractor + describe("SeamlessM4TFeatureExtractor", () => { + const model_id = "Xenova/wav2vec2-bert-CV16-en"; + + /** @type {SeamlessM4TFeatureExtractor} */ + let feature_extractor; + beforeAll(async () => { + feature_extractor = await AutoFeatureExtractor.from_pretrained(model_id); + }, MAX_FEATURE_EXTRACTOR_LOAD_TIME); + + it( + "default", + async () => { + const audio = await load_cached_audio("mlk"); + + const { input_features, attention_mask } = await feature_extractor(audio); + const { dims, data } = input_features; + expect(dims).toEqual([1, 649, 160]); + expect(attention_mask.dims).toEqual([1, 649]); + + expect(input_features.mean().item()).toBeCloseTo(-2.938903875815413e-8); + expect(data[0]).toBeCloseTo(1.1939343214035034); + expect(data[1]).toBeCloseTo(0.7874255180358887); + expect(data[160]).toBeCloseTo(-0.712975025177002); + expect(data[161]).toBeCloseTo(0.045802414417266846); + expect(data.at(-1)).toBeCloseTo(-1.3328346014022827); + + expect(sum(attention_mask.data)).toEqual(649); + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + "padding (pad_to_multiple_of=2)", + async () => { + const audio = await load_cached_audio("mlk"); + + const { input_features, attention_mask } = await feature_extractor(audio.slice(0, 10000)); + const { dims, data } = input_features; + + // [1, 61, 80] -> [1, 62, 80] -> [1, 31, 160] + expect(dims).toEqual([1, 31, 160]); + expect(attention_mask.dims).toEqual([1, 31]); + + expect(input_features.mean().item()).toBeCloseTo(0.01612919569015503); + expect(data[0]).toBeCloseTo(0.9657132029533386); + expect(data[1]).toBeCloseTo(0.12912897765636444); + expect(data[160]).toBeCloseTo(-1.2364212274551392); + expect(data[161]).toBeCloseTo(-0.9703778028488159); + expect(data.at(-1)).toBeCloseTo(1); // padding value + + expect(sum(attention_mask.data)).toEqual(30); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); +}; diff --git a/tests/models/vision_encoder_decoder/test_modeling_vision_encoder_decoder.js b/tests/models/vision_encoder_decoder/test_modeling_vision_encoder_decoder.js index a13c4bb5e..f70cb682b 100644 --- a/tests/models/vision_encoder_decoder/test_modeling_vision_encoder_decoder.js +++ b/tests/models/vision_encoder_decoder/test_modeling_vision_encoder_decoder.js @@ -1,4 +1,4 @@ -import { GPT2Tokenizer, VisionEncoderDecoderModel, RawImage, full } from "../../../src/transformers.js"; +import { VisionEncoderDecoderModel, full } from "../../../src/transformers.js"; import { MAX_MODEL_LOAD_TIME, MAX_TEST_EXECUTION_TIME, MAX_MODEL_DISPOSE_TIME, DEFAULT_MODEL_OPTIONS } from "../../init.js"; @@ -8,11 +8,8 @@ export default () => { /** @type {VisionEncoderDecoderModel} */ let model; - /** @type {GPT2Tokenizer} */ - let tokenizer; beforeAll(async () => { model = await VisionEncoderDecoderModel.from_pretrained(model_id, DEFAULT_MODEL_OPTIONS); - tokenizer = await GPT2Tokenizer.from_pretrained(model_id); }, MAX_MODEL_LOAD_TIME); it( diff --git a/tests/models/vitpose/test_image_processing_vitpose.js b/tests/models/vitpose/test_image_processing_vitpose.js new file mode 100644 index 000000000..b49afd95d --- /dev/null +++ b/tests/models/vitpose/test_image_processing_vitpose.js @@ -0,0 +1,50 @@ +import { AutoImageProcessor, rand, Tensor, VitPoseImageProcessor } from "../../../src/transformers.js"; + +import { load_cached_image } from "../../asset_cache.js"; +import { MAX_PROCESSOR_LOAD_TIME, MAX_TEST_EXECUTION_TIME } from "../../init.js"; + +export default () => { + describe("VitPoseImageProcessor", () => { + const 
model_id = "onnx-community/vitpose-base-simple"; + + /** @type {VitPoseImageProcessor} */ + let processor; + beforeAll(async () => { + processor = await AutoImageProcessor.from_pretrained(model_id); + }, MAX_PROCESSOR_LOAD_TIME); + + it( + "default", + async () => { + const image = await load_cached_image("tiger"); + const { pixel_values, original_sizes, reshaped_input_sizes } = await processor(image); + + expect(pixel_values.dims).toEqual([1, 3, 256, 192]); + expect(pixel_values.mean().item()).toBeCloseTo(-0.2771204710006714, 6); + + expect(original_sizes).toEqual([[408, 612]]); + expect(reshaped_input_sizes).toEqual([[256, 192]]); + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + "post_process_pose_estimation", + async () => { + const num_classes = 17; + const size = [0, 0, 1000, 1500]; + const heatmaps = rand([1, num_classes, 64, 48]); + + const boxes = [[size]]; + const { bbox, scores, labels, keypoints } = processor.post_process_pose_estimation(heatmaps, boxes, { threshold: null })[0][0]; + + expect(bbox).toEqual(size); + expect(scores).toHaveLength(num_classes); + expect(labels).toHaveLength(num_classes); + expect(keypoints).toHaveLength(num_classes); + expect(keypoints[0]).toHaveLength(2); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); +}; diff --git a/tests/models/wespeaker_resnet/test_feature_extraction_wespeaker_resnet.js b/tests/models/wespeaker_resnet/test_feature_extraction_wespeaker_resnet.js new file mode 100644 index 000000000..27d0dbd6f --- /dev/null +++ b/tests/models/wespeaker_resnet/test_feature_extraction_wespeaker_resnet.js @@ -0,0 +1,56 @@ +import { AutoFeatureExtractor, WeSpeakerFeatureExtractor } from "../../../src/transformers.js"; + +import { MAX_FEATURE_EXTRACTOR_LOAD_TIME, MAX_TEST_EXECUTION_TIME } from "../../init.js"; + +export default () => { + // WeSpeakerFeatureExtractor + describe("WeSpeakerFeatureExtractor", () => { + const model_id = "onnx-community/wespeaker-voxceleb-resnet34-LM"; + + /** @type {WeSpeakerFeatureExtractor} */ + let feature_extractor; + beforeAll(async () => { + feature_extractor = await AutoFeatureExtractor.from_pretrained(model_id); + }, MAX_FEATURE_EXTRACTOR_LOAD_TIME); + + it( + "default", + async () => { + const audio = new Float32Array(16000).map((_, i) => Math.sin(i / 100)); + const { input_features } = await feature_extractor(audio); + const { dims, data } = input_features; + expect(dims).toEqual([1, 98, 80]); + + expect(input_features.mean().item()).toBeCloseTo(5.461731689138105e-8); + expect(data[0]).toBeCloseTo(-0.19300270080566406); + expect(data[1]).toBeCloseTo(-0.05825042724609375); + expect(data[78]).toBeCloseTo(0.2683420181274414); + expect(data[79]).toBeCloseTo(0.26250171661376953); + expect(data[80]).toBeCloseTo(0.19062232971191406); + expect(data.at(-2)).toBeCloseTo(-0.43694400787353516); + expect(data.at(-1)).toBeCloseTo(-0.4266204833984375); + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + "pad to `min_num_frames`", + async () => { + const audio = new Float32Array(3).map((_, i) => Math.sin(i / 100)); + const { input_features } = await feature_extractor(audio); + const { dims, data } = input_features; + expect(dims).toEqual([1, 9, 80]); + + expect(input_features.mean().item()).toBeCloseTo(-0.0000010093053181966146); + expect(data[0]).toBeCloseTo(20.761859893798828); + expect(data[1]).toBeCloseTo(21.02924346923828); + expect(data[78]).toBeCloseTo(19.083993911743164); + expect(data[79]).toBeCloseTo(18.003454208374023); + expect(data[80]).toBeCloseTo(-2.595233917236328); + expect(data.at(-2)).toBeCloseTo(-2.385499954223633); + 
expect(data.at(-1)).toBeCloseTo(-2.2504329681396484); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); +}; diff --git a/tests/models/whisper/test_feature_extraction_whisper.js b/tests/models/whisper/test_feature_extraction_whisper.js new file mode 100644 index 000000000..20e132ff6 --- /dev/null +++ b/tests/models/whisper/test_feature_extraction_whisper.js @@ -0,0 +1,33 @@ +import { AutoFeatureExtractor, WhisperFeatureExtractor } from "../../../src/transformers.js"; + +import { load_cached_audio } from "../../asset_cache.js"; +import { MAX_FEATURE_EXTRACTOR_LOAD_TIME, MAX_TEST_EXECUTION_TIME } from "../../init.js"; + +export default () => { + // WhisperFeatureExtractor + describe("WhisperFeatureExtractor", () => { + const model_id = "Xenova/whisper-tiny.en"; + + /** @type {WhisperFeatureExtractor} */ + let feature_extractor; + beforeAll(async () => { + feature_extractor = await AutoFeatureExtractor.from_pretrained(model_id); + }, MAX_FEATURE_EXTRACTOR_LOAD_TIME); + + it( + "default", + async () => { + const audio = await load_cached_audio("mlk"); + const { input_features } = await feature_extractor(audio); + const { dims, data } = input_features; + expect(dims).toEqual([1, 80, 3000]); + expect(input_features.mean().item()).toBeCloseTo(-0.2813588131551941); + expect(data[0]).toBeCloseTo(0.33168578147888184); + expect(data[1]).toBeCloseTo(0.30986475944519043); + expect(data[81]).toBeCloseTo(0.10727232694625854); + expect(data[3001]).toBeCloseTo(0.2555035352706909); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); +}; diff --git a/tests/pipelines.test.js b/tests/pipelines.test.js index 6bef83297..bfdef3872 100644 --- a/tests/pipelines.test.js +++ b/tests/pipelines.test.js @@ -1,16 +1,17 @@ import { pipeline, cos_sim } from "../src/transformers.js"; import { init, MAX_TEST_EXECUTION_TIME } from "./init.js"; -import { compare, loadAudio } from "./test_utils.js"; +import { collect_and_execute_pipeline_tests, compare, loadAudio } from "./test_utils.js"; // Initialise the testing environment init(); +await collect_and_execute_pipeline_tests("Pipelines"); // NOTE: // Due to a memory leak in Jest, we cannot have multiple tests for a single model. // This is due to how model construction and destruction occurs, in `beforeAll` and `afterAll`, respectively. // As a result, each test is responsible for exactly one model, but we run multiple inputs through it. // By encapsulating model construction and destruction in a single `it` block, we avoid these memory issues. 
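// A minimal sketch of the pattern described in the note above (not part of this patch): one model
// per `it` block, constructed, exercised, and disposed within the same test so Jest never holds
// more than one model in memory at a time. It reuses the `pipeline` import and
// MAX_TEST_EXECUTION_TIME constant from this file; the sst-2 checkpoint is one of the models
// listed below, and the expected "POSITIVE" label is an assumption for illustration only.
it(
  "text classification: single model per test",
  async () => {
    const classifier = await pipeline("text-classification", "Xenova/distilbert-base-uncased-finetuned-sst-2-english");
    const output = await classifier("I love transformers!");
    expect(output[0].label).toBe("POSITIVE"); // assumed label for this checkpoint
    await classifier.dispose(); // release the model before the next test loads its own
  },
  MAX_TEST_EXECUTION_TIME,
);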
-xdescribe("Pipelines", () => { +xdescribe("Pipelines (ignored)", () => { describe("Text classification", () => { // List all models which will be tested const models = ["Xenova/distilbert-base-uncased-finetuned-sst-2-english", "Xenova/toxic-bert"]; diff --git a/tests/pipelines/test_pipelines_audio_classification.js b/tests/pipelines/test_pipelines_audio_classification.js new file mode 100644 index 000000000..e9e4ac703 --- /dev/null +++ b/tests/pipelines/test_pipelines_audio_classification.js @@ -0,0 +1,80 @@ +import { pipeline, AudioClassificationPipeline } from "../../src/transformers.js"; + +import { MAX_MODEL_LOAD_TIME, MAX_TEST_EXECUTION_TIME, MAX_MODEL_DISPOSE_TIME, DEFAULT_MODEL_OPTIONS } from "../init.js"; + +const PIPELINE_ID = "audio-classification"; + +export default () => { + describe("Audio Classification", () => { + const model_id = "hf-internal-testing/tiny-random-unispeech"; + const audios = [new Float32Array(16000).fill(0), Float32Array.from({ length: 16000 }, (_, i) => i)]; + + /** @type {AudioClassificationPipeline} */ + let pipe; + beforeAll(async () => { + pipe = await pipeline(PIPELINE_ID, model_id, DEFAULT_MODEL_OPTIONS); + }, MAX_MODEL_LOAD_TIME); + + it("should be an instance of AudioClassificationPipeline", () => { + expect(pipe).toBeInstanceOf(AudioClassificationPipeline); + }); + + describe("batch_size=1", () => { + it( + "default (top_k=5)", + async () => { + const output = await pipe(audios[0]); + const target = [ + { score: 0.5043687224388123, label: "LABEL_0" }, + { score: 0.4956313371658325, label: "LABEL_1" }, + ]; + expect(output).toBeCloseToNested(target, 5); + }, + MAX_TEST_EXECUTION_TIME, + ); + it( + "custom (top_k=1)", + async () => { + const output = await pipe(audios[0], { top_k: 1 }); + const target = [{ score: 0.5043687224388123, label: "LABEL_0" }]; + expect(output).toBeCloseToNested(target, 5); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); + + describe("batch_size>1", () => { + it( + "default (top_k=5)", + async () => { + const output = await pipe(audios); + const target = [ + [ + { score: 0.5043687224388123, label: "LABEL_0" }, + { score: 0.4956313371658325, label: "LABEL_1" }, + ], + [ + { score: 0.5187293887138367, label: "LABEL_0" }, + { score: 0.4812707006931305, label: "LABEL_1" }, + ], + ]; + expect(output).toBeCloseToNested(target, 5); + }, + MAX_TEST_EXECUTION_TIME, + ); + it( + "custom (top_k=1)", + async () => { + const output = await pipe(audios, { top_k: 1 }); + const target = [[{ score: 0.5043687224388123, label: "LABEL_0" }], [{ score: 0.5187293887138367, label: "LABEL_0" }]]; + expect(output).toBeCloseToNested(target, 5); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); + + afterAll(async () => { + await pipe.dispose(); + }, MAX_MODEL_DISPOSE_TIME); + }); +}; diff --git a/tests/pipelines/test_pipelines_automatic_speech_recognition.js b/tests/pipelines/test_pipelines_automatic_speech_recognition.js new file mode 100644 index 000000000..da9dd88b4 --- /dev/null +++ b/tests/pipelines/test_pipelines_automatic_speech_recognition.js @@ -0,0 +1,129 @@ +import { pipeline, AutomaticSpeechRecognitionPipeline } from "../../src/transformers.js"; + +import { MAX_MODEL_LOAD_TIME, MAX_TEST_EXECUTION_TIME, MAX_MODEL_DISPOSE_TIME, DEFAULT_MODEL_OPTIONS } from "../init.js"; + +const PIPELINE_ID = "automatic-speech-recognition"; + +export default () => { + describe("Automatic Speech Recognition", () => { + describe("whisper", () => { + const model_id = "Xenova/tiny-random-WhisperForConditionalGeneration"; + const SAMPLING_RATE = 16000; + const audios = [new 
Float32Array(SAMPLING_RATE).fill(0), Float32Array.from({ length: SAMPLING_RATE }, (_, i) => i / 16000)]; + const long_audios = [new Float32Array(SAMPLING_RATE * 60).fill(0), Float32Array.from({ length: SAMPLING_RATE * 60 }, (_, i) => (i % 1000) / 1000)]; + + const max_new_tokens = 5; + /** @type {AutomaticSpeechRecognitionPipeline} */ + let pipe; + beforeAll(async () => { + pipe = await pipeline(PIPELINE_ID, model_id, DEFAULT_MODEL_OPTIONS); + }, MAX_MODEL_LOAD_TIME); + + it("should be an instance of AutomaticSpeechRecognitionPipeline", () => { + expect(pipe).toBeInstanceOf(AutomaticSpeechRecognitionPipeline); + }); + + describe("batch_size=1", () => { + it( + "default", + async () => { + const output = await pipe(audios[0], { max_new_tokens }); + const target = { text: "นะคะนะคะURURUR" }; + expect(output).toEqual(target); + }, + MAX_TEST_EXECUTION_TIME, + ); + it( + "transcribe w/ return_timestamps=true", + async () => { + const output = await pipe(audios[0], { return_timestamps: true, max_new_tokens }); + const target = { + text: " riceUR", + chunks: [ + { timestamp: [0.72, 17.72], text: " rice" }, + { timestamp: [17.72, null], text: "UR" }, + ], + }; + expect(output).toBeCloseToNested(target, 5); + }, + MAX_TEST_EXECUTION_TIME, + ); + // TODO add: transcribe w/ return_timestamps="word" + // it( + // "transcribe w/ word-level timestamps", + // async () => { + // const output = await pipe(audios[0], { return_timestamps: "word", max_new_tokens }); + // const target = []; + // expect(output).toBeCloseToNested(target, 5); + // }, + // MAX_TEST_EXECUTION_TIME, + // ); + it( + "transcribe w/ language", + async () => { + const output = await pipe(audios[0], { language: "french", task: "transcribe", max_new_tokens }); + const target = { text: "นะคะนะคะURURUR" }; + expect(output).toEqual(target); + }, + MAX_TEST_EXECUTION_TIME, + ); + it( + "translate", + async () => { + const output = await pipe(audios[0], { language: "french", task: "translate", max_new_tokens }); + const target = { text: "นะคะนะคะURURUR" }; + expect(output).toEqual(target); + }, + MAX_TEST_EXECUTION_TIME, + ); + it( + "audio > 30 seconds", + async () => { + const output = await pipe(long_audios[0], { chunk_length_s: 30, stride_length_s: 5, max_new_tokens }); + const target = { text: "นะคะนะคะURURUR" }; + expect(output).toEqual(target); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); + + afterAll(async () => { + await pipe.dispose(); + }, MAX_MODEL_DISPOSE_TIME); + }); + + describe("wav2vec2", () => { + const model_id = "Xenova/tiny-random-Wav2Vec2ForCTC-ONNX"; + const SAMPLING_RATE = 16000; + const audios = [new Float32Array(SAMPLING_RATE).fill(0), Float32Array.from({ length: SAMPLING_RATE }, (_, i) => i / 16000)]; + const long_audios = [new Float32Array(SAMPLING_RATE * 60).fill(0), Float32Array.from({ length: SAMPLING_RATE * 60 }, (_, i) => (i % 1000) / 1000)]; + + const max_new_tokens = 5; + /** @type {AutomaticSpeechRecognitionPipeline} */ + let pipe; + beforeAll(async () => { + pipe = await pipeline(PIPELINE_ID, model_id, DEFAULT_MODEL_OPTIONS); + }, MAX_MODEL_LOAD_TIME); + + it("should be an instance of AutomaticSpeechRecognitionPipeline", () => { + expect(pipe).toBeInstanceOf(AutomaticSpeechRecognitionPipeline); + }); + + describe("batch_size=1", () => { + it( + "default", + async () => { + const output = await pipe(audios[0], { max_new_tokens }); + const target = { text: "K" }; + expect(output).toEqual(target); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); + + afterAll(async () => { + await pipe.dispose(); + }, 
MAX_MODEL_DISPOSE_TIME); + }); + }); +}; diff --git a/tests/pipelines/test_pipelines_depth_estimation.js b/tests/pipelines/test_pipelines_depth_estimation.js new file mode 100644 index 000000000..f0d5fe887 --- /dev/null +++ b/tests/pipelines/test_pipelines_depth_estimation.js @@ -0,0 +1,57 @@ +import { pipeline, DepthEstimationPipeline, RawImage } from "../../src/transformers.js"; + +import { MAX_MODEL_LOAD_TIME, MAX_TEST_EXECUTION_TIME, MAX_MODEL_DISPOSE_TIME, DEFAULT_MODEL_OPTIONS } from "../init.js"; +import { load_cached_image } from "../asset_cache.js"; + +const PIPELINE_ID = "depth-estimation"; + +export default () => { + describe("Depth Estimation", () => { + const model_id = "hf-internal-testing/tiny-random-DPTForDepthEstimation"; + /** @type {DepthEstimationPipeline} */ + let pipe; + let images; + beforeAll(async () => { + pipe = await pipeline(PIPELINE_ID, model_id, DEFAULT_MODEL_OPTIONS); + images = await Promise.all([load_cached_image("white_image"), load_cached_image("blue_image")]); + }, MAX_MODEL_LOAD_TIME); + + it("should be an instance of DepthEstimationPipeline", () => { + expect(pipe).toBeInstanceOf(DepthEstimationPipeline); + }); + + describe("batch_size=1", () => { + it( + "default", + async () => { + const output = await pipe(images[0]); + expect(output.predicted_depth.dims).toEqual([32, 32]); + expect(output.predicted_depth.mean().item()).toBeCloseTo(0.000006106501587055391, 6); + expect(output.depth.size).toEqual(images[0].size); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); + + describe("batch_size>1", () => { + it( + "default", + async () => { + const output = await pipe(images); + expect(output).toHaveLength(images.length); + expect(output[0].predicted_depth.dims).toEqual([32, 32]); + expect(output[0].predicted_depth.mean().item()).toBeCloseTo(0.000006106501587055391, 6); + expect(output[0].depth.size).toEqual(images[0].size); + expect(output[1].predicted_depth.dims).toEqual([32, 32]); + expect(output[1].predicted_depth.mean().item()).toBeCloseTo(0.0000014548650142387487, 6); + expect(output[1].depth.size).toEqual(images[1].size); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); + + afterAll(async () => { + await pipe.dispose(); + }, MAX_MODEL_DISPOSE_TIME); + }); +}; diff --git a/tests/pipelines/test_pipelines_document_question_answering.js b/tests/pipelines/test_pipelines_document_question_answering.js new file mode 100644 index 000000000..3ebb1e436 --- /dev/null +++ b/tests/pipelines/test_pipelines_document_question_answering.js @@ -0,0 +1,41 @@ +import { pipeline, DocumentQuestionAnsweringPipeline, RawImage } from "../../src/transformers.js"; + +import { MAX_MODEL_LOAD_TIME, MAX_TEST_EXECUTION_TIME, MAX_MODEL_DISPOSE_TIME, DEFAULT_MODEL_OPTIONS } from "../init.js"; + +const PIPELINE_ID = "document-question-answering"; + +export default () => { + describe("Document Question Answering", () => { + const model_id = "hf-internal-testing/tiny-random-VisionEncoderDecoderModel-donutswin-mbart"; + + /** @type {DocumentQuestionAnsweringPipeline} */ + let pipe; + beforeAll(async () => { + pipe = await pipeline(PIPELINE_ID, model_id, DEFAULT_MODEL_OPTIONS); + }, MAX_MODEL_LOAD_TIME); + + it("should be an instance of DocumentQuestionAnsweringPipeline", () => { + expect(pipe).toBeInstanceOf(DocumentQuestionAnsweringPipeline); + }); + + describe("batch_size=1", () => { + it( + "default", + async () => { + const dims = [64, 32, 3]; + const image = new RawImage(new Uint8ClampedArray(dims[0] * dims[1] * dims[2]).fill(255), ...dims); + const question = "What is the invoice number?"; + 
const output = await pipe(image, question); + + const target = [{ answer: null }]; + expect(output).toEqual(target); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); + + afterAll(async () => { + await pipe.dispose(); + }, MAX_MODEL_DISPOSE_TIME); + }); +}; diff --git a/tests/pipelines/test_pipelines_feature_extraction.js b/tests/pipelines/test_pipelines_feature_extraction.js new file mode 100644 index 000000000..b7bb79e59 --- /dev/null +++ b/tests/pipelines/test_pipelines_feature_extraction.js @@ -0,0 +1,121 @@ +import { pipeline, FeatureExtractionPipeline } from "../../src/transformers.js"; + +import { MAX_MODEL_LOAD_TIME, MAX_TEST_EXECUTION_TIME, MAX_MODEL_DISPOSE_TIME, DEFAULT_MODEL_OPTIONS } from "../init.js"; + +const PIPELINE_ID = "feature-extraction"; + +export default () => { + describe("Feature Extraction", () => { + const model_id = "hf-internal-testing/tiny-random-BertModel"; + + const texts = ["This is a simple test.", "Hello world"]; + + /** @type {FeatureExtractionPipeline} */ + let pipe; + beforeAll(async () => { + pipe = await pipeline(PIPELINE_ID, model_id, DEFAULT_MODEL_OPTIONS); + }, MAX_MODEL_LOAD_TIME); + + it("should be an instance of FeatureExtractionPipeline ", () => { + expect(pipe).toBeInstanceOf(FeatureExtractionPipeline); + }); + + describe("batch_size=1", () => { + it( + "default", + async () => { + const output = await pipe(texts[0]); + expect(output.dims).toEqual([1, 20, 32]); + expect(output.type).toEqual("float32"); + expect(output.mean().item()).toBeCloseTo(-1.538501215314625e-9, 6); + }, + MAX_TEST_EXECUTION_TIME, + ); + it( + "w/ cls pooling", + async () => { + const output = await pipe(texts[0], { pooling: "cls" }); + expect(output.dims).toEqual([1, 32]); + expect(output.type).toEqual("float32"); + expect(output.mean().item()).toBeCloseTo(2.491287887096405e-8, 6); + }, + MAX_TEST_EXECUTION_TIME, + ); + it( + "w/ mean pooling & normalization", + async () => { + const output = await pipe(texts[0], { pooling: "mean", normalize: true }); + expect(output.dims).toEqual([1, 32]); + expect(output.type).toEqual("float32"); + expect(output.mean().item()).toBeCloseTo(-2.0245352061465383e-9, 6); + }, + MAX_TEST_EXECUTION_TIME, + ); + it( + "w/ mean pooling & binary quantization", + async () => { + const output = await pipe(texts[0], { pooling: "mean", quantize: true, precision: "binary" }); + expect(output.dims).toEqual([1, 32 / 8]); + expect(output.type).toEqual("int8"); + expect(output.mean().item()).toEqual(-15); + }, + MAX_TEST_EXECUTION_TIME, + ); + it("w/ cls pooling & ubinary quantization", async () => { + const output = await pipe(texts[0], { pooling: "cls", quantize: true, precision: "ubinary" }); + expect(output.dims).toEqual([1, 32 / 8]); + expect(output.type).toEqual("uint8"); + expect(output.mean().item()).toEqual(140); + }); + }); + + describe("batch_size>1", () => { + it( + "default", + async () => { + const output = await pipe(texts); + expect(output.dims).toEqual([texts.length, 20, 32]); + expect(output.type).toEqual("float32"); + expect(output.mean().item()).toBeCloseTo(2.345950544935249e-9, 6); + }, + MAX_TEST_EXECUTION_TIME, + ); + it( + "w/ cls pooling", + async () => { + const output = await pipe(texts, { pooling: "cls" }); + expect(output.dims).toEqual([texts.length, 32]); + expect(output.type).toEqual("float32"); + expect(output.mean().item()).toBeCloseTo(1.6298145055770874e-8, 6); + }, + MAX_TEST_EXECUTION_TIME, + ); + it( + "w/ mean pooling & normalization", + async () => { + const output = await pipe(texts, { pooling: "mean", normalize: true 
}); + expect(output.dims).toEqual([texts.length, 32]); + expect(output.type).toEqual("float32"); + expect(output.mean().item()).toBeCloseTo(-1.538609240014921e-10, 6); + }, + MAX_TEST_EXECUTION_TIME, + ); + it("w/ mean pooling & binary quantization", async () => { + const output = await pipe(texts, { pooling: "mean", quantize: true, precision: "binary" }); + expect(output.dims).toEqual([texts.length, 32 / 8]); + expect(output.type).toEqual("int8"); + expect(output.mean().item()).toEqual(-14); + }); + it("w/ cls pooling & ubinary quantization", async () => { + const output = await pipe(texts, { pooling: "cls", quantize: true, precision: "ubinary" }); + expect(output.dims).toEqual([texts.length, 32 / 8]); + expect(output.type).toEqual("uint8"); + expect(output.mean().item()).toEqual(140); + }); + }); + + afterAll(async () => { + await pipe.dispose(); + }, MAX_MODEL_DISPOSE_TIME); + }); +}; diff --git a/tests/pipelines/test_pipelines_fill_mask.js b/tests/pipelines/test_pipelines_fill_mask.js new file mode 100644 index 000000000..9f0cd3515 --- /dev/null +++ b/tests/pipelines/test_pipelines_fill_mask.js @@ -0,0 +1,100 @@ +import { pipeline, FillMaskPipeline } from "../../src/transformers.js"; + +import { MAX_MODEL_LOAD_TIME, MAX_TEST_EXECUTION_TIME, MAX_MODEL_DISPOSE_TIME, DEFAULT_MODEL_OPTIONS } from "../init.js"; + +const PIPELINE_ID = "fill-mask"; + +export default () => { + describe("Fill Mask", () => { + const model_id = "hf-internal-testing/tiny-random-BertForMaskedLM"; + + /** @type {FillMaskPipeline} */ + let pipe; + beforeAll(async () => { + pipe = await pipeline(PIPELINE_ID, model_id, DEFAULT_MODEL_OPTIONS); + }, MAX_MODEL_LOAD_TIME); + + it("should be an instance of FillMaskPipeline", () => { + expect(pipe).toBeInstanceOf(FillMaskPipeline); + }); + + describe("batch_size=1", () => { + it( + "default (top_k=5)", + async () => { + const output = await pipe("a [MASK] c"); + const target = [ + { score: 0.0013377574505284429, token: 854, token_str: "##ο", sequence: "aο c" }, + { score: 0.001248967950232327, token: 962, token_str: "##ち", sequence: "aち c" }, + { score: 0.0012304208939895034, token: 933, token_str: "##ع", sequence: "aع c" }, + { score: 0.0012301815440878272, token: 313, token_str: "ფ", sequence: "a ფ c" }, + { score: 0.001222139224410057, token: 624, token_str: "未", sequence: "a 未 c" }, + ]; + expect(output).toBeCloseToNested(target, 5); + }, + MAX_TEST_EXECUTION_TIME, + ); + it( + "custom (top_k=2)", + async () => { + const output = await pipe("a [MASK] c", { top_k: 2 }); + const target = [ + { score: 0.0013377574505284429, token: 854, token_str: "##ο", sequence: "aο c" }, + { score: 0.001248967950232327, token: 962, token_str: "##ち", sequence: "aち c" }, + ]; + expect(output).toBeCloseToNested(target, 5); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); + + describe("batch_size>1", () => { + it( + "default (top_k=5)", + async () => { + const output = await pipe(["a [MASK] c", "a b [MASK] c"]); + const target = [ + [ + { score: 0.0013377574505284429, token: 854, token_str: "##ο", sequence: "aο c" }, + { score: 0.001248967950232327, token: 962, token_str: "##ち", sequence: "aち c" }, + { score: 0.0012304208939895034, token: 933, token_str: "##ع", sequence: "aع c" }, + { score: 0.0012301815440878272, token: 313, token_str: "ფ", sequence: "a ფ c" }, + { score: 0.001222139224410057, token: 624, token_str: "未", sequence: "a 未 c" }, + ], + [ + { score: 0.0013287801994010806, token: 962, token_str: "##ち", sequence: "a bち c" }, + { score: 0.0012486606137827039, token: 823, token_str: "##ن", 
sequence: "a bن c" }, + { score: 0.0012320734094828367, token: 1032, token_str: "##ც", sequence: "a bც c" }, + { score: 0.0012295148335397243, token: 854, token_str: "##ο", sequence: "a bο c" }, + { score: 0.0012277684872969985, token: 624, token_str: "未", sequence: "a b 未 c" }, + ], + ]; + expect(output).toBeCloseToNested(target, 5); + }, + MAX_TEST_EXECUTION_TIME, + ); + it( + "custom (top_k=2)", + async () => { + const output = await pipe(["a [MASK] c", "a b [MASK] c"], { top_k: 2 }); + const target = [ + [ + { score: 0.0013377574505284429, token: 854, token_str: "##ο", sequence: "aο c" }, + { score: 0.001248967950232327, token: 962, token_str: "##ち", sequence: "aち c" }, + ], + [ + { score: 0.0013287801994010806, token: 962, token_str: "##ち", sequence: "a bち c" }, + { score: 0.0012486606137827039, token: 823, token_str: "##ن", sequence: "a bن c" }, + ], + ]; + expect(output).toBeCloseToNested(target, 5); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); + + afterAll(async () => { + await pipe.dispose(); + }, MAX_MODEL_DISPOSE_TIME); + }); +}; diff --git a/tests/pipelines/test_pipelines_image_classification.js b/tests/pipelines/test_pipelines_image_classification.js new file mode 100644 index 000000000..93b693a94 --- /dev/null +++ b/tests/pipelines/test_pipelines_image_classification.js @@ -0,0 +1,81 @@ +import { pipeline, ImageClassificationPipeline } from "../../src/transformers.js"; + +import { MAX_MODEL_LOAD_TIME, MAX_TEST_EXECUTION_TIME, MAX_MODEL_DISPOSE_TIME, DEFAULT_MODEL_OPTIONS } from "../init.js"; +import { load_cached_image } from "../asset_cache.js"; + +const PIPELINE_ID = "image-classification"; + +export default () => { + describe("Image Classification", () => { + const model_id = "hf-internal-testing/tiny-random-vit"; + /** @type {ImageClassificationPipeline} */ + let pipe; + let images; + beforeAll(async () => { + pipe = await pipeline(PIPELINE_ID, model_id, DEFAULT_MODEL_OPTIONS); + images = await Promise.all([load_cached_image("white_image"), load_cached_image("blue_image")]); + }, MAX_MODEL_LOAD_TIME); + + it("should be an instance of ImageClassificationPipeline", () => { + expect(pipe).toBeInstanceOf(ImageClassificationPipeline); + }); + + describe("batch_size=1", () => { + it( + "default (top_k=5)", + async () => { + const output = await pipe(images[0]); + const target = [ + { label: "LABEL_1", score: 0.5020533800125122 }, + { label: "LABEL_0", score: 0.4979466497898102 }, + ]; + expect(output).toBeCloseToNested(target, 5); + }, + MAX_TEST_EXECUTION_TIME, + ); + it( + "custom (top_k=1)", + async () => { + const output = await pipe(images[0], { top_k: 1 }); + const target = [{ label: "LABEL_1", score: 0.5020533800125122 }]; + expect(output).toBeCloseToNested(target, 5); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); + + describe("batch_size>1", () => { + it( + "default (top_k=5)", + async () => { + const output = await pipe(images); + const target = [ + [ + { label: "LABEL_1", score: 0.5020533800125122 }, + { label: "LABEL_0", score: 0.4979466497898102 }, + ], + [ + { label: "LABEL_1", score: 0.519227921962738 }, + { label: "LABEL_0", score: 0.4807720482349396 }, + ], + ]; + expect(output).toBeCloseToNested(target, 5); + }, + MAX_TEST_EXECUTION_TIME, + ); + it( + "custom (top_k=1)", + async () => { + const output = await pipe(images, { top_k: 1 }); + const target = [[{ label: "LABEL_1", score: 0.5020533800125122 }], [{ label: "LABEL_1", score: 0.519227921962738 }]]; + expect(output).toBeCloseToNested(target, 5); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); + + afterAll(async () 
=> { + await pipe.dispose(); + }, MAX_MODEL_DISPOSE_TIME); + }); +}; diff --git a/tests/pipelines/test_pipelines_image_feature_extraction.js b/tests/pipelines/test_pipelines_image_feature_extraction.js new file mode 100644 index 000000000..c6e7980ee --- /dev/null +++ b/tests/pipelines/test_pipelines_image_feature_extraction.js @@ -0,0 +1,51 @@ +import { pipeline, ImageFeatureExtractionPipeline } from "../../src/transformers.js"; + +import { MAX_MODEL_LOAD_TIME, MAX_TEST_EXECUTION_TIME, MAX_MODEL_DISPOSE_TIME, DEFAULT_MODEL_OPTIONS } from "../init.js"; +import { load_cached_image } from "../asset_cache.js"; + +const PIPELINE_ID = "image-feature-extraction"; + +export default () => { + describe("Image Feature Extraction", () => { + const model_id = "hf-internal-testing/tiny-random-ViTMAEModel"; + /** @type {ImageFeatureExtractionPipeline} */ + let pipe; + let images; + beforeAll(async () => { + pipe = await pipeline(PIPELINE_ID, model_id, DEFAULT_MODEL_OPTIONS); + images = await Promise.all([load_cached_image("white_image"), load_cached_image("blue_image")]); + }, MAX_MODEL_LOAD_TIME); + + it("should be an instance of ImageFeatureExtractionPipeline", () => { + expect(pipe).toBeInstanceOf(ImageFeatureExtractionPipeline); + }); + + describe("batch_size=1", () => { + it( + "default", + async () => { + const output = await pipe(images[0]); + expect(output.dims).toEqual([1, 91, 32]); + expect(output.mean().item()).toBeCloseTo(-8.507473614471905e-10, 6); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); + + describe("batch_size>1", () => { + it( + "default", + async () => { + const output = await pipe(images); + expect(output.dims).toEqual([images.length, 91, 32]); + expect(output.mean().item()).toBeCloseTo(-5.997602414709036e-10, 6); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); + + afterAll(async () => { + await pipe.dispose(); + }, MAX_MODEL_DISPOSE_TIME); + }); +}; diff --git a/tests/pipelines/test_pipelines_image_segmentation.js b/tests/pipelines/test_pipelines_image_segmentation.js new file mode 100644 index 000000000..7358601ea --- /dev/null +++ b/tests/pipelines/test_pipelines_image_segmentation.js @@ -0,0 +1,119 @@ +import { pipeline, ImageSegmentationPipeline } from "../../src/transformers.js"; + +import { MAX_MODEL_LOAD_TIME, MAX_TEST_EXECUTION_TIME, MAX_MODEL_DISPOSE_TIME, DEFAULT_MODEL_OPTIONS } from "../init.js"; +import { load_cached_image } from "../asset_cache.js"; + +const PIPELINE_ID = "image-segmentation"; + +export default () => { + describe("Image Segmentation", () => { + describe("Panoptic Segmentation", () => { + const model_id = "Xenova/detr-resnet-50-panoptic"; + /** @type {ImageSegmentationPipeline} */ + let pipe; + beforeAll(async () => { + pipe = await pipeline(PIPELINE_ID, model_id, DEFAULT_MODEL_OPTIONS); + }, MAX_MODEL_LOAD_TIME); + + it("should be an instance of ImageSegmentationPipeline", () => { + expect(pipe).toBeInstanceOf(ImageSegmentationPipeline); + }); + + it( + "single", + async () => { + const image = await load_cached_image("cats"); + + const output = await pipe(image); + + // First, check mask shapes + for (const item of output) { + expect(item.mask.width).toEqual(image.width); + expect(item.mask.height).toEqual(image.height); + expect(item.mask.channels).toEqual(1); + delete item.mask; // No longer needed + } + + // Next, compare scores and labels + const target = [ + { + score: 0.9918501377105713, + label: "cat", + }, + { + score: 0.9985815286636353, + label: "remote", + }, + { + score: 0.999537467956543, + label: "remote", + }, + { + score: 0.9919270277023315, 
+ label: "couch", + }, + { + score: 0.9993696808815002, + label: "cat", + }, + ]; + + expect(output).toBeCloseToNested(target, 2); + }, + MAX_TEST_EXECUTION_TIME, + ); + + afterAll(async () => { + await pipe.dispose(); + }, MAX_MODEL_DISPOSE_TIME); + }); + + describe("Semantic Segmentation", () => { + const model_id = "Xenova/segformer_b0_clothes"; + /** @type {ImageSegmentationPipeline } */ + let pipe; + beforeAll(async () => { + pipe = await pipeline(PIPELINE_ID, model_id, DEFAULT_MODEL_OPTIONS); + }, MAX_MODEL_LOAD_TIME); + + it( + "single", + async () => { + const image = await load_cached_image("man_on_car"); + + const output = await pipe(image); + + // First, check mask shapes + for (const item of output) { + expect(item.mask.width).toEqual(image.width); + expect(item.mask.height).toEqual(image.height); + expect(item.mask.channels).toEqual(1); + delete item.mask; // No longer needed + } + + // Next, compare scores and labels + const target = [ + { score: null, label: "Background" }, + { score: null, label: "Hair" }, + { score: null, label: "Upper-clothes" }, + { score: null, label: "Pants" }, + { score: null, label: "Left-shoe" }, + { score: null, label: "Right-shoe" }, + { score: null, label: "Face" }, + { score: null, label: "Right-leg" }, + { score: null, label: "Left-arm" }, + { score: null, label: "Right-arm" }, + { score: null, label: "Bag" }, + ]; + + expect(output).toBeCloseToNested(target, 2); + }, + MAX_TEST_EXECUTION_TIME, + ); + + afterAll(async () => { + await pipe.dispose(); + }, MAX_MODEL_DISPOSE_TIME); + }); + }); +}; diff --git a/tests/pipelines/test_pipelines_image_to_image.js b/tests/pipelines/test_pipelines_image_to_image.js new file mode 100644 index 000000000..c7b9a00d2 --- /dev/null +++ b/tests/pipelines/test_pipelines_image_to_image.js @@ -0,0 +1,56 @@ +import { pipeline, ImageToImagePipeline } from "../../src/transformers.js"; + +import { MAX_MODEL_LOAD_TIME, MAX_TEST_EXECUTION_TIME, MAX_MODEL_DISPOSE_TIME, DEFAULT_MODEL_OPTIONS } from "../init.js"; +import { load_cached_image } from "../asset_cache.js"; + +const PIPELINE_ID = "image-to-image"; + +export default () => { + describe("Image to Image", () => { + const model_id = "hf-internal-testing/tiny-random-Swin2SRForImageSuperResolution"; + /** @type {ImageToImagePipeline} */ + let pipe; + let images; + beforeAll(async () => { + pipe = await pipeline(PIPELINE_ID, model_id, DEFAULT_MODEL_OPTIONS); + images = await Promise.all([load_cached_image("white_image"), load_cached_image("blue_image")]); + }, MAX_MODEL_LOAD_TIME); + + it("should be an instance of ImageToImagePipeline", () => { + expect(pipe).toBeInstanceOf(ImageToImagePipeline); + }); + + describe("batch_size=1", () => { + it( + "default", + async () => { + const output = await pipe(images[0]); + expect(output.size).toEqual([64, 64]); + expect(output.channels).toEqual(3); + expect(output.data.reduce((a, b) => a + b, 0) / output.data.length).toBeCloseTo(110.107421875, 3); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); + + describe("batch_size>1", () => { + it( + "default", + async () => { + const output = await pipe(images); + expect(output[0].size).toEqual([64, 64]); + expect(output[0].channels).toEqual(3); + expect(output[0].data.reduce((a, b) => a + b, 0) / output[0].data.length).toBeCloseTo(110.107421875, 3); + expect(output[1].size).toEqual([64, 64]); + expect(output[1].channels).toEqual(3); + expect(output[1].data.reduce((a, b) => a + b, 0) / output[1].data.length).toBeCloseTo(110.60196940104167, 3); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); + + 
afterAll(async () => { + await pipe.dispose(); + }, MAX_MODEL_DISPOSE_TIME); + }); +}; diff --git a/tests/pipelines/test_pipelines_image_to_text.js b/tests/pipelines/test_pipelines_image_to_text.js new file mode 100644 index 000000000..f7d951811 --- /dev/null +++ b/tests/pipelines/test_pipelines_image_to_text.js @@ -0,0 +1,51 @@ +import { pipeline, ImageToTextPipeline } from "../../src/transformers.js"; + +import { MAX_MODEL_LOAD_TIME, MAX_TEST_EXECUTION_TIME, MAX_MODEL_DISPOSE_TIME, DEFAULT_MODEL_OPTIONS } from "../init.js"; +import { load_cached_image } from "../asset_cache.js"; + +const PIPELINE_ID = "image-to-text"; + +export default () => { + describe("Image to Text", () => { + const model_id = "hf-internal-testing/tiny-random-VisionEncoderDecoderModel-vit-gpt2"; + /** @type {ImageToTextPipeline} */ + let pipe; + let images; + beforeAll(async () => { + pipe = await pipeline(PIPELINE_ID, model_id, DEFAULT_MODEL_OPTIONS); + images = await Promise.all([load_cached_image("white_image"), load_cached_image("blue_image")]); + }, MAX_MODEL_LOAD_TIME); + + it("should be an instance of ImageToTextPipeline", () => { + expect(pipe).toBeInstanceOf(ImageToTextPipeline); + }); + + describe("batch_size=1", () => { + it( + "default", + async () => { + const output = await pipe(images[0]); + const target = [{ generated_text: "" }]; + expect(output).toEqual(target); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); + + describe("batch_size>1", () => { + it( + "default", + async () => { + const output = await pipe(images); + const target = [[{ generated_text: "" }], [{ generated_text: "" }]]; + expect(output).toEqual(target); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); + + afterAll(async () => { + await pipe.dispose(); + }, MAX_MODEL_DISPOSE_TIME); + }); +}; diff --git a/tests/pipelines/test_pipelines_object_detection.js b/tests/pipelines/test_pipelines_object_detection.js new file mode 100644 index 000000000..e9b0375d0 --- /dev/null +++ b/tests/pipelines/test_pipelines_object_detection.js @@ -0,0 +1,131 @@ +import { pipeline, ObjectDetectionPipeline } from "../../src/transformers.js"; + +import { MAX_MODEL_LOAD_TIME, MAX_TEST_EXECUTION_TIME, MAX_MODEL_DISPOSE_TIME, DEFAULT_MODEL_OPTIONS } from "../init.js"; +import { load_cached_image } from "../asset_cache.js"; + +const PIPELINE_ID = "object-detection"; + +export default () => { + describe("Object Detection", () => { + describe("yolos", () => { + const model_id = "Xenova/yolos-tiny"; + /** @type {ObjectDetectionPipeline} */ + let pipe; + beforeAll(async () => { + pipe = await pipeline(PIPELINE_ID, model_id, DEFAULT_MODEL_OPTIONS); + }, MAX_MODEL_LOAD_TIME); + + it("should be an instance of ObjectDetectionPipeline", () => { + expect(pipe).toBeInstanceOf(ObjectDetectionPipeline); + }); + + it( + "single + threshold", + async () => { + const image = await load_cached_image("cats"); + const output = await pipe(image, { threshold: 0.9 }); + + const target = [ + { + score: 0.9921281933784485, + label: "remote", + box: { xmin: 32, ymin: 78, xmax: 185, ymax: 117 }, + }, + { + score: 0.9884883165359497, + label: "remote", + box: { xmin: 324, ymin: 82, xmax: 376, ymax: 191 }, + }, + { + score: 0.9197800159454346, + label: "cat", + box: { xmin: 5, ymin: 56, xmax: 321, ymax: 469 }, + }, + { + score: 0.9300552606582642, + label: "cat", + box: { xmin: 332, ymin: 25, xmax: 638, ymax: 369 }, + }, + ]; + expect(output).toBeCloseToNested(target, 5); + }, + MAX_TEST_EXECUTION_TIME, + ); + + afterAll(async () => { + await pipe.dispose(); + }, MAX_MODEL_DISPOSE_TIME); + }); + + 
describe("tiny-random", () => { + const model_id = "hf-internal-testing/tiny-random-DetrForObjectDetection"; + + /** @type {ObjectDetectionPipeline} */ + let pipe; + let images; + + beforeAll(async () => { + pipe = await pipeline(PIPELINE_ID, model_id, DEFAULT_MODEL_OPTIONS); + images = await Promise.all([load_cached_image("white_image"), load_cached_image("blue_image")]); + }, MAX_MODEL_LOAD_TIME); + + it("should be an instance of ObjectDetectionPipeline", () => { + expect(pipe).toBeInstanceOf(ObjectDetectionPipeline); + }); + + describe("batch_size=1", () => { + it( + "default (threshold unset)", + async () => { + const output = await pipe(images[0]); + const target = []; + expect(output).toBeCloseToNested(target, 5); + }, + MAX_TEST_EXECUTION_TIME, + ); + it( + "default (threshold=0)", + async () => { + const output = await pipe(images[0], { threshold: 0 }); + const target = [ + { score: 0.020360443741083145, label: "LABEL_31", box: { xmin: 56, ymin: 55, xmax: 169, ymax: 167 } }, + { score: 0.020360419526696205, label: "LABEL_31", box: { xmin: 56, ymin: 55, xmax: 169, ymax: 167 } }, + { score: 0.02036038413643837, label: "LABEL_31", box: { xmin: 56, ymin: 55, xmax: 169, ymax: 167 } }, + { score: 0.020360447466373444, label: "LABEL_31", box: { xmin: 56, ymin: 55, xmax: 169, ymax: 167 } }, + { score: 0.020360389724373817, label: "LABEL_31", box: { xmin: 56, ymin: 55, xmax: 169, ymax: 167 } }, + { score: 0.020360423251986504, label: "LABEL_31", box: { xmin: 56, ymin: 55, xmax: 169, ymax: 167 } }, + { score: 0.02036040835082531, label: "LABEL_31", box: { xmin: 56, ymin: 55, xmax: 169, ymax: 167 } }, + { score: 0.020360363647341728, label: "LABEL_31", box: { xmin: 56, ymin: 55, xmax: 169, ymax: 167 } }, + { score: 0.020360389724373817, label: "LABEL_31", box: { xmin: 56, ymin: 55, xmax: 169, ymax: 167 } }, + { score: 0.020360389724373817, label: "LABEL_31", box: { xmin: 56, ymin: 55, xmax: 169, ymax: 167 } }, + { score: 0.020360343158245087, label: "LABEL_31", box: { xmin: 56, ymin: 55, xmax: 169, ymax: 167 } }, + { score: 0.020360423251986504, label: "LABEL_31", box: { xmin: 56, ymin: 55, xmax: 169, ymax: 167 } }, + ]; + expect(output).toBeCloseToNested(target, 5); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); + + // TODO: Add batched support to object detection pipeline + // describe('batch_size>1', () => { + // it('default (threshold unset)', async () => { + // const output = await pipe(images); + // console.log(output); + // const target = []; + // expect(output).toBeCloseToNested(target, 5); + // }, MAX_TEST_EXECUTION_TIME); + // it('default (threshold=0)', async () => { + // const output = await pipe(images, { threshold: 0 }); + // console.log(output); + // const target = []; + // expect(output).toBeCloseToNested(target, 5); + // }, MAX_TEST_EXECUTION_TIME); + // }); + + afterAll(async () => { + await pipe.dispose(); + }, MAX_MODEL_DISPOSE_TIME); + }); + }); +}; diff --git a/tests/pipelines/test_pipelines_question_answering.js b/tests/pipelines/test_pipelines_question_answering.js new file mode 100644 index 000000000..ff346c03b --- /dev/null +++ b/tests/pipelines/test_pipelines_question_answering.js @@ -0,0 +1,49 @@ +import { pipeline, QuestionAnsweringPipeline } from "../../src/transformers.js"; + +import { MAX_MODEL_LOAD_TIME, MAX_TEST_EXECUTION_TIME, MAX_MODEL_DISPOSE_TIME, DEFAULT_MODEL_OPTIONS } from "../init.js"; + +const PIPELINE_ID = "question-answering"; + +export default () => { + describe("Question Answering", () => { + const model_id = 
"hf-internal-testing/tiny-random-BertForQuestionAnswering"; + /** @type {QuestionAnsweringPipeline} */ + let pipe; + beforeAll(async () => { + pipe = await pipeline(PIPELINE_ID, model_id, DEFAULT_MODEL_OPTIONS); + }, MAX_MODEL_LOAD_TIME); + + it("should be an instance of QuestionAnsweringPipeline", () => { + expect(pipe).toBeInstanceOf(QuestionAnsweringPipeline); + }); + + describe("batch_size=1", () => { + it( + "default (top_k=1)", + async () => { + const output = await pipe("a", "b c"); + const target = { score: 0.11395696550607681, /* start: 0, end: 1, */ answer: "b" }; + expect(output).toBeCloseToNested(target, 5); + }, + MAX_TEST_EXECUTION_TIME, + ); + it( + "custom (top_k=3)", + async () => { + const output = await pipe("a", "b c", { top_k: 3 }); + const target = [ + { score: 0.11395696550607681, /* start: 0, end: 1, */ answer: "b" }, + { score: 0.11300431191921234, /* start: 2, end: 3, */ answer: "c" }, + { score: 0.10732574015855789, /* start: 0, end: 3, */ answer: "b c" }, + ]; + expect(output).toBeCloseToNested(target, 5); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); + + afterAll(async () => { + await pipe.dispose(); + }, MAX_MODEL_DISPOSE_TIME); + }); +}; diff --git a/tests/pipelines/test_pipelines_summarization.js b/tests/pipelines/test_pipelines_summarization.js new file mode 100644 index 000000000..877fd81e9 --- /dev/null +++ b/tests/pipelines/test_pipelines_summarization.js @@ -0,0 +1,40 @@ +import { pipeline, SummarizationPipeline } from "../../src/transformers.js"; + +import { MAX_MODEL_LOAD_TIME, MAX_TEST_EXECUTION_TIME, MAX_MODEL_DISPOSE_TIME, DEFAULT_MODEL_OPTIONS } from "../init.js"; + +const PIPELINE_ID = "summarization"; + +export default () => { + describe("Summarization", () => { + const model_id = "hf-internal-testing/tiny-random-T5ForConditionalGeneration"; + + /** @type {SummarizationPipeline} */ + let pipe; + beforeAll(async () => { + pipe = await pipeline(PIPELINE_ID, model_id, DEFAULT_MODEL_OPTIONS); + }, MAX_MODEL_LOAD_TIME); + + it("should be an instance of SummarizationPipeline", () => { + expect(pipe).toBeInstanceOf(SummarizationPipeline); + }); + + describe("batch_size=1", () => { + it( + "default", + async () => { + const text = "This is a test."; + const output = await pipe(text, { + max_new_tokens: 5, + }); + const target = [{ summary_text: "" }]; + expect(output).toEqual(target); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); + + afterAll(async () => { + await pipe.dispose(); + }, MAX_MODEL_DISPOSE_TIME); + }); +}; diff --git a/tests/pipelines/test_pipelines_text2text_generation.js b/tests/pipelines/test_pipelines_text2text_generation.js new file mode 100644 index 000000000..0084fbbd2 --- /dev/null +++ b/tests/pipelines/test_pipelines_text2text_generation.js @@ -0,0 +1,40 @@ +import { pipeline, Text2TextGenerationPipeline } from "../../src/transformers.js"; + +import { MAX_MODEL_LOAD_TIME, MAX_TEST_EXECUTION_TIME, MAX_MODEL_DISPOSE_TIME, DEFAULT_MODEL_OPTIONS } from "../init.js"; + +const PIPELINE_ID = "text2text-generation"; + +export default () => { + describe("Text to Text Generation", () => { + const model_id = "hf-internal-testing/tiny-random-T5ForConditionalGeneration"; + + /** @type {Text2TextGenerationPipeline} */ + let pipe; + beforeAll(async () => { + pipe = await pipeline(PIPELINE_ID, model_id, DEFAULT_MODEL_OPTIONS); + }, MAX_MODEL_LOAD_TIME); + + it("should be an instance of Text2TextGenerationPipeline", () => { + expect(pipe).toBeInstanceOf(Text2TextGenerationPipeline); + }); + + describe("batch_size=1", () => { + it( + "default", + async () 
=> { + const text = "This is a test."; + const output = await pipe(text, { + max_new_tokens: 5, + }); + const target = [{ generated_text: "" }]; + expect(output).toEqual(target); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); + + afterAll(async () => { + await pipe.dispose(); + }, MAX_MODEL_DISPOSE_TIME); + }); +}; diff --git a/tests/pipelines/test_pipelines_text_classification.js b/tests/pipelines/test_pipelines_text_classification.js new file mode 100644 index 000000000..13a78f1a6 --- /dev/null +++ b/tests/pipelines/test_pipelines_text_classification.js @@ -0,0 +1,107 @@ +import { pipeline, TextClassificationPipeline } from "../../src/transformers.js"; + +import { MAX_MODEL_LOAD_TIME, MAX_TEST_EXECUTION_TIME, MAX_MODEL_DISPOSE_TIME, DEFAULT_MODEL_OPTIONS } from "../init.js"; + +const PIPELINE_ID = "text-classification"; + +export default () => { + describe("Text Classification", () => { + const model_id = "hf-internal-testing/tiny-random-BertForSequenceClassification"; + + /** @type {TextClassificationPipeline} */ + let pipe; + beforeAll(async () => { + pipe = await pipeline(PIPELINE_ID, model_id, DEFAULT_MODEL_OPTIONS); + }, MAX_MODEL_LOAD_TIME); + + it("should be an instance of TextClassificationPipeline", () => { + expect(pipe).toBeInstanceOf(TextClassificationPipeline); + }); + + describe("batch_size=1", () => { + it( + "default (top_k=1)", + async () => { + const output = await pipe("a"); + const target = [{ label: "LABEL_0", score: 0.5076976418495178 }]; + expect(output).toBeCloseToNested(target, 5); + }, + MAX_TEST_EXECUTION_TIME, + ); + it( + "custom (top_k=2)", + async () => { + const output = await pipe("a", { top_k: 2 }); + const target = [ + { label: "LABEL_0", score: 0.5076976418495178 }, + { label: "LABEL_1", score: 0.49230238795280457 }, + ]; + expect(output).toBeCloseToNested(target, 5); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); + + describe("batch_size>1", () => { + it( + "default (top_k=1)", + async () => { + const output = await pipe(["a", "b c"]); + const target = [ + { label: "LABEL_0", score: 0.5076976418495178 }, + { label: "LABEL_0", score: 0.5077522993087769 }, + ]; + expect(output).toBeCloseToNested(target, 5); + }, + MAX_TEST_EXECUTION_TIME, + ); + it( + "custom (top_k=2)", + async () => { + const output = await pipe(["a", "b c"], { top_k: 2 }); + const target = [ + [ + { label: "LABEL_0", score: 0.5076976418495178 }, + { label: "LABEL_1", score: 0.49230238795280457 }, + ], + [ + { label: "LABEL_0", score: 0.5077522993087769 }, + { label: "LABEL_1", score: 0.49224773049354553 }, + ], + ]; + expect(output).toBeCloseToNested(target, 5); + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + "multi_label_classification", + async () => { + const problem_type = pipe.model.config.problem_type; + pipe.model.config.problem_type = "multi_label_classification"; + + const output = await pipe(["a", "b c"], { top_k: 2 }); + const target = [ + [ + { label: "LABEL_0", score: 0.5001373887062073 }, + { label: "LABEL_1", score: 0.49243971705436707 }, + ], + [ + { label: "LABEL_0", score: 0.5001326203346252 }, + { label: "LABEL_1", score: 0.492380291223526 }, + ], + ]; + expect(output).toBeCloseToNested(target, 5); + + // Reset problem type + pipe.model.config.problem_type = problem_type; + }, + MAX_TEST_EXECUTION_TIME, + ); + }); + + afterAll(async () => { + await pipe.dispose(); + }, MAX_MODEL_DISPOSE_TIME); + }); +}; diff --git a/tests/pipelines/test_pipelines_text_generation.js b/tests/pipelines/test_pipelines_text_generation.js new file mode 100644 index 000000000..085808263 --- 
/dev/null +++ b/tests/pipelines/test_pipelines_text_generation.js @@ -0,0 +1,109 @@ +import { pipeline, TextGenerationPipeline } from "../../src/transformers.js"; + +import { MAX_MODEL_LOAD_TIME, MAX_TEST_EXECUTION_TIME, MAX_MODEL_DISPOSE_TIME, DEFAULT_MODEL_OPTIONS } from "../init.js"; + +const PIPELINE_ID = "text-generation"; + +export default () => { + describe("Text Generation", () => { + const model_id = "hf-internal-testing/tiny-random-LlamaForCausalLM"; + + /** @type {TextGenerationPipeline} */ + let pipe; + beforeAll(async () => { + pipe = await pipeline(PIPELINE_ID, model_id, DEFAULT_MODEL_OPTIONS); + }, MAX_MODEL_LOAD_TIME); + + it("should be an instance of TextGenerationPipeline", () => { + expect(pipe).toBeInstanceOf(TextGenerationPipeline); + }); + + describe("batch_size=1", () => { + const text_input = "hello"; + const generated_text_target = "erdingsAndroid Load"; + const text_target = [{ generated_text: text_input + generated_text_target }]; + const new_text_target = [{ generated_text: generated_text_target }]; + + const chat_input = [ + { role: "system", content: "a" }, + { role: "user", content: "b" }, + ]; + const chat_target = [ + { + generated_text: [ + { role: "system", content: "a" }, + { role: "user", content: "b" }, + { role: "assistant", content: " Southern abund Load" }, + ], + }, + ]; + + it( + "text input (single)", + async () => { + const output = await pipe(text_input, { max_new_tokens: 3 }); + expect(output).toEqual(text_target); + }, + MAX_TEST_EXECUTION_TIME, + ); + it( + "text input (list)", + async () => { + const output = await pipe([text_input], { max_new_tokens: 3 }); + expect(output).toEqual([text_target]); + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + "text input (single) - return_full_text=false", + async () => { + const output = await pipe(text_input, { max_new_tokens: 3, return_full_text: false }); + expect(output).toEqual(new_text_target); + }, + MAX_TEST_EXECUTION_TIME, + ); + it( + "text input (list) - return_full_text=false", + async () => { + const output = await pipe([text_input], { max_new_tokens: 3, return_full_text: false }); + expect(output).toEqual([new_text_target]); + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + "chat input (single)", + async () => { + const output = await pipe(chat_input, { max_new_tokens: 3 }); + expect(output).toEqual(chat_target); + }, + MAX_TEST_EXECUTION_TIME, + ); + it( + "chat input (list)", + async () => { + const output = await pipe([chat_input], { max_new_tokens: 3 }); + expect(output).toEqual([chat_target]); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); + + // TODO: Fix batch_size>1 + // describe('batch_size>1', () => { + // it('default', async () => { + // const output = await pipe(['hello', 'hello world']); + // const target = [ + // [{generated_text: 'helloerdingsAndroid Load'}], + // [{generated_text: 'hello world zerosMillнал'}], + // ]; + // expect(output).toEqual(target); + // }, MAX_TEST_EXECUTION_TIME); + // }); + + afterAll(async () => { + await pipe.dispose(); + }, MAX_MODEL_DISPOSE_TIME); + }); +}; diff --git a/tests/pipelines/test_pipelines_text_to_audio.js b/tests/pipelines/test_pipelines_text_to_audio.js new file mode 100644 index 000000000..d37f0203e --- /dev/null +++ b/tests/pipelines/test_pipelines_text_to_audio.js @@ -0,0 +1,37 @@ +import { pipeline, TextToAudioPipeline } from "../../src/transformers.js"; + +import { MAX_MODEL_LOAD_TIME, MAX_TEST_EXECUTION_TIME, MAX_MODEL_DISPOSE_TIME, DEFAULT_MODEL_OPTIONS } from "../init.js"; + +const PIPELINE_ID = "text-to-audio"; + +export default () 
=> { + describe("Text to Audio", () => { + const model_id = "Xenova/tiny-random-vits"; + + /** @type {TextToAudioPipeline} */ + let pipe; + beforeAll(async () => { + pipe = await pipeline(PIPELINE_ID, model_id, DEFAULT_MODEL_OPTIONS); + }, MAX_MODEL_LOAD_TIME); + + it("should be an instance of TextToAudioPipeline", () => { + expect(pipe).toBeInstanceOf(TextToAudioPipeline); + }); + + it( + "default", + async () => { + const output = await pipe("hello"); + expect(output.audio).toHaveLength(6400); + // NOTE: The mean value is not deterministic, so we just check the first few digits + expect(output.audio.reduce((a, b) => a + b, 0) / output.audio.length).toBeCloseTo(-0.0125, 2); + expect(output.sampling_rate).toEqual(16000); + }, + MAX_TEST_EXECUTION_TIME, + ); + + afterAll(async () => { + await pipe.dispose(); + }, MAX_MODEL_DISPOSE_TIME); + }); +}; diff --git a/tests/pipelines/test_pipelines_token_classification.js b/tests/pipelines/test_pipelines_token_classification.js new file mode 100644 index 000000000..9d91813d4 --- /dev/null +++ b/tests/pipelines/test_pipelines_token_classification.js @@ -0,0 +1,157 @@ +import { pipeline, TokenClassificationPipeline } from "../../src/transformers.js"; + +import { MAX_MODEL_LOAD_TIME, MAX_TEST_EXECUTION_TIME, MAX_MODEL_DISPOSE_TIME, DEFAULT_MODEL_OPTIONS } from "../init.js"; + +const PIPELINE_ID = "token-classification"; + +export default () => { + describe("Token Classification", () => { + const model_id = "hf-internal-testing/tiny-random-BertForTokenClassification"; + /** @type {TokenClassificationPipeline} */ + let pipe; + beforeAll(async () => { + pipe = await pipeline(PIPELINE_ID, model_id, DEFAULT_MODEL_OPTIONS); + }, MAX_MODEL_LOAD_TIME); + + it("should be an instance of TokenClassificationPipeline", () => { + expect(pipe).toBeInstanceOf(TokenClassificationPipeline); + }); + + describe("batch_size=1", () => { + it( + "default", + async () => { + const output = await pipe("1 2 3"); + + // TODO: Add start/end to target + const target = [ + { + entity: "LABEL_0", + score: 0.5292708, + index: 1, + word: "1", + // 'start': 0, 'end': 1 + }, + { + entity: "LABEL_0", + score: 0.5353687, + index: 2, + word: "2", + // 'start': 2, 'end': 3 + }, + { + entity: "LABEL_1", + score: 0.51381934, + index: 3, + word: "3", + // 'start': 4, 'end': 5 + }, + ]; + expect(output).toBeCloseToNested(target, 5); + }, + MAX_TEST_EXECUTION_TIME, + ); + it( + "custom (ignore_labels set)", + async () => { + const output = await pipe("1 2 3", { ignore_labels: ["LABEL_0"] }); + const target = [ + { + entity: "LABEL_1", + score: 0.51381934, + index: 3, + word: "3", + // 'start': 4, 'end': 5 + }, + ]; + expect(output).toBeCloseToNested(target, 5); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); + + describe("batch_size>1", () => { + it( + "default", + async () => { + const output = await pipe(["1 2 3", "4 5"]); + const target = [ + [ + { + entity: "LABEL_0", + score: 0.5292708, + index: 1, + word: "1", + // 'start': 0, 'end': 1 + }, + { + entity: "LABEL_0", + score: 0.5353687, + index: 2, + word: "2", + // 'start': 2, 'end': 3 + }, + { + entity: "LABEL_1", + score: 0.51381934, + index: 3, + word: "3", + // 'start': 4, 'end': 5 + }, + ], + [ + { + entity: "LABEL_0", + score: 0.5432807, + index: 1, + word: "4", + // 'start': 0, 'end': 1 + }, + { + entity: "LABEL_1", + score: 0.5007693, + index: 2, + word: "5", + // 'start': 2, 'end': 3 + }, + ], + ]; + expect(output).toBeCloseToNested(target, 5); + }, + MAX_TEST_EXECUTION_TIME, + ); + it( + "custom (ignore_labels set)", + async () => { + 
const output = await pipe(["1 2 3", "4 5"], { ignore_labels: ["LABEL_0"] }); + const target = [ + [ + { + entity: "LABEL_1", + score: 0.51381934, + index: 3, + word: "3", + // 'start': 4, 'end': 5 + }, + ], + [ + { + entity: "LABEL_1", + score: 0.5007693, + index: 2, + word: "5", + // 'start': 2, 'end': 3 + }, + ], + ]; + expect(output).toBeCloseToNested(target, 5); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); + + afterAll(async () => { + await pipe.dispose(); + }, MAX_MODEL_DISPOSE_TIME); + }); +}; diff --git a/tests/pipelines/test_pipelines_translation.js b/tests/pipelines/test_pipelines_translation.js new file mode 100644 index 000000000..97afce8d6 --- /dev/null +++ b/tests/pipelines/test_pipelines_translation.js @@ -0,0 +1,42 @@ +import { pipeline, TranslationPipeline } from "../../src/transformers.js"; + +import { MAX_MODEL_LOAD_TIME, MAX_TEST_EXECUTION_TIME, MAX_MODEL_DISPOSE_TIME, DEFAULT_MODEL_OPTIONS } from "../init.js"; + +const PIPELINE_ID = "translation"; + +export default () => { + describe("Translation", () => { + const model_id = "Xenova/tiny-random-M2M100ForConditionalGeneration"; + + /** @type {TranslationPipeline} */ + let pipe; + beforeAll(async () => { + pipe = await pipeline(PIPELINE_ID, model_id, DEFAULT_MODEL_OPTIONS); + }, MAX_MODEL_LOAD_TIME); + + it("should be an instance of TranslationPipeline", () => { + expect(pipe).toBeInstanceOf(TranslationPipeline); + }); + + describe("batch_size=1", () => { + it( + "default", + async () => { + const text = "जीवन एक चॉकलेट बॉक्स की तरह है।"; + const output = await pipe(text, { + src_lang: "hi", + tgt_lang: "fr", + max_new_tokens: 5, + }); + const target = [{ translation_text: "Slovenska төсли төсли төсли" }]; + expect(output).toEqual(target); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); + + afterAll(async () => { + await pipe.dispose(); + }, MAX_MODEL_DISPOSE_TIME); + }); +}; diff --git a/tests/pipelines/test_pipelines_zero_shot.js b/tests/pipelines/test_pipelines_zero_shot.js new file mode 100644 index 000000000..1c30db9ea --- /dev/null +++ b/tests/pipelines/test_pipelines_zero_shot.js @@ -0,0 +1,100 @@ +import { pipeline, ZeroShotClassificationPipeline } from "../../src/transformers.js"; + +import { MAX_MODEL_LOAD_TIME, MAX_TEST_EXECUTION_TIME, MAX_MODEL_DISPOSE_TIME, DEFAULT_MODEL_OPTIONS } from "../init.js"; + +const PIPELINE_ID = "zero-shot-classification"; + +export default () => { + describe("Zero-shot Classification", () => { + const model_id = "hf-internal-testing/tiny-random-BertForSequenceClassification"; + /** @type {ZeroShotClassificationPipeline} */ + let pipe; + + beforeAll(async () => { + pipe = await pipeline(PIPELINE_ID, model_id, { + ...DEFAULT_MODEL_OPTIONS, + + // The model isn't designed for zero-shot classification, so we set the config + config: { + model_type: "bert", + id2label: { + 0: "contradiction", + 1: "entailment", + }, + label2id: { + contradiction: 0, + entailment: 1, + }, + }, + }); + }, MAX_MODEL_LOAD_TIME); + + it("should be an instance of ZeroShotClassificationPipeline", () => { + expect(pipe).toBeInstanceOf(ZeroShotClassificationPipeline); + }); + const sequences_to_classify = ["one day I will see the world", "I love making pizza"]; + const candidate_labels = ["travel", "cooking", "dancing"]; + + it( + "Single sequence classification", + async () => { + const output = await pipe(sequences_to_classify[0], candidate_labels); + const target = { + sequence: "one day I will see the world", + labels: ["dancing", "cooking", "travel"], + scores: [0.3333353410546293, 0.3333348269618681, 
0.3333298319835025], + }; + expect(output).toBeCloseToNested(target, 5); + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + "Batched classification", + async () => { + const output = await pipe(sequences_to_classify, candidate_labels); + const target = [ + { + sequence: "one day I will see the world", + labels: ["dancing", "cooking", "travel"], + scores: [0.3333353410546293, 0.3333348269618681, 0.3333298319835025], + }, + { + sequence: "I love making pizza", + labels: ["dancing", "cooking", "travel"], + scores: [0.3333347058960895, 0.3333337292465588, 0.3333315648573516], + }, + ]; + expect(output).toBeCloseToNested(target, 5); + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + "Batched + multilabel classification", + async () => { + const candidate_labels = ["travel", "cooking", "dancing"]; + + const output = await pipe(sequences_to_classify, candidate_labels, { multi_label: true }); + const target = [ + { + sequence: "one day I will see the world", + labels: ["dancing", "cooking", "travel"], + scores: [0.49231469615364476, 0.4923134953805702, 0.4923094795142658], + }, + { + sequence: "I love making pizza", + labels: ["dancing", "cooking", "travel"], + scores: [0.49230751217535645, 0.49230615475943956, 0.4923042569480609], + }, + ]; + expect(output).toBeCloseToNested(target, 5); + }, + MAX_TEST_EXECUTION_TIME, + ); + + afterAll(async () => { + await pipe.dispose(); + }, MAX_MODEL_DISPOSE_TIME); + }); +}; diff --git a/tests/pipelines/test_pipelines_zero_shot_audio_classification.js b/tests/pipelines/test_pipelines_zero_shot_audio_classification.js new file mode 100644 index 000000000..00dd328ea --- /dev/null +++ b/tests/pipelines/test_pipelines_zero_shot_audio_classification.js @@ -0,0 +1,58 @@ +import { pipeline, ZeroShotAudioClassificationPipeline } from "../../src/transformers.js"; + +import { MAX_MODEL_LOAD_TIME, MAX_TEST_EXECUTION_TIME, MAX_MODEL_DISPOSE_TIME, DEFAULT_MODEL_OPTIONS } from "../init.js"; +import { load_cached_audio } from "../asset_cache.js"; + +const PIPELINE_ID = "zero-shot-audio-classification"; + +export default () => { + describe("Zero-shot Audio Classification", () => { + const model_id = "hf-internal-testing/tiny-clap-htsat-unfused"; + + const labels = ["cat", "dog"]; + const hypothesis_template = "sound of a {}"; + + /** @type {ZeroShotAudioClassificationPipeline} */ + let pipe; + let audio; + beforeAll(async () => { + pipe = await pipeline(PIPELINE_ID, model_id, DEFAULT_MODEL_OPTIONS); + audio = await load_cached_audio("mlk"); + }, MAX_MODEL_LOAD_TIME); + + it("should be an instance of ZeroShotAudioClassificationPipeline", () => { + expect(pipe).toBeInstanceOf(ZeroShotAudioClassificationPipeline); + }); + + describe("batch_size=1", () => { + it( + "default", + async () => { + const output = await pipe(audio, labels); + const target = [ + { score: 0.4990939795970917, label: "cat" }, + { score: 0.5009059906005859, label: "dog" }, + ]; + expect(output).toBeCloseToNested(target, 5); + }, + MAX_TEST_EXECUTION_TIME, + ); + it( + "custom (w/ hypothesis_template)", + async () => { + const output = await pipe(audio, labels, { hypothesis_template }); + const target = [ + { score: 0.4987950325012207, label: "cat" }, + { score: 0.5012049674987793, label: "dog" }, + ]; + expect(output).toBeCloseToNested(target, 5); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); + + afterAll(async () => { + await pipe.dispose(); + }, MAX_MODEL_DISPOSE_TIME); + }); +}; diff --git a/tests/pipelines/test_pipelines_zero_shot_image_classification.js 
b/tests/pipelines/test_pipelines_zero_shot_image_classification.js new file mode 100644 index 000000000..dfa1e23e8 --- /dev/null +++ b/tests/pipelines/test_pipelines_zero_shot_image_classification.js @@ -0,0 +1,98 @@ +import { pipeline, ZeroShotImageClassificationPipeline } from "../../src/transformers.js"; + +import { MAX_MODEL_LOAD_TIME, MAX_TEST_EXECUTION_TIME, MAX_MODEL_DISPOSE_TIME, DEFAULT_MODEL_OPTIONS } from "../init.js"; +import { load_cached_image } from "../asset_cache.js"; + +const PIPELINE_ID = "zero-shot-image-classification"; + +export default () => { + describe("Zero-shot Image Classification", () => { + const model_id = "hf-internal-testing/tiny-random-GroupViTModel"; + + // Example adapted from https://huggingface.co/docs/transformers/en/model_doc/groupvit + const labels = ["cat", "dog"]; + const hypothesis_template = "a photo of a {}"; + + /** @type {ZeroShotImageClassificationPipeline} */ + let pipe; + let images; + beforeAll(async () => { + pipe = await pipeline(PIPELINE_ID, model_id, DEFAULT_MODEL_OPTIONS); + images = await Promise.all([load_cached_image("white_image"), load_cached_image("blue_image")]); + }, MAX_MODEL_LOAD_TIME); + + it("should be an instance of ZeroShotImageClassificationPipeline", () => { + expect(pipe).toBeInstanceOf(ZeroShotImageClassificationPipeline); + }); + + describe("batch_size=1", () => { + it( + "default", + async () => { + const output = await pipe(images[0], labels); + const target = [ + { score: 0.5990662574768066, label: "cat" }, + { score: 0.40093377232551575, label: "dog" }, + ]; + expect(output).toBeCloseToNested(target, 5); + }, + MAX_TEST_EXECUTION_TIME, + ); + it( + "custom (w/ hypothesis_template)", + async () => { + const output = await pipe(images[0], labels, { hypothesis_template }); + const target = [ + { score: 0.5527022480964661, label: "cat" }, + { score: 0.44729775190353394, label: "dog" }, + ]; + expect(output).toBeCloseToNested(target, 5); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); + + describe("batch_size>1", () => { + it( + "default", + async () => { + const output = await pipe(images, labels); + const target = [ + [ + { score: 0.5990662574768066, label: "cat" }, + { score: 0.40093377232551575, label: "dog" }, + ], + [ + { score: 0.5006340146064758, label: "dog" }, + { score: 0.49936598539352417, label: "cat" }, + ], + ]; + expect(output).toBeCloseToNested(target, 5); + }, + MAX_TEST_EXECUTION_TIME, + ); + it( + "custom (w/ hypothesis_template)", + async () => { + const output = await pipe(images, labels, { hypothesis_template }); + const target = [ + [ + { score: 0.5527022480964661, label: "cat" }, + { score: 0.44729775190353394, label: "dog" }, + ], + [ + { score: 0.5395973324775696, label: "cat" }, + { score: 0.46040263772010803, label: "dog" }, + ], + ]; + expect(output).toBeCloseToNested(target, 5); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); + + afterAll(async () => { + await pipe.dispose(); + }, MAX_MODEL_DISPOSE_TIME); + }); +}; diff --git a/tests/pipelines/test_pipelines_zero_shot_object_detection.js b/tests/pipelines/test_pipelines_zero_shot_object_detection.js new file mode 100644 index 000000000..f55690507 --- /dev/null +++ b/tests/pipelines/test_pipelines_zero_shot_object_detection.js @@ -0,0 +1,134 @@ +import { pipeline, ZeroShotObjectDetectionPipeline } from "../../src/transformers.js"; + +import { MAX_MODEL_LOAD_TIME, MAX_TEST_EXECUTION_TIME, MAX_MODEL_DISPOSE_TIME, DEFAULT_MODEL_OPTIONS } from "../init.js"; +import { load_cached_image } from "../asset_cache.js"; + +const PIPELINE_ID = 
"zero-shot-object-detection"; + +export default () => { + describe("Zero-shot Object Detection", () => { + const model_id = "hf-internal-testing/tiny-random-OwlViTForObjectDetection"; + + const candidate_labels = ["hello", "hello world"]; + + /** @type {ZeroShotObjectDetectionPipeline} */ + let pipe; + let images; + beforeAll(async () => { + pipe = await pipeline(PIPELINE_ID, model_id, DEFAULT_MODEL_OPTIONS); + images = await Promise.all([load_cached_image("white_image"), load_cached_image("blue_image")]); + }, MAX_MODEL_LOAD_TIME); + + const targets = { + white_image: [ + { + score: 0.6028420329093933, + label: "hello", + box: { xmin: 47, ymin: 117, xmax: 62, ymax: 134 }, + }, + { + score: 0.6026064157485962, + label: "hello world", + box: { xmin: 47, ymin: 117, xmax: 62, ymax: 134 }, + }, + { + score: 0.5987668037414551, + label: "hello world", + box: { xmin: 145, ymin: 47, xmax: 160, ymax: 63 }, + }, + { + score: 0.5986272692680359, + label: "hello", + box: { xmin: 89, ymin: 131, xmax: 104, ymax: 148 }, + }, + { + score: 0.5985949039459229, + label: "hello world", + box: { xmin: 89, ymin: 131, xmax: 104, ymax: 148 }, + }, + // ... many more + ], + + blue_image: [ + { + score: 0.6622366309165955, + label: "hello", + box: { xmin: 48, ymin: 45, xmax: 62, ymax: 61 }, + }, + { + score: 0.6562080383300781, + label: "hello world", + box: { xmin: 48, ymin: 45, xmax: 62, ymax: 61 }, + }, + { + score: 0.6493991613388062, + label: "hello world", + box: { xmin: 34, ymin: 58, xmax: 48, ymax: 74 }, + }, + { + score: 0.6476974487304688, + label: "hello", + box: { xmin: 34, ymin: 58, xmax: 48, ymax: 74 }, + }, + { + score: 0.6391685009002686, + label: "hello", + box: { xmin: 103, ymin: 59, xmax: 117, ymax: 75 }, + }, + // ... many more + ], + }; + + it("should be an instance of ZeroShotObjectDetectionPipeline", () => { + expect(pipe).toBeInstanceOf(ZeroShotObjectDetectionPipeline); + }); + + describe("batch_size=1", () => { + it( + "default", + async () => { + const output = await pipe(images[0], candidate_labels); + expect(output).toHaveLength(512); + + expect(output.slice(0, targets.white_image.length)).toBeCloseToNested(targets.white_image, 5); + }, + MAX_TEST_EXECUTION_TIME, + ); + it( + "custom (w/ top_k & threshold)", + async () => { + const top_k = 3; + const output = await pipe(images[0], candidate_labels, { top_k, threshold: 0.05 }); + expect(output).toBeCloseToNested(targets.white_image.slice(0, top_k), 5); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); + + describe("batch_size>1", () => { + it( + "default", + async () => { + const output = await pipe(images, candidate_labels); + const target = Object.values(targets); + expect(output.map((x, i) => x.slice(0, target[i].length))).toBeCloseToNested(target, 5); + }, + MAX_TEST_EXECUTION_TIME, + ); + it( + "custom (w/ top_k & threshold)", + async () => { + const top_k = 3; + const output = await pipe(images, candidate_labels, { top_k, threshold: 0.05 }); + const target = Object.values(targets).map((x) => x.slice(0, top_k)); + expect(output).toBeCloseToNested(target, 5); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); + + afterAll(async () => { + await pipe.dispose(); + }, MAX_MODEL_DISPOSE_TIME); + }); +}; diff --git a/tests/processors.test.js b/tests/processors.test.js index e35e555d2..d80ec91fb 100644 --- a/tests/processors.test.js +++ b/tests/processors.test.js @@ -1,521 +1,5 @@ -import fs from "fs"; -import path from "path"; +import { init } from "./init.js"; +import { collect_and_execute_tests } from "./test_utils.js"; -import { AutoProcessor } from 
"../src/transformers.js"; -import { load_cached_image } from "./asset_cache.js"; -import { init, MAX_TEST_TIME } from "./init.js"; -import { fileURLToPath } from "url"; - -// Initialise the testing environment init(); - -// Collect all unit tests, which can be found in files of the form: -// `tests/models//test_image_processors_.js` -const __filename = fileURLToPath(import.meta.url); -const __dirname = path.dirname(__filename); -const models_dir = path.join(__dirname, "models"); -const model_types = fs.readdirSync(models_dir); -for (const model_type of model_types) { - const dir = path.join(models_dir, model_type); - - if (!fs.existsSync(dir) || !fs.statSync(dir).isDirectory()) { - continue; - } - - const file = path.join(dir, `test_image_processing_${model_type}.js`); - if (!fs.existsSync(file)) { - continue; - } - - const { default: tests } = await import(file); - describe(model_type, tests); -} - -const sum = (array) => Number(array.reduce((a, b) => a + b, array instanceof BigInt64Array ? 0n : 0)); -const avg = (array) => sum(array) / array.length; - -const MODELS = { - florence2: "Xenova/tiny-random-Florence2ForConditionalGeneration", - qwen2_vl: "hf-internal-testing/tiny-random-Qwen2VLForConditionalGeneration", - idefics3: "hf-internal-testing/tiny-random-Idefics3ForConditionalGeneration", - paligemma: "hf-internal-testing/tiny-random-PaliGemmaForConditionalGeneration", -}; - -describe("Processors", () => { - describe("Audio processors", () => { - let audio; - beforeAll(async () => { - const url = "https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/mlk.npy"; - const buffer = await (await fetch(url)).arrayBuffer(); - audio = Float32Array.from(new Float64Array(buffer)); - }); - - it( - "WhisperFeatureExtractor", - async () => { - const processor = await AutoProcessor.from_pretrained("Xenova/whisper-tiny.en"); - const { input_features } = await processor(audio); - const { dims, data } = input_features; - expect(dims).toEqual([1, 80, 3000]); - expect(avg(data)).toBeCloseTo(-0.2813588131551941); - expect(data[0]).toBeCloseTo(0.33168578147888184); - expect(data[1]).toBeCloseTo(0.30986475944519043); - expect(data[81]).toBeCloseTo(0.10727232694625854); - expect(data[3001]).toBeCloseTo(0.2555035352706909); - }, - MAX_TEST_TIME, - ); - - it( - "ASTFeatureExtractor", - async () => { - const processor = await AutoProcessor.from_pretrained("Xenova/ast-finetuned-audioset-10-10-0.4593"); - { - // truncation - const { input_values } = await processor(audio); - expect(input_values.dims).toEqual([1, 1024, 128]); - - expect(avg(input_values.data)).toBeCloseTo(-0.04054912979309085); - expect(input_values.data[0]).toBeCloseTo(-0.5662586092948914); - expect(input_values.data[1]).toBeCloseTo(-1.0300861597061157); - expect(input_values.data[129]).toBeCloseTo(-1.084834098815918); - expect(input_values.data[1025]).toBeCloseTo(-1.1204065084457397); - } - { - // padding - const { input_values } = await processor(audio.slice(0, 1000)); - expect(input_values.dims).toEqual([1, 1024, 128]); // [1, 4, 128] -> (padded to) -> [1, 1024, 128] - - expect(avg(input_values.data)).toBeCloseTo(0.4647964835166931); - expect(input_values.data[0]).toBeCloseTo(-0.5662586092948914); - expect(input_values.data[1]).toBeCloseTo(-1.0300861597061157); - expect(input_values.data[129]).toBeCloseTo(-1.084834098815918); - - // padded values - expect(input_values.data[1025]).toBeCloseTo(0.46703237295150757); - expect(input_values.data[2049]).toBeCloseTo(0.46703237295150757); - 
expect(input_values.data[10000]).toBeCloseTo(0.46703237295150757); - } - }, - MAX_TEST_TIME, - ); - - it( - "SeamlessM4TFeatureExtractor", - async () => { - const processor = await AutoProcessor.from_pretrained("Xenova/wav2vec2-bert-CV16-en"); - { - // normal - const { input_features, attention_mask } = await processor(audio); - const { dims, data } = input_features; - expect(dims).toEqual([1, 649, 160]); - expect(attention_mask.dims).toEqual([1, 649]); - - expect(avg(data)).toBeCloseTo(-2.938903875815413e-8); - expect(data[0]).toBeCloseTo(1.1939343214035034); - expect(data[1]).toBeCloseTo(0.7874255180358887); - expect(data[160]).toBeCloseTo(-0.712975025177002); - expect(data[161]).toBeCloseTo(0.045802414417266846); - expect(data.at(-1)).toBeCloseTo(-1.3328346014022827); - - expect(sum(attention_mask.data)).toEqual(649); - } - { - // padding (pad_to_multiple_of=2) - const { input_features, attention_mask } = await processor(audio.slice(0, 10000)); - const { dims, data } = input_features; - - // [1, 61, 80] -> [1, 62, 80] -> [1, 31, 160] - expect(dims).toEqual([1, 31, 160]); - expect(attention_mask.dims).toEqual([1, 31]); - - expect(avg(data)).toBeCloseTo(0.01612919569015503); - expect(data[0]).toBeCloseTo(0.9657132029533386); - expect(data[1]).toBeCloseTo(0.12912897765636444); - expect(data[160]).toBeCloseTo(-1.2364212274551392); - expect(data[161]).toBeCloseTo(-0.9703778028488159); - expect(data.at(-1)).toBeCloseTo(1); // padding value - - expect(sum(attention_mask.data)).toEqual(30); - } - }, - MAX_TEST_TIME, - ); - - it( - "ClapFeatureExtractor", - async () => { - const processor = await AutoProcessor.from_pretrained("Xenova/clap-htsat-unfused"); - { - // truncation - // Since truncation uses a random strategy, we override - // Math.random to ensure that the test is deterministic - const originalRandom = Math.random; - Math.random = () => 0.5; - - let long_audio = new Float32Array(500000); - long_audio.set(audio); - long_audio.set(audio, long_audio.length - audio.length); - - const { input_features } = await processor(long_audio); - const { dims, data } = input_features; - expect(dims).toEqual([1, 1, 1001, 64]); - - expect(avg(data)).toBeCloseTo(-37.94569396972656); - expect(data[0]).toBeCloseTo(-53.32647705078125); - expect(data[1]).toBeCloseTo(-47.76755142211914); - expect(data[65]).toBeCloseTo(-36.32261276245117); - expect(data[1002]).toBeCloseTo(-28.0314884185791); - expect(data[10000]).toBeCloseTo(-21.905902862548828); - expect(data[60000]).toBeCloseTo(-14.877863883972168); - expect(data[64062]).toBeCloseTo(-37.9784049987793); - expect(data[64063]).toBeCloseTo(-37.73963928222656); - - // Reset Math.random - Math.random = originalRandom; - } - { - // padding - const { input_features } = await processor(audio); - const { data, dims } = input_features; - expect(dims).toEqual([1, 1, 1001, 64]); - - expect(avg(data)).toBeCloseTo(-34.99049377441406); - expect(data[0]).toBeCloseTo(-21.32573890686035); - expect(data[1]).toBeCloseTo(-26.168411254882812); - expect(data[65]).toBeCloseTo(-29.716018676757812); - expect(data[1002]).toBeCloseTo(-32.16273498535156); - expect(data[10000]).toBeCloseTo(-19.9283390045166); - - // padded values - expect(data[60000]).toBeCloseTo(-100.0); - expect(data[64062]).toBeCloseTo(-100.0); - expect(data[64063]).toBeCloseTo(-100.0); - } - }, - MAX_TEST_TIME, - ); - - it( - "WeSpeakerFeatureExtractor", - async () => { - const processor = await AutoProcessor.from_pretrained("onnx-community/wespeaker-voxceleb-resnet34-LM"); - { - // default - const audio = new 
Float32Array(16000).map((_, i) => Math.sin(i / 100)); - const { input_features } = await processor(audio); - const { dims, data } = input_features; - expect(dims).toEqual([1, 98, 80]); - - expect(avg(data)).toBeCloseTo(5.461731689138105e-8); - expect(data[0]).toBeCloseTo(-0.19300270080566406); - expect(data[1]).toBeCloseTo(-0.05825042724609375); - expect(data[78]).toBeCloseTo(0.2683420181274414); - expect(data[79]).toBeCloseTo(0.26250171661376953); - expect(data[80]).toBeCloseTo(0.19062232971191406); - expect(data.at(-2)).toBeCloseTo(-0.43694400787353516); - expect(data.at(-1)).toBeCloseTo(-0.4266204833984375); - } - - { - // pad to `min_num_frames` - const audio = new Float32Array(3).map((_, i) => Math.sin(i / 100)); - const { input_features } = await processor(audio); - const { dims, data } = input_features; - expect(dims).toEqual([1, 9, 80]); - - expect(avg(data)).toBeCloseTo(-0.0000010093053181966146); - expect(data[0]).toBeCloseTo(20.761859893798828); - expect(data[1]).toBeCloseTo(21.02924346923828); - expect(data[78]).toBeCloseTo(19.083993911743164); - expect(data[79]).toBeCloseTo(18.003454208374023); - expect(data[80]).toBeCloseTo(-2.595233917236328); - expect(data.at(-2)).toBeCloseTo(-2.385499954223633); - expect(data.at(-1)).toBeCloseTo(-2.2504329681396484); - } - }, - MAX_TEST_TIME, - ); - }); - - describe("Other processors", () => { - describe( - "FlorenceProcessor", - () => { - /** @type {import('../src/transformers.js').Florence2Processor} */ - let processor; - let images = {}; - - beforeAll(async () => { - processor = await AutoProcessor.from_pretrained(MODELS.florence2); - images = { - beetle: await load_cached_image("beetle"), - book_cover: await load_cached_image("book_cover"), - }; - }); - - describe("Prompt construction", () => { - it("Construct prompt", async () => { - const text = ""; - const prompts = processor.construct_prompts(text); - const target = ["Locate the objects with category name in the image."]; - expect(prompts).toEqual(target); - }); - - it("Construct prompts", async () => { - const texts = ["", "Locate the objects with category name in the image.", "cat"]; - const prompts = processor.construct_prompts(texts); - const target = ["Describe with a paragraph what is shown in the image.", "Locate the objects with category name in the image.", "Locate cat in the image."]; - expect(prompts).toEqual(target); - }); - }); - - describe("Post-process generation", () => { - const TESTS = [ - { - task: "", - generated_text: "A green car parked in front of a yellow building.", - target: { "": "A green car parked in front of a yellow building." }, - image: "beetle", - }, - { - task: "", - generated_text: "The image shows a green Volkswagen Beetle parked in front of a yellow building with two brown doors. The sky is a mix of blue and white, and there are a few green trees in the background.", - target: { "": "The image shows a green Volkswagen Beetle parked in front of a yellow building with two brown doors. The sky is a mix of blue and white, and there are a few green trees in the background." }, - image: "beetle", - }, - { - task: "", - generated_text: "The image shows a vintage Volkswagen Beetle car parked on a cobblestone street in front of a yellow building with two wooden doors. The car is painted in a bright turquoise color and has a white stripe running along the side. It has two doors on either side of the car, one on top of the other, and a small window on the front. The building appears to be old and dilapidated, with peeling paint and crumbling walls. 
The sky is blue and there are trees in the background.", - target: { "": "The image shows a vintage Volkswagen Beetle car parked on a cobblestone street in front of a yellow building with two wooden doors. The car is painted in a bright turquoise color and has a white stripe running along the side. It has two doors on either side of the car, one on top of the other, and a small window on the front. The building appears to be old and dilapidated, with peeling paint and crumbling walls. The sky is blue and there are trees in the background." }, - image: "beetle", - }, - { - task: "", - generated_text: "cardoorwheel", - target: { - "": { - bboxes: [ - [34.24, 160.08, 597.44, 371.76], - [456.0, 97.68, 580.16, 261.84], - [450.88, 276.72, 554.56, 370.8], - [95.68, 280.56, 198.72, 371.28], - ], - labels: ["car", "door", "wheel", "wheel"], - }, - }, - image: "beetle", - }, - { - task: "", - generated_text: "turquoise Volkswagen Beetlewheel", - target: { - "": { - bboxes: [ - [33.6, 160.08, 596.8, 371.76], - [450.88, 276.72, 553.28, 370.8], - [95.04, 280.56, 197.44, 371.28], - ], - labels: ["turquoise Volkswagen Beetle", "wheel", "wheel"], - }, - }, - image: "beetle", - }, - { - task: "", - generated_text: "", - target: { - "": { - bboxes: [ - [33.6, 160.08, 596.8, 371.76], - [455.36, 97.68, 579.52, 261.84], - [450.88, 276.72, 553.28, 370.8], - [95.04, 280.56, 198.08, 371.28], - [226.88, 88.56, 332.48, 164.4], - [65.6, 266.64, 86.72, 295.92], - [271.68, 241.68, 302.4, 246.96], - [408.0, 308.4, 413.76, 320.88], - ], - labels: ["", "", "", "", "", "", "", ""], - }, - }, - image: "beetle", - }, - { - task: "", - text_input: "A green car parked in front of a yellow building.", - generated_text: "A green cara yellow building", - target: { - "": { - bboxes: [ - [34.88, 158.64, 583.36, 374.64], - [0.32, 4.08, 639.04, 305.04], - ], - labels: ["A green car", "a yellow building"], - }, - }, - image: "beetle", - }, - // { - // task: "", - // text_input: "a green car", - // generated_text: "", - // target: { - // '': { - // polygons: [[[[178.88, 181.68, 180.8, 180.72, 182.72, 180.72, 187.84, 177.84, 189.76, 177.84, 192.96, 175.92, 194.88, 175.92, 198.08, 174, 200.64, 173.04, 203.84, 172.08, 207.04, 170.64, 209.6, 169.68, 214.08, 168.72, 217.92, 167.76, 221.76, 166.8, 226.24, 165.84, 230.72, 164.88, 237.12, 163.92, 244.16, 162.96, 253.12, 162, 265.28, 161.04, 311.36, 161.04, 329.28, 162, 338.24, 162.96, 345.28, 163.92, 350.4, 164.88, 354.24, 165.84, 358.72, 166.8, 362.56, 167.76, 366.4, 168.72, 370.24, 169.68, 373.44, 170.64, 375.36, 172.08, 377.28, 174, 379.2, 176.88, 380.48, 179.76, 382.4, 181.68, 384.32, 185.04, 386.24, 187.92, 387.52, 190.8, 389.44, 192.72, 390.08, 196.08, 392, 198.96, 394.56, 201.84, 396.48, 204.72, 398.4, 208.08, 403.52, 212.88, 406.08, 213.84, 409.28, 216.72, 412.48, 220.08, 431.68, 220.08, 432.32, 221.04, 442.56, 222, 456.64, 222, 465.6, 222.96, 472.64, 223.92, 478.4, 224.88, 484.8, 225.84, 489.92, 226.8, 493.76, 227.76, 497.6, 228.72, 501.44, 229.68, 504.64, 231.12, 507.84, 232.08, 510.4, 233.04, 513.6, 234, 516.8, 235.92, 518.72, 235.92, 523.84, 238.8, 525.76, 238.8, 527.68, 239.76, 529.6, 241.68, 532.8, 242.64, 536, 245.04, 538.56, 247.92, 541.76, 249.84, 545.6, 251.76, 548.8, 252.72, 550.72, 252.72, 553.92, 253.68, 556.48, 255.6, 558.4, 255.6, 564.8, 258.96, 566.72, 260.88, 568.64, 260.88, 570.56, 261.84, 572.48, 263.76, 573.76, 265.68, 574.4, 268.56, 574.4, 271.92, 573.76, 272.88, 572.48, 275.76, 572.48, 279.6, 573.76, 285.84, 574.4, 286.8, 575.68, 289.68, 576.32, 292.56, 577.6, 
298.8, 577.6, 301.68, 576.32, 302.64, 575.68, 310.8, 575.68, 312.72, 576.32, 313.68, 577.6, 316.56, 577.6, 320.88, 574.4, 321.84, 568.64, 322.8, 559.68, 322.8, 553.92, 323.76, 552.64, 332.88, 552, 336.72, 550.72, 339.6, 550.08, 342.96, 548.8, 344.88, 546.88, 346.8, 545.6, 349.68, 543.68, 352.56, 541.76, 355.92, 534.72, 362.64, 531.52, 364.56, 525.76, 367.92, 522.56, 368.88, 518.72, 369.84, 495.68, 369.84, 489.92, 368.88, 486.72, 367.92, 483.52, 366.96, 479.68, 364.56, 476.48, 362.64, 472.64, 359.76, 465.6, 352.56, 463.68, 349.68, 461.76, 346.8, 460.48, 344.88, 460.48, 342.96, 458.56, 339.6, 457.92, 336.72, 457.92, 334.8, 456.64, 332.88, 454.72, 330.96, 452.8, 331.92, 448.32, 336.72, 446.4, 337.68, 426.56, 336.72, 424.64, 336.72, 423.36, 337.68, 420.8, 338.64, 414.4, 339.6, 412.48, 339.6, 411.2, 338.64, 380.48, 337.68, 217.28, 337.68, 216, 338.64, 210.88, 339.6, 207.04, 339.6, 203.84, 338.64, 201.92, 337.68, 200, 335.76, 198.08, 334.8, 194.88, 334.8, 192.96, 336.72, 191.68, 338.64, 191.68, 340.56, 191.04, 342.96, 189.12, 344.88, 187.84, 347.76, 185.92, 349.68, 184.64, 352.56, 182.72, 355.92, 176.96, 361.68, 173.76, 363.6, 170.56, 365.52, 166.72, 367.92, 163.52, 368.88, 160.96, 369.84, 153.92, 370.8, 131.52, 370.8, 127.68, 369.84, 124.48, 368.88, 118.72, 365.52, 115.52, 363.6, 111.68, 360.72, 106.56, 355.92, 104.64, 352.56, 103.36, 349.68, 101.44, 347.76, 100.8, 345.84, 99.52, 342.96, 99.52, 339.6, 98.88, 337.68, 95.68, 334.8, 93.76, 333.84, 86.72, 333.84, 80.32, 334.8, 79.68, 335.76, 74.56, 336.72, 66.24, 336.72, 63.68, 334.8, 53.44, 334.8, 50.24, 333.84, 48.32, 331.92, 48.32, 328.56, 50.24, 326.64, 51.52, 324.72, 51.52, 322.8, 44.48, 321.84, 40.64, 320.88, 38.72, 319.92, 37.44, 317.52, 36.16, 313.68, 36.16, 306.96, 38.72, 304.56, 42.56, 303.6, 46.4, 302.64, 55.36, 301.68, 65.6, 301.68, 67.52, 300.72, 69.44, 298.8, 70.72, 296.88, 70.72, 292.56, 69.44, 291.6, 68.8, 288.72, 67.52, 284.88, 67.52, 276.72, 68.8, 273.84, 69.44, 271.92, 72.64, 268.56, 74.56, 267.6, 77.76, 266.64, 79.68, 266.64, 81.6, 264.72, 80.32, 260.88, 81.6, 258.96, 83.52, 256.56, 88.64, 256.56, 90.56, 255.6, 92.48, 253.68, 92.48, 252.72, 97.6, 246.96, 114.88, 229.68, 117.44, 226.8, 122.56, 222.96, 125.76, 221.04, 126.4, 221.04, 129.6, 219.12, 133.44, 215.76, 138.56, 211.92, 143.68, 208.08, 149.44, 201.84, 153.92, 198.96, 154.56, 198.96, 157.76, 197.04, 162.88, 192.72, 168.64, 186.96, 171.84, 185.04, 176.96, 183.12, 178.88, 180.72]]]], - // labels: [''], - // } - // }, - // image: 'beetle', - // }, - // { - // task: "", - // text_input: "", - // generated_text: "", - // target: { - // '': { - // polygons: [[[[470.08, 288.24, 473.92, 285.36, 477.12, 283.44, 479.04, 282.48, 480.96, 282.48, 484.16, 280.56, 486.72, 279.6, 489.92, 278.64, 495.04, 277.68, 512.32, 277.68, 514.88, 278.64, 518.08, 279.6, 521.28, 281.52, 523.2, 281.52, 525.12, 283.44, 528.32, 284.4, 530.88, 286.32, 534.08, 288.24, 543.04, 297.36, 544.96, 300.24, 546.88, 303.12, 550.08, 309.36, 551.36, 312.24, 552, 315.12, 553.28, 319.44, 553.28, 332.4, 552, 337.2, 551.36, 340.08, 550.08, 343.44, 548.16, 347.28, 546.24, 350.16, 544.32, 353.04, 541.12, 357.36, 537.28, 361.2, 532.16, 365.04, 528.96, 366.96, 527.04, 367.92, 523.84, 368.88, 521.28, 369.84, 516.16, 371.28, 500.8, 371.28, 491.84, 369.84, 488, 368.88, 484.8, 367.92, 479.04, 365.04, 475.84, 363.12, 472, 360.24, 464.96, 353.04, 463.04, 350.16, 461.12, 347.28, 459.84, 345.36, 459.84, 343.44, 457.92, 340.08, 456.64, 337.2, 456, 334.32, 454.72, 330.48, 454.72, 316.08, 456, 311.28, 456.64, 307.44, 457.92, 304.08, 
459.84, 301.2, 459.84, 299.28, 461.12, 297.36, 463.04, 294.48]]]], - // labels: [''], - // } - // }, - // image: 'beetle', - // }, - // { - // task: "", - // text_input: "a green car", - // generated_text: "a green car", - // target: { - // '': { - // bboxes: [[34.24, 158.64, 582.72, 374.16]], - // bboxes_labels: ['a green car'], - // polygons: [], - // polygons_labels: [], - // } - // }, - // image: 'beetle', - // }, - { - task: "", - text_input: "", - generated_text: "car", - target: { "": "car" }, - image: "beetle", - }, - { - task: "", - text_input: "", - generated_text: "turquoise Volkswagen Beetle", - target: { "": "turquoise Volkswagen Beetle" }, - image: "beetle", - }, - { - task: "", - generated_text: "CUDAFOR ENGINEERSAn Introduction to High-PerformanceParallel ComputingDUANE STORTIMETE YURTOGLU", - target: { "": "CUDAFOR ENGINEERSAn Introduction to High-PerformanceParallel ComputingDUANE STORTIMETE YURTOGLU" }, - image: "book_cover", - }, - { - task: "", - generated_text: "CUDAFOR ENGINEERSAn Introduction to High-PerformanceParallel ComputingDUANE STORTIMETE YURTOGLU", - target: { - "": { - quad_boxes: [ - [167.0435028076172, 50.25, 375.7974853515625, 50.25, 375.7974853515625, 114.75, 167.0435028076172, 114.75], - [144.8784942626953, 120.75, 375.7974853515625, 120.75, 375.7974853515625, 149.25, 144.8784942626953, 149.25], - [115.86249542236328, 165.25, 376.6034851074219, 166.25, 376.6034851074219, 184.25, 115.86249542236328, 183.25], - [239.9864959716797, 184.25, 376.6034851074219, 186.25, 376.6034851074219, 204.25, 239.9864959716797, 202.25], - [266.1814880371094, 441.25, 376.6034851074219, 441.25, 376.6034851074219, 456.25, 266.1814880371094, 456.25], - [252.0764923095703, 460.25, 376.6034851074219, 460.25, 376.6034851074219, 475.25, 252.0764923095703, 475.25], - ], - - // NOTE: Python version has a bug here, it should be "CUDA" instead of "CUDA" - labels: [/* 'CUDA' */ "CUDA", "FOR ENGINEERS", "An Introduction to High-Performance", "Parallel Computing", "DUANE STORTI", "METE YURTOGLU"], - }, - }, - image: "book_cover", - }, - ]; - - for (const { task, generated_text, target, image } of TESTS) { - it(task, () => { - const result = processor.post_process_generation(generated_text, task, images[image].size); - expect(result).toBeCloseToNested(target, 4); - }); - } - }); - }, - MAX_TEST_TIME, - ); - - describe( - "Qwen2VLProcessor", - () => { - /** @type {import('../src/transformers.js').Qwen2VLProcessor} */ - let processor; - let images = {}; - - beforeAll(async () => { - processor = await AutoProcessor.from_pretrained(MODELS.qwen2_vl); - images = { - white_image: await load_cached_image("white_image"), - }; - }); - - it("Image and text", async () => { - const conversation = [ - { - role: "user", - content: [{ type: "image" }, { type: "text", text: "Describe this image." 
}], - }, - ]; - - const text = processor.apply_chat_template(conversation, { - add_generation_prompt: true, - }); - const { input_ids, attention_mask, pixel_values, image_grid_thw } = await processor(text, images.white_image); - - expect(input_ids.dims).toEqual([1, 89]); - expect(attention_mask.dims).toEqual([1, 89]); - expect(pixel_values.dims).toEqual([256, 1176]); - expect(image_grid_thw.dims).toEqual([1, 3]); - }); - }, - MAX_TEST_TIME, - ); - - describe( - "PaliGemmaProcessor", - () => { - /** @type {import('../src/transformers.js').PaliGemmaProcessor} */ - let processor; - let images = {}; - - beforeAll(async () => { - processor = await AutoProcessor.from_pretrained(MODELS.paligemma); - images = { - white_image: await load_cached_image("white_image"), - }; - }); - - it("Image-only (default text)", async () => { - const { input_ids, pixel_values } = await processor(images.white_image); - expect(input_ids.dims).toEqual([1, 258]); - expect(pixel_values.dims).toEqual([1, 3, 224, 224]); - }); - - it("Single image & text", async () => { - const { input_ids, pixel_values } = await processor(images.white_image, "What is on the flower?"); - expect(input_ids.dims).toEqual([1, 264]); - expect(pixel_values.dims).toEqual([1, 3, 224, 224]); - }); - - it("Multiple images & text", async () => { - const { input_ids, pixel_values } = await processor([images.white_image, images.white_image], "Describe the images."); - expect(input_ids.dims).toEqual([1, 518]); - expect(pixel_values.dims).toEqual([2, 3, 224, 224]); - }); - }, - MAX_TEST_TIME, - ); - }); -}); +await collect_and_execute_tests("Processors", "processor"); diff --git a/tests/test_utils.js b/tests/test_utils.js index 9928bf75b..c42c5f201 100644 --- a/tests/test_utils.js +++ b/tests/test_utils.js @@ -1,3 +1,7 @@ +import fs from "fs"; +import path from "path"; +import { fileURLToPath } from "url"; + export async function loadAudio(url) { // NOTE: Since the Web Audio API is not available in Node.js, we will need to use the `wavefile` library to obtain the raw audio data. // For more information, see: https://huggingface.co/docs/transformers.js/guides/node-audio-processing @@ -63,3 +67,67 @@ export function compare(val1, val2, tol = 0.1) { } } } + +const __filename = fileURLToPath(import.meta.url); +const __dirname = path.dirname(__filename); +const models_dir = path.join(__dirname, "models"); +const pipelines_dir = path.join(__dirname, "pipelines"); + +/** + * Helper function to collect all unit tests, which can be found in files + * of the form: `tests/models/<model_type>/test_<filename>_<model_type>.js`. + * @param {string} filename + * @returns {Promise<[string, Function][]>} + */ +export async function collect_tests(filename) { + const model_types = fs.readdirSync(models_dir); + const all_tests = []; + for (const model_type of model_types) { + const dir = path.join(models_dir, model_type); + + if (!fs.existsSync(dir) || !fs.statSync(dir).isDirectory()) { + continue; + } + + const file = path.join(dir, `test_${filename}_${model_type}.js`); + if (!fs.existsSync(file)) { + continue; + } + + const items = await import(file); + all_tests.push([model_type, items]); + } + return all_tests; +} + +/** + * Helper function to collect and execute all unit tests, which can be found in files + * of the form: `tests/models/<model_type>/test_<filename>_<model_type>.js`. + * @param {string} title The title of the test + * @param {string} filename The name of the test + */ +export async function collect_and_execute_tests(title, filename) { + // 1. Collect all tests + const all_tests = await collect_tests(filename); + + // 2.
Execute tests + describe(title, () => all_tests.forEach(([name, test]) => describe(name, test.default))); +} + +/** + * Helper function to collect and execute all pipeline tests, which can be found in files + * of the form: `tests/pipelines/test_pipelines_<pipeline_id>.js`. + */ +export async function collect_and_execute_pipeline_tests(title) { + // 1. Collect all tests + const all_tests = []; + const pipeline_types = fs.readdirSync(pipelines_dir); + for (const filename of pipeline_types) { + const file = path.join(pipelines_dir, filename); + const items = await import(file); + all_tests.push(items); + } + + // 2. Execute tests + describe(title, () => all_tests.forEach((test) => test.default())); +} diff --git a/tests/tiny_random.test.js b/tests/tiny_random.test.js deleted file mode 100644 index d80699a11..000000000 --- a/tests/tiny_random.test.js +++ /dev/null @@ -1,841 +0,0 @@ -import { - // Pipelines - pipeline, - FillMaskPipeline, - TextClassificationPipeline, - TextGenerationPipeline, - TranslationPipeline, - ImageClassificationPipeline, - ZeroShotImageClassificationPipeline, - TokenClassificationPipeline, - QuestionAnsweringPipeline, - DocumentQuestionAnsweringPipeline, - - // Other - RawImage, -} from "../src/transformers.js"; - -import { init, MAX_MODEL_LOAD_TIME, MAX_TEST_EXECUTION_TIME, MAX_MODEL_DISPOSE_TIME, DEFAULT_MODEL_OPTIONS } from "./init.js"; -import { compare } from "./test_utils.js"; - -init(); - -describe("Tiny random pipelines", () => { - describe("fill-mask", () => { - const model_id = "hf-internal-testing/tiny-random-BertForMaskedLM"; - - /** @type {FillMaskPipeline} */ - let pipe; - beforeAll(async () => { - pipe = await pipeline("fill-mask", model_id, DEFAULT_MODEL_OPTIONS); - }, MAX_MODEL_LOAD_TIME); - - describe("batch_size=1", () => { - it( - "default (top_k=5)", - async () => { - const output = await pipe("a [MASK] c"); - const target = [ - { score: 0.0013377574505284429, token: 854, token_str: "##ο", sequence: "aο c" }, - { score: 0.001248967950232327, token: 962, token_str: "##ち", sequence: "aち c" }, - { score: 0.0012304208939895034, token: 933, token_str: "##ع", sequence: "aع c" }, - { score: 0.0012301815440878272, token: 313, token_str: "ფ", sequence: "a ფ c" }, - { score: 0.001222139224410057, token: 624, token_str: "未", sequence: "a 未 c" }, - ]; - compare(output, target, 1e-5); - }, - MAX_TEST_EXECUTION_TIME, - ); - it( - "custom (top_k=2)", - async () => { - const output = await pipe("a [MASK] c", { top_k: 2 }); - const target = [ - { score: 0.0013377574505284429, token: 854, token_str: "##ο", sequence: "aο c" }, - { score: 0.001248967950232327, token: 962, token_str: "##ち", sequence: "aち c" }, - ]; - compare(output, target, 1e-5); - }, - MAX_TEST_EXECUTION_TIME, - ); - }); - - describe("batch_size>1", () => { - it( - "default (top_k=5)", - async () => { - const output = await pipe(["a [MASK] c", "a b [MASK] c"]); - const target = [ - [ - { score: 0.0013377574505284429, token: 854, token_str: "##ο", sequence: "aο c" }, - { score: 0.001248967950232327, token: 962, token_str: "##ち", sequence: "aち c" }, - { score: 0.0012304208939895034, token: 933, token_str: "##ع", sequence: "aع c" }, - { score: 0.0012301815440878272, token: 313, token_str: "ფ", sequence: "a ფ c" }, - { score: 0.001222139224410057, token: 624, token_str: "未", sequence: "a 未 c" }, - ], - [ - { score: 0.0013287801994010806, token: 962, token_str: "##ち", sequence: "a bち c" }, - { score: 0.0012486606137827039, token: 823, token_str: "##ن", sequence: "a bن c" }, - { score: 0.0012320734094828367, token: 1032,
token_str: "##ც", sequence: "a bც c" }, - { score: 0.0012295148335397243, token: 854, token_str: "##ο", sequence: "a bο c" }, - { score: 0.0012277684872969985, token: 624, token_str: "未", sequence: "a b 未 c" }, - ], - ]; - compare(output, target, 1e-5); - }, - MAX_TEST_EXECUTION_TIME, - ); - it( - "custom (top_k=2)", - async () => { - const output = await pipe(["a [MASK] c", "a b [MASK] c"], { top_k: 2 }); - const target = [ - [ - { score: 0.0013377574505284429, token: 854, token_str: "##ο", sequence: "aο c" }, - { score: 0.001248967950232327, token: 962, token_str: "##ち", sequence: "aち c" }, - ], - [ - { score: 0.0013287801994010806, token: 962, token_str: "##ち", sequence: "a bち c" }, - { score: 0.0012486606137827039, token: 823, token_str: "##ن", sequence: "a bن c" }, - ], - ]; - compare(output, target, 1e-5); - }, - MAX_TEST_EXECUTION_TIME, - ); - }); - - afterAll(async () => { - await pipe?.dispose(); - }, MAX_MODEL_DISPOSE_TIME); - }); - - describe("text-classification", () => { - const model_id = "hf-internal-testing/tiny-random-BertForSequenceClassification"; - - /** @type {TextClassificationPipeline} */ - let pipe; - beforeAll(async () => { - pipe = await pipeline("text-classification", model_id, DEFAULT_MODEL_OPTIONS); - }, MAX_MODEL_LOAD_TIME); - - describe("batch_size=1", () => { - it( - "default (top_k=1)", - async () => { - const output = await pipe("a"); - const target = [{ label: "LABEL_0", score: 0.5076976418495178 }]; - compare(output, target, 1e-5); - }, - MAX_TEST_EXECUTION_TIME, - ); - it( - "custom (top_k=2)", - async () => { - const output = await pipe("a", { top_k: 2 }); - const target = [ - { label: "LABEL_0", score: 0.5076976418495178 }, - { label: "LABEL_1", score: 0.49230238795280457 }, - ]; - compare(output, target, 1e-5); - }, - MAX_TEST_EXECUTION_TIME, - ); - }); - - describe("batch_size>1", () => { - it( - "default (top_k=1)", - async () => { - const output = await pipe(["a", "b c"]); - const target = [ - { label: "LABEL_0", score: 0.5076976418495178 }, - { label: "LABEL_0", score: 0.5077522993087769 }, - ]; - compare(output, target, 1e-5); - }, - MAX_TEST_EXECUTION_TIME, - ); - it( - "custom (top_k=2)", - async () => { - const output = await pipe(["a", "b c"], { top_k: 2 }); - const target = [ - [ - { label: "LABEL_0", score: 0.5076976418495178 }, - { label: "LABEL_1", score: 0.49230238795280457 }, - ], - [ - { label: "LABEL_0", score: 0.5077522993087769 }, - { label: "LABEL_1", score: 0.49224773049354553 }, - ], - ]; - compare(output, target, 1e-5); - }, - MAX_TEST_EXECUTION_TIME, - ); - - it( - "multi_label_classification", - async () => { - const problem_type = pipe.model.config.problem_type; - pipe.model.config.problem_type = "multi_label_classification"; - - const output = await pipe(["a", "b c"], { top_k: 2 }); - const target = [ - [ - { label: "LABEL_0", score: 0.5001373887062073 }, - { label: "LABEL_1", score: 0.49243971705436707 }, - ], - [ - { label: "LABEL_0", score: 0.5001326203346252 }, - { label: "LABEL_1", score: 0.492380291223526 }, - ], - ]; - compare(output, target, 1e-5); - - // Reset problem type - pipe.model.config.problem_type = problem_type; - }, - MAX_TEST_EXECUTION_TIME, - ); - }); - - afterAll(async () => { - await pipe?.dispose(); - }, MAX_MODEL_DISPOSE_TIME); - }); - - describe("token-classification", () => { - const model_id = "hf-internal-testing/tiny-random-BertForTokenClassification"; - - /** @type {TokenClassificationPipeline} */ - let pipe; - beforeAll(async () => { - pipe = await pipeline("token-classification", model_id, 
DEFAULT_MODEL_OPTIONS); - }, MAX_MODEL_LOAD_TIME); - - describe("batch_size=1", () => { - it( - "default", - async () => { - const output = await pipe("1 2 3"); - - // TODO: Add start/end to target - const target = [ - { - entity: "LABEL_0", - score: 0.5292708, - index: 1, - word: "1", - // 'start': 0, 'end': 1 - }, - { - entity: "LABEL_0", - score: 0.5353687, - index: 2, - word: "2", - // 'start': 2, 'end': 3 - }, - { - entity: "LABEL_1", - score: 0.51381934, - index: 3, - word: "3", - // 'start': 4, 'end': 5 - }, - ]; - compare(output, target, 1e-5); - }, - MAX_TEST_EXECUTION_TIME, - ); - it( - "custom (ignore_labels set)", - async () => { - const output = await pipe("1 2 3", { ignore_labels: ["LABEL_0"] }); - const target = [ - { - entity: "LABEL_1", - score: 0.51381934, - index: 3, - word: "3", - // 'start': 4, 'end': 5 - }, - ]; - compare(output, target, 1e-5); - }, - MAX_TEST_EXECUTION_TIME, - ); - }); - - describe("batch_size>1", () => { - it( - "default", - async () => { - const output = await pipe(["1 2 3", "4 5"]); - const target = [ - [ - { - entity: "LABEL_0", - score: 0.5292708, - index: 1, - word: "1", - // 'start': 0, 'end': 1 - }, - { - entity: "LABEL_0", - score: 0.5353687, - index: 2, - word: "2", - // 'start': 2, 'end': 3 - }, - { - entity: "LABEL_1", - score: 0.51381934, - index: 3, - word: "3", - // 'start': 4, 'end': 5 - }, - ], - [ - { - entity: "LABEL_0", - score: 0.5432807, - index: 1, - word: "4", - // 'start': 0, 'end': 1 - }, - { - entity: "LABEL_1", - score: 0.5007693, - index: 2, - word: "5", - // 'start': 2, 'end': 3 - }, - ], - ]; - compare(output, target, 1e-5); - }, - MAX_TEST_EXECUTION_TIME, - ); - it( - "custom (ignore_labels set)", - async () => { - const output = await pipe(["1 2 3", "4 5"], { ignore_labels: ["LABEL_0"] }); - const target = [ - [ - { - entity: "LABEL_1", - score: 0.51381934, - index: 3, - word: "3", - // 'start': 4, 'end': 5 - }, - ], - [ - { - entity: "LABEL_1", - score: 0.5007693, - index: 2, - word: "5", - // 'start': 2, 'end': 3 - }, - ], - ]; - compare(output, target, 1e-5); - }, - MAX_TEST_EXECUTION_TIME, - ); - }); - - afterAll(async () => { - await pipe?.dispose(); - }, MAX_MODEL_DISPOSE_TIME); - }); - - describe("question-answering", () => { - const model_id = "hf-internal-testing/tiny-random-BertForQuestionAnswering"; - - /** @type {QuestionAnsweringPipeline} */ - let pipe; - beforeAll(async () => { - pipe = await pipeline("question-answering", model_id, DEFAULT_MODEL_OPTIONS); - }, MAX_MODEL_LOAD_TIME); - - describe("batch_size=1", () => { - it( - "default (top_k=1)", - async () => { - const output = await pipe("a", "b c"); - const target = { score: 0.11395696550607681, /* start: 0, end: 1, */ answer: "b" }; - compare(output, target, 1e-5); - }, - MAX_TEST_EXECUTION_TIME, - ); - it( - "custom (top_k=3)", - async () => { - const output = await pipe("a", "b c", { top_k: 3 }); - const target = [ - { score: 0.11395696550607681, /* start: 0, end: 1, */ answer: "b" }, - { score: 0.11300431191921234, /* start: 2, end: 3, */ answer: "c" }, - { score: 0.10732574015855789, /* start: 0, end: 3, */ answer: "b c" }, - ]; - compare(output, target, 1e-5); - }, - MAX_TEST_EXECUTION_TIME, - ); - }); - - afterAll(async () => { - await pipe?.dispose(); - }, MAX_MODEL_DISPOSE_TIME); - }); - - describe("image-classification", () => { - const model_id = "hf-internal-testing/tiny-random-vit"; - const urls = ["https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/white-image.png", 
"https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/blue-image.png"]; - - /** @type {ImageClassificationPipeline} */ - let pipe; - beforeAll(async () => { - pipe = await pipeline("image-classification", model_id, DEFAULT_MODEL_OPTIONS); - }, MAX_MODEL_LOAD_TIME); - - describe("batch_size=1", () => { - it( - "default (top_k=5)", - async () => { - const output = await pipe(urls[0]); - const target = [ - { label: "LABEL_1", score: 0.5020533800125122 }, - { label: "LABEL_0", score: 0.4979466497898102 }, - ]; - compare(output, target, 1e-5); - }, - MAX_TEST_EXECUTION_TIME, - ); - it( - "custom (top_k=1)", - async () => { - const output = await pipe(urls[0], { top_k: 1 }); - const target = [{ label: "LABEL_1", score: 0.5020533800125122 }]; - compare(output, target, 1e-5); - }, - MAX_TEST_EXECUTION_TIME, - ); - }); - - describe("batch_size>1", () => { - it( - "default (top_k=5)", - async () => { - const output = await pipe(urls); - const target = [ - [ - { label: "LABEL_1", score: 0.5020533800125122 }, - { label: "LABEL_0", score: 0.4979466497898102 }, - ], - [ - { label: "LABEL_1", score: 0.519227921962738 }, - { label: "LABEL_0", score: 0.4807720482349396 }, - ], - ]; - compare(output, target, 1e-5); - }, - MAX_TEST_EXECUTION_TIME, - ); - it( - "custom (top_k=1)", - async () => { - const output = await pipe(urls, { top_k: 1 }); - const target = [[{ label: "LABEL_1", score: 0.5020533800125122 }], [{ label: "LABEL_1", score: 0.519227921962738 }]]; - compare(output, target, 1e-5); - }, - MAX_TEST_EXECUTION_TIME, - ); - }); - - afterAll(async () => { - await pipe?.dispose(); - }, MAX_MODEL_DISPOSE_TIME); - }); - - describe("zero-shot-image-classification", () => { - const model_id = "hf-internal-testing/tiny-random-GroupViTModel"; - - // Example adapted from https://huggingface.co/docs/transformers/en/model_doc/groupvit - const urls = ["https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/white-image.png", "https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/blue-image.png"]; - const labels = ["cat", "dog"]; - const hypothesis_template = "a photo of a {}"; - - /** @type {ZeroShotImageClassificationPipeline} */ - let pipe; - beforeAll(async () => { - pipe = await pipeline("zero-shot-image-classification", model_id, DEFAULT_MODEL_OPTIONS); - }, MAX_MODEL_LOAD_TIME); - - describe("batch_size=1", () => { - it( - "default", - async () => { - const output = await pipe(urls[0], labels); - const target = [ - { score: 0.5990662574768066, label: "cat" }, - { score: 0.40093377232551575, label: "dog" }, - ]; - compare(output, target, 1e-5); - }, - MAX_TEST_EXECUTION_TIME, - ); - it( - "custom (w/ hypothesis_template)", - async () => { - const output = await pipe(urls[0], labels, { hypothesis_template }); - const target = [ - { score: 0.5527022480964661, label: "cat" }, - { score: 0.44729775190353394, label: "dog" }, - ]; - compare(output, target, 1e-5); - }, - MAX_TEST_EXECUTION_TIME, - ); - }); - - describe("batch_size>1", () => { - it( - "default", - async () => { - const output = await pipe(urls, labels); - const target = [ - [ - { score: 0.5990662574768066, label: "cat" }, - { score: 0.40093377232551575, label: "dog" }, - ], - [ - { score: 0.5006340146064758, label: "dog" }, - { score: 0.49936598539352417, label: "cat" }, - ], - ]; - compare(output, target, 1e-5); - }, - MAX_TEST_EXECUTION_TIME, - ); - it( - "custom (w/ hypothesis_template)", - async () => { - const output = await pipe(urls, labels, { hypothesis_template }); - const target = [ 
- [ - { score: 0.5527022480964661, label: "cat" }, - { score: 0.44729775190353394, label: "dog" }, - ], - [ - { score: 0.5395973324775696, label: "cat" }, - { score: 0.46040263772010803, label: "dog" }, - ], - ]; - compare(output, target, 1e-5); - }, - MAX_TEST_EXECUTION_TIME, - ); - }); - - afterAll(async () => { - await pipe?.dispose(); - }, MAX_MODEL_DISPOSE_TIME); - }); - - describe("audio-classification", () => { - const model_id = "hf-internal-testing/tiny-random-unispeech"; - const audios = [new Float32Array(16000).fill(0), Float32Array.from({ length: 16000 }, (_, i) => i)]; - - /** @type {ImageClassificationPipeline} */ - let pipe; - beforeAll(async () => { - pipe = await pipeline("audio-classification", model_id, DEFAULT_MODEL_OPTIONS); - }, MAX_MODEL_LOAD_TIME); - - describe("batch_size=1", () => { - it( - "default (top_k=5)", - async () => { - const output = await pipe(audios[0]); - const target = [ - { score: 0.5043687224388123, label: "LABEL_0" }, - { score: 0.4956313371658325, label: "LABEL_1" }, - ]; - compare(output, target, 1e-5); - }, - MAX_TEST_EXECUTION_TIME, - ); - it( - "custom (top_k=1)", - async () => { - const output = await pipe(audios[0], { top_k: 1 }); - const target = [{ score: 0.5043687224388123, label: "LABEL_0" }]; - compare(output, target, 1e-5); - }, - MAX_TEST_EXECUTION_TIME, - ); - }); - - describe("batch_size>1", () => { - it( - "default (top_k=5)", - async () => { - const output = await pipe(audios); - const target = [ - [ - { score: 0.5043687224388123, label: "LABEL_0" }, - { score: 0.4956313371658325, label: "LABEL_1" }, - ], - [ - { score: 0.5187293887138367, label: "LABEL_0" }, - { score: 0.4812707006931305, label: "LABEL_1" }, - ], - ]; - compare(output, target, 1e-5); - }, - MAX_TEST_EXECUTION_TIME, - ); - it( - "custom (top_k=1)", - async () => { - const output = await pipe(audios, { top_k: 1 }); - const target = [[{ score: 0.5043687224388123, label: "LABEL_0" }], [{ score: 0.5187293887138367, label: "LABEL_0" }]]; - compare(output, target, 1e-5); - }, - MAX_TEST_EXECUTION_TIME, - ); - }); - - afterAll(async () => { - await pipe?.dispose(); - }, MAX_MODEL_DISPOSE_TIME); - }); - - describe("text-generation", () => { - const model_id = "hf-internal-testing/tiny-random-LlamaForCausalLM"; - - /** @type {TextGenerationPipeline} */ - let pipe; - beforeAll(async () => { - pipe = await pipeline("text-generation", model_id, DEFAULT_MODEL_OPTIONS); - }, MAX_MODEL_LOAD_TIME); - - describe("batch_size=1", () => { - const text_input = "hello"; - const generated_text_target = "erdingsAndroid Load"; - const text_target = [{ generated_text: text_input + generated_text_target }]; - const new_text_target = [{ generated_text: generated_text_target }]; - - const chat_input = [ - { role: "system", content: "a" }, - { role: "user", content: "b" }, - ]; - const chat_target = [ - { - generated_text: [ - { role: "system", content: "a" }, - { role: "user", content: "b" }, - { role: "assistant", content: " Southern abund Load" }, - ], - }, - ]; - - it( - "text input (single)", - async () => { - const output = await pipe(text_input, { max_new_tokens: 3 }); - compare(output, text_target); - }, - MAX_TEST_EXECUTION_TIME, - ); - it( - "text input (list)", - async () => { - const output = await pipe([text_input], { max_new_tokens: 3 }); - compare(output, [text_target]); - }, - MAX_TEST_EXECUTION_TIME, - ); - - it( - "text input (single) - return_full_text=false", - async () => { - const output = await pipe(text_input, { max_new_tokens: 3, return_full_text: false }); - 
compare(output, new_text_target); - }, - MAX_TEST_EXECUTION_TIME, - ); - it( - "text input (list) - return_full_text=false", - async () => { - const output = await pipe([text_input], { max_new_tokens: 3, return_full_text: false }); - compare(output, [new_text_target]); - }, - MAX_TEST_EXECUTION_TIME, - ); - - it( - "chat input (single)", - async () => { - const output = await pipe(chat_input, { max_new_tokens: 3 }); - compare(output, chat_target); - }, - MAX_TEST_EXECUTION_TIME, - ); - it( - "chat input (list)", - async () => { - const output = await pipe([chat_input], { max_new_tokens: 3 }); - compare(output, [chat_target]); - }, - MAX_TEST_EXECUTION_TIME, - ); - }); - - // TODO: Fix batch_size>1 - // describe('batch_size>1', () => { - // it('default', async () => { - // const output = await pipe(['hello', 'hello world']); - // const target = [ - // [{generated_text: 'helloerdingsAndroid Load'}], - // [{generated_text: 'hello world zerosMillнал'}], - // ]; - // compare(output, target); - // }, MAX_TEST_EXECUTION_TIME); - // }); - - afterAll(async () => { - await pipe?.dispose(); - }, MAX_MODEL_DISPOSE_TIME); - }); - - describe("translation", () => { - const model_id = "Xenova/tiny-random-M2M100ForConditionalGeneration"; - - /** @type {TranslationPipeline} */ - let pipe; - beforeAll(async () => { - pipe = await pipeline("translation", model_id, DEFAULT_MODEL_OPTIONS); - }, MAX_MODEL_LOAD_TIME); - - describe("batch_size=1", () => { - it( - "default", - async () => { - const text = "जीवन एक चॉकलेट बॉक्स की तरह है।"; - const output = await pipe(text, { - src_lang: "hi", - tgt_lang: "fr", - max_new_tokens: 5, - }); - const target = [{ translation_text: "Slovenska төсли төсли төсли" }]; - compare(output, target); - }, - MAX_TEST_EXECUTION_TIME, - ); - }); - - afterAll(async () => { - await pipe?.dispose(); - }, MAX_MODEL_DISPOSE_TIME); - }); - - describe("object-detection", () => { - const model_id = "hf-internal-testing/tiny-random-DetrForObjectDetection"; - const urls = ["https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/white-image.png", "https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/blue-image.png"]; - - /** @type {ImageClassificationPipeline} */ - let pipe; - beforeAll(async () => { - pipe = await pipeline("object-detection", model_id, DEFAULT_MODEL_OPTIONS); - }, MAX_MODEL_LOAD_TIME); - - describe("batch_size=1", () => { - it( - "default (threshold unset)", - async () => { - const output = await pipe(urls[0]); - const target = []; - compare(output, target, 1e-5); - }, - MAX_TEST_EXECUTION_TIME, - ); - it( - "default (threshold=0)", - async () => { - const output = await pipe(urls[0], { threshold: 0 }); - const target = [ - { score: 0.020360443741083145, label: "LABEL_31", box: { xmin: 56, ymin: 55, xmax: 169, ymax: 167 } }, - { score: 0.020360419526696205, label: "LABEL_31", box: { xmin: 56, ymin: 55, xmax: 169, ymax: 167 } }, - { score: 0.02036038413643837, label: "LABEL_31", box: { xmin: 56, ymin: 55, xmax: 169, ymax: 167 } }, - { score: 0.020360447466373444, label: "LABEL_31", box: { xmin: 56, ymin: 55, xmax: 169, ymax: 167 } }, - { score: 0.020360389724373817, label: "LABEL_31", box: { xmin: 56, ymin: 55, xmax: 169, ymax: 167 } }, - { score: 0.020360423251986504, label: "LABEL_31", box: { xmin: 56, ymin: 55, xmax: 169, ymax: 167 } }, - { score: 0.02036040835082531, label: "LABEL_31", box: { xmin: 56, ymin: 55, xmax: 169, ymax: 167 } }, - { score: 0.020360363647341728, label: "LABEL_31", box: { xmin: 56, ymin: 55, xmax: 169, ymax: 167 } 
}, - { score: 0.020360389724373817, label: "LABEL_31", box: { xmin: 56, ymin: 55, xmax: 169, ymax: 167 } }, - { score: 0.020360389724373817, label: "LABEL_31", box: { xmin: 56, ymin: 55, xmax: 169, ymax: 167 } }, - { score: 0.020360343158245087, label: "LABEL_31", box: { xmin: 56, ymin: 55, xmax: 169, ymax: 167 } }, - { score: 0.020360423251986504, label: "LABEL_31", box: { xmin: 56, ymin: 55, xmax: 169, ymax: 167 } }, - ]; - compare(output, target, 1e-5); - }, - MAX_TEST_EXECUTION_TIME, - ); - }); - - // TODO: Add batched support to object detection pipeline - // describe('batch_size>1', () => { - // it('default (threshold unset)', async () => { - // const output = await pipe(urls); - // console.log(output); - // const target = []; - // compare(output, target, 1e-5); - // }, MAX_TEST_EXECUTION_TIME); - // it('default (threshold=0)', async () => { - // const output = await pipe(urls, { threshold: 0 }); - // console.log(output); - // const target = []; - // compare(output, target, 1e-5); - // }, MAX_TEST_EXECUTION_TIME); - // }); - - afterAll(async () => { - await pipe?.dispose(); - }, MAX_MODEL_DISPOSE_TIME); - }); - - describe("document-question-answering", () => { - const model_id = "hf-internal-testing/tiny-random-VisionEncoderDecoderModel-donutswin-mbart"; - - /** @type {DocumentQuestionAnsweringPipeline} */ - let pipe; - beforeAll(async () => { - pipe = await pipeline("document-question-answering", model_id, DEFAULT_MODEL_OPTIONS); - }, MAX_MODEL_LOAD_TIME); - - describe("batch_size=1", () => { - it( - "default", - async () => { - const dims = [64, 32, 3]; - const image = new RawImage(new Uint8ClampedArray(dims[0] * dims[1] * dims[2]).fill(255), ...dims); - const question = "What is the invoice number?"; - const output = await pipe(image, question); - - const target = [{ answer: null }]; - compare(output, target); - }, - MAX_TEST_EXECUTION_TIME, - ); - }); - - afterAll(async () => { - await pipe?.dispose(); - }, MAX_MODEL_DISPOSE_TIME); - }); -}); diff --git a/tests/tokenizers.test.js b/tests/tokenizers.test.js index 00f58193d..943ce5898 100644 --- a/tests/tokenizers.test.js +++ b/tests/tokenizers.test.js @@ -1,13 +1,10 @@ import { AutoTokenizer } from "../src/tokenizers.js"; -import * as TOKENIZER_TESTS from "./models/all_tokenization_tests.js"; - -import { compare } from "./test_utils.js"; - -const MAX_LOAD_TIME = 10_000; -const MAX_EXECUTION_TIME = 10_000; +import { MAX_TOKENIZER_LOAD_TIME, MAX_TEST_EXECUTION_TIME } from "./init.js"; +import { compare, collect_tests } from "./test_utils.js"; +const TOKENIZER_TESTS = await collect_tests("tokenization"); describe("Tokenizers (model-specific)", () => { - for (const [tokenizer_name, { TOKENIZER_CLASS, TEST_CONFIG, CUSTOM_TESTS }] of Object.entries(TOKENIZER_TESTS)) { + for (const [tokenizer_name, { TOKENIZER_CLASS, TEST_CONFIG, CUSTOM_TESTS }] of TOKENIZER_TESTS) { describe(tokenizer_name, () => { for (const model_id in TEST_CONFIG) { describe(model_id, () => { @@ -15,7 +12,7 @@ describe("Tokenizers (model-specific)", () => { let tokenizer; beforeAll(async () => { tokenizer = await TOKENIZER_CLASS.from_pretrained(model_id); - }, MAX_LOAD_TIME); + }, MAX_TOKENIZER_LOAD_TIME); for (const [test_name, test_case] of Object.entries(TEST_CONFIG[model_id])) { test(test_name, () => { @@ -184,7 +181,7 @@ describe("Tokenizer padding/truncation", () => { ]); } }, - MAX_EXECUTION_TIME, + MAX_TEST_EXECUTION_TIME, ); }); @@ -218,7 +215,7 @@ describe("Token type ids", () => { compare(model_inputs, expected); }, - MAX_EXECUTION_TIME, + 
MAX_TEST_EXECUTION_TIME, ); it( @@ -255,7 +252,7 @@ describe("Token type ids", () => { compare(model_inputs, expected); } }, - MAX_EXECUTION_TIME, + MAX_TEST_EXECUTION_TIME, ); }); @@ -269,7 +266,7 @@ describe("Edge cases", () => { let encoded = tokenizer(text); expect(encoded.input_ids.data.length).toBeGreaterThan(100000); }, - MAX_EXECUTION_TIME, + MAX_TEST_EXECUTION_TIME, ); it("should not take too long", async () => { @@ -293,7 +290,7 @@ describe("Edge cases", () => { compare(token_ids, [109]); // Should not be [108, 108] } }, - MAX_EXECUTION_TIME, + MAX_TEST_EXECUTION_TIME, ); }); @@ -320,7 +317,7 @@ describe("Extra decoding tests", () => { expect(decoded3).toEqual(text); expect(decoded4).toEqual(text); }, - MAX_EXECUTION_TIME, + MAX_TEST_EXECUTION_TIME, ); }); diff --git a/tests/utils/hub.test.js b/tests/utils/hub.test.js index 19077f009..3ef3f41f7 100644 --- a/tests/utils/hub.test.js +++ b/tests/utils/hub.test.js @@ -1,6 +1,6 @@ import { AutoModel, PreTrainedModel } from "../../src/models.js"; -import { MAX_TEST_EXECUTION_TIME } from "../init.js"; +import { MAX_TEST_EXECUTION_TIME, DEFAULT_MODEL_OPTIONS } from "../init.js"; // TODO: Set cache folder to a temp directory @@ -10,7 +10,7 @@ describe("Hub", () => { "should load a model from the local cache", async () => { // 1. Local model exists (doesn't matter about status of remote file since local is tried first) - const model = await AutoModel.from_pretrained("hf-internal-testing/tiny-random-T5ForConditionalGeneration"); + const model = await AutoModel.from_pretrained("hf-internal-testing/tiny-random-T5ForConditionalGeneration", DEFAULT_MODEL_OPTIONS); expect(model).toBeInstanceOf(PreTrainedModel); }, MAX_TEST_EXECUTION_TIME, @@ -21,7 +21,7 @@ describe("Hub", () => { async () => { // 2. Local model doesn't exist, remote file exists // This tests that fallback functionality is working - const model = await AutoModel.from_pretrained("hf-internal-testing/tiny-random-T5ForConditionalGeneration"); + const model = await AutoModel.from_pretrained("hf-internal-testing/tiny-random-T5ForConditionalGeneration", DEFAULT_MODEL_OPTIONS); expect(model).toBeInstanceOf(PreTrainedModel); }, MAX_TEST_EXECUTION_TIME, @@ -32,7 +32,7 @@ describe("Hub", () => { async () => { // 3. Local model doesn't exist, remote file doesn't exist // This tests that error handling is working. 
- await expect(AutoModel.from_pretrained("hf-internal-testing/this-model-does-not-exist")).rejects.toBeInstanceOf(Error); + await expect(AutoModel.from_pretrained("hf-internal-testing/this-model-does-not-exist", DEFAULT_MODEL_OPTIONS)).rejects.toBeInstanceOf(Error); }, MAX_TEST_EXECUTION_TIME, ); diff --git a/tests/utils/image.test.js b/tests/utils/image.test.js new file mode 100644 index 000000000..7fc5d5b4a --- /dev/null +++ b/tests/utils/image.test.js @@ -0,0 +1,89 @@ +import { RawImage, rand } from "../../src/transformers.js"; +import { load_cached_image } from "../asset_cache.js"; + +const TEST_IMAGES = { + rgba: new RawImage(new Uint8ClampedArray([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23]), 2, 3, 4), + rgb: new RawImage(new Uint8ClampedArray([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17]), 2, 3, 3), + la: new RawImage(new Uint8ClampedArray([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]), 2, 3, 2), + l: new RawImage(new Uint8ClampedArray([0, 1, 2, 3, 4, 5]), 2, 3, 1), +}; + +describe("Image utilities", () => { + describe("Padding", () => { + it("should pad image", async () => { + /** @type {RawImage} */ + const padded_image = await load_cached_image("blue_image") + .then((image) => image.resize(224, 224)) + .then((image) => image.pad([128, 128, 128, 128])); + + expect(padded_image.size).toEqual([480, 480]); + + const avg = padded_image.data.reduce((acc, val) => acc + val, 0) / padded_image.data.length; + expect(avg).toBeCloseTo((224 * 224 * 255) / (3 * 480 * 480), 6); + }); + }); + + describe("Tensor to Image", () => { + it("should create an image from a tensor (CHW)", () => { + const tensor_chw = rand([3, 128, 256]).mul_(255).to("uint8"); + const image = RawImage.fromTensor(tensor_chw); + expect(image.size).toEqual([256, 128]); + }); + it("should create an image from a tensor (HWC)", () => { + const tensor_hwc = rand([128, 256, 3]).mul_(255).to("uint8"); + const image = RawImage.fromTensor(tensor_hwc, "HWC"); + expect(image.size).toEqual([256, 128]); + }); + }); + + describe("Channel conversions", () => { + it("should convert RGBA to L (grayscale)", async () => { + const grayscale = TEST_IMAGES.rgba.clone().grayscale(); + expect(grayscale.size).toEqual(TEST_IMAGES.rgba.size); + expect(grayscale.channels).toEqual(1); + }); + + it("should convert RGB to L (grayscale)", async () => { + const grayscale = TEST_IMAGES.rgb.clone().grayscale(); + expect(grayscale.size).toEqual(TEST_IMAGES.rgb.size); + expect(grayscale.channels).toEqual(1); + }); + + it("should convert L to RGB", async () => { + const rgb = TEST_IMAGES.l.clone().rgb(); + expect(rgb.size).toEqual(TEST_IMAGES.l.size); + expect(rgb.channels).toEqual(3); + }); + + it("should convert L to RGBA", async () => { + const rgba = TEST_IMAGES.l.clone().rgba(); + expect(rgba.size).toEqual(TEST_IMAGES.l.size); + expect(rgba.channels).toEqual(4); + }); + + it("should convert RGB to RGBA", async () => { + const rgba = TEST_IMAGES.rgb.clone().rgba(); + expect(rgba.size).toEqual(TEST_IMAGES.rgb.size); + expect(rgba.channels).toEqual(4); + }); + + it("should convert RGBA to RGB", async () => { + const rgb = TEST_IMAGES.rgba.clone().rgb(); + expect(rgb.size).toEqual(TEST_IMAGES.rgba.size); + expect(rgb.channels).toEqual(3); + }); + }); + + describe("putAlpha", () => { + it("should add alpha to RGB image", async () => { + const rgba = TEST_IMAGES.rgb.clone().putAlpha(TEST_IMAGES.l); + expect(rgba.size).toEqual(TEST_IMAGES.rgb.size); + expect(rgba.channels).toEqual(4); + }); + it("should add 
alpha to RGBA image", async () => { + const rgba = TEST_IMAGES.rgba.clone().putAlpha(TEST_IMAGES.l); + expect(rgba.size).toEqual(TEST_IMAGES.rgba.size); + expect(rgba.channels).toEqual(4); + }); + }); +});
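
Editor's note on the new auto-detection convention introduced by this patch: `collect_and_execute_tests` only picks up per-model files named `tests/models/<model_type>/test_<filename>_<model_type>.js` whose default export registers a suite. The sketch below is a hypothetical example (not part of this patch); the model type `my_model`, the suite title, the checkpoint id, and the assertion are illustrative placeholders only.

// tests/models/my_model/test_processor_my_model.js (hypothetical example)
import { AutoProcessor } from "../../../src/transformers.js";
import { MAX_MODEL_LOAD_TIME, MAX_TEST_EXECUTION_TIME } from "../../init.js";

export default () => {
  // The default export is passed straight to describe() by collect_and_execute_tests,
  // so it only needs to register the suite for this model type.
  describe("MyModelProcessor", () => {
    let processor;
    beforeAll(async () => {
      processor = await AutoProcessor.from_pretrained("hf-internal-testing/tiny-random-MyModel");
    }, MAX_MODEL_LOAD_TIME);

    it(
      "processes a simple input",
      async () => {
        const inputs = await processor("hello world");
        expect(inputs).toBeDefined();
      },
      MAX_TEST_EXECUTION_TIME,
    );
  });
};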