From 63d9c3b2724e400f122217f8fcbf9c0ac3db4517 Mon Sep 17 00:00:00 2001 From: Joshua Lochner Date: Sun, 15 Dec 2024 14:55:02 +0200 Subject: [PATCH] Improve unit test coverage (#1095) * Improve unit test auto-detection * Use default model options * Add mgp_str unit tests * Add janus processing unit tests * Add jina_clip processor unit tests * Fix typo in filename * Create `rand` tensor function * Add VitPose unit test * Add sam modelling unit test * Improve pipeline unit tests * Add image utilities unit testing * Add image segmentation pipeline unit tests * Add zero-shot classification pipeline unit test * Move pipeline unit tests to subfolder * Add instanceof checks * Add image feature extraction pipeline tests * Add feature extraction pipeline unit tests * Add zero-shot object detection pipeline unit tests * Add depth estimation pipeline unit test * Add automatic speech recognition pipeline unit test * Fix typo * Add text to audio pipeline unit tests * Add image to text pipeline unit test * Add image to image pipeline unit test * Add zero-shot audio classification pipeline unit test * Fix typo * Add summarization pipeline unit test * Add text2text generation unit test * Add text2text generation pipeline unit test * Remove unused variables --- src/utils/tensor.js | 14 + tests/asset_cache.js | 27 + tests/feature_extractors.test.js | 5 + tests/image_processors.test.js | 5 + tests/init.js | 6 +- tests/models.test.js | 17 +- tests/models/all_modeling_tests.js | 33 - tests/models/all_tokenization_tests.js | 22 - ...xtraction_audio_spectrogram_transformer.js | 53 ++ .../clap/test_feature_extraction_clap.js | 74 ++ .../florence2/test_processor_florence2.js | 222 +++++ tests/models/janus/test_processor_janus.js | 47 + .../jina_clip/test_processor_jina_clip.js | 44 + tests/models/mgp_str/test_modeling_mgp_str.js | 84 ++ .../models/musicgen/test_modeling_musicgen.js | 2 +- .../paligemma/test_processor_paligemma.js | 51 ++ .../models/qwen2_vl/test_modeling_qwen2_vl.js | 5 +- .../qwen2_vl/test_processor_qwen2_vl.js | 44 + tests/models/sam/test_modeling_sam.js | 48 + .../test_feature_extraction_seamless_m4t.js | 65 ++ .../test_modeling_vision_encoder_decoder.js | 5 +- .../vitpose/test_image_processing_vitpose.js | 50 ++ ...est_feature_extraction_wespeaker_resnet.js | 56 ++ .../test_feature_extraction_whisper.js | 33 + tests/pipelines.test.js | 5 +- .../test_pipelines_audio_classification.js | 80 ++ ..._pipelines_automatic_speech_recognition.js | 129 +++ .../test_pipelines_depth_estimation.js | 57 ++ ...t_pipelines_document_question_answering.js | 41 + .../test_pipelines_feature_extraction.js | 121 +++ tests/pipelines/test_pipelines_fill_mask.js | 100 +++ .../test_pipelines_image_classification.js | 81 ++ ...test_pipelines_image_feature_extraction.js | 51 ++ .../test_pipelines_image_segmentation.js | 119 +++ .../test_pipelines_image_to_image.js | 56 ++ .../pipelines/test_pipelines_image_to_text.js | 51 ++ .../test_pipelines_object_detection.js | 131 +++ .../test_pipelines_question_answering.js | 49 + .../pipelines/test_pipelines_summarization.js | 40 + .../test_pipelines_text2text_generation.js | 40 + .../test_pipelines_text_classification.js | 107 +++ .../test_pipelines_text_generation.js | 109 +++ .../pipelines/test_pipelines_text_to_audio.js | 37 + .../test_pipelines_token_classification.js | 157 ++++ tests/pipelines/test_pipelines_translation.js | 42 + tests/pipelines/test_pipelines_zero_shot.js | 100 +++ ...ipelines_zero_shot_audio_classification.js | 58 ++ 
...ipelines_zero_shot_image_classification.js | 98 ++ ...st_pipelines_zero_shot_object_detection.js | 134 +++ tests/processors.test.js | 522 +---------- tests/test_utils.js | 68 ++ tests/tiny_random.test.js | 841 ------------------ tests/tokenizers.test.js | 25 +- tests/utils/hub.test.js | 8 +- tests/utils/image.test.js | 89 ++ 55 files changed, 3099 insertions(+), 1459 deletions(-) create mode 100644 tests/feature_extractors.test.js create mode 100644 tests/image_processors.test.js delete mode 100644 tests/models/all_modeling_tests.js delete mode 100644 tests/models/all_tokenization_tests.js create mode 100644 tests/models/audio_spectrogram_transformer/test_feature_extraction_audio_spectrogram_transformer.js create mode 100644 tests/models/clap/test_feature_extraction_clap.js create mode 100644 tests/models/florence2/test_processor_florence2.js create mode 100644 tests/models/janus/test_processor_janus.js create mode 100644 tests/models/jina_clip/test_processor_jina_clip.js create mode 100644 tests/models/mgp_str/test_modeling_mgp_str.js create mode 100644 tests/models/paligemma/test_processor_paligemma.js create mode 100644 tests/models/qwen2_vl/test_processor_qwen2_vl.js create mode 100644 tests/models/sam/test_modeling_sam.js create mode 100644 tests/models/seamless_m4t/test_feature_extraction_seamless_m4t.js create mode 100644 tests/models/vitpose/test_image_processing_vitpose.js create mode 100644 tests/models/wespeaker_resnet/test_feature_extraction_wespeaker_resnet.js create mode 100644 tests/models/whisper/test_feature_extraction_whisper.js create mode 100644 tests/pipelines/test_pipelines_audio_classification.js create mode 100644 tests/pipelines/test_pipelines_automatic_speech_recognition.js create mode 100644 tests/pipelines/test_pipelines_depth_estimation.js create mode 100644 tests/pipelines/test_pipelines_document_question_answering.js create mode 100644 tests/pipelines/test_pipelines_feature_extraction.js create mode 100644 tests/pipelines/test_pipelines_fill_mask.js create mode 100644 tests/pipelines/test_pipelines_image_classification.js create mode 100644 tests/pipelines/test_pipelines_image_feature_extraction.js create mode 100644 tests/pipelines/test_pipelines_image_segmentation.js create mode 100644 tests/pipelines/test_pipelines_image_to_image.js create mode 100644 tests/pipelines/test_pipelines_image_to_text.js create mode 100644 tests/pipelines/test_pipelines_object_detection.js create mode 100644 tests/pipelines/test_pipelines_question_answering.js create mode 100644 tests/pipelines/test_pipelines_summarization.js create mode 100644 tests/pipelines/test_pipelines_text2text_generation.js create mode 100644 tests/pipelines/test_pipelines_text_classification.js create mode 100644 tests/pipelines/test_pipelines_text_generation.js create mode 100644 tests/pipelines/test_pipelines_text_to_audio.js create mode 100644 tests/pipelines/test_pipelines_token_classification.js create mode 100644 tests/pipelines/test_pipelines_translation.js create mode 100644 tests/pipelines/test_pipelines_zero_shot.js create mode 100644 tests/pipelines/test_pipelines_zero_shot_audio_classification.js create mode 100644 tests/pipelines/test_pipelines_zero_shot_image_classification.js create mode 100644 tests/pipelines/test_pipelines_zero_shot_object_detection.js delete mode 100644 tests/tiny_random.test.js create mode 100644 tests/utils/image.test.js diff --git a/src/utils/tensor.js b/src/utils/tensor.js index 6bdfd20a3..553e09e8f 100644 --- a/src/utils/tensor.js +++ b/src/utils/tensor.js @@ 
-1430,6 +1430,20 @@ export function zeros_like(tensor) { return zeros(tensor.dims); } +/** + * Returns a tensor filled with random numbers from a uniform distribution on the interval [0, 1) + * @param {number[]} size A sequence of integers defining the shape of the output tensor. + * @returns {Tensor} The random tensor. + */ +export function rand(size) { + const length = size.reduce((a, b) => a * b, 1); + return new Tensor( + "float32", + Float32Array.from({ length }, () => Math.random()), + size, + ) +} + /** * Quantizes the embeddings tensor to binary or unsigned binary precision. * @param {Tensor} tensor The tensor to quantize. diff --git a/tests/asset_cache.js b/tests/asset_cache.js index 8d62fb6bf..9d1182014 100644 --- a/tests/asset_cache.js +++ b/tests/asset_cache.js @@ -3,6 +3,7 @@ import { RawImage } from "../src/transformers.js"; const BASE_URL = "https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/"; const TEST_IMAGES = Object.freeze({ white_image: BASE_URL + "white-image.png", + blue_image: BASE_URL + "blue-image.png", pattern_3x3: BASE_URL + "pattern_3x3.png", pattern_3x5: BASE_URL + "pattern_3x5.png", checkerboard_8x8: BASE_URL + "checkerboard_8x8.png", @@ -21,8 +22,14 @@ const TEST_IMAGES = Object.freeze({ beetle: BASE_URL + "beetle.png", book_cover: BASE_URL + "book-cover.png", + corgi: BASE_URL + "corgi.jpg", + man_on_car: BASE_URL + "young-man-standing-and-leaning-on-car.jpg", }); +const TEST_AUDIOS = { + mlk: BASE_URL + "mlk.npy", +}; + /** @type {Map} */ const IMAGE_CACHE = new Map(); const load_image = async (url) => { @@ -35,9 +42,29 @@ const load_image = async (url) => { return image; }; +/** @type {Map} */ +const AUDIO_CACHE = new Map(); +const load_audio = async (url) => { + const cached = AUDIO_CACHE.get(url); + if (cached) { + return cached; + } + const buffer = await (await fetch(url)).arrayBuffer(); + const audio = Float32Array.from(new Float64Array(buffer)); + AUDIO_CACHE.set(url, audio); + return audio; +}; + /** * Load a cached image. * @param {keyof typeof TEST_IMAGES} name The name of the image to load. * @returns {Promise} The loaded image. */ export const load_cached_image = (name) => load_image(TEST_IMAGES[name]); + +/** + * Load a cached audio. + * @param {keyof typeof TEST_AUDIOS} name The name of the audio to load. + * @returns {Promise} The loaded audio. 
+ */ +export const load_cached_audio = (name) => load_audio(TEST_AUDIOS[name]); diff --git a/tests/feature_extractors.test.js b/tests/feature_extractors.test.js new file mode 100644 index 000000000..89ac48fa5 --- /dev/null +++ b/tests/feature_extractors.test.js @@ -0,0 +1,5 @@ +import { init } from "./init.js"; +import { collect_and_execute_tests } from "./test_utils.js"; + +init(); +await collect_and_execute_tests("Feature extractors", "feature_extraction"); diff --git a/tests/image_processors.test.js b/tests/image_processors.test.js new file mode 100644 index 000000000..07e867705 --- /dev/null +++ b/tests/image_processors.test.js @@ -0,0 +1,5 @@ +import { init } from "./init.js"; +import { collect_and_execute_tests } from "./test_utils.js"; + +init(); +await collect_and_execute_tests("Image processors", "image_processing"); diff --git a/tests/init.js b/tests/init.js index 29097eb47..93321529b 100644 --- a/tests/init.js +++ b/tests/init.js @@ -57,6 +57,8 @@ export function init() { registerBackend("test", onnxruntimeBackend, Number.POSITIVE_INFINITY); } +export const MAX_TOKENIZER_LOAD_TIME = 10_000; // 10 seconds +export const MAX_FEATURE_EXTRACTOR_LOAD_TIME = 10_000; // 10 seconds export const MAX_PROCESSOR_LOAD_TIME = 10_000; // 10 seconds export const MAX_MODEL_LOAD_TIME = 15_000; // 15 seconds export const MAX_TEST_EXECUTION_TIME = 60_000; // 60 seconds @@ -64,9 +66,9 @@ export const MAX_MODEL_DISPOSE_TIME = 1_000; // 1 second export const MAX_TEST_TIME = MAX_MODEL_LOAD_TIME + MAX_TEST_EXECUTION_TIME + MAX_MODEL_DISPOSE_TIME; -export const DEFAULT_MODEL_OPTIONS = { +export const DEFAULT_MODEL_OPTIONS = Object.freeze({ dtype: "fp32", -}; +}); expect.extend({ toBeCloseToNested(received, expected, numDigits = 2) { diff --git a/tests/models.test.js b/tests/models.test.js index a668baee4..ec52fc49d 100644 --- a/tests/models.test.js +++ b/tests/models.test.js @@ -2,13 +2,9 @@ * Test that models loaded outside of the `pipeline` function work correctly (e.g., `AutoModel.from_pretrained(...)`); */ -import * as MODEL_TESTS from "./models/all_modeling_tests.js"; - import { AutoTokenizer, AutoModel, BertModel, GPT2Model, T5ForConditionalGeneration, BertTokenizer, GPT2Tokenizer, T5Tokenizer } from "../src/transformers.js"; - -import { init, MAX_TEST_EXECUTION_TIME } from "./init.js"; - -import { compare } from "./test_utils.js"; +import { init, MAX_TEST_EXECUTION_TIME, DEFAULT_MODEL_OPTIONS } from "./init.js"; +import { compare, collect_and_execute_tests } from "./test_utils.js"; // Initialise the testing environment init(); @@ -38,7 +34,7 @@ describe("Loading different architecture types", () => { async () => { // Load model and tokenizer const tokenizer = await tokenizerClassToTest.from_pretrained(model_id); - const model = await modelClassToTest.from_pretrained(model_id, { dtype: "fp32" }); + const model = await modelClassToTest.from_pretrained(model_id, DEFAULT_MODEL_OPTIONS); const tests = [ texts[0], // single @@ -65,7 +61,6 @@ describe("Loading different architecture types", () => { throw new Error("Unexpected output"); } } - await model.dispose(); }, MAX_TEST_EXECUTION_TIME, @@ -74,8 +69,4 @@ describe("Loading different architecture types", () => { } }); -describe("Model-specific tests", () => { - for (const [modelName, modelTest] of Object.entries(MODEL_TESTS)) { - describe(modelName, modelTest); - } -}); +await collect_and_execute_tests("Model-specific tests", "modeling"); diff --git a/tests/models/all_modeling_tests.js b/tests/models/all_modeling_tests.js deleted file mode 100644 index 
0f64ec581..000000000 --- a/tests/models/all_modeling_tests.js +++ /dev/null @@ -1,33 +0,0 @@ -export { default as bert } from "./bert/test_modeling_bert.js"; -export { default as bloom } from "./bloom/test_modeling_bloom.js"; -export { default as clip } from "./clip/test_modeling_clip.js"; -export { default as codegen } from "./codegen/test_modeling_codegen.js"; -export { default as cohere } from "./cohere/test_modeling_cohere.js"; -export { default as florence2 } from "./florence2/test_modeling_florence2.js"; -export { default as gemma } from "./gemma/test_modeling_gemma.js"; -export { default as gemma2 } from "./gemma2/test_modeling_gemma2.js"; -export { default as gpt2 } from "./gpt2/test_modeling_gpt2.js"; -export { default as gpt_bigcode } from "./gpt_bigcode/test_modeling_gpt_bigcode.js"; -export { default as gpt_neo } from "./gpt_neo/test_modeling_gpt_neo.js"; -export { default as gpt_neox } from "./gpt_neox/test_modeling_gpt_neox.js"; -export { default as gptj } from "./gptj/test_modeling_gptj.js"; -export { default as granite } from "./granite/test_modeling_granite.js"; -export { default as idefics3 } from "./idefics3/test_modeling_idefics3.js"; -export { default as jais } from "./jais/test_modeling_jais.js"; -export { default as llama } from "./llama/test_modeling_llama.js"; -export { default as llava } from "./llava/test_modeling_llava.js"; -export { default as marian } from "./marian/test_modeling_marian.js"; -export { default as mistral } from "./mistral/test_modeling_mistral.js"; -export { default as mpt } from "./mpt/test_modeling_mpt.js"; -export { default as musicgen } from "./musicgen/test_modeling_musicgen.js"; -export { default as olmo } from "./olmo/test_modeling_olmo.js"; -export { default as olmo2 } from "./olmo2/test_modeling_olmo2.js"; -export { default as opt } from "./opt/test_modeling_opt.js"; -export { default as paligemma } from "./paligemma/test_modeling_paligemma.js"; -export { default as patchtsmixer } from "./patchtsmixer/test_modeling_patchtsmixer.js"; -export { default as patchtst } from "./patchtst/test_modeling_patchtst.js"; -export { default as pyannote } from "./pyannote/test_modeling_pyannote.js"; -export { default as qwen2_vl } from "./qwen2_vl/test_modeling_qwen2_vl.js"; -export { default as t5 } from "./t5/test_modeling_t5.js"; -export { default as vision_encoder_decoder } from "./vision_encoder_decoder/test_modeling_vision_encoder_decoder.js"; -export { default as whisper } from "./whisper/test_modeling_whisper.js"; diff --git a/tests/models/all_tokenization_tests.js b/tests/models/all_tokenization_tests.js deleted file mode 100644 index b9bac9d1f..000000000 --- a/tests/models/all_tokenization_tests.js +++ /dev/null @@ -1,22 +0,0 @@ -export * as AlbertTokenizer from "./albert/test_tokenization_albert.js"; -export * as BertTokenizer from "./bert/test_tokenization_bert.js"; -export * as BlenderbotSmallTokenizer from "./blenderbot_small/test_tokenization_blenderbot_small.js"; -export * as BloomTokenizer from "./bloom/test_tokenization_bloom.js"; -export * as CLIPTokenizer from "./clip/test_tokenization_clip.js"; -export * as DebertaV2Tokenizer from "./deberta_v2/test_tokenization_deberta_v2.js"; -export * as DistilBertTokenizer from "./distilbert/test_tokenization_distilbert.js"; -export * as EsmTokenizer from "./esm/test_tokenization_esm.js"; -export * as FalconTokenizer from "./falcon/test_tokenization_falcon.js"; -export * as GPT2Tokenizer from "./gpt2/test_tokenization_gpt2.js"; -export * as GemmaTokenizer from 
"./gemma/test_tokenization_gemma.js"; -export * as LlamaTokenizer from "./llama/test_tokenization_llama.js"; -export * as M2M100Tokenizer from "./m2m_100/test_tokenization_m2m_100.js"; -export * as MPNetTokenizer from "./mpnet/test_tokenization_mpnet.js"; -export * as NllbTokenizer from "./nllb/test_tokenization_nllb.js"; -export * as Qwen2Tokenizer from "./qwen2/test_tokenization_qwen2.js"; -export * as RobertaTokenizer from "./roberta/test_tokenization_roberta.js"; -export * as T5Tokenizer from "./t5/test_tokenization_t5.js"; -export * as VitsTokenizer from "./vits/test_tokenization_vits.js"; -export * as Wav2Vec2CTCTokenizer from "./wav2vec2/test_tokenization_wav2vec2.js"; -export * as WhisperTokenizer from "./whisper/test_tokenization_whisper.js"; -export * as XLMRobertaTokenizer from "./xlm_roberta/test_tokenization_xlm_roberta.js"; diff --git a/tests/models/audio_spectrogram_transformer/test_feature_extraction_audio_spectrogram_transformer.js b/tests/models/audio_spectrogram_transformer/test_feature_extraction_audio_spectrogram_transformer.js new file mode 100644 index 000000000..0fced5b01 --- /dev/null +++ b/tests/models/audio_spectrogram_transformer/test_feature_extraction_audio_spectrogram_transformer.js @@ -0,0 +1,53 @@ +import { AutoFeatureExtractor, ASTFeatureExtractor } from "../../../src/transformers.js"; + +import { load_cached_audio } from "../../asset_cache.js"; +import { MAX_FEATURE_EXTRACTOR_LOAD_TIME, MAX_TEST_EXECUTION_TIME } from "../../init.js"; + +export default () => { + // ASTFeatureExtractor + describe("ASTFeatureExtractor", () => { + const model_id = "Xenova/ast-finetuned-audioset-10-10-0.4593"; + + /** @type {ASTFeatureExtractor} */ + let feature_extractor; + beforeAll(async () => { + feature_extractor = await AutoFeatureExtractor.from_pretrained(model_id); + }, MAX_FEATURE_EXTRACTOR_LOAD_TIME); + + it( + "truncation", + async () => { + const audio = await load_cached_audio("mlk"); + const { input_values } = await feature_extractor(audio); + expect(input_values.dims).toEqual([1, 1024, 128]); + + expect(input_values.mean().item()).toBeCloseTo(-0.04054912979309085); + expect(input_values.data[0]).toBeCloseTo(-0.5662586092948914); + expect(input_values.data[1]).toBeCloseTo(-1.0300861597061157); + expect(input_values.data[129]).toBeCloseTo(-1.084834098815918); + expect(input_values.data[1025]).toBeCloseTo(-1.1204065084457397); + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + "padding", + async () => { + const audio = await load_cached_audio("mlk"); + const { input_values } = await feature_extractor(audio.slice(0, 1000)); + expect(input_values.dims).toEqual([1, 1024, 128]); // [1, 4, 128] -> (padded to) -> [1, 1024, 128] + + expect(input_values.mean().item()).toBeCloseTo(0.4647964835166931); + expect(input_values.data[0]).toBeCloseTo(-0.5662586092948914); + expect(input_values.data[1]).toBeCloseTo(-1.0300861597061157); + expect(input_values.data[129]).toBeCloseTo(-1.084834098815918); + + // padded values + expect(input_values.data[1025]).toBeCloseTo(0.46703237295150757); + expect(input_values.data[2049]).toBeCloseTo(0.46703237295150757); + expect(input_values.data[10000]).toBeCloseTo(0.46703237295150757); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); +}; diff --git a/tests/models/clap/test_feature_extraction_clap.js b/tests/models/clap/test_feature_extraction_clap.js new file mode 100644 index 000000000..16991c186 --- /dev/null +++ b/tests/models/clap/test_feature_extraction_clap.js @@ -0,0 +1,74 @@ +import { AutoFeatureExtractor, ClapFeatureExtractor } from 
"../../../src/transformers.js"; + +import { load_cached_audio } from "../../asset_cache.js"; +import { MAX_FEATURE_EXTRACTOR_LOAD_TIME, MAX_TEST_EXECUTION_TIME } from "../../init.js"; + +export default () => { + // ClapFeatureExtractor + describe("ClapFeatureExtractor", () => { + const model_id = "Xenova/clap-htsat-unfused"; + + /** @type {ClapFeatureExtractor} */ + let feature_extractor; + beforeAll(async () => { + feature_extractor = await AutoFeatureExtractor.from_pretrained(model_id); + }, MAX_FEATURE_EXTRACTOR_LOAD_TIME); + + it( + "truncation", + async () => { + const audio = await load_cached_audio("mlk"); + + // Since truncation uses a random strategy, we override + // Math.random to ensure that the test is deterministic + const originalRandom = Math.random; + Math.random = () => 0.5; + + let long_audio = new Float32Array(500000); + long_audio.set(audio); + long_audio.set(audio, long_audio.length - audio.length); + + const { input_features } = await feature_extractor(long_audio); + const { dims, data } = input_features; + expect(dims).toEqual([1, 1, 1001, 64]); + + expect(input_features.mean().item()).toBeCloseTo(-37.94569396972656); + expect(data[0]).toBeCloseTo(-53.32647705078125); + expect(data[1]).toBeCloseTo(-47.76755142211914); + expect(data[65]).toBeCloseTo(-36.32261276245117); + expect(data[1002]).toBeCloseTo(-28.0314884185791); + expect(data[10000]).toBeCloseTo(-21.905902862548828); + expect(data[60000]).toBeCloseTo(-14.877863883972168); + expect(data[64062]).toBeCloseTo(-37.9784049987793); + expect(data[64063]).toBeCloseTo(-37.73963928222656); + + // Reset Math.random + Math.random = originalRandom; + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + "padding", + async () => { + const audio = await load_cached_audio("mlk"); + const { input_features } = await feature_extractor(audio); + const { data, dims } = input_features; + expect(dims).toEqual([1, 1, 1001, 64]); + + expect(input_features.mean().item()).toBeCloseTo(-34.99049377441406); + expect(data[0]).toBeCloseTo(-21.32573890686035); + expect(data[1]).toBeCloseTo(-26.168411254882812); + expect(data[65]).toBeCloseTo(-29.716018676757812); + expect(data[1002]).toBeCloseTo(-32.16273498535156); + expect(data[10000]).toBeCloseTo(-19.9283390045166); + + // padded values + expect(data[60000]).toBeCloseTo(-100.0); + expect(data[64062]).toBeCloseTo(-100.0); + expect(data[64063]).toBeCloseTo(-100.0); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); +}; diff --git a/tests/models/florence2/test_processor_florence2.js b/tests/models/florence2/test_processor_florence2.js new file mode 100644 index 000000000..5d4ff2faf --- /dev/null +++ b/tests/models/florence2/test_processor_florence2.js @@ -0,0 +1,222 @@ +import { AutoProcessor, Florence2Processor } from "../../../src/transformers.js"; +import { MAX_TEST_EXECUTION_TIME, MAX_PROCESSOR_LOAD_TIME } from "../../init.js"; +import { load_cached_image } from "../../asset_cache.js"; +export default () => { + describe("FlorenceProcessor", () => { + const model_id = "Xenova/tiny-random-Florence2ForConditionalGeneration"; + + /** @type {Florence2Processor} */ + let processor; + let images = {}; + + beforeAll(async () => { + processor = await AutoProcessor.from_pretrained(model_id); + images = { + beetle: await load_cached_image("beetle"), + book_cover: await load_cached_image("book_cover"), + }; + }, MAX_PROCESSOR_LOAD_TIME); + + describe("Prompt construction", () => { + it( + "Construct prompt", + async () => { + const text = ""; + const prompts = processor.construct_prompts(text); + const target = 
["Locate the objects with category name in the image."]; + expect(prompts).toEqual(target); + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + "Construct prompts", + async () => { + const texts = ["", "Locate the objects with category name in the image.", "cat"]; + const prompts = processor.construct_prompts(texts); + const target = ["Describe with a paragraph what is shown in the image.", "Locate the objects with category name in the image.", "Locate cat in the image."]; + expect(prompts).toEqual(target); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); + + describe("Post-process generation", () => { + const TESTS = [ + { + task: "", + generated_text: "A green car parked in front of a yellow building.", + target: { "": "A green car parked in front of a yellow building." }, + image: "beetle", + }, + { + task: "", + generated_text: "The image shows a green Volkswagen Beetle parked in front of a yellow building with two brown doors. The sky is a mix of blue and white, and there are a few green trees in the background.", + target: { "": "The image shows a green Volkswagen Beetle parked in front of a yellow building with two brown doors. The sky is a mix of blue and white, and there are a few green trees in the background." }, + image: "beetle", + }, + { + task: "", + generated_text: "The image shows a vintage Volkswagen Beetle car parked on a cobblestone street in front of a yellow building with two wooden doors. The car is painted in a bright turquoise color and has a white stripe running along the side. It has two doors on either side of the car, one on top of the other, and a small window on the front. The building appears to be old and dilapidated, with peeling paint and crumbling walls. The sky is blue and there are trees in the background.", + target: { "": "The image shows a vintage Volkswagen Beetle car parked on a cobblestone street in front of a yellow building with two wooden doors. The car is painted in a bright turquoise color and has a white stripe running along the side. It has two doors on either side of the car, one on top of the other, and a small window on the front. The building appears to be old and dilapidated, with peeling paint and crumbling walls. The sky is blue and there are trees in the background." 
}, + image: "beetle", + }, + { + task: "", + generated_text: "cardoorwheel", + target: { + "": { + bboxes: [ + [34.24, 160.08, 597.44, 371.76], + [456.0, 97.68, 580.16, 261.84], + [450.88, 276.72, 554.56, 370.8], + [95.68, 280.56, 198.72, 371.28], + ], + labels: ["car", "door", "wheel", "wheel"], + }, + }, + image: "beetle", + }, + { + task: "", + generated_text: "turquoise Volkswagen Beetlewheel", + target: { + "": { + bboxes: [ + [33.6, 160.08, 596.8, 371.76], + [450.88, 276.72, 553.28, 370.8], + [95.04, 280.56, 197.44, 371.28], + ], + labels: ["turquoise Volkswagen Beetle", "wheel", "wheel"], + }, + }, + image: "beetle", + }, + { + task: "", + generated_text: "", + target: { + "": { + bboxes: [ + [33.6, 160.08, 596.8, 371.76], + [455.36, 97.68, 579.52, 261.84], + [450.88, 276.72, 553.28, 370.8], + [95.04, 280.56, 198.08, 371.28], + [226.88, 88.56, 332.48, 164.4], + [65.6, 266.64, 86.72, 295.92], + [271.68, 241.68, 302.4, 246.96], + [408.0, 308.4, 413.76, 320.88], + ], + labels: ["", "", "", "", "", "", "", ""], + }, + }, + image: "beetle", + }, + { + task: "", + text_input: "A green car parked in front of a yellow building.", + generated_text: "A green cara yellow building", + target: { + "": { + bboxes: [ + [34.88, 158.64, 583.36, 374.64], + [0.32, 4.08, 639.04, 305.04], + ], + labels: ["A green car", "a yellow building"], + }, + }, + image: "beetle", + }, + // { + // task: "", + // text_input: "a green car", + // generated_text: "", + // target: { + // '': { + // polygons: [[[[178.88, 181.68, 180.8, 180.72, 182.72, 180.72, 187.84, 177.84, 189.76, 177.84, 192.96, 175.92, 194.88, 175.92, 198.08, 174, 200.64, 173.04, 203.84, 172.08, 207.04, 170.64, 209.6, 169.68, 214.08, 168.72, 217.92, 167.76, 221.76, 166.8, 226.24, 165.84, 230.72, 164.88, 237.12, 163.92, 244.16, 162.96, 253.12, 162, 265.28, 161.04, 311.36, 161.04, 329.28, 162, 338.24, 162.96, 345.28, 163.92, 350.4, 164.88, 354.24, 165.84, 358.72, 166.8, 362.56, 167.76, 366.4, 168.72, 370.24, 169.68, 373.44, 170.64, 375.36, 172.08, 377.28, 174, 379.2, 176.88, 380.48, 179.76, 382.4, 181.68, 384.32, 185.04, 386.24, 187.92, 387.52, 190.8, 389.44, 192.72, 390.08, 196.08, 392, 198.96, 394.56, 201.84, 396.48, 204.72, 398.4, 208.08, 403.52, 212.88, 406.08, 213.84, 409.28, 216.72, 412.48, 220.08, 431.68, 220.08, 432.32, 221.04, 442.56, 222, 456.64, 222, 465.6, 222.96, 472.64, 223.92, 478.4, 224.88, 484.8, 225.84, 489.92, 226.8, 493.76, 227.76, 497.6, 228.72, 501.44, 229.68, 504.64, 231.12, 507.84, 232.08, 510.4, 233.04, 513.6, 234, 516.8, 235.92, 518.72, 235.92, 523.84, 238.8, 525.76, 238.8, 527.68, 239.76, 529.6, 241.68, 532.8, 242.64, 536, 245.04, 538.56, 247.92, 541.76, 249.84, 545.6, 251.76, 548.8, 252.72, 550.72, 252.72, 553.92, 253.68, 556.48, 255.6, 558.4, 255.6, 564.8, 258.96, 566.72, 260.88, 568.64, 260.88, 570.56, 261.84, 572.48, 263.76, 573.76, 265.68, 574.4, 268.56, 574.4, 271.92, 573.76, 272.88, 572.48, 275.76, 572.48, 279.6, 573.76, 285.84, 574.4, 286.8, 575.68, 289.68, 576.32, 292.56, 577.6, 298.8, 577.6, 301.68, 576.32, 302.64, 575.68, 310.8, 575.68, 312.72, 576.32, 313.68, 577.6, 316.56, 577.6, 320.88, 574.4, 321.84, 568.64, 322.8, 559.68, 322.8, 553.92, 323.76, 552.64, 332.88, 552, 336.72, 550.72, 339.6, 550.08, 342.96, 548.8, 344.88, 546.88, 346.8, 545.6, 349.68, 543.68, 352.56, 541.76, 355.92, 534.72, 362.64, 531.52, 364.56, 525.76, 367.92, 522.56, 368.88, 518.72, 369.84, 495.68, 369.84, 489.92, 368.88, 486.72, 367.92, 483.52, 366.96, 479.68, 364.56, 476.48, 362.64, 472.64, 359.76, 465.6, 352.56, 463.68, 349.68, 461.76, 
346.8, 460.48, 344.88, 460.48, 342.96, 458.56, 339.6, 457.92, 336.72, 457.92, 334.8, 456.64, 332.88, 454.72, 330.96, 452.8, 331.92, 448.32, 336.72, 446.4, 337.68, 426.56, 336.72, 424.64, 336.72, 423.36, 337.68, 420.8, 338.64, 414.4, 339.6, 412.48, 339.6, 411.2, 338.64, 380.48, 337.68, 217.28, 337.68, 216, 338.64, 210.88, 339.6, 207.04, 339.6, 203.84, 338.64, 201.92, 337.68, 200, 335.76, 198.08, 334.8, 194.88, 334.8, 192.96, 336.72, 191.68, 338.64, 191.68, 340.56, 191.04, 342.96, 189.12, 344.88, 187.84, 347.76, 185.92, 349.68, 184.64, 352.56, 182.72, 355.92, 176.96, 361.68, 173.76, 363.6, 170.56, 365.52, 166.72, 367.92, 163.52, 368.88, 160.96, 369.84, 153.92, 370.8, 131.52, 370.8, 127.68, 369.84, 124.48, 368.88, 118.72, 365.52, 115.52, 363.6, 111.68, 360.72, 106.56, 355.92, 104.64, 352.56, 103.36, 349.68, 101.44, 347.76, 100.8, 345.84, 99.52, 342.96, 99.52, 339.6, 98.88, 337.68, 95.68, 334.8, 93.76, 333.84, 86.72, 333.84, 80.32, 334.8, 79.68, 335.76, 74.56, 336.72, 66.24, 336.72, 63.68, 334.8, 53.44, 334.8, 50.24, 333.84, 48.32, 331.92, 48.32, 328.56, 50.24, 326.64, 51.52, 324.72, 51.52, 322.8, 44.48, 321.84, 40.64, 320.88, 38.72, 319.92, 37.44, 317.52, 36.16, 313.68, 36.16, 306.96, 38.72, 304.56, 42.56, 303.6, 46.4, 302.64, 55.36, 301.68, 65.6, 301.68, 67.52, 300.72, 69.44, 298.8, 70.72, 296.88, 70.72, 292.56, 69.44, 291.6, 68.8, 288.72, 67.52, 284.88, 67.52, 276.72, 68.8, 273.84, 69.44, 271.92, 72.64, 268.56, 74.56, 267.6, 77.76, 266.64, 79.68, 266.64, 81.6, 264.72, 80.32, 260.88, 81.6, 258.96, 83.52, 256.56, 88.64, 256.56, 90.56, 255.6, 92.48, 253.68, 92.48, 252.72, 97.6, 246.96, 114.88, 229.68, 117.44, 226.8, 122.56, 222.96, 125.76, 221.04, 126.4, 221.04, 129.6, 219.12, 133.44, 215.76, 138.56, 211.92, 143.68, 208.08, 149.44, 201.84, 153.92, 198.96, 154.56, 198.96, 157.76, 197.04, 162.88, 192.72, 168.64, 186.96, 171.84, 185.04, 176.96, 183.12, 178.88, 180.72]]]], + // labels: [''], + // } + // }, + // image: 'beetle', + // }, + // { + // task: "", + // text_input: "", + // generated_text: "", + // target: { + // '': { + // polygons: [[[[470.08, 288.24, 473.92, 285.36, 477.12, 283.44, 479.04, 282.48, 480.96, 282.48, 484.16, 280.56, 486.72, 279.6, 489.92, 278.64, 495.04, 277.68, 512.32, 277.68, 514.88, 278.64, 518.08, 279.6, 521.28, 281.52, 523.2, 281.52, 525.12, 283.44, 528.32, 284.4, 530.88, 286.32, 534.08, 288.24, 543.04, 297.36, 544.96, 300.24, 546.88, 303.12, 550.08, 309.36, 551.36, 312.24, 552, 315.12, 553.28, 319.44, 553.28, 332.4, 552, 337.2, 551.36, 340.08, 550.08, 343.44, 548.16, 347.28, 546.24, 350.16, 544.32, 353.04, 541.12, 357.36, 537.28, 361.2, 532.16, 365.04, 528.96, 366.96, 527.04, 367.92, 523.84, 368.88, 521.28, 369.84, 516.16, 371.28, 500.8, 371.28, 491.84, 369.84, 488, 368.88, 484.8, 367.92, 479.04, 365.04, 475.84, 363.12, 472, 360.24, 464.96, 353.04, 463.04, 350.16, 461.12, 347.28, 459.84, 345.36, 459.84, 343.44, 457.92, 340.08, 456.64, 337.2, 456, 334.32, 454.72, 330.48, 454.72, 316.08, 456, 311.28, 456.64, 307.44, 457.92, 304.08, 459.84, 301.2, 459.84, 299.28, 461.12, 297.36, 463.04, 294.48]]]], + // labels: [''], + // } + // }, + // image: 'beetle', + // }, + // { + // task: "", + // text_input: "a green car", + // generated_text: "a green car", + // target: { + // '': { + // bboxes: [[34.24, 158.64, 582.72, 374.16]], + // bboxes_labels: ['a green car'], + // polygons: [], + // polygons_labels: [], + // } + // }, + // image: 'beetle', + // }, + { + task: "", + text_input: "", + generated_text: "car", + target: { "": "car" }, + image: "beetle", + }, + { + task: "", + 
text_input: "", + generated_text: "turquoise Volkswagen Beetle", + target: { "": "turquoise Volkswagen Beetle" }, + image: "beetle", + }, + { + task: "", + generated_text: "CUDAFOR ENGINEERSAn Introduction to High-PerformanceParallel ComputingDUANE STORTIMETE YURTOGLU", + target: { "": "CUDAFOR ENGINEERSAn Introduction to High-PerformanceParallel ComputingDUANE STORTIMETE YURTOGLU" }, + image: "book_cover", + }, + { + task: "", + generated_text: "CUDAFOR ENGINEERSAn Introduction to High-PerformanceParallel ComputingDUANE STORTIMETE YURTOGLU", + target: { + "": { + quad_boxes: [ + [167.0435028076172, 50.25, 375.7974853515625, 50.25, 375.7974853515625, 114.75, 167.0435028076172, 114.75], + [144.8784942626953, 120.75, 375.7974853515625, 120.75, 375.7974853515625, 149.25, 144.8784942626953, 149.25], + [115.86249542236328, 165.25, 376.6034851074219, 166.25, 376.6034851074219, 184.25, 115.86249542236328, 183.25], + [239.9864959716797, 184.25, 376.6034851074219, 186.25, 376.6034851074219, 204.25, 239.9864959716797, 202.25], + [266.1814880371094, 441.25, 376.6034851074219, 441.25, 376.6034851074219, 456.25, 266.1814880371094, 456.25], + [252.0764923095703, 460.25, 376.6034851074219, 460.25, 376.6034851074219, 475.25, 252.0764923095703, 475.25], + ], + + // NOTE: Python version has a bug here, it should be "CUDA" instead of "CUDA" + labels: [/* 'CUDA' */ "CUDA", "FOR ENGINEERS", "An Introduction to High-Performance", "Parallel Computing", "DUANE STORTI", "METE YURTOGLU"], + }, + }, + image: "book_cover", + }, + ]; + + for (const { task, generated_text, target, image } of TESTS) { + it( + task, + () => { + const result = processor.post_process_generation(generated_text, task, images[image].size); + expect(result).toBeCloseToNested(target, 4); + }, + MAX_TEST_EXECUTION_TIME, + ); + } + }); + }); +}; diff --git a/tests/models/janus/test_processor_janus.js b/tests/models/janus/test_processor_janus.js new file mode 100644 index 000000000..3092fb987 --- /dev/null +++ b/tests/models/janus/test_processor_janus.js @@ -0,0 +1,47 @@ +import { AutoProcessor, VLChatProcessor } from "../../../src/transformers.js"; + +import { MAX_PROCESSOR_LOAD_TIME, MAX_TEST_EXECUTION_TIME } from "../../init.js"; + +export default () => { + describe("VLChatProcessor", () => { + const model_id = "onnx-community/Janus-1.3B-ONNX"; + + /** @type {VLChatProcessor} */ + let processor; + beforeAll(async () => { + processor = await AutoProcessor.from_pretrained(model_id); + }, MAX_PROCESSOR_LOAD_TIME); + + it( + "Image and text", + async () => { + // Prepare inputs + const conversation = [ + { + role: "User", + content: "\nConvert the formula into latex code.", + images: ["https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/quadratic_formula.png"], + }, + ]; + + const { input_ids, attention_mask, images_seq_mask, images_emb_mask, pixel_values, original_sizes, reshaped_input_sizes } = await processor(conversation); + const num_tokens = 631; + const { num_image_tokens } = processor.config; // 576 + const { image_size } = processor.image_processor.config; // 384 + + expect(input_ids.dims).toEqual([1, num_tokens]); + expect(attention_mask.dims).toEqual([1, num_tokens]); + expect(images_seq_mask.dims).toEqual([1, num_tokens]); + expect(images_seq_mask.to("float32").mean().item()).toBeCloseTo(num_image_tokens / num_tokens, 6); + expect(images_emb_mask.dims).toEqual([1, 1, num_image_tokens]); + expect(images_emb_mask.to("float32").mean().item()).toBeCloseTo(1); + expect(pixel_values.dims).toEqual([1, 1, 3, image_size, 
image_size]); + expect(pixel_values.mean().item()).toBeCloseTo(0.5999642610549927, 6); + + expect(original_sizes).toEqual([[206, 767]]); + expect(reshaped_input_sizes).toEqual([[103, image_size]]); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); +}; diff --git a/tests/models/jina_clip/test_processor_jina_clip.js b/tests/models/jina_clip/test_processor_jina_clip.js new file mode 100644 index 000000000..47ac16fe8 --- /dev/null +++ b/tests/models/jina_clip/test_processor_jina_clip.js @@ -0,0 +1,44 @@ +import { AutoProcessor, JinaCLIPProcessor } from "../../../src/transformers.js"; +import { load_cached_image } from "../../asset_cache.js"; + +import { MAX_PROCESSOR_LOAD_TIME, MAX_TEST_EXECUTION_TIME } from "../../init.js"; + +export default () => { + describe("JinaCLIPProcessor", () => { + const model_id = "jinaai/jina-clip-v2"; + + /** @type {JinaCLIPProcessor} */ + let processor; + beforeAll(async () => { + processor = await AutoProcessor.from_pretrained(model_id); + }, MAX_PROCESSOR_LOAD_TIME); + + it( + "Image and text", + async () => { + // Prepare inputs + const images = [await load_cached_image("white_image"), await load_cached_image("blue_image")]; + const sentences = [ + "غروب جميل على الشاطئ", // Arabic + "海滩上美丽的日落", // Chinese + "Un beau coucher de soleil sur la plage", // French + "Ein wunderschöner Sonnenuntergang am Strand", // German + "Ένα όμορφο ηλιοβασίλεμα πάνω από την παραλία", // Greek + "समुद्र तट पर एक खूबसूरत सूर्यास्त", // Hindi + "Un bellissimo tramonto sulla spiaggia", // Italian + "浜辺に沈む美しい夕日", // Japanese + "해변 위로 아름다운 일몰", // Korean + ]; + + // Encode text and images + const { input_ids, attention_mask, pixel_values } = await processor(sentences, images, { padding: true, truncation: true }); + + expect(input_ids.dims).toEqual([sentences.length, 19]); + expect(attention_mask.dims).toEqual([sentences.length, 19]); + expect(pixel_values.dims).toEqual([images.length, 3, 512, 512]); + expect(pixel_values.mean().item()).toBeCloseTo(0.7857685685157776, 6); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); +}; diff --git a/tests/models/mgp_str/test_modeling_mgp_str.js b/tests/models/mgp_str/test_modeling_mgp_str.js new file mode 100644 index 000000000..b0122f03b --- /dev/null +++ b/tests/models/mgp_str/test_modeling_mgp_str.js @@ -0,0 +1,84 @@ +import { MgpstrProcessor, MgpstrForSceneTextRecognition } from "../../../src/transformers.js"; + +import { load_cached_image } from "../../asset_cache.js"; +import { MAX_MODEL_LOAD_TIME, MAX_TEST_EXECUTION_TIME, MAX_MODEL_DISPOSE_TIME, DEFAULT_MODEL_OPTIONS } from "../../init.js"; + +export default () => { + describe("MgpstrForSceneTextRecognition", () => { + const model_id = "onnx-community/tiny-random-MgpstrForSceneTextRecognition"; + /** @type {MgpstrForSceneTextRecognition} */ + let model; + /** @type {MgpstrProcessor} */ + let processor; + beforeAll(async () => { + model = await MgpstrForSceneTextRecognition.from_pretrained(model_id, DEFAULT_MODEL_OPTIONS); + processor = await MgpstrProcessor.from_pretrained(model_id); + }, MAX_MODEL_LOAD_TIME); + + const TARGETS = { + white_image: { + generated_text: ["mmmmmmmmmmmmmmmmmmmmmmmmmm"], + scores: [3.5553885547065065e-27], + char_preds: ["mmmmmmmmmmmmmmmmmmmmmmmmmm"], + bpe_preds: ["wwwwwwwwwwwwwwwwwwwwwwwwww"], + wp_preds: ["[unused65][unused65][unused65][unused65][unused65][unused65][unused65][unused65][unused65][unused65][unused65][unused65][unused65][unused65][unused65][unused65][unused65][unused65][unused65][unused65][unused65][unused65][unused65][unused65][unused65][unused65]"], + }, + 
blue_image: { + generated_text: ["11111111111111111111111111"], + scores: [9.739909092663214e-32], + char_preds: ["11111111111111111111111111"], + bpe_preds: ["22222222222222222222222222"], + wp_preds: ["[unused59][unused59][unused59][unused59][unused59][unused59][unused59][unused59][unused59][unused59][unused59][unused59][unused59][unused59][unused59][unused59][unused59][unused59][unused59][unused59][unused59][unused59][unused59][unused59][unused59][unused59]"], + }, + }; + + it( + "batch_size=1", + async () => { + const image_id = "white_image"; + const image = await load_cached_image(image_id); + + const inputs = await processor(image); + const outputs = await model(inputs); + + const { max_token_length, num_character_labels, num_bpe_labels, num_wordpiece_labels } = model.config; + expect(outputs.char_logits.dims).toEqual([1, /* 27 */ max_token_length, /* 38 */ num_character_labels]); + expect(outputs.bpe_logits.dims).toEqual([1, /* 27 */ max_token_length, /* 99 */ num_bpe_labels]); + expect(outputs.wp_logits.dims).toEqual([1, /* 27 */ max_token_length, /* 99 */ num_wordpiece_labels]); + + const decoded = processor.batch_decode(outputs.logits); + expect(decoded).toBeCloseToNested(TARGETS[image_id]); + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + "batch_size>1", + async () => { + const image_ids = ["white_image", "blue_image"]; + const images = await Promise.all(image_ids.map((image_id) => load_cached_image(image_id))); + + const inputs = await processor(images); + const outputs = await model(inputs); + + const { max_token_length, num_character_labels, num_bpe_labels, num_wordpiece_labels } = model.config; + expect(outputs.char_logits.dims).toEqual([images.length, /* 27 */ max_token_length, /* 38 */ num_character_labels]); + expect(outputs.bpe_logits.dims).toEqual([images.length, /* 27 */ max_token_length, /* 99 */ num_bpe_labels]); + expect(outputs.wp_logits.dims).toEqual([images.length, /* 27 */ max_token_length, /* 99 */ num_wordpiece_labels]); + + const decoded = processor.batch_decode(outputs.logits); + const target = image_ids.reduce((acc, image_id) => { + for (const key in TARGETS[image_id]) (acc[key] ??= []).push(...TARGETS[image_id][key]); + return acc; + }, {}); + + expect(decoded).toBeCloseToNested(target); + }, + MAX_TEST_EXECUTION_TIME, + ); + + afterAll(async () => { + await model?.dispose(); + }, MAX_MODEL_DISPOSE_TIME); + }); +}; diff --git a/tests/models/musicgen/test_modeling_musicgen.js b/tests/models/musicgen/test_modeling_musicgen.js index 7ebf808ed..e16cf022b 100644 --- a/tests/models/musicgen/test_modeling_musicgen.js +++ b/tests/models/musicgen/test_modeling_musicgen.js @@ -27,7 +27,7 @@ export default () => { const decoder_input_ids = full([inputs.input_ids.dims[0] * model.config.decoder.num_codebooks, 1], pad_token_id); const { logits } = await model({ ...inputs, decoder_input_ids }); expect(logits.dims).toEqual([8, 1, 99]); - expect(logits.mean().item()).toBeCloseTo(-0.0018370470497757196, 5); + expect(logits.mean().item()).toBeCloseTo(-0.0018370470497757196, 4); }, MAX_TEST_EXECUTION_TIME, ); diff --git a/tests/models/paligemma/test_processor_paligemma.js b/tests/models/paligemma/test_processor_paligemma.js new file mode 100644 index 000000000..4096c5f64 --- /dev/null +++ b/tests/models/paligemma/test_processor_paligemma.js @@ -0,0 +1,51 @@ +import { AutoProcessor, PaliGemmaProcessor } from "../../../src/transformers.js"; + +import { load_cached_image } from "../../asset_cache.js"; +import { MAX_PROCESSOR_LOAD_TIME, MAX_TEST_EXECUTION_TIME } from 
"../../init.js"; + +export default () => { + const model_id = "hf-internal-testing/tiny-random-PaliGemmaForConditionalGeneration"; + + describe("PaliGemmaProcessor", () => { + /** @type {PaliGemmaProcessor} */ + let processor; + let images = {}; + + beforeAll(async () => { + processor = await AutoProcessor.from_pretrained(model_id); + images = { + white_image: await load_cached_image("white_image"), + }; + }, MAX_PROCESSOR_LOAD_TIME); + + it( + "Image-only (default text)", + async () => { + const { input_ids, pixel_values } = await processor(images.white_image); + expect(input_ids.dims).toEqual([1, 258]); + expect(pixel_values.dims).toEqual([1, 3, 224, 224]); + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + "Single image & text", + async () => { + const { input_ids, pixel_values } = await processor(images.white_image, "What is on the flower?"); + expect(input_ids.dims).toEqual([1, 264]); + expect(pixel_values.dims).toEqual([1, 3, 224, 224]); + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + "Multiple images & text", + async () => { + const { input_ids, pixel_values } = await processor([images.white_image, images.white_image], "Describe the images."); + expect(input_ids.dims).toEqual([1, 518]); + expect(pixel_values.dims).toEqual([2, 3, 224, 224]); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); +}; diff --git a/tests/models/qwen2_vl/test_modeling_qwen2_vl.js b/tests/models/qwen2_vl/test_modeling_qwen2_vl.js index 887a8e092..81af16e95 100644 --- a/tests/models/qwen2_vl/test_modeling_qwen2_vl.js +++ b/tests/models/qwen2_vl/test_modeling_qwen2_vl.js @@ -29,10 +29,7 @@ export default () => { /** @type {Qwen2VLProcessor} */ let processor; beforeAll(async () => { - model = await Qwen2VLForConditionalGeneration.from_pretrained(model_id, { - // TODO move to config - ...DEFAULT_MODEL_OPTIONS, - }); + model = await Qwen2VLForConditionalGeneration.from_pretrained(model_id, DEFAULT_MODEL_OPTIONS); processor = await Qwen2VLProcessor.from_pretrained(model_id); }, MAX_MODEL_LOAD_TIME); diff --git a/tests/models/qwen2_vl/test_processor_qwen2_vl.js b/tests/models/qwen2_vl/test_processor_qwen2_vl.js new file mode 100644 index 000000000..72ccf782f --- /dev/null +++ b/tests/models/qwen2_vl/test_processor_qwen2_vl.js @@ -0,0 +1,44 @@ +import { AutoProcessor, Qwen2VLProcessor } from "../../../src/transformers.js"; + +import { load_cached_image } from "../../asset_cache.js"; +import { MAX_PROCESSOR_LOAD_TIME, MAX_TEST_EXECUTION_TIME } from "../../init.js"; + +export default () => { + describe("Qwen2VLProcessor", () => { + const model_id = "hf-internal-testing/tiny-random-Qwen2VLForConditionalGeneration"; + + /** @type {Qwen2VLProcessor} */ + let processor; + let images = {}; + + beforeAll(async () => { + processor = await AutoProcessor.from_pretrained(model_id); + images = { + white_image: await load_cached_image("white_image"), + }; + }, MAX_PROCESSOR_LOAD_TIME); + + it( + "Image and text", + async () => { + const conversation = [ + { + role: "user", + content: [{ type: "image" }, { type: "text", text: "Describe this image." 
}], + }, + ]; + + const text = processor.apply_chat_template(conversation, { + add_generation_prompt: true, + }); + const { input_ids, attention_mask, pixel_values, image_grid_thw } = await processor(text, images.white_image); + + expect(input_ids.dims).toEqual([1, 89]); + expect(attention_mask.dims).toEqual([1, 89]); + expect(pixel_values.dims).toEqual([256, 1176]); + expect(image_grid_thw.dims).toEqual([1, 3]); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); +}; diff --git a/tests/models/sam/test_modeling_sam.js b/tests/models/sam/test_modeling_sam.js new file mode 100644 index 000000000..ec7beaf6b --- /dev/null +++ b/tests/models/sam/test_modeling_sam.js @@ -0,0 +1,48 @@ +import { SamProcessor, SamModel } from "../../../src/transformers.js"; +import { load_cached_image } from "../../asset_cache.js"; + +import { MAX_MODEL_LOAD_TIME, MAX_TEST_EXECUTION_TIME, MAX_MODEL_DISPOSE_TIME, DEFAULT_MODEL_OPTIONS } from "../../init.js"; + +export default () => { + describe("SamModel", () => { + const model_id = "Xenova/slimsam-77-uniform"; + + /** @type {SamModel} */ + let model; + /** @type {SamProcessor} */ + let processor; + beforeAll(async () => { + model = await SamModel.from_pretrained(model_id, DEFAULT_MODEL_OPTIONS); + processor = await SamProcessor.from_pretrained(model_id); + }, MAX_MODEL_LOAD_TIME); + + it( + "w/ input_points", + async () => { + // Prepare image and input points + const raw_image = await load_cached_image("corgi"); + const input_points = [[[340, 250]]]; + + // Process inputs and perform mask generation + const inputs = await processor(raw_image, { input_points }); + const { pred_masks, iou_scores } = await model(inputs); + + expect(pred_masks.dims).toEqual([1, 1, 3, 256, 256]); + expect(pred_masks.mean().item()).toBeCloseTo(-5.76981782913208, 5); + expect(iou_scores.dims).toEqual([1, 1, 3]); + expect(iou_scores.tolist()).toBeCloseToNested([[[0.8583833575248718, 0.9773167967796326, 0.8511142730712891]]]); + + // Post-process masks + const masks = await processor.post_process_masks(pred_masks, inputs.original_sizes, inputs.reshaped_input_sizes); + expect(masks).toHaveLength(1); + expect(masks[0].dims).toEqual([1, 3, 410, 614]); + expect(masks[0].type).toEqual("bool"); + }, + MAX_TEST_EXECUTION_TIME, + ); + + afterAll(async () => { + await model?.dispose(); + }, MAX_MODEL_DISPOSE_TIME); + }); +}; diff --git a/tests/models/seamless_m4t/test_feature_extraction_seamless_m4t.js b/tests/models/seamless_m4t/test_feature_extraction_seamless_m4t.js new file mode 100644 index 000000000..5392657a3 --- /dev/null +++ b/tests/models/seamless_m4t/test_feature_extraction_seamless_m4t.js @@ -0,0 +1,65 @@ +import { AutoFeatureExtractor, SeamlessM4TFeatureExtractor } from "../../../src/transformers.js"; + +import { load_cached_audio } from "../../asset_cache.js"; +import { MAX_FEATURE_EXTRACTOR_LOAD_TIME, MAX_TEST_EXECUTION_TIME } from "../../init.js"; + +const sum = (array) => Number(array.reduce((a, b) => a + b, array instanceof BigInt64Array ? 
0n : 0)); + +export default () => { + // SeamlessM4TFeatureExtractor + describe("SeamlessM4TFeatureExtractor", () => { + const model_id = "Xenova/wav2vec2-bert-CV16-en"; + + /** @type {SeamlessM4TFeatureExtractor} */ + let feature_extractor; + beforeAll(async () => { + feature_extractor = await AutoFeatureExtractor.from_pretrained(model_id); + }, MAX_FEATURE_EXTRACTOR_LOAD_TIME); + + it( + "default", + async () => { + const audio = await load_cached_audio("mlk"); + + const { input_features, attention_mask } = await feature_extractor(audio); + const { dims, data } = input_features; + expect(dims).toEqual([1, 649, 160]); + expect(attention_mask.dims).toEqual([1, 649]); + + expect(input_features.mean().item()).toBeCloseTo(-2.938903875815413e-8); + expect(data[0]).toBeCloseTo(1.1939343214035034); + expect(data[1]).toBeCloseTo(0.7874255180358887); + expect(data[160]).toBeCloseTo(-0.712975025177002); + expect(data[161]).toBeCloseTo(0.045802414417266846); + expect(data.at(-1)).toBeCloseTo(-1.3328346014022827); + + expect(sum(attention_mask.data)).toEqual(649); + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + "padding (pad_to_multiple_of=2)", + async () => { + const audio = await load_cached_audio("mlk"); + + const { input_features, attention_mask } = await feature_extractor(audio.slice(0, 10000)); + const { dims, data } = input_features; + + // [1, 61, 80] -> [1, 62, 80] -> [1, 31, 160] + expect(dims).toEqual([1, 31, 160]); + expect(attention_mask.dims).toEqual([1, 31]); + + expect(input_features.mean().item()).toBeCloseTo(0.01612919569015503); + expect(data[0]).toBeCloseTo(0.9657132029533386); + expect(data[1]).toBeCloseTo(0.12912897765636444); + expect(data[160]).toBeCloseTo(-1.2364212274551392); + expect(data[161]).toBeCloseTo(-0.9703778028488159); + expect(data.at(-1)).toBeCloseTo(1); // padding value + + expect(sum(attention_mask.data)).toEqual(30); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); +}; diff --git a/tests/models/vision_encoder_decoder/test_modeling_vision_encoder_decoder.js b/tests/models/vision_encoder_decoder/test_modeling_vision_encoder_decoder.js index a13c4bb5e..f70cb682b 100644 --- a/tests/models/vision_encoder_decoder/test_modeling_vision_encoder_decoder.js +++ b/tests/models/vision_encoder_decoder/test_modeling_vision_encoder_decoder.js @@ -1,4 +1,4 @@ -import { GPT2Tokenizer, VisionEncoderDecoderModel, RawImage, full } from "../../../src/transformers.js"; +import { VisionEncoderDecoderModel, full } from "../../../src/transformers.js"; import { MAX_MODEL_LOAD_TIME, MAX_TEST_EXECUTION_TIME, MAX_MODEL_DISPOSE_TIME, DEFAULT_MODEL_OPTIONS } from "../../init.js"; @@ -8,11 +8,8 @@ export default () => { /** @type {VisionEncoderDecoderModel} */ let model; - /** @type {GPT2Tokenizer} */ - let tokenizer; beforeAll(async () => { model = await VisionEncoderDecoderModel.from_pretrained(model_id, DEFAULT_MODEL_OPTIONS); - tokenizer = await GPT2Tokenizer.from_pretrained(model_id); }, MAX_MODEL_LOAD_TIME); it( diff --git a/tests/models/vitpose/test_image_processing_vitpose.js b/tests/models/vitpose/test_image_processing_vitpose.js new file mode 100644 index 000000000..b49afd95d --- /dev/null +++ b/tests/models/vitpose/test_image_processing_vitpose.js @@ -0,0 +1,50 @@ +import { AutoImageProcessor, rand, Tensor, VitPoseImageProcessor } from "../../../src/transformers.js"; + +import { load_cached_image } from "../../asset_cache.js"; +import { MAX_PROCESSOR_LOAD_TIME, MAX_TEST_EXECUTION_TIME } from "../../init.js"; + +export default () => { + describe("VitPoseImageProcessor", () => { + const 
model_id = "onnx-community/vitpose-base-simple"; + + /** @type {VitPoseImageProcessor} */ + let processor; + beforeAll(async () => { + processor = await AutoImageProcessor.from_pretrained(model_id); + }, MAX_PROCESSOR_LOAD_TIME); + + it( + "default", + async () => { + const image = await load_cached_image("tiger"); + const { pixel_values, original_sizes, reshaped_input_sizes } = await processor(image); + + expect(pixel_values.dims).toEqual([1, 3, 256, 192]); + expect(pixel_values.mean().item()).toBeCloseTo(-0.2771204710006714, 6); + + expect(original_sizes).toEqual([[408, 612]]); + expect(reshaped_input_sizes).toEqual([[256, 192]]); + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + "post_process_pose_estimation", + async () => { + const num_classes = 17; + const size = [0, 0, 1000, 1500]; + const heatmaps = rand([1, num_classes, 64, 48]); + + const boxes = [[size]]; + const { bbox, scores, labels, keypoints } = processor.post_process_pose_estimation(heatmaps, boxes, { threshold: null })[0][0]; + + expect(bbox).toEqual(size); + expect(scores).toHaveLength(num_classes); + expect(labels).toHaveLength(num_classes); + expect(keypoints).toHaveLength(num_classes); + expect(keypoints[0]).toHaveLength(2); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); +}; diff --git a/tests/models/wespeaker_resnet/test_feature_extraction_wespeaker_resnet.js b/tests/models/wespeaker_resnet/test_feature_extraction_wespeaker_resnet.js new file mode 100644 index 000000000..27d0dbd6f --- /dev/null +++ b/tests/models/wespeaker_resnet/test_feature_extraction_wespeaker_resnet.js @@ -0,0 +1,56 @@ +import { AutoFeatureExtractor, WeSpeakerFeatureExtractor } from "../../../src/transformers.js"; + +import { MAX_FEATURE_EXTRACTOR_LOAD_TIME, MAX_TEST_EXECUTION_TIME } from "../../init.js"; + +export default () => { + // WeSpeakerFeatureExtractor + describe("WeSpeakerFeatureExtractor", () => { + const model_id = "onnx-community/wespeaker-voxceleb-resnet34-LM"; + + /** @type {WeSpeakerFeatureExtractor} */ + let feature_extractor; + beforeAll(async () => { + feature_extractor = await AutoFeatureExtractor.from_pretrained(model_id); + }, MAX_FEATURE_EXTRACTOR_LOAD_TIME); + + it( + "default", + async () => { + const audio = new Float32Array(16000).map((_, i) => Math.sin(i / 100)); + const { input_features } = await feature_extractor(audio); + const { dims, data } = input_features; + expect(dims).toEqual([1, 98, 80]); + + expect(input_features.mean().item()).toBeCloseTo(5.461731689138105e-8); + expect(data[0]).toBeCloseTo(-0.19300270080566406); + expect(data[1]).toBeCloseTo(-0.05825042724609375); + expect(data[78]).toBeCloseTo(0.2683420181274414); + expect(data[79]).toBeCloseTo(0.26250171661376953); + expect(data[80]).toBeCloseTo(0.19062232971191406); + expect(data.at(-2)).toBeCloseTo(-0.43694400787353516); + expect(data.at(-1)).toBeCloseTo(-0.4266204833984375); + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + "pad to `min_num_frames`", + async () => { + const audio = new Float32Array(3).map((_, i) => Math.sin(i / 100)); + const { input_features } = await feature_extractor(audio); + const { dims, data } = input_features; + expect(dims).toEqual([1, 9, 80]); + + expect(input_features.mean().item()).toBeCloseTo(-0.0000010093053181966146); + expect(data[0]).toBeCloseTo(20.761859893798828); + expect(data[1]).toBeCloseTo(21.02924346923828); + expect(data[78]).toBeCloseTo(19.083993911743164); + expect(data[79]).toBeCloseTo(18.003454208374023); + expect(data[80]).toBeCloseTo(-2.595233917236328); + expect(data.at(-2)).toBeCloseTo(-2.385499954223633); + 
expect(data.at(-1)).toBeCloseTo(-2.2504329681396484); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); +}; diff --git a/tests/models/whisper/test_feature_extraction_whisper.js b/tests/models/whisper/test_feature_extraction_whisper.js new file mode 100644 index 000000000..20e132ff6 --- /dev/null +++ b/tests/models/whisper/test_feature_extraction_whisper.js @@ -0,0 +1,33 @@ +import { AutoFeatureExtractor, WhisperFeatureExtractor } from "../../../src/transformers.js"; + +import { load_cached_audio } from "../../asset_cache.js"; +import { MAX_FEATURE_EXTRACTOR_LOAD_TIME, MAX_TEST_EXECUTION_TIME } from "../../init.js"; + +export default () => { + // WhisperFeatureExtractor + describe("WhisperFeatureExtractor", () => { + const model_id = "Xenova/whisper-tiny.en"; + + /** @type {WhisperFeatureExtractor} */ + let feature_extractor; + beforeAll(async () => { + feature_extractor = await AutoFeatureExtractor.from_pretrained(model_id); + }, MAX_FEATURE_EXTRACTOR_LOAD_TIME); + + it( + "default", + async () => { + const audio = await load_cached_audio("mlk"); + const { input_features } = await feature_extractor(audio); + const { dims, data } = input_features; + expect(dims).toEqual([1, 80, 3000]); + expect(input_features.mean().item()).toBeCloseTo(-0.2813588131551941); + expect(data[0]).toBeCloseTo(0.33168578147888184); + expect(data[1]).toBeCloseTo(0.30986475944519043); + expect(data[81]).toBeCloseTo(0.10727232694625854); + expect(data[3001]).toBeCloseTo(0.2555035352706909); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); +}; diff --git a/tests/pipelines.test.js b/tests/pipelines.test.js index 6bef83297..bfdef3872 100644 --- a/tests/pipelines.test.js +++ b/tests/pipelines.test.js @@ -1,16 +1,17 @@ import { pipeline, cos_sim } from "../src/transformers.js"; import { init, MAX_TEST_EXECUTION_TIME } from "./init.js"; -import { compare, loadAudio } from "./test_utils.js"; +import { collect_and_execute_pipeline_tests, compare, loadAudio } from "./test_utils.js"; // Initialise the testing environment init(); +await collect_and_execute_pipeline_tests("Pipelines"); // NOTE: // Due to a memory leak in Jest, we cannot have multiple tests for a single model. // This is due to how model construction and destruction occurs, in `beforeAll` and `afterAll`, respectively. // As a result, each test is responsible for exactly one model, but we run multiple inputs through it. // By encapsulating model construction and destruction in a single `it` block, we avoid these memory issues. 
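// A minimal sketch of the pattern described in the note above (not part of this patch): one model
// per `it` block, constructed, exercised, and disposed within the same test so Jest never holds
// more than one model in memory at a time. It reuses the `pipeline` import and
// MAX_TEST_EXECUTION_TIME constant from this file; the sst-2 checkpoint is one of the models
// listed below, and the expected "POSITIVE" label is an assumption for illustration only.
it(
  "text classification: single model per test",
  async () => {
    const classifier = await pipeline("text-classification", "Xenova/distilbert-base-uncased-finetuned-sst-2-english");
    const output = await classifier("I love transformers!");
    expect(output[0].label).toBe("POSITIVE"); // assumed label for this checkpoint
    await classifier.dispose(); // release the model before the next test loads its own
  },
  MAX_TEST_EXECUTION_TIME,
);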
-xdescribe("Pipelines", () => { +xdescribe("Pipelines (ignored)", () => { describe("Text classification", () => { // List all models which will be tested const models = ["Xenova/distilbert-base-uncased-finetuned-sst-2-english", "Xenova/toxic-bert"]; diff --git a/tests/pipelines/test_pipelines_audio_classification.js b/tests/pipelines/test_pipelines_audio_classification.js new file mode 100644 index 000000000..e9e4ac703 --- /dev/null +++ b/tests/pipelines/test_pipelines_audio_classification.js @@ -0,0 +1,80 @@ +import { pipeline, AudioClassificationPipeline } from "../../src/transformers.js"; + +import { MAX_MODEL_LOAD_TIME, MAX_TEST_EXECUTION_TIME, MAX_MODEL_DISPOSE_TIME, DEFAULT_MODEL_OPTIONS } from "../init.js"; + +const PIPELINE_ID = "audio-classification"; + +export default () => { + describe("Audio Classification", () => { + const model_id = "hf-internal-testing/tiny-random-unispeech"; + const audios = [new Float32Array(16000).fill(0), Float32Array.from({ length: 16000 }, (_, i) => i)]; + + /** @type {AudioClassificationPipeline} */ + let pipe; + beforeAll(async () => { + pipe = await pipeline(PIPELINE_ID, model_id, DEFAULT_MODEL_OPTIONS); + }, MAX_MODEL_LOAD_TIME); + + it("should be an instance of AudioClassificationPipeline", () => { + expect(pipe).toBeInstanceOf(AudioClassificationPipeline); + }); + + describe("batch_size=1", () => { + it( + "default (top_k=5)", + async () => { + const output = await pipe(audios[0]); + const target = [ + { score: 0.5043687224388123, label: "LABEL_0" }, + { score: 0.4956313371658325, label: "LABEL_1" }, + ]; + expect(output).toBeCloseToNested(target, 5); + }, + MAX_TEST_EXECUTION_TIME, + ); + it( + "custom (top_k=1)", + async () => { + const output = await pipe(audios[0], { top_k: 1 }); + const target = [{ score: 0.5043687224388123, label: "LABEL_0" }]; + expect(output).toBeCloseToNested(target, 5); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); + + describe("batch_size>1", () => { + it( + "default (top_k=5)", + async () => { + const output = await pipe(audios); + const target = [ + [ + { score: 0.5043687224388123, label: "LABEL_0" }, + { score: 0.4956313371658325, label: "LABEL_1" }, + ], + [ + { score: 0.5187293887138367, label: "LABEL_0" }, + { score: 0.4812707006931305, label: "LABEL_1" }, + ], + ]; + expect(output).toBeCloseToNested(target, 5); + }, + MAX_TEST_EXECUTION_TIME, + ); + it( + "custom (top_k=1)", + async () => { + const output = await pipe(audios, { top_k: 1 }); + const target = [[{ score: 0.5043687224388123, label: "LABEL_0" }], [{ score: 0.5187293887138367, label: "LABEL_0" }]]; + expect(output).toBeCloseToNested(target, 5); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); + + afterAll(async () => { + await pipe.dispose(); + }, MAX_MODEL_DISPOSE_TIME); + }); +}; diff --git a/tests/pipelines/test_pipelines_automatic_speech_recognition.js b/tests/pipelines/test_pipelines_automatic_speech_recognition.js new file mode 100644 index 000000000..da9dd88b4 --- /dev/null +++ b/tests/pipelines/test_pipelines_automatic_speech_recognition.js @@ -0,0 +1,129 @@ +import { pipeline, AutomaticSpeechRecognitionPipeline } from "../../src/transformers.js"; + +import { MAX_MODEL_LOAD_TIME, MAX_TEST_EXECUTION_TIME, MAX_MODEL_DISPOSE_TIME, DEFAULT_MODEL_OPTIONS } from "../init.js"; + +const PIPELINE_ID = "automatic-speech-recognition"; + +export default () => { + describe("Automatic Speech Recognition", () => { + describe("whisper", () => { + const model_id = "Xenova/tiny-random-WhisperForConditionalGeneration"; + const SAMPLING_RATE = 16000; + const audios = [new 
Float32Array(SAMPLING_RATE).fill(0), Float32Array.from({ length: SAMPLING_RATE }, (_, i) => i / 16000)]; + const long_audios = [new Float32Array(SAMPLING_RATE * 60).fill(0), Float32Array.from({ length: SAMPLING_RATE * 60 }, (_, i) => (i % 1000) / 1000)]; + + const max_new_tokens = 5; + /** @type {AutomaticSpeechRecognitionPipeline} */ + let pipe; + beforeAll(async () => { + pipe = await pipeline(PIPELINE_ID, model_id, DEFAULT_MODEL_OPTIONS); + }, MAX_MODEL_LOAD_TIME); + + it("should be an instance of AutomaticSpeechRecognitionPipeline", () => { + expect(pipe).toBeInstanceOf(AutomaticSpeechRecognitionPipeline); + }); + + describe("batch_size=1", () => { + it( + "default", + async () => { + const output = await pipe(audios[0], { max_new_tokens }); + const target = { text: "นะคะนะคะURURUR" }; + expect(output).toEqual(target); + }, + MAX_TEST_EXECUTION_TIME, + ); + it( + "transcribe w/ return_timestamps=true", + async () => { + const output = await pipe(audios[0], { return_timestamps: true, max_new_tokens }); + const target = { + text: " riceUR", + chunks: [ + { timestamp: [0.72, 17.72], text: " rice" }, + { timestamp: [17.72, null], text: "UR" }, + ], + }; + expect(output).toBeCloseToNested(target, 5); + }, + MAX_TEST_EXECUTION_TIME, + ); + // TODO add: transcribe w/ return_timestamps="word" + // it( + // "transcribe w/ word-level timestamps", + // async () => { + // const output = await pipe(audios[0], { return_timestamps: "word", max_new_tokens }); + // const target = []; + // expect(output).toBeCloseToNested(target, 5); + // }, + // MAX_TEST_EXECUTION_TIME, + // ); + it( + "transcribe w/ language", + async () => { + const output = await pipe(audios[0], { language: "french", task: "transcribe", max_new_tokens }); + const target = { text: "นะคะนะคะURURUR" }; + expect(output).toEqual(target); + }, + MAX_TEST_EXECUTION_TIME, + ); + it( + "translate", + async () => { + const output = await pipe(audios[0], { language: "french", task: "translate", max_new_tokens }); + const target = { text: "นะคะนะคะURURUR" }; + expect(output).toEqual(target); + }, + MAX_TEST_EXECUTION_TIME, + ); + it( + "audio > 30 seconds", + async () => { + const output = await pipe(long_audios[0], { chunk_length_s: 30, stride_length_s: 5, max_new_tokens }); + const target = { text: "นะคะนะคะURURUR" }; + expect(output).toEqual(target); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); + + afterAll(async () => { + await pipe.dispose(); + }, MAX_MODEL_DISPOSE_TIME); + }); + + describe("wav2vec2", () => { + const model_id = "Xenova/tiny-random-Wav2Vec2ForCTC-ONNX"; + const SAMPLING_RATE = 16000; + const audios = [new Float32Array(SAMPLING_RATE).fill(0), Float32Array.from({ length: SAMPLING_RATE }, (_, i) => i / 16000)]; + const long_audios = [new Float32Array(SAMPLING_RATE * 60).fill(0), Float32Array.from({ length: SAMPLING_RATE * 60 }, (_, i) => (i % 1000) / 1000)]; + + const max_new_tokens = 5; + /** @type {AutomaticSpeechRecognitionPipeline} */ + let pipe; + beforeAll(async () => { + pipe = await pipeline(PIPELINE_ID, model_id, DEFAULT_MODEL_OPTIONS); + }, MAX_MODEL_LOAD_TIME); + + it("should be an instance of AutomaticSpeechRecognitionPipeline", () => { + expect(pipe).toBeInstanceOf(AutomaticSpeechRecognitionPipeline); + }); + + describe("batch_size=1", () => { + it( + "default", + async () => { + const output = await pipe(audios[0], { max_new_tokens }); + const target = { text: "K" }; + expect(output).toEqual(target); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); + + afterAll(async () => { + await pipe.dispose(); + }, 
MAX_MODEL_DISPOSE_TIME); + }); + }); +}; diff --git a/tests/pipelines/test_pipelines_depth_estimation.js b/tests/pipelines/test_pipelines_depth_estimation.js new file mode 100644 index 000000000..f0d5fe887 --- /dev/null +++ b/tests/pipelines/test_pipelines_depth_estimation.js @@ -0,0 +1,57 @@ +import { pipeline, DepthEstimationPipeline, RawImage } from "../../src/transformers.js"; + +import { MAX_MODEL_LOAD_TIME, MAX_TEST_EXECUTION_TIME, MAX_MODEL_DISPOSE_TIME, DEFAULT_MODEL_OPTIONS } from "../init.js"; +import { load_cached_image } from "../asset_cache.js"; + +const PIPELINE_ID = "depth-estimation"; + +export default () => { + describe("Depth Estimation", () => { + const model_id = "hf-internal-testing/tiny-random-DPTForDepthEstimation"; + /** @type {DepthEstimationPipeline} */ + let pipe; + let images; + beforeAll(async () => { + pipe = await pipeline(PIPELINE_ID, model_id, DEFAULT_MODEL_OPTIONS); + images = await Promise.all([load_cached_image("white_image"), load_cached_image("blue_image")]); + }, MAX_MODEL_LOAD_TIME); + + it("should be an instance of DepthEstimationPipeline", () => { + expect(pipe).toBeInstanceOf(DepthEstimationPipeline); + }); + + describe("batch_size=1", () => { + it( + "default", + async () => { + const output = await pipe(images[0]); + expect(output.predicted_depth.dims).toEqual([32, 32]); + expect(output.predicted_depth.mean().item()).toBeCloseTo(0.000006106501587055391, 6); + expect(output.depth.size).toEqual(images[0].size); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); + + describe("batch_size>1", () => { + it( + "default", + async () => { + const output = await pipe(images); + expect(output).toHaveLength(images.length); + expect(output[0].predicted_depth.dims).toEqual([32, 32]); + expect(output[0].predicted_depth.mean().item()).toBeCloseTo(0.000006106501587055391, 6); + expect(output[0].depth.size).toEqual(images[0].size); + expect(output[1].predicted_depth.dims).toEqual([32, 32]); + expect(output[1].predicted_depth.mean().item()).toBeCloseTo(0.0000014548650142387487, 6); + expect(output[1].depth.size).toEqual(images[1].size); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); + + afterAll(async () => { + await pipe.dispose(); + }, MAX_MODEL_DISPOSE_TIME); + }); +}; diff --git a/tests/pipelines/test_pipelines_document_question_answering.js b/tests/pipelines/test_pipelines_document_question_answering.js new file mode 100644 index 000000000..3ebb1e436 --- /dev/null +++ b/tests/pipelines/test_pipelines_document_question_answering.js @@ -0,0 +1,41 @@ +import { pipeline, DocumentQuestionAnsweringPipeline, RawImage } from "../../src/transformers.js"; + +import { MAX_MODEL_LOAD_TIME, MAX_TEST_EXECUTION_TIME, MAX_MODEL_DISPOSE_TIME, DEFAULT_MODEL_OPTIONS } from "../init.js"; + +const PIPELINE_ID = "document-question-answering"; + +export default () => { + describe("Document Question Answering", () => { + const model_id = "hf-internal-testing/tiny-random-VisionEncoderDecoderModel-donutswin-mbart"; + + /** @type {DocumentQuestionAnsweringPipeline} */ + let pipe; + beforeAll(async () => { + pipe = await pipeline(PIPELINE_ID, model_id, DEFAULT_MODEL_OPTIONS); + }, MAX_MODEL_LOAD_TIME); + + it("should be an instance of DocumentQuestionAnsweringPipeline", () => { + expect(pipe).toBeInstanceOf(DocumentQuestionAnsweringPipeline); + }); + + describe("batch_size=1", () => { + it( + "default", + async () => { + const dims = [64, 32, 3]; + const image = new RawImage(new Uint8ClampedArray(dims[0] * dims[1] * dims[2]).fill(255), ...dims); + const question = "What is the invoice number?"; + 
const output = await pipe(image, question); + + const target = [{ answer: null }]; + expect(output).toEqual(target); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); + + afterAll(async () => { + await pipe.dispose(); + }, MAX_MODEL_DISPOSE_TIME); + }); +}; diff --git a/tests/pipelines/test_pipelines_feature_extraction.js b/tests/pipelines/test_pipelines_feature_extraction.js new file mode 100644 index 000000000..b7bb79e59 --- /dev/null +++ b/tests/pipelines/test_pipelines_feature_extraction.js @@ -0,0 +1,121 @@ +import { pipeline, FeatureExtractionPipeline } from "../../src/transformers.js"; + +import { MAX_MODEL_LOAD_TIME, MAX_TEST_EXECUTION_TIME, MAX_MODEL_DISPOSE_TIME, DEFAULT_MODEL_OPTIONS } from "../init.js"; + +const PIPELINE_ID = "feature-extraction"; + +export default () => { + describe("Feature Extraction", () => { + const model_id = "hf-internal-testing/tiny-random-BertModel"; + + const texts = ["This is a simple test.", "Hello world"]; + + /** @type {FeatureExtractionPipeline} */ + let pipe; + beforeAll(async () => { + pipe = await pipeline(PIPELINE_ID, model_id, DEFAULT_MODEL_OPTIONS); + }, MAX_MODEL_LOAD_TIME); + + it("should be an instance of FeatureExtractionPipeline ", () => { + expect(pipe).toBeInstanceOf(FeatureExtractionPipeline); + }); + + describe("batch_size=1", () => { + it( + "default", + async () => { + const output = await pipe(texts[0]); + expect(output.dims).toEqual([1, 20, 32]); + expect(output.type).toEqual("float32"); + expect(output.mean().item()).toBeCloseTo(-1.538501215314625e-9, 6); + }, + MAX_TEST_EXECUTION_TIME, + ); + it( + "w/ cls pooling", + async () => { + const output = await pipe(texts[0], { pooling: "cls" }); + expect(output.dims).toEqual([1, 32]); + expect(output.type).toEqual("float32"); + expect(output.mean().item()).toBeCloseTo(2.491287887096405e-8, 6); + }, + MAX_TEST_EXECUTION_TIME, + ); + it( + "w/ mean pooling & normalization", + async () => { + const output = await pipe(texts[0], { pooling: "mean", normalize: true }); + expect(output.dims).toEqual([1, 32]); + expect(output.type).toEqual("float32"); + expect(output.mean().item()).toBeCloseTo(-2.0245352061465383e-9, 6); + }, + MAX_TEST_EXECUTION_TIME, + ); + it( + "w/ mean pooling & binary quantization", + async () => { + const output = await pipe(texts[0], { pooling: "mean", quantize: true, precision: "binary" }); + expect(output.dims).toEqual([1, 32 / 8]); + expect(output.type).toEqual("int8"); + expect(output.mean().item()).toEqual(-15); + }, + MAX_TEST_EXECUTION_TIME, + ); + it("w/ cls pooling & ubinary quantization", async () => { + const output = await pipe(texts[0], { pooling: "cls", quantize: true, precision: "ubinary" }); + expect(output.dims).toEqual([1, 32 / 8]); + expect(output.type).toEqual("uint8"); + expect(output.mean().item()).toEqual(140); + }); + }); + + describe("batch_size>1", () => { + it( + "default", + async () => { + const output = await pipe(texts); + expect(output.dims).toEqual([texts.length, 20, 32]); + expect(output.type).toEqual("float32"); + expect(output.mean().item()).toBeCloseTo(2.345950544935249e-9, 6); + }, + MAX_TEST_EXECUTION_TIME, + ); + it( + "w/ cls pooling", + async () => { + const output = await pipe(texts, { pooling: "cls" }); + expect(output.dims).toEqual([texts.length, 32]); + expect(output.type).toEqual("float32"); + expect(output.mean().item()).toBeCloseTo(1.6298145055770874e-8, 6); + }, + MAX_TEST_EXECUTION_TIME, + ); + it( + "w/ mean pooling & normalization", + async () => { + const output = await pipe(texts, { pooling: "mean", normalize: true 
}); + expect(output.dims).toEqual([texts.length, 32]); + expect(output.type).toEqual("float32"); + expect(output.mean().item()).toBeCloseTo(-1.538609240014921e-10, 6); + }, + MAX_TEST_EXECUTION_TIME, + ); + it("w/ mean pooling & binary quantization", async () => { + const output = await pipe(texts, { pooling: "mean", quantize: true, precision: "binary" }); + expect(output.dims).toEqual([texts.length, 32 / 8]); + expect(output.type).toEqual("int8"); + expect(output.mean().item()).toEqual(-14); + }); + it("w/ cls pooling & ubinary quantization", async () => { + const output = await pipe(texts, { pooling: "cls", quantize: true, precision: "ubinary" }); + expect(output.dims).toEqual([texts.length, 32 / 8]); + expect(output.type).toEqual("uint8"); + expect(output.mean().item()).toEqual(140); + }); + }); + + afterAll(async () => { + await pipe.dispose(); + }, MAX_MODEL_DISPOSE_TIME); + }); +}; diff --git a/tests/pipelines/test_pipelines_fill_mask.js b/tests/pipelines/test_pipelines_fill_mask.js new file mode 100644 index 000000000..9f0cd3515 --- /dev/null +++ b/tests/pipelines/test_pipelines_fill_mask.js @@ -0,0 +1,100 @@ +import { pipeline, FillMaskPipeline } from "../../src/transformers.js"; + +import { MAX_MODEL_LOAD_TIME, MAX_TEST_EXECUTION_TIME, MAX_MODEL_DISPOSE_TIME, DEFAULT_MODEL_OPTIONS } from "../init.js"; + +const PIPELINE_ID = "fill-mask"; + +export default () => { + describe("Fill Mask", () => { + const model_id = "hf-internal-testing/tiny-random-BertForMaskedLM"; + + /** @type {FillMaskPipeline} */ + let pipe; + beforeAll(async () => { + pipe = await pipeline(PIPELINE_ID, model_id, DEFAULT_MODEL_OPTIONS); + }, MAX_MODEL_LOAD_TIME); + + it("should be an instance of FillMaskPipeline", () => { + expect(pipe).toBeInstanceOf(FillMaskPipeline); + }); + + describe("batch_size=1", () => { + it( + "default (top_k=5)", + async () => { + const output = await pipe("a [MASK] c"); + const target = [ + { score: 0.0013377574505284429, token: 854, token_str: "##ο", sequence: "aο c" }, + { score: 0.001248967950232327, token: 962, token_str: "##ち", sequence: "aち c" }, + { score: 0.0012304208939895034, token: 933, token_str: "##ع", sequence: "aع c" }, + { score: 0.0012301815440878272, token: 313, token_str: "ფ", sequence: "a ფ c" }, + { score: 0.001222139224410057, token: 624, token_str: "未", sequence: "a 未 c" }, + ]; + expect(output).toBeCloseToNested(target, 5); + }, + MAX_TEST_EXECUTION_TIME, + ); + it( + "custom (top_k=2)", + async () => { + const output = await pipe("a [MASK] c", { top_k: 2 }); + const target = [ + { score: 0.0013377574505284429, token: 854, token_str: "##ο", sequence: "aο c" }, + { score: 0.001248967950232327, token: 962, token_str: "##ち", sequence: "aち c" }, + ]; + expect(output).toBeCloseToNested(target, 5); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); + + describe("batch_size>1", () => { + it( + "default (top_k=5)", + async () => { + const output = await pipe(["a [MASK] c", "a b [MASK] c"]); + const target = [ + [ + { score: 0.0013377574505284429, token: 854, token_str: "##ο", sequence: "aο c" }, + { score: 0.001248967950232327, token: 962, token_str: "##ち", sequence: "aち c" }, + { score: 0.0012304208939895034, token: 933, token_str: "##ع", sequence: "aع c" }, + { score: 0.0012301815440878272, token: 313, token_str: "ფ", sequence: "a ფ c" }, + { score: 0.001222139224410057, token: 624, token_str: "未", sequence: "a 未 c" }, + ], + [ + { score: 0.0013287801994010806, token: 962, token_str: "##ち", sequence: "a bち c" }, + { score: 0.0012486606137827039, token: 823, token_str: "##ن", 
sequence: "a bن c" }, + { score: 0.0012320734094828367, token: 1032, token_str: "##ც", sequence: "a bც c" }, + { score: 0.0012295148335397243, token: 854, token_str: "##ο", sequence: "a bο c" }, + { score: 0.0012277684872969985, token: 624, token_str: "未", sequence: "a b 未 c" }, + ], + ]; + expect(output).toBeCloseToNested(target, 5); + }, + MAX_TEST_EXECUTION_TIME, + ); + it( + "custom (top_k=2)", + async () => { + const output = await pipe(["a [MASK] c", "a b [MASK] c"], { top_k: 2 }); + const target = [ + [ + { score: 0.0013377574505284429, token: 854, token_str: "##ο", sequence: "aο c" }, + { score: 0.001248967950232327, token: 962, token_str: "##ち", sequence: "aち c" }, + ], + [ + { score: 0.0013287801994010806, token: 962, token_str: "##ち", sequence: "a bち c" }, + { score: 0.0012486606137827039, token: 823, token_str: "##ن", sequence: "a bن c" }, + ], + ]; + expect(output).toBeCloseToNested(target, 5); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); + + afterAll(async () => { + await pipe.dispose(); + }, MAX_MODEL_DISPOSE_TIME); + }); +}; diff --git a/tests/pipelines/test_pipelines_image_classification.js b/tests/pipelines/test_pipelines_image_classification.js new file mode 100644 index 000000000..93b693a94 --- /dev/null +++ b/tests/pipelines/test_pipelines_image_classification.js @@ -0,0 +1,81 @@ +import { pipeline, ImageClassificationPipeline } from "../../src/transformers.js"; + +import { MAX_MODEL_LOAD_TIME, MAX_TEST_EXECUTION_TIME, MAX_MODEL_DISPOSE_TIME, DEFAULT_MODEL_OPTIONS } from "../init.js"; +import { load_cached_image } from "../asset_cache.js"; + +const PIPELINE_ID = "image-classification"; + +export default () => { + describe("Image Classification", () => { + const model_id = "hf-internal-testing/tiny-random-vit"; + /** @type {ImageClassificationPipeline} */ + let pipe; + let images; + beforeAll(async () => { + pipe = await pipeline(PIPELINE_ID, model_id, DEFAULT_MODEL_OPTIONS); + images = await Promise.all([load_cached_image("white_image"), load_cached_image("blue_image")]); + }, MAX_MODEL_LOAD_TIME); + + it("should be an instance of ImageClassificationPipeline", () => { + expect(pipe).toBeInstanceOf(ImageClassificationPipeline); + }); + + describe("batch_size=1", () => { + it( + "default (top_k=5)", + async () => { + const output = await pipe(images[0]); + const target = [ + { label: "LABEL_1", score: 0.5020533800125122 }, + { label: "LABEL_0", score: 0.4979466497898102 }, + ]; + expect(output).toBeCloseToNested(target, 5); + }, + MAX_TEST_EXECUTION_TIME, + ); + it( + "custom (top_k=1)", + async () => { + const output = await pipe(images[0], { top_k: 1 }); + const target = [{ label: "LABEL_1", score: 0.5020533800125122 }]; + expect(output).toBeCloseToNested(target, 5); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); + + describe("batch_size>1", () => { + it( + "default (top_k=5)", + async () => { + const output = await pipe(images); + const target = [ + [ + { label: "LABEL_1", score: 0.5020533800125122 }, + { label: "LABEL_0", score: 0.4979466497898102 }, + ], + [ + { label: "LABEL_1", score: 0.519227921962738 }, + { label: "LABEL_0", score: 0.4807720482349396 }, + ], + ]; + expect(output).toBeCloseToNested(target, 5); + }, + MAX_TEST_EXECUTION_TIME, + ); + it( + "custom (top_k=1)", + async () => { + const output = await pipe(images, { top_k: 1 }); + const target = [[{ label: "LABEL_1", score: 0.5020533800125122 }], [{ label: "LABEL_1", score: 0.519227921962738 }]]; + expect(output).toBeCloseToNested(target, 5); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); + + afterAll(async () 
=> { + await pipe.dispose(); + }, MAX_MODEL_DISPOSE_TIME); + }); +}; diff --git a/tests/pipelines/test_pipelines_image_feature_extraction.js b/tests/pipelines/test_pipelines_image_feature_extraction.js new file mode 100644 index 000000000..c6e7980ee --- /dev/null +++ b/tests/pipelines/test_pipelines_image_feature_extraction.js @@ -0,0 +1,51 @@ +import { pipeline, ImageFeatureExtractionPipeline } from "../../src/transformers.js"; + +import { MAX_MODEL_LOAD_TIME, MAX_TEST_EXECUTION_TIME, MAX_MODEL_DISPOSE_TIME, DEFAULT_MODEL_OPTIONS } from "../init.js"; +import { load_cached_image } from "../asset_cache.js"; + +const PIPELINE_ID = "image-feature-extraction"; + +export default () => { + describe("Image Feature Extraction", () => { + const model_id = "hf-internal-testing/tiny-random-ViTMAEModel"; + /** @type {ImageFeatureExtractionPipeline} */ + let pipe; + let images; + beforeAll(async () => { + pipe = await pipeline(PIPELINE_ID, model_id, DEFAULT_MODEL_OPTIONS); + images = await Promise.all([load_cached_image("white_image"), load_cached_image("blue_image")]); + }, MAX_MODEL_LOAD_TIME); + + it("should be an instance of ImageFeatureExtractionPipeline", () => { + expect(pipe).toBeInstanceOf(ImageFeatureExtractionPipeline); + }); + + describe("batch_size=1", () => { + it( + "default", + async () => { + const output = await pipe(images[0]); + expect(output.dims).toEqual([1, 91, 32]); + expect(output.mean().item()).toBeCloseTo(-8.507473614471905e-10, 6); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); + + describe("batch_size>1", () => { + it( + "default", + async () => { + const output = await pipe(images); + expect(output.dims).toEqual([images.length, 91, 32]); + expect(output.mean().item()).toBeCloseTo(-5.997602414709036e-10, 6); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); + + afterAll(async () => { + await pipe.dispose(); + }, MAX_MODEL_DISPOSE_TIME); + }); +}; diff --git a/tests/pipelines/test_pipelines_image_segmentation.js b/tests/pipelines/test_pipelines_image_segmentation.js new file mode 100644 index 000000000..7358601ea --- /dev/null +++ b/tests/pipelines/test_pipelines_image_segmentation.js @@ -0,0 +1,119 @@ +import { pipeline, ImageSegmentationPipeline } from "../../src/transformers.js"; + +import { MAX_MODEL_LOAD_TIME, MAX_TEST_EXECUTION_TIME, MAX_MODEL_DISPOSE_TIME, DEFAULT_MODEL_OPTIONS } from "../init.js"; +import { load_cached_image } from "../asset_cache.js"; + +const PIPELINE_ID = "image-segmentation"; + +export default () => { + describe("Image Segmentation", () => { + describe("Panoptic Segmentation", () => { + const model_id = "Xenova/detr-resnet-50-panoptic"; + /** @type {ImageSegmentationPipeline} */ + let pipe; + beforeAll(async () => { + pipe = await pipeline(PIPELINE_ID, model_id, DEFAULT_MODEL_OPTIONS); + }, MAX_MODEL_LOAD_TIME); + + it("should be an instance of ImageSegmentationPipeline", () => { + expect(pipe).toBeInstanceOf(ImageSegmentationPipeline); + }); + + it( + "single", + async () => { + const image = await load_cached_image("cats"); + + const output = await pipe(image); + + // First, check mask shapes + for (const item of output) { + expect(item.mask.width).toEqual(image.width); + expect(item.mask.height).toEqual(image.height); + expect(item.mask.channels).toEqual(1); + delete item.mask; // No longer needed + } + + // Next, compare scores and labels + const target = [ + { + score: 0.9918501377105713, + label: "cat", + }, + { + score: 0.9985815286636353, + label: "remote", + }, + { + score: 0.999537467956543, + label: "remote", + }, + { + score: 0.9919270277023315, 
+ label: "couch", + }, + { + score: 0.9993696808815002, + label: "cat", + }, + ]; + + expect(output).toBeCloseToNested(target, 2); + }, + MAX_TEST_EXECUTION_TIME, + ); + + afterAll(async () => { + await pipe.dispose(); + }, MAX_MODEL_DISPOSE_TIME); + }); + + describe("Semantic Segmentation", () => { + const model_id = "Xenova/segformer_b0_clothes"; + /** @type {ImageSegmentationPipeline } */ + let pipe; + beforeAll(async () => { + pipe = await pipeline(PIPELINE_ID, model_id, DEFAULT_MODEL_OPTIONS); + }, MAX_MODEL_LOAD_TIME); + + it( + "single", + async () => { + const image = await load_cached_image("man_on_car"); + + const output = await pipe(image); + + // First, check mask shapes + for (const item of output) { + expect(item.mask.width).toEqual(image.width); + expect(item.mask.height).toEqual(image.height); + expect(item.mask.channels).toEqual(1); + delete item.mask; // No longer needed + } + + // Next, compare scores and labels + const target = [ + { score: null, label: "Background" }, + { score: null, label: "Hair" }, + { score: null, label: "Upper-clothes" }, + { score: null, label: "Pants" }, + { score: null, label: "Left-shoe" }, + { score: null, label: "Right-shoe" }, + { score: null, label: "Face" }, + { score: null, label: "Right-leg" }, + { score: null, label: "Left-arm" }, + { score: null, label: "Right-arm" }, + { score: null, label: "Bag" }, + ]; + + expect(output).toBeCloseToNested(target, 2); + }, + MAX_TEST_EXECUTION_TIME, + ); + + afterAll(async () => { + await pipe.dispose(); + }, MAX_MODEL_DISPOSE_TIME); + }); + }); +}; diff --git a/tests/pipelines/test_pipelines_image_to_image.js b/tests/pipelines/test_pipelines_image_to_image.js new file mode 100644 index 000000000..c7b9a00d2 --- /dev/null +++ b/tests/pipelines/test_pipelines_image_to_image.js @@ -0,0 +1,56 @@ +import { pipeline, ImageToImagePipeline } from "../../src/transformers.js"; + +import { MAX_MODEL_LOAD_TIME, MAX_TEST_EXECUTION_TIME, MAX_MODEL_DISPOSE_TIME, DEFAULT_MODEL_OPTIONS } from "../init.js"; +import { load_cached_image } from "../asset_cache.js"; + +const PIPELINE_ID = "image-to-image"; + +export default () => { + describe("Image to Image", () => { + const model_id = "hf-internal-testing/tiny-random-Swin2SRForImageSuperResolution"; + /** @type {ImageToImagePipeline} */ + let pipe; + let images; + beforeAll(async () => { + pipe = await pipeline(PIPELINE_ID, model_id, DEFAULT_MODEL_OPTIONS); + images = await Promise.all([load_cached_image("white_image"), load_cached_image("blue_image")]); + }, MAX_MODEL_LOAD_TIME); + + it("should be an instance of ImageToImagePipeline", () => { + expect(pipe).toBeInstanceOf(ImageToImagePipeline); + }); + + describe("batch_size=1", () => { + it( + "default", + async () => { + const output = await pipe(images[0]); + expect(output.size).toEqual([64, 64]); + expect(output.channels).toEqual(3); + expect(output.data.reduce((a, b) => a + b, 0) / output.data.length).toBeCloseTo(110.107421875, 3); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); + + describe("batch_size>1", () => { + it( + "default", + async () => { + const output = await pipe(images); + expect(output[0].size).toEqual([64, 64]); + expect(output[0].channels).toEqual(3); + expect(output[0].data.reduce((a, b) => a + b, 0) / output[0].data.length).toBeCloseTo(110.107421875, 3); + expect(output[1].size).toEqual([64, 64]); + expect(output[1].channels).toEqual(3); + expect(output[1].data.reduce((a, b) => a + b, 0) / output[1].data.length).toBeCloseTo(110.60196940104167, 3); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); + + 
afterAll(async () => { + await pipe.dispose(); + }, MAX_MODEL_DISPOSE_TIME); + }); +}; diff --git a/tests/pipelines/test_pipelines_image_to_text.js b/tests/pipelines/test_pipelines_image_to_text.js new file mode 100644 index 000000000..f7d951811 --- /dev/null +++ b/tests/pipelines/test_pipelines_image_to_text.js @@ -0,0 +1,51 @@ +import { pipeline, ImageToTextPipeline } from "../../src/transformers.js"; + +import { MAX_MODEL_LOAD_TIME, MAX_TEST_EXECUTION_TIME, MAX_MODEL_DISPOSE_TIME, DEFAULT_MODEL_OPTIONS } from "../init.js"; +import { load_cached_image } from "../asset_cache.js"; + +const PIPELINE_ID = "image-to-text"; + +export default () => { + describe("Image to Text", () => { + const model_id = "hf-internal-testing/tiny-random-VisionEncoderDecoderModel-vit-gpt2"; + /** @type {ImageToTextPipeline} */ + let pipe; + let images; + beforeAll(async () => { + pipe = await pipeline(PIPELINE_ID, model_id, DEFAULT_MODEL_OPTIONS); + images = await Promise.all([load_cached_image("white_image"), load_cached_image("blue_image")]); + }, MAX_MODEL_LOAD_TIME); + + it("should be an instance of ImageToTextPipeline", () => { + expect(pipe).toBeInstanceOf(ImageToTextPipeline); + }); + + describe("batch_size=1", () => { + it( + "default", + async () => { + const output = await pipe(images[0]); + const target = [{ generated_text: "" }]; + expect(output).toEqual(target); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); + + describe("batch_size>1", () => { + it( + "default", + async () => { + const output = await pipe(images); + const target = [[{ generated_text: "" }], [{ generated_text: "" }]]; + expect(output).toEqual(target); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); + + afterAll(async () => { + await pipe.dispose(); + }, MAX_MODEL_DISPOSE_TIME); + }); +}; diff --git a/tests/pipelines/test_pipelines_object_detection.js b/tests/pipelines/test_pipelines_object_detection.js new file mode 100644 index 000000000..e9b0375d0 --- /dev/null +++ b/tests/pipelines/test_pipelines_object_detection.js @@ -0,0 +1,131 @@ +import { pipeline, ObjectDetectionPipeline } from "../../src/transformers.js"; + +import { MAX_MODEL_LOAD_TIME, MAX_TEST_EXECUTION_TIME, MAX_MODEL_DISPOSE_TIME, DEFAULT_MODEL_OPTIONS } from "../init.js"; +import { load_cached_image } from "../asset_cache.js"; + +const PIPELINE_ID = "object-detection"; + +export default () => { + describe("Object Detection", () => { + describe("yolos", () => { + const model_id = "Xenova/yolos-tiny"; + /** @type {ObjectDetectionPipeline} */ + let pipe; + beforeAll(async () => { + pipe = await pipeline(PIPELINE_ID, model_id, DEFAULT_MODEL_OPTIONS); + }, MAX_MODEL_LOAD_TIME); + + it("should be an instance of ObjectDetectionPipeline", () => { + expect(pipe).toBeInstanceOf(ObjectDetectionPipeline); + }); + + it( + "single + threshold", + async () => { + const image = await load_cached_image("cats"); + const output = await pipe(image, { threshold: 0.9 }); + + const target = [ + { + score: 0.9921281933784485, + label: "remote", + box: { xmin: 32, ymin: 78, xmax: 185, ymax: 117 }, + }, + { + score: 0.9884883165359497, + label: "remote", + box: { xmin: 324, ymin: 82, xmax: 376, ymax: 191 }, + }, + { + score: 0.9197800159454346, + label: "cat", + box: { xmin: 5, ymin: 56, xmax: 321, ymax: 469 }, + }, + { + score: 0.9300552606582642, + label: "cat", + box: { xmin: 332, ymin: 25, xmax: 638, ymax: 369 }, + }, + ]; + expect(output).toBeCloseToNested(target, 5); + }, + MAX_TEST_EXECUTION_TIME, + ); + + afterAll(async () => { + await pipe.dispose(); + }, MAX_MODEL_DISPOSE_TIME); + }); + + 
describe("tiny-random", () => { + const model_id = "hf-internal-testing/tiny-random-DetrForObjectDetection"; + + /** @type {ObjectDetectionPipeline} */ + let pipe; + let images; + + beforeAll(async () => { + pipe = await pipeline(PIPELINE_ID, model_id, DEFAULT_MODEL_OPTIONS); + images = await Promise.all([load_cached_image("white_image"), load_cached_image("blue_image")]); + }, MAX_MODEL_LOAD_TIME); + + it("should be an instance of ObjectDetectionPipeline", () => { + expect(pipe).toBeInstanceOf(ObjectDetectionPipeline); + }); + + describe("batch_size=1", () => { + it( + "default (threshold unset)", + async () => { + const output = await pipe(images[0]); + const target = []; + expect(output).toBeCloseToNested(target, 5); + }, + MAX_TEST_EXECUTION_TIME, + ); + it( + "default (threshold=0)", + async () => { + const output = await pipe(images[0], { threshold: 0 }); + const target = [ + { score: 0.020360443741083145, label: "LABEL_31", box: { xmin: 56, ymin: 55, xmax: 169, ymax: 167 } }, + { score: 0.020360419526696205, label: "LABEL_31", box: { xmin: 56, ymin: 55, xmax: 169, ymax: 167 } }, + { score: 0.02036038413643837, label: "LABEL_31", box: { xmin: 56, ymin: 55, xmax: 169, ymax: 167 } }, + { score: 0.020360447466373444, label: "LABEL_31", box: { xmin: 56, ymin: 55, xmax: 169, ymax: 167 } }, + { score: 0.020360389724373817, label: "LABEL_31", box: { xmin: 56, ymin: 55, xmax: 169, ymax: 167 } }, + { score: 0.020360423251986504, label: "LABEL_31", box: { xmin: 56, ymin: 55, xmax: 169, ymax: 167 } }, + { score: 0.02036040835082531, label: "LABEL_31", box: { xmin: 56, ymin: 55, xmax: 169, ymax: 167 } }, + { score: 0.020360363647341728, label: "LABEL_31", box: { xmin: 56, ymin: 55, xmax: 169, ymax: 167 } }, + { score: 0.020360389724373817, label: "LABEL_31", box: { xmin: 56, ymin: 55, xmax: 169, ymax: 167 } }, + { score: 0.020360389724373817, label: "LABEL_31", box: { xmin: 56, ymin: 55, xmax: 169, ymax: 167 } }, + { score: 0.020360343158245087, label: "LABEL_31", box: { xmin: 56, ymin: 55, xmax: 169, ymax: 167 } }, + { score: 0.020360423251986504, label: "LABEL_31", box: { xmin: 56, ymin: 55, xmax: 169, ymax: 167 } }, + ]; + expect(output).toBeCloseToNested(target, 5); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); + + // TODO: Add batched support to object detection pipeline + // describe('batch_size>1', () => { + // it('default (threshold unset)', async () => { + // const output = await pipe(images); + // console.log(output); + // const target = []; + // expect(output).toBeCloseToNested(target, 5); + // }, MAX_TEST_EXECUTION_TIME); + // it('default (threshold=0)', async () => { + // const output = await pipe(images, { threshold: 0 }); + // console.log(output); + // const target = []; + // expect(output).toBeCloseToNested(target, 5); + // }, MAX_TEST_EXECUTION_TIME); + // }); + + afterAll(async () => { + await pipe.dispose(); + }, MAX_MODEL_DISPOSE_TIME); + }); + }); +}; diff --git a/tests/pipelines/test_pipelines_question_answering.js b/tests/pipelines/test_pipelines_question_answering.js new file mode 100644 index 000000000..ff346c03b --- /dev/null +++ b/tests/pipelines/test_pipelines_question_answering.js @@ -0,0 +1,49 @@ +import { pipeline, QuestionAnsweringPipeline } from "../../src/transformers.js"; + +import { MAX_MODEL_LOAD_TIME, MAX_TEST_EXECUTION_TIME, MAX_MODEL_DISPOSE_TIME, DEFAULT_MODEL_OPTIONS } from "../init.js"; + +const PIPELINE_ID = "question-answering"; + +export default () => { + describe("Question Answering", () => { + const model_id = 
"hf-internal-testing/tiny-random-BertForQuestionAnswering"; + /** @type {QuestionAnsweringPipeline} */ + let pipe; + beforeAll(async () => { + pipe = await pipeline(PIPELINE_ID, model_id, DEFAULT_MODEL_OPTIONS); + }, MAX_MODEL_LOAD_TIME); + + it("should be an instance of QuestionAnsweringPipeline", () => { + expect(pipe).toBeInstanceOf(QuestionAnsweringPipeline); + }); + + describe("batch_size=1", () => { + it( + "default (top_k=1)", + async () => { + const output = await pipe("a", "b c"); + const target = { score: 0.11395696550607681, /* start: 0, end: 1, */ answer: "b" }; + expect(output).toBeCloseToNested(target, 5); + }, + MAX_TEST_EXECUTION_TIME, + ); + it( + "custom (top_k=3)", + async () => { + const output = await pipe("a", "b c", { top_k: 3 }); + const target = [ + { score: 0.11395696550607681, /* start: 0, end: 1, */ answer: "b" }, + { score: 0.11300431191921234, /* start: 2, end: 3, */ answer: "c" }, + { score: 0.10732574015855789, /* start: 0, end: 3, */ answer: "b c" }, + ]; + expect(output).toBeCloseToNested(target, 5); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); + + afterAll(async () => { + await pipe.dispose(); + }, MAX_MODEL_DISPOSE_TIME); + }); +}; diff --git a/tests/pipelines/test_pipelines_summarization.js b/tests/pipelines/test_pipelines_summarization.js new file mode 100644 index 000000000..877fd81e9 --- /dev/null +++ b/tests/pipelines/test_pipelines_summarization.js @@ -0,0 +1,40 @@ +import { pipeline, SummarizationPipeline } from "../../src/transformers.js"; + +import { MAX_MODEL_LOAD_TIME, MAX_TEST_EXECUTION_TIME, MAX_MODEL_DISPOSE_TIME, DEFAULT_MODEL_OPTIONS } from "../init.js"; + +const PIPELINE_ID = "summarization"; + +export default () => { + describe("Summarization", () => { + const model_id = "hf-internal-testing/tiny-random-T5ForConditionalGeneration"; + + /** @type {SummarizationPipeline} */ + let pipe; + beforeAll(async () => { + pipe = await pipeline(PIPELINE_ID, model_id, DEFAULT_MODEL_OPTIONS); + }, MAX_MODEL_LOAD_TIME); + + it("should be an instance of SummarizationPipeline", () => { + expect(pipe).toBeInstanceOf(SummarizationPipeline); + }); + + describe("batch_size=1", () => { + it( + "default", + async () => { + const text = "This is a test."; + const output = await pipe(text, { + max_new_tokens: 5, + }); + const target = [{ summary_text: "" }]; + expect(output).toEqual(target); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); + + afterAll(async () => { + await pipe.dispose(); + }, MAX_MODEL_DISPOSE_TIME); + }); +}; diff --git a/tests/pipelines/test_pipelines_text2text_generation.js b/tests/pipelines/test_pipelines_text2text_generation.js new file mode 100644 index 000000000..0084fbbd2 --- /dev/null +++ b/tests/pipelines/test_pipelines_text2text_generation.js @@ -0,0 +1,40 @@ +import { pipeline, Text2TextGenerationPipeline } from "../../src/transformers.js"; + +import { MAX_MODEL_LOAD_TIME, MAX_TEST_EXECUTION_TIME, MAX_MODEL_DISPOSE_TIME, DEFAULT_MODEL_OPTIONS } from "../init.js"; + +const PIPELINE_ID = "text2text-generation"; + +export default () => { + describe("Text to Text Generation", () => { + const model_id = "hf-internal-testing/tiny-random-T5ForConditionalGeneration"; + + /** @type {Text2TextGenerationPipeline} */ + let pipe; + beforeAll(async () => { + pipe = await pipeline(PIPELINE_ID, model_id, DEFAULT_MODEL_OPTIONS); + }, MAX_MODEL_LOAD_TIME); + + it("should be an instance of Text2TextGenerationPipeline", () => { + expect(pipe).toBeInstanceOf(Text2TextGenerationPipeline); + }); + + describe("batch_size=1", () => { + it( + "default", + async () 
=> { + const text = "This is a test."; + const output = await pipe(text, { + max_new_tokens: 5, + }); + const target = [{ generated_text: "" }]; + expect(output).toEqual(target); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); + + afterAll(async () => { + await pipe.dispose(); + }, MAX_MODEL_DISPOSE_TIME); + }); +}; diff --git a/tests/pipelines/test_pipelines_text_classification.js b/tests/pipelines/test_pipelines_text_classification.js new file mode 100644 index 000000000..13a78f1a6 --- /dev/null +++ b/tests/pipelines/test_pipelines_text_classification.js @@ -0,0 +1,107 @@ +import { pipeline, TextClassificationPipeline } from "../../src/transformers.js"; + +import { MAX_MODEL_LOAD_TIME, MAX_TEST_EXECUTION_TIME, MAX_MODEL_DISPOSE_TIME, DEFAULT_MODEL_OPTIONS } from "../init.js"; + +const PIPELINE_ID = "text-classification"; + +export default () => { + describe("Text Classification", () => { + const model_id = "hf-internal-testing/tiny-random-BertForSequenceClassification"; + + /** @type {TextClassificationPipeline} */ + let pipe; + beforeAll(async () => { + pipe = await pipeline(PIPELINE_ID, model_id, DEFAULT_MODEL_OPTIONS); + }, MAX_MODEL_LOAD_TIME); + + it("should be an instance of TextClassificationPipeline", () => { + expect(pipe).toBeInstanceOf(TextClassificationPipeline); + }); + + describe("batch_size=1", () => { + it( + "default (top_k=1)", + async () => { + const output = await pipe("a"); + const target = [{ label: "LABEL_0", score: 0.5076976418495178 }]; + expect(output).toBeCloseToNested(target, 5); + }, + MAX_TEST_EXECUTION_TIME, + ); + it( + "custom (top_k=2)", + async () => { + const output = await pipe("a", { top_k: 2 }); + const target = [ + { label: "LABEL_0", score: 0.5076976418495178 }, + { label: "LABEL_1", score: 0.49230238795280457 }, + ]; + expect(output).toBeCloseToNested(target, 5); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); + + describe("batch_size>1", () => { + it( + "default (top_k=1)", + async () => { + const output = await pipe(["a", "b c"]); + const target = [ + { label: "LABEL_0", score: 0.5076976418495178 }, + { label: "LABEL_0", score: 0.5077522993087769 }, + ]; + expect(output).toBeCloseToNested(target, 5); + }, + MAX_TEST_EXECUTION_TIME, + ); + it( + "custom (top_k=2)", + async () => { + const output = await pipe(["a", "b c"], { top_k: 2 }); + const target = [ + [ + { label: "LABEL_0", score: 0.5076976418495178 }, + { label: "LABEL_1", score: 0.49230238795280457 }, + ], + [ + { label: "LABEL_0", score: 0.5077522993087769 }, + { label: "LABEL_1", score: 0.49224773049354553 }, + ], + ]; + expect(output).toBeCloseToNested(target, 5); + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + "multi_label_classification", + async () => { + const problem_type = pipe.model.config.problem_type; + pipe.model.config.problem_type = "multi_label_classification"; + + const output = await pipe(["a", "b c"], { top_k: 2 }); + const target = [ + [ + { label: "LABEL_0", score: 0.5001373887062073 }, + { label: "LABEL_1", score: 0.49243971705436707 }, + ], + [ + { label: "LABEL_0", score: 0.5001326203346252 }, + { label: "LABEL_1", score: 0.492380291223526 }, + ], + ]; + expect(output).toBeCloseToNested(target, 5); + + // Reset problem type + pipe.model.config.problem_type = problem_type; + }, + MAX_TEST_EXECUTION_TIME, + ); + }); + + afterAll(async () => { + await pipe.dispose(); + }, MAX_MODEL_DISPOSE_TIME); + }); +}; diff --git a/tests/pipelines/test_pipelines_text_generation.js b/tests/pipelines/test_pipelines_text_generation.js new file mode 100644 index 000000000..085808263 --- 
/dev/null +++ b/tests/pipelines/test_pipelines_text_generation.js @@ -0,0 +1,109 @@ +import { pipeline, TextGenerationPipeline } from "../../src/transformers.js"; + +import { MAX_MODEL_LOAD_TIME, MAX_TEST_EXECUTION_TIME, MAX_MODEL_DISPOSE_TIME, DEFAULT_MODEL_OPTIONS } from "../init.js"; + +const PIPELINE_ID = "text-generation"; + +export default () => { + describe("Text Generation", () => { + const model_id = "hf-internal-testing/tiny-random-LlamaForCausalLM"; + + /** @type {TextGenerationPipeline} */ + let pipe; + beforeAll(async () => { + pipe = await pipeline(PIPELINE_ID, model_id, DEFAULT_MODEL_OPTIONS); + }, MAX_MODEL_LOAD_TIME); + + it("should be an instance of TextGenerationPipeline", () => { + expect(pipe).toBeInstanceOf(TextGenerationPipeline); + }); + + describe("batch_size=1", () => { + const text_input = "hello"; + const generated_text_target = "erdingsAndroid Load"; + const text_target = [{ generated_text: text_input + generated_text_target }]; + const new_text_target = [{ generated_text: generated_text_target }]; + + const chat_input = [ + { role: "system", content: "a" }, + { role: "user", content: "b" }, + ]; + const chat_target = [ + { + generated_text: [ + { role: "system", content: "a" }, + { role: "user", content: "b" }, + { role: "assistant", content: " Southern abund Load" }, + ], + }, + ]; + + it( + "text input (single)", + async () => { + const output = await pipe(text_input, { max_new_tokens: 3 }); + expect(output).toEqual(text_target); + }, + MAX_TEST_EXECUTION_TIME, + ); + it( + "text input (list)", + async () => { + const output = await pipe([text_input], { max_new_tokens: 3 }); + expect(output).toEqual([text_target]); + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + "text input (single) - return_full_text=false", + async () => { + const output = await pipe(text_input, { max_new_tokens: 3, return_full_text: false }); + expect(output).toEqual(new_text_target); + }, + MAX_TEST_EXECUTION_TIME, + ); + it( + "text input (list) - return_full_text=false", + async () => { + const output = await pipe([text_input], { max_new_tokens: 3, return_full_text: false }); + expect(output).toEqual([new_text_target]); + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + "chat input (single)", + async () => { + const output = await pipe(chat_input, { max_new_tokens: 3 }); + expect(output).toEqual(chat_target); + }, + MAX_TEST_EXECUTION_TIME, + ); + it( + "chat input (list)", + async () => { + const output = await pipe([chat_input], { max_new_tokens: 3 }); + expect(output).toEqual([chat_target]); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); + + // TODO: Fix batch_size>1 + // describe('batch_size>1', () => { + // it('default', async () => { + // const output = await pipe(['hello', 'hello world']); + // const target = [ + // [{generated_text: 'helloerdingsAndroid Load'}], + // [{generated_text: 'hello world zerosMillнал'}], + // ]; + // expect(output).toEqual(target); + // }, MAX_TEST_EXECUTION_TIME); + // }); + + afterAll(async () => { + await pipe.dispose(); + }, MAX_MODEL_DISPOSE_TIME); + }); +}; diff --git a/tests/pipelines/test_pipelines_text_to_audio.js b/tests/pipelines/test_pipelines_text_to_audio.js new file mode 100644 index 000000000..d37f0203e --- /dev/null +++ b/tests/pipelines/test_pipelines_text_to_audio.js @@ -0,0 +1,37 @@ +import { pipeline, TextToAudioPipeline } from "../../src/transformers.js"; + +import { MAX_MODEL_LOAD_TIME, MAX_TEST_EXECUTION_TIME, MAX_MODEL_DISPOSE_TIME, DEFAULT_MODEL_OPTIONS } from "../init.js"; + +const PIPELINE_ID = "text-to-audio"; + +export default () 
=> { + describe("Text to Audio", () => { + const model_id = "Xenova/tiny-random-vits"; + + /** @type {TextToAudioPipeline} */ + let pipe; + beforeAll(async () => { + pipe = await pipeline(PIPELINE_ID, model_id, DEFAULT_MODEL_OPTIONS); + }, MAX_MODEL_LOAD_TIME); + + it("should be an instance of TextToAudioPipeline", () => { + expect(pipe).toBeInstanceOf(TextToAudioPipeline); + }); + + it( + "default", + async () => { + const output = await pipe("hello"); + expect(output.audio).toHaveLength(6400); + // NOTE: The mean value is not deterministic, so we just check the first few digits + expect(output.audio.reduce((a, b) => a + b, 0) / output.audio.length).toBeCloseTo(-0.0125, 2); + expect(output.sampling_rate).toEqual(16000); + }, + MAX_TEST_EXECUTION_TIME, + ); + + afterAll(async () => { + await pipe.dispose(); + }, MAX_MODEL_DISPOSE_TIME); + }); +}; diff --git a/tests/pipelines/test_pipelines_token_classification.js b/tests/pipelines/test_pipelines_token_classification.js new file mode 100644 index 000000000..9d91813d4 --- /dev/null +++ b/tests/pipelines/test_pipelines_token_classification.js @@ -0,0 +1,157 @@ +import { pipeline, TokenClassificationPipeline } from "../../src/transformers.js"; + +import { MAX_MODEL_LOAD_TIME, MAX_TEST_EXECUTION_TIME, MAX_MODEL_DISPOSE_TIME, DEFAULT_MODEL_OPTIONS } from "../init.js"; + +const PIPELINE_ID = "token-classification"; + +export default () => { + describe("Token Classification", () => { + const model_id = "hf-internal-testing/tiny-random-BertForTokenClassification"; + /** @type {TokenClassificationPipeline} */ + let pipe; + beforeAll(async () => { + pipe = await pipeline(PIPELINE_ID, model_id, DEFAULT_MODEL_OPTIONS); + }, MAX_MODEL_LOAD_TIME); + + it("should be an instance of TokenClassificationPipeline", () => { + expect(pipe).toBeInstanceOf(TokenClassificationPipeline); + }); + + describe("batch_size=1", () => { + it( + "default", + async () => { + const output = await pipe("1 2 3"); + + // TODO: Add start/end to target + const target = [ + { + entity: "LABEL_0", + score: 0.5292708, + index: 1, + word: "1", + // 'start': 0, 'end': 1 + }, + { + entity: "LABEL_0", + score: 0.5353687, + index: 2, + word: "2", + // 'start': 2, 'end': 3 + }, + { + entity: "LABEL_1", + score: 0.51381934, + index: 3, + word: "3", + // 'start': 4, 'end': 5 + }, + ]; + expect(output).toBeCloseToNested(target, 5); + }, + MAX_TEST_EXECUTION_TIME, + ); + it( + "custom (ignore_labels set)", + async () => { + const output = await pipe("1 2 3", { ignore_labels: ["LABEL_0"] }); + const target = [ + { + entity: "LABEL_1", + score: 0.51381934, + index: 3, + word: "3", + // 'start': 4, 'end': 5 + }, + ]; + expect(output).toBeCloseToNested(target, 5); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); + + describe("batch_size>1", () => { + it( + "default", + async () => { + const output = await pipe(["1 2 3", "4 5"]); + const target = [ + [ + { + entity: "LABEL_0", + score: 0.5292708, + index: 1, + word: "1", + // 'start': 0, 'end': 1 + }, + { + entity: "LABEL_0", + score: 0.5353687, + index: 2, + word: "2", + // 'start': 2, 'end': 3 + }, + { + entity: "LABEL_1", + score: 0.51381934, + index: 3, + word: "3", + // 'start': 4, 'end': 5 + }, + ], + [ + { + entity: "LABEL_0", + score: 0.5432807, + index: 1, + word: "4", + // 'start': 0, 'end': 1 + }, + { + entity: "LABEL_1", + score: 0.5007693, + index: 2, + word: "5", + // 'start': 2, 'end': 3 + }, + ], + ]; + expect(output).toBeCloseToNested(target, 5); + }, + MAX_TEST_EXECUTION_TIME, + ); + it( + "custom (ignore_labels set)", + async () => { + 
const output = await pipe(["1 2 3", "4 5"], { ignore_labels: ["LABEL_0"] }); + const target = [ + [ + { + entity: "LABEL_1", + score: 0.51381934, + index: 3, + word: "3", + // 'start': 4, 'end': 5 + }, + ], + [ + { + entity: "LABEL_1", + score: 0.5007693, + index: 2, + word: "5", + // 'start': 2, 'end': 3 + }, + ], + ]; + expect(output).toBeCloseToNested(target, 5); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); + + afterAll(async () => { + await pipe.dispose(); + }, MAX_MODEL_DISPOSE_TIME); + }); +}; diff --git a/tests/pipelines/test_pipelines_translation.js b/tests/pipelines/test_pipelines_translation.js new file mode 100644 index 000000000..97afce8d6 --- /dev/null +++ b/tests/pipelines/test_pipelines_translation.js @@ -0,0 +1,42 @@ +import { pipeline, TranslationPipeline } from "../../src/transformers.js"; + +import { MAX_MODEL_LOAD_TIME, MAX_TEST_EXECUTION_TIME, MAX_MODEL_DISPOSE_TIME, DEFAULT_MODEL_OPTIONS } from "../init.js"; + +const PIPELINE_ID = "translation"; + +export default () => { + describe("Translation", () => { + const model_id = "Xenova/tiny-random-M2M100ForConditionalGeneration"; + + /** @type {TranslationPipeline} */ + let pipe; + beforeAll(async () => { + pipe = await pipeline(PIPELINE_ID, model_id, DEFAULT_MODEL_OPTIONS); + }, MAX_MODEL_LOAD_TIME); + + it("should be an instance of TranslationPipeline", () => { + expect(pipe).toBeInstanceOf(TranslationPipeline); + }); + + describe("batch_size=1", () => { + it( + "default", + async () => { + const text = "जीवन एक चॉकलेट बॉक्स की तरह है।"; + const output = await pipe(text, { + src_lang: "hi", + tgt_lang: "fr", + max_new_tokens: 5, + }); + const target = [{ translation_text: "Slovenska төсли төсли төсли" }]; + expect(output).toEqual(target); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); + + afterAll(async () => { + await pipe.dispose(); + }, MAX_MODEL_DISPOSE_TIME); + }); +}; diff --git a/tests/pipelines/test_pipelines_zero_shot.js b/tests/pipelines/test_pipelines_zero_shot.js new file mode 100644 index 000000000..1c30db9ea --- /dev/null +++ b/tests/pipelines/test_pipelines_zero_shot.js @@ -0,0 +1,100 @@ +import { pipeline, ZeroShotClassificationPipeline } from "../../src/transformers.js"; + +import { MAX_MODEL_LOAD_TIME, MAX_TEST_EXECUTION_TIME, MAX_MODEL_DISPOSE_TIME, DEFAULT_MODEL_OPTIONS } from "../init.js"; + +const PIPELINE_ID = "zero-shot-classification"; + +export default () => { + describe("Zero-shot Classification", () => { + const model_id = "hf-internal-testing/tiny-random-BertForSequenceClassification"; + /** @type {ZeroShotClassificationPipeline} */ + let pipe; + + beforeAll(async () => { + pipe = await pipeline(PIPELINE_ID, model_id, { + ...DEFAULT_MODEL_OPTIONS, + + // The model isn't designed for zero-shot classification, so we set the config + config: { + model_type: "bert", + id2label: { + 0: "contradiction", + 1: "entailment", + }, + label2id: { + contradiction: 0, + entailment: 1, + }, + }, + }); + }, MAX_MODEL_LOAD_TIME); + + it("should be an instance of ZeroShotClassificationPipeline", () => { + expect(pipe).toBeInstanceOf(ZeroShotClassificationPipeline); + }); + const sequences_to_classify = ["one day I will see the world", "I love making pizza"]; + const candidate_labels = ["travel", "cooking", "dancing"]; + + it( + "Single sequence classification", + async () => { + const output = await pipe(sequences_to_classify[0], candidate_labels); + const target = { + sequence: "one day I will see the world", + labels: ["dancing", "cooking", "travel"], + scores: [0.3333353410546293, 0.3333348269618681, 
0.3333298319835025], + }; + expect(output).toBeCloseToNested(target, 5); + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + "Batched classification", + async () => { + const output = await pipe(sequences_to_classify, candidate_labels); + const target = [ + { + sequence: "one day I will see the world", + labels: ["dancing", "cooking", "travel"], + scores: [0.3333353410546293, 0.3333348269618681, 0.3333298319835025], + }, + { + sequence: "I love making pizza", + labels: ["dancing", "cooking", "travel"], + scores: [0.3333347058960895, 0.3333337292465588, 0.3333315648573516], + }, + ]; + expect(output).toBeCloseToNested(target, 5); + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + "Batched + multilabel classification", + async () => { + const candidate_labels = ["travel", "cooking", "dancing"]; + + const output = await pipe(sequences_to_classify, candidate_labels, { multi_label: true }); + const target = [ + { + sequence: "one day I will see the world", + labels: ["dancing", "cooking", "travel"], + scores: [0.49231469615364476, 0.4923134953805702, 0.4923094795142658], + }, + { + sequence: "I love making pizza", + labels: ["dancing", "cooking", "travel"], + scores: [0.49230751217535645, 0.49230615475943956, 0.4923042569480609], + }, + ]; + expect(output).toBeCloseToNested(target, 5); + }, + MAX_TEST_EXECUTION_TIME, + ); + + afterAll(async () => { + await pipe.dispose(); + }, MAX_MODEL_DISPOSE_TIME); + }); +}; diff --git a/tests/pipelines/test_pipelines_zero_shot_audio_classification.js b/tests/pipelines/test_pipelines_zero_shot_audio_classification.js new file mode 100644 index 000000000..00dd328ea --- /dev/null +++ b/tests/pipelines/test_pipelines_zero_shot_audio_classification.js @@ -0,0 +1,58 @@ +import { pipeline, ZeroShotAudioClassificationPipeline } from "../../src/transformers.js"; + +import { MAX_MODEL_LOAD_TIME, MAX_TEST_EXECUTION_TIME, MAX_MODEL_DISPOSE_TIME, DEFAULT_MODEL_OPTIONS } from "../init.js"; +import { load_cached_audio } from "../asset_cache.js"; + +const PIPELINE_ID = "zero-shot-audio-classification"; + +export default () => { + describe("Zero-shot Audio Classification", () => { + const model_id = "hf-internal-testing/tiny-clap-htsat-unfused"; + + const labels = ["cat", "dog"]; + const hypothesis_template = "sound of a {}"; + + /** @type {ZeroShotAudioClassificationPipeline} */ + let pipe; + let audio; + beforeAll(async () => { + pipe = await pipeline(PIPELINE_ID, model_id, DEFAULT_MODEL_OPTIONS); + audio = await load_cached_audio("mlk"); + }, MAX_MODEL_LOAD_TIME); + + it("should be an instance of ZeroShotAudioClassificationPipeline", () => { + expect(pipe).toBeInstanceOf(ZeroShotAudioClassificationPipeline); + }); + + describe("batch_size=1", () => { + it( + "default", + async () => { + const output = await pipe(audio, labels); + const target = [ + { score: 0.4990939795970917, label: "cat" }, + { score: 0.5009059906005859, label: "dog" }, + ]; + expect(output).toBeCloseToNested(target, 5); + }, + MAX_TEST_EXECUTION_TIME, + ); + it( + "custom (w/ hypothesis_template)", + async () => { + const output = await pipe(audio, labels, { hypothesis_template }); + const target = [ + { score: 0.4987950325012207, label: "cat" }, + { score: 0.5012049674987793, label: "dog" }, + ]; + expect(output).toBeCloseToNested(target, 5); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); + + afterAll(async () => { + await pipe.dispose(); + }, MAX_MODEL_DISPOSE_TIME); + }); +}; diff --git a/tests/pipelines/test_pipelines_zero_shot_image_classification.js 
b/tests/pipelines/test_pipelines_zero_shot_image_classification.js new file mode 100644 index 000000000..dfa1e23e8 --- /dev/null +++ b/tests/pipelines/test_pipelines_zero_shot_image_classification.js @@ -0,0 +1,98 @@ +import { pipeline, ZeroShotImageClassificationPipeline } from "../../src/transformers.js"; + +import { MAX_MODEL_LOAD_TIME, MAX_TEST_EXECUTION_TIME, MAX_MODEL_DISPOSE_TIME, DEFAULT_MODEL_OPTIONS } from "../init.js"; +import { load_cached_image } from "../asset_cache.js"; + +const PIPELINE_ID = "zero-shot-image-classification"; + +export default () => { + describe("Zero-shot Image Classification", () => { + const model_id = "hf-internal-testing/tiny-random-GroupViTModel"; + + // Example adapted from https://huggingface.co/docs/transformers/en/model_doc/groupvit + const labels = ["cat", "dog"]; + const hypothesis_template = "a photo of a {}"; + + /** @type {ZeroShotImageClassificationPipeline} */ + let pipe; + let images; + beforeAll(async () => { + pipe = await pipeline(PIPELINE_ID, model_id, DEFAULT_MODEL_OPTIONS); + images = await Promise.all([load_cached_image("white_image"), load_cached_image("blue_image")]); + }, MAX_MODEL_LOAD_TIME); + + it("should be an instance of ZeroShotImageClassificationPipeline", () => { + expect(pipe).toBeInstanceOf(ZeroShotImageClassificationPipeline); + }); + + describe("batch_size=1", () => { + it( + "default", + async () => { + const output = await pipe(images[0], labels); + const target = [ + { score: 0.5990662574768066, label: "cat" }, + { score: 0.40093377232551575, label: "dog" }, + ]; + expect(output).toBeCloseToNested(target, 5); + }, + MAX_TEST_EXECUTION_TIME, + ); + it( + "custom (w/ hypothesis_template)", + async () => { + const output = await pipe(images[0], labels, { hypothesis_template }); + const target = [ + { score: 0.5527022480964661, label: "cat" }, + { score: 0.44729775190353394, label: "dog" }, + ]; + expect(output).toBeCloseToNested(target, 5); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); + + describe("batch_size>1", () => { + it( + "default", + async () => { + const output = await pipe(images, labels); + const target = [ + [ + { score: 0.5990662574768066, label: "cat" }, + { score: 0.40093377232551575, label: "dog" }, + ], + [ + { score: 0.5006340146064758, label: "dog" }, + { score: 0.49936598539352417, label: "cat" }, + ], + ]; + expect(output).toBeCloseToNested(target, 5); + }, + MAX_TEST_EXECUTION_TIME, + ); + it( + "custom (w/ hypothesis_template)", + async () => { + const output = await pipe(images, labels, { hypothesis_template }); + const target = [ + [ + { score: 0.5527022480964661, label: "cat" }, + { score: 0.44729775190353394, label: "dog" }, + ], + [ + { score: 0.5395973324775696, label: "cat" }, + { score: 0.46040263772010803, label: "dog" }, + ], + ]; + expect(output).toBeCloseToNested(target, 5); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); + + afterAll(async () => { + await pipe.dispose(); + }, MAX_MODEL_DISPOSE_TIME); + }); +}; diff --git a/tests/pipelines/test_pipelines_zero_shot_object_detection.js b/tests/pipelines/test_pipelines_zero_shot_object_detection.js new file mode 100644 index 000000000..f55690507 --- /dev/null +++ b/tests/pipelines/test_pipelines_zero_shot_object_detection.js @@ -0,0 +1,134 @@ +import { pipeline, ZeroShotObjectDetectionPipeline } from "../../src/transformers.js"; + +import { MAX_MODEL_LOAD_TIME, MAX_TEST_EXECUTION_TIME, MAX_MODEL_DISPOSE_TIME, DEFAULT_MODEL_OPTIONS } from "../init.js"; +import { load_cached_image } from "../asset_cache.js"; + +const PIPELINE_ID = 
"zero-shot-object-detection"; + +export default () => { + describe("Zero-shot Object Detection", () => { + const model_id = "hf-internal-testing/tiny-random-OwlViTForObjectDetection"; + + const candidate_labels = ["hello", "hello world"]; + + /** @type {ZeroShotObjectDetectionPipeline} */ + let pipe; + let images; + beforeAll(async () => { + pipe = await pipeline(PIPELINE_ID, model_id, DEFAULT_MODEL_OPTIONS); + images = await Promise.all([load_cached_image("white_image"), load_cached_image("blue_image")]); + }, MAX_MODEL_LOAD_TIME); + + const targets = { + white_image: [ + { + score: 0.6028420329093933, + label: "hello", + box: { xmin: 47, ymin: 117, xmax: 62, ymax: 134 }, + }, + { + score: 0.6026064157485962, + label: "hello world", + box: { xmin: 47, ymin: 117, xmax: 62, ymax: 134 }, + }, + { + score: 0.5987668037414551, + label: "hello world", + box: { xmin: 145, ymin: 47, xmax: 160, ymax: 63 }, + }, + { + score: 0.5986272692680359, + label: "hello", + box: { xmin: 89, ymin: 131, xmax: 104, ymax: 148 }, + }, + { + score: 0.5985949039459229, + label: "hello world", + box: { xmin: 89, ymin: 131, xmax: 104, ymax: 148 }, + }, + // ... many more + ], + + blue_image: [ + { + score: 0.6622366309165955, + label: "hello", + box: { xmin: 48, ymin: 45, xmax: 62, ymax: 61 }, + }, + { + score: 0.6562080383300781, + label: "hello world", + box: { xmin: 48, ymin: 45, xmax: 62, ymax: 61 }, + }, + { + score: 0.6493991613388062, + label: "hello world", + box: { xmin: 34, ymin: 58, xmax: 48, ymax: 74 }, + }, + { + score: 0.6476974487304688, + label: "hello", + box: { xmin: 34, ymin: 58, xmax: 48, ymax: 74 }, + }, + { + score: 0.6391685009002686, + label: "hello", + box: { xmin: 103, ymin: 59, xmax: 117, ymax: 75 }, + }, + // ... many more + ], + }; + + it("should be an instance of ZeroShotObjectDetectionPipeline", () => { + expect(pipe).toBeInstanceOf(ZeroShotObjectDetectionPipeline); + }); + + describe("batch_size=1", () => { + it( + "default", + async () => { + const output = await pipe(images[0], candidate_labels); + expect(output).toHaveLength(512); + + expect(output.slice(0, targets.white_image.length)).toBeCloseToNested(targets.white_image, 5); + }, + MAX_TEST_EXECUTION_TIME, + ); + it( + "custom (w/ top_k & threshold)", + async () => { + const top_k = 3; + const output = await pipe(images[0], candidate_labels, { top_k, threshold: 0.05 }); + expect(output).toBeCloseToNested(targets.white_image.slice(0, top_k), 5); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); + + describe("batch_size>1", () => { + it( + "default", + async () => { + const output = await pipe(images, candidate_labels); + const target = Object.values(targets); + expect(output.map((x, i) => x.slice(0, target[i].length))).toBeCloseToNested(target, 5); + }, + MAX_TEST_EXECUTION_TIME, + ); + it( + "custom (w/ top_k & threshold)", + async () => { + const top_k = 3; + const output = await pipe(images, candidate_labels, { top_k, threshold: 0.05 }); + const target = Object.values(targets).map((x) => x.slice(0, top_k)); + expect(output).toBeCloseToNested(target, 5); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); + + afterAll(async () => { + await pipe.dispose(); + }, MAX_MODEL_DISPOSE_TIME); + }); +}; diff --git a/tests/processors.test.js b/tests/processors.test.js index e35e555d2..d80ec91fb 100644 --- a/tests/processors.test.js +++ b/tests/processors.test.js @@ -1,521 +1,5 @@ -import fs from "fs"; -import path from "path"; +import { init } from "./init.js"; +import { collect_and_execute_tests } from "./test_utils.js"; -import { AutoProcessor } from 
"../src/transformers.js"; -import { load_cached_image } from "./asset_cache.js"; -import { init, MAX_TEST_TIME } from "./init.js"; -import { fileURLToPath } from "url"; - -// Initialise the testing environment init(); - -// Collect all unit tests, which can be found in files of the form: -// `tests/models//test_image_processors_.js` -const __filename = fileURLToPath(import.meta.url); -const __dirname = path.dirname(__filename); -const models_dir = path.join(__dirname, "models"); -const model_types = fs.readdirSync(models_dir); -for (const model_type of model_types) { - const dir = path.join(models_dir, model_type); - - if (!fs.existsSync(dir) || !fs.statSync(dir).isDirectory()) { - continue; - } - - const file = path.join(dir, `test_image_processing_${model_type}.js`); - if (!fs.existsSync(file)) { - continue; - } - - const { default: tests } = await import(file); - describe(model_type, tests); -} - -const sum = (array) => Number(array.reduce((a, b) => a + b, array instanceof BigInt64Array ? 0n : 0)); -const avg = (array) => sum(array) / array.length; - -const MODELS = { - florence2: "Xenova/tiny-random-Florence2ForConditionalGeneration", - qwen2_vl: "hf-internal-testing/tiny-random-Qwen2VLForConditionalGeneration", - idefics3: "hf-internal-testing/tiny-random-Idefics3ForConditionalGeneration", - paligemma: "hf-internal-testing/tiny-random-PaliGemmaForConditionalGeneration", -}; - -describe("Processors", () => { - describe("Audio processors", () => { - let audio; - beforeAll(async () => { - const url = "https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/mlk.npy"; - const buffer = await (await fetch(url)).arrayBuffer(); - audio = Float32Array.from(new Float64Array(buffer)); - }); - - it( - "WhisperFeatureExtractor", - async () => { - const processor = await AutoProcessor.from_pretrained("Xenova/whisper-tiny.en"); - const { input_features } = await processor(audio); - const { dims, data } = input_features; - expect(dims).toEqual([1, 80, 3000]); - expect(avg(data)).toBeCloseTo(-0.2813588131551941); - expect(data[0]).toBeCloseTo(0.33168578147888184); - expect(data[1]).toBeCloseTo(0.30986475944519043); - expect(data[81]).toBeCloseTo(0.10727232694625854); - expect(data[3001]).toBeCloseTo(0.2555035352706909); - }, - MAX_TEST_TIME, - ); - - it( - "ASTFeatureExtractor", - async () => { - const processor = await AutoProcessor.from_pretrained("Xenova/ast-finetuned-audioset-10-10-0.4593"); - { - // truncation - const { input_values } = await processor(audio); - expect(input_values.dims).toEqual([1, 1024, 128]); - - expect(avg(input_values.data)).toBeCloseTo(-0.04054912979309085); - expect(input_values.data[0]).toBeCloseTo(-0.5662586092948914); - expect(input_values.data[1]).toBeCloseTo(-1.0300861597061157); - expect(input_values.data[129]).toBeCloseTo(-1.084834098815918); - expect(input_values.data[1025]).toBeCloseTo(-1.1204065084457397); - } - { - // padding - const { input_values } = await processor(audio.slice(0, 1000)); - expect(input_values.dims).toEqual([1, 1024, 128]); // [1, 4, 128] -> (padded to) -> [1, 1024, 128] - - expect(avg(input_values.data)).toBeCloseTo(0.4647964835166931); - expect(input_values.data[0]).toBeCloseTo(-0.5662586092948914); - expect(input_values.data[1]).toBeCloseTo(-1.0300861597061157); - expect(input_values.data[129]).toBeCloseTo(-1.084834098815918); - - // padded values - expect(input_values.data[1025]).toBeCloseTo(0.46703237295150757); - expect(input_values.data[2049]).toBeCloseTo(0.46703237295150757); - 
expect(input_values.data[10000]).toBeCloseTo(0.46703237295150757); - } - }, - MAX_TEST_TIME, - ); - - it( - "SeamlessM4TFeatureExtractor", - async () => { - const processor = await AutoProcessor.from_pretrained("Xenova/wav2vec2-bert-CV16-en"); - { - // normal - const { input_features, attention_mask } = await processor(audio); - const { dims, data } = input_features; - expect(dims).toEqual([1, 649, 160]); - expect(attention_mask.dims).toEqual([1, 649]); - - expect(avg(data)).toBeCloseTo(-2.938903875815413e-8); - expect(data[0]).toBeCloseTo(1.1939343214035034); - expect(data[1]).toBeCloseTo(0.7874255180358887); - expect(data[160]).toBeCloseTo(-0.712975025177002); - expect(data[161]).toBeCloseTo(0.045802414417266846); - expect(data.at(-1)).toBeCloseTo(-1.3328346014022827); - - expect(sum(attention_mask.data)).toEqual(649); - } - { - // padding (pad_to_multiple_of=2) - const { input_features, attention_mask } = await processor(audio.slice(0, 10000)); - const { dims, data } = input_features; - - // [1, 61, 80] -> [1, 62, 80] -> [1, 31, 160] - expect(dims).toEqual([1, 31, 160]); - expect(attention_mask.dims).toEqual([1, 31]); - - expect(avg(data)).toBeCloseTo(0.01612919569015503); - expect(data[0]).toBeCloseTo(0.9657132029533386); - expect(data[1]).toBeCloseTo(0.12912897765636444); - expect(data[160]).toBeCloseTo(-1.2364212274551392); - expect(data[161]).toBeCloseTo(-0.9703778028488159); - expect(data.at(-1)).toBeCloseTo(1); // padding value - - expect(sum(attention_mask.data)).toEqual(30); - } - }, - MAX_TEST_TIME, - ); - - it( - "ClapFeatureExtractor", - async () => { - const processor = await AutoProcessor.from_pretrained("Xenova/clap-htsat-unfused"); - { - // truncation - // Since truncation uses a random strategy, we override - // Math.random to ensure that the test is deterministic - const originalRandom = Math.random; - Math.random = () => 0.5; - - let long_audio = new Float32Array(500000); - long_audio.set(audio); - long_audio.set(audio, long_audio.length - audio.length); - - const { input_features } = await processor(long_audio); - const { dims, data } = input_features; - expect(dims).toEqual([1, 1, 1001, 64]); - - expect(avg(data)).toBeCloseTo(-37.94569396972656); - expect(data[0]).toBeCloseTo(-53.32647705078125); - expect(data[1]).toBeCloseTo(-47.76755142211914); - expect(data[65]).toBeCloseTo(-36.32261276245117); - expect(data[1002]).toBeCloseTo(-28.0314884185791); - expect(data[10000]).toBeCloseTo(-21.905902862548828); - expect(data[60000]).toBeCloseTo(-14.877863883972168); - expect(data[64062]).toBeCloseTo(-37.9784049987793); - expect(data[64063]).toBeCloseTo(-37.73963928222656); - - // Reset Math.random - Math.random = originalRandom; - } - { - // padding - const { input_features } = await processor(audio); - const { data, dims } = input_features; - expect(dims).toEqual([1, 1, 1001, 64]); - - expect(avg(data)).toBeCloseTo(-34.99049377441406); - expect(data[0]).toBeCloseTo(-21.32573890686035); - expect(data[1]).toBeCloseTo(-26.168411254882812); - expect(data[65]).toBeCloseTo(-29.716018676757812); - expect(data[1002]).toBeCloseTo(-32.16273498535156); - expect(data[10000]).toBeCloseTo(-19.9283390045166); - - // padded values - expect(data[60000]).toBeCloseTo(-100.0); - expect(data[64062]).toBeCloseTo(-100.0); - expect(data[64063]).toBeCloseTo(-100.0); - } - }, - MAX_TEST_TIME, - ); - - it( - "WeSpeakerFeatureExtractor", - async () => { - const processor = await AutoProcessor.from_pretrained("onnx-community/wespeaker-voxceleb-resnet34-LM"); - { - // default - const audio = new 
Float32Array(16000).map((_, i) => Math.sin(i / 100)); - const { input_features } = await processor(audio); - const { dims, data } = input_features; - expect(dims).toEqual([1, 98, 80]); - - expect(avg(data)).toBeCloseTo(5.461731689138105e-8); - expect(data[0]).toBeCloseTo(-0.19300270080566406); - expect(data[1]).toBeCloseTo(-0.05825042724609375); - expect(data[78]).toBeCloseTo(0.2683420181274414); - expect(data[79]).toBeCloseTo(0.26250171661376953); - expect(data[80]).toBeCloseTo(0.19062232971191406); - expect(data.at(-2)).toBeCloseTo(-0.43694400787353516); - expect(data.at(-1)).toBeCloseTo(-0.4266204833984375); - } - - { - // pad to `min_num_frames` - const audio = new Float32Array(3).map((_, i) => Math.sin(i / 100)); - const { input_features } = await processor(audio); - const { dims, data } = input_features; - expect(dims).toEqual([1, 9, 80]); - - expect(avg(data)).toBeCloseTo(-0.0000010093053181966146); - expect(data[0]).toBeCloseTo(20.761859893798828); - expect(data[1]).toBeCloseTo(21.02924346923828); - expect(data[78]).toBeCloseTo(19.083993911743164); - expect(data[79]).toBeCloseTo(18.003454208374023); - expect(data[80]).toBeCloseTo(-2.595233917236328); - expect(data.at(-2)).toBeCloseTo(-2.385499954223633); - expect(data.at(-1)).toBeCloseTo(-2.2504329681396484); - } - }, - MAX_TEST_TIME, - ); - }); - - describe("Other processors", () => { - describe( - "FlorenceProcessor", - () => { - /** @type {import('../src/transformers.js').Florence2Processor} */ - let processor; - let images = {}; - - beforeAll(async () => { - processor = await AutoProcessor.from_pretrained(MODELS.florence2); - images = { - beetle: await load_cached_image("beetle"), - book_cover: await load_cached_image("book_cover"), - }; - }); - - describe("Prompt construction", () => { - it("Construct prompt", async () => { - const text = ""; - const prompts = processor.construct_prompts(text); - const target = ["Locate the objects with category name in the image."]; - expect(prompts).toEqual(target); - }); - - it("Construct prompts", async () => { - const texts = ["", "Locate the objects with category name in the image.", "cat"]; - const prompts = processor.construct_prompts(texts); - const target = ["Describe with a paragraph what is shown in the image.", "Locate the objects with category name in the image.", "Locate cat in the image."]; - expect(prompts).toEqual(target); - }); - }); - - describe("Post-process generation", () => { - const TESTS = [ - { - task: "", - generated_text: "A green car parked in front of a yellow building.", - target: { "": "A green car parked in front of a yellow building." }, - image: "beetle", - }, - { - task: "", - generated_text: "The image shows a green Volkswagen Beetle parked in front of a yellow building with two brown doors. The sky is a mix of blue and white, and there are a few green trees in the background.", - target: { "": "The image shows a green Volkswagen Beetle parked in front of a yellow building with two brown doors. The sky is a mix of blue and white, and there are a few green trees in the background." }, - image: "beetle", - }, - { - task: "", - generated_text: "The image shows a vintage Volkswagen Beetle car parked on a cobblestone street in front of a yellow building with two wooden doors. The car is painted in a bright turquoise color and has a white stripe running along the side. It has two doors on either side of the car, one on top of the other, and a small window on the front. The building appears to be old and dilapidated, with peeling paint and crumbling walls. 
The sky is blue and there are trees in the background.", - target: { "": "The image shows a vintage Volkswagen Beetle car parked on a cobblestone street in front of a yellow building with two wooden doors. The car is painted in a bright turquoise color and has a white stripe running along the side. It has two doors on either side of the car, one on top of the other, and a small window on the front. The building appears to be old and dilapidated, with peeling paint and crumbling walls. The sky is blue and there are trees in the background." }, - image: "beetle", - }, - { - task: "", - generated_text: "cardoorwheel", - target: { - "": { - bboxes: [ - [34.24, 160.08, 597.44, 371.76], - [456.0, 97.68, 580.16, 261.84], - [450.88, 276.72, 554.56, 370.8], - [95.68, 280.56, 198.72, 371.28], - ], - labels: ["car", "door", "wheel", "wheel"], - }, - }, - image: "beetle", - }, - { - task: "", - generated_text: "turquoise Volkswagen Beetlewheel", - target: { - "": { - bboxes: [ - [33.6, 160.08, 596.8, 371.76], - [450.88, 276.72, 553.28, 370.8], - [95.04, 280.56, 197.44, 371.28], - ], - labels: ["turquoise Volkswagen Beetle", "wheel", "wheel"], - }, - }, - image: "beetle", - }, - { - task: "", - generated_text: "", - target: { - "": { - bboxes: [ - [33.6, 160.08, 596.8, 371.76], - [455.36, 97.68, 579.52, 261.84], - [450.88, 276.72, 553.28, 370.8], - [95.04, 280.56, 198.08, 371.28], - [226.88, 88.56, 332.48, 164.4], - [65.6, 266.64, 86.72, 295.92], - [271.68, 241.68, 302.4, 246.96], - [408.0, 308.4, 413.76, 320.88], - ], - labels: ["", "", "", "", "", "", "", ""], - }, - }, - image: "beetle", - }, - { - task: "", - text_input: "A green car parked in front of a yellow building.", - generated_text: "A green cara yellow building", - target: { - "": { - bboxes: [ - [34.88, 158.64, 583.36, 374.64], - [0.32, 4.08, 639.04, 305.04], - ], - labels: ["A green car", "a yellow building"], - }, - }, - image: "beetle", - }, - // { - // task: "", - // text_input: "a green car", - // generated_text: "", - // target: { - // '': { - // polygons: [[[[178.88, 181.68, 180.8, 180.72, 182.72, 180.72, 187.84, 177.84, 189.76, 177.84, 192.96, 175.92, 194.88, 175.92, 198.08, 174, 200.64, 173.04, 203.84, 172.08, 207.04, 170.64, 209.6, 169.68, 214.08, 168.72, 217.92, 167.76, 221.76, 166.8, 226.24, 165.84, 230.72, 164.88, 237.12, 163.92, 244.16, 162.96, 253.12, 162, 265.28, 161.04, 311.36, 161.04, 329.28, 162, 338.24, 162.96, 345.28, 163.92, 350.4, 164.88, 354.24, 165.84, 358.72, 166.8, 362.56, 167.76, 366.4, 168.72, 370.24, 169.68, 373.44, 170.64, 375.36, 172.08, 377.28, 174, 379.2, 176.88, 380.48, 179.76, 382.4, 181.68, 384.32, 185.04, 386.24, 187.92, 387.52, 190.8, 389.44, 192.72, 390.08, 196.08, 392, 198.96, 394.56, 201.84, 396.48, 204.72, 398.4, 208.08, 403.52, 212.88, 406.08, 213.84, 409.28, 216.72, 412.48, 220.08, 431.68, 220.08, 432.32, 221.04, 442.56, 222, 456.64, 222, 465.6, 222.96, 472.64, 223.92, 478.4, 224.88, 484.8, 225.84, 489.92, 226.8, 493.76, 227.76, 497.6, 228.72, 501.44, 229.68, 504.64, 231.12, 507.84, 232.08, 510.4, 233.04, 513.6, 234, 516.8, 235.92, 518.72, 235.92, 523.84, 238.8, 525.76, 238.8, 527.68, 239.76, 529.6, 241.68, 532.8, 242.64, 536, 245.04, 538.56, 247.92, 541.76, 249.84, 545.6, 251.76, 548.8, 252.72, 550.72, 252.72, 553.92, 253.68, 556.48, 255.6, 558.4, 255.6, 564.8, 258.96, 566.72, 260.88, 568.64, 260.88, 570.56, 261.84, 572.48, 263.76, 573.76, 265.68, 574.4, 268.56, 574.4, 271.92, 573.76, 272.88, 572.48, 275.76, 572.48, 279.6, 573.76, 285.84, 574.4, 286.8, 575.68, 289.68, 576.32, 292.56, 577.6, 
298.8, 577.6, 301.68, 576.32, 302.64, 575.68, 310.8, 575.68, 312.72, 576.32, 313.68, 577.6, 316.56, 577.6, 320.88, 574.4, 321.84, 568.64, 322.8, 559.68, 322.8, 553.92, 323.76, 552.64, 332.88, 552, 336.72, 550.72, 339.6, 550.08, 342.96, 548.8, 344.88, 546.88, 346.8, 545.6, 349.68, 543.68, 352.56, 541.76, 355.92, 534.72, 362.64, 531.52, 364.56, 525.76, 367.92, 522.56, 368.88, 518.72, 369.84, 495.68, 369.84, 489.92, 368.88, 486.72, 367.92, 483.52, 366.96, 479.68, 364.56, 476.48, 362.64, 472.64, 359.76, 465.6, 352.56, 463.68, 349.68, 461.76, 346.8, 460.48, 344.88, 460.48, 342.96, 458.56, 339.6, 457.92, 336.72, 457.92, 334.8, 456.64, 332.88, 454.72, 330.96, 452.8, 331.92, 448.32, 336.72, 446.4, 337.68, 426.56, 336.72, 424.64, 336.72, 423.36, 337.68, 420.8, 338.64, 414.4, 339.6, 412.48, 339.6, 411.2, 338.64, 380.48, 337.68, 217.28, 337.68, 216, 338.64, 210.88, 339.6, 207.04, 339.6, 203.84, 338.64, 201.92, 337.68, 200, 335.76, 198.08, 334.8, 194.88, 334.8, 192.96, 336.72, 191.68, 338.64, 191.68, 340.56, 191.04, 342.96, 189.12, 344.88, 187.84, 347.76, 185.92, 349.68, 184.64, 352.56, 182.72, 355.92, 176.96, 361.68, 173.76, 363.6, 170.56, 365.52, 166.72, 367.92, 163.52, 368.88, 160.96, 369.84, 153.92, 370.8, 131.52, 370.8, 127.68, 369.84, 124.48, 368.88, 118.72, 365.52, 115.52, 363.6, 111.68, 360.72, 106.56, 355.92, 104.64, 352.56, 103.36, 349.68, 101.44, 347.76, 100.8, 345.84, 99.52, 342.96, 99.52, 339.6, 98.88, 337.68, 95.68, 334.8, 93.76, 333.84, 86.72, 333.84, 80.32, 334.8, 79.68, 335.76, 74.56, 336.72, 66.24, 336.72, 63.68, 334.8, 53.44, 334.8, 50.24, 333.84, 48.32, 331.92, 48.32, 328.56, 50.24, 326.64, 51.52, 324.72, 51.52, 322.8, 44.48, 321.84, 40.64, 320.88, 38.72, 319.92, 37.44, 317.52, 36.16, 313.68, 36.16, 306.96, 38.72, 304.56, 42.56, 303.6, 46.4, 302.64, 55.36, 301.68, 65.6, 301.68, 67.52, 300.72, 69.44, 298.8, 70.72, 296.88, 70.72, 292.56, 69.44, 291.6, 68.8, 288.72, 67.52, 284.88, 67.52, 276.72, 68.8, 273.84, 69.44, 271.92, 72.64, 268.56, 74.56, 267.6, 77.76, 266.64, 79.68, 266.64, 81.6, 264.72, 80.32, 260.88, 81.6, 258.96, 83.52, 256.56, 88.64, 256.56, 90.56, 255.6, 92.48, 253.68, 92.48, 252.72, 97.6, 246.96, 114.88, 229.68, 117.44, 226.8, 122.56, 222.96, 125.76, 221.04, 126.4, 221.04, 129.6, 219.12, 133.44, 215.76, 138.56, 211.92, 143.68, 208.08, 149.44, 201.84, 153.92, 198.96, 154.56, 198.96, 157.76, 197.04, 162.88, 192.72, 168.64, 186.96, 171.84, 185.04, 176.96, 183.12, 178.88, 180.72]]]], - // labels: [''], - // } - // }, - // image: 'beetle', - // }, - // { - // task: "", - // text_input: "", - // generated_text: "", - // target: { - // '': { - // polygons: [[[[470.08, 288.24, 473.92, 285.36, 477.12, 283.44, 479.04, 282.48, 480.96, 282.48, 484.16, 280.56, 486.72, 279.6, 489.92, 278.64, 495.04, 277.68, 512.32, 277.68, 514.88, 278.64, 518.08, 279.6, 521.28, 281.52, 523.2, 281.52, 525.12, 283.44, 528.32, 284.4, 530.88, 286.32, 534.08, 288.24, 543.04, 297.36, 544.96, 300.24, 546.88, 303.12, 550.08, 309.36, 551.36, 312.24, 552, 315.12, 553.28, 319.44, 553.28, 332.4, 552, 337.2, 551.36, 340.08, 550.08, 343.44, 548.16, 347.28, 546.24, 350.16, 544.32, 353.04, 541.12, 357.36, 537.28, 361.2, 532.16, 365.04, 528.96, 366.96, 527.04, 367.92, 523.84, 368.88, 521.28, 369.84, 516.16, 371.28, 500.8, 371.28, 491.84, 369.84, 488, 368.88, 484.8, 367.92, 479.04, 365.04, 475.84, 363.12, 472, 360.24, 464.96, 353.04, 463.04, 350.16, 461.12, 347.28, 459.84, 345.36, 459.84, 343.44, 457.92, 340.08, 456.64, 337.2, 456, 334.32, 454.72, 330.48, 454.72, 316.08, 456, 311.28, 456.64, 307.44, 457.92, 304.08, 
459.84, 301.2, 459.84, 299.28, 461.12, 297.36, 463.04, 294.48]]]], - // labels: [''], - // } - // }, - // image: 'beetle', - // }, - // { - // task: "", - // text_input: "a green car", - // generated_text: "a green car", - // target: { - // '': { - // bboxes: [[34.24, 158.64, 582.72, 374.16]], - // bboxes_labels: ['a green car'], - // polygons: [], - // polygons_labels: [], - // } - // }, - // image: 'beetle', - // }, - { - task: "", - text_input: "", - generated_text: "car", - target: { "": "car" }, - image: "beetle", - }, - { - task: "", - text_input: "", - generated_text: "turquoise Volkswagen Beetle", - target: { "": "turquoise Volkswagen Beetle" }, - image: "beetle", - }, - { - task: "", - generated_text: "CUDAFOR ENGINEERSAn Introduction to High-PerformanceParallel ComputingDUANE STORTIMETE YURTOGLU", - target: { "": "CUDAFOR ENGINEERSAn Introduction to High-PerformanceParallel ComputingDUANE STORTIMETE YURTOGLU" }, - image: "book_cover", - }, - { - task: "", - generated_text: "CUDAFOR ENGINEERSAn Introduction to High-PerformanceParallel ComputingDUANE STORTIMETE YURTOGLU", - target: { - "": { - quad_boxes: [ - [167.0435028076172, 50.25, 375.7974853515625, 50.25, 375.7974853515625, 114.75, 167.0435028076172, 114.75], - [144.8784942626953, 120.75, 375.7974853515625, 120.75, 375.7974853515625, 149.25, 144.8784942626953, 149.25], - [115.86249542236328, 165.25, 376.6034851074219, 166.25, 376.6034851074219, 184.25, 115.86249542236328, 183.25], - [239.9864959716797, 184.25, 376.6034851074219, 186.25, 376.6034851074219, 204.25, 239.9864959716797, 202.25], - [266.1814880371094, 441.25, 376.6034851074219, 441.25, 376.6034851074219, 456.25, 266.1814880371094, 456.25], - [252.0764923095703, 460.25, 376.6034851074219, 460.25, 376.6034851074219, 475.25, 252.0764923095703, 475.25], - ], - - // NOTE: Python version has a bug here, it should be "CUDA" instead of "CUDA" - labels: [/* 'CUDA' */ "CUDA", "FOR ENGINEERS", "An Introduction to High-Performance", "Parallel Computing", "DUANE STORTI", "METE YURTOGLU"], - }, - }, - image: "book_cover", - }, - ]; - - for (const { task, generated_text, target, image } of TESTS) { - it(task, () => { - const result = processor.post_process_generation(generated_text, task, images[image].size); - expect(result).toBeCloseToNested(target, 4); - }); - } - }); - }, - MAX_TEST_TIME, - ); - - describe( - "Qwen2VLProcessor", - () => { - /** @type {import('../src/transformers.js').Qwen2VLProcessor} */ - let processor; - let images = {}; - - beforeAll(async () => { - processor = await AutoProcessor.from_pretrained(MODELS.qwen2_vl); - images = { - white_image: await load_cached_image("white_image"), - }; - }); - - it("Image and text", async () => { - const conversation = [ - { - role: "user", - content: [{ type: "image" }, { type: "text", text: "Describe this image." 
}], - }, - ]; - - const text = processor.apply_chat_template(conversation, { - add_generation_prompt: true, - }); - const { input_ids, attention_mask, pixel_values, image_grid_thw } = await processor(text, images.white_image); - - expect(input_ids.dims).toEqual([1, 89]); - expect(attention_mask.dims).toEqual([1, 89]); - expect(pixel_values.dims).toEqual([256, 1176]); - expect(image_grid_thw.dims).toEqual([1, 3]); - }); - }, - MAX_TEST_TIME, - ); - - describe( - "PaliGemmaProcessor", - () => { - /** @type {import('../src/transformers.js').PaliGemmaProcessor} */ - let processor; - let images = {}; - - beforeAll(async () => { - processor = await AutoProcessor.from_pretrained(MODELS.paligemma); - images = { - white_image: await load_cached_image("white_image"), - }; - }); - - it("Image-only (default text)", async () => { - const { input_ids, pixel_values } = await processor(images.white_image); - expect(input_ids.dims).toEqual([1, 258]); - expect(pixel_values.dims).toEqual([1, 3, 224, 224]); - }); - - it("Single image & text", async () => { - const { input_ids, pixel_values } = await processor(images.white_image, "What is on the flower?"); - expect(input_ids.dims).toEqual([1, 264]); - expect(pixel_values.dims).toEqual([1, 3, 224, 224]); - }); - - it("Multiple images & text", async () => { - const { input_ids, pixel_values } = await processor([images.white_image, images.white_image], "Describe the images."); - expect(input_ids.dims).toEqual([1, 518]); - expect(pixel_values.dims).toEqual([2, 3, 224, 224]); - }); - }, - MAX_TEST_TIME, - ); - }); -}); +await collect_and_execute_tests("Processors", "processor"); diff --git a/tests/test_utils.js b/tests/test_utils.js index 9928bf75b..c42c5f201 100644 --- a/tests/test_utils.js +++ b/tests/test_utils.js @@ -1,3 +1,7 @@ +import fs from "fs"; +import path from "path"; +import { fileURLToPath } from "url"; + export async function loadAudio(url) { // NOTE: Since the Web Audio API is not available in Node.js, we will need to use the `wavefile` library to obtain the raw audio data. // For more information, see: https://huggingface.co/docs/transformers.js/guides/node-audio-processing @@ -63,3 +67,67 @@ export function compare(val1, val2, tol = 0.1) { } } } + +const __filename = fileURLToPath(import.meta.url); +const __dirname = path.dirname(__filename); +const models_dir = path.join(__dirname, "models"); +const pipelines_dir = path.join(__dirname, "pipelines"); + +/** + * Helper function to collect all unit tests, which can be found in files + * of the form: `tests/models/<model_type>/test_<filename>_<model_type>.js`. + * @param {string} filename + * @returns {Promise<[string, Function][]>} + */ +export async function collect_tests(filename) { + const model_types = fs.readdirSync(models_dir); + const all_tests = []; + for (const model_type of model_types) { + const dir = path.join(models_dir, model_type); + + if (!fs.existsSync(dir) || !fs.statSync(dir).isDirectory()) { + continue; + } + + const file = path.join(dir, `test_${filename}_${model_type}.js`); + if (!fs.existsSync(file)) { + continue; + } + + const items = await import(file); + all_tests.push([model_type, items]); + } + return all_tests; +} + +/** + * Helper function to collect and execute all unit tests, which can be found in files + * of the form: `tests/models/<model_type>/test_<filename>_<model_type>.js`. + * @param {string} title The title of the test + * @param {string} filename The name of the test + */ +export async function collect_and_execute_tests(title, filename) { + // 1. Collect all tests + const all_tests = await collect_tests(filename); + + // 2.
Execute tests + describe(title, () => all_tests.forEach(([name, test]) => describe(name, test.default))); +} + +/** + * Helper function to collect and execute all pipeline tests, which can be found in files + * of the form: `tests/pipelines/test_pipelines_<pipeline_id>.js`. + */ +export async function collect_and_execute_pipeline_tests(title) { + // 1. Collect all tests + const all_tests = []; + const pipeline_types = fs.readdirSync(pipelines_dir); + for (const filename of pipeline_types) { + const file = path.join(pipelines_dir, filename); + const items = await import(file); + all_tests.push(items); + } + + // 2. Execute tests + describe(title, () => all_tests.forEach((test) => test.default())); +} diff --git a/tests/tiny_random.test.js b/tests/tiny_random.test.js deleted file mode 100644 index d80699a11..000000000 --- a/tests/tiny_random.test.js +++ /dev/null @@ -1,841 +0,0 @@ -import { - // Pipelines - pipeline, - FillMaskPipeline, - TextClassificationPipeline, - TextGenerationPipeline, - TranslationPipeline, - ImageClassificationPipeline, - ZeroShotImageClassificationPipeline, - TokenClassificationPipeline, - QuestionAnsweringPipeline, - DocumentQuestionAnsweringPipeline, - - // Other - RawImage, -} from "../src/transformers.js"; - -import { init, MAX_MODEL_LOAD_TIME, MAX_TEST_EXECUTION_TIME, MAX_MODEL_DISPOSE_TIME, DEFAULT_MODEL_OPTIONS } from "./init.js"; -import { compare } from "./test_utils.js"; - -init(); - -describe("Tiny random pipelines", () => { - describe("fill-mask", () => { - const model_id = "hf-internal-testing/tiny-random-BertForMaskedLM"; - - /** @type {FillMaskPipeline} */ - let pipe; - beforeAll(async () => { - pipe = await pipeline("fill-mask", model_id, DEFAULT_MODEL_OPTIONS); - }, MAX_MODEL_LOAD_TIME); - - describe("batch_size=1", () => { - it( - "default (top_k=5)", - async () => { - const output = await pipe("a [MASK] c"); - const target = [ - { score: 0.0013377574505284429, token: 854, token_str: "##ο", sequence: "aο c" }, - { score: 0.001248967950232327, token: 962, token_str: "##ち", sequence: "aち c" }, - { score: 0.0012304208939895034, token: 933, token_str: "##ع", sequence: "aع c" }, - { score: 0.0012301815440878272, token: 313, token_str: "ფ", sequence: "a ფ c" }, - { score: 0.001222139224410057, token: 624, token_str: "未", sequence: "a 未 c" }, - ]; - compare(output, target, 1e-5); - }, - MAX_TEST_EXECUTION_TIME, - ); - it( - "custom (top_k=2)", - async () => { - const output = await pipe("a [MASK] c", { top_k: 2 }); - const target = [ - { score: 0.0013377574505284429, token: 854, token_str: "##ο", sequence: "aο c" }, - { score: 0.001248967950232327, token: 962, token_str: "##ち", sequence: "aち c" }, - ]; - compare(output, target, 1e-5); - }, - MAX_TEST_EXECUTION_TIME, - ); - }); - - describe("batch_size>1", () => { - it( - "default (top_k=5)", - async () => { - const output = await pipe(["a [MASK] c", "a b [MASK] c"]); - const target = [ - [ - { score: 0.0013377574505284429, token: 854, token_str: "##ο", sequence: "aο c" }, - { score: 0.001248967950232327, token: 962, token_str: "##ち", sequence: "aち c" }, - { score: 0.0012304208939895034, token: 933, token_str: "##ع", sequence: "aع c" }, - { score: 0.0012301815440878272, token: 313, token_str: "ფ", sequence: "a ფ c" }, - { score: 0.001222139224410057, token: 624, token_str: "未", sequence: "a 未 c" }, - ], - [ - { score: 0.0013287801994010806, token: 962, token_str: "##ち", sequence: "a bち c" }, - { score: 0.0012486606137827039, token: 823, token_str: "##ن", sequence: "a bن c" }, - { score: 0.0012320734094828367, token: 1032,
token_str: "##ც", sequence: "a bც c" }, - { score: 0.0012295148335397243, token: 854, token_str: "##ο", sequence: "a bο c" }, - { score: 0.0012277684872969985, token: 624, token_str: "未", sequence: "a b 未 c" }, - ], - ]; - compare(output, target, 1e-5); - }, - MAX_TEST_EXECUTION_TIME, - ); - it( - "custom (top_k=2)", - async () => { - const output = await pipe(["a [MASK] c", "a b [MASK] c"], { top_k: 2 }); - const target = [ - [ - { score: 0.0013377574505284429, token: 854, token_str: "##ο", sequence: "aο c" }, - { score: 0.001248967950232327, token: 962, token_str: "##ち", sequence: "aち c" }, - ], - [ - { score: 0.0013287801994010806, token: 962, token_str: "##ち", sequence: "a bち c" }, - { score: 0.0012486606137827039, token: 823, token_str: "##ن", sequence: "a bن c" }, - ], - ]; - compare(output, target, 1e-5); - }, - MAX_TEST_EXECUTION_TIME, - ); - }); - - afterAll(async () => { - await pipe?.dispose(); - }, MAX_MODEL_DISPOSE_TIME); - }); - - describe("text-classification", () => { - const model_id = "hf-internal-testing/tiny-random-BertForSequenceClassification"; - - /** @type {TextClassificationPipeline} */ - let pipe; - beforeAll(async () => { - pipe = await pipeline("text-classification", model_id, DEFAULT_MODEL_OPTIONS); - }, MAX_MODEL_LOAD_TIME); - - describe("batch_size=1", () => { - it( - "default (top_k=1)", - async () => { - const output = await pipe("a"); - const target = [{ label: "LABEL_0", score: 0.5076976418495178 }]; - compare(output, target, 1e-5); - }, - MAX_TEST_EXECUTION_TIME, - ); - it( - "custom (top_k=2)", - async () => { - const output = await pipe("a", { top_k: 2 }); - const target = [ - { label: "LABEL_0", score: 0.5076976418495178 }, - { label: "LABEL_1", score: 0.49230238795280457 }, - ]; - compare(output, target, 1e-5); - }, - MAX_TEST_EXECUTION_TIME, - ); - }); - - describe("batch_size>1", () => { - it( - "default (top_k=1)", - async () => { - const output = await pipe(["a", "b c"]); - const target = [ - { label: "LABEL_0", score: 0.5076976418495178 }, - { label: "LABEL_0", score: 0.5077522993087769 }, - ]; - compare(output, target, 1e-5); - }, - MAX_TEST_EXECUTION_TIME, - ); - it( - "custom (top_k=2)", - async () => { - const output = await pipe(["a", "b c"], { top_k: 2 }); - const target = [ - [ - { label: "LABEL_0", score: 0.5076976418495178 }, - { label: "LABEL_1", score: 0.49230238795280457 }, - ], - [ - { label: "LABEL_0", score: 0.5077522993087769 }, - { label: "LABEL_1", score: 0.49224773049354553 }, - ], - ]; - compare(output, target, 1e-5); - }, - MAX_TEST_EXECUTION_TIME, - ); - - it( - "multi_label_classification", - async () => { - const problem_type = pipe.model.config.problem_type; - pipe.model.config.problem_type = "multi_label_classification"; - - const output = await pipe(["a", "b c"], { top_k: 2 }); - const target = [ - [ - { label: "LABEL_0", score: 0.5001373887062073 }, - { label: "LABEL_1", score: 0.49243971705436707 }, - ], - [ - { label: "LABEL_0", score: 0.5001326203346252 }, - { label: "LABEL_1", score: 0.492380291223526 }, - ], - ]; - compare(output, target, 1e-5); - - // Reset problem type - pipe.model.config.problem_type = problem_type; - }, - MAX_TEST_EXECUTION_TIME, - ); - }); - - afterAll(async () => { - await pipe?.dispose(); - }, MAX_MODEL_DISPOSE_TIME); - }); - - describe("token-classification", () => { - const model_id = "hf-internal-testing/tiny-random-BertForTokenClassification"; - - /** @type {TokenClassificationPipeline} */ - let pipe; - beforeAll(async () => { - pipe = await pipeline("token-classification", model_id, 
DEFAULT_MODEL_OPTIONS); - }, MAX_MODEL_LOAD_TIME); - - describe("batch_size=1", () => { - it( - "default", - async () => { - const output = await pipe("1 2 3"); - - // TODO: Add start/end to target - const target = [ - { - entity: "LABEL_0", - score: 0.5292708, - index: 1, - word: "1", - // 'start': 0, 'end': 1 - }, - { - entity: "LABEL_0", - score: 0.5353687, - index: 2, - word: "2", - // 'start': 2, 'end': 3 - }, - { - entity: "LABEL_1", - score: 0.51381934, - index: 3, - word: "3", - // 'start': 4, 'end': 5 - }, - ]; - compare(output, target, 1e-5); - }, - MAX_TEST_EXECUTION_TIME, - ); - it( - "custom (ignore_labels set)", - async () => { - const output = await pipe("1 2 3", { ignore_labels: ["LABEL_0"] }); - const target = [ - { - entity: "LABEL_1", - score: 0.51381934, - index: 3, - word: "3", - // 'start': 4, 'end': 5 - }, - ]; - compare(output, target, 1e-5); - }, - MAX_TEST_EXECUTION_TIME, - ); - }); - - describe("batch_size>1", () => { - it( - "default", - async () => { - const output = await pipe(["1 2 3", "4 5"]); - const target = [ - [ - { - entity: "LABEL_0", - score: 0.5292708, - index: 1, - word: "1", - // 'start': 0, 'end': 1 - }, - { - entity: "LABEL_0", - score: 0.5353687, - index: 2, - word: "2", - // 'start': 2, 'end': 3 - }, - { - entity: "LABEL_1", - score: 0.51381934, - index: 3, - word: "3", - // 'start': 4, 'end': 5 - }, - ], - [ - { - entity: "LABEL_0", - score: 0.5432807, - index: 1, - word: "4", - // 'start': 0, 'end': 1 - }, - { - entity: "LABEL_1", - score: 0.5007693, - index: 2, - word: "5", - // 'start': 2, 'end': 3 - }, - ], - ]; - compare(output, target, 1e-5); - }, - MAX_TEST_EXECUTION_TIME, - ); - it( - "custom (ignore_labels set)", - async () => { - const output = await pipe(["1 2 3", "4 5"], { ignore_labels: ["LABEL_0"] }); - const target = [ - [ - { - entity: "LABEL_1", - score: 0.51381934, - index: 3, - word: "3", - // 'start': 4, 'end': 5 - }, - ], - [ - { - entity: "LABEL_1", - score: 0.5007693, - index: 2, - word: "5", - // 'start': 2, 'end': 3 - }, - ], - ]; - compare(output, target, 1e-5); - }, - MAX_TEST_EXECUTION_TIME, - ); - }); - - afterAll(async () => { - await pipe?.dispose(); - }, MAX_MODEL_DISPOSE_TIME); - }); - - describe("question-answering", () => { - const model_id = "hf-internal-testing/tiny-random-BertForQuestionAnswering"; - - /** @type {QuestionAnsweringPipeline} */ - let pipe; - beforeAll(async () => { - pipe = await pipeline("question-answering", model_id, DEFAULT_MODEL_OPTIONS); - }, MAX_MODEL_LOAD_TIME); - - describe("batch_size=1", () => { - it( - "default (top_k=1)", - async () => { - const output = await pipe("a", "b c"); - const target = { score: 0.11395696550607681, /* start: 0, end: 1, */ answer: "b" }; - compare(output, target, 1e-5); - }, - MAX_TEST_EXECUTION_TIME, - ); - it( - "custom (top_k=3)", - async () => { - const output = await pipe("a", "b c", { top_k: 3 }); - const target = [ - { score: 0.11395696550607681, /* start: 0, end: 1, */ answer: "b" }, - { score: 0.11300431191921234, /* start: 2, end: 3, */ answer: "c" }, - { score: 0.10732574015855789, /* start: 0, end: 3, */ answer: "b c" }, - ]; - compare(output, target, 1e-5); - }, - MAX_TEST_EXECUTION_TIME, - ); - }); - - afterAll(async () => { - await pipe?.dispose(); - }, MAX_MODEL_DISPOSE_TIME); - }); - - describe("image-classification", () => { - const model_id = "hf-internal-testing/tiny-random-vit"; - const urls = ["https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/white-image.png", 
"https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/blue-image.png"]; - - /** @type {ImageClassificationPipeline} */ - let pipe; - beforeAll(async () => { - pipe = await pipeline("image-classification", model_id, DEFAULT_MODEL_OPTIONS); - }, MAX_MODEL_LOAD_TIME); - - describe("batch_size=1", () => { - it( - "default (top_k=5)", - async () => { - const output = await pipe(urls[0]); - const target = [ - { label: "LABEL_1", score: 0.5020533800125122 }, - { label: "LABEL_0", score: 0.4979466497898102 }, - ]; - compare(output, target, 1e-5); - }, - MAX_TEST_EXECUTION_TIME, - ); - it( - "custom (top_k=1)", - async () => { - const output = await pipe(urls[0], { top_k: 1 }); - const target = [{ label: "LABEL_1", score: 0.5020533800125122 }]; - compare(output, target, 1e-5); - }, - MAX_TEST_EXECUTION_TIME, - ); - }); - - describe("batch_size>1", () => { - it( - "default (top_k=5)", - async () => { - const output = await pipe(urls); - const target = [ - [ - { label: "LABEL_1", score: 0.5020533800125122 }, - { label: "LABEL_0", score: 0.4979466497898102 }, - ], - [ - { label: "LABEL_1", score: 0.519227921962738 }, - { label: "LABEL_0", score: 0.4807720482349396 }, - ], - ]; - compare(output, target, 1e-5); - }, - MAX_TEST_EXECUTION_TIME, - ); - it( - "custom (top_k=1)", - async () => { - const output = await pipe(urls, { top_k: 1 }); - const target = [[{ label: "LABEL_1", score: 0.5020533800125122 }], [{ label: "LABEL_1", score: 0.519227921962738 }]]; - compare(output, target, 1e-5); - }, - MAX_TEST_EXECUTION_TIME, - ); - }); - - afterAll(async () => { - await pipe?.dispose(); - }, MAX_MODEL_DISPOSE_TIME); - }); - - describe("zero-shot-image-classification", () => { - const model_id = "hf-internal-testing/tiny-random-GroupViTModel"; - - // Example adapted from https://huggingface.co/docs/transformers/en/model_doc/groupvit - const urls = ["https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/white-image.png", "https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/blue-image.png"]; - const labels = ["cat", "dog"]; - const hypothesis_template = "a photo of a {}"; - - /** @type {ZeroShotImageClassificationPipeline} */ - let pipe; - beforeAll(async () => { - pipe = await pipeline("zero-shot-image-classification", model_id, DEFAULT_MODEL_OPTIONS); - }, MAX_MODEL_LOAD_TIME); - - describe("batch_size=1", () => { - it( - "default", - async () => { - const output = await pipe(urls[0], labels); - const target = [ - { score: 0.5990662574768066, label: "cat" }, - { score: 0.40093377232551575, label: "dog" }, - ]; - compare(output, target, 1e-5); - }, - MAX_TEST_EXECUTION_TIME, - ); - it( - "custom (w/ hypothesis_template)", - async () => { - const output = await pipe(urls[0], labels, { hypothesis_template }); - const target = [ - { score: 0.5527022480964661, label: "cat" }, - { score: 0.44729775190353394, label: "dog" }, - ]; - compare(output, target, 1e-5); - }, - MAX_TEST_EXECUTION_TIME, - ); - }); - - describe("batch_size>1", () => { - it( - "default", - async () => { - const output = await pipe(urls, labels); - const target = [ - [ - { score: 0.5990662574768066, label: "cat" }, - { score: 0.40093377232551575, label: "dog" }, - ], - [ - { score: 0.5006340146064758, label: "dog" }, - { score: 0.49936598539352417, label: "cat" }, - ], - ]; - compare(output, target, 1e-5); - }, - MAX_TEST_EXECUTION_TIME, - ); - it( - "custom (w/ hypothesis_template)", - async () => { - const output = await pipe(urls, labels, { hypothesis_template }); - const target = [ 
- [ - { score: 0.5527022480964661, label: "cat" }, - { score: 0.44729775190353394, label: "dog" }, - ], - [ - { score: 0.5395973324775696, label: "cat" }, - { score: 0.46040263772010803, label: "dog" }, - ], - ]; - compare(output, target, 1e-5); - }, - MAX_TEST_EXECUTION_TIME, - ); - }); - - afterAll(async () => { - await pipe?.dispose(); - }, MAX_MODEL_DISPOSE_TIME); - }); - - describe("audio-classification", () => { - const model_id = "hf-internal-testing/tiny-random-unispeech"; - const audios = [new Float32Array(16000).fill(0), Float32Array.from({ length: 16000 }, (_, i) => i)]; - - /** @type {ImageClassificationPipeline} */ - let pipe; - beforeAll(async () => { - pipe = await pipeline("audio-classification", model_id, DEFAULT_MODEL_OPTIONS); - }, MAX_MODEL_LOAD_TIME); - - describe("batch_size=1", () => { - it( - "default (top_k=5)", - async () => { - const output = await pipe(audios[0]); - const target = [ - { score: 0.5043687224388123, label: "LABEL_0" }, - { score: 0.4956313371658325, label: "LABEL_1" }, - ]; - compare(output, target, 1e-5); - }, - MAX_TEST_EXECUTION_TIME, - ); - it( - "custom (top_k=1)", - async () => { - const output = await pipe(audios[0], { top_k: 1 }); - const target = [{ score: 0.5043687224388123, label: "LABEL_0" }]; - compare(output, target, 1e-5); - }, - MAX_TEST_EXECUTION_TIME, - ); - }); - - describe("batch_size>1", () => { - it( - "default (top_k=5)", - async () => { - const output = await pipe(audios); - const target = [ - [ - { score: 0.5043687224388123, label: "LABEL_0" }, - { score: 0.4956313371658325, label: "LABEL_1" }, - ], - [ - { score: 0.5187293887138367, label: "LABEL_0" }, - { score: 0.4812707006931305, label: "LABEL_1" }, - ], - ]; - compare(output, target, 1e-5); - }, - MAX_TEST_EXECUTION_TIME, - ); - it( - "custom (top_k=1)", - async () => { - const output = await pipe(audios, { top_k: 1 }); - const target = [[{ score: 0.5043687224388123, label: "LABEL_0" }], [{ score: 0.5187293887138367, label: "LABEL_0" }]]; - compare(output, target, 1e-5); - }, - MAX_TEST_EXECUTION_TIME, - ); - }); - - afterAll(async () => { - await pipe?.dispose(); - }, MAX_MODEL_DISPOSE_TIME); - }); - - describe("text-generation", () => { - const model_id = "hf-internal-testing/tiny-random-LlamaForCausalLM"; - - /** @type {TextGenerationPipeline} */ - let pipe; - beforeAll(async () => { - pipe = await pipeline("text-generation", model_id, DEFAULT_MODEL_OPTIONS); - }, MAX_MODEL_LOAD_TIME); - - describe("batch_size=1", () => { - const text_input = "hello"; - const generated_text_target = "erdingsAndroid Load"; - const text_target = [{ generated_text: text_input + generated_text_target }]; - const new_text_target = [{ generated_text: generated_text_target }]; - - const chat_input = [ - { role: "system", content: "a" }, - { role: "user", content: "b" }, - ]; - const chat_target = [ - { - generated_text: [ - { role: "system", content: "a" }, - { role: "user", content: "b" }, - { role: "assistant", content: " Southern abund Load" }, - ], - }, - ]; - - it( - "text input (single)", - async () => { - const output = await pipe(text_input, { max_new_tokens: 3 }); - compare(output, text_target); - }, - MAX_TEST_EXECUTION_TIME, - ); - it( - "text input (list)", - async () => { - const output = await pipe([text_input], { max_new_tokens: 3 }); - compare(output, [text_target]); - }, - MAX_TEST_EXECUTION_TIME, - ); - - it( - "text input (single) - return_full_text=false", - async () => { - const output = await pipe(text_input, { max_new_tokens: 3, return_full_text: false }); - 
compare(output, new_text_target); - }, - MAX_TEST_EXECUTION_TIME, - ); - it( - "text input (list) - return_full_text=false", - async () => { - const output = await pipe([text_input], { max_new_tokens: 3, return_full_text: false }); - compare(output, [new_text_target]); - }, - MAX_TEST_EXECUTION_TIME, - ); - - it( - "chat input (single)", - async () => { - const output = await pipe(chat_input, { max_new_tokens: 3 }); - compare(output, chat_target); - }, - MAX_TEST_EXECUTION_TIME, - ); - it( - "chat input (list)", - async () => { - const output = await pipe([chat_input], { max_new_tokens: 3 }); - compare(output, [chat_target]); - }, - MAX_TEST_EXECUTION_TIME, - ); - }); - - // TODO: Fix batch_size>1 - // describe('batch_size>1', () => { - // it('default', async () => { - // const output = await pipe(['hello', 'hello world']); - // const target = [ - // [{generated_text: 'helloerdingsAndroid Load'}], - // [{generated_text: 'hello world zerosMillнал'}], - // ]; - // compare(output, target); - // }, MAX_TEST_EXECUTION_TIME); - // }); - - afterAll(async () => { - await pipe?.dispose(); - }, MAX_MODEL_DISPOSE_TIME); - }); - - describe("translation", () => { - const model_id = "Xenova/tiny-random-M2M100ForConditionalGeneration"; - - /** @type {TranslationPipeline} */ - let pipe; - beforeAll(async () => { - pipe = await pipeline("translation", model_id, DEFAULT_MODEL_OPTIONS); - }, MAX_MODEL_LOAD_TIME); - - describe("batch_size=1", () => { - it( - "default", - async () => { - const text = "जीवन एक चॉकलेट बॉक्स की तरह है।"; - const output = await pipe(text, { - src_lang: "hi", - tgt_lang: "fr", - max_new_tokens: 5, - }); - const target = [{ translation_text: "Slovenska төсли төсли төсли" }]; - compare(output, target); - }, - MAX_TEST_EXECUTION_TIME, - ); - }); - - afterAll(async () => { - await pipe?.dispose(); - }, MAX_MODEL_DISPOSE_TIME); - }); - - describe("object-detection", () => { - const model_id = "hf-internal-testing/tiny-random-DetrForObjectDetection"; - const urls = ["https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/white-image.png", "https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/blue-image.png"]; - - /** @type {ImageClassificationPipeline} */ - let pipe; - beforeAll(async () => { - pipe = await pipeline("object-detection", model_id, DEFAULT_MODEL_OPTIONS); - }, MAX_MODEL_LOAD_TIME); - - describe("batch_size=1", () => { - it( - "default (threshold unset)", - async () => { - const output = await pipe(urls[0]); - const target = []; - compare(output, target, 1e-5); - }, - MAX_TEST_EXECUTION_TIME, - ); - it( - "default (threshold=0)", - async () => { - const output = await pipe(urls[0], { threshold: 0 }); - const target = [ - { score: 0.020360443741083145, label: "LABEL_31", box: { xmin: 56, ymin: 55, xmax: 169, ymax: 167 } }, - { score: 0.020360419526696205, label: "LABEL_31", box: { xmin: 56, ymin: 55, xmax: 169, ymax: 167 } }, - { score: 0.02036038413643837, label: "LABEL_31", box: { xmin: 56, ymin: 55, xmax: 169, ymax: 167 } }, - { score: 0.020360447466373444, label: "LABEL_31", box: { xmin: 56, ymin: 55, xmax: 169, ymax: 167 } }, - { score: 0.020360389724373817, label: "LABEL_31", box: { xmin: 56, ymin: 55, xmax: 169, ymax: 167 } }, - { score: 0.020360423251986504, label: "LABEL_31", box: { xmin: 56, ymin: 55, xmax: 169, ymax: 167 } }, - { score: 0.02036040835082531, label: "LABEL_31", box: { xmin: 56, ymin: 55, xmax: 169, ymax: 167 } }, - { score: 0.020360363647341728, label: "LABEL_31", box: { xmin: 56, ymin: 55, xmax: 169, ymax: 167 } 
}, - { score: 0.020360389724373817, label: "LABEL_31", box: { xmin: 56, ymin: 55, xmax: 169, ymax: 167 } }, - { score: 0.020360389724373817, label: "LABEL_31", box: { xmin: 56, ymin: 55, xmax: 169, ymax: 167 } }, - { score: 0.020360343158245087, label: "LABEL_31", box: { xmin: 56, ymin: 55, xmax: 169, ymax: 167 } }, - { score: 0.020360423251986504, label: "LABEL_31", box: { xmin: 56, ymin: 55, xmax: 169, ymax: 167 } }, - ]; - compare(output, target, 1e-5); - }, - MAX_TEST_EXECUTION_TIME, - ); - }); - - // TODO: Add batched support to object detection pipeline - // describe('batch_size>1', () => { - // it('default (threshold unset)', async () => { - // const output = await pipe(urls); - // console.log(output); - // const target = []; - // compare(output, target, 1e-5); - // }, MAX_TEST_EXECUTION_TIME); - // it('default (threshold=0)', async () => { - // const output = await pipe(urls, { threshold: 0 }); - // console.log(output); - // const target = []; - // compare(output, target, 1e-5); - // }, MAX_TEST_EXECUTION_TIME); - // }); - - afterAll(async () => { - await pipe?.dispose(); - }, MAX_MODEL_DISPOSE_TIME); - }); - - describe("document-question-answering", () => { - const model_id = "hf-internal-testing/tiny-random-VisionEncoderDecoderModel-donutswin-mbart"; - - /** @type {DocumentQuestionAnsweringPipeline} */ - let pipe; - beforeAll(async () => { - pipe = await pipeline("document-question-answering", model_id, DEFAULT_MODEL_OPTIONS); - }, MAX_MODEL_LOAD_TIME); - - describe("batch_size=1", () => { - it( - "default", - async () => { - const dims = [64, 32, 3]; - const image = new RawImage(new Uint8ClampedArray(dims[0] * dims[1] * dims[2]).fill(255), ...dims); - const question = "What is the invoice number?"; - const output = await pipe(image, question); - - const target = [{ answer: null }]; - compare(output, target); - }, - MAX_TEST_EXECUTION_TIME, - ); - }); - - afterAll(async () => { - await pipe?.dispose(); - }, MAX_MODEL_DISPOSE_TIME); - }); -}); diff --git a/tests/tokenizers.test.js b/tests/tokenizers.test.js index 00f58193d..943ce5898 100644 --- a/tests/tokenizers.test.js +++ b/tests/tokenizers.test.js @@ -1,13 +1,10 @@ import { AutoTokenizer } from "../src/tokenizers.js"; -import * as TOKENIZER_TESTS from "./models/all_tokenization_tests.js"; - -import { compare } from "./test_utils.js"; - -const MAX_LOAD_TIME = 10_000; -const MAX_EXECUTION_TIME = 10_000; +import { MAX_TOKENIZER_LOAD_TIME, MAX_TEST_EXECUTION_TIME } from "./init.js"; +import { compare, collect_tests } from "./test_utils.js"; +const TOKENIZER_TESTS = await collect_tests("tokenization"); describe("Tokenizers (model-specific)", () => { - for (const [tokenizer_name, { TOKENIZER_CLASS, TEST_CONFIG, CUSTOM_TESTS }] of Object.entries(TOKENIZER_TESTS)) { + for (const [tokenizer_name, { TOKENIZER_CLASS, TEST_CONFIG, CUSTOM_TESTS }] of TOKENIZER_TESTS) { describe(tokenizer_name, () => { for (const model_id in TEST_CONFIG) { describe(model_id, () => { @@ -15,7 +12,7 @@ describe("Tokenizers (model-specific)", () => { let tokenizer; beforeAll(async () => { tokenizer = await TOKENIZER_CLASS.from_pretrained(model_id); - }, MAX_LOAD_TIME); + }, MAX_TOKENIZER_LOAD_TIME); for (const [test_name, test_case] of Object.entries(TEST_CONFIG[model_id])) { test(test_name, () => { @@ -184,7 +181,7 @@ describe("Tokenizer padding/truncation", () => { ]); } }, - MAX_EXECUTION_TIME, + MAX_TEST_EXECUTION_TIME, ); }); @@ -218,7 +215,7 @@ describe("Token type ids", () => { compare(model_inputs, expected); }, - MAX_EXECUTION_TIME, + 
MAX_TEST_EXECUTION_TIME, ); it( @@ -255,7 +252,7 @@ describe("Token type ids", () => { compare(model_inputs, expected); } }, - MAX_EXECUTION_TIME, + MAX_TEST_EXECUTION_TIME, ); }); @@ -269,7 +266,7 @@ describe("Edge cases", () => { let encoded = tokenizer(text); expect(encoded.input_ids.data.length).toBeGreaterThan(100000); }, - MAX_EXECUTION_TIME, + MAX_TEST_EXECUTION_TIME, ); it("should not take too long", async () => { @@ -293,7 +290,7 @@ describe("Edge cases", () => { compare(token_ids, [109]); // Should not be [108, 108] } }, - MAX_EXECUTION_TIME, + MAX_TEST_EXECUTION_TIME, ); }); @@ -320,7 +317,7 @@ describe("Extra decoding tests", () => { expect(decoded3).toEqual(text); expect(decoded4).toEqual(text); }, - MAX_EXECUTION_TIME, + MAX_TEST_EXECUTION_TIME, ); }); diff --git a/tests/utils/hub.test.js b/tests/utils/hub.test.js index 19077f009..3ef3f41f7 100644 --- a/tests/utils/hub.test.js +++ b/tests/utils/hub.test.js @@ -1,6 +1,6 @@ import { AutoModel, PreTrainedModel } from "../../src/models.js"; -import { MAX_TEST_EXECUTION_TIME } from "../init.js"; +import { MAX_TEST_EXECUTION_TIME, DEFAULT_MODEL_OPTIONS } from "../init.js"; // TODO: Set cache folder to a temp directory @@ -10,7 +10,7 @@ describe("Hub", () => { "should load a model from the local cache", async () => { // 1. Local model exists (doesn't matter about status of remote file since local is tried first) - const model = await AutoModel.from_pretrained("hf-internal-testing/tiny-random-T5ForConditionalGeneration"); + const model = await AutoModel.from_pretrained("hf-internal-testing/tiny-random-T5ForConditionalGeneration", DEFAULT_MODEL_OPTIONS); expect(model).toBeInstanceOf(PreTrainedModel); }, MAX_TEST_EXECUTION_TIME, @@ -21,7 +21,7 @@ describe("Hub", () => { async () => { // 2. Local model doesn't exist, remote file exists // This tests that fallback functionality is working - const model = await AutoModel.from_pretrained("hf-internal-testing/tiny-random-T5ForConditionalGeneration"); + const model = await AutoModel.from_pretrained("hf-internal-testing/tiny-random-T5ForConditionalGeneration", DEFAULT_MODEL_OPTIONS); expect(model).toBeInstanceOf(PreTrainedModel); }, MAX_TEST_EXECUTION_TIME, @@ -32,7 +32,7 @@ describe("Hub", () => { async () => { // 3. Local model doesn't exist, remote file doesn't exist // This tests that error handling is working. 
- await expect(AutoModel.from_pretrained("hf-internal-testing/this-model-does-not-exist")).rejects.toBeInstanceOf(Error); + await expect(AutoModel.from_pretrained("hf-internal-testing/this-model-does-not-exist", DEFAULT_MODEL_OPTIONS)).rejects.toBeInstanceOf(Error); }, MAX_TEST_EXECUTION_TIME, ); diff --git a/tests/utils/image.test.js b/tests/utils/image.test.js new file mode 100644 index 000000000..7fc5d5b4a --- /dev/null +++ b/tests/utils/image.test.js @@ -0,0 +1,89 @@ +import { RawImage, rand } from "../../src/transformers.js"; +import { load_cached_image } from "../asset_cache.js"; + +const TEST_IMAGES = { + rgba: new RawImage(new Uint8ClampedArray([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23]), 2, 3, 4), + rgb: new RawImage(new Uint8ClampedArray([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17]), 2, 3, 3), + la: new RawImage(new Uint8ClampedArray([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]), 2, 3, 2), + l: new RawImage(new Uint8ClampedArray([0, 1, 2, 3, 4, 5]), 2, 3, 1), +}; + +describe("Image utilities", () => { + describe("Padding", () => { + it("should pad image", async () => { + /** @type {RawImage} */ + const padded_image = await load_cached_image("blue_image") + .then((image) => image.resize(224, 224)) + .then((image) => image.pad([128, 128, 128, 128])); + + expect(padded_image.size).toEqual([480, 480]); + + const avg = padded_image.data.reduce((acc, val) => acc + val, 0) / padded_image.data.length; + expect(avg).toBeCloseTo((224 * 224 * 255) / (3 * 480 * 480), 6); + }); + }); + + describe("Tensor to Image", () => { + it("should create an image from a tensor (CHW)", () => { + const tensor_chw = rand([3, 128, 256]).mul_(255).to("uint8"); + const image = RawImage.fromTensor(tensor_chw); + expect(image.size).toEqual([256, 128]); + }); + it("should create an image from a tensor (HWC)", () => { + const tensor_hwc = rand([128, 256, 3]).mul_(255).to("uint8"); + const image = RawImage.fromTensor(tensor_hwc, "HWC"); + expect(image.size).toEqual([256, 128]); + }); + }); + + describe("Channel conversions", () => { + it("should convert RGBA to L (grayscale)", async () => { + const grayscale = TEST_IMAGES.rgba.clone().grayscale(); + expect(grayscale.size).toEqual(TEST_IMAGES.rgba.size); + expect(grayscale.channels).toEqual(1); + }); + + it("should convert RGB to L (grayscale)", async () => { + const grayscale = TEST_IMAGES.rgb.clone().grayscale(); + expect(grayscale.size).toEqual(TEST_IMAGES.rgb.size); + expect(grayscale.channels).toEqual(1); + }); + + it("should convert L to RGB", async () => { + const rgb = TEST_IMAGES.l.clone().rgb(); + expect(rgb.size).toEqual(TEST_IMAGES.l.size); + expect(rgb.channels).toEqual(3); + }); + + it("should convert L to RGBA", async () => { + const rgba = TEST_IMAGES.l.clone().rgba(); + expect(rgba.size).toEqual(TEST_IMAGES.l.size); + expect(rgba.channels).toEqual(4); + }); + + it("should convert RGB to RGBA", async () => { + const rgba = TEST_IMAGES.rgb.clone().rgba(); + expect(rgba.size).toEqual(TEST_IMAGES.rgb.size); + expect(rgba.channels).toEqual(4); + }); + + it("should convert RGBA to RGB", async () => { + const rgb = TEST_IMAGES.rgba.clone().rgb(); + expect(rgb.size).toEqual(TEST_IMAGES.rgba.size); + expect(rgb.channels).toEqual(3); + }); + }); + + describe("putAlpha", () => { + it("should add alpha to RGB image", async () => { + const rgba = TEST_IMAGES.rgb.clone().putAlpha(TEST_IMAGES.l); + expect(rgba.size).toEqual(TEST_IMAGES.rgb.size); + expect(rgba.channels).toEqual(4); + }); + it("should add 
alpha to RGBA image", async () => { + const rgba = TEST_IMAGES.rgba.clone().putAlpha(TEST_IMAGES.l); + expect(rgba.size).toEqual(TEST_IMAGES.rgba.size); + expect(rgba.channels).toEqual(4); + }); + }); +});
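
Editor's note on the new auto-detection convention introduced by this patch: `collect_and_execute_tests` only picks up per-model files named `tests/models/<model_type>/test_<filename>_<model_type>.js` whose default export registers a suite. The sketch below is a hypothetical example (not part of this patch); the model type `my_model`, the suite title, the checkpoint id, and the assertion are illustrative placeholders only.

// tests/models/my_model/test_processor_my_model.js (hypothetical example)
import { AutoProcessor } from "../../../src/transformers.js";
import { MAX_MODEL_LOAD_TIME, MAX_TEST_EXECUTION_TIME } from "../../init.js";

export default () => {
  // The default export is passed straight to describe() by collect_and_execute_tests,
  // so it only needs to register the suite for this model type.
  describe("MyModelProcessor", () => {
    let processor;
    beforeAll(async () => {
      processor = await AutoProcessor.from_pretrained("hf-internal-testing/tiny-random-MyModel");
    }, MAX_MODEL_LOAD_TIME);

    it(
      "processes a simple input",
      async () => {
        const inputs = await processor("hello world");
        expect(inputs).toBeDefined();
      },
      MAX_TEST_EXECUTION_TIME,
    );
  });
};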