From a8749efde8bdb72bd60c310c500701565aa4c544 Mon Sep 17 00:00:00 2001
From: Kyle Sayers
Date: Wed, 28 Jan 2026 17:11:43 +0000
Subject: [PATCH 1/2] qwen3_vl_example

Signed-off-by: Kyle Sayers
---
 .../multimodal_vision/qwen3_vl_example.py | 124 ++++++++++++++++++
 1 file changed, 124 insertions(+)
 create mode 100644 examples/multimodal_vision/qwen3_vl_example.py

diff --git a/examples/multimodal_vision/qwen3_vl_example.py b/examples/multimodal_vision/qwen3_vl_example.py
new file mode 100644
index 0000000000..12bec4ba3a
--- /dev/null
+++ b/examples/multimodal_vision/qwen3_vl_example.py
@@ -0,0 +1,124 @@
import base64
from io import BytesIO

import torch
from datasets import load_dataset
from qwen_vl_utils import process_vision_info
from transformers import AutoProcessor, Qwen3VLForConditionalGeneration

from llmcompressor import oneshot
from llmcompressor.modifiers.awq import AWQModifier
from llmcompressor.utils import dispatch_for_generation

# Load model.
model_id = "Qwen/Qwen3-VL-8B-Instruct"
model = Qwen3VLForConditionalGeneration.from_pretrained(model_id, torch_dtype="auto")  # keep checkpoint dtype
processor = AutoProcessor.from_pretrained(model_id)

# Oneshot arguments
DATASET_ID = "lmms-lab/flickr30k"
DATASET_SPLIT = "test[:12]"
NUM_CALIBRATION_SAMPLES = 12
MAX_SEQUENCE_LENGTH = 2048

# Load dataset and preprocess.
ds = load_dataset(DATASET_ID, split=DATASET_SPLIT)
ds = ds.shuffle(seed=42)  # fixed seed so the calibration subset is reproducible


# Apply chat template and tokenize inputs.

def preprocess_and_tokenize(example):
    # preprocess: re-encode the dataset's PIL image as a base64 PNG data URI,
    # the image form the Qwen chat template accepts
    buffered = BytesIO()
    example["image"].save(buffered, format="PNG")
    encoded_image = base64.b64encode(buffered.getvalue())
    encoded_image_text = encoded_image.decode("utf-8")
    base64_qwen = f"data:image;base64,{encoded_image_text}"
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": base64_qwen},
                {"type": "text", "text": "What does the image show?"},
            ],
        }
    ]
    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)

    # tokenize text and vision inputs together; truncate to the calibration length
    return processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=False,
        max_length=MAX_SEQUENCE_LENGTH,
        truncation=True,
    )


ds = ds.map(preprocess_and_tokenize, remove_columns=ds.column_names)


# Define a oneshot data collator for multimodal inputs.
def data_collator(batch):
    assert len(batch) == 1  # calibration runs one sample at a time
    return {key: torch.tensor(value) for key, value in batch[0].items()}


# Recipe: W4A16 AWQ on the language model only; skip lm_head and the vision tower
recipe = AWQModifier(
    scheme="W4A16",
    ignore=["re:.*lm_head", "re:.*visual.*"],
    duo_scaling=False,
)

# Perform oneshot calibration, tracing decoder layers sequentially
oneshot(
    model=model,
    tokenizer=model_id,
    dataset=ds,
    recipe=recipe,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
    data_collator=data_collator,
    sequential_targets=["Qwen3VLTextDecoderLayer"],
)

# Confirm generations of the quantized model look sane.
print("========== SAMPLE GENERATION ==============")
dispatch_for_generation(model)
messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "image",
                "image": "http://images.cocodataset.org/train2017/000000231895.jpg",
            },
            {"type": "text", "text": "Please describe the animal in this image\n"},
        ],
    }
]
prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
    text=[prompt],
    images=image_inputs,
    videos=video_inputs,
    padding=False,
    max_length=MAX_SEQUENCE_LENGTH,
    truncation=True,
    return_tensors="pt",
).to(model.device)
output = model.generate(**inputs, max_new_tokens=100)
print(processor.decode(output[0], skip_special_tokens=True))
print("==========================================")


# Save to disk compressed.
SAVE_DIR = model_id.rstrip("/").split("/")[-1] + "-W4A16"  # e.g. "Qwen3-VL-8B-Instruct-W4A16"
model.save_pretrained(SAVE_DIR, save_compressed=True)
processor.save_pretrained(SAVE_DIR)

From 6fcf755d11c96fc1dc7dec333fbf5474ec4538cf Mon Sep 17 00:00:00 2001
From: Kyle Sayers
Date: Wed, 28 Jan 2026 17:13:32 +0000
Subject: [PATCH 2/2] use more samples

Signed-off-by: Kyle Sayers
---
 examples/multimodal_vision/qwen3_vl_example.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/multimodal_vision/qwen3_vl_example.py b/examples/multimodal_vision/qwen3_vl_example.py
index 12bec4ba3a..1272eb88c8 100644
--- a/examples/multimodal_vision/qwen3_vl_example.py
+++ b/examples/multimodal_vision/qwen3_vl_example.py
@@ -17,8 +17,8 @@

 # Oneshot arguments
 DATASET_ID = "lmms-lab/flickr30k"
-DATASET_SPLIT = "test[:12]"
-NUM_CALIBRATION_SAMPLES = 12
+DATASET_SPLIT = "test[:512]"
+NUM_CALIBRATION_SAMPLES = 512
 MAX_SEQUENCE_LENGTH = 2048

 # Load dataset and preprocess.