From 98b1e715e302643f1bcbd76d6a6963888deb8821 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 25 May 2026 11:46:11 +0000 Subject: [PATCH 1/5] vision_utils: clamp resize new_w/new_h to >=1 The integer-rounding formula in _resize_images_inplace can produce new_w=0 or new_h=0 for inputs with extreme aspect ratios (e.g. 1024x1 with image_size=256: new_h = (1*256 + 512) // 1024 = 0). PIL.resize then raises "height and width must be > 0". This is reachable from Studio's vision_image_size knob when a dataset contains a degenerate image and the user picks a small cap. The fix mirrors the max(1, ...) guard already present in Studio's MLX resize helper (studio/backend/core/training/worker.py::_mlx_vlm_max_resized_size). --- unsloth_zoo/vision_utils.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/unsloth_zoo/vision_utils.py b/unsloth_zoo/vision_utils.py index 1f36a595c..0d1f7aa05 100644 --- a/unsloth_zoo/vision_utils.py +++ b/unsloth_zoo/vision_utils.py @@ -978,9 +978,11 @@ def quantize_to_factor(x): image[i] = img.resize(image_size, LANCZOS) elif self.size_func(img) > image_size and hasattr(img, "resize"): w, h = img.size - # integer math rounding - new_w = (w * image_size + self.size_func(img) // 2) // self.size_func(img) - new_h = (h * image_size + self.size_func(img) // 2) // self.size_func(img) + # integer math rounding; clamp to >=1 so degenerate aspect + # ratios (e.g. 1024x1, 4000x4) where the downscale would + # round one side to 0 do not crash PIL.resize. + new_w = max(1, (w * image_size + self.size_func(img) // 2) // self.size_func(img)) + new_h = max(1, (h * image_size + self.size_func(img) // 2) // self.size_func(img)) if self.snap_to_patch_size: factor = self.patch_size * 2 new_w, new_h = quantize_to_factor(new_w), quantize_to_factor(new_h) From 14ced560fe2716abe33405468b49f131c78f1746 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 25 May 2026 13:16:46 +0000 Subject: [PATCH 2/5] Tighten degenerate-aspect clamp comment --- unsloth_zoo/vision_utils.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/unsloth_zoo/vision_utils.py b/unsloth_zoo/vision_utils.py index 0d1f7aa05..6903b0acd 100644 --- a/unsloth_zoo/vision_utils.py +++ b/unsloth_zoo/vision_utils.py @@ -978,9 +978,8 @@ def quantize_to_factor(x): image[i] = img.resize(image_size, LANCZOS) elif self.size_func(img) > image_size and hasattr(img, "resize"): w, h = img.size - # integer math rounding; clamp to >=1 so degenerate aspect - # ratios (e.g. 1024x1, 4000x4) where the downscale would - # round one side to 0 do not crash PIL.resize. + # integer math rounding; max(1, _) avoids zero-side crash + # on degenerate aspect ratios (e.g. 1024x1 with image_size=256). new_w = max(1, (w * image_size + self.size_func(img) // 2) // self.size_func(img)) new_h = max(1, (h * image_size + self.size_func(img) // 2) // self.size_func(img)) if self.snap_to_patch_size: From 5ab1205cffe183ec205e6ff840b038d789ef8bad Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 27 May 2026 13:38:30 +0000 Subject: [PATCH 3/5] Refactor _resize_images_inplace: cache size_func, hoist invariants Pure refactor on top of the existing clamp; output byte-identical for every input that has worked since the collator landed. - Hoist is_tuple, snap_to_patch_size, factor out of the per-image loop. - Cache size_func(img) so it is not called 3x per image. No behavioral change. Same code paths, same numeric output, same exception surface. --- unsloth_zoo/vision_utils.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/unsloth_zoo/vision_utils.py b/unsloth_zoo/vision_utils.py index 6903b0acd..e54aa9310 100644 --- a/unsloth_zoo/vision_utils.py +++ b/unsloth_zoo/vision_utils.py @@ -972,18 +972,25 @@ def quantize_to_factor(x): return image or [] # Resize images image_size = self.image_size + # Loop invariants hoisted once per call. + is_tuple = type(image_size) is tuple + snap = self.snap_to_patch_size + if snap: + factor = self.patch_size * 2 for i, img in enumerate(image): - if type(image_size) is tuple: + if is_tuple: image[i] = img.resize(image_size, LANCZOS) - elif self.size_func(img) > image_size and hasattr(img, "resize"): + continue + # Cache size_func(img) so it is not called 3x per image. + side = self.size_func(img) + if side > image_size and hasattr(img, "resize"): w, h = img.size # integer math rounding; max(1, _) avoids zero-side crash # on degenerate aspect ratios (e.g. 1024x1 with image_size=256). - new_w = max(1, (w * image_size + self.size_func(img) // 2) // self.size_func(img)) - new_h = max(1, (h * image_size + self.size_func(img) // 2) // self.size_func(img)) - if self.snap_to_patch_size: - factor = self.patch_size * 2 + new_w = max(1, (w * image_size + side // 2) // side) + new_h = max(1, (h * image_size + side // 2) // side) + if snap: new_w, new_h = quantize_to_factor(new_w), quantize_to_factor(new_h) image[i] = img.resize((new_w, new_h), LANCZOS) From 39737603cca437712ab5a30e79e8ef58ae838683 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 27 May 2026 15:00:02 +0000 Subject: [PATCH 4/5] Warn on post-resize aspect_ratio > MAX_RATIO After the clamp, surface a one-shot UserWarning when the resized image would have aspect_ratio > 200 (MAX_RATIO). Qwen2-VL / Qwen2.5-VL preprocessors reject such inputs in their own smart_resize; without this warning users only see the downstream crash and have no signal that the issue is a degenerate-aspect training image. Non-degenerate inputs are unaffected (warning is gated on the same MAX_RATIO check zoo's own smart_resize already enforces). --- unsloth_zoo/vision_utils.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/unsloth_zoo/vision_utils.py b/unsloth_zoo/vision_utils.py index e54aa9310..04ef1bd56 100644 --- a/unsloth_zoo/vision_utils.py +++ b/unsloth_zoo/vision_utils.py @@ -92,6 +92,10 @@ MAX_PIXELS = 16384 * 28 * 28 MAX_RATIO = 200 +# One-shot guard so the degenerate-aspect warning fires once per process, +# not once per image / per batch. +_WARNED_DEGENERATE_ASPECT = False + VIDEO_MIN_PIXELS = 128 * 28 * 28 VIDEO_MAX_PIXELS = 768 * 28 * 28 VIDEO_TOTAL_PIXELS = int(float(os.environ.get('VIDEO_MAX_PIXELS', 128000 * 28 * 28 * 0.9))) @@ -993,6 +997,25 @@ def quantize_to_factor(x): if snap: new_w, new_h = quantize_to_factor(new_w), quantize_to_factor(new_h) + # Heads-up: Qwen2-VL / Qwen2.5-VL preprocessors reject inputs + # with aspect_ratio > MAX_RATIO via smart_resize. Surface a + # single, actionable warning so users learn to filter their + # dataset before the downstream crash. + global _WARNED_DEGENERATE_ASPECT + if (not _WARNED_DEGENERATE_ASPECT + and max(new_w, new_h) > MAX_RATIO * min(new_w, new_h)): + _WARNED_DEGENERATE_ASPECT = True + warnings.warn( + f"Unsloth: image {w}x{h} resized to " + f"({new_w}, {new_h}) has aspect ratio " + f"{max(new_w, new_h) // min(new_w, new_h)}, exceeding " + f"MAX_RATIO={MAX_RATIO}. Qwen2-VL / Qwen2.5-VL will " + "reject this in smart_resize; filter degenerate-aspect " + "images from your dataset before training those models.", + UserWarning, + stacklevel = 2, + ) + image[i] = img.resize((new_w, new_h), LANCZOS) return image From 4a026c1b128d8b62213f9ddba810d49e6136ed68 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 27 May 2026 15:45:37 +0000 Subject: [PATCH 5/5] Tighten comments in vision_utils._resize_images_inplace --- unsloth_zoo/vision_utils.py | 25 +++++++++---------------- 1 file changed, 9 insertions(+), 16 deletions(-) diff --git a/unsloth_zoo/vision_utils.py b/unsloth_zoo/vision_utils.py index 04ef1bd56..7d6b6cc9b 100644 --- a/unsloth_zoo/vision_utils.py +++ b/unsloth_zoo/vision_utils.py @@ -92,8 +92,7 @@ MAX_PIXELS = 16384 * 28 * 28 MAX_RATIO = 200 -# One-shot guard so the degenerate-aspect warning fires once per process, -# not once per image / per batch. +# Fire degenerate-aspect warning once per process. _WARNED_DEGENERATE_ASPECT = False VIDEO_MIN_PIXELS = 128 * 28 * 28 @@ -976,7 +975,7 @@ def quantize_to_factor(x): return image or [] # Resize images image_size = self.image_size - # Loop invariants hoisted once per call. + # Hoist loop invariants. is_tuple = type(image_size) is tuple snap = self.snap_to_patch_size if snap: @@ -986,32 +985,26 @@ def quantize_to_factor(x): if is_tuple: image[i] = img.resize(image_size, LANCZOS) continue - # Cache size_func(img) so it is not called 3x per image. + # Cache size_func(img) once. side = self.size_func(img) if side > image_size and hasattr(img, "resize"): w, h = img.size - # integer math rounding; max(1, _) avoids zero-side crash - # on degenerate aspect ratios (e.g. 1024x1 with image_size=256). + # max(1, _) avoids zero-side crash on degenerate aspect ratios. new_w = max(1, (w * image_size + side // 2) // side) new_h = max(1, (h * image_size + side // 2) // side) if snap: new_w, new_h = quantize_to_factor(new_w), quantize_to_factor(new_h) - # Heads-up: Qwen2-VL / Qwen2.5-VL preprocessors reject inputs - # with aspect_ratio > MAX_RATIO via smart_resize. Surface a - # single, actionable warning so users learn to filter their - # dataset before the downstream crash. + # Qwen2-VL smart_resize rejects aspect > MAX_RATIO; warn once. global _WARNED_DEGENERATE_ASPECT if (not _WARNED_DEGENERATE_ASPECT and max(new_w, new_h) > MAX_RATIO * min(new_w, new_h)): _WARNED_DEGENERATE_ASPECT = True warnings.warn( - f"Unsloth: image {w}x{h} resized to " - f"({new_w}, {new_h}) has aspect ratio " - f"{max(new_w, new_h) // min(new_w, new_h)}, exceeding " - f"MAX_RATIO={MAX_RATIO}. Qwen2-VL / Qwen2.5-VL will " - "reject this in smart_resize; filter degenerate-aspect " - "images from your dataset before training those models.", + f"Unsloth: {w}x{h} -> ({new_w}, {new_h}) aspect " + f"{max(new_w, new_h) // min(new_w, new_h)} > " + f"MAX_RATIO={MAX_RATIO}. Qwen2-VL/2.5-VL will reject; " + "filter degenerate-aspect images from your dataset.", UserWarning, stacklevel = 2, )