Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
d43aa6b
fix
roG0d Apr 17, 2026
9846e09
fix
roG0d Apr 17, 2026
12c8e5f
refactoring
baonudesifeizhai Apr 18, 2026
f79a574
refactoring
baonudesifeizhai Apr 18, 2026
e398ab3
continue refacoring
baonudesifeizhai Apr 18, 2026
8a3b83d
fix huawei
baonudesifeizhai Apr 19, 2026
b2b15f0
fix online server problem
baonudesifeizhai Apr 19, 2026
fc91876
[Quant] Wire quant_config through HunyuanVideo-1.5 DiT (extracted fro…
lishunyang12 Apr 19, 2026
8aa8f99
[Quant] ModelOpt FP8 calibration script + stage config for HunyuanVid…
lishunyang12 Apr 19, 2026
bd2a01d
[Quant] HV-1.5 calibration: handle Guider config (fall back if guidan…
lishunyang12 Apr 19, 2026
f76a50c
[Quant] Add check_modelopt_fp8_export.py post-calibration verifier
lishunyang12 Apr 19, 2026
b2f4522
[Quant] check script: read safetensors header for dtype (not get_tens…
lishunyang12 Apr 19, 2026
2b38838
[Quant] Force FP8 weight serialization + patch quant_algo for HV-1.5
lishunyang12 Apr 19, 2026
d876c58
[Quant] hide_quantizers_from_state_dict: pass transformer (nn.Module)…
lishunyang12 Apr 19, 2026
737db25
[Quant] Fix calibrator's 'next' hint: text_to_video.py uses --quantiz…
lishunyang12 Apr 19, 2026
26b53f3
[Quant] Add --weight-block-size to switch HV-1.5 ModelOpt FP8 to per-…
lishunyang12 Apr 19, 2026
23af41f
[Quant] check script: classify weight_scale granularity (per-tensor v…
lishunyang12 Apr 19, 2026
9c22163
[Quant] check: count only meaningful (>1) dims when classifying scale…
lishunyang12 Apr 19, 2026
4047d1c
[Quant] Wire quant_config through Wan2.2 DiT (extracted from #2920)
lishunyang12 Apr 19, 2026
6cc5478
[Quant] ModelOpt FP8 calibration script + stage config for Wan2.2 TI2…
lishunyang12 Apr 19, 2026
4e6f557
[Quant] Fix Wan2.2 ModelOpt FP8 FFN scale-name remap (ffn.net.0 -> ff…
lishunyang12 Apr 19, 2026
a5fb789
[Quant] ModelOpt FP8 adapter: log first 3 skipped scales for diagnostics
lishunyang12 Apr 19, 2026
7e57b69
[Quant] Walk submodules when collecting hf_to_vllm_mapper for ModelOp…
lishunyang12 Apr 19, 2026
2b543e9
Phase1 (video-gen) ModelOpt FP8 Follow-ups (#57)
ArtificialRay May 17, 2026
4114b98
Merge upstream/main into modelopt-fp8-hv15
lishunyang12 May 17, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions examples/offline_inference/vace/vace_video_generation.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,13 @@ def parse_args() -> argparse.Namespace:
choices=["t2v", "i2v", "v2lf", "flf2v", "inpaint", "r2v"],
help="Generation mode.",
)
parser.add_argument(
"--quantization",
type=str,
default=None,
choices=["fp8", "gguf"],
help="Quantization method for the transformer (fp8 for online FP8 quantization).",
)
parser.add_argument("--prompt", default="A cat walking in a garden", help="Text prompt.")
parser.add_argument("--negative-prompt", default="", help="Negative prompt.")
parser.add_argument("--image", type=str, default=None, help="Input image path (for I2V, R2V, FLF2V, inpaint).")
Expand Down Expand Up @@ -159,6 +166,7 @@ def main():
flow_shift=args.flow_shift,
enforce_eager=args.enforce_eager,
parallel_config=parallel_config,
quantization = args.quantization
)

prompt_data = build_prompts(args)
Expand Down
311 changes: 311 additions & 0 deletions examples/quantization/check_modelopt_fp8_export.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,311 @@
#!/usr/bin/env python3
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Verify a ModelOpt FP8 diffusers checkpoint exported by
quantize_hunyuanvideo_15_modelopt_fp8.py (or any sibling quantize_*.py).

Three checks:
A. transformer/config.json has a sane quantization_config block.
B. transformer/*.safetensors contains FP8 (float8_e4m3fn) tensors.
C. transformer disk size is materially smaller than a BF16 baseline.

Example:
python examples/quantization/check_modelopt_fp8_export.py \\
--output ./hv15-480p-modelopt-fp8

# Optional: compare disk size against a local or HF BF16 baseline.
python examples/quantization/check_modelopt_fp8_export.py \\
--output ./hv15-480p-modelopt-fp8 \\
--baseline hunyuanvideo-community/HunyuanVideo-1.5-Diffusers-480p_t2v
"""

from __future__ import annotations

import argparse
import json
import sys
from collections import Counter
from pathlib import Path


def _check_config(transformer_dir: Path) -> int:
"""Returns 0 on pass, 1 on fail. Prints findings."""
cfg_path = transformer_dir / "config.json"
if not cfg_path.exists():
print(f"[FAIL] {cfg_path} missing.")
return 1

with cfg_path.open(encoding="utf-8") as f:
cfg = json.load(f)

qc = cfg.get("quantization_config")
if not isinstance(qc, dict):
print(f"[FAIL] No `quantization_config` block in {cfg_path}.")
return 1

print(f"[A] quantization_config from {cfg_path}:")
print(json.dumps(qc, indent=2))

issues = []
if qc.get("quant_method") != "modelopt":
issues.append(f"quant_method={qc.get('quant_method')!r} (expected 'modelopt')")

quant_algo = qc.get("quant_algo")
if quant_algo not in ("FP8", "FP8_PB_WO"):
issues.append(
f"quant_algo={quant_algo!r} (expected 'FP8' for per-tensor or "
"'FP8_PB_WO' for 128x128 block-wise — other algos aren't routed by "
"vllm-omni's adapter today)"
)

# Cross-check that the saved weight strategy and the dispatch field agree.
# Producer scripts can in principle drift apart (e.g. metadata says "block"
# but quant_algo still claims "FP8"), and that lands as an AssertionError at
# weight load time because the runtime LinearMethod expects scalar scales but
# finds 4D block ones. Failing here is much friendlier.
cfg_groups = qc.get("config_groups", {})
weight_strategies = {
(group or {}).get("weights", {}).get("strategy")
for group in cfg_groups.values()
if isinstance(group, dict)
}
weight_strategies.discard(None)
if weight_strategies == {"block"} and quant_algo != "FP8_PB_WO":
issues.append(
f"weights.strategy='block' but quant_algo={quant_algo!r}. Per-block "
"weight scales require FP8_PB_WO so upstream vLLM dispatches to "
"ModelOptFp8PbWoLinearMethod; FP8 routes to per-tensor and crashes "
"on the 4D weight_scale at weight load time."
)
elif quant_algo == "FP8_PB_WO" and weight_strategies != {"block"}:
issues.append(
f"quant_algo='FP8_PB_WO' but weights.strategy={weight_strategies!r} "
"(expected {'block'}). FP8_PB_WO consumers expect 4D per-block scales."
)

if issues:
print("[A] WARN — config looks incomplete:")
for issue in issues:
print(f" - {issue}")
return 2
print(f"[A] PASS — config looks correct (quant_algo={quant_algo}).")
return 0


def _read_safetensors_header(path: Path) -> dict:
"""Read the JSON header of a safetensors file. Bypass-safe — doesn't materialize tensors.

Returns {tensor_name: {'dtype': 'F8_E4M3', 'shape': [...], 'data_offsets': [...]}}.
Header dtype strings: F8_E4M3, F8_E5M2, BF16, F16, F32, F64, I8, I16, I32, I64, BOOL, U8, ...
"""
import struct

with open(path, "rb") as f:
header_len = struct.unpack("<Q", f.read(8))[0]
header = json.loads(f.read(header_len))
header.pop("__metadata__", None)
return header


def _classify_weight_scale_granularity(weight_scale_shapes: list[list[int]]) -> str:
"""Infer per-tensor vs per-channel vs per-block from sample weight_scale shapes.

ModelOpt block-wise produces shapes like `[16, 1, 16, 1]` (broadcasting dims of 1
interleaved with block-count dims). We count "meaningful" dims — ones with size > 1 —
and classify: 0 meaningful dims = per-tensor (scalar), 1 = per-channel, 2+ = per-block.
"""
if not weight_scale_shapes:
return "no weight_scale tensors found"

def meaningful_dims(shape: list[int]) -> int:
return sum(1 for d in shape if d > 1)

per_tensor = sum(1 for s in weight_scale_shapes if meaningful_dims(s) == 0)
per_channel = sum(1 for s in weight_scale_shapes if meaningful_dims(s) == 1)
per_block = sum(1 for s in weight_scale_shapes if meaningful_dims(s) >= 2)
total = len(weight_scale_shapes)
if per_tensor == total:
return "per-tensor (all scalar scales)"
if per_channel == total:
return "per-channel (1 meaningful dim)"
if per_block == total:
return "per-block (2+ meaningful dims — e.g. [M//bm, 1, N//bn, 1] for tiles)"
return f"mixed: per-tensor={per_tensor}, per-channel={per_channel}, per-block={per_block} of {total}"


def _check_safetensors(transformer_dir: Path) -> int:
"""Returns 0 on pass, 1 on fail. Reads on-disk dtype from the safetensors header."""
files = sorted(transformer_dir.glob("*.safetensors"))
if not files:
print(f"[FAIL] No *.safetensors in {transformer_dir}.")
return 1

header_dtype_counts: Counter[str] = Counter()
sample_fp8_keys: list[str] = []
sample_scale_keys: list[str] = []
weight_scale_shapes: list[list[int]] = []
sample_weight_scale_entries: list[tuple[str, list[int]]] = []
for f in files:
try:
header = _read_safetensors_header(f)
except Exception as exc:
print(f"[B] WARN — could not parse header of {f}: {exc}")
continue
for k, info in header.items():
dtype = info.get("dtype", "?")
header_dtype_counts[dtype] += 1
if dtype.startswith("F8") and len(sample_fp8_keys) < 5:
sample_fp8_keys.append(k)
if k.endswith(("_scale", ".weight_scale", ".input_scale", "_scale_inv")) and len(sample_scale_keys) < 5:
sample_scale_keys.append(k)
if k.endswith(".weight_scale"):
weight_scale_shapes.append(info.get("shape", []))
if len(sample_weight_scale_entries) < 5:
sample_weight_scale_entries.append((k, info.get("shape", [])))

print(f"\n[B] On-disk dtype counts across {len(files)} safetensors file(s) (from header, not get_tensor):")
for dtype, count in sorted(header_dtype_counts.items(), key=lambda kv: -kv[1]):
marker = " <-- FP8" if dtype.startswith("F8") else ""
print(f" {dtype:10s} {count:>6d}{marker}")

fp8_count = sum(c for d, c in header_dtype_counts.items() if d.startswith("F8"))
if fp8_count == 0:
print("[B] FAIL — no FP8 tensors on disk. Calibration likely did not actually quantize the weights.")
return 1

print(f"[B] PASS — {fp8_count} FP8 tensors stored on disk.")
if sample_fp8_keys:
print(f" sample FP8 tensors: {sample_fp8_keys[:3]}")
if sample_scale_keys:
print(f" sample scale tensors: {sample_scale_keys[:3]}")
print(" (Note: torch's get_tensor() may return these as bf16 views on some versions —")
print(" irrelevant; vLLM's loader uses native FP8 ops.)")

# Weight-scale granularity — per-tensor (scalar) vs per-channel (1-D) vs per-block (N-D).
print(f"\n weight_scale granularity: {_classify_weight_scale_granularity(weight_scale_shapes)}")
for key, shape in sample_weight_scale_entries[:3]:
print(f" {key}: shape {shape}")
return 0


def _disk_size_gib(p: Path) -> float:
return sum(f.stat().st_size for f in p.rglob("*") if f.is_file()) / (1024**3)


def _transformer_subdirs(root: Path) -> list[Path]:
"""Return [<root>/transformer, <root>/transformer_2] for those that exist.

Wan2.2 MoE A14B (T2V/I2V) and Wan2.2-VACE-A14B export TWO transformer
subfolders; single-transformer checkpoints just have `transformer/`.
Falls back to `[root]` if neither exists (e.g., a baseline directory
that wasn't structured as a diffusers repo).
"""
found = [root / name for name in ("transformer", "transformer_2") if (root / name).is_dir()]
return found if found else [root]


def _check_size_vs_baseline(transformer_dir: Path, baseline: str | None) -> int:
"""Returns 0 always (informational only)."""
# transformer_dir is <fp8_root>/transformer; walk one level up so we can
# also pick up transformer_2/ for Wan2.2 MoE A14B checkpoints.
fp8_root = transformer_dir.parent
fp8_subdirs = _transformer_subdirs(fp8_root)
fp8_size = sum(_disk_size_gib(p) for p in fp8_subdirs)
fp8_label = " + ".join(p.name for p in fp8_subdirs)
print(f"\n[C] FP8 transformer disk size ({fp8_label}): {fp8_size:.2f} GiB")

if baseline is None:
print("[C] SKIP — pass --baseline <path or HF id> to compare against BF16.")
return 0

baseline_path = Path(baseline)
if not baseline_path.exists():
# Treat `baseline` as an HF repo id and read from the local cache.
# Don't trigger a download: this script is meant to run AFTER
# quantize_*_modelopt_fp8.py, which already pulled the whole repo
# into the cache. local_files_only=True makes that assumption
# explicit — if the cache is empty we surface a clear error rather
# than silently kicking off a multi-GB download.
try:
from huggingface_hub import snapshot_download
from huggingface_hub.errors import LocalEntryNotFoundError
except ImportError:
print("[C] SKIP — huggingface_hub not installed and baseline not a local path.")
return 0
try:
baseline_path = Path(snapshot_download(baseline, local_files_only=True))
except LocalEntryNotFoundError:
print(
f"[C] SKIP — '{baseline}' not found in local HF cache. "
"Run the matching quantize_*_modelopt_fp8.py first (it caches the BF16 repo), "
"or pass --baseline <local-dir>."
)
return 0
print(f" Resolved baseline from HF cache: {baseline_path}")

bf16_subdirs = _transformer_subdirs(baseline_path)
bf16_size = sum(_disk_size_gib(p) for p in bf16_subdirs)
if bf16_size == 0:
print(f"[C] WARN — baseline transformer dir empty: {baseline_path}")
return 0

bf16_label = " + ".join(p.name for p in bf16_subdirs)
reduction = (1 - fp8_size / bf16_size) * 100
print(f"[C] BF16 baseline transformer disk size ({bf16_label}): {bf16_size:.2f} GiB ({baseline_path})")
print(f"[C] Disk reduction: {reduction:.1f}% (FP8 is {fp8_size / bf16_size:.0%} of BF16)")
if reduction < 30:
print("[C] WARN — FP8 should typically reduce disk by ~40-50%; <30% suggests partial quantization.")

# Whole-repo view: includes VAE / text_encoder / tokenizer / scheduler /
# top-level metadata. Quantization only touches transformer(s) so this
# reduction is always smaller than the transformer-only one — but it's
# what the deployment footprint actually is.
fp8_total = _disk_size_gib(fp8_root)
bf16_total = _disk_size_gib(baseline_path)
if bf16_total > 0:
total_reduction = (1 - fp8_total / bf16_total) * 100
print(
f"[C] Whole-repo: FP8 {fp8_total:.2f} GiB / BF16 {bf16_total:.2f} GiB "
f"(reduction {total_reduction:.1f}%, deployment footprint)"
)
return 0


def main() -> None:
p = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
p.add_argument("--output", required=True, help="Path to the exported ModelOpt FP8 checkpoint root.")
p.add_argument(
"--baseline",
default=None,
help="Optional BF16 baseline (local diffusers dir or HF id) for disk-size comparison.",
)
args = p.parse_args()

out_root = Path(args.output).expanduser().resolve()
transformer_dir = out_root / "transformer"
if not transformer_dir.exists():
print(f"[FAIL] {transformer_dir} does not exist.")
sys.exit(1)

print(f"Checking: {out_root}\n")

fail = 0
fail |= _check_config(transformer_dir)
fail |= _check_safetensors(transformer_dir)
_check_size_vs_baseline(transformer_dir, args.baseline)

print()
if fail == 0:
print("=" * 60)
print("ALL CHECKS PASSED — checkpoint looks ready for vllm-omni serving.")
elif fail == 1:
print("=" * 60)
print("FAILURES detected — calibration may need to be re-run.")
sys.exit(1)
else:
print("=" * 60)
print("WARNINGS only — checkpoint may serve but with caveats. See [A] above.")


if __name__ == "__main__":
main()
Loading
Loading