diff --git a/examples/text-generation/quantization_config/unit_scale_quant.json b/examples/text-generation/quantization_config/unit_scale_quant.json index 216cf27e68..20783ea3f1 100644 --- a/examples/text-generation/quantization_config/unit_scale_quant.json +++ b/examples/text-generation/quantization_config/unit_scale_quant.json @@ -3,5 +3,10 @@ "mode": "QUANTIZE", "observer": "maxabs", "scale_method": "unit_scale", - "dump_stats_path": "./hqt_output/measure" + "whitelist": {"types": [], "names": []}, + "blacklist": {"types": [], "names": []}, + "quantize_weight": false, + "dump_stats_path": "./results/hk", + "ignore_modules_wo_measures": "True", + "dump_stats_xlsx_path": "./run_outputs/fp8stats.xlsx" } diff --git a/examples/text-generation/utils.py b/examples/text-generation/utils.py index c4bd2147ac..7a4ae91425 100644 --- a/examples/text-generation/utils.py +++ b/examples/text-generation/utils.py @@ -430,7 +430,12 @@ def setup_distributed_model(args, model_dtype, model_kwargs, logger): logger.info("DeepSpeed is enabled.") deepspeed.init_distributed(dist_backend="hccl") config = AutoConfig.from_pretrained(args.model_name_or_path, torch_dtype=model_dtype, **model_kwargs) - load_to_meta = model_on_meta(config) + + keep_module_on_host = False + if "Llama-3.1-405B" in args.model_name_or_path: + keep_module_on_host = True + + load_to_meta = False if keep_module_on_host else model_on_meta(config) if args.assistant_model is None: assistant_model = None @@ -485,6 +490,7 @@ def setup_distributed_model(args, model_dtype, model_kwargs, logger): # Initialize the model ds_inference_kwargs = {"dtype": model_dtype} + ds_inference_kwargs["keep_module_on_host"] = keep_module_on_host ds_inference_kwargs["tensor_parallel"] = {"tp_size": args.world_size} ds_inference_kwargs["enable_cuda_graph"] = args.use_hpu_graphs ds_inference_kwargs["injection_policy"] = get_ds_injection_policy(config)