Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
4cbe4eb
[model, refactor] refactor: Centralize provider_bridge config mapping…
yaoyu-33 Feb 4, 2026
7cf6851
[recipe, training] fix: Correct adam_eps default and add non-default …
yaoyu-33 Feb 4, 2026
70436be
Fix performance config scripts for parameterless recipe API (#2201)
yaoyu-33 Feb 5, 2026
d4fd66a
[model, refactor] refactor: Centralize provider_bridge config mapping…
yaoyu-33 Feb 10, 2026
199723c
fix: call apply_flex_dispatcher_backend with correct backend in MoE p…
yaoyu-33 Feb 11, 2026
6f6a4d8
kimi k2 recipe intro (#2097)
malay-nagda Feb 11, 2026
5ec81cb
nemotron3 nano recipes (#2301)
malay-nagda Feb 19, 2026
f29a269
Update Qwen3 30B H100 Base Configs with HybridEP (#2477)
rhmukundan Feb 23, 2026
f10feb7
Fix DeepSeek-V3 H100 large scale config (#2401)
scsudhakaran Feb 23, 2026
cb434ff
fix: all2all for qwen3-next H100 (#2479)
ko3n1g Feb 23, 2026
08de997
[model,recipe] fix: Correct DeepSeek num_query_groups mapping and mig…
yaoyu-33 Feb 20, 2026
249869f
Malay/cp sft perf 2602 patch (#2527)
malay-nagda Feb 25, 2026
b1b0d2d
[model, refactor] refactor: Centralize provider_bridge config mapping…
yaoyu-33 Feb 25, 2026
e7ee97c
Fix LLAMA3 LoRa TFLOPs Formula (#2416)
rhmukundan Feb 25, 2026
0e46b18
Revert "Fix LLAMA3 LoRa TFLOPs Formula (#2416)"
ko3n1g Feb 25, 2026
6cf5879
Update Nemotron 3 Nano perf configs (#2510) (#2560)
malay-nagda Feb 26, 2026
5db8d13
scaling up gbs as number of gpus scales up (#2553)
rsalagame-nvidia Feb 26, 2026
57cbd51
chore(fix): Deployment parallelism (#2189)
ko3n1g Feb 17, 2026
605384b
chore: cherry-pick SFT/PEFT recipe support with cfg.validation -> cfg…
yaoyu-33 Feb 26, 2026
f56e2b0
[OMNIML2914] Support Nemotron-3-Nano PTQ, TE spec migration, and VLM …
yueshen2016 Feb 26, 2026
a479b03
cp: `Bump NVRX`
ko3n1g Feb 26, 2026
388d017
Malay/cp pr 2499 (#2585)
malay-nagda Feb 27, 2026
7cacc75
Onboarding LLAMA3 70B LoRa to B300 and B200 chips (#2397) (#2588)
malay-nagda Feb 27, 2026
fd58a55
Fix lint and import error in perf script llama3_llm_finetune.py (#259…
malay-nagda Mar 2, 2026
a4c7556
nemotron3_nano_h100_fix_260201 (#2617)
malay-nagda Mar 3, 2026
2ff4462
Tune kimi-k2 GB300 MXFP8 recipe (#2590)
dingqingy-nv Mar 3, 2026
a727503
Merge remote-tracking branch 'origin/r0.3.0' into ko3n1g/cp/260201
ko3n1g Mar 3, 2026
e39b6da
Update the LoRa TFLOPs Formula Fix without any hardcoding of values …
rhmukundan Mar 3, 2026
8fe60aa
fix test
ko3n1g Mar 3, 2026
44468af
bump modelopt
ko3n1g Mar 3, 2026
7a8ff8d
disable
ko3n1g Mar 3, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions .github/workflows/cicd-main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -393,10 +393,11 @@ jobs:
- script: L2_Launch_models_nemotron_vl
- script: L2_Launch_models_olmoe
- script: L2_Launch_models_qwen
# - script: L2_Launch_models_qwen_quantization
- script: L2_Launch_models_qwen_quantization
- script: L2_Launch_models_qwen_vl
- script: L2_Launch_recipes_gemma_vl
- script: L2_Launch_recipes_gpt_oss
- script: L2_Launch_models_qwen_vl_quantization
- script: L2_Launch_recipes_llama_1b
- script: L2_Launch_recipes_llama_3b
- script: L2_Launch_recipes_llama_distill
Expand All @@ -405,7 +406,7 @@ jobs:
- script: L2_Launch_data
- script: L2_Launch_post_training_quantization
- script: L2_Launch_quantization_aware_training
- script: L2_Launch_quantization_export
# - script: L2_Launch_quantization_export
- script: L2_Launch_recipes_llama_cuda_graphs
needs: [pre-flight, cicd-unit-tests]
runs-on: ${{ needs.pre-flight.outputs.runner_prefix }}-gpu-x2
Expand Down
5 changes: 1 addition & 4 deletions examples/evaluation/deploy.sh
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,4 @@ python \
--host 0.0.0.0 \
--port 8000 \
--num_gpus "$NUM_GPUS" \
--num_replicas "$NUM_REPLICAS" \
--tensor_model_parallel_size 1 \
--pipeline_model_parallel_size 1 \
--context_parallel_size 1
--num_replicas "$NUM_REPLICAS"
11 changes: 10 additions & 1 deletion examples/evaluation/launch_evaluation_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,16 @@ def main(args):
executor=executor,
)
job.start(
command=f"bash /opt/Megatron-Bridge/examples/evaluation/deploy.sh {args.megatron_checkpoint} {args.num_replicas} {args.num_gpus} | tee -a deploy.log & sleep 120; bash /opt/Megatron-Bridge/examples/evaluation/eval.sh {args.output_dir} {args.parallelism} | tee -a eval.log",
command=f"""
bash /opt/Megatron-Bridge/examples/evaluation/deploy.sh \
{args.megatron_checkpoint} \
{args.num_replicas} \
{args.num_gpus}| tee -a deploy.log & \
sleep 120; \
bash /opt/Megatron-Bridge/examples/evaluation/eval.sh \
{args.output_dir} \
{args.parallelism} | tee -a eval.log
""",
workdir=None,
)

Expand Down
17 changes: 16 additions & 1 deletion examples/quantization/export.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@

from megatron.bridge import AutoBridge
from megatron.bridge.models.decorators import torchrun_main
from megatron.bridge.models.hf_pretrained.utils import is_safe_repo


warnings.filterwarnings("ignore")
Expand All @@ -61,6 +62,7 @@ def main(
export_dir: str = "./hf_export",
export_extra_modules: bool = False,
dtype: str = "bfloat16",
trust_remote_code: bool | None = None,
) -> None:
"""Export a quantized Megatron-LM checkpoint to HuggingFace format on multiple GPUs."""
if os.environ.get("WORLD_SIZE") is None:
Expand All @@ -78,7 +80,13 @@ def main(
sys.exit(1)

# Initialize bridge from HF model to get tokenizer and model structure
bridge = AutoBridge.from_hf_pretrained(hf_model_id)
bridge = AutoBridge.from_hf_pretrained(
hf_model_id,
trust_remote_code=is_safe_repo(
trust_remote_code=trust_remote_code,
hf_path=hf_model_id,
),
)

# Get model provider and configure for multi-GPU execution
model_provider = bridge.to_megatron_provider(load_weights=False)
Expand Down Expand Up @@ -152,6 +160,7 @@ def main(
export_extra_modules=export_extra_modules_flag,
dtype=torch_dtype,
export_dir=export_dir,
trust_remote_code=is_safe_repo(trust_remote_code=trust_remote_code, hf_path=hf_model_id),
)

if is_rank_0:
Expand Down Expand Up @@ -195,6 +204,11 @@ def main(
choices=["bfloat16", "float16", "float32"],
help="Data type for export",
)
parser.add_argument(
"--trust-remote-code",
action="store_true",
help="if trust_remote_code",
)

args = parser.parse_args()
main(
Expand All @@ -207,6 +221,7 @@ def main(
args.export_dir,
args.export_extra_modules,
args.dtype,
args.trust_remote_code,
)

if torch.distributed.is_initialized():
Expand Down
13 changes: 8 additions & 5 deletions examples/quantization/pretrain_quantized_llama3_8b.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,14 +152,17 @@ def main() -> None:
logger.info("------------------------------------------------------------------")

# Load base configuration from the recipe as a Python dataclass
# If --hf-path is provided, pass it to the recipe function
recipe_kwargs = {}
# Pretrain configs use parameterless API
cfg: ConfigContainer = pretrain_config()
logger.info("Loaded base configuration")

# If --hf-path is provided, override the model's HuggingFace path
if args.hf_path:
logger.info(f"Using custom HuggingFace path: {args.hf_path}")
recipe_kwargs["hf_path"] = args.hf_path
# Import AutoBridge to create a new model provider with the custom HF path
from megatron.bridge.models import AutoBridge

cfg: ConfigContainer = pretrain_config(**recipe_kwargs)
logger.info("Loaded base configuration")
cfg.model = AutoBridge.from_hf_pretrained(args.hf_path).to_megatron_provider(load_weights=False)
Comment on lines 160 to +165
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

🧩 Analysis chain

🏁 Script executed:

#!/bin/bash
# Search for AutoBridge class and its methods
find . -type f -name "*.py" -path "*/megatron/bridge/models/*" | head -20

Repository: NVIDIA-NeMo/Megatron-Bridge

Length of output: 1120


🏁 Script executed:

#!/bin/bash
# Look for AutoBridge definition and from_hf_config method
rg "class AutoBridge|def from_hf_config|def from_hf_pretrained" --type=py -n

Repository: NVIDIA-NeMo/Megatron-Bridge

Length of output: 1977


🏁 Script executed:

#!/bin/bash
# Check the specific file for import placement
cat -n examples/quantization/pretrain_quantized_llama3_8b.py | sed -n '150,170p'

Repository: NVIDIA-NeMo/Megatron-Bridge

Length of output: 1095


🏁 Script executed:

#!/bin/bash
# Examine the from_hf_pretrained and from_hf_config implementations
sed -n '148,230p' src/megatron/bridge/models/conversion/auto_bridge.py

Repository: NVIDIA-NeMo/Megatron-Bridge

Length of output: 3593


🏁 Script executed:

#!/bin/bash
# Also check the to_megatron_provider implementation
rg "def to_megatron_provider" -A 20 --type=py

Repository: NVIDIA-NeMo/Megatron-Bridge

Length of output: 32005


🏁 Script executed:

#!/bin/bash
# Check what PreTrainedCausalLM.from_pretrained does and if it loads weights
rg "class AutoBridge" -A 50 src/megatron/bridge/models/conversion/auto_bridge.py | head -80

Repository: NVIDIA-NeMo/Megatron-Bridge

Length of output: 2443


🏁 Script executed:

#!/bin/bash
# Get the full implementation of from_hf_pretrained
sed -n '194,250p' src/megatron/bridge/models/conversion/auto_bridge.py

Repository: NVIDIA-NeMo/Megatron-Bridge

Length of output: 2566


🏁 Script executed:

#!/bin/bash
# Check if there's any rank guard usage in the AutoBridge or related code
rg "get_rank|rank_zero|dist.get_rank" src/megatron/bridge/models/conversion/ -A 2 -B 2

Repository: NVIDIA-NeMo/Megatron-Bridge

Length of output: 1787


🏁 Script executed:

#!/bin/bash
# Get the full to_megatron_provider implementation
sed -n '238,320p' src/megatron/bridge/models/conversion/auto_bridge.py

Repository: NVIDIA-NeMo/Megatron-Bridge

Length of output: 3635


🏁 Script executed:

#!/bin/bash
# Check if there are any attribute accesses on hf_pretrained that would fail if it's just a config
rg "self.hf_pretrained\." src/megatron/bridge/models/conversion/auto_bridge.py | head -20

Repository: NVIDIA-NeMo/Megatron-Bridge

Length of output: 528


🏁 Script executed:

#!/bin/bash
# Search for to_megatron_provider in AutoBridge specifically
rg "def to_megatron_provider" src/megatron/bridge/models/conversion/auto_bridge.py -A 80

Repository: NVIDIA-NeMo/Megatron-Bridge

Length of output: 4006


🏁 Script executed:

#!/bin/bash
# Check the config property and how hf_pretrained.config is accessed
rg "\.config" src/megatron/bridge/models/conversion/auto_bridge.py -B 2 -A 2

Repository: NVIDIA-NeMo/Megatron-Bridge

Length of output: 506


🏁 Script executed:

#!/bin/bash
# Check the beginning of the file to see how imports are organized
head -30 examples/quantization/pretrain_quantized_llama3_8b.py

Repository: NVIDIA-NeMo/Megatron-Bridge

Length of output: 1327


🏁 Script executed:

#!/bin/bash
# Verify if AutoBridge is already imported at the top
rg "^import|^from" examples/quantization/pretrain_quantized_llama3_8b.py | head -20

Repository: NVIDIA-NeMo/Megatron-Bridge

Length of output: 684


All distributed ranks load the full HF model unnecessarily — OOM risk.

AutoBridge.from_hf_pretrained(args.hf_path) loads the entire model with all weights via PreTrainedCausalLM.from_pretrained(), and because there is no rank guard, every torchrun worker independently loads the full model in parallel. For LLaMA 3-8B that is ~16 GB per rank; with 8 ranks this consumes ~128 GB of memory in aggregate and will cause OOM before pretraining starts.

Use AutoBridge.from_hf_config() instead to load only the configuration, then call to_megatron_provider(load_weights=False) to defer weight loading until after distributed initialization:

🛡️ Proposed fix
-        # Import AutoBridge to create a new model provider with the custom HF path
-        from megatron.bridge.models import AutoBridge
-
-        cfg.model = AutoBridge.from_hf_pretrained(args.hf_path).to_megatron_provider(load_weights=False)
+        # Import AutoBridge to create a new model provider with the custom HF path
+        from megatron.bridge.models import AutoBridge
+        from transformers import AutoConfig
+
+        hf_cfg = AutoConfig.from_pretrained(args.hf_path)
+        cfg.model = AutoBridge.from_hf_config(hf_cfg).to_megatron_provider(load_weights=False)

Also move the AutoBridge import to the top of the file with other megatron.bridge.* imports to follow the import organization guideline.

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `examples/quantization/pretrain_quantized_llama3_8b.py` around lines 160-165,
replace the blocking call to AutoBridge.from_hf_pretrained(args.hf_path)
(which loads full weights on every rank) with a config-only path: load the
model configuration via AutoConfig.from_pretrained(args.hf_path) and pass the
resulting config object to AutoBridge.from_hf_config(...), then call
.to_megatron_provider(load_weights=False) to defer weight loading until after
distributed initialization. Also move the `from megatron.bridge.models import
AutoBridge` import to the top of the file with the other megatron.bridge
imports to follow the import organization guideline.


# Print configuration on rank 0
if get_rank_safe() == 0:
Expand Down
41 changes: 30 additions & 11 deletions examples/quantization/ptq_generate.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,8 +58,9 @@ def _validate_quantized_model(model: torch.nn.Module, is_rank_0: bool) -> None:
If someone accidentally breaks the quantization loading logic (e.g., in
has_modelopt_state or build_and_load_model), this check will catch it.

We check for QuantRowParallelLinear and QuantColumnParallelLinear as these
are present in all quantized model architectures (GPT, Llama, Qwen, Nemotron-H, etc).
We check for quantized layer types that indicate successful quantization:
- Local spec: QuantRowParallelLinear, QuantColumnParallelLinear
- TE spec: QuantTERowParallelLinear, QuantTELayerNormColumnParallelLinear

Args:
model: The unwrapped model to validate
Expand All @@ -68,25 +69,36 @@ def _validate_quantized_model(model: torch.nn.Module, is_rank_0: bool) -> None:
Raises:
RuntimeError: If the model doesn't contain expected quantized layers
"""
# Check for quantized layer types that are universal across all architectures
model_str = str(model)

required_quant_layers = [
# Local spec quantized layers
local_spec_layers = [
"QuantRowParallelLinear",
"QuantColumnParallelLinear",
]

missing_layers = [layer for layer in required_quant_layers if layer not in model_str]
# TE spec quantized layers
te_spec_layers = [
"QuantTERowParallelLinear",
"QuantTELayerNormColumnParallelLinear",
]

# Check if model has local spec quantized layers
has_local_spec = all(layer in model_str for layer in local_spec_layers)

# Check if model has TE spec quantized layers
has_te_spec = all(layer in model_str for layer in te_spec_layers)

if missing_layers:
if not has_local_spec and not has_te_spec:
error_msg = (
f"\n{'=' * 80}\n"
f"QUANTIZATION VALIDATION FAILED!\n"
f"{'=' * 80}\n"
f"Expected quantized layers not found in the loaded model.\n"
f"This indicates the quantized checkpoint was not loaded correctly.\n\n"
f"Missing: {missing_layers}\n"
f"Expected: {required_quant_layers}\n\n"
f"Expected one of:\n"
f" - Local spec: {local_spec_layers}\n"
f" - TE spec: {te_spec_layers}\n\n"
f"This is likely due to a bug in the checkpoint loading logic.\n"
f"{'=' * 80}\n"
)
Expand All @@ -95,9 +107,16 @@ def _validate_quantized_model(model: torch.nn.Module, is_rank_0: bool) -> None:
raise RuntimeError(error_msg)

if is_rank_0:
console.print(
"[green]✓ Quantization validation passed: Found QuantRowParallelLinear and QuantColumnParallelLinear[/green]"
)
if has_te_spec:
console.print(
"[green]✓ Quantization validation passed: Found TE spec quantized layers "
"(QuantTERowParallelLinear, QuantTELayerNormColumnParallelLinear)[/green]"
)
else:
console.print(
"[green]✓ Quantization validation passed: Found local spec quantized layers "
"(QuantRowParallelLinear, QuantColumnParallelLinear)[/green]"
)


@torchrun_main
Expand Down
Loading