From 5d6ccf78764a1f4f8b7b36f0d835ece1b5df6213 Mon Sep 17 00:00:00 2001
From: Jan Lasek
Date: Tue, 1 Oct 2024 14:51:00 +0200
Subject: [PATCH 1/6] Fix loading legacy checkpoints

Signed-off-by: Jan Lasek
---
 nemo/deploy/nlp/megatronllm_deployable.py | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/nemo/deploy/nlp/megatronllm_deployable.py b/nemo/deploy/nlp/megatronllm_deployable.py
index 1fe029f9fade..500246b692c1 100644
--- a/nemo/deploy/nlp/megatronllm_deployable.py
+++ b/nemo/deploy/nlp/megatronllm_deployable.py
@@ -33,6 +33,18 @@ from nemo.deploy import ITritonDeployable
 from nemo.deploy.utils import cast_output, str_ndarray2list
 
+try:
+    from megatron.core.dist_checkpointing.validation import StrictHandling
+
+    HAVE_MEGATRON_CORE = True
+
+except (ImportError, ModuleNotFoundError) as e:
+
+    HAVE_MEGATRON_CORE = False
+    IMPORT_ERROR = (
+        "megatron-core was not found. Please see the NeMo README for installation instructions: https://github.com/NVIDIA/NeMo#megatron-core."
+        f" Exact error: {e}"
+    )
 
 @wrapt.decorator
 def noop_decorator(func):
     def wrapper(*args, **kwargs):
@@ -99,6 +111,8 @@ def __init__(
         num_nodes: int = 1,
         existing_model: MegatronGPTModel = None,
     ):
+        if not HAVE_MEGATRON_CORE:
+            raise ImportError(IMPORT_ERROR)
         if nemo_checkpoint_filepath is None and existing_model is None:
             raise ValueError(
                 "MegatronLLMDeployable requires either a .nemo checkpoint filepath or an existing MegatronGPTModel, but both provided were None"
@@ -142,6 +156,7 @@ def _load_from_nemo_checkpoint(self, nemo_checkpoint_filepath: str, num_devices:
             # had to override these to make Nemotron3-22B work, see sample_sequence_batch() in text_generation_utils.py
             custom_config.activations_checkpoint_granularity = None
             custom_config.activations_checkpoint_method = None
+            custom_config.dist_ckpt_load_strictness = StrictHandling.LOG_ALL.value
 
             self.model = MegatronGPTModel.restore_from(
                 nemo_checkpoint_filepath, trainer=trainer, override_config_path=custom_config

From 01e680ddc730d703dd7d255d4727857806076d76 Mon Sep 17 00:00:00 2001
From: Jan Lasek
Date: Tue, 1 Oct 2024 14:52:47 +0200
Subject: [PATCH 2/6] Fix inference issues for FP8-trained models

Signed-off-by: Jan Lasek
---
 nemo/deploy/nlp/megatronllm_deployable.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/nemo/deploy/nlp/megatronllm_deployable.py b/nemo/deploy/nlp/megatronllm_deployable.py
index 500246b692c1..6c5624aeb0b1 100644
--- a/nemo/deploy/nlp/megatronllm_deployable.py
+++ b/nemo/deploy/nlp/megatronllm_deployable.py
@@ -157,6 +157,8 @@ def _load_from_nemo_checkpoint(self, nemo_checkpoint_filepath: str, num_devices:
             custom_config.activations_checkpoint_granularity = None
             custom_config.activations_checkpoint_method = None
             custom_config.dist_ckpt_load_strictness = StrictHandling.LOG_ALL.value
+            if custom_config.get("fp8", False):
+                custom_config.fp8 = False
 
             self.model = MegatronGPTModel.restore_from(
                 nemo_checkpoint_filepath, trainer=trainer, override_config_path=custom_config

From 1938b3875cb10c7bbb0b0081b5c3dbeba4c26f13 Mon Sep 17 00:00:00 2001
From: janekl
Date: Tue, 1 Oct 2024 16:46:29 +0000
Subject: [PATCH 3/6] Apply isort and black reformatting

Signed-off-by: janekl
---
 nemo/deploy/nlp/megatronllm_deployable.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/nemo/deploy/nlp/megatronllm_deployable.py b/nemo/deploy/nlp/megatronllm_deployable.py
index 6c5624aeb0b1..eb5762496fb6 100644
--- a/nemo/deploy/nlp/megatronllm_deployable.py
+++ b/nemo/deploy/nlp/megatronllm_deployable.py
@@ -46,6 +46,7 @@
         f" Exact error: {e}"
     )
 
+
 @wrapt.decorator
 def noop_decorator(func):
     def wrapper(*args, **kwargs):
From 87bcae68f5be14a2e04983d0615456788340226b Mon Sep 17 00:00:00 2001
From: Jan Lasek
Date: Tue, 1 Oct 2024 18:53:57 +0200
Subject: [PATCH 4/6] Comment on TE shape constraints during inference

Signed-off-by: Jan Lasek
---
 nemo/deploy/nlp/megatronllm_deployable.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/nemo/deploy/nlp/megatronllm_deployable.py b/nemo/deploy/nlp/megatronllm_deployable.py
index eb5762496fb6..6ceb436e8077 100644
--- a/nemo/deploy/nlp/megatronllm_deployable.py
+++ b/nemo/deploy/nlp/megatronllm_deployable.py
@@ -159,6 +159,8 @@ def _load_from_nemo_checkpoint(self, nemo_checkpoint_filepath: str, num_devices:
             custom_config.activations_checkpoint_method = None
             custom_config.dist_ckpt_load_strictness = StrictHandling.LOG_ALL.value
             if custom_config.get("fp8", False):
+                # Need to disable FP8 for in-framework inference due to shape constraints imposed by TE,
+                # see https://github.com/NVIDIA/TransformerEngine/blob/v1.8/transformer_engine/pytorch/utils.py#L229
                 custom_config.fp8 = False
 
             self.model = MegatronGPTModel.restore_from(

From 8e1e1c8b4386d62cfb3f8ddafe726f1a1b5f4e73 Mon Sep 17 00:00:00 2001
From: Jan Lasek
Date: Wed, 2 Oct 2024 01:19:59 -0700
Subject: [PATCH 5/6] Simplify import error handling

Signed-off-by: Jan Lasek
---
 nemo/deploy/nlp/megatronllm_deployable.py | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/nemo/deploy/nlp/megatronllm_deployable.py b/nemo/deploy/nlp/megatronllm_deployable.py
index 6ceb436e8077..a9e3740c5ec2 100644
--- a/nemo/deploy/nlp/megatronllm_deployable.py
+++ b/nemo/deploy/nlp/megatronllm_deployable.py
@@ -41,10 +41,7 @@
 except (ImportError, ModuleNotFoundError) as e:
 
     HAVE_MEGATRON_CORE = False
-    IMPORT_ERROR = (
-        "megatron-core was not found. Please see the NeMo README for installation instructions: https://github.com/NVIDIA/NeMo#megatron-core."
-        f" Exact error: {e}"
-    )
+    IMPORT_ERROR = e
 
 
 @wrapt.decorator
@@ -113,7 +110,7 @@ def __init__(
         existing_model: MegatronGPTModel = None,
     ):
         if not HAVE_MEGATRON_CORE:
-            raise ImportError(IMPORT_ERROR)
+            raise IMPORT_ERROR
         if nemo_checkpoint_filepath is None and existing_model is None:
             raise ValueError(
                 "MegatronLLMDeployable requires either a .nemo checkpoint filepath or an existing MegatronGPTModel, but both provided were None"
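Taken together, patches 1 and 5 leave the module with a lazy guarded-import pattern: the megatron-core import is attempted once at module load time, and the captured exception is re-raised only when MegatronLLMDeployable is actually constructed. A condensed sketch of that end state, assembled from the hunks above rather than quoted verbatim:

    try:
        # Optional dependency: only needed once MegatronLLMDeployable is used.
        from megatron.core.dist_checkpointing.validation import StrictHandling

        HAVE_MEGATRON_CORE = True
    except (ImportError, ModuleNotFoundError) as e:
        HAVE_MEGATRON_CORE = False
        IMPORT_ERROR = e  # keep the original exception for a faithful traceback


    class MegatronLLMDeployable:
        def __init__(self, nemo_checkpoint_filepath=None, existing_model=None):
            if not HAVE_MEGATRON_CORE:
                # Surface the import failure at first real use, with the original error.
                raise IMPORT_ERROR

This keeps nemo.deploy.nlp importable on machines without megatron-core while still failing loudly, with the original error, as soon as the deployable is used.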
From b29975fd4fb58ab0caacbcebacf9b44c223b03c7 Mon Sep 17 00:00:00 2001
From: Jan Lasek
Date: Wed, 2 Oct 2024 15:48:08 +0200
Subject: [PATCH 6/6] Comment on issues

Signed-off-by: Jan Lasek
---
 nemo/deploy/nlp/megatronllm_deployable.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/nemo/deploy/nlp/megatronllm_deployable.py b/nemo/deploy/nlp/megatronllm_deployable.py
index a9e3740c5ec2..64cf6114ceba 100644
--- a/nemo/deploy/nlp/megatronllm_deployable.py
+++ b/nemo/deploy/nlp/megatronllm_deployable.py
@@ -154,10 +154,13 @@ def _load_from_nemo_checkpoint(self, nemo_checkpoint_filepath: str, num_devices:
             # had to override these to make Nemotron3-22B work, see sample_sequence_batch() in text_generation_utils.py
             custom_config.activations_checkpoint_granularity = None
             custom_config.activations_checkpoint_method = None
+            # Models trained with TE < 1.10 and loaded with TE >= 1.10 require
+            # special handling when loading the checkpoint due to structural updates
             custom_config.dist_ckpt_load_strictness = StrictHandling.LOG_ALL.value
             if custom_config.get("fp8", False):
                 # Need to disable FP8 for in-framework inference due to shape constraints imposed by TE,
-                # see https://github.com/NVIDIA/TransformerEngine/blob/v1.8/transformer_engine/pytorch/utils.py#L229
+                # see https://github.com/NVIDIA/TransformerEngine/blob/v1.10/transformer_engine/pytorch/utils.py#L229
+                LOGGER.warning("Disabling FP8 inference due to shape constraints imposed by Transformer Engine.")
                 custom_config.fp8 = False
 
             self.model = MegatronGPTModel.restore_from(
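With the full series applied, a legacy or FP8-trained .nemo checkpoint should load without manual config surgery: structural mismatches in the distributed checkpoint are logged instead of raised (StrictHandling.LOG_ALL), and FP8 is switched off for in-framework inference with a warning. A minimal usage sketch; the checkpoint path is hypothetical and the keyword defaults are assumed from the signatures visible in the hunks above:

    from nemo.deploy.nlp.megatronllm_deployable import MegatronLLMDeployable

    # Hypothetical path to an older or FP8-trained checkpoint.
    deployable = MegatronLLMDeployable(
        nemo_checkpoint_filepath="/checkpoints/legacy_model.nemo",
        num_devices=1,
        num_nodes=1,
    )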