From 77b03d7e0d5b1eb32330d571c6df5818fbd09e5a Mon Sep 17 00:00:00 2001 From: ashors1 Date: Fri, 4 Apr 2025 17:20:36 -0700 Subject: [PATCH 01/23] make chat template configurable from config, save chat template as attribute of tokenizer Signed-off-by: ashors1 --- examples/configs/sft.yaml | 2 + examples/run_sft.py | 10 ++++- nemo_reinforcer/algorithms/sft.py | 3 ++ nemo_reinforcer/data/hf_datasets/__init__.py | 3 +- .../data/hf_datasets/chat_templates.py | 9 +++++ .../data/hf_datasets/interfaces.py | 39 ------------------- nemo_reinforcer/data/hf_datasets/oasst.py | 10 ++--- .../data/hf_datasets/openmathinstruct2.py | 9 +++-- nemo_reinforcer/data/hf_datasets/squad.py | 13 +++---- nemo_reinforcer/data/interfaces.py | 2 - nemo_reinforcer/data/llm_message_utils.py | 4 +- nemo_reinforcer/models/policy/hf_policy.py | 6 ++- 12 files changed, 47 insertions(+), 63 deletions(-) create mode 100644 nemo_reinforcer/data/hf_datasets/chat_templates.py delete mode 100644 nemo_reinforcer/data/hf_datasets/interfaces.py diff --git a/examples/configs/sft.yaml b/examples/configs/sft.yaml index e4b116a351..78cc1463d4 100644 --- a/examples/configs/sft.yaml +++ b/examples/configs/sft.yaml @@ -18,6 +18,8 @@ checkpointing: policy: model_name: "meta-llama/Llama-3.2-1B" + tokenizer: + chat_template: "{% for message in messages %}{%- if message['role'] == 'system' %}{{'Context: ' + message['content'].strip()}}{%- elif message['role'] == 'user' %}{{' Question: ' + message['content'].strip() + ' Answer:'}}{%- elif message['role'] == 'assistant' %}{{' ' + message['content'].strip()}}{%- endif %}{% endfor %}" train_global_batch_size: 32 train_micro_batch_size: 1 max_total_sequence_length: 1024 diff --git a/examples/run_sft.py b/examples/run_sft.py index 950938b4da..4a35d6a242 100644 --- a/examples/run_sft.py +++ b/examples/run_sft.py @@ -100,7 +100,15 @@ def setup_data(data_config: DataConfig, policy_config: PolicyConfig): val_dataset = data.formatted_ds["validation"] sft_task_spec = data.task_spec 
+ ## TODO: save tokenizer whenever we save an hf checkpoint tokenizer = AutoTokenizer.from_pretrained(policy_config["model_name"]) + if "chat_template" in policy_config["tokenizer"]: + if policy_config["tokenizer"]["chat_template"].lower() in {"none", "null"}: + tokenizer.chat_template = ( + hf_datasets.COMMON_CHAT_TEMPLATES.passthrough_prompt_response + ) + else: + tokenizer.chat_template = policy_config["tokenizer"]["chat_template"] train_dataset = AllTaskProcessedDataset( train_dataset, @@ -166,7 +174,7 @@ def main(): checkpointer, sft_save_state, master_config, - ) = setup(config, dataset, val_dataset) + ) = setup(config, dataset, val_dataset, tokenizer) sft_train( policy, train_dataloader, diff --git a/nemo_reinforcer/algorithms/sft.py b/nemo_reinforcer/algorithms/sft.py index 8f9e34f9da..fc327f3915 100644 --- a/nemo_reinforcer/algorithms/sft.py +++ b/nemo_reinforcer/algorithms/sft.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import os +from transformers import AutoTokenizer from pathlib import Path from typing import Optional, Tuple, TypedDict @@ -78,6 +79,7 @@ def setup( master_config: MasterConfig, train_dataset: AllTaskProcessedDataset, val_dataset: AllTaskProcessedDataset, + tokenizer: AutoTokenizer, ) -> Tuple[ HfPolicy, RayVirtualCluster, @@ -175,6 +177,7 @@ def setup( policy = HfPolicy( cluster=cluster, config=policy_config, + tokenizer=tokenizer, weights_path=Path(last_checkpoint_path) / "policy.pt" if last_checkpoint_path else None, diff --git a/nemo_reinforcer/data/hf_datasets/__init__.py b/nemo_reinforcer/data/hf_datasets/__init__.py index df1227140d..12d1a0b368 100644 --- a/nemo_reinforcer/data/hf_datasets/__init__.py +++ b/nemo_reinforcer/data/hf_datasets/__init__.py @@ -14,5 +14,6 @@ from nemo_reinforcer.data.hf_datasets.oasst import OasstDataset from nemo_reinforcer.data.hf_datasets.squad import SquadDataset +from nemo_reinforcer.data.hf_datasets.chat_templates import COMMON_CHAT_TEMPLATES -__all__ = ["OasstDataset", "SquadDataset"] +__all__ = ["OasstDataset", "SquadDataset", "COMMON_CHAT_TEMPLATES"] diff --git a/nemo_reinforcer/data/hf_datasets/chat_templates.py b/nemo_reinforcer/data/hf_datasets/chat_templates.py new file mode 100644 index 0000000000..0454ae68e4 --- /dev/null +++ b/nemo_reinforcer/data/hf_datasets/chat_templates.py @@ -0,0 +1,9 @@ +## a reference to frequently used chat templates for convenience +class COMMON_CHAT_TEMPLATES: + ### simple template which prepends a role header to the content + simple_role_header = "{% for message in messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}" + + ### passthrough template which just concatenates the content of the messages with no 
special tokens + passthough_prompt_response = ( + "{% for message in messages %}{{ message['content'] }}{% endfor %}" + ) diff --git a/nemo_reinforcer/data/hf_datasets/interfaces.py b/nemo_reinforcer/data/hf_datasets/interfaces.py deleted file mode 100644 index 63be96c5a7..0000000000 --- a/nemo_reinforcer/data/hf_datasets/interfaces.py +++ /dev/null @@ -1,39 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import Dict, Any, Optional -from nemo_reinforcer.data.interfaces import TaskDataSpec - - -class COMMON_CHAT_TEMPLATES: - ### simple template which prepends a role header to the content - simple_role_header = "{% for message in messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}" - - -class HfDataset: - """Interface for HuggingFace datasets.""" - - formatted_ds: Dict[str, Any] - - def __init__( - self, - dataset_name: str, - custom_template: Optional[ - str - ] = None, ## "None" means use HuggingFace's tokenizer's template - ): - self.task_spec = TaskDataSpec( - task_name=dataset_name, - custom_template=custom_template, - ) diff --git a/nemo_reinforcer/data/hf_datasets/oasst.py 
b/nemo_reinforcer/data/hf_datasets/oasst.py index 4525c0fa92..f4e5acecba 100644 --- a/nemo_reinforcer/data/hf_datasets/oasst.py +++ b/nemo_reinforcer/data/hf_datasets/oasst.py @@ -20,7 +20,8 @@ import copy from dataclasses import dataclass from typing import Optional -from nemo_reinforcer.data.hf_datasets.interfaces import HfDataset, COMMON_CHAT_TEMPLATES + +from nemo_reinforcer.data.interfaces import TaskDataSpec SYSTEM_PROMPT = "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.\n\n" @@ -119,10 +120,9 @@ def download_and_process_oasst(output_directory=".", seed=42, split_ratio=0.95): @dataclass -class OasstDataset(HfDataset): +class OasstDataset: def __init__(self, output_dir: str = "."): self.formatted_ds = download_and_process_oasst(output_dir) - super().__init__( - dataset_name="oasst", - custom_template=COMMON_CHAT_TEMPLATES.simple_role_header, + self.task_spec = TaskDataSpec( + task_name="OASST", ) diff --git a/nemo_reinforcer/data/hf_datasets/openmathinstruct2.py b/nemo_reinforcer/data/hf_datasets/openmathinstruct2.py index c3c5126263..57f7384568 100644 --- a/nemo_reinforcer/data/hf_datasets/openmathinstruct2.py +++ b/nemo_reinforcer/data/hf_datasets/openmathinstruct2.py @@ -14,9 +14,10 @@ from typing import Optional from datasets import load_dataset -from nemo_reinforcer.data.hf_datasets.interfaces import HfDataset from dataclasses import dataclass +from nemo_reinforcer.data.interfaces import TaskDataSpec + def format_math(data): return { @@ -62,7 +63,7 @@ def prepare_openinstructmath2_dataset(split: str = "train_1M", seed=42, test_siz @dataclass -class OpenMathInstruct2Dataset(HfDataset): +class OpenMathInstruct2Dataset: def __init__( self, split: str = "train_1M", seed: int = 42, test_size: float = 0.05 ): @@ -82,6 +83,6 @@ def __init__( split=split, seed=seed, test_size=test_size ) - super().__init__( - dataset_name="OpenMathInstruct-2", + 
self.task_spec = TaskDataSpec( + task_name="OpenMathInstruct-2", ) diff --git a/nemo_reinforcer/data/hf_datasets/squad.py b/nemo_reinforcer/data/hf_datasets/squad.py index 4ba3e87949..3bc257c88a 100644 --- a/nemo_reinforcer/data/hf_datasets/squad.py +++ b/nemo_reinforcer/data/hf_datasets/squad.py @@ -14,7 +14,8 @@ from typing import Optional from datasets import load_dataset -from nemo_reinforcer.data.hf_datasets.interfaces import HfDataset + +from nemo_reinforcer.data.interfaces import TaskDataSpec def format_squad(data): @@ -36,14 +37,10 @@ def format_squad(data): } -class SquadDataset(HfDataset): +class SquadDataset: def __init__(self): original_ds = load_dataset("rajpurkar/squad") self.formatted_ds = original_ds.map(format_squad) - - custom_template = "{% for message in messages %}{%- if message['role'] == 'system' %}{{'Context: ' + message['content'].strip()}}{%- elif message['role'] == 'user' %}{{' Question: ' + message['content'].strip() + ' Answer:'}}{%- elif message['role'] == 'assistant' %}{{' ' + message['content'].strip()}}{%- endif %}{% endfor %}" - - super().__init__( - dataset_name="squad", - custom_template=custom_template, + self.task_spec = TaskDataSpec( + task_name="SQuAD", ) diff --git a/nemo_reinforcer/data/interfaces.py b/nemo_reinforcer/data/interfaces.py index ade6f145e8..6ae1152be7 100644 --- a/nemo_reinforcer/data/interfaces.py +++ b/nemo_reinforcer/data/interfaces.py @@ -42,7 +42,6 @@ class TaskDataSpec: prompt_file: Optional[os.PathLike] = None system_prompt_file: Optional[Union[str, os.PathLike]] = None - custom_template: Optional[Union[str, os.PathLike]] = None def __post_init__(self): def load_prompt_file( @@ -66,7 +65,6 @@ def copy_defaults(self, from_spec: "TaskDataSpec"): default_attrs = { "system_prompt": from_spec.system_prompt, "prompt": from_spec.prompt, - "custom_template": from_spec.custom_template, } for attr_name, default_value in default_attrs.items(): diff --git a/nemo_reinforcer/data/llm_message_utils.py 
b/nemo_reinforcer/data/llm_message_utils.py index 93f6ddc50b..b14956cdbd 100644 --- a/nemo_reinforcer/data/llm_message_utils.py +++ b/nemo_reinforcer/data/llm_message_utils.py @@ -355,12 +355,10 @@ def get_formatted_message_log( """ new_message_log = [] prev_formatted_message = "" - template = task_data_spec.custom_template for i, message in enumerate(message_log): formatted_message = tokenizer.apply_chat_template( message_log[: i + 1], - chat_template=template, add_generation_prompt=False, tokenize=False, add_special_tokens=False, @@ -376,11 +374,13 @@ def get_formatted_message_log( message_chunk = formatted_message[prev_message_len_no_eos:] if tokenizer.bos_token is not None: + ## TODO: make configurable if i == 0 and not message_chunk.startswith(tokenizer.bos_token): message_chunk = tokenizer.bos_token + message_chunk if i == len(message_log) - 1: message_chunk = message_chunk.rstrip("\n") + ## TODO: make configurable if tokenizer.eos_token is not None and not message_chunk.endswith( tokenizer.eos_token ): diff --git a/nemo_reinforcer/models/policy/hf_policy.py b/nemo_reinforcer/models/policy/hf_policy.py index 3a316ba3ae..ceed86b213 100644 --- a/nemo_reinforcer/models/policy/hf_policy.py +++ b/nemo_reinforcer/models/policy/hf_policy.py @@ -65,6 +65,7 @@ def __repr__(self): def __init__( self, config: PolicyConfig, + tokenizer: AutoTokenizer, weights_path: Optional[str] = None, optimizer_path: Optional[str] = None, init_optimizer: bool = True, @@ -97,7 +98,8 @@ def __init__( ) else: self.reference_model = None - self.tokenizer = AutoTokenizer.from_pretrained(model_name) + + self.tokenizer = tokenizer # If no pad token is defined, you might need: if self.tokenizer.pad_token is None: self.tokenizer.pad_token = self.tokenizer.eos_token @@ -895,6 +897,7 @@ def __init__( self, cluster: RayVirtualCluster, config: PolicyConfig, + tokenizer: AutoTokenizer, name_prefix: str = "hf_policy", workers_per_node: Optional[Union[int, List[int]]] = None, init_optimizer: bool = 
True, @@ -910,6 +913,7 @@ def __init__( worker_builder = RayWorkerBuilder( HfPolicyWorker, config, + tokenizer=tokenizer, init_optimizer=init_optimizer, weights_path=weights_path, optimizer_path=optimizer_path, From 4f171273a095359ca5ed6c09afa782d10f52ef23 Mon Sep 17 00:00:00 2001 From: ashors1 Date: Mon, 7 Apr 2025 15:36:46 -0700 Subject: [PATCH 02/23] save hf tokenizer Signed-off-by: ashors1 --- nemo_reinforcer/models/policy/hf_policy.py | 1 + nemo_reinforcer/utils/native_checkpoint.py | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/nemo_reinforcer/models/policy/hf_policy.py b/nemo_reinforcer/models/policy/hf_policy.py index f69be02aa7..73c7346440 100644 --- a/nemo_reinforcer/models/policy/hf_policy.py +++ b/nemo_reinforcer/models/policy/hf_policy.py @@ -858,6 +858,7 @@ def save_checkpoint( optimizer_path=optimizer_path, save_torch_dist=save_torch_dist, save_hf=save_hf, + tokenizer=self.tokenizer if save_hf else None, ) def load_checkpoint(self, weights_path: str, optimizer_path: Optional[str] = None): diff --git a/nemo_reinforcer/utils/native_checkpoint.py b/nemo_reinforcer/utils/native_checkpoint.py index 6f22ea82fd..fbf13e8fb2 100644 --- a/nemo_reinforcer/utils/native_checkpoint.py +++ b/nemo_reinforcer/utils/native_checkpoint.py @@ -137,6 +137,7 @@ def save_checkpoint( optimizer_path: Optional[str] = None, save_torch_dist: bool = True, save_hf: bool = False, + tokenizer: Optional[Any] = None, ) -> None: """Save a checkpoint of the model and optionally optimizer state. 
@@ -161,6 +162,9 @@ def save_checkpoint( state_dict=model_state_dict, ) + if tokenizer is not None: + tokenizer.save_pretrained(hf_weights_path) + if save_torch_dist: model_state = {"model": ModelState(model)} dcp.save(model_state, checkpoint_id=weights_path) From 525296a30cf7c82b035e1f167a9a71504022e716 Mon Sep 17 00:00:00 2001 From: ashors1 Date: Fri, 4 Apr 2025 11:21:05 -0700 Subject: [PATCH 03/23] add sft example with json data Signed-off-by: ashors1 --- examples/run_sft.py | 2 ++ nemo_reinforcer/data/hf_datasets/__init__.py | 3 ++- .../data/hf_datasets/json_dataset.py | 26 +++++++++++++++++++ 3 files changed, 30 insertions(+), 1 deletion(-) create mode 100644 nemo_reinforcer/data/hf_datasets/json_dataset.py diff --git a/examples/run_sft.py b/examples/run_sft.py index bb5b4731c7..6e431c7ac4 100644 --- a/examples/run_sft.py +++ b/examples/run_sft.py @@ -102,6 +102,8 @@ def setup_data(tokenizer: AutoTokenizer, data_config: DataConfig): data = hf_datasets.OasstDataset(output_dir="/tmp/open_assistant") elif data_cls == "squad": data = hf_datasets.SquadDataset() + elif data_cls == "json_dataset": + data = hf_datasets.JsonDataset(data_config["data_path"]) else: raise ValueError(f"Unknown dataset class: {data_cls}") print( diff --git a/nemo_reinforcer/data/hf_datasets/__init__.py b/nemo_reinforcer/data/hf_datasets/__init__.py index 12d1a0b368..4eb0b68d37 100644 --- a/nemo_reinforcer/data/hf_datasets/__init__.py +++ b/nemo_reinforcer/data/hf_datasets/__init__.py @@ -12,8 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from nemo_reinforcer.data.hf_datasets.json_dataset import JsonDataset from nemo_reinforcer.data.hf_datasets.oasst import OasstDataset from nemo_reinforcer.data.hf_datasets.squad import SquadDataset from nemo_reinforcer.data.hf_datasets.chat_templates import COMMON_CHAT_TEMPLATES -__all__ = ["OasstDataset", "SquadDataset", "COMMON_CHAT_TEMPLATES"] +__all__ = ["JsonDataset", "OasstDataset", "SquadDataset", "COMMON_CHAT_TEMPLATES"] diff --git a/nemo_reinforcer/data/hf_datasets/json_dataset.py b/nemo_reinforcer/data/hf_datasets/json_dataset.py new file mode 100644 index 0000000000..1517a4e210 --- /dev/null +++ b/nemo_reinforcer/data/hf_datasets/json_dataset.py @@ -0,0 +1,26 @@ +from datasets import load_dataset +from nemo_reinforcer.data.hf_datasets.interfaces import HfDataset + + +class JsonDataset(HfDataset): + def __init__(self, path: str): + original_dataset = load_dataset("json", data_files=path)["train"] + formatted_dataset = original_dataset.map(self.add_messages_key) + + ## just duplicating the dataset for train and validation for simplicity + self.formatted_ds = { + "train": formatted_dataset, + "validation": formatted_dataset, + } + + super().__init__( + "json_dataset", + ) + + def add_messages_key(self, example): + return { + "messages": [ + {"role": "user", "content": example["input"]}, + {"role": "assistant", "content": example["output"]}, + ] + } From ce04b9bfbba9b7fe8a74f0a8abe371c71548ff1d Mon Sep 17 00:00:00 2001 From: ashors1 Date: Mon, 7 Apr 2025 16:58:30 -0700 Subject: [PATCH 04/23] improved configurability Signed-off-by: ashors1 --- examples/run_sft.py | 12 +++++++-- .../data/hf_datasets/json_dataset.py | 25 +++++++++++++------ nemo_reinforcer/data/llm_message_utils.py | 12 +++++---- 3 files changed, 35 insertions(+), 14 deletions(-) diff --git a/examples/run_sft.py b/examples/run_sft.py index 6e431c7ac4..f16e50ecfc 100644 --- a/examples/run_sft.py +++ b/examples/run_sft.py @@ -59,7 +59,11 @@ def sft_preprocessor( ) -> DatumSpec: """Process a 
datum dictionary for SFT training.""" message_log = get_formatted_message_log( - datum_dict["messages"], tokenizer, task_data_spec + datum_dict["messages"], + tokenizer, + task_data_spec, + add_bos_token=True, + add_eos_token=True, ) length = sum(len(m["token_ids"]) for m in message_log) @@ -103,7 +107,11 @@ def setup_data(tokenizer: AutoTokenizer, data_config: DataConfig): elif data_cls == "squad": data = hf_datasets.SquadDataset() elif data_cls == "json_dataset": - data = hf_datasets.JsonDataset(data_config["data_path"]) + data = hf_datasets.JsonDataset( + data_config["data_path"], + data_config["input_key"], + data_config["output_key"], + ) else: raise ValueError(f"Unknown dataset class: {data_cls}") print( diff --git a/nemo_reinforcer/data/hf_datasets/json_dataset.py b/nemo_reinforcer/data/hf_datasets/json_dataset.py index 1517a4e210..11dc61ba50 100644 --- a/nemo_reinforcer/data/hf_datasets/json_dataset.py +++ b/nemo_reinforcer/data/hf_datasets/json_dataset.py @@ -3,16 +3,27 @@ class JsonDataset(HfDataset): - def __init__(self, path: str): - original_dataset = load_dataset("json", data_files=path)["train"] - formatted_dataset = original_dataset.map(self.add_messages_key) + def __init__( + self, + train_ds_path: str, + val_ds_path: str, + input_key: str = "input", + output_key: str = "output", + ): + train_original_dataset = load_dataset("json", data_files=train_ds_path)["train"] + val_original_dataset = load_dataset("json", data_files=val_ds_path)["train"] + formatted_train_dataset = train_original_dataset.map(self.add_messages_key) + formatted_val_dataset = val_original_dataset.map(self.add_messages_key) ## just duplicating the dataset for train and validation for simplicity self.formatted_ds = { - "train": formatted_dataset, - "validation": formatted_dataset, + "train": formatted_train_dataset, + "validation": formatted_val_dataset, } + self.input_key = input_key + self.output_key = output_key + super().__init__( "json_dataset", ) @@ -20,7 +31,7 @@ def 
__init__(self, path: str): def add_messages_key(self, example): return { "messages": [ - {"role": "user", "content": example["input"]}, - {"role": "assistant", "content": example["output"]}, + {"role": "user", "content": example[self.input_key]}, + {"role": "assistant", "content": example[self.output_key]}, ] } diff --git a/nemo_reinforcer/data/llm_message_utils.py b/nemo_reinforcer/data/llm_message_utils.py index b14956cdbd..0ffbf30364 100644 --- a/nemo_reinforcer/data/llm_message_utils.py +++ b/nemo_reinforcer/data/llm_message_utils.py @@ -342,6 +342,8 @@ def get_formatted_message_log( message_log: LLMMessageLogType, tokenizer, task_data_spec: TaskDataSpec, + add_bos_token: bool = True, + add_eos_token: bool = True, ) -> LLMMessageLogType: """Format and tokenize chat messages using the specified template. @@ -373,16 +375,16 @@ def get_formatted_message_log( ## pull out the chunk corresponding to the current message message_chunk = formatted_message[prev_message_len_no_eos:] - if tokenizer.bos_token is not None: - ## TODO: make configurable + if add_bos_token and tokenizer.bos_token is not None: if i == 0 and not message_chunk.startswith(tokenizer.bos_token): message_chunk = tokenizer.bos_token + message_chunk if i == len(message_log) - 1: message_chunk = message_chunk.rstrip("\n") - ## TODO: make configurable - if tokenizer.eos_token is not None and not message_chunk.endswith( - tokenizer.eos_token + if ( + add_eos_token + and tokenizer.eos_token is not None + and not message_chunk.endswith(tokenizer.eos_token) ): message_chunk += tokenizer.eos_token From 1597d0fb124e66ca488c35e188906f71e6924608 Mon Sep 17 00:00:00 2001 From: ashors1 Date: Mon, 7 Apr 2025 17:01:10 -0700 Subject: [PATCH 05/23] fixes Signed-off-by: ashors1 --- examples/run_sft.py | 12 +++++---- nemo_reinforcer/data/hf_datasets/__init__.py | 11 ++++++-- .../data/hf_datasets/chat_templates.py | 16 ++++++++++- ..._dataset.py => prompt_response_dataset.py} | 27 ++++++++++++++----- 
nemo_reinforcer/data/llm_message_utils.py | 10 ++++--- nemo_reinforcer/models/policy/__init__.py | 7 ++++- nemo_reinforcer/models/policy/hf_policy.py | 4 +-- 7 files changed, 67 insertions(+), 20 deletions(-) rename nemo_reinforcer/data/hf_datasets/{json_dataset.py => prompt_response_dataset.py} (60%) diff --git a/examples/run_sft.py b/examples/run_sft.py index f16e50ecfc..b5159a9c64 100644 --- a/examples/run_sft.py +++ b/examples/run_sft.py @@ -29,6 +29,7 @@ from nemo_reinforcer.distributed.virtual_cluster import init_ray from nemo_reinforcer.utils.config import load_config from nemo_reinforcer.utils.logger import get_next_experiment_dir +from nemo_reinforcer.models.policy import TokenizerConfig def parse_args(): @@ -90,7 +91,7 @@ def sft_preprocessor( def setup_tokenizer(tokenizer_config: TokenizerConfig): tokenizer = get_tokenizer(tokenizer_config["name"]) if "chat_template" in tokenizer_config: - if tokenizer_config["chat_template"].lower() in {"none", "null"}: + if tokenizer_config["chat_template"] is None: tokenizer.chat_template = ( hf_datasets.COMMON_CHAT_TEMPLATES.passthrough_prompt_response ) @@ -106,9 +107,10 @@ def setup_data(tokenizer: AutoTokenizer, data_config: DataConfig): data = hf_datasets.OasstDataset(output_dir="/tmp/open_assistant") elif data_cls == "squad": data = hf_datasets.SquadDataset() - elif data_cls == "json_dataset": - data = hf_datasets.JsonDataset( - data_config["data_path"], + elif data_cls == "prompt_response_dataset": + data = hf_datasets.PromptResponseDataset( + data_config["train_data_path"], + data_config["val_data_path"], data_config["input_key"], data_config["output_key"], ) @@ -173,7 +175,7 @@ def main(): init_ray() # setup tokenizer - tokenizer = setup_tokenizer(config["tokenizer"]) + tokenizer = setup_tokenizer(config["policy"]["tokenizer"]) # setup data ( diff --git a/nemo_reinforcer/data/hf_datasets/__init__.py b/nemo_reinforcer/data/hf_datasets/__init__.py index 4eb0b68d37..919f1a494e 100644 --- 
a/nemo_reinforcer/data/hf_datasets/__init__.py +++ b/nemo_reinforcer/data/hf_datasets/__init__.py @@ -12,9 +12,16 @@ # See the License for the specific language governing permissions and # limitations under the License. -from nemo_reinforcer.data.hf_datasets.json_dataset import JsonDataset +from nemo_reinforcer.data.hf_datasets.prompt_response_dataset import ( + PromptResponseDataset, +) from nemo_reinforcer.data.hf_datasets.oasst import OasstDataset from nemo_reinforcer.data.hf_datasets.squad import SquadDataset from nemo_reinforcer.data.hf_datasets.chat_templates import COMMON_CHAT_TEMPLATES -__all__ = ["JsonDataset", "OasstDataset", "SquadDataset", "COMMON_CHAT_TEMPLATES"] +__all__ = [ + "OasstDataset", + "PromptResponseDataset", + "SquadDataset", + "COMMON_CHAT_TEMPLATES", +] diff --git a/nemo_reinforcer/data/hf_datasets/chat_templates.py b/nemo_reinforcer/data/hf_datasets/chat_templates.py index 0454ae68e4..8e7ad957a6 100644 --- a/nemo_reinforcer/data/hf_datasets/chat_templates.py +++ b/nemo_reinforcer/data/hf_datasets/chat_templates.py @@ -1,9 +1,23 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ ## a reference to frequently used chat templates for convenience class COMMON_CHAT_TEMPLATES: ### simple template which prepends a role header to the content simple_role_header = "{% for message in messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}" ### passthrough template which just concatenates the content of the messages with no special tokens - passthough_prompt_response = ( + passthrough_prompt_response = ( "{% for message in messages %}{{ message['content'] }}{% endfor %}" ) diff --git a/nemo_reinforcer/data/hf_datasets/json_dataset.py b/nemo_reinforcer/data/hf_datasets/prompt_response_dataset.py similarity index 60% rename from nemo_reinforcer/data/hf_datasets/json_dataset.py rename to nemo_reinforcer/data/hf_datasets/prompt_response_dataset.py index 11dc61ba50..c9ff14d348 100644 --- a/nemo_reinforcer/data/hf_datasets/json_dataset.py +++ b/nemo_reinforcer/data/hf_datasets/prompt_response_dataset.py @@ -1,8 +1,22 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ from datasets import load_dataset -from nemo_reinforcer.data.hf_datasets.interfaces import HfDataset +from nemo_reinforcer.data.interfaces import TaskDataSpec -class JsonDataset(HfDataset): +class PromptResponseDataset: def __init__( self, train_ds_path: str, @@ -12,6 +26,10 @@ def __init__( ): train_original_dataset = load_dataset("json", data_files=train_ds_path)["train"] val_original_dataset = load_dataset("json", data_files=val_ds_path)["train"] + + self.input_key = input_key + self.output_key = output_key + formatted_train_dataset = train_original_dataset.map(self.add_messages_key) formatted_val_dataset = val_original_dataset.map(self.add_messages_key) @@ -21,10 +39,7 @@ def __init__( "validation": formatted_val_dataset, } - self.input_key = input_key - self.output_key = output_key - - super().__init__( + self.task_spec = TaskDataSpec( "json_dataset", ) diff --git a/nemo_reinforcer/data/llm_message_utils.py b/nemo_reinforcer/data/llm_message_utils.py index 0ffbf30364..e88e0ab9d9 100644 --- a/nemo_reinforcer/data/llm_message_utils.py +++ b/nemo_reinforcer/data/llm_message_utils.py @@ -375,17 +375,21 @@ def get_formatted_message_log( ## pull out the chunk corresponding to the current message message_chunk = formatted_message[prev_message_len_no_eos:] - if add_bos_token and tokenizer.bos_token is not None: - if i == 0 and not message_chunk.startswith(tokenizer.bos_token): + if i == 0: + if ( + add_bos_token + and tokenizer.bos_token is not None + and not message_chunk.startswith(tokenizer.bos_token) + ): message_chunk = tokenizer.bos_token + message_chunk if i == len(message_log) - 1: - message_chunk = message_chunk.rstrip("\n") if ( add_eos_token and tokenizer.eos_token is not None and not message_chunk.endswith(tokenizer.eos_token) ): + message_chunk = message_chunk.rstrip("\n") message_chunk += tokenizer.eos_token new_message = message.copy() diff --git a/nemo_reinforcer/models/policy/__init__.py b/nemo_reinforcer/models/policy/__init__.py index 
24390b9670..43c9422e2a 100644 --- a/nemo_reinforcer/models/policy/__init__.py +++ b/nemo_reinforcer/models/policy/__init__.py @@ -17,9 +17,14 @@ from nemo_reinforcer.models.generation.interfaces import GenerationConfig +class TokenizerConfig(TypedDict): + name: str + chat_template: str + + class PolicyConfig(TypedDict): model_name: str - tokenizer_name: str + tokenizer: TokenizerConfig train_global_batch_size: int train_micro_batch_size: int learning_rate: float diff --git a/nemo_reinforcer/models/policy/hf_policy.py b/nemo_reinforcer/models/policy/hf_policy.py index 73c7346440..d7a0a50b0d 100644 --- a/nemo_reinforcer/models/policy/hf_policy.py +++ b/nemo_reinforcer/models/policy/hf_policy.py @@ -26,7 +26,7 @@ MixedPrecision, ) from torch.distributed.fsdp.wrap import size_based_auto_wrap_policy -from transformers import AutoModelForCausalLM +from transformers import AutoModelForCausalLM, AutoTokenizer from nemo_reinforcer.algorithms.interfaces import LossFunction from nemo_reinforcer.algorithms.utils import get_tokenizer @@ -80,7 +80,7 @@ def __init__( rank = torch.distributed.get_rank() world_size = torch.distributed.get_world_size() model_name = self.cfg["model_name"] - tokenizer_name = self.cfg["tokenizer_name"] + tokenizer_name = self.cfg["tokenizer"]["name"] if self.cfg["precision"] == "float32": self.dtype = torch.float32 elif self.cfg["precision"] == "bfloat16": From 70f11a3f563d0f10d11364b3329ddea73f8d4789 Mon Sep 17 00:00:00 2001 From: ashors1 Date: Mon, 7 Apr 2025 22:19:39 -0700 Subject: [PATCH 06/23] update grpo and clean up Signed-off-by: ashors1 --- examples/configs/grpo_math_1B.yaml | 3 ++- examples/configs/grpo_math_8B.yaml | 3 ++- examples/run_grpo_math.py | 4 +--- examples/run_sft.py | 17 ++--------------- nemo_reinforcer/algorithms/grpo.py | 1 + nemo_reinforcer/algorithms/sft.py | 2 +- nemo_reinforcer/algorithms/utils.py | 14 ++++++++++++-- 7 files changed, 21 insertions(+), 23 deletions(-) diff --git a/examples/configs/grpo_math_1B.yaml 
b/examples/configs/grpo_math_1B.yaml index 3d8fdfce43..d781b90414 100644 --- a/examples/configs/grpo_math_1B.yaml +++ b/examples/configs/grpo_math_1B.yaml @@ -25,7 +25,8 @@ checkpointing: policy: model_name: "meta-llama/Llama-3.2-1B-Instruct" - tokenizer_name: ${policy.model_name} ## specify if you'd like to use a tokenizer different from the model's default + tokenizer: + name: ${policy.model_name} ## specify if you'd like to use a tokenizer different from the model's default train_global_batch_size: 512 train_micro_batch_size: 4 generation_batch_size: 32 # Only used when generating using HF backend diff --git a/examples/configs/grpo_math_8B.yaml b/examples/configs/grpo_math_8B.yaml index f2e0576fbc..64e2f2aff5 100644 --- a/examples/configs/grpo_math_8B.yaml +++ b/examples/configs/grpo_math_8B.yaml @@ -7,7 +7,8 @@ grpo: policy: model_name: "meta-llama/Llama-3.1-8B-Instruct" - tokenizer_name: ${policy.model_name} ## specify if you'd like to use a tokenizer different from the model's default + tokenizer: + name: ${policy.model_name} ## specify if you'd like to use a tokenizer different from the model's default train_global_batch_size: 512 train_micro_batch_size: 1 generation_batch_size: 32 # Only used when generating using HF backend diff --git a/examples/run_grpo_math.py b/examples/run_grpo_math.py index c6dea5609c..e147d167f9 100644 --- a/examples/run_grpo_math.py +++ b/examples/run_grpo_math.py @@ -64,7 +64,6 @@ def openinstructmath2_data_processor( problem = user_message[0]["content"] extra_env_info = {"ground_truth": user_message[1]["content"]} - template = task_data_spec.custom_template message_log: LLMMessageLogType = [] user_message = { "role": "user", @@ -72,7 +71,6 @@ def openinstructmath2_data_processor( } message = tokenizer.apply_chat_template( [user_message], - chat_template=template, tokenize=False, add_generation_prompt=True, add_special_tokens=False, @@ -254,7 +252,7 @@ def main(): init_ray() # setup tokenizer - tokenizer = 
get_tokenizer(config["policy"]["model_name"]) + tokenizer = get_tokenizer(config["policy"]["tokenizer"]) config["policy"]["generation"] = configure_generation_config( config["policy"]["generation"], tokenizer ) diff --git a/examples/run_sft.py b/examples/run_sft.py index b5159a9c64..be56330c9a 100644 --- a/examples/run_sft.py +++ b/examples/run_sft.py @@ -29,7 +29,6 @@ from nemo_reinforcer.distributed.virtual_cluster import init_ray from nemo_reinforcer.utils.config import load_config from nemo_reinforcer.utils.logger import get_next_experiment_dir -from nemo_reinforcer.models.policy import TokenizerConfig def parse_args(): @@ -88,18 +87,6 @@ def sft_preprocessor( return output -def setup_tokenizer(tokenizer_config: TokenizerConfig): - tokenizer = get_tokenizer(tokenizer_config["name"]) - if "chat_template" in tokenizer_config: - if tokenizer_config["chat_template"] is None: - tokenizer.chat_template = ( - hf_datasets.COMMON_CHAT_TEMPLATES.passthrough_prompt_response - ) - else: - tokenizer.chat_template = tokenizer_config["chat_template"] - return tokenizer - - def setup_data(tokenizer: AutoTokenizer, data_config: DataConfig): print("\n▶ Setting up data...") data_cls = data_config["dataset_name"] @@ -175,7 +162,7 @@ def main(): init_ray() # setup tokenizer - tokenizer = setup_tokenizer(config["policy"]["tokenizer"]) + tokenizer = get_tokenizer(config["policy"]["tokenizer"]) # setup data ( @@ -194,7 +181,7 @@ def main(): checkpointer, sft_save_state, master_config, - ) = setup(config, dataset, val_dataset, tokenizer) + ) = setup(config, tokenizer, dataset, val_dataset) sft_train( policy, train_dataloader, diff --git a/nemo_reinforcer/algorithms/grpo.py b/nemo_reinforcer/algorithms/grpo.py index 0eda853375..a89a29fa97 100644 --- a/nemo_reinforcer/algorithms/grpo.py +++ b/nemo_reinforcer/algorithms/grpo.py @@ -236,6 +236,7 @@ def setup( policy = HfPolicy( cluster=cluster, config=policy_config, + tokenizer=tokenizer, weights_path=Path(last_checkpoint_path) / "policy" 
/ "weights" if last_checkpoint_path else None, diff --git a/nemo_reinforcer/algorithms/sft.py b/nemo_reinforcer/algorithms/sft.py index 76ec5777a4..0bb42bd27a 100644 --- a/nemo_reinforcer/algorithms/sft.py +++ b/nemo_reinforcer/algorithms/sft.py @@ -77,9 +77,9 @@ class MasterConfig(TypedDict): # ======================================================= def setup( master_config: MasterConfig, + tokenizer: AutoTokenizer, train_dataset: AllTaskProcessedDataset, val_dataset: AllTaskProcessedDataset, - tokenizer: AutoTokenizer, ) -> Tuple[ HfPolicy, RayVirtualCluster, diff --git a/nemo_reinforcer/algorithms/utils.py b/nemo_reinforcer/algorithms/utils.py index a3c42e2a19..6441d280ff 100644 --- a/nemo_reinforcer/algorithms/utils.py +++ b/nemo_reinforcer/algorithms/utils.py @@ -20,6 +20,9 @@ from torch.masked import as_masked_tensor from transformers import AutoTokenizer +from nemo_reinforcer.data import hf_datasets +from nemo_reinforcer.models.policy import TokenizerConfig + def calculate_kl_penalty_joschu2020( logprobs_policy: torch.Tensor, logprobs_reference: torch.Tensor @@ -133,9 +136,16 @@ def set_seed(seed: int): torch.cuda.manual_seed_all(seed) -def get_tokenizer(model_name: str) -> AutoTokenizer: +def get_tokenizer(tokenizer_config: TokenizerConfig) -> AutoTokenizer: """Get the tokenizer and set pad token to eos token if it is not already set.""" - tokenizer = AutoTokenizer.from_pretrained(model_name) + tokenizer = AutoTokenizer.from_pretrained(tokenizer_config["name"]) if tokenizer.pad_token is None: tokenizer.pad_token = tokenizer.eos_token + if "chat_template" in tokenizer_config: + if tokenizer_config["chat_template"] is None: + tokenizer.chat_template = ( + hf_datasets.COMMON_CHAT_TEMPLATES.passthrough_prompt_response + ) + else: + tokenizer.chat_template = tokenizer_config["chat_template"] return tokenizer From 83552a65ceef8c927e0fcf6aa5bc5f8b4488e2ed Mon Sep 17 00:00:00 2001 From: ashors1 Date: Mon, 7 Apr 2025 22:55:39 -0700 Subject: [PATCH 07/23] fix unit 
tests Signed-off-by: ashors1 --- nemo_reinforcer/data/llm_message_utils.py | 2 +- .../models/generation/test_vllm_generation.py | 15 ++++++------- .../unit/models/policy/test_hf_ray_policy.py | 21 ++++++++++++------- tests/unit/utils/test_native_checkpoint.py | 14 ++++++++----- 4 files changed, 32 insertions(+), 20 deletions(-) diff --git a/nemo_reinforcer/data/llm_message_utils.py b/nemo_reinforcer/data/llm_message_utils.py index e88e0ab9d9..e7ba160c1c 100644 --- a/nemo_reinforcer/data/llm_message_utils.py +++ b/nemo_reinforcer/data/llm_message_utils.py @@ -384,12 +384,12 @@ def get_formatted_message_log( message_chunk = tokenizer.bos_token + message_chunk if i == len(message_log) - 1: + message_chunk = message_chunk.rstrip("\n") if ( add_eos_token and tokenizer.eos_token is not None and not message_chunk.endswith(tokenizer.eos_token) ): - message_chunk = message_chunk.rstrip("\n") message_chunk += tokenizer.eos_token new_message = message.copy() diff --git a/tests/unit/models/generation/test_vllm_generation.py b/tests/unit/models/generation/test_vllm_generation.py index ba1bade3fc..0ae1c2e9ad 100644 --- a/tests/unit/models/generation/test_vllm_generation.py +++ b/tests/unit/models/generation/test_vllm_generation.py @@ -29,7 +29,9 @@ basic_vllm_test_config: VllmConfig = { "backend": "vllm", "model_name": "meta-llama/Llama-3.2-1B", # Small model for testing - "tokenizer_name": "meta-llama/Llama-3.2-1B", + "tokenizer": { + "name": "meta-llama/Llama-3.2-1B", + }, "dtype": "bfloat16", "max_new_tokens": 10, "temperature": 1.0, @@ -70,8 +72,7 @@ def cluster(): @pytest.fixture(scope="function") def tokenizer(): """Initialize tokenizer for the test model.""" - model_name = basic_vllm_test_config["model_name"] - tokenizer = get_tokenizer(model_name) + tokenizer = get_tokenizer(basic_vllm_test_config["tokenizer"]) return tokenizer @@ -205,7 +206,7 @@ def test_vllm_generation_with_hf_training(cluster, tokenizer): # Create HF-specific config with required parameters hf_config 
= { "model_name": basic_vllm_test_config["model_name"], - "tokenizer_name": basic_vllm_test_config["tokenizer_name"], + "tokenizer": basic_vllm_test_config["tokenizer"], # Required training parameters "train_global_batch_size": 4, "train_micro_batch_size": 1, @@ -276,7 +277,7 @@ def test_vllm_generation_with_hf_training(cluster, tokenizer): vllm_policy.finish_generation() print("Creating HF policy...") - hf_policy = HfPolicy(cluster, hf_config) + hf_policy = HfPolicy(cluster, hf_config, tokenizer) print(f"refitting vllm policy...") ipc_handles = hf_policy.get_weights_ipc_handles() @@ -509,7 +510,7 @@ def test_vllm_weight_update_and_prefix_cache_reset( hf_config = { "model_name": basic_vllm_test_config["model_name"], - "tokenizer_name": "meta-llama/Llama-3.2-1B", + "tokenizer": basic_vllm_test_config["tokenizer"], "train_global_batch_size": 1, "train_micro_batch_size": 1, "learning_rate": 1e-6, @@ -525,7 +526,7 @@ def test_vllm_weight_update_and_prefix_cache_reset( hf_policy = None try: print(f"Creating HF policy for TP={tensor_parallel_size}...") - hf_policy = HfPolicy(cluster, hf_config) + hf_policy = HfPolicy(cluster, hf_config, tokenizer) print(f"Creating vLLM policy for TP={tensor_parallel_size}...") vllm_policy = VllmGeneration(cluster, vllm_config) diff --git a/tests/unit/models/policy/test_hf_ray_policy.py b/tests/unit/models/policy/test_hf_ray_policy.py index 7cde591049..6753b04c5a 100644 --- a/tests/unit/models/policy/test_hf_ray_policy.py +++ b/tests/unit/models/policy/test_hf_ray_policy.py @@ -28,7 +28,7 @@ basic_llama_test_config: PolicyConfig = { "model_name": "meta-llama/Llama-3.2-1B", - "tokenizer_name": "meta-llama/Llama-3.2-1B", + "tokenizer": {"name": "meta-llama/Llama-3.2-1B"}, "generation_batch_size": 1, # Small batch size for testing "train_global_batch_size": 4, "train_micro_batch_size": 1, @@ -71,8 +71,7 @@ def gc_collect(): @pytest.fixture(scope="function") def tokenizer(): """Initialize tokenizer for the test model.""" - model_name = 
basic_llama_test_config["model_name"] - tokenizer = get_tokenizer(model_name) + tokenizer = get_tokenizer(basic_llama_test_config["tokenizer"]) return tokenizer @@ -97,7 +96,7 @@ def policy_setup(tokenizer): config["generation"] = configure_generation_config(config["generation"], tokenizer) print("Creating HfPolicy...") - policy = HfPolicy(cluster=cluster, config=config) + policy = HfPolicy(cluster=cluster, config=config, tokenizer=tokenizer) yield policy, cluster @@ -189,7 +188,7 @@ def test_hf_policy_init(policy_setup): @pytest.fixture -def training_setup(): +def training_setup(tokenizer): """Setup and teardown specifically for training tests.""" policy = None cluster = None @@ -212,7 +211,12 @@ def training_setup(): config = basic_llama_test_config print("Creating training HfPolicy...") - policy = HfPolicy(cluster=cluster, config=config, init_reference_model=False) + policy = HfPolicy( + cluster=cluster, + config=config, + init_reference_model=False, + tokenizer=tokenizer, + ) # Create a test batch print("Creating test batch...") @@ -315,7 +319,10 @@ def generation_setup(request, tokenizer): print("Creating generation HfPolicy...") policy = HfPolicy( - cluster=cluster, config=config, init_reference_model=request.param + cluster=cluster, + config=config, + tokenizer=tokenizer, + init_reference_model=request.param, ) # Create a test batch diff --git a/tests/unit/utils/test_native_checkpoint.py b/tests/unit/utils/test_native_checkpoint.py index 8f71badea1..aa913f8a97 100755 --- a/tests/unit/utils/test_native_checkpoint.py +++ b/tests/unit/utils/test_native_checkpoint.py @@ -17,6 +17,7 @@ import torch from tempfile import TemporaryDirectory +from nemo_reinforcer.algorithms.utils import get_tokenizer from nemo_reinforcer.distributed.batched_data_dict import BatchedDataDict from nemo_reinforcer.distributed.virtual_cluster import RayVirtualCluster from nemo_reinforcer.models.policy.hf_policy import HfPolicy @@ -31,7 +32,9 @@ # Define basic test config 
simple_policy_config = { "model_name": "meta-llama/Llama-3.2-1B", # "hf-internal-testing/tiny-random-Gemma3ForCausalLM", - "tokenizer_name": "meta-llama/Llama-3.2-1B", # "hf-internal-testing/tiny-random-Gemma3ForCausalLM", + "tokenizer": { + "name": "meta-llama/Llama-3.2-1B", + }, "train_global_batch_size": 32, "train_micro_batch_size": 1, "logprob_batch_size": 1, @@ -75,10 +78,7 @@ def cluster(): @pytest.fixture(scope="function") def tokenizer(): """Initialize tokenizer for the test model.""" - tokenizer_name = simple_policy_config["tokenizer_name"] - tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, trust_remote_code=True) - if tokenizer.pad_token is None: - tokenizer.pad_token = tokenizer.eos_token + tokenizer = get_tokenizer(simple_policy_config["tokenizer"]) return tokenizer @@ -87,6 +87,7 @@ def policy(cluster, tokenizer): """Initialize the policy.""" return HfPolicy( cluster=cluster, + tokenizer=tokenizer, config=simple_policy_config, init_optimizer=False, init_reference_model=False, @@ -273,6 +274,9 @@ def test_save_and_load_hf_checkpoint(policy): "model-00001-of-00002.safetensors", "model-00002-of-00002.safetensors", "model.safetensors.index.json", + "tokenizer_config.json", + "tokenizer.json", + "special_tokens_map.json", } coverted_model = AutoModelForCausalLM.from_pretrained( From 23f06d278cd54e19c8906ad15e0ef2ff0013efdf Mon Sep 17 00:00:00 2001 From: ashors1 Date: Wed, 9 Apr 2025 15:08:15 -0700 Subject: [PATCH 08/23] address comments Signed-off-by: ashors1 --- examples/run_sft.py | 19 +++++++++++++++---- nemo_reinforcer/data/__init__.py | 2 ++ nemo_reinforcer/data/hf_datasets/oasst.py | 1 - .../data/hf_datasets/openmathinstruct2.py | 1 - 4 files changed, 17 insertions(+), 6 deletions(-) diff --git a/examples/run_sft.py b/examples/run_sft.py index be56330c9a..1b812b71a2 100644 --- a/examples/run_sft.py +++ b/examples/run_sft.py @@ -15,6 +15,7 @@ import argparse import os import pprint +from functools import partial from typing import Dict, Any 
from omegaconf import OmegaConf @@ -56,14 +57,16 @@ def sft_preprocessor( tokenizer, max_seq_length: int, idx: int, + add_bos: bool = True, + add_eos: bool = True, ) -> DatumSpec: """Process a datum dictionary for SFT training.""" message_log = get_formatted_message_log( datum_dict["messages"], tokenizer, task_data_spec, - add_bos_token=True, - add_eos_token=True, + add_bos_token=add_bos, + add_eos_token=add_eos, ) length = sum(len(m["token_ids"]) for m in message_log) @@ -115,7 +118,11 @@ def setup_data(tokenizer: AutoTokenizer, data_config: DataConfig): train_dataset, tokenizer, sft_task_spec, - sft_preprocessor, + partial( + sft_preprocessor, + add_bos=data_config.get("add_bos", True), + add_eos=data_config.get("add_eos", True), + ), max_seq_length=data_config["max_input_seq_length"], ) @@ -123,7 +130,11 @@ def setup_data(tokenizer: AutoTokenizer, data_config: DataConfig): val_dataset, tokenizer, sft_task_spec, - sft_preprocessor, + partial( + sft_preprocessor, + add_bos=data_config.get("add_bos", True), + add_eos=data_config.get("add_eos", True), + ), max_seq_length=data_config["max_input_seq_length"], ) diff --git a/nemo_reinforcer/data/__init__.py b/nemo_reinforcer/data/__init__.py index 09eaf35fb5..eba1eaf7ce 100644 --- a/nemo_reinforcer/data/__init__.py +++ b/nemo_reinforcer/data/__init__.py @@ -21,6 +21,8 @@ class DataConfig(TypedDict): system_prompt_file: Optional[str] dataset_name: str val_dataset_name: Optional[str] + add_bos: Optional[bool] + add_eos: Optional[bool] class MathDataConfig(DataConfig): diff --git a/nemo_reinforcer/data/hf_datasets/oasst.py b/nemo_reinforcer/data/hf_datasets/oasst.py index f4e5acecba..3d10c46e37 100644 --- a/nemo_reinforcer/data/hf_datasets/oasst.py +++ b/nemo_reinforcer/data/hf_datasets/oasst.py @@ -119,7 +119,6 @@ def download_and_process_oasst(output_directory=".", seed=42, split_ratio=0.95): return formatted_ds -@dataclass class OasstDataset: def __init__(self, output_dir: str = "."): self.formatted_ds = 
download_and_process_oasst(output_dir) diff --git a/nemo_reinforcer/data/hf_datasets/openmathinstruct2.py b/nemo_reinforcer/data/hf_datasets/openmathinstruct2.py index 57f7384568..b5a0bfa1bc 100644 --- a/nemo_reinforcer/data/hf_datasets/openmathinstruct2.py +++ b/nemo_reinforcer/data/hf_datasets/openmathinstruct2.py @@ -62,7 +62,6 @@ def prepare_openinstructmath2_dataset(split: str = "train_1M", seed=42, test_siz } -@dataclass class OpenMathInstruct2Dataset: def __init__( self, split: str = "train_1M", seed: int = 42, test_size: float = 0.05 From 58fe349fce573c600dc35bbb4bd0e5748fa1dd6c Mon Sep 17 00:00:00 2001 From: ashors1 Date: Fri, 11 Apr 2025 11:12:11 -0700 Subject: [PATCH 09/23] add unit tests and documentation Signed-off-by: ashors1 --- nemo_reinforcer/algorithms/utils.py | 58 +++++++++++++- tests/unit/algorithms/test_utils.py | 115 ++++++++++++++++++++++++++++ 2 files changed, 172 insertions(+), 1 deletion(-) create mode 100755 tests/unit/algorithms/test_utils.py diff --git a/nemo_reinforcer/algorithms/utils.py b/nemo_reinforcer/algorithms/utils.py index 6441d280ff..66d7c0e0c2 100644 --- a/nemo_reinforcer/algorithms/utils.py +++ b/nemo_reinforcer/algorithms/utils.py @@ -137,15 +137,71 @@ def set_seed(seed: int): def get_tokenizer(tokenizer_config: TokenizerConfig) -> AutoTokenizer: - """Get the tokenizer and set pad token to eos token if it is not already set.""" + """Get the tokenizer and set pad token to eos token if it is not already set. + + This function initializes a tokenizer from the Hugging Face transformers library + and configures it with appropriate chat templates and padding tokens. + + Args: + tokenizer_config: A dictionary containing tokenizer configuration. + Required keys: + - name: The name or path of the pretrained tokenizer + Optional keys: + - chat_template: The chat template to use. 
Can be: + - None: Uses a passthrough template that just returns message content + - "default": Uses the tokenizer's default template + - A custom jinja2 template string + If not specified, the tokenizer's default template will be used. + + Returns: + AutoTokenizer: The configured tokenizer instance + + Examples: + >>> from transformers import AutoTokenizer + >>> from nemo_reinforcer.models.policy import TokenizerConfig + >>> # not specifying a chat template uses the tokenizer's default + >>> config = {"name": "meta-llama/Llama-3.2-1B-Instruct"} + >>> tokenizer = get_tokenizer(config) + >>> messages = [ + ... {"role": "system", "content": "You are a helpful AI assistant."}, + ... {"role": "user", "content": "Hello!"} + ... ] + >>> formatted = tokenizer.apply_chat_template(messages, tokenize=False) + >>> assert formatted == AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B-Instruct").apply_chat_template(messages, tokenize=False) + + >>> # Using a passthrough template + >>> config = { + ... "name": "meta-llama/Llama-3.2-1B-Instruct", + ... "chat_template": None + ... } + >>> tokenizer = get_tokenizer(config) + >>> formatted = tokenizer.apply_chat_template(messages, tokenize=False) + >>> assert formatted == "".join(msg["content"] for msg in messages) + + >>> # Using a custom template + >>> config = { + ... "name": "meta-llama/Llama-3.2-1B-Instruct", + ... "chat_template": "{% for message in messages %}{{ ' START: ' + message['content'] + ' END.' }}{% endfor %}" + ... } + >>> tokenizer = get_tokenizer(config) + >>> formatted = tokenizer.apply_chat_template(messages, tokenize=False) + >>> assert formatted == " START: You are a helpful AI assistant. END. START: Hello! END." 
+ """ tokenizer = AutoTokenizer.from_pretrained(tokenizer_config["name"]) if tokenizer.pad_token is None: tokenizer.pad_token = tokenizer.eos_token if "chat_template" in tokenizer_config: if tokenizer_config["chat_template"] is None: + print("Using passthrough chat template") tokenizer.chat_template = ( hf_datasets.COMMON_CHAT_TEMPLATES.passthrough_prompt_response ) + elif tokenizer_config["chat_template"].lower() == "default": + print("Using tokenizer's default chat template") else: + print("Using custom chat template") tokenizer.chat_template = tokenizer_config["chat_template"] + else: + print("No chat template provided, using tokenizer's default") + return tokenizer diff --git a/tests/unit/algorithms/test_utils.py b/tests/unit/algorithms/test_utils.py new file mode 100755 index 0000000000..ce517a2b3a --- /dev/null +++ b/tests/unit/algorithms/test_utils.py @@ -0,0 +1,115 @@ +import pytest +from datetime import datetime +from transformers import AutoTokenizer +from nemo_reinforcer.algorithms.utils import get_tokenizer +from nemo_reinforcer.data.hf_datasets.chat_templates import COMMON_CHAT_TEMPLATES + + +@pytest.fixture +def conversation_messages(): + """Fixture providing a multi-turn conversation for testing chat templates""" + return [ + {"role": "system", "content": "You are a helpful AI assistant."}, + {"role": "user", "content": "What's the weather like today?"}, + { + "role": "assistant", + "content": "I don't have access to real-time weather data.", + }, + {"role": "user", "content": "Can you help me with something else then?"}, + {"role": "assistant", "content": "Of course! 
What would you like help with?"}, + ] + + +def get_expected_llama_format(messages): + """Generate the expected output format for Llama's chat template""" + # Extract the date from the formatted output + # Get current date + current_date = datetime.now() + formatted_date = current_date.strftime("%d %b %Y") + + # Extract system message if present + if messages[0]["role"] == "system": + system_message = messages[0]["content"].strip() + messages = messages[1:] + else: + system_message = "" + + # Start with BOS token and system header + expected = "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n" + expected += "Cutting Knowledge Date: December 2023\n" + expected += f"Today Date: {formatted_date}\n\n" + expected += f"{system_message}<|eot_id|>" + + # Add each message + for message in messages: + if message["role"] not in ["ipython", "tool"]: + expected += f"<|start_header_id|>{message['role']}<|end_header_id|>\n\n" + expected += f"{message['content'].strip()}<|eot_id|>" + + return expected + + +def get_format_with_simple_role_header(messages): + message = "<|begin_of_text|>" + for msg in messages: + message += ( + "<|start_header_id|>" + + msg["role"] + + "<|end_header_id|>\n\n" + + msg["content"].strip() + + "<|eot_id|>" + ) + return message + + +def test_get_tokenizer_no_chat_template(conversation_messages): + """Test get_tokenizer when no chat template is specified in config""" + config = {"name": "meta-llama/Llama-3.2-1B-Instruct"} + tokenizer = get_tokenizer(config) + + # Verify that the tokenizer's default template is used + formatted = tokenizer.apply_chat_template(conversation_messages, tokenize=False) + + expected = get_expected_llama_format(conversation_messages) + assert formatted == expected + + +def test_get_tokenizer_default_chat_template(conversation_messages): + """Test get_tokenizer when chat_template is 'default' in config""" + config = {"name": "meta-llama/Llama-3.2-1B-Instruct", "chat_template": "default"} + tokenizer = 
get_tokenizer(config) + + # Verify that the tokenizer's default template is used + formatted = tokenizer.apply_chat_template(conversation_messages, tokenize=False) + expected = get_expected_llama_format(conversation_messages) + assert formatted == expected + + +def test_get_tokenizer_null_chat_template(conversation_messages): + """Test get_tokenizer when chat_template is None in config""" + config = {"name": "meta-llama/Llama-3.2-1B-Instruct", "chat_template": None} + tokenizer = get_tokenizer(config) + + # Verify that the passthrough template is used + formatted = tokenizer.apply_chat_template(conversation_messages, tokenize=False) + + expected = "".join(msg["content"] for msg in conversation_messages) + + assert ( + formatted == expected + ) # Passthrough template should just return the last content + + +def test_get_tokenizer_custom_jinja_template(conversation_messages): + """Test get_tokenizer when a custom jinja template is specified""" + custom_template = COMMON_CHAT_TEMPLATES.simple_role_header + config = { + "name": "meta-llama/Llama-3.2-1B-Instruct", + "chat_template": custom_template, + } + tokenizer = get_tokenizer(config) + + # Verify that the custom template is used + formatted = tokenizer.apply_chat_template(conversation_messages, tokenize=False) + expected = get_format_with_simple_role_header(conversation_messages) + assert formatted == expected # Should use our custom template From 956c7f08d8b71437a02f4832536424ba6292a258 Mon Sep 17 00:00:00 2001 From: ashors1 Date: Fri, 11 Apr 2025 11:14:32 -0700 Subject: [PATCH 10/23] copyright header Signed-off-by: ashors1 --- tests/unit/algorithms/test_utils.py | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/tests/unit/algorithms/test_utils.py b/tests/unit/algorithms/test_utils.py index ce517a2b3a..c3cc381fe9 100755 --- a/tests/unit/algorithms/test_utils.py +++ b/tests/unit/algorithms/test_utils.py @@ -1,3 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import pytest from datetime import datetime from transformers import AutoTokenizer @@ -95,9 +109,7 @@ def test_get_tokenizer_null_chat_template(conversation_messages): expected = "".join(msg["content"] for msg in conversation_messages) - assert ( - formatted == expected - ) # Passthrough template should just return the last content + assert formatted == expected def test_get_tokenizer_custom_jinja_template(conversation_messages): @@ -112,4 +124,4 @@ def test_get_tokenizer_custom_jinja_template(conversation_messages): # Verify that the custom template is used formatted = tokenizer.apply_chat_template(conversation_messages, tokenize=False) expected = get_format_with_simple_role_header(conversation_messages) - assert formatted == expected # Should use our custom template + assert formatted == expected From 4913f2589996c689607cf1686f3faf89235e07ff Mon Sep 17 00:00:00 2001 From: ashors1 Date: Fri, 11 Apr 2025 12:44:10 -0700 Subject: [PATCH 11/23] address comments Signed-off-by: ashors1 --- examples/configs/sft.yaml | 2 + examples/run_sft.py | 4 +- nemo_reinforcer/algorithms/grpo.py | 3 + nemo_reinforcer/algorithms/sft.py | 3 + nemo_reinforcer/algorithms/utils.py | 2 + nemo_reinforcer/data/llm_message_utils.py | 26 +++--- nemo_reinforcer/models/policy/hf_policy.py | 4 +- nemo_reinforcer/utils/native_checkpoint.py | 13 ++- .../data/hf_datasets/test_prompt_response.py | 90 +++++++++++++++++++ 
.../test_squad.py} | 5 +- 10 files changed, 132 insertions(+), 20 deletions(-) create mode 100644 tests/unit/data/hf_datasets/test_prompt_response.py rename tests/unit/data/{test_hf_datasets.py => hf_datasets/test_squad.py} (80%) diff --git a/examples/configs/sft.yaml b/examples/configs/sft.yaml index 973c9fee78..dad99b2479 100644 --- a/examples/configs/sft.yaml +++ b/examples/configs/sft.yaml @@ -37,6 +37,8 @@ policy: data: max_input_seq_length: ${policy.max_total_sequence_length} dataset_name: "squad" + add_bos: true + add_eos: true logger: log_dir: "logs" # Base directory for all logs diff --git a/examples/run_sft.py b/examples/run_sft.py index 1b812b71a2..875aa9a000 100644 --- a/examples/run_sft.py +++ b/examples/run_sft.py @@ -120,8 +120,8 @@ def setup_data(tokenizer: AutoTokenizer, data_config: DataConfig): sft_task_spec, partial( sft_preprocessor, - add_bos=data_config.get("add_bos", True), - add_eos=data_config.get("add_eos", True), + add_bos=data_config["add_bos"], + add_eos=data_config["add_eos"], ), max_seq_length=data_config["max_input_seq_length"], ) diff --git a/nemo_reinforcer/algorithms/grpo.py b/nemo_reinforcer/algorithms/grpo.py index a89a29fa97..eb0947011f 100644 --- a/nemo_reinforcer/algorithms/grpo.py +++ b/nemo_reinforcer/algorithms/grpo.py @@ -629,6 +629,9 @@ def grpo_train( optimizer_path=os.path.join( checkpoint_path, "policy", "optimizer" ), + tokenizer_path=os.path.join( + checkpoint_path, "policy", "tokenizer" + ), save_hf=is_last_checkpoint, ) torch.save( diff --git a/nemo_reinforcer/algorithms/sft.py b/nemo_reinforcer/algorithms/sft.py index 0bb42bd27a..45f4f08575 100644 --- a/nemo_reinforcer/algorithms/sft.py +++ b/nemo_reinforcer/algorithms/sft.py @@ -419,6 +419,9 @@ def sft_train( optimizer_path=os.path.join( checkpoint_path, "policy", "optimizer" ), + tokenizer_path=os.path.join( + checkpoint_path, "policy", "tokenizer" + ), save_hf=is_last_checkpoint, ) torch.save( diff --git a/nemo_reinforcer/algorithms/utils.py 
b/nemo_reinforcer/algorithms/utils.py index 66d7c0e0c2..418f14c074 100644 --- a/nemo_reinforcer/algorithms/utils.py +++ b/nemo_reinforcer/algorithms/utils.py @@ -157,6 +157,7 @@ def get_tokenizer(tokenizer_config: TokenizerConfig) -> AutoTokenizer: AutoTokenizer: The configured tokenizer instance Examples: + ```{doctest} >>> from transformers import AutoTokenizer >>> from nemo_reinforcer.models.policy import TokenizerConfig >>> # not specifying a chat template uses the tokenizer's default @@ -186,6 +187,7 @@ def get_tokenizer(tokenizer_config: TokenizerConfig) -> AutoTokenizer: >>> tokenizer = get_tokenizer(config) >>> formatted = tokenizer.apply_chat_template(messages, tokenize=False) >>> assert formatted == " START: You are a helpful AI assistant. END. START: Hello! END." + ``` """ tokenizer = AutoTokenizer.from_pretrained(tokenizer_config["name"]) if tokenizer.pad_token is None: diff --git a/nemo_reinforcer/data/llm_message_utils.py b/nemo_reinforcer/data/llm_message_utils.py index e7ba160c1c..c9290271bf 100644 --- a/nemo_reinforcer/data/llm_message_utils.py +++ b/nemo_reinforcer/data/llm_message_utils.py @@ -376,21 +376,23 @@ def get_formatted_message_log( message_chunk = formatted_message[prev_message_len_no_eos:] if i == 0: - if ( - add_bos_token - and tokenizer.bos_token is not None - and not message_chunk.startswith(tokenizer.bos_token) - ): - message_chunk = tokenizer.bos_token + message_chunk + if add_bos_token: + if tokenizer.bos_token is None: + logging.warning( + "add_bos_token is True but the tokenizer does not have a BOS token. Skipping BOS token addition." 
+ ) + elif not message_chunk.startswith(tokenizer.bos_token): + message_chunk = tokenizer.bos_token + message_chunk if i == len(message_log) - 1: message_chunk = message_chunk.rstrip("\n") - if ( - add_eos_token - and tokenizer.eos_token is not None - and not message_chunk.endswith(tokenizer.eos_token) - ): - message_chunk += tokenizer.eos_token + if add_eos_token: + if tokenizer.eos_token is None: + logging.warning( + "add_eos_token is True but the tokenizer does not have an EOS token. Skipping EOS token addition." + ) + elif not message_chunk.endswith(tokenizer.eos_token): + message_chunk += tokenizer.eos_token new_message = message.copy() new_message["token_ids"] = tokenizer( diff --git a/nemo_reinforcer/models/policy/hf_policy.py b/nemo_reinforcer/models/policy/hf_policy.py index d7a0a50b0d..ab656f2f9e 100644 --- a/nemo_reinforcer/models/policy/hf_policy.py +++ b/nemo_reinforcer/models/policy/hf_policy.py @@ -825,6 +825,7 @@ def save_checkpoint( self, weights_path: str, optimizer_path: Optional[str] = None, + tokenizer_path: Optional[str] = None, save_torch_dist: bool = True, save_hf: bool = False, ): @@ -856,9 +857,10 @@ def save_checkpoint( optimizer=self.optimizer if optimizer_path else None, scheduler=self.scheduler if optimizer_path else None, optimizer_path=optimizer_path, + tokenizer=self.tokenizer, + tokenizer_path=tokenizer_path, save_torch_dist=save_torch_dist, save_hf=save_hf, - tokenizer=self.tokenizer if save_hf else None, ) def load_checkpoint(self, weights_path: str, optimizer_path: Optional[str] = None): diff --git a/nemo_reinforcer/utils/native_checkpoint.py b/nemo_reinforcer/utils/native_checkpoint.py index fbf13e8fb2..bf6ce0f81f 100644 --- a/nemo_reinforcer/utils/native_checkpoint.py +++ b/nemo_reinforcer/utils/native_checkpoint.py @@ -135,9 +135,10 @@ def save_checkpoint( optimizer: Optional[torch.optim.Optimizer] = None, scheduler: Optional[Any] = None, optimizer_path: Optional[str] = None, + tokenizer: Optional[Any] = None, + 
tokenizer_path: Optional[str] = None, save_torch_dist: bool = True, save_hf: bool = False, - tokenizer: Optional[Any] = None, ) -> None: """Save a checkpoint of the model and optionally optimizer state. @@ -162,9 +163,6 @@ def save_checkpoint( state_dict=model_state_dict, ) - if tokenizer is not None: - tokenizer.save_pretrained(hf_weights_path) - if save_torch_dist: model_state = {"model": ModelState(model)} dcp.save(model_state, checkpoint_id=weights_path) @@ -177,6 +175,13 @@ def save_checkpoint( optimizer_state = {"optim": OptimizerState(model, optimizer, scheduler)} dcp.save(optimizer_state, checkpoint_id=optimizer_path) + if tokenizer is not None: + if tokenizer_path is None: + raise ValueError( + "tokenizer_path must be provided when saving tokenizer state" + ) + tokenizer.save_pretrained(tokenizer_path) + def load_checkpoint( model, diff --git a/tests/unit/data/hf_datasets/test_prompt_response.py b/tests/unit/data/hf_datasets/test_prompt_response.py new file mode 100644 index 0000000000..748b41fd18 --- /dev/null +++ b/tests/unit/data/hf_datasets/test_prompt_response.py @@ -0,0 +1,90 @@ +import pytest +import tempfile +import json +from nemo_reinforcer.data.hf_datasets.chat_templates import COMMON_CHAT_TEMPLATES +from nemo_reinforcer.data.hf_datasets.prompt_response_dataset import ( + PromptResponseDataset, +) +from transformers import AutoTokenizer + + +@pytest.fixture +def sample_data(request): + input_key = request.param[0] + output_key = request.param[1] + + train_data = [ + {input_key: "Hello", output_key: "Hi there!"}, + {input_key: "How are you?", output_key: "I'm good, thanks!"}, + ] + val_data = [ + {input_key: "What's up?", output_key: "Not much!"}, + {input_key: "Bye", output_key: "Goodbye!"}, + ] + + # Create temporary files for train and validation data + with tempfile.NamedTemporaryFile( + mode="w", suffix=".json", delete=False + ) as train_file: + json.dump(train_data, train_file) + train_path = train_file.name + + with 
tempfile.NamedTemporaryFile( + mode="w", suffix=".json", delete=False + ) as val_file: + json.dump(val_data, val_file) + val_path = val_file.name + + return train_path, val_path + + +@pytest.mark.parametrize("sample_data", [("input", "output")], indirect=True) +def test_dataset_initialization(sample_data): + train_path, val_path = sample_data + dataset = PromptResponseDataset(train_path, val_path) + + assert dataset.input_key == "input" + assert dataset.output_key == "output" + assert "train" in dataset.formatted_ds + assert "validation" in dataset.formatted_ds + + +@pytest.mark.parametrize("sample_data", [("question", "answer")], indirect=True) +def test_custom_keys(sample_data): + train_path, val_path = sample_data + dataset = PromptResponseDataset( + train_path, val_path, input_key="question", output_key="answer" + ) + + assert dataset.input_key == "question" + assert dataset.output_key == "answer" + + +@pytest.mark.parametrize("sample_data", [("question", "answer")], indirect=True) +def test_message_formatting(sample_data): + train_path, val_path = sample_data + dataset = PromptResponseDataset( + train_path, val_path, input_key="question", output_key="answer" + ) + + first_example = dataset.formatted_ds["train"][0] + + assert first_example["messages"][0]["role"] == "user" + assert first_example["messages"][0]["content"] == "Hello" + assert first_example["messages"][1]["role"] == "assistant" + assert first_example["messages"][1]["content"] == "Hi there!" 
+ + chat_template = COMMON_CHAT_TEMPLATES.passthrough_prompt_response + tokenizer = AutoTokenizer.from_pretrained("Meta-Llama/Meta-Llama-3-8B-Instruct") + + combined_message = tokenizer.apply_chat_template( + first_example["messages"], + chat_template=chat_template, + tokenize=False, + add_generation_prompt=False, + add_special_tokens=False, + ) + + assert combined_message == "".join( + message["content"] for message in first_example["messages"] + ) diff --git a/tests/unit/data/test_hf_datasets.py b/tests/unit/data/hf_datasets/test_squad.py similarity index 80% rename from tests/unit/data/test_hf_datasets.py rename to tests/unit/data/hf_datasets/test_squad.py index 5fe634a2a7..633a5e4c0c 100644 --- a/tests/unit/data/test_hf_datasets.py +++ b/tests/unit/data/hf_datasets/test_squad.py @@ -14,6 +14,7 @@ import pytest from transformers import AutoTokenizer +from nemo_reinforcer.data.hf_dataset.chat_templates import COMMON_CHAT_TEMPLATES from nemo_reinforcer.data.hf_datasets.squad import SquadDataset @@ -31,10 +32,12 @@ def test_squad_dataset(): assert example["messages"][1]["role"] == "user" assert example["messages"][2]["role"] == "assistant" + template = "{% for message in messages %}{%- if message['role'] == 'system' %}{{'Context: ' + message['content'].strip()}}{%- elif message['role'] == 'user' %}{{' Question: ' + message['content'].strip() + ' Answer:'}}{%- elif message['role'] == 'assistant' %}{{' ' + message['content'].strip()}}{%- endif %}{% endfor %}" + ## check that applying chat template works as expected default_templated = tokenizer.apply_chat_template( example["messages"], - chat_template=squad_dataset.task_spec.custom_template, + chat_template=template, tokenize=False, add_generation_prompt=False, add_special_tokens=False, From c016902e8ac1ee3e6320e0f8a7184181da111379 Mon Sep 17 00:00:00 2001 From: ashors1 Date: Fri, 11 Apr 2025 12:46:04 -0700 Subject: [PATCH 12/23] small fixes Signed-off-by: ashors1 --- nemo_reinforcer/models/policy/hf_policy.py | 1 - 
.../unit/data/hf_datasets/test_prompt_response.py | 14 ++++++++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/nemo_reinforcer/models/policy/hf_policy.py b/nemo_reinforcer/models/policy/hf_policy.py index ab656f2f9e..d529212dcc 100644 --- a/nemo_reinforcer/models/policy/hf_policy.py +++ b/nemo_reinforcer/models/policy/hf_policy.py @@ -80,7 +80,6 @@ def __init__( rank = torch.distributed.get_rank() world_size = torch.distributed.get_world_size() model_name = self.cfg["model_name"] - tokenizer_name = self.cfg["tokenizer"]["name"] if self.cfg["precision"] == "float32": self.dtype = torch.float32 elif self.cfg["precision"] == "bfloat16": diff --git a/tests/unit/data/hf_datasets/test_prompt_response.py b/tests/unit/data/hf_datasets/test_prompt_response.py index 748b41fd18..12b98d7fb2 100644 --- a/tests/unit/data/hf_datasets/test_prompt_response.py +++ b/tests/unit/data/hf_datasets/test_prompt_response.py @@ -1,3 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ import pytest import tempfile import json From 326b15116926a948311c8da7dc0add46d727909c Mon Sep 17 00:00:00 2001 From: ashors1 Date: Fri, 11 Apr 2025 12:59:40 -0700 Subject: [PATCH 13/23] fix typo Signed-off-by: ashors1 --- tests/unit/data/hf_datasets/test_squad.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/data/hf_datasets/test_squad.py b/tests/unit/data/hf_datasets/test_squad.py index 633a5e4c0c..47511363b5 100644 --- a/tests/unit/data/hf_datasets/test_squad.py +++ b/tests/unit/data/hf_datasets/test_squad.py @@ -14,7 +14,7 @@ import pytest from transformers import AutoTokenizer -from nemo_reinforcer.data.hf_dataset.chat_templates import COMMON_CHAT_TEMPLATES +from nemo_reinforcer.data.hf_datasets.chat_templates import COMMON_CHAT_TEMPLATES from nemo_reinforcer.data.hf_datasets.squad import SquadDataset From 506848752f9f19ec22d3deaa422d6efd936c8d88 Mon Sep 17 00:00:00 2001 From: ashors1 Date: Fri, 11 Apr 2025 13:59:41 -0700 Subject: [PATCH 14/23] fix tests Signed-off-by: ashors1 --- nemo_reinforcer/data/llm_message_utils.py | 5 +++-- nemo_reinforcer/models/policy/hf_policy.py | 2 ++ tests/unit/utils/test_native_checkpoint.py | 4 ++++ 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/nemo_reinforcer/data/llm_message_utils.py b/nemo_reinforcer/data/llm_message_utils.py index c9290271bf..13fc7dbd30 100644 --- a/nemo_reinforcer/data/llm_message_utils.py +++ b/nemo_reinforcer/data/llm_message_utils.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import warnings from typing import Dict, List import torch @@ -378,7 +379,7 @@ def get_formatted_message_log( if i == 0: if add_bos_token: if tokenizer.bos_token is None: - logging.warning( + warnings.warn( "add_bos_token is True but the tokenizer does not have a BOS token. Skipping BOS token addition." 
) elif not message_chunk.startswith(tokenizer.bos_token): @@ -388,7 +389,7 @@ def get_formatted_message_log( message_chunk = message_chunk.rstrip("\n") if add_eos_token: if tokenizer.eos_token is None: - logging.warning( + warnings.warn( "add_eos_token is True but the tokenizer does not have an EOS token. Skipping EOS token addition." ) elif not message_chunk.endswith(tokenizer.eos_token): diff --git a/nemo_reinforcer/models/policy/hf_policy.py b/nemo_reinforcer/models/policy/hf_policy.py index d529212dcc..4166d6df77 100644 --- a/nemo_reinforcer/models/policy/hf_policy.py +++ b/nemo_reinforcer/models/policy/hf_policy.py @@ -1091,6 +1091,7 @@ def save_checkpoint( self, weights_path: str, optimizer_path: Optional[str] = None, + tokenizer_path: Optional[str] = None, save_torch_dist: bool = True, save_hf: bool = False, ): @@ -1099,6 +1100,7 @@ def save_checkpoint( "save_checkpoint", weights_path, optimizer_path, + tokenizer_path, save_torch_dist, save_hf, respect_tied_workers=True, diff --git a/tests/unit/utils/test_native_checkpoint.py b/tests/unit/utils/test_native_checkpoint.py index aa913f8a97..4d30ef66ec 100755 --- a/tests/unit/utils/test_native_checkpoint.py +++ b/tests/unit/utils/test_native_checkpoint.py @@ -259,6 +259,7 @@ def test_save_and_load_hf_checkpoint(policy): os.path.join(tmp_dir, "test_hf_and_dcp"), save_hf=True, save_torch_dist=True, + tokenizer_path=os.path.join(tmp_dir, "test_hf_and_dcp_tokenizer"), ) ## make sure we save both HF and DCP checkpoints @@ -274,6 +275,9 @@ def test_save_and_load_hf_checkpoint(policy): "model-00001-of-00002.safetensors", "model-00002-of-00002.safetensors", "model.safetensors.index.json", + } + + assert set(os.listdir(os.path.join(tmp_dir, "test_hf_and_dcp_tokenizer"))) == { "tokenizer_config.json", "tokenizer.json", "special_tokens_map.json", From 8e835ba86bcf768ccf9d741e503dca6ad2784ad1 Mon Sep 17 00:00:00 2001 From: ashors1 Date: Mon, 14 Apr 2025 21:35:39 -0700 Subject: [PATCH 15/23] update chat template 
documentation Signed-off-by: ashors1 --- docs/guides/sft.md | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/docs/guides/sft.md b/docs/guides/sft.md index 4d452b109d..fe49b71734 100644 --- a/docs/guides/sft.md +++ b/docs/guides/sft.md @@ -27,7 +27,7 @@ uv run examples/run_sft.py \ SFT datasets in Reinforcer are encapsulated using classes. Each SFT data class is expected to have the following attributes: 1. `formatted_ds`: The dictionary of formatted datasets. This dictionary should contain `train` and `validation` splits, and each split should conform to the format described below. - 2. `task_spec`: The `TaskDataSpec` for this dataset. This should specify the name you choose for this dataset as well as the `custom_template` for this dataset. More on custom templates below. + 2. `task_spec`: The `TaskDataSpec` for this dataset. This should specify the name you choose for this dataset. SFT datasets are expected to follow the HuggingFace chat format. Refer to the [chat dataset document](../design_docs/chat_datasets.md) for details. If your data is not in the correct format, simply write a preprocessing script to convert the data into this format. [data/hf_datasets/squad.py](../../nemo_reinforcer/data/hf_datasets/squad.py) has an example: @@ -51,17 +51,21 @@ def format_squad(data): } ``` -Reinforcer SFT uses HuggingFace chat templates to format the individual examples. If you would like to use a custom template, create a string template in [jinja format](https://huggingface.co/docs/transformers/v4.34.0/en/chat_templating#how-do-i-create-a-chat-template) and pass it to the dataset's `TaskDataSpec`. For example, +Reinforcer SFT uses HuggingFace chat templates to format the individual examples. Three types of chat templates are supported, which can be configured via `tokenizer.chat_template` in your yaml config (see [sft.yaml](../../examples/configs/sft.yaml) for an example): + +1. Apply the tokenizer's default chat template. 
To use the tokenizer's default, either omit `tokenizer.chat_template` from the config altogether, or set `tokenizer.chat_template="default"`.
+2. Use a "passthrough" template which simply concatenates all messages. This is desirable if the chat template has been applied to your dataset as an offline preprocessing step. In this case, you should set `tokenizer.chat_template` to None as follows:
+   ```yaml
+   tokenizer:
+     chat_template: NULL
+   ```
+3. Use a custom template: If you would like to use a custom template, create a string template in [jinja format](https://huggingface.co/docs/transformers/v4.34.0/en/chat_templating#how-do-i-create-a-chat-template), and add that string to the config. For example,
+
+   ```yaml
+   tokenizer:
+     chat_template: "{% for message in messages %}{%- if message['role'] == 'system' %}{{'Context: ' + message['content'].strip()}}{%- elif message['role'] == 'user' %}{{' Question: ' + message['content'].strip() + ' Answer: '}}{%- elif message['role'] == 'assistant' %}{{message['content'].strip()}}{%- endif %}{% endfor %}"
+   ```
-
-```python
-custom_template = (
-    "{% for message in messages %}{%- if message['role'] == 'system' %}{{'Context: ' + message['content'].strip()}}{%- elif message['role'] == 'user' %}{{' Question: ' + message['content'].strip() + ' Answer: '}}{%- elif message['role'] == 'assistant' %}{{message['content'].strip()}}{%- endif %}{% endfor %}"
-)
-task_spec = TaskDataSpec(
-    task_name="squad",
-    custom_template=custom_template,
-)
-
 By default, NeMo-Reinforcer has support for `Squad` and `OpenAssistant` datasets. Both of these datasets are downloaded from HuggingFace and preprocessed on-the-fly, so there's no need to provide a path to any datasets on disk.
From d95d88b6cf65ae97ec7c518578bcbdbf4c83802f Mon Sep 17 00:00:00 2001 From: ashors1 Date: Tue, 15 Apr 2025 11:32:54 -0700 Subject: [PATCH 16/23] fix unit tests Signed-off-by: ashors1 --- tests/unit/models/generation/test_vllm_generation.py | 2 +- tests/unit/models/policy/test_hf_ray_policy.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/unit/models/generation/test_vllm_generation.py b/tests/unit/models/generation/test_vllm_generation.py index 20607c6a65..23b0ca356d 100644 --- a/tests/unit/models/generation/test_vllm_generation.py +++ b/tests/unit/models/generation/test_vllm_generation.py @@ -622,7 +622,7 @@ def test_vllm_generation_with_stop(cluster, test_input_data, tokenizer, is_eval) print("Creating HF policy...") hf_config = basic_hf_test_config.copy() - hf_policy = HfPolicy(cluster, hf_config) + hf_policy = HfPolicy(cluster, hf_config, tokenizer) print(f"refitting vllm policy...") ipc_handles = hf_policy.get_weights_ipc_handles() diff --git a/tests/unit/models/policy/test_hf_ray_policy.py b/tests/unit/models/policy/test_hf_ray_policy.py index e24238ac05..f56d3b8df3 100644 --- a/tests/unit/models/policy/test_hf_ray_policy.py +++ b/tests/unit/models/policy/test_hf_ray_policy.py @@ -591,7 +591,7 @@ def test_hf_policy_generation_with_stop(test_input_data, tokenizer): ) # Create policy - policy = HfPolicy(cluster=cluster, config=config) + policy = HfPolicy(cluster=cluster, config=config, tokenizer=tokenizer) # Call prepare_for_generation if available print("Preparing for generation...") From 1958bb99daa3c97993a8e35e2b76b16520682b40 Mon Sep 17 00:00:00 2001 From: ashors1 Date: Tue, 15 Apr 2025 15:05:10 -0700 Subject: [PATCH 17/23] fix doctest Signed-off-by: ashors1 --- nemo_reinforcer/algorithms/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo_reinforcer/algorithms/utils.py b/nemo_reinforcer/algorithms/utils.py index 418f14c074..6f814b3597 100644 --- a/nemo_reinforcer/algorithms/utils.py +++ 
b/nemo_reinforcer/algorithms/utils.py @@ -159,7 +159,7 @@ def get_tokenizer(tokenizer_config: TokenizerConfig) -> AutoTokenizer: Examples: ```{doctest} >>> from transformers import AutoTokenizer - >>> from nemo_reinforcer.models.policy import TokenizerConfig + >>> from nemo_reinforcer.algorithms.utils import get_tokenizer >>> # not specifying a chat template uses the tokenizer's default >>> config = {"name": "meta-llama/Llama-3.2-1B-Instruct"} >>> tokenizer = get_tokenizer(config) From 2c7c5c575252417e95bff8f66d026c32955a94c0 Mon Sep 17 00:00:00 2001 From: ashors1 Date: Tue, 15 Apr 2025 19:11:49 -0700 Subject: [PATCH 18/23] fix checkpoint save when tokenizer not provided Signed-off-by: ashors1 --- nemo_reinforcer/models/policy/hf_policy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo_reinforcer/models/policy/hf_policy.py b/nemo_reinforcer/models/policy/hf_policy.py index 2585396abe..0141a382ef 100644 --- a/nemo_reinforcer/models/policy/hf_policy.py +++ b/nemo_reinforcer/models/policy/hf_policy.py @@ -858,7 +858,7 @@ def save_checkpoint( optimizer=self.optimizer if optimizer_path else None, scheduler=self.scheduler if optimizer_path else None, optimizer_path=optimizer_path, - tokenizer=self.tokenizer, + tokenizer=self.tokenizer if tokenizer_path else None, tokenizer_path=tokenizer_path, save_torch_dist=save_torch_dist, save_hf=save_hf, From 1df9a9f951d4b4210a264a1251cb02fb719a25b7 Mon Sep 17 00:00:00 2001 From: Parth Chadha Date: Tue, 15 Apr 2025 15:42:01 -0700 Subject: [PATCH 19/23] feat: add a unique seed for each vllm llm engine (#171) Signed-off-by: Parth Chadha --- nemo_reinforcer/distributed/worker_groups.py | 6 +- nemo_reinforcer/models/generation/vllm.py | 27 +++- .../models/generation/test_vllm_generation.py | 142 ++++++++++++++++++ 3 files changed, 167 insertions(+), 8 deletions(-) diff --git a/nemo_reinforcer/distributed/worker_groups.py b/nemo_reinforcer/distributed/worker_groups.py index d4ec9d7f1a..4e3bbbf2a6 100644 --- 
a/nemo_reinforcer/distributed/worker_groups.py +++ b/nemo_reinforcer/distributed/worker_groups.py @@ -91,7 +91,7 @@ def __call__( placement_group: PlacementGroup, placement_group_bundle_index: int, num_gpus: int, - bundle_indices: Optional[list] = None, + bundle_indices: Optional[tuple] = None, **extra_options: Dict[str, Any], ): """Create a Ray worker with the specified configuration. @@ -108,7 +108,7 @@ def __call__( placement_group: Ray placement group for resource allocation placement_group_bundle_index: Index of the bundle in the placement group num_gpus: Number of GPUs to allocate to this worker - bundle_indices: List of bundle indices for tensor parallelism (if applicable) + bundle_indices: Tuple of (node_idx, local_bundle_indices) for tensor parallelism (if applicable) extra_options: Additional options to pass to the Ray actor (may be overridden by actor's configure_worker(...) method) Returns: @@ -300,7 +300,7 @@ def _create_workers_from_bundle_indices( # For tensor parallel groups, only the first worker gets bundle_indices worker_bundle_indices = ( - local_bundle_indices if local_rank == 0 else None + (node_idx, local_bundle_indices) if local_rank == 0 else None ) # Create a descriptive name based on group structure diff --git a/nemo_reinforcer/models/generation/vllm.py b/nemo_reinforcer/models/generation/vllm.py index c9676a2f2b..4e8ff364c7 100644 --- a/nemo_reinforcer/models/generation/vllm.py +++ b/nemo_reinforcer/models/generation/vllm.py @@ -61,7 +61,7 @@ def __repr__(self): @staticmethod def configure_worker( - num_gpus: int | float, bundle_indices: Optional[list] = None + num_gpus: int | float, bundle_indices: Optional[tuple] = None ) -> tuple[dict, dict, dict]: """Provides complete worker configuration for vLLM tensor parallelism. 
@@ -70,7 +70,7 @@ def configure_worker( Args: num_gpus: Original GPU allocation for this worker based on the placement group - bundle_indices: Bundle indices for tensor parallelism (if applicable) + bundle_indices: Tuple of (node_idx, local_bundle_indices) for tensor parallelism (if applicable) Returns: tuple with complete worker configuration: @@ -83,11 +83,26 @@ def configure_worker( init_kwargs = {} env_vars = {} - init_kwargs["bundle_indices"] = bundle_indices + local_bundle_indices = None + if bundle_indices is not None: + node_idx = bundle_indices[0] + local_bundle_indices = bundle_indices[1] + init_kwargs["bundle_indices"] = local_bundle_indices + + """ + compute a unique seed from the node_idx and bundle_indices: + node_idx = 0, bundle_indices = [0, 1, 2, 3] -> seed = 0*1024 + 0 + node_idx = 0, bundle_indices = [4, 5, 6, 7] -> seed = 0*1024 + 1 + node_idx = 1, bundle_indices = [0, 1, 2, 3] -> seed = 1*1024 + 0 + node_idx = 1, bundle_indices = [4, 5, 6, 7] -> seed = 1*1024 + 1 + """ + bundle_id = local_bundle_indices[0] // len(local_bundle_indices) + seed = node_idx * 1024 + bundle_id + init_kwargs["seed"] = seed is_part_of_tp_workers = ( - bundle_indices is not None and len(bundle_indices) > 1 - ) or bundle_indices is None + local_bundle_indices is not None and len(local_bundle_indices) > 1 + ) or local_bundle_indices is None if is_part_of_tp_workers: # Ray + vllm likes to manage GPU assignment internally resources["num_gpus"] = 0 @@ -104,6 +119,7 @@ def __init__( config: VllmConfig, bundle_indices: Optional[list] = None, fraction_of_gpus: float = 1.0, + seed: Optional[int] = None, ): """Initialize a vLLM worker for distributed inference. 
@@ -177,6 +193,7 @@ def __init__( gpu_memory_utilization=self.cfg["vllm_cfg"]["gpu_memory_utilization"], enable_prefix_caching=True, dtype="auto", + seed=seed, # Don't use cuda-graph by default as it leads to convergence issue (see https://github.com/NVIDIA/reinforcer/issues/186) enforce_eager=True, max_model_len=self.cfg["vllm_cfg"]["max_model_len"], diff --git a/tests/unit/models/generation/test_vllm_generation.py b/tests/unit/models/generation/test_vllm_generation.py index 23b0ca356d..62294ef46e 100644 --- a/tests/unit/models/generation/test_vllm_generation.py +++ b/tests/unit/models/generation/test_vllm_generation.py @@ -208,6 +208,148 @@ def test_vllm_policy_generation(policy, test_input_data, tokenizer): ) +def test_vllm_worker_seed_behavior(cluster, tokenizer): + """ + 1. Different workers generate different outputs for identical prompts due to different seeds + 2. When forced to use the same seed, workers generate identical outputs + """ + from nemo_reinforcer.models.generation.vllm import VllmGenerationWorker + + unique_prompts = [ + "Hello, my name is", + "The capital of France is", + ] + + # Create a batch where each prompt appears twice + # When sharded, different workers will get the same prompt + duplicated_prompts = unique_prompts + unique_prompts + + # Tokenize prompts + encodings = tokenizer( + duplicated_prompts, + padding="max_length", + max_length=20, + truncation=True, + return_tensors="pt", + padding_side="right", + ) + + input_lengths = encodings["attention_mask"].sum(dim=1).to(torch.int32) + + # Create input data dictionary + duplicated_batch = BatchedDataDict( + { + "input_ids": encodings["input_ids"], + "input_lengths": input_lengths, + } + ) + + # Part 1: Test that different workers generate different outputs due to different seeds + print("Creating vLLM policy with default seed behavior...") + vllm_config = basic_vllm_test_config.copy() + vllm_config = configure_generation_config(vllm_config, tokenizer) + policy = VllmGeneration(cluster, 
vllm_config) + policy.finish_generation() + + from nemo_reinforcer.models.policy.hf_policy import HfPolicy + + hf_config = basic_hf_test_config.copy() + hf_policy = HfPolicy(cluster, hf_config) + + print(f"refitting vllm policy...") + ipc_handles = hf_policy.get_weights_ipc_handles() + policy.prepare_for_generation() + policy.update_weights(ipc_handles) + + try: + # Generate with duplicated prompts + print("Running generation with duplicated prompts...") + outputs = policy.generate(duplicated_batch, greedy=False) + + # Decode the generated sequences + gen_texts = tokenizer.batch_decode( + outputs["output_ids"], skip_special_tokens=True + ) + + print(f"Generated texts with duplicated prompts: {gen_texts}") + + # Check if the duplicated prompts generated different texts + # The first half and second half should be different due to different worker seeds + first_half = gen_texts[: len(unique_prompts)] + second_half = gen_texts[len(unique_prompts) :] + + print(f"First worker outputs: {first_half}") + print(f"Second worker outputs: {second_half}") + + # At least one of the pairs should be different due to different seeds + assert first_half != second_half, ( + "Different workers should generate different outputs for identical prompts due to different seeds" + ) + + # Clean up before the second test + policy.shutdown() + + # Part 2: Test with fixed seed to verify identical outputs + print("\nNow testing with fixed seed...") + + # Store the original configure_worker method + original_configure_worker = VllmGenerationWorker.configure_worker + + # Override the configure_worker method to always use the same seed + def configure_worker_fixed_seed(num_gpus, bundle_indices=None): + resources, env_vars, init_kwargs = original_configure_worker( + num_gpus, bundle_indices + ) + # Override with fixed seed + init_kwargs["seed"] = 42 + return resources, env_vars, init_kwargs + + VllmGenerationWorker.configure_worker = configure_worker_fixed_seed + + # Create a new policy with fixed 
seed + fixed_seed_policy = VllmGeneration(cluster, vllm_config) + + # Generate with the same duplicated prompts + print("Running generation with fixed seed...") + fixed_seed_outputs = fixed_seed_policy.generate(duplicated_batch, greedy=False) + + # Decode the generated sequences + fixed_seed_gen_texts = tokenizer.batch_decode( + fixed_seed_outputs["output_ids"], skip_special_tokens=True + ) + + print(f"Generated texts with fixed seed: {fixed_seed_gen_texts}") + + # Check if the duplicated prompts now generate the same texts + fixed_seed_first_half = fixed_seed_gen_texts[: len(unique_prompts)] + fixed_seed_second_half = fixed_seed_gen_texts[len(unique_prompts) :] + + print(f"First worker outputs (fixed seed): {fixed_seed_first_half}") + print(f"Second worker outputs (fixed seed): {fixed_seed_second_half}") + + # With the same seed, outputs should be identical + assert fixed_seed_first_half == fixed_seed_second_half, ( + "Workers with the same fixed seed should generate identical outputs for identical prompts" + ) + + finally: + # Restore the original method if we patched it + if "original_configure_worker" in locals(): + VllmGenerationWorker.configure_worker = original_configure_worker + + # Clean up resources + if "policy" in locals() and hasattr(policy, "shutdown"): + policy.shutdown() + if "fixed_seed_policy" in locals() and hasattr(fixed_seed_policy, "shutdown"): + fixed_seed_policy.shutdown() + + # Force garbage collection + import gc + + gc.collect() + torch.cuda.empty_cache() + + @pytest.mark.timeout(140) def test_vllm_generation_with_hf_training(cluster, tokenizer): """1. 
Use vLLM for generation From f35ad95af26b91ba38d5dca884c1229e64ddd1ca Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Tue, 15 Apr 2025 16:00:04 -0700 Subject: [PATCH 20/23] fix: unit test script halts on first failure (#189) Signed-off-by: Terry Kong --- pyproject.toml | 1 + tests/run_unit.sh | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index b257b7aed5..febbf9c5f1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -79,6 +79,7 @@ exclude = ''' ''' [tool.pytest.ini_options] +addopts = "--cov=nemo_reinforcer --cov-report=term --cov-report=json -s -rA -x" testpaths = ["tests"] python_files = "test_*.py" diff --git a/tests/run_unit.sh b/tests/run_unit.sh index 093583faf9..5367749199 100755 --- a/tests/run_unit.sh +++ b/tests/run_unit.sh @@ -32,7 +32,7 @@ export PYTHONPATH=$(realpath ${SCRIPT_DIR}/..):${PYTHONPATH:-} # Run unit tests echo "Running unit tests..." -if ! pytest unit/ --cov=nemo_reinforcer --cov-report=term --cov-report=json -s -rA "$@"; then +if ! pytest unit/ "$@"; then echo "[ERROR]: Unit tests failed." exit 1 fi From 844e47015bb30a409ce6ea4b43ced8d6f6578471 Mon Sep 17 00:00:00 2001 From: ashors1 Date: Tue, 15 Apr 2025 19:52:56 -0700 Subject: [PATCH 21/23] fix new vllm test and doctest Signed-off-by: ashors1 --- nemo_reinforcer/algorithms/utils.py | 3 +++ tests/unit/models/generation/test_vllm_generation.py | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/nemo_reinforcer/algorithms/utils.py b/nemo_reinforcer/algorithms/utils.py index 6f814b3597..520625cb79 100644 --- a/nemo_reinforcer/algorithms/utils.py +++ b/nemo_reinforcer/algorithms/utils.py @@ -168,6 +168,7 @@ def get_tokenizer(tokenizer_config: TokenizerConfig) -> AutoTokenizer: ... {"role": "user", "content": "Hello!"} ... 
] >>> formatted = tokenizer.apply_chat_template(messages, tokenize=False) + "No chat template provided, using tokenizer's default" >>> assert formatted == AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B-Instruct").apply_chat_template(messages, tokenize=False) >>> # Using a passthrough template @@ -177,6 +178,7 @@ def get_tokenizer(tokenizer_config: TokenizerConfig) -> AutoTokenizer: ... } >>> tokenizer = get_tokenizer(config) >>> formatted = tokenizer.apply_chat_template(messages, tokenize=False) + "Using passthrough chat template" >>> assert formatted == "".join(msg["content"] for msg in messages) >>> # Using a custom template @@ -186,6 +188,7 @@ def get_tokenizer(tokenizer_config: TokenizerConfig) -> AutoTokenizer: ... } >>> tokenizer = get_tokenizer(config) >>> formatted = tokenizer.apply_chat_template(messages, tokenize=False) + "Using custom chat template" >>> assert formatted == " START: You are a helpful AI assistant. END. START: Hello! END." ``` """ diff --git a/tests/unit/models/generation/test_vllm_generation.py b/tests/unit/models/generation/test_vllm_generation.py index 62294ef46e..d86f4983cc 100644 --- a/tests/unit/models/generation/test_vllm_generation.py +++ b/tests/unit/models/generation/test_vllm_generation.py @@ -254,7 +254,7 @@ def test_vllm_worker_seed_behavior(cluster, tokenizer): from nemo_reinforcer.models.policy.hf_policy import HfPolicy hf_config = basic_hf_test_config.copy() - hf_policy = HfPolicy(cluster, hf_config) + hf_policy = HfPolicy(cluster, hf_config, tokenizer) print(f"refitting vllm policy...") ipc_handles = hf_policy.get_weights_ipc_handles() From ac4b6eac61456f6f9d9fd73b34c8491c51f8fec9 Mon Sep 17 00:00:00 2001 From: ashors1 Date: Wed, 16 Apr 2025 09:04:19 -0700 Subject: [PATCH 22/23] remove old comment Signed-off-by: ashors1 --- nemo_reinforcer/data/hf_datasets/prompt_response_dataset.py | 1 - 1 file changed, 1 deletion(-) diff --git a/nemo_reinforcer/data/hf_datasets/prompt_response_dataset.py 
b/nemo_reinforcer/data/hf_datasets/prompt_response_dataset.py index c9ff14d348..928af4fdff 100644 --- a/nemo_reinforcer/data/hf_datasets/prompt_response_dataset.py +++ b/nemo_reinforcer/data/hf_datasets/prompt_response_dataset.py @@ -33,7 +33,6 @@ def __init__( formatted_train_dataset = train_original_dataset.map(self.add_messages_key) formatted_val_dataset = val_original_dataset.map(self.add_messages_key) - ## just duplicating the dataset for train and validation for simplicity self.formatted_ds = { "train": formatted_train_dataset, "validation": formatted_val_dataset, From c5328f0d80a2874e38026f35d4f257ef8aed5459 Mon Sep 17 00:00:00 2001 From: ashors1 Date: Wed, 16 Apr 2025 10:11:28 -0700 Subject: [PATCH 23/23] fix doctest Signed-off-by: ashors1 --- nemo_reinforcer/algorithms/utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/nemo_reinforcer/algorithms/utils.py b/nemo_reinforcer/algorithms/utils.py index 520625cb79..3cfd8569a9 100644 --- a/nemo_reinforcer/algorithms/utils.py +++ b/nemo_reinforcer/algorithms/utils.py @@ -163,12 +163,12 @@ def get_tokenizer(tokenizer_config: TokenizerConfig) -> AutoTokenizer: >>> # not specifying a chat template uses the tokenizer's default >>> config = {"name": "meta-llama/Llama-3.2-1B-Instruct"} >>> tokenizer = get_tokenizer(config) + No chat template provided, using tokenizer's default >>> messages = [ ... {"role": "system", "content": "You are a helpful AI assistant."}, ... {"role": "user", "content": "Hello!"} ... ] >>> formatted = tokenizer.apply_chat_template(messages, tokenize=False) - "No chat template provided, using tokenizer's default" >>> assert formatted == AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B-Instruct").apply_chat_template(messages, tokenize=False) >>> # Using a passthrough template @@ -177,8 +177,8 @@ def get_tokenizer(tokenizer_config: TokenizerConfig) -> AutoTokenizer: ... "chat_template": None ... 
} >>> tokenizer = get_tokenizer(config) + Using passthrough chat template >>> formatted = tokenizer.apply_chat_template(messages, tokenize=False) - "Using passthrough chat template" >>> assert formatted == "".join(msg["content"] for msg in messages) >>> # Using a custom template @@ -187,8 +187,8 @@ def get_tokenizer(tokenizer_config: TokenizerConfig) -> AutoTokenizer: ... "chat_template": "{% for message in messages %}{{ ' START: ' + message['content'] + ' END.' }}{% endfor %}" ... } >>> tokenizer = get_tokenizer(config) + Using custom chat template >>> formatted = tokenizer.apply_chat_template(messages, tokenize=False) - "Using custom chat template" >>> assert formatted == " START: You are a helpful AI assistant. END. START: Hello! END." ``` """