NVIDIA-NeMo · parthchadha · Jul 2, 2025 · Jun 5, 2025 · Jun 18, 2025 · Jun 18, 2025
@@ -35,9 +35,7 @@
 def parse_args():
     """Parse command line arguments."""
     parser = argparse.ArgumentParser(description="Run SFT training with configuration")
-    parser.add_argument(
-        "--config", type=str, default=None, help="Path to YAML config file"
-    )
+    parser.add_argument("--config", type=str, default=None, help="Path to YAML config file")
 
     # Parse known args for the script
     args, overrides = parser.parse_known_args()
@@ -74,9 +72,7 @@ def sft_preprocessor(
     if length > max_seq_length:
         # make smaller and mask out
         for message in message_log:
-            message["token_ids"] = message["token_ids"][
-                : min(4, max_seq_length // len(message_log))
-            ]
+            message["token_ids"] = message["token_ids"][: min(4, max_seq_length // len(message_log))]
         loss_multiplier = 0.0
 
     output = {
@@ -109,6 +105,14 @@ def setup_data(tokenizer: AutoTokenizer, data_config: DataConfig):
             output_key=data_config["output_key"],
             prompt_file=data_config["prompt_file"],
         )
+    elif data_cls == "openai_format":
+        data = hf_datasets.OpenAIFormatDataset(
+            data_config["train_data_path"],
+            data_config["val_data_path"],
+            data_config["chat_key"],
+            data_config["system_key"],
+            data_config["system_prompt"],
+        )
     else:
         raise ValueError(f"Unknown dataset class: {data_cls}")
     print(
@@ -173,9 +177,7 @@ def main():
     config["logger"]["log_dir"] = get_next_experiment_dir(config["logger"]["log_dir"])
     print(f"📊 Using log directory: {config['logger']['log_dir']}")
     if config["checkpointing"]["enabled"]:
-        print(
-            f"📊 Using checkpoint directory: {config['checkpointing']['checkpoint_dir']}"
-        )
+        print(f"📊 Using checkpoint directory: {config['checkpointing']['checkpoint_dir']}")
 
     init_ray()
 

@@ -16,6 +16,7 @@
 from nemo_rl.data.hf_datasets.dpo import DPODataset
 from nemo_rl.data.hf_datasets.helpsteer3 import HelpSteer3Dataset
 from nemo_rl.data.hf_datasets.oasst import OasstDataset
+from nemo_rl.data.hf_datasets.oai_format_dataset import OpenAIFormatDataset
 from nemo_rl.data.hf_datasets.openmathinstruct2 import OpenMathInstruct2Dataset
 from nemo_rl.data.hf_datasets.prompt_response_dataset import (
     PromptResponseDataset,
@@ -26,6 +27,7 @@
     "DPODataset",
     "HelpSteer3Dataset",
     "OasstDataset",
+    "OpenAIFormatDataset",
     "OpenMathInstruct2Dataset",
     "PromptResponseDataset",
     "SquadDataset",

@@ -0,0 +1,58 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Any, Optional
+
+from datasets import load_dataset
+
+from nemo_rl.data.interfaces import TaskDataSpec
+
+
+class OpenAIFormatDataset:
+    """Load chat-style JSON data and normalize each example to a ``"messages"`` list.
+
+    Every example is expected to hold a list of message dicts (each with at
+    least a ``"role"`` entry) under ``chat_key``. During construction both
+    splits are mapped through :meth:`add_messages_key`, which stores the
+    conversation under the ``"messages"`` key and optionally prepends a
+    system message.
+
+    Args:
+        train_ds_path: Location of the training data, forwarded to
+            ``load_dataset("json", data_files=...)``.
+        val_ds_path: Location of the validation data, loaded the same way.
+        chat_key: Key under which each example stores its list of messages.
+        system_key: Optional key under which an example stores its own
+            system prompt.
+        system_prompt: Optional system prompt used for examples that do not
+            provide one through ``system_key``.
+
+    Attributes:
+        formatted_ds: Dict with the mapped ``"train"`` and ``"validation"``
+            datasets.
+        task_spec: ``TaskDataSpec`` built from the string ``"json_dataset"``.
+    """
+
+    def __init__(
+        self,
+        train_ds_path: str,
+        val_ds_path: str,
+        chat_key: str = "messages",
+        system_key: Optional[str] = None,
+        system_prompt: Optional[str] = None,
+    ):
+        # These attributes must be set before ``map`` runs, because
+        # ``add_messages_key`` reads them.
+        self.chat_key = chat_key
+        self.system_key = system_key
+        self.system_prompt = system_prompt
+
+        # Both files are read from the "train" split of their own load call.
+        # NOTE(review): presumably load_dataset puts un-split ``data_files``
+        # under "train" - confirm against the datasets documentation.
+        train_original_dataset = load_dataset("json", data_files=train_ds_path)["train"]
+        val_original_dataset = load_dataset("json", data_files=val_ds_path)["train"]
+
+        self.formatted_ds = {
+            "train": train_original_dataset.map(self.add_messages_key),
+            "validation": val_original_dataset.map(self.add_messages_key),
+        }
+
+        self.task_spec = TaskDataSpec(
+            "json_dataset",
+        )
+
+    def add_messages_key(
+        self,
+        example: dict[str, Any],
+    ) -> dict[str, list[dict[str, Any]]]:
+        """Return the example's conversation under the ``"messages"`` key.
+
+        The system message, when one is added, is taken from
+        ``example[self.system_key]`` if that value is present and not
+        ``None``; otherwise from ``self.system_prompt`` if it is non-empty.
+
+        Args:
+            example: A single dataset row containing ``self.chat_key``.
+
+        Returns:
+            ``{"messages": [...]}`` holding a new list; the list stored in
+            ``example`` is not modified.
+
+        Raises:
+            ValueError: If the conversation is empty or its last message does
+                not have the ``"assistant"`` role.
+        """
+        messages = list(example[self.chat_key])
+
+        # Explicit raise instead of ``assert`` so the check survives ``python -O``
+        # and an empty conversation reports a clear error, not an IndexError.
+        if not messages or messages[-1]["role"] != "assistant":
+            raise ValueError(
+                f"Conversation under key {self.chat_key!r} must be non-empty "
+                "and end with an 'assistant' message"
+            )
+
+        system_content = None
+        if self.system_key is not None and self.system_key in example:
+            # NOTE(review): rows without this column are presumably surfaced
+            # as None once the dataset schema is unified - confirm. A None
+            # value is treated as "no per-example system prompt".
+            system_content = example[self.system_key]
+        if system_content is None and self.system_prompt:
+            system_content = self.system_prompt
+
+        if system_content is not None:
+            messages.insert(0, {"role": "system", "content": system_content})
+        return {"messages": messages}