Closed
23 commits
9e12af8
Init hf flow for prepare data
fsiino-nvidia Dec 2, 2025
b18891e
Fix validation, error handling, and artifact_fpath for hf
fsiino-nvidia Dec 2, 2025
8effa1f
Use hf identifier for datasets
fsiino-nvidia Dec 3, 2025
c74cb28
More dataset_url to huggingface_identifier replacements
fsiino-nvidia Dec 3, 2025
4742cb8
Error messaging for missing hf artifact_fpath
fsiino-nvidia Dec 4, 2025
53c090d
Convert to jsonl when artifact_fpath is None
fsiino-nvidia Dec 4, 2025
e5fbae3
Support non-jsonls, make hf default download source, fix tests
fsiino-nvidia Dec 4, 2025
ce02a42
Remove hf_token requirement for downloads
fsiino-nvidia Dec 4, 2025
43ffe3b
Infer split, fix inheritance, update faq
fsiino-nvidia Dec 4, 2025
b1954ff
Remove dupe comment
fsiino-nvidia Dec 4, 2025
7669d86
nit
fsiino-nvidia Dec 4, 2025
0c9aa1c
Improve validation and messaging
fsiino-nvidia Dec 4, 2025
c8204a6
Remove dummy file
fsiino-nvidia Dec 4, 2025
e0fa317
Add datasets version, remove org and slug req
fsiino-nvidia Dec 4, 2025
4b4e76d
Remove artifact_fpath for HF
fsiino-nvidia Dec 5, 2025
2d71a32
Fix test
fsiino-nvidia Dec 5, 2025
e590818
Dual split and arg fixes for hf download and data prep
fsiino-nvidia Dec 5, 2025
e7f4478
Reinstate artifact_fpath for arbitrary jsonls, fix min swe dataset urls
fsiino-nvidia Dec 5, 2025
84cbbd6
Merge remote-tracking branch 'github/main' into fsiino/416-hf-downloa…
fsiino-nvidia Dec 5, 2025
3b8caf4
Update docs
fsiino-nvidia Dec 5, 2025
51fb887
Update example metrics
fsiino-nvidia Dec 5, 2025
64e6a78
Merge remote-tracking branch 'github/main' into fsiino/416-hf-downloa…
fsiino-nvidia Dec 9, 2025
68adb7a
Merge remote-tracking branch 'github/main' into fsiino/416-hf-downloa…
fsiino-nvidia Dec 12, 2025
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,7 @@ Purpose: Training-ready environments with curated datasets.
| Google Search | agent | <a href='https://huggingface.co/datasets/nvidia/Nemotron-RL-knowledge-web_search-mcqa'>Nemotron-RL-knowledge-web_search-mcqa</a> | Multi-choice question answering problems with search tools integrated | Improve knowledge-related benchmarks with search tools | <a href='resources_servers/google_search/configs/google_search.yaml'>config</a> | ✓ | - | Apache 2.0 |
| Math Advanced Calculations | agent | <a href='https://huggingface.co/datasets/nvidia/Nemotron-RL-math-advanced_calculations'>Nemotron-RL-math-advanced_calculations</a> | An instruction following math environment with counter-intuitive calculators | Improve instruction following capabilities in specific math environments | <a href='resources_servers/math_advanced_calculations/configs/math_advanced_calculations.yaml'>config</a> | ✓ | - | Apache 2.0 |
| Workplace Assistant | agent | <a href='https://huggingface.co/datasets/nvidia/Nemotron-RL-agent-workplace_assistant'>Nemotron-RL-agent-workplace_assistant</a> | Workplace assistant multi-step tool-using environment | Improve multi-step tool use capability | <a href='resources_servers/workplace_assistant/configs/workplace_assistant.yaml'>config</a> | ✓ | ✓ | Apache 2.0 |
| Mini Swe Agent | coding | <a href='https://huggingface.co/datasets/princeton-nlp/SWE-bench_Verified'>SWE-bench_Verified</a> | A software development environment with mini-swe-agent orchestration | Improve software development capabilities, like SWE-bench | <a href='resources_servers/mini_swe_agent/configs/mini_swe_agent.yaml'>config</a> | ✓ | ✓ | MIT |
| Mini Swe Agent | coding | <a href='https://huggingface.co/datasets/SWE-Gym/SWE-Gym'>SWE-Gym</a> | A software development environment with mini-swe-agent orchestration | Improve software development capabilities, like SWE-bench | <a href='resources_servers/mini_swe_agent/configs/mini_swe_agent.yaml'>config</a> | ✓ | ✓ | MIT |
| Instruction Following | instruction_following | <a href='https://huggingface.co/datasets/nvidia/Nemotron-RL-instruction_following'>Nemotron-RL-instruction_following</a> | Instruction following datasets targeting IFEval and IFBench style instruction following capabilities | Improve IFEval and IFBench | <a href='resources_servers/instruction_following/configs/instruction_following.yaml'>config</a> | ✓ | - | Apache 2.0 |
| Structured Outputs | instruction_following | <a href='https://huggingface.co/datasets/nvidia/Nemotron-RL-instruction_following-structured_outputs'>Nemotron-RL-instruction_following-structured_outputs</a> | Check if responses are following structured output requirements in prompts | Improve instruction following capabilities | <a href='resources_servers/structured_outputs/configs/structured_outputs_json.yaml'>config</a> | ✓ | ✓ | Apache 2.0 |
| Equivalence Llm Judge | knowledge | <a href='https://huggingface.co/datasets/nvidia/Nemotron-RL-knowledge-openQA'>Nemotron-RL-knowledge-openQA</a> | Short answer questions with LLM-as-a-judge | Improve knowledge-related benchmarks like GPQA / HLE | <a href='resources_servers/equivalence_llm_judge/configs/equivalence_llm_judge.yaml'>config</a> | ✓ | - | Apache 2.0 |
Expand Down
54 changes: 49 additions & 5 deletions docs/reference/faq.md
Expand Up @@ -84,12 +84,25 @@ Gitlab model names are case sensitive. There can be models named 'My_Model' and
:::

Downloading a dataset from HuggingFace is straightforward:

**For structured datasets (with train/validation/test splits):**
```bash
ng_download_dataset_from_hf \
+repo_id=nvidia/Nemotron-RL-knowledge-mcqa \
+output_dirpath=data/mcqa \
+split=train
```
The `split` parameter is optional. If omitted, all available splits will be downloaded as separate JSONL files.
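When no split is given, each downloaded split lands in its own file under `output_dirpath`. The naming rule can be sketched with a small hypothetical helper (not part of the CLI itself):

```python
from pathlib import Path


def split_output_paths(output_dirpath: str, splits: list) -> dict:
    # Each split is written to <output_dirpath>/<split>.jsonl
    return {s: Path(output_dirpath) / f"{s}.jsonl" for s in splits}


paths = split_output_paths("data/mcqa", ["train", "validation"])
```

So a repo with `train` and `validation` splits yields `data/mcqa/train.jsonl` and `data/mcqa/validation.jsonl`.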


**For raw file repositories (with specific JSONL files):**
```bash
ng_download_dataset_from_hf \
+repo_id=NVIDIA/NeMo-Gym-Instruction_Following-multineedle-{your dataset name} \
+artifact_fpath=multineedle_benchmark.jsonl \
+output_fpath=data/multineedle_benchmark_hf.jsonl
+repo_id=nvidia/Nemotron-RL-instruction_following \
+output_dirpath=data/instruction_following \
+artifact_fpath=instruction_following.jsonl
```
Use `artifact_fpath` when the HuggingFace repo contains raw/arbitrary JSONL files rather than structured dataset splits. You cannot specify both `split` and `artifact_fpath`.
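Either way, the result is plain JSONL — one JSON object per line. A quick way to sanity-check a downloaded file (the helper below is an illustrative sketch, not a NeMo Gym function):

```python
import json
from pathlib import Path


def read_jsonl(path: str) -> list:
    # One JSON object per non-empty line
    records = []
    for line in Path(path).read_text().splitlines():
        if line.strip():
            records.append(json.loads(line))
    return records
```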


# How To: Prepare and validate data for PR submission or RL training
Expand Down Expand Up @@ -120,6 +133,9 @@ example_multi_step_simple_agent:
dataset_name: example_multi_step
version: 0.0.1
artifact_fpath: example_multi_step/train.jsonl
huggingface_identifier:
repo_id: nvidia/Nemotron-RL-instruction_following
artifact_fpath: instruction_following.jsonl
license: Apache 2.0
- name: validation
type: validation
Expand All @@ -130,6 +146,9 @@ example_multi_step_simple_agent:
dataset_name: example_multi_step
version: 0.0.1
artifact_fpath: example_multi_step/validation.jsonl
huggingface_identifier:
repo_id: nvidia/Nemotron-RL-instruction_following
artifact_fpath: if_validation.jsonl
license: Apache 2.0
- name: example
type: example
Expand All @@ -142,7 +161,8 @@ A dataset object consists of:
- Type: train, validation, or example. Train and validation are as used in NeMo RL or other training frameworks. More information about the example type is in the next section.
- Jsonl fpath: the local file path to your jsonl file for this dataset.
- Num repeats: optionally repeat each row when preparing or collating data. Defaults to 1 if unspecified.
- Gitlab identifier: The remote path to the dataset as held in the Gitlab dataset registry. This field is required for train and validation datasets. (Not required for example datasets since those are required to be committed to Git).
- Gitlab identifier: (NVIDIA internal) The remote path to the dataset as held in the Gitlab dataset registry. This field is required for train and validation datasets. (Not required for example datasets, since those must be committed to Git.)
- HuggingFace identifier: (Public) The remote path to the dataset on HuggingFace. Contains `repo_id` (required) and optionally `artifact_fpath` for raw file repos. If `artifact_fpath` is omitted, the datasets library will infer the `split` from the dataset `type`.
- License: The license of that dataset. Required for train and validation datasets and not required for example datasets, similar in principle to the Gitlab identifier.
- Start idx, end idx: used for slicing your dataset.
```yaml
Expand All @@ -153,6 +173,9 @@ A dataset object consists of:
dataset_name: example_multi_step
version: 0.0.1
artifact_fpath: example_multi_step/validation.jsonl
huggingface_identifier:
repo_id: nvidia/example_multi_step
artifact_fpath: example_validation.jsonl
license: Apache 2.0
```

Expand All @@ -165,11 +188,32 @@ responses_api_models/openai_model/configs/openai_model.yaml"
ng_prepare_data "+config_paths=[$config_paths]" \
+output_dirpath=data/example_multi_step \
+mode=example_validation
```

To download missing datasets automatically, add `+should_download=true`. By default, datasets are downloaded from HuggingFace:
```bash
ng_prepare_data "+config_paths=[$config_paths]" \
+output_dirpath=data/example_multi_step \
+mode=train_preparation \
+should_download=true
```

For NVIDIA internal users, you can download from GitLab instead:

```bash
ng_prepare_data "+config_paths=[$config_paths]" \
+output_dirpath=data/example_multi_step \
+mode=train_preparation \
+should_download=true \
+data_source=gitlab
```

# Run NeMo Gym servers the exact same way with the same configs!
```bash
ng_run "+config_paths=[$config_paths]"
```


The `ng_prepare_data` command will:
1. Attempt to load all the datasets you specified from disk. Missing datasets will be reported before any processing is done.
2. For each dataset, read example by example. Check the format and report the filepaths and indices/ranges of offending examples if any.
Expand Down
3 changes: 3 additions & 0 deletions docs/tutorials/rl-training-with-nemo-rl.md
Expand Up @@ -77,6 +77,9 @@ ng_prepare_data "+config_paths=[${config_paths}]" \
+output_dirpath=data/bytedtsinghua_dapo17k \
+mode=train_preparation +should_download=true

# Note: Datasets are downloaded from HuggingFace by default.
# For NVIDIA internal users, add +data_source=gitlab to download from GitLab instead.

# Return to NeMo RL directory and Python env
cd ../../.. && source /opt/nemo_rl_venv/bin/activate
```
Expand Down
45 changes: 39 additions & 6 deletions nemo_gym/config_types.py
Expand Up @@ -195,6 +195,14 @@ class DeleteJsonlDatasetGitlabConfig(BaseNeMoGymCLIConfig):
dataset_name: str = Field(description="Name of the dataset to delete from GitLab.")


class JsonlDatasetHuggingFaceIdentifer(BaseModel):
repo_id: str = Field(description="HuggingFace repository ID in the format 'organization/dataset-name'.")
artifact_fpath: Optional[str] = Field(
default=None,
description="Path to specific file in HuggingFace repo (e.g., 'train.jsonl'). If omitted, load_dataset will be used with split.",
)


class BaseUploadJsonlDatasetHuggingFaceConfig(BaseNeMoGymCLIConfig):
"""
Upload a JSONL dataset to HuggingFace Hub with automatic naming based on domain and resource server.
Expand Down Expand Up @@ -276,7 +284,7 @@ class UploadJsonlDatasetHuggingFaceMaybeDeleteConfig(BaseUploadJsonlDatasetHuggi
)


class DownloadJsonlDatasetHuggingFaceConfig(BaseNeMoGymCLIConfig):
class DownloadJsonlDatasetHuggingFaceConfig(JsonlDatasetHuggingFaceIdentifer, BaseNeMoGymCLIConfig):
"""
Download a JSONL dataset from HuggingFace Hub to local filesystem.

Expand All @@ -290,10 +298,35 @@ class DownloadJsonlDatasetHuggingFaceConfig(BaseNeMoGymCLIConfig):
```
"""

output_fpath: str = Field(description="Local file path where the downloaded dataset will be saved.")
hf_token: str = Field(description="HuggingFace API token for authentication.")
artifact_fpath: str = Field(description="Name of the artifact file to download from the repository.")
repo_id: str = Field(description="HuggingFace repository ID in format 'organization/dataset-name'.")
output_dirpath: Optional[str] = Field(
default=None,
description="Directory to save the downloaded dataset. Files will be named {split}.jsonl. If split is omitted, all available splits are downloaded.",
)
output_fpath: Optional[str] = Field(
default=None,
description="Exact local file path where the downloaded dataset will be saved. Requires `artifact_fpath` or `split`. Mutually exclusive with output_dirpath.",
)
hf_token: Optional[str] = Field(default=None, description="HuggingFace API token for authentication.")
split: Optional[Literal["train", "validation", "test"]] = Field(
default=None, description="Dataset split to download. Omit to download all available splits."
)

@model_validator(mode="after")
def check_output_path(self) -> "DownloadJsonlDatasetHuggingFaceConfig":
if not self.output_dirpath and not self.output_fpath:
raise ValueError("Either output_dirpath or output_fpath must be provided")
if self.output_dirpath and self.output_fpath:
raise ValueError("Cannot specify both output_dirpath and output_fpath")
if self.artifact_fpath and self.split:
raise ValueError(
"Cannot specify both artifact_fpath and split. Use artifact_fpath for targeting a raw file, or split for structured datasets."
)
# Prevent output_fpath without split when not using artifact_fpath
if self.output_fpath and not self.split and not self.artifact_fpath:
raise ValueError(
"When using output_fpath without artifact_fpath, split must be specified. Use output_dirpath to download all splits."
)
return self
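The validator above reduces to four mutual-exclusion checks. A standalone sketch of the same logic (a hypothetical plain function, separate from the Pydantic model):

```python
def check_download_args(output_dirpath=None, output_fpath=None,
                        artifact_fpath=None, split=None):
    """Return an error message if the argument combination is invalid, else None."""
    if not output_dirpath and not output_fpath:
        return "either output_dirpath or output_fpath must be provided"
    if output_dirpath and output_fpath:
        return "cannot specify both output_dirpath and output_fpath"
    if artifact_fpath and split:
        return "cannot specify both artifact_fpath and split"
    if output_fpath and not split and not artifact_fpath:
        return "output_fpath requires split or artifact_fpath"
    return None
```

For example, `output_dirpath` alone passes, while `output_fpath` without `split` or `artifact_fpath` is rejected.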


DatasetType = Union[Literal["train"], Literal["validation"], Literal["example"]]
Expand All @@ -306,6 +339,7 @@ class DatasetConfig(BaseModel):

num_repeats: int = Field(default=1, ge=1)
gitlab_identifier: Optional[JsonlDatasetGitlabIdentifer] = None
huggingface_identifier: Optional[JsonlDatasetHuggingFaceIdentifer] = None
license: Optional[
Union[
Literal["Apache 2.0"],
Expand All @@ -320,7 +354,6 @@ class DatasetConfig(BaseModel):
@model_validator(mode="after")
def check_train_validation_sets(self) -> "DatasetConfig":
if self.type in ["train", "validation"]:
assert self.gitlab_identifier is not None, f"A Gitlab path is required for {self.name}"
assert self.license is not None, f"A license is required for {self.name}"

return self
Expand Down
10 changes: 8 additions & 2 deletions nemo_gym/dataset_orchestrator.py
Expand Up @@ -20,7 +20,7 @@
UploadJsonlDatasetHuggingFaceMaybeDeleteConfig,
)
from nemo_gym.gitlab_utils import delete_model_from_gitlab, is_model_in_gitlab
from nemo_gym.hf_utils import download_jsonl_dataset as download_jsonl_dataset_from_hf
from nemo_gym.hf_utils import download_hf_dataset_as_jsonl
from nemo_gym.hf_utils import upload_jsonl_dataset as upload_jsonl_dataset_to_hf
from nemo_gym.server_utils import get_global_config_dict

Expand Down Expand Up @@ -73,7 +73,13 @@ def upload_jsonl_dataset_to_hf_and_delete_gitlab_cli() -> None: # pragma: no co
def download_jsonl_dataset_from_hf_cli() -> None: # pragma: no cover
global_config = get_global_config_dict()
config = DownloadJsonlDatasetHuggingFaceConfig.model_validate(global_config)
download_jsonl_dataset_from_hf(config)

if config.artifact_fpath:
print(f"Downloading file '{config.artifact_fpath}' from '{config.repo_id}'...")
else:
print(f"Downloading '{config.split or 'all'}' split(s) from '{config.repo_id}'...")

download_hf_dataset_as_jsonl(config)


def delete_jsonl_dataset_from_gitlab_cli() -> None: # pragma: no cover
Expand Down
83 changes: 63 additions & 20 deletions nemo_gym/hf_utils.py
Expand Up @@ -12,16 +12,17 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import shutil
from os import environ
from pathlib import Path

import yaml
from datasets import load_dataset
from huggingface_hub import HfApi, hf_hub_download
from huggingface_hub.utils import HfHubHTTPError
from scripts.update_resource_servers import visit_resource_server

from nemo_gym.config_types import DownloadJsonlDatasetHuggingFaceConfig, UploadJsonlDatasetHuggingFaceConfig
from nemo_gym.server_utils import get_global_config_dict


def create_huggingface_client(token: str) -> HfApi: # pragma: no cover
Expand Down Expand Up @@ -54,6 +55,67 @@ def check_jsonl_format(file_path: str) -> bool: # pragma: no cover
return True


def download_hf_dataset_as_jsonl(
config: DownloadJsonlDatasetHuggingFaceConfig,
) -> None: # pragma: no cover
"""
Download a HF dataset and save as JSONL.
If `artifact_fpath` is provided, downloads that specific file using `hf_hub_download`.
Otherwise, uses datasets.load_dataset() to handle structured datasets.
"""
try:
# artifact_fpath - download raw jsonl file
if config.artifact_fpath:
downloaded_path = hf_hub_download(
repo_id=config.repo_id,
filename=config.artifact_fpath,
repo_type="dataset",
token=config.hf_token,
)
output_file = (
Path(config.output_dirpath) / Path(config.artifact_fpath).name
if config.output_dirpath
else Path(config.output_fpath)
)
output_file.parent.mkdir(parents=True, exist_ok=True)

# We copy the downloaded file from the cache to the target path
# to allow renaming (e.g., artifact_fpath="something.jsonl" -> output_fpath="train.jsonl")
shutil.copy(downloaded_path, output_file)
print(f"[Nemo-Gym] - Downloaded {config.artifact_fpath} to: {output_file}")
return

# no artifact_fpath - use load_dataset() with split (if provided)
if config.output_fpath:
# Exact output path specified
output_file = Path(config.output_fpath)
output_file.parent.mkdir(parents=True, exist_ok=True)
ds = load_dataset(config.repo_id, split=config.split, token=config.hf_token)
ds.to_json(str(output_file))
print(f"[Nemo-Gym] - Downloaded {config.split} split to: {output_file}")
else:
# Output directory specified
output_dir = Path(config.output_dirpath)
output_dir.mkdir(parents=True, exist_ok=True)

if config.split:
ds = load_dataset(config.repo_id, split=config.split, token=config.hf_token)
output_file = output_dir / f"{config.split}.jsonl"
ds.to_json(str(output_file))
print(f"[Nemo-Gym] - Downloaded {config.split} split to: {output_file}")
else:
# Download all
ds = load_dataset(config.repo_id, token=config.hf_token)
for split_name, split_data in ds.items():
output_file = output_dir / f"{split_name}.jsonl"
split_data.to_json(str(output_file))
print(f"[Nemo-Gym] - Downloaded {split_name} split to: {output_file}")

except Exception as e:
print(f"[Nemo-Gym] - Error downloading/converting dataset: {e}")
raise


def upload_jsonl_dataset(
config: UploadJsonlDatasetHuggingFaceConfig,
) -> None: # pragma: no cover
Expand Down Expand Up @@ -108,22 +170,3 @@ def upload_jsonl_dataset(
except HfHubHTTPError as e:
print(f"[Nemo-Gym] - Error uploading file: {e}")
raise


def download_jsonl_dataset(config: DownloadJsonlDatasetHuggingFaceConfig) -> None: # pragma: no cover
try:
downloaded_path = hf_hub_download(
repo_id=config.repo_id,
repo_type="dataset",
filename=config.artifact_fpath,
token=config.hf_token,
)
Path(config.output_fpath).write_bytes(Path(downloaded_path).read_bytes())
except HfHubHTTPError as e:
print(f"[Nemo-Gym] - Error downloading file: {e}")


def download_jsonl_dataset_cli() -> None: # pragma: no cover
global_config = get_global_config_dict()
config = DownloadJsonlDatasetHuggingFaceConfig.model_validate(global_config)
download_jsonl_dataset(config)