diff --git a/docs/cluster.md b/docs/cluster.md index 2327b813c1..73e2225a1b 100644 --- a/docs/cluster.md +++ b/docs/cluster.md @@ -25,9 +25,8 @@ sbatch \ ray.sub ``` -:::{tip} -Depending on your Slurm cluster configuration, you may or may not need to include the `--gres=gpu:8` option in the `sbatch` command. -::: +> [!TIP] +> Depending on your Slurm cluster configuration, you may or may not need to include the `--gres=gpu:8` option in the `sbatch` command. Upon successful submission, Slurm will print the `SLURM_JOB_ID`: ```text @@ -40,9 +39,8 @@ tail -f 1980204-logs/ray-driver.log ### Interactive Launching -:::{tip} -A key advantage of running interactively on the head node is the ability to execute multiple multi-node jobs without needing to requeue in the Slurm job queue. This means that during debugging sessions, you can avoid submitting a new `sbatch` command each time. Instead, you can debug and re-submit your NeMo RL job directly from the interactive session. -::: +> [!TIP] +> A key advantage of running interactively on the head node is the ability to execute multiple multi-node jobs without needing to requeue in the Slurm job queue. This means that during debugging sessions, you can avoid submitting a new `sbatch` command each time. Instead, you can debug and re-submit your NeMo RL job directly from the interactive session. To run interactively, launch the same command as [Batched Job Submission](#batched-job-submission), but omit the `COMMAND` line: ```sh @@ -111,14 +109,13 @@ sbatch ray.sub \ - Sets the cache dir for downloaded Huggingface datasets. `````` -:::{tip} -When `HF_TOKEN`, `WANDB_API_KEY`, `HF_HOME`, and `HF_DATASETS_CACHE` are set in your shell environment using `export`, they are automatically passed to `ray.sub`. For instance, if you set: - -```sh -export HF_TOKEN=XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX -``` -this token will be available to your NeMo RL run. Consider adding these exports to your shell configuration file, such as `~/.bashrc`. -::: +> [!TIP] +> When `HF_TOKEN`, `WANDB_API_KEY`, `HF_HOME`, and `HF_DATASETS_CACHE` are set in your shell environment using `export`, they are automatically passed to `ray.sub`. For instance, if you set: +> +> ```sh +> export HF_TOKEN=XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX +> ``` +> this token will be available to your NeMo RL run. Consider adding these exports to your shell configuration file, such as `~/.bashrc`. #### Advanced Environment Configuration ``````{list-table} @@ -170,10 +167,9 @@ this token will be available to your NeMo RL run. Consider adding these exports - Maximum port in the range for Ray worker processes. `````` -:::{note} -For the most part, you will not need to change ports unless these -are already taken by some other service backgrounded on your cluster. -::: +> [!NOTE] +> For the most part, you will not need to change ports unless these +> are already taken by some other service backgrounded on your cluster. ## Kubernetes diff --git a/docs/conf.py b/docs/conf.py index eaef92a78b..b5ea312d80 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -116,12 +116,10 @@ ] -def _convert_gh_admonitions( - app: Sphinx, relative_path: Path, parent_docname: str, contents: list[str] -) -> None: - """Supporting rendering GitHub alerts correctly. +def _convert_gh_admonitions_inplace(contents: list[str]) -> None: + """Mutate contents to convert GitHub blockquote admonitions to MyST. - # https://github.com/executablebooks/MyST-Parser/issues/845 + https://github.com/executablebooks/MyST-Parser/issues/845 """ _github_admonitions = { "> [!NOTE]": "note", @@ -130,41 +128,56 @@ def _convert_gh_admonitions( "> [!WARNING]": "warning", "> [!CAUTION]": "caution", } - # loop through content lines, replace github admonitions + # Use 8 backticks for admonition fences to allow code blocks with 3 or 6 backticks inside + FENCE = "````````" for i, orig_content in enumerate(contents): orig_line_splits = orig_content.split("\n") replacing = False for j, line in enumerate(orig_line_splits): - # look for admonition key line_roi = line.lstrip() - for admonition_key in _github_admonitions: + for admonition_key, admonition_name in _github_admonitions.items(): if line_roi.startswith(admonition_key): - line = line.replace( - admonition_key, - "```{" + _github_admonitions[admonition_key] + "}", - ) - # start replacing quotes in subsequent lines + replacement = f"{FENCE}{{{admonition_name}}}" + if replacing: + # Close previous fence before starting new admonition + # Add blank line between admonitions for proper MyST parsing + line = ( + f"{FENCE}\n\n{line.replace(admonition_key, replacement, 1)}" + ) + else: + line = line.replace(admonition_key, replacement, 1) replacing = True break - else: # no break + else: if not replacing: continue - # remove GH directive to match MyST directive - # since we are replacing on the original line, this will preserve the right indent, if any if line_roi.startswith("> "): line = line.replace("> ", "", 1) elif line_roi.rstrip() == ">": line = line.replace(">", "", 1) else: - # missing "> ", so stop replacing and terminate directive - line = f"```\n{line}" + line = f"{FENCE}\n{line}" replacing = False - # swap line back in splits orig_line_splits[j] = line - # swap line back in original + if replacing: + orig_line_splits.append(FENCE) + replacing = False contents[i] = "\n".join(orig_line_splits) +def _convert_gh_admonitions( + _app: Sphinx, _relative_path: Path, _parent_docname: str, contents: list[str] +) -> None: + _convert_gh_admonitions_inplace(contents) + + +def _convert_gh_admonitions_source( + _app: Sphinx, _docname: str, source: list[str] +) -> None: + # Sphinx "source-read" event + _convert_gh_admonitions_inplace(source) + + class _GitHubLinkTransform(Transform): """Converting the relative path to a file in a Markdown to the URL of that file on GitHub.""" @@ -238,4 +251,6 @@ def apply(self, **kwargs: Any) -> None: # type: ignore[bad-override] def setup(app: Sphinx) -> None: app.add_transform(_GitHubLinkTransform) + # Convert GH admonitions for included files and top-level sources app.connect("include-read", _convert_gh_admonitions) + app.connect("source-read", _convert_gh_admonitions_source) diff --git a/docs/design-docs/logger.md b/docs/design-docs/logger.md index 86f2db6258..dad321c6a1 100644 --- a/docs/design-docs/logger.md +++ b/docs/design-docs/logger.md @@ -171,13 +171,12 @@ logger: flush_interval: 10 ``` -:::{note} -While it is feasible to monitor using remote workers, the implementation requires careful attention to details to ensure: -* Logs sent back to the driver do not introduce significant overhead. -* Metrics remain clear and interpretable, avoiding issues like double counting caused by colocated workers. -* Workers can gracefully flush their logs in case of failure. -* Logging behaves consistently across TensorBoard, WandB, MLflow and Swanlab. -* Workers that spawn other workers accurately report the total resource usage of any grandchild workers. - -Due to these complexities, we opted for a simpler approach: collecting metrics exposed by the Ray metrics server from the driver. -::: +> [!NOTE] +> While it is feasible to monitor using remote workers, the implementation requires careful attention to details to ensure: +> * Logs sent back to the driver do not introduce significant overhead. +> * Metrics remain clear and interpretable, avoiding issues like double counting caused by colocated workers. +> * Workers can gracefully flush their logs in case of failure. +> * Logging behaves consistently across TensorBoard, WandB, MLflow and Swanlab. +> * Workers that spawn other workers accurately report the total resource usage of any grandchild workers. +> +> Due to these complexities, we opted for a simpler approach: collecting metrics exposed by the Ray metrics server from the driver. diff --git a/docs/nsys-profiling.md b/docs/nsys-profiling.md index 8f40ac9780..df2574a4c5 100644 --- a/docs/nsys-profiling.md +++ b/docs/nsys-profiling.md @@ -63,9 +63,8 @@ NRL_NSYS_PROFILE_STEP_RANGE=3:10 NRL_NSYS_WORKER_PATTERNS="dtensor_policy_worker ### Profile Megatron Workers -:::{important} -To profile a Megatron worker, you should set `LD_LIBRARY_PATH` as follows, otherwise you will get errors when loading `libtransformer_engine.so`. -::: +> [!IMPORTANT] +> To profile a Megatron worker, you should set `LD_LIBRARY_PATH` as follows, otherwise you will get errors when loading `libtransformer_engine.so`. ```bash LD_LIBRARY_PATH="/usr/local/cuda/targets/x86_64-linux/lib:/usr/local/cuda/lib64:/usr/local/cuda/lib:/usr/local/nvidia/lib64:/usr/local/nvidia/lib:/usr/lib/x86_64-linux-gnu" \ diff --git a/docs/testing.md b/docs/testing.md index c1d1bc570a..24d277802e 100644 --- a/docs/testing.md +++ b/docs/testing.md @@ -129,28 +129,24 @@ Which would produce this file in `tests/unit/unit_results.json`: } ``` -:::{tip} -Past unit test results are logged in `tests/unit/unit_results/`. These are helpful to view trends over time and commits. - -Here's an example `jq` command to view trends: - -```sh -jq -r '[.start_time, .git_commit, .metrics["test_hf_ray_policy::test_lm_policy_generation"].avg_prob_mult_error] | @tsv' tests/unit/unit_results/* - -# Example output: -#2025-03-24 23:35:39 778d288bb5d2edfd3eec4d07bb7dffffad5ef21b 1.0000039339065552 -#2025-03-24 23:36:37 778d288bb5d2edfd3eec4d07bb7dffffad5ef21b 1.0000039339065552 -#2025-03-24 23:37:37 778d288bb5d2edfd3eec4d07bb7dffffad5ef21b 1.0000039339065552 -#2025-03-24 23:38:14 778d288bb5d2edfd3eec4d07bb7dffffad5ef21b 1.0000039339065552 -#2025-03-24 23:38:50 778d288bb5d2edfd3eec4d07bb7dffffad5ef21b 1.0000039339065552 -``` -::: +> [!TIP] +> Past unit test results are logged in `tests/unit/unit_results/`. These are helpful to view trends over time and commits. +> +> ```sh +> jq -r '[.start_time, .git_commit, .metrics["test_hf_ray_policy::test_lm_policy_generation"].avg_prob_mult_error] | @tsv' tests/unit/unit_results/* +> +> # Example output: +> #2025-03-24 23:35:39 778d288bb5d2edfd3eec4d07bb7dffffad5ef21b 1.0000039339065552 +> #2025-03-24 23:36:37 778d288bb5d2edfd3eec4d07bb7dffffad5ef21b 1.0000039339065552 +> #2025-03-24 23:37:37 778d288bb5d2edfd3eec4d07bb7dffffad5ef21b 1.0000039339065552 +> #2025-03-24 23:38:14 778d288bb5d2edfd3eec4d07bb7dffffad5ef21b 1.0000039339065552 +> #2025-03-24 23:38:50 778d288bb5d2edfd3eec4d07bb7dffffad5ef21b 1.0000039339065552 +> ``` ## Functional Tests -:::{important} -Functional tests may require multiple GPUs to run. See each script to understand the requirements. -::: +> [!IMPORTANT] +> Functional tests may require multiple GPUs to run. See each script to understand the requirements. Functional tests are located under `tests/functional/`.