diff --git a/examples/ray/tutorial.ipynb b/examples/ray/tutorial.ipynb
index 4152331828a..9b8591a92b1 100644
--- a/examples/ray/tutorial.ipynb
+++ b/examples/ray/tutorial.ipynb
@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "markdown",
- "id": "ed65145e-1dcc-4fad-85cd-8cc683fde91b",
+ "id": "0ddc582b",
"metadata": {},
"source": [
"# VeRL Ray API Tutorial"
@@ -10,46 +10,43 @@
},
{
"cell_type": "markdown",
- "id": "5f90dfa8-3285-41e4-ba44-12abcb76b3ce",
- "metadata": {
- "tags": []
- },
+ "id": "71fe3b94",
+ "metadata": {},
"source": [
"## Chapter 1: Ray Basics"
]
},
{
"cell_type": "code",
- "execution_count": 1,
- "id": "cafc9be5-614b-4380-9b69-31525ea4e73a",
- "metadata": {},
+ "execution_count": 144,
+ "id": "1347d381",
+ "metadata": {
+ "tags": []
+ },
"outputs": [],
"source": [
- "import os\n",
- "\n",
- "# turn off Megatron timer\n",
- "os.environ['MEGATRON_USE_CUDA_TIMER'] = '0'\n",
- "os.environ['MEGATRON_START_PROCESS_TIMER'] = 'False'\n",
- "os.environ['NCCL_DEBUG'] = 'WARN'"
+ "import os"
]
},
{
"cell_type": "code",
- "execution_count": 2,
- "id": "3fb65288-4410-43b4-9ffc-8538197ae039",
+ "execution_count": 145,
+ "id": "e75b9d44",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"import ray\n",
- "import torch"
+ "import torch\n",
+ "import warnings\n",
+ "warnings.filterwarnings('ignore')"
]
},
{
"cell_type": "code",
- "execution_count": 3,
- "id": "ca7f0778-f676-4f67-8c1f-9f6c8eee74e2",
+ "execution_count": 146,
+ "id": "2e90ae00",
"metadata": {
"tags": []
},
@@ -58,59 +55,53 @@
"name": "stderr",
"output_type": "stream",
"text": [
- "2024-01-22 14:52:10,398\tINFO worker.py:1655 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32m127.0.0.1:8265 \u001b[39m\u001b[22m\n"
+ "2024-11-01 17:27:19,132\tINFO worker.py:1752 -- Started a local Ray instance.\n"
]
},
{
"data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "9cc9d2ccbdfb48918c8fd6cd13a0807a",
+ "version_major": 2,
+ "version_minor": 0
+ },
"text/html": [
-        "<div>(HTML output omitted)</div>\n",
+        "<div>(HTML output omitted)</div>\n"
],
"text/plain": [
- "RayContext(dashboard_url='127.0.0.1:8265', python_version='3.9.2', ray_version='2.3.0', ray_commit='{{RAY_COMMIT_SHA}}', address_info={'node_ip_address': '10.122.229.29', 'raylet_ip_address': '10.122.229.29', 'redis_address': None, 'object_store_address': '/tmp/ray/session_2024-01-22_14-52-08_389091_1470202/sockets/plasma_store', 'raylet_socket_name': '/tmp/ray/session_2024-01-22_14-52-08_389091_1470202/sockets/raylet', 'webui_url': '127.0.0.1:8265', 'session_dir': '/tmp/ray/session_2024-01-22_14-52-08_389091_1470202', 'metrics_export_port': 46547, 'gcs_address': '10.122.229.29:57281', 'address': '10.122.229.29:57281', 'dashboard_agent_listen_port': 52365, 'node_id': 'ec81d96ec29985ce0b98441aabb46fc8dcb49b0e1dec1088f973977f'})"
+ "RayContext(dashboard_url='', python_version='3.9.2', ray_version='2.10.0', ray_commit='09abba26b5bf2707639bb637c208d062a47b46f6')"
]
},
- "execution_count": 3,
+ "execution_count": 146,
"metadata": {},
"output_type": "execute_result"
},
@@ -118,111 +109,14 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "\u001b[2m\u001b[36m(GPUAccumulator pid=1489193)\u001b[0m rank 0, value: tensor([1.], device='cuda:0')\n",
- "\u001b[2m\u001b[36m(GPUAccumulator pid=1489357)\u001b[0m rank 2, value: tensor([3.], device='cuda:0')\n",
- "\u001b[2m\u001b[36m(GPUAccumulator pid=1489358)\u001b[0m rank 3, value: tensor([4.], device='cuda:0')\n",
- "\u001b[2m\u001b[36m(GPUAccumulator pid=1489356)\u001b[0m rank 1, value: tensor([2.], device='cuda:0')\n",
- "\u001b[2m\u001b[36m(GPUAccumulator pid=1489837)\u001b[0m rank 0, value: tensor([2.], device='cuda:0')\n",
- "\u001b[2m\u001b[36m(GPUAccumulator pid=1489999)\u001b[0m rank 3, value: tensor([5.], device='cuda:0')\n",
- "\u001b[2m\u001b[36m(GPUAccumulator pid=1489998)\u001b[0m rank 2, value: tensor([4.], device='cuda:0')\n",
- "\u001b[2m\u001b[36m(GPUAccumulator pid=1489997)\u001b[0m rank 1, value: tensor([3.], device='cuda:0')\n",
- "\u001b[2m\u001b[36m(GPUAccumulator pid=1490000)\u001b[0m rank 0, value: tensor([3.], device='cuda:0')\n",
- "\u001b[2m\u001b[36m(GPUAccumulator pid=1490633)\u001b[0m rank 3, value: tensor([6.], device='cuda:0')\n",
- "\u001b[2m\u001b[36m(GPUAccumulator pid=1490635)\u001b[0m rank 5, value: tensor([8.], device='cuda:0')\n",
- "\u001b[2m\u001b[36m(GPUAccumulator pid=1490636)\u001b[0m rank 6, value: tensor([9.], device='cuda:0')\n",
- "\u001b[2m\u001b[36m(GPUAccumulator pid=1490631)\u001b[0m rank 1, value: tensor([4.], device='cuda:0')\n",
- "\u001b[2m\u001b[36m(GPUAccumulator pid=1490637)\u001b[0m rank 7, value: tensor([10.], device='cuda:0')\n",
- "\u001b[2m\u001b[36m(GPUAccumulator pid=1490634)\u001b[0m rank 4, value: tensor([7.], device='cuda:0')\n",
- "\u001b[2m\u001b[36m(GPUAccumulator pid=1490632)\u001b[0m rank 2, value: tensor([5.], device='cuda:0')\n",
- "\u001b[2m\u001b[36m(GPUAccumulatorDecorator pid=1491761)\u001b[0m 10\n",
- "\u001b[2m\u001b[36m(GPUAccumulatorDecorator pid=1491761)\u001b[0m rank 0, value: tensor([10.], device='cuda:0')\n",
- "\u001b[2m\u001b[36m(GPUAccumulatorDecorator pid=1491923)\u001b[0m 10\n",
- "\u001b[2m\u001b[36m(GPUAccumulatorDecorator pid=1491922)\u001b[0m 10\n",
- "\u001b[2m\u001b[36m(GPUAccumulatorDecorator pid=1491921)\u001b[0m 10\n",
- "\u001b[2m\u001b[36m(GPUAccumulatorDecorator pid=1491924)\u001b[0m 10\n",
- "\u001b[2m\u001b[36m(GPUAccumulatorDecorator pid=1491919)\u001b[0m 10\n",
- "\u001b[2m\u001b[36m(GPUAccumulatorDecorator pid=1491925)\u001b[0m 10\n",
- "\u001b[2m\u001b[36m(GPUAccumulatorDecorator pid=1491920)\u001b[0m 10\n",
- "\u001b[2m\u001b[36m(GPUAccumulatorDecorator pid=1491923)\u001b[0m rank 5, value: tensor([15.], device='cuda:0')\n",
- "\u001b[2m\u001b[36m(GPUAccumulatorDecorator pid=1491921)\u001b[0m rank 3, value: tensor([13.], device='cuda:0')\n",
- "\u001b[2m\u001b[36m(GPUAccumulatorDecorator pid=1491924)\u001b[0m rank 7, value: tensor([17.], device='cuda:0')\n",
- "\u001b[2m\u001b[36m(GPUAccumulatorDecorator pid=1491925)\u001b[0m rank 6, value: tensor([16.], device='cuda:0')\n",
- "\u001b[2m\u001b[36m(GPUAccumulatorDecorator pid=1491920)\u001b[0m rank 2, value: tensor([12.], device='cuda:0')\n",
- "\u001b[2m\u001b[36m(GPUAccumulatorDecorator pid=1491919)\u001b[0m rank 1, value: tensor([11.], device='cuda:0')\n",
- "\u001b[2m\u001b[36m(GPUAccumulatorDecorator pid=1491922)\u001b[0m rank 4, value: tensor([14.], device='cuda:0')\n",
- "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m WARNING: overriding default argument timing_log_level:2 with timing_log_level:0\n",
- "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m WARNING: overriding default argument tensor_model_parallel_size:1 with tensor_model_parallel_size:4\n",
- "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m WARNING: overriding default argument use_distributed_optimizer:False with use_distributed_optimizer:True\n",
- "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m WARNING: overriding default argument weight_decay:0.01 with weight_decay:0.0\n",
- "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m WARNING: overriding default argument sgd_momentum:0.9 with sgd_momentum:0.0\n",
- "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m WARNING: overriding default argument bf16:False with bf16:True\n",
- "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m WARNING: overriding default argument clip_grad:1.0 with clip_grad:0.0\n",
- "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m WARNING: overriding default argument sequence_parallel:False with sequence_parallel:True\n",
- "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m using world size: 4, data-parallel-size: 1, tensor-model-parallel size: 4, pipeline-model-parallel size: 1 \n",
- "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m setting global batch size to 1\n",
- "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m accumulate gradients in fp32 and all-reduce gradients in fp32 for bfloat16 data type. \n",
- "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m using torch.bfloat16 for parameters ...\n",
- "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m setting number of micro-batches to constant 1\n",
- "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m [2024-01-22 14:53:47] > initializing torch distributed ...\n",
- "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m [rank0]\t[group1]\tnew_group dp\thigh_stream False\tranks: [0]\n",
- "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m [rank0]\t[group2]\tnew_group dp\thigh_stream False\tranks: [1]\n",
- "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m [rank0]\t[group3]\tnew_group dp\thigh_stream False\tranks: [2]\n",
- "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m [rank0]\t[group4]\tnew_group dp\thigh_stream False\tranks: [3]\n",
- "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m [rank0]\t[group5]\tnew_group mp\thigh_stream False\tranks: [0, 1, 2, 3]\n",
- "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m [rank0]\t[group6]\tnew_group tp\thigh_stream False\tranks: [0, 1, 2, 3]\n",
- "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m [rank0]\t[group8]\tnew_group pp\thigh_stream False\tranks: [0]\n",
- "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m [rank0]\t[group9]\tnew_group emb\thigh_stream False\tranks: [0]\n",
- "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m [rank0]\t[group10]\tnew_group posemb\thigh_stream False\tranks: [0]\n",
- "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m [rank0]\t[group11]\tnew_group pp\thigh_stream False\tranks: [1]\n",
- "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m [rank0]\t[group12]\tnew_group emb\thigh_stream False\tranks: [1]\n",
- "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m [rank0]\t[group13]\tnew_group posemb\thigh_stream False\tranks: [1]\n",
- "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m [rank0]\t[group14]\tnew_group pp\thigh_stream False\tranks: [2]\n",
- "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m [rank0]\t[group15]\tnew_group emb\thigh_stream False\tranks: [2]\n",
- "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m [rank0]\t[group16]\tnew_group posemb\thigh_stream False\tranks: [2]\n",
- "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m [rank0]\t[group17]\tnew_group pp\thigh_stream False\tranks: [3]\n",
- "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m [rank0]\t[group18]\tnew_group emb\thigh_stream False\tranks: [3]\n",
- "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m [rank0]\t[group19]\tnew_group posemb\thigh_stream False\tranks: [3]\n",
- "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m NCCL version 2.18.6+cuda12.1\n",
- "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m this torch version does not have hires nccl profiling. nccl profiling is not enabled this time. this is only a kindly notice, not a error message\n",
- "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m [2024-01-22 14:53:52] > initialized tensor model parallel with size 4\n",
- "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m [2024-01-22 14:53:52] > initialized pipeline model parallel with size 1\n",
- "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m > setting random seeds to 1234 ...\n",
- "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m > compiling dataset index builder ...\n",
- "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m already compiled, skip compiling\n",
- "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m >>> done with dataset index builder. Compilation time: 0.005 seconds\n",
- "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m > compiling and loading fused kernels ...\n",
- "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m will compile object fused_mix_prec_layer_norm_cuda with extra cuda flags ['-maxrregcount=50', '-U__CUDA_NO_HALF_OPERATORS__', '-U__CUDA_NO_HALF_CONVERSIONS__']\n",
- "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m [2024-01-22 14:53:52] >>> done with compiling and loading fused kernels. Compilation time: 0.039 seconds\n",
- "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493654)\u001b[0m this torch version does not have hires nccl profiling. nccl profiling is not enabled this time. this is only a kindly notice, not a error message\n",
- "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493654)\u001b[0m > compiling and loading fused kernels ...\n",
- "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493654)\u001b[0m will compile object fused_mix_prec_layer_norm_cuda with extra cuda flags ['-maxrregcount=50', '-U__CUDA_NO_HALF_OPERATORS__', '-U__CUDA_NO_HALF_CONVERSIONS__']\n",
- "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493654)\u001b[0m [2024-01-22 14:53:52] >>> done with compiling and loading fused kernels. Compilation time: 0.044 seconds\n",
- "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493653)\u001b[0m this torch version does not have hires nccl profiling. nccl profiling is not enabled this time. this is only a kindly notice, not a error message\n",
- "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493653)\u001b[0m > compiling and loading fused kernels ...\n",
- "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493653)\u001b[0m will compile object fused_mix_prec_layer_norm_cuda with extra cuda flags ['-maxrregcount=50', '-U__CUDA_NO_HALF_OPERATORS__', '-U__CUDA_NO_HALF_CONVERSIONS__']\n",
- "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493653)\u001b[0m [2024-01-22 14:53:52] >>> done with compiling and loading fused kernels. Compilation time: 0.044 seconds\n",
- "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493652)\u001b[0m this torch version does not have hires nccl profiling. nccl profiling is not enabled this time. this is only a kindly notice, not a error message\n",
- "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493652)\u001b[0m > compiling and loading fused kernels ...\n",
- "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493652)\u001b[0m will compile object fused_mix_prec_layer_norm_cuda with extra cuda flags ['-maxrregcount=50', '-U__CUDA_NO_HALF_OPERATORS__', '-U__CUDA_NO_HALF_CONVERSIONS__']\n",
- "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493652)\u001b[0m [2024-01-22 14:53:52] >>> done with compiling and loading fused kernels. Compilation time: 0.045 seconds\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m No modifications detected for re-loaded extension module fused_mix_prec_layer_norm_cuda, skipping build step...\n",
- "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m Loading extension module fused_mix_prec_layer_norm_cuda...\n",
- "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493654)\u001b[0m No modifications detected for re-loaded extension module fused_mix_prec_layer_norm_cuda, skipping build step...\n",
- "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493654)\u001b[0m Loading extension module fused_mix_prec_layer_norm_cuda...\n",
- "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493653)\u001b[0m No modifications detected for re-loaded extension module fused_mix_prec_layer_norm_cuda, skipping build step...\n",
- "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493653)\u001b[0m Loading extension module fused_mix_prec_layer_norm_cuda...\n",
- "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493652)\u001b[0m No modifications detected for re-loaded extension module fused_mix_prec_layer_norm_cuda, skipping build step...\n",
- "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493652)\u001b[0m Loading extension module fused_mix_prec_layer_norm_cuda...\n",
- "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m Using `is_flash_attn_available` is deprecated and will be removed in v4.38. Please use `is_flash_attn_2_available` instead.\n",
- "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493654)\u001b[0m Using `is_flash_attn_available` is deprecated and will be removed in v4.38. Please use `is_flash_attn_2_available` instead.\n",
- "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493653)\u001b[0m Using `is_flash_attn_available` is deprecated and will be removed in v4.38. Please use `is_flash_attn_2_available` instead.\n",
- "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493652)\u001b[0m Using `is_flash_attn_available` is deprecated and will be removed in v4.38. Please use `is_flash_attn_2_available` instead.\n"
+ "\u001b[36m(GPUAccumulator pid=224400)\u001b[0m rank 0, value: tensor([1.], device='cuda:0')\n",
+ "\u001b[36m(GPUAccumulator pid=225234)\u001b[0m rank 2, value: tensor([3.], device='cuda:0')\n",
+ "\u001b[36m(GPUAccumulator pid=225607)\u001b[0m rank 0, value: tensor([2.], device='cuda:0')\n",
+ "\u001b[36m(GPUAccumulator pid=226423)\u001b[0m rank 1, value: tensor([3.], device='cuda:0')\n",
+ "\u001b[36m(GPUAccumulator pid=226857)\u001b[0m rank 3, value: tensor([6.], device='cuda:0')\n",
+ "\u001b[36m(GPUAccumulatorDecorator pid=227475)\u001b[0m 10\n",
+ "\u001b[36m(GPUAccumulatorDecorator pid=227475)\u001b[0m rank 0, value: tensor([10.], device='cuda:0')\n",
+ "\u001b[36m(GPUAccumulatorDecorator pid=227655)\u001b[0m rank 1, value: tensor([11.], device='cuda:0')\n"
]
}
],
@@ -233,18 +127,16 @@
},
{
"cell_type": "markdown",
- "id": "a0d35eb2-cf28-411a-b39f-f45e97c2edba",
- "metadata": {
- "tags": []
- },
+ "id": "a127e4e4",
+ "metadata": {},
"source": [
"Implement an Accumulator class."
]
},
{
"cell_type": "code",
- "execution_count": 4,
- "id": "86c98e9c-26a5-44d0-bbe9-064f8809708c",
+ "execution_count": 147,
+ "id": "20e7b9a3",
"metadata": {
"tags": []
},
@@ -264,8 +156,8 @@
},
{
"cell_type": "code",
- "execution_count": 5,
- "id": "9d5aa406-23c3-4270-a6f8-d4af20769ed9",
+ "execution_count": 148,
+ "id": "3b80098c",
"metadata": {
"tags": []
},
@@ -277,8 +169,8 @@
},
{
"cell_type": "code",
- "execution_count": 6,
- "id": "56850d3a-3803-4395-a73e-a27e99dc6905",
+ "execution_count": 149,
+ "id": "b14b1009",
"metadata": {
"tags": []
},
@@ -300,8 +192,8 @@
},
{
"cell_type": "code",
- "execution_count": 7,
- "id": "932807f1-792b-42af-aa44-6a6202718919",
+ "execution_count": 150,
+ "id": "513a84b3",
"metadata": {
"tags": []
},
@@ -323,7 +215,7 @@
},
{
"cell_type": "markdown",
- "id": "ca31e450-e02c-44d0-86a1-a3f9eeadd311",
+ "id": "3c332fe0",
"metadata": {},
"source": [
"## Chapter 2: Resource Pool and RayWorkerGroup\n",
@@ -333,8 +225,8 @@
},
{
"cell_type": "code",
- "execution_count": 8,
- "id": "84727b3d-1351-4550-9009-48526d18dc2c",
+ "execution_count": 151,
+ "id": "04229afb",
"metadata": {
"tags": []
},
@@ -346,8 +238,8 @@
},
{
"cell_type": "code",
- "execution_count": 9,
- "id": "aab5eeb7-6485-4911-94cc-3f5ca52b6634",
+ "execution_count": 152,
+ "id": "0d0dbd58",
"metadata": {
"tags": []
},
@@ -358,8 +250,8 @@
},
{
"cell_type": "code",
- "execution_count": 10,
- "id": "54e4d531-ca50-43e9-ab79-d24bb66ffe0e",
+ "execution_count": 153,
+ "id": "68f6838a",
"metadata": {
"tags": []
},
@@ -381,8 +273,8 @@
},
{
"cell_type": "code",
- "execution_count": 11,
- "id": "73e0658b-a6a7-4a86-a53f-5442c8cd56fe",
+ "execution_count": 154,
+ "id": "23aad8fe",
"metadata": {
"tags": []
},
@@ -404,7 +296,7 @@
},
{
"cell_type": "markdown",
- "id": "e36f73e8-4f38-459f-a09b-f6ffb95fd42b",
+ "id": "e6705284",
"metadata": {},
"source": [
"The principle of parameter passing: The input parameter is a list of length world_size, where each element in the list is dispatched respectively to each worker in the RayWorkerGroup. \n",
@@ -413,7 +305,7 @@
},
{
"cell_type": "markdown",
- "id": "3873383b-e89d-4283-ae3e-af367e6160a5",
+ "id": "d25c2412",
"metadata": {},
"source": [
"### GPU Resource Sharing"
@@ -421,18 +313,16 @@
},
{
"cell_type": "markdown",
- "id": "cf62933e-143e-4a15-90c3-3d0d0568d89c",
- "metadata": {
- "tags": []
- },
+ "id": "f74f6d24",
+ "metadata": {},
"source": [
"RayWorkerGroups mapped to the same resource pool share the GPU. In this example, we implement three resource pools: the first occupies 4 GPUs, the second also occupies 4 GPUs, and the last occupies all 8 GPUs. Among them, the first resource pool reuses the resource pool mentioned above."
]
},
{
"cell_type": "code",
- "execution_count": 12,
- "id": "100e75f1-6f2b-44b6-b4e1-f703eae8e5bf",
+ "execution_count": 155,
+ "id": "49f9c06f",
"metadata": {
"tags": []
},
@@ -445,8 +335,8 @@
},
{
"cell_type": "code",
- "execution_count": 13,
- "id": "59393509-037b-475a-800a-5e6de9ea7b66",
+ "execution_count": 156,
+ "id": "05c2e305",
"metadata": {
"tags": []
},
@@ -459,8 +349,8 @@
},
{
"cell_type": "code",
- "execution_count": 14,
- "id": "558dc5bf-75aa-4675-b8de-99944e088522",
+ "execution_count": 157,
+ "id": "6b9b13f4",
"metadata": {
"tags": []
},
@@ -481,8 +371,8 @@
},
{
"cell_type": "code",
- "execution_count": 15,
- "id": "30228667-f8ef-448c-a2cc-44241ede86d9",
+ "execution_count": 158,
+ "id": "d856d030",
"metadata": {
"tags": []
},
@@ -503,8 +393,8 @@
},
{
"cell_type": "code",
- "execution_count": 16,
- "id": "041329de-0e27-4e31-ac11-f0dd54f4f4b3",
+ "execution_count": 159,
+ "id": "33a4628c",
"metadata": {
"tags": []
},
@@ -523,7 +413,7 @@
},
{
"cell_type": "markdown",
- "id": "8cb9fc59-6035-4b72-9863-67bd4d48f3fe",
+ "id": "3df19d13",
"metadata": {},
"source": [
"## Chapter 3: Data Dispatch, Execution and Collection"
@@ -531,7 +421,7 @@
},
{
"cell_type": "markdown",
- "id": "0e1fedea-16d1-4dec-b269-c402747e11e8",
+ "id": "acb22d9d",
"metadata": {},
"source": [
"In the above example, we used the `execute_all_sync` function in the RayWorkerGroup to dispatch data from the driver to each worker. This is very inconvenient for coding. \n",
@@ -540,8 +430,8 @@
},
{
"cell_type": "code",
- "execution_count": 17,
- "id": "c70d1b33-030a-4681-aeb2-16ab48b6445b",
+ "execution_count": 160,
+ "id": "35237432",
"metadata": {
"tags": []
},
@@ -552,9 +442,11 @@
},
{
"cell_type": "code",
- "execution_count": 18,
- "id": "bb6b76c1-8caf-4749-b20d-c3f842751aa4",
- "metadata": {},
+ "execution_count": 161,
+ "id": "88b8ba3b",
+ "metadata": {
+ "tags": []
+ },
"outputs": [],
"source": [
"@ray.remote\n",
@@ -576,8 +468,8 @@
},
{
"cell_type": "code",
- "execution_count": 19,
- "id": "2d2bb460-4e9d-4fc5-a767-e7c8b9a4d7fe",
+ "execution_count": 162,
+ "id": "eddaa043",
"metadata": {
"tags": []
},
@@ -589,8 +481,8 @@
},
{
"cell_type": "code",
- "execution_count": 20,
- "id": "e4db17f7-8896-4ea6-a732-2146400ff2da",
+ "execution_count": 163,
+ "id": "10087c91",
"metadata": {
"tags": []
},
@@ -610,7 +502,7 @@
},
{
"cell_type": "markdown",
- "id": "f8b54ccb-0064-4d06-b74b-fbf39f8c5faf",
+ "id": "540ee6ad",
"metadata": {},
"source": [
"### Custom Dispatch, Collection\n",
@@ -619,8 +511,8 @@
},
{
"cell_type": "code",
- "execution_count": 21,
- "id": "9ad68e0f-372e-4792-b4db-c2406f211113",
+ "execution_count": 164,
+ "id": "8e041270",
"metadata": {
"tags": []
},
@@ -631,8 +523,8 @@
},
{
"cell_type": "code",
- "execution_count": 22,
- "id": "74f91242-3176-4ea8-adce-104a58d21874",
+ "execution_count": 165,
+ "id": "43b5be31",
"metadata": {
"tags": []
},
@@ -674,8 +566,8 @@
},
{
"cell_type": "code",
- "execution_count": 23,
- "id": "00f386a8-c5f7-49c4-87ec-6e2ab3547bf1",
+ "execution_count": 166,
+ "id": "83ec6609",
"metadata": {
"tags": []
},
@@ -687,8 +579,8 @@
},
{
"cell_type": "code",
- "execution_count": 24,
- "id": "45ae935c-4f92-4c28-9e35-add439dcd2a0",
+ "execution_count": 167,
+ "id": "62c58d8a",
"metadata": {
"tags": []
},
@@ -701,50 +593,203 @@
"assert output_ref == 5"
]
},
+ {
+ "cell_type": "code",
+ "execution_count": 168,
+ "id": "14689353",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "8\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(gpu_accumulator_decorator.world_size)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 169,
+ "id": "2c80bbf4",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "# Shut down the Ray cluster\n",
+ "ray.shutdown()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a5c8151c",
+ "metadata": {},
+ "source": [
+ "## Chapter 4: NVMegatronRayWorkerGroup"
+ ]
+ },
{
"cell_type": "markdown",
- "id": "165ae826-7b20-4305-a5c6-d7f236eab410",
+ "id": "cd5680e9",
"metadata": {},
"source": [
- "## Chapter 4: MegatronRayWorkerGroup"
+ "Due to a Ray issue, we can only support max_colocate_count=1 in RayResourcePool for now. \n",
+ "This means that each GPU can only host one worker process.\n",
+ "We can support max_colocate_count > 1 once this pull request is applied: https://github.com/ray-project/ray/pull/44385"
]
},
{
"cell_type": "markdown",
- "id": "d0855a6b-41d7-41bb-95e6-d9395b175d81",
+ "id": "92724419",
"metadata": {},
"source": [
- "Finally, we implement a `MegatronRayWorkerGroup`, within which we create a Megatron and then run a tensor parallel (tp) split Llama mlp layer. Here, we use a complex dispatch mode, `Megatron_COMPUTE`. This dispatch mode assumes that user passes the data partitioned by DP dimension. The data is dispatched to all tp/pp ranks within the same dp group, and ultimately only collects output data from tp=0 and the last pp. In this way, for users that only write code on the driver, the Megatron behind the RPC becomes transparent."
+ "Therefore, we need to restart Ray and initialize a new resource_pool to demonstrate the **NVMegatronRayWorkerGroup**."
]
},
{
"cell_type": "code",
- "execution_count": 25,
- "id": "d36b4606-0e28-4fd6-8e04-498408ca161a",
+ "execution_count": null,
+ "id": "9b038538",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
- "from single_controller.ray import MegatronRayWorkerGroup\n",
- "from single_controller.megatron.worker import MegatronWorker\n",
- "from omegaconf import OmegaConf"
+ "# Build a local Ray cluster. The head node and worker nodes run on this machine\n",
+ "ray.init()"
]
},
{
- "cell_type": "code",
- "execution_count": 26,
- "id": "430fa41d-7874-49fd-b7f0-2c173a6c849b",
+ "cell_type": "markdown",
+ "id": "ebfd8798",
"metadata": {},
+ "source": [
+ "Finally, we implement an `NVMegatronRayWorkerGroup`, within which we initialize Megatron and then run a tensor-parallel (TP) split Llama MLP layer. Here, we use a complex dispatch mode, `Megatron_COMPUTE`. This dispatch mode assumes that the user passes in data partitioned along the DP dimension. The data is dispatched to all tp/pp ranks within the same dp group, and output is ultimately collected only from tp rank 0 on the last pp stage. In this way, the Megatron behind the RPC becomes transparent to users who only write code on the driver."
+ ]
+ },
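+ {
+ "cell_type": "markdown",
+ "id": "b3d1c7aa",
+ "metadata": {},
+ "source": [
+ "Before wiring this up, here is a minimal pure-Python sketch of the `Megatron_COMPUTE` semantics described above (an illustration under an assumed tp-major rank ordering, not the actual dispatch implementation): each DP shard is duplicated to every tp/pp rank of its dp group on the way in, and only the output of tp rank 0 on the last pp stage is kept on the way out."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c9e2f4d1",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "# sketch of Megatron_COMPUTE on plain lists; assumes ranks within a dp group\n",
+ "# are ordered tp-major: rank = dp * tp_size * pp_size + pp * tp_size + tp\n",
+ "def dispatch_megatron_compute(data, dp_size, tp_size, pp_size):\n",
+ "    # one element per dp rank in, one element per worker out\n",
+ "    assert len(data) == dp_size\n",
+ "    return [data[dp_rank] for dp_rank in range(dp_size) for _ in range(tp_size * pp_size)]\n",
+ "\n",
+ "def collect_megatron_compute(outputs, dp_size, tp_size, pp_size):\n",
+ "    # keep only the output of tp rank 0 on the last pp stage of each dp group\n",
+ "    group = tp_size * pp_size\n",
+ "    return [outputs[dp_rank * group + (pp_size - 1) * tp_size] for dp_rank in range(dp_size)]\n",
+ "\n",
+ "print(dispatch_megatron_compute(['batch0'], dp_size=1, tp_size=4, pp_size=1))\n",
+ "print(collect_megatron_compute(['out0', 'out1', 'out2', 'out3'], dp_size=1, tp_size=4, pp_size=1))"
+ ]
+ },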
+ {
+ "cell_type": "code",
+ "execution_count": 171,
+ "id": "5a032154",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "/opt/tiger/Megatron-LM\n",
+ "/opt/tiger/Megatron-LM/megatron/__init__.py\n"
+ ]
+ }
+ ],
+ "source": [
+ "import os\n",
+ "import sys\n",
+ "\n",
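+ "# make a local Megatron-LM checkout importable; the path below is specific\n",
+ "# to this environment\n",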
+ "current_pythonpath = os.environ.get('PYTHONPATH', '')\n",
+ "\n",
+ "new_path = '/opt/tiger/Megatron-LM'\n",
+ "\n",
+ "if current_pythonpath:\n",
+ " new_pythonpath = f'{new_path}:{current_pythonpath}'\n",
+ "else:\n",
+ " new_pythonpath = new_path\n",
+ "\n",
+ "os.environ['PYTHONPATH'] = new_pythonpath\n",
+ "\n",
+ "print(new_path)\n",
+ "sys.path.append(new_path)\n",
+ "\n",
+ "import megatron\n",
+ "print(megatron.__file__)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 172,
+ "id": "8c84cd5a",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "from single_controller.ray.decorator import register, Dispatch, Execute\n",
+ "from single_controller.ray.megatron import NVMegatronRayWorkerGroup\n",
+ "from single_controller.base.megatron.worker import MegatronWorker\n",
+ "from single_controller.ray.base import RayResourcePool, RayClassWithInitArgs, RayWorkerGroup\n",
+ "from omegaconf import OmegaConf\n",
+ "from megatron.core import parallel_state as mpu"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 173,
+ "id": "1b1debcc",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
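+ "# 4 GPUs on a single node; max_colocate_count=1 means exactly one worker process per GPU\n",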
+ "resource_pool = RayResourcePool([4], use_gpu=True, max_colocate_count=1)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 174,
+ "id": "bccbe081",
+ "metadata": {
+ "tags": []
+ },
"outputs": [],
"source": [
"@ray.remote\n",
"class MLPLayerWorker(MegatronWorker):\n",
+ " def __init__(self):\n",
+ " super().__init__()\n",
+ " rank = int(os.environ['LOCAL_RANK'])\n",
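+ "        # initialize torch.distributed (NCCL) and bind this worker to its local GPU\n",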
+ " torch.distributed.init_process_group(backend=\"nccl\")\n",
+ " torch.cuda.set_device(rank)\n",
+ "\n",
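+ "        # set up Megatron model-parallel state across the 4-GPU pool: tp=4, pp=1\n",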
+ " mpu.initialize_model_parallel(\n",
+ " tensor_model_parallel_size=4,\n",
+ " pipeline_model_parallel_size=1,\n",
+ " virtual_pipeline_model_parallel_size=None,\n",
+ " pipeline_model_parallel_split_rank=None,\n",
+ " use_sharp=False,\n",
+ " context_parallel_size=1,\n",
+ " expert_model_parallel_size=1,\n",
+ " nccl_communicator_config_path=None,\n",
+ " )\n",
+ " from megatron.core import tensor_parallel\n",
+ " tensor_parallel.model_parallel_cuda_manual_seed(10)\n",
+ "\n",
+ "\n",
" @register(Dispatch.ONE_TO_ALL)\n",
" def init_model(self, config):\n",
" from omegaconf import OmegaConf\n",
+ " from verl.utils.megatron_utils import init_model_parallel_config\n",
" from verl.models.llama.megatron.layers import ParallelLlamaMLP\n",
- " megatron_config = OmegaConf.create({'sequence_parallel_enabled': False})\n",
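+ "        # build the Megatron parallel config from the current parallel_state (mpu) topology\n",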
+ " megatron_config = OmegaConf.create({\n",
+ " 'sequence_parallel': False,\n",
+ " 'param_dtype': 'fp32',\n",
+ " 'tensor_model_parallel_size': mpu.get_tensor_model_parallel_world_size(),\n",
+ " 'pipeline_model_parallel_rank': mpu.get_pipeline_model_parallel_rank(),\n",
+ " 'pipeline_model_parallel_size': mpu.get_pipeline_model_parallel_world_size(),\n",
+ " 'virtual_pipeline_model_parallel_rank': mpu.get_virtual_pipeline_model_parallel_rank(),\n",
+ " 'virtual_pipeline_model_parallel_size': mpu.get_virtual_pipeline_model_parallel_world_size()\n",
+ " })\n",
+ "\n",
+ " megatron_config = init_model_parallel_config(megatron_config)\n",
" self.parallel_layer = ParallelLlamaMLP(config=config, megatron_config=megatron_config)\n",
" \n",
" @register(Dispatch.ONE_TO_ALL)\n",
@@ -763,27 +808,23 @@
},
{
"cell_type": "code",
- "execution_count": 27,
- "id": "2b749128-4190-454f-96a2-72477a7a77bd",
+ "execution_count": 175,
+ "id": "a655271d",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"layer_cls = RayClassWithInitArgs(cls=MLPLayerWorker)\n",
- "layer_worker_group = MegatronRayWorkerGroup(resource_pool=resource_pool,\n",
+ "layer_worker_group = NVMegatronRayWorkerGroup(resource_pool=resource_pool,\n",
" ray_cls_with_init=layer_cls,\n",
- " default_megatron_kwargs={\n",
- " 'tensor_model_parallel_size': 4,\n",
- " 'pipeline_model_parallel_size': 1,\n",
- " 'num_layers_per_virtual_pipeline_stage': None\n",
- " })\n"
+ " )\n"
]
},
{
"cell_type": "code",
- "execution_count": 28,
- "id": "f134f99e-678a-4b6f-8d70-fce2a5c9ac3e",
+ "execution_count": 176,
+ "id": "f105ebee",
"metadata": {
"tags": []
},
@@ -802,8 +843,8 @@
},
{
"cell_type": "code",
- "execution_count": 29,
- "id": "204ce866-868a-4b0b-900b-0dda6164a995",
+ "execution_count": 177,
+ "id": "38655091",
"metadata": {
"tags": []
},
@@ -825,8 +866,8 @@
},
{
"cell_type": "code",
- "execution_count": 30,
- "id": "d35a0216-7787-4ea7-86ab-c87f8e98f13d",
+ "execution_count": 178,
+ "id": "a026efca",
"metadata": {
"tags": []
},
@@ -837,8 +878,8 @@
},
{
"cell_type": "code",
- "execution_count": 31,
- "id": "0f84eaf8-adcd-4f89-a1ce-800db07df242",
+ "execution_count": 179,
+ "id": "f5fcaf13",
"metadata": {
"tags": []
},
@@ -849,7 +890,7 @@
"[None, None, None, None]"
]
},
- "execution_count": 31,
+ "execution_count": 179,
"metadata": {},
"output_type": "execute_result"
}
@@ -860,27 +901,8 @@
},
{
"cell_type": "code",
- "execution_count": 32,
- "id": "5203bb40-f831-4898-832c-5f65596ba0f6",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "torch.Size([2048, 16, 4096])\n"
- ]
- }
- ],
- "source": [
- "output = layer_worker_group.run_layer([x]) # This must be a list of size 1, ensuring that the input equals the data parallel (dp).\n",
- "print(output[0].shape)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 27,
- "id": "6a159884-7409-4dfe-a367-3c31090b09a1",
+ "execution_count": 180,
+ "id": "3f5cc9b4",
"metadata": {
"tags": []
},
@@ -889,18 +911,19 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "0\n"
+ "torch.Size([2048, 16, 4096])\n"
]
}
],
"source": [
- "print(gpu_accumulator_decorator.world_size)"
+ "output = layer_worker_group.run_layer([x])  # the input must be a list of length dp_size (1 here)\n",
+ "print(output[0].shape)"
]
},
{
"cell_type": "code",
- "execution_count": 33,
- "id": "806c36c3-a6a6-4c99-88fb-f2e2468b12db",
+ "execution_count": 181,
+ "id": "49792210",
"metadata": {
"tags": []
},
@@ -909,20 +932,11 @@
"# Shutdown ray cluster\n",
"ray.shutdown()"
]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "ea54e858-5221-493e-b53a-4765b918a0fa",
- "metadata": {},
- "outputs": [],
- "source": []
}
],
"metadata": {
- "fileId": "e862b5a9-13b2-48bf-9eda-96bd0c76e37c",
"kernelspec": {
- "display_name": "Python 3.9.2 64-bit",
+ "display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
@@ -937,11 +951,6 @@
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.2"
- },
- "vscode": {
- "interpreter": {
- "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6"
- }
}
},
"nbformat": 4,