diff --git a/examples/ray/tutorial.ipynb b/examples/ray/tutorial.ipynb index 4152331828a..9b8591a92b1 100644 --- a/examples/ray/tutorial.ipynb +++ b/examples/ray/tutorial.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "ed65145e-1dcc-4fad-85cd-8cc683fde91b", + "id": "0ddc582b", "metadata": {}, "source": [ "# VeRL Ray API Tutorial" @@ -10,46 +10,43 @@ }, { "cell_type": "markdown", - "id": "5f90dfa8-3285-41e4-ba44-12abcb76b3ce", - "metadata": { - "tags": [] - }, + "id": "71fe3b94", + "metadata": {}, "source": [ "## Chapter 1: Ray Basics" ] }, { "cell_type": "code", - "execution_count": 1, - "id": "cafc9be5-614b-4380-9b69-31525ea4e73a", - "metadata": {}, + "execution_count": 144, + "id": "1347d381", + "metadata": { + "tags": [] + }, "outputs": [], "source": [ - "import os\n", - "\n", - "# turn off Megatron timer\n", - "os.environ['MEGATRON_USE_CUDA_TIMER'] = '0'\n", - "os.environ['MEGATRON_START_PROCESS_TIMER'] = 'False'\n", - "os.environ['NCCL_DEBUG'] = 'WARN'" + "import os" ] }, { "cell_type": "code", - "execution_count": 2, - "id": "3fb65288-4410-43b4-9ffc-8538197ae039", + "execution_count": 145, + "id": "e75b9d44", "metadata": { "tags": [] }, "outputs": [], "source": [ "import ray\n", - "import torch" + "import torch\n", + "import warnings\n", + "warnings.filterwarnings('ignore')" ] }, { "cell_type": "code", - "execution_count": 3, - "id": "ca7f0778-f676-4f67-8c1f-9f6c8eee74e2", + "execution_count": 146, + "id": "2e90ae00", "metadata": { "tags": [] }, @@ -58,59 +55,53 @@ "name": "stderr", "output_type": "stream", "text": [ - "2024-01-22 14:52:10,398\tINFO worker.py:1655 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32m127.0.0.1:8265 \u001b[39m\u001b[22m\n" + "2024-11-01 17:27:19,132\tINFO worker.py:1752 -- Started a local Ray instance.\n" ] }, { "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "9cc9d2ccbdfb48918c8fd6cd13a0807a", + "version_major": 2, + "version_minor": 0 + }, "text/html": [ - "
\n", + "
\n", "
\n", - "

Ray

\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "\n", + "
Python version:3.9.2
Ray version: 2.3.0
Dashboard:http://127.0.0.1:8265
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Python version:3.9.2
Ray version:2.10.0
\n", "\n", - " \n", "
\n", "
\n" ], "text/plain": [ - "RayContext(dashboard_url='127.0.0.1:8265', python_version='3.9.2', ray_version='2.3.0', ray_commit='{{RAY_COMMIT_SHA}}', address_info={'node_ip_address': '10.122.229.29', 'raylet_ip_address': '10.122.229.29', 'redis_address': None, 'object_store_address': '/tmp/ray/session_2024-01-22_14-52-08_389091_1470202/sockets/plasma_store', 'raylet_socket_name': '/tmp/ray/session_2024-01-22_14-52-08_389091_1470202/sockets/raylet', 'webui_url': '127.0.0.1:8265', 'session_dir': '/tmp/ray/session_2024-01-22_14-52-08_389091_1470202', 'metrics_export_port': 46547, 'gcs_address': '10.122.229.29:57281', 'address': '10.122.229.29:57281', 'dashboard_agent_listen_port': 52365, 'node_id': 'ec81d96ec29985ce0b98441aabb46fc8dcb49b0e1dec1088f973977f'})" + "RayContext(dashboard_url='', python_version='3.9.2', ray_version='2.10.0', ray_commit='09abba26b5bf2707639bb637c208d062a47b46f6')" ] }, - "execution_count": 3, + "execution_count": 146, "metadata": {}, "output_type": "execute_result" }, @@ -118,111 +109,14 @@ "name": "stdout", "output_type": "stream", "text": [ - "\u001b[2m\u001b[36m(GPUAccumulator pid=1489193)\u001b[0m rank 0, value: tensor([1.], device='cuda:0')\n", - "\u001b[2m\u001b[36m(GPUAccumulator pid=1489357)\u001b[0m rank 2, value: tensor([3.], device='cuda:0')\n", - "\u001b[2m\u001b[36m(GPUAccumulator pid=1489358)\u001b[0m rank 3, value: tensor([4.], device='cuda:0')\n", - "\u001b[2m\u001b[36m(GPUAccumulator pid=1489356)\u001b[0m rank 1, value: tensor([2.], device='cuda:0')\n", - "\u001b[2m\u001b[36m(GPUAccumulator pid=1489837)\u001b[0m rank 0, value: tensor([2.], device='cuda:0')\n", - "\u001b[2m\u001b[36m(GPUAccumulator pid=1489999)\u001b[0m rank 3, value: tensor([5.], device='cuda:0')\n", - "\u001b[2m\u001b[36m(GPUAccumulator pid=1489998)\u001b[0m rank 2, value: tensor([4.], device='cuda:0')\n", - "\u001b[2m\u001b[36m(GPUAccumulator pid=1489997)\u001b[0m rank 1, value: tensor([3.], device='cuda:0')\n", - "\u001b[2m\u001b[36m(GPUAccumulator pid=1490000)\u001b[0m rank 0, value: tensor([3.], device='cuda:0')\n", - "\u001b[2m\u001b[36m(GPUAccumulator pid=1490633)\u001b[0m rank 3, value: tensor([6.], device='cuda:0')\n", - "\u001b[2m\u001b[36m(GPUAccumulator pid=1490635)\u001b[0m rank 5, value: tensor([8.], device='cuda:0')\n", - "\u001b[2m\u001b[36m(GPUAccumulator pid=1490636)\u001b[0m rank 6, value: tensor([9.], device='cuda:0')\n", - "\u001b[2m\u001b[36m(GPUAccumulator pid=1490631)\u001b[0m rank 1, value: tensor([4.], device='cuda:0')\n", - "\u001b[2m\u001b[36m(GPUAccumulator pid=1490637)\u001b[0m rank 7, value: tensor([10.], device='cuda:0')\n", - "\u001b[2m\u001b[36m(GPUAccumulator pid=1490634)\u001b[0m rank 4, value: tensor([7.], device='cuda:0')\n", - "\u001b[2m\u001b[36m(GPUAccumulator pid=1490632)\u001b[0m rank 2, value: tensor([5.], device='cuda:0')\n", - "\u001b[2m\u001b[36m(GPUAccumulatorDecorator pid=1491761)\u001b[0m 10\n", - "\u001b[2m\u001b[36m(GPUAccumulatorDecorator pid=1491761)\u001b[0m rank 0, value: tensor([10.], device='cuda:0')\n", - "\u001b[2m\u001b[36m(GPUAccumulatorDecorator pid=1491923)\u001b[0m 10\n", - "\u001b[2m\u001b[36m(GPUAccumulatorDecorator pid=1491922)\u001b[0m 10\n", - "\u001b[2m\u001b[36m(GPUAccumulatorDecorator pid=1491921)\u001b[0m 10\n", - "\u001b[2m\u001b[36m(GPUAccumulatorDecorator pid=1491924)\u001b[0m 10\n", - "\u001b[2m\u001b[36m(GPUAccumulatorDecorator pid=1491919)\u001b[0m 10\n", - "\u001b[2m\u001b[36m(GPUAccumulatorDecorator pid=1491925)\u001b[0m 10\n", - "\u001b[2m\u001b[36m(GPUAccumulatorDecorator pid=1491920)\u001b[0m 10\n", - 
"\u001b[2m\u001b[36m(GPUAccumulatorDecorator pid=1491923)\u001b[0m rank 5, value: tensor([15.], device='cuda:0')\n", - "\u001b[2m\u001b[36m(GPUAccumulatorDecorator pid=1491921)\u001b[0m rank 3, value: tensor([13.], device='cuda:0')\n", - "\u001b[2m\u001b[36m(GPUAccumulatorDecorator pid=1491924)\u001b[0m rank 7, value: tensor([17.], device='cuda:0')\n", - "\u001b[2m\u001b[36m(GPUAccumulatorDecorator pid=1491925)\u001b[0m rank 6, value: tensor([16.], device='cuda:0')\n", - "\u001b[2m\u001b[36m(GPUAccumulatorDecorator pid=1491920)\u001b[0m rank 2, value: tensor([12.], device='cuda:0')\n", - "\u001b[2m\u001b[36m(GPUAccumulatorDecorator pid=1491919)\u001b[0m rank 1, value: tensor([11.], device='cuda:0')\n", - "\u001b[2m\u001b[36m(GPUAccumulatorDecorator pid=1491922)\u001b[0m rank 4, value: tensor([14.], device='cuda:0')\n", - "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m WARNING: overriding default argument timing_log_level:2 with timing_log_level:0\n", - "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m WARNING: overriding default argument tensor_model_parallel_size:1 with tensor_model_parallel_size:4\n", - "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m WARNING: overriding default argument use_distributed_optimizer:False with use_distributed_optimizer:True\n", - "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m WARNING: overriding default argument weight_decay:0.01 with weight_decay:0.0\n", - "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m WARNING: overriding default argument sgd_momentum:0.9 with sgd_momentum:0.0\n", - "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m WARNING: overriding default argument bf16:False with bf16:True\n", - "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m WARNING: overriding default argument clip_grad:1.0 with clip_grad:0.0\n", - "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m WARNING: overriding default argument sequence_parallel:False with sequence_parallel:True\n", - "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m using world size: 4, data-parallel-size: 1, tensor-model-parallel size: 4, pipeline-model-parallel size: 1 \n", - "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m setting global batch size to 1\n", - "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m accumulate gradients in fp32 and all-reduce gradients in fp32 for bfloat16 data type. 
\n", - "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m using torch.bfloat16 for parameters ...\n", - "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m setting number of micro-batches to constant 1\n", - "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m [2024-01-22 14:53:47] > initializing torch distributed ...\n", - "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m [rank0]\t[group1]\tnew_group dp\thigh_stream False\tranks: [0]\n", - "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m [rank0]\t[group2]\tnew_group dp\thigh_stream False\tranks: [1]\n", - "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m [rank0]\t[group3]\tnew_group dp\thigh_stream False\tranks: [2]\n", - "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m [rank0]\t[group4]\tnew_group dp\thigh_stream False\tranks: [3]\n", - "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m [rank0]\t[group5]\tnew_group mp\thigh_stream False\tranks: [0, 1, 2, 3]\n", - "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m [rank0]\t[group6]\tnew_group tp\thigh_stream False\tranks: [0, 1, 2, 3]\n", - "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m [rank0]\t[group8]\tnew_group pp\thigh_stream False\tranks: [0]\n", - "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m [rank0]\t[group9]\tnew_group emb\thigh_stream False\tranks: [0]\n", - "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m [rank0]\t[group10]\tnew_group posemb\thigh_stream False\tranks: [0]\n", - "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m [rank0]\t[group11]\tnew_group pp\thigh_stream False\tranks: [1]\n", - "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m [rank0]\t[group12]\tnew_group emb\thigh_stream False\tranks: [1]\n", - "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m [rank0]\t[group13]\tnew_group posemb\thigh_stream False\tranks: [1]\n", - "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m [rank0]\t[group14]\tnew_group pp\thigh_stream False\tranks: [2]\n", - "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m [rank0]\t[group15]\tnew_group emb\thigh_stream False\tranks: [2]\n", - "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m [rank0]\t[group16]\tnew_group posemb\thigh_stream False\tranks: [2]\n", - "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m [rank0]\t[group17]\tnew_group pp\thigh_stream False\tranks: [3]\n", - "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m [rank0]\t[group18]\tnew_group emb\thigh_stream False\tranks: [3]\n", - "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m [rank0]\t[group19]\tnew_group posemb\thigh_stream False\tranks: [3]\n", - "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m NCCL version 2.18.6+cuda12.1\n", - "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m this torch version does not have hires nccl profiling. nccl profiling is not enabled this time. 
this is only a kindly notice, not a error message\n", - "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m [2024-01-22 14:53:52] > initialized tensor model parallel with size 4\n", - "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m [2024-01-22 14:53:52] > initialized pipeline model parallel with size 1\n", - "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m > setting random seeds to 1234 ...\n", - "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m > compiling dataset index builder ...\n", - "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m already compiled, skip compiling\n", - "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m >>> done with dataset index builder. Compilation time: 0.005 seconds\n", - "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m > compiling and loading fused kernels ...\n", - "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m will compile object fused_mix_prec_layer_norm_cuda with extra cuda flags ['-maxrregcount=50', '-U__CUDA_NO_HALF_OPERATORS__', '-U__CUDA_NO_HALF_CONVERSIONS__']\n", - "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m [2024-01-22 14:53:52] >>> done with compiling and loading fused kernels. Compilation time: 0.039 seconds\n", - "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493654)\u001b[0m this torch version does not have hires nccl profiling. nccl profiling is not enabled this time. this is only a kindly notice, not a error message\n", - "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493654)\u001b[0m > compiling and loading fused kernels ...\n", - "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493654)\u001b[0m will compile object fused_mix_prec_layer_norm_cuda with extra cuda flags ['-maxrregcount=50', '-U__CUDA_NO_HALF_OPERATORS__', '-U__CUDA_NO_HALF_CONVERSIONS__']\n", - "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493654)\u001b[0m [2024-01-22 14:53:52] >>> done with compiling and loading fused kernels. Compilation time: 0.044 seconds\n", - "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493653)\u001b[0m this torch version does not have hires nccl profiling. nccl profiling is not enabled this time. this is only a kindly notice, not a error message\n", - "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493653)\u001b[0m > compiling and loading fused kernels ...\n", - "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493653)\u001b[0m will compile object fused_mix_prec_layer_norm_cuda with extra cuda flags ['-maxrregcount=50', '-U__CUDA_NO_HALF_OPERATORS__', '-U__CUDA_NO_HALF_CONVERSIONS__']\n", - "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493653)\u001b[0m [2024-01-22 14:53:52] >>> done with compiling and loading fused kernels. Compilation time: 0.044 seconds\n", - "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493652)\u001b[0m this torch version does not have hires nccl profiling. nccl profiling is not enabled this time. this is only a kindly notice, not a error message\n", - "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493652)\u001b[0m > compiling and loading fused kernels ...\n", - "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493652)\u001b[0m will compile object fused_mix_prec_layer_norm_cuda with extra cuda flags ['-maxrregcount=50', '-U__CUDA_NO_HALF_OPERATORS__', '-U__CUDA_NO_HALF_CONVERSIONS__']\n", - "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493652)\u001b[0m [2024-01-22 14:53:52] >>> done with compiling and loading fused kernels. 
Compilation time: 0.045 seconds\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m No modifications detected for re-loaded extension module fused_mix_prec_layer_norm_cuda, skipping build step...\n", - "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m Loading extension module fused_mix_prec_layer_norm_cuda...\n", - "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493654)\u001b[0m No modifications detected for re-loaded extension module fused_mix_prec_layer_norm_cuda, skipping build step...\n", - "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493654)\u001b[0m Loading extension module fused_mix_prec_layer_norm_cuda...\n", - "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493653)\u001b[0m No modifications detected for re-loaded extension module fused_mix_prec_layer_norm_cuda, skipping build step...\n", - "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493653)\u001b[0m Loading extension module fused_mix_prec_layer_norm_cuda...\n", - "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493652)\u001b[0m No modifications detected for re-loaded extension module fused_mix_prec_layer_norm_cuda, skipping build step...\n", - "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493652)\u001b[0m Loading extension module fused_mix_prec_layer_norm_cuda...\n", - "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493490)\u001b[0m Using `is_flash_attn_available` is deprecated and will be removed in v4.38. Please use `is_flash_attn_2_available` instead.\n", - "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493654)\u001b[0m Using `is_flash_attn_available` is deprecated and will be removed in v4.38. Please use `is_flash_attn_2_available` instead.\n", - "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493653)\u001b[0m Using `is_flash_attn_available` is deprecated and will be removed in v4.38. Please use `is_flash_attn_2_available` instead.\n", - "\u001b[2m\u001b[36m(MLPLayerWorker pid=1493652)\u001b[0m Using `is_flash_attn_available` is deprecated and will be removed in v4.38. Please use `is_flash_attn_2_available` instead.\n" + "\u001b[36m(GPUAccumulator pid=224400)\u001b[0m rank 0, value: tensor([1.], device='cuda:0')\n", + "\u001b[36m(GPUAccumulator pid=225234)\u001b[0m rank 2, value: tensor([3.], device='cuda:0')\n", + "\u001b[36m(GPUAccumulator pid=225607)\u001b[0m rank 0, value: tensor([2.], device='cuda:0')\n", + "\u001b[36m(GPUAccumulator pid=226423)\u001b[0m rank 1, value: tensor([3.], device='cuda:0')\n", + "\u001b[36m(GPUAccumulator pid=226857)\u001b[0m rank 3, value: tensor([6.], device='cuda:0')\n", + "\u001b[36m(GPUAccumulatorDecorator pid=227475)\u001b[0m 10\n", + "\u001b[36m(GPUAccumulatorDecorator pid=227475)\u001b[0m rank 0, value: tensor([10.], device='cuda:0')\n", + "\u001b[36m(GPUAccumulatorDecorator pid=227655)\u001b[0m rank 1, value: tensor([11.], device='cuda:0')\n" ] } ], @@ -233,18 +127,16 @@ }, { "cell_type": "markdown", - "id": "a0d35eb2-cf28-411a-b39f-f45e97c2edba", - "metadata": { - "tags": [] - }, + "id": "a127e4e4", + "metadata": {}, "source": [ "Implement an Accumulator class." 
] }, { "cell_type": "code", - "execution_count": 4, - "id": "86c98e9c-26a5-44d0-bbe9-064f8809708c", + "execution_count": 147, + "id": "20e7b9a3", "metadata": { "tags": [] }, @@ -264,8 +156,8 @@ }, { "cell_type": "code", - "execution_count": 5, - "id": "9d5aa406-23c3-4270-a6f8-d4af20769ed9", + "execution_count": 148, + "id": "3b80098c", "metadata": { "tags": [] }, @@ -277,8 +169,8 @@ }, { "cell_type": "code", - "execution_count": 6, - "id": "56850d3a-3803-4395-a73e-a27e99dc6905", + "execution_count": 149, + "id": "b14b1009", "metadata": { "tags": [] }, @@ -300,8 +192,8 @@ }, { "cell_type": "code", - "execution_count": 7, - "id": "932807f1-792b-42af-aa44-6a6202718919", + "execution_count": 150, + "id": "513a84b3", "metadata": { "tags": [] }, @@ -323,7 +215,7 @@ }, { "cell_type": "markdown", - "id": "ca31e450-e02c-44d0-86a1-a3f9eeadd311", + "id": "3c332fe0", "metadata": {}, "source": [ "## Chapter 2: Resource Pool and RayWorkerGroup\n", @@ -333,8 +225,8 @@ }, { "cell_type": "code", - "execution_count": 8, - "id": "84727b3d-1351-4550-9009-48526d18dc2c", + "execution_count": 151, + "id": "04229afb", "metadata": { "tags": [] }, @@ -346,8 +238,8 @@ }, { "cell_type": "code", - "execution_count": 9, - "id": "aab5eeb7-6485-4911-94cc-3f5ca52b6634", + "execution_count": 152, + "id": "0d0dbd58", "metadata": { "tags": [] }, @@ -358,8 +250,8 @@ }, { "cell_type": "code", - "execution_count": 10, - "id": "54e4d531-ca50-43e9-ab79-d24bb66ffe0e", + "execution_count": 153, + "id": "68f6838a", "metadata": { "tags": [] }, @@ -381,8 +273,8 @@ }, { "cell_type": "code", - "execution_count": 11, - "id": "73e0658b-a6a7-4a86-a53f-5442c8cd56fe", + "execution_count": 154, + "id": "23aad8fe", "metadata": { "tags": [] }, @@ -404,7 +296,7 @@ }, { "cell_type": "markdown", - "id": "e36f73e8-4f38-459f-a09b-f6ffb95fd42b", + "id": "e6705284", "metadata": {}, "source": [ "The principle of parameter passing: The input parameter is a list of length world_size, where each element in the list is dispatched respectively to each worker in the RayWorkerGroup. \n", @@ -413,7 +305,7 @@ }, { "cell_type": "markdown", - "id": "3873383b-e89d-4283-ae3e-af367e6160a5", + "id": "d25c2412", "metadata": {}, "source": [ "### GPU Resource Sharing" @@ -421,18 +313,16 @@ }, { "cell_type": "markdown", - "id": "cf62933e-143e-4a15-90c3-3d0d0568d89c", - "metadata": { - "tags": [] - }, + "id": "f74f6d24", + "metadata": {}, "source": [ "RayWorkerGroups mapped to the same resource pool share the GPU. In this example, we implement three resource pools: the first occupies 4 GPUs, the second also occupies 4 GPUs, and the last occupies all 8 GPUs. Among them, the first resource pool reuses the resource pool mentioned above." 
] }, { "cell_type": "code", - "execution_count": 12, - "id": "100e75f1-6f2b-44b6-b4e1-f703eae8e5bf", + "execution_count": 155, + "id": "49f9c06f", "metadata": { "tags": [] }, @@ -445,8 +335,8 @@ }, { "cell_type": "code", - "execution_count": 13, - "id": "59393509-037b-475a-800a-5e6de9ea7b66", + "execution_count": 156, + "id": "05c2e305", "metadata": { "tags": [] }, @@ -459,8 +349,8 @@ }, { "cell_type": "code", - "execution_count": 14, - "id": "558dc5bf-75aa-4675-b8de-99944e088522", + "execution_count": 157, + "id": "6b9b13f4", "metadata": { "tags": [] }, @@ -481,8 +371,8 @@ }, { "cell_type": "code", - "execution_count": 15, - "id": "30228667-f8ef-448c-a2cc-44241ede86d9", + "execution_count": 158, + "id": "d856d030", "metadata": { "tags": [] }, @@ -503,8 +393,8 @@ }, { "cell_type": "code", - "execution_count": 16, - "id": "041329de-0e27-4e31-ac11-f0dd54f4f4b3", + "execution_count": 159, + "id": "33a4628c", "metadata": { "tags": [] }, @@ -523,7 +413,7 @@ }, { "cell_type": "markdown", - "id": "8cb9fc59-6035-4b72-9863-67bd4d48f3fe", + "id": "3df19d13", "metadata": {}, "source": [ "## Chapter 3: Data Dispatch, Execution and Collection" @@ -531,7 +421,7 @@ }, { "cell_type": "markdown", - "id": "0e1fedea-16d1-4dec-b269-c402747e11e8", + "id": "acb22d9d", "metadata": {}, "source": [ "In the above example, we used the `execute_all_sync` function in the RayWorkerGroup to dispatch data from the driver to each worker. This is very inconvenient for coding. \n", @@ -540,8 +430,8 @@ }, { "cell_type": "code", - "execution_count": 17, - "id": "c70d1b33-030a-4681-aeb2-16ab48b6445b", + "execution_count": 160, + "id": "35237432", "metadata": { "tags": [] }, @@ -552,9 +442,11 @@ }, { "cell_type": "code", - "execution_count": 18, - "id": "bb6b76c1-8caf-4749-b20d-c3f842751aa4", - "metadata": {}, + "execution_count": 161, + "id": "88b8ba3b", + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "@ray.remote\n", @@ -576,8 +468,8 @@ }, { "cell_type": "code", - "execution_count": 19, - "id": "2d2bb460-4e9d-4fc5-a767-e7c8b9a4d7fe", + "execution_count": 162, + "id": "eddaa043", "metadata": { "tags": [] }, @@ -589,8 +481,8 @@ }, { "cell_type": "code", - "execution_count": 20, - "id": "e4db17f7-8896-4ea6-a732-2146400ff2da", + "execution_count": 163, + "id": "10087c91", "metadata": { "tags": [] }, @@ -610,7 +502,7 @@ }, { "cell_type": "markdown", - "id": "f8b54ccb-0064-4d06-b74b-fbf39f8c5faf", + "id": "540ee6ad", "metadata": {}, "source": [ "### Custom Dispatch, Collection\n", @@ -619,8 +511,8 @@ }, { "cell_type": "code", - "execution_count": 21, - "id": "9ad68e0f-372e-4792-b4db-c2406f211113", + "execution_count": 164, + "id": "8e041270", "metadata": { "tags": [] }, @@ -631,8 +523,8 @@ }, { "cell_type": "code", - "execution_count": 22, - "id": "74f91242-3176-4ea8-adce-104a58d21874", + "execution_count": 165, + "id": "43b5be31", "metadata": { "tags": [] }, @@ -674,8 +566,8 @@ }, { "cell_type": "code", - "execution_count": 23, - "id": "00f386a8-c5f7-49c4-87ec-6e2ab3547bf1", + "execution_count": 166, + "id": "83ec6609", "metadata": { "tags": [] }, @@ -687,8 +579,8 @@ }, { "cell_type": "code", - "execution_count": 24, - "id": "45ae935c-4f92-4c28-9e35-add439dcd2a0", + "execution_count": 167, + "id": "62c58d8a", "metadata": { "tags": [] }, @@ -701,50 +593,203 @@ "assert output_ref == 5" ] }, + { + "cell_type": "code", + "execution_count": 168, + "id": "14689353", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "8\n" + ] + } + ], + "source": [ + 
"print(gpu_accumulator_decorator.world_size)" + ] + }, + { + "cell_type": "code", + "execution_count": 169, + "id": "2c80bbf4", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Shutdown ray cluster\n", + "ray.shutdown()" + ] + }, + { + "cell_type": "markdown", + "id": "a5c8151c", + "metadata": {}, + "source": [ + "## Chapter 4: NVMegatronRayWorkerGroup" + ] + }, { "cell_type": "markdown", - "id": "165ae826-7b20-4305-a5c6-d7f236eab410", + "id": "cd5680e9", "metadata": {}, "source": [ - "## Chapter 4: MegatronRayWorkerGroup" + "Due to the Ray issue, we can only support max_colocate_count=1 in RayResourcePool for now. \n", + "This means that each GPU can only have one process.\n", + "We can support max_colocate > 1 when applying this pull request: https://github.com/ray-project/ray/pull/44385" ] }, { "cell_type": "markdown", - "id": "d0855a6b-41d7-41bb-95e6-d9395b175d81", + "id": "92724419", "metadata": {}, "source": [ - "Finally, we implement a `MegatronRayWorkerGroup`, within which we create a Megatron and then run a tensor parallel (tp) split Llama mlp layer. Here, we use a complex dispatch mode, `Megatron_COMPUTE`. This dispatch mode assumes that user passes the data partitioned by DP dimension. The data is dispatched to all tp/pp ranks within the same dp group, and ultimately only collects output data from tp=0 and the last pp. In this way, for users that only write code on the driver, the Megatron behind the RPC becomes transparent." + "Therefore, we need to restart the ray and initialize a new resource_pool to demonstrate the **NVMegatronRayWorkerGroup**" ] }, { "cell_type": "code", - "execution_count": 25, - "id": "d36b4606-0e28-4fd6-8e04-498408ca161a", + "execution_count": null, + "id": "9b038538", "metadata": { "tags": [] }, "outputs": [], "source": [ - "from single_controller.ray import MegatronRayWorkerGroup\n", - "from single_controller.megatron.worker import MegatronWorker\n", - "from omegaconf import OmegaConf" + "# Build a local ray cluster. The head node and worker node are on this machine\n", + "ray.init()" ] }, { - "cell_type": "code", - "execution_count": 26, - "id": "430fa41d-7874-49fd-b7f0-2c173a6c849b", + "cell_type": "markdown", + "id": "ebfd8798", "metadata": {}, + "source": [ + "Finally, we implement a `NVMegatronRayWorkerGroup`, within which we create a Megatron and then run a tensor parallel (tp) split Llama mlp layer. Here, we use a complex dispatch mode, `Megatron_COMPUTE`. This dispatch mode assumes that user passes the data partitioned by DP dimension. The data is dispatched to all tp/pp ranks within the same dp group, and ultimately only collects output data from tp=0 and the last pp. In this way, for users that only write code on the driver, the Megatron behind the RPC becomes transparent." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 171, + "id": "5a032154", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/opt/tiger/Megatron-LM\n", + "/opt/tiger/Megatron-LM/megatron/__init__.py\n" + ] + } + ], + "source": [ + "import os\n", + "import sys\n", + "import site\n", + "\n", + "\n", + "current_pythonpath = os.environ.get('PYTHONPATH', '')\n", + "\n", + "new_path = '/opt/tiger/Megatron-LM'\n", + "\n", + "if current_pythonpath:\n", + " new_pythonpath = f'{new_path}:{current_pythonpath}'\n", + "else:\n", + " new_pythonpath = new_path\n", + "\n", + "os.environ['PYTHONPATH'] = new_pythonpath\n", + "\n", + "print(new_path)\n", + "sys.path.append(new_path)\n", + "\n", + "import megatron\n", + "print(megatron.__file__)" + ] + }, + { + "cell_type": "code", + "execution_count": 172, + "id": "8c84cd5a", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from single_controller.ray.decorator import register, Dispatch, Execute\n", + "from single_controller.ray.megatron import NVMegatronRayWorkerGroup\n", + "from single_controller.base.megatron.worker import MegatronWorker\n", + "from single_controller.ray.base import RayResourcePool, RayClassWithInitArgs, RayWorkerGroup\n", + "from omegaconf import OmegaConf\n", + "from megatron.core import parallel_state as mpu" + ] + }, + { + "cell_type": "code", + "execution_count": 173, + "id": "1b1debcc", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "resource_pool = RayResourcePool([4], use_gpu=True, max_colocate_count=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 174, + "id": "bccbe081", + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "@ray.remote\n", "class MLPLayerWorker(MegatronWorker):\n", + " def __init__(self):\n", + " super().__init__()\n", + " rank = int(os.environ['LOCAL_RANK'])\n", + " torch.distributed.init_process_group(backend=\"nccl\")\n", + " torch.cuda.set_device(rank)\n", + "\n", + " mpu.initialize_model_parallel(\n", + " tensor_model_parallel_size=4,\n", + " pipeline_model_parallel_size=1,\n", + " virtual_pipeline_model_parallel_size=None,\n", + " pipeline_model_parallel_split_rank=None,\n", + " use_sharp=False,\n", + " context_parallel_size=1,\n", + " expert_model_parallel_size=1,\n", + " nccl_communicator_config_path=None,\n", + " )\n", + " from megatron.core import tensor_parallel\n", + " tensor_parallel.model_parallel_cuda_manual_seed(10)\n", + "\n", + "\n", " @register(Dispatch.ONE_TO_ALL)\n", " def init_model(self, config):\n", " from omegaconf import OmegaConf\n", + " from verl.utils.megatron_utils import init_model_parallel_config\n", " from verl.models.llama.megatron.layers import ParallelLlamaMLP\n", - " megatron_config = OmegaConf.create({'sequence_parallel_enabled': False})\n", + " megatron_config = OmegaConf.create({\n", + " 'sequence_parallel': False,\n", + " 'param_dtype': 'fp32',\n", + " 'tensor_model_parallel_size': mpu.get_tensor_model_parallel_world_size(),\n", + " 'pipeline_model_parallel_rank': mpu.get_pipeline_model_parallel_rank(),\n", + " 'pipeline_model_parallel_size': mpu.get_pipeline_model_parallel_world_size(),\n", + " 'virtual_pipeline_model_parallel_rank': mpu.get_virtual_pipeline_model_parallel_rank(),\n", + " 'virtual_pipeline_model_parallel_size': mpu.get_virtual_pipeline_model_parallel_world_size()\n", + " })\n", + "\n", + " megatron_config = init_model_parallel_config(megatron_config)\n", " self.parallel_layer = ParallelLlamaMLP(config=config, 
megatron_config=megatron_config)\n",
    "    \n",
    "    @register(Dispatch.ONE_TO_ALL)\n",
@@ -763,27 +808,23 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 27,
-   "id": "2b749128-4190-454f-96a2-72477a7a77bd",
+   "execution_count": 175,
+   "id": "a655271d",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "layer_cls = RayClassWithInitArgs(cls=MLPLayerWorker)\n",
-    "layer_worker_group = MegatronRayWorkerGroup(resource_pool=resource_pool,\n",
+    "layer_worker_group = NVMegatronRayWorkerGroup(resource_pool=resource_pool,\n",
    "                                            ray_cls_with_init=layer_cls,\n",
-    "                                            default_megatron_kwargs={\n",
-    "                                                'tensor_model_parallel_size': 4,\n",
-    "                                                'pipeline_model_parallel_size': 1,\n",
-    "                                                'num_layers_per_virtual_pipeline_stage': None\n",
-    "                                            })\n"
+    "                                            )\n"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 28,
-   "id": "f134f99e-678a-4b6f-8d70-fce2a5c9ac3e",
+   "execution_count": 176,
+   "id": "f105ebee",
   "metadata": {
    "tags": []
   },
@@ -825,8 +866,8 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 29,
-   "id": "204ce866-868a-4b0b-900b-0dda6164a995",
+   "execution_count": 177,
+   "id": "38655091",
   "metadata": {
    "tags": []
   },
@@ -837,8 +878,8 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 30,
-   "id": "d35a0216-7787-4ea7-86ab-c87f8e98f13d",
+   "execution_count": 178,
+   "id": "a026efca",
   "metadata": {
    "tags": []
   },
@@ -849,7 +890,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 31,
-   "id": "0f84eaf8-adcd-4f89-a1ce-800db07df242",
+   "execution_count": 179,
+   "id": "f5fcaf13",
   "metadata": {
    "tags": []
   },
@@ -860,27 +901,8 @@
     "[None, None, None, None]"
    ]
   },
-   "execution_count": 31,
+   "execution_count": 179,
   "metadata": {},
   "output_type": "execute_result"
  }
@@ -889,18 +911,19 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 32,
-   "id": "5203bb40-f831-4898-832c-5f65596ba0f6",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "torch.Size([2048, 16, 4096])\n"
-     ]
-    }
-   ],
-   "source": [
-    "output = layer_worker_group.run_layer([x]) # This must be a list of size 1, ensuring that the input equals the data parallel (dp).\n",
-    "print(output[0].shape)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 27,
-   "id": "6a159884-7409-4dfe-a367-3c31090b09a1",
+   "execution_count": 180,
+   "id": "3f5cc9b4",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "0\n"
+      "torch.Size([2048, 16, 4096])\n"
     ]
    }
   ],
   "source": [
-    "print(gpu_accumulator_decorator.world_size)"
+    "output = layer_worker_group.run_layer([x])  # The input must be a list of length 1, matching the data-parallel (dp) size.\n",
+    "print(output[0].shape)"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 33,
-   "id": "806c36c3-a6a6-4c99-88fb-f2e2468b12db",
+   "execution_count": 181,
+   "id": "49792210",
   "metadata": {
    "tags": []
   },
@@ -909,20 +932,11 @@
    "# Shutdown ray cluster\n",
    "ray.shutdown()"
   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "ea54e858-5221-493e-b53a-4765b918a0fa",
-   "metadata": {},
-   "outputs": [],
-   "source": []
  }
 ],
 "metadata": {
-  "fileId": "e862b5a9-13b2-48bf-9eda-96bd0c76e37c",
  "kernelspec": {
-   "display_name": "Python 3.9.2 64-bit",
+   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
@@ -937,11 +951,6 @@
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.2"
-  },
-  "vscode": {
-   "interpreter": {
-    "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6"
-   }
  }
 },
 "nbformat": 4,