diff --git a/doc/source/train/examples/lightning/vicuna_13b_lightning_deepspeed_finetune.ipynb b/doc/source/train/examples/lightning/vicuna_13b_lightning_deepspeed_finetune.ipynb index 2a2703353a68..39b96a47020e 100644 --- a/doc/source/train/examples/lightning/vicuna_13b_lightning_deepspeed_finetune.ipynb +++ b/doc/source/train/examples/lightning/vicuna_13b_lightning_deepspeed_finetune.ipynb @@ -84,6 +84,16 @@ "```" ] }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "os.environ[\"RAY_TRAIN_V2_ENABLED\"] = \"1\"" + ] + }, { "cell_type": "code", "execution_count": null, @@ -138,22 +148,16 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, "outputs": [ { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "535afe3e183b4cdfa61c39cbae788608", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/2 [00:00\n", - "
\n", - "
\n", - "

Tune Status

\n", - " \n", - "\n", - "\n", - "\n", - "\n", - "\n", - "
Current time:2023-06-30 18:21:59
Running for: 00:42:22.75
Memory: 10.7/249.1 GiB
\n", - "
\n", - "
\n", - "
\n", - "

System Info

\n", - " Using FIFO scheduling algorithm.
Logical resource usage: 241.0/304 CPUs, 16.0/16 GPUs (0.0/16.0 accelerator_type:A10G)\n", - "
\n", - " \n", - "
\n", - "
\n", - "
\n", - "

Trial Status

\n", - " \n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "
Trial name status loc iter total time (s) train_loss epoch step
LightningTrainer_c1544_00000TERMINATED10.0.55.20:134103 1 2473.94 0.523438 0 29
\n", - "
\n", - "\n", - "\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[36m(TrainController pid=17559)\u001b[0m [State Transition] INITIALIZING -> SCHEDULING.\n", + "\u001b[36m(TrainController pid=17559)\u001b[0m Attempting to start training worker group of size 16 with the following resources: [{'CPU': 15, 'GPU': 1}] * 16\n", + "\u001b[36m(RayTrainWorker pid=17770)\u001b[0m Setting up process group for: env:// [rank=0, world_size=16]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[36m(RayTrainWorker pid=17770)\u001b[0m [2025-10-15 15:51:07,627] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[36m(RayTrainWorker pid=4048, ip=10.0.130.188)\u001b[0m 2025-10-15 15:51:09.458702: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n", + "\u001b[36m(RayTrainWorker pid=4048, ip=10.0.130.188)\u001b[0m 2025-10-15 15:51:09.458741: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n", + "\u001b[36m(RayTrainWorker pid=4048, ip=10.0.130.188)\u001b[0m 2025-10-15 15:51:09.460080: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n", + "\u001b[36m(RayTrainWorker pid=4048, ip=10.0.130.188)\u001b[0m 2025-10-15 15:51:09.467398: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n", + "\u001b[36m(RayTrainWorker pid=4048, ip=10.0.130.188)\u001b[0m To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", + "\u001b[36m(RayTrainWorker pid=4048, ip=10.0.130.188)\u001b[0m 2025-10-15 15:51:10.359839: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n", + "\u001b[36m(RayTrainWorker pid=4048, ip=10.0.130.188)\u001b[0m INFO: initializing deepspeed distributed: GLOBAL_RANK: 5, MEMBER: 6/16\n", + "\u001b[36m(RayTrainWorker pid=4048, ip=10.0.130.188)\u001b[0m initializing deepspeed distributed: GLOBAL_RANK: 5, MEMBER: 6/16\n", + "\u001b[36m(RayTrainWorker pid=4048, ip=10.0.130.188)\u001b[0m WARNING: Missing logger folder: /tmp/ray/session_2025-10-15_15-40-01_399241_4076/artifacts/vicuna-13b-finetune/lightning_logs\n", + "\u001b[36m(RayTrainWorker pid=4048, ip=10.0.130.188)\u001b[0m Missing logger folder: /tmp/ray/session_2025-10-15_15-40-01_399241_4076/artifacts/vicuna-13b-finetune/lightning_logs\n", + "\u001b[36m(TrainController pid=17559)\u001b[0m Started training worker group of size 16: \n", + "\u001b[36m(TrainController pid=17559)\u001b[0m - (ip=10.0.171.127, pid=17770) world_rank=0, local_rank=0, node_rank=0\n", + "\u001b[36m(TrainController pid=17559)\u001b[0m - (ip=10.0.155.201, pid=4224) world_rank=1, local_rank=0, node_rank=1\n", + "\u001b[36m(TrainController pid=17559)\u001b[0m - (ip=10.0.130.65, pid=4187) world_rank=2, local_rank=0, node_rank=2\n", + "\u001b[36m(TrainController pid=17559)\u001b[0m - (ip=10.0.178.75, pid=4182) 
world_rank=3, local_rank=0, node_rank=3\n", + "\u001b[36m(TrainController pid=17559)\u001b[0m - (ip=10.0.167.159, pid=5417) world_rank=4, local_rank=0, node_rank=4\n", + "\u001b[36m(TrainController pid=17559)\u001b[0m - (ip=10.0.130.188, pid=4048) world_rank=5, local_rank=0, node_rank=5\n", + "\u001b[36m(TrainController pid=17559)\u001b[0m - (ip=10.0.134.47, pid=4191) world_rank=6, local_rank=0, node_rank=6\n", + "\u001b[36m(TrainController pid=17559)\u001b[0m - (ip=10.0.173.126, pid=4079) world_rank=7, local_rank=0, node_rank=7\n", + "\u001b[36m(TrainController pid=17559)\u001b[0m - (ip=10.0.166.0, pid=4053) world_rank=8, local_rank=0, node_rank=8\n", + "\u001b[36m(TrainController pid=17559)\u001b[0m - (ip=10.0.183.211, pid=5448) world_rank=9, local_rank=0, node_rank=9\n", + "\u001b[36m(TrainController pid=17559)\u001b[0m - (ip=10.0.138.121, pid=4069) world_rank=10, local_rank=0, node_rank=10\n", + "\u001b[36m(TrainController pid=17559)\u001b[0m - (ip=10.0.129.201, pid=5418) world_rank=11, local_rank=0, node_rank=11\n", + "\u001b[36m(TrainController pid=17559)\u001b[0m - (ip=10.0.184.103, pid=4038) world_rank=12, local_rank=0, node_rank=12\n", + "\u001b[36m(TrainController pid=17559)\u001b[0m - (ip=10.0.164.99, pid=4075) world_rank=13, local_rank=0, node_rank=13\n", + "\u001b[36m(TrainController pid=17559)\u001b[0m - (ip=10.0.136.125, pid=4040) world_rank=14, local_rank=0, node_rank=14\n", + "\u001b[36m(TrainController pid=17559)\u001b[0m - (ip=10.0.161.115, pid=4057) world_rank=15, local_rank=0, node_rank=15\n", + "\u001b[36m(TrainController pid=17559)\u001b[0m [State Transition] SCHEDULING -> RUNNING.\n", + "\u001b[36m(RayTrainWorker pid=17770)\u001b[0m INFO: GPU available: True (cuda), used: True\n", + "\u001b[36m(RayTrainWorker pid=17770)\u001b[0m GPU available: True (cuda), used: True\n", + "\u001b[36m(RayTrainWorker pid=17770)\u001b[0m INFO: TPU available: False, using: 0 TPU cores\n", + "\u001b[36m(RayTrainWorker pid=17770)\u001b[0m TPU available: False, using: 0 TPU cores\n", + "\u001b[36m(RayTrainWorker pid=17770)\u001b[0m INFO: IPU available: False, using: 0 IPUs\n", + "\u001b[36m(RayTrainWorker pid=17770)\u001b[0m IPU available: False, using: 0 IPUs\n", + "\u001b[36m(RayTrainWorker pid=17770)\u001b[0m INFO: HPU available: False, using: 0 HPUs\n", + "\u001b[36m(RayTrainWorker pid=17770)\u001b[0m HPU available: False, using: 0 HPUs\n", + "\u001b[36m(RayTrainWorker pid=17770)\u001b[0m /home/ray/anaconda3/lib/python3.10/site-packages/huggingface_hub/file_download.py:795: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. 
If you want to force a new download, use `force_download=True`.\n", + "\u001b[36m(RayTrainWorker pid=17770)\u001b[0m warnings.warn(\n", + "Downloading shards: 0%| | 0/3 [00:00 TaskPoolMapOperator[MapBatches(BatchMapper._transform_pandas)->MapBatches(BatchMapper._transform_pandas)] -> AllToAllOperator[RandomizeBlockOrder]\n", - "\u001b[2m\u001b[36m(LightningTrainer pid=134103)\u001b[0m Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)\n", - "\u001b[2m\u001b[36m(LightningTrainer pid=134103)\u001b[0m Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n" + "\u001b[36m(RayTrainWorker pid=17770)\u001b[0m Parameter Offload: Total persistent parameters: 414720 in 81 params\n", + "\u001b[36m(RayTrainWorker pid=17770)\u001b[0m INFO: \n", + "\u001b[36m(RayTrainWorker pid=17770)\u001b[0m | Name | Type | Params | Params per Device\n", + "\u001b[36m(RayTrainWorker pid=17770)\u001b[0m ---------------------------------------------------------------\n", + "\u001b[36m(RayTrainWorker pid=17770)\u001b[0m 0 | model | LlamaForCausalLM | 13.0 B | 813 M \n", + "\u001b[36m(RayTrainWorker pid=17770)\u001b[0m ---------------------------------------------------------------\n", + "\u001b[36m(RayTrainWorker pid=17770)\u001b[0m 13.0 B Trainable params\n", + "\u001b[36m(RayTrainWorker pid=17770)\u001b[0m 0 Non-trainable params\n", + "\u001b[36m(RayTrainWorker pid=17770)\u001b[0m 13.0 B Total params\n", + "\u001b[36m(RayTrainWorker pid=17770)\u001b[0m 52,063.457Total estimated model params size (MB)\n", + "\u001b[36m(RayTrainWorker pid=17770)\u001b[0m \n", + "\u001b[36m(RayTrainWorker pid=17770)\u001b[0m | Name | Type | Params | Params per Device\n", + "\u001b[36m(RayTrainWorker pid=17770)\u001b[0m ---------------------------------------------------------------\n", + "\u001b[36m(RayTrainWorker pid=17770)\u001b[0m 0 | model | LlamaForCausalLM | 13.0 B | 813 M \n", + "\u001b[36m(RayTrainWorker pid=17770)\u001b[0m ---------------------------------------------------------------\n", + "\u001b[36m(RayTrainWorker pid=17770)\u001b[0m 13.0 B Trainable params\n", + "\u001b[36m(RayTrainWorker pid=17770)\u001b[0m 0 Non-trainable params\n", + "\u001b[36m(RayTrainWorker pid=17770)\u001b[0m 13.0 B Total params\n", + "\u001b[36m(RayTrainWorker pid=17770)\u001b[0m 52,063.457Total estimated model params size (MB)\n", + "\u001b[36m(RayTrainWorker pid=5418, ip=10.0.129.201)\u001b[0m Loading extension module cpu_adam...\u001b[32m [repeated 15x across cluster]\u001b[0m\n", + "\u001b[36m(RayTrainWorker pid=5418, ip=10.0.129.201)\u001b[0m Time to load cpu_adam op: 31.185880184173584 seconds\u001b[32m [repeated 15x across cluster]\u001b[0m\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 0: : 0it [00:00, ?it/s]0)\u001b[0m \n", + "\u001b[36m(RayTrainWorker pid=5418, ip=10.0.129.201)\u001b[0m [2/4] c++ -MMD -MF cpu_adam.o.d -DTORCH_EXTENSION_NAME=cpu_adam -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\\"_gcc\\\" -DPYBIND11_STDLIB=\\\"_libstdcpp\\\" -DPYBIND11_BUILD_ABI=\\\"_cxxabi1011\\\" -I/home/ray/anaconda3/lib/python3.10/site-packages/deepspeed/ops/csrc/includes -I/usr/local/cuda/include -isystem /home/ray/anaconda3/lib/python3.10/site-packages/torch/include -isystem /home/ray/anaconda3/lib/python3.10/site-packages/torch/include/torch/csrc/api/include -isystem 
/home/ray/anaconda3/lib/python3.10/site-packages/torch/include/TH -isystem /home/ray/anaconda3/lib/python3.10/site-packages/torch/include/THC -isystem /usr/local/cuda/include -isystem /home/ray/anaconda3/include/python3.10 -D_GLIBCXX_USE_CXX11_ABI=0 -fPIC -std=c++17 -O3 -std=c++17 -g -Wno-reorder -L/usr/local/cuda/lib64 -lcudart -lcublas -g -march=native -fopenmp -D__AVX256__ -D__ENABLE_CUDA__ -DBF16_AVAILABLE -c /home/ray/anaconda3/lib/python3.10/site-packages/deepspeed/ops/csrc/adam/cpu_adam.cpp -o cpu_adam.o \u001b[32m [repeated 15x across cluster]\u001b[0m\n", + "\u001b[36m(RayTrainWorker pid=5418, ip=10.0.129.201)\u001b[0m [3/4] c++ -MMD -MF cpu_adam_impl.o.d -DTORCH_EXTENSION_NAME=cpu_adam -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\\"_gcc\\\" -DPYBIND11_STDLIB=\\\"_libstdcpp\\\" -DPYBIND11_BUILD_ABI=\\\"_cxxabi1011\\\" -I/home/ray/anaconda3/lib/python3.10/site-packages/deepspeed/ops/csrc/includes -I/usr/local/cuda/include -isystem /home/ray/anaconda3/lib/python3.10/site-packages/torch/include -isystem /home/ray/anaconda3/lib/python3.10/site-packages/torch/include/torch/csrc/api/include -isystem /home/ray/anaconda3/lib/python3.10/site-packages/torch/include/TH -isystem /home/ray/anaconda3/lib/python3.10/site-packages/torch/include/THC -isystem /usr/local/cuda/include -isystem /home/ray/anaconda3/include/python3.10 -D_GLIBCXX_USE_CXX11_ABI=0 -fPIC -std=c++17 -O3 -std=c++17 -g -Wno-reorder -L/usr/local/cuda/lib64 -lcudart -lcublas -g -march=native -fopenmp -D__AVX256__ -D__ENABLE_CUDA__ -DBF16_AVAILABLE -c /home/ray/anaconda3/lib/python3.10/site-packages/deepspeed/ops/csrc/adam/cpu_adam_impl.cpp -o cpu_adam_impl.o \u001b[32m [repeated 15x across cluster]\u001b[0m\n", + "\u001b[36m(RayTrainWorker pid=5418, ip=10.0.129.201)\u001b[0m [4/4] c++ cpu_adam.o cpu_adam_impl.o custom_cuda_kernel.cuda.o -shared -lcurand -L/home/ray/anaconda3/lib/python3.10/site-packages/torch/lib -lc10 -lc10_cuda -ltorch_cpu -ltorch_cuda -ltorch -ltorch_python -L/usr/local/cuda/lib64 -lcudart -o cpu_adam.so\u001b[32m [repeated 15x across cluster]\u001b[0m\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "da7f200767b448d7b409fcdd07daecce", + "model_id": "2a3cf444199946fa9760cd89e1e8d198", "version_major": 2, "version_minor": 0 }, "text/plain": [ - "(pid=134103) - RandomizeBlockOrder 1: 0%| | 0/1 [00:00MapBatches(tokenize) 1: 0.00 row [00:00, ? row/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "27c3f884506944d1b3825a1104412c6c", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "(pid=17972) - limit=2048 2: 0.00 row [00:00, ? row/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "029aff619c7644bcb70086a01f3c15e5", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "(pid=17972) - split(16, equal=True) 3: 0.00 row [00:00, ? 
row/s]" ] }, "metadata": {}, @@ -588,468 +753,239 @@ "name": "stderr", "output_type": "stream", "text": [ - "Downloading (…)okenizer_config.json: 100%|██████████| 727/727 [00:00<00:00, 8.86MB/s]m_pandas) pid=74329, ip=10.0.54.55)\u001b[0m \n", - "Downloading tokenizer.model: 100%|██████████| 500k/500k [00:00<00:00, 18.2MB/s]ansform_pandas) pid=74329, ip=10.0.54.55)\u001b[0m \n", - "Downloading (…)cial_tokens_map.json: 100%|██████████| 435/435 [00:00<00:00, 3.33MB/s]m_pandas) pid=74329, ip=10.0.54.55)\u001b[0m \n" + "\u001b[36m(SplitCoordinator pid=17972)\u001b[0m Registered dataset logger for dataset train_16_0\n", + "\u001b[36m(SplitCoordinator pid=17972)\u001b[0m Starting execution of Dataset train_16_0. Full logs are in /tmp/ray/session_2025-10-15_15-40-01_399241_4076/logs/ray-data\n", + "\u001b[36m(SplitCoordinator pid=17972)\u001b[0m Execution plan of Dataset train_16_0: InputDataBuffer[Input] -> TaskPoolMapOperator[MapBatches(fill_prompt)->MapBatches(tokenize)] -> LimitOperator[limit=2048] -> OutputSplitter[split(16, equal=True)]\n", + "\u001b[36m(SplitCoordinator pid=17972)\u001b[0m ⚠️ Ray's object store is configured to use only 28.0% of available memory (341.1GiB out of 1216.0GiB total). For optimal Ray Data performance, we recommend setting the object store to at least 50% of available memory. You can do this by setting the 'object_store_memory' parameter when calling ray.init() or by setting the RAY_DEFAULT_OBJECT_STORE_MEMORY_PROPORTION environment variable.\n", + "\u001b[36m(MapBatches(fill_prompt)->MapBatches(tokenize) pid=4600, ip=10.0.166.0)\u001b[0m /home/ray/anaconda3/lib/python3.10/site-packages/huggingface_hub/file_download.py:795: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n", + "\u001b[36m(MapBatches(fill_prompt)->MapBatches(tokenize) pid=4600, ip=10.0.166.0)\u001b[0m warnings.warn(\n", + "\u001b[36m(MapBatches(fill_prompt)->MapBatches(tokenize) pid=4600, ip=10.0.166.0)\u001b[0m normalizer.cc(51) LOG(INFO) precompiled_charsmap is empty. use identity normalization.\n", + "\u001b[36m(SplitCoordinator pid=17972)\u001b[0m ✔️ Dataset train_16_0 execution finished in 5.69 seconds\n", + "\u001b[36m(RayTrainWorker pid=4048, ip=10.0.130.188)\u001b[0m /home/ray/anaconda3/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::broadcast_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. 
(Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.)\n", + "\u001b[36m(RayTrainWorker pid=4048, ip=10.0.130.188)\u001b[0m return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "\u001b[2m\u001b[36m(RayTrainWorker pid=74152, ip=10.0.63.141)\u001b[0m [2023-06-30 17:39:54,612] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n" + "Epoch 0: : 1it [00:52, 52.00s/it, v_num=0, train_loss=9.190]\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Downloading (…)okenizer_config.json: 100%|██████████| 727/727 [00:00<00:00, 7.86MB/s]\n", - "Downloading (…)okenizer_config.json: 100%|██████████| 727/727 [00:00<00:00, 7.57MB/s]\n", - "\u001b[2m\u001b[36m(RayTrainWorker pid=134267)\u001b[0m GPU available: True (cuda), used: True\n", - "\u001b[2m\u001b[36m(RayTrainWorker pid=134267)\u001b[0m TPU available: False, using: 0 TPU cores\n", - "\u001b[2m\u001b[36m(RayTrainWorker pid=134267)\u001b[0m IPU available: False, using: 0 IPUs\n", - "\u001b[2m\u001b[36m(RayTrainWorker pid=134267)\u001b[0m HPU available: False, using: 0 HPUs\n", - "\u001b[2m\u001b[36m(RayTrainWorker pid=134267)\u001b[0m `Trainer(limit_val_batches=1)` was configured so 1 batch will be used.\n", - "Downloading tokenizer.model: 0%| | 0.00/500k [00:00 FINISHED.\n" + ] } ], "source": [ - "result" + "result = trainer.fit()" ] }, { @@ -1062,41 +998,6 @@ "Now, it's time to play with our fine-tuned Vicuna code generator!" ] }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Download and Process your checkpoints\n", - "\n", - "First, download the checkpoints to your local machine using the AWS CLI.\n", - "\n", - "Note that adding the following configurations can significantly increase the syncing throughput compared to the default configurations. On a g5 instance with NVME SSD, the download speed improved from `200MB/s` to around `1.5GB/s`." 
- ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "!aws configure set s3.max_concurrent_requests 32\n", - "!aws configure set default.s3.preferred_transfer_client crt\n", - "!aws configure set default.s3.target_bandwidth 100Gb/s\n", - "!aws configure set default.s3.multipart_chunksize 8MB" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "\n", - "os.system(f\"aws s3 sync s3://{result.checkpoint.path} /mnt/local_storage\")" - ] - }, { "attachments": {}, "cell_type": "markdown", @@ -1112,16 +1013,16 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Processing zero checkpoint '/mnt/local_storage/checkpoint/model/checkpoint'\n", + "Processing zero checkpoint '/mnt/cluster_storage/vicuna-13b-finetune/checkpoint_2025-10-15_16-04-29.037536/checkpoint.ckpt/checkpoint'\n", "Detected checkpoint of type zero stage 3, world_size: 16\n", - "Parsing checkpoint created by deepspeed==0.9.4\n", + "Parsing checkpoint created by deepspeed==0.12.3\n", "Reconstructed Trainable fp32 state dict with 363 params 13015864320 elements\n" ] } @@ -1136,11 +1037,7 @@ " vicuna_state_dict = {\n", " k.replace(\"_forward_module.model.\", \"\"): v for k, v in state_dict.items()\n", " }\n", - " torch.save(vicuna_state_dict, os.path.join(zero_ckpt_dir, \"full_model.pt\"))\n", - "\n", - "\n", - "full_model_ckpt_path = \"/mnt/local_storage/checkpoint.ckpt/full_model.pt\"\n", - "extract_fp32_ckpt_from_zero(\"/mnt/local_storage/checkpoint.ckpt\")" + " torch.save(vicuna_state_dict, os.path.join(zero_ckpt_dir, \"full_model.pt\"))\n" ] }, { @@ -1165,53 +1062,59 @@ "metadata": {}, "outputs": [], "source": [ + "import shutil\n", "import torch\n", "import ray\n", "import lightning.pytorch as pl\n", - "from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM\n", + "from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM, pipeline\n", "from accelerate import (\n", " init_empty_weights,\n", " infer_auto_device_map,\n", " load_checkpoint_and_dispatch,\n", ")\n", "\n", - "# Initialize a model on meta device\n", - "with init_empty_weights():\n", - " config = AutoConfig.from_pretrained(MODEL_NAME)\n", - " meta_model = AutoModelForCausalLM.from_config(config)\n", - "meta_model.tie_weights()\n", - "\n", - "# Define the device mapping\n", - "device_map = infer_auto_device_map(\n", - " meta_model,\n", - " max_memory={0: \"15GB\", \"cpu\": \"60GB\"},\n", - " no_split_module_classes=[\"LlamaDecoderLayer\"],\n", - ")\n", "\n", - "# Load the model parameters\n", - "model = load_checkpoint_and_dispatch(\n", - " meta_model,\n", - " checkpoint=full_model_ckpt_path,\n", - " device_map=device_map,\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from transformers import pipeline\n", + "def generate_sample_outputs(model_checkpoint_path, prompts):\n", + " # Initialize a model on meta device\n", + " with init_empty_weights():\n", + " config = AutoConfig.from_pretrained(MODEL_NAME)\n", + " meta_model = AutoModelForCausalLM.from_config(config)\n", + " meta_model.tie_weights()\n", "\n", - "generator = pipeline(\n", - " \"text-generation\",\n", - " model=model,\n", - " device_map=device_map,\n", - " tokenizer=AutoTokenizer.from_pretrained(\n", - " MODEL_NAME, padding_side=\"left\", 
use_fast=False\n", - " ),\n", - ")" + " # Define the device mapping\n", + " device_map = infer_auto_device_map(\n", + " meta_model,\n", + " max_memory={0: \"15GB\", \"cpu\": \"60GB\"},\n", + " no_split_module_classes=[\"LlamaDecoderLayer\"],\n", + " )\n", + "\n", + " local_checkpoint_path = \"/mnt/local_storage/vicuna_ckpt\"\n", + " shutil.copytree(model_checkpoint_path, local_checkpoint_path)\n", + "\n", + " extract_fp32_ckpt_from_zero(local_checkpoint_path)\n", + "\n", + " full_model_ckpt_path = os.path.join(local_checkpoint_path, \"full_model.pt\")\n", + "\n", + " # Load the model parameters\n", + " model = load_checkpoint_and_dispatch(\n", + " meta_model,\n", + " checkpoint=full_model_ckpt_path,\n", + " device_map=device_map,\n", + " )\n", + "\n", + " generator = pipeline(\n", + " \"text-generation\",\n", + " model=model,\n", + " device_map=device_map,\n", + " tokenizer=AutoTokenizer.from_pretrained(\n", + " MODEL_NAME, padding_side=\"left\", use_fast=False\n", + " ),\n", + " )\n", + "\n", + " for sample_prompt in prompts:\n", + " prompt = PROMPT_TEMPLATE.format(intent=sample_prompt[\"intent\"], snippet=\"\")\n", + " output = generator(prompt, max_new_tokens=30, do_sample=True)\n", + " print(output[0][\"generated_text\"])" ] }, { @@ -1226,7 +1129,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 16, "metadata": {}, "outputs": [], "source": [ @@ -1243,60 +1146,13 @@ "]" ] }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's begin by examining the generated outputs without fine-tuning. In this case study, we utilize [Aviary Explorer](https://aviary.anyscale.com), an open-source multi-LLM serving platform supported by Ray and Anyscale. You can easily select from a variety of open-source LLMs and compare their generation quality, cost, latency, and many other metrics.\n", - "\n", - "We constructed a prompt in a zero-shot learning manner and feed it into 3 OSS LLMs.\n", - "\n", - "![](https://user-images.githubusercontent.com/26745457/250704232-65a20f1b-6752-4d6c-bba1-8296a373162f.png)\n", - "\n", - "\n", - "- `vicuna-13b-v1.3` begins to speak Chinese.\n", - "- `mpt-7b-chat` generates a reasonable code snippet, but with multiple lines.\n", - "- `falcon-7b-sft` generates a one line snippet, but it doesn't seem to work.\n", - "\n", - "As we can see, none of them generate a satisfactory code snippet. \n", - "\n", - "Now let's check the performance of our fine-tuned `vicuna-13b-v1.3` model:" - ] - }, { "cell_type": "code", - "execution_count": 35, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/ray/anaconda3/lib/python3.10/site-packages/transformers/pipelines/base.py:1081: UserWarning: You seem to be using the pipelines sequentially on GPU. 
In order to maximize efficiency please use a dataset\n", - " warnings.warn(\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Intent: replace white spaces in colunm 'col' of dataframe `df` with '_'\n", - "One-line code snippet: `df['col'] = df['col'].str.replace(' ', '_')`\n", - "\n", - "Intent: search for occurrences of regex pattern '>.*<' in xml string `line`\n", - "One-line code snippet: `re.findall('>.*<', line)``\n", - "\n", - "Intent: send a signal `signal.SIGUSR1` to the current process\n", - "One-line code snippet: `os.kill(os.getpid(), signal.SIGUSR1)``\n" - ] - } - ], + "outputs": [], "source": [ - "for case in testcases:\n", - " prompt = PROMPT_TEMPLATE.format(intent=case[\"intent\"], snippet=\"\")\n", - " output = generator(prompt, max_new_tokens=30, do_sample=True)\n", - " print(output[0][\"generated_text\"])" + "generate_sample_outputs(os.path.join(result.checkpoint.path, \"checkpoint.ckpt\"), testcases)" ] }, { @@ -1311,26 +1167,9 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Before\n", - " col\n", - "0 abc def ghi\n", - "1 12 3 456\n", - "2 \n", - "After\n", - " col\n", - "0 abc_def_ghi\n", - "1 _12_3_456\n", - "2 _____\n" - ] - } - ], + "outputs": [], "source": [ "import pandas as pd\n", "\n", @@ -1343,25 +1182,9 @@ }, { "cell_type": "code", - "execution_count": 47, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['>The Great Gatsby<',\n", - " '>F. Scott Fitzgerald<',\n", - " '>1925<',\n", - " '>Sapiens: A Brief History of Humankind<',\n", - " '>Yuval Noah Harari<',\n", - " '>2011<']" - ] - }, - "execution_count": 47, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "import re\n", "\n", @@ -1398,7 +1221,8 @@ "source": [ "import os, signal\n", "\n", - "os.kill(os.getpid(), signal.SIGUSR1) # Terminate the current process~" + "# Don't actually kill the process, it's just for demo :D\n", + "# os.kill(os.getpid(), signal.SIGUSR1) # Terminate the current process~" ] }, { @@ -1412,12 +1236,16 @@ "- [HuggingFace: DeepSpeed Integration](https://huggingface.co/docs/transformers/main_classes/deepspeed#deepspeed-integration)\n", "- [HuggingFace: Handling big models for inference](https://huggingface.co/docs/accelerate/main/usage_guides/big_modeling)\n", "- [Lightning Transformers: DeepSpeed Training with Big Transformer Models](https://lightning-transformers.readthedocs.io/en/latest/)\n", - "- [Aviary: Open Source Multi-LLM Serving](https://www.anyscale.com/blog/announcing-aviary-open-source-multi-llm-serving-solution)\n", "- Rajbhandari, S., Rasley, J., et al. (2020). ZeRO: Memory Optimizations Toward Training Trillion Parameter Models. [arXiv:1910.02054](https://arxiv.org/abs/1910.02054)\n", "- Zheng, L., Chiang, W-L., Sheng, Y., et al. (2023). Judging LLM-as-a-judge with MT-Bench and Chatbot Arena. 
[arXiv:2306.05685](https://arxiv.org/abs/2306.05685)\n", "\n", "\n" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [] } ], "metadata": { @@ -1436,7 +1264,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.15" + "version": "3.10.18" }, "orphan": true }, diff --git a/release/release_tests.yaml b/release/release_tests.yaml index 51ead844ec15..32e1c81f22ef 100644 --- a/release/release_tests.yaml +++ b/release/release_tests.yaml @@ -405,7 +405,7 @@ python: "3.10" group: AIR examples working_dir: air_examples/vicuna_13b_lightning_deepspeed_finetuning - frequency: weekly + frequency: manual team: ml cluster: byod: