diff --git a/doc/source/train/examples/lightning/vicuna_13b_lightning_deepspeed_finetune.ipynb b/doc/source/train/examples/lightning/vicuna_13b_lightning_deepspeed_finetune.ipynb index 2a2703353a68..39b96a47020e 100644 --- a/doc/source/train/examples/lightning/vicuna_13b_lightning_deepspeed_finetune.ipynb +++ b/doc/source/train/examples/lightning/vicuna_13b_lightning_deepspeed_finetune.ipynb @@ -84,6 +84,16 @@ "```" ] }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "os.environ[\"RAY_TRAIN_V2_ENABLED\"] = \"1\"" + ] + }, { "cell_type": "code", "execution_count": null, @@ -138,22 +148,16 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, "outputs": [ { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "535afe3e183b4cdfa61c39cbae788608", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/2 [00:00\n", - "
\n", - "
\n", - "

Tune Status

\n", - " \n", - "\n", - "\n", - "\n", - "\n", - "\n", - "
Current time:2023-06-30 18:21:59
Running for: 00:42:22.75
Memory: 10.7/249.1 GiB
\n", - "
\n", - "
\n", - "
\n", - "

System Info

\n", - " Using FIFO scheduling algorithm.
Logical resource usage: 241.0/304 CPUs, 16.0/16 GPUs (0.0/16.0 accelerator_type:A10G)\n", - "
\n", - " \n", - "
\n", - "
\n", - "
\n", - "

Trial Status

\n", - " \n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "
Trial name status loc iter total time (s) train_loss epoch step
LightningTrainer_c1544_00000TERMINATED10.0.55.20:134103 1 2473.94 0.523438 0 29
\n", - "
\n", - "\n", - "\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[36m(TrainController pid=17559)\u001b[0m [State Transition] INITIALIZING -> SCHEDULING.\n", + "\u001b[36m(TrainController pid=17559)\u001b[0m Attempting to start training worker group of size 16 with the following resources: [{'CPU': 15, 'GPU': 1}] * 16\n", + "\u001b[36m(RayTrainWorker pid=17770)\u001b[0m Setting up process group for: env:// [rank=0, world_size=16]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[36m(RayTrainWorker pid=17770)\u001b[0m [2025-10-15 15:51:07,627] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[36m(RayTrainWorker pid=4048, ip=10.0.130.188)\u001b[0m 2025-10-15 15:51:09.458702: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n", + "\u001b[36m(RayTrainWorker pid=4048, ip=10.0.130.188)\u001b[0m 2025-10-15 15:51:09.458741: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n", + "\u001b[36m(RayTrainWorker pid=4048, ip=10.0.130.188)\u001b[0m 2025-10-15 15:51:09.460080: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n", + "\u001b[36m(RayTrainWorker pid=4048, ip=10.0.130.188)\u001b[0m 2025-10-15 15:51:09.467398: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n", + "\u001b[36m(RayTrainWorker pid=4048, ip=10.0.130.188)\u001b[0m To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", + "\u001b[36m(RayTrainWorker pid=4048, ip=10.0.130.188)\u001b[0m 2025-10-15 15:51:10.359839: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n", + "\u001b[36m(RayTrainWorker pid=4048, ip=10.0.130.188)\u001b[0m INFO: initializing deepspeed distributed: GLOBAL_RANK: 5, MEMBER: 6/16\n", + "\u001b[36m(RayTrainWorker pid=4048, ip=10.0.130.188)\u001b[0m initializing deepspeed distributed: GLOBAL_RANK: 5, MEMBER: 6/16\n", + "\u001b[36m(RayTrainWorker pid=4048, ip=10.0.130.188)\u001b[0m WARNING: Missing logger folder: /tmp/ray/session_2025-10-15_15-40-01_399241_4076/artifacts/vicuna-13b-finetune/lightning_logs\n", + "\u001b[36m(RayTrainWorker pid=4048, ip=10.0.130.188)\u001b[0m Missing logger folder: /tmp/ray/session_2025-10-15_15-40-01_399241_4076/artifacts/vicuna-13b-finetune/lightning_logs\n", + "\u001b[36m(TrainController pid=17559)\u001b[0m Started training worker group of size 16: \n", + "\u001b[36m(TrainController pid=17559)\u001b[0m - (ip=10.0.171.127, pid=17770) world_rank=0, local_rank=0, node_rank=0\n", + "\u001b[36m(TrainController pid=17559)\u001b[0m - (ip=10.0.155.201, pid=4224) world_rank=1, local_rank=0, node_rank=1\n", + "\u001b[36m(TrainController pid=17559)\u001b[0m - (ip=10.0.130.65, pid=4187) world_rank=2, local_rank=0, node_rank=2\n", + "\u001b[36m(TrainController pid=17559)\u001b[0m - (ip=10.0.178.75, pid=4182) 
world_rank=3, local_rank=0, node_rank=3\n", + "\u001b[36m(TrainController pid=17559)\u001b[0m - (ip=10.0.167.159, pid=5417) world_rank=4, local_rank=0, node_rank=4\n", + "\u001b[36m(TrainController pid=17559)\u001b[0m - (ip=10.0.130.188, pid=4048) world_rank=5, local_rank=0, node_rank=5\n", + "\u001b[36m(TrainController pid=17559)\u001b[0m - (ip=10.0.134.47, pid=4191) world_rank=6, local_rank=0, node_rank=6\n", + "\u001b[36m(TrainController pid=17559)\u001b[0m - (ip=10.0.173.126, pid=4079) world_rank=7, local_rank=0, node_rank=7\n", + "\u001b[36m(TrainController pid=17559)\u001b[0m - (ip=10.0.166.0, pid=4053) world_rank=8, local_rank=0, node_rank=8\n", + "\u001b[36m(TrainController pid=17559)\u001b[0m - (ip=10.0.183.211, pid=5448) world_rank=9, local_rank=0, node_rank=9\n", + "\u001b[36m(TrainController pid=17559)\u001b[0m - (ip=10.0.138.121, pid=4069) world_rank=10, local_rank=0, node_rank=10\n", + "\u001b[36m(TrainController pid=17559)\u001b[0m - (ip=10.0.129.201, pid=5418) world_rank=11, local_rank=0, node_rank=11\n", + "\u001b[36m(TrainController pid=17559)\u001b[0m - (ip=10.0.184.103, pid=4038) world_rank=12, local_rank=0, node_rank=12\n", + "\u001b[36m(TrainController pid=17559)\u001b[0m - (ip=10.0.164.99, pid=4075) world_rank=13, local_rank=0, node_rank=13\n", + "\u001b[36m(TrainController pid=17559)\u001b[0m - (ip=10.0.136.125, pid=4040) world_rank=14, local_rank=0, node_rank=14\n", + "\u001b[36m(TrainController pid=17559)\u001b[0m - (ip=10.0.161.115, pid=4057) world_rank=15, local_rank=0, node_rank=15\n", + "\u001b[36m(TrainController pid=17559)\u001b[0m [State Transition] SCHEDULING -> RUNNING.\n", + "\u001b[36m(RayTrainWorker pid=17770)\u001b[0m INFO: GPU available: True (cuda), used: True\n", + "\u001b[36m(RayTrainWorker pid=17770)\u001b[0m GPU available: True (cuda), used: True\n", + "\u001b[36m(RayTrainWorker pid=17770)\u001b[0m INFO: TPU available: False, using: 0 TPU cores\n", + "\u001b[36m(RayTrainWorker pid=17770)\u001b[0m TPU available: False, using: 0 TPU cores\n", + "\u001b[36m(RayTrainWorker pid=17770)\u001b[0m INFO: IPU available: False, using: 0 IPUs\n", + "\u001b[36m(RayTrainWorker pid=17770)\u001b[0m IPU available: False, using: 0 IPUs\n", + "\u001b[36m(RayTrainWorker pid=17770)\u001b[0m INFO: HPU available: False, using: 0 HPUs\n", + "\u001b[36m(RayTrainWorker pid=17770)\u001b[0m HPU available: False, using: 0 HPUs\n", + "\u001b[36m(RayTrainWorker pid=17770)\u001b[0m /home/ray/anaconda3/lib/python3.10/site-packages/huggingface_hub/file_download.py:795: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. 
If you want to force a new download, use `force_download=True`.\n", + "\u001b[36m(RayTrainWorker pid=17770)\u001b[0m warnings.warn(\n", + "Downloading shards: 0%| | 0/3 [00:00 TaskPoolMapOperator[MapBatches(BatchMapper._transform_pandas)->MapBatches(BatchMapper._transform_pandas)] -> AllToAllOperator[RandomizeBlockOrder]\n", - "\u001b[2m\u001b[36m(LightningTrainer pid=134103)\u001b[0m Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)\n", - "\u001b[2m\u001b[36m(LightningTrainer pid=134103)\u001b[0m Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n" + "\u001b[36m(RayTrainWorker pid=17770)\u001b[0m Parameter Offload: Total persistent parameters: 414720 in 81 params\n", + "\u001b[36m(RayTrainWorker pid=17770)\u001b[0m INFO: \n", + "\u001b[36m(RayTrainWorker pid=17770)\u001b[0m | Name | Type | Params | Params per Device\n", + "\u001b[36m(RayTrainWorker pid=17770)\u001b[0m ---------------------------------------------------------------\n", + "\u001b[36m(RayTrainWorker pid=17770)\u001b[0m 0 | model | LlamaForCausalLM | 13.0 B | 813 M \n", + "\u001b[36m(RayTrainWorker pid=17770)\u001b[0m ---------------------------------------------------------------\n", + "\u001b[36m(RayTrainWorker pid=17770)\u001b[0m 13.0 B Trainable params\n", + "\u001b[36m(RayTrainWorker pid=17770)\u001b[0m 0 Non-trainable params\n", + "\u001b[36m(RayTrainWorker pid=17770)\u001b[0m 13.0 B Total params\n", + "\u001b[36m(RayTrainWorker pid=17770)\u001b[0m 52,063.457Total estimated model params size (MB)\n", + "\u001b[36m(RayTrainWorker pid=17770)\u001b[0m \n", + "\u001b[36m(RayTrainWorker pid=17770)\u001b[0m | Name | Type | Params | Params per Device\n", + "\u001b[36m(RayTrainWorker pid=17770)\u001b[0m ---------------------------------------------------------------\n", + "\u001b[36m(RayTrainWorker pid=17770)\u001b[0m 0 | model | LlamaForCausalLM | 13.0 B | 813 M \n", + "\u001b[36m(RayTrainWorker pid=17770)\u001b[0m ---------------------------------------------------------------\n", + "\u001b[36m(RayTrainWorker pid=17770)\u001b[0m 13.0 B Trainable params\n", + "\u001b[36m(RayTrainWorker pid=17770)\u001b[0m 0 Non-trainable params\n", + "\u001b[36m(RayTrainWorker pid=17770)\u001b[0m 13.0 B Total params\n", + "\u001b[36m(RayTrainWorker pid=17770)\u001b[0m 52,063.457Total estimated model params size (MB)\n", + "\u001b[36m(RayTrainWorker pid=5418, ip=10.0.129.201)\u001b[0m Loading extension module cpu_adam...\u001b[32m [repeated 15x across cluster]\u001b[0m\n", + "\u001b[36m(RayTrainWorker pid=5418, ip=10.0.129.201)\u001b[0m Time to load cpu_adam op: 31.185880184173584 seconds\u001b[32m [repeated 15x across cluster]\u001b[0m\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 0: : 0it [00:00, ?it/s]0)\u001b[0m \n", + "\u001b[36m(RayTrainWorker pid=5418, ip=10.0.129.201)\u001b[0m [2/4] c++ -MMD -MF cpu_adam.o.d -DTORCH_EXTENSION_NAME=cpu_adam -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\\"_gcc\\\" -DPYBIND11_STDLIB=\\\"_libstdcpp\\\" -DPYBIND11_BUILD_ABI=\\\"_cxxabi1011\\\" -I/home/ray/anaconda3/lib/python3.10/site-packages/deepspeed/ops/csrc/includes -I/usr/local/cuda/include -isystem /home/ray/anaconda3/lib/python3.10/site-packages/torch/include -isystem /home/ray/anaconda3/lib/python3.10/site-packages/torch/include/torch/csrc/api/include -isystem 
/home/ray/anaconda3/lib/python3.10/site-packages/torch/include/TH -isystem /home/ray/anaconda3/lib/python3.10/site-packages/torch/include/THC -isystem /usr/local/cuda/include -isystem /home/ray/anaconda3/include/python3.10 -D_GLIBCXX_USE_CXX11_ABI=0 -fPIC -std=c++17 -O3 -std=c++17 -g -Wno-reorder -L/usr/local/cuda/lib64 -lcudart -lcublas -g -march=native -fopenmp -D__AVX256__ -D__ENABLE_CUDA__ -DBF16_AVAILABLE -c /home/ray/anaconda3/lib/python3.10/site-packages/deepspeed/ops/csrc/adam/cpu_adam.cpp -o cpu_adam.o \u001b[32m [repeated 15x across cluster]\u001b[0m\n", + "\u001b[36m(RayTrainWorker pid=5418, ip=10.0.129.201)\u001b[0m [3/4] c++ -MMD -MF cpu_adam_impl.o.d -DTORCH_EXTENSION_NAME=cpu_adam -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\\"_gcc\\\" -DPYBIND11_STDLIB=\\\"_libstdcpp\\\" -DPYBIND11_BUILD_ABI=\\\"_cxxabi1011\\\" -I/home/ray/anaconda3/lib/python3.10/site-packages/deepspeed/ops/csrc/includes -I/usr/local/cuda/include -isystem /home/ray/anaconda3/lib/python3.10/site-packages/torch/include -isystem /home/ray/anaconda3/lib/python3.10/site-packages/torch/include/torch/csrc/api/include -isystem /home/ray/anaconda3/lib/python3.10/site-packages/torch/include/TH -isystem /home/ray/anaconda3/lib/python3.10/site-packages/torch/include/THC -isystem /usr/local/cuda/include -isystem /home/ray/anaconda3/include/python3.10 -D_GLIBCXX_USE_CXX11_ABI=0 -fPIC -std=c++17 -O3 -std=c++17 -g -Wno-reorder -L/usr/local/cuda/lib64 -lcudart -lcublas -g -march=native -fopenmp -D__AVX256__ -D__ENABLE_CUDA__ -DBF16_AVAILABLE -c /home/ray/anaconda3/lib/python3.10/site-packages/deepspeed/ops/csrc/adam/cpu_adam_impl.cpp -o cpu_adam_impl.o \u001b[32m [repeated 15x across cluster]\u001b[0m\n", + "\u001b[36m(RayTrainWorker pid=5418, ip=10.0.129.201)\u001b[0m [4/4] c++ cpu_adam.o cpu_adam_impl.o custom_cuda_kernel.cuda.o -shared -lcurand -L/home/ray/anaconda3/lib/python3.10/site-packages/torch/lib -lc10 -lc10_cuda -ltorch_cpu -ltorch_cuda -ltorch -ltorch_python -L/usr/local/cuda/lib64 -lcudart -o cpu_adam.so\u001b[32m [repeated 15x across cluster]\u001b[0m\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "da7f200767b448d7b409fcdd07daecce", + "model_id": "2a3cf444199946fa9760cd89e1e8d198", "version_major": 2, "version_minor": 0 }, "text/plain": [ - "(pid=134103) - RandomizeBlockOrder 1: 0%| | 0/1 [00:00MapBatches(tokenize) 1: 0.00 row [00:00, ? row/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "27c3f884506944d1b3825a1104412c6c", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "(pid=17972) - limit=2048 2: 0.00 row [00:00, ? row/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "029aff619c7644bcb70086a01f3c15e5", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "(pid=17972) - split(16, equal=True) 3: 0.00 row [00:00, ? 
row/s]" ] }, "metadata": {}, @@ -588,468 +753,239 @@ "name": "stderr", "output_type": "stream", "text": [ - "Downloading (…)okenizer_config.json: 100%|██████████| 727/727 [00:00<00:00, 8.86MB/s]m_pandas) pid=74329, ip=10.0.54.55)\u001b[0m \n", - "Downloading tokenizer.model: 100%|██████████| 500k/500k [00:00<00:00, 18.2MB/s]ansform_pandas) pid=74329, ip=10.0.54.55)\u001b[0m \n", - "Downloading (…)cial_tokens_map.json: 100%|██████████| 435/435 [00:00<00:00, 3.33MB/s]m_pandas) pid=74329, ip=10.0.54.55)\u001b[0m \n" + "\u001b[36m(SplitCoordinator pid=17972)\u001b[0m Registered dataset logger for dataset train_16_0\n", + "\u001b[36m(SplitCoordinator pid=17972)\u001b[0m Starting execution of Dataset train_16_0. Full logs are in /tmp/ray/session_2025-10-15_15-40-01_399241_4076/logs/ray-data\n", + "\u001b[36m(SplitCoordinator pid=17972)\u001b[0m Execution plan of Dataset train_16_0: InputDataBuffer[Input] -> TaskPoolMapOperator[MapBatches(fill_prompt)->MapBatches(tokenize)] -> LimitOperator[limit=2048] -> OutputSplitter[split(16, equal=True)]\n", + "\u001b[36m(SplitCoordinator pid=17972)\u001b[0m ⚠️ Ray's object store is configured to use only 28.0% of available memory (341.1GiB out of 1216.0GiB total). For optimal Ray Data performance, we recommend setting the object store to at least 50% of available memory. You can do this by setting the 'object_store_memory' parameter when calling ray.init() or by setting the RAY_DEFAULT_OBJECT_STORE_MEMORY_PROPORTION environment variable.\n", + "\u001b[36m(MapBatches(fill_prompt)->MapBatches(tokenize) pid=4600, ip=10.0.166.0)\u001b[0m /home/ray/anaconda3/lib/python3.10/site-packages/huggingface_hub/file_download.py:795: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n", + "\u001b[36m(MapBatches(fill_prompt)->MapBatches(tokenize) pid=4600, ip=10.0.166.0)\u001b[0m warnings.warn(\n", + "\u001b[36m(MapBatches(fill_prompt)->MapBatches(tokenize) pid=4600, ip=10.0.166.0)\u001b[0m normalizer.cc(51) LOG(INFO) precompiled_charsmap is empty. use identity normalization.\n", + "\u001b[36m(SplitCoordinator pid=17972)\u001b[0m ✔️ Dataset train_16_0 execution finished in 5.69 seconds\n", + "\u001b[36m(RayTrainWorker pid=4048, ip=10.0.130.188)\u001b[0m /home/ray/anaconda3/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::broadcast_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. 
(Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.)\n", + "\u001b[36m(RayTrainWorker pid=4048, ip=10.0.130.188)\u001b[0m return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "\u001b[2m\u001b[36m(RayTrainWorker pid=74152, ip=10.0.63.141)\u001b[0m [2023-06-30 17:39:54,612] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n" + "Epoch 0: : 1it [00:52, 52.00s/it, v_num=0, train_loss=9.190]\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Downloading (…)okenizer_config.json: 100%|██████████| 727/727 [00:00<00:00, 7.86MB/s]\n", - "Downloading (…)okenizer_config.json: 100%|██████████| 727/727 [00:00<00:00, 7.57MB/s]\n", - "\u001b[2m\u001b[36m(RayTrainWorker pid=134267)\u001b[0m GPU available: True (cuda), used: True\n", - "\u001b[2m\u001b[36m(RayTrainWorker pid=134267)\u001b[0m TPU available: False, using: 0 TPU cores\n", - "\u001b[2m\u001b[36m(RayTrainWorker pid=134267)\u001b[0m IPU available: False, using: 0 IPUs\n", - "\u001b[2m\u001b[36m(RayTrainWorker pid=134267)\u001b[0m HPU available: False, using: 0 HPUs\n", - "\u001b[2m\u001b[36m(RayTrainWorker pid=134267)\u001b[0m `Trainer(limit_val_batches=1)` was configured so 1 batch will be used.\n", - "Downloading tokenizer.model: 0%| | 0.00/500k [00:00 FINISHED.\n" + ] } ], "source": [ - "result" + "result = trainer.fit()" ] }, { @@ -1062,41 +998,6 @@ "Now, it's time to play with our fine-tuned Vicuna code generator!" ] }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Download and Process your checkpoints\n", - "\n", - "First, download the checkpoints to your local machine using the AWS CLI.\n", - "\n", - "Note that adding the following configurations can significantly increase the syncing throughput compared to the default configurations. On a g5 instance with NVME SSD, the download speed improved from `200MB/s` to around `1.5GB/s`." 
- ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "!aws configure set s3.max_concurrent_requests 32\n", - "!aws configure set default.s3.preferred_transfer_client crt\n", - "!aws configure set default.s3.target_bandwidth 100Gb/s\n", - "!aws configure set default.s3.multipart_chunksize 8MB" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "\n", - "os.system(f\"aws s3 sync s3://{result.checkpoint.path} /mnt/local_storage\")" - ] - }, { "attachments": {}, "cell_type": "markdown", @@ -1112,16 +1013,16 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Processing zero checkpoint '/mnt/local_storage/checkpoint/model/checkpoint'\n", + "Processing zero checkpoint '/mnt/cluster_storage/vicuna-13b-finetune/checkpoint_2025-10-15_16-04-29.037536/checkpoint.ckpt/checkpoint'\n", "Detected checkpoint of type zero stage 3, world_size: 16\n", - "Parsing checkpoint created by deepspeed==0.9.4\n", + "Parsing checkpoint created by deepspeed==0.12.3\n", "Reconstructed Trainable fp32 state dict with 363 params 13015864320 elements\n" ] } @@ -1136,11 +1037,7 @@ " vicuna_state_dict = {\n", " k.replace(\"_forward_module.model.\", \"\"): v for k, v in state_dict.items()\n", " }\n", - " torch.save(vicuna_state_dict, os.path.join(zero_ckpt_dir, \"full_model.pt\"))\n", - "\n", - "\n", - "full_model_ckpt_path = \"/mnt/local_storage/checkpoint.ckpt/full_model.pt\"\n", - "extract_fp32_ckpt_from_zero(\"/mnt/local_storage/checkpoint.ckpt\")" + " torch.save(vicuna_state_dict, os.path.join(zero_ckpt_dir, \"full_model.pt\"))\n" ] }, { @@ -1165,53 +1062,59 @@ "metadata": {}, "outputs": [], "source": [ + "import shutil\n", "import torch\n", "import ray\n", "import lightning.pytorch as pl\n", - "from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM\n", + "from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM, pipeline\n", "from accelerate import (\n", " init_empty_weights,\n", " infer_auto_device_map,\n", " load_checkpoint_and_dispatch,\n", ")\n", "\n", - "# Initialize a model on meta device\n", - "with init_empty_weights():\n", - " config = AutoConfig.from_pretrained(MODEL_NAME)\n", - " meta_model = AutoModelForCausalLM.from_config(config)\n", - "meta_model.tie_weights()\n", - "\n", - "# Define the device mapping\n", - "device_map = infer_auto_device_map(\n", - " meta_model,\n", - " max_memory={0: \"15GB\", \"cpu\": \"60GB\"},\n", - " no_split_module_classes=[\"LlamaDecoderLayer\"],\n", - ")\n", "\n", - "# Load the model parameters\n", - "model = load_checkpoint_and_dispatch(\n", - " meta_model,\n", - " checkpoint=full_model_ckpt_path,\n", - " device_map=device_map,\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from transformers import pipeline\n", + "def generate_sample_outputs(model_checkpoint_path, prompts):\n", + " # Initialize a model on meta device\n", + " with init_empty_weights():\n", + " config = AutoConfig.from_pretrained(MODEL_NAME)\n", + " meta_model = AutoModelForCausalLM.from_config(config)\n", + " meta_model.tie_weights()\n", "\n", - "generator = pipeline(\n", - " \"text-generation\",\n", - " model=model,\n", - " device_map=device_map,\n", - " tokenizer=AutoTokenizer.from_pretrained(\n", - " MODEL_NAME, padding_side=\"left\", 
use_fast=False\n", - " ),\n", - ")" + " # Define the device mapping\n", + " device_map = infer_auto_device_map(\n", + " meta_model,\n", + " max_memory={0: \"15GB\", \"cpu\": \"60GB\"},\n", + " no_split_module_classes=[\"LlamaDecoderLayer\"],\n", + " )\n", + "\n", + " local_checkpoint_path = \"/mnt/local_storage/vicuna_ckpt\"\n", + " shutil.copytree(model_checkpoint_path, local_checkpoint_path)\n", + "\n", + " extract_fp32_ckpt_from_zero(local_checkpoint_path)\n", + "\n", + " full_model_ckpt_path = os.path.join(local_checkpoint_path, \"full_model.pt\")\n", + "\n", + " # Load the model parameters\n", + " model = load_checkpoint_and_dispatch(\n", + " meta_model,\n", + " checkpoint=full_model_ckpt_path,\n", + " device_map=device_map,\n", + " )\n", + "\n", + " generator = pipeline(\n", + " \"text-generation\",\n", + " model=model,\n", + " device_map=device_map,\n", + " tokenizer=AutoTokenizer.from_pretrained(\n", + " MODEL_NAME, padding_side=\"left\", use_fast=False\n", + " ),\n", + " )\n", + "\n", + " for sample_prompt in prompts:\n", + " prompt = PROMPT_TEMPLATE.format(intent=sample_prompt[\"intent\"], snippet=\"\")\n", + " output = generator(prompt, max_new_tokens=30, do_sample=True)\n", + " print(output[0][\"generated_text\"])" ] }, { @@ -1226,7 +1129,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 16, "metadata": {}, "outputs": [], "source": [ @@ -1243,60 +1146,13 @@ "]" ] }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's begin by examining the generated outputs without fine-tuning. In this case study, we utilize [Aviary Explorer](https://aviary.anyscale.com), an open-source multi-LLM serving platform supported by Ray and Anyscale. You can easily select from a variety of open-source LLMs and compare their generation quality, cost, latency, and many other metrics.\n", - "\n", - "We constructed a prompt in a zero-shot learning manner and feed it into 3 OSS LLMs.\n", - "\n", - "![](https://user-images.githubusercontent.com/26745457/250704232-65a20f1b-6752-4d6c-bba1-8296a373162f.png)\n", - "\n", - "\n", - "- `vicuna-13b-v1.3` begins to speak Chinese.\n", - "- `mpt-7b-chat` generates a reasonable code snippet, but with multiple lines.\n", - "- `falcon-7b-sft` generates a one line snippet, but it doesn't seem to work.\n", - "\n", - "As we can see, none of them generate a satisfactory code snippet. \n", - "\n", - "Now let's check the performance of our fine-tuned `vicuna-13b-v1.3` model:" - ] - }, { "cell_type": "code", - "execution_count": 35, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/ray/anaconda3/lib/python3.10/site-packages/transformers/pipelines/base.py:1081: UserWarning: You seem to be using the pipelines sequentially on GPU. 
In order to maximize efficiency please use a dataset\n", - " warnings.warn(\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Intent: replace white spaces in colunm 'col' of dataframe `df` with '_'\n", - "One-line code snippet: `df['col'] = df['col'].str.replace(' ', '_')`\n", - "\n", - "Intent: search for occurrences of regex pattern '>.*<' in xml string `line`\n", - "One-line code snippet: `re.findall('>.*<', line)``\n", - "\n", - "Intent: send a signal `signal.SIGUSR1` to the current process\n", - "One-line code snippet: `os.kill(os.getpid(), signal.SIGUSR1)``\n" - ] - } - ], + "outputs": [], "source": [ - "for case in testcases:\n", - " prompt = PROMPT_TEMPLATE.format(intent=case[\"intent\"], snippet=\"\")\n", - " output = generator(prompt, max_new_tokens=30, do_sample=True)\n", - " print(output[0][\"generated_text\"])" + "generate_sample_outputs(os.path.join(result.checkpoint.path, \"checkpoint.ckpt\"), testcases)" ] }, { @@ -1311,26 +1167,9 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Before\n", - " col\n", - "0 abc def ghi\n", - "1 12 3 456\n", - "2 \n", - "After\n", - " col\n", - "0 abc_def_ghi\n", - "1 _12_3_456\n", - "2 _____\n" - ] - } - ], + "outputs": [], "source": [ "import pandas as pd\n", "\n", @@ -1343,25 +1182,9 @@ }, { "cell_type": "code", - "execution_count": 47, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['>The Great Gatsby<',\n", - " '>F. Scott Fitzgerald<',\n", - " '>1925<',\n", - " '>Sapiens: A Brief History of Humankind<',\n", - " '>Yuval Noah Harari<',\n", - " '>2011<']" - ] - }, - "execution_count": 47, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "import re\n", "\n", @@ -1398,7 +1221,8 @@ "source": [ "import os, signal\n", "\n", - "os.kill(os.getpid(), signal.SIGUSR1) # Terminate the current process~" + "# Don't actually kill the process, it's just for demo :D\n", + "# os.kill(os.getpid(), signal.SIGUSR1) # Terminate the current process~" ] }, { @@ -1412,12 +1236,16 @@ "- [HuggingFace: DeepSpeed Integration](https://huggingface.co/docs/transformers/main_classes/deepspeed#deepspeed-integration)\n", "- [HuggingFace: Handling big models for inference](https://huggingface.co/docs/accelerate/main/usage_guides/big_modeling)\n", "- [Lightning Transformers: DeepSpeed Training with Big Transformer Models](https://lightning-transformers.readthedocs.io/en/latest/)\n", - "- [Aviary: Open Source Multi-LLM Serving](https://www.anyscale.com/blog/announcing-aviary-open-source-multi-llm-serving-solution)\n", "- Rajbhandari, S., Rasley, J., et al. (2020). ZeRO: Memory Optimizations Toward Training Trillion Parameter Models. [arXiv:1910.02054](https://arxiv.org/abs/1910.02054)\n", "- Zheng, L., Chiang, W-L., Sheng, Y., et al. (2023). Judging LLM-as-a-judge with MT-Bench and Chatbot Arena. 
[arXiv:2306.05685](https://arxiv.org/abs/2306.05685)\n", "\n", "\n" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [] } ], "metadata": { @@ -1436,7 +1264,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.15" + "version": "3.10.18" }, "orphan": true }, diff --git a/release/release_tests.yaml b/release/release_tests.yaml index 51ead844ec15..32e1c81f22ef 100644 --- a/release/release_tests.yaml +++ b/release/release_tests.yaml @@ -405,7 +405,7 @@ python: "3.10" group: AIR examples working_dir: air_examples/vicuna_13b_lightning_deepspeed_finetuning - frequency: weekly + frequency: manual team: ml cluster: byod: