[Template] Update image-search-and-classification to pass device for collate_fn (#58327)

gangsf · Gang Zhao · web-flow · commit b6e6210e7db5 · 2025-10-31T22:04:50.000Z
Signed-off-by: Gang Zhao <gang@gang-JQ62HD2C37.local>
Co-authored-by: Gang Zhao <gang@gang-JQ62HD2C37.local>
diff --git a/doc/source/ray-overview/examples/e2e-multimodal-ai-workloads/configs/generate_embeddings.yaml b/doc/source/ray-overview/examples/e2e-multimodal-ai-workloads/configs/generate_embeddings.yaml
@@ -6,7 +6,7 @@ name: image-batch-embeddings
 # like anyscale/ray:2.43.0-slim-py312-cu125, a user-provided base image (provided
 # that it meets certain specs), or you can build new images using the Anyscale
 # image builder at https://console.anyscale-staging.com/v2/container-images.
-image_uri:  anyscale/ray:2.48.0-slim-py312-cu128
+image_uri:  anyscale/ray:2.51.0-slim-py312-cu128
 # containerfile: /home/ray/default/containerfile
 
 # When empty, Anyscale will auto-select the instance types. You can also specify
diff --git a/doc/source/ray-overview/examples/e2e-multimodal-ai-workloads/configs/service.yaml b/doc/source/ray-overview/examples/e2e-multimodal-ai-workloads/configs/service.yaml
@@ -6,7 +6,7 @@ name: doggos-app
 # like anyscale/ray:2.43.0-slim-py312-cu125, a user-provided base image (provided
 # that it meets certain specs), or you can build new images using the Anyscale
 # image builder at https://console.anyscale-staging.com/v2/container-images.
-image_uri:  anyscale/ray:2.48.0-slim-py312-cu128
+image_uri:  anyscale/ray:2.51.0-slim-py312-cu128
 # containerfile: /home/ray/default/containerfile
 
 # When empty, Anyscale will auto-select the instance types. You can also specify
diff --git a/doc/source/ray-overview/examples/e2e-multimodal-ai-workloads/configs/train_model.yaml b/doc/source/ray-overview/examples/e2e-multimodal-ai-workloads/configs/train_model.yaml
@@ -6,7 +6,7 @@ name: train-image-model
 # like anyscale/ray:2.43.0-slim-py312-cu125, a user-provided base image (provided
 # that it meets certain specs), or you can build new images using the Anyscale
 # image builder at https://console.anyscale-staging.com/v2/container-images.
-image_uri:  anyscale/ray:2.48.0-slim-py312-cu128
+image_uri:  anyscale/ray:2.51.0-slim-py312-cu128
 # containerfile: /home/ray/default/containerfile
 
 # When empty, Anyscale will auto-select the instance types. You can also specify
diff --git a/doc/source/ray-overview/examples/e2e-multimodal-ai-workloads/doggos/doggos/infer.py b/doc/source/ray-overview/examples/e2e-multimodal-ai-workloads/doggos/doggos/infer.py
@@ -13,12 +13,12 @@ def __init__(self, preprocessor, model):
 
     def __call__(self, batch, device="cuda"):
         self.model.to(device)
-        batch["prediction"] = self.model.predict(collate_fn(batch))
+        batch["prediction"] = self.model.predict(collate_fn(batch, device=device))
         return batch
 
     def predict_probabilities(self, batch, device="cuda"):
         self.model.to(device)
-        predicted_probabilities = self.model.predict_probabilities(collate_fn(batch))
+        predicted_probabilities = self.model.predict_probabilities(collate_fn(batch, device=device))
         batch["probabilities"] = [
             {
                 self.preprocessor.label_to_class[i]: float(prob)
diff --git a/doc/source/ray-overview/examples/e2e-multimodal-ai-workloads/doggos/doggos/model.py b/doc/source/ray-overview/examples/e2e-multimodal-ai-workloads/doggos/doggos/model.py
@@ -16,15 +16,24 @@ def pad_array(arr, dtype=np.int32):
     return padded_arr
 
 
-def collate_fn(batch):
+def collate_fn(batch, device=None):
     dtypes = {"embedding": torch.float32, "label": torch.int64}
     tensor_batch = {}
+    
+    # If no device is provided, try to get it from Ray Train context
+    if device is None:
+        try:
+            device = get_device()
+        except RuntimeError:
+            # When not in Ray Train context, use CPU for testing/serving
+            device = "cpu"
+    
     for key in dtypes.keys():
         if key in batch:
             tensor_batch[key] = torch.as_tensor(
                 batch[key],
                 dtype=dtypes[key],
-                device=get_device(),
+                device=device,
             )
     return tensor_batch
 
diff --git a/doc/source/ray-overview/examples/e2e-multimodal-ai-workloads/doggos/doggos/serve.py b/doc/source/ray-overview/examples/e2e-multimodal-ai-workloads/doggos/doggos/serve.py
@@ -42,7 +42,7 @@ def get_probabilities(self, url):
         with torch.inference_mode():
             embedding = self.model.get_image_features(**inputs).cpu().numpy()
         outputs = self.predictor.predict_probabilities(
-            collate_fn({"embedding": embedding})
+            collate_fn({"embedding": embedding}, device=self.device)
         )
         return {"probabilities": outputs["probabilities"][0]}
 
diff --git a/doc/source/ray-overview/examples/e2e-multimodal-ai-workloads/notebooks/02-Distributed-Training.ipynb b/doc/source/ray-overview/examples/e2e-multimodal-ai-workloads/notebooks/02-Distributed-Training.ipynb
@@ -895,15 +895,24 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "def collate_fn(batch):\n",
+    "def collate_fn(batch, device=None):\n",
     "    dtypes = {\"embedding\": torch.float32, \"label\": torch.int64}\n",
     "    tensor_batch = {}\n",
+    "    \n",
+    "    # If no device is provided, try to get it from Ray Train context\n",
+    "    if device is None:\n",
+    "        try:\n",
+    "            device = get_device()\n",
+    "        except RuntimeError:\n",
+    "            # When not in Ray Train context, use CPU for testing\n",
+    "            device = \"cpu\"\n",
+    "    \n",
     "    for key in dtypes.keys():\n",
     "        if key in batch:\n",
     "            tensor_batch[key] = torch.as_tensor(\n",
     "                batch[key],\n",
     "                dtype=dtypes[key],\n",
-    "                device=get_device(),\n",
+    "                device=device,\n",
     "            )\n",
     "    return tensor_batch\n"
    ]
@@ -1047,7 +1056,7 @@
    "source": [
     "# Sample batch\n",
     "sample_batch = train_ds.take_batch(batch_size=3)\n",
-    "collate_fn(batch=sample_batch)\n"
+    "collate_fn(batch=sample_batch, device=\"cpu\")\n"
    ]
   },
   {
@@ -1527,12 +1536,12 @@
     "\n",
     "    def __call__(self, batch, device=\"cuda\"):\n",
     "        self.model.to(device)\n",
-    "        batch[\"prediction\"] = self.model.predict(collate_fn(batch))\n",
+    "        batch[\"prediction\"] = self.model.predict(collate_fn(batch, device=device))\n",
     "        return batch\n",
     "\n",
     "    def predict_probabilities(self, batch, device=\"cuda\"):\n",
     "        self.model.to(device)\n",
-    "        predicted_probabilities = self.model.predict_probabilities(collate_fn(batch))\n",
+    "        predicted_probabilities = self.model.predict_probabilities(collate_fn(batch, device=device))\n",
     "        batch[\"probabilities\"] = [\n",
     "            {\n",
     "                self.preprocessor.label_to_class[i]: float(prob)\n",
@@ -1551,7 +1560,8 @@
     "            args_fp=os.path.join(artifacts_dir, \"args.json\"), \n",
     "            state_dict_fp=os.path.join(artifacts_dir, \"model.pt\"),\n",
     "        )\n",
-    "        return cls(preprocessor=preprocessor, model=model)\n"
+    "        return cls(preprocessor=preprocessor, model=model)\n",
+    "\n"
    ]
   },
   {

Original file line number	Diff line number	Diff line change
`@@ -42,7 +42,7 @@ def get_probabilities(self, url):`
`42`	`42`	`with torch.inference_mode():`
`43`	`43`	`embedding = self.model.get_image_features(**inputs).cpu().numpy()`
`44`	`44`	`outputs = self.predictor.predict_probabilities(`
`45`		`- collate_fn({"embedding": embedding})`
	`45`	`+ collate_fn({"embedding": embedding}, device=self.device)`
`46`	`46`	`)`
`47`	`47`	`return {"probabilities": outputs["probabilities"][0]}`
`48`	`48`