feat: Add get_or_infer_runner_type to support getting runner type from context

plotor · plotor · commit 79721bee84cc · 2025-07-30T10:32:51.000+08:00
Signed-off-by: plotor <zhenchao.wang@hotmail.com>
diff --git a/daft/context.py b/daft/context.py
@@ -47,6 +47,21 @@ def __init__(self, ctx: PyDaftContext | None = None):
         else:
             self._ctx = PyDaftContext()
 
+    def get_or_infer_runner_type(self) -> str:
+        """Return the type of the runner in use, inferring it when none is set.
+
+        The runner type is resolved with the first strategy that applies:
+        1. If a `runner` has already been set, its type is returned directly;
+        2. If a ray cluster is detected, the type is taken to be ray;
+        3. Otherwise it is derived from the `DAFT_RUNNER` env variable (native when unset).
+
+        :return: runner type string ("native" or "ray")
+        """
+        if self._ctx._runner is None:
+            return self._ctx.get_or_infer_runner_type()
+
+        return self._ctx._runner.name
+
     def get_or_create_runner(self) -> Runner[PartitionT]:
         return self._ctx.get_or_create_runner()
 
diff --git a/daft/daft/__init__.pyi b/daft/daft/__init__.pyi
@@ -1962,6 +1962,7 @@ class PyDaftContext:
     def __init__(self) -> None: ...
     _runner: Runner[Any]
     def get_or_create_runner(self) -> Runner[PartitionT]: ...
+    def get_or_infer_runner_type(self) -> str: ...
     _daft_execution_config: PyDaftExecutionConfig
     _daft_planning_config: PyDaftPlanningConfig
     @property
diff --git a/daft/utils.py b/daft/utils.py
@@ -126,7 +126,7 @@ def column_inputs_to_expressions(columns: ManyColumnsInputType) -> list[Expressi
     return [col(c) if isinstance(c, str) else c for c in column_iter]
 
 
-def detect_ray_state() -> bool:
+def detect_ray_state() -> tuple[bool, bool]:
     ray_is_initialized = False
     ray_is_in_job = False
     in_ray_worker = False
@@ -145,4 +145,4 @@ def detect_ray_state() -> bool:
     except ImportError:
         pass
 
-    return not in_ray_worker and (ray_is_initialized or ray_is_in_job)
+    return ray_is_initialized or ray_is_in_job, in_ray_worker
diff --git a/src/daft-context/src/lib.rs b/src/daft-context/src/lib.rs
@@ -244,6 +244,14 @@ pub fn set_runner_ray(
 ) -> DaftResult<DaftContext> {
     let ctx = get_context();
 
+    let runner_type = get_runner_type_from_env();
+    if !runner_type.is_empty() && runner_type != RayRunner::NAME {
+        log::warn!(
+            "Ignore inconsistent $DAFT_RUNNER='{}' env when setting runner as ray",
+            runner_type
+        );
+    }
+
     let runner = Runner::Ray(RayRunner::try_new(
         address,
         max_task_backlog,
@@ -268,6 +276,14 @@ pub fn set_runner_ray(
 pub fn set_runner_native(num_threads: Option<usize>) -> DaftResult<DaftContext> {
     let ctx = get_context();
 
+    let runner_type = get_runner_type_from_env();
+    if !runner_type.is_empty() && runner_type != NativeRunner::NAME {
+        log::warn!(
+            "Ignore inconsistent $DAFT_RUNNER='{}' env when setting runner as native",
+            runner_type
+        );
+    }
+
     let runner = Runner::Native(NativeRunner::try_new(num_threads)?);
     let runner = Arc::new(runner);
 
@@ -322,30 +338,45 @@ fn get_ray_runner_config_from_env() -> RunnerConfig {
 
 /// Helper function to automatically detect whether to use the ray runner.
 #[cfg(feature = "python")]
-fn detect_ray_state() -> bool {
+fn detect_ray_state() -> (bool, bool) {
     Python::with_gil(|py| {
         py.import(pyo3::intern!(py, "daft.utils"))
             .and_then(|m| m.getattr(pyo3::intern!(py, "detect_ray_state")))
             .and_then(|m| m.call0())
             .and_then(|m| m.extract())
-            .unwrap_or(false)
+            .unwrap_or((false, false))
     })
 }
 
 #[cfg(feature = "python")]
-fn get_runner_config_from_env() -> DaftResult<RunnerConfig> {
+fn get_runner_type_from_env() -> String {
     const DAFT_RUNNER: &str = "DAFT_RUNNER";
 
-    let runner_from_envvar = std::env::var(DAFT_RUNNER)
+    std::env::var(DAFT_RUNNER)
         .unwrap_or_default()
-        .to_lowercase();
-
-    match runner_from_envvar.as_str() {
-        "native" => Ok(RunnerConfig::Native { num_threads: None }),
-        "ray" => Ok(get_ray_runner_config_from_env()),
-        "py" => Err(DaftError::ValueError("The PyRunner was removed from Daft from v0.5.0 onwards. Please set the env to `DAFT_RUNNER=native` instead.".to_string())),
-        "" => Ok(if detect_ray_state() { get_ray_runner_config_from_env() } else { RunnerConfig::Native { num_threads: None }}),
-        other => Err(DaftError::ValueError(format!("Invalid runner type `DAFT_RUNNER={other}` specified through the env. Please use either `native` or `ray` instead.")))
+        .to_lowercase()
+}
+
+/// Builds the runner config from `DAFT_RUNNER`, auto-detecting ray when it is empty.
+#[cfg(feature = "python")]
+fn get_runner_config_from_env() -> DaftResult<RunnerConfig> {
+    match get_runner_type_from_env().as_str() {
+        NativeRunner::NAME => Ok(RunnerConfig::Native { num_threads: None }),
+        RayRunner::NAME => Ok(get_ray_runner_config_from_env()),
+        "py" => Err(DaftError::ValueError(
+            "The PyRunner was removed from Daft from v0.5.0 onwards. \
+            Please set the env to `DAFT_RUNNER=native` instead."
+                .to_owned(),
+        )),
+        "" => Ok(match detect_ray_state() {
+            // ray is initialized or in a job, and this process is not a ray worker
+            (true, false) => get_ray_runner_config_from_env(),
+            _ => RunnerConfig::Native { num_threads: None },
+        }),
+        other => Err(DaftError::ValueError(format!(
+            "Invalid runner type `DAFT_RUNNER={other}` specified through the env. \
+            Please use either `native` or `ray` instead."
+        ))),
     }
 }
 
@@ -366,7 +397,7 @@ pub fn reset_runner() {
 }
 
 #[cfg(feature = "python")]
-pub fn register_modules(parent: &Bound<PyModule>) -> pyo3::PyResult<()> {
+pub fn register_modules(parent: &Bound<PyModule>) -> PyResult<()> {
     parent.add_function(wrap_pyfunction!(
         python::get_runner_config_from_env,
         parent
diff --git a/src/daft-context/src/python.rs b/src/daft-context/src/python.rs
@@ -2,9 +2,10 @@ use std::sync::Arc;
 
 use common_daft_config::{PyDaftExecutionConfig, PyDaftPlanningConfig};
 use common_error::DaftError;
-use pyo3::prelude::*;
+use daft_py_runners::{NativeRunner, RayRunner};
+use pyo3::{prelude::*, IntoPyObjectExt};
 
-use crate::{DaftContext, Runner, RunnerConfig};
+use crate::{detect_ray_state, DaftContext, Runner, RunnerConfig};
 
 #[pyclass]
 pub struct PyRunnerConfig {
@@ -13,7 +14,7 @@ pub struct PyRunnerConfig {
 
 #[pyclass]
 pub struct PyDaftContext {
-    inner: crate::DaftContext,
+    inner: DaftContext,
 }
 
 impl Default for PyDaftContext {
@@ -45,6 +46,27 @@ impl PyDaftContext {
             }
         }
     }
+
+    /// Returns the name of the configured runner, inferring it when no runner is set.
+    /// Inference yields `ray` when a ray state is detected (worker or not), and otherwise
+    /// the type of the env-based runner config, whose resolution error is propagated.
+    pub fn get_or_infer_runner_type(&self, py: Python) -> PyResult<PyObject> {
+        let runner_type = if let Some(runner) = self.inner.runner() {
+            match runner.as_ref() {
+                Runner::Ray(_) => RayRunner::NAME,
+                Runner::Native(_) => NativeRunner::NAME,
+            }
+        } else if detect_ray_state().0 {
+            RayRunner::NAME
+        } else {
+            match super::get_runner_config_from_env()? {
+                RunnerConfig::Ray { .. } => RayRunner::NAME,
+                RunnerConfig::Native { .. } => NativeRunner::NAME,
+            }
+        };
+        runner_type.into_py_any(py)
+    }
+
     #[getter(_daft_execution_config)]
     pub fn get_daft_execution_config(&self, py: Python) -> PyResult<PyDaftExecutionConfig> {
         let config = py.allow_threads(|| self.inner.execution_config());
diff --git a/src/daft-py-runners/src/lib.rs b/src/daft-py-runners/src/lib.rs
@@ -22,6 +22,8 @@ pub struct RayRunner {
 
 #[cfg(feature = "python")]
 impl RayRunner {
+    pub const NAME: &'static str = "ray";
+
     pub fn try_new(
         address: Option<String>,
         max_task_backlog: Option<usize>,
@@ -53,6 +55,8 @@ pub struct NativeRunner {
 
 #[cfg(feature = "python")]
 impl NativeRunner {
+    pub const NAME: &'static str = "native";
+
     pub fn try_new(num_threads: Option<usize>) -> DaftResult<Self> {
         Python::with_gil(|py| {
             let native_runner_module = py.import(intern!(py, "daft.runners.native_runner"))?;
@@ -84,13 +88,13 @@ impl Runner {
         Python::with_gil(|py| {
             let name = obj.getattr(py, "name")?.extract::<String>(py)?;
             match name.as_ref() {
-                "ray" => {
+                RayRunner::NAME => {
                     let ray_runner = RayRunner {
                         pyobj: Arc::new(obj),
                     };
                     Ok(Self::Ray(ray_runner))
                 }
-                "native" => {
+                NativeRunner::NAME => {
                     let native_runner = NativeRunner {
                         pyobj: Arc::new(obj),
                     };
diff --git a/tests/test_context.py b/tests/test_context.py
@@ -238,3 +238,101 @@ def test_cannot_set_runner_ray_after_py():
         )
         assert result.stdout.decode().strip() in {"native"}
         assert "DaftError::InternalError Cannot set runner more than once" in result.stderr.decode().strip()
+
+
+@pytest.mark.parametrize("daft_runner_envvar", ["ray", "native"])
+def test_get_or_infer_runner_type_from_env(daft_runner_envvar):
+    """With only `DAFT_RUNNER` set, the driver and a UDF both report that runner type."""
+    script = """
+import daft
+
+print(daft.context.get_context().get_or_infer_runner_type())
+
+
+@daft.udf(return_dtype=daft.DataType.string())
+def my_udf(foo):
+    runner_type = daft.context.get_context().get_or_infer_runner_type()
+    return [f"{runner_type}_{f}" for f in foo]
+
+
+df = daft.from_pydict({"foo": [7]})
+pd = df.with_column(column_name="bar", expr=my_udf(df["foo"])).to_pydict()
+print(pd["bar"][0])
+    """
+    with with_null_env():
+        result = subprocess.run(
+            [sys.executable, "-c", script],
+            capture_output=True,
+            env={"DAFT_RUNNER": daft_runner_envvar},
+        )
+        stdout = result.stdout.decode().strip()
+        assert stdout == f"{daft_runner_envvar}\n{daft_runner_envvar}_7", result.stderr.decode()
+
+
+def test_get_or_infer_runner_type_with_set_runner_native():
+    """After `set_runner_native()`, the driver and a UDF both report `native`."""
+    script = """
+import daft
+
+daft.context.set_runner_native()
+
+print(daft.context.get_context().get_or_infer_runner_type())
+
+
+@daft.udf(return_dtype=daft.DataType.string())
+def my_udf(foo):
+    runner_type = daft.context.get_context().get_or_infer_runner_type()
+    return [f"{runner_type}_{f}" for f in foo]
+
+
+df = daft.from_pydict({"foo": [7]})
+pd = df.with_column(column_name="bar", expr=my_udf(df["foo"])).to_pydict()
+print(pd["bar"][0])
+    """
+    with with_null_env():
+        result = subprocess.run([sys.executable, "-c", script], capture_output=True)
+        assert result.stdout.decode().strip() == "native\nnative_7", result.stderr.decode()
+
+
+def test_get_or_infer_runner_type_with_set_runner_ray():
+    """After `set_runner_ray()`, the driver and a UDF both report `ray`."""
+    script = """
+import daft
+
+daft.context.set_runner_ray()
+
+print(daft.context.get_context().get_or_infer_runner_type())
+
+
+@daft.udf(return_dtype=daft.DataType.string())
+def my_udf(foo):
+    runner_type = daft.context.get_context().get_or_infer_runner_type()
+    return [f"{runner_type}_{f}" for f in foo]
+
+
+df = daft.from_pydict({"foo": [7]})
+pd = df.with_column(column_name="bar", expr=my_udf(df["foo"])).to_pydict()
+print(pd["bar"][0])
+    """
+    with with_null_env():
+        result = subprocess.run([sys.executable, "-c", script], capture_output=True)
+        assert result.stdout.decode().strip() == "ray\nray_7", result.stderr.decode()
+
+
+@pytest.mark.parametrize("daft_runner_envvar", ["ray", "native"])
+def test_get_or_infer_runner_type_with_inconsistent_settings(daft_runner_envvar):
+    """The type follows `DAFT_RUNNER` before `set_runner_ray()` and is `ray` after it."""
+    script = """
+import daft
+
+print(daft.context.get_context().get_or_infer_runner_type())
+daft.context.set_runner_ray()
+print(daft.context.get_context().get_or_infer_runner_type())
+    """
+    with with_null_env():
+        result = subprocess.run(
+            [sys.executable, "-c", script],
+            capture_output=True,
+            env={"DAFT_RUNNER": daft_runner_envvar},
+        )
+        assert result.stdout.decode().strip() == f"{daft_runner_envvar}\nray", result.stderr.decode()