Allow CustomTrainer to run a Python script directly

jskswamy · jskswamy · commit e96e53a93d4a · 2025-08-07T15:19:51.000+05:30
CustomTrainer now supports a python_file argument (with optional python_args).
If set, the job will run the specified script as the main
process (python myscript.py) instead of requiring a function.

This is mutually exclusive with func. This change makes it easier
to migrate script-based workflows and matches user expectations
for direct script execution.

Existing function-based usage is unchanged.

Validation is added in get_trainer_crd_from_custom_trainer to ensure exactly one of func or python_file is set.

Signed-off-by: Krishnaswamy Subramanian <subramk@thoughtworks.com>
diff --git a/python/kubeflow/trainer/types/types.py b/python/kubeflow/trainer/types/types.py
@@ -16,7 +16,7 @@
 from dataclasses import dataclass, field
 from datetime import datetime
 from enum import Enum
-from typing import Callable, Dict, Optional
+from typing import Callable, Dict, List, Optional
 
 from kubeflow.trainer.constants import constants
 
@@ -25,10 +25,13 @@
 @dataclass
 class CustomTrainer:
     """Custom Trainer configuration. Configure the self-contained function
-        that encapsulates the entire model training process.
+        that encapsulates the entire model training process, or run a Python script directly.
 
     Args:
         func (`Callable`): The function that encapsulates the entire model training process.
+        python_file (`Optional[str]`): Path to a Python script to run directly (e.g., 'train.py').
+            Mutually exclusive with `func`; exactly one of `func` or `python_file` must be set.
+        python_args (`Optional[List[str]]`): Arguments to pass to the Python script.
         func_args (`Optional[Dict]`): The arguments to pass to the function.
         packages_to_install (`Optional[List[str]]`):
             A list of Python packages to install before running the function.
@@ -38,7 +41,9 @@ class CustomTrainer:
         env (`Optional[Dict[str, str]]`): The environment variables to set in the training nodes.
     """
 
-    func: Callable
+    func: Optional[Callable] = None
+    python_file: Optional[str] = None
+    python_args: Optional[List[str]] = None
     func_args: Optional[Dict] = None
     packages_to_install: Optional[list[str]] = None
     pip_index_url: str = constants.DEFAULT_PIP_INDEX_URL
diff --git a/python/kubeflow/trainer/types/types_test.py b/python/kubeflow/trainer/types/types_test.py
@@ -0,0 +1,74 @@
+from kubeflow.trainer.types import types
+
+
+class TestTrainerConfigurations:
+    """Test cases for trainer configurations and types."""
+
+    def test_centralized_trainer_configs(self):
+        """Test that every Framework member has a matching entry in TRAINER_CONFIGS."""
+        # Each entry must exist and report the same framework it is keyed by
+        for framework in types.Framework:
+            assert framework in types.TRAINER_CONFIGS
+            trainer = types.TRAINER_CONFIGS[framework]
+            assert trainer.framework == framework
+
+    def test_default_trainer_uses_centralized_config(self):
+        """Test that DEFAULT_TRAINER is the TORCH entry of TRAINER_CONFIGS."""
+        assert types.DEFAULT_TRAINER == types.TRAINER_CONFIGS[types.Framework.TORCH]
+        assert types.DEFAULT_TRAINER.framework == types.Framework.TORCH
+
+    def test_custom_trainer_python_file_with_args(self):
+        """Test that CustomTrainer stores python_file and python_args exactly as given."""
+        # python_file alone: python_args keeps its None default
+        trainer = types.CustomTrainer(python_file="train.py")
+        assert trainer.python_file == "train.py"
+        assert trainer.python_args is None
+
+        # python_file with a short argument list
+        trainer = types.CustomTrainer(
+            python_file="train.py",
+            python_args=["--epochs", "100", "--batch-size", "32"]
+        )
+        assert trainer.python_file == "train.py"
+        assert trainer.python_args == ["--epochs", "100", "--batch-size", "32"]
+
+        # python_file with a longer argument list, including a path-valued argument
+        trainer = types.CustomTrainer(
+            python_file="train.py",
+            python_args=["--epochs", "100", "--batch-size", "32", "--lr", "0.001", "--model-path", "/workspace/model"]
+        )
+        assert trainer.python_file == "train.py"
+        assert trainer.python_args == ["--epochs", "100", "--batch-size", "32", "--lr", "0.001", "--model-path", "/workspace/model"]
+
+    def test_custom_trainer_mutual_exclusivity(self):
+        """Test that func and python_file each work alone (setting both is not exercised here)."""
+        # python_file alone: func keeps its None default
+        trainer = types.CustomTrainer(python_file="train.py")
+        assert trainer.func is None
+        assert trainer.python_file == "train.py"
+
+        # func alone: python_file keeps its None default
+        def dummy_func():
+            pass
+        trainer = types.CustomTrainer(func=dummy_func)
+        assert trainer.func == dummy_func
+        assert trainer.python_file is None
+
+    def test_custom_trainer_python_args_only(self):
+        """Test that CustomTrainer accepts python_args without python_file (which stays None)."""
+        trainer = types.CustomTrainer(python_args=["--epochs", "100"])
+        assert trainer.python_file is None
+        assert trainer.python_args == ["--epochs", "100"]
+
+    def test_custom_trainer_python_args_with_func(self):
+        """Test that CustomTrainer accepts func together with python_args at construction."""
+        def dummy_func():
+            pass
+
+        trainer = types.CustomTrainer(
+            func=dummy_func,
+            python_args=["--epochs", "100"]
+        )
+        assert trainer.func == dummy_func
+        assert trainer.python_file is None
+        assert trainer.python_args == ["--epochs", "100"]
diff --git a/python/kubeflow/trainer/utils/utils.py b/python/kubeflow/trainer/utils/utils.py
@@ -372,6 +372,20 @@ def get_trainer_crd_from_custom_trainer(
             trainer.resources_per_node
         )
 
+    if trainer.python_file:
+        if trainer.func:
+            raise ValueError("Specify only one of func or python_file in CustomTrainer.")
+        trainer_crd.command = ["python"]
+        # Build args: the script path first, then any python_args
+        args = [trainer.python_file]
+        if trainer.python_args:
+            args.extend(trainer.python_args)
+        trainer_crd.args = args
+        return trainer_crd
+
+    if not trainer.func:
+        raise ValueError("You must specify either func or python_file in CustomTrainer.")
+
     # Add command to the Trainer.
     # TODO: Support train function parameters.
     trainer_crd.command = get_command_using_train_func(
diff --git a/python/kubeflow/trainer/utils/utils_test.py b/python/kubeflow/trainer/utils/utils_test.py
@@ -0,0 +1,99 @@
+# Copyright 2024 The Kubeflow Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+from unittest.mock import Mock
+
+from kubeflow.trainer.utils import utils
+from kubeflow.trainer.types import types
+
+
+class TestCustomTrainerPythonFileSupport(unittest.TestCase):
+    """Test how get_trainer_crd_from_custom_trainer handles python_file and python_args."""
+
+    def test_get_trainer_crd_from_custom_trainer_python_file_with_args(self):
+        """Test that command is ["python"] and args is the script followed by python_args."""
+        runtime = Mock()
+        trainer = types.CustomTrainer(
+            python_file="train.py",
+            python_args=["--epochs", "100", "--batch-size", "32"],
+            num_nodes=2,
+            resources_per_node={"gpu": "4"},
+        )
+
+        result = utils.get_trainer_crd_from_custom_trainer(runtime, trainer)
+
+        self.assertEqual(result.num_nodes, 2)
+        self.assertEqual(result.command, ["python"])
+        self.assertEqual(result.args, ["train.py", "--epochs", "100", "--batch-size", "32"])
+
+    def test_get_trainer_crd_from_custom_trainer_python_file_no_args(self):
+        """Test that args contains only the script path when python_args is not set."""
+        runtime = Mock()
+        trainer = types.CustomTrainer(
+            python_file="train.py", num_nodes=2, resources_per_node={"gpu": "4"}
+        )
+
+        result = utils.get_trainer_crd_from_custom_trainer(runtime, trainer)
+
+        self.assertEqual(result.num_nodes, 2)
+        self.assertEqual(result.command, ["python"])
+        self.assertEqual(result.args, ["train.py"])
+
+    def test_get_trainer_crd_from_custom_trainer_mutual_exclusivity_both_specified(self):
+        """Test that setting both func and python_file raises ValueError."""
+        runtime = Mock()
+        trainer = types.CustomTrainer(func=lambda: None, python_file="train.py")
+
+        with self.assertRaises(ValueError) as context:
+            utils.get_trainer_crd_from_custom_trainer(runtime, trainer)
+
+        self.assertIn(
+            "Specify only one of func or python_file in CustomTrainer", str(context.exception)
+        )
+
+    def test_get_trainer_crd_from_custom_trainer_mutual_exclusivity_neither_specified(self):
+        """Test that setting neither func nor python_file raises ValueError."""
+        runtime = Mock()
+        trainer = types.CustomTrainer()
+
+        with self.assertRaises(ValueError) as context:
+            utils.get_trainer_crd_from_custom_trainer(runtime, trainer)
+
+        self.assertIn(
+            "You must specify either func or python_file in CustomTrainer", str(context.exception)
+        )
+
+    def test_get_trainer_crd_from_custom_trainer_with_func_unchanged(self):
+        """Test that a func-based trainer still goes through get_command_using_train_func."""
+        runtime = Mock()
+        runtime.trainer = Mock()
+        runtime.trainer.command = ["python", "script.py"]
+
+        def dummy_func():
+            pass
+
+        trainer = types.CustomTrainer(
+            func=dummy_func, func_args={"lr": 0.001}, num_nodes=2, resources_per_node={"gpu": "4"}
+        )
+
+        with unittest.mock.patch(
+            "kubeflow.trainer.utils.utils.get_command_using_train_func"
+        ) as mock_get_command:
+            mock_get_command.return_value = ["python", "script.py"]
+            result = utils.get_trainer_crd_from_custom_trainer(runtime, trainer)
+
+        self.assertEqual(result.num_nodes, 2)
+        # The func path must call the patched get_command_using_train_func exactly once
+        mock_get_command.assert_called_once()