Update unit tests and fix API errors

Signed-off-by: helenxie-bit <[email protected]>
kubeflow · Sep 5, 2024 · b0195a6
1 parent 8c4d65a
commit b0195a6
Show file tree

Hide file tree

Showing 3 changed files with 26 additions and 31 deletions.
diff --git a/.github/workflows/test-python.yaml b/.github/workflows/test-python.yaml
@@ -22,6 +22,16 @@ jobs:
         uses: actions/setup-python@v5
         with:
           python-version: 3.11
+
+      - name: Install Katib SDK
+        shell: bash
+        run: pip install --prefer-binary -e sdk/python/v1beta1
+
+      - name: Install Training Operator SDK
+        shell: bash
+        run: |
+          pip install git+https://github.com/kubeflow/[email protected]#subdirectory=sdk/python
+          pip install peft==0.3.0 datasets==2.15.0 transformers==4.38.0
 
       - name: Run Python test
         run: make pytest

diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py
@@ -416,6 +416,12 @@ class name in this argument.
 
         # If users choose to use a custom objective function.
         if objective is not None:
+            if (
+                not base_image
+                or not parameters
+            ):
+                raise ValueError("One of the required parameters is None")
+
             # Add metrics collector to the Katib Experiment.
             # Up to now, we only support parameter `kind`, of which default value
             # is `StdOut`, to specify the kind of metrics collector.
@@ -645,7 +651,7 @@ class name in this argument.
                     f"'{training_args}'",
                 ],
                 volume_mounts=[STORAGE_INITIALIZER_VOLUME_MOUNT],
-                resources=resources_per_trial.resources_per_worker,
+                resources=resources_per_trial.resources_per_worker if resources_per_trial else None,
             )
 
             # Create the worker and the master pod.
@@ -679,7 +685,7 @@ class name in this argument.
                 ),
             )
 
-            if resources_per_trial.num_procs_per_worker:
+            if resources_per_trial is not None and resources_per_trial.num_procs_per_worker:
                 pytorchjob.spec.nproc_per_node = str(
                     resources_per_trial.num_procs_per_worker
                 )
@@ -691,7 +697,7 @@ class name in this argument.
                 )
             )
 
-            if resources_per_trial.num_workers > 1:
+            if resources_per_trial is not None and resources_per_trial.num_workers > 1:
                 pytorchjob.spec.pytorch_replica_specs["Worker"] = (
                     training_models.KubeflowOrgV1ReplicaSpec(
                         replicas=resources_per_trial.num_workers - 1,

diff --git a/test/unit/v1beta1/tune-api/test_tune_api.py b/test/unit/v1beta1/tune-api/test_tune_api.py
@@ -167,28 +167,7 @@ def test_tune_invalid_env_per_trial(self):
 
         self.assertIn("Incorrect value for env_per_trial", str(context.exception))
 
-    # Case 2: Invalid resources_per_trial.num_workers (for distributed training)
-    def test_tune_invalid_resources_per_trial_value(self):
-        with self.assertRaises(ValueError) as context:
-            self.katib_client.tune(
-                name="experiment",
-                objective=lambda x: x,
-                parameters={
-                    "a": katib.search.int(min=10, max=100),
-                    "b": katib.search.double(min=0.1, max=0.2),
-                },
-                resources_per_trial=katib.TrainerResources(
-                    num_workers=0,  # Invalid value, should be at least 1
-                    num_procs_per_worker=1,
-                    resources_per_worker={"cpu": "1", "memory": "1Gi"},
-                ),
-            )
-
-        self.assertIn(
-            "At least one Worker for PyTorchJob must be set", str(context.exception)
-        )
-
-    # Case 3: Invalid model_provider_parameters
+    # Case 2: Invalid model_provider_parameters
     def test_tune_invalid_model_provider_parameters(self):
         with self.assertRaises(ValueError) as context:
             self.katib_client.tune(
@@ -221,7 +200,7 @@ def test_tune_invalid_model_provider_parameters(self):
             str(context.exception),
         )
 
-    # Case 4: Invalid dataset_provider_parameters
+    # Case 3: Invalid dataset_provider_parameters
     def test_tune_invalid_dataset_provider_parameters(self):
         with self.assertRaises(ValueError) as context:
             self.katib_client.tune(
@@ -255,7 +234,7 @@ def test_tune_invalid_dataset_provider_parameters(self):
             str(context.exception),
         )
 
-    # Case 5: Invalid trainer_parameters.training_parameters
+    # Case 4: Invalid trainer_parameters.training_parameters
     def test_tune_invalid_trainer_parameters_training_parameters(self):
         with self.assertRaises(TypeError) as context:
             self.katib_client.tune(
@@ -283,7 +262,7 @@ def test_tune_invalid_trainer_parameters_training_parameters(self):
             str(context.exception),
         )
 
-    # Case 6: Invalid trainer_parameters.lora_config
+    # Case 5: Invalid trainer_parameters.lora_config
     def test_tune_invalid_trainer_parameters_lora_config(self):
         with self.assertRaises(TypeError) as context:
             self.katib_client.tune(
@@ -545,9 +524,9 @@ def test_experiment_creation_with_custom_objective(
                     "<lambda>({'a': '${trialParameters.a}', 'b': '${trialParameters.b}'})\n"
                     "\n"
                     "EOM\n"
-                    'printf "%s" "$SCRIPT" > "$program_path/ephemeral_script.py"\n'
-                    'python3 -u "$program_path/ephemeral_script.py"'
-                ],
+                    'printf "%s" "$SCRIPT" > $program_path/ephemeral_objective.py\n'
+                    'python3 -u $program_path/ephemeral_objective.py'
+                    ],
                 resources=models.V1ResourceRequirements(
                     requests={"cpu": "1", "memory": "1Gi"},
                     limits={"cpu": "1", "memory": "1Gi"},