Skip to content

Commit

Permalink
update unit tests and fix api errors
Browse files Browse the repository at this point in the history
Signed-off-by: helenxie-bit <[email protected]>
  • Loading branch information
helenxie-bit committed Sep 5, 2024
1 parent 8c4d65a commit b0195a6
Show file tree
Hide file tree
Showing 3 changed files with 26 additions and 31 deletions.
10 changes: 10 additions & 0 deletions .github/workflows/test-python.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,16 @@ jobs:
uses: actions/setup-python@v5
with:
python-version: 3.11

- name: Install Katib SDK
shell: bash
run: pip install --prefer-binary -e sdk/python/v1beta1

- name: Install Training Operator SDK
shell: bash
run: |
pip install git+https://github.com/kubeflow/[email protected]#subdirectory=sdk/python
pip install peft==0.3.0 datasets==2.15.0 transformers==4.38.0
- name: Run Python test
run: make pytest
Expand Down
12 changes: 9 additions & 3 deletions sdk/python/v1beta1/kubeflow/katib/api/katib_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -416,6 +416,12 @@ class name in this argument.

# If users choose to use a custom objective function.
if objective is not None:
if (
not base_image
or not parameters
):
raise ValueError("One of the required parameters is None")

# Add metrics collector to the Katib Experiment.
# Currently only the `kind` parameter is supported; it specifies the kind
# of metrics collector and defaults to `StdOut`.
Expand Down Expand Up @@ -645,7 +651,7 @@ class name in this argument.
f"'{training_args}'",
],
volume_mounts=[STORAGE_INITIALIZER_VOLUME_MOUNT],
resources=resources_per_trial.resources_per_worker,
resources=resources_per_trial.resources_per_worker if resources_per_trial else None,
)

# Create the worker and the master pod.
Expand Down Expand Up @@ -679,7 +685,7 @@ class name in this argument.
),
)

if resources_per_trial.num_procs_per_worker:
if resources_per_trial is not None and resources_per_trial.num_procs_per_worker:
pytorchjob.spec.nproc_per_node = str(
resources_per_trial.num_procs_per_worker
)
Expand All @@ -691,7 +697,7 @@ class name in this argument.
)
)

if resources_per_trial.num_workers > 1:
if resources_per_trial is not None and resources_per_trial.num_workers > 1:
pytorchjob.spec.pytorch_replica_specs["Worker"] = (
training_models.KubeflowOrgV1ReplicaSpec(
replicas=resources_per_trial.num_workers - 1,
Expand Down
35 changes: 7 additions & 28 deletions test/unit/v1beta1/tune-api/test_tune_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,28 +167,7 @@ def test_tune_invalid_env_per_trial(self):

self.assertIn("Incorrect value for env_per_trial", str(context.exception))

# Case 2: Invalid resources_per_trial.num_workers (for distributed training)
def test_tune_invalid_resources_per_trial_value(self):
# Calls `tune` with a custom objective, two search parameters, and a
# `TrainerResources` whose `num_workers` is 0, and expects a ValueError.
with self.assertRaises(ValueError) as context:
self.katib_client.tune(
name="experiment",
objective=lambda x: x,
parameters={
"a": katib.search.int(min=10, max=100),
"b": katib.search.double(min=0.1, max=0.2),
},
resources_per_trial=katib.TrainerResources(
num_workers=0, # Invalid value, should be at least 1
num_procs_per_worker=1,
resources_per_worker={"cpu": "1", "memory": "1Gi"},
),
)

# The captured exception's message must mention that at least one Worker
# is required for the PyTorchJob.
self.assertIn(
"At least one Worker for PyTorchJob must be set", str(context.exception)
)

# Case 3: Invalid model_provider_parameters
# Case 2: Invalid model_provider_parameters
def test_tune_invalid_model_provider_parameters(self):
with self.assertRaises(ValueError) as context:
self.katib_client.tune(
Expand Down Expand Up @@ -221,7 +200,7 @@ def test_tune_invalid_model_provider_parameters(self):
str(context.exception),
)

# Case 4: Invalid dataset_provider_parameters
# Case 3: Invalid dataset_provider_parameters
def test_tune_invalid_dataset_provider_parameters(self):
with self.assertRaises(ValueError) as context:
self.katib_client.tune(
Expand Down Expand Up @@ -255,7 +234,7 @@ def test_tune_invalid_dataset_provider_parameters(self):
str(context.exception),
)

# Case 5: Invalid trainer_parameters.training_parameters
# Case 4: Invalid trainer_parameters.training_parameters
def test_tune_invalid_trainer_parameters_training_parameters(self):
with self.assertRaises(TypeError) as context:
self.katib_client.tune(
Expand Down Expand Up @@ -283,7 +262,7 @@ def test_tune_invalid_trainer_parameters_training_parameters(self):
str(context.exception),
)

# Case 6: Invalid trainer_parameters.lora_config
# Case 5: Invalid trainer_parameters.lora_config
def test_tune_invalid_trainer_parameters_lora_config(self):
with self.assertRaises(TypeError) as context:
self.katib_client.tune(
Expand Down Expand Up @@ -545,9 +524,9 @@ def test_experiment_creation_with_custom_objective(
"<lambda>({'a': '${trialParameters.a}', 'b': '${trialParameters.b}'})\n"
"\n"
"EOM\n"
'printf "%s" "$SCRIPT" > "$program_path/ephemeral_script.py"\n'
'python3 -u "$program_path/ephemeral_script.py"'
],
'printf "%s" "$SCRIPT" > $program_path/ephemeral_objective.py\n'
'python3 -u $program_path/ephemeral_objective.py'
],
resources=models.V1ResourceRequirements(
requests={"cpu": "1", "memory": "1Gi"},
limits={"cpu": "1", "memory": "1Gi"},
Expand Down

0 comments on commit b0195a6

Please sign in to comment.