-
Notifications
You must be signed in to change notification settings - Fork 440
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: Refactor to make it easy to extend new kinds (#865)
* feat: Refactor to make it easy to extend new kinds Signed-off-by: Ce Gao <[email protected]> * fix: Remove hard coded name Signed-off-by: Ce Gao <[email protected]>
- Loading branch information
1 parent
fb6739c
commit 198a63a
Showing
12 changed files
with
303 additions
and
137 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,48 @@ | ||
package v1alpha3 | ||
|
||
import ( | ||
"github.com/kubeflow/katib/pkg/controller.v1alpha3/consts" | ||
"k8s.io/apimachinery/pkg/runtime/schema" | ||
) | ||
|
||
// Label keys used to identify a job and the role of each of its replicas.
const (
	// JobNameLabel is the label key whose value is the name of the job.
	JobNameLabel = "job-name"
	// JobRoleLabel is the label key whose value is the replica's role
	// within the job (for example "master").
	JobRoleLabel = "job-role"
	// TFJobRoleLabel is the TFJob-specific role label key. It is deprecated
	// in kubeflow 0.7 but is kept here for compatibility.
	TFJobRoleLabel = "tf-job-role"
	// PyTorchJobRoleLabel is the PyTorchJob-specific role label key. It is
	// deprecated in kubeflow 0.7 but is kept here for compatibility.
	PyTorchJobRoleLabel = "pytorch-job-role"
)
|
||
// JobRoleMap is the map which is used to determin if the replica is master. | ||
// Katib will inject metrics collector into master replica. | ||
var JobRoleMap = map[string][]string{ | ||
// Job kind does not support distributed training, thus no master. | ||
consts.JobKindJob: {}, | ||
consts.JobKindTF: {JobRoleLabel, TFJobRoleLabel}, | ||
consts.JobKindPyTorch: {JobRoleLabel, PyTorchJobRoleLabel}, | ||
} | ||
|
||
// GetSupportedJobList returns the list of the supported jobs' GVK. | ||
func GetSupportedJobList() []schema.GroupVersionKind { | ||
supportedJobList := []schema.GroupVersionKind{ | ||
{ | ||
Group: consts.JobGroupJob, | ||
Version: consts.JobVersionJob, | ||
Kind: consts.JobKindJob, | ||
}, | ||
{ | ||
Group: consts.JobGroupKubeflow, | ||
Version: consts.JobVersionTF, | ||
Kind: consts.JobKindTF, | ||
}, | ||
{ | ||
Group: consts.JobGroupKubeflow, | ||
Version: consts.JobVersionPyTorch, | ||
Kind: consts.JobKindPyTorch, | ||
}, | ||
} | ||
return supportedJobList | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,68 @@ | ||
package job | ||
|
||
import ( | ||
"fmt" | ||
|
||
commonv1 "github.com/kubeflow/tf-operator/pkg/apis/common/v1" | ||
batchv1 "k8s.io/api/batch/v1" | ||
corev1 "k8s.io/api/core/v1" | ||
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" | ||
"k8s.io/apimachinery/pkg/runtime" | ||
logf "sigs.k8s.io/controller-runtime/pkg/runtime/log" | ||
) | ||
|
||
var ( | ||
log = logf.Log.WithName("provider-job") | ||
) | ||
|
||
// Job is the provider for the Job kind. It carries no state; its methods
// implement the kind-specific behavior.
type Job struct{}
|
||
// GetDeployedJobStatus get the deployed job status. | ||
func (j Job) GetDeployedJobStatus( | ||
deployedJob *unstructured.Unstructured) (*commonv1.JobCondition, error) { | ||
jobCondition := commonv1.JobCondition{} | ||
// Set default type to running. | ||
jobCondition.Type = commonv1.JobRunning | ||
status, ok, unerr := unstructured.NestedFieldCopy(deployedJob.Object, "status") | ||
if !ok { | ||
if unerr != nil { | ||
log.Error(unerr, "NestedFieldCopy unstructured to status error") | ||
return nil, unerr | ||
} | ||
err := fmt.Errorf("value is missing") | ||
log.Error(err, "NestedFieldCopy unstructured to status error") | ||
return nil, err | ||
} | ||
|
||
statusMap := status.(map[string]interface{}) | ||
jobStatus := batchv1.JobStatus{} | ||
err := runtime.DefaultUnstructuredConverter.FromUnstructured(statusMap, &jobStatus) | ||
if err != nil { | ||
log.Error(err, "Convert unstructured to status error") | ||
return nil, err | ||
} | ||
for _, cond := range jobStatus.Conditions { | ||
if cond.Type == batchv1.JobComplete && cond.Status == corev1.ConditionTrue { | ||
jobCondition.Type = commonv1.JobSucceeded | ||
// JobConditions message not populated when succeeded for batchv1 Job | ||
break | ||
} | ||
if cond.Type == batchv1.JobFailed && cond.Status == corev1.ConditionTrue { | ||
jobCondition.Type = commonv1.JobFailed | ||
jobCondition.Message = cond.Message | ||
break | ||
} | ||
} | ||
return &jobCondition, nil | ||
} | ||
|
||
// IsTrainingContainer returns if the c is the actual training container. | ||
func (j Job) IsTrainingContainer(index int, c corev1.Container) bool { | ||
if index == 0 { | ||
// for Job worker, the first container will be taken as worker container, | ||
// katib document should note it | ||
return true | ||
} | ||
return false | ||
} |
Oops, something went wrong.