-
Notifications
You must be signed in to change notification settings - Fork 440
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add validation for NAS job in Katib controller #398
Changes from 8 commits
9f9a99a
0b171ab
46221b2
d407be8
c5c284f
14cc8be
36958e0
76f8397
c3e54cb
345d66b
e45216e
3dc4f3e
4b47650
3a7e0b0
c16690e
30e876e
38febad
8a01094
ea896ae
b88030b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -25,6 +25,7 @@ import ( | |
) | ||
|
||
func initializeStudy(instance *katibv1alpha1.StudyJob) error { | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. move validateHPJob and validateNASJob into validateStudy, and move validateStudy away from initializeStudy There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You want to put validateStudy function before initializeStudy? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. no, validateStudy has been moved to https://github.com/kubeflow/katib/blob/master/pkg/controller/studyjob/validating_webhook.go#L44 |
||
if instance.Spec.SuggestionSpec.SuggestionAlgorithm == "" { | ||
instance.Spec.SuggestionSpec.SuggestionAlgorithm = "random" | ||
} | ||
|
@@ -73,15 +74,15 @@ func initializeStudy(instance *katibv1alpha1.StudyJob) error { | |
} | ||
|
||
func getStudyConf(instance *katibv1alpha1.StudyJob) (*katibapi.StudyConfig, error) { | ||
jobType := getJobType(instance) | ||
jobType := getJobType(instance.Spec.SuggestionSpec.SuggestionAlgorithm) | ||
if jobType == jobTypeNAS { | ||
return populateConfigForNAS(instance) | ||
} | ||
return populateConfigForHP(instance) | ||
} | ||
|
||
func getJobType(instance *katibv1alpha1.StudyJob) string { | ||
if instance.Spec.NasConfig != nil { | ||
func getJobType(suggestionAlgorithm string) string { | ||
if contains(suggestionNASList, suggestionAlgorithm) { | ||
return jobTypeNAS | ||
} | ||
return jobTypeHP | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -15,6 +15,7 @@ import ( | |
"fmt" | ||
"io/ioutil" | ||
"log" | ||
"reflect" | ||
"strings" | ||
|
||
katibapi "github.com/kubeflow/katib/pkg/api" | ||
|
@@ -101,6 +102,21 @@ func getWorkerKind(workerSpec *katibv1alpha1.WorkerSpec) (*schema.GroupVersionKi | |
} | ||
|
||
func validateStudy(instance *katibv1alpha1.StudyJob) error { | ||
|
||
if getJobType(instance.Spec.SuggestionSpec.SuggestionAlgorithm) != jobTypeNAS { | ||
//Validate HP job | ||
if validJobErr := validateHPJob(instance); validJobErr != nil { | ||
instance.Status.Condition = katibv1alpha1.ConditionFailed | ||
return validJobErr | ||
} | ||
} else { | ||
//Validate NAS job | ||
if validJobErr := validateNASJob(instance); validJobErr != nil { | ||
instance.Status.Condition = katibv1alpha1.ConditionFailed | ||
return validJobErr | ||
} | ||
} | ||
|
||
if instance.Spec.SuggestionSpec == nil { | ||
return fmt.Errorf("No Spec.SuggestionSpec specified.") | ||
} | ||
|
@@ -155,6 +171,7 @@ func validateStudy(instance *katibv1alpha1.StudyJob) error { | |
if mcjob.GetNamespace() != namespace || mcjob.GetName() != workerID { | ||
return fmt.Errorf("Invalid metricsCollector template.") | ||
} | ||
|
||
return nil | ||
} | ||
|
||
|
@@ -206,6 +223,86 @@ func contains(l []string, s string) bool { | |
return false | ||
} | ||
|
||
func validateHPJob(instance *katibv1alpha1.StudyJob) error { | ||
log.Printf("Validation for the HP job") | ||
if instance.Spec.ParameterConfigs == nil { | ||
return fmt.Errorf("No Spec.ParameterConfigs specified") | ||
} | ||
return validateParameterConfigs(instance.Spec.ParameterConfigs, jobTypeHP, instance.Spec.SuggestionSpec.SuggestionAlgorithm) | ||
|
||
} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
|
||
func validateNASJob(instance *katibv1alpha1.StudyJob) error { | ||
|
||
log.Printf("Validation for the NAS job") | ||
if instance.Spec.NasConfig == nil { | ||
return fmt.Errorf("No Spec.NasConfig specified") | ||
} | ||
|
||
if reflect.DeepEqual(instance.Spec.NasConfig.GraphConfig, katibv1alpha1.GraphConfig{}) { | ||
return fmt.Errorf("Missing GraphConfig in NasConfig: %v", instance.Spec.NasConfig) | ||
} | ||
|
||
if instance.Spec.NasConfig.GraphConfig.InputSize == nil { | ||
return fmt.Errorf("Missing InputSize in NasConfig.GraphConfig: %v", instance.Spec.NasConfig.GraphConfig) | ||
} | ||
|
||
if instance.Spec.NasConfig.GraphConfig.OutputSize == nil { | ||
return fmt.Errorf("Missing OutputSize in NasConfig.GraphConfig: %v", instance.Spec.NasConfig.GraphConfig) | ||
} | ||
|
||
if instance.Spec.NasConfig.GraphConfig.NumLayers == 0 && contains(suggestionNASList, instance.Spec.SuggestionSpec.SuggestionAlgorithm) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think line 298 can use contains(suggestionNASList,..), but here I don't think so. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @hougangliu You think we should specify this block only for There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is there a better place to put algorithm specific checks other than the controller? This list can grow over time with more algorithms and these checks are very specific to the algorithm implementation. @andreyvelich @hougangliu Thoughts? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. From my point of view, it is important to check it in the controller. We don't need to run getSuggestion if something wrong with yaml file. We can't set failed condition for the StudyJob inside Suggestion. Controller do that. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We should do algorithm-specific check in suggestion service.
@andreyvelich @johnugeorge @hougangliu Thoughts? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @YujiOshima In that case, should we send StudyConfig to ValidateSuggestionParameters, because we don't save StudyConfig in db, if it is not valid. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @andreyvelich You're right. StudyConf and SuggestionParams should be included in an args of ValidateSuggestionParameters. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @YujiOshima Ok. In this PR, should I make a validation only for NAS RL service, or we should include BO here, as well? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @andreyvelich Only for NAS RL is enough here. If you could add validation for BO in another PR, it would be fantastic. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @YujiOshima @hougangliu What do you think about name of this API: |
||
return fmt.Errorf("Missing NumLayers in NasConfig.GraphConfig: %v", instance.Spec.NasConfig.GraphConfig) | ||
} | ||
|
||
if instance.Spec.NasConfig.Operations == nil { | ||
return fmt.Errorf("Missing Operations in NasConfig: %v", instance.Spec.NasConfig) | ||
} | ||
|
||
for _, op := range instance.Spec.NasConfig.Operations { | ||
if op.OperationType == "" { | ||
return fmt.Errorf("Missing OperationType in Operation: %v", op) | ||
} | ||
if op.ParameterConfigs == nil { | ||
return fmt.Errorf("Missing ParameterConfig in Operation: %v", op) | ||
} | ||
return validateParameterConfigs(op.ParameterConfigs, jobTypeNAS, instance.Spec.SuggestionSpec.SuggestionAlgorithm) | ||
} | ||
|
||
return nil | ||
} | ||
|
||
func validateParameterConfigs(parameterConfigs []katibv1alpha1.ParameterConfig, jobType string, suggestionAlgorithm string) error { | ||
for _, pc := range parameterConfigs { | ||
if pc.Name == "" { | ||
return fmt.Errorf("Missing Name in ParameterConfig: %v", pc) | ||
} | ||
if pc.ParameterType == "" { | ||
return fmt.Errorf("Missing ParameterType in ParameterConfig: %v", pc) | ||
} | ||
if reflect.DeepEqual(pc.Feasible, katibv1alpha1.FeasibleSpace{}) { | ||
return fmt.Errorf("Missing Feasible in ParameterConfig: %v", pc) | ||
} | ||
if pc.ParameterType == katibv1alpha1.ParameterTypeCategorical || pc.ParameterType == katibv1alpha1.ParameterTypeDiscrete { | ||
if pc.Feasible.List == nil { | ||
return fmt.Errorf("Missing List in ParameterConfig.Feasible: %v", pc.Feasible) | ||
} | ||
} | ||
if pc.ParameterType == katibv1alpha1.ParameterTypeDouble || pc.ParameterType == katibv1alpha1.ParameterTypeInt { | ||
if pc.Feasible.Min == "" { | ||
return fmt.Errorf("Missing Min in ParameterConfig.Feasible: %v", pc.Feasible) | ||
} | ||
if pc.Feasible.Max == "" { | ||
return fmt.Errorf("Missing Max in ParameterConfig.Feasible: %v", pc.Feasible) | ||
} | ||
if jobType == jobTypeNAS && pc.Feasible.Step == "" && pc.ParameterType == katibv1alpha1.ParameterTypeDouble && contains(suggestionNASList, suggestionAlgorithm) { | ||
return fmt.Errorf("Missing Step in ParameterConfig.Feasible for NAS job: %v", pc.Feasible) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. define a suggestionNASList (now only nasrl is in) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Same here. Better way to handle this? |
||
} | ||
} | ||
} | ||
return nil | ||
} | ||
|
||
func getMyNamespace() string { | ||
data, _ := ioutil.ReadFile("/var/run/secrets/kubernetes.io/serviceaccount/namespace") | ||
return strings.TrimSpace(string(data)) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Our goal is to keep controller independent of worker type and suggestion algos. Currently there are no worker specific code in the code(except for watch. Hope, it can be fixed soon and ValidWorkerKindList won't not needed any more). Similarly i feel that there should not be any suggestion algorithm specific changes in the controller code.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
In that case, how we can make a validation for the NAS job ?