-
Notifications
You must be signed in to change notification settings - Fork 231
Re-enable metrics, add metrics for vSphere #590
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -12,6 +12,7 @@ import ( | |
| vspherev1 "github.com/openshift/machine-api-operator/pkg/apis/vsphereprovider/v1beta1" | ||
| machinecontroller "github.com/openshift/machine-api-operator/pkg/controller/machine" | ||
| "github.com/openshift/machine-api-operator/pkg/controller/vsphere/session" | ||
| "github.com/openshift/machine-api-operator/pkg/metrics" | ||
| "github.com/vmware/govmomi/find" | ||
| "github.com/vmware/govmomi/object" | ||
| "github.com/vmware/govmomi/property" | ||
|
|
@@ -67,6 +68,13 @@ func (r *Reconciler) create() error { | |
| return err | ||
| } | ||
| } | ||
| if moTask.Info.State == types.TaskInfoStateError { | ||
| metrics.RegisterFailedInstanceCreate(&metrics.MachineLabels{ | ||
| Name: r.machine.Name, | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. how are this metrics actually being exposed through this controller metrics server?
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This happens inside the init in https://github.com/openshift/machine-api-operator/pull/590/files#diff-7cbe8e056d62a2de30c7066e359bd9c9R68 |
||
| Namespace: r.machine.Namespace, | ||
| Reason: fmt.Sprintf("Create machine task finished with error: %+v", moTask.Info.Error), | ||
| }) | ||
| } | ||
| if taskIsFinished, err := taskIsFinished(moTask); err != nil || !taskIsFinished { | ||
| if !taskIsFinished { | ||
| return fmt.Errorf("task %v has not finished", moTask.Reference().Value) | ||
|
|
@@ -113,6 +121,13 @@ func (r *Reconciler) update() error { | |
| return err | ||
| } | ||
| } | ||
| if motask.Info.State == types.TaskInfoStateError { | ||
| metrics.RegisterFailedInstanceCreate(&metrics.MachineLabels{ | ||
| Name: r.machine.Name, | ||
| Namespace: r.machine.Namespace, | ||
| Reason: fmt.Sprintf("Create machine task finished with error: %+v", motask.Info.Error), | ||
| }) | ||
| } | ||
| if taskIsFinished, err := taskIsFinished(motask); err != nil || !taskIsFinished { | ||
| if !taskIsFinished { | ||
| return fmt.Errorf("task %v has not finished", motask.Reference().Value) | ||
|
|
@@ -167,6 +182,13 @@ func (r *Reconciler) delete() error { | |
| return err | ||
| } | ||
| } | ||
| if moTask.Info.State == types.TaskInfoStateError { | ||
| metrics.RegisterFailedInstanceCreate(&metrics.MachineLabels{ | ||
| Name: r.machine.Name, | ||
| Namespace: r.machine.Namespace, | ||
| Reason: fmt.Sprintf("Create machine task finished with error: %+v", moTask.Info.Error), | ||
| }) | ||
| } | ||
| if taskIsFinished, err := taskIsFinished(moTask); err != nil || !taskIsFinished { | ||
| if !taskIsFinished { | ||
| return fmt.Errorf("task %v has not finished", moTask.Reference().Value) | ||
|
|
@@ -178,6 +200,11 @@ func (r *Reconciler) delete() error { | |
| vmRef, err := findVM(r.machineScope) | ||
| if err != nil { | ||
| if !isNotFound(err) { | ||
| metrics.RegisterFailedInstanceDelete(&metrics.MachineLabels{ | ||
| Name: r.machine.Name, | ||
| Namespace: r.machine.Namespace, | ||
| Reason: err.Error(), | ||
| }) | ||
| return err | ||
| } | ||
| klog.Infof("%v: vm does not exist", r.machine.GetName()) | ||
|
|
@@ -196,6 +223,11 @@ func (r *Reconciler) delete() error { | |
|
|
||
| task, err := vm.Obj.Destroy(r.Context) | ||
| if err != nil { | ||
| metrics.RegisterFailedInstanceDelete(&metrics.MachineLabels{ | ||
| Name: r.machine.Name, | ||
| Namespace: r.machine.Namespace, | ||
| Reason: err.Error(), | ||
| }) | ||
| return fmt.Errorf("%v: failed to destroy vm: %v", r.machine.GetName(), err) | ||
| } | ||
|
|
||
|
|
@@ -258,6 +290,11 @@ func (r *Reconciler) reconcileRegionAndZoneLabels(vm *virtualMachine) error { | |
| }) | ||
|
|
||
| if err != nil { | ||
| metrics.RegisterFailedInstanceUpdate(&metrics.MachineLabels{ | ||
| Name: r.machine.Name, | ||
| Namespace: r.machine.Namespace, | ||
| Reason: err.Error(), | ||
| }) | ||
| return err | ||
| } | ||
|
|
||
|
|
@@ -550,7 +587,13 @@ func clone(s *machineScope) (string, error) { | |
|
|
||
| task, err := vmTemplate.Clone(s, folder, s.machine.GetName(), spec) | ||
| if err != nil { | ||
| return "", fmt.Errorf("error triggering clone op for machine %v: %w", s, err) | ||
| err = fmt.Errorf("error triggering clone op for machine %v: %w", s, err) | ||
| metrics.RegisterFailedInstanceCreate(&metrics.MachineLabels{ | ||
| Name: s.machine.Name, | ||
| Namespace: s.machine.Namespace, | ||
| Reason: err.Error(), | ||
| }) | ||
| return "", err | ||
| } | ||
|
|
||
| klog.V(3).Infof("%v: running task: %+v", s.machine.GetName(), s.providerStatus.TaskRef) | ||
|
|
@@ -813,6 +856,11 @@ func (vm *virtualMachine) reconcileTags(ctx context.Context, session *session.Se | |
| klog.Infof("%v: Attaching %s tag to vm", machine.GetName(), clusterID) | ||
| // the tag should already be created by installer | ||
| if err := m.AttachTag(ctx, clusterID, vm.Ref); err != nil { | ||
| metrics.RegisterFailedInstanceUpdate(&metrics.MachineLabels{ | ||
| Name: machine.Name, | ||
| Namespace: machine.Namespace, | ||
| Reason: err.Error(), | ||
| }) | ||
| return err | ||
| } | ||
| } | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,12 +1,19 @@ | ||
| package metrics | ||
|
|
||
| import ( | ||
| "github.com/golang/glog" | ||
| mapiv1beta1 "github.com/openshift/machine-api-operator/pkg/apis/machine/v1beta1" | ||
| machineinformers "github.com/openshift/machine-api-operator/pkg/generated/informers/externalversions/machine/v1beta1" | ||
| machinelisters "github.com/openshift/machine-api-operator/pkg/generated/listers/machine/v1beta1" | ||
| "github.com/prometheus/client_golang/prometheus" | ||
| "k8s.io/apimachinery/pkg/labels" | ||
| "k8s.io/klog" | ||
| "sigs.k8s.io/controller-runtime/pkg/metrics" | ||
| ) | ||
|
|
||
| const ( | ||
| DefaultHealthCheckMetricsAddress = ":8083" | ||
| DefaultMachineSetMetricsAddress = ":8082" | ||
| DefaultMachineMetricsAddress = ":8081" | ||
| ) | ||
|
|
||
| var ( | ||
|
|
@@ -33,10 +40,36 @@ var ( | |
| Name: "mapi_mao_collector_up", | ||
| Help: "Machine API Operator metrics are being collected and reported successfully", | ||
| }, []string{"kind"}) | ||
|
|
||
| failedInstanceCreateCount = prometheus.NewCounterVec( | ||
| prometheus.CounterOpts{ | ||
| Name: "mapi_instance_create_failed", | ||
| Help: "Number of times provider instance create has failed.", | ||
| }, []string{"name", "namespace", "reason"}, | ||
| ) | ||
|
|
||
| failedInstanceUpdateCount = prometheus.NewCounterVec( | ||
| prometheus.CounterOpts{ | ||
| Name: "mapi_instance_update_failed", | ||
| Help: "Number of times provider instance update has failed.", | ||
| }, []string{"name", "namespace", "reason"}, | ||
| ) | ||
|
|
||
| failedInstanceDeleteCount = prometheus.NewCounterVec( | ||
| prometheus.CounterOpts{ | ||
| Name: "mapi_instance_delete_failed", | ||
| Help: "Number of times provider instance delete has failed.", | ||
| }, []string{"name", "namespace", "reason"}, | ||
| ) | ||
| ) | ||
|
|
||
| func init() { | ||
| prometheus.MustRegister(MachineCollectorUp) | ||
| metrics.Registry.MustRegister( | ||
| failedInstanceCreateCount, | ||
| failedInstanceUpdateCount, | ||
| failedInstanceDeleteCount, | ||
| ) | ||
|
Comment on lines
67
to
+72
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We should be registering these all to the same registry, does the
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It does show up, but switching to a single registry makes sense to me. Only it is completely out of scope for https://issues.redhat.com/browse/OCPCLOUD-784 This one has to be splitted to include refactoring for existing code, as well as providers integration.
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Opened https://issues.redhat.com/browse/OCPCLOUD-890 for that matter, and included https://issues.redhat.com/browse/OCPCLOUD-891 too. |
||
| } | ||
|
|
||
| // MachineCollector is implementing prometheus.Collector interface. | ||
|
|
@@ -46,6 +79,13 @@ type MachineCollector struct { | |
| namespace string | ||
| } | ||
|
|
||
| // MachineLabels is the group of labels that are applied to the machine metrics | ||
| type MachineLabels struct { | ||
| Name string | ||
| Namespace string | ||
| Reason string | ||
| } | ||
|
|
||
| func NewMachineCollector(machineInformer machineinformers.MachineInformer, machinesetInformer machineinformers.MachineSetInformer, namespace string) *MachineCollector { | ||
| return &MachineCollector{ | ||
| machineLister: machineInformer.Lister(), | ||
|
|
@@ -95,7 +135,7 @@ func (mc MachineCollector) collectMachineMetrics(ch chan<- prometheus.Metric) { | |
| } | ||
|
|
||
| ch <- prometheus.MustNewConstMetric(MachineCountDesc, prometheus.GaugeValue, float64(len(machineList))) | ||
| glog.V(4).Infof("collectmachineMetrics exit") | ||
| klog.V(4).Infof("collectmachineMetrics exit") | ||
| } | ||
|
|
||
| func stringPointerDeref(stringPointer *string) string { | ||
|
|
@@ -151,3 +191,27 @@ func (mc MachineCollector) listMachines() ([]*mapiv1beta1.Machine, error) { | |
| func (mc MachineCollector) listMachineSets() ([]*mapiv1beta1.MachineSet, error) { | ||
| return mc.machineSetLister.MachineSets(mc.namespace).List(labels.Everything()) | ||
| } | ||
|
|
||
| func RegisterFailedInstanceCreate(labels *MachineLabels) { | ||
| failedInstanceCreateCount.With(prometheus.Labels{ | ||
| "name": labels.Name, | ||
| "namespace": labels.Namespace, | ||
| "reason": labels.Reason, | ||
| }).Inc() | ||
| } | ||
|
|
||
| func RegisterFailedInstanceUpdate(labels *MachineLabels) { | ||
| failedInstanceCreateCount.With(prometheus.Labels{ | ||
| "name": labels.Name, | ||
| "namespace": labels.Namespace, | ||
| "reason": labels.Reason, | ||
| }).Inc() | ||
| } | ||
|
|
||
| func RegisterFailedInstanceDelete(labels *MachineLabels) { | ||
| failedInstanceDeleteCount.With(prometheus.Labels{ | ||
| "name": labels.Name, | ||
| "namespace": labels.Namespace, | ||
| "reason": labels.Reason, | ||
| }).Inc() | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
where is the serviceMonitor that goes with this?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It can’t be merged until provider integration PRs get into repos, or the CI would fail as the machine metrics won’t be accessible. The serviceMonitor i therefore in #609
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The provider PRs are
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
can you elaborate? can you point me to the origin test that will fail?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Have a look into ´Alerts shouldn't report any alerts in firing state...´ test in every provider: sample https://deck-ci.apps.ci.l2s4.p1.openshiftapps.com/view/gcs/origin-ci-test/pr-logs/pull/openshift_machine-api-operator/590/pull-ci-openshift-machine-api-operator-master-e2e-aws/2496
https://coreos.slack.com/archives/GE2HQ9QP4/p1591280443332000?thread_ts=1591279283.331700&channel=GE2HQ9QP4&message_ts=1591280443.332000
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
why would this fire an alert?
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It's not the service which is triggering it, but the serviceMonitor from the second PR #609 I included the merge order in Jira issue, posted it in slack too. While provider PRs are not merged, the metrics port is not served from the code, and while Prometheus is trying to connect, it causes machine metrics respond with 502. That results in alert in openshift-monitoring namespace. Joel and me already discussed the issue in slack, and decided to split the PR, instead of revendoring MAO PR branch in every provider.
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I thought prometheus was no triggering alert for that scenario. This would need to account for openstack and baremetal.
There's nothing stopping us from including the serviceMonitor for machineset and mhc in this PR and watching working right?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I’ve opened issues in both repos to make sure this won’t be left unanswered.
including machineset and mhc in the metrics should not be disruptive.