Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions cmd/machine-healthcheck/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import (
"runtime"

"github.com/openshift/machine-api-operator/pkg/controller/machinehealthcheck"
"github.com/openshift/machine-api-operator/pkg/metrics"

"github.com/golang/glog"
mapiv1 "github.com/openshift/machine-api-operator/pkg/apis/machine/v1beta1"
Expand All @@ -24,6 +25,7 @@ func printVersion() {

func main() {
watchNamespace := flag.String("namespace", "", "Namespace that the controller watches to reconcile machine-api objects. If unspecified, the controller watches for machine-api objects across all namespaces.")
metricsAddress := flag.String("metrics-bind-address", metrics.DefaultHealthCheckMetricsAddress, "Address for hosting metrics")
flag.Parse()
printVersion()

Expand All @@ -34,8 +36,7 @@ func main() {
}

opts := manager.Options{
// Disable metrics serving
MetricsBindAddress: "0",
MetricsBindAddress: *metricsAddress,
}
if *watchNamespace != "" {
opts.Namespace = *watchNamespace
Expand Down
5 changes: 3 additions & 2 deletions cmd/machineset/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ import (
"github.com/openshift/machine-api-operator/pkg/apis/machine/v1beta1"
"github.com/openshift/machine-api-operator/pkg/controller"
"github.com/openshift/machine-api-operator/pkg/controller/machineset"
"github.com/openshift/machine-api-operator/pkg/metrics"
_ "k8s.io/client-go/plugin/pkg/client/auth/gcp"
"k8s.io/klog"
"sigs.k8s.io/controller-runtime/pkg/client/config"
Expand All @@ -42,6 +43,7 @@ func main() {
klog.InitFlags(nil)
watchNamespace := flag.String("namespace", "",
"Namespace that the controller watches to reconcile cluster-api objects. If unspecified, the controller watches for cluster-api objects across all namespaces.")
metricsAddress := flag.String("metrics-bind-address", metrics.DefaultMachineSetMetricsAddress, "Address for hosting metrics")

webhookEnabled := flag.Bool("webhook-enabled", true,
"Webhook server, enabled by default. When enabled, the manager will run a webhook server.")
Expand All @@ -67,8 +69,7 @@ func main() {
// Create a new Cmd to provide shared dependencies and start components
syncPeriod := 10 * time.Minute
opts := manager.Options{
// Disable metrics serving
MetricsBindAddress: "0",
MetricsBindAddress: *metricsAddress,
SyncPeriod: &syncPeriod,
Namespace: *watchNamespace,
}
Expand Down
8 changes: 3 additions & 5 deletions cmd/vsphere/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,13 @@ import (
"flag"
"fmt"
"os"
"time"

configv1 "github.com/openshift/api/config/v1"
"github.com/openshift/machine-api-operator/pkg/apis/machine/v1beta1"
vsphereapis "github.com/openshift/machine-api-operator/pkg/apis/vsphereprovider"
capimachine "github.com/openshift/machine-api-operator/pkg/controller/machine"
machine "github.com/openshift/machine-api-operator/pkg/controller/vsphere"
"github.com/openshift/machine-api-operator/pkg/metrics"
"github.com/openshift/machine-api-operator/pkg/version"
"k8s.io/klog"
"sigs.k8s.io/controller-runtime/pkg/client/config"
Expand All @@ -24,6 +24,7 @@ func main() {

klog.InitFlags(nil)
watchNamespace := flag.String("namespace", "", "Namespace that the controller watches to reconcile machine-api objects. If unspecified, the controller watches for machine-api objects across all namespaces.")
metricsAddress := flag.String("metrics-bind-address", metrics.DefaultMachineMetricsAddress, "Address for hosting metrics")
flag.Set("logtostderr", "true")
flag.Parse()

Expand All @@ -33,12 +34,9 @@ func main() {
}

cfg := config.GetConfigOrDie()
syncPeriod := 10 * time.Minute

opts := manager.Options{
// Disable metrics serving
MetricsBindAddress: "0",
SyncPeriod: &syncPeriod,
MetricsBindAddress: *metricsAddress,
}
if *watchNamespace != "" {
opts.Namespace = *watchNamespace
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ data:
images.json: >
{
"machineAPIOperator": "registry.svc.ci.openshift.org/openshift:machine-api-operator",
"kubeRBACProxy": "registry.svc.ci.openshift.org/openshift:kube-rbac-proxy",
"clusterAPIControllerAWS": "registry.svc.ci.openshift.org/openshift:aws-machine-controllers",
"clusterAPIControllerOpenStack": "registry.svc.ci.openshift.org/openshift:openstack-machine-controllers",
"clusterAPIControllerLibvirt": "registry.svc.ci.openshift.org/openshift:libvirt-machine-controllers",
Expand Down
14 changes: 14 additions & 0 deletions install/0000_30_machine-api-operator_09_rbac.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,20 @@ rules:
verbs:
- create

- apiGroups:
- authentication.k8s.io
resources:
- tokenreviews
verbs:
- create

- apiGroups:
- authorization.k8s.io
resources:
- subjectaccessreviews
verbs:
- create

# TODO(vikasc): Remove extensions/daemonsets permissions once all controllers have bumped kubernetes-drain
- apiGroups:
- extensions
Expand Down
26 changes: 26 additions & 0 deletions install/0000_30_machine-api-operator_10_service.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -18,3 +18,29 @@ spec:
selector:
k8s-app: machine-api-operator
sessionAffinity: None
---
apiVersion: v1
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

where is the serviceMonitor that goes with this?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It can’t be merged until the provider integration PRs get into their repos, or the CI would fail because the machine metrics won’t be accessible. The serviceMonitor is therefore in #609.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

or the CI would fail as the machine metrics won’t be accessible

can you elaborate? can you point me to the origin test that will fail?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why would this fire an alert?

Copy link
Author

@Danil-Grigorev Danil-Grigorev Jun 5, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's not the service that is triggering it, but the serviceMonitor from the second PR, #609. I included the merge order in the Jira issue and posted it in Slack too. While the provider PRs are not merged, the metrics port is not served from the code, so when Prometheus tries to connect, the machine metrics endpoint responds with 502. That results in an alert in the openshift-monitoring namespace. Joel and I already discussed the issue in Slack and decided to split the PR instead of revendoring the MAO PR branch in every provider.

Copy link
Member

@enxebre enxebre Jun 5, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I thought Prometheus was not triggering an alert for that scenario. This would need to account for openstack and baremetal.
There's nothing stopping us from including the serviceMonitor for machineset and mhc in this PR and watching it work, right?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I’ve opened issues in both repos to make sure this won’t be left unanswered.

including machineset and mhc in the metrics should not be disruptive.

kind: Service
metadata:
name: machine-api-controllers
namespace: openshift-machine-api
annotations:
service.alpha.openshift.io/serving-cert-secret-name: machine-api-controllers-tls
exclude.release.openshift.io/internal-openshift-hosted: "true"
labels:
k8s-app: controller
spec:
type: ClusterIP
ports:
- name: machine-mtrc
targetPort: machine-mtrc
port: 8441
- name: machineset-mtrc
targetPort: machineset-mtrc
port: 8442
- name: mhc-mtrc
targetPort: mhc-mtrc
port: 8444
selector:
k8s-app: controller
sessionAffinity: None
50 changes: 49 additions & 1 deletion pkg/controller/vsphere/reconciler.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ import (
vspherev1 "github.com/openshift/machine-api-operator/pkg/apis/vsphereprovider/v1beta1"
machinecontroller "github.com/openshift/machine-api-operator/pkg/controller/machine"
"github.com/openshift/machine-api-operator/pkg/controller/vsphere/session"
"github.com/openshift/machine-api-operator/pkg/metrics"
"github.com/vmware/govmomi/find"
"github.com/vmware/govmomi/object"
"github.com/vmware/govmomi/property"
Expand Down Expand Up @@ -67,6 +68,13 @@ func (r *Reconciler) create() error {
return err
}
}
if moTask.Info.State == types.TaskInfoStateError {
metrics.RegisterFailedInstanceCreate(&metrics.MachineLabels{
Name: r.machine.Name,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

how are this metrics actually being exposed through this controller metrics server?
wouldn't this need to metrics.Registry.MustRegister(failedInstanceCreateCount) or anything?
https://github.com/kubernetes-sigs/controller-runtime/blob/c0438568a706ec61de31b92f4d76e7fb7e1007b9/pkg/internal/controller/metrics/metrics.go#L50

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Namespace: r.machine.Namespace,
Reason: fmt.Sprintf("Create machine task finished with error: %+v", moTask.Info.Error),
})
}
if taskIsFinished, err := taskIsFinished(moTask); err != nil || !taskIsFinished {
if !taskIsFinished {
return fmt.Errorf("task %v has not finished", moTask.Reference().Value)
Expand Down Expand Up @@ -113,6 +121,13 @@ func (r *Reconciler) update() error {
return err
}
}
if motask.Info.State == types.TaskInfoStateError {
metrics.RegisterFailedInstanceCreate(&metrics.MachineLabels{
Name: r.machine.Name,
Namespace: r.machine.Namespace,
Reason: fmt.Sprintf("Create machine task finished with error: %+v", motask.Info.Error),
})
}
if taskIsFinished, err := taskIsFinished(motask); err != nil || !taskIsFinished {
if !taskIsFinished {
return fmt.Errorf("task %v has not finished", motask.Reference().Value)
Expand Down Expand Up @@ -167,6 +182,13 @@ func (r *Reconciler) delete() error {
return err
}
}
if moTask.Info.State == types.TaskInfoStateError {
metrics.RegisterFailedInstanceCreate(&metrics.MachineLabels{
Name: r.machine.Name,
Namespace: r.machine.Namespace,
Reason: fmt.Sprintf("Create machine task finished with error: %+v", moTask.Info.Error),
})
}
if taskIsFinished, err := taskIsFinished(moTask); err != nil || !taskIsFinished {
if !taskIsFinished {
return fmt.Errorf("task %v has not finished", moTask.Reference().Value)
Expand All @@ -178,6 +200,11 @@ func (r *Reconciler) delete() error {
vmRef, err := findVM(r.machineScope)
if err != nil {
if !isNotFound(err) {
metrics.RegisterFailedInstanceDelete(&metrics.MachineLabels{
Name: r.machine.Name,
Namespace: r.machine.Namespace,
Reason: err.Error(),
})
return err
}
klog.Infof("%v: vm does not exist", r.machine.GetName())
Expand All @@ -196,6 +223,11 @@ func (r *Reconciler) delete() error {

task, err := vm.Obj.Destroy(r.Context)
if err != nil {
metrics.RegisterFailedInstanceDelete(&metrics.MachineLabels{
Name: r.machine.Name,
Namespace: r.machine.Namespace,
Reason: err.Error(),
})
return fmt.Errorf("%v: failed to destroy vm: %v", r.machine.GetName(), err)
}

Expand Down Expand Up @@ -258,6 +290,11 @@ func (r *Reconciler) reconcileRegionAndZoneLabels(vm *virtualMachine) error {
})

if err != nil {
metrics.RegisterFailedInstanceUpdate(&metrics.MachineLabels{
Name: r.machine.Name,
Namespace: r.machine.Namespace,
Reason: err.Error(),
})
return err
}

Expand Down Expand Up @@ -550,7 +587,13 @@ func clone(s *machineScope) (string, error) {

task, err := vmTemplate.Clone(s, folder, s.machine.GetName(), spec)
if err != nil {
return "", fmt.Errorf("error triggering clone op for machine %v: %w", s, err)
err = fmt.Errorf("error triggering clone op for machine %v: %w", s, err)
metrics.RegisterFailedInstanceCreate(&metrics.MachineLabels{
Name: s.machine.Name,
Namespace: s.machine.Namespace,
Reason: err.Error(),
})
return "", err
}

klog.V(3).Infof("%v: running task: %+v", s.machine.GetName(), s.providerStatus.TaskRef)
Expand Down Expand Up @@ -813,6 +856,11 @@ func (vm *virtualMachine) reconcileTags(ctx context.Context, session *session.Se
klog.Infof("%v: Attaching %s tag to vm", machine.GetName(), clusterID)
// the tag should already be created by installer
if err := m.AttachTag(ctx, clusterID, vm.Ref); err != nil {
metrics.RegisterFailedInstanceUpdate(&metrics.MachineLabels{
Name: machine.Name,
Namespace: machine.Namespace,
Reason: err.Error(),
})
return err
}
}
Expand Down
68 changes: 66 additions & 2 deletions pkg/metrics/metrics.go
Original file line number Diff line number Diff line change
@@ -1,12 +1,19 @@
package metrics

import (
"github.com/golang/glog"
mapiv1beta1 "github.com/openshift/machine-api-operator/pkg/apis/machine/v1beta1"
machineinformers "github.com/openshift/machine-api-operator/pkg/generated/informers/externalversions/machine/v1beta1"
machinelisters "github.com/openshift/machine-api-operator/pkg/generated/listers/machine/v1beta1"
"github.com/prometheus/client_golang/prometheus"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/klog"
"sigs.k8s.io/controller-runtime/pkg/metrics"
)

// Default bind addresses for the metrics endpoint of each controller binary.
// Each value is the default of that binary's "metrics-bind-address" flag.
const (
// DefaultHealthCheckMetricsAddress is the default metrics bind address of
// the machine-healthcheck controller.
DefaultHealthCheckMetricsAddress = ":8083"
// DefaultMachineSetMetricsAddress is the default metrics bind address of
// the machineset controller.
DefaultMachineSetMetricsAddress = ":8082"
// DefaultMachineMetricsAddress is the default metrics bind address of the
// machine (provider) controller.
DefaultMachineMetricsAddress = ":8081"
)

var (
Expand All @@ -33,10 +40,36 @@ var (
Name: "mapi_mao_collector_up",
Help: "Machine API Operator metrics are being collected and reported successfully",
}, []string{"kind"})

failedInstanceCreateCount = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "mapi_instance_create_failed",
Help: "Number of times provider instance create has failed.",
}, []string{"name", "namespace", "reason"},
)

failedInstanceUpdateCount = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "mapi_instance_update_failed",
Help: "Number of times provider instance update has failed.",
}, []string{"name", "namespace", "reason"},
)

failedInstanceDeleteCount = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "mapi_instance_delete_failed",
Help: "Number of times provider instance delete has failed.",
}, []string{"name", "namespace", "reason"},
)
)

func init() {
prometheus.MustRegister(MachineCollectorUp)
metrics.Registry.MustRegister(
failedInstanceCreateCount,
failedInstanceUpdateCount,
failedInstanceDeleteCount,
)
Comment on lines 67 to +72
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should be registering these all to the same registry, does the MachineCollectorUp metric show up in your tests?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It does show up, but switching to a single registry makes sense to me. However, it is completely out of scope for https://issues.redhat.com/browse/OCPCLOUD-784. That issue has to be split to include refactoring of the existing code as well as the providers integration.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

}

// MachineCollector is implementing prometheus.Collector interface.
Expand All @@ -46,6 +79,13 @@ type MachineCollector struct {
namespace string
}

// MachineLabels is the group of labels that are applied to the machine metrics.
// The RegisterFailedInstance* helpers copy these fields into the "name",
// "namespace" and "reason" labels of the failed-instance counters.
type MachineLabels struct {
// Name is used as the value of the "name" label.
Name string
// Namespace is used as the value of the "namespace" label.
Namespace string
// Reason is used as the value of the "reason" label.
Reason string
}

func NewMachineCollector(machineInformer machineinformers.MachineInformer, machinesetInformer machineinformers.MachineSetInformer, namespace string) *MachineCollector {
return &MachineCollector{
machineLister: machineInformer.Lister(),
Expand Down Expand Up @@ -95,7 +135,7 @@ func (mc MachineCollector) collectMachineMetrics(ch chan<- prometheus.Metric) {
}

ch <- prometheus.MustNewConstMetric(MachineCountDesc, prometheus.GaugeValue, float64(len(machineList)))
glog.V(4).Infof("collectmachineMetrics exit")
klog.V(4).Infof("collectmachineMetrics exit")
}

func stringPointerDeref(stringPointer *string) string {
Expand Down Expand Up @@ -151,3 +191,27 @@ func (mc MachineCollector) listMachines() ([]*mapiv1beta1.Machine, error) {
// listMachineSets queries the MachineSet lister, scoped to mc.namespace, with
// a selector that matches everything, and returns the lister's result as-is.
func (mc MachineCollector) listMachineSets() ([]*mapiv1beta1.MachineSet, error) {
	matchAll := labels.Everything()
	namespaced := mc.machineSetLister.MachineSets(mc.namespace)
	return namespaced.List(matchAll)
}

// RegisterFailedInstanceCreate increments the mapi_instance_create_failed
// counter for the series selected by the given machine labels.
//
// labels must be non-nil; its Name, Namespace and Reason fields supply the
// "name", "namespace" and "reason" label values respectively.
func RegisterFailedInstanceCreate(labels *MachineLabels) {
	series := prometheus.Labels{
		"name":      labels.Name,
		"namespace": labels.Namespace,
		"reason":    labels.Reason,
	}
	failedInstanceCreateCount.With(series).Inc()
}

// RegisterFailedInstanceUpdate increments the mapi_instance_update_failed
// counter for the series selected by the given machine labels.
//
// labels must be non-nil; its Name, Namespace and Reason fields supply the
// "name", "namespace" and "reason" label values respectively.
func RegisterFailedInstanceUpdate(labels *MachineLabels) {
	// Update failures must be recorded on the update counter. Incrementing
	// failedInstanceCreateCount here would report them as create failures
	// and leave mapi_instance_update_failed permanently at zero.
	failedInstanceUpdateCount.With(prometheus.Labels{
		"name":      labels.Name,
		"namespace": labels.Namespace,
		"reason":    labels.Reason,
	}).Inc()
}

// RegisterFailedInstanceDelete increments the mapi_instance_delete_failed
// counter for the series selected by the given machine labels.
//
// labels must be non-nil; its Name, Namespace and Reason fields supply the
// "name", "namespace" and "reason" label values respectively.
func RegisterFailedInstanceDelete(labels *MachineLabels) {
	series := prometheus.Labels{
		"name":      labels.Name,
		"namespace": labels.Namespace,
		"reason":    labels.Reason,
	}
	failedInstanceDeleteCount.With(series).Inc()
}
1 change: 1 addition & 0 deletions pkg/operator/baremetal_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ func newOperatorWithBaremetalConfig() *OperatorConfig {
"docker.io/openshift/origin-machine-api-operator:v4.0.0",
"docker.io/openshift/origin-machine-api-operator:v4.0.0",
"docker.io/openshift/origin-machine-api-operator:v4.0.0",
"docker.io/openshift/origin-kube-rbac-proxy:v4.0.0",
"docker.io/openshift/origin-aws-machine-controllers:v4.0.0",
},
BaremetalControllers{
Expand Down
Loading