1 change: 1 addition & 0 deletions pkg/operator/starter.go
@@ -267,6 +267,7 @@ func RunOperator(ctx context.Context, controllerContext *controllercmd.Controlle
 		kubeInformersForNamespaces,
 		kubeClient,
 		startupmonitorreadiness.IsStartupMonitorEnabledFunction(configInformers.Config().V1().Infrastructures().Lister(), operatorClient),
+		notOnSingleReplicaTopology,
 		controllerContext.EventRecorder,
 	)

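The new argument wires a `notOnSingleReplicaTopology` predicate into the target config controller. The PR excerpt does not show how that predicate is built; a minimal sketch, assuming it is derived from the cluster Infrastructure status (the same informer already feeding the startup-monitor check), could look like this. The helper name and the error fallback are assumptions, not code from the PR:

```go
package operator

import (
	configv1 "github.com/openshift/api/config/v1"
	configlistersv1 "github.com/openshift/client-go/config/listers/config/v1"
)

// newNotOnSingleReplicaTopologyFn is a hypothetical constructor: it returns a
// predicate reporting whether the control plane is NOT a single-replica (SNO)
// topology, based on the "cluster" Infrastructure object.
func newNotOnSingleReplicaTopologyFn(infraLister configlistersv1.InfrastructureLister) func() bool {
	return func() bool {
		infra, err := infraLister.Get("cluster")
		if err != nil {
			// Assumption: if the topology cannot be determined, behave as HA
			// and keep the stricter three-endpoint requirement.
			return true
		}
		return infra.Status.ControlPlaneTopology != configv1.SingleReplicaTopologyMode
	}
}
```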
32 changes: 23 additions & 9 deletions pkg/operator/targetconfigcontroller/targetconfigcontroller.go
@@ -48,7 +48,8 @@ type TargetConfigController struct {
 	kubeClient      kubernetes.Interface
 	configMapLister corev1listers.ConfigMapLister
 
-	isStartupMonitorEnabledFn func() (bool, error)
+	isStartupMonitorEnabledFn    func() (bool, error)
+	notOnSingleReplicaTopologyFn func() bool
 }
 
 func NewTargetConfigController(
@@ -58,15 +59,17 @@ func NewTargetConfigController(
 	kubeInformersForNamespaces v1helpers.KubeInformersForNamespaces,
 	kubeClient kubernetes.Interface,
 	isStartupMonitorEnabledFn func() (bool, error),
+	notOnSingleReplicaTopologyFn func() bool,
 	eventRecorder events.Recorder,
 ) factory.Controller {
 	c := &TargetConfigController{
-		targetImagePullSpec:       targetImagePullSpec,
-		operatorImagePullSpec:     operatorImagePullSpec,
-		operatorClient:            operatorClient,
-		kubeClient:                kubeClient,
-		configMapLister:           kubeInformersForNamespaces.ConfigMapLister(),
-		isStartupMonitorEnabledFn: isStartupMonitorEnabledFn,
+		targetImagePullSpec:          targetImagePullSpec,
+		operatorImagePullSpec:        operatorImagePullSpec,
+		operatorClient:               operatorClient,
+		kubeClient:                   kubeClient,
+		configMapLister:              kubeInformersForNamespaces.ConfigMapLister(),
+		isStartupMonitorEnabledFn:    isStartupMonitorEnabledFn,
+		notOnSingleReplicaTopologyFn: notOnSingleReplicaTopologyFn,
 	}
 
 	return factory.New().WithInformers(
@@ -100,7 +103,8 @@ func (c TargetConfigController) sync(ctx context.Context, syncContext factory.Sy
 	}
 
 	// block until config is observed and specific paths are present
-	if err := isRequiredConfigPresent(operatorSpec.ObservedConfig.Raw); err != nil {
+	isNotOnSingleReplicaTopology := c.notOnSingleReplicaTopologyFn()
+	if err := isRequiredConfigPresent(operatorSpec.ObservedConfig.Raw, isNotOnSingleReplicaTopology); err != nil {
 		syncContext.Recorder().Warning("ConfigMissing", err.Error())
 		return err
 	}
@@ -116,7 +120,7 @@ func (c TargetConfigController) sync(ctx context.Context, syncContext factory.Sy
 	return nil
 }
 
-func isRequiredConfigPresent(config []byte) error {
+func isRequiredConfigPresent(config []byte, isNotSingleNode bool) error {
 	if len(config) == 0 {
 		return fmt.Errorf("no observedConfig")
 	}
@@ -148,6 +152,16 @@ func isRequiredConfigPresent(config []byte) error {
 		if configValString, ok := configVal.(string); ok && len(configValString) == 0 {
 			return fmt.Errorf("%v empty in config", strings.Join(requiredPath, "."))
 		}
+
+		if len(requiredPath) == 2 && requiredPath[0] == "apiServerArguments" && requiredPath[1] == "etcd-servers" && isNotSingleNode {
+			configValSlice, ok := configVal.([]interface{})
+			if !ok {
+				return fmt.Errorf("%v is not a slice", strings.Join(requiredPath, "."))
+			}
+			if len(configValSlice) < 3 {
+				return fmt.Errorf("%v has less than three endpoints: %v", strings.Join(requiredPath, "."), configValSlice)
+			}
+		}
 	}
 	return nil
 }
Inline review thread on the `len(configValSlice) < 3` check:

**Member:** This needs to be gated for SNO, as I assume it would only get 2 endpoints. Also, shouldn't we aim for 4 endpoints, since we include localhost?

**Contributor (author):** Yes, adding an SNO gate too.

> shouldn't we aim for 4 endpoints since we include localhost?

Three endpoints - localhost, the local IP, and the etcd IP on any other master - should be sufficient to prevent a 10-minute hangup. We may never have all 4 endpoints on HA with the assisted installer: one of the masters is used as the bootstrap node, so we shouldn't rely on all 3 masters being available during bootstrap.

**Contributor:** I haven't read the code, but I'm wondering whether we only want to stop new revisions during installation. If not, what happens when a customer takes a master node out of the pool for service? Does that mean the kas-o will stop installing new revisions? If so, that's bad, right?

**Contributor:** Also, there was recently a change to the RevisionController (library-go) that permits passing a precondition function. New revisions are allowed by the controller once the precondition is fulfilled. Maybe you could use it.

**Contributor (author):**

> If not, what happens when a customer takes a master node out of the pool for service?

Multiple scenarios here:

  • The node may be physically down for maintenance, but it would still be listed as an endpoint.

  • Also, while we require three endpoints, that is in fact only two distinct etcd servers (the local IP and localhost are considered a single etcd server but count as two distinct endpoints). So this code won't prevent one master from being torn down during CPMS scale-up/scale-down.

  • However, there is an etcd restore procedure in which the cluster is reduced to a single node to restore from backup and is later scaled back up to 3. We specifically don't want to roll out new revisions until all three masters are available, to ensure all three etcd servers are in sync.

TODO: run e2e with CPMS scale-up / etcd disruption to ensure this works as expected.

**Contributor (author):**

> New revisions will be allowed by the controller once the precondition fulfils

WithRevisionControllerPrecondition might be an alternative to this solution, yes.

**Contributor:**

> WithRevisionControllerPrecondition might be an alternative to this solution, yes

Not a blocking comment; I just wanted to let you know the mechanism exists so that you may consider using it. So, when an etcd endpoint is removed from the list?

**Contributor:** Also, should we add a similar mechanism for the aggregated API servers?

**Contributor (author):**

> just wanted to let you know the mechanism exist so that you may consider using it

Right, though I think the current approach is better suited to this situation: we already have a config validation function, so it's only natural to extend it. This way we prevent the config from being generated, not merely rolled out. However, if this method doesn't work in some scenarios, WithRevisionControllerPrecondition would be a good candidate to try.

> should we add a similar mechanism for the aggregated API servers

It's not necessary, IMO; the issue happens early during bootstrap, when openshift-apiserver hasn't even been created yet. If kube-apiserver can reliably write to etcd, we'll notice a slow etcd rollout via the "static pod didn't roll out in 3 mins" test, and a similar solution can be applied to the openshift-apiservers.
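As an aside, the `WithRevisionControllerPrecondition` mechanism raised in the thread could plausibly reuse the same validation. The sketch below is hypothetical, not code from the PR: the helper name is invented, and the `func(ctx) (bool, error)` precondition shape is an assumption about library-go's revision controller:

```go
// etcdEndpointsPrecondition is a hypothetical precondition: it holds back new
// revisions until the observed config passes the same validation used by the
// sync loop above, instead of failing the sync with an error.
func etcdEndpointsPrecondition(operatorClient v1helpers.StaticPodOperatorClient, notOnSingleReplicaTopologyFn func() bool) func(ctx context.Context) (bool, error) {
	return func(ctx context.Context) (bool, error) {
		operatorSpec, _, _, err := operatorClient.GetStaticPodOperatorState()
		if err != nil {
			return false, err
		}
		if err := isRequiredConfigPresent(operatorSpec.ObservedConfig.Raw, notOnSingleReplicaTopologyFn()); err != nil {
			// Precondition not fulfilled yet; the revision controller retries.
			return false, nil
		}
		return true, nil
	}
}
```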
101 changes: 100 additions & 1 deletion pkg/operator/targetconfigcontroller/targetconfigcontroller_test.go
@@ -1,6 +1,7 @@
 package targetconfigcontroller
 
 import (
+	"fmt"
 	"strings"
 	"testing"
 
@@ -110,7 +111,7 @@ func TestIsRequiredConfigPresent(t *testing.T) {

 	for _, test := range tests {
 		t.Run(test.name, func(t *testing.T) {
-			actual := isRequiredConfigPresent([]byte(test.config))
+			actual := isRequiredConfigPresent([]byte(test.config), false)
 			switch {
 			case actual == nil && len(test.expectedError) == 0:
 			case actual == nil && len(test.expectedError) != 0:
@@ -237,3 +238,101 @@ func TestManageTemplate(t *testing.T) {
 		})
 	}
 }
+
+func TestIsRequiredConfigPresentEtcdEndpoints(t *testing.T) {
+	configTemplate := `{
+		"servingInfo": {
+			"namedCertificates": [
+				{
+					"certFile": "/etc/kubernetes/static-pod-certs/secrets/localhost-serving-cert-certkey/tls.crt",
+					"keyFile": "/etc/kubernetes/static-pod-certs/secrets/localhost-serving-cert-certkey/tls.key"
+				}
+			]
+		},
+		"admission": {"pluginConfig": { "network.openshift.io/RestrictedEndpointsAdmission": {}}},
+		"apiServerArguments": {
+			"etcd-servers": %s
+		}
+	}
+	`
+	tests := []struct {
+		name            string
+		etcdServers     string
+		expectedError   string
+		isNotSingleNode bool
+	}{
+		{
+			name:          "nil-storage-urls",
+			etcdServers:   "null",
+			expectedError: "apiServerArguments.etcd-servers null in config",
+		},
+		{
+			name:          "missing-storage-urls",
+			etcdServers:   "[]",
+			expectedError: "apiServerArguments.etcd-servers empty in config",
+		},
+		{
+			name:          "empty-string-storage-urls",
+			etcdServers:   `""`,
+			expectedError: "apiServerArguments.etcd-servers empty in config",
+		},
+		{
+			name:            "one-etcd-server",
+			etcdServers:     `[ "val" ]`,
+			isNotSingleNode: true,
+			expectedError:   "apiServerArguments.etcd-servers has less than three endpoints",
+		},
+		{
+			name:            "one-etcd-server-sno",
+			etcdServers:     `[ "val" ]`,
+			isNotSingleNode: false,
+		},
+		{
+			name:            "two-etcd-servers",
+			etcdServers:     `[ "val1", "val2" ]`,
+			isNotSingleNode: true,
+			expectedError:   "apiServerArguments.etcd-servers has less than three endpoints",
+		},
+		{
+			name:            "two-etcd-servers-sno",
+			etcdServers:     `[ "val1", "val2" ]`,
+			isNotSingleNode: false,
+		},
+		{
+			name:            "three-etcd-servers",
+			etcdServers:     `[ "val1", "val2", "val3" ]`,
+			isNotSingleNode: true,
+		},
+		{
+			name:            "three-etcd-servers-sno",
+			etcdServers:     `[ "val1", "val2", "val3" ]`,
+			isNotSingleNode: false,
+		},
+		{
+			name:            "four-etcd-servers",
+			etcdServers:     `[ "val1", "val2", "val3", "val4" ]`,
+			isNotSingleNode: true,
+		},
+		{
+			name:            "four-etcd-servers-sno",
+			etcdServers:     `[ "val1", "val2", "val3", "val4" ]`,
+			isNotSingleNode: false,
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			config := fmt.Sprintf(configTemplate, test.etcdServers)
+			actual := isRequiredConfigPresent([]byte(config), test.isNotSingleNode)
+			switch {
+			case actual == nil && len(test.expectedError) == 0:
+			case actual == nil && len(test.expectedError) != 0:
+				t.Fatal(actual)
+			case actual != nil && len(test.expectedError) == 0:
+				t.Fatal(actual)
+			case actual != nil && len(test.expectedError) != 0 && !strings.Contains(actual.Error(), test.expectedError):
+				t.Fatal(actual)
+			}
+		})
+	}
+}
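For illustration only (not part of the PR), the endpoint shape described in the review thread - localhost plus the local node IP counting as one etcd server, and a third endpoint on another master - would pass the HA check. The URLs below are invented, and the config mirrors the template used by the tests above:

```go
// Hypothetical extra case; endpoint URLs are invented for illustration.
func TestBootstrapEndpointShape(t *testing.T) {
	config := `{
		"servingInfo": {"namedCertificates": [{"certFile": "c", "keyFile": "k"}]},
		"admission": {"pluginConfig": {"network.openshift.io/RestrictedEndpointsAdmission": {}}},
		"apiServerArguments": {
			"etcd-servers": ["https://localhost:2379", "https://10.0.0.5:2379", "https://10.0.0.6:2379"]
		}
	}`
	// Three endpoints on a non-single-node topology: the check should pass.
	if err := isRequiredConfigPresent([]byte(config), true); err != nil {
		t.Fatalf("expected three endpoints to satisfy the HA check: %v", err)
	}
}
```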