-
Notifications
You must be signed in to change notification settings - Fork 214
OCPBUGS-3783: Update dnsPolicy to allow consistent resolution of the internal LB #920
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -37,14 +37,17 @@ type Client struct { | |
| // requests. If empty, the User-Agent header will not be | ||
| // populated. | ||
| userAgent string | ||
|
|
||
| conditionRegistry clusterconditions.ConditionRegistry | ||
| } | ||
|
|
||
| // NewClient creates a new Cincinnati client with the given client identifier. | ||
| func NewClient(id uuid.UUID, transport *http.Transport, userAgent string) Client { | ||
| func NewClient(id uuid.UUID, transport *http.Transport, userAgent string, conditionRegistry clusterconditions.ConditionRegistry) Client { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Just a note: this is a really good example why the global sucked. I would never expect an osus client code to depend on the condition registry data, this way it is at least explicit (although it's still fishy and seems wrong to me) |
||
| return Client{ | ||
| id: id, | ||
| transport: transport, | ||
| userAgent: userAgent, | ||
| id: id, | ||
| transport: transport, | ||
| userAgent: userAgent, | ||
| conditionRegistry: conditionRegistry, | ||
| } | ||
| } | ||
|
|
||
|
|
@@ -246,7 +249,7 @@ func (c Client) GetUpdates(ctx context.Context, uri *url.URL, desiredArch, curre | |
|
|
||
| for i := len(conditionalUpdates) - 1; i >= 0; i-- { | ||
| for j, risk := range conditionalUpdates[i].Risks { | ||
| conditionalUpdates[i].Risks[j].MatchingRules, err = clusterconditions.PruneInvalid(ctx, risk.MatchingRules) | ||
| conditionalUpdates[i].Risks[j].MatchingRules, err = c.conditionRegistry.PruneInvalid(ctx, risk.MatchingRules) | ||
| if len(conditionalUpdates[i].Risks[j].MatchingRules) == 0 { | ||
| klog.Warningf("Conditional update to %s, risk %q, has empty pruned matchingRules; dropping this target to avoid rejections when pushing to the Kubernetes API server. Pruning results: %s", conditionalUpdates[i].Release.Version, risk.Name, err) | ||
| conditionalUpdates = append(conditionalUpdates[:i], conditionalUpdates[i+1:]...) | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -10,11 +10,11 @@ import ( | |
| "reflect" | ||
| "testing" | ||
|
|
||
| "github.com/openshift/cluster-version-operator/pkg/clusterconditions/standard" | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can we come up with some better names here? Both |
||
|
|
||
| "github.com/blang/semver/v4" | ||
| "github.com/google/uuid" | ||
| configv1 "github.com/openshift/api/config/v1" | ||
| _ "github.com/openshift/cluster-version-operator/pkg/clusterconditions/always" | ||
| _ "github.com/openshift/cluster-version-operator/pkg/clusterconditions/promql" | ||
| _ "k8s.io/klog/v2" // integration tests set glog flags. | ||
| ) | ||
|
|
||
|
|
@@ -745,7 +745,7 @@ func TestGetUpdates(t *testing.T) { | |
| ts := httptest.NewServer(http.HandlerFunc(handler)) | ||
| defer ts.Close() | ||
|
|
||
| c := NewClient(clientID, nil, "") | ||
| c := NewClient(clientID, nil, "", standard.NewConditionRegistry(nil)) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: I'd prefer if the registry had a |
||
|
|
||
| uri, err := url.Parse(ts.URL) | ||
| if err != nil { | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -25,28 +25,51 @@ type Condition interface { | |
| Match(ctx context.Context, condition *configv1.ClusterCondition) (bool, error) | ||
| } | ||
|
|
||
| // Registry is a registry of implemented condition types. | ||
| var Registry map[string]Condition | ||
| type ConditionRegistry interface { | ||
| // Register registers a condition type, and panics on any name collisions. | ||
| Register(conditionType string, condition Condition) | ||
|
|
||
| // PruneInvalid returns a new slice with recognized, valid conditions. | ||
| // The error complains about any unrecognized or invalid conditions. | ||
| PruneInvalid(ctx context.Context, matchingRules []configv1.ClusterCondition) ([]configv1.ClusterCondition, error) | ||
|
|
||
| // Match returns whether the cluster matches the given rules (true), | ||
| // does not match (false), or the rules fail to evaluate (error). | ||
| Match(ctx context.Context, matchingRules []configv1.ClusterCondition) (bool, error) | ||
| } | ||
|
|
||
| type conditionRegistry struct { | ||
| // registry is a registry of implemented condition types. | ||
| registry map[string]Condition | ||
| } | ||
|
|
||
| func NewConditionRegistry() ConditionRegistry { | ||
| ret := &conditionRegistry{ | ||
| registry: map[string]Condition{}, | ||
| } | ||
|
|
||
| return ret | ||
| } | ||
|
|
||
| // Register registers a condition type, and panics on any name collisions. | ||
| func Register(conditionType string, condition Condition) { | ||
| if Registry == nil { | ||
| Registry = make(map[string]Condition, 1) | ||
| func (r *conditionRegistry) Register(conditionType string, condition Condition) { | ||
| if r.registry == nil { | ||
| r.registry = make(map[string]Condition, 1) | ||
| } | ||
| if existing, ok := Registry[conditionType]; ok && condition != existing { | ||
| if existing, ok := r.registry[conditionType]; ok && condition != existing { | ||
| panic(fmt.Sprintf("cluster condition %q already registered", conditionType)) | ||
| } | ||
| Registry[conditionType] = condition | ||
| r.registry[conditionType] = condition | ||
| } | ||
|
|
||
| // PruneInvalid returns a new slice with recognized, valid conditions. | ||
| // The error complains about any unrecognized or invalid conditions. | ||
| func PruneInvalid(ctx context.Context, matchingRules []configv1.ClusterCondition) ([]configv1.ClusterCondition, error) { | ||
| func (r *conditionRegistry) PruneInvalid(ctx context.Context, matchingRules []configv1.ClusterCondition) ([]configv1.ClusterCondition, error) { | ||
| var valid []configv1.ClusterCondition | ||
| var errs []error | ||
|
|
||
| for _, config := range matchingRules { | ||
| condition, ok := Registry[config.Type] | ||
| condition, ok := r.registry[config.Type] | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ok this is a code I dislike (not your fault, another artifact of the global), the actual registry is needed here only to validate types, and the remaining code IMO does not belong to a registry. I believe this code should live elsewhere, and the registry should only provide a I don't think this needs to be addressed in this PR, I'll probably want to clean this up as a followup |
||
| if !ok { | ||
| errs = append(errs, fmt.Errorf("Skipping unrecognized cluster condition type %q", config.Type)) | ||
| continue | ||
|
|
@@ -63,11 +86,11 @@ func PruneInvalid(ctx context.Context, matchingRules []configv1.ClusterCondition | |
|
|
||
| // Match returns whether the cluster matches the given rules (true), | ||
| // does not match (false), or the rules fail to evaluate (error). | ||
| func Match(ctx context.Context, matchingRules []configv1.ClusterCondition) (bool, error) { | ||
| func (r *conditionRegistry) Match(ctx context.Context, matchingRules []configv1.ClusterCondition) (bool, error) { | ||
| var errs []error | ||
|
|
||
| for _, config := range matchingRules { | ||
| condition, ok := Registry[config.Type] | ||
| condition, ok := r.registry[config.Type] | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ditto |
||
| if !ok { | ||
| klog.V(2).Infof("Skipping unrecognized cluster condition type %q", config.Type) | ||
| continue | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -8,10 +8,7 @@ import ( | |
| "testing" | ||
|
|
||
| configv1 "github.com/openshift/api/config/v1" | ||
|
|
||
| "github.com/openshift/cluster-version-operator/pkg/clusterconditions" | ||
| _ "github.com/openshift/cluster-version-operator/pkg/clusterconditions/always" | ||
| _ "github.com/openshift/cluster-version-operator/pkg/clusterconditions/promql" | ||
| "github.com/openshift/cluster-version-operator/pkg/clusterconditions/standard" | ||
| ) | ||
|
|
||
| // Error implements a cluster condition that always errors. | ||
|
|
@@ -33,6 +30,7 @@ func (e *Error) Match(ctx context.Context, condition *configv1.ClusterCondition) | |
|
|
||
| func TestPruneInvalid(t *testing.T) { | ||
| ctx := context.Background() | ||
| registry := standard.NewConditionRegistry(nil) | ||
|
|
||
| for _, testCase := range []struct { | ||
| name string | ||
|
|
@@ -100,7 +98,7 @@ func TestPruneInvalid(t *testing.T) { | |
| }, | ||
| } { | ||
| t.Run(testCase.name, func(t *testing.T) { | ||
| valid, err := clusterconditions.PruneInvalid(ctx, testCase.conditions) | ||
| valid, err := registry.PruneInvalid(ctx, testCase.conditions) | ||
| if !reflect.DeepEqual(valid, testCase.expectedValid) { | ||
| t.Errorf("got valid %v but expected %v", valid, testCase.expectedValid) | ||
| } | ||
|
|
@@ -117,7 +115,8 @@ func TestPruneInvalid(t *testing.T) { | |
|
|
||
| func TestMatch(t *testing.T) { | ||
| ctx := context.Background() | ||
| clusterconditions.Register("Error", &Error{}) | ||
| registry := standard.NewConditionRegistry(nil) | ||
| registry.Register("Error", &Error{}) | ||
|
|
||
| for _, testCase := range []struct { | ||
| name string | ||
|
|
@@ -181,7 +180,7 @@ func TestMatch(t *testing.T) { | |
| }, | ||
| } { | ||
| t.Run(testCase.name, func(t *testing.T) { | ||
| match, err := clusterconditions.Match(ctx, testCase.conditions) | ||
| match, err := registry.Match(ctx, testCase.conditions) | ||
| if match != testCase.expectedMatch { | ||
| t.Errorf("got match %t but expected %t", match, testCase.expectedMatch) | ||
| } | ||
|
|
@@ -194,6 +193,4 @@ func TestMatch(t *testing.T) { | |
| } | ||
| }) | ||
| } | ||
|
|
||
| delete(clusterconditions.Registry, "Error") | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ❤️ |
||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -7,23 +7,24 @@ import ( | |
| "context" | ||
| "errors" | ||
| "fmt" | ||
| "net" | ||
| "time" | ||
|
|
||
| configv1 "github.com/openshift/api/config/v1" | ||
| "github.com/prometheus/client_golang/api" | ||
| prometheusv1 "github.com/prometheus/client_golang/api/prometheus/v1" | ||
| "github.com/prometheus/common/config" | ||
| "github.com/prometheus/common/model" | ||
| metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" | ||
| "k8s.io/client-go/kubernetes" | ||
| "k8s.io/klog/v2" | ||
|
|
||
| "github.com/openshift/cluster-version-operator/pkg/clusterconditions" | ||
| "github.com/openshift/cluster-version-operator/pkg/clusterconditions/cache" | ||
| ) | ||
|
|
||
| // PromQL implements a cluster condition that matches based on PromQL. | ||
| type PromQL struct { | ||
| // Address holds the Prometheus query URI. | ||
| Address string | ||
| kubeClient kubernetes.Interface | ||
|
|
||
| // HTTPClientConfig holds the client configuration for connecting to the Prometheus service. | ||
| HTTPClientConfig config.HTTPClientConfig | ||
|
|
@@ -32,23 +33,40 @@ type PromQL struct { | |
| QueryTimeout time.Duration | ||
| } | ||
|
|
||
| var promql = &cache.Cache{ | ||
| Condition: &PromQL{ | ||
| Address: "https://thanos-querier.openshift-monitoring.svc.cluster.local:9091", | ||
| HTTPClientConfig: config.HTTPClientConfig{ | ||
| Authorization: &config.Authorization{ | ||
| Type: "Bearer", | ||
| CredentialsFile: "/var/run/secrets/kubernetes.io/serviceaccount/token", | ||
| }, | ||
| TLSConfig: config.TLSConfig{ | ||
| CAFile: "/etc/tls/service-ca/service-ca.crt", | ||
| func NewPromQL(kubeClient kubernetes.Interface) *cache.Cache { | ||
| return &cache.Cache{ | ||
| Condition: &PromQL{ | ||
| kubeClient: kubeClient, | ||
deads2k marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| HTTPClientConfig: config.HTTPClientConfig{ | ||
| Authorization: &config.Authorization{ | ||
| Type: "Bearer", | ||
| CredentialsFile: "/var/run/secrets/kubernetes.io/serviceaccount/token", | ||
| }, | ||
| TLSConfig: config.TLSConfig{ | ||
| CAFile: "/etc/tls/service-ca/service-ca.crt", | ||
| // ServerName is used to verify the name of the service we will connect to using IP. | ||
| ServerName: "thanos-querier.openshift-monitoring.svc.cluster.local", | ||
deads2k marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| }, | ||
| }, | ||
| QueryTimeout: 5 * time.Minute, | ||
| }, | ||
| QueryTimeout: 5 * time.Minute, | ||
| }, | ||
| MinBetweenMatches: 10 * time.Minute, | ||
| MinForCondition: time.Hour, | ||
| Expiration: 24 * time.Hour, | ||
| MinBetweenMatches: 10 * time.Minute, | ||
| MinForCondition: time.Hour, | ||
| Expiration: 24 * time.Hour, | ||
| } | ||
| } | ||
|
|
||
| // Address determines the address of the thanos-querier to avoid requiring service DNS resolution. | ||
| // We do this so that our host-network pod can use the node's resolv.conf to resolve the internal load balancer name | ||
| // on the pod before DNS pods are available and before the service network is available. The side effect is that | ||
| // the CVO cannot resolve service DNS names. | ||
| func (p *PromQL) Address(ctx context.Context) (string, error) { | ||
deads2k marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| svc, err := p.kubeClient.CoreV1().Services("openshift-monitoring").Get(ctx, "thanos-querier", metav1.GetOptions{}) | ||
| if err != nil { | ||
| return "", err | ||
| } | ||
|
Comment on lines
+64
to
+67
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do we have any expectations about how reliable this is? Rock-solid, flakey, no idea? I'm just curious.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
very reliable. It's what drives DNS |
||
|
|
||
| return fmt.Sprintf("https://%s", net.JoinHostPort(svc.Spec.ClusterIP, "9091")), nil | ||
| } | ||
|
|
||
| // Valid returns an error if the condition contains any properties | ||
|
|
@@ -69,7 +87,13 @@ func (p *PromQL) Valid(ctx context.Context, condition *configv1.ClusterCondition | |
| // false when the PromQL evaluates to 0, and an error if the PromQL | ||
| // returns no time series or returns a value besides 0 or 1. | ||
| func (p *PromQL) Match(ctx context.Context, condition *configv1.ClusterCondition) (bool, error) { | ||
| clientConfig := api.Config{Address: p.Address} | ||
| // Lookup the address every attempt in case the service IP changes. This can happen when the thanos service is | ||
| // deleted and recreated. | ||
| address, err := p.Address(ctx) | ||
deads2k marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| if err != nil { | ||
| return false, fmt.Errorf("failure determine thanos IP: %w", err) | ||
| } | ||
| clientConfig := api.Config{Address: address} | ||
|
|
||
| if roundTripper, err := config.NewRoundTripperFromConfig(p.HTTPClientConfig, "cluster-conditions"); err == nil { | ||
| clientConfig.RoundTripper = roundTripper | ||
|
|
@@ -122,7 +146,3 @@ func (p *PromQL) Match(ctx context.Context, condition *configv1.ClusterCondition | |
| } | ||
| return false, fmt.Errorf("invalid PromQL result (must be 0 or 1): %v", sample.Value) | ||
| } | ||
|
|
||
| func init() { | ||
| clusterconditions.Register("PromQL", promql) | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,16 @@ | ||
| package standard | ||
|
|
||
| import ( | ||
| "github.com/openshift/cluster-version-operator/pkg/clusterconditions" | ||
| "github.com/openshift/cluster-version-operator/pkg/clusterconditions/always" | ||
| "github.com/openshift/cluster-version-operator/pkg/clusterconditions/promql" | ||
| "k8s.io/client-go/kubernetes" | ||
| ) | ||
|
|
||
| func NewConditionRegistry(kubeClient kubernetes.Interface) clusterconditions.ConditionRegistry { | ||
| conditionRegistry := clusterconditions.NewConditionRegistry() | ||
| conditionRegistry.Register("Always", &always.Always{}) | ||
| conditionRegistry.Register("PromQL", promql.NewPromQL(kubeClient)) | ||
|
|
||
| return conditionRegistry | ||
| } |
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Nitpick: This doesn't cause it to use the kubelet's DNS config, it causes it to use 127.0.0.1:53 as its DNS resolver (regardless of what kubelet does). This only works if we run a recursive DNS resolver in the host network ns on nodes, which I guess we must...Just leaving
dnsPolicyunset would do the same thing, for ahostNetworkpod.The naming of the
dnsPolicyoptions is completely broken and led the author of this yaml file astray...