Skip to content

Commit

Permalink
Add OFSwitch connection check to Agent's liveness probes
Browse files Browse the repository at this point in the history
This helps automatic recovery if some issues cause OFSwitch reconnection to
not work properly.

It also fixes a race condition between the IsConnected and SwitchConnected
methods of OFBridge and makes necessary changes to the constructor of
APIServer to allow testing.

For #4092

Signed-off-by: Quan Tian <[email protected]>
  • Loading branch information
tnqn committed Aug 25, 2022
1 parent 52db699 commit f184dc0
Show file tree
Hide file tree
Showing 14 changed files with 440 additions and 66 deletions.
12 changes: 6 additions & 6 deletions build/charts/antrea/templates/agent/daemonset.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -175,12 +175,12 @@ spec:
name: api
protocol: TCP
livenessProbe:
exec:
command:
- /bin/sh
- -c
- container_liveness_probe agent
initialDelaySeconds: 5
httpGet:
host: localhost
path: /livez
port: api
scheme: HTTPS
initialDelaySeconds: 10
timeoutSeconds: 5
periodSeconds: 10
failureThreshold: 5
Expand Down
4 changes: 1 addition & 3 deletions build/images/scripts/container_liveness_probe
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,7 @@

source daemon_status

if [ $1 == agent ]; then
exit 0
elif [ $1 == ovs ]; then
if [ $1 == ovs ]; then
check_ovs_status && exit 0
elif [ $1 == ovs-ipsec ]; then
check_ovs_ipsec_status && exit 0
Expand Down
12 changes: 6 additions & 6 deletions build/yamls/antrea-aks.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4082,12 +4082,12 @@ spec:
name: api
protocol: TCP
livenessProbe:
exec:
command:
- /bin/sh
- -c
- container_liveness_probe agent
initialDelaySeconds: 5
httpGet:
host: localhost
path: /livez
port: api
scheme: HTTPS
initialDelaySeconds: 10
timeoutSeconds: 5
periodSeconds: 10
failureThreshold: 5
Expand Down
12 changes: 6 additions & 6 deletions build/yamls/antrea-eks.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4084,12 +4084,12 @@ spec:
name: api
protocol: TCP
livenessProbe:
exec:
command:
- /bin/sh
- -c
- container_liveness_probe agent
initialDelaySeconds: 5
httpGet:
host: localhost
path: /livez
port: api
scheme: HTTPS
initialDelaySeconds: 10
timeoutSeconds: 5
periodSeconds: 10
failureThreshold: 5
Expand Down
12 changes: 6 additions & 6 deletions build/yamls/antrea-gke.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4081,12 +4081,12 @@ spec:
name: api
protocol: TCP
livenessProbe:
exec:
command:
- /bin/sh
- -c
- container_liveness_probe agent
initialDelaySeconds: 5
httpGet:
host: localhost
path: /livez
port: api
scheme: HTTPS
initialDelaySeconds: 10
timeoutSeconds: 5
periodSeconds: 10
failureThreshold: 5
Expand Down
12 changes: 6 additions & 6 deletions build/yamls/antrea-ipsec.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4101,12 +4101,12 @@ spec:
name: api
protocol: TCP
livenessProbe:
exec:
command:
- /bin/sh
- -c
- container_liveness_probe agent
initialDelaySeconds: 5
httpGet:
host: localhost
path: /livez
port: api
scheme: HTTPS
initialDelaySeconds: 10
timeoutSeconds: 5
periodSeconds: 10
failureThreshold: 5
Expand Down
12 changes: 6 additions & 6 deletions build/yamls/antrea.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4081,12 +4081,12 @@ spec:
name: api
protocol: TCP
livenessProbe:
exec:
command:
- /bin/sh
- -c
- container_liveness_probe agent
initialDelaySeconds: 5
httpGet:
host: localhost
path: /livez
port: api
scheme: HTTPS
initialDelaySeconds: 10
timeoutSeconds: 5
periodSeconds: 10
failureThreshold: 5
Expand Down
20 changes: 11 additions & 9 deletions cmd/antrea-agent/agent.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ import (
"k8s.io/apimachinery/pkg/fields"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/apiserver/pkg/server/options"
"k8s.io/client-go/informers"
coreinformers "k8s.io/client-go/informers/core/v1"
"k8s.io/client-go/tools/cache"
Expand Down Expand Up @@ -71,7 +72,6 @@ import (
"antrea.io/antrea/pkg/ovs/ovsctl"
"antrea.io/antrea/pkg/signals"
"antrea.io/antrea/pkg/util/channel"
"antrea.io/antrea/pkg/util/cipher"
"antrea.io/antrea/pkg/util/env"
"antrea.io/antrea/pkg/util/k8s"
"antrea.io/antrea/pkg/version"
Expand Down Expand Up @@ -748,25 +748,27 @@ func run(o *Options) error {

go agentMonitor.Run(stopCh)

cipherSuites, err := cipher.GenerateCipherSuitesList(o.config.TLSCipherSuites)
if err != nil {
return fmt.Errorf("error generating Cipher Suite list: %v", err)
}
bindAddress := net.IPv4zero
if o.nodeType == config.ExternalNode {
bindAddress = ipv4Localhost
}
secureServing := options.NewSecureServingOptions().WithLoopback()
secureServing.BindAddress = bindAddress
secureServing.BindPort = o.config.APIPort
secureServing.CipherSuites = o.tlsCipherSuites
secureServing.MinTLSVersion = o.config.TLSMinVersion
authentication := options.NewDelegatingAuthenticationOptions()
authorization := options.NewDelegatingAuthorizationOptions().WithAlwaysAllowPaths("/healthz", "/livez", "/readyz")
apiServer, err := apiserver.New(
agentQuerier,
networkPolicyController,
mcastController,
externalIPController,
bindAddress,
o.config.APIPort,
secureServing,
authentication,
authorization,
*o.config.EnablePrometheusMetrics,
o.config.ClientConnection.Kubeconfig,
cipherSuites,
cipher.TLSVersionMap[o.config.TLSMinVersion],
v4Enabled,
v6Enabled)
if err != nil {
Expand Down
20 changes: 20 additions & 0 deletions cmd/antrea-agent/options.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ import (
"github.com/spf13/pflag"
"gopkg.in/yaml.v2"
"k8s.io/apimachinery/pkg/util/sets"
cliflag "k8s.io/component-base/cli/flag"
"k8s.io/component-base/featuregate"
"k8s.io/klog/v2"

Expand Down Expand Up @@ -61,6 +62,8 @@ type Options struct {
configFile string
// The configuration object
config *agentconfig.AgentConfig
// tlsCipherSuites is a slice of TLSCipherSuites mapped to input provided by user.
tlsCipherSuites []string
// IPFIX flow collector address
flowCollectorAddr string
// IPFIX flow collector protocol
Expand Down Expand Up @@ -171,6 +174,23 @@ func (o *Options) setDefaults() {
}
}

func (o *Options) validateTLSOptions() error {
_, err := cliflag.TLSVersion(o.config.TLSMinVersion)
if err != nil {
return fmt.Errorf("invalid TLSMinVersion: %v", err)
}
trimmedTLSCipherSuites := strings.ReplaceAll(o.config.TLSCipherSuites, " ", "")
if trimmedTLSCipherSuites != "" {
tlsCipherSuites := strings.Split(trimmedTLSCipherSuites, ",")
_, err = cliflag.TLSCipherSuites(tlsCipherSuites)
if err != nil {
return fmt.Errorf("invalid TLSCipherSuites: %v", err)
}
o.tlsCipherSuites = tlsCipherSuites
}
return nil
}

func (o *Options) validateAntreaProxyConfig() error {
if !features.DefaultFeatureGate.Enabled(features.AntreaProxy) {
// Validate service CIDR configuration if AntreaProxy is not enabled.
Expand Down
75 changes: 75 additions & 0 deletions cmd/antrea-agent/options_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
// Copyright 2022 Antrea Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package main

import (
"testing"

"github.com/stretchr/testify/assert"

agentconfig "antrea.io/antrea/pkg/config/agent"
)

func TestOptionsValidateTLSOptions(t *testing.T) {
tests := []struct {
name string
config *agentconfig.AgentConfig
expectedErr string
}{
{
name: "empty input",
config: &agentconfig.AgentConfig{
TLSCipherSuites: "",
TLSMinVersion: "",
},
expectedErr: "",
},
{
name: "invalid TLSMinVersion",
config: &agentconfig.AgentConfig{
TLSCipherSuites: "",
TLSMinVersion: "foo",
},
expectedErr: "invalid TLSMinVersion",
},
{
name: "invalid TLSCipherSuites",
config: &agentconfig.AgentConfig{
TLSCipherSuites: "TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305, foo",
TLSMinVersion: "VersionTLS10",
},
expectedErr: "invalid TLSCipherSuites",
},
{
name: "valid input",
config: &agentconfig.AgentConfig{
TLSCipherSuites: "TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305, TLS_RSA_WITH_AES_128_GCM_SHA256",
TLSMinVersion: "VersionTLS12",
},
expectedErr: "",
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
o := &Options{config: tt.config}
err := o.validateTLSOptions()
if tt.expectedErr == "" {
assert.NoError(t, err)
} else {
assert.ErrorContains(t, err, tt.expectedErr)
}
})
}
}
46 changes: 32 additions & 14 deletions pkg/agent/apiserver/apiserver.go
Original file line number Diff line number Diff line change
Expand Up @@ -98,30 +98,41 @@ func installAPIGroup(s *genericapiserver.GenericAPIServer, aq agentquerier.Agent
}

// New creates an APIServer for running in antrea agent.
func New(aq agentquerier.AgentQuerier, npq querier.AgentNetworkPolicyInfoQuerier, mq querier.AgentMulticastInfoQuerier, seipq querier.ServiceExternalIPStatusQuerier,
bindAddress net.IP, bindPort int, enableMetrics bool, kubeconfig string, cipherSuites []uint16, tlsMinVersion uint16, v4Enabled, v6Enabled bool) (*agentAPIServer, error) {
cfg, err := newConfig(npq, bindAddress, bindPort, enableMetrics, kubeconfig)
func New(aq agentquerier.AgentQuerier,
npq querier.AgentNetworkPolicyInfoQuerier,
mq querier.AgentMulticastInfoQuerier,
seipq querier.ServiceExternalIPStatusQuerier,
secureServing *genericoptions.SecureServingOptionsWithLoopback,
authentication *genericoptions.DelegatingAuthenticationOptions,
authorization *genericoptions.DelegatingAuthorizationOptions,
enableMetrics bool,
kubeconfig string,
v4Enabled,
v6Enabled bool,
) (*agentAPIServer, error) {
cfg, err := newConfig(aq, npq, secureServing, authentication, authorization, enableMetrics, kubeconfig)
if err != nil {
return nil, err
}
s, err := cfg.New(Name, genericapiserver.NewEmptyDelegate())
if err != nil {
return nil, err
}
s.SecureServingInfo.CipherSuites = cipherSuites
s.SecureServingInfo.MinTLSVersion = tlsMinVersion
if err := installAPIGroup(s, aq, npq, v4Enabled, v6Enabled); err != nil {
return nil, err
}
installHandlers(aq, npq, mq, seipq, s)
return &agentAPIServer{GenericAPIServer: s}, nil
}

func newConfig(npq querier.AgentNetworkPolicyInfoQuerier, bindAddress net.IP, bindPort int, enableMetrics bool, kubeconfig string) (*genericapiserver.CompletedConfig, error) {
secureServing := genericoptions.NewSecureServingOptions().WithLoopback()
authentication := genericoptions.NewDelegatingAuthenticationOptions()
authorization := genericoptions.NewDelegatingAuthorizationOptions().WithAlwaysAllowPaths("/healthz", "/livez", "/readyz")

func newConfig(aq agentquerier.AgentQuerier,
npq querier.AgentNetworkPolicyInfoQuerier,
secureServing *genericoptions.SecureServingOptionsWithLoopback,
authentication *genericoptions.DelegatingAuthenticationOptions,
authorization *genericoptions.DelegatingAuthorizationOptions,
enableMetrics bool,
kubeconfig string,
) (*genericapiserver.CompletedConfig, error) {
// kubeconfig file is useful when antrea-agent isn't running as a Pod.
if len(kubeconfig) > 0 {
authentication.RemoteKubeConfigFile = kubeconfig
Expand All @@ -131,8 +142,6 @@ func newConfig(npq querier.AgentNetworkPolicyInfoQuerier, bindAddress net.IP, bi
// Set the PairName but leave certificate directory blank to generate in-memory by default.
secureServing.ServerCert.CertDirectory = ""
secureServing.ServerCert.PairName = Name
secureServing.BindAddress = bindAddress
secureServing.BindPort = bindPort

if err := secureServing.MaybeDefaultWithSelfSignedCerts("localhost", nil, []net.IP{net.ParseIP("127.0.0.1"), net.IPv6loopback}); err != nil {
return nil, fmt.Errorf("error creating self-signed certificates: %v", err)
Expand Down Expand Up @@ -163,13 +172,22 @@ func newConfig(npq querier.AgentNetworkPolicyInfoQuerier, bindAddress net.IP, bi
}
serverConfig.EnableMetrics = enableMetrics
// Add readiness probe to check the status of watchers.
check := healthz.NamedCheck("watcher", func(_ *http.Request) error {
watcherCheck := healthz.NamedCheck("watcher", func(_ *http.Request) error {
if npq.GetControllerConnectionStatus() {
return nil
}
return fmt.Errorf("some watchers may not be connected")
})
serverConfig.ReadyzChecks = append(serverConfig.ReadyzChecks, check)
serverConfig.ReadyzChecks = append(serverConfig.ReadyzChecks, watcherCheck)
// Add liveness probe to check the connection with OFSwitch.
// This helps automatic recovery if some issues cause OFSwitch reconnection to not work properly, e.g. issue #4092.
ovsConnCheck := healthz.NamedCheck("ovs", func(_ *http.Request) error {
if aq.GetOpenflowClient().IsConnected() {
return nil
}
return fmt.Errorf("disconnected from OFSwitch")
})
serverConfig.LivezChecks = append(serverConfig.LivezChecks, ovsConnCheck)

completedServerCfg := serverConfig.Complete(nil)
return &completedServerCfg, nil
Expand Down
Loading

0 comments on commit f184dc0

Please sign in to comment.