-
Notifications
You must be signed in to change notification settings - Fork 1.8k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Restart entire node on tunnel collapse (#8102)
Fixes #7606, where a node doesn't notice when the tunnel port changes. Imagine you have a cluster with a node connected in via a tunnel through a proxy `proxy.example.com` on port `3024`. Now change the proxy config so that `tunnel_public_address` is `proxy.example.com:4024`. You either restart the proxy, or reload the proxy config with a `SIGHUP`. ...and then the node a) loses its connection to auth (because the tunnel is gone), and b) _doesn't reconnect_, because even though the proxy address hasn't changed, the node has cached the old `tunnel_public_address` and keeps trying to connect to that. You can always manually restart the node to have it reconnect, but that would be a pain if you have thousands of nodes. In order to not have to manually restart all nodes, this change implements a check for connection failures to the auth server, and restarts the node if there are multiple connection failures in a given period of time. The check as implemented piggybacks on the node's "common.rotate" service, which can already restart the node in certain circumstances, and uses the success of the periodic rotation sync as a proxy for the health of the node's connection to the auth server. See-Also: #7606
- Loading branch information
Showing
9 changed files
with
385 additions
and
18 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,111 @@ | ||
/* | ||
Copyright 2021 Gravitational, Inc. | ||
Licensed under the Apache License, Version 2.0 (the "License"); | ||
you may not use this file except in compliance with the License. | ||
You may obtain a copy of the License at | ||
http://www.apache.org/licenses/LICENSE-2.0 | ||
Unless required by applicable law or agreed to in writing, software | ||
distributed under the License is distributed on an "AS IS" BASIS, | ||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
See the License for the specific language governing permissions and | ||
limitations under the License. | ||
*/ | ||
|
||
package integration | ||
|
||
import ( | ||
"context" | ||
"testing" | ||
"time" | ||
|
||
"github.com/gravitational/teleport/lib" | ||
"github.com/gravitational/teleport/lib/auth/testauthority" | ||
"github.com/gravitational/teleport/lib/service" | ||
log "github.com/sirupsen/logrus" | ||
"github.com/stretchr/testify/require" | ||
) | ||
|
||
// TestLostConnectionToAuthCausesReload tests that a lost connection to the auth server | ||
// will eventually restart a node | ||
func TestLostConnectionToAuthCausesReload(t *testing.T) { | ||
// Because testing that the node does a full restart is a bit tricky when | ||
// running a cluster from inside a test runner (i.e. we don't want to | ||
// SIGTERM the test runner), we will watch for the node emitting a | ||
// `TeleportReload` even instead. In a proper Teleport instance, this | ||
// event would be picked up at the Supervisor level and would eventually | ||
// cause the instance to gracefully restart. | ||
|
||
require := require.New(t) | ||
log := log.StandardLogger() | ||
|
||
log.Info(">>> Entering Test") | ||
|
||
// InsecureDevMode needed for SSH connections | ||
// TODO(tcsc): surface this as per-server config (see also issue #8913) | ||
lib.SetInsecureDevMode(true) | ||
defer lib.SetInsecureDevMode(false) | ||
|
||
// GIVEN a cluster with a running auth+proxy instance.... | ||
log.Info(">>> Creating cluster") | ||
keygen := testauthority.New() | ||
privateKey, publicKey, err := keygen.GenerateKeyPair("") | ||
require.NoError(err) | ||
auth := NewInstance(InstanceConfig{ | ||
ClusterName: "test-tunnel-collapse", | ||
HostID: "auth", | ||
Priv: privateKey, | ||
Pub: publicKey, | ||
Ports: standardPortSetup(), | ||
log: log, | ||
}) | ||
|
||
log.Info(">>> Creating auth-proxy...") | ||
authConf := service.MakeDefaultConfig() | ||
authConf.Hostname = Host | ||
authConf.Auth.Enabled = true | ||
authConf.Proxy.Enabled = true | ||
authConf.SSH.Enabled = false | ||
authConf.Proxy.DisableWebInterface = true | ||
authConf.Proxy.DisableDatabaseProxy = true | ||
require.NoError(auth.CreateEx(t, nil, authConf)) | ||
t.Cleanup(func() { require.NoError(auth.StopAll()) }) | ||
|
||
log.Info(">>> Start auth-proxy...") | ||
require.NoError(auth.Start()) | ||
|
||
// ... and an SSH node connected via a reverse tunnel configured to | ||
// reload after only a few failed connection attempts per minute | ||
log.Info(">>> Creating and starting node...") | ||
nodeCfg := service.MakeDefaultConfig() | ||
nodeCfg.Hostname = Host | ||
nodeCfg.SSH.Enabled = true | ||
nodeCfg.RotationConnectionInterval = 1 * time.Second | ||
nodeCfg.RestartThreshold = service.Rate{Amount: 3, Time: 1 * time.Minute} | ||
node, err := auth.StartReverseTunnelNode(nodeCfg) | ||
require.NoError(err) | ||
|
||
// WHEN I stop the auth node (and, by implication, disrupt the ssh node's | ||
// connection to it) | ||
log.Info(">>> Stopping auth node") | ||
auth.StopAuth(false) | ||
|
||
// EXPECT THAT the ssh node will eventually issue a reload request | ||
log.Info(">>> Waiting for node restart request.") | ||
waitCtx, cancel := context.WithTimeout(context.Background(), 1*time.Minute) | ||
defer cancel() | ||
|
||
eventCh := make(chan service.Event) | ||
node.WaitForEvent(waitCtx, service.TeleportReloadEvent, eventCh) | ||
select { | ||
case e := <-eventCh: | ||
log.Infof(">>> Received Reload event: %v. Test passed.", e) | ||
|
||
case <-waitCtx.Done(): | ||
require.FailNow("Timed out", "Timed out waiting for reload event") | ||
} | ||
|
||
log.Info(">>> TEST COMPLETE") | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,71 @@ | ||
/* | ||
Copyright 2021 Gravitational, Inc. | ||
Licensed under the Apache License, Version 2.0 (the "License"); | ||
you may not use this file except in compliance with the License. | ||
You may obtain a copy of the License at | ||
http://www.apache.org/licenses/LICENSE-2.0 | ||
Unless required by applicable law or agreed to in writing, software | ||
distributed under the License is distributed on an "AS IS" BASIS, | ||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
See the License for the specific language governing permissions and | ||
limitations under the License. | ||
*/ | ||
|
||
package utils | ||
|
||
import ( | ||
"time" | ||
|
||
"github.com/jonboulle/clockwork" | ||
) | ||
|
||
// TimedCounter is essentially a lightweight rate calculator. It counts events | ||
// that happen over a period of time, e.g. have there been more than 4 errors | ||
// in the last 30 seconds. Automatically expires old events so they are not | ||
// included in the count. Not safe for concurrent use. | ||
type TimedCounter struct { | ||
clock clockwork.Clock | ||
timeout time.Duration | ||
events []time.Time | ||
} | ||
|
||
// TimedCounted creates a new timed counter with the specified timeout | ||
func NewTimedCounter(clock clockwork.Clock, timeout time.Duration) *TimedCounter { | ||
return &TimedCounter{ | ||
clock: clock, | ||
timeout: timeout, | ||
events: nil, | ||
} | ||
} | ||
|
||
// Increment adds a new item into the counter, returning the current count. | ||
func (c *TimedCounter) Increment() int { | ||
c.trim() | ||
c.events = append(c.events, c.clock.Now()) | ||
return len(c.events) | ||
} | ||
|
||
// Count fetches the number of recorded events currently in the measurement | ||
// time window. | ||
func (c *TimedCounter) Count() int { | ||
c.trim() | ||
return len(c.events) | ||
} | ||
|
||
func (c *TimedCounter) trim() { | ||
deadline := c.clock.Now().Add(-c.timeout) | ||
lastExpiredEvent := -1 | ||
for i := range c.events { | ||
if c.events[i].After(deadline) { | ||
break | ||
} | ||
lastExpiredEvent = i | ||
} | ||
|
||
if lastExpiredEvent > -1 { | ||
c.events = c.events[lastExpiredEvent+1:] | ||
} | ||
} |
Oops, something went wrong.