diff --git a/agent/rpc/client_grpc.go b/agent/rpc/client_grpc.go index 96f864cf25e..318bc093832 100644 --- a/agent/rpc/client_grpc.go +++ b/agent/rpc/client_grpc.go @@ -19,6 +19,7 @@ import ( "encoding/json" "errors" "strings" + "sync" "time" "github.com/cenkalti/backoff/v5" @@ -47,32 +48,49 @@ const ( // Maximum amount of time between sending consecutive batched log messages. // Controls the delay between the CI job generating a log record, and web users receiving it. maxLogFlushPeriod time.Duration = time.Second - - // ConnectionRetryTimeout is the maximum time to wait for a connection to be restored - // before the agent gives up and exits. - ConnectionRetryTimeout = 2 * time.Minute ) type client struct { - client proto.WoodpeckerClient - conn *grpc.ClientConn - logs chan *proto.LogEntry - connectionLostAt time.Time + client proto.WoodpeckerClient + conn *grpc.ClientConn + logs chan *proto.LogEntry + connectionLostAt time.Time + connectionLostLock sync.Mutex + // connectionRetryTimeout is the maximum time to wait for a connection to be restored before the agent gives up and exits. A zero value means retry forever. + connectionRetryTimeout time.Duration } // NewGrpcClient returns a new grpc Client. 
-func NewGrpcClient(ctx context.Context, conn *grpc.ClientConn) rpc.Peer { +func NewGrpcClient(ctx context.Context, conn *grpc.ClientConn, opts ...ClientOption) rpc.Peer { client := new(client) client.client = proto.NewWoodpeckerClient(conn) client.conn = conn client.logs = make(chan *proto.LogEntry, 10) // max memory use: 10 lines * 1 MiB + + for _, opt := range opts { + opt(client) + } + go client.processLogs(ctx) return client } +type ClientOption func(c *client) + +func SetConnectionRetryTimeout(d time.Duration) ClientOption { + if d == 0 { + log.Warn().Msg("connection retry timeout set to infinite") + } + return func(c *client) { + c.connectionRetryTimeout = d + } +} + func (c *client) IsConnected() bool { state := c.conn.GetState() connected := state == connectivity.Ready || state == connectivity.Idle + c.connectionLostLock.Lock() + defer c.connectionLostLock.Unlock() if !connected && c.connectionLostAt.IsZero() { c.connectionLostAt = time.Now() } else if connected && !c.connectionLostAt.IsZero() { @@ -82,10 +100,13 @@ func (c *client) IsConnected() bool { } +// shouldGiveUp reports whether the connection has been lost for longer than connectionRetryTimeout; a zero timeout never gives up. It holds connectionLostLock because IsConnected writes connectionLostAt under that lock. func (c *client) shouldGiveUp() bool { + c.connectionLostLock.Lock() + defer c.connectionLostLock.Unlock() - if c.connectionLostAt.IsZero() { + if c.connectionRetryTimeout == 0 || c.connectionLostAt.IsZero() { return false } - return time.Since(c.connectionLostAt) > ConnectionRetryTimeout + return time.Since(c.connectionLostAt) > c.connectionRetryTimeout } func (c *client) newBackOff() backoff.BackOff { diff --git a/cmd/agent/core/agent.go b/cmd/agent/core/agent.go index c5a438836e7..1b7ddbccd8f 100644 --- a/cmd/agent/core/agent.go +++ b/cmd/agent/core/agent.go @@ -152,7 +152,9 @@ func run(ctx context.Context, c *cli.Command, backends []types.Backend) error { } defer conn.Close() - client := agent_rpc.NewGrpcClient(ctx, conn) + client := agent_rpc.NewGrpcClient(ctx, conn, + agent_rpc.SetConnectionRetryTimeout(c.Duration("retry-timeout")), + ) agentConfigPersisted := atomic.Bool{} grpcCtx := metadata.NewOutgoingContext(grpcClientCtx, metadata.Pairs("hostname", hostname)) 
diff --git a/cmd/agent/core/flags.go b/cmd/agent/core/flags.go index e7ad92f03f8..549da5eaf46 100644 --- a/cmd/agent/core/flags.go +++ b/cmd/agent/core/flags.go @@ -51,6 +51,12 @@ var flags = []cli.Flag{ Usage: "should the grpc server certificate be verified, only valid when WOODPECKER_GRPC_SECURE is true", Value: true, }, + &cli.DurationFlag{ + Sources: cli.EnvVars("WOODPECKER_RETRY_TIMEOUT"), + Name: "retry-timeout", + Usage: "how long the agent keeps retrying to reconnect to the server after the gRPC connection is lost before giving up, set to 0 to retry forever", + Value: 2 * time.Minute, + }, &cli.StringFlag{ Sources: cli.EnvVars("WOODPECKER_HOSTNAME"), Name: "hostname", diff --git a/docs/docs/30-administration/10-configuration/30-agent.md b/docs/docs/30-administration/10-configuration/30-agent.md index 10f1cbc4670..6c4d2bdb801 100644 --- a/docs/docs/30-administration/10-configuration/30-agent.md +++ b/docs/docs/30-administration/10-configuration/30-agent.md @@ -228,6 +228,19 @@ Configures if the gRPC server certificate should be verified, only valid when `W --- +### RETRY_TIMEOUT + +- Name: `WOODPECKER_RETRY_TIMEOUT` +- Default: `2m` + +Set how long the agent keeps retrying to reconnect to the server after the gRPC connection is lost before giving up. + +:::warning +If set to `0`, the agent retries forever. +::: + +--- + ### BACKEND - Name: `WOODPECKER_BACKEND`