From 047e3ed6dd88b3ff410dab1db003f0713bd36624 Mon Sep 17 00:00:00 2001 From: 6543 <6543@obermui.de> Date: Sun, 19 Apr 2026 23:46:38 +0200 Subject: [PATCH 1/3] Make agent reconnect retry timeout configurable --- agent/rpc/client_grpc.go | 29 ++++++++++++++----- cmd/agent/core/agent.go | 4 ++- cmd/agent/core/flags.go | 6 ++++ .../10-configuration/30-agent.md | 13 +++++++++ 4 files changed, 44 insertions(+), 8 deletions(-) diff --git a/agent/rpc/client_grpc.go b/agent/rpc/client_grpc.go index 9c1c42d9b85..2e64d2c4240 100644 --- a/agent/rpc/client_grpc.go +++ b/agent/rpc/client_grpc.go @@ -47,10 +47,6 @@ const ( // Maximum amount of time between sending consecutive batched log messages. // Controls the delay between the CI job generating a log record, and web users receiving it. maxLogFlushPeriod time.Duration = time.Second - - // ConnectionRetryTimeout is the maximum time to wait for a connection to be restored - // before the agent gives up and exits. - ConnectionRetryTimeout = 2 * time.Minute ) type client struct { @@ -58,18 +54,37 @@ type client struct { conn *grpc.ClientConn logs chan *proto.LogEntry connectionLostAt time.Time + // connectionRetryTimeout is the maximum time to wait for a connection to be restored before the agent gives up and exits. + connectionRetryTimeout time.Duration } // NewGrpcClient returns a new grpc Client. 
-func NewGrpcClient(ctx context.Context, conn *grpc.ClientConn) rpc.Peer { +func NewGrpcClient(ctx context.Context, conn *grpc.ClientConn, opts ...ClientOption) rpc.Peer { client := new(client) client.client = proto.NewWoodpeckerClient(conn) client.conn = conn client.logs = make(chan *proto.LogEntry, 10) // max memory use: 10 lines * 1 MiB + client.connectionRetryTimeout = 2 * time.Minute + + for _, opt := range opts { + opt(client) + } + go client.processLogs(ctx) return client } +type ClientOption func(c *client) + +func SetConnectionRetryTimeout(d time.Duration) ClientOption { + if d == 0 { + log.Warn().Msg("connection retry timeout set to infinite") + } + return func(c *client) { + c.connectionRetryTimeout = d + } +} + func (c *client) Close() error { close(c.logs) return c.conn.Close() @@ -87,10 +102,10 @@ func (c *client) IsConnected() bool { } func (c *client) shouldGiveUp() bool { - if c.connectionLostAt.IsZero() { + if c.connectionRetryTimeout == 0 || c.connectionLostAt.IsZero() { return false } - return time.Since(c.connectionLostAt) > ConnectionRetryTimeout + return time.Since(c.connectionLostAt) > c.connectionRetryTimeout } func (c *client) newBackOff() backoff.BackOff { diff --git a/cmd/agent/core/agent.go b/cmd/agent/core/agent.go index c5a438836e7..1b7ddbccd8f 100644 --- a/cmd/agent/core/agent.go +++ b/cmd/agent/core/agent.go @@ -152,7 +152,9 @@ func run(ctx context.Context, c *cli.Command, backends []types.Backend) error { } defer conn.Close() - client := agent_rpc.NewGrpcClient(ctx, conn) + client := agent_rpc.NewGrpcClient(ctx, conn, + agent_rpc.SetConnectionRetryTimeout(c.Duration("retry-timeout")), + ) agentConfigPersisted := atomic.Bool{} grpcCtx := metadata.NewOutgoingContext(grpcClientCtx, metadata.Pairs("hostname", hostname)) diff --git a/cmd/agent/core/flags.go b/cmd/agent/core/flags.go index e7ad92f03f8..549da5eaf46 100644 --- a/cmd/agent/core/flags.go +++ b/cmd/agent/core/flags.go @@ -51,6 +51,12 @@ var flags = []cli.Flag{ Usage: "should 
the grpc server certificate be verified, only valid when WOODPECKER_GRPC_SECURE is true", Value: true, }, + &cli.DurationFlag{ + Sources: cli.EnvVars("WOODPECKER_RETRY_TIMEOUT"), + Name: "retry-timeout", + Usage: "how long the agent keeps retrying to reconnect to the server after the gRPC connection is lost before giving up, set to 0 to retry forever", + Value: 2 * time.Minute, + }, &cli.StringFlag{ Sources: cli.EnvVars("WOODPECKER_HOSTNAME"), Name: "hostname", diff --git a/docs/docs/30-administration/10-configuration/30-agent.md b/docs/docs/30-administration/10-configuration/30-agent.md index 10f1cbc4670..6c4d2bdb801 100644 --- a/docs/docs/30-administration/10-configuration/30-agent.md +++ b/docs/docs/30-administration/10-configuration/30-agent.md @@ -228,6 +228,19 @@ Configures if the gRPC server certificate should be verified, only valid when `W --- +### RETRY_TIMEOUT + +- Name: `WOODPECKER_RETRY_TIMEOUT` +- Default: `2m` + +Set how long the agent keeps retrying to reconnect to the server after the gRPC connection is lost before giving up. + +:::warning +If set to `0`, the agent retries forever and never gives up. 
+::: + +--- + ### BACKEND - Name: `WOODPECKER_BACKEND` From 9aeed5870e60b3073715bc2c98ff8d888f33345d Mon Sep 17 00:00:00 2001 From: 6543 <6543@obermui.de> Date: Sun, 19 Apr 2026 23:48:35 +0200 Subject: [PATCH 2/3] rm mnd lint issue --- agent/rpc/client_grpc.go | 1 - 1 file changed, 1 deletion(-) diff --git a/agent/rpc/client_grpc.go b/agent/rpc/client_grpc.go index 2e64d2c4240..edebf5763af 100644 --- a/agent/rpc/client_grpc.go +++ b/agent/rpc/client_grpc.go @@ -64,7 +64,6 @@ func NewGrpcClient(ctx context.Context, conn *grpc.ClientConn, opts ...ClientOpt client.client = proto.NewWoodpeckerClient(conn) client.conn = conn client.logs = make(chan *proto.LogEntry, 10) // max memory use: 10 lines * 1 MiB - client.connectionRetryTimeout = 2 * time.Minute for _, opt := range opts { opt(client) From d62a3a8ab1237d833b6827b815281a69a27e98a6 Mon Sep 17 00:00:00 2001 From: 6543 <6543@obermui.de> Date: Mon, 20 Apr 2026 00:05:26 +0200 Subject: [PATCH 3/3] make IsConnected race protected --- agent/rpc/client_grpc.go | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/agent/rpc/client_grpc.go b/agent/rpc/client_grpc.go index edebf5763af..9fdc5b5de70 100644 --- a/agent/rpc/client_grpc.go +++ b/agent/rpc/client_grpc.go @@ -19,6 +19,7 @@ import ( "encoding/json" "errors" "strings" + "sync" "time" "github.com/cenkalti/backoff/v5" @@ -50,10 +51,11 @@ const ( ) type client struct { - client proto.WoodpeckerClient - conn *grpc.ClientConn - logs chan *proto.LogEntry - connectionLostAt time.Time + client proto.WoodpeckerClient + conn *grpc.ClientConn + logs chan *proto.LogEntry + connectionLostAt time.Time + connectionLostLock sync.Mutex // connectionRetryTimeout is the maximum time to wait for a connection to be restored before the agent gives up and exits. 
connectionRetryTimeout time.Duration } @@ -92,6 +94,8 @@ func (c *client) Close() error { func (c *client) IsConnected() bool { state := c.conn.GetState() connected := state == connectivity.Ready || state == connectivity.Idle + c.connectionLostLock.Lock() + defer c.connectionLostLock.Unlock() if !connected && c.connectionLostAt.IsZero() { c.connectionLostAt = time.Now() } else if connected && !c.connectionLostAt.IsZero() {