diff --git a/docs/release-notes/release-notes-0.16.1.md b/docs/release-notes/release-notes-0.16.1.md index a0f0bb530..888afd62d 100644 --- a/docs/release-notes/release-notes-0.16.1.md +++ b/docs/release-notes/release-notes-0.16.1.md @@ -18,6 +18,14 @@ ### Functional Changes/Additions +* [PR](https://github.com/lightninglabs/lightning-terminal/pull/1183): LiT now + fails fast if a critical integrated sub-server cannot start. The critical set + currently includes only tapd; other integrated sub-servers can fail to start + without blocking LiT, and their errors are recorded in status. +* Integrated-mode sub-servers now start deterministically: critical integrated + services are launched first, followed by the remaining services in + alphabetical order to keep startup ordering stable across runs. + ### Technical and Architectural Updates ## RPC Updates @@ -37,4 +45,4 @@ ### Taproot Assets -# Contributors (Alphabetical Order) \ No newline at end of file +# Contributors (Alphabetical Order) diff --git a/itest/litd_mode_integrated_test.go b/itest/litd_mode_integrated_test.go index f6b9a7ca8..4181e8fee 100644 --- a/itest/litd_mode_integrated_test.go +++ b/itest/litd_mode_integrated_test.go @@ -20,6 +20,7 @@ import ( "github.com/lightninglabs/faraday/frdrpc" "github.com/lightninglabs/lightning-node-connect/mailbox" "github.com/lightninglabs/lightning-terminal/litrpc" + "github.com/lightninglabs/lightning-terminal/subservers" "github.com/lightninglabs/loop/looprpc" "github.com/lightninglabs/pool/poolrpc" "github.com/lightninglabs/taproot-assets/taprpc" @@ -29,6 +30,7 @@ import ( "github.com/lightningnetwork/lnd/lnrpc/routerrpc" "github.com/lightningnetwork/lnd/lnrpc/walletrpc" "github.com/lightningnetwork/lnd/macaroons" + "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "golang.org/x/net/http2" "google.golang.org/grpc" @@ -465,6 +467,170 @@ func testModeIntegrated(ctx context.Context, net *NetworkHarness, ) } +// testCriticalTapStartupFailure ensures LiT exits quickly when a critical +// integrated sub-server (tapd) fails to start during boot. +func testCriticalTapStartupFailure(ctx context.Context, net *NetworkHarness, + t *harnessTest) { + + // Force tapd to use a postgres backend with an invalid host to + // guarantee a startup failure in integrated mode. This config will + // error during tapd's startup (after wallet unlock) rather than during + // litd's config validation phase. + node, err := net.NewNode( + t.t, "FailFastTap", nil, false, false, + "--taproot-assets.databasebackend=postgres", + "--taproot-assets.postgres.host=tapd-postgres.invalid", + ) + require.NoError(t.t, err) + + defer func() { + _ = net.ShutdownNode(node) + }() + + select { + case procErr := <-net.ProcessErrors(): + require.ErrorContains(t.t, procErr, "tapd-postgres.invalid") + case <-time.After(15 * time.Second): + t.Fatalf("expected tapd startup failure to be reported") + } + + // LiT should terminate promptly after the critical startup failure. + select { + case <-node.processExit: + case <-time.After(5 * time.Second): + t.Fatalf("litd did not exit after tapd startup failure") + } +} + +// testNonCriticalLoopStartupFailure ensures LiT continues running when a +// non-critical integrated sub-server (loopd) fails to start during boot. +func testNonCriticalLoopStartupFailure(ctx context.Context, + net *NetworkHarness, t *harnessTest) { + + // Force loopd into an invalid config combination to guarantee a + // startup failure in integrated mode. + node, err := net.NewNode( + t.t, "NonCriticalLoop", nil, false, false, + "--loop.maxl402cost=1", + "--loop.maxlsatcost=1", + ) + require.NoError(t.t, err) + + stopNodeGracefully := func() error { + conn, err := node.ConnectRPC(true) + if err != nil { + return err + } + defer conn.Close() + + stopCtx, cancel := context.WithTimeout(ctx, defaultTimeout) + defer cancel() + + lndConn := lnrpc.NewLightningClient(conn) + _, err = lndConn.StopDaemon(stopCtx, &lnrpc.StopRequest{}) + return err + } + + defer func() { + t.Logf("Shutting down node: %s", node.Cfg.LitDir) + if err := stopNodeGracefully(); err != nil { + t.Logf("Graceful shutdown failed, killing node: %v", + err) + _ = net.KillNode(node) + } + select { + case <-node.processExit: + case <-time.After(5 * time.Second): + _ = net.KillNode(node) + } + _ = node.cleanup() + }() + + // Wait for the TLS cert so we can connect to the status service. + t.Logf("Waiting for lit TLS cert to be created: %s", + node.Cfg.LitTLSCertPath) + require.Eventually(t.t, func() bool { + _, err := os.Stat(node.Cfg.LitTLSCertPath) + return err == nil + }, 10*time.Second, 1*time.Second, + "expected lit TLS cert to be created", + ) + + t.Logf("Connecting to litd service") + rawConn, err := connectLitRPC( + ctx, node.Cfg.LitAddr(), node.Cfg.LitTLSCertPath, "", + ) + require.NoError(t.t, err) + defer rawConn.Close() + + statusConn := litrpc.NewStatusClient(rawConn) + + fetchStatus := func() (*litrpc.SubServerStatusResp, error) { + callCtx, cancel := context.WithTimeout(ctx, 2*time.Second) + defer cancel() + + return statusConn.SubServerStatus( + callCtx, &litrpc.SubServerStatusReq{}, + ) + } + + waitForStatus := func(desc string, + predicate func(*litrpc.SubServerStatusResp) bool) { + + t.t.Helper() + + var lastErr error + var lastResp *litrpc.SubServerStatusResp + + ok := assert.Eventually(t.t, func() bool { + lastResp, lastErr = fetchStatus() + if lastErr != nil || lastResp == nil { + return false + } + + return predicate(lastResp) + }, 20*time.Second, 200*time.Millisecond) + if ok { + return + } + + if lastErr != nil { + t.t.Fatalf("%s: last error: %v", desc, lastErr) + } + + t.t.Fatalf("%s: last status: %#v", desc, lastResp.SubServers) + } + + t.Logf("Waiting for loop to report non-critical loop startup failure") + waitForStatus("expected loop startup error to be reported", + func(resp *litrpc.SubServerStatusResp) bool { + status, ok := resp.SubServers[subservers.LOOP] + if !ok || status.Running || status.Error == "" { + return false + } + + res := strings.Contains(status.Error, "maxl402cost") + return res + }, + ) + + t.Logf("Waiting for lnd to report running state") + waitForStatus("expected lnd to be running", + func(resp *litrpc.SubServerStatusResp) bool { + status, ok := resp.SubServers[subservers.LND] + return ok && status.Running + }, + ) + + t.Logf("Waiting for litd to report running state") + waitForStatus("expected litd to be running", + func(resp *litrpc.SubServerStatusResp) bool { + status, ok := resp.SubServers[subservers.LIT] + return ok && status.Running + }, + ) +} + // integratedTestSuite makes sure that in integrated mode all daemons work // correctly. func integratedTestSuite(ctx context.Context, net *NetworkHarness, t *testing.T, diff --git a/itest/litd_test_list_on_test.go b/itest/litd_test_list_on_test.go index fda3f1653..1cf8fc60c 100644 --- a/itest/litd_test_list_on_test.go +++ b/itest/litd_test_list_on_test.go @@ -87,6 +87,16 @@ var allTestCases = []*testCase{ test: testCustomChannelsHtlcForceCloseMpp, noAliceBob: true, }, + { + name: "critical tap startup failure", + test: testCriticalTapStartupFailure, + noAliceBob: true, + }, + { + name: "non-critical loop startup failure", + test: testNonCriticalLoopStartupFailure, + noAliceBob: true, + }, { name: "custom channels balance consistency", test: testCustomChannelsBalanceConsistency, diff --git a/subservers/interface.go b/subservers/interface.go index 4ff0083cf..4f5e7c84e 100644 --- a/subservers/interface.go +++ b/subservers/interface.go @@ -14,9 +14,7 @@ import ( // SubServer defines an interface that should be implemented by any sub-server // that the subServer manager should manage. A sub-server can be run in either -// integrated or remote mode. A sub-server is considered non-fatal to LiT -// meaning that if a sub-server fails to start, LiT can safely continue with its -// operations and other sub-servers can too. +// integrated or remote mode. type SubServer interface { macaroons.MacaroonValidator diff --git a/subservers/manager.go b/subservers/manager.go index 8d9f6b1b3..65903388f 100644 --- a/subservers/manager.go +++ b/subservers/manager.go @@ -4,6 +4,7 @@ import ( "context" "fmt" "io/ioutil" + "sort" "sync" "time" @@ -11,6 +12,7 @@ import ( "github.com/lightninglabs/lightning-terminal/perms" "github.com/lightninglabs/lightning-terminal/status" "github.com/lightninglabs/lndclient" + "github.com/lightningnetwork/lnd/fn" "github.com/lightningnetwork/lnd/lncfg" "github.com/lightningnetwork/lnd/lnrpc" grpcProxy "github.com/mwitkow/grpc-proxy/proxy" @@ -29,8 +31,34 @@ var ( // defaultConnectTimeout is the default timeout for connecting to the // backend. defaultConnectTimeout = 15 * time.Second + + // criticalIntegratedSubServers lists integrated sub-servers that must + // succeed during startup. Failures from these sub-servers are surfaced + // to LiT and abort the startup sequence. + criticalIntegratedSubServers = fn.NewSet[string](TAP) ) +// CriticalIntegratedSubServerError signals a startup failure for a critical +// integrated sub-server. +type CriticalIntegratedSubServerError struct { + // Name is the sub-server name that failed to start. + Name string + + // Err is the underlying startup error. + Err error +} + +// Error returns the formatted error string. +func (e *CriticalIntegratedSubServerError) Error() string { + return fmt.Sprintf("critical integrated sub-server failed to "+ + "start (subserver=%s): %v", e.Name, e.Err) +} + +// Unwrap returns the underlying error. +func (e *CriticalIntegratedSubServerError) Unwrap() error { + return e.Err +} + // Manager manages a set of subServer objects. type Manager struct { servers map[string]*subServerWrapper @@ -104,14 +132,37 @@ func (s *Manager) GetServer(name string) (SubServer, bool) { } // StartIntegratedServers starts all the manager's sub-servers that should be -// started in integrated mode. +// started in integrated mode. An error is returned if any critical integrated +// sub-server fails to start. func (s *Manager) StartIntegratedServers(lndClient lnrpc.LightningClient, - lndGrpc *lndclient.GrpcLndServices, withMacaroonService bool) { + lndGrpc *lndclient.GrpcLndServices, withMacaroonService bool) error { s.mu.Lock() defer s.mu.Unlock() + // Sort for deterministic startup: critical integrated sub-servers + // first, then alphabetical to keep the order stable across runs. + servers := make([]*subServerWrapper, 0, len(s.servers)) for _, ss := range s.servers { + servers = append(servers, ss) + } + + sort.Slice(servers, func(i, j int) bool { + iCritical := criticalIntegratedSubServers.Contains( + servers[i].Name(), + ) + jCritical := criticalIntegratedSubServers.Contains( + servers[j].Name(), + ) + + if iCritical != jCritical { + return iCritical + } + + return servers[i].Name() < servers[j].Name() + }) + + for _, ss := range servers { if ss.Remote() { continue } @@ -126,11 +177,21 @@ func (s *Manager) StartIntegratedServers(lndClient lnrpc.LightningClient, ) if err != nil { s.statusServer.SetErrored(ss.Name(), err.Error()) + + if criticalIntegratedSubServers.Contains(ss.Name()) { + return &CriticalIntegratedSubServerError{ + Name: ss.Name(), + Err: err, + } + } + continue } s.statusServer.SetRunning(ss.Name()) } + + return nil } // ConnectRemoteSubServers creates connections to all the manager's sub-servers diff --git a/subservers/manager_test.go b/subservers/manager_test.go new file mode 100644 index 000000000..279e68bed --- /dev/null +++ b/subservers/manager_test.go @@ -0,0 +1,189 @@ +package subservers + +import ( + "context" + "errors" + "testing" + + restProxy "github.com/grpc-ecosystem/grpc-gateway/v2/runtime" + "github.com/lightninglabs/lightning-terminal/litrpc" + "github.com/lightninglabs/lightning-terminal/perms" + "github.com/lightninglabs/lightning-terminal/status" + "github.com/lightninglabs/lndclient" + tafn "github.com/lightninglabs/taproot-assets/fn" + "github.com/lightningnetwork/lnd/lnrpc" + "github.com/stretchr/testify/require" + "google.golang.org/grpc" + "gopkg.in/macaroon-bakery.v2/bakery" +) + +// mockSubServer is a lightweight SubServer test double. +type mockSubServer struct { + // name is returned by Name(). + name string + + // remote toggles Remote() return value. + remote bool + + // startErr is returned from Start() when set. + startErr error + + // started tracks whether Start() succeeded. + started bool +} + +// Name returns the mock sub-server name. +func (t *mockSubServer) Name() string { + return t.name +} + +// Remote indicates whether the sub-server runs remotely. +func (t *mockSubServer) Remote() bool { + return t.remote +} + +// RemoteConfig returns nil for the mock. +func (t *mockSubServer) RemoteConfig() *RemoteDaemonConfig { + return nil +} + +// Start marks the server started unless startErr is set. +func (t *mockSubServer) Start(_ lnrpc.LightningClient, + _ *lndclient.GrpcLndServices, _ bool) error { + + if t.startErr != nil { + return t.startErr + } + + t.started = true + + return nil +} + +// Stop marks the server as stopped. +func (t *mockSubServer) Stop() error { + t.started = false + return nil +} + +// RegisterGrpcService is a no-op for the mock. +func (t *mockSubServer) RegisterGrpcService(_ grpc.ServiceRegistrar) {} + +// RegisterRestService is a no-op for the mock. +func (t *mockSubServer) RegisterRestService(_ context.Context, + _ *restProxy.ServeMux, _ string, _ []grpc.DialOption) error { + + return nil +} + +// ServerErrChan returns nil for the mock. +func (t *mockSubServer) ServerErrChan() chan error { + return nil +} + +// MacPath returns an empty string for the mock. +func (t *mockSubServer) MacPath() string { + return "" +} + +// Permissions returns nil for the mock. +func (t *mockSubServer) Permissions() map[string][]bakery.Op { + return nil +} + +// WhiteListedURLs returns nil for the mock. +func (t *mockSubServer) WhiteListedURLs() map[string]struct{} { + return nil +} + +// Impl returns an empty option for the mock. +func (t *mockSubServer) Impl() tafn.Option[any] { + return tafn.None[any]() +} + +// ValidateMacaroon always succeeds for the mock. +func (t *mockSubServer) ValidateMacaroon(context.Context, + []bakery.Op, string) error { + + return nil +} + +// newTestManager creates a Manager and status Manager with permissive perms. +func newTestManager(t *testing.T) (*Manager, *status.Manager) { + t.Helper() + + permsMgr, err := perms.NewManager(true) + require.NoError(t, err) + + statusMgr := status.NewStatusManager() + + return NewManager(permsMgr, statusMgr), statusMgr +} + +// TestStartIntegratedServersCriticalFailureStopsStartup ensures critical +// startup errors abort integrated startup. +func TestStartIntegratedServersCriticalFailureStopsStartup(t *testing.T) { + manager, statusMgr := newTestManager(t) + + nonCritical := &mockSubServer{name: "loop"} + critical := &mockSubServer{ + name: TAP, + startErr: errors.New("boom"), + } + + require.NoError(t, manager.AddServer(nonCritical, true)) + require.NoError(t, manager.AddServer(critical, true)) + + err := manager.StartIntegratedServers(nil, nil, true) + require.Error(t, err) + require.Contains(t, err.Error(), TAP) + + resp, err := statusMgr.SubServerStatus( + context.Background(), &litrpc.SubServerStatusReq{}, + ) + require.NoError(t, err) + + statuses := resp.SubServers + require.Contains(t, statuses, TAP) + require.Equal(t, "boom", statuses[TAP].Error) + require.False(t, statuses[TAP].Running) + + require.False( + t, nonCritical.started, "non-critical sub-server should not "+ + "start after critical failure", + ) +} + +// TestStartIntegratedServersNonCriticalFailureContinues verifies non-critical +// startup failures are tolerated. +func TestStartIntegratedServersNonCriticalFailureContinues(t *testing.T) { + manager, statusMgr := newTestManager(t) + + failing := &mockSubServer{ + name: "loop", + startErr: errors.New("start failed"), + } + succeeding := &mockSubServer{name: "pool"} + + require.NoError(t, manager.AddServer(failing, true)) + require.NoError(t, manager.AddServer(succeeding, true)) + + err := manager.StartIntegratedServers(nil, nil, true) + require.NoError(t, err) + + resp, err := statusMgr.SubServerStatus( + context.Background(), &litrpc.SubServerStatusReq{}, + ) + require.NoError(t, err) + + statuses := resp.SubServers + + require.Contains(t, statuses, failing.name) + require.Equal(t, "start failed", statuses[failing.name].Error) + require.False(t, statuses[failing.name].Running) + + require.True(t, succeeding.started) + require.Contains(t, statuses, succeeding.name) + require.True(t, statuses[succeeding.name].Running) + require.Empty(t, statuses[succeeding.name].Error) +} diff --git a/terminal.go b/terminal.go index 498be4697..902179439 100644 --- a/terminal.go +++ b/terminal.go @@ -87,6 +87,7 @@ const ( defaultRPCTimeout = 3 * time.Minute minimumRPCTimeout = 30 * time.Second defaultStartupTimeout = 5 * time.Second + lndStopTimeout = 5 * time.Second ) // restRegistration is a function type that represents a REST proxy @@ -405,6 +406,30 @@ func (g *LightningTerminal) Run(ctx context.Context) error { g.statusMgr.SetErrored( subservers.LIT, "could not start Lit: %v", startErr, ) + + var criticalErr *subservers.CriticalIntegratedSubServerError + if g.cfg.LndMode == ModeIntegrated && + errors.As(startErr, &criticalErr) { + + // Stop lnd promptly to avoid it continuing to run + // after a failed critical integrated sub-server + // startup. + if client, err := g.basicLNDClient(); err == nil { + stopCtx, cancel := context.WithTimeout( + ctx, lndStopTimeout, + ) + defer cancel() + + _, err := client.StopDaemon( + stopCtx, &lnrpc.StopRequest{}, + ) + if err != nil { + log.Errorf("Failed to stop lnd after "+ + "failed critical sub-server "+ + "start: %v", err) + } + } + } } // Now block until we receive an error or the main shutdown @@ -422,11 +447,10 @@ func (g *LightningTerminal) Run(ctx context.Context) error { return startErr } -// start attempts to start all the various components of Litd. Only Litd and -// LND errors are considered fatal and will result in an error being returned. -// If any of the sub-servers managed by the subServerMgr error while starting -// up, these are considered non-fatal and will not result in an error being -// returned. +// start attempts to start all the various components of Litd. LND errors and +// critical integrated sub-server failures are fatal and will result in an +// error being returned. Non-critical sub-server startup failures are recorded +// in the status manager but do not stop startup. func (g *LightningTerminal) start(ctx context.Context) error { var err error @@ -769,9 +793,13 @@ func (g *LightningTerminal) start(ctx context.Context) error { // Both connection types are ready now, let's start our sub-servers if // they should be started locally as an integrated service. createDefaultMacaroons := !g.cfg.statelessInitMode - g.subServerMgr.StartIntegratedServers( + err = g.subServerMgr.StartIntegratedServers( g.basicClient, g.lndClient, createDefaultMacaroons, ) + if err != nil { + return fmt.Errorf("could not start integrated sub-servers: %w", + err) + } err = g.startInternalSubServers(ctx, !g.cfg.statelessInitMode) if err != nil {