Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 9 additions & 1 deletion docs/release-notes/release-notes-0.16.1.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,14 @@

### Functional Changes/Additions

* [PR](https://github.com/lightninglabs/lightning-terminal/pull/1183): LiT now
fails fast if a critical integrated sub-server cannot start. The critical set
currently includes only tapd; other integrated sub-servers can fail to start
without blocking LiT, and their errors are recorded in status.
* Integrated-mode sub-servers now start deterministically: critical integrated
services are launched first, followed by the remaining services in
alphabetical order to keep startup ordering stable across runs.

### Technical and Architectural Updates

## RPC Updates
Expand All @@ -37,4 +45,4 @@

### Taproot Assets

# Contributors (Alphabetical Order)
# Contributors (Alphabetical Order)
166 changes: 166 additions & 0 deletions itest/litd_mode_integrated_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
"github.com/lightninglabs/faraday/frdrpc"
"github.com/lightninglabs/lightning-node-connect/mailbox"
"github.com/lightninglabs/lightning-terminal/litrpc"
"github.com/lightninglabs/lightning-terminal/subservers"
"github.com/lightninglabs/loop/looprpc"
"github.com/lightninglabs/pool/poolrpc"
"github.com/lightninglabs/taproot-assets/taprpc"
Expand All @@ -29,6 +30,7 @@ import (
"github.com/lightningnetwork/lnd/lnrpc/routerrpc"
"github.com/lightningnetwork/lnd/lnrpc/walletrpc"
"github.com/lightningnetwork/lnd/macaroons"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"golang.org/x/net/http2"
"google.golang.org/grpc"
Expand Down Expand Up @@ -465,6 +467,170 @@ func testModeIntegrated(ctx context.Context, net *NetworkHarness,
)
}

// testCriticalTapStartupFailure ensures LiT exits quickly when a critical
// integrated sub-server (tapd) fails to start during boot.
func testCriticalTapStartupFailure(ctx context.Context, net *NetworkHarness,
t *harnessTest) {

// Force tapd to use a postgres backend with an invalid host to
// guarantee a startup failure in integrated mode. This config will
// error during tapd's startup (after wallet unlock) rather than during
// litd's config validation phase.
node, err := net.NewNode(
t.t, "FailFastTap", nil, false, false,
"--taproot-assets.databasebackend=postgres",
"--taproot-assets.postgres.host=tapd-postgres.invalid",
)
require.NoError(t.t, err)

defer func() {
_ = net.ShutdownNode(node)
}()

select {
case procErr := <-net.ProcessErrors():
require.ErrorContains(t.t, procErr, "tapd-postgres.invalid")
case <-time.After(15 * time.Second):
t.Fatalf("expected tapd startup failure to be reported")
}

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Depending on @ellemouton's response to my broader question in the main review comment, we likely want to expand the test code here, to ensure that the status endpoint & lnd node is at the state we expect it to be.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Issue to track interceptor decoupling: #1201

// LiT should terminate promptly after the critical startup failure.
select {
case <-node.processExit:
case <-time.After(5 * time.Second):
t.Fatalf("litd did not exit after tapd startup failure")
}
}

// testNonCriticalLoopStartupFailure ensures LiT continues running when a
// non-critical integrated sub-server (loopd) fails to start during boot.
func testNonCriticalLoopStartupFailure(ctx context.Context,
net *NetworkHarness, t *harnessTest) {

// Force loopd into an invalid config combination to guarantee a
// startup failure in integrated mode.
node, err := net.NewNode(
t.t, "NonCriticalLoop", nil, false, false,
"--loop.maxl402cost=1",
"--loop.maxlsatcost=1",
)
require.NoError(t.t, err)

stopNodeGracefully := func() error {
conn, err := node.ConnectRPC(true)
if err != nil {
return err
}
defer conn.Close()

stopCtx, cancel := context.WithTimeout(ctx, defaultTimeout)
defer cancel()

lndConn := lnrpc.NewLightningClient(conn)
_, err = lndConn.StopDaemon(stopCtx, &lnrpc.StopRequest{})
return err
}

defer func() {
t.Logf("Shutting down node: %s", node.Cfg.LitDir)
if err := stopNodeGracefully(); err != nil {
t.Logf("Graceful shutdown failed, killing node: %v",
err)
_ = net.KillNode(node)
}
select {
case <-node.processExit:
case <-time.After(5 * time.Second):
_ = net.KillNode(node)
}
_ = node.cleanup()
}()

// Wait for the TLS cert so we can connect to the status service.
t.Logf("Waiting for lit TLS cert to be created: %s",
node.Cfg.LitTLSCertPath)
require.Eventually(t.t, func() bool {
_, err := os.Stat(node.Cfg.LitTLSCertPath)
return err == nil
}, 10*time.Second, 1*time.Second,
"expected lit TLS cert to be created",
)

t.Logf("Connecting to litd service")
rawConn, err := connectLitRPC(
ctx, node.Cfg.LitAddr(), node.Cfg.LitTLSCertPath, "",
)
require.NoError(t.t, err)
defer rawConn.Close()

statusConn := litrpc.NewStatusClient(rawConn)

fetchStatus := func() (*litrpc.SubServerStatusResp, error) {
callCtx, cancel := context.WithTimeout(ctx, 2*time.Second)
defer cancel()

return statusConn.SubServerStatus(
callCtx, &litrpc.SubServerStatusReq{},
)
}

waitForStatus := func(desc string,
predicate func(*litrpc.SubServerStatusResp) bool) {

t.t.Helper()

var lastErr error
var lastResp *litrpc.SubServerStatusResp

ok := assert.Eventually(t.t, func() bool {
lastResp, lastErr = fetchStatus()
if lastErr != nil || lastResp == nil {
return false
}

return predicate(lastResp)
}, 20*time.Second, 200*time.Millisecond)
if ok {
return
}

if lastErr != nil {
t.t.Fatalf("%s: last error: %v", desc, lastErr)
}

t.t.Fatalf("%s: last status: %#v", desc, lastResp.SubServers)
}

t.Logf("Waiting for loop to report non-critical loop startup failure")
waitForStatus("expected loop startup error to be reported",
func(resp *litrpc.SubServerStatusResp) bool {
status, ok := resp.SubServers[subservers.LOOP]
if !ok || status.Running || status.Error == "" {
return false
}

res := strings.Contains(status.Error, "maxl402cost")
return res
},
)

t.Logf("Waiting for lnd to report running state")
waitForStatus("expected lnd to be running",
func(resp *litrpc.SubServerStatusResp) bool {
status, ok := resp.SubServers[subservers.LND]
return ok && status.Running
},
)

t.Logf("Waiting for litd to report running state")
waitForStatus("expected litd to be running",
func(resp *litrpc.SubServerStatusResp) bool {
status, ok := resp.SubServers[subservers.LIT]
return ok && status.Running
},
)
}

// integratedTestSuite makes sure that in integrated mode all daemons work
// correctly.
func integratedTestSuite(ctx context.Context, net *NetworkHarness, t *testing.T,
Expand Down
10 changes: 10 additions & 0 deletions itest/litd_test_list_on_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,16 @@ var allTestCases = []*testCase{
test: testCustomChannelsHtlcForceCloseMpp,
noAliceBob: true,
},
{
name: "critical tap startup failure",
test: testCriticalTapStartupFailure,
noAliceBob: true,
},
{
name: "non-critical loop startup failure",
test: testNonCriticalLoopStartupFailure,
noAliceBob: true,
},
{
name: "custom channels balance consistency",
test: testCustomChannelsBalanceConsistency,
Expand Down
4 changes: 1 addition & 3 deletions subservers/interface.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,7 @@ import (

// SubServer defines an interface that should be implemented by any sub-server
// that the subServer manager should manage. A sub-server can be run in either
// integrated or remote mode. A sub-server is considered non-fatal to LiT
// meaning that if a sub-server fails to start, LiT can safely continue with its
// operations and other sub-servers can too.
// integrated or remote mode.
type SubServer interface {
macaroons.MacaroonValidator

Expand Down
65 changes: 63 additions & 2 deletions subservers/manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,15 @@ import (
"context"
"fmt"
"io/ioutil"
"sort"
"sync"
"time"

restProxy "github.com/grpc-ecosystem/grpc-gateway/v2/runtime"
"github.com/lightninglabs/lightning-terminal/perms"
"github.com/lightninglabs/lightning-terminal/status"
"github.com/lightninglabs/lndclient"
"github.com/lightningnetwork/lnd/fn"
"github.com/lightningnetwork/lnd/lncfg"
"github.com/lightningnetwork/lnd/lnrpc"
grpcProxy "github.com/mwitkow/grpc-proxy/proxy"
Expand All @@ -29,8 +31,34 @@ var (
// defaultConnectTimeout is the default timeout for connecting to the
// backend.
defaultConnectTimeout = 15 * time.Second

// criticalIntegratedSubServers lists integrated sub-servers that must
// succeed during startup. Failures from these sub-servers are surfaced
// to LiT and abort the startup sequence.
criticalIntegratedSubServers = fn.NewSet[string](TAP)
)

// CriticalIntegratedSubServerError signals a startup failure for a critical
// integrated sub-server.
type CriticalIntegratedSubServerError struct {
// Name is the sub-server name that failed to start.
Name string

// Err is the underlying startup error.
Err error
}

// Error returns the formatted error string.
func (e *CriticalIntegratedSubServerError) Error() string {
return fmt.Sprintf("critical integrated sub-server failed to "+
"start (subserver=%s): %v", e.Name, e.Err)
}

// Unwrap returns the underlying error.
func (e *CriticalIntegratedSubServerError) Unwrap() error {
return e.Err
}

// Manager manages a set of subServer objects.
type Manager struct {
servers map[string]*subServerWrapper
Expand Down Expand Up @@ -104,14 +132,37 @@ func (s *Manager) GetServer(name string) (SubServer, bool) {
}

// StartIntegratedServers starts all the manager's sub-servers that should be
// started in integrated mode.
// started in integrated mode. An error is returned if any critical integrated
// sub-server fails to start.
func (s *Manager) StartIntegratedServers(lndClient lnrpc.LightningClient,
lndGrpc *lndclient.GrpcLndServices, withMacaroonService bool) {
lndGrpc *lndclient.GrpcLndServices, withMacaroonService bool) error {

s.mu.Lock()
defer s.mu.Unlock()

// Sort for deterministic startup: critical integrated sub-servers
// first, then alphabetical to keep the order stable across runs.
servers := make([]*subServerWrapper, 0, len(s.servers))
for _, ss := range s.servers {
servers = append(servers, ss)
}

sort.Slice(servers, func(i, j int) bool {
iCritical := criticalIntegratedSubServers.Contains(
servers[i].Name(),
)
jCritical := criticalIntegratedSubServers.Contains(
servers[j].Name(),
)

if iCritical != jCritical {
return iCritical
}

return servers[i].Name() < servers[j].Name()
})

for _, ss := range servers {
if ss.Remote() {
continue
}
Expand All @@ -126,11 +177,21 @@ func (s *Manager) StartIntegratedServers(lndClient lnrpc.LightningClient,
)
if err != nil {
s.statusServer.SetErrored(ss.Name(), err.Error())

if criticalIntegratedSubServers.Contains(ss.Name()) {
return &CriticalIntegratedSubServerError{
Name: ss.Name(),
Err: err,
}
}

continue
}

s.statusServer.SetRunning(ss.Name())
}

return nil
}

// ConnectRemoteSubServers creates connections to all the manager's sub-servers
Expand Down
Loading
Loading