diff --git a/cmd/goal/network.go b/cmd/goal/network.go index 7123f33288..969661a98a 100644 --- a/cmd/goal/network.go +++ b/cmd/goal/network.go @@ -93,7 +93,7 @@ var networkCreateCmd = &cobra.Command{ panic(err) } - network, err := netdeploy.CreateNetworkFromTemplate(networkName, networkRootDir, networkTemplateFile, binDir, !noImportKeys) + network, err := netdeploy.CreateNetworkFromTemplate(networkName, networkRootDir, networkTemplateFile, binDir, !noImportKeys, nil) if err != nil { if noClean { reportInfof(" ** failed ** - Preserving network rootdir '%s'", networkRootDir) diff --git a/netdeploy/network.go b/netdeploy/network.go index d3296baf34..0d867bb60c 100644 --- a/netdeploy/network.go +++ b/netdeploy/network.go @@ -48,59 +48,19 @@ type NetworkCfg struct { // Network represents an instance of a deployed network type Network struct { - rootDir string - cfg NetworkCfg - nodeDirs map[string]string // mapping between the node name and the directories where the node is operation on (not including RelayDirs) - gen gen.GenesisData -} - -// Name returns the name of the private network -func (n Network) Name() string { - return n.cfg.Name -} - -// PrimaryDataDir returns the primary data directory for the network -func (n Network) PrimaryDataDir() string { - return n.getNodeFullPath(n.cfg.RelayDirs[0]) -} - -// NodeDataDirs returns an array of node data directories (not the relays) -func (n Network) NodeDataDirs() []string { - var directories []string - for _, nodeDir := range n.nodeDirs { - directories = append(directories, n.getNodeFullPath(nodeDir)) - } - return directories -} - -// GetNodeDir returns the node directory that is associated with the given node name. -func (n Network) GetNodeDir(nodeName string) (string, error) { - possibleDir := n.getNodeFullPath(nodeName) - if isNodeDir(possibleDir) { - return possibleDir, nil - } - return "", fmt.Errorf("no node exists that is named '%s'", nodeName) -} - -func isNodeDir(path string) bool { - if util.IsDir(path) { - if util.FileExists(filepath.Join(path, config.GenesisJSONFile)) { - return true - } - } - return false -} - -// Genesis returns the genesis data for this network -func (n Network) Genesis() gen.GenesisData { - return n.gen + rootDir string + cfg NetworkCfg + nodeDirs map[string]string // mapping between the node name and the directories where the node is operation on (not including RelayDirs) + gen gen.GenesisData + nodeExitCallback nodecontrol.AlgodExitErrorCallback } // CreateNetworkFromTemplate uses the specified template to deploy a new private network // under the specified root directory. -func CreateNetworkFromTemplate(name, rootDir, templateFile, binDir string, importKeys bool) (Network, error) { +func CreateNetworkFromTemplate(name, rootDir, templateFile, binDir string, importKeys bool, nodeExitCallback nodecontrol.AlgodExitErrorCallback) (Network, error) { n := Network{ - rootDir: rootDir, + rootDir: rootDir, + nodeExitCallback: nodeExitCallback, } n.cfg.Name = name n.cfg.TemplateFile = templateFile @@ -133,21 +93,6 @@ func CreateNetworkFromTemplate(name, rootDir, templateFile, binDir string, impor return n, err } -func isValidNetworkDir(rootDir string) bool { - cfgFile := filepath.Join(rootDir, configFileName) - fileExists := util.FileExists(cfgFile) - - // If file exists, network assumed to exist - if !fileExists { - return false - } - - // Now check for genesis.json file too - cfgFile = filepath.Join(rootDir, genesisFileName) - fileExists = util.FileExists(cfgFile) - return fileExists -} - // LoadNetwork loads and initializes the Network state representing // an existing deployed network. func LoadNetwork(rootDir string) (Network, error) { @@ -183,6 +128,63 @@ func loadNetworkCfg(configFile string) (NetworkCfg, error) { return cfg, err } +// Name returns the name of the private network +func (n Network) Name() string { + return n.cfg.Name +} + +// PrimaryDataDir returns the primary data directory for the network +func (n Network) PrimaryDataDir() string { + return n.getNodeFullPath(n.cfg.RelayDirs[0]) +} + +// NodeDataDirs returns an array of node data directories (not the relays) +func (n Network) NodeDataDirs() []string { + var directories []string + for _, nodeDir := range n.nodeDirs { + directories = append(directories, n.getNodeFullPath(nodeDir)) + } + return directories +} + +// GetNodeDir returns the node directory that is associated with the given node name. +func (n Network) GetNodeDir(nodeName string) (string, error) { + possibleDir := n.getNodeFullPath(nodeName) + if isNodeDir(possibleDir) { + return possibleDir, nil + } + return "", fmt.Errorf("no node exists that is named '%s'", nodeName) +} + +func isNodeDir(path string) bool { + if util.IsDir(path) { + if util.FileExists(filepath.Join(path, config.GenesisJSONFile)) { + return true + } + } + return false +} + +// Genesis returns the genesis data for this network +func (n Network) Genesis() gen.GenesisData { + return n.gen +} + +func isValidNetworkDir(rootDir string) bool { + cfgFile := filepath.Join(rootDir, configFileName) + fileExists := util.FileExists(cfgFile) + + // If file exists, network assumed to exist + if !fileExists { + return false + } + + // Now check for genesis.json file too + cfgFile = filepath.Join(rootDir, genesisFileName) + fileExists = util.FileExists(cfgFile) + return fileExists +} + // Save persists the network state in the root directory (in network.json) func (n Network) Save(rootDir string) error { cfgFile := filepath.Join(rootDir, configFileName) @@ -250,9 +252,12 @@ func (n Network) Start(binDir string, redirectOutput bool) error { var peerAddressListBuilder strings.Builder for _, relayDir := range n.cfg.RelayDirs { - nc := nodecontrol.MakeNodeController(binDir, n.getNodeFullPath(relayDir)) + + nodeFulllPath := n.getNodeFullPath(relayDir) + nc := nodecontrol.MakeNodeController(binDir, nodeFulllPath) args := nodecontrol.AlgodStartArgs{ - RedirectOutput: redirectOutput, + RedirectOutput: redirectOutput, + ExitErrorCallback: n.nodeExitCallback, } _, err := nc.StartAlgod(args) @@ -308,8 +313,9 @@ func (n Network) GetPeerAddresses(binDir string) []string { func (n Network) startNodes(binDir, relayAddress string, redirectOutput bool) error { args := nodecontrol.AlgodStartArgs{ - PeerAddress: relayAddress, - RedirectOutput: redirectOutput, + PeerAddress: relayAddress, + RedirectOutput: redirectOutput, + ExitErrorCallback: n.nodeExitCallback, } for _, nodeDir := range n.nodeDirs { nc := nodecontrol.MakeNodeController(binDir, n.getNodeFullPath(nodeDir)) diff --git a/nodecontrol/NodeController.go b/nodecontrol/NodeController.go index 0761ee03a1..8467bf5ff6 100644 --- a/nodecontrol/NodeController.go +++ b/nodecontrol/NodeController.go @@ -50,6 +50,10 @@ func MakeNodeController(binDir, algodDataDir string) NodeController { return nc } +// AlgodExitErrorCallback is the callback function from the node controller that reports upstream +// in case there was a change with the algod running state. +type AlgodExitErrorCallback func(*NodeController, error) + // AlgodStartArgs are the possible arguments for starting algod type AlgodStartArgs struct { PeerAddress string @@ -57,6 +61,7 @@ type AlgodStartArgs struct { RedirectOutput bool RunUnderHost bool TelemetryOverride string + ExitErrorCallback AlgodExitErrorCallback } // KMDStartArgs are the possible arguments for starting kmd diff --git a/nodecontrol/algodControl.go b/nodecontrol/algodControl.go index bfca00f783..7563b95c92 100644 --- a/nodecontrol/algodControl.go +++ b/nodecontrol/algodControl.go @@ -184,18 +184,28 @@ func (nc *NodeController) StartAlgod(args AlgodStartArgs) (alreadyRunning bool, } // Wait on the algod process and check if exits - c := make(chan bool) + algodExitChan := make(chan struct{}) + startAlgodCompletedChan := make(chan struct{}) + defer close(startAlgodCompletedChan) go func() { // this Wait call is important even beyond the scope of this function; it allows the system to // move the process from a "zombie" state into "done" state, and is required for the Signal(0) test. - algodCmd.Wait() - c <- true + err := algodCmd.Wait() + select { + case <-startAlgodCompletedChan: + // we've already exited this function, so we want to report to the error to the callback. + if args.ExitErrorCallback != nil { + args.ExitErrorCallback(nc, err) + } + default: + } + algodExitChan <- struct{}{} }() success := false for !success { select { - case <-c: + case <-algodExitChan: return false, errAlgodExitedEarly case <-time.After(time.Millisecond * 100): // If we can't talk to the API yet, spin diff --git a/test/e2e-go/features/participation/onlineOfflineParticipation_test.go b/test/e2e-go/features/participation/onlineOfflineParticipation_test.go index 1b1e73704c..7419f05b27 100644 --- a/test/e2e-go/features/participation/onlineOfflineParticipation_test.go +++ b/test/e2e-go/features/participation/onlineOfflineParticipation_test.go @@ -18,8 +18,8 @@ package participation import ( "path/filepath" + "runtime" "testing" - "runtime" "github.com/stretchr/testify/require" @@ -52,7 +52,7 @@ func TestParticipationKeyOnlyAccountParticipatesCorrectly(t *testing.T) { // since block proposer selection is probabilistic, it is not guaranteed that the account will be chosen // it is a trade-off between test flakiness and test duration proposalWindow := 50 // arbitrary - blockWasProposedByPartkeyOnlyAccountRecently := waitForAccountToProposeBlock(a, fixture, partkeyOnlyAccount, proposalWindow) + blockWasProposedByPartkeyOnlyAccountRecently := waitForAccountToProposeBlock(a, &fixture, partkeyOnlyAccount, proposalWindow) a.True(blockWasProposedByPartkeyOnlyAccountRecently, "partkey-only account should be proposing blocks") // verify partkeyonly_account cannot spend @@ -71,7 +71,7 @@ func TestParticipationKeyOnlyAccountParticipatesCorrectly(t *testing.T) { a.Error(err, "partkey only account should fail to go offline") } -func waitForAccountToProposeBlock(a *require.Assertions, fixture fixtures.RestClientFixture, account string, window int) bool { +func waitForAccountToProposeBlock(a *require.Assertions, fixture *fixtures.RestClientFixture, account string, window int) bool { client := fixture.AlgodClient curStatus, err := client.Status() @@ -183,7 +183,7 @@ func TestNewAccountCanGoOnlineAndParticipate(t *testing.T) { // check that account starts participating after a while proposalWindow := 20 // arbitrary - blockWasProposedByNewAccountRecently := waitForAccountToProposeBlock(a, fixture, newAccount, proposalWindow) + blockWasProposedByNewAccountRecently := waitForAccountToProposeBlock(a, &fixture, newAccount, proposalWindow) a.True(blockWasProposedByNewAccountRecently, "newly online account should be proposing blocks") } diff --git a/test/e2e-go/features/participation/participationRewards_test.go b/test/e2e-go/features/participation/participationRewards_test.go index 30828798db..cf6a56131d 100644 --- a/test/e2e-go/features/participation/participationRewards_test.go +++ b/test/e2e-go/features/participation/participationRewards_test.go @@ -30,7 +30,7 @@ import ( "github.com/algorand/go-algorand/test/framework/fixtures" ) -func getFirstAccountFromNamedNode(fixture fixtures.RestClientFixture, r *require.Assertions, nodeName string) (account string) { +func getFirstAccountFromNamedNode(fixture *fixtures.RestClientFixture, r *require.Assertions, nodeName string) (account string) { cli := fixture.GetLibGoalClientForNamedNode(nodeName) wh, err := cli.GetUnencryptedWalletHandle() r.NoError(err) @@ -82,9 +82,9 @@ func TestOnlineOfflineRewards(t *testing.T) { defer fixture.Shutdown() // get online and offline accounts - onlineAccount := getFirstAccountFromNamedNode(fixture, r, "Online") + onlineAccount := getFirstAccountFromNamedNode(&fixture, r, "Online") onlineClient := fixture.GetLibGoalClientForNamedNode("Online") - offlineAccount := getFirstAccountFromNamedNode(fixture, r, "Offline") + offlineAccount := getFirstAccountFromNamedNode(&fixture, r, "Offline") offlineClient := fixture.GetLibGoalClientForNamedNode("Offline") // learn initial balances @@ -184,7 +184,7 @@ func TestRewardUnitThreshold(t *testing.T) { defer fixture.Shutdown() // get "poor" account (has 1% stake as opposed to 33%) - poorAccount := getFirstAccountFromNamedNode(fixture, r, "SmallNode") + poorAccount := getFirstAccountFromNamedNode(&fixture, r, "SmallNode") client := fixture.GetLibGoalClientForNamedNode("SmallNode") // make new account wh, _ := client.GetUnencryptedWalletHandle() diff --git a/test/e2e-go/features/partitionRecovery/partitionRecovery_test.go b/test/e2e-go/features/partitionRecovery/partitionRecovery_test.go index b489036f1e..6f1e094667 100644 --- a/test/e2e-go/features/partitionRecovery/partitionRecovery_test.go +++ b/test/e2e-go/features/partitionRecovery/partitionRecovery_test.go @@ -18,9 +18,9 @@ package partitionrecovery import ( "path/filepath" + "runtime" "testing" "time" - "runtime" "github.com/stretchr/testify/require" @@ -91,7 +91,7 @@ func TestPartitionRecoverySwapStartup(t *testing.T) { fixture.Setup(t, filepath.Join("nettemplates", "TwoNodes50EachWithRelay.json")) defer fixture.Shutdown() - runTestWithStaggeredStopStart(t, fixture) + runTestWithStaggeredStopStart(t, &fixture) } func TestPartitionRecoveryStaggerRestart(t *testing.T) { @@ -114,10 +114,10 @@ func TestPartitionRecoveryStaggerRestart(t *testing.T) { fixture.Setup(t, filepath.Join("nettemplates", "ThreeNodesEvenDist.json")) defer fixture.Shutdown() - runTestWithStaggeredStopStart(t, fixture) + runTestWithStaggeredStopStart(t, &fixture) } -func runTestWithStaggeredStopStart(t *testing.T, fixture fixtures.RestClientFixture) { +func runTestWithStaggeredStopStart(t *testing.T, fixture *fixtures.RestClientFixture) { a := require.New(t) // Get Node1 so we can wait until it has reached the target round diff --git a/test/framework/fixtures/libgoalFixture.go b/test/framework/fixtures/libgoalFixture.go index 09db0d3e8b..ffc11f12a7 100644 --- a/test/framework/fixtures/libgoalFixture.go +++ b/test/framework/fixtures/libgoalFixture.go @@ -20,11 +20,14 @@ import ( "fmt" "io/ioutil" "os" + "os/exec" "path/filepath" "strings" + "syscall" "testing" "time" + "github.com/algorand/go-deadlock" "github.com/stretchr/testify/require" "github.com/algorand/go-algorand/config" @@ -48,6 +51,7 @@ type LibGoalFixture struct { Name string network netdeploy.Network t TestingT + tMu deadlock.RWMutex clientPartKeys map[string][]account.Participation } @@ -82,9 +86,8 @@ func (f *LibGoalFixture) setup(test TestingT, testName string, templateFile stri os.RemoveAll(f.rootDir) templateFile = filepath.Join(f.testDataDir, templateFile) importKeys := false // Don't automatically import root keys when creating folders, we'll import on-demand - network, err := netdeploy.CreateNetworkFromTemplate("test", f.rootDir, templateFile, f.binDir, importKeys) + network, err := netdeploy.CreateNetworkFromTemplate("test", f.rootDir, templateFile, f.binDir, importKeys, f.nodeExitWithError) f.failOnError(err, "CreateNetworkFromTemplate failed: %v") - f.network = network if startNetwork { @@ -92,6 +95,29 @@ func (f *LibGoalFixture) setup(test TestingT, testName string, templateFile stri } } +// nodeExitWithError is a callback from the network indicating that the node exit with an error after a successfull startup. +// i.e. node terminated, and not due to shutdown.. this is likely to be a crash/panic. +func (f *LibGoalFixture) nodeExitWithError(nc *nodecontrol.NodeController, err error) { + if err == nil { + return + } + + f.tMu.RLock() + defer f.tMu.RUnlock() + if f.t == nil { + return + } + exitError, ok := err.(*exec.ExitError) + if !ok { + require.NoError(f.t, err, "Node at %s has terminated with an error", nc.GetDataDir()) + return + } + ws := exitError.Sys().(syscall.WaitStatus) + exitCode := ws.ExitStatus() + + require.NoError(f.t, err, "Node at %s has terminated with error code %d", nc.GetDataDir(), exitCode) +} + func (f *LibGoalFixture) importRootKeys(lg *libgoal.Client, dataDir string) { genID, err := lg.GenesisID() if err != nil { @@ -241,8 +267,12 @@ func (f *LibGoalFixture) Start() { // It ensures the current test context is set and then reset after the test ends // It should be called in the form of "defer fixture.SetTestContext(t)()" func (f *LibGoalFixture) SetTestContext(t TestingT) func() { + f.tMu.Lock() + defer f.tMu.Unlock() f.t = t return func() { + f.tMu.Lock() + defer f.tMu.Unlock() f.t = nil } }