From bcd607e636a589d01d896b00ab84f526f77ba7ef Mon Sep 17 00:00:00 2001 From: ohill <145173879+ohill@users.noreply.github.com> Date: Mon, 8 Apr 2024 10:16:54 -0400 Subject: [PATCH 1/4] Add GoMemLimit config option and update TestPartitionHalfOffline to use it for 10 nodes * 1GB limit --- cmd/algod/main.go | 6 + config/localTemplate.go | 6 +- config/local_defaults.go | 3 +- installer/config.json.example | 3 +- .../partitionRecovery_test.go | 1 + test/testdata/configs/config-v34.json | 139 ++++++++++++++++++ 6 files changed, 155 insertions(+), 3 deletions(-) create mode 100644 test/testdata/configs/config-v34.json diff --git a/cmd/algod/main.go b/cmd/algod/main.go index 603f543b89..0f93ed447f 100644 --- a/cmd/algod/main.go +++ b/cmd/algod/main.go @@ -22,6 +22,7 @@ import ( "math/rand" "os" "path/filepath" + "runtime/debug" "strconv" "strings" "time" @@ -173,6 +174,11 @@ func run() int { log.Fatalf("Cannot load config: %v", err) } + // set soft memory limit, if configured + if cfg.GoMemLimit > 0 { + debug.SetMemoryLimit(int64(cfg.GoMemLimit)) + } + _, err = cfg.ValidateDNSBootstrapArray(genesis.Network) if err != nil { // log is not setup yet, this will log to stderr diff --git a/config/localTemplate.go b/config/localTemplate.go index ce4294de01..0f23a6ad86 100644 --- a/config/localTemplate.go +++ b/config/localTemplate.go @@ -42,7 +42,7 @@ type Local struct { // Version tracks the current version of the defaults so we can migrate old -> new // This is specifically important whenever we decide to change the default value // for an existing parameter. This field tag must be updated any time we add a new version. - Version uint32 `version[0]:"0" version[1]:"1" version[2]:"2" version[3]:"3" version[4]:"4" version[5]:"5" version[6]:"6" version[7]:"7" version[8]:"8" version[9]:"9" version[10]:"10" version[11]:"11" version[12]:"12" version[13]:"13" version[14]:"14" version[15]:"15" version[16]:"16" version[17]:"17" version[18]:"18" version[19]:"19" version[20]:"20" version[21]:"21" version[22]:"22" version[23]:"23" version[24]:"24" version[25]:"25" version[26]:"26" version[27]:"27" version[28]:"28" version[29]:"29" version[30]:"30" version[31]:"31" version[32]:"32" version[33]:"33"` + Version uint32 `version[0]:"0" version[1]:"1" version[2]:"2" version[3]:"3" version[4]:"4" version[5]:"5" version[6]:"6" version[7]:"7" version[8]:"8" version[9]:"9" version[10]:"10" version[11]:"11" version[12]:"12" version[13]:"13" version[14]:"14" version[15]:"15" version[16]:"16" version[17]:"17" version[18]:"18" version[19]:"19" version[20]:"20" version[21]:"21" version[22]:"22" version[23]:"23" version[24]:"24" version[25]:"25" version[26]:"26" version[27]:"27" version[28]:"28" version[29]:"29" version[30]:"30" version[31]:"31" version[32]:"32" version[33]:"33" version[34]:"34"` // Archival nodes retain a full copy of the block history. Non-Archival nodes will delete old blocks and only retain what's need to properly validate blockchain messages (the precise number of recent blocks depends on the consensus parameters. Currently the last 1321 blocks are required). This means that non-Archival nodes require significantly less storage than Archival nodes. If setting this to true for the first time, the existing ledger may need to be deleted to get the historical values stored as the setting only affects current blocks forward. To do this, shutdown the node and delete all .sqlite files within the data/testnet-version directory, except the crash.sqlite file. Restart the node and wait for the node to sync. Archival bool `version[0]:"false"` @@ -609,6 +609,10 @@ type Local struct { // DisableAPIAuth turns off authentication for public (non-admin) API endpoints. DisableAPIAuth bool `version[30]:"false"` + + // GoMemLimit provides the Go runtime with a soft memory limit. The default behavior is no limit, + // unless the GOMEMLIMIT environment variable is set. + GoMemLimit uint64 `version[34]:"0"` } // DNSBootstrapArray returns an array of one or more DNS Bootstrap identifiers diff --git a/config/local_defaults.go b/config/local_defaults.go index d2a73d4c6f..f5f02082aa 100644 --- a/config/local_defaults.go +++ b/config/local_defaults.go @@ -20,7 +20,7 @@ package config var defaultLocal = Local{ - Version: 33, + Version: 34, AccountUpdatesStatsInterval: 5000000000, AccountsRebuildSynchronousMode: 1, AgreementIncomingBundlesQueueLength: 15, @@ -89,6 +89,7 @@ var defaultLocal = Local{ FallbackDNSResolverAddress: "", ForceFetchTransactions: false, ForceRelayMessages: false, + GoMemLimit: 0, GossipFanout: 4, HeartbeatUpdateInterval: 600, HotDataDir: "", diff --git a/installer/config.json.example b/installer/config.json.example index d9188ef748..4a9714115f 100644 --- a/installer/config.json.example +++ b/installer/config.json.example @@ -1,5 +1,5 @@ { - "Version": 33, + "Version": 34, "AccountUpdatesStatsInterval": 5000000000, "AccountsRebuildSynchronousMode": 1, "AgreementIncomingBundlesQueueLength": 15, @@ -68,6 +68,7 @@ "FallbackDNSResolverAddress": "", "ForceFetchTransactions": false, "ForceRelayMessages": false, + "GoMemLimit": 0, "GossipFanout": 4, "HeartbeatUpdateInterval": 600, "HotDataDir": "", diff --git a/test/e2e-go/features/partitionRecovery/partitionRecovery_test.go b/test/e2e-go/features/partitionRecovery/partitionRecovery_test.go index 284146864d..14b3b82f6e 100644 --- a/test/e2e-go/features/partitionRecovery/partitionRecovery_test.go +++ b/test/e2e-go/features/partitionRecovery/partitionRecovery_test.go @@ -251,6 +251,7 @@ func TestPartitionHalfOffline(t *testing.T) { a.NoError(err) // adjust the refresh interval for one hour, so that we won't be reloading the participation key during this test. cfg.ParticipationKeysRefreshInterval = time.Hour + cfg.GoMemLimit = 1 * 1024 * 1024 * 1024 cfg.SaveToDisk(nodeDir) } fixture.Start() diff --git a/test/testdata/configs/config-v34.json b/test/testdata/configs/config-v34.json new file mode 100644 index 0000000000..4a9714115f --- /dev/null +++ b/test/testdata/configs/config-v34.json @@ -0,0 +1,139 @@ +{ + "Version": 34, + "AccountUpdatesStatsInterval": 5000000000, + "AccountsRebuildSynchronousMode": 1, + "AgreementIncomingBundlesQueueLength": 15, + "AgreementIncomingProposalsQueueLength": 50, + "AgreementIncomingVotesQueueLength": 20000, + "AnnounceParticipationKey": true, + "Archival": false, + "BaseLoggerDebugLevel": 4, + "BlockDBDir": "", + "BlockServiceCustomFallbackEndpoints": "", + "BlockServiceMemCap": 500000000, + "BroadcastConnectionsLimit": -1, + "CadaverDirectory": "", + "CadaverSizeTarget": 0, + "CatchpointDir": "", + "CatchpointFileHistoryLength": 365, + "CatchpointInterval": 10000, + "CatchpointTracking": 0, + "CatchupBlockDownloadRetryAttempts": 1000, + "CatchupBlockValidateMode": 0, + "CatchupFailurePeerRefreshRate": 10, + "CatchupGossipBlockFetchTimeoutSec": 4, + "CatchupHTTPBlockFetchTimeoutSec": 4, + "CatchupLedgerDownloadRetryAttempts": 50, + "CatchupParallelBlocks": 16, + "ColdDataDir": "", + "ConnectionsRateLimitingCount": 60, + "ConnectionsRateLimitingWindowSeconds": 1, + "CrashDBDir": "", + "DNSBootstrapID": ".algorand.network?backup=.algorand.net&dedup=.algorand-.(network|net)", + "DNSSecurityFlags": 1, + "DeadlockDetection": 0, + "DeadlockDetectionThreshold": 30, + "DisableAPIAuth": false, + "DisableLedgerLRUCache": false, + "DisableLocalhostConnectionRateLimit": true, + "DisableNetworking": false, + "DisableOutgoingConnectionThrottling": false, + "EnableAccountUpdatesStats": false, + "EnableAgreementReporting": false, + "EnableAgreementTimeMetrics": false, + "EnableAssembleStats": false, + "EnableBlockService": false, + "EnableDeveloperAPI": false, + "EnableExperimentalAPI": false, + "EnableFollowMode": false, + "EnableGossipBlockService": true, + "EnableGossipService": true, + "EnableIncomingMessageFilter": false, + "EnableLedgerService": false, + "EnableMetricReporting": false, + "EnableOutgoingNetworkMessageFiltering": true, + "EnableP2P": false, + "EnablePingHandler": true, + "EnableProcessBlockStats": false, + "EnableProfiler": false, + "EnableRequestLogger": false, + "EnableRuntimeMetrics": false, + "EnableTopAccountsReporting": false, + "EnableTxBacklogAppRateLimiting": true, + "EnableTxBacklogRateLimiting": true, + "EnableTxnEvalTracer": false, + "EnableUsageLog": false, + "EnableVerbosedTransactionSyncLogging": false, + "EndpointAddress": "127.0.0.1:0", + "FallbackDNSResolverAddress": "", + "ForceFetchTransactions": false, + "ForceRelayMessages": false, + "GoMemLimit": 0, + "GossipFanout": 4, + "HeartbeatUpdateInterval": 600, + "HotDataDir": "", + "IncomingConnectionsLimit": 2400, + "IncomingMessageFilterBucketCount": 5, + "IncomingMessageFilterBucketSize": 512, + "LedgerSynchronousMode": 2, + "LogArchiveDir": "", + "LogArchiveMaxAge": "", + "LogArchiveName": "node.archive.log", + "LogFileDir": "", + "LogSizeLimit": 1073741824, + "MaxAPIBoxPerApplication": 100000, + "MaxAPIResourcesPerAccount": 100000, + "MaxAcctLookback": 4, + "MaxBlockHistoryLookback": 0, + "MaxCatchpointDownloadDuration": 43200000000000, + "MaxConnectionsPerIP": 15, + "MinCatchpointFileDownloadBytesPerSecond": 20480, + "NetAddress": "", + "NetworkMessageTraceServer": "", + "NetworkProtocolVersion": "", + "NodeExporterListenAddress": ":9100", + "NodeExporterPath": "./node_exporter", + "OptimizeAccountsDatabaseOnStartup": false, + "OutgoingMessageFilterBucketCount": 3, + "OutgoingMessageFilterBucketSize": 128, + "P2PPersistPeerID": false, + "P2PPrivateKeyLocation": "", + "ParticipationKeysRefreshInterval": 60000000000, + "PeerConnectionsUpdateInterval": 3600, + "PeerPingPeriodSeconds": 0, + "PriorityPeers": {}, + "ProposalAssemblyTime": 500000000, + "PublicAddress": "", + "ReconnectTime": 60000000000, + "ReservedFDs": 256, + "RestConnectionsHardLimit": 2048, + "RestConnectionsSoftLimit": 1024, + "RestReadTimeoutSeconds": 15, + "RestWriteTimeoutSeconds": 120, + "RunHosted": false, + "StateproofDir": "", + "StorageEngine": "sqlite", + "SuggestedFeeBlockHistory": 3, + "SuggestedFeeSlidingWindowSize": 50, + "TLSCertFile": "", + "TLSKeyFile": "", + "TelemetryToLog": true, + "TrackerDBDir": "", + "TransactionSyncDataExchangeRate": 0, + "TransactionSyncSignificantMessageThreshold": 0, + "TxBacklogAppTxPerSecondRate": 100, + "TxBacklogAppTxRateLimiterMaxSize": 1048576, + "TxBacklogRateLimitingCongestionPct": 50, + "TxBacklogReservedCapacityPerPeer": 20, + "TxBacklogServiceRateWindowSeconds": 10, + "TxBacklogSize": 26000, + "TxIncomingFilterMaxSize": 500000, + "TxIncomingFilteringFlags": 1, + "TxPoolExponentialIncreaseFactor": 2, + "TxPoolSize": 75000, + "TxSyncIntervalSeconds": 60, + "TxSyncServeResponseSize": 1000000, + "TxSyncTimeoutSeconds": 30, + "UseXForwardedForAddressField": "", + "VerifiedTranscationsCacheSize": 150000 +} From a8cdebc66f9d562dd0b08af4b4d70f04db5d4a8e Mon Sep 17 00:00:00 2001 From: ohill <145173879+ohill@users.noreply.github.com> Date: Mon, 8 Apr 2024 10:23:37 -0400 Subject: [PATCH 2/4] Remove short check from TestPartitionHalfOffline --- .../features/partitionRecovery/partitionRecovery_test.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/e2e-go/features/partitionRecovery/partitionRecovery_test.go b/test/e2e-go/features/partitionRecovery/partitionRecovery_test.go index 14b3b82f6e..c35a8dfd02 100644 --- a/test/e2e-go/features/partitionRecovery/partitionRecovery_test.go +++ b/test/e2e-go/features/partitionRecovery/partitionRecovery_test.go @@ -231,9 +231,9 @@ func TestPartitionHalfOffline(t *testing.T) { partitiontest.PartitionTest(t) defer fixtures.ShutdownSynchronizedTest(t) - if testing.Short() { - t.Skip() - } + //if testing.Short() { + // t.Skip() + //} t.Parallel() a := require.New(fixtures.SynchronizedTest(t)) From aea9f9171233148863d9fa66589998c88dd794ed Mon Sep 17 00:00:00 2001 From: ohill <145173879+ohill@users.noreply.github.com> Date: Mon, 8 Apr 2024 11:16:40 -0400 Subject: [PATCH 3/4] Remove short check from TestSendSigsAfterCatchpointCatchup and set mem limit --- .../features/catchup/stateproofsCatchup_test.go | 12 +++++++++--- .../partitionRecovery/partitionRecovery_test.go | 2 +- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/test/e2e-go/features/catchup/stateproofsCatchup_test.go b/test/e2e-go/features/catchup/stateproofsCatchup_test.go index 9de6bf385c..6fdee988b3 100644 --- a/test/e2e-go/features/catchup/stateproofsCatchup_test.go +++ b/test/e2e-go/features/catchup/stateproofsCatchup_test.go @@ -204,9 +204,9 @@ func TestSendSigsAfterCatchpointCatchup(t *testing.T) { partitiontest.PartitionTest(t) defer fixtures.ShutdownSynchronizedTest(t) - if testing.Short() { - t.Skip() - } + // if testing.Short() { + // t.Skip() + // } a := require.New(fixtures.SynchronizedTest(t)) configurableConsensus := make(config.ConsensusProtocols) @@ -221,6 +221,12 @@ func TestSendSigsAfterCatchpointCatchup(t *testing.T) { var fixture fixtures.RestClientFixture fixture.SetConsensus(configurableConsensus) fixture.SetupNoStart(t, filepath.Join("nettemplates", "ThreeNodesWithRichAcct.json")) + for _, nodeDir := range fixture.NodeDataDirs() { + cfg, err := config.LoadConfigFromDisk(nodeDir) + a.NoError(err) + cfg.GoMemLimit = 4 * 1024 * 1024 * 1024 // 4GB + cfg.SaveToDisk(nodeDir) + } primaryNode, primaryNodeRestClient, primaryEC := startCatchpointGeneratingNode(a, &fixture, "Primary") defer primaryEC.Print() diff --git a/test/e2e-go/features/partitionRecovery/partitionRecovery_test.go b/test/e2e-go/features/partitionRecovery/partitionRecovery_test.go index c35a8dfd02..736f8e83dd 100644 --- a/test/e2e-go/features/partitionRecovery/partitionRecovery_test.go +++ b/test/e2e-go/features/partitionRecovery/partitionRecovery_test.go @@ -251,7 +251,7 @@ func TestPartitionHalfOffline(t *testing.T) { a.NoError(err) // adjust the refresh interval for one hour, so that we won't be reloading the participation key during this test. cfg.ParticipationKeysRefreshInterval = time.Hour - cfg.GoMemLimit = 1 * 1024 * 1024 * 1024 + cfg.GoMemLimit = 1 * 1024 * 1024 * 1024 // 1GB cfg.SaveToDisk(nodeDir) } fixture.Start() From e0cb8d0db7d6fed75199525816153004c3c346b8 Mon Sep 17 00:00:00 2001 From: ohill <145173879+ohill@users.noreply.github.com> Date: Mon, 8 Apr 2024 12:19:24 -0400 Subject: [PATCH 4/4] add back short checks to TestPartitionHalfOffline and TestSendSigsAfterCatchpointCatchup --- test/e2e-go/features/catchup/stateproofsCatchup_test.go | 6 +++--- .../features/partitionRecovery/partitionRecovery_test.go | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/test/e2e-go/features/catchup/stateproofsCatchup_test.go b/test/e2e-go/features/catchup/stateproofsCatchup_test.go index 6fdee988b3..5dcbc11452 100644 --- a/test/e2e-go/features/catchup/stateproofsCatchup_test.go +++ b/test/e2e-go/features/catchup/stateproofsCatchup_test.go @@ -204,9 +204,9 @@ func TestSendSigsAfterCatchpointCatchup(t *testing.T) { partitiontest.PartitionTest(t) defer fixtures.ShutdownSynchronizedTest(t) - // if testing.Short() { - // t.Skip() - // } + if testing.Short() { + t.Skip() + } a := require.New(fixtures.SynchronizedTest(t)) configurableConsensus := make(config.ConsensusProtocols) diff --git a/test/e2e-go/features/partitionRecovery/partitionRecovery_test.go b/test/e2e-go/features/partitionRecovery/partitionRecovery_test.go index 736f8e83dd..21ce3bdf0d 100644 --- a/test/e2e-go/features/partitionRecovery/partitionRecovery_test.go +++ b/test/e2e-go/features/partitionRecovery/partitionRecovery_test.go @@ -231,9 +231,9 @@ func TestPartitionHalfOffline(t *testing.T) { partitiontest.PartitionTest(t) defer fixtures.ShutdownSynchronizedTest(t) - //if testing.Short() { - // t.Skip() - //} + if testing.Short() { + t.Skip() + } t.Parallel() a := require.New(fixtures.SynchronizedTest(t))