Skip to content

Commit

Permalink
Add runtime stats logger and pprof
Browse files Browse the repository at this point in the history
Add pprof to agent introspection endpoint

fixup! Add runtime stats logger
  • Loading branch information
angelcar committed Sep 1, 2021
1 parent d63942d commit 5d3232b
Show file tree
Hide file tree
Showing 14 changed files with 273 additions and 5 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,7 @@ additional details on each available environment variable.
| `ECS_LOG_OPTS` | `{"option":"value"}` | The options for configuring the logging driver set in `ECS_LOG_DRIVER`. | `{}` | Not applicable |
| `ECS_ENABLE_AWSLOGS_EXECUTIONROLE_OVERRIDE` | `true` | Whether to enable awslogs log driver to authenticate via credentials of task execution IAM role. Needs to be true if you want to use awslogs log driver in a task that has task execution IAM role specified. When using the ecs-init RPM with version equal or later than V1.16.0-1, this env is set to true by default. | `false` | `false` |
| `ECS_FSX_WINDOWS_FILE_SERVER_SUPPORTED` | `true` | Whether FSx for Windows File Server volume type is supported on the container instance. This variable is only supported on agent versions 1.47.0 and later. | `false` | `true` |
| `ECS_ENABLE_RUNTIME_STATS` | `true` | Determines if [pprof](https://pkg.go.dev/net/http/pprof) is enabled for the agent. If enabled, the different profiles can be accessed through the agent's introspection port (e.g. `curl http://localhost:51678/debug/pprof/heap > heap.pprof`). In addition, agent's [runtime stats](https://pkg.go.dev/runtime#ReadMemStats) are logged to `/var/log/ecs/runtime-stats.log` file. | `false` | `false` |

### Persistence

Expand Down
6 changes: 6 additions & 0 deletions agent/app/agent.go
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,8 @@ type agent interface {
start() int
// setTerminationHandler sets the termination handler
setTerminationHandler(sighandlers.TerminationHandler)
// getConfig gets the agent configuration
getConfig() *config.Config
}

// ecsAgent wraps all the entities needed to start the ECS Agent execution.
Expand Down Expand Up @@ -216,6 +218,10 @@ func newAgent(blackholeEC2Metadata bool, acceptInsecureCert *bool) (agent, error
}, nil
}

func (agent *ecsAgent) getConfig() *config.Config {
return agent.cfg
}

// printECSAttributes prints the Agent's ECS Attributes based on its
// environment
func (agent *ecsAgent) printECSAttributes() int {
Expand Down
5 changes: 3 additions & 2 deletions agent/app/agent_windows_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,8 +47,9 @@ func (m *mockAgent) start() int {
func (m *mockAgent) setTerminationHandler(handler sighandlers.TerminationHandler) {
m.terminationHandler = handler
}
func (m *mockAgent) printECSAttributes() int { return 0 }
func (m *mockAgent) startWindowsService() int { return 0 }
func (m *mockAgent) printECSAttributes() int { return 0 }
func (m *mockAgent) startWindowsService() int { return 0 }
func (m *mockAgent) getConfig() *config.Config { return &config.Config{} }

func TestHandler_RunAgent_StartExitImmediately(t *testing.T) {
// register some mocks, but nothing should get called on any of them
Expand Down
4 changes: 4 additions & 0 deletions agent/app/run.go
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,10 @@ func Run(arguments []string) int {
return exitcodes.ExitError
}

if agent.getConfig().EnableRuntimeStats.Enabled() {
defer logger.StartRuntimeStatsLogger(agent.getConfig())()
}

switch {
case *parsedArgs.ECSAttributes:
// Print agent's ecs attributes based on its environment and exit
Expand Down
1 change: 1 addition & 0 deletions agent/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -590,6 +590,7 @@ func environmentConfig() (Config, error) {
VolumePluginCapabilities: parseVolumePluginCapabilities(),
FSxWindowsFileServerCapable: parseFSxWindowsFileServerCapability(),
External: parseBooleanDefaultFalseConfig("ECS_EXTERNAL"),
EnableRuntimeStats: parseBooleanDefaultFalseConfig("ECS_ENABLE_RUNTIME_STATS"),
}, err
}

Expand Down
10 changes: 10 additions & 0 deletions agent/config/config_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,7 @@ func TestEnvironmentConfig(t *testing.T) {
defer setTestEnv("ECS_POLLING_METRICS_WAIT_DURATION", "10s")()
defer setTestEnv("ECS_CGROUP_CPU_PERIOD", "")
defer setTestEnv("ECS_PULL_DEPENDENT_CONTAINERS_UPFRONT", "true")()
defer setTestEnv("ECS_ENABLE_RUNTIME_STATS", "true")()
additionalLocalRoutesJSON := `["1.2.3.4/22","5.6.7.8/32"]`
setTestEnv("ECS_AWSVPC_ADDITIONAL_LOCAL_ROUTES", additionalLocalRoutesJSON)
setTestEnv("ECS_ENABLE_CONTAINER_METADATA", "true")
Expand Down Expand Up @@ -212,6 +213,7 @@ func TestEnvironmentConfig(t *testing.T) {
assert.False(t, conf.SpotInstanceDrainingEnabled.Enabled())
assert.Equal(t, []string{"efsAuth"}, conf.VolumePluginCapabilities)
assert.True(t, conf.DependentContainersPullUpfront.Enabled(), "Wrong value for DependentContainersPullUpfront")
assert.True(t, conf.EnableRuntimeStats.Enabled(), "Wrong value for EnableRuntimeStats")
}

func TestTrimWhitespaceWhenCreating(t *testing.T) {
Expand Down Expand Up @@ -597,6 +599,14 @@ func TestSharedVolumeMatchFullConfigEnabled(t *testing.T) {
assert.True(t, cfg.SharedVolumeMatchFullConfig.Enabled(), "Wrong value for SharedVolumeMatchFullConfig")
}

func TestEnableRuntimeStatsConfigEnabled(t *testing.T) {
defer setTestRegion()()
defer setTestEnv("ECS_ENABLE_RUNTIME_STATS", "true")()
cfg, err := NewConfig(ec2.NewBlackholeEC2MetadataClient())
assert.NoError(t, err)
assert.True(t, cfg.EnableRuntimeStats.Enabled(), "Wrong value for EnableRuntimeStats")
}

func TestParseImagePullBehavior(t *testing.T) {
testcases := []struct {
name string
Expand Down
6 changes: 6 additions & 0 deletions agent/config/config_unix.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,10 @@ const (
AgentCredentialsAddress = "" // this is left blank right now for net=bridge
// defaultAuditLogFile specifies the default audit log filename
defaultCredentialsAuditLogFile = "/log/audit.log"

// defaultRuntimeStatsLogFile stores the path where the golang runtime stats are periodically logged
defaultRuntimeStatsLogFile = `/log/agent-runtime-stats.log`

// DefaultTaskCgroupPrefix is default cgroup prefix for ECS tasks
DefaultTaskCgroupPrefix = "/ecs"

Expand Down Expand Up @@ -91,6 +95,8 @@ func DefaultConfig() Config {
CgroupCPUPeriod: defaultCgroupCPUPeriod,
GMSACapable: false,
FSxWindowsFileServerCapable: false,
RuntimeStatsLogFile: defaultRuntimeStatsLogFile,
EnableRuntimeStats: BooleanDefaultFalse{Value: NotSet},
}
}

Expand Down
1 change: 1 addition & 0 deletions agent/config/config_unix_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ func TestConfigDefault(t *testing.T) {
assert.Equal(t, DefaultImagePullTimeout, cfg.ImagePullTimeout, "Default ImagePullTimeout set incorrectly")
assert.False(t, cfg.DependentContainersPullUpfront.Enabled(), "Default DependentContainersPullUpfront set incorrectly")
assert.False(t, cfg.PollMetrics.Enabled(), "ECS_POLL_METRICS default should be false")
assert.False(t, cfg.EnableRuntimeStats.Enabled(), "Default EnableRuntimeStats set incorrectly")
}

// TestConfigFromFile tests the configuration can be read from file
Expand Down
6 changes: 6 additions & 0 deletions agent/config/config_windows.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,10 @@ const (

// defaultAuditLogFile specifies the default audit log filename
defaultCredentialsAuditLogFile = `log\audit.log`

// defaultRuntimeStatsLogFile stores the path where the golang runtime stats are periodically logged
defaultRuntimeStatsLogFile = `log\agent-runtime-stats.log`

// When using IAM roles for tasks on Windows, the credential proxy consumes port 80
httpPort = 80
// Remote Desktop / Terminal Services
Expand Down Expand Up @@ -130,6 +134,8 @@ func DefaultConfig() Config {
PauseContainerImageName: DefaultPauseContainerImageName,
PauseContainerTag: DefaultPauseContainerTag,
CNIPluginsPath: filepath.Join(ecsBinaryDir, defaultCNIPluginDirName),
RuntimeStatsLogFile: filepath.Join(ecsRoot, defaultRuntimeStatsLogFile),
EnableRuntimeStats: BooleanDefaultFalse{Value: NotSet},
}
}

Expand Down
1 change: 1 addition & 0 deletions agent/config/config_windows_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ func TestConfigDefault(t *testing.T) {
assert.False(t, cfg.SharedVolumeMatchFullConfig.Enabled(), "Default SharedVolumeMatchFullConfig set incorrectly")
assert.Equal(t, DefaultImagePullTimeout, cfg.ImagePullTimeout, "Default ImagePullTimeout set incorrectly")
assert.False(t, cfg.DependentContainersPullUpfront.Enabled(), "Default DependentContainersPullUpfront set incorrectly")
assert.False(t, cfg.EnableRuntimeStats.Enabled(), "Default EnableRuntimeStats set incorrectly")
}

func TestConfigIAMTaskRolesReserves80(t *testing.T) {
Expand Down
7 changes: 7 additions & 0 deletions agent/config/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -342,4 +342,11 @@ type Config struct {
// InstanceENIDNSServerList stores the list of DNS servers for the primary instance ENI.
// Currently, this field is only populated for Windows and is used during task networking setup.
InstanceENIDNSServerList []string

// RuntimeStatsLogFile stores the path where the golang runtime stats are periodically logged
RuntimeStatsLogFile string

// EnableRuntimeStats specifies if pprof should be enabled through the agent introspection port. By default, this configuration
// is set to false and can be overridden by means of the ECS_ENABLE_RUNTIME_STATS environment variable.
EnableRuntimeStats BooleanDefaultFalse
}
45 changes: 44 additions & 1 deletion agent/handlers/introspection_server_setup.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ import (
"context"
"encoding/json"
"net/http"
"net/http/pprof"
"strconv"
"time"

Expand All @@ -33,8 +34,34 @@ type rootResponse struct {
AvailableCommands []string
}

const (
// With pprof we need to increase the timeout so that there is enough time to do the profiling. Since the profiling
// time window for CPU is configurable in the request, this timeout effectively means the CPU profiling will be
// capped to 5 min.
writeTimeoutForPprof = time.Minute * 5
pprofBasePath = "/debug/pprof/"
pprofCMDLinePath = pprofBasePath + "cmdline"
pprofProfilePath = pprofBasePath + "profile"
pprofSymbolPath = pprofBasePath + "symbol"
pprofTracePath = pprofBasePath + "trace"
)

var (
// Injection points for testing
pprofIndexHandler = pprof.Index
pprofCmdlineHandler = pprof.Cmdline
pprofProfileHandler = pprof.Profile
pprofSymbolHandler = pprof.Symbol
pprofTraceHandler = pprof.Trace
)

func introspectionServerSetup(containerInstanceArn *string, taskEngine handlersutils.DockerStateResolver, cfg *config.Config) *http.Server {
paths := []string{v1.AgentMetadataPath, v1.TaskContainerMetadataPath, v1.LicensePath}

if cfg.EnableRuntimeStats.Enabled() {
paths = append(paths, pprofBasePath, pprofCMDLinePath, pprofProfilePath, pprofSymbolPath, pprofTracePath)
}

availableCommands := &rootResponse{paths}
// Autogenerated list of the above serverFunctions paths
availableCommandResponse, err := json.Marshal(&availableCommands)
Expand All @@ -50,16 +77,21 @@ func introspectionServerSetup(containerInstanceArn *string, taskEngine handlersu
serverMux.HandleFunc("/", defaultHandler)

v1HandlersSetup(serverMux, containerInstanceArn, taskEngine, cfg)
pprofHandlerSetup(serverMux, cfg)

// Log all requests and then pass through to serverMux
loggingServeMux := http.NewServeMux()
loggingServeMux.Handle("/", LoggingHandler{serverMux})

wTimeout := writeTimeout
if cfg.EnableRuntimeStats.Enabled() {
wTimeout = writeTimeoutForPprof
}
server := &http.Server{
Addr: ":" + strconv.Itoa(config.AgentIntrospectionPort),
Handler: loggingServeMux,
ReadTimeout: readTimeout,
WriteTimeout: writeTimeout,
WriteTimeout: wTimeout,
}

return server
Expand All @@ -75,6 +107,17 @@ func v1HandlersSetup(serverMux *http.ServeMux,
serverMux.HandleFunc(v1.LicensePath, v1.LicenseHandler)
}

func pprofHandlerSetup(serverMux *http.ServeMux, cfg *config.Config) {
if !cfg.EnableRuntimeStats.Enabled() {
return
}
serverMux.HandleFunc(pprofBasePath, pprofIndexHandler)
serverMux.HandleFunc(pprofCMDLinePath, pprofCmdlineHandler)
serverMux.HandleFunc(pprofProfilePath, pprofProfileHandler)
serverMux.HandleFunc(pprofSymbolPath, pprofSymbolHandler)
serverMux.HandleFunc(pprofTracePath, pprofTraceHandler)
}

// ServeIntrospectionHTTPEndpoint serves information about this agent/containerInstance and tasks
// running on it. "V1" here indicates the hostname version of this server instead
// of the handler versions, i.e. "V1" server can include "V1" and "V2" handlers.
Expand Down
77 changes: 75 additions & 2 deletions agent/handlers/introspection_server_setup_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
"net/http"
"net/http/httptest"
"strconv"
"strings"
"testing"

apicontainer "github.com/aws/amazon-ecs-agent/agent/api/container"
Expand All @@ -42,6 +43,8 @@ const (
eniIPV4Address = "10.0.0.2"
)

var runtimeStatsConfigForTest = config.BooleanDefaultFalse{}

func TestMetadataHandler(t *testing.T) {
metadataHandler := v1.AgentMetadataHandler(utils.Strptr(testContainerInstanceArn), &config.Config{Cluster: testClusterArn})

Expand Down Expand Up @@ -239,6 +242,70 @@ func TestBackendMismatchMapping(t *testing.T) {
}
}

func setupMockPprofHandlers() func() {
runtimeStatsConfigForTestBkp := runtimeStatsConfigForTest
pprofIndexHandlerBkp := pprofIndexHandler
pprofCmdlineHandlerBkp := pprofCmdlineHandler
pprofProfileHandlerBkp := pprofProfileHandler
pprofSymbolHandlerBkp := pprofSymbolHandler
pprofTraceHandlerBkp := pprofTraceHandler

mockPprofTestHandler := func(w http.ResponseWriter, r *http.Request) {
w.Write([]byte(r.URL.Path))
}
pprofIndexHandler = mockPprofTestHandler
pprofCmdlineHandler = mockPprofTestHandler
pprofProfileHandler = mockPprofTestHandler
pprofSymbolHandler = mockPprofTestHandler
pprofTraceHandler = mockPprofTestHandler

return func() {
runtimeStatsConfigForTest = runtimeStatsConfigForTestBkp
pprofIndexHandler = pprofIndexHandlerBkp
pprofCmdlineHandler = pprofCmdlineHandlerBkp
pprofProfileHandler = pprofProfileHandlerBkp
pprofSymbolHandler = pprofSymbolHandlerBkp
pprofTraceHandler = pprofTraceHandlerBkp
}
}

func TestPProfHandlerSetup(t *testing.T) {
pprofPaths := []string{
"/debug/pprof/",
"/debug/pprof/cmdline",
"/debug/pprof/profile",
"/debug/pprof/symbol",
"/debug/pprof/trace",
}

testCases := []struct {
runtimeStatsEnabled config.Conditional
paths []string
}{
{runtimeStatsEnabled: config.ExplicitlyDisabled, paths: pprofPaths},
{runtimeStatsEnabled: config.ExplicitlyEnabled, paths: pprofPaths},
}

for _, tc := range testCases {
runtimeStatsConfigForTest = config.BooleanDefaultFalse{Value: tc.runtimeStatsEnabled}
for _, p := range tc.paths {
t.Run(p+"-"+strconv.FormatBool(runtimeStatsConfigForTest.Enabled()), func(t *testing.T) {
defer setupMockPprofHandlers()()

recorder := performMockRequest(t, p)
if runtimeStatsConfigForTest.Enabled() {
assert.Equal(t, http.StatusOK, recorder.Code)
assert.Equal(t, p, recorder.Body.String())
} else {
assert.Equal(t, http.StatusOK, recorder.Code)
assert.Equal(t, `{"AvailableCommands":["/v1/metadata","/v1/tasks","/license"]}`, recorder.Body.String())

}
})
}
}
}

func taskDiffHelper(t *testing.T, expected []*apitask.Task, actual v1.TasksResponse) {
if len(expected) != len(actual.Tasks) {
t.Errorf("Expected %v tasks, had %v tasks", len(expected), len(actual.Tasks))
Expand Down Expand Up @@ -409,8 +476,14 @@ func performMockRequest(t *testing.T, path string) *httptest.ResponseRecorder {
state := dockerstate.NewTaskEngineState()
stateSetupHelper(state, testTasks)

mockStateResolver.EXPECT().State().Return(state)
requestHandler := introspectionServerSetup(utils.Strptr(testContainerInstanceArn), mockStateResolver, &config.Config{Cluster: testClusterArn})
if !strings.HasPrefix(path, pprofBasePath) {
mockStateResolver.EXPECT().State().Return(state)
}

requestHandler := introspectionServerSetup(utils.Strptr(testContainerInstanceArn), mockStateResolver, &config.Config{
Cluster: testClusterArn,
EnableRuntimeStats: runtimeStatsConfigForTest,
})

recorder := httptest.NewRecorder()
req, _ := http.NewRequest("GET", path, nil)
Expand Down
Loading

0 comments on commit 5d3232b

Please sign in to comment.