From 99f17d66d79ccb772ed376f255bbdf79be320fac Mon Sep 17 00:00:00 2001 From: Nicolas Ruflin Date: Tue, 21 Feb 2017 10:40:48 +0100 Subject: [PATCH] Fix docker hanging when container killed (#3612) No timeout was passed to the docker client. It seems in case of a killed container it can happen that the connection is hanging. To interrupt this connection, the timeout from the metricset is passed to the client. That means in case info for a container cannot be fetched, it will timeout. This change requires that the docker module is not run with a timeout of 3s seconds, which indirectly means a period of 3s. The reason is that already the http request waits ~2s for the response. So if 1s is set as timeout, all requests will timeout. Further changes: * Containers without names will be ignored, as these are containers for which the data could not be fetched. * Period was set to 1s by default instead of the period as document. This was changed. * Add documentation node about minimal period. Closes https://github.com/elastic/beats/issues/3610 The issue with this PR was introduce in 5.2.1 by fixing the memory leak. Before go routines just piled up, but now they caused filebeat to hang. This needs also backport to 5.2.2 --- CHANGELOG.asciidoc | 2 ++ metricbeat/docs/modules/system.asciidoc | 3 +++ metricbeat/mb/builders.go | 5 +++++ metricbeat/mb/mb.go | 1 - metricbeat/mb/mb_test.go | 6 +++--- metricbeat/module/docker/cpu/cpu.go | 2 +- metricbeat/module/docker/diskio/diskio.go | 2 +- metricbeat/module/docker/docker.go | 21 ++++++++++++-------- metricbeat/module/docker/memory/memory.go | 2 +- metricbeat/module/docker/network/network.go | 2 +- metricbeat/module/system/_meta/docs.asciidoc | 3 +++ metricbeat/tests/system/test_docker.py | 16 +++++++-------- 12 files changed, 41 insertions(+), 24 deletions(-) diff --git a/CHANGELOG.asciidoc b/CHANGELOG.asciidoc index a50c3b2a341b..00cba187bf9e 100644 --- a/CHANGELOG.asciidoc +++ b/CHANGELOG.asciidoc @@ -49,6 +49,8 @@ https://github.com/elastic/beats/compare/v5.1.1...master[Check the HEAD diff] *Metricbeat* - Fix go routine leak in docker module. {pull}3492[3492] +- Fix bug docker module hanging when docker container killed. {issue}3610[3610] +- Set timeout to period instead of 1s by default as documented. *Packetbeat* diff --git a/metricbeat/docs/modules/system.asciidoc b/metricbeat/docs/modules/system.asciidoc index afbcb259e54f..bda7bfbeb449 100644 --- a/metricbeat/docs/modules/system.asciidoc +++ b/metricbeat/docs/modules/system.asciidoc @@ -49,6 +49,9 @@ metricbeat.modules: cpu_ticks: true ---- +It is strongly recommended to not run docker metricsets with a period smaller then 3 seconds. The request to the docker +API already takes up to 2s seconds. Otherwise all the requests would timeout and no data is reported. + [float] === Dashboard diff --git a/metricbeat/mb/builders.go b/metricbeat/mb/builders.go index 921ae5eec3d0..90213315f3da 100644 --- a/metricbeat/mb/builders.go +++ b/metricbeat/mb/builders.go @@ -89,6 +89,11 @@ func newBaseModuleFromConfig(rawConfig *common.Config) (BaseModule, error) { return baseModule, err } + // If timeout is not set, timeout is set to the same value as period + if baseModule.config.Timeout == 0 { + baseModule.config.Timeout = baseModule.config.Period + } + baseModule.name = strings.ToLower(baseModule.config.Module) err = mustNotContainDuplicates(baseModule.config.Hosts) diff --git a/metricbeat/mb/mb.go b/metricbeat/mb/mb.go index 4b1c36f3b07f..ce6c4abf9e96 100644 --- a/metricbeat/mb/mb.go +++ b/metricbeat/mb/mb.go @@ -183,7 +183,6 @@ func (c ModuleConfig) GoString() string { return c.String() } var defaultModuleConfig = ModuleConfig{ Enabled: true, Period: time.Second * 10, - Timeout: time.Second, } // DefaultModuleConfig returns a ModuleConfig with the default values populated. diff --git a/metricbeat/mb/mb_test.go b/metricbeat/mb/mb_test.go index 148e72be6481..498c6bdc64db 100644 --- a/metricbeat/mb/mb_test.go +++ b/metricbeat/mb/mb_test.go @@ -61,7 +61,7 @@ func TestModuleConfig(t *testing.T) { MetricSets: []string{"test"}, Enabled: true, Period: time.Second * 10, - Timeout: time.Second, + Timeout: 0, }, }, { @@ -127,7 +127,7 @@ func TestModuleConfigDefaults(t *testing.T) { assert.Equal(t, true, mc.Enabled) assert.Equal(t, time.Second*10, mc.Period) - assert.Equal(t, time.Second, mc.Timeout) + assert.Equal(t, time.Second*0, mc.Timeout) assert.Empty(t, mc.Hosts) } @@ -252,7 +252,7 @@ func TestNewBaseModuleFromModuleConfigStruct(t *testing.T) { assert.Equal(t, moduleName, baseModule.Config().Module) assert.Equal(t, true, baseModule.Config().Enabled) assert.Equal(t, time.Second*10, baseModule.Config().Period) - assert.Equal(t, time.Second, baseModule.Config().Timeout) + assert.Equal(t, time.Second*10, baseModule.Config().Timeout) assert.Empty(t, baseModule.Config().Hosts) } diff --git a/metricbeat/module/docker/cpu/cpu.go b/metricbeat/module/docker/cpu/cpu.go index f5932fba30d5..48eca5c0f161 100644 --- a/metricbeat/module/docker/cpu/cpu.go +++ b/metricbeat/module/docker/cpu/cpu.go @@ -44,7 +44,7 @@ func New(base mb.BaseMetricSet) (mb.MetricSet, error) { // Fetch returns a list of docker CPU stats. func (m *MetricSet) Fetch() ([]common.MapStr, error) { - stats, err := docker.FetchStats(m.dockerClient) + stats, err := docker.FetchStats(m.dockerClient, m.Module().Config().Timeout) if err != nil { return nil, err } diff --git a/metricbeat/module/docker/diskio/diskio.go b/metricbeat/module/docker/diskio/diskio.go index 15688b4fde40..dc0a423764ad 100644 --- a/metricbeat/module/docker/diskio/diskio.go +++ b/metricbeat/module/docker/diskio/diskio.go @@ -46,7 +46,7 @@ func New(base mb.BaseMetricSet) (mb.MetricSet, error) { // Fetch creates list of events with diskio stats for all containers. func (m *MetricSet) Fetch() ([]common.MapStr, error) { - stats, err := docker.FetchStats(m.dockerClient) + stats, err := docker.FetchStats(m.dockerClient, m.Module().Config().Timeout) if err != nil { return nil, err } diff --git a/metricbeat/module/docker/docker.go b/metricbeat/module/docker/docker.go index 79aa42988dae..9b2b5a1c3282 100644 --- a/metricbeat/module/docker/docker.go +++ b/metricbeat/module/docker/docker.go @@ -7,6 +7,8 @@ import ( "github.com/elastic/beats/metricbeat/mb" "github.com/elastic/beats/metricbeat/mb/parse" + "time" + "github.com/fsouza/go-dockerclient" ) @@ -58,7 +60,7 @@ func NewDockerClient(endpoint string, config Config) (*docker.Client, error) { } // FetchStats returns a list of running containers with all related stats inside -func FetchStats(client *docker.Client) ([]Stat, error) { +func FetchStats(client *docker.Client, timeout time.Duration) ([]Stat, error) { containers, err := client.ListContainers(docker.ListContainersOptions{}) if err != nil { return nil, err @@ -67,24 +69,27 @@ func FetchStats(client *docker.Client) ([]Stat, error) { var wg sync.WaitGroup containersList := make([]Stat, 0, len(containers)) - queue := make(chan Stat, 1) + statsQueue := make(chan Stat, 1) wg.Add(len(containers)) for _, container := range containers { go func(container docker.APIContainers) { defer wg.Done() - queue <- exportContainerStats(client, &container) + statsQueue <- exportContainerStats(client, &container, timeout) }(container) } go func() { wg.Wait() - close(queue) + close(statsQueue) }() // This will break after the queue has been drained and queue is closed. - for container := range queue { - containersList = append(containersList, container) + for stat := range statsQueue { + // If names is empty, there is not data inside + if len(stat.Container.Names) != 0 { + containersList = append(containersList, stat) + } } return containersList, err @@ -95,7 +100,7 @@ func FetchStats(client *docker.Client) ([]Stat, error) { // This is currently very inefficient as docker calculates the average for each request, // means each request will take at least 2s: https://github.com/docker/docker/blob/master/cli/command/container/stats_helpers.go#L148 // Getting all stats at once is implemented here: https://github.com/docker/docker/pull/25361 -func exportContainerStats(client *docker.Client, container *docker.APIContainers) Stat { +func exportContainerStats(client *docker.Client, container *docker.APIContainers, timeout time.Duration) Stat { var wg sync.WaitGroup var event Stat @@ -105,7 +110,7 @@ func exportContainerStats(client *docker.Client, container *docker.APIContainers ID: container.ID, Stats: statsC, Stream: false, - Timeout: -1, + Timeout: timeout, } wg.Add(2) diff --git a/metricbeat/module/docker/memory/memory.go b/metricbeat/module/docker/memory/memory.go index 192083f32d5c..f21fd1d1d93f 100644 --- a/metricbeat/module/docker/memory/memory.go +++ b/metricbeat/module/docker/memory/memory.go @@ -44,7 +44,7 @@ func New(base mb.BaseMetricSet) (mb.MetricSet, error) { // Fetch creates a list of memory events for each container. func (m *MetricSet) Fetch() ([]common.MapStr, error) { - stats, err := docker.FetchStats(m.dockerClient) + stats, err := docker.FetchStats(m.dockerClient, m.Module().Config().Timeout) if err != nil { return nil, err } diff --git a/metricbeat/module/docker/network/network.go b/metricbeat/module/docker/network/network.go index 5119cb19539a..7c3517b170c7 100644 --- a/metricbeat/module/docker/network/network.go +++ b/metricbeat/module/docker/network/network.go @@ -46,7 +46,7 @@ func New(base mb.BaseMetricSet) (mb.MetricSet, error) { // Fetch methods creates a list of network events for each container. func (m *MetricSet) Fetch() ([]common.MapStr, error) { - stats, err := docker.FetchStats(m.dockerClient) + stats, err := docker.FetchStats(m.dockerClient, m.Module().Config().Timeout) if err != nil { return nil, err } diff --git a/metricbeat/module/system/_meta/docs.asciidoc b/metricbeat/module/system/_meta/docs.asciidoc index a861aa9e5d52..22b08e32c547 100644 --- a/metricbeat/module/system/_meta/docs.asciidoc +++ b/metricbeat/module/system/_meta/docs.asciidoc @@ -44,6 +44,9 @@ metricbeat.modules: cpu_ticks: true ---- +It is strongly recommended to not run docker metricsets with a period smaller then 3 seconds. The request to the docker +API already takes up to 2s seconds. Otherwise all the requests would timeout and no data is reported. + [float] === Dashboard diff --git a/metricbeat/tests/system/test_docker.py b/metricbeat/tests/system/test_docker.py index f131653aa06a..b515f03eb681 100644 --- a/metricbeat/tests/system/test_docker.py +++ b/metricbeat/tests/system/test_docker.py @@ -15,7 +15,7 @@ def test_container_fields(self): "name": "docker", "metricsets": ["container"], "hosts": ["unix:///var/run/docker.sock"], - "period": "1s", + "period": "10s", }]) proc = self.start_beat() @@ -41,7 +41,7 @@ def test_cpu_fields(self): "name": "docker", "metricsets": ["cpu"], "hosts": ["unix:///var/run/docker.sock"], - "period": "1s" + "period": "10s" }]) proc = self.start_beat() @@ -71,7 +71,7 @@ def test_diskio_fields(self): "name": "docker", "metricsets": ["diskio"], "hosts": ["unix:///var/run/docker.sock"], - "period": "1s" + "period": "10s" }]) proc = self.start_beat() @@ -98,7 +98,7 @@ def test_info_fields(self): "name": "docker", "metricsets": ["info"], "hosts": ["unix:///var/run/docker.sock"], - "period": "1s" + "period": "10s" }]) proc = self.start_beat() @@ -123,7 +123,7 @@ def test_memory_fields(self): "name": "docker", "metricsets": ["memory"], "hosts": ["unix:///var/run/docker.sock"], - "period": "1s" + "period": "10s" }]) proc = self.start_beat() @@ -149,7 +149,7 @@ def test_network_fields(self): "name": "docker", "metricsets": ["network"], "hosts": ["unix:///var/run/docker.sock"], - "period": "1s" + "period": "10s" }]) proc = self.start_beat() @@ -175,7 +175,7 @@ def test_health_fields(self): "name": "docker", "metricsets": ["healthcheck"], "hosts": ["unix:///var/run/docker.sock"], - "period": "1s", + "period": "10s", }]) proc = self.start_beat() @@ -201,7 +201,7 @@ def test_image_fields(self): "name": "docker", "metricsets": ["image"], "hosts": ["unix:///var/run/docker.sock"], - "period": "1s", + "period": "10s", }]) proc = self.start_beat()