Skip to content

Commit 4e8128d

Browse files
committed
Fix docker hanging when container killed (elastic#3612)
No timeout was passed to the docker client. It seems in case of a killed container it can happen that the connection is hanging. To interrupt this connection, the timeout from the metricset is passed to the client. That means in case info for a container cannot be fetched, it will timeout. This change requires that the docker module is not run with a timeout of 3s seconds, which indirectly means a period of 3s. The reason is that already the http request waits ~2s for the response. So if 1s is set as timeout, all requests will timeout. Further changes: * Containers without names will be ignored, as these are containers for which the data could not be fetched. * Period was set to 1s by default instead of the period as document. This was changed. * Add documentation node about minimal period. Closes elastic#3610 The issue with this PR was introduce in 5.2.1 by fixing the memory leak. Before go routines just piled up, but now they caused filebeat to hang. This needs also backport to 5.2.2 (cherry picked from commit 99f17d6)
1 parent 76b78d3 commit 4e8128d

File tree

12 files changed

+42
-22
lines changed

12 files changed

+42
-22
lines changed

CHANGELOG.asciidoc

+4
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,10 @@ https://github.com/elastic/beats/compare/v5.2.0...master[Check the HEAD diff]
3434

3535
*Metricbeat*
3636

37+
- Fix go routine leak in docker module. {pull}3492[3492]
38+
- Fix bug docker module hanging when docker container killed. {issue}3610[3610]
39+
- Set timeout to period instead of 1s by default as documented.
40+
3741
*Packetbeat*
3842

3943
*Winlogbeat*

metricbeat/docs/modules/system.asciidoc

+3
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,9 @@ metricbeat.modules:
4848
cpu_ticks: true
4949
----
5050

51+
It is strongly recommended to not run docker metricsets with a period smaller then 3 seconds. The request to the docker
52+
API already takes up to 2s seconds. Otherwise all the requests would timeout and no data is reported.
53+
5154
[float]
5255
=== Dashboard
5356

metricbeat/mb/builders.go

+5
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,11 @@ func newBaseModuleFromConfig(rawConfig *common.Config) (BaseModule, error) {
8989
return baseModule, err
9090
}
9191

92+
// If timeout is not set, timeout is set to the same value as period
93+
if baseModule.config.Timeout == 0 {
94+
baseModule.config.Timeout = baseModule.config.Period
95+
}
96+
9297
baseModule.name = strings.ToLower(baseModule.config.Module)
9398

9499
err = mustNotContainDuplicates(baseModule.config.Hosts)

metricbeat/mb/mb.go

-1
Original file line numberDiff line numberDiff line change
@@ -183,7 +183,6 @@ func (c ModuleConfig) GoString() string { return c.String() }
183183
var defaultModuleConfig = ModuleConfig{
184184
Enabled: true,
185185
Period: time.Second * 10,
186-
Timeout: time.Second,
187186
}
188187

189188
// DefaultModuleConfig returns a ModuleConfig with the default values populated.

metricbeat/mb/mb_test.go

+3-3
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ func TestModuleConfig(t *testing.T) {
6161
MetricSets: []string{"test"},
6262
Enabled: true,
6363
Period: time.Second * 10,
64-
Timeout: time.Second,
64+
Timeout: 0,
6565
},
6666
},
6767
{
@@ -124,7 +124,7 @@ func TestModuleConfigDefaults(t *testing.T) {
124124

125125
assert.Equal(t, true, mc.Enabled)
126126
assert.Equal(t, time.Second*10, mc.Period)
127-
assert.Equal(t, time.Second, mc.Timeout)
127+
assert.Equal(t, time.Second*0, mc.Timeout)
128128
assert.Empty(t, mc.Hosts)
129129
}
130130

@@ -249,7 +249,7 @@ func TestNewBaseModuleFromModuleConfigStruct(t *testing.T) {
249249
assert.Equal(t, moduleName, baseModule.Config().Module)
250250
assert.Equal(t, true, baseModule.Config().Enabled)
251251
assert.Equal(t, time.Second*10, baseModule.Config().Period)
252-
assert.Equal(t, time.Second, baseModule.Config().Timeout)
252+
assert.Equal(t, time.Second*10, baseModule.Config().Timeout)
253253
assert.Empty(t, baseModule.Config().Hosts)
254254
}
255255

metricbeat/module/docker/cpu/cpu.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ func New(base mb.BaseMetricSet) (mb.MetricSet, error) {
4444

4545
// Fetch returns a list of docker CPU stats.
4646
func (m *MetricSet) Fetch() ([]common.MapStr, error) {
47-
stats, err := docker.FetchStats(m.dockerClient)
47+
stats, err := docker.FetchStats(m.dockerClient, m.Module().Config().Timeout)
4848
if err != nil {
4949
return nil, err
5050
}

metricbeat/module/docker/diskio/diskio.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ func New(base mb.BaseMetricSet) (mb.MetricSet, error) {
4646

4747
// Fetch creates list of events with diskio stats for all containers.
4848
func (m *MetricSet) Fetch() ([]common.MapStr, error) {
49-
stats, err := docker.FetchStats(m.dockerClient)
49+
stats, err := docker.FetchStats(m.dockerClient, m.Module().Config().Timeout)
5050
if err != nil {
5151
return nil, err
5252
}

metricbeat/module/docker/docker.go

+13-8
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@ import (
77
"github.com/elastic/beats/metricbeat/mb"
88
"github.com/elastic/beats/metricbeat/mb/parse"
99

10+
"time"
11+
1012
"github.com/fsouza/go-dockerclient"
1113
)
1214

@@ -58,7 +60,7 @@ func NewDockerClient(endpoint string, config Config) (*docker.Client, error) {
5860
}
5961

6062
// FetchStats returns a list of running containers with all related stats inside
61-
func FetchStats(client *docker.Client) ([]Stat, error) {
63+
func FetchStats(client *docker.Client, timeout time.Duration) ([]Stat, error) {
6264
containers, err := client.ListContainers(docker.ListContainersOptions{})
6365
if err != nil {
6466
return nil, err
@@ -67,24 +69,27 @@ func FetchStats(client *docker.Client) ([]Stat, error) {
6769
var wg sync.WaitGroup
6870

6971
containersList := make([]Stat, 0, len(containers))
70-
queue := make(chan Stat, 1)
72+
statsQueue := make(chan Stat, 1)
7173
wg.Add(len(containers))
7274

7375
for _, container := range containers {
7476
go func(container docker.APIContainers) {
7577
defer wg.Done()
76-
queue <- exportContainerStats(client, &container)
78+
statsQueue <- exportContainerStats(client, &container, timeout)
7779
}(container)
7880
}
7981

8082
go func() {
8183
wg.Wait()
82-
close(queue)
84+
close(statsQueue)
8385
}()
8486

8587
// This will break after the queue has been drained and queue is closed.
86-
for container := range queue {
87-
containersList = append(containersList, container)
88+
for stat := range statsQueue {
89+
// If names is empty, there is not data inside
90+
if len(stat.Container.Names) != 0 {
91+
containersList = append(containersList, stat)
92+
}
8893
}
8994

9095
return containersList, err
@@ -95,7 +100,7 @@ func FetchStats(client *docker.Client) ([]Stat, error) {
95100
// This is currently very inefficient as docker calculates the average for each request,
96101
// means each request will take at least 2s: https://github.com/docker/docker/blob/master/cli/command/container/stats_helpers.go#L148
97102
// Getting all stats at once is implemented here: https://github.com/docker/docker/pull/25361
98-
func exportContainerStats(client *docker.Client, container *docker.APIContainers) Stat {
103+
func exportContainerStats(client *docker.Client, container *docker.APIContainers, timeout time.Duration) Stat {
99104
var wg sync.WaitGroup
100105
var event Stat
101106

@@ -105,7 +110,7 @@ func exportContainerStats(client *docker.Client, container *docker.APIContainers
105110
ID: container.ID,
106111
Stats: statsC,
107112
Stream: false,
108-
Timeout: -1,
113+
Timeout: timeout,
109114
}
110115

111116
wg.Add(2)

metricbeat/module/docker/memory/memory.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ func New(base mb.BaseMetricSet) (mb.MetricSet, error) {
4444

4545
// Fetch creates a list of memory events for each container.
4646
func (m *MetricSet) Fetch() ([]common.MapStr, error) {
47-
stats, err := docker.FetchStats(m.dockerClient)
47+
stats, err := docker.FetchStats(m.dockerClient, m.Module().Config().Timeout)
4848
if err != nil {
4949
return nil, err
5050
}

metricbeat/module/docker/network/network.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ func New(base mb.BaseMetricSet) (mb.MetricSet, error) {
4646

4747
// Fetch methods creates a list of network events for each container.
4848
func (m *MetricSet) Fetch() ([]common.MapStr, error) {
49-
stats, err := docker.FetchStats(m.dockerClient)
49+
stats, err := docker.FetchStats(m.dockerClient, m.Module().Config().Timeout)
5050
if err != nil {
5151
return nil, err
5252
}

metricbeat/module/system/_meta/docs.asciidoc

+3
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,9 @@ metricbeat.modules:
4343
cpu_ticks: true
4444
----
4545

46+
It is strongly recommended to not run docker metricsets with a period smaller then 3 seconds. The request to the docker
47+
API already takes up to 2s seconds. Otherwise all the requests would timeout and no data is reported.
48+
4649
[float]
4750
=== Dashboard
4851

metricbeat/tests/system/test_docker.py

+7-6
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ def test_container_fields(self):
1414
"name": "docker",
1515
"metricsets": ["container"],
1616
"hosts": ["unix:///var/run/docker.sock"],
17-
"period": "1s",
17+
"period": "10s",
1818
}])
1919

2020
proc = self.start_beat()
@@ -40,7 +40,7 @@ def test_cpu_fields(self):
4040
"name": "docker",
4141
"metricsets": ["cpu"],
4242
"hosts": ["unix:///var/run/docker.sock"],
43-
"period": "1s"
43+
"period": "10s"
4444
}])
4545

4646
proc = self.start_beat()
@@ -70,7 +70,7 @@ def test_diskio_fields(self):
7070
"name": "docker",
7171
"metricsets": ["diskio"],
7272
"hosts": ["unix:///var/run/docker.sock"],
73-
"period": "1s"
73+
"period": "10s"
7474
}])
7575

7676
proc = self.start_beat()
@@ -97,7 +97,7 @@ def test_info_fields(self):
9797
"name": "docker",
9898
"metricsets": ["info"],
9999
"hosts": ["unix:///var/run/docker.sock"],
100-
"period": "1s"
100+
"period": "10s"
101101
}])
102102

103103
proc = self.start_beat()
@@ -122,7 +122,7 @@ def test_memory_fields(self):
122122
"name": "docker",
123123
"metricsets": ["memory"],
124124
"hosts": ["unix:///var/run/docker.sock"],
125-
"period": "1s"
125+
"period": "10s"
126126
}])
127127

128128
proc = self.start_beat()
@@ -148,7 +148,7 @@ def test_network_fields(self):
148148
"name": "docker",
149149
"metricsets": ["network"],
150150
"hosts": ["unix:///var/run/docker.sock"],
151-
"period": "1s"
151+
"period": "10s"
152152
}])
153153

154154
proc = self.start_beat()
@@ -157,6 +157,7 @@ def test_network_fields(self):
157157

158158
# Ensure no errors or warnings exist in the log.
159159
log = self.get_log()
160+
160161
self.assertNotRegexpMatches(log.replace("WARN EXPERIMENTAL", ""), "ERR|WARN")
161162

162163
output = self.read_output_json()

0 commit comments

Comments
 (0)