Skip to content

Commit 712dd62

Browse files
author
Vincent Demeester
authored
Merge pull request moby#37234 from kolyshkin/plugin-panic
Fix daemon panic on restart when a plugin is running
2 parents fe3d023 + dbeb432 commit 712dd62

File tree

5 files changed

+199
-55
lines changed

5 files changed

+199
-55
lines changed

plugin/executor/containerd/containerd.go

+19-26
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,19 @@ type Executor struct {
5858
exitHandler ExitHandler
5959
}
6060

61+
// deleteTaskAndContainer deletes plugin task and then plugin container from containerd
62+
func deleteTaskAndContainer(ctx context.Context, cli Client, id string) {
63+
_, _, err := cli.DeleteTask(ctx, id)
64+
if err != nil && !errdefs.IsNotFound(err) {
65+
logrus.WithError(err).WithField("id", id).Error("failed to delete plugin task from containerd")
66+
}
67+
68+
err = cli.Delete(ctx, id)
69+
if err != nil && !errdefs.IsNotFound(err) {
70+
logrus.WithError(err).WithField("id", id).Error("failed to delete plugin container from containerd")
71+
}
72+
}
73+
6174
// Create creates a new container
6275
func (e *Executor) Create(id string, spec specs.Spec, stdout, stderr io.WriteCloser) error {
6376
opts := runctypes.RuncOptions{
@@ -87,34 +100,21 @@ func (e *Executor) Create(id string, spec specs.Spec, stdout, stderr io.WriteClo
87100

88101
_, err = e.client.Start(ctx, id, "", false, attachStreamsFunc(stdout, stderr))
89102
if err != nil {
90-
if _, _, err2 := e.client.DeleteTask(ctx, id); err2 != nil && !errdefs.IsNotFound(err2) {
91-
logrus.WithError(err2).WithField("id", id).Warn("Received an error while attempting to clean up containerd plugin task after failed start")
92-
}
93-
if err2 := e.client.Delete(ctx, id); err2 != nil && !errdefs.IsNotFound(err2) {
94-
logrus.WithError(err2).WithField("id", id).Warn("Received an error while attempting to clean up containerd plugin container after failed start")
95-
}
103+
deleteTaskAndContainer(ctx, e.client, id)
96104
}
97105
return err
98106
}
99107

100108
// Restore restores a container
101-
func (e *Executor) Restore(id string, stdout, stderr io.WriteCloser) error {
109+
func (e *Executor) Restore(id string, stdout, stderr io.WriteCloser) (bool, error) {
102110
alive, _, err := e.client.Restore(context.Background(), id, attachStreamsFunc(stdout, stderr))
103111
if err != nil && !errdefs.IsNotFound(err) {
104-
return err
112+
return false, err
105113
}
106114
if !alive {
107-
_, _, err = e.client.DeleteTask(context.Background(), id)
108-
if err != nil && !errdefs.IsNotFound(err) {
109-
logrus.WithError(err).Errorf("failed to delete container plugin %s task from containerd", id)
110-
}
111-
112-
err = e.client.Delete(context.Background(), id)
113-
if err != nil && !errdefs.IsNotFound(err) {
114-
logrus.WithError(err).Errorf("failed to delete container plugin %s from containerd", id)
115-
}
115+
deleteTaskAndContainer(context.Background(), e.client, id)
116116
}
117-
return nil
117+
return alive, nil
118118
}
119119

120120
// IsRunning returns if the container with the given id is running
@@ -133,14 +133,7 @@ func (e *Executor) Signal(id string, signal int) error {
133133
func (e *Executor) ProcessEvent(id string, et libcontainerd.EventType, ei libcontainerd.EventInfo) error {
134134
switch et {
135135
case libcontainerd.EventExit:
136-
// delete task and container
137-
if _, _, err := e.client.DeleteTask(context.Background(), id); err != nil {
138-
logrus.WithError(err).Errorf("failed to delete container plugin %s task from containerd", id)
139-
}
140-
141-
if err := e.client.Delete(context.Background(), id); err != nil {
142-
logrus.WithError(err).Errorf("failed to delete container plugin %s from containerd", id)
143-
}
136+
deleteTaskAndContainer(context.Background(), e.client, id)
144137
return e.exitHandler.HandleExitEvent(ei.ContainerID)
145138
}
146139
return nil

plugin/manager.go

+14-8
Original file line numberDiff line numberDiff line change
@@ -37,14 +37,14 @@ var validFullID = regexp.MustCompile(`^([a-f0-9]{64})$`)
3737
// Executor is the interface that the plugin manager uses to interact with for starting/stopping plugins
3838
type Executor interface {
3939
Create(id string, spec specs.Spec, stdout, stderr io.WriteCloser) error
40-
Restore(id string, stdout, stderr io.WriteCloser) error
4140
IsRunning(id string) (bool, error)
41+
Restore(id string, stdout, stderr io.WriteCloser) (alive bool, err error)
4242
Signal(id string, signal int) error
4343
}
4444

45-
func (pm *Manager) restorePlugin(p *v2.Plugin) error {
45+
func (pm *Manager) restorePlugin(p *v2.Plugin, c *controller) error {
4646
if p.IsEnabled() {
47-
return pm.restore(p)
47+
return pm.restore(p, c)
4848
}
4949
return nil
5050
}
@@ -143,12 +143,15 @@ func (pm *Manager) HandleExitEvent(id string) error {
143143
return err
144144
}
145145

146-
os.RemoveAll(filepath.Join(pm.config.ExecRoot, id))
146+
if err := os.RemoveAll(filepath.Join(pm.config.ExecRoot, id)); err != nil && !os.IsNotExist(err) {
147+
logrus.WithError(err).WithField("id", id).Error("Could not remove plugin bundle dir")
148+
}
147149

148150
pm.mu.RLock()
149151
c := pm.cMap[p]
150152
if c.exitChan != nil {
151153
close(c.exitChan)
154+
c.exitChan = nil // ignore duplicate events (containerd issue #2299)
152155
}
153156
restart := c.restart
154157
pm.mu.RUnlock()
@@ -205,12 +208,15 @@ func (pm *Manager) reload() error { // todo: restore
205208
var wg sync.WaitGroup
206209
wg.Add(len(plugins))
207210
for _, p := range plugins {
208-
c := &controller{} // todo: remove this
211+
c := &controller{exitChan: make(chan bool)}
212+
pm.mu.Lock()
209213
pm.cMap[p] = c
214+
pm.mu.Unlock()
215+
210216
go func(p *v2.Plugin) {
211217
defer wg.Done()
212-
if err := pm.restorePlugin(p); err != nil {
213-
logrus.Errorf("failed to restore plugin '%s': %s", p.Name(), err)
218+
if err := pm.restorePlugin(p, c); err != nil {
219+
logrus.WithError(err).WithField("id", p.GetID()).Error("Failed to restore plugin")
214220
return
215221
}
216222

@@ -248,7 +254,7 @@ func (pm *Manager) reload() error { // todo: restore
248254
if requiresManualRestore {
249255
// if liveRestore is not enabled, the plugin will be stopped now so we should enable it
250256
if err := pm.enable(p, c, true); err != nil {
251-
logrus.Errorf("failed to enable plugin '%s': %s", p.Name(), err)
257+
logrus.WithError(err).WithField("id", p.GetID()).Error("failed to enable plugin")
252258
}
253259
}
254260
}(p)

plugin/manager_linux.go

+17-12
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@ func (pm *Manager) pluginPostStart(p *v2.Plugin, c *controller) error {
7979
client, err := plugins.NewClientWithTimeout(addr.Network()+"://"+addr.String(), nil, p.Timeout())
8080
if err != nil {
8181
c.restart = false
82-
shutdownPlugin(p, c, pm.executor)
82+
shutdownPlugin(p, c.exitChan, pm.executor)
8383
return errors.WithStack(err)
8484
}
8585

@@ -106,7 +106,7 @@ func (pm *Manager) pluginPostStart(p *v2.Plugin, c *controller) error {
106106
c.restart = false
107107
// While restoring plugins, we need to explicitly set the state to disabled
108108
pm.config.Store.SetState(p, false)
109-
shutdownPlugin(p, c, pm.executor)
109+
shutdownPlugin(p, c.exitChan, pm.executor)
110110
return err
111111
}
112112

@@ -117,16 +117,15 @@ func (pm *Manager) pluginPostStart(p *v2.Plugin, c *controller) error {
117117
return pm.save(p)
118118
}
119119

120-
func (pm *Manager) restore(p *v2.Plugin) error {
120+
func (pm *Manager) restore(p *v2.Plugin, c *controller) error {
121121
stdout, stderr := makeLoggerStreams(p.GetID())
122-
if err := pm.executor.Restore(p.GetID(), stdout, stderr); err != nil {
122+
alive, err := pm.executor.Restore(p.GetID(), stdout, stderr)
123+
if err != nil {
123124
return err
124125
}
125126

126127
if pm.config.LiveRestoreEnabled {
127-
c := &controller{}
128-
if isRunning, _ := pm.executor.IsRunning(p.GetID()); !isRunning {
129-
// plugin is not running, so follow normal startup procedure
128+
if !alive {
130129
return pm.enable(p, c, true)
131130
}
132131

@@ -138,26 +137,32 @@ func (pm *Manager) restore(p *v2.Plugin) error {
138137
return pm.pluginPostStart(p, c)
139138
}
140139

140+
if alive {
141+
// TODO(@cpuguy83): Should we always just re-attach to the running plugin instead of doing this?
142+
c.restart = false
143+
shutdownPlugin(p, c.exitChan, pm.executor)
144+
}
145+
141146
return nil
142147
}
143148

144-
func shutdownPlugin(p *v2.Plugin, c *controller, executor Executor) {
149+
func shutdownPlugin(p *v2.Plugin, ec chan bool, executor Executor) {
145150
pluginID := p.GetID()
146151

147152
err := executor.Signal(pluginID, int(unix.SIGTERM))
148153
if err != nil {
149154
logrus.Errorf("Sending SIGTERM to plugin failed with error: %v", err)
150155
} else {
151156
select {
152-
case <-c.exitChan:
157+
case <-ec:
153158
logrus.Debug("Clean shutdown of plugin")
154159
case <-time.After(time.Second * 10):
155160
logrus.Debug("Force shutdown plugin")
156161
if err := executor.Signal(pluginID, int(unix.SIGKILL)); err != nil {
157162
logrus.Errorf("Sending SIGKILL to plugin failed with error: %v", err)
158163
}
159164
select {
160-
case <-c.exitChan:
165+
case <-ec:
161166
logrus.Debug("SIGKILL plugin shutdown")
162167
case <-time.After(time.Second * 10):
163168
logrus.Debug("Force shutdown plugin FAILED")
@@ -172,7 +177,7 @@ func (pm *Manager) disable(p *v2.Plugin, c *controller) error {
172177
}
173178

174179
c.restart = false
175-
shutdownPlugin(p, c, pm.executor)
180+
shutdownPlugin(p, c.exitChan, pm.executor)
176181
pm.config.Store.SetState(p, false)
177182
return pm.save(p)
178183
}
@@ -191,7 +196,7 @@ func (pm *Manager) Shutdown() {
191196
}
192197
if pm.executor != nil && p.IsEnabled() {
193198
c.restart = false
194-
shutdownPlugin(p, c, pm.executor)
199+
shutdownPlugin(p, c.exitChan, pm.executor)
195200
}
196201
}
197202
if err := mount.RecursiveUnmount(pm.config.Root); err != nil {

0 commit comments

Comments
 (0)