-
Notifications
You must be signed in to change notification settings - Fork 2.1k
/
init_linux.go
737 lines (667 loc) · 23.8 KB
/
init_linux.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
package libcontainer
import (
"bytes"
"encoding/json"
"errors"
"fmt"
"net"
"os"
"path/filepath"
"runtime"
"runtime/debug"
"strconv"
"strings"
"syscall"
"github.com/containerd/console"
"github.com/moby/sys/user"
"github.com/opencontainers/runtime-spec/specs-go"
"github.com/sirupsen/logrus"
"github.com/vishvananda/netlink"
"golang.org/x/sys/unix"
"github.com/opencontainers/runc/libcontainer/capabilities"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/system"
"github.com/opencontainers/runc/libcontainer/utils"
)
type initType string
const (
initSetns initType = "setns"
initStandard initType = "standard"
)
type pid struct {
Pid int `json:"stage2_pid"`
PidFirstChild int `json:"stage1_pid"`
}
// network is an internal struct used to setup container networks.
type network struct {
configs.Network
// TempVethPeerName is a unique temporary veth peer name that was placed into
// the container's namespace.
TempVethPeerName string `json:"temp_veth_peer_name"`
}
// initConfig is used for transferring parameters from Exec() to Init()
type initConfig struct {
Args []string `json:"args"`
Env []string `json:"env"`
Cwd string `json:"cwd"`
Capabilities *configs.Capabilities `json:"capabilities"`
ProcessLabel string `json:"process_label"`
AppArmorProfile string `json:"apparmor_profile"`
NoNewPrivileges bool `json:"no_new_privileges"`
User string `json:"user"`
AdditionalGroups []string `json:"additional_groups"`
Config *configs.Config `json:"config"`
Networks []*network `json:"network"`
PassedFilesCount int `json:"passed_files_count"`
ContainerID string `json:"containerid"`
Rlimits []configs.Rlimit `json:"rlimits"`
CreateConsole bool `json:"create_console"`
ConsoleWidth uint16 `json:"console_width"`
ConsoleHeight uint16 `json:"console_height"`
RootlessEUID bool `json:"rootless_euid,omitempty"`
RootlessCgroups bool `json:"rootless_cgroups,omitempty"`
SpecState *specs.State `json:"spec_state,omitempty"`
Cgroup2Path string `json:"cgroup2_path,omitempty"`
}
// Init is part of "runc init" implementation.
func Init() {
runtime.GOMAXPROCS(1)
runtime.LockOSThread()
if err := startInitialization(); err != nil {
// If the error is returned, it was not communicated
// back to the parent (which is not a common case),
// so print it to stderr here as a last resort.
//
// Do not use logrus as we are not sure if it has been
// set up yet, but most important, if the parent is
// alive (and its log forwarding is working).
fmt.Fprintln(os.Stderr, err)
}
// Normally, StartInitialization() never returns, meaning
// if we are here, it had failed.
os.Exit(255)
}
// Normally, this function does not return. If it returns, with or without an
// error, it means the initialization has failed. If the error is returned,
// it means the error can not be communicated back to the parent.
func startInitialization() (retErr error) {
// Get the synchronisation pipe.
envSyncPipe := os.Getenv("_LIBCONTAINER_SYNCPIPE")
syncPipeFd, err := strconv.Atoi(envSyncPipe)
if err != nil {
return fmt.Errorf("unable to convert _LIBCONTAINER_SYNCPIPE: %w", err)
}
syncPipe := newSyncSocket(os.NewFile(uintptr(syncPipeFd), "sync"))
defer syncPipe.Close()
defer func() {
// If this defer is ever called, this means initialization has failed.
// Send the error back to the parent process in the form of an initError
// if the sync socket has not been closed.
if syncPipe.isClosed() {
return
}
ierr := initError{Message: retErr.Error()}
if err := writeSyncArg(syncPipe, procError, ierr); err != nil {
fmt.Fprintln(os.Stderr, err)
return
}
// The error is sent, no need to also return it (or it will be reported twice).
retErr = nil
}()
// Get the INITPIPE.
envInitPipe := os.Getenv("_LIBCONTAINER_INITPIPE")
initPipeFd, err := strconv.Atoi(envInitPipe)
if err != nil {
return fmt.Errorf("unable to convert _LIBCONTAINER_INITPIPE: %w", err)
}
initPipe := os.NewFile(uintptr(initPipeFd), "init")
defer initPipe.Close()
// Set up logging. This is used rarely, and mostly for init debugging.
// Passing log level is optional; currently libcontainer/integration does not do it.
if levelStr := os.Getenv("_LIBCONTAINER_LOGLEVEL"); levelStr != "" {
logLevel, err := strconv.Atoi(levelStr)
if err != nil {
return fmt.Errorf("unable to convert _LIBCONTAINER_LOGLEVEL: %w", err)
}
logrus.SetLevel(logrus.Level(logLevel))
}
logFd, err := strconv.Atoi(os.Getenv("_LIBCONTAINER_LOGPIPE"))
if err != nil {
return fmt.Errorf("unable to convert _LIBCONTAINER_LOGPIPE: %w", err)
}
logPipe := os.NewFile(uintptr(logFd), "logpipe")
logrus.SetOutput(logPipe)
logrus.SetFormatter(new(logrus.JSONFormatter))
logrus.Debug("child process in init()")
// Only init processes have FIFOFD.
var fifoFile *os.File
envInitType := os.Getenv("_LIBCONTAINER_INITTYPE")
it := initType(envInitType)
if it == initStandard {
fifoFd, err := strconv.Atoi(os.Getenv("_LIBCONTAINER_FIFOFD"))
if err != nil {
return fmt.Errorf("unable to convert _LIBCONTAINER_FIFOFD: %w", err)
}
fifoFile = os.NewFile(uintptr(fifoFd), "initfifo")
}
var consoleSocket *os.File
if envConsole := os.Getenv("_LIBCONTAINER_CONSOLE"); envConsole != "" {
console, err := strconv.Atoi(envConsole)
if err != nil {
return fmt.Errorf("unable to convert _LIBCONTAINER_CONSOLE: %w", err)
}
consoleSocket = os.NewFile(uintptr(console), "console-socket")
defer consoleSocket.Close()
}
var pidfdSocket *os.File
if envSockFd := os.Getenv("_LIBCONTAINER_PIDFD_SOCK"); envSockFd != "" {
sockFd, err := strconv.Atoi(envSockFd)
if err != nil {
return fmt.Errorf("unable to convert _LIBCONTAINER_PIDFD_SOCK: %w", err)
}
pidfdSocket = os.NewFile(uintptr(sockFd), "pidfd-socket")
defer pidfdSocket.Close()
}
// clear the current process's environment to clean any libcontainer
// specific env vars.
os.Clearenv()
defer func() {
if err := recover(); err != nil {
if err2, ok := err.(error); ok {
retErr = fmt.Errorf("panic from initialization: %w, %s", err2, debug.Stack())
} else {
retErr = fmt.Errorf("panic from initialization: %v, %s", err, debug.Stack())
}
}
}()
var config initConfig
if err := json.NewDecoder(initPipe).Decode(&config); err != nil {
return err
}
// If init succeeds, it will not return, hence none of the defers will be called.
return containerInit(it, &config, syncPipe, consoleSocket, pidfdSocket, fifoFile, logPipe)
}
func containerInit(t initType, config *initConfig, pipe *syncSocket, consoleSocket, pidfdSocket, fifoFile, logPipe *os.File) error {
if err := populateProcessEnvironment(config.Env); err != nil {
return err
}
// Clean the RLIMIT_NOFILE cache in go runtime.
// Issue: https://github.com/opencontainers/runc/issues/4195
maybeClearRlimitNofileCache(config.Rlimits)
switch t {
case initSetns:
i := &linuxSetnsInit{
pipe: pipe,
consoleSocket: consoleSocket,
pidfdSocket: pidfdSocket,
config: config,
logPipe: logPipe,
}
return i.Init()
case initStandard:
i := &linuxStandardInit{
pipe: pipe,
consoleSocket: consoleSocket,
pidfdSocket: pidfdSocket,
parentPid: unix.Getppid(),
config: config,
fifoFile: fifoFile,
logPipe: logPipe,
}
return i.Init()
}
return fmt.Errorf("unknown init type %q", t)
}
// populateProcessEnvironment loads the provided environment variables into the
// current processes's environment.
func populateProcessEnvironment(env []string) error {
for _, pair := range env {
name, val, ok := strings.Cut(pair, "=")
if !ok {
return errors.New("invalid environment variable: missing '='")
}
if name == "" {
return errors.New("invalid environment variable: name cannot be empty")
}
if strings.IndexByte(name, 0) >= 0 {
return fmt.Errorf("invalid environment variable %q: name contains nul byte (\\x00)", name)
}
if strings.IndexByte(val, 0) >= 0 {
return fmt.Errorf("invalid environment variable %q: value contains nul byte (\\x00)", name)
}
if err := os.Setenv(name, val); err != nil {
return err
}
}
return nil
}
// verifyCwd ensures that the current directory is actually inside the mount
// namespace root of the current process.
func verifyCwd() error {
// getcwd(2) on Linux detects if cwd is outside of the rootfs of the
// current mount namespace root, and in that case prefixes "(unreachable)"
// to the returned string. glibc's getcwd(3) and Go's Getwd() both detect
// when this happens and return ENOENT rather than returning a non-absolute
// path. In both cases we can therefore easily detect if we have an invalid
// cwd by checking the return value of getcwd(3). See getcwd(3) for more
// details, and CVE-2024-21626 for the security issue that motivated this
// check.
//
// We have to use unix.Getwd() here because os.Getwd() has a workaround for
// $PWD which involves doing stat(.), which can fail if the current
// directory is inaccessible to the container process.
if wd, err := unix.Getwd(); errors.Is(err, unix.ENOENT) {
return errors.New("current working directory is outside of container mount namespace root -- possible container breakout detected")
} else if err != nil {
return fmt.Errorf("failed to verify if current working directory is safe: %w", err)
} else if !filepath.IsAbs(wd) {
// We shouldn't ever hit this, but check just in case.
return fmt.Errorf("current working directory is not absolute -- possible container breakout detected: cwd is %q", wd)
}
return nil
}
// finalizeNamespace drops the caps, sets the correct user
// and working dir, and closes any leaked file descriptors
// before executing the command inside the namespace
func finalizeNamespace(config *initConfig) error {
// Ensure that all unwanted fds we may have accidentally
// inherited are marked close-on-exec so they stay out of the
// container
if err := utils.CloseExecFrom(config.PassedFilesCount + 3); err != nil {
return fmt.Errorf("error closing exec fds: %w", err)
}
// we only do chdir if it's specified
doChdir := config.Cwd != ""
if doChdir {
// First, attempt the chdir before setting up the user.
// This could allow us to access a directory that the user running runc can access
// but the container user cannot.
err := unix.Chdir(config.Cwd)
switch {
case err == nil:
doChdir = false
case os.IsPermission(err):
// If we hit an EPERM, we should attempt again after setting up user.
// This will allow us to successfully chdir if the container user has access
// to the directory, but the user running runc does not.
// This is useful in cases where the cwd is also a volume that's been chowned to the container user.
default:
return fmt.Errorf("chdir to cwd (%q) set in config.json failed: %w", config.Cwd, err)
}
}
caps := &configs.Capabilities{}
if config.Capabilities != nil {
caps = config.Capabilities
} else if config.Config.Capabilities != nil {
caps = config.Config.Capabilities
}
w, err := capabilities.New(caps)
if err != nil {
return err
}
// drop capabilities in bounding set before changing user
if err := w.ApplyBoundingSet(); err != nil {
return fmt.Errorf("unable to apply bounding set: %w", err)
}
// preserve existing capabilities while we change users
if err := system.SetKeepCaps(); err != nil {
return fmt.Errorf("unable to set keep caps: %w", err)
}
if err := setupUser(config); err != nil {
return fmt.Errorf("unable to setup user: %w", err)
}
// Change working directory AFTER the user has been set up, if we haven't done it yet.
if doChdir {
if err := unix.Chdir(config.Cwd); err != nil {
return fmt.Errorf("chdir to cwd (%q) set in config.json failed: %w", config.Cwd, err)
}
}
// Make sure our final working directory is inside the container.
if err := verifyCwd(); err != nil {
return err
}
if err := system.ClearKeepCaps(); err != nil {
return fmt.Errorf("unable to clear keep caps: %w", err)
}
if err := w.ApplyCaps(); err != nil {
return fmt.Errorf("unable to apply caps: %w", err)
}
return nil
}
// setupConsole sets up the console from inside the container, and sends the
// master pty fd to the config.Pipe (using cmsg). This is done to ensure that
// consoles are scoped to a container properly (see runc#814 and the many
// issues related to that). This has to be run *after* we've pivoted to the new
// rootfs (and the users' configuration is entirely set up).
func setupConsole(socket *os.File, config *initConfig, mount bool) error {
defer socket.Close()
// At this point, /dev/ptmx points to something that we would expect. We
// used to change the owner of the slave path, but since the /dev/pts mount
// can have gid=X set (at the users' option). So touching the owner of the
// slave PTY is not necessary, as the kernel will handle that for us. Note
// however, that setupUser (specifically fixStdioPermissions) *will* change
// the UID owner of the console to be the user the process will run as (so
// they can actually control their console).
pty, slavePath, err := console.NewPty()
if err != nil {
return err
}
// After we return from here, we don't need the console anymore.
defer pty.Close()
if config.ConsoleHeight != 0 && config.ConsoleWidth != 0 {
err = pty.Resize(console.WinSize{
Height: config.ConsoleHeight,
Width: config.ConsoleWidth,
})
if err != nil {
return err
}
}
// Mount the console inside our rootfs.
if mount {
if err := mountConsole(slavePath); err != nil {
return err
}
}
// While we can access console.master, using the API is a good idea.
if err := utils.SendRawFd(socket, pty.Name(), pty.Fd()); err != nil {
return err
}
runtime.KeepAlive(pty)
// Now, dup over all the things.
return dupStdio(slavePath)
}
// syncParentReady sends to the given pipe a JSON payload which indicates that
// the init is ready to Exec the child process. It then waits for the parent to
// indicate that it is cleared to Exec.
func syncParentReady(pipe *syncSocket) error {
// Tell parent.
if err := writeSync(pipe, procReady); err != nil {
return err
}
// Wait for parent to give the all-clear.
return readSync(pipe, procRun)
}
// syncParentHooks sends to the given pipe a JSON payload which indicates that
// the parent should execute pre-start hooks. It then waits for the parent to
// indicate that it is cleared to resume.
func syncParentHooks(pipe *syncSocket) error {
// Tell parent.
if err := writeSync(pipe, procHooks); err != nil {
return err
}
// Wait for parent to give the all-clear.
return readSync(pipe, procHooksDone)
}
// syncParentSeccomp sends the fd associated with the seccomp file descriptor
// to the parent, and wait for the parent to do pidfd_getfd() to grab a copy.
func syncParentSeccomp(pipe *syncSocket, seccompFd int) error {
if seccompFd == -1 {
return nil
}
defer unix.Close(seccompFd)
// Tell parent to grab our fd.
//
// Notably, we do not use writeSyncFile here because a container might have
// an SCMP_ACT_NOTIFY action on sendmsg(2) so we need to use the smallest
// possible number of system calls here because all of those syscalls
// cannot be used with SCMP_ACT_NOTIFY as a result (any syscall we use here
// before the parent gets the file descriptor would deadlock "runc init" if
// we allowed it for SCMP_ACT_NOTIFY). See seccomp.InitSeccomp() for more
// details.
if err := writeSyncArg(pipe, procSeccomp, seccompFd); err != nil {
return err
}
// Wait for parent to tell us they've grabbed the seccompfd.
return readSync(pipe, procSeccompDone)
}
// setupUser changes the groups, gid, and uid for the user inside the container
func setupUser(config *initConfig) error {
// Set up defaults.
defaultExecUser := user.ExecUser{
Uid: 0,
Gid: 0,
Home: "/",
}
passwdPath, err := user.GetPasswdPath()
if err != nil {
return err
}
groupPath, err := user.GetGroupPath()
if err != nil {
return err
}
execUser, err := user.GetExecUserPath(config.User, &defaultExecUser, passwdPath, groupPath)
if err != nil {
return err
}
var addGroups []int
if len(config.AdditionalGroups) > 0 {
addGroups, err = user.GetAdditionalGroupsPath(config.AdditionalGroups, groupPath)
if err != nil {
return err
}
}
if config.RootlessEUID {
// We cannot set any additional groups in a rootless container and thus
// we bail if the user asked us to do so. TODO: We currently can't do
// this check earlier, but if libcontainer.Process.User was typesafe
// this might work.
if len(addGroups) > 0 {
return errors.New("cannot set any additional groups in a rootless container")
}
}
// Before we change to the container's user make sure that the processes
// STDIO is correctly owned by the user that we are switching to.
if err := fixStdioPermissions(execUser); err != nil {
return err
}
// We don't need to use /proc/thread-self here because setgroups is a
// per-userns file and thus is global to all threads in a thread-group.
// This lets us avoid having to do runtime.LockOSThread.
setgroups, err := os.ReadFile("/proc/self/setgroups")
if err != nil && !os.IsNotExist(err) {
return err
}
// This isn't allowed in an unprivileged user namespace since Linux 3.19.
// There's nothing we can do about /etc/group entries, so we silently
// ignore setting groups here (since the user didn't explicitly ask us to
// set the group).
allowSupGroups := !config.RootlessEUID && string(bytes.TrimSpace(setgroups)) != "deny"
if allowSupGroups {
suppGroups := append(execUser.Sgids, addGroups...)
if err := unix.Setgroups(suppGroups); err != nil {
return &os.SyscallError{Syscall: "setgroups", Err: err}
}
}
if err := unix.Setgid(execUser.Gid); err != nil {
if err == unix.EINVAL {
return fmt.Errorf("cannot setgid to unmapped gid %d in user namespace", execUser.Gid)
}
return err
}
if err := unix.Setuid(execUser.Uid); err != nil {
if err == unix.EINVAL {
return fmt.Errorf("cannot setuid to unmapped uid %d in user namespace", execUser.Uid)
}
return err
}
// if we didn't get HOME already, set it based on the user's HOME
if envHome := os.Getenv("HOME"); envHome == "" {
if err := os.Setenv("HOME", execUser.Home); err != nil {
return err
}
}
return nil
}
// fixStdioPermissions fixes the permissions of PID 1's STDIO within the container to the specified user.
// The ownership needs to match because it is created outside of the container and needs to be
// localized.
func fixStdioPermissions(u *user.ExecUser) error {
var null unix.Stat_t
if err := unix.Stat("/dev/null", &null); err != nil {
return &os.PathError{Op: "stat", Path: "/dev/null", Err: err}
}
for _, file := range []*os.File{os.Stdin, os.Stdout, os.Stderr} {
var s unix.Stat_t
if err := unix.Fstat(int(file.Fd()), &s); err != nil {
return &os.PathError{Op: "fstat", Path: file.Name(), Err: err}
}
// Skip chown if uid is already the one we want or any of the STDIO descriptors
// were redirected to /dev/null.
if int(s.Uid) == u.Uid || s.Rdev == null.Rdev {
continue
}
// We only change the uid (as it is possible for the mount to
// prefer a different gid, and there's no reason for us to change it).
// The reason why we don't just leave the default uid=X mount setup is
// that users expect to be able to actually use their console. Without
// this code, you couldn't effectively run as a non-root user inside a
// container and also have a console set up.
if err := file.Chown(u.Uid, int(s.Gid)); err != nil {
// If we've hit an EINVAL then s.Gid isn't mapped in the user
// namespace. If we've hit an EPERM then the inode's current owner
// is not mapped in our user namespace (in particular,
// privileged_wrt_inode_uidgid() has failed). Read-only
// /dev can result in EROFS error. In any case, it's
// better for us to just not touch the stdio rather
// than bail at this point.
if errors.Is(err, unix.EINVAL) || errors.Is(err, unix.EPERM) || errors.Is(err, unix.EROFS) {
continue
}
return err
}
}
return nil
}
// setupNetwork sets up and initializes any network interface inside the container.
func setupNetwork(config *initConfig) error {
for _, config := range config.Networks {
strategy, err := getStrategy(config.Type)
if err != nil {
return err
}
if err := strategy.initialize(config); err != nil {
return err
}
}
return nil
}
func setupRoute(config *configs.Config) error {
for _, config := range config.Routes {
_, dst, err := net.ParseCIDR(config.Destination)
if err != nil {
return err
}
src := net.ParseIP(config.Source)
if src == nil {
return fmt.Errorf("Invalid source for route: %s", config.Source)
}
gw := net.ParseIP(config.Gateway)
if gw == nil {
return fmt.Errorf("Invalid gateway for route: %s", config.Gateway)
}
l, err := netlink.LinkByName(config.InterfaceName)
if err != nil {
return err
}
route := &netlink.Route{
Scope: netlink.SCOPE_UNIVERSE,
Dst: dst,
Src: src,
Gw: gw,
LinkIndex: l.Attrs().Index,
}
if err := netlink.RouteAdd(route); err != nil {
return err
}
}
return nil
}
func maybeClearRlimitNofileCache(limits []configs.Rlimit) {
for _, rlimit := range limits {
if rlimit.Type == syscall.RLIMIT_NOFILE {
system.ClearRlimitNofileCache(&syscall.Rlimit{
Cur: rlimit.Soft,
Max: rlimit.Hard,
})
return
}
}
}
func setupRlimits(limits []configs.Rlimit, pid int) error {
for _, rlimit := range limits {
if err := unix.Prlimit(pid, rlimit.Type, &unix.Rlimit{Max: rlimit.Hard, Cur: rlimit.Soft}, nil); err != nil {
return fmt.Errorf("error setting rlimit type %v: %w", rlimit.Type, err)
}
}
return nil
}
func setupScheduler(config *configs.Config) error {
attr, err := configs.ToSchedAttr(config.Scheduler)
if err != nil {
return err
}
if err := unix.SchedSetAttr(0, attr, 0); err != nil {
if errors.Is(err, unix.EPERM) && config.Cgroups.CpusetCpus != "" {
return errors.New("process scheduler can't be used together with AllowedCPUs")
}
return fmt.Errorf("error setting scheduler: %w", err)
}
return nil
}
func setupPersonality(config *configs.Config) error {
return system.SetLinuxPersonality(config.Personality.Domain)
}
// signalAllProcesses freezes then iterates over all the processes inside the
// manager's cgroups sending the signal s to them.
func signalAllProcesses(m cgroups.Manager, s unix.Signal) error {
if !m.Exists() {
return ErrCgroupNotExist
}
// Use cgroup.kill, if available.
if s == unix.SIGKILL {
if p := m.Path(""); p != "" { // Either cgroup v2 or hybrid.
err := cgroups.WriteFile(p, "cgroup.kill", "1")
if err == nil || !errors.Is(err, os.ErrNotExist) {
return err
}
// Fallback to old implementation.
}
}
if err := m.Freeze(cgroups.Frozen); err != nil {
logrus.Warn(err)
}
pids, err := m.GetAllPids()
if err != nil {
if err := m.Freeze(cgroups.Thawed); err != nil {
logrus.Warn(err)
}
return err
}
for _, pid := range pids {
err := unix.Kill(pid, s)
if err != nil && err != unix.ESRCH {
logrus.Warnf("kill %d: %v", pid, err)
}
}
if err := m.Freeze(cgroups.Thawed); err != nil {
logrus.Warn(err)
}
return nil
}
// setupPidfd opens a process file descriptor of init process, and sends the
// file descriptor back to the socket.
func setupPidfd(socket *os.File, initType string) error {
defer socket.Close()
pidFd, err := unix.PidfdOpen(os.Getpid(), 0)
if err != nil {
return fmt.Errorf("failed to pidfd_open: %w", err)
}
if err := utils.SendRawFd(socket, initType, uintptr(pidFd)); err != nil {
unix.Close(pidFd)
return fmt.Errorf("failed to send pidfd on socket: %w", err)
}
return unix.Close(pidFd)
}