Skip to content

Commit

Permalink
merge opencontainers#4219 into opencontainers/runc:main
Browse files Browse the repository at this point in the history
Aleksa Sarai (2):
  seccomp: patchbpf: always include native architecture in stub
  seccomp: patchbpf: rename nativeArch -> linuxAuditArch

LGTMs: AkihiroSuda kolyshkin
  • Loading branch information
cyphar committed Mar 29, 2024
2 parents 8e69225 + ccc500c commit a1acca9
Show file tree
Hide file tree
Showing 2 changed files with 105 additions and 56 deletions.
103 changes: 59 additions & 44 deletions libcontainer/seccomp/patchbpf/enosys_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -171,87 +171,101 @@ func disassembleFilter(filter *libseccomp.ScmpFilter) ([]bpf.Instruction, error)
return program, nil
}

type nativeArch uint32
type linuxAuditArch uint32

const invalidArch nativeArch = 0
const invalidArch linuxAuditArch = 0

func archToNative(arch libseccomp.ScmpArch) (nativeArch, error) {
func scmpArchToAuditArch(arch libseccomp.ScmpArch) (linuxAuditArch, error) {
switch arch {
case libseccomp.ArchNative:
// Convert to actual native architecture.
arch, err := libseccomp.GetNativeArch()
if err != nil {
return invalidArch, fmt.Errorf("unable to get native arch: %w", err)
}
return archToNative(arch)
return scmpArchToAuditArch(arch)
case libseccomp.ArchX86:
return nativeArch(C.C_AUDIT_ARCH_I386), nil
return linuxAuditArch(C.C_AUDIT_ARCH_I386), nil
case libseccomp.ArchAMD64, libseccomp.ArchX32:
// NOTE: x32 is treated like x86_64 except all x32 syscalls have the
// 30th bit of the syscall number set to indicate that it's not a
// normal x86_64 syscall.
return nativeArch(C.C_AUDIT_ARCH_X86_64), nil
return linuxAuditArch(C.C_AUDIT_ARCH_X86_64), nil
case libseccomp.ArchARM:
return nativeArch(C.C_AUDIT_ARCH_ARM), nil
return linuxAuditArch(C.C_AUDIT_ARCH_ARM), nil
case libseccomp.ArchARM64:
return nativeArch(C.C_AUDIT_ARCH_AARCH64), nil
return linuxAuditArch(C.C_AUDIT_ARCH_AARCH64), nil
case libseccomp.ArchMIPS:
return nativeArch(C.C_AUDIT_ARCH_MIPS), nil
return linuxAuditArch(C.C_AUDIT_ARCH_MIPS), nil
case libseccomp.ArchMIPS64:
return nativeArch(C.C_AUDIT_ARCH_MIPS64), nil
return linuxAuditArch(C.C_AUDIT_ARCH_MIPS64), nil
case libseccomp.ArchMIPS64N32:
return nativeArch(C.C_AUDIT_ARCH_MIPS64N32), nil
return linuxAuditArch(C.C_AUDIT_ARCH_MIPS64N32), nil
case libseccomp.ArchMIPSEL:
return nativeArch(C.C_AUDIT_ARCH_MIPSEL), nil
return linuxAuditArch(C.C_AUDIT_ARCH_MIPSEL), nil
case libseccomp.ArchMIPSEL64:
return nativeArch(C.C_AUDIT_ARCH_MIPSEL64), nil
return linuxAuditArch(C.C_AUDIT_ARCH_MIPSEL64), nil
case libseccomp.ArchMIPSEL64N32:
return nativeArch(C.C_AUDIT_ARCH_MIPSEL64N32), nil
return linuxAuditArch(C.C_AUDIT_ARCH_MIPSEL64N32), nil
case libseccomp.ArchPPC:
return nativeArch(C.C_AUDIT_ARCH_PPC), nil
return linuxAuditArch(C.C_AUDIT_ARCH_PPC), nil
case libseccomp.ArchPPC64:
return nativeArch(C.C_AUDIT_ARCH_PPC64), nil
return linuxAuditArch(C.C_AUDIT_ARCH_PPC64), nil
case libseccomp.ArchPPC64LE:
return nativeArch(C.C_AUDIT_ARCH_PPC64LE), nil
return linuxAuditArch(C.C_AUDIT_ARCH_PPC64LE), nil
case libseccomp.ArchS390:
return nativeArch(C.C_AUDIT_ARCH_S390), nil
return linuxAuditArch(C.C_AUDIT_ARCH_S390), nil
case libseccomp.ArchS390X:
return nativeArch(C.C_AUDIT_ARCH_S390X), nil
return linuxAuditArch(C.C_AUDIT_ARCH_S390X), nil
case libseccomp.ArchRISCV64:
return nativeArch(C.C_AUDIT_ARCH_RISCV64), nil
return linuxAuditArch(C.C_AUDIT_ARCH_RISCV64), nil
default:
return invalidArch, fmt.Errorf("unknown architecture: %v", arch)
}
}

type lastSyscallMap map[nativeArch]map[libseccomp.ScmpArch]libseccomp.ScmpSyscall
type lastSyscallMap map[linuxAuditArch]map[libseccomp.ScmpArch]libseccomp.ScmpSyscall

// Figure out largest syscall number referenced in the filter for each
// architecture. We will be generating code based on the native architecture
// representation, but SCMP_ARCH_X32 means we have to track cases where the
// same architecture has different largest syscalls based on the mode.
func findLastSyscalls(config *configs.Seccomp) (lastSyscallMap, error) {
lastSyscalls := make(lastSyscallMap)
// Only loop over architectures which are present in the filter. Any other
// architectures will get the libseccomp bad architecture action anyway.
scmpArchs := make(map[libseccomp.ScmpArch]struct{})
for _, ociArch := range config.Architectures {
arch, err := libseccomp.GetArchFromString(ociArch)
if err != nil {
return nil, fmt.Errorf("unable to validate seccomp architecture: %w", err)
}
scmpArchs[arch] = struct{}{}
}
// On architectures like ppc64le, Docker inexplicably doesn't include the
// native architecture in the architecture list which results in no
// architectures being present in the list at all (rendering the ENOSYS
// stub a no-op). So, always include the native architecture.
if nativeScmpArch, err := libseccomp.GetNativeArch(); err != nil {
return nil, fmt.Errorf("unable to get native arch: %w", err)
} else if _, ok := scmpArchs[nativeScmpArch]; !ok {
logrus.Debugf("seccomp: adding implied native architecture %v to config set", nativeScmpArch)
scmpArchs[nativeScmpArch] = struct{}{}
}
logrus.Debugf("seccomp: configured architecture set: %s", scmpArchs)

// Figure out native architecture representation of the architecture.
nativeArch, err := archToNative(arch)
// Only loop over architectures which are present in the filter. Any other
// architectures will get the libseccomp bad architecture action anyway.
lastSyscalls := make(lastSyscallMap)
for arch := range scmpArchs {
auditArch, err := scmpArchToAuditArch(arch)
if err != nil {
return nil, fmt.Errorf("cannot map architecture %v to AUDIT_ARCH_ constant: %w", arch, err)
}

if _, ok := lastSyscalls[nativeArch]; !ok {
lastSyscalls[nativeArch] = map[libseccomp.ScmpArch]libseccomp.ScmpSyscall{}
if _, ok := lastSyscalls[auditArch]; !ok {
lastSyscalls[auditArch] = map[libseccomp.ScmpArch]libseccomp.ScmpSyscall{}
}
if _, ok := lastSyscalls[nativeArch][arch]; ok {
if _, ok := lastSyscalls[auditArch][arch]; ok {
// Because of ArchNative we may hit the same entry multiple times.
// Just skip it if we've seen this (nativeArch, ScmpArch)
// Just skip it if we've seen this (linuxAuditArch, ScmpArch)
// combination before.
continue
}
Expand All @@ -269,10 +283,11 @@ func findLastSyscalls(config *configs.Seccomp) (lastSyscallMap, error) {
}
}
if largestSyscall != 0 {
lastSyscalls[nativeArch][arch] = largestSyscall
logrus.Debugf("seccomp: largest syscall number for arch %v is %v", arch, largestSyscall)
lastSyscalls[auditArch][arch] = largestSyscall
} else {
logrus.Warnf("could not find any syscalls for arch %s", ociArch)
delete(lastSyscalls[nativeArch], arch)
logrus.Warnf("could not find any syscalls for arch %v", arch)
delete(lastSyscalls[auditArch], arch)
}
}
return lastSyscalls, nil
Expand All @@ -290,10 +305,10 @@ func findLastSyscalls(config *configs.Seccomp) (lastSyscallMap, error) {
// close_range(2) which were added out-of-order in the syscall table between
// kernel releases.
func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error) {
// A jump-table for each nativeArch used to generate the initial
// A jump-table for each linuxAuditArch used to generate the initial
// conditional jumps -- measured from the *END* of the program so they
// remain valid after prepending to the tail.
archJumpTable := map[nativeArch]uint32{}
archJumpTable := map[linuxAuditArch]uint32{}

// Generate our own -ENOSYS rules for each architecture. They have to be
// generated in reverse (prepended to the tail of the program) because the
Expand All @@ -306,7 +321,7 @@ func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error)
}

// Generate the syscall -ENOSYS rules.
for nativeArch, maxSyscalls := range lastSyscalls {
for auditArch, maxSyscalls := range lastSyscalls {
// The number of instructions from the tail of this section which need
// to be jumped in order to reach the -ENOSYS return. If the section
// does not jump, it will fall through to the actual filter.
Expand Down Expand Up @@ -387,7 +402,7 @@ func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error)

// If we're on x86 we need to add a check for x32 and if we're in
// the wrong mode we jump over the section.
if uint32(nativeArch) == uint32(C.C_AUDIT_ARCH_X86_64) {
if uint32(auditArch) == uint32(C.C_AUDIT_ARCH_X86_64) {
// Generate a prefix to check the mode.
switch scmpArch {
case libseccomp.ArchAMD64:
Expand Down Expand Up @@ -416,8 +431,8 @@ func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error)
section = append(section, sectionTail...)
case 2:
// x32 and x86_64 are a unique case, we can't handle any others.
if uint32(nativeArch) != uint32(C.C_AUDIT_ARCH_X86_64) {
return nil, fmt.Errorf("unknown architecture overlap on native arch %#x", nativeArch)
if uint32(auditArch) != uint32(C.C_AUDIT_ARCH_X86_64) {
return nil, fmt.Errorf("unknown architecture overlap on native arch %#x", auditArch)
}

x32sysno, ok := maxSyscalls[libseccomp.ArchX32]
Expand Down Expand Up @@ -494,7 +509,7 @@ func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error)
programTail = append(section, programTail...)

// Update jump table.
archJumpTable[nativeArch] = uint32(len(programTail))
archJumpTable[auditArch] = uint32(len(programTail))
}

// Add a dummy "jump to filter" for any architecture we might miss below.
Expand All @@ -514,9 +529,9 @@ func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error)
// architectures based on how large the jumps are going to be, or
// re-sort the candidate architectures each time to make sure that we
// pick the largest jump which is going to be smaller than 255.
for nativeArch := range lastSyscalls {
for auditArch := range lastSyscalls {
// We jump forwards but the jump table is calculated from the *END*.
jump := uint32(len(programTail)) - archJumpTable[nativeArch]
jump := uint32(len(programTail)) - archJumpTable[auditArch]

// Same routine as above -- this is a basic jeq check, complicated
// slightly if it turns out that we need to do a long jump.
Expand All @@ -525,7 +540,7 @@ func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error)
// jeq [arch],[jump]
bpf.JumpIf{
Cond: bpf.JumpEqual,
Val: uint32(nativeArch),
Val: uint32(auditArch),
SkipTrue: uint8(jump),
},
}, programTail...)
Expand All @@ -534,7 +549,7 @@ func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error)
// jne [arch],1
bpf.JumpIf{
Cond: bpf.JumpNotEqual,
Val: uint32(nativeArch),
Val: uint32(auditArch),
SkipTrue: 1,
},
// ja [jump]
Expand Down
58 changes: 46 additions & 12 deletions libcontainer/seccomp/patchbpf/enosys_linux_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ type seccompData struct {
}

// mockSyscallPayload creates a fake seccomp_data struct with the given data.
func mockSyscallPayload(t *testing.T, sysno libseccomp.ScmpSyscall, arch nativeArch, args ...uint64) []byte {
func mockSyscallPayload(t *testing.T, sysno libseccomp.ScmpSyscall, arch linuxAuditArch, args ...uint64) []byte {
var buf bytes.Buffer

data := seccompData{
Expand Down Expand Up @@ -105,8 +105,16 @@ var testArches = []string{
"ppc64le",
"s390",
"s390x",
// Dummy value to indicate a configuration with no architecture specified.
"native",
}

// Used for the "native" architecture.
var (
scmpNativeArch, _ = libseccomp.GetNativeArch()
nativeArch = scmpNativeArch.String()
)

func testEnosysStub(t *testing.T, defaultAction configs.Action, arches []string) {
explicitSyscalls := []string{
"setns",
Expand Down Expand Up @@ -150,17 +158,20 @@ func testEnosysStub(t *testing.T, defaultAction configs.Action, arches []string)

for _, arch := range testArches {
type syscallTest struct {
syscall string
sysno libseccomp.ScmpSyscall
syscall string
expected uint32
}

if arch == "native" {
arch = nativeArch
}
scmpArch, err := libseccomp.GetArchFromString(arch)
if err != nil {
t.Fatalf("unknown libseccomp architecture %q: %v", arch, err)
}

nativeArch, err := archToNative(scmpArch)
auditArch, err := scmpArchToAuditArch(scmpArch)
if err != nil {
t.Fatalf("unknown audit architecture %q: %v", arch, err)
}
Expand All @@ -179,9 +190,9 @@ func testEnosysStub(t *testing.T, defaultAction configs.Action, arches []string)
t.Fatalf("unknown syscall %q on arch %q: %v", syscall, arch, err)
}
syscallTests = append(syscallTests, syscallTest{
syscall,
sysno,
expected,
sysno: sysno,
syscall: syscall,
expected: expected,
})
}

Expand Down Expand Up @@ -228,12 +239,19 @@ func testEnosysStub(t *testing.T, defaultAction configs.Action, arches []string)

// Test syscalls in the explicit list.
for _, test := range syscallTests {
// Override the expected value in the two special cases.
if !archSet[arch] || isAllowAction(defaultAction) {
// Override the expected value in the two special cases:
// 1. If the default action is allow, the filter won't have
// the stub prepended so we expect a fallthrough.
// 2. If the executing architecture is not in the architecture
// set, then the architecture is not handled by the stub --
// *except* in the case of the native architecture (which
// is always included in the stub).
if isAllowAction(defaultAction) ||
(!archSet[arch] && arch != nativeArch) {
test.expected = retFallthrough
}

payload := mockSyscallPayload(t, test.sysno, nativeArch, 0x1337, 0xF00BA5)
payload := mockSyscallPayload(t, test.sysno, auditArch, 0x1337, 0xF00BA5)
// NOTE: golang.org/x/net/bpf returns int here rather
// than uint32.
rawRet, err := filter.Run(payload)
Expand All @@ -247,7 +265,7 @@ func testEnosysStub(t *testing.T, defaultAction configs.Action, arches []string)
t.Logf(" [%4.1d] %s", idx, insn)
}
t.Logf("payload: %#v", payload)
t.Errorf("filter %s(%d) %q(%d): got %#x, want %#x", arch, nativeArch, test.syscall, test.sysno, ret, test.expected)
t.Errorf("filter %s(%d) %q(%d): got %#x, want %#x", arch, auditArch, test.syscall, test.sysno, ret, test.expected)
}
}
}
Expand All @@ -263,7 +281,14 @@ var testActions = map[string]configs.Action{

func TestEnosysStub_SingleArch(t *testing.T) {
for _, arch := range testArches {
arches := []string{arch}
var arches []string
// "native" indicates a blank architecture field for seccomp, to test
// the case where the running architecture was not included in the
// architecture list. Docker doesn't always set the architecture for
// some reason (namely for ppc64le).
if arch != "native" {
arches = append(arches, arch)
}
t.Run("arch="+arch, func(t *testing.T) {
for name, action := range testActions {
t.Run("action="+name, func(t *testing.T) {
Expand All @@ -277,7 +302,16 @@ func TestEnosysStub_SingleArch(t *testing.T) {
func TestEnosysStub_MultiArch(t *testing.T) {
for end := 0; end < len(testArches); end++ {
for start := 0; start < end; start++ {
arches := testArches[start:end]
var arches []string
for _, arch := range testArches[start:end] {
// "native" indicates a blank architecture field for seccomp, to test
// the case where the running architecture was not included in the
// architecture list. Docker doesn't always set the architecture for
// some reason (namely for ppc64le).
if arch != "native" {
arches = append(arches, arch)
}
}
if len(arches) <= 1 {
continue
}
Expand Down

0 comments on commit a1acca9

Please sign in to comment.