Skip to content

Commit

Permalink
ociruntime: handle images with high layer count
Browse files Browse the repository at this point in the history
When the action required an image with more than 20 layers, our mount
will fail with

```
create OCI bundle: create rootfs: mount overlayfs: no such file or directory
```

After some digging, it seems like 20 is the current limit of the number
of lowerdir allowed in each mount call.

Add special logic to break down images with more than 20 layers into
groups of 20. For each group, create an overlayfs mount called
"merged<group-id>" in the same bundle dir. The final overlayfs will then
be composed of these "merged" groups as lowerdirs.
  • Loading branch information
sluongng committed Oct 4, 2024
1 parent f34eecb commit f95d891
Show file tree
Hide file tree
Showing 5 changed files with 233 additions and 9 deletions.
8 changes: 8 additions & 0 deletions MODULE.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -470,10 +470,18 @@ oci.pull(
image = "gcr.io/distroless/java17-debian12",
platforms = ["linux/amd64"],
)
oci.pull(
name = "busybox",
digest = "sha256:c230832bd3b0be59a6c47ed64294f9ce71e91b327957920b6929a0caa8353140",
image = "mirror.gcr.io/library/busybox:1.36.1",
platforms = ["linux/amd64"],
)
use_repo(
oci,
"bazel_oci_image_base",
"bazel_oci_image_base_linux_amd64",
"buildbuddy_go_oci_image_base",
"buildbuddy_go_oci_image_base_linux_amd64",
"busybox",
"busybox_linux_amd64",
)
7 changes: 7 additions & 0 deletions WORKSPACE
Original file line number Diff line number Diff line change
Expand Up @@ -461,6 +461,13 @@ oci_pull(
platforms = ["linux/amd64"],
)

oci_pull(
name = "busybox",
digest = "sha256:c230832bd3b0be59a6c47ed64294f9ce71e91b327957920b6929a0caa8353140",
image = "mirror.gcr.io/library/busybox:1.36.1",
platforms = ["linux/amd64"],
)

# BuildBuddy Toolchain
# Keep up-to-date with docs/rbe-setup.md and docs/rbe-github-actions.md
http_archive(
Expand Down
10 changes: 10 additions & 0 deletions enterprise/server/remote_execution/containers/ociruntime/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ go_test(
":busybox",
":crun",
"//enterprise/server/remote_execution/runner/testworker",
"@busybox",
],
exec_properties = {
"test.workload-isolation-type": "firecracker",
Expand All @@ -65,6 +66,7 @@ go_test(
x_defs = {
"crunRlocationpath": "$(rlocationpath :crun)",
"busyboxRlocationpath": "$(rlocationpath :busybox)",
"ociBusyboxRlocationpath": "$(rlocationpath @busybox)",
"testworkerRlocationpath": "$(rlocationpath //enterprise/server/remote_execution/runner/testworker)",
},
deps = [
Expand All @@ -86,6 +88,14 @@ go_test(
"//server/util/status",
"//server/util/testing/flags",
"//server/util/uuid",
"@com_github_google_go_containerregistry//pkg/name",
"@com_github_google_go_containerregistry//pkg/registry",
"@com_github_google_go_containerregistry//pkg/v1:pkg",
"@com_github_google_go_containerregistry//pkg/v1/layout",
"@com_github_google_go_containerregistry//pkg/v1/mutate",
"@com_github_google_go_containerregistry//pkg/v1/partial",
"@com_github_google_go_containerregistry//pkg/v1/remote",
"@com_github_google_go_containerregistry//pkg/v1/types",
"@com_github_stretchr_testify//assert",
"@com_github_stretchr_testify//require",
"@io_bazel_rules_go//go/runfiles:go_default_library",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,7 @@ type ociContainer struct {

cid string
workDir string
mergedMounts []string
overlayfsMounted bool
stats container.UsageStats
networkPool *networking.ContainerNetworkPool
Expand Down Expand Up @@ -441,6 +442,14 @@ func (c *ociContainer) Remove(ctx context.Context) error {
firstErr = status.UnavailableErrorf("delete container: %s", err)
}

if len(c.mergedMounts) > 0 {
for _, merged := range c.mergedMounts {
if err := syscall.Unmount(merged, syscall.MNT_FORCE); err != nil && firstErr == nil {
firstErr = status.UnavailableErrorf("unmount overlayfs: %s", err)
}
}
}

if c.overlayfsMounted {
if err := syscall.Unmount(c.rootfsPath(), syscall.MNT_FORCE); err != nil && firstErr == nil {
firstErr = status.UnavailableErrorf("unmount overlayfs: %s", err)
Expand Down Expand Up @@ -546,16 +555,12 @@ func (c *ociContainer) createRootfs(ctx context.Context) error {
}

// Create an overlayfs with the pulled image layers.
var lowerDirs []string
image, ok := c.imageStore.CachedImage(c.imageRef)
if !ok {
return fmt.Errorf("bad state: attempted to create rootfs before pulling image")
}
// overlayfs "lowerdir" mount args are ordered from uppermost to lowermost,
// but manifest layers are ordered from lowermost to uppermost. So we
// iterate in reverse order when building the lowerdir args.
for i := len(image.Layers) - 1; i >= 0; i-- {
layer := image.Layers[i]
var lowerDirs []string
for i, layer := range image.Layers {
path := layerPath(c.imageCacheRoot, layer.DiffID)
// Skip empty dirs - these can cause conflicts since they will always
// have the same digest, and also just add more overhead.
Expand All @@ -568,6 +573,33 @@ func (c *ociContainer) createRootfs(ctx context.Context) error {
continue
}
lowerDirs = append(lowerDirs, path)

// Overlayfs mount has a limit of 20 lowerdirs.
// Merge every 20 lower dir into a new overlayfs to avoid the limit
//
// When we reach the last layer, conditionally merge the remaining lower dirs
// so that we don't create an unnecessary overlayfs with only a few lower dirs.
//
// For example:
// If we have 21 layers, and the first 20 layers are merged into mount "merged0",
// we don't want to create a second "merged1" overlayfs mount that would only
// contain 1 remaining layer as lowerdir.
if len(lowerDirs) == 20 || (i == len(image.Layers)-1 && len(lowerDirs)+len(c.mergedMounts) > 20) {
merged, err := c.createMergedOverlayMount(ctx, lowerDirs)
if err != nil {
return err
}
c.mergedMounts = append(c.mergedMounts, merged)
lowerDirs = []string{}
}
}
if len(c.mergedMounts) != 0 {
lowerDirs = append(c.mergedMounts, lowerDirs...)
}
if len(lowerDirs) > 20 {
// overlayfs don't always error out when there are too many layers.
// TODO(sluongng): investigate why overlayfs would fail sometimes when there are too many layers.
log.CtxWarningf(ctx, "Image %q has too many layers (%d) for overlayfs", c.imageRef, len(lowerDirs))
}
// Create workdir and upperdir.
workdir := filepath.Join(c.bundlePath(), "tmp", "rootfs.work")
Expand All @@ -579,6 +611,11 @@ func (c *ociContainer) createRootfs(ctx context.Context) error {
return fmt.Errorf("create overlay upperdir: %w", err)
}

// overlayfs "lowerdir" mount args are ordered from uppermost to lowermost,
// but manifest layers are ordered from lowermost to uppermost. So we need to
// reverse the order before constructing the mount option.
slices.Reverse(lowerDirs)

// TODO: do this mount inside a namespace so that it gets removed even if
// the executor crashes (also needed for rootless support)

Expand All @@ -595,6 +632,30 @@ func (c *ociContainer) createRootfs(ctx context.Context) error {
return nil
}

func (c *ociContainer) createMergedOverlayMount(ctx context.Context, lowerDirs []string) (string, error) {
workdir := filepath.Join(c.bundlePath(), "tmp", fmt.Sprintf("merged%d.work", len(c.mergedMounts)))
if err := os.MkdirAll(workdir, 0755); err != nil {
return "", fmt.Errorf("create overlay workdir: %w", err)
}
upperdir := filepath.Join(c.bundlePath(), "tmp", fmt.Sprintf("merged%d.upper", len(c.mergedMounts)))
if err := os.MkdirAll(upperdir, 0755); err != nil {
return "", fmt.Errorf("create overlay upperdir: %w", err)
}
merged := filepath.Join(c.bundlePath(), "tmp", fmt.Sprintf("merged%d", len(c.mergedMounts)))
if err := os.MkdirAll(merged, 0755); err != nil {
return "", fmt.Errorf("create overlay merged: %w", err)
}
slices.Reverse(lowerDirs)
options := fmt.Sprintf(
"lowerdir=%s,upperdir=%s,workdir=%s,userxattr,volatile",
strings.Join(lowerDirs, ":"), upperdir, workdir)
log.CtxDebugf(ctx, "Mounting overlayfs to %q, options=%q", merged, options)
if err := syscall.Mount("none", merged, "overlay", 0, options); err != nil {
return "", fmt.Errorf("mount overlayfs: %w", err)
}
return merged, nil
}

func installBusybox(ctx context.Context, path string) error {
busyboxPath, err := exec.LookPath("busybox")
if err != nil {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,13 +1,20 @@
package ociruntime_test

import (
"archive/tar"
"bytes"
"context"
"crypto"
"encoding/base64"
"encoding/binary"
"encoding/hex"
"fmt"
"io"
"io/fs"
"log"
"math/rand/v2"
"net/http/httptest"
"net/url"
"os"
"os/exec"
"path/filepath"
Expand All @@ -34,6 +41,14 @@ import (
"github.com/buildbuddy-io/buildbuddy/server/util/status"
"github.com/buildbuddy-io/buildbuddy/server/util/testing/flags"
"github.com/buildbuddy-io/buildbuddy/server/util/uuid"
"github.com/google/go-containerregistry/pkg/name"
"github.com/google/go-containerregistry/pkg/registry"
v1 "github.com/google/go-containerregistry/pkg/v1"
"github.com/google/go-containerregistry/pkg/v1/layout"
"github.com/google/go-containerregistry/pkg/v1/mutate"
"github.com/google/go-containerregistry/pkg/v1/partial"
"github.com/google/go-containerregistry/pkg/v1/remote"
"github.com/google/go-containerregistry/pkg/v1/types"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"

Expand All @@ -42,9 +57,12 @@ import (
)

// Set via x_defs in BUILD file.
var crunRlocationpath string
var busyboxRlocationpath string
var testworkerRlocationpath string
var (
crunRlocationpath string
busyboxRlocationpath string
ociBusyboxRlocationpath string
testworkerRlocationpath string
)

func init() {
runtimePath, err := runfiles.Rlocation(crunRlocationpath)
Expand Down Expand Up @@ -922,6 +940,126 @@ func TestOverlayfsEdgeCases(t *testing.T) {
assert.Equal(t, 0, res.ExitCode)
}

// uncompressedLayer implements partial.UncompressedLayer from raw bytes.
type uncompressedLayer struct {
diffID v1.Hash
mediaType types.MediaType
content []byte
}

// DiffID implements partial.UncompressedLayer
func (ul *uncompressedLayer) DiffID() (v1.Hash, error) {
return ul.diffID, nil
}

// Uncompressed implements partial.UncompressedLayer
func (ul *uncompressedLayer) Uncompressed() (io.ReadCloser, error) {
return io.NopCloser(bytes.NewBuffer(ul.content)), nil
}

// MediaType returns the media type of the layer
func (ul *uncompressedLayer) MediaType() (types.MediaType, error) {
return ul.mediaType, nil
}

var _ partial.UncompressedLayer = (*uncompressedLayer)(nil)

func TestHighLayerCount(t *testing.T) {
// Load busybox oci image
busyboxPath, err := runfiles.Rlocation(ociBusyboxRlocationpath)
require.NoError(t, err)
idx, err := layout.ImageIndexFromPath(busyboxPath)
require.NoError(t, err)
m, err := idx.IndexManifest()
require.NoError(t, err)
require.Equal(t, 1, len(m.Manifests))
require.True(t, m.Manifests[0].MediaType.IsImage())
busyboxImg, err := idx.Image(m.Manifests[0].Digest)
require.NoError(t, err)

for _, tc := range []struct {
layerCount int
}{
{layerCount: 5},
{layerCount: 20},
{layerCount: 21},
{layerCount: 59},
} {
// Note that the "busybox" oci image has 1 layer
// and the following tests will add more layers on top of it.
t.Run(fmt.Sprintf("1And%dLayers", tc.layerCount), func(t *testing.T) {
// Create new layers on top of busybox
var lastContent string
var layers []v1.Layer
for i := range tc.layerCount {
lastContent = fmt.Sprintf("layer %d", i)
content := []byte(lastContent)

var b bytes.Buffer
hasher := crypto.SHA256.New()
mw := io.MultiWriter(&b, hasher)

tw := tar.NewWriter(mw)
err := tw.WriteHeader(&tar.Header{
Name: "a.txt",
Size: int64(len(content)),
Typeflag: tar.TypeReg,
})
require.NoError(t, err)
_, err = tw.Write(content)
require.NoError(t, err)
err = tw.Close()
require.NoError(t, err)

layer, err := partial.UncompressedToLayer(&uncompressedLayer{
diffID: v1.Hash{
Algorithm: "sha256",
Hex: hex.EncodeToString(hasher.Sum(make([]byte, 0, hasher.Size()))),
},
mediaType: types.OCILayer,
content: b.Bytes(),
})
require.NoError(t, err)
layers = append(layers, layer)
}
testImg, err := mutate.AppendLayers(busyboxImg, layers...)
require.NoError(t, err)

// Start registry and push our new image there
ts := httptest.NewServer(registry.New(registry.Logger(log.New(io.Discard, "", 0))))
url, err := url.Parse(ts.URL)
require.NoError(t, err)
imageRef := url.Host + "/foo:latest"
tag, err := name.NewTag(imageRef)
require.NoError(t, err)
err = remote.Write(tag, testImg)
require.NoError(t, err)

// Start container to verify content
setupNetworking(t)
ctx := context.Background()
env := testenv.GetTestEnv(t)
buildRoot := testfs.MakeTempDir(t)
provider, err := ociruntime.NewProvider(env, buildRoot)
require.NoError(t, err)
wd := testfs.MakeDirAll(t, buildRoot, "work")
c, err := provider.New(ctx, &container.Init{Props: &platform.Properties{
ContainerImage: imageRef,
}})
require.NoError(t, err)
cmd := &repb.Command{Arguments: []string{"sh", "-c", `cat /a.txt`}}
res := c.Run(ctx, cmd, wd, oci.Credentials{})
require.NoError(t, res.Error)
assert.Equal(t, lastContent, string(res.Stdout))
assert.Empty(t, string(res.Stderr))
assert.Equal(t, 0, res.ExitCode)

// Cleanup
require.NoError(t, c.Remove(ctx))
})
}
}

func TestPersistentWorker(t *testing.T) {
setupNetworking(t)

Expand Down

0 comments on commit f95d891

Please sign in to comment.