From fb618040462468aa007883d4788998f78a99487f Mon Sep 17 00:00:00 2001 From: Yifan Yuan Date: Tue, 6 Aug 2024 11:04:52 +0800 Subject: [PATCH] [Feat.] Support 'prioritize files' as a new prefetch mode See details in 'docs/trace-prefetch.md' Signed-off-by: Yifan Yuan --- README.md | 2 + cmd/ctr/record_trace.go | 174 +++++++++++++++++++++++----------------- docs/trace-prefetch.md | 41 ++++++++-- pkg/snapshot/overlay.go | 9 ++- 4 files changed, 145 insertions(+), 81 deletions(-) diff --git a/README.md b/README.md index 036fa99c..014c16e6 100644 --- a/README.md +++ b/README.md @@ -72,6 +72,8 @@ Accelerated Container Image is a __non-core__ sub-project of containerd. * See the [PERFORMANCE](docs/PERFORMANCE.md) test about the acceleration. +* Enabling the 'record-trace' function can achieve higher performance for entrypoints that need to read a large amount of data at container startup. See [ENABLE_TRACE](docs/trace-prefetch.md). + * See how to convert OCI image into overlaybd with specified file system at [MULTI_FS_SUPPORT](docs/MULTI_FS_SUPPORT.md). * See how to use layer deduplication for image conversion at [IMAGE_CONVERTOR](docs/IMAGE_CONVERTOR.md). diff --git a/cmd/ctr/record_trace.go b/cmd/ctr/record_trace.go index d44e0741..64211787 100644 --- a/cmd/ctr/record_trace.go +++ b/cmd/ctr/record_trace.go @@ -23,6 +23,7 @@ import ( "encoding/hex" "encoding/json" "fmt" + "io" "os" "os/exec" "os/signal" @@ -110,6 +111,11 @@ var recordTraceCommand = &cli.Command{ Usage: "record time in seconds. When time expires, a TERM signal will be sent to the task. 
The task might fail to respond signal if time is too short.", Value: 60, }, + &cli.StringFlag{ + Name: "priority_list", + Usage: "path of a file-list contains files to be prefetched", + Value: "", + }, &cli.StringFlag{ Name: "working-dir", Value: "/tmp/ctr-record-trace/", @@ -140,7 +146,12 @@ var recordTraceCommand = &cli.Command{ Value: "/opt/cni/bin/", }, }, - + Before: func(cliCtx *cli.Context) error { + if cliCtx.IsSet("priority_list") && cliCtx.Args().Len() > 2 { + return errors.New("command args and priority_list can't be set at the same time") + } + return nil + }, Action: func(cliCtx *cli.Context) (err error) { recordTime := time.Duration(cliCtx.Uint("time")) * time.Second if recordTime == 0 { @@ -206,89 +217,105 @@ var recordTraceCommand = &cli.Command{ if traceFd, err = os.Create(traceFile); err != nil { return errors.New("failed to create trace file") } - _ = traceFd.Close() defer os.Remove(traceFile) - - // Create lease - ctx, deleteLease, err := client.WithLease(ctx, - leases.WithID(uniqueObjectString()), - leases.WithExpiration(maxLeaseTime), - ) - if err != nil { - return errors.Wrap(err, "failed to create lease") - } - defer deleteLease(ctx) - - // Create isolated network - if !cliCtx.Bool("disable-network-isolation") { - networkNamespace = uniqueObjectString() - namespacePath = "/var/run/netns/" + networkNamespace - if err = exec.Command("ip", "netns", "add", networkNamespace).Run(); err != nil { - return errors.Wrapf(err, "failed to add netns") - } - defer func() { - if nextErr := exec.Command("ip", "netns", "delete", networkNamespace).Run(); err == nil && nextErr != nil { - err = errors.Wrapf(err, "failed to delete netns") - } - }() - cniObj, err := createIsolatedNetwork(cliCtx) + if !cliCtx.IsSet("priority_list") { + _ = traceFd.Close() + + // Create lease + ctx, deleteLease, err := client.WithLease(ctx, + leases.WithID(uniqueObjectString()), + leases.WithExpiration(maxLeaseTime), + ) if err != nil { - return err + return errors.Wrap(err, 
"failed to create lease") } - defer func() { - if nextErr := cniObj.Remove(ctx, networkNamespace, namespacePath); err == nil && nextErr != nil { - err = errors.Wrapf(nextErr, "failed to teardown network") + defer deleteLease(ctx) + + // Create isolated network + if !cliCtx.Bool("disable-network-isolation") { + networkNamespace = uniqueObjectString() + namespacePath = "/var/run/netns/" + networkNamespace + if err = exec.Command("ip", "netns", "add", networkNamespace).Run(); err != nil { + return errors.Wrapf(err, "failed to add netns") + } + defer func() { + if nextErr := exec.Command("ip", "netns", "delete", networkNamespace).Run(); err == nil && nextErr != nil { + err = errors.Wrapf(err, "failed to delete netns") + } + }() + cniObj, err := createIsolatedNetwork(cliCtx) + if err != nil { + return err + } + defer func() { + if nextErr := cniObj.Remove(ctx, networkNamespace, namespacePath); err == nil && nextErr != nil { + err = errors.Wrapf(nextErr, "failed to teardown network") + } + }() + if _, err = cniObj.Setup(ctx, networkNamespace, namespacePath); err != nil { + return errors.Wrapf(err, "failed to setup network for namespace") } - }() - if _, err = cniObj.Setup(ctx, networkNamespace, namespacePath); err != nil { - return errors.Wrapf(err, "failed to setup network for namespace") } - } - // Create container and run task - fmt.Println("Create container") - container, err := createContainer(ctx, client, cliCtx, image, traceFile) - if err != nil { - return err - } - defer container.Delete(ctx, containerd.WithSnapshotCleanup) + // Create container and run task + fmt.Println("Create container") + container, err := createContainer(ctx, client, cliCtx, image, traceFile) + if err != nil { + return err + } + defer container.Delete(ctx, containerd.WithSnapshotCleanup) - task, err := tasks.NewTask(ctx, client, container, "", nil, false, "", nil) - if err != nil { - return err - } - defer task.Delete(ctx) + task, err := tasks.NewTask(ctx, client, container, "", nil, false, 
"", nil) + if err != nil { + return err + } + defer task.Delete(ctx) - var statusC <-chan containerd.ExitStatus - if statusC, err = task.Wait(ctx); err != nil { - return err - } + var statusC <-chan containerd.ExitStatus + if statusC, err = task.Wait(ctx); err != nil { + return err + } - if err := task.Start(ctx); err != nil { - return err - } - fmt.Println("Task is running ...") + if err := task.Start(ctx); err != nil { + return err + } + fmt.Println("Task is running ...") - timer := time.NewTimer(recordTime) - watchStop := make(chan bool) + timer := time.NewTimer(recordTime) + watchStop := make(chan bool) - // Start a thread to watch timeout and signals - go watchThread(ctx, timer, task, watchStop) + // Start a thread to watch timeout and signals + go watchThread(ctx, timer, task, watchStop) - // Wait task stopped - status := <-statusC - if _, _, err := status.Result(); err != nil { - return errors.Wrapf(err, "failed to get exit status") - } + // Wait task stopped + status := <-statusC + if _, _, err := status.Result(); err != nil { + return errors.Wrapf(err, "failed to get exit status") + } - if timer.Stop() { - watchStop <- true - fmt.Println("Task finished before timeout ...") - } + if timer.Stop() { + watchStop <- true + fmt.Println("Task finished before timeout ...") + } - // Collect trace - if err = collectTrace(traceFile); err != nil { - return err + // Collect trace + if err = collectTrace(traceFile); err != nil { + return err + } + } else { + fmt.Println("Set priority list as acceleration layer") + defer traceFd.Close() + fn := cliCtx.String("priority_list") + inf, err := os.OpenFile(fn, os.O_RDONLY, 0644) + if err != nil { + fmt.Printf("failed to open priority list: %s", err.Error()) + return err + } + defer inf.Close() + _, err = io.Copy(traceFd, inf) + if err != nil { + return err + } } // Load trace file into content, and generate an acceleration layer @@ -455,22 +482,23 @@ func createImageWithAccelLayer(ctx context.Context, cs content.Store, 
oldManifes newManifest.Config = newConfigDesc newManifest.Layers = append(oldManifest.Layers, l.Desc) + imageMediaType := oldManifest.MediaType + // V2 manifest is not adopted in OCI spec yet, so follow the docker registry V2 spec here var newManifestV2 = struct { ocispec.Manifest MediaType string `json:"mediaType"` }{ Manifest: newManifest, - MediaType: images.MediaTypeDockerSchema2Manifest, + MediaType: imageMediaType, //images.MediaTypeDockerSchema2Manifest, } newManifestData, err := json.MarshalIndent(newManifestV2, "", " ") if err != nil { return emptyDesc, err } - newManifestDesc := ocispec.Descriptor{ - MediaType: images.MediaTypeDockerSchema2Manifest, + MediaType: imageMediaType, // images.MediaTypeDockerSchema2Manifest, Digest: digest.Canonical.FromBytes(newManifestData), Size: int64(len(newManifestData)), } diff --git a/docs/trace-prefetch.md b/docs/trace-prefetch.md index 7e5e6f03..8f871aef 100644 --- a/docs/trace-prefetch.md +++ b/docs/trace-prefetch.md @@ -8,15 +8,38 @@ There are many ways to do prefetch, for instance, we can simply read extra data Another way is to [prioritize files and use landmarks](https://github.com/containerd/stargz-snapshotter/blob/master/docs/stargz-estargz.md#prioritized-files-and-landmark-files), which is already adopted in Google's stargz. The storage engine runtime will prefetch the range where prioritized files are contained. And finally this information will be leveraged for increasing cache hit ratio and mitigating read overhead. -In this article we are about to introduce a new prefetch mechanism based on time sequenced I/O patterns (trace). This mechanism has been integrated as a feature into `ctr record-trace` command. +In this article we are about to introduce two prefetch modes in overlayBD. One is to set prioritized files, another is a new prefetch mechanism based on time sequenced I/O patterns (trace). +These two mechanisms have been integrated as a feature into `ctr record-trace` command. 
-## Trace Prefetch +## Prefetch Mode + +### Prioritize Files + +Setting prioritized files is a simple way to improve container's cold start time. It is suitable for cases where the target files need to be fully loaded. + +When the overlaybd device has been created, it will get prioritized files from the priority_list and analyze the filesystem via libext4 before mounting, then download the target files to overlaybd's cache. + +**Only images based on the EXT4 filesystem are supported.** + +The priority list is a simple text file, where each line contains a file path, as follows: +```bash +## cat /tmp/priority_list.txt +/usr/bin/containerd +/usr/bin/nerdctl +/opt/cni/dhcp +/opt/cni/vlan +``` + + +### Trace Prefetch Since every single I/O request happens on user's own filesystem will eventually be mapped into one overlaybd's layer blob, we can then record all I/Os from the layer blob's perspective, and replay them later. That's why we call it Trace Prefetch. Trace prefetch is time based, and it has greater granularity and predication accuracy than stargz. We don't mark a file, because user app might only need to read a small part of it in the beginning, simply prefetching the whole file would be less efficient. Instead, we replay the trace, by the exact I/O records that happened before. Each record contains only necessary information, such as the offset and length of the blob being read. -Trace is stored as an independent image layer, and MUST always be the uppermost one. Neither image manifest nor container snapshotter needs to know if it is a trace layer, snapshotter just downloads and extracts it as usual. The overlaybd backstore MUST recognize trace layer, and replay it accordingly. +**!! Note !!** + +Both the priority list and the I/O trace are stored as an independent image layer, and MUST always be the uppermost one. Neither image manifest nor container snapshotter needs to know if it is a trace layer, snapshotter just downloads and extracts it as usual. 
The overlaybd backstore MUST recognize trace layer, and replay it accordingly. ## Terminology @@ -42,14 +65,18 @@ After Recording and Pushing, users could pull and run the specific image somewhe The example usage of building a new image with trace layer would be as follows: ``` -bin/ctr rpull --download-blobs +bin/ctr rpull --download-blobs + +## trace prefetch +bin/ctr record-trace --time 20 -bin/ctr record-trace --time 20 +## prioritized files +bin/ctr record-trace --priority_list -ctr i push +ctr i push ``` -Note the `old_image` must be in overlaybd format. A temporary container will be created and do the recording. The recording progress will be terminated by either timeout, or user signals. +Note the `<old_image>` must be in overlaybd format. A temporary container will be created and do the recording. The recording progress will be terminated by either timeout, or user signals. Due to current limitations, this command might ask you remove the old image locally, in order to prepare a clean environment for the recording. diff --git a/pkg/snapshot/overlay.go b/pkg/snapshot/overlay.go index 0fbc8140..a83e2d8e 100644 --- a/pkg/snapshot/overlay.go +++ b/pkg/snapshot/overlay.go @@ -208,6 +208,13 @@ func NewSnapshotter(bootConfig *BootConfig, opts ...Opt) (snapshots.Snapshotter, return nil, err } + root, err := filepath.EvalSymlinks(bootConfig.Root) + if err != nil { + log.L.Errorf("invalid root: %s. (%s)", bootConfig.Root, err.Error()) + return nil, err + } + log.L.Infof("new snapshotter: root = %s", root) + metacopyOption := "" if _, err := os.Stat("/sys/module/overlay/parameters/metacopy"); err == nil { metacopyOption = "metacopy=on" @@ -224,7 +231,7 @@ func NewSnapshotter(bootConfig *BootConfig, opts ...Opt) (snapshots.Snapshotter, } return &snapshotter{ - root: bootConfig.Root, + root: root, rwMode: bootConfig.RwMode, ms: ms, indexOff: indexOff,