[Feat.] Support 'prioritize files' as a new prefetch mode
See details in 'docs/trace-prefetch.md'

Signed-off-by: Yifan Yuan <[email protected]>
BigVan committed Aug 8, 2024
1 parent e4abacf commit fb61804
Showing 4 changed files with 145 additions and 81 deletions.
2 changes: 2 additions & 0 deletions README.md
@@ -72,6 +72,8 @@ Accelerated Container Image is a __non-core__ sub-project of containerd.

* See the [PERFORMANCE](docs/PERFORMANCE.md) test about the acceleration.

* Enabling the 'record-trace' function can achieve higher performance for entrypoints that need to read a large amount of data at container startup. See [ENABLE_TRACE](docs/trace-prefetch.md).

* See how to convert OCI image into overlaybd with specified file system at [MULTI_FS_SUPPORT](docs/MULTI_FS_SUPPORT.md).

* See how to use layer deduplication for image conversion at [IMAGE_CONVERTOR](docs/IMAGE_CONVERTOR.md).
174 changes: 101 additions & 73 deletions cmd/ctr/record_trace.go
@@ -23,6 +23,7 @@ import (
	"encoding/hex"
	"encoding/json"
	"fmt"
	"io"
	"os"
	"os/exec"
	"os/signal"
@@ -110,6 +111,11 @@ var recordTraceCommand = &cli.Command{
			Usage: "record time in seconds. When time expires, a TERM signal will be sent to the task. The task might fail to respond to the signal if the time is too short.",
			Value: 60,
		},
		&cli.StringFlag{
			Name:  "priority_list",
			Usage: "path of a file list whose entries are files to be prefetched",
			Value: "",
		},
		&cli.StringFlag{
			Name:  "working-dir",
			Value: "/tmp/ctr-record-trace/",
@@ -140,7 +146,12 @@ var recordTraceCommand = &cli.Command{
			Value: "/opt/cni/bin/",
		},
	},

	Before: func(cliCtx *cli.Context) error {
		// A recording workload (extra command args) and a static priority_list
		// are mutually exclusive prefetch modes.
		if cliCtx.IsSet("priority_list") && cliCtx.Args().Len() > 2 {
			return errors.New("command args and priority_list can't be set at the same time")
		}
		return nil
	},
	Action: func(cliCtx *cli.Context) (err error) {
		recordTime := time.Duration(cliCtx.Uint("time")) * time.Second
		if recordTime == 0 {
@@ -206,89 +217,105 @@ var recordTraceCommand = &cli.Command{
		if traceFd, err = os.Create(traceFile); err != nil {
			return errors.New("failed to create trace file")
		}
		defer os.Remove(traceFile)

		if !cliCtx.IsSet("priority_list") {
			_ = traceFd.Close()

			// Create lease
			ctx, deleteLease, err := client.WithLease(ctx,
				leases.WithID(uniqueObjectString()),
				leases.WithExpiration(maxLeaseTime),
			)
			if err != nil {
				return errors.Wrap(err, "failed to create lease")
			}
			defer deleteLease(ctx)

			// Create isolated network
			if !cliCtx.Bool("disable-network-isolation") {
				networkNamespace = uniqueObjectString()
				namespacePath = "/var/run/netns/" + networkNamespace
				if err = exec.Command("ip", "netns", "add", networkNamespace).Run(); err != nil {
					return errors.Wrapf(err, "failed to add netns")
				}
				defer func() {
					if nextErr := exec.Command("ip", "netns", "delete", networkNamespace).Run(); err == nil && nextErr != nil {
						err = errors.Wrapf(nextErr, "failed to delete netns")
					}
				}()
				cniObj, err := createIsolatedNetwork(cliCtx)
				if err != nil {
					return err
				}
				defer func() {
					if nextErr := cniObj.Remove(ctx, networkNamespace, namespacePath); err == nil && nextErr != nil {
						err = errors.Wrapf(nextErr, "failed to teardown network")
					}
				}()
				if _, err = cniObj.Setup(ctx, networkNamespace, namespacePath); err != nil {
					return errors.Wrapf(err, "failed to setup network for namespace")
				}
			}

			// Create container and run task
			fmt.Println("Create container")
			container, err := createContainer(ctx, client, cliCtx, image, traceFile)
			if err != nil {
				return err
			}
			defer container.Delete(ctx, containerd.WithSnapshotCleanup)

			task, err := tasks.NewTask(ctx, client, container, "", nil, false, "", nil)
			if err != nil {
				return err
			}
			defer task.Delete(ctx)

			var statusC <-chan containerd.ExitStatus
			if statusC, err = task.Wait(ctx); err != nil {
				return err
			}

			if err := task.Start(ctx); err != nil {
				return err
			}
			fmt.Println("Task is running ...")

			timer := time.NewTimer(recordTime)
			watchStop := make(chan bool)

			// Start a thread to watch timeout and signals
			go watchThread(ctx, timer, task, watchStop)

			// Wait task stopped
			status := <-statusC
			if _, _, err := status.Result(); err != nil {
				return errors.Wrapf(err, "failed to get exit status")
			}

			if timer.Stop() {
				watchStop <- true
				fmt.Println("Task finished before timeout ...")
			}

			// Collect trace
			if err = collectTrace(traceFile); err != nil {
				return err
			}
		} else {
			// Priority-list mode: copy the list verbatim into the trace file,
			// which becomes the content of the acceleration layer.
			fmt.Println("Set priority list as acceleration layer")
			defer traceFd.Close()
			fn := cliCtx.String("priority_list")
			inf, err := os.OpenFile(fn, os.O_RDONLY, 0644)
			if err != nil {
				fmt.Printf("failed to open priority list: %s\n", err.Error())
				return err
			}
			defer inf.Close()
			_, err = io.Copy(traceFd, inf)
			if err != nil {
				return err
			}
		}

		// Load trace file into content, and generate an acceleration layer
@@ -455,22 +482,23 @@ func createImageWithAccelLayer(ctx context.Context, cs content.Store, oldManifes
	newManifest.Config = newConfigDesc
	newManifest.Layers = append(oldManifest.Layers, l.Desc)

	imageMediaType := oldManifest.MediaType

	// V2 manifest is not adopted in OCI spec yet, so follow the docker registry V2 spec here
	var newManifestV2 = struct {
		ocispec.Manifest
		MediaType string `json:"mediaType"`
	}{
		Manifest:  newManifest,
		MediaType: imageMediaType, //images.MediaTypeDockerSchema2Manifest,
	}

	newManifestData, err := json.MarshalIndent(newManifestV2, "", " ")
	if err != nil {
		return emptyDesc, err
	}

	newManifestDesc := ocispec.Descriptor{
		MediaType: imageMediaType, // images.MediaTypeDockerSchema2Manifest,
		Digest:    digest.Canonical.FromBytes(newManifestData),
		Size:      int64(len(newManifestData)),
	}
41 changes: 34 additions & 7 deletions docs/trace-prefetch.md
@@ -8,15 +8,38 @@ There are many ways to do prefetch, for instance, we can simply read extra data

Another way is to [prioritize files and use landmarks](https://github.com/containerd/stargz-snapshotter/blob/master/docs/stargz-estargz.md#prioritized-files-and-landmark-files), which is already adopted in Google's stargz. The storage engine runtime will prefetch the ranges that contain prioritized files, and this information is then leveraged to increase the cache hit ratio and mitigate read overhead.

In this article we introduce two prefetch modes in overlaybd: one sets prioritized files; the other is a new prefetch mechanism based on time-sequenced I/O patterns (trace). Both mechanisms have been integrated into the `ctr record-trace` command.

## Prefetch Mode

### Prioritize Files

Setting prioritized files is a simple way to improve a container's cold-start time. It is suitable for cases where the target files need to be fully loaded.

When the overlaybd device has been created, it will read the prioritized files from the priority list and analyze the filesystem via libext4 before mounting, then download the target files into overlaybd's cache.

**Only images based on the EXT4 filesystem are supported.**

The priority list is a simple text file; each line contains one file path, as follows:
```bash
## cat /tmp/priority_list.txt
/usr/bin/containerd
/usr/bin/nerdctl
/opt/cni/dhcp
/opt/cni/vlan
```
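
Internally this mode is simple: the `--priority_list` file is copied verbatim into the trace file, which is then committed as the acceleration layer (see the `io.Copy` branch added to `cmd/ctr/record_trace.go` in this commit). Below is a minimal standalone sketch of that step; the paths are illustrative only.

```go
package main

import (
	"io"
	"log"
	"os"
)

// copyPriorityList copies the priority list verbatim into the trace file,
// which is later committed as the uppermost (acceleration) image layer.
func copyPriorityList(listPath, tracePath string) error {
	in, err := os.Open(listPath)
	if err != nil {
		return err
	}
	defer in.Close()

	out, err := os.Create(tracePath)
	if err != nil {
		return err
	}
	defer out.Close()

	_, err = io.Copy(out, in)
	return err
}

func main() {
	// Illustrative paths only.
	if err := copyPriorityList("/tmp/priority_list.txt", "/tmp/trace"); err != nil {
		log.Fatal(err)
	}
}
```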


### Trace Prefetch

Since every I/O request that happens on the user's own filesystem is eventually mapped onto one of overlaybd's layer blobs, we can record all I/Os from the layer blob's perspective and replay them later. That's why we call it Trace Prefetch.

Trace prefetch is time based, and it has finer granularity and higher prediction accuracy than stargz. We don't mark a whole file, because the user's app might only need to read a small part of it at startup; simply prefetching the whole file would be less efficient. Instead, we replay the trace, using the exact I/O records observed before. Each record contains only the necessary information, such as the offset and length of the read within the blob.
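
To make the replay idea concrete, here is a conceptual Go sketch; the real on-disk trace format is internal to the overlaybd backstore, so `traceRecord` and its layout are assumptions for illustration.

```go
package main

import (
	"fmt"
	"io"
	"strings"
)

// traceRecord is a hypothetical representation of one recorded read:
// only the offset and length within the layer blob are needed.
type traceRecord struct {
	Offset int64
	Length int64
}

// replay re-issues the recorded reads against the layer blob so the fetched
// ranges land in the local cache before the workload asks for them.
func replay(blob io.ReaderAt, records []traceRecord) error {
	for _, r := range records {
		buf := make([]byte, r.Length)
		if _, err := blob.ReadAt(buf, r.Offset); err != nil && err != io.EOF {
			return err
		}
	}
	return nil
}

func main() {
	// A strings.Reader stands in for the remote layer blob.
	blob := strings.NewReader("example layer blob content")
	recs := []traceRecord{{Offset: 0, Length: 7}, {Offset: 8, Length: 5}}
	if err := replay(blob, recs); err != nil {
		fmt.Println("replay failed:", err)
	}
}
```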

**!! Note !!**

Both the priority list and the I/O trace are stored as an independent image layer, which MUST always be the uppermost one. Neither the image manifest nor the container snapshotter needs to know whether it is a trace layer; the snapshotter just downloads and extracts it as usual. The overlaybd backstore MUST recognize the trace layer and replay it accordingly.

## Terminology

@@ -42,14 +65,18 @@ After Recording and Pushing, users could pull and run the specific image somewhe

Example usage for building a new image with a trace layer is as follows:
```
bin/ctr rpull --download-blobs <image>
## trace prefetch
bin/ctr record-trace --time 20 <image> <image_with_trace>
## prioritized files
bin/ctr record-trace --priority_list <path/to/filelist> <image> <image_with_trace>
ctr i push <image_with_trace>
```

Note that `<image>` must be in overlaybd format. A temporary container will be created to do the recording. The recording process is terminated either by timeout or by user signals.

Due to current limitations, this command might ask you to remove the old image locally, in order to prepare a clean environment for the recording.

9 changes: 8 additions & 1 deletion pkg/snapshot/overlay.go
@@ -208,6 +208,13 @@ func NewSnapshotter(bootConfig *BootConfig, opts ...Opt) (snapshots.Snapshotter,
		return nil, err
	}

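	// Resolve symlinks so the snapshotter always works on the canonical root
	// path (rationale assumed); EvalSymlinks also fails fast if the configured
	// root does not exist.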
	root, err := filepath.EvalSymlinks(bootConfig.Root)
	if err != nil {
		log.L.Errorf("invalid root: %s. (%s)", bootConfig.Root, err.Error())
		return nil, err
	}
	log.L.Infof("new snapshotter: root = %s", root)

	metacopyOption := ""
	if _, err := os.Stat("/sys/module/overlay/parameters/metacopy"); err == nil {
		metacopyOption = "metacopy=on"
@@ -224,7 +231,7 @@ func NewSnapshotter(bootConfig *BootConfig, opts ...Opt) (snapshots.Snapshotter,
	}

	return &snapshotter{
		root:     root,
		rwMode:   bootConfig.RwMode,
		ms:       ms,
		indexOff: indexOff,
