diff --git a/README.md b/README.md index 9f03e6257..b51d20c5b 100644 --- a/README.md +++ b/README.md @@ -98,12 +98,13 @@ Controller-level and node-level deployments will both have priorityClassName set As noted in [GCP PD documentation](https://cloud.google.com/kubernetes-engine/docs/how-to/persistent-volumes/gce-pd-csi-driver), `ext4` and `xfs` are officially supported. `btrfs` support is experimental: - As of writing, Ubuntu VM images support btrfs, but [COS does not](https://cloud.google.com/container-optimized-os/docs/concepts/supported-filesystems). -`btrfs` filesystem accepts two "special" mount options: +`btrfs` filesystem accepts the following "special" mount options and the sysfs paths they target: -- `btrfs-data-bg_reclaim_threshold` -- `btrfs-metadata-bg_reclaim_threshold` +- `btrfs-data-bg_reclaim_threshold`: `/sys/fs/btrfs/FS-UUID/allocation/data/bg_reclaim_threshold`. +- `btrfs-metadata-bg_reclaim_threshold`: `/sys/fs/btrfs/FS-UUID/allocation/metadata/bg_reclaim_threshold`. +- `btrfs-bdi-read_ahead_kb`: `/sys/fs/btrfs/FS-UUID/bdi/read_ahead_kb`. -Which writes to `/sys/fs/btrfs/FS-UUID/allocation/{,meta}data/bg_reclaim_threshold`, as documented [in btrfs docs](https://btrfs.readthedocs.io/en/latest/ch-sysfs.html#uuid-allocations-data-metadata-system). +See more [in btrfs docs](https://btrfs.readthedocs.io/en/latest/ch-sysfs.html#uuid-allocations-data-metadata-system). ## Further Documentation diff --git a/pkg/gce-pd-csi-driver/node.go b/pkg/gce-pd-csi-driver/node.go index 00f0b0859..9c55f27fa 100644 --- a/pkg/gce-pd-csi-driver/node.go +++ b/pkg/gce-pd-csi-driver/node.go @@ -124,12 +124,14 @@ const ( readAheadKBMountFlagRegexPattern = "^read_ahead_kb=(.+)$" btrfsReclaimDataRegexPattern = "^btrfs-allocation-data-bg_reclaim_threshold=(\\d{1,2})$" // 0-99 are valid, incl. 
00 btrfsReclaimMetadataRegexPattern = "^btrfs-allocation-metadata-bg_reclaim_threshold=(\\d{1,2})$" // ditto ^ + btrfsReadAheadKBRegexPattern = "^btrfs-bdi-read_ahead_kb=(\\d+)$" ) var ( readAheadKBMountFlagRegex = regexp.MustCompile(readAheadKBMountFlagRegexPattern) btrfsReclaimDataRegex = regexp.MustCompile(btrfsReclaimDataRegexPattern) btrfsReclaimMetadataRegex = regexp.MustCompile(btrfsReclaimMetadataRegexPattern) + btrfsReadAheadKBRegex = regexp.MustCompile(btrfsReadAheadKBRegexPattern) ) func getDefaultFsType() string { @@ -402,7 +404,7 @@ func (ns *GCENodeServer) NodeStageVolume(ctx context.Context, req *csi.NodeStage // Part 3: Mount device to stagingTargetPath fstype := getDefaultFsType() - var btrfsReclaimData, btrfsReclaimMetadata string + var btrfsReclaimData, btrfsReclaimMetadata, btrfsReadAheadKb string shouldUpdateReadAhead := false var readAheadKB int64 options := []string{} @@ -418,7 +420,7 @@ func (ns *GCENodeServer) NodeStageVolume(ctx context.Context, req *csi.NodeStage } if mnt.FsType == fsTypeBtrfs { - btrfsReclaimData, btrfsReclaimMetadata = extractBtrfsReclaimFlags(mnt.MountFlags) + btrfsReclaimData, btrfsReclaimMetadata, btrfsReadAheadKb = extractBtrfsFlags(mnt.MountFlags) } } else if blk := volumeCapability.GetBlock(); blk != nil { // Noop for Block NodeStageVolume @@ -465,47 +467,52 @@ func (ns *GCENodeServer) NodeStageVolume(ctx context.Context, req *csi.NodeStage } } - // Part 5: Update read_ahead + // Part 5: Update read_ahead for the block device if shouldUpdateReadAhead { if err := ns.updateReadAhead(devicePath, readAheadKB); err != nil { return nil, status.Errorf(codes.Internal, "failure updating readahead for %s to %dKB: %v", devicePath, readAheadKB, err.Error()) } } - // Part 6: if configured, write sysfs values + btrfsSysfs := map[string]string{} + + if btrfsReadAheadKb != "" { + btrfsSysfs["bdi/read_ahead_kb"] = btrfsReadAheadKb + } + if !readonly { - sysfs := map[string]string{} if btrfsReclaimData != "" { - 
sysfs["allocation/data/bg_reclaim_threshold"] = btrfsReclaimData + btrfsSysfs["allocation/data/bg_reclaim_threshold"] = btrfsReclaimData } if btrfsReclaimMetadata != "" { - sysfs["allocation/metadata/bg_reclaim_threshold"] = btrfsReclaimMetadata - } - - if len(sysfs) > 0 { - args := []string{"--match-tag", "UUID", "--output", "value", stagingTargetPath} - cmd := ns.Mounter.Exec.Command("blkid", args...) - var stderr bytes.Buffer - cmd.SetStderr(&stderr) - klog.V(4).Infof( - "running %q for volume %s", - strings.Join(append([]string{"blkid"}, args...), " "), - volumeID, - ) - uuid, err := cmd.Output() - if err != nil { - klog.Errorf("blkid failed for %s. stderr:\n%s", volumeID, stderr.String()) - return nil, status.Errorf(codes.Internal, "blkid failed: %v", err) - } - uuid = bytes.TrimRight(uuid, "\n") + btrfsSysfs["allocation/metadata/bg_reclaim_threshold"] = btrfsReclaimMetadata + } + } - for key, value := range sysfs { - path := fmt.Sprintf("%s/fs/btrfs/%s/%s", ns.SysfsPath, uuid, key) - if err := writeSysfs(path, value); err != nil { - return nil, status.Error(codes.Internal, err.Error()) - } - klog.V(4).Infof("NodeStageVolume set %s %s=%s", volumeID, key, value) + // Part 6: if configured, write sysfs values + if len(btrfsSysfs) > 0 { + args := []string{"--match-tag", "UUID", "--output", "value", stagingTargetPath} + cmd := ns.Mounter.Exec.Command("blkid", args...) + var stderr bytes.Buffer + cmd.SetStderr(&stderr) + klog.V(4).Infof( + "running %q for volume %s", + strings.Join(append([]string{"blkid"}, args...), " "), + volumeID, + ) + uuid, err := cmd.Output() + if err != nil { + klog.Errorf("blkid failed for %s. 
stderr:\n%s", volumeID, stderr.String()) + return nil, status.Errorf(codes.Internal, "blkid failed: %v", err) + } + uuid = bytes.TrimRight(uuid, "\n") + + for key, value := range btrfsSysfs { + path := fmt.Sprintf("%s/fs/btrfs/%s/%s", ns.SysfsPath, uuid, key) + if err := writeSysfs(path, value); err != nil { + return nil, status.Error(codes.Internal, err.Error()) } + klog.V(4).Infof("NodeStageVolume set %s %s=%s", volumeID, key, value) } } @@ -526,7 +533,6 @@ func writeSysfs(path, value string) (_err error) { if _, err := f.Write([]byte(value)); err != nil { return err } - return nil } @@ -546,16 +552,18 @@ func (ns *GCENodeServer) updateReadAhead(devicePath string, readAheadKB int64) e return nil } -func extractBtrfsReclaimFlags(mountFlags []string) (string, string) { - var reclaimData, reclaimMetadata string +func extractBtrfsFlags(mountFlags []string) (string, string, string) { + var reclaimData, reclaimMetadata, readAheadKb string for _, mountFlag := range mountFlags { if got := btrfsReclaimDataRegex.FindStringSubmatch(mountFlag); len(got) == 2 { reclaimData = got[1] } else if got := btrfsReclaimMetadataRegex.FindStringSubmatch(mountFlag); len(got) == 2 { reclaimMetadata = got[1] + } else if got := btrfsReadAheadKBRegex.FindStringSubmatch(mountFlag); len(got) == 2 { + readAheadKb = got[1] } } - return reclaimData, reclaimMetadata + return reclaimData, reclaimMetadata, readAheadKb } func extractReadAheadKBMountFlag(mountFlags []string) (int64, bool, error) { diff --git a/pkg/gce-pd-csi-driver/node_test.go b/pkg/gce-pd-csi-driver/node_test.go index 2ac1b9e2e..06d08563c 100644 --- a/pkg/gce-pd-csi-driver/node_test.go +++ b/pkg/gce-pd-csi-driver/node_test.go @@ -19,6 +19,7 @@ import ( "context" "fmt" "os" + "path" "path/filepath" "strings" "testing" @@ -625,17 +626,24 @@ func TestNodeStageVolume(t *testing.T) { defer os.RemoveAll(tempDir) stagingPath := filepath.Join(tempDir, defaultStagingPath) - btrfsUUID := "00000000-0000-0000-0000-000000000001" - btrfsPrefix := 
fmt.Sprintf("%s/sys/fs/btrfs/%s/allocation", tempDir, btrfsUUID) + var ( + btrfsUUID = "00000000-0000-0000-0000-000000000001" + btrfsPrefix = fmt.Sprintf("%s/sys/fs/btrfs/%s", tempDir, btrfsUUID) + btrfsFixtures = map[string]string{ + "allocation/data/bg_reclaim_threshold": "0\n", + "allocation/metadata/bg_reclaim_threshold": "0\n", + "bdi/read_ahead_kb": "4096\n", + } + ) - for _, suffix := range []string{"data", "metadata"} { - dir := btrfsPrefix + "/" + suffix + for fname, contents := range btrfsFixtures { + fullPath := btrfsPrefix + "/" + fname + dir := path.Dir(fullPath) if err := os.MkdirAll(dir, 0755); err != nil { t.Fatalf("Failed to set up fake sysfs dir %q: %v", dir, err) } - fname := dir + "/bg_reclaim_threshold" - if err := os.WriteFile(fname, []byte("0\n"), 0644); err != nil { - t.Fatalf("write %q: %v", fname, err) + if err := os.WriteFile(fullPath, []byte(contents), 0644); err != nil { + t.Fatalf("write %q: %v", fullPath, err) } } @@ -653,6 +661,7 @@ func TestNodeStageVolume(t *testing.T) { readAheadSectors string btrfsReclaimData string btrfsReclaimMetadata string + btrfsReadAheadKb string sectorSizeInBytes int expErrCode codes.Code }{ @@ -907,7 +916,7 @@ func TestNodeStageVolume(t *testing.T) { }, }, { - name: "Valid request, set btrfs-allocation-{,meta}data-bg_reclaim_threshold", + name: "Valid request, set btrfs props", req: &csi.NodeStageVolumeRequest{ VolumeId: volumeID, StagingTargetPath: stagingPath, @@ -918,6 +927,7 @@ func TestNodeStageVolume(t *testing.T) { MountFlags: []string{ "btrfs-allocation-data-bg_reclaim_threshold=90", "btrfs-allocation-metadata-bg_reclaim_threshold=91", + "btrfs-bdi-read_ahead_kb=128", }, }, }, @@ -931,6 +941,7 @@ func TestNodeStageVolume(t *testing.T) { readonlyBit: "0", btrfsReclaimData: "90", btrfsReclaimMetadata: "91", + btrfsReadAheadKb: "128", expCommandList: []fakeCmd{ { cmd: "blkid", @@ -1256,29 +1267,29 @@ func TestNodeStageVolume(t *testing.T) { if tc.expReadAheadUpdate == false && readAheadUpdateCalled 
== true { t.Fatalf("Test updated read ahead, but it was not expected.") } - if tc.btrfsReclaimData == "" && tc.btrfsReclaimMetadata == "" && blkidCalled { + if tc.btrfsReclaimData == "" && tc.btrfsReclaimMetadata == "" && tc.btrfsReadAheadKb == "" && blkidCalled { t.Fatalf("blkid was called, but was not expected.") } - if tc.btrfsReclaimData != "" { - fname := btrfsPrefix + "/data/bg_reclaim_threshold" - got, err := os.ReadFile(fname) - if err != nil { - t.Fatalf("read %q: %v", fname, err) - } - if s := strings.TrimSpace(string(got)); s != tc.btrfsReclaimData { - t.Fatalf("%q: expected %q, got %q", fname, tc.btrfsReclaimData, s) - } + btrfsProps := map[string]string{ + "/allocation/data/bg_reclaim_threshold": tc.btrfsReclaimData, + "/allocation/metadata/bg_reclaim_threshold": tc.btrfsReclaimMetadata, + "/bdi/read_ahead_kb": tc.btrfsReadAheadKb, } - if tc.btrfsReclaimMetadata != "" { - fname := btrfsPrefix + "/metadata/bg_reclaim_threshold" - got, err := os.ReadFile(fname) + + for fname, prop := range btrfsProps { + if prop == "" { + continue + } + + got, err := os.ReadFile(btrfsPrefix + fname) if err != nil { - t.Fatalf("read %q: %v", fname, err) + t.Fatalf("read %q: %v", btrfsPrefix+fname, err) } - if s := strings.TrimSpace(string(got)); s != tc.btrfsReclaimMetadata { - t.Fatalf("%q: expected %q, got %q", fname, tc.btrfsReclaimMetadata, s) + if s := strings.TrimSpace(string(got)); s != prop { + t.Fatalf("%q: expected %q, got %q", btrfsPrefix+fname, prop, s) } + } }) } diff --git a/pkg/gce-pd-csi-driver/utils.go b/pkg/gce-pd-csi-driver/utils.go index f8a4d0461..b0d3af954 100644 --- a/pkg/gce-pd-csi-driver/utils.go +++ b/pkg/gce-pd-csi-driver/utils.go @@ -306,18 +306,20 @@ func collectMountOptions(fsType string, mntFlags []string) []string { var options []string for _, opt := range mntFlags { + // The flags below are special flags that aren't + // passed directly as options to the mount command. 
if readAheadKBMountFlagRegex.FindString(opt) != "" { - // The read_ahead_kb flag is a special flag that isn't - // passed directly as an option to the mount command. continue } - if btrfsReclaimDataRegex.FindString(opt) != "" { continue } if btrfsReclaimMetadataRegex.FindString(opt) != "" { continue } + if btrfsReadAheadKBRegex.FindString(opt) != "" { + continue + } options = append(options, opt) }