diff --git a/charts/aws-efs-csi-driver/templates/controller-deployment.yaml b/charts/aws-efs-csi-driver/templates/controller-deployment.yaml index 4753b16fa..573e9e33f 100644 --- a/charts/aws-efs-csi-driver/templates/controller-deployment.yaml +++ b/charts/aws-efs-csi-driver/templates/controller-deployment.yaml @@ -137,6 +137,12 @@ spec: {{- if hasKey .Values.controller "leaderElectionLeaseDuration" }} - --leader-election-lease-duration={{ .Values.controller.leaderElectionLeaseDuration }} {{- end }} + {{- if hasKey .Values.controller "workerThreads" }} + - --worker-threads={{ .Values.controller.workerThreads }} + {{- end }} + {{- if hasKey .Values.controller "timeout" }} + - --timeout={{ .Values.controller.timeout }} + {{- end }} {{- range .Values.sidecars.csiProvisioner.additionalArgs }} - {{ . }} {{- end }} diff --git a/charts/aws-efs-csi-driver/values.yaml b/charts/aws-efs-csi-driver/values.yaml index 89c637a67..d2abbf35f 100644 --- a/charts/aws-efs-csi-driver/values.yaml +++ b/charts/aws-efs-csi-driver/values.yaml @@ -124,6 +124,10 @@ controller: privileged: true leaderElectionRenewDeadline: 10s leaderElectionLeaseDuration: 15s + # Timeout for Create/DeleteVolume calls to the controller (rendered as --timeout). We recommend increasing it for high-concurrency workloads. + timeout: 15s + # Number of worker threads that process requests concurrently (rendered as --worker-threads).
+ workerThreads: 100 # TSCs without the label selector stanza # # Example: diff --git a/deploy/kubernetes/base/controller-deployment.yaml b/deploy/kubernetes/base/controller-deployment.yaml index dd4518e1f..aba50dc25 100644 --- a/deploy/kubernetes/base/controller-deployment.yaml +++ b/deploy/kubernetes/base/controller-deployment.yaml @@ -77,6 +77,8 @@ spec: - --feature-gates=Topology=true - --extra-create-metadata - --leader-election + - --worker-threads=100 + - --timeout=15s env: - name: ADDRESS value: /var/lib/csi/sockets/pluginproxy/csi.sock diff --git a/docs/README.md b/docs/README.md index 9307c35c9..79e0d5239 100644 --- a/docs/README.md +++ b/docs/README.md @@ -420,6 +420,15 @@ Before following the examples, you need to: * [Mount subpath](../examples/kubernetes/volume_path/README.md) * [Use Access Points](../examples/kubernetes/access_points/README.md) +## Resource limits +The controller container's memory and CPU requirements vary with workload scale, concurrency, and configuration. When configuring your controller with `delete-access-point-root-dir=true`, we recommend setting higher resource limits if your workload requires many concurrent volume deletions. For example, for a workload that requires 100 concurrent PVC deletions, we recommend setting a minimum CPU limit of 3000m and a minimum memory limit of 2.5 GiB. + +Alternatively, if you would prefer not to allocate these resources to your controller container, we advise reducing concurrency by lowering the `--worker-threads` argument of the [external-provisioner](https://github.com/kubernetes-csi/external-provisioner) (Helm value `controller.workerThreads`). + +## Timeouts +For most highly concurrent workloads, we recommend increasing the `--timeout` argument of the [external-provisioner](https://github.com/kubernetes-csi/external-provisioner) (Helm value `controller.timeout`) from the 15 seconds configured by default to 60 seconds. This helps avoid provisioning failures caused by throttling and resource contention in the controller container.
+ + ## Using botocore to retrieve mount target ip address when dns name cannot be resolved * Amazon EFS CSI driver supports using botocore to retrieve mount target ip address when dns name cannot be resolved, e.g., when user is mounting a file system in another VPC, botocore comes preinstalled on efs-csi-driver which can solve this DNS issue. * IAM policy prerequisites to use this feature :