diff --git a/examples/load-tests/.gitignore b/examples/load-tests/.gitignore new file mode 100644 index 0000000000000..35cb1fac2c4f3 --- /dev/null +++ b/examples/load-tests/.gitignore @@ -0,0 +1 @@ +identity.pem diff --git a/examples/load-tests/README.md b/examples/load-tests/README.md new file mode 100644 index 0000000000000..95de866bee962 --- /dev/null +++ b/examples/load-tests/README.md @@ -0,0 +1,90 @@ +# Teleport load-test resources + +## Introduction + +This directory contains: + +- [the `node-agent` helm chart](./node-agent) deploying Teleport ssh node load-test agents +- [the `tsh-bench-agent` helm chart](./tsh-bench-agent) deploying tsh bench session agents +- instructions to deploy a test Teleport cluster on EKS (in this README) + +Those charts and instructions are for Teleport internal development, +they are not part of the product and no support will be provided. + +## How to load-test Teleport deployed via the `teleport-cluster` Helm chart + +### Install tested cluster + +Start by creating a working cluster: + +- Create EKS cluster with the correct policies + [according to our EKS guide](https://goteleport.com/docs/ver/12.x/deploy-a-cluster/helm-deployments/aws/) +- Make sure EBS CSI addon is deployed +- Make sure the policy `AmazonEBSCSIDriverPolicy` is granted to the instance + role associated with the EKS nodegroups which are running your Kubernetes nodes. 
+- install cert-manager and create an issuer as instructed in the EKS guide + +Install the monitoring stack: + +```shell +# Add repos if you don't have them yet +helm repo add prometheus-community https://prometheus-community.github.io/helm-charts +helm repo update + +# Install the stack +helm install monitoring -n monitoring --create-namespace prometheus-community/kube-prometheus-stack -f values/kube-prometheus-stack.yaml +``` + +Generate a secret token: + +```bash +TOKEN=$(pwgen -n 30) +``` +Edit `values/teleport.yaml` (replace the placeholder values with your own), then install Teleport using the chart + +```shell +helm install teleport -n teleport --create-namespace --values values/teleport.yaml --set auth.teleportConfig.auth_service.tokens[0]="node:$TOKEN" +``` + +For v11 and below: +- edit the `teleport` configmap to add a static token and set `routing_strategy: most_recent` + ```yaml + auth_service: + routing_strategy: 'most_recent' + tokens: + - "node:$TOKEN" # Replace $TOKEN with your join token + ``` + +In the AWS Console, [change dynamoDB provision settings for "onDemand"](https://aws.amazon.com/blogs/aws/amazon-dynamodb-on-demand-no-capacity-planning-and-pay-per-request-pricing/). + +### Run test + +#### Run node agents + +To deploy 5000 ssh nodes, run the following command. A node is a teleport instance running only the `ssh_service`. + +``` +helm upgrade --install node-agents -n agents --create-namespace node-agent/ --values values/node-agents.yaml --set replicaCount=250 --set agentsPerPod=20 --set proxyServer=-lt.teleportdemo.net:443 --set joinParams.token_name=$TOKEN +``` + +This will deploy 250 pods running 20 Teleport SSH instances each; the instances are packed by pod because ENIs are limited on EKS and Kubernetes also limits the number of pods per node. 
+ +#### Run tsh-bench agents + +Create a user and get an identity (by default the identity is valid for 24 hours, make sure to refresh it or increase the TTL): + +Note: by default the user is named `joe`, you can change this by editing `user.yaml`. + +```bash +POD="$(kubectl get pods -n teleport -l app=teleport -o name | head -n 1 | sed 's@^pod/@@')" +kubectl exec -i -n teleport "$POD" -- tctl create -f < fixtures/user.yaml +kubectl exec -it -n teleport "$POD" -- tctl auth sign --user joe -o identity.pem +kubectl cp -n teleport "$POD:/identity.pem" ./fixtures/identity.pem +kubectl create -n agents secret generic tsh-bench-agents --from-file=identity.pem=./fixtures/identity.pem +``` + +Deploy the agent: + +```shell +helm upgrade --install tsh-bench-agents tsh-bench-agent/ -n agents --values values/tsh-bench-agents.yaml --set proxyServer=-lt.teleportdemo.net:443 --set joinParams.token_name=$TOKEN +``` diff --git a/examples/load-tests/fixtures/user.yaml b/examples/load-tests/fixtures/user.yaml new file mode 100644 index 0000000000000..e15c924905fcd --- /dev/null +++ b/examples/load-tests/fixtures/user.yaml @@ -0,0 +1,15 @@ +kind: user +version: v2 +metadata: + name: joe +spec: + roles: + - editor + - auditor + - access + traits: + logins: + - root + - ubuntu + - debian + diff --git a/examples/load-tests/node-agent/.helmignore b/examples/load-tests/node-agent/.helmignore new file mode 100644 index 0000000000000..0e8a0eb36f4ca --- /dev/null +++ b/examples/load-tests/node-agent/.helmignore @@ -0,0 +1,23 @@ +# Patterns to ignore when building packages. +# This supports shell glob matching, relative path matching, and +# negation (prefixed with !). Only one pattern per line. 
+.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*.orig +*~ +# Various IDEs +.project +.idea/ +*.tmproj +.vscode/ diff --git a/examples/load-tests/node-agent/Chart.yaml b/examples/load-tests/node-agent/Chart.yaml new file mode 100644 index 0000000000000..345de8f837521 --- /dev/null +++ b/examples/load-tests/node-agent/Chart.yaml @@ -0,0 +1,9 @@ +apiVersion: v2 +name: node-agent +description: Deploys node load-test agents (Teleport nodes running SSH Service) + +type: application + +version: 0.1.0 + +appVersion: "12.0.0-dev" diff --git a/examples/load-tests/node-agent/templates/config.yaml b/examples/load-tests/node-agent/templates/config.yaml new file mode 100644 index 0000000000000..07958a06d654c --- /dev/null +++ b/examples/load-tests/node-agent/templates/config.yaml @@ -0,0 +1,36 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ .Release.Name }} + namespace: {{ .Release.Namespace }} +data: + teleport.yaml: |2 + version: v3 + teleport: + log: + severity: DEBUG + storage: + type: dir + {{- if .Values.authServer }} + auth_server: {{ .Values.authServer }} + {{- end }} + {{- if .Values.proxyServer }} + proxy_server: {{ .Values.proxyServer }} + {{- end }} + join_params: {{- toYaml .Values.joinParams | nindent 8 }} + auth_service: + enabled: false + proxy_service: + enabled: false + ssh_service: + enabled: true + # listen_addr set at runtime to avoid conflicts in the same pod + # listen_addr: 0.0.0.0:3022 + entrypoint.sh: |2 + #!/bin/bash + set -euxo pipefail + cp /etc/teleport-config/teleport.yaml /etc/teleport.yaml + echo " listen_addr: '0.0.0.0:30$REPLICA'" >> /etc/teleport.yaml + HOST="$(hostname)-$REPLICA" + cat /etc/teleport.yaml + exec teleport start -c /etc/teleport.yaml --nodename $HOST diff --git a/examples/load-tests/node-agent/templates/deployment.yaml b/examples/load-tests/node-agent/templates/deployment.yaml new file mode 100644 index 
0000000000000..8a8da4b2a8652 --- /dev/null +++ b/examples/load-tests/node-agent/templates/deployment.yaml @@ -0,0 +1,43 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + namespace: agents + name: {{ .Release.Name }} +spec: + replicas: {{ .Values.replicaCount }} + minReadySeconds: {{ .Values.minReadySeconds }} + selector: + matchLabels: + app.kubernetes.io/name: {{ .Release.Name }} + template: + metadata: + labels: + app.kubernetes.io/name: {{ .Release.Name }} + spec: + serviceAccountName: {{ .Release.Name }} + containers: + {{- range $i, $_ := until (int .Values.agentsPerPod) }} + {{- $id := printf "%02d" $i }} + - image: "{{ $.Values.image.repository }}:{{ default $.Chart.AppVersion $.Values.image.tag }}" + name: agent-{{ $id }} + command: ["bash", "/etc/teleport-config/entrypoint.sh"] + env: + - name: REPLICA + value: "{{ $id }}" + volumeMounts: + - mountPath: /etc/teleport-config + name: config + readOnly: true + resources: {{- toYaml $.Values.resources | nindent 12 }} + {{- end }} + volumes: + - configMap: + name: {{ .Release.Name }} + defaultMode: 0766 + name: config + {{- if .Values.tolerations }} + tolerations: {{ toYaml .Values.tolerations | nindent 8}} + {{- end }} + {{- if .Values.affinity }} + affinity: {{ toYaml .Values.affinity | nindent 8}} + {{- end }} diff --git a/examples/load-tests/node-agent/templates/serviceaccount.yaml b/examples/load-tests/node-agent/templates/serviceaccount.yaml new file mode 100644 index 0000000000000..607d85bce1843 --- /dev/null +++ b/examples/load-tests/node-agent/templates/serviceaccount.yaml @@ -0,0 +1,7 @@ +{{- if .Values.serviceAccount.create -}} +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{ .Release.Name }} + namespace: {{ .Release.Namespace }} +{{- end }} diff --git a/examples/load-tests/node-agent/values.yaml b/examples/load-tests/node-agent/values.yaml new file mode 100644 index 0000000000000..2754aaf58554a --- /dev/null +++ b/examples/load-tests/node-agent/values.yaml @@ -0,0 +1,32 @@ 
+replicaCount: 1 +agentsPerPod: 10 +proxyServer: "" +authServer: "" + +minReadySeconds: 0 + +image: + repository: public.ecr.aws/gravitational/teleport + pullPolicy: IfNotPresent + tag: "" + +serviceAccount: + create: true + +joinParams: + # the kubernetes join method is not currently suited for joining a large number of nodes in a short time + method: token + # DO NOT USE THIS IN PRODUCTION + token_name: qwertyuiop + +# Applied per agent (not per-pod) +resources: + limits: + memory: 150Mi + requests: + cpu: 20m + memory: 150Mi + +tolerations: [] + +affinity: {} diff --git a/examples/load-tests/podmonitor.yaml b/examples/load-tests/podmonitor.yaml new file mode 100644 index 0000000000000..fd098030b364b --- /dev/null +++ b/examples/load-tests/podmonitor.yaml @@ -0,0 +1,17 @@ +# This resource is only required for pre-v12 `teleport-cluster` Helm chart +apiVersion: monitoring.coreos.com/v1 +kind: PodMonitor +metadata: + name: teleport + namespace: teleport +spec: + jobLabel: app + namespaceSelector: + matchNames: + - teleport + selector: + matchLabels: + app: teleport + podMetricsEndpoints: + - port: diag + path: /metrics diff --git a/examples/load-tests/tsh-bench-agent/.helmignore b/examples/load-tests/tsh-bench-agent/.helmignore new file mode 100644 index 0000000000000..0e8a0eb36f4ca --- /dev/null +++ b/examples/load-tests/tsh-bench-agent/.helmignore @@ -0,0 +1,23 @@ +# Patterns to ignore when building packages. +# This supports shell glob matching, relative path matching, and +# negation (prefixed with !). Only one pattern per line. 
+.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*.orig +*~ +# Various IDEs +.project +.idea/ +*.tmproj +.vscode/ diff --git a/examples/load-tests/tsh-bench-agent/Chart.yaml b/examples/load-tests/tsh-bench-agent/Chart.yaml new file mode 100644 index 0000000000000..69115468689d6 --- /dev/null +++ b/examples/load-tests/tsh-bench-agent/Chart.yaml @@ -0,0 +1,9 @@ +apiVersion: v2 +name: tsh-bench-agents +description: Deploys load-tests agents running `tsh bench sessions`. + +type: application + +version: 0.1.0 + +appVersion: "12.0.0-dev" diff --git a/examples/load-tests/tsh-bench-agent/templates/deployment.yaml b/examples/load-tests/tsh-bench-agent/templates/deployment.yaml new file mode 100644 index 0000000000000..4d58754edd86c --- /dev/null +++ b/examples/load-tests/tsh-bench-agent/templates/deployment.yaml @@ -0,0 +1,66 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + namespace: {{ .Release.Namespace }} + name: {{ .Release.Name }} +spec: + replicas: {{ .Values.replicaCount }} + minReadySeconds: {{ .Values.minReadySeconds }} + selector: + matchLabels: + app.kubernetes.io/name: {{ .Release.Name }} + template: + metadata: + labels: + app.kubernetes.io/name: {{ .Release.Name }} + spec: + # NAT-ing usually causes issues when load-testing + hostNetwork: true + containers: + - image: "{{ $.Values.image.repository }}:{{ default $.Chart.AppVersion $.Values.image.tag }}" + name: tsh-bench + command: + - tsh + - "--proxy={{ .Values.proxyServer }}" + - "-i" + - "/mnt/identity.pem" + - "bench" + - "sessions" + - "--max={{.Values.sessionsPerAgent}}" + - "root" + {{- toYaml .Values.command | nindent 12 }} + volumeMounts: + - mountPath: /mnt + name: identity + readOnly: true + resources: {{- toYaml $.Values.resources | nindent 12 }} + {{- if .Values.webSessions }} + - image: "{{ $.Values.image.repository }}:{{ default $.Chart.AppVersion $.Values.image.tag }}" + name: tsh-bench-web + 
command: + - tsh + - "--proxy={{ .Values.proxyServer }}" + - "-i" + - "/mnt/identity.pem" + - "bench" + - "sessions" + - "--max={{.Values.sessionsPerAgent}}" + - "--web" + {{- toYaml .Values.command | nindent 12 }} + volumeMounts: + - mountPath: /mnt + name: identity + readOnly: true + resources: {{- toYaml $.Values.resources | nindent 12 }} + {{- end }} + volumes: + - secret: + secretName: {{ .Release.Name }} + optional: false + name: identity + {{- if .Values.tolerations }} + tolerations: {{ toYaml .Values.tolerations | nindent 8}} + {{- end }} + {{- if .Values.affinity }} + affinity: {{ toYaml .Values.affinity | nindent 8}} + {{- end }} diff --git a/examples/load-tests/tsh-bench-agent/values.yaml b/examples/load-tests/tsh-bench-agent/values.yaml new file mode 100644 index 0000000000000..930f868e41ac7 --- /dev/null +++ b/examples/load-tests/tsh-bench-agent/values.yaml @@ -0,0 +1,32 @@ +replicaCount: 1 +sessionsPerAgent: 1000 +proxyServer: "" + +minReadySeconds: 0 + +# Web sessions are untested +# In theory it should work, but you need to find a way to pass the OTP challenge without an interactive terminal +webSessions: false + +command: + - "sh" + - "-c" + - "while ls; do sleep 5; done" + +image: + repository: public.ecr.aws/gravitational/teleport + pullPolicy: IfNotPresent + # This tag contains `tsh bench sessions` + tag: "10.1.4-bench.2" + +# Applied per agent +resources: + limits: + memory: 3Gi + requests: + cpu: "4" + memory: 3Gi + +tolerations: [] + +affinity: {} diff --git a/examples/load-tests/values/kube-prometheus-stack.yaml b/examples/load-tests/values/kube-prometheus-stack.yaml new file mode 100644 index 0000000000000..6c89a751fa880 --- /dev/null +++ b/examples/load-tests/values/kube-prometheus-stack.yaml @@ -0,0 +1,22 @@ +prometheus: + prometheusSpec: + scrapeInterval: 15s + retention: 30d + resources: + requests: + memory: 16Gi + cpu: "4" + limits: + memory: 16Gi + # cpu: 4 + storageSpec: + volumeClaimTemplate: + spec: + accessModes: ["ReadWriteOnce"] 
+ resources: + requests: + storage: 50Gi + podMonitorSelectorNilUsesHelmValues: false + serviceMonitorSelectorNilUsesHelmValues: false + + diff --git a/examples/load-tests/values/node-agents.yaml b/examples/load-tests/values/node-agents.yaml new file mode 100644 index 0000000000000..86076fec5adec --- /dev/null +++ b/examples/load-tests/values/node-agents.yaml @@ -0,0 +1,29 @@ +replicaCount: 30 +agentsPerPod: 15 + +minReadySeconds: 30 + +image: + # TODO: remove after v12 is released + tag: 12.0.0-hugochartsplit.2 + +joinParams: + method: token + token_name: "qwertyuiop" + +tolerations: +- key: "role" + operator: "Equal" + value: "agent" + effect: "NoSchedule" + +affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: role + operator: In + values: + - agent + diff --git a/examples/load-tests/values/teleport.yaml b/examples/load-tests/values/teleport.yaml new file mode 100644 index 0000000000000..67ed4b2982062 --- /dev/null +++ b/examples/load-tests/values/teleport.yaml @@ -0,0 +1,37 @@ +chartMode: aws +clusterName: -lt.teleportdemo.net +aws: + region: us-east-1 + backendTable: -teleport-helm-backend + auditLogTable: -teleport-helm-events + auditLogMirrorOnStdout: false + sessionRecordingBucket: -teleport-helm-sessions + backups: false + dynamoAutoScaling: false + +highAvailability: + replicaCount: 2 + certManager: + enabled: true + issuerName: letsencrypt-production + +resources: + limits: + memory: 8Gi + requests: + cpu: 4 + memory: 8Gi + +podMonitor: + enabled: true + interval: "" + +# TODO: change after v12 is released +teleportVersionOverride: 12.0.0-hugochartsplit.2 + +auth: + teleportConfig: + auth_service: + routing_strategy: 'most_recent' + # tokens: + # - "node:qwertyuiop" diff --git a/examples/load-tests/values/tsh-bench-agents.yaml b/examples/load-tests/values/tsh-bench-agents.yaml new file mode 100644 index 0000000000000..e5b73902f5442 --- /dev/null +++ 
b/examples/load-tests/values/tsh-bench-agents.yaml @@ -0,0 +1,20 @@ +replicaCount: 1 +sessionsPerAgent: 500 + +webSessions: false + +tolerations: +- key: "role" + operator: "Equal" + value: "agent" + effect: "NoSchedule" + +affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: role + operator: In + values: + - agent