Skip to content

Commit

Permalink
ci: [CNI] Load testing for cilium cni (#1871)
Browse files Browse the repository at this point in the history
ci: [CNI] Load testing for cilium cni
  • Loading branch information
vipul-21 authored and jpayne3506 committed Sep 11, 2023
1 parent 434aa88 commit 06defa2
Show file tree
Hide file tree
Showing 6 changed files with 323 additions and 9 deletions.
118 changes: 118 additions & 0 deletions .pipelines/cni/cilium/cilium-cni-load-test.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
# Azure DevOps pipeline: load testing for the Cilium CNI on AKS.
# Flow: create cluster -> scale pods up/down -> validate endpoint state
#       -> cilium connectivity tests -> cleanup.
pr: none       # never triggered by pull requests
trigger: none  # manual / scheduled runs only

stages:
  - stage: creating_aks_cluster
    displayName: "Create AKS Cluster with Cilium"
    jobs:
      - job: create_aks_cluster_with_cilium
        steps:
          - task: AzureCLI@1
            inputs:
              azureSubscription: $(TEST_SUB_SERVICE_CONNECTION)
              scriptLocation: "inlineScript"
              scriptType: "bash"
              addSpnToEnvironment: true
              inlineScript: |
                set -ex
                make -C ./hack/swift azcfg AZCLI=az REGION=$(LOCATION)
                make -C ./hack/swift overlay-cilium-up AZCLI=az REGION=$(LOCATION) SUB=$(SUBSCRIPTION_ID) CLUSTER=${RESOURCE_GROUP} NODE_COUNT=10 VM_SIZE=Standard_DS4_v2
            name: "CreateAksCluster"
            displayName: "Create AKS Cluster"

  - stage: pod_deployment
    dependsOn: creating_aks_cluster
    displayName: "Pod Deployment"
    jobs:
      - job: deploy_pods
        steps:
          - task: AzureCLI@1
            displayName: "Pod Deployment"
            inputs:
              azureSubscription: $(TEST_SUB_SERVICE_CONNECTION)
              scriptLocation: "inlineScript"
              scriptType: "bash"
              addSpnToEnvironment: true
              inlineScript: |
                set -ex
                az extension add --name aks-preview
                make -C ./hack/swift set-kubeconf AZCLI=az CLUSTER=${RESOURCE_GROUP}
                bash hack/scripts/scale_deployment.sh

  - stage: validate_state
    dependsOn: pod_deployment
    displayName: "Validate State"
    jobs:
      - job: validate_state
        steps:
          - task: AzureCLI@1
            inputs:
              azureSubscription: $(TEST_SUB_SERVICE_CONNECTION)
              scriptLocation: "inlineScript"
              scriptType: "bash"
              addSpnToEnvironment: true
              inlineScript: |
                # fix: was the only inline script without `set -ex`; without -e a
                # failing set-kubeconf/kubectl would be masked by the last command's status.
                set -ex
                make -C ./hack/swift set-kubeconf AZCLI=az CLUSTER=${RESOURCE_GROUP}
                kubectl get pods -A
                bash hack/scripts/validate_state.sh
            name: "ValidateState"
            displayName: "Validate State"
            retryCountOnTaskFailure: 3

  - stage: connectivity_tests
    dependsOn: validate_state
    displayName: "Connectivity Tests"
    jobs:
      - job: cni_tests
        steps:
          - script: |
              echo "install cilium CLI"
              CILIUM_CLI_VERSION=v0.13.2
              CLI_ARCH=amd64
              curl -L --fail --remote-name-all https://github.com/cilium/cilium-cli/releases/download/${CILIUM_CLI_VERSION}/cilium-linux-${CLI_ARCH}.tar.gz{,.sha256sum}
              sha256sum --check cilium-linux-${CLI_ARCH}.tar.gz.sha256sum
              sudo tar xzvfC cilium-linux-${CLI_ARCH}.tar.gz /usr/local/bin
              rm cilium-linux-${CLI_ARCH}.tar.gz{,.sha256sum}
            name: "InstallCiliumCli"
            displayName: "Install Cilium CLI"
          - task: AzureCLI@1
            inputs:
              azureSubscription: $(TEST_SUB_SERVICE_CONNECTION)
              scriptLocation: "inlineScript"
              scriptType: "bash"
              addSpnToEnvironment: true
              inlineScript: |
                set -ex
                make -C ./hack/swift set-kubeconf AZCLI=az CLUSTER=${RESOURCE_GROUP}
            name: "GetCluster"
            displayName: "Get AKS Cluster"
          - script: |
              cilium connectivity test
            retryCountOnTaskFailure: 6
            name: "CiliumConnectivityTests"
            displayName: "Run Cilium Connectivity Tests"

  # NOTE(review): this stage only depends on connectivity_tests, so if an earlier
  # stage fails the cleanup stage is skipped and the cluster leaks — consider
  # depending on all stages (the task-level condition: always() only helps once
  # the stage itself runs). Left as-is to preserve behavior.
  - stage: delete
    displayName: "Delete Resources"
    dependsOn:
      - connectivity_tests
    jobs:
      - job: delete_resources
        steps:
          - task: AzureCLI@1
            inputs:
              azureSubscription: $(TEST_SUB_SERVICE_CONNECTION)
              scriptLocation: "inlineScript"
              scriptType: "bash"
              addSpnToEnvironment: true
              inlineScript: |
                set -ex
                # NOTE(review): this tests non-empty, not equality with "true" —
                # DELETE_RESOURCES=false would still delete. Confirm the intended
                # convention before tightening to [ "$(DELETE_RESOURCES)" = "true" ].
                if [ "$(DELETE_RESOURCES)" ]
                then
                  echo "Deleting Cluster and resource group"
                  make -C ./hack/swift set-kubeconf AZCLI=az CLUSTER=${RESOURCE_GROUP}
                  make -C ./hack/swift azcfg AZCLI=az REGION=$(LOCATION)
                  make -C ./hack/swift down AZCLI=az REGION=$(LOCATION) SUB=$(SUBSCRIPTION_ID) CLUSTER=${RESOURCE_GROUP}
                  echo "Cluster and resources down"
                else
                  echo "Deletion of resources is False"
                fi
            name: "CleanUpCluster"
            displayName: "Cleanup cluster"
            condition: always()
38 changes: 38 additions & 0 deletions hack/manifests/hostprocess.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# DaemonSet that runs one long-lived privileged pod per node.
# hack/scripts/validate_state.sh execs into these pods to read the azure-cns
# endpoint file from /var/run/azure-cns and to chroot into the host filesystem
# (e.g. to restart systemd-networkd).
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: privileged-daemonset
  namespace: kube-system
  labels:
    app: privileged-daemonset
spec:
  selector:
    matchLabels:
      app: privileged-daemonset
  template:
    metadata:
      labels:
        app: privileged-daemonset
    spec:
      hostNetwork: true  # share the node's network namespace
      hostPID: true      # see host processes (needed for chroot-based host commands)
      containers:
        - name: privileged-container
          image: mcr.microsoft.com/dotnet/runtime-deps:6.0
          command: ["/bin/sleep", "3650d"]  # keep the pod alive indefinitely (~10 years)
          securityContext:
            privileged: true
            runAsUser: 0  # root, required for chroot into the host mount
          volumeMounts:
            - mountPath: /var/run/azure-cns  # azure-cns state dir (azure-endpoints.json)
              name: azure-cns
            - mountPath: /host               # full host root filesystem, used via chroot
              name: host-root
      volumes:
        - name: azure-cns
          hostPath:
            path: /var/run/azure-cns
        - hostPath:
            path: /
            type: ""
          name: host-root
20 changes: 20 additions & 0 deletions hack/manifests/pod.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Load-test workload: scaled between 1 and 2400 replicas by
# hack/scripts/scale_deployment.sh so each replica consumes a pod IP/endpoint.
# `replicas` is intentionally omitted — the replica count is driven entirely
# by `kubectl scale`.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: container
  namespace: default
spec:
  selector:
    matchLabels:
      app: container
  template:
    metadata:
      labels:
        app: container
    spec:
      containers:
        - name: ubuntu  # NOTE(review): name says ubuntu but the image is pause — consider renaming
          image: mcr.microsoft.com/oss/kubernetes/pause:3.6
          imagePullPolicy: Always
          securityContext:
            privileged: true  # presumably not needed for the pause container — TODO confirm
37 changes: 37 additions & 0 deletions hack/scripts/scale_deployment.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
#!/bin/bash
# Load test: repeatedly scale the "container" deployment up to 2400 replicas
# and back down to 1, waiting for convergence each time, then leave it scaled
# up for the subsequent validation stage.
set -ex
kubectl apply -f hack/manifests/pod.yaml
kubectl apply -f hack/manifests/hostprocess.yaml
sleep 1m
total_num_of_run=4
scale_up_of_pods=2400
scale_down_pods=1
echo "Total num of run $total_num_of_run"

# check_deployment <replicas>
# Polls the deployment until .status.availableReplicas equals <replicas>.
# fix: the original loop had no bound and would hang the CI job forever if the
# deployment never converged; now fails loudly after max_attempts polls.
function check_deployment() {
    available=-1
    replicas="$1"
    max_attempts=720  # 720 * 5s = 1 hour upper bound per scale operation
    attempt=0
    while [ "${available}" -ne "${replicas}" ]; do
        if [ "${attempt}" -ge "${max_attempts}" ]; then
            echo "Timed out waiting for ${replicas} available replicas (last seen: ${available})" >&2
            kubectl get pods -A -o wide
            exit 1
        fi
        sleep 5s
        # availableReplicas is absent from the status until at least one pod is
        # available, so guard against an empty jsonpath result.
        current_available=$(kubectl get deployment container -o "jsonpath={.status.availableReplicas}" )
        if [ "$current_available" != '' ]; then
            available=$current_available
        fi
        echo "available replicas: ${available}"
        attempt=$((attempt+1))
    done
    echo "deployment complete."
}

for ((i=1; i <= total_num_of_run; i++))
do
    echo "Current Run: $i"
    echo "Scaling pods to : $scale_up_of_pods"
    kubectl scale deployment container --replicas "$scale_up_of_pods"
    check_deployment "$scale_up_of_pods"
    echo "Scaling down pods to : $scale_down_pods"
    kubectl scale deployment container --replicas "$scale_down_pods"
    check_deployment "$scale_down_pods"
done

# Finish scaled up so the validate_state stage inspects the fully loaded cluster.
kubectl scale deployment container --replicas "$scale_up_of_pods"
check_deployment "$scale_up_of_pods"
98 changes: 98 additions & 0 deletions hack/scripts/validate_state.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
#!/bin/bash

# find_in_array <space-separated list> <needle>
# Exit status 0 when <needle> equals one of the words in <list>, 1 otherwise.
function find_in_array() {
    local candidate
    for candidate in $1; do
        [ "$candidate" == "$2" ] && return 0
    done
    return 1
}

# For each node: cross-check the set of running-pod IPs against two sources of
# truth — the azure-cns endpoint file and the cilium agent's endpoint list —
# then restart systemd-networkd on the host to verify networking survives it.
# Exits 1 on the first mismatch.
for node in $(kubectl get nodes -o name);
do
    echo "Current : $node"
    node_name="${node##*/}"
    node_ip=$(kubectl get "$node" -o jsonpath='{$.status.addresses[?(@.type=="InternalIP")].address}')
    echo "Node internal ip: $node_ip"
    privileged_pod=$(kubectl get pods -n kube-system -l app=privileged-daemonset -o wide | grep "$node_name" | awk '{print $1}')
    echo "privileged pod : $privileged_pod"
    # Retry until a non-empty copy of the CNS endpoint file is fetched from the node.
    while ! [ -s "azure_endpoints.json" ]
    do
        echo "trying to get the azure_endpoints"
        kubectl exec -i "$privileged_pod" -n kube-system -- bash -c "cat /var/run/azure-cns/azure-endpoints.json" > azure_endpoints.json
        sleep 10
    done

    cilium_agent=$(kubectl get pod -l k8s-app=cilium -n kube-system -o wide | grep "$node_name" | awk '{print $1}')
    echo "cilium agent : $cilium_agent"

    # Retry until a non-empty cilium endpoint dump is fetched from the agent.
    while ! [ -s "cilium_endpoints.json" ]
    do
        echo "trying to get the cilium_endpoints"
        kubectl exec -i "$cilium_agent" -n kube-system -- bash -c "cilium endpoint list -o json" > cilium_endpoints.json
        sleep 10
    done

    total_pods=$(kubectl get pods --all-namespaces -o wide --field-selector spec.nodeName="$node_name",status.phase=Running --output json)

    echo "Checking if there are any pods with no ips"
    pods_with_no_ip=$(echo "$total_pods" | jq -j '(.items[] | select(.status.podIP == "" or .status.podIP == null))')
    if [ "$pods_with_no_ip" != "" ]; then
        echo "There are some pods with no ip assigned."
        kubectl get pods -A -o wide
        exit 1
    fi

    # Collect pod IPs, skipping host-networked pods (their podIP equals the node IP).
    total_pods_ips=$(echo "$total_pods" | jq -r '(.items[] | .status.podIP)')
    pod_ips=()
    num_of_pod_ips=0
    for ip in $total_pods_ips
    do
        if [ "$ip" != "$node_ip" ]; then
            pod_ips+=("$ip")
            num_of_pod_ips=$((num_of_pod_ips+1))
        fi
    done
    echo "Number of pods running with ip assigned $num_of_pod_ips"

    num_of_azure_endpoint_ips=$(jq -r '[.Endpoints | .[] | .IfnameToIPMap.eth0.IPv4[0].IP] | length' azure_endpoints.json)
    azure_endpoint_ips=$(jq -r '(.Endpoints | .[] | .IfnameToIPMap.eth0.IPv4[0].IP)' azure_endpoints.json)
    echo "Number of azure endpoint ips : $num_of_azure_endpoint_ips"

    if [ "$num_of_pod_ips" != "$num_of_azure_endpoint_ips" ]; then
        # fix: message had a typo ("ednpoint") and claimed "less than" while the
        # check is an inequality in either direction; also add a newline.
        printf "Error: Number of running pod ips does not match the ips in the azure endpoint file\n" >&2
        exit 1
    fi

    echo "checking the ips in the azure endpoints file"
    for ip in "${pod_ips[@]}"
    do
        # fix: dropped the spurious third argument — find_in_array takes (list, needle).
        if ! find_in_array "$azure_endpoint_ips" "$ip"; then
            printf "Error: %s Not found in the azure_endpoints.json\n" "$ip" >&2
            exit 1
        fi
    done

    num_of_cilium_endpoints=$(jq -r '[.[] | select(.status.networking.addressing[0].ipv4 != null)] | length' cilium_endpoints.json)
    cilium_endpoint_ips=$(jq -r '(.[] | select(.status.networking.addressing[0].ipv4 != null) | .status.networking.addressing[0].ipv4)' cilium_endpoints.json)
    echo "Number of cilium endpoints: $num_of_cilium_endpoints"

    if [ "$num_of_pod_ips" != "$num_of_cilium_endpoints" ]; then
        printf "Error: Number of running pod ips does not match the ips in the cilium endpoint file\n" >&2
        exit 1
    fi

    for ip in "${pod_ips[@]}"
    do
        if ! find_in_array "$cilium_endpoint_ips" "$ip"; then
            printf "Error: %s Not found in the cilium_endpoints.json\n" "$ip" >&2
            exit 1
        fi
    done

    # Restart systemd-networkd on the host (via chroot from the privileged pod);
    # the later connectivity-test stage confirms networking still works afterwards.
    # For details: https://github.com/cilium/cilium/issues/18706
    kubectl exec -i "$privileged_pod" -n kube-system -- bash -c "chroot /host /bin/bash -c 'systemctl restart systemd-networkd'"
    # Remove the per-node dumps so the next iteration fetches fresh state.
    rm -rf cilium_endpoints.json azure_endpoints.json
done
21 changes: 12 additions & 9 deletions hack/swift/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ AZCLI ?= docker run --rm -v $(AZCFG):/root/.azure -v $(KUBECFG):/root/.kube -v
REGION ?= westus2
OS_SKU ?= Ubuntu
VM_SIZE ?= Standard_B2s
NODE_COUNT ?= 2

# overrideable variables
SUB ?= $(AZURE_SUBSCRIPTION)
CLUSTER ?= $(USER)-$(REGION)
Expand Down Expand Up @@ -53,6 +55,7 @@ vars: ## Show the input vars configured for the cluster commands
@echo VNET=$(VNET)
@echo OS_SKU=$(OS_SKU)
@echo VM_SIZE=$(VM_SIZE)
@echo NODE_COUNT=$(NODE_COUNT)


##@ SWIFT Infra
Expand Down Expand Up @@ -81,8 +84,8 @@ up: swift-up ## Alias to swift-up

overlay-byocni-up: rg-up overlay-net-up ## Brings up an Overlay BYO CNI cluster
$(AZCLI) aks create -n $(CLUSTER) -g $(GROUP) -l $(REGION) \
--node-count 2 \
--node-vm-size Standard_B2s \
--node-count $(NODE_COUNT) \
--node-vm-size $(VM_SIZE) \
--load-balancer-sku basic \
--network-plugin none \
--network-plugin-mode overlay \
Expand All @@ -94,11 +97,11 @@ overlay-byocni-up: rg-up overlay-net-up ## Brings up an Overlay BYO CNI cluster

overlay-cilium-up: rg-up overlay-net-up ## Brings up an Overlay AzCNI cluster
$(AZCLI) aks create -n $(CLUSTER) -g $(GROUP) -l $(REGION) \
--node-count 2 \
--node-count $(NODE_COUNT) \
--node-vm-size $(VM_SIZE) \
--load-balancer-sku basic \
--network-plugin azure \
--enable-cilium-dataplane \
--network-dataplane cilium \
--network-plugin-mode overlay \
--pod-cidr 192.168.0.0/16 \
--vnet-subnet-id /subscriptions/$(SUB)/resourceGroups/$(GROUP)/providers/Microsoft.Network/virtualNetworks/$(VNET)/subnets/nodenet \
Expand All @@ -108,7 +111,7 @@ overlay-cilium-up: rg-up overlay-net-up ## Brings up an Overlay AzCNI cluster

overlay-up: rg-up overlay-net-up ## Brings up an Overlay AzCNI cluster
$(AZCLI) aks create -n $(CLUSTER) -g $(GROUP) -l $(REGION) \
--node-count 2 \
--node-count $(NODE_COUNT) \
--node-vm-size $(VM_SIZE) \
--load-balancer-sku basic \
--network-plugin azure \
Expand All @@ -121,7 +124,7 @@ overlay-up: rg-up overlay-net-up ## Brings up an Overlay AzCNI cluster

swift-byocni-up: rg-up swift-net-up ## Bring up a SWIFT BYO CNI cluster
$(AZCLI) aks create -n $(CLUSTER) -g $(GROUP) -l $(REGION) \
--node-count 2 \
--node-count $(NODE_COUNT) \
--node-vm-size $(VM_SIZE) \
--load-balancer-sku basic \
--network-plugin none \
Expand All @@ -134,11 +137,11 @@ swift-byocni-up: rg-up swift-net-up ## Bring up a SWIFT BYO CNI cluster

swift-cilium-up: rg-up swift-net-up ## Bring up a SWIFT Cilium cluster
$(AZCLI) aks create -n $(CLUSTER) -g $(GROUP) -l $(REGION) \
--node-count 2 \
--node-count $(NODE_COUNT) \
--node-vm-size $(VM_SIZE) \
--load-balancer-sku basic \
--network-plugin azure \
--enable-cilium-dataplane \
--network-dataplane cilium \
--aks-custom-headers AKSHTTPCustomFeatures=Microsoft.ContainerService/CiliumDataplanePreview \
--vnet-subnet-id /subscriptions/$(SUB)/resourceGroups/$(GROUP)/providers/Microsoft.Network/virtualNetworks/$(VNET)/subnets/nodenet \
--pod-subnet-id /subscriptions/$(SUB)/resourceGroups/$(GROUP)/providers/Microsoft.Network/virtualNetworks/$(VNET)/subnets/podnet \
Expand All @@ -148,7 +151,7 @@ swift-cilium-up: rg-up swift-net-up ## Bring up a SWIFT Cilium cluster

swift-up: rg-up swift-net-up ## Bring up a SWIFT AzCNI cluster
$(AZCLI) aks create -n $(CLUSTER) -g $(GROUP) -l $(REGION) \
--node-count 2 \
--node-count $(NODE_COUNT) \
--node-vm-size $(VM_SIZE) \
--load-balancer-sku basic \
--network-plugin azure \
Expand Down

0 comments on commit 06defa2

Please sign in to comment.