Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Automate remaining graceful recovery tests #2140

Merged
Merged
Show file tree
Hide file tree
Changes from 31 commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
28c753d
Automate remaining graceful recovery tests
bjee19 Jun 12, 2024
6b039b8
Remove debugging statements
bjee19 Jun 17, 2024
0ea4682
Run pipeline with failing test
bjee19 Jun 17, 2024
1971d96
Revert the test to make it work
bjee19 Jun 17, 2024
4b31588
Run pipeline
bjee19 Jun 20, 2024
7f52961
Run pipeline with failing test
bjee19 Jun 20, 2024
ea8fa3d
Remove manual test document
bjee19 Jun 20, 2024
b104e2b
Run pipeline
bjee19 Jun 20, 2024
f7932bb
Add separate functions for draining and abrupt restart tests
bjee19 Jun 20, 2024
9ad9fb8
Add review feedback
bjee19 Jun 20, 2024
23f6ff0
Refactor docker inspect command
bjee19 Jun 21, 2024
d145601
Use cluster name passed in from flag
bjee19 Jun 21, 2024
5c26b39
Add cluster name flag to right make command
bjee19 Jun 21, 2024
a19bce4
Correct flag name
bjee19 Jun 21, 2024
e752d21
Remove comments
bjee19 Jun 21, 2024
dd25200
Rebase with fixes to skipped failing tests
bjee19 Jun 24, 2024
c79b644
Teardown NGF between each test
bjee19 Jun 24, 2024
3d2151b
Run pipeline
bjee19 Jun 24, 2024
71affc5
Use BeEmpty instead of empty string
bjee19 Jun 24, 2024
7fe4691
Remove functional label
bjee19 Jun 24, 2024
53a6529
Add stable readiness check
bjee19 Jun 25, 2024
43597ad
Add back in extended timeout
bjee19 Jun 25, 2024
666f35f
Extend timout duration for waiting on NGF
bjee19 Jun 25, 2024
13ae858
Add skip if test is running on GKE
bjee19 Jun 25, 2024
88ac69a
Increase stable readiness count
bjee19 Jun 25, 2024
1a7027f
Adjust readiness count
bjee19 Jun 25, 2024
4c305d0
Update comment
bjee19 Jun 26, 2024
bb392b1
Add nil check for clusterName
bjee19 Jun 26, 2024
9769303
Adjust wording on error
bjee19 Jun 26, 2024
18d153b
Add MustPassRepeatedly to Eventually check
bjee19 Jun 26, 2024
9370b25
Move checks on clusterName earlier
bjee19 Jun 26, 2024
df28aa1
Re-run pipeline
bjee19 Jun 26, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions .github/workflows/functional.yml
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,13 @@ jobs:
make test${{ inputs.image == 'plus' && '-with-plus' || ''}} PREFIX=${ngf_prefix} TAG=${ngf_tag} GINKGO_LABEL=telemetry GW_SERVICE_TYPE=LoadBalancer
working-directory: ./tests

- name: Run functional graceful-recovery tests
run: |
ngf_prefix=ghcr.io/nginxinc/nginx-gateway-fabric
ngf_tag=${{ steps.ngf-meta.outputs.version }}
make test${{ inputs.image == 'plus' && '-with-plus' || ''}} PREFIX=${ngf_prefix} TAG=${ngf_tag} GINKGO_LABEL=graceful-recovery GW_SERVICE_TYPE=LoadBalancer CLUSTER_NAME=${{ github.run_id }}
working-directory: ./tests

- name: Run functional tests
run: |
ngf_prefix=ghcr.io/nginxinc/nginx-gateway-fabric
Expand Down
2 changes: 1 addition & 1 deletion tests/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ test: ## Runs the functional tests on your default k8s cluster
--image-tag=$(TAG) --version-under-test=$(NGF_VERSION) \
--plus-enabled=$(PLUS_ENABLED) --ngf-image-repo=$(PREFIX) --nginx-image-repo=$(NGINX_PREFIX) --nginx-plus-image-repo=$(NGINX_PLUS_PREFIX) \
--pull-policy=$(PULL_POLICY) --service-type=$(GW_SERVICE_TYPE) \
--is-gke-internal-lb=$(GW_SVC_GKE_INTERNAL)
--is-gke-internal-lb=$(GW_SVC_GKE_INTERNAL) --cluster-name=$(CLUSTER_NAME)

.PHONY: test-with-plus
test-with-plus: PLUS_ENABLED=true
Expand Down
101 changes: 0 additions & 101 deletions tests/graceful-recovery/graceful-recovery.md

This file was deleted.

187 changes: 148 additions & 39 deletions tests/suite/graceful_recovery_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import (
"errors"
"fmt"
"net/http"
"os/exec"
"strings"
"time"

Expand All @@ -15,6 +16,7 @@ import (
core "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
ctlr "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/yaml"

Expand All @@ -28,7 +30,7 @@ const (

// Since checkContainerLogsForErrors may experience interference from previous tests (as explained in the function
// documentation), this test is recommended to be run separate from other tests.
var _ = Describe("Graceful Recovery test", Ordered, Label("functional", "graceful-recovery"), func() {
var _ = Describe("Graceful Recovery test", Ordered, Label("graceful-recovery"), func() {
files := []string{
"graceful-recovery/cafe.yaml",
"graceful-recovery/cafe-secret.yaml",
Expand All @@ -45,10 +47,11 @@ var _ = Describe("Graceful Recovery test", Ordered, Label("functional", "gracefu

var ngfPodName string

BeforeAll(func() {
BeforeEach(func() {
// this test is unique in that it will check the entire log of both ngf and nginx containers
kate-osborn marked this conversation as resolved.
Show resolved Hide resolved
// for any errors, so in order to avoid errors generated in previous tests we will uninstall
// NGF installed at the suite level, then re-deploy our own
// NGF installed at the suite level, then re-deploy our own. We will also uninstall and re-install
// NGF between each graceful-recovery test for the same reason.
teardown(releaseName)

setup(getDefaultSetupCfg())
Expand All @@ -64,9 +67,7 @@ var _ = Describe("Graceful Recovery test", Ordered, Label("functional", "gracefu
if portFwdHTTPSPort != 0 {
teaURL = fmt.Sprintf("%s:%d/tea", baseHTTPSURL, portFwdHTTPSPort)
}
})

BeforeEach(func() {
ns = core.Namespace{
ObjectMeta: metav1.ObjectMeta{
Name: "graceful-recovery",
Expand Down Expand Up @@ -98,8 +99,97 @@ var _ = Describe("Graceful Recovery test", Ordered, Label("functional", "gracefu
It("recovers when nginx container is restarted", func() {
runRecoveryTest(teaURL, coffeeURL, ngfPodName, nginxContainerName, files, &ns)
})

It("recovers when drained node is restarted", func() {
runRestartNodeWithDrainingTest(teaURL, coffeeURL, files, &ns)
})

It("recovers when node is restarted abruptly", func() {
runRestartNodeAbruptlyTest(teaURL, coffeeURL, files, &ns)
})
})

func runRestartNodeWithDrainingTest(teaURL, coffeeURL string, files []string, ns *core.Namespace) {
runRestartNodeTest(teaURL, coffeeURL, files, ns, true)
}

func runRestartNodeAbruptlyTest(teaURL, coffeeURL string, files []string, ns *core.Namespace) {
runRestartNodeTest(teaURL, coffeeURL, files, ns, false)
}

func runRestartNodeTest(teaURL, coffeeURL string, files []string, ns *core.Namespace, drain bool) {
nodeNames, err := getNodeNames()
Expect(err).ToNot(HaveOccurred())
Expect(nodeNames).To(HaveLen(1))
kate-osborn marked this conversation as resolved.
Show resolved Hide resolved

kindNodeName := nodeNames[0]

Expect(clusterName).ToNot(BeNil(), "clusterName variable not set")
Expect(*clusterName).ToNot(BeEmpty())
containerName := *clusterName + "-control-plane"

if portFwdPort != 0 {
close(portForwardStopCh)
}

if drain {
_, err := exec.Command(
bjee19 marked this conversation as resolved.
Show resolved Hide resolved
"kubectl",
"drain",
kindNodeName,
"--ignore-daemonsets",
"--delete-local-data",
).CombinedOutput()
Expect(err).ToNot(HaveOccurred())

_, err = exec.Command("kubectl", "delete", "node", kindNodeName).CombinedOutput()
Expect(err).ToNot(HaveOccurred())
}

_, err = exec.Command("docker", "restart", containerName).CombinedOutput()
Expect(err).ToNot(HaveOccurred())

// need to wait for docker container to restart and be running before polling for ready NGF Pods or else we will error
Eventually(
func() bool {
output, err := exec.Command(
"docker",
"inspect",
"-f",
"{{.State.Running}}",
containerName,
).CombinedOutput()
return strings.TrimSpace(string(output)) == "true" && err == nil
}).
WithTimeout(timeoutConfig.CreateTimeout).
WithPolling(500 * time.Millisecond).
Should(BeTrue())

// ngf can often oscillate between ready and error, so we wait for a stable readiness in ngf
bjee19 marked this conversation as resolved.
Show resolved Hide resolved
var podNames []string
Eventually(
kate-osborn marked this conversation as resolved.
Show resolved Hide resolved
func() bool {
podNames, err = framework.GetReadyNGFPodNames(k8sClient, ngfNamespace, releaseName, timeoutConfig.GetStatusTimeout)
return len(podNames) == 1 && err == nil
}).
WithTimeout(timeoutConfig.CreateTimeout * 2).
WithPolling(500 * time.Millisecond).
MustPassRepeatedly(20).
Should(BeTrue())

ngfPodName := podNames[0]
bjee19 marked this conversation as resolved.
Show resolved Hide resolved
Expect(ngfPodName).ToNot(BeEmpty())

if portFwdPort != 0 {
ports := []string{fmt.Sprintf("%d:80", ngfHTTPForwardedPort), fmt.Sprintf("%d:443", ngfHTTPSForwardedPort)}
portForwardStopCh = make(chan struct{})
kate-osborn marked this conversation as resolved.
Show resolved Hide resolved
err = framework.PortForward(ctlr.GetConfigOrDie(), ngfNamespace, ngfPodName, ports, portForwardStopCh)
Expect(err).ToNot(HaveOccurred())
}

checkNGFFunctionality(teaURL, coffeeURL, ngfPodName, "", files, ns)
}

func runRecoveryTest(teaURL, coffeeURL, ngfPodName, containerName string, files []string, ns *core.Namespace) {
var (
err error
Expand Down Expand Up @@ -127,36 +217,7 @@ func runRecoveryTest(teaURL, coffeeURL, ngfPodName, containerName string, files
Should(Succeed())
}

Eventually(
func() error {
return checkForWorkingTraffic(teaURL, coffeeURL)
}).
WithTimeout(timeoutConfig.RequestTimeout).
WithPolling(500 * time.Millisecond).
Should(Succeed())

Expect(resourceManager.DeleteFromFiles(files, ns.Name)).To(Succeed())

Eventually(
func() error {
return checkForFailingTraffic(teaURL, coffeeURL)
}).
WithTimeout(timeoutConfig.RequestTimeout).
WithPolling(500 * time.Millisecond).
Should(Succeed())

Expect(resourceManager.ApplyFromFiles(files, ns.Name)).To(Succeed())
Expect(resourceManager.WaitForAppsToBeReadyWithPodCount(ns.Name, 2)).To(Succeed())

Eventually(
func() error {
return checkForWorkingTraffic(teaURL, coffeeURL)
}).
WithTimeout(timeoutConfig.RequestTimeout * 2).
WithPolling(500 * time.Millisecond).
Should(Succeed())

checkContainerLogsForErrors(ngfPodName, containerName == nginxContainerName)
checkNGFFunctionality(teaURL, coffeeURL, ngfPodName, containerName, files, ns)
}

func restartContainer(ngfPodName, containerName string) {
Expand Down Expand Up @@ -254,11 +315,41 @@ func expectRequestToFail(appURL, address string) error {
return nil
}

func checkNGFFunctionality(teaURL, coffeeURL, ngfPodName, containerName string, files []string, ns *core.Namespace) {
Eventually(
func() error {
return checkForWorkingTraffic(teaURL, coffeeURL)
}).
WithTimeout(timeoutConfig.RequestTimeout * 2).
WithPolling(500 * time.Millisecond).
Should(Succeed())

Expect(resourceManager.DeleteFromFiles(files, ns.Name)).To(Succeed())

Eventually(
func() error {
return checkForFailingTraffic(teaURL, coffeeURL)
}).
WithTimeout(timeoutConfig.RequestTimeout).
WithPolling(500 * time.Millisecond).
Should(Succeed())

Expect(resourceManager.ApplyFromFiles(files, ns.Name)).To(Succeed())
Expect(resourceManager.WaitForAppsToBeReadyWithPodCount(ns.Name, 2)).To(Succeed())

Eventually(
func() error {
return checkForWorkingTraffic(teaURL, coffeeURL)
}).
WithTimeout(timeoutConfig.RequestTimeout * 2).
WithPolling(500 * time.Millisecond).
Should(Succeed())

checkContainerLogsForErrors(ngfPodName, containerName == nginxContainerName)
}

// checkContainerLogsForErrors checks both nginx and ngf container's logs for any possible errors.
// Since this function retrieves all the logs from both containers and the NGF pod may be shared between tests,
// the logs retrieved may contain log messages from previous tests, thus any errors in the logs from previous tests
// may cause an interference with this test and cause this test to fail.
// Additionally, when the NGINX process is killed, some errors are expected in the NGF logs while we wait for the
// When the NGINX process is killed, some errors are expected in the NGF logs while we wait for the
// NGINX container to be restarted.
func checkContainerLogsForErrors(ngfPodName string, checkNginxLogsOnly bool) {
nginxLogs, err := resourceManager.GetPodLogs(
Expand Down Expand Up @@ -349,6 +440,24 @@ func getContainerRestartCount(ngfPodName, containerName string) (int, error) {
return restartCount, nil
}

func getNodeNames() ([]string, error) {
ctx, cancel := context.WithTimeout(context.Background(), timeoutConfig.GetTimeout)
defer cancel()
var nodes core.NodeList

if err := k8sClient.List(ctx, &nodes); err != nil {
return nil, fmt.Errorf("error listing nodes: %w", err)
}

names := make([]string, 0, len(nodes.Items))

for _, node := range nodes.Items {
names = append(names, node.Name)
}

return names, nil
}

func runNodeDebuggerJob(ngfPodName, jobScript string) (*v1.Job, error) {
ctx, cancel := context.WithTimeout(context.Background(), timeoutConfig.GetTimeout)
defer cancel()
Expand Down
5 changes: 5 additions & 0 deletions tests/suite/system_suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ var (
serviceType = flag.String("service-type", "NodePort", "Type of service fronting NGF to be deployed")
isGKEInternalLB = flag.Bool("is-gke-internal-lb", false, "Is the LB service GKE internal only")
plusEnabled = flag.Bool("plus-enabled", false, "Is NGINX Plus enabled")
clusterName = flag.String("cluster-name", "kind", "Cluster name")
)

var (
Expand Down Expand Up @@ -138,6 +139,10 @@ func setup(cfg setupConfig, extraInstallArgs ...string) {
Skip("GW_SERVICE_TYPE must be 'LoadBalancer' for NFR tests")
}

if clusterInfo.IsGKE && strings.Contains(GinkgoLabelFilter(), "graceful-recovery") {
Skip("Graceful Recovery test must be run on Kind")
}

if *versionUnderTest != "" {
version = *versionUnderTest
} else if *imageTag != "" {
Expand Down
Loading