From 88ecd8b1b477c09a10bc7e151a181580fc8ca79f Mon Sep 17 00:00:00 2001 From: Nadia Pinaeva Date: Mon, 27 Feb 2023 12:50:17 +0100 Subject: [PATCH 1/2] Batch potentially big transaction on egress firewall ACLs migration. The default transaction timeout is 10 seconds, it can be reached when we delete all egress firewall acls during migration to port groups from switches. Signed-off-by: Nadia Pinaeva (cherry picked from commit 1896e165b765f93e388770645d7bcfdfbcf6d337) (cherry picked from commit 7fb527e1eb9b511981cb51be6b79901e4aedbd2f) --- go-controller/pkg/ovn/egressfirewall.go | 10 ++- go-controller/pkg/util/batching/batch.go | 23 ++++++ go-controller/pkg/util/batching/batch_test.go | 81 +++++++++++++++++++ 3 files changed, 111 insertions(+), 3 deletions(-) create mode 100644 go-controller/pkg/util/batching/batch.go create mode 100644 go-controller/pkg/util/batching/batch_test.go diff --git a/go-controller/pkg/ovn/egressfirewall.go b/go-controller/pkg/ovn/egressfirewall.go index cc8e1a5a43..c0c9f4410a 100644 --- a/go-controller/pkg/ovn/egressfirewall.go +++ b/go-controller/pkg/ovn/egressfirewall.go @@ -12,6 +12,7 @@ import ( "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/nbdb" addressset "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/ovn/address_set" "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/types" + "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/util/batching" kapi "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/util/errors" @@ -26,6 +27,7 @@ const ( // egressFirewallACLExtIdKey external ID key for egress firewall ACLs egressFirewallACLExtIdKey = "egressFirewall" egressFirewallACLPriorityKey = "priority" + aclDeleteBatchSize = 1000 ) type egressFirewall struct { @@ -158,10 +160,12 @@ func (oc *Controller) syncEgressFirewall(egressFirewalls []interface{}) error { // delete acls from all switches, they reside on the port group now if len(egressFirewallACLs) != 0 { - err = libovsdbops.RemoveACLsFromLogicalSwitchesWithPredicate(oc.nbClient, func(item *nbdb.LogicalSwitch) bool { return true }, - egressFirewallACLs...) + err = batching.Batch[*nbdb.ACL](aclDeleteBatchSize, egressFirewallACLs, func(batchACLs []*nbdb.ACL) error { + return libovsdbops.RemoveACLsFromLogicalSwitchesWithPredicate(oc.nbClient, func(item *nbdb.LogicalSwitch) bool { return true }, + batchACLs...) + }) if err != nil { - return fmt.Errorf("failed to remove reject acl from node logical switches: %v", err) + return fmt.Errorf("failed to remove egress firewall acls from node logical switches: %v", err) } } diff --git a/go-controller/pkg/util/batching/batch.go b/go-controller/pkg/util/batching/batch.go new file mode 100644 index 0000000000..88932e9afc --- /dev/null +++ b/go-controller/pkg/util/batching/batch.go @@ -0,0 +1,23 @@ +package batching + +import "fmt" + +func Batch[T any](batchSize int, data []T, eachFn func([]T) error) error { + if batchSize < 1 { + return fmt.Errorf("batchSize should be > 0, got %d", batchSize) + } + start := 0 + dataLen := len(data) + for start < dataLen { + end := start + batchSize + if end > dataLen { + end = dataLen + } + err := eachFn(data[start:end]) + if err != nil { + return err + } + start = end + } + return nil +} diff --git a/go-controller/pkg/util/batching/batch_test.go b/go-controller/pkg/util/batching/batch_test.go new file mode 100644 index 0000000000..5de64f1f86 --- /dev/null +++ b/go-controller/pkg/util/batching/batch_test.go @@ -0,0 +1,81 @@ +package batching + +import ( + "fmt" + "github.com/onsi/ginkgo" + "github.com/onsi/gomega" + + "strings" + "testing" +) + +type batchTestData struct { + name string + batchSize int + data []int + result []int + expectErr string +} + +func TestBatch(t *testing.T) { + tt := []batchTestData{ + { + name: "batch size should be > 0", + batchSize: 0, + data: []int{1, 2, 3}, + expectErr: "batchSize should be > 0", + }, + { + name: "batchSize = 1", + batchSize: 1, + data: []int{1, 2, 3}, + }, + { + name: "batchSize > 1", + batchSize: 2, + data: []int{1, 2, 3}, + }, + { + name: "number of batches = 0", + batchSize: 2, + data: nil, + }, + { + name: "number of batches = 1", + batchSize: 2, + data: []int{1, 2}, + }, + { + name: "number of batches > 1", + batchSize: 2, + data: []int{1, 2, 3, 4}, + }, + { + name: "number of batches not int", + batchSize: 2, + data: []int{1, 2, 3, 4, 5}, + }, + } + + for _, tCase := range tt { + g := gomega.NewGomegaWithT(t) + ginkgo.By(tCase.name) + var result []int + batchNum := 0 + err := Batch[int](tCase.batchSize, tCase.data, func(l []int) error { + batchNum += 1 + result = append(result, l...) + return nil + }) + if err != nil { + if tCase.expectErr != "" && strings.Contains(err.Error(), tCase.expectErr) { + continue + } + t.Fatal(fmt.Sprintf("test %s failed: %v", tCase.name, err)) + } + // tCase.data/tCase.batchSize round up + expectedBatchNum := (len(tCase.data) + tCase.batchSize - 1) / tCase.batchSize + g.Expect(batchNum).To(gomega.Equal(expectedBatchNum)) + g.Expect(result).To(gomega.Equal(tCase.data)) + } +} From 0bc8f1412f1a462900961e4e54bc747b7a83d52a Mon Sep 17 00:00:00 2001 From: Nadia Pinaeva Date: Wed, 8 Mar 2023 16:07:36 +0100 Subject: [PATCH 2/2] Optimize egress firewall cleanup to only select switches that have stale acls. Signed-off-by: Nadia Pinaeva (cherry picked from commit 81acdc2bbd3a608900a120edadb621de743ca1e3) (cherry picked from commit 34eb56293928264919eb9e817eddaf62321a9370) Conflict; egressfirewall.go - update apimachinery.sets to the previous version --- go-controller/pkg/ovn/egressfirewall.go | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/go-controller/pkg/ovn/egressfirewall.go b/go-controller/pkg/ovn/egressfirewall.go index c0c9f4410a..f8cfdc8ae0 100644 --- a/go-controller/pkg/ovn/egressfirewall.go +++ b/go-controller/pkg/ovn/egressfirewall.go @@ -16,6 +16,7 @@ import ( kapi "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/util/errors" + "k8s.io/apimachinery/pkg/util/sets" "k8s.io/client-go/util/retry" "k8s.io/klog/v2" utilnet "k8s.io/utils/net" @@ -161,8 +162,15 @@ func (oc *Controller) syncEgressFirewall(egressFirewalls []interface{}) error { // delete acls from all switches, they reside on the port group now if len(egressFirewallACLs) != 0 { err = batching.Batch[*nbdb.ACL](aclDeleteBatchSize, egressFirewallACLs, func(batchACLs []*nbdb.ACL) error { - return libovsdbops.RemoveACLsFromLogicalSwitchesWithPredicate(oc.nbClient, func(item *nbdb.LogicalSwitch) bool { return true }, - batchACLs...) + // optimize the predicate to exclude switches that don't reference deleting acls. + aclsToDelete := sets.String{} + for _, acl := range batchACLs { + aclsToDelete.Insert(acl.UUID) + } + swWithACLsPred := func(sw *nbdb.LogicalSwitch) bool { + return aclsToDelete.HasAny(sw.ACLs...) + } + return libovsdbops.RemoveACLsFromLogicalSwitchesWithPredicate(oc.nbClient, swWithACLsPred, batchACLs...) }) if err != nil { return fmt.Errorf("failed to remove egress firewall acls from node logical switches: %v", err)