From 5a64c5b924d833f296eca0e42ce626b512982b62 Mon Sep 17 00:00:00 2001 From: Nadia Pinaeva Date: Mon, 27 Feb 2023 12:50:17 +0100 Subject: [PATCH 1/3] Batch potentially big transaction on egress firewall ACLs migration. The default transaction timeout is 10 seconds, it can be reached when we delete all egress firewall acls during migration to port groups from switches. Signed-off-by: Nadia Pinaeva (cherry picked from commit 1896e165b765f93e388770645d7bcfdfbcf6d337) (cherry picked from commit 7fb527e1eb9b511981cb51be6b79901e4aedbd2f) (cherry picked from commit 88ecd8b1b477c09a10bc7e151a181580fc8ca79f) Conflicts: go-controller/pkg/ovn/egressfirewall.go - egressFirewallACLPriorityKey is not used in 4.11, because logging for egress firewall is not implemented --- go-controller/pkg/ovn/egressfirewall.go | 10 ++- go-controller/pkg/util/batching/batch.go | 23 ++++++ go-controller/pkg/util/batching/batch_test.go | 81 +++++++++++++++++++ 3 files changed, 111 insertions(+), 3 deletions(-) create mode 100644 go-controller/pkg/util/batching/batch.go create mode 100644 go-controller/pkg/util/batching/batch_test.go diff --git a/go-controller/pkg/ovn/egressfirewall.go b/go-controller/pkg/ovn/egressfirewall.go index 141c0393cc..8776624277 100644 --- a/go-controller/pkg/ovn/egressfirewall.go +++ b/go-controller/pkg/ovn/egressfirewall.go @@ -14,6 +14,7 @@ import ( "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/nbdb" addressset "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/ovn/address_set" "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/types" + "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/util/batching" kapi "k8s.io/api/core/v1" "k8s.io/client-go/util/retry" @@ -26,6 +27,7 @@ const ( egressFirewallAddError = "EgressFirewall Rules not correctly added" // egressFirewallACLExtIdKey external ID key for egress firewall ACLs egressFirewallACLExtIdKey = "egressFirewall" + aclDeleteBatchSize = 1000 ) type egressFirewall struct { @@ -134,10 +136,12 @@ func (oc *Controller) syncEgressFirewall(egressFirewalls []interface{}) error { // delete acls from all switches, they reside on the port group now if len(egressFirewallACLs) != 0 { - err = libovsdbops.RemoveACLsFromLogicalSwitchesWithPredicate(oc.nbClient, func(item *nbdb.LogicalSwitch) bool { return true }, - egressFirewallACLs...) + err = batching.Batch[*nbdb.ACL](aclDeleteBatchSize, egressFirewallACLs, func(batchACLs []*nbdb.ACL) error { + return libovsdbops.RemoveACLsFromLogicalSwitchesWithPredicate(oc.nbClient, func(item *nbdb.LogicalSwitch) bool { return true }, + batchACLs...) + }) if err != nil { - return fmt.Errorf("failed to remove reject acl from node logical switches: %v", err) + return fmt.Errorf("failed to remove egress firewall acls from node logical switches: %v", err) } } diff --git a/go-controller/pkg/util/batching/batch.go b/go-controller/pkg/util/batching/batch.go new file mode 100644 index 0000000000..88932e9afc --- /dev/null +++ b/go-controller/pkg/util/batching/batch.go @@ -0,0 +1,23 @@ +package batching + +import "fmt" + +func Batch[T any](batchSize int, data []T, eachFn func([]T) error) error { + if batchSize < 1 { + return fmt.Errorf("batchSize should be > 0, got %d", batchSize) + } + start := 0 + dataLen := len(data) + for start < dataLen { + end := start + batchSize + if end > dataLen { + end = dataLen + } + err := eachFn(data[start:end]) + if err != nil { + return err + } + start = end + } + return nil +} diff --git a/go-controller/pkg/util/batching/batch_test.go b/go-controller/pkg/util/batching/batch_test.go new file mode 100644 index 0000000000..5de64f1f86 --- /dev/null +++ b/go-controller/pkg/util/batching/batch_test.go @@ -0,0 +1,81 @@ +package batching + +import ( + "fmt" + "github.com/onsi/ginkgo" + "github.com/onsi/gomega" + + "strings" + "testing" +) + +type batchTestData struct { + name string + batchSize int + data []int + result []int + expectErr string +} + +func TestBatch(t *testing.T) { + tt := []batchTestData{ + { + name: "batch size should be > 0", + batchSize: 0, + data: []int{1, 2, 3}, + expectErr: "batchSize should be > 0", + }, + { + name: "batchSize = 1", + batchSize: 1, + data: []int{1, 2, 3}, + }, + { + name: "batchSize > 1", + batchSize: 2, + data: []int{1, 2, 3}, + }, + { + name: "number of batches = 0", + batchSize: 2, + data: nil, + }, + { + name: "number of batches = 1", + batchSize: 2, + data: []int{1, 2}, + }, + { + name: "number of batches > 1", + batchSize: 2, + data: []int{1, 2, 3, 4}, + }, + { + name: "number of batches not int", + batchSize: 2, + data: []int{1, 2, 3, 4, 5}, + }, + } + + for _, tCase := range tt { + g := gomega.NewGomegaWithT(t) + ginkgo.By(tCase.name) + var result []int + batchNum := 0 + err := Batch[int](tCase.batchSize, tCase.data, func(l []int) error { + batchNum += 1 + result = append(result, l...) + return nil + }) + if err != nil { + if tCase.expectErr != "" && strings.Contains(err.Error(), tCase.expectErr) { + continue + } + t.Fatal(fmt.Sprintf("test %s failed: %v", tCase.name, err)) + } + // tCase.data/tCase.batchSize round up + expectedBatchNum := (len(tCase.data) + tCase.batchSize - 1) / tCase.batchSize + g.Expect(batchNum).To(gomega.Equal(expectedBatchNum)) + g.Expect(result).To(gomega.Equal(tCase.data)) + } +} From 6173c7b8cd3f49af9bdd27724638447a9d7a4482 Mon Sep 17 00:00:00 2001 From: Nadia Pinaeva Date: Thu, 9 Mar 2023 13:23:33 +0100 Subject: [PATCH 2/3] fix multiple error aggregation: `errors.Wrapf` always returns nil if the first argument is nil. Signed-off-by: Nadia Pinaeva (cherry picked from commit 11283d66284f58e1e0da145ee9c1ccc17b6d2347) (cherry picked from commit 74f95e997d116f572b64c7cf7775d483aeed86d1) (cherry picked from commit 8ce4aa4f4594271b53e7833dd9713605845d2b3a) --- go-controller/pkg/ovn/egressfirewall.go | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/go-controller/pkg/ovn/egressfirewall.go b/go-controller/pkg/ovn/egressfirewall.go index 8776624277..30d825069a 100644 --- a/go-controller/pkg/ovn/egressfirewall.go +++ b/go-controller/pkg/ovn/egressfirewall.go @@ -6,8 +6,6 @@ import ( "strings" "sync" - "github.com/pkg/errors" - "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/config" egressfirewallapi "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/crd/egressfirewall/v1" "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/libovsdbops" @@ -17,6 +15,7 @@ import ( "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/util/batching" kapi "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/util/errors" "k8s.io/client-go/util/retry" "k8s.io/klog/v2" utilnet "k8s.io/utils/net" @@ -189,7 +188,7 @@ func (oc *Controller) addEgressFirewall(egressFirewall *egressfirewallapi.Egress egressFirewall.Name, egressFirewall.Namespace) } - var addErrors error + var errorList []error for i, egressFirewallRule := range egressFirewall.Spec.Egress { // process Rules into egressFirewallRules for egressFirewall struct if i > types.EgressFirewallStartPriority-types.MinimumReservedEgressFirewallPriority { @@ -199,15 +198,15 @@ func (oc *Controller) addEgressFirewall(egressFirewall *egressfirewallapi.Egress } efr, err := newEgressFirewallRule(egressFirewallRule, i) if err != nil { - addErrors = errors.Wrapf(addErrors, "error: cannot create EgressFirewall Rule to destination %s for namespace %s - %v", - egressFirewallRule.To.CIDRSelector, egressFirewall.Namespace, err) + errorList = append(errorList, fmt.Errorf("cannot create EgressFirewall Rule to destination %s for namespace %s: %w", + egressFirewallRule.To.CIDRSelector, egressFirewall.Namespace, err)) continue } ef.egressRules = append(ef.egressRules, efr) } - if addErrors != nil { - return addErrors + if len(errorList) > 0 { + return errors.NewAggregate(errorList) } // EgressFirewall needs to make sure that the address_set for the namespace exists independently of the namespace object From dc46aa976f96601b13bb8460bc463d34598ae527 Mon Sep 17 00:00:00 2001 From: Nadia Pinaeva Date: Wed, 8 Mar 2023 16:07:36 +0100 Subject: [PATCH 3/3] Optimize egress firewall cleanup to only select switches that have stale acls. Signed-off-by: Nadia Pinaeva (cherry picked from commit 81acdc2bbd3a608900a120edadb621de743ca1e3) (cherry picked from commit 34eb56293928264919eb9e817eddaf62321a9370) Conflict; egressfirewall.go - update apimachinery.sets to the previous version (cherry picked from commit 0bc8f1412f1a462900961e4e54bc747b7a83d52a) --- go-controller/pkg/ovn/egressfirewall.go | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/go-controller/pkg/ovn/egressfirewall.go b/go-controller/pkg/ovn/egressfirewall.go index 30d825069a..6ddcae98af 100644 --- a/go-controller/pkg/ovn/egressfirewall.go +++ b/go-controller/pkg/ovn/egressfirewall.go @@ -16,6 +16,7 @@ import ( kapi "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/util/errors" + "k8s.io/apimachinery/pkg/util/sets" "k8s.io/client-go/util/retry" "k8s.io/klog/v2" utilnet "k8s.io/utils/net" @@ -136,8 +137,15 @@ func (oc *Controller) syncEgressFirewall(egressFirewalls []interface{}) error { // delete acls from all switches, they reside on the port group now if len(egressFirewallACLs) != 0 { err = batching.Batch[*nbdb.ACL](aclDeleteBatchSize, egressFirewallACLs, func(batchACLs []*nbdb.ACL) error { - return libovsdbops.RemoveACLsFromLogicalSwitchesWithPredicate(oc.nbClient, func(item *nbdb.LogicalSwitch) bool { return true }, - batchACLs...) + // optimize the predicate to exclude switches that don't reference deleting acls. + aclsToDelete := sets.String{} + for _, acl := range batchACLs { + aclsToDelete.Insert(acl.UUID) + } + swWithACLsPred := func(sw *nbdb.LogicalSwitch) bool { + return aclsToDelete.HasAny(sw.ACLs...) + } + return libovsdbops.RemoveACLsFromLogicalSwitchesWithPredicate(oc.nbClient, swWithACLsPred, batchACLs...) }) if err != nil { return fmt.Errorf("failed to remove egress firewall acls from node logical switches: %v", err)