Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[NPM Lite] Querying L1VH + Non-L1VH Endpoints #3086

Merged
merged 32 commits into from
Nov 6, 2024
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
ae849be
Added logic to make 2 hns calls for 2 different endpoint states
rejain456 Oct 22, 2024
6e56bba
added querying to l1vh hns only if npm lite is enabled
rejain456 Oct 24, 2024
98a5d1e
added logging line for debugging
rejain456 Oct 24, 2024
ddc83b0
updated config
rejain456 Oct 24, 2024
382c150
removed logging lines
rejain456 Oct 24, 2024
1ef66e3
fixing go lint err
rejain456 Oct 25, 2024
70fb5c4
refactored based on pr comments
rejain456 Oct 25, 2024
605a498
replaced with errors.Wrap and fixed a logging statement
rejain456 Oct 25, 2024
6dde355
added if condition with logic
rejain456 Oct 25, 2024
4abb1c9
changed errl1vh to err
rejain456 Oct 25, 2024
f19fd62
added omments
rejain456 Oct 25, 2024
a79e44f
added logging lines for debugging
rejain456 Oct 25, 2024
79b54fb
added npm lite enabled log debugging
rejain456 Oct 25, 2024
65714fb
spacing
rejain456 Oct 25, 2024
7b2e422
syntax
rejain456 Oct 25, 2024
79d19b8
added logs for debugging
rejain456 Oct 25, 2024
27b76cd
optimizing api load
rejain456 Oct 25, 2024
7728b31
added function to remove common endpoints
rejain456 Oct 29, 2024
b6e07fb
added logging for debugging
rejain456 Oct 29, 2024
b1b941e
removed npm lite check
rejain456 Oct 30, 2024
ae341d5
removed all the debugging comments
rejain456 Oct 31, 2024
c5b18be
added extra unit test cases
rejain456 Oct 31, 2024
3c237ba
added additional unit tests
rejain456 Oct 31, 2024
3089651
removed protobuf code
rejain456 Oct 31, 2024
15a1182
fixed comment
rejain456 Nov 4, 2024
fd985c7
fixed a spelling error
rejain456 Nov 4, 2024
60dda1c
resolved pr comments
rejain456 Nov 4, 2024
36f0d74
updated a comment
rejain456 Nov 4, 2024
ab7038d
revised comment
rejain456 Nov 4, 2024
2c66d63
resolved further pr comments
rejain456 Nov 4, 2024
10d116f
changed back to for loop from range
rejain456 Nov 4, 2024
cd17bd3
Merge branch 'master' into jainriya/hnsEndpointFixL1VH
rejain456 Nov 5, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions npm/cmd/start.go
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,8 @@ func start(config npmconfig.Config, flags npmconfig.Flags) error {
stopChannel := wait.NeverStop
if config.Toggles.EnableV2NPM {
// update the dataplane config
npmV2DataplaneCfg.EnableNPMLite = config.Toggles.EnableNPMLite

npmV2DataplaneCfg.MaxBatchedACLsPerPod = config.MaxBatchedACLsPerPod

npmV2DataplaneCfg.NetPolInBackground = config.Toggles.NetPolInBackground
Expand Down
26 changes: 14 additions & 12 deletions npm/pkg/dataplane/dataplane.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ type Config struct {
NetPolInBackground bool
MaxPendingNetPols int
NetPolInterval time.Duration
EnableNPMLite bool
*ipsets.IPSetManagerCfg
*policies.PolicyManagerCfg
}
Expand All @@ -64,12 +65,13 @@ type DataPlane struct {
nodeName string
// endpointCache stores all endpoints of the network (including off-node)
// Key is PodIP
endpointCache *endpointCache
ioShim *common.IOShim
updatePodCache *updatePodCache
endpointQuery *endpointQuery
applyInfo *applyInfo
netPolQueue *netPolQueue
endpointCache *endpointCache
ioShim *common.IOShim
updatePodCache *updatePodCache
endpointQuery *endpointQuery
endpointQueryL1VH *endpointQuery // windows -> filter for state 2 (attached) endpoints in l1vh
applyInfo *applyInfo
netPolQueue *netPolQueue
// removePolicyInfo tracks when a policy was removed yet had ApplyIPSet failures.
// This field is only relevant for Linux.
removePolicyInfo removePolicyInfo
Expand All @@ -88,11 +90,12 @@ func NewDataPlane(nodeName string, ioShim *common.IOShim, cfg *Config, stopChann
policyMgr: policies.NewPolicyManager(ioShim, cfg.PolicyManagerCfg),
ipsetMgr: ipsets.NewIPSetManager(cfg.IPSetManagerCfg, ioShim),
// networkID is set when initializing Windows dataplane
networkID: "",
endpointCache: newEndpointCache(),
nodeName: nodeName,
ioShim: ioShim,
endpointQuery: new(endpointQuery),
networkID: "",
endpointCache: newEndpointCache(),
nodeName: nodeName,
ioShim: ioShim,
endpointQuery: new(endpointQuery),
endpointQueryL1VH: new(endpointQuery),
applyInfo: &applyInfo{
inBootupPhase: true,
},
Expand Down Expand Up @@ -128,7 +131,6 @@ func NewDataPlane(nodeName string, ioShim *common.IOShim, cfg *Config, stopChann
} else {
metrics.SendLog(util.DaemonDataplaneID, "[DataPlane] dataplane configured to NOT add netpols in background", true)
}

return dp, nil
}

Expand Down
34 changes: 31 additions & 3 deletions npm/pkg/dataplane/dataplane_windows.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@ package dataplane

import (
"encoding/json"
"errors"
"fmt"
"strings"
"time"
Expand All @@ -12,6 +11,7 @@ import (
"github.com/Azure/azure-container-networking/npm/util"
npmerrors "github.com/Azure/azure-container-networking/npm/util/errors"
"github.com/Microsoft/hcsshim/hcn"
"github.com/pkg/errors"
"k8s.io/klog"
)

Expand Down Expand Up @@ -50,14 +50,32 @@ func (dp *DataPlane) initializeDataPlane() error {
},
Flags: hcn.HostComputeQueryFlagsNone,
}
// Initialize Endpoint query used to filter healthy endpoints (vNIC) of Windows pods on L1VH Node
dp.endpointQueryL1VH.query = hcn.HostComputeQuery{
SchemaVersion: hcn.SchemaVersion{
Major: hcnSchemaMajorVersion,
Minor: hcnSchemaMinorVersion,
},
Flags: hcn.HostComputeQueryFlagsNone,
}

// Filter out any endpoints that are not in the "AttachedSharing" state. All running Windows pods with networking must be in this state.
rejain456 marked this conversation as resolved.
Show resolved Hide resolved
filterMap := map[string]uint16{"State": hcnEndpointStateAttachedSharing}
filter, err := json.Marshal(filterMap)
if err != nil {
return npmerrors.SimpleErrorWrapper("failed to marshal endpoint filter map", err)
return errors.Wrap(err, "failed to marshal endpoint filter map")
}
dp.endpointQuery.query.Filter = string(filter)

if dp.EnableNPMLite {
filterMapL1VH := map[string]uint16{"State": hcnEndpointStateAttached}
filterL1VH, errL1VH := json.Marshal(filterMapL1VH)
if errL1VH != nil {
return errors.Wrap(errL1VH, "failed to marshal endpoint filter map")
}
dp.endpointQueryL1VH.query.Filter = string(filterL1VH)
}

// reset endpoint cache so that netpol references are removed for all endpoints while refreshing pod endpoints
// no need to lock endpointCache at boot up
dp.endpointCache.cache = make(map[string]*npmEndpoint)
Expand Down Expand Up @@ -334,9 +352,19 @@ func (dp *DataPlane) getLocalPodEndpoints() ([]*hcn.HostComputeEndpoint, error)
metrics.RecordListEndpointsLatency(timer)
if err != nil {
metrics.IncListEndpointsFailures()
return nil, npmerrors.SimpleErrorWrapper("failed to get local pod endpoints", err)
return nil, errors.Wrap(err, "failed to get local pod endpoints")
}

if dp.EnableNPMLite {
timer = metrics.StartNewTimer()
endpointsAttached, errL1vh := dp.ioShim.Hns.ListEndpointsQuery(dp.endpointQueryL1VH.query)
metrics.RecordListEndpointsLatency(timer)
if errL1vh != nil {
metrics.IncListEndpointsFailures()
return nil, errors.Wrap(errL1vh, "failed to get local pod endpoints in L1VH")
}
endpoints = append(endpoints, endpointsAttached...)
rejain456 marked this conversation as resolved.
Show resolved Hide resolved
}
epPointers := make([]*hcn.HostComputeEndpoint, 0, len(endpoints))
for k := range endpoints {
epPointers = append(epPointers, &endpoints[k])
Expand Down
Loading