From 8ed3eef67ebac65e85fb23201dd085f99229a70b Mon Sep 17 00:00:00 2001 From: eshitachandwani Date: Thu, 4 Dec 2025 16:02:53 +0530 Subject: [PATCH 01/25] cds/eds --- internal/xds/resolver/watch_service_test.go | 5 +- internal/xds/resolver/xds_resolver_test.go | 7 +- internal/xds/xdsdepmgr/watch_service.go | 152 ++- .../xds/xdsdepmgr/xds_dependency_manager.go | 398 +++++- .../xdsdepmgr/xds_dependency_manager_test.go | 1159 ++++++++++++----- 5 files changed, 1403 insertions(+), 318 deletions(-) diff --git a/internal/xds/resolver/watch_service_test.go b/internal/xds/resolver/watch_service_test.go index 1c68eba32013..5ab5d6f4132f 100644 --- a/internal/xds/resolver/watch_service_test.go +++ b/internal/xds/resolver/watch_service_test.go @@ -71,7 +71,8 @@ func (s) TestServiceWatch_ListenerPointsToNewRouteConfiguration(t *testing.T) { // Leave the old route configuration resource unchanged. newTestRouteConfigName := defaultTestRouteConfigName + "-new" resources.Listeners = []*v3listenerpb.Listener{e2e.DefaultClientListener(defaultTestServiceName, newTestRouteConfigName)} - configureResourcesOnManagementServer(ctx, t, mgmtServer, nodeID, resources.Listeners, resources.Routes) + resources.SkipValidation = true + mgmtServer.Update(ctx, resources) // Verify that the new route configuration resource is requested. waitForResourceNames(ctx, t, routeCfgCh, []string{newTestRouteConfigName}) @@ -88,7 +89,7 @@ func (s) TestServiceWatch_ListenerPointsToNewRouteConfiguration(t *testing.T) { }, }, }) - configureResourcesOnManagementServer(ctx, t, mgmtServer, nodeID, resources.Listeners, resources.Routes) + mgmtServer.Update(ctx, resources) // Wait for no update from the resolver. 
verifyNoUpdateFromResolver(ctx, t, stateCh) diff --git a/internal/xds/resolver/xds_resolver_test.go b/internal/xds/resolver/xds_resolver_test.go index 70db68d3b1a8..6d5abb399b40 100644 --- a/internal/xds/resolver/xds_resolver_test.go +++ b/internal/xds/resolver/xds_resolver_test.go @@ -433,11 +433,12 @@ func (s) TestResolverBadServiceUpdate_NACKedWithCache(t *testing.T) { stateCh, _, _ := buildResolverForTarget(t, resolver.Target{URL: *testutils.MustParseURL("xds:///" + defaultTestServiceName)}, bc) - // Configure good listener and route configuration resources on the - // management server. + // Configure all resources on the management server. listeners := []*v3listenerpb.Listener{e2e.DefaultClientListener(defaultTestServiceName, defaultTestRouteConfigName)} routes := []*v3routepb.RouteConfiguration{e2e.DefaultRouteConfig(defaultTestRouteConfigName, defaultTestServiceName, defaultTestClusterName)} - configureResourcesOnManagementServer(ctx, t, mgmtServer, nodeID, listeners, routes) + cluster := []*v3clusterpb.Cluster{e2e.DefaultCluster(defaultTestClusterName, defaultTestEndpointName, e2e.SecurityLevelNone)} + endpoints := []*v3endpointpb.ClusterLoadAssignment{e2e.DefaultEndpoint(defaultTestEndpointName, defaultTestHostname, defaultTestPort)} + configureAllResourcesOnManagementServer(ctx, t, mgmtServer, nodeID, listeners, routes, cluster, endpoints) // Expect a good update from the resolver. 
cs := verifyUpdateFromResolver(ctx, t, stateCh, wantServiceConfig(defaultTestClusterName)) diff --git a/internal/xds/xdsdepmgr/watch_service.go b/internal/xds/xdsdepmgr/watch_service.go index 73f1b703519f..1cd9e26011ec 100644 --- a/internal/xds/xdsdepmgr/watch_service.go +++ b/internal/xds/xdsdepmgr/watch_service.go @@ -18,7 +18,16 @@ package xdsdepmgr -import "google.golang.org/grpc/internal/xds/xdsclient/xdsresource" +import ( + "context" + "fmt" + "net/url" + + "google.golang.org/grpc/internal/grpcsync" + "google.golang.org/grpc/internal/xds/xdsclient/xdsresource" + "google.golang.org/grpc/resolver" + "google.golang.org/grpc/serviceconfig" +) type listenerWatcher struct { resourceName string @@ -81,3 +90,144 @@ func (r *routeConfigWatcher) stop() { r.depMgr.logger.Infof("Canceling watch on RouteConfiguration resource %q", r.resourceName) } } + +type clusterWatcher struct { + name string + depMgr *DependencyManager +} + +func (e *clusterWatcher) ResourceChanged(u *xdsresource.ClusterUpdate, onDone func()) { + e.depMgr.onClusterResourceUpdate(e.name, u, onDone) +} + +func (e *clusterWatcher) ResourceError(err error, onDone func()) { + e.depMgr.onClusterResourceError(e.name, err, onDone) +} + +func (e *clusterWatcher) AmbientError(err error, onDone func()) { + e.depMgr.onClusterAmbientError(e.name, err, onDone) +} + +// clusterWatcherState groups the state associated with a clusterWatcher. +type clusterWatcherState struct { + watcher *clusterWatcher // The underlying watcher. + cancelWatch func() // Cancel func to cancel the watch. + // Most recent update received for this cluster. 
+ lastUpdate *xdsresource.ClusterUpdate + err error +} + +func newClusterWatcher(resourceName string, depMgr *DependencyManager) *clusterWatcherState { + w := &clusterWatcher{name: resourceName, depMgr: depMgr} + return &clusterWatcherState{ + watcher: w, + cancelWatch: xdsresource.WatchCluster(depMgr.xdsClient, resourceName, w), + } +} + +type endpointWatcher struct { + name string + depMgr *DependencyManager +} + +func (e *endpointWatcher) ResourceChanged(u *xdsresource.EndpointsUpdate, onDone func()) { + e.depMgr.onEndpointUpdate(e.name, u, onDone) +} + +func (e *endpointWatcher) ResourceError(err error, onDone func()) { + e.depMgr.onEndpointResourceError(e.name, err, onDone) +} + +func (e *endpointWatcher) AmbientError(err error, onDone func()) { + e.depMgr.onEndpointAmbientError(e.name, err, onDone) +} + +// endpointWatcherState groups the state associated with a endpointWatcher. +type endpointWatcherState struct { + watcher *endpointWatcher // The underlying watcher. + cancelWatch func() // Cancel func to cancel the watch. + // Most recent update received for this cluster. + lastUpdate *xdsresource.EndpointsUpdate + err error +} + +func newEndpointWatcher(resourceName string, depMgr *DependencyManager) *endpointWatcherState { + w := &endpointWatcher{name: resourceName, depMgr: depMgr} + return &endpointWatcherState{ + watcher: w, + cancelWatch: xdsresource.WatchEndpoints(depMgr.xdsClient, resourceName, w), + } +} + +// dnsResolverState watches updates for the given DNS hostname. It implements +// resolver.ClientConn interface to work with the DNS resolver. +type dnsResolver struct { + target string + dnsR resolver.Resolver + depMgr *DependencyManager + + //serializer is used to make sure that any methods on the resolver can be + //called from inside th Build function which is a garuntee that + //implementations of resolver.Clientconn need to maintain. 
+ serializer grpcsync.CallbackSerializer + serializerCancel func() +} + +type dnsResolverState struct { + resolver *dnsResolver + updateReceived bool + err error + lastUpdate *xdsresource.DNSUpdate + cancelResolver func() +} + +// dnsResolverState needs to implement resolver.ClientConn interface to receive +// updates from the real DNS resolver. +func (dr *dnsResolver) UpdateState(state resolver.State) error { + dr.serializer.TrySchedule(func(context.Context) { + dr.depMgr.onDNSUpdate(dr.target, &state) + }) + return nil +} + +func (dr *dnsResolver) ReportError(err error) { + dr.serializer.TrySchedule(func(ctx context.Context) { + dr.depMgr.onDNSError(dr.target, err) + }) +} + +func (dr *dnsResolver) NewAddress(addresses []resolver.Address) { + dr.UpdateState(resolver.State{Addresses: addresses}) +} + +func (dr *dnsResolver) ParseServiceConfig(string) *serviceconfig.ParseResult { + return &serviceconfig.ParseResult{Err: fmt.Errorf("service config not supported")} +} + +// NewDNSResolver creates a new DNS resolver for the given target. 
+func newDNSResolver(target string, depMgr *DependencyManager) *dnsResolverState { + ctx, cancel := context.WithCancel(context.Background()) + dr := &dnsResolver{target: target, depMgr: depMgr, serializer: *grpcsync.NewCallbackSerializer(ctx), serializerCancel: cancel} + drState := &dnsResolverState{resolver: dr, cancelResolver: func() {}} + u, err := url.Parse("dns:///" + target) + if err != nil { + drState.updateReceived = true + drState.err = err + drState.resolver.depMgr.logger.Warningf("Error while parsing DNS target %q: %v", target, drState.resolver.depMgr.annotateErrorWithNodeID(err)) + return drState + } + r, err := resolver.Get("dns").Build(resolver.Target{URL: *u}, dr, resolver.BuildOptions{}) + if err != nil { + drState.updateReceived = true + drState.err = err + drState.resolver.depMgr.logger.Warningf("Error while building DNS resolver for target %q: %v", target, drState.resolver.depMgr.annotateErrorWithNodeID(err)) + return drState + } + drState.resolver.dnsR = r + drState.cancelResolver = func() { + drState.resolver.serializerCancel() + <-drState.resolver.serializer.Done() + r.Close() + } + return drState +} diff --git a/internal/xds/xdsdepmgr/xds_dependency_manager.go b/internal/xds/xdsdepmgr/xds_dependency_manager.go index fae5ca962d42..c469e4ab8205 100644 --- a/internal/xds/xdsdepmgr/xds_dependency_manager.go +++ b/internal/xds/xdsdepmgr/xds_dependency_manager.go @@ -26,11 +26,16 @@ import ( internalgrpclog "google.golang.org/grpc/internal/grpclog" "google.golang.org/grpc/internal/xds/xdsclient" "google.golang.org/grpc/internal/xds/xdsclient/xdsresource" + "google.golang.org/grpc/resolver" ) -const prefix = "[xdsdepmgr %p] " +const ( + aggregateClusterMaxDepth = 16 + prefix = "[xdsdepmgr %p] " +) var logger = grpclog.Component("xds") +var EnableCDSEDSlooksups = false func prefixLogger(p *DependencyManager) *internalgrpclog.PrefixLogger { return internalgrpclog.NewPrefixLogger(logger, fmt.Sprintf(prefix, p)) @@ -73,12 +78,16 @@ type 
DependencyManager struct { mu sync.Mutex stopped bool - listenerWatcher *listenerWatcher - currentListenerUpdate *xdsresource.ListenerUpdate - routeConfigWatcher *routeConfigWatcher - rdsResourceName string - currentRouteConfig *xdsresource.RouteConfigUpdate - currentVirtualHost *xdsresource.VirtualHost + listenerWatcher *listenerWatcher + currentListenerUpdate *xdsresource.ListenerUpdate + routeConfigWatcher *routeConfigWatcher + rdsResourceName string + currentRouteConfig *xdsresource.RouteConfigUpdate + currentVirtualHost *xdsresource.VirtualHost + clusterFromRouteConfig map[string]struct{} + endpointWatchers map[string]*endpointWatcherState + dnsResolvers map[string]*dnsResolverState + clusterWatchers map[string]*clusterWatcherState } // New creates a new DependencyManager. @@ -91,12 +100,18 @@ type DependencyManager struct { // - watcher is the ConfigWatcher interface that will receive the aggregated // XDSConfig updates and errors. func New(listenerName, dataplaneAuthority string, xdsClient xdsclient.XDSClient, watcher ConfigWatcher) *DependencyManager { + // ctx, cancel := context.WithCancel(context.Background()) + dm := &DependencyManager{ - ldsResourceName: listenerName, - dataplaneAuthority: dataplaneAuthority, - xdsClient: xdsClient, - watcher: watcher, - nodeID: xdsClient.BootstrapConfig().Node().GetId(), + ldsResourceName: listenerName, + dataplaneAuthority: dataplaneAuthority, + xdsClient: xdsClient, + watcher: watcher, + nodeID: xdsClient.BootstrapConfig().Node().GetId(), + clusterFromRouteConfig: make(map[string]struct{}), + endpointWatchers: make(map[string]*endpointWatcherState), + dnsResolvers: make(map[string]*dnsResolverState), + clusterWatchers: make(map[string]*clusterWatcherState), } dm.logger = prefixLogger(dm) @@ -122,6 +137,20 @@ func (m *DependencyManager) Close() { if m.routeConfigWatcher != nil { m.routeConfigWatcher.stop() } + for name, cluster := range m.clusterWatchers { + go cluster.cancelWatch() + delete(m.clusterWatchers, name) + 
} + + for name, endpoint := range m.endpointWatchers { + go endpoint.cancelWatch() + delete(m.endpointWatchers, name) + } + + for name, dnsResolver := range m.dnsResolvers { + go dnsResolver.cancelResolver() + delete(m.dnsResolvers, name) + } } // annotateErrorWithNodeID annotates the given error with the provided xDS node @@ -134,11 +163,181 @@ func (m *DependencyManager) annotateErrorWithNodeID(err error) error { // the current aggregated xDS configuration to the watcher if all the updates // are available. func (m *DependencyManager) maybeSendUpdateLocked() { - m.watcher.Update(&xdsresource.XDSConfig{ + if m.currentListenerUpdate == nil || m.currentRouteConfig == nil { + return + } + config := (&xdsresource.XDSConfig{ Listener: m.currentListenerUpdate, RouteConfig: m.currentRouteConfig, VirtualHost: m.currentVirtualHost, + Clusters: make(map[string]*xdsresource.ClusterResult), }) + if !EnableCDSEDSlooksups { + m.watcher.Update(config) + return + } + edsResourcesSeen := make(map[string]struct{}) + dnsResourcesSeen := make(map[string]struct{}) + clusterResourceSeen := make(map[string]struct{}) + haveAllResources := true + for cluster := range m.clusterFromRouteConfig { + gotAllResource, _, err := m.populateClusterConfigLocked(cluster, 0, config.Clusters, edsResourcesSeen, dnsResourcesSeen, clusterResourceSeen) + if !gotAllResource { + haveAllResources = false + } + if err != nil { + config.Clusters[cluster] = &xdsresource.ClusterResult{ + Err: err, + } + } + } + + // Cancel resources not seen in the tree. 
+ for name, endpoint := range m.endpointWatchers { + if _, ok := edsResourcesSeen[name]; !ok { + endpoint.cancelWatch() + delete(m.endpointWatchers, name) + } + } + for name, dnsResolver := range m.dnsResolvers { + if _, ok := dnsResourcesSeen[name]; !ok { + dnsResolver.cancelResolver() + delete(m.dnsResolvers, name) + } + } + for clusterName, cluster := range m.clusterWatchers { + if _, ok := clusterResourceSeen[clusterName]; !ok { + cluster.cancelWatch() + delete(m.clusterWatchers, clusterName) + } + } + if haveAllResources { + m.watcher.Update(config) + return + } + if m.logger.V(2) { + m.logger.Infof("Not sending update to watcher as not all resources are available") + } +} + +func (m *DependencyManager) populateClusterConfigLocked(name string, depth int, clusterMap map[string]*xdsresource.ClusterResult, edsSeen, dnsSeen map[string]struct{}, clusterSeen map[string]struct{}) (bool, []string, error) { + clusterSeen[name] = struct{}{} + + if depth > aggregateClusterMaxDepth { + m.logger.Warningf("aggregate cluster graph exceeds max depth (%d)", aggregateClusterMaxDepth) + return true, nil, m.annotateErrorWithNodeID(fmt.Errorf("aggregate cluster graph exceeds max depth (%d)", aggregateClusterMaxDepth)) + } + + // If cluster is already seen in the tree, return. + if _, ok := clusterMap[name]; ok { + return true, nil, nil + } + + // If cluster watcher does not exist, create one. + if _, ok := m.clusterWatchers[name]; !ok { + m.clusterWatchers[name] = newClusterWatcher(name, m) + return false, nil, nil + } + + state := m.clusterWatchers[name] + + // If update is not yet received for this cluster, return. + if state.lastUpdate == nil && state.err == nil { + return false, nil, nil + } + + // If resource error is seen for this cluster, save it in the cluster map + // and return. 
+ if state.lastUpdate == nil && state.err != nil { + clusterMap[name] = &xdsresource.ClusterResult{Err: state.err} + return true, nil, nil + } + + update := state.lastUpdate + clusterMap[name] = &xdsresource.ClusterResult{ + Config: xdsresource.ClusterConfig{ + Cluster: update, + }, + } + + switch update.ClusterType { + case xdsresource.ClusterTypeEDS: + var edsName string + if update.EDSServiceName == "" { + edsName = name + } else { + edsName = update.EDSServiceName + } + edsSeen[edsName] = struct{}{} + // If endpoint watcher does not exist, create one. + if _, ok := m.endpointWatchers[edsName]; !ok { + m.endpointWatchers[edsName] = newEndpointWatcher(edsName, m) + return false, nil, nil + } + endpointState := m.endpointWatchers[edsName] + + // If the resource does not have any update yet, return. + if endpointState.lastUpdate == nil && endpointState.err == nil { + return false, nil, nil + } + + // Store the update and error. + clusterMap[name].Config.EndpointConfig = &xdsresource.EndpointConfig{ + EDSUpdate: endpointState.lastUpdate, + ResolutionNote: endpointState.err, + } + return true, []string{name}, nil + + case xdsresource.ClusterTypeLogicalDNS: + target := update.DNSHostName + dnsSeen[target] = struct{}{} + // If dns resolver does not exist, create one. + if _, ok := m.dnsResolvers[target]; !ok { + m.dnsResolvers[target] = newDNSResolver(target, m) + return false, nil, nil + } + dnsState := m.dnsResolvers[target] + // If no update received, return false. 
+ if !dnsState.updateReceived { + return false, nil, nil + } + + clusterMap[name].Config.EndpointConfig = &xdsresource.EndpointConfig{ + DNSEndpoints: dnsState.lastUpdate, + ResolutionNote: dnsState.err, + } + return true, []string{name}, nil + + case xdsresource.ClusterTypeAggregate: + var leafClusters []string + haveAllResources := true + for _, child := range update.PrioritizedClusterNames { + gotAll, childLeafCluster, err := m.populateClusterConfigLocked(child, depth+1, clusterMap, edsSeen, dnsSeen, clusterSeen) + if !gotAll { + haveAllResources = false + } + if err != nil { + clusterMap[name] = &xdsresource.ClusterResult{ + Err: err, + } + return true, leafClusters, err + } + leafClusters = append(leafClusters, childLeafCluster...) + } + if !haveAllResources { + return false, leafClusters, nil + } + if haveAllResources && len(leafClusters) == 0 { + clusterMap[name] = &xdsresource.ClusterResult{Err: m.annotateErrorWithNodeID(fmt.Errorf("no leaf clusters found for aggregate cluster"))} + return true, leafClusters, m.annotateErrorWithNodeID(fmt.Errorf("no leaf clusters found for aggregate cluster")) + } + clusterMap[name].Config.AggregateConfig = &xdsresource.AggregateConfig{ + LeafClusters: leafClusters, + } + return true, leafClusters, nil + } + clusterMap[name] = &xdsresource.ClusterResult{Err: m.annotateErrorWithNodeID(fmt.Errorf("no cluster data yet for cluster %s", name))} + return false, nil, nil } func (m *DependencyManager) applyRouteConfigUpdateLocked(update *xdsresource.RouteConfigUpdate) { @@ -149,6 +348,27 @@ func (m *DependencyManager) applyRouteConfigUpdateLocked(update *xdsresource.Rou } m.currentRouteConfig = update m.currentVirtualHost = matchVH + // Get the clusters to be watched from the routes in the virtual host. If + // the CLusterSpecifierField is set, we ignore it for now as the clusters + // will be determined dynamically for it. 
+ newClusters := make(map[string]struct{}) + + for _, rt := range matchVH.Routes { + for _, cluster := range rt.WeightedClusters { + newClusters[cluster.Name] = struct{}{} + } + } + if EnableCDSEDSlooksups { + // Cancel watch for clusters not seen in route config + for name := range m.clusterFromRouteConfig { + if _, ok := newClusters[name]; !ok { + m.clusterWatchers[name].cancelWatch() + delete(m.clusterWatchers, name) + } + } + + m.clusterFromRouteConfig = newClusters + } m.maybeSendUpdateLocked() } @@ -194,9 +414,9 @@ func (m *DependencyManager) onListenerResourceUpdate(update *xdsresource.Listene m.rdsResourceName = update.RouteConfigName if m.routeConfigWatcher != nil { m.routeConfigWatcher.stop() - m.currentVirtualHost = nil } m.routeConfigWatcher = newRouteConfigWatcher(m.rdsResourceName, m) + } func (m *DependencyManager) onListenerResourceError(err error, onDone func()) { @@ -213,8 +433,8 @@ func (m *DependencyManager) onListenerResourceError(err error, onDone func()) { if m.routeConfigWatcher != nil { m.routeConfigWatcher.stop() } + m.currentListenerUpdate = nil m.rdsResourceName = "" - m.currentVirtualHost = nil m.routeConfigWatcher = nil m.watcher.Error(fmt.Errorf("listener resource error: %v", m.annotateErrorWithNodeID(err))) } @@ -258,6 +478,7 @@ func (m *DependencyManager) onRouteConfigResourceError(resourceName string, err if m.stopped || m.rdsResourceName != resourceName { return } + m.currentRouteConfig = nil m.logger.Warningf("Received resource error for RouteConfiguration resource %q: %v", resourceName, m.annotateErrorWithNodeID(err)) m.watcher.Error(fmt.Errorf("route resource error: %v", m.annotateErrorWithNodeID(err))) } @@ -277,3 +498,150 @@ func (m *DependencyManager) onRouteConfigResourceAmbientError(resourceName strin m.logger.Warningf("Route resource ambient error %q: %v", resourceName, m.annotateErrorWithNodeID(err)) } + +func (m *DependencyManager) onClusterResourceUpdate(resourceName string, update *xdsresource.ClusterUpdate, onDone 
func()) { + m.mu.Lock() + defer m.mu.Unlock() + + defer onDone() + if m.stopped || m.clusterWatchers[resourceName] == nil { + return + } + + if m.logger.V(2) { + m.logger.Infof("Received update for Cluster resource %q: %+v", resourceName, update) + } + state := m.clusterWatchers[resourceName] + state.lastUpdate = update + state.err = nil + m.maybeSendUpdateLocked() + +} + +func (m *DependencyManager) onClusterResourceError(resourceName string, err error, onDone func()) { + m.mu.Lock() + defer m.mu.Unlock() + + defer onDone() + if m.stopped || m.clusterWatchers[resourceName] == nil { + return + } + m.logger.Warningf("Received resource error for Cluster resource %q: %v", resourceName, m.annotateErrorWithNodeID(err)) + state := m.clusterWatchers[resourceName] + state.err = err + // We want to stop using this cluster as we received a resource error so + // remove the last update. + state.lastUpdate = nil + m.maybeSendUpdateLocked() +} + +func (m *DependencyManager) onClusterAmbientError(resourceName string, err error, onDone func()) { + m.mu.Lock() + defer m.mu.Unlock() + + defer onDone() + // check resource name matches + if m.stopped || m.clusterWatchers[resourceName] == nil { + return + } + m.logger.Warningf("Cluster resource ambient error %q: %v", resourceName, m.annotateErrorWithNodeID(err)) +} + +func (m *DependencyManager) onEndpointUpdate(resourceName string, update *xdsresource.EndpointsUpdate, onDone func()) { + m.mu.Lock() + defer m.mu.Unlock() + + defer onDone() + if m.stopped || m.endpointWatchers[resourceName] == nil { + return + } + + if m.logger.V(2) { + m.logger.Infof("Received update for Endpoint resource %q: %+v", resourceName, update) + } + state := m.endpointWatchers[resourceName] + state.lastUpdate = update + state.err = nil + m.maybeSendUpdateLocked() +} + +func (m *DependencyManager) onEndpointResourceError(resourceName string, err error, onDone func()) { + m.mu.Lock() + defer m.mu.Unlock() + + defer onDone() + if m.stopped || 
m.endpointWatchers[resourceName] == nil { + return + } + m.logger.Warningf("Received resource error for Endpoint resource %q: %v", resourceName, m.annotateErrorWithNodeID(err)) + state := m.endpointWatchers[resourceName] + state.err = err + // We want to stop using the endpoints for this cluster as we received a + // resource error so remove the last update. + state.lastUpdate = nil + m.maybeSendUpdateLocked() +} + +func (m *DependencyManager) onEndpointAmbientError(resourceName string, err error, onDone func()) { + m.mu.Lock() + defer m.mu.Unlock() + + defer onDone() + if m.stopped || m.endpointWatchers[resourceName] == nil { + return + } + + m.logger.Warningf("Endpoint resource ambient error %q: %v", resourceName, m.annotateErrorWithNodeID(err)) + state := m.endpointWatchers[resourceName] + state.err = err + m.maybeSendUpdateLocked() +} + +func (m *DependencyManager) onDNSUpdate(resourceName string, update *resolver.State) { + m.mu.Lock() + defer m.mu.Unlock() + if m.stopped || m.dnsResolvers[resourceName] == nil { + return + } + + if m.logger.V(2) { + m.logger.Infof("Received update from DNS resolver for resource %q: %+v", resourceName, update) + } + var endpoints []resolver.Endpoint + if len(update.Endpoints) == 0 { + endpoints = make([]resolver.Endpoint, len(update.Addresses)) + for i, a := range update.Addresses { + endpoints[i] = resolver.Endpoint{Addresses: []resolver.Address{a}} + endpoints[i].Attributes = a.BalancerAttributes + } + } + + state := m.dnsResolvers[resourceName] + state.lastUpdate = &xdsresource.DNSUpdate{Endpoints: endpoints} + state.updateReceived = true + state.err = nil + m.maybeSendUpdateLocked() +} + +func (m *DependencyManager) onDNSError(resourceName string, err error) { + m.mu.Lock() + defer m.mu.Unlock() + + if m.stopped || m.dnsResolvers[resourceName] == nil { + return + } + + state := m.dnsResolvers[resourceName] + // If a previous good update was received, record the error and continue + // using the previous update. 
If RPCs were succeeding prior to this, they + // will continue to do so. Also suppress errors if we previously received an + // error, since there will be no downstream effects of propagating this + // error. + state.err = err + if !state.updateReceived { + state.lastUpdate = nil + state.updateReceived = true + } + m.logger.Warningf("DNS resolver error %q: %v", resourceName, m.annotateErrorWithNodeID(err)) + m.maybeSendUpdateLocked() +} diff --git a/internal/xds/xdsdepmgr/xds_dependency_manager_test.go b/internal/xds/xdsdepmgr/xds_dependency_manager_test.go index 009010877adf..eee17137a7b5 100644 --- a/internal/xds/xdsdepmgr/xds_dependency_manager_test.go +++ b/internal/xds/xdsdepmgr/xds_dependency_manager_test.go @@ -21,6 +21,7 @@ package xdsdepmgr_test import ( "context" "fmt" + "regexp" "strings" "testing" "time" @@ -33,10 +34,16 @@ import ( "google.golang.org/grpc/internal/testutils" "google.golang.org/grpc/internal/testutils/xds/e2e" "google.golang.org/grpc/internal/xds/bootstrap" + "google.golang.org/grpc/internal/xds/clients" "google.golang.org/grpc/internal/xds/xdsclient" "google.golang.org/grpc/internal/xds/xdsclient/xdsresource" "google.golang.org/grpc/internal/xds/xdsdepmgr" + "google.golang.org/grpc/resolver" + "google.golang.org/protobuf/types/known/wrapperspb" + v3clusterpb "github.com/envoyproxy/go-control-plane/envoy/config/cluster/v3" + v3corepb "github.com/envoyproxy/go-control-plane/envoy/config/core/v3" + v3endpointpb "github.com/envoyproxy/go-control-plane/envoy/config/endpoint/v3" v3listenerpb "github.com/envoyproxy/go-control-plane/envoy/config/listener/v3" v3routepb "github.com/envoyproxy/go-control-plane/envoy/config/route/v3" v3routerpb "github.com/envoyproxy/go-control-plane/envoy/extensions/filters/http/router/v3" @@ -50,6 +57,7 @@ type s struct { } func Test(t *testing.T) { + xdsdepmgr.EnableCDSEDSlooksups = true grpctest.RunSubTests(t, s{}) } @@ -60,12 +68,15 @@ const ( defaultTestServiceName = "service-name" defaultTestRouteConfigName 
= "route-config-name" defaultTestClusterName = "cluster-name" + defaultTestEDSServiceName = "eds-service-name" ) func newStringP(s string) *string { return &s } +// depMgr.EnableCDSEDSlooksups=true + // testWatcher is an implementation of the ConfigWatcher interface that sends // the updates and errors received from the dependency manager to respective // channels, for the tests to verify. @@ -112,6 +123,22 @@ func verifyXDSConfig(ctx context.Context, xdsCh chan *xdsresource.XDSConfig, err cmpopts.IgnoreFields(xdsresource.HTTPFilter{}, "Filter", "Config"), cmpopts.IgnoreFields(xdsresource.ListenerUpdate{}, "Raw"), cmpopts.IgnoreFields(xdsresource.RouteConfigUpdate{}, "Raw"), + cmpopts.IgnoreFields(xdsresource.ClusterUpdate{}, "Raw", "LBPolicy", "TelemetryLabels"), + cmpopts.IgnoreFields(xdsresource.EndpointsUpdate{}, "Raw"), + cmp.Transformer("ErrorsToString", func(in error) string { + if in == nil { + return "" // Treat nil as an empty string + } + s := in.Error() + + // Replace all sequences of whitespace (including newlines and + // tabs) with a single standard space. + s = regexp.MustCompile(`\s+`).ReplaceAllString(s, " ") + + // Trim any leading/trailing space that might be left over and + // return error as string. + return strings.TrimSpace(s) + }), } if diff := cmp.Diff(update, want, cmpOpts...); diff != "" { return fmt.Errorf("received unexpected update from dependency manager. 
Diff (-got +want):\n%v", diff) @@ -122,6 +149,71 @@ func verifyXDSConfig(ctx context.Context, xdsCh chan *xdsresource.XDSConfig, err return nil } +func makedefaultXDSConfig(routeConfigName, clusterName, edsServiceName, addr string) *xdsresource.XDSConfig { + return &xdsresource.XDSConfig{ + Listener: &xdsresource.ListenerUpdate{ + RouteConfigName: routeConfigName, + HTTPFilters: []xdsresource.HTTPFilter{{Name: "router"}}, + }, + RouteConfig: &xdsresource.RouteConfigUpdate{ + VirtualHosts: []*xdsresource.VirtualHost{ + { + Domains: []string{defaultTestServiceName}, + Routes: []*xdsresource.Route{{ + Prefix: newStringP("/"), + WeightedClusters: []xdsresource.WeightedCluster{{Name: clusterName, Weight: 100}}, + ActionType: xdsresource.RouteActionRoute, + }}, + }, + }, + }, + VirtualHost: &xdsresource.VirtualHost{ + Domains: []string{defaultTestServiceName}, + Routes: []*xdsresource.Route{{ + Prefix: newStringP("/"), + WeightedClusters: []xdsresource.WeightedCluster{{Name: clusterName, Weight: 100}}, + ActionType: xdsresource.RouteActionRoute}, + }, + }, + Clusters: map[string]*xdsresource.ClusterResult{ + clusterName: { + Config: xdsresource.ClusterConfig{Cluster: &xdsresource.ClusterUpdate{ + ClusterType: xdsresource.ClusterTypeEDS, + ClusterName: clusterName, + EDSServiceName: edsServiceName, + }, + EndpointConfig: &xdsresource.EndpointConfig{ + EDSUpdate: &xdsresource.EndpointsUpdate{ + Localities: []xdsresource.Locality{ + {ID: clients.Locality{ + Region: "region-1", + Zone: "zone-1", + SubZone: "subzone-1", + }, + Endpoints: []xdsresource.Endpoint{ + { + Addresses: []string{addr}, + HealthStatus: xdsresource.EndpointHealthStatusUnknown, + Weight: 1, + }, + }, + Weight: 1}, + }, + }, + }}, + }, + }, + } +} + +func setup(t *testing.T, allowResourceSubset bool) (string, *e2e.ManagementServer, xdsclient.XDSClient) { + t.Helper() + nodeID := uuid.New().String() + mgmtServer, bootstrapContents := setupManagementServerForTest(t, nodeID, allowResourceSubset) + xdsClient 
:= createXDSClient(t, bootstrapContents) + return nodeID, mgmtServer, xdsClient +} + func createXDSClient(t *testing.T, bootstrapContents []byte) xdsclient.XDSClient { t.Helper() @@ -142,28 +234,50 @@ func createXDSClient(t *testing.T, bootstrapContents []byte) xdsclient.XDSClient return c } -// Spins up an xDS management server and sets up the xDS bootstrap configuration. +// Spins up an xDS management server and sets up the xDS bootstrap +// configuration. // // Returns the following: // - A reference to the xDS management server // - Contents of the bootstrap configuration pointing to xDS management // server -func setupManagementServerForTest(t *testing.T, nodeID string) (*e2e.ManagementServer, []byte) { +func setupManagementServerForTest(t *testing.T, nodeID string, allowResourceSubset bool) (*e2e.ManagementServer, []byte) { t.Helper() - mgmtServer := e2e.StartManagementServer(t, e2e.ManagementServerOptions{}) + mgmtServer := e2e.StartManagementServer(t, e2e.ManagementServerOptions{ + AllowResourceSubset: allowResourceSubset, + }) t.Cleanup(mgmtServer.Stop) bootstrapContents := e2e.DefaultBootstrapContents(t, nodeID, mgmtServer.Address) return mgmtServer, bootstrapContents } +// makeAggregateClusterResource returns an aggregate cluster resource with the +// given name and list of child names. +func makeAggregateClusterResource(name string, childNames []string) *v3clusterpb.Cluster { + return e2e.ClusterResourceWithOptions(e2e.ClusterOptions{ + ClusterName: name, + Type: e2e.ClusterTypeAggregate, + ChildNames: childNames, + }) +} + +// makeLogicalDNSClusterResource returns a LOGICAL_DNS cluster resource with the +// given name and given DNS host and port. 
+func makeLogicalDNSClusterResource(name, dnsHost string, dnsPort uint32) *v3clusterpb.Cluster { + return e2e.ClusterResourceWithOptions(e2e.ClusterOptions{ + ClusterName: name, + Type: e2e.ClusterTypeLogicalDNS, + DNSHostName: dnsHost, + DNSPort: dnsPort, + }) +} + // Tests the happy case where the dependency manager receives all the required // resources and verifies that Update is called with with the correct XDSConfig. func (s) TestHappyCase(t *testing.T) { - nodeID := uuid.New().String() - mgmtServer, bc := setupManagementServerForTest(t, nodeID) - xdsClient := createXDSClient(t, bc) + nodeID, mgmtServer, xdsClient := setup(t, false) watcher := &testWatcher{ updateCh: make(chan *xdsresource.XDSConfig), @@ -173,46 +287,20 @@ func (s) TestHappyCase(t *testing.T) { ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout) defer cancel() - listener := e2e.DefaultClientListener(defaultTestServiceName, defaultTestRouteConfigName) - route := e2e.DefaultRouteConfig(defaultTestRouteConfigName, defaultTestServiceName, defaultTestClusterName) - resources := e2e.UpdateOptions{ - NodeID: nodeID, - Listeners: []*v3listenerpb.Listener{listener}, - Routes: []*v3routepb.RouteConfiguration{route}, - SkipValidation: true, - } + resources := e2e.DefaultClientResources(e2e.ResourceParams{ + NodeID: nodeID, + DialTarget: defaultTestServiceName, + Host: "localhost", + Port: 8080, + SecLevel: e2e.SecurityLevelNone, + }) if err := mgmtServer.Update(ctx, resources); err != nil { t.Fatal(err) } dm := xdsdepmgr.New(defaultTestServiceName, defaultTestServiceName, xdsClient, watcher) defer dm.Close() - wantXdsConfig := &xdsresource.XDSConfig{ - Listener: &xdsresource.ListenerUpdate{ - RouteConfigName: defaultTestRouteConfigName, - HTTPFilters: []xdsresource.HTTPFilter{{Name: "router"}}, - }, - RouteConfig: &xdsresource.RouteConfigUpdate{ - VirtualHosts: []*xdsresource.VirtualHost{ - { - Domains: []string{defaultTestServiceName}, - Routes: []*xdsresource.Route{{ - Prefix: 
newStringP("/"), - WeightedClusters: []xdsresource.WeightedCluster{{Name: defaultTestClusterName, Weight: 100}}, - ActionType: xdsresource.RouteActionRoute, - }}, - }, - }, - }, - VirtualHost: &xdsresource.VirtualHost{ - Domains: []string{defaultTestServiceName}, - Routes: []*xdsresource.Route{{ - Prefix: newStringP("/"), - WeightedClusters: []xdsresource.WeightedCluster{{Name: defaultTestClusterName, Weight: 100}}, - ActionType: xdsresource.RouteActionRoute}, - }, - }, - } + wantXdsConfig := makedefaultXDSConfig(resources.Routes[0].Name, resources.Clusters[0].Name, resources.Clusters[0].EdsClusterConfig.ServiceName, "localhost:8080") if err := verifyXDSConfig(ctx, watcher.updateCh, watcher.errorCh, wantXdsConfig); err != nil { t.Fatal(err) } @@ -221,9 +309,7 @@ func (s) TestHappyCase(t *testing.T) { // Tests the case where the listener contains an inline route configuration and // verifies that Update is called with the correct XDSConfig. func (s) TestInlineRouteConfig(t *testing.T) { - nodeID := uuid.New().String() - mgmtServer, bc := setupManagementServerForTest(t, nodeID) - xdsClient := createXDSClient(t, bc) + nodeID, mgmtServer, xdsClient := setup(t, false) watcher := &testWatcher{ updateCh: make(chan *xdsresource.XDSConfig), @@ -249,9 +335,13 @@ func (s) TestInlineRouteConfig(t *testing.T) { }}, }}, } + cluster := e2e.DefaultCluster(defaultTestClusterName, defaultTestEDSServiceName, e2e.SecurityLevelNone) + endpoint := e2e.DefaultEndpoint(defaultTestEDSServiceName, "localhost", []uint32{8080}) resources := e2e.UpdateOptions{ NodeID: nodeID, Listeners: []*v3listenerpb.Listener{listener}, + Clusters: []*v3clusterpb.Cluster{cluster}, + Endpoints: []*v3endpointpb.ClusterLoadAssignment{endpoint}, SkipValidation: true, } if err := mgmtServer.Update(ctx, resources); err != nil { @@ -260,44 +350,11 @@ func (s) TestInlineRouteConfig(t *testing.T) { dm := xdsdepmgr.New(defaultTestServiceName, defaultTestServiceName, xdsClient, watcher) defer dm.Close() - wantConfig := 
&xdsresource.XDSConfig{ - Listener: &xdsresource.ListenerUpdate{ - InlineRouteConfig: &xdsresource.RouteConfigUpdate{ - VirtualHosts: []*xdsresource.VirtualHost{ - { - Domains: []string{defaultTestServiceName}, - Routes: []*xdsresource.Route{{ - Prefix: newStringP("/"), - WeightedClusters: []xdsresource.WeightedCluster{{Name: defaultTestClusterName, Weight: 100}}, - ActionType: xdsresource.RouteActionRoute}, - }, - }, - }, - }, - HTTPFilters: []xdsresource.HTTPFilter{{Name: "router"}}, - }, - RouteConfig: &xdsresource.RouteConfigUpdate{ - VirtualHosts: []*xdsresource.VirtualHost{ - { - Domains: []string{defaultTestServiceName}, - Routes: []*xdsresource.Route{{ - Prefix: newStringP("/"), - WeightedClusters: []xdsresource.WeightedCluster{{Name: defaultTestClusterName, Weight: 100}}, - ActionType: xdsresource.RouteActionRoute}, - }, - }, - }, - }, - VirtualHost: &xdsresource.VirtualHost{ - Domains: []string{defaultTestServiceName}, - Routes: []*xdsresource.Route{{ - Prefix: newStringP("/"), - WeightedClusters: []xdsresource.WeightedCluster{{Name: defaultTestClusterName, Weight: 100}}, - ActionType: xdsresource.RouteActionRoute}, - }, - }, - } - if err := verifyXDSConfig(ctx, watcher.updateCh, watcher.errorCh, wantConfig); err != nil { + wantXdsConfig := makedefaultXDSConfig(defaultTestRouteConfigName, defaultTestClusterName, defaultTestEDSServiceName, "localhost:8080") + wantXdsConfig.Listener.InlineRouteConfig = wantXdsConfig.RouteConfig + wantXdsConfig.Listener.RouteConfigName = "" + + if err := verifyXDSConfig(ctx, watcher.updateCh, watcher.errorCh, wantXdsConfig); err != nil { t.Fatal(err) } } @@ -306,9 +363,7 @@ func (s) TestInlineRouteConfig(t *testing.T) { // does not receive route config resource. Verfies that Update is not called // since we do not have all resources. 
func (s) TestIncompleteResources(t *testing.T) { - nodeID := uuid.New().String() - mgmtServer, bc := setupManagementServerForTest(t, nodeID) - xdsClient := createXDSClient(t, bc) + nodeID, mgmtServer, xdsClient := setup(t, false) watcher := &testWatcher{ updateCh: make(chan *xdsresource.XDSConfig), @@ -343,9 +398,7 @@ func (s) TestIncompleteResources(t *testing.T) { // sending the correct update first and then removing the listener resource. It // verifies that Error is called with the correct error. func (s) TestListenerResourceError(t *testing.T) { - nodeID := uuid.New().String() - mgmtServer, bc := setupManagementServerForTest(t, nodeID) - xdsClient := createXDSClient(t, bc) + nodeID, mgmtServer, xdsClient := setup(t, false) watcher := &testWatcher{ updateCh: make(chan *xdsresource.XDSConfig), @@ -355,54 +408,27 @@ func (s) TestListenerResourceError(t *testing.T) { defer cancel() // Send a correct update first - listener := e2e.DefaultClientListener(defaultTestServiceName, defaultTestRouteConfigName) - listener.FilterChains = nil - route := e2e.DefaultRouteConfig(defaultTestRouteConfigName, defaultTestServiceName, defaultTestClusterName) - resources := e2e.UpdateOptions{ - NodeID: nodeID, - Listeners: []*v3listenerpb.Listener{listener}, - Routes: []*v3routepb.RouteConfiguration{route}, - SkipValidation: true, - } + resources := e2e.DefaultClientResources(e2e.ResourceParams{ + NodeID: nodeID, + DialTarget: defaultTestServiceName, + Host: "localhost", + Port: 8080, + SecLevel: e2e.SecurityLevelNone, + }) if err := mgmtServer.Update(ctx, resources); err != nil { t.Fatal(err) } dm := xdsdepmgr.New(defaultTestServiceName, defaultTestServiceName, xdsClient, watcher) defer dm.Close() - - wantXdsConfig := &xdsresource.XDSConfig{ - Listener: &xdsresource.ListenerUpdate{ - RouteConfigName: defaultTestRouteConfigName, - HTTPFilters: []xdsresource.HTTPFilter{{Name: "router"}}, - }, - RouteConfig: &xdsresource.RouteConfigUpdate{ - VirtualHosts: []*xdsresource.VirtualHost{ - 
{ - Domains: []string{defaultTestServiceName}, - Routes: []*xdsresource.Route{{ - Prefix: newStringP("/"), - WeightedClusters: []xdsresource.WeightedCluster{{Name: defaultTestClusterName, Weight: 100}}, - ActionType: xdsresource.RouteActionRoute, - }}, - }, - }, - }, - VirtualHost: &xdsresource.VirtualHost{ - Domains: []string{defaultTestServiceName}, - Routes: []*xdsresource.Route{{ - Prefix: newStringP("/"), - WeightedClusters: []xdsresource.WeightedCluster{{Name: defaultTestClusterName, Weight: 100}}, - ActionType: xdsresource.RouteActionRoute, - }}, - }, - } + wantXdsConfig := makedefaultXDSConfig(resources.Routes[0].Name, resources.Clusters[0].Name, resources.Clusters[0].EdsClusterConfig.ServiceName, "localhost:8080") if err := verifyXDSConfig(ctx, watcher.updateCh, watcher.errorCh, wantXdsConfig); err != nil { t.Fatal(err) } // Remove listener resource so that we get listener resource error. resources.Listeners = nil + resources.SkipValidation = true if err := mgmtServer.Update(ctx, resources); err != nil { t.Fatal(err) } @@ -416,14 +442,11 @@ func (s) TestListenerResourceError(t *testing.T) { // error by sending a route resource that is NACKed by the XDSClient. It // verifies that Error is called with correct error. func (s) TestRouteResourceError(t *testing.T) { - nodeID := uuid.New().String() - mgmtServer, bc := setupManagementServerForTest(t, nodeID) - xdsClient := createXDSClient(t, bc) + nodeID, mgmtServer, xdsClient := setup(t, false) - errorCh := make(chan error, 1) watcher := &testWatcher{ updateCh: make(chan *xdsresource.XDSConfig), - errorCh: errorCh, + errorCh: make(chan error), } ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout) defer cancel() @@ -455,7 +478,7 @@ func (s) TestRouteResourceError(t *testing.T) { // host. Verifies that Error is called with correct error. 
func (s) TestNoVirtualHost(t *testing.T) { nodeID := uuid.New().String() - mgmtServer, bc := setupManagementServerForTest(t, nodeID) + mgmtServer, bc := setupManagementServerForTest(t, nodeID, false) xdsClient := createXDSClient(t, bc) @@ -466,15 +489,16 @@ func (s) TestNoVirtualHost(t *testing.T) { ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout) defer cancel() - listener := e2e.DefaultClientListener(defaultTestServiceName, defaultTestRouteConfigName) - route := e2e.DefaultRouteConfig(defaultTestRouteConfigName, defaultTestServiceName, defaultTestClusterName) - route.VirtualHosts = nil - resources := e2e.UpdateOptions{ - NodeID: nodeID, - Listeners: []*v3listenerpb.Listener{listener}, - Routes: []*v3routepb.RouteConfiguration{route}, - SkipValidation: true, - } + resources := e2e.DefaultClientResources(e2e.ResourceParams{ + NodeID: nodeID, + DialTarget: defaultTestServiceName, + Host: "localhost", + Port: 8080, + SecLevel: e2e.SecurityLevelNone, + }) + // Remove all virtual hosts so that no virtual host matches the dial target. + resources.Routes[0].VirtualHosts = nil + if err := mgmtServer.Update(ctx, resources); err != nil { t.Fatal(err) } @@ -490,9 +514,7 @@ func (s) TestNoVirtualHost(t *testing.T) { // route resource with no virtual host, which also results in error being sent // across. func (s) TestNoVirtualHost_ExistingResource(t *testing.T) { - nodeID := uuid.New().String() - mgmtServer, bc := setupManagementServerForTest(t, nodeID) - xdsClient := createXDSClient(t, bc) + nodeID, mgmtServer, xdsClient := setup(t, false) watcher := &testWatcher{ updateCh: make(chan *xdsresource.XDSConfig), @@ -502,15 +524,13 @@ func (s) TestNoVirtualHost_ExistingResource(t *testing.T) { ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout) defer cancel() - // Send valid listener and route.
- listener := e2e.DefaultClientListener(defaultTestServiceName, defaultTestRouteConfigName) - route := e2e.DefaultRouteConfig(defaultTestRouteConfigName, defaultTestServiceName, defaultTestClusterName) - resources := e2e.UpdateOptions{ - NodeID: nodeID, - Listeners: []*v3listenerpb.Listener{listener}, - Routes: []*v3routepb.RouteConfiguration{route}, - SkipValidation: true, - } + resources := e2e.DefaultClientResources(e2e.ResourceParams{ + NodeID: nodeID, + DialTarget: defaultTestServiceName, + Host: "localhost", + Port: 8080, + SecLevel: e2e.SecurityLevelNone, + }) if err := mgmtServer.Update(ctx, resources); err != nil { t.Fatal(err) } @@ -519,38 +539,13 @@ func (s) TestNoVirtualHost_ExistingResource(t *testing.T) { defer dm.Close() // Verify valid update. - wantXdsConfig := &xdsresource.XDSConfig{ - Listener: &xdsresource.ListenerUpdate{ - RouteConfigName: defaultTestRouteConfigName, - HTTPFilters: []xdsresource.HTTPFilter{{Name: "router"}}, - }, - RouteConfig: &xdsresource.RouteConfigUpdate{ - VirtualHosts: []*xdsresource.VirtualHost{ - { - Domains: []string{defaultTestServiceName}, - Routes: []*xdsresource.Route{{ - Prefix: newStringP("/"), - WeightedClusters: []xdsresource.WeightedCluster{{Name: defaultTestClusterName, Weight: 100}}, - ActionType: xdsresource.RouteActionRoute, - }}, - }, - }, - }, - VirtualHost: &xdsresource.VirtualHost{ - Domains: []string{defaultTestServiceName}, - Routes: []*xdsresource.Route{{ - Prefix: newStringP("/"), - WeightedClusters: []xdsresource.WeightedCluster{{Name: defaultTestClusterName, Weight: 100}}, - ActionType: xdsresource.RouteActionRoute, - }}, - }, - } + wantXdsConfig := makedefaultXDSConfig(resources.Routes[0].Name, resources.Clusters[0].Name, resources.Endpoints[0].ClusterName, "localhost:8080") if err := verifyXDSConfig(ctx, watcher.updateCh, watcher.errorCh, wantXdsConfig); err != nil { t.Fatal(err) } // 3. Send route update with no virtual host. 
- route.VirtualHosts = nil + resources.Routes[0].VirtualHosts = nil if err := mgmtServer.Update(ctx, resources); err != nil { t.Fatal(err) } @@ -571,26 +566,24 @@ func (s) TestAmbientError(t *testing.T) { // Expect a warning log for the ambient error. grpctest.ExpectWarning("Listener resource ambient error") - nodeID := uuid.New().String() - mgmtServer, bc := setupManagementServerForTest(t, nodeID) - xdsClient := createXDSClient(t, bc) + nodeID, mgmtServer, xdsClient := setup(t, false) watcher := &testWatcher{ updateCh: make(chan *xdsresource.XDSConfig), errorCh: make(chan error), } + ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout) defer cancel() - // Configure a valid listener and route. - listener := e2e.DefaultClientListener(defaultTestServiceName, defaultTestRouteConfigName) - route := e2e.DefaultRouteConfig(defaultTestRouteConfigName, defaultTestServiceName, defaultTestClusterName) - resources := e2e.UpdateOptions{ - NodeID: nodeID, - Listeners: []*v3listenerpb.Listener{listener}, - Routes: []*v3routepb.RouteConfiguration{route}, - SkipValidation: true, - } + // Configure valid resources. + resources := e2e.DefaultClientResources(e2e.ResourceParams{ + NodeID: nodeID, + DialTarget: defaultTestServiceName, + Host: "localhost", + Port: 8080, + SecLevel: e2e.SecurityLevelNone, + }) if err := mgmtServer.Update(ctx, resources); err != nil { t.Fatal(err) } @@ -599,32 +592,7 @@ func (s) TestAmbientError(t *testing.T) { // Wait for the initial valid update.
- wantXdsConfig := &xdsresource.XDSConfig{ - Listener: &xdsresource.ListenerUpdate{ - RouteConfigName: defaultTestRouteConfigName, - HTTPFilters: []xdsresource.HTTPFilter{{Name: "router"}}, - }, - RouteConfig: &xdsresource.RouteConfigUpdate{ - VirtualHosts: []*xdsresource.VirtualHost{ - { - Domains: []string{defaultTestServiceName}, - Routes: []*xdsresource.Route{{ - Prefix: newStringP("/"), - WeightedClusters: []xdsresource.WeightedCluster{{Name: defaultTestClusterName, Weight: 100}}, - ActionType: xdsresource.RouteActionRoute, - }}, - }, - }, - }, - VirtualHost: &xdsresource.VirtualHost{ - Domains: []string{defaultTestServiceName}, - Routes: []*xdsresource.Route{{ - Prefix: newStringP("/"), - WeightedClusters: []xdsresource.WeightedCluster{{Name: defaultTestClusterName, Weight: 100}}, - ActionType: xdsresource.RouteActionRoute, - }}, - }, - } + wantXdsConfig := makedefaultXDSConfig(resources.Routes[0].Name, resources.Clusters[0].Name, resources.Clusters[0].EdsClusterConfig.ServiceName, "localhost:8080") if err := verifyXDSConfig(ctx, watcher.updateCh, watcher.errorCh, wantXdsConfig); err != nil { t.Fatal(err) } @@ -648,12 +616,13 @@ func (s) TestAmbientError(t *testing.T) { }}, } resources.Listeners = []*v3listenerpb.Listener{lis} + resources.SkipValidation = true if err := mgmtServer.Update(ctx, resources); err != nil { t.Fatal(err) } - // We expect no call to Error or Update on our watcher. We just wait for - // a short duration to ensure that. + // We expect no call to Error or Update on our watcher. We just wait for a + // short duration to ensure that. sCtx, sCancel := context.WithTimeout(ctx, defaultTestShortTimeout) defer sCancel() select { @@ -665,14 +634,13 @@ func (s) TestAmbientError(t *testing.T) { } // Send valid resources again. 
- listener = e2e.DefaultClientListener(defaultTestServiceName, defaultTestRouteConfigName) - route = e2e.DefaultRouteConfig(defaultTestRouteConfigName, defaultTestServiceName, defaultTestClusterName) - resources = e2e.UpdateOptions{ - NodeID: nodeID, - Listeners: []*v3listenerpb.Listener{listener}, - Routes: []*v3routepb.RouteConfiguration{route}, - SkipValidation: true, - } + resources = e2e.DefaultClientResources(e2e.ResourceParams{ + NodeID: nodeID, + DialTarget: defaultTestServiceName, + Host: "localhost", + Port: 8080, + SecLevel: e2e.SecurityLevelNone, + }) if err := mgmtServer.Update(ctx, resources); err != nil { t.Fatal(err) } @@ -684,9 +652,7 @@ func (s) TestAmbientError(t *testing.T) { // Tests the case where the cluster name changes in the route resource update // and verify that each time Update is called with correct cluster name. func (s) TestRouteResourceUpdate(t *testing.T) { - nodeID := uuid.New().String() - mgmtServer, bc := setupManagementServerForTest(t, nodeID) - xdsClient := createXDSClient(t, bc) + nodeID, mgmtServer, xdsClient := setup(t, false) watcher := &testWatcher{ updateCh: make(chan *xdsresource.XDSConfig), @@ -695,15 +661,14 @@ func (s) TestRouteResourceUpdate(t *testing.T) { ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout) defer cancel() - // Initial resources with defaultTestClusterName - listener := e2e.DefaultClientListener(defaultTestServiceName, defaultTestRouteConfigName) - route := e2e.DefaultRouteConfig(defaultTestRouteConfigName, defaultTestServiceName, defaultTestClusterName) - resources := e2e.UpdateOptions{ - NodeID: nodeID, - Listeners: []*v3listenerpb.Listener{listener}, - Routes: []*v3routepb.RouteConfiguration{route}, - SkipValidation: true, - } + // Configure initial resources + resources := e2e.DefaultClientResources(e2e.ResourceParams{ + NodeID: nodeID, + DialTarget: defaultTestServiceName, + Host: "localhost", + Port: 8080, + SecLevel: e2e.SecurityLevelNone, + }) if err := 
mgmtServer.Update(ctx, resources); err != nil { t.Fatal(err) } @@ -712,10 +677,128 @@ func (s) TestRouteResourceUpdate(t *testing.T) { defer dm.Close() // Wait for the first update. - wantXdsConfig := &xdsresource.XDSConfig{ + wantXdsConfig := makedefaultXDSConfig(resources.Routes[0].Name, resources.Clusters[0].Name, resources.Clusters[0].EdsClusterConfig.ServiceName, "localhost:8080") + if err := verifyXDSConfig(ctx, watcher.updateCh, watcher.errorCh, wantXdsConfig); err != nil { + t.Fatal(err) + } + + // Update route to point to a new cluster. + newClusterName := "new-cluster-name" + route2 := e2e.DefaultRouteConfig(resources.Routes[0].Name, defaultTestServiceName, newClusterName) + cluster2 := e2e.DefaultCluster(newClusterName, newClusterName, e2e.SecurityLevelNone) + endpoints2 := e2e.DefaultEndpoint(newClusterName, "localhost", []uint32{8081}) + resources.Routes = []*v3routepb.RouteConfiguration{route2} + resources.Clusters = []*v3clusterpb.Cluster{cluster2} + resources.Endpoints = []*v3endpointpb.ClusterLoadAssignment{endpoints2} + if err := mgmtServer.Update(ctx, resources); err != nil { + t.Fatal(err) + } + + // Wait for the second update and verify it has the new cluster. 
+ wantXdsConfig.RouteConfig.VirtualHosts[0].Routes[0].WeightedClusters[0].Name = newClusterName + wantXdsConfig.VirtualHost.Routes[0].WeightedClusters[0].Name = newClusterName + wantXdsConfig.Clusters = map[string]*xdsresource.ClusterResult{ + newClusterName: { + Config: xdsresource.ClusterConfig{ + Cluster: &xdsresource.ClusterUpdate{ + ClusterName: newClusterName, + ClusterType: xdsresource.ClusterTypeEDS, + EDSServiceName: newClusterName, + }, + EndpointConfig: &xdsresource.EndpointConfig{ + EDSUpdate: &xdsresource.EndpointsUpdate{ + Localities: []xdsresource.Locality{ + {ID: clients.Locality{ + Region: "region-1", + Zone: "zone-1", + SubZone: "subzone-1", + }, + Endpoints: []xdsresource.Endpoint{ + { + Addresses: []string{"localhost:8081"}, + HealthStatus: xdsresource.EndpointHealthStatusUnknown, + Weight: 1, + }, + }, + Weight: 1}, + }, + }, + }, + }, + }, + } + if err := verifyXDSConfig(ctx, watcher.updateCh, watcher.errorCh, wantXdsConfig); err != nil { + t.Fatal(err) + } +} + +// Tests the case where the route resource is first sent from the management +// server and the changed to be inline with the listener and then again changed +// to be received from the management server. It verifies that each time Update +// called with the correct XDSConfig. 
+func (s) TestRouteResourceChangeToInline(t *testing.T) { + nodeID, mgmtServer, xdsClient := setup(t, false) + + watcher := &testWatcher{ + updateCh: make(chan *xdsresource.XDSConfig), + errorCh: make(chan error), + } + ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout) + defer cancel() + + // Initial resources with defaultTestClusterName + resources := e2e.DefaultClientResources(e2e.ResourceParams{ + NodeID: nodeID, + DialTarget: defaultTestServiceName, + Host: "localhost", + Port: 8080, + SecLevel: e2e.SecurityLevelNone, + }) + if err := mgmtServer.Update(ctx, resources); err != nil { + t.Fatal(err) + } + + dm := xdsdepmgr.New(defaultTestServiceName, defaultTestServiceName, xdsClient, watcher) + defer dm.Close() + + // Wait for the first update. + wantXdsConfig := makedefaultXDSConfig(resources.Routes[0].Name, resources.Clusters[0].Name, resources.Clusters[0].EdsClusterConfig.ServiceName, "localhost:8080") + if err := verifyXDSConfig(ctx, watcher.updateCh, watcher.errorCh, wantXdsConfig); err != nil { + t.Fatal(err) + } + + // Update route to point to a new cluster and make it inline with the + // listener. 
+ newClusterName := "new-cluster-name" + hcm := testutils.MarshalAny(t, &v3httppb.HttpConnectionManager{ + RouteSpecifier: &v3httppb.HttpConnectionManager_RouteConfig{ + RouteConfig: e2e.DefaultRouteConfig(defaultTestRouteConfigName, defaultTestServiceName, newClusterName), + }, + HttpFilters: []*v3httppb.HttpFilter{e2e.HTTPFilter("router", &v3routerpb.Router{})}, // router fields are unused by grpc + }) + resources.Listeners[0].ApiListener.ApiListener = hcm + resources.Clusters = []*v3clusterpb.Cluster{e2e.DefaultCluster(newClusterName, defaultTestEDSServiceName, e2e.SecurityLevelNone)} + resources.Endpoints = []*v3endpointpb.ClusterLoadAssignment{e2e.DefaultEndpoint(defaultTestEDSServiceName, "localhost", []uint32{8081})} + if err := mgmtServer.Update(ctx, resources); err != nil { + t.Fatal(err) + } + + // Wait for the second update and verify it has the new cluster. + wantInlineXdsConfig := &xdsresource.XDSConfig{ Listener: &xdsresource.ListenerUpdate{ - RouteConfigName: defaultTestRouteConfigName, - HTTPFilters: []xdsresource.HTTPFilter{{Name: "router"}}, + HTTPFilters: []xdsresource.HTTPFilter{{Name: "router"}}, + InlineRouteConfig: &xdsresource.RouteConfigUpdate{ + VirtualHosts: []*xdsresource.VirtualHost{ + { + Domains: []string{defaultTestServiceName}, + Routes: []*xdsresource.Route{{ + Prefix: newStringP("/"), + WeightedClusters: []xdsresource.WeightedCluster{{Name: newClusterName, Weight: 100}}, + ActionType: xdsresource.RouteActionRoute, + }}, + }, + }, + }, }, RouteConfig: &xdsresource.RouteConfigUpdate{ VirtualHosts: []*xdsresource.VirtualHost{ @@ -723,7 +806,7 @@ func (s) TestRouteResourceUpdate(t *testing.T) { Domains: []string{defaultTestServiceName}, Routes: []*xdsresource.Route{{ Prefix: newStringP("/"), - WeightedClusters: []xdsresource.WeightedCluster{{Name: defaultTestClusterName, Weight: 100}}, + WeightedClusters: []xdsresource.WeightedCluster{{Name: newClusterName, Weight: 100}}, ActionType: xdsresource.RouteActionRoute, }}, }, @@ -733,40 
+816,182 @@ func (s) TestRouteResourceUpdate(t *testing.T) { Domains: []string{defaultTestServiceName}, Routes: []*xdsresource.Route{{ Prefix: newStringP("/"), - WeightedClusters: []xdsresource.WeightedCluster{{Name: defaultTestClusterName, Weight: 100}}, - ActionType: xdsresource.RouteActionRoute, - }}, + WeightedClusters: []xdsresource.WeightedCluster{{Name: newClusterName, Weight: 100}}, + ActionType: xdsresource.RouteActionRoute}, + }, + }, + Clusters: map[string]*xdsresource.ClusterResult{ + newClusterName: { + Config: xdsresource.ClusterConfig{Cluster: &xdsresource.ClusterUpdate{ + ClusterType: xdsresource.ClusterTypeEDS, + ClusterName: newClusterName, + EDSServiceName: defaultTestEDSServiceName, + }, + EndpointConfig: &xdsresource.EndpointConfig{ + EDSUpdate: &xdsresource.EndpointsUpdate{ + Localities: []xdsresource.Locality{ + {ID: clients.Locality{ + Region: "region-1", + Zone: "zone-1", + SubZone: "subzone-1", + }, + Endpoints: []xdsresource.Endpoint{ + { + Addresses: []string{"localhost:8081"}, + HealthStatus: xdsresource.EndpointHealthStatusUnknown, + Weight: 1, + }, + }, + Weight: 1}, + }, + }, + }, + }, + }, }, } + if err := verifyXDSConfig(ctx, watcher.updateCh, watcher.errorCh, wantInlineXdsConfig); err != nil { + t.Fatal(err) + } + + // Change the route resource back to non-inline. + resources = e2e.DefaultClientResources(e2e.ResourceParams{ + NodeID: nodeID, + DialTarget: defaultTestServiceName, + Host: "localhost", + Port: 8080, + SecLevel: e2e.SecurityLevelNone, + }) + if err := mgmtServer.Update(ctx, resources); err != nil { + t.Fatal(err) + } if err := verifyXDSConfig(ctx, watcher.updateCh, watcher.errorCh, wantXdsConfig); err != nil { t.Fatal(err) } +} + +// Tests the case where the dependency manager receives a cluster resource error +// and verifies that Update is called with XDSConfig containing cluster error. 
+func (s) TestClusterResourceError(t *testing.T) { + nodeID, mgmtServer, xdsClient := setup(t, false) + + watcher := &testWatcher{ + updateCh: make(chan *xdsresource.XDSConfig), + errorCh: make(chan error), + } + + ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout) + defer cancel() + + resources := e2e.DefaultClientResources(e2e.ResourceParams{ + NodeID: nodeID, + DialTarget: defaultTestServiceName, + Host: "localhost", + Port: 8080, + SecLevel: e2e.SecurityLevelNone, + }) + resources.Clusters[0].LrsServer = &v3corepb.ConfigSource{ConfigSourceSpecifier: &v3corepb.ConfigSource_Ads{}} - // Update route to point to a new cluster. - newClusterName := "new-cluster-name" - route2 := e2e.DefaultRouteConfig(defaultTestRouteConfigName, defaultTestServiceName, newClusterName) - resources.Routes = []*v3routepb.RouteConfiguration{route2} if err := mgmtServer.Update(ctx, resources); err != nil { t.Fatal(err) } - // Wait for the second update and verify it has the new cluster. - wantXdsConfig.RouteConfig.VirtualHosts[0].Routes[0].WeightedClusters[0].Name = newClusterName - wantXdsConfig.VirtualHost.Routes[0].WeightedClusters[0].Name = newClusterName + dm := xdsdepmgr.New(defaultTestServiceName, defaultTestServiceName, xdsClient, watcher) + defer dm.Close() + + wantXdsConfig := makedefaultXDSConfig(resources.Routes[0].Name, resources.Clusters[0].Name, resources.Clusters[0].EdsClusterConfig.ServiceName, "localhost:8080") + wantXdsConfig.Clusters[resources.Clusters[0].Name] = &xdsresource.ClusterResult{Err: fmt.Errorf("[xDS node id: %v]: %v", nodeID, fmt.Errorf("unsupported config_source_specifier *corev3.ConfigSource_Ads in lrs_server field"))} + if err := verifyXDSConfig(ctx, watcher.updateCh, watcher.errorCh, wantXdsConfig); err != nil { t.Fatal(err) } + + // Drain the updates because management server keeps sending the error + // updates repeatedly causing the update from dependency manager to be + // blocked if we don't drain it. 
+ select { + case <-ctx.Done(): + case <-watcher.updateCh: + } } -// Tests the case where the route resource is first sent from the management -// server and the changed to be inline with the listener and then again changed -// to be received from the management server. It verifies that each time Update -// called with the correct XDSConfig. -func (s) TestRouteResourceChangeToInline(t *testing.T) { - nodeID := uuid.New().String() - mgmtServer, bc := setupManagementServerForTest(t, nodeID) - defer mgmtServer.Stop() - xdsClient := createXDSClient(t, bc) +// Tests the case where the dependency manager receives a cluster resource +// ambient error. We send a valid cluster resource first, then send an invalid +// one and then send the valid resource again. We send the valid resource again +// so that we can be sure the ambient error reaches the dependency manager since +// there is no other way to wait for it. +func (s) TestClusterAmbientError(t *testing.T) { + // Expect a warning log for the ambient error. 
+ grpctest.ExpectWarning("Cluster resource ambient error") + + nodeID, mgmtServer, xdsClient := setup(t, false) + + watcher := &testWatcher{ + updateCh: make(chan *xdsresource.XDSConfig), + errorCh: make(chan error), + } + + ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout) + defer cancel() + + resources := e2e.DefaultClientResources(e2e.ResourceParams{ + NodeID: nodeID, + DialTarget: defaultTestServiceName, + Host: "localhost", + Port: 8080, + SecLevel: e2e.SecurityLevelNone, + }) + + if err := mgmtServer.Update(ctx, resources); err != nil { + t.Fatal(err) + } + + dm := xdsdepmgr.New(defaultTestServiceName, defaultTestServiceName, xdsClient, watcher) + defer dm.Close() + + wantXdsConfig := makedefaultXDSConfig(resources.Routes[0].Name, resources.Clusters[0].Name, resources.Clusters[0].EdsClusterConfig.ServiceName, "localhost:8080") + if err := verifyXDSConfig(ctx, watcher.updateCh, watcher.errorCh, wantXdsConfig); err != nil { + t.Fatal(err) + } + + // Configure a cluster resource that is expected to be NACKed because its + // `LrsServer` field uses an unsupported config_source_specifier. Since a + // valid one is already cached, this should result in an ambient error. + resources.Clusters[0].LrsServer = &v3corepb.ConfigSource{ConfigSourceSpecifier: &v3corepb.ConfigSource_Ads{}} + if err := mgmtServer.Update(ctx, resources); err != nil { + t.Fatal(err) + } + + select { + case <-time.After(defaultTestShortTimeout): + case update := <-watcher.updateCh: + t.Fatalf("received unexpected update from dependency manager: %v", update) + } + + // Send valid resources again to guarantee we get the cluster ambient error + // before the test ends.
+ resources = e2e.DefaultClientResources(e2e.ResourceParams{ + NodeID: nodeID, + DialTarget: defaultTestServiceName, + Host: "localhost", + Port: 8080, + SecLevel: e2e.SecurityLevelNone, + }) + if err := mgmtServer.Update(ctx, resources); err != nil { + t.Fatal(err) + } + + if err := verifyXDSConfig(ctx, watcher.updateCh, watcher.errorCh, wantXdsConfig); err != nil { + t.Fatal(err) + } +} + +// Tests the case where a cluster is an aggregate cluster whose children are of +// type EDS and DNS. Verifies that Update is not called when one of the child +// resources is not configured and then verifies that Update is called with +// correct config when all resources are configured. +func (s) TestAggregateCluster(t *testing.T) { + nodeID, mgmtServer, xdsClient := setup(t, true) watcher := &testWatcher{ updateCh: make(chan *xdsresource.XDSConfig), @@ -775,14 +1000,17 @@ func (s) TestRouteResourceChangeToInline(t *testing.T) { ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout) defer cancel() - // Initial resources with defaultTestClusterName listener := e2e.DefaultClientListener(defaultTestServiceName, defaultTestRouteConfigName) route := e2e.DefaultRouteConfig(defaultTestRouteConfigName, defaultTestServiceName, defaultTestClusterName) + aggregateCluster := makeAggregateClusterResource(defaultTestClusterName, []string{"eds-cluster", "dns-cluster"}) + edsCluster := e2e.DefaultCluster("eds-cluster", defaultTestEDSServiceName, e2e.SecurityLevelNone) + endpoint := e2e.DefaultEndpoint(defaultTestEDSServiceName, "localhost", []uint32{8080}) resources := e2e.UpdateOptions{ - NodeID: nodeID, - Listeners: []*v3listenerpb.Listener{listener}, - Routes: []*v3routepb.RouteConfiguration{route}, - SkipValidation: true, + NodeID: nodeID, + Listeners: []*v3listenerpb.Listener{listener}, + Routes: []*v3routepb.RouteConfiguration{route}, + Clusters: []*v3clusterpb.Cluster{aggregateCluster, edsCluster}, + Endpoints: []*v3endpointpb.ClusterLoadAssignment{endpoint}, } 
if err := mgmtServer.Update(ctx, resources); err != nil { t.Fatal(err) @@ -791,10 +1019,26 @@ func (s) TestRouteResourceChangeToInline(t *testing.T) { dm := xdsdepmgr.New(defaultTestServiceName, defaultTestServiceName, xdsClient, watcher) defer dm.Close() - // Wait for the first update. + // Verify that no configuration is pushed to the child policy yet, because + // not all clusters making up the aggregate cluster have been resolved yet. + select { + case <-time.After(defaultTestShortTimeout): + case update := <-watcher.updateCh: + t.Fatalf("received unexpected update from dependency manager: %+v", update) + case err := <-watcher.errorCh: + t.Fatalf("received unexpected error from dependency manager: %v", err) + } + + // Now configure the LogicalDNS cluster in the management server. This + // should result in configuration being pushed down to the child policy. + resources.Clusters = append(resources.Clusters, makeLogicalDNSClusterResource("dns-cluster", "localhost", 8081)) + if err := mgmtServer.Update(ctx, resources); err != nil { + t.Fatal(err) + } + wantXdsConfig := &xdsresource.XDSConfig{ Listener: &xdsresource.ListenerUpdate{ - RouteConfigName: defaultTestRouteConfigName, + RouteConfigName: resources.Routes[0].Name, HTTPFilters: []xdsresource.HTTPFilter{{Name: "router"}}, }, RouteConfig: &xdsresource.RouteConfigUpdate{ @@ -803,12 +1047,137 @@ func (s) TestRouteResourceChangeToInline(t *testing.T) { Domains: []string{defaultTestServiceName}, Routes: []*xdsresource.Route{{ Prefix: newStringP("/"), - WeightedClusters: []xdsresource.WeightedCluster{{Name: defaultTestClusterName, Weight: 100}}, + WeightedClusters: []xdsresource.WeightedCluster{{Name: resources.Clusters[0].Name, Weight: 100}}, ActionType: xdsresource.RouteActionRoute, }}, }, }, }, + VirtualHost: &xdsresource.VirtualHost{ + Domains: []string{defaultTestServiceName}, + Routes: []*xdsresource.Route{{ + Prefix: newStringP("/"), + WeightedClusters: []xdsresource.WeightedCluster{{Name: 
resources.Clusters[0].Name, Weight: 100}}, + ActionType: xdsresource.RouteActionRoute}, + }, + }, + Clusters: map[string]*xdsresource.ClusterResult{ + resources.Clusters[0].Name: { + Config: xdsresource.ClusterConfig{Cluster: &xdsresource.ClusterUpdate{ + ClusterType: xdsresource.ClusterTypeAggregate, + ClusterName: resources.Clusters[0].Name, + PrioritizedClusterNames: []string{"eds-cluster", "dns-cluster"}, + }, + AggregateConfig: &xdsresource.AggregateConfig{LeafClusters: []string{"eds-cluster", "dns-cluster"}}, + }, + }, + "eds-cluster": { + Config: xdsresource.ClusterConfig{ + Cluster: &xdsresource.ClusterUpdate{ + ClusterType: xdsresource.ClusterTypeEDS, + ClusterName: "eds-cluster", + EDSServiceName: defaultTestEDSServiceName}, + EndpointConfig: &xdsresource.EndpointConfig{ + EDSUpdate: &xdsresource.EndpointsUpdate{ + Localities: []xdsresource.Locality{ + {ID: clients.Locality{ + Region: "region-1", + Zone: "zone-1", + SubZone: "subzone-1", + }, + Endpoints: []xdsresource.Endpoint{ + { + Addresses: []string{"localhost:8080"}, + HealthStatus: xdsresource.EndpointHealthStatusUnknown, + Weight: 1, + }, + }, + Weight: 1}, + }, + }, + }, + }}, + "dns-cluster": { + Config: xdsresource.ClusterConfig{ + Cluster: &xdsresource.ClusterUpdate{ + ClusterType: xdsresource.ClusterTypeLogicalDNS, + ClusterName: "dns-cluster", + DNSHostName: "localhost:8081", + }, + EndpointConfig: &xdsresource.EndpointConfig{ + DNSEndpoints: &xdsresource.DNSUpdate{ + Endpoints: []resolver.Endpoint{ + { + Addresses: []resolver.Address{ + {Addr: "[::1]:8081"}, + }, + }, + { + Addresses: []resolver.Address{ + {Addr: "127.0.0.1:8081"}, + }, + }, + }, + }, + }, + }, + }, + }, + } + + if err := verifyXDSConfig(ctx, watcher.updateCh, watcher.errorCh, wantXdsConfig); err != nil { + t.Fatal(err) + } + +} + +// Tests the case where an aggregate cluster has one child whose resource is +// configured with an error. Verifies that the error is correctly received in +// the XDSConfig. 
+func (s) TestAggregateClusterChildError(t *testing.T) { + nodeID, mgmtServer, xdsClient := setup(t, true) + + watcher := &testWatcher{ + updateCh: make(chan *xdsresource.XDSConfig), + errorCh: make(chan error), + } + ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout) + defer cancel() + + resources := e2e.UpdateOptions{ + NodeID: nodeID, + Listeners: []*v3listenerpb.Listener{e2e.DefaultClientListener(defaultTestServiceName, defaultTestRouteConfigName)}, + Routes: []*v3routepb.RouteConfiguration{e2e.DefaultRouteConfig(defaultTestRouteConfigName, defaultTestServiceName, defaultTestClusterName)}, + Clusters: []*v3clusterpb.Cluster{ + makeAggregateClusterResource(defaultTestClusterName, []string{"err-cluster", "good-cluster"}), + e2e.DefaultCluster("err-cluster", defaultTestEDSServiceName, e2e.SecurityLevelNone), + makeLogicalDNSClusterResource("good-cluster", "localhost", 8081), + }, + Endpoints: []*v3endpointpb.ClusterLoadAssignment{e2e.DefaultEndpoint(defaultTestEDSServiceName, "localhost", []uint32{8080})}, + } + resources.Endpoints[0].Endpoints[0].LbEndpoints[0].LoadBalancingWeight = &wrapperspb.UInt32Value{Value: 0} + if err := mgmtServer.Update(ctx, resources); err != nil { + t.Fatal(err) + } + + dm := xdsdepmgr.New(defaultTestServiceName, defaultTestServiceName, xdsClient, watcher) + defer dm.Close() + + wantXdsConfig := &xdsresource.XDSConfig{ + Listener: &xdsresource.ListenerUpdate{ + RouteConfigName: defaultTestRouteConfigName, + HTTPFilters: []xdsresource.HTTPFilter{{Name: "router"}}, + }, + RouteConfig: &xdsresource.RouteConfigUpdate{ + VirtualHosts: []*xdsresource.VirtualHost{{ + Domains: []string{defaultTestServiceName}, + Routes: []*xdsresource.Route{{ + Prefix: newStringP("/"), + WeightedClusters: []xdsresource.WeightedCluster{{Name: defaultTestClusterName, Weight: 100}}, + ActionType: xdsresource.RouteActionRoute, + }}, + }}, + }, VirtualHost: &xdsresource.VirtualHost{ Domains: []string{defaultTestServiceName}, Routes: 
[]*xdsresource.Route{{ @@ -817,63 +1186,259 @@ func (s) TestRouteResourceChangeToInline(t *testing.T) { ActionType: xdsresource.RouteActionRoute, }}, }, + Clusters: map[string]*xdsresource.ClusterResult{ + defaultTestClusterName: { + Config: xdsresource.ClusterConfig{ + Cluster: &xdsresource.ClusterUpdate{ + ClusterType: xdsresource.ClusterTypeAggregate, + ClusterName: defaultTestClusterName, + PrioritizedClusterNames: []string{"err-cluster", "good-cluster"}, + }, + AggregateConfig: &xdsresource.AggregateConfig{LeafClusters: []string{"err-cluster", "good-cluster"}}, + }, + }, + "err-cluster": { + Config: xdsresource.ClusterConfig{ + Cluster: &xdsresource.ClusterUpdate{ + ClusterType: xdsresource.ClusterTypeEDS, + ClusterName: "err-cluster", + EDSServiceName: defaultTestEDSServiceName, + }, + EndpointConfig: &xdsresource.EndpointConfig{ + ResolutionNote: fmt.Errorf("[xDS node id: %v]: %v", nodeID, fmt.Errorf("EDS response contains an endpoint with zero weight: endpoint:{address:{socket_address:{address:%q port_value:%v}}} load_balancing_weight:{}", "localhost", 8080)), + }, + }, + }, + "good-cluster": { + Config: xdsresource.ClusterConfig{ + Cluster: &xdsresource.ClusterUpdate{ + ClusterType: xdsresource.ClusterTypeLogicalDNS, + ClusterName: "good-cluster", + DNSHostName: "localhost:8081", + }, + EndpointConfig: &xdsresource.EndpointConfig{ + DNSEndpoints: &xdsresource.DNSUpdate{ + Endpoints: []resolver.Endpoint{ + {Addresses: []resolver.Address{{Addr: "[::1]:8081"}}}, + {Addresses: []resolver.Address{{Addr: "127.0.0.1:8081"}}}, + }, + }, + }, + }, + }, + }, } + if err := verifyXDSConfig(ctx, watcher.updateCh, watcher.errorCh, wantXdsConfig); err != nil { t.Fatal(err) } - // Update route to point to a new cluster. 
- newClusterName := "new-cluster-name" - hcm := testutils.MarshalAny(t, &v3httppb.HttpConnectionManager{ - RouteSpecifier: &v3httppb.HttpConnectionManager_RouteConfig{ - RouteConfig: e2e.DefaultRouteConfig(defaultTestRouteConfigName, defaultTestServiceName, newClusterName), + // Drain the updates because management server keeps sending the error + // updates repeatedly causing the update from dependency manager to be + // blocked if we don't drain it. + select { + case <-ctx.Done(): + case <-watcher.updateCh: + } +} + +// Tests the case where an aggregate cluster has no leaf clusters by creating a +// cyclic dependency where A->B and B->A. Verifies that an error with "no leaf +// clusters found" is received. +func (s) TestAggregateClusterNoLeafCluster(t *testing.T) { + nodeID, mgmtServer, xdsClient := setup(t, true) + + watcher := &testWatcher{ + updateCh: make(chan *xdsresource.XDSConfig), + errorCh: make(chan error), + } + ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout) + defer cancel() + + const ( + clusterNameA = defaultTestClusterName // cluster name in cds LB policy config + clusterNameB = defaultTestClusterName + "-B" + ) + // Create a cyclic dependency where A->B and B->A. 
+ resources := e2e.UpdateOptions{ + NodeID: nodeID, + Listeners: []*v3listenerpb.Listener{e2e.DefaultClientListener(defaultTestServiceName, defaultTestRouteConfigName)}, + Routes: []*v3routepb.RouteConfiguration{e2e.DefaultRouteConfig(defaultTestRouteConfigName, defaultTestServiceName, defaultTestClusterName)}, + Clusters: []*v3clusterpb.Cluster{ + makeAggregateClusterResource(clusterNameA, []string{clusterNameB}), + makeAggregateClusterResource(clusterNameB, []string{clusterNameA}), }, - HttpFilters: []*v3httppb.HttpFilter{e2e.HTTPFilter("router", &v3routerpb.Router{})}, // router fields are unused by grpc - }) - resources.Listeners[0].ApiListener.ApiListener = hcm - resources.Routes = nil + } if err := mgmtServer.Update(ctx, resources); err != nil { t.Fatal(err) } - // Wait for the second update and verify it has the new cluster. - wantXdsConfig.Listener.InlineRouteConfig = &xdsresource.RouteConfigUpdate{ - VirtualHosts: []*xdsresource.VirtualHost{ - { + dm := xdsdepmgr.New(defaultTestServiceName, defaultTestServiceName, xdsClient, watcher) + defer dm.Close() + + wantXdsConfig := &xdsresource.XDSConfig{ + Listener: &xdsresource.ListenerUpdate{ + RouteConfigName: defaultTestRouteConfigName, + HTTPFilters: []xdsresource.HTTPFilter{{Name: "router"}}, + }, + RouteConfig: &xdsresource.RouteConfigUpdate{ + VirtualHosts: []*xdsresource.VirtualHost{{ Domains: []string{defaultTestServiceName}, - Routes: []*xdsresource.Route{{Prefix: newStringP("/"), - WeightedClusters: []xdsresource.WeightedCluster{{Name: newClusterName, Weight: 100}}, + Routes: []*xdsresource.Route{{ + Prefix: newStringP("/"), + WeightedClusters: []xdsresource.WeightedCluster{{Name: defaultTestClusterName, Weight: 100}}, ActionType: xdsresource.RouteActionRoute, }}, + }}, + }, + VirtualHost: &xdsresource.VirtualHost{ + Domains: []string{defaultTestServiceName}, + Routes: []*xdsresource.Route{{ + Prefix: newStringP("/"), + WeightedClusters: []xdsresource.WeightedCluster{{Name: defaultTestClusterName, 
Weight: 100}}, + ActionType: xdsresource.RouteActionRoute, + }}, + }, + Clusters: map[string]*xdsresource.ClusterResult{ + defaultTestClusterName: { + Err: fmt.Errorf("[xDS node id: %v]: %v", nodeID, fmt.Errorf("no leaf clusters found for aggregate cluster")), + }, + clusterNameB: { + Err: fmt.Errorf("[xDS node id: %v]: %v", nodeID, fmt.Errorf("no leaf clusters found for aggregate cluster")), }, }, } - wantXdsConfig.Listener.RouteConfigName = "" - wantXdsConfig.RouteConfig.VirtualHosts[0].Routes[0].WeightedClusters[0].Name = newClusterName - wantXdsConfig.VirtualHost.Routes[0].WeightedClusters[0].Name = newClusterName + if err := verifyXDSConfig(ctx, watcher.updateCh, watcher.errorCh, wantXdsConfig); err != nil { t.Fatal(err) } +} - // Change the route resource back to non-inline. - listener = e2e.DefaultClientListener(defaultTestServiceName, defaultTestRouteConfigName) - route = e2e.DefaultRouteConfig(defaultTestRouteConfigName, defaultTestServiceName, defaultTestClusterName) - resources = e2e.UpdateOptions{ +// Tests the case where nested aggregate clusters exceed the max depth of 16. +// Verify that the error is correctly received in the XDSConfig in all the +// clusters. +func (s) TestAggregateClusterMaxDepth(t *testing.T) { + nodeID, mgmtServer, xdsClient := setup(t, true) + + watcher := &testWatcher{ + updateCh: make(chan *xdsresource.XDSConfig), + errorCh: make(chan error), + } + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Minute) + defer cancel() + + // Create a graph of aggregate clusters with 18 clusters. 
+ clusters := make([]*v3clusterpb.Cluster, 18) + for i := 0; i < 17; i++ { + clusters[i] = makeAggregateClusterResource(fmt.Sprintf("agg-%d", i), []string{fmt.Sprintf("agg-%d", i+1)}) + } + clusters[17] = makeLogicalDNSClusterResource("agg-17", "localhost", 8081) + + resources := e2e.UpdateOptions{ NodeID: nodeID, - Listeners: []*v3listenerpb.Listener{listener}, - Routes: []*v3routepb.RouteConfiguration{route}, + Listeners: []*v3listenerpb.Listener{e2e.DefaultClientListener(defaultTestServiceName, defaultTestRouteConfigName)}, + Routes: []*v3routepb.RouteConfiguration{e2e.DefaultRouteConfig(defaultTestRouteConfigName, defaultTestServiceName, "agg-0")}, + Clusters: clusters, SkipValidation: true, } if err := mgmtServer.Update(ctx, resources); err != nil { t.Fatal(err) } - // Wait for the third update and verify it has the original cluster. - wantXdsConfig.Listener.InlineRouteConfig = nil - wantXdsConfig.Listener.RouteConfigName = defaultTestRouteConfigName - wantXdsConfig.RouteConfig.VirtualHosts[0].Routes[0].WeightedClusters[0].Name = defaultTestClusterName - wantXdsConfig.VirtualHost.Routes[0].WeightedClusters[0].Name = defaultTestClusterName + dm := xdsdepmgr.New(defaultTestServiceName, defaultTestServiceName, xdsClient, watcher) + defer dm.Close() + + commonError := fmt.Errorf("[xDS node id: %v]: %v", nodeID, fmt.Errorf("aggregate cluster graph exceeds max depth (%d)", 16)) + + wantXdsConfig := &xdsresource.XDSConfig{ + Listener: &xdsresource.ListenerUpdate{ + RouteConfigName: defaultTestRouteConfigName, + HTTPFilters: []xdsresource.HTTPFilter{{Name: "router"}}, + }, + RouteConfig: &xdsresource.RouteConfigUpdate{ + VirtualHosts: []*xdsresource.VirtualHost{{ + Domains: []string{defaultTestServiceName}, + Routes: []*xdsresource.Route{{ + // The route should point to the first cluster in the chain: + // agg-0 + Prefix: newStringP("/"), + WeightedClusters: []xdsresource.WeightedCluster{{Name: "agg-0", Weight: 100}}, + ActionType: xdsresource.RouteActionRoute, + }}, 
+ }}, + }, + VirtualHost: &xdsresource.VirtualHost{ + Domains: []string{defaultTestServiceName}, + Routes: []*xdsresource.Route{{ + Prefix: newStringP("/"), + WeightedClusters: []xdsresource.WeightedCluster{{Name: "agg-0", Weight: 100}}, + ActionType: xdsresource.RouteActionRoute, + }}, + }, + Clusters: map[string]*xdsresource.ClusterResult{}, // Initialize the map + } + + // Populate the Clusters map with all clusters,except the last one, each + // having the common error + for i := 0; i < 17; i++ { + clusterName := fmt.Sprintf("agg-%d", i) + + // The ClusterResult only needs the Err field set to the common error + wantXdsConfig.Clusters[clusterName] = &xdsresource.ClusterResult{ + Err: commonError, + } + } + if err := verifyXDSConfig(ctx, watcher.updateCh, watcher.errorCh, wantXdsConfig); err != nil { t.Fatal(err) } } + +func (s) TestEndpointAmbientError(t *testing.T) { + nodeID, mgmtServer, xdsClient := setup(t, true) + + watcher := &testWatcher{ + updateCh: make(chan *xdsresource.XDSConfig), + errorCh: make(chan error), + } + ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout) + defer cancel() + + resources := e2e.DefaultClientResources(e2e.ResourceParams{ + NodeID: nodeID, + DialTarget: defaultTestServiceName, + Host: "localhost", + Port: 8080, + SecLevel: e2e.SecurityLevelNone, + }) + if err := mgmtServer.Update(ctx, resources); err != nil { + t.Fatal(err) + } + + dm := xdsdepmgr.New(defaultTestServiceName, defaultTestServiceName, xdsClient, watcher) + defer dm.Close() + wantXdsConfig := makedefaultXDSConfig(resources.Routes[0].Name, resources.Clusters[0].Name, resources.Endpoints[0].ClusterName, "localhost:8080") + + if err := verifyXDSConfig(ctx, watcher.updateCh, watcher.errorCh, wantXdsConfig); err != nil { + t.Fatal(err) + } + + // Send an ambient error for the endpoint resource by setting the weight to + // 0. 
+ resources.Endpoints[0].Endpoints[0].LbEndpoints[0].LoadBalancingWeight = &wrapperspb.UInt32Value{Value: 0} + if err := mgmtServer.Update(ctx, resources); err != nil { + t.Fatal(err) + } + wantXdsConfig.Clusters[resources.Clusters[0].Name].Config.EndpointConfig.ResolutionNote = fmt.Errorf("[xDS node id: %v]: %v", nodeID, fmt.Errorf("EDS response contains an endpoint with zero weight: endpoint:{address:{socket_address:{address:%q port_value:%v}}} load_balancing_weight:{}", "localhost", 8080)) + if err := verifyXDSConfig(ctx, watcher.updateCh, watcher.errorCh, wantXdsConfig); err != nil { + t.Fatal(err) + } + + // Drain the updates because management server keeps sending the error + // updates causing the update from dependency manager to be blocked if we + // don't drain it. + select { + case <-ctx.Done(): + case <-watcher.updateCh: + } +} From a2479e40dc343b3fad3a6f8f090ecbcc53afb7f4 Mon Sep 17 00:00:00 2001 From: eshitachandwani Date: Thu, 4 Dec 2025 16:10:28 +0530 Subject: [PATCH 02/25] change --- internal/xds/xdsdepmgr/watch_service.go | 2 +- internal/xds/xdsdepmgr/xds_dependency_manager.go | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/internal/xds/xdsdepmgr/watch_service.go b/internal/xds/xdsdepmgr/watch_service.go index 1cd9e26011ec..1a9b6fcf2acc 100644 --- a/internal/xds/xdsdepmgr/watch_service.go +++ b/internal/xds/xdsdepmgr/watch_service.go @@ -191,7 +191,7 @@ func (dr *dnsResolver) UpdateState(state resolver.State) error { } func (dr *dnsResolver) ReportError(err error) { - dr.serializer.TrySchedule(func(ctx context.Context) { + dr.serializer.TrySchedule(func(context.Context) { dr.depMgr.onDNSError(dr.target, err) }) } diff --git a/internal/xds/xdsdepmgr/xds_dependency_manager.go b/internal/xds/xdsdepmgr/xds_dependency_manager.go index c469e4ab8205..03e267a6834c 100644 --- a/internal/xds/xdsdepmgr/xds_dependency_manager.go +++ b/internal/xds/xdsdepmgr/xds_dependency_manager.go @@ -35,6 +35,9 @@ const ( ) var logger = 
grpclog.Component("xds") + +// EnableCDSEDSlookups is a flag used to control whether the CDS/EDS watchers in +// the dependency manager should be used. It is made false by defualt. var EnableCDSEDSlooksups = false func prefixLogger(p *DependencyManager) *internalgrpclog.PrefixLogger { From ae5251b37f5aaa4f98b300cda68f036cf097768b Mon Sep 17 00:00:00 2001 From: eshitachandwani Date: Thu, 4 Dec 2025 16:11:51 +0530 Subject: [PATCH 03/25] spelling --- internal/xds/xdsdepmgr/xds_dependency_manager.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/xds/xdsdepmgr/xds_dependency_manager.go b/internal/xds/xdsdepmgr/xds_dependency_manager.go index 03e267a6834c..03b4b014d79a 100644 --- a/internal/xds/xdsdepmgr/xds_dependency_manager.go +++ b/internal/xds/xdsdepmgr/xds_dependency_manager.go @@ -37,7 +37,7 @@ const ( var logger = grpclog.Component("xds") // EnableCDSEDSlookups is a flag used to control whether the CDS/EDS watchers in -// the dependency manager should be used. It is made false by defualt. +// the dependency manager should be used. It is made false by default. 
var EnableCDSEDSlooksups = false func prefixLogger(p *DependencyManager) *internalgrpclog.PrefixLogger { From b8a65073d9bffb22342e3b922773b28c8f844ee6 Mon Sep 17 00:00:00 2001 From: eshitachandwani Date: Thu, 4 Dec 2025 16:14:03 +0530 Subject: [PATCH 04/25] spelling --- internal/xds/xdsdepmgr/xds_dependency_manager.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/xds/xdsdepmgr/xds_dependency_manager.go b/internal/xds/xdsdepmgr/xds_dependency_manager.go index 03b4b014d79a..fcc4c6e98357 100644 --- a/internal/xds/xdsdepmgr/xds_dependency_manager.go +++ b/internal/xds/xdsdepmgr/xds_dependency_manager.go @@ -36,7 +36,7 @@ const ( var logger = grpclog.Component("xds") -// EnableCDSEDSlookups is a flag used to control whether the CDS/EDS watchers in +// EnableCDSEDSlooksups is a flag used to control whether the CDS/EDS watchers in // the dependency manager should be used. It is made false by default. var EnableCDSEDSlooksups = false From 4084ae597f802c92c3920bdda28569e18fbf205d Mon Sep 17 00:00:00 2001 From: eshitachandwani Date: Thu, 4 Dec 2025 16:27:09 +0530 Subject: [PATCH 05/25] all correct --- internal/xds/xdsdepmgr/xds_dependency_manager_test.go | 2 -- 1 file changed, 2 deletions(-) diff --git a/internal/xds/xdsdepmgr/xds_dependency_manager_test.go b/internal/xds/xdsdepmgr/xds_dependency_manager_test.go index eee17137a7b5..bfb3d4219bf0 100644 --- a/internal/xds/xdsdepmgr/xds_dependency_manager_test.go +++ b/internal/xds/xdsdepmgr/xds_dependency_manager_test.go @@ -75,8 +75,6 @@ func newStringP(s string) *string { return &s } -// depMgr.EnableCDSEDSlooksups=true - // testWatcher is an implementation of the ConfigWatcher interface that sends // the updates and errors received from the dependency manager to respective // channels, for the tests to verify. 
From 486284426b288dbbcee43332d7db4c1370a9546d Mon Sep 17 00:00:00 2001 From: eshitachandwani Date: Thu, 4 Dec 2025 16:41:30 +0530 Subject: [PATCH 06/25] order not checked --- .../xds/xdsdepmgr/xds_dependency_manager_test.go | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/internal/xds/xdsdepmgr/xds_dependency_manager_test.go b/internal/xds/xdsdepmgr/xds_dependency_manager_test.go index bfb3d4219bf0..62ab0786165e 100644 --- a/internal/xds/xdsdepmgr/xds_dependency_manager_test.go +++ b/internal/xds/xdsdepmgr/xds_dependency_manager_test.go @@ -111,6 +111,21 @@ func verifyError(ctx context.Context, errCh chan error, wantErr, wantNodeID stri return nil } +// This function determines the stable, canonical order for any two resolver.Endpoint structs. +func lessEndpoint(a, b resolver.Endpoint) bool { + // Safely access the first address string for comparison. + addrA := "" + if len(a.Addresses) > 0 { + addrA = a.Addresses[0].Addr + } + + addrB := "" + if len(b.Addresses) > 0 { + addrB = b.Addresses[0].Addr + } + + return addrA < addrB +} func verifyXDSConfig(ctx context.Context, xdsCh chan *xdsresource.XDSConfig, errCh chan error, want *xdsresource.XDSConfig) error { select { case <-ctx.Done(): @@ -137,6 +152,7 @@ func verifyXDSConfig(ctx context.Context, xdsCh chan *xdsresource.XDSConfig, err // return error as string. return strings.TrimSpace(s) }), + cmpopts.SortSlices(lessEndpoint), } if diff := cmp.Diff(update, want, cmpOpts...); diff != "" { return fmt.Errorf("received unexpected update from dependency manager. 
Diff (-got +want):\n%v", diff) From 15cc42eb7761b9561f27b9e0661ae3ceab84fff0 Mon Sep 17 00:00:00 2001 From: eshitachandwani Date: Tue, 9 Dec 2025 16:39:43 +0530 Subject: [PATCH 07/25] 1st set of comment --- internal/xds/xdsdepmgr/watch_service.go | 68 ++------ .../xds/xdsdepmgr/xds_dependency_manager.go | 162 +++++++++++------- .../xdsdepmgr/xds_dependency_manager_test.go | 2 +- 3 files changed, 119 insertions(+), 113 deletions(-) diff --git a/internal/xds/xdsdepmgr/watch_service.go b/internal/xds/xdsdepmgr/watch_service.go index 1a9b6fcf2acc..070e932743b6 100644 --- a/internal/xds/xdsdepmgr/watch_service.go +++ b/internal/xds/xdsdepmgr/watch_service.go @@ -92,71 +92,37 @@ func (r *routeConfigWatcher) stop() { } type clusterWatcher struct { - name string - depMgr *DependencyManager -} - -func (e *clusterWatcher) ResourceChanged(u *xdsresource.ClusterUpdate, onDone func()) { - e.depMgr.onClusterResourceUpdate(e.name, u, onDone) -} - -func (e *clusterWatcher) ResourceError(err error, onDone func()) { - e.depMgr.onClusterResourceError(e.name, err, onDone) -} - -func (e *clusterWatcher) AmbientError(err error, onDone func()) { - e.depMgr.onClusterAmbientError(e.name, err, onDone) -} - -// clusterWatcherState groups the state associated with a clusterWatcher. -type clusterWatcherState struct { - watcher *clusterWatcher // The underlying watcher. - cancelWatch func() // Cancel func to cancel the watch. - // Most recent update received for this cluster. 
- lastUpdate *xdsresource.ClusterUpdate - err error + resourceName string + depMgr *DependencyManager } -func newClusterWatcher(resourceName string, depMgr *DependencyManager) *clusterWatcherState { - w := &clusterWatcher{name: resourceName, depMgr: depMgr} - return &clusterWatcherState{ - watcher: w, - cancelWatch: xdsresource.WatchCluster(depMgr.xdsClient, resourceName, w), - } +func (c *clusterWatcher) ResourceChanged(u *xdsresource.ClusterUpdate, onDone func()) { + c.depMgr.onClusterResourceUpdate(c.resourceName, u, onDone) } -type endpointWatcher struct { - name string - depMgr *DependencyManager +func (c *clusterWatcher) ResourceError(err error, onDone func()) { + c.depMgr.onClusterResourceError(c.resourceName, err, onDone) } -func (e *endpointWatcher) ResourceChanged(u *xdsresource.EndpointsUpdate, onDone func()) { - e.depMgr.onEndpointUpdate(e.name, u, onDone) +func (c *clusterWatcher) AmbientError(err error, onDone func()) { + c.depMgr.onClusterAmbientError(c.resourceName, err, onDone) } -func (e *endpointWatcher) ResourceError(err error, onDone func()) { - e.depMgr.onEndpointResourceError(e.name, err, onDone) +type endpointsWatcher struct { + resourceName string + depMgr *DependencyManager } -func (e *endpointWatcher) AmbientError(err error, onDone func()) { - e.depMgr.onEndpointAmbientError(e.name, err, onDone) +func (e *endpointsWatcher) ResourceChanged(u *xdsresource.EndpointsUpdate, onDone func()) { + e.depMgr.onEndpointUpdate(e.resourceName, u, onDone) } -// endpointWatcherState groups the state associated with a endpointWatcher. -type endpointWatcherState struct { - watcher *endpointWatcher // The underlying watcher. - cancelWatch func() // Cancel func to cancel the watch. - // Most recent update received for this cluster. 
- lastUpdate *xdsresource.EndpointsUpdate - err error +func (e *endpointsWatcher) ResourceError(err error, onDone func()) { + e.depMgr.onEndpointResourceError(e.resourceName, err, onDone) } -func newEndpointWatcher(resourceName string, depMgr *DependencyManager) *endpointWatcherState { - w := &endpointWatcher{name: resourceName, depMgr: depMgr} - return &endpointWatcherState{ - watcher: w, - cancelWatch: xdsresource.WatchEndpoints(depMgr.xdsClient, resourceName, w), - } +func (e *endpointsWatcher) AmbientError(err error, onDone func()) { + e.depMgr.onEndpointAmbientError(e.resourceName, err, onDone) } // dnsResolverState watches updates for the given DNS hostname. It implements diff --git a/internal/xds/xdsdepmgr/xds_dependency_manager.go b/internal/xds/xdsdepmgr/xds_dependency_manager.go index fcc4c6e98357..4f0a92c5290a 100644 --- a/internal/xds/xdsdepmgr/xds_dependency_manager.go +++ b/internal/xds/xdsdepmgr/xds_dependency_manager.go @@ -36,9 +36,9 @@ const ( var logger = grpclog.Component("xds") -// EnableCDSEDSlooksups is a flag used to control whether the CDS/EDS watchers in -// the dependency manager should be used. It is made false by default. -var EnableCDSEDSlooksups = false +// EnableClusterAndEndpointsWatch is a flag used to control whether the CDS/EDS watchers in +// the dependency manager should be used. +var EnableClusterAndEndpointsWatch = false func prefixLogger(p *DependencyManager) *internalgrpclog.PrefixLogger { return internalgrpclog.NewPrefixLogger(logger, fmt.Sprintf(prefix, p)) @@ -78,19 +78,18 @@ type DependencyManager struct { nodeID string // All the fields below are protected by mu. 
- mu sync.Mutex - stopped bool - - listenerWatcher *listenerWatcher - currentListenerUpdate *xdsresource.ListenerUpdate - routeConfigWatcher *routeConfigWatcher - rdsResourceName string - currentRouteConfig *xdsresource.RouteConfigUpdate - currentVirtualHost *xdsresource.VirtualHost - clusterFromRouteConfig map[string]struct{} - endpointWatchers map[string]*endpointWatcherState - dnsResolvers map[string]*dnsResolverState - clusterWatchers map[string]*clusterWatcherState + mu sync.Mutex + stopped bool + listenerWatcher *listenerWatcher + currentListenerUpdate *xdsresource.ListenerUpdate + routeConfigWatcher *routeConfigWatcher + rdsResourceName string + currentRouteConfig *xdsresource.RouteConfigUpdate + currentVirtualHost *xdsresource.VirtualHost + clustersFromRouteConfig map[string]bool + clusterWatchers map[string]*clusterWatcherState + endpointWatchers map[string]*endpointWatcherState + dnsResolvers map[string]*dnsResolverState } // New creates a new DependencyManager. @@ -103,18 +102,16 @@ type DependencyManager struct { // - watcher is the ConfigWatcher interface that will receive the aggregated // XDSConfig updates and errors. 
func New(listenerName, dataplaneAuthority string, xdsClient xdsclient.XDSClient, watcher ConfigWatcher) *DependencyManager { - // ctx, cancel := context.WithCancel(context.Background()) - dm := &DependencyManager{ - ldsResourceName: listenerName, - dataplaneAuthority: dataplaneAuthority, - xdsClient: xdsClient, - watcher: watcher, - nodeID: xdsClient.BootstrapConfig().Node().GetId(), - clusterFromRouteConfig: make(map[string]struct{}), - endpointWatchers: make(map[string]*endpointWatcherState), - dnsResolvers: make(map[string]*dnsResolverState), - clusterWatchers: make(map[string]*clusterWatcherState), + ldsResourceName: listenerName, + dataplaneAuthority: dataplaneAuthority, + xdsClient: xdsClient, + watcher: watcher, + nodeID: xdsClient.BootstrapConfig().Node().GetId(), + clustersFromRouteConfig: make(map[string]bool), + endpointWatchers: make(map[string]*endpointWatcherState), + dnsResolvers: make(map[string]*dnsResolverState), + clusterWatchers: make(map[string]*clusterWatcherState), } dm.logger = prefixLogger(dm) @@ -141,17 +138,17 @@ func (m *DependencyManager) Close() { m.routeConfigWatcher.stop() } for name, cluster := range m.clusterWatchers { - go cluster.cancelWatch() + cluster.cancelWatch() delete(m.clusterWatchers, name) } for name, endpoint := range m.endpointWatchers { - go endpoint.cancelWatch() + endpoint.cancelWatch() delete(m.endpointWatchers, name) } for name, dnsResolver := range m.dnsResolvers { - go dnsResolver.cancelResolver() + dnsResolver.cancelResolver() delete(m.dnsResolvers, name) } } @@ -169,13 +166,13 @@ func (m *DependencyManager) maybeSendUpdateLocked() { if m.currentListenerUpdate == nil || m.currentRouteConfig == nil { return } - config := (&xdsresource.XDSConfig{ + config := &xdsresource.XDSConfig{ Listener: m.currentListenerUpdate, RouteConfig: m.currentRouteConfig, VirtualHost: m.currentVirtualHost, Clusters: make(map[string]*xdsresource.ClusterResult), - }) - if !EnableCDSEDSlooksups { + } + if !EnableClusterAndEndpointsWatch 
{ m.watcher.Update(config) return } @@ -183,7 +180,7 @@ func (m *DependencyManager) maybeSendUpdateLocked() { dnsResourcesSeen := make(map[string]struct{}) clusterResourceSeen := make(map[string]struct{}) haveAllResources := true - for cluster := range m.clusterFromRouteConfig { + for cluster := range m.clustersFromRouteConfig { gotAllResource, _, err := m.populateClusterConfigLocked(cluster, 0, config.Clusters, edsResourcesSeen, dnsResourcesSeen, clusterResourceSeen) if !gotAllResource { haveAllResources = false @@ -245,14 +242,14 @@ func (m *DependencyManager) populateClusterConfigLocked(name string, depth int, state := m.clusterWatchers[name] // If update is not yet received for this cluster, return. - if state.lastUpdate == nil && state.err == nil { + if state.lastUpdate == nil && state.lastErr == nil { return false, nil, nil } // If resource error is seen for this cluster, save it in the cluster map // and return. - if state.lastUpdate == nil && state.err != nil { - clusterMap[name] = &xdsresource.ClusterResult{Err: state.err} + if state.lastUpdate == nil && state.lastErr != nil { + clusterMap[name] = &xdsresource.ClusterResult{Err: state.lastErr} return true, nil, nil } @@ -280,14 +277,14 @@ func (m *DependencyManager) populateClusterConfigLocked(name string, depth int, endpointState := m.endpointWatchers[edsName] // If the resource does not have any update yet, return. - if endpointState.lastUpdate == nil && endpointState.err == nil { + if endpointState.lastUpdate == nil && endpointState.lastErr == nil { return false, nil, nil } // Store the update and error. 
clusterMap[name].Config.EndpointConfig = &xdsresource.EndpointConfig{ EDSUpdate: endpointState.lastUpdate, - ResolutionNote: endpointState.err, + ResolutionNote: endpointState.lastErr, } return true, []string{name}, nil @@ -351,26 +348,27 @@ func (m *DependencyManager) applyRouteConfigUpdateLocked(update *xdsresource.Rou } m.currentRouteConfig = update m.currentVirtualHost = matchVH - // Get the clusters to be watched from the routes in the virtual host. If - // the CLusterSpecifierField is set, we ignore it for now as the clusters - // will be determined dynamically for it. - newClusters := make(map[string]struct{}) - - for _, rt := range matchVH.Routes { - for _, cluster := range rt.WeightedClusters { - newClusters[cluster.Name] = struct{}{} + + if EnableClusterAndEndpointsWatch { + // Get the clusters to be watched from the routes in the virtual host. If + // the CLusterSpecifierField is set, we ignore it for now as the clusters + // will be determined dynamically for it. + newClusters := make(map[string]bool) + + for _, rt := range matchVH.Routes { + for _, cluster := range rt.WeightedClusters { + newClusters[cluster.Name] = true + } } - } - if EnableCDSEDSlooksups { // Cancel watch for clusters not seen in route config - for name := range m.clusterFromRouteConfig { + for name := range m.clustersFromRouteConfig { if _, ok := newClusters[name]; !ok { m.clusterWatchers[name].cancelWatch() delete(m.clusterWatchers, name) } } - m.clusterFromRouteConfig = newClusters + m.clustersFromRouteConfig = newClusters } m.maybeSendUpdateLocked() } @@ -502,6 +500,20 @@ func (m *DependencyManager) onRouteConfigResourceAmbientError(resourceName strin m.logger.Warningf("Route resource ambient error %q: %v", resourceName, m.annotateErrorWithNodeID(err)) } +type clusterWatcherState struct { + cancelWatch func() + lastUpdate *xdsresource.ClusterUpdate + lastErr error +} + +func newClusterWatcher(resourceName string, depMgr *DependencyManager) *clusterWatcherState { + w := 
&clusterWatcher{resourceName: resourceName, depMgr: depMgr} + return &clusterWatcherState{ + cancelWatch: xdsresource.WatchCluster(depMgr.xdsClient, resourceName, w), + } +} + +// Records a successful Cluster resource update, clears any previous error. func (m *DependencyManager) onClusterResourceUpdate(resourceName string, update *xdsresource.ClusterUpdate, onDone func()) { m.mu.Lock() defer m.mu.Unlock() @@ -516,11 +528,13 @@ func (m *DependencyManager) onClusterResourceUpdate(resourceName string, update } state := m.clusterWatchers[resourceName] state.lastUpdate = update - state.err = nil + state.lastErr = nil m.maybeSendUpdateLocked() } +// Records a resource error for a Cluster resource, clears the last successful +// update since we want to stop using the resource if we get a resource error. func (m *DependencyManager) onClusterResourceError(resourceName string, err error, onDone func()) { m.mu.Lock() defer m.mu.Unlock() @@ -531,25 +545,39 @@ func (m *DependencyManager) onClusterResourceError(resourceName string, err erro } m.logger.Warningf("Received resource error for Cluster resource %q: %v", resourceName, m.annotateErrorWithNodeID(err)) state := m.clusterWatchers[resourceName] - state.err = err - // We want to stop using this cluster as we received a resource error so - // remove the last update. + state.lastErr = err state.lastUpdate = nil m.maybeSendUpdateLocked() } +// Records the error in the state. The last successful update is retained +// because it should continue to be used as an amnbient error is received. 
func (m *DependencyManager) onClusterAmbientError(resourceName string, err error, onDone func()) { m.mu.Lock() defer m.mu.Unlock() defer onDone() - // check resource name matches if m.stopped || m.clusterWatchers[resourceName] == nil { return } m.logger.Warningf("Cluster resource ambient error %q: %v", resourceName, m.annotateErrorWithNodeID(err)) } +type endpointWatcherState struct { + cancelWatch func() + lastUpdate *xdsresource.EndpointsUpdate + lastErr error +} + +func newEndpointWatcher(resourceName string, depMgr *DependencyManager) *endpointWatcherState { + w := &endpointsWatcher{resourceName: resourceName, depMgr: depMgr} + return &endpointWatcherState{ + cancelWatch: xdsresource.WatchEndpoints(depMgr.xdsClient, resourceName, w), + } +} + +// Records a successful Endpoint resource update, clears any previous error from +// the state. func (m *DependencyManager) onEndpointUpdate(resourceName string, update *xdsresource.EndpointsUpdate, onDone func()) { m.mu.Lock() defer m.mu.Unlock() @@ -564,10 +592,12 @@ func (m *DependencyManager) onEndpointUpdate(resourceName string, update *xdsres } state := m.endpointWatchers[resourceName] state.lastUpdate = update - state.err = nil + state.lastErr = nil m.maybeSendUpdateLocked() } +// Records a resource error and clears the last successful update since the +// endpoints should not be used after getting a resource error. func (m *DependencyManager) onEndpointResourceError(resourceName string, err error, onDone func()) { m.mu.Lock() defer m.mu.Unlock() @@ -578,13 +608,13 @@ func (m *DependencyManager) onEndpointResourceError(resourceName string, err err } m.logger.Warningf("Received resource error for Endpoint resource %q: %v", resourceName, m.annotateErrorWithNodeID(err)) state := m.endpointWatchers[resourceName] - state.err = err - // We want to stop using the endpoints for this cluster as we received a - // resource error so remove the last update. 
+ state.lastErr = err state.lastUpdate = nil m.maybeSendUpdateLocked() } +// Records the ambient error without clearing the last successful update, as the +// endpoints should continue to be used. func (m *DependencyManager) onEndpointAmbientError(resourceName string, err error, onDone func()) { m.mu.Lock() defer m.mu.Unlock() @@ -596,10 +626,13 @@ func (m *DependencyManager) onEndpointAmbientError(resourceName string, err erro m.logger.Warningf("Endpoint resource ambient error %q: %v", resourceName, m.annotateErrorWithNodeID(err)) state := m.endpointWatchers[resourceName] - state.err = err + state.lastErr = err m.maybeSendUpdateLocked() } +// Converts the DNS resolver state to an internal update, handling address-only +// updates by wrapping them into endpoints. It records the update and clears any +// previous error. func (m *DependencyManager) onDNSUpdate(resourceName string, update *resolver.State) { m.mu.Lock() defer m.mu.Unlock() @@ -626,6 +659,13 @@ func (m *DependencyManager) onDNSUpdate(resourceName string, update *resolver.St m.maybeSendUpdateLocked() } +// Records a DNS resolver error. It clears the last update only if no successful +// update has been received yet, then triggers a dependency update. +// +// If a previous good update was received, the error is recorded but the +// previous update is retained for continued use. Errors are suppressed if a +// resource error was already received, as further propagation would have no +// downstream effect. 
func (m *DependencyManager) onDNSError(resourceName string, err error) { m.mu.Lock() defer m.mu.Unlock() diff --git a/internal/xds/xdsdepmgr/xds_dependency_manager_test.go b/internal/xds/xdsdepmgr/xds_dependency_manager_test.go index 62ab0786165e..52a4f04a2c86 100644 --- a/internal/xds/xdsdepmgr/xds_dependency_manager_test.go +++ b/internal/xds/xdsdepmgr/xds_dependency_manager_test.go @@ -57,7 +57,7 @@ type s struct { } func Test(t *testing.T) { - xdsdepmgr.EnableCDSEDSlooksups = true + xdsdepmgr.EnableClusterAndEndpointsWatch = true grpctest.RunSubTests(t, s{}) } From ed3c1eb24d3e4a02f9a4e31d18762898e8806b91 Mon Sep 17 00:00:00 2001 From: eshitachandwani Date: Tue, 9 Dec 2025 16:43:57 +0530 Subject: [PATCH 08/25] change set to use bool in place of struct --- internal/xds/xdsdepmgr/xds_dependency_manager.go | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/internal/xds/xdsdepmgr/xds_dependency_manager.go b/internal/xds/xdsdepmgr/xds_dependency_manager.go index 4f0a92c5290a..df0b6a27fd54 100644 --- a/internal/xds/xdsdepmgr/xds_dependency_manager.go +++ b/internal/xds/xdsdepmgr/xds_dependency_manager.go @@ -176,9 +176,9 @@ func (m *DependencyManager) maybeSendUpdateLocked() { m.watcher.Update(config) return } - edsResourcesSeen := make(map[string]struct{}) - dnsResourcesSeen := make(map[string]struct{}) - clusterResourceSeen := make(map[string]struct{}) + edsResourcesSeen := make(map[string]bool) + dnsResourcesSeen := make(map[string]bool) + clusterResourceSeen := make(map[string]bool) haveAllResources := true for cluster := range m.clustersFromRouteConfig { gotAllResource, _, err := m.populateClusterConfigLocked(cluster, 0, config.Clusters, edsResourcesSeen, dnsResourcesSeen, clusterResourceSeen) @@ -220,8 +220,8 @@ func (m *DependencyManager) maybeSendUpdateLocked() { } } -func (m *DependencyManager) populateClusterConfigLocked(name string, depth int, clusterMap map[string]*xdsresource.ClusterResult, edsSeen, dnsSeen 
map[string]struct{}, clusterSeen map[string]struct{}) (bool, []string, error) { - clusterSeen[name] = struct{}{} +func (m *DependencyManager) populateClusterConfigLocked(name string, depth int, clusterMap map[string]*xdsresource.ClusterResult, edsSeen, dnsSeen, clusterSeen map[string]bool) (bool, []string, error) { + clusterSeen[name] = true if depth > aggregateClusterMaxDepth { m.logger.Warningf("aggregate cluster graph exceeds max depth (%d)", aggregateClusterMaxDepth) @@ -268,7 +268,7 @@ func (m *DependencyManager) populateClusterConfigLocked(name string, depth int, } else { edsName = update.EDSServiceName } - edsSeen[edsName] = struct{}{} + edsSeen[edsName] = true // If endpoint watcher does not exist, create one. if _, ok := m.endpointWatchers[edsName]; !ok { m.endpointWatchers[edsName] = newEndpointWatcher(edsName, m) @@ -290,7 +290,7 @@ func (m *DependencyManager) populateClusterConfigLocked(name string, depth int, case xdsresource.ClusterTypeLogicalDNS: target := update.DNSHostName - dnsSeen[target] = struct{}{} + dnsSeen[target] = true // If dns resolver does not exist, create one. 
if _, ok := m.dnsResolvers[target]; !ok { m.dnsResolvers[target] = newDNSResolver(target, m) From 4b8607410548ac722ab86968fd27df845813ab0f Mon Sep 17 00:00:00 2001 From: eshitachandwani Date: Tue, 9 Dec 2025 23:42:25 +0530 Subject: [PATCH 09/25] comment lengths --- internal/xds/xdsdepmgr/watch_service.go | 6 +++--- internal/xds/xdsdepmgr/xds_dependency_manager.go | 16 ++++++++-------- .../xds/xdsdepmgr/xds_dependency_manager_test.go | 3 ++- 3 files changed, 13 insertions(+), 12 deletions(-) diff --git a/internal/xds/xdsdepmgr/watch_service.go b/internal/xds/xdsdepmgr/watch_service.go index 070e932743b6..20e473c59120 100644 --- a/internal/xds/xdsdepmgr/watch_service.go +++ b/internal/xds/xdsdepmgr/watch_service.go @@ -132,9 +132,9 @@ type dnsResolver struct { dnsR resolver.Resolver depMgr *DependencyManager - //serializer is used to make sure that any methods on the resolver can be - //called from inside th Build function which is a garuntee that - //implementations of resolver.Clientconn need to maintain. + // serializer is used to make sure that any methods on the resolver can be + // called from inside th Build function which is a guarantee that + // implementations of resolver.Clientconn need to maintain. serializer grpcsync.CallbackSerializer serializerCancel func() } diff --git a/internal/xds/xdsdepmgr/xds_dependency_manager.go b/internal/xds/xdsdepmgr/xds_dependency_manager.go index df0b6a27fd54..ebad32c15da7 100644 --- a/internal/xds/xdsdepmgr/xds_dependency_manager.go +++ b/internal/xds/xdsdepmgr/xds_dependency_manager.go @@ -36,8 +36,8 @@ const ( var logger = grpclog.Component("xds") -// EnableClusterAndEndpointsWatch is a flag used to control whether the CDS/EDS watchers in -// the dependency manager should be used. +// EnableClusterAndEndpointsWatch is a flag used to control whether the CDS/EDS +// watchers in the dependency manager should be used. 
var EnableClusterAndEndpointsWatch = false func prefixLogger(p *DependencyManager) *internalgrpclog.PrefixLogger { @@ -159,9 +159,9 @@ func (m *DependencyManager) annotateErrorWithNodeID(err error) error { return fmt.Errorf("[xDS node id: %v]: %v", m.nodeID, err) } -// maybeSendUpdateLocked checks that all the resources have been received and sends -// the current aggregated xDS configuration to the watcher if all the updates -// are available. +// maybeSendUpdateLocked checks that all the resources have been received and +// sends the current aggregated xDS configuration to the watcher if all the +// updates are available. func (m *DependencyManager) maybeSendUpdateLocked() { if m.currentListenerUpdate == nil || m.currentRouteConfig == nil { return @@ -350,9 +350,9 @@ func (m *DependencyManager) applyRouteConfigUpdateLocked(update *xdsresource.Rou m.currentVirtualHost = matchVH if EnableClusterAndEndpointsWatch { - // Get the clusters to be watched from the routes in the virtual host. If - // the CLusterSpecifierField is set, we ignore it for now as the clusters - // will be determined dynamically for it. + // Get the clusters to be watched from the routes in the virtual host. + // If the CLusterSpecifierField is set, we ignore it for now as the + // clusters will be determined dynamically for it. newClusters := make(map[string]bool) for _, rt := range matchVH.Routes { diff --git a/internal/xds/xdsdepmgr/xds_dependency_manager_test.go b/internal/xds/xdsdepmgr/xds_dependency_manager_test.go index 52a4f04a2c86..48835b279a6f 100644 --- a/internal/xds/xdsdepmgr/xds_dependency_manager_test.go +++ b/internal/xds/xdsdepmgr/xds_dependency_manager_test.go @@ -111,7 +111,8 @@ func verifyError(ctx context.Context, errCh chan error, wantErr, wantNodeID stri return nil } -// This function determines the stable, canonical order for any two resolver.Endpoint structs. +// This function determines the stable, canonical order for any two +// resolver.Endpoint structs. 
func lessEndpoint(a, b resolver.Endpoint) bool { // Safely access the first address string for comparison. addrA := "" From e369eeba6ad8468e2dfaec2fad9cdcab54b5ae71 Mon Sep 17 00:00:00 2001 From: eshitachandwani Date: Fri, 12 Dec 2025 09:18:34 +0530 Subject: [PATCH 10/25] resolve comments --- internal/xds/resolver/xds_resolver.go | 6 +- .../xds/xdsdepmgr/xds_dependency_manager.go | 104 ++++++++++++------ 2 files changed, 74 insertions(+), 36 deletions(-) diff --git a/internal/xds/resolver/xds_resolver.go b/internal/xds/resolver/xds_resolver.go index d877d42700ac..64ce7c09132d 100644 --- a/internal/xds/resolver/xds_resolver.go +++ b/internal/xds/resolver/xds_resolver.go @@ -231,7 +231,11 @@ type xdsResolver struct { } // ResolveNow is a no-op at this point. -func (*xdsResolver) ResolveNow(resolver.ResolveNowOptions) {} +func (r *xdsResolver) ResolveNow(opts resolver.ResolveNowOptions) { + if r.dm != nil { + r.dm.ResolveNow(opts) + } +} func (r *xdsResolver) Close() { // Cancel the context passed to the serializer and wait for any scheduled diff --git a/internal/xds/xdsdepmgr/xds_dependency_manager.go b/internal/xds/xdsdepmgr/xds_dependency_manager.go index ebad32c15da7..28dc0fad3f9b 100644 --- a/internal/xds/xdsdepmgr/xds_dependency_manager.go +++ b/internal/xds/xdsdepmgr/xds_dependency_manager.go @@ -220,8 +220,37 @@ func (m *DependencyManager) maybeSendUpdateLocked() { } } -func (m *DependencyManager) populateClusterConfigLocked(name string, depth int, clusterMap map[string]*xdsresource.ClusterResult, edsSeen, dnsSeen, clusterSeen map[string]bool) (bool, []string, error) { - clusterSeen[name] = true +// populateClusterConfigLocked recursively resolves and populates the +// configuration for the given cluster and its children, including its +// associated endpoint or aggregate children. 
+//
+// This function traverses the cluster dependency graph (e.g., from an Aggregate
+// cluster down to its leaf clusters and their endpoints/DNS resources) to
+// ensure all necessary xDS resources are watched and fully resolved before
+// configuration is considered ready.
+//
+// Parameters:
+//
+//	clusterName: The name of the cluster resource to resolve.
+//	depth: The current recursion depth.
+//	clusterConfigs: Map to store the resolved cluster configuration.
+//	endpointResourceSeen: Stores which EDS resource names have been encountered.
+//	dnsResourceSeen: Stores which DNS resource names have been encountered.
+//	clustersSeen: Stores which cluster resource names have been encountered.
+//
+// Returns:
+//
+//	bool: Whether the cluster configuration (and all its
+//	      dependencies) is fully resolved (i.e., either an update or
+//	      an error has been received).
+//	[]string: A slice of all "leaf" cluster names discovered in the
+//	      traversal starting from `clusterName`. For
+//	      non-aggregate clusters, this will contain only `clusterName`.
+//	error: Error that needs to be propagated up the tree (like
+//	      max depth exceeded or an error propagated from a
+//	      child cluster).
+func (m *DependencyManager) populateClusterConfigLocked(clusterName string, depth int, clusterConfigs map[string]*xdsresource.ClusterResult, endpointResourceSeen, dnsResourceSeen, clustersSeen map[string]bool) (bool, []string, error) {
+	clustersSeen[clusterName] = true
 
 	if depth > aggregateClusterMaxDepth {
 		m.logger.Warningf("aggregate cluster graph exceeds max depth (%d)", aggregateClusterMaxDepth)
@@ -229,46 +258,45 @@ func (m *DependencyManager) populateClusterConfigLocked(name string, depth int,
 	}
 
 	// If cluster is already seen in the tree, return.
-	if _, ok := clusterMap[name]; ok {
+	if _, ok := clusterConfigs[clusterName]; ok {
 		return true, nil, nil
 	}
 
 	// If cluster watcher does not exist, create one.
- if _, ok := m.clusterWatchers[name]; !ok { - m.clusterWatchers[name] = newClusterWatcher(name, m) + state, ok := m.clusterWatchers[clusterName] + if !ok { + m.clusterWatchers[clusterName] = newClusterWatcher(clusterName, m) return false, nil, nil } - state := m.clusterWatchers[name] - - // If update is not yet received for this cluster, return. - if state.lastUpdate == nil && state.lastErr == nil { + switch { + case state.lastUpdate == nil && state.lastErr == nil: + // Case 1: Watcher exists but no data or error has arrived yet. return false, nil, nil - } - // If resource error is seen for this cluster, save it in the cluster map - // and return. - if state.lastUpdate == nil && state.lastErr != nil { - clusterMap[name] = &xdsresource.ClusterResult{Err: state.lastErr} + case state.lastUpdate == nil: + // Case 2: No update received, but a terminal resource error occurred. + // (state.lastErr is implicitly non-nil here). + clusterConfigs[clusterName] = &xdsresource.ClusterResult{Err: state.lastErr} return true, nil, nil - } - update := state.lastUpdate - clusterMap[name] = &xdsresource.ClusterResult{ - Config: xdsresource.ClusterConfig{ - Cluster: update, - }, + default: + // Case 3: A valid update was received. Store the config and continue resolution. + clusterConfigs[clusterName] = &xdsresource.ClusterResult{ + Config: xdsresource.ClusterConfig{ + Cluster: state.lastUpdate, + }, + } } + update := state.lastUpdate switch update.ClusterType { case xdsresource.ClusterTypeEDS: - var edsName string - if update.EDSServiceName == "" { - edsName = name - } else { + edsName := clusterName + if update.EDSServiceName != "" { edsName = update.EDSServiceName } - edsSeen[edsName] = true + endpointResourceSeen[edsName] = true // If endpoint watcher does not exist, create one. 
if _, ok := m.endpointWatchers[edsName]; !ok { m.endpointWatchers[edsName] = newEndpointWatcher(edsName, m) @@ -282,15 +310,15 @@ func (m *DependencyManager) populateClusterConfigLocked(name string, depth int, } // Store the update and error. - clusterMap[name].Config.EndpointConfig = &xdsresource.EndpointConfig{ + clusterConfigs[clusterName].Config.EndpointConfig = &xdsresource.EndpointConfig{ EDSUpdate: endpointState.lastUpdate, ResolutionNote: endpointState.lastErr, } - return true, []string{name}, nil + return true, []string{clusterName}, nil case xdsresource.ClusterTypeLogicalDNS: target := update.DNSHostName - dnsSeen[target] = true + dnsResourceSeen[target] = true // If dns resolver does not exist, create one. if _, ok := m.dnsResolvers[target]; !ok { m.dnsResolvers[target] = newDNSResolver(target, m) @@ -302,22 +330,22 @@ func (m *DependencyManager) populateClusterConfigLocked(name string, depth int, return false, nil, nil } - clusterMap[name].Config.EndpointConfig = &xdsresource.EndpointConfig{ + clusterConfigs[clusterName].Config.EndpointConfig = &xdsresource.EndpointConfig{ DNSEndpoints: dnsState.lastUpdate, ResolutionNote: dnsState.err, } - return true, []string{name}, nil + return true, []string{clusterName}, nil case xdsresource.ClusterTypeAggregate: var leafClusters []string haveAllResources := true for _, child := range update.PrioritizedClusterNames { - gotAll, childLeafCluster, err := m.populateClusterConfigLocked(child, depth+1, clusterMap, edsSeen, dnsSeen, clusterSeen) + gotAll, childLeafCluster, err := m.populateClusterConfigLocked(child, depth+1, clusterConfigs, endpointResourceSeen, dnsResourceSeen, clustersSeen) if !gotAll { haveAllResources = false } if err != nil { - clusterMap[name] = &xdsresource.ClusterResult{ + clusterConfigs[clusterName] = &xdsresource.ClusterResult{ Err: err, } return true, leafClusters, err @@ -328,15 +356,15 @@ func (m *DependencyManager) populateClusterConfigLocked(name string, depth int, return false, 
leafClusters, nil } if haveAllResources && len(leafClusters) == 0 { - clusterMap[name] = &xdsresource.ClusterResult{Err: m.annotateErrorWithNodeID(fmt.Errorf("no leaf clusters found for aggregate cluster"))} + clusterConfigs[clusterName] = &xdsresource.ClusterResult{Err: m.annotateErrorWithNodeID(fmt.Errorf("no leaf clusters found for aggregate cluster"))} return true, leafClusters, m.annotateErrorWithNodeID(fmt.Errorf("no leaf clusters found for aggregate cluster")) } - clusterMap[name].Config.AggregateConfig = &xdsresource.AggregateConfig{ + clusterConfigs[clusterName].Config.AggregateConfig = &xdsresource.AggregateConfig{ LeafClusters: leafClusters, } return true, leafClusters, nil } - clusterMap[name] = &xdsresource.ClusterResult{Err: m.annotateErrorWithNodeID(fmt.Errorf("no cluster data yet for cluster %s", name))} + clusterConfigs[clusterName] = &xdsresource.ClusterResult{Err: m.annotateErrorWithNodeID(fmt.Errorf("no cluster data yet for cluster %s", clusterName))} return false, nil, nil } @@ -688,3 +716,9 @@ func (m *DependencyManager) onDNSError(resourceName string, err error) { m.logger.Warningf("DNS resolver error %q: %v", resourceName, m.annotateErrorWithNodeID(err)) m.maybeSendUpdateLocked() } + +func (m *DependencyManager) ResolveNow(opt resolver.ResolveNowOptions) { + for _, res := range m.dnsResolvers { + res.resolver.dnsR.ResolveNow(opt) + } +} From a98e575ddf23f214711079682afb28990dcbae12 Mon Sep 17 00:00:00 2001 From: eshitachandwani Date: Fri, 12 Dec 2025 09:21:26 +0530 Subject: [PATCH 11/25] resolvenow coment --- internal/xds/xdsdepmgr/xds_dependency_manager.go | 1 + 1 file changed, 1 insertion(+) diff --git a/internal/xds/xdsdepmgr/xds_dependency_manager.go b/internal/xds/xdsdepmgr/xds_dependency_manager.go index 28dc0fad3f9b..f58f0429f3fa 100644 --- a/internal/xds/xdsdepmgr/xds_dependency_manager.go +++ b/internal/xds/xdsdepmgr/xds_dependency_manager.go @@ -717,6 +717,7 @@ func (m *DependencyManager) onDNSError(resourceName string, err error) 
{
 	m.maybeSendUpdateLocked()
 }
 
+// ResolveNow calls ResolveNow on all the DNS resolvers.
 func (m *DependencyManager) ResolveNow(opt resolver.ResolveNowOptions) {
 	for _, res := range m.dnsResolvers {
 		res.resolver.dnsR.ResolveNow(opt)
 	}
 }

From f04a355ddf6d624b702729eed53894b4e501b054 Mon Sep 17 00:00:00 2001
From: eshitachandwani
Date: Fri, 12 Dec 2025 20:09:35 +0530
Subject: [PATCH 12/25] report error for resolver error

---
 internal/xds/xdsdepmgr/watch_service.go | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/internal/xds/xdsdepmgr/watch_service.go b/internal/xds/xdsdepmgr/watch_service.go
index 20e473c59120..55671c86e444 100644
--- a/internal/xds/xdsdepmgr/watch_service.go
+++ b/internal/xds/xdsdepmgr/watch_service.go
@@ -177,15 +177,13 @@ func newDNSResolver(target string, depMgr *DependencyManager) *dnsResolverState
 	drState := &dnsResolverState{resolver: dr, cancelResolver: func() {}}
 	u, err := url.Parse("dns:///" + target)
 	if err != nil {
-		drState.updateReceived = true
-		drState.err = err
+		drState.resolver.ReportError(err)
 		drState.resolver.depMgr.logger.Warningf("Error while parsing DNS target %q: %v", target, drState.resolver.depMgr.annotateErrorWithNodeID(err))
 		return drState
 	}
 	r, err := resolver.Get("dns").Build(resolver.Target{URL: *u}, dr, resolver.BuildOptions{})
 	if err != nil {
-		drState.updateReceived = true
-		drState.err = err
+		drState.resolver.ReportError(err)
 		drState.resolver.depMgr.logger.Warningf("Error while building DNS resolver for target %q: %v", target, drState.resolver.depMgr.annotateErrorWithNodeID(err))
 		return drState
 	}

From 0da35e0e935dbebf4321778a2695cf0c3b551e1e Mon Sep 17 00:00:00 2001
From: eshitachandwani
Date: Fri, 12 Dec 2025 20:52:58 +0530
Subject: [PATCH 13/25] generics try

---
 internal/xds/xdsdepmgr/watch_service.go | 100 +++---------------
 .../xds/xdsdepmgr/xds_dependency_manager.go | 81 ++++++++++----
 2 files changed, 74 insertions(+), 107 deletions(-)

diff --git a/internal/xds/xdsdepmgr/watch_service.go
b/internal/xds/xdsdepmgr/watch_service.go index 55671c86e444..4204e14de83b 100644 --- a/internal/xds/xdsdepmgr/watch_service.go +++ b/internal/xds/xdsdepmgr/watch_service.go @@ -29,100 +29,24 @@ import ( "google.golang.org/grpc/serviceconfig" ) -type listenerWatcher struct { - resourceName string - cancel func() - depMgr *DependencyManager +// xdsResourceWatcher is a generic implementation of the xdsresource.Watcher +// interface. +type xdsResourceWatcher[T any] struct { + onUpdate func(*T, func()) + onError func(error, func()) + onAmbientError func(error, func()) } -func newListenerWatcher(resourceName string, depMgr *DependencyManager) *listenerWatcher { - lw := &listenerWatcher{resourceName: resourceName, depMgr: depMgr} - lw.cancel = xdsresource.WatchListener(depMgr.xdsClient, resourceName, lw) - return lw +func (x *xdsResourceWatcher[T]) ResourceChanged(update *T, onDone func()) { + x.onUpdate(update, onDone) } -func (l *listenerWatcher) ResourceChanged(update *xdsresource.ListenerUpdate, onDone func()) { - l.depMgr.onListenerResourceUpdate(update, onDone) +func (x *xdsResourceWatcher[T]) ResourceError(err error, onDone func()) { + x.onError(err, onDone) } -func (l *listenerWatcher) ResourceError(err error, onDone func()) { - l.depMgr.onListenerResourceError(err, onDone) -} - -func (l *listenerWatcher) AmbientError(err error, onDone func()) { - l.depMgr.onListenerResourceAmbientError(err, onDone) -} - -func (l *listenerWatcher) stop() { - l.cancel() - if l.depMgr.logger.V(2) { - l.depMgr.logger.Infof("Canceling watch on Listener resource %q", l.resourceName) - } -} - -type routeConfigWatcher struct { - resourceName string - cancel func() - depMgr *DependencyManager -} - -func newRouteConfigWatcher(resourceName string, depMgr *DependencyManager) *routeConfigWatcher { - rw := &routeConfigWatcher{resourceName: resourceName, depMgr: depMgr} - rw.cancel = xdsresource.WatchRouteConfig(depMgr.xdsClient, resourceName, rw) - return rw -} - -func (r *routeConfigWatcher) 
ResourceChanged(u *xdsresource.RouteConfigUpdate, onDone func()) { - r.depMgr.onRouteConfigResourceUpdate(r.resourceName, u, onDone) -} - -func (r *routeConfigWatcher) ResourceError(err error, onDone func()) { - r.depMgr.onRouteConfigResourceError(r.resourceName, err, onDone) -} - -func (r *routeConfigWatcher) AmbientError(err error, onDone func()) { - r.depMgr.onRouteConfigResourceAmbientError(r.resourceName, err, onDone) -} - -func (r *routeConfigWatcher) stop() { - r.cancel() - if r.depMgr.logger.V(2) { - r.depMgr.logger.Infof("Canceling watch on RouteConfiguration resource %q", r.resourceName) - } -} - -type clusterWatcher struct { - resourceName string - depMgr *DependencyManager -} - -func (c *clusterWatcher) ResourceChanged(u *xdsresource.ClusterUpdate, onDone func()) { - c.depMgr.onClusterResourceUpdate(c.resourceName, u, onDone) -} - -func (c *clusterWatcher) ResourceError(err error, onDone func()) { - c.depMgr.onClusterResourceError(c.resourceName, err, onDone) -} - -func (c *clusterWatcher) AmbientError(err error, onDone func()) { - c.depMgr.onClusterAmbientError(c.resourceName, err, onDone) -} - -type endpointsWatcher struct { - resourceName string - depMgr *DependencyManager -} - -func (e *endpointsWatcher) ResourceChanged(u *xdsresource.EndpointsUpdate, onDone func()) { - e.depMgr.onEndpointUpdate(e.resourceName, u, onDone) -} - -func (e *endpointsWatcher) ResourceError(err error, onDone func()) { - e.depMgr.onEndpointResourceError(e.resourceName, err, onDone) -} - -func (e *endpointsWatcher) AmbientError(err error, onDone func()) { - e.depMgr.onEndpointAmbientError(e.resourceName, err, onDone) +func (x *xdsResourceWatcher[T]) AmbientError(err error, onDone func()) { + x.onAmbientError(err, onDone) } // dnsResolverState watches updates for the given DNS hostname. 
It implements diff --git a/internal/xds/xdsdepmgr/xds_dependency_manager.go b/internal/xds/xdsdepmgr/xds_dependency_manager.go index f58f0429f3fa..5ce12ec522d4 100644 --- a/internal/xds/xdsdepmgr/xds_dependency_manager.go +++ b/internal/xds/xdsdepmgr/xds_dependency_manager.go @@ -80,9 +80,9 @@ type DependencyManager struct { // All the fields below are protected by mu. mu sync.Mutex stopped bool - listenerWatcher *listenerWatcher - currentListenerUpdate *xdsresource.ListenerUpdate - routeConfigWatcher *routeConfigWatcher + listenerCancel func() + currentListenerUpdate *xdsresource.ListenerUpdate + routeConfigCancel func() rdsResourceName string currentRouteConfig *xdsresource.RouteConfigUpdate currentVirtualHost *xdsresource.VirtualHost @@ -117,7 +117,18 @@ func New(listenerName, dataplaneAuthority string, xdsClient xdsclient.XDSClient, // Start the listener watch. Listener watch will start the other resource // watches as needed. - dm.listenerWatcher = newListenerWatcher(listenerName, dm) + lw := &xdsResourceWatcher[xdsresource.ListenerUpdate]{ + onUpdate: func(update *xdsresource.ListenerUpdate, onDone func()) { + dm.onListenerResourceUpdate(update, onDone) + }, + onError: func(err error, onDone func()) { + dm.onListenerResourceError(err, onDone) + }, + onAmbientError: func(err error, onDone func()) { + dm.onListenerResourceAmbientError(err, onDone) + }, + } + dm.listenerCancel = xdsresource.WatchListener(dm.xdsClient, listenerName, lw) return dm } @@ -131,11 +142,11 @@ func (m *DependencyManager) Close() { } m.stopped = true - if m.listenerWatcher != nil { - m.listenerWatcher.stop() + if m.listenerCancel != nil { + m.listenerCancel() } - if m.routeConfigWatcher != nil { - m.routeConfigWatcher.stop() + if m.routeConfigCancel != nil { + m.routeConfigCancel() } for name, cluster := range m.clusterWatchers { cluster.cancelWatch() @@ -420,9 +431,9 @@ func (m *DependencyManager) onListenerResourceUpdate(update *xdsresource.Listene // If there was a previous route 
config watcher because of a non-inline // route configuration, cancel it. m.rdsResourceName = "" - if m.routeConfigWatcher != nil { - m.routeConfigWatcher.stop() - m.routeConfigWatcher = nil + if m.routeConfigCancel != nil { + m.routeConfigCancel() + m.routeConfigCancel = nil } m.applyRouteConfigUpdateLocked(update.InlineRouteConfig) return @@ -441,10 +452,22 @@ func (m *DependencyManager) onListenerResourceUpdate(update *xdsresource.Listene // resolved, no update is sent to the channel, and therefore the old route // configuration (if received) is used until the new one is received. m.rdsResourceName = update.RouteConfigName - if m.routeConfigWatcher != nil { - m.routeConfigWatcher.stop() + if m.routeConfigCancel != nil { + m.routeConfigCancel() } - m.routeConfigWatcher = newRouteConfigWatcher(m.rdsResourceName, m) + + rw := &xdsResourceWatcher[xdsresource.RouteConfigUpdate]{ + onUpdate: func(update *xdsresource.RouteConfigUpdate, onDone func()) { + m.onRouteConfigResourceUpdate(m.rdsResourceName, update, onDone) + }, + onError: func(err error, onDone func()) { + m.onRouteConfigResourceError(m.rdsResourceName, err, onDone) + }, + onAmbientError: func(err error, onDone func()) { + m.onRouteConfigResourceAmbientError(m.rdsResourceName, err, onDone) + }, + } + m.routeConfigCancel = xdsresource.WatchRouteConfig(m.xdsClient, m.rdsResourceName, rw) } @@ -459,12 +482,12 @@ func (m *DependencyManager) onListenerResourceError(err error, onDone func()) { m.logger.Warningf("Received resource error for Listener resource %q: %v", m.ldsResourceName, m.annotateErrorWithNodeID(err)) - if m.routeConfigWatcher != nil { - m.routeConfigWatcher.stop() + if m.routeConfigCancel != nil { + m.routeConfigCancel() } m.currentListenerUpdate = nil m.rdsResourceName = "" - m.routeConfigWatcher = nil + m.routeConfigCancel = nil m.watcher.Error(fmt.Errorf("listener resource error: %v", m.annotateErrorWithNodeID(err))) } @@ -535,7 +558,17 @@ type clusterWatcherState struct { } func 
newClusterWatcher(resourceName string, depMgr *DependencyManager) *clusterWatcherState { - w := &clusterWatcher{resourceName: resourceName, depMgr: depMgr} + w := &xdsResourceWatcher[xdsresource.ClusterUpdate]{ + onUpdate: func(u *xdsresource.ClusterUpdate, onDone func()) { + depMgr.onClusterResourceUpdate(resourceName, u, onDone) + }, + onError: func(err error, onDone func()) { + depMgr.onClusterResourceError(resourceName, err, onDone) + }, + onAmbientError: func(err error, onDone func()) { + depMgr.onClusterAmbientError(resourceName, err, onDone) + }, + } return &clusterWatcherState{ cancelWatch: xdsresource.WatchCluster(depMgr.xdsClient, resourceName, w), } @@ -598,7 +631,17 @@ type endpointWatcherState struct { } func newEndpointWatcher(resourceName string, depMgr *DependencyManager) *endpointWatcherState { - w := &endpointsWatcher{resourceName: resourceName, depMgr: depMgr} + w := &xdsResourceWatcher[xdsresource.EndpointsUpdate]{ + onUpdate: func(u *xdsresource.EndpointsUpdate, onDone func()) { + depMgr.onEndpointUpdate(resourceName, u, onDone) + }, + onError: func(err error, onDone func()) { + depMgr.onEndpointResourceError(resourceName, err, onDone) + }, + onAmbientError: func(err error, onDone func()) { + depMgr.onEndpointAmbientError(resourceName, err, onDone) + }, + } return &endpointWatcherState{ cancelWatch: xdsresource.WatchEndpoints(depMgr.xdsClient, resourceName, w), } From 1de452ef48d5674d3b0f6b47511a82bc3310cb25 Mon Sep 17 00:00:00 2001 From: eshitachandwani Date: Fri, 12 Dec 2025 23:28:27 +0530 Subject: [PATCH 14/25] resolve comments --- internal/xds/resolver/watch_service_test.go | 3 +- internal/xds/resolver/xds_resolver.go | 2 +- internal/xds/resolver/xds_resolver_test.go | 18 ++++--- .../xds/xdsdepmgr/xds_dependency_manager.go | 54 +++++++++---------- 4 files changed, 40 insertions(+), 37 deletions(-) diff --git a/internal/xds/resolver/watch_service_test.go b/internal/xds/resolver/watch_service_test.go index 5ab5d6f4132f..831de0b31f0a 100644 
--- a/internal/xds/resolver/watch_service_test.go +++ b/internal/xds/resolver/watch_service_test.go @@ -68,7 +68,8 @@ func (s) TestServiceWatch_ListenerPointsToNewRouteConfiguration(t *testing.T) { verifyUpdateFromResolver(ctx, t, stateCh, wantServiceConfig(resources.Clusters[0].Name)) // Update the listener resource to point to a new route configuration name. - // Leave the old route configuration resource unchanged. + // Leave the old route configuration resource unchanged to prevent the race + // between route resource error for resource removed and listener update. newTestRouteConfigName := defaultTestRouteConfigName + "-new" resources.Listeners = []*v3listenerpb.Listener{e2e.DefaultClientListener(defaultTestServiceName, newTestRouteConfigName)} resources.SkipValidation = true diff --git a/internal/xds/resolver/xds_resolver.go b/internal/xds/resolver/xds_resolver.go index 64ce7c09132d..cee49a914c31 100644 --- a/internal/xds/resolver/xds_resolver.go +++ b/internal/xds/resolver/xds_resolver.go @@ -230,7 +230,7 @@ type xdsResolver struct { curConfigSelector stoppableConfigSelector } -// ResolveNow is a no-op at this point. +// ResolveNow calls ResolveNow on the dependency manager. func (r *xdsResolver) ResolveNow(opts resolver.ResolveNowOptions) { if r.dm != nil { r.dm.ResolveNow(opts) diff --git a/internal/xds/resolver/xds_resolver_test.go b/internal/xds/resolver/xds_resolver_test.go index 6d5abb399b40..78be09b1fed6 100644 --- a/internal/xds/resolver/xds_resolver_test.go +++ b/internal/xds/resolver/xds_resolver_test.go @@ -434,14 +434,16 @@ func (s) TestResolverBadServiceUpdate_NACKedWithCache(t *testing.T) { stateCh, _, _ := buildResolverForTarget(t, resolver.Target{URL: *testutils.MustParseURL("xds:///" + defaultTestServiceName)}, bc) // Configure all resources on the management server. 
- listeners := []*v3listenerpb.Listener{e2e.DefaultClientListener(defaultTestServiceName, defaultTestRouteConfigName)} - routes := []*v3routepb.RouteConfiguration{e2e.DefaultRouteConfig(defaultTestRouteConfigName, defaultTestServiceName, defaultTestClusterName)} - cluster := []*v3clusterpb.Cluster{e2e.DefaultCluster(defaultTestClusterName, defaultTestEndpointName, e2e.SecurityLevelNone)} - endpoints := []*v3endpointpb.ClusterLoadAssignment{e2e.DefaultEndpoint(defaultTestEndpointName, defaultTestHostname, defaultTestPort)} - configureAllResourcesOnManagementServer(ctx, t, mgmtServer, nodeID, listeners, routes, cluster, endpoints) + resources := e2e.DefaultClientResources(e2e.ResourceParams{ + NodeID: nodeID, + DialTarget: defaultTestServiceName, + Host: "localhost", + Port: 8080, + }) + mgmtServer.Update(ctx, resources) // Expect a good update from the resolver. - cs := verifyUpdateFromResolver(ctx, t, stateCh, wantServiceConfig(defaultTestClusterName)) + cs := verifyUpdateFromResolver(ctx, t, stateCh, wantServiceConfig(resources.Clusters[0].Name)) // "Make an RPC" by invoking the config selector. _, err := cs.SelectConfig(iresolver.RPCInfo{Context: ctx, Method: "/service/method"}) @@ -999,7 +1001,9 @@ func (s) TestResolverDelayedOnCommitted(t *testing.T) { Port: defaultTestPort[0], SecLevel: e2e.SecurityLevelNone, }) - mgmtServer.Update(ctx, resources) + if err := mgmtServer.Update(ctx, resources); err != nil { + t.Fatal(err) + } stateCh, _, _ := buildResolverForTarget(t, resolver.Target{URL: *testutils.MustParseURL("xds:///" + defaultTestServiceName)}, bc) diff --git a/internal/xds/xdsdepmgr/xds_dependency_manager.go b/internal/xds/xdsdepmgr/xds_dependency_manager.go index 5ce12ec522d4..2d0bb15fac38 100644 --- a/internal/xds/xdsdepmgr/xds_dependency_manager.go +++ b/internal/xds/xdsdepmgr/xds_dependency_manager.go @@ -80,9 +80,9 @@ type DependencyManager struct { // All the fields below are protected by mu. 
mu sync.Mutex stopped bool - listenerCancel func() - currentListenerUpdate *xdsresource.ListenerUpdate - routeConfigCancel func() + listenerCancel func() + currentListenerUpdate *xdsresource.ListenerUpdate + routeConfigCancel func() rdsResourceName string currentRouteConfig *xdsresource.RouteConfigUpdate currentVirtualHost *xdsresource.VirtualHost @@ -224,10 +224,6 @@ func (m *DependencyManager) maybeSendUpdateLocked() { } if haveAllResources { m.watcher.Update(config) - return - } - if m.logger.V(2) { - m.logger.Infof("Not sending update to watcher as not all resources are available") } } @@ -280,24 +276,18 @@ func (m *DependencyManager) populateClusterConfigLocked(clusterName string, dept return false, nil, nil } - switch { - case state.lastUpdate == nil && state.lastErr == nil: - // Case 1: Watcher exists but no data or error has arrived yet. + if !state.updateReceived { return false, nil, nil - - case state.lastUpdate == nil: - // Case 2: No update received, but a terminal resource error occurred. - // (state.lastErr is implicitly non-nil here). + } + if state.lastErr != nil { clusterConfigs[clusterName] = &xdsresource.ClusterResult{Err: state.lastErr} return true, nil, nil + } - default: - // Case 3: A valid update was received. Store the config and continue resolution. - clusterConfigs[clusterName] = &xdsresource.ClusterResult{ - Config: xdsresource.ClusterConfig{ - Cluster: state.lastUpdate, - }, - } + clusterConfigs[clusterName] = &xdsresource.ClusterResult{ + Config: xdsresource.ClusterConfig{ + Cluster: state.lastUpdate, + }, } update := state.lastUpdate @@ -316,7 +306,7 @@ func (m *DependencyManager) populateClusterConfigLocked(clusterName string, dept endpointState := m.endpointWatchers[edsName] // If the resource does not have any update yet, return. 
- if endpointState.lastUpdate == nil && endpointState.lastErr == nil { + if !endpointState.updateReceived { return false, nil, nil } @@ -407,6 +397,8 @@ func (m *DependencyManager) applyRouteConfigUpdateLocked(update *xdsresource.Rou } } + // Watch for new clusters is started in populateClusterConfigLocked to avoid + // repeating the code. m.clustersFromRouteConfig = newClusters } m.maybeSendUpdateLocked() @@ -552,9 +544,10 @@ func (m *DependencyManager) onRouteConfigResourceAmbientError(resourceName strin } type clusterWatcherState struct { - cancelWatch func() - lastUpdate *xdsresource.ClusterUpdate - lastErr error + cancelWatch func() + lastUpdate *xdsresource.ClusterUpdate + lastErr error + updateReceived bool } func newClusterWatcher(resourceName string, depMgr *DependencyManager) *clusterWatcherState { @@ -590,6 +583,7 @@ func (m *DependencyManager) onClusterResourceUpdate(resourceName string, update state := m.clusterWatchers[resourceName] state.lastUpdate = update state.lastErr = nil + state.updateReceived = true m.maybeSendUpdateLocked() } @@ -608,6 +602,7 @@ func (m *DependencyManager) onClusterResourceError(resourceName string, err erro state := m.clusterWatchers[resourceName] state.lastErr = err state.lastUpdate = nil + state.updateReceived = true m.maybeSendUpdateLocked() } @@ -625,9 +620,10 @@ func (m *DependencyManager) onClusterAmbientError(resourceName string, err error } type endpointWatcherState struct { - cancelWatch func() - lastUpdate *xdsresource.EndpointsUpdate - lastErr error + cancelWatch func() + lastUpdate *xdsresource.EndpointsUpdate + lastErr error + updateReceived bool } func newEndpointWatcher(resourceName string, depMgr *DependencyManager) *endpointWatcherState { @@ -664,6 +660,7 @@ func (m *DependencyManager) onEndpointUpdate(resourceName string, update *xdsres state := m.endpointWatchers[resourceName] state.lastUpdate = update state.lastErr = nil + state.updateReceived = true m.maybeSendUpdateLocked() } @@ -681,6 +678,7 @@ func (m 
*DependencyManager) onEndpointResourceError(resourceName string, err err state := m.endpointWatchers[resourceName] state.lastErr = err state.lastUpdate = nil + state.updateReceived = true m.maybeSendUpdateLocked() } From 5ba419bf9d810dcbe9a5264bd0cdbffc8587852a Mon Sep 17 00:00:00 2001 From: eshitachandwani Date: Fri, 12 Dec 2025 23:35:48 +0530 Subject: [PATCH 15/25] test formatting --- .../xdsdepmgr/xds_dependency_manager_test.go | 27 ++++++++++--------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/internal/xds/xdsdepmgr/xds_dependency_manager_test.go b/internal/xds/xdsdepmgr/xds_dependency_manager_test.go index 48835b279a6f..130b8a81c6a0 100644 --- a/internal/xds/xdsdepmgr/xds_dependency_manager_test.go +++ b/internal/xds/xdsdepmgr/xds_dependency_manager_test.go @@ -127,6 +127,7 @@ func lessEndpoint(a, b resolver.Endpoint) bool { return addrA < addrB } + func verifyXDSConfig(ctx context.Context, xdsCh chan *xdsresource.XDSConfig, errCh chan error, want *xdsresource.XDSConfig) error { select { case <-ctx.Done(): @@ -164,7 +165,7 @@ func verifyXDSConfig(ctx context.Context, xdsCh chan *xdsresource.XDSConfig, err return nil } -func makedefaultXDSConfig(routeConfigName, clusterName, edsServiceName, addr string) *xdsresource.XDSConfig { +func makeDefaultXDSConfig(routeConfigName, clusterName, edsServiceName, addr string) *xdsresource.XDSConfig { return &xdsresource.XDSConfig{ Listener: &xdsresource.ListenerUpdate{ RouteConfigName: routeConfigName, @@ -290,7 +291,7 @@ func makeLogicalDNSClusterResource(name, dnsHost string, dnsPort uint32) *v3clus } // Tests the happy case where the dependency manager receives all the required -// resources and verifies that Update is called with with the correct XDSConfig. +// resources and verifies that Update is called with the correct XDSConfig. 
func (s) TestHappyCase(t *testing.T) { nodeID, mgmtServer, xdsClient := setup(t, false) @@ -315,7 +316,7 @@ func (s) TestHappyCase(t *testing.T) { dm := xdsdepmgr.New(defaultTestServiceName, defaultTestServiceName, xdsClient, watcher) defer dm.Close() - wantXdsConfig := makedefaultXDSConfig(resources.Routes[0].Name, resources.Clusters[0].Name, resources.Clusters[0].EdsClusterConfig.ServiceName, "localhost:8080") + wantXdsConfig := makeDefaultXDSConfig(resources.Routes[0].Name, resources.Clusters[0].Name, resources.Clusters[0].EdsClusterConfig.ServiceName, "localhost:8080") if err := verifyXDSConfig(ctx, watcher.updateCh, watcher.errorCh, wantXdsConfig); err != nil { t.Fatal(err) } @@ -365,7 +366,7 @@ func (s) TestInlineRouteConfig(t *testing.T) { dm := xdsdepmgr.New(defaultTestServiceName, defaultTestServiceName, xdsClient, watcher) defer dm.Close() - wantXdsConfig := makedefaultXDSConfig(defaultTestRouteConfigName, defaultTestClusterName, defaultTestEDSServiceName, "localhost:8080") + wantXdsConfig := makeDefaultXDSConfig(defaultTestRouteConfigName, defaultTestClusterName, defaultTestEDSServiceName, "localhost:8080") wantXdsConfig.Listener.InlineRouteConfig = wantXdsConfig.RouteConfig wantXdsConfig.Listener.RouteConfigName = "" @@ -375,7 +376,7 @@ func (s) TestInlineRouteConfig(t *testing.T) { } // Tests the case where dependency manager only receives listener resource but -// does not receive route config resource. Verfies that Update is not called +// does not receive route config resource. Verifies that Update is not called // since we do not have all resources. 
func (s) TestIncompleteResources(t *testing.T) { nodeID, mgmtServer, xdsClient := setup(t, false) @@ -436,7 +437,7 @@ func (s) TestListenerResourceError(t *testing.T) { dm := xdsdepmgr.New(defaultTestServiceName, defaultTestServiceName, xdsClient, watcher) defer dm.Close() - wantXdsConfig := makedefaultXDSConfig(resources.Routes[0].Name, resources.Clusters[0].Name, resources.Clusters[0].EdsClusterConfig.ServiceName, "localhost:8080") + wantXdsConfig := makeDefaultXDSConfig(resources.Routes[0].Name, resources.Clusters[0].Name, resources.Clusters[0].EdsClusterConfig.ServiceName, "localhost:8080") if err := verifyXDSConfig(ctx, watcher.updateCh, watcher.errorCh, wantXdsConfig); err != nil { t.Fatal(err) } @@ -554,7 +555,7 @@ func (s) TestNoVirtualHost_ExistingResource(t *testing.T) { defer dm.Close() // Verify valid update. - wantXdsConfig := makedefaultXDSConfig(resources.Routes[0].Name, resources.Clusters[0].Name, resources.Endpoints[0].ClusterName, "localhost:8080") + wantXdsConfig := makeDefaultXDSConfig(resources.Routes[0].Name, resources.Clusters[0].Name, resources.Endpoints[0].ClusterName, "localhost:8080") if err := verifyXDSConfig(ctx, watcher.updateCh, watcher.errorCh, wantXdsConfig); err != nil { t.Fatal(err) } @@ -607,7 +608,7 @@ func (s) TestAmbientError(t *testing.T) { defer dm.Close() // Wait for the initial valid update. - wantXdsConfig := makedefaultXDSConfig(resources.Routes[0].Name, resources.Clusters[0].Name, resources.Clusters[0].EdsClusterConfig.ServiceName, "localhost:8080") + wantXdsConfig := makeDefaultXDSConfig(resources.Routes[0].Name, resources.Clusters[0].Name, resources.Clusters[0].EdsClusterConfig.ServiceName, "localhost:8080") if err := verifyXDSConfig(ctx, watcher.updateCh, watcher.errorCh, wantXdsConfig); err != nil { t.Fatal(err) } @@ -692,7 +693,7 @@ func (s) TestRouteResourceUpdate(t *testing.T) { defer dm.Close() // Wait for the first update. 
- wantXdsConfig := makedefaultXDSConfig(resources.Routes[0].Name, resources.Clusters[0].Name, resources.Clusters[0].EdsClusterConfig.ServiceName, "localhost:8080") + wantXdsConfig := makeDefaultXDSConfig(resources.Routes[0].Name, resources.Clusters[0].Name, resources.Clusters[0].EdsClusterConfig.ServiceName, "localhost:8080") if err := verifyXDSConfig(ctx, watcher.updateCh, watcher.errorCh, wantXdsConfig); err != nil { t.Fatal(err) } @@ -777,7 +778,7 @@ func (s) TestRouteResourceChangeToInline(t *testing.T) { defer dm.Close() // Wait for the first update. - wantXdsConfig := makedefaultXDSConfig(resources.Routes[0].Name, resources.Clusters[0].Name, resources.Clusters[0].EdsClusterConfig.ServiceName, "localhost:8080") + wantXdsConfig := makeDefaultXDSConfig(resources.Routes[0].Name, resources.Clusters[0].Name, resources.Clusters[0].EdsClusterConfig.ServiceName, "localhost:8080") if err := verifyXDSConfig(ctx, watcher.updateCh, watcher.errorCh, wantXdsConfig); err != nil { t.Fatal(err) } @@ -914,7 +915,7 @@ func (s) TestClusterResourceError(t *testing.T) { dm := xdsdepmgr.New(defaultTestServiceName, defaultTestServiceName, xdsClient, watcher) defer dm.Close() - wantXdsConfig := makedefaultXDSConfig(resources.Routes[0].Name, resources.Clusters[0].Name, resources.Clusters[0].EdsClusterConfig.ServiceName, "localhost:8080") + wantXdsConfig := makeDefaultXDSConfig(resources.Routes[0].Name, resources.Clusters[0].Name, resources.Clusters[0].EdsClusterConfig.ServiceName, "localhost:8080") wantXdsConfig.Clusters[resources.Clusters[0].Name] = &xdsresource.ClusterResult{Err: fmt.Errorf("[xDS node id: %v]: %v", nodeID, fmt.Errorf("unsupported config_source_specifier *corev3.ConfigSource_Ads in lrs_server field"))} if err := verifyXDSConfig(ctx, watcher.updateCh, watcher.errorCh, wantXdsConfig); err != nil { @@ -964,7 +965,7 @@ func (s) TestClusterAmbientError(t *testing.T) { dm := xdsdepmgr.New(defaultTestServiceName, defaultTestServiceName, xdsClient, watcher) defer dm.Close() - 
wantXdsConfig := makedefaultXDSConfig(resources.Routes[0].Name, resources.Clusters[0].Name, resources.Clusters[0].EdsClusterConfig.ServiceName, "localhost:8080") + wantXdsConfig := makeDefaultXDSConfig(resources.Routes[0].Name, resources.Clusters[0].Name, resources.Clusters[0].EdsClusterConfig.ServiceName, "localhost:8080") if err := verifyXDSConfig(ctx, watcher.updateCh, watcher.errorCh, wantXdsConfig); err != nil { t.Fatal(err) } @@ -1432,7 +1433,7 @@ func (s) TestEndpointAmbientError(t *testing.T) { dm := xdsdepmgr.New(defaultTestServiceName, defaultTestServiceName, xdsClient, watcher) defer dm.Close() - wantXdsConfig := makedefaultXDSConfig(resources.Routes[0].Name, resources.Clusters[0].Name, resources.Endpoints[0].ClusterName, "localhost:8080") + wantXdsConfig := makeDefaultXDSConfig(resources.Routes[0].Name, resources.Clusters[0].Name, resources.Endpoints[0].ClusterName, "localhost:8080") if err := verifyXDSConfig(ctx, watcher.updateCh, watcher.errorCh, wantXdsConfig); err != nil { t.Fatal(err) From d4836bbb7728e6e7292f297b5bf4fd65b0136e48 Mon Sep 17 00:00:00 2001 From: eshitachandwani Date: Tue, 16 Dec 2025 12:10:15 +0530 Subject: [PATCH 16/25] changes --- internal/xds/resolver/xds_resolver.go | 2 +- internal/xds/xdsdepmgr/watch_service.go | 6 +- .../xds/xdsdepmgr/xds_dependency_manager.go | 89 +++++------ .../xdsdepmgr/xds_dependency_manager_test.go | 138 +++++++++--------- 4 files changed, 124 insertions(+), 111 deletions(-) diff --git a/internal/xds/resolver/xds_resolver.go b/internal/xds/resolver/xds_resolver.go index cee49a914c31..e98fe240f23c 100644 --- a/internal/xds/resolver/xds_resolver.go +++ b/internal/xds/resolver/xds_resolver.go @@ -233,7 +233,7 @@ type xdsResolver struct { // ResolveNow calls ResolveNow on the dependency manager. 
func (r *xdsResolver) ResolveNow(opts resolver.ResolveNowOptions) { if r.dm != nil { - r.dm.ResolveNow(opts) + r.dm.RequestDNSReresolution(opts) } } diff --git a/internal/xds/xdsdepmgr/watch_service.go b/internal/xds/xdsdepmgr/watch_service.go index 4204e14de83b..807536129098 100644 --- a/internal/xds/xdsdepmgr/watch_service.go +++ b/internal/xds/xdsdepmgr/watch_service.go @@ -98,7 +98,11 @@ func (dr *dnsResolver) ParseServiceConfig(string) *serviceconfig.ParseResult { func newDNSResolver(target string, depMgr *DependencyManager) *dnsResolverState { ctx, cancel := context.WithCancel(context.Background()) dr := &dnsResolver{target: target, depMgr: depMgr, serializer: *grpcsync.NewCallbackSerializer(ctx), serializerCancel: cancel} - drState := &dnsResolverState{resolver: dr, cancelResolver: func() {}} + drState := &dnsResolverState{resolver: dr} + drState.cancelResolver = func() { + drState.resolver.serializerCancel() + <-drState.resolver.serializer.Done() + } u, err := url.Parse("dns:///" + target) if err != nil { drState.resolver.ReportError(err) diff --git a/internal/xds/xdsdepmgr/xds_dependency_manager.go b/internal/xds/xdsdepmgr/xds_dependency_manager.go index 2d0bb15fac38..b7fa2a605a1b 100644 --- a/internal/xds/xdsdepmgr/xds_dependency_manager.go +++ b/internal/xds/xdsdepmgr/xds_dependency_manager.go @@ -29,10 +29,7 @@ import ( "google.golang.org/grpc/resolver" ) -const ( - aggregateClusterMaxDepth = 16 - prefix = "[xdsdepmgr %p] " -) +const prefix = "[xdsdepmgr %p] " var logger = grpclog.Component("xds") @@ -170,9 +167,8 @@ func (m *DependencyManager) annotateErrorWithNodeID(err error) error { return fmt.Errorf("[xDS node id: %v]: %v", m.nodeID, err) } -// maybeSendUpdateLocked checks that all the resources have been received and -// sends the current aggregated xDS configuration to the watcher if all the -// updates are available. 
+// maybeSendUpdateLocked verifies that all expected resources have been +// received, and if so, delivers the complete xDS configuration to the watcher. func (m *DependencyManager) maybeSendUpdateLocked() { if m.currentListenerUpdate == nil || m.currentRouteConfig == nil { return @@ -183,43 +179,47 @@ func (m *DependencyManager) maybeSendUpdateLocked() { VirtualHost: m.currentVirtualHost, Clusters: make(map[string]*xdsresource.ClusterResult), } + if !EnableClusterAndEndpointsWatch { m.watcher.Update(config) return } + edsResourcesSeen := make(map[string]bool) dnsResourcesSeen := make(map[string]bool) - clusterResourceSeen := make(map[string]bool) + clusterResourcesSeen := make(map[string]bool) haveAllResources := true for cluster := range m.clustersFromRouteConfig { - gotAllResource, _, err := m.populateClusterConfigLocked(cluster, 0, config.Clusters, edsResourcesSeen, dnsResourcesSeen, clusterResourceSeen) - if !gotAllResource { + ok, leafClusters, err := m.populateClusterConfigLocked(cluster, 0, config.Clusters, edsResourcesSeen, dnsResourcesSeen, clusterResourcesSeen) + if !ok { haveAllResources = false } + // If there are no leaf clusters, add that as error. + if ok && len(leafClusters) == 0 { + config.Clusters[cluster] = &xdsresource.ClusterResult{Err: m.annotateErrorWithNodeID(fmt.Errorf("aggregate cluster graph has no leaf clusters"))} + } if err != nil { - config.Clusters[cluster] = &xdsresource.ClusterResult{ - Err: err, - } + config.Clusters[cluster] = &xdsresource.ClusterResult{Err: err} } } // Cancel resources not seen in the tree. 
- for name, endpoint := range m.endpointWatchers { + for name, ep := range m.endpointWatchers { if _, ok := edsResourcesSeen[name]; !ok { - endpoint.cancelWatch() + ep.cancelWatch() delete(m.endpointWatchers, name) } } - for name, dnsResolver := range m.dnsResolvers { + for name, dr := range m.dnsResolvers { if _, ok := dnsResourcesSeen[name]; !ok { - dnsResolver.cancelResolver() + dr.cancelResolver() delete(m.dnsResolvers, name) } } - for clusterName, cluster := range m.clusterWatchers { - if _, ok := clusterResourceSeen[clusterName]; !ok { + for name, cluster := range m.clusterWatchers { + if _, ok := clusterResourcesSeen[name]; !ok { cluster.cancelWatch() - delete(m.clusterWatchers, clusterName) + delete(m.clusterWatchers, name) } } if haveAllResources { @@ -227,9 +227,10 @@ func (m *DependencyManager) maybeSendUpdateLocked() { } } -// populateClusterConfigLocked recursively resolves and populates the +// populateClusterConfigLocked resolves and populates the // configuration for the given cluster and its children, including its -// associated endpoint or aggregate children. +// associated endpoint or aggregate children. For aggregate clusters, it +// recursively resolves the configuration for its child clusters. // // This function traverses the cluster dependency graph (e.g., from an Aggregate // cluster down to its leaf clusters and their endpoints/DNS resources) to @@ -241,13 +242,13 @@ func (m *DependencyManager) maybeSendUpdateLocked() { // clusterName: The name of the cluster resource to resolve. // depth: The current recursion depth. // clusterConfigs: Map to store the resolved cluster configuration. -// endpointResourceSeen: Stores which EDS resource names have been encountered. -// dnsResourceSeen: Stores which DNS resource names have been encountered. +// endpointResourcesSeen: Stores which EDS resource names have been encountered. +// dnsResourcesSeen: Stores which DNS resource names have been encountered. 
// clustersSeen: Stores which cluster resource names have been encountered. // // Returns: // -// bool: Returns if the cluster configuration (and all its +// bool: Returns true if the cluster configuration (and all its // dependencies) is fully resolved (i.e either update or // error has been received). // []string: A slice of all "leaf" cluster names discovered in the @@ -256,11 +257,13 @@ func (m *DependencyManager) maybeSendUpdateLocked() { // error: Error that needs to be propogated up the tree (like // max depth exceeded or an error propagated from a // child cluster). -func (m *DependencyManager) populateClusterConfigLocked(clusterName string, depth int, clusterConfigs map[string]*xdsresource.ClusterResult, endpointResourceSeen, dnsResourceSeen, clustersSeen map[string]bool) (bool, []string, error) { +func (m *DependencyManager) populateClusterConfigLocked(clusterName string, depth int, clusterConfigs map[string]*xdsresource.ClusterResult, endpointResourcesSeen, dnsResourcesSeen, clustersSeen map[string]bool) (bool, []string, error) { + const aggregateClusterMaxDepth = 16 clustersSeen[clusterName] = true - if depth > aggregateClusterMaxDepth { + if depth >= aggregateClusterMaxDepth { m.logger.Warningf("aggregate cluster graph exceeds max depth (%d)", aggregateClusterMaxDepth) + clusterConfigs[clusterName] = &xdsresource.ClusterResult{Err: m.annotateErrorWithNodeID(fmt.Errorf("aggregate cluster graph exceeds max depth (%d)", aggregateClusterMaxDepth))} return true, nil, m.annotateErrorWithNodeID(fmt.Errorf("aggregate cluster graph exceeds max depth (%d)", aggregateClusterMaxDepth)) } @@ -280,8 +283,7 @@ func (m *DependencyManager) populateClusterConfigLocked(clusterName string, dept return false, nil, nil } if state.lastErr != nil { - clusterConfigs[clusterName] = &xdsresource.ClusterResult{Err: state.lastErr} - return true, nil, nil + return true, nil, state.lastErr } clusterConfigs[clusterName] = &xdsresource.ClusterResult{ @@ -297,7 +299,7 @@ func (m 
*DependencyManager) populateClusterConfigLocked(clusterName string, dept if update.EDSServiceName != "" { edsName = update.EDSServiceName } - endpointResourceSeen[edsName] = true + endpointResourcesSeen[edsName] = true // If endpoint watcher does not exist, create one. if _, ok := m.endpointWatchers[edsName]; !ok { m.endpointWatchers[edsName] = newEndpointWatcher(edsName, m) @@ -319,7 +321,7 @@ func (m *DependencyManager) populateClusterConfigLocked(clusterName string, dept case xdsresource.ClusterTypeLogicalDNS: target := update.DNSHostName - dnsResourceSeen[target] = true + dnsResourcesSeen[target] = true // If dns resolver does not exist, create one. if _, ok := m.dnsResolvers[target]; !ok { m.dnsResolvers[target] = newDNSResolver(target, m) @@ -341,31 +343,30 @@ func (m *DependencyManager) populateClusterConfigLocked(clusterName string, dept var leafClusters []string haveAllResources := true for _, child := range update.PrioritizedClusterNames { - gotAll, childLeafCluster, err := m.populateClusterConfigLocked(child, depth+1, clusterConfigs, endpointResourceSeen, dnsResourceSeen, clustersSeen) - if !gotAll { + ok, childLeafClusters, err := m.populateClusterConfigLocked(child, depth+1, clusterConfigs, endpointResourcesSeen, dnsResourcesSeen, clustersSeen) + if !ok { haveAllResources = false } if err != nil { - clusterConfigs[clusterName] = &xdsresource.ClusterResult{ - Err: err, - } + clusterConfigs[clusterName] = &xdsresource.ClusterResult{Err: err} return true, leafClusters, err } - leafClusters = append(leafClusters, childLeafCluster...) + leafClusters = append(leafClusters, childLeafClusters...) 
} if !haveAllResources { return false, leafClusters, nil } if haveAllResources && len(leafClusters) == 0 { - clusterConfigs[clusterName] = &xdsresource.ClusterResult{Err: m.annotateErrorWithNodeID(fmt.Errorf("no leaf clusters found for aggregate cluster"))} - return true, leafClusters, m.annotateErrorWithNodeID(fmt.Errorf("no leaf clusters found for aggregate cluster")) + clusterConfigs[clusterName] = &xdsresource.ClusterResult{Err: m.annotateErrorWithNodeID(fmt.Errorf("aggregate cluster graph has no leaf clusters"))} + return true, leafClusters, nil } clusterConfigs[clusterName].Config.AggregateConfig = &xdsresource.AggregateConfig{ LeafClusters: leafClusters, } return true, leafClusters, nil } - clusterConfigs[clusterName] = &xdsresource.ClusterResult{Err: m.annotateErrorWithNodeID(fmt.Errorf("no cluster data yet for cluster %s", clusterName))} + // The cluster update is not one of the supported cluster types. + clusterConfigs[clusterName] = &xdsresource.ClusterResult{Err: m.annotateErrorWithNodeID(fmt.Errorf("cluster type %v of cluster %s not supported", update.ClusterType, clusterName))} return false, nil, nil } @@ -397,8 +398,8 @@ } - // Watch for new clusters is started in populateClusterConfigLocked to avoid - // repeating the code. + // Watch for new clusters is started in populateClusterConfigLocked to + // avoid repeating the code. m.clustersFromRouteConfig = newClusters } m.maybeSendUpdateLocked() @@ -758,8 +759,8 @@ func (m *DependencyManager) onDNSError(resourceName string, err error) { m.maybeSendUpdateLocked() } -// ResolveNow calls all the the DNS resolvers ResolveNow. -func (m *DependencyManager) ResolveNow(opt resolver.ResolveNowOptions) { +// RequestDNSReresolution calls ResolveNow on all the DNS resolvers. 
+func (m *DependencyManager) RequestDNSReresolution(opt resolver.ResolveNowOptions) { for _, res := range m.dnsResolvers { res.resolver.dnsR.ResolveNow(opt) } } diff --git a/internal/xds/xdsdepmgr/xds_dependency_manager_test.go b/internal/xds/xdsdepmgr/xds_dependency_manager_test.go index 130b8a81c6a0..b849097998b4 100644 --- a/internal/xds/xdsdepmgr/xds_dependency_manager_test.go +++ b/internal/xds/xdsdepmgr/xds_dependency_manager_test.go @@ -93,6 +93,13 @@ func (w *testWatcher) Error(err error) { w.errorCh <- err } +func (w *testWatcher) drainUpdates(ctx context.Context) { + select { + case <-ctx.Done(): + case <-w.updateCh: + } +} + func verifyError(ctx context.Context, errCh chan error, wantErr, wantNodeID string) error { select { case gotErr := <-errCh: @@ -157,6 +164,7 @@ func verifyXDSConfig(ctx context.Context, xdsCh chan *xdsresource.XDSConfig, err cmpopts.SortSlices(lessEndpoint), } if diff := cmp.Diff(update, want, cmpOpts...); diff != "" { + // Mismatch details are reported via the returned error below. return fmt.Errorf("received unexpected update from dependency manager. 
Diff (-got +want):\n%v", diff) } case err := <-errCh: @@ -165,7 +173,7 @@ func verifyXDSConfig(ctx context.Context, xdsCh chan *xdsresource.XDSConfig, err return nil } -func makeDefaultXDSConfig(routeConfigName, clusterName, edsServiceName, addr string) *xdsresource.XDSConfig { +func makeXDSConfig(routeConfigName, clusterName, edsServiceName, addr string) *xdsresource.XDSConfig { return &xdsresource.XDSConfig{ Listener: &xdsresource.ListenerUpdate{ RouteConfigName: routeConfigName, @@ -213,7 +221,8 @@ func makeDefaultXDSConfig(routeConfigName, clusterName, edsServiceName, addr str Weight: 1, }, }, - Weight: 1}, + Weight: 1, + }, }, }, }}, @@ -222,7 +231,9 @@ func makeDefaultXDSConfig(routeConfigName, clusterName, edsServiceName, addr str } } -func setup(t *testing.T, allowResourceSubset bool) (string, *e2e.ManagementServer, xdsclient.XDSClient) { +// setupManagementServerAndClient creates a management server, an xds client and +// returns the node ID, management server and xds client. +func setupManagementServerAndClient(t *testing.T, allowResourceSubset bool) (string, *e2e.ManagementServer, xdsclient.XDSClient) { t.Helper() nodeID := uuid.New().String() mgmtServer, bootstrapContents := setupManagementServerForTest(t, nodeID, allowResourceSubset) @@ -293,7 +304,7 @@ func makeLogicalDNSClusterResource(name, dnsHost string, dnsPort uint32) *v3clus // Tests the happy case where the dependency manager receives all the required // resources and verifies that Update is called with the correct XDSConfig. 
func (s) TestHappyCase(t *testing.T) { - nodeID, mgmtServer, xdsClient := setup(t, false) + nodeID, mgmtServer, xdsClient := setupManagementServerAndClient(t, false) watcher := &testWatcher{ updateCh: make(chan *xdsresource.XDSConfig), @@ -316,7 +327,7 @@ func (s) TestHappyCase(t *testing.T) { dm := xdsdepmgr.New(defaultTestServiceName, defaultTestServiceName, xdsClient, watcher) defer dm.Close() - wantXdsConfig := makeDefaultXDSConfig(resources.Routes[0].Name, resources.Clusters[0].Name, resources.Clusters[0].EdsClusterConfig.ServiceName, "localhost:8080") + wantXdsConfig := makeXDSConfig(resources.Routes[0].Name, resources.Clusters[0].Name, resources.Clusters[0].EdsClusterConfig.ServiceName, "localhost:8080") if err := verifyXDSConfig(ctx, watcher.updateCh, watcher.errorCh, wantXdsConfig); err != nil { t.Fatal(err) } @@ -325,7 +336,7 @@ func (s) TestHappyCase(t *testing.T) { // Tests the case where the listener contains an inline route configuration and // verifies that Update is called with the correct XDSConfig. 
func (s) TestInlineRouteConfig(t *testing.T) { - nodeID, mgmtServer, xdsClient := setup(t, false) + nodeID, mgmtServer, xdsClient := setupManagementServerAndClient(t, false) watcher := &testWatcher{ updateCh: make(chan *xdsresource.XDSConfig), @@ -366,7 +377,7 @@ func (s) TestInlineRouteConfig(t *testing.T) { dm := xdsdepmgr.New(defaultTestServiceName, defaultTestServiceName, xdsClient, watcher) defer dm.Close() - wantXdsConfig := makeDefaultXDSConfig(defaultTestRouteConfigName, defaultTestClusterName, defaultTestEDSServiceName, "localhost:8080") + wantXdsConfig := makeXDSConfig(defaultTestRouteConfigName, defaultTestClusterName, defaultTestEDSServiceName, "localhost:8080") wantXdsConfig.Listener.InlineRouteConfig = wantXdsConfig.RouteConfig wantXdsConfig.Listener.RouteConfigName = "" @@ -378,8 +389,8 @@ func (s) TestInlineRouteConfig(t *testing.T) { // Tests the case where dependency manager only receives listener resource but // does not receive route config resource. Verifies that Update is not called // since we do not have all resources. -func (s) TestIncompleteResources(t *testing.T) { - nodeID, mgmtServer, xdsClient := setup(t, false) +func (s) TestNoRouteConfigResource(t *testing.T) { + nodeID, mgmtServer, xdsClient := setupManagementServerAndClient(t, false) watcher := &testWatcher{ updateCh: make(chan *xdsresource.XDSConfig), @@ -413,8 +424,8 @@ func (s) TestIncompleteResources(t *testing.T) { // Tests the case where dependency manager receives a listener resource error by // sending the correct update first and then removing the listener resource. It // verifies that Error is called with the correct error. 
-func (s) TestListenerResourceError(t *testing.T) { - nodeID, mgmtServer, xdsClient := setup(t, false) +func (s) TestListenerResourceNotFoundError(t *testing.T) { + nodeID, mgmtServer, xdsClient := setupManagementServerAndClient(t, false) watcher := &testWatcher{ updateCh: make(chan *xdsresource.XDSConfig), @@ -437,7 +448,7 @@ func (s) TestListenerResourceError(t *testing.T) { dm := xdsdepmgr.New(defaultTestServiceName, defaultTestServiceName, xdsClient, watcher) defer dm.Close() - wantXdsConfig := makeDefaultXDSConfig(resources.Routes[0].Name, resources.Clusters[0].Name, resources.Clusters[0].EdsClusterConfig.ServiceName, "localhost:8080") + wantXdsConfig := makeXDSConfig(resources.Routes[0].Name, resources.Clusters[0].Name, resources.Clusters[0].EdsClusterConfig.ServiceName, "localhost:8080") if err := verifyXDSConfig(ctx, watcher.updateCh, watcher.errorCh, wantXdsConfig); err != nil { t.Fatal(err) } @@ -454,11 +465,12 @@ func (s) TestListenerResourceError(t *testing.T) { } } -// Tests the case where dependency manager receives a route config resource -// error by sending a route resource that is NACKed by the XDSClient. It -// verifies that Error is called with correct error. -func (s) TestRouteResourceError(t *testing.T) { - nodeID, mgmtServer, xdsClient := setup(t, false) +// Tests the scenario where the Dependency Manager receives an invalid +// RouteConfiguration from the management server. The test provides a +// malformed resource to trigger a NACK, and verifies that the Dependency +// Manager propagates the resulting error via Error method. +func (s) TestRouteConfigResourceError(t *testing.T) { + nodeID, mgmtServer, xdsClient := setupManagementServerAndClient(t, false) watcher := &testWatcher{ updateCh: make(chan *xdsresource.XDSConfig), @@ -490,8 +502,8 @@ func (s) TestRouteResourceError(t *testing.T) { } } -// Tests the case where route config updates receives does not have any virtual -// host. Verifies that Error is called with correct error. 
+// Tests the case where a received route configuration update has no virtual +// hosts. Verifies that Error is called with the expected error. func (s) TestNoVirtualHost(t *testing.T) { nodeID := uuid.New().String() mgmtServer, bc := setupManagementServerForTest(t, nodeID, false) @@ -530,7 +542,7 @@ func (s) TestNoVirtualHost(t *testing.T) { // route resource with no virtual host, which also results in error being sent // across. func (s) TestNoVirtualHost_ExistingResource(t *testing.T) { - nodeID, mgmtServer, xdsClient := setup(t, false) + nodeID, mgmtServer, xdsClient := setupManagementServerAndClient(t, false) watcher := &testWatcher{ updateCh: make(chan *xdsresource.XDSConfig), @@ -555,7 +567,7 @@ func (s) TestNoVirtualHost_ExistingResource(t *testing.T) { defer dm.Close() // Verify valid update. - wantXdsConfig := makeDefaultXDSConfig(resources.Routes[0].Name, resources.Clusters[0].Name, resources.Endpoints[0].ClusterName, "localhost:8080") + wantXdsConfig := makeXDSConfig(resources.Routes[0].Name, resources.Clusters[0].Name, resources.Endpoints[0].ClusterName, "localhost:8080") if err := verifyXDSConfig(ctx, watcher.updateCh, watcher.errorCh, wantXdsConfig); err != nil { t.Fatal(err) } @@ -582,7 +594,7 @@ func (s) TestAmbientError(t *testing.T) { // Expect a warning log for the ambient error. grpctest.ExpectWarning("Listener resource ambient error") - nodeID, mgmtServer, xdsClient := setup(t, false) + nodeID, mgmtServer, xdsClient := setupManagementServerAndClient(t, false) watcher := &testWatcher{ updateCh: make(chan *xdsresource.XDSConfig), @@ -608,7 +620,7 @@ func (s) TestAmbientError(t *testing.T) { defer dm.Close() // Wait for the initial valid update. 
- wantXdsConfig := makeDefaultXDSConfig(resources.Routes[0].Name, resources.Clusters[0].Name, resources.Clusters[0].EdsClusterConfig.ServiceName, "localhost:8080") + wantXdsConfig := makeXDSConfig(resources.Routes[0].Name, resources.Clusters[0].Name, resources.Clusters[0].EdsClusterConfig.ServiceName, "localhost:8080") if err := verifyXDSConfig(ctx, watcher.updateCh, watcher.errorCh, wantXdsConfig); err != nil { t.Fatal(err) } @@ -668,7 +680,7 @@ func (s) TestAmbientError(t *testing.T) { // Tests the case where the cluster name changes in the route resource update // and verify that each time Update is called with correct cluster name. func (s) TestRouteResourceUpdate(t *testing.T) { - nodeID, mgmtServer, xdsClient := setup(t, false) + nodeID, mgmtServer, xdsClient := setupManagementServerAndClient(t, false) watcher := &testWatcher{ updateCh: make(chan *xdsresource.XDSConfig), @@ -693,7 +705,7 @@ func (s) TestRouteResourceUpdate(t *testing.T) { defer dm.Close() // Wait for the first update. - wantXdsConfig := makeDefaultXDSConfig(resources.Routes[0].Name, resources.Clusters[0].Name, resources.Clusters[0].EdsClusterConfig.ServiceName, "localhost:8080") + wantXdsConfig := makeXDSConfig(resources.Routes[0].Name, resources.Clusters[0].Name, resources.Clusters[0].EdsClusterConfig.ServiceName, "localhost:8080") if err := verifyXDSConfig(ctx, watcher.updateCh, watcher.errorCh, wantXdsConfig); err != nil { t.Fatal(err) } @@ -736,7 +748,8 @@ func (s) TestRouteResourceUpdate(t *testing.T) { Weight: 1, }, }, - Weight: 1}, + Weight: 1, + }, }, }, }, @@ -751,9 +764,9 @@ func (s) TestRouteResourceUpdate(t *testing.T) { // Tests the case where the route resource is first sent from the management // server and the changed to be inline with the listener and then again changed // to be received from the management server. It verifies that each time Update -// called with the correct XDSConfig. +// is called with the correct XDSConfig. 
func (s) TestRouteResourceChangeToInline(t *testing.T) { - nodeID, mgmtServer, xdsClient := setup(t, false) + nodeID, mgmtServer, xdsClient := setupManagementServerAndClient(t, false) watcher := &testWatcher{ updateCh: make(chan *xdsresource.XDSConfig), @@ -778,7 +791,7 @@ func (s) TestRouteResourceChangeToInline(t *testing.T) { defer dm.Close() // Wait for the first update. - wantXdsConfig := makeDefaultXDSConfig(resources.Routes[0].Name, resources.Clusters[0].Name, resources.Clusters[0].EdsClusterConfig.ServiceName, "localhost:8080") + wantXdsConfig := makeXDSConfig(resources.Routes[0].Name, resources.Clusters[0].Name, resources.Clusters[0].EdsClusterConfig.ServiceName, "localhost:8080") if err := verifyXDSConfig(ctx, watcher.updateCh, watcher.errorCh, wantXdsConfig); err != nil { t.Fatal(err) } @@ -858,7 +871,8 @@ func (s) TestRouteResourceChangeToInline(t *testing.T) { Weight: 1, }, }, - Weight: 1}, + Weight: 1, + }, }, }, }, @@ -889,7 +903,7 @@ func (s) TestRouteResourceChangeToInline(t *testing.T) { // Tests the case where the dependency manager receives a cluster resource error // and verifies that Update is called with XDSConfig containing cluster error. 
func (s) TestClusterResourceError(t *testing.T) { - nodeID, mgmtServer, xdsClient := setup(t, false) + nodeID, mgmtServer, xdsClient := setupManagementServerAndClient(t, false) watcher := &testWatcher{ updateCh: make(chan *xdsresource.XDSConfig), @@ -915,7 +929,7 @@ func (s) TestClusterResourceError(t *testing.T) { dm := xdsdepmgr.New(defaultTestServiceName, defaultTestServiceName, xdsClient, watcher) defer dm.Close() - wantXdsConfig := makeDefaultXDSConfig(resources.Routes[0].Name, resources.Clusters[0].Name, resources.Clusters[0].EdsClusterConfig.ServiceName, "localhost:8080") + wantXdsConfig := makeXDSConfig(resources.Routes[0].Name, resources.Clusters[0].Name, resources.Clusters[0].EdsClusterConfig.ServiceName, "localhost:8080") wantXdsConfig.Clusters[resources.Clusters[0].Name] = &xdsresource.ClusterResult{Err: fmt.Errorf("[xDS node id: %v]: %v", nodeID, fmt.Errorf("unsupported config_source_specifier *corev3.ConfigSource_Ads in lrs_server field"))} if err := verifyXDSConfig(ctx, watcher.updateCh, watcher.errorCh, wantXdsConfig); err != nil { @@ -925,22 +939,19 @@ func (s) TestClusterResourceError(t *testing.T) { // Drain the updates because management server keeps sending the error // updates repeatedly causing the update from dependency manager to be // blocked if we don't drain it. - select { - case <-ctx.Done(): - case <-watcher.updateCh: - } + watcher.drainUpdates(ctx) } // Tests the case where the dependency manager receives a cluster resource -// ambient error. We send a valid cluster resource first, then send an invalid -// one and then send the valid resource again. We send the valid resource again -// so that we can be sure the ambient error reaches the dependency manager since +// ambient error. A valid cluster resource is sent first, then an invalid +// one and then the valid resource again. The valid resource is sent again +// to make sure that the ambient error reaches the dependency manager since // there is no other way to wait for it. 
func (s) TestClusterAmbientError(t *testing.T) { // Expect a warning log for the ambient error. grpctest.ExpectWarning("Cluster resource ambient error") - nodeID, mgmtServer, xdsClient := setup(t, false) + nodeID, mgmtServer, xdsClient := setupManagementServerAndClient(t, false) watcher := &testWatcher{ updateCh: make(chan *xdsresource.XDSConfig), @@ -965,7 +976,7 @@ func (s) TestClusterAmbientError(t *testing.T) { dm := xdsdepmgr.New(defaultTestServiceName, defaultTestServiceName, xdsClient, watcher) defer dm.Close() - wantXdsConfig := makeDefaultXDSConfig(resources.Routes[0].Name, resources.Clusters[0].Name, resources.Clusters[0].EdsClusterConfig.ServiceName, "localhost:8080") + wantXdsConfig := makeXDSConfig(resources.Routes[0].Name, resources.Clusters[0].Name, resources.Clusters[0].EdsClusterConfig.ServiceName, "localhost:8080") if err := verifyXDSConfig(ctx, watcher.updateCh, watcher.errorCh, wantXdsConfig); err != nil { t.Fatal(err) } @@ -1007,7 +1018,7 @@ func (s) TestClusterAmbientError(t *testing.T) { // resources is not configured and then verifies that Update is called with // correct config when all resources are configured. 
func (s) TestAggregateCluster(t *testing.T) { - nodeID, mgmtServer, xdsClient := setup(t, true) + nodeID, mgmtServer, xdsClient := setupManagementServerAndClient(t, true) watcher := &testWatcher{ updateCh: make(chan *xdsresource.XDSConfig), @@ -1075,8 +1086,7 @@ func (s) TestAggregateCluster(t *testing.T) { Prefix: newStringP("/"), WeightedClusters: []xdsresource.WeightedCluster{{Name: resources.Clusters[0].Name, Weight: 100}}, ActionType: xdsresource.RouteActionRoute}, - }, - }, + }}, Clusters: map[string]*xdsresource.ClusterResult{ resources.Clusters[0].Name: { Config: xdsresource.ClusterConfig{Cluster: &xdsresource.ClusterUpdate{ @@ -1108,7 +1118,8 @@ func (s) TestAggregateCluster(t *testing.T) { Weight: 1, }, }, - Weight: 1}, + Weight: 1, + }, }, }, }, @@ -1151,7 +1162,7 @@ func (s) TestAggregateCluster(t *testing.T) { // configured with an error. Verifies that the error is correctly received in // the XDSConfig. func (s) TestAggregateClusterChildError(t *testing.T) { - nodeID, mgmtServer, xdsClient := setup(t, true) + nodeID, mgmtServer, xdsClient := setupManagementServerAndClient(t, true) watcher := &testWatcher{ updateCh: make(chan *xdsresource.XDSConfig), @@ -1252,17 +1263,14 @@ func (s) TestAggregateClusterChildError(t *testing.T) { // Drain the updates because management server keeps sending the error // updates repeatedly causing the update from dependency manager to be // blocked if we don't drain it. - select { - case <-ctx.Done(): - case <-watcher.updateCh: - } + watcher.drainUpdates(ctx) } // Tests the case where an aggregate cluster has no leaf clusters by creating a // cyclic dependency where A->B and B->A. Verifies that an error with "no leaf // clusters found" is received. 
func (s) TestAggregateClusterNoLeafCluster(t *testing.T) { - nodeID, mgmtServer, xdsClient := setup(t, true) + nodeID, mgmtServer, xdsClient := setupManagementServerAndClient(t, true) watcher := &testWatcher{ updateCh: make(chan *xdsresource.XDSConfig), @@ -1317,10 +1325,10 @@ func (s) TestAggregateClusterNoLeafCluster(t *testing.T) { }, Clusters: map[string]*xdsresource.ClusterResult{ defaultTestClusterName: { - Err: fmt.Errorf("[xDS node id: %v]: %v", nodeID, fmt.Errorf("no leaf clusters found for aggregate cluster")), + Err: fmt.Errorf("[xDS node id: %v]: %v", nodeID, fmt.Errorf("aggregate cluster graph has no leaf clusters")), }, clusterNameB: { - Err: fmt.Errorf("[xDS node id: %v]: %v", nodeID, fmt.Errorf("no leaf clusters found for aggregate cluster")), + Err: fmt.Errorf("[xDS node id: %v]: %v", nodeID, fmt.Errorf("aggregate cluster graph has no leaf clusters")), }, }, } @@ -1334,21 +1342,21 @@ func (s) TestAggregateClusterNoLeafCluster(t *testing.T) { // Verify that the error is correctly received in the XDSConfig in all the // clusters. func (s) TestAggregateClusterMaxDepth(t *testing.T) { - nodeID, mgmtServer, xdsClient := setup(t, true) + const clusterDepth = 17 + nodeID, mgmtServer, xdsClient := setupManagementServerAndClient(t, true) watcher := &testWatcher{ updateCh: make(chan *xdsresource.XDSConfig), errorCh: make(chan error), } - ctx, cancel := context.WithTimeout(context.Background(), 10*time.Minute) + ctx, cancel := context.WithTimeout(context.Background(), 50*defaultTestTimeout) defer cancel() // Create a graph of aggregate clusters with 18 clusters. 
- clusters := make([]*v3clusterpb.Cluster, 18)
- for i := 0; i < 17; i++ {
+ clusters := make([]*v3clusterpb.Cluster, clusterDepth)
+ for i := 0; i < clusterDepth; i++ {
 clusters[i] = makeAggregateClusterResource(fmt.Sprintf("agg-%d", i), []string{fmt.Sprintf("agg-%d", i+1)})
 }
- clusters[17] = makeLogicalDNSClusterResource("agg-17", "localhost", 8081)
 
 resources := e2e.UpdateOptions{
 NodeID: nodeID,
@@ -1396,7 +1404,7 @@ func (s) TestAggregateClusterMaxDepth(t *testing.T) {
 
 // Populate the Clusters map with all clusters,except the last one, each
 // having the common error
- for i := 0; i < 17; i++ {
+ for i := 0; i < clusterDepth; i++ {
 clusterName := fmt.Sprintf("agg-%d", i)
 
 // The ClusterResult only needs the Err field set to the common error
@@ -1410,8 +1418,11 @@ func (s) TestAggregateClusterMaxDepth(t *testing.T) {
 }
 }
 
+// Tests the scenario where the endpoint watcher receives an ambient error. It
+// verifies that the error is stored in the resolution note and that the update
+// is still delivered.
func (s) TestEndpointAmbientError(t *testing.T) { - nodeID, mgmtServer, xdsClient := setup(t, true) + nodeID, mgmtServer, xdsClient := setupManagementServerAndClient(t, true) watcher := &testWatcher{ updateCh: make(chan *xdsresource.XDSConfig), @@ -1433,7 +1444,7 @@ func (s) TestEndpointAmbientError(t *testing.T) { dm := xdsdepmgr.New(defaultTestServiceName, defaultTestServiceName, xdsClient, watcher) defer dm.Close() - wantXdsConfig := makeDefaultXDSConfig(resources.Routes[0].Name, resources.Clusters[0].Name, resources.Endpoints[0].ClusterName, "localhost:8080") + wantXdsConfig := makeXDSConfig(resources.Routes[0].Name, resources.Clusters[0].Name, resources.Endpoints[0].ClusterName, "localhost:8080") if err := verifyXDSConfig(ctx, watcher.updateCh, watcher.errorCh, wantXdsConfig); err != nil { t.Fatal(err) @@ -1453,8 +1464,5 @@ func (s) TestEndpointAmbientError(t *testing.T) { // Drain the updates because management server keeps sending the error // updates causing the update from dependency manager to be blocked if we // don't drain it. 
- select { - case <-ctx.Done(): - case <-watcher.updateCh: - } + watcher.drainUpdates(ctx) } From 229ce3be7eb6cf4f7403d60a0a888cce4e599268 Mon Sep 17 00:00:00 2001 From: eshitachandwani Date: Wed, 17 Dec 2025 00:31:45 +0530 Subject: [PATCH 17/25] change addresses fields of xdsresource.Endpoint --- .../xdsdepmgr/xds_dependency_manager_test.go | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/internal/xds/xdsdepmgr/xds_dependency_manager_test.go b/internal/xds/xdsdepmgr/xds_dependency_manager_test.go index b849097998b4..0c442e49cfe4 100644 --- a/internal/xds/xdsdepmgr/xds_dependency_manager_test.go +++ b/internal/xds/xdsdepmgr/xds_dependency_manager_test.go @@ -216,9 +216,9 @@ func makeXDSConfig(routeConfigName, clusterName, edsServiceName, addr string) *x }, Endpoints: []xdsresource.Endpoint{ { - Addresses: []string{addr}, - HealthStatus: xdsresource.EndpointHealthStatusUnknown, - Weight: 1, + ResolverEndpoint: resolver.Endpoint{Addresses: []resolver.Address{{Addr: addr}}}, + HealthStatus: xdsresource.EndpointHealthStatusUnknown, + Weight: 1, }, }, Weight: 1, @@ -743,9 +743,9 @@ func (s) TestRouteResourceUpdate(t *testing.T) { }, Endpoints: []xdsresource.Endpoint{ { - Addresses: []string{"localhost:8081"}, - HealthStatus: xdsresource.EndpointHealthStatusUnknown, - Weight: 1, + ResolverEndpoint: resolver.Endpoint{Addresses: []resolver.Address{{Addr: "localhost:8081"}}}, + HealthStatus: xdsresource.EndpointHealthStatusUnknown, + Weight: 1, }, }, Weight: 1, @@ -866,9 +866,9 @@ func (s) TestRouteResourceChangeToInline(t *testing.T) { }, Endpoints: []xdsresource.Endpoint{ { - Addresses: []string{"localhost:8081"}, - HealthStatus: xdsresource.EndpointHealthStatusUnknown, - Weight: 1, + ResolverEndpoint: resolver.Endpoint{Addresses: []resolver.Address{{Addr: "localhost:8081"}}}, + HealthStatus: xdsresource.EndpointHealthStatusUnknown, + Weight: 1, }, }, Weight: 1, @@ -1113,9 +1113,9 @@ func (s) TestAggregateCluster(t *testing.T) { }, 
Endpoints: []xdsresource.Endpoint{ { - Addresses: []string{"localhost:8080"}, - HealthStatus: xdsresource.EndpointHealthStatusUnknown, - Weight: 1, + ResolverEndpoint: resolver.Endpoint{Addresses: []resolver.Address{{Addr: "localhost:8080"}}}, + HealthStatus: xdsresource.EndpointHealthStatusUnknown, + Weight: 1, }, }, Weight: 1, From 6df023caf0a2c4a6923a2c3e72ae401e90dd23da Mon Sep 17 00:00:00 2001 From: Easwar Swaminathan Date: Wed, 17 Dec 2025 00:05:40 +0000 Subject: [PATCH 18/25] split populateClusterConfigLocked to keep it's size down --- .../xds/xdsdepmgr/xds_dependency_manager.go | 144 ++++++++++-------- 1 file changed, 80 insertions(+), 64 deletions(-) diff --git a/internal/xds/xdsdepmgr/xds_dependency_manager.go b/internal/xds/xdsdepmgr/xds_dependency_manager.go index b7fa2a605a1b..a2ddc5adea29 100644 --- a/internal/xds/xdsdepmgr/xds_dependency_manager.go +++ b/internal/xds/xdsdepmgr/xds_dependency_manager.go @@ -279,9 +279,12 @@ func (m *DependencyManager) populateClusterConfigLocked(clusterName string, dept return false, nil, nil } + // If a watch exists but no update received yet, return. if !state.updateReceived { return false, nil, nil } + + // If there was a resource error, propagate it up. if state.lastErr != nil { return true, nil, state.lastErr } @@ -295,79 +298,92 @@ func (m *DependencyManager) populateClusterConfigLocked(clusterName string, dept switch update.ClusterType { case xdsresource.ClusterTypeEDS: - edsName := clusterName - if update.EDSServiceName != "" { - edsName = update.EDSServiceName - } - endpointResourcesSeen[edsName] = true - // If endpoint watcher does not exist, create one. 
- if _, ok := m.endpointWatchers[edsName]; !ok { - m.endpointWatchers[edsName] = newEndpointWatcher(edsName, m) - return false, nil, nil - } - endpointState := m.endpointWatchers[edsName] + return m.populateEDSClusterLocked(clusterName, update, clusterConfigs, endpointResourcesSeen) + case xdsresource.ClusterTypeLogicalDNS: + return m.populateLogicalDNSClusterLocked(clusterName, update, clusterConfigs, dnsResourcesSeen) + case xdsresource.ClusterTypeAggregate: + return m.populateAggregateClusterLocked(clusterName, update, depth, clusterConfigs, endpointResourcesSeen, dnsResourcesSeen, clustersSeen) + default: + clusterConfigs[clusterName] = &xdsresource.ClusterResult{Err: m.annotateErrorWithNodeID(fmt.Errorf("cluster type %v of cluster %s not supported", update.ClusterType, clusterName))} + return true, nil, nil + } +} - // If the resource does not have any update yet, return. - if !endpointState.updateReceived { - return false, nil, nil - } +func (m *DependencyManager) populateEDSClusterLocked(clusterName string, update *xdsresource.ClusterUpdate, clusterConfigs map[string]*xdsresource.ClusterResult, endpointResourcesSeen map[string]bool) (bool, []string, error) { + edsName := clusterName + if update.EDSServiceName != "" { + edsName = update.EDSServiceName + } + endpointResourcesSeen[edsName] = true - // Store the update and error. - clusterConfigs[clusterName].Config.EndpointConfig = &xdsresource.EndpointConfig{ - EDSUpdate: endpointState.lastUpdate, - ResolutionNote: endpointState.lastErr, - } - return true, []string{clusterName}, nil + // If endpoint watcher does not exist, create one. + if _, ok := m.endpointWatchers[edsName]; !ok { + m.endpointWatchers[edsName] = newEndpointWatcher(edsName, m) + return false, nil, nil + } + endpointState := m.endpointWatchers[edsName] - case xdsresource.ClusterTypeLogicalDNS: - target := update.DNSHostName - dnsResourcesSeen[target] = true - // If dns resolver does not exist, create one. 
- if _, ok := m.dnsResolvers[target]; !ok { - m.dnsResolvers[target] = newDNSResolver(target, m) - return false, nil, nil - } - dnsState := m.dnsResolvers[target] - // If no update received, return false. - if !dnsState.updateReceived { - return false, nil, nil - } + // If the resource does not have any update yet, return. + if !endpointState.updateReceived { + return false, nil, nil + } - clusterConfigs[clusterName].Config.EndpointConfig = &xdsresource.EndpointConfig{ - DNSEndpoints: dnsState.lastUpdate, - ResolutionNote: dnsState.err, - } - return true, []string{clusterName}, nil + // Store the update and error. + clusterConfigs[clusterName].Config.EndpointConfig = &xdsresource.EndpointConfig{ + EDSUpdate: endpointState.lastUpdate, + ResolutionNote: endpointState.lastErr, + } + return true, []string{clusterName}, nil +} - case xdsresource.ClusterTypeAggregate: - var leafClusters []string - haveAllResources := true - for _, child := range update.PrioritizedClusterNames { - ok, childLeafClusters, err := m.populateClusterConfigLocked(child, depth+1, clusterConfigs, endpointResourcesSeen, dnsResourcesSeen, clustersSeen) - if !ok { - haveAllResources = false - } - if err != nil { - clusterConfigs[clusterName] = &xdsresource.ClusterResult{Err: err} - return true, leafClusters, err - } - leafClusters = append(leafClusters, childLeafClusters...) 
- } - if !haveAllResources { - return false, leafClusters, nil - } - if haveAllResources && len(leafClusters) == 0 { - clusterConfigs[clusterName] = &xdsresource.ClusterResult{Err: m.annotateErrorWithNodeID(fmt.Errorf("aggregate cluster graph has no leaf clusters"))} - return true, leafClusters, nil +func (m *DependencyManager) populateLogicalDNSClusterLocked(clusterName string, update *xdsresource.ClusterUpdate, clusterConfigs map[string]*xdsresource.ClusterResult, dnsResourcesSeen map[string]bool) (bool, []string, error) { + target := update.DNSHostName + dnsResourcesSeen[target] = true + + // If dns resolver does not exist, create one. + if _, ok := m.dnsResolvers[target]; !ok { + m.dnsResolvers[target] = newDNSResolver(target, m) + return false, nil, nil + } + dnsState := m.dnsResolvers[target] + + // If no update received, return false. + if !dnsState.updateReceived { + return false, nil, nil + } + + clusterConfigs[clusterName].Config.EndpointConfig = &xdsresource.EndpointConfig{ + DNSEndpoints: dnsState.lastUpdate, + ResolutionNote: dnsState.err, + } + return true, []string{clusterName}, nil +} + +func (m *DependencyManager) populateAggregateClusterLocked(clusterName string, update *xdsresource.ClusterUpdate, depth int, clusterConfigs map[string]*xdsresource.ClusterResult, endpointResourcesSeen, dnsResourcesSeen, clustersSeen map[string]bool) (bool, []string, error) { + var leafClusters []string + haveAllResources := true + for _, child := range update.PrioritizedClusterNames { + ok, childLeafClusters, err := m.populateClusterConfigLocked(child, depth+1, clusterConfigs, endpointResourcesSeen, dnsResourcesSeen, clustersSeen) + if !ok { + haveAllResources = false } - clusterConfigs[clusterName].Config.AggregateConfig = &xdsresource.AggregateConfig{ - LeafClusters: leafClusters, + if err != nil { + clusterConfigs[clusterName] = &xdsresource.ClusterResult{Err: err} + return true, leafClusters, err } + leafClusters = append(leafClusters, childLeafClusters...) 
+ } + if !haveAllResources { + return false, leafClusters, nil + } + if haveAllResources && len(leafClusters) == 0 { + clusterConfigs[clusterName] = &xdsresource.ClusterResult{Err: m.annotateErrorWithNodeID(fmt.Errorf("aggregate cluster graph has no leaf clusters"))} return true, leafClusters, nil } - // When the cluster update is not one of the three types of cluster updates. - clusterConfigs[clusterName] = &xdsresource.ClusterResult{Err: m.annotateErrorWithNodeID(fmt.Errorf("cluster type %v of cluster %s not supported", update.ClusterType, clusterName))} - return false, nil, nil + clusterConfigs[clusterName].Config.AggregateConfig = &xdsresource.AggregateConfig{ + LeafClusters: leafClusters, + } + return true, leafClusters, nil } func (m *DependencyManager) applyRouteConfigUpdateLocked(update *xdsresource.RouteConfigUpdate) { From ad3a3b3073f20452813f29924e314c908447f7aa Mon Sep 17 00:00:00 2001 From: eshitachandwani Date: Wed, 17 Dec 2025 08:43:48 +0530 Subject: [PATCH 19/25] small changes --- internal/xds/xdsdepmgr/xds_dependency_manager.go | 8 ++++---- internal/xds/xdsdepmgr/xds_dependency_manager_test.go | 6 ++++-- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/internal/xds/xdsdepmgr/xds_dependency_manager.go b/internal/xds/xdsdepmgr/xds_dependency_manager.go index a2ddc5adea29..a76873f847be 100644 --- a/internal/xds/xdsdepmgr/xds_dependency_manager.go +++ b/internal/xds/xdsdepmgr/xds_dependency_manager.go @@ -262,9 +262,9 @@ func (m *DependencyManager) populateClusterConfigLocked(clusterName string, dept clustersSeen[clusterName] = true if depth >= aggregateClusterMaxDepth { - m.logger.Warningf("aggregate cluster graph exceeds max depth (%d)", aggregateClusterMaxDepth) - clusterConfigs[clusterName] = &xdsresource.ClusterResult{Err: m.annotateErrorWithNodeID(fmt.Errorf("aggregate cluster graph exceeds max depth (%d)", aggregateClusterMaxDepth))} - return true, nil, m.annotateErrorWithNodeID(fmt.Errorf("aggregate cluster graph exceeds max depth 
(%d)", aggregateClusterMaxDepth))
+ err := fmt.Errorf("aggregate cluster graph exceeds max depth (%d)", aggregateClusterMaxDepth)
+ clusterConfigs[clusterName] = &xdsresource.ClusterResult{Err: err}
+ return true, nil, m.annotateErrorWithNodeID(err)
 }
 
 // If cluster is already seen in the tree, return.
@@ -775,7 +775,7 @@ func (m *DependencyManager) onDNSError(resourceName string, err error) {
 m.maybeSendUpdateLocked()
 }
 
-// RequestDNSReresolution calls all the the DNS resolvers ResolveNow.
+// RequestDNSReresolution calls ResolveNow on all the DNS resolvers.
 func (m *DependencyManager) RequestDNSReresolution(opt resolver.ResolveNowOptions) {
 for _, res := range m.dnsResolvers {
 res.resolver.dnsR.ResolveNow(opt)
diff --git a/internal/xds/xdsdepmgr/xds_dependency_manager_test.go b/internal/xds/xdsdepmgr/xds_dependency_manager_test.go
index 0c442e49cfe4..f1d65d906a0f 100644
--- a/internal/xds/xdsdepmgr/xds_dependency_manager_test.go
+++ b/internal/xds/xdsdepmgr/xds_dependency_manager_test.go
@@ -147,6 +147,7 @@ func verifyXDSConfig(ctx context.Context, xdsCh chan *xdsresource.XDSConfig, err
 cmpopts.IgnoreFields(xdsresource.RouteConfigUpdate{}, "Raw"),
 cmpopts.IgnoreFields(xdsresource.ClusterUpdate{}, "Raw", "LBPolicy", "TelemetryLabels"),
 cmpopts.IgnoreFields(xdsresource.EndpointsUpdate{}, "Raw"),
+ // Used for EndpointConfig.ResolutionNote and ClusterResult.Err fields.
cmp.Transformer("ErrorsToString", func(in error) string { if in == nil { return "" // Treat nil as an empty string @@ -225,7 +226,8 @@ func makeXDSConfig(routeConfigName, clusterName, edsServiceName, addr string) *x }, }, }, - }}, + }, + }, }, }, } @@ -1349,7 +1351,7 @@ func (s) TestAggregateClusterMaxDepth(t *testing.T) { updateCh: make(chan *xdsresource.XDSConfig), errorCh: make(chan error), } - ctx, cancel := context.WithTimeout(context.Background(), 50*defaultTestTimeout) + ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout) defer cancel() // Create a graph of aggregate clusters with 18 clusters. From ab7a2aa1a1a908226d8184253153ea43a1a6b1f6 Mon Sep 17 00:00:00 2001 From: eshitachandwani Date: Wed, 17 Dec 2025 08:46:48 +0530 Subject: [PATCH 20/25] annotate error --- internal/xds/xdsdepmgr/xds_dependency_manager.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/internal/xds/xdsdepmgr/xds_dependency_manager.go b/internal/xds/xdsdepmgr/xds_dependency_manager.go index a76873f847be..9cd38880b5b0 100644 --- a/internal/xds/xdsdepmgr/xds_dependency_manager.go +++ b/internal/xds/xdsdepmgr/xds_dependency_manager.go @@ -262,9 +262,9 @@ func (m *DependencyManager) populateClusterConfigLocked(clusterName string, dept clustersSeen[clusterName] = true if depth >= aggregateClusterMaxDepth { - err := fmt.Errorf("aggregate cluster graph exceeds max depth (%d)", aggregateClusterMaxDepth) + err := m.annotateErrorWithNodeID(fmt.Errorf("aggregate cluster graph exceeds max depth (%d)", aggregateClusterMaxDepth)) clusterConfigs[clusterName] = &xdsresource.ClusterResult{Err: err} - return true, nil, m.annotateErrorWithNodeID(err) + return true, nil, err } // If cluster is already seen in the tree, return. 
From 4adea773ac07e9f1e64c48b4a8f9820b05208f25 Mon Sep 17 00:00:00 2001
From: eshitachandwani
Date: Wed, 17 Dec 2025 11:01:18 +0530
Subject: [PATCH 21/25] error test fix

---
 .../xdsdepmgr/xds_dependency_manager_test.go | 48 ++++++++++---------
 1 file changed, 26 insertions(+), 22 deletions(-)

diff --git a/internal/xds/xdsdepmgr/xds_dependency_manager_test.go b/internal/xds/xdsdepmgr/xds_dependency_manager_test.go
index f1d65d906a0f..292ec7a68b91 100644
--- a/internal/xds/xdsdepmgr/xds_dependency_manager_test.go
+++ b/internal/xds/xdsdepmgr/xds_dependency_manager_test.go
@@ -81,11 +81,20 @@ func newStringP(s string) *string {
 type testWatcher struct {
 updateCh chan *xdsresource.XDSConfig
 errorCh chan error
+ done chan struct{}
 }
 
-// Update sends the received XDSConfig update to the update channel.
+// Update sends the received XDSConfig update to the update channel. It does
+// not send updates once the done channel is closed. The done channel is closed
+// in error cases because the management server keeps resending error updates,
+// which causes multiple updates from the dependency manager and would block
+// the update channel.
 func (w *testWatcher) Update(cfg *xdsresource.XDSConfig) {
- w.updateCh <- cfg
+ select {
+ case <-w.done:
+ return
+ case w.updateCh <- cfg:
+ }
 }
 
 // Error sends the received error to the error channel.
@@ -93,13 +102,6 @@ func (w *testWatcher) Error(err error) { w.errorCh <- err } -func (w *testWatcher) drainUpdates(ctx context.Context) { - select { - case <-ctx.Done(): - case <-w.updateCh: - } -} - func verifyError(ctx context.Context, errCh chan error, wantErr, wantNodeID string) error { select { case gotErr := <-errCh: @@ -910,6 +912,7 @@ func (s) TestClusterResourceError(t *testing.T) { watcher := &testWatcher{ updateCh: make(chan *xdsresource.XDSConfig), errorCh: make(chan error), + done: make(chan struct{}), } ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout) @@ -937,11 +940,10 @@ func (s) TestClusterResourceError(t *testing.T) { if err := verifyXDSConfig(ctx, watcher.updateCh, watcher.errorCh, wantXdsConfig); err != nil { t.Fatal(err) } - - // Drain the updates because management server keeps sending the error - // updates repeatedly causing the update from dependency manager to be - // blocked if we don't drain it. - watcher.drainUpdates(ctx) + // Close the watcher done channel to stop sending updates because management + // server keeps sending the error updates repeatedly causing the update from + // dependency manager to be blocked. + close(watcher.done) } // Tests the case where the dependency manager receives a cluster resource @@ -1169,6 +1171,7 @@ func (s) TestAggregateClusterChildError(t *testing.T) { watcher := &testWatcher{ updateCh: make(chan *xdsresource.XDSConfig), errorCh: make(chan error), + done: make(chan struct{}), } ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout) defer cancel() @@ -1262,10 +1265,10 @@ func (s) TestAggregateClusterChildError(t *testing.T) { t.Fatal(err) } - // Drain the updates because management server keeps sending the error - // updates repeatedly causing the update from dependency manager to be - // blocked if we don't drain it. 
- watcher.drainUpdates(ctx) + // Close the watcher done channel to stop sending updates because management + // server keeps sending the error updates repeatedly causing the update from + // dependency manager to be blocked. + close(watcher.done) } // Tests the case where an aggregate cluster has no leaf clusters by creating a @@ -1429,6 +1432,7 @@ func (s) TestEndpointAmbientError(t *testing.T) { watcher := &testWatcher{ updateCh: make(chan *xdsresource.XDSConfig), errorCh: make(chan error), + done: make(chan struct{}), } ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout) defer cancel() @@ -1463,8 +1467,8 @@ func (s) TestEndpointAmbientError(t *testing.T) { t.Fatal(err) } - // Drain the updates because management server keeps sending the error - // updates causing the update from dependency manager to be blocked if we - // don't drain it. - watcher.drainUpdates(ctx) + // Close the watcher done channel to stop sending updates because management + // server keeps sending the error updates repeatedly causing the update from + // dependency manager to be blocked. 
+ close(watcher.done) } From 09842ae3f812d4d3a8202e557c7cd7559013ee03 Mon Sep 17 00:00:00 2001 From: eshitachandwani Date: Wed, 17 Dec 2025 11:08:11 +0530 Subject: [PATCH 22/25] use has --- .../xdsdepmgr/xds_dependency_manager_test.go | 22 ++++++++++--------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/internal/xds/xdsdepmgr/xds_dependency_manager_test.go b/internal/xds/xdsdepmgr/xds_dependency_manager_test.go index 292ec7a68b91..bf8d96c64d24 100644 --- a/internal/xds/xdsdepmgr/xds_dependency_manager_test.go +++ b/internal/xds/xdsdepmgr/xds_dependency_manager_test.go @@ -21,6 +21,7 @@ package xdsdepmgr_test import ( "context" "fmt" + "hash/fnv" "regexp" "strings" "testing" @@ -123,18 +124,20 @@ func verifyError(ctx context.Context, errCh chan error, wantErr, wantNodeID stri // This function determines the stable, canonical order for any two // resolver.Endpoint structs. func lessEndpoint(a, b resolver.Endpoint) bool { - // Safely access the first address string for comparison. - addrA := "" - if len(a.Addresses) > 0 { - addrA = a.Addresses[0].Addr - } + return getHash(a) < getHash(b) +} + +func getHash(e resolver.Endpoint) uint64 { + h := fnv.New64a() - addrB := "" - if len(b.Addresses) > 0 { - addrB = b.Addresses[0].Addr + // We iterate through all addresses to ensure the hash represents + // the full endpoint identity. + for _, addr := range e.Addresses { + h.Write([]byte(addr.Addr)) + h.Write([]byte(addr.ServerName)) } - return addrA < addrB + return h.Sum64() } func verifyXDSConfig(ctx context.Context, xdsCh chan *xdsresource.XDSConfig, errCh chan error, want *xdsresource.XDSConfig) error { @@ -167,7 +170,6 @@ func verifyXDSConfig(ctx context.Context, xdsCh chan *xdsresource.XDSConfig, err cmpopts.SortSlices(lessEndpoint), } if diff := cmp.Diff(update, want, cmpOpts...); diff != "" { - fmt.Printf("eshita got config %v", update.Clusters) return fmt.Errorf("received unexpected update from dependency manager. 
Diff (-got +want):\n%v", diff) } case err := <-errCh: From 980b5295391dccbe933a84541075aa1dc8532030 Mon Sep 17 00:00:00 2001 From: Easwar Swaminathan Date: Thu, 18 Dec 2025 20:06:44 +0000 Subject: [PATCH 23/25] simplify the dns resolver implementation --- internal/xds/xdsdepmgr/watch_service.go | 86 ---------------- .../xds/xdsdepmgr/xds_dependency_manager.go | 98 +++++++++++++++++-- 2 files changed, 89 insertions(+), 95 deletions(-) diff --git a/internal/xds/xdsdepmgr/watch_service.go b/internal/xds/xdsdepmgr/watch_service.go index 807536129098..2a3bc3ab4b6c 100644 --- a/internal/xds/xdsdepmgr/watch_service.go +++ b/internal/xds/xdsdepmgr/watch_service.go @@ -18,17 +18,6 @@ package xdsdepmgr -import ( - "context" - "fmt" - "net/url" - - "google.golang.org/grpc/internal/grpcsync" - "google.golang.org/grpc/internal/xds/xdsclient/xdsresource" - "google.golang.org/grpc/resolver" - "google.golang.org/grpc/serviceconfig" -) - // xdsResourceWatcher is a generic implementation of the xdsresource.Watcher // interface. type xdsResourceWatcher[T any] struct { @@ -48,78 +37,3 @@ func (x *xdsResourceWatcher[T]) ResourceError(err error, onDone func()) { func (x *xdsResourceWatcher[T]) AmbientError(err error, onDone func()) { x.onAmbientError(err, onDone) } - -// dnsResolverState watches updates for the given DNS hostname. It implements -// resolver.ClientConn interface to work with the DNS resolver. -type dnsResolver struct { - target string - dnsR resolver.Resolver - depMgr *DependencyManager - - // serializer is used to make sure that any methods on the resolver can be - // called from inside th Build function which is a guarantee that - // implementations of resolver.Clientconn need to maintain. 
- serializer grpcsync.CallbackSerializer - serializerCancel func() -} - -type dnsResolverState struct { - resolver *dnsResolver - updateReceived bool - err error - lastUpdate *xdsresource.DNSUpdate - cancelResolver func() -} - -// dnsResolverState needs to implement resolver.ClientConn interface to receive -// updates from the real DNS resolver. -func (dr *dnsResolver) UpdateState(state resolver.State) error { - dr.serializer.TrySchedule(func(context.Context) { - dr.depMgr.onDNSUpdate(dr.target, &state) - }) - return nil -} - -func (dr *dnsResolver) ReportError(err error) { - dr.serializer.TrySchedule(func(context.Context) { - dr.depMgr.onDNSError(dr.target, err) - }) -} - -func (dr *dnsResolver) NewAddress(addresses []resolver.Address) { - dr.UpdateState(resolver.State{Addresses: addresses}) -} - -func (dr *dnsResolver) ParseServiceConfig(string) *serviceconfig.ParseResult { - return &serviceconfig.ParseResult{Err: fmt.Errorf("service config not supported")} -} - -// NewDNSResolver creates a new DNS resolver for the given target. 
-func newDNSResolver(target string, depMgr *DependencyManager) *dnsResolverState { - ctx, cancel := context.WithCancel(context.Background()) - dr := &dnsResolver{target: target, depMgr: depMgr, serializer: *grpcsync.NewCallbackSerializer(ctx), serializerCancel: cancel} - drState := &dnsResolverState{resolver: dr} - drState.cancelResolver = func() { - drState.resolver.serializerCancel() - <-drState.resolver.serializer.Done() - } - u, err := url.Parse("dns:///" + target) - if err != nil { - drState.resolver.ReportError(err) - drState.resolver.depMgr.logger.Warningf("Error while parsing DNS target %q: %v", target, drState.resolver.depMgr.annotateErrorWithNodeID(err)) - return drState - } - r, err := resolver.Get("dns").Build(resolver.Target{URL: *u}, dr, resolver.BuildOptions{}) - if err != nil { - drState.resolver.ReportError(err) - drState.resolver.depMgr.logger.Warningf("Error while building DNS resolver for target %q: %v", target, drState.resolver.depMgr.annotateErrorWithNodeID(err)) - return drState - } - drState.resolver.dnsR = r - drState.cancelResolver = func() { - drState.resolver.serializerCancel() - <-drState.resolver.serializer.Done() - r.Close() - } - return drState -} diff --git a/internal/xds/xdsdepmgr/xds_dependency_manager.go b/internal/xds/xdsdepmgr/xds_dependency_manager.go index 9cd38880b5b0..23188b4fbae0 100644 --- a/internal/xds/xdsdepmgr/xds_dependency_manager.go +++ b/internal/xds/xdsdepmgr/xds_dependency_manager.go @@ -19,14 +19,18 @@ package xdsdepmgr import ( + "context" "fmt" + "net/url" "sync" "google.golang.org/grpc/grpclog" internalgrpclog "google.golang.org/grpc/internal/grpclog" + "google.golang.org/grpc/internal/grpcsync" "google.golang.org/grpc/internal/xds/xdsclient" "google.golang.org/grpc/internal/xds/xdsclient/xdsresource" "google.golang.org/grpc/resolver" + "google.golang.org/grpc/serviceconfig" ) const prefix = "[xdsdepmgr %p] " @@ -74,6 +78,13 @@ type DependencyManager struct { dataplaneAuthority string nodeID string + // Used 
to serialize callbacks from DNS resolvers to avoid deadlocks. Since + // the resolver's Build() is called with the dependency manager lock held, + // direct callbacks to ClientConn (which also require that lock) would + // deadlock. + dnsSerializer *grpcsync.CallbackSerializer + dnsSerializerCancel func() + // All the fields below are protected by mu. mu sync.Mutex stopped bool @@ -86,7 +97,7 @@ type DependencyManager struct { clustersFromRouteConfig map[string]bool clusterWatchers map[string]*clusterWatcherState endpointWatchers map[string]*endpointWatcherState - dnsResolvers map[string]*dnsResolverState + dnsResolvers map[string]*dnsResourceState } // New creates a new DependencyManager. @@ -99,15 +110,18 @@ type DependencyManager struct { // - watcher is the ConfigWatcher interface that will receive the aggregated // XDSConfig updates and errors. func New(listenerName, dataplaneAuthority string, xdsClient xdsclient.XDSClient, watcher ConfigWatcher) *DependencyManager { + ctx, cancel := context.WithCancel(context.Background()) dm := &DependencyManager{ ldsResourceName: listenerName, dataplaneAuthority: dataplaneAuthority, xdsClient: xdsClient, watcher: watcher, nodeID: xdsClient.BootstrapConfig().Node().GetId(), + dnsSerializer: grpcsync.NewCallbackSerializer(ctx), + dnsSerializerCancel: cancel, clustersFromRouteConfig: make(map[string]bool), endpointWatchers: make(map[string]*endpointWatcherState), - dnsResolvers: make(map[string]*dnsResolverState), + dnsResolvers: make(map[string]*dnsResourceState), clusterWatchers: make(map[string]*clusterWatcherState), } dm.logger = prefixLogger(dm) @@ -155,8 +169,11 @@ func (m *DependencyManager) Close() { delete(m.endpointWatchers, name) } + // We cannot wait for the dns serializer to finish here, as the callbacks + // try to grab the dependency manager lock, which is already held here. 
+ m.dnsSerializerCancel() for name, dnsResolver := range m.dnsResolvers { - dnsResolver.cancelResolver() + dnsResolver.stopResolver() delete(m.dnsResolvers, name) } } @@ -212,7 +229,7 @@ func (m *DependencyManager) maybeSendUpdateLocked() { } for name, dr := range m.dnsResolvers { if _, ok := dnsResourcesSeen[name]; !ok { - dr.cancelResolver() + dr.stopResolver() delete(m.dnsResolvers, name) } } @@ -342,7 +359,7 @@ func (m *DependencyManager) populateLogicalDNSClusterLocked(clusterName string, // If dns resolver does not exist, create one. if _, ok := m.dnsResolvers[target]; !ok { - m.dnsResolvers[target] = newDNSResolver(target, m) + m.dnsResolvers[target] = m.newDNSResolver(target) return false, nil, nil } dnsState := m.dnsResolvers[target] @@ -354,7 +371,7 @@ func (m *DependencyManager) populateLogicalDNSClusterLocked(clusterName string, clusterConfigs[clusterName].Config.EndpointConfig = &xdsresource.EndpointConfig{ DNSEndpoints: dnsState.lastUpdate, - ResolutionNote: dnsState.err, + ResolutionNote: dnsState.lastErr, } return true, []string{clusterName}, nil } @@ -741,7 +758,7 @@ func (m *DependencyManager) onDNSUpdate(resourceName string, update *resolver.St state := m.dnsResolvers[resourceName] state.lastUpdate = &xdsresource.DNSUpdate{Endpoints: endpoints} state.updateReceived = true - state.err = nil + state.lastErr = nil m.maybeSendUpdateLocked() } @@ -766,7 +783,7 @@ func (m *DependencyManager) onDNSError(resourceName string, err error) { // will continue to do so. Also suppress errors if we previously received an // error, since there will be no downstream effects of propagating this // error. - state.err = err + state.lastErr = err if !state.updateReceived { state.lastUpdate = nil state.updateReceived = true @@ -778,6 +795,69 @@ func (m *DependencyManager) onDNSError(resourceName string, err error) { // RequestDNSReresolution calls all the the DNS resolver's ResolveNow. 
func (m *DependencyManager) RequestDNSReresolution(opt resolver.ResolveNowOptions) { for _, res := range m.dnsResolvers { - res.resolver.dnsR.ResolveNow(opt) + res.dnsR.ResolveNow(opt) + } +} + +type dnsResourceState struct { + dnsR resolver.Resolver + lastUpdate *xdsresource.DNSUpdate + lastErr error + updateReceived bool + stopResolver func() +} + +type resolverClientConn struct { + target string + depMgr *DependencyManager +} + +func (rcc *resolverClientConn) UpdateState(state resolver.State) error { + rcc.depMgr.dnsSerializer.TrySchedule(func(context.Context) { + rcc.depMgr.onDNSUpdate(rcc.target, &state) + }) + return nil +} + +func (rcc *resolverClientConn) ReportError(err error) { + rcc.depMgr.dnsSerializer.TrySchedule(func(context.Context) { + rcc.depMgr.onDNSError(rcc.target, err) + }) +} + +func (rcc *resolverClientConn) NewAddress(addresses []resolver.Address) { + rcc.UpdateState(resolver.State{Addresses: addresses}) +} + +func (rcc *resolverClientConn) ParseServiceConfig(string) *serviceconfig.ParseResult { + return &serviceconfig.ParseResult{Err: fmt.Errorf("service config not supported")} +} + +func (m *DependencyManager) newDNSResolver(target string) *dnsResourceState { + u, err := url.Parse("dns:///" + target) + if err != nil { + err := fmt.Errorf("failed to parse DNS target %q: %v", target, m.annotateErrorWithNodeID(err)) + m.logger.Warningf("%v", err) + return &dnsResourceState{ + lastErr: err, + updateReceived: true, + } + } + + rcc := &resolverClientConn{ + target: target, + depMgr: m, + } + r, err := resolver.Get("dns").Build(resolver.Target{URL: *u}, rcc, resolver.BuildOptions{}) + if err != nil { + rcc.ReportError(err) + err := fmt.Errorf("failed to build DNS resolver for target %q: %v", target, m.annotateErrorWithNodeID(err)) + m.logger.Warningf("%v", err) + return nil + } + + return &dnsResourceState{ + dnsR: r, + stopResolver: r.Close, } } From d8ebaaf4bdf184bee0dfc16260c2a0a955b487d8 Mon Sep 17 00:00:00 2001 From: Easwar Swaminathan Date: 
Thu, 18 Dec 2025 21:18:54 +0000 Subject: [PATCH 24/25] more generics --- internal/xds/xdsdepmgr/watch_service.go | 39 --- .../xds/xdsdepmgr/xds_dependency_manager.go | 246 ++++++++++-------- 2 files changed, 133 insertions(+), 152 deletions(-) delete mode 100644 internal/xds/xdsdepmgr/watch_service.go diff --git a/internal/xds/xdsdepmgr/watch_service.go b/internal/xds/xdsdepmgr/watch_service.go deleted file mode 100644 index 2a3bc3ab4b6c..000000000000 --- a/internal/xds/xdsdepmgr/watch_service.go +++ /dev/null @@ -1,39 +0,0 @@ -/* - * - * Copyright 2025 gRPC authors. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - */ - -package xdsdepmgr - -// xdsResourceWatcher is a generic implementation of the xdsresource.Watcher -// interface. 
-type xdsResourceWatcher[T any] struct { - onUpdate func(*T, func()) - onError func(error, func()) - onAmbientError func(error, func()) -} - -func (x *xdsResourceWatcher[T]) ResourceChanged(update *T, onDone func()) { - x.onUpdate(update, onDone) -} - -func (x *xdsResourceWatcher[T]) ResourceError(err error, onDone func()) { - x.onError(err, onDone) -} - -func (x *xdsResourceWatcher[T]) AmbientError(err error, onDone func()) { - x.onAmbientError(err, onDone) -} diff --git a/internal/xds/xdsdepmgr/xds_dependency_manager.go b/internal/xds/xdsdepmgr/xds_dependency_manager.go index 23188b4fbae0..f91a2145a3ea 100644 --- a/internal/xds/xdsdepmgr/xds_dependency_manager.go +++ b/internal/xds/xdsdepmgr/xds_dependency_manager.go @@ -65,6 +65,40 @@ type ConfigWatcher interface { Error(error) } +// xdsResourceState is a generic struct to hold the state of a watched xDS +// resource. +type xdsResourceState[T any, U any] struct { + lastUpdate *T + lastErr error + updateReceived bool + stop func() + extras U // to store any additional state specific to the watcher +} + +func (x *xdsResourceState[T, U]) setLastUpdate(update *T) { + x.lastUpdate = update + x.updateReceived = true + x.lastErr = nil +} + +func (x *xdsResourceState[T, U]) setLastError(err error) { + x.lastErr = err + x.updateReceived = true + x.lastUpdate = nil +} + +func (x *xdsResourceState[T, U]) updateLastError(err error) { + x.lastErr = err +} + +type dnsExtras struct { + dnsR resolver.Resolver +} + +type routeExtras struct { + virtualHost *xdsresource.VirtualHost +} + // DependencyManager registers watches on the xDS client for all required xDS // resources, resolves dependencies between them, and returns a complete // configuration to the xDS resolver. @@ -88,16 +122,13 @@ type DependencyManager struct { // All the fields below are protected by mu. 
mu sync.Mutex stopped bool - listenerCancel func() - currentListenerUpdate *xdsresource.ListenerUpdate - routeConfigCancel func() + listenerWatcher *xdsResourceState[xdsresource.ListenerUpdate, struct{}] rdsResourceName string - currentRouteConfig *xdsresource.RouteConfigUpdate - currentVirtualHost *xdsresource.VirtualHost + routeConfigWatcher *xdsResourceState[xdsresource.RouteConfigUpdate, routeExtras] clustersFromRouteConfig map[string]bool - clusterWatchers map[string]*clusterWatcherState - endpointWatchers map[string]*endpointWatcherState - dnsResolvers map[string]*dnsResourceState + clusterWatchers map[string]*xdsResourceState[xdsresource.ClusterUpdate, struct{}] + endpointWatchers map[string]*xdsResourceState[xdsresource.EndpointsUpdate, struct{}] + dnsResolvers map[string]*xdsResourceState[xdsresource.DNSUpdate, dnsExtras] } // New creates a new DependencyManager. @@ -120,14 +151,15 @@ func New(listenerName, dataplaneAuthority string, xdsClient xdsclient.XDSClient, dnsSerializer: grpcsync.NewCallbackSerializer(ctx), dnsSerializerCancel: cancel, clustersFromRouteConfig: make(map[string]bool), - endpointWatchers: make(map[string]*endpointWatcherState), - dnsResolvers: make(map[string]*dnsResourceState), - clusterWatchers: make(map[string]*clusterWatcherState), + endpointWatchers: make(map[string]*xdsResourceState[xdsresource.EndpointsUpdate, struct{}]), + dnsResolvers: make(map[string]*xdsResourceState[xdsresource.DNSUpdate, dnsExtras]), + clusterWatchers: make(map[string]*xdsResourceState[xdsresource.ClusterUpdate, struct{}]), } dm.logger = prefixLogger(dm) // Start the listener watch. Listener watch will start the other resource // watches as needed. 
+ dm.listenerWatcher = &xdsResourceState[xdsresource.ListenerUpdate, struct{}]{} lw := &xdsResourceWatcher[xdsresource.ListenerUpdate]{ onUpdate: func(update *xdsresource.ListenerUpdate, onDone func()) { dm.onListenerResourceUpdate(update, onDone) @@ -139,7 +171,7 @@ func New(listenerName, dataplaneAuthority string, xdsClient xdsclient.XDSClient, dm.onListenerResourceAmbientError(err, onDone) }, } - dm.listenerCancel = xdsresource.WatchListener(dm.xdsClient, listenerName, lw) + dm.listenerWatcher.stop = xdsresource.WatchListener(dm.xdsClient, listenerName, lw) return dm } @@ -153,19 +185,17 @@ func (m *DependencyManager) Close() { } m.stopped = true - if m.listenerCancel != nil { - m.listenerCancel() - } - if m.routeConfigCancel != nil { - m.routeConfigCancel() + m.listenerWatcher.stop() + if m.routeConfigWatcher != nil { + m.routeConfigWatcher.stop() } for name, cluster := range m.clusterWatchers { - cluster.cancelWatch() + cluster.stop() delete(m.clusterWatchers, name) } for name, endpoint := range m.endpointWatchers { - endpoint.cancelWatch() + endpoint.stop() delete(m.endpointWatchers, name) } @@ -173,7 +203,7 @@ func (m *DependencyManager) Close() { // try to grab the dependency manager lock, which is already held here. m.dnsSerializerCancel() for name, dnsResolver := range m.dnsResolvers { - dnsResolver.stopResolver() + dnsResolver.stop() delete(m.dnsResolvers, name) } } @@ -187,13 +217,13 @@ func (m *DependencyManager) annotateErrorWithNodeID(err error) error { // maybeSendUpdateLocked verifies that all expected resources have been // received, and if so, delivers the complete xDS configuration to the watcher. 
func (m *DependencyManager) maybeSendUpdateLocked() { - if m.currentListenerUpdate == nil || m.currentRouteConfig == nil { + if m.listenerWatcher.lastUpdate == nil || m.routeConfigWatcher == nil || m.routeConfigWatcher.lastUpdate == nil { return } config := &xdsresource.XDSConfig{ - Listener: m.currentListenerUpdate, - RouteConfig: m.currentRouteConfig, - VirtualHost: m.currentVirtualHost, + Listener: m.listenerWatcher.lastUpdate, + RouteConfig: m.routeConfigWatcher.lastUpdate, + VirtualHost: m.routeConfigWatcher.extras.virtualHost, Clusters: make(map[string]*xdsresource.ClusterResult), } @@ -223,19 +253,19 @@ func (m *DependencyManager) maybeSendUpdateLocked() { // Cancel resources not seen in the tree. for name, ep := range m.endpointWatchers { if _, ok := edsResourcesSeen[name]; !ok { - ep.cancelWatch() + ep.stop() delete(m.endpointWatchers, name) } } for name, dr := range m.dnsResolvers { if _, ok := dnsResourcesSeen[name]; !ok { - dr.stopResolver() + dr.stop() delete(m.dnsResolvers, name) } } for name, cluster := range m.clusterWatchers { if _, ok := clusterResourcesSeen[name]; !ok { - cluster.cancelWatch() + cluster.stop() delete(m.clusterWatchers, name) } } @@ -359,7 +389,11 @@ func (m *DependencyManager) populateLogicalDNSClusterLocked(clusterName string, // If dns resolver does not exist, create one. 
if _, ok := m.dnsResolvers[target]; !ok { - m.dnsResolvers[target] = m.newDNSResolver(target) + state := m.newDNSResolver(target) + if state == nil { + return false, nil, nil + } + m.dnsResolvers[target] = state return false, nil, nil } dnsState := m.dnsResolvers[target] @@ -406,11 +440,13 @@ func (m *DependencyManager) populateAggregateClusterLocked(clusterName string, u func (m *DependencyManager) applyRouteConfigUpdateLocked(update *xdsresource.RouteConfigUpdate) { matchVH := xdsresource.FindBestMatchingVirtualHost(m.dataplaneAuthority, update.VirtualHosts) if matchVH == nil { - m.watcher.Error(m.annotateErrorWithNodeID(fmt.Errorf("could not find VirtualHost for %q", m.dataplaneAuthority))) + err := m.annotateErrorWithNodeID(fmt.Errorf("could not find VirtualHost for %q", m.dataplaneAuthority)) + m.routeConfigWatcher.setLastError(err) + m.watcher.Error(err) return } - m.currentRouteConfig = update - m.currentVirtualHost = matchVH + m.routeConfigWatcher.setLastUpdate(update) + m.routeConfigWatcher.extras.virtualHost = matchVH if EnableClusterAndEndpointsWatch { // Get the clusters to be watched from the routes in the virtual host. @@ -426,7 +462,7 @@ func (m *DependencyManager) applyRouteConfigUpdateLocked(update *xdsresource.Rou // Cancel watch for clusters not seen in route config for name := range m.clustersFromRouteConfig { if _, ok := newClusters[name]; !ok { - m.clusterWatchers[name].cancelWatch() + m.clusterWatchers[name].stop() delete(m.clusterWatchers, name) } } @@ -451,16 +487,16 @@ func (m *DependencyManager) onListenerResourceUpdate(update *xdsresource.Listene m.logger.Infof("Received update for Listener resource %q: %+v", m.ldsResourceName, update) } - m.currentListenerUpdate = update + m.listenerWatcher.setLastUpdate(update) if update.InlineRouteConfig != nil { // If there was a previous route config watcher because of a non-inline // route configuration, cancel it. 
m.rdsResourceName = "" - if m.routeConfigCancel != nil { - m.routeConfigCancel() - m.routeConfigCancel = nil + if m.routeConfigWatcher != nil { + m.routeConfigWatcher.stop() } + m.routeConfigWatcher = &xdsResourceState[xdsresource.RouteConfigUpdate, routeExtras]{stop: func() {}} m.applyRouteConfigUpdateLocked(update.InlineRouteConfig) return } @@ -478,10 +514,9 @@ func (m *DependencyManager) onListenerResourceUpdate(update *xdsresource.Listene // resolved, no update is sent to the channel, and therefore the old route // configuration (if received) is used until the new one is received. m.rdsResourceName = update.RouteConfigName - if m.routeConfigCancel != nil { - m.routeConfigCancel() + if m.routeConfigWatcher != nil { + m.routeConfigWatcher.stop() } - rw := &xdsResourceWatcher[xdsresource.RouteConfigUpdate]{ onUpdate: func(update *xdsresource.RouteConfigUpdate, onDone func()) { m.onRouteConfigResourceUpdate(m.rdsResourceName, update, onDone) @@ -493,8 +528,13 @@ func (m *DependencyManager) onListenerResourceUpdate(update *xdsresource.Listene m.onRouteConfigResourceAmbientError(m.rdsResourceName, err, onDone) }, } - m.routeConfigCancel = xdsresource.WatchRouteConfig(m.xdsClient, m.rdsResourceName, rw) - + if m.routeConfigWatcher != nil { + m.routeConfigWatcher.stop = xdsresource.WatchRouteConfig(m.xdsClient, m.rdsResourceName, rw) + } else { + m.routeConfigWatcher = &xdsResourceState[xdsresource.RouteConfigUpdate, routeExtras]{ + stop: xdsresource.WatchRouteConfig(m.xdsClient, m.rdsResourceName, rw), + } + } } func (m *DependencyManager) onListenerResourceError(err error, onDone func()) { @@ -508,12 +548,12 @@ func (m *DependencyManager) onListenerResourceError(err error, onDone func()) { m.logger.Warningf("Received resource error for Listener resource %q: %v", m.ldsResourceName, m.annotateErrorWithNodeID(err)) - if m.routeConfigCancel != nil { - m.routeConfigCancel() + if m.routeConfigWatcher != nil { + m.routeConfigWatcher.stop() } - m.currentListenerUpdate = nil 
+ m.listenerWatcher.setLastError(err) m.rdsResourceName = "" - m.routeConfigCancel = nil + m.routeConfigWatcher = nil m.watcher.Error(fmt.Errorf("listener resource error: %v", m.annotateErrorWithNodeID(err))) } @@ -556,7 +596,7 @@ func (m *DependencyManager) onRouteConfigResourceError(resourceName string, err if m.stopped || m.rdsResourceName != resourceName { return } - m.currentRouteConfig = nil + m.routeConfigWatcher.setLastError(err) m.logger.Warningf("Received resource error for RouteConfiguration resource %q: %v", resourceName, m.annotateErrorWithNodeID(err)) m.watcher.Error(fmt.Errorf("route resource error: %v", m.annotateErrorWithNodeID(err))) } @@ -577,14 +617,7 @@ func (m *DependencyManager) onRouteConfigResourceAmbientError(resourceName strin m.logger.Warningf("Route resource ambient error %q: %v", resourceName, m.annotateErrorWithNodeID(err)) } -type clusterWatcherState struct { - cancelWatch func() - lastUpdate *xdsresource.ClusterUpdate - lastErr error - updateReceived bool -} - -func newClusterWatcher(resourceName string, depMgr *DependencyManager) *clusterWatcherState { +func newClusterWatcher(resourceName string, depMgr *DependencyManager) *xdsResourceState[xdsresource.ClusterUpdate, struct{}] { w := &xdsResourceWatcher[xdsresource.ClusterUpdate]{ onUpdate: func(u *xdsresource.ClusterUpdate, onDone func()) { depMgr.onClusterResourceUpdate(resourceName, u, onDone) @@ -596,8 +629,8 @@ func newClusterWatcher(resourceName string, depMgr *DependencyManager) *clusterW depMgr.onClusterAmbientError(resourceName, err, onDone) }, } - return &clusterWatcherState{ - cancelWatch: xdsresource.WatchCluster(depMgr.xdsClient, resourceName, w), + return &xdsResourceState[xdsresource.ClusterUpdate, struct{}]{ + stop: xdsresource.WatchCluster(depMgr.xdsClient, resourceName, w), } } @@ -614,12 +647,8 @@ func (m *DependencyManager) onClusterResourceUpdate(resourceName string, update if m.logger.V(2) { m.logger.Infof("Received update for Cluster resource %q: %+v", 
resourceName, update) } - state := m.clusterWatchers[resourceName] - state.lastUpdate = update - state.lastErr = nil - state.updateReceived = true + m.clusterWatchers[resourceName].setLastUpdate(update) m.maybeSendUpdateLocked() - } // Records a resource error for a Cluster resource, clears the last successful @@ -633,10 +662,7 @@ func (m *DependencyManager) onClusterResourceError(resourceName string, err erro return } m.logger.Warningf("Received resource error for Cluster resource %q: %v", resourceName, m.annotateErrorWithNodeID(err)) - state := m.clusterWatchers[resourceName] - state.lastErr = err - state.lastUpdate = nil - state.updateReceived = true + m.clusterWatchers[resourceName].setLastError(err) m.maybeSendUpdateLocked() } @@ -653,14 +679,7 @@ func (m *DependencyManager) onClusterAmbientError(resourceName string, err error m.logger.Warningf("Cluster resource ambient error %q: %v", resourceName, m.annotateErrorWithNodeID(err)) } -type endpointWatcherState struct { - cancelWatch func() - lastUpdate *xdsresource.EndpointsUpdate - lastErr error - updateReceived bool -} - -func newEndpointWatcher(resourceName string, depMgr *DependencyManager) *endpointWatcherState { +func newEndpointWatcher(resourceName string, depMgr *DependencyManager) *xdsResourceState[xdsresource.EndpointsUpdate, struct{}] { w := &xdsResourceWatcher[xdsresource.EndpointsUpdate]{ onUpdate: func(u *xdsresource.EndpointsUpdate, onDone func()) { depMgr.onEndpointUpdate(resourceName, u, onDone) @@ -672,8 +691,8 @@ func newEndpointWatcher(resourceName string, depMgr *DependencyManager) *endpoin depMgr.onEndpointAmbientError(resourceName, err, onDone) }, } - return &endpointWatcherState{ - cancelWatch: xdsresource.WatchEndpoints(depMgr.xdsClient, resourceName, w), + return &xdsResourceState[xdsresource.EndpointsUpdate, struct{}]{ + stop: xdsresource.WatchEndpoints(depMgr.xdsClient, resourceName, w), } } @@ -691,10 +710,7 @@ func (m *DependencyManager) onEndpointUpdate(resourceName string, update 
*xdsres if m.logger.V(2) { m.logger.Infof("Received update for Endpoint resource %q: %+v", resourceName, update) } - state := m.endpointWatchers[resourceName] - state.lastUpdate = update - state.lastErr = nil - state.updateReceived = true + m.endpointWatchers[resourceName].setLastUpdate(update) m.maybeSendUpdateLocked() } @@ -709,10 +725,7 @@ func (m *DependencyManager) onEndpointResourceError(resourceName string, err err return } m.logger.Warningf("Received resource error for Endpoint resource %q: %v", resourceName, m.annotateErrorWithNodeID(err)) - state := m.endpointWatchers[resourceName] - state.lastErr = err - state.lastUpdate = nil - state.updateReceived = true + m.endpointWatchers[resourceName].setLastError(err) m.maybeSendUpdateLocked() } @@ -728,8 +741,7 @@ func (m *DependencyManager) onEndpointAmbientError(resourceName string, err erro } m.logger.Warningf("Endpoint resource ambient error %q: %v", resourceName, m.annotateErrorWithNodeID(err)) - state := m.endpointWatchers[resourceName] - state.lastErr = err + m.endpointWatchers[resourceName].updateLastError(err) m.maybeSendUpdateLocked() } @@ -755,10 +767,7 @@ func (m *DependencyManager) onDNSUpdate(resourceName string, update *resolver.St } } - state := m.dnsResolvers[resourceName] - state.lastUpdate = &xdsresource.DNSUpdate{Endpoints: endpoints} - state.updateReceived = true - state.lastErr = nil + m.dnsResolvers[resourceName].setLastUpdate(&xdsresource.DNSUpdate{Endpoints: endpoints}) m.maybeSendUpdateLocked() } @@ -777,36 +786,27 @@ func (m *DependencyManager) onDNSError(resourceName string, err error) { return } + err = fmt.Errorf("dns resolver error for target %q: %v", resourceName, m.annotateErrorWithNodeID(err)) + m.logger.Warningf("%v", err) state := m.dnsResolvers[resourceName] - // If a previous good update was received, record the error and continue - // using the previous update. If RPCs were succeeding prior to this, they - // will continue to do so. 
Also suppress errors if we previously received an - // error, since there will be no downstream effects of propagating this - // error. - state.lastErr = err - if !state.updateReceived { - state.lastUpdate = nil - state.updateReceived = true + if state.updateReceived { + state.updateLastError(err) + return } - m.logger.Warningf("DNS resolver error %q: %v", resourceName, m.annotateErrorWithNodeID(err)) + + state.setLastError(err) m.maybeSendUpdateLocked() } // RequestDNSReresolution calls all the the DNS resolver's ResolveNow. func (m *DependencyManager) RequestDNSReresolution(opt resolver.ResolveNowOptions) { for _, res := range m.dnsResolvers { - res.dnsR.ResolveNow(opt) + if res.extras.dnsR != nil { + res.extras.dnsR.ResolveNow(opt) + } } } -type dnsResourceState struct { - dnsR resolver.Resolver - lastUpdate *xdsresource.DNSUpdate - lastErr error - updateReceived bool - stopResolver func() -} - type resolverClientConn struct { target string depMgr *DependencyManager @@ -833,12 +833,12 @@ func (rcc *resolverClientConn) ParseServiceConfig(string) *serviceconfig.ParseRe return &serviceconfig.ParseResult{Err: fmt.Errorf("service config not supported")} } -func (m *DependencyManager) newDNSResolver(target string) *dnsResourceState { +func (m *DependencyManager) newDNSResolver(target string) *xdsResourceState[xdsresource.DNSUpdate, dnsExtras] { u, err := url.Parse("dns:///" + target) if err != nil { err := fmt.Errorf("failed to parse DNS target %q: %v", target, m.annotateErrorWithNodeID(err)) m.logger.Warningf("%v", err) - return &dnsResourceState{ + return &xdsResourceState[xdsresource.DNSUpdate, dnsExtras]{ lastErr: err, updateReceived: true, } @@ -856,8 +856,28 @@ func (m *DependencyManager) newDNSResolver(target string) *dnsResourceState { return nil } - return &dnsResourceState{ - dnsR: r, - stopResolver: r.Close, + return &xdsResourceState[xdsresource.DNSUpdate, dnsExtras]{ + extras: dnsExtras{dnsR: r}, + stop: r.Close, } } + +// xdsResourceWatcher is a generic 
implementation of the xdsresource.Watcher +// interface. +type xdsResourceWatcher[T any] struct { + onUpdate func(*T, func()) + onError func(error, func()) + onAmbientError func(error, func()) +} + +func (x *xdsResourceWatcher[T]) ResourceChanged(update *T, onDone func()) { + x.onUpdate(update, onDone) +} + +func (x *xdsResourceWatcher[T]) ResourceError(err error, onDone func()) { + x.onError(err, onDone) +} + +func (x *xdsResourceWatcher[T]) AmbientError(err error, onDone func()) { + x.onAmbientError(err, onDone) +} From cd5d9906517d4b62281cd3622c4498b38c69a858 Mon Sep 17 00:00:00 2001 From: eshitachandwani Date: Fri, 19 Dec 2025 13:31:37 +0530 Subject: [PATCH 25/25] changes --- internal/xds/resolver/watch_service_test.go | 10 +++++++--- internal/xds/resolver/xds_resolver.go | 2 +- internal/xds/xdsdepmgr/xds_dependency_manager.go | 16 ++++++++-------- .../xds/xdsdepmgr/xds_dependency_manager_test.go | 4 ++-- 4 files changed, 18 insertions(+), 14 deletions(-) diff --git a/internal/xds/resolver/watch_service_test.go b/internal/xds/resolver/watch_service_test.go index 831de0b31f0a..6eec8a2b7429 100644 --- a/internal/xds/resolver/watch_service_test.go +++ b/internal/xds/resolver/watch_service_test.go @@ -68,8 +68,10 @@ func (s) TestServiceWatch_ListenerPointsToNewRouteConfiguration(t *testing.T) { verifyUpdateFromResolver(ctx, t, stateCh, wantServiceConfig(resources.Clusters[0].Name)) // Update the listener resource to point to a new route configuration name. - // Leave the old route configuration resource unchanged to prevent the race - // between route resource error for resource removed and listener update. + // The old route configuration resource is left unchanged to prevent a race: + // if it were removed immediately, the resolver might encounter a + // "resource-not-found" error for the old route before the listener update + // successfully transitions the client to the new route. 
newTestRouteConfigName := defaultTestRouteConfigName + "-new" resources.Listeners = []*v3listenerpb.Listener{e2e.DefaultClientListener(defaultTestServiceName, newTestRouteConfigName)} resources.SkipValidation = true @@ -92,7 +94,9 @@ func (s) TestServiceWatch_ListenerPointsToNewRouteConfiguration(t *testing.T) { }) mgmtServer.Update(ctx, resources) - // Wait for no update from the resolver. + // Wait for no update from the resolver since the listener resource no + // longer points to the old route resource and new route resource hasn't + // been sent yet. verifyNoUpdateFromResolver(ctx, t, stateCh) // Update the management server with the new route configuration resource. diff --git a/internal/xds/resolver/xds_resolver.go b/internal/xds/resolver/xds_resolver.go index b750f6661abd..a38b292b006d 100644 --- a/internal/xds/resolver/xds_resolver.go +++ b/internal/xds/resolver/xds_resolver.go @@ -230,7 +230,7 @@ type xdsResolver struct { curConfigSelector stoppableConfigSelector } -// ResolveNow calls ResolveNow on the dependency manager. +// ResolveNow calls RequestDNSReresolution on the dependency manager. func (r *xdsResolver) ResolveNow(opts resolver.ResolveNowOptions) { if r.dm != nil { r.dm.RequestDNSReresolution(opts) diff --git a/internal/xds/xdsdepmgr/xds_dependency_manager.go b/internal/xds/xdsdepmgr/xds_dependency_manager.go index f91a2145a3ea..266516bd27d8 100644 --- a/internal/xds/xdsdepmgr/xds_dependency_manager.go +++ b/internal/xds/xdsdepmgr/xds_dependency_manager.go @@ -800,6 +800,8 @@ func (m *DependencyManager) onDNSError(resourceName string, err error) { // RequestDNSReresolution calls all the the DNS resolver's ResolveNow. 
func (m *DependencyManager) RequestDNSReresolution(opt resolver.ResolveNowOptions) { + m.mu.Lock() + defer m.mu.Unlock() for _, res := range m.dnsResolvers { if res.extras.dnsR != nil { res.extras.dnsR.ResolveNow(opt) @@ -834,20 +836,18 @@ func (rcc *resolverClientConn) ParseServiceConfig(string) *serviceconfig.ParseRe } func (m *DependencyManager) newDNSResolver(target string) *xdsResourceState[xdsresource.DNSUpdate, dnsExtras] { + rcc := &resolverClientConn{ + target: target, + depMgr: m, + } u, err := url.Parse("dns:///" + target) if err != nil { err := fmt.Errorf("failed to parse DNS target %q: %v", target, m.annotateErrorWithNodeID(err)) m.logger.Warningf("%v", err) - return &xdsResourceState[xdsresource.DNSUpdate, dnsExtras]{ - lastErr: err, - updateReceived: true, - } + rcc.ReportError(err) + return &xdsResourceState[xdsresource.DNSUpdate, dnsExtras]{} } - rcc := &resolverClientConn{ - target: target, - depMgr: m, - } r, err := resolver.Get("dns").Build(resolver.Target{URL: *u}, rcc, resolver.BuildOptions{}) if err != nil { rcc.ReportError(err) diff --git a/internal/xds/xdsdepmgr/xds_dependency_manager_test.go b/internal/xds/xdsdepmgr/xds_dependency_manager_test.go index bf8d96c64d24..8cfd1aa924ec 100644 --- a/internal/xds/xdsdepmgr/xds_dependency_manager_test.go +++ b/internal/xds/xdsdepmgr/xds_dependency_manager_test.go @@ -21,12 +21,12 @@ package xdsdepmgr_test import ( "context" "fmt" - "hash/fnv" "regexp" "strings" "testing" "time" + xxhash "github.com/cespare/xxhash/v2" "github.com/envoyproxy/go-control-plane/pkg/wellknown" "github.com/google/go-cmp/cmp" "github.com/google/go-cmp/cmp/cmpopts" @@ -128,7 +128,7 @@ func lessEndpoint(a, b resolver.Endpoint) bool { } func getHash(e resolver.Endpoint) uint64 { - h := fnv.New64a() + h := xxhash.New() // We iterate through all addresses to ensure the hash represents // the full endpoint identity.