-
Notifications
You must be signed in to change notification settings - Fork 886
Gracefully remove LB endpoints from services #2112
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -638,14 +638,15 @@ func (ep *endpoint) addServiceInfoToCluster(sb *sandbox) error { | |
| } | ||
|
|
||
| buf, err := proto.Marshal(&EndpointRecord{ | ||
| Name: name, | ||
| ServiceName: ep.svcName, | ||
| ServiceID: ep.svcID, | ||
| VirtualIP: ep.virtualIP.String(), | ||
| IngressPorts: ingressPorts, | ||
| Aliases: ep.svcAliases, | ||
| TaskAliases: ep.myAliases, | ||
| EndpointIP: ep.Iface().Address().IP.String(), | ||
| Name: name, | ||
| ServiceName: ep.svcName, | ||
| ServiceID: ep.svcID, | ||
| VirtualIP: ep.virtualIP.String(), | ||
| IngressPorts: ingressPorts, | ||
| Aliases: ep.svcAliases, | ||
| TaskAliases: ep.myAliases, | ||
| EndpointIP: ep.Iface().Address().IP.String(), | ||
| ServiceDisabled: false, | ||
| }) | ||
| if err != nil { | ||
| return err | ||
|
|
@@ -663,7 +664,7 @@ func (ep *endpoint) addServiceInfoToCluster(sb *sandbox) error { | |
| return nil | ||
| } | ||
|
|
||
| func (ep *endpoint) deleteServiceInfoFromCluster(sb *sandbox, method string) error { | ||
| func (ep *endpoint) deleteServiceInfoFromCluster(sb *sandbox, fullRemove bool, method string) error { | ||
| if ep.isAnonymous() && len(ep.myAliases) == 0 { | ||
| return nil | ||
| } | ||
|
|
@@ -677,6 +678,15 @@ func (ep *endpoint) deleteServiceInfoFromCluster(sb *sandbox, method string) err | |
| defer sb.Service.Unlock() | ||
| logrus.Debugf("deleteServiceInfoFromCluster from %s START for %s %s", method, ep.svcName, ep.ID()) | ||
|
|
||
| // Avoid a race with a container that aborts preemptively. This would | ||
| // get caught in disableServiceInNetworkDB, but we check here to make the | ||
| // nature of the condition more clear. | ||
| // See comment in addServiceInfoToCluster() | ||
| if e := sb.getEndpoint(ep.ID()); e == nil { | ||
| logrus.Warnf("deleteServiceInfoFromCluster suppressing service resolution ep is not anymore in the sandbox %s", ep.ID()) | ||
| return nil | ||
| } | ||
|
|
||
| c := n.getController() | ||
| agent := c.getAgent() | ||
|
|
||
|
|
@@ -686,9 +696,13 @@ func (ep *endpoint) deleteServiceInfoFromCluster(sb *sandbox, method string) err | |
| } | ||
|
|
||
| if agent != nil { | ||
| // First delete from networkDB then locally | ||
| if err := agent.networkDB.DeleteEntry(libnetworkEPTable, n.ID(), ep.ID()); err != nil { | ||
| logrus.Warnf("deleteServiceInfoFromCluster NetworkDB DeleteEntry failed for %s %s err:%s", ep.id, n.id, err) | ||
| // First update the networkDB then locally | ||
| if fullRemove { | ||
| if err := agent.networkDB.DeleteEntry(libnetworkEPTable, n.ID(), ep.ID()); err != nil { | ||
| logrus.Warnf("deleteServiceInfoFromCluster NetworkDB DeleteEntry failed for %s %s err:%s", ep.id, n.id, err) | ||
| } | ||
| } else { | ||
| disableServiceInNetworkDB(agent, n, ep) | ||
| } | ||
| } | ||
|
|
||
|
|
@@ -699,7 +713,7 @@ func (ep *endpoint) deleteServiceInfoFromCluster(sb *sandbox, method string) err | |
| if n.ingress { | ||
| ingressPorts = ep.ingressPorts | ||
| } | ||
| if err := c.rmServiceBinding(ep.svcName, ep.svcID, n.ID(), ep.ID(), name, ep.virtualIP, ingressPorts, ep.svcAliases, ep.myAliases, ep.Iface().Address().IP, "deleteServiceInfoFromCluster", true); err != nil { | ||
| if err := c.rmServiceBinding(ep.svcName, ep.svcID, n.ID(), ep.ID(), name, ep.virtualIP, ingressPorts, ep.svcAliases, ep.myAliases, ep.Iface().Address().IP, "deleteServiceInfoFromCluster", true, fullRemove); err != nil { | ||
| return err | ||
| } | ||
| } else { | ||
|
|
@@ -715,6 +729,35 @@ func (ep *endpoint) deleteServiceInfoFromCluster(sb *sandbox, method string) err | |
| return nil | ||
| } | ||
|
|
||
| func disableServiceInNetworkDB(a *agent, n *network, ep *endpoint) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. disableServiceInNetworkDB - API handles many error conditions. But we are not propagating to the caller like deleteServiceInfoFromCluster. Don't we need to propagate any error messages to the caller?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Fair question. In this case I think "no". The main reason I put that logic in a separate function was for code style and to keep both the indent level and the length of deleteServiceInfoFromCluster() from growing too much. However, some of the errors that said helper function can encounter are warning-level and some are error-level. Returning an error back to the caller instead of doing the logging directly would require parsing the error type and issuing logs which would negate the readability benefits of using the helper function in the first place. If the result of any of these errors would have altered the control flow of deleteServiceInfoFromCluster(), then returning the error would have been absolutely the right thing to do. However, there is really no way to recover from said errors as evidenced by the handling in the Also, for the record, the error-level logs in disableServiceInNetworkDB() are basically assertions. The only reason they should ever trigger is that there is some critical error in our automatically-generated protocol buffer code.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ok. LGTM |
||
| var epRec EndpointRecord | ||
|
|
||
| logrus.Debugf("disableServiceInNetworkDB for %s %s", ep.svcName, ep.ID()) | ||
|
|
||
| // Update existing record to indicate that the service is disabled | ||
| inBuf, err := a.networkDB.GetEntry(libnetworkEPTable, n.ID(), ep.ID()) | ||
| if err != nil { | ||
| logrus.Warnf("disableServiceInNetworkDB GetEntry failed for %s %s err:%s", ep.id, n.id, err) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Should we use
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That would have been more consistent I guess. But (unless I am misunderstanding something fundamental) they had better produce the same result or something is seriously wrong. :) |
||
| return | ||
| } | ||
| // Should never fail | ||
| if err := proto.Unmarshal(inBuf, &epRec); err != nil { | ||
| logrus.Errorf("disableServiceInNetworkDB unmarshal failed for %s %s err:%s", ep.id, n.id, err) | ||
| return | ||
| } | ||
| epRec.ServiceDisabled = true | ||
| // Should never fail | ||
| outBuf, err := proto.Marshal(&epRec) | ||
| if err != nil { | ||
| logrus.Errorf("disableServiceInNetworkDB marshalling failed for %s %s err:%s", ep.id, n.id, err) | ||
| return | ||
| } | ||
| // Send update to the whole cluster | ||
| if err := a.networkDB.UpdateEntry(libnetworkEPTable, n.ID(), ep.ID(), outBuf); err != nil { | ||
| logrus.Warnf("disableServiceInNetworkDB UpdateEntry failed for %s %s err:%s", ep.id, n.id, err) | ||
| } | ||
| } | ||
|
|
||
| func (n *network) addDriverWatches() { | ||
| if !n.isClusterEligible() { | ||
| return | ||
|
|
@@ -844,7 +887,6 @@ func (c *controller) handleEpTableEvent(ev events.Event) { | |
| nid string | ||
| eid string | ||
| value []byte | ||
| isAdd bool | ||
| epRec EndpointRecord | ||
| ) | ||
|
|
||
|
|
@@ -853,12 +895,15 @@ func (c *controller) handleEpTableEvent(ev events.Event) { | |
| nid = event.NetworkID | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. this switch case can definitely be compacted :)
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. mmm... can we? I was hoping so initially, but I'm not sure how we can access the fields of each individual type except through the switch (or equivalent explicit type conversions). It's possible that my understanding of go is wrong, but from what I understand event in each of these separate switch branches has a different type. Unless we were doing C-style unsafe data overlay stuff here, I'm not sure how we can otherwise get to the fields even if they all have the same names, types and order declaration. Please do let me know if I'm wrong. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. oh ok event is an empty interface, so yes never mind, if you collapse the 2 case the type will remain the same There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Actually looking at it better this can be compacted as: both the 3 events are actually defined as: so they are all sharing the same base type: make the collapse of the 3 cases possible
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What you wrote above is legal, but doesn't do what you think it does. :) CreateEvent and DeleteEvent cases are empty branches. So they will not populate nid, eid, and value. Furthermore, you can't fix that by adding
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Actually, I can't do that because networkdb.event is not an exported type. :( There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. sorry I wrote it C++ style, and yes because of event is not exported won't work if you don't change the scope. Ok I'm giving up |
||
| eid = event.Key | ||
| value = event.Value | ||
| isAdd = true | ||
| case networkdb.DeleteEvent: | ||
| nid = event.NetworkID | ||
| eid = event.Key | ||
| value = event.Value | ||
| case networkdb.UpdateEvent: | ||
| nid = event.NetworkID | ||
| eid = event.Key | ||
| value = event.Value | ||
| default: | ||
| logrus.Errorf("Unexpected update service table event = %#v", event) | ||
| return | ||
| } | ||
|
|
@@ -883,7 +928,8 @@ func (c *controller) handleEpTableEvent(ev events.Event) { | |
| return | ||
| } | ||
|
|
||
| if isAdd { | ||
| switch ev.(type) { | ||
| case networkdb.CreateEvent: | ||
| logrus.Debugf("handleEpTableEvent ADD %s R:%v", eid, epRec) | ||
| if svcID != "" { | ||
| // This is a remote task part of a service | ||
|
|
@@ -897,11 +943,12 @@ func (c *controller) handleEpTableEvent(ev events.Event) { | |
| logrus.Errorf("failed adding container name resolution for %s epRec:%v err:%v", eid, epRec, err) | ||
| } | ||
| } | ||
| } else { | ||
|
|
||
| case networkdb.DeleteEvent: | ||
| logrus.Debugf("handleEpTableEvent DEL %s R:%v", eid, epRec) | ||
| if svcID != "" { | ||
| // This is a remote task part of a service | ||
| if err := c.rmServiceBinding(svcName, svcID, nid, eid, containerName, vip, ingressPorts, serviceAliases, taskAliases, ip, "handleEpTableEvent", true); err != nil { | ||
| if err := c.rmServiceBinding(svcName, svcID, nid, eid, containerName, vip, ingressPorts, serviceAliases, taskAliases, ip, "handleEpTableEvent", true, true); err != nil { | ||
| logrus.Errorf("failed removing service binding for %s epRec:%v err:%v", eid, epRec, err) | ||
| return | ||
| } | ||
|
|
@@ -911,5 +958,18 @@ func (c *controller) handleEpTableEvent(ev events.Event) { | |
| logrus.Errorf("failed removing container name resolution for %s epRec:%v err:%v", eid, epRec, err) | ||
| } | ||
| } | ||
| case networkdb.UpdateEvent: | ||
| logrus.Debugf("handleEpTableEvent UPD %s R:%v", eid, epRec) | ||
| // We currently should only get these to inform us that an endpoint | ||
| // is disabled. Report if otherwise. | ||
| if svcID == "" || !epRec.ServiceDisabled { | ||
| logrus.Errorf("Unexpected update table event for %s epRec:%v", eid, epRec) | ||
| return | ||
| } | ||
| // This is a remote task that is part of a service that is now disabled | ||
| if err := c.rmServiceBinding(svcName, svcID, nid, eid, containerName, vip, ingressPorts, serviceAliases, taskAliases, ip, "handleEpTableEvent", true, false); err != nil { | ||
| logrus.Errorf("failed disabling service binding for %s epRec:%v err:%v", eid, epRec, err) | ||
| return | ||
| } | ||
| } | ||
| } | ||
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Not specific to this PR but had a concern since we are touching this area overall: Will it be safer to remove the service binding (to stop resolving the names) first and then call
disableServiceInNetworkDB/agent.networkDB.DeleteEntry?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
That's a fair question. Since the usual order is "lookup name, issue connection request, get redirected by NAT, get load balanced, connect to server", it would make sense to disable the name lookup first and then the load balancing. Or from a code standpoint, since we do (1) set up load balancing and then (2) set up name resolution, it makes sense to tear them down in the opposite order. Patch was already merged, but we could create another PR to address this. I'll confess that I haven't looked in detail at why the removal matched, rather than reversed, the order of the setup "undo"s. Would need to verify there are no issues there.
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
On a bit further reflection, though, this doesn't get hit as much as I first thought. Clients don't generally connect to the service backends through their backend service names. They connect through the front-end load balancer. The DNS resolution for this doesn't go away until the last backend goes away. So at that point the connection is supposed to fail. The only difference will be whether the client receives a "host unreachable" or a "name resolution failure". And even then, only if it happens to occur within the fractions of a second between removing the front-end IP address and the DNS entry.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Agree with the fact that any negative effect of this will be extremely short lived. The comment was more in the context of proper ordering of cleanup ops. We can address this in a potential cleanup patch later.