-
Notifications
You must be signed in to change notification settings - Fork 619
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
eni watcher: add backoff-retry #1148
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -17,22 +17,113 @@ | |
package networkutils | ||
|
||
import ( | ||
"context" | ||
"path/filepath" | ||
"strings" | ||
"time" | ||
|
||
"github.com/aws/amazon-ecs-agent/agent/eni/netlinkwrapper" | ||
"github.com/aws/amazon-ecs-agent/agent/utils" | ||
"github.com/pkg/errors" | ||
|
||
log "github.com/cihub/seelog" | ||
"github.com/cihub/seelog" | ||
) | ||
|
||
const ( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. how exactly did we determine the durations for these backoff times? same with the eni ones below. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. with experiments. It took an average of 10ms for the ENI's mac address to show up on the instance. The one for attachment message was based on my conversations with the ECS backend team. |
||
// macAddressBackoffMin specifies the minimum duration for the backoff | ||
// when looking for an ENI's mac address on the host | ||
macAddressBackoffMin = 2 * time.Millisecond | ||
|
||
// macAddressBackoffMax specifies the maximum duration for the backoff | ||
// when looking for an ENI's mac address on the host | ||
macAddressBackoffMax = 200 * time.Millisecond | ||
|
||
// macAddressBackoffJitter specifies the jitter multiple percentage when | ||
// looking for an ENI's mac address on the host | ||
macAddressBackoffJitter = 0.2 | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't think jitter is really necessary here since you're not making remote calls or accessing a resource with high concurrency. On the other hand, I don't think it really hurts either except to make the timing a bit more unpredictable. |
||
|
||
// macAddressBackoffMultiple specifies the backoff duration multiplier | ||
// when looking for an ENI's mac address on the host | ||
macAddressBackoffMultiple = 1.5 | ||
) | ||
|
||
// macAddressRetriever is used to retrieve the mac address of a device. It collects | ||
// all the information necessary to start this operation and stores the result in | ||
// the 'macAddress' attribute | ||
type macAddressRetriever struct { | ||
dev string | ||
netlinkClient netlinkwrapper.NetLink | ||
macAddress string | ||
// timeout specifies the timeout duration before giving up when | ||
// looking for an ENI's mac address on the host | ||
timeout time.Duration | ||
ctx context.Context | ||
} | ||
|
||
// GetMACAddress retrieves the MAC address of a device using netlink | ||
func GetMACAddress(dev string, netlinkClient netlinkwrapper.NetLink) (string, error) { | ||
dev = filepath.Base(dev) | ||
link, err := netlinkClient.LinkByName(dev) | ||
func GetMACAddress(ctx context.Context, | ||
timeout time.Duration, | ||
dev string, | ||
netlinkClient netlinkwrapper.NetLink) (string, error) { | ||
retriever := &macAddressRetriever{ | ||
dev: dev, | ||
netlinkClient: netlinkClient, | ||
ctx: ctx, | ||
timeout: timeout, | ||
} | ||
return retriever.retrieve() | ||
} | ||
|
||
// retrieve retrieves the mac address of a network device. If the retrieved mac | ||
// address is empty, it retries the operation with a timeout specified by the | ||
// caller | ||
func (retriever *macAddressRetriever) retrieve() (string, error) { | ||
backoff := utils.NewSimpleBackoff(macAddressBackoffMin, macAddressBackoffMax, | ||
macAddressBackoffJitter, macAddressBackoffMultiple) | ||
ctx, cancel := context.WithTimeout(retriever.ctx, retriever.timeout) | ||
defer cancel() | ||
|
||
err := utils.RetryWithBackoffCtx(ctx, backoff, func() error { | ||
retErr := retriever.retrieveOnce() | ||
if retErr != nil { | ||
seelog.Warnf("Unable to retrieve mac address for device '%s': %v", | ||
retriever.dev, retErr) | ||
return retErr | ||
} | ||
|
||
if retriever.macAddress == "" { | ||
seelog.Debugf("Empty mac address for device '%s'", retriever.dev) | ||
// Return a retriable error when mac address is empty. If the error | ||
// is not wrapped with the RetriableError interface, RetryWithBackoffCtx | ||
// treats it as retriable by default | ||
return errors.Errorf("eni mac address: retrieved empty address for device %s", | ||
retriever.dev) | ||
} | ||
|
||
return nil | ||
}) | ||
if err != nil { | ||
return "", err | ||
} | ||
return link.Attrs().HardwareAddr.String(), err | ||
// RetryWithBackoffCtx returns nil when the context is cancelled. Check if there was | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Would it be better to check There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hmm, I don't know if it gives us any additional benefits. i like dealing with errors rather than empty strings or nils. So, will keep it this way |
||
// a timeout here. TODO: Fix RetryWithBackoffCtx to return ctx.Err() on context Done() | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. oops, I think that's on me... |
||
if err = ctx.Err(); err != nil { | ||
return "", errors.Wrapf(err, "eni mac address: timed out waiting for eni device '%s'", | ||
retriever.dev) | ||
} | ||
|
||
return retriever.macAddress, nil | ||
} | ||
|
||
// retrieveOnce retrieves the MAC address of a device using netlink.LinkByName | ||
func (retriever *macAddressRetriever) retrieveOnce() error { | ||
dev := filepath.Base(retriever.dev) | ||
link, err := retriever.netlinkClient.LinkByName(dev) | ||
if err != nil { | ||
return utils.NewRetriableError(utils.NewRetriable(false), err) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Are you sure that we should never retry in this case? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yeah, an error means a syscall failed for describing the device. at this point of time, it doesn't make sense to retry it as it really should be visible on the instance |
||
} | ||
retriever.macAddress = link.Attrs().HardwareAddr.String() | ||
return nil | ||
} | ||
|
||
// IsValidNetworkDevice is used to differentiate virtual and physical devices | ||
|
@@ -43,23 +134,23 @@ func IsValidNetworkDevice(devicePath string) bool { | |
* eth1 -> /devices/pci0000:00/0000:00:05.0/net/eth1 | ||
* eth0 -> ../../devices/pci0000:00/0000:00:03.0/net/eth0 | ||
* lo -> ../../devices/virtual/net/lo | ||
*/ | ||
*/ | ||
splitDevLink := strings.SplitN(devicePath, "devices/", 2) | ||
if len(splitDevLink) != 2 { | ||
log.Warnf("Cannot determine device validity: %s", devicePath) | ||
seelog.Warnf("Cannot determine device validity: %s", devicePath) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. its not clear to me why There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The example entries in the comment above are meant to illustrate that:
This means that we expect it to be of "devices/some-string" format. Is that not clear enough? It's just additional validation |
||
return false | ||
} | ||
/* | ||
* CoreOS typically employs the vif style for physical net interfaces | ||
* Amazon Linux, Ubuntu, RHEL, Fedora, Suse use the traditional pci convention | ||
*/ | ||
*/ | ||
if strings.HasPrefix(splitDevLink[1], pciDevicePrefix) || strings.HasPrefix(splitDevLink[1], vifDevicePrefix) { | ||
return true | ||
} | ||
if strings.HasPrefix(splitDevLink[1], virtualDevicePrefix) { | ||
return false | ||
} | ||
// NOTE: Should never reach here | ||
log.Criticalf("Failed to validate device path: %s", devicePath) | ||
seelog.Criticalf("Failed to validate device path: %s", devicePath) | ||
return false | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you wrap this at 80 characters?