Add error when chunkSize is greater than the strict memory limit of the CacheLimiter. (#2228)

Scenario:

    export AZCOPY_BUFFER_GB=5
    # Copy a single 5GB file to an Azure Blob Storage Container with a block size of 4000MB
    ./azcopy [...] --block-size-mb=4000 [...]

Since the chunk size in that case is greater than the strict memory limit (75% of 5GB),
this would end up in a deadlock: CacheLimiter.TryAdd would loop forever, waiting for
memory that would never be freed.

This patch remedies that by making it an error if a chunk is at least as large as the
strict memory limit of the associated CacheLimiter.
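
For context, here is a minimal standalone sketch of the failure mode (simplified names
and numbers, not the actual AzCopy source): once a single chunk is bigger than the strict
limit, the admission check fails even with zero bytes in use, so any loop that waits for
TryAdd to succeed spins forever.

    package main

    import (
        "fmt"
        "sync/atomic"
        "time"
    )

    // Simplified model of the CacheLimiter's strict-limit admission check.
    type cacheLimiter struct {
        value int64 // bytes currently cached
        limit int64 // total limit, e.g. AZCOPY_BUFFER_GB
    }

    func (c *cacheLimiter) tryAdd(count int64) bool {
        strictLimit := int64(float32(c.limit) * 0.75)
        if atomic.AddInt64(&c.value, count) <= strictLimit {
            return true
        }
        atomic.AddInt64(&c.value, -count) // over the limit: back out and fail
        return false
    }

    func (c *cacheLimiter) waitUntilAdd(count int64) {
        for !c.tryAdd(count) {
            // With count > strictLimit this can never succeed, even at value == 0:
            // the deadlock described above.
            time.Sleep(time.Millisecond)
        }
    }

    func main() {
        c := &cacheLimiter{limit: 5 << 30} // 5 GiB buffer
        chunk := int64(4000) << 20         // 4000 MiB block size (~3.91 GiB)
        fmt.Println(c.tryAdd(chunk))       // false: exceeds the 3.75 GiB strict limit
        // c.waitUntilAdd(chunk) would never return.
    }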
mkrautz authored and nakulkar-msft committed May 31, 2023
1 parent 691f114 commit 9bc28d3
Showing 2 changed files with 26 additions and 11 deletions.
27 changes: 18 additions & 9 deletions common/cacheLimiter.go
@@ -27,6 +27,18 @@ import (
 	"time"
 )
 
+// The percentage of a CacheLimiter's Limit that is considered
+// the strict limit.
+var cacheLimiterStrictLimitPercentage = float32(0.75)
+// Rationale for the level of the strict limit: as at Jan 2018, we are using 0.75 of the total as the strict
+// limit, leaving the other 0.25 of the total accessible under the "relaxed" limit.
+// That last 25% gets used for two things: in downloads it is used for things where we KNOW there's
+// no backlogging of new chunks behind slow ones (i.e. these "good" cases are allowed to proceed without
+// interruption), and for uploads it's used for re-doing the prefetches when we do retries (i.e. so these are
+// not blocked by other chunks using up RAM).
+// TODO: now that cacheLimiter is used for multiple purposes, the hard-coding of the distinction between
+// relaxed and strict limits is less appropriate. Refactor to make it a configuration param of the instance?
+
 type Predicate func() bool
 
 // Used to limit the amounts of things. E.g. amount of in-flight data in RAM, to keep it at an acceptable level.
@@ -39,6 +51,7 @@ type CacheLimiter interface {
 	WaitUntilAdd(ctx context.Context, count int64, useRelaxedLimit Predicate) error
 	Remove(count int64)
 	Limit() int64
+	StrictLimit() int64
 }

type cacheLimiter struct {
@@ -58,15 +71,7 @@ func (c *cacheLimiter) TryAdd(count int64, useRelaxedLimit bool) (added bool) {
 	// for high-priority things (i.e. things we deem to be allowable under a relaxed (non-strict) limit)
 	strict := !useRelaxedLimit
 	if strict {
-		lim = int64(float32(lim) * 0.75)
-		// Rationale for the level of the strict limit: as at Jan 2018, we are using 0.75 of the total as the strict
-		// limit, leaving the other 0.25 of the total accessible under the "relaxed" limit.
-		// That last 25% gets use for two things: in downloads it is used for things where we KNOW there's
-		// no backlogging of new chunks behind slow ones (i.e. these "good" cases are allowed to proceed without
-		// interruption) and for uploads its used for re-doing the prefetches when we do retries (i.e. so these are
-		// not blocked by other chunks using up RAM).
-		// TODO: now that cacheLimiter is used for multiple purposes, the hard-coding of the distinction between
-		// relaxed and strict limits is less appropriate. Refactor to make it a configuration param of the instance?
+		lim = c.StrictLimit()
 	}
 
 	if atomic.AddInt64(&c.value, count) <= lim {
@@ -109,3 +114,7 @@ func (c *cacheLimiter) Remove(count int64) {
 func (c *cacheLimiter) Limit() int64 {
 	return c.limit
 }
+
+func (c *cacheLimiter) StrictLimit() int64 {
+	return int64(float32(c.limit) * cacheLimiterStrictLimitPercentage)
+}
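
As a back-of-the-envelope illustration of the 75%/25% split documented above (assumed
numbers, not taken from the diff): the same request can be rejected under the strict
limit yet admitted under the relaxed one, which is the headroom that retries and
non-backlogging downloads rely on.

    package main

    import "fmt"

    func main() {
        limit := int64(4) << 30                // 4 GiB total limit
        strict := int64(float32(limit) * 0.75) // 3 GiB strict limit
        inUse := int64(2560) << 20             // 2.5 GiB already cached
        request := int64(768) << 20            // 768 MiB new chunk

        fmt.Println(inUse+request <= strict) // false: 3.25 GiB breaches the strict limit
        fmt.Println(inUse+request <= limit)  // true:  3.25 GiB still fits the relaxed limit
    }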
10 changes: 8 additions & 2 deletions ste/sender-blockBlob.go
@@ -67,7 +67,7 @@ type blockBlobSenderBase struct {
 	completedBlockList map[int]string
 }
 
-func getVerifiedChunkParams(transferInfo TransferInfo, memLimit int64) (chunkSize int64, numChunks uint32, err error) {
+func getVerifiedChunkParams(transferInfo TransferInfo, memLimit int64, strictMemLimit int64) (chunkSize int64, numChunks uint32, err error) {
 	chunkSize = transferInfo.BlockSize
 	srcSize := transferInfo.SourceSize
 	numChunks = getNumChunks(srcSize, chunkSize)
@@ -93,6 +93,12 @@ func getVerifiedChunkParams(transferInfo TransferInfo, memLimit int64) (chunkSize int64, numChunks uint32, err error) {
 		return
 	}
 
+	if chunkSize >= strictMemLimit {
+		err = fmt.Errorf("Cannot use a block size of %.2fGiB. AzCopy is limited to use only %.2fGiB of memory, and only %.2fGiB of these are available for chunks.",
+			toGiB(chunkSize), toGiB(memLimit), toGiB(strictMemLimit))
+		return
+	}
+
 	if chunkSize > common.MaxBlockBlobBlockSize {
 		// mercy, please
 		err = fmt.Errorf("block size of %.2fGiB for file %s of size %.2fGiB exceeds maximum allowed block size for a BlockBlob",
@@ -121,7 +127,7 @@ func getBlockNamePrefix(jobID common.JobID, partNum uint32, transferIndex uint32
 
 func newBlockBlobSenderBase(jptm IJobPartTransferMgr, destination string, p pipeline.Pipeline, pacer pacer, srcInfoProvider ISourceInfoProvider, inferredAccessTierType azblob.AccessTierType) (*blockBlobSenderBase, error) {
 	// compute chunk count
-	chunkSize, numChunks, err := getVerifiedChunkParams(jptm.Info(), jptm.CacheLimiter().Limit())
+	chunkSize, numChunks, err := getVerifiedChunkParams(jptm.Info(), jptm.CacheLimiter().Limit(), jptm.CacheLimiter().StrictLimit())
 	if err != nil {
 		return nil, err
 	}
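
Plugging the commit-message scenario into the new guard (a standalone sketch; toGiB
here is a stand-in for the helper the real file uses, assumed to convert bytes to GiB):

    package main

    import "fmt"

    // Stand-in for the toGiB helper referenced by the diff (assumption: bytes -> GiB).
    func toGiB(bytes int64) float64 {
        return float64(bytes) / (1 << 30)
    }

    func main() {
        memLimit := int64(5) << 30                        // AZCOPY_BUFFER_GB=5
        strictMemLimit := int64(float32(memLimit) * 0.75) // 3.75 GiB
        chunkSize := int64(4000) << 20                    // --block-size-mb=4000

        if chunkSize >= strictMemLimit {
            // Same shape as the new error: fail fast instead of deadlocking.
            fmt.Printf("Cannot use a block size of %.2fGiB. AzCopy is limited to use only %.2fGiB of memory, and only %.2fGiB of these are available for chunks.\n",
                toGiB(chunkSize), toGiB(memLimit), toGiB(strictMemLimit))
        }
    }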
