-
Notifications
You must be signed in to change notification settings - Fork 362
Fix target scaler fallback race condition via shared storage #3191
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from all commits
Commits
Show all changes
5 commits
Select commit
Hold shift + click to select a range
208d624
Fix target scaler fallback race condition via shared storage
alrod 295783a
Add comprehensive tests for ITargetScalerErrorRepository
alrod 4703a35
Rename to NullTargetScalerErrorRepository to align with concurrency p…
alrod 7d89831
Address review feedback: ETag concurrency, TTL expiry, internal inter…
alrod b79f807
Address review feedback: remove redundant tests, simplify AddAsync
alrod File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
181 changes: 181 additions & 0 deletions
181
src/Microsoft.Azure.WebJobs.Host.Storage/BlobStorageTargetScalerErrorRepository.cs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,181 @@ | ||
| // Copyright (c) .NET Foundation. All rights reserved. | ||
| // Licensed under the MIT License. See License.txt in the project root for license information. | ||
|
|
||
| #nullable enable | ||
|
|
||
| using System; | ||
| using System.Collections.Generic; | ||
| using System.IO; | ||
| using System.Text; | ||
| using System.Threading; | ||
| using System.Threading.Tasks; | ||
| using Azure; | ||
| using Azure.Storage.Blobs; | ||
| using Azure.Storage.Blobs.Models; | ||
| using Microsoft.Azure.WebJobs.Host.Executors; | ||
| using Microsoft.Azure.WebJobs.Host.Scale; | ||
| using Microsoft.Azure.WebJobs.Host.Storage; | ||
| using Microsoft.Azure.WebJobs.Logging; | ||
| using Microsoft.Extensions.Logging; | ||
| using Newtonsoft.Json; | ||
|
|
||
namespace Microsoft.Azure.WebJobs.Host
{
    /// <summary>
    /// <see cref="ITargetScalerErrorRepository"/> implementation that stores the set of target
    /// scalers in error as a single JSON blob (<c>scale/{hostId}/targetScalersInError.json</c>)
    /// in the hosting blob container, so the state is visible to every instance that resolves
    /// the same host id.
    /// </summary>
    /// <remarks>
    /// Writes use ETag-based optimistic concurrency and are retried up to <see cref="MaxRetries"/>
    /// times. State whose <c>lastUpdated</c> timestamp is older than <see cref="DefaultTtl"/> is
    /// treated as empty by both <see cref="GetAsync"/> and <see cref="AddAsync"/>.
    /// Both public methods are best-effort: failures are logged and never thrown to the caller.
    /// </remarks>
    internal class BlobStorageTargetScalerErrorRepository : ITargetScalerErrorRepository
    {
        private readonly IHostIdProvider _hostIdProvider;
        private readonly ILogger _logger;
        private readonly IAzureBlobStorageProvider _blobStorageProvider;

        // Cached only after the container is known to exist (see GetContainerClientAsync).
        private BlobContainerClient? _blobContainerClient;

        // Maximum number of read-modify-write attempts before AddAsync gives up.
        private const int MaxRetries = 3;

        // Age after which persisted error state is considered stale and ignored.
        internal static readonly TimeSpan DefaultTtl = TimeSpan.FromMinutes(10);

        public BlobStorageTargetScalerErrorRepository(IHostIdProvider hostIdProvider, ILoggerFactory loggerFactory, IAzureBlobStorageProvider azureStorageProvider)
        {
            _hostIdProvider = hostIdProvider;
            _logger = loggerFactory.CreateLogger(LogCategories.Scale);
            _blobStorageProvider = azureStorageProvider;
        }

        /// <summary>
        /// Adds <paramref name="scalerUniqueId"/> to the persisted set of scalers in error and
        /// refreshes the state's timestamp.
        /// </summary>
        /// <param name="scalerUniqueId">The unique identifier of the target scaler.</param>
        /// <param name="cancellationToken">A cancellation token.</param>
        /// <returns>
        /// A task that completes when the write has succeeded, the retries are exhausted, or an
        /// error has been logged. The task does not fault.
        /// </returns>
        public async Task AddAsync(string scalerUniqueId, CancellationToken cancellationToken)
        {
            try
            {
                for (int attempt = 0; attempt < MaxRetries; attempt++)
                {
                    // Read current state with ETag
                    var (state, etag) = await ReadBlobWithETagAsync(cancellationToken);

                    // Expired state must not be merged: GetAsync already reports it as empty, and
                    // writing it back with a fresh timestamp would revive scalers that were due
                    // for re-evaluation.
                    HashSet<string> set = IsExpired(state)
                        ? new HashSet<string>()
                        : (state?.Scalers ?? new HashSet<string>());
                    set.Add(scalerUniqueId);

                    var newState = new TargetScalerErrorState
                    {
                        Scalers = set,
                        LastUpdated = DateTime.UtcNow
                    };

                    try
                    {
                        await WriteBlobAsync(newState, etag, cancellationToken);
                        return;
                    }
                    catch (RequestFailedException ex) when (ex.Status == 412 || ex.Status == 409)
                    {
                        // ETag mismatch — another instance wrote concurrently, retry
                    }
                }

                _logger.LogWarning("Failed to persist target scaler error state after {MaxRetries} attempts due to concurrent updates.", MaxRetries);
            }
            catch (Exception e)
            {
                // Deliberately best-effort: a storage failure must not break the caller.
                _logger.LogError(e, "Error persisting target scaler error state.");
            }
        }

        /// <summary>
        /// Returns the persisted set of scaler identifiers in error.
        /// </summary>
        /// <param name="cancellationToken">A cancellation token.</param>
        /// <returns>
        /// The persisted set, or an empty set when no state exists, the state is older than
        /// <see cref="DefaultTtl"/>, or reading fails (the failure is logged).
        /// </returns>
        public async Task<ISet<string>> GetAsync(CancellationToken cancellationToken)
        {
            try
            {
                var (state, _) = await ReadBlobWithETagAsync(cancellationToken);
                if (IsExpired(state))
                {
                    // Data is stale — treat as empty so target scalers are re-evaluated
                    return new HashSet<string>();
                }

                return state?.Scalers ?? new HashSet<string>();
            }
            catch (Exception e)
            {
                _logger.LogError(e, "Error reading target scaler error state.");
                return new HashSet<string>();
            }
        }

        /// <summary>
        /// Returns <c>true</c> when <paramref name="state"/> carries a timestamp that is older
        /// than <see cref="DefaultTtl"/>. Missing state or a missing timestamp is not expired.
        /// </summary>
        private static bool IsExpired(TargetScalerErrorState? state)
        {
            return state?.LastUpdated != null && (DateTime.UtcNow - state.LastUpdated.Value) > DefaultTtl;
        }

        /// <summary>
        /// Downloads and deserializes the state blob.
        /// </summary>
        /// <returns>
        /// The deserialized state and the blob's ETag. The state is <c>null</c> when the blob is
        /// empty. Both are <c>null</c>/<c>default</c> when the blob does not exist (404) or no
        /// container client is available.
        /// </returns>
        private async Task<(TargetScalerErrorState?, ETag)> ReadBlobWithETagAsync(CancellationToken cancellationToken)
        {
            string blobPath = await GetBlobPathAsync(cancellationToken);

            try
            {
                BlobContainerClient? containerClient = await GetContainerClientAsync(cancellationToken);
                if (containerClient != null)
                {
                    BlobClient blobClient = containerClient.GetBlobClient(blobPath);
                    var response = await blobClient.DownloadAsync(cancellationToken: cancellationToken);
                    ETag etag = response.Value.Details.ETag;

                    string content;
                    using (StreamReader reader = new StreamReader(response.Value.Content, true))
                    {
                        // The content is a network stream; read it without blocking a thread.
                        content = await reader.ReadToEndAsync();
                    }

                    if (string.IsNullOrEmpty(content))
                    {
                        // The blob exists but is empty. Return its ETag so the writer replaces it
                        // with an IfMatch condition rather than attempting a create-if-absent
                        // that can never succeed while the blob exists.
                        return (null, etag);
                    }

                    var state = JsonConvert.DeserializeObject<TargetScalerErrorState>(content);
                    return (state, etag);
                }
            }
            catch (RequestFailedException exception) when (exception.Status == 404)
            {
                // blob doesn't exist yet — no errors recorded
                return (null, default);
            }

            return (null, default);
        }

        /// <summary>
        /// Serializes <paramref name="state"/> and uploads it conditionally. Does nothing when no
        /// container client is available.
        /// </summary>
        /// <param name="state">The state to persist.</param>
        /// <param name="etag">
        /// The ETag observed when the blob was read, or <c>default</c> when no blob was found.
        /// </param>
        /// <param name="cancellationToken">A cancellation token.</param>
        /// <remarks>
        /// A <see cref="RequestFailedException"/> from a failed precondition propagates to the
        /// caller; <see cref="AddAsync"/> retries on status 412 and 409.
        /// </remarks>
        private async Task WriteBlobAsync(TargetScalerErrorState state, ETag etag, CancellationToken cancellationToken)
        {
            string blobPath = await GetBlobPathAsync(cancellationToken);
            BlobContainerClient? containerClient = await GetContainerClientAsync(cancellationToken);
            if (containerClient != null)
            {
                BlobClient blobClient = containerClient.GetBlobClient(blobPath);
                var content = JsonConvert.SerializeObject(state);
                using (Stream stream = new MemoryStream(Encoding.UTF8.GetBytes(content)))
                {
                    var options = new BlobUploadOptions();
                    if (etag != default)
                    {
                        // Existing blob — only write if it hasn't changed since we read it
                        options.Conditions = new BlobRequestConditions { IfMatch = etag };
                    }
                    else
                    {
                        // No blob exists yet — only create if it still doesn't exist
                        options.Conditions = new BlobRequestConditions { IfNoneMatch = ETag.All };
                    }

                    await blobClient.UploadAsync(stream, options, cancellationToken);
                }
            }
        }

        /// <summary>
        /// Returns the hosting blob container client, creating the container on first use.
        /// </summary>
        /// <returns>
        /// The container client, or <c>null</c> when the storage provider cannot create one.
        /// </returns>
        /// <remarks>
        /// No locking is performed, so concurrent first calls may each issue a create-if-not-exists
        /// request.
        /// </remarks>
        internal async Task<BlobContainerClient?> GetContainerClientAsync(CancellationToken cancellationToken)
        {
            if (_blobContainerClient == null && _blobStorageProvider.TryCreateHostingBlobContainerClient(out BlobContainerClient? client))
            {
                // Cache the client only after the container is confirmed to exist. If creation
                // throws, the field stays null and the next call tries again, rather than reusing
                // a client whose container was never created.
                await client.CreateIfNotExistsAsync(cancellationToken: cancellationToken);
                _blobContainerClient = client;
            }

            return _blobContainerClient;
        }

        /// <summary>
        /// Builds the blob path for this host: <c>scale/{hostId}/targetScalersInError.json</c>.
        /// </summary>
        internal async Task<string> GetBlobPathAsync(CancellationToken cancellationToken)
        {
            string hostId = await _hostIdProvider.GetHostIdAsync(cancellationToken);
            return $"scale/{hostId}/targetScalersInError.json";
        }

        /// <summary>
        /// JSON shape of the persisted blob.
        /// </summary>
        internal class TargetScalerErrorState
        {
            // Identifiers of the target scalers currently recorded as being in error.
            [JsonProperty("scalers")]
            public HashSet<string> Scalers { get; set; } = new HashSet<string>();

            // UTC time of the last successful write (set from DateTime.UtcNow in AddAsync);
            // null when the persisted JSON carries no timestamp.
            [JsonProperty("lastUpdated")]
            public DateTime? LastUpdated { get; set; }
        }
    }
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
32 changes: 32 additions & 0 deletions
32
src/Microsoft.Azure.WebJobs.Host/Scale/ITargetScalerErrorRepository.cs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,32 @@ | ||
| // Copyright (c) .NET Foundation. All rights reserved. | ||
| // Licensed under the MIT License. See License.txt in the project root for license information. | ||
|
|
||
| using System.Collections.Generic; | ||
| using System.Threading; | ||
| using System.Threading.Tasks; | ||
|
|
||
namespace Microsoft.Azure.WebJobs.Host.Scale
{
    /// <summary>
    /// Shared store of target scalers that have failed, visible to every host instance.
    /// A scaler's identifier is recorded when it throws <see cref="System.NotSupportedException"/>,
    /// which lets all instances fall back to incremental scale monitoring for that scaler.
    /// </summary>
    internal interface ITargetScalerErrorRepository
    {
        /// <summary>
        /// Records a target scaler as being in error.
        /// </summary>
        /// <param name="scalerUniqueId">Unique identifier of the target scaler to record.</param>
        /// <param name="cancellationToken">Token used to cancel the operation.</param>
        /// <returns>A task that completes once the write has finished.</returns>
        Task AddAsync(string scalerUniqueId, CancellationToken cancellationToken);

        /// <summary>
        /// Retrieves the identifiers of all target scalers currently recorded as in error.
        /// </summary>
        /// <param name="cancellationToken">Token used to cancel the operation.</param>
        /// <returns>A task whose result is the set of scaler identifiers in error.</returns>
        Task<ISet<string>> GetAsync(CancellationToken cancellationToken);
    }
}
23 changes: 23 additions & 0 deletions
23
src/Microsoft.Azure.WebJobs.Host/Scale/NullTargetScalerErrorRepository.cs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,23 @@ | ||
| // Copyright (c) .NET Foundation. All rights reserved. | ||
| // Licensed under the MIT License. See License.txt in the project root for license information. | ||
|
|
||
| using System.Collections.Generic; | ||
| using System.Threading; | ||
| using System.Threading.Tasks; | ||
|
|
||
namespace Microsoft.Azure.WebJobs.Host.Scale
{
    /// <summary>
    /// No-op <see cref="ITargetScalerErrorRepository"/>: additions are discarded and the
    /// reported set of scalers in error is always empty.
    /// </summary>
    internal class NullTargetScalerErrorRepository : ITargetScalerErrorRepository
    {
        /// <summary>
        /// Ignores the identifier and completes immediately.
        /// </summary>
        /// <param name="scalerUniqueId">The unique identifier of the target scaler (unused).</param>
        /// <param name="cancellationToken">A cancellation token (unused).</param>
        /// <returns>An already-completed task.</returns>
        public Task AddAsync(string scalerUniqueId, CancellationToken cancellationToken)
            => Task.CompletedTask;

        /// <summary>
        /// Returns a new, empty set on every call.
        /// </summary>
        /// <param name="cancellationToken">A cancellation token (unused).</param>
        /// <returns>A completed task whose result is an empty set.</returns>
        public Task<ISet<string>> GetAsync(CancellationToken cancellationToken)
            => Task.FromResult<ISet<string>>(new HashSet<string>());
    }
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.