diff --git a/src/Microsoft.Azure.WebJobs.Host/Scale/ScaleManager.cs b/src/Microsoft.Azure.WebJobs.Host/Scale/ScaleManager.cs index bef51f687..d97ec1158 100644 --- a/src/Microsoft.Azure.WebJobs.Host/Scale/ScaleManager.cs +++ b/src/Microsoft.Azure.WebJobs.Host/Scale/ScaleManager.cs @@ -2,6 +2,7 @@ // Licensed under the MIT License. See License.txt in the project root for license information. using System; +using System.Collections.Concurrent; using System.Collections.Generic; using System.Linq; using System.Threading; @@ -25,7 +26,7 @@ internal class ScaleManager : IScaleStatusProvider private readonly IConfiguration _configuration; private IOptions _scaleOptions; private IOptions _concurrencyOptions; - private static HashSet _targetScalersInError = new HashSet(); + internal static ConcurrentDictionary _targetScalersInError = new ConcurrentDictionary(); public ScaleManager( IScaleMonitorManager monitorManager, @@ -42,7 +43,6 @@ public ScaleManager( _metricsRepository = metricsRepository; _concurrencyStatusRepository = concurrencyStatusRepository; _logger = loggerFactory?.CreateLogger(); - _targetScalersInError = new HashSet(); _scaleOptions = scaleConfiguration; _configuration = configuration; _concurrencyOptions = concurrencyOptions; @@ -167,27 +167,11 @@ private async Task> GetTargetScalersResu _logger.LogDebug($"Snapshot dynamic concurrency for target scaler '{targetScaler.TargetScalerDescriptor.FunctionId}' is '{functionSnapshot.Concurrency}'"); } } - TargetScalerResult result = null; - try + TargetScalerResult result = await TryGetScaleResultAsync(targetScaler, targetScaleStatusContext, _logger); + if (result == null) { - result = await targetScaler.GetScaleResultAsync(targetScaleStatusContext); - } - catch (NotSupportedException ex) - { - string targetScalerUniqueId = GetTargetScalerFunctionUniqueId(targetScaler); - - _logger.LogFunctionScaleError( - "Unable to use target based scaling, switching to metrics monitor.", - targetScaler.TargetScalerDescriptor.FunctionId, - ex); - - lock (_targetScalersInError) - { - _targetScalersInError.Add(targetScalerUniqueId); - } - - // Adding ScaleVote.None vote - result = new TargetScalerResult + // Target scaler does not support TBS — use ScaleVote.None + result = new TargetScalerResult { TargetWorkerCount = context.WorkerCount }; @@ -231,7 +215,7 @@ internal static (List, List) GetScalersToSample( foreach (var scaler in targetScalers) { string scalerUniqueId = GetTargetScalerFunctionUniqueId(scaler); - if (!_targetScalersInError.Contains(scalerUniqueId)) + if (!_targetScalersInError.ContainsKey(scalerUniqueId)) { string assemblyName = GetAssemblyName(scaler.GetType()); bool featureDisabled = configuration.GetValue(assemblyName) == "0"; @@ -290,11 +274,37 @@ internal static ScaleVote GetAggregateScaleVote(IEnumerable votes, Sc return vote; } - private static string GetTargetScalerFunctionUniqueId(ITargetScaler scaler) + /// + /// Attempts to get a scale result from a target scaler. If the scaler throws + /// NotSupportedException (e.g. missing Manage claim), it is added to + /// _targetScalersInError and null is returned, causing fallback to the + /// incremental scale monitor. + /// + internal static async Task TryGetScaleResultAsync(ITargetScaler targetScaler, TargetScalerContext context, ILogger logger, string caller = "GetScaleStatus") { - return $"{GetAssemblyName(scaler.GetType())}-{scaler.TargetScalerDescriptor.FunctionId}"; + try + { + return await targetScaler.GetScaleResultAsync(context).ConfigureAwait(false); + } + catch (NotSupportedException ex) + { + string scalerUniqueId = GetTargetScalerFunctionUniqueId(targetScaler); + + logger?.LogFunctionScaleError( + $"Unable to use target based scaling, switching to metrics monitor. Detected by: {caller}.", + targetScaler.TargetScalerDescriptor.FunctionId, + ex); + + _targetScalersInError.TryAdd(scalerUniqueId, 0); + + return null; + } } + internal static string GetTargetScalerFunctionUniqueId(ITargetScaler scaler) + { + return $"{GetAssemblyName(scaler.GetType())}-{scaler.TargetScalerDescriptor.FunctionId}"; + } private static string GetScaleMonitorFunctionUniqueId(IScaleMonitor monitor) { diff --git a/src/Microsoft.Azure.WebJobs.Host/Scale/ScaleMonitorService.cs b/src/Microsoft.Azure.WebJobs.Host/Scale/ScaleMonitorService.cs index 92cc8d578..154e1d1f1 100644 --- a/src/Microsoft.Azure.WebJobs.Host/Scale/ScaleMonitorService.cs +++ b/src/Microsoft.Azure.WebJobs.Host/Scale/ScaleMonitorService.cs @@ -30,6 +30,8 @@ internal class ScaleMonitorService : IHostedService, IDisposable private readonly ITargetScalerManager _targetScalerManager; private readonly IConfiguration _configuration; private bool _disposed; + internal static DateTime _nextTargetScalerValidationTime = DateTime.MinValue; + private static readonly TimeSpan _targetScalerValidationInterval = TimeSpan.FromMinutes(10); public ScaleMonitorService( IScaleStatusProvider scaleStausProvider, @@ -94,6 +96,29 @@ private async Task TakeMetricsSamplesAsync() { var (scaleMonitorsToProcess, targetScalersToSample) = ScaleManager.GetScalersToSample(_monitorManager, _targetScalerManager, _scaleOptions, _configuration); + // Periodically probe target scalers to discover any that throw NotSupportedException. + // This ensures the primary host independently detects target scaler failures, + // even when the Scale Controller runs on a different worker. + // Runs on first tick, then every 10 minutes to avoid per-tick overhead + // while still catching runtime permission changes (e.g. managed identity revocation). + if (targetScalersToSample.Any() && DateTime.UtcNow >= _nextTargetScalerValidationTime) + { + foreach (var targetScaler in targetScalersToSample) + { + try + { + await ScaleManager.TryGetScaleResultAsync(targetScaler, new TargetScalerContext(), _logger, "ScaleMonitorService").ConfigureAwait(false); + } + catch (Exception) + { + // Ignore transient errors — only NotSupportedException triggers fallback + // and that is already handled inside TryGetScaleResultAsync. + } + } + + _nextTargetScalerValidationTime = DateTime.UtcNow + _targetScalerValidationInterval; + } + if (scaleMonitorsToProcess.Any()) { _logger.LogDebug($"Taking metrics samples for {scaleMonitorsToProcess.Count()} monitor(s)."); diff --git a/test/Microsoft.Azure.WebJobs.Host.UnitTests/Scale/ScaleManagerTests.cs b/test/Microsoft.Azure.WebJobs.Host.UnitTests/Scale/ScaleManagerTests.cs index c21e5dd3e..3db8d93d4 100644 --- a/test/Microsoft.Azure.WebJobs.Host.UnitTests/Scale/ScaleManagerTests.cs +++ b/test/Microsoft.Azure.WebJobs.Host.UnitTests/Scale/ScaleManagerTests.cs @@ -385,7 +385,7 @@ public async Task GetScalersToSample_FallsBackToMonitor_OnTargetScalerError() Assert.Equal(result1.TargetWorkerCount, 1); Assert.Equal(result1.Vote, ScaleVote.None); var logs = _loggerProvider.GetAllLogMessages().Select(x => x.FormattedMessage).ToArray(); - Assert.Single(logs, x => x == "Function 'function1' error: Unable to use target based scaling, switching to metrics monitor."); + Assert.Single(logs, x => x == "Function 'function1' error: Unable to use target based scaling, switching to metrics monitor. Detected by: GetScaleStatus."); _loggerProvider.ClearAllLogMessages(); var (monitors2, scalers2) = ScaleManager.GetScalersToSample( @@ -400,7 +400,7 @@ public async Task GetScalersToSample_FallsBackToMonitor_OnTargetScalerError() Assert.Equal(result2.TargetWorkerCount, null); Assert.Equal(result2.Vote, ScaleVote.ScaleIn); logs = _loggerProvider.GetAllLogMessages().Select(x => x.FormattedMessage).ToArray(); - Assert.DoesNotContain(logs, x => x == "Function 'function1' error: Unable to use target based scaling, switching to metrics monitor."); + Assert.DoesNotContain(logs, x => x == "Function 'function1' error: Unable to use target based scaling, switching to metrics monitor. Detected by: GetScaleStatus."); } [Fact] diff --git a/test/Microsoft.Azure.WebJobs.Host.UnitTests/Scale/ScaleMonitorServiceTests.cs b/test/Microsoft.Azure.WebJobs.Host.UnitTests/Scale/ScaleMonitorServiceTests.cs index 20ce4f734..fe470498b 100644 --- a/test/Microsoft.Azure.WebJobs.Host.UnitTests/Scale/ScaleMonitorServiceTests.cs +++ b/test/Microsoft.Azure.WebJobs.Host.UnitTests/Scale/ScaleMonitorServiceTests.cs @@ -213,6 +213,88 @@ await TestHelpers.Await(() => var metricsWritten = _metricsRepository.Metrics[monitor2].Take(5); Assert.Equal(testMetrics2, metricsWritten); } + [Fact] + public async Task OnTimer_ProbesTargetScalers_FallsBackToMonitor() + { + // Arrange: create a fresh service with TBS enabled + var monitors = new List + { + new TestScaleMonitor("function1-test-test", "function1") + }; + var scalers = new List + { + new FaultyTargetScaler + { + TargetScalerDescriptor = new TargetScalerDescriptor("function1") + } + }; + + var monitorManagerMock = new Mock(MockBehavior.Strict); + monitorManagerMock.Setup(p => p.GetMonitors()).Returns(() => monitors); + var targetScalerManagerMock = new Mock(MockBehavior.Strict); + targetScalerManagerMock.Setup(p => p.GetTargetScalers()).Returns(() => scalers); + + var loggerProvider = new TestLoggerProvider(); + var loggerFactory = new LoggerFactory(); + loggerFactory.AddProvider(loggerProvider); + + var configuration = new ConfigurationBuilder() + .AddInMemoryCollection(new Dictionary { { "Microsoft.Azure.WebJobs.Host.UnitTests", "1" } }).Build(); + + var options = Options.Create(new ScaleOptions + { + ScaleMetricsSampleInterval = TimeSpan.FromSeconds(1), + IsRuntimeScalingEnabled = true, + IsTargetScalingEnabled = true + }); + + var primaryHostState = new PrimaryHostStateProvider { IsPrimary = true }; + + // Clear static state from prior tests + ScaleManager._targetScalersInError.Clear(); + ScaleMonitorService._nextTargetScalerValidationTime = DateTime.MinValue; + + var service = new ScaleMonitorService( + new Mock().Object, + new TestMetricsRepository(), + options, + primaryHostState, + monitorManagerMock.Object, + targetScalerManagerMock.Object, + configuration, + loggerFactory); + + // Before starting: target scaler is not in error, so GetScalersToSample returns it as target scaler + var (monitors1, scalers1) = ScaleManager.GetScalersToSample( + monitorManagerMock.Object, targetScalerManagerMock.Object, options, configuration); + Assert.Empty(monitors1); + Assert.Single(scalers1); + + // Act: start the service — the timer tick will probe the faulty target scaler + await service.StartAsync(CancellationToken.None); + + await TestHelpers.Await(() => + { + var logs = loggerProvider.GetAllLogMessages(); + return logs.Any(l => l.FormattedMessage.Contains("Unable to use target based scaling")); + }); + + await service.StopAsync(CancellationToken.None); + + // Assert: after probing, the faulty scaler is in _targetScalersInError + // so GetScalersToSample now returns the incremental monitor instead + var (monitors2, scalers2) = ScaleManager.GetScalersToSample( + monitorManagerMock.Object, targetScalerManagerMock.Object, options, configuration); + Assert.Single(monitors2); + Assert.Empty(scalers2); + + // Verify the log message includes the ScaleMonitorService caller + var errorLogs = loggerProvider.GetAllLogMessages() + .Where(l => l.FormattedMessage.Contains("Unable to use target based scaling")) + .ToArray(); + Assert.Single(errorLogs); + Assert.Contains("Detected by: ScaleMonitorService", errorLogs[0].FormattedMessage); + } } public class TestMetricsRepository : IScaleMetricsRepository