diff --git a/src/Orleans.Runtime/ConsistentRing/ConsistentRingProvider.cs b/src/Orleans.Runtime/ConsistentRing/ConsistentRingProvider.cs index 9a246ee72b1..3d0aa50e21f 100644 --- a/src/Orleans.Runtime/ConsistentRing/ConsistentRingProvider.cs +++ b/src/Orleans.Runtime/ConsistentRing/ConsistentRingProvider.cs @@ -13,7 +13,7 @@ namespace Orleans.Runtime.ConsistentRing /// Note: MembershipOracle uses 'forward/counter-clockwise' definition to assign responsibilities. /// E.g. in a ring of nodes {5, 10, 15}, the responsible of key 7 is node 5 (the node is responsible for its succeeding range). /// - internal sealed class ConsistentRingProvider : + internal sealed partial class ConsistentRingProvider : IConsistentRingProvider, ISiloStatusListener, IDisposable { // internal, so that unit tests can access them @@ -81,7 +81,6 @@ internal void AddServer(SiloAddress silo) if (!(membershipRingList.Count == 0 || myOldIndex != -1)) throw new OrleansException(string.Format("{0}: Couldn't find my position in the ring {1}.", MyAddress, Utils.EnumerableToString(membershipRingList))); - // insert new silo in the sorted order int hash = silo.GetConsistentHashCode(); @@ -100,10 +99,7 @@ internal void AddServer(SiloAddress silo) NotifyLocalRangeSubscribers(oldRange, myRange, false); } - if (log.IsEnabled(LogLevel.Debug)) - { - log.LogDebug("Added Server {SiloAddress}. 
Current view: {CurrentView}", silo.ToStringWithHashCode(), this.ToString()); - } + LogDebugAddedServer(log, new(silo), this); } } @@ -140,12 +136,12 @@ internal void RemoveServer(SiloAddress silo) int myNewIndex = membershipRingList.IndexOf(MyAddress); if (myNewIndex == -1) - throw new OrleansException($"{MyAddress}: Couldn't find my position in the ring {this.ToString()}."); + throw new OrleansException($"{MyAddress}: Couldn't find my position in the ring {this}."); bool wasMyPred = ((myNewIndex == indexOfFailedSilo) || (myNewIndex == 0 && indexOfFailedSilo == membershipRingList.Count)); // no need for '- 1' if (wasMyPred) // failed node was our predecessor { - if (log.IsEnabled(LogLevel.Debug)) log.LogDebug("Failed server was my predecessor? {WasPredecessor}, updated view {CurrentView}", wasMyPred, this.ToString()); + LogDebugFailedServerWasMyPredecessor(log, wasMyPred, this); IRingRange oldRange = myRange; if (membershipRingList.Count == 1) // i'm the only one left @@ -163,14 +159,7 @@ internal void RemoveServer(SiloAddress silo) } } - if (log.IsEnabled(LogLevel.Debug)) - { - log.LogDebug( - "Removed Server {SiloAddress} hash {Hash}. Current view {CurrentView}", - silo, - silo.GetConsistentHashCode(), - this.ToString()); - } + LogDebugRemovedServer(log, silo, new(silo), this); } } @@ -199,10 +188,7 @@ public bool UnSubscribeFromRangeChangeEvents(IRingRangeListener observer) private void NotifyLocalRangeSubscribers(IRingRange old, IRingRange now, bool increased) { - if (log.IsEnabled(LogLevel.Debug)) - { - log.LogDebug("NotifyLocalRangeSubscribers about old {OldRange} new {NewRange} increased? 
{IsIncreased}", old, now, increased); - } + LogDebugNotifyLocalRangeSubscribers(log, old, now, increased); IRingRangeListener[] copy; lock (statusListeners) @@ -219,14 +205,7 @@ private void NotifyLocalRangeSubscribers(IRingRange old, IRingRange now, bool in } catch (Exception exc) { - log.LogWarning( - (int)ErrorCode.CRP_Local_Subscriber_Exception, - exc, - "Error notifying listener '{ListenerType}' of ring range {AdjustmentKind} from '{OldRange}' to '{NewRange}'.", - listener.GetType().FullName, - increased ? "expansion" : "contraction", - old, - now); + LogWarningErrorNotifyingListener(log, exc, listener.GetType().FullName, increased ? "expansion" : "contraction", old, now); } } } @@ -305,7 +284,7 @@ private SiloAddress CalculateTargetSilo(uint hash, bool excludeThisSiloIfStoppin } } - if (log.IsEnabled(LogLevel.Trace)) log.LogTrace("Silo {SiloAddress} calculated ring partition owner silo {OwnerAddress} for key {Key}: {Key} --> {OwnerHash}", MyAddress, siloAddress, hash, hash, siloAddress?.GetConsistentHashCode()); + LogTraceCalculatedRingPartitionOwner(log, MyAddress, siloAddress, hash, new(siloAddress)); return siloAddress; } @@ -318,5 +297,52 @@ public void Dispose() { _siloStatusOracle.UnSubscribeFromSiloStatusEvents(this); } + + private readonly struct SiloAddressWithHashLogRecord(SiloAddress siloAddress) + { + public override string ToString() => siloAddress.ToStringWithHashCode(); + } + + private readonly struct ConsistentHashCodeLogRecord(SiloAddress siloAddress) + { + public override string ToString() => siloAddress?.GetConsistentHashCode().ToString(); + } + + [LoggerMessage( + Level = LogLevel.Debug, + Message = "Added Server {SiloAddress}. Current view: {CurrentView}" + )] + private static partial void LogDebugAddedServer(ILogger logger, SiloAddressWithHashLogRecord siloAddress, ConsistentRingProvider currentView); + + [LoggerMessage( + Level = LogLevel.Debug, + Message = "Failed server was my predecessor? 
{WasPredecessor}, updated view {CurrentView}" + )] + private static partial void LogDebugFailedServerWasMyPredecessor(ILogger logger, bool wasPredecessor, ConsistentRingProvider currentView); + + [LoggerMessage( + Level = LogLevel.Debug, + Message = "Removed Server {SiloAddress} hash {Hash}. Current view {CurrentView}" + )] + private static partial void LogDebugRemovedServer(ILogger logger, SiloAddress siloAddress, ConsistentHashCodeLogRecord hash, ConsistentRingProvider currentView); + + [LoggerMessage( + Level = LogLevel.Debug, + Message = "NotifyLocalRangeSubscribers about old {OldRange} new {NewRange} increased? {IsIncreased}" + )] + private static partial void LogDebugNotifyLocalRangeSubscribers(ILogger logger, IRingRange oldRange, IRingRange newRange, bool isIncreased); + + [LoggerMessage( + EventId = (int)ErrorCode.CRP_Local_Subscriber_Exception, + Level = LogLevel.Warning, + Message = "Error notifying listener '{ListenerType}' of ring range {AdjustmentKind} from '{OldRange}' to '{NewRange}'." 
+ )] + private static partial void LogWarningErrorNotifyingListener(ILogger logger, Exception exception, string listenerType, string adjustmentKind, IRingRange oldRange, IRingRange newRange); + + [LoggerMessage( + Level = LogLevel.Trace, + Message = "Silo {SiloAddress} calculated ring partition owner silo {OwnerAddress} for key {Key} --> {OwnerHash}" + )] + private static partial void LogTraceCalculatedRingPartitionOwner(ILogger logger, SiloAddress siloAddress, SiloAddress ownerAddress, uint key, ConsistentHashCodeLogRecord ownerHash); } } diff --git a/src/Orleans.Runtime/GrainDirectory/AdaptiveDirectoryCacheMaintainer.cs b/src/Orleans.Runtime/GrainDirectory/AdaptiveDirectoryCacheMaintainer.cs index 2efaa85eaf1..466bb4f897b 100644 --- a/src/Orleans.Runtime/GrainDirectory/AdaptiveDirectoryCacheMaintainer.cs +++ b/src/Orleans.Runtime/GrainDirectory/AdaptiveDirectoryCacheMaintainer.cs @@ -10,7 +10,7 @@ namespace Orleans.Runtime.GrainDirectory { - internal sealed class AdaptiveDirectoryCacheMaintainer + internal sealed partial class AdaptiveDirectoryCacheMaintainer { private static readonly TimeSpan SLEEP_TIME_BETWEEN_REFRESHES = Debugger.IsAttached ? 
TimeSpan.FromMinutes(5) : TimeSpan.FromMinutes(1); // this should be something like minTTL/4 @@ -135,14 +135,7 @@ private async Task Run() } } - if (Log.IsEnabled(LogLevel.Trace)) - Log.LogTrace( - "Silo {SiloAddress} self-owned (and removed) {OwnedAndRemovedCount}, kept {KeptCount}, removed {RemovedCount} and tried to refresh {RefreshedCount} grains", - router.MyAddress, - ownedAndRemovedCount, - keptCount, - removedCount, - refreshedCount); + LogTraceSelfOwnedAndRemoved(Log, router.MyAddress, ownedAndRemovedCount, keptCount, removedCount, refreshedCount); // Send batch requests SendBatchCacheRefreshRequests(fetchInBatchList); @@ -174,7 +167,7 @@ private void SendBatchCacheRefreshRequests(Dictionary ProcessCacheRefreshResponse(silo, response); }).Ignore(); - if (Log.IsEnabled(LogLevel.Trace)) Log.LogTrace("Silo {SiloAddress} is sending request to silo {OwnerSilo} with {Count} entries", router.MyAddress, silo, cachedGrainAndETagList.Count); + LogTraceSendingRequest(Log, router.MyAddress, silo, cachedGrainAndETagList.Count); } } @@ -182,7 +175,7 @@ private void ProcessCacheRefreshResponse( SiloAddress silo, List refreshResponse) { - if (Log.IsEnabled(LogLevel.Trace)) Log.LogTrace("Silo {SiloAddress} received ProcessCacheRefreshResponse. 
#Response entries {Count}.", router.MyAddress, refreshResponse.Count); + LogTraceReceivedProcessCacheRefreshResponse(Log, router.MyAddress, refreshResponse.Count); int otherSiloCount = 0, updatedCount = 0, unchangedCount = 0; @@ -218,14 +211,7 @@ private void ProcessCacheRefreshResponse( } } - if (Log.IsEnabled(LogLevel.Trace)) - Log.LogTrace( - "Silo {SiloAddress} processed refresh response from {OtherSilo} with {UpdatedCount} updated, {RemovedCount} removed, {UnchangedCount} unchanged grains", - router.MyAddress, - silo, - otherSiloCount, - updatedCount, - unchangedCount); + LogTraceProcessedRefreshResponse(Log, router.MyAddress, silo, otherSiloCount, updatedCount, unchangedCount); } /// @@ -277,5 +263,29 @@ private void ProduceStats() lastNumAccesses = curNumAccesses; lastNumHits = curNumHits; } + + [LoggerMessage( + Level = LogLevel.Trace, + Message = "Silo {SiloAddress} self-owned (and removed) {OwnedAndRemovedCount}, kept {KeptCount}, removed {RemovedCount} and tried to refresh {RefreshedCount} grains" + )] + private static partial void LogTraceSelfOwnedAndRemoved(ILogger logger, SiloAddress siloAddress, int ownedAndRemovedCount, int keptCount, int removedCount, int refreshedCount); + + [LoggerMessage( + Level = LogLevel.Trace, + Message = "Silo {SiloAddress} is sending request to silo {OwnerSilo} with {Count} entries" + )] + private static partial void LogTraceSendingRequest(ILogger logger, SiloAddress siloAddress, SiloAddress ownerSilo, int count); + + [LoggerMessage( + Level = LogLevel.Trace, + Message = "Silo {SiloAddress} received ProcessCacheRefreshResponse. #Response entries {Count}." 
+ )] + private static partial void LogTraceReceivedProcessCacheRefreshResponse(ILogger logger, SiloAddress siloAddress, int count); + + [LoggerMessage( + Level = LogLevel.Trace, + Message = "Silo {SiloAddress} processed refresh response from {OtherSilo} with {UpdatedCount} updated, {RemovedCount} removed, {UnchangedCount} unchanged grains" + )] + private static partial void LogTraceProcessedRefreshResponse(ILogger logger, SiloAddress siloAddress, SiloAddress otherSilo, int updatedCount, int removedCount, int unchangedCount); } } diff --git a/src/Orleans.Runtime/MembershipService/ClusterHealthMonitor.cs b/src/Orleans.Runtime/MembershipService/ClusterHealthMonitor.cs index 3159d3d6f7d..5ccbbe89cd9 100644 --- a/src/Orleans.Runtime/MembershipService/ClusterHealthMonitor.cs +++ b/src/Orleans.Runtime/MembershipService/ClusterHealthMonitor.cs @@ -18,7 +18,7 @@ namespace Orleans.Runtime.MembershipService /// /// Responsible for ensuring that this silo monitors other silos in the cluster. /// - internal class ClusterHealthMonitor : IHealthCheckParticipant, ILifecycleParticipant, ClusterHealthMonitor.ITestAccessor, IDisposable, IAsyncDisposable + internal partial class ClusterHealthMonitor : IHealthCheckParticipant, ILifecycleParticipant, ClusterHealthMonitor.ITestAccessor, IDisposable, IAsyncDisposable { private readonly CancellationTokenSource shutdownCancellation = new CancellationTokenSource(); private readonly ILocalSiloDetails localSiloDetails; @@ -76,7 +76,7 @@ private async Task ProcessMembershipUpdates() { try { - if (this.log.IsEnabled(LogLevel.Debug)) this.log.LogDebug("Starting to process membership updates"); + LogDebugStartingToProcessMembershipUpdates(log); await foreach (var tableSnapshot in this.membershipService.MembershipTableUpdates.WithCancellation(this.shutdownCancellation.Token)) { var utcNow = DateTime.UtcNow; @@ -111,7 +111,7 @@ private async Task ProcessMembershipUpdates() } finally { - if (this.log.IsEnabled(LogLevel.Debug)) 
this.log.LogDebug("Stopped processing membership updates"); + LogDebugStoppedProcessingMembershipUpdates(log); } } @@ -129,16 +129,12 @@ private async Task EvictStaleStateSilos( { try { - if (this.log.IsEnabled(LogLevel.Debug)) this.log.LogDebug("Stale silo with a joining or created state found, calling 'TryToSuspectOrKill'"); + LogDebugStaleSiloFound(log); await this.membershipService.TryToSuspectOrKill(member.Key); } catch(Exception exception) { - log.LogError( - exception, - "Silo {SuspectAddress} has had the status '{SiloStatus}' for longer than 'MaxJoinAttemptTime' but a call to 'TryToSuspectOrKill' has failed", - member.Value.SiloAddress, - member.Value.Status.ToString()); + LogErrorTryToSuspectOrKillFailed(log, exception, member.Value.SiloAddress, member.Value.Status); } } } @@ -228,11 +224,7 @@ private ImmutableDictionary UpdateMonitoredSilos var myIndex = tmpList.FindIndex(el => el.SiloAddress.Equals(self.SiloAddress)); if (myIndex < 0) { - log.LogError( - (int)ErrorCode.Runtime_Error_100305, - "This silo {SiloAddress} status {Status} is not in its own local silo list! This is a bug!", - self.SiloAddress.ToString(), - self.Status); + LogErrorSiloNotInLocalList(log, self.SiloAddress, self.Status); throw new OrleansMissingMembershipEntryException( $"This silo {self.SiloAddress} status {self.Status} is not in its own local silo list! 
This is a bug!"); } @@ -267,11 +259,7 @@ private ImmutableDictionary UpdateMonitoredSilos var result = newProbedSilos.ToImmutable(); if (!AreTheSame(monitoredSilos, result)) { - log.LogInformation( - (int)ErrorCode.MembershipWatchList, - "Will watch (actively ping) {ProbedSiloCount} silos: {ProbedSilos}", - newProbedSilos.Count, - Utils.EnumerableToString(newProbedSilos.Keys)); + LogInformationWillWatchActivelyPing(log, newProbedSilos.Count, new(newProbedSilos.Keys)); } return result; @@ -368,7 +356,7 @@ public void Dispose() } catch (Exception exception) { - log.LogError(exception, "Error cancelling shutdown token."); + LogErrorCancellingShutdownToken(log, exception); } foreach (var monitor in monitoredSilos.Values) @@ -379,7 +367,7 @@ public void Dispose() } catch (Exception exception) { - log.LogError(exception, "Error disposing monitor for {SiloAddress}.", monitor.TargetSiloAddress); + LogErrorDisposingMonitorForSilo(log, exception, monitor.TargetSiloAddress); } } } @@ -392,7 +380,7 @@ public async ValueTask DisposeAsync() } catch (Exception exception) { - log.LogError(exception, "Error cancelling shutdown token."); + LogErrorCancellingShutdownToken(log, exception); } var tasks = new List(); @@ -404,11 +392,68 @@ public async ValueTask DisposeAsync() } catch (Exception exception) { - log.LogError(exception, "Error disposing monitor for {SiloAddress}.", monitor.TargetSiloAddress); + LogErrorDisposingMonitorForSilo(log, exception, monitor.TargetSiloAddress); } } await Task.WhenAll(tasks).ConfigureAwait(ConfigureAwaitOptions.SuppressThrowing); } + + // --- Logging methods --- + + [LoggerMessage( + Level = LogLevel.Debug, + Message = "Starting to process membership updates" + )] + private static partial void LogDebugStartingToProcessMembershipUpdates(ILogger logger); + + [LoggerMessage( + Level = LogLevel.Debug, + Message = "Stopped processing membership updates" + )] + private static partial void LogDebugStoppedProcessingMembershipUpdates(ILogger logger); + + 
[LoggerMessage( + Level = LogLevel.Debug, + Message = "Stale silo with a joining or created state found, calling 'TryToSuspectOrKill'" + )] + private static partial void LogDebugStaleSiloFound(ILogger logger); + + [LoggerMessage( + Level = LogLevel.Error, + Message = "Silo {SuspectAddress} has had the status '{SiloStatus}' for longer than 'MaxJoinAttemptTime' but a call to 'TryToSuspectOrKill' has failed" + )] + private static partial void LogErrorTryToSuspectOrKillFailed(ILogger logger, Exception exception, SiloAddress suspectAddress, SiloStatus siloStatus); + + [LoggerMessage( + EventId = (int)ErrorCode.Runtime_Error_100305, + Level = LogLevel.Error, + Message = "This silo {SiloAddress} status {Status} is not in its own local silo list! This is a bug!" + )] + private static partial void LogErrorSiloNotInLocalList(ILogger logger, SiloAddress siloAddress, SiloStatus status); + + private readonly struct ProbedSilosLogRecord(IEnumerable probedSilos) + { + public override string ToString() => Utils.EnumerableToString(probedSilos); + } + + [LoggerMessage( + EventId = (int)ErrorCode.MembershipWatchList, + Level = LogLevel.Information, + Message = "Will watch (actively ping) {ProbedSiloCount} silos: {ProbedSilos}" + )] + private static partial void LogInformationWillWatchActivelyPing(ILogger logger, int probedSiloCount, ProbedSilosLogRecord probedSilos); + + [LoggerMessage( + Level = LogLevel.Error, + Message = "Error cancelling shutdown token." + )] + private static partial void LogErrorCancellingShutdownToken(ILogger logger, Exception exception); + + [LoggerMessage( + Level = LogLevel.Error, + Message = "Error disposing monitor for {SiloAddress}." 
+ )] + private static partial void LogErrorDisposingMonitorForSilo(ILogger logger, Exception exception, SiloAddress siloAddress); } } diff --git a/src/Orleans.Runtime/MembershipService/ClusterMembershipService.cs b/src/Orleans.Runtime/MembershipService/ClusterMembershipService.cs index 11995b94264..ca03fe6b3ec 100644 --- a/src/Orleans.Runtime/MembershipService/ClusterMembershipService.cs +++ b/src/Orleans.Runtime/MembershipService/ClusterMembershipService.cs @@ -10,7 +10,7 @@ namespace Orleans.Runtime { - internal class ClusterMembershipService : IClusterMembershipService, ILifecycleParticipant, IDisposable + internal partial class ClusterMembershipService : IClusterMembershipService, ILifecycleParticipant, IDisposable { private readonly AsyncEnumerable updates; private readonly MembershipTableManager membershipTableManager; @@ -81,7 +81,7 @@ private async Task ProcessMembershipUpdates(CancellationToken ct) { try { - if (this.log.IsEnabled(LogLevel.Debug)) this.log.LogDebug("Starting to process membership updates"); + LogDebugStartingToProcessMembershipUpdates(log); await foreach (var tableSnapshot in this.membershipTableManager.MembershipTableUpdates.WithCancellation(ct)) { this.updates.TryPublish(tableSnapshot.CreateClusterMembershipSnapshot()); @@ -93,12 +93,12 @@ private async Task ProcessMembershipUpdates(CancellationToken ct) } catch (Exception exception) when (this.fatalErrorHandler.IsUnexpected(exception)) { - this.log.LogError(exception, "Error processing membership updates"); + LogErrorProcessingMembershipUpdates(log, exception); this.fatalErrorHandler.OnFatalException(this, nameof(ProcessMembershipUpdates), exception); } finally { - if (this.log.IsEnabled(LogLevel.Debug)) this.log.LogDebug("Stopping membership update processor"); + LogDebugStoppingMembershipUpdateProcessor(log); } } @@ -130,5 +130,23 @@ void IDisposable.Dispose() { this.updates.Dispose(); } + + [LoggerMessage( + Level = LogLevel.Debug, + Message = "Starting to process membership updates" 
+ )] + private static partial void LogDebugStartingToProcessMembershipUpdates(ILogger logger); + + [LoggerMessage( + Level = LogLevel.Error, + Message = "Error processing membership updates" + )] + private static partial void LogErrorProcessingMembershipUpdates(ILogger logger, Exception exception); + + [LoggerMessage( + Level = LogLevel.Debug, + Message = "Stopping membership update processor" + )] + private static partial void LogDebugStoppingMembershipUpdateProcessor(ILogger logger); } } diff --git a/src/Orleans.Runtime/MembershipService/LocalSiloHealthMonitor.cs b/src/Orleans.Runtime/MembershipService/LocalSiloHealthMonitor.cs index 2f46bb623b5..9a6f4e56d22 100644 --- a/src/Orleans.Runtime/MembershipService/LocalSiloHealthMonitor.cs +++ b/src/Orleans.Runtime/MembershipService/LocalSiloHealthMonitor.cs @@ -3,7 +3,6 @@ using System; using System.Collections.Generic; using System.Collections.Immutable; -using System.Diagnostics; using System.Linq; using System.Threading; using System.Threading.Tasks; @@ -50,14 +49,14 @@ internal interface ILocalSiloHealthMonitor /// Check that local async timers have been firing on-time (within 3 seconds of their due time). 
/// /// - internal class LocalSiloHealthMonitor : ILifecycleParticipant, ILifecycleObserver, ILocalSiloHealthMonitor + internal partial class LocalSiloHealthMonitor : ILifecycleParticipant, ILifecycleObserver, ILocalSiloHealthMonitor { private const int MaxScore = 8; private readonly List _healthCheckParticipants; private readonly MembershipTableManager _membershipTableManager; private readonly ClusterHealthMonitor _clusterHealthMonitor; private readonly ILocalSiloDetails _localSiloDetails; - private readonly ILogger _log; + private readonly ILogger _log; private readonly ClusterMembershipOptions _clusterMembershipOptions; private readonly IAsyncTimer _degradationCheckTimer; private readonly ThreadPoolMonitor _threadPoolMonitor; @@ -83,7 +82,7 @@ public LocalSiloHealthMonitor( _membershipTableManager = membershipTableManager; _clusterHealthMonitor = clusterHealthMonitor; _localSiloDetails = localSiloDetails; - _log = log; + _log = log; _probeRequestMonitor = probeRequestMonitor; _clusterMembershipOptions = clusterMembershipOptions.Value; _degradationCheckTimer = timerFactory.Create( @@ -145,10 +144,7 @@ private int CheckThreadPoolQueueDelay(DateTime checkTime, List? complain { // Log as an error if the delay is massive. var logLevel = (int)threadPoolDelaySeconds >= 10 ? LogLevel.Error : LogLevel.Warning; - _log.Log( - logLevel, - ".NET Thread Pool is exhibiting delays of {ThreadPoolQueueDelaySeconds}s. This can indicate .NET Thread Pool starvation, very long .NET GC pauses, or other runtime or machine pauses.", - threadPoolDelaySeconds); + LogThreadPoolDelay(logLevel, threadPoolDelaySeconds); complaints?.Add( $".NET Thread Pool is exhibiting delays of {threadPoolDelaySeconds}s. This can indicate .NET Thread Pool starvation, very long .NET GC pauses, or other runtime or machine pauses."); @@ -166,7 +162,7 @@ private int CheckSuspectingNodes(DateTime now, List? 
complaints) { if (membershipEntry.Status != SiloStatus.Active) { - _log.LogWarning("This silo is not active (Status: {Status}) and is therefore not healthy.", membershipEntry.Status); + LogSiloNotActive(membershipEntry.Status); complaints?.Add($"This silo is not active (Status: {membershipEntry.Status}) and is therefore not healthy."); score = MaxScore; @@ -179,7 +175,7 @@ private int CheckSuspectingNodes(DateTime now, List? complaints) { if (membershipSnapshot.GetSiloStatus(vote.Item1) == SiloStatus.Active) { - _log.LogWarning("Silo {Silo} recently suspected this silo is dead at {SuspectingTime}.", vote.Item1, vote.Item2); + LogSiloSuspected(vote.Item1, vote.Item2); complaints?.Add($"Silo {vote.Item1} recently suspected this silo is dead at {vote.Item2}."); ++score; @@ -188,8 +184,7 @@ private int CheckSuspectingNodes(DateTime now, List? complaints) } else { - // If our entry is not found, this node is not healthy. - _log.LogError("Could not find a membership entry for this silo"); + LogMembershipEntryNotFound(); complaints?.Add("Could not find a membership entry for this silo"); score = MaxScore; @@ -212,7 +207,7 @@ private int CheckReceivedProbeRequests(DateTime now, List? complaints) var recencyWindow = _clusterMembershipOptions.ProbeTimeout.Multiply(_clusterMembershipOptions.NumMissedProbesLimit); if (!sinceLastProbeRequest.HasValue) { - _log.LogWarning("This silo has not received any probe requests"); + LogNoProbeRequests(); complaints?.Add("This silo has not received any probe requests"); ++score; } @@ -220,7 +215,7 @@ private int CheckReceivedProbeRequests(DateTime now, List? complaints) { // This node has not received a successful ping response since the window began. 
var lastRequestTime = now - sinceLastProbeRequest.Value; - _log.LogWarning("This silo has not received a probe request since {LastProbeRequest}", lastRequestTime); + LogNoRecentProbeRequest(lastRequestTime); complaints?.Add($"This silo has not received a probe request since {lastRequestTime}"); ++score; } @@ -251,14 +246,14 @@ private int CheckReceivedProbeResponses(DateTime now, List? complaints) var recencyWindow = _clusterMembershipOptions.ProbeTimeout.Multiply(_clusterMembershipOptions.NumMissedProbesLimit); if (!elapsedSinceLastResponse.HasValue) { - _log.LogWarning("This silo has not received any successful probe responses"); + LogNoProbeResponses(); complaints?.Add("This silo has not received any successful probe responses"); ++score; } else if (elapsedSinceLastResponse.Value > recencyWindow) { // This node has not received a successful ping response since the window began. - _log.LogWarning("This silo has not received a successful probe response since {LastSuccessfulResponse}", elapsedSinceLastResponse.Value); + LogNoRecentProbeResponse(elapsedSinceLastResponse.Value); complaints?.Add($"This silo has not received a successful probe response since {elapsedSinceLastResponse.Value}"); ++score; } @@ -277,16 +272,16 @@ private int CheckLocalHealthCheckParticipants(DateTime now, List? 
compla { if (!participant.CheckHealth(_lastHealthCheckTime, out var reason)) { - _log.LogWarning("Health check participant {Participant} is reporting that it is unhealthy with complaint: {Reason}", participant.GetType().ToString(), reason); - complaints?.Add($"Health check participant {participant.GetType().ToString()} is reporting that it is unhealthy with complaint: {reason}"); + LogHealthCheckParticipantUnhealthy(participant.GetType(), reason); + complaints?.Add($"Health check participant {participant.GetType()} is reporting that it is unhealthy with complaint: {reason}"); ++score; } } catch (Exception exception) { - _log.LogError(exception, "Error checking health for {Participant}", participant.GetType().ToString()); - complaints?.Add($"Error checking health for participant {participant.GetType().ToString()}: {LogFormatter.PrintException(exception)}"); + LogHealthCheckParticipantError(exception, participant.GetType()); + complaints?.Add($"Error checking health for participant {participant.GetType()}: {LogFormatter.PrintException(exception)}"); ++score; } @@ -308,14 +303,14 @@ private async Task Run() if (score > 0) { var complaintsString = string.Join("\n", complaints); - _log.LogWarning("Self-monitoring determined that local health is degraded. Degradation score is {Score}/{MaxScore} (lower is better). Complaints: {Complaints}", score, MaxScore, complaintsString); + LogSelfMonitoringDegraded(score, MaxScore, complaintsString); } this.Complaints = ImmutableArray.CreateRange(complaints); } catch (Exception exception) { - _log.LogError(exception, "Error while monitoring local silo health"); + LogErrorMonitoringLocalSiloHealth(exception); } } } @@ -406,5 +401,76 @@ private void Execute() } } } + + [LoggerMessage( + Message = ".NET Thread Pool is exhibiting delays of {ThreadPoolQueueDelaySeconds}s. This can indicate .NET Thread Pool starvation, very long .NET GC pauses, or other runtime or machine pauses." 
+ )] + private partial void LogThreadPoolDelay(LogLevel logLevel, double threadPoolQueueDelaySeconds); + + [LoggerMessage( + Level = LogLevel.Warning, + Message = "This silo is not active (Status: {Status}) and is therefore not healthy." + )] + private partial void LogSiloNotActive(SiloStatus status); + + [LoggerMessage( + Level = LogLevel.Warning, + Message = "Silo {Silo} recently suspected this silo is dead at {SuspectingTime}." + )] + private partial void LogSiloSuspected(SiloAddress silo, DateTime suspectingTime); + + [LoggerMessage( + Level = LogLevel.Error, + Message = "Could not find a membership entry for this silo" + )] + private partial void LogMembershipEntryNotFound(); + + [LoggerMessage( + Level = LogLevel.Warning, + Message = "This silo has not received any probe requests" + )] + private partial void LogNoProbeRequests(); + + [LoggerMessage( + Level = LogLevel.Warning, + Message = "This silo has not received a probe request since {LastProbeRequest}" + )] + private partial void LogNoRecentProbeRequest(DateTime lastProbeRequest); + + [LoggerMessage( + Level = LogLevel.Warning, + Message = "This silo has not received any successful probe responses" + )] + private partial void LogNoProbeResponses(); + + [LoggerMessage( + Level = LogLevel.Warning, + Message = "This silo has not received a successful probe response since {LastSuccessfulResponse}" + )] + private partial void LogNoRecentProbeResponse(TimeSpan lastSuccessfulResponse); + + [LoggerMessage( + Level = LogLevel.Warning, + Message = "Health check participant {Participant} is reporting that it is unhealthy with complaint: {Reason}" + )] + private partial void LogHealthCheckParticipantUnhealthy(Type participant, string reason); + + [LoggerMessage( + Level = LogLevel.Error, + Message = "Error checking health for {Participant}" + )] + private partial void LogHealthCheckParticipantError(Exception exception, Type participant); + + [LoggerMessage( + Level = LogLevel.Warning, + Message = "Self-monitoring 
determined that local health is degraded. Degradation score is {Score}/{MaxScore} (lower is better). Complaints: {Complaints}" + )] + private partial void LogSelfMonitoringDegraded(int score, int maxScore, string complaints); + + [LoggerMessage( + Level = LogLevel.Error, + Message = "Error while monitoring local silo health" + )] + private partial void LogErrorMonitoringLocalSiloHealth(Exception exception); } } diff --git a/src/Orleans.Runtime/MembershipService/MembershipAgent.cs b/src/Orleans.Runtime/MembershipService/MembershipAgent.cs index cee156274e8..aa03eae7003 100644 --- a/src/Orleans.Runtime/MembershipService/MembershipAgent.cs +++ b/src/Orleans.Runtime/MembershipService/MembershipAgent.cs @@ -13,7 +13,7 @@ namespace Orleans.Runtime.MembershipService /// /// Responsible for updating membership table with details about the local silo. /// - internal class MembershipAgent : IHealthCheckParticipant, ILifecycleParticipant, IDisposable, MembershipAgent.ITestAccessor + internal partial class MembershipAgent : IHealthCheckParticipant, ILifecycleParticipant, IDisposable, MembershipAgent.ITestAccessor { private static readonly TimeSpan EXP_BACKOFF_CONTENTION_MIN = TimeSpan.FromMilliseconds(200); private static readonly TimeSpan EXP_BACKOFF_CONTENTION_MAX = TimeSpan.FromMinutes(2); @@ -59,7 +59,7 @@ internal interface ITestAccessor private async Task UpdateIAmAlive() { - if (this.log.IsEnabled(LogLevel.Debug)) this.log.LogDebug("Starting periodic membership liveness timestamp updates"); + LogDebugStartingPeriodicMembershipLivenessTimestampUpdates(); try { // jitter for initial @@ -73,18 +73,14 @@ private async Task UpdateIAmAlive() var stopwatch = ValueStopwatch.StartNew(); ((ITestAccessor)this).OnUpdateIAmAlive?.Invoke(); await this.tableManager.UpdateIAmAlive(); - if (this.log.IsEnabled(LogLevel.Trace)) this.log.LogTrace("Updating IAmAlive took {Elapsed}", stopwatch.Elapsed); + LogTraceUpdatingIAmAliveTook(stopwatch.Elapsed); overrideDelayPeriod = default; 
runningFailures = 0; } catch (Exception exception) { runningFailures += 1; - this.log.LogWarning( - (int)ErrorCode.MembershipUpdateIAmAliveFailure, - exception, - "Failed to update table entry for this silo, will retry shortly"); - + LogWarningFailedToUpdateTableEntryForThisSilo(exception); // Retry quickly and then exponentially back off overrideDelayPeriod = exponentialBackoff.Next(runningFailures); } @@ -92,36 +88,28 @@ private async Task UpdateIAmAlive() } catch (Exception exception) when (this.fatalErrorHandler.IsUnexpected(exception)) { - this.log.LogError(exception, "Error updating liveness timestamp"); + LogErrorErrorUpdatingLivenessTimestamp(exception); this.fatalErrorHandler.OnFatalException(this, nameof(UpdateIAmAlive), exception); } finally { - if (this.log.IsEnabled(LogLevel.Debug)) this.log.LogDebug("Stopping periodic membership liveness timestamp updates"); + LogDebugStoppingPeriodicMembershipLivenessTimestampUpdates(); } } private async Task BecomeActive() { - this.log.LogInformation( - (int)ErrorCode.MembershipBecomeActive, - "-BecomeActive"); - + LogInformationBecomeActive(); await this.ValidateInitialConnectivity(); try { await this.UpdateStatus(SiloStatus.Active); - this.log.LogInformation( - (int)ErrorCode.MembershipFinishBecomeActive, - "-Finished BecomeActive."); + LogInformationFinishedBecomeActive(); } catch (Exception exception) { - this.log.LogInformation( - (int)ErrorCode.MembershipFailedToBecomeActive, - exception, - "BecomeActive failed"); + LogInformationBecomeActiveFailed(exception); throw; } } @@ -156,18 +144,7 @@ private async Task ValidateInitialConnectivity() // If there were no failures, terminate the loop and return without error. if (failedSilos.Count == 0) break; - this.log.LogError( - (int)ErrorCode.MembershipJoiningPreconditionFailure, - "Failed to get ping responses from {FailedCount} of {ActiveCount} active silos. 
" - + "Newly joining silos validate connectivity with all active silos that have recently updated their 'I Am Alive' value before joining the cluster. " - + "Successfully contacted: {SuccessfulSilos}. Silos which did not respond successfully are: {FailedSilos}. " - + "Will continue attempting to validate connectivity until {Timeout}. Attempt #{Attempt}", - failedSilos.Count, - activeSilos.Count, - Utils.EnumerableToString(successfulSilos), - Utils.EnumerableToString(failedSilos), - attemptUntil, - attemptNumber); + LogErrorFailedToGetPingResponses(failedSilos.Count, activeSilos.Count, new(successfulSilos), new(failedSilos), attemptUntil, attemptNumber); if (now + TimeSpan.FromSeconds(5) > attemptUntil) { @@ -184,7 +161,7 @@ private async Task ValidateInitialConnectivity() } catch (Exception exception) when (canContinue) { - this.log.LogError(exception, "Failed to validate initial cluster connectivity"); + LogErrorFailedToValidateInitialClusterConnectivity(exception); await Task.Delay(TimeSpan.FromSeconds(1)); } @@ -198,11 +175,7 @@ async Task> CheckClusterConnectivity(SiloAddress[] members) var tasks = new List>(members.Length); - this.log.LogInformation( - (int)ErrorCode.MembershipSendingPreJoinPing, - "About to send pings to {Count} nodes in order to validate communication in the Joining state. 
Pinged nodes = {Nodes}", - members.Length, - Utils.EnumerableToString(members)); + LogInformationAboutToSendPings(members.Length, new EnumerableToStringLogValue(members)); var timeout = this.clusterMembershipOptions.ProbeTimeout; foreach (var silo in members) @@ -244,52 +217,43 @@ static async Task ProbeSilo(IRemoteSiloProber siloProber, SiloAddress silo exception = ex; } - log.LogWarning(exception, "Did not receive a probe response from silo {SiloAddress} in timeout {Timeout}", silo.ToString(), timeout); + LogWarningDidNotReceiveProbeResponse(log, exception, silo, timeout); return false; } } private async Task BecomeJoining() { - this.log.LogInformation((int)ErrorCode.MembershipJoining, "Joining"); + LogInformationJoining(); try { await this.UpdateStatus(SiloStatus.Joining); } catch (Exception exc) { - this.log.LogError( - (int)ErrorCode.MembershipFailedToJoin, - exc, - "Error updating status to Joining"); + LogErrorErrorUpdatingStatusToJoining(exc); throw; } } private async Task BecomeShuttingDown() { - if (this.log.IsEnabled(LogLevel.Debug)) - { - this.log.LogDebug((int)ErrorCode.MembershipShutDown, "-Shutdown"); - } - + LogDebugShutdown(); + try { await this.UpdateStatus(SiloStatus.ShuttingDown); } catch (Exception exc) { - this.log.LogError((int)ErrorCode.MembershipFailedToShutdown, exc, "Error updating status to ShuttingDown"); + LogErrorErrorUpdatingStatusToShuttingDown(exc); throw; } } private async Task BecomeStopping() { - if (this.log.IsEnabled(LogLevel.Debug)) - { - log.LogDebug((int)ErrorCode.MembershipStop, "-Stop"); - } + LogDebugStop(); try { @@ -297,19 +261,14 @@ private async Task BecomeStopping() } catch (Exception exc) { - log.LogError((int)ErrorCode.MembershipFailedToStop, exc, "Error updating status to Stopping"); + LogErrorErrorUpdatingStatusToStopping(exc); throw; } } private async Task BecomeDead() { - if (this.log.IsEnabled(LogLevel.Debug)) - { - this.log.LogDebug( - (int)ErrorCode.MembershipKillMyself, - "Updating status to Dead"); - } + 
LogDebugUpdatingStatusToDead(); try { @@ -317,10 +276,7 @@ private async Task BecomeDead() } catch (Exception exception) { - this.log.LogError( - (int)ErrorCode.MembershipFailedToKillMyself, - exception, - "Failure updating status to " + nameof(SiloStatus.Dead)); + LogErrorFailureUpdatingStatusToDead(exception); throw; } } @@ -416,5 +372,147 @@ public void Dispose() } bool IHealthCheckable.CheckHealth(DateTime lastCheckTime, out string reason) => this.iAmAliveTimer.CheckHealth(lastCheckTime, out reason); + + private readonly struct EnumerableToStringLogValue(IEnumerable enumerable) + { + public override string ToString() => Utils.EnumerableToString(enumerable); + } + + [LoggerMessage( + Level = LogLevel.Debug, + Message = "Starting periodic membership liveness timestamp updates" + )] + private partial void LogDebugStartingPeriodicMembershipLivenessTimestampUpdates(); + + [LoggerMessage( + Level = LogLevel.Trace, + Message = "Updating IAmAlive took {Elapsed}" + )] + private partial void LogTraceUpdatingIAmAliveTook(TimeSpan elapsed); + + [LoggerMessage( + EventId = (int)ErrorCode.MembershipUpdateIAmAliveFailure, + Level = LogLevel.Warning, + Message = "Failed to update table entry for this silo, will retry shortly" + )] + private partial void LogWarningFailedToUpdateTableEntryForThisSilo(Exception exception); + + [LoggerMessage( + Level = LogLevel.Debug, + Message = "Stopping periodic membership liveness timestamp updates" + )] + private partial void LogDebugStoppingPeriodicMembershipLivenessTimestampUpdates(); + + [LoggerMessage( + Level = LogLevel.Error, + Message = "Error updating liveness timestamp" + )] + private partial void LogErrorErrorUpdatingLivenessTimestamp(Exception exception); + + [LoggerMessage( + EventId = (int)ErrorCode.MembershipBecomeActive, + Level = LogLevel.Information, + Message = "-BecomeActive" + )] + private partial void LogInformationBecomeActive(); + + [LoggerMessage( + EventId = (int)ErrorCode.MembershipFinishBecomeActive, + Level = 
LogLevel.Information, + Message = "-Finished BecomeActive." + )] + private partial void LogInformationFinishedBecomeActive(); + + [LoggerMessage( + EventId = (int)ErrorCode.MembershipFailedToBecomeActive, + Level = LogLevel.Information, + Message = "BecomeActive failed" + )] + private partial void LogInformationBecomeActiveFailed(Exception exception); + + [LoggerMessage( + EventId = (int)ErrorCode.MembershipJoiningPreconditionFailure, + Level = LogLevel.Error, + Message = "Failed to get ping responses from {FailedCount} of {ActiveCount} active silos. " + + "Newly joining silos validate connectivity with all active silos that have recently updated their 'I Am Alive' value before joining the cluster. " + + "Successfully contacted: {SuccessfulSilos}. Silos which did not respond successfully are: {FailedSilos}. " + + "Will continue attempting to validate connectivity until {Timeout}. Attempt #{Attempt}" + )] + private partial void LogErrorFailedToGetPingResponses(int failedCount, int activeCount, EnumerableToStringLogValue successfulSilos, EnumerableToStringLogValue failedSilos, DateTime timeout, int attempt); + + [LoggerMessage( + Level = LogLevel.Error, + Message = "Failed to validate initial cluster connectivity" + )] + private partial void LogErrorFailedToValidateInitialClusterConnectivity(Exception exception); + + [LoggerMessage( + EventId = (int)ErrorCode.MembershipSendingPreJoinPing, + Level = LogLevel.Information, + Message = "About to send pings to {Count} nodes in order to validate communication in the Joining state. 
Pinged nodes = {Nodes}" + )] + private partial void LogInformationAboutToSendPings(int count, EnumerableToStringLogValue nodes); + + [LoggerMessage( + Level = LogLevel.Warning, + Message = "Did not receive a probe response from silo {SiloAddress} in timeout {Timeout}" + )] + private static partial void LogWarningDidNotReceiveProbeResponse(ILogger logger, Exception exception, SiloAddress siloAddress, TimeSpan timeout); + + [LoggerMessage( + EventId = (int)ErrorCode.MembershipJoining, + Level = LogLevel.Information, + Message = "Joining" + )] + private partial void LogInformationJoining(); + + [LoggerMessage( + EventId = (int)ErrorCode.MembershipFailedToJoin, + Level = LogLevel.Error, + Message = "Error updating status to Joining" + )] + private partial void LogErrorErrorUpdatingStatusToJoining(Exception exception); + + [LoggerMessage( + EventId = (int)ErrorCode.MembershipShutDown, + Level = LogLevel.Debug, + Message = "-Shutdown" + )] + private partial void LogDebugShutdown(); + + [LoggerMessage( + EventId = (int)ErrorCode.MembershipFailedToShutdown, + Level = LogLevel.Error, + Message = "Error updating status to ShuttingDown" + )] + private partial void LogErrorErrorUpdatingStatusToShuttingDown(Exception exception); + + [LoggerMessage( + EventId = (int)ErrorCode.MembershipStop, + Level = LogLevel.Debug, + Message = "-Stop" + )] + private partial void LogDebugStop(); + + [LoggerMessage( + EventId = (int)ErrorCode.MembershipFailedToStop, + Level = LogLevel.Error, + Message = "Error updating status to Stopping" + )] + private partial void LogErrorErrorUpdatingStatusToStopping(Exception exception); + + [LoggerMessage( + EventId = (int)ErrorCode.MembershipKillMyself, + Level = LogLevel.Debug, + Message = "Updating status to Dead" + )] + private partial void LogDebugUpdatingStatusToDead(); + + [LoggerMessage( + EventId = (int)ErrorCode.MembershipFailedToKillMyself, + Level = LogLevel.Error, + Message = "Failure updating status to Dead" + )] + private partial void 
LogErrorFailureUpdatingStatusToDead(Exception exception); } } diff --git a/src/Orleans.Runtime/MembershipService/MembershipSystemTarget.cs b/src/Orleans.Runtime/MembershipService/MembershipSystemTarget.cs index 2c9bc6192c0..19509a4b6d0 100644 --- a/src/Orleans.Runtime/MembershipService/MembershipSystemTarget.cs +++ b/src/Orleans.Runtime/MembershipService/MembershipSystemTarget.cs @@ -7,7 +7,7 @@ namespace Orleans.Runtime.MembershipService { - internal sealed class MembershipSystemTarget : SystemTarget, IMembershipService, ILifecycleParticipant + internal sealed partial class MembershipSystemTarget : SystemTarget, IMembershipService, ILifecycleParticipant { private readonly MembershipTableManager membershipTableManager; private readonly ILogger log; @@ -36,9 +36,7 @@ public async Task MembershipChangeNotification(MembershipTableSnapshot snapshot) } else { - if (this.log.IsEnabled(LogLevel.Trace)) - this.log.LogTrace("-Received GOSSIP MembershipChangeNotification with MembershipVersion.MinValue. 
Going to read the table"); - + LogTraceReceivedGossipMembershipChangeNotificationWithMinValue(this.log); await ReadTable(); } } @@ -88,7 +86,7 @@ public async Task ProbeIndirectly(SiloAddress target, Tim } catch (TimeoutException exception) { - log.LogWarning(exception, "Requested probe timeout {ProbeTimeout} exceeded", probeTimeout); + LogWarningRequestedProbeTimeoutExceeded(this.log, exception, probeTimeout); throw; } @@ -139,14 +137,7 @@ private async Task GossipToRemoteSilo( SiloAddress updatedSilo, SiloStatus updatedStatus) { - if (this.log.IsEnabled(LogLevel.Trace)) - { - this.log.LogTrace( - "-Sending status update GOSSIP notification about silo {UpdatedSilo}, status {UpdatedStatus}, to silo {RemoteSilo}", - updatedSilo, - updatedStatus, - silo); - } + LogTraceSendingStatusUpdateGossipNotification(this.log, updatedSilo, updatedStatus, silo); try { @@ -155,11 +146,7 @@ private async Task GossipToRemoteSilo( } catch (Exception exception) { - this.log.LogWarning( - (int)ErrorCode.MembershipGossipSendFailure, - exception, - "Error sending gossip notification to remote silo '{Silo}'.", - silo); + LogWarningErrorSendingGossipNotificationToRemoteSilo(this.log, exception, silo); } } @@ -171,10 +158,7 @@ private async Task ReadTable() } catch (Exception exception) { - this.log.LogError( - (int)ErrorCode.MembershipGossipProcessingFailure, - exception, - "Error refreshing membership table."); + LogErrorErrorRefreshingMembershipTable(this.log, exception); } } @@ -202,5 +186,37 @@ void ILifecycleParticipant.Participate(ISiloLifecycle observer) { // No-op, just ensure this instance is created at start-up. } + + [LoggerMessage( + Level = LogLevel.Trace, + Message = "-Received GOSSIP MembershipChangeNotification with MembershipVersion.MinValue. 
Going to read the table" + )] + private static partial void LogTraceReceivedGossipMembershipChangeNotificationWithMinValue(ILogger logger); + + [LoggerMessage( + Level = LogLevel.Warning, + Message = "Requested probe timeout {ProbeTimeout} exceeded" + )] + private static partial void LogWarningRequestedProbeTimeoutExceeded(ILogger logger, Exception exception, TimeSpan probeTimeout); + + [LoggerMessage( + Level = LogLevel.Trace, + Message = "-Sending status update GOSSIP notification about silo {UpdatedSilo}, status {UpdatedStatus}, to silo {RemoteSilo}" + )] + private static partial void LogTraceSendingStatusUpdateGossipNotification(ILogger logger, SiloAddress updatedSilo, SiloStatus updatedStatus, SiloAddress remoteSilo); + + [LoggerMessage( + EventId = (int)ErrorCode.MembershipGossipSendFailure, + Level = LogLevel.Warning, + Message = "Error sending gossip notification to remote silo '{Silo}'." + )] + private static partial void LogWarningErrorSendingGossipNotificationToRemoteSilo(ILogger logger, Exception exception, SiloAddress silo); + + [LoggerMessage( + EventId = (int)ErrorCode.MembershipGossipProcessingFailure, + Level = LogLevel.Error, + Message = "Error refreshing membership table." 
+ )] + private static partial void LogErrorErrorRefreshingMembershipTable(ILogger logger, Exception exception); } } diff --git a/src/Orleans.Runtime/MembershipService/MembershipTableManager.cs b/src/Orleans.Runtime/MembershipService/MembershipTableManager.cs index d03877840d2..608aa1b488e 100644 --- a/src/Orleans.Runtime/MembershipService/MembershipTableManager.cs +++ b/src/Orleans.Runtime/MembershipService/MembershipTableManager.cs @@ -15,7 +15,7 @@ namespace Orleans.Runtime.MembershipService { - internal class MembershipTableManager : IHealthCheckParticipant, ILifecycleParticipant, IDisposable + internal partial class MembershipTableManager : IHealthCheckParticipant, ILifecycleParticipant, IDisposable { private const int NUM_CONDITIONAL_WRITE_CONTENTION_ATTEMPTS = -1; // unlimited private const int NUM_CONDITIONAL_WRITE_ERROR_ATTEMPTS = -1; @@ -117,16 +117,13 @@ public async Task RefreshFromSnapshot(MembershipTableSnapshot snapshot) await pending; } - this.log.LogInformation("Received cluster membership snapshot via gossip: {Snapshot}", snapshot); + LogInformationReceivedClusterMembershipSnapshot(this.log, snapshot); if (snapshot.Entries.TryGetValue(this.myAddress, out var localSiloEntry)) { if (localSiloEntry.Status == SiloStatus.Dead && this.CurrentStatus != SiloStatus.Dead) { - this.log.LogWarning( - (int)ErrorCode.MembershipFoundMyselfDead1, - "I should be Dead according to membership table (in RefreshFromSnapshot). Local entry: {Entry}.", - localSiloEntry.ToFullString()); + LogWarningFoundMyselfDeadInRefreshFromSnapshot(this.log, localSiloEntry.ToFullString()); this.KillMyselfLocally($"I should be Dead according to membership table (in RefreshFromSnapshot). 
Local entry: {(localSiloEntry.ToFullString())}."); } } @@ -147,9 +144,7 @@ private async Task RefreshInternal(bool requireCleanup) catch (Exception exception) when (!requireCleanup) { success = false; - this.log.LogWarning( - exception, - "Exception while trying to clean up my table entries"); + LogWarningExceptionWhileCleaningUpTableEntries(this.log, exception); } // If cleanup was not required then the cleanup result is ignored. @@ -160,12 +155,7 @@ private async Task Start() { try { - this.log.LogInformation( - (int)ErrorCode.MembershipStarting, - "MembershipOracle starting on host {HostName} with SiloAddress {SiloAddress} at {StartTime}", - this.localSiloDetails.DnsHostName, - this.myAddress, - LogFormatter.PrintDate(this.siloStartTime)); + LogInformationMembershipStarting(this.log, this.localSiloDetails.DnsHostName, this.myAddress, LogFormatter.PrintDate(this.siloStartTime)); // Init the membership table. await this.membershipTableProvider.InitializeMembershipTable(true); @@ -191,7 +181,7 @@ private async Task Start() } catch (Exception exception) { - this.log.LogError((int)ErrorCode.MembershipFailedToStart, exception, "Membership failed to start"); + LogErrorMembershipFailedToStart(this.log, exception); throw; } } @@ -225,31 +215,18 @@ private void DetectNodeMigration(MembershipTableSnapshot snapshot, string myHost bool physicalHostChanged = !myHostname.Equals(mostRecentPreviousEntry.HostName) || !myAddress.Endpoint.Equals(mostRecentPreviousEntry.SiloAddress.Endpoint); if (physicalHostChanged) { - log.LogWarning( - (int)ErrorCode.MembershipNodeMigrated, - "Silo {SiloName} migrated to host {HostName} silo address {SiloAddress} from host {PreviousHostName} silo address {PreviousSiloAddress}.", - mySiloName, - myHostname, - myAddress, - mostRecentPreviousEntry.HostName, - mostRecentPreviousEntry.SiloAddress); + LogWarningNodeMigrated(this.log, mySiloName, myHostname, myAddress, mostRecentPreviousEntry.HostName, mostRecentPreviousEntry.SiloAddress); } else { - 
log.LogWarning( - (int)ErrorCode.MembershipNodeRestarted, - "Silo {SiloName} restarted on same host {HostName} with silo address = {SiloAddress} Previous silo address = {PreviousSiloAddress}", - mySiloName, - myHostname, - myAddress, - mostRecentPreviousEntry.SiloAddress); + LogWarningNodeRestarted(this.log, mySiloName, myHostname, myAddress, mostRecentPreviousEntry.SiloAddress); } } } private async Task PeriodicallyRefreshMembershipTable() { - if (this.log.IsEnabled(LogLevel.Debug)) this.log.LogDebug("Starting periodic membership table refreshes"); + LogDebugStartingPeriodicMembershipTableRefreshes(this.log); try { // jitter for initial @@ -261,9 +238,9 @@ private async Task PeriodicallyRefreshMembershipTable() try { var stopwatch = ValueStopwatch.StartNew(); - + await this.Refresh(); - if (this.log.IsEnabled(LogLevel.Trace)) this.log.LogTrace("Refreshing membership table took {Elapsed}", stopwatch.Elapsed); + LogTraceRefreshingMembershipTableTook(this.log, stopwatch.Elapsed); // reset to allow normal refresh period after success overrideDelayPeriod = default; runningFailures = 0; @@ -271,10 +248,7 @@ private async Task PeriodicallyRefreshMembershipTable() catch (Exception exception) { runningFailures += 1; - this.log.LogWarning( - (int)ErrorCode.MembershipUpdateIAmAliveFailure, - exception, - "Failed to refresh membership table, will retry shortly. 
Retry attempt {retries}", runningFailures); + LogWarningFailedToRefreshMembershipTable(this.log, exception, runningFailures); // Retry quickly and then exponentially back off overrideDelayPeriod = exponentialBackoff.Next(runningFailures); @@ -283,12 +257,12 @@ private async Task PeriodicallyRefreshMembershipTable() } catch (Exception exception) when (this.fatalErrorHandler.IsUnexpected(exception)) { - this.log.LogWarning(exception, "Error refreshing membership table"); + LogWarningErrorRefreshingMembershipTable(this.log, exception); this.fatalErrorHandler.OnFatalException(this, nameof(PeriodicallyRefreshMembershipTable), exception); } finally { - if (this.log.IsEnabled(LogLevel.Debug)) this.log.LogDebug("Stopping periodic membership table refreshes"); + LogDebugStoppingPeriodicMembershipTableRefreshes(this.log); } } @@ -320,13 +294,13 @@ public async Task UpdateStatus(SiloStatus status) { bool wasThrownLocally = false; int numCalls = 0; - + try { async Task UpdateMyStatusTask(int counter) { numCalls++; - if (log.IsEnabled(LogLevel.Debug)) log.LogDebug("Going to try to TryUpdateMyStatusGlobalOnce #{Attempt}", counter); + LogDebugGoingToTryToUpdateMyStatusGlobalOnce(this.log, counter); return await TryUpdateMyStatusGlobalOnce(status); // function to retry } @@ -350,7 +324,7 @@ async Task UpdateMyStatusTask(int counter) if (ok) { - if (log.IsEnabled(LogLevel.Debug)) log.LogDebug("Silo {SiloAddress} Successfully updated my Status in the membership table to {Status}", myAddress, status); + LogDebugSuccessfullyUpdatedMyStatus(this.log, myAddress, status); var gossipTask = this.GossipToOthers(this.myAddress, status); gossipTask.Ignore(); @@ -361,11 +335,11 @@ async Task UpdateMyStatusTask(int counter) { if (status.IsTerminating()) { - this.log.LogWarning("Timed out while gossiping status to other silos after {Timeout}", GossipTimeout); + LogWarningTimedOutWhileGossipingStatus(this.log, GossipTimeout); } - else if (this.log.IsEnabled(LogLevel.Debug)) + else { - 
this.log.LogDebug("Timed out while gossiping status to other silos after {Timeout}", GossipTimeout); + LogDebugTimedOutWhileGossipingStatus(this.log, GossipTimeout); } } else @@ -376,24 +350,13 @@ async Task UpdateMyStatusTask(int counter) else { wasThrownLocally = true; - log.LogInformation( - (int)ErrorCode.MembershipFailedToWriteConditional, - "Silo {MyAddress} failed to update its status to {Status} in the membership table due to write contention on the table after {NumCalls} attempts.", - myAddress, - status, - numCalls); + LogInformationFailedToUpdateMyStatusDueToWriteContention(this.log, myAddress, status, numCalls); throw new OrleansException($"Silo {myAddress} failed to update its status to {status} in the membership table due to write contention on the table after {numCalls} attempts."); } } catch (Exception exc) when (!wasThrownLocally) { - log.LogWarning( - (int)ErrorCode.MembershipFailedToWrite, - exc, - "Silo {MyAddress} failed to update its status to {Status} in the table due to failures (socket failures or table read/write failures) after {NumCalls} attempts", - myAddress, - status, - numCalls); + LogWarningFailedToUpdateMyStatusDueToFailures(this.log, exc, myAddress, status, numCalls); throw new OrleansException($"Silo {myAddress} failed to update its status to {status} in the table due to failures (socket failures or table read/write failures) after {numCalls} attempts", exc); } } @@ -408,20 +371,13 @@ private async Task TryUpdateMyStatusGlobalOnce(SiloStatus newStatus) { var table = await membershipTableProvider.ReadAll(); - if (log.IsEnabled(LogLevel.Debug)) - log.LogDebug( - "TryUpdateMyStatusGlobalOnce: Read{Selection} Membership table {Table}", - (newStatus.Equals(SiloStatus.Active) ? "All" : " my entry from"), - table.ToString()); + LogDebugTryUpdateMyStatusGlobalOnce(this.log, newStatus.Equals(SiloStatus.Active) ? 
"All" : " my entry from", table.ToString()); LogMissedIAmAlives(table); var (myEntry, myEtag) = this.GetOrCreateLocalSiloEntry(table, newStatus); if (myEntry.Status == SiloStatus.Dead && myEntry.Status != newStatus) { - this.log.LogWarning( - (int)ErrorCode.MembershipFoundMyselfDead1, - "I should be Dead according to membership table (in TryUpdateMyStatusGlobalOnce): Entry = {Entry}.", - myEntry.ToFullString()); + LogWarningFoundMyselfDead1(this.log, myEntry.ToFullString()); this.KillMyselfLocally($"I should be Dead according to membership table (in TryUpdateMyStatusGlobalOnce): Entry = {(myEntry.ToFullString())}."); return true; } @@ -490,20 +446,13 @@ private MembershipEntry CreateLocalSiloEntry(SiloStatus currentStatus) private void ProcessTableUpdate(MembershipTableData table, string caller) { if (table is null) throw new ArgumentNullException(nameof(table)); - if (this.log.IsEnabled(LogLevel.Debug)) this.log.LogDebug($"{nameof(ProcessTableUpdate)} (called from {{Caller}}) membership table {{Table}}", caller, table.ToString()); + LogDebugProcessTableUpdate(this.log, caller, table); if (this.updates.TryPublish(MembershipTableSnapshot.Update, table)) { this.LogMissedIAmAlives(table); - if (this.log.IsEnabled(LogLevel.Debug)) - { - this.log.LogDebug( - (int)ErrorCode.MembershipReadAll_2, - $"{nameof(ProcessTableUpdate)} (called from {{Caller}}) membership table: {{Table}}", - caller, - table.WithoutDuplicateDeads().ToString()); - } + LogDebugProcessTableUpdateWithTable(this.log, caller, new(table)); } } @@ -519,15 +468,7 @@ private void LogMissedIAmAlives(MembershipTableData table) if (entry.HasMissedIAmAlives(this.clusterMembershipOptions, now)) { var missedSince = entry.EffectiveIAmAliveTime; - log.LogWarning( - (int)ErrorCode.MembershipMissedIAmAliveTableUpdate, - "Noticed that silo {SiloAddress} has not updated it's IAmAliveTime table column recently." 
- + " Last update was at {LastUpdateTime}, now is {CurrentTime}, no update for {SinceUpdate}, which is more than {AllowedIAmAliveMissPeriod}.", - entry.SiloAddress, - missedSince, - now, - now - missedSince, - clusterMembershipOptions.AllowedIAmAliveMissPeriod); + LogWarningMissedIAmAliveTableUpdate(this.log, entry.SiloAddress, missedSince, now, now - missedSince, clusterMembershipOptions.AllowedIAmAliveMissPeriod); } } } @@ -542,50 +483,36 @@ private async Task CleanupMyTableEntries(MembershipTableData table) { var entry = tuple.Item1; var siloAddress = entry.SiloAddress; - + if (siloAddress.Generation.Equals(myAddress.Generation)) { if (entry.Status == SiloStatus.Dead) { - log.LogWarning((int)ErrorCode.MembershipFoundMyselfDead2, "I should be Dead according to membership table (in CleanupTableEntries): entry = {Entry}.", entry.ToFullString()); + LogWarningFoundMyselfDead2(this.log, entry.ToFullString()); KillMyselfLocally($"I should be Dead according to membership table (in CleanupTableEntries): entry = {(entry.ToFullString())}."); } continue; } - + if (entry.Status == SiloStatus.Dead) { - if (log.IsEnabled(LogLevel.Trace)) log.LogTrace("Skipping my previous old Dead entry in membership table: {Entry}", entry.ToFullString()); + LogTraceSkippingOldDeadEntry(this.log, entry.ToFullString()); continue; } - if (log.IsEnabled(LogLevel.Debug)) - log.LogDebug( - "Temporal anomaly detected in membership table -- Me={SiloAddress} Other me={OtherSiloAddress}", - myAddress, - siloAddress); + LogDebugTemporalAnomalyDetected(this.log, myAddress, siloAddress); // Temporal paradox - There is an older clone of this silo in the membership table if (siloAddress.Generation < myAddress.Generation) { - log.LogWarning( - (int)ErrorCode.MembershipDetectedOlder, - "Detected older version of myself - Marking other older clone as Dead -- Current Me={LocalSiloAddress} Older Me={OlderSiloAddress}, Old entry={Entry}", - myAddress, - siloAddress, - entry.ToString()); + 
LogWarningDetectedOlder(this.log, myAddress, siloAddress, entry.ToString()); // Declare older clone of me as Dead. silosToDeclareDead.Add(tuple); //return DeclareDead(entry, eTag, tableVersion); } else if (siloAddress.Generation > myAddress.Generation) { // I am the older clone - Newer version of me should survive - I need to kill myself - log.LogWarning( - (int)ErrorCode.MembershipDetectedNewer, - "Detected newer version of myself - I am the older clone so I will stop -- Current Me={LocalSiloAddress} Newer Me={NewerSiloAddress}, Current entry={Entry}", - myAddress, - siloAddress, - entry.ToString()); + LogWarningDetectedNewer(this.log, myAddress, siloAddress, entry.ToString()); await this.UpdateStatus(SiloStatus.Dead); KillMyselfLocally($"Detected newer version of myself - I am the older clone so I will stop -- Current Me={myAddress} Newer Me={siloAddress}, Current entry={entry}"); return true; // No point continuing! @@ -594,8 +521,7 @@ private async Task CleanupMyTableEntries(MembershipTableData table) if (silosToDeclareDead.Count == 0) return true; - if (log.IsEnabled(LogLevel.Debug)) log.LogDebug("CleanupTableEntries: About to DeclareDead {Count} outdated silos in the table: {Silos}", silosToDeclareDead.Count, - Utils.EnumerableToString(silosToDeclareDead.Select(tuple => tuple.Item1))); + LogDebugCleanupTableEntriesAboutToDeclareDead(this.log, silosToDeclareDead.Count, Utils.EnumerableToString(silosToDeclareDead.Select(tuple => tuple.Item1))); foreach (var siloData in silosToDeclareDead) { @@ -608,7 +534,7 @@ await _trySuspectOrKillChannel.Writer.WriteAsync( private void KillMyselfLocally(string reason) { - log.LogError((int)ErrorCode.MembershipKillMyselfLocally, "I have been told I am dead, so this silo will stop! Reason: {Reason}", reason); + LogErrorKillMyselfLocally(this.log, reason); this.CurrentStatus = SiloStatus.Dead; this.fatalErrorHandler.OnFatalException(this, $"I have been told I am dead, so this silo will stop! 
Reason: {reason}", null); } @@ -640,7 +566,7 @@ bool IsFunctionalForMembership(SiloStatus status) } catch (Exception exception) { - this.log.LogWarning(exception, "Error while gossiping status to other silos"); + LogWarningErrorWhileGossipingStatus(this.log, exception); } } @@ -712,7 +638,7 @@ public async Task ProcessSuspectOrKillLists() catch (Exception ex) { runningFailureCount += 1; - log.LogError(ex, "Error while processing suspect or kill lists. '{FailureCount}' consecutive failures.", runningFailureCount); + LogErrorProcessingSuspectOrKillLists(this.log, ex, runningFailureCount); await _trySuspectOrKillChannel.Writer.WriteAsync(request, _shutdownCts.Token); } @@ -728,17 +654,11 @@ private async Task InnerTryKill(SiloAddress silo, CancellationToken cancel { var table = await membershipTableProvider.ReadAll().WaitAsync(cancellationToken); - if (log.IsEnabled(LogLevel.Debug)) - { - log.LogDebug("TryKill: Read Membership table {0}", table.ToString()); - } + LogDebugTryKillReadMembershipTable(this.log, table.ToString()); if (this.IsStopping) { - this.log.LogInformation( - (int)ErrorCode.MembershipFoundMyselfDead3, - "Ignoring call to TryKill for silo {Silo} since the local silo is stopping", - silo); + LogInformationIgnoringCallToTryKill(this.log, silo); return true; } @@ -746,7 +666,7 @@ private async Task InnerTryKill(SiloAddress silo, CancellationToken cancel if (localSiloEntry.Status == SiloStatus.Dead) { var msg = string.Format("I should be Dead according to membership table (in TryKill): entry = {0}.", localSiloEntry.ToFullString()); - log.LogWarning((int)ErrorCode.MembershipFoundMyselfDead3, msg); + LogWarningFoundMyselfDead3(this.log, msg); KillMyselfLocally(msg); return true; } @@ -754,7 +674,7 @@ private async Task InnerTryKill(SiloAddress silo, CancellationToken cancel if (table.TryGet(silo) is not { } tuple) { var str = $"Could not find silo entry for silo {silo} in the table."; - log.LogError((int)ErrorCode.MembershipFailedToReadSilo, str); + 
LogErrorCouldNotFindSiloEntry(this.log, str); throw new KeyNotFoundException(str); } @@ -768,10 +688,7 @@ private async Task InnerTryKill(SiloAddress silo, CancellationToken cancel return true; } - log.LogInformation( - (int)ErrorCode.MembershipMarkingAsDead, - "Going to mark silo {SiloAddress} dead as a result of a call to TryKill", - entry.SiloAddress); + LogInformationMarkingSiloAsDead(this.log, entry.SiloAddress); return await DeclareDead(entry, eTag, table.Version, GetDateTimeUtcNow()).WaitAsync(cancellationToken); } @@ -786,14 +703,11 @@ private async Task InnerTryToSuspectOrKill(SiloAddress silo, SiloAddress i var table = await membershipTableProvider.ReadAll().WaitAsync(cancellationToken); var now = GetDateTimeUtcNow(); - if (log.IsEnabled(LogLevel.Debug)) log.LogDebug("TryToSuspectOrKill: Read Membership table {Table}", table.ToString()); + LogDebugTryToSuspectOrKillReadMembershipTable(this.log, table.ToString()); if (this.IsStopping) { - this.log.LogInformation( - (int)ErrorCode.MembershipFoundMyselfDead3, - "Ignoring call to TrySuspectOrKill for silo {Silo} since the local silo is dead", - silo); + LogInformationIgnoringCallToTrySuspectOrKill(this.log, silo); return true; } @@ -801,34 +715,20 @@ private async Task InnerTryToSuspectOrKill(SiloAddress silo, SiloAddress i if (localSiloEntry.Status == SiloStatus.Dead) { var localSiloEntryDetails = localSiloEntry.ToFullString(); - log.LogWarning( - (int)ErrorCode.MembershipFoundMyselfDead3, - "I should be Dead according to membership table (in TryToSuspectOrKill): entry = {Entry}.", - localSiloEntryDetails); + LogWarningFoundMyselfDead3(this.log, $"I should be Dead according to membership table (in TryToSuspectOrKill): entry = {localSiloEntryDetails}."); KillMyselfLocally($"I should be Dead according to membership table (in TryToSuspectOrKill): entry = {localSiloEntryDetails}."); return true; } if (table.TryGet(silo) is not { } tuple) { - // this should not happen ... 
- log.LogError((int)ErrorCode.MembershipFailedToReadSilo, "Could not find silo entry for silo {Silo} in the table.", silo); - //What is a caller going to do? The silo is not in the table which is what we are trying to achieve. - //throw new KeyNotFoundException($"Could not find silo entry for silo {silo} in the table."); + LogErrorCouldNotFindSiloEntry(this.log, $"Could not find silo entry for silo {silo} in the table."); return false; } var entry = tuple.Item1.Copy(); string eTag = tuple.Item2; - if (log.IsEnabled(LogLevel.Debug)) - { - log.LogDebug( - "TryToSuspectOrKill {SiloAddress}: The current status of {SiloAddress} in the table is {Status}, its entry is {Entry}", - entry.SiloAddress, // First - entry.SiloAddress, // Second - entry.Status, - entry.ToString()); - } + LogDebugTryToSuspectOrKillCurrentStatus(this.log, entry.SiloAddress, entry.Status, entry.ToString()); // Check if the table already knows that this silo is dead if (entry.Status == SiloStatus.Dead) @@ -839,19 +739,11 @@ private async Task InnerTryToSuspectOrKill(SiloAddress silo, SiloAddress i // Get all valid (non-expired) votes var freshVotes = entry.GetFreshVotes(now, this.clusterMembershipOptions.DeathVoteExpirationTimeout); - - if (log.IsEnabled(LogLevel.Trace)) log.LogTrace("Current number of fresh voters for '{SiloAddress}' is '{FreshVotes}'.", silo, freshVotes.Count.ToString()); + LogTraceCurrentNumberOfFreshVoters(this.log, silo, freshVotes.Count.ToString()); if (freshVotes.Count >= this.clusterMembershipOptions.NumVotesForDeathDeclaration) { - // this should not happen ... - log.LogError( - (int)ErrorCode.Runtime_Error_100053, - "Silo '{SiloAddress}' is suspected by '{SuspecterCount}' which is greater than or equal to '{NumVotesForDeathDeclaration}', but is not marked as dead. 
This is a bug!", - entry.SiloAddress, - freshVotes.Count.ToString(), - this.clusterMembershipOptions.NumVotesForDeathDeclaration.ToString()); - + LogErrorSiloIsSuspectedButNotMarkedAsDead(this.log, entry.SiloAddress, freshVotes.Count.ToString(), this.clusterMembershipOptions.NumVotesForDeathDeclaration.ToString()); KillMyselfLocally("Found a bug! Will stop."); return false; } @@ -876,30 +768,11 @@ private async Task InnerTryToSuspectOrKill(SiloAddress silo, SiloAddress i var numVotesRequiredToEvict = Math.Min(clusterMembershipOptions.NumVotesForDeathDeclaration, (activeNonStaleSilos + 1) / 2); if (freshVotes.Count >= numVotesRequiredToEvict) { - // Find the local silo's vote index - int myVoteIndex = freshVotes.FindIndex(voter => myAddress.Equals(voter.Item1)); - - // Kick this silo off - log.LogInformation( - (int)ErrorCode.MembershipMarkingAsDead, - "Evicting '{SiloAddress}'. Fresh vote count: '{FreshVotes}', votes required to evict: '{NumVotesRequiredToEvict}', non-stale silo count: '{NonStaleSiloCount}', suspecters: '{SuspectingSilos}'", - entry.SiloAddress, - freshVotes.Count, - this.clusterMembershipOptions.NumVotesForDeathDeclaration, - activeNonStaleSilos, - PrintSuspectList(entry.SuspectTimes)); - + LogInformationEvictingSilo(this.log, entry.SiloAddress, freshVotes.Count, this.clusterMembershipOptions.NumVotesForDeathDeclaration, activeNonStaleSilos, PrintSuspectList(entry.SuspectTimes)); return await DeclareDead(entry, eTag, table.Version, now).WaitAsync(cancellationToken); } - log.LogInformation( - (int)ErrorCode.MembershipVotingForKill, - "Voting to evict '{SiloAddress}'. 
Previous suspect list is '{PreviousSuspecters}', trying to update to '{Suspecters}', ETag: '{ETag}', Fresh vote count: '{FreshVotes}'", - entry.SiloAddress, - PrintSuspectList(prevList), - PrintSuspectList(entry.SuspectTimes), - eTag, - PrintSuspectList(freshVotes)); + LogInformationVotingToEvictSilo(this.log, entry.SiloAddress, PrintSuspectList(prevList), PrintSuspectList(entry.SuspectTimes), eTag, PrintSuspectList(freshVotes)); // If we fail to update here we will retry later. var ok = await membershipTableProvider.UpdateRow(entry, eTag, table.Version.Next()).WaitAsync(cancellationToken); @@ -929,27 +802,24 @@ private async Task DeclareDead(MembershipEntry entry, string etag, TableVe // Add the killer (myself) to the suspect list, for easier diagnosis later on. entry.AddSuspector(myAddress, time); - if (log.IsEnabled(LogLevel.Debug)) log.LogDebug("Going to DeclareDead silo {SiloAddress} in the table. About to write entry {Entry}.", entry.SiloAddress, entry.ToString()); + LogDebugGoingToDeclareDead(this.log, entry.SiloAddress, entry.ToString()); entry.Status = SiloStatus.Dead; bool ok = await membershipTableProvider.UpdateRow(entry, etag, tableVersion.Next()); if (ok) { - if (log.IsEnabled(LogLevel.Debug)) log.LogDebug("Successfully updated {SiloAddress} status to Dead in the membership table.", entry.SiloAddress); + LogDebugSuccessfullyUpdatedStatusToDead(this.log, entry.SiloAddress); var table = await membershipTableProvider.ReadAll(); this.ProcessTableUpdate(table, "DeclareDead"); GossipToOthers(entry.SiloAddress, entry.Status).Ignore(); return true; } - - log.LogInformation( - (int)ErrorCode.MembershipMarkDeadWriteFailed, - "Failed to update {SiloAddress} status to Dead in the membership table, due to write conflicts. 
Will retry.", - entry.SiloAddress); + + LogInformationFailedToUpdateStatusToDead(this.log, entry.SiloAddress); return false; } - - log.LogInformation((int)ErrorCode.MembershipCantWriteLivenessDisabled, "Want to mark silo {SiloAddress} as DEAD, but will ignore because Liveness is Disabled.", entry.SiloAddress); + + LogInformationLivenessDisabled(this.log, entry.SiloAddress); return true; } @@ -989,5 +859,324 @@ public void Dispose() this.membershipUpdateTimer.Dispose(); _shutdownCts.Dispose(); } + + private readonly struct WithoutDuplicateDeadsLogValue(MembershipTableData table) + { + public override string ToString() => table.WithoutDuplicateDeads().ToString(); + } + + [LoggerMessage( + Level = LogLevel.Information, + Message = "Received cluster membership snapshot via gossip: {Snapshot}" + )] + private static partial void LogInformationReceivedClusterMembershipSnapshot(ILogger logger, MembershipTableSnapshot snapshot); + + [LoggerMessage( + EventId = (int)ErrorCode.MembershipStarting, + Level = LogLevel.Information, + Message = "MembershipOracle starting on host {HostName} with SiloAddress {SiloAddress} at {StartTime}" + )] + private static partial void LogInformationMembershipStarting(ILogger logger, string hostName, SiloAddress siloAddress, string startTime); + + [LoggerMessage( + EventId = (int)ErrorCode.MembershipFoundMyselfDead1, + Level = LogLevel.Warning, + Message = "I should be Dead according to membership table (in RefreshFromSnapshot). Local entry: {Entry}." 
+ )] + private static partial void LogWarningFoundMyselfDeadInRefreshFromSnapshot(ILogger logger, string entry); + + [LoggerMessage( + Level = LogLevel.Warning, + Message = "Exception while trying to clean up my table entries" + )] + private static partial void LogWarningExceptionWhileCleaningUpTableEntries(ILogger logger, Exception exception); + + [LoggerMessage( + Level = LogLevel.Trace, + Message = "Refreshing membership table took {Elapsed}" + )] + private static partial void LogTraceRefreshingMembershipTableTook(ILogger logger, TimeSpan elapsed); + + [LoggerMessage( + EventId = (int)ErrorCode.MembershipUpdateIAmAliveFailure, + Level = LogLevel.Warning, + Message = "Failed to refresh membership table, will retry shortly. Retry attempt {retries}" + )] + private static partial void LogWarningFailedToRefreshMembershipTable(ILogger logger, Exception exception, int retries); + + [LoggerMessage( + Level = LogLevel.Warning, + Message = "Error refreshing membership table" + )] + private static partial void LogWarningErrorRefreshingMembershipTable(ILogger logger, Exception exception); + + [LoggerMessage( + Level = LogLevel.Debug, + Message = "Stopping periodic membership table refreshes" + )] + private static partial void LogDebugStoppingPeriodicMembershipTableRefreshes(ILogger logger); + + [LoggerMessage( + Level = LogLevel.Debug, + Message = "Starting periodic membership table refreshes" + )] + private static partial void LogDebugStartingPeriodicMembershipTableRefreshes(ILogger logger); + + [LoggerMessage( + Level = LogLevel.Debug, + Message = "{Caller} membership table {Table}" + )] + private static partial void LogDebugProcessTableUpdate(ILogger logger, string caller, MembershipTableData table); + + [LoggerMessage( + EventId = (int)ErrorCode.MembershipReadAll_2, + Level = LogLevel.Debug, + Message = "{Caller} membership table: {Table}" + )] + private static partial void LogDebugProcessTableUpdateWithTable(ILogger logger, string caller, WithoutDuplicateDeadsLogValue 
table); + + [LoggerMessage( + EventId = (int)ErrorCode.MembershipMissedIAmAliveTableUpdate, + Level = LogLevel.Warning, + Message = "Noticed that silo {SiloAddress} has not updated it's IAmAliveTime table column recently." + + " Last update was at {LastUpdateTime}, now is {CurrentTime}, no update for {SinceUpdate}, which is more than {AllowedIAmAliveMissPeriod}." + )] + private static partial void LogWarningMissedIAmAliveTableUpdate(ILogger logger, SiloAddress siloAddress, DateTime lastUpdateTime, DateTime currentTime, TimeSpan sinceUpdate, TimeSpan allowedIAmAliveMissPeriod); + + [LoggerMessage( + EventId = (int)ErrorCode.MembershipFoundMyselfDead2, + Level = LogLevel.Warning, + Message = "I should be Dead according to membership table (in CleanupTableEntries): entry = {Entry}." + )] + private static partial void LogWarningFoundMyselfDead2(ILogger logger, string entry); + + [LoggerMessage( + Level = LogLevel.Trace, + Message = "Skipping my previous old Dead entry in membership table: {Entry}" + )] + private static partial void LogTraceSkippingOldDeadEntry(ILogger logger, string entry); + + [LoggerMessage( + Level = LogLevel.Debug, + Message = "Temporal anomaly detected in membership table -- Me={SiloAddress} Other me={OtherSiloAddress}" + )] + private static partial void LogDebugTemporalAnomalyDetected(ILogger logger, SiloAddress siloAddress, SiloAddress otherSiloAddress); + + [LoggerMessage( + EventId = (int)ErrorCode.MembershipDetectedOlder, + Level = LogLevel.Warning, + Message = "Detected older version of myself - Marking other older clone as Dead -- Current Me={LocalSiloAddress} Older Me={OlderSiloAddress}, Old entry={Entry}" + )] + private static partial void LogWarningDetectedOlder(ILogger logger, SiloAddress localSiloAddress, SiloAddress olderSiloAddress, string entry); + + [LoggerMessage( + EventId = (int)ErrorCode.MembershipDetectedNewer, + Level = LogLevel.Warning, + Message = "Detected newer version of myself - I am the older clone so I will stop -- 
Current Me={LocalSiloAddress} Newer Me={NewerSiloAddress}, Current entry={Entry}" + )] + private static partial void LogWarningDetectedNewer(ILogger logger, SiloAddress localSiloAddress, SiloAddress newerSiloAddress, string entry); + + [LoggerMessage( + Level = LogLevel.Debug, + Message = "CleanupTableEntries: About to DeclareDead {Count} outdated silos in the table: {Silos}" + )] + private static partial void LogDebugCleanupTableEntriesAboutToDeclareDead(ILogger logger, int count, string silos); + + [LoggerMessage( + EventId = (int)ErrorCode.MembershipKillMyselfLocally, + Level = LogLevel.Error, + Message = "I have been told I am dead, so this silo will stop! Reason: {Reason}" + )] + private static partial void LogErrorKillMyselfLocally(ILogger logger, string reason); + + [LoggerMessage( + Level = LogLevel.Warning, + Message = "Error while gossiping status to other silos" + )] + private static partial void LogWarningErrorWhileGossipingStatus(ILogger logger, Exception exception); + + [LoggerMessage( + Level = LogLevel.Debug, + Message = "Going to try to TryUpdateMyStatusGlobalOnce #{Attempt}" + )] + private static partial void LogDebugGoingToTryToUpdateMyStatusGlobalOnce(ILogger logger, int attempt); + + [LoggerMessage( + Level = LogLevel.Debug, + Message = "Silo {SiloAddress} Successfully updated my Status in the membership table to {Status}" + )] + private static partial void LogDebugSuccessfullyUpdatedMyStatus(ILogger logger, SiloAddress siloAddress, SiloStatus status); + + [LoggerMessage( + Level = LogLevel.Warning, + Message = "Timed out while gossiping status to other silos after {Timeout}" + )] + private static partial void LogWarningTimedOutWhileGossipingStatus(ILogger logger, TimeSpan timeout); + + [LoggerMessage( + Level = LogLevel.Debug, + Message = "Timed out while gossiping status to other silos after {Timeout}" + )] + private static partial void LogDebugTimedOutWhileGossipingStatus(ILogger logger, TimeSpan timeout); + + [LoggerMessage( + EventId = 
(int)ErrorCode.MembershipFailedToWriteConditional, + Level = LogLevel.Information, + Message = "Silo {MyAddress} failed to update its status to {Status} in the membership table due to write contention on the table after {NumCalls} attempts." + )] + private static partial void LogInformationFailedToUpdateMyStatusDueToWriteContention(ILogger logger, SiloAddress myAddress, SiloStatus status, int numCalls); + + [LoggerMessage( + EventId = (int)ErrorCode.MembershipFailedToWrite, + Level = LogLevel.Warning, + Message = "Silo {MyAddress} failed to update its status to {Status} in the table due to failures (socket failures or table read/write failures) after {NumCalls} attempts" + )] + private static partial void LogWarningFailedToUpdateMyStatusDueToFailures(ILogger logger, Exception exception, SiloAddress myAddress, SiloStatus status, int numCalls); + + [LoggerMessage( + Level = LogLevel.Debug, + Message = "TryUpdateMyStatusGlobalOnce: Read{Selection} Membership table {Table}" + )] + private static partial void LogDebugTryUpdateMyStatusGlobalOnce(ILogger logger, string selection, string table); + + [LoggerMessage( + EventId = (int)ErrorCode.MembershipFoundMyselfDead1, + Level = LogLevel.Warning, + Message = "I should be Dead according to membership table (in TryUpdateMyStatusGlobalOnce): Entry = {Entry}." 
+ )] + private static partial void LogWarningFoundMyselfDead1(ILogger logger, string entry); + + [LoggerMessage( + Level = LogLevel.Debug, + Message = "TryKill: Read Membership table {Table}" + )] + private static partial void LogDebugTryKillReadMembershipTable(ILogger logger, string table); + + [LoggerMessage( + EventId = (int)ErrorCode.MembershipFoundMyselfDead3, + Level = LogLevel.Information, + Message = "Ignoring call to TryKill for silo {Silo} since the local silo is stopping" + )] + private static partial void LogInformationIgnoringCallToTryKill(ILogger logger, SiloAddress silo); + + [LoggerMessage( + EventId = (int)ErrorCode.MembershipFoundMyselfDead3, + Level = LogLevel.Warning, + Message = "{Message}" + )] + private static partial void LogWarningFoundMyselfDead3(ILogger logger, string message); + + [LoggerMessage( + EventId = (int)ErrorCode.MembershipFailedToReadSilo, + Level = LogLevel.Error, + Message = "{Message}" + )] + private static partial void LogErrorCouldNotFindSiloEntry(ILogger logger, string message); + + [LoggerMessage( + EventId = (int)ErrorCode.MembershipMarkingAsDead, + Level = LogLevel.Information, + Message = "Going to mark silo {SiloAddress} dead as a result of a call to TryKill" + )] + private static partial void LogInformationMarkingSiloAsDead(ILogger logger, SiloAddress siloAddress); + + [LoggerMessage( + Level = LogLevel.Debug, + Message = "TryToSuspectOrKill: Read Membership table {Table}" + )] + private static partial void LogDebugTryToSuspectOrKillReadMembershipTable(ILogger logger, string table); + + [LoggerMessage( + EventId = (int)ErrorCode.MembershipFoundMyselfDead3, + Level = LogLevel.Information, + Message = "Ignoring call to TrySuspectOrKill for silo {Silo} since the local silo is dead" + )] + private static partial void LogInformationIgnoringCallToTrySuspectOrKill(ILogger logger, SiloAddress silo); + + [LoggerMessage( + Level = LogLevel.Debug, + Message = "TryToSuspectOrKill {SiloAddress}: The current status of 
{SiloAddress} in the table is {Status}, its entry is {Entry}" + )] + private static partial void LogDebugTryToSuspectOrKillCurrentStatus(ILogger logger, SiloAddress siloAddress, SiloStatus status, string entry); + + [LoggerMessage( + Level = LogLevel.Trace, + Message = "Current number of fresh voters for '{SiloAddress}' is '{FreshVotes}'." + )] + private static partial void LogTraceCurrentNumberOfFreshVoters(ILogger logger, SiloAddress siloAddress, string freshVotes); + + [LoggerMessage( + EventId = (int)ErrorCode.Runtime_Error_100053, + Level = LogLevel.Error, + Message = "Silo '{SiloAddress}' is suspected by '{SuspecterCount}' which is greater than or equal to '{NumVotesForDeathDeclaration}', but is not marked as dead. This is a bug!" + )] + private static partial void LogErrorSiloIsSuspectedButNotMarkedAsDead(ILogger logger, SiloAddress siloAddress, string suspecterCount, string numVotesForDeathDeclaration); + + [LoggerMessage( + EventId = (int)ErrorCode.MembershipMarkingAsDead, + Level = LogLevel.Information, + Message = "Evicting '{SiloAddress}'. Fresh vote count: '{FreshVotes}', votes required to evict: '{NumVotesRequiredToEvict}', non-stale silo count: '{NonStaleSiloCount}', suspecters: '{SuspectingSilos}'" + )] + private static partial void LogInformationEvictingSilo(ILogger logger, SiloAddress siloAddress, int freshVotes, int numVotesRequiredToEvict, int nonStaleSiloCount, string suspectingSilos); + + [LoggerMessage( + EventId = (int)ErrorCode.MembershipVotingForKill, + Level = LogLevel.Information, + Message = "Voting to evict '{SiloAddress}'. 
Previous suspect list is '{PreviousSuspecters}', trying to update to '{Suspecters}', ETag: '{ETag}', Fresh vote count: '{FreshVotes}'" + )] + private static partial void LogInformationVotingToEvictSilo(ILogger logger, SiloAddress siloAddress, string previousSuspecters, string suspecters, string eTag, string freshVotes); + + [LoggerMessage( + Level = LogLevel.Debug, + Message = "Going to DeclareDead silo {SiloAddress} in the table. About to write entry {Entry}." + )] + private static partial void LogDebugGoingToDeclareDead(ILogger logger, SiloAddress siloAddress, string entry); + + [LoggerMessage( + Level = LogLevel.Debug, + Message = "Successfully updated {SiloAddress} status to Dead in the membership table." + )] + private static partial void LogDebugSuccessfullyUpdatedStatusToDead(ILogger logger, SiloAddress siloAddress); + + [LoggerMessage( + EventId = (int)ErrorCode.MembershipMarkDeadWriteFailed, + Level = LogLevel.Information, + Message = "Failed to update {SiloAddress} status to Dead in the membership table, due to write conflicts. Will retry." + )] + private static partial void LogInformationFailedToUpdateStatusToDead(ILogger logger, SiloAddress siloAddress); + + [LoggerMessage( + EventId = (int)ErrorCode.MembershipCantWriteLivenessDisabled, + Level = LogLevel.Information, + Message = "Want to mark silo {SiloAddress} as DEAD, but will ignore because Liveness is Disabled." + )] + private static partial void LogInformationLivenessDisabled(ILogger logger, SiloAddress siloAddress); + + [LoggerMessage( + Level = LogLevel.Error, + Message = "Error while processing suspect or kill lists. '{FailureCount}' consecutive failures." 
+ )] + private static partial void LogErrorProcessingSuspectOrKillLists(ILogger logger, Exception exception, int failureCount); + + [LoggerMessage( + EventId = (int)ErrorCode.MembershipNodeMigrated, + Level = LogLevel.Warning, + Message = "Silo {SiloName} migrated to host {HostName} silo address {SiloAddress} from host {PreviousHostName} silo address {PreviousSiloAddress}." + )] + private static partial void LogWarningNodeMigrated(ILogger logger, string siloName, string hostName, SiloAddress siloAddress, string previousHostName, SiloAddress previousSiloAddress); + + [LoggerMessage( + EventId = (int)ErrorCode.MembershipNodeRestarted, + Level = LogLevel.Warning, + Message = "Silo {SiloName} restarted on same host {HostName} with silo address = {SiloAddress} Previous silo address = {PreviousSiloAddress}" + )] + private static partial void LogWarningNodeRestarted(ILogger logger, string siloName, string hostName, SiloAddress siloAddress, SiloAddress previousSiloAddress); + + [LoggerMessage( + EventId = (int)ErrorCode.MembershipFailedToStart, + Level = LogLevel.Error, + Message = "Membership failed to start" + )] + private static partial void LogErrorMembershipFailedToStart(ILogger logger, Exception exception); } } diff --git a/src/Orleans.Runtime/MembershipService/SiloHealthMonitor.cs b/src/Orleans.Runtime/MembershipService/SiloHealthMonitor.cs index 44777f31188..ba0d02d5f61 100644 --- a/src/Orleans.Runtime/MembershipService/SiloHealthMonitor.cs +++ b/src/Orleans.Runtime/MembershipService/SiloHealthMonitor.cs @@ -17,7 +17,7 @@ namespace Orleans.Runtime.MembershipService /// /// Responsible for monitoring an individual remote silo. 
/// - internal class SiloHealthMonitor : ITestAccessor, IHealthCheckable, IDisposable, IAsyncDisposable + internal partial class SiloHealthMonitor : ITestAccessor, IHealthCheckable, IDisposable, IAsyncDisposable { private readonly ILogger _log; private readonly IOptionsMonitor _clusterMembershipOptions; @@ -203,7 +203,7 @@ private async Task Run() // Note that all recused silos will be included in the consideration set the next time cluster membership changes. if (probeResult.Status != ProbeResultStatus.Succeeded && probeResult.IntermediaryHealthDegradationScore > 0) { - _log.LogInformation("Recusing unhealthy intermediary '{Intermediary}' and trying again with remaining nodes", intermediary); + LogInformationRecusingUnhealthyIntermediary(_log, intermediary); otherNodes = [.. otherNodes.Where(node => !node.Equals(intermediary))]; overrideDelay = TimeSpan.FromMilliseconds(250); } @@ -216,7 +216,7 @@ private async Task Run() } catch (Exception exception) { - _log.LogError(exception, "Exception monitoring silo {SiloAddress}", TargetSiloAddress); + LogErrorExceptionMonitoringSilo(_log, exception, TargetSiloAddress); } } @@ -256,10 +256,7 @@ TimeSpan GetTimeout(bool isDirectProbe) private async Task ProbeDirectly(CancellationToken cancellation) { var id = ++_nextProbeId; - if (_log.IsEnabled(LogLevel.Trace)) - { - _log.LogTrace("Going to send Ping #{Id} to probe silo {Silo}", id, TargetSiloAddress); - } + LogTraceGoingToSendPing(_log, id, TargetSiloAddress); var roundTripTimer = ValueStopwatch.StartNew(); ProbeResult probeResult; @@ -286,14 +283,7 @@ private async Task ProbeDirectly(CancellationToken cancellation) { MessagingInstruments.OnPingReplyReceived(TargetSiloAddress); - if (_log.IsEnabled(LogLevel.Trace)) - { - _log.LogTrace( - "Got successful ping response for ping #{Id} from {Silo} with round trip time of {RoundTripTime}", - id, - TargetSiloAddress, - roundTripTimer.Elapsed); - } + LogTraceGotSuccessfulPingResponse(_log, id, TargetSiloAddress, 
roundTripTimer.Elapsed); _failedProbes = 0; _elapsedSinceLastSuccessfulResponse.Restart(); @@ -305,14 +295,7 @@ private async Task ProbeDirectly(CancellationToken cancellation) MessagingInstruments.OnPingReplyMissed(TargetSiloAddress); var failedProbes = ++_failedProbes; - _log.LogWarning( - (int)ErrorCode.MembershipMissedPing, - failureException, - "Did not get response for probe #{Id} to silo {Silo} after {Elapsed}. Current number of consecutive failed probes is {FailedProbeCount}", - id, - TargetSiloAddress, - roundTripTimer.Elapsed, - failedProbes); + LogWarningDidNotGetResponseForProbe(_log, failureException, id, TargetSiloAddress, roundTripTimer.Elapsed, failedProbes); probeResult = ProbeResult.CreateDirect(failedProbes, ProbeResultStatus.Failed); } @@ -330,10 +313,7 @@ private async Task ProbeDirectly(CancellationToken cancellation) private async Task ProbeIndirectly(SiloAddress intermediary, TimeSpan directProbeTimeout, CancellationToken cancellation) { var id = ++_nextProbeId; - if (_log.IsEnabled(LogLevel.Trace)) - { - _log.LogTrace("Going to send indirect ping #{Id} to probe silo {Silo} via {Intermediary}", id, TargetSiloAddress, intermediary); - } + LogTraceGoingToSendIndirectPing(_log, id, TargetSiloAddress, intermediary); var roundTripTimer = ValueStopwatch.StartNew(); ProbeResult probeResult; @@ -350,13 +330,7 @@ private async Task ProbeIndirectly(SiloAddress intermediary, TimeSp if (indirectResult.Succeeded) { - _log.LogInformation( - "Indirect probe request #{Id} to silo {SiloAddress} via silo {IntermediarySiloAddress} succeeded after {RoundTripTime} with a direct probe response time of {ProbeResponseTime}.", - id, - TargetSiloAddress, - intermediary, - roundTripTimer.Elapsed, - indirectResult.ProbeResponseTime); + LogInformationIndirectProbeSucceeded(_log, id, TargetSiloAddress, intermediary, roundTripTimer.Elapsed, indirectResult.ProbeResponseTime); MessagingInstruments.OnPingReplyReceived(TargetSiloAddress); @@ -369,24 +343,12 @@ private async 
Task ProbeIndirectly(SiloAddress intermediary, TimeSp if (indirectResult.IntermediaryHealthScore > 0) { - _log.LogInformation( - "Ignoring failure result for ping #{Id} from {Silo} since the intermediary used to probe the silo is not healthy. Intermediary health degradation score: {IntermediaryHealthScore}.", - id, - TargetSiloAddress, - indirectResult.IntermediaryHealthScore); + LogInformationIgnoringFailureResultForPing(_log, id, TargetSiloAddress, indirectResult.IntermediaryHealthScore); probeResult = ProbeResult.CreateIndirect(_failedProbes, ProbeResultStatus.Unknown, indirectResult, intermediary); } else { - _log.LogWarning( - "Indirect probe request #{Id} to silo {SiloAddress} via silo {IntermediarySiloAddress} failed after {RoundTripTime} with a direct probe response time of {ProbeResponseTime}. Failure message: {FailureMessage}. Intermediary health score: {IntermediaryHealthScore}.", - id, - TargetSiloAddress, - intermediary, - roundTripTimer.Elapsed, - indirectResult.ProbeResponseTime, - indirectResult.FailureMessage, - indirectResult.IntermediaryHealthScore); + LogWarningIndirectProbeFailed(_log, id, TargetSiloAddress, intermediary, roundTripTimer.Elapsed, indirectResult.ProbeResponseTime, indirectResult.FailureMessage, indirectResult.IntermediaryHealthScore); var missed = ++_failedProbes; probeResult = ProbeResult.CreateIndirect(missed, ProbeResultStatus.Failed, indirectResult, intermediary); @@ -396,7 +358,7 @@ private async Task ProbeIndirectly(SiloAddress intermediary, TimeSp catch (Exception exception) { MessagingInstruments.OnPingReplyMissed(TargetSiloAddress); - _log.LogWarning(exception, "Indirect probe request failed."); + LogWarningIndirectProbeRequestFailed(_log, exception); probeResult = ProbeResult.CreateIndirect(_failedProbes, ProbeResultStatus.Unknown, default, intermediary); } @@ -444,5 +406,66 @@ public enum ProbeResultStatus Failed, Succeeded } + + [LoggerMessage( + Level = LogLevel.Trace, + Message = "Going to send Ping #{Id} to probe 
silo {Silo}" + )] + private static partial void LogTraceGoingToSendPing(ILogger logger, int id, SiloAddress silo); + + [LoggerMessage( + Level = LogLevel.Trace, + Message = "Got successful ping response for ping #{Id} from {Silo} with round trip time of {RoundTripTime}" + )] + private static partial void LogTraceGotSuccessfulPingResponse(ILogger logger, int id, SiloAddress silo, TimeSpan roundTripTime); + + [LoggerMessage( + Level = LogLevel.Warning, + EventId = (int)ErrorCode.MembershipMissedPing, + Message = "Did not get response for probe #{Id} to silo {Silo} after {Elapsed}. Current number of consecutive failed probes is {FailedProbeCount}" + )] + private static partial void LogWarningDidNotGetResponseForProbe(ILogger logger, Exception exception, int id, SiloAddress silo, TimeSpan elapsed, int failedProbeCount); + + [LoggerMessage( + Level = LogLevel.Trace, + Message = "Going to send indirect ping #{Id} to probe silo {Silo} via {Intermediary}" + )] + private static partial void LogTraceGoingToSendIndirectPing(ILogger logger, int id, SiloAddress silo, SiloAddress intermediary); + + [LoggerMessage( + Level = LogLevel.Information, + Message = "Indirect probe request #{Id} to silo {SiloAddress} via silo {IntermediarySiloAddress} succeeded after {RoundTripTime} with a direct probe response time of {ProbeResponseTime}." + )] + private static partial void LogInformationIndirectProbeSucceeded(ILogger logger, int id, SiloAddress siloAddress, SiloAddress intermediarySiloAddress, TimeSpan roundTripTime, TimeSpan probeResponseTime); + + [LoggerMessage( + Level = LogLevel.Information, + Message = "Ignoring failure result for ping #{Id} from {Silo} since the intermediary used to probe the silo is not healthy. Intermediary health degradation score: {IntermediaryHealthScore}." 
+ )] + private static partial void LogInformationIgnoringFailureResultForPing(ILogger logger, int id, SiloAddress silo, int intermediaryHealthScore); + + [LoggerMessage( + Level = LogLevel.Warning, + Message = "Indirect probe request #{Id} to silo {SiloAddress} via silo {IntermediarySiloAddress} failed after {RoundTripTime} with a direct probe response time of {ProbeResponseTime}. Failure message: {FailureMessage}. Intermediary health score: {IntermediaryHealthScore}." + )] + private static partial void LogWarningIndirectProbeFailed(ILogger logger, int id, SiloAddress siloAddress, SiloAddress intermediarySiloAddress, TimeSpan roundTripTime, TimeSpan probeResponseTime, string failureMessage, int intermediaryHealthScore); + + [LoggerMessage( + Level = LogLevel.Warning, + Message = "Indirect probe request failed." + )] + private static partial void LogWarningIndirectProbeRequestFailed(ILogger logger, Exception exception); + + [LoggerMessage( + Level = LogLevel.Information, + Message = "Recusing unhealthy intermediary '{Intermediary}' and trying again with remaining nodes" + )] + private static partial void LogInformationRecusingUnhealthyIntermediary(ILogger logger, SiloAddress intermediary); + + [LoggerMessage( + Level = LogLevel.Error, + Message = "Exception monitoring silo {SiloAddress}" + )] + private static partial void LogErrorExceptionMonitoringSilo(ILogger logger, Exception exception, SiloAddress siloAddress); } }