diff --git a/Orleans.slnx b/Orleans.slnx index 2af4d7d21aa..c152e1ce77b 100644 --- a/Orleans.slnx +++ b/Orleans.slnx @@ -34,6 +34,7 @@ + @@ -76,6 +77,7 @@ + @@ -84,7 +86,7 @@ - + @@ -133,7 +135,7 @@ - + diff --git a/src/Azure/Orleans.ScheduledJobs.AzureStorage/AzureStorageJobShard.Log.cs b/src/Azure/Orleans.ScheduledJobs.AzureStorage/AzureStorageJobShard.Log.cs new file mode 100644 index 00000000000..a6f42ccf6bd --- /dev/null +++ b/src/Azure/Orleans.ScheduledJobs.AzureStorage/AzureStorageJobShard.Log.cs @@ -0,0 +1,109 @@ +using System; +using Microsoft.Extensions.Logging; + +namespace Orleans.ScheduledJobs.AzureStorage; + +internal sealed partial class AzureStorageJobShard +{ + [LoggerMessage( + Level = LogLevel.Information, + Message = "Initializing shard '{ShardId}' from Azure Storage blob" + )] + private static partial void LogInitializingShard(ILogger logger, string shardId); + + [LoggerMessage( + Level = LogLevel.Information, + Message = "Shard '{ShardId}' initialized successfully. 
Loaded {JobCount} job(s) in {ElapsedMilliseconds}ms" + )] + private static partial void LogShardInitialized(ILogger logger, string shardId, int jobCount, long elapsedMilliseconds); + + [LoggerMessage( + Level = LogLevel.Debug, + Message = "Adding job '{JobId}' (Name: '{JobName}') to shard '{ShardId}' with due time {DueTime}" + )] + private static partial void LogAddingJob(ILogger logger, string jobId, string jobName, string shardId, DateTimeOffset dueTime); + + [LoggerMessage( + Level = LogLevel.Debug, + Message = "Removing job '{JobId}' from shard '{ShardId}'" + )] + private static partial void LogRemovingJob(ILogger logger, string jobId, string shardId); + + [LoggerMessage( + Level = LogLevel.Debug, + Message = "Retrying job '{JobId}' in shard '{ShardId}' with new due time {NewDueTime}" + )] + private static partial void LogRetryingJob(ILogger logger, string jobId, string shardId, DateTimeOffset newDueTime); + + [LoggerMessage( + Level = LogLevel.Trace, + Message = "Flushing batch of {OperationCount} job operation(s) to shard '{ShardId}'" + )] + private static partial void LogFlushingBatch(ILogger logger, int operationCount, string shardId); + + [LoggerMessage( + Level = LogLevel.Debug, + Message = "Batch of {OperationCount} job operation(s) written to shard '{ShardId}' in {ElapsedMilliseconds}ms. 
Total committed blocks: {CommittedBlockCount}" + )] + private static partial void LogBatchWritten(ILogger logger, int operationCount, string shardId, long elapsedMilliseconds, int committedBlockCount); + + [LoggerMessage( + Level = LogLevel.Trace, + Message = "Updating metadata for shard '{ShardId}'" + )] + private static partial void LogUpdatingMetadata(ILogger logger, string shardId); + + [LoggerMessage( + Level = LogLevel.Debug, + Message = "Metadata updated for shard '{ShardId}'" + )] + private static partial void LogMetadataUpdated(ILogger logger, string shardId); + + [LoggerMessage( + Level = LogLevel.Warning, + Message = "Shard '{ShardId}' has {CommittedBlockCount} committed blocks, approaching Azure Blob append limit of 50,000" + )] + private static partial void LogApproachingBlockLimit(ILogger logger, string shardId, int committedBlockCount); + + [LoggerMessage( + Level = LogLevel.Warning, + Message = "Large batch detected for shard '{ShardId}': {OperationCount} operations (max configured: {MaxBatchSize})" + )] + private static partial void LogLargeBatch(ILogger logger, string shardId, int operationCount, int maxBatchSize); + + [LoggerMessage( + Level = LogLevel.Error, + Message = "Error writing batch of {OperationCount} operation(s) to shard '{ShardId}'" + )] + private static partial void LogErrorWritingBatch(ILogger logger, Exception exception, int operationCount, string shardId); + + [LoggerMessage( + Level = LogLevel.Error, + Message = "Error updating metadata for shard '{ShardId}'" + )] + private static partial void LogErrorUpdatingMetadata(ILogger logger, Exception exception, string shardId); + + [LoggerMessage( + Level = LogLevel.Debug, + Message = "Stopping storage processor for shard '{ShardId}'" + )] + private static partial void LogStoppingProcessor(ILogger logger, string shardId); + + [LoggerMessage( + Level = LogLevel.Information, + Message = "Storage processor stopped for shard '{ShardId}'" + )] + private static partial void 
LogProcessorStopped(ILogger logger, string shardId); + + [LoggerMessage( + Level = LogLevel.Trace, + Message = "Processing storage operation queue for shard '{ShardId}'" + )] + private static partial void LogProcessingStorageQueue(ILogger logger, string shardId); + + [LoggerMessage( + Level = LogLevel.Debug, + Message = "Waiting for additional operations to batch (current size: {CurrentSize}, min size: {MinSize}) for shard '{ShardId}'" + )] + private static partial void LogWaitingForBatch(ILogger logger, int currentSize, int minSize, string shardId); +} diff --git a/src/Azure/Orleans.ScheduledJobs.AzureStorage/AzureStorageJobShard.cs b/src/Azure/Orleans.ScheduledJobs.AzureStorage/AzureStorageJobShard.cs new file mode 100644 index 00000000000..5fe10156f89 --- /dev/null +++ b/src/Azure/Orleans.ScheduledJobs.AzureStorage/AzureStorageJobShard.cs @@ -0,0 +1,395 @@ +using System; +using System.Buffers; +using System.Collections.Generic; +using System.Diagnostics; +using System.IO; +using System.Text; +using System.Text.Json; +using System.Threading; +using System.Threading.Channels; +using System.Threading.Tasks; +using System.Transactions; +using Azure; +using Azure.Storage.Blobs; +using Azure.Storage.Blobs.Models; +using Azure.Storage.Blobs.Specialized; +using Microsoft.Extensions.Logging; +using Orleans.Hosting; +using Orleans.Runtime; +using Orleans.Serialization.Buffers.Adaptors; + +namespace Orleans.ScheduledJobs.AzureStorage; + +internal sealed partial class AzureStorageJobShard : JobShard +{ + private readonly Channel _storageOperationChannel; + private readonly Task _storageProcessorTask; + private readonly CancellationTokenSource _shutdownCts = new(); + private readonly AzureStorageJobShardOptions _options; + private readonly ILogger _logger; + + internal AppendBlobClient BlobClient { get; init; } + internal ETag? 
ETag { get; private set; } + internal int CommitedBlockCount { get; private set; } + + public AzureStorageJobShard(string id, DateTimeOffset startTime, DateTimeOffset endTime, AppendBlobClient blobClient, IDictionary? metadata, ETag? eTag, AzureStorageJobShardOptions options, ILogger logger) + : base(id, startTime, endTime) + { + BlobClient = blobClient; + ETag = eTag; + Metadata = metadata; + _options = options; + _logger = logger; + + // Create unbounded channel for storage operations + _storageOperationChannel = Channel.CreateUnbounded(new UnboundedChannelOptions + { + SingleReader = true, + SingleWriter = false + }); + + // Start the background task that processes storage operations + _storageProcessorTask = ProcessStorageOperationsAsync(); + } + + protected override async Task PersistAddJobAsync(string jobId, string jobName, DateTimeOffset dueTime, GrainId target, IReadOnlyDictionary? metadata, CancellationToken cancellationToken) + { + LogAddingJob(_logger, jobId, jobName, Id, dueTime); + var operation = JobOperation.CreateAddOperation(jobId, jobName, dueTime, target, metadata); + await EnqueueStorageOperationAsync(StorageOperation.CreateAppendOperation(operation), cancellationToken); + } + + protected override async Task PersistRemoveJobAsync(string jobId, CancellationToken cancellationToken) + { + LogRemovingJob(_logger, jobId, Id); + var operation = JobOperation.CreateRemoveOperation(jobId); + await EnqueueStorageOperationAsync(StorageOperation.CreateAppendOperation(operation), cancellationToken); + } + + protected override async Task PersistRetryJobAsync(string jobId, DateTimeOffset newDueTime, CancellationToken cancellationToken) + { + LogRetryingJob(_logger, jobId, Id, newDueTime); + var operation = JobOperation.CreateRetryOperation(jobId, newDueTime); + await EnqueueStorageOperationAsync(StorageOperation.CreateAppendOperation(operation), cancellationToken); + } + + public async Task UpdateBlobMetadata(IDictionary metadata, CancellationToken 
cancellationToken) + { + LogUpdatingMetadata(_logger, Id); + await EnqueueStorageOperationAsync(StorageOperation.CreateMetadataOperation(metadata), cancellationToken); + } + + public async ValueTask InitializeAsync(CancellationToken cancellationToken) + { + LogInitializingShard(_logger, Id); + var sw = Stopwatch.StartNew(); + + // Load existing blob + var response = await BlobClient.DownloadAsync(cancellationToken: cancellationToken); + using var stream = response.Value.Content; + + // Rebuild state by replaying operations + var addedJobs = new Dictionary(); + var deletedJobs = new HashSet(); + var jobRetryCounters = new Dictionary(); + + await foreach (var operation in NetstringJsonSerializer.DecodeAsync(stream, JobOperationJsonContext.Default.JobOperation, cancellationToken)) + { + switch (operation.Type) + { + case JobOperation.OperationType.Add: + if (!deletedJobs.Contains(operation.Id)) + { + addedJobs[operation.Id] = operation; + } + break; + case JobOperation.OperationType.Remove: + deletedJobs.Add(operation.Id); + addedJobs.Remove(operation.Id); + jobRetryCounters.Remove(operation.Id); + break; + case JobOperation.OperationType.Retry: + if (!deletedJobs.Contains(operation.Id)) + { + if (!jobRetryCounters.ContainsKey(operation.Id)) + { + jobRetryCounters[operation.Id] = (1, operation.DueTime); + } + else + { + var entry = jobRetryCounters[operation.Id]; + jobRetryCounters[operation.Id] = (entry.dequeueCount + 1, operation.DueTime); + } + } + break; + } + } + + // Rebuild the priority queue + foreach (var op in addedJobs.Values) + { + var retryCounter = 0; + var dueTime = op.DueTime!.Value; + if (jobRetryCounters.TryGetValue(op.Id, out var retryEntries)) + { + retryCounter = retryEntries.dequeueCount; + dueTime = retryEntries.newDueTime ?? 
dueTime; + } + + EnqueueJob(new ScheduledJob + { + Id = op.Id, + Name = op.Name!, + DueTime = dueTime, + TargetGrainId = op.TargetGrainId!.Value, + ShardId = Id, + Metadata = op.Metadata, + }, + retryCounter); + } + + ETag = response.Value.Details.ETag; + + sw.Stop(); + LogShardInitialized(_logger, Id, addedJobs.Count, sw.ElapsedMilliseconds); + } + + private async Task EnqueueStorageOperationAsync(StorageOperation operation, CancellationToken cancellationToken) + { + await _storageOperationChannel.Writer.WriteAsync(operation, cancellationToken); + await operation.CompletionSource.Task; + } + + private async Task ProcessStorageOperationsAsync() + { + await Task.CompletedTask.ConfigureAwait(ConfigureAwaitOptions.ContinueOnCapturedContext | ConfigureAwaitOptions.ForceYielding); + + var cancellationToken = _shutdownCts.Token; + // TODO: AppendBlob has a limit of 50,000 blocks. Implement blob rotation when this limit is approached. + var batchOperations = new List(_options.MaxBatchSize); + + try + { + while (await _storageOperationChannel.Reader.WaitToReadAsync(cancellationToken)) + { + // Read first operation + if (!_storageOperationChannel.Reader.TryRead(out var firstOperation)) + { + continue; + } + + // Handle metadata operations immediately (cannot be batched) + if (firstOperation.Type is StorageOperationType.UpdateMetadata) + { + try + { + await UpdateMetadataAsync(firstOperation.Metadata!, cancellationToken); + LogMetadataUpdated(_logger, Id); + firstOperation.CompletionSource.TrySetResult(); + } + catch (Exception ex) + { + LogErrorUpdatingMetadata(_logger, ex, Id); + firstOperation.CompletionSource?.TrySetException(ex); + } + continue; + } + + // Collect job operations for batching + batchOperations.Add(firstOperation); + + // Try to collect more operations up to the maximum batch size + if (TryCollectJobOperationsForBatch(batchOperations)) + { + // Not enough operations to meet the minimum batch size, wait for more or timeout + if (batchOperations.Count < 
_options.MinBatchSize) + { + LogWaitingForBatch(_logger, batchOperations.Count, _options.MinBatchSize, Id); + + // Only pay the flush-interval latency while the batch is below the configured minimum; + // a batch that already meets MinBatchSize (default 1) is flushed immediately. + await Task.Delay(_options.BatchFlushInterval, cancellationToken); + TryCollectJobOperationsForBatch(batchOperations); + } + } + + // Process the batch of job operations + if (batchOperations.Count > 0) + { + try + { + LogFlushingBatch(_logger, batchOperations.Count, Id); + await AppendJobOperationBatchAsync(batchOperations, cancellationToken); + + // Mark all operations as completed + foreach (var op in batchOperations) + { + op.CompletionSource.TrySetResult(); + } + } + catch (Exception ex) + { + LogErrorWritingBatch(_logger, ex, batchOperations.Count, Id); + + // Mark all operations as failed + foreach (var op in batchOperations) + { + op.CompletionSource?.TrySetException(ex); + } + } + finally + { + batchOperations.Clear(); + } + } + } + } + catch (OperationCanceledException) + { + // Ignore + } + finally + { + // Operations already dequeued into the current batch but not yet flushed (for example when shutdown + // interrupts the batching delay) must be completed too, otherwise their callers would await forever. + foreach (var pending in batchOperations) + { + pending.CompletionSource?.TrySetCanceled(cancellationToken); + } + batchOperations.Clear(); + + // Expected during shutdown - cancel all pending operations + while (_storageOperationChannel.Reader.TryRead(out var operation)) + { + operation.CompletionSource?.TrySetCanceled(cancellationToken); + } + } + + // Local function to collect job operations for batching. Returns true if more operations can be collected. 
+ bool TryCollectJobOperationsForBatch(List batchOperations) + { + // Collect more jobs, up to a maximum batch size + while (batchOperations.Count < _options.MaxBatchSize && _storageOperationChannel.Reader.TryPeek(out var nextOperation)) + { + if (nextOperation.Type is StorageOperationType.UpdateMetadata) + { + // Stop batching if we encounter a metadata operation + return false; + } + _storageOperationChannel.Reader.TryRead(out var operation); + Debug.Assert(operation != null); + batchOperations.Add(operation!); + } + return batchOperations.Count != _options.MaxBatchSize; + } + } + + // Encodes the whole batch into one pooled buffer and appends it to the blob as a single block, + // conditioned on the current ETag; on success the ETag and committed block count are refreshed. + private async Task AppendJobOperationBatchAsync(List operations, CancellationToken cancellationToken) + { + var sw = Stopwatch.StartNew(); + using var stream = PooledBufferStream.Rent(); + try + { + stream.Position = 0; // TODO Remove that once PooledBufferStream fixed + + // Encode all job operations into a single stream + foreach (var operation in operations) + { + NetstringJsonSerializer.Encode(operation.JobOperation!.Value, stream, JobOperationJsonContext.Default.JobOperation); + } + // Rewind so the append uploads the encoded payload from its first byte + stream.Position = 0; + var result = await BlobClient.AppendBlockAsync( + stream, + new AppendBlobAppendBlockOptions { Conditions = new AppendBlobRequestConditions { IfMatch = ETag } }, + cancellationToken); + ETag = result.Value.ETag; + CommitedBlockCount = result.Value.BlobCommittedBlockCount; + + sw.Stop(); + LogBatchWritten(_logger, operations.Count, Id, sw.ElapsedMilliseconds, CommitedBlockCount); + + // Warn if approaching the 50,000 block limit (warn at 80%) + if (CommitedBlockCount > 40000) + { + LogApproachingBlockLimit(_logger, Id, CommitedBlockCount); + } + + // Warn if batch is unusually large + if (operations.Count > _options.MaxBatchSize * 0.8) + { + LogLargeBatch(_logger, Id, operations.Count, _options.MaxBatchSize); + } + } + finally + { + // NOTE(review): 'stream' is declared with 'using' and is also returned to the pool here - confirm PooledBufferStream tolerates both + PooledBufferStream.Return(stream); + } + } + + private async Task 
UpdateMetadataAsync(IDictionary metadata, CancellationToken cancellationToken) + { + var result = await BlobClient.SetMetadataAsync( + metadata, + new BlobRequestConditions { IfMatch = ETag }, + cancellationToken); + ETag = result.Value.ETag; + Metadata = metadata; + } + + /// + /// Stops the background storage processor and waits for all pending operations to complete. + /// After calling this method, no new storage operations can be enqueued. + /// This method is idempotent and can be called multiple times safely. + /// + internal async Task StopProcessorAsync(CancellationToken cancellationToken) + { + LogStoppingProcessor(_logger, Id); + + // Complete the channel to stop accepting new operations (idempotent operation) + if (_storageOperationChannel.Writer.TryComplete()) + { + _shutdownCts.Cancel(); + } + + // Wait for the background processor to finish all pending operations + try + { + await _storageProcessorTask.WaitAsync(cancellationToken); + LogProcessorStopped(_logger, Id); + } + catch (OperationCanceledException) + { + // Expected during normal shutdown + LogProcessorStopped(_logger, Id); + } + } + + public override async ValueTask DisposeAsync() + { + await StopProcessorAsync(CancellationToken.None); + _shutdownCts.Dispose(); + await base.DisposeAsync(); + } +} + +internal enum StorageOperationType +{ + AppendJobOperation, + UpdateMetadata +} + +internal sealed class StorageOperation +{ + public required StorageOperationType Type { get; init; } + public JobOperation? JobOperation { get; init; } + public IDictionary? 
Metadata { get; init; } + public TaskCompletionSource CompletionSource { get; init; } = new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously); + + public static StorageOperation CreateAppendOperation(JobOperation jobOperation) + { + return new StorageOperation + { + Type = StorageOperationType.AppendJobOperation, + JobOperation = jobOperation + }; + } + + public static StorageOperation CreateMetadataOperation(IDictionary metadata) + { + return new StorageOperation + { + Type = StorageOperationType.UpdateMetadata, + Metadata = metadata + }; + } +} diff --git a/src/Azure/Orleans.ScheduledJobs.AzureStorage/AzureStorageJobShardManager.cs b/src/Azure/Orleans.ScheduledJobs.AzureStorage/AzureStorageJobShardManager.cs new file mode 100644 index 00000000000..83b308dc3f4 --- /dev/null +++ b/src/Azure/Orleans.ScheduledJobs.AzureStorage/AzureStorageJobShardManager.cs @@ -0,0 +1,417 @@ +using System; +using System.Collections.Concurrent; +using System.Collections.Generic; +using System.Diagnostics; +using System.Globalization; +using System.Threading; +using System.Threading.Tasks; +using Azure; +using Azure.Storage.Blobs; +using Azure.Storage.Blobs.Models; +using Azure.Storage.Blobs.Specialized; +using Microsoft.Extensions.Logging; +using Microsoft.Extensions.Options; +using Orleans.Hosting; +using Orleans.Runtime; + +namespace Orleans.ScheduledJobs.AzureStorage; + +public sealed partial class AzureStorageJobShardManager : JobShardManager +{ + private readonly BlobServiceClient _blobServiceClient; + private readonly string _containerName; + private readonly string _blobPrefix; + private BlobContainerClient _client = null!; + private readonly IClusterMembershipService _clusterMembership; + private readonly ConcurrentDictionary _jobShardCache = new(); + private readonly ILogger _logger; + private readonly ILoggerFactory _loggerFactory; + private readonly AzureStorageJobShardOptions _options; + private long _shardCounter = 0; // For generating unique shard 
IDs + + public AzureStorageJobShardManager( + SiloAddress siloAddress, + BlobServiceClient client, + string containerName, + string blobPrefix, + AzureStorageJobShardOptions options, + IClusterMembershipService clusterMembership, + ILoggerFactory loggerFactory) + : base(siloAddress) + { + _blobServiceClient = client; + _containerName = containerName; + _blobPrefix = blobPrefix; + _clusterMembership = clusterMembership; + _logger = loggerFactory.CreateLogger(); + _loggerFactory = loggerFactory; + _options = options; + } + + public AzureStorageJobShardManager( + ILocalSiloDetails localSiloDetails, + IOptions options, + IClusterMembershipService clusterMembership, + ILoggerFactory loggerFactory) + : this(localSiloDetails.SiloAddress, options.Value.BlobServiceClient, options.Value.ContainerName, localSiloDetails.ClusterId, options.Value, clusterMembership, loggerFactory) + { + } + + public override async Task> AssignJobShardsAsync(DateTimeOffset maxShardStartTime, CancellationToken cancellationToken) + { + await InitializeIfNeeded(cancellationToken); + LogAssigningShards(_logger, SiloAddress, maxShardStartTime, _containerName); + + var result = new List(); + await foreach (var blob in _client.GetBlobsAsync(traits: BlobTraits.Metadata, cancellationToken: cancellationToken, prefix: _blobPrefix)) + { + // Get the owner and creator of the shard + var (owner, membershipVersion, shardStartTime, maxDueTime) = ParseMetadata(blob.Metadata); + + // Check if the membership version is more recent than our current version + if (membershipVersion > _clusterMembership.CurrentSnapshot.Version) + { + // Refresh membership to at least that version + await _clusterMembership.Refresh(membershipVersion, cancellationToken); + } + + if (shardStartTime > maxShardStartTime) + { + // This shard is too new. Since blobs are returned in alphabetical order and our blob names + // contain timestamps (yyyyMMddHHmm format), all subsequent blobs will also be too new. 
+ LogShardTooNew(_logger, blob.Name, shardStartTime, maxShardStartTime); + break; + } + + // If I am the owner, the shard must be in cache - always return it + if (owner is not null && owner.Equals(SiloAddress)) + { + if (_jobShardCache.TryGetValue(blob.Name, out var shard)) + { + LogShardAssigned(_logger, blob.Name, SiloAddress); + result.Add(shard); + } + else + { + // Shard is owned by us but not in cache - this is unexpected, release ownership + Debug.Assert(false, $"Shard '{blob.Name}' is owned by this silo but not in cache - releasing ownership"); + await ReleaseOwnership(blob.Name); + } + continue; + } + + // In debug, verify that if we're not the owner, the shard should not be in our cache + Debug.Assert(!_jobShardCache.ContainsKey(blob.Name), $"Shard '{blob.Name}' is in cache but we are not the owner (owner: {owner?.ToParsableString() ?? "none"})"); + + // Check if the owner is valid + var ownerStatus = owner is not null ? _clusterMembership.CurrentSnapshot.GetSiloStatus(owner) : SiloStatus.None; + + if (ownerStatus is not SiloStatus.Dead and not SiloStatus.None) + { + // Owner is still active and it's not me, skip this shard + LogShardStillOwned(_logger, blob.Name, owner!); + continue; + } + else + { + // Try to claim orphaned shard + LogClaimingShard(_logger, blob.Name, SiloAddress, owner); + var blobClient = _client.GetAppendBlobClient(blob.Name); + var metadata = blob.Metadata; + var orphanedShard = new AzureStorageJobShard(blob.Name, shardStartTime, maxDueTime, blobClient, metadata, blob.Properties.ETag, _options, _loggerFactory.CreateLogger()); + if (!await TryTakeOwnership(orphanedShard, metadata, SiloAddress, cancellationToken)) + { + // Someone else took over the shard, dispose and continue + await orphanedShard.DisposeAsync(); + LogShardOwnershipConflict(_logger, blob.Name, SiloAddress); + continue; + } + await orphanedShard.InitializeAsync(cancellationToken); + // We don't want to add new jobs to shards that we just took ownership of + await 
orphanedShard.MarkAsCompleteAsync(cancellationToken); + _jobShardCache[blob.Name] = orphanedShard; + LogShardAssigned(_logger, blob.Name, SiloAddress); + result.Add(orphanedShard); + } + } + + LogAssignmentCompleted(_logger, result.Count, SiloAddress); + return result; + + async Task ReleaseOwnership(string blobName) + { + try + { + var blobClient = _client.GetAppendBlobClient(blobName); + var properties = await blobClient.GetPropertiesAsync(cancellationToken: cancellationToken); + var metadata = properties.Value.Metadata; + metadata.Remove("Owner"); + await blobClient.SetMetadataAsync(metadata, new BlobRequestConditions { IfMatch = properties.Value.ETag }, cancellationToken); + } + catch (Exception ex) + { + // Log but continue - we'll let another silo claim it + _logger.LogWarning(ex, "Failed to release ownership of shard '{ShardId}' that was not in cache", blobName); + } + } + + // Stamps this silo as owner in the blob metadata; returns false when the conditional metadata write is rejected. + async Task TryTakeOwnership(AzureStorageJobShard shard, IDictionary metadata, SiloAddress newOwner, CancellationToken ct) + { + metadata["Owner"] = newOwner.ToParsableString(); + // Blob metadata is machine-readable: format with the invariant culture, as CreateMetadata does for the same key + metadata["MembershipVersion"] = _clusterMembership.CurrentSnapshot.Version.Value.ToString(CultureInfo.InvariantCulture); + try + { + await shard.UpdateBlobMetadata(metadata, ct); + LogOwnershipTaken(_logger, shard.Id, newOwner); + return true; + } + catch (RequestFailedException ex) + { + // Someone else took over the shard + LogOwnershipFailed(_logger, ex, shard.Id, newOwner); + return false; + } + } + } + + public override async Task CreateShardAsync(DateTimeOffset minDueTime, DateTimeOffset maxDueTime, IDictionary metadata, CancellationToken cancellationToken) + { + await InitializeIfNeeded(cancellationToken); + LogRegisteringShard(_logger, SiloAddress, minDueTime, maxDueTime, _containerName); + + var i = 0; + while (true) + { + var counter = Interlocked.Increment(ref _shardCounter); + var shardId = $"{_blobPrefix}-{minDueTime:yyyyMMddHHmm}-{SiloAddress.ToParsableString()}-{counter}"; + var blobClient = 
_client.GetAppendBlobClient(shardId); + var metadataInfo = CreateMetadata(metadata, SiloAddress, _clusterMembership.CurrentSnapshot.Version, minDueTime, maxDueTime); + metadataInfo["Owner"] = SiloAddress.ToParsableString(); + try + { + var response = await blobClient.CreateIfNotExistsAsync(metadata: metadataInfo, cancellationToken: cancellationToken); + if (response == null) + { + // Blob already exists: count the collision against the retry budget so the loop stays bounded, + // then try again with a different name + i++; + if (i > _options.MaxBlobCreationRetries) + { + throw new InvalidOperationException($"Failed to create shard blob '{shardId}' after {i} attempts"); + } + LogShardIdCollision(_logger, shardId, i); + continue; + } + } + catch (RequestFailedException ex) + { + i++; + if (i > _options.MaxBlobCreationRetries) + { + throw new InvalidOperationException($"Failed to create shard blob '{shardId}' after {i} attempts", ex); + } + // Blob already exists, try again with a different name + LogShardRegistrationRetry(_logger, ex, shardId, i); + continue; + } + + var shard = new AzureStorageJobShard(shardId, minDueTime, maxDueTime, blobClient, metadataInfo, null, _options, _loggerFactory.CreateLogger()); + await shard.InitializeAsync(cancellationToken); + _jobShardCache[shardId] = shard; + LogShardRegistered(_logger, shardId, SiloAddress); + return shard; + } + } + + public override async Task UnregisterShardAsync(IJobShard shard, CancellationToken cancellationToken) + { + var azureShard = shard as AzureStorageJobShard ?? 
throw new ArgumentException("Shard is not an AzureStorageJobShard", nameof(shard)); + LogUnregisteringShard(_logger, shard.Id, SiloAddress); + + // Stop the background storage processor to ensure no more changes can happen + await azureShard.StopProcessorAsync(cancellationToken); + + // Now we can safely get a consistent view of the state + var count = await shard.GetJobCountAsync(); + // We want to make sure to get the latest properties + var properties = await azureShard.BlobClient.GetPropertiesAsync(cancellationToken: cancellationToken); + + // But we don't want to update the metadata if the ETag has changed + var currentETag = properties.Value.ETag; + var conditions = new BlobRequestConditions { IfMatch = currentETag }; + var metadata = properties.Value.Metadata; + var (owner, _, _, _) = ParseMetadata(metadata); + + // Compare by value, matching the ownership check in AssignJobShardsAsync + if (owner is null || !owner.Equals(SiloAddress)) + { + LogUnregisterWrongOwner(_logger, shard.Id, SiloAddress, owner); + throw new InvalidOperationException("Cannot unregister a shard owned by another silo"); + } + + if (count > 0) + { + // There are still jobs in the shard, unregister it + metadata.Remove("Owner"); + await azureShard.BlobClient.SetMetadataAsync(metadata, conditions, cancellationToken); + _jobShardCache.TryRemove(shard.Id, out _); + LogShardOwnershipReleased(_logger, shard.Id, SiloAddress, count); + } + else + { + // No jobs left, we can delete the shard + await azureShard.BlobClient.DeleteIfExistsAsync(conditions: conditions, cancellationToken: cancellationToken); + _jobShardCache.TryRemove(shard.Id, out _); + LogShardDeleted(_logger, shard.Id, SiloAddress); + } + + // Dispose the shard's resources + await azureShard.DisposeAsync(); + } + + // Lazily resolves the container client and makes sure the container exists before the client is used. + private async ValueTask InitializeIfNeeded(CancellationToken cancellationToken = default) + { + if (_client != null) return; + + LogInitializing(_logger, _containerName); + // Publish the client only after the container is known to exist; assigning it first would make + // every later call skip container creation after a single failed attempt. + var containerClient = _blobServiceClient.GetBlobContainerClient(_containerName); + await containerClient.CreateIfNotExistsAsync(cancellationToken: 
cancellationToken); + _client = containerClient; + LogInitialized(_logger, _containerName); + } + + private static Dictionary CreateMetadata(IDictionary existingMetadata, SiloAddress siloAddress, MembershipVersion membershipVersion, DateTimeOffset minDueTime, DateTimeOffset maxDueTime) + { + var metadata = new Dictionary(existingMetadata) + { + { "MinDueTime", minDueTime.ToString("o") }, + { "MaxDueTime", maxDueTime.ToString("o") }, + { "MembershipVersion", membershipVersion.Value.ToString(CultureInfo.InvariantCulture) } + }; + + return metadata; + } + + private static (SiloAddress? owner, MembershipVersion membershipVersion, DateTimeOffset minDueTime, DateTimeOffset maxDueTime) ParseMetadata(IDictionary metadata) + { + var owner = metadata.TryGetValue("Owner", out var ownerStr) ? SiloAddress.FromParsableString(ownerStr) : null; + var membershipVersion = metadata.TryGetValue("MembershipVersion", out var membershipVersionStr) && long.TryParse(membershipVersionStr, out var versionValue) + ? new MembershipVersion(versionValue) + : MembershipVersion.MinValue; + var minDueTime = metadata.TryGetValue("MinDueTime", out var minDueTimeStr) && DateTimeOffset.TryParse(minDueTimeStr, out var minDt) ? minDt : DateTimeOffset.MinValue; + var maxDueTime = metadata.TryGetValue("MaxDueTime", out var maxDueTimeStr) && DateTimeOffset.TryParse(maxDueTimeStr, out var maxDt) ? 
maxDt : DateTimeOffset.MaxValue; + return (owner, membershipVersion, minDueTime, maxDueTime); + } + + [LoggerMessage( + Level = LogLevel.Information, + Message = "Initializing Azure Storage Job Shard Manager with container '{ContainerName}'" + )] + private static partial void LogInitializing(ILogger logger, string containerName); + + [LoggerMessage( + Level = LogLevel.Information, + Message = "Azure Storage Job Shard Manager initialized successfully for container '{ContainerName}'" + )] + private static partial void LogInitialized(ILogger logger, string containerName); + + [LoggerMessage( + Level = LogLevel.Debug, + Message = "Assigning job shards for silo {SiloAddress} with max time {MaxDateTime} from container '{ContainerName}'" + )] + private static partial void LogAssigningShards(ILogger logger, SiloAddress siloAddress, DateTimeOffset maxDateTime, string containerName); + + [LoggerMessage( + Level = LogLevel.Trace, + Message = "Ignoring shard '{ShardId}' since its start time is greater than specified maximum (MinDueTime={MinDueTime}, MaxDateTime={MaxDateTime})" + )] + private static partial void LogShardTooNew(ILogger logger, string shardId, DateTimeOffset minDueTime, DateTimeOffset maxDateTime); + + [LoggerMessage( + Level = LogLevel.Trace, + Message = "Shard '{ShardId}' is still owned by active silo {Owner}" + )] + private static partial void LogShardStillOwned(ILogger logger, string shardId, SiloAddress owner); + + [LoggerMessage( + Level = LogLevel.Debug, + Message = "Reclaiming shard '{ShardId}' from cache for silo {SiloAddress}" + )] + private static partial void LogReclaimingShardFromCache(ILogger logger, string shardId, SiloAddress siloAddress); + + [LoggerMessage( + Level = LogLevel.Debug, + Message = "Claiming shard '{ShardId}' for silo {SiloAddress} (Previous Owner={PreviousOwner})" + )] + private static partial void LogClaimingShard(ILogger logger, string shardId, SiloAddress siloAddress, SiloAddress? 
previousOwner); + + [LoggerMessage( + Level = LogLevel.Warning, + Message = "Failed to take ownership of shard '{ShardId}' for silo {SiloAddress} due to conflict" + )] + private static partial void LogShardOwnershipConflict(ILogger logger, string shardId, SiloAddress siloAddress); + + [LoggerMessage( + Level = LogLevel.Debug, + Message = "Shard '{ShardId}' assigned to silo {SiloAddress}" + )] + private static partial void LogShardAssigned(ILogger logger, string shardId, SiloAddress siloAddress); + + [LoggerMessage( + Level = LogLevel.Information, + Message = "Assigned {ShardCount} shard(s) to silo {SiloAddress}" + )] + private static partial void LogAssignmentCompleted(ILogger logger, int shardCount, SiloAddress siloAddress); + + [LoggerMessage( + Level = LogLevel.Debug, + Message = "Took ownership of shard '{ShardId}' for silo {SiloAddress}" + )] + private static partial void LogOwnershipTaken(ILogger logger, string shardId, SiloAddress siloAddress); + + [LoggerMessage( + Level = LogLevel.Warning, + Message = "Failed to take ownership of shard '{ShardId}' for silo {SiloAddress}" + )] + private static partial void LogOwnershipFailed(ILogger logger, Exception exception, string shardId, SiloAddress siloAddress); + + [LoggerMessage( + Level = LogLevel.Information, + Message = "Creating new shard for silo {SiloAddress} (MinDueTime={MinDueTime}, MaxDueTime={MaxDueTime}) in container '{ContainerName}'" + )] + private static partial void LogRegisteringShard(ILogger logger, SiloAddress siloAddress, DateTimeOffset minDueTime, DateTimeOffset maxDueTime, string containerName); + + [LoggerMessage( + Level = LogLevel.Trace, + Message = "Shard ID collision for '{ShardId}' (attempt {Attempt}), retrying with new ID" + )] + private static partial void LogShardIdCollision(ILogger logger, string shardId, int attempt); + + [LoggerMessage( + Level = LogLevel.Warning, + Message = "Failed to register shard '{ShardId}' (attempt {Attempt}), retrying" + )] + private static partial void 
LogShardRegistrationRetry(ILogger logger, Exception exception, string shardId, int attempt); + + [LoggerMessage( + Level = LogLevel.Information, + Message = "Shard '{ShardId}' created successfully for silo {SiloAddress}" + )] + private static partial void LogShardRegistered(ILogger logger, string shardId, SiloAddress siloAddress); + + [LoggerMessage( + Level = LogLevel.Information, + Message = "Unregistering shard '{ShardId}' for silo {SiloAddress}" + )] + private static partial void LogUnregisteringShard(ILogger logger, string shardId, SiloAddress siloAddress); + + [LoggerMessage( + Level = LogLevel.Warning, + Message = "Cannot unregister shard '{ShardId}' - silo {SiloAddress} is not the owner (Owner={Owner})" + )] + private static partial void LogUnregisterWrongOwner(ILogger logger, string shardId, SiloAddress siloAddress, SiloAddress? owner); + + [LoggerMessage( + Level = LogLevel.Information, + Message = "Released ownership of shard '{ShardId}' by silo {SiloAddress} ({JobCount} jobs remaining)" + )] + private static partial void LogShardOwnershipReleased(ILogger logger, string shardId, SiloAddress siloAddress, int jobCount); + + [LoggerMessage( + Level = LogLevel.Information, + Message = "Deleted shard '{ShardId}' by silo {SiloAddress} (no jobs remaining)" + )] + private static partial void LogShardDeleted(ILogger logger, string shardId, SiloAddress siloAddress); +} diff --git a/src/Azure/Orleans.ScheduledJobs.AzureStorage/Hosting/AzureStorageJobShardOptions.cs b/src/Azure/Orleans.ScheduledJobs.AzureStorage/Hosting/AzureStorageJobShardOptions.cs new file mode 100644 index 00000000000..f4f85e519ad --- /dev/null +++ b/src/Azure/Orleans.ScheduledJobs.AzureStorage/Hosting/AzureStorageJobShardOptions.cs @@ -0,0 +1,42 @@ +using System; +using Azure.Storage.Blobs; + +namespace Orleans.Hosting; + +public class AzureStorageJobShardOptions +{ + /// + /// Gets or sets the instance used to store job shards. 
+ /// + public BlobServiceClient BlobServiceClient { get; set; } = null!; + + /// + /// Gets or sets the name of the container used to store scheduled jobs. + /// + public string ContainerName { get; set; } = "jobs"; + + /// + /// Gets or sets the maximum number of job operations to batch together in a single blob write. + /// Default is 50 operations. + /// + public int MaxBatchSize { get; set; } = 50; + + /// + /// Gets or sets the minimum number of job operations to batch together before flushing. + /// If more than 1, the flush waits for additional operations (up to BatchFlushInterval). + /// Default is 1 operation (immediate flush, optimized for latency). + /// + public int MinBatchSize { get; set; } = 1; + + /// + /// Gets or sets the maximum time to wait for additional operations if the minimum batch size isn't reached + /// before flushing a batch. + /// Default is 50 milliseconds. + /// + public TimeSpan BatchFlushInterval { get; set; } = TimeSpan.FromMilliseconds(50); + + /// + /// Gets or sets the maximum number of retries for creating a blob for a job shard in case of name collisions. + /// + public int MaxBlobCreationRetries { get; internal set; } = 3; +} diff --git a/src/Azure/Orleans.ScheduledJobs.AzureStorage/Hosting/AzureStorageJobShardOptionsValidator.cs b/src/Azure/Orleans.ScheduledJobs.AzureStorage/Hosting/AzureStorageJobShardOptionsValidator.cs new file mode 100644 index 00000000000..9fbf438b70c --- /dev/null +++ b/src/Azure/Orleans.ScheduledJobs.AzureStorage/Hosting/AzureStorageJobShardOptionsValidator.cs @@ -0,0 +1,39 @@ +using Microsoft.Extensions.Options; +using Orleans.Configuration.Internal; +using Orleans.Runtime; + +namespace Orleans.Hosting; + +/// +/// Validates . +/// +public class AzureStorageJobShardOptionsValidator : IConfigurationValidator +{ + private readonly AzureStorageJobShardOptions _options; + private readonly string _name; + + /// + /// Initializes a new instance of the class. + /// + /// The options. + /// The name. 
+ public AzureStorageJobShardOptionsValidator(AzureStorageJobShardOptions options, string name) + { + _options = options; + _name = name; + } + + /// + public void ValidateConfiguration() + { + if (_options.BlobServiceClient is null) + { + throw new OrleansConfigurationException($"Invalid configuration for {nameof(AzureStorageJobShardOptions)} with name '{_name}'. {nameof(_options.BlobServiceClient)} is required."); + } + + if (string.IsNullOrWhiteSpace(_options.ContainerName)) + { + throw new OrleansConfigurationException($"Invalid configuration for {nameof(AzureStorageJobShardOptions)} with name '{_name}'. {nameof(_options.ContainerName)} is required."); + } + } +} diff --git a/src/Azure/Orleans.ScheduledJobs.AzureStorage/Hosting/AzureStorageScheduledJobsExtensions.cs b/src/Azure/Orleans.ScheduledJobs.AzureStorage/Hosting/AzureStorageScheduledJobsExtensions.cs new file mode 100644 index 00000000000..983273821f5 --- /dev/null +++ b/src/Azure/Orleans.ScheduledJobs.AzureStorage/Hosting/AzureStorageScheduledJobsExtensions.cs @@ -0,0 +1,96 @@ +using System; +using Microsoft.Extensions.DependencyInjection; +using Microsoft.Extensions.Options; +using Orleans.Configuration; +using Orleans.Configuration.Internal; +using Orleans.ScheduledJobs; +using Orleans.ScheduledJobs.AzureStorage; + +namespace Orleans.Hosting; + +/// +/// Extensions for configuring Azure Blob Storage scheduled jobs. +/// +public static class AzureStorageScheduledJobsExtensions +{ + /// + /// Adds scheduled jobs storage backed by Azure Blob Storage. + /// + /// + /// The builder. + /// + /// + /// The delegate used to configure the scheduled jobs storage. + /// + /// + /// The provided , for chaining. + /// + public static ISiloBuilder UseAzureBlobScheduledJobs(this ISiloBuilder builder, Action configure) + { + builder.ConfigureServices(services => services.UseAzureBlobScheduledJobs(configure)); + return builder; + } + + /// + /// Adds scheduled jobs storage backed by Azure Blob Storage. 
+ /// + /// + /// The builder. + /// + /// + /// The configuration delegate. + /// + /// + /// The provided , for chaining. + /// + public static ISiloBuilder UseAzureBlobScheduledJobs(this ISiloBuilder builder, Action> configureOptions) + { + builder.ConfigureServices(services => services.UseAzureBlobScheduledJobs(configureOptions)); + return builder; + } + + /// + /// Adds scheduled jobs storage backed by Azure Blob Storage. + /// + /// + /// The service collection. + /// + /// + /// The delegate used to configure the scheduled jobs storage. + /// + /// + /// The provided , for chaining. + /// + public static IServiceCollection UseAzureBlobScheduledJobs(this IServiceCollection services, Action configure) + { + services.AddScheduledJobs(); + services.AddSingleton(); + services.AddFromExisting(); + services.Configure(configure); + services.ConfigureFormatter(); + return services; + } + + /// + /// Adds scheduled jobs storage backed by Azure Blob Storage. + /// + /// + /// The service collection. + /// + /// + /// The configuration delegate. + /// + /// + /// The provided , for chaining. 
+ /// + public static IServiceCollection UseAzureBlobScheduledJobs(this IServiceCollection services, Action> configureOptions) + { + services.AddScheduledJobs(); + services.AddSingleton(); + services.AddFromExisting(); + configureOptions?.Invoke(services.AddOptions()); + services.ConfigureFormatter(); + services.AddTransient(sp => new AzureStorageJobShardOptionsValidator(sp.GetRequiredService>().Get(Options.DefaultName), Options.DefaultName)); + return services; + } +} diff --git a/src/Azure/Orleans.ScheduledJobs.AzureStorage/JobOperation.cs b/src/Azure/Orleans.ScheduledJobs.AzureStorage/JobOperation.cs new file mode 100644 index 00000000000..462c18f89f8 --- /dev/null +++ b/src/Azure/Orleans.ScheduledJobs.AzureStorage/JobOperation.cs @@ -0,0 +1,110 @@ +using System; +using System.Collections.Generic; +using System.Text.Json; +using System.Text.Json.Serialization; +using Orleans.Runtime; + +namespace Orleans.ScheduledJobs.AzureStorage; + +/// +/// Represents an operation to be performed on a scheduled job. +/// +internal struct JobOperation +{ + /// + /// The type of operation to perform. + /// + public enum OperationType + { + Add, + Remove, + Retry, + } + + /// + /// Gets or sets the type of operation. + /// + public OperationType Type { get; init; } + + /// + /// Gets or sets the job identifier. + /// + public string Id { get; init; } + + /// + /// Gets or sets the job name (only used for Add operations). + /// + public string? Name { get; init; } + + /// + /// Gets or sets the due time (used for Add and Retry operations). + /// + public DateTimeOffset? DueTime { get; init; } + + /// + /// Gets or sets the target grain ID (only used for Add operations). + /// + public GrainId? TargetGrainId { get; init; } + + /// + /// Gets or sets the job metadata (only used for Add operations). + /// + public IReadOnlyDictionary? Metadata { get; init; } + + /// + /// Creates an Add operation for scheduling a new job. + /// + /// The job identifier. + /// The job name. 
+ /// The job due time. + /// The target grain ID. + /// The job metadata. + /// A new JobOperation for adding a job. + /// Thrown when or is null or empty. + public static JobOperation CreateAddOperation(string id, string name, DateTimeOffset dueTime, GrainId targetGrainId, IReadOnlyDictionary? metadata) + { + ArgumentException.ThrowIfNullOrEmpty(id); + ArgumentException.ThrowIfNullOrEmpty(name); + + return new() { Type = OperationType.Add, Id = id, Name = name, DueTime = dueTime, TargetGrainId = targetGrainId, Metadata = metadata }; + } + + /// + /// Creates a Remove operation for canceling a job. + /// + /// The job identifier. + /// A new JobOperation for removing a job. + /// Thrown when is null or empty. + public static JobOperation CreateRemoveOperation(string id) + { + ArgumentException.ThrowIfNullOrEmpty(id); + + return new() { Type = OperationType.Remove, Id = id }; + } + + /// + /// Creates a Retry operation for rescheduling a job. + /// + /// The job identifier. + /// The new due time. + /// A new JobOperation for retrying a job. + /// Thrown when is null or empty. + public static JobOperation CreateRetryOperation(string id, DateTimeOffset dueTime) + { + ArgumentException.ThrowIfNullOrEmpty(id); + + return new() { Type = OperationType.Retry, Id = id, DueTime = dueTime }; + } +} + +/// +/// JSON serialization context for JobOperation with compile-time source generation. 
+/// +[JsonSerializable(typeof(JobOperation))] +[JsonSourceGenerationOptions( + DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault, + PropertyNamingPolicy = JsonKnownNamingPolicy.CamelCase, + WriteIndented = false)] +internal partial class JobOperationJsonContext : JsonSerializerContext +{ +} \ No newline at end of file diff --git a/src/Azure/Orleans.ScheduledJobs.AzureStorage/NetstringJsonSerializer.cs b/src/Azure/Orleans.ScheduledJobs.AzureStorage/NetstringJsonSerializer.cs new file mode 100644 index 00000000000..be38d023842 --- /dev/null +++ b/src/Azure/Orleans.ScheduledJobs.AzureStorage/NetstringJsonSerializer.cs @@ -0,0 +1,168 @@ +using System; +using System.Buffers; +using System.Buffers.Text; +using System.Collections.Generic; +using System.IO; +using System.Runtime.CompilerServices; +using System.Text.Json; +using System.Text.Json.Serialization.Metadata; +using System.Threading; +using System.Threading.Tasks; +using Orleans.Serialization.Buffers.Adaptors; + +namespace Orleans.ScheduledJobs.AzureStorage; + +/// +/// Provides methods for serializing and deserializing JSON data using the netstring format. +/// Netstrings are a simple, self-delimiting way to encode data with length prefixes. +/// Format: [6 hex digits]:[data]\n +/// Maximum data size is 10MB (0xA00000 bytes). +/// +public static class NetstringJsonSerializer +{ + private const int MaxLength = 0xA00000; // 10MB + + /// + /// Encodes an object as a netstring by serializing it to JSON and writing directly to a stream. + /// + /// The object to encode. + /// The stream to write the netstring-encoded data to. + /// The JSON type info for serialization. + /// Thrown when the serialized data exceeds the maximum length. 
+ public static void Encode(T value, Stream stream, JsonTypeInfo jsonTypeInfo) + { + // Remember starting position + var startPosition = stream.Position; + + // Skip past where the length prefix will go (6 hex digits + colon) + Span lengthBytes = stackalloc byte[7]; + stream.Write(lengthBytes); + + // Remember position where data starts + var dataStartPosition = stream.Position; + + // Serialize JSON directly to stream + using (var writer = new Utf8JsonWriter(stream, new JsonWriterOptions { SkipValidation = false })) + { + JsonSerializer.Serialize(writer, value, jsonTypeInfo); + } + + stream.Flush(); + + // Calculate JSON length + var jsonLength = (int)(stream.Position - dataStartPosition); + + if (jsonLength > MaxLength) + { + throw new InvalidOperationException($"Serialized data exceeds maximum length of {MaxLength} bytes"); + } + + // Write trailing newline + stream.WriteByte((byte)'\n'); + + // Remember end position + var endPosition = stream.Position; + + // Seek back to write the length prefix + stream.Position = startPosition; + + // Format length as 6-digit hex and write directly + if (!Utf8Formatter.TryFormat(jsonLength, lengthBytes, out _, new StandardFormat('X', 6))) + { + throw new InvalidOperationException("Failed to format length prefix"); + } + + lengthBytes[6] = (byte)':'; + + stream.Write(lengthBytes); + + // Restore position to end + stream.Position = endPosition; + } + + /// + /// Reads netstring-encoded JSON objects from a stream and deserializes them. + /// + /// The stream to read from. + /// The JSON type info for deserialization. + /// The cancellation token to cancel the operation. + /// An async enumerable of deserialized objects. + /// Thrown when the stream contains invalid netstring data. 
+ public static async IAsyncEnumerable DecodeAsync(Stream stream, JsonTypeInfo jsonTypeInfo, [EnumeratorCancellation] CancellationToken cancellationToken) + { + const int TypicalBufferSize = 4096; // 4KB + var buffer = ArrayPool.Shared.Rent(TypicalBufferSize); + + try + { + while (true) + { + + // Try to read length prefix (6 hex digits + colon) + try + { + await stream.ReadExactlyAsync(buffer, 0, 7, cancellationToken); + } + catch (EndOfStreamException) + { + // We are done + yield break; + } + + // Verify colon + if (buffer[6] != ':') + { + throw new InvalidDataException($"Expected colon at position 6, got byte value {buffer[6]}"); + } + + // Parse length as hex + if (!Utf8Parser.TryParse(buffer.AsSpan(0, 6), out int length, out _, 'X')) + { + throw new InvalidDataException($"Invalid netstring length: {System.Text.Encoding.UTF8.GetString(buffer, 0, 6)}"); + } + + if (length < 0 || length > MaxLength) + { + throw new InvalidDataException($"Netstring length out of valid range: {length}"); + } + + // Ensure buffer is large enough for the data + newline + var totalLength = length + 1; + if (buffer.Length < totalLength) + { + ArrayPool.Shared.Return(buffer); + buffer = ArrayPool.Shared.Rent(totalLength); + } + + // Read data + trailing newline + try + { + await stream.ReadExactlyAsync(buffer.AsMemory(0, totalLength), cancellationToken); + } + catch (EndOfStreamException ex) + { + throw new InvalidDataException("Unexpected end of stream while reading netstring data", ex); + } + + // Verify trailing newline + if (buffer[length] != '\n') + { + throw new InvalidDataException($"Expected newline at end of netstring, got byte value {buffer[length]}"); + } + + // Deserialize JSON directly from UTF-8 bytes + var result = JsonSerializer.Deserialize(buffer.AsSpan(0, length), jsonTypeInfo); + if (result is null) + { + throw new JsonException("Deserialized JSON resulted in null value"); + } + + yield return result; + } + } + finally + { + ArrayPool.Shared.Return(buffer); + } + } 
+} diff --git a/src/Azure/Orleans.ScheduledJobs.AzureStorage/Orleans.ScheduledJobs.AzureStorage.csproj b/src/Azure/Orleans.ScheduledJobs.AzureStorage/Orleans.ScheduledJobs.AzureStorage.csproj new file mode 100644 index 00000000000..150c3f67774 --- /dev/null +++ b/src/Azure/Orleans.ScheduledJobs.AzureStorage/Orleans.ScheduledJobs.AzureStorage.csproj @@ -0,0 +1,30 @@ + + + + README.md + Microsoft.Orleans.ScheduledJobs.AzureStorage + Microsoft Orleans Azure Storage Scheduled Jobs Provider + Microsoft Orleans scheduled jobs provider backed by Azure Blob Storage + $(PackageTags) Azure Storage + $(DefaultTargetFrameworks) + Orleans.ScheduledJobs.AzureStorage + Orleans.ScheduledJobs.AzureStorage + true + $(DefineConstants) + enable + $(VersionSuffix).alpha.1 + alpha.1 + + + + + + + + + + + + + + diff --git a/src/Azure/Orleans.ScheduledJobs.AzureStorage/Properties/AssemblyInfo.cs b/src/Azure/Orleans.ScheduledJobs.AzureStorage/Properties/AssemblyInfo.cs new file mode 100644 index 00000000000..3a3d99f3cce --- /dev/null +++ b/src/Azure/Orleans.ScheduledJobs.AzureStorage/Properties/AssemblyInfo.cs @@ -0,0 +1,3 @@ +using System.Runtime.CompilerServices; + +[assembly: InternalsVisibleTo("Tester.AzureUtils")] diff --git a/src/Azure/Orleans.ScheduledJobs.AzureStorage/README.md b/src/Azure/Orleans.ScheduledJobs.AzureStorage/README.md new file mode 100644 index 00000000000..81d3599ffc5 --- /dev/null +++ b/src/Azure/Orleans.ScheduledJobs.AzureStorage/README.md @@ -0,0 +1,497 @@ +# Microsoft Orleans Scheduled Jobs for Azure Storage + +## Introduction +Microsoft Orleans Scheduled Jobs for Azure Storage provides persistent storage for Orleans scheduled jobs using Azure Blob Storage. This allows your Orleans applications to schedule jobs that survive silo restarts, grain deactivation, and cluster reconfigurations. Jobs are stored in append blobs, providing efficient storage and retrieval for time-based job scheduling. 
+ +## Getting Started + +### Installation +To use this package, install it via NuGet along with the core package: + +```shell +dotnet add package Microsoft.Orleans.ScheduledJobs +dotnet add package Microsoft.Orleans.ScheduledJobs.AzureStorage +``` + +### Configuration + +#### Using Connection String +```csharp +using Azure.Storage.Blobs; +using Microsoft.Extensions.Hosting; +using Orleans.Hosting; + +var builder = Host.CreateApplicationBuilder(args); + +builder.UseOrleans(siloBuilder => +{ + siloBuilder + .UseAzureStorageClustering(options => options.ConfigureTableServiceClient("YOUR_STORAGE_ACCOUNT_URI")) + .UseAzureBlobScheduledJobs(options => + { + options.Configure(o => + { + o.BlobServiceClient = new BlobServiceClient("YOUR_AZURE_STORAGE_CONNECTION_STRING"); + o.ContainerName = "scheduled-jobs"; + }); + }); +}); + +await builder.Build().RunAsync(); +``` + +#### Using Managed Identity (Recommended for Production) +```csharp +using Azure.Identity; +using Azure.Storage.Blobs; +using Microsoft.Extensions.Hosting; +using Orleans.Hosting; + +var builder = Host.CreateApplicationBuilder(args); + +builder.UseOrleans(siloBuilder => +{ + siloBuilder + .UseAzureStorageClustering(options => options.ConfigureTableServiceClient("YOUR_STORAGE_ACCOUNT_URI")) + .UseAzureBlobScheduledJobs(options => + { + options.Configure(o => + { + var credential = new DefaultAzureCredential(); + o.BlobServiceClient = new BlobServiceClient( + new Uri("https://youraccount.blob.core.windows.net"), + credential); + o.ContainerName = "scheduled-jobs"; + }); + }); +}); + +await builder.Build().RunAsync(); +``` + +#### With Advanced Options +```csharp +using Microsoft.Extensions.DependencyInjection; +using Orleans.Hosting; + +builder.UseOrleans(siloBuilder => +{ + siloBuilder + .UseAzureStorageClustering(options => options.ConfigureTableServiceClient(connectionString)) + .UseAzureBlobScheduledJobs(options => + { + options.Configure(o => + { + o.BlobServiceClient = new 
BlobServiceClient(connectionString); + // Use different containers for different environments + o.ContainerName = $"scheduled-jobs-{Environment.GetEnvironmentVariable("ASPNETCORE_ENVIRONMENT")?.ToLowerInvariant()}"; + }); + }) + .ConfigureServices(services => + { + services.Configure(options => + { + // Shard duration: balance between latency and storage overhead + options.ShardDuration = TimeSpan.FromMinutes(5); + + // Control concurrency to prevent overwhelming the system + options.MaxConcurrentJobsPerSilo = 50; + + // Custom retry policy with exponential backoff + options.ShouldRetry = (context, exception) => + { + // Don't retry on permanent failures + if (exception is ArgumentException or InvalidOperationException) + return null; + + // Exponential backoff with max 3 retries + if (context.DequeueCount < 3) + { + var delay = TimeSpan.FromSeconds(Math.Pow(2, context.DequeueCount)); + return DateTimeOffset.UtcNow.Add(delay); + } + + return null; + }; + }); + }); +}); +``` + +## Usage Example + +### Email Scheduling with Cancellation +```csharp +using Orleans; +using Orleans.ScheduledJobs; + +public interface IEmailGrain : IGrainWithStringKey +{ + Task ScheduleEmail(string subject, string body, DateTimeOffset sendTime); + Task CancelScheduledEmail(); +} + +public class EmailGrain : Grain, IEmailGrain, IScheduledJobHandler +{ + private readonly ILocalScheduledJobManager _jobManager; + private readonly IEmailService _emailService; + private readonly ILogger _logger; + private IScheduledJob? 
_scheduledEmailJob; + + public EmailGrain( + ILocalScheduledJobManager jobManager, + IEmailService emailService, + ILogger logger) + { + _jobManager = jobManager; + _emailService = emailService; + _logger = logger; + } + + public async Task ScheduleEmail(string subject, string body, DateTimeOffset sendTime) + { + var emailAddress = this.GetPrimaryKeyString(); + var metadata = new Dictionary + { + ["Subject"] = subject, + ["Body"] = body + }; + + _scheduledEmailJob = await _jobManager.ScheduleJobAsync( + this.GetGrainId(), + "SendEmail", + sendTime, + metadata); + + _logger.LogInformation( + "Scheduled email to {EmailAddress} for {SendTime} (JobId: {JobId})", + emailAddress, sendTime, _scheduledEmailJob.Id); + } + + public async Task CancelScheduledEmail() + { + if (_scheduledEmailJob is null) + { + _logger.LogWarning("No scheduled email to cancel"); + return; + } + + var canceled = await _jobManager.TryCancelScheduledJobAsync(_scheduledEmailJob); + if (canceled) + { + _logger.LogInformation("Email job {JobId} canceled successfully", _scheduledEmailJob.Id); + _scheduledEmailJob = null; + } + else + { + _logger.LogWarning("Failed to cancel email job {JobId} (may have already executed)", _scheduledEmailJob.Id); + } + } + + public async Task ExecuteJobAsync(IScheduledJobContext context, CancellationToken cancellationToken) + { + var emailAddress = this.GetPrimaryKeyString(); + var subject = context.Job.Metadata?["Subject"]; + var body = context.Job.Metadata?["Body"]; + + _logger.LogInformation( + "Sending email to {EmailAddress} (Job: {JobId}, Attempt: {Attempt})", + emailAddress, context.Job.Id, context.DequeueCount); + + try + { + await _emailService.SendEmailAsync(emailAddress, subject, body, cancellationToken); + _logger.LogInformation("Email sent successfully to {EmailAddress}", emailAddress); + _scheduledEmailJob = null; + } + catch (Exception ex) + { + _logger.LogError(ex, "Failed to send email to {EmailAddress}", emailAddress); + throw; // Let the retry policy 
handle it + } + } +} +``` + +### Order Workflow with Multiple Scheduled Steps +```csharp +public interface IOrderGrain : IGrainWithGuidKey +{ + Task PlaceOrder(OrderDetails order); + Task CancelOrder(); +} + +public class OrderGrain : Grain, IOrderGrain, IScheduledJobHandler +{ + private readonly ILocalScheduledJobManager _jobManager; + private readonly IOrderService _orderService; + private readonly IGrainFactory _grainFactory; + private readonly ILogger _logger; + private OrderDetails? _orderDetails; + + public OrderGrain( + ILocalScheduledJobManager jobManager, + IOrderService orderService, + IGrainFactory grainFactory, + ILogger logger) + { + _jobManager = jobManager; + _orderService = orderService; + _grainFactory = grainFactory; + _logger = logger; + } + + public async Task PlaceOrder(OrderDetails order) + { + _orderDetails = order; + var orderId = this.GetPrimaryKey(); + + // Create the order + await _orderService.CreateOrderAsync(orderId, order); + _logger.LogInformation("Order {OrderId} created for customer {CustomerId}", orderId, order.CustomerId); + + // Schedule payment reminder after 1 hour + var paymentReminderTime = DateTimeOffset.UtcNow.AddHours(1); + await _jobManager.ScheduleJobAsync( + this.GetGrainId(), + "PaymentReminder", + paymentReminderTime, + new Dictionary + { + ["Step"] = "PaymentReminder", + ["CustomerEmail"] = order.CustomerEmail + }); + + // Schedule order expiration after 24 hours + var expirationTime = DateTimeOffset.UtcNow.AddHours(24); + await _jobManager.ScheduleJobAsync( + this.GetGrainId(), + "OrderExpiration", + expirationTime, + new Dictionary + { + ["Step"] = "OrderExpiration" + }); + + _logger.LogInformation( + "Scheduled payment reminder for {ReminderTime} and expiration for {ExpirationTime}", + paymentReminderTime, expirationTime); + } + + public async Task CancelOrder() + { + var orderId = this.GetPrimaryKey(); + await _orderService.CancelOrderAsync(orderId); + _orderDetails = null; + _logger.LogInformation("Order 
{OrderId} canceled", orderId); + } + + public async Task ExecuteJobAsync(IScheduledJobContext context, CancellationToken cancellationToken) + { + var step = context.Job.Metadata!["Step"]; + var orderId = this.GetPrimaryKey(); + + _logger.LogInformation( + "Executing workflow step {Step} for order {OrderId} (Attempt: {Attempt})", + step, orderId, context.DequeueCount); + + switch (step) + { + case "PaymentReminder": + await HandlePaymentReminder(context, cancellationToken); + break; + + case "OrderExpiration": + await HandleOrderExpiration(cancellationToken); + break; + + default: + _logger.LogWarning("Unknown workflow step: {Step}", step); + break; + } + } + + private async Task HandlePaymentReminder(IScheduledJobContext context, CancellationToken ct) + { + var orderId = this.GetPrimaryKey(); + var order = await _orderService.GetOrderAsync(orderId, ct); + + if (order?.Status == OrderStatus.Pending) + { + var customerEmail = context.Job.Metadata!["CustomerEmail"]; + var emailGrain = _grainFactory.GetGrain(customerEmail); + + await emailGrain.ScheduleEmail( + "Payment Reminder", + $"Your order {orderId} is awaiting payment. 
Please complete your purchase within 23 hours.", + DateTimeOffset.UtcNow); + + _logger.LogInformation("Payment reminder sent for order {OrderId}", orderId); + } + else + { + _logger.LogInformation( + "Skipping payment reminder for order {OrderId} - status is {Status}", + orderId, order?.Status); + } + } + + private async Task HandleOrderExpiration(CancellationToken ct) + { + var orderId = this.GetPrimaryKey(); + var order = await _orderService.GetOrderAsync(orderId, ct); + + if (order?.Status == OrderStatus.Pending) + { + await _orderService.CancelOrderAsync(orderId, ct); + _logger.LogInformation("Order {OrderId} expired and canceled", orderId); + + // Notify customer + var emailGrain = _grainFactory.GetGrain(order.CustomerEmail); + await emailGrain.ScheduleEmail( + "Order Expired", + $"Your order {orderId} has expired due to pending payment.", + DateTimeOffset.UtcNow); + } + else + { + _logger.LogInformation( + "Order {OrderId} did not expire - status is {Status}", + orderId, order?.Status); + } + } +} + +// Supporting types +public class OrderDetails +{ + public string CustomerId { get; set; } = ""; + public string CustomerEmail { get; set; } = ""; + public decimal Amount { get; set; } + public List Items { get; set; } = new(); +} + +public enum OrderStatus +{ + Pending, + Paid, + Shipped, + Delivered, + Cancelled +} +``` + +## How It Works + +### Storage Architecture +1. **Blob Container**: All jobs are stored in a single Azure Blob Storage container +2. **Append Blobs**: Each job shard is stored as an append blob, providing efficient sequential writes +3. **Blob Naming**: Blobs are named with the pattern: `{ShardStartTime:yyyyMMddHHmm}-{SiloAddress}-{Index}` +4. 
**Metadata**: Blob metadata stores ownership and time range information: + - `Owner`: The silo currently processing this shard + - `Creator`: The silo that created this shard + - `MinDueTime`: Start of the time range for jobs in this shard + - `MaxDueTime`: End of the time range for jobs in this shard + +### Shard Ownership and High Availability +1. **Optimistic Concurrency**: ETags prevent conflicting updates when multiple silos try to claim a shard +2. **Ownership Transfer**: When a silo fails, other silos detect the failure and claim orphaned shards +3. **Creator Priority**: The silo that created a shard gets priority to reclaim it if it loses ownership +4. **Automatic Cleanup**: Empty shards are deleted automatically after processing + +### Job Lifecycle with Azure Storage +``` +┌─────────────────────┐ +│ Job Scheduled │ ──▶ Written to append blob +└─────────────────────┘ + │ + ▼ +┌─────────────────────┐ +│ Waiting in Shard │ ──▶ Persisted in Azure Blob Storage +└─────────────────────┘ + │ + ▼ +┌─────────────────────┐ +│ Shard Owned │ ──▶ Silo acquires ownership via metadata update +└─────────────────────┘ + │ + ▼ +┌─────────────────────┐ +│ Job Executed │ ──▶ Handler invoked on target grain +└─────────────────────┘ + │ + ├──▶ Success ──▶ Job entry removed from blob + │ + └──▶ Failure ──▶ Retry: Updated due time in blob + No Retry: Job entry removed +``` + +## Performance Considerations + +### Concurrency Settings +```csharp +services.Configure(options => +{ + // Adjust based on your workload and Azure Storage limits + options.MaxConcurrentJobsPerSilo = 50; +}); +``` + +### Storage Costs +- **Container**: One container per cluster +- **Blobs**: One blob per active time shard +- **Operations**: + - Schedule job: 1-2 append operations + - Execute job: 1 read + 1 delete operation + - Shard ownership transfer: 1 metadata update + +## Monitoring and Troubleshooting + +### Enable Logging +```csharp +builder.Logging.AddFilter("Orleans.ScheduledJobs", 
LogLevel.Information); +builder.Logging.AddFilter("Orleans.ScheduledJobs.AzureStorage", LogLevel.Information); +``` + +### Key Metrics to Monitor +- **Shard Assignment Time**: Time to claim ownership of unassigned shards +- **Job Execution Latency**: Time between due time and actual execution +- **Retry Rate**: Percentage of jobs requiring retry +- **Blob Operations**: Number of read/write/delete operations per minute + +## Security Best Practices + +### Use Managed Identity +```csharp +var credential = new DefaultAzureCredential(); +var blobServiceClient = new BlobServiceClient(storageAccountUri, credential); +``` + +### Network Security +- Enable firewall rules to restrict access +- Use private endpoints for enhanced security +- Consider Azure Virtual Network integration + +### Access Control +```csharp +// Minimum required permissions: +// - Storage Blob Data Contributor (for read/write/delete operations) +// - Or custom role with: +// - Microsoft.Storage/storageAccounts/blobServices/containers/read +// - Microsoft.Storage/storageAccounts/blobServices/containers/blobs/read +// - Microsoft.Storage/storageAccounts/blobServices/containers/blobs/write +// - Microsoft.Storage/storageAccounts/blobServices/containers/blobs/delete +``` + +## Documentation +For more comprehensive documentation, please refer to: +- [Microsoft Orleans Documentation](https://learn.microsoft.com/dotnet/orleans/) +- [Azure Blob Storage Documentation](https://learn.microsoft.com/azure/storage/blobs/) +- [Orleans Scheduled Jobs Core Package](../../Orleans.ScheduledJobs/README.md) + +## Feedback & Contributing +- If you have any issues or would like to provide feedback, please [open an issue on GitHub](https://github.com/dotnet/orleans/issues) +- Join our community on [Discord](https://aka.ms/orleans-discord) +- Follow the [@msftorleans](https://twitter.com/msftorleans) Twitter account for Orleans announcements +- Contributions are welcome! 
Please review our [contribution guidelines](https://github.com/dotnet/orleans/blob/main/CONTRIBUTING.md) +- This project is licensed under the [MIT license](https://github.com/dotnet/orleans/blob/main/LICENSE) diff --git a/src/Orleans.Core/Properties/AssemblyInfo.cs b/src/Orleans.Core/Properties/AssemblyInfo.cs index 38eeb422f36..552b4929821 100644 --- a/src/Orleans.Core/Properties/AssemblyInfo.cs +++ b/src/Orleans.Core/Properties/AssemblyInfo.cs @@ -4,6 +4,7 @@ [assembly: InternalsVisibleTo("Orleans.CodeGeneration")] [assembly: InternalsVisibleTo("Orleans.CodeGeneration.Build")] [assembly: InternalsVisibleTo("Orleans.Runtime")] +[assembly: InternalsVisibleTo("Orleans.ScheduledJobs")] [assembly: InternalsVisibleTo("Orleans.Streaming")] [assembly: InternalsVisibleTo("Orleans.TestingHost")] diff --git a/src/Orleans.Runtime/MembershipService/IClusterMembershipService.cs b/src/Orleans.Runtime/MembershipService/IClusterMembershipService.cs index f902dc56902..82fa625c05a 100644 --- a/src/Orleans.Runtime/MembershipService/IClusterMembershipService.cs +++ b/src/Orleans.Runtime/MembershipService/IClusterMembershipService.cs @@ -1,4 +1,5 @@ using System.Collections.Generic; +using System.Threading; using System.Threading.Tasks; namespace Orleans.Runtime @@ -24,8 +25,9 @@ public interface IClusterMembershipService /// Refreshes cluster membership if it is not at or above the specified minimum version. /// /// The minimum version. + /// The cancellation token. /// A representing the work performed. - ValueTask Refresh(MembershipVersion minimumVersion = default); + ValueTask Refresh(MembershipVersion minimumVersion = default, CancellationToken cancellationToken = default); /// /// Unilaterally declares the specified silo defunct. 
diff --git a/src/Orleans.Runtime/Properties/AssemblyInfo.cs b/src/Orleans.Runtime/Properties/AssemblyInfo.cs index abf4e11da60..73a98bd2f18 100644 --- a/src/Orleans.Runtime/Properties/AssemblyInfo.cs +++ b/src/Orleans.Runtime/Properties/AssemblyInfo.cs @@ -2,6 +2,7 @@ [assembly: InternalsVisibleTo("Orleans.Streaming")] [assembly: InternalsVisibleTo("Orleans.Reminders")] +[assembly: InternalsVisibleTo("Orleans.ScheduledJobs")] [assembly: InternalsVisibleTo("Orleans.Journaling")] [assembly: InternalsVisibleTo("Orleans.TestingHost")] @@ -15,4 +16,4 @@ [assembly: InternalsVisibleTo("Benchmarks")] // Mocking libraries -[assembly: InternalsVisibleTo("DynamicProxyGenAssembly2")] \ No newline at end of file +[assembly: InternalsVisibleTo("DynamicProxyGenAssembly2")] diff --git a/src/Orleans.ScheduledJobs/Hosting/ScheduledJobsExtensions.cs b/src/Orleans.ScheduledJobs/Hosting/ScheduledJobsExtensions.cs new file mode 100644 index 00000000000..12d3297d06e --- /dev/null +++ b/src/Orleans.ScheduledJobs/Hosting/ScheduledJobsExtensions.cs @@ -0,0 +1,80 @@ +using System.Linq; +using Microsoft.Extensions.DependencyInjection; +using Microsoft.Extensions.Logging; +using Orleans.Configuration.Internal; +using Orleans.Runtime; +using Orleans.ScheduledJobs; + +namespace Orleans.Hosting; + +/// +/// Extensions to for configuring scheduled jobs. +/// +public static class ScheduledJobsExtensions +{ + /// + /// Adds support for scheduled jobs to this silo. + /// + /// The builder. + /// The silo builder. + public static ISiloBuilder AddScheduledJobs(this ISiloBuilder builder) => builder.ConfigureServices(services => AddScheduledJobs(services)); + + /// + /// Adds support for scheduled jobs to this silo. + /// + /// The services. 
+ public static void AddScheduledJobs(this IServiceCollection services) + { + if (services.Any(service => service.ServiceType.Equals(typeof(LocalScheduledJobManager)))) + { + return; + } + + services.AddSingleton(); + services.AddSingleton(); + services.AddSingleton(); + services.AddFromExisting(); + services.AddFromExisting, LocalScheduledJobManager>(); + services.AddKeyedTransient(typeof(IScheduledJobReceiverExtension), (sp, _) => + { + var grainContextAccessor = sp.GetRequiredService(); + return new ScheduledJobReceiverExtension(grainContextAccessor.GrainContext, sp.GetRequiredService>()); + }); + } + + /// + /// Configures scheduled jobs storage using an in-memory, non-persistent store. + /// + /// + /// Note that this is for development and testing scenarios only and should not be used in production. + /// + /// The silo host builder. + /// The provided , for chaining. + public static ISiloBuilder UseInMemoryScheduledJobs(this ISiloBuilder builder) + { + builder.AddScheduledJobs(); + + builder.ConfigureServices(services => services.UseInMemoryScheduledJobs()); + return builder; + } + + /// + /// Configures scheduled jobs storage using an in-memory, non-persistent store. + /// + /// + /// Note that this is for development and testing scenarios only and should not be used in production. + /// + /// The service collection. + /// The provided , for chaining. 
+ internal static IServiceCollection UseInMemoryScheduledJobs(this IServiceCollection services) + { + services.AddSingleton(sp => + { + var siloDetails = sp.GetRequiredService(); + var membershipService = sp.GetRequiredService(); + return new InMemoryJobShardManager(siloDetails.SiloAddress, membershipService); + }); + services.AddFromExisting(); + return services; + } +} diff --git a/src/Orleans.ScheduledJobs/Hosting/ScheduledJobsOptions.cs b/src/Orleans.ScheduledJobs/Hosting/ScheduledJobsOptions.cs new file mode 100644 index 00000000000..2418b53a59f --- /dev/null +++ b/src/Orleans.ScheduledJobs/Hosting/ScheduledJobsOptions.cs @@ -0,0 +1,79 @@ +using System; +using Microsoft.Extensions.Logging; +using Microsoft.Extensions.Options; +using Orleans.Runtime; +using Orleans.ScheduledJobs; + +namespace Orleans.Hosting; + +/// +/// Configuration options for the scheduled jobs feature. +/// +public sealed class ScheduledJobsOptions +{ + /// + /// Gets or sets the duration of each job shard. Smaller values reduce latency but increase overhead. + /// For optimal alignment with hour boundaries, choose durations that evenly divide 60 minutes + /// (e.g., 1, 2, 3, 4, 5, 6, 10, 12, 15, 20, 30, or 60 minutes) to avoid bucket drift across hours. + /// Default: 1 hour. + /// + public TimeSpan ShardDuration { get; set; } = TimeSpan.FromHours(1); + + /// + /// Gets or sets how far in advance (before the shard's start time) the shard should + /// begin processing. This prevents holding idle shards for extended periods. + /// Default: 5 minutes. + /// + public TimeSpan ShardActivationBufferPeriod { get; set; } = TimeSpan.FromMinutes(5); + + /// + /// Gets or sets the maximum number of jobs that can be executed concurrently on a single silo. + /// Default: 10,000 × processor count. + /// + public int MaxConcurrentJobsPerSilo { get; set; } = 10_000 * Environment.ProcessorCount; + + /// + /// Gets or sets the function that determines whether a failed job should be retried and when. 
+ /// The function receives the job context and the exception that caused the failure, and returns + /// the time when the job should be retried, or if the job should not be retried. + /// Default: Retry up to 5 times with exponential backoff (2^n seconds). + /// + public Func ShouldRetry { get; set; } = DefaultShouldRetry; + + private static DateTimeOffset? DefaultShouldRetry(IScheduledJobContext jobContext, Exception ex) + { + // Default retry logic: retry up to 5 times with exponential backoff + if (jobContext.DequeueCount >= 5) + { + return null; + } + var delay = TimeSpan.FromSeconds(Math.Pow(2, jobContext.DequeueCount)); + return DateTimeOffset.UtcNow.Add(delay); + } +} + +public sealed class ScheduledJobsOptionsValidator : IConfigurationValidator +{ + private readonly ILogger _logger; + private readonly IOptions _options; + + public ScheduledJobsOptionsValidator(ILogger logger, IOptions options) + { + _logger = logger; + _options = options; + } + + public void ValidateConfiguration() + { + var options = _options.Value; + if (options.ShardDuration <= TimeSpan.Zero) + { + throw new OrleansConfigurationException("ScheduledJobsOptions.ShardDuration must be greater than zero."); + } + if (options.ShouldRetry == null) + { + throw new OrleansConfigurationException("ScheduledJobsOptions.ShouldRetry must not be null."); + } + _logger.LogInformation("ScheduledJobsOptions validated: ShardDuration={ShardDuration}", options.ShardDuration); + } +} diff --git a/src/Orleans.ScheduledJobs/ILocalScheduledJobManager.cs b/src/Orleans.ScheduledJobs/ILocalScheduledJobManager.cs new file mode 100644 index 00000000000..df138c78757 --- /dev/null +++ b/src/Orleans.ScheduledJobs/ILocalScheduledJobManager.cs @@ -0,0 +1,32 @@ +using System; +using System.Collections.Generic; +using System.Threading; +using System.Threading.Tasks; +using Orleans.Runtime; + +namespace Orleans.ScheduledJobs; + +/// +/// Provides functionality for scheduling and managing jobs on the local silo. 
+/// +public interface ILocalScheduledJobManager +{ + /// + /// Schedules a job to be executed at a specific time on the target grain. + /// + /// The grain identifier of the target grain that will receive the scheduled job. + /// The name of the job for identification purposes. + /// The date and time when the job should be executed. + /// Optional metadata associated with the job. + /// A cancellation token to cancel the operation. + /// A representing the asynchronous operation that returns the scheduled job. + Task ScheduleJobAsync(GrainId target, string jobName, DateTimeOffset dueTime, IReadOnlyDictionary? metadata, CancellationToken cancellationToken); + + /// + /// Attempts to cancel a previously scheduled job. + /// + /// The scheduled job to cancel. + /// A cancellation token to cancel the operation. + /// A representing the asynchronous operation that returns if the job was successfully canceled; otherwise, . + Task TryCancelScheduledJobAsync(ScheduledJob job, CancellationToken cancellationToken); +} diff --git a/src/Orleans.ScheduledJobs/IScheduledJobHandler.cs b/src/Orleans.ScheduledJobs/IScheduledJobHandler.cs new file mode 100644 index 00000000000..37d3b1bc710 --- /dev/null +++ b/src/Orleans.ScheduledJobs/IScheduledJobHandler.cs @@ -0,0 +1,113 @@ +using System.Threading; +using System.Threading.Tasks; + +namespace Orleans.ScheduledJobs; + +/// +/// Provides contextual information about a scheduled job execution. +/// +public interface IScheduledJobContext +{ + /// + /// Gets the scheduled job being executed. + /// + ScheduledJob Job { get; } + + /// + /// Gets the unique identifier for this execution run. + /// + string RunId { get; } + + /// + /// Gets the number of times this job has been dequeued for execution, including retries. + /// + int DequeueCount { get; } +} + +/// +/// Represents the execution context for a scheduled job. 
+/// +[GenerateSerializer] +internal class ScheduledJobContext : IScheduledJobContext +{ + /// + /// Gets the scheduled job being executed. + /// + [Id(0)] + public ScheduledJob Job { get; } + + /// + /// Gets the unique identifier for this execution run. + /// + [Id(1)] + public string RunId { get; } + + /// + /// Gets the number of times this job has been dequeued for execution, including retries. + /// + [Id(2)] + public int DequeueCount { get; } + + /// + /// Initializes a new instance of the class. + /// + /// The scheduled job to execute. + /// The unique identifier for this execution run. + /// The number of times this job has been dequeued, including retries. + public ScheduledJobContext(ScheduledJob job, string runId, int retryCount) + { + Job = job; + RunId = runId; + DequeueCount = retryCount; + } +} + +/// +/// Defines the interface for handling scheduled job execution. +/// Grains implement this interface to receive and process scheduled jobs. +/// +/// +/// +/// Grains that implement this interface can be targeted by scheduled jobs. +/// The method is invoked when the job's due time is reached. +/// +/// +/// The following example demonstrates a grain that implements : +/// +/// public class MyGrain : Grain, IScheduledJobHandler +/// { +/// public Task ExecuteJobAsync(IScheduledJobContext context, CancellationToken cancellationToken) +/// { +/// // Process the scheduled job +/// var jobName = context.Job.Name; +/// var dueTime = context.Job.DueTime; +/// +/// // Perform job logic here +/// +/// return Task.CompletedTask; +/// } +/// } +/// +/// +/// +public interface IScheduledJobHandler +{ + /// + /// Executes the scheduled job with the provided context. + /// + /// The context containing information about the scheduled job execution. + /// A token to monitor for cancellation requests. + /// A task that represents the asynchronous job execution operation. 
+ /// + /// + /// This method is invoked by the Orleans scheduled jobs infrastructure when a job's due time is reached. + /// Implementations should handle job execution logic and can use information from the + /// to access job metadata, dequeue count for retry logic, and other execution details. + /// + /// + /// If the method throws an exception and a retry policy is configured, the job may be retried. + /// The property can be used to determine if this is a retry attempt. + /// + /// + Task ExecuteJobAsync(IScheduledJobContext context, CancellationToken cancellationToken); +} diff --git a/src/Orleans.ScheduledJobs/IScheduledJobReceiverExtension.cs b/src/Orleans.ScheduledJobs/IScheduledJobReceiverExtension.cs new file mode 100644 index 00000000000..fa217c34d0c --- /dev/null +++ b/src/Orleans.ScheduledJobs/IScheduledJobReceiverExtension.cs @@ -0,0 +1,61 @@ +using System; +using System.Threading; +using System.Threading.Tasks; +using Microsoft.Extensions.Logging; +using Orleans.Runtime; + +namespace Orleans.ScheduledJobs; + +/// +/// Extension interface for grains that can receive scheduled job invocations. +/// +internal interface IScheduledJobReceiverExtension : IGrainExtension +{ + /// + /// Delivers a scheduled job to the grain for execution. + /// + /// The context containing information about the scheduled job. + /// A token to monitor for cancellation requests. + /// A task that represents the asynchronous operation. 
+    Task DeliverScheduledJobAsync(IScheduledJobContext context, CancellationToken cancellationToken);
+}
+
+/// <summary>
+/// Grain extension that hands a delivered scheduled job to the hosting grain instance when that
+/// instance implements <see cref="IScheduledJobHandler"/>.
+/// </summary>
+internal sealed partial class ScheduledJobReceiverExtension : IScheduledJobReceiverExtension
+{
+    private readonly IGrainContext _grain;
+    private readonly ILogger _logger;
+
+    public ScheduledJobReceiverExtension(IGrainContext grain, ILogger logger)
+        => (_grain, _logger) = (grain, logger);
+
+    /// <summary>
+    /// Invokes <see cref="IScheduledJobHandler.ExecuteJobAsync"/> on the grain instance.
+    /// </summary>
+    /// <param name="context">The context describing the job being delivered.</param>
+    /// <param name="cancellationToken">Forwarded unchanged to the handler.</param>
+    /// <exception cref="InvalidOperationException">The grain instance does not implement <see cref="IScheduledJobHandler"/>.</exception>
+    /// <remarks>Any exception thrown by the handler is logged and then rethrown to the caller.</remarks>
+    public async Task DeliverScheduledJobAsync(IScheduledJobContext context, CancellationToken cancellationToken)
+    {
+        // Reject grains that cannot handle jobs up front, so the happy path below stays flat.
+        if (_grain.GrainInstance is not IScheduledJobHandler jobHandler)
+        {
+            LogGrainDoesNotImplementHandler(_grain.GrainId);
+            throw new InvalidOperationException($"Grain {_grain.GrainId} does not implement IScheduledJobHandler");
+        }
+
+        try
+        {
+            await jobHandler.ExecuteJobAsync(context, cancellationToken);
+        }
+        catch (Exception failure)
+        {
+            // Log here for diagnostics, then rethrow with the original stack trace.
+            LogErrorExecutingScheduledJob(failure, context.Job.Id, _grain.GrainId);
+            throw;
+        }
+    }
+
+    [LoggerMessage(Level = LogLevel.Error, Message = "Error executing scheduled job {JobId} on grain {GrainId}")]
+    private partial void LogErrorExecutingScheduledJob(Exception exception, string jobId, GrainId grainId);
+
+    [LoggerMessage(Level = LogLevel.Error, Message = "Grain {GrainId} does not implement IScheduledJobHandler")]
+    private partial void LogGrainDoesNotImplementHandler(GrainId grainId);
+}
diff --git a/src/Orleans.ScheduledJobs/InMemoryJobQueue.cs b/src/Orleans.ScheduledJobs/InMemoryJobQueue.cs
new file mode 100644
index 00000000000..a283315bba7
--- /dev/null
+++ b/src/Orleans.ScheduledJobs/InMemoryJobQueue.cs
@@ -0,0 +1,229 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Threading;
+using System.Threading.Tasks;
+
+namespace Orleans.ScheduledJobs;
+
+///
+/// Provides an in-memory priority queue for managing scheduled jobs based on their due times.
+/// Jobs are organized into time-based buckets and enumerated asynchronously as they become due.
+///
+/// <remarks>
+/// Every member synchronizes on one private lock, so jobs can be enqueued, cancelled or rescheduled
+/// while another caller is enumerating the queue.
+/// </remarks>
+// NOTE(review): the generic type arguments in this file were reconstructed from how the members are used
+// (the extracted text had them stripped) - confirm against the original source.
+internal sealed class InMemoryJobQueue : IAsyncEnumerable<IScheduledJobContext>
+{
+    // Buckets ordered by due time that have not been handed to the consumer yet.
+    private readonly PriorityQueue<JobBucket, DateTimeOffset> _queue = new();
+
+    // Maps every tracked job id to the bucket that currently owns it. An entry outlives delivery of the job:
+    // it is only dropped by CancelJob and only re-pointed by Enqueue/RetryJobLater.
+    private readonly Dictionary<string, JobBucket> _jobsIdToBucket = new();
+
+    // Buckets by normalized due time. Invariant: a bucket is registered here exactly while it sits in _queue.
+    private readonly Dictionary<DateTimeOffset, JobBucket> _buckets = new();
+
+    private readonly object _syncLock = new();
+    private bool _isComplete;
+
+    /// <summary>
+    /// Gets the total number of jobs currently tracked by the queue.
+    /// </summary>
+    public int Count
+    {
+        get
+        {
+            lock (_syncLock)
+            {
+                return _jobsIdToBucket.Count;
+            }
+        }
+    }
+
+    /// <summary>
+    /// Adds a scheduled job to the queue with the specified dequeue count.
+    /// </summary>
+    /// <param name="job">The scheduled job to enqueue.</param>
+    /// <param name="dequeueCount">The number of times this job has been dequeued previously.</param>
+    /// <exception cref="InvalidOperationException">Thrown when attempting to enqueue a job to a completed queue.</exception>
+    /// <exception cref="ArgumentNullException">Thrown when <paramref name="job"/> is null.</exception>
+    public void Enqueue(ScheduledJob job, int dequeueCount)
+    {
+        ArgumentNullException.ThrowIfNull(job);
+
+        lock (_syncLock)
+        {
+            if (_isComplete)
+            {
+                throw new InvalidOperationException("Cannot enqueue job to a completed queue.");
+            }
+
+            // When the same id is enqueued again, drop the stale copy so the job is delivered only once.
+            if (_jobsIdToBucket.TryGetValue(job.Id, out var previousBucket))
+            {
+                previousBucket.RemoveJob(job.Id);
+            }
+
+            var bucket = GetJobBucket(job.DueTime);
+            bucket.AddJob(job, dequeueCount);
+            _jobsIdToBucket[job.Id] = bucket;
+        }
+    }
+
+    /// <summary>
+    /// Marks the queue as complete, preventing any further jobs from being enqueued.
+    /// Enumeration ends once the queue is complete and no job is tracked any more.
+    /// </summary>
+    public void MarkAsComplete()
+    {
+        lock (_syncLock)
+        {
+            _isComplete = true;
+        }
+    }
+
+    /// <summary>
+    /// Cancels a scheduled job by removing it from the queue.
+    /// </summary>
+    /// <param name="jobId">The unique identifier of the job to cancel.</param>
+    /// <returns>True if the job was found and removed; false if the job was not found.</returns>
+    /// <remarks>
+    /// The job's bucket remains in the priority queue until processed, but the job itself is removed immediately.
+    /// </remarks>
+    public bool CancelJob(string jobId)
+    {
+        lock (_syncLock)
+        {
+            if (!_jobsIdToBucket.Remove(jobId, out var bucket))
+            {
+                return false;
+            }
+
+            // The job may already have been handed to the consumer; removing it from the bucket is then a no-op.
+            bucket.RemoveJob(jobId);
+            return true;
+        }
+    }
+
+    /// <summary>
+    /// Reschedules a job for retry with a new due time.
+    /// </summary>
+    /// <param name="jobContext">The context of the job to retry.</param>
+    /// <param name="newDueTime">The new due time for the job.</param>
+    /// <exception cref="ArgumentNullException">Thrown when <paramref name="jobContext"/> is null.</exception>
+    /// <remarks>
+    /// The job is removed from its current bucket and added to the bucket for <paramref name="newDueTime"/>.
+    /// The dequeue count from the context is preserved. A job that is no longer tracked (for example because
+    /// it was cancelled) is left alone.
+    /// </remarks>
+    public void RetryJobLater(IScheduledJobContext jobContext, DateTimeOffset newDueTime)
+    {
+        ArgumentNullException.ThrowIfNull(jobContext);
+
+        var original = jobContext.Job;
+        var rescheduled = new ScheduledJob
+        {
+            Id = original.Id,
+            Name = original.Name,
+            DueTime = newDueTime,
+            TargetGrainId = original.TargetGrainId,
+            ShardId = original.ShardId,
+            Metadata = original.Metadata
+        };
+
+        lock (_syncLock)
+        {
+            if (!_jobsIdToBucket.TryGetValue(original.Id, out var oldBucket))
+            {
+                return;
+            }
+
+            oldBucket.RemoveJob(original.Id);
+            var newBucket = GetJobBucket(newDueTime);
+            newBucket.AddJob(rescheduled, jobContext.DequeueCount);
+            _jobsIdToBucket[original.Id] = newBucket;
+        }
+    }
+
+    /// <summary>
+    /// Returns an asynchronous enumerator that yields scheduled jobs as they become due.
+    /// </summary>
+    /// <param name="cancellationToken">A token to monitor for cancellation requests.</param>
+    /// <returns>
+    /// An async enumerator that returns a job context for each job that is due.
+    /// The enumerator checks for due jobs every second and terminates when the queue is marked complete and empty.
+    /// </returns>
+    public async IAsyncEnumerator<IScheduledJobContext> GetAsyncEnumerator(CancellationToken cancellationToken = default)
+    {
+        using var timer = new PeriodicTimer(TimeSpan.FromSeconds(1));
+        while (true)
+        {
+            JobBucket? dueBucket = null;
+            List<(ScheduledJob Job, int DequeueCount)>? dueJobs = null;
+            var drained = false;
+
+            lock (_syncLock)
+            {
+                if (_jobsIdToBucket.Count == 0)
+                {
+                    drained = _isComplete;
+                }
+                else if (_queue.TryPeek(out var nextBucket, out _) && nextBucket.DueTime < DateTimeOffset.UtcNow)
+                {
+                    dueBucket = _queue.Dequeue();
+
+                    // Unregister the bucket together with the dequeue. If it stayed in _buckets while its jobs are
+                    // being delivered, a job enqueued or retried onto the same key would join a bucket that has
+                    // already left the priority queue and would never be delivered.
+                    _buckets.Remove(dueBucket.DueTime);
+
+                    // Snapshot under the lock: CancelJob/RetryJobLater mutate the bucket from other threads.
+                    dueJobs = dueBucket.Jobs.ToList();
+                }
+            }
+
+            if (drained)
+            {
+                yield break; // The queue is complete and no job is tracked any more.
+            }
+
+            if (dueJobs is null)
+            {
+                await timer.WaitForNextTickAsync(cancellationToken);
+                continue;
+            }
+
+            foreach (var (job, dequeueCount) in dueJobs)
+            {
+                // Skip jobs that were cancelled, or re-enqueued into another bucket, after the snapshot was taken.
+                // The id stays tracked after delivery until CancelJob or RetryJobLater is called for it.
+                bool stillOwned;
+                lock (_syncLock)
+                {
+                    stillOwned = _jobsIdToBucket.TryGetValue(job.Id, out var owner) && ReferenceEquals(owner, dueBucket);
+                }
+
+                if (stillOwned)
+                {
+                    yield return new ScheduledJobContext(job, Guid.NewGuid().ToString(), dequeueCount + 1);
+                }
+            }
+        }
+    }
+
+    /// <summary>
+    /// Returns the pending bucket for <paramref name="dueTime"/>, creating and queueing it when none exists.
+    /// Callers must hold the queue's lock.
+    /// </summary>
+    private JobBucket GetJobBucket(DateTimeOffset dueTime)
+    {
+        // Truncate to whole seconds and add one second, so that every job due within the same second
+        // (e.g., 12:00:00.000-12:00:00.999) shares the bucket keyed 12:00:01.
+        var key = new DateTimeOffset(dueTime.Year, dueTime.Month, dueTime.Day, dueTime.Hour, dueTime.Minute, dueTime.Second, dueTime.Offset).AddSeconds(1);
+        if (!_buckets.TryGetValue(key, out var bucket))
+        {
+            bucket = new JobBucket(key);
+            _buckets[key] = bucket;
+            _queue.Enqueue(bucket, key);
+        }
+
+        return bucket;
+    }
+}
+
+/// <summary>
+/// Holds the jobs that share one normalized due time, keyed by job id.
+/// The type does no locking of its own; callers are expected to synchronize access.
+/// </summary>
+internal sealed class JobBucket
+{
+    private readonly Dictionary<string, (ScheduledJob Job, int DequeueCount)> _jobs = new();
+
+    public JobBucket(DateTimeOffset dueTime)
+    {
+        DueTime = dueTime;
+    }
+
+    /// <summary>Gets the number of jobs currently held by this bucket.</summary>
+    public int Count => _jobs.Count;
+
+    /// <summary>Gets the normalized due time this bucket was created for.</summary>
+    public DateTimeOffset DueTime { get; }
+
+    /// <summary>Gets a live view of the jobs in this bucket together with their dequeue counts.</summary>
+    public IEnumerable<(ScheduledJob Job, int DequeueCount)> Jobs => _jobs.Values;
+
+    /// <summary>Adds the job, replacing any entry with the same id.</summary>
+    public void AddJob(ScheduledJob job, int dequeueCount) => _jobs[job.Id] = (job, dequeueCount);
+
+    /// <summary>Removes the job with the given id and reports whether it was present.</summary>
+    public bool RemoveJob(string jobId) => _jobs.Remove(jobId);
+}
diff --git a/src/Orleans.ScheduledJobs/InMemoryJobShard.cs b/src/Orleans.ScheduledJobs/InMemoryJobShard.cs
new file mode 100644
index 00000000000..d063ab03748
--- /dev/null
+++ b/src/Orleans.ScheduledJobs/InMemoryJobShard.cs
@@ -0,0 +1,33 @@
+using System;
+using System.Collections.Generic;
+using System.Diagnostics;
+using System.Threading;
+using System.Threading.Tasks;
+using Orleans.Runtime;
+
+namespace Orleans.ScheduledJobs;
+
+[DebuggerDisplay("ShardId={Id}, StartTime={StartTime}, EndTime={EndTime}")]
+internal sealed class InMemoryJobShard : JobShard
+{
+    public InMemoryJobShard(string shardId, DateTimeOffset minDueTime, DateTimeOffset maxDueTime, IDictionary? metadata)
+        : base(shardId, minDueTime, maxDueTime)
+    {
+        Metadata = metadata;
+    }
+
+    protected override Task PersistAddJobAsync(string jobId, string jobName, DateTimeOffset dueTime, GrainId target, IReadOnlyDictionary? 
metadata, CancellationToken cancellationToken) + { + return Task.CompletedTask; + } + + protected override Task PersistRemoveJobAsync(string jobId, CancellationToken cancellationToken) + { + return Task.CompletedTask; + } + + protected override Task PersistRetryJobAsync(string jobId, DateTimeOffset newDueTime, CancellationToken cancellationToken) + { + return Task.CompletedTask; + } +} diff --git a/src/Orleans.ScheduledJobs/JobShard.cs b/src/Orleans.ScheduledJobs/JobShard.cs new file mode 100644 index 00000000000..ca7c9a4b0a8 --- /dev/null +++ b/src/Orleans.ScheduledJobs/JobShard.cs @@ -0,0 +1,240 @@ +using System; +using System.Collections.Generic; +using System.Threading; +using System.Threading.Tasks; +using Orleans.Runtime; + +namespace Orleans.ScheduledJobs; + +/// +/// Represents a shard of scheduled jobs that manages a collection of jobs within a specific time range. +/// A job shard is responsible for storing, retrieving, and managing the lifecycle of scheduled jobs +/// that fall within its designated time window. +/// +/// +/// Job shards are used to partition scheduled jobs across time ranges to improve scalability +/// and performance. Each shard has a defined start and end time that determines which jobs +/// it manages. Shards can be marked as complete when all jobs within their time range +/// have been processed. +/// +public interface IJobShard : IAsyncDisposable +{ + /// + /// Gets the unique identifier for this job shard. + /// + string Id { get; } + + /// + /// Gets the start time of the time range managed by this shard. + /// + DateTimeOffset StartTime { get; } + + /// + /// Gets the end time of the time range managed by this shard. + /// + DateTimeOffset EndTime { get; } + + /// + /// Gets optional metadata associated with this job shard. + /// + IDictionary? Metadata { get; } + + /// + /// Gets a value indicating whether this shard has been marked as complete and is no longer accepting new jobs. 
+ /// + /// + /// When a shard is marked as complete (via ), no new jobs can be added to it. + /// + bool IsAddingCompleted { get; } + + /// + /// Consumes scheduled jobs from this shard in order of their due time. + /// + /// An asynchronous enumerable of scheduled job contexts. + IAsyncEnumerable ConsumeScheduledJobsAsync(); + + /// + /// Gets the number of jobs currently scheduled in this shard. + /// + /// A task that represents the asynchronous operation. The task result contains the job count. + ValueTask GetJobCountAsync(); + + /// + /// Marks this shard as complete, preventing new jobs from being scheduled. + /// + /// A token to cancel the operation. + /// A task that represents the asynchronous operation. + Task MarkAsCompleteAsync(CancellationToken cancellationToken); + + /// + /// Removes a scheduled job from this shard. + /// + /// The unique identifier of the job to remove. + /// A token to cancel the operation. + /// A task that represents the asynchronous operation. The task result contains true if the job was successfully removed, or false if the job was not found. + Task RemoveJobAsync(string jobId, CancellationToken cancellationToken); + + /// + /// Reschedules a job to be retried at a later time. + /// + /// The context of the job to retry. + /// The new due time for the job. + /// A token to cancel the operation. + /// A task that represents the asynchronous operation. + Task RetryJobLaterAsync(IScheduledJobContext jobContext, DateTimeOffset newDueTime, CancellationToken cancellationToken); + + /// + /// Attempts to schedule a new job on this shard. + /// + /// The grain identifier of the target grain that will execute the job. + /// The name of the job to schedule. + /// The time when the job should be executed. + /// Optional metadata to associate with the job. + /// A token to cancel the operation. + /// A task that represents the asynchronous operation. 
The task result contains the scheduled job if successful, or null if the job could not be scheduled (e.g., the shard was marked as complete). + /// Thrown when the due time is outside the shard's time range. + Task TryScheduleJobAsync(GrainId target, string jobName, DateTimeOffset dueTime, IReadOnlyDictionary? metadata, CancellationToken cancellationToken); +} + +/// +/// Base implementation of that provides common functionality for job shard implementations. +/// +public abstract class JobShard : IJobShard +{ + private readonly InMemoryJobQueue _jobQueue; + + /// + public string Id { get; protected set; } + + /// + public DateTimeOffset StartTime { get; protected set; } + + /// + public DateTimeOffset EndTime { get; protected set; } + + /// + public IDictionary? Metadata { get; protected set; } + + /// + public bool IsAddingCompleted { get; protected set; } + + /// + /// Initializes a new instance of the class. + /// + /// The unique identifier for this job shard. + /// The start time of the time range managed by this shard. + /// The end time of the time range managed by this shard. + protected JobShard(string id, DateTimeOffset startTime, DateTimeOffset endTime) + { + Id = id; + StartTime = startTime; + EndTime = endTime; + _jobQueue = new InMemoryJobQueue(); + } + + /// + public ValueTask GetJobCountAsync() => ValueTask.FromResult(_jobQueue.Count); + + /// + public IAsyncEnumerable ConsumeScheduledJobsAsync() + { + return _jobQueue; + } + + /// + public async Task TryScheduleJobAsync(GrainId target, string jobName, DateTimeOffset dueTime, IReadOnlyDictionary? 
metadata, CancellationToken cancellationToken) + { + if (IsAddingCompleted) + { + return null; + } + + if (dueTime < StartTime || dueTime > EndTime) + { + throw new ArgumentOutOfRangeException(nameof(dueTime), "Scheduled time is out of shard bounds."); + } + + var jobId = Guid.NewGuid().ToString(); + var job = new ScheduledJob + { + Id = jobId, + TargetGrainId = target, + Name = jobName, + DueTime = dueTime, + ShardId = Id, + Metadata = metadata + }; + + await PersistAddJobAsync(jobId, jobName, dueTime, target, metadata, cancellationToken); + _jobQueue.Enqueue(job, 0); + return job; + } + + /// + public async Task RemoveJobAsync(string jobId, CancellationToken cancellationToken) + { + await PersistRemoveJobAsync(jobId, cancellationToken); + return _jobQueue.CancelJob(jobId); + } + + /// + public Task MarkAsCompleteAsync(CancellationToken cancellationToken) + { + IsAddingCompleted = true; + _jobQueue.MarkAsComplete(); + return Task.CompletedTask; + } + + /// + public async Task RetryJobLaterAsync(IScheduledJobContext jobContext, DateTimeOffset newDueTime, CancellationToken cancellationToken) + { + await PersistRetryJobAsync(jobContext.Job.Id, newDueTime, cancellationToken); + _jobQueue.RetryJobLater(jobContext, newDueTime); + } + + /// + /// Enqueues a job into the in-memory queue with the specified dequeue count. + /// + /// The job to enqueue. + /// The number of times this job has been dequeued. + protected void EnqueueJob(ScheduledJob job, int dequeueCount) + { + _jobQueue.Enqueue(job, dequeueCount); + } + + /// + /// Persists the addition of a new job to the underlying storage. + /// + /// The unique identifier of the job. + /// The name of the job. + /// The time when the job should be executed. + /// The grain identifier of the target grain. + /// Optional metadata to associate with the job. + /// A token to cancel the operation. + /// A task that represents the asynchronous operation. 
+ protected abstract Task PersistAddJobAsync(string jobId, string jobName, DateTimeOffset dueTime, GrainId target, IReadOnlyDictionary? metadata, CancellationToken cancellationToken); + + /// + /// Persists the removal of a job from the underlying storage. + /// + /// The unique identifier of the job to remove. + /// A token to cancel the operation. + /// A task that represents the asynchronous operation. + protected abstract Task PersistRemoveJobAsync(string jobId, CancellationToken cancellationToken); + + /// + /// Persists the rescheduling of a job to the underlying storage. + /// + /// The unique identifier of the job to retry. + /// The new due time for the job. + /// A token to cancel the operation. + /// A task that represents the asynchronous operation. + protected abstract Task PersistRetryJobAsync(string jobId, DateTimeOffset newDueTime, CancellationToken cancellationToken); + + /// + public virtual ValueTask DisposeAsync() + { + GC.SuppressFinalize(this); + return default; + } +} diff --git a/src/Orleans.ScheduledJobs/JobShardManager.cs b/src/Orleans.ScheduledJobs/JobShardManager.cs new file mode 100644 index 00000000000..1f3ff5f69b0 --- /dev/null +++ b/src/Orleans.ScheduledJobs/JobShardManager.cs @@ -0,0 +1,204 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Threading; +using System.Threading.Tasks; +using Orleans.Runtime; + +namespace Orleans.ScheduledJobs; + +/// +/// Manages the lifecycle of job shards for a specific silo. +/// Each silo instance has its own shard manager. +/// +public abstract class JobShardManager +{ + /// + /// Gets the silo address this manager is associated with. + /// + protected SiloAddress SiloAddress { get; } + + /// + /// Initializes a new instance of the class. + /// + /// The silo address this manager represents. + protected JobShardManager(SiloAddress siloAddress) + { + SiloAddress = siloAddress; + } + + /// + /// Assigns orphaned job shards to this silo. 
+ /// + /// Maximum due time for shards to consider. + /// Cancellation token. + /// A list of job shards assigned to this silo. + public abstract Task> AssignJobShardsAsync(DateTimeOffset maxDueTime, CancellationToken cancellationToken); + + /// + /// Creates a new job shard owned by this silo. + /// + /// The minimum due time for jobs in this shard. + /// The maximum due time for jobs in this shard. + /// Optional metadata for the shard. + /// Cancellation token. + /// The newly created job shard. + public abstract Task CreateShardAsync(DateTimeOffset minDueTime, DateTimeOffset maxDueTime, IDictionary metadata, CancellationToken cancellationToken); + + /// + /// Unregisters a shard owned by this silo. + /// + /// The shard to unregister. + /// Cancellation token. + /// A task representing the asynchronous operation. + public abstract Task UnregisterShardAsync(IJobShard shard, CancellationToken cancellationToken); +} + +internal class InMemoryJobShardManager : JobShardManager +{ + // Shared storage across all manager instances to support multi-silo scenarios + private static readonly Dictionary _globalShardStore = new(); + private static readonly SemaphoreSlim _asyncLock = new(1, 1); + private readonly IClusterMembershipService? _membershipService; + + public InMemoryJobShardManager(SiloAddress siloAddress) : base(siloAddress) + { + } + + public InMemoryJobShardManager(SiloAddress siloAddress, IClusterMembershipService membershipService) : base(siloAddress) + { + _membershipService = membershipService; + } + + /// + /// Clears all shards from the global store. For testing purposes only. 
+ /// + internal static async Task ClearAllShardsAsync() + { + await _asyncLock.WaitAsync(); + try + { + _globalShardStore.Clear(); + } + finally + { + _asyncLock.Release(); + } + } + + public override async Task> AssignJobShardsAsync(DateTimeOffset maxDueTime, CancellationToken cancellationToken) + { + var alreadyOwnedShards = new List(); + var stolenShards = new List(); + + await _asyncLock.WaitAsync(cancellationToken); + try + { + var snapshot = _membershipService?.CurrentSnapshot; + var deadSilos = new HashSet(); + + if (snapshot is not null) + { + foreach (var member in snapshot.Members.Values) + { + if (member.Status == SiloStatus.Dead) + { + deadSilos.Add(member.SiloAddress.ToString()); + } + } + } + + // Assign shards from dead silos or orphaned shards + foreach (var kvp in _globalShardStore) + { + var shardId = kvp.Key; + var ownership = kvp.Value; + + // Skip shards that are already owned by this silo + if (ownership.OwnerSiloAddress == SiloAddress.ToString()) + { + if (ownership.Shard.StartTime <= maxDueTime) + { + alreadyOwnedShards.Add(ownership.Shard); + } + } + // Take over orphaned shards or shards from dead silos + else if (ownership.OwnerSiloAddress is null || deadSilos.Contains(ownership.OwnerSiloAddress)) + { + if (ownership.Shard.StartTime <= maxDueTime) + { + ownership.OwnerSiloAddress = SiloAddress.ToString(); + stolenShards.Add(ownership.Shard); + } + } + } + } + finally + { + _asyncLock.Release(); + } + + foreach (var shard in stolenShards) + { + // Mark stolen shards as complete + await shard.MarkAsCompleteAsync(CancellationToken.None); + } + + return [.. alreadyOwnedShards, .. 
stolenShards]; + } + + public override async Task CreateShardAsync(DateTimeOffset minDueTime, DateTimeOffset maxDueTime, IDictionary metadata, CancellationToken cancellationToken) + { + await _asyncLock.WaitAsync(cancellationToken); + try + { + var shardId = $"{SiloAddress}-{Guid.NewGuid()}"; + var newShard = new InMemoryJobShard(shardId, minDueTime, maxDueTime, metadata); + + _globalShardStore[shardId] = new ShardOwnership + { + Shard = newShard, + OwnerSiloAddress = SiloAddress.ToString() + }; + + return newShard; + } + finally + { + _asyncLock.Release(); + } + } + + public override async Task UnregisterShardAsync(IJobShard shard, CancellationToken cancellationToken) + { + var jobCount = await shard.GetJobCountAsync(); + + await _asyncLock.WaitAsync(cancellationToken); + try + { + // Only remove shards that have no jobs remaining + if (_globalShardStore.TryGetValue(shard.Id, out var ownership)) + { + if (jobCount == 0) + { + _globalShardStore.Remove(shard.Id); + } + else + { + // Mark as unowned so another silo can pick it up + ownership.OwnerSiloAddress = null; + } + } + } + finally + { + _asyncLock.Release(); + } + } + + private sealed class ShardOwnership + { + public required IJobShard Shard { get; init; } + public string? 
OwnerSiloAddress { get; set; } + } +} diff --git a/src/Orleans.ScheduledJobs/LocalScheduledJobManager.Log.cs b/src/Orleans.ScheduledJobs/LocalScheduledJobManager.Log.cs new file mode 100644 index 00000000000..fefc40fd921 --- /dev/null +++ b/src/Orleans.ScheduledJobs/LocalScheduledJobManager.Log.cs @@ -0,0 +1,134 @@ +using System; +using Microsoft.Extensions.Logging; +using Orleans.Runtime; + +namespace Orleans.ScheduledJobs; + +internal partial class LocalScheduledJobManager +{ + [LoggerMessage( + Level = LogLevel.Debug, + Message = "Scheduling job '{JobName}' for grain {TargetGrain} at {DueTime}" + )] + private static partial void LogSchedulingJob(ILogger logger, string jobName, GrainId targetGrain, DateTimeOffset dueTime); + + [LoggerMessage( + Level = LogLevel.Debug, + Message = "Job '{JobName}' (ID: {JobId}) scheduled to shard {ShardId} for grain {TargetGrain}" + )] + private static partial void LogJobScheduled(ILogger logger, string jobName, string jobId, string shardId, GrainId targetGrain); + + [LoggerMessage( + Level = LogLevel.Information, + Message = "LocalScheduledJobManager starting" + )] + private static partial void LogStarting(ILogger logger); + + [LoggerMessage( + Level = LogLevel.Information, + Message = "LocalScheduledJobManager started" + )] + private static partial void LogStarted(ILogger logger); + + [LoggerMessage( + Level = LogLevel.Information, + Message = "LocalScheduledJobManager stopping. 
Running shards: {RunningShardCount}" + )] + private static partial void LogStopping(ILogger logger, int runningShardCount); + + [LoggerMessage( + Level = LogLevel.Information, + Message = "LocalScheduledJobManager stopped" + )] + private static partial void LogStopped(ILogger logger); + + [LoggerMessage( + Level = LogLevel.Debug, + Message = "Attempting to cancel job {JobId} (Name: '{JobName}') in shard {ShardId}" + )] + private static partial void LogCancellingJob(ILogger logger, string jobId, string jobName, string shardId); + + [LoggerMessage( + Level = LogLevel.Warning, + Message = "Failed to cancel job {JobId} (Name: '{JobName}') - shard {ShardId} not found in cache" + )] + private static partial void LogJobCancellationFailed(ILogger logger, string jobId, string jobName, string shardId); + + [LoggerMessage( + Level = LogLevel.Information, + Message = "Job {JobId} (Name: '{JobName}') cancelled from shard {ShardId}" + )] + private static partial void LogJobCancelled(ILogger logger, string jobId, string jobName, string shardId); + + [LoggerMessage( + Level = LogLevel.Error, + Message = "Error processing cluster membership update" + )] + private static partial void LogErrorProcessingClusterMembership(ILogger logger, Exception exception); + + [LoggerMessage( + Level = LogLevel.Debug, + Message = "Checking for unassigned shards" + )] + private static partial void LogCheckingForUnassignedShards(ILogger logger); + + [LoggerMessage( + Level = LogLevel.Information, + Message = "Assigned {ShardCount} shard(s)" + )] + private static partial void LogAssignedShards(ILogger logger, int shardCount); + + [LoggerMessage( + Level = LogLevel.Trace, + Message = "No unassigned shards found" + )] + private static partial void LogNoShardsToAssign(ILogger logger); + + [LoggerMessage( + Level = LogLevel.Information, + Message = "Starting shard {ShardId} (Start: {StartTime}, End: {EndTime})" + )] + private static partial void LogStartingShard(ILogger logger, string shardId, 
DateTimeOffset startTime, DateTimeOffset endTime); + + [LoggerMessage( + Level = LogLevel.Debug, + Message = "Shard {ShardId} not ready yet. Start time: {StartTime}" + )] + private static partial void LogShardNotReadyYet(ILogger logger, string shardId, DateTimeOffset startTime); + + [LoggerMessage( + Level = LogLevel.Trace, + Message = "Checking for pending shards to start" + )] + private static partial void LogCheckingPendingShards(ILogger logger); + + [LoggerMessage( + Level = LogLevel.Error, + Message = "Error in periodic shard check" + )] + private static partial void LogErrorInPeriodicCheck(ILogger logger, Exception exception); + + [LoggerMessage( + Level = LogLevel.Information, + Message = "Unregistered shard {ShardId}" + )] + private static partial void LogUnregisteredShard(ILogger logger, string shardId); + + [LoggerMessage( + Level = LogLevel.Error, + Message = "Error unregistering shard {ShardId}" + )] + private static partial void LogErrorUnregisteringShard(ILogger logger, Exception exception, string shardId); + + [LoggerMessage( + Level = LogLevel.Error, + Message = "Error disposing shard {ShardId}" + )] + private static partial void LogErrorDisposingShard(ILogger logger, Exception exception, string shardId); + + [LoggerMessage( + Level = LogLevel.Information, + Message = "Creating new shard for key {ShardKey}" + )] + private static partial void LogCreatingNewShard(ILogger logger, DateTimeOffset shardKey); +} diff --git a/src/Orleans.ScheduledJobs/LocalScheduledJobManager.cs b/src/Orleans.ScheduledJobs/LocalScheduledJobManager.cs new file mode 100644 index 00000000000..496f0d200ef --- /dev/null +++ b/src/Orleans.ScheduledJobs/LocalScheduledJobManager.cs @@ -0,0 +1,335 @@ +using System; +using System.Collections.Concurrent; +using System.Collections.Generic; +using System.Linq; +using System.Runtime.CompilerServices; +using System.Threading; +using System.Threading.Tasks; +using Microsoft.Extensions.Logging; +using Microsoft.Extensions.Options; +using 
Orleans.Hosting; +using Orleans.Internal; +using Orleans.Runtime; +using Orleans.Runtime.Internal; + +namespace Orleans.ScheduledJobs; + +/// +internal partial class LocalScheduledJobManager : SystemTarget, ILocalScheduledJobManager, ILifecycleParticipant +{ + private readonly JobShardManager _shardManager; + private readonly ShardExecutor _shardExecutor; + private readonly IAsyncEnumerable _clusterMembershipUpdates; + private readonly ILogger _logger; + private readonly ScheduledJobsOptions _options; + private readonly CancellationTokenSource _cts = new(); + private Task? _listenForClusterChangesTask; + private Task? _periodicCheckTask; + + // Shard tracking state + private readonly ConcurrentDictionary _shardCache = new(); + private readonly ConcurrentDictionary _writeableShards = new(); + private readonly ConcurrentDictionary _runningShards = new(); + private readonly SemaphoreSlim _shardCreationLock = new(1, 1); + private readonly SemaphoreSlim _shardCheckSignal = new(0); + + private static readonly IDictionary EmptyMetadata = new Dictionary(); + + public LocalScheduledJobManager( + JobShardManager shardManager, + ShardExecutor shardExecutor, + IClusterMembershipService clusterMembership, + IOptions options, + SystemTargetShared shared, + ILogger logger) + : base(SystemTargetGrainId.CreateGrainType("job-manager"), shared) + { + _shardManager = shardManager; + _shardExecutor = shardExecutor; + _clusterMembershipUpdates = clusterMembership.MembershipUpdates; + _logger = logger; + _options = options.Value; + } + + /// + public async Task ScheduleJobAsync(GrainId target, string jobName, DateTimeOffset dueTime, IReadOnlyDictionary? 
metadata, CancellationToken cancellationToken) + { + LogSchedulingJob(_logger, jobName, target, dueTime); + + var shardKey = GetShardKey(dueTime); + + while (true) + { + // Fast path: shard already exists + if (_writeableShards.TryGetValue(shardKey, out var existingShard)) + { + var job = await existingShard.TryScheduleJobAsync(target, jobName, dueTime, metadata, cancellationToken); + if (job is not null) + { + LogJobScheduled(_logger, jobName, job.Id, existingShard.Id, target); + return job; + } + + // Shard is full or no longer writable, remove from writable shards and try again + _writeableShards.TryRemove(shardKey, out _); + continue; + } + + // Slow path: need to create shard + await _shardCreationLock.WaitAsync(cancellationToken); + try + { + // Double-check after acquiring lock + if (_writeableShards.TryGetValue(shardKey, out existingShard)) + { + continue; + } + + // Create new shard + using var linkedCts = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken, _cts.Token); + var endTime = shardKey.Add(_options.ShardDuration); + var newShard = await _shardManager.CreateShardAsync(shardKey, endTime, EmptyMetadata, linkedCts.Token); + + LogCreatingNewShard(_logger, shardKey); + _writeableShards[shardKey] = newShard; + _shardCache.TryAdd(newShard.Id, newShard); + TryActivateShard(newShard); + } + finally + { + _shardCreationLock.Release(); + } + } + } + + public void Participate(ISiloLifecycle lifecycle) + { + lifecycle.Subscribe( + nameof(LocalScheduledJobManager), + ServiceLifecycleStage.Active, + ct => Start(ct), + ct => Stop(ct)); + } + + private Task Start(CancellationToken ct) + { + LogStarting(_logger); + + using (var _ = new ExecutionContextSuppressor()) + { + _listenForClusterChangesTask = Task.Factory.StartNew( + state => ((LocalScheduledJobManager)state!).ProcessMembershipUpdates(), + this, + CancellationToken.None, + TaskCreationOptions.None, + WorkItemGroup.TaskScheduler).Unwrap(); + _listenForClusterChangesTask.Ignore(); + + 
_periodicCheckTask = Task.Factory.StartNew( + state => ((LocalScheduledJobManager)state!).PeriodicShardCheck(), + this, + CancellationToken.None, + TaskCreationOptions.None, + WorkItemGroup.TaskScheduler).Unwrap(); + _periodicCheckTask.Ignore(); + } + + LogStarted(_logger); + return Task.CompletedTask; + } + + private async Task Stop(CancellationToken ct) + { + LogStopping(_logger, _runningShards.Count); + + _cts.Cancel(); + + if (_listenForClusterChangesTask is not null) + { + await _listenForClusterChangesTask; + } + + if (_periodicCheckTask is not null) + { + await _periodicCheckTask; + } + + await Task.WhenAll(_runningShards.Values.ToArray()); + + LogStopped(_logger); + } + + /// + public async Task TryCancelScheduledJobAsync(ScheduledJob job, CancellationToken cancellationToken) + { + LogCancellingJob(_logger, job.Id, job.Name, job.ShardId); + + if (!_shardCache.TryGetValue(job.ShardId, out var shard)) + { + LogJobCancellationFailed(_logger, job.Id, job.Name, job.ShardId); + return false; + } + + using var linkedCts = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken, _cts.Token); + var wasRemoved = await shard.RemoveJobAsync(job.Id, linkedCts.Token); + LogJobCancelled(_logger, job.Id, job.Name, job.ShardId); + return wasRemoved; + } + + private async Task ProcessMembershipUpdates() + { + await Task.CompletedTask.ConfigureAwait(ConfigureAwaitOptions.ForceYielding | ConfigureAwaitOptions.ContinueOnCapturedContext); + var current = new HashSet(); + + await foreach (var membershipSnapshot in _clusterMembershipUpdates.WithCancellation(_cts.Token)) + { + try + { + // Get active members + var update = new HashSet(membershipSnapshot.Members.Values + .Where(member => member.Status == SiloStatus.Active) + .Select(member => member.SiloAddress)); + + // If active list has changed, trigger immediate shard check + if (!current.SetEquals(update)) + { + current = update; + _shardCheckSignal.Release(); + } + } + catch (Exception exception) + { + 
LogErrorProcessingClusterMembership(_logger, exception); + } + } + } + + private async Task PeriodicShardCheck() + { + await Task.CompletedTask.ConfigureAwait(ConfigureAwaitOptions.ForceYielding | ConfigureAwaitOptions.ContinueOnCapturedContext); + + using var timer = new PeriodicTimer(TimeSpan.FromMinutes(10)); + + while (!_cts.Token.IsCancellationRequested) + { + try + { + // Wait for either periodic timer OR signal from membership changes + var timerTask = timer.WaitForNextTickAsync(_cts.Token); + var signalTask = _shardCheckSignal.WaitAsync(_cts.Token); + await Task.WhenAny(timerTask.AsTask(), signalTask); + + LogCheckingPendingShards(_logger); + + // Clean up old writable shards that have passed their time window + var now = DateTimeOffset.UtcNow; + foreach (var key in _writeableShards.Keys.ToArray()) + { + var shardEndTime = key.Add(_options.ShardDuration); + if (shardEndTime < now) + { + _writeableShards.TryRemove(key, out _); + } + } + + // Query ShardManager for assigned shards (source of truth) + var shards = await _shardManager.AssignJobShardsAsync(DateTime.UtcNow.AddHours(1), _cts.Token); + if (shards.Count > 0) + { + LogAssignedShards(_logger, shards.Count); + foreach (var shard in shards) + { + _shardCache.TryAdd(shard.Id, shard); + + if (!_runningShards.ContainsKey(shard.Id)) + { + TryActivateShard(shard); + } + } + } + else + { + LogNoShardsToAssign(_logger); + } + } + catch (OperationCanceledException) + { + break; + } + catch (Exception ex) + { + LogErrorInPeriodicCheck(_logger, ex); + } + } + } + + private void TryActivateShard(IJobShard shard) + { + // Only start if not already running + if (_runningShards.ContainsKey(shard.Id)) + { + return; + } + + // Only start if it's time to start (within buffer period) + if (!ShouldStartShardNow(shard)) + { + LogShardNotReadyYet(_logger, shard.Id, shard.StartTime); + return; + } + + if (_runningShards.TryAdd(shard.Id, Task.CompletedTask)) + { + LogStartingShard(_logger, shard.Id, shard.StartTime, 
shard.EndTime); + _runningShards[shard.Id] = RunShardWithCleanupAsync(shard); + } + } + + private async Task RunShardWithCleanupAsync(IJobShard shard) + { + try + { + await _shardExecutor.RunShardAsync(shard, _cts.Token); + + // Unregister the shard from the manager + try + { + await _shardManager.UnregisterShardAsync(shard, _cts.Token); + LogUnregisteredShard(_logger, shard.Id); + } + catch (Exception ex) when (ex is not OperationCanceledException) + { + LogErrorUnregisteringShard(_logger, ex, shard.Id); + } + } + finally + { + // Clean up tracking and dispose the shard + _shardCache.TryRemove(shard.Id, out _); + _runningShards.TryRemove(shard.Id, out _); + + try + { + await shard.DisposeAsync(); + } + catch (Exception ex) + { + LogErrorDisposingShard(_logger, ex, shard.Id); + } + } + } + + private bool ShouldStartShardNow(IJobShard shard) + { + var activationTime = shard.StartTime.Subtract(_options.ShardActivationBufferPeriod); + return DateTimeOffset.UtcNow >= activationTime; + } + + private DateTimeOffset GetShardKey(DateTimeOffset scheduledTime) + { + var shardDurationTicks = _options.ShardDuration.Ticks; + var epochTicks = scheduledTime.UtcTicks; + var bucketTicks = (epochTicks / shardDurationTicks) * shardDurationTicks; + return new DateTimeOffset(bucketTicks, TimeSpan.Zero); + } +} diff --git a/src/Orleans.ScheduledJobs/Orleans.ScheduledJobs.csproj b/src/Orleans.ScheduledJobs/Orleans.ScheduledJobs.csproj new file mode 100644 index 00000000000..6b86863d93f --- /dev/null +++ b/src/Orleans.ScheduledJobs/Orleans.ScheduledJobs.csproj @@ -0,0 +1,25 @@ + + + Microsoft.Orleans.ScheduledJobs + Microsoft Orleans Scheduled Jobs Library + Scheduled Jobs library for Microsoft Orleans used on the server. 
+ README.md + $(DefaultTargetFrameworks) + true + false + $(DefineConstants) + $(VersionSuffix).alpha.1 + alpha.1 + enable + + + + + + + + + + + + diff --git a/src/Orleans.ScheduledJobs/Properties/AssemblyInfo.cs b/src/Orleans.ScheduledJobs/Properties/AssemblyInfo.cs new file mode 100644 index 00000000000..df14900e23e --- /dev/null +++ b/src/Orleans.ScheduledJobs/Properties/AssemblyInfo.cs @@ -0,0 +1,4 @@ +using System.Runtime.CompilerServices; + +[assembly: InternalsVisibleTo("NonSilo.Tests")] +[assembly: InternalsVisibleTo("Tester")] diff --git a/src/Orleans.ScheduledJobs/README.md b/src/Orleans.ScheduledJobs/README.md new file mode 100644 index 00000000000..927754127ce --- /dev/null +++ b/src/Orleans.ScheduledJobs/README.md @@ -0,0 +1,465 @@ +# Microsoft Orleans Scheduled Jobs + +## Introduction +Microsoft Orleans Scheduled Jobs provides a distributed, scalable system for scheduling one-time jobs that execute at a specific time. Unlike Orleans Reminders which are designed for recurring tasks, Scheduled Jobs are ideal for one-time future events such as appointment notifications, delayed processing, scheduled workflow steps, and time-based triggers. 
+ +**Key Features:** +- **At-Least-Once Execution**: Each scheduled job runs at least once (and, because failed jobs can be retried, may run more than once) +- **Persistent**: Jobs survive grain deactivation and silo restarts +- **Distributed**: Jobs are automatically distributed and rebalanced across silos +- **Reliable**: Failed jobs can be automatically retried with configurable policies +- **Rich Metadata**: Associate custom metadata with each job +- **Cancellable**: Jobs can be canceled before execution + +## Getting Started + +### Installation +To use this package, install it via NuGet: + +```shell +dotnet add package Microsoft.Orleans.ScheduledJobs +``` + +For production scenarios with persistence, also install a storage provider: + +```shell +dotnet add package Microsoft.Orleans.ScheduledJobs.AzureStorage +``` + +### Configuration + +#### Using In-Memory Storage (Development/Testing) +```csharp +using Microsoft.Extensions.Hosting; +using Orleans.Hosting; + +var builder = Host.CreateApplicationBuilder(args); + +builder.UseOrleans(siloBuilder => +{ + siloBuilder + .UseLocalhostClustering() + // Configure in-memory scheduled jobs (no persistence) + .UseInMemoryScheduledJobs(); +}); + +await builder.Build().RunAsync(); +``` + +#### Using Azure Storage (Production) +```csharp +using Microsoft.Extensions.Hosting; +using Orleans.Hosting; + +var builder = Host.CreateApplicationBuilder(args); + +builder.UseOrleans(siloBuilder => +{ + siloBuilder + .UseLocalhostClustering() + // Configure Azure Storage scheduled jobs + .UseAzureStorageScheduledJobs(options => + { + options.Configure(o => + { + o.BlobServiceClient = new Azure.Storage.Blobs.BlobServiceClient("YOUR_CONNECTION_STRING"); + o.ContainerName = "scheduled-jobs"; + }); + }); +}); + +await builder.Build().RunAsync(); +``` + +#### Advanced Configuration +```csharp +builder.UseOrleans(siloBuilder => +{ + siloBuilder + .UseLocalhostClustering() + .UseInMemoryScheduledJobs() + .ConfigureServices(services => + { + services.Configure<ScheduledJobsOptions>(options => + { + // Duration of each job 
shard (jobs are partitioned by time) + options.ShardDuration = TimeSpan.FromMinutes(5); + + // Maximum number of jobs that can execute concurrently on each silo + options.MaxConcurrentJobsPerSilo = 100; + + // Custom retry policy + options.ShouldRetry = (context, exception) => + { + // Retry up to 3 times with exponential backoff + if (context.DequeueCount < 3) + { + var delay = TimeSpan.FromSeconds(Math.Pow(2, context.DequeueCount)); + return DateTimeOffset.UtcNow.Add(delay); + } + return null; // Don't retry + }; + }); + }); +}); +``` + +## Usage Examples + +### Basic Job Scheduling + +#### 1. Implement the IScheduledJobHandler Interface +```csharp +using Orleans; +using Orleans.ScheduledJobs; + +public interface INotificationGrain : IGrainWithStringKey +{ + Task ScheduleNotification(string message, DateTimeOffset sendTime); + Task CancelScheduledNotification(); +} + +public class NotificationGrain : Grain, INotificationGrain, IScheduledJobHandler +{ + private readonly ILocalScheduledJobManager _jobManager; + private readonly ILogger _logger; + private IScheduledJob? 
_scheduledJob; + + public NotificationGrain( + ILocalScheduledJobManager jobManager, + ILogger logger) + { + _jobManager = jobManager; + _logger = logger; + } + + public async Task ScheduleNotification(string message, DateTimeOffset sendTime) + { + var userId = this.GetPrimaryKeyString(); + var metadata = new Dictionary + { + ["Message"] = message + }; + + _scheduledJob = await _jobManager.ScheduleJobAsync( + this.GetGrainId(), + "SendNotification", + sendTime, + metadata); + + _logger.LogInformation( + "Scheduled notification for user {UserId} at {SendTime} (JobId: {JobId})", + userId, sendTime, _scheduledJob.Id); + } + + public async Task CancelScheduledNotification() + { + if (_scheduledJob is null) + { + _logger.LogWarning("No scheduled notification to cancel"); + return; + } + + var canceled = await _jobManager.TryCancelScheduledJobAsync(_scheduledJob); + _logger.LogInformation("Notification {JobId} canceled: {Canceled}", _scheduledJob.Id, canceled); + + if (canceled) + { + _scheduledJob = null; + } + } + + // This method is called when the scheduled job executes + public Task ExecuteJobAsync(IScheduledJobContext context, CancellationToken cancellationToken) + { + var userId = this.GetPrimaryKeyString(); + var message = context.Job.Metadata?["Message"]; + + _logger.LogInformation( + "Sending notification to user {UserId}: {Message} (Job: {JobId}, Run: {RunId}, Attempt: {DequeueCount})", + userId, message, context.Job.Id, context.RunId, context.DequeueCount); + + // Send the notification here + // If this throws an exception, the job can be retried based on your retry policy + + _scheduledJob = null; + return Task.CompletedTask; + } +} +``` + +#### 2. 
Order Workflow with Multiple Jobs +```csharp +public interface IOrderGrain : IGrainWithGuidKey +{ + Task PlaceOrder(OrderDetails details); + Task CancelOrder(); +} + +public class OrderGrain : Grain, IOrderGrain, IScheduledJobHandler +{ + private readonly ILocalScheduledJobManager _jobManager; + private readonly IOrderService _orderService; + private readonly IGrainFactory _grainFactory; + private readonly ILogger _logger; + + public OrderGrain( + ILocalScheduledJobManager jobManager, + IOrderService orderService, + IGrainFactory grainFactory, + ILogger logger) + { + _jobManager = jobManager; + _orderService = orderService; + _grainFactory = grainFactory; + _logger = logger; + } + + public async Task PlaceOrder(OrderDetails details) + { + var orderId = this.GetPrimaryKey(); + + // Create the order + await _orderService.CreateOrderAsync(orderId, details); + + // Schedule delivery reminder for 24 hours before delivery + var reminderTime = details.DeliveryDate.AddHours(-24); + await _jobManager.ScheduleJobAsync( + this.GetGrainId(), + "DeliveryReminder", + reminderTime, + new Dictionary + { + ["Step"] = "DeliveryReminder", + ["CustomerId"] = details.CustomerId, + ["OrderNumber"] = details.OrderNumber + }); + + // Schedule order expiration if payment not received + var expirationTime = DateTimeOffset.UtcNow.AddHours(24); + await _jobManager.ScheduleJobAsync( + this.GetGrainId(), + "OrderExpiration", + expirationTime, + new Dictionary + { + ["Step"] = "OrderExpiration" + }); + } + + public async Task CancelOrder() + { + var orderId = this.GetPrimaryKey(); + await _orderService.CancelOrderAsync(orderId); + } + + public async Task ExecuteJobAsync(IScheduledJobContext context, CancellationToken cancellationToken) + { + var step = context.Job.Metadata!["Step"]; + var orderId = this.GetPrimaryKey(); + + switch (step) + { + case "DeliveryReminder": + await HandleDeliveryReminder(context, cancellationToken); + break; + + case "OrderExpiration": + await 
HandleOrderExpiration(cancellationToken); + break; + } + } + + private async Task HandleDeliveryReminder(IScheduledJobContext context, CancellationToken ct) + { + var customerId = context.Job.Metadata!["CustomerId"]; + var orderNumber = context.Job.Metadata["OrderNumber"]; + + var notificationGrain = _grainFactory.GetGrain(customerId); + await notificationGrain.ScheduleNotification( + $"Your order #{orderNumber} will be delivered tomorrow!", + DateTimeOffset.UtcNow); + } + + private async Task HandleOrderExpiration(CancellationToken ct) + { + var orderId = this.GetPrimaryKey(); + var order = await _orderService.GetOrderAsync(orderId, ct); + + if (order?.Status == OrderStatus.Pending) + { + await _orderService.CancelOrderAsync(orderId, ct); + _logger.LogInformation("Order {OrderId} expired and canceled", orderId); + } + } +} +``` + +### Advanced Scenarios + +#### Job with Retry Logic +```csharp +public class PaymentProcessorGrain : Grain, IScheduledJobHandler +{ + private readonly IPaymentService _paymentService; + private readonly ILogger _logger; + + public Task ExecuteJobAsync(IScheduledJobContext context, CancellationToken cancellationToken) + { + var paymentId = context.Job.Metadata?["PaymentId"]; + + _logger.LogInformation( + "Processing payment {PaymentId} (Attempt {Attempt})", + paymentId, context.DequeueCount); + + try + { + await _paymentService.ProcessPaymentAsync(paymentId, cancellationToken); + return Task.CompletedTask; + } + catch (TransientException ex) + { + _logger.LogWarning(ex, "Payment processing failed with transient error, will retry"); + throw; // Let the retry policy handle it + } + catch (Exception ex) + { + _logger.LogError(ex, "Payment processing failed with permanent error"); + throw; // This will not be retried if the retry policy returns null + } + } +} +``` + +#### Tracking Job Completion +```csharp +public class WorkflowGrain : Grain, IScheduledJobHandler +{ + private readonly Dictionary _pendingJobs = new(); + + public async Task 
ScheduleWorkflowStep(string stepName, DateTimeOffset executeAt) + { + var job = await _jobManager.ScheduleJobAsync( + this.GetGrainId(), + stepName, + executeAt); + + _pendingJobs[job.Id] = new TaskCompletionSource(); + return job; + } + + public async Task WaitForJobCompletion(string jobId, TimeSpan timeout) + { + if (_pendingJobs.TryGetValue(jobId, out var tcs)) + { + using var cts = new CancellationTokenSource(timeout); + await tcs.Task.WaitAsync(cts.Token); + } + } + + public Task ExecuteJobAsync(IScheduledJobContext context, CancellationToken cancellationToken) + { + // Execute the workflow step... + + // Mark as complete + if (_pendingJobs.TryRemove(context.Job.Id, out var tcs)) + { + tcs.SetResult(); + } + + return Task.CompletedTask; + } +} +``` + +## How It Works + +### Architecture Overview +1. **Job Sharding**: Jobs are partitioned into time-based shards (default: 1-minute windows) +2. **Shard Ownership**: Each shard is owned by a single silo for execution +3. **Automatic Rebalancing**: When a silo fails, its shards are automatically reassigned to healthy silos +4. **Ordered Execution**: Within a shard, jobs are processed in order of their due time +5. **Concurrency Control**: The `MaxConcurrentJobsPerSilo` setting limits concurrent job execution + +### Job Lifecycle +``` +┌─────────────┐ +│ Scheduled │ ──▶ Job is created and added to appropriate shard +└─────────────┘ + │ + ▼ +┌─────────────┐ +│ Waiting │ ──▶ Job waits in queue until due time +└─────────────┘ + │ + ▼ +┌─────────────┐ +│ Executing │ ──▶ Job handler is invoked on target grain +└─────────────┘ + │ + ├──▶ Success ──▶ Job is removed + │ + └──▶ Failure ──▶ Retry policy decides: + • Retry: Job is re-queued with new due time + • No Retry: Job is removed +``` + +## Configuration Reference + +### ScheduledJobsOptions + +| Property | Type | Default | Description | +|----------|------|---------|-------------| +| `ShardDuration` | `TimeSpan` | 1 minute | Duration of each job shard. 
Smaller values reduce latency but increase overhead. | +| `MaxConcurrentJobsPerSilo` | `int` | 100 | Maximum number of jobs that can execute simultaneously on a silo. | +| `ShouldRetry` | `Func<IScheduledJobContext, Exception, DateTimeOffset?>` | 3 retries with exp. backoff | Determines if a failed job should be retried. Return the new due time or `null` to not retry. | + +## Best Practices + +1. **Set Reasonable Concurrency Limits**: Prevent resource exhaustion + ```csharp + options.MaxConcurrentJobsPerSilo = 100; // Adjust based on your workload + ``` + +2. **Implement Idempotent Job Handlers**: Jobs may be retried, so ensure handlers are idempotent + ```csharp + public async Task ExecuteJobAsync(IScheduledJobContext context, CancellationToken ct) + { + var jobId = context.Job.Id; + // Check if already processed + if (await _state.IsProcessed(jobId)) + return; + + // Process job... + await _state.MarkProcessed(jobId); + } + ``` + +3. **Use Metadata Wisely**: Keep metadata lightweight + ```csharp + // Good: Store IDs + var metadata = new Dictionary<string, string> { ["OrderId"] = "12345" }; + + // Bad: Store large objects + var metadata = new Dictionary<string, string> { ["Order"] = JsonSerializer.Serialize(largeOrder) }; + ``` + +4. 
**Handle Cancellation**: Respect the cancellation token + ```csharp + public async Task ExecuteJobAsync(IScheduledJobContext context, CancellationToken ct) + { + await SomeLongRunningOperation(ct); + } + ``` + +## Documentation +For more comprehensive documentation, please refer to: +- [Microsoft Orleans Documentation](https://learn.microsoft.com/dotnet/orleans/) +- [Timers and Reminders](https://learn.microsoft.com/en-us/dotnet/orleans/grains/timers-and-reminders) + +## Feedback & Contributing +- If you have any issues or would like to provide feedback, please [open an issue on GitHub](https://github.com/dotnet/orleans/issues) +- Join our community on [Discord](https://aka.ms/orleans-discord) +- Follow the [@msftorleans](https://twitter.com/msftorleans) Twitter account for Orleans announcements +- Contributions are welcome! Please review our [contribution guidelines](https://github.com/dotnet/orleans/blob/main/CONTRIBUTING.md) +- This project is licensed under the [MIT license](https://github.com/dotnet/orleans/blob/main/LICENSE) diff --git a/src/Orleans.ScheduledJobs/ScheduledJob.cs b/src/Orleans.ScheduledJobs/ScheduledJob.cs new file mode 100644 index 00000000000..08e6175f18a --- /dev/null +++ b/src/Orleans.ScheduledJobs/ScheduledJob.cs @@ -0,0 +1,49 @@ +using System; +using System.Collections.Generic; +using Orleans.Runtime; + +namespace Orleans.ScheduledJobs; + +/// +/// Represents a scheduled job that will be executed at a specific time. +/// +[GenerateSerializer] +[Alias("Orleans.ScheduledJobs.ScheduledJob")] +public sealed class ScheduledJob +{ + /// + /// Gets the unique identifier for this scheduled job. + /// + [Id(0)] + public required string Id { get; init; } + + /// + /// Gets the name of the scheduled job. + /// + [Id(1)] + public required string Name { get; init; } + + /// + /// Gets the time when this job is due to be executed. 
+ /// + [Id(2)] + public DateTimeOffset DueTime { get; init; } + + /// + /// Gets the identifier of the target grain that will handle this job. + /// + [Id(3)] + public GrainId TargetGrainId { get; init; } + + /// + /// Gets the identifier of the shard that manages this scheduled job. + /// + [Id(4)] + public required string ShardId { get; init; } + + /// + /// Gets optional metadata associated with this scheduled job. + /// + [Id(5)] + public IReadOnlyDictionary? Metadata { get; init; } +} diff --git a/src/Orleans.ScheduledJobs/ShardExecutor.Log.cs b/src/Orleans.ScheduledJobs/ShardExecutor.Log.cs new file mode 100644 index 00000000000..6ef04c98393 --- /dev/null +++ b/src/Orleans.ScheduledJobs/ShardExecutor.Log.cs @@ -0,0 +1,62 @@ +using System; +using Microsoft.Extensions.Logging; +using Orleans.Runtime; + +namespace Orleans.ScheduledJobs; + +internal sealed partial class ShardExecutor +{ + [LoggerMessage( + Level = LogLevel.Debug, + Message = "Waiting {Delay} for shard {ShardId} start time {StartTime}" + )] + private static partial void LogWaitingForShardStartTime(ILogger logger, string shardId, TimeSpan delay, DateTimeOffset startTime); + + [LoggerMessage( + Level = LogLevel.Information, + Message = "Begin processing shard {ShardId}" + )] + private static partial void LogBeginProcessingShard(ILogger logger, string shardId); + + [LoggerMessage( + Level = LogLevel.Debug, + Message = "Executing job {JobId} (Name: '{JobName}') for grain {TargetGrain}, due at {DueTime}" + )] + private static partial void LogExecutingJob(ILogger logger, string jobId, string jobName, GrainId targetGrain, DateTimeOffset dueTime); + + [LoggerMessage( + Level = LogLevel.Debug, + Message = "Job {JobId} (Name: '{JobName}') executed successfully" + )] + private static partial void LogJobExecutedSuccessfully(ILogger logger, string jobId, string jobName); + + [LoggerMessage( + Level = LogLevel.Error, + Message = "Error executing job {JobId}" + )] + private static partial void 
LogErrorExecutingJob(ILogger logger, Exception exception, string jobId); + + [LoggerMessage( + Level = LogLevel.Warning, + Message = "Retrying job {JobId} (Name: '{JobName}') at {RetryTime}. Dequeue count: {DequeueCount}" + )] + private static partial void LogRetryingJob(ILogger logger, string jobId, string jobName, DateTimeOffset retryTime, int dequeueCount); + + [LoggerMessage( + Level = LogLevel.Error, + Message = "Job {JobId} (Name: '{JobName}') failed after {DequeueCount} attempts and will not be retried" + )] + private static partial void LogJobFailedNoRetry(ILogger logger, string jobId, string jobName, int dequeueCount); + + [LoggerMessage( + Level = LogLevel.Information, + Message = "Completed processing shard {ShardId}" + )] + private static partial void LogCompletedProcessingShard(ILogger logger, string shardId); + + [LoggerMessage( + Level = LogLevel.Debug, + Message = "Shard {ShardId} processing cancelled" + )] + private static partial void LogShardCancelled(ILogger logger, string shardId); +}
diff --git a/src/Orleans.ScheduledJobs/ShardExecutor.cs b/src/Orleans.ScheduledJobs/ShardExecutor.cs
new file mode 100644
index 00000000000..5d9bcb6b67f
--- /dev/null
+++ b/src/Orleans.ScheduledJobs/ShardExecutor.cs
@@ -0,0 +1,136 @@
+using System;
+using System.Collections.Concurrent;
+using System.Runtime.CompilerServices;
+using System.Threading;
+using System.Threading.Tasks;
+using Microsoft.Extensions.Logging;
+using Microsoft.Extensions.Options;
+using Orleans.Hosting;
+using Orleans.Runtime;
+
+namespace Orleans.ScheduledJobs;
+
+/// <summary>
+/// Handles the execution of job shards and individual scheduled jobs.
+/// </summary>
+internal sealed partial class ShardExecutor
+{
+    private readonly IInternalGrainFactory _grainFactory;
+    private readonly ILogger _logger;
+    private readonly ScheduledJobsOptions _options;
+    private readonly SemaphoreSlim _jobConcurrencyLimiter;
+
+    /// <summary>
+    /// Initializes a new instance of the <see cref="ShardExecutor"/> class.
+    /// </summary>
+    /// <param name="grainFactory">The grain factory for creating grain references.</param>
+    /// <param name="options">The scheduled jobs configuration options.</param>
+    /// <param name="logger">The logger instance.</param>
+    public ShardExecutor(
+        IInternalGrainFactory grainFactory,
+        IOptions<ScheduledJobsOptions> options,
+        ILogger logger)
+    {
+        _grainFactory = grainFactory;
+        _logger = logger;
+        _options = options.Value;
+        _jobConcurrencyLimiter = new SemaphoreSlim(_options.MaxConcurrentJobsPerSilo);
+    }
+
+    /// <summary>
+    /// Runs a shard, processing all jobs within it until completion or cancellation.
+    /// </summary>
+    /// <param name="shard">The shard to execute.</param>
+    /// <param name="cancellationToken">Cancellation token to stop processing.</param>
+    /// <returns>A task representing the asynchronous operation.</returns>
+    public async Task RunShardAsync(IJobShard shard, CancellationToken cancellationToken)
+    {
+        await Task.CompletedTask.ConfigureAwait(ConfigureAwaitOptions.ForceYielding | ConfigureAwaitOptions.ContinueOnCapturedContext);
+
+        var tasks = new ConcurrentDictionary<string, Task>();
+        try
+        {
+            // Read the clock once: checking against one reading and subtracting another can yield a
+            // negative delay, which Task.Delay rejects (or treats as an infinite wait at exactly -1 ms).
+            var delay = shard.StartTime - DateTimeOffset.UtcNow;
+            if (delay > TimeSpan.Zero)
+            {
+                // Wait until the shard's start time
+                LogWaitingForShardStartTime(_logger, shard.Id, delay, shard.StartTime);
+                await Task.Delay(delay, cancellationToken);
+            }
+
+            LogBeginProcessingShard(_logger, shard.Id);
+
+            // Process all jobs in the shard
+            await foreach (var jobContext in shard.ConsumeScheduledJobsAsync().WithCancellation(cancellationToken))
+            {
+                // Wait for concurrency slot
+                await _jobConcurrencyLimiter.WaitAsync(cancellationToken);
+                // Start processing the job. RunJobAsync will release the semaphore when done and remove itself from the tasks dictionary
+                tasks[jobContext.Job.Id] = RunJobAsync(jobContext, shard, tasks, cancellationToken);
+            }
+
+            LogCompletedProcessingShard(_logger, shard.Id);
+        }
+        catch (OperationCanceledException)
+        {
+            LogShardCancelled(_logger, shard.Id);
+            throw;
+        }
+        finally
+        {
+            // Wait for all jobs to complete
+            await Task.WhenAll(tasks.Values);
+        }
+    }
+
+    /// <summary>
+    /// Delivers a single job to its target grain, then removes it from the shard on success
+    /// or consults the retry policy on failure.
+    /// </summary>
+    private async Task RunJobAsync(
+        IScheduledJobContext jobContext,
+        IJobShard shard,
+        ConcurrentDictionary<string, Task> runningTasks,
+        CancellationToken cancellationToken)
+    {
+        await Task.CompletedTask.ConfigureAwait(ConfigureAwaitOptions.ContinueOnCapturedContext | ConfigureAwaitOptions.ForceYielding);
+
+        try
+        {
+            LogExecutingJob(_logger, jobContext.Job.Id, jobContext.Job.Name, jobContext.Job.TargetGrainId, jobContext.Job.DueTime);
+
+            // NOTE(review): any generic type argument on this call chain is not visible in this view - confirm against the original file.
+            var target = _grainFactory
+                .GetGrain(jobContext.Job.TargetGrainId)
+                .AsReference();
+
+            await target.DeliverScheduledJobAsync(jobContext, cancellationToken);
+            await shard.RemoveJobAsync(jobContext.Job.Id, cancellationToken);
+
+            LogJobExecutedSuccessfully(_logger, jobContext.Job.Id, jobContext.Job.Name);
+        }
+        // Let cancellation propagate only when our own token was signalled; any other
+        // OperationCanceledException (for example a timeout inside the job) is a job failure.
+        catch (Exception ex) when (ex is not OperationCanceledException || !cancellationToken.IsCancellationRequested)
+        {
+            LogErrorExecutingJob(_logger, ex, jobContext.Job.Id);
+            var retryTime = _options.ShouldRetry(jobContext, ex);
+            if (retryTime is not null)
+            {
+                LogRetryingJob(_logger, jobContext.Job.Id, jobContext.Job.Name, retryTime.Value, jobContext.DequeueCount);
+                await shard.RetryJobLaterAsync(jobContext, retryTime.Value, cancellationToken);
+            }
+            else
+            {
+                LogJobFailedNoRetry(_logger, jobContext.Job.Id, jobContext.Job.Name, jobContext.DequeueCount);
+            }
+        }
+        finally
+        {
+            _jobConcurrencyLimiter.Release();
+            runningTasks.TryRemove(jobContext.Job.Id, out _);
+        }
+    }
+}
diff --git a/src/Orleans.TestingHost/Orleans.TestingHost.csproj b/src/Orleans.TestingHost/Orleans.TestingHost.csproj index 836ead13688..56fff77461d 100644 ---
a/src/Orleans.TestingHost/Orleans.TestingHost.csproj +++ b/src/Orleans.TestingHost/Orleans.TestingHost.csproj @@ -12,6 +12,7 @@ + diff --git a/test/DefaultCluster.Tests/InMemoryScheduledJobTests.cs b/test/DefaultCluster.Tests/InMemoryScheduledJobTests.cs new file mode 100644 index 00000000000..a11022460f3 --- /dev/null +++ b/test/DefaultCluster.Tests/InMemoryScheduledJobTests.cs @@ -0,0 +1,68 @@ +using System.Threading.Tasks; +using Tester.ScheduledJobs; +using TestExtensions; +using Xunit; + +namespace DefaultCluster.Tests; + +public class InMemoryScheduledJobsTests : HostedTestClusterEnsureDefaultStarted +{ + private readonly ScheduledJobTestsRunner _runner; + + public InMemoryScheduledJobsTests(DefaultClusterFixture fixture) : base(fixture) + { + _runner = new ScheduledJobTestsRunner(this.GrainFactory); + } + + [Fact, TestCategory("BVT"), TestCategory("ScheduledJobs")] + public Task ScheduledJobGrain() + => _runner.ScheduledJobGrain(); + + [Fact, TestCategory("BVT"), TestCategory("ScheduledJobs")] + public Task JobExecutionOrder() + => _runner.JobExecutionOrder(); + + [Fact, TestCategory("BVT"), TestCategory("ScheduledJobs")] + public Task PastDueTime() + => _runner.PastDueTime(); + + [Fact, TestCategory("BVT"), TestCategory("ScheduledJobs")] + public Task JobWithMetadata() + => _runner.JobWithMetadata(); + + [Fact, TestCategory("BVT"), TestCategory("ScheduledJobs")] + public Task MultipleGrains() + => _runner.MultipleGrains(); + + [Fact, TestCategory("BVT"), TestCategory("ScheduledJobs")] + public Task DuplicateJobNames() + => _runner.DuplicateJobNames(); + + [Fact, TestCategory("BVT"), TestCategory("ScheduledJobs")] + public Task CancelNonExistentJob() + => _runner.CancelNonExistentJob(); + + [Fact, TestCategory("BVT"), TestCategory("ScheduledJobs")] + public Task CancelAlreadyExecutedJob() + => _runner.CancelAlreadyExecutedJob(); + + [Fact, TestCategory("BVT"), TestCategory("ScheduledJobs")] + public Task ConcurrentScheduling() + => 
_runner.ConcurrentScheduling(); + + [Fact, TestCategory("BVT"), TestCategory("ScheduledJobs")] + public Task JobPropertiesVerification() + => _runner.JobPropertiesVerification(); + + [Fact, TestCategory("BVT"), TestCategory("ScheduledJobs")] + public Task DequeueCount() + => _runner.DequeueCount(); + + [Fact, TestCategory("BVT"), TestCategory("ScheduledJobs")] + public Task ScheduleJobOnAnotherGrain() + => _runner.ScheduleJobOnAnotherGrain(); + + [Fact, TestCategory("BVT"), TestCategory("ScheduledJobs")] + public Task JobRetry() + => _runner.JobRetry(); +} diff --git a/test/Extensions/TesterAzureUtils/AzureStorageOperationOptionsExtensions.cs b/test/Extensions/TesterAzureUtils/AzureStorageOperationOptionsExtensions.cs index 8510017487d..9fc2526ecf6 100644 --- a/test/Extensions/TesterAzureUtils/AzureStorageOperationOptionsExtensions.cs +++ b/test/Extensions/TesterAzureUtils/AzureStorageOperationOptionsExtensions.cs @@ -1,6 +1,7 @@ using Azure.Core.Diagnostics; using Azure.Data.Tables; using Azure.Identity; +using Azure.Storage.Blobs; using TestExtensions; namespace Tester.AzureUtils @@ -58,6 +59,20 @@ public static Orleans.Configuration.AzureBlobStorageOptions ConfigureTestDefault return options; } + public static AzureStorageJobShardOptions ConfigureTestDefaults(this AzureStorageJobShardOptions options) + { + if (TestDefaultConfiguration.UseAadAuthentication) + { + options.BlobServiceClient = new(TestDefaultConfiguration.DataBlobUri, TestDefaultConfiguration.TokenCredential); + } + else + { + options.BlobServiceClient = new(TestDefaultConfiguration.DataConnectionString); + } + + return options; + } + public static Orleans.Configuration.AzureQueueOptions ConfigureTestDefaults(this Orleans.Configuration.AzureQueueOptions options) { if (TestDefaultConfiguration.UseAadAuthentication) @@ -86,4 +101,4 @@ public static Orleans.Configuration.AzureBlobLeaseProviderOptions ConfigureTestD return options; } } -} \ No newline at end of file +} diff --git 
a/test/Extensions/TesterAzureUtils/ScheduledJobs/AzureStorageBlobScheduledJobsTests.cs b/test/Extensions/TesterAzureUtils/ScheduledJobs/AzureStorageBlobScheduledJobsTests.cs new file mode 100644 index 00000000000..129de88638a --- /dev/null +++ b/test/Extensions/TesterAzureUtils/ScheduledJobs/AzureStorageBlobScheduledJobsTests.cs @@ -0,0 +1,91 @@ +using System; +using System.Threading.Tasks; +using Microsoft.Extensions.DependencyInjection; +using Orleans.Configuration; +using Orleans.TestingHost; +using Tester; +using Tester.ScheduledJobs; +using TestExtensions; +using Xunit; + +namespace Tester.AzureUtils.ScheduledJobs; + +public class AzureStorageBlobScheduledJobsTests : TestClusterPerTest +{ + private ScheduledJobTestsRunner _runner; + + protected override void CheckPreconditionsOrThrow() => TestUtils.CheckForAzureStorage(); + + public override async Task InitializeAsync() + { + await base.InitializeAsync(); + _runner = new ScheduledJobTestsRunner(this.GrainFactory); + } + + protected override void ConfigureTestCluster(TestClusterBuilder builder) + { + builder.AddSiloBuilderConfigurator(); + } + + public class SiloHostConfigurator : ISiloConfigurator + { + public void Configure(ISiloBuilder hostBuilder) + { + hostBuilder + .UseAzureBlobScheduledJobs(options => options.ConfigureTestDefaults()) + .AddMemoryGrainStorageAsDefault(); + } + } + + [SkippableFact, TestCategory("Azure"), TestCategory("ScheduledJobs")] + public Task ScheduledJobGrain() + => _runner.ScheduledJobGrain(); + + [SkippableFact, TestCategory("Azure"), TestCategory("ScheduledJobs")] + public Task JobExecutionOrder() + => _runner.JobExecutionOrder(); + + [SkippableFact, TestCategory("Azure"), TestCategory("ScheduledJobs")] + public Task PastDueTime() + => _runner.PastDueTime(); + + [SkippableFact, TestCategory("Azure"), TestCategory("ScheduledJobs")] + public Task JobWithMetadata() + => _runner.JobWithMetadata(); + + [SkippableFact, TestCategory("Azure"), TestCategory("ScheduledJobs")] + public 
Task MultipleGrains() + => _runner.MultipleGrains(); + + [SkippableFact, TestCategory("Azure"), TestCategory("ScheduledJobs")] + public Task DuplicateJobNames() + => _runner.DuplicateJobNames(); + + [SkippableFact, TestCategory("Azure"), TestCategory("ScheduledJobs")] + public Task CancelNonExistentJob() + => _runner.CancelNonExistentJob(); + + [SkippableFact, TestCategory("Azure"), TestCategory("ScheduledJobs")] + public Task CancelAlreadyExecutedJob() + => _runner.CancelAlreadyExecutedJob(); + + [SkippableFact, TestCategory("Azure"), TestCategory("ScheduledJobs")] + public Task ConcurrentScheduling() + => _runner.ConcurrentScheduling(); + + [SkippableFact, TestCategory("Azure"), TestCategory("ScheduledJobs")] + public Task JobPropertiesVerification() + => _runner.JobPropertiesVerification(); + + [SkippableFact, TestCategory("Azure"), TestCategory("ScheduledJobs")] + public Task DequeueCount() + => _runner.DequeueCount(); + + [SkippableFact, TestCategory("Azure"), TestCategory("ScheduledJobs")] + public Task ScheduleJobOnAnotherGrain() + => _runner.ScheduleJobOnAnotherGrain(); + + [SkippableFact, TestCategory("Azure"), TestCategory("ScheduledJobs")] + public Task JobRetry() + => _runner.JobRetry(); +} diff --git a/test/Extensions/TesterAzureUtils/ScheduledJobs/AzureStorageJobShardBatchingTests.cs b/test/Extensions/TesterAzureUtils/ScheduledJobs/AzureStorageJobShardBatchingTests.cs new file mode 100644 index 00000000000..9590b04673a --- /dev/null +++ b/test/Extensions/TesterAzureUtils/ScheduledJobs/AzureStorageJobShardBatchingTests.cs @@ -0,0 +1,322 @@ +using System; +using System.Collections.Generic; +using System.Collections.Immutable; +using System.Linq; +using System.Net; +using System.Threading; +using System.Threading.Tasks; +using Microsoft.Extensions.Logging.Abstractions; +using Microsoft.Extensions.Options; +using Orleans.Hosting; +using Orleans.Runtime; +using Orleans.ScheduledJobs; +using Orleans.ScheduledJobs.AzureStorage; +using Tester.AzureUtils; 
+using Xunit; + +namespace Tester.AzureUtils.ScheduledJobs; + +/// +/// Azure Storage-specific tests for job shard batching functionality. +/// These tests verify Azure-specific batching behaviors that don't apply to all providers. +/// +[TestCategory("ScheduledJobs")] +public class AzureStorageJobShardBatchingTests : AzureStorageBasicTests, IAsyncDisposable +{ + private readonly IDictionary _metadata = new Dictionary + { + { "CreatedBy", "UnitTest" }, + { "Purpose", "Testing" } + }; + + internal InMemoryClusterMembershipService MembershipService { get; } + + internal IOptions StorageOptions { get; } + + public AzureStorageJobShardBatchingTests() + { + MembershipService = new InMemoryClusterMembershipService(); + StorageOptions = Options.Create(new AzureStorageJobShardOptions()); + StorageOptions.Value.ConfigureTestDefaults(); + StorageOptions.Value.ContainerName = "test-batch-container-" + Guid.NewGuid().ToString("N"); + } + + public async ValueTask DisposeAsync() + { + // Cleanup storage container + var client = StorageOptions.Value.BlobServiceClient; + var container = client.GetBlobContainerClient(StorageOptions.Value.ContainerName); + await container.DeleteIfExistsAsync(); + } + + public class TestLocalSiloDetails : ILocalSiloDetails + { + public TestLocalSiloDetails(SiloAddress siloAddress) + { + SiloAddress = siloAddress; + } + + public string Name => SiloAddress.ToString(); + + public string ClusterId => "TestCluster"; + + public string DnsHostName => SiloAddress.ToString(); + + public SiloAddress SiloAddress { get; } + + public SiloAddress GatewayAddress => SiloAddress; + } + + internal AzureStorageJobShardManager CreateManager(SiloAddress siloAddress) + { + var localSiloDetails = new TestLocalSiloDetails(siloAddress); + return new AzureStorageJobShardManager(localSiloDetails, StorageOptions, MembershipService, NullLoggerFactory.Instance); + } + + internal void SetSiloStatus(SiloAddress siloAddress, SiloStatus status) + { + 
MembershipService.SetSiloStatus(siloAddress, status); + } + + [SkippableFact, TestCategory("Azure"), TestCategory("Functional")] + public async Task AzureStorageJobShard_MultipleOperationsBatched() + { + // Configure batching options to batch multiple operations + StorageOptions.Value.MinBatchSize = 5; + StorageOptions.Value.MaxBatchSize = 50; + StorageOptions.Value.BatchFlushInterval = TimeSpan.FromMilliseconds(100); + + var localAddress = SiloAddress.New(new IPEndPoint(IPAddress.Loopback, 5000), 0); + SetSiloStatus(localAddress, SiloStatus.Active); + var manager = CreateManager(localAddress); + + var date = DateTime.UtcNow; + var shard = await manager.CreateShardAsync(date, date.AddHours(1), _metadata, CancellationToken.None); + + // Schedule 10 jobs rapidly to trigger batching + var tasks = new List(); + for (int i = 0; i < 10; i++) + { + tasks.Add(shard.TryScheduleJobAsync(GrainId.Create("type", $"target{i}"), $"job{i}", date.AddMilliseconds(i*10), null, CancellationToken.None)); + } + + await Task.WhenAll(tasks); + + // Wait for batches to flush + await Task.Delay(TimeSpan.FromMilliseconds(300)); + + // Verify batching occurred - should have fewer committed blocks than individual operations + var azureShard = (AzureStorageJobShard)shard; + Assert.True(azureShard.CommitedBlockCount < 10, $"Expected batching to reduce block count, but got {azureShard.CommitedBlockCount}"); + + // Verify all jobs were persisted by marking silo as dead and reassigning + SetSiloStatus(localAddress, SiloStatus.Dead); + var newSiloAddress = SiloAddress.New(new IPEndPoint(IPAddress.Loopback, 5000), 1); + SetSiloStatus(newSiloAddress, SiloStatus.Active); + + var newManager = CreateManager(newSiloAddress); + var shards = await newManager.AssignJobShardsAsync(DateTime.UtcNow.AddHours(1), CancellationToken.None); + Assert.Single(shards); + + var consumedJobs = new List(); + var cts = new CancellationTokenSource(TimeSpan.FromSeconds(20)); + await foreach (var jobCtx in 
shards[0].ConsumeScheduledJobsAsync().WithCancellation(cts.Token)) + { + consumedJobs.Add(jobCtx.Job.Name); + await shards[0].RemoveJobAsync(jobCtx.Job.Id, CancellationToken.None); + } + + Assert.Equal(10, consumedJobs.Count); + await newManager.UnregisterShardAsync(shards[0], CancellationToken.None); + } + + [SkippableFact, TestCategory("Azure"), TestCategory("Functional")] + public async Task AzureStorageJobShard_PartialBatchFlushesOnTimeout() + { + // Configure batching to require 10 operations but with a short timeout + StorageOptions.Value.MinBatchSize = 10; + StorageOptions.Value.MaxBatchSize = 100; + StorageOptions.Value.BatchFlushInterval = TimeSpan.FromMilliseconds(200); + + var localAddress = SiloAddress.New(new IPEndPoint(IPAddress.Loopback, 5000), 0); + SetSiloStatus(localAddress, SiloStatus.Active); + var manager = CreateManager(localAddress); + + var date = DateTime.UtcNow; + var shard = await manager.CreateShardAsync(date, date.AddHours(1), _metadata, CancellationToken.None); + + // Schedule only 3 jobs (less than MinBatchSize of 10) + var tasks = new Task[3]; + tasks[0] = shard.TryScheduleJobAsync(GrainId.Create("type", "target1"), "job1", date.AddSeconds(1), null, CancellationToken.None); + tasks[1] = shard.TryScheduleJobAsync(GrainId.Create("type", "target2"), "job2", date.AddSeconds(2), null, CancellationToken.None); + tasks[2] = shard.TryScheduleJobAsync(GrainId.Create("type", "target3"), "job3", date.AddSeconds(3), null, CancellationToken.None); + + await Task.WhenAll(tasks); + + // Verify that the partial batch was flushed - should have 1 committed block + var azureShard = (AzureStorageJobShard)shard; + Assert.Equal(1, azureShard.CommitedBlockCount); + + // Verify jobs were persisted despite not reaching MinBatchSize + SetSiloStatus(localAddress, SiloStatus.Dead); + var newSiloAddress = SiloAddress.New(new IPEndPoint(IPAddress.Loopback, 5000), 1); + SetSiloStatus(newSiloAddress, SiloStatus.Active); + + var newManager = 
CreateManager(newSiloAddress); + var shards = await newManager.AssignJobShardsAsync(DateTime.UtcNow.AddHours(1), CancellationToken.None); + Assert.Single(shards); + + var consumedJobs = new List(); + var cts = new CancellationTokenSource(TimeSpan.FromSeconds(20)); + await foreach (var jobCtx in shards[0].ConsumeScheduledJobsAsync().WithCancellation(cts.Token)) + { + consumedJobs.Add(jobCtx.Job.Name); + await shards[0].RemoveJobAsync(jobCtx.Job.Id, CancellationToken.None); + } + + Assert.Equal(3, consumedJobs.Count); + await newManager.UnregisterShardAsync(shards[0], CancellationToken.None); + } + + [SkippableFact, TestCategory("Azure"), TestCategory("Functional")] + public async Task AzureStorageJobShard_MaxBatchSizeEnforced() + { + // Configure batching with a small max batch size + StorageOptions.Value.MinBatchSize = 1; + StorageOptions.Value.MaxBatchSize = 20; + StorageOptions.Value.BatchFlushInterval = TimeSpan.FromMilliseconds(50); + + var localAddress = SiloAddress.New(new IPEndPoint(IPAddress.Loopback, 5000), 0); + SetSiloStatus(localAddress, SiloStatus.Active); + var manager = CreateManager(localAddress); + + var date = DateTime.UtcNow; + var shard = await manager.CreateShardAsync(date, date.AddHours(1), _metadata, CancellationToken.None); + + // Schedule 50 jobs rapidly (exceeds MaxBatchSize of 20) + var tasks = new List(); + for (int i = 0; i < 50; i++) + { + tasks.Add(shard.TryScheduleJobAsync(GrainId.Create("type", $"target{i}"), $"job{i}", date.AddMilliseconds(i), null, CancellationToken.None)); + } + + await Task.WhenAll(tasks); + + // Wait for all batches to flush + await Task.Delay(TimeSpan.FromMilliseconds(500)); + + // Verify multiple batches were created due to MaxBatchSize limit + // With 50 jobs and MaxBatchSize=20, expect at least 3 blocks (50/20 = 2.5, rounded up) + var azureShard = (AzureStorageJobShard)shard; + Assert.True(azureShard.CommitedBlockCount >= 3, $"Expected at least 3 blocks for 50 jobs with MaxBatchSize=20, but got 
{azureShard.CommitedBlockCount}"); + + // Verify all jobs were persisted (should be split into multiple batches) + SetSiloStatus(localAddress, SiloStatus.Dead); + var newSiloAddress = SiloAddress.New(new IPEndPoint(IPAddress.Loopback, 5000), 1); + SetSiloStatus(newSiloAddress, SiloStatus.Active); + + var newManager = CreateManager(newSiloAddress); + var shards = await newManager.AssignJobShardsAsync(DateTime.UtcNow.AddHours(1), CancellationToken.None); + Assert.Single(shards); + + var consumedJobs = new List(); + var cts = new CancellationTokenSource(TimeSpan.FromSeconds(30)); + await foreach (var jobCtx in shards[0].ConsumeScheduledJobsAsync().WithCancellation(cts.Token)) + { + consumedJobs.Add(jobCtx.Job.Name); + await shards[0].RemoveJobAsync(jobCtx.Job.Id, CancellationToken.None); + } + + Assert.Equal(50, consumedJobs.Count); + await newManager.UnregisterShardAsync(shards[0], CancellationToken.None); + } + + [SkippableFact, TestCategory("Azure"), TestCategory("Functional")] + public async Task AzureStorageJobShard_MetadataOperationsBreakBatches() + { + // Configure batching to require large batch + StorageOptions.Value.MinBatchSize = 10; + StorageOptions.Value.MaxBatchSize = 100; + StorageOptions.Value.BatchFlushInterval = TimeSpan.FromSeconds(5); + + var localAddress = SiloAddress.New(new IPEndPoint(IPAddress.Loopback, 5000), 0); + SetSiloStatus(localAddress, SiloStatus.Active); + var manager = CreateManager(localAddress); + + var date = DateTime.UtcNow; + var shard = await manager.CreateShardAsync(date, date.AddHours(1), _metadata, CancellationToken.None); + + // Schedule 5 jobs (less than MinBatchSize) + var tasks = new List(); + for (int i = 0; i < 5; i++) + { + tasks.Add(shard.TryScheduleJobAsync(GrainId.Create("type", $"target{i}"), $"job{i}", date.AddMilliseconds(i), null, CancellationToken.None)); + } + + // Give operations time to queue + await Task.Delay(50); + + // Verify no blocks committed yet (batch still pending) + var azureShard = 
(AzureStorageJobShard)shard; + var blockCountBefore = azureShard.CommitedBlockCount; + + // Update metadata (should flush pending batch and process immediately) + var newMetadata = new Dictionary(shard.Metadata) { ["Updated"] = "true" }; + await azureShard.UpdateBlobMetadata(newMetadata, CancellationToken.None); + + Assert.All(tasks, t => Assert.True(t.IsCompletedSuccessfully, "Expected all job scheduling tasks to complete successfully")); + Assert.True(azureShard.CommitedBlockCount > blockCountBefore, "Expected metadata update to flush pending batch"); + + // Verify metadata was updated + var props = await azureShard.BlobClient.GetPropertiesAsync(); + Assert.True(props.Value.Metadata.ContainsKey("Updated")); + Assert.Equal("true", props.Value.Metadata["Updated"]); + + // Verify jobs were persisted (even though batch was incomplete) + SetSiloStatus(localAddress, SiloStatus.Dead); + var newSiloAddress = SiloAddress.New(new IPEndPoint(IPAddress.Loopback, 5000), 1); + SetSiloStatus(newSiloAddress, SiloStatus.Active); + + // Reconfigure batching to make test faster + StorageOptions.Value.MinBatchSize = 1; + StorageOptions.Value.MaxBatchSize = 1; + StorageOptions.Value.BatchFlushInterval = TimeSpan.FromMilliseconds(100); + + var newManager = CreateManager(newSiloAddress); + var shards = await newManager.AssignJobShardsAsync(DateTime.UtcNow.AddHours(1), CancellationToken.None); + Assert.Single(shards); + + var consumedJobs = new List(); + var cts = new CancellationTokenSource(TimeSpan.FromSeconds(20)); + await foreach (var jobCtx in shards[0].ConsumeScheduledJobsAsync().WithCancellation(cts.Token)) + { + consumedJobs.Add(jobCtx.Job.Name); + await shards[0].RemoveJobAsync(jobCtx.Job.Id, CancellationToken.None); + } + + Assert.Equal(5, consumedJobs.Count); + await newManager.UnregisterShardAsync(shards[0], CancellationToken.None); + } + + public class InMemoryClusterMembershipService : IClusterMembershipService + { + private readonly Dictionary _silos = new(); + private 
int _version = 0; + + public ClusterMembershipSnapshot CurrentSnapshot => + new ClusterMembershipSnapshot(_silos.ToImmutableDictionary(), new MembershipVersion(_version)); + + public IAsyncEnumerable MembershipUpdates => throw new NotImplementedException(); + + public void SetSiloStatus(SiloAddress address, SiloStatus status) + { + _silos[address] = new ClusterMember(address, status, address.ToParsableString()); + _version++; + } + + public ValueTask Refresh(MembershipVersion minimumVersion = default, CancellationToken cancellationToken = default) => + ValueTask.CompletedTask; + + public Task TryKill(SiloAddress siloAddress) => throw new NotImplementedException(); + } +} diff --git a/test/Extensions/TesterAzureUtils/ScheduledJobs/AzureStorageJobShardManagerTestFixture.cs b/test/Extensions/TesterAzureUtils/ScheduledJobs/AzureStorageJobShardManagerTestFixture.cs new file mode 100644 index 00000000000..3126bdfe0e8 --- /dev/null +++ b/test/Extensions/TesterAzureUtils/ScheduledJobs/AzureStorageJobShardManagerTestFixture.cs @@ -0,0 +1,46 @@ +using System; +using System.Collections.Generic; +using System.Threading.Tasks; +using Microsoft.Extensions.Logging.Abstractions; +using Microsoft.Extensions.Options; +using Orleans.Hosting; +using Orleans.Runtime; +using Orleans.ScheduledJobs; +using Orleans.ScheduledJobs.AzureStorage; +using Tester.AzureUtils; +using Tester.ScheduledJobs; + +namespace Orleans.Tests.ScheduledJobs.AzureStorage; + +/// +/// Azure Storage implementation of . +/// Provides the infrastructure needed to run shared job shard manager tests against Azure Storage. 
+/// +internal sealed class AzureStorageJobShardManagerTestFixture : IJobShardManagerTestFixture +{ + private readonly IOptions _storageOptions; + + public AzureStorageJobShardManagerTestFixture() + { + _storageOptions = Options.Create(new AzureStorageJobShardOptions()); + _storageOptions.Value.ConfigureTestDefaults(); + _storageOptions.Value.ContainerName = "test-container-" + Guid.NewGuid().ToString("N"); + } + + public JobShardManager CreateManager(ILocalSiloDetails localSiloDetails, IClusterMembershipService membershipService) + { + return new AzureStorageJobShardManager( + localSiloDetails, + _storageOptions, + membershipService, + NullLoggerFactory.Instance); + } + + public async ValueTask DisposeAsync() + { + // Cleanup storage container + var client = _storageOptions.Value.BlobServiceClient; + var container = client.GetBlobContainerClient(_storageOptions.Value.ContainerName); + await container.DeleteIfExistsAsync(); + } +} diff --git a/test/Extensions/TesterAzureUtils/ScheduledJobs/AzureStorageJobShardManagerTests.cs b/test/Extensions/TesterAzureUtils/ScheduledJobs/AzureStorageJobShardManagerTests.cs new file mode 100644 index 00000000000..3ba5b944414 --- /dev/null +++ b/test/Extensions/TesterAzureUtils/ScheduledJobs/AzureStorageJobShardManagerTests.cs @@ -0,0 +1,149 @@ +using System; +using System.Collections.Generic; +using System.Collections.Immutable; +using System.Linq; +using System.Net; +using System.Text; +using System.Threading; +using System.Threading.Tasks; +using FluentAssertions; +using Microsoft.Extensions.Logging; +using Microsoft.Extensions.Logging.Abstractions; +using Microsoft.Extensions.Options; +using Orleans.Internal; +using Orleans.ScheduledJobs; +using Orleans.ScheduledJobs.AzureStorage; +using Orleans.Tests.ScheduledJobs.AzureStorage; +using Tester.ScheduledJobs; +using Xunit; +using Xunit.Sdk; + +namespace Tester.AzureUtils.ScheduledJobs; + +/// +/// Azure Storage-specific tests for job shard manager functionality. 
+/// Common tests are delegated to for reusability across providers. +/// Provider-specific tests (e.g., batching) remain here. +/// +[TestCategory("ScheduledJobs")] +public class AzureStorageJobShardManagerTests : AzureStorageBasicTests, IAsyncDisposable +{ + private readonly AzureStorageJobShardManagerTestFixture _fixture; + private readonly JobShardManagerTestsRunner _runner; + + internal IOptions StorageOptions { get; } + + public AzureStorageJobShardManagerTests() + { + StorageOptions = Options.Create(new AzureStorageJobShardOptions()); + StorageOptions.Value.ConfigureTestDefaults(); + StorageOptions.Value.ContainerName = "test-container-" + Guid.NewGuid().ToString("N"); + + // Create fixture and runner for common tests + _fixture = new AzureStorageJobShardManagerTestFixture(); + _runner = new JobShardManagerTestsRunner(_fixture); + } + + public async ValueTask DisposeAsync() + { + // Cleanup storage container + var client = StorageOptions.Value.BlobServiceClient; + var container = client.GetBlobContainerClient(StorageOptions.Value.ContainerName); + await container.DeleteIfExistsAsync(); + + // Cleanup fixture + await _fixture.DisposeAsync(); + } + + #region Common Tests (Delegated to Runner) + + /// + /// Tests basic shard creation and assignment workflow. + /// This test is delegated to the runner for reuse across providers. + /// + [SkippableFact, TestCategory("Azure"), TestCategory("Functional")] + public Task AzureStorageJobShardManager_Creation_Assignation() + => _runner.ShardCreationAndAssignment(); + + /// + /// Tests reading and consuming jobs from a frozen shard after ownership transfer. + /// This test is delegated to the runner for reuse across providers. + /// + [SkippableFact, TestCategory("Azure"), TestCategory("Functional")] + public Task AzureStorageJobShardManager_ReadFrozenShard() + => _runner.ReadFrozenShard(); + + /// + /// Tests consuming jobs from a live shard. + /// This test is delegated to the runner for reuse across providers. 
+ /// + [SkippableFact, TestCategory("Azure"), TestCategory("Functional")] + public Task AzureStorageJobShardManager_LiveShard() + => _runner.LiveShard(); + + /// + /// Tests job metadata persistence across ownership transfers. + /// This test is delegated to the runner for reuse across providers. + /// + [SkippableFact, TestCategory("Azure"), TestCategory("Functional")] + public Task AzureStorageJobShardManager_JobMetadata() + => _runner.JobMetadata(); + + /// + /// Tests concurrent shard assignment to verify ownership conflict resolution. + /// This test is delegated to the runner for reuse across providers. + /// + [SkippableFact, TestCategory("Azure"), TestCategory("Functional")] + public Task AzureStorageJobShardManager_ConcurrentShardAssignment_OwnershipConflicts() + => _runner.ConcurrentShardAssignment_OwnershipConflicts(); + + /// + /// Tests shard metadata preservation across ownership transfers. + /// This test is delegated to the runner for reuse across providers. + /// + [SkippableFact, TestCategory("Azure"), TestCategory("Functional")] + public Task AzureStorageJobShardManager_ShardMetadataMerge() + => _runner.ShardMetadataMerge(); + + #endregion + + /// + /// Tests stopping shard processing and verifying jobs remain for reassignment. + /// This test is delegated to the runner for reuse across providers. + /// + [SkippableFact, TestCategory("Azure"), TestCategory("Functional")] + public Task AzureStorageJobShardManager_StopProcessingShard() + => _runner.StopProcessingShard(); + + /// + /// Tests retrying a job with a new due time. + /// This test is delegated to the runner for reuse across providers. + /// + [SkippableFact, TestCategory("Azure"), TestCategory("Functional")] + public Task AzureStorageJobShardManager_RetryJobLater() + => _runner.RetryJobLater(); + + /// + /// Tests job cancellation before and during processing. + /// This test is delegated to the runner for reuse across providers. 
+ /// + [SkippableFact, TestCategory("Azure"), TestCategory("Functional")] + public Task AzureStorageJobShardManager_JobCancellation() + => _runner.JobCancellation(); + + /// + /// Tests that multiple shard registrations with the same time range produce unique IDs. + /// This test is delegated to the runner for reuse across providers. + /// + [SkippableFact, TestCategory("Azure"), TestCategory("Functional")] + public Task AzureStorageJobShardManager_ShardRegistrationRetry_IdCollisions() + => _runner.ShardRegistrationRetry_IdCollisions(); + + /// + /// Tests that unregistering a shard with remaining jobs preserves the shard for reassignment. + /// This test is delegated to the runner for reuse across providers. + /// + [SkippableFact, TestCategory("Azure"), TestCategory("Functional")] + public Task AzureStorageJobShardManager_UnregisterShard_WithJobsRemaining() + => _runner.UnregisterShard_WithJobsRemaining(); +} \ No newline at end of file diff --git a/test/Extensions/TesterAzureUtils/ScheduledJobs/NetstringJsonSerializerTests.cs b/test/Extensions/TesterAzureUtils/ScheduledJobs/NetstringJsonSerializerTests.cs new file mode 100644 index 00000000000..349028428a1 --- /dev/null +++ b/test/Extensions/TesterAzureUtils/ScheduledJobs/NetstringJsonSerializerTests.cs @@ -0,0 +1,445 @@ +using System; +using System.Buffers; +using System.Collections.Generic; +using System.IO; +using System.Linq; +using System.Text; +using System.Text.Json; +using System.Threading.Tasks; +using FluentAssertions; +using Orleans.Runtime; +using Orleans.ScheduledJobs.AzureStorage; +using Xunit; + +namespace Tester.AzureUtils.ScheduledJobs; + +[TestCategory("ScheduledJobs"), TestCategory("BVT")] +public class NetstringJsonSerializerTests +{ + private static byte[] EncodeToBytes(JobOperation operation) + { + var stream = new MemoryStream(); + NetstringJsonSerializer.Encode(operation, stream, JobOperationJsonContext.Default.JobOperation); + return stream.ToArray(); + } + [Fact] + public void 
Encode_RemoveOperation_ProducesCorrectFormat() + { + var operation = JobOperation.CreateRemoveOperation("job123"); + var result = EncodeToBytes(operation); + var resultString = Encoding.UTF8.GetString(result); + + resultString.Should().EndWith("\n"); + resultString.Should().Match("*:*\n"); + resultString.Should().Contain("\"type\":1"); + resultString.Should().Contain("\"id\":\"job123\""); + } + + [Fact] + public void Encode_AddOperation_ProducesCorrectFormat() + { + var dueTime = new DateTimeOffset(2025, 10, 31, 12, 0, 0, TimeSpan.Zero); + var grainId = GrainId.Create("test", "grain1"); + var operation = JobOperation.CreateAddOperation("job456", "TestJob", dueTime, grainId, null); + var result = EncodeToBytes(operation); + var resultString = Encoding.UTF8.GetString(result); + + resultString.Should().EndWith("\n"); + resultString.Should().Match("*:*\n"); + resultString.Should().Contain("\"id\":\"job456\""); + resultString.Should().Contain("\"name\":\"TestJob\""); + } + + [Fact] + public void Encode_RetryOperation_ProducesCorrectFormat() + { + var dueTime = new DateTimeOffset(2025, 10, 31, 12, 0, 0, TimeSpan.Zero); + var operation = JobOperation.CreateRetryOperation("job789", dueTime); + var result = EncodeToBytes(operation); + var resultString = Encoding.UTF8.GetString(result); + + resultString.Should().EndWith("\n"); + resultString.Should().Match("*:*\n"); + resultString.Should().Contain("\"type\":2"); + resultString.Should().Contain("\"id\":\"job789\""); + } + + [Fact] + public void Encode_AddOperationWithMetadata_ProducesCorrectFormat() + { + var dueTime = new DateTimeOffset(2025, 10, 31, 12, 0, 0, TimeSpan.Zero); + var grainId = GrainId.Create("test", "grain1"); + var metadata = new Dictionary { ["key1"] = "value1", ["key2"] = "value2" }; + var operation = JobOperation.CreateAddOperation("job999", "MetaJob", dueTime, grainId, metadata); + var result = EncodeToBytes(operation); + var resultString = Encoding.UTF8.GetString(result); + + 
resultString.Should().EndWith("\n"); + resultString.Should().Contain("\"metadata\""); + resultString.Should().Contain("\"key1\":\"value1\""); + resultString.Should().Contain("\"key2\":\"value2\""); + } + + [Fact] + public void Encode_VerifiesNetstringFormat() + { + var operation = JobOperation.CreateRemoveOperation("test"); + var result = EncodeToBytes(operation); + var resultString = Encoding.UTF8.GetString(result); + + var parts = resultString.Split(':', 2); + parts.Should().HaveCount(2); + + var lengthStr = parts[0]; + lengthStr.Should().HaveLength(6, "length prefix should be 6 hex digits"); + int.TryParse(lengthStr, System.Globalization.NumberStyles.HexNumber, null, out var length).Should().BeTrue("length should be valid hex"); + length.Should().BeGreaterThan(0); + + var dataAndNewline = parts[1]; + dataAndNewline.Should().EndWith("\n"); + + var jsonData = dataAndNewline[..^1]; + var jsonBytes = Encoding.UTF8.GetBytes(jsonData); + jsonBytes.Length.Should().Be(length, "JSON data length should match the hex length prefix"); + } + + [Fact] + public async Task DecodeAsync_RemoveOperation_DecodesCorrectly() + { + var operation = JobOperation.CreateRemoveOperation("job123"); + var encoded = EncodeToBytes(operation); + var stream = new MemoryStream(encoded); + + var results = new List(); + await foreach (var item in NetstringJsonSerializer.DecodeAsync(stream, JobOperationJsonContext.Default.JobOperation, CancellationToken.None)) + { + results.Add(item); + } + + results.Should().HaveCount(1); + results[0].Type.Should().Be(JobOperation.OperationType.Remove); + results[0].Id.Should().Be("job123"); + } + + [Fact] + public async Task DecodeAsync_AddOperation_DecodesCorrectly() + { + var dueTime = new DateTimeOffset(2025, 10, 31, 12, 0, 0, TimeSpan.Zero); + var grainId = GrainId.Create("test", "grain1"); + var operation = JobOperation.CreateAddOperation("job456", "TestJob", dueTime, grainId, null); + var encoded = EncodeToBytes(operation); + var stream = new 
MemoryStream(encoded); + + var results = new List(); + await foreach (var item in NetstringJsonSerializer.DecodeAsync(stream, JobOperationJsonContext.Default.JobOperation, CancellationToken.None)) + { + results.Add(item); + } + + results.Should().HaveCount(1); + results[0].Type.Should().Be(JobOperation.OperationType.Add); + results[0].Id.Should().Be("job456"); + results[0].Name.Should().Be("TestJob"); + results[0].DueTime.Should().Be(dueTime); + results[0].TargetGrainId.Should().Be(grainId); + } + + [Fact] + public async Task DecodeAsync_MultipleOperations_DecodesCorrectly() + { + var dueTime = new DateTimeOffset(2025, 10, 31, 12, 0, 0, TimeSpan.Zero); + var grainId = GrainId.Create("test", "grain1"); + var op1 = JobOperation.CreateAddOperation("job1", "Job1", dueTime, grainId, null); + var op2 = JobOperation.CreateRemoveOperation("job2"); + var op3 = JobOperation.CreateRetryOperation("job3", dueTime.AddHours(1)); + + var stream = new MemoryStream(); + await stream.WriteAsync(EncodeToBytes(op1)); + await stream.WriteAsync(EncodeToBytes(op2)); + await stream.WriteAsync(EncodeToBytes(op3)); + stream.Position = 0; + + var results = new List(); + await foreach (var item in NetstringJsonSerializer.DecodeAsync(stream, JobOperationJsonContext.Default.JobOperation, CancellationToken.None)) + { + results.Add(item); + } + + results.Should().HaveCount(3); + results[0].Type.Should().Be(JobOperation.OperationType.Add); + results[0].Id.Should().Be("job1"); + results[1].Type.Should().Be(JobOperation.OperationType.Remove); + results[1].Id.Should().Be("job2"); + results[2].Type.Should().Be(JobOperation.OperationType.Retry); + results[2].Id.Should().Be("job3"); + } + + [Fact] + public async Task DecodeAsync_AddOperationWithMetadata_DecodesCorrectly() + { + var dueTime = new DateTimeOffset(2025, 10, 31, 12, 0, 0, TimeSpan.Zero); + var grainId = GrainId.Create("test", "grain1"); + var metadata = new Dictionary { ["key1"] = "value1", ["key2"] = "value2" }; + var operation = 
JobOperation.CreateAddOperation("job999", "MetaJob", dueTime, grainId, metadata); + var encoded = EncodeToBytes(operation); + var stream = new MemoryStream(encoded); + + var results = new List(); + await foreach (var item in NetstringJsonSerializer.DecodeAsync(stream, JobOperationJsonContext.Default.JobOperation, CancellationToken.None)) + { + results.Add(item); + } + + results.Should().HaveCount(1); + results[0].Metadata.Should().NotBeNull(); + results[0].Metadata.Should().ContainKey("key1").WhoseValue.Should().Be("value1"); + results[0].Metadata.Should().ContainKey("key2").WhoseValue.Should().Be("value2"); + } + + [Fact] + public async Task DecodeAsync_EmptyStream_ReturnsEmpty() + { + var stream = new MemoryStream(); + + var results = new List(); + await foreach (var item in NetstringJsonSerializer.DecodeAsync(stream, JobOperationJsonContext.Default.JobOperation, CancellationToken.None)) + { + results.Add(item); + } + + results.Should().BeEmpty(); + } + + [Fact] + public async Task DecodeAsync_InvalidLength_ThrowsInvalidDataException() + { + var encoded = "GGGGGG:{\"type\":1,\"id\":\"test\"}\n"; // Invalid hex + var stream = new MemoryStream(Encoding.UTF8.GetBytes(encoded)); + + var act = async () => + { + await foreach (var item in NetstringJsonSerializer.DecodeAsync(stream, JobOperationJsonContext.Default.JobOperation, CancellationToken.None)) + { + // Should throw before yielding any items + } + }; + + await act.Should().ThrowAsync() + .WithMessage("Invalid netstring length: GGGGGG"); + } + + [Fact] + public async Task DecodeAsync_ExcessiveLength_ThrowsInvalidDataException() + { + var encoded = "FFFFFF:{\"type\":1}\n"; // 16777215 bytes, exceeds MaxLength + var stream = new MemoryStream(Encoding.UTF8.GetBytes(encoded)); + + var act = async () => + { + await foreach (var item in NetstringJsonSerializer.DecodeAsync(stream, JobOperationJsonContext.Default.JobOperation, CancellationToken.None)) + { + // Should throw before yielding any items + } + }; + + await 
act.Should().ThrowAsync() + .WithMessage("Netstring length out of valid range: *"); + } + + [Fact] + public async Task DecodeAsync_MissingTrailingNewline_ThrowsInvalidDataException() + { + var json = "{\"type\":1,\"id\":\"test\"}"; + var jsonBytes = Encoding.UTF8.GetBytes(json); + var encoded = $"{jsonBytes.Length:X6}:{json}x"; // Use 6-digit hex format + var stream = new MemoryStream(Encoding.UTF8.GetBytes(encoded)); + + var act = async () => + { + await foreach (var item in NetstringJsonSerializer.DecodeAsync(stream, JobOperationJsonContext.Default.JobOperation, CancellationToken.None)) + { + // Should throw after reading the data + } + }; + + await act.Should().ThrowAsync() + .WithMessage("Expected newline at end of netstring, got byte value *"); + } + + [Fact] + public async Task DecodeAsync_IncompleteData_ThrowsEndOfStreamException() + { + var encoded = "000064:{\"type\":1}"; // Claims 100 bytes but only provides 11 + var stream = new MemoryStream(Encoding.UTF8.GetBytes(encoded)); + + var act = async () => + { + await foreach (var item in NetstringJsonSerializer.DecodeAsync(stream, JobOperationJsonContext.Default.JobOperation, CancellationToken.None)) + { + // Should throw before yielding any items + } + }; + + await act.Should().ThrowAsync(); + } + + [Fact] + public async Task DecodeAsync_WrongTrailingCharacter_ThrowsInvalidDataException() + { + var json = "{\"type\":1,\"id\":\"test\"}"; + var jsonBytes = Encoding.UTF8.GetBytes(json); + var encoded = $"{jsonBytes.Length:X6}:{json}X"; // Use 6-digit hex format + var stream = new MemoryStream(Encoding.UTF8.GetBytes(encoded)); + + var act = async () => + { + await foreach (var item in NetstringJsonSerializer.DecodeAsync(stream, JobOperationJsonContext.Default.JobOperation, CancellationToken.None)) + { + // Should throw after reading the data + } + }; + + await act.Should().ThrowAsync() + .WithMessage("Expected newline at end of netstring, got byte value *"); + } + + [Fact] + public async Task 
DecodeAsync_InvalidJson_ThrowsJsonException() + { + var invalidJson = "{invalid json}"; + var jsonBytes = Encoding.UTF8.GetBytes(invalidJson); + var encoded = $"{jsonBytes.Length:X6}:{invalidJson}\n"; // Use 6-digit hex format + var stream = new MemoryStream(Encoding.UTF8.GetBytes(encoded)); + + var act = async () => + { + await foreach (var item in NetstringJsonSerializer.DecodeAsync(stream, JobOperationJsonContext.Default.JobOperation, CancellationToken.None)) + { + // Should throw when deserializing + } + }; + + await act.Should().ThrowAsync(); + } + + [Fact] + public async Task EncodeAndDecode_RoundTrip_PreservesData() + { + var dueTime1 = new DateTimeOffset(2025, 10, 31, 12, 0, 0, TimeSpan.Zero); + var dueTime2 = new DateTimeOffset(2025, 11, 1, 14, 30, 0, TimeSpan.Zero); + var grainId = GrainId.Create("test", "grain1"); + var metadata = new Dictionary { ["env"] = "prod", ["region"] = "us-east" }; + + var testOperations = new[] + { + JobOperation.CreateRemoveOperation("remove-job"), + JobOperation.CreateAddOperation("add-job", "MyJob", dueTime1, grainId, null), + JobOperation.CreateRetryOperation("retry-job", dueTime2), + JobOperation.CreateAddOperation("meta-job", "MetaJob", dueTime1, grainId, metadata) + }; + + foreach (var operation in testOperations) + { + var encoded = EncodeToBytes(operation); + var stream = new MemoryStream(encoded); + + var results = new List(); + await foreach (var item in NetstringJsonSerializer.DecodeAsync(stream, JobOperationJsonContext.Default.JobOperation, CancellationToken.None)) + { + results.Add(item); + } + + results.Should().HaveCount(1); + results[0].Type.Should().Be(operation.Type); + results[0].Id.Should().Be(operation.Id); + results[0].Name.Should().Be(operation.Name); + results[0].DueTime.Should().Be(operation.DueTime); + results[0].TargetGrainId.Should().Be(operation.TargetGrainId); + + if (operation.Metadata is not null) + { + results[0].Metadata.Should().NotBeNull(); + 
results[0].Metadata.Should().BeEquivalentTo(operation.Metadata); + } + } + } + + [Fact] + public async Task EncodeAndDecode_MultipleOperations_RoundTrip() + { + var dueTime = new DateTimeOffset(2025, 10, 31, 12, 0, 0, TimeSpan.Zero); + var grainId = GrainId.Create("test", "grain1"); + + var testOperations = new[] + { + JobOperation.CreateAddOperation("job1", "First", dueTime, grainId, null), + JobOperation.CreateRemoveOperation("job2"), + JobOperation.CreateRetryOperation("job3", dueTime.AddHours(1)), + JobOperation.CreateAddOperation("job4", "Fourth", dueTime.AddDays(1), grainId, null) + }; + + var memoryStream = new MemoryStream(); + foreach (var operation in testOperations) + { + var encoded = EncodeToBytes(operation); + await memoryStream.WriteAsync(encoded); + } + + memoryStream.Position = 0; + + var results = new List(); + await foreach (var item in NetstringJsonSerializer.DecodeAsync(memoryStream, JobOperationJsonContext.Default.JobOperation, CancellationToken.None)) + { + results.Add(item); + } + + results.Should().HaveCount(4); + for (var i = 0; i < testOperations.Length; i++) + { + results[i].Type.Should().Be(testOperations[i].Type); + results[i].Id.Should().Be(testOperations[i].Id); + } + } + + [Fact] + public async Task DecodeAsync_StreamPosition_IsPreserved() + { + var operation = JobOperation.CreateRemoveOperation("test"); + var encoded = EncodeToBytes(operation); + var stream = new MemoryStream(encoded); + + await foreach (var item in NetstringJsonSerializer.DecodeAsync(stream, JobOperationJsonContext.Default.JobOperation, CancellationToken.None)) + { + // Stream should be at the end after reading + } + + stream.Position.Should().Be(stream.Length); + } + + [Fact] + public async Task EncodeAndDecode_LargeMetadata_HandlesCorrectly() + { + var dueTime = new DateTimeOffset(2025, 10, 31, 12, 0, 0, TimeSpan.Zero); + var grainId = GrainId.Create("test", "grain1"); + + var largeMetadata = new Dictionary(); + for (var i = 0; i < 100; i++) + { + 
largeMetadata[$"key{i}"] = new string('x', 1000); + } + + var operation = JobOperation.CreateAddOperation("large-job", "LargeMetaJob", dueTime, grainId, largeMetadata); + var encoded = EncodeToBytes(operation); + var stream = new MemoryStream(encoded); + + var results = new List(); + await foreach (var item in NetstringJsonSerializer.DecodeAsync(stream, JobOperationJsonContext.Default.JobOperation, CancellationToken.None)) + { + results.Add(item); + } + + results.Should().HaveCount(1); + results[0].Metadata.Should().NotBeNull(); + results[0].Metadata.Should().HaveCount(100); + } +} diff --git a/test/Extensions/TesterAzureUtils/Tester.AzureUtils.csproj b/test/Extensions/TesterAzureUtils/Tester.AzureUtils.csproj index db398c4ff22..8246ee7899d 100644 --- a/test/Extensions/TesterAzureUtils/Tester.AzureUtils.csproj +++ b/test/Extensions/TesterAzureUtils/Tester.AzureUtils.csproj @@ -20,6 +20,7 @@ + diff --git a/test/Grains/TestGrainInterfaces/IRetryTestGrain.cs b/test/Grains/TestGrainInterfaces/IRetryTestGrain.cs new file mode 100644 index 00000000000..ebc1762b706 --- /dev/null +++ b/test/Grains/TestGrainInterfaces/IRetryTestGrain.cs @@ -0,0 +1,23 @@ +using System; +using System.Collections.Generic; +using System.Threading.Tasks; +using Orleans.Concurrency; +using Orleans.ScheduledJobs; + +namespace UnitTests.GrainInterfaces; + +public interface IRetryTestGrain : IGrainWithStringKey +{ + Task ScheduleJobAsync(string jobName, DateTimeOffset scheduledTime, IReadOnlyDictionary metadata = null); + + Task HasJobSucceeded(string jobId); + + [AlwaysInterleave] + Task WaitForJobToSucceed(string jobId); + + Task GetJobExecutionAttemptCount(string jobId); + + Task> GetJobDequeueCountHistory(string jobId); + + Task GetFinalJobContext(string jobId); +} diff --git a/test/Grains/TestGrainInterfaces/IScheduledJobGrain.cs b/test/Grains/TestGrainInterfaces/IScheduledJobGrain.cs new file mode 100644 index 00000000000..35afe873bbf --- /dev/null +++ 
b/test/Grains/TestGrainInterfaces/IScheduledJobGrain.cs @@ -0,0 +1,27 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading.Tasks; +using Orleans.Concurrency; +using Orleans.ScheduledJobs; + +namespace UnitTests.GrainInterfaces; + +public interface IScheduledJobGrain : IGrainWithStringKey +{ + Task ScheduleJobAsync(string jobName, DateTimeOffset scheduledTime, IReadOnlyDictionary metadata = null); + + Task TryCancelJobAsync(ScheduledJob job); + + Task HasJobRan(string jobId); + + [AlwaysInterleave] + Task WaitForJobToRun(string jobId); + + Task GetJobExecutionTime(string jobId); + + Task GetJobContext(string jobId); + + Task WasCancellationTokenCancelled(string jobId); +} diff --git a/test/Grains/TestGrainInterfaces/ISchedulerGrain.cs b/test/Grains/TestGrainInterfaces/ISchedulerGrain.cs new file mode 100644 index 00000000000..f18e5ec9f45 --- /dev/null +++ b/test/Grains/TestGrainInterfaces/ISchedulerGrain.cs @@ -0,0 +1,10 @@ +using System; +using System.Threading.Tasks; +using Orleans.ScheduledJobs; + +namespace UnitTests.GrainInterfaces; + +public interface ISchedulerGrain : IGrainWithStringKey +{ + Task ScheduleJobOnAnotherGrainAsync(string targetGrainKey, string jobName, DateTimeOffset scheduledTime); +} diff --git a/test/Grains/TestGrainInterfaces/TestGrainInterfaces.csproj b/test/Grains/TestGrainInterfaces/TestGrainInterfaces.csproj index cc89232f992..58838b8f1fe 100644 --- a/test/Grains/TestGrainInterfaces/TestGrainInterfaces.csproj +++ b/test/Grains/TestGrainInterfaces/TestGrainInterfaces.csproj @@ -11,6 +11,7 @@ + diff --git a/test/Grains/TestGrains/RetryTestGrain.cs b/test/Grains/TestGrains/RetryTestGrain.cs new file mode 100644 index 00000000000..506b4b2fee6 --- /dev/null +++ b/test/Grains/TestGrains/RetryTestGrain.cs @@ -0,0 +1,134 @@ +using System; +using System.Collections.Generic; +using System.Threading; +using System.Threading.Tasks; +using Microsoft.Extensions.Logging; +using 
Orleans.ScheduledJobs; +using UnitTests.GrainInterfaces; + +namespace UnitTests.Grains; + +public class RetryTestGrain : Grain, IRetryTestGrain, IScheduledJobHandler +{ + private readonly Dictionary _jobSuccessStatus = new(); + private readonly Dictionary _jobExecutionAttempts = new(); + private readonly Dictionary> _jobDequeueCountHistory = new(); + private readonly Dictionary _finalJobContexts = new(); + private readonly ILocalScheduledJobManager _localScheduledJobManager; + private readonly ILogger _logger; + + public RetryTestGrain(ILocalScheduledJobManager localScheduledJobManager, ILogger logger) + { + _localScheduledJobManager = localScheduledJobManager; + _logger = logger; + } + + public Task HasJobSucceeded(string jobId) + { + return Task.FromResult(_jobSuccessStatus.TryGetValue(jobId, out var tcs) && tcs.Task.IsCompleted); + } + + public Task ExecuteJobAsync(IScheduledJobContext ctx, CancellationToken cancellationToken) + { + var jobId = ctx.Job.Id; + + // Initialize tracking if this is the first attempt + if (!_jobExecutionAttempts.ContainsKey(jobId)) + { + _jobExecutionAttempts[jobId] = 0; + _jobDequeueCountHistory[jobId] = new List(); + } + + // Track this attempt + _jobExecutionAttempts[jobId]++; + _jobDequeueCountHistory[jobId].Add(ctx.DequeueCount); + + _logger.LogInformation( + "Job {JobId} execution attempt {Attempt}, DequeueCount: {DequeueCount}", + jobId, + _jobExecutionAttempts[jobId], + ctx.DequeueCount); + + // Check if we should fail based on metadata + if (ctx.Job.Metadata is not null && ctx.Job.Metadata.TryGetValue("FailUntilAttempt", out var failUntilAttemptStr)) + { + if (int.TryParse(failUntilAttemptStr, out var failUntilAttempt)) + { + if (ctx.DequeueCount < failUntilAttempt) + { + _logger.LogWarning( + "Job {JobId} intentionally failing on attempt {Attempt} (DequeueCount: {DequeueCount}, FailUntilAttempt: {FailUntilAttempt})", + jobId, + _jobExecutionAttempts[jobId], + ctx.DequeueCount, + failUntilAttempt); + + throw new 
InvalidOperationException($"Simulated failure for job {jobId} on attempt {_jobExecutionAttempts[jobId]}"); + } + } + } + + // Job succeeded + _logger.LogInformation("Job {JobId} succeeded on attempt {Attempt}", jobId, _jobExecutionAttempts[jobId]); + _finalJobContexts[jobId] = ctx; + _jobSuccessStatus[jobId].SetResult(); + + return Task.CompletedTask; + } + + public async Task ScheduleJobAsync(string jobName, DateTimeOffset scheduledTime, IReadOnlyDictionary metadata = null) + { + var job = await _localScheduledJobManager.ScheduleJobAsync( + this.GetGrainId(), + jobName, + scheduledTime, + metadata, + CancellationToken.None); + + _jobSuccessStatus[job.Id] = new TaskCompletionSource(); + + return job; + } + + public async Task WaitForJobToSucceed(string jobId) + { + if (!_jobSuccessStatus.TryGetValue(jobId, out var tcs)) + { + // The job might not have been scheduled on this grain + _jobSuccessStatus[jobId] = new TaskCompletionSource(); + tcs = _jobSuccessStatus[jobId]; + } + + await tcs.Task; + } + + public Task GetJobExecutionAttemptCount(string jobId) + { + if (!_jobExecutionAttempts.TryGetValue(jobId, out var count)) + { + throw new InvalidOperationException($"Job {jobId} has not been attempted or was not scheduled on this grain."); + } + + return Task.FromResult(count); + } + + public Task> GetJobDequeueCountHistory(string jobId) + { + if (!_jobDequeueCountHistory.TryGetValue(jobId, out var history)) + { + throw new InvalidOperationException($"Job {jobId} has not been attempted or was not scheduled on this grain."); + } + + return Task.FromResult(history); + } + + public Task GetFinalJobContext(string jobId) + { + if (!_finalJobContexts.TryGetValue(jobId, out var ctx)) + { + throw new InvalidOperationException($"Job {jobId} has not succeeded or was not scheduled on this grain."); + } + + return Task.FromResult(ctx); + } +} diff --git a/test/Grains/TestGrains/ScheduledJobGrain.cs b/test/Grains/TestGrains/ScheduledJobGrain.cs new file mode 100644 index 
00000000000..634033b7ba0 --- /dev/null +++ b/test/Grains/TestGrains/ScheduledJobGrain.cs @@ -0,0 +1,91 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading; +using System.Threading.Tasks; +using Microsoft.Extensions.Logging; +using Orleans.ScheduledJobs; +using UnitTests.GrainInterfaces; + +namespace UnitTests.Grains; + +public class ScheduledJobGrain : Grain, IScheduledJobGrain, IScheduledJobHandler +{ + private Dictionary jobRunStatus = new(); + private Dictionary jobExecutionTimes = new(); + private Dictionary jobContexts = new(); + private Dictionary cancellationTokenStatus = new(); + private readonly ILocalScheduledJobManager _localScheduledJobManager; + private readonly ILogger _logger; + + public ScheduledJobGrain(ILocalScheduledJobManager localScheduledJobManager, ILogger logger) + { + _localScheduledJobManager = localScheduledJobManager; + _logger = logger; + } + + public Task HasJobRan(string jobId) + { + return Task.FromResult(jobRunStatus.TryGetValue(jobId, out var taskResult) && taskResult.Task.IsCompleted); + } + + public Task ExecuteJobAsync(IScheduledJobContext ctx, CancellationToken cancellationToken) + { + _logger.LogInformation("Job {JobId} received at {ReceivedTime}", ctx.Job.Id, DateTime.UtcNow); + jobExecutionTimes[ctx.Job.Id] = DateTimeOffset.UtcNow; + jobContexts[ctx.Job.Id] = ctx; + cancellationTokenStatus[ctx.Job.Id] = cancellationToken.IsCancellationRequested; + jobRunStatus[ctx.Job.Id].SetResult(); + return Task.CompletedTask; + } + + public async Task ScheduleJobAsync(string jobName, DateTimeOffset scheduledTime, IReadOnlyDictionary metadata = null) + { + var job = await _localScheduledJobManager.ScheduleJobAsync(this.GetGrainId(), jobName, scheduledTime, metadata, CancellationToken.None); + jobRunStatus[job.Id] = new TaskCompletionSource(); + return job; + } + + public async Task WaitForJobToRun(string jobId) + { + if (!jobRunStatus.TryGetValue(jobId, out var taskResult)) 
+ { + // The job might not have been scheduled on this grain. + jobRunStatus[jobId] = new TaskCompletionSource(); + taskResult = jobRunStatus[jobId]; + } + + await taskResult.Task; + } + + public async Task TryCancelJobAsync(ScheduledJob job) + { + return await _localScheduledJobManager.TryCancelScheduledJobAsync(job, CancellationToken.None); + } + + public Task GetJobExecutionTime(string jobId) + { + if (!jobExecutionTimes.TryGetValue(jobId, out var time)) + { + throw new InvalidOperationException($"Job {jobId} has not executed or was not scheduled on this grain."); + } + + return Task.FromResult(time); + } + + public Task GetJobContext(string jobId) + { + if (!jobContexts.TryGetValue(jobId, out var ctx)) + { + throw new InvalidOperationException($"Job {jobId} has not executed or was not scheduled on this grain."); + } + + return Task.FromResult(ctx); + } + + public Task WasCancellationTokenCancelled(string jobId) + { + return Task.FromResult(cancellationTokenStatus.TryGetValue(jobId, out var cancelled) && cancelled); + } +} diff --git a/test/Grains/TestGrains/SchedulerGrain.cs b/test/Grains/TestGrains/SchedulerGrain.cs new file mode 100644 index 00000000000..f4e657bc742 --- /dev/null +++ b/test/Grains/TestGrains/SchedulerGrain.cs @@ -0,0 +1,46 @@ +using System; +using System.Threading; +using System.Threading.Tasks; +using Microsoft.Extensions.Logging; +using Orleans.ScheduledJobs; +using UnitTests.GrainInterfaces; + +namespace UnitTests.Grains; + +public class SchedulerGrain : Grain, ISchedulerGrain +{ + private readonly ILocalScheduledJobManager _localScheduledJobManager; + private readonly IGrainFactory _grainFactory; + private readonly ILogger _logger; + + public SchedulerGrain( + ILocalScheduledJobManager localScheduledJobManager, + IGrainFactory grainFactory, + ILogger logger) + { + _localScheduledJobManager = localScheduledJobManager; + _grainFactory = grainFactory; + _logger = logger; + } + + public async Task ScheduleJobOnAnotherGrainAsync(string 
targetGrainKey, string jobName, DateTimeOffset scheduledTime) + { + var targetGrain = _grainFactory.GetGrain(targetGrainKey); + var targetGrainId = targetGrain.GetGrainId(); + + _logger.LogInformation( + "Scheduling job {JobName} on grain {TargetGrainKey} from grain {SourceGrain}", + jobName, + targetGrainKey, + this.GetPrimaryKeyString()); + + var job = await _localScheduledJobManager.ScheduleJobAsync( + targetGrainId, + jobName, + scheduledTime, + null, + CancellationToken.None); + + return job; + } +} diff --git a/test/Grains/TestGrains/TestGrains.csproj b/test/Grains/TestGrains/TestGrains.csproj index 2e3d1dcbc8c..52f316aa37b 100644 --- a/test/Grains/TestGrains/TestGrains.csproj +++ b/test/Grains/TestGrains/TestGrains.csproj @@ -12,5 +12,6 @@ + diff --git a/test/NonSilo.Tests/Directory/MockClusterMembershipService.cs b/test/NonSilo.Tests/Directory/MockClusterMembershipService.cs index 362a239fbec..748639d566e 100644 --- a/test/NonSilo.Tests/Directory/MockClusterMembershipService.cs +++ b/test/NonSilo.Tests/Directory/MockClusterMembershipService.cs @@ -44,7 +44,7 @@ internal static ClusterMembershipSnapshot ToSnapshot(Dictionary new ValueTask(); + public ValueTask Refresh(MembershipVersion minimumVersion = default, CancellationToken cancellationToken = default) => new ValueTask(); public Task TryKill(SiloAddress siloAddress) => throw new NotImplementedException(); } diff --git a/test/NonSilo.Tests/ScheduledJobs/InMemoryJobQueueTests.cs b/test/NonSilo.Tests/ScheduledJobs/InMemoryJobQueueTests.cs new file mode 100644 index 00000000000..43bd6ccfbf1 --- /dev/null +++ b/test/NonSilo.Tests/ScheduledJobs/InMemoryJobQueueTests.cs @@ -0,0 +1,349 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Threading; +using System.Threading.Tasks; +using Orleans.ScheduledJobs; +using Orleans.Runtime; +using NSubstitute; +using Xunit; + +namespace NonSilo.Tests.ScheduledJobs; + +[TestCategory("ScheduledJobs")] +public class InMemoryJobQueueTests 
+{ + [Fact] + public void Enqueue_AddsJobToQueue() + { + var queue = new InMemoryJobQueue(); + var job = CreateJob("job1", DateTimeOffset.UtcNow.AddSeconds(1)); + + queue.Enqueue(job, 0); + + Assert.Equal(1, queue.Count); + } + + [Fact] + public void Enqueue_MultipleJobs_IncreasesCount() + { + var queue = new InMemoryJobQueue(); + var job1 = CreateJob("job1", DateTimeOffset.UtcNow.AddSeconds(1)); + var job2 = CreateJob("job2", DateTimeOffset.UtcNow.AddSeconds(2)); + var job3 = CreateJob("job3", DateTimeOffset.UtcNow.AddSeconds(3)); + + queue.Enqueue(job1, 0); + queue.Enqueue(job2, 0); + queue.Enqueue(job3, 0); + + Assert.Equal(3, queue.Count); + } + + [Fact] + public void Enqueue_AfterMarkAsComplete_ThrowsInvalidOperationException() + { + var queue = new InMemoryJobQueue(); + queue.MarkAsComplete(); + + var job = CreateJob("job1", DateTimeOffset.UtcNow.AddSeconds(1)); + + Assert.Throws(() => queue.Enqueue(job, 0)); + } + + [Fact] + public async Task GetAsyncEnumerator_ReturnsJobsInDueTimeOrder() + { + var queue = new InMemoryJobQueue(); + var now = DateTimeOffset.UtcNow; + var job1 = CreateJob("job1", now.AddMilliseconds(-100)); + var job2 = CreateJob("job2", now.AddMilliseconds(-50)); + + queue.Enqueue(job1, 0); + queue.Enqueue(job2, 0); + queue.MarkAsComplete(); + + var results = new List(); + await foreach (var context in queue.WithCancellation(CancellationToken.None)) + { + results.Add(context); + if (results.Count >= 2) break; + } + + Assert.Equal(2, results.Count); + Assert.Equal("job1", results[0].Job.Name); + Assert.Equal("job2", results[1].Job.Name); + } + + [Fact] + public async Task GetAsyncEnumerator_IncrementsDequeueCount() + { + var queue = new InMemoryJobQueue(); + var job = CreateJob("job1", DateTimeOffset.UtcNow.AddMilliseconds(-100)); + + queue.Enqueue(job, 0); + queue.MarkAsComplete(); + + await foreach (var context in queue.WithCancellation(CancellationToken.None)) + { + Assert.Equal(1, context.DequeueCount); + break; + } + } + + [Fact] + public 
async Task GetAsyncEnumerator_WithInitialDequeueCount_IncrementsCorrectly() + { + var queue = new InMemoryJobQueue(); + var job = CreateJob("job1", DateTimeOffset.UtcNow.AddMilliseconds(-100)); + + queue.Enqueue(job, 3); + queue.MarkAsComplete(); + + await foreach (var context in queue.WithCancellation(CancellationToken.None)) + { + Assert.Equal(4, context.DequeueCount); + break; + } + } + + [Fact] + public async Task GetAsyncEnumerator_WaitsForDueTime() + { + var queue = new InMemoryJobQueue(); + var futureTime = DateTimeOffset.UtcNow.AddSeconds(2); + var job = CreateJob("job1", futureTime); + + queue.Enqueue(job, 0); + queue.MarkAsComplete(); + + var startTime = DateTimeOffset.UtcNow; + await foreach (var context in queue.WithCancellation(CancellationToken.None)) + { + var elapsed = DateTimeOffset.UtcNow - startTime; + Assert.True(elapsed.TotalSeconds >= 1.5, $"Job was dequeued too early. Elapsed: {elapsed.TotalSeconds}s"); + break; + } + } + + [Fact] + public async Task GetAsyncEnumerator_CompletesWhenQueueIsMarkedComplete() + { + var queue = new InMemoryJobQueue(); + queue.MarkAsComplete(); + + var count = 0; + await foreach (var _ in queue.WithCancellation(CancellationToken.None)) + { + count++; + } + + Assert.Equal(0, count); + } + + [Fact] + public void CancelJob_RemovesJobFromQueue() + { + var queue = new InMemoryJobQueue(); + var job = CreateJob("job1", DateTimeOffset.UtcNow.AddSeconds(5)); + + queue.Enqueue(job, 0); + var removed = queue.CancelJob("job1"); + + Assert.True(removed); + Assert.Equal(0, queue.Count); + } + + [Fact] + public async Task CancelJob_PreventsJobFromBeingDequeued() + { + var queue = new InMemoryJobQueue(); + var job1 = CreateJob("job1", DateTimeOffset.UtcNow.AddMilliseconds(-100)); + var job2 = CreateJob("job2", DateTimeOffset.UtcNow.AddMilliseconds(-50)); + + queue.Enqueue(job1, 0); + queue.Enqueue(job2, 0); + queue.CancelJob("job1"); + queue.MarkAsComplete(); + + var results = new List(); + await foreach (var context in 
queue.WithCancellation(CancellationToken.None)) + { + results.Add(context.Job.Id); + if (results.Count >= 1) break; + } + + Assert.Single(results); + Assert.Equal("job2", results[0]); + } + + [Fact] + public void CancelJob_NonExistentJob_DoesNotThrow() + { + var queue = new InMemoryJobQueue(); + + var removed = queue.CancelJob("non-existent-job"); + + Assert.False(removed); + Assert.Equal(0, queue.Count); + } + + [Fact] + public void RetryJobLater_MovesJobToNewDueTime() + { + var queue = new InMemoryJobQueue(); + var originalDueTime = DateTimeOffset.UtcNow.AddSeconds(1); + var job = CreateJob("job1", originalDueTime); + + queue.Enqueue(job, 0); + + var context = CreateJobContext(job, "run1", 1); + var newDueTime = DateTimeOffset.UtcNow.AddSeconds(10); + + queue.RetryJobLater(context, newDueTime); + + Assert.Equal(1, queue.Count); + } + + [Fact] + public async Task RetryJobLater_PreservesDequeueCount() + { + var queue = new InMemoryJobQueue(); + var job = CreateJob("job1", DateTimeOffset.UtcNow.AddMilliseconds(-100)); + + queue.Enqueue(job, 5); + + var context = CreateJobContext(job, "run1", 5); + var newDueTime = DateTimeOffset.UtcNow.AddMilliseconds(-50); + + queue.RetryJobLater(context, newDueTime); + queue.MarkAsComplete(); + + await foreach (var newContext in queue.WithCancellation(CancellationToken.None)) + { + Assert.Equal(6, newContext.DequeueCount); + Assert.Equal("job1", newContext.Job.Id); + break; + } + } + + [Fact] + public void RetryJobLater_NonExistentJob_DoesNotThrow() + { + var queue = new InMemoryJobQueue(); + var job = CreateJob("job1", DateTimeOffset.UtcNow.AddSeconds(1)); + var context = CreateJobContext(job, "run1", 1); + + queue.RetryJobLater(context, DateTimeOffset.UtcNow.AddSeconds(10)); + + Assert.Equal(0, queue.Count); + } + + [Fact] + public async Task GetAsyncEnumerator_RespectsEmptyBuckets() + { + var queue = new InMemoryJobQueue(); + var dueTime = DateTimeOffset.UtcNow.AddMilliseconds(-100); + var job1 = CreateJob("job1", dueTime); + 
var job2 = CreateJob("job2", dueTime); + + queue.Enqueue(job1, 0); + queue.Enqueue(job2, 0); + queue.CancelJob("job1"); + queue.CancelJob("job2"); + queue.MarkAsComplete(); + + var results = new List(); + await foreach (var context in queue.WithCancellation(CancellationToken.None)) + { + results.Add(context); + if (results.Count >= 2) break; + } + + Assert.Empty(results); + } + + [Fact] + public async Task GetAsyncEnumerator_HandlesMultipleDueTimes() + { + var queue = new InMemoryJobQueue(); + var now = DateTimeOffset.UtcNow; + var job1 = CreateJob("job1", now.AddSeconds(-5)); + var job2 = CreateJob("job2", now.AddSeconds(-3)); + var job3 = CreateJob("job3", now.AddSeconds(-1)); + + queue.Enqueue(job1, 0); + queue.Enqueue(job2, 0); + queue.Enqueue(job3, 0); + queue.MarkAsComplete(); + + var results = new List(); + await foreach (var context in queue.WithCancellation(CancellationToken.None)) + { + results.Add(context.Job.Name); + if (results.Count >= 3) break; + } + + Assert.Equal(3, results.Count); + Assert.Equal("job1", results[0]); + Assert.Equal("job2", results[1]); + Assert.Equal("job3", results[2]); + } + + [Fact] + public async Task GetAsyncEnumerator_GeneratesUniqueRunIds() + { + var queue = new InMemoryJobQueue(); + var job = CreateJob("job1", DateTimeOffset.UtcNow.AddMilliseconds(-100)); + + queue.Enqueue(job, 0); + queue.MarkAsComplete(); + + var runIds = new List(); + await foreach (var context in queue.WithCancellation(CancellationToken.None)) + { + runIds.Add(context.RunId); + Assert.False(string.IsNullOrEmpty(context.RunId)); + break; + } + + Assert.Single(runIds); + } + + [Fact] + public async Task GetAsyncEnumerator_CancellationToken_StopsEnumeration() + { + var queue = new InMemoryJobQueue(); + var cts = new CancellationTokenSource(); + + cts.Cancel(); + + await Assert.ThrowsAnyAsync(async () => + { + await foreach (var _ in queue.WithCancellation(cts.Token)) + { + } + }); + } + + private static ScheduledJob CreateJob(string id, DateTimeOffset 
dueTime) + { + return new ScheduledJob + { + Id = id, + Name = id, + DueTime = dueTime, + TargetGrainId = GrainId.Create("test", id), + ShardId = "shard1", + Metadata = null + }; + } + + private static IScheduledJobContext CreateJobContext(ScheduledJob job, string runId, int dequeueCount) + { + var context = Substitute.For(); + context.Job.Returns(job); + context.RunId.Returns(runId); + context.DequeueCount.Returns(dequeueCount); + return context; + } +} diff --git a/test/TestInfrastructure/TestExtensions/DefaultClusterFixture.cs b/test/TestInfrastructure/TestExtensions/DefaultClusterFixture.cs index a5a6f4a0dfe..000483df482 100644 --- a/test/TestInfrastructure/TestExtensions/DefaultClusterFixture.cs +++ b/test/TestInfrastructure/TestExtensions/DefaultClusterFixture.cs @@ -58,6 +58,7 @@ public void Configure(ISiloBuilder hostBuilder) hostBuilder .Configure(o => o.ClientGatewayShutdownNotificationTimeout = default) .UseInMemoryReminderService() + .UseInMemoryScheduledJobs() .AddMemoryGrainStorageAsDefault() .AddMemoryGrainStorage("MemoryStore"); } diff --git a/test/Tester/ScheduledJobs/IJobShardManagerTestFixture.cs b/test/Tester/ScheduledJobs/IJobShardManagerTestFixture.cs new file mode 100644 index 00000000000..f77d65f32ef --- /dev/null +++ b/test/Tester/ScheduledJobs/IJobShardManagerTestFixture.cs @@ -0,0 +1,23 @@ +using System; +using System.Collections.Generic; +using System.Threading.Tasks; +using Orleans.Runtime; +using Orleans.ScheduledJobs; + +namespace Tester.ScheduledJobs; + +/// +/// Defines the contract for provider-specific test fixtures used by . +/// Each provider implementation (Azure, InMemory, etc.) should implement this interface to provide +/// the necessary infrastructure for running shared job shard manager tests. +/// +public interface IJobShardManagerTestFixture : IAsyncDisposable +{ + /// + /// Creates a new instance for the specified silo. + /// + /// The local silo details. + /// The cluster membership service for the manager. 
+ /// A configured job shard manager instance. + JobShardManager CreateManager(ILocalSiloDetails localSiloDetails, IClusterMembershipService membershipService); +} diff --git a/test/Tester/ScheduledJobs/InMemoryJobShardManagerTestFixture.cs b/test/Tester/ScheduledJobs/InMemoryJobShardManagerTestFixture.cs new file mode 100644 index 00000000000..39509fe8d65 --- /dev/null +++ b/test/Tester/ScheduledJobs/InMemoryJobShardManagerTestFixture.cs @@ -0,0 +1,29 @@ +using System.Threading.Tasks; +using Orleans.Runtime; +using Orleans.ScheduledJobs; + +namespace Tester.ScheduledJobs; + +/// +/// InMemory implementation of . +/// Provides the infrastructure needed to run shared job shard manager tests against the InMemory provider. +/// +internal sealed class InMemoryJobShardManagerTestFixture : IJobShardManagerTestFixture +{ + public InMemoryJobShardManagerTestFixture() + { + // Clear any state from previous tests + InMemoryJobShardManager.ClearAllShardsAsync().GetAwaiter().GetResult(); + } + + public JobShardManager CreateManager(ILocalSiloDetails localSiloDetails, IClusterMembershipService membershipService) + { + return new InMemoryJobShardManager(localSiloDetails.SiloAddress, membershipService); + } + + public async ValueTask DisposeAsync() + { + // Clear state after tests + await InMemoryJobShardManager.ClearAllShardsAsync(); + } +} diff --git a/test/Tester/ScheduledJobs/InMemoryJobShardManagerTests.cs b/test/Tester/ScheduledJobs/InMemoryJobShardManagerTests.cs new file mode 100644 index 00000000000..beb104ee8d8 --- /dev/null +++ b/test/Tester/ScheduledJobs/InMemoryJobShardManagerTests.cs @@ -0,0 +1,58 @@ +using System.Threading.Tasks; +using Xunit; + +namespace Tester.ScheduledJobs; + +/// +/// Tests for using the . +/// These tests verify shard lifecycle management, ownership, and failover semantics for the InMemory provider. 
+/// +[TestCategory("BVT"), TestCategory("ScheduledJobs")] +public class InMemoryJobShardManagerTests : IAsyncLifetime +{ + private readonly InMemoryJobShardManagerTestFixture _fixture; + private readonly JobShardManagerTestsRunner _runner; + + public InMemoryJobShardManagerTests() + { + _fixture = new InMemoryJobShardManagerTestFixture(); + _runner = new JobShardManagerTestsRunner(_fixture); + } + + public Task InitializeAsync() => Task.CompletedTask; + + public Task DisposeAsync() => _fixture.DisposeAsync().AsTask(); + + [SkippableFact] + public Task InMemoryJobShardManager_ShardCreationAndAssignment() => _runner.ShardCreationAndAssignment(); + + [SkippableFact] + public Task InMemoryJobShardManager_ReadFrozenShard() => _runner.ReadFrozenShard(); + + [SkippableFact] + public Task InMemoryJobShardManager_LiveShard() => _runner.LiveShard(); + + [SkippableFact] + public Task InMemoryJobShardManager_JobMetadata() => _runner.JobMetadata(); + + [SkippableFact] + public Task InMemoryJobShardManager_ConcurrentShardAssignment_OwnershipConflicts() => _runner.ConcurrentShardAssignment_OwnershipConflicts(); + + [SkippableFact] + public Task InMemoryJobShardManager_ShardMetadataMerge() => _runner.ShardMetadataMerge(); + + [SkippableFact] + public Task InMemoryJobShardManager_StopProcessingShard() => _runner.StopProcessingShard(); + + [SkippableFact] + public Task InMemoryJobShardManager_RetryJobLater() => _runner.RetryJobLater(); + + [SkippableFact] + public Task InMemoryJobShardManager_JobCancellation() => _runner.JobCancellation(); + + [SkippableFact] + public Task InMemoryJobShardManager_ShardRegistrationRetry_IdCollisions() => _runner.ShardRegistrationRetry_IdCollisions(); + + [SkippableFact] + public Task InMemoryJobShardManager_UnregisterShard_WithJobsRemaining() => _runner.UnregisterShard_WithJobsRemaining(); +} diff --git a/test/Tester/ScheduledJobs/JobShardManagerTestsRunner.cs b/test/Tester/ScheduledJobs/JobShardManagerTestsRunner.cs new file mode 100644 index 
00000000000..4040727dd01 --- /dev/null +++ b/test/Tester/ScheduledJobs/JobShardManagerTestsRunner.cs @@ -0,0 +1,606 @@ +using System; +using System.Collections.Generic; +using System.Collections.Immutable; +using System.Linq; +using System.Net; +using System.Threading; +using System.Threading.Tasks; +using Orleans.Runtime; +using Orleans.ScheduledJobs; +using Xunit; + +namespace Tester.ScheduledJobs; + +/// +/// Contains provider-agnostic test logic for job shard managers that can be run against different providers. +/// This class is similar to but operates at the infrastructure layer, +/// testing shard lifecycle management, ownership, and failover semantics. +/// +public class JobShardManagerTestsRunner +{ + private readonly IJobShardManagerTestFixture _fixture; + private readonly IDictionary _testMetadata; + private readonly InMemoryClusterMembershipService _membershipService; + + public JobShardManagerTestsRunner(IJobShardManagerTestFixture fixture) + { + _fixture = fixture; + _testMetadata = new Dictionary + { + { "CreatedBy", "UnitTest" }, + { "Purpose", "Testing" } + }; + _membershipService = new InMemoryClusterMembershipService(); + } + + /// + /// Sets the status of a silo in the cluster membership service. + /// + private void SetSiloStatus(SiloAddress siloAddress, SiloStatus status) + { + _membershipService.SetSiloStatus(siloAddress, status); + } + + /// + /// Creates a job shard manager for the given silo address. + /// + private JobShardManager CreateManager(SiloAddress siloAddress) + { + var localSiloDetails = new TestLocalSiloDetails(siloAddress); + return _fixture.CreateManager(localSiloDetails, _membershipService); + } + + /// + /// Tests basic shard creation and assignment workflow. + /// Verifies that shards are created with unique IDs and correctly assigned to their creator silo. 
+ /// + public async Task ShardCreationAndAssignment() + { + var silo1Address = SiloAddress.New(new IPEndPoint(IPAddress.Loopback, 5000), 0); + var silo2Address = SiloAddress.New(new IPEndPoint(IPAddress.Loopback, 5001), 0); + + SetSiloStatus(silo1Address, SiloStatus.Active); + SetSiloStatus(silo2Address, SiloStatus.Active); + var silo1Manager = CreateManager(silo1Address); + var silo2Manager = CreateManager(silo2Address); + + var date = DateTimeOffset.UtcNow; + var maxDate = date.AddHours(1); + + // Register multiple shards and ensure they are distinct + // two of them have the same time range + var shard1 = await silo1Manager.CreateShardAsync(date, maxDate, _testMetadata, CancellationToken.None); + var shard2 = await silo1Manager.CreateShardAsync(date, maxDate, _testMetadata, CancellationToken.None); + var shard3 = await silo1Manager.CreateShardAsync(date.AddHours(2), maxDate, _testMetadata, CancellationToken.None); + + Assert.Distinct([shard1.Id, shard2.Id, shard3.Id]); + + // All shards are now assigned to the creator silo + var assignedShards = await silo1Manager.AssignJobShardsAsync(DateTime.UtcNow.AddHours(3), CancellationToken.None); + Assert.Equal(3, assignedShards.Count); + Assert.Contains(shard1.Id, assignedShards.Select(s => s.Id)); + Assert.Contains(shard2.Id, assignedShards.Select(s => s.Id)); + Assert.Contains(shard3.Id, assignedShards.Select(s => s.Id)); + var emptyShards = await silo2Manager.AssignJobShardsAsync(DateTime.UtcNow.AddHours(3), CancellationToken.None); + Assert.Empty(emptyShards); + + // Mark the local silo as dead + SetSiloStatus(silo1Address, SiloStatus.Dead); + + // Now we can take over all three shards + var shards = await silo2Manager.AssignJobShardsAsync(DateTime.UtcNow.AddHours(3), CancellationToken.None); + Assert.Equal(3, shards.Count); + Assert.Contains(shard1.Id, shards.Select(s => s.Id)); + Assert.Contains(shard2.Id, shards.Select(s => s.Id)); + Assert.Contains(shard3.Id, shards.Select(s => s.Id)); + + // Register another 
silo + var silo3Address = SiloAddress.New(new IPEndPoint(IPAddress.Loopback, 5002), 0); + SetSiloStatus(silo3Address, SiloStatus.Active); + var silo3Manager = CreateManager(silo3Address); + + // No unassigned shards + Assert.Empty(await silo3Manager.AssignJobShardsAsync(DateTime.UtcNow.AddHours(1), CancellationToken.None)); + } + + /// + /// Tests reading and consuming jobs from a shard after ownership transfer. + /// Verifies that jobs are preserved during failover and can be consumed by the new owner. + /// + public async Task ReadFrozenShard() + { + var silo1Address = SiloAddress.New(new IPEndPoint(IPAddress.Loopback, 5000), 0); + var silo2Address = SiloAddress.New(new IPEndPoint(IPAddress.Loopback, 5001), 0); + SetSiloStatus(silo1Address, SiloStatus.Active); + SetSiloStatus(silo2Address, SiloStatus.Active); + var silo1Manager = CreateManager(silo1Address); + var silo2Manager = CreateManager(silo2Address); + + var date = DateTime.UtcNow; + var shard1 = await silo1Manager.CreateShardAsync(date, date.AddHours(1), _testMetadata, CancellationToken.None); + + // Schedule some jobs + await shard1.TryScheduleJobAsync(GrainId.Create("type", "target1"), "job1", date.AddSeconds(1), null, CancellationToken.None); + await shard1.TryScheduleJobAsync(GrainId.Create("type", "target1"), "job3", date.AddSeconds(3), null, CancellationToken.None); + await shard1.TryScheduleJobAsync(GrainId.Create("type", "target2"), "job2", date.AddSeconds(2), null, CancellationToken.None); + await shard1.TryScheduleJobAsync(GrainId.Create("type", "target1"), "job4", date.AddSeconds(4), null, CancellationToken.None); + + // Mark the silo1 as dead, and create a new incarnation + SetSiloStatus(silo1Address, SiloStatus.Dead); + + // Take over the shard + var shards = await silo2Manager.AssignJobShardsAsync(DateTime.UtcNow.AddHours(1), CancellationToken.None); + Assert.Single(shards); + shard1 = shards[0]; + + var counter = 1; + var cts = new CancellationTokenSource(TimeSpan.FromSeconds(20)); + await 
foreach (var jobCtx in shard1.ConsumeScheduledJobsAsync().WithCancellation(cts.Token)) + { + Assert.Equal($"job{counter}", jobCtx.Job.Name); + await shard1.RemoveJobAsync(jobCtx.Job.Id, cts.Token); + counter++; + } + Assert.Equal(5, counter); + await silo2Manager.UnregisterShardAsync(shard1, CancellationToken.None); + + // No unassigned shards + Assert.Empty(await silo2Manager.AssignJobShardsAsync(DateTime.UtcNow.AddHours(1), CancellationToken.None)); + } + + /// + /// Tests consuming jobs from a live shard (one that continues to accept new jobs). + /// Verifies job scheduling, consumption, and cancellation during processing. + /// + public async Task LiveShard() + { + var startTime = DateTime.UtcNow; + var localAddress = SiloAddress.New(new IPEndPoint(IPAddress.Loopback, 5000), 0); + SetSiloStatus(localAddress, SiloStatus.Active); + var manager = CreateManager(localAddress); + + var date = DateTime.UtcNow; + var shard1 = await manager.CreateShardAsync(date, date.AddYears(1), _testMetadata, CancellationToken.None); + + // Schedule some jobs + await shard1.TryScheduleJobAsync(GrainId.Create("type", "target1"), "job0", startTime.AddSeconds(1), null, CancellationToken.None); + await shard1.TryScheduleJobAsync(GrainId.Create("type", "target1"), "job2", startTime.AddSeconds(3), null, CancellationToken.None); + await shard1.TryScheduleJobAsync(GrainId.Create("type", "target2"), "job1", startTime.AddSeconds(2), null, CancellationToken.None); + var lastJob = await shard1.TryScheduleJobAsync(GrainId.Create("type", "target1"), "job3", startTime.AddSeconds(4), null, CancellationToken.None); + var jobToCancel = await shard1.TryScheduleJobAsync(GrainId.Create("type", "target1"), "job4", startTime.AddSeconds(5), null, CancellationToken.None); + + var counter = 0; + var cts = new CancellationTokenSource(TimeSpan.FromSeconds(10)); + await shard1.MarkAsCompleteAsync(CancellationToken.None); + await shard1.RemoveJobAsync(jobToCancel.Id, CancellationToken.None); + await foreach (var 
jobCtx in shard1.ConsumeScheduledJobsAsync().WithCancellation(cts.Token)) + { + Assert.Equal($"job{counter}", jobCtx.Job.Name); + await shard1.RemoveJobAsync(jobCtx.Job.Id, CancellationToken.None); + counter++; + } + Assert.Equal(4, counter); + Assert.True(lastJob.DueTime <= DateTimeOffset.UtcNow); + await manager.UnregisterShardAsync(shard1, CancellationToken.None); + + // No unassigned shards + Assert.Empty(await manager.AssignJobShardsAsync(DateTime.UtcNow.AddHours(1), CancellationToken.None)); + } + + /// + /// Tests job metadata persistence and retrieval across shard ownership transfer. + /// + public async Task JobMetadata() + { + // Initialize 2 silos with two managers + var silo1Address = SiloAddress.New(new IPEndPoint(IPAddress.Loopback, 5000), 0); + var silo2Address = SiloAddress.New(new IPEndPoint(IPAddress.Loopback, 5001), 0); + + SetSiloStatus(silo1Address, SiloStatus.Active); + SetSiloStatus(silo2Address, SiloStatus.Active); + var silo1Manager = CreateManager(silo1Address); + var silo2Manager = CreateManager(silo2Address); + + var date = DateTime.UtcNow; + var shard = await silo1Manager.CreateShardAsync(date, date.AddYears(1), _testMetadata, CancellationToken.None); + + // Schedule jobs with different metadata on a single shard + var jobMetadata1 = new Dictionary + { + { "Priority", "High" }, + { "Category", "Payment" }, + { "RequestId", "12345" } + }; + var jobMetadata2 = new Dictionary + { + { "Priority", "Low" }, + { "Category", "Notification" } + }; + + var job1 = await shard.TryScheduleJobAsync(GrainId.Create("type", "target1"), "job1", DateTime.UtcNow.AddSeconds(1), jobMetadata1, CancellationToken.None); + var job2 = await shard.TryScheduleJobAsync(GrainId.Create("type", "target2"), "job2", DateTime.UtcNow.AddSeconds(2), jobMetadata2, CancellationToken.None); + var job3 = await shard.TryScheduleJobAsync(GrainId.Create("type", "target3"), "job3", DateTime.UtcNow.AddSeconds(3), null, CancellationToken.None); + + // Verify metadata is set on the 
scheduled jobs + Assert.Equal(jobMetadata1, job1.Metadata); + Assert.Equal(jobMetadata2, job2.Metadata); + Assert.Null(job3.Metadata); + + // Mark the silo owning the shard as dead + SetSiloStatus(silo1Address, SiloStatus.Dead); + + // Take over the shard with the other silo + var shards = await silo2Manager.AssignJobShardsAsync(DateTime.UtcNow.AddHours(1), CancellationToken.None); + Assert.Single(shards); + shard = shards[0]; + + // Consume jobs and verify metadata is preserved + var consumedJobs = new List(); + var cts = new CancellationTokenSource(TimeSpan.FromSeconds(30)); + await foreach (var jobCtx in shard.ConsumeScheduledJobsAsync().WithCancellation(cts.Token)) + { + consumedJobs.Add(jobCtx.Job); + await shard.RemoveJobAsync(jobCtx.Job.Id, CancellationToken.None); + } + + Assert.Equal(3, consumedJobs.Count); + + var consumedJob1 = consumedJobs.First(j => j.Name == "job1"); + var consumedJob2 = consumedJobs.First(j => j.Name == "job2"); + var consumedJob3 = consumedJobs.First(j => j.Name == "job3"); + + Assert.Equal(jobMetadata1, consumedJob1.Metadata); + Assert.Equal(jobMetadata2, consumedJob2.Metadata); + Assert.Null(consumedJob3.Metadata); + + await silo2Manager.UnregisterShardAsync(shard, CancellationToken.None); + } + + /// + /// Tests concurrent shard assignment to verify that only one silo can claim ownership of an orphaned shard. 
+ /// + public async Task ConcurrentShardAssignment_OwnershipConflicts() + { + // Initialize 3 silos with 3 managers + var silo1Address = SiloAddress.New(new IPEndPoint(IPAddress.Loopback, 5000), 0); + var silo2Address = SiloAddress.New(new IPEndPoint(IPAddress.Loopback, 5001), 0); + var silo3Address = SiloAddress.New(new IPEndPoint(IPAddress.Loopback, 5002), 0); + + SetSiloStatus(silo1Address, SiloStatus.Active); + SetSiloStatus(silo2Address, SiloStatus.Active); + SetSiloStatus(silo3Address, SiloStatus.Active); + var silo1Manager = CreateManager(silo1Address); + var silo2Manager = CreateManager(silo2Address); + var silo3Manager = CreateManager(silo3Address); + + var date = DateTime.UtcNow; + + // Create two shards on the first silo + var shard1 = await silo1Manager.CreateShardAsync(date, date.AddHours(1), _testMetadata, CancellationToken.None); + var shard2 = await silo1Manager.CreateShardAsync(date, date.AddHours(2), _testMetadata, CancellationToken.None); + + // Mark the first silo as dead + SetSiloStatus(silo1Address, SiloStatus.Dead); + + // Concurrently try to assign shards from silo2 and silo3 + var assignTask2 = silo2Manager.AssignJobShardsAsync(DateTime.UtcNow.AddHours(3), CancellationToken.None); + var assignTask3 = silo3Manager.AssignJobShardsAsync(DateTime.UtcNow.AddHours(3), CancellationToken.None); + + await Task.WhenAll(assignTask2, assignTask3); + + var shards2 = await assignTask2; + var shards3 = await assignTask3; + + // Verify that only one silo was able to assign each shard (no duplicates) + var totalAssignments = shards2.Count + shards3.Count; + Assert.Equal(2, totalAssignments); + + var allAssignedShardIds = shards2.Select(s => s.Id).Concat(shards3.Select(s => s.Id)).ToList(); + Assert.Contains(shard1.Id, allAssignedShardIds); + Assert.Contains(shard2.Id, allAssignedShardIds); + Assert.Equal(2, allAssignedShardIds.Distinct().Count()); + } + + /// + /// Tests that shard metadata is correctly preserved and merged during ownership transfers. 
+ /// + public async Task ShardMetadataMerge() + { + // Initialize 2 silos with 2 managers + var silo1Address = SiloAddress.New(new IPEndPoint(IPAddress.Loopback, 5000), 0); + var silo2Address = SiloAddress.New(new IPEndPoint(IPAddress.Loopback, 5001), 0); + + SetSiloStatus(silo1Address, SiloStatus.Active); + SetSiloStatus(silo2Address, SiloStatus.Active); + var silo1Manager = CreateManager(silo1Address); + var silo2Manager = CreateManager(silo2Address); + + var date = DateTime.UtcNow; + + // Create a shard on silo1 with some metadata, then update the metadata and verify it is merged correctly + var customMetadata = new Dictionary + { + { "Environment", "Production" }, + { "TenantId", "tenant-123" } + }; + + var shard = await silo1Manager.CreateShardAsync(date, date.AddHours(1), customMetadata, CancellationToken.None); + Assert.NotNull(shard.Metadata); + Assert.All(customMetadata, kvp => + { + Assert.True(shard.Metadata.ContainsKey(kvp.Key)); + Assert.Equal(kvp.Value, shard.Metadata[kvp.Key]); + }); + + // Schedule a job to ensure shard persistence + await shard.TryScheduleJobAsync(GrainId.Create("type", "target1"), "job1", DateTime.UtcNow.AddSeconds(5), null, CancellationToken.None); + + SetSiloStatus(silo1Address, SiloStatus.Dead); + + // Take over the shard from silo2 and verify the metadata is preserved + var shards = await silo2Manager.AssignJobShardsAsync(DateTime.UtcNow.AddHours(1), CancellationToken.None); + Assert.Single(shards); + shard = shards[0]; + + Assert.NotNull(shard.Metadata); + Assert.All(customMetadata, kvp => + { + Assert.True(shard.Metadata.ContainsKey(kvp.Key)); + Assert.Equal(kvp.Value, shard.Metadata[kvp.Key]); + }); + } + + /// + /// Tests stopping shard processing and verifying jobs remain for reassignment. 
+    ///
+    /// <remarks>
+    /// Schedules four jobs, consumes and removes the first, stops at the second without removing it,
+    /// unregisters the shard, and checks that the next assignment pass returns that same shard.
+    /// </remarks>
+    public async Task StopProcessingShard()
+    {
+        var localAddress = SiloAddress.New(new IPEndPoint(IPAddress.Loopback, 5000), 0);
+        SetSiloStatus(localAddress, SiloStatus.Active);
+        var manager = CreateManager(localAddress);
+
+        var date = DateTime.UtcNow;
+        var shard1 = await manager.CreateShardAsync(date, date.AddYears(1), _testMetadata, CancellationToken.None);
+
+        // Schedule some jobs (deliberately not in due-time order)
+        await shard1.TryScheduleJobAsync(GrainId.Create("type", "target1"), "job1", DateTime.UtcNow.AddSeconds(5), null, CancellationToken.None);
+        await shard1.TryScheduleJobAsync(GrainId.Create("type", "target1"), "job3", DateTime.UtcNow.AddSeconds(10), null, CancellationToken.None);
+        await shard1.TryScheduleJobAsync(GrainId.Create("type", "target2"), "job2", DateTime.UtcNow.AddSeconds(6), null, CancellationToken.None);
+        await shard1.TryScheduleJobAsync(GrainId.Create("type", "target1"), "job4", DateTime.UtcNow.AddSeconds(15), null, CancellationToken.None);
+
+        var counter = 1;
+        // Disposed on exit so the timeout timer backing the source is released.
+        using var cts = new CancellationTokenSource(TimeSpan.FromSeconds(40));
+        await foreach (var jobCtx in shard1.ConsumeScheduledJobsAsync().WithCancellation(cts.Token))
+        {
+            Assert.Equal($"job{counter}", jobCtx.Job.Name);
+            if (counter == 2)
+            {
+                // Stop here: job2 and the later jobs stay in the shard.
+                break;
+            }
+
+            await shard1.RemoveJobAsync(jobCtx.Job.Id, CancellationToken.None);
+            counter++;
+        }
+
+        Assert.Equal(2, counter);
+        await manager.UnregisterShardAsync(shard1, CancellationToken.None);
+
+        // The shard still holds jobs, so it must be offered again.
+        var shards = await manager.AssignJobShardsAsync(DateTime.UtcNow.AddHours(1), CancellationToken.None);
+        Assert.Single(shards);
+        Assert.Equal(shard1.Id, shards[0].Id);
+    }
+
+    ///
+    /// Tests retrying a job with a new due time.
+    ///
+    /// <remarks>
+    /// Consumes a job once, reschedules it through <c>RetryJobLaterAsync</c>, then consumes it again and
+    /// checks that the due time seen on the second pass differs from the originally scheduled one.
+    /// </remarks>
+    public async Task RetryJobLater()
+    {
+        var localAddress = SiloAddress.New(new IPEndPoint(IPAddress.Loopback, 5000), 0);
+        SetSiloStatus(localAddress, SiloStatus.Active);
+        var manager = CreateManager(localAddress);
+        var date = DateTime.UtcNow;
+        var shard1 = await manager.CreateShardAsync(date, date.AddYears(1), _testMetadata, CancellationToken.None);
+
+        // Schedule a job
+        var job = await shard1.TryScheduleJobAsync(GrainId.Create("type", "target1"), "job1", DateTime.UtcNow.AddSeconds(1), null, CancellationToken.None);
+
+        // One 40 second budget covers both consumption passes; disposed on exit to release its timer.
+        using var cts = new CancellationTokenSource(TimeSpan.FromSeconds(40));
+        await foreach (var jobCtx in shard1.ConsumeScheduledJobsAsync().WithCancellation(cts.Token))
+        {
+            Assert.Equal("job1", jobCtx.Job.Name);
+            var newDueTime = DateTimeOffset.UtcNow.AddSeconds(1);
+            await shard1.RetryJobLaterAsync(jobCtx, newDueTime, CancellationToken.None);
+            break;
+        }
+
+        // Consume again
+        await foreach (var jobCtx in shard1.ConsumeScheduledJobsAsync().WithCancellation(cts.Token))
+        {
+            Assert.Equal("job1", jobCtx.Job.Name);
+            Assert.NotEqual(job.DueTime, jobCtx.Job.DueTime);
+            await shard1.RemoveJobAsync(jobCtx.Job.Id, CancellationToken.None);
+            break;
+        }
+
+        await manager.UnregisterShardAsync(shard1, CancellationToken.None);
+    }
+
+    ///
+    /// Tests job cancellation before and during processing.
+    ///
+    /// <remarks>
+    /// Schedules four jobs, removes one before consumption starts and one while consuming, and checks
+    /// that only the other two are delivered. The owning silo is then marked dead and a second silo
+    /// takes the shard over to confirm it yields no jobs.
+    /// </remarks>
+    public async Task JobCancellation()
+    {
+        // Initialize 2 silos with two managers
+        var silo1Address = SiloAddress.New(new IPEndPoint(IPAddress.Loopback, 5000), 0);
+        var silo2Address = SiloAddress.New(new IPEndPoint(IPAddress.Loopback, 5001), 0);
+
+        SetSiloStatus(silo1Address, SiloStatus.Active);
+        SetSiloStatus(silo2Address, SiloStatus.Active);
+        var silo1Manager = CreateManager(silo1Address);
+        var silo2Manager = CreateManager(silo2Address);
+
+        var date = DateTime.UtcNow;
+        var shard = await silo1Manager.CreateShardAsync(date, date.AddYears(1), _testMetadata, CancellationToken.None);
+
+        // Schedule multiple jobs in a single shard; only job2 and job4 are referenced again below
+        await shard.TryScheduleJobAsync(GrainId.Create("type", "target1"), "job1", DateTime.UtcNow.AddMilliseconds(500), null, CancellationToken.None);
+        var job2 = await shard.TryScheduleJobAsync(GrainId.Create("type", "target2"), "job2", DateTime.UtcNow.AddMilliseconds(1000), null, CancellationToken.None);
+        await shard.TryScheduleJobAsync(GrainId.Create("type", "target3"), "job3", DateTime.UtcNow.AddMilliseconds(1500), null, CancellationToken.None);
+        var job4 = await shard.TryScheduleJobAsync(GrainId.Create("type", "target4"), "job4", DateTime.UtcNow.AddMilliseconds(2000), null, CancellationToken.None);
+
+        // Cancel job2 before processing starts
+        await shard.RemoveJobAsync(job2.Id, CancellationToken.None);
+
+        // Start consuming jobs
+        var consumedJobs = new List<string>();
+        using var consumeCts = new CancellationTokenSource(TimeSpan.FromSeconds(30));
+
+        await foreach (var jobCtx in shard.ConsumeScheduledJobsAsync().WithCancellation(consumeCts.Token))
+        {
+            consumedJobs.Add(jobCtx.Job.Name);
+
+            // Cancel job4 during processing (after job1 is consumed)
+            if (jobCtx.Job.Name == "job1")
+            {
+                await shard.RemoveJobAsync(job4.Id, CancellationToken.None);
+            }
+
+            await shard.RemoveJobAsync(jobCtx.Job.Id, CancellationToken.None);
+
+            if (consumedJobs.Count >= 2)
+            {
+                break;
+            }
+        }
+
+        // Verify that only job1 and job3 were consumed (job2 was cancelled before consumption, job4 was cancelled during)
+        Assert.Equal(2, consumedJobs.Count);
+        Assert.Contains("job1", consumedJobs);
+        Assert.Contains("job3", consumedJobs);
+        Assert.DoesNotContain("job2", consumedJobs);
+        Assert.DoesNotContain("job4", consumedJobs);
+
+        // Mark the shard owner silo as dead and reassign to verify cancelled jobs are not in storage
+        SetSiloStatus(silo1Address, SiloStatus.Dead);
+
+        var shards = await silo2Manager.AssignJobShardsAsync(DateTime.UtcNow.AddHours(1), CancellationToken.None);
+        Assert.Single(shards);
+        shard = shards[0];
+
+        // NOTE(review): with no jobs left this loop presumably relies on the enumeration ending by itself;
+        // if it does not, the 5 second token cancels it before the assertion below runs. Confirm.
+        var hasJobs = false;
+        using var drainCts = new CancellationTokenSource(TimeSpan.FromSeconds(5));
+        await foreach (var _ in shard.ConsumeScheduledJobsAsync().WithCancellation(drainCts.Token))
+        {
+            hasJobs = true;
+            break;
+        }
+
+        Assert.False(hasJobs);
+        await silo2Manager.UnregisterShardAsync(shard, CancellationToken.None);
+    }
+
+    /// <summary>
+    /// Tests that multiple shard registrations with the same time range produce unique IDs.
+    /// </summary>
+    public async Task ShardRegistrationRetry_IdCollisions()
+    {
+        var localAddress = SiloAddress.New(new IPEndPoint(IPAddress.Loopback, 5000), 0);
+        SetSiloStatus(localAddress, SiloStatus.Active);
+
+        var manager = CreateManager(localAddress);
+
+        var date = DateTime.UtcNow;
+
+        // Three requests with an identical time range.
+        var shard1 = await manager.CreateShardAsync(date, date.AddHours(1), _testMetadata, CancellationToken.None);
+        var shard2 = await manager.CreateShardAsync(date, date.AddHours(1), _testMetadata, CancellationToken.None);
+        var shard3 = await manager.CreateShardAsync(date, date.AddHours(1), _testMetadata, CancellationToken.None);
+
+        Assert.Distinct([shard1.Id, shard2.Id, shard3.Id]);
+    }
+
+    ///
+    /// Tests that unregistering a shard with remaining jobs preserves the shard for reassignment.
+    ///
+    /// <remarks>
+    /// Schedules two jobs on a shard, unregisters the shard while both are still pending, marks the
+    /// owning silo dead, and checks that a second silo is assigned the same shard and receives both jobs.
+    /// </remarks>
+    public async Task UnregisterShard_WithJobsRemaining()
+    {
+        // Initialize 2 silos with 2 managers
+        var silo1Address = SiloAddress.New(new IPEndPoint(IPAddress.Loopback, 5000), 0);
+        var silo2Address = SiloAddress.New(new IPEndPoint(IPAddress.Loopback, 5001), 0);
+
+        SetSiloStatus(silo1Address, SiloStatus.Active);
+        SetSiloStatus(silo2Address, SiloStatus.Active);
+        var silo1Manager = CreateManager(silo1Address);
+        var silo2Manager = CreateManager(silo2Address);
+
+        // Create a shard on silo1, schedule some jobs, then unregister the shard
+        var date = DateTime.UtcNow;
+        var shard = await silo1Manager.CreateShardAsync(date, date.AddHours(1), _testMetadata, CancellationToken.None);
+
+        await shard.TryScheduleJobAsync(GrainId.Create("type", "target1"), "job1", DateTime.UtcNow.AddSeconds(1), null, CancellationToken.None);
+        await shard.TryScheduleJobAsync(GrainId.Create("type", "target2"), "job2", DateTime.UtcNow.AddSeconds(2), null, CancellationToken.None);
+
+        await silo1Manager.UnregisterShardAsync(shard, CancellationToken.None);
+
+        // The shard should NOT have been deleted since there were jobs remaining
+        SetSiloStatus(silo1Address, SiloStatus.Dead);
+
+        // Take over the shard from silo2 and consume the jobs
+        var shards = await silo2Manager.AssignJobShardsAsync(DateTime.UtcNow.AddHours(1), CancellationToken.None);
+        Assert.Single(shards);
+        Assert.Equal(shard.Id, shards[0].Id);
+
+        var consumedJobs = new List<string>();
+        // Disposed on exit so the timeout timer backing the source is released.
+        using var cts = new CancellationTokenSource(TimeSpan.FromSeconds(20));
+
+        // NOTE(review): the loop has no explicit exit, so it presumably relies on the enumeration ending
+        // once the shard is drained; otherwise the 20 second token cancels it before the assertions. Confirm.
+        await foreach (var jobCtx in shards[0].ConsumeScheduledJobsAsync().WithCancellation(cts.Token))
+        {
+            consumedJobs.Add(jobCtx.Job.Name);
+            await shards[0].RemoveJobAsync(jobCtx.Job.Id, CancellationToken.None);
+        }
+
+        Assert.Equal(2, consumedJobs.Count);
+        Assert.Contains("job1", consumedJobs);
+        Assert.Contains("job2", consumedJobs);
+        await silo2Manager.UnregisterShardAsync(shards[0], CancellationToken.None);
+    }
+
+    ///
+    /// Simple implementation of <see cref="ILocalSiloDetails"/> for testing.
+    ///
+    private sealed class TestLocalSiloDetails : ILocalSiloDetails
+    {
+        public TestLocalSiloDetails(SiloAddress siloAddress)
+        {
+            SiloAddress = siloAddress;
+        }
+
+        // Name and DNS host name are both derived from the silo address string.
+        public string Name => SiloAddress.ToString();
+
+        public string ClusterId => "TestCluster";
+
+        public string DnsHostName => SiloAddress.ToString();
+
+        public SiloAddress SiloAddress { get; }
+
+        // The gateway address is reported as the silo address itself.
+        public SiloAddress GatewayAddress => SiloAddress;
+    }
+
+    /// <summary>
+    /// Simple in-memory implementation of <see cref="IClusterMembershipService"/> for testing.
+    /// </summary>
+    /// <remarks>
+    /// Silo statuses are held in a dictionary and the membership version is incremented on every status
+    /// change. <see cref="MembershipUpdates"/> and <see cref="TryKill"/> are not implemented.
+    /// </remarks>
+    private sealed class InMemoryClusterMembershipService : IClusterMembershipService
+    {
+        private readonly Dictionary<SiloAddress, ClusterMember> _silos = new();
+        private int _version;
+
+        /// <summary>Builds a new snapshot from the current dictionary contents on every read.</summary>
+        public ClusterMembershipSnapshot CurrentSnapshot =>
+            new ClusterMembershipSnapshot(_silos.ToImmutableDictionary(), new MembershipVersion(_version));
+
+        public IAsyncEnumerable<ClusterMembershipSnapshot> MembershipUpdates => throw new NotImplementedException();
+
+        /// <summary>Adds or overwrites the entry for <paramref name="address"/> and increments the version.</summary>
+        public void SetSiloStatus(SiloAddress address, SiloStatus status)
+        {
+            _silos[address] = new ClusterMember(address, status, address.ToParsableString());
+            _version++;
+        }
+
+        /// <summary>Completes immediately without doing any work.</summary>
+        public ValueTask Refresh(MembershipVersion minimumVersion = default, CancellationToken cancellationToken = default) =>
+            ValueTask.CompletedTask;
+
+        public Task<bool> TryKill(SiloAddress siloAddress) => throw new NotImplementedException();
+    }
+}
diff --git a/test/Tester/ScheduledJobs/ScheduledJobTestsRunner.cs b/test/Tester/ScheduledJobs/ScheduledJobTestsRunner.cs
new file mode 100644
index 00000000000..530a90f6262
--- /dev/null
+++ b/test/Tester/ScheduledJobs/ScheduledJobTestsRunner.cs
@@ -0,0 +1,329 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Threading.Tasks;
+using Orleans;
+using Orleans.Internal;
+using Orleans.ScheduledJobs;
+using Xunit;
+
+namespace Tester.ScheduledJobs;
+
+///
+/// Contains the test logic for scheduled jobs that can be run against different providers.
+/// This class is provider-agnostic and can be reused by test classes for InMemory, Azure, and other providers.
+///
+public class ScheduledJobTestsRunner
+{
+    // NOTE(review): generic type arguments appear to have been lost from this text (for example on
+    // GetGrain(...), "new Dictionary" and "new List>()"). Restore them from the original file before
+    // relying on this code; they are left untouched here because they cannot be inferred reliably.
+
+    // Factory used by every test to obtain grain references.
+    private readonly IGrainFactory _grainFactory;
+
+    /// <summary>Creates a runner that resolves grains through <paramref name="grainFactory"/>.</summary>
+    public ScheduledJobTestsRunner(IGrainFactory grainFactory)
+    {
+        _grainFactory = grainFactory;
+    }
+
+    /// <summary>
+    /// Schedules five jobs plus one that is cancelled straight away, waits up to 10 seconds for each of
+    /// the five to run, and asserts that the cancelled job is not reported as run.
+    /// </summary>
+    public async Task ScheduledJobGrain()
+    {
+        var grain = _grainFactory.GetGrain("test-job-grain");
+        var dueTime = DateTimeOffset.UtcNow.AddSeconds(5);
+        var job1 = await grain.ScheduleJobAsync("TestJob", dueTime);
+        Assert.NotNull(job1);
+        Assert.Equal("TestJob", job1.Name);
+        Assert.Equal(dueTime, job1.DueTime);
+        var job2 = await grain.ScheduleJobAsync("TestJob2", dueTime);
+        var job3 = await grain.ScheduleJobAsync("TestJob3", dueTime.AddSeconds(4));
+        var job4 = await grain.ScheduleJobAsync("TestJob4", dueTime);
+        var job5 = await grain.ScheduleJobAsync("TestJob5", dueTime.AddSeconds(1));
+        var canceledJob = await grain.ScheduleJobAsync("CanceledJob", dueTime.AddSeconds(2));
+        Assert.True(await grain.TryCancelJobAsync(canceledJob));
+        // Wait for the job to run
+        foreach (var job in new[] { job1, job2, job3, job4, job5 })
+        {
+            try
+            {
+                await grain.WaitForJobToRun(job.Id).WithTimeout(TimeSpan.FromSeconds(10));
+            }
+            catch (TimeoutException)
+            {
+                // Convert the timeout into a test failure that names the job.
+                Assert.Fail($"The scheduled job {job.Name} did not run within the expected time.");
+            }
+        }
+        // Verify the canceled job did not run
+        Assert.False(await grain.HasJobRan(canceledJob.Id));
+    }
+
+    /// <summary>
+    /// Schedules three jobs two seconds apart and asserts that their recorded execution times are
+    /// strictly increasing.
+    /// </summary>
+    public async Task JobExecutionOrder()
+    {
+        var grain = _grainFactory.GetGrain("test-execution-order");
+        var baseTime = DateTimeOffset.UtcNow.AddSeconds(2);
+
+        var job1 = await grain.ScheduleJobAsync("FirstJob", baseTime);
+        var job2 = await grain.ScheduleJobAsync("SecondJob", baseTime.AddSeconds(2));
+        var job3 = await grain.ScheduleJobAsync("ThirdJob", baseTime.AddSeconds(4));
+
+        await grain.WaitForJobToRun(job1.Id).WithTimeout(TimeSpan.FromSeconds(10));
+        await grain.WaitForJobToRun(job2.Id).WithTimeout(TimeSpan.FromSeconds(10));
+        await grain.WaitForJobToRun(job3.Id).WithTimeout(TimeSpan.FromSeconds(10));
+
+        var time1 = await grain.GetJobExecutionTime(job1.Id);
+        var time2 = await grain.GetJobExecutionTime(job2.Id);
+        var time3 = await grain.GetJobExecutionTime(job3.Id);
+
+        Assert.True(time1 < time2, $"Job1 executed at {time1}, Job2 at {time2}");
+        Assert.True(time2 < time3, $"Job2 executed at {time2}, Job3 at {time3}");
+    }
+
+    /// <summary>
+    /// Schedules a job whose due time is five seconds in the past and asserts that it runs within
+    /// five seconds.
+    /// </summary>
+    public async Task PastDueTime()
+    {
+        var grain = _grainFactory.GetGrain("test-past-due");
+        var pastTime = DateTimeOffset.UtcNow.AddSeconds(-5);
+
+        var job = await grain.ScheduleJobAsync("PastDueJob", pastTime);
+        Assert.NotNull(job);
+
+        await grain.WaitForJobToRun(job.Id).WithTimeout(TimeSpan.FromSeconds(5));
+        Assert.True(await grain.HasJobRan(job.Id));
+    }
+
+    /// <summary>
+    /// Schedules a job with three metadata entries, asserts they are present on the returned job, and
+    /// asserts that the <c>UserId</c> entry is visible in the job context after the job has run.
+    /// </summary>
+    public async Task JobWithMetadata()
+    {
+        var grain = _grainFactory.GetGrain("test-metadata");
+        var dueTime = DateTimeOffset.UtcNow.AddSeconds(3);
+        var metadata = new Dictionary
+        {
+            ["UserId"] = "user123",
+            ["Action"] = "SendEmail",
+            ["Priority"] = "High"
+        };
+
+        var job = await grain.ScheduleJobAsync("MetadataJob", dueTime, metadata);
+        Assert.NotNull(job);
+        Assert.NotNull(job.Metadata);
+        Assert.Equal(3, job.Metadata.Count);
+        Assert.Equal("user123", job.Metadata["UserId"]);
+        Assert.Equal("SendEmail", job.Metadata["Action"]);
+        Assert.Equal("High", job.Metadata["Priority"]);
+
+        await grain.WaitForJobToRun(job.Id).WithTimeout(TimeSpan.FromSeconds(10));
+
+        var context = await grain.GetJobContext(job.Id);
+        Assert.NotNull(context);
+        Assert.NotNull(context.Job.Metadata);
+        Assert.Equal("user123", context.Job.Metadata["UserId"]);
+    }
+
+    /// <summary>
+    /// Schedules one job on each of three grains and asserts that each grain reports its own job as
+    /// run and does not report a job that was scheduled on one of the other grains.
+    /// </summary>
+    public async Task MultipleGrains()
+    {
+        var grain1 = _grainFactory.GetGrain("test-grain-1");
+        var grain2 = _grainFactory.GetGrain("test-grain-2");
+        var grain3 = _grainFactory.GetGrain("test-grain-3");
+        var dueTime = DateTimeOffset.UtcNow.AddSeconds(3);
+
+        var job1 = await grain1.ScheduleJobAsync("Job1", dueTime);
+        var job2 = await grain2.ScheduleJobAsync("Job2", dueTime);
+        var job3 = await grain3.ScheduleJobAsync("Job3", dueTime);
+
+        await grain1.WaitForJobToRun(job1.Id).WithTimeout(TimeSpan.FromSeconds(10));
+        await grain2.WaitForJobToRun(job2.Id).WithTimeout(TimeSpan.FromSeconds(10));
+        await grain3.WaitForJobToRun(job3.Id).WithTimeout(TimeSpan.FromSeconds(10));
+
+        Assert.True(await grain1.HasJobRan(job1.Id));
+        Assert.True(await grain2.HasJobRan(job2.Id));
+        Assert.True(await grain3.HasJobRan(job3.Id));
+
+        // Cross-check: each grain is asked about a job that belongs to a different grain.
+        Assert.False(await grain1.HasJobRan(job2.Id));
+        Assert.False(await grain2.HasJobRan(job3.Id));
+        Assert.False(await grain3.HasJobRan(job1.Id));
+    }
+
+    /// <summary>
+    /// Schedules three jobs that share one name and asserts that they receive distinct ids, keep the
+    /// shared name, and all run.
+    /// </summary>
+    public async Task DuplicateJobNames()
+    {
+        var grain = _grainFactory.GetGrain("test-duplicate-names");
+        var dueTime = DateTimeOffset.UtcNow.AddSeconds(3);
+
+        var job1 = await grain.ScheduleJobAsync("SameName", dueTime);
+        var job2 = await grain.ScheduleJobAsync("SameName", dueTime.AddSeconds(1));
+        var job3 = await grain.ScheduleJobAsync("SameName", dueTime.AddSeconds(2));
+
+        Assert.NotEqual(job1.Id, job2.Id);
+        Assert.NotEqual(job2.Id, job3.Id);
+        Assert.NotEqual(job1.Id, job3.Id);
+
+        Assert.Equal("SameName", job1.Name);
+        Assert.Equal("SameName", job2.Name);
+        Assert.Equal("SameName", job3.Name);
+
+        await grain.WaitForJobToRun(job1.Id).WithTimeout(TimeSpan.FromSeconds(10));
+        await grain.WaitForJobToRun(job2.Id).WithTimeout(TimeSpan.FromSeconds(10));
+        await grain.WaitForJobToRun(job3.Id).WithTimeout(TimeSpan.FromSeconds(10));
+
+        Assert.True(await grain.HasJobRan(job1.Id));
+        Assert.True(await grain.HasJobRan(job2.Id));
+        Assert.True(await grain.HasJobRan(job3.Id));
+    }
+
+    /// <summary>
+    /// Builds a job object whose id was never scheduled (reusing the shard id and target grain of a
+    /// real job) and asserts that cancelling it returns false and that it is not reported as run.
+    /// </summary>
+    public async Task CancelNonExistentJob()
+    {
+        var grain = _grainFactory.GetGrain("test-cancel-nonexistent");
+        var dueTime = DateTimeOffset.UtcNow.AddSeconds(10);
+
+        var job = await grain.ScheduleJobAsync("RealJob", dueTime);
+
+        var fakeJob = new ScheduledJob
+        {
+            Id = "non-existent-id",
+            Name = "FakeJob",
+            DueTime = dueTime,
+            ShardId = job.ShardId,
+            TargetGrainId = job.TargetGrainId
+        };
+
+        var cancelResult = await grain.TryCancelJobAsync(fakeJob);
+        Assert.False(cancelResult);
+
+        await Task.Delay(100);
+        Assert.False(await grain.HasJobRan(fakeJob.Id));
+    }
+
+    /// <summary>
+    /// Waits for a job to run and then asserts that a subsequent cancellation attempt returns false.
+    /// </summary>
+    public async Task CancelAlreadyExecutedJob()
+    {
+        var grain = _grainFactory.GetGrain("test-cancel-executed");
+        var dueTime = DateTimeOffset.UtcNow.AddSeconds(2);
+
+        var job = await grain.ScheduleJobAsync("QuickJob", dueTime);
+
+        await grain.WaitForJobToRun(job.Id).WithTimeout(TimeSpan.FromSeconds(10));
+        Assert.True(await grain.HasJobRan(job.Id));
+
+        var cancelResult = await grain.TryCancelJobAsync(job);
+        Assert.False(cancelResult);
+    }
+
+    /// <summary>
+    /// Issues 20 scheduling calls without awaiting between them, with due times 100 ms apart, and
+    /// asserts that all jobs receive unique ids and all run within 15 seconds.
+    /// </summary>
+    public async Task ConcurrentScheduling()
+    {
+        var grain = _grainFactory.GetGrain("test-concurrent");
+        var baseTime = DateTimeOffset.UtcNow.AddSeconds(5);
+        var jobCount = 20;
+
+        var scheduleTasks = new List>();
+        for (int i = 0; i < jobCount; i++)
+        {
+            scheduleTasks.Add(grain.ScheduleJobAsync($"ConcurrentJob{i}", baseTime.AddMilliseconds(i * 100)));
+        }
+
+        var jobs = await Task.WhenAll(scheduleTasks);
+
+        Assert.Equal(jobCount, jobs.Length);
+        Assert.Equal(jobCount, jobs.Select(j => j.Id).Distinct().Count());
+
+        var waitTasks = jobs.Select(j => grain.WaitForJobToRun(j.Id).WithTimeout(TimeSpan.FromSeconds(15)));
+        await Task.WhenAll(waitTasks);
+
+        foreach (var job in jobs)
+        {
+            Assert.True(await grain.HasJobRan(job.Id), $"Job {job.Name} did not run");
+        }
+    }
+
+    /// <summary>
+    /// Asserts that a scheduled job exposes a non-empty id and shard id, the requested name, due time
+    /// and single metadata entry, and that the job context after execution carries the same id and
+    /// name together with a non-empty run id.
+    /// </summary>
+    public async Task JobPropertiesVerification()
+    {
+        var grain = _grainFactory.GetGrain("test-properties");
+        var dueTime = DateTimeOffset.UtcNow.AddSeconds(3);
+        var metadata = new Dictionary { ["Key"] = "Value" };
+
+        var job = await grain.ScheduleJobAsync("PropertyTestJob", dueTime, metadata);
+
+        Assert.NotNull(job.Id);
+        Assert.NotEmpty(job.Id);
+        Assert.Equal("PropertyTestJob", job.Name);
+        Assert.Equal(dueTime, job.DueTime);
+        Assert.NotNull(job.ShardId);
+        Assert.NotEmpty(job.ShardId);
+        Assert.NotNull(job.Metadata);
+        Assert.Single(job.Metadata);
+
+        await grain.WaitForJobToRun(job.Id).WithTimeout(TimeSpan.FromSeconds(10));
+
+        var context = await grain.GetJobContext(job.Id);
+        Assert.NotNull(context);
+        Assert.Equal(job.Id, context.Job.Id);
+        Assert.Equal(job.Name, context.Job.Name);
+        Assert.NotNull(context.RunId);
+        Assert.NotEmpty(context.RunId);
+    }
+
+    /// <summary>
+    /// Asserts that the context of a job that has run once reports a dequeue count of 1.
+    /// </summary>
+    public async Task DequeueCount()
+    {
+        var grain = _grainFactory.GetGrain("test-dequeue-count");
+        var dueTime = DateTimeOffset.UtcNow.AddSeconds(3);
+
+        var job = await grain.ScheduleJobAsync("DequeueTestJob", dueTime);
+
+        await grain.WaitForJobToRun(job.Id).WithTimeout(TimeSpan.FromSeconds(10));
+
+        var context = await grain.GetJobContext(job.Id);
+        Assert.NotNull(context);
+        Assert.Equal(1, context.DequeueCount);
+    }
+
+    /// <summary>
+    /// Has one grain schedule a job that targets a second grain and asserts that the second grain
+    /// reports the job as run and exposes a context with the same id and name.
+    /// </summary>
+    public async Task ScheduleJobOnAnotherGrain()
+    {
+        var schedulerGrain = _grainFactory.GetGrain("scheduler-grain");
+        var targetGrain = _grainFactory.GetGrain("target-grain");
+        var dueTime = DateTimeOffset.UtcNow.AddSeconds(3);
+
+        var job = await schedulerGrain.ScheduleJobOnAnotherGrainAsync("target-grain", "CrossGrainJob", dueTime);
+
+        Assert.NotNull(job);
+        Assert.Equal("CrossGrainJob", job.Name);
+        Assert.Equal(dueTime, job.DueTime);
+
+        await targetGrain.WaitForJobToRun(job.Id).WithTimeout(TimeSpan.FromSeconds(10));
+
+        Assert.True(await targetGrain.HasJobRan(job.Id));
+
+        var context = await targetGrain.GetJobContext(job.Id);
+        Assert.NotNull(context);
+        Assert.Equal(job.Id, context.Job.Id);
+        Assert.Equal("CrossGrainJob", context.Job.Name);
+    }
+
+    /// <summary>
+    /// Schedules a job carrying <c>FailUntilAttempt = "3"</c> metadata and asserts that it succeeds
+    /// after exactly three attempts, with dequeue counts 1, 2 and 3 recorded in order.
+    /// </summary>
+    public async Task JobRetry()
+    {
+        var grain = _grainFactory.GetGrain("retry-test-grain");
+        var dueTime = DateTimeOffset.UtcNow.AddSeconds(2);
+        var metadata = new Dictionary
+        {
+            ["FailUntilAttempt"] = "3"
+        };
+
+        var job = await grain.ScheduleJobAsync("RetryJob", dueTime, metadata);
+
+        Assert.NotNull(job);
+        Assert.Equal("RetryJob", job.Name);
+        Assert.NotNull(job.Metadata);
+        Assert.Equal("3", job.Metadata["FailUntilAttempt"]);
+
+        // Wait for the job to eventually succeed (with retries)
+        // Default retry policy: retry up to 5 times with exponential backoff (1s, 2s, 4s, 8s, 16s)
+        // We expect 3 attempts: fail at DequeueCount=1, fail at DequeueCount=2, succeed at DequeueCount=3
+        // Total time: ~2s (initial) + 1s (first retry delay) + 2s (second retry delay) = ~5s
+        await grain.WaitForJobToSucceed(job.Id).WithTimeout(TimeSpan.FromSeconds(15));
+
+        Assert.True(await grain.HasJobSucceeded(job.Id));
+
+        var attemptCount = await grain.GetJobExecutionAttemptCount(job.Id);
+        Assert.Equal(3, attemptCount);
+
+        var dequeueCountHistory = await grain.GetJobDequeueCountHistory(job.Id);
+        Assert.Equal(3, dequeueCountHistory.Count);
+        Assert.Equal(1, dequeueCountHistory[0]);
+        Assert.Equal(2, dequeueCountHistory[1]);
+        Assert.Equal(3, dequeueCountHistory[2]);
+
+        var finalContext = await grain.GetFinalJobContext(job.Id);
+        Assert.NotNull(finalContext);
+        Assert.Equal(3, finalContext.DequeueCount);
+        Assert.Equal(job.Id, finalContext.Job.Id);
+    }
+}