diff --git a/Orleans.slnx b/Orleans.slnx index 72c928450cf..7daec69935d 100644 --- a/Orleans.slnx +++ b/Orleans.slnx @@ -156,6 +156,7 @@ + diff --git a/src/Azure/Orleans.DurableJobs.AzureStorage/AzureStorageJobShard.Log.cs b/src/Azure/Orleans.DurableJobs.AzureStorage/AzureStorageJobShard.Log.cs deleted file mode 100644 index cd2ba74ec2a..00000000000 --- a/src/Azure/Orleans.DurableJobs.AzureStorage/AzureStorageJobShard.Log.cs +++ /dev/null @@ -1,109 +0,0 @@ -using System; -using Microsoft.Extensions.Logging; - -namespace Orleans.DurableJobs.AzureStorage; - -internal sealed partial class AzureStorageJobShard -{ - [LoggerMessage( - Level = LogLevel.Information, - Message = "Initializing shard '{ShardId}' from Azure Storage blob" - )] - private static partial void LogInitializingShard(ILogger logger, string shardId); - - [LoggerMessage( - Level = LogLevel.Information, - Message = "Shard '{ShardId}' initialized successfully. Loaded {JobCount} job(s) in {ElapsedMilliseconds}ms" - )] - private static partial void LogShardInitialized(ILogger logger, string shardId, int jobCount, long elapsedMilliseconds); - - [LoggerMessage( - Level = LogLevel.Debug, - Message = "Adding job '{JobId}' (Name: '{JobName}') to shard '{ShardId}' with due time {DueTime}" - )] - private static partial void LogAddingJob(ILogger logger, string jobId, string jobName, string shardId, DateTimeOffset dueTime); - - [LoggerMessage( - Level = LogLevel.Debug, - Message = "Removing job '{JobId}' from shard '{ShardId}'" - )] - private static partial void LogRemovingJob(ILogger logger, string jobId, string shardId); - - [LoggerMessage( - Level = LogLevel.Debug, - Message = "Retrying job '{JobId}' in shard '{ShardId}' with new due time {NewDueTime}" - )] - private static partial void LogRetryingJob(ILogger logger, string jobId, string shardId, DateTimeOffset newDueTime); - - [LoggerMessage( - Level = LogLevel.Trace, - Message = "Flushing batch of {OperationCount} job operation(s) to shard '{ShardId}'" - )] - private static partial void LogFlushingBatch(ILogger logger, int operationCount, string shardId); - - [LoggerMessage( - Level = LogLevel.Debug, - Message = "Batch of {OperationCount} job operation(s) written to shard '{ShardId}' in {ElapsedMilliseconds}ms. Total committed blocks: {CommittedBlockCount}" - )] - private static partial void LogBatchWritten(ILogger logger, int operationCount, string shardId, long elapsedMilliseconds, int committedBlockCount); - - [LoggerMessage( - Level = LogLevel.Trace, - Message = "Updating metadata for shard '{ShardId}'" - )] - private static partial void LogUpdatingMetadata(ILogger logger, string shardId); - - [LoggerMessage( - Level = LogLevel.Debug, - Message = "Metadata updated for shard '{ShardId}'" - )] - private static partial void LogMetadataUpdated(ILogger logger, string shardId); - - [LoggerMessage( - Level = LogLevel.Warning, - Message = "Shard '{ShardId}' has {CommittedBlockCount} committed blocks, approaching Azure Blob append limit of 50,000" - )] - private static partial void LogApproachingBlockLimit(ILogger logger, string shardId, int committedBlockCount); - - [LoggerMessage( - Level = LogLevel.Warning, - Message = "Large batch detected for shard '{ShardId}': {OperationCount} operations (max configured: {MaxBatchSize})" - )] - private static partial void LogLargeBatch(ILogger logger, string shardId, int operationCount, int maxBatchSize); - - [LoggerMessage( - Level = LogLevel.Error, - Message = "Error writing batch of {OperationCount} operation(s) to shard '{ShardId}'" - )] - private static partial void LogErrorWritingBatch(ILogger logger, Exception exception, int operationCount, string shardId); - - [LoggerMessage( - Level = LogLevel.Error, - Message = "Error updating metadata for shard '{ShardId}'" - )] - private static partial void LogErrorUpdatingMetadata(ILogger logger, Exception exception, string shardId); - - [LoggerMessage( - Level = LogLevel.Debug, - Message = "Stopping storage processor for shard '{ShardId}'" - )] - private static partial void LogStoppingProcessor(ILogger logger, string shardId); - - [LoggerMessage( - Level = LogLevel.Information, - Message = "Storage processor stopped for shard '{ShardId}'" - )] - private static partial void LogProcessorStopped(ILogger logger, string shardId); - - [LoggerMessage( - Level = LogLevel.Trace, - Message = "Processing storage operation queue for shard '{ShardId}'" - )] - private static partial void LogProcessingStorageQueue(ILogger logger, string shardId); - - [LoggerMessage( - Level = LogLevel.Debug, - Message = "Waiting for additional operations to batch (current size: {CurrentSize}, min size: {MinSize}) for shard '{ShardId}'" - )] - private static partial void LogWaitingForBatch(ILogger logger, int currentSize, int minSize, string shardId); -} diff --git a/src/Azure/Orleans.DurableJobs.AzureStorage/AzureStorageJobShard.cs b/src/Azure/Orleans.DurableJobs.AzureStorage/AzureStorageJobShard.cs deleted file mode 100644 index c5184ccc911..00000000000 --- a/src/Azure/Orleans.DurableJobs.AzureStorage/AzureStorageJobShard.cs +++ /dev/null @@ -1,394 +0,0 @@ -using System; -using System.Buffers; -using System.Collections.Generic; -using System.Diagnostics; -using System.IO; -using System.Text; -using System.Text.Json; -using System.Threading; -using System.Threading.Channels; -using System.Threading.Tasks; -using System.Transactions; -using Azure; -using Azure.Storage.Blobs; -using Azure.Storage.Blobs.Models; -using Azure.Storage.Blobs.Specialized; -using Microsoft.Extensions.Logging; -using Orleans.Hosting; -using Orleans.Runtime; -using Orleans.Serialization.Buffers.Adaptors; - -namespace Orleans.DurableJobs.AzureStorage; - -internal sealed partial class AzureStorageJobShard : JobShard -{ - private readonly Channel _storageOperationChannel; - private readonly Task _storageProcessorTask; - private readonly CancellationTokenSource _shutdownCts = new(); - private readonly AzureStorageJobShardOptions _options; - private readonly ILogger _logger; - - internal AppendBlobClient BlobClient { get; init; } - internal ETag? ETag { get; private set; } - internal int CommitedBlockCount { get; private set; } - - public AzureStorageJobShard(string id, DateTimeOffset startTime, DateTimeOffset endTime, AppendBlobClient blobClient, IDictionary? metadata, ETag? eTag, AzureStorageJobShardOptions options, ILogger logger) - : base(id, startTime, endTime) - { - BlobClient = blobClient; - ETag = eTag; - Metadata = metadata; - _options = options; - _logger = logger; - - // Create unbounded channel for storage operations - _storageOperationChannel = Channel.CreateUnbounded(new UnboundedChannelOptions - { - SingleReader = true, - SingleWriter = false - }); - - // Start the background task that processes storage operations - _storageProcessorTask = ProcessStorageOperationsAsync(); - } - - protected override async Task PersistAddJobAsync(string jobId, string jobName, DateTimeOffset dueTime, GrainId target, IReadOnlyDictionary? metadata, CancellationToken cancellationToken) - { - LogAddingJob(_logger, jobId, jobName, Id, dueTime); - var operation = JobOperation.CreateAddOperation(jobId, jobName, dueTime, target, metadata); - await EnqueueStorageOperationAsync(StorageOperation.CreateAppendOperation(operation), cancellationToken); - } - - protected override async Task PersistRemoveJobAsync(string jobId, CancellationToken cancellationToken) - { - LogRemovingJob(_logger, jobId, Id); - var operation = JobOperation.CreateRemoveOperation(jobId); - await EnqueueStorageOperationAsync(StorageOperation.CreateAppendOperation(operation), cancellationToken); - } - - protected override async Task PersistRetryJobAsync(string jobId, DateTimeOffset newDueTime, CancellationToken cancellationToken) - { - LogRetryingJob(_logger, jobId, Id, newDueTime); - var operation = JobOperation.CreateRetryOperation(jobId, newDueTime); - await EnqueueStorageOperationAsync(StorageOperation.CreateAppendOperation(operation), cancellationToken); - } - - public async Task UpdateBlobMetadata(IDictionary metadata, CancellationToken cancellationToken) - { - LogUpdatingMetadata(_logger, Id); - await EnqueueStorageOperationAsync(StorageOperation.CreateMetadataOperation(metadata), cancellationToken); - } - - public async ValueTask InitializeAsync(CancellationToken cancellationToken) - { - LogInitializingShard(_logger, Id); - var sw = Stopwatch.StartNew(); - - // Load existing blob - var response = await BlobClient.DownloadAsync(cancellationToken: cancellationToken); - using var stream = response.Value.Content; - - // Rebuild state by replaying operations - var addedJobs = new Dictionary(); - var deletedJobs = new HashSet(); - var jobRetryCounters = new Dictionary(); - - await foreach (var operation in NetstringJsonSerializer.DecodeAsync(stream, JobOperationJsonContext.Default.JobOperation, cancellationToken)) - { - switch (operation.Type) - { - case JobOperation.OperationType.Add: - if (!deletedJobs.Contains(operation.Id)) - { - addedJobs[operation.Id] = operation; - } - break; - case JobOperation.OperationType.Remove: - deletedJobs.Add(operation.Id); - addedJobs.Remove(operation.Id); - jobRetryCounters.Remove(operation.Id); - break; - case JobOperation.OperationType.Retry: - if (!deletedJobs.Contains(operation.Id)) - { - if (!jobRetryCounters.ContainsKey(operation.Id)) - { - jobRetryCounters[operation.Id] = (1, operation.DueTime); - } - else - { - var entry = jobRetryCounters[operation.Id]; - jobRetryCounters[operation.Id] = (entry.dequeueCount + 1, operation.DueTime); - } - } - break; - } - } - - // Rebuild the priority queue - foreach (var op in addedJobs.Values) - { - var retryCounter = 0; - var dueTime = op.DueTime!.Value; - if (jobRetryCounters.TryGetValue(op.Id, out var retryEntries)) - { - retryCounter = retryEntries.dequeueCount; - dueTime = retryEntries.newDueTime ?? dueTime; - } - - EnqueueJob(new DurableJob - { - Id = op.Id, - Name = op.Name!, - DueTime = dueTime, - TargetGrainId = op.TargetGrainId!.Value, - ShardId = Id, - Metadata = op.Metadata, - }, - retryCounter); - } - - ETag = response.Value.Details.ETag; - - sw.Stop(); - LogShardInitialized(_logger, Id, addedJobs.Count, sw.ElapsedMilliseconds); - } - - private async Task EnqueueStorageOperationAsync(StorageOperation operation, CancellationToken cancellationToken) - { - await _storageOperationChannel.Writer.WriteAsync(operation, cancellationToken); - await operation.CompletionSource.Task; - } - - private async Task ProcessStorageOperationsAsync() - { - await Task.CompletedTask.ConfigureAwait(ConfigureAwaitOptions.ContinueOnCapturedContext | ConfigureAwaitOptions.ForceYielding); - - var cancellationToken = _shutdownCts.Token; - // TODO: AppendBlob has a limit of 50,000 blocks. Implement blob rotation when this limit is approached. - var batchOperations = new List(_options.MaxBatchSize); - - try - { - while (await _storageOperationChannel.Reader.WaitToReadAsync(cancellationToken)) - { - // Read first operation - if (!_storageOperationChannel.Reader.TryRead(out var firstOperation)) - { - continue; - } - - // Handle metadata operations immediately (cannot be batched) - if (firstOperation.Type is StorageOperationType.UpdateMetadata) - { - try - { - await UpdateMetadataAsync(firstOperation.Metadata!, cancellationToken); - LogMetadataUpdated(_logger, Id); - firstOperation.CompletionSource.TrySetResult(); - } - catch (Exception ex) - { - LogErrorUpdatingMetadata(_logger, ex, Id); - firstOperation.CompletionSource?.TrySetException(ex); - } - continue; - } - - // Collect job operations for batching - batchOperations.Add(firstOperation); - - // Try to collect more operations up to the maximum batch size - if (TryCollectJobOperationsForBatch(batchOperations)) - { - // Not enough operations to meet the minimum batch size, wait for more or timeout - if (batchOperations.Count < _options.MinBatchSize) - { - LogWaitingForBatch(_logger, batchOperations.Count, _options.MinBatchSize, Id); - } - await Task.Delay(_options.BatchFlushInterval, cancellationToken); - TryCollectJobOperationsForBatch(batchOperations); - } - - // Process the batch of job operations - if (batchOperations.Count > 0) - { - try - { - LogFlushingBatch(_logger, batchOperations.Count, Id); - await AppendJobOperationBatchAsync(batchOperations, cancellationToken); - - // Mark all operations as completed - foreach (var op in batchOperations) - { - op.CompletionSource.TrySetResult(); - } - } - catch (Exception ex) - { - LogErrorWritingBatch(_logger, ex, batchOperations.Count, Id); - - // Mark all operations as failed - foreach (var op in batchOperations) - { - op.CompletionSource?.TrySetException(ex); - } - } - finally - { - batchOperations.Clear(); - } - } - } - } - catch (OperationCanceledException) - { - // Ignore - } - finally - { - // Expected during shutdown - cancel all pending operations - while (_storageOperationChannel.Reader.TryRead(out var operation)) - { - operation.CompletionSource?.TrySetCanceled(cancellationToken); - } - } - - // Local function to collect job operations for batching. Returns true if more operations can be collected. - bool TryCollectJobOperationsForBatch(List batchOperations) - { - // Collect more jobs, up to a maximum batch size - while (batchOperations.Count < _options.MaxBatchSize && _storageOperationChannel.Reader.TryPeek(out var nextOperation)) - { - if (nextOperation.Type is StorageOperationType.UpdateMetadata) - { - // Stop batching if we encounter a metadata operation - return false; - } - _storageOperationChannel.Reader.TryRead(out var operation); - Debug.Assert(operation != null); - batchOperations.Add(operation!); - } - return batchOperations.Count != _options.MaxBatchSize; - } - } - - private async Task AppendJobOperationBatchAsync(List operations, CancellationToken cancellationToken) - { - var sw = Stopwatch.StartNew(); - using var stream = PooledBufferStream.Rent(); - try - { - stream.Position = 0; // TODO Remove that once PooledBufferStream fixed - - // Encode all job operations into a single stream - foreach (var operation in operations) - { - NetstringJsonSerializer.Encode(operation.JobOperation!.Value, stream, JobOperationJsonContext.Default.JobOperation); - } - stream.Position = 0; - var result = await BlobClient.AppendBlockAsync( - stream, - new AppendBlobAppendBlockOptions { Conditions = new AppendBlobRequestConditions { IfMatch = ETag } }, - cancellationToken); - ETag = result.Value.ETag; - CommitedBlockCount = result.Value.BlobCommittedBlockCount; - - sw.Stop(); - LogBatchWritten(_logger, operations.Count, Id, sw.ElapsedMilliseconds, CommitedBlockCount); - - // Warn if approaching the 50,000 block limit (warn at 80%) - if (CommitedBlockCount > 40000) - { - LogApproachingBlockLimit(_logger, Id, CommitedBlockCount); - } - - // Warn if batch is unusually large - if (operations.Count > _options.MaxBatchSize * 0.8) - { - LogLargeBatch(_logger, Id, operations.Count, _options.MaxBatchSize); - } - } - finally - { - PooledBufferStream.Return(stream); - } - } - - private async Task UpdateMetadataAsync(IDictionary metadata, CancellationToken cancellationToken) - { - var result = await BlobClient.SetMetadataAsync( - metadata, - new BlobRequestConditions { IfMatch = ETag }, - cancellationToken); - ETag = result.Value.ETag; - Metadata = metadata; - } - - /// - /// Stops the background storage processor and waits for all pending operations to complete. - /// After calling this method, no new storage operations can be enqueued. - /// This method is idempotent and can be called multiple times safely. - /// - internal async Task StopProcessorAsync(CancellationToken cancellationToken) - { - LogStoppingProcessor(_logger, Id); - - // Complete the channel to stop accepting new operations (idempotent operation) - if (_storageOperationChannel.Writer.TryComplete()) - { - _shutdownCts.Cancel(); - } - - // Wait for the background processor to finish all pending operations - try - { - await _storageProcessorTask.WaitAsync(cancellationToken); - LogProcessorStopped(_logger, Id); - } - catch (OperationCanceledException) - { - // Expected during normal shutdown - LogProcessorStopped(_logger, Id); - } - } - - public override async ValueTask DisposeAsync() - { - await StopProcessorAsync(CancellationToken.None); - _shutdownCts.Dispose(); - await base.DisposeAsync(); - } -} - -internal enum StorageOperationType -{ - AppendJobOperation, - UpdateMetadata -} - -internal sealed class StorageOperation -{ - public required StorageOperationType Type { get; init; } - public JobOperation? JobOperation { get; init; } - public IDictionary? Metadata { get; init; } - public TaskCompletionSource CompletionSource { get; init; } = new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously); - - public static StorageOperation CreateAppendOperation(JobOperation jobOperation) - { - return new StorageOperation - { - Type = StorageOperationType.AppendJobOperation, - JobOperation = jobOperation - }; - } - - public static StorageOperation CreateMetadataOperation(IDictionary metadata) - { - return new StorageOperation - { - Type = StorageOperationType.UpdateMetadata, - Metadata = metadata - }; - } -} diff --git a/src/Azure/Orleans.DurableJobs.AzureStorage/AzureStorageJobShardManager.cs b/src/Azure/Orleans.DurableJobs.AzureStorage/AzureStorageJobShardManager.cs deleted file mode 100644 index 4f1abd761b9..00000000000 --- a/src/Azure/Orleans.DurableJobs.AzureStorage/AzureStorageJobShardManager.cs +++ /dev/null @@ -1,517 +0,0 @@ -using System; -using System.Collections.Concurrent; -using System.Collections.Generic; -using System.Diagnostics; -using System.Globalization; -using System.Threading; -using System.Threading.Tasks; -using Azure; -using Azure.Storage.Blobs; -using Azure.Storage.Blobs.Models; -using Azure.Storage.Blobs.Specialized; -using Microsoft.Extensions.Logging; -using Microsoft.Extensions.Options; -using Orleans.Hosting; -using Orleans.Runtime; - -namespace Orleans.DurableJobs.AzureStorage; - -public sealed partial class AzureStorageJobShardManager : JobShardManager -{ - private readonly BlobServiceClient _blobServiceClient; - private readonly string _containerName; - private readonly string _blobPrefix; - private BlobContainerClient _client = null!; - private readonly IClusterMembershipService _clusterMembership; - private readonly ConcurrentDictionary _jobShardCache = new(); - private readonly ILogger _logger; - private readonly ILoggerFactory _loggerFactory; - private readonly AzureStorageJobShardOptions _options; - private readonly DurableJobsOptions _durableJobsOptions; - private long _shardCounter = 0; // For generating unique shard IDs - - private const string AdoptedCountKey = "AdoptedCount"; - private const string LastAdoptedTimeKey = "LastAdoptedTime"; - private const string LegacyStolenCountKey = "StolenCount"; - private const string LegacyLastStolenTimeKey = "LastStolenTime"; - - public AzureStorageJobShardManager( - SiloAddress siloAddress, - BlobServiceClient client, - string containerName, - string blobPrefix, - AzureStorageJobShardOptions options, - IOptions durableJobsOptions, - IClusterMembershipService clusterMembership, - ILoggerFactory loggerFactory) - : base(siloAddress) - { - _blobServiceClient = client; - _containerName = containerName; - _blobPrefix = blobPrefix; - _clusterMembership = clusterMembership; - _logger = loggerFactory.CreateLogger(); - _loggerFactory = loggerFactory; - _options = options; - _durableJobsOptions = durableJobsOptions.Value; - } - - public AzureStorageJobShardManager( - ILocalSiloDetails localSiloDetails, - IOptions options, - IOptions durableJobsOptions, - IClusterMembershipService clusterMembership, - ILoggerFactory loggerFactory) - : this(localSiloDetails.SiloAddress, options.Value.BlobServiceClient, options.Value.ContainerName, localSiloDetails.ClusterId, options.Value, durableJobsOptions, clusterMembership, loggerFactory) - { - } - - public override async Task> AssignJobShardsAsync(DateTimeOffset maxShardStartTime, int maxNewClaims, CancellationToken cancellationToken) - { - await InitializeIfNeeded(cancellationToken); - LogAssigningShards(_logger, SiloAddress, maxShardStartTime, _containerName); - - var result = new List(); - var newClaimCount = 0; - await foreach (var blob in _client.GetBlobsAsync(traits: BlobTraits.Metadata, states: BlobStates.None, cancellationToken: cancellationToken, prefix: _blobPrefix)) - { - // Get the owner and creator of the shard - var (owner, membershipVersion, shardStartTime, maxDueTime) = ParseMetadata(blob.Metadata); - - // Check if the membership version is more recent than our current version - if (membershipVersion > _clusterMembership.CurrentSnapshot.Version) - { - // Refresh membership to at least that version - await _clusterMembership.Refresh(membershipVersion, cancellationToken); - } - - if (shardStartTime > maxShardStartTime) - { - // This shard is too new. Since blobs are returned in alphabetical order and our blob names - // contain timestamps (yyyyMMddHHmm format), all subsequent blobs will also be too new. - LogShardTooNew(_logger, blob.Name, shardStartTime, maxShardStartTime); - break; - } - - // If I am the owner, the shard must be in cache - always return it - if (owner is not null && owner.Equals(SiloAddress)) - { - if (_jobShardCache.TryGetValue(blob.Name, out var shard)) - { - LogShardAssigned(_logger, blob.Name, SiloAddress); - result.Add(shard); - } - else - { - // Shard is owned by us but not in cache - this is unexpected, release ownership - Debug.Assert(false, $"Shard '{blob.Name}' is owned by this silo but not in cache - releasing ownership"); - await ReleaseOwnership(blob.Name); - } - continue; - } - - // In debug, verify that if we're not the owner, the shard should not be in our cache - Debug.Assert(!_jobShardCache.ContainsKey(blob.Name), $"Shard '{blob.Name}' is in cache but we are not the owner (owner: {owner?.ToParsableString() ?? "none"})"); - - // Check if the owner is valid - var ownerStatus = owner is not null ? _clusterMembership.CurrentSnapshot.GetSiloStatus(owner) : SiloStatus.None; - - if (ownerStatus is not SiloStatus.Dead and not SiloStatus.None) - { - // Owner is still active and it's not me, skip this shard - LogShardStillOwned(_logger, blob.Name, owner!); - continue; - } - - // Determine if this is an adopted shard (taken from dead owner) vs orphaned (gracefully released) - var isAdopted = owner is not null && ownerStatus == SiloStatus.Dead; - - // Respect the slow-start budget: skip claiming if we've exhausted the budget - if (newClaimCount >= maxNewClaims) - { - continue; - } - - // Try to claim orphaned or adopted shard - LogClaimingShard(_logger, blob.Name, SiloAddress, owner); - var blobClient = _client.GetAppendBlobClient(blob.Name); - var metadata = blob.Metadata; - var orphanedShard = new AzureStorageJobShard(blob.Name, shardStartTime, maxDueTime, blobClient, metadata, blob.Properties.ETag, _options, _loggerFactory.CreateLogger()); - if (!await TryTakeOwnership(orphanedShard, metadata, SiloAddress, isAdopted, cancellationToken)) - { - // Either poisoned shard or someone else took ownership - dispose and continue - await orphanedShard.DisposeAsync(); - continue; - } - await orphanedShard.InitializeAsync(cancellationToken); - // We don't want to add new jobs to shards that we just took ownership of - await orphanedShard.MarkAsCompleteAsync(cancellationToken); - _jobShardCache[blob.Name] = orphanedShard; - LogShardAssigned(_logger, blob.Name, SiloAddress); - result.Add(orphanedShard); - newClaimCount++; - } - - LogAssignmentCompleted(_logger, result.Count, SiloAddress); - return result; - - async Task ReleaseOwnership(string blobName) - { - try - { - var blobClient = _client.GetAppendBlobClient(blobName); - var properties = await blobClient.GetPropertiesAsync(cancellationToken: cancellationToken); - var metadata = properties.Value.Metadata; - metadata.Remove("Owner"); - // Reset adopted count since we're gracefully releasing - metadata.Remove(AdoptedCountKey); - metadata.Remove(LastAdoptedTimeKey); - metadata.Remove(LegacyStolenCountKey); - metadata.Remove(LegacyLastStolenTimeKey); - await blobClient.SetMetadataAsync(metadata, new BlobRequestConditions { IfMatch = properties.Value.ETag }, cancellationToken); - } - catch (Exception ex) - { - // Log but continue - we'll let another silo claim it - LogWarningReleaseOwnershipNotInCache(_logger, ex, blobName); - } - } - - async Task TryTakeOwnership(AzureStorageJobShard shard, IDictionary metadata, SiloAddress newOwner, bool isAdopted, CancellationToken ct) - { - if (isAdopted) - { - var existingAdoptedCount = GetAdoptedCount(metadata); - if (existingAdoptedCount > _durableJobsOptions.MaxAdoptedCount) - { - // Already marked as poisoned. - return false; - } - - // Increment adopted count for shards taken from dead owners. - var adoptedCount = existingAdoptedCount + 1; - if (adoptedCount > _durableJobsOptions.MaxAdoptedCount) - { - // Persist poisoned marker so this shard is not repeatedly re-evaluated as newly poisoned. - SetAdoptedMetadata(metadata, adoptedCount, DateTimeOffset.UtcNow); - try - { - await shard.UpdateBlobMetadata(metadata, ct); - } - catch (RequestFailedException ex) - { - LogOwnershipFailed(_logger, ex, shard.Id, newOwner); - } - - LogPoisonedShardDetected(_logger, shard.Id, adoptedCount, _durableJobsOptions.MaxAdoptedCount); - return false; - } - - SetAdoptedMetadata(metadata, adoptedCount, DateTimeOffset.UtcNow); - LogShardAdopted(_logger, shard.Id, newOwner, adoptedCount); - } - - metadata["Owner"] = newOwner.ToParsableString(); - metadata["MembershipVersion"] = _clusterMembership.CurrentSnapshot.Version.Value.ToString(); - - try - { - await shard.UpdateBlobMetadata(metadata, ct); - LogOwnershipTaken(_logger, shard.Id, newOwner); - return true; - } - catch (RequestFailedException ex) - { - // Someone else took over the shard - LogOwnershipFailed(_logger, ex, shard.Id, newOwner); - return false; - } - } - - static int GetAdoptedCount(IDictionary metadata) - { - if (metadata.TryGetValue(AdoptedCountKey, out var countStr) - && int.TryParse(countStr, NumberStyles.Integer, CultureInfo.InvariantCulture, out var adoptedCount)) - { - return adoptedCount; - } - - return metadata.TryGetValue(LegacyStolenCountKey, out countStr) - && int.TryParse(countStr, NumberStyles.Integer, CultureInfo.InvariantCulture, out var legacyCount) - ? legacyCount - : 0; - } - - static void SetAdoptedMetadata(IDictionary metadata, int adoptedCount, DateTimeOffset adoptedTime) - { - metadata[AdoptedCountKey] = adoptedCount.ToString(CultureInfo.InvariantCulture); - metadata[LastAdoptedTimeKey] = adoptedTime.ToString("o", CultureInfo.InvariantCulture); - metadata.Remove(LegacyStolenCountKey); - metadata.Remove(LegacyLastStolenTimeKey); - } - } - - public override async Task CreateShardAsync(DateTimeOffset minDueTime, DateTimeOffset maxDueTime, IDictionary metadata, CancellationToken cancellationToken) - { - await InitializeIfNeeded(cancellationToken); - LogRegisteringShard(_logger, SiloAddress, minDueTime, maxDueTime, _containerName); - - var i = 0; - while (true) - { - var counter = Interlocked.Increment(ref _shardCounter); - var shardId = $"{_blobPrefix}-{minDueTime:yyyyMMddHHmm}-{SiloAddress.ToParsableString()}-{counter}"; - var blobClient = _client.GetAppendBlobClient(shardId); - var metadataInfo = CreateMetadata(metadata, SiloAddress, _clusterMembership.CurrentSnapshot.Version, minDueTime, maxDueTime); - metadataInfo["Owner"] = SiloAddress.ToParsableString(); - try - { - var response = await blobClient.CreateIfNotExistsAsync(metadata: metadataInfo, cancellationToken: cancellationToken); - if (response == null) - { - // Blob already exists, try again with a different name - LogShardIdCollision(_logger, shardId, i); - continue; - } - } - catch (RequestFailedException ex) - { - i++; - if (i > _options.MaxBlobCreationRetries) - { - throw new InvalidOperationException($"Failed to create shard blob '{shardId}' after {i} attempts", ex); - } - // Blob already exists, try again with a different name - LogShardRegistrationRetry(_logger, ex, shardId, i); - continue; - } - - var shard = new AzureStorageJobShard(shardId, minDueTime, maxDueTime, blobClient, metadataInfo, null, _options, _loggerFactory.CreateLogger()); - await shard.InitializeAsync(cancellationToken); - _jobShardCache[shardId] = shard; - LogShardRegistered(_logger, shardId, SiloAddress); - return shard; - } - } - - public override async Task UnregisterShardAsync(Orleans.DurableJobs.IJobShard shard, CancellationToken cancellationToken) - { - var azureShard = shard as AzureStorageJobShard ?? throw new ArgumentException("Shard is not an AzureStorageJobShard", nameof(shard)); - LogUnregisteringShard(_logger, shard.Id, SiloAddress); - - // Stop the background storage processor to ensure no more changes can happen - await azureShard.StopProcessorAsync(cancellationToken); - - // Now we can safely get a consistent view of the state - var count = await shard.GetJobCountAsync(); - // We want to make sure to get the latest properties - var properties = await azureShard.BlobClient.GetPropertiesAsync(cancellationToken: cancellationToken); - - // But we don't want to update the metadata if the ETag has changed - var currentETag = properties.Value.ETag; - var conditions = new BlobRequestConditions { IfMatch = currentETag }; - var metadata = properties.Value.Metadata; - var (owner, _, _, _) = ParseMetadata(metadata); - - if (owner != SiloAddress) - { - LogUnregisterWrongOwner(_logger, shard.Id, SiloAddress, owner); - throw new InvalidOperationException("Cannot unregister a shard owned by another silo"); - } - - if (count > 0) - { - // There are still jobs in the shard, release ownership gracefully. - metadata.Remove("Owner"); - // Reset adopted count since we're gracefully releasing (not crashing) - metadata.Remove(AdoptedCountKey); - metadata.Remove(LastAdoptedTimeKey); - metadata.Remove(LegacyStolenCountKey); - metadata.Remove(LegacyLastStolenTimeKey); - await azureShard.BlobClient.SetMetadataAsync(metadata, conditions, cancellationToken); - _jobShardCache.TryRemove(shard.Id, out _); - LogShardOwnershipReleased(_logger, shard.Id, SiloAddress, count); - } - else - { - // No jobs left, we can delete the shard - await azureShard.BlobClient.DeleteIfExistsAsync(conditions: conditions, cancellationToken: cancellationToken); - _jobShardCache.TryRemove(shard.Id, out _); - LogShardDeleted(_logger, shard.Id, SiloAddress); - } - - // Dispose the shard's resources - await azureShard.DisposeAsync(); - } - - private async ValueTask InitializeIfNeeded(CancellationToken cancellationToken = default) - { - if (_client != null) return; - - LogInitializing(_logger, _containerName); - _client = _blobServiceClient.GetBlobContainerClient(_containerName); - await _client.CreateIfNotExistsAsync(cancellationToken: cancellationToken); - LogInitialized(_logger, _containerName); - } - - private static Dictionary CreateMetadata(IDictionary existingMetadata, SiloAddress siloAddress, MembershipVersion membershipVersion, DateTimeOffset minDueTime, DateTimeOffset maxDueTime) - { - var metadata = new Dictionary(existingMetadata) - { - { "MinDueTime", minDueTime.ToString("o") }, - { "MaxDueTime", maxDueTime.ToString("o") }, - { "MembershipVersion", membershipVersion.Value.ToString(CultureInfo.InvariantCulture) } - }; - - return metadata; - } - - private static (SiloAddress? owner, MembershipVersion membershipVersion, DateTimeOffset minDueTime, DateTimeOffset maxDueTime) ParseMetadata(IDictionary metadata) - { - var owner = metadata.TryGetValue("Owner", out var ownerStr) ? SiloAddress.FromParsableString(ownerStr) : null; - var membershipVersion = metadata.TryGetValue("MembershipVersion", out var membershipVersionStr) && long.TryParse(membershipVersionStr, out var versionValue) - ? new MembershipVersion(versionValue) - : MembershipVersion.MinValue; - var minDueTime = metadata.TryGetValue("MinDueTime", out var minDueTimeStr) && DateTimeOffset.TryParse(minDueTimeStr, out var minDt) ? minDt : DateTimeOffset.MinValue; - var maxDueTime = metadata.TryGetValue("MaxDueTime", out var maxDueTimeStr) && DateTimeOffset.TryParse(maxDueTimeStr, out var maxDt) ? maxDt : DateTimeOffset.MaxValue; - return (owner, membershipVersion, minDueTime, maxDueTime); - } - - [LoggerMessage( - Level = LogLevel.Information, - Message = "Initializing Azure Storage Job Shard Manager with container '{ContainerName}'" - )] - private static partial void LogInitializing(ILogger logger, string containerName); - - [LoggerMessage( - Level = LogLevel.Information, - Message = "Azure Storage Job Shard Manager initialized successfully for container '{ContainerName}'" - )] - private static partial void LogInitialized(ILogger logger, string containerName); - - [LoggerMessage( - Level = LogLevel.Debug, - Message = "Assigning job shards for silo {SiloAddress} with max time {MaxDateTime} from container '{ContainerName}'" - )] - private static partial void LogAssigningShards(ILogger logger, SiloAddress siloAddress, DateTimeOffset maxDateTime, string containerName); - - [LoggerMessage( - Level = LogLevel.Trace, - Message = "Ignoring shard '{ShardId}' since its start time is greater than specified maximum (MinDueTime={MinDueTime}, MaxDateTime={MaxDateTime})" - )] - private static partial void LogShardTooNew(ILogger logger, string shardId, DateTimeOffset minDueTime, DateTimeOffset maxDateTime); - - [LoggerMessage( - Level = LogLevel.Trace, - Message = "Shard '{ShardId}' is still owned by active silo {Owner}" - )] - private static partial void LogShardStillOwned(ILogger logger, string shardId, SiloAddress owner); - - [LoggerMessage( - Level = LogLevel.Debug, - Message = "Reclaiming shard '{ShardId}' from cache for silo {SiloAddress}" - )] - private static partial void LogReclaimingShardFromCache(ILogger logger, string shardId, SiloAddress siloAddress); - - [LoggerMessage( - Level = LogLevel.Debug, - Message = "Claiming shard '{ShardId}' for silo {SiloAddress} (Previous Owner={PreviousOwner})" - )] - private static partial void LogClaimingShard(ILogger logger, string shardId, SiloAddress siloAddress, SiloAddress? previousOwner); - - [LoggerMessage( - Level = LogLevel.Warning, - Message = "Failed to take ownership of shard '{ShardId}' for silo {SiloAddress} due to conflict" - )] - private static partial void LogShardOwnershipConflict(ILogger logger, string shardId, SiloAddress siloAddress); - - [LoggerMessage( - Level = LogLevel.Debug, - Message = "Shard '{ShardId}' assigned to silo {SiloAddress}" - )] - private static partial void LogShardAssigned(ILogger logger, string shardId, SiloAddress siloAddress); - - [LoggerMessage( - Level = LogLevel.Information, - Message = "Assigned {ShardCount} shard(s) to silo {SiloAddress}" - )] - private static partial void LogAssignmentCompleted(ILogger logger, int shardCount, SiloAddress siloAddress); - - [LoggerMessage( - Level = LogLevel.Debug, - Message = "Took ownership of shard '{ShardId}' for silo {SiloAddress}" - )] - private static partial void LogOwnershipTaken(ILogger logger, string shardId, SiloAddress siloAddress); - - [LoggerMessage( - Level = LogLevel.Warning, - Message = "Failed to take ownership of shard '{ShardId}' for silo {SiloAddress}" - )] - private static partial void LogOwnershipFailed(ILogger logger, Exception exception, string shardId, SiloAddress siloAddress); - - [LoggerMessage( - Level = LogLevel.Warning, - Message = "Failed to release ownership of shard '{ShardId}' that was not in cache" - )] - private static partial void LogWarningReleaseOwnershipNotInCache(ILogger logger, Exception exception, string shardId); - - [LoggerMessage( - Level = LogLevel.Information, - Message = "Creating new shard for silo {SiloAddress} (MinDueTime={MinDueTime}, MaxDueTime={MaxDueTime}) in container '{ContainerName}'" - )] - private static partial void LogRegisteringShard(ILogger logger, SiloAddress siloAddress, DateTimeOffset minDueTime, DateTimeOffset maxDueTime, string containerName); - - [LoggerMessage( - Level = LogLevel.Trace, - Message = "Shard ID collision for '{ShardId}' (attempt {Attempt}), retrying with new ID" - )] - private static partial void LogShardIdCollision(ILogger logger, string shardId, int attempt); - - [LoggerMessage( - Level = LogLevel.Warning, - Message = "Failed to register shard '{ShardId}' (attempt {Attempt}), retrying" - )] - private static partial void LogShardRegistrationRetry(ILogger logger, Exception exception, string shardId, int attempt); - - [LoggerMessage( - Level = LogLevel.Information, - Message = "Shard '{ShardId}' created successfully for silo {SiloAddress}" - )] - private static partial void LogShardRegistered(ILogger logger, string shardId, SiloAddress siloAddress); - - [LoggerMessage( - Level = LogLevel.Information, - Message = "Unregistering shard '{ShardId}' for silo {SiloAddress}" - )] - private static partial void LogUnregisteringShard(ILogger logger, string shardId, SiloAddress siloAddress); - - [LoggerMessage( - Level = LogLevel.Warning, - Message = "Cannot unregister shard '{ShardId}' - silo {SiloAddress} is not the owner (Owner={Owner})" - )] - private static partial void LogUnregisterWrongOwner(ILogger logger, string shardId, SiloAddress siloAddress, SiloAddress? owner); - - [LoggerMessage( - Level = LogLevel.Information, - Message = "Released ownership of shard '{ShardId}' by silo {SiloAddress} ({JobCount} jobs remaining)" - )] - private static partial void LogShardOwnershipReleased(ILogger logger, string shardId, SiloAddress siloAddress, int jobCount); - - [LoggerMessage( - Level = LogLevel.Information, - Message = "Deleted shard '{ShardId}' by silo {SiloAddress} (no jobs remaining)" - )] - private static partial void LogShardDeleted(ILogger logger, string shardId, SiloAddress siloAddress); - - [LoggerMessage( - Level = LogLevel.Warning, - Message = "Poisoned shard detected: '{ShardId}' has been adopted {AdoptedCount} times (max allowed: {MaxAdoptedCount}). Shard will not be assigned." - )] - private static partial void LogPoisonedShardDetected(ILogger logger, string shardId, int adoptedCount, int maxAdoptedCount); - - [LoggerMessage( - Level = LogLevel.Information, - Message = "Shard '{ShardId}' adopted by silo {SiloAddress} (adopted count: {AdoptedCount})" - )] - private static partial void LogShardAdopted(ILogger logger, string shardId, SiloAddress siloAddress, int adoptedCount); -} diff --git a/src/Azure/Orleans.DurableJobs.AzureStorage/Hosting/AzureStorageDurableJobsExtensions.cs b/src/Azure/Orleans.DurableJobs.AzureStorage/Hosting/AzureStorageDurableJobsExtensions.cs index ccb8d80fb4b..a204098d7c6 100644 --- a/src/Azure/Orleans.DurableJobs.AzureStorage/Hosting/AzureStorageDurableJobsExtensions.cs +++ b/src/Azure/Orleans.DurableJobs.AzureStorage/Hosting/AzureStorageDurableJobsExtensions.cs @@ -1,10 +1,11 @@ using System; +using Microsoft.Extensions.Configuration; using Microsoft.Extensions.DependencyInjection; -using Microsoft.Extensions.Options; -using Orleans.Configuration; +using Microsoft.Extensions.DependencyInjection.Extensions; using Orleans.Configuration.Internal; using Orleans.DurableJobs; -using Orleans.DurableJobs.AzureStorage; +using Orleans.Journaling; +using Orleans.Journaling.Json; namespace Orleans.Hosting; @@ -25,27 +26,15 @@ public static class AzureStorageDurableJobsExtensions /// /// The provided , for chaining. /// - public static ISiloBuilder UseAzureBlobDurableJobs(this ISiloBuilder builder, Action configure) + public static ISiloBuilder UseAzureBlobDurableJobs(this ISiloBuilder builder, Action configure) { - builder.ConfigureServices(services => services.UseAzureBlobDurableJobs(configure)); - return builder; - } + ArgumentNullException.ThrowIfNull(builder); + ArgumentNullException.ThrowIfNull(configure); - /// - /// Adds durable jobs storage backed by Azure Blob Storage. - /// - /// - /// The builder. - /// - /// - /// The configuration delegate. - /// - /// - /// The provided , for chaining. - /// - public static ISiloBuilder UseAzureBlobDurableJobs(this ISiloBuilder builder, Action> configureOptions) - { - builder.ConfigureServices(services => services.UseAzureBlobDurableJobs(configureOptions)); + builder.AddDurableJobs(); + builder.AddAzureBlobJournalStorage(configure); + builder.UseJsonJournalFormat(options => options.AddTypeInfoResolver(DurableJobsJsonContext.Default)); + builder.Services.UseJournaledDurableJobs(); return builder; } @@ -61,36 +50,38 @@ public static ISiloBuilder UseAzureBlobDurableJobs(this ISiloBuilder builder, Ac /// /// The provided , for chaining. /// - public static IServiceCollection UseAzureBlobDurableJobs(this IServiceCollection services, Action configure) + public static IServiceCollection UseAzureBlobDurableJobs(this IServiceCollection services, Action configure) { + ArgumentNullException.ThrowIfNull(services); + ArgumentNullException.ThrowIfNull(configure); + services.AddDurableJobs(); - services.AddSingleton(); - services.AddFromExisting(); - services.Configure(configure); - services.ConfigureFormatter(); + + var builder = new ServiceCollectionSiloBuilder(services); + builder.AddAzureBlobJournalStorage(configure); + builder.UseJsonJournalFormat(options => options.AddTypeInfoResolver(DurableJobsJsonContext.Default)); + + services.UseJournaledDurableJobs(); return services; } - /// - /// Adds durable jobs storage backed by Azure Blob Storage. - /// - /// - /// The service collection. - /// - /// - /// The configuration delegate. - /// - /// - /// The provided , for chaining. - /// - public static IServiceCollection UseAzureBlobDurableJobs(this IServiceCollection services, Action> configureOptions) + private static IServiceCollection UseJournaledDurableJobs(this IServiceCollection services) { - services.AddDurableJobs(); - services.AddSingleton(); - services.AddFromExisting(); - configureOptions?.Invoke(services.AddOptions()); - services.ConfigureFormatter(); - services.AddTransient(sp => new AzureStorageJobShardOptionsValidator(sp.GetRequiredService>().Get(Options.DefaultName), Options.DefaultName)); + services.TryAddSingleton(); + services.AddFromExisting(); return services; } + + private sealed class ServiceCollectionSiloBuilder : ISiloBuilder + { + public ServiceCollectionSiloBuilder(IServiceCollection services) + { + Services = services; + Configuration = new ConfigurationBuilder().Build(); + } + + public IServiceCollection Services { get; } + + public IConfiguration Configuration { get; } + } } diff --git a/src/Azure/Orleans.DurableJobs.AzureStorage/Hosting/AzureStorageJobShardOptions.cs b/src/Azure/Orleans.DurableJobs.AzureStorage/Hosting/AzureStorageJobShardOptions.cs deleted file mode 100644 index 5139b1cd071..00000000000 --- a/src/Azure/Orleans.DurableJobs.AzureStorage/Hosting/AzureStorageJobShardOptions.cs +++ /dev/null @@ -1,42 +0,0 @@ -using System; -using Azure.Storage.Blobs; - -namespace Orleans.Hosting; - -public class AzureStorageJobShardOptions -{ - /// - /// Gets or sets the instance used to store job shards. - /// - public BlobServiceClient BlobServiceClient { get; set; } = null!; - - /// - /// Gets or sets the name of the container used to store durable jobs. - /// - public string ContainerName { get; set; } = "jobs"; - - /// - /// Gets or sets the maximum number of job operations to batch together in a single blob write. - /// Default is 50 operations. - /// - public int MaxBatchSize { get; set; } = 50; - - /// - /// Gets or sets the minimum number of job operations to batch together before flushing. - /// If more than 1 then the we will wait for additional operations. - /// Default is 1 operation (immediate flush, optimized for latency). - /// - public int MinBatchSize { get; set; } = 1; - - /// - /// Gets or sets the maximum time to wait for additional operations if the minimum batch size isn't reached - /// before flushing a batch. - /// Default is 50 milliseconds. - /// - public TimeSpan BatchFlushInterval { get; set; } = TimeSpan.FromMilliseconds(50); - - /// - /// Gets or sets the maximum number of retries for creating a blob for a job shard in case of name collisions. - /// - public int MaxBlobCreationRetries { get; internal set; } = 3; -} diff --git a/src/Azure/Orleans.DurableJobs.AzureStorage/Hosting/AzureStorageJobShardOptionsValidator.cs b/src/Azure/Orleans.DurableJobs.AzureStorage/Hosting/AzureStorageJobShardOptionsValidator.cs deleted file mode 100644 index 9fbf438b70c..00000000000 --- a/src/Azure/Orleans.DurableJobs.AzureStorage/Hosting/AzureStorageJobShardOptionsValidator.cs +++ /dev/null @@ -1,39 +0,0 @@ -using Microsoft.Extensions.Options; -using Orleans.Configuration.Internal; -using Orleans.Runtime; - -namespace Orleans.Hosting; - -/// -/// Validates . -/// -public class AzureStorageJobShardOptionsValidator : IConfigurationValidator -{ - private readonly AzureStorageJobShardOptions _options; - private readonly string _name; - - /// - /// Initializes a new instance of the class. - /// - /// The options. - /// The name. - public AzureStorageJobShardOptionsValidator(AzureStorageJobShardOptions options, string name) - { - _options = options; - _name = name; - } - - /// - public void ValidateConfiguration() - { - if (_options.BlobServiceClient is null) - { - throw new OrleansConfigurationException($"Invalid configuration for {nameof(AzureStorageJobShardOptions)} with name '{_name}'. {nameof(_options.BlobServiceClient)} is required."); - } - - if (string.IsNullOrWhiteSpace(_options.ContainerName)) - { - throw new OrleansConfigurationException($"Invalid configuration for {nameof(AzureStorageJobShardOptions)} with name '{_name}'. {nameof(_options.ContainerName)} is required."); - } - } -} diff --git a/src/Azure/Orleans.DurableJobs.AzureStorage/JobOperation.cs b/src/Azure/Orleans.DurableJobs.AzureStorage/JobOperation.cs deleted file mode 100644 index 834e858ada3..00000000000 --- a/src/Azure/Orleans.DurableJobs.AzureStorage/JobOperation.cs +++ /dev/null @@ -1,110 +0,0 @@ -using System; -using System.Collections.Generic; -using System.Text.Json; -using System.Text.Json.Serialization; -using Orleans.Runtime; - -namespace Orleans.DurableJobs.AzureStorage; - -/// -/// Represents an operation to be performed on a durable job. -/// -internal struct JobOperation -{ - /// - /// The type of operation to perform. - /// - public enum OperationType - { - Add, - Remove, - Retry, - } - - /// - /// Gets or sets the type of operation. - /// - public OperationType Type { get; init; } - - /// - /// Gets or sets the job identifier. - /// - public string Id { get; init; } - - /// - /// Gets or sets the job name (only used for Add operations). - /// - public string? Name { get; init; } - - /// - /// Gets or sets the due time (used for Add and Retry operations). - /// - public DateTimeOffset? DueTime { get; init; } - - /// - /// Gets or sets the target grain ID (only used for Add operations). - /// - public GrainId? TargetGrainId { get; init; } - - /// - /// Gets or sets the job metadata (only used for Add operations). - /// - public IReadOnlyDictionary? Metadata { get; init; } - - /// - /// Creates an Add operation for scheduling a new job. - /// - /// The job identifier. - /// The job name. - /// The job due time. - /// The target grain ID. - /// The job metadata. - /// A new JobOperation for adding a job. - /// Thrown when or is null or empty. - public static JobOperation CreateAddOperation(string id, string name, DateTimeOffset dueTime, GrainId targetGrainId, IReadOnlyDictionary? metadata) - { - ArgumentException.ThrowIfNullOrEmpty(id); - ArgumentException.ThrowIfNullOrEmpty(name); - - return new() { Type = OperationType.Add, Id = id, Name = name, DueTime = dueTime, TargetGrainId = targetGrainId, Metadata = metadata }; - } - - /// - /// Creates a Remove operation for canceling a job. - /// - /// The job identifier. - /// A new JobOperation for removing a job. - /// Thrown when is null or empty. - public static JobOperation CreateRemoveOperation(string id) - { - ArgumentException.ThrowIfNullOrEmpty(id); - - return new() { Type = OperationType.Remove, Id = id }; - } - - /// - /// Creates a Retry operation for rescheduling a job. - /// - /// The job identifier. - /// The new due time. - /// A new JobOperation for retrying a job. - /// Thrown when is null or empty. - public static JobOperation CreateRetryOperation(string id, DateTimeOffset dueTime) - { - ArgumentException.ThrowIfNullOrEmpty(id); - - return new() { Type = OperationType.Retry, Id = id, DueTime = dueTime }; - } -} - -/// -/// JSON serialization context for JobOperation with compile-time source generation. -/// -[JsonSerializable(typeof(JobOperation))] -[JsonSourceGenerationOptions( - DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault, - PropertyNamingPolicy = JsonKnownNamingPolicy.CamelCase, - WriteIndented = false)] -internal partial class JobOperationJsonContext : JsonSerializerContext -{ -} \ No newline at end of file diff --git a/src/Azure/Orleans.DurableJobs.AzureStorage/NetstringJsonSerializer.cs b/src/Azure/Orleans.DurableJobs.AzureStorage/NetstringJsonSerializer.cs deleted file mode 100644 index f47575c2162..00000000000 --- a/src/Azure/Orleans.DurableJobs.AzureStorage/NetstringJsonSerializer.cs +++ /dev/null @@ -1,168 +0,0 @@ -using System; -using System.Buffers; -using System.Buffers.Text; -using System.Collections.Generic; -using System.IO; -using System.Runtime.CompilerServices; -using System.Text.Json; -using System.Text.Json.Serialization.Metadata; -using System.Threading; -using System.Threading.Tasks; -using Orleans.Serialization.Buffers.Adaptors; - -namespace Orleans.DurableJobs.AzureStorage; - -/// -/// Provides methods for serializing and deserializing JSON data using the netstring format. -/// Netstrings are a simple, self-delimiting way to encode data with length prefixes. -/// Format: [6 hex digits]:[data]\n -/// Maximum data size is 10MB (0xA00000 bytes). -/// -public static class NetstringJsonSerializer -{ - private const int MaxLength = 0xA00000; // 10MB - - /// - /// Encodes an object as a netstring by serializing it to JSON and writing directly to a stream. - /// - /// The object to encode. - /// The stream to write the netstring-encoded data to. - /// The JSON type info for serialization. - /// Thrown when the serialized data exceeds the maximum length. - public static void Encode(T value, Stream stream, JsonTypeInfo jsonTypeInfo) - { - // Remember starting position - var startPosition = stream.Position; - - // Skip past where the length prefix will go (6 hex digits + colon) - Span lengthBytes = stackalloc byte[7]; - stream.Write(lengthBytes); - - // Remember position where data starts - var dataStartPosition = stream.Position; - - // Serialize JSON directly to stream - using (var writer = new Utf8JsonWriter(stream, new JsonWriterOptions { SkipValidation = false })) - { - JsonSerializer.Serialize(writer, value, jsonTypeInfo); - } - - stream.Flush(); - - // Calculate JSON length - var jsonLength = (int)(stream.Position - dataStartPosition); - - if (jsonLength > MaxLength) - { - throw new InvalidOperationException($"Serialized data exceeds maximum length of {MaxLength} bytes"); - } - - // Write trailing newline - stream.WriteByte((byte)'\n'); - - // Remember end position - var endPosition = stream.Position; - - // Seek back to write the length prefix - stream.Position = startPosition; - - // Format length as 6-digit hex and write directly - if (!Utf8Formatter.TryFormat(jsonLength, lengthBytes, out _, new StandardFormat('X', 6))) - { - throw new InvalidOperationException("Failed to format length prefix"); - } - - lengthBytes[6] = (byte)':'; - - stream.Write(lengthBytes); - - // Restore position to end - stream.Position = endPosition; - } - - /// - /// Reads netstring-encoded JSON objects from a stream and deserializes them. - /// - /// The stream to read from. - /// The JSON type info for deserialization. - /// The cancellation token to cancel the operation. - /// An async enumerable of deserialized objects. - /// Thrown when the stream contains invalid netstring data. - public static async IAsyncEnumerable DecodeAsync(Stream stream, JsonTypeInfo jsonTypeInfo, [EnumeratorCancellation] CancellationToken cancellationToken) - { - const int TypicalBufferSize = 4096; // 4KB - var buffer = ArrayPool.Shared.Rent(TypicalBufferSize); - - try - { - while (true) - { - - // Try to read length prefix (6 hex digits + colon) - try - { - await stream.ReadExactlyAsync(buffer, 0, 7, cancellationToken); - } - catch (EndOfStreamException) - { - // We are done - yield break; - } - - // Verify colon - if (buffer[6] != ':') - { - throw new InvalidDataException($"Expected colon at position 6, got byte value {buffer[6]}"); - } - - // Parse length as hex - if (!Utf8Parser.TryParse(buffer.AsSpan(0, 6), out int length, out _, 'X')) - { - throw new InvalidDataException($"Invalid netstring length: {System.Text.Encoding.UTF8.GetString(buffer, 0, 6)}"); - } - - if (length < 0 || length > MaxLength) - { - throw new InvalidDataException($"Netstring length out of valid range: {length}"); - } - - // Ensure buffer is large enough for the data + newline - var totalLength = length + 1; - if (buffer.Length < totalLength) - { - ArrayPool.Shared.Return(buffer); - buffer = ArrayPool.Shared.Rent(totalLength); - } - - // Read data + trailing newline - try - { - await stream.ReadExactlyAsync(buffer.AsMemory(0, totalLength), cancellationToken); - } - catch (EndOfStreamException ex) - { - throw new InvalidDataException("Unexpected end of stream while reading netstring data", ex); - } - - // Verify trailing newline - if (buffer[length] != '\n') - { - throw new InvalidDataException($"Expected newline at end of netstring, got byte value {buffer[length]}"); - } - - // Deserialize JSON directly from UTF-8 bytes - var result = JsonSerializer.Deserialize(buffer.AsSpan(0, length), jsonTypeInfo); - if (result is null) - { - throw new JsonException("Deserialized JSON resulted in null value"); - } - - yield return result; - } - } - finally - { - ArrayPool.Shared.Return(buffer); - } - } -} diff --git a/src/Azure/Orleans.DurableJobs.AzureStorage/Orleans.DurableJobs.AzureStorage.csproj b/src/Azure/Orleans.DurableJobs.AzureStorage/Orleans.DurableJobs.AzureStorage.csproj index cf1848b99ed..a2990fa9144 100644 --- a/src/Azure/Orleans.DurableJobs.AzureStorage/Orleans.DurableJobs.AzureStorage.csproj +++ b/src/Azure/Orleans.DurableJobs.AzureStorage/Orleans.DurableJobs.AzureStorage.csproj @@ -12,6 +12,7 @@ true $(DefineConstants) enable + $(NoWarn);ORLEANSEXP005 $(VersionSuffix).alpha.1 alpha.1 @@ -19,8 +20,7 @@ - - + diff --git a/src/Azure/Orleans.DurableJobs.AzureStorage/README.md b/src/Azure/Orleans.DurableJobs.AzureStorage/README.md index 29757f6b3c5..6c2557efbe2 100644 --- a/src/Azure/Orleans.DurableJobs.AzureStorage/README.md +++ b/src/Azure/Orleans.DurableJobs.AzureStorage/README.md @@ -27,13 +27,10 @@ builder.UseOrleans(siloBuilder => { siloBuilder .UseAzureStorageClustering(options => options.ConfigureTableServiceClient("YOUR_STORAGE_ACCOUNT_URI")) - .UseAzureStorageDurableJobs(options => + .UseAzureBlobDurableJobs(options => { - options.Configure(o => - { - o.BlobServiceClient = new BlobServiceClient("YOUR_AZURE_STORAGE_CONNECTION_STRING"); - o.ContainerName = "durable-jobs"; - }); + options.BlobServiceClient = new BlobServiceClient("YOUR_AZURE_STORAGE_CONNECTION_STRING"); + options.ContainerName = "durable-jobs"; }); }); @@ -53,16 +50,13 @@ builder.UseOrleans(siloBuilder => { siloBuilder .UseAzureStorageClustering(options => options.ConfigureTableServiceClient("YOUR_STORAGE_ACCOUNT_URI")) - .UseAzureStorageDurableJobs(options => + .UseAzureBlobDurableJobs(options => { - options.Configure(o => - { - var credential = new DefaultAzureCredential(); - o.BlobServiceClient = new BlobServiceClient( - new Uri("https://youraccount.blob.core.windows.net"), - credential); - o.ContainerName = "durable-jobs"; - }); + var credential = new DefaultAzureCredential(); + options.BlobServiceClient = new BlobServiceClient( + new Uri("https://youraccount.blob.core.windows.net"), + credential); + options.ContainerName = "durable-jobs"; }); }); @@ -78,14 +72,11 @@ builder.UseOrleans(siloBuilder => { siloBuilder .UseAzureStorageClustering(options => options.ConfigureTableServiceClient(connectionString)) - .UseAzureStorageDurableJobs(options => + .UseAzureBlobDurableJobs(options => { - options.Configure(o => - { - o.BlobServiceClient = new BlobServiceClient(connectionString); - // Use different containers for different environments - o.ContainerName = $"durable-jobs-{Environment.GetEnvironmentVariable("ASPNETCORE_ENVIRONMENT")?.ToLowerInvariant()}"; - }); + options.BlobServiceClient = new BlobServiceClient(connectionString); + // Use different containers for different environments + options.ContainerName = $"durable-jobs-{Environment.GetEnvironmentVariable("ASPNETCORE_ENVIRONMENT")?.ToLowerInvariant()}"; }) .ConfigureServices(services => { diff --git a/src/Azure/Orleans.Journaling.AzureStorage/AzureBlobJournalStorage.cs b/src/Azure/Orleans.Journaling.AzureStorage/AzureBlobJournalStorage.cs index 3c29b0dc8be..ccdd4d6c360 100644 --- a/src/Azure/Orleans.Journaling.AzureStorage/AzureBlobJournalStorage.cs +++ b/src/Azure/Orleans.Journaling.AzureStorage/AzureBlobJournalStorage.cs @@ -57,6 +57,91 @@ internal AzureBlobJournalStorage( _walClient = GetWalClient(); } + public async ValueTask CreateIfNotExistsAsync( + IReadOnlyDictionary? metadata = null, + CancellationToken cancellationToken = default) + { + var callerMetadata = CopyAndValidateCallerMetadata(metadata); + try + { + var response = await CreateWalAsync( + checkpointName: null, + new AppendBlobRequestConditions { IfNoneMatch = ETag.All }, + cancellationToken, + callerMetadata).ConfigureAwait(false); + SetWal(response.Value.ETag, blockCount: 0); + return true; + } + catch (RequestFailedException exception) when (exception.Status is 409 or 412) + { + return false; + } + } + + public async ValueTask GetMetadataAsync(CancellationToken cancellationToken = default) + { + var properties = await GetPropertiesCoreAsync(_walClient, conditions: null, cancellationToken).ConfigureAwait(false); + return properties is null || properties.BlobType != BlobType.Append + ? null + : CreateJournalMetadata(properties.ETag, properties.Metadata); + } + + public async ValueTask UpdateMetadataAsync( + IReadOnlyDictionary? set = null, + IEnumerable? remove = null, + string? expectedETag = null, + CancellationToken cancellationToken = default) + { + var setValues = CopyAndValidateCallerMetadata(set); + var removeValues = CopyRemove(remove, setValues); + for (var attempt = 0; attempt < 3; attempt++) + { + BlobProperties? properties; + try + { + properties = await GetPropertiesCoreAsync( + _walClient, + expectedETag is null ? null : new BlobRequestConditions { IfMatch = ToAzureETag(expectedETag) }, + cancellationToken).ConfigureAwait(false); + } + catch (RequestFailedException exception) when (exception.Status is 412) + { + return null; + } + + if (properties is null || properties.BlobType != BlobType.Append) + { + return null; + } + + var metadata = CopyMetadata(properties.Metadata); + if (!ApplyCallerMetadataUpdate(metadata, setValues, removeValues)) + { + return CreateJournalMetadata(properties.ETag, metadata); + } + + var conditions = new BlobRequestConditions + { + IfMatch = expectedETag is null ? properties.ETag : ToAzureETag(expectedETag), + }; + + try + { + var response = await _walClient.SetMetadataAsync(metadata, conditions, cancellationToken).ConfigureAwait(false); + return CreateJournalMetadata(response.Value.ETag, metadata); + } + catch (RequestFailedException exception) when (exception.Status is 412) + { + if (expectedETag is not null) + { + return null; + } + } + } + + return null; + } + public async ValueTask AppendAsync(ReadOnlySequence value, CancellationToken cancellationToken) { // Appends are written as one Azure append block, so validate blob limits before touching storage. @@ -220,8 +305,8 @@ public async ValueTask ReadAsync(IJournalStorageConsumer consumer, CancellationT var walMetadata = manifest.Metadata.Format is { Length: > 0 } ? manifest.Metadata : expectedFormat is { Length: > 0 } - ? new JournalFileMetadata(expectedFormat) - : JournalFileMetadata.Empty; + ? new JournalMetadata(expectedFormat) + : JournalMetadata.Empty; var totalWalBytes = await consumer.ReadAsync( walStream, walMetadata, @@ -239,33 +324,29 @@ public async ValueTask ReplaceAsync(ReadOnlySequence value, CancellationTo await EnsureWalAsync(cancellationToken).ConfigureAwait(false); var expectedWalETag = _walETag; - string? previousCheckpointName = null; - if (_shared.Options.DeleteOldCheckpoints) + WalState? walState; + try { - // Read the WAL manifest only when cleanup needs the previous checkpoint name, and require the cached ETag to still match. - WalState? walState; - try - { - walState = await TryLoadWalStateAsync(new BlobRequestConditions { IfMatch = expectedWalETag }, cancellationToken).ConfigureAwait(false); - - if (walState is null) - { - throw CreateInconsistentWalStateException( - "Azure Blob journal WAL changed while publishing a checkpoint; recovery is required.", - expectedWalETag); - } - } - catch (RequestFailedException exception) when (IsWalMutationConflict(exception)) + // Read the WAL manifest so compaction preserves caller-owned metadata while replacing provider-owned checkpoint metadata. + walState = await TryLoadWalStateAsync(new BlobRequestConditions { IfMatch = expectedWalETag }, cancellationToken).ConfigureAwait(false); + if (walState is null) { throw CreateInconsistentWalStateException( "Azure Blob journal WAL changed while publishing a checkpoint; recovery is required.", - expectedWalETag, - exception); + expectedWalETag); } - - expectedWalETag = walState.Value.ETag; - previousCheckpointName = walState.Value.Manifest.Checkpoint?.Name; } + catch (RequestFailedException exception) when (IsWalMutationConflict(exception)) + { + throw CreateInconsistentWalStateException( + "Azure Blob journal WAL changed while publishing a checkpoint; recovery is required.", + expectedWalETag, + exception); + } + + expectedWalETag = walState.Value.ETag; + var previousCheckpointName = _shared.Options.DeleteOldCheckpoints ? walState.Value.Manifest.Checkpoint?.Name : null; + var callerMetadata = walState.Value.Manifest.Metadata.Properties; using var checkpointStream = new ReadOnlySequenceStream(value); while (true) @@ -299,7 +380,8 @@ await checkpointClient.UploadAsync( var result = await CreateWalAsync( checkpointName, new AppendBlobRequestConditions { IfMatch = expectedWalETag }, - cancellationToken).ConfigureAwait(false); + cancellationToken, + callerMetadata).ConfigureAwait(false); SetWal(result.Value.ETag, blockCount: 0); } catch (RequestFailedException exception) when (IsWalMutationConflict(exception)) @@ -412,7 +494,8 @@ private AppendBlobClient GetWalClient() private async ValueTask> CreateWalAsync( string? checkpointName, AppendBlobRequestConditions conditions, - CancellationToken cancellationToken) + CancellationToken cancellationToken, + IReadOnlyDictionary? callerMetadata = null) { // Creating an append blob is also how compaction publishes a fresh WAL manifest. return await _walClient.CreateAsync( @@ -420,7 +503,7 @@ private async ValueTask> CreateWalAsync( { Conditions = conditions, HttpHeaders = CreateHttpHeaders(_shared.MimeType), - Metadata = CreateWalMetadata(checkpointName, checkpointOffset: 0), + Metadata = CreateWalMetadata(checkpointName, checkpointOffset: 0, callerMetadata), }, cancellationToken).ConfigureAwait(false); } @@ -478,10 +561,22 @@ private Dictionary CreateMetadataDictionary() private Dictionary CreateCheckpointBlobMetadata() => CreateMetadataDictionary(); - private Dictionary CreateWalMetadata(string? checkpointName, long checkpointOffset) + private Dictionary CreateWalMetadata( + string? checkpointName, + long checkpointOffset, + IReadOnlyDictionary? callerMetadata = null) { // WAL metadata is the recovery manifest: common format plus optional checkpoint pointer and WAL offset. var metadata = CreateMetadataDictionary(); + if (callerMetadata is not null) + { + foreach (var (key, value) in callerMetadata) + { + ValidateCallerMetadataProperty(key, value); + metadata[key] = value; + } + } + if (checkpointName is not null) { metadata[CheckpointMetadataKey] = checkpointName; @@ -501,9 +596,7 @@ private Dictionary CreateWalMetadata(string? checkpointName, lon private static WalManifest CreateWalManifest(IDictionary? metadata) { // Decode the WAL manifest, accepting non-compacted WALs that have no checkpoint pointer. - var fileMetadata = GetFormatKeyMetadata(metadata) is { } format - ? new JournalFileMetadata(format) - : JournalFileMetadata.Empty; + var fileMetadata = CreateJournalMetadata(eTag: default, metadata); if (metadata is null || !metadata.TryGetValue(CheckpointMetadataKey, out var checkpointName) || checkpointName is not { Length: > 0 }) { return new WalManifest(fileMetadata, Checkpoint: null); @@ -521,7 +614,7 @@ private static WalManifest CreateWalManifest(IDictionary? metada return new WalManifest(fileMetadata, new CheckpointReference(checkpointName, checkpointOffset)); } - private static IJournalFileMetadata ValidateCheckpointMetadata(CheckpointReference checkpoint, BlobDownloadDetails checkpointDetails, string? expectedFormat) + private static IJournalMetadata ValidateCheckpointMetadata(CheckpointReference checkpoint, BlobDownloadDetails checkpointDetails, string? expectedFormat) { // Refuse to stitch checkpoint and WAL data together if their declared journal formats differ. var checkpointBlobFormat = GetFormatKeyMetadata(checkpointDetails.Metadata); @@ -540,9 +633,152 @@ private static IJournalFileMetadata ValidateCheckpointMetadata(CheckpointReferen } } - return checkpointBlobFormat is { } format - ? new JournalFileMetadata(format) - : JournalFileMetadata.Empty; + return CreateJournalMetadata(eTag: default, checkpointDetails.Metadata); + } + + private static async ValueTask GetPropertiesCoreAsync( + AppendBlobClient blobClient, + BlobRequestConditions? conditions, + CancellationToken cancellationToken) + { + try + { + var response = await blobClient.GetPropertiesAsync(conditions, cancellationToken).ConfigureAwait(false); + return response.Value; + } + catch (RequestFailedException exception) when (exception.Status is 404) + { + return null; + } + } + + private static IJournalMetadata CreateJournalMetadata(ETag eTag, IDictionary? metadata) + => new JournalMetadata( + GetFormatKeyMetadata(metadata), + eTag == default ? null : eTag.ToString(), + CopyCallerMetadata(metadata)); + + private static Dictionary CopyCallerMetadata(IDictionary? metadata) + { + var result = new Dictionary(StringComparer.Ordinal); + if (metadata is null) + { + return result; + } + + foreach (var (key, value) in metadata) + { + if (IsProviderMetadataKey(key)) + { + continue; + } + + result[key] = value; + } + + return result; + } + + private static Dictionary CopyAndValidateCallerMetadata(IReadOnlyDictionary? metadata) + { + var result = new Dictionary(StringComparer.Ordinal); + if (metadata is null) + { + return result; + } + + foreach (var (key, value) in metadata) + { + ValidateCallerMetadataProperty(key, value); + result.Add(key, value); + } + + return result; + } + + private static Dictionary CopyMetadata(IDictionary? metadata) + => metadata is null + ? new Dictionary(StringComparer.OrdinalIgnoreCase) + : new Dictionary(metadata, StringComparer.OrdinalIgnoreCase); + + private static IReadOnlySet CopyRemove(IEnumerable? remove, IReadOnlyDictionary set) + { + if (remove is null) + { + return new HashSet(StringComparer.Ordinal); + } + + var result = new HashSet(StringComparer.Ordinal); + foreach (var propertyName in remove) + { + ValidateCallerMetadataPropertyName(propertyName); + if (set.ContainsKey(propertyName)) + { + throw new ArgumentException($"Journal metadata property '{propertyName}' cannot be both set and removed.", nameof(remove)); + } + + result.Add(propertyName); + } + + return result; + } + + private static bool ApplyCallerMetadataUpdate( + Dictionary metadata, + IReadOnlyDictionary set, + IReadOnlySet remove) + { + var changed = false; + foreach (var propertyName in remove) + { + ValidateCallerMetadataPropertyName(propertyName); + changed |= metadata.Remove(propertyName); + } + + foreach (var (propertyName, value) in set) + { + ValidateCallerMetadataProperty(propertyName, value); + if (!metadata.TryGetValue(propertyName, out var currentValue) + || !string.Equals(currentValue, value, StringComparison.Ordinal)) + { + metadata[propertyName] = value; + changed = true; + } + } + + return changed; + } + + private static void ValidateCallerMetadataProperty(string key, string value) + { + ValidateCallerMetadataPropertyName(key); + ArgumentNullException.ThrowIfNull(value); + } + + private static void ValidateCallerMetadataPropertyName(string key) + { + ArgumentException.ThrowIfNullOrWhiteSpace(key); + if (key.IndexOf('\0') >= 0) + { + throw new ArgumentException("Journal metadata property names must not contain null characters.", nameof(key)); + } + + if (IsProviderMetadataKey(key)) + { + throw new ArgumentException($"Journal metadata property '{key}' is provider-owned.", nameof(key)); + } + } + + private static bool IsProviderMetadataKey(string key) + => string.Equals(key, FormatMetadataKey, StringComparison.OrdinalIgnoreCase) + || string.Equals(key, CheckpointMetadataKey, StringComparison.OrdinalIgnoreCase) + || string.Equals(key, CheckpointOffsetMetadataKey, StringComparison.OrdinalIgnoreCase) + || key.StartsWith("$", StringComparison.Ordinal); + + private static ETag ToAzureETag(string eTag) + { + ArgumentException.ThrowIfNullOrWhiteSpace(eTag); + return new ETag(eTag); } private static bool IsBlobSealed(RequestFailedException exception) @@ -590,7 +826,7 @@ private static InconsistentStateException CreateInconsistentWalStateException(st Message = "Failed to delete obsolete Azure Blob journal checkpoint \"{ContainerName}/{BlobName}\"")] private static partial void LogCheckpointCleanupFailure(ILogger logger, string containerName, string blobName, Exception exception); - private sealed record WalManifest(IJournalFileMetadata Metadata, CheckpointReference? Checkpoint); + private sealed record WalManifest(IJournalMetadata Metadata, CheckpointReference? Checkpoint); private readonly record struct WalState(ETag ETag, WalManifest Manifest); diff --git a/src/Azure/Orleans.Journaling.AzureStorage/AzureBlobJournalStorageProvider.cs b/src/Azure/Orleans.Journaling.AzureStorage/AzureBlobJournalStorageProvider.cs index 0a08b3051a2..615f13763a0 100644 --- a/src/Azure/Orleans.Journaling.AzureStorage/AzureBlobJournalStorageProvider.cs +++ b/src/Azure/Orleans.Journaling.AzureStorage/AzureBlobJournalStorageProvider.cs @@ -1,14 +1,21 @@ +using System.Runtime.CompilerServices; +using Azure; +using Azure.Storage.Blobs; +using Azure.Storage.Blobs.Models; +using Azure.Storage.Blobs.Specialized; using Microsoft.Extensions.DependencyInjection; using Microsoft.Extensions.Logging; using Microsoft.Extensions.Options; +using Orleans.Runtime; namespace Orleans.Journaling; -internal sealed class AzureBlobJournalStorageProvider : ILifecycleParticipant, IJournalStorageProvider +internal sealed class AzureBlobJournalStorageProvider : ILifecycleParticipant, IJournalStorageProvider, IJournalStorageCatalog { private readonly IBlobContainerFactory _containerFactory; private readonly AzureBlobJournalStorageOptions _options; private readonly AzureBlobJournalStorage.AzureBlobJournalStorageShared _shared; + private BlobContainerClient? _defaultContainer; public AzureBlobJournalStorageProvider( IOptions options, @@ -31,6 +38,8 @@ public AzureBlobJournalStorageProvider( private async Task Initialize(CancellationToken cancellationToken) { var client = await _options.CreateClient!(cancellationToken); + _defaultContainer = client.GetBlobContainerClient(_options.ContainerName); + await _defaultContainer.CreateIfNotExistsAsync(cancellationToken: cancellationToken).ConfigureAwait(false); await _containerFactory.InitializeAsync(client, cancellationToken).ConfigureAwait(false); } @@ -44,6 +53,43 @@ public IJournalStorage CreateStorage(JournalId journalId) return new AzureBlobJournalStorage(_shared, journalId); } + public async IAsyncEnumerable ListAsync( + JournalId prefix = default, + [EnumeratorCancellation] CancellationToken cancellationToken = default) + { + var container = GetDefaultContainerClient(); + var blobPrefix = prefix.IsDefault ? null : prefix.Value; + var journalIds = new List(); + await foreach (var item in container.GetBlobsAsync( + traits: BlobTraits.None, + states: BlobStates.None, + prefix: blobPrefix, + cancellationToken: cancellationToken)) + { + if (item.Properties.BlobType is { } blobType && blobType != BlobType.Append) + { + continue; + } + + if (!item.Name.EndsWith("/wal", StringComparison.Ordinal)) + { + continue; + } + + var storageIdValue = item.Name[..^"/wal".Length]; + if (TryParseJournalId(storageIdValue, out var journalId) && prefix.IsPrefixOf(journalId)) + { + journalIds.Add(journalId); + } + } + + foreach (var journalId in journalIds.OrderBy(static journalId => journalId.Value, StringComparer.Ordinal)) + { + cancellationToken.ThrowIfCancellationRequested(); + yield return journalId; + } + } + public void Participate(ISiloLifecycle observer) { observer.Subscribe( @@ -52,6 +98,24 @@ public void Participate(ISiloLifecycle observer) onStart: Initialize); } + private BlobContainerClient GetDefaultContainerClient() + => _defaultContainer ?? throw new InvalidOperationException( + $"{nameof(AzureBlobJournalStorageProvider)} has not been initialized. Ensure the silo lifecycle has started before using journal storage."); + + private static bool TryParseJournalId(string value, out JournalId journalId) + { + try + { + journalId = new JournalId(value); + return true; + } + catch (ArgumentException) + { + journalId = default; + return false; + } + } + private static IJournalFormat GetJournalFormat(IServiceProvider serviceProvider, string journalFormatKey) { var journalFormat = serviceProvider.GetKeyedService(journalFormatKey); diff --git a/src/Azure/Orleans.Journaling.AzureStorage/AzureBlobStorageHostingExtensions.cs b/src/Azure/Orleans.Journaling.AzureStorage/AzureBlobStorageHostingExtensions.cs index 9bf1088bf7c..a5f0702e370 100644 --- a/src/Azure/Orleans.Journaling.AzureStorage/AzureBlobStorageHostingExtensions.cs +++ b/src/Azure/Orleans.Journaling.AzureStorage/AzureBlobStorageHostingExtensions.cs @@ -22,6 +22,7 @@ public static ISiloBuilder AddAzureBlobJournalStorage(this ISiloBuilder builder, { builder.Services.AddSingleton(); builder.Services.AddFromExisting(); + builder.Services.AddFromExisting(); builder.Services.AddFromExisting, AzureBlobJournalStorageProvider>(); } return builder; diff --git a/src/Orleans.DurableJobs/DurableJobsJsonContext.cs b/src/Orleans.DurableJobs/DurableJobsJsonContext.cs new file mode 100644 index 00000000000..cb97b4bb2e5 --- /dev/null +++ b/src/Orleans.DurableJobs/DurableJobsJsonContext.cs @@ -0,0 +1,19 @@ +using System; +using System.Collections.Generic; +using System.Text.Json.Serialization; + +namespace Orleans.DurableJobs; + +[JsonSerializable(typeof(DurableJob))] +[JsonSerializable(typeof(DurableJobShardJournalRecord))] +[JsonSerializable(typeof(DurableJobShardRemoveOperation))] +[JsonSerializable(typeof(DurableJobShardRetryOperation))] +[JsonSerializable(typeof(DurableJobShardScheduleOperation))] +[JsonSerializable(typeof(DurableJobShardSnapshot))] +[JsonSerializable(typeof(DurableJobShardSnapshotEntry))] +[JsonSerializable(typeof(Dictionary))] +[JsonSerializable(typeof(DateTime))] +[JsonSerializable(typeof(string))] +[JsonSerializable(typeof(uint))] +[JsonSerializable(typeof(ulong))] +internal sealed partial class DurableJobsJsonContext : JsonSerializerContext; diff --git a/src/Orleans.DurableJobs/Hosting/DurableJobsExtensions.cs b/src/Orleans.DurableJobs/Hosting/DurableJobsExtensions.cs index f0cd24844a3..7f894661617 100644 --- a/src/Orleans.DurableJobs/Hosting/DurableJobsExtensions.cs +++ b/src/Orleans.DurableJobs/Hosting/DurableJobsExtensions.cs @@ -1,10 +1,13 @@ using System.Linq; +using Microsoft.Extensions.Configuration; using Microsoft.Extensions.DependencyInjection; +using Microsoft.Extensions.DependencyInjection.Extensions; using Microsoft.Extensions.Logging; -using Microsoft.Extensions.Options; using Orleans.Configuration.Internal; using Orleans.Runtime; using Orleans.DurableJobs; +using Orleans.Journaling; +using Orleans.Journaling.Json; namespace Orleans.Hosting; @@ -32,6 +35,7 @@ public static void AddDurableJobs(this IServiceCollection services) } services.AddSingleton(); + services.AddSingleton(); services.AddSingleton(); services.AddSingleton(); services.AddFromExisting(); @@ -54,8 +58,10 @@ public static void AddDurableJobs(this IServiceCollection services) public static ISiloBuilder UseInMemoryDurableJobs(this ISiloBuilder builder) { builder.AddDurableJobs(); + builder.AddJournalStorage(); + builder.UseJsonJournalFormat(options => options.AddTypeInfoResolver(DurableJobsJsonContext.Default)); - builder.ConfigureServices(services => services.UseInMemoryDurableJobs()); + builder.ConfigureServices(services => services.UseVolatileJournaledDurableJobs()); return builder; } @@ -69,14 +75,32 @@ public static ISiloBuilder UseInMemoryDurableJobs(this ISiloBuilder builder) /// The provided , for chaining. internal static IServiceCollection UseInMemoryDurableJobs(this IServiceCollection services) { - services.AddSingleton(sp => - { - var siloDetails = sp.GetRequiredService(); - var membershipService = sp.GetRequiredService(); - var durableJobsOptions = sp.GetRequiredService>(); - return new InMemoryJobShardManager(siloDetails.SiloAddress, membershipService, durableJobsOptions.Value.MaxAdoptedCount); - }); - services.AddFromExisting(); + var builder = new ServiceCollectionSiloBuilder(services); + builder.AddJournalStorage(); + builder.UseJsonJournalFormat(options => options.AddTypeInfoResolver(DurableJobsJsonContext.Default)); + return services.UseVolatileJournaledDurableJobs(); + } + + private static IServiceCollection UseVolatileJournaledDurableJobs(this IServiceCollection services) + { + services.TryAddSingleton(); + services.AddFromExisting(); + services.AddFromExisting(); + services.TryAddSingleton(); + services.AddFromExisting(); return services; } + + private sealed class ServiceCollectionSiloBuilder : ISiloBuilder + { + public ServiceCollectionSiloBuilder(IServiceCollection services) + { + Services = services; + Configuration = new ConfigurationBuilder().Build(); + } + + public IServiceCollection Services { get; } + + public IConfiguration Configuration { get; } + } } diff --git a/src/Orleans.DurableJobs/Hosting/DurableJobsOptions.cs b/src/Orleans.DurableJobs/Hosting/DurableJobsOptions.cs index 133356868a7..751a92a69df 100644 --- a/src/Orleans.DurableJobs/Hosting/DurableJobsOptions.cs +++ b/src/Orleans.DurableJobs/Hosting/DurableJobsOptions.cs @@ -1,8 +1,11 @@ using System; +using System.Collections.Generic; +using Microsoft.Extensions.DependencyInjection; using Microsoft.Extensions.Logging; using Microsoft.Extensions.Options; using Orleans.Runtime; using Orleans.DurableJobs; +using Orleans.Journaling; namespace Orleans.Hosting; @@ -210,3 +213,87 @@ public void ValidateConfiguration() )] private static partial void LogInformationOptionsValidated(ILogger logger, TimeSpan shardDuration); } + +internal sealed class DurableJobsJournalingConfigurationValidator : IConfigurationValidator +{ + private readonly IServiceProvider _serviceProvider; + + public DurableJobsJournalingConfigurationValidator(IServiceProvider serviceProvider) + { + _serviceProvider = serviceProvider; + } + + public void ValidateConfiguration() + { + var missingServices = new List(); + var serviceProviderIsService = _serviceProvider.GetService(); + + CheckService(serviceProviderIsService, missingServices); + CheckService(serviceProviderIsService, missingServices); + CheckService(serviceProviderIsService, missingServices); + CheckService(serviceProviderIsService, missingServices); + + if (missingServices.Count > 0) + { + throw new OrleansConfigurationException( + $"DurableJobs requires Orleans.Journaling storage. Configure DurableJobs storage using UseInMemoryDurableJobs() or UseAzureBlobDurableJobs(...) before starting the silo. Missing services: {string.Join(", ", missingServices)}."); + } + + var shardManager = ResolveRequiredService(); + if (shardManager is not JournaledJobShardManager) + { + throw new OrleansConfigurationException( + $"DurableJobs requires the journaled shard manager, but '{shardManager.GetType().FullName}' is registered. Configure DurableJobs storage using UseInMemoryDurableJobs() or UseAzureBlobDurableJobs(...)."); + } + } + + private void CheckService(IServiceProviderIsService? serviceProviderIsService, List missingServices) + where TService : class + { + if (serviceProviderIsService is not null) + { + if (!serviceProviderIsService.IsService(typeof(TService))) + { + missingServices.Add(typeof(TService).Name); + } + + return; + } + + if (ResolveService() is null) + { + missingServices.Add(typeof(TService).Name); + } + } + + private TService? ResolveService() + where TService : class + { + try + { + return _serviceProvider.GetService(); + } + catch (Exception exception) + { + throw CreateServiceResolutionException(exception); + } + } + + private TService ResolveRequiredService() + where TService : notnull + { + try + { + return _serviceProvider.GetRequiredService(); + } + catch (Exception exception) + { + throw CreateServiceResolutionException(exception); + } + } + + private static OrleansConfigurationException CreateServiceResolutionException(Exception exception) + => new( + $"DurableJobs requires Orleans.Journaling storage, but service '{typeof(TService).Name}' could not be resolved. Configure DurableJobs storage using UseInMemoryDurableJobs() or UseAzureBlobDurableJobs(...).", + exception); +} diff --git a/src/Orleans.DurableJobs/ILocalDurableJobManager.cs b/src/Orleans.DurableJobs/ILocalDurableJobManager.cs index 9e50a6231af..3bc367cf44f 100644 --- a/src/Orleans.DurableJobs/ILocalDurableJobManager.cs +++ b/src/Orleans.DurableJobs/ILocalDurableJobManager.cs @@ -27,3 +27,8 @@ public interface ILocalDurableJobManager /// A representing the asynchronous operation that returns if the job was successfully canceled; otherwise, . Task TryCancelDurableJobAsync(DurableJob job, CancellationToken cancellationToken); } + +internal interface ILocalDurableJobManagerSystemTarget : ISystemTarget +{ + Task TryCancelDurableJobAsync(DurableJob job, CancellationToken cancellationToken); +} diff --git a/src/Orleans.DurableJobs/InMemoryJobQueue.cs b/src/Orleans.DurableJobs/InMemoryJobQueue.cs index 55db1811620..e687ed3c00e 100644 --- a/src/Orleans.DurableJobs/InMemoryJobQueue.cs +++ b/src/Orleans.DurableJobs/InMemoryJobQueue.cs @@ -12,6 +12,7 @@ namespace Orleans.DurableJobs; /// internal sealed class InMemoryJobQueue : IAsyncEnumerable { + private readonly TimeProvider _timeProvider; private readonly PriorityQueue _queue = new(); private readonly Dictionary _jobsIdToBucket = new(); private readonly Dictionary _buckets = new(); @@ -22,6 +23,11 @@ internal sealed class InMemoryJobQueue : IAsyncEnumerable private readonly object _syncLock = new(); #endif + public InMemoryJobQueue(TimeProvider? timeProvider = null) + { + _timeProvider = timeProvider ?? TimeProvider.System; + } + /// /// Gets the total number of jobs currently in the queue. /// @@ -37,6 +43,10 @@ internal sealed class InMemoryJobQueue : IAsyncEnumerable public void Enqueue(DurableJob job, int dequeueCount) { ArgumentNullException.ThrowIfNull(job); + if (dequeueCount < 0) + { + throw new ArgumentOutOfRangeException(nameof(dequeueCount)); + } lock (_syncLock) { @@ -97,27 +107,83 @@ public bool CancelJob(string jobId) /// public void RetryJobLater(IJobRunContext jobContext, DateTimeOffset newDueTime) { - var jobId = jobContext.Job.Id; - var newJob = new DurableJob + ArgumentNullException.ThrowIfNull(jobContext); + _ = RetryJobLater(jobContext.Job.Id, newDueTime, jobContext.DequeueCount); + } + + /// + /// Reschedules a job for retry with a new due time. + /// + /// The unique identifier of the job to retry. + /// The new due time for the job. + /// The persisted dequeue count to associate with the retried job. + /// True if the job was found and rescheduled; false if the job was not found. + public bool RetryJobLater(string jobId, DateTimeOffset newDueTime, int dequeueCount) + { + ArgumentException.ThrowIfNullOrWhiteSpace(jobId); + if (dequeueCount < 0) { - Id = jobContext.Job.Id, - Name = jobContext.Job.Name, - DueTime = newDueTime, - TargetGrainId = jobContext.Job.TargetGrainId, - ShardId = jobContext.Job.ShardId, - Metadata = jobContext.Job.Metadata - }; + throw new ArgumentOutOfRangeException(nameof(dequeueCount)); + } lock (_syncLock) { - if (_jobsIdToBucket.TryGetValue(jobId, out var oldBucket)) + if (!_jobsIdToBucket.TryGetValue(jobId, out var oldBucket) || !oldBucket.TryGetJob(jobId, out var existing)) { - oldBucket.RemoveJob(jobId); - _jobsIdToBucket.Remove(jobId); - var newBucket = GetJobBucket(newDueTime); - newBucket.AddJob(newJob, jobContext.DequeueCount); - _jobsIdToBucket[jobId] = newBucket; + return false; } + + var newJob = new DurableJob + { + Id = existing.Job.Id, + Name = existing.Job.Name, + DueTime = newDueTime, + TargetGrainId = existing.Job.TargetGrainId, + ShardId = existing.Job.ShardId, + Metadata = existing.Job.Metadata + }; + + oldBucket.RemoveJob(jobId); + _jobsIdToBucket.Remove(jobId); + var newBucket = GetJobBucket(newDueTime); + newBucket.AddJob(newJob, dequeueCount); + _jobsIdToBucket[jobId] = newBucket; + return true; + } + } + + /// + /// Gets a point-in-time snapshot of live jobs and their persisted dequeue counts. + /// + /// The current live jobs and dequeue counts. + public IReadOnlyList<(DurableJob Job, int DequeueCount)> GetSnapshot() + { + lock (_syncLock) + { + var result = new List<(DurableJob Job, int DequeueCount)>(_jobsIdToBucket.Count); + foreach (var (jobId, bucket) in _jobsIdToBucket) + { + if (bucket.TryGetJob(jobId, out var item)) + { + result.Add(item); + } + } + + return result; + } + } + + /// + /// Clears all queue state. + /// + public void Clear() + { + lock (_syncLock) + { + _queue.Clear(); + _jobsIdToBucket.Clear(); + _buckets.Clear(); + _isComplete = false; } } @@ -131,7 +197,7 @@ public void RetryJobLater(IJobRunContext jobContext, DateTimeOffset newDueTime) /// public async IAsyncEnumerator GetAsyncEnumerator(CancellationToken cancellationToken = default) { - using var timer = new PeriodicTimer(TimeSpan.FromSeconds(1)); + using var timer = new PeriodicTimer(TimeSpan.FromSeconds(1), _timeProvider); while (true) { JobBucket? bucketToProcess = null; @@ -149,7 +215,7 @@ public async IAsyncEnumerator GetAsyncEnumerator(CancellationTok else if (_queue.Count > 0) { var nextBucket = _queue.Peek(); - if (nextBucket.DueTime < DateTimeOffset.UtcNow) + if (nextBucket.DueTime < _timeProvider.GetUtcNow()) { // Dequeue the entire bucket to process outside the lock bucketToProcess = _queue.Dequeue(); @@ -230,4 +296,9 @@ public bool RemoveJob(string jobId) { return _jobs.Remove(jobId); } + + public bool TryGetJob(string jobId, out (DurableJob Job, int DequeueCount) job) + { + return _jobs.TryGetValue(jobId, out job); + } } diff --git a/src/Orleans.DurableJobs/InMemoryJobShard.cs b/src/Orleans.DurableJobs/InMemoryJobShard.cs deleted file mode 100644 index 148a14b0716..00000000000 --- a/src/Orleans.DurableJobs/InMemoryJobShard.cs +++ /dev/null @@ -1,33 +0,0 @@ -using System; -using System.Collections.Generic; -using System.Diagnostics; -using System.Threading; -using System.Threading.Tasks; -using Orleans.Runtime; - -namespace Orleans.DurableJobs; - -[DebuggerDisplay("ShardId={Id}, StartTime={StartTime}, EndTime={EndTime}")] -internal sealed class InMemoryJobShard : JobShard -{ - public InMemoryJobShard(string shardId, DateTimeOffset minDueTime, DateTimeOffset maxDueTime, IDictionary? metadata) - : base(shardId, minDueTime, maxDueTime) - { - Metadata = metadata; - } - - protected override Task PersistAddJobAsync(string jobId, string jobName, DateTimeOffset dueTime, GrainId target, IReadOnlyDictionary? metadata, CancellationToken cancellationToken) - { - return Task.CompletedTask; - } - - protected override Task PersistRemoveJobAsync(string jobId, CancellationToken cancellationToken) - { - return Task.CompletedTask; - } - - protected override Task PersistRetryJobAsync(string jobId, DateTimeOffset newDueTime, CancellationToken cancellationToken) - { - return Task.CompletedTask; - } -} diff --git a/src/Orleans.DurableJobs/JobShardId.cs b/src/Orleans.DurableJobs/JobShardId.cs new file mode 100644 index 00000000000..a6b6e14480a --- /dev/null +++ b/src/Orleans.DurableJobs/JobShardId.cs @@ -0,0 +1,68 @@ +using System; +using Orleans.Journaling; + +namespace Orleans.DurableJobs; + +internal readonly record struct JobShardId +{ + private const string RootSegment = "jobs"; + private const string ShardsSegment = "shards"; + + public JobShardId(string value) + { + ArgumentException.ThrowIfNullOrWhiteSpace(value); + Value = value; + } + + public string Value { get; } + + public static JournalId StoragePrefix => JournalId.Create(RootSegment, ShardsSegment); + + public static JobShardId New() => new(Guid.NewGuid().ToString("N")); + + public static JobShardId Parse(string value) => new(value); + + public static JobShardId FromJournalId(JournalId journalId) + { + if (journalId.IsDefault) + { + throw new ArgumentException("The journal id must not be the default value.", nameof(journalId)); + } + + var segments = DecodeSegments(journalId.Value); + if (segments.Length != 3 + || !string.Equals(segments[0], RootSegment, StringComparison.Ordinal) + || !string.Equals(segments[1], ShardsSegment, StringComparison.Ordinal)) + { + throw new ArgumentException($"Journal id '{journalId}' is not a DurableJobs shard journal id.", nameof(journalId)); + } + + return new(segments[2]); + } + + public JournalId ToJournalId() => JournalId.Create(RootSegment, ShardsSegment, Value); + + public override string ToString() => Value; + + private static string[] DecodeSegments(string value) + { + if (value[0] == '/' || value[^1] == '/') + { + throw new ArgumentException("A journal id must not start or end with a separator.", nameof(value)); + } + + var encodedSegments = value.Split('/'); + var decodedSegments = new string[encodedSegments.Length]; + for (var i = 0; i < encodedSegments.Length; i++) + { + if (encodedSegments[i].Length == 0) + { + throw new ArgumentException("A journal id must not contain empty segments.", nameof(value)); + } + + decodedSegments[i] = Uri.UnescapeDataString(encodedSegments[i]); + } + + return decodedSegments; + } +} diff --git a/src/Orleans.DurableJobs/JobShardManager.cs b/src/Orleans.DurableJobs/JobShardManager.cs index 51a5feaa677..5a1a9408768 100644 --- a/src/Orleans.DurableJobs/JobShardManager.cs +++ b/src/Orleans.DurableJobs/JobShardManager.cs @@ -1,6 +1,5 @@ using System; using System.Collections.Generic; -using System.Linq; using System.Threading; using System.Threading.Tasks; using Orleans.Runtime; @@ -9,8 +8,10 @@ namespace Orleans.DurableJobs; /// /// Manages the lifecycle of job shards for a specific silo. -/// Each silo instance has its own shard manager. /// +/// +/// Each silo instance has its own shard manager. +/// public abstract class JobShardManager { /// @@ -57,209 +58,8 @@ protected JobShardManager(SiloAddress siloAddress) /// Cancellation token. /// A task representing the asynchronous operation. public abstract Task UnregisterShardAsync(IJobShard shard, CancellationToken cancellationToken); -} - -internal class InMemoryJobShardManager : JobShardManager -{ - // Shared storage across all manager instances to support multi-silo scenarios - private static readonly Dictionary _globalShardStore = new(); - private static readonly SemaphoreSlim _asyncLock = new(1, 1); - private readonly IClusterMembershipService? _membershipService; - private readonly int _maxAdoptedCount; - - public InMemoryJobShardManager(SiloAddress siloAddress) : this(siloAddress, null, 3) - { - } - - public InMemoryJobShardManager(SiloAddress siloAddress, IClusterMembershipService? membershipService) : this(siloAddress, membershipService, 3) - { - } - - public InMemoryJobShardManager(SiloAddress siloAddress, IClusterMembershipService? membershipService, int maxAdoptedCount) : base(siloAddress) - { - _membershipService = membershipService; - _maxAdoptedCount = maxAdoptedCount; - } - - /// - /// Clears all shards from the global store. For testing purposes only. - /// - internal static async Task ClearAllShardsAsync() - { - await _asyncLock.WaitAsync(); - try - { - _globalShardStore.Clear(); - } - finally - { - _asyncLock.Release(); - } - } - - /// - /// Gets ownership info for a shard. For testing purposes only. - /// - internal static async Task<(string? Owner, int AdoptedCount)?> GetOwnershipInfoAsync(string shardId) - { - await _asyncLock.WaitAsync(); - try - { - if (_globalShardStore.TryGetValue(shardId, out var ownership)) - { - return (ownership.OwnerSiloAddress, ownership.AdoptedCount); - } - return null; - } - finally - { - _asyncLock.Release(); - } - } - - public override async Task> AssignJobShardsAsync(DateTimeOffset maxDueTime, int maxNewClaims, CancellationToken cancellationToken) - { - var alreadyOwnedShards = new List(); - var adoptedShards = new List(); - - await _asyncLock.WaitAsync(cancellationToken); - try - { - var snapshot = _membershipService?.CurrentSnapshot; - var deadSilos = new HashSet(); - - if (snapshot is not null) - { - foreach (var member in snapshot.Members.Values) - { - if (member.Status == SiloStatus.Dead) - { - deadSilos.Add(member.SiloAddress.ToString()); - } - } - } - - // Assign shards from dead silos or orphaned shards - foreach (var kvp in _globalShardStore) - { - var shardId = kvp.Key; - var ownership = kvp.Value; - - // Skip shards that are already owned by this silo - if (ownership.OwnerSiloAddress == SiloAddress.ToString()) - { - if (ownership.Shard.StartTime <= maxDueTime) - { - alreadyOwnedShards.Add(ownership.Shard); - } - continue; - } - - // Check if this is an orphaned shard (gracefully released) or adopted (from dead silo) - var isOrphaned = ownership.OwnerSiloAddress is null; - var ownerAddress = ownership.OwnerSiloAddress; - var isFromDeadSilo = ownerAddress is not null && deadSilos.Contains(ownerAddress); - - if (isOrphaned || isFromDeadSilo) - { - if (ownership.Shard.StartTime <= maxDueTime) - { - // Respect the slow-start budget: skip claiming if we've exhausted the budget. - // This must be checked before incrementing AdoptedCount to avoid - // inflating the count when the shard isn't actually claimed. - if (adoptedShards.Count >= maxNewClaims) - { - continue; - } - - // If adopted from dead silo, increment adopted count - if (isFromDeadSilo) - { - ownership.AdoptedCount++; - - // Check if shard is poisoned - if (ownership.AdoptedCount > _maxAdoptedCount) - { - // Shard is poisoned - don't assign it - continue; - } - } - - ownership.OwnerSiloAddress = SiloAddress.ToString(); - adoptedShards.Add(ownership.Shard); - } - } - } - } - finally - { - _asyncLock.Release(); - } - foreach (var shard in adoptedShards) - { - // Mark adopted shards as complete - await shard.MarkAsCompleteAsync(CancellationToken.None); - } + internal virtual ValueTask GetShardOwnerAsync(string shardId, CancellationToken cancellationToken) => new((SiloAddress?)null); - return [.. alreadyOwnedShards, .. adoptedShards]; - } - - public override async Task CreateShardAsync(DateTimeOffset minDueTime, DateTimeOffset maxDueTime, IDictionary metadata, CancellationToken cancellationToken) - { - await _asyncLock.WaitAsync(cancellationToken); - try - { - var shardId = $"{SiloAddress}-{Guid.NewGuid()}"; - var newShard = new InMemoryJobShard(shardId, minDueTime, maxDueTime, metadata); - - _globalShardStore[shardId] = new ShardOwnership - { - Shard = newShard, - OwnerSiloAddress = SiloAddress.ToString() - }; - - return newShard; - } - finally - { - _asyncLock.Release(); - } - } - - public override async Task UnregisterShardAsync(IJobShard shard, CancellationToken cancellationToken) - { - var jobCount = await shard.GetJobCountAsync(); - - await _asyncLock.WaitAsync(cancellationToken); - try - { - // Only remove shards that have no jobs remaining - if (_globalShardStore.TryGetValue(shard.Id, out var ownership)) - { - if (jobCount == 0) - { - _globalShardStore.Remove(shard.Id); - } - else - { - // Mark as unowned so another silo can pick it up - ownership.OwnerSiloAddress = null; - // Reset adopted count since we're gracefully releasing (not crashing) - ownership.AdoptedCount = 0; - } - } - } - finally - { - _asyncLock.Release(); - } - } - - private sealed class ShardOwnership - { - public required IJobShard Shard { get; init; } - public string? OwnerSiloAddress { get; set; } - public int AdoptedCount { get; set; } - } + internal virtual ValueTask IsShardOwnedByLocalSiloAsync(string shardId, CancellationToken cancellationToken) => new(true); } diff --git a/src/Orleans.DurableJobs/JournaledJobShard.cs b/src/Orleans.DurableJobs/JournaledJobShard.cs new file mode 100644 index 00000000000..5d66d778aad --- /dev/null +++ b/src/Orleans.DurableJobs/JournaledJobShard.cs @@ -0,0 +1,229 @@ +using System; +using System.Collections.Generic; +using System.Threading; +using System.Threading.Tasks; +using Orleans.Journaling; + +namespace Orleans.DurableJobs; + +/// +/// Journaled implementation of that stores shard state in Orleans journaling storage. +/// +internal sealed class JournaledJobShard : IJobShard +{ + private readonly JournaledJobShardState _state; + private readonly IJournaledStateManager _stateManager; + private readonly JournaledJobShardManager _shardManager; + private readonly SemaphoreSlim _operationLock = new(1, 1); + private int _disposed; + + /// + /// Initializes a new instance of the class. + /// + /// The unique identifier for this job shard. + /// The start time of the time range managed by this shard. + /// The end time of the time range managed by this shard. + /// Optional metadata associated with this job shard. + /// A value indicating whether this shard is closed to new jobs. + /// The journaled shard state. + /// The manager used to persist journaled state. + /// The shard manager that owns this shard. + public JournaledJobShard( + JobShardId shardId, + DateTimeOffset startTime, + DateTimeOffset endTime, + IReadOnlyDictionary? metadata, + bool isClosed, + JournaledJobShardState state, + IJournaledStateManager stateManager, + JournaledJobShardManager shardManager) + { + ArgumentNullException.ThrowIfNull(state); + ArgumentNullException.ThrowIfNull(stateManager); + ArgumentNullException.ThrowIfNull(shardManager); + + Id = shardId.Value; + StartTime = startTime; + EndTime = endTime; + Metadata = metadata is { Count: > 0 } ? new Dictionary(metadata, StringComparer.Ordinal) : null; + _state = state; + _stateManager = stateManager; + _shardManager = shardManager; + + if (isClosed) + { + _state.MarkAsComplete(); + } + } + + /// + public string Id { get; } + + /// + public DateTimeOffset StartTime { get; } + + /// + public DateTimeOffset EndTime { get; } + + /// + public IDictionary? Metadata { get; } + + /// + public bool IsAddingCompleted => _state.IsAddingCompleted; + + /// + /// Gets the backing journal identifier for this shard. + /// + internal JournalId StorageId => JobShardId.Parse(Id).ToJournalId(); + + /// + public IAsyncEnumerable ConsumeDurableJobsAsync() => _state.ConsumeDurableJobsAsync(); + + /// + public ValueTask GetJobCountAsync() => ValueTask.FromResult(_state.Count); + + /// + public async Task MarkAsCompleteAsync(CancellationToken cancellationToken) + { + ThrowIfDisposed(); + + await _operationLock.WaitAsync(cancellationToken); + try + { + if (_state.IsAddingCompleted) + { + return; + } + + if (await _shardManager.TryMarkShardClosedAsync(Id, cancellationToken)) + { + _state.MarkAsComplete(); + } + } + finally + { + _operationLock.Release(); + } + } + + /// + public async Task RemoveJobAsync(string jobId, CancellationToken cancellationToken) + { + ArgumentException.ThrowIfNullOrWhiteSpace(jobId); + ThrowIfDisposed(); + + await _operationLock.WaitAsync(cancellationToken); + try + { + if (!await _shardManager.IsShardOwnedByLocalSiloAsync(Id, cancellationToken)) + { + return false; + } + + var removed = _state.RemoveJob(jobId); + await _stateManager.WriteStateAsync(cancellationToken); + return removed; + } + finally + { + _operationLock.Release(); + } + } + + /// + public async Task RetryJobLaterAsync(IJobRunContext jobContext, DateTimeOffset newDueTime, CancellationToken cancellationToken) + { + ArgumentNullException.ThrowIfNull(jobContext); + ThrowIfDisposed(); + + await _operationLock.WaitAsync(cancellationToken); + try + { + if (!await _shardManager.IsShardOwnedByLocalSiloAsync(Id, cancellationToken)) + { + return; + } + + _state.RetryJobLater(jobContext, newDueTime); + await _stateManager.WriteStateAsync(cancellationToken); + } + finally + { + _operationLock.Release(); + } + } + + /// + public async Task TryScheduleJobAsync(ScheduleJobRequest request, CancellationToken cancellationToken) + { + ThrowIfDisposed(); + + await _operationLock.WaitAsync(cancellationToken); + try + { + if (_state.IsAddingCompleted) + { + return null; + } + + if (!await _shardManager.IsShardOwnedByLocalSiloAsync(Id, cancellationToken)) + { + return null; + } + + var job = _state.TryScheduleJob(request); + if (job is null) + { + return null; + } + + await _stateManager.WriteStateAsync(cancellationToken); + return job; + } + finally + { + _operationLock.Release(); + } + } + + /// + /// Deletes this shard's journaled state. + /// + /// A token to cancel the operation. + /// A task that represents the asynchronous operation. + internal async ValueTask DeleteStateAsync(CancellationToken cancellationToken) + { + ThrowIfDisposed(); + + await _operationLock.WaitAsync(cancellationToken); + try + { + await _stateManager.DeleteStateAsync(cancellationToken); + } + finally + { + _operationLock.Release(); + } + } + + /// + public async ValueTask DisposeAsync() + { + if (Interlocked.Exchange(ref _disposed, 1) != 0) + { + return; + } + + try + { + await _stateManager.DisposeAsync(); + } + finally + { + _operationLock.Dispose(); + GC.SuppressFinalize(this); + } + } + + private void ThrowIfDisposed() => ObjectDisposedException.ThrowIf(_disposed != 0, this); +} diff --git a/src/Orleans.DurableJobs/JournaledJobShardManager.cs b/src/Orleans.DurableJobs/JournaledJobShardManager.cs new file mode 100644 index 00000000000..0ca5e9fab9c --- /dev/null +++ b/src/Orleans.DurableJobs/JournaledJobShardManager.cs @@ -0,0 +1,560 @@ +using System; +using System.Collections.Concurrent; +using System.Collections.Generic; +using System.Globalization; +using System.Text; +using System.Threading; +using System.Threading.Tasks; +using Microsoft.Extensions.DependencyInjection; +using Microsoft.Extensions.Options; +using Orleans.Hosting; +using Orleans.Journaling; +using Orleans.Runtime; + +namespace Orleans.DurableJobs; + +internal sealed class JournaledJobShardManager : JobShardManager +{ + private const string OwnerProperty = "DurableJobsOwner"; + private const string MembershipVersionProperty = "DurableJobsMembershipVersion"; + private const string MinDueTimeProperty = "DurableJobsMinDueTime"; + private const string MaxDueTimeProperty = "DurableJobsMaxDueTime"; + private const string AdoptedCountProperty = "DurableJobsAdoptedCount"; + private const string LastAdoptedTimeProperty = "DurableJobsLastAdoptedTime"; + private const string PoisonedProperty = "DurableJobsPoisoned"; + private const string ClosedProperty = "DurableJobsClosed"; + private const string MetadataPropertyPrefix = "DurableJobsMetadata_"; + + private readonly IJournaledStateManagerFactory _stateManagerFactory; + private readonly IJournalStorageProvider _storageProvider; + private readonly IJournalStorageCatalog _catalog; + private readonly IClusterMembershipService _membershipService; + private readonly IServiceProvider _serviceProvider; + private readonly DurableJobsOptions _options; + private readonly JournaledStateManagerOptions _journaledStateManagerOptions; + private readonly TimeProvider _timeProvider; + private readonly ConcurrentDictionary _jobShardCache = new(); + + public JournaledJobShardManager( + ILocalSiloDetails localSiloDetails, + IJournaledStateManagerFactory stateManagerFactory, + IJournalStorageProvider storageProvider, + IJournalStorageCatalog catalog, + IClusterMembershipService membershipService, + IServiceProvider serviceProvider, + IOptions options, + IOptions journaledStateManagerOptions) + : base(GetSiloAddress(localSiloDetails)) + { + ArgumentNullException.ThrowIfNull(localSiloDetails); + ArgumentNullException.ThrowIfNull(stateManagerFactory); + ArgumentNullException.ThrowIfNull(storageProvider); + ArgumentNullException.ThrowIfNull(catalog); + ArgumentNullException.ThrowIfNull(membershipService); + ArgumentNullException.ThrowIfNull(serviceProvider); + ArgumentNullException.ThrowIfNull(options); + ArgumentNullException.ThrowIfNull(journaledStateManagerOptions); + + _stateManagerFactory = stateManagerFactory; + _storageProvider = storageProvider; + _catalog = catalog; + _membershipService = membershipService; + _serviceProvider = serviceProvider; + _options = options.Value; + _journaledStateManagerOptions = journaledStateManagerOptions.Value; + _timeProvider = serviceProvider.GetService() ?? TimeProvider.System; + } + + private static SiloAddress GetSiloAddress(ILocalSiloDetails localSiloDetails) + { + ArgumentNullException.ThrowIfNull(localSiloDetails); + return localSiloDetails.SiloAddress; + } + + public override async Task> AssignJobShardsAsync(DateTimeOffset maxDueTime, int maxNewClaims, CancellationToken cancellationToken) + { + var result = new List(); + var newClaimCount = 0; + var membershipSnapshot = _membershipService.CurrentSnapshot; + + await foreach (var storageId in _catalog.ListAsync(JobShardId.StoragePrefix, cancellationToken)) + { + var descriptor = await GetDescriptorAsync(storageId, cancellationToken); + if (descriptor is null || descriptor.Poisoned || descriptor.StartTime > maxDueTime) + { + continue; + } + + if (descriptor.MembershipVersion > membershipSnapshot.Version) + { + // Refresh membership to at least that version. + await _membershipService.Refresh(descriptor.MembershipVersion, cancellationToken); + membershipSnapshot = _membershipService.CurrentSnapshot; + } + + if (descriptor.Owner is { } owner && owner.Equals(SiloAddress)) + { + result.Add(await GetOrOpenShardAsync(descriptor, cancellationToken)); + continue; + } + + // Determine if this is an adopted shard (taken from dead owner) vs orphaned (gracefully released). + var isAdopted = false; + if (descriptor.Owner is { } previousOwner) + { + var ownerStatus = membershipSnapshot.GetSiloStatus(previousOwner); + if (ownerStatus is not SiloStatus.Dead and not SiloStatus.None) + { + // Owner is still active and it's not me, skip this shard. + continue; + } + + isAdopted = ownerStatus == SiloStatus.Dead; + } + + // Respect the slow-start budget: skip claiming if we've exhausted the budget. + // This must be checked before incrementing the adopted count to avoid + // inflating the count when the shard isn't actually claimed. + if (newClaimCount >= maxNewClaims) + { + continue; + } + + // Try to claim orphaned or adopted shard. + var claimedShard = await TryClaimShardAsync(descriptor, isAdopted, cancellationToken); + if (claimedShard is null) + { + // Either poisoned shard or someone else took ownership. + continue; + } + + _jobShardCache[claimedShard.Id] = claimedShard; + result.Add(claimedShard); + newClaimCount++; + } + + return result; + } + + public override async Task CreateShardAsync(DateTimeOffset minDueTime, DateTimeOffset maxDueTime, IDictionary metadata, CancellationToken cancellationToken) + { + while (true) + { + var shardId = JobShardId.New(); + var storageId = shardId.ToJournalId(); + var initialProperties = CreateInitialProperties(minDueTime, maxDueTime, metadata); + var storage = _storageProvider.CreateStorage(storageId); + if (!await storage.CreateIfNotExistsAsync(initialProperties, cancellationToken)) + { + continue; + } + + var properties = await storage.GetMetadataAsync(cancellationToken); + var descriptor = properties is not null ? ShardCatalogProperties.From(storageId, properties) : null; + if (descriptor is null) + { + throw new InvalidOperationException($"Created DurableJobs shard '{shardId}' without readable journal storage properties."); + } + + var shard = await OpenShardAsync(descriptor, cancellationToken); + _jobShardCache[shard.Id] = shard; + return shard; + } + } + + public override async Task UnregisterShardAsync(IJobShard shard, CancellationToken cancellationToken) + { + var journaledShard = shard as JournaledJobShard + ?? throw new ArgumentException("Shard is not a journaled DurableJobs shard.", nameof(shard)); + + try + { + var descriptor = await GetDescriptorAsync(journaledShard.StorageId, cancellationToken) + ?? throw new InvalidOperationException($"Cannot unregister DurableJobs shard '{shard.Id}' because its catalog properties were not found."); + + if (descriptor.Owner is null || !descriptor.Owner.Equals(SiloAddress)) + { + throw new InvalidOperationException("Cannot unregister a DurableJobs shard owned by another silo."); + } + + var count = await shard.GetJobCountAsync(); + if (count == 0) + { + // No jobs left, we can delete the shard. + await journaledShard.DeleteStateAsync(cancellationToken); + } + else + { + // There are still jobs in the shard, release ownership gracefully. + var updatedMetadata = await UpdateMetadataAsync( + descriptor, + new Dictionary(StringComparer.Ordinal) + { + [ClosedProperty] = bool.TrueString, + [MembershipVersionProperty] = GetMembershipVersionString() + }, + [OwnerProperty, AdoptedCountProperty, LastAdoptedTimeProperty], + cancellationToken); + + if (updatedMetadata is null) + { + throw new InvalidOperationException($"Failed to release DurableJobs shard '{shard.Id}' ownership."); + } + } + } + finally + { + _jobShardCache.TryRemove(shard.Id, out _); + await journaledShard.DisposeAsync(); + } + } + + internal override async ValueTask GetShardOwnerAsync(string shardId, CancellationToken cancellationToken) + { + var descriptor = await GetDescriptorAsync(shardId, cancellationToken); + if (descriptor is null || descriptor.Poisoned || descriptor.Owner is null) + { + return null; + } + + if (descriptor.Owner.Equals(SiloAddress)) + { + return descriptor.Owner; + } + + var membershipSnapshot = _membershipService.CurrentSnapshot; + if (descriptor.MembershipVersion > membershipSnapshot.Version) + { + await _membershipService.Refresh(descriptor.MembershipVersion, cancellationToken); + membershipSnapshot = _membershipService.CurrentSnapshot; + } + + return membershipSnapshot.GetSiloStatus(descriptor.Owner) == SiloStatus.Active ? descriptor.Owner : null; + } + + internal override async ValueTask IsShardOwnedByLocalSiloAsync(string shardId, CancellationToken cancellationToken) + { + var descriptor = await GetDescriptorAsync(shardId, cancellationToken); + return descriptor is { Poisoned: false, Owner: { } owner } && owner.Equals(SiloAddress); + } + + internal async ValueTask TryMarkShardClosedAsync(string shardId, CancellationToken cancellationToken) + { + for (var attempt = 0; attempt < 3; attempt++) + { + var descriptor = await GetDescriptorAsync(shardId, cancellationToken); + if (descriptor is null || descriptor.Poisoned || descriptor.Owner is null || !descriptor.Owner.Equals(SiloAddress)) + { + return false; + } + + if (descriptor.Closed) + { + return true; + } + + var result = await UpdateMetadataAsync( + descriptor, + new Dictionary(StringComparer.Ordinal) + { + [ClosedProperty] = bool.TrueString, + [MembershipVersionProperty] = GetMembershipVersionString() + }, + remove: null, + cancellationToken); + if (result is not null) + { + return true; + } + } + + return false; + } + + private async ValueTask TryClaimShardAsync(ShardCatalogProperties descriptor, bool isAdopted, CancellationToken cancellationToken) + { + var adoptedCount = descriptor.AdoptedCount; + var set = new Dictionary(StringComparer.Ordinal) + { + [OwnerProperty] = SiloAddress.ToParsableString(), + [MembershipVersionProperty] = GetMembershipVersionString(), + // We don't want to add new jobs to shards that we just took ownership of. + [ClosedProperty] = bool.TrueString + }; + List? remove = null; + + if (isAdopted) + { + // Increment adopted count for shards taken from dead owners. + adoptedCount++; + if (adoptedCount > _options.MaxAdoptedCount) + { + // Persist poisoned marker so this shard is not repeatedly re-evaluated as newly poisoned. + await TryMarkShardPoisonedAsync(descriptor, adoptedCount, cancellationToken); + return null; + } + + set[AdoptedCountProperty] = adoptedCount.ToString(CultureInfo.InvariantCulture); + set[LastAdoptedTimeProperty] = _timeProvider.GetUtcNow().ToString("O", CultureInfo.InvariantCulture); + } + else + { + // Reset adopted count since we're gracefully releasing. + set[AdoptedCountProperty] = "0"; + remove = [LastAdoptedTimeProperty]; + } + + var updatedMetadata = await UpdateMetadataAsync(descriptor, set, remove, cancellationToken); + if (updatedMetadata is null) + { + return null; + } + + var updatedDescriptor = ShardCatalogProperties.From(descriptor.StorageId, updatedMetadata); + return updatedDescriptor is null || updatedDescriptor.Owner is null || !updatedDescriptor.Owner.Equals(SiloAddress) + ? null + : await OpenShardAsync(updatedDescriptor, cancellationToken); + } + + private async Task TryMarkShardPoisonedAsync(ShardCatalogProperties descriptor, int adoptedCount, CancellationToken cancellationToken) + { + await UpdateMetadataAsync( + descriptor, + new Dictionary(StringComparer.Ordinal) + { + [PoisonedProperty] = bool.TrueString, + [AdoptedCountProperty] = adoptedCount.ToString(CultureInfo.InvariantCulture), + [LastAdoptedTimeProperty] = _timeProvider.GetUtcNow().ToString("O", CultureInfo.InvariantCulture), + [MembershipVersionProperty] = GetMembershipVersionString() + }, + remove: null, + cancellationToken); + } + + private async ValueTask GetOrOpenShardAsync(ShardCatalogProperties descriptor, CancellationToken cancellationToken) + { + if (_jobShardCache.TryGetValue(descriptor.ShardId.Value, out var existing)) + { + return existing; + } + + var shard = await OpenShardAsync(descriptor, cancellationToken); + if (_jobShardCache.TryAdd(shard.Id, shard)) + { + return shard; + } + + await shard.DisposeAsync(); + return _jobShardCache[descriptor.ShardId.Value]; + } + + private async ValueTask OpenShardAsync(ShardCatalogProperties descriptor, CancellationToken cancellationToken) + { + var codec = CreateOperationCodec(); + var state = new JournaledJobShardState(descriptor.ShardId, descriptor.StartTime, descriptor.EndTime, codec, _timeProvider); + var manager = _stateManagerFactory.Create(descriptor.StorageId); + try + { + manager.RegisterState(JournaledJobShardState.StateName, state); + await manager.InitializeAsync(cancellationToken).ConfigureAwait(false); + } + catch + { + await manager.DisposeAsync().ConfigureAwait(false); + throw; + } + + return new JournaledJobShard( + descriptor.ShardId, + descriptor.StartTime, + descriptor.EndTime, + descriptor.Metadata, + descriptor.Closed, + state, + manager, + this); + } + + private IDurableValueCommandCodec CreateOperationCodec() + { + var journalFormatKey = _journaledStateManagerOptions.JournalFormatKey; + if (string.IsNullOrWhiteSpace(journalFormatKey)) + { + throw new InvalidOperationException("The configured journal format key must be non-empty."); + } + + var codec = _serviceProvider.GetKeyedService>(journalFormatKey); + return codec ?? throw new InvalidOperationException( + $"Journal format key '{journalFormatKey}' requires keyed service '{typeof(IDurableValueCommandCodec).FullName}', but none was registered."); + } + + private async ValueTask GetDescriptorAsync(string shardId, CancellationToken cancellationToken) + { + try + { + return await GetDescriptorAsync(JobShardId.Parse(shardId).ToJournalId(), cancellationToken); + } + catch (ArgumentException) + { + return null; + } + } + + private async ValueTask GetDescriptorAsync(JournalId storageId, CancellationToken cancellationToken) + { + var properties = await _storageProvider.CreateStorage(storageId).GetMetadataAsync(cancellationToken); + return properties is null ? null : ShardCatalogProperties.From(storageId, properties); + } + + private async ValueTask UpdateMetadataAsync( + ShardCatalogProperties descriptor, + IReadOnlyDictionary? set, + IEnumerable? remove, + CancellationToken cancellationToken) + { + var storage = _storageProvider.CreateStorage(descriptor.StorageId); + return await storage.UpdateMetadataAsync(set, remove, descriptor.Properties.ETag, cancellationToken); + } + + private Dictionary CreateInitialProperties(DateTimeOffset minDueTime, DateTimeOffset maxDueTime, IDictionary? metadata) + { + var result = new Dictionary(StringComparer.Ordinal) + { + [OwnerProperty] = SiloAddress.ToParsableString(), + [MembershipVersionProperty] = GetMembershipVersionString(), + [MinDueTimeProperty] = minDueTime.ToString("O", CultureInfo.InvariantCulture), + [MaxDueTimeProperty] = maxDueTime.ToString("O", CultureInfo.InvariantCulture), + [AdoptedCountProperty] = "0", + [ClosedProperty] = bool.FalseString + }; + + if (metadata is not null) + { + foreach (var (key, value) in metadata) + { + result[MetadataPropertyPrefix + EncodeMetadataKey(key)] = value; + } + } + + return result; + } + + private string GetMembershipVersionString() => _membershipService.CurrentSnapshot.Version.Value.ToString(CultureInfo.InvariantCulture); + + private static string EncodeMetadataKey(string key) + { + ArgumentNullException.ThrowIfNull(key); + return Convert.ToBase64String(Encoding.UTF8.GetBytes(key)).TrimEnd('=').Replace('+', '-').Replace('/', '_'); + } + + private static string DecodeMetadataKey(string encoded) + { + var base64 = encoded.Replace('-', '+').Replace('_', '/'); + base64 = base64.PadRight(base64.Length + (4 - base64.Length % 4) % 4, '='); + return Encoding.UTF8.GetString(Convert.FromBase64String(base64)); + } + + private sealed class ShardCatalogProperties + { + private ShardCatalogProperties( + JournalId storageId, + JobShardId shardId, + IJournalMetadata properties, + SiloAddress? owner, + MembershipVersion membershipVersion, + DateTimeOffset startTime, + DateTimeOffset endTime, + int adoptedCount, + bool poisoned, + bool closed, + IReadOnlyDictionary metadata) + { + StorageId = storageId; + ShardId = shardId; + Properties = properties; + Owner = owner; + MembershipVersion = membershipVersion; + StartTime = startTime; + EndTime = endTime; + AdoptedCount = adoptedCount; + Poisoned = poisoned; + Closed = closed; + Metadata = metadata; + } + + public JournalId StorageId { get; } + + public JobShardId ShardId { get; } + + public IJournalMetadata Properties { get; } + + public SiloAddress? Owner { get; } + + public MembershipVersion MembershipVersion { get; } + + public DateTimeOffset StartTime { get; } + + public DateTimeOffset EndTime { get; } + + public int AdoptedCount { get; } + + public bool Poisoned { get; } + + public bool Closed { get; } + + public IReadOnlyDictionary Metadata { get; } + + public static ShardCatalogProperties? From(JournalId storageId, IJournalMetadata properties) + { + try + { + var values = properties.Properties; + if (!values.TryGetValue(MinDueTimeProperty, out var minDueTimeValue) + || !DateTimeOffset.TryParse(minDueTimeValue, CultureInfo.InvariantCulture, DateTimeStyles.RoundtripKind, out var minDueTime) + || !values.TryGetValue(MaxDueTimeProperty, out var maxDueTimeValue) + || !DateTimeOffset.TryParse(maxDueTimeValue, CultureInfo.InvariantCulture, DateTimeStyles.RoundtripKind, out var maxDueTime)) + { + return null; + } + + var owner = values.TryGetValue(OwnerProperty, out var ownerValue) && !string.IsNullOrWhiteSpace(ownerValue) + ? SiloAddress.FromParsableString(ownerValue) + : null; + + var membershipVersion = values.TryGetValue(MembershipVersionProperty, out var membershipVersionValue) + && long.TryParse(membershipVersionValue, NumberStyles.Integer, CultureInfo.InvariantCulture, out var parsedMembershipVersion) + ? new MembershipVersion(parsedMembershipVersion) + : MembershipVersion.MinValue; + + var adoptedCount = values.TryGetValue(AdoptedCountProperty, out var adoptedCountValue) + && int.TryParse(adoptedCountValue, NumberStyles.Integer, CultureInfo.InvariantCulture, out var parsedAdoptedCount) + ? parsedAdoptedCount + : 0; + + var poisoned = values.TryGetValue(PoisonedProperty, out var poisonedValue) + && bool.TryParse(poisonedValue, out var parsedPoisoned) + && parsedPoisoned; + + var closed = values.TryGetValue(ClosedProperty, out var closedValue) + && bool.TryParse(closedValue, out var parsedClosed) + && parsedClosed; + + var metadata = new Dictionary(StringComparer.Ordinal); + foreach (var (key, value) in values) + { + if (key.StartsWith(MetadataPropertyPrefix, StringComparison.Ordinal)) + { + metadata[DecodeMetadataKey(key[MetadataPropertyPrefix.Length..])] = value; + } + } + + var shardId = JobShardId.FromJournalId(storageId); + return new(storageId, shardId, properties, owner, membershipVersion, minDueTime, maxDueTime, adoptedCount, poisoned, closed, metadata); + } + catch (Exception exception) when (exception is ArgumentException or FormatException) + { + return null; + } + } + } +} diff --git a/src/Orleans.DurableJobs/JournaledJobShardState.cs b/src/Orleans.DurableJobs/JournaledJobShardState.cs new file mode 100644 index 00000000000..5a19f9f3b3e --- /dev/null +++ b/src/Orleans.DurableJobs/JournaledJobShardState.cs @@ -0,0 +1,351 @@ +using System; +using System.Linq; +using Orleans.Journaling; + +namespace Orleans.DurableJobs; + +internal sealed class JournaledJobShardState : IJournaledState, IDurableValueCommandHandler +{ + public const string StateName = "jobs"; + + private readonly JobShardId _shardId; + private readonly IDurableValueCommandCodec? _codec; + private readonly TimeProvider _timeProvider; + private InMemoryJobQueue _jobQueue; + private JournalStreamWriter _writer; + + public JournaledJobShardState( + JobShardId shardId, + DateTimeOffset startTime, + DateTimeOffset endTime, + IDurableValueCommandCodec codec, + TimeProvider? timeProvider = null) + : this(shardId, startTime, endTime, codec, timeProvider, isAddingCompleted: false) + { + ArgumentNullException.ThrowIfNull(codec); + } + + internal JournaledJobShardState(JobShardId shardId, DateTimeOffset startTime, DateTimeOffset endTime, TimeProvider? timeProvider = null) + : this(shardId, startTime, endTime, codec: null, timeProvider: timeProvider, isAddingCompleted: false) + { + } + + private JournaledJobShardState( + JobShardId shardId, + DateTimeOffset startTime, + DateTimeOffset endTime, + IDurableValueCommandCodec? codec, + TimeProvider? timeProvider, + bool isAddingCompleted) + { + if (endTime < startTime) + { + throw new ArgumentOutOfRangeException(nameof(endTime), "Shard end time must be greater than or equal to the start time."); + } + + _shardId = shardId; + _codec = codec; + _timeProvider = timeProvider ?? TimeProvider.System; + _jobQueue = new(_timeProvider); + StartTime = startTime; + EndTime = endTime; + IsAddingCompleted = isAddingCompleted; + } + + public string Id => _shardId.Value; + + public DateTimeOffset StartTime { get; } + + public DateTimeOffset EndTime { get; } + + public bool IsAddingCompleted { get; private set; } + + public int Count => _jobQueue.Count; + + public IAsyncEnumerable ConsumeDurableJobsAsync() => _jobQueue; + + public DurableJob? TryScheduleJob(ScheduleJobRequest request) + { + if (IsAddingCompleted) + { + return null; + } + + if (request.DueTime < StartTime || request.DueTime > EndTime) + { + throw new ArgumentOutOfRangeException(nameof(request), "Scheduled time is out of shard bounds."); + } + + var job = new DurableJob + { + Id = Guid.NewGuid().ToString(), + TargetGrainId = request.Target, + Name = request.JobName, + DueTime = request.DueTime, + ShardId = Id, + Metadata = request.Metadata + }; + + Write(DurableJobShardJournalRecord.ForSchedule(job)); + ApplySchedule(job); + return job; + } + + public bool RemoveJob(string jobId) + { + ArgumentException.ThrowIfNullOrWhiteSpace(jobId); + + Write(DurableJobShardJournalRecord.ForRemove(jobId)); + return ApplyRemove(jobId); + } + + public bool RetryJobLater(IJobRunContext jobContext, DateTimeOffset newDueTime) + { + ArgumentNullException.ThrowIfNull(jobContext); + return RetryJobLater(jobContext.Job.Id, newDueTime, jobContext.DequeueCount); + } + + public bool RetryJobLater(string jobId, DateTimeOffset newDueTime, int dequeueCount) + { + ArgumentException.ThrowIfNullOrWhiteSpace(jobId); + ValidateDequeueCount(dequeueCount); + + Write(DurableJobShardJournalRecord.ForRetry(jobId, newDueTime, dequeueCount)); + return ApplyRetry(jobId, newDueTime, dequeueCount); + } + + public void MarkAsComplete() + { + IsAddingCompleted = true; + _jobQueue.MarkAsComplete(); + } + + internal DurableJobShardSnapshot CaptureSnapshot() + { + var jobs = _jobQueue.GetSnapshot() + .OrderBy(static item => item.Job.DueTime) + .ThenBy(static item => item.Job.Id, StringComparer.Ordinal) + .Select(static item => new DurableJobShardSnapshotEntry + { + Job = item.Job, + DequeueCount = item.DequeueCount + }) + .ToList(); + + return new() { Jobs = jobs }; + } + + internal void Apply(DurableJobShardJournalRecord record) + { + ArgumentNullException.ThrowIfNull(record); + + switch (record.Kind) + { + case DurableJobShardJournalRecordKind.Schedule: + ApplySchedule(GetRequired(record.Schedule, nameof(record.Schedule)).Job); + break; + case DurableJobShardJournalRecordKind.Remove: + ApplyRemove(GetRequired(record.Remove, nameof(record.Remove)).JobId); + break; + case DurableJobShardJournalRecordKind.Retry: + var retry = GetRequired(record.Retry, nameof(record.Retry)); + ApplyRetry(retry.JobId, retry.DueTime, retry.DequeueCount); + break; + case DurableJobShardJournalRecordKind.Snapshot: + ApplySnapshot(GetRequired(record.Snapshot, nameof(record.Snapshot))); + break; + default: + throw new NotSupportedException($"DurableJobs shard journal record kind '{record.Kind}' is not supported."); + } + } + + void IJournaledState.ReplayEntry(JournalEntry entry, JournalReplayContext context) => + context.GetRequiredCommandCodec(entry.FormatKey, GetCodec()).Apply(entry.Reader, this); + + void IDurableValueCommandHandler.ApplySet(DurableJobShardJournalRecord value) => Apply(value); + + void IJournaledState.Reset(JournalStreamWriter writer) + { + _jobQueue = new(_timeProvider); + IsAddingCompleted = false; + _writer = writer; + } + + void IJournaledState.AppendEntries(JournalStreamWriter writer) + { + } + + void IJournaledState.AppendSnapshot(JournalStreamWriter writer) + { + GetCodec().WriteSet(DurableJobShardJournalRecord.ForSnapshot(CaptureSnapshot()), writer); + } + + IJournaledState IJournaledState.DeepCopy() => throw new NotSupportedException(); + + private void Write(DurableJobShardJournalRecord record) => GetCodec().WriteSet(record, _writer); + + private void ApplySchedule(DurableJob job) => _jobQueue.Enqueue(job, dequeueCount: 0); + + private bool ApplyRemove(string jobId) => _jobQueue.CancelJob(jobId); + + private bool ApplyRetry(string jobId, DateTimeOffset dueTime, int dequeueCount) + { + ValidateDequeueCount(dequeueCount); + return _jobQueue.RetryJobLater(jobId, dueTime, dequeueCount); + } + + private void ApplySnapshot(DurableJobShardSnapshot snapshot) + { + ArgumentNullException.ThrowIfNull(snapshot); + + _jobQueue.Clear(); + foreach (var entry in snapshot.Jobs) + { + ArgumentNullException.ThrowIfNull(entry.Job); + ValidateDequeueCount(entry.DequeueCount); + _jobQueue.Enqueue(entry.Job, entry.DequeueCount); + } + } + + private IDurableValueCommandCodec GetCodec() + => _codec ?? throw new InvalidOperationException("A DurableJobs shard journal operation codec is required before journal entries can be appended."); + + private static T GetRequired(T? value, string propertyName) where T : class + => value ?? throw new InvalidOperationException($"DurableJobs shard journal record is missing required '{propertyName}' payload."); + + private static void ValidateDequeueCount(int dequeueCount) + { + if (dequeueCount < 0) + { + throw new InvalidOperationException("DurableJobs shard journal dequeue count must not be negative."); + } + } +} + +[GenerateSerializer] +[Alias("Orleans.DurableJobs.DurableJobShardJournalRecordKind")] +internal enum DurableJobShardJournalRecordKind : byte +{ + Schedule = 0, + Remove = 1, + Retry = 2, + Snapshot = 3 +} + +[GenerateSerializer] +[Alias("Orleans.DurableJobs.DurableJobShardJournalRecord")] +internal sealed class DurableJobShardJournalRecord +{ + [Id(0)] + public DurableJobShardJournalRecordKind Kind { get; init; } + + [Id(1)] + public DurableJobShardScheduleOperation? Schedule { get; init; } + + [Id(2)] + public DurableJobShardRemoveOperation? Remove { get; init; } + + [Id(3)] + public DurableJobShardRetryOperation? Retry { get; init; } + + [Id(4)] + public DurableJobShardSnapshot? Snapshot { get; init; } + + public static DurableJobShardJournalRecord ForSchedule(DurableJob job) + { + ArgumentNullException.ThrowIfNull(job); + + return new() + { + Kind = DurableJobShardJournalRecordKind.Schedule, + Schedule = new() { Job = job } + }; + } + + public static DurableJobShardJournalRecord ForRemove(string jobId) + { + ArgumentException.ThrowIfNullOrWhiteSpace(jobId); + + return new() + { + Kind = DurableJobShardJournalRecordKind.Remove, + Remove = new() { JobId = jobId } + }; + } + + public static DurableJobShardJournalRecord ForRetry(string jobId, DateTimeOffset dueTime, int dequeueCount) + { + ArgumentException.ThrowIfNullOrWhiteSpace(jobId); + + return new() + { + Kind = DurableJobShardJournalRecordKind.Retry, + Retry = new() + { + JobId = jobId, + DueTime = dueTime, + DequeueCount = dequeueCount + } + }; + } + + public static DurableJobShardJournalRecord ForSnapshot(DurableJobShardSnapshot snapshot) + { + ArgumentNullException.ThrowIfNull(snapshot); + + return new() + { + Kind = DurableJobShardJournalRecordKind.Snapshot, + Snapshot = snapshot + }; + } +} + +[GenerateSerializer] +[Alias("Orleans.DurableJobs.DurableJobShardScheduleOperation")] +internal sealed class DurableJobShardScheduleOperation +{ + [Id(0)] + public DurableJob Job { get; init; } = null!; +} + +[GenerateSerializer] +[Alias("Orleans.DurableJobs.DurableJobShardRemoveOperation")] +internal sealed class DurableJobShardRemoveOperation +{ + [Id(0)] + public string JobId { get; init; } = string.Empty; +} + +[GenerateSerializer] +[Alias("Orleans.DurableJobs.DurableJobShardRetryOperation")] +internal sealed class DurableJobShardRetryOperation +{ + [Id(0)] + public string JobId { get; init; } = string.Empty; + + [Id(1)] + public DateTimeOffset DueTime { get; init; } + + [Id(2)] + public int DequeueCount { get; init; } +} + +[GenerateSerializer] +[Alias("Orleans.DurableJobs.DurableJobShardSnapshot")] +internal sealed class DurableJobShardSnapshot +{ + [Id(0)] + public List Jobs { get; init; } = []; +} + +[GenerateSerializer] +[Alias("Orleans.DurableJobs.DurableJobShardSnapshotEntry")] +internal sealed class DurableJobShardSnapshotEntry +{ + [Id(0)] + public DurableJob Job { get; init; } = null!; + + [Id(1)] + public int DequeueCount { get; init; } +} diff --git a/src/Orleans.DurableJobs/LocalDurableJobManager.cs b/src/Orleans.DurableJobs/LocalDurableJobManager.cs index 51e50030857..c5801c99121 100644 --- a/src/Orleans.DurableJobs/LocalDurableJobManager.cs +++ b/src/Orleans.DurableJobs/LocalDurableJobManager.cs @@ -16,10 +16,13 @@ namespace Orleans.DurableJobs; /// -internal partial class LocalDurableJobManager : SystemTarget, ILocalDurableJobManager, ILifecycleParticipant +internal partial class LocalDurableJobManager : SystemTarget, ILocalDurableJobManager, ILocalDurableJobManagerSystemTarget, ILifecycleParticipant { + internal static readonly GrainType JobManagerGrainType = SystemTargetGrainId.CreateGrainType("job-manager"); + private readonly JobShardManager _shardManager; private readonly ShardExecutor _shardExecutor; + private readonly IInternalGrainFactory _grainFactory; private readonly IAsyncEnumerable _clusterMembershipUpdates; private readonly IOverloadDetector _overloadDetector; private readonly TimeProvider _timeProvider; @@ -45,16 +48,18 @@ internal partial class LocalDurableJobManager : SystemTarget, ILocalDurableJobMa public LocalDurableJobManager( JobShardManager shardManager, ShardExecutor shardExecutor, + IInternalGrainFactory grainFactory, IClusterMembershipService clusterMembership, IOverloadDetector overloadDetector, TimeProvider timeProvider, IOptions options, SystemTargetShared shared, ILogger logger) - : base(SystemTargetGrainId.CreateGrainType("job-manager"), shared) + : base(JobManagerGrainType, shared) { _shardManager = shardManager; _shardExecutor = shardExecutor; + _grainFactory = grainFactory; _clusterMembershipUpdates = clusterMembership.MembershipUpdates; _overloadDetector = overloadDetector; _timeProvider = timeProvider; @@ -177,16 +182,35 @@ public async Task TryCancelDurableJobAsync(DurableJob job, CancellationTok { LogCancellingJob(_logger, job.Id, job.Name, job.ShardId); - if (!_shardCache.TryGetValue(job.ShardId, out var shard)) + if (_shardCache.TryGetValue(job.ShardId, out var shard)) + { + if (!await _shardManager.IsShardOwnedByLocalSiloAsync(job.ShardId, cancellationToken)) + { + LogJobCancellationFailed(_logger, job.Id, job.Name, job.ShardId); + return false; + } + + using var linkedCts = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken, _cts.Token); + var wasRemoved = await shard.RemoveJobAsync(job.Id, linkedCts.Token); + LogJobCancelled(_logger, job.Id, job.Name, job.ShardId); + return wasRemoved; + } + + var owner = await _shardManager.GetShardOwnerAsync(job.ShardId, cancellationToken); + if (owner is null || owner.Equals(Silo)) { LogJobCancellationFailed(_logger, job.Id, job.Name, job.ShardId); return false; } - using var linkedCts = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken, _cts.Token); - var wasRemoved = await shard.RemoveJobAsync(job.Id, linkedCts.Token); - LogJobCancelled(_logger, job.Id, job.Name, job.ShardId); - return wasRemoved; + var remote = _grainFactory.GetSystemTarget(JobManagerGrainType, owner); + var routed = await remote.TryCancelDurableJobAsync(job, cancellationToken); + if (!routed) + { + LogJobCancellationFailed(_logger, job.Id, job.Name, job.ShardId); + } + + return routed; } private async Task ProcessMembershipUpdates() @@ -231,7 +255,7 @@ private async Task PeriodicShardCheck() { await Task.CompletedTask.ConfigureAwait(ConfigureAwaitOptions.ForceYielding | ConfigureAwaitOptions.ContinueOnCapturedContext); - using var timer = new PeriodicTimer(TimeSpan.FromMinutes(10)); + using var timer = new PeriodicTimer(TimeSpan.FromMinutes(10), _timeProvider); Task timerTask = Task.CompletedTask; while (!_cts.Token.IsCancellationRequested) @@ -247,53 +271,7 @@ private async Task PeriodicShardCheck() var signalTask = _shardCheckSignal.WaitAsync(_cts.Token); await Task.WhenAny(timerTask, signalTask); - LogCheckingPendingShards(_logger); - - // Clean up old writable shards that have passed their time window - var now = DateTimeOffset.UtcNow; - foreach (var key in _writeableShards.Keys.ToArray()) - { - var shardEndTime = key.Add(_options.ShardDuration); - if (shardEndTime < now) - { - _writeableShards.TryRemove(key, out _); - } - } - - // Compute the slow-start budget for this cycle - var budget = ComputeClaimBudget(); - - // Query ShardManager for assigned shards (source of truth) - var shards = await _shardManager.AssignJobShardsAsync(DateTime.UtcNow.AddHours(1), budget, _cts.Token); - - // Count newly claimed shards (those not already in our cache) - var newClaimsThisCycle = 0; - if (shards.Count > 0) - { - LogAssignedShards(_logger, shards.Count); - foreach (var shard in shards) - { - if (_shardCache.TryAdd(shard.Id, shard)) - { - newClaimsThisCycle++; - } - - if (!_runningShards.ContainsKey(shard.Id)) - { - TryActivateShard(shard); - } - } - } - else - { - LogNoShardsToAssign(_logger); - } - - if (newClaimsThisCycle > 0) - { - _totalClaimedShards += newClaimsThisCycle; - LogOrphanedShardsClaimed(_logger, newClaimsThisCycle, _totalClaimedShards); - } + await ProcessShardCheckCycleAsync(_cts.Token); } catch (OperationCanceledException) { @@ -307,6 +285,57 @@ private async Task PeriodicShardCheck() } } + internal async Task ProcessShardCheckCycleAsync(CancellationToken cancellationToken) + { + LogCheckingPendingShards(_logger); + + // Clean up old writable shards that have passed their time window. + var now = _timeProvider.GetUtcNow(); + foreach (var key in _writeableShards.Keys.ToArray()) + { + var shardEndTime = key.Add(_options.ShardDuration); + if (shardEndTime < now && _writeableShards.TryRemove(key, out var expiredShard)) + { + await expiredShard.MarkAsCompleteAsync(cancellationToken); + } + } + + // Compute the slow-start budget for this cycle + var budget = ComputeClaimBudget(); + + // Query ShardManager for assigned shards (source of truth) + var shards = await _shardManager.AssignJobShardsAsync(now.AddHours(1), budget, cancellationToken); + + // Count newly claimed shards (those not already in our cache) + var newClaimsThisCycle = 0; + if (shards.Count > 0) + { + LogAssignedShards(_logger, shards.Count); + foreach (var shard in shards) + { + if (_shardCache.TryAdd(shard.Id, shard)) + { + newClaimsThisCycle++; + } + + if (!_runningShards.ContainsKey(shard.Id)) + { + TryActivateShard(shard); + } + } + } + else + { + LogNoShardsToAssign(_logger); + } + + if (newClaimsThisCycle > 0) + { + _totalClaimedShards += newClaimsThisCycle; + LogOrphanedShardsClaimed(_logger, newClaimsThisCycle, _totalClaimedShards); + } + } + /// /// Computes the maximum number of orphaned shards this silo may claim in the current check cycle. /// Returns when unlimited (ramp-up complete or disabled). @@ -426,7 +455,26 @@ private async Task RunShardWithCleanupAsync(IJobShard shard) private bool ShouldStartShardNow(IJobShard shard) { var activationTime = shard.StartTime.Subtract(_options.ShardActivationBufferPeriod); - return DateTimeOffset.UtcNow >= activationTime; + return _timeProvider.GetUtcNow() >= activationTime; + } + + internal sealed class TestAccessor(LocalDurableJobManager manager) + { + public Task ProcessShardCheckCycleAsync(CancellationToken cancellationToken) => manager.ProcessShardCheckCycleAsync(cancellationToken); + + public void AddWritableShard(DateTimeOffset shardKey, IJobShard shard) + { + manager._writeableShards[shardKey] = shard; + manager._shardCache.TryAdd(shard.Id, shard); + } + + public bool HasWritableShard(DateTimeOffset shardKey) => manager._writeableShards.ContainsKey(shardKey); + + public void TryActivateShard(IJobShard shard) => manager.TryActivateShard(shard); + + public bool TryGetRunningShardTask(string shardId, out Task? task) => manager._runningShards.TryGetValue(shardId, out task); + + public bool HasCachedShard(string shardId) => manager._shardCache.ContainsKey(shardId); } private DateTimeOffset GetShardKey(DateTimeOffset scheduledTime) diff --git a/src/Orleans.DurableJobs/Orleans.DurableJobs.csproj b/src/Orleans.DurableJobs/Orleans.DurableJobs.csproj index ba79c4e09ad..e838ed729a7 100644 --- a/src/Orleans.DurableJobs/Orleans.DurableJobs.csproj +++ b/src/Orleans.DurableJobs/Orleans.DurableJobs.csproj @@ -11,11 +11,13 @@ $(VersionSuffix).alpha.1 alpha.1 enable + $(NoWarn);ORLEANSEXP005 + @@ -27,6 +29,7 @@ + diff --git a/src/Orleans.DurableJobs/README.md b/src/Orleans.DurableJobs/README.md index 5738e1cd057..949bd1f9393 100644 --- a/src/Orleans.DurableJobs/README.md +++ b/src/Orleans.DurableJobs/README.md @@ -58,13 +58,10 @@ builder.UseOrleans(siloBuilder => siloBuilder .UseLocalhostClustering() // Configure Azure Storage Durable Jobs - .UseAzureStorageDurableJobs(options => + .UseAzureBlobDurableJobs(options => { - options.Configure(o => - { - o.BlobServiceClient = new Azure.Storage.Blobs.BlobServiceClient("YOUR_CONNECTION_STRING"); - o.ContainerName = "durable-jobs"; - }); + options.BlobServiceClient = new Azure.Storage.Blobs.BlobServiceClient("YOUR_CONNECTION_STRING"); + options.ContainerName = "durable-jobs"; }); }); diff --git a/src/Orleans.DurableJobs/ShardExecutor.cs b/src/Orleans.DurableJobs/ShardExecutor.cs index 0c98e2c599b..6d4d5082e52 100644 --- a/src/Orleans.DurableJobs/ShardExecutor.cs +++ b/src/Orleans.DurableJobs/ShardExecutor.cs @@ -19,6 +19,7 @@ internal sealed partial class ShardExecutor private readonly IInternalGrainFactory _grainFactory; private readonly ILogger _logger; private readonly DurableJobsOptions _options; + private readonly TimeProvider _timeProvider; private readonly SemaphoreSlim _jobConcurrencyLimiter; private readonly IOverloadDetector _overloadDetector; private int _currentCapacity; @@ -35,12 +36,14 @@ public ShardExecutor( IInternalGrainFactory grainFactory, IOptions options, IOverloadDetector overloadDetector, - ILogger logger) + ILogger logger, + TimeProvider? timeProvider = null) { _grainFactory = grainFactory; _logger = logger; _options = options.Value; _overloadDetector = overloadDetector; + _timeProvider = timeProvider ?? TimeProvider.System; _currentCapacity = _options.ConcurrencySlowStartEnabled && _options.SlowStartInitialConcurrency < _options.MaxConcurrentJobsPerSilo ? _options.SlowStartInitialConcurrency @@ -68,12 +71,13 @@ public async Task RunShardAsync(IJobShard shard, CancellationToken cancellationT var tasks = new ConcurrentDictionary(); try { - if (shard.StartTime > DateTime.UtcNow) + var now = _timeProvider.GetUtcNow(); + if (shard.StartTime > now) { // Wait until the shard's start time - var delay = shard.StartTime - DateTimeOffset.UtcNow; + var delay = shard.StartTime - now; LogWaitingForShardStartTime(_logger, shard.Id, delay, shard.StartTime); - await Task.Delay(delay, cancellationToken); + await Task.Delay(delay, _timeProvider, cancellationToken); } LogBeginProcessingShard(_logger, shard.Id); @@ -87,7 +91,7 @@ public async Task RunShardAsync(IJobShard shard, CancellationToken cancellationT LogOverloadDetected(_logger, shard.Id); while (_overloadDetector.IsOverloaded) { - await Task.Delay(_options.OverloadBackoffDelay, cancellationToken); + await Task.Delay(_options.OverloadBackoffDelay, _timeProvider, cancellationToken); } LogOverloadCleared(_logger, shard.Id); } @@ -121,7 +125,7 @@ private async Task SlowStartRampUpAsync() { while (Volatile.Read(ref _currentCapacity) < targetCapacity) { - await Task.Delay(_options.SlowStartInterval); + await Task.Delay(_options.SlowStartInterval, _timeProvider, CancellationToken.None); while (true) { @@ -191,7 +195,7 @@ private async Task RunJobAsync( // Enter polling loop LogPollingJob(_logger, jobContext.Job.Id, jobContext.Job.Name, result.PollAfterDelay.Value); - await Task.Delay(result.PollAfterDelay.Value, cancellationToken); + await Task.Delay(result.PollAfterDelay.Value, _timeProvider, cancellationToken); result = await target.HandleDurableJobAsync(jobContext, cancellationToken); } diff --git a/src/Orleans.Journaling/HostingExtensions.cs b/src/Orleans.Journaling/HostingExtensions.cs index 1b663a4f80d..4a3832d95f7 100644 --- a/src/Orleans.Journaling/HostingExtensions.cs +++ b/src/Orleans.Journaling/HostingExtensions.cs @@ -9,9 +9,9 @@ public static class HostingExtensions public static ISiloBuilder AddJournalStorage(this ISiloBuilder builder) { builder.Services.AddOptions(); - builder.Services.TryAddScoped(); + builder.Services.TryAddSingleton(); builder.Services.TryAddScoped(); - builder.Services.TryAddScoped(); + builder.Services.TryAddSingleton(); // Register JSON as the default format family and keep Orleans binary available for existing data. builder.Services.AddJsonJournalFormat(new JsonJournalOptions().SerializerOptions, tryAdd: true); diff --git a/src/Orleans.Journaling/IJournalStorage.cs b/src/Orleans.Journaling/IJournalStorage.cs index 4cedd67fe35..e3808ce6874 100644 --- a/src/Orleans.Journaling/IJournalStorage.cs +++ b/src/Orleans.Journaling/IJournalStorage.cs @@ -14,14 +14,58 @@ public interface IJournalStorage /// Implementations must notify when the read is complete by passing a /// with set to . /// Each call must pass metadata describing the journal file being read. If storage has no metadata, - /// pass or . Metadata passed during one read must have the same - /// value for every call. + /// pass or . Metadata passed during one read must have the same + /// value for every call. /// /// The consumer of ordered raw journal data. Chunk boundaries are not journal-entry boundaries. /// The cancellation token. /// A representing the operation. ValueTask ReadAsync(IJournalStorageConsumer consumer, CancellationToken cancellationToken); + /// + /// Creates this journal storage instance if it does not already exist. + /// + /// + /// Initial metadata is only applied when the storage instance is created. If the journal was + /// already created by a write, this method returns and does not update metadata. + /// + /// Initial caller-owned metadata properties. + /// The cancellation token. + /// if storage was created; otherwise, . + ValueTask CreateIfNotExistsAsync( + IReadOnlyDictionary? metadata = null, + CancellationToken cancellationToken = default) + => throw new NotSupportedException($"{nameof(IJournalStorage)} implementation does not support journal storage metadata operations."); + + /// + /// Gets metadata for this journal storage instance. + /// + /// The cancellation token. + /// The metadata, or if the storage instance does not exist. + ValueTask GetMetadataAsync(CancellationToken cancellationToken = default) + => throw new NotSupportedException($"{nameof(IJournalStorage)} implementation does not support journal storage metadata operations."); + + /// + /// Conditionally updates caller-owned metadata properties. + /// + /// + /// Implementations apply updates atomically against the current metadata. When + /// is not , providers which support ETags + /// must only apply the update if the current metadata ETag matches it. Provider-owned metadata + /// must be preserved. + /// + /// Metadata properties to set. + /// Metadata properties to remove. + /// The expected metadata ETag, or for an unconditional update. + /// The cancellation token. + /// The current metadata if the update was applied or made no changes; otherwise, . + ValueTask UpdateMetadataAsync( + IReadOnlyDictionary? set = null, + IEnumerable? remove = null, + string? expectedETag = null, + CancellationToken cancellationToken = default) + => throw new NotSupportedException($"{nameof(IJournalStorage)} implementation does not support journal storage metadata operations."); + /// /// Replaces the journal with the provided value atomically. /// diff --git a/src/Orleans.Journaling/IJournalStorageCatalog.cs b/src/Orleans.Journaling/IJournalStorageCatalog.cs new file mode 100644 index 00000000000..5c875b0069e --- /dev/null +++ b/src/Orleans.Journaling/IJournalStorageCatalog.cs @@ -0,0 +1,19 @@ +namespace Orleans.Journaling; + +/// +/// Provides catalog operations for journal storage instances. +/// +/// +/// A catalog only discovers storage identities. Storage lifecycle, metadata, and data mutation +/// operations remain on . +/// +public interface IJournalStorageCatalog +{ + /// + /// Lists journal ids which match . + /// + /// The journal id prefix, or the default value to list all ids. + /// The cancellation token. + /// Matching ids in lexicographic order. + IAsyncEnumerable ListAsync(JournalId prefix = default, CancellationToken cancellationToken = default); +} diff --git a/src/Orleans.Journaling/IJournalStorageConsumer.cs b/src/Orleans.Journaling/IJournalStorageConsumer.cs index 3f4b9e22547..6ce8e500a12 100644 --- a/src/Orleans.Journaling/IJournalStorageConsumer.cs +++ b/src/Orleans.Journaling/IJournalStorageConsumer.cs @@ -9,40 +9,109 @@ public interface IJournalStorageConsumer /// Reads buffered raw journal data. /// /// The buffered journal data available to the consumer. - /// The metadata associated with the journal file being read, or if no metadata is available. - void Read(JournalBufferReader buffer, IJournalFileMetadata? metadata); + /// The metadata associated with the journal data being read, or if no metadata is available. + void Read(JournalBufferReader buffer, IJournalMetadata? metadata); } /// -/// Metadata associated with a journal file being read from storage. +/// Metadata associated with journal storage. /// -public interface IJournalFileMetadata +public interface IJournalMetadata { /// /// Gets the journal format key stored with the journal data, or if no key is present. /// string? Format { get; } + + /// + /// Gets the storage metadata ETag, or if none is available. + /// + string? ETag { get; } + + /// + /// Gets caller-owned storage metadata properties. + /// + IReadOnlyDictionary Properties { get; } } /// -/// Default implementation of . +/// Default implementation of . /// -public sealed class JournalFileMetadata : IJournalFileMetadata +public sealed class JournalMetadata : IJournalMetadata { /// - /// Gets an empty metadata instance for journal data without storage metadata. + /// Gets an empty metadata instance. /// - public static IJournalFileMetadata Empty { get; } = new JournalFileMetadata(format: null); + public static IJournalMetadata Empty { get; } = new JournalMetadata(format: null, eTag: null, properties: null); /// - /// Initializes a new instance of the class. + /// Initializes a new instance of the class. /// /// The journal format key stored with the journal data, or if no key is present. - public JournalFileMetadata(string? format) + /// The storage metadata ETag, or if none is available. + /// Caller-owned storage metadata properties. + public JournalMetadata(string? format, string? eTag = null, IReadOnlyDictionary? properties = null) { Format = format; + ETag = eTag; + Properties = CopyProperties(properties); } /// public string? Format { get; } + + /// + public string? ETag { get; } + + /// + public IReadOnlyDictionary Properties { get; } + + internal static Dictionary CopyProperties(IReadOnlyDictionary? properties) + { + var result = new Dictionary(StringComparer.Ordinal); + if (properties is null) + { + return result; + } + + foreach (var (key, value) in properties) + { + ValidateCallerProperty(key, value); + result.Add(key, value); + } + + return result; + } + + internal static void ValidatePropertyName(string propertyName) + { + ArgumentException.ThrowIfNullOrWhiteSpace(propertyName); + if (propertyName.IndexOf('\0') >= 0) + { + throw new ArgumentException("Journal metadata property names must not contain null characters.", nameof(propertyName)); + } + } + + internal static void ValidateCallerPropertyName(string propertyName) + { + ValidatePropertyName(propertyName); + if (IsProviderOwned(propertyName)) + { + throw new ArgumentException( + $"Journal metadata property '{propertyName}' is provider-owned. Caller updates must not set or remove provider-owned properties.", + nameof(propertyName)); + } + } + + internal static void ValidateCallerProperty(string propertyName, string value) + { + ValidateCallerPropertyName(propertyName); + ArgumentNullException.ThrowIfNull(value); + } + + internal static bool IsProviderOwned(string propertyName) + { + ArgumentException.ThrowIfNullOrWhiteSpace(propertyName); + return propertyName.StartsWith("$", StringComparison.Ordinal); + } } diff --git a/src/Orleans.Journaling/JournalId.cs b/src/Orleans.Journaling/JournalId.cs index 8af8ffe0f2b..7d06cd00e02 100644 --- a/src/Orleans.Journaling/JournalId.cs +++ b/src/Orleans.Journaling/JournalId.cs @@ -5,6 +5,8 @@ namespace Orleans.Journaling; /// public readonly struct JournalId : IEquatable { + private const char Separator = '/'; + /// /// Initializes a new instance of the struct. /// @@ -40,6 +42,78 @@ public static JournalId FromGrainId(GrainId grainId) return new(grainId.ToString()); } + /// + /// Creates a journal id from decoded hierarchical segments. + /// + /// The first id segment. + /// Additional id segments. + /// The normalized journal id. + public static JournalId Create(string firstSegment, params ReadOnlySpan additionalSegments) + { + var encodedSegments = new string[additionalSegments.Length + 1]; + encodedSegments[0] = EncodeSegment(firstSegment, nameof(firstSegment)); + for (var i = 0; i < additionalSegments.Length; i++) + { + encodedSegments[i + 1] = EncodeSegment(additionalSegments[i], nameof(additionalSegments)); + } + + return new(string.Join(Separator, encodedSegments)); + } + + /// + /// Creates a journal id from decoded hierarchical segments. + /// + /// The id segments. + /// The normalized journal id. + public static JournalId Create(IEnumerable segments) + { + ArgumentNullException.ThrowIfNull(segments); + + return Create(segments.ToArray().AsSpan()); + } + + /// + /// Creates a journal id from decoded hierarchical segments. + /// + /// The id segments. + /// The normalized journal id. + public static JournalId Create(ReadOnlySpan segments) + { + if (segments.Length == 0) + { + throw new ArgumentException("A journal id must contain at least one segment.", nameof(segments)); + } + + var encodedSegments = new string[segments.Length]; + for (var i = 0; i < segments.Length; i++) + { + encodedSegments[i] = EncodeSegment(segments[i], nameof(segments)); + } + + return new(string.Join(Separator, encodedSegments)); + } + + /// + /// Determines whether this id is a prefix of . + /// + /// The journal id to test. + /// if this id is the default value, equals , or identifies an ancestor segment. + public bool IsPrefixOf(JournalId journalId) + { + if (IsDefault) + { + return true; + } + + if (journalId.IsDefault) + { + return false; + } + + return string.Equals(journalId.Value, Value, StringComparison.Ordinal) + || journalId.Value.StartsWith(Value + Separator, StringComparison.Ordinal); + } + /// public override string ToString() => Value ?? string.Empty; @@ -67,4 +141,20 @@ public static JournalId FromGrainId(GrainId grainId) /// The second journal id. /// if the journal ids are not equal; otherwise, . public static bool operator !=(JournalId left, JournalId right) => !left.Equals(right); + + private static string EncodeSegment(string segment, string parameterName) + { + ArgumentException.ThrowIfNullOrWhiteSpace(segment, parameterName); + if (segment is "." or "..") + { + throw new ArgumentException("Journal id segments must not be '.' or '..'.", parameterName); + } + + if (segment.IndexOf('\0') >= 0) + { + throw new ArgumentException("Journal id segments must not contain null characters.", parameterName); + } + + return Uri.EscapeDataString(segment); + } } diff --git a/src/Orleans.Journaling/JournalStorageConsumerExtensions.cs b/src/Orleans.Journaling/JournalStorageConsumerExtensions.cs index 2579931d612..cfce4a2190f 100644 --- a/src/Orleans.Journaling/JournalStorageConsumerExtensions.cs +++ b/src/Orleans.Journaling/JournalStorageConsumerExtensions.cs @@ -13,10 +13,10 @@ public static class JournalStorageConsumerExtensions /// /// The journal storage consumer. /// The metadata associated with the journal data being read, or if no metadata is available. - public static void Complete(this IJournalStorageConsumer consumer, IJournalFileMetadata? metadata) + public static void Complete(this IJournalStorageConsumer consumer, IJournalMetadata? metadata) { ArgumentNullException.ThrowIfNull(consumer); - metadata ??= JournalFileMetadata.Empty; + metadata ??= JournalMetadata.Empty; using var buffer = new ArcBufferWriter(); ReadBuffer(consumer, buffer, metadata, isCompleted: true); @@ -29,10 +29,10 @@ public static void Complete(this IJournalStorageConsumer consumer, IJournalFileM /// The bytes to read. /// The metadata associated with the journal data being read, or if no metadata is available. /// Whether to notify the consumer that no more data will be supplied. If , the consumer must read all supplied bytes. - public static void Read(this IJournalStorageConsumer consumer, ReadOnlyMemory input, IJournalFileMetadata? metadata, bool complete) + public static void Read(this IJournalStorageConsumer consumer, ReadOnlyMemory input, IJournalMetadata? metadata, bool complete) { ArgumentNullException.ThrowIfNull(consumer); - metadata ??= JournalFileMetadata.Empty; + metadata ??= JournalMetadata.Empty; using var buffer = new ArcBufferWriter(); if (!input.IsEmpty) @@ -51,10 +51,10 @@ public static void Read(this IJournalStorageConsumer consumer, ReadOnlyMemoryThe bytes to read. /// The metadata associated with the journal data being read, or if no metadata is available. /// Whether to notify the consumer that no more data will be supplied. If , the consumer must read all supplied bytes. - public static void Read(this IJournalStorageConsumer consumer, ReadOnlySequence input, IJournalFileMetadata? metadata, bool complete) + public static void Read(this IJournalStorageConsumer consumer, ReadOnlySequence input, IJournalMetadata? metadata, bool complete) { ArgumentNullException.ThrowIfNull(consumer); - metadata ??= JournalFileMetadata.Empty; + metadata ??= JournalMetadata.Empty; using var buffer = new ArcBufferWriter(); foreach (var segment in input) @@ -78,11 +78,11 @@ public static void Read(this IJournalStorageConsumer consumer, ReadOnlySequence< /// The ordered bytes to read. /// The metadata associated with the journal data being read, or if no metadata is available. /// Whether to notify the consumer that no more data will be supplied. If , the consumer must read all supplied bytes. - public static void Read(this IJournalStorageConsumer consumer, IEnumerable> segments, IJournalFileMetadata? metadata, bool complete) + public static void Read(this IJournalStorageConsumer consumer, IEnumerable> segments, IJournalMetadata? metadata, bool complete) { ArgumentNullException.ThrowIfNull(consumer); ArgumentNullException.ThrowIfNull(segments); - metadata ??= JournalFileMetadata.Empty; + metadata ??= JournalMetadata.Empty; using var buffer = new ArcBufferWriter(); foreach (var segment in segments) @@ -99,7 +99,7 @@ public static void Read(this IJournalStorageConsumer consumer, IEnumerableThe metadata associated with the journal data being read, or if no metadata is available. /// The cancellation token. /// The number of bytes read from . - public static async ValueTask ReadAsync(this IJournalStorageConsumer consumer, Stream input, IJournalFileMetadata? metadata, CancellationToken cancellationToken) + public static async ValueTask ReadAsync(this IJournalStorageConsumer consumer, Stream input, IJournalMetadata? metadata, CancellationToken cancellationToken) => await consumer.ReadAsync(input, metadata, complete: true, cancellationToken).ConfigureAwait(false); /// @@ -132,11 +132,11 @@ public static async ValueTask ReadAsync(this IJournalStorageConsumer consu /// Whether to notify the consumer that no more data will be supplied. If , the consumer must read all supplied bytes. /// The cancellation token. /// The number of bytes read from . - public static async ValueTask ReadAsync(this IJournalStorageConsumer consumer, Stream input, IJournalFileMetadata? metadata, bool complete, CancellationToken cancellationToken) + public static async ValueTask ReadAsync(this IJournalStorageConsumer consumer, Stream input, IJournalMetadata? metadata, bool complete, CancellationToken cancellationToken) { ArgumentNullException.ThrowIfNull(consumer); ArgumentNullException.ThrowIfNull(input); - metadata ??= JournalFileMetadata.Empty; + metadata ??= JournalMetadata.Empty; using var buffer = new ArcBufferWriter(); long totalBytesRead = 0; @@ -157,7 +157,7 @@ public static async ValueTask ReadAsync(this IJournalStorageConsumer consu } } - private static void ReadBuffer(IJournalStorageConsumer consumer, ArcBufferWriter buffer, IJournalFileMetadata metadata, bool isCompleted) + private static void ReadBuffer(IJournalStorageConsumer consumer, ArcBufferWriter buffer, IJournalMetadata metadata, bool isCompleted) { var readBuffer = new JournalBufferReader(buffer.Reader, isCompleted); consumer.Read(readBuffer, metadata); diff --git a/src/Orleans.Journaling/JournaledStateManager.cs b/src/Orleans.Journaling/JournaledStateManager.cs index 348c44cf45d..f845e0de18b 100644 --- a/src/Orleans.Journaling/JournaledStateManager.cs +++ b/src/Orleans.Journaling/JournaledStateManager.cs @@ -665,7 +665,7 @@ private IJournalFormat GetJournalFormat(string journalFormatKey) return JournalFormatServices.GetRequiredJournalFormat(_shared.ServiceProvider, journalFormatKey); } - private void ProcessRecoveryBuffer(JournalBufferReader buffer, IJournalFileMetadata? metadata) + private void ProcessRecoveryBuffer(JournalBufferReader buffer, IJournalMetadata? metadata) { if (buffer.Length == 0) { @@ -697,7 +697,7 @@ private void ProcessRecoveryBuffer(JournalBufferReader buffer, IJournalFileMetad } } - void IJournalStorageConsumer.Read(JournalBufferReader buffer, IJournalFileMetadata? metadata) => ProcessRecoveryBuffer(buffer, metadata); + void IJournalStorageConsumer.Read(JournalBufferReader buffer, IJournalMetadata? metadata) => ProcessRecoveryBuffer(buffer, metadata); private static bool ShouldWrapRecoveryFormatException(Exception exception) => exception is not OperationCanceledException && !IsRecoveryFormatException(exception); diff --git a/src/Orleans.Journaling/VolatileJournalStorage.cs b/src/Orleans.Journaling/VolatileJournalStorage.cs index 3465615f747..1b9899aa0d7 100644 --- a/src/Orleans.Journaling/VolatileJournalStorage.cs +++ b/src/Orleans.Journaling/VolatileJournalStorage.cs @@ -1,14 +1,16 @@ using System.Buffers; using System.Collections.Concurrent; +using System.Globalization; +using System.Runtime.CompilerServices; using Microsoft.Extensions.Options; using Orleans.Journaling.Json; namespace Orleans.Journaling; -public sealed class VolatileJournalStorageProvider : IJournalStorageProvider +public sealed class VolatileJournalStorageProvider : IJournalStorageProvider, IJournalStorageCatalog { private readonly IOptions? _options; - private readonly ConcurrentDictionary _storage = new(); + private readonly ConcurrentDictionary _storage = new(StringComparer.Ordinal); public VolatileJournalStorageProvider() { @@ -32,24 +34,72 @@ public IJournalStorage CreateStorage(JournalId journalId) } var journalFormatKey = GetJournalFormatKey(); - var storage = _storage.GetOrAdd(journalId, _ => new VolatileJournalStorage(journalFormatKey)); - storage.SetConfiguredJournalFormatKey(journalFormatKey); - return storage; + var store = _storage.GetOrAdd(journalId.Value, static key => new VolatileJournalStorage.Store(key)); + return new VolatileJournalStorage(store, journalFormatKey); + } + + public async IAsyncEnumerable ListAsync( + JournalId prefix = default, + [EnumeratorCancellation] CancellationToken cancellationToken = default) + { + List journalIds = []; + foreach (var (key, store) in _storage) + { + cancellationToken.ThrowIfCancellationRequested(); + if (!TryParseJournalId(key, out var journalId) || !prefix.IsPrefixOf(journalId)) + { + continue; + } + + lock (store.SyncRoot) + { + if (!store.Exists) + { + continue; + } + } + + journalIds.Add(journalId); + } + + journalIds.Sort(static (left, right) => StringComparer.Ordinal.Compare(left.Value, right.Value)); + + foreach (var journalId in journalIds) + { + cancellationToken.ThrowIfCancellationRequested(); + yield return journalId; + } + + await Task.CompletedTask.ConfigureAwait(false); } private string GetJournalFormatKey() => JournalFormatServices.ValidateJournalFormatKey(_options?.Value.JournalFormatKey ?? JsonJournalExtensions.JournalFormatKey); + + private static bool TryParseJournalId(string value, out JournalId journalId) + { + try + { + journalId = new JournalId(value); + return true; + } + catch (ArgumentException) + { + journalId = default; + return false; + } + } } + /// /// An in-memory, volatile implementation of for non-durable use cases, such as development and testing. /// public sealed class VolatileJournalStorage : IJournalStorage { - private readonly List _segments = []; + private readonly Store _store; private string? _configuredJournalFormatKey; - private string? _storedJournalFormatKey; - public VolatileJournalStorage() + public VolatileJournalStorage() : this(new Store(CreateVolatileStorageId()), journalFormatKey: null) { } @@ -57,32 +107,116 @@ public VolatileJournalStorage() /// Initializes a new instance of the class. /// /// The journal format key to stamp on writes. - public VolatileJournalStorage(string? journalFormatKey) + public VolatileJournalStorage(string? journalFormatKey) : this(new Store(CreateVolatileStorageId()), journalFormatKey) + { + } + + internal VolatileJournalStorage(Store store, string? journalFormatKey) { + ArgumentNullException.ThrowIfNull(store); + _store = store; SetConfiguredJournalFormatKey(journalFormatKey); } - public bool IsCompactionRequested => _segments.Count > 10; + public bool IsCompactionRequested + { + get + { + lock (_store.SyncRoot) + { + return _store.Segments.Count > 10; + } + } + } - internal IReadOnlyList Segments => _segments; + internal IReadOnlyList Segments => _store.Segments; internal string? StoredJournalFormatKey - => _storedJournalFormatKey; + { + get + { + lock (_store.SyncRoot) + { + return _store.StoredJournalFormatKey; + } + } + + set + { + lock (_store.SyncRoot) + { + _store.StoredJournalFormatKey = value; + } + } + } internal void SetConfiguredJournalFormatKey(string? journalFormatKey) { _configuredJournalFormatKey = journalFormatKey; } + public ValueTask CreateIfNotExistsAsync( + IReadOnlyDictionary? metadata = null, + CancellationToken cancellationToken = default) + { + cancellationToken.ThrowIfCancellationRequested(); + var values = JournalMetadata.CopyProperties(metadata); + lock (_store.SyncRoot) + { + if (_store.Exists) + { + return new(false); + } + + _store.Create(values); + return new(true); + } + } + + public ValueTask GetMetadataAsync(CancellationToken cancellationToken = default) + { + cancellationToken.ThrowIfCancellationRequested(); + lock (_store.SyncRoot) + { + return new(_store.Exists ? _store.GetMetadata() : null); + } + } + + public ValueTask UpdateMetadataAsync( + IReadOnlyDictionary? set = null, + IEnumerable? remove = null, + string? expectedETag = null, + CancellationToken cancellationToken = default) + { + cancellationToken.ThrowIfCancellationRequested(); + var setValues = JournalMetadata.CopyProperties(set); + var removeValues = CopyRemove(remove, setValues); + lock (_store.SyncRoot) + { + if (!_store.Exists || expectedETag is not null && !string.Equals(expectedETag, _store.ETag, StringComparison.Ordinal)) + { + return new((IJournalMetadata?)null); + } + + _store.ApplyMetadataUpdate(setValues, removeValues); + return new(_store.GetMetadata()); + } + } + /// public ValueTask ReadAsync(IJournalStorageConsumer consumer, CancellationToken cancellationToken) { ArgumentNullException.ThrowIfNull(consumer); - var metadata = _storedJournalFormatKey is null - ? JournalFileMetadata.Empty - : new JournalFileMetadata(_storedJournalFormatKey); - consumer.Read(GetSegments(_segments, cancellationToken), metadata, complete: true); + byte[][] segments; + IJournalMetadata metadata; + lock (_store.SyncRoot) + { + metadata = _store.Exists ? _store.GetMetadata() : JournalMetadata.Empty; + segments = _store.Segments.ToArray(); + } + + consumer.Read(GetSegments(segments, cancellationToken), metadata, complete: true); return default; } @@ -99,8 +233,14 @@ private static IEnumerable> GetSegments(IEnumerable public ValueTask AppendAsync(ReadOnlySequence segment, CancellationToken cancellationToken) { cancellationToken.ThrowIfCancellationRequested(); - _storedJournalFormatKey = _configuredJournalFormatKey; - _segments.Add(segment.ToArray()); + lock (_store.SyncRoot) + { + _store.Exists = true; + _store.StoredJournalFormatKey = _configuredJournalFormatKey; + _store.Segments.Add(segment.ToArray()); + _store.RefreshETag(); + } + return default; } @@ -108,17 +248,131 @@ public ValueTask AppendAsync(ReadOnlySequence segment, CancellationToken c public ValueTask ReplaceAsync(ReadOnlySequence snapshot, CancellationToken cancellationToken) { cancellationToken.ThrowIfCancellationRequested(); - _storedJournalFormatKey = _configuredJournalFormatKey; - _segments.Clear(); - _segments.Add(snapshot.ToArray()); + lock (_store.SyncRoot) + { + _store.Exists = true; + _store.StoredJournalFormatKey = _configuredJournalFormatKey; + _store.Segments.Clear(); + _store.Segments.Add(snapshot.ToArray()); + _store.RefreshETag(); + } + return default; } public ValueTask DeleteAsync(CancellationToken cancellationToken) { cancellationToken.ThrowIfCancellationRequested(); - _segments.Clear(); - _storedJournalFormatKey = null; + lock (_store.SyncRoot) + { + _store.Delete(); + } + return default; } + + private static string CreateVolatileStorageId() => $"volatile/{Guid.NewGuid():N}"; + + internal sealed class Store(string storageId) + { + public object SyncRoot { get; } = new(); + + public List Segments { get; } = []; + + public Dictionary Properties { get; } = new(StringComparer.Ordinal); + + public string? StoredJournalFormatKey { get; set; } + + public bool Exists { get; set; } + + public long Version { get; private set; } + + public string? ETag { get; private set; } + + public void Create(IReadOnlyDictionary? properties) + { + Exists = true; + Segments.Clear(); + Properties.Clear(); + StoredJournalFormatKey = null; + if (properties is not null) + { + foreach (var (key, value) in properties) + { + Properties.Add(key, value); + } + } + + RefreshETag(); + } + + public void Delete() + { + Exists = false; + Segments.Clear(); + Properties.Clear(); + StoredJournalFormatKey = null; + ETag = null; + Version++; + } + + public IJournalMetadata GetMetadata() => new JournalMetadata(StoredJournalFormatKey, ETag, Properties); + + public bool ApplyMetadataUpdate(IReadOnlyDictionary set, IReadOnlySet remove) + { + var changed = false; + foreach (var propertyName in remove) + { + changed |= Properties.Remove(propertyName); + } + + foreach (var (propertyName, value) in set) + { + if (!Properties.TryGetValue(propertyName, out var currentValue) + || !string.Equals(currentValue, value, StringComparison.Ordinal)) + { + Properties[propertyName] = value; + changed = true; + } + } + + if (changed) + { + RefreshETag(); + } + + return changed; + } + + public string RefreshETag() + { + Exists = true; + ETag = (++Version).ToString("D", CultureInfo.InvariantCulture); + return ETag; + } + + public override string ToString() => storageId; + } + + private static IReadOnlySet CopyRemove(IEnumerable? remove, IReadOnlyDictionary set) + { + if (remove is null) + { + return new HashSet(StringComparer.Ordinal); + } + + var result = new HashSet(StringComparer.Ordinal); + foreach (var key in remove) + { + JournalMetadata.ValidateCallerPropertyName(key); + if (set.ContainsKey(key)) + { + throw new ArgumentException($"Journal metadata property '{key}' cannot be both set and removed.", nameof(remove)); + } + + result.Add(key); + } + + return result; + } } diff --git a/src/api/Azure/Orleans.DurableJobs.AzureStorage/Orleans.DurableJobs.AzureStorage.cs b/src/api/Azure/Orleans.DurableJobs.AzureStorage/Orleans.DurableJobs.AzureStorage.cs index 70e349bab36..a7f71e0c87a 100644 --- a/src/api/Azure/Orleans.DurableJobs.AzureStorage/Orleans.DurableJobs.AzureStorage.cs +++ b/src/api/Azure/Orleans.DurableJobs.AzureStorage/Orleans.DurableJobs.AzureStorage.cs @@ -6,61 +6,12 @@ // the code is regenerated. // //------------------------------------------------------------------------------ -namespace Orleans.DurableJobs.AzureStorage -{ - public sealed partial class AzureStorageJobShardManager : JobShardManager - { - public AzureStorageJobShardManager(Runtime.ILocalSiloDetails localSiloDetails, Microsoft.Extensions.Options.IOptions options, Microsoft.Extensions.Options.IOptions durableJobsOptions, Runtime.IClusterMembershipService clusterMembership, Microsoft.Extensions.Logging.ILoggerFactory loggerFactory) : base(default!) { } - - public AzureStorageJobShardManager(Runtime.SiloAddress siloAddress, Azure.Storage.Blobs.BlobServiceClient client, string containerName, string blobPrefix, Hosting.AzureStorageJobShardOptions options, Microsoft.Extensions.Options.IOptions durableJobsOptions, Runtime.IClusterMembershipService clusterMembership, Microsoft.Extensions.Logging.ILoggerFactory loggerFactory) : base(default!) { } - - public override System.Threading.Tasks.Task> AssignJobShardsAsync(System.DateTimeOffset maxShardStartTime, int maxNewClaims, System.Threading.CancellationToken cancellationToken) { throw null; } - - public override System.Threading.Tasks.Task CreateShardAsync(System.DateTimeOffset minDueTime, System.DateTimeOffset maxDueTime, System.Collections.Generic.IDictionary metadata, System.Threading.CancellationToken cancellationToken) { throw null; } - - public override System.Threading.Tasks.Task UnregisterShardAsync(IJobShard shard, System.Threading.CancellationToken cancellationToken) { throw null; } - } - - public static partial class NetstringJsonSerializer - { - public static System.Collections.Generic.IAsyncEnumerable DecodeAsync(System.IO.Stream stream, System.Text.Json.Serialization.Metadata.JsonTypeInfo jsonTypeInfo, System.Threading.CancellationToken cancellationToken) { throw null; } - - public static void Encode(T value, System.IO.Stream stream, System.Text.Json.Serialization.Metadata.JsonTypeInfo jsonTypeInfo) { } - } -} - namespace Orleans.Hosting { public static partial class AzureStorageDurableJobsExtensions { - public static Microsoft.Extensions.DependencyInjection.IServiceCollection UseAzureBlobDurableJobs(this Microsoft.Extensions.DependencyInjection.IServiceCollection services, System.Action> configureOptions) { throw null; } - - public static Microsoft.Extensions.DependencyInjection.IServiceCollection UseAzureBlobDurableJobs(this Microsoft.Extensions.DependencyInjection.IServiceCollection services, System.Action configure) { throw null; } - - public static ISiloBuilder UseAzureBlobDurableJobs(this ISiloBuilder builder, System.Action> configureOptions) { throw null; } - - public static ISiloBuilder UseAzureBlobDurableJobs(this ISiloBuilder builder, System.Action configure) { throw null; } - } - - public partial class AzureStorageJobShardOptions - { - public System.TimeSpan BatchFlushInterval { get { throw null; } set { } } - - public Azure.Storage.Blobs.BlobServiceClient BlobServiceClient { get { throw null; } set { } } - - public string ContainerName { get { throw null; } set { } } - - public int MaxBatchSize { get { throw null; } set { } } - - public int MaxBlobCreationRetries { get { throw null; } } - - public int MinBatchSize { get { throw null; } set { } } - } - - public partial class AzureStorageJobShardOptionsValidator : IConfigurationValidator - { - public AzureStorageJobShardOptionsValidator(AzureStorageJobShardOptions options, string name) { } + public static Microsoft.Extensions.DependencyInjection.IServiceCollection UseAzureBlobDurableJobs(this Microsoft.Extensions.DependencyInjection.IServiceCollection services, System.Action configure) { throw null; } - public void ValidateConfiguration() { } + public static ISiloBuilder UseAzureBlobDurableJobs(this ISiloBuilder builder, System.Action configure) { throw null; } } } \ No newline at end of file diff --git a/src/api/Orleans.Journaling/Orleans.Journaling.cs b/src/api/Orleans.Journaling/Orleans.Journaling.cs index e9fdd72be71..b491fa2e1fa 100644 --- a/src/api/Orleans.Journaling/Orleans.Journaling.cs +++ b/src/api/Orleans.Journaling/Orleans.Journaling.cs @@ -234,11 +234,6 @@ public partial interface IJournaledStateManagerFactory IJournaledStateManager Create(JournalId journalId); } - public partial interface IJournalFileMetadata - { - string? Format { get; } - } - public partial interface IJournalFormat { string FormatKey { get; } @@ -249,19 +244,36 @@ public partial interface IJournalFormat void Replay(JournalBufferReader input, JournalReplayContext context); } + public partial interface IJournalMetadata + { + string? ETag { get; } + + string? Format { get; } + + System.Collections.Generic.IReadOnlyDictionary Properties { get; } + } + public partial interface IJournalStorage { bool IsCompactionRequested { get; } System.Threading.Tasks.ValueTask AppendAsync(System.Buffers.ReadOnlySequence value, System.Threading.CancellationToken cancellationToken); + System.Threading.Tasks.ValueTask CreateIfNotExistsAsync(System.Collections.Generic.IReadOnlyDictionary? metadata = null, System.Threading.CancellationToken cancellationToken = default); System.Threading.Tasks.ValueTask DeleteAsync(System.Threading.CancellationToken cancellationToken); + System.Threading.Tasks.ValueTask GetMetadataAsync(System.Threading.CancellationToken cancellationToken = default); System.Threading.Tasks.ValueTask ReadAsync(IJournalStorageConsumer consumer, System.Threading.CancellationToken cancellationToken); System.Threading.Tasks.ValueTask ReplaceAsync(System.Buffers.ReadOnlySequence value, System.Threading.CancellationToken cancellationToken); + System.Threading.Tasks.ValueTask UpdateMetadataAsync(System.Collections.Generic.IReadOnlyDictionary? set = null, System.Collections.Generic.IEnumerable? remove = null, string? expectedETag = null, System.Threading.CancellationToken cancellationToken = default); + } + + public partial interface IJournalStorageCatalog + { + System.Collections.Generic.IAsyncEnumerable ListAsync(JournalId prefix = default, System.Threading.CancellationToken cancellationToken = default); } public partial interface IJournalStorageConsumer { - void Read(JournalBufferReader buffer, IJournalFileMetadata? metadata); + void Read(JournalBufferReader buffer, IJournalMetadata? metadata); } public partial interface IJournalStorageProvider @@ -379,15 +391,6 @@ public void Commit() { } public void Dispose() { } } - public sealed partial class JournalFileMetadata : IJournalFileMetadata - { - public JournalFileMetadata(string? format) { } - - public static IJournalFileMetadata Empty { get { throw null; } } - - public string? Format { get { throw null; } } - } - public readonly partial struct JournalId : System.IEquatable { private readonly object _dummy; @@ -398,6 +401,12 @@ public JournalId(string value) { } public string Value { get { throw null; } } + public static JournalId Create(System.Collections.Generic.IEnumerable segments) { throw null; } + + public static JournalId Create(System.ReadOnlySpan segments) { throw null; } + + public static JournalId Create(string firstSegment, params System.ReadOnlySpan additionalSegments) { throw null; } + public readonly bool Equals(JournalId other) { throw null; } public override readonly bool Equals(object? obj) { throw null; } @@ -406,6 +415,8 @@ public JournalId(string value) { } public override readonly int GetHashCode() { throw null; } + public readonly bool IsPrefixOf(JournalId journalId) { throw null; } + public static bool operator ==(JournalId left, JournalId right) { throw null; } public static bool operator !=(JournalId left, JournalId right) { throw null; } @@ -413,6 +424,19 @@ public JournalId(string value) { } public override readonly string ToString() { throw null; } } + public sealed partial class JournalMetadata : IJournalMetadata + { + public JournalMetadata(string? format, string? eTag = null, System.Collections.Generic.IReadOnlyDictionary? properties = null) { } + + public static IJournalMetadata Empty { get { throw null; } } + + public string? ETag { get { throw null; } } + + public string? Format { get { throw null; } } + + public System.Collections.Generic.IReadOnlyDictionary Properties { get { throw null; } } + } + public readonly partial struct JournalReplayContext { private readonly object _dummy; @@ -428,17 +452,17 @@ public readonly partial struct JournalReplayContext public static partial class JournalStorageConsumerExtensions { - public static void Complete(this IJournalStorageConsumer consumer, IJournalFileMetadata? metadata) { } + public static void Complete(this IJournalStorageConsumer consumer, IJournalMetadata? metadata) { } - public static void Read(this IJournalStorageConsumer consumer, System.Buffers.ReadOnlySequence input, IJournalFileMetadata? metadata, bool complete) { } + public static void Read(this IJournalStorageConsumer consumer, System.Buffers.ReadOnlySequence input, IJournalMetadata? metadata, bool complete) { } - public static void Read(this IJournalStorageConsumer consumer, System.Collections.Generic.IEnumerable> segments, IJournalFileMetadata? metadata, bool complete) { } + public static void Read(this IJournalStorageConsumer consumer, System.Collections.Generic.IEnumerable> segments, IJournalMetadata? metadata, bool complete) { } - public static void Read(this IJournalStorageConsumer consumer, System.ReadOnlyMemory input, IJournalFileMetadata? metadata, bool complete) { } + public static void Read(this IJournalStorageConsumer consumer, System.ReadOnlyMemory input, IJournalMetadata? metadata, bool complete) { } - public static System.Threading.Tasks.ValueTask ReadAsync(this IJournalStorageConsumer consumer, System.IO.Stream input, IJournalFileMetadata? metadata, bool complete, System.Threading.CancellationToken cancellationToken) { throw null; } + public static System.Threading.Tasks.ValueTask ReadAsync(this IJournalStorageConsumer consumer, System.IO.Stream input, IJournalMetadata? metadata, bool complete, System.Threading.CancellationToken cancellationToken) { throw null; } - public static System.Threading.Tasks.ValueTask ReadAsync(this IJournalStorageConsumer consumer, System.IO.Stream input, IJournalFileMetadata? metadata, System.Threading.CancellationToken cancellationToken) { throw null; } + public static System.Threading.Tasks.ValueTask ReadAsync(this IJournalStorageConsumer consumer, System.IO.Stream input, IJournalMetadata? metadata, System.Threading.CancellationToken cancellationToken) { throw null; } } public readonly partial struct JournalStreamId : System.IEquatable @@ -487,20 +511,28 @@ public VolatileJournalStorage(string? journalFormatKey) { } public System.Threading.Tasks.ValueTask AppendAsync(System.Buffers.ReadOnlySequence segment, System.Threading.CancellationToken cancellationToken) { throw null; } + public System.Threading.Tasks.ValueTask CreateIfNotExistsAsync(System.Collections.Generic.IReadOnlyDictionary? metadata = null, System.Threading.CancellationToken cancellationToken = default) { throw null; } + public System.Threading.Tasks.ValueTask DeleteAsync(System.Threading.CancellationToken cancellationToken) { throw null; } + public System.Threading.Tasks.ValueTask GetMetadataAsync(System.Threading.CancellationToken cancellationToken = default) { throw null; } + public System.Threading.Tasks.ValueTask ReadAsync(IJournalStorageConsumer consumer, System.Threading.CancellationToken cancellationToken) { throw null; } public System.Threading.Tasks.ValueTask ReplaceAsync(System.Buffers.ReadOnlySequence snapshot, System.Threading.CancellationToken cancellationToken) { throw null; } + + public System.Threading.Tasks.ValueTask UpdateMetadataAsync(System.Collections.Generic.IReadOnlyDictionary? set = null, System.Collections.Generic.IEnumerable? remove = null, string? expectedETag = null, System.Threading.CancellationToken cancellationToken = default) { throw null; } } - public sealed partial class VolatileJournalStorageProvider : IJournalStorageProvider + public sealed partial class VolatileJournalStorageProvider : IJournalStorageProvider, IJournalStorageCatalog { public VolatileJournalStorageProvider() { } public VolatileJournalStorageProvider(Microsoft.Extensions.Options.IOptions options) { } public IJournalStorage CreateStorage(JournalId journalId) { throw null; } + + public System.Collections.Generic.IAsyncEnumerable ListAsync(JournalId prefix = default, System.Threading.CancellationToken cancellationToken = default) { throw null; } } } @@ -658,4 +690,4 @@ public void WriteField(ref global::Orleans.Serialization.Buffers. public sealed partial class Copier_DurableTaskCompletionSourceState : global::Orleans.Serialization.Cloning.ShallowCopier> { } -} \ No newline at end of file +} diff --git a/test/Extensions/Orleans.Azure.Tests/AzureStorageOperationOptionsExtensions.cs b/test/Extensions/Orleans.Azure.Tests/AzureStorageOperationOptionsExtensions.cs index 9fc2526ecf6..7955815ced7 100644 --- a/test/Extensions/Orleans.Azure.Tests/AzureStorageOperationOptionsExtensions.cs +++ b/test/Extensions/Orleans.Azure.Tests/AzureStorageOperationOptionsExtensions.cs @@ -59,20 +59,6 @@ public static Orleans.Configuration.AzureBlobStorageOptions ConfigureTestDefault return options; } - public static AzureStorageJobShardOptions ConfigureTestDefaults(this AzureStorageJobShardOptions options) - { - if (TestDefaultConfiguration.UseAadAuthentication) - { - options.BlobServiceClient = new(TestDefaultConfiguration.DataBlobUri, TestDefaultConfiguration.TokenCredential); - } - else - { - options.BlobServiceClient = new(TestDefaultConfiguration.DataConnectionString); - } - - return options; - } - public static Orleans.Configuration.AzureQueueOptions ConfigureTestDefaults(this Orleans.Configuration.AzureQueueOptions options) { if (TestDefaultConfiguration.UseAadAuthentication) diff --git a/test/Extensions/Orleans.Azure.Tests/DurableJobs/AzureStorageJobShardBatchingTests.cs b/test/Extensions/Orleans.Azure.Tests/DurableJobs/AzureStorageJobShardBatchingTests.cs deleted file mode 100644 index 63921e9a557..00000000000 --- a/test/Extensions/Orleans.Azure.Tests/DurableJobs/AzureStorageJobShardBatchingTests.cs +++ /dev/null @@ -1,328 +0,0 @@ -using System; -using System.Collections.Generic; -using System.Collections.Immutable; -using System.Linq; -using System.Net; -using System.Threading; -using System.Threading.Tasks; -using Microsoft.Extensions.Logging.Abstractions; -using Microsoft.Extensions.Options; -using Orleans.Hosting; -using Orleans.Runtime; -using Orleans.DurableJobs; -using Orleans.DurableJobs.AzureStorage; -using Tester.AzureUtils; -using Xunit; - -namespace Tester.AzureUtils.DurableJobs; - -/// -/// Azure Storage-specific tests for job shard batching functionality. -/// These tests verify Azure-specific batching behaviors that don't apply to all providers. -/// -[TestCategory("DurableJobs")] -public class AzureStorageJobShardBatchingTests : AzureStorageBasicTests, IAsyncDisposable -{ - private readonly IDictionary _metadata = new Dictionary - { - { "CreatedBy", "UnitTest" }, - { "Purpose", "Testing" } - }; - - internal InMemoryClusterMembershipService MembershipService { get; } - - internal IOptions StorageOptions { get; } - internal IOptions DurableJobsOptions { get; } - - public AzureStorageJobShardBatchingTests() - { - MembershipService = new InMemoryClusterMembershipService(); - StorageOptions = Options.Create(new AzureStorageJobShardOptions()); - DurableJobsOptions = Options.Create(new DurableJobsOptions()); - StorageOptions.Value.ConfigureTestDefaults(); - StorageOptions.Value.ContainerName = "test-batch-container-" + Guid.NewGuid().ToString("N"); - } - - public async ValueTask DisposeAsync() - { - // Cleanup storage container - var client = StorageOptions.Value.BlobServiceClient; - var container = client.GetBlobContainerClient(StorageOptions.Value.ContainerName); - await container.DeleteIfExistsAsync(); - } - - public class TestLocalSiloDetails : ILocalSiloDetails - { - public TestLocalSiloDetails(SiloAddress siloAddress) - { - SiloAddress = siloAddress; - } - - public string Name => SiloAddress.ToString(); - - public string ClusterId => "TestCluster"; - - public string DnsHostName => SiloAddress.ToString(); - - public SiloAddress SiloAddress { get; } - - public SiloAddress GatewayAddress => SiloAddress; - } - - internal AzureStorageJobShardManager CreateManager(SiloAddress siloAddress) - { - var localSiloDetails = new TestLocalSiloDetails(siloAddress); - return new AzureStorageJobShardManager(localSiloDetails, StorageOptions, DurableJobsOptions, MembershipService, NullLoggerFactory.Instance); - } - - internal void SetSiloStatus(SiloAddress siloAddress, SiloStatus status) - { - MembershipService.SetSiloStatus(siloAddress, status); - } - - [SkippableFact, TestCategory("Azure"), TestCategory("Functional")] - public async Task AzureStorageJobShard_MultipleOperationsBatched() - { - using var cts = new CancellationTokenSource(TimeSpan.FromMinutes(2)); - var cancellationToken = cts.Token; - // Configure batching options to batch multiple operations - StorageOptions.Value.MinBatchSize = 5; - StorageOptions.Value.MaxBatchSize = 50; - StorageOptions.Value.BatchFlushInterval = TimeSpan.FromMilliseconds(100); - - var localAddress = SiloAddress.New(new IPEndPoint(IPAddress.Loopback, 5000), 0); - SetSiloStatus(localAddress, SiloStatus.Active); - var manager = CreateManager(localAddress); - - var date = DateTime.UtcNow; - var shard = await manager.CreateShardAsync(date, date.AddHours(1), _metadata, cancellationToken); - - // Schedule 10 jobs rapidly to trigger batching - var tasks = new List(); - for (int i = 0; i < 10; i++) - { - tasks.Add(shard.TryScheduleJobAsync(new ScheduleJobRequest { Target = GrainId.Create("type", $"target{i}"), JobName = $"job{i}", DueTime = date.AddMilliseconds(i * 10d), Metadata = null }, cancellationToken)); - } - - await Task.WhenAll(tasks); - - // Wait for batches to flush - await Task.Delay(TimeSpan.FromMilliseconds(300), cancellationToken); - - // Verify batching occurred - should have fewer committed blocks than individual operations - var azureShard = (AzureStorageJobShard)shard; - Assert.True(azureShard.CommitedBlockCount < 10, $"Expected batching to reduce block count, but got {azureShard.CommitedBlockCount}"); - - // Verify all jobs were persisted by marking silo as dead and reassigning - SetSiloStatus(localAddress, SiloStatus.Dead); - var newSiloAddress = SiloAddress.New(new IPEndPoint(IPAddress.Loopback, 5000), 1); - SetSiloStatus(newSiloAddress, SiloStatus.Active); - - var newManager = CreateManager(newSiloAddress); - var shards = await newManager.AssignJobShardsAsync(DateTime.UtcNow.AddHours(1), maxNewClaims: int.MaxValue, cancellationToken); - Assert.Single(shards); - - var consumedJobs = new List(); - await foreach (var jobCtx in shards[0].ConsumeDurableJobsAsync().WithCancellation(cancellationToken)) - { - consumedJobs.Add(jobCtx.Job.Name); - await shards[0].RemoveJobAsync(jobCtx.Job.Id, cancellationToken); - } - - Assert.Equal(10, consumedJobs.Count); - await newManager.UnregisterShardAsync(shards[0], cancellationToken); - } - - [SkippableFact, TestCategory("Azure"), TestCategory("Functional")] - public async Task AzureStorageJobShard_PartialBatchFlushesOnTimeout() - { - using var cts = new CancellationTokenSource(TimeSpan.FromMinutes(2)); - var cancellationToken = cts.Token; - // Configure batching to require 10 operations but with a short timeout - StorageOptions.Value.MinBatchSize = 10; - StorageOptions.Value.MaxBatchSize = 100; - StorageOptions.Value.BatchFlushInterval = TimeSpan.FromMilliseconds(200); - - var localAddress = SiloAddress.New(new IPEndPoint(IPAddress.Loopback, 5000), 0); - SetSiloStatus(localAddress, SiloStatus.Active); - var manager = CreateManager(localAddress); - - var date = DateTime.UtcNow; - var shard = await manager.CreateShardAsync(date, date.AddHours(1), _metadata, cancellationToken); - - // Schedule only 3 jobs (less than MinBatchSize of 10) - var tasks = new Task[3]; - tasks[0] = shard.TryScheduleJobAsync(new ScheduleJobRequest { Target = GrainId.Create("type", "target1"), JobName = "job1", DueTime = date.AddSeconds(1), Metadata = null }, cancellationToken); - tasks[1] = shard.TryScheduleJobAsync(new ScheduleJobRequest { Target = GrainId.Create("type", "target2"), JobName = "job2", DueTime = date.AddSeconds(2), Metadata = null }, cancellationToken); - tasks[2] = shard.TryScheduleJobAsync(new ScheduleJobRequest { Target = GrainId.Create("type", "target3"), JobName = "job3", DueTime = date.AddSeconds(3), Metadata = null }, cancellationToken); - - await Task.WhenAll(tasks); - - // Verify that the partial batch was flushed - should have 1 committed block - var azureShard = (AzureStorageJobShard)shard; - Assert.Equal(1, azureShard.CommitedBlockCount); - - // Verify jobs were persisted despite not reaching MinBatchSize - SetSiloStatus(localAddress, SiloStatus.Dead); - var newSiloAddress = SiloAddress.New(new IPEndPoint(IPAddress.Loopback, 5000), 1); - SetSiloStatus(newSiloAddress, SiloStatus.Active); - - var newManager = CreateManager(newSiloAddress); - var shards = await newManager.AssignJobShardsAsync(DateTime.UtcNow.AddHours(1), maxNewClaims: int.MaxValue, cancellationToken); - Assert.Single(shards); - - var consumedJobs = new List(); - await foreach (var jobCtx in shards[0].ConsumeDurableJobsAsync().WithCancellation(cancellationToken)) - { - consumedJobs.Add(jobCtx.Job.Name); - await shards[0].RemoveJobAsync(jobCtx.Job.Id, cancellationToken); - } - - Assert.Equal(3, consumedJobs.Count); - await newManager.UnregisterShardAsync(shards[0], cancellationToken); - } - - [SkippableFact, TestCategory("Azure"), TestCategory("Functional")] - public async Task AzureStorageJobShard_MaxBatchSizeEnforced() - { - using var cts = new CancellationTokenSource(TimeSpan.FromMinutes(2)); - var cancellationToken = cts.Token; - // Configure batching with a small max batch size - StorageOptions.Value.MinBatchSize = 1; - StorageOptions.Value.MaxBatchSize = 20; - StorageOptions.Value.BatchFlushInterval = TimeSpan.FromMilliseconds(50); - - var localAddress = SiloAddress.New(new IPEndPoint(IPAddress.Loopback, 5000), 0); - SetSiloStatus(localAddress, SiloStatus.Active); - var manager = CreateManager(localAddress); - - var date = DateTime.UtcNow; - var shard = await manager.CreateShardAsync(date, date.AddHours(1), _metadata, cancellationToken); - - // Schedule 50 jobs rapidly (exceeds MaxBatchSize of 20) - var tasks = new List(); - for (int i = 0; i < 50; i++) - { - tasks.Add(shard.TryScheduleJobAsync(new ScheduleJobRequest { Target = GrainId.Create("type", $"target{i}"), JobName = $"job{i}", DueTime = date.AddMilliseconds(i), Metadata = null }, cancellationToken)); - } - - await Task.WhenAll(tasks); - - // Wait for all batches to flush - await Task.Delay(TimeSpan.FromMilliseconds(500), cancellationToken); - - // Verify multiple batches were created due to MaxBatchSize limit - // With 50 jobs and MaxBatchSize=20, expect at least 3 blocks (50/20 = 2.5, rounded up) - var azureShard = (AzureStorageJobShard)shard; - Assert.True(azureShard.CommitedBlockCount >= 3, $"Expected at least 3 blocks for 50 jobs with MaxBatchSize=20, but got {azureShard.CommitedBlockCount}"); - - // Verify all jobs were persisted (should be split into multiple batches) - SetSiloStatus(localAddress, SiloStatus.Dead); - var newSiloAddress = SiloAddress.New(new IPEndPoint(IPAddress.Loopback, 5000), 1); - SetSiloStatus(newSiloAddress, SiloStatus.Active); - - var newManager = CreateManager(newSiloAddress); - var shards = await newManager.AssignJobShardsAsync(DateTime.UtcNow.AddHours(1), maxNewClaims: int.MaxValue, cancellationToken); - Assert.Single(shards); - - var consumedJobs = new List(); - await foreach (var jobCtx in shards[0].ConsumeDurableJobsAsync().WithCancellation(cancellationToken)) - { - consumedJobs.Add(jobCtx.Job.Name); - await shards[0].RemoveJobAsync(jobCtx.Job.Id, cancellationToken); - } - - Assert.Equal(50, consumedJobs.Count); - await newManager.UnregisterShardAsync(shards[0], cancellationToken); - } - - [SkippableFact, TestCategory("Azure"), TestCategory("Functional")] - public async Task AzureStorageJobShard_MetadataOperationsBreakBatches() - { - using var cts = new CancellationTokenSource(TimeSpan.FromMinutes(2)); - var cancellationToken = cts.Token; - // Configure batching to require large batch - StorageOptions.Value.MinBatchSize = 10; - StorageOptions.Value.MaxBatchSize = 100; - StorageOptions.Value.BatchFlushInterval = TimeSpan.FromSeconds(5); - - var localAddress = SiloAddress.New(new IPEndPoint(IPAddress.Loopback, 5000), 0); - SetSiloStatus(localAddress, SiloStatus.Active); - var manager = CreateManager(localAddress); - - var date = DateTime.UtcNow; - var shard = await manager.CreateShardAsync(date, date.AddHours(1), _metadata, cancellationToken); - - // Schedule 5 jobs (less than MinBatchSize) - var tasks = new List(); - for (int i = 0; i < 5; i++) - { - tasks.Add(shard.TryScheduleJobAsync(new ScheduleJobRequest { Target = GrainId.Create("type", $"target{i}"), JobName = $"job{i}", DueTime = date.AddMilliseconds(i), Metadata = null }, cancellationToken)); - } - - // Give operations time to queue - await Task.Delay(50, cancellationToken); - - // Verify no blocks committed yet (batch still pending) - var azureShard = (AzureStorageJobShard)shard; - var blockCountBefore = azureShard.CommitedBlockCount; - - // Update metadata (should flush pending batch and process immediately) - var newMetadata = new Dictionary(shard.Metadata) { ["Updated"] = "true" }; - await azureShard.UpdateBlobMetadata(newMetadata, cancellationToken); - - Assert.All(tasks, t => Assert.True(t.IsCompletedSuccessfully, "Expected all job scheduling tasks to complete successfully")); - Assert.True(azureShard.CommitedBlockCount > blockCountBefore, "Expected metadata update to flush pending batch"); - - // Verify metadata was updated - var props = await azureShard.BlobClient.GetPropertiesAsync(cancellationToken: cancellationToken); - Assert.True(props.Value.Metadata.ContainsKey("Updated")); - Assert.Equal("true", props.Value.Metadata["Updated"]); - - // Verify jobs were persisted (even though batch was incomplete) - SetSiloStatus(localAddress, SiloStatus.Dead); - var newSiloAddress = SiloAddress.New(new IPEndPoint(IPAddress.Loopback, 5000), 1); - SetSiloStatus(newSiloAddress, SiloStatus.Active); - - // Reconfigure batching to make test faster - StorageOptions.Value.MinBatchSize = 1; - StorageOptions.Value.MaxBatchSize = 1; - StorageOptions.Value.BatchFlushInterval = TimeSpan.FromMilliseconds(100); - - var newManager = CreateManager(newSiloAddress); - var shards = await newManager.AssignJobShardsAsync(DateTime.UtcNow.AddHours(1), maxNewClaims: int.MaxValue, cancellationToken); - Assert.Single(shards); - - var consumedJobs = new List(); - await foreach (var jobCtx in shards[0].ConsumeDurableJobsAsync().WithCancellation(cancellationToken)) - { - consumedJobs.Add(jobCtx.Job.Name); - await shards[0].RemoveJobAsync(jobCtx.Job.Id, cancellationToken); - } - - Assert.Equal(5, consumedJobs.Count); - await newManager.UnregisterShardAsync(shards[0], cancellationToken); - } - - public class InMemoryClusterMembershipService : IClusterMembershipService - { - private readonly Dictionary _silos = new(); - private int _version = 0; - - public ClusterMembershipSnapshot CurrentSnapshot => - new ClusterMembershipSnapshot(_silos.ToImmutableDictionary(), new MembershipVersion(_version)); - - public IAsyncEnumerable MembershipUpdates => throw new NotImplementedException(); - - public void SetSiloStatus(SiloAddress address, SiloStatus status) - { - _silos[address] = new ClusterMember(address, status, address.ToParsableString()); - _version++; - } - - public ValueTask Refresh(MembershipVersion minimumVersion = default, CancellationToken cancellationToken = default) => - ValueTask.CompletedTask; - - public Task TryKill(SiloAddress siloAddress) => throw new NotImplementedException(); - } -} diff --git a/test/Extensions/Orleans.Azure.Tests/DurableJobs/AzureStorageJobShardManagerTestFixture.cs b/test/Extensions/Orleans.Azure.Tests/DurableJobs/AzureStorageJobShardManagerTestFixture.cs deleted file mode 100644 index bfbae50ba93..00000000000 --- a/test/Extensions/Orleans.Azure.Tests/DurableJobs/AzureStorageJobShardManagerTestFixture.cs +++ /dev/null @@ -1,49 +0,0 @@ -using System; -using System.Collections.Generic; -using System.Threading.Tasks; -using Microsoft.Extensions.Logging.Abstractions; -using Microsoft.Extensions.Options; -using Orleans.Hosting; -using Orleans.Runtime; -using Orleans.DurableJobs; -using Orleans.DurableJobs.AzureStorage; -using Tester.AzureUtils; -using Tester.DurableJobs; - -namespace Orleans.Tests.DurableJobs.AzureStorage; - -/// -/// Azure Storage implementation of . -/// Provides the infrastructure needed to run shared job shard manager tests against Azure Storage. -/// -internal sealed class AzureStorageJobShardManagerTestFixture : IJobShardManagerTestFixture -{ - private readonly IOptions _storageOptions; - private readonly IOptions _durableJobsOptions; - - public AzureStorageJobShardManagerTestFixture() - { - _storageOptions = Options.Create(new AzureStorageJobShardOptions()); - _durableJobsOptions = Options.Create(new DurableJobsOptions()); - _storageOptions.Value.ConfigureTestDefaults(); - _storageOptions.Value.ContainerName = "test-container-" + Guid.NewGuid().ToString("N"); - } - - public JobShardManager CreateManager(ILocalSiloDetails localSiloDetails, IClusterMembershipService membershipService) - { - return new AzureStorageJobShardManager( - localSiloDetails, - _storageOptions, - _durableJobsOptions, - membershipService, - NullLoggerFactory.Instance); - } - - public async ValueTask DisposeAsync() - { - // Cleanup storage container - var client = _storageOptions.Value.BlobServiceClient; - var container = client.GetBlobContainerClient(_storageOptions.Value.ContainerName); - await container.DeleteIfExistsAsync(); - } -} diff --git a/test/Extensions/Orleans.Azure.Tests/DurableJobs/AzureStorageJobShardManagerTests.cs b/test/Extensions/Orleans.Azure.Tests/DurableJobs/AzureStorageJobShardManagerTests.cs deleted file mode 100644 index 6ea17ecfc39..00000000000 --- a/test/Extensions/Orleans.Azure.Tests/DurableJobs/AzureStorageJobShardManagerTests.cs +++ /dev/null @@ -1,226 +0,0 @@ -using System; -using System.Collections.Generic; -using System.Collections.Immutable; -using System.Linq; -using System.Net; -using System.Text; -using System.Threading; -using System.Threading.Tasks; -using AwesomeAssertions; -using Microsoft.Extensions.Logging; -using Microsoft.Extensions.Logging.Abstractions; -using Microsoft.Extensions.Options; -using Orleans.Internal; -using Orleans.DurableJobs; -using Orleans.DurableJobs.AzureStorage; -using Orleans.Tests.DurableJobs.AzureStorage; -using Tester.DurableJobs; -using Xunit; -using Xunit.Sdk; - -namespace Tester.AzureUtils.DurableJobs; - -/// -/// Azure Storage-specific tests for job shard manager functionality. -/// Common tests are delegated to for reusability across providers. -/// Provider-specific tests (e.g., batching) remain here. -/// -[TestCategory("DurableJobs")] -public class AzureStorageJobShardManagerTests : AzureStorageBasicTests, IAsyncDisposable -{ - private readonly AzureStorageJobShardManagerTestFixture _fixture; - private readonly JobShardManagerTestsRunner _runner; - - internal IOptions StorageOptions { get; } - - public AzureStorageJobShardManagerTests() - { - StorageOptions = Options.Create(new AzureStorageJobShardOptions()); - StorageOptions.Value.ConfigureTestDefaults(); - StorageOptions.Value.ContainerName = "test-container-" + Guid.NewGuid().ToString("N"); - - // Create fixture and runner for common tests - _fixture = new AzureStorageJobShardManagerTestFixture(); - _runner = new JobShardManagerTestsRunner(_fixture); - } - - public async ValueTask DisposeAsync() - { - // Cleanup storage container - var client = StorageOptions.Value.BlobServiceClient; - var container = client.GetBlobContainerClient(StorageOptions.Value.ContainerName); - await container.DeleteIfExistsAsync(); - - // Cleanup fixture - await _fixture.DisposeAsync(); - } - - #region Common Tests (Delegated to Runner) - - /// - /// Tests basic shard creation and assignment workflow. - /// This test is delegated to the runner for reuse across providers. - /// - [SkippableFact, TestCategory("Azure"), TestCategory("Functional")] - public async Task AzureStorageJobShardManager_Creation_Assignation() - { - using var cts = new CancellationTokenSource(TimeSpan.FromMinutes(2)); - await _runner.ShardCreationAndAssignment(cts.Token); - } - - /// - /// Tests reading and consuming jobs from a frozen shard after ownership transfer. - /// This test is delegated to the runner for reuse across providers. - /// - [SkippableFact, TestCategory("Azure"), TestCategory("Functional")] - public async Task AzureStorageJobShardManager_ReadFrozenShard() - { - using var cts = new CancellationTokenSource(TimeSpan.FromMinutes(2)); - await _runner.ReadFrozenShard(cts.Token); - } - - /// - /// Tests consuming jobs from a live shard. - /// This test is delegated to the runner for reuse across providers. - /// - [SkippableFact, TestCategory("Azure"), TestCategory("Functional")] - public async Task AzureStorageJobShardManager_LiveShard() - { - using var cts = new CancellationTokenSource(TimeSpan.FromMinutes(2)); - await _runner.LiveShard(cts.Token); - } - - /// - /// Tests job metadata persistence across ownership transfers. - /// This test is delegated to the runner for reuse across providers. - /// - [SkippableFact, TestCategory("Azure"), TestCategory("Functional")] - public async Task AzureStorageJobShardManager_JobMetadata() - { - using var cts = new CancellationTokenSource(TimeSpan.FromMinutes(2)); - await _runner.JobMetadata(cts.Token); - } - - /// - /// Tests concurrent shard assignment to verify ownership conflict resolution. - /// This test is delegated to the runner for reuse across providers. - /// - [SkippableFact, TestCategory("Azure"), TestCategory("Functional")] - public async Task AzureStorageJobShardManager_ConcurrentShardAssignment_OwnershipConflicts() - { - using var cts = new CancellationTokenSource(TimeSpan.FromMinutes(2)); - await _runner.ConcurrentShardAssignment_OwnershipConflicts(cts.Token); - } - - /// - /// Tests shard metadata preservation across ownership transfers. - /// This test is delegated to the runner for reuse across providers. - /// - [SkippableFact, TestCategory("Azure"), TestCategory("Functional")] - public async Task AzureStorageJobShardManager_ShardMetadataMerge() - { - using var cts = new CancellationTokenSource(TimeSpan.FromMinutes(2)); - await _runner.ShardMetadataMerge(cts.Token); - } - - #endregion - - /// - /// Tests stopping shard processing and verifying jobs remain for reassignment. - /// This test is delegated to the runner for reuse across providers. - /// - [SkippableFact, TestCategory("Azure"), TestCategory("Functional")] - public async Task AzureStorageJobShardManager_StopProcessingShard() - { - using var cts = new CancellationTokenSource(TimeSpan.FromMinutes(2)); - await _runner.StopProcessingShard(cts.Token); - } - - /// - /// Tests retrying a job with a new due time. - /// This test is delegated to the runner for reuse across providers. - /// - [SkippableFact, TestCategory("Azure"), TestCategory("Functional")] - public async Task AzureStorageJobShardManager_RetryJobLater() - { - using var cts = new CancellationTokenSource(TimeSpan.FromMinutes(2)); - await _runner.RetryJobLater(cts.Token); - } - - /// - /// Tests job cancellation before and during processing. - /// This test is delegated to the runner for reuse across providers. - /// - [SkippableFact, TestCategory("Azure"), TestCategory("Functional")] - public async Task AzureStorageJobShardManager_JobCancellation() - { - using var cts = new CancellationTokenSource(TimeSpan.FromMinutes(2)); - await _runner.JobCancellation(cts.Token); - } - - /// - /// Tests that multiple shard registrations with the same time range produce unique IDs. - /// This test is delegated to the runner for reuse across providers. - /// - [SkippableFact, TestCategory("Azure"), TestCategory("Functional")] - public async Task AzureStorageJobShardManager_ShardRegistrationRetry_IdCollisions() - { - using var cts = new CancellationTokenSource(TimeSpan.FromMinutes(2)); - await _runner.ShardRegistrationRetry_IdCollisions(cts.Token); - } - - /// - /// Tests that unregistering a shard with remaining jobs preserves the shard for reassignment. - /// This test is delegated to the runner for reuse across providers. - /// - [SkippableFact, TestCategory("Azure"), TestCategory("Functional")] - public async Task AzureStorageJobShardManager_UnregisterShard_WithJobsRemaining() - { - using var cts = new CancellationTokenSource(TimeSpan.FromMinutes(2)); - await _runner.UnregisterShard_WithJobsRemaining(cts.Token); - } - - /// - /// Tests that maxNewClaims limits the number of orphaned shards claimed. - /// This test is delegated to the runner for reuse across providers. - /// - [SkippableFact, TestCategory("Azure"), TestCategory("Functional")] - public async Task AzureStorageJobShardManager_SlowStart_LimitsOrphanedShardClaims() - { - using var cts = new CancellationTokenSource(TimeSpan.FromMinutes(2)); - await _runner.SlowStart_LimitsOrphanedShardClaims(cts.Token); - } - - /// - /// Tests that maxNewClaims = 0 prevents claiming orphaned shards but returns owned shards. - /// This test is delegated to the runner for reuse across providers. - /// - [SkippableFact, TestCategory("Azure"), TestCategory("Functional")] - public async Task AzureStorageJobShardManager_SlowStart_ZeroBudgetClaimsNothing() - { - using var cts = new CancellationTokenSource(TimeSpan.FromMinutes(2)); - await _runner.SlowStart_ZeroBudgetClaimsNothing(cts.Token); - } - - /// - /// Tests that maxNewClaims = int.MaxValue (unlimited) claims all orphaned shards. - /// This test is delegated to the runner for reuse across providers. - /// - [SkippableFact, TestCategory("Azure"), TestCategory("Functional")] - public async Task AzureStorageJobShardManager_SlowStart_UnlimitedBudgetClaimsAll() - { - using var cts = new CancellationTokenSource(TimeSpan.FromMinutes(2)); - await _runner.SlowStart_UnlimitedBudgetClaimsAll(cts.Token); - } - - /// - /// Tests that budget exhaustion does not inflate the adopted count, avoiding false poison detection. - /// This test is delegated to the runner for reuse across providers. - /// - [SkippableFact, TestCategory("Azure"), TestCategory("Functional")] - public async Task AzureStorageJobShardManager_SlowStart_BudgetExhaustion_DoesNotInflateAdoptedCount() - { - using var cts = new CancellationTokenSource(TimeSpan.FromMinutes(2)); - await _runner.SlowStart_BudgetExhaustion_DoesNotInflateAdoptedCount(cts.Token); - } -} diff --git a/test/Extensions/Orleans.Azure.Tests/DurableJobs/NetstringJsonSerializerTests.cs b/test/Extensions/Orleans.Azure.Tests/DurableJobs/NetstringJsonSerializerTests.cs deleted file mode 100644 index 64cf105c586..00000000000 --- a/test/Extensions/Orleans.Azure.Tests/DurableJobs/NetstringJsonSerializerTests.cs +++ /dev/null @@ -1,445 +0,0 @@ -using System; -using System.Buffers; -using System.Collections.Generic; -using System.IO; -using System.Linq; -using System.Text; -using System.Text.Json; -using System.Threading.Tasks; -using AwesomeAssertions; -using Orleans.Runtime; -using Orleans.DurableJobs.AzureStorage; -using Xunit; - -namespace Tester.AzureUtils.DurableJobs; - -[TestCategory("DurableJobs"), TestCategory("BVT")] -public class NetstringJsonSerializerTests -{ - private static byte[] EncodeToBytes(JobOperation operation) - { - var stream = new MemoryStream(); - NetstringJsonSerializer.Encode(operation, stream, JobOperationJsonContext.Default.JobOperation); - return stream.ToArray(); - } - [Fact] - public void Encode_RemoveOperation_ProducesCorrectFormat() - { - var operation = JobOperation.CreateRemoveOperation("job123"); - var result = EncodeToBytes(operation); - var resultString = Encoding.UTF8.GetString(result); - - resultString.Should().EndWith("\n"); - resultString.Should().Match("*:*\n"); - resultString.Should().Contain("\"type\":1"); - resultString.Should().Contain("\"id\":\"job123\""); - } - - [Fact] - public void Encode_AddOperation_ProducesCorrectFormat() - { - var dueTime = new DateTimeOffset(2025, 10, 31, 12, 0, 0, TimeSpan.Zero); - var grainId = GrainId.Create("test", "grain1"); - var operation = JobOperation.CreateAddOperation("job456", "TestJob", dueTime, grainId, null); - var result = EncodeToBytes(operation); - var resultString = Encoding.UTF8.GetString(result); - - resultString.Should().EndWith("\n"); - resultString.Should().Match("*:*\n"); - resultString.Should().Contain("\"id\":\"job456\""); - resultString.Should().Contain("\"name\":\"TestJob\""); - } - - [Fact] - public void Encode_RetryOperation_ProducesCorrectFormat() - { - var dueTime = new DateTimeOffset(2025, 10, 31, 12, 0, 0, TimeSpan.Zero); - var operation = JobOperation.CreateRetryOperation("job789", dueTime); - var result = EncodeToBytes(operation); - var resultString = Encoding.UTF8.GetString(result); - - resultString.Should().EndWith("\n"); - resultString.Should().Match("*:*\n"); - resultString.Should().Contain("\"type\":2"); - resultString.Should().Contain("\"id\":\"job789\""); - } - - [Fact] - public void Encode_AddOperationWithMetadata_ProducesCorrectFormat() - { - var dueTime = new DateTimeOffset(2025, 10, 31, 12, 0, 0, TimeSpan.Zero); - var grainId = GrainId.Create("test", "grain1"); - var metadata = new Dictionary { ["key1"] = "value1", ["key2"] = "value2" }; - var operation = JobOperation.CreateAddOperation("job999", "MetaJob", dueTime, grainId, metadata); - var result = EncodeToBytes(operation); - var resultString = Encoding.UTF8.GetString(result); - - resultString.Should().EndWith("\n"); - resultString.Should().Contain("\"metadata\""); - resultString.Should().Contain("\"key1\":\"value1\""); - resultString.Should().Contain("\"key2\":\"value2\""); - } - - [Fact] - public void Encode_VerifiesNetstringFormat() - { - var operation = JobOperation.CreateRemoveOperation("test"); - var result = EncodeToBytes(operation); - var resultString = Encoding.UTF8.GetString(result); - - var parts = resultString.Split(':', 2); - parts.Should().HaveCount(2); - - var lengthStr = parts[0]; - lengthStr.Should().HaveLength(6, "length prefix should be 6 hex digits"); - int.TryParse(lengthStr, System.Globalization.NumberStyles.HexNumber, null, out var length).Should().BeTrue("length should be valid hex"); - length.Should().BeGreaterThan(0); - - var dataAndNewline = parts[1]; - dataAndNewline.Should().EndWith("\n"); - - var jsonData = dataAndNewline[..^1]; - var jsonBytes = Encoding.UTF8.GetBytes(jsonData); - jsonBytes.Length.Should().Be(length, "JSON data length should match the hex length prefix"); - } - - [Fact] - public async Task DecodeAsync_RemoveOperation_DecodesCorrectly() - { - var operation = JobOperation.CreateRemoveOperation("job123"); - var encoded = EncodeToBytes(operation); - var stream = new MemoryStream(encoded); - - var results = new List(); - await foreach (var item in NetstringJsonSerializer.DecodeAsync(stream, JobOperationJsonContext.Default.JobOperation, CancellationToken.None)) - { - results.Add(item); - } - - results.Should().HaveCount(1); - results[0].Type.Should().Be(JobOperation.OperationType.Remove); - results[0].Id.Should().Be("job123"); - } - - [Fact] - public async Task DecodeAsync_AddOperation_DecodesCorrectly() - { - var dueTime = new DateTimeOffset(2025, 10, 31, 12, 0, 0, TimeSpan.Zero); - var grainId = GrainId.Create("test", "grain1"); - var operation = JobOperation.CreateAddOperation("job456", "TestJob", dueTime, grainId, null); - var encoded = EncodeToBytes(operation); - var stream = new MemoryStream(encoded); - - var results = new List(); - await foreach (var item in NetstringJsonSerializer.DecodeAsync(stream, JobOperationJsonContext.Default.JobOperation, CancellationToken.None)) - { - results.Add(item); - } - - results.Should().HaveCount(1); - results[0].Type.Should().Be(JobOperation.OperationType.Add); - results[0].Id.Should().Be("job456"); - results[0].Name.Should().Be("TestJob"); - results[0].DueTime.Should().Be(dueTime); - results[0].TargetGrainId.Should().Be(grainId); - } - - [Fact] - public async Task DecodeAsync_MultipleOperations_DecodesCorrectly() - { - var dueTime = new DateTimeOffset(2025, 10, 31, 12, 0, 0, TimeSpan.Zero); - var grainId = GrainId.Create("test", "grain1"); - var op1 = JobOperation.CreateAddOperation("job1", "Job1", dueTime, grainId, null); - var op2 = JobOperation.CreateRemoveOperation("job2"); - var op3 = JobOperation.CreateRetryOperation("job3", dueTime.AddHours(1)); - - var stream = new MemoryStream(); - await stream.WriteAsync(EncodeToBytes(op1)); - await stream.WriteAsync(EncodeToBytes(op2)); - await stream.WriteAsync(EncodeToBytes(op3)); - stream.Position = 0; - - var results = new List(); - await foreach (var item in NetstringJsonSerializer.DecodeAsync(stream, JobOperationJsonContext.Default.JobOperation, CancellationToken.None)) - { - results.Add(item); - } - - results.Should().HaveCount(3); - results[0].Type.Should().Be(JobOperation.OperationType.Add); - results[0].Id.Should().Be("job1"); - results[1].Type.Should().Be(JobOperation.OperationType.Remove); - results[1].Id.Should().Be("job2"); - results[2].Type.Should().Be(JobOperation.OperationType.Retry); - results[2].Id.Should().Be("job3"); - } - - [Fact] - public async Task DecodeAsync_AddOperationWithMetadata_DecodesCorrectly() - { - var dueTime = new DateTimeOffset(2025, 10, 31, 12, 0, 0, TimeSpan.Zero); - var grainId = GrainId.Create("test", "grain1"); - var metadata = new Dictionary { ["key1"] = "value1", ["key2"] = "value2" }; - var operation = JobOperation.CreateAddOperation("job999", "MetaJob", dueTime, grainId, metadata); - var encoded = EncodeToBytes(operation); - var stream = new MemoryStream(encoded); - - var results = new List(); - await foreach (var item in NetstringJsonSerializer.DecodeAsync(stream, JobOperationJsonContext.Default.JobOperation, CancellationToken.None)) - { - results.Add(item); - } - - results.Should().HaveCount(1); - results[0].Metadata.Should().NotBeNull(); - results[0].Metadata.Should().ContainKey("key1").WhoseValue.Should().Be("value1"); - results[0].Metadata.Should().ContainKey("key2").WhoseValue.Should().Be("value2"); - } - - [Fact] - public async Task DecodeAsync_EmptyStream_ReturnsEmpty() - { - var stream = new MemoryStream(); - - var results = new List(); - await foreach (var item in NetstringJsonSerializer.DecodeAsync(stream, JobOperationJsonContext.Default.JobOperation, CancellationToken.None)) - { - results.Add(item); - } - - results.Should().BeEmpty(); - } - - [Fact] - public async Task DecodeAsync_InvalidLength_ThrowsInvalidDataException() - { - var encoded = "GGGGGG:{\"type\":1,\"id\":\"test\"}\n"; // Invalid hex - var stream = new MemoryStream(Encoding.UTF8.GetBytes(encoded)); - - var act = async () => - { - await foreach (var item in NetstringJsonSerializer.DecodeAsync(stream, JobOperationJsonContext.Default.JobOperation, CancellationToken.None)) - { - // Should throw before yielding any items - } - }; - - await act.Should().ThrowAsync() - .WithMessage("Invalid netstring length: GGGGGG"); - } - - [Fact] - public async Task DecodeAsync_ExcessiveLength_ThrowsInvalidDataException() - { - var encoded = "FFFFFF:{\"type\":1}\n"; // 16777215 bytes, exceeds MaxLength - var stream = new MemoryStream(Encoding.UTF8.GetBytes(encoded)); - - var act = async () => - { - await foreach (var item in NetstringJsonSerializer.DecodeAsync(stream, JobOperationJsonContext.Default.JobOperation, CancellationToken.None)) - { - // Should throw before yielding any items - } - }; - - await act.Should().ThrowAsync() - .WithMessage("Netstring length out of valid range: *"); - } - - [Fact] - public async Task DecodeAsync_MissingTrailingNewline_ThrowsInvalidDataException() - { - var json = "{\"type\":1,\"id\":\"test\"}"; - var jsonBytes = Encoding.UTF8.GetBytes(json); - var encoded = $"{jsonBytes.Length:X6}:{json}x"; // Use 6-digit hex format - var stream = new MemoryStream(Encoding.UTF8.GetBytes(encoded)); - - var act = async () => - { - await foreach (var item in NetstringJsonSerializer.DecodeAsync(stream, JobOperationJsonContext.Default.JobOperation, CancellationToken.None)) - { - // Should throw after reading the data - } - }; - - await act.Should().ThrowAsync() - .WithMessage("Expected newline at end of netstring, got byte value *"); - } - - [Fact] - public async Task DecodeAsync_IncompleteData_ThrowsEndOfStreamException() - { - var encoded = "000064:{\"type\":1}"; // Claims 100 bytes but only provides 11 - var stream = new MemoryStream(Encoding.UTF8.GetBytes(encoded)); - - var act = async () => - { - await foreach (var item in NetstringJsonSerializer.DecodeAsync(stream, JobOperationJsonContext.Default.JobOperation, CancellationToken.None)) - { - // Should throw before yielding any items - } - }; - - await act.Should().ThrowAsync(); - } - - [Fact] - public async Task DecodeAsync_WrongTrailingCharacter_ThrowsInvalidDataException() - { - var json = "{\"type\":1,\"id\":\"test\"}"; - var jsonBytes = Encoding.UTF8.GetBytes(json); - var encoded = $"{jsonBytes.Length:X6}:{json}X"; // Use 6-digit hex format - var stream = new MemoryStream(Encoding.UTF8.GetBytes(encoded)); - - var act = async () => - { - await foreach (var item in NetstringJsonSerializer.DecodeAsync(stream, JobOperationJsonContext.Default.JobOperation, CancellationToken.None)) - { - // Should throw after reading the data - } - }; - - await act.Should().ThrowAsync() - .WithMessage("Expected newline at end of netstring, got byte value *"); - } - - [Fact] - public async Task DecodeAsync_InvalidJson_ThrowsJsonException() - { - var invalidJson = "{invalid json}"; - var jsonBytes = Encoding.UTF8.GetBytes(invalidJson); - var encoded = $"{jsonBytes.Length:X6}:{invalidJson}\n"; // Use 6-digit hex format - var stream = new MemoryStream(Encoding.UTF8.GetBytes(encoded)); - - var act = async () => - { - await foreach (var item in NetstringJsonSerializer.DecodeAsync(stream, JobOperationJsonContext.Default.JobOperation, CancellationToken.None)) - { - // Should throw when deserializing - } - }; - - await act.Should().ThrowAsync(); - } - - [Fact] - public async Task EncodeAndDecode_RoundTrip_PreservesData() - { - var dueTime1 = new DateTimeOffset(2025, 10, 31, 12, 0, 0, TimeSpan.Zero); - var dueTime2 = new DateTimeOffset(2025, 11, 1, 14, 30, 0, TimeSpan.Zero); - var grainId = GrainId.Create("test", "grain1"); - var metadata = new Dictionary { ["env"] = "prod", ["region"] = "us-east" }; - - var testOperations = new[] - { - JobOperation.CreateRemoveOperation("remove-job"), - JobOperation.CreateAddOperation("add-job", "MyJob", dueTime1, grainId, null), - JobOperation.CreateRetryOperation("retry-job", dueTime2), - JobOperation.CreateAddOperation("meta-job", "MetaJob", dueTime1, grainId, metadata) - }; - - foreach (var operation in testOperations) - { - var encoded = EncodeToBytes(operation); - var stream = new MemoryStream(encoded); - - var results = new List(); - await foreach (var item in NetstringJsonSerializer.DecodeAsync(stream, JobOperationJsonContext.Default.JobOperation, CancellationToken.None)) - { - results.Add(item); - } - - results.Should().HaveCount(1); - results[0].Type.Should().Be(operation.Type); - results[0].Id.Should().Be(operation.Id); - results[0].Name.Should().Be(operation.Name); - results[0].DueTime.Should().Be(operation.DueTime); - results[0].TargetGrainId.Should().Be(operation.TargetGrainId); - - if (operation.Metadata is not null) - { - results[0].Metadata.Should().NotBeNull(); - results[0].Metadata.Should().BeEquivalentTo(operation.Metadata); - } - } - } - - [Fact] - public async Task EncodeAndDecode_MultipleOperations_RoundTrip() - { - var dueTime = new DateTimeOffset(2025, 10, 31, 12, 0, 0, TimeSpan.Zero); - var grainId = GrainId.Create("test", "grain1"); - - var testOperations = new[] - { - JobOperation.CreateAddOperation("job1", "First", dueTime, grainId, null), - JobOperation.CreateRemoveOperation("job2"), - JobOperation.CreateRetryOperation("job3", dueTime.AddHours(1)), - JobOperation.CreateAddOperation("job4", "Fourth", dueTime.AddDays(1), grainId, null) - }; - - var memoryStream = new MemoryStream(); - foreach (var operation in testOperations) - { - var encoded = EncodeToBytes(operation); - await memoryStream.WriteAsync(encoded); - } - - memoryStream.Position = 0; - - var results = new List(); - await foreach (var item in NetstringJsonSerializer.DecodeAsync(memoryStream, JobOperationJsonContext.Default.JobOperation, CancellationToken.None)) - { - results.Add(item); - } - - results.Should().HaveCount(4); - for (var i = 0; i < testOperations.Length; i++) - { - results[i].Type.Should().Be(testOperations[i].Type); - results[i].Id.Should().Be(testOperations[i].Id); - } - } - - [Fact] - public async Task DecodeAsync_StreamPosition_IsPreserved() - { - var operation = JobOperation.CreateRemoveOperation("test"); - var encoded = EncodeToBytes(operation); - var stream = new MemoryStream(encoded); - - await foreach (var item in NetstringJsonSerializer.DecodeAsync(stream, JobOperationJsonContext.Default.JobOperation, CancellationToken.None)) - { - // Stream should be at the end after reading - } - - stream.Position.Should().Be(stream.Length); - } - - [Fact] - public async Task EncodeAndDecode_LargeMetadata_HandlesCorrectly() - { - var dueTime = new DateTimeOffset(2025, 10, 31, 12, 0, 0, TimeSpan.Zero); - var grainId = GrainId.Create("test", "grain1"); - - var largeMetadata = new Dictionary(); - for (var i = 0; i < 100; i++) - { - largeMetadata[$"key{i}"] = new string('x', 1000); - } - - var operation = JobOperation.CreateAddOperation("large-job", "LargeMetaJob", dueTime, grainId, largeMetadata); - var encoded = EncodeToBytes(operation); - var stream = new MemoryStream(encoded); - - var results = new List(); - await foreach (var item in NetstringJsonSerializer.DecodeAsync(stream, JobOperationJsonContext.Default.JobOperation, CancellationToken.None)) - { - results.Add(item); - } - - results.Should().HaveCount(1); - results[0].Metadata.Should().NotBeNull(); - results[0].Metadata.Should().HaveCount(100); - } -} diff --git a/test/Extensions/Orleans.Azure.Tests/Orleans.Azure.Tests.csproj b/test/Extensions/Orleans.Azure.Tests/Orleans.Azure.Tests.csproj index b654de69ca5..2e704b4cece 100644 --- a/test/Extensions/Orleans.Azure.Tests/Orleans.Azure.Tests.csproj +++ b/test/Extensions/Orleans.Azure.Tests/Orleans.Azure.Tests.csproj @@ -21,9 +21,7 @@ - - diff --git a/test/Extensions/Orleans.DurableJobs.AzureStorage.Tests/DurableJobs/AzureBlobJournaledJobShardManagerTests.cs b/test/Extensions/Orleans.DurableJobs.AzureStorage.Tests/DurableJobs/AzureBlobJournaledJobShardManagerTests.cs new file mode 100644 index 00000000000..15bac6e3691 --- /dev/null +++ b/test/Extensions/Orleans.DurableJobs.AzureStorage.Tests/DurableJobs/AzureBlobJournaledJobShardManagerTests.cs @@ -0,0 +1,67 @@ +#nullable enable + +using Azure.Storage.Blobs; +using Microsoft.Extensions.DependencyInjection; +using Microsoft.Extensions.Logging; +using Orleans.DurableJobs.Tests; +using Orleans.Hosting; +using Orleans.Runtime; +using Tester; +using TestExtensions; +using Xunit; + +namespace Tester.AzureUtils.DurableJobs; + +[TestCategory("Azure"), TestCategory("DurableJobs")] +public sealed class AzureBlobJournaledJobShardManagerTests(AzureBlobJournaledJobShardManagerTestFixture fixture) + : JobShardManagerTestsRunner(fixture), IClassFixture; + +public sealed class AzureBlobJournaledJobShardManagerTestFixture : IJobShardManagerTestFixture +{ + public async Task CreateScopeAsync() + { + TestUtils.CheckForAzureStorage(); + + var containerName = "durablejobs-shard-tests-" + Guid.NewGuid().ToString("N"); + var services = new ServiceCollection(); + services.AddLogging(); + services.AddSingleton(TimeProvider.System); + services.UseAzureBlobDurableJobs(options => + { + options.ConfigureTestDefaults(); + options.ContainerName = containerName; + }); + + var serviceProvider = services.BuildServiceProvider(); + var lifecycle = new SiloLifecycleSubject(serviceProvider.GetRequiredService>()); + foreach (var participant in serviceProvider.GetServices>()) + { + participant.Participate(lifecycle); + } + + using var cts = new CancellationTokenSource(TimeSpan.FromSeconds(60)); + await lifecycle.OnStart(cts.Token); + return new AzureBlobJournaledJobShardManagerTestScope(serviceProvider, lifecycle, CreateContainerClient(containerName)); + } + + private static BlobContainerClient CreateContainerClient(string containerName) + { + return TestDefaultConfiguration.UseAadAuthentication + ? new BlobContainerClient(new Uri(TestDefaultConfiguration.DataBlobUri, containerName), TestDefaultConfiguration.TokenCredential) + : new BlobContainerClient(TestDefaultConfiguration.DataConnectionString, containerName); + } + + private sealed class AzureBlobJournaledJobShardManagerTestScope( + ServiceProvider services, + SiloLifecycleSubject lifecycle, + BlobContainerClient container) : JournaledJobShardManagerTestScope(services) + { + public override async ValueTask DisposeAsync() + { + using var cts = new CancellationTokenSource(TimeSpan.FromSeconds(60)); + await lifecycle.OnStop(cts.Token); + await base.DisposeAsync(); + await container.DeleteIfExistsAsync(cancellationToken: cts.Token); + } + } +} diff --git a/test/Extensions/Orleans.Azure.Tests/DurableJobs/AzureStorageBlobDurableJobsTests.cs b/test/Extensions/Orleans.DurableJobs.AzureStorage.Tests/DurableJobs/AzureStorageBlobDurableJobsTests.cs similarity index 87% rename from test/Extensions/Orleans.Azure.Tests/DurableJobs/AzureStorageBlobDurableJobsTests.cs rename to test/Extensions/Orleans.DurableJobs.AzureStorage.Tests/DurableJobs/AzureStorageBlobDurableJobsTests.cs index 4dd964c8f0d..60178f563e2 100644 --- a/test/Extensions/Orleans.Azure.Tests/DurableJobs/AzureStorageBlobDurableJobsTests.cs +++ b/test/Extensions/Orleans.DurableJobs.AzureStorage.Tests/DurableJobs/AzureStorageBlobDurableJobsTests.cs @@ -1,8 +1,7 @@ -using System; -using System.Threading; -using System.Threading.Tasks; using Microsoft.Extensions.DependencyInjection; using Orleans.Configuration; +using Orleans.Hosting; +using Orleans.Journaling; using Orleans.TestingHost; using Tester; using Tester.DurableJobs; @@ -129,3 +128,20 @@ public async Task JobRetry() await _runner.JobRetry(cts.Token); } } + +internal static class AzureBlobDurableJobsTestConfiguration +{ + public static AzureBlobJournalStorageOptions ConfigureTestDefaults(this AzureBlobJournalStorageOptions options) + { + if (TestDefaultConfiguration.UseAadAuthentication) + { + options.ConfigureBlobServiceClient(TestDefaultConfiguration.DataBlobUri, TestDefaultConfiguration.TokenCredential); + } + else + { + options.ConfigureBlobServiceClient(TestDefaultConfiguration.DataConnectionString); + } + + return options; + } +} diff --git a/test/Extensions/Orleans.DurableJobs.AzureStorage.Tests/Orleans.DurableJobs.AzureStorage.Tests.csproj b/test/Extensions/Orleans.DurableJobs.AzureStorage.Tests/Orleans.DurableJobs.AzureStorage.Tests.csproj new file mode 100644 index 00000000000..a52f7618b79 --- /dev/null +++ b/test/Extensions/Orleans.DurableJobs.AzureStorage.Tests/Orleans.DurableJobs.AzureStorage.Tests.csproj @@ -0,0 +1,32 @@ + + + true + TRACE;TESTER_AZUREUTILS;ORLEANS_PERSISTENCE + Exe + $(TestTargetFrameworks) + false + true + en-US + + + + $(NoWarn);ORLEANSEXP005 + + + + + + + all + runtime; build; native; contentfiles; analyzers; buildtransitive + + + + + + + + + + + diff --git a/test/Extensions/Orleans.DurableJobs.AzureStorage.Tests/Program.cs b/test/Extensions/Orleans.DurableJobs.AzureStorage.Tests/Program.cs new file mode 100644 index 00000000000..f979d5fb50e --- /dev/null +++ b/test/Extensions/Orleans.DurableJobs.AzureStorage.Tests/Program.cs @@ -0,0 +1,8 @@ +using Orleans.TestingHost; + +namespace Tester.AzureUtils.DurableJobs; + +public static class Program +{ + public static async Task Main(string[] args) => await StandaloneSiloHost.Main(args); +} diff --git a/test/Orleans.Core.Tests/DurableJobs/InMemoryJobQueueTests.cs b/test/Orleans.Core.Tests/DurableJobs/InMemoryJobQueueTests.cs index c9826818cfc..0026c4ef0dc 100644 --- a/test/Orleans.Core.Tests/DurableJobs/InMemoryJobQueueTests.cs +++ b/test/Orleans.Core.Tests/DurableJobs/InMemoryJobQueueTests.cs @@ -3,9 +3,10 @@ using System.Linq; using System.Threading; using System.Threading.Tasks; +using Microsoft.Extensions.Time.Testing; +using NSubstitute; using Orleans.DurableJobs; using Orleans.Runtime; -using NSubstitute; using Xunit; namespace NonSilo.Tests.DurableJobs; @@ -109,20 +110,23 @@ public async Task GetAsyncEnumerator_WithInitialDequeueCount_IncrementsCorrectly [Fact] public async Task GetAsyncEnumerator_WaitsForDueTime() { - var queue = new InMemoryJobQueue(); - var futureTime = DateTimeOffset.UtcNow.AddSeconds(2); + var timeProvider = new FakeTimeProvider(new DateTimeOffset(2026, 1, 1, 0, 0, 0, TimeSpan.Zero)); + var queue = new InMemoryJobQueue(timeProvider); + var futureTime = timeProvider.GetUtcNow().AddSeconds(1); var job = CreateJob("job1", futureTime); queue.Enqueue(job, 0); queue.MarkAsComplete(); - var startTime = DateTimeOffset.UtcNow; - await foreach (var context in queue.WithCancellation(CancellationToken.None)) - { - var elapsed = DateTimeOffset.UtcNow - startTime; - Assert.True(elapsed.TotalSeconds >= 1.5, $"Job was dequeued too early. Elapsed: {elapsed.TotalSeconds}s"); - break; - } + await using var enumerator = queue.GetAsyncEnumerator(CancellationToken.None); + var moveNextTask = enumerator.MoveNextAsync().AsTask(); + + Assert.False(moveNextTask.IsCompleted); + + timeProvider.Advance(TimeSpan.FromSeconds(3)); + + Assert.True(await moveNextTask.WaitAsync(TimeSpan.FromSeconds(5))); + Assert.Equal(job.Id, enumerator.Current.Job.Id); } [Fact] diff --git a/test/Orleans.Core.Tests/DurableJobs/InMemoryJobShardManagerTests.cs b/test/Orleans.Core.Tests/DurableJobs/InMemoryJobShardManagerTests.cs deleted file mode 100644 index bda6a42ecde..00000000000 --- a/test/Orleans.Core.Tests/DurableJobs/InMemoryJobShardManagerTests.cs +++ /dev/null @@ -1,282 +0,0 @@ -#nullable enable - -using System.Collections.Immutable; -using System.Net; -using Microsoft.Extensions.DependencyInjection; -using Orleans.DurableJobs; -using Orleans.Hosting; -using Orleans.Runtime; -using NSubstitute; -using Xunit; - -namespace NonSilo.Tests.DurableJobs; - -[TestCategory("DurableJobs")] -public class InMemoryJobShardManagerTests : IAsyncLifetime -{ - private static readonly SiloAddress Silo1 = SiloAddress.New(new IPEndPoint(IPAddress.Loopback, 5001), 1); - private static readonly SiloAddress Silo2 = SiloAddress.New(new IPEndPoint(IPAddress.Loopback, 5002), 2); - private static readonly SiloAddress Silo3 = SiloAddress.New(new IPEndPoint(IPAddress.Loopback, 5003), 3); - private static readonly SiloAddress Silo4 = SiloAddress.New(new IPEndPoint(IPAddress.Loopback, 5004), 4); - - public Task InitializeAsync() => InMemoryJobShardManager.ClearAllShardsAsync(); - - public Task DisposeAsync() => InMemoryJobShardManager.ClearAllShardsAsync(); - - [Fact] - public async Task CreateShardAsync_CreatesShardOwnedBySilo() - { - var manager = new InMemoryJobShardManager(Silo1); - var minDueTime = DateTimeOffset.UtcNow; - var maxDueTime = minDueTime.AddHours(1); - - var shard = await manager.CreateShardAsync(minDueTime, maxDueTime, new Dictionary(), CancellationToken.None); - - Assert.NotNull(shard); - Assert.Equal(minDueTime, shard.StartTime); - Assert.Equal(maxDueTime, shard.EndTime); - } - - [Fact] - public async Task AssignJobShardsAsync_ReturnsOwnedShards() - { - var manager = new InMemoryJobShardManager(Silo1); - var minDueTime = DateTimeOffset.UtcNow; - var maxDueTime = minDueTime.AddHours(1); - - var createdShard = await manager.CreateShardAsync(minDueTime, maxDueTime, new Dictionary(), CancellationToken.None); - var assignedShards = await manager.AssignJobShardsAsync(maxDueTime, int.MaxValue, CancellationToken.None); - - Assert.Single(assignedShards); - Assert.Equal(createdShard.Id, assignedShards[0].Id); - } - - [Fact] - public async Task AssignJobShardsAsync_OrphanedShard_IsAssignedWithoutIncrementingAdoptedCount() - { - // Silo1 creates a shard and gracefully releases it - var manager1 = new InMemoryJobShardManager(Silo1); - var minDueTime = DateTimeOffset.UtcNow; - var maxDueTime = minDueTime.AddHours(1); - - var shard = await manager1.CreateShardAsync(minDueTime, maxDueTime, new Dictionary(), CancellationToken.None); - - // Schedule a job so the shard isn't deleted on unregister - await shard.TryScheduleJobAsync(new ScheduleJobRequest { Target = GrainId.Create("test", "grain1"), JobName = "TestJob", DueTime = minDueTime.AddMinutes(30), Metadata = null }, CancellationToken.None); - - // Gracefully unregister (sets owner to null) - await manager1.UnregisterShardAsync(shard, CancellationToken.None); - - // Silo2 picks up the orphaned shard - var manager2 = new InMemoryJobShardManager(Silo2); - var assignedShards = await manager2.AssignJobShardsAsync(maxDueTime, int.MaxValue, CancellationToken.None); - - Assert.Single(assignedShards); - Assert.Equal(shard.Id, assignedShards[0].Id); - - var ownershipInfo = await InMemoryJobShardManager.GetOwnershipInfoAsync(shard.Id); - Assert.True(ownershipInfo.HasValue); - Assert.Equal(Silo2.ToString(), ownershipInfo.Value.Owner); - Assert.Equal(0, ownershipInfo.Value.AdoptedCount); - } - - [Fact] - public async Task AssignJobShardsAsync_AdoptedFromDeadSilo_IncrementsAdoptedCount() - { - // Setup membership service that reports Silo1 as dead - var membershipService = CreateMembershipService(deadSilos: [Silo1]); - - // Silo1 creates a shard (simulating it was created before death) - var manager1 = new InMemoryJobShardManager(Silo1, membershipService); - var minDueTime = DateTimeOffset.UtcNow; - var maxDueTime = minDueTime.AddHours(1); - - var shard = await manager1.CreateShardAsync(minDueTime, maxDueTime, new Dictionary(), CancellationToken.None); - - // Silo2 adopts the shard from dead Silo1 - var manager2 = new InMemoryJobShardManager(Silo2, membershipService, maxAdoptedCount: 3); - var assignedShards = await manager2.AssignJobShardsAsync(maxDueTime, int.MaxValue, CancellationToken.None); - - // Shard should be assigned (adopted count = 1, under threshold) - Assert.Single(assignedShards); - Assert.Equal(shard.Id, assignedShards[0].Id); - - var ownershipInfo = await InMemoryJobShardManager.GetOwnershipInfoAsync(shard.Id); - Assert.True(ownershipInfo.HasValue); - Assert.Equal(Silo2.ToString(), ownershipInfo.Value.Owner); - Assert.Equal(1, ownershipInfo.Value.AdoptedCount); - } - - [Fact] - public async Task AssignJobShardsAsync_PoisonedShard_IsNotAssigned() - { - // Setup membership service - var membershipService = Substitute.For(); - var snapshot = CreateMembershipSnapshot(deadSilos: [Silo1, Silo2, Silo3]); - membershipService.CurrentSnapshot.Returns(snapshot); - - // Silo1 creates a shard - var manager1 = new InMemoryJobShardManager(Silo1, membershipService, maxAdoptedCount: 2); - var minDueTime = DateTimeOffset.UtcNow; - var maxDueTime = minDueTime.AddHours(1); - - await manager1.CreateShardAsync(minDueTime, maxDueTime, new Dictionary(), CancellationToken.None); - - // Silo2 adopts from dead Silo1 (adopted count = 1) - var manager2 = new InMemoryJobShardManager(Silo2, membershipService, maxAdoptedCount: 2); - var shards2 = await manager2.AssignJobShardsAsync(maxDueTime, int.MaxValue, CancellationToken.None); - Assert.Single(shards2); - - // Silo3 adopts from dead Silo2 (adopted count = 2) - var manager3 = new InMemoryJobShardManager(Silo3, membershipService, maxAdoptedCount: 2); - var shards3 = await manager3.AssignJobShardsAsync(maxDueTime, int.MaxValue, CancellationToken.None); - Assert.Single(shards3); - - // Silo4 tries to adopt from dead Silo3 (adopted count would be 3, exceeds max of 2) - var manager4 = new InMemoryJobShardManager(Silo4, membershipService, maxAdoptedCount: 2); - var shards4 = await manager4.AssignJobShardsAsync(maxDueTime, int.MaxValue, CancellationToken.None); - - // Shard is poisoned and should not be assigned - Assert.Empty(shards4); - } - - [Fact] - public async Task AssignJobShardsAsync_MaxAdoptedCountOfZero_NeverAssignsAdoptedShards() - { - // Setup membership service that reports Silo1 as dead - var membershipService = CreateMembershipService(deadSilos: [Silo1]); - - // Silo1 creates a shard - var manager1 = new InMemoryJobShardManager(Silo1, membershipService, maxAdoptedCount: 0); - var minDueTime = DateTimeOffset.UtcNow; - var maxDueTime = minDueTime.AddHours(1); - - await manager1.CreateShardAsync(minDueTime, maxDueTime, new Dictionary(), CancellationToken.None); - - // Silo2 tries to adopt from dead Silo1 with maxAdoptedCount=0 - var manager2 = new InMemoryJobShardManager(Silo2, membershipService, maxAdoptedCount: 0); - var assignedShards = await manager2.AssignJobShardsAsync(maxDueTime, int.MaxValue, CancellationToken.None); - - // Shard should not be assigned (adopted count would be 1, exceeds max of 0) - Assert.Empty(assignedShards); - } - - [Fact] - public async Task UseInMemoryDurableJobs_ConfiguredMaxAdoptedCount_IsApplied() - { - var membershipService = CreateMembershipService(deadSilos: [Silo2]); - var minDueTime = DateTimeOffset.UtcNow; - var maxDueTime = minDueTime.AddHours(1); - - var ownerManager = new InMemoryJobShardManager(Silo2, membershipService, maxAdoptedCount: 3); - await ownerManager.CreateShardAsync(minDueTime, maxDueTime, new Dictionary(), CancellationToken.None); - - var localSiloDetails = Substitute.For(); - localSiloDetails.SiloAddress.Returns(Silo1); - - var services = new ServiceCollection(); - services.AddSingleton(localSiloDetails); - services.AddSingleton(membershipService); - services.Configure(options => options.MaxAdoptedCount = 0); - services.UseInMemoryDurableJobs(); - - using var serviceProvider = services.BuildServiceProvider(); - var manager = serviceProvider.GetRequiredService(); - - var assignedShards = await manager.AssignJobShardsAsync(maxDueTime, int.MaxValue, CancellationToken.None); - Assert.Empty(assignedShards); - } - - [Fact] - public async Task AssignJobShardsAsync_ShardFromActiveSilo_IsNotAssigned() - { - // Setup membership service that reports Silo1 as active - var membershipService = CreateMembershipService(activeSilos: [Silo1]); - - // Silo1 creates a shard - var manager1 = new InMemoryJobShardManager(Silo1, membershipService); - var minDueTime = DateTimeOffset.UtcNow; - var maxDueTime = minDueTime.AddHours(1); - - await manager1.CreateShardAsync(minDueTime, maxDueTime, new Dictionary(), CancellationToken.None); - - // Silo2 tries to get shards - should not get Silo1's shard since Silo1 is active - var manager2 = new InMemoryJobShardManager(Silo2, membershipService); - var assignedShards = await manager2.AssignJobShardsAsync(maxDueTime, int.MaxValue, CancellationToken.None); - - Assert.Empty(assignedShards); - } - - [Fact] - public async Task UnregisterShardAsync_WithNoJobsRemaining_RemovesShard() - { - var manager = new InMemoryJobShardManager(Silo1); - var minDueTime = DateTimeOffset.UtcNow; - var maxDueTime = minDueTime.AddHours(1); - - var shard = await manager.CreateShardAsync(minDueTime, maxDueTime, new Dictionary(), CancellationToken.None); - - // Unregister with no jobs - await manager.UnregisterShardAsync(shard, CancellationToken.None); - - // Shard should be removed, not reassignable - var assignedShards = await manager.AssignJobShardsAsync(maxDueTime, int.MaxValue, CancellationToken.None); - Assert.Empty(assignedShards); - } - - [Fact] - public async Task UnregisterShardAsync_WithJobsRemaining_MarksShardAsOrphaned() - { - var manager1 = new InMemoryJobShardManager(Silo1); - var minDueTime = DateTimeOffset.UtcNow; - var maxDueTime = minDueTime.AddHours(1); - - var shard = await manager1.CreateShardAsync(minDueTime, maxDueTime, new Dictionary(), CancellationToken.None); - - // Add a job - await shard.TryScheduleJobAsync(new ScheduleJobRequest { Target = GrainId.Create("test", "grain1"), JobName = "TestJob", DueTime = minDueTime.AddMinutes(30), Metadata = null }, CancellationToken.None); - - // Unregister with jobs remaining - await manager1.UnregisterShardAsync(shard, CancellationToken.None); - - // Shard should be orphaned and available for another silo - var manager2 = new InMemoryJobShardManager(Silo2); - var assignedShards = await manager2.AssignJobShardsAsync(maxDueTime, int.MaxValue, CancellationToken.None); - Assert.Single(assignedShards); - } - - private static IClusterMembershipService CreateMembershipService( - SiloAddress[]? activeSilos = null, - SiloAddress[]? deadSilos = null) - { - var membershipService = Substitute.For(); - var snapshot = CreateMembershipSnapshot(activeSilos, deadSilos); - membershipService.CurrentSnapshot.Returns(snapshot); - return membershipService; - } - - private static ClusterMembershipSnapshot CreateMembershipSnapshot( - SiloAddress[]? activeSilos = null, - SiloAddress[]? deadSilos = null) - { - var builder = ImmutableDictionary.CreateBuilder(); - - if (activeSilos is not null) - { - foreach (var silo in activeSilos) - { - builder[silo] = new ClusterMember(silo, SiloStatus.Active, silo.ToString()); - } - } - - if (deadSilos is not null) - { - foreach (var silo in deadSilos) - { - builder[silo] = new ClusterMember(silo, SiloStatus.Dead, silo.ToString()); - } - } - - return new ClusterMembershipSnapshot(builder.ToImmutable(), new MembershipVersion(1)); - } -} diff --git a/test/Orleans.Core.Tests/DurableJobs/LocalDurableJobManagerTests.cs b/test/Orleans.Core.Tests/DurableJobs/LocalDurableJobManagerTests.cs new file mode 100644 index 00000000000..4accc48374b --- /dev/null +++ b/test/Orleans.Core.Tests/DurableJobs/LocalDurableJobManagerTests.cs @@ -0,0 +1,526 @@ +#nullable enable +#pragma warning disable ORLEANSEXP005 + +using System.Collections.Immutable; +using System.Net; +using System.Runtime.CompilerServices; +using Microsoft.Extensions.Configuration; +using Microsoft.Extensions.DependencyInjection; +using Microsoft.Extensions.Logging.Abstractions; +using Microsoft.Extensions.Options; +using Microsoft.Extensions.Time.Testing; +using NSubstitute; +using Orleans.Configuration; +using Orleans.DurableJobs; +using Orleans.Hosting; +using Orleans.Journaling; +using Orleans.Journaling.Json; +using Orleans.Runtime; +using Orleans.Runtime.Messaging; +using Xunit; + +namespace NonSilo.Tests.DurableJobs; + +[TestCategory("BVT"), TestCategory("DurableJobs")] +public class LocalDurableJobManagerTests +{ + [Fact] + public async Task ProcessShardCheckCycleAsync_MarksExpiredWritableShardComplete() + { + var timeProvider = new FakeTimeProvider(new DateTimeOffset(2026, 1, 1, 0, 0, 0, TimeSpan.Zero)); + var options = CreateOptions(); + var shardManager = new TestJobShardManager(); + var manager = CreateManager(shardManager, timeProvider, options); + var accessor = new LocalDurableJobManager.TestAccessor(manager); + var shardKey = timeProvider.GetUtcNow().Subtract(options.ShardDuration * 2); + var shard = CreateSubstituteShard("expired-shard", shardKey, shardKey.Add(options.ShardDuration)); + + accessor.AddWritableShard(shardKey, shard); + + await accessor.ProcessShardCheckCycleAsync(CancellationToken.None); + + Assert.False(accessor.HasWritableShard(shardKey)); + await shard.Received(1).MarkAsCompleteAsync(Arg.Any()); + Assert.Equal(timeProvider.GetUtcNow().AddHours(1), shardManager.LastMaxDueTime); + } + + [Fact] + public async Task ProcessShardCheckCycleAsync_LeavesNonExpiredWritableShardOpen() + { + var timeProvider = new FakeTimeProvider(new DateTimeOffset(2026, 1, 1, 0, 0, 0, TimeSpan.Zero)); + var options = CreateOptions(); + var shardManager = new TestJobShardManager(); + var manager = CreateManager(shardManager, timeProvider, options); + var accessor = new LocalDurableJobManager.TestAccessor(manager); + var shardKey = timeProvider.GetUtcNow(); + var shard = CreateSubstituteShard("active-shard", shardKey, shardKey.Add(options.ShardDuration)); + + accessor.AddWritableShard(shardKey, shard); + + await accessor.ProcessShardCheckCycleAsync(CancellationToken.None); + + Assert.True(accessor.HasWritableShard(shardKey)); + await shard.DidNotReceive().MarkAsCompleteAsync(Arg.Any()); + } + + [Fact] + public async Task ExpiredWritableShard_DrainsThenUnregistersAndDisposes() + { + var timeProvider = new FakeTimeProvider(new DateTimeOffset(2026, 1, 1, 0, 0, 0, TimeSpan.Zero)); + var options = CreateOptions(); + var shardManager = new TestJobShardManager(); + var manager = CreateManager(shardManager, timeProvider, options); + var accessor = new LocalDurableJobManager.TestAccessor(manager); + var shardKey = timeProvider.GetUtcNow().Subtract(options.ShardDuration * 2); + var shard = new CompletingShard("draining-shard", shardKey, shardKey.Add(options.ShardDuration)); + + accessor.AddWritableShard(shardKey, shard); + accessor.TryActivateShard(shard); + + Assert.True(accessor.TryGetRunningShardTask(shard.Id, out var runTask)); + + await accessor.ProcessShardCheckCycleAsync(CancellationToken.None); + await runTask!.WaitAsync(TimeSpan.FromSeconds(5)); + + Assert.Equal(1, shard.MarkAsCompleteCallCount); + Assert.Equal(1, shard.DisposeCallCount); + Assert.Same(shard, Assert.Single(shardManager.UnregisteredShards)); + Assert.False(accessor.HasCachedShard(shard.Id)); + Assert.False(accessor.TryGetRunningShardTask(shard.Id, out _)); + } + + [Fact] + public async Task AssignedShardActivation_UsesTimeProvider() + { + var timeProvider = new FakeTimeProvider(new DateTimeOffset(2026, 1, 1, 0, 0, 0, TimeSpan.Zero)); + var options = CreateOptions(); + options.ShardActivationBufferPeriod = TimeSpan.FromMinutes(5); + var shardManager = new TestJobShardManager(); + var manager = CreateManager(shardManager, timeProvider, options); + var accessor = new LocalDurableJobManager.TestAccessor(manager); + var shard = new CompletingShard( + "future-shard", + timeProvider.GetUtcNow().AddMinutes(10), + timeProvider.GetUtcNow().AddMinutes(11)); + shardManager.AssignedShards.Add(shard); + + await accessor.ProcessShardCheckCycleAsync(CancellationToken.None); + + Assert.False(accessor.TryGetRunningShardTask(shard.Id, out _)); + + timeProvider.Advance(TimeSpan.FromMinutes(11)); + + await accessor.ProcessShardCheckCycleAsync(CancellationToken.None); + + Assert.True(accessor.TryGetRunningShardTask(shard.Id, out var runTask)); + + await shard.ConsumeStarted.Task.WaitAsync(TimeSpan.FromSeconds(5)); + await shard.MarkAsCompleteAsync(CancellationToken.None); + await runTask!.WaitAsync(TimeSpan.FromSeconds(5)); + } + + [Fact] + public async Task ScheduleJobAsync_WhenExpiryWaitsBehindScheduling_CompletesShardAfterJobIsAccepted() + { + var timeProvider = new FakeTimeProvider(new DateTimeOffset(2026, 1, 1, 0, 0, 0, TimeSpan.Zero)); + var options = CreateOptions(); + var shardManager = new TestJobShardManager(); + var manager = CreateManager(shardManager, timeProvider, options); + var accessor = new LocalDurableJobManager.TestAccessor(manager); + var shardKey = timeProvider.GetUtcNow().Subtract(options.ShardDuration * 2); + var shard = new GateableSchedulingShard("expiring-shard", shardKey, shardKey.Add(options.ShardDuration)); + + accessor.AddWritableShard(shardKey, shard); + + var scheduleTask = manager.ScheduleJobAsync(new() + { + Target = GrainId.Create("test", "target"), + JobName = "late-job", + DueTime = shardKey + }, CancellationToken.None); + + await shard.ScheduleStarted.Task.WaitAsync(TimeSpan.FromSeconds(5)); + + var cycleTask = accessor.ProcessShardCheckCycleAsync(CancellationToken.None); + + Assert.False(cycleTask.IsCompleted); + + shard.AllowScheduleToFinish.SetResult(); + + var job = await scheduleTask.WaitAsync(TimeSpan.FromSeconds(5)); + await cycleTask.WaitAsync(TimeSpan.FromSeconds(5)); + + Assert.Equal("late-job", job.Name); + Assert.Equal(1, shard.MarkAsCompleteCallCount); + Assert.True(shard.IsAddingCompleted); + Assert.False(accessor.HasWritableShard(shardKey)); + } + + [Fact] + public async Task ExpiredJournaledShard_DrainsUnregistersAndDeletesStorage() + { + var timeProvider = new FakeTimeProvider(new DateTimeOffset(2026, 1, 1, 0, 0, 0, TimeSpan.Zero)); + var options = CreateOptions(); + options.ShardDuration = TimeSpan.FromSeconds(1); + var storageProvider = new VolatileJournalStorageProvider(); + await using var services = CreateJournaledServices(storageProvider, timeProvider); + var siloAddress = SiloAddress.New(new IPEndPoint(IPAddress.Loopback, 5010), 0); + var localSiloDetails = new TestLocalSiloDetails(siloAddress); + var membership = new TestClusterMembershipService(); + membership.SetSiloStatus(siloAddress, SiloStatus.Active); + var optionsWrapper = Options.Create(options); + var journaledShardManager = new JournaledJobShardManager( + localSiloDetails, + services.GetRequiredService(), + services.GetRequiredService(), + services.GetRequiredService(), + membership, + services, + optionsWrapper, + services.GetRequiredService>()); + var (grainFactory, handledJob) = CreateCompletingGrainFactory(); + var overloadDetector = Substitute.For(); + overloadDetector.IsOverloaded.Returns(false); + var shardExecutor = new ShardExecutor( + grainFactory, + optionsWrapper, + overloadDetector, + NullLogger.Instance, + timeProvider); + var manager = new LocalDurableJobManager( + journaledShardManager, + shardExecutor, + grainFactory, + membership, + overloadDetector, + timeProvider, + optionsWrapper, + CreateSystemTargetShared(localSiloDetails), + NullLogger.Instance); + var accessor = new LocalDurableJobManager.TestAccessor(manager); + var job = await manager.ScheduleJobAsync(new() + { + Target = GrainId.Create("test", "target"), + JobName = "journaled-job", + DueTime = timeProvider.GetUtcNow() + }, CancellationToken.None); + + Assert.True(accessor.TryGetRunningShardTask(job.ShardId, out var runTask)); + + timeProvider.Advance(TimeSpan.FromSeconds(3)); + await accessor.ProcessShardCheckCycleAsync(CancellationToken.None); + + var jobContext = await handledJob.Task.WaitAsync(TimeSpan.FromSeconds(5)); + Assert.Equal(job.Id, jobContext.Job.Id); + await AdvanceUntilCompletedAsync(timeProvider, runTask!, TimeSpan.FromSeconds(1)); + await runTask!.WaitAsync(TimeSpan.FromSeconds(5)); + Assert.Null(await storageProvider.CreateStorage(JobShardId.Parse(job.ShardId).ToJournalId()).GetMetadataAsync()); + } + + private static DurableJobsOptions CreateOptions() => new() + { + ShardDuration = TimeSpan.FromMinutes(1), + ShardActivationBufferPeriod = TimeSpan.Zero, + ShardClaimRampUpDuration = TimeSpan.Zero, + ConcurrencySlowStartEnabled = false, + MaxConcurrentJobsPerSilo = 10 + }; + + private static LocalDurableJobManager CreateManager( + JobShardManager shardManager, + FakeTimeProvider timeProvider, + DurableJobsOptions options) + { + var siloAddress = SiloAddress.New(new IPEndPoint(IPAddress.Loopback, 5000), 0); + var localSiloDetails = new TestLocalSiloDetails(siloAddress); + var grainFactory = Substitute.For(); + var overloadDetector = Substitute.For(); + overloadDetector.IsOverloaded.Returns(false); + var optionsWrapper = Options.Create(options); + var shardExecutor = new ShardExecutor( + grainFactory, + optionsWrapper, + overloadDetector, + NullLogger.Instance, + timeProvider); + + return new LocalDurableJobManager( + shardManager, + shardExecutor, + grainFactory, + new TestClusterMembershipService(), + overloadDetector, + timeProvider, + optionsWrapper, + CreateSystemTargetShared(localSiloDetails), + NullLogger.Instance); + } + + private static SystemTargetShared CreateSystemTargetShared(ILocalSiloDetails localSiloDetails) => new( + runtimeClient: null!, + localSiloDetails, + NullLoggerFactory.Instance, + Options.Create(new SchedulingOptions()), + grainReferenceActivator: null!, + timerRegistry: null!, + activations: new ActivationDirectory(), + schedulerInstruments: CreateSchedulerInstruments()); + + private static SchedulerInstruments CreateSchedulerInstruments() + { + var services = new ServiceCollection(); + services.AddMetrics(); + services.AddSingleton(); + services.AddSingleton(); + return services.BuildServiceProvider().GetRequiredService(); + } + + private static IJobShard CreateSubstituteShard(string id, DateTimeOffset start, DateTimeOffset end) + { + var shard = Substitute.For(); + shard.Id.Returns(id); + shard.StartTime.Returns(start); + shard.EndTime.Returns(end); + shard.MarkAsCompleteAsync(Arg.Any()).Returns(Task.CompletedTask); + return shard; + } + + private static ServiceProvider CreateJournaledServices(VolatileJournalStorageProvider storageProvider, TimeProvider timeProvider) + { + var builder = new TestSiloBuilder(); + builder.AddJournalStorage(); + builder.UseJsonJournalFormat(options => options.AddTypeInfoResolver(DurableJobsJsonContext.Default)); + builder.Services.AddLogging(); + builder.Services.AddSingleton(timeProvider); + builder.Services.AddSingleton(storageProvider); + builder.Services.AddSingleton(storageProvider); + return builder.Services.BuildServiceProvider(); + } + + private static (IInternalGrainFactory GrainFactory, TaskCompletionSource HandledJob) CreateCompletingGrainFactory() + { + var handledJob = new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously); + var grainFactory = Substitute.For(); + var extension = Substitute.For(); + extension.HandleDurableJobAsync(Arg.Any(), Arg.Any()) + .Returns(callInfo => + { + handledJob.TrySetResult(callInfo.ArgAt(0)); + return Task.FromResult(DurableJobRunResult.Completed); + }); + grainFactory.GetGrain(Arg.Any()).Returns(extension); + return (grainFactory, handledJob); + } + + private static async Task AdvanceUntilCompletedAsync(FakeTimeProvider timeProvider, Task task, TimeSpan advanceBy) + { + for (var i = 0; i < 10 && !task.IsCompleted; i++) + { + await Task.Yield(); + timeProvider.Advance(advanceBy); + } + } + + private sealed class CompletingShard(string id, DateTimeOffset start, DateTimeOffset end) : IJobShard + { + private readonly TaskCompletionSource _completed = new(TaskCreationOptions.RunContinuationsAsynchronously); + + public int MarkAsCompleteCallCount; + public int DisposeCallCount; + public TaskCompletionSource ConsumeStarted { get; } = new(TaskCreationOptions.RunContinuationsAsynchronously); + + public string Id { get; } = id; + + public DateTimeOffset StartTime { get; } = start; + + public DateTimeOffset EndTime { get; } = end; + + public IDictionary? Metadata => null; + + public bool IsAddingCompleted => _completed.Task.IsCompleted; + + public IAsyncEnumerable ConsumeDurableJobsAsync() => ConsumeAsync(); + + public ValueTask GetJobCountAsync() => ValueTask.FromResult(0); + + public Task MarkAsCompleteAsync(CancellationToken cancellationToken) + { + Interlocked.Increment(ref MarkAsCompleteCallCount); + _completed.TrySetResult(); + return Task.CompletedTask; + } + + public Task RemoveJobAsync(string jobId, CancellationToken cancellationToken) => Task.FromResult(false); + + public Task RetryJobLaterAsync(IJobRunContext jobContext, DateTimeOffset newDueTime, CancellationToken cancellationToken) => Task.CompletedTask; + + public Task TryScheduleJobAsync(ScheduleJobRequest request, CancellationToken cancellationToken) => Task.FromResult(null); + + public ValueTask DisposeAsync() + { + Interlocked.Increment(ref DisposeCallCount); + return ValueTask.CompletedTask; + } + + private async IAsyncEnumerable ConsumeAsync([EnumeratorCancellation] CancellationToken cancellationToken = default) + { + ConsumeStarted.TrySetResult(); + await _completed.Task.WaitAsync(cancellationToken); + yield break; + } + } + + private sealed class GateableSchedulingShard(string id, DateTimeOffset start, DateTimeOffset end) : IJobShard + { + private readonly SemaphoreSlim _lock = new(1, 1); + private bool _completed; + + public TaskCompletionSource ScheduleStarted { get; } = new(TaskCreationOptions.RunContinuationsAsynchronously); + + public TaskCompletionSource AllowScheduleToFinish { get; } = new(TaskCreationOptions.RunContinuationsAsynchronously); + + public int MarkAsCompleteCallCount; + + public string Id { get; } = id; + + public DateTimeOffset StartTime { get; } = start; + + public DateTimeOffset EndTime { get; } = end; + + public IDictionary? Metadata => null; + + public bool IsAddingCompleted => _completed; + + public IAsyncEnumerable ConsumeDurableJobsAsync() => ConsumeAsync(); + + public ValueTask GetJobCountAsync() => ValueTask.FromResult(0); + + public async Task MarkAsCompleteAsync(CancellationToken cancellationToken) + { + await _lock.WaitAsync(cancellationToken); + try + { + Interlocked.Increment(ref MarkAsCompleteCallCount); + _completed = true; + } + finally + { + _lock.Release(); + } + } + + public Task RemoveJobAsync(string jobId, CancellationToken cancellationToken) => Task.FromResult(false); + + public Task RetryJobLaterAsync(IJobRunContext jobContext, DateTimeOffset newDueTime, CancellationToken cancellationToken) => Task.CompletedTask; + + public async Task TryScheduleJobAsync(ScheduleJobRequest request, CancellationToken cancellationToken) + { + await _lock.WaitAsync(cancellationToken); + try + { + ScheduleStarted.TrySetResult(); + await AllowScheduleToFinish.Task.WaitAsync(cancellationToken); + if (_completed) + { + return null; + } + + return new DurableJob + { + Id = Guid.NewGuid().ToString(), + Name = request.JobName, + DueTime = request.DueTime, + TargetGrainId = request.Target, + ShardId = Id, + Metadata = request.Metadata + }; + } + finally + { + _lock.Release(); + } + } + + public ValueTask DisposeAsync() + { + _lock.Dispose(); + return ValueTask.CompletedTask; + } + + private static async IAsyncEnumerable ConsumeAsync() + { + await Task.CompletedTask; + yield break; + } + } + + private sealed class TestJobShardManager() : JobShardManager(SiloAddress.New(new IPEndPoint(IPAddress.Loopback, 5000), 0)) + { + public List AssignedShards { get; } = []; + + public List UnregisteredShards { get; } = []; + + public DateTimeOffset LastMaxDueTime { get; private set; } + + public override Task> AssignJobShardsAsync(DateTimeOffset maxDueTime, int maxNewClaims, CancellationToken cancellationToken) + { + LastMaxDueTime = maxDueTime; + return Task.FromResult(AssignedShards.ToList()); + } + + public override Task CreateShardAsync(DateTimeOffset minDueTime, DateTimeOffset maxDueTime, IDictionary metadata, CancellationToken cancellationToken) + => throw new NotSupportedException(); + + public override Task UnregisterShardAsync(IJobShard shard, CancellationToken cancellationToken) + { + UnregisteredShards.Add(shard); + return Task.CompletedTask; + } + } + + private sealed class TestLocalSiloDetails(SiloAddress siloAddress) : ILocalSiloDetails + { + public string Name => SiloAddress.ToParsableString(); + + public string ClusterId => "TestCluster"; + + public string DnsHostName => SiloAddress.ToParsableString(); + + public SiloAddress SiloAddress { get; } = siloAddress; + + public SiloAddress GatewayAddress => SiloAddress; + } + + private sealed class TestClusterMembershipService : IClusterMembershipService + { + private ImmutableDictionary _members = ImmutableDictionary.Empty; + private long _version; + + public ClusterMembershipSnapshot CurrentSnapshot => new(_members, new MembershipVersion(_version)); + + public IAsyncEnumerable MembershipUpdates => GetMembershipUpdates(); + + public void SetSiloStatus(SiloAddress siloAddress, SiloStatus status) + { + _members = _members.SetItem(siloAddress, new ClusterMember(siloAddress, status, siloAddress.ToParsableString())); + _version++; + } + + public ValueTask Refresh(MembershipVersion minimumVersion = default, CancellationToken cancellationToken = default) => ValueTask.CompletedTask; + + public Task TryKill(SiloAddress siloAddress) => Task.FromResult(false); + + private static async IAsyncEnumerable GetMembershipUpdates() + { + await Task.CompletedTask; + yield break; + } + } + + private sealed class TestSiloBuilder : ISiloBuilder + { + public IServiceCollection Services { get; } = new ServiceCollection(); + + public IConfiguration Configuration { get; } = new ConfigurationBuilder().Build(); + } +} diff --git a/test/Orleans.Core.Tests/DurableJobs/ShardExecutorTests.cs b/test/Orleans.Core.Tests/DurableJobs/ShardExecutorTests.cs index ca355780ded..d443113dfad 100644 --- a/test/Orleans.Core.Tests/DurableJobs/ShardExecutorTests.cs +++ b/test/Orleans.Core.Tests/DurableJobs/ShardExecutorTests.cs @@ -1,9 +1,10 @@ +using System.Runtime.CompilerServices; using Microsoft.Extensions.Logging.Abstractions; using Microsoft.Extensions.Options; +using Microsoft.Extensions.Time.Testing; using NSubstitute; using Orleans.DurableJobs; using Orleans.Runtime.Messaging; -using System.Runtime.CompilerServices; using Xunit; namespace NonSilo.Tests.ScheduledJobs; @@ -289,6 +290,88 @@ public async Task RunShardAsync_WhenJobReturnsPollAfter_EntersPollingLoopUntilCo await shard.Received(1).RemoveJobAsync(Arg.Any(), Arg.Any()); } + [Fact] + public async Task RunShardAsync_WhenJobReturnsPollAfter_UsesTimeProvider() + { + var timeProvider = new FakeTimeProvider(new DateTimeOffset(2026, 1, 1, 0, 0, 0, TimeSpan.Zero)); + var options = CreateOptions(maxConcurrentJobs: 10); + var overloadDetector = CreateOverloadDetector(isOverloaded: false); + var jobs = CreateJobs(1, timeProvider.GetUtcNow().AddSeconds(-1)); + var shard = CreateJobShard(jobs, startTime: timeProvider.GetUtcNow().AddMinutes(-1)); + var firstCall = new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously); + var secondCall = new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously); + var callCount = 0; + var grainFactory = Substitute.For(); + var extension = Substitute.For(); + extension.HandleDurableJobAsync(Arg.Any(), Arg.Any()) + .Returns(_ => + { + var currentCall = Interlocked.Increment(ref callCount); + if (currentCall == 1) + { + firstCall.SetResult(); + return Task.FromResult(DurableJobRunResult.PollAfter(TimeSpan.FromSeconds(5))); + } + + secondCall.SetResult(); + return Task.FromResult(DurableJobRunResult.Completed); + }); + grainFactory.GetGrain(Arg.Any()).Returns(extension); + var executor = new ShardExecutor(grainFactory, options, overloadDetector, NullLogger.Instance, timeProvider); + + var runTask = executor.RunShardAsync(shard, CancellationToken.None); + + await firstCall.Task.WaitAsync(TimeSpan.FromSeconds(5)); + Assert.False(secondCall.Task.IsCompleted); + + timeProvider.Advance(TimeSpan.FromSeconds(5)); + + await secondCall.Task.WaitAsync(TimeSpan.FromSeconds(5)); + await runTask.WaitAsync(TimeSpan.FromSeconds(5)); + await shard.Received(1).RemoveJobAsync(Arg.Any(), Arg.Any()); + } + + [Fact] + public async Task RunShardAsync_WhenOverloaded_UsesTimeProviderForBackoff() + { + var timeProvider = new FakeTimeProvider(new DateTimeOffset(2026, 1, 1, 0, 0, 0, TimeSpan.Zero)); + var options = CreateOptions(maxConcurrentJobs: 10, overloadBackoffDelay: TimeSpan.FromSeconds(5)); + var overloadDetector = Substitute.For(); + var overloaded = true; + var delayStarted = new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously); + var overloadChecks = 0; + overloadDetector.IsOverloaded.Returns(_ => + { + if (Interlocked.Increment(ref overloadChecks) == 2) + { + delayStarted.SetResult(); + } + + return Volatile.Read(ref overloaded); + }); + var jobs = CreateJobs(1, timeProvider.GetUtcNow().AddSeconds(-1)); + var shard = CreateJobShard(jobs, startTime: timeProvider.GetUtcNow().AddMinutes(-1)); + var grainFactory = CreateGrainFactory(); + var jobHandled = new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously); + ConfigureGrainFactoryWithSlowJobExecution(grainFactory, () => + { + jobHandled.SetResult(); + return Task.CompletedTask; + }); + var executor = new ShardExecutor(grainFactory, options, overloadDetector, NullLogger.Instance, timeProvider); + + var runTask = executor.RunShardAsync(shard, CancellationToken.None); + + await delayStarted.Task.WaitAsync(TimeSpan.FromSeconds(5)); + Assert.False(jobHandled.Task.IsCompleted); + + Volatile.Write(ref overloaded, false); + await AdvanceUntilCompletedAsync(timeProvider, jobHandled.Task, options.Value.OverloadBackoffDelay); + + await jobHandled.Task.WaitAsync(TimeSpan.FromSeconds(5)); + await runTask.WaitAsync(TimeSpan.FromSeconds(5)); + } + [Fact] public async Task RunShardAsync_WhenJobReturnsPollAfterThenFails_HandlesFailureCorrectly() { @@ -526,6 +609,15 @@ private static IOverloadDetector CreateOverloadDetector(bool isOverloaded) return detector; } + private static async Task AdvanceUntilCompletedAsync(FakeTimeProvider timeProvider, Task task, TimeSpan advanceBy) + { + for (var i = 0; i < 10 && !task.IsCompleted; i++) + { + await Task.Yield(); + timeProvider.Advance(advanceBy); + } + } + private static List CreateJobs(int count, DateTimeOffset? dueTime = null) { var jobs = new List(); diff --git a/test/Orleans.DurableJobs.Tests/DurableJobs/IJobShardManagerTestFixture.cs b/test/Orleans.DurableJobs.Tests/DurableJobs/IJobShardManagerTestFixture.cs index acd09191e18..3953a334546 100644 --- a/test/Orleans.DurableJobs.Tests/DurableJobs/IJobShardManagerTestFixture.cs +++ b/test/Orleans.DurableJobs.Tests/DurableJobs/IJobShardManagerTestFixture.cs @@ -1,23 +1,158 @@ -using System; -using System.Collections.Generic; -using System.Threading.Tasks; +#nullable enable + +using System.Collections.Immutable; +using System.Net; +using Microsoft.Extensions.Configuration; +using Microsoft.Extensions.DependencyInjection; +using Microsoft.Extensions.Options; +using Orleans.Configuration.Internal; +using Orleans.Hosting; +using Orleans.Journaling; +using Orleans.Journaling.Json; using Orleans.Runtime; -using Orleans.DurableJobs; -namespace Tester.DurableJobs; +namespace Orleans.DurableJobs.Tests; + +public interface IJobShardManagerTestFixture +{ + Task CreateScopeAsync(); +} + +public interface IJobShardManagerTestScope : IAsyncDisposable +{ + TestSilo ActiveSilo { get; } + + TestSilo SecondActiveSilo { get; } + + TestSilo ThirdActiveSilo { get; } + + TestSilo FormerOwnerSilo { get; } + + DateTimeOffset Now { get; } + + JobShardManager CreateManager(TestSilo silo, DurableJobsOptions? options = null); -/// -/// Defines the contract for provider-specific test fixtures used by . -/// Each provider implementation (Azure, InMemory, etc.) should implement this interface to provide -/// the necessary infrastructure for running shared job shard manager tests. -/// -public interface IJobShardManagerTestFixture : IAsyncDisposable + void SetSiloStatus(TestSilo silo, SiloStatus status); +} + +public sealed record TestSilo(SiloAddress SiloAddress); + +public sealed class VolatileJobShardManagerTestFixture : IJobShardManagerTestFixture { - /// - /// Creates a new instance for the specified silo. - /// - /// The local silo details. - /// The cluster membership service for the manager. - /// A configured job shard manager instance. - JobShardManager CreateManager(ILocalSiloDetails localSiloDetails, IClusterMembershipService membershipService); + public Task CreateScopeAsync() + { + var builder = new TestSiloBuilder(); + builder.AddJournalStorage(); + builder.UseJsonJournalFormat(options => options.AddTypeInfoResolver(DurableJobsJsonContext.Default)); + builder.Services.AddLogging(); + builder.Services.AddSingleton(TimeProvider.System); + builder.Services.AddSingleton(); + builder.Services.AddFromExisting(); + builder.Services.AddFromExisting(); + + return Task.FromResult(new JournaledJobShardManagerTestScope(builder.Services.BuildServiceProvider())); + } + + private sealed class TestSiloBuilder : ISiloBuilder + { + public IServiceCollection Services { get; } = new ServiceCollection(); + + public IConfiguration Configuration { get; } = new ConfigurationBuilder().Build(); + } +} + +public class JournaledJobShardManagerTestScope : IJobShardManagerTestScope +{ + private static int _nextPort = 40_000; + private readonly ServiceProvider _services; + private readonly TestClusterMembershipService _membership = new(); + + public JournaledJobShardManagerTestScope(ServiceProvider services) + { + _services = services; + ActiveSilo = CreateSilo(); + SecondActiveSilo = CreateSilo(); + ThirdActiveSilo = CreateSilo(); + FormerOwnerSilo = CreateSilo(); + + SetSiloStatus(ActiveSilo, SiloStatus.Active); + SetSiloStatus(SecondActiveSilo, SiloStatus.Active); + SetSiloStatus(ThirdActiveSilo, SiloStatus.Active); + SetSiloStatus(FormerOwnerSilo, SiloStatus.Active); + } + + public TestSilo ActiveSilo { get; } + + public TestSilo SecondActiveSilo { get; } + + public TestSilo ThirdActiveSilo { get; } + + public TestSilo FormerOwnerSilo { get; } + + public DateTimeOffset Now => DateTimeOffset.UtcNow; + + public JobShardManager CreateManager(TestSilo silo, DurableJobsOptions? options = null) + => new JournaledJobShardManager( + new TestLocalSiloDetails(silo.SiloAddress), + _services.GetRequiredService(), + _services.GetRequiredService(), + _services.GetRequiredService(), + _membership, + _services, + Options.Create(options ?? new DurableJobsOptions()), + _services.GetRequiredService>()); + + public void SetSiloStatus(TestSilo silo, SiloStatus status) => _membership.SetSiloStatus(silo.SiloAddress, status); + + public virtual ValueTask DisposeAsync() + { + _services.Dispose(); + return ValueTask.CompletedTask; + } + + private static TestSilo CreateSilo() + { + var port = Interlocked.Increment(ref _nextPort); + return new(SiloAddress.New(new IPEndPoint(IPAddress.Loopback, port), port)); + } + + private sealed class TestLocalSiloDetails(SiloAddress siloAddress) : ILocalSiloDetails + { + public string Name => SiloAddress.ToParsableString(); + + public string ClusterId => "TestCluster"; + + public string DnsHostName => SiloAddress.ToParsableString(); + + public SiloAddress SiloAddress { get; } = siloAddress; + + public SiloAddress GatewayAddress => SiloAddress; + } + + private sealed class TestClusterMembershipService : IClusterMembershipService + { + private ImmutableDictionary _members = ImmutableDictionary.Empty; + private long _version; + + public ClusterMembershipSnapshot CurrentSnapshot => new(_members, new MembershipVersion(_version)); + + public IAsyncEnumerable MembershipUpdates => GetMembershipUpdates(); + + public void SetSiloStatus(SiloAddress siloAddress, SiloStatus status) + { + _members = _members.SetItem(siloAddress, new ClusterMember(siloAddress, status, siloAddress.ToParsableString())); + _version++; + } + + public ValueTask Refresh(MembershipVersion minimumVersion = default, CancellationToken cancellationToken = default) => ValueTask.CompletedTask; + + public Task TryKill(SiloAddress siloAddress) => Task.FromResult(false); + + private static async IAsyncEnumerable GetMembershipUpdates() + { + await Task.CompletedTask; + yield break; + } + } + } diff --git a/test/Orleans.DurableJobs.Tests/DurableJobs/InMemoryJobShardManagerTestFixture.cs b/test/Orleans.DurableJobs.Tests/DurableJobs/InMemoryJobShardManagerTestFixture.cs deleted file mode 100644 index 205f1c260b9..00000000000 --- a/test/Orleans.DurableJobs.Tests/DurableJobs/InMemoryJobShardManagerTestFixture.cs +++ /dev/null @@ -1,32 +0,0 @@ -using System.Threading.Tasks; -using Orleans.Runtime; -using Orleans.DurableJobs; - -namespace Tester.DurableJobs; - -/// -/// InMemory implementation of . -/// Provides the infrastructure needed to run shared job shard manager tests against the InMemory provider. -/// -internal sealed class InMemoryJobShardManagerTestFixture : IJobShardManagerTestFixture -{ - private readonly int _maxAdoptedCount; - - public InMemoryJobShardManagerTestFixture(int maxAdoptedCount = 3) - { - _maxAdoptedCount = maxAdoptedCount; - // Clear any state from previous tests - InMemoryJobShardManager.ClearAllShardsAsync().GetAwaiter().GetResult(); - } - - public JobShardManager CreateManager(ILocalSiloDetails localSiloDetails, IClusterMembershipService membershipService) - { - return new InMemoryJobShardManager(localSiloDetails.SiloAddress, membershipService, _maxAdoptedCount); - } - - public async ValueTask DisposeAsync() - { - // Clear state after tests - await InMemoryJobShardManager.ClearAllShardsAsync(); - } -} diff --git a/test/Orleans.DurableJobs.Tests/DurableJobs/InMemoryJobShardManagerTests.cs b/test/Orleans.DurableJobs.Tests/DurableJobs/InMemoryJobShardManagerTests.cs deleted file mode 100644 index cb98f199452..00000000000 --- a/test/Orleans.DurableJobs.Tests/DurableJobs/InMemoryJobShardManagerTests.cs +++ /dev/null @@ -1,132 +0,0 @@ -using System; -using System.Threading; -using System.Threading.Tasks; -using Xunit; - -namespace Tester.DurableJobs; - -/// -/// Tests for using the . -/// These tests verify shard lifecycle management, ownership, and failover semantics for the InMemory provider. -/// -[TestCategory("BVT"), TestCategory("DurableJobs")] -public class InMemoryJobShardManagerTests : IAsyncLifetime -{ - private readonly InMemoryJobShardManagerTestFixture _fixture; - private readonly JobShardManagerTestsRunner _runner; - - public InMemoryJobShardManagerTests() - { - _fixture = new InMemoryJobShardManagerTestFixture(); - _runner = new JobShardManagerTestsRunner(_fixture); - } - - public Task InitializeAsync() => Task.CompletedTask; - - public Task DisposeAsync() => _fixture.DisposeAsync().AsTask(); - - [SkippableFact] - public async Task InMemoryJobShardManager_ShardCreationAndAssignment() - { - using var cts = new CancellationTokenSource(TimeSpan.FromMinutes(2)); - await _runner.ShardCreationAndAssignment(cts.Token); - } - - [SkippableFact] - public async Task InMemoryJobShardManager_ReadFrozenShard() - { - using var cts = new CancellationTokenSource(TimeSpan.FromMinutes(2)); - await _runner.ReadFrozenShard(cts.Token); - } - - [SkippableFact] - public async Task InMemoryJobShardManager_LiveShard() - { - using var cts = new CancellationTokenSource(TimeSpan.FromMinutes(2)); - await _runner.LiveShard(cts.Token); - } - - [SkippableFact] - public async Task InMemoryJobShardManager_JobMetadata() - { - using var cts = new CancellationTokenSource(TimeSpan.FromMinutes(2)); - await _runner.JobMetadata(cts.Token); - } - - [SkippableFact] - public async Task InMemoryJobShardManager_ConcurrentShardAssignment_OwnershipConflicts() - { - using var cts = new CancellationTokenSource(TimeSpan.FromMinutes(2)); - await _runner.ConcurrentShardAssignment_OwnershipConflicts(cts.Token); - } - - [SkippableFact] - public async Task InMemoryJobShardManager_ShardMetadataMerge() - { - using var cts = new CancellationTokenSource(TimeSpan.FromMinutes(2)); - await _runner.ShardMetadataMerge(cts.Token); - } - - [SkippableFact] - public async Task InMemoryJobShardManager_StopProcessingShard() - { - using var cts = new CancellationTokenSource(TimeSpan.FromMinutes(2)); - await _runner.StopProcessingShard(cts.Token); - } - - [SkippableFact] - public async Task InMemoryJobShardManager_RetryJobLater() - { - using var cts = new CancellationTokenSource(TimeSpan.FromMinutes(2)); - await _runner.RetryJobLater(cts.Token); - } - - [SkippableFact] - public async Task InMemoryJobShardManager_JobCancellation() - { - using var cts = new CancellationTokenSource(TimeSpan.FromMinutes(2)); - await _runner.JobCancellation(cts.Token); - } - - [SkippableFact] - public async Task InMemoryJobShardManager_ShardRegistrationRetry_IdCollisions() - { - using var cts = new CancellationTokenSource(TimeSpan.FromMinutes(2)); - await _runner.ShardRegistrationRetry_IdCollisions(cts.Token); - } - - [SkippableFact] - public async Task InMemoryJobShardManager_UnregisterShard_WithJobsRemaining() - { - using var cts = new CancellationTokenSource(TimeSpan.FromMinutes(2)); - await _runner.UnregisterShard_WithJobsRemaining(cts.Token); - } - - [SkippableFact] - public async Task InMemoryJobShardManager_SlowStart_LimitsOrphanedShardClaims() - { - using var cts = new CancellationTokenSource(TimeSpan.FromMinutes(2)); - await _runner.SlowStart_LimitsOrphanedShardClaims(cts.Token); - } - - [SkippableFact] - public async Task InMemoryJobShardManager_SlowStart_ZeroBudgetClaimsNothing() - { - using var cts = new CancellationTokenSource(TimeSpan.FromMinutes(2)); - await _runner.SlowStart_ZeroBudgetClaimsNothing(cts.Token); - } - - [SkippableFact] - public async Task InMemoryJobShardManager_SlowStart_UnlimitedBudgetClaimsAll() - { - using var cts = new CancellationTokenSource(TimeSpan.FromMinutes(2)); - await _runner.SlowStart_UnlimitedBudgetClaimsAll(cts.Token); - } - - [SkippableFact] - public async Task InMemoryJobShardManager_SlowStart_BudgetExhaustion_DoesNotInflateAdoptedCount() - { - using var cts = new CancellationTokenSource(TimeSpan.FromMinutes(2)); - await _runner.SlowStart_BudgetExhaustion_DoesNotInflateAdoptedCount(cts.Token); - } -} diff --git a/test/Orleans.DurableJobs.Tests/DurableJobs/JobShardManagerTestsRunner.cs b/test/Orleans.DurableJobs.Tests/DurableJobs/JobShardManagerTestsRunner.cs index 3129399da68..e6ce9c48898 100644 --- a/test/Orleans.DurableJobs.Tests/DurableJobs/JobShardManagerTestsRunner.cs +++ b/test/Orleans.DurableJobs.Tests/DurableJobs/JobShardManagerTestsRunner.cs @@ -1,746 +1,273 @@ -using System; -using System.Collections.Generic; -using System.Collections.Immutable; -using System.Linq; -using System.Net; -using System.Threading; -using System.Threading.Tasks; -using Orleans.Runtime; -using Orleans.DurableJobs; +#nullable enable + +using TestExtensions; using Xunit; -namespace Tester.DurableJobs; +namespace Orleans.DurableJobs.Tests; -/// -/// Contains provider-agnostic test logic for job shard managers that can be run against different providers. -/// This class is similar to but operates at the infrastructure layer, -/// testing shard lifecycle management, ownership, and failover semantics. -/// -public class JobShardManagerTestsRunner +[TestCategory("BVT")] +public abstract class JobShardManagerTestsRunner(IJobShardManagerTestFixture fixture) { - private readonly IJobShardManagerTestFixture _fixture; - private readonly IDictionary _testMetadata; - private readonly InMemoryClusterMembershipService _membershipService; - - public JobShardManagerTestsRunner(IJobShardManagerTestFixture fixture) + [SkippableFact] + public async Task ShardCreationAndAssignmentUsesDistinctShardIdsForSameWindow() { - _fixture = fixture; - _testMetadata = new Dictionary - { - { "CreatedBy", "UnitTest" }, - { "Purpose", "Testing" } - }; - _membershipService = new InMemoryClusterMembershipService(); - } + await using var scope = await fixture.CreateScopeAsync(); + var manager = scope.CreateManager(scope.ActiveSilo); + var now = scope.Now; - /// - /// Sets the status of a silo in the cluster membership service. - /// - private void SetSiloStatus(SiloAddress siloAddress, SiloStatus status) - { - _membershipService.SetSiloStatus(siloAddress, status); - } + var shard1 = await manager.CreateShardAsync(now, now.AddMinutes(5), new Dictionary { ["index"] = "1" }, CancellationToken.None); + var shard2 = await manager.CreateShardAsync(now, now.AddMinutes(5), new Dictionary { ["index"] = "2" }, CancellationToken.None); - /// - /// Creates a job shard manager for the given silo address. - /// - private JobShardManager CreateManager(SiloAddress siloAddress) - { - var localSiloDetails = new TestLocalSiloDetails(siloAddress); - return _fixture.CreateManager(localSiloDetails, _membershipService); - } + var assigned = await manager.AssignJobShardsAsync(now.AddMinutes(5), int.MaxValue, CancellationToken.None); - /// - /// Tests basic shard creation and assignment workflow. - /// Verifies that shards are created with unique IDs and correctly assigned to their creator silo. - /// - public async Task ShardCreationAndAssignment(CancellationToken cancellationToken) - { - var silo1Address = SiloAddress.New(new IPEndPoint(IPAddress.Loopback, 5000), 0); - var silo2Address = SiloAddress.New(new IPEndPoint(IPAddress.Loopback, 5001), 0); - - SetSiloStatus(silo1Address, SiloStatus.Active); - SetSiloStatus(silo2Address, SiloStatus.Active); - var silo1Manager = CreateManager(silo1Address); - var silo2Manager = CreateManager(silo2Address); - - var date = DateTimeOffset.UtcNow; - var maxDate = date.AddHours(1); - - // Register multiple shards and ensure they are distinct - // two of them have the same time range - var shard1 = await silo1Manager.CreateShardAsync(date, maxDate, _testMetadata, cancellationToken); - var shard2 = await silo1Manager.CreateShardAsync(date, maxDate, _testMetadata, cancellationToken); - var shard3 = await silo1Manager.CreateShardAsync(date.AddHours(2), maxDate, _testMetadata, cancellationToken); - - Assert.Distinct([shard1.Id, shard2.Id, shard3.Id]); - - // All shards are now assigned to the creator silo - var assignedShards = await silo1Manager.AssignJobShardsAsync(DateTime.UtcNow.AddHours(3), maxNewClaims: int.MaxValue, cancellationToken); - Assert.Equal(3, assignedShards.Count); - Assert.Contains(shard1.Id, assignedShards.Select(s => s.Id)); - Assert.Contains(shard2.Id, assignedShards.Select(s => s.Id)); - Assert.Contains(shard3.Id, assignedShards.Select(s => s.Id)); - var emptyShards = await silo2Manager.AssignJobShardsAsync(DateTime.UtcNow.AddHours(3), maxNewClaims: int.MaxValue, cancellationToken); - Assert.Empty(emptyShards); - - // Mark the local silo as dead - SetSiloStatus(silo1Address, SiloStatus.Dead); - - // Now we can take over all three shards - var shards = await silo2Manager.AssignJobShardsAsync(DateTime.UtcNow.AddHours(3), maxNewClaims: int.MaxValue, cancellationToken); - Assert.Equal(3, shards.Count); - Assert.Contains(shard1.Id, shards.Select(s => s.Id)); - Assert.Contains(shard2.Id, shards.Select(s => s.Id)); - Assert.Contains(shard3.Id, shards.Select(s => s.Id)); - - // Register another silo - var silo3Address = SiloAddress.New(new IPEndPoint(IPAddress.Loopback, 5002), 0); - SetSiloStatus(silo3Address, SiloStatus.Active); - var silo3Manager = CreateManager(silo3Address); - - // No unassigned shards - Assert.Empty(await silo3Manager.AssignJobShardsAsync(DateTime.UtcNow.AddHours(1), maxNewClaims: int.MaxValue, cancellationToken)); + Assert.Equal(2, assigned.Count); + Assert.NotEqual(shard1.Id, shard2.Id); + Assert.Contains(assigned, shard => shard.Id == shard1.Id && shard.Metadata!["index"] == "1"); + Assert.Contains(assigned, shard => shard.Id == shard2.Id && shard.Metadata!["index"] == "2"); } - /// - /// Tests reading and consuming jobs from a shard after ownership transfer. - /// Verifies that jobs are preserved during failover and can be consumed by the new owner. - /// - public async Task ReadFrozenShard(CancellationToken cancellationToken) + [SkippableFact] + public async Task DeadOwnerShardIsReassignedAndPreservesQueuedJobOrderAndMetadata() { - var silo1Address = SiloAddress.New(new IPEndPoint(IPAddress.Loopback, 5000), 0); - var silo2Address = SiloAddress.New(new IPEndPoint(IPAddress.Loopback, 5001), 0); - SetSiloStatus(silo1Address, SiloStatus.Active); - SetSiloStatus(silo2Address, SiloStatus.Active); - var silo1Manager = CreateManager(silo1Address); - var silo2Manager = CreateManager(silo2Address); - - var date = DateTime.UtcNow; - var shard1 = await silo1Manager.CreateShardAsync(date, date.AddHours(1), _testMetadata, cancellationToken); - - // Schedule some jobs - await shard1.TryScheduleJobAsync(new ScheduleJobRequest { Target = GrainId.Create("type", "target1"), JobName = "job1", DueTime = date.AddSeconds(1), Metadata = null }, cancellationToken); - await shard1.TryScheduleJobAsync(new ScheduleJobRequest { Target = GrainId.Create("type", "target1"), JobName = "job3", DueTime = date.AddSeconds(3), Metadata = null }, cancellationToken); - await shard1.TryScheduleJobAsync(new ScheduleJobRequest { Target = GrainId.Create("type", "target2"), JobName = "job2", DueTime = date.AddSeconds(2), Metadata = null }, cancellationToken); - await shard1.TryScheduleJobAsync(new ScheduleJobRequest { Target = GrainId.Create("type", "target1"), JobName = "job4", DueTime = date.AddSeconds(4), Metadata = null }, cancellationToken); - - // Mark the silo1 as dead, and create a new incarnation - SetSiloStatus(silo1Address, SiloStatus.Dead); - - // Take over the shard - var shards = await silo2Manager.AssignJobShardsAsync(DateTime.UtcNow.AddHours(1), maxNewClaims: int.MaxValue, cancellationToken); - Assert.Single(shards); - shard1 = shards[0]; - - var counter = 1; - await foreach (var jobCtx in shard1.ConsumeDurableJobsAsync().WithCancellation(cancellationToken)) - { - Assert.Equal($"job{counter}", jobCtx.Job.Name); - await shard1.RemoveJobAsync(jobCtx.Job.Id, cancellationToken); - counter++; - } - Assert.Equal(5, counter); - await silo2Manager.UnregisterShardAsync(shard1, cancellationToken); - - // No unassigned shards - Assert.Empty(await silo2Manager.AssignJobShardsAsync(DateTime.UtcNow.AddHours(1), maxNewClaims: int.MaxValue, cancellationToken)); + await using var scope = await fixture.CreateScopeAsync(); + var formerOwner = scope.CreateManager(scope.FormerOwnerSilo); + var newOwner = scope.CreateManager(scope.SecondActiveSilo); + var now = scope.Now; + var shard = await formerOwner.CreateShardAsync(now.AddMinutes(-5), now.AddMinutes(5), Metadata("stream", "alpha"), CancellationToken.None); + var later = await ScheduleJobAsync(shard, now.AddSeconds(-1), "later", Metadata("kind", "later")); + var earlier = await ScheduleJobAsync(shard, now.AddSeconds(-2), "earlier", Metadata("kind", "symbols=+/&?")); + + scope.SetSiloStatus(scope.FormerOwnerSilo, SiloStatus.Dead); + + var assigned = await newOwner.AssignJobShardsAsync(now.AddMinutes(5), int.MaxValue, CancellationToken.None); + var reassigned = Assert.Single(assigned); + var runs = await TakeAsync(reassigned, 2); + + Assert.Equal(shard.Id, reassigned.Id); + Assert.True(reassigned.IsAddingCompleted); + Assert.Equal([earlier!.Id, later!.Id], runs.Select(run => run.Job.Id).ToArray()); + Assert.Equal("symbols=+/&?", runs[0].Job.Metadata!["kind"]); + Assert.Equal("later", runs[1].Job.Metadata!["kind"]); } - /// - /// Tests consuming jobs from a live shard (one that continues to accept new jobs). - /// Verifies job scheduling, consumption, and cancellation during processing. - /// - public async Task LiveShard(CancellationToken cancellationToken) + [SkippableFact] + public async Task OpenAndClosedShardsAreReassignedAfterFailover() { - var startTime = DateTime.UtcNow; - var localAddress = SiloAddress.New(new IPEndPoint(IPAddress.Loopback, 5000), 0); - SetSiloStatus(localAddress, SiloStatus.Active); - var manager = CreateManager(localAddress); - - var date = DateTime.UtcNow; - var shard1 = await manager.CreateShardAsync(date, date.AddYears(1), _testMetadata, cancellationToken); - - // Schedule some jobs - await shard1.TryScheduleJobAsync(new ScheduleJobRequest { Target = GrainId.Create("type", "target1"), JobName = "job0", DueTime = startTime.AddSeconds(1), Metadata = null }, cancellationToken); - await shard1.TryScheduleJobAsync(new ScheduleJobRequest { Target = GrainId.Create("type", "target1"), JobName = "job2", DueTime = startTime.AddSeconds(3), Metadata = null }, cancellationToken); - await shard1.TryScheduleJobAsync(new ScheduleJobRequest { Target = GrainId.Create("type", "target2"), JobName = "job1", DueTime = startTime.AddSeconds(2), Metadata = null }, cancellationToken); - var lastJob = await shard1.TryScheduleJobAsync(new ScheduleJobRequest { Target = GrainId.Create("type", "target1"), JobName = "job3", DueTime = startTime.AddSeconds(4), Metadata = null }, cancellationToken); - var jobToCancel = await shard1.TryScheduleJobAsync(new ScheduleJobRequest { Target = GrainId.Create("type", "target1"), JobName = "job4", DueTime = startTime.AddSeconds(5), Metadata = null }, cancellationToken); - - var counter = 0; - await shard1.MarkAsCompleteAsync(cancellationToken); - await shard1.RemoveJobAsync(jobToCancel.Id, cancellationToken); - await foreach (var jobCtx in shard1.ConsumeDurableJobsAsync().WithCancellation(cancellationToken)) - { - Assert.Equal($"job{counter}", jobCtx.Job.Name); - await shard1.RemoveJobAsync(jobCtx.Job.Id, cancellationToken); - counter++; - } - Assert.Equal(4, counter); - Assert.True(lastJob.DueTime <= DateTimeOffset.UtcNow); - await manager.UnregisterShardAsync(shard1, cancellationToken); - - // No unassigned shards - Assert.Empty(await manager.AssignJobShardsAsync(DateTime.UtcNow.AddHours(1), maxNewClaims: int.MaxValue, cancellationToken)); + await using var scope = await fixture.CreateScopeAsync(); + var formerOwner = scope.CreateManager(scope.FormerOwnerSilo); + var newOwner = scope.CreateManager(scope.SecondActiveSilo); + var now = scope.Now; + var closed = await formerOwner.CreateShardAsync(now, now.AddMinutes(5), Metadata("state", "closed"), CancellationToken.None); + await ScheduleJobAsync(closed, now.AddMinutes(-1), "closed-job"); + await formerOwner.UnregisterShardAsync(closed, CancellationToken.None); + var open = await formerOwner.CreateShardAsync(now.AddMinutes(1), now.AddMinutes(6), Metadata("state", "open"), CancellationToken.None); + await ScheduleJobAsync(open, now.AddMinutes(1), "open-job"); + + scope.SetSiloStatus(scope.FormerOwnerSilo, SiloStatus.Dead); + + var assigned = await newOwner.AssignJobShardsAsync(now.AddMinutes(10), int.MaxValue, CancellationToken.None); + + Assert.Equal(2, assigned.Count); + Assert.Contains(assigned, shard => shard.Id == open.Id); + Assert.Contains(assigned, shard => shard.Id == closed.Id); + Assert.True(assigned.All(static shard => shard.IsAddingCompleted)); } - /// - /// Tests job metadata persistence and retrieval across shard ownership transfer. - /// - public async Task JobMetadata(CancellationToken cancellationToken) + [SkippableFact] + public async Task LiveShardSchedulesAndConsumesJobsInDueTimeOrder() { - // Initialize 2 silos with two managers - var silo1Address = SiloAddress.New(new IPEndPoint(IPAddress.Loopback, 5000), 0); - var silo2Address = SiloAddress.New(new IPEndPoint(IPAddress.Loopback, 5001), 0); - - SetSiloStatus(silo1Address, SiloStatus.Active); - SetSiloStatus(silo2Address, SiloStatus.Active); - var silo1Manager = CreateManager(silo1Address); - var silo2Manager = CreateManager(silo2Address); + await using var scope = await fixture.CreateScopeAsync(); + var manager = scope.CreateManager(scope.ActiveSilo); + var now = scope.Now; + var shard = await manager.CreateShardAsync(now.AddMinutes(-1), now.AddMinutes(5), Metadata("stream", "live"), CancellationToken.None); + var later = await ScheduleJobAsync(shard, now.AddSeconds(-1), "later"); + var earlier = await ScheduleJobAsync(shard, now.AddSeconds(-2), "earlier"); - var date = DateTime.UtcNow; - var shard = await silo1Manager.CreateShardAsync(date, date.AddYears(1), _testMetadata, cancellationToken); + var assignedShard = Assert.Single(await manager.AssignJobShardsAsync(now.AddMinutes(5), int.MaxValue, CancellationToken.None)); + var runs = await TakeAsync(assignedShard, 2); - // Schedule jobs with different metadata on a single shard - var jobMetadata1 = new Dictionary - { - { "Priority", "High" }, - { "Category", "Payment" }, - { "RequestId", "12345" } - }; - var jobMetadata2 = new Dictionary - { - { "Priority", "Low" }, - { "Category", "Notification" } - }; - - var job1 = await shard.TryScheduleJobAsync(new ScheduleJobRequest { Target = GrainId.Create("type", "target1"), JobName = "job1", DueTime = DateTime.UtcNow.AddSeconds(1), Metadata = jobMetadata1 }, cancellationToken); - var job2 = await shard.TryScheduleJobAsync(new ScheduleJobRequest { Target = GrainId.Create("type", "target2"), JobName = "job2", DueTime = DateTime.UtcNow.AddSeconds(2), Metadata = jobMetadata2 }, cancellationToken); - var job3 = await shard.TryScheduleJobAsync(new ScheduleJobRequest { Target = GrainId.Create("type", "target3"), JobName = "job3", DueTime = DateTime.UtcNow.AddSeconds(3), Metadata = null }, cancellationToken); - - // Verify metadata is set on the durable jobs - Assert.Equal(jobMetadata1, job1.Metadata); - Assert.Equal(jobMetadata2, job2.Metadata); - Assert.Null(job3.Metadata); - - // Mark the silo owning the shard as dead - SetSiloStatus(silo1Address, SiloStatus.Dead); - - // Take over the shard with the other silo - var shards = await silo2Manager.AssignJobShardsAsync(DateTime.UtcNow.AddHours(1), maxNewClaims: int.MaxValue, cancellationToken); - Assert.Single(shards); - shard = shards[0]; - - // Consume jobs and verify metadata is preserved - var consumedJobs = new List(); - await foreach (var jobCtx in shard.ConsumeDurableJobsAsync().WithCancellation(cancellationToken)) - { - consumedJobs.Add(jobCtx.Job); - await shard.RemoveJobAsync(jobCtx.Job.Id, cancellationToken); - } - - Assert.Equal(3, consumedJobs.Count); - - var consumedJob1 = consumedJobs.First(j => j.Name == "job1"); - var consumedJob2 = consumedJobs.First(j => j.Name == "job2"); - var consumedJob3 = consumedJobs.First(j => j.Name == "job3"); - - Assert.Equal(jobMetadata1, consumedJob1.Metadata); - Assert.Equal(jobMetadata2, consumedJob2.Metadata); - Assert.Null(consumedJob3.Metadata); - - await silo2Manager.UnregisterShardAsync(shard, cancellationToken); + Assert.Equal([earlier!.Id, later!.Id], runs.Select(run => run.Job.Id).ToArray()); } - /// - /// Tests concurrent shard assignment to verify that only one silo can claim ownership of an orphaned shard. - /// - public async Task ConcurrentShardAssignment_OwnershipConflicts(CancellationToken cancellationToken) + [SkippableFact] + public async Task ConcurrentOwnershipConflictAllowsOnlyOneManagerToClaimShard() { - // Initialize 3 silos with 3 managers - var silo1Address = SiloAddress.New(new IPEndPoint(IPAddress.Loopback, 5000), 0); - var silo2Address = SiloAddress.New(new IPEndPoint(IPAddress.Loopback, 5001), 0); - var silo3Address = SiloAddress.New(new IPEndPoint(IPAddress.Loopback, 5002), 0); - - SetSiloStatus(silo1Address, SiloStatus.Active); - SetSiloStatus(silo2Address, SiloStatus.Active); - SetSiloStatus(silo3Address, SiloStatus.Active); - var silo1Manager = CreateManager(silo1Address); - var silo2Manager = CreateManager(silo2Address); - var silo3Manager = CreateManager(silo3Address); - - var date = DateTime.UtcNow; - - // Create two shards on the first silo - var shard1 = await silo1Manager.CreateShardAsync(date, date.AddHours(1), _testMetadata, cancellationToken); - var shard2 = await silo1Manager.CreateShardAsync(date, date.AddHours(2), _testMetadata, cancellationToken); - - // Mark the first silo as dead - SetSiloStatus(silo1Address, SiloStatus.Dead); - - // Concurrently try to assign shards from silo2 and silo3 - var assignTask2 = silo2Manager.AssignJobShardsAsync(DateTime.UtcNow.AddHours(3), maxNewClaims: int.MaxValue, cancellationToken); - var assignTask3 = silo3Manager.AssignJobShardsAsync(DateTime.UtcNow.AddHours(3), maxNewClaims: int.MaxValue, cancellationToken); - - await Task.WhenAll(assignTask2, assignTask3); - - var shards2 = await assignTask2; - var shards3 = await assignTask3; - - // Verify that only one silo was able to assign each shard (no duplicates) - var totalAssignments = shards2.Count + shards3.Count; - Assert.Equal(2, totalAssignments); - - var allAssignedShardIds = shards2.Select(s => s.Id).Concat(shards3.Select(s => s.Id)).ToList(); - Assert.Contains(shard1.Id, allAssignedShardIds); - Assert.Contains(shard2.Id, allAssignedShardIds); - Assert.Equal(2, allAssignedShardIds.Distinct().Count()); + await using var scope = await fixture.CreateScopeAsync(); + var creator = scope.CreateManager(scope.ActiveSilo); + var claimant1 = scope.CreateManager(scope.SecondActiveSilo); + var claimant2 = scope.CreateManager(scope.ThirdActiveSilo); + var now = scope.Now; + var shard = await creator.CreateShardAsync(now, now.AddMinutes(5), Metadata("conflict", "true"), CancellationToken.None); + await ScheduleJobAsync(shard, now.AddMinutes(-1), "conflict-job"); + await creator.UnregisterShardAsync(shard, CancellationToken.None); + + var claims = await Task.WhenAll( + claimant1.AssignJobShardsAsync(now.AddMinutes(5), int.MaxValue, CancellationToken.None), + claimant2.AssignJobShardsAsync(now.AddMinutes(5), int.MaxValue, CancellationToken.None)); + + Assert.Single(claims.SelectMany(static claim => claim)); } - /// - /// Tests that shard metadata is correctly preserved and merged during ownership transfers. - /// - public async Task ShardMetadataMerge(CancellationToken cancellationToken) + [SkippableFact] + public async Task MetadataIsPreservedAcrossGracefulReassignmentIncludingSpecialCharacters() { - // Initialize 2 silos with 2 managers - var silo1Address = SiloAddress.New(new IPEndPoint(IPAddress.Loopback, 5000), 0); - var silo2Address = SiloAddress.New(new IPEndPoint(IPAddress.Loopback, 5001), 0); - - SetSiloStatus(silo1Address, SiloStatus.Active); - SetSiloStatus(silo2Address, SiloStatus.Active); - var silo1Manager = CreateManager(silo1Address); - var silo2Manager = CreateManager(silo2Address); - - var date = DateTime.UtcNow; - - // Create a shard on silo1 with some metadata, then update the metadata and verify it is merged correctly - var customMetadata = new Dictionary + await using var scope = await fixture.CreateScopeAsync(); + var first = scope.CreateManager(scope.ActiveSilo); + var second = scope.CreateManager(scope.SecondActiveSilo); + var now = scope.Now; + var metadata = new Dictionary { - { "Environment", "Production" }, - { "TenantId", "tenant-123" } + ["space key"] = "space value", + ["symbols-key"] = "symbols=+/&?", + ["slash/key"] = "slash-value" }; + var shard = await first.CreateShardAsync(now, now.AddMinutes(5), metadata, CancellationToken.None); + await ScheduleJobAsync(shard, now.AddMinutes(-1), "metadata-job"); + await first.UnregisterShardAsync(shard, CancellationToken.None); - var shard = await silo1Manager.CreateShardAsync(date, date.AddHours(1), customMetadata, cancellationToken); - Assert.NotNull(shard.Metadata); - Assert.All(customMetadata, kvp => - { - Assert.True(shard.Metadata.ContainsKey(kvp.Key)); - Assert.Equal(kvp.Value, shard.Metadata[kvp.Key]); - }); - - // Schedule a job to ensure shard persistence - await shard.TryScheduleJobAsync(new ScheduleJobRequest { Target = GrainId.Create("type", "target1"), JobName = "job1", DueTime = DateTime.UtcNow.AddSeconds(5), Metadata = null }, cancellationToken); - - SetSiloStatus(silo1Address, SiloStatus.Dead); + var reassigned = Assert.Single(await second.AssignJobShardsAsync(now.AddMinutes(5), int.MaxValue, CancellationToken.None)); - // Take over the shard from silo2 and verify the metadata is preserved - var shards = await silo2Manager.AssignJobShardsAsync(DateTime.UtcNow.AddHours(1), maxNewClaims: int.MaxValue, cancellationToken); - Assert.Single(shards); - shard = shards[0]; - - Assert.NotNull(shard.Metadata); - Assert.All(customMetadata, kvp => - { - Assert.True(shard.Metadata.ContainsKey(kvp.Key)); - Assert.Equal(kvp.Value, shard.Metadata[kvp.Key]); - }); + Assert.Equal(metadata, reassigned.Metadata); } - /// - /// Tests stopping shard processing and verifying jobs remain for reassignment. - /// - public async Task StopProcessingShard(CancellationToken cancellationToken) + [SkippableFact] + public async Task UnregisterWithJobsRemainingPreservesShardForLaterReassignment() { - var localAddress = SiloAddress.New(new IPEndPoint(IPAddress.Loopback, 5000), 0); - SetSiloStatus(localAddress, SiloStatus.Active); - var manager = CreateManager(localAddress); - - var date = DateTime.UtcNow; - var shard1 = await manager.CreateShardAsync(date, date.AddYears(1), _testMetadata, cancellationToken); + await using var scope = await fixture.CreateScopeAsync(); + var first = scope.CreateManager(scope.ActiveSilo); + var second = scope.CreateManager(scope.SecondActiveSilo); + var now = scope.Now; + var shard = await first.CreateShardAsync(now, now.AddMinutes(5), Metadata("purpose", "stop-processing"), CancellationToken.None); + var job = await ScheduleJobAsync(shard, now.AddMinutes(-1), "remaining-job"); - // Schedule some jobs - await shard1.TryScheduleJobAsync(new ScheduleJobRequest { Target = GrainId.Create("type", "target1"), JobName = "job1", DueTime = DateTime.UtcNow.AddSeconds(5), Metadata = null }, cancellationToken); - await shard1.TryScheduleJobAsync(new ScheduleJobRequest { Target = GrainId.Create("type", "target1"), JobName = "job3", DueTime = DateTime.UtcNow.AddSeconds(10), Metadata = null }, cancellationToken); - await shard1.TryScheduleJobAsync(new ScheduleJobRequest { Target = GrainId.Create("type", "target2"), JobName = "job2", DueTime = DateTime.UtcNow.AddSeconds(6), Metadata = null }, cancellationToken); - await shard1.TryScheduleJobAsync(new ScheduleJobRequest { Target = GrainId.Create("type", "target1"), JobName = "job4", DueTime = DateTime.UtcNow.AddSeconds(15), Metadata = null }, cancellationToken); - - var counter = 1; - await foreach (var jobCtx in shard1.ConsumeDurableJobsAsync().WithCancellation(cancellationToken)) - { - Assert.Equal($"job{counter}", jobCtx.Job.Name); - if (counter == 2) - break; - await shard1.RemoveJobAsync(jobCtx.Job.Id, cancellationToken); - counter++; - } - Assert.Equal(2, counter); - await manager.UnregisterShardAsync(shard1, cancellationToken); - - var shards = await manager.AssignJobShardsAsync(DateTime.UtcNow.AddHours(1), maxNewClaims: int.MaxValue, cancellationToken); - Assert.Single(shards); - Assert.Equal(shard1.Id, shards[0].Id); - } + await first.UnregisterShardAsync(shard, CancellationToken.None); - /// - /// Tests retrying a job with a new due time. - /// - public async Task RetryJobLater(CancellationToken cancellationToken) - { - var localAddress = SiloAddress.New(new IPEndPoint(IPAddress.Loopback, 5000), 0); - SetSiloStatus(localAddress, SiloStatus.Active); - var manager = CreateManager(localAddress); - var date = DateTime.UtcNow; - var shard1 = await manager.CreateShardAsync(date, date.AddYears(1), _testMetadata, cancellationToken); - - // Schedule a job - var job = await shard1.TryScheduleJobAsync(new ScheduleJobRequest { Target = GrainId.Create("type", "target1"), JobName = "job1", DueTime = DateTime.UtcNow.AddSeconds(1), Metadata = null }, cancellationToken); - var cts = new CancellationTokenSource(TimeSpan.FromSeconds(40)); - await foreach (var jobCtx in shard1.ConsumeDurableJobsAsync().WithCancellation(cts.Token)) - { - Assert.Equal("job1", jobCtx.Job.Name); - var newDueTime = DateTimeOffset.UtcNow.AddSeconds(1); - await shard1.RetryJobLaterAsync(jobCtx, newDueTime, cancellationToken); - break; - } + var reassigned = Assert.Single(await second.AssignJobShardsAsync(now.AddMinutes(5), int.MaxValue, CancellationToken.None)); + var run = await TakeOneAsync(reassigned); - // Consume again - await foreach (var jobCtx in shard1.ConsumeDurableJobsAsync().WithCancellation(cancellationToken)) - { - Assert.Equal("job1", jobCtx.Job.Name); - Assert.NotEqual(job.DueTime, jobCtx.Job.DueTime); - await shard1.RemoveJobAsync(jobCtx.Job.Id, cancellationToken); - break; - } - await manager.UnregisterShardAsync(shard1, cancellationToken); + Assert.True(reassigned.IsAddingCompleted); + Assert.Equal(job!.Id, run.Job.Id); + Assert.Null(await reassigned.TryScheduleJobAsync(CreateRequest(now.AddMinutes(1), "rejected"), CancellationToken.None)); } - - /// - /// Tests job cancellation before and during processing. - /// - public async Task JobCancellation(CancellationToken cancellationToken) + [SkippableFact] + public async Task RetryLaterPersistsThroughShardReassignment() { - // Initialize 2 silos with two managers - var silo1Address = SiloAddress.New(new IPEndPoint(IPAddress.Loopback, 5000), 0); - var silo2Address = SiloAddress.New(new IPEndPoint(IPAddress.Loopback, 5001), 0); - - SetSiloStatus(silo1Address, SiloStatus.Active); - SetSiloStatus(silo2Address, SiloStatus.Active); - var silo1Manager = CreateManager(silo1Address); - var silo2Manager = CreateManager(silo2Address); - - var date = DateTime.UtcNow; - var shard = await silo1Manager.CreateShardAsync(date, date.AddYears(1), _testMetadata, cancellationToken); - - // Schedule multiple jobs in a single shard - await shard.TryScheduleJobAsync(new ScheduleJobRequest { Target = GrainId.Create("type", "target1"), JobName = "job1", DueTime = DateTime.UtcNow.AddMilliseconds(500), Metadata = null }, cancellationToken); - var job2 = await shard.TryScheduleJobAsync(new ScheduleJobRequest { Target = GrainId.Create("type", "target2"), JobName = "job2", DueTime = DateTime.UtcNow.AddMilliseconds(1000), Metadata = null }, cancellationToken); - await shard.TryScheduleJobAsync(new ScheduleJobRequest { Target = GrainId.Create("type", "target3"), JobName = "job3", DueTime = DateTime.UtcNow.AddMilliseconds(1500), Metadata = null }, cancellationToken); - var job4 = await shard.TryScheduleJobAsync(new ScheduleJobRequest { Target = GrainId.Create("type", "target4"), JobName = "job4", DueTime = DateTime.UtcNow.AddMilliseconds(2000), Metadata = null }, cancellationToken); - - // Cancel job2 before processing starts - await shard.RemoveJobAsync(job2.Id, cancellationToken); - - // Start consuming jobs - var consumedJobs = new List(); - - await foreach (var jobCtx in shard.ConsumeDurableJobsAsync().WithCancellation(cancellationToken)) - { - consumedJobs.Add(jobCtx.Job.Name); - - // Cancel job4 during processing (after job1 is consumed) - if (jobCtx.Job.Name == "job1") - { - await shard.RemoveJobAsync(job4.Id, cancellationToken); - } - - await shard.RemoveJobAsync(jobCtx.Job.Id, cancellationToken); - - if (consumedJobs.Count >= 2) - { - break; - } - } - - // Verify that only job1 and job3 were consumed (job2 was cancelled before consumption, job4 was cancelled during) - Assert.Equal(2, consumedJobs.Count); - Assert.Contains("job1", consumedJobs); - Assert.Contains("job3", consumedJobs); - Assert.DoesNotContain("job2", consumedJobs); - Assert.DoesNotContain("job4", consumedJobs); - - // Mark the shard owner silo as dead and reassign to verify cancelled jobs are not in storage - SetSiloStatus(silo1Address, SiloStatus.Dead); - - var shards = await silo2Manager.AssignJobShardsAsync(DateTime.UtcNow.AddHours(1), maxNewClaims: int.MaxValue, cancellationToken); - Assert.Single(shards); - shard = shards[0]; - - var hasJobs = false; - using var cts = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken); - cts.CancelAfter(TimeSpan.FromSeconds(5)); - await foreach (var jobCtx in shard.ConsumeDurableJobsAsync().WithCancellation(cts.Token)) - { - hasJobs = true; - break; - } - - Assert.False(hasJobs); - await silo2Manager.UnregisterShardAsync(shard, cancellationToken); + await using var scope = await fixture.CreateScopeAsync(); + var first = scope.CreateManager(scope.ActiveSilo); + var second = scope.CreateManager(scope.SecondActiveSilo); + var now = scope.Now; + var shard = await first.CreateShardAsync(now, now.AddMinutes(5), new Dictionary(), CancellationToken.None); + var job = await ScheduleJobAsync(shard, now.AddMinutes(-1), "retry-job"); + var run = await TakeOneAsync(shard); + + await shard.RetryJobLaterAsync(run, now.AddMinutes(-1), CancellationToken.None); + await first.UnregisterShardAsync(shard, CancellationToken.None); + + var reassigned = Assert.Single(await second.AssignJobShardsAsync(now.AddMinutes(5), int.MaxValue, CancellationToken.None)); + var retried = await TakeOneAsync(reassigned); + + Assert.Equal(job!.Id, retried.Job.Id); + Assert.Equal(run.DequeueCount + 1, retried.DequeueCount); } - /// - /// Tests that multiple shard registrations with the same time range produce unique IDs. - /// - public async Task ShardRegistrationRetry_IdCollisions(CancellationToken cancellationToken) + [SkippableFact] + public async Task CancellationsBeforeAndDuringProcessingPersistAfterReassignment() { - var localAddress = SiloAddress.New(new IPEndPoint(IPAddress.Loopback, 5000), 0); - SetSiloStatus(localAddress, SiloStatus.Active); - - var manager = CreateManager(localAddress); - - var date = DateTime.UtcNow; - - var shard1 = await manager.CreateShardAsync(date, date.AddHours(1), _testMetadata, cancellationToken); - var shard2 = await manager.CreateShardAsync(date, date.AddHours(1), _testMetadata, cancellationToken); - var shard3 = await manager.CreateShardAsync(date, date.AddHours(1), _testMetadata, cancellationToken); - - Assert.Distinct([shard1.Id, shard2.Id, shard3.Id]); + await using var scope = await fixture.CreateScopeAsync(); + var first = scope.CreateManager(scope.ActiveSilo); + var second = scope.CreateManager(scope.SecondActiveSilo); + var now = scope.Now; + var shard = await first.CreateShardAsync(now.AddMinutes(-5), now.AddMinutes(5), new Dictionary(), CancellationToken.None); + var cancelBeforeRun = await ScheduleJobAsync(shard, now.AddMinutes(-3), "cancel-before"); + var cancelDuringRun = await ScheduleJobAsync(shard, now.AddMinutes(-2), "cancel-during"); + var remaining = await ScheduleJobAsync(shard, now.AddMinutes(-1), "remaining"); + + Assert.True(await shard.RemoveJobAsync(cancelBeforeRun!.Id, CancellationToken.None)); + var running = await TakeOneAsync(shard); + Assert.Equal(cancelDuringRun!.Id, running.Job.Id); + Assert.True(await shard.RemoveJobAsync(running.Job.Id, CancellationToken.None)); + await first.UnregisterShardAsync(shard, CancellationToken.None); + + var reassigned = Assert.Single(await second.AssignJobShardsAsync(now.AddMinutes(5), int.MaxValue, CancellationToken.None)); + var run = await TakeOneAsync(reassigned); + + Assert.Equal(remaining!.Id, run.Job.Id); + Assert.Equal(1, await reassigned.GetJobCountAsync()); } - /// - /// Tests that maxNewClaims limits the number of orphaned shards claimed, - /// while still returning all already-owned shards. - /// - public async Task SlowStart_LimitsOrphanedShardClaims(CancellationToken cancellationToken) + [SkippableFact] + public async Task SlowStartRespectsZeroLimitedUnlimitedAndRepeatedBudgets() { - var silo1Address = SiloAddress.New(new IPEndPoint(IPAddress.Loopback, 5000), 0); - var silo2Address = SiloAddress.New(new IPEndPoint(IPAddress.Loopback, 5001), 0); - - SetSiloStatus(silo1Address, SiloStatus.Active); - SetSiloStatus(silo2Address, SiloStatus.Active); - var silo1Manager = CreateManager(silo1Address); - var silo2Manager = CreateManager(silo2Address); + await using var scope = await fixture.CreateScopeAsync(); + var creator = scope.CreateManager(scope.ActiveSilo); + var claimant = scope.CreateManager(scope.SecondActiveSilo); + var now = scope.Now; + var ownedShard = await claimant.CreateShardAsync(now, now.AddMinutes(1), Metadata("owner", "claimant"), CancellationToken.None); + var orphanedShardIds = new List(); - var date = DateTimeOffset.UtcNow; - - // Create 5 shards owned by silo1 - var createdShardIds = new List(); - for (var i = 0; i < 5; i++) + for (var i = 0; i < 3; i++) { - var shard = await silo1Manager.CreateShardAsync(date, date.AddHours(1), _testMetadata, cancellationToken); - createdShardIds.Add(shard.Id); + var shard = await creator.CreateShardAsync(now.AddMinutes(i), now.AddMinutes(i + 1), Metadata("index", i.ToString()), CancellationToken.None); + orphanedShardIds.Add(shard.Id); } - // Kill silo1 so all 5 shards become orphaned - SetSiloStatus(silo1Address, SiloStatus.Dead); + scope.SetSiloStatus(scope.ActiveSilo, SiloStatus.Dead); - // Silo2 assigns with a budget of 2 — should claim at most 2 orphaned shards - var shards = await silo2Manager.AssignJobShardsAsync(DateTime.UtcNow.AddHours(2), maxNewClaims: 2, cancellationToken); - Assert.Equal(2, shards.Count); + var zeroBudget = await claimant.AssignJobShardsAsync(now.AddMinutes(10), maxNewClaims: 0, CancellationToken.None); + var firstLimitedBudget = await claimant.AssignJobShardsAsync(now.AddMinutes(10), maxNewClaims: 1, CancellationToken.None); + var secondLimitedBudget = await claimant.AssignJobShardsAsync(now.AddMinutes(10), maxNewClaims: 1, CancellationToken.None); + var unlimitedBudget = await claimant.AssignJobShardsAsync(now.AddMinutes(10), int.MaxValue, CancellationToken.None); - // Assign again with budget of 2 — should claim 2 more - shards = await silo2Manager.AssignJobShardsAsync(DateTime.UtcNow.AddHours(2), maxNewClaims: 2, cancellationToken); - // The 2 already-owned from before + 2 newly claimed = 4 - Assert.Equal(4, shards.Count); - - // Assign with budget of 10 — should claim the last remaining 1 + return all 4 already-owned = 5 - shards = await silo2Manager.AssignJobShardsAsync(DateTime.UtcNow.AddHours(2), maxNewClaims: 10, cancellationToken); - Assert.Equal(5, shards.Count); - Assert.All(createdShardIds, id => Assert.Contains(id, shards.Select(s => s.Id))); + Assert.Collection(zeroBudget, shard => Assert.Equal(ownedShard.Id, shard.Id)); + Assert.Equal(2, firstLimitedBudget.Count); + Assert.Equal(3, secondLimitedBudget.Count); + Assert.Equal(4, unlimitedBudget.Count); + Assert.Contains(unlimitedBudget, shard => shard.Id == ownedShard.Id); + Assert.All(orphanedShardIds, id => Assert.Contains(unlimitedBudget, shard => shard.Id == id)); } - /// - /// Tests that maxNewClaims = 0 prevents claiming any orphaned shards - /// but still returns already-owned shards. - /// - public async Task SlowStart_ZeroBudgetClaimsNothing(CancellationToken cancellationToken) - { - var silo1Address = SiloAddress.New(new IPEndPoint(IPAddress.Loopback, 5000), 0); - var silo2Address = SiloAddress.New(new IPEndPoint(IPAddress.Loopback, 5001), 0); - - SetSiloStatus(silo1Address, SiloStatus.Active); - SetSiloStatus(silo2Address, SiloStatus.Active); - var silo1Manager = CreateManager(silo1Address); - var silo2Manager = CreateManager(silo2Address); - - var date = DateTimeOffset.UtcNow; - - // Silo2 creates a shard that it owns - var ownedShard = await silo2Manager.CreateShardAsync(date, date.AddHours(1), _testMetadata, cancellationToken); - - // Silo1 creates 3 shards, then dies - for (var i = 0; i < 3; i++) + protected static ScheduleJobRequest CreateRequest(DateTimeOffset dueTime, string jobName, IReadOnlyDictionary? metadata = null) + => new() { - await silo1Manager.CreateShardAsync(date, date.AddHours(1), _testMetadata, cancellationToken); - } - - SetSiloStatus(silo1Address, SiloStatus.Dead); - - // Silo2 assigns with budget = 0: only its own shard returned - var shards = await silo2Manager.AssignJobShardsAsync(DateTime.UtcNow.AddHours(2), maxNewClaims: 0, cancellationToken); - Assert.Single(shards); - Assert.Equal(ownedShard.Id, shards[0].Id); - } + Target = GrainId.Create("durable-job-test", jobName), + JobName = jobName, + DueTime = dueTime, + Metadata = metadata + }; - /// - /// Tests that maxNewClaims = int.MaxValue (unlimited) claims all orphaned shards. - /// - public async Task SlowStart_UnlimitedBudgetClaimsAll(CancellationToken cancellationToken) + private static async Task ScheduleJobAsync(IJobShard shard, DateTimeOffset dueTime, string jobName, IReadOnlyDictionary? metadata = null) { - var silo1Address = SiloAddress.New(new IPEndPoint(IPAddress.Loopback, 5000), 0); - var silo2Address = SiloAddress.New(new IPEndPoint(IPAddress.Loopback, 5001), 0); - - SetSiloStatus(silo1Address, SiloStatus.Active); - SetSiloStatus(silo2Address, SiloStatus.Active); - var silo1Manager = CreateManager(silo1Address); - var silo2Manager = CreateManager(silo2Address); - - var date = DateTimeOffset.UtcNow; - - // Create 5 shards owned by silo1 - for (var i = 0; i < 5; i++) + if (dueTime < shard.StartTime) { - await silo1Manager.CreateShardAsync(date, date.AddHours(1), _testMetadata, cancellationToken); + dueTime = shard.StartTime; } - - SetSiloStatus(silo1Address, SiloStatus.Dead); - - // Unlimited budget claims everything - var shards = await silo2Manager.AssignJobShardsAsync(DateTime.UtcNow.AddHours(2), maxNewClaims: int.MaxValue, cancellationToken); - Assert.Equal(5, shards.Count); - } - - /// - /// Tests that when the slow-start budget prevents claiming a shard from a dead silo, - /// the shard's adopted count is not incremented. This ensures that budget exhaustion - /// does not cause false poison detection on subsequent attempts. - /// - public async Task SlowStart_BudgetExhaustion_DoesNotInflateAdoptedCount(CancellationToken cancellationToken) - { - var silo1Address = SiloAddress.New(new IPEndPoint(IPAddress.Loopback, 5000), 0); - var silo2Address = SiloAddress.New(new IPEndPoint(IPAddress.Loopback, 5001), 0); - - SetSiloStatus(silo1Address, SiloStatus.Active); - SetSiloStatus(silo2Address, SiloStatus.Active); - var silo1Manager = CreateManager(silo1Address); - var silo2Manager = CreateManager(silo2Address); - - var date = DateTimeOffset.UtcNow; - - // Create 3 shards on silo1 - for (var i = 0; i < 3; i++) + else if (dueTime > shard.EndTime) { - await silo1Manager.CreateShardAsync(date, date.AddHours(1), _testMetadata, cancellationToken); + dueTime = shard.EndTime; } - // Kill silo1 so all 3 shards become adopted (from dead silo) - SetSiloStatus(silo1Address, SiloStatus.Dead); - - // Claim only 1 shard per cycle — the other 2 are skipped due to budget - var shards = await silo2Manager.AssignJobShardsAsync(DateTime.UtcNow.AddHours(2), maxNewClaims: 1, cancellationToken); - Assert.Single(shards); - - // Claim 1 more — should succeed because adopted count was NOT inflated for skipped shards - shards = await silo2Manager.AssignJobShardsAsync(DateTime.UtcNow.AddHours(2), maxNewClaims: 1, cancellationToken); - Assert.Equal(2, shards.Count); // 1 already-owned + 1 newly claimed - - // Claim the last one - shards = await silo2Manager.AssignJobShardsAsync(DateTime.UtcNow.AddHours(2), maxNewClaims: 1, cancellationToken); - Assert.Equal(3, shards.Count); // all 3 now owned - - // Verify all shards were successfully claimed (none falsely poisoned) - Assert.Equal(3, shards.Count); + return await shard.TryScheduleJobAsync(CreateRequest(dueTime, jobName, metadata), CancellationToken.None); } - /// - /// Tests that unregistering a shard with remaining jobs preserves the shard for reassignment. - /// - public async Task UnregisterShard_WithJobsRemaining(CancellationToken cancellationToken) - { - // Initialize 2 silos with 2 managers - var silo1Address = SiloAddress.New(new IPEndPoint(IPAddress.Loopback, 5000), 0); - var silo2Address = SiloAddress.New(new IPEndPoint(IPAddress.Loopback, 5001), 0); - - SetSiloStatus(silo1Address, SiloStatus.Active); - SetSiloStatus(silo2Address, SiloStatus.Active); - var silo1Manager = CreateManager(silo1Address); - var silo2Manager = CreateManager(silo2Address); - - var date = DateTime.UtcNow; - var shard = await silo1Manager.CreateShardAsync(date, date.AddHours(1), _testMetadata, cancellationToken); - - // Create a shard on silo1, schedule some jobs, then unregister the shard - await shard.TryScheduleJobAsync(new ScheduleJobRequest { Target = GrainId.Create("type", "target1"), JobName = "job1", DueTime = DateTime.UtcNow.AddSeconds(1), Metadata = null }, cancellationToken); - await shard.TryScheduleJobAsync(new ScheduleJobRequest { Target = GrainId.Create("type", "target2"), JobName = "job2", DueTime = DateTime.UtcNow.AddSeconds(2), Metadata = null }, cancellationToken); - - await silo1Manager.UnregisterShardAsync(shard, cancellationToken); - - // The shard should NOT have been deleted since there were jobs remaining - SetSiloStatus(silo1Address, SiloStatus.Dead); - - // Take over the shard from silo2 and consume the jobs - var shards = await silo2Manager.AssignJobShardsAsync(DateTime.UtcNow.AddHours(1), maxNewClaims: int.MaxValue, cancellationToken); - Assert.Single(shards); - Assert.Equal(shard.Id, shards[0].Id); - - var consumedJobs = new List(); - await foreach (var jobCtx in shards[0].ConsumeDurableJobsAsync().WithCancellation(cancellationToken)) - { - consumedJobs.Add(jobCtx.Job.Name); - await shards[0].RemoveJobAsync(jobCtx.Job.Id, cancellationToken); - } - - Assert.Equal(2, consumedJobs.Count); - Assert.Contains("job1", consumedJobs); - Assert.Contains("job2", consumedJobs); - await silo2Manager.UnregisterShardAsync(shards[0], cancellationToken); - } + private static async Task TakeOneAsync(IJobShard shard) => (await TakeAsync(shard, 1))[0]; - /// - /// Simple implementation of for testing. - /// - private sealed class TestLocalSiloDetails : ILocalSiloDetails + private static async Task> TakeAsync(IJobShard shard, int count) { - public TestLocalSiloDetails(SiloAddress siloAddress) + var result = new List(); + using var cts = new CancellationTokenSource(TimeSpan.FromSeconds(30)); + await foreach (var run in shard.ConsumeDurableJobsAsync().WithCancellation(cts.Token)) { - SiloAddress = siloAddress; + result.Add(run); + if (result.Count == count) + { + break; + } } - public string Name => SiloAddress.ToString(); - - public string ClusterId => "TestCluster"; - - public string DnsHostName => SiloAddress.ToString(); - - public SiloAddress SiloAddress { get; } - - public SiloAddress GatewayAddress => SiloAddress; + Assert.Equal(count, result.Count); + return result; } - /// - /// Simple in-memory implementation of for testing. - /// - private sealed class InMemoryClusterMembershipService : IClusterMembershipService - { - private readonly Dictionary _silos = new(); - private int _version = 0; - - public ClusterMembershipSnapshot CurrentSnapshot => - new ClusterMembershipSnapshot(_silos.ToImmutableDictionary(), new MembershipVersion(_version)); - - public IAsyncEnumerable MembershipUpdates => throw new NotImplementedException(); - - public void SetSiloStatus(SiloAddress address, SiloStatus status) - { - _silos[address] = new ClusterMember(address, status, address.ToParsableString()); - _version++; - } - - public ValueTask Refresh(MembershipVersion minimumVersion = default, CancellationToken cancellationToken = default) => - ValueTask.CompletedTask; - - public Task TryKill(SiloAddress siloAddress) => throw new NotImplementedException(); - } + private static Dictionary Metadata(string key, string value) => new(StringComparer.Ordinal) { [key] = value }; } diff --git a/test/Orleans.DurableJobs.Tests/DurableJobs/JournaledJobShardManagerScenarioTests.cs b/test/Orleans.DurableJobs.Tests/DurableJobs/JournaledJobShardManagerScenarioTests.cs new file mode 100644 index 00000000000..5a59954680d --- /dev/null +++ b/test/Orleans.DurableJobs.Tests/DurableJobs/JournaledJobShardManagerScenarioTests.cs @@ -0,0 +1,8 @@ +using TestExtensions; +using Xunit; + +namespace Orleans.DurableJobs.Tests; + +[TestCategory("DurableJobs")] +public sealed class JournaledJobShardManagerScenarioTests(VolatileJobShardManagerTestFixture fixture) + : JobShardManagerTestsRunner(fixture), IClassFixture; diff --git a/test/Orleans.DurableJobs.Tests/DurableJobs/JournaledJobShardManagerTests.cs b/test/Orleans.DurableJobs.Tests/DurableJobs/JournaledJobShardManagerTests.cs new file mode 100644 index 00000000000..457cac96603 --- /dev/null +++ b/test/Orleans.DurableJobs.Tests/DurableJobs/JournaledJobShardManagerTests.cs @@ -0,0 +1,504 @@ +using System.Collections.Immutable; +using System.Net; +using Microsoft.Extensions.Configuration; +using Microsoft.Extensions.DependencyInjection; +using Microsoft.Extensions.Logging; +using Microsoft.Extensions.Options; +using Orleans.Configuration.Internal; +using Orleans.DurableJobs; +using Orleans.Hosting; +using Orleans.Journaling; +using Orleans.Journaling.Json; +using Orleans.Runtime; +using Xunit; + +namespace Tester.DurableJobs; + +[TestCategory("BVT"), TestCategory("DurableJobs")] +public class JournaledJobShardManagerTests +{ + [Fact] + public async Task ReleasedShard_IsClaimedClosedAndReplayedFromJournal() + { + var storageProvider = new VolatileJournalStorageProvider(); + using var services = CreateServices(storageProvider); + var membership = new TestClusterMembershipService(); + var silo1 = SiloAddress.New(new IPEndPoint(IPAddress.Loopback, 5000), 0); + var silo2 = SiloAddress.New(new IPEndPoint(IPAddress.Loopback, 5001), 0); + membership.SetSiloStatus(silo1, SiloStatus.Active); + membership.SetSiloStatus(silo2, SiloStatus.Active); + + var manager1 = CreateManager(services, membership, silo1); + var manager2 = CreateManager(services, membership, silo2); + var start = DateTimeOffset.UtcNow.AddSeconds(-5); + var end = start.AddHours(1); + var shard = await manager1.CreateShardAsync( + start, + end, + new Dictionary { ["Purpose"] = "JournaledManagerTest" }, + CancellationToken.None); + + var scheduled = await shard.TryScheduleJobAsync(new() + { + Target = GrainId.Create("type", "target"), + JobName = "job", + DueTime = DateTimeOffset.UtcNow.AddSeconds(-1), + Metadata = new Dictionary { ["Kind"] = "Replay" } + }, CancellationToken.None); + Assert.NotNull(scheduled); + + await manager1.UnregisterShardAsync(shard, CancellationToken.None); + + var claimed = await manager2.AssignJobShardsAsync(DateTimeOffset.UtcNow.AddHours(1), int.MaxValue, CancellationToken.None); + var claimedShard = Assert.Single(claimed); + Assert.True(claimedShard.IsAddingCompleted); + Assert.Equal("JournaledManagerTest", claimedShard.Metadata!["Purpose"]); + + var rejected = await claimedShard.TryScheduleJobAsync(new() + { + Target = GrainId.Create("type", "target2"), + JobName = "new-job", + DueTime = DateTimeOffset.UtcNow, + Metadata = null + }, CancellationToken.None); + Assert.Null(rejected); + + var consumed = new List(); + await foreach (var jobContext in claimedShard.ConsumeDurableJobsAsync().WithCancellation(CancellationToken.None)) + { + consumed.Add(jobContext); + await claimedShard.RemoveJobAsync(jobContext.Job.Id, CancellationToken.None); + } + + var replayed = Assert.Single(consumed); + Assert.Equal(scheduled.Id, replayed.Job.Id); + Assert.Equal("Replay", replayed.Job.Metadata!["Kind"]); + Assert.Equal(1, replayed.DequeueCount); + + await manager2.UnregisterShardAsync(claimedShard, CancellationToken.None); + Assert.Empty(await manager2.AssignJobShardsAsync(DateTimeOffset.UtcNow.AddHours(1), int.MaxValue, CancellationToken.None)); + } + + [Fact] + public async Task EmptyShard_IsDeletedWhenUnregistered() + { + var storageProvider = new VolatileJournalStorageProvider(); + using var services = CreateServices(storageProvider); + var membership = new TestClusterMembershipService(); + var silo = SiloAddress.New(new IPEndPoint(IPAddress.Loopback, 5010), 0); + membership.SetSiloStatus(silo, SiloStatus.Active); + + var manager = CreateManager(services, membership, silo); + var start = DateTimeOffset.UtcNow.AddMinutes(-1); + var shard = await manager.CreateShardAsync( + start, + start.AddHours(1), + new Dictionary { ["Purpose"] = "EmptyShardDelete" }, + CancellationToken.None); + var storageId = ((JournaledJobShard)shard).StorageId; + + Assert.NotNull(await storageProvider.CreateStorage(storageId).GetMetadataAsync()); + + await manager.UnregisterShardAsync(shard, CancellationToken.None); + + Assert.Null(await storageProvider.CreateStorage(storageId).GetMetadataAsync()); + Assert.Empty(await ToListAsync(storageProvider.ListAsync(JobShardId.StoragePrefix))); + } + + [Fact] + public async Task ClosedLocalShard_CanStillPersistRemovals() + { + var storageProvider = new VolatileJournalStorageProvider(); + using var services = CreateServices(storageProvider); + var membership = new TestClusterMembershipService(); + var silo = SiloAddress.New(new IPEndPoint(IPAddress.Loopback, 5015), 0); + membership.SetSiloStatus(silo, SiloStatus.Active); + + var manager = CreateManager(services, membership, silo); + var start = DateTimeOffset.UtcNow.AddSeconds(-5); + var shard = await manager.CreateShardAsync( + start, + start.AddHours(1), + new Dictionary { ["Purpose"] = "ClosedLocalShard" }, + CancellationToken.None); + var scheduled = await shard.TryScheduleJobAsync(new() + { + Target = GrainId.Create("type", "target"), + JobName = "closed-local-job", + DueTime = DateTimeOffset.UtcNow.AddSeconds(-1), + Metadata = null + }, CancellationToken.None); + Assert.NotNull(scheduled); + + await shard.MarkAsCompleteAsync(CancellationToken.None); + await shard.DisposeAsync(); + + var reopenedManager = CreateManager(services, membership, silo); + var reopened = await reopenedManager.AssignJobShardsAsync(DateTimeOffset.UtcNow.AddHours(1), int.MaxValue, CancellationToken.None); + shard = Assert.Single(reopened); + Assert.True(shard.IsAddingCompleted); + + var rejected = await shard.TryScheduleJobAsync(new() + { + Target = GrainId.Create("type", "target2"), + JobName = "rejected-job", + DueTime = DateTimeOffset.UtcNow, + Metadata = null + }, CancellationToken.None); + Assert.Null(rejected); + + await foreach (var jobContext in shard.ConsumeDurableJobsAsync().WithCancellation(CancellationToken.None)) + { + Assert.Equal(scheduled.Id, jobContext.Job.Id); + Assert.True(await shard.RemoveJobAsync(jobContext.Job.Id, CancellationToken.None)); + } + + Assert.Equal(0, await shard.GetJobCountAsync()); + await reopenedManager.UnregisterShardAsync(shard, CancellationToken.None); + } + + [Fact] + public async Task DeadOwnerShard_IsAdoptedClosedAndReplayedFromJournal() + { + var storageProvider = new VolatileJournalStorageProvider(); + using var services = CreateServices(storageProvider); + var membership = new TestClusterMembershipService(); + var silo1 = SiloAddress.New(new IPEndPoint(IPAddress.Loopback, 5020), 0); + var silo2 = SiloAddress.New(new IPEndPoint(IPAddress.Loopback, 5021), 0); + membership.SetSiloStatus(silo1, SiloStatus.Active); + membership.SetSiloStatus(silo2, SiloStatus.Active); + + var manager1 = CreateManager(services, membership, silo1); + var manager2 = CreateManager(services, membership, silo2); + var start = DateTimeOffset.UtcNow.AddSeconds(-5); + var shard = await manager1.CreateShardAsync( + start, + start.AddHours(1), + new Dictionary { ["Purpose"] = "DeadOwnerAdoption" }, + CancellationToken.None); + var scheduled = await shard.TryScheduleJobAsync(new() + { + Target = GrainId.Create("type", "target"), + JobName = "dead-owner-job", + DueTime = DateTimeOffset.UtcNow.AddSeconds(-1), + Metadata = new Dictionary { ["Kind"] = "Adopted" } + }, CancellationToken.None); + Assert.NotNull(scheduled); + + membership.SetSiloStatus(silo1, SiloStatus.Dead); + + var claimed = await manager2.AssignJobShardsAsync(DateTimeOffset.UtcNow.AddHours(1), int.MaxValue, CancellationToken.None); + var claimedShard = Assert.Single(claimed); + Assert.True(claimedShard.IsAddingCompleted); + Assert.Equal("DeadOwnerAdoption", claimedShard.Metadata!["Purpose"]); + Assert.Equal(silo2, await manager2.GetShardOwnerAsync(claimedShard.Id, CancellationToken.None)); + + var consumed = new List(); + await foreach (var jobContext in claimedShard.ConsumeDurableJobsAsync().WithCancellation(CancellationToken.None)) + { + consumed.Add(jobContext); + await claimedShard.RemoveJobAsync(jobContext.Job.Id, CancellationToken.None); + } + + var replayed = Assert.Single(consumed); + Assert.Equal(scheduled.Id, replayed.Job.Id); + Assert.Equal("Adopted", replayed.Job.Metadata!["Kind"]); + Assert.Equal(1, replayed.DequeueCount); + + await manager2.UnregisterShardAsync(claimedShard, CancellationToken.None); + await shard.DisposeAsync(); + } + + [Fact] + public async Task DeadOwnerShard_IsPoisonedAfterAdoptionLimitExceeded() + { + var storageProvider = new VolatileJournalStorageProvider(); + using var services = CreateServices(storageProvider); + var membership = new TestClusterMembershipService(); + var silo1 = SiloAddress.New(new IPEndPoint(IPAddress.Loopback, 5030), 0); + var silo2 = SiloAddress.New(new IPEndPoint(IPAddress.Loopback, 5031), 0); + membership.SetSiloStatus(silo1, SiloStatus.Active); + membership.SetSiloStatus(silo2, SiloStatus.Active); + + var manager1 = CreateManager(services, membership, silo1); + var manager2 = CreateManager(services, membership, silo2, new DurableJobsOptions { MaxAdoptedCount = 0 }); + var start = DateTimeOffset.UtcNow.AddSeconds(-5); + var shard = await manager1.CreateShardAsync( + start, + start.AddHours(1), + new Dictionary { ["Purpose"] = "PoisonedShard" }, + CancellationToken.None); + var scheduled = await shard.TryScheduleJobAsync(new() + { + Target = GrainId.Create("type", "target"), + JobName = "poisoned-job", + DueTime = DateTimeOffset.UtcNow.AddSeconds(-1), + Metadata = null + }, CancellationToken.None); + Assert.NotNull(scheduled); + + membership.SetSiloStatus(silo1, SiloStatus.Dead); + + Assert.Empty(await manager2.AssignJobShardsAsync(DateTimeOffset.UtcNow.AddHours(1), int.MaxValue, CancellationToken.None)); + Assert.Null(await manager2.GetShardOwnerAsync(shard.Id, CancellationToken.None)); + Assert.Empty(await manager2.AssignJobShardsAsync(DateTimeOffset.UtcNow.AddHours(1), int.MaxValue, CancellationToken.None)); + + await shard.DisposeAsync(); + } + + [Fact] + public async Task LiveLocalShard_IsReturnedAndRemainsWritable() + { + var storageProvider = new VolatileJournalStorageProvider(); + using var services = CreateServices(storageProvider); + var membership = new TestClusterMembershipService(); + var silo = SiloAddress.New(new IPEndPoint(IPAddress.Loopback, 5040), 0); + membership.SetSiloStatus(silo, SiloStatus.Active); + + var manager = CreateManager(services, membership, silo); + var start = DateTimeOffset.UtcNow.AddSeconds(-5); + var shard = await manager.CreateShardAsync( + start, + start.AddHours(1), + new Dictionary { ["Purpose"] = "LiveShard" }, + CancellationToken.None); + + await ScheduleJobAsync(shard, "first-live-job"); + + var assigned = await manager.AssignJobShardsAsync(DateTimeOffset.UtcNow.AddHours(1), int.MaxValue, CancellationToken.None); + var assignedShard = Assert.Single(assigned); + Assert.Equal(shard.Id, assignedShard.Id); + Assert.False(assignedShard.IsAddingCompleted); + Assert.Equal("LiveShard", assignedShard.Metadata!["Purpose"]); + + await ScheduleJobAsync(assignedShard, "second-live-job"); + Assert.Equal(2, await assignedShard.GetJobCountAsync()); + + await DrainAndUnregisterAsync(manager, assignedShard, expectedJobs: 2); + } + + [Fact] + public async Task ActiveRemoteOwnerShard_IsNotClaimedUntilOwnerDies() + { + var storageProvider = new VolatileJournalStorageProvider(); + using var services = CreateServices(storageProvider); + var membership = new TestClusterMembershipService(); + var silo1 = SiloAddress.New(new IPEndPoint(IPAddress.Loopback, 5050), 0); + var silo2 = SiloAddress.New(new IPEndPoint(IPAddress.Loopback, 5051), 0); + membership.SetSiloStatus(silo1, SiloStatus.Active); + membership.SetSiloStatus(silo2, SiloStatus.Active); + + var manager1 = CreateManager(services, membership, silo1); + var manager2 = CreateManager(services, membership, silo2); + var start = DateTimeOffset.UtcNow.AddSeconds(-5); + var shard = await manager1.CreateShardAsync( + start, + start.AddHours(1), + new Dictionary { ["Purpose"] = "ActiveOwner" }, + CancellationToken.None); + await ScheduleJobAsync(shard, "active-owner-job"); + + Assert.Empty(await manager2.AssignJobShardsAsync(DateTimeOffset.UtcNow.AddHours(1), int.MaxValue, CancellationToken.None)); + Assert.Equal(silo1, await manager2.GetShardOwnerAsync(shard.Id, CancellationToken.None)); + + membership.SetSiloStatus(silo1, SiloStatus.Dead); + + var adopted = await manager2.AssignJobShardsAsync(DateTimeOffset.UtcNow.AddHours(1), int.MaxValue, CancellationToken.None); + var adoptedShard = Assert.Single(adopted); + Assert.True(adoptedShard.IsAddingCompleted); + Assert.Equal("ActiveOwner", adoptedShard.Metadata!["Purpose"]); + Assert.Equal(silo2, await manager2.GetShardOwnerAsync(adoptedShard.Id, CancellationToken.None)); + + await DrainAndUnregisterAsync(manager2, adoptedShard); + await shard.DisposeAsync(); + } + + [Fact] + public async Task ShardMetadata_RoundTripsKeysRequiringEncoding() + { + var storageProvider = new VolatileJournalStorageProvider(); + using var services = CreateServices(storageProvider); + var membership = new TestClusterMembershipService(); + var silo1 = SiloAddress.New(new IPEndPoint(IPAddress.Loopback, 5060), 0); + var silo2 = SiloAddress.New(new IPEndPoint(IPAddress.Loopback, 5061), 0); + membership.SetSiloStatus(silo1, SiloStatus.Active); + membership.SetSiloStatus(silo2, SiloStatus.Active); + + var manager1 = CreateManager(services, membership, silo1); + var manager2 = CreateManager(services, membership, silo2); + var start = DateTimeOffset.UtcNow.AddSeconds(-5); + var shard = await manager1.CreateShardAsync( + start, + start.AddHours(1), + new Dictionary + { + ["key/with/slashes"] = "slash-value", + ["key+with=base64"] = "base64-value" + }, + CancellationToken.None); + await ScheduleJobAsync(shard, "metadata-job"); + await manager1.UnregisterShardAsync(shard, CancellationToken.None); + + var assigned = await manager2.AssignJobShardsAsync(DateTimeOffset.UtcNow.AddHours(1), int.MaxValue, CancellationToken.None); + var assignedShard = Assert.Single(assigned); + Assert.Equal("slash-value", assignedShard.Metadata!["key/with/slashes"]); + Assert.Equal("base64-value", assignedShard.Metadata["key+with=base64"]); + + await DrainAndUnregisterAsync(manager2, assignedShard); + } + + [Fact] + public async Task SlowStart_LimitsOrphanedShardClaims() + { + var storageProvider = new VolatileJournalStorageProvider(); + using var services = CreateServices(storageProvider); + var membership = new TestClusterMembershipService(); + var silo1 = SiloAddress.New(new IPEndPoint(IPAddress.Loopback, 5070), 0); + var silo2 = SiloAddress.New(new IPEndPoint(IPAddress.Loopback, 5071), 0); + membership.SetSiloStatus(silo1, SiloStatus.Active); + membership.SetSiloStatus(silo2, SiloStatus.Active); + + var manager1 = CreateManager(services, membership, silo1); + var manager2 = CreateManager(services, membership, silo2); + var start = DateTimeOffset.UtcNow.AddSeconds(-5); + for (var i = 0; i < 3; i++) + { + var shard = await manager1.CreateShardAsync( + start, + start.AddHours(1), + new Dictionary { ["Index"] = i.ToString() }, + CancellationToken.None); + await ScheduleJobAsync(shard, $"orphaned-job-{i}"); + await manager1.UnregisterShardAsync(shard, CancellationToken.None); + } + + Assert.Empty(await manager2.AssignJobShardsAsync(DateTimeOffset.UtcNow.AddHours(1), maxNewClaims: 0, CancellationToken.None)); + + var firstClaim = await manager2.AssignJobShardsAsync(DateTimeOffset.UtcNow.AddHours(1), maxNewClaims: 1, CancellationToken.None); + var firstShard = Assert.Single(firstClaim); + await DrainAndUnregisterAsync(manager2, firstShard); + + var remainingClaims = await manager2.AssignJobShardsAsync(DateTimeOffset.UtcNow.AddHours(1), int.MaxValue, CancellationToken.None); + Assert.Equal(2, remainingClaims.Count); + foreach (var shard in remainingClaims) + { + await DrainAndUnregisterAsync(manager2, shard); + } + } + + private static ServiceProvider CreateServices(VolatileJournalStorageProvider storageProvider) + { + var builder = new TestSiloBuilder(); + builder.AddJournalStorage(); + builder.UseJsonJournalFormat(options => options.AddTypeInfoResolver(DurableJobsJsonContext.Default)); + builder.Services.AddLogging(); + builder.Services.AddSingleton(TimeProvider.System); + builder.Services.AddSingleton(storageProvider); + builder.Services.AddSingleton(storageProvider); + return builder.Services.BuildServiceProvider(); + } + + private static JournaledJobShardManager CreateManager( + IServiceProvider services, + TestClusterMembershipService membership, + SiloAddress siloAddress, + DurableJobsOptions options = null) + => new( + new TestLocalSiloDetails(siloAddress), + services.GetRequiredService(), + services.GetRequiredService(), + services.GetRequiredService(), + membership, + services, + Options.Create(options ?? new DurableJobsOptions()), + services.GetRequiredService>()); + + private static async Task ScheduleJobAsync(IJobShard shard, string jobName) + { + var scheduled = await shard.TryScheduleJobAsync(new() + { + Target = GrainId.Create("type", jobName), + JobName = jobName, + DueTime = DateTimeOffset.UtcNow.AddSeconds(-1), + Metadata = null + }, CancellationToken.None); + Assert.NotNull(scheduled); + return scheduled; + } + + private static async Task DrainAndUnregisterAsync(JournaledJobShardManager manager, IJobShard shard, int expectedJobs = 1) + { + var consumed = 0; + using var cts = new CancellationTokenSource(TimeSpan.FromSeconds(10)); + await foreach (var jobContext in shard.ConsumeDurableJobsAsync().WithCancellation(cts.Token)) + { + consumed++; + Assert.True(await shard.RemoveJobAsync(jobContext.Job.Id, cts.Token)); + if (consumed == expectedJobs) + { + break; + } + } + + Assert.Equal(expectedJobs, consumed); + Assert.Equal(0, await shard.GetJobCountAsync()); + await manager.UnregisterShardAsync(shard, CancellationToken.None); + } + + private static async Task> ToListAsync(IAsyncEnumerable source) + { + var result = new List(); + await foreach (var item in source) + { + result.Add(item); + } + + return result; + } + + private sealed class TestSiloBuilder : ISiloBuilder + { + public IServiceCollection Services { get; } = new ServiceCollection(); + + public IConfiguration Configuration { get; } = new ConfigurationBuilder().Build(); + } + + private sealed class TestLocalSiloDetails(SiloAddress siloAddress) : ILocalSiloDetails + { + public string Name => SiloAddress.ToParsableString(); + + public string ClusterId => "TestCluster"; + + public string DnsHostName => SiloAddress.ToParsableString(); + + public SiloAddress SiloAddress { get; } = siloAddress; + + public SiloAddress GatewayAddress => SiloAddress; + } + + private sealed class TestClusterMembershipService : IClusterMembershipService + { + private ImmutableDictionary _members = ImmutableDictionary.Empty; + private long _version; + + public ClusterMembershipSnapshot CurrentSnapshot => new(_members, new MembershipVersion(_version)); + + public IAsyncEnumerable MembershipUpdates => GetMembershipUpdates(); + + public void SetSiloStatus(SiloAddress siloAddress, SiloStatus status) + { + _members = _members.SetItem(siloAddress, new ClusterMember(siloAddress, status, siloAddress.ToParsableString())); + _version++; + } + + public ValueTask Refresh(MembershipVersion minimumVersion = default, CancellationToken cancellationToken = default) => ValueTask.CompletedTask; + + public Task TryKill(SiloAddress siloAddress) => Task.FromResult(false); + + private static async IAsyncEnumerable GetMembershipUpdates() + { + await Task.CompletedTask; + yield break; + } + } +} diff --git a/test/Orleans.DurableJobs.Tests/DurableJobs/JournaledJobShardStateTests.cs b/test/Orleans.DurableJobs.Tests/DurableJobs/JournaledJobShardStateTests.cs new file mode 100644 index 00000000000..12abccb66ac --- /dev/null +++ b/test/Orleans.DurableJobs.Tests/DurableJobs/JournaledJobShardStateTests.cs @@ -0,0 +1,131 @@ +using System; +using System.Linq; +using Orleans.DurableJobs; +using Orleans.Runtime; +using Xunit; + +namespace Tester.DurableJobs; + +[TestCategory("BVT"), TestCategory("DurableJobs")] +public class JournaledJobShardStateTests +{ + [Fact] + public void Replay_FoldsScheduleRetryAndRemoveOperations() + { + var shardId = new JobShardId("shard-a"); + var start = DateTimeOffset.UtcNow; + var state = new JournaledJobShardState(shardId, start, start.AddHours(1)); + var job = CreateJob(shardId, "job-1", "job", start.AddMinutes(1)); + var retryDueTime = start.AddHours(2); + + state.Apply(DurableJobShardJournalRecord.ForSchedule(job)); + state.Apply(DurableJobShardJournalRecord.ForRetry(job.Id, retryDueTime, dequeueCount: 1)); + + var retrySnapshot = state.CaptureSnapshot(); + var retried = Assert.Single(retrySnapshot.Jobs); + Assert.Equal(job.Id, retried.Job.Id); + Assert.Equal(retryDueTime, retried.Job.DueTime); + Assert.Equal(shardId.Value, retried.Job.ShardId); + Assert.Equal(1, retried.DequeueCount); + + state.Apply(DurableJobShardJournalRecord.ForRemove(job.Id)); + state.Apply(DurableJobShardJournalRecord.ForRemove(job.Id)); + + Assert.Equal(0, state.Count); + Assert.Empty(state.CaptureSnapshot().Jobs); + } + + [Fact] + public void Snapshot_ReplacesLiveJobsAndOmitsRemovedHistory() + { + var shardId = new JobShardId("shard-b"); + var start = DateTimeOffset.UtcNow; + var source = new JournaledJobShardState(shardId, start, start.AddHours(1)); + var removed = CreateJob(shardId, "removed", "removed", start.AddMinutes(1)); + var live = CreateJob(shardId, "live", "live", start.AddMinutes(2)); + + source.Apply(DurableJobShardJournalRecord.ForSchedule(removed)); + source.Apply(DurableJobShardJournalRecord.ForSchedule(live)); + source.Apply(DurableJobShardJournalRecord.ForRetry(live.Id, start.AddMinutes(3), dequeueCount: 2)); + source.Apply(DurableJobShardJournalRecord.ForRemove(removed.Id)); + + var snapshot = source.CaptureSnapshot(); + Assert.DoesNotContain(typeof(DurableJobShardSnapshot).GetProperties(), property => property.Name == nameof(IJobRunContext.RunId)); + Assert.DoesNotContain(typeof(DurableJobShardSnapshotEntry).GetProperties(), property => property.Name == nameof(IJobRunContext.RunId)); + + var target = new JournaledJobShardState(shardId, start, start.AddHours(1)); + target.Apply(DurableJobShardJournalRecord.ForSnapshot(snapshot)); + + var entry = Assert.Single(target.CaptureSnapshot().Jobs); + Assert.Equal(live.Id, entry.Job.Id); + Assert.Equal(start.AddMinutes(3), entry.Job.DueTime); + Assert.Equal(2, entry.DequeueCount); + Assert.DoesNotContain(target.CaptureSnapshot().Jobs, item => item.Job.Id == removed.Id); + } + + [Fact] + public void Retry_KeepsJobInSameShardWhenDueTimeMovesOutsideOriginalWindow() + { + var shardId = new JobShardId("shard-c"); + var start = DateTimeOffset.UtcNow; + var end = start.AddMinutes(10); + var state = new JournaledJobShardState(shardId, start, end); + var job = CreateJob(shardId, "job-1", "job", start.AddMinutes(1)); + var retryDueTime = end.AddDays(1); + + state.Apply(DurableJobShardJournalRecord.ForSchedule(job)); + state.Apply(DurableJobShardJournalRecord.ForRetry(job.Id, retryDueTime, dequeueCount: 1)); + + var entry = Assert.Single(state.CaptureSnapshot().Jobs); + Assert.Equal(shardId.Value, entry.Job.ShardId); + Assert.Equal(retryDueTime, entry.Job.DueTime); + } + + [Fact] + public async Task ConsumeDurableJobsAsync_YieldsDueJobsInDueTimeOrderAndIncrementsDequeueCount() + { + var shardId = new JobShardId("shard-d"); + var start = DateTimeOffset.UtcNow.AddMinutes(-1); + var state = new JournaledJobShardState(shardId, start, DateTimeOffset.UtcNow.AddMinutes(1)); + var third = CreateJob(shardId, "third", "third", DateTimeOffset.UtcNow.AddSeconds(-3)); + var first = CreateJob(shardId, "first", "first", DateTimeOffset.UtcNow.AddSeconds(-9)); + var second = CreateJob(shardId, "second", "second", DateTimeOffset.UtcNow.AddSeconds(-6)); + + state.Apply(DurableJobShardJournalRecord.ForSchedule(third)); + state.Apply(DurableJobShardJournalRecord.ForSchedule(first)); + state.Apply(DurableJobShardJournalRecord.ForSchedule(second)); + + var consumed = new List(); + await foreach (var jobContext in state.ConsumeDurableJobsAsync()) + { + consumed.Add(jobContext); + if (consumed.Count == 3) + { + break; + } + } + + Assert.Equal(["first", "second", "third"], consumed.Select(context => context.Job.Id).ToArray()); + Assert.All(consumed, context => Assert.Equal(1, context.DequeueCount)); + } + + [Fact] + public void JobShardId_MapsToJournalStorageIdentityWithoutExposingRawIds() + { + var shardId = new JobShardId("silo/with/slashes:job"); + + var storageId = shardId.ToJournalId(); + + Assert.True(JobShardId.StoragePrefix.IsPrefixOf(storageId)); + Assert.Equal(shardId, JobShardId.FromJournalId(storageId)); + } + + private static DurableJob CreateJob(JobShardId shardId, string id, string name, DateTimeOffset dueTime) => new() + { + Id = id, + Name = name, + DueTime = dueTime, + TargetGrainId = GrainId.Create("type", id), + ShardId = shardId.Value + }; +} diff --git a/test/Orleans.DurableJobs.Tests/Orleans.DurableJobs.Tests.csproj b/test/Orleans.DurableJobs.Tests/Orleans.DurableJobs.Tests.csproj index edeb584f491..06bdeb87b19 100644 --- a/test/Orleans.DurableJobs.Tests/Orleans.DurableJobs.Tests.csproj +++ b/test/Orleans.DurableJobs.Tests/Orleans.DurableJobs.Tests.csproj @@ -2,9 +2,15 @@ true $(TestTargetFrameworks) + $(NoWarn);ORLEANSEXP005 + + + all + runtime; build; native; contentfiles; analyzers; buildtransitive + diff --git a/test/Orleans.Journaling.Json.Tests/CodecRecoveryTests.cs b/test/Orleans.Journaling.Json.Tests/CodecRecoveryTests.cs index ef2ae99fe8e..a8996153ef6 100644 --- a/test/Orleans.Journaling.Json.Tests/CodecRecoveryTests.cs +++ b/test/Orleans.Journaling.Json.Tests/CodecRecoveryTests.cs @@ -475,8 +475,8 @@ public ValueTask ReadAsync(IJournalStorageConsumer consumer, CancellationToken c ArgumentNullException.ThrowIfNull(consumer); var metadata = storedJournalFormatKey is null - ? JournalFileMetadata.Empty - : new JournalFileMetadata(storedJournalFormatKey); + ? JournalMetadata.Empty + : new JournalMetadata(storedJournalFormatKey); if (inner.Segments.Count == 0) { consumer.Complete(metadata); diff --git a/test/Orleans.Journaling.Tests/AzureBlobCodecRecoveryTests.cs b/test/Orleans.Journaling.Tests/AzureBlobCodecRecoveryTests.cs index ab3a26736d3..29c745f9c8b 100644 --- a/test/Orleans.Journaling.Tests/AzureBlobCodecRecoveryTests.cs +++ b/test/Orleans.Journaling.Tests/AzureBlobCodecRecoveryTests.cs @@ -296,7 +296,7 @@ private sealed class RecordingJournalStorageConsumer : IJournalStorageConsumer public bool IsCompleted { get; private set; } - public void Read(JournalBufferReader buffer, IJournalFileMetadata? metadata) + public void Read(JournalBufferReader buffer, IJournalMetadata? metadata) { if (buffer.Length > 0) { diff --git a/test/Orleans.Journaling.Tests/AzureBlobJournalStorageTests.cs b/test/Orleans.Journaling.Tests/AzureBlobJournalStorageTests.cs index f911dd66554..ba33ae53d35 100644 --- a/test/Orleans.Journaling.Tests/AzureBlobJournalStorageTests.cs +++ b/test/Orleans.Journaling.Tests/AzureBlobJournalStorageTests.cs @@ -316,7 +316,7 @@ public async Task ReplaceAsync_WhenReplacingExistingCheckpoint_DeletesPreviousCh } [Fact] - public async Task ReplaceAsync_WhenOldCheckpointCleanupDisabled_DoesNotReadWalManifestOrDeletePreviousCheckpoint() + public async Task ReplaceAsync_WhenOldCheckpointCleanupDisabled_DoesNotDeletePreviousCheckpoint() { var appendBlobs = new FakeAppendBlobStore(); var checkpoints = new FakeBlockBlobStore(); @@ -329,7 +329,7 @@ public async Task ReplaceAsync_WhenOldCheckpointCleanupDisabled_DoesNotReadWalMa await storage.ReplaceAsync(new ReadOnlySequence([3]), CancellationToken.None); - Assert.Empty(appendBlobs.PropertiesCalls); + Assert.Single(appendBlobs.PropertiesCalls); Assert.True(checkpoints.Exists(previousCheckpoint)); Assert.Empty(checkpoints.DeleteCalls); } @@ -632,7 +632,7 @@ private sealed class DiscardingJournalStorageConsumer : IJournalStorageConsumer { public static DiscardingJournalStorageConsumer Instance { get; } = new(); - public void Read(JournalBufferReader buffer, IJournalFileMetadata? metadata) => buffer.Skip(buffer.Length); + public void Read(JournalBufferReader buffer, IJournalMetadata? metadata) => buffer.Skip(buffer.Length); } private sealed class CapturingJournalStorageConsumer : IJournalStorageConsumer @@ -641,7 +641,7 @@ private sealed class CapturingJournalStorageConsumer : IJournalStorageConsumer public MemoryStream Bytes { get; } = new(); - public void Read(JournalBufferReader buffer, IJournalFileMetadata? metadata) + public void Read(JournalBufferReader buffer, IJournalMetadata? metadata) { JournalFormatKey = metadata?.Format; while (buffer.Length > 0) diff --git a/test/Orleans.Journaling.Tests/JournalIdTests.cs b/test/Orleans.Journaling.Tests/JournalIdTests.cs new file mode 100644 index 00000000000..78ded8efc9f --- /dev/null +++ b/test/Orleans.Journaling.Tests/JournalIdTests.cs @@ -0,0 +1,45 @@ +using Xunit; + +namespace Orleans.Journaling.Tests; + +[TestCategory("BVT")] +public sealed class JournalIdTests +{ + [Fact] + public void Create_AcceptsReadOnlySpanSegments() + { + string[] segments = ["named", "logs", "segment/with spaces"]; + + var journalId = JournalId.Create(segments.AsSpan()); + + Assert.Equal("named/logs/segment%2Fwith%20spaces", journalId.Value); + } + + [Fact] + public void Create_AcceptsReadOnlySpanAdditionalSegments() + { + string[] additionalSegments = ["logs", "segment/with spaces"]; + + var journalId = JournalId.Create("named", additionalSegments.AsSpan()); + + Assert.Equal("named/logs/segment%2Fwith%20spaces", journalId.Value); + } + + [Fact] + public void Create_RejectsEmptyReadOnlySpanSegments() + { + var exception = Assert.Throws(() => JournalId.Create(ReadOnlySpan.Empty)); + + Assert.Equal("segments", exception.ParamName); + } + + [Fact] + public void Create_RejectsInvalidReadOnlySpanAdditionalSegments() + { + string[] additionalSegments = ["logs", ".."]; + + var exception = Assert.Throws(() => JournalId.Create("named", additionalSegments.AsSpan())); + + Assert.Equal("additionalSegments", exception.ParamName); + } +} diff --git a/test/Orleans.Journaling.Tests/StorageStreamingTests.cs b/test/Orleans.Journaling.Tests/StorageStreamingTests.cs index 0a708d5e13f..3969f726af6 100644 --- a/test/Orleans.Journaling.Tests/StorageStreamingTests.cs +++ b/test/Orleans.Journaling.Tests/StorageStreamingTests.cs @@ -224,7 +224,7 @@ private sealed class CapturingJournalStorageConsumer : IJournalStorageConsumer { public List Segments { get; } = []; - public void Read(JournalBufferReader buffer, IJournalFileMetadata? metadata) + public void Read(JournalBufferReader buffer, IJournalMetadata? metadata) { if (buffer.IsCompleted || buffer.Length == 0) { @@ -241,7 +241,7 @@ private sealed class TwoByteJournalStorageConsumer : IJournalStorageConsumer { public List Segments { get; } = []; - public void Read(JournalBufferReader buffer, IJournalFileMetadata? metadata) + public void Read(JournalBufferReader buffer, IJournalMetadata? metadata) { var temp = new byte[2]; while (buffer.TryPeek(temp)) @@ -267,7 +267,7 @@ private sealed class CompletionTrackingJournalStorageConsumer : IJournalStorageC public int CompletedLength { get; private set; } - public void Read(JournalBufferReader buffer, IJournalFileMetadata? metadata) + public void Read(JournalBufferReader buffer, IJournalMetadata? metadata) { IsCompleted = buffer.IsCompleted; CompletedLength = buffer.Length; @@ -276,7 +276,7 @@ public void Read(JournalBufferReader buffer, IJournalFileMetadata? metadata) private sealed class LeavingJournalStorageConsumer : IJournalStorageConsumer { - public void Read(JournalBufferReader buffer, IJournalFileMetadata? metadata) { } + public void Read(JournalBufferReader buffer, IJournalMetadata? metadata) { } } private sealed class ChunkedReadStream(byte[] data, int chunkSize) : Stream diff --git a/test/Orleans.Journaling.Tests/VolatileJournalStorageProviderTests.cs b/test/Orleans.Journaling.Tests/VolatileJournalStorageProviderTests.cs new file mode 100644 index 00000000000..8cdd93bad59 --- /dev/null +++ b/test/Orleans.Journaling.Tests/VolatileJournalStorageProviderTests.cs @@ -0,0 +1,134 @@ +using System.Buffers; +using Xunit; + +namespace Orleans.Journaling.Tests; + +[TestCategory("BVT")] +public sealed class VolatileJournalStorageProviderTests +{ + [Fact] + public async Task CreateIfNotExists_ListAndGetMetadataUseJournalIds() + { + var provider = new VolatileJournalStorageProvider(); + var idA = JournalId.Create("named", "logs", "a"); + var idB = JournalId.Create("named", "logs", "b"); + var idChild = JournalId.Create("named", "logs", "a", "child"); + var other = JournalId.Create("named", "other", "a"); + + var storageA = provider.CreateStorage(idA); + var created = await storageA.CreateIfNotExistsAsync(new Dictionary { ["owner"] = "one" }); + await provider.CreateStorage(idB).CreateIfNotExistsAsync(); + await provider.CreateStorage(idChild).CreateIfNotExistsAsync(); + await provider.CreateStorage(other).CreateIfNotExistsAsync(); + + Assert.True(created); + var metadata = await storageA.GetMetadataAsync(); + Assert.NotNull(metadata); + Assert.NotNull(metadata.ETag); + Assert.Equal("one", metadata.Properties["owner"]); + + var alreadyExists = await storageA.CreateIfNotExistsAsync(new Dictionary { ["owner"] = "two" }); + Assert.False(alreadyExists); + Assert.Equal("one", (await storageA.GetMetadataAsync())!.Properties["owner"]); + + var listed = await ToListAsync(provider.ListAsync(JournalId.Create("named", "logs"))); + Assert.Equal([idA, idChild, idB], listed); + + Assert.NotNull(await provider.CreateStorage(idB).GetMetadataAsync()); + Assert.Null(await provider.CreateStorage(JournalId.Create("named", "missing")).GetMetadataAsync()); + } + + [Fact] + public async Task UpdateMetadata_UsesETagCasAndReportsNoChange() + { + var provider = new VolatileJournalStorageProvider(); + var storage = provider.CreateStorage(JournalId.Create("named", "properties", "cas")); + Assert.True(await storage.CreateIfNotExistsAsync(new Dictionary + { + ["keep"] = "1", + ["remove"] = "2" + })); + var original = (await storage.GetMetadataAsync())!; + + var updated = await storage.UpdateMetadataAsync( + new Dictionary { ["keep"] = "3", ["add"] = "4" }, + ["remove"], + original.ETag); + + Assert.NotNull(updated); + Assert.NotEqual(original.ETag, updated.ETag); + Assert.Equal("3", updated.Properties["keep"]); + Assert.Equal("4", updated.Properties["add"]); + Assert.False(updated.Properties.ContainsKey("remove")); + + var stale = await storage.UpdateMetadataAsync( + new Dictionary { ["keep"] = "5" }, + remove: null, + original.ETag); + Assert.Null(stale); + Assert.Equal("3", (await storage.GetMetadataAsync())!.Properties["keep"]); + + var noChange = await storage.UpdateMetadataAsync( + new Dictionary { ["keep"] = "3" }, + remove: null, + updated.ETag); + Assert.NotNull(noChange); + Assert.Equal(updated.ETag, noChange.ETag); + } + + [Fact] + public async Task StorageOperationsUpdateMetadataAndDeleteRemovesStorage() + { + var provider = new VolatileJournalStorageProvider(); + var storageId = JournalId.Create("named", "conditional", "storage"); + var storage = provider.CreateStorage(storageId); + + Assert.Null(await storage.GetMetadataAsync()); + + await storage.AppendAsync(new ReadOnlySequence([1]), CancellationToken.None); + var appendProperties = await storage.GetMetadataAsync(); + Assert.NotNull(appendProperties); + Assert.NotNull(appendProperties.ETag); + + await storage.ReplaceAsync(new ReadOnlySequence([2]), CancellationToken.None); + var replaceProperties = await storage.GetMetadataAsync(); + Assert.NotNull(replaceProperties); + Assert.NotEqual(appendProperties.ETag, replaceProperties.ETag); + + await storage.AppendAsync(new ReadOnlySequence([3]), CancellationToken.None); + var finalProperties = await storage.GetMetadataAsync(); + Assert.NotNull(finalProperties); + Assert.NotEqual(replaceProperties.ETag, finalProperties.ETag); + + Assert.Equal([storageId], await ToListAsync(provider.ListAsync(storageId))); + + await storage.DeleteAsync(CancellationToken.None); + + Assert.Null(await storage.GetMetadataAsync()); + Assert.Empty(await ToListAsync(provider.ListAsync(storageId))); + } + + [Fact] + public async Task CallerCannotSetProviderOwnedProperties() + { + var provider = new VolatileJournalStorageProvider(); + var storage = provider.CreateStorage(JournalId.Create("named", "reserved", "properties")); + + await Assert.ThrowsAsync( + () => storage.CreateIfNotExistsAsync(new Dictionary { ["$owner"] = "provider" }).AsTask()); + + await Assert.ThrowsAsync( + () => storage.UpdateMetadataAsync(new Dictionary { ["$owner"] = "provider" }).AsTask()); + } + + private static async Task> ToListAsync(IAsyncEnumerable source) + { + var result = new List(); + await foreach (var item in source) + { + result.Add(item); + } + + return result; + } +}