Skip to content

Commit

Permalink
Smt on clearml (#200)
Browse files Browse the repository at this point in the history
* SMT on ClearML
* Replace CPU, GPU types with just Hangfire vs ClearML as well as engine type
* Allow each engine type to have its own queue and docker image
* SMT build defaults on ClearML
* NMT local train removed
* Download and upload model in factory using tar.gz and the build directory
* Preserve changes from #205.

This reverts commit cf5f453.

---------

Co-authored-by: Damien Daspit <[email protected]>
  • Loading branch information
johnml1135 and ddaspit authored Jun 11, 2024
1 parent bf9bb16 commit 8c39e3f
Show file tree
Hide file tree
Showing 61 changed files with 1,945 additions and 1,405 deletions.
3 changes: 3 additions & 0 deletions .editorconfig
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,9 @@ csharp_new_line_before_finally = true
csharp_new_line_before_members_in_object_initializers = true
csharp_new_line_before_members_in_anonymous_types = true

# Indentation settings
csharp_indent_case_contents_when_block = false

# Namespace settings
csharp_style_namespace_declarations = file_scoped

Expand Down
3 changes: 1 addition & 2 deletions src/SIL.Machine.AspNetCore/Configuration/BuildJobOptions.cs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,5 @@ public class BuildJobOptions
{
public const string Key = "BuildJob";

public Dictionary<BuildJobType, BuildJobRunner> Runners { get; set; } =
new() { { BuildJobType.Cpu, BuildJobRunner.Hangfire }, { BuildJobType.Gpu, BuildJobRunner.ClearML } };
public IList<ClearMLBuildQueue> ClearML { get; set; } = new List<ClearMLBuildQueue>();
}
9 changes: 9 additions & 0 deletions src/SIL.Machine.AspNetCore/Configuration/ClearMLBuildQueue.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
namespace SIL.Machine.AspNetCore.Configuration;

/// <summary>
/// Configuration entry that groups a ClearML queue name, Docker image, and model type
/// with a translation engine type.
/// </summary>
public class ClearMLBuildQueue
{
/// <summary>The translation engine type this entry belongs to.</summary>
public TranslationEngineType TranslationEngineType { get; set; }
/// <summary>The model type for this engine type. Defaults to an empty string.</summary>
public string ModelType { get; set; } = "";
/// <summary>The name of the queue for this engine type. Defaults to "default".</summary>
public string Queue { get; set; } = "default";
/// <summary>The Docker image for this engine type. Defaults to an empty string.</summary>
public string DockerImage { get; set; } = "";
}
3 changes: 0 additions & 3 deletions src/SIL.Machine.AspNetCore/Configuration/ClearMLOptions.cs
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,10 @@ public class ClearMLOptions
{
public const string Key = "ClearML";

public string Queue { get; set; } = "default";
public string AccessKey { get; set; } = "";
public string SecretKey { get; set; } = "";
public bool BuildPollingEnabled { get; set; } = false;
public TimeSpan BuildPollingTimeout { get; set; } = TimeSpan.FromSeconds(10);
public string ModelType { get; set; } = "huggingface";
public string RootProject { get; set; } = "Machine";
public string Project { get; set; } = "dev";
public string DockerImage { get; set; } = "";
}
120 changes: 43 additions & 77 deletions src/SIL.Machine.AspNetCore/Configuration/IMachineBuilderExtensions.cs
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,21 @@ public static IMachineBuilder AddSharedFileOptions(this IMachineBuilder builder,
return builder;
}

/// <summary>
/// Registers <see cref="BuildJobOptions"/> with the service collection, configured by the supplied delegate.
/// </summary>
/// <param name="builder">The machine builder whose service collection receives the registration.</param>
/// <param name="configureOptions">The delegate that configures the <see cref="BuildJobOptions"/> instance.</param>
/// <returns>The same <paramref name="builder"/>, so that calls can be chained.</returns>
public static IMachineBuilder AddBuildJobOptions(
this IMachineBuilder builder,
Action<BuildJobOptions> configureOptions
)
{
builder.Services.Configure(configureOptions);
return builder;
}

/// <summary>
/// Registers <see cref="BuildJobOptions"/> with the service collection, using the given configuration as its source.
/// </summary>
/// <param name="builder">The machine builder whose service collection receives the registration.</param>
/// <param name="config">The configuration passed to <c>Configure&lt;BuildJobOptions&gt;</c>.</param>
/// <returns>The same <paramref name="builder"/>, so that calls can be chained.</returns>
public static IMachineBuilder AddBuildJobOptions(this IMachineBuilder builder, IConfiguration config)
{
builder.Services.Configure<BuildJobOptions>(config);
return builder;
}

public static IMachineBuilder AddThotSmtModel(this IMachineBuilder builder)
{
if (builder.Configuration is null)
Expand Down Expand Up @@ -131,26 +146,6 @@ public static IMachineBuilder AddClearMLService(this IMachineBuilder builder, st
return builder;
}

private static IMachineBuilder AddClearMLBuildJobRunner(this IMachineBuilder builder)
{
builder.Services.AddScoped<IBuildJobRunner, ClearMLBuildJobRunner>();
builder.Services.AddScoped<IClearMLBuildJobFactory, NmtClearMLBuildJobFactory>();
builder.Services.AddSingleton<ClearMLMonitorService>();
builder.Services.AddHostedService(p => p.GetRequiredService<ClearMLMonitorService>());

return builder;
}

private static IMachineBuilder AddHangfireBuildJobRunner(this IMachineBuilder builder)
{
builder.Services.AddScoped<IBuildJobRunner, HangfireBuildJobRunner>();

builder.Services.AddScoped<IHangfireBuildJobFactory, SmtTransferHangfireBuildJobFactory>();
builder.Services.AddScoped<IHangfireBuildJobFactory, NmtHangfireBuildJobFactory>();

return builder;
}

private static MongoStorageOptions GetMongoStorageOptions()
{
var mongoStorageOptions = new MongoStorageOptions
Expand Down Expand Up @@ -200,6 +195,7 @@ public static IMachineBuilder AddHangfireJobServer(
switch (engineType)
{
case TranslationEngineType.SmtTransfer:
builder.Services.AddSingleton<SmtTransferEngineStateService>();
builder.AddThotSmtModel().AddTransferEngine().AddUnigramTruecaser();
queues.Add("smt_transfer");
break;
Expand Down Expand Up @@ -252,7 +248,7 @@ await c.Indexes.CreateOrUpdateAsync(
);
await c.Indexes.CreateOrUpdateAsync(
new CreateIndexModel<TranslationEngine>(
Builders<TranslationEngine>.IndexKeys.Ascending(e => e.CurrentBuild!.JobRunner)
Builders<TranslationEngine>.IndexKeys.Ascending(e => e.CurrentBuild!.BuildJobRunner)
)
);
}
Expand Down Expand Up @@ -360,49 +356,38 @@ public static IMachineBuilder AddServalTranslationEngineService(
return builder;
}

public static IMachineBuilder AddBuildJobService(
this IMachineBuilder builder,
Action<BuildJobOptions> configureOptions
)
public static IMachineBuilder AddBuildJobService(this IMachineBuilder builder, string? smtTransferEngineDir = null)
{
builder.Services.Configure(configureOptions);
var options = new BuildJobOptions();
configureOptions(options);
return builder.AddBuildJobService(options);
}
builder.Services.AddScoped<IBuildJobService, BuildJobService>();

public static IMachineBuilder AddBuildJobService(this IMachineBuilder builder, IConfiguration config)
{
builder.Services.Configure<BuildJobOptions>(config);
var buildJobOptions = new BuildJobOptions();
config.GetSection(BuildJobOptions.Key).Bind(buildJobOptions);
return builder.AddBuildJobService(buildJobOptions);
}
builder.Services.AddScoped<IBuildJobRunner, ClearMLBuildJobRunner>();
builder.Services.AddScoped<IClearMLBuildJobFactory, NmtClearMLBuildJobFactory>();
builder.Services.AddScoped<IClearMLBuildJobFactory, SmtTransferClearMLBuildJobFactory>();
builder.Services.AddSingleton<ClearMLMonitorService>();
builder.Services.AddSingleton<IClearMLQueueService>(x => x.GetRequiredService<ClearMLMonitorService>());
builder.Services.AddHostedService(p => p.GetRequiredService<ClearMLMonitorService>());

public static IMachineBuilder AddBuildJobService(this IMachineBuilder builder)
{
if (builder.Configuration is null)
{
builder.AddBuildJobService(o => { });
}
else
{
builder.AddBuildJobService(builder.Configuration.GetSection(BuildJobOptions.Key));
builder.Services.AddScoped<IBuildJobRunner, HangfireBuildJobRunner>();
builder.Services.AddScoped<IHangfireBuildJobFactory, NmtHangfireBuildJobFactory>();
builder.Services.AddScoped<IHangfireBuildJobFactory, SmtTransferHangfireBuildJobFactory>();

if (smtTransferEngineDir is null)
{
var smtTransferEngineOptions = new SmtTransferEngineOptions();
builder.Configuration.GetSection(SmtTransferEngineOptions.Key).Bind(smtTransferEngineOptions);
string? driveLetter = Path.GetPathRoot(smtTransferEngineOptions.EnginesDir)?[..1];
if (driveLetter is null)
throw new InvalidOperationException("SMT Engine directory is required");
// add health check for disk storage capacity
builder
.Services.AddHealthChecks()
.AddDiskStorageHealthCheck(
x => x.AddDrive(driveLetter, 1_000), // 1GB
"SMT Engine Storage Capacity",
HealthStatus.Degraded
);
builder.Configuration?.GetSection(SmtTransferEngineOptions.Key).Bind(smtTransferEngineOptions);
smtTransferEngineDir = smtTransferEngineOptions.EnginesDir;
}
string? driveLetter = Path.GetPathRoot(smtTransferEngineDir)?[..1];
if (driveLetter is null)
throw new InvalidOperationException("SMT Engine directory is required");
// add health check for disk storage capacity
builder
.Services.AddHealthChecks()
.AddDiskStorageHealthCheck(
x => x.AddDrive(driveLetter, 1_000), // 1GB
"SMT Engine Storage Capacity",
HealthStatus.Degraded
);

return builder;
}
Expand All @@ -412,23 +397,4 @@ public static IMachineBuilder AddModelCleanupService(this IMachineBuilder builde
builder.Services.AddHostedService<ModelCleanupService>();
return builder;
}

private static IMachineBuilder AddBuildJobService(this IMachineBuilder builder, BuildJobOptions options)
{
builder.Services.AddScoped<IBuildJobService, BuildJobService>();

foreach (BuildJobRunner runnerType in options.Runners.Values.Distinct())
{
switch (runnerType)
{
case BuildJobRunner.ClearML:
builder.AddClearMLBuildJobRunner();
break;
case BuildJobRunner.Hangfire:
builder.AddHangfireBuildJobRunner();
break;
}
}
return builder;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -27,13 +27,15 @@ public static IMachineBuilder AddMachine(this IServiceCollection services, IConf
builder.AddSharedFileOptions(o => { });
builder.AddSmtTransferEngineOptions(o => { });
builder.AddClearMLOptions(o => { });
builder.AddBuildJobOptions(o => { });
}
else
{
builder.AddServiceOptions(configuration.GetSection(ServiceOptions.Key));
builder.AddSharedFileOptions(configuration.GetSection(SharedFileOptions.Key));
builder.AddSmtTransferEngineOptions(configuration.GetSection(SmtTransferEngineOptions.Key));
builder.AddClearMLOptions(configuration.GetSection(ClearMLOptions.Key));
builder.AddBuildJobOptions(configuration.GetSection(BuildJobOptions.Key));
}
return builder;
}
Expand Down
13 changes: 10 additions & 3 deletions src/SIL.Machine.AspNetCore/Models/Build.cs
Original file line number Diff line number Diff line change
Expand Up @@ -8,18 +8,25 @@ public enum BuildJobState
Canceling
}

public enum BuildJobRunner
public enum BuildJobRunnerType
{
Hangfire,
ClearML
}

/// <summary>
/// The stage of a build.
/// </summary>
public enum BuildStage
{
// NOTE(review): presumably the stages run in declaration order — confirm against the build job runners.
Preprocess,
Train,
Postprocess
}

public record Build
{
public required string BuildId { get; init; }
public required BuildJobState JobState { get; init; }
public required string JobId { get; init; }
public required BuildJobRunner JobRunner { get; init; }
public required string Stage { get; init; }
public required BuildJobRunnerType BuildJobRunner { get; init; }
public required BuildStage Stage { get; init; }
public string? Options { get; set; }
}
3 changes: 3 additions & 0 deletions src/SIL.Machine.AspNetCore/Models/ClearMLTask.cs
Original file line number Diff line number Diff line change
Expand Up @@ -29,4 +29,7 @@ public required IReadOnlyDictionary<
string,
IReadOnlyDictionary<string, ClearMLMetricsEvent>
> LastMetrics { get; init; }

[JsonConverter(typeof(DictionaryStringStringConverter))]
public required IReadOnlyDictionary<string, string> Runtime { get; init; }
}
1 change: 1 addition & 0 deletions src/SIL.Machine.AspNetCore/Models/TranslationEngine.cs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ public record TranslationEngine : IEntity
public string Id { get; set; } = "";
public int Revision { get; set; } = 1;
public required string EngineId { get; init; }
public required TranslationEngineType Type { get; init; }
public required string SourceLanguage { get; init; }
public required string TargetLanguage { get; init; }
public required bool IsModelPersisted { get; init; }
Expand Down
1 change: 1 addition & 0 deletions src/SIL.Machine.AspNetCore/SIL.Machine.AspNetCore.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
<PackageReference Include="AspNetCore.HealthChecks.MongoDb" Version="6.0.2" />
<PackageReference Include="AspNetCore.HealthChecks.System" Version="6.0.2" />
<PackageReference Include="AWSSDK.S3" Version="3.7.205.8" />
<PackageReference Include="CommunityToolkit.HighPerformance" Version="8.2.2" />
<PackageReference Include="Grpc.AspNetCore" Version="2.57.0" />
<PackageReference Include="Grpc.AspNetCore.HealthChecks" Version="2.57.0" />
<PackageReference Include="HangFire" Version="1.8.5" />
Expand Down
Loading

0 comments on commit 8c39e3f

Please sign in to comment.