diff --git a/tools/issue-labeler/Directory.Build.props b/tools/issue-labeler/Directory.Build.props index 6aab64a60a5..b7220d6e29a 100644 --- a/tools/issue-labeler/Directory.Build.props +++ b/tools/issue-labeler/Directory.Build.props @@ -27,6 +27,12 @@ 16.11.0 + + + $(MSBuildProjectDirectory)\bin\ + $(MSBuildProjectDirectory)\obj\ + + diff --git a/tools/issue-labeler/IssueLabeler.sln b/tools/issue-labeler/IssueLabeler.sln index 762fe481d52..f568f876ef4 100644 --- a/tools/issue-labeler/IssueLabeler.sln +++ b/tools/issue-labeler/IssueLabeler.sln @@ -15,17 +15,22 @@ Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "SearchIndexCreator", "src\S {5B655051-531D-4968-8AF3-1DBA9A9F568C} = {5B655051-531D-4968-8AF3-1DBA9A9F568C} EndProjectSection EndProject +EndProject Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Azure.Sdk.Tools.GitHubEventProcessor", "..\github-event-processor\Azure.Sdk.Tools.GitHubEventProcessor\Azure.Sdk.Tools.GitHubEventProcessor.csproj", "{5B655051-531D-4968-8AF3-1DBA9A9F568C}" EndProject -Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "CreateMikLabelModel", "src\CreateMikLabelModel\CreateMikLabelModel.csproj", "{5966A77B-5114-4608-92AD-524F181FA0FC}" +Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "IssueLabeler.Shared", "src\IssueLabeler.Shared\IssueLabeler.Shared.csproj", "{9E6BA2D8-3BBE-40D6-9DAF-0FC0CD362BD4}" EndProject -Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Hubbup.MikLabelModel", "src\Hubbup.MikLabelModel\Hubbup.MikLabelModel.csproj", "{CA47F6FC-382F-4034-9F12-517CC14E5CB0}" +Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Common", "src\IssueLabelerMLPipeline\src\Common\Common.csproj", "{3F3044DC-A9F8-DE16-79DD-4A0C1649CD06}" EndProject -Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Azure.Sdk.LabelTrainer", "src\Azure.Sdk.Labels\Azure.Sdk.LabelTrainer.csproj", "{DB80D7FD-262D-429D-9700-72EF4D93F317}" +Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Downloader", "src\IssueLabelerMLPipeline\src\Downloader\Downloader.csproj", "{AB75FE13-DB1A-4B6F-8B27-1486F98EA75C}" EndProject -Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "IssueLabeler.Shared", "src\IssueLabeler.Shared\IssueLabeler.Shared.csproj", "{9E6BA2D8-3BBE-40D6-9DAF-0FC0CD362BD4}" +Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "GitHubClient", "src\IssueLabelerMLPipeline\src\GitHubClient\GitHubClient.csproj", "{57F2D1DC-DA30-40CA-AE1A-2EFD8139AF25}" +EndProject +Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Predictor", "src\IssueLabelerMLPipeline\src\Predictor\Predictor.csproj", "{2E39B0A5-2F4A-4D6E-8A0D-0366238CB21E}" +EndProject +Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Tester", "src\IssueLabelerMLPipeline\src\Tester\Tester.csproj", "{BEA133F4-5686-49DF-83E4-641C26B3CC25}" EndProject -Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Hubbup.MikLabelModel.Tests", "tests\Hubbup.MikLabelModel.Tests\Hubbup.MikLabelModel.Tests.csproj", "{CD3F13F1-8890-490A-BB47-9382E2131F5D}" +Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Trainer", "src\IssueLabelerMLPipeline\src\Trainer\Trainer.csproj", "{F1FE4054-C44E-487F-90F9-2F111AB7BD9C}" EndProject Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Azure.Sdk.Tools.CodeownersUtils", "..\codeowners-utils\Azure.Sdk.Tools.CodeownersUtils\Azure.Sdk.Tools.CodeownersUtils.csproj", "{D27C2C44-3AC4-0732-FF87-DD1697A9DF37}" EndProject @@ -39,38 +44,42 @@ Global {4C9E75AF-468F-4DF7-BACD-EC0C2C66A96F}.Debug|Any CPU.Build.0 = Debug|Any CPU {4C9E75AF-468F-4DF7-BACD-EC0C2C66A96F}.Release|Any CPU.ActiveCfg = Release|Any CPU {4C9E75AF-468F-4DF7-BACD-EC0C2C66A96F}.Release|Any CPU.Build.0 = Release|Any CPU - {0AEAF8DD-C370-4090-B439-9CF364D29869}.Debug|Any CPU.ActiveCfg = Debug|Any CPU - {0AEAF8DD-C370-4090-B439-9CF364D29869}.Debug|Any CPU.Build.0 = Debug|Any CPU - {0AEAF8DD-C370-4090-B439-9CF364D29869}.Release|Any CPU.ActiveCfg = Release|Any CPU - {0AEAF8DD-C370-4090-B439-9CF364D29869}.Release|Any CPU.Build.0 = Release|Any CPU {5B655051-531D-4968-8AF3-1DBA9A9F568C}.Debug|Any CPU.ActiveCfg = Debug|Any CPU {5B655051-531D-4968-8AF3-1DBA9A9F568C}.Debug|Any CPU.Build.0 = Debug|Any CPU {5B655051-531D-4968-8AF3-1DBA9A9F568C}.Release|Any CPU.ActiveCfg = Release|Any CPU {5B655051-531D-4968-8AF3-1DBA9A9F568C}.Release|Any CPU.Build.0 = Release|Any CPU - {5966A77B-5114-4608-92AD-524F181FA0FC}.Debug|Any CPU.ActiveCfg = Debug|Any CPU - {5966A77B-5114-4608-92AD-524F181FA0FC}.Debug|Any CPU.Build.0 = Debug|Any CPU - {5966A77B-5114-4608-92AD-524F181FA0FC}.Release|Any CPU.ActiveCfg = Release|Any CPU - {5966A77B-5114-4608-92AD-524F181FA0FC}.Release|Any CPU.Build.0 = Release|Any CPU - {CA47F6FC-382F-4034-9F12-517CC14E5CB0}.Debug|Any CPU.ActiveCfg = Debug|Any CPU - {CA47F6FC-382F-4034-9F12-517CC14E5CB0}.Debug|Any CPU.Build.0 = Debug|Any CPU - {CA47F6FC-382F-4034-9F12-517CC14E5CB0}.Release|Any CPU.ActiveCfg = Release|Any CPU - {CA47F6FC-382F-4034-9F12-517CC14E5CB0}.Release|Any CPU.Build.0 = Release|Any CPU - {DB80D7FD-262D-429D-9700-72EF4D93F317}.Debug|Any CPU.ActiveCfg = Debug|Any CPU - {DB80D7FD-262D-429D-9700-72EF4D93F317}.Debug|Any CPU.Build.0 = Debug|Any CPU - {DB80D7FD-262D-429D-9700-72EF4D93F317}.Release|Any CPU.ActiveCfg = Release|Any CPU - {DB80D7FD-262D-429D-9700-72EF4D93F317}.Release|Any CPU.Build.0 = Release|Any CPU {9E6BA2D8-3BBE-40D6-9DAF-0FC0CD362BD4}.Debug|Any CPU.ActiveCfg = Debug|Any CPU {9E6BA2D8-3BBE-40D6-9DAF-0FC0CD362BD4}.Debug|Any CPU.Build.0 = Debug|Any CPU {9E6BA2D8-3BBE-40D6-9DAF-0FC0CD362BD4}.Release|Any CPU.ActiveCfg = Release|Any CPU {9E6BA2D8-3BBE-40D6-9DAF-0FC0CD362BD4}.Release|Any CPU.Build.0 = Release|Any CPU - {CD3F13F1-8890-490A-BB47-9382E2131F5D}.Debug|Any CPU.ActiveCfg = Debug|Any CPU - {CD3F13F1-8890-490A-BB47-9382E2131F5D}.Debug|Any CPU.Build.0 = Debug|Any CPU - {CD3F13F1-8890-490A-BB47-9382E2131F5D}.Release|Any CPU.ActiveCfg = Release|Any CPU - {CD3F13F1-8890-490A-BB47-9382E2131F5D}.Release|Any CPU.Build.0 = Release|Any CPU {D27C2C44-3AC4-0732-FF87-DD1697A9DF37}.Debug|Any CPU.ActiveCfg = Debug|Any CPU {D27C2C44-3AC4-0732-FF87-DD1697A9DF37}.Debug|Any CPU.Build.0 = Debug|Any CPU {D27C2C44-3AC4-0732-FF87-DD1697A9DF37}.Release|Any CPU.ActiveCfg = Release|Any CPU {D27C2C44-3AC4-0732-FF87-DD1697A9DF37}.Release|Any CPU.Build.0 = Release|Any CPU + {3F3044DC-A9F8-DE16-79DD-4A0C1649CD06}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {3F3044DC-A9F8-DE16-79DD-4A0C1649CD06}.Debug|Any CPU.Build.0 = Debug|Any CPU + {3F3044DC-A9F8-DE16-79DD-4A0C1649CD06}.Release|Any CPU.ActiveCfg = Release|Any CPU + {3F3044DC-A9F8-DE16-79DD-4A0C1649CD06}.Release|Any CPU.Build.0 = Release|Any CPU + {AB75FE13-DB1A-4B6F-8B27-1486F98EA75C}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {AB75FE13-DB1A-4B6F-8B27-1486F98EA75C}.Debug|Any CPU.Build.0 = Debug|Any CPU + {AB75FE13-DB1A-4B6F-8B27-1486F98EA75C}.Release|Any CPU.ActiveCfg = Release|Any CPU + {AB75FE13-DB1A-4B6F-8B27-1486F98EA75C}.Release|Any CPU.Build.0 = Release|Any CPU + {57F2D1DC-DA30-40CA-AE1A-2EFD8139AF25}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {57F2D1DC-DA30-40CA-AE1A-2EFD8139AF25}.Debug|Any CPU.Build.0 = Debug|Any CPU + {57F2D1DC-DA30-40CA-AE1A-2EFD8139AF25}.Release|Any CPU.ActiveCfg = Release|Any CPU + {57F2D1DC-DA30-40CA-AE1A-2EFD8139AF25}.Release|Any CPU.Build.0 = Release|Any CPU + {2E39B0A5-2F4A-4D6E-8A0D-0366238CB21E}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {2E39B0A5-2F4A-4D6E-8A0D-0366238CB21E}.Debug|Any CPU.Build.0 = Debug|Any CPU + {2E39B0A5-2F4A-4D6E-8A0D-0366238CB21E}.Release|Any CPU.ActiveCfg = Release|Any CPU + {2E39B0A5-2F4A-4D6E-8A0D-0366238CB21E}.Release|Any CPU.Build.0 = Release|Any CPU + {BEA133F4-5686-49DF-83E4-641C26B3CC25}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {BEA133F4-5686-49DF-83E4-641C26B3CC25}.Debug|Any CPU.Build.0 = Debug|Any CPU + {BEA133F4-5686-49DF-83E4-641C26B3CC25}.Release|Any CPU.ActiveCfg = Release|Any CPU + {BEA133F4-5686-49DF-83E4-641C26B3CC25}.Release|Any CPU.Build.0 = Release|Any CPU + {F1FE4054-C44E-487F-90F9-2F111AB7BD9C}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {F1FE4054-C44E-487F-90F9-2F111AB7BD9C}.Debug|Any CPU.Build.0 = Debug|Any CPU + {F1FE4054-C44E-487F-90F9-2F111AB7BD9C}.Release|Any CPU.ActiveCfg = Release|Any CPU + {F1FE4054-C44E-487F-90F9-2F111AB7BD9C}.Release|Any CPU.Build.0 = Release|Any CPU EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE diff --git a/tools/issue-labeler/src/Azure.Sdk.Labels/Azure.Sdk.LabelTrainer.csproj b/tools/issue-labeler/src/Azure.Sdk.Labels/Azure.Sdk.LabelTrainer.csproj deleted file mode 100644 index 9d52a6772dc..00000000000 --- a/tools/issue-labeler/src/Azure.Sdk.Labels/Azure.Sdk.LabelTrainer.csproj +++ /dev/null @@ -1,16 +0,0 @@ - - - - Exe - latest - Azure.Sdk.LabelTrainer - - - - - - - - - - diff --git a/tools/issue-labeler/src/Azure.Sdk.Labels/AzureSdkCombinedLabelModelTrainer.cs b/tools/issue-labeler/src/Azure.Sdk.Labels/AzureSdkCombinedLabelModelTrainer.cs deleted file mode 100644 index 53ee623cd80..00000000000 --- a/tools/issue-labeler/src/Azure.Sdk.Labels/AzureSdkCombinedLabelModelTrainer.cs +++ /dev/null @@ -1,57 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -using System.Collections.Generic; -using System.Threading.Tasks; -using CreateMikLabelModel; -using CreateMikLabelModel.ML; - -namespace Azure.Sdk.LabelTrainer -{ - /// - /// Provides functionality related to training label models, including building and curating the - /// sets of data needed to do so. - /// - /// - public class AzureSdkCombinedLabelModelTrainer : LabelModelTrainer - { - /// The set of core Azure SDK language repositories which should be used for training the combined model. - private static readonly string[] AzureSdkLanguageRepositories = new[] - { - "Azure/azure-sdk-for-net", - "Azure/azure-sdk-for-java", - "Azure/azure-sdk-for-python", - "Azure/azure-sdk-for-js", - "Azure/azure-sdk-for-go", - "Azure/azure-sdk-for-cpp", - "Azure/azure-sdk-for-rust", - }; - - /// - /// Initializes a new instance of the class. - /// - /// - /// The logging implementation to use for emitting messages. - /// - public AzureSdkCombinedLabelModelTrainer(ILogger logger) : base("Azure/azure-sdk", logger) - { - } - - /// - /// Queries repository items for data to use for training the models. - /// - /// - /// The access token to use for the GitHub API. - /// The base path to use for storing and querying training data. - /// The processor for preparing training set items from repository issues and pull requests. - /// The set of filters to apply to data when building the training set. If not provided, training items will not be filtered. - /// - /// The set of that were produced. - /// - public override Task> QueryTrainingData( - string gitHubAccessToken, - string trainingDataBasePath, - TrainingDataProcessor processor = default, - TrainingDataFilters filters = default) => QueryTrainingData(gitHubAccessToken, trainingDataBasePath, AzureSdkLanguageRepositories, processor, filters); - } -} diff --git a/tools/issue-labeler/src/Azure.Sdk.Labels/AzureSdkTrainingDataFilters.cs b/tools/issue-labeler/src/Azure.Sdk.Labels/AzureSdkTrainingDataFilters.cs deleted file mode 100644 index 8a3f8e4f71d..00000000000 --- a/tools/issue-labeler/src/Azure.Sdk.Labels/AzureSdkTrainingDataFilters.cs +++ /dev/null @@ -1,58 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -using System; -using System.Linq; -using CreateMikLabelModel; -using CreateMikLabelModel.Models; -using Octokit; - -namespace Azure.Sdk.LabelTrainer -{ - internal class AzureSdkTrainingDataFilters : TrainingDataFilters - { - private static readonly string[] AzureSdkRequiredIssueLabelNames = new[] { "customer-reported" }; - private static readonly string[] AzureSdkRequiredPullRequestLabelNames = Array.Empty(); - - public AzureSdkTrainingDataFilters() : base( - includeIssues: true, - includePullRequests: false, - requiredIssueLabelNames: AzureSdkRequiredIssueLabelNames, - requiredPullRequestLabelNames: AzureSdkRequiredPullRequestLabelNames) - { - } - - public override bool PullRequestFilter(PullRequestWithFiles pullRequest) => false; - - public override bool IssueFilter(Issue issue) - { - var categoryCount = 0; - var serviceCount = 0; - - if (RequiredIssueLabelNames.All(required => issue.Labels.Any(label => label.Name == required))) - { - foreach (var label in issue.Labels) - { - if (AzureSdkLabel.IsServiceLabel(label)) - { - ++serviceCount; - } - - if (AzureSdkLabel.IsCategoryLabel(label)) - { - ++categoryCount; - } - } - } - - // To be eligible for the training set, the issue must have all of the required - // labels and exactly one service label and one category label. Issues that have - // multiples, even if valid, aren't appropriate for training purposes. - - return (categoryCount == 1 && serviceCount == 1); - } - - public override bool LabelFilter(Label label) => - AzureSdkLabel.IsServiceLabel(label) || AzureSdkLabel.IsCategoryLabel(label); - } -} diff --git a/tools/issue-labeler/src/Azure.Sdk.Labels/AzureSdkTrainingDataProcessor.cs b/tools/issue-labeler/src/Azure.Sdk.Labels/AzureSdkTrainingDataProcessor.cs deleted file mode 100644 index 1aa1396acf0..00000000000 --- a/tools/issue-labeler/src/Azure.Sdk.Labels/AzureSdkTrainingDataProcessor.cs +++ /dev/null @@ -1,119 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -using System.Collections.Generic; -using CreateMikLabelModel; -using CreateMikLabelModel.Models; -using Octokit; - -namespace Azure.Sdk.LabelTrainer -{ - internal class AzureSdkTrainingDataProcessor : TrainingDataProcessor - { - /// - /// Initializes a new instance of the class. - /// - /// - /// The logger to use for reporting information as items are prepared. - /// - public AzureSdkTrainingDataProcessor(ILogger logger) : base(logger) - { - } - - /// - /// Prepares training data based on repository issues, transforming it into the - /// appropriate representation. - /// - /// - /// The raw training data, in the form of repository issues. - /// The name of the repository that was the source of the training data. - /// - /// The set of instances prepared from the . - /// - public override async IAsyncEnumerable PrepareData( - IAsyncEnumerable trainingData, - string repositoryName) - { - var itemCount = 0; - - await foreach (var issue in trainingData) - { - if (issue.Labels.Count > 0) - { - foreach (var label in issue.Labels) - { - var segment = GetSegment(label); - - if (segment != null) - { - ++itemCount; - yield return new TrainingDataItem(label.Name, segment, repositoryName, issue); - } - } - } - else - { - Logger.LogWarning($"Issue: { issue.Id } has no labels and should have been filtered."); - } - } - - Logger.LogInformation($"Prepared { itemCount } training set items from issue training data."); - } - - /// - /// Prepares training data based on repository pull requests, transforming it into the - /// appropriate representation. - /// - /// - /// The raw training data, in the form of repository pull requests. - /// The name of the repository that was the source of the training data. - /// - /// The set of instances prepared from the . - /// - public override async IAsyncEnumerable PrepareData( - IAsyncEnumerable trainingData, - string repositoryName) - { - var itemCount = 0; - - await foreach (var pullRequest in trainingData) - { - if (pullRequest.PullRequest.Labels.Count > 0) - { - foreach (var label in pullRequest.PullRequest.Labels) - { - var segment = GetSegment(label); - - if (segment != null) - { - ++itemCount; - yield return new TrainingDataItem(label.Name, DefaultSegmentName, repositoryName, pullRequest); - } - } - } - else - { - Logger.LogWarning($"Pull Request: { pullRequest.PullRequest.Id } has no labels and should have been filtered."); - } - } - - Logger.LogInformation($"Prepared { itemCount } training set items from pull request training data."); - } - - /// - /// Gets the segment that the training data should be associated with. - /// - /// - /// The label to consider. - /// - /// The segment name. - /// - private string GetSegment(Label label) => label switch - { - null => null, - _ when AzureSdkLabel.IsCategoryLabel(label) => "Category", - _ when AzureSdkLabel.IsServiceLabel(label) => "Service", - _ => null - }; - } -} diff --git a/tools/issue-labeler/src/Azure.Sdk.Labels/Program.cs b/tools/issue-labeler/src/Azure.Sdk.Labels/Program.cs deleted file mode 100644 index d463bd82434..00000000000 --- a/tools/issue-labeler/src/Azure.Sdk.Labels/Program.cs +++ /dev/null @@ -1,152 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -using System; -using System.Diagnostics; -using System.IO; -using System.Threading.Tasks; -using CreateMikLabelModel; -using IssueLabeler.Shared; - -namespace Azure.Sdk.LabelTrainer -{ - /// - /// Serves as the main entry point for the application. - /// - /// - public static class Program - { - /// The file to write output to; the current directory is assumed. - private const string TraceLogFilename = "trace.log"; - - /// - /// This utility will train a set of machine learning models intended to help with prediction of the - /// labels that should be added to GitHub items for basic categorization and routing. - /// - /// - /// The full path for the repository to train. - /// The access token to use for interacting with GitHub. - /// [OPTIONAL] The directory in which to keep the data files; if not specified, the current directory will be assumed. If specified, the directory will be created if it does not exist. - /// - /// - /// - /// dotnet run -- --repository "Azure/azure-sdk-for-net" --git-hub-token "[[ TOKEN ]]" - /// - /// - /// - /// - /// - /// dotnet run -- --repository "Azure/azure-sdk-for-net" --git-hub-token "[[ TOKEN ]]" --data-file-directory "c:\data\training" - /// - /// - /// - public static async Task Main(string repository, string gitHubToken, string dataFileDirectory = default) - { - if ((string.IsNullOrEmpty(repository)) || (string.IsNullOrEmpty(gitHubToken))) - { - Console.WriteLine(""); - Console.WriteLine("The repository path and GitHub access token must be specified."); - Console.WriteLine(""); - Console.WriteLine("Usage:"); - Console.WriteLine("\tdotnet run -- --repository \"all\" --git-hub-token \"[[ TOKEN ]]\""); - Console.WriteLine("\tdotnet run -- --repository \"Azure/azure-sdk-for-net\" --git-hub-token \"[[ TOKEN ]]\""); - Console.WriteLine("\tdotnet run -- --repository \"Azure/azure-sdk-for-js\" --git-hub-token \"[[ TOKEN ]]\" --data-file-directory \"c:\\data\\training\""); - Console.WriteLine(""); - - return -1; - } - - // Ensure the path for training data. - - dataFileDirectory = string.IsNullOrEmpty(dataFileDirectory) - ? Environment.CurrentDirectory - : dataFileDirectory; - - if (!Directory.Exists(dataFileDirectory)) - { - Directory.CreateDirectory(dataFileDirectory); - } - - // Build the set of training data. - - var logger = new ConsoleLogger(); - - var trainer = repository switch - { - "all" => new AzureSdkCombinedLabelModelTrainer(logger), - _ => new LabelModelTrainer(repository, logger) - }; - - // Step 1: Download the common set of training items and use them to prepare a training data set. This will include - // all segments for the different label types needed. - - Console.ForegroundColor = ConsoleColor.Green; - Console.WriteLine(new String('=', 80)); - Console.WriteLine(" Preparing training data"); - Console.WriteLine(new String('=', 80)); - Console.ResetColor(); - - var filters = new AzureSdkTrainingDataFilters(); - var processor = new AzureSdkTrainingDataProcessor(logger); - var trainingDataFiles = await trainer.QueryTrainingData(gitHubToken, dataFileDirectory, processor, filters).ConfigureAwait(false); - - Console.ForegroundColor = ConsoleColor.Green; - Console.WriteLine(new String('=', 80)); - Console.WriteLine(" Training data preparation complete."); - Console.WriteLine(new String('=', 80)); - Console.ResetColor(); - - // Each segment will produce an dedicated set of models for that specific label type; process each separately. - - foreach (var trainingSegment in trainingDataFiles) - { - Console.WriteLine(); - Console.ForegroundColor = ConsoleColor.Green; - Console.WriteLine(new String('=', 80)); - Console.WriteLine($" Processing segment: { trainingSegment.Key }"); - Console.WriteLine(new String('=', 80)); - Console.ResetColor(); - - // Step 2: Translate the training data into - - trainer.GenerateTrainingDatasets(trainingSegment.Value); - Console.WriteLine(); - - // Step 3: Train the model. - - trainer.TrainModels(trainingSegment.Value); - Console.WriteLine(); - - // Step 4: Test the model. - - trainer.TestModels(trainingSegment.Value); - - // Provide information on where the model files are. - - Console.WriteLine(); - Console.WriteLine(); - - if (!trainingSegment.Value.Issues.SkipProcessing) - { - Console.WriteLine($"Final issue model: '{ trainingSegment.Value.Issues.FinalModelPath }'"); - } - - if (!trainingSegment.Value.PullRequests.SkipProcessing) - { - Console.WriteLine($"Final pull request model: '{ trainingSegment.Value.PullRequests.FinalModelPath }'"); - } - - Console.ForegroundColor = ConsoleColor.Green; - Console.WriteLine(new String('=', 80)); - Console.WriteLine($" Segment: { trainingSegment.Key } complete."); - Console.WriteLine(new String('=', 80)); - Console.ResetColor(); - } - - Console.WriteLine(); - Console.WriteLine(); - Console.WriteLine("==== Training complete ===="); - return 0; - } - } -} \ No newline at end of file diff --git a/tools/issue-labeler/src/CreateMikLabelModel/CreateMikLabelModel.csproj b/tools/issue-labeler/src/CreateMikLabelModel/CreateMikLabelModel.csproj deleted file mode 100644 index 8e894d7cccb..00000000000 --- a/tools/issue-labeler/src/CreateMikLabelModel/CreateMikLabelModel.csproj +++ /dev/null @@ -1,17 +0,0 @@ - - - - latest - - - - - - - - - - - - - diff --git a/tools/issue-labeler/src/CreateMikLabelModel/DL/TrainingData.cs b/tools/issue-labeler/src/CreateMikLabelModel/DL/TrainingData.cs deleted file mode 100644 index af9cf811466..00000000000 --- a/tools/issue-labeler/src/CreateMikLabelModel/DL/TrainingData.cs +++ /dev/null @@ -1,75 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -using System; -using System.Collections.Generic; -using System.IO; -using System.Linq; -using CreateMikLabelModel.Models; -using Octokit; - -namespace CreateMikLabelModel -{ - internal static class TrainingData - { - public static void WriteTrainingItems( - IEnumerable trainingItems, - StreamWriter outputWriter) - { - var ordered = trainingItems - .OrderBy(x => x.CreatedAt.UtcDateTime.ToFileTimeUtc()) //-> first by created date - .ThenBy(x => x.RepositoryName) //-> then by repo name - .ThenBy(x => x.Identifier) //-> then by issue number - .Select(x => x.Data); - - foreach (var item in ordered) - { - outputWriter.WriteLine(item); - } - } - public static void WriteHeader(StreamWriter outputWriter) - { - outputWriter.WriteLine("CombinedID\tID\tLabel\tTitle\tDescription\tAuthor\tIsPR\tFilePaths"); - } - - public static string CreateTrainingData( - string labelName, - string repositoryName, - Issue source) => GetCompressedLine(null, labelName, source.User.Login, source.Body, source.Title, source.CreatedAt, source.Id, repositoryName, false); - - public static string CreateTrainingData( - string labelName, - string repositoryName, - PullRequestWithFiles source) => GetCompressedLine(source.FilePaths, labelName, source.PullRequest.User.Login, source.PullRequest.Body, source.PullRequest.Title, source.PullRequest.CreatedAt, source.PullRequest.Id, repositoryName, true); - - public static string[] SplitFilePaths(string joinedFilePaths) => joinedFilePaths.Split(';'); - - private static string GetCompressedLine( - IEnumerable filePaths, - string label, - string author, - string body, - string title, - DateTimeOffset createdAt, - long identifier, - string repositoryName, - bool isPullRequest) - { - var createdAtTicks = createdAt.UtcDateTime.ToFileTimeUtc(); - - author ??= "ghost"; - body = (body?? string.Empty).Replace('\r', ' ').Replace('\n', ' ').Replace('\t', ' ').Replace('"', '`'); - title = title.Replace('\r', ' ').Replace('\n', ' ').Replace('\t', ' ').Replace('"', '`'); - - if (isPullRequest) - { - var filePathsJoined = string.Join(";", filePaths); - return $"{createdAtTicks},{repositoryName},{identifier}\t{identifier}\t{label}\t{title}\t{body}\t{author}\t1\t{filePathsJoined}"; - } - else - { - return $"{createdAtTicks},{repositoryName},{identifier}\t{identifier}\t{label}\t{title}\t{body}\t{author}\t0\t"; - } - } - } -} \ No newline at end of file diff --git a/tools/issue-labeler/src/CreateMikLabelModel/DL/TrainingDataClient.cs b/tools/issue-labeler/src/CreateMikLabelModel/DL/TrainingDataClient.cs deleted file mode 100644 index e34753bc08d..00000000000 --- a/tools/issue-labeler/src/CreateMikLabelModel/DL/TrainingDataClient.cs +++ /dev/null @@ -1,208 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -using System; -using System.Collections.Generic; -using System.IO; -using System.Linq; -using System.Net; -using System.Net.Http; -using System.Net.Sockets; -using System.Threading; -using System.Threading.Tasks; -using CreateMikLabelModel.Models; -using Octokit; -using Polly; - -namespace CreateMikLabelModel -{ - internal class TrainingDataClient - { - private static int s_randomSeed = Environment.TickCount; - private static readonly ThreadLocal RandomNumberGenerator = new ThreadLocal(() => new Random(Interlocked.Increment(ref s_randomSeed)), false); - - private GitHubClient _client; - private ILogger _logger; - - public TrainingDataClient(string githubAccessToken, ILogger logger) - { - _client = new GitHubClient(new ProductHeaderValue("Microsoft-ML-IssueBot", "1.0.0.0")) - { - Credentials = new Credentials(githubAccessToken) - }; - - _logger = logger; - } - - public async IAsyncEnumerable GetIssuesAsync( - IEnumerable repositories, - TrainingDataFilters filters, - DateTimeOffset? startingDate = null) - { - var retryPolicy = CreateRetryPolicy>(); - - var request = new RepositoryIssueRequest - { - Since = startingDate, - Filter = IssueFilter.All, - State = ItemStateFilter.All - }; - - if (filters.RequiredIssueLabelNames != null) - { - foreach (var requiredLabel in filters.RequiredIssueLabelNames) - { - request.Labels.Add(requiredLabel); - } - } - - var options = new ApiOptions - { - PageSize = 100 - }; - - foreach (var repository in repositories) - { - _logger.LogInformation($"Querying issues for '{ repository }'."); - - var repositoryInfo = RepositoryInformation.Parse(repository); - var issues = await retryPolicy.ExecuteAsync(() => _client.Issue.GetAllForRepository(repositoryInfo.Owner, repositoryInfo.Name, request, options)).ConfigureAwait(false); - - _logger.LogInformation($"{ issues.Count } filtered issues were found for '{ repository }' before filtering was applied."); - - foreach (var issue in issues) - { - if (filters.IssueFilter(issue)) - { - yield return issue; - } - } - } - } - - public async IAsyncEnumerable GetPullRequestsAsync( - IEnumerable repositories, - TrainingDataFilters filters, - DateTimeOffset? startingDate = null) - { - var pullRequestRetryPolicy = CreateRetryPolicy>(); - var fileRetryPolicy = CreateRetryPolicy>(); - - var request = new PullRequestRequest - { - State = ItemStateFilter.All, - SortProperty = PullRequestSort.Created, - SortDirection = SortDirection.Descending, - }; - - var options = new ApiOptions - { - PageSize = 100 - }; - - foreach (var repository in repositories) - { - _logger.LogInformation($"Querying pull requests for '{ repository }'."); - - var repositoryInfo = RepositoryInformation.Parse(repository); - var pullRequests = await pullRequestRetryPolicy.ExecuteAsync(() => _client.PullRequest.GetAllForRepository(repositoryInfo.Owner, repositoryInfo.Name, request, options)).ConfigureAwait(false); - - _logger.LogInformation($"{ pullRequests.Count } pull requests were found for '{ repository }' before filtering was applied."); - - foreach (var pullRequest in pullRequests) - { - // Pull requests can't be filtered by date, so manually scrub any earlier than - // the requested starting date. - - if ((startingDate.HasValue) && (pullRequest.CreatedAt < startingDate.Value)) - { - continue; - } - - // Pull requests can't be filtered by labels, so manually scrub any that do not - // have the required labels associated. - - if ((filters.RequiredPullRequestLabelNames is { Length: > 0 }) - && (!filters.RequiredPullRequestLabelNames.All(requiredLabel => pullRequest.Labels.Any(label => label.Name == requiredLabel)))) - { - continue; - } - - var files = await fileRetryPolicy.ExecuteAsync(() => _client.PullRequest.Files(repositoryInfo.Owner, repositoryInfo.Name, pullRequest.Number)).ConfigureAwait(false); - var pullRequestWithFiles = new PullRequestWithFiles(pullRequest, files.Select(file => file.FileName).ToArray()); - - if (filters.PullRequestFilter(pullRequestWithFiles)) - { - yield return pullRequestWithFiles; - } - } - } - } - - private static IAsyncPolicy CreateRetryPolicy(int maxRetryAttempts = 10, int defaultAbuseBackoffSeconds = 30, double exponentialBackoffSeconds = 0.8, double baseJitterSeconds = 2) => - Policy - .Handle(ex => ShouldRetry(ex)) - .WaitAndRetryAsync( - maxRetryAttempts, - attempt => CalculateRetryDelay(attempt, exponentialBackoffSeconds, baseJitterSeconds), - async (exception, attempt) => - { - var delay = exception switch - { - RateLimitExceededException rateEx => ((rateEx.Reset - DateTimeOffset.Now).Add(TimeSpan.FromSeconds(5))), - AbuseException abuseEx => TimeSpan.FromSeconds(abuseEx.RetryAfterSeconds.GetValueOrDefault(defaultAbuseBackoffSeconds)), - _ => default(TimeSpan?) - }; - - if (delay.HasValue) - { - await Task.Delay(delay.Value).ConfigureAwait(false); - } - }); - - private static TimeSpan CalculateRetryDelay(int attempt, double exponentialBackoffSeconds, double baseJitterSeconds) => - TimeSpan.FromSeconds((Math.Pow(2, attempt) * exponentialBackoffSeconds) + (RandomNumberGenerator.Value.NextDouble() * baseJitterSeconds)); - - private static bool ShouldRetry(Exception ex) => ((IsRetriableException(ex)) || (IsRetriableException(ex?.InnerException))); - - private static bool IsRetriableException(Exception ex) - { - if (ex == null) - { - return false; - } - - switch (ex) - { - case AbuseException _: - case RateLimitExceededException _: - case TimeoutException _: - case TaskCanceledException _: - case OperationCanceledException _: - case WebException _: - case SocketException _: - case IOException _: - return true; - - case HttpRequestException requestEx: - return IsRetriableStatus(requestEx.StatusCode); - - case ApiException apiEx: - return IsRetriableStatus(apiEx.StatusCode); - - default: - return false; - }; - } - - private static bool IsRetriableStatus(HttpStatusCode? statusCode) => - ((statusCode == null) - || (statusCode == HttpStatusCode.Unauthorized) - || (statusCode == ((HttpStatusCode)408)) - || (statusCode == HttpStatusCode.Conflict) - || (statusCode == ((HttpStatusCode)429)) - || (statusCode == HttpStatusCode.InternalServerError) - || (statusCode == HttpStatusCode.ServiceUnavailable) - || (statusCode == HttpStatusCode.GatewayTimeout)); - } -} diff --git a/tools/issue-labeler/src/CreateMikLabelModel/DL/TrainingDataFilters.cs b/tools/issue-labeler/src/CreateMikLabelModel/DL/TrainingDataFilters.cs deleted file mode 100644 index eff66c42ce2..00000000000 --- a/tools/issue-labeler/src/CreateMikLabelModel/DL/TrainingDataFilters.cs +++ /dev/null @@ -1,106 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -using CreateMikLabelModel.Models; -using Octokit; - -namespace CreateMikLabelModel -{ - /// - /// The set of filters to apply when creating a training set. - /// - /// - public class TrainingDataFilters - { - /// - /// The set of names identifying labels which must all be present on an issue - /// for it to be included in the training set. - /// - /// - /// If null or empty, issues will not require any specific labels. - /// - public string[] RequiredIssueLabelNames { get; init; } - - /// - /// The set of names identifying labels which must all be present on a pull request - /// for it to be included in the training set. - /// - /// - /// If null or empty, pull requests will not require any specific labels. - /// - public string[] RequiredPullRequestLabelNames { get; init; } - - /// - /// Indicates whether or not issues should be included in the - /// training set. If included, the will be applied - /// to each issue for individual consideration. - /// - /// - /// true to include issues; otherwise, false. - /// - public bool IncludeIssues { get; init; } - - /// - /// Indicates whether or not pull requests should be included in the - /// training set. If included, the will be applied - /// to each issue for individual consideration. - /// - /// - /// true to include issues; otherwise, false. - /// - public bool IncludePullRequests { get; init; } - - /// - /// Initializes a new instance of the class. - /// - /// - /// A flag indicating whether or not issues should be included in the training set. - /// A flag indicating whether or not pull requests should be included in the training set. - /// The set of names identifying labels which all must be present on an issue for it to be included in the training set. - /// The set of names identifying labels which all must be present on a pull request for it to be included in the training set. - /// - public TrainingDataFilters( - bool includeIssues = true, - bool includePullRequests = true, - string[] requiredIssueLabelNames = default, - string[] requiredPullRequestLabelNames = default) - { - IncludeIssues = includeIssues; - IncludePullRequests = includePullRequests; - RequiredIssueLabelNames = requiredIssueLabelNames; - RequiredPullRequestLabelNames = requiredPullRequestLabelNames; - } - - /// - /// A filter applied to the issues under consideration for use in the training set. The filter - /// is only considered if is set. - /// - /// - /// The issue to consider. - /// - /// true if the should be included in the training set; otherwise, false. - /// - public virtual bool IssueFilter(Issue issue) => true; - - /// - /// A filter applied to the pull requests under consideration for use in the training set. The - /// filter is only considered if is set. - /// - /// - /// The pull request to consider. - /// - /// true if the should be included in the training set; otherwise, false. - /// - public virtual bool PullRequestFilter(PullRequestWithFiles pullRequest) => true; - - /// - /// A filter applied to the labels under consideration for use in the training set. - /// - /// - /// The label to consider. - /// - /// true if the should be included in the training set; otherwise, false. - /// - public virtual bool LabelFilter(Label label) => true; - } -} diff --git a/tools/issue-labeler/src/CreateMikLabelModel/DL/TrainingDataProcessor.cs b/tools/issue-labeler/src/CreateMikLabelModel/DL/TrainingDataProcessor.cs deleted file mode 100644 index cdbd8935061..00000000000 --- a/tools/issue-labeler/src/CreateMikLabelModel/DL/TrainingDataProcessor.cs +++ /dev/null @@ -1,106 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -using System.Collections.Generic; -using CreateMikLabelModel.Models; -using Octokit; - -namespace CreateMikLabelModel -{ - /// - /// The processor responsible for preparing training set items from the - /// raw repository data. - /// - /// - public class TrainingDataProcessor - { - /// The name of the default training segment. - public const string DefaultSegmentName = "Default"; - - /// - /// The logger to use for reporting information as items are prepared. - /// - /// - protected ILogger Logger { get; init; } - - /// - /// Initializes a new instance of the class. - /// - /// - /// The logger to use for reporting information as items are prepared. - /// - public TrainingDataProcessor(ILogger logger) => Logger = logger; - - /// - /// Prepares training data based on repository issues, transforming it into the - /// appropriate representation. - /// - /// - /// The raw training data, in the form of repository issues. - /// The name of the repository that was the source of the training data. - /// - /// The set of instances prepared from the . - /// - public virtual async IAsyncEnumerable PrepareData( - IAsyncEnumerable trainingData, - string repositoryName) - { - var itemCount = 0; - - await foreach (var issue in trainingData) - { - if (issue.Labels.Count > 0) - { - foreach (var label in issue.Labels) - { - ++itemCount; - yield return new TrainingDataItem(label.Name, DefaultSegmentName, repositoryName, issue); - } - } - else - { - ++itemCount; - yield return new TrainingDataItem(null, DefaultSegmentName, repositoryName, issue); - } - } - - Logger.LogInformation($"Prepared { itemCount } training set items from issue training data."); - } - - /// - /// Prepares training data based on repository pull requests, transforming it into the - /// appropriate representation. - /// - /// - /// The raw training data, in the form of repository pull requests. - /// The name of the repository that was the source of the training data. - /// - /// The set of instances prepared from the . - /// - public virtual async IAsyncEnumerable PrepareData( - IAsyncEnumerable trainingData, - string repositoryName) - { - var itemCount = 0; - - await foreach (var pullRequest in trainingData) - { - if (pullRequest.PullRequest.Labels.Count > 0) - { - foreach (var label in pullRequest.PullRequest.Labels) - { - ++itemCount; - yield return new TrainingDataItem(label.Name, DefaultSegmentName, repositoryName, pullRequest); - } - } - else - { - ++itemCount; - yield return new TrainingDataItem(null, DefaultSegmentName, repositoryName, pullRequest); - } - } - - Logger.LogInformation($"Prepared { itemCount } training set items from pull request training data."); - } - } -} diff --git a/tools/issue-labeler/src/CreateMikLabelModel/LabelModelTrainer.cs b/tools/issue-labeler/src/CreateMikLabelModel/LabelModelTrainer.cs deleted file mode 100644 index d62b7e0b155..00000000000 --- a/tools/issue-labeler/src/CreateMikLabelModel/LabelModelTrainer.cs +++ /dev/null @@ -1,360 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -using System; -using System.Collections.Generic; -using System.Diagnostics; -using System.IO; -using System.Linq; -using System.Threading.Tasks; -using CreateMikLabelModel.ML; -using CreateMikLabelModel.Models; - -namespace CreateMikLabelModel -{ - /// - /// Provides functionality related to training label models, including building anc curating the - /// sets of data needed to do so. - /// - /// - public class LabelModelTrainer - { - private ILogger _logger; - - /// - /// The repository that the trainer is associated with. - /// - /// - /// - /// The full path of the repository, including the owner name. For - /// example, "Azure/azure-sdk-for-net". - /// - /// - public string RepositoryPath { get; init; } - - /// - /// Initializes a new instance of the class. - /// - /// - /// The repository path to associate the training with. - /// The logging implementation to use for emitting messages. - /// - public LabelModelTrainer(string repositoryPath, ILogger logger) - { - _logger = logger ?? throw new ArgumentNullException(nameof(logger)); - - if (string.IsNullOrWhiteSpace(repositoryPath)) - { - throw new ArgumentNullException(nameof(repositoryPath)); - } - - RepositoryPath = repositoryPath; - } - - /// - /// Queries repository items for data to use for training the models. - /// - /// - /// The access token to use for the GitHub API. - /// The base path to use for storing and querying training data. - /// The processor for preparing training set items from repository issues and pull requests. - /// The set of filters to apply to data when building the training set. If not provided, training items will not be filtered. - /// - /// The set of that were produced. - /// - public virtual Task> QueryTrainingData( - string gitHubAccessToken, - string trainingDataBasePath, - TrainingDataProcessor processor = default, - TrainingDataFilters filters = default) => QueryTrainingData(gitHubAccessToken, trainingDataBasePath, new[] { RepositoryPath }, processor, filters); - - /// - /// Queries repository items for data to use for training the models. - /// - /// - /// The access token to use for the GitHub API. - /// The base path to use for storing and querying training data. - /// The group of repositories to include in this training set. - /// The processor for preparing training set items from repository issues and pull requests. - /// The set of filters to apply to data when building the training set. If not provided, training items will not be filtered. - /// - /// The set of that were produced. - /// - public virtual async Task> QueryTrainingData( - string gitHubAccessToken, - string trainingDataBasePath, - string[] trainingRepositoryGroup, - TrainingDataProcessor processor = default, - TrainingDataFilters filters = default) - { - if (gitHubAccessToken is { Length: 0 }) - { - throw new ArgumentException("GitHub access token is required.", nameof(gitHubAccessToken)); - } - - if (trainingDataBasePath is { Length: 0 }) - { - throw new ArgumentException("The base path for storing training data is required.", nameof(gitHubAccessToken)); - } - - if ((!Directory.Exists(trainingDataBasePath)) || (!ValidateWriteAccess(trainingDataBasePath))) - { - throw new ArgumentException("Either the directory does not exist or cannot be written to.", nameof(trainingDataBasePath)); - } - - if (trainingRepositoryGroup is { Length: 0 }) - { - throw new ArgumentException("The repository group is required and should contain at least one item.", nameof(trainingRepositoryGroup)); - } - - // If no explicit processor or filters were requested, accept all items as valid for the training set. - - processor ??= new TrainingDataProcessor(_logger); - filters ??= new TrainingDataFilters(); - - _logger.LogInformation($"Preparing the training set for '{ RepositoryPath }'."); - - var stopWatch = Stopwatch.StartNew(); - var trainingSetItemCount = 0; - var repositoryInformation = RepositoryInformation.Parse(RepositoryPath); - var trainingItemClient = new TrainingDataClient(gitHubAccessToken, _logger); - var trainingItems = new Dictionary>(); - - try - { - // Process issues, if they are to be included. - - if (filters.IncludeIssues) - { - await foreach (var trainingItem in processor.PrepareData(trainingItemClient.GetIssuesAsync(trainingRepositoryGroup, filters), repositoryInformation.Name)) - { - if (!trainingItems.ContainsKey(trainingItem.SegmentName)) - { - trainingItems.Add(trainingItem.SegmentName, new List()); - } - - trainingItems[trainingItem.SegmentName].Add(trainingItem); - ++trainingSetItemCount; - } - } - - // Process pull requests, if they are to be included. - - if (filters.IncludePullRequests) - { - await foreach (var trainingItem in processor.PrepareData(trainingItemClient.GetPullRequestsAsync(trainingRepositoryGroup, filters), repositoryInformation.Name)) - { - if (!trainingItems.ContainsKey(trainingItem.SegmentName)) - { - trainingItems.Add(trainingItem.SegmentName, new List()); - } - - trainingItems[trainingItem.SegmentName].Add(trainingItem); - ++trainingSetItemCount; - } - } - } - catch (Exception ex) - { - throw new ApplicationException("The training set was not able to be successfully prepared.", ex); - } - - stopWatch.Stop(); - _logger.LogInformation($"Done downloading data for training items in {stopWatch.Elapsed.TotalSeconds:0.00} seconds."); - - // With the data downloaded and prepared, write the training set data for each segment. - - _logger.LogInformation($"Writing out training data files for '{ RepositoryPath }'."); - - stopWatch.Restart(); - var trainingFiles = new Dictionary(trainingItems.Keys.Count); - - try - { - foreach (var segment in trainingItems) - { - var segmentFiles = CreateTrainingFilesForSegment(repositoryInformation, segment.Key, trainingDataBasePath, filters); - trainingFiles.Add(segment.Key, segmentFiles); - - using var outputWriter = new StreamWriter(segmentFiles.Issues.InputPath); - TrainingData.WriteHeader(outputWriter); - TrainingData.WriteTrainingItems(segment.Value, outputWriter); - } - } - catch (Exception ex) - { - throw new ApplicationException("The training data files were not able to be successfully written.", ex); - } - - stopWatch.Stop(); - _logger.LogInformation($"Done writing training data files in {stopWatch.Elapsed.TotalSeconds:0.00} seconds."); - - // Return the segments and associated files. - - return trainingFiles; - } - - /// - /// Generates the training datasets for issues and pull requests, writing out - /// the necessary files to the paths specified by the . - /// - /// - /// The locations of the files, both input and output, associated with training datasets. - /// - public void GenerateTrainingDatasets(TrainingDataSegment trainingFiles) - { - // Generate the dataset for issues. - - _logger.LogInformation("Generating the datasets for issues..."); - - var stopWatch = Stopwatch.StartNew(); - - if (!trainingFiles.Issues.SkipProcessing) - { - var issueData = TrainingDataset.ProcessIssueTrainingData(trainingFiles.Issues.InputPath).ToArray(); - - // There is always a header line present; if there are no other lines, then there was no - // issue data. - - if (issueData.Length > 1) - { - TrainingDataset.WriteDataset(trainingFiles.Issues, issueData); - _logger.LogInformation($"{ issueData.Length } issues were included in the datasets."); - } - else - { - _logger.LogInformation("No issue data was available for use in the datasets."); - } - } - else - { - _logger.LogInformation("Issues were configured to be excluded from the datasets; no issue data was used."); - } - - stopWatch.Stop(); - _logger.LogInformation($"Issue datasets are complete in {stopWatch.Elapsed.TotalSeconds:0.00} seconds."); - - // Generate the dataset for pull requests. - - _logger.LogInformation("Generating the datasets for pull requests..."); - stopWatch.Restart(); - - if (!trainingFiles.PullRequests.SkipProcessing) - { - var pullRequestData = TrainingDataset.ProcessPullRequestTrainingData(trainingFiles.PullRequests.InputPath).ToArray(); - - // There is always a header line present; if there are no other lines, then there was no - // pull request data. - - if (pullRequestData.Length > 1) - { - TrainingDataset.WriteDataset(trainingFiles.PullRequests, pullRequestData); - _logger.LogInformation($"{ pullRequestData.Length } pull requests were included in the datasets."); - } - else - { - _logger.LogInformation("No pull request data was available for use in the datasets."); - } - } - else - { - _logger.LogInformation("Pull requests were configured to be excluded from the datasets; no pull request data was used."); - } - - stopWatch.Stop(); - _logger.LogInformation($"Pull request datasets are complete in {stopWatch.Elapsed.TotalSeconds:0.00} seconds."); - } - - /// - /// Trains the machine learning models, using the previously prepared training datasets - /// identified by the paths specified in the specified. - /// - /// - /// The locations of the files for training datasets to be used for training the ML models. - /// - public void TrainModels(TrainingDataSegment trainingFiles) - { - var mlHelper = new MLHelper(_logger); - var stopWatch = Stopwatch.StartNew(); - - if (!trainingFiles.Issues.SkipProcessing) - { - _logger.LogInformation("Training the models for issues..."); - mlHelper.Train(trainingFiles.Issues, false); - } - else - { - _logger.LogInformation("Issues were configured to be excluded from the training; no issue data trained."); - } - - if (!trainingFiles.PullRequests.SkipProcessing) - { - _logger.LogInformation("Training the models for pull requests..."); - mlHelper.Train(trainingFiles.PullRequests, true); - } - else - { - _logger.LogInformation("Pull requests were configured to be excluded from the training; no pull request data was trained."); - } - - stopWatch.Stop(); - _logger.LogInformation($"Model training complete in {stopWatch.Elapsed.TotalSeconds:0.00} seconds."); - } - - /// - /// Tests the previously trained machine learning models identified by the paths specified in - /// the specified. - /// - /// - /// The locations of the files for training datasets to be used for training the ML models. - /// - public void TestModels(TrainingDataSegment trainingFiles) - { - var mlHelper = new MLHelper(_logger); - var stopWatch = Stopwatch.StartNew(); - - if (!trainingFiles.Issues.SkipProcessing) - { - _logger.LogInformation("Testing the models for issues..."); - mlHelper.Test(trainingFiles.Issues, false); - } - - if (!trainingFiles.PullRequests.SkipProcessing) - { - _logger.LogInformation("Testing the models for pull requests..."); - mlHelper.Test(trainingFiles.PullRequests, true); - } - - stopWatch.Stop(); - _logger.LogInformation($"Model testing complete in {stopWatch.Elapsed.TotalSeconds:0.00} seconds."); - } - - private static bool ValidateWriteAccess(string path) - { - try - { - using var file = File.Create(Path.Combine(path, Path.GetRandomFileName()), 1, FileOptions.DeleteOnClose); - file.Close(); - - return true; - } - catch (UnauthorizedAccessException) - { - return false; - } - } - - private static TrainingDataSegment CreateTrainingFilesForSegment( - RepositoryInformation repository, - string segmentName, - string trainingDataBasePath, - TrainingDataFilters filters) - { - var prefix = $"{ repository.Owner }-{ repository.Name }-{segmentName }"; - - return new TrainingDataSegment( - new TrainingDataFilePaths(trainingDataBasePath, prefix, forPrs: false, skip: !filters.IncludeIssues), - new TrainingDataFilePaths(trainingDataBasePath, prefix, forPrs: true, skip: !filters.IncludePullRequests)); - } - } -} diff --git a/tools/issue-labeler/src/CreateMikLabelModel/Logging/ConsoleLogger.cs b/tools/issue-labeler/src/CreateMikLabelModel/Logging/ConsoleLogger.cs deleted file mode 100644 index 05b1b2e2fc3..00000000000 --- a/tools/issue-labeler/src/CreateMikLabelModel/Logging/ConsoleLogger.cs +++ /dev/null @@ -1,38 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -using System; - -namespace CreateMikLabelModel -{ - /// - /// Logs information to the using the - /// standard streams. - /// - /// - public class ConsoleLogger : ILogger - { - /// - /// Logs an informational message. - /// - /// - /// The message to log. - /// - public void LogInformation(string message) => Console.WriteLine(message); - - /// - /// Logs a warning message. - /// - /// - /// The message to log. - /// - public void LogWarning(string message) - { - var color = Console.ForegroundColor; - - Console.ForegroundColor = ConsoleColor.Yellow; - Console.WriteLine(message); - Console.ForegroundColor = color; - } - } -} diff --git a/tools/issue-labeler/src/CreateMikLabelModel/Logging/ILogger.cs b/tools/issue-labeler/src/CreateMikLabelModel/Logging/ILogger.cs deleted file mode 100644 index 2285c24f616..00000000000 --- a/tools/issue-labeler/src/CreateMikLabelModel/Logging/ILogger.cs +++ /dev/null @@ -1,28 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -namespace CreateMikLabelModel -{ - /// - /// Allows messages of different categories to be logged. - /// - /// - public interface ILogger - { - /// - /// Logs an informational message. - /// - /// - /// The message to log. - /// - void LogInformation(string message); - - /// - /// Logs a warning message. - /// - /// - /// The message to log. - /// - void LogWarning(string message); - } -} diff --git a/tools/issue-labeler/src/CreateMikLabelModel/ML/ExperimentModifier.cs b/tools/issue-labeler/src/CreateMikLabelModel/ML/ExperimentModifier.cs deleted file mode 100644 index ced65e65731..00000000000 --- a/tools/issue-labeler/src/CreateMikLabelModel/ML/ExperimentModifier.cs +++ /dev/null @@ -1,91 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -using Microsoft.ML.AutoML; -using System; -using System.Collections.Generic; - -namespace CreateMikLabelModel.ML -{ - public struct ExperimentModifier - { - public ExperimentModifier(TrainingDataFilePaths paths, bool forPrs) - { - // set all to defaults: - ColumnSetup = (columnInformation, forPrs) => - { - // Customize column information returned by InferColumns API - columnInformation.CategoricalColumnNames.Clear(); - columnInformation.NumericColumnNames.Clear(); - columnInformation.IgnoredColumnNames.Clear(); - columnInformation.TextColumnNames.Clear(); - - // NOTE: depending on how the data changes over time this might need to get updated too. - // Only the Title and Description are needed, but since we are PreFeaturizing them we can - // ignore them here. - columnInformation.IgnoredColumnNames.Add("Title"); - columnInformation.IgnoredColumnNames.Add("Description"); - columnInformation.IgnoredColumnNames.Add("Author"); - columnInformation.IgnoredColumnNames.Add("IsPR"); - columnInformation.IgnoredColumnNames.Add("NumMentions"); - columnInformation.IgnoredColumnNames.Add("UserMentions"); - columnInformation.IgnoredColumnNames.Add("ID"); - columnInformation.IgnoredColumnNames.Add("CombinedID"); - - if (forPrs) - { - columnInformation.NumericColumnNames.Add("FileCount"); - columnInformation.IgnoredColumnNames.Add("Files"); - columnInformation.TextColumnNames.Add("FolderNames"); - columnInformation.IgnoredColumnNames.Add("Folders"); - columnInformation.IgnoredColumnNames.Add("FileExtensions"); - columnInformation.TextColumnNames.Add("Filenames"); - } - }; - - TrainerSetup = (trainers) => - { - trainers.Clear(); - if (forPrs) - { - trainers.Add(MulticlassClassificationTrainer.SdcaMaximumEntropy); - trainers.Add(MulticlassClassificationTrainer.FastTreeOva); - } - else - { - trainers.Add(MulticlassClassificationTrainer.SdcaMaximumEntropy); - // trainers.Add(MulticlassClassificationTrainer.LinearSupportVectorMachinesOva); - //trainers.Add(MulticlassClassificationTrainer.LightGbm); - } - }; - - ExperimentTime = 300; - LabelColumnName = "Label"; - ForPrs = forPrs; - Paths = paths; - } - - public ExperimentModifier( - bool forPrs, - uint experimentTime, - string labelColumnName, - TrainingDataFilePaths paths, - Action columnSetup, - Action> trainerSetup) - { - ForPrs = forPrs; - ExperimentTime = experimentTime; - LabelColumnName = labelColumnName; - Paths = paths; - ColumnSetup = columnSetup; - TrainerSetup = trainerSetup; - } - - public readonly uint ExperimentTime; - public readonly string LabelColumnName; - public readonly Action ColumnSetup; - public readonly Action> TrainerSetup; - public readonly bool ForPrs; - public readonly TrainingDataFilePaths Paths; - } -} diff --git a/tools/issue-labeler/src/CreateMikLabelModel/ML/LoggingHelper.cs b/tools/issue-labeler/src/CreateMikLabelModel/ML/LoggingHelper.cs deleted file mode 100644 index ffaa7b9fad1..00000000000 --- a/tools/issue-labeler/src/CreateMikLabelModel/ML/LoggingHelper.cs +++ /dev/null @@ -1,177 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -using Microsoft.ML.AutoML; -using Microsoft.ML.Data; -using System; -using System.Collections.Generic; -using System.Diagnostics; -using System.Linq; -using System.Text; - -namespace CreateMikLabelModel.ML -{ - internal class LoggingHelper - { - private const int Width = 114; - private readonly ILogger _logger; - - public LoggingHelper(ILogger logger) => _logger = logger; - - internal void PrintIterationMetrics(int iteration, string trainerName, MulticlassClassificationMetrics metrics, double? runtimeInSeconds) - { - PrintRow($"{iteration,-4} {trainerName,-35} {metrics?.MicroAccuracy ?? double.NaN,14:F4} {metrics?.MacroAccuracy ?? double.NaN,14:F4} {runtimeInSeconds.Value,9:F1}", Width); - } - - internal void PrintIterationException(Exception ex) - { - _logger.LogInformation($"Exception during AutoML iteration: {ex}"); - } - - internal void PrintMulticlassClassificationMetricsHeader() - { - PrintRow($"{"",-4} {"Trainer",-35} {"MicroAccuracy",14} {"MacroAccuracy",14} {"Duration",9}", Width); - } - - private void PrintRow(string message, int width) - { - _logger.LogInformation("|" + message.PadRight(width - 2) + "|"); - } - - public void ConsoleWriteHeader(params string[] lines) - { - _logger.LogInformation(" "); - foreach (var line in lines) - { - _logger.LogInformation(line); - } - var maxLength = lines.Select(x => x.Length).Max(); - _logger.LogInformation(new string('#', maxLength)); - } - - public static string BuildStringTable(IList arrValues) - { - var maxColumnsWidth = GetMaxColumnsWidth(arrValues); - var headerSpliter = new string('-', maxColumnsWidth.Sum(i => i + 3) - 1); - - var sb = new StringBuilder(); - for (var rowIndex = 0; rowIndex < arrValues.Count; rowIndex++) - { - if (rowIndex == 0) - { - sb.AppendFormat(" {0} ", headerSpliter); - sb.AppendLine(); - } - - for (var colIndex = 0; colIndex < arrValues[0].Length; colIndex++) - { - // Print cell - var cell = arrValues[rowIndex][colIndex]; - cell = cell.PadRight(maxColumnsWidth[colIndex]); - sb.Append(" | "); - sb.Append(cell); - } - - // Print end of line - sb.Append(" | "); - sb.AppendLine(); - - // Print splitter - if (rowIndex == 0) - { - sb.AppendFormat(" |{0}| ", headerSpliter); - sb.AppendLine(); - } - - if (rowIndex == arrValues.Count - 1) - { - sb.AppendFormat(" {0} ", headerSpliter); - } - } - - return sb.ToString(); - } - - private static int[] GetMaxColumnsWidth(IList arrValues) - { - var maxColumnsWidth = new int[arrValues[0].Length]; - for (var colIndex = 0; colIndex < arrValues[0].Length; colIndex++) - { - for (var rowIndex = 0; rowIndex < arrValues.Count; rowIndex++) - { - var newLength = arrValues[rowIndex][colIndex].Length; - var oldLength = maxColumnsWidth[colIndex]; - - if (newLength > oldLength) - { - maxColumnsWidth[colIndex] = newLength; - } - } - } - - return maxColumnsWidth; - } - - private class ColumnInferencePrinter - { - private static readonly string[] TableHeaders = new[] { "Name", "Data Type", "Purpose" }; - - private readonly ColumnInferenceResults _results; - - public ColumnInferencePrinter(ColumnInferenceResults results) - { - _results = results; - } - - public void Print() - { - var tableRows = new List(); - - // Add headers - tableRows.Add(TableHeaders); - - // Add column data - var info = _results.ColumnInformation; - AppendTableRow(tableRows, info.LabelColumnName, "Label"); - AppendTableRow(tableRows, info.ExampleWeightColumnName, "Weight"); - AppendTableRow(tableRows, info.SamplingKeyColumnName, "Sampling Key"); - AppendTableRows(tableRows, info.CategoricalColumnNames, "Categorical"); - AppendTableRows(tableRows, info.NumericColumnNames, "Numeric"); - AppendTableRows(tableRows, info.TextColumnNames, "Text"); - AppendTableRows(tableRows, info.IgnoredColumnNames, "Ignored"); - - Console.WriteLine(LoggingHelper.BuildStringTable(tableRows)); - } - - private void AppendTableRow(ICollection tableRows, - string columnName, string columnPurpose) - { - if (columnName == null) - { - return; - } - - tableRows.Add(new[] - { - columnName, - GetColumnDataType(columnName), - columnPurpose - }); - } - - private void AppendTableRows(ICollection tableRows, - IEnumerable columnNames, string columnPurpose) - { - foreach (var columnName in columnNames) - { - AppendTableRow(tableRows, columnName, columnPurpose); - } - } - - private string GetColumnDataType(string columnName) - { - return _results.TextLoaderOptions.Columns.First(c => c.Name == columnName).DataKind.ToString(); - } - } - } -} diff --git a/tools/issue-labeler/src/CreateMikLabelModel/ML/MLHelper.cs b/tools/issue-labeler/src/CreateMikLabelModel/ML/MLHelper.cs deleted file mode 100644 index af75bf252d6..00000000000 --- a/tools/issue-labeler/src/CreateMikLabelModel/ML/MLHelper.cs +++ /dev/null @@ -1,61 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -using Microsoft.ML; -using Microsoft.ML.Data; -using System; -using System.Diagnostics; - -namespace CreateMikLabelModel.ML -{ - public class MLHelper - { - private readonly MLContext _mLContext; - private readonly ILogger _logger; - - public MLHelper(ILogger logger) - { - _mLContext = new MLContext(seed: 0); - _logger = logger; - } - - public void Test(TrainingDataFilePaths files, bool forPrs) - { - MulticlassExperimentHelper.TestPrediction(_logger, _mLContext, files, forPrs: forPrs); - } - - public void Train(TrainingDataFilePaths files, bool forPrs) - { - var stopWatch = Stopwatch.StartNew(); - - var st = new ExperimentModifier(files, forPrs); - Train(st); - - stopWatch.Stop(); - _logger.LogInformation($"Done creating model in {stopWatch.ElapsedMilliseconds}ms"); - } - - private void Train(ExperimentModifier settings) - { - var setup = MulticlassExperimentSettingsHelper.SetupExperiment(_logger, _mLContext, settings, settings.Paths, settings.ForPrs); - - // Start experiment - var textLoader = _mLContext.Data.CreateTextLoader(setup.columnInference.TextLoaderOptions); - var paths = settings.Paths; - - // train once: - var experimentResult = MulticlassExperimentHelper.Train( - _logger, _mLContext, setup.experimentSettings, new MulticlassExperimentProgressHandler(_logger), paths, textLoader, setup.columnInference); - - // train twice - _ = MulticlassExperimentHelper.Retrain(experimentResult, - "refit model", - new MultiFileSource(paths.TrainPath, paths.ValidatePath), - paths.ValidatePath, - paths.FittedModelPath, textLoader, _logger, _mLContext); - - // final train: - _ = MulticlassExperimentHelper.Retrain(_logger, _mLContext, experimentResult, setup.columnInference, paths); - } - } -} diff --git a/tools/issue-labeler/src/CreateMikLabelModel/ML/MulticlassExperimentHelper.cs b/tools/issue-labeler/src/CreateMikLabelModel/ML/MulticlassExperimentHelper.cs deleted file mode 100644 index d0fb768866a..00000000000 --- a/tools/issue-labeler/src/CreateMikLabelModel/ML/MulticlassExperimentHelper.cs +++ /dev/null @@ -1,248 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -using System; -using System.Collections.Generic; -using System.Linq; -using IssueLabeler.Shared; -using Microsoft.ML; -using Microsoft.ML.AutoML; -using Microsoft.ML.Data; -using Microsoft.ML.Transforms.Text; - -namespace CreateMikLabelModel.ML -{ - public static class MulticlassExperimentHelper - { - public static ExperimentResult RunAutoMLExperiment( - ILogger logger, MLContext mlContext, MulticlassExperimentSettings experimentSettings, - MulticlassExperimentProgressHandler progressHandler, IDataView dataView, ColumnInferenceResults columnInference) - { - new LoggingHelper(logger).ConsoleWriteHeader("=============== Running AutoML experiment ==============="); - logger.LogInformation($"Running AutoML multiclass classification experiment for {experimentSettings.MaxExperimentTimeInSeconds} seconds..."); - - // Pre-featurize the title and description, and remove features that have less then 2. - IEstimator preFeaturizer = - preFeaturizer = mlContext.Transforms.Text.FeaturizeText("TextFeatures", - new TextFeaturizingEstimator.Options(), - new[] { "Title", "Description" }) - .Append(mlContext.Transforms.FeatureSelection.SelectFeaturesBasedOnCount("TextFeatures", "TextFeatures", 2)) - .AppendCacheCheckpoint(mlContext); - - var experimentResult = mlContext.Auto() - .CreateMulticlassClassificationExperiment(experimentSettings) - .Execute(dataView, columnInference.ColumnInformation, progressHandler: progressHandler, preFeaturizer: preFeaturizer); - - logger.LogInformation(Environment.NewLine); - logger.LogInformation($"num models created: {experimentResult.RunDetails.Count()}"); - - // Get top few runs ranked by accuracy - var topRuns = experimentResult.RunDetails - .Where(r => r.ValidationMetrics != null && !double.IsNaN(r.ValidationMetrics.MicroAccuracy)) - .OrderByDescending(r => r.ValidationMetrics.MicroAccuracy) - .Take(3) - .ToArray(); - - logger.LogInformation("Top models ranked by accuracy --"); - logger.LogInformation(CreateRow($"{"",-4} {"Trainer",-35} {"MicroAccuracy",14} {"MacroAccuracy",14} {"Duration",9}", Width)); - - for (var i = 0; i < topRuns.Length; i++) - { - var run = topRuns[i]; - logger.LogInformation(CreateRow($"{i,-4} {run.TrainerName,-35} {run.ValidationMetrics?.MicroAccuracy ?? double.NaN,14:F4} {run.ValidationMetrics?.MacroAccuracy ?? double.NaN,14:F4} {run.RuntimeInSeconds,9:F1}", Width)); - } - return experimentResult; - } - - public static ExperimentResult Train( - ILogger logger, MLContext mlContext, MulticlassExperimentSettings experimentSettings, - MulticlassExperimentProgressHandler progressHandler, TrainingDataFilePaths paths, TextLoader textLoader, ColumnInferenceResults columnInference) - { - var data = mlContext.Data.TrainTestSplit(textLoader.Load(paths.TrainPath, paths.ValidatePath), seed: 0); - var experimentResult = RunAutoMLExperiment(logger, mlContext, experimentSettings, progressHandler, data.TrainSet, columnInference); - - EvaluateTrainedModelAndPrintMetrics(logger, mlContext, experimentResult.BestRun.Model, experimentResult.BestRun.TrainerName, data.TestSet); - SaveModel(logger, mlContext, experimentResult.BestRun.Model, paths.ModelPath, data.TrainSet); - return experimentResult; - } - - public static ITransformer Retrain(ExperimentResult experimentResult, - string trainerName, MultiFileSource multiFileSource, string dataPath, string modelPath, TextLoader textLoader, ILogger logger, MLContext mlContext) - { - var dataView = textLoader.Load(dataPath); - new LoggingHelper(logger).ConsoleWriteHeader("=============== Re-fitting best pipeline ==============="); - - var combinedDataView = textLoader.Load(multiFileSource); - var bestRun = experimentResult.BestRun; - var refitModel = bestRun.Estimator.Fit(combinedDataView); - - EvaluateTrainedModelAndPrintMetrics(logger, mlContext, refitModel, trainerName, dataView); - SaveModel(logger, mlContext, refitModel, modelPath, dataView); - return refitModel; - } - - public static ITransformer Retrain(ILogger logger, MLContext mlContext, ExperimentResult experimentResult, - ColumnInferenceResults columnInference, TrainingDataFilePaths paths, bool fixedBug = false) - { - new LoggingHelper(logger).ConsoleWriteHeader("=============== Re-fitting best pipeline ==============="); - - var textLoader = mlContext.Data.CreateTextLoader(columnInference.TextLoaderOptions); - var combinedDataView = textLoader.Load(new MultiFileSource(paths.TrainPath, paths.ValidatePath, paths.TestPath)); - var bestRun = experimentResult.BestRun; - if (fixedBug) - { - // TODO: retry: below gave error but I thought it would work: - //refitModel = MulticlassExperiment.Retrain(experimentResult, - // "final model", - // new MultiFileSource(paths.TrainPath, paths.ValidatePath, paths.FittedPath), - // paths.TestPath, - // paths.FinalPath, textLoader, mlContext); - // but if failed before fixing this maybe the problem was in *EvaluateTrainedModelAndPrintMetrics* - - } - var refitModel = bestRun.Estimator.Fit(combinedDataView); - - EvaluateTrainedModelAndPrintMetrics(logger, mlContext, refitModel, "production model", textLoader.Load(paths.TestPath)); - // Save the re-fit model to a.ZIP file - SaveModel(logger, mlContext, refitModel, paths.FinalModelPath, textLoader.Load(paths.TestPath)); - - logger.LogInformation($"The model is saved to {paths.FinalModelPath}"); - return refitModel; - } - - private const int Width = 114; - - private static string CreateRow(string message, int width) => "|" + message.PadRight(width - 2) + "|"; - - /// - /// Evaluate the model and print metrics. - /// - private static void EvaluateTrainedModelAndPrintMetrics(ILogger logger, MLContext mlContext, ITransformer model, string trainerName, IDataView dataView) - { - logger.LogInformation("===== Evaluating model's accuracy with test data ====="); - var predictions = model.Transform(dataView); - var metrics = mlContext.MulticlassClassification.Evaluate(predictions, labelColumnName: "Label", scoreColumnName: "Score"); - - logger.LogInformation($"************************************************************"); - logger.LogInformation($"* Metrics for {trainerName} multi-class classification model "); - logger.LogInformation($"*-----------------------------------------------------------"); - logger.LogInformation($" MacroAccuracy = {metrics.MacroAccuracy:0.####}, a value between 0 and 1, the closer to 1, the better"); - logger.LogInformation($" MicroAccuracy = {metrics.MicroAccuracy:0.####}, a value between 0 and 1, the closer to 1, the better"); - logger.LogInformation($" LogLoss = {metrics.LogLoss:0.####}, the closer to 0, the better"); - for (int i = 0; i < metrics.PerClassLogLoss.Count; i++) - { - logger.LogInformation($" LogLoss for class {i+1} = {metrics.PerClassLogLoss[i]:0.####}, the closer to 0, the better"); - } - logger.LogInformation($"************************************************************"); - } - - private static void SaveModel(ILogger logger, MLContext mlContext, ITransformer model, string modelPath, IDataView dataview) - { - // Save the re-fit model to a.ZIP file - var consoleHelper = new LoggingHelper(logger); - consoleHelper.ConsoleWriteHeader("=============== Saving the model ==============="); - mlContext.Model.Save(model, dataview.Schema, modelPath); - logger.LogInformation($"The model is saved to {modelPath}"); - } - - public static void TestPrediction(ILogger logger, MLContext mlContext, TrainingDataFilePaths files, bool forPrs, double threshold = 0.4) - { - var trainedModel = mlContext.Model.Load(files.FittedModelPath, out _); - IEnumerable<(string knownLabel, GitHubIssuePrediction predictedResult, string issueNumber)> predictions = null; - string Legend1 = $"(includes not labeling issues with confidence lower than threshold. (here {threshold * 100.0f:#,0.00}%))"; - const string Legend2 = "(includes items that could be labeled if threshold was lower.)"; - const string Legend3 = "(those incorrectly labeled)"; - if (forPrs) - { - var testData = GetPullRequests(mlContext, files.TestPath); - logger.LogInformation($"{Environment.NewLine}Number of PRs tested: {testData.Length}"); - - var prEngine = mlContext.Model.CreatePredictionEngine(trainedModel); - predictions = testData - .Select(x => ( - knownLabel: x.Label, - predictedResult: prEngine.Predict(x), - issueNumber: x.ID.ToString() - )); - } - else - { - var testData = GetIssues(mlContext, files.TestPath); - logger.LogInformation($"{Environment.NewLine}\tNumber of issues tested: {testData.Length}"); - - var issueEngine = mlContext.Model.CreatePredictionEngine(trainedModel); - predictions = testData - .Select(x => ( - knownLabel: x.Label, - predictedResult: issueEngine.Predict(x), - issueNumber: x.ID.ToString() - )); - } - - var analysis = - predictions.Select(x => - ( - knownLabel: x.knownLabel, - predictedArea: x.predictedResult.Area, - maxScore: x.predictedResult.Score.Max(), - confidentInPrediction: x.predictedResult.Score.Max() >= threshold, - issueNumber: x.issueNumber - )); - - var countSuccess = analysis.Where(x => - (x.confidentInPrediction && x.predictedArea.Equals(x.knownLabel, StringComparison.Ordinal)) || - (!x.confidentInPrediction && !x.predictedArea.Equals(x.knownLabel, StringComparison.Ordinal))).Count(); - - var missedOpportunity = analysis - .Where(x => !x.confidentInPrediction && x.knownLabel.Equals(x.predictedArea, StringComparison.Ordinal)).Count(); - - var mistakes = analysis - .Where(x => x.confidentInPrediction && !x.knownLabel.Equals(x.predictedArea, StringComparison.Ordinal)) - .Select(x => new { Pair = $"\tPredicted: {x.predictedArea}, Actual:{x.knownLabel}", IssueNumbers = x.issueNumber, MaxConfidencePercentage = x.maxScore * 100.0f }) - .GroupBy(x => x.Pair) - .Select(x => new - { - Count = x.Count(), - PerdictedVsActual = x.Key, - Items = x, - }) - .OrderByDescending(x => x.Count); - int remaining = predictions.Count() - countSuccess - missedOpportunity; - - logger.LogInformation($"{Environment.NewLine}\thandled correctly: {countSuccess}{Environment.NewLine}\t{Legend1}{Environment.NewLine}"); - logger.LogInformation($"{Environment.NewLine}\tmissed: {missedOpportunity}{Environment.NewLine}\t{Legend2}{Environment.NewLine}"); - logger.LogInformation($"{Environment.NewLine}\tremaining: {remaining}{Environment.NewLine}\t{Legend3}{Environment.NewLine}"); - - foreach (var mismatch in mistakes.AsEnumerable()) - { - logger.LogInformation($"{mismatch.PerdictedVsActual}, NumFound: {mismatch.Count}"); - var sampleIssues = string.Join(Environment.NewLine, mismatch.Items.Select(x => $"\t\tFor #{x.IssueNumbers} was {x.MaxConfidencePercentage:#,0.00}% confident")); - logger.LogInformation($"{Environment.NewLine}{ sampleIssues }{Environment.NewLine}"); - } - } - - public static GitHubIssue[] GetIssues(MLContext mlContext, string dataFilePath) - { - var dataView = mlContext.Data.LoadFromTextFile( - path: dataFilePath, - hasHeader: true, - separatorChar: '\t', - allowQuoting: true, - allowSparse: false); - - return mlContext.Data.CreateEnumerable(dataView, false).ToArray(); - } - - public static GitHubPullRequest[] GetPullRequests(MLContext mlContext, string dataFilePath) - { - var dataView = mlContext.Data.LoadFromTextFile( - path: dataFilePath, - hasHeader: true, - separatorChar: '\t', - allowQuoting: true, - allowSparse: false); - - return mlContext.Data.CreateEnumerable(dataView, false).ToArray(); - } - } -} diff --git a/tools/issue-labeler/src/CreateMikLabelModel/ML/MulticlassExperimentProgressHandler.cs b/tools/issue-labeler/src/CreateMikLabelModel/ML/MulticlassExperimentProgressHandler.cs deleted file mode 100644 index a10513bbce4..00000000000 --- a/tools/issue-labeler/src/CreateMikLabelModel/ML/MulticlassExperimentProgressHandler.cs +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -using Microsoft.ML.AutoML; -using Microsoft.ML.Data; -using System; - -namespace CreateMikLabelModel.ML -{ - /// - /// Progress handler that AutoML will invoke after each model it produces and evaluates. - /// - public class MulticlassExperimentProgressHandler : IProgress> - { - private readonly LoggingHelper _consoleHelper; - private int _iterationIndex; - - public MulticlassExperimentProgressHandler(ILogger logger) => _consoleHelper = new LoggingHelper(logger); - - - public void Report(RunDetail iterationResult) - { - if (_iterationIndex++ == 0) - { - _consoleHelper.PrintMulticlassClassificationMetricsHeader(); - } - - if (iterationResult.Exception != null) - { - _consoleHelper.PrintIterationException(iterationResult.Exception); - } - else - { - _consoleHelper.PrintIterationMetrics(_iterationIndex, iterationResult.TrainerName, - iterationResult.ValidationMetrics, iterationResult.RuntimeInSeconds); - } - } - } -} diff --git a/tools/issue-labeler/src/CreateMikLabelModel/ML/MulticlassExperimentSettingsHelper.cs b/tools/issue-labeler/src/CreateMikLabelModel/ML/MulticlassExperimentSettingsHelper.cs deleted file mode 100644 index 1ae769c7e96..00000000000 --- a/tools/issue-labeler/src/CreateMikLabelModel/ML/MulticlassExperimentSettingsHelper.cs +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -using Microsoft.ML; -using Microsoft.ML.AutoML; -using System.IO; - -namespace CreateMikLabelModel.ML -{ - public static class MulticlassExperimentSettingsHelper - { - public static (ColumnInferenceResults columnInference, MulticlassExperimentSettings experimentSettings) SetupExperiment( - ILogger logger, MLContext mlContext, ExperimentModifier st, TrainingDataFilePaths paths, bool forPrs) - { - var columnInference = InferColumns(logger, mlContext, paths.TrainPath, st.LabelColumnName); - var columnInformation = columnInference.ColumnInformation; - st.ColumnSetup(columnInformation, forPrs); - - var experimentSettings = new MulticlassExperimentSettings(); - st.TrainerSetup(experimentSettings.Trainers); - experimentSettings.MaxExperimentTimeInSeconds = st.ExperimentTime; - - var cts = new System.Threading.CancellationTokenSource(); - experimentSettings.CancellationToken = cts.Token; - - // Set the cache directory to null. - // This will cause all models produced by AutoML to be kept in memory - // instead of written to disk after each run, as AutoML is training. - // (Please note: for an experiment on a large dataset, opting to keep all - // models trained by AutoML in memory could cause your system to run out - // of memory.) - experimentSettings.CacheDirectoryName = Path.GetTempPath(); - experimentSettings.OptimizingMetric = MulticlassClassificationMetric.MicroAccuracy; - return (columnInference, experimentSettings); - } - - /// - /// Infer columns in the dataset with AutoML. - /// - private static ColumnInferenceResults InferColumns(ILogger logger, MLContext mlContext, string dataPath, string labelColumnName) - { - new LoggingHelper(logger).ConsoleWriteHeader("=============== Inferring columns in dataset ==============="); - var columnInference = mlContext.Auto().InferColumns(dataPath, labelColumnName, groupColumns: false); - return columnInference; - } - } - -} \ No newline at end of file diff --git a/tools/issue-labeler/src/CreateMikLabelModel/ML/TrainingDataFilePaths.cs b/tools/issue-labeler/src/CreateMikLabelModel/ML/TrainingDataFilePaths.cs deleted file mode 100644 index c908f6ddb4e..00000000000 --- a/tools/issue-labeler/src/CreateMikLabelModel/ML/TrainingDataFilePaths.cs +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -using System.IO; - -namespace CreateMikLabelModel.ML -{ - public readonly struct TrainingDataFilePaths - { - public TrainingDataFilePaths(string folder, string commonPrefix, bool forPrs, bool skip) : this(folder, commonPrefix, string.Empty, forPrs, skip) - { - } - - public TrainingDataFilePaths(string folder, string commonPrefix, string modelPrefix, bool forPrs, bool skip) - { - Folder = folder; - SkipProcessing = skip; - InputPath = Path.Combine(Folder, commonPrefix + "-IssueAndPrData.tsv"); - var prefix = forPrs ? "-only-prs" : "-only-issues"; - - TrainPath = Path.Combine(Folder, commonPrefix + prefix + "-part1.tsv"); - ValidatePath = Path.Combine(Folder, commonPrefix + prefix + "-part2.tsv"); - TestPath = Path.Combine(Folder, commonPrefix + prefix + "-part3.tsv"); - ModelPath = Path.Combine(Folder, commonPrefix + prefix + modelPrefix + "-model.zip"); - FittedModelPath = Path.Combine(Folder, commonPrefix + prefix + modelPrefix + "-fitted-model.zip"); - FinalModelPath = Path.Combine(Folder, commonPrefix + prefix + modelPrefix + "-final-model.zip"); - } - - public readonly string Folder; - public readonly bool SkipProcessing; - public readonly string TrainPath; - public readonly string ValidatePath; - public readonly string TestPath; - public readonly string ModelPath; - public readonly string FittedModelPath; - public readonly string FinalModelPath; - public readonly string InputPath; - } -} diff --git a/tools/issue-labeler/src/CreateMikLabelModel/ML/TrainingDataSegment.cs b/tools/issue-labeler/src/CreateMikLabelModel/ML/TrainingDataSegment.cs deleted file mode 100644 index 598e54fe830..00000000000 --- a/tools/issue-labeler/src/CreateMikLabelModel/ML/TrainingDataSegment.cs +++ /dev/null @@ -1,7 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -namespace CreateMikLabelModel.ML -{ - public record TrainingDataSegment(TrainingDataFilePaths Issues, TrainingDataFilePaths PullRequests); -} diff --git a/tools/issue-labeler/src/CreateMikLabelModel/ML/TrainingDataset.cs b/tools/issue-labeler/src/CreateMikLabelModel/ML/TrainingDataset.cs deleted file mode 100644 index 4a2d8f579be..00000000000 --- a/tools/issue-labeler/src/CreateMikLabelModel/ML/TrainingDataset.cs +++ /dev/null @@ -1,265 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -using System; -using System.Collections.Generic; -using System.IO; -using System.Linq; -using System.Text; -using System.Text.RegularExpressions; -using IssueLabeler.Shared; - -namespace CreateMikLabelModel.ML -{ - internal static class TrainingDataset - { - private const int TrainingDataLineMinimum = 250; - private const string DataSetBasicHeaders = "CombinedID\tID\tLabel\tTitle\tDescription\tAuthor\tIsPR\tNumMentions\tUserMentions"; - private const string DataSetFileHeaders = DataSetBasicHeaders + "\tFileCount\tFiles\tFilenames\tFileExtensions\tFolderNames\tFolders"; - - private static readonly Regex UserMentionsExpression = new Regex(@"@[a-zA-Z0-9_//-]+", RegexOptions.Compiled); - private static readonly DiffHelper DiffHelper = new DiffHelper(); - - private static readonly Dictionary TrainingDataIndexes = new() - { - { "CombinedID", 0 }, - { "ID", 1 }, - { "Label", 2 }, - { "Title", 3 }, - { "Description", 4 }, - { "Author", 5 }, - { "IsPR", 6 }, - { "FilePaths", 7 } - }; - - public static IEnumerable ProcessIssueTrainingData(string trainingDataFilePath, bool includeFileColumns = false) => - ProcessTrainingData(trainingDataFilePath, includeFileColumns, line => line[TrainingDataIndexes["IsPR"]] != "1"); - - public static IEnumerable ProcessPullRequestTrainingData(string trainingDataFilePath, bool includeFileColumns = true) => - ProcessTrainingData(trainingDataFilePath, includeFileColumns, line => line[TrainingDataIndexes["IsPR"]] == "1"); - - public static void WriteDataset( - TrainingDataFilePaths filePaths, - string[] dataLines) - { - if (dataLines.Length < TrainingDataLineMinimum) - { - throw new ApplicationException($"At least { TrainingDataLineMinimum } training items are needed to create a training dataset; only { dataLines.Length - 1 } are available."); - } - - var trainingSetCount = (int)Math.Floor(dataLines.Length * 0.8); - var validateSetCount = (int)Math.Floor(dataLines.Length * 0.1); - var currentCount = 0; - var currentIndex = 1; - - FileStream datasetFile; - StreamWriter datasetWriter; - - // Create the training set. - - using (datasetFile = File.Open(Path.GetFullPath(filePaths.TrainPath), FileMode.OpenOrCreate, FileAccess.Write, FileShare.None)) - using (datasetWriter = new StreamWriter(datasetFile)) - { - // Write the header. - - datasetWriter.WriteLine(dataLines[0]); - - // Write the lines that belong in the set. - - while (currentCount < trainingSetCount) - { - datasetWriter.WriteLine(dataLines[currentIndex]); - - ++currentIndex; - ++currentCount; - } - } - - // Create the validate set. - - currentCount = 0; - - using (datasetFile = File.Open(Path.GetFullPath(filePaths.ValidatePath), FileMode.OpenOrCreate, FileAccess.Write, FileShare.None)) - using (datasetWriter = new StreamWriter(datasetFile)) - { - // Write the header. - - datasetWriter.WriteLine(dataLines[0]); - - // Write the lines that belong in the set. - - while (currentCount < validateSetCount) - { - datasetWriter.WriteLine(dataLines[currentIndex]); - - ++currentIndex; - ++currentCount; - } - } - - // Create the test set using all remaining data. - - using (datasetFile = File.Open(Path.GetFullPath(filePaths.TestPath), FileMode.OpenOrCreate, FileAccess.Write, FileShare.None)) - using (datasetWriter = new StreamWriter(datasetFile)) - { - // Write the header. - - datasetWriter.WriteLine(dataLines[0]); - - // Write the lines that belong in the set. - - while (currentIndex < dataLines.Length) - { - datasetWriter.WriteLine(dataLines[currentIndex]); - ++currentIndex; - } - } - } - - private static IEnumerable ProcessTrainingData( - string trainingDataFilePath, - bool includeFileColumns, - Func lineFilter) - { - using var dataFileStream = File.Open(Path.GetFullPath(trainingDataFilePath), FileMode.Open, FileAccess.Read, FileShare.Read); - using var dataFileReader = new StreamReader(dataFileStream); - - // Read and validate the training data headers - - var dataHeaders = dataFileReader.ReadLine(); - - if (!ValidateTrainingDataHeaders(dataHeaders)) - { - throw new ApplicationException("The training data file was not in the expected format."); - } - - // Emit the headers. - - yield return (includeFileColumns) ? DataSetFileHeaders : DataSetBasicHeaders; - - // Process each line of training data. - - var lineCount = 0; - var lineBuilder = new StringBuilder(); - var line = dataFileReader.ReadLine(); - - while (line != null) - { - var dataElements = line.Split('\t'); - - // Only process the line if it is accepted by the filter. - - if (lineFilter(dataElements)) - { - if (!byte.TryParse(dataElements[TrainingDataIndexes["IsPR"]], out var isPrBit)) - { - throw new ApplicationException($"Malformed training data for line '{ lineCount + 1 }'. The 'IsPR' flag could not be parsed."); - } - - if ((isPrBit < 0) || (isPrBit > 1)) - { - throw new ApplicationException($"Malformed training data for line '{ lineCount + 1 }'. The 'IsPR' flag has an invalid value: '{ isPrBit }' It should be either 0 or 1."); - } - - var mentions = GetUserMentions(dataElements[TrainingDataIndexes["Description"]]); - - lineBuilder - .Append(dataElements[TrainingDataIndexes["CombinedID"]]) - .Append('\t') - .Append(dataElements[TrainingDataIndexes["ID"]]) - .Append('\t') - .Append(dataElements[TrainingDataIndexes["Label"]]) - .Append('\t') - .Append(dataElements[TrainingDataIndexes["Title"]]) - .Append('\t') - .Append(dataElements[TrainingDataIndexes["Description"]]) - .Append('\t') - .Append(dataElements[TrainingDataIndexes["Author"]]) - .Append('\t') - .Append(isPrBit) - .Append('\t') - .Append(mentions.Length) - .Append('\t') - .Append(string.Join(' ', mentions)); - - if (includeFileColumns) - { - var filePaths = TrainingData.SplitFilePaths(dataElements[TrainingDataIndexes["FilePaths"]] ?? string.Empty) - .Where(path => !string.IsNullOrWhiteSpace(path)) - .ToArray(); - - AddFileInformationToLine(lineBuilder, filePaths, (isPrBit == 1)); - } - - // Emit the current line. - - yield return lineBuilder.ToString(); - } - - // Reset state for the next iteration. - - lineBuilder.Clear(); - line = dataFileReader.ReadLine(); - - ++lineCount; - } - } - - private static string[] GetUserMentions(string description) => - UserMentionsExpression - .Matches(description) - .Select(match => match.Value) - .ToArray(); - - private static void AddFileInformationToLine( - StringBuilder lineBuilder, - string[] filePaths, - bool isPullRequest) - { - // If the line is not being added for a pull request or there were no files, then file - // information will not be included. Add empty placeholder slugs and take no further action. - - if ((!isPullRequest) || filePaths.Length == 0) - { - lineBuilder - .Append('\t') - .Append(0) - .Append('\t', 5); - - return; - } - - var segmentedDiff = DiffHelper.SegmentDiff(filePaths); - - lineBuilder - .Append('\t') - .Append(string.Join(' ', filePaths)) - .Append('\t') - .Append(string.Join(' ', segmentedDiff.Filenames)) - .Append('\t') - .Append(string.Join(' ', segmentedDiff.Extensions)) - .Append('\t') - .Append(string.Join(' ', segmentedDiff.FolderNames)) - .Append('\t') - .Append(string.Join(' ', segmentedDiff.Folders)); - } - - private static bool ValidateTrainingDataHeaders(string headerLine) - { - var index = 0; - - foreach (var header in headerLine.Split('\t')) - { - if (TrainingDataIndexes[header] != index) - { - return false; - } - - ++index; - } - - return true; - } - - } -} diff --git a/tools/issue-labeler/src/CreateMikLabelModel/Models/PullRequestWithFiles.cs b/tools/issue-labeler/src/CreateMikLabelModel/Models/PullRequestWithFiles.cs deleted file mode 100644 index 008aab73906..00000000000 --- a/tools/issue-labeler/src/CreateMikLabelModel/Models/PullRequestWithFiles.cs +++ /dev/null @@ -1,14 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -using Octokit; - -namespace CreateMikLabelModel.Models -{ - public class PullRequestWithFiles - { - public PullRequest PullRequest { get; init; } - public string[] FilePaths { get; init; } - public PullRequestWithFiles(PullRequest pullRequest, string[] filePaths) => (PullRequest, FilePaths) = (pullRequest, filePaths); - } -} diff --git a/tools/issue-labeler/src/CreateMikLabelModel/Models/RepositoryInformation.cs b/tools/issue-labeler/src/CreateMikLabelModel/Models/RepositoryInformation.cs deleted file mode 100644 index baa029bbee2..00000000000 --- a/tools/issue-labeler/src/CreateMikLabelModel/Models/RepositoryInformation.cs +++ /dev/null @@ -1,27 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -namespace CreateMikLabelModel.Models -{ - public record RepositoryInformation(string Owner, string Name) - { - /// - /// Creates a new instance of the by parsing a repository - /// path. - /// - /// - /// The full repository path, in the format "Owner/repository-name". - /// - /// - /// - /// var info = RepositoryInformation.Parse("Azure/azure-sdk-for-net"); - /// - /// - /// - public static RepositoryInformation Parse(string repositoryPath) - { - var parts = repositoryPath.Split('/'); - return new(parts[0], parts[1]); - } - } -} diff --git a/tools/issue-labeler/src/CreateMikLabelModel/Models/TrainingDataItem.cs b/tools/issue-labeler/src/CreateMikLabelModel/Models/TrainingDataItem.cs deleted file mode 100644 index ac977a88501..00000000000 --- a/tools/issue-labeler/src/CreateMikLabelModel/Models/TrainingDataItem.cs +++ /dev/null @@ -1,19 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -using System; -using Octokit; - -namespace CreateMikLabelModel.Models -{ - public record TrainingDataItem(DateTimeOffset CreatedAt, long Identifier, string RepositoryName, string LabelName, string SegmentName, string Data) - { - public TrainingDataItem(string labelName, string segmentName, string repositoryName, Issue source) : this(source.CreatedAt, source.Id, repositoryName, labelName, segmentName, TrainingData.CreateTrainingData(labelName, repositoryName, source)) - { - } - - public TrainingDataItem(string labelName, string segmentName, string repositoryName, PullRequestWithFiles source) : this(source.PullRequest.CreatedAt, source.PullRequest.Id, repositoryName, labelName, segmentName, TrainingData.CreateTrainingData(labelName, repositoryName, source)) - { - } - } -} diff --git a/tools/issue-labeler/src/Hubbup.MikLabelModel/FullPrediction.cs b/tools/issue-labeler/src/Hubbup.MikLabelModel/FullPrediction.cs deleted file mode 100644 index 495127a491e..00000000000 --- a/tools/issue-labeler/src/Hubbup.MikLabelModel/FullPrediction.cs +++ /dev/null @@ -1,19 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -namespace Hubbup.MikLabelModel -{ - public class FullPrediction - { - public string PredictedLabel; - public float Score; - public int OriginalSchemaIndex; - - public FullPrediction(string predictedLabel, float score, int originalSchemaIndex) - { - PredictedLabel = predictedLabel; - Score = score; - OriginalSchemaIndex = originalSchemaIndex; - } - } -} \ No newline at end of file diff --git a/tools/issue-labeler/src/Hubbup.MikLabelModel/GitHubIssueTransformed.cs b/tools/issue-labeler/src/Hubbup.MikLabelModel/GitHubIssueTransformed.cs deleted file mode 100644 index a6a250b3d08..00000000000 --- a/tools/issue-labeler/src/Hubbup.MikLabelModel/GitHubIssueTransformed.cs +++ /dev/null @@ -1,15 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#pragma warning disable 649 // We don't care about unused fields here, because they are mapped with the input file. - -namespace Hubbup.MikLabelModel -{ - internal class GitHubIssueTransformed - { - public string ID; - public string Area; - public string Title; - public string Description; - } -} diff --git a/tools/issue-labeler/src/Hubbup.MikLabelModel/Hubbup.MikLabelModel.csproj b/tools/issue-labeler/src/Hubbup.MikLabelModel/Hubbup.MikLabelModel.csproj deleted file mode 100644 index 18c62b9b1db..00000000000 --- a/tools/issue-labeler/src/Hubbup.MikLabelModel/Hubbup.MikLabelModel.csproj +++ /dev/null @@ -1,14 +0,0 @@ - - - - - - - - - - - - - - diff --git a/tools/issue-labeler/src/Hubbup.MikLabelModel/ILabeler.cs b/tools/issue-labeler/src/Hubbup.MikLabelModel/ILabeler.cs deleted file mode 100644 index 2e860d9f3e1..00000000000 --- a/tools/issue-labeler/src/Hubbup.MikLabelModel/ILabeler.cs +++ /dev/null @@ -1,14 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -using IssueLabeler.Shared; -using System.Threading.Tasks; - -namespace Hubbup.MikLabelModel -{ - public interface ILabeler - { - Task DispatchLabelsAsync(string owner, string repo, int number); - Task PredictUsingModelsFromStorageQueue(string owner, string repo, int number); - } -} \ No newline at end of file diff --git a/tools/issue-labeler/src/Hubbup.MikLabelModel/ILabelerLite.cs b/tools/issue-labeler/src/Hubbup.MikLabelModel/ILabelerLite.cs deleted file mode 100644 index 5e9dde9f1ec..00000000000 --- a/tools/issue-labeler/src/Hubbup.MikLabelModel/ILabelerLite.cs +++ /dev/null @@ -1,16 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -using IssueLabeler.Shared; -using Octokit; -using System.Threading.Tasks; -using System; -using System.Collections.Generic; - -namespace Hubbup.MikLabelModel -{ - public interface ILabelerLite - { - Task> QueryLabelPrediction(int issueNumber, string title, string body, string issueUserLogin, string repositoryName, string repositoryOwnerName); - } -} \ No newline at end of file diff --git a/tools/issue-labeler/src/Hubbup.MikLabelModel/IMikLabelerPathProvider.cs b/tools/issue-labeler/src/Hubbup.MikLabelModel/IMikLabelerPathProvider.cs deleted file mode 100644 index 866004f1985..00000000000 --- a/tools/issue-labeler/src/Hubbup.MikLabelModel/IMikLabelerPathProvider.cs +++ /dev/null @@ -1,10 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -namespace Hubbup.MikLabelModel -{ - public interface IMikLabelerPathProvider - { - (string issuePath, string prPath) GetModelPath(); - } -} diff --git a/tools/issue-labeler/src/Hubbup.MikLabelModel/IModelHolderFactory.cs b/tools/issue-labeler/src/Hubbup.MikLabelModel/IModelHolderFactory.cs deleted file mode 100644 index 8dfb0fca480..00000000000 --- a/tools/issue-labeler/src/Hubbup.MikLabelModel/IModelHolderFactory.cs +++ /dev/null @@ -1,93 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -using Hubbup.MikLabelModel; -using Microsoft.Extensions.Configuration; -using Microsoft.Extensions.Logging; -using System; -using System.Collections.Concurrent; - -namespace IssueLabeler.Shared.Models -{ - public interface IModelHolderFactory - { - IModelHolder CreateModelHolder(string owner, string repo); - IPredictor GetPredictor(string owner, string repo); - } - public class ModelHolderFactory : IModelHolderFactory - { - private readonly ConcurrentDictionary<(string, string), IModelHolder> _models = new ConcurrentDictionary<(string, string), IModelHolder>(); - private readonly ILogger _logger; - private IConfiguration _configuration; - private readonly IBackgroundTaskQueue _backgroundTaskQueue; - public ModelHolderFactory( - ILogger logger, - IConfiguration configuration, - IBackgroundTaskQueue backgroundTaskQueue) - { - _backgroundTaskQueue = backgroundTaskQueue; - _configuration = configuration; - _logger = logger; - } - - public IModelHolder CreateModelHolder(string owner, string repo) - { - if (!IsConfigured(repo)) - return null; - return _models.TryGetValue((owner, repo), out IModelHolder modelHolder) ? - modelHolder : - _models.GetOrAdd((owner, repo), InitFor(repo)); - } - - private bool IsConfigured(string repo) - { - // the following four configuration values are per repo values. - string configSection = $"IssueModel:{repo}:BlobName"; - if (!string.IsNullOrEmpty(_configuration[configSection])) - { - configSection = $"IssueModel:{repo}:BlobName"; - if (!string.IsNullOrEmpty(_configuration[configSection])) - { - configSection = $"PrModel:{repo}:PathPrefix"; - if (!string.IsNullOrEmpty(_configuration[configSection])) - { - // has both pr and issue config - allowed - configSection = $"PrModel:{repo}:BlobName"; - return !string.IsNullOrEmpty(_configuration[configSection]); - } - else - { - // has issue config only - allowed - configSection = $"PrModel:{repo}:BlobName"; - return string.IsNullOrEmpty(_configuration[configSection]); - } - } - } - return false; - } - - private IModelHolder InitFor(string repo) - { - var mh = new ModelHolder(_logger, _configuration, repo); - if (!mh.LoadRequested) - { - _backgroundTaskQueue.QueueBackgroundWorkItem((ct) => mh.LoadEnginesAsync()); - } - return mh; - } - - public IPredictor GetPredictor(string owner, string repo) - { - var modelHolder = CreateModelHolder(owner, repo); - if (modelHolder == null) - { - throw new InvalidOperationException($"Repo {owner}/{repo} is not yet configured for label prediction."); - } - if (!modelHolder.IsIssueEngineLoaded || (!modelHolder.UseIssuesForPrsToo && !modelHolder.IsPrEngineLoaded)) - { - throw new InvalidOperationException("Issue engine must be loaded."); - } - return new Predictor(_logger, modelHolder); - } - } -} \ No newline at end of file diff --git a/tools/issue-labeler/src/Hubbup.MikLabelModel/IModelHolderFactoryLite.cs b/tools/issue-labeler/src/Hubbup.MikLabelModel/IModelHolderFactoryLite.cs deleted file mode 100644 index 8f0d9c86dcb..00000000000 --- a/tools/issue-labeler/src/Hubbup.MikLabelModel/IModelHolderFactoryLite.cs +++ /dev/null @@ -1,184 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -using Hubbup.MikLabelModel; -using IssueLabeler.Shared; -using Microsoft.Extensions.Configuration; -using Microsoft.Extensions.Logging; -using System; -using System.Collections.Concurrent; -using System.Threading; -using System.Threading.Tasks; - -namespace Hubbup.MikLabelModel -{ - public interface IModelHolderFactoryLite - { - Task CreateModelHolders(string owner, string repo, string[] modelConfigNames); - Task CreateModelHolder(string owner, string repo, string modelBlobConfigName = null); - Task GetPredictor(string owner, string repo, string modelBlobConfigName = null); - } - public class ModelHolderFactoryLite : IModelHolderFactoryLite - { - private readonly ConcurrentDictionary<(string, string, string), IModelHolder> _models = new ConcurrentDictionary<(string, string, string), IModelHolder>(); - private readonly ILogger _logger; - private IConfiguration _configuration; - private SemaphoreSlim _sem = new SemaphoreSlim(1,1); - - public ModelHolderFactoryLite( - ILogger logger, - IConfiguration configuration) - { - _configuration = configuration; - _logger = logger; - } - - public async Task CreateModelHolders(string owner, string repo, string[] modelConfigNames) - { - var modelHolders = new IModelHolder[modelConfigNames.Length]; - var allHeld = true; - - // If all of the models are already held, return them. - for (int index = 0; index < modelConfigNames.Length; ++index) - { - if (_models.TryGetValue((owner, repo, modelConfigNames[index]), out var holder)) - { - modelHolders[index] = holder; - } - else - { - // At least one model is not held. No sense in checking the rest. - allHeld = false; - break; - } - } - - if (allHeld) - { - return modelHolders; - } - - // Some models need to be initialized; acquire the semaphore and initialize. - try - { - if (!_sem.Wait(0)) - { - await _sem.WaitAsync().ConfigureAwait(false); - } - - for (int index = 0; index < modelConfigNames.Length; ++index) - { - modelHolders[index] = await CreateModelHolderInternal(owner, repo, modelConfigNames[index]); - } - } - finally - { - if (_sem.CurrentCount <= 0) - { - _sem.Release(); - } - } - - return modelHolders; - } - - public async Task CreateModelHolder(string owner, string repo, string modelBlobConfigName = null) - { - if (_models.TryGetValue((owner, repo, modelBlobConfigName), out var modelHolder)) - { - return modelHolder; - } - - try - { - if (!_sem.Wait(0)) - { - await _sem.WaitAsync().ConfigureAwait(false); - } - - return await CreateModelHolderInternal(owner, repo, modelBlobConfigName).ConfigureAwait(false); - } - finally - { - if (_sem.CurrentCount <= 0) - { - _sem.Release(); - } - } - } - - public async Task CreateModelHolderInternal(string owner, string repo, string modelBlobConfigName) - { - IModelHolder modelHolder = null; - - if (IsConfigured(repo)) - { - if (_models.TryGetValue((owner, repo, modelBlobConfigName), out modelHolder)) - { - return modelHolder; - } - - modelHolder = await InitFor(repo, modelBlobConfigName); - _models.GetOrAdd((owner, repo, modelBlobConfigName), modelHolder); - } - - return modelHolder; - } - - public async Task GetPredictor(string owner, string repo, string modelBlobConfigName = null) - { - var modelHolder = await CreateModelHolder(owner, repo, modelBlobConfigName); - if (modelHolder == null) - { - throw new InvalidOperationException($"Repo {owner}/{repo} is not yet configured for label prediction."); - } - if (!modelHolder.IsIssueEngineLoaded || (!modelHolder.UseIssuesForPrsToo && !modelHolder.IsPrEngineLoaded)) - { - throw new InvalidOperationException("Issue engine must be loaded."); - } - return new Predictor(_logger, modelHolder) { ModelName = modelBlobConfigName }; - } - - private bool IsConfigured(string repo) - { - // the following four configuration values are per repo values. - string configSection = $"IssueModel.{repo.Replace("-", "_")}.BlobConfigNames"; - if (string.IsNullOrEmpty(_configuration[configSection])) - { - configSection = $"IssueModel:{repo}:BlobName"; - if (!string.IsNullOrEmpty(_configuration[configSection])) - { - configSection = $"IssueModel:{repo}:BlobName"; - if (!string.IsNullOrEmpty(_configuration[configSection])) - { - configSection = $"PrModel:{repo}:PathPrefix"; - if (!string.IsNullOrEmpty(_configuration[configSection])) - { - // has both pr and issue config - allowed - configSection = $"PrModel:{repo}:BlobName"; - return !string.IsNullOrEmpty(_configuration[configSection]); - } - else - { - // has issue config only - allowed - configSection = $"PrModel:{repo}:BlobName"; - return string.IsNullOrEmpty(_configuration[configSection]); - } - } - } - } - else { return true; } - return false; - } - - private async Task InitFor(string repo, string modelBlobConfigName = null) - { - var mh = new ModelHolder(_logger, _configuration, repo, modelBlobConfigName); - if (!mh.LoadRequested) - { - await mh.LoadEnginesAsync(); - } - return mh; - } - } -} \ No newline at end of file diff --git a/tools/issue-labeler/src/Hubbup.MikLabelModel/Labeler.cs b/tools/issue-labeler/src/Hubbup.MikLabelModel/Labeler.cs deleted file mode 100644 index 735a86a4060..00000000000 --- a/tools/issue-labeler/src/Hubbup.MikLabelModel/Labeler.cs +++ /dev/null @@ -1,548 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -using IssueLabeler.Shared; -using IssueLabeler.Shared.Models; -using Microsoft.Extensions.Configuration; -using Microsoft.Extensions.Logging; -using Octokit; -using System; -using System.Collections.Concurrent; -using System.Collections.Generic; -using System.Globalization; -using System.Linq; -using System.Net.Http; -using System.Text.Json; -using System.Text.RegularExpressions; -using System.Threading.Tasks; - -namespace Hubbup.MikLabelModel -{ - public class Labeler : ILabeler - { - private IQueueHelper _queueHelper; - private Regex _regex; - private readonly Regex _regexIssueMatch; - private readonly IDiffHelper _diffHelper; - private readonly ILogger _logger; - private readonly IHttpClientFactory _httpClientFactory; - private readonly IModelHolderFactory _modelHolderFactory; - private readonly IConfiguration _configuration; - private readonly bool _useIssueLabelerForPrsToo; - private readonly IGitHubClientWrapper _gitHubClientWrapper; - private readonly IBackgroundTaskQueue _backgroundTaskQueue; - - public Labeler( - IQueueHelper queueHelper, - IConfiguration configuration, - IHttpClientFactory httpClientFactory, - ILogger logger, - IBackgroundTaskQueue backgroundTaskQueue, - IGitHubClientWrapper gitHubClientWrapper, - IModelHolderFactory modelHolderFactory, - IDiffHelper diffHelper) - { - _queueHelper = queueHelper; - _backgroundTaskQueue = backgroundTaskQueue; - _gitHubClientWrapper = gitHubClientWrapper; - _diffHelper = diffHelper; - _regexIssueMatch = new Regex(@"[Ff]ix(?:ed|es|)( )+#(\d+)"); - _httpClientFactory = httpClientFactory; - _logger = logger; - _configuration = configuration; - _useIssueLabelerForPrsToo = configuration.GetSection(("UseIssueLabelerForPrsToo")).Get(); - _modelHolderFactory = modelHolderFactory; - } - - public async Task PredictUsingModelsFromStorageQueue(string owner, string repo, int number) - { - if (_regex == null) - { - _regex = new Regex(@"@[a-zA-Z0-9_//-]+"); - } - var predictor = _modelHolderFactory.GetPredictor(owner, repo); - - var iop = await _gitHubClientWrapper.GetIssue(owner, repo, number); - bool isPr = iop.PullRequest != null; - - string body = iop.Body ?? string.Empty; - var userMentions = _regex.Matches(body).Select(x => x.Value).ToArray(); - LabelSuggestion labelSuggestion = null; - - if (isPr && !_useIssueLabelerForPrsToo) - { - var prModel = await CreatePullRequest(owner, repo, iop.Number, iop.Title, iop.Body, userMentions, iop.User.Login); - labelSuggestion = await predictor.Predict(prModel); - _logger.LogInformation("predicted with pr model the new way"); - _logger.LogInformation(string.Join(",", labelSuggestion.LabelScores.Select(x => x.LabelName))); - return labelSuggestion; - } - var issueModel = CreateIssue(iop.Number, iop.Title, iop.Body, userMentions, iop.User.Login); - labelSuggestion = await predictor.Predict(issueModel); - _logger.LogInformation("predicted with issue model the new way"); - _logger.LogInformation(string.Join(",", labelSuggestion.LabelScores.Select(x => x.LabelName))); - return labelSuggestion; - } - - - public Task DispatchLabelsAsync(string owner, string repo, int number) - { - var tasks = new List(); - tasks.Add(InnerTask(owner, repo, number)); - return tasks.First(); - } - - private readonly ConcurrentDictionary<(string, string), LabelerOptions> _options = - new ConcurrentDictionary<(string, string), LabelerOptions>(); - - private LabelerOptions GetOptionsFor(string owner, string repo) - { - try - { - return _options.TryGetValue((owner, repo), out LabelerOptions options) ? - options : - _options.GetOrAdd((owner, repo), new LabelerOptions() - { - LabelRetriever = new LabelRetriever(owner, repo), - PredictionUrl = string.Format( - CultureInfo.InvariantCulture, - "{0}/api/WebhookIssue/{1}/{2}/", _configuration[$"{owner}:{repo}:prediction_url"], - owner, repo), - Threshold = double.Parse(_configuration[$"{owner}:{repo}:threshold"]), - CanUpdateIssue = _configuration.GetSection($"{owner}:{repo}:can_update_labels").Get(), - CanCommentOnIssue = _configuration.GetSection($"{owner}:{repo}:can_comment_on").Get() - }); - } - catch - { - // the repo is not configured, return null to skip - _logger.LogError($"{owner}/{repo} is not yet configured."); - return null; - } - } - - private class LabelerOptions - { - public ILabelRetriever LabelRetriever { get; set; } - public string PredictionUrl { get; set; } - public double Threshold { get; set; } - public bool CanCommentOnIssue { get; set; } - public bool CanUpdateIssue { get; set; } - } - - private async Task InnerTask(string owner, string repo, int number) - { - var options = GetOptionsFor(owner, repo); - if (options == null) - { - return; - } - var labelRetriever = options.LabelRetriever; - string msg = $"! dispatcher app - started query for {owner}/{repo}#{number}"; - _logger.LogInformation(msg); - - var iop = await _gitHubClientWrapper.GetIssue(owner, repo, number); - - var labels = new HashSet(); - GithubObjectType issueOrPr = iop.PullRequest != null ? GithubObjectType.PullRequest : GithubObjectType.Issue; - - if (labelRetriever.ShouldSkipUpdatingLabels(iop.User.Login)) - { - _logger.LogInformation($"! dispatcher app - skipped for racing for {issueOrPr} {number}."); - return; - } - - // get non area labels - labels = await GetNonAreaLabelsAsync(labelRetriever, owner, repo, iop); - - bool foundArea = false; - string theFoundLabel = default; - if (!labelRetriever.SkipPrediction) - { - // find shortcut to get label - if (iop.PullRequest != null) - { - string body = iop.Body ?? string.Empty; - if (labelRetriever.AllowTakingLinkedIssueLabel) - { - (string label, int number) linkedIssue = await GetAnyLinkedIssueLabel(owner, repo, body); - if (!string.IsNullOrEmpty(linkedIssue.label)) - { - _logger.LogInformation($"! dispatcher app - PR number {iop.Number} fixes issue number {linkedIssue.number} with area label {linkedIssue.label}."); - foundArea = true; - theFoundLabel = linkedIssue.label; - } - } - } - - // then try ML prediction - if (!foundArea) - { - var labelSuggestion = await GetLabelSuggestion(options.PredictionUrl, owner, repo, number); - if (labelSuggestion == null) - { - _backgroundTaskQueue.QueueBackgroundWorkItem((ct) => _queueHelper.InsertMessageTask($"TODO - Dispatch labels for: /{owner}/{repo}#{number}")); - return; - } - var topChoice = labelSuggestion.LabelScores.OrderByDescending(x => x.Score).First(); - if (labelRetriever.PreferManualLabelingFor(topChoice.LabelName)) - { - _logger.LogInformation($"# dispatcher app - skipped: prefer manual prediction instead."); - } - else if (topChoice.Score >= options.Threshold || labelRetriever.OkToIgnoreThresholdFor(topChoice.LabelName)) - { - foundArea = true; - theFoundLabel = topChoice.LabelName; - } - else - { - _logger.LogInformation($"! dispatcher app - The Model was not able to assign the label to the {issueOrPr} {number} confidently."); - } - } - } - await UpdateTask(options, owner, repo, number, foundArea, labels, theFoundLabel, issueOrPr, labelRetriever); - } - - private async Task UpdateTask( - LabelerOptions options, - string owner, string repo, - int number, - bool foundArea, - HashSet labels, - string theFoundLabel, - GithubObjectType issueOrPr, - ILabelRetriever labelRetriever) - { - - if (labelRetriever.AddDelayBeforeUpdatingLabels) - { - // to avoid race with dotnet-bot - await Task.Delay(TimeSpan.FromSeconds(10)); - } - - // get iop again - var iop = await _gitHubClientWrapper.GetIssue(owner, repo, number); - - var existingLabelList = iop?.Labels?.Where(x => !string.IsNullOrEmpty(x.Name)).Select(x => x.Name).ToList(); - bool issueMissingAreaLabel = !existingLabelList.Where(x => x.StartsWith("area-", StringComparison.OrdinalIgnoreCase)).Any(); - - // update section - if (labels.Count > 0 || (foundArea && issueMissingAreaLabel)) - { - //var issueUpdate = iop.ToUpdate(); - var issueUpdate = new IssueUpdate(); - - if (foundArea && issueMissingAreaLabel) - { - // no area label yet - issueUpdate.AddLabel(theFoundLabel); - } - - var existingLabelNames = existingLabelList.ToHashSet(); - foreach (var newLabel in labels) - { - if (!existingLabelNames.Contains(newLabel)) - { - issueUpdate.AddLabel(newLabel); - } - } - - if (options.CanUpdateIssue && issueUpdate.Labels != null && issueUpdate.Labels.Count > 0) - { - issueUpdate.Milestone = iop.Milestone?.Number; // The number of milestone associated with the issue. - foreach (var existingLabel in existingLabelNames) - { - issueUpdate.AddLabel(existingLabel); - } - await _gitHubClientWrapper.UpdateIssue(owner, repo, number, issueUpdate); - } - else if (!options.CanUpdateIssue && issueUpdate.Labels != null && issueUpdate.Labels.Count > 0) - { - _logger.LogInformation($"! skipped updating labels for {issueOrPr} {number}. would have become: {string.Join(",", issueUpdate.Labels)}"); - } - else - { - _logger.LogInformation($"! dispatcher app - No update made to labels for {issueOrPr} {number}."); - } - } - - // comment section - if (options.CanCommentOnIssue) - { - foreach (var labelFound in labels) - { - if (!string.IsNullOrEmpty(labelRetriever.CommentFor(labelFound))) - { - await _gitHubClientWrapper.CommentOn(owner, repo, iop.Number, labelRetriever.CommentFor(labelFound)); - } - } - - // if newlabels has no area-label and existing does not also. then comment - if (!foundArea && issueMissingAreaLabel && labelRetriever.CommentWhenMissingAreaLabel) - { - if (issueOrPr == GithubObjectType.Issue) - { - await _gitHubClientWrapper.CommentOn(owner, repo, iop.Number, labelRetriever.MessageToAddAreaLabelForIssue); - } - else - { - await _gitHubClientWrapper.CommentOn(owner, repo, iop.Number, labelRetriever.MessageToAddAreaLabelForPr); - } - } - } - else - { - _logger.LogInformation($"! dispatcher app - No comment made to labels for {issueOrPr} {number}."); - } - } - - private async Task GetLabelSuggestion(string partUrl, string owner, string repo, int number) - { - var predictionUrl = @$"{partUrl}{number}"; - var request = new HttpRequestMessage(HttpMethod.Get, predictionUrl); - var client = _httpClientFactory.CreateClient(); - var response = await client.SendAsync(request); - - if (response.IsSuccessStatusCode) - { - using var responseStream = await response.Content.ReadAsStreamAsync(); - var remotePrediction = await JsonSerializer.DeserializeAsync(responseStream, new JsonSerializerOptions { PropertyNameCaseInsensitive = true }); - var predictionList = remotePrediction.LabelScores.Select(ls => new LabelScore() - { - ScoredLabel = new ScoredLabel { LabelName = ls.LabelName, Score = ls.Score }, - Label = default - }).Select(x => x.ScoredLabel).ToList(); - - _logger.LogInformation("! received prediction: {0}", string.Join(",", predictionList.Select(x => x.LabelName))); - - return new LabelSuggestion() - { - LabelScores = predictionList, - }; - } - else - { - // queue task again until the suggestion comes back safe - _logger.LogError($"Could not retrieve label predictions for this issue. Remote HTTP prediction status code {response.StatusCode} from URL '{predictionUrl}'."); - return null; - } - } - - private async Task<(string label, int number)> GetAnyLinkedIssueLabel(string owner, string repo, string body) - { - Match match = _regexIssueMatch.Match(body); - if (match.Success && int.TryParse(match.Groups[2].Value, out int issueNumber)) - { - return (await TryGetIssueLabelForPrAsync(owner, repo, issueNumber), issueNumber); - } - return await Task.FromResult<(string, int)>(default); - } - - private async Task> GetNonAreaLabelsAsync(ILabelRetriever labelRetriever, string owner, string repo, Octokit.Issue iop) - { - if (_regex == null) - { - _regex = new Regex(@"@[a-zA-Z0-9_//-]+"); - } - string body = iop.Body ?? string.Empty; - var userMentions = _regex.Matches(body).Select(x => x.Value).ToArray(); - GitHubIssue iopModel = null; - if (iop.PullRequest != null) - { - iopModel = await CreatePullRequest(owner, repo, iop.Number, iop.Title, iop.Body, userMentions, iop.User.Login); - } - else - { - iopModel = CreateIssue(iop.Number, iop.Title, iop.Body, userMentions, iop.User.Login); - } - return labelRetriever.GetNonAreaLabelsForIssueAsync(iopModel); - } - - private static GitHubIssue CreateIssue(int number, string title, string body, string[] userMentions, string author) - { - return new GitHubIssue() - { - ID = number, - Title = title, - Description = body, - IsPR = 0, - Author = author, - UserMentions = string.Join(' ', userMentions), - NumMentions = userMentions.Length - }; - } - - private async Task CreatePullRequest(string owner, string repo, int number, string title, string body, string[] userMentions, string author) - { - var pr = new GitHubPullRequest() - { - ID = number, - Title = title, - Description = body, - IsPR = 1, - Author = author, - UserMentions = string.Join(' ', userMentions), - NumMentions = userMentions.Length, - }; - IReadOnlyList prFiles = await _gitHubClientWrapper.GetPullRequestFiles(owner, repo, number); - if (prFiles.Count != 0) - { - string[] filePaths = prFiles.Select(x => x.FileName).ToArray(); - var segmentedDiff = _diffHelper.SegmentDiff(filePaths); - pr.Files = string.Join(' ', segmentedDiff.FileDiffs); - pr.Filenames = string.Join(' ', segmentedDiff.Filenames); - pr.FileExtensions = string.Join(' ', segmentedDiff.Extensions); - pr.Folders = _diffHelper.FlattenWithWhitespace(segmentedDiff.Folders); - pr.FolderNames = _diffHelper.FlattenWithWhitespace(segmentedDiff.FolderNames); - } - pr.FileCount = prFiles.Count; - return pr; - } - - private async Task DoesPrAddNewApiAsync(string owner, string repo, int prNumber) - { - var pr = await _gitHubClientWrapper.GetPullRequest(owner, repo, prNumber); - var diff = new Uri(pr.DiffUrl); - var httpclient = _httpClientFactory.CreateClient(); - // TODO: fix failure here seen in logs. - var response = await httpclient.GetAsync(diff.LocalPath); - response.EnsureSuccessStatusCode(); - var content = await response.Content.ReadAsStringAsync(); - return TakeDiffContentReturnMeaning(content.Split("\n")); - } - - private async Task TryGetIssueLabelForPrAsync(string owner, string repo, int issueNumber) - { - var issue = await _gitHubClientWrapper.GetIssue(owner, repo, issueNumber); - return issue?.Labels? - .Where(x => !string.IsNullOrEmpty(x.Name)) - .Select(x => x.Name) - .Where(x => x.StartsWith("area-", StringComparison.OrdinalIgnoreCase)).FirstOrDefault(); - } - - private enum DiffContentLineReadingStatus - { - readyToStartOver = 0, - expectingIndex, - expectingTripleMinus, - expectingTriplePlus, - expectingDoubleAtSign - } - - private bool TakeDiffContentReturnMeaning(string[] contentLines) - { - string curFile = string.Empty; - var refFilesWithAdditions = new Dictionary(); - int additions = 0, deletions = 0; - bool lookingAtRefDiff = false; - var stat = DiffContentLineReadingStatus.readyToStartOver; - for (int i = 0; i < contentLines.Length; i++) - { - var line = contentLines[i]; - switch (stat) - { - case DiffContentLineReadingStatus.readyToStartOver: - if (ContainsRefChanges(line)) - { - if (!string.IsNullOrEmpty(curFile) && additions > deletions) - { - refFilesWithAdditions.Add(curFile, additions - deletions); - // reset - additions = 0; - deletions = 0; - } - lookingAtRefDiff = true; - curFile = line.Substring(13, line.IndexOf(@".cs b/") + 3 - 13); - stat = DiffContentLineReadingStatus.expectingIndex; - } - else if (line.StartsWith("diff --git")) - { - lookingAtRefDiff = false; - } - else if (lookingAtRefDiff) - { - if (line.StartsWith("+") && !IsUnwantedDiff(line)) - { - additions++; - } - else if (line.StartsWith("-") && !IsUnwantedDiff(line)) - { - deletions++; - } - } - break; - case DiffContentLineReadingStatus.expectingIndex: - if (line.StartsWith("index ")) - { - stat = DiffContentLineReadingStatus.expectingTripleMinus; - } - break; - case DiffContentLineReadingStatus.expectingTripleMinus: - if (line.StartsWith("--- ")) - { - stat = DiffContentLineReadingStatus.expectingTriplePlus; - } - break; - case DiffContentLineReadingStatus.expectingTriplePlus: - if (line.StartsWith("+++ ")) - { - stat = DiffContentLineReadingStatus.expectingDoubleAtSign; - } - break; - case DiffContentLineReadingStatus.expectingDoubleAtSign: - if (line.StartsWith("@@ ")) - { - stat = DiffContentLineReadingStatus.readyToStartOver; - } - break; - default: - break; - } - } - if (!string.IsNullOrEmpty(curFile) && additions > deletions) - { - refFilesWithAdditions.Add(curFile, additions - deletions); - } - return refFilesWithAdditions.Any(); - // given a diff content - // readyToStartOver = true - // additions = 0, deletions = 0 - // read all lines - // for each line, if readyToStartOver and starts with diff: set expectingIndex to true - // for each line, if expectingIndex and starts with index: set expectingTripleMinus - // for each line, if expectingTripleMinus and starts ---: set expectingTriplePlus - // for each line, if expectingTriplePlus and starts with +++: set expectingDoubleAtSign - // for each line, if expectingTriplePlus and starts with @@: set readyToStartOver - // for each line, if readyToStartOver and starts with +: additions++ and if starts with - deletions++ - // for each line, if readyToStartOver and starts with +: additions++ and if starts with - deletions++ - // for each line, if readyToStartOver and starts with diff: ... (already planned for) - // - - - } - - private bool IsUnwantedDiff(string line) - { - if (string.IsNullOrWhiteSpace(line.Substring(1))) - { - return true; - } - var trimmed = line.Substring(1).TrimStart(); - if (trimmed.StartsWith("[") || trimmed.StartsWith("#") || trimmed.StartsWith("//") || trimmed.StartsWith("using ")) - { - return true; - } - return false; - } - - private bool ContainsRefChanges(string content) - { - if (content.Contains(@"/ref/") && content.Contains(".cs b/src/libraries")) - { - return true; - } - return false; // diff --git a/src/libraries/(.*)/ref/(.*).cs b/src/libraries/(.*)/ref/(.*).cs - } - - } -} diff --git a/tools/issue-labeler/src/Hubbup.MikLabelModel/LabelerLite.cs b/tools/issue-labeler/src/Hubbup.MikLabelModel/LabelerLite.cs deleted file mode 100644 index c4f67f1d64a..00000000000 --- a/tools/issue-labeler/src/Hubbup.MikLabelModel/LabelerLite.cs +++ /dev/null @@ -1,153 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -using System; -using System.Collections.Generic; -using System.Linq; -using System.Text; -using System.Text.RegularExpressions; -using System.Threading.Tasks; -using IssueLabeler.Shared; -using Microsoft.Extensions.Configuration; -using Microsoft.Extensions.Logging; -using Octokit; - -namespace Hubbup.MikLabelModel -{ - public class LabelerLite : ILabelerLite - { - private static Regex MentionsRegex { get; } = new Regex(@"@[a-zA-Z0-9_//-]+", RegexOptions.Compiled); - - private readonly ILogger _logger; - private readonly IModelHolderFactoryLite _modelHolderFactory; - private readonly IConfiguration _config; - private const float defaultConfidenceThreshold = 0.60f; - private const string defaultModel = "default model"; - - public LabelerLite( - ILogger logger, - IModelHolderFactoryLite modelHolderFactory, - IConfiguration config) - { - _logger = logger; - _modelHolderFactory = modelHolderFactory; - _config = config; - } - - public async Task> QueryLabelPrediction( - int issueNumber, - string title, - string body, - string issueUserLogin, - string repositoryName, - string repositoryOwnerName) - { - AssertNotNullOrEmpty(title, nameof(title)); - AssertNotNullOrEmpty(body, nameof(body)); - AssertNotNullOrEmpty(issueUserLogin, nameof(issueUserLogin)); - AssertNotNullOrEmpty(repositoryName, nameof(repositoryName)); - AssertNotNullOrEmpty(repositoryOwnerName, nameof(repositoryOwnerName)); - - _logger.LogInformation($"Predict Labels started query for {repositoryOwnerName}/{repositoryName}#{issueNumber}"); - - // Query raw predictions - var issueModel = CreateIssue(issueNumber, title, body, issueUserLogin); - var predictions = await GetPredictions(repositoryOwnerName, repositoryName, issueNumber, issueModel); - - // Determine the confidence threshold to use for filtering predictions - float confidenceThreshold; - - if (!float.TryParse(_config["ConfidenceThreshold"], out confidenceThreshold)) - { - confidenceThreshold = defaultConfidenceThreshold; - _logger.LogInformation($"Prediction confidence default threshold of {confidenceThreshold} will be used as no value was configured. {repositoryOwnerName}/{repositoryName}#{issueNumber}"); - } - else - { - _logger.LogInformation($"Prediction confidence threshold of {confidenceThreshold} will be used. {repositoryOwnerName}/{repositoryName}#{issueNumber}"); - } - - // Filter predictions based on the confidence threshold. - var predictedLabels = new List(); - - foreach (var labelSuggestion in predictions) - { - var topChoice = labelSuggestion.LabelScores.OrderByDescending(x => x.Score).First(); - - if (topChoice.Score >= confidenceThreshold) - { - predictedLabels.Add(topChoice.LabelName); - } - else - { - _logger.LogWarning($"Label prediction was below confidence level `{confidenceThreshold}` for Model:`{labelSuggestion.ModelConfigName ?? defaultModel}`: '{string.Join(", ", labelSuggestion.LabelScores.Select(x => $"{x.LabelName}:[{x.Score}]"))}'"); - } - } - - _logger.LogInformation($"Predict Labels query for {repositoryOwnerName}/{repositoryName}#{issueNumber} suggested {predictedLabels.Count} labels."); - return predictedLabels; - } - - private async Task> GetPredictions(string owner, string repo, int number, GitHubIssue issueModel) - { - List predictions = new List(); - List predictors = new List(); - - if (_config.TryGetConfigValue($"IssueModel.{repo.Replace("-", "_")}.BlobConfigNames", out var blobConfig)) - { - var blobConfigs = blobConfig.Split(';', StringSplitOptions.RemoveEmptyEntries); - foreach (var blobConfigName in blobConfigs) - { - // get a prediction for each model - var predictor = await _modelHolderFactory.GetPredictor(owner, repo, blobConfigName); - predictors.Add(predictor); - } - } - else - { - // Add just the default predictor - var predictor = await _modelHolderFactory.GetPredictor(owner, repo); - predictors.Add(predictor); - } - - foreach (var predictor in predictors) - { - var labelSuggestion = await predictor.Predict(issueModel); - labelSuggestion.ModelConfigName = predictor.ModelName; - if (labelSuggestion == null) - { - _logger.LogCritical($"Failed: Unable to get prediction for {owner}/{repo}#{number}. ModelName:{predictor.ModelName}"); - return null; - } - _logger.LogInformation($"Prediction results for {owner}/{repo}#{number}, Model:{labelSuggestion.ModelConfigName ?? defaultModel}: '{string.Join(",", labelSuggestion.LabelScores.Select(x => $"{x.LabelName}:{x.Score}"))}'"); - predictions.Add(labelSuggestion); - } - - return predictions; - } - - private static GitHubIssue CreateIssue(int number, string title, string body, string author) - { - var userMentions = MentionsRegex.Matches(body ?? string.Empty).Select(x => x.Value).ToArray(); - - return new GitHubIssue() - { - ID = number, - Title = title, - Description = body, - IsPR = 0, - Author = author, - UserMentions = string.Join(' ', userMentions), - NumMentions = userMentions.Length - }; - } - - private static void AssertNotNullOrEmpty(string value, string paramName) - { - if (string.IsNullOrEmpty(value)) - { - throw new ArgumentException($"{paramName} cannot be null or empty.", paramName); - } - } - } -} diff --git a/tools/issue-labeler/src/Hubbup.MikLabelModel/MikLabelerModel.cs b/tools/issue-labeler/src/Hubbup.MikLabelModel/MikLabelerModel.cs deleted file mode 100644 index d4d37749394..00000000000 --- a/tools/issue-labeler/src/Hubbup.MikLabelModel/MikLabelerModel.cs +++ /dev/null @@ -1,35 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -using IssueLabeler.Shared; -using Microsoft.ML; - -namespace Hubbup.MikLabelModel -{ - //This "Labeler" class could be used in a different End-User application (Web app, other console app, desktop app, etc.) - public class MikLabelerModel - { - private readonly PredictionEngine _issuePredictionEngine; - private readonly PredictionEngine _prPredictionEngine; - - public MikLabelerModel((string modelPath, string prModelPath) paths) - { - var modelPath = paths.modelPath; - var prModelPath = paths.prModelPath; - var mlContext = new MLContext(seed: 1); - - // Load model from file - var trainedModel = mlContext.Model.Load(modelPath, inputSchema: out _); - var trainedPrModel = mlContext.Model.Load(prModelPath, inputSchema: out _); - - _issuePredictionEngine = mlContext.Model.CreatePredictionEngine(trainedModel); - _prPredictionEngine = mlContext.Model.CreatePredictionEngine(trainedPrModel); - } - - public MikLabelerPredictor GetPredictor() - { - // Create prediction engine related to the loaded trained model - return new MikLabelerPredictor(_issuePredictionEngine, _prPredictionEngine); - } - } -} diff --git a/tools/issue-labeler/src/Hubbup.MikLabelModel/MikLabelerPredictor.cs b/tools/issue-labeler/src/Hubbup.MikLabelModel/MikLabelerPredictor.cs deleted file mode 100644 index 666c1bb2c44..00000000000 --- a/tools/issue-labeler/src/Hubbup.MikLabelModel/MikLabelerPredictor.cs +++ /dev/null @@ -1,137 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -using IssueLabeler.Shared; -using Microsoft.ML; -using Microsoft.ML.Data; -using Octokit; -using System; -using System.Collections.Generic; -using System.Linq; -using System.Text.RegularExpressions; - -namespace Hubbup.MikLabelModel -{ - public class MikLabelerPredictor - { - private readonly PredictionEngine _predictionEngine; - private readonly PredictionEngine _prPredictionEngine; - private readonly Regex _regex = new Regex(@"@[a-zA-Z0-9_//-]+"); - private readonly DiffHelper _diffHelper = new DiffHelper(); - - public MikLabelerPredictor(PredictionEngine predictionEngine, - PredictionEngine prPredictionEngine) - { - _predictionEngine = predictionEngine; - _prPredictionEngine = prPredictionEngine; - } - - public LabelSuggestion PredictLabel(Issue issue, string[] filePaths = null) - { - var userMentions = issue.Body != null ? _regex.Matches(issue.Body).Select(x => x.Value).ToArray() : new string[0]; - - List labelPredictions; - if (filePaths == null) - { - var aspnetIssue = new GitHubIssue - { - ID = issue.Number, - Title = issue.Title, - Description = issue.Body, - IsPR = 0, - Author = issue.User.Login, - UserMentions = string.Join(' ', userMentions), - NumMentions = userMentions.Length, - }; - var prediction = _predictionEngine.Predict(aspnetIssue); - labelPredictions = GetBestThreePredictions(prediction, forPrs: false); - } - else - { - var segmentedDiff = _diffHelper.SegmentDiff(filePaths); - var aspnetIssue = new GitHubPullRequest - { - ID = issue.Number, - Title = issue.Title, - Description = issue.Body, - IsPR = 1, - Author = issue.User.Login, - UserMentions = string.Join(' ', userMentions), - NumMentions = userMentions.Length, - FileCount = filePaths.Length, - Files = string.Join(' ', segmentedDiff.FileDiffs), - Filenames = string.Join(' ', segmentedDiff.Filenames), - FileExtensions = string.Join(' ', segmentedDiff.Extensions), - FolderNames = _diffHelper.FlattenWithWhitespace(segmentedDiff.FolderNames), - Folders = _diffHelper.FlattenWithWhitespace(segmentedDiff.Folders) - }; - var prediction = _prPredictionEngine.Predict(aspnetIssue); - labelPredictions = GetBestThreePredictions(prediction, forPrs: true); - } - - return new LabelSuggestion - { - LabelScores = labelPredictions, - }; - } - - public static List GetBestThreePredictions(float[] scores, VBuffer> slotNames) - { - var topThreeScores = GetIndexesOfTopScores(scores, 3); - - return new List - { - new ScoredLabel {LabelName=slotNames.GetItemOrDefault(topThreeScores[0]).ToString(), Score = scores[topThreeScores[0]] }, - new ScoredLabel {LabelName=slotNames.GetItemOrDefault(topThreeScores[1]).ToString(), Score = scores[topThreeScores[1]] }, - new ScoredLabel {LabelName=slotNames.GetItemOrDefault(topThreeScores[2]).ToString(), Score = scores[topThreeScores[2]] }, - }; - } - - private List GetBestThreePredictions(GitHubIssuePrediction prediction, bool forPrs) - { - var scores = prediction.Score; - - VBuffer> slotNames = default; - if (forPrs) - { - _prPredictionEngine.OutputSchema[nameof(GitHubIssuePrediction.Score)].GetSlotNames(ref slotNames); - } - else - { - _predictionEngine.OutputSchema[nameof(GitHubIssuePrediction.Score)].GetSlotNames(ref slotNames); - } - - var topThreeScores = GetIndexesOfTopScores(scores, 3); - - return new List - { - new ScoredLabel {LabelName=slotNames.GetItemOrDefault(topThreeScores[0]).ToString(), Score = scores[topThreeScores[0]] }, - new ScoredLabel {LabelName=slotNames.GetItemOrDefault(topThreeScores[1]).ToString(), Score = scores[topThreeScores[1]] }, - new ScoredLabel {LabelName=slotNames.GetItemOrDefault(topThreeScores[2]).ToString(), Score = scores[topThreeScores[2]] }, - }; - } - - private static IReadOnlyList GetIndexesOfTopScores(float[] scores, int n) - { - var indexedScores = scores - .Zip(Enumerable.Range(0, scores.Length), (score, index) => new IndexedScore(index, score)); - - var indexedScoresSortedByScore = indexedScores - .OrderByDescending(indexedScore => indexedScore.Score); - - return indexedScoresSortedByScore - .Take(n) - .Select(indexedScore => indexedScore.Index) - .ToList() - .AsReadOnly(); - } - - private struct IndexedScore - { - public IndexedScore(int index, float score) => (Index, Score) = (index, score); - - public int Index { get; } - public float Score { get; } - } - } -} diff --git a/tools/issue-labeler/src/Hubbup.MikLabelModel/MikLabelerProvider.cs b/tools/issue-labeler/src/Hubbup.MikLabelModel/MikLabelerProvider.cs deleted file mode 100644 index 94164b5967f..00000000000 --- a/tools/issue-labeler/src/Hubbup.MikLabelModel/MikLabelerProvider.cs +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -using Microsoft.Extensions.Logging; -using System.Collections.Concurrent; -using System.Diagnostics; - -namespace Hubbup.MikLabelModel -{ - public class MikLabelerProvider - { - private readonly ConcurrentDictionary<(string, string), MikLabelerModel> _mikLabelers = new ConcurrentDictionary<(string, string), MikLabelerModel>(); - private readonly ILogger _logger; - - public MikLabelerProvider(ILogger logger) - { - _logger = logger; - } - - public MikLabelerModel GetMikLabeler(IMikLabelerPathProvider pathProvider) - { - var paths = pathProvider.GetModelPath(); - return _mikLabelers.GetOrAdd( - paths, - p => - { - var stopwatch = new Stopwatch(); - stopwatch.Start(); - var model = new MikLabelerModel(p); - stopwatch.Stop(); - _logger.LogInformation("Creating new MikLabelerModel for paths {PATH} and {PR_PATH} in {TIME}ms", p.Item1, p.Item2, stopwatch.ElapsedMilliseconds); - return model; - }); - } - } -} diff --git a/tools/issue-labeler/src/Hubbup.MikLabelModel/MikLabelerStringPathProvider.cs b/tools/issue-labeler/src/Hubbup.MikLabelModel/MikLabelerStringPathProvider.cs deleted file mode 100644 index fb9cdee0302..00000000000 --- a/tools/issue-labeler/src/Hubbup.MikLabelModel/MikLabelerStringPathProvider.cs +++ /dev/null @@ -1,22 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -namespace Hubbup.MikLabelModel -{ - public class MikLabelerStringPathProvider : IMikLabelerPathProvider - { - private readonly string _path; - private readonly string _prPath; - - public MikLabelerStringPathProvider(string issuePath, string prPath) - { - _path = issuePath; - _prPath = prPath; - } - - (string issuePath, string prPath) IMikLabelerPathProvider.GetModelPath() - { - return (_path, _prPath); - } - } -} diff --git a/tools/issue-labeler/src/Hubbup.MikLabelModel/MyTrainerStrategy.cs b/tools/issue-labeler/src/Hubbup.MikLabelModel/MyTrainerStrategy.cs deleted file mode 100644 index 64536d7ae08..00000000000 --- a/tools/issue-labeler/src/Hubbup.MikLabelModel/MyTrainerStrategy.cs +++ /dev/null @@ -1,11 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -namespace Hubbup.MikLabelModel -{ - public enum MyTrainerStrategy - { - SdcaMultiClassTrainer = 1, - OVAAveragedPerceptronTrainer = 2, - }; -} diff --git a/tools/issue-labeler/src/Hubbup.MikLabelModel/Predictor.cs b/tools/issue-labeler/src/Hubbup.MikLabelModel/Predictor.cs deleted file mode 100644 index ddce0c8b65b..00000000000 --- a/tools/issue-labeler/src/Hubbup.MikLabelModel/Predictor.cs +++ /dev/null @@ -1,84 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -using IssueLabeler.Shared; -using Microsoft.Extensions.Logging; -using Microsoft.ML; -using Microsoft.ML.Data; -using System; -using System.Linq; -using System.Threading; -using System.Threading.Tasks; - -namespace Hubbup.MikLabelModel -{ - public class Predictor : IPredictor - { - private static SemaphoreSlim sem = new SemaphoreSlim(1); - private readonly ILogger logger; - private readonly IModelHolder modelHolder; - - public string ModelName { get; set; } - - public Predictor(ILogger logger, IModelHolder modelHolder) - { - this.logger = logger; - this.modelHolder = modelHolder; - } - - public Task Predict(GitHubIssue issue) - { - return Predict(issue, modelHolder.IssuePredEngine, logger); - } - - public Task Predict(GitHubPullRequest issue) - { - if (modelHolder.UseIssuesForPrsToo) - { - return Predict(issue, modelHolder.IssuePredEngine, logger); - } - return Predict(issue, modelHolder.PrPredEngine, logger); - } - - private static async Task Predict( - T issueOrPr, - PredictionEngine predEngine, - ILogger logger) - where T : GitHubIssue - { - if (predEngine == null) - { - throw new InvalidOperationException("expected prediction engine loaded."); - } - GitHubIssuePrediction prediction; - bool acquired = false; - - try - { - await sem.WaitAsync(); - acquired = true; - prediction = predEngine.Predict(issueOrPr); - } - finally - { - if (acquired) - { - sem.Release(); - } - } - - VBuffer> slotNames = default; - predEngine.OutputSchema[nameof(GitHubIssuePrediction.Score)].GetSlotNames(ref slotNames); - - float[] probabilities = prediction.Score; - var labelPredictions = MikLabelerPredictor.GetBestThreePredictions(probabilities, slotNames); - - float maxProbability = probabilities.Max(); - logger.LogInformation($"MaxProbability: {maxProbability} for #{issueOrPr.ID} - '{issueOrPr.Title}'"); - return new LabelSuggestion - { - LabelScores = labelPredictions, - }; - } - } -} diff --git a/tools/issue-labeler/src/IssueLabeler.Shared/AIOutput.cs b/tools/issue-labeler/src/IssueLabeler.Shared/AIOutput.cs deleted file mode 100644 index 6cea611df25..00000000000 --- a/tools/issue-labeler/src/IssueLabeler.Shared/AIOutput.cs +++ /dev/null @@ -1,15 +0,0 @@ -using System; -using System.Collections.Generic; -using System.Linq; -using System.Text; -using System.Threading.Tasks; - -namespace IssueLabeler.Shared -{ - public class AIOutput - { - public string Category { get; set; } - public string Service { get; set; } - public string Response { get; set; } - } -} diff --git a/tools/issue-labeler/src/Azure.Sdk.Labels/AzureSdkLabel.cs b/tools/issue-labeler/src/IssueLabeler.Shared/AzureSdkLabel.cs similarity index 80% rename from tools/issue-labeler/src/Azure.Sdk.Labels/AzureSdkLabel.cs rename to tools/issue-labeler/src/IssueLabeler.Shared/AzureSdkLabel.cs index 7f6dce4e5c9..ebedc6ae2e9 100644 --- a/tools/issue-labeler/src/Azure.Sdk.Labels/AzureSdkLabel.cs +++ b/tools/issue-labeler/src/IssueLabeler.Shared/AzureSdkLabel.cs @@ -1,12 +1,11 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -using System; -using Octokit; +using IssueLabeler.Shared.Models; -namespace Azure.Sdk.LabelTrainer +namespace IssueLabeler.Shared { - internal static class AzureSdkLabel + public static class AzureSdkLabel { public static bool IsServiceLabel(Label label) => string.Equals(label.Color, "e99695", StringComparison.InvariantCultureIgnoreCase); diff --git a/tools/issue-labeler/src/IssueLabeler.Shared/BackgroundTaskQueue.cs b/tools/issue-labeler/src/IssueLabeler.Shared/BackgroundTaskQueue.cs deleted file mode 100644 index bbb73a96dc4..00000000000 --- a/tools/issue-labeler/src/IssueLabeler.Shared/BackgroundTaskQueue.cs +++ /dev/null @@ -1,52 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -using Microsoft.Extensions.Logging; -using System.Collections.Concurrent; - -namespace IssueLabeler.Shared -{ - public interface IBackgroundTaskQueue - { - void QueueBackgroundWorkItem(Func workItem); - - Task> DequeueAsync( - CancellationToken cancellationToken); - } - - public class BackgroundTaskQueue : IBackgroundTaskQueue - { - private readonly ILogger _logger; - private ConcurrentQueue> _workItems = - new ConcurrentQueue>(); - private SemaphoreSlim _signal = new SemaphoreSlim(0); - - public BackgroundTaskQueue( - ILogger logger) - { - _logger = logger; - } - - public void QueueBackgroundWorkItem( - Func workItem) - { - if (workItem == null) - { - throw new ArgumentNullException(nameof(workItem)); - } - - _workItems.Enqueue(workItem); - _signal.Release(); - } - - public async Task> DequeueAsync( - CancellationToken cancellationToken) - { - await _signal.WaitAsync(cancellationToken); - _workItems.TryDequeue(out var workItem); - _logger.LogInformation("dequeued work item"); - - return workItem; - } - } -} \ No newline at end of file diff --git a/tools/issue-labeler/src/IssueLabeler.Shared/ConfigHelper.cs b/tools/issue-labeler/src/IssueLabeler.Shared/ConfigHelper.cs deleted file mode 100644 index a387cd1448d..00000000000 --- a/tools/issue-labeler/src/IssueLabeler.Shared/ConfigHelper.cs +++ /dev/null @@ -1,22 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -using Microsoft.Extensions.Configuration; - -namespace IssueLabeler.Shared -{ - public static class ConfigHelper - { - public static bool TryGetConfigValue(this IConfiguration config, string configName, out string? configValue, string? defaultValue = null) - { - - if (string.IsNullOrEmpty(config[configName])) - { - configValue = defaultValue; - return defaultValue != null; - } - configValue = config[configName]; - return true; - } - } -} diff --git a/tools/issue-labeler/src/IssueLabeler.Shared/DiffHelper.cs b/tools/issue-labeler/src/IssueLabeler.Shared/DiffHelper.cs deleted file mode 100644 index 35232e4ecb5..00000000000 --- a/tools/issue-labeler/src/IssueLabeler.Shared/DiffHelper.cs +++ /dev/null @@ -1,123 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -using System.Diagnostics; -using System.Text; - -namespace IssueLabeler.Shared -{ - public struct SegmentedDiff - { - public string[] FileDiffs { get; set; } - public IEnumerable Filenames { get; set; } - public IEnumerable Extensions { get; set; } - public Dictionary Folders { get; set; } - public Dictionary FolderNames { get; set; } - public bool AddDocInfo { get; set; } - public bool PossiblyExtensionsLabel { get; set; } - } - - public class DiffHelper : IDiffHelper - { - /// - /// name of files taken from fileDiffs - /// - public IEnumerable FilenamesOf(string[] fileDiffs) => fileDiffs.Select(fileWithDiff => Path.GetFileNameWithoutExtension(fileWithDiff)); - - /// - /// file extensions taken from fileDiffs - /// - public IEnumerable ExtensionsOf(string[] fileDiffs) => fileDiffs.Select(file => Path.GetExtension(file)). - Select(extension => string.IsNullOrEmpty(extension) ? "no_extension" : extension); - - public SegmentedDiff SegmentDiff(string[] fileDiffs) - { - if (fileDiffs == null || string.IsNullOrEmpty(string.Join(';', fileDiffs))) - { - throw new ArgumentNullException(nameof(fileDiffs)); - } - var folderNames = new Dictionary(); - var folders = new Dictionary(); - bool addDocInfo = false, possiblyExtensionsLabel = false; - string folderWithDiff, subfolder; - string[] folderNamesInPr; - foreach (var fileWithDiff in fileDiffs) - { - folderWithDiff = Path.GetDirectoryName(fileWithDiff) ?? string.Empty; - folderNamesInPr = folderWithDiff.Split(Path.DirectorySeparatorChar); - subfolder = string.Empty; - if (!string.IsNullOrEmpty(folderWithDiff)) - { - foreach (var folderNameInPr in folderNamesInPr) - { - if (folderNameInPr.Equals("ref", StringComparison.Ordinal) && - subfolder.StartsWith("src" + Path.DirectorySeparatorChar + "libraries", StringComparison.Ordinal) && - Path.GetExtension(fileWithDiff).Equals(".cs", StringComparison.OrdinalIgnoreCase)) - { - addDocInfo = true; - } - if (subfolder.StartsWith("src" + Path.DirectorySeparatorChar + "libraries" + Path.DirectorySeparatorChar + "Microsoft.Extensions.", StringComparison.Ordinal) && - Path.GetExtension(fileWithDiff).Equals(".cs", StringComparison.OrdinalIgnoreCase)) - { - possiblyExtensionsLabel = true; - } - subfolder += folderNameInPr; - if (folderNames.ContainsKey(folderNameInPr)) - { - folderNames[folderNameInPr] += 1; - } - else - { - folderNames.Add(folderNameInPr, 1); - } - if (folders.ContainsKey(subfolder)) - { - folders[subfolder] += 1; - } - else - { - folders.Add(subfolder, 1); - } - subfolder += Path.DirectorySeparatorChar; - } - } - } - return new SegmentedDiff() - { - FileDiffs = fileDiffs, - Filenames = FilenamesOf(fileDiffs), - Extensions = ExtensionsOf(fileDiffs), - Folders = folders, - FolderNames = folderNames, - AddDocInfo = addDocInfo, - PossiblyExtensionsLabel = possiblyExtensionsLabel - }; - } - - /// - /// flattens a dictionary to be repeated in a space separated format - /// - /// a dictionary containing text and number of times they were repeated - /// space delimited text - public string FlattenWithWhitespace(Dictionary folder) - { - var folderSb = new StringBuilder(); - foreach (var f in folder.OrderByDescending(x => x.Value)) - { - Debug.Assert(f.Value >= 1); - folderSb.Append(f.Key); - for (var j = 0; j < f.Value - 1; j++) - { - folderSb.Append(" ").Append(f.Key); - } - folderSb.Append(" "); - } - if (folderSb.Length == 0) - { - return string.Empty; - } - folderSb.Length--; - return folderSb.ToString(); - } - } -} diff --git a/tools/issue-labeler/src/IssueLabeler.Shared/GitHubClientFactory.cs b/tools/issue-labeler/src/IssueLabeler.Shared/GitHubClientFactory.cs deleted file mode 100644 index 86e42b03f39..00000000000 --- a/tools/issue-labeler/src/IssueLabeler.Shared/GitHubClientFactory.cs +++ /dev/null @@ -1,84 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -using Azure.Identity; -using Azure.Security.KeyVault.Secrets; -using GitHubJwt; -using Microsoft.Extensions.Configuration; -using Octokit; - -namespace IssueLabeler.Shared -{ - public sealed class GitHubClientFactory - { - private readonly IConfiguration _configuration; - - public GitHubClientFactory(IConfiguration configuration) - { - _configuration = configuration; - } - - public async Task CreateAsync() - { - // See: https://octokitnet.readthedocs.io/en/latest/github-apps/ for details. - - string localDevPAT = _configuration["GitHubDeveloperPAT"]; - if (localDevPAT != null) - { - return new GitHubClient(new ProductHeaderValue("GHNotif")) - { - Credentials = new Credentials(localDevPAT) - }; - } - else - { - var appId = Convert.ToInt32(_configuration["GitHubAppId"]); - SecretClient secretClient = new SecretClient(new Uri(_configuration["KeyVaultUri"]), new DefaultAzureCredential()); - KeyVaultSecret secret = await secretClient.GetSecretAsync(_configuration["AppSecretName"]).ConfigureAwait(false); - string privateKey = secret.Value; - - - var privateKeySource = new PlainStringPrivateKeySource(privateKey); - var generator = new GitHubJwtFactory( - privateKeySource, - new GitHubJwtFactoryOptions - { - AppIntegrationId = appId, - ExpirationSeconds = 8 * 60 // 600 is apparently too high - }); - var token = generator.CreateEncodedJwtToken(); - - var client = CreateForToken(token, AuthenticationType.Bearer); - await client.GitHubApps.GetAllInstallationsForCurrent(); - var installationTokenResult = await client.GitHubApps.CreateInstallationToken(long.Parse(_configuration["InstallationId"])); - - return CreateForToken(installationTokenResult.Token, AuthenticationType.Oauth); - } - } - - private static GitHubClient CreateForToken(string token, AuthenticationType authenticationType) - { - var productInformation = new ProductHeaderValue("issuelabelertemplate"); - var client = new GitHubClient(productInformation) - { - Credentials = new Credentials(token, authenticationType) - }; - return client; - } - - public sealed class PlainStringPrivateKeySource : IPrivateKeySource - { - private readonly string _key; - - public PlainStringPrivateKeySource(string key) - { - _key = key; - } - - public TextReader GetPrivateKeyReader() - { - return new StringReader(_key); - } - } - } -} diff --git a/tools/issue-labeler/src/IssueLabeler.Shared/GitHubClientWrapper.cs b/tools/issue-labeler/src/IssueLabeler.Shared/GitHubClientWrapper.cs deleted file mode 100644 index 0d042a7cc1e..00000000000 --- a/tools/issue-labeler/src/IssueLabeler.Shared/GitHubClientWrapper.cs +++ /dev/null @@ -1,136 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -using Microsoft.Extensions.Configuration; -using Microsoft.Extensions.Logging; -using Octokit; - -namespace IssueLabeler.Shared -{ - public interface IGitHubClientWrapper - { - Task GetIssue(string owner, string repo, int number); - Task GetPullRequest(string owner, string repo, int number); - Task> GetPullRequestFiles(string owner, string repo, int number); - Task CommentOn(string owner, string repo, int number, string v); - Task UpdateIssue(string owner, string repo, int number, IssueUpdate issueUpdate); - } - - public class GitHubClientWrapper : IGitHubClientWrapper - { - private readonly ILogger _logger; - private GitHubClient _client; - private readonly GitHubClientFactory _gitHubClientFactory; - - public GitHubClientWrapper( - ILogger logger, - GitHubClientFactory gitHubClientFactory) - { - _gitHubClientFactory = gitHubClientFactory; - _logger = logger; - - } - - // TODO add lambda to remove repetetive logic in this class - // -> call and pass a lambda calls create, and if fails remake and call it again. - - public async Task GetIssue(string owner, string repo, int number) - { - if (_client == null) - { - _client = await _gitHubClientFactory.CreateAsync(); - } - Octokit.Issue iop = null; - try - { - iop = await _client.Issue.Get(owner, repo, number); - } - catch (Exception ex) - { - _logger.LogError($"ex was of type {ex.GetType()}, message: {ex.Message}"); - _client = await _gitHubClientFactory.CreateAsync(); - iop = await _client.Issue.Get(owner, repo, number); - } - return iop; - } - - public async Task GetPullRequest(string owner, string repo, int number) - { - if (_client == null) - { - _client = await _gitHubClientFactory.CreateAsync(); - } - Octokit.PullRequest iop = null; - try - { - iop = await _client.PullRequest.Get(owner, repo, number); - } - catch (Exception ex) - { - _logger.LogError($"ex was of type {ex.GetType()}, message: {ex.Message}"); - _client = await _gitHubClientFactory.CreateAsync(); - iop = await _client.PullRequest.Get(owner, repo, number); - } - return iop; - } - - public async Task> GetPullRequestFiles(string owner, string repo, int number) - { - if (_client == null) - { - _client = await _gitHubClientFactory.CreateAsync(); - } - IReadOnlyList prFiles = null; - try - { - prFiles = await _client.PullRequest.Files(owner, repo, number); - - } - catch (Exception ex) - { - _logger.LogError($"ex was of type {ex.GetType()}, message: {ex.Message}"); - _client = await _gitHubClientFactory.CreateAsync(); - prFiles = await _client.PullRequest.Files(owner, repo, number); - } - return prFiles; - } - - public async Task UpdateIssue(string owner, string repo, int number, IssueUpdate issueUpdate) - { - if (_client == null) - { - _client = await _gitHubClientFactory.CreateAsync(); - } - try - { - await _client.Issue.Update(owner, repo, number, issueUpdate); - } - catch (Exception ex) - { - _logger.LogError($"ex was of type {ex.GetType()}, message: {ex.Message}"); - _client = await _gitHubClientFactory.CreateAsync(); - await _client.Issue.Update(owner, repo, number, issueUpdate); - } - } - - // lambda -> call and pass a lambda calls create, and if fails remake and call it again. - - public async Task CommentOn(string owner, string repo, int number, string comment) - { - if (_client == null) - { - _client = await _gitHubClientFactory.CreateAsync(); - } - try - { - await _client.Issue.Comment.Create(owner, repo, number, comment); - } - catch (Exception ex) - { - _logger.LogError($"ex was of type {ex.GetType()}, message: {ex.Message}"); - _client = await _gitHubClientFactory.CreateAsync(); - await _client.Issue.Comment.Create(owner, repo, number, comment); - } - } - } -} \ No newline at end of file diff --git a/tools/issue-labeler/src/IssueLabeler.Shared/GitHubIssue.cs b/tools/issue-labeler/src/IssueLabeler.Shared/GitHubIssue.cs index 05614f33b67..9fcfb6279ac 100644 --- a/tools/issue-labeler/src/IssueLabeler.Shared/GitHubIssue.cs +++ b/tools/issue-labeler/src/IssueLabeler.Shared/GitHubIssue.cs @@ -4,111 +4,26 @@ #pragma warning disable 649 // We don't care about unsused fields here, because they are mapped with the input file. using Microsoft.ML.Data; -using Octokit; namespace IssueLabeler.Shared { - - public class RepoIssueResult - { - public string Repo { get; set; } - public string Owner { get; set; } - public IReadOnlyList Issues { get; set; } - public int TotalCount { get; set; } - public List