diff --git a/alert-policies/azure-machine-learning/Errors.yml b/alert-policies/azure-machine-learning/Errors.yml new file mode 100644 index 0000000000..0faf15007c --- /dev/null +++ b/alert-policies/azure-machine-learning/Errors.yml @@ -0,0 +1,33 @@ +name: Errors + +description: |+ + This alert is triggered when the number of errors stays above 20 (critical) or above 10 (warning) for 10 minutes. +type: STATIC +nrql: + query: "FROM Metric SELECT sum(azure.machinelearningservices.workspaces.Errors) AS 'Errors'" + +# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE) +valueFunction: SINGLE_VALUE + +# List of Critical and Warning thresholds for the condition +terms: + - priority: CRITICAL + # Operator used to compare against the threshold. + operator: ABOVE + # Value that triggers a violation + threshold: 20 + # Time in seconds; 120 - 3600 + thresholdDuration: 600 + # How many data points must be in violation for the duration + thresholdOccurrences: ALL + + # Adding a Warning threshold is optional + - priority: WARNING + operator: ABOVE + threshold: 10 + thresholdDuration: 600 + thresholdOccurrences: ALL + +# Duration after which a violation automatically closes +# Time in seconds; 300 - 2592000 (Default: 86400 [1 day]) +violationTimeLimitSeconds: 86400 \ No newline at end of file diff --git a/dashboards/azure-machine-learning/Azure-Machine-Learning.png b/dashboards/azure-machine-learning/Azure-Machine-Learning.png new file mode 100644 index 0000000000..6cec96b9c7 Binary files /dev/null and b/dashboards/azure-machine-learning/Azure-Machine-Learning.png differ diff --git a/dashboards/azure-machine-learning/Azure-Machine-Learning01.png b/dashboards/azure-machine-learning/Azure-Machine-Learning01.png new file mode 100644 index 0000000000..2f8fbe27de Binary files /dev/null and b/dashboards/azure-machine-learning/Azure-Machine-Learning01.png differ diff --git a/dashboards/azure-machine-learning/Azure-Machine-Learning02.png 
b/dashboards/azure-machine-learning/Azure-Machine-Learning02.png new file mode 100644 index 0000000000..d7ef549f0b Binary files /dev/null and b/dashboards/azure-machine-learning/Azure-Machine-Learning02.png differ diff --git a/dashboards/azure-machine-learning/Azure-Machine-Learning03.png b/dashboards/azure-machine-learning/Azure-Machine-Learning03.png new file mode 100644 index 0000000000..7e87761719 Binary files /dev/null and b/dashboards/azure-machine-learning/Azure-Machine-Learning03.png differ diff --git a/dashboards/azure-machine-learning/Azure-Machine-Learning04.png b/dashboards/azure-machine-learning/Azure-Machine-Learning04.png new file mode 100644 index 0000000000..b22016e9cd Binary files /dev/null and b/dashboards/azure-machine-learning/Azure-Machine-Learning04.png differ diff --git a/dashboards/azure-machine-learning/azure-machine-learning.json b/dashboards/azure-machine-learning/azure-machine-learning.json new file mode 100644 index 0000000000..5673359f42 --- /dev/null +++ b/dashboards/azure-machine-learning/azure-machine-learning.json @@ -0,0 +1,1178 @@ +{ + "name": "Azure Machine Learning using Azure Monitor", + "description": null, + "pages": [ + { + "name": "Job Executions", + "description": null, + "widgets": [ + { + "title": "", + "layout": { + "column": 1, + "row": 1, + "width": 2, + "height": 2 + }, + "visualization": { + "id": "viz.markdown" + }, + "rawConfiguration": { + "text": "## Azure Machine Learning\nAzure Machine Learning enables data scientists and developers to build, deploy, and manage high-quality models quickly and with confidence. It shortens time to value by combining machine learning operations (MLOps), seamless compatibility with open-source tools, and a suite of integrated resources." 
+ } + }, + { + "title": "Cancel requested runs", + "layout": { + "column": 3, + "row": 1, + "width": 2, + "height": 2 + }, + "visualization": { + "id": "viz.billboard" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "nrqlQueries": [ + { + "accountId": 0, + "query": "FROM Metric SELECT sum(azure.machinelearningservices.workspaces.CancelRequestedRuns) AS 'CancelRequestedRuns'" + } + ], + "platformOptions": { + "ignoreTimeRange": false + }, + "thresholds": [ + { + "alertSeverity": "WARNING", + "value": -1 + } + ] + } + }, + { + "title": "Failed runs", + "layout": { + "column": 5, + "row": 1, + "width": 2, + "height": 2 + }, + "visualization": { + "id": "viz.billboard" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "nrqlQueries": [ + { + "accountId": 0, + "query": "FROM Metric SELECT sum(azure.machinelearningservices.workspaces.FailedRuns) AS 'FailedRuns'" + } + ], + "platformOptions": { + "ignoreTimeRange": false + }, + "thresholds": [ + { + "alertSeverity": "CRITICAL", + "value": -1 + } + ] + } + }, + { + "title": "Cancelled runs", + "layout": { + "column": 7, + "row": 1, + "width": 2, + "height": 2 + }, + "visualization": { + "id": "viz.billboard" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "nrqlQueries": [ + { + "accountId": 0, + "query": "FROM Metric SELECT sum(azure.machinelearningservices.workspaces.CancelledRuns) AS 'CancelledRuns'" + } + ], + "platformOptions": { + "ignoreTimeRange": false + }, + "thresholds": [ + { + "alertSeverity": "WARNING", + "value": -1 + } + ] + } + }, + { + "title": "Completed runs", + "layout": { + "column": 9, + "row": 1, + "width": 2, + "height": 2 + }, + "visualization": { + "id": "viz.billboard" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "nrqlQueries": [ + { + "accountId": 0, + "query": "FROM Metric SELECT sum(azure.machinelearningservices.workspaces.CompletedRuns) AS 'CompletedRuns'" + } + ], + 
"platformOptions": { + "ignoreTimeRange": false + }, + "thresholds": [ + { + "alertSeverity": "WARNING", + "value": 1000000 + } + ] + } + }, + { + "title": "Queued runs", + "layout": { + "column": 11, + "row": 1, + "width": 2, + "height": 2 + }, + "visualization": { + "id": "viz.billboard" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "nrqlQueries": [ + { + "accountId": 0, + "query": "FROM Metric SELECT sum(azure.machinelearningservices.workspaces.QueuedRuns) AS 'QueuedRuns'" + } + ], + "platformOptions": { + "ignoreTimeRange": false + }, + "thresholds": [ + { + "alertSeverity": "WARNING", + "value": -1 + } + ] + } + }, + { + "title": "Not responding and Not started runs", + "layout": { + "column": 1, + "row": 3, + "width": 8, + "height": 2 + }, + "visualization": { + "id": "viz.stacked-bar" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "legend": { + "enabled": true + }, + "nrqlQueries": [ + { + "accountId": 0, + "query": "FROM Metric SELECT sum(azure.machinelearningservices.workspaces.NotRespondingRuns) AS 'NotRespondingRuns', sum(azure.machinelearningservices.workspaces.NotStartedRuns) AS 'NotStartedRuns' TIMESERIES AUTO" + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + }, + { + "title": "Errors", + "layout": { + "column": 9, + "row": 3, + "width": 2, + "height": 2 + }, + "visualization": { + "id": "viz.billboard" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "nrqlQueries": [ + { + "accountId": 0, + "query": "FROM Metric SELECT sum(azure.machinelearningservices.workspaces.Errors) AS 'Errors'" + } + ], + "platformOptions": { + "ignoreTimeRange": false + }, + "thresholds": [ + { + "alertSeverity": "CRITICAL", + "value": -1 + } + ] + } + }, + { + "title": "Warnings", + "layout": { + "column": 11, + "row": 3, + "width": 2, + "height": 2 + }, + "visualization": { + "id": "viz.billboard" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, 
+ "nrqlQueries": [ + { + "accountId": 0, + "query": "FROM Metric SELECT sum(azure.machinelearningservices.workspaces.Warnings) AS 'Warnings'" + } + ], + "platformOptions": { + "ignoreTimeRange": false + }, + "thresholds": [ + { + "alertSeverity": "WARNING", + "value": -1 + } + ] + } + }, + { + "title": "Provisioning and Preparing runs", + "layout": { + "column": 1, + "row": 5, + "width": 4, + "height": 3 + }, + "visualization": { + "id": "viz.line" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "legend": { + "enabled": true + }, + "nrqlQueries": [ + { + "accountId": 0, + "query": "FROM Metric SELECT sum(azure.machinelearningservices.workspaces.ProvisioningRuns) AS 'ProvisioningRuns', average(azure.machinelearningservices.workspaces.PreparingRuns) AS 'PreparingRuns' TIMESERIES AUTO" + } + ], + "platformOptions": { + "ignoreTimeRange": false + }, + "yAxisLeft": { + "zero": true + } + } + }, + { + "title": "Started and Starting runs", + "layout": { + "column": 5, + "row": 5, + "width": 4, + "height": 3 + }, + "visualization": { + "id": "viz.area" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "legend": { + "enabled": true + }, + "nrqlQueries": [ + { + "accountId": 0, + "query": "FROM Metric SELECT sum(azure.machinelearningservices.workspaces.StartedRuns) AS 'StartedRuns', sum(azure.machinelearningservices.workspaces.StartingRuns) AS 'StartingRuns' TIMESERIES AUTO" + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + }, + { + "title": "Finalizing runs", + "layout": { + "column": 9, + "row": 5, + "width": 4, + "height": 3 + }, + "visualization": { + "id": "viz.line" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "legend": { + "enabled": true + }, + "nrqlQueries": [ + { + "accountId": 0, + "query": "FROM Metric SELECT sum(azure.machinelearningservices.workspaces.FinalizingRuns) AS 'FinalizingRuns' TIMESERIES AUTO" + } + ], + "platformOptions": { + "ignoreTimeRange": 
false + }, + "yAxisLeft": { + "zero": true + } + } + } + ] + }, + { + "name": "Model Metrics", + "description": null, + "widgets": [ + { + "title": "", + "layout": { + "column": 1, + "row": 1, + "width": 4, + "height": 2 + }, + "visualization": { + "id": "viz.markdown" + }, + "rawConfiguration": { + "text": "# Azure Machine Learning Model\nThe Azure Machine Learning Model can be generated through either an Azure Machine Learning training run or through an external model training process that occurs outside of the Azure ecosystem." + } + }, + { + "title": "Model deploy failed", + "layout": { + "column": 5, + "row": 1, + "width": 2, + "height": 2 + }, + "visualization": { + "id": "viz.billboard" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "nrqlQueries": [ + { + "accountId": 0, + "query": "FROM Metric SELECT sum(azure.machinelearningservices.workspaces.ModelDeployFailed) AS 'ModelDeployFailed'" + } + ], + "platformOptions": { + "ignoreTimeRange": false + }, + "thresholds": [ + { + "alertSeverity": "CRITICAL", + "value": -1 + } + ] + } + }, + { + "title": "Model register failed", + "layout": { + "column": 7, + "row": 1, + "width": 2, + "height": 2 + }, + "visualization": { + "id": "viz.billboard" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "nrqlQueries": [ + { + "accountId": 0, + "query": "FROM Metric SELECT sum(azure.machinelearningservices.workspaces.ModelRegisterFailed) AS 'ModelRegisterFailed'" + } + ], + "platformOptions": { + "ignoreTimeRange": false + }, + "thresholds": [ + { + "alertSeverity": "CRITICAL", + "value": -1 + } + ] + } + }, + { + "title": "Model deploy succeeded", + "layout": { + "column": 9, + "row": 1, + "width": 2, + "height": 2 + }, + "visualization": { + "id": "viz.billboard" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "nrqlQueries": [ + { + "accountId": 0, + "query": "FROM Metric SELECT 
sum(azure.machinelearningservices.workspaces.ModelDeploySucceeded) AS 'ModelDeploySucceeded'" + } + ], + "platformOptions": { + "ignoreTimeRange": false + }, + "thresholds": [ + { + "alertSeverity": "WARNING", + "value": 100000 + } + ] + } + }, + { + "title": "Model register succeeded", + "layout": { + "column": 11, + "row": 1, + "width": 2, + "height": 2 + }, + "visualization": { + "id": "viz.billboard" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "nrqlQueries": [ + { + "accountId": 0, + "query": "FROM Metric SELECT sum(azure.machinelearningservices.workspaces.ModelRegisterSucceeded) AS 'ModelRegisterSucceeded'" + } + ], + "platformOptions": { + "ignoreTimeRange": false + }, + "thresholds": [ + { + "alertSeverity": "WARNING", + "value": 100000 + } + ] + } + }, + { + "title": "Model deploy started", + "layout": { + "column": 1, + "row": 3, + "width": 12, + "height": 4 + }, + "visualization": { + "id": "viz.line" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "legend": { + "enabled": true + }, + "nrqlQueries": [ + { + "accountId": 0, + "query": "FROM Metric SELECT sum(azure.machinelearningservices.workspaces.ModelDeployStarted) AS 'ModelDeployStarted' TIMESERIES AUTO " + } + ], + "platformOptions": { + "ignoreTimeRange": false + }, + "yAxisLeft": { + "zero": true + } + } + } + ] + }, + { + "name": "Quota Metrics", + "description": null, + "widgets": [ + { + "title": "", + "layout": { + "column": 1, + "row": 1, + "width": 3, + "height": 2 + }, + "visualization": { + "id": "viz.markdown" + }, + "rawConfiguration": { + "text": "# Azure Quotas for Machine Learning\nAzure uses limits and quotas to prevent budget overruns due to fraud, and to honor Azure capacity constraints." 
+ } + }, + { + "title": "Quota utilization percentage", + "layout": { + "column": 4, + "row": 1, + "width": 9, + "height": 2 + }, + "visualization": { + "id": "viz.line" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "legend": { + "enabled": true + }, + "nrqlQueries": [ + { + "accountId": 0, + "query": " FROM Metric SELECT average(azure.machinelearningservices.workspaces.QuotaUtilizationPercentage) AS 'QuotaUtilizationPercentage' TIMESERIES AUTO " + } + ], + "platformOptions": { + "ignoreTimeRange": false + }, + "yAxisLeft": { + "zero": true + } + } + }, + { + "title": "Cores Information", + "layout": { + "column": 1, + "row": 3, + "width": 6, + "height": 4 + }, + "visualization": { + "id": "viz.stacked-bar" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "legend": { + "enabled": true + }, + "nrqlQueries": [ + { + "accountId": 0, + "query": "FROM Metric SELECT average(azure.machinelearningservices.workspaces.TotalCores) AS 'TotalCores',average(azure.machinelearningservices.workspaces.IdleCores) AS 'IdleCores',average(azure.machinelearningservices.workspaces.ActiveCores) AS 'ActiveCores',average(azure.machinelearningservices.workspaces.LeavingCores) AS 'LeavingCores',average(azure.machinelearningservices.workspaces.UnusableCores) AS 'UnusableCores',average(azure.machinelearningservices.workspaces.PreemptedCores) AS 'PreemptedCores' TIMESERIES AUTO " + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + }, + { + "title": "Nodes Information", + "layout": { + "column": 7, + "row": 3, + "width": 6, + "height": 4 + }, + "visualization": { + "id": "viz.stacked-bar" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "legend": { + "enabled": true + }, + "nrqlQueries": [ + { + "accountId": 0, + "query": "FROM Metric SELECT average(azure.machinelearningservices.workspaces.TotalNodes) AS 'TotalNodes',average(azure.machinelearningservices.workspaces.IdleNodes) AS 
'IdleNodes',average(azure.machinelearningservices.workspaces.ActiveNodes) AS 'ActiveNodes',average(azure.machinelearningservices.workspaces.LeavingNodes) AS 'LeavingNodes',average(azure.machinelearningservices.workspaces.UnusableNodes) AS 'UnusableNodes',average(azure.machinelearningservices.workspaces.PreemptedNodes) AS 'PreemptedNodes' TIMESERIES AUTO" + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + } + ] + }, + { + "name": "CPU Usage", + "description": null, + "widgets": [ + { + "title": "Cpu memory capacity (MB)", + "layout": { + "column": 1, + "row": 1, + "width": 2, + "height": 2 + }, + "visualization": { + "id": "viz.billboard" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "nrqlQueries": [ + { + "accountId": 0, + "query": "FROM Metric SELECT average(azure.machinelearningservices.workspaces.CpuMemoryCapacityMegabytes)/1e+6 AS 'CpuMemoryCapacityMegabytes'" + } + ], + "platformOptions": { + "ignoreTimeRange": false + }, + "thresholds": [ + { + "alertSeverity": "WARNING", + "value": -1 + } + ] + } + }, + { + "title": "Cpu capacity millicores", + "layout": { + "column": 3, + "row": 1, + "width": 2, + "height": 2 + }, + "visualization": { + "id": "viz.billboard" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "nrqlQueries": [ + { + "accountId": 0, + "query": "FROM Metric SELECT average(azure.machinelearningservices.workspaces.CpuCapacityMillicores) AS 'CpuCapacityMillicores'" + } + ], + "platformOptions": { + "ignoreTimeRange": false + }, + "thresholds": [ + { + "alertSeverity": "WARNING", + "value": -1 + } + ] + } + }, + { + "title": "Cpu utilization", + "layout": { + "column": 5, + "row": 1, + "width": 8, + "height": 2 + }, + "visualization": { + "id": "viz.line" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "legend": { + "enabled": true + }, + "nrqlQueries": [ + { + "accountId": 0, + "query": "FROM Metric SELECT 
average(azure.machinelearningservices.workspaces.CpuUtilization) AS 'CpuUtilization' TIMESERIES AUTO" + } + ], + "platformOptions": { + "ignoreTimeRange": false + }, + "units": { + "unit": "PERCENTAGE" + }, + "yAxisLeft": { + "zero": true + } + } + }, + { + "title": "DiskAvailMegabytes & DiskUsedMegabytes", + "layout": { + "column": 1, + "row": 3, + "width": 4, + "height": 3 + }, + "visualization": { + "id": "viz.line" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "legend": { + "enabled": true + }, + "nrqlQueries": [ + { + "accountId": 0, + "query": "FROM Metric SELECT average(azure.machinelearningservices.workspaces.DiskAvailMegabytes) AS 'DiskAvailMegabytes', average(azure.machinelearningservices.workspaces.DiskUsedMegabytes) AS 'DiskUsedMegabytes' TIMESERIES AUTO" + } + ], + "platformOptions": { + "ignoreTimeRange": false + }, + "units": { + "unit": "BYTES" + }, + "yAxisLeft": { + "zero": true + } + } + }, + { + "title": "Disk read & write megabytes", + "layout": { + "column": 5, + "row": 3, + "width": 4, + "height": 3 + }, + "visualization": { + "id": "viz.line" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "legend": { + "enabled": true + }, + "nrqlQueries": [ + { + "accountId": 0, + "query": "FROM Metric SELECT average(azure.machinelearningservices.workspaces.DiskReadMegabytes) AS 'DiskReadMegabytes',average(azure.machinelearningservices.workspaces.DiskWriteMegabytes) AS 'DiskWriteMegabytes' TIMESERIES AUTO" + } + ], + "platformOptions": { + "ignoreTimeRange": false + }, + "units": { + "unit": "BYTES" + }, + "yAxisLeft": { + "zero": true + } + } + }, + { + "title": "Cpu memory utilization megabytes", + "layout": { + "column": 9, + "row": 3, + "width": 4, + "height": 3 + }, + "visualization": { + "id": "viz.line" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "legend": { + "enabled": true + }, + "nrqlQueries": [ + { + "accountId": 0, + "query": "FROM Metric SELECT 
average(azure.machinelearningservices.workspaces.CpuMemoryUtilizationMegabytes) AS 'CpuMemoryUtilizationMegabytes' TIMESERIES AUTO" + } + ], + "platformOptions": { + "ignoreTimeRange": false + }, + "units": { + "unit": "BYTES" + }, + "yAxisLeft": { + "zero": true + } + } + } + ] + }, + { + "name": "GPU Usage", + "description": null, + "widgets": [ + { + "title": "IBTransmitMegabytes", + "layout": { + "column": 1, + "row": 1, + "width": 2, + "height": 2 + }, + "visualization": { + "id": "viz.billboard" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "nrqlQueries": [ + { + "accountId": 0, + "query": "FROM Metric SELECT average(azure.machinelearningservices.workspaces.IBTransmitMegabytes) AS 'IBTransmitMegabytes'" + } + ], + "platformOptions": { + "ignoreTimeRange": false + }, + "thresholds": [ + { + "alertSeverity": "WARNING", + "value": -1 + } + ] + } + }, + { + "title": "IBReceiveMegabytes", + "layout": { + "column": 3, + "row": 1, + "width": 2, + "height": 2 + }, + "visualization": { + "id": "viz.billboard" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "nrqlQueries": [ + { + "accountId": 0, + "query": "FROM Metric SELECT average(azure.machinelearningservices.workspaces.IBReceiveMegabytes) AS 'IBReceiveMegabytes'" + } + ], + "platformOptions": { + "ignoreTimeRange": false + }, + "thresholds": [ + { + "alertSeverity": "WARNING", + "value": -1 + } + ] + } + }, + { + "title": "Gpu Memory Capacity & Gpu Memory Utilization Megabytes", + "layout": { + "column": 5, + "row": 1, + "width": 8, + "height": 2 + }, + "visualization": { + "id": "viz.line" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "legend": { + "enabled": true + }, + "nrqlQueries": [ + { + "accountId": 0, + "query": "FROM Metric SELECT average(azure.machinelearningservices.workspaces.GpuMemoryCapacityMegabytes) AS 'GpuMemoryCapacityMegabytes', average(azure.machinelearningservices.workspaces.GpuMemoryUtilizationMegabytes) 
AS 'GpuMemoryUtilizationMegabytes' TIMESERIES AUTO" + } + ], + "platformOptions": { + "ignoreTimeRange": false + }, + "units": { + "unit": "BYTES" + }, + "yAxisLeft": { + "zero": true + } + } + }, + { + "title": "Gpu capacity (MilliGPUs)", + "layout": { + "column": 1, + "row": 3, + "width": 4, + "height": 3 + }, + "visualization": { + "id": "viz.line" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "legend": { + "enabled": true + }, + "nrqlQueries": [ + { + "accountId": 0, + "query": "FROM Metric SELECT average(azure.machinelearningservices.workspaces.GpuCapacityMilliGPUs) AS 'GpuCapacityMilliGPUs' TIMESERIES AUTO" + } + ], + "platformOptions": { + "ignoreTimeRange": false + }, + "yAxisLeft": { + "zero": true + } + } + }, + { + "title": "Gpu energy joules", + "layout": { + "column": 5, + "row": 3, + "width": 4, + "height": 3 + }, + "visualization": { + "id": "viz.line" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "legend": { + "enabled": true + }, + "nrqlQueries": [ + { + "accountId": 0, + "query": "FROM Metric SELECT sum(azure.machinelearningservices.workspaces.GpuEnergyJoules) AS 'GpuEnergyJoules' TIMESERIES AUTO" + } + ], + "platformOptions": { + "ignoreTimeRange": false + }, + "yAxisLeft": { + "zero": true + } + } + }, + { + "title": "Gpu memory utilization & Gpu utilization (%)", + "layout": { + "column": 9, + "row": 3, + "width": 4, + "height": 3 + }, + "visualization": { + "id": "viz.line" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "legend": { + "enabled": true + }, + "nrqlQueries": [ + { + "accountId": 0, + "query": "FROM Metric SELECT average(azure.machinelearningservices.workspaces.GpuMemoryUtilizationPercentage) AS 'GpuMemoryUtilizationPercentage', average(azure.machinelearningservices.workspaces.GpuUtilizationPercentage) AS 'GpuUtilizationPercentage' TIMESERIES AUTO" + } + ], + "platformOptions": { + "ignoreTimeRange": false + }, + "units": { + "unit": "PERCENTAGE" + }, + 
"yAxisLeft": { + "zero": true + } + } + }, + { + "title": "Gpu utilization & Gpu utilization (MilliGPUs)", + "layout": { + "column": 1, + "row": 6, + "width": 4, + "height": 3 + }, + "visualization": { + "id": "viz.line" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "legend": { + "enabled": true + }, + "nrqlQueries": [ + { + "accountId": 0, + "query": "FROM Metric SELECT average(azure.machinelearningservices.workspaces.GpuUtilization) AS 'GpuUtilization',average(azure.machinelearningservices.workspaces.GpuUtilizationMilliGPUs) AS 'GpuUtilizationMilliGPUs' TIMESERIES AUTO" + } + ], + "platformOptions": { + "ignoreTimeRange": false + }, + "yAxisLeft": { + "zero": true + } + } + }, + { + "title": "Storage API failure & success count", + "layout": { + "column": 5, + "row": 6, + "width": 4, + "height": 3 + }, + "visualization": { + "id": "viz.line" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "legend": { + "enabled": true + }, + "nrqlQueries": [ + { + "accountId": 0, + "query": "FROM Metric SELECT sum(azure.machinelearningservices.workspaces.StorageAPIFailureCount) AS 'StorageAPIFailureCount',sum(azure.machinelearningservices.workspaces.StorageAPISuccessCount) AS 'StorageAPISuccessCount' TIMESERIES AUTO" + } + ], + "platformOptions": { + "ignoreTimeRange": false + }, + "yAxisLeft": { + "zero": true + } + } + }, + { + "title": "Network input & output megabytes", + "layout": { + "column": 9, + "row": 6, + "width": 4, + "height": 3 + }, + "visualization": { + "id": "viz.line" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "legend": { + "enabled": true + }, + "nrqlQueries": [ + { + "accountId": 0, + "query": "FROM Metric SELECT average(azure.machinelearningservices.workspaces.NetworkInputMegabytes) AS 'NetworkInputMegabytes',average(azure.machinelearningservices.workspaces.NetworkOutputMegabytes) AS 'NetworkOutputMegabytes' TIMESERIES AUTO" + } + ], + "platformOptions": { + 
"ignoreTimeRange": false + }, + "units": { + "unit": "BYTES" + }, + "yAxisLeft": { + "zero": true + } + } + } + ] + } + ] +} \ No newline at end of file diff --git a/quickstarts/azure/azure-machine-learning/config.yml b/quickstarts/azure/azure-machine-learning/config.yml new file mode 100644 index 0000000000..72444f4834 --- /dev/null +++ b/quickstarts/azure/azure-machine-learning/config.yml @@ -0,0 +1,40 @@ +slug: azure-machine-learning +title: Azure Machine Learning +description: |- + ## What is Azure Machine Learning? + + Azure Machine Learning empowers data scientists and developers to build, deploy, and manage high-quality models faster and with confidence. + + ### New Relic Azure Machine Learning quickstart features + + A standard dashboard that tracks key indicators like Job Execution, ActiveCores, CpuUtilization, FinalizingRuns, Model Deployment and more. It runs custom queries and visualizes the data immediately. + + ### Why monitor Azure Machine Learning with New Relic? + + [New Relic Azure Machine Learning](https://docs.newrelic.com/docs/infrastructure/microsoft-azure-integrations/azure-integrations-list/azure-machine-learning-monitoring-integration/) monitoring quickstart empowers you to track the performance of Azure Machine Learning via different metrics including Job Execution, ActiveCores, CpuUtilization, FinalizingRuns, Model Deployment and more. + + Our integration features a standard dashboard that provides interactive visualizations to explore your data, understand context, and get valuable insights. + + Start ingesting your Azure data today and get immediate access to our visualization dashboards so you can optimize your Azure service. +summary: |- + Monitor Azure Machine Learning by connecting Azure to New Relic +icon: logo.png +level: New Relic +authors: + - New Relic + - New Relic Partner +documentation: + - name: Azure Machine Learning installation docs + description: | + Monitor Azure Machine Learning by connecting Azure to New Relic. 
+ url: >- + https://docs.newrelic.com/docs/infrastructure/microsoft-azure-integrations/azure-integrations-list/azure-machine-learning-monitoring-integration/ +keywords: + - azure + - azure machine learning + - machine learning + - azure ml +dashboards: + - azure-machine-learning +dataSourceIds: + - azure-monitor diff --git a/quickstarts/azure/azure-machine-learning/logo.png b/quickstarts/azure/azure-machine-learning/logo.png new file mode 100644 index 0000000000..020f948ca9 Binary files /dev/null and b/quickstarts/azure/azure-machine-learning/logo.png differ