many changes and adding dashboard

deliveryhero · Aug 8, 2023 · aac2c85 · aac2c85
1 parent 550c880
commit aac2c85
Show file tree

Hide file tree

Showing 9 changed files with 276 additions and 218 deletions.
diff --git a/README.md b/README.md
@@ -1,14 +1,14 @@
 # terraform-aws-service-quota-alarms
 
-The modules in this repo will create CloudWatch alarms for all available, critical AWS service quotas limits, in all AWS regions.
+The modules in this repo will create CloudWatch alarms for all available, critical AWS service quotas limits.
 
 AWS service quotas can be monitored in 2 different CloudWatch namespaces:
 
-1. `AWS/TrustedAdvisor`: These metrics come from the [Trusted Advisor](https://aws.amazon.com/premiumsupport/technology/trusted-advisor/) service and are simply represent usage of the specific quota limit as a percentage. This metrics are availble for all regions but are only visible in the `us-east-1` region.
-2. `AWS/Usage`: There are many metrics in this namespace that are split by 3 different `metric_name`:
-   a. `CallCount`: Most of the metrics in this namespace are of this type and are about rate limits of specific API calls for each service
-   b. `ResourceCount`: These metrics are mostly about the count of certain resource types per service
-   c. `ThrottleCount`: A few specific throttling metrics only for the CloudWatch service
+* 1\. `AWS/TrustedAdvisor`: These metrics come from the [Trusted Advisor](https://aws.amazon.com/premiumsupport/technology/trusted-advisor/) service and simply represent usage of the specific quota limit as a percentage. These metrics are available for all regions but are only visible in the `us-east-1` region.
+* 2\. `AWS/Usage`: There are many metrics in this namespace that are split by 3 different `metric_name`:
+  * a) `CallCount`: Most of the metrics in this namespace are of this type and are about rate limits of specific API calls for each service
+  * b) `ResourceCount`: These metrics are mostly about the count of certain resource types per service
+  * c) `ThrottleCount`: A few specific throttling metrics only for the CloudWatch service
 
 This module will create alarms for all metrics from items 1 and 2b.
 
@@ -23,14 +23,13 @@ This repo includes 2 terraform modules:
 
 - [modules/trusted_advisor_alarms](modules/trusted_advisor_alarms): Creates alarms in the `AWS/TrustedAdvisor` namespace for quotas from multiple regions. This module should only be defined once in the `us-east-1` region.
 - [modules/usage_alarms](modules/usage_alarms): Creates alarms in the `AWS/Usage` namespace. This module needs to be defined for each region that is used.
+- [modules/dashboard](modules/dashboard): Creates a CloudWatch dashboard for all service quotas. This module should only be defined once in the `us-east-1` region.
 
 See [example](example) for a full example implementation of all modules, multiple regions and multiple terraform AWS providers.
 
 ## Challenges of measuring service quota usage
 
-Generally the implementation in AWS measuring service quota usage seems inconsistent. The metrics are split across 2 different CloudWatch namespaces, each measured in a different way. There is many services in the `AWS/Usage` CloudWatch namespace that do not support the `SERVICE_QUOTA` math function so measurement of usage against the current quota limit is not possible. And some metrics under 2b are not a count of resource, e.g. `NumberOfMessagesPublishedPerAccount` for SNS service which measures messages published per minute.
-
-Furthermore, there seems to be a bug with `ClassicLoadBalancersPerRegion` quota for the "Elastic Load Balancing" service where the quota usage is always measured against the default limit, not the actual limit.
+Generally the implementation in AWS measuring service quota usage seems inconsistent. The metrics are split across 2 different CloudWatch namespaces, each measured in a different way. There are many services in the `AWS/Usage` CloudWatch namespace that do not support the `SERVICE_QUOTA` math function so measurement of usage against the current quota limit is not possible. Some AWS services have metrics in both namespaces, e.g. `EC2`. And some metrics under 2b are not a count of resources, e.g. `NumberOfMessagesPublishedPerAccount` for SNS service which measures messages published per minute. Furthermore, there seems to be a bug with `Elastic Load Balancing/ClassicLoadBalancersPerRegion` quota where the quota usage is always measured against the default limit, not the actual limit. And there exist additional inconsistencies in the AWS Service Quota console where the utilization numbers do not match the provided CloudWatch dashboard panel, for example with `SNS/NumberOfMessagesPublishedPerAccount`.
 
 ## Further reading
 

diff --git a/example/main.tf b/example/main.tf
@@ -6,6 +6,15 @@ locals {
   ]
 }
 
+module "dashboard" { # CloudWatch dashboard module, fed the same local.regions list as trusted_advisor_alarms below
+  source  = "../modules/dashboard"
+  regions = local.regions
+
+  providers = {
+    aws = aws.us-east-1 # instantiated once, through the us-east-1 provider alias
+  }
+}
+
 module "trusted_advisor_alarms" {
   source  = "../modules/trusted_advisor_alarms"
   regions = local.regions
@@ -15,7 +24,6 @@ module "trusted_advisor_alarms" {
   }
 }
 
-
 module "usage_alarms_ap_southeast_1" {
   source = "../modules/usage_alarms"
 

diff --git a/modules/dashboard/main.tf b/modules/dashboard/main.tf
@@ -1,169 +1,24 @@
 locals {
-  trusted_advisor_service_limits = {
-    AutoScaling = [
-      "Auto Scaling groups",
-      "Launch configurations"
-    ]
-    CloudFormation = [
-      "Stacks"
-    ]
-    DynamoDB = [
-      "DynamoDB Read Capacity",
-      "DynamoDB Write Capacity"
-    ]
-    EBS = [
-      "Active snapshots",
-      "Cold HDD (sc1) volume storage (TiB)",
-      "General Purpose SSD (gp2) volume storage (TiB)",
-      "General Purpose SSD (gp3) volume storage",
-      "Magnetic (standard) volume storage (TiB)",
-      "Provisioned IOPS (SSD) storage (TiB)",
-      "Provisioned IOPS SSD (io2) Volume Storage",
-      "Provisioned IOPS",
-      "Throughput Optimized HDD (st1) volume storage (TiB)",
-    ]
-    EC2 = [
-      "Elastic IP addresses (EIPs)",
-      "On-Demand instances"
-    ]
-    ELB = [
-      "Active Application Load Balancers",
-      "Active Network Load Balancers",
-      "Active load balancers",
-    ]
-    Kinesis = [
-      "Shards per region"
-    ]
-    RDS = [
-      "Clusters",
-      "Cluster parameter groups",
-      "DB parameter groups",
-      "DB instances",
-      "Event subscriptions",
-      "RDS DB Manual Snapshots",
-      "Read replicas per master",
-      "Storage quota (GB)",
-      "Subnet groups",
-      "Subnets per subnet group",
-    ]
-    SES = [
-      "Daily sending quota"
-    ]
-    VPC = [
-      "EC2-VPC Elastic IP addresses (EIPs)",
-      "Internet gateways",
-      "VPCs",
-    ]
-  }
-
-  usage_service_limits = {
-    AutoScaling = {
-      None = ["NumberOfAutoScalingGroup"]
-    }
-    CloudWatch = {
-      None = ["InsightRule"]
-    }
-    DynamoDB = {
-      None = [
-        "AccountProvisionedWriteCapacityUnits",
-        "AccountProvisionedReadCapacityUnits",
-      ]
-    }
-    EC2 = {
-      "Standard/OnDemand" = ["vCPU"]
-      "Standard/Spot"     = ["vCPU"]
-    }
-    "Elastic Load Balancing" = {
-      None = [
-        "TargetGroupsPerApplicationLoadBalancer",
-        "ListenersPerApplicationLoadBalancer",
-        "TargetsPerTargetGroupPerRegion",
-        "TargetsPerAvailabilityZonePerNetworkLoadBalancer",
-        "TargetsPerApplicationLoadBalancer",
-        "ListenersPerClassicLoadBalancer",
-        "RoutingRulesPerApplicationLoadBalancer",
-        "RegisteredInstancesPerClassicLoadBalancer",
-        "TargetsPerNetworkLoadBalancer",
-        "ClassicLoadBalancersPerRegion",
-        "ListenersPerNetworkLoadBalancer",
-        "NetworkLoadBalancersENIsPerVPC",
-        "CertificatesPerApplicationLoadBalancer",
-        "TargetGroupsPerRegion",
-        "CertificatesPerNetworkLoadBalancer",
-        "ApplicationLoadBalancersPerRegion",
-        "NetworkLoadBalancersPerRegion",
-      ]
-    }
-    Firehose = {
-      None = ["DeliveryStreams"]
-    }
-    SNS = {
-      None = ["NumberOfMessagesPublishedPerAccount"]
+  usage_widget_header = { # text widget (24 wide, 2 high) emitted before local.usage_dashboard_widgets in the dashboard body
+    type   = "text"
+    width  = 24
+    height = 2
+    properties = {
+      "markdown" : "# Usage metrics \n### These metrics come from the `AWS/Usage` namespace [here](https://us-east-1.console.aws.amazon.com/cloudwatch/home?region=us-east-1#metricsV2?graph=~()&query=~'*7bAWS*2fUsage*2cClass*2cResource*2cService*2cType*7d*20AWS*2fUsage*20MetricName*3dResourceCount) \n"
     }
   }
 
-  metrics_normalized_all = flatten([
-    for region in var.regions : [
-      for service_name, data in local.usage_service_limits : [
-        for class, limits in data : [
-          for resource in limits : {
-            class        = class
-            resource     = resource
-            region       = region
-            service_name = service_name
-            id           = replace(replace(lower(replace(join("", [service_name, class, resource]), "-", "")), " ", ""), "/", "")
-            label        = format("%s (%s): %s", service_name, class, resource)
-          }
-        ]
-      ]
-    ]
-  ])
-
-  metrics_normalized_service_region = {
-    for service_name, data in local.usage_service_limits : service_name => {
-      for region in var.regions : region => [for metric in local.metrics_normalized_all : metric if metric.region == region && metric.service_name == service_name]
+  trusted_advisor_widget_header = { # text widget (24 wide, 2 high) emitted before local.trusted_advisor_dashboard_widgets in the dashboard body
+    type   = "text"
+    width  = 24
+    height = 2
+    properties = {
+      "markdown" : "# TrustedAdvisor metrics \n### These metrics come from the `AWS/TrustedAdvisor` namespace [here](https://us-east-1.console.aws.amazon.com/cloudwatch/home?region=us-east-1#metricsV2?graph=~()&query=~'*7bAWS*2fTrustedAdvisor*2cRegion*2cServiceLimit*2cServiceName*7d*20MetricName*3dServiceLimitUsage) \n"
     }
   }
-
-  dashboard_widgets = flatten([
-    for service_name, region_data in local.metrics_normalized_service_region : [
-      for region, metrics in region_data : [
-        {
-          type = "metric"
-          properties = {
-            stat   = "Sum"
-            region = region
-            period = 300
-            view   = "timeSeries"
-            title  = format("%s: %s", service_name, region)
-            yAxis = {
-              left = {
-                label     = "Quota usage percentage"
-                max       = 100
-                min       = 0
-                showUnits = false
-              }
-            }
-            metrics = concat([
-              for metric in metrics : flatten([
-                [
-                  "AWS/Usage", "ResourceCount", "Class", metric["class"], "Resource", metric["resource"], "Service", metric["service_name"], "Type", "Resource",
-                  { id = metric["id"], region = metric["region"], visible = false }
-                ]
-              ])
-              ],
-              [for metric in metrics : [
-                { expression = "(${metric.id}/SERVICE_QUOTA(${metric.id}))*100", label = metric["label"], region = metric["region"] }
-              ]]
-            )
-          }
-        }
-      ]
-    ]
-  ])
 }
 
 resource "aws_cloudwatch_dashboard" "main" {
   dashboard_name = "ServiceQuotaUsage"
-  dashboard_body = jsonencode({ widgets = local.dashboard_widgets })
+  dashboard_body = jsonencode({ widgets = concat([local.usage_widget_header], local.usage_dashboard_widgets, [local.trusted_advisor_widget_header], local.trusted_advisor_dashboard_widgets) }) # widget order: Usage header, Usage widgets, TrustedAdvisor header, TrustedAdvisor widgets; usage_dashboard_widgets is not defined in this file
 }
diff --git a/modules/dashboard/trusted_advisor.tf b/modules/dashboard/trusted_advisor.tf
@@ -0,0 +1,115 @@
+locals {
+  trusted_advisor_service_limits = { # service name => limit names; used below as the ServiceName and ServiceLimit dimension values
+    AutoScaling = [
+      "Auto Scaling groups",
+      "Launch configurations"
+    ]
+    CloudFormation = [
+      "Stacks"
+    ]
+    DynamoDB = [
+      "DynamoDB Read Capacity",
+      "DynamoDB Write Capacity"
+    ]
+    EBS = [
+      "Active snapshots",
+      "Cold HDD (sc1) volume storage (TiB)",
+      "General Purpose SSD (gp2) volume storage (TiB)",
+      "General Purpose SSD (gp3) volume storage",
+      "Magnetic (standard) volume storage (TiB)",
+      "Provisioned IOPS (SSD) storage (TiB)",
+      "Provisioned IOPS SSD (io2) Volume Storage",
+      "Provisioned IOPS",
+      "Throughput Optimized HDD (st1) volume storage (TiB)",
+    ]
+    EC2 = [
+      "Elastic IP addresses (EIPs)",
+      "On-Demand instances"
+    ]
+    ELB = [
+      "Active Application Load Balancers",
+      "Active Network Load Balancers",
+      "Active load balancers",
+    ]
+    Kinesis = [
+      "Shards per region"
+    ]
+    RDS = [
+      "Clusters",
+      "Cluster parameter groups",
+      "DB parameter groups",
+      "DB instances",
+      "Event subscriptions",
+      "RDS DB Manual Snapshots",
+      "Read replicas per master",
+      "Storage quota (GB)",
+      "Subnet groups",
+      "Subnets per subnet group",
+    ]
+    SES = [
+      "Daily sending quota"
+    ]
+    VPC = [
+      "EC2-VPC Elastic IP addresses (EIPs)",
+      "Internet gateways",
+      "VPCs",
+    ]
+  }
+
+  trusted_advisor_metrics_normalized_all = flatten([ # one flat record per (region, service, limit) combination
+    for region in var.regions : [
+      for service_name, limits in local.trusted_advisor_service_limits : [
+        for resource in limits : {
+          resource     = resource
+          region       = region
+          service_name = service_name
+          id           = lower(replace(format("%s%s", service_name, resource), "/[\\W_]+/", "")) # service+limit with every run of non-word chars and underscores removed, then lowercased; used as the metric id
+          label        = format("%s: %s", service_name, resource)
+        }
+      ]
+    ]
+  ])
+
+  trusted_advisor_metrics_normalized_service_region = { # regroups the flat records as service_name => region => [records]
+    for service_name, limits in local.trusted_advisor_service_limits : service_name => {
+      for region in var.regions : region => [for metric in local.trusted_advisor_metrics_normalized_all : metric if metric.region == region && metric.service_name == service_name]
+    }
+  }
+
+  trusted_advisor_dashboard_widgets = flatten([ # one timeSeries metric widget per (service, region) pair
+    for service_name, region_data in local.trusted_advisor_metrics_normalized_service_region : [
+      for region, metrics in region_data : [
+        {
+          type = "metric"
+          properties = {
+            stat   = "Sum"
+            region = "us-east-1" # widget region is fixed; the quota's own region is passed through the Region dimension below
+            period = 300
+            view   = "timeSeries"
+            title  = format("%s: %s", service_name, region)
+            yAxis = {
+              left = {
+                label     = "Quota usage percentage"
+                min       = 0
+                max       = 100
+                showUnits = false
+              }
+            }
+            metrics = concat([ # first the raw metrics, then one expression per raw metric
+              for metric in metrics : flatten([
+                [
+                  "AWS/TrustedAdvisor", "ServiceLimitUsage", "ServiceName", metric["service_name"], "ServiceLimit", metric["resource"], "Region", metric["region"],
+                  { id = metric["id"], visible = false } # raw series is hidden and only referenced by id from the expression
+                ]
+              ])
+              ],
+              [for metric in metrics : [
+                { expression = "${metric.id}*100", label = metric["label"] } # NOTE(review): presumably the raw value is a 0-1 ratio scaled to the 0-100 axis; confirm
+              ]]
+            )
+          }
+        }
+      ]
+    ]
+  ])
+}