diff --git a/alert-policies/linkerd/ExcessTCPConnections.yml b/alert-policies/linkerd/ExcessTCPConnections.yml new file mode 100644 index 0000000000..a7ed227592 --- /dev/null +++ b/alert-policies/linkerd/ExcessTCPConnections.yml @@ -0,0 +1,40 @@ +# Name of the alert +name: Excess TCP Connections + +# Description and details +description: |+ + This alert is triggered when the number of open TCP connections exceeds 150 (warning) or 180 (critical) for 5 minutes. +# Type of alert +type: STATIC + +# NRQL query +nrql: + + query: "SELECT max(tcp_open_connections) FROM Metric" + +# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE) +valueFunction: SINGLE_VALUE + +# List of Critical and Warning thresholds for the condition +terms: + - priority: CRITICAL + # Operator used to compare against the threshold. + operator: ABOVE + # Value that triggers a violation + threshold: 180 + # Time in seconds; 120 - 3600 + thresholdDuration: 300 + # How many data points must be in violation for the duration + thresholdOccurrences: ALL + - priority: WARNING + # Operator used to compare against the threshold. + operator: ABOVE + # Value that triggers a violation + threshold: 150 + # Time in seconds; 120 - 3600, must be a multiple of 60 for Baseline conditions + thresholdDuration: 300 + # How many data points must be in violation for the duration + thresholdOccurrences: ALL +# Duration after which a violation automatically closes +# Time in seconds; 300 - 2592000 (Default: 86400 [1 day]) +violationTimeLimitSeconds: 86400 \ No newline at end of file diff --git a/alert-policies/linkerd/HighResponseLatency.yml b/alert-policies/linkerd/HighResponseLatency.yml new file mode 100644 index 0000000000..84a2aa4ea8 --- /dev/null +++ b/alert-policies/linkerd/HighResponseLatency.yml @@ -0,0 +1,41 @@ +# Name of the alert +name: Response Latency Bucket + +# Description and details +description: |+ + This alert is triggered when the 95th-percentile response latency exceeds 150 ms (warning) or 200 ms (critical) for 5 minutes. 
+# Type of alert +type: STATIC + +# NRQL query +nrql: + + query: "SELECT percentile(`response_latency_ms_bucket`, 95) FROM Metric" + +# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE) +valueFunction: SINGLE_VALUE + +# List of Critical and Warning thresholds for the condition +terms: + - priority: CRITICAL + # Operator used to compare against the threshold. + operator: ABOVE + # Value that triggers a violation + threshold: 200 + # Time in seconds; 120 - 3600 + thresholdDuration: 300 + # How many data points must be in violation for the duration + thresholdOccurrences: ALL + - priority: WARNING + # Operator used to compare against the threshold. + operator: ABOVE + # Value that triggers a violation + threshold: 150 + # Time in seconds; 120 - 3600, must be a multiple of 60 for Baseline conditions + thresholdDuration: 300 + # How many data points must be in violation for the duration + thresholdOccurrences: ALL + +# Duration after which a violation automatically closes +# Time in seconds; 300 - 2592000 (Default: 86400 [1 day]) +violationTimeLimitSeconds: 86400 \ No newline at end of file diff --git a/alert-policies/linkerd/ProcessThreadsCountAlert.yml b/alert-policies/linkerd/ProcessThreadsCountAlert.yml new file mode 100644 index 0000000000..ac39dbcaca --- /dev/null +++ b/alert-policies/linkerd/ProcessThreadsCountAlert.yml @@ -0,0 +1,41 @@ +# Name of the alert +name: Process Threads Count + +# Description and details +description: |+ + This alert is triggered when the number of process threads exceeds 4 for 5 minutes. 
+# Type of alert +type: STATIC + +# NRQL query +nrql: + + query: "SELECT average(`process_threads`) AS `Thread Count Spike` FROM Metric" + +# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE) +valueFunction: SINGLE_VALUE + +# List of Critical and Warning thresholds for the condition +terms: + - priority: CRITICAL + # Operator used to compare against the threshold. + operator: ABOVE + # Value that triggers a violation + threshold: 6 + # Time in seconds; 120 - 3600 + thresholdDuration: 300 + # How many data points must be in violation for the duration + thresholdOccurrences: ALL + - priority: WARNING + # Operator used to compare against the threshold. + operator: ABOVE + # Value that triggers a violation + threshold: 4 + # Time in seconds; 120 - 3600, must be a multiple of 60 for Baseline conditions + thresholdDuration: 300 + # How many data points must be in violation for the duration + thresholdOccurrences: ALL + +# Duration after which a violation automatically closes +# Time in seconds; 300 - 2592000 (Default: 86400 [1 day]) +violationTimeLimitSeconds: 86400 \ No newline at end of file diff --git a/dashboards/linkerd/linkerd.json b/dashboards/linkerd/linkerd.json new file mode 100644 index 0000000000..ba554ea14b --- /dev/null +++ b/dashboards/linkerd/linkerd.json @@ -0,0 +1,483 @@ +{ + "name": "Linkerd", + "description": null, + "pages": [ + { + "name": "Overview", + "description": null, + "widgets": [ + { + "title": "", + "layout": { + "column": 1, + "row": 1, + "width": 2, + "height": 1 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.markdown" + }, + "rawConfiguration": { + "text": "![Linkerd-Logo](https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcR2eK2bt7sUsWysMVE7rGyz9sbYJF2eZvmstg&s)" + } + }, + { + "title": "TCP Connections", + "layout": { + "column": 3, + "row": 1, + "width": 4, + "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.billboard" 
+ }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "SELECT max(tcp_open_connections), sum(tcp_read_bytes_total), sum(tcp_write_bytes_total) FROM Metric" + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + }, + { + "title": "Control Balancer Queue Requests", + "layout": { + "column": 7, + "row": 1, + "width": 6, + "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.area" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "legend": { + "enabled": true + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "SELECT latest(control_identity_balancer_queue_requests_total), latest(control_destination_balancer_queue_requests_total) , latest(control_policy_balancer_queue_requests_total) FROM Metric TIMESERIES" + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + }, + { + "title": "", + "layout": { + "column": 1, + "row": 2, + "width": 2, + "height": 2 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.markdown" + }, + "rawConfiguration": { + "text": "**About**\n\nInstrument your application with New Relic - [Add Data](https://one.newrelic.com/).\n\nInstrument Linkerd with New Relic using the [documentation](https://docs.newrelic.com/docs/infrastructure/host-integrations/host-integrations-list/linkerd-integration/)\n\n[Please rate this dashboard](https://docs.google.com/forms/d/e/1FAIpQLSclR38J8WbbB2J1tHnllKUkzWZkJhf4SrJGyavpMd4t82NjnQ/viewform?usp=pp_url&entry.1615922415=Linkerd) here and let us know how we can improve it for you." 
+ } + }, + { + "title": "Process Uptime (seconds)", + "layout": { + "column": 1, + "row": 4, + "width": 4, + "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.area" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "legend": { + "enabled": true + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "SELECT latest(process_uptime_seconds_total) FROM Metric TIMESERIES " + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + }, + { + "title": "Process CPU (seconds)", + "layout": { + "column": 5, + "row": 4, + "width": 4, + "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.area" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "legend": { + "enabled": true + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "SELECT latest(process_cpu_seconds_total) FROM Metric TIMESERIES" + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + }, + { + "title": "Resource usage", + "layout": { + "column": 9, + "row": 4, + "width": 4, + "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.area" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "legend": { + "enabled": true + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "SELECT sum(process_cpu_seconds_total), latest(process_virtual_memory_bytes)/(1024*1024) as 'Memory in MB' FROM Metric TIMESERIES " + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + } + ] + }, + { + "name": "Linkerd Metrics", + "description": null, + "widgets": [ + { + "title": "HTTP Traffic Authorization", + "layout": { + "column": 1, + "row": 1, + "width": 4, + "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.line" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "legend": { + "enabled": true + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "SELECT 
sum(inbound_http_authz_allow_total) FROM Metric TIMESERIES " + } + ], + "platformOptions": { + "ignoreTimeRange": false + }, + "thresholds": { + "isLabelVisible": true + }, + "yAxisLeft": { + "zero": true + }, + "yAxisRight": { + "zero": true + } + } + }, + { + "title": "Total number of requests", + "layout": { + "column": 5, + "row": 1, + "width": 4, + "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.billboard" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "SELECT sum(request_total)as 'Total requests' FROM Metric " + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + }, + { + "title": "Latency (milliseconds)", + "layout": { + "column": 9, + "row": 1, + "width": 4, + "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.stacked-bar" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "legend": { + "enabled": true + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "select average(response_latency_ms_bucket) FROM Metric TIMESERIES \n" + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + }, + { + "title": "Control Response Latency Bucket", + "layout": { + "column": 1, + "row": 4, + "width": 4, + "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.histogram" + }, + "rawConfiguration": { + "legend": { + "enabled": true + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "SELECT histogram(`control_response_latency_ms_bucket`) FROM Metric" + } + ], + "platformOptions": { + "ignoreTimeRange": false + }, + "yAxisLeft": { + "zero": true + } + } + }, + { + "title": "Stack Operation Stats", + "layout": { + "column": 5, + "row": 4, + "width": 4, + "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.billboard" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "nrqlQueries": [ + { + 
"accountIds": [], + "query": "SELECT sum(`stack_poll_total`), sum(`stack_poll_total_ms`), sum(`stack_create_total`), sum(`stack_drop_total`) FROM Metric" + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + }, + { + "title": "Response Info", + "layout": { + "column": 9, + "row": 4, + "width": 4, + "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.line" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "legend": { + "enabled": true + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "SELECT sum(`response_total`), sum(`response_latency_ms_sum`) FROM Metric TIMESERIES " + } + ], + "platformOptions": { + "ignoreTimeRange": false + }, + "thresholds": { + "isLabelVisible": true + }, + "yAxisLeft": { + "zero": true + }, + "yAxisRight": { + "zero": true + } + } + }, + { + "title": "Proxy Build Info", + "layout": { + "column": 1, + "row": 7, + "width": 4, + "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.billboard" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "SELECT uniqueCount(`proxy_build_info`) AS `Proxy Build Info` FROM Metric" + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + }, + { + "title": "Process Threads", + "layout": { + "column": 5, + "row": 7, + "width": 4, + "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.billboard" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "SELECT average(`process_threads`) FROM Metric" + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + }, + { + "title": "OpenCensus Span Tracking", + "layout": { + "column": 9, + "row": 7, + "width": 4, + "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.line" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + 
"legend": { + "enabled": true + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "SELECT sum(`opencensus_span_exports`), sum(`opencensus_span_export_requests`), sum(`opencensus_span_export_streams`) FROM Metric TIMESERIES " + } + ], + "platformOptions": { + "ignoreTimeRange": false + }, + "thresholds": { + "isLabelVisible": true + }, + "yAxisLeft": { + "zero": true + }, + "yAxisRight": { + "zero": true + } + } + } + ] + } + ], + "variables": [] + } \ No newline at end of file diff --git a/dashboards/linkerd/linkerd01.png b/dashboards/linkerd/linkerd01.png new file mode 100644 index 0000000000..9209041518 Binary files /dev/null and b/dashboards/linkerd/linkerd01.png differ diff --git a/dashboards/linkerd/linkerd02.png b/dashboards/linkerd/linkerd02.png new file mode 100644 index 0000000000..0f19452bee Binary files /dev/null and b/dashboards/linkerd/linkerd02.png differ diff --git a/data-sources/linkerd/config.yml b/data-sources/linkerd/config.yml new file mode 100644 index 0000000000..d716510561 --- /dev/null +++ b/data-sources/linkerd/config.yml @@ -0,0 +1,11 @@ +id: linkerd +displayName: Linkerd +description: | + Enhance the performance monitoring and instrumentation of your Linkerd by integrating New Relic. +install: + primary: + link: + url: https://docs.newrelic.com/docs/infrastructure/host-integrations/host-integrations-list/linkerd-integration/ +keywords: + - linkerd +icon: logo.png diff --git a/data-sources/linkerd/logo.png b/data-sources/linkerd/logo.png new file mode 100644 index 0000000000..c19d0f195e Binary files /dev/null and b/data-sources/linkerd/logo.png differ diff --git a/quickstarts/linkerd/config.yml b/quickstarts/linkerd/config.yml new file mode 100644 index 0000000000..c561f59099 --- /dev/null +++ b/quickstarts/linkerd/config.yml @@ -0,0 +1,38 @@ +slug: linkerd +description: | + ## Why monitor your Linkerd? 
+ Monitoring Linkerd with New Relic provides a robust framework for ensuring the reliability, performance, and security of your microservices architecture. + + ## Comprehensive monitoring quickstart for Linkerd + Integrating Linkerd with New Relic provides powerful observability and monitoring capabilities, ensuring your microservices run smoothly and efficiently. This enables you to monitor your service mesh more effectively, reducing troubleshooting time and allowing you to focus on developing new features and enhancing your applications. + + ## What’s included in the Linkerd quickstart? + The New Relic Linkerd monitoring quickstart provides quality out-of-the-box reporting: + + - Dashboards (total requests, total responses, and open TCP connections) + - Alerts (excess TCP connections, high response latency, and process thread count) + +summary: | + Gain valuable insights into your Linkerd deployment with New Relic's monitoring and analysis capabilities. +icon: logo.png +level: New Relic +authors: + - New Relic + - Shivani Kashyap +title: Linkerd +documentation: + - name: Linkerd integration documentation + description: | + Enhance the performance monitoring and instrumentation of your Linkerd by integrating New Relic. + url: >- + https://docs.newrelic.com/docs/infrastructure/host-integrations/host-integrations-list/linkerd-integration/ +keywords: + - linkerd + - NR1_addData + - NR1_sys +dashboards: + - linkerd +alertPolicies: + - linkerd +dataSourceIds: + - linkerd \ No newline at end of file diff --git a/quickstarts/linkerd/logo.png b/quickstarts/linkerd/logo.png new file mode 100644 index 0000000000..c19d0f195e Binary files /dev/null and b/quickstarts/linkerd/logo.png differ