Skip to content

Commit

Permalink
Merge branch 'release' into hcpvault
Browse files Browse the repository at this point in the history
  • Loading branch information
sarahkitten authored Jan 11, 2024
2 parents 6e5bf98 + bc9dce2 commit a4f6d0d
Show file tree
Hide file tree
Showing 57 changed files with 4,858 additions and 2,497 deletions.
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -15,4 +15,6 @@ snapshots/

# yarn
yarn.lock
.yarn-integrity
.yarn-integrity
yarn-error.log
utils/yarn-error.log
2 changes: 1 addition & 1 deletion alert-policies/browser/AjaxTimetoSettle.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ type: BASELINE
# NRQL query
nrql:
# Baseline alerts can use an optional FACET
query: "SELECT average(timeToSettle) FROM AjaxRequest WHERE appName like '%' FACET appName TIMESERIES"
query: "SELECT average(timeToSettle) FROM AjaxRequest WHERE appName like '%' FACET appName"

# Direction in which baseline is set (Default: LOWER_ONLY)
baselineDirection: UPPER_ONLY
Expand Down
2 changes: 1 addition & 1 deletion alert-policies/langchain-vectordb/APIResponseTime.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ type: STATIC

# NRQL query
nrql:
query: "FROM LlmVectorSearch SELECT latest(response_time) WHERE applicationName IN ('LangChain observability trace') TIMESERIES"
query: "FROM LlmVectorSearch SELECT latest(response_time) WHERE applicationName IN ('LangChain observability trace')"

# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE)
valueFunction: SINGLE_VALUE
Expand Down
2 changes: 1 addition & 1 deletion alert-policies/llm-application/APIResponseTime.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ type: STATIC

# NRQL query
nrql:
query: "FROM LlmChatCompletionSummary, LlmCompletion, LlmEmbedding SELECT percentile(response_time, 95) TIMESERIES FACET entity.guid"
query: "FROM LlmChatCompletionSummary, LlmCompletion, LlmEmbedding SELECT percentile(response_time, 95) FACET entity.guid"

# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE)
valueFunction: SINGLE_VALUE
Expand Down
27 changes: 27 additions & 0 deletions alert-policies/mariadb/innodb-pending-reads-and-writes.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Alert condition: backlog of pending InnoDB reads and writes.
name: InnoDB Pending Reads and Writes

description: |+
  This alert is triggered when the aggregate number of pending reads and writes in the MySQL buffer pool is greater than 2 for 5 minutes, which indicates the database engine is backlogged and waiting on resources.
type: STATIC
nrql:
  # max(pending reads) + max(pending writes), evaluated separately per displayName facet
  query: 'FROM MysqlSample SELECT max(db.innodb.dataPendingReads) + max(db.innodb.dataPendingWrites) FACET displayName'

# How the NRQL result is reduced before it is compared with terms.threshold (Default: SINGLE_VALUE)
valueFunction: SINGLE_VALUE

# Critical and Warning thresholds for this condition (only Critical is defined here)
terms:
  - priority: CRITICAL
    # Comparison applied between the query value and the threshold
    operator: ABOVE
    # A value above 2 counts as a violation
    threshold: 2
    # Window length in seconds (allowed: 120 - 3600); 300 = the "5 minutes" in the description
    thresholdDuration: 300
    # Every data point in the window must be in violation
    thresholdOccurrences: ALL

# Seconds after which an open violation is closed automatically
# Allowed range: 300 - 2592000 (Default: 86400 [1 day])
violationTimeLimitSeconds: 86400
29 changes: 29 additions & 0 deletions alert-policies/mariadb/max-connection-errors-per-second.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Alert condition: connections refused because max_connections was reached.
name: Max Connection Errors per Second

description: |+
  This alert is triggered when there is at least one error against the max_connections limit in a 5 minute window, which indicates you have requests to your MariaDB instance that are failing to connect.
  This setting's default is 151, but can vary based on the underlying resources available to your instance. You can review your current max_connections limit with this query:
  SHOW VARIABLES LIKE 'max_connections';
type: STATIC
nrql:
  # Highest per-second rate of max_connections errors, evaluated separately per displayName facet
  query: "FROM MysqlSample SELECT max(net.connectionErrorsMaxConnectionsPerSecond) FACET displayName"

# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE)
valueFunction: SINGLE_VALUE

# List of Critical and Warning thresholds for the condition
terms:
  - priority: CRITICAL
    # Operator used to compare against the threshold.
    operator: ABOVE
    # Value that triggers a violation.
    # The metric is a per-second rate, so "at least one error" (see description)
    # means any rate above 0. ABOVE 1 would only fire on more than one error
    # per second and would miss isolated connection failures.
    threshold: 0
    # Time in seconds; 120 - 3600
    thresholdDuration: 300
    # A single violating data point within the window is enough
    thresholdOccurrences: AT_LEAST_ONCE

# Duration after which a violation automatically closes
# Time in seconds; 300 - 2592000 (Default: 86400 [1 day])
violationTimeLimitSeconds: 86400
59 changes: 59 additions & 0 deletions alert-policies/mariadb/questions-per-second.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
# Alert condition: Questions rate deviating upward from its learned baseline.
name: Questions per Second

description: |+
  This alert is triggered when the current rate of Questions is greater than 2 standard deviations above the baseline for 120s, which could be an early indicator of a saturation problem for your instance.
  It is important to note that this alert is disabled by default and you need to edit the configuration in New Relic One to add a targeted MySQL instance:
  "WHERE displayName = 'MySql Instance Name'"
  This allows the baseline to be calculated against a single instance instead of all running MySQL instances being monitored.
type: BASELINE
nrql:
  # No FACET here: the baseline is meant to be calculated for one instance,
  # selected by the WHERE clause the description asks you to add.
  query: "FROM MysqlSample SELECT average(query.questionsPerSecond)"

# Direction in which baseline is set (Default: LOWER_ONLY)
baselineDirection: UPPER_ONLY

# List of Critical and Warning thresholds for the condition
terms:
  - priority: CRITICAL
    # Operator used to compare against the threshold.
    operator: ABOVE
    # Value that triggers a violation (standard deviations from the baseline, per the description)
    threshold: 2
    # Time in seconds; 120 - 3600, must be a multiple of 60 for Baseline conditions
    thresholdDuration: 120
    # How many data points must be in violation for the duration
    thresholdOccurrences: ALL

  # Adding a Warning threshold is optional
  - priority: WARNING
    operator: ABOVE
    threshold: 1
    thresholdDuration: 300
    thresholdOccurrences: ALL

# Loss of Signal Settings
expiration:
  # Close open violations if signal is lost (Default: false)
  closeViolationsOnExpiration: false
  # Open "Loss of Signal" violation if signal is lost (Default: false)
  openViolationOnExpiration: false
  # Time in seconds; Max value: 172800 (48hrs), null if closeViolationsOnExpiration and openViolationOnExpiration are both 'false'
  expirationDuration: null

# Advanced Signal Settings
signal:
  # Max Value for Baseline conditions = 20
  evaluationOffset: 3
  # Type of value that should be used to fill gaps
  fillOption: NONE
  # Integer; Used in conjunction with STATIC fillOption, otherwise null
  fillValue: null

# OPTIONAL: URL of runbook to be sent with notification
runbookUrl: null

# Duration after which a violation automatically closes
# Time in seconds; 300 - 2592000 (Default: 86400 [1 day])
violationTimeLimitSeconds: 86400
29 changes: 29 additions & 0 deletions alert-policies/mariadb/slow-queries-per-second.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Alert condition: sustained rate of slow queries.
name: Slow Queries per Second

description: |+
  This alert is triggered when the number of slow queries per second is greater than 5 for 5 minutes, which could indicate capacity issues or a query that has been changed and is experiencing performance issues.
  The Slow_queries counter increments based on your settings applied to MySQL's long_query_time parameter (default 10s), which you can review with this query:
  SHOW VARIABLES LIKE 'long_query_time';
type: STATIC
nrql:
  # Average slow-query rate, evaluated separately per displayName facet
  query: 'FROM MysqlSample SELECT average(query.slowQueriesPerSecond) FACET displayName'

# How the NRQL result is reduced before it is compared with terms.threshold (Default: SINGLE_VALUE)
valueFunction: SINGLE_VALUE

# Critical and Warning thresholds for this condition (only Critical is defined here)
terms:
  - priority: CRITICAL
    # Comparison applied between the query value and the threshold
    operator: ABOVE
    # A value above 5 counts as a violation
    threshold: 5
    # Window length in seconds (allowed: 120 - 3600); 300 = the "5 minutes" in the description
    thresholdDuration: 300
    # Every data point in the window must be in violation
    thresholdOccurrences: ALL

# Seconds after which an open violation is closed automatically
# Allowed range: 300 - 2592000 (Default: 86400 [1 day])
violationTimeLimitSeconds: 86400

This file was deleted.

This file was deleted.

This file was deleted.

This file was deleted.

Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
name: Scrape Errors
name: CPU Usage

description: |+
This alert is triggered when the cache size limit is reached above 4 for at least 5 minutes.
This alert is triggered when the CPU Usage exceeds 90% for at least 5 minutes.
type: STATIC
nrql:
query: "FROM Metric SELECT sum(nextcloud_scrape_errors_total) As 'Scrape Errors'"
query: "SELECT average(host.cpuPercent) AS '(%) used CPU' FROM Metric"

# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE).
valueFunction: SINGLE_VALUE
Expand All @@ -16,7 +16,7 @@ terms:
# Operator used to compare against the threshold.
operator: ABOVE
# Value that triggers a violation.
threshold: 4
threshold: 90
# Time in seconds; 120 - 3600
thresholdDuration: 300
# How many data points must be in violation for the duration?
Expand All @@ -26,7 +26,7 @@ terms:
# Operator used to compare against the threshold.
operator: ABOVE
# Value that triggers a violation.
threshold: 1
threshold: 80
# Time in seconds; 120 - 3600
thresholdDuration: 300
# How many data points must be in violation for the duration?
Expand Down
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
name: Heap Memory Usage
name: Memory Usage

description: |+
This alert is triggered when the Heap Memory Usage is below 2 for at least 5 minutes.
This alert is triggered when the Memory Usage exceeds 90% for at least 5 minutes.
type: STATIC
nrql:
query: "FROM Metric select latest(go_memstats_heap_inuse_bytes)/1e+6 As 'Heap Memory Usage'"
query: "FROM NextcloudFlexMetrics select (latest(ocs.data.server.php.opcache.memory_usage.used_memory) / 1e+6) / (latest(ocs.data.server.php.opcache.memory_usage.free_memory) / 1e+6) * 100 AS '(%) used memory'"

# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE).
valueFunction: SINGLE_VALUE
Expand All @@ -14,19 +14,19 @@ valueFunction: SINGLE_VALUE
terms:
- priority: CRITICAL
# Operator used to compare against the threshold.
operator: BELOW
operator: ABOVE
# Value that triggers a violation.
threshold: 2
threshold: 90
# Time in seconds; 120 - 3600
thresholdDuration: 300
# How many data points must be in violation for the duration?
thresholdOccurrences: ALL

- priority: WARNING
# Operator used to compare against the threshold.
operator: BELOW
operator: ABOVE
# Value that triggers a violation.
threshold: 4
threshold: 80
# Time in seconds; 120 - 3600
thresholdDuration: 300
# How many data points must be in violation for the duration?
Expand Down
Original file line number Diff line number Diff line change
@@ -1,16 +1,16 @@
# Name of the alert
name: SMTPD Connection Lost (%)
name: Bounced Emails

# Description and details
description: |+
This alert is triggered when the lost connections exceed 5%.
This alert is triggered when bounced emails exceed 5.
# Type of alert
type: STATIC

# NRQL query
nrql:
query: "SELECT (latest(PostfixSMTPDconnectionLost.value) * 100) / (SELECT latest(value) FROM PostfixSMTPDconnection) AS '(%) connection lost' FROM PostfixSMTPDconnectionLost"
query: "SELECT latest(bounced) As 'Bounced - failed emails' FROM `unixMonitor:postfix_messagesTotalRecords`"

# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE)
valueFunction: SINGLE_VALUE
Expand Down
Original file line number Diff line number Diff line change
@@ -1,16 +1,16 @@
# Name of the alert
name: SMTPD Connection Timed Out (%)
name: SMTP Authentication Fails

# Description and details
description: |+
This alert is triggered when the connection timeout exceeds 5%.
This alert is triggered when SMTP authentication fails exceed 5.
# Type of alert
type: STATIC

# NRQL query
nrql:
query: "SELECT (latest(PostfixSMTPDtimedOut.value) * 100) / (SELECT latest(value) FROM PostfixSMTPDconnection) AS '(%) connection timed out' FROM PostfixSMTPDtimedOut"
query: "SELECT latest(smtp_authfailed_count) FROM `unixMonitor:postfix_smtp_authFailed_today`"

# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE)
valueFunction: SINGLE_VALUE
Expand Down
Loading

0 comments on commit a4f6d0d

Please sign in to comment.