Skip to content

Commit

Permalink
Merge pull request #2245 from newrelic/release
Browse files Browse the repository at this point in the history
Release - 01/18/2024
  • Loading branch information
zstix authored Jan 18, 2024
2 parents 55aa613 + 27fa8a2 commit aa2aebc
Show file tree
Hide file tree
Showing 168 changed files with 2,422 additions and 417 deletions.
40 changes: 0 additions & 40 deletions .github/workflows/pr-open.yml

This file was deleted.

34 changes: 34 additions & 0 deletions alert-policies/amazon-backup/HighBackupJobFailure.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# Alert condition on failed AWS Backup jobs.
name: High Backup Job Failure

description: |+
  This alert is triggered if the number of Backup Job Failure exceeds 10 for 10 minutes.
type: STATIC
nrql:
  query: "SELECT sum(`aws.backup.NumberOfBackupJobsFailed`) as 'Query' FROM Metric"

# How the query result is reduced before comparison with terms.threshold (Default: SINGLE_VALUE)
valueFunction: SINGLE_VALUE

# Thresholds evaluated for this condition
terms:
  # Critical: value above 10 on every data point across 600 seconds
  - priority: CRITICAL
    operator: ABOVE
    threshold: 10
    # Seconds; allowed range 120 - 3600
    thresholdDuration: 600
    thresholdOccurrences: ALL

  # Warning (optional): same window, lower threshold of 5
  - priority: WARNING
    operator: ABOVE
    threshold: 5
    thresholdDuration: 600
    thresholdOccurrences: ALL

# Seconds after which an open violation closes automatically
# Allowed range 300 - 2592000 (Default: 86400 [1 day])
violationTimeLimitSeconds: 86400
34 changes: 34 additions & 0 deletions alert-policies/amazon-backup/HighCopyJobsFailure.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# Alert condition on failed AWS Backup copy jobs.
name: High Copy Job Failure

description: |+
  This alert is triggered if the number of Copy Job Failure exceeds 10 for 10 minutes.
type: STATIC
nrql:
  query: "SELECT sum(`aws.backup.NumberOfCopyJobsFailed`) as 'Query' FROM Metric"

# How the query result is reduced before comparison with terms.threshold (Default: SINGLE_VALUE)
valueFunction: SINGLE_VALUE

# Thresholds evaluated for this condition
terms:
  # Critical: value above 10 on every data point across 600 seconds
  - priority: CRITICAL
    operator: ABOVE
    threshold: 10
    # Seconds; allowed range 120 - 3600
    thresholdDuration: 600
    thresholdOccurrences: ALL

  # Warning (optional): same window, lower threshold of 5
  - priority: WARNING
    operator: ABOVE
    threshold: 5
    thresholdDuration: 600
    thresholdOccurrences: ALL

# Seconds after which an open violation closes automatically
# Allowed range 300 - 2592000 (Default: 86400 [1 day])
violationTimeLimitSeconds: 86400
34 changes: 34 additions & 0 deletions alert-policies/amazon-backup/HighRestoreJobsFailure.yml.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# Alert condition on failed AWS Backup restore jobs.
name: High Restore Job Failure

description: |+
  This alert is triggered if the number of Restore Job Failure exceeds 10 for 10 minutes.
type: STATIC
nrql:
  query: "SELECT sum(`aws.backup.NumberOfRestoreJobsFailed`) as 'Query' FROM Metric"

# How the query result is reduced before comparison with terms.threshold (Default: SINGLE_VALUE)
valueFunction: SINGLE_VALUE

# Thresholds evaluated for this condition
terms:
  # Critical: value above 10 on every data point across 600 seconds
  - priority: CRITICAL
    operator: ABOVE
    threshold: 10
    # Seconds; allowed range 120 - 3600
    thresholdDuration: 600
    thresholdOccurrences: ALL

  # Warning (optional): same window, lower threshold of 5
  - priority: WARNING
    operator: ABOVE
    threshold: 5
    thresholdDuration: 600
    thresholdOccurrences: ALL

# Seconds after which an open violation closes automatically
# Allowed range 300 - 2592000 (Default: 86400 [1 day])
violationTimeLimitSeconds: 86400
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# Alert condition on AWS IoT TwinMaker component type creation failures.
name: High Component Type Creation Failure

description: |+
  This alert is triggered if the number of component type creation failure exceeds 10 for 5 minutes.
type: STATIC
nrql:
  query: "SELECT sum(`aws.iottwinmaker.ComponentTypeCreationFailure`) as 'Query' FROM Metric"

# How the query result is reduced before comparison with terms.threshold (Default: SINGLE_VALUE)
valueFunction: SINGLE_VALUE

# Thresholds evaluated for this condition
terms:
  # Critical: value above 10 on every data point across 300 seconds
  - priority: CRITICAL
    operator: ABOVE
    threshold: 10
    # Seconds; allowed range 120 - 3600
    thresholdDuration: 300
    thresholdOccurrences: ALL

  # Warning (optional): same window, lower threshold of 5
  - priority: WARNING
    operator: ABOVE
    threshold: 5
    thresholdDuration: 300
    thresholdOccurrences: ALL

# Seconds after which an open violation closes automatically
# Allowed range 300 - 2592000 (Default: 86400 [1 day])
violationTimeLimitSeconds: 86400
34 changes: 34 additions & 0 deletions alert-policies/aws-iot-twinmaker/HighEntityCreationFailure.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# Alert condition on AWS IoT TwinMaker entity creation failures.
name: High Entity Creation Failure

description: |+
  This alert is triggered if the number of entity creation failure exceeds 10 for 5 minutes.
type: STATIC
nrql:
  query: "SELECT sum(`aws.iottwinmaker.EntityCreationFailure`) as 'Query' FROM Metric"

# How the query result is reduced before comparison with terms.threshold (Default: SINGLE_VALUE)
valueFunction: SINGLE_VALUE

# Thresholds evaluated for this condition
terms:
  # Critical: value above 10 on every data point across 300 seconds
  - priority: CRITICAL
    operator: ABOVE
    threshold: 10
    # Seconds; allowed range 120 - 3600
    thresholdDuration: 300
    thresholdOccurrences: ALL

  # Warning (optional): same window, lower threshold of 5
  - priority: WARNING
    operator: ABOVE
    threshold: 5
    thresholdDuration: 300
    thresholdOccurrences: ALL

# Seconds after which an open violation closes automatically
# Allowed range 300 - 2592000 (Default: 86400 [1 day])
violationTimeLimitSeconds: 86400
28 changes: 28 additions & 0 deletions alert-policies/pinecone-prometheus/PineconeIndexFullness.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# Alert condition on the Pinecone index fullness gauge (0 to 1 scale).
name: Pinecone Index Fullness

description: |+
  - This metric indicates the index's fullness on a scale from 0 to 1
  - An alert is triggered if the value exceeds the 80% threshold
  - Resolution: If it surpasses 80%, we need to add another replica or increase the pod size
type: STATIC
nrql:
  query: "FROM Metric SELECT average(pinecone_index_fullness) AS '(%) index fullness ' WHERE instrumentation.name = 'remote-write' and instrumentation.provider = 'prometheus' LIMIT MAX "

# How the query result is reduced before comparison with terms.threshold (Default: SINGLE_VALUE)
valueFunction: SINGLE_VALUE

# Thresholds evaluated for this condition
terms:
  # Critical: average fullness above 0.8 on every data point across 300 seconds
  - priority: CRITICAL
    operator: ABOVE
    threshold: 0.8
    # Seconds; allowed range 120 - 3600
    thresholdDuration: 300
    thresholdOccurrences: ALL

# Seconds after which an open violation closes automatically
# Allowed range 300 - 2592000 (Default: 86400 [1 day])
violationTimeLimitSeconds: 86400
27 changes: 27 additions & 0 deletions alert-policies/pinecone-prometheus/PineconeRequestsErrors.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Alert condition on Pinecone data plane calls that resulted in errors.
name: Pinecone Request Errors

description: |+
  - This metric displays the total count of data plane calls executed by clients that resulted in errors
  - An alert is triggered if the value exceeds 0
type: STATIC
nrql:
  query: "FROM Metric SELECT latest(pinecone_request_error_count_total) AS 'request errors' WHERE instrumentation.name = 'remote-write' and instrumentation.provider = 'prometheus' LIMIT MAX"

# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE)
valueFunction: SINGLE_VALUE

# List of Critical and Warning thresholds for the condition
terms:
  - priority: CRITICAL
    # Operator used to compare against the threshold.
    operator: ABOVE
    # Value that triggers a violation.
    # Kept at 0 so that, with ABOVE, the condition matches the description
    # ("exceeds 0"); a threshold of 1 would not fire on exactly one error.
    threshold: 0
    # Time in seconds; 120 - 3600
    thresholdDuration: 300
    # How many data points must be in violation for the duration
    thresholdOccurrences: ALL

# Duration after which a violation automatically closes
# Time in seconds; 300 - 2592000 (Default: 86400 [1 day])
violationTimeLimitSeconds: 86400
29 changes: 29 additions & 0 deletions alert-policies/pinecone-prometheus/PineconeRequestsLatency.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Alert condition on the median server-side latency of Pinecone data plane calls.
name: Pinecone Request Latency

description: |+
  - This metric illustrates the server-side processing latency distribution for
  Pinecone data plane calls
  - An alert is triggered if the 50th percentile exceeds 100 ms
  - Resolution: If it surpasses 100 ms, we need to add another replica
type: STATIC
nrql:
  # The query multiplies the percentile by 1000, so the threshold below is compared in milliseconds
  query: "FROM Metric SELECT percentile(pinecone_request_latency_seconds, 50) * 1000 as 'requests latency' WHERE instrumentation.name = 'remote-write' and instrumentation.provider = 'prometheus' LIMIT MAX"

# How the query result is reduced before comparison with terms.threshold (Default: SINGLE_VALUE)
valueFunction: SINGLE_VALUE

# Thresholds evaluated for this condition
terms:
  # Critical: value above 100 on every data point across 300 seconds
  - priority: CRITICAL
    operator: ABOVE
    threshold: 100
    # Seconds; allowed range 120 - 3600
    thresholdDuration: 300
    thresholdOccurrences: ALL

# Seconds after which an open violation closes automatically
# Allowed range 300 - 2592000 (Default: 86400 [1 day])
violationTimeLimitSeconds: 86400
Original file line number Diff line number Diff line change
Expand Up @@ -3544,7 +3544,7 @@
"nrqlQueries": [
{
"accountIds": [],
"query": "SELECT latest(`apache.server.scoreboard.totalWorkers`) * uniqueCount(entity.name) as 'Total Workers', latest(`apache.server.idleWorkers`) * uniqueCount(entity.name) as 'Idle Workers', latest(`apache.server.busyWorkers`) * uniqueCount(entity.name) as 'Busy Workers' FROM Metric WHERE `metricName` IN ('apache.server.scoreboard.totalWorkers', 'apache.server.idleWorkers', 'apache.server.busyWorkers') TIMESERIES"
"query": "SELECT (latest(server.scoreboard.totalWorkers) * uniqueCount(entityName)) AS `Total Workers`, (latest(server.idleWorkers) * uniqueCount(entityName)) AS `Idle Workers`, (latest(server.busyWorkers) * uniqueCount(entityName)) AS `Busy Workers` FROM ApacheSample TIMESERIES"
}
],
"platformOptions": {
Expand Down Expand Up @@ -3593,7 +3593,7 @@
"nrqlQueries": [
{
"accountId": 0,
"query": "SELECT latest(host.cpuPercent) AS 'CPU used %' FROM Metric TIMESERIES"
"query": "SELECT latest(cpuPercent) AS `CPU used %` FROM SystemSample TIMESERIES"
}
],
"platformOptions": {
Expand Down Expand Up @@ -3623,7 +3623,7 @@
"nrqlQueries": [
{
"accountId": 0,
"query": "SELECT latest(host.memoryUsedPercent) AS 'Memory used %' FROM Metric TIMESERIES"
"query": "SELECT latest(memoryUsedPercent) AS `Memory used %` FROM SystemSample TIMESERIES"
}
],
"platformOptions": {
Expand Down Expand Up @@ -3653,7 +3653,7 @@
"nrqlQueries": [
{
"accountId": 0,
"query": "SELECT latest(host.disk.usedPercent) as 'Storage used %' FROM Metric TIMESERIES"
"query": "SELECT latest(diskUsedPercent) AS `Storage used %` FROM StorageSample TIMESERIES"
}
],
"platformOptions": {
Expand All @@ -3680,7 +3680,7 @@
"nrqlQueries": [
{
"accountId": 0,
"query": "SELECT latest(host.disk.usedPercent) as 'Used %' FROM Metric FACET device"
"query": "SELECT latest(diskUsedPercent) AS `Used %` FROM StorageSample FACET device"
}
],
"platformOptions": {
Expand Down
Loading

0 comments on commit aa2aebc

Please sign in to comment.