Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

#3666 - Queue Monitoring - Enable Prometheus Metrics #4012

Merged
merged 14 commits into from
Dec 2, 2024
4 changes: 4 additions & 0 deletions devops/openshift/queue-consumers-deploy.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,10 @@ objects:
type: Rolling
template:
metadata:
annotations:
prometheus.io/scrape: "true"
prometheus.io/port: ${PORT}
prometheus.io/path: /metrics
labels:
deploymentconfig: ${NAME}
spec:
Expand Down
52 changes: 18 additions & 34 deletions devops/openshift/sysdig-team.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,37 +7,21 @@ spec:
team:
description: The Sysdig Team for the OpenShift Project Set SIMS
users:
- name: [email protected]
role: ROLE_TEAM_EDIT
- name: [email protected]
role: ROLE_TEAM_EDIT
- name: [email protected]
role: ROLE_TEAM_EDIT
- name: [email protected]
role: ROLE_TEAM_EDIT
- name: [email protected]
role: ROLE_TEAM_EDIT
- name: [email protected]
role: ROLE_TEAM_EDIT
- name: [email protected]
role: ROLE_TEAM_STANDARD
- name: [email protected]
role: ROLE_TEAM_STANDARD
- name: [email protected]
role: ROLE_TEAM_STANDARD
- name: [email protected]
role: ROLE_TEAM_STANDARD
- name: [email protected]
role: ROLE_TEAM_STANDARD
- name: [email protected]
role: ROLE_TEAM_READ
- name: [email protected]
role: ROLE_TEAM_READ
- name: [email protected]
role: ROLE_TEAM_READ
- name: [email protected]
role: ROLE_TEAM_READ
- name: [email protected]
role: ROLE_TEAM_READ
- name: [email protected]
role: ROLE_TEAM_READ
- name: [email protected]
role: ROLE_TEAM_EDIT
- name: [email protected]
role: ROLE_TEAM_EDIT
- name: [email protected]
role: ROLE_TEAM_EDIT
- name: [email protected]
role: ROLE_TEAM_EDIT
- name: [email protected]
role: ROLE_TEAM_EDIT
- name: [email protected]
role: ROLE_TEAM_EDIT
- name: [email protected]
role: ROLE_TEAM_EDIT
- name: [email protected]
role: ROLE_TEAM_STANDARD
- name: [email protected]
role: ROLE_TEAM_STANDARD
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
export * from "./health-check/health.controller";
export * from "./metrics/metrics.controller";
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
import {
Controller,
Get,
Header,
InternalServerErrorException,
} from "@nestjs/common";
import { register } from "prom-client";
import { InjectLogger, LoggerService } from "@sims/utilities/logger";

/**
 * Allows prometheus to scrape metrics from the queue-consumers.
 * @see https://developer.gov.bc.ca/docs/default/component/platform-developer-docs/docs/app-monitoring/user-defined-monitoring/#expose-the-metrics-from-your-app
 */
@Controller("metrics")
export class MetricsController {
  /**
   * Exports metrics from the queue-consumers.
   * @returns metrics in Prometheus format.
   * @throws InternalServerErrorException when the metrics cannot be
   * collected from the registry. The original error is logged.
   */
  @Get()
  @Header("content-type", register.contentType)
  async getMetrics(): Promise<string> {
    try {
      // The promise must be awaited inside the try block. Returning it
      // un-awaited would let a rejection escape the catch below, skipping
      // the logging and the conversion to InternalServerErrorException.
      return await register.metrics();
    } catch (error) {
      this.logger.error("Error while getting metrics.", error);
      throw new InternalServerErrorException(
        "Error while getting metrics. See server logs for details.",
      );
    }
  }

  @InjectLogger()
  logger: LoggerService;
}
Original file line number Diff line number Diff line change
Expand Up @@ -72,13 +72,14 @@ import {
import { SFASIntegrationModule } from "@sims/integrations/sfas-integration";
import { ATBCIntegrationModule } from "@sims/integrations/atbc-integration";
import { ECEIntegrationModule } from "@sims/integrations/institution-integration/ece-integration";
import { HealthController } from "./controllers";
import { HealthController, MetricsController } from "./controllers";
import { MicroserviceHealthIndicator, TerminusModule } from "@nestjs/terminus";
import { CASSupplierIntegrationService } from "./services/cas-supplier/cas-supplier.service";
import { VirusScanProcessor } from "./processors/virus-scan/virus-scan.processor";
import { CASService } from "@sims/integrations/cas/cas.service";
import { ObjectStorageService } from "@sims/integrations/object-storage";
import { BullBoardQueuesModule } from "./bull-board/bull-board-queues.module";
import { QueuesMetricsModule } from "./queues-metrics.module.module";

// TODO: Removed ATBCResponseIntegrationScheduler in providers, the queuename from enum and the decorators of the processor as part of #2539.
@Module({
Expand All @@ -87,6 +88,7 @@ import { BullBoardQueuesModule } from "./bull-board/bull-board-queues.module";
DatabaseModule,
QueueModule,
BullBoardQueuesModule,
QueuesMetricsModule,
ZeebeModule.forRoot(),
IER12IntegrationModule,
ECEIntegrationModule,
Expand Down Expand Up @@ -161,6 +163,6 @@ import { BullBoardQueuesModule } from "./bull-board/bull-board-queues.module";
CASActiveSupplierFoundProcessor,
CASActiveSupplierAndSiteFoundProcessor,
],
controllers: [HealthController],
controllers: [HealthController, MetricsController],
})
export class QueueConsumersModule {}
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
import { Global, LoggerService, Module, OnModuleInit } from "@nestjs/common";
import { InjectLogger } from "@sims/utilities/logger";
import { MetricsService } from "./services";

/**
 * Global module that provides and exports the {@link MetricsService}
 * and wires up the queue metrics when the module is initialized.
 */
@Global()
@Module({
  providers: [MetricsService],
  exports: [MetricsService],
})
export class QueuesMetricsModule implements OnModuleInit {
  @InjectLogger()
  logger: LoggerService;

  constructor(private readonly metricsService: MetricsService) {}

  /**
   * Module initialization hook. Logs the start of the setup, applies the
   * global metrics configurations, and then waits for the queue events
   * counters metrics to be associated.
   */
  async onModuleInit(): Promise<void> {
    this.logger.log("Associating queue events for metrics.");
    this.metricsService.setGlobalMetricsConfigurations();
    await this.metricsService.associateQueueEventsCountersMetrics();
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,4 @@ export * from "./workflow/workflow-enqueuer.service";
export * from "./cas-supplier/cas-supplier.service";
export * from "./student-file/student-file.service";
export * from "./cas-supplier/cas-evaluation-result-processor";
export * from "./metrics/metrics.service";
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
import { QueueModel } from "@sims/services/queue";
import { Queue } from "bull";

/**
* Bull queues event names to have metrics associated.
* Each member value is the event name string as listed in the
* Bull reference documentation linked below.
* @see https://github.com/OptimalBits/bull/blob/develop/REFERENCE.md#events
*/
export enum QueuesMetricsEvents {
/**
* An error occurred, for instance, a Redis connectivity issue.
*/
Error = "error",
/**
* A job is waiting to be processed as soon as a worker is idle.
*/
Waiting = "waiting",
/**
* A job has started.
*/
Active = "active",
/**
* A job has been marked as stalled. This is useful for debugging job
* workers that crash or pause the event loop.
*/
Stalled = "stalled",
/**
* A job successfully completed with a result.
*/
Completed = "completed",
/**
* A job's progress was updated.
*/
Progress = "progress",
/**
* A job went to failed state.
*/
Failed = "failed",
/**
* The job changed to delayed state.
*/
Delayed = "delayed",
/**
* The queue has been paused.
*/
Paused = "paused",
/**
* A job was successfully removed.
*/
Removed = "removed",
/**
* The queue has been resumed.
*/
Resumed = "resumed",
/**
* Emitted every time the queue has processed all the waiting jobs
* (even if there can be some delayed jobs not yet processed).
*/
Drained = "drained",
/**
* A job failed to extend its lock. This is useful to debug Redis
* connection issues and jobs getting restarted because workers
* are not able to extend locks.
*/
LockExtensionFailed = "lock-extension-failed",
}

/**
* Information to provide metrics to a queue.
*/
export interface MonitoredQueue {
/**
* Bull queue instance to be monitored.
*/
provider: Queue;
/**
* Queue model paired with the provider.
* NOTE(review): presumably carries the queue name/configuration used
* when reporting metrics; confirm against the QueueModel definition.
*/
queueModel: QueueModel;
}

/**
* Default label added to all the metrics.
* NOTE(review): presumably applied while setting the global metrics
* configurations in the metrics service; confirm against its usage.
*/
export const DEFAULT_METRICS_APP_LABEL = "queue-consumers";
Loading
Loading