Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: added setup for production observability (metrics via OTEL) #1924

Merged
merged 12 commits into from
Nov 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions .env.example
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,14 @@ CAPTCHA_SECRET=

NEXT_PUBLIC_CAPTCHA_SITE_KEY=

OTEL_TELEMETRY_COLLECTION_ENABLED=
sheensantoscapadngan marked this conversation as resolved.
Show resolved Hide resolved
OTEL_EXPORT_TYPE=
OTEL_EXPORT_OTLP_ENDPOINT=
OTEL_OTLP_PUSH_INTERVAL=

OTEL_COLLECTOR_BASIC_AUTH_USERNAME=
OTEL_COLLECTOR_BASIC_AUTH_PASSWORD=

PLAIN_API_KEY=
PLAIN_WISH_LABEL_IDS=

Expand Down
4 changes: 3 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,9 @@ up-dev:
up-dev-ldap:
docker compose -f docker-compose.dev.yml --profile ldap up --build

up-dev-metrics:
docker compose -f docker-compose.dev.yml --profile metrics up --build

up-prod:
docker-compose -f docker-compose.prod.yml up --build

Expand All @@ -27,4 +30,3 @@ reviewable-api:
npm run type:check

reviewable: reviewable-ui reviewable-api

1,800 changes: 1,685 additions & 115 deletions backend/package-lock.json

Large diffs are not rendered by default.

8 changes: 8 additions & 0 deletions backend/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,14 @@
"@octokit/plugin-retry": "^5.0.5",
"@octokit/rest": "^20.0.2",
"@octokit/webhooks-types": "^7.3.1",
"@opentelemetry/api": "^1.9.0",
"@opentelemetry/auto-instrumentations-node": "^0.53.0",
"@opentelemetry/exporter-metrics-otlp-proto": "^0.55.0",
"@opentelemetry/exporter-prometheus": "^0.55.0",
"@opentelemetry/instrumentation": "^0.55.0",
"@opentelemetry/resources": "^1.28.0",
"@opentelemetry/sdk-metrics": "^1.28.0",
"@opentelemetry/semantic-conventions": "^1.27.0",
"@peculiar/asn1-schema": "^2.3.8",
"@peculiar/x509": "^1.12.1",
"@serdnam/pino-cloudwatch-transport": "^1.0.4",
Expand Down
15 changes: 12 additions & 3 deletions backend/src/lib/config/env.ts
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,15 @@ const envSchema = z
INFISICAL_CLOUD: zodStrBool.default("false"),
MAINTENANCE_MODE: zodStrBool.default("false"),
CAPTCHA_SECRET: zpStr(z.string().optional()),

// TELEMETRY
OTEL_TELEMETRY_COLLECTION_ENABLED: zodStrBool.default("false"),
OTEL_EXPORT_OTLP_ENDPOINT: zpStr(z.string().optional()),
OTEL_OTLP_PUSH_INTERVAL: z.coerce.number().default(30000),
OTEL_COLLECTOR_BASIC_AUTH_USERNAME: zpStr(z.string().optional()),
OTEL_COLLECTOR_BASIC_AUTH_PASSWORD: zpStr(z.string().optional()),
OTEL_EXPORT_TYPE: z.enum(["prometheus", "otlp"]).optional(),

PLAIN_API_KEY: zpStr(z.string().optional()),
PLAIN_WISH_LABEL_IDS: zpStr(z.string().optional()),
DISABLE_AUDIT_LOG_GENERATION: zodStrBool.default("false"),
Expand Down Expand Up @@ -203,11 +212,11 @@ let envCfg: Readonly<z.infer<typeof envSchema>>;

export const getConfig = () => envCfg;
// cannot import singleton logger directly as it needs config to load various transport
export const initEnvConfig = (logger: Logger) => {
export const initEnvConfig = (logger?: Logger) => {
const parsedEnv = envSchema.safeParse(process.env);
if (!parsedEnv.success) {
logger.error("Invalid environment variables. Check the error below");
logger.error(parsedEnv.error.issues);
(logger ?? console).error("Invalid environment variables. Check the error below");
(logger ?? console).error(parsedEnv.error.issues);
process.exit(-1);
}

Expand Down
91 changes: 91 additions & 0 deletions backend/src/lib/telemetry/instrumentation.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
import opentelemetry, { diag, DiagConsoleLogger, DiagLogLevel } from "@opentelemetry/api";
import { getNodeAutoInstrumentations } from "@opentelemetry/auto-instrumentations-node";
import { OTLPMetricExporter } from "@opentelemetry/exporter-metrics-otlp-proto";
import { PrometheusExporter } from "@opentelemetry/exporter-prometheus";
import { registerInstrumentations } from "@opentelemetry/instrumentation";
import { Resource } from "@opentelemetry/resources";
import { AggregationTemporality, MeterProvider, PeriodicExportingMetricReader } from "@opentelemetry/sdk-metrics";
import { ATTR_SERVICE_NAME, ATTR_SERVICE_VERSION } from "@opentelemetry/semantic-conventions";
import dotenv from "dotenv";

import { initEnvConfig } from "../config/env";

dotenv.config();

/**
 * Configures OpenTelemetry metrics for this process.
 *
 * Builds a meter provider tagged with the service name and version, attaches a
 * metric reader for the requested export type, installs the provider as the
 * global meter provider and registers the Node auto-instrumentations.
 *
 * @param exportType - `"prometheus"` attaches a `PrometheusExporter`; `"otlp"`
 *   pushes metrics to a collector through a periodic reader.
 * @param otlpURL - Base URL of the OTLP collector; `/v1/metrics` is appended.
 *   Required when `exportType` is `"otlp"`.
 * @param otlpUser - Optional basic-auth username for the collector.
 * @param otlpPassword - Optional basic-auth password for the collector.
 * @param otlpPushInterval - Export interval in milliseconds for the OTLP reader.
 * @throws Error when `exportType` is not supported, or when `"otlp"` is
 *   selected without an endpoint.
 */
const initTelemetryInstrumentation = ({
  exportType,
  otlpURL,
  otlpUser,
  otlpPassword,
  otlpPushInterval
}: {
  exportType?: string;
  otlpURL?: string;
  otlpUser?: string;
  otlpPassword?: string;
  otlpPushInterval?: number;
}) => {
  // NOTE(review): DEBUG makes the OTEL diagnostics logger very chatty; confirm this is intended outside development.
  diag.setLogger(new DiagConsoleLogger(), DiagLogLevel.DEBUG);

  const resource = Resource.default().merge(
    new Resource({
      [ATTR_SERVICE_NAME]: "infisical-core",
      [ATTR_SERVICE_VERSION]: "0.1.0"
    })
  );

  const metricReaders = [];
  switch (exportType) {
    case "prometheus": {
      const promExporter = new PrometheusExporter();
      metricReaders.push(promExporter);
      break;
    }
    case "otlp": {
      // Without this guard the exporter would be pointed at the literal string "undefined/v1/metrics".
      if (!otlpURL) {
        throw new Error("OTEL_EXPORT_OTLP_ENDPOINT must be set when the OTEL export type is otlp");
      }

      // Only send an Authorization header when credentials are actually configured;
      // otherwise the collector would receive base64("undefined:undefined").
      const headers: Record<string, string> = {};
      if (otlpUser !== undefined || otlpPassword !== undefined) {
        const credentials = `${otlpUser ?? ""}:${otlpPassword ?? ""}`;
        // Buffer encodes as UTF-8, so credentials with non-Latin1 characters do not throw as they would with btoa.
        headers.Authorization = `Basic ${Buffer.from(credentials, "utf8").toString("base64")}`;
      }

      const otlpExporter = new OTLPMetricExporter({
        // Drop trailing slashes so the endpoint does not end up as "...//v1/metrics".
        url: `${otlpURL.replace(/\/+$/, "")}/v1/metrics`,
        headers,
        temporalityPreference: AggregationTemporality.DELTA
      });
      metricReaders.push(
        new PeriodicExportingMetricReader({
          exporter: otlpExporter,
          exportIntervalMillis: otlpPushInterval
        })
      );
      break;
    }
    default:
      throw new Error("Invalid OTEL export type");
  }

  const meterProvider = new MeterProvider({
    resource,
    readers: metricReaders
  });

  opentelemetry.metrics.setGlobalMeterProvider(meterProvider);

  registerInstrumentations({
    instrumentations: [getNodeAutoInstrumentations()]
  });
};

/**
 * Loads the environment configuration and, when
 * `OTEL_TELEMETRY_COLLECTION_ENABLED` is true, initializes telemetry with the
 * OTEL-related settings from that configuration.
 *
 * `initEnvConfig` is called without a logger here, so configuration errors are
 * reported through `console`.
 */
const setupTelemetry = () => {
  const appCfg = initEnvConfig();

  if (appCfg.OTEL_TELEMETRY_COLLECTION_ENABLED) {
    console.log("Initializing telemetry instrumentation");
    initTelemetryInstrumentation({
      otlpURL: appCfg.OTEL_EXPORT_OTLP_ENDPOINT,
      otlpUser: appCfg.OTEL_COLLECTOR_BASIC_AUTH_USERNAME,
      otlpPassword: appCfg.OTEL_COLLECTOR_BASIC_AUTH_PASSWORD,
      otlpPushInterval: appCfg.OTEL_OTLP_PUSH_INTERVAL,
      exportType: appCfg.OTEL_EXPORT_TYPE
    });
  }
};

// Runs as a side effect of importing this module.
void setupTelemetry();
3 changes: 3 additions & 0 deletions backend/src/main.ts
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import "./lib/telemetry/instrumentation";

import dotenv from "dotenv";
import path from "path";

Expand All @@ -18,6 +20,7 @@ dotenv.config();
const run = async () => {
const logger = await initLogger();
const appCfg = initEnvConfig(logger);

const db = initDbConnection({
dbConnectionUri: appCfg.DB_CONNECTION_URI,
dbRootCert: appCfg.DB_ROOT_CERT,
Expand Down
5 changes: 5 additions & 0 deletions backend/src/server/app.ts
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import { TSmtpService } from "@app/services/smtp/smtp-service";

import { globalRateLimiterCfg } from "./config/rateLimiter";
import { addErrorsToResponseSchemas } from "./plugins/add-errors-to-response-schemas";
import { apiMetrics } from "./plugins/api-metrics";
import { fastifyErrHandler } from "./plugins/error-handler";
import { registerExternalNextjs } from "./plugins/external-nextjs";
import { serializerCompiler, validatorCompiler, ZodTypeProvider } from "./plugins/fastify-zod";
Expand Down Expand Up @@ -86,6 +87,10 @@ export const main = async ({ db, hsmModule, auditLogDb, smtp, logger, queue, key
// pull ip based on various proxy headers
await server.register(fastifyIp);

if (appCfg.OTEL_TELEMETRY_COLLECTION_ENABLED) {
await server.register(apiMetrics);
}

await server.register(fastifySwagger);
await server.register(fastifyFormBody);
await server.register(fastifyErrHandler);
Expand Down
21 changes: 21 additions & 0 deletions backend/src/server/plugins/api-metrics.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
import opentelemetry from "@opentelemetry/api";
import fp from "fastify-plugin";

/**
 * Fastify plugin that records the latency of every completed request.
 *
 * Creates an "API_latency" histogram (unit: ms) on the "API" meter and, in an
 * `onResponse` hook, records `reply.elapsedTime` tagged with the route, HTTP
 * method and response status code.
 */
export const apiMetrics = fp(async (fastify) => {
  const apiMeter = opentelemetry.metrics.getMeter("API");
  const latencyHistogram = apiMeter.createHistogram("API_latency", {
    unit: "ms"
  });

  fastify.addHook("onResponse", async (request, reply) => {
    const { method } = request;
    // `request.routerPath` is deprecated in Fastify; `routeOptions.url` is its documented replacement.
    const route = request.routeOptions.url;
    const { statusCode } = reply;

    latencyHistogram.record(reply.elapsedTime, {
      route,
      method,
      statusCode
    });
  });
});
38 changes: 38 additions & 0 deletions docker-compose.dev.yml
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ services:
- .env
ports:
- 4000:4000
- 9464:9464 # for OTEL collection of Prometheus metrics
environment:
- NODE_ENV=development
- DB_CONNECTION_URI=postgres://infisical:infisical@db/infisical?sslmode=disable
Expand All @@ -95,6 +96,42 @@ services:
extra_hosts:
- "host.docker.internal:host-gateway"

prometheus:
image: prom/prometheus
volumes:
- ./prometheus.dev.yml:/etc/prometheus/prometheus.yml
ports:
- "9090:9090"
command:
- "--config.file=/etc/prometheus/prometheus.yml"
profiles: [metrics]

otel-collector:
image: otel/opentelemetry-collector-contrib
volumes:
- ./otel-collector-config.yaml:/etc/otelcol-contrib/config.yaml
ports:
- 1888:1888 # pprof extension
- 8888:8888 # Prometheus metrics exposed by the Collector
- 8889:8889 # Prometheus exporter metrics
- 13133:13133 # health_check extension
- 4317:4317 # OTLP gRPC receiver
- 4318:4318 # OTLP http receiver
- 55679:55679 # zpages extension
profiles: [metrics-otel]

grafana:
image: grafana/grafana
container_name: grafana
restart: unless-stopped
environment:
- GF_LOG_LEVEL=debug
ports:
- "3005:3000"
volumes:
- "grafana_storage:/var/lib/grafana"
profiles: [metrics]

frontend:
container_name: infisical-dev-frontend
restart: unless-stopped
Expand Down Expand Up @@ -166,3 +203,4 @@ volumes:
driver: local
ldap_data:
ldap_config:
grafana_storage:
42 changes: 39 additions & 3 deletions docs/self-hosting/configuration/envars.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,8 @@ Used to configure platform-specific security and operational settings
</ParamField>

<ParamField query="TELEMETRY_ENABLED" type="string" default="true" optional>
Telemetry helps us improve Infisical but if you want to dsiable it you may set this to `false`.
Telemetry helps us improve Infisical but if you want to disable it you may set
this to `false`.
</ParamField>

## Data Layer
Expand Down Expand Up @@ -65,8 +66,9 @@ DB_READ_REPLICAS=[{"DB_CONNECTION_URI":""}]
Use the command below to encode your certificate:
`echo "<certificate>" | base64`

If not provided it will use master SSL certificate.
If not provided it will use master SSL certificate.
</ParamField>

</Expandable>
</ParamField>

Expand Down Expand Up @@ -350,8 +352,9 @@ Optional (for TLS/SSL):

TLS: Available on the same ports (2525, 80, 25, 8025, or 587)
SSL: Available on ports 465, 8465, and 443

</Note>
</Accordion>
</Accordion>

## Authentication

Expand Down Expand Up @@ -518,3 +521,36 @@ To help you sync secrets from Infisical to services such as Github and Gitlab, I
OAuth2 client secret for Gitlab integration
</ParamField>
</Accordion>

## Observability

You can configure Infisical to collect and expose telemetry data for analytics and monitoring.

<ParamField
query="OTEL_TELEMETRY_COLLECTION_ENABLED"
type="string"
default="false"
>
Whether or not to collect and expose telemetry data.
</ParamField>

<ParamField query="OTEL_EXPORT_TYPE" type="enum" optional>
Supported types are `prometheus` and `otlp`.

If export type is set to `prometheus`, metric data will be exposed on port 9464 at the `/metrics` path.

If export type is set to `otlp`, you will have to configure a value for `OTEL_EXPORT_OTLP_ENDPOINT`.

</ParamField>

<ParamField query="OTEL_EXPORT_OTLP_ENDPOINT" type="string">
The endpoint that telemetry data is pushed to for collection. This is only
applicable when `OTEL_EXPORT_TYPE` is set to `otlp`.
</ParamField>

<ParamField query="OTEL_COLLECTOR_BASIC_AUTH_USERNAME" type="string">
The username for authenticating with the telemetry collector.
</ParamField>
<ParamField query="OTEL_COLLECTOR_BASIC_AUTH_PASSWORD" type="string">
The password for authenticating with the telemetry collector.
</ParamField>
45 changes: 45 additions & 0 deletions otel-collector-config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
# OpenTelemetry Collector configuration (mounted by the otel-collector service
# in docker-compose.dev.yml).
extensions:
  health_check:
  pprof:
  zpages:
  # Basic-auth authenticator referenced by the OTLP receiver and the Prometheus
  # exporter below. Credentials are hard-coded inline.
  basicauth/server:
    htpasswd:
      inline: |
        infisical:infisical

receivers:
  # OTLP over HTTP, protected by basic auth.
  otlp:
    protocols:
      http:
        endpoint: 0.0.0.0:4318
        auth:
          authenticator: basicauth/server

  # Scrapes backend:9464 every 30s. Defined here but not listed in the metrics
  # pipeline under `service`.
  prometheus:
    config:
      scrape_configs:
        - job_name: otel-collector
          scrape_interval: 30s
          static_configs:
            - targets: [backend:9464]
          metric_relabel_configs:
            - action: labeldrop
              regex: "service_instance_id|service_name"
processors:
  batch:

exporters:
  # Re-exposes collected metrics for Prometheus scraping on port 8889.
  prometheus:
    endpoint: "0.0.0.0:8889"
    auth:
      authenticator: basicauth/server
    resource_to_telemetry_conversion:
      enabled: true

service:
  extensions: [basicauth/server, health_check, pprof, zpages]
  pipelines:
    metrics:
      receivers: [otlp]
      processors: [batch]
      exporters: [prometheus]
5 changes: 5 additions & 0 deletions prometheus.dev.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Prometheus configuration mounted by the prometheus service in
# docker-compose.dev.yml: scrape backend:9464 every 30 seconds.
scrape_configs:
  - job_name: "metric-collector"
    scrape_interval: 30s
    static_configs:
      - targets: ["backend:9464"]
Loading