diff --git a/distribution/tools/server-cli/src/main/java/org/elasticsearch/server/cli/APMJvmOptions.java b/distribution/tools/server-cli/src/main/java/org/elasticsearch/server/cli/APMJvmOptions.java index 844c0bba651d5..84f90c01b03d4 100644 --- a/distribution/tools/server-cli/src/main/java/org/elasticsearch/server/cli/APMJvmOptions.java +++ b/distribution/tools/server-cli/src/main/java/org/elasticsearch/server/cli/APMJvmOptions.java @@ -16,6 +16,7 @@ import org.elasticsearch.common.settings.SecureSettings; import org.elasticsearch.common.settings.SecureString; import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.core.Booleans; import org.elasticsearch.core.Nullable; import java.io.IOException; @@ -38,6 +39,8 @@ * server URL and secret key can only be provided when Elasticsearch starts. */ class APMJvmOptions { + private static final String OTEL_METRICS_ENABLED_SYSTEM_PROPERTY = "telemetry.otel.metrics.enabled"; + /** * Contains agent configuration that must always be applied, and cannot be overridden. */ @@ -137,14 +140,24 @@ class APMJvmOptions { */ static List apmJvmOptions(Settings settings, @Nullable SecureSettings secrets, Path logsDir, Path tmpdir) throws UserException, IOException { - final Path agentJar = findAgentJar(); + boolean tracingEnabled = settings.getAsBoolean("telemetry.tracing.enabled", false); + boolean metricsEnabled = settings.getAsBoolean("telemetry.metrics.enabled", false); + boolean agentMetricsEnabled = Booleans.parseBoolean(System.getProperty(OTEL_METRICS_ENABLED_SYSTEM_PROPERTY, "false")) == false; + boolean attachAgent = tracingEnabled || (metricsEnabled && agentMetricsEnabled); + + final Path agentJar = findAgentJar(System.getProperty("user.dir")); - if (agentJar == null) { + if (attachAgent == false || agentJar == null) { return List.of(); } final Map propertiesMap = extractApmSettings(settings); + if (metricsEnabled == false || agentMetricsEnabled == false) { + propertiesMap.put("metrics_interval", "0s"); + propertiesMap.put("disable_metrics", "*"); + } + // Configures a log file to write to. Don't disable writing to a log file, // as the agent will then require extra Security Manager permissions when // it tries to do something else, and it's just painful. diff --git a/gradle/verification-metadata.xml b/gradle/verification-metadata.xml index 325d27b10633f..eb968234e8d0b 100644 --- a/gradle/verification-metadata.xml +++ b/gradle/verification-metadata.xml @@ -2244,6 +2244,31 @@ + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/modules/apm/build.gradle b/modules/apm/build.gradle index 91a75bf4926cc..a757ed2e8132b 100644 --- a/modules/apm/build.gradle +++ b/modules/apm/build.gradle @@ -14,19 +14,66 @@ esplugin { } def otelVersion = '1.57.0' -def otelSemconvVersion = '1.21.0-alpha' +def otelInstrumentationVersion = '2.23.0-alpha' dependencies { implementation "io.opentelemetry:opentelemetry-api:${otelVersion}" implementation "io.opentelemetry:opentelemetry-context:${otelVersion}" - implementation "io.opentelemetry:opentelemetry-semconv:${otelSemconvVersion}" - runtimeOnly "io.opentelemetry:opentelemetry-common:${otelVersion}" - runtimeOnly "co.elastic.apm:elastic-apm-agent-java8:1.55.0" + implementation "io.opentelemetry:opentelemetry-sdk:${otelVersion}" + implementation "io.opentelemetry:opentelemetry-sdk-metrics:${otelVersion}" + implementation("io.opentelemetry:opentelemetry-exporter-otlp:${otelVersion}") { + exclude group: 'io.opentelemetry', module: 'opentelemetry-exporter-sender-okhttp' + } + implementation "io.opentelemetry.instrumentation:opentelemetry-runtime-telemetry-java17:${otelInstrumentationVersion}" + implementation "io.opentelemetry.instrumentation:opentelemetry-runtime-telemetry-java8:${otelInstrumentationVersion}" + + implementation "io.opentelemetry:opentelemetry-sdk-common:${otelVersion}" + runtimeOnly "io.opentelemetry:opentelemetry-common:${otelVersion}" + runtimeOnly "io.opentelemetry:opentelemetry-sdk-trace:${otelVersion}" + runtimeOnly "io.opentelemetry:opentelemetry-sdk-logs:${otelVersion}" + runtimeOnly "io.opentelemetry:opentelemetry-exporter-common:${otelVersion}" + runtimeOnly "io.opentelemetry:opentelemetry-exporter-otlp-common:${otelVersion}" + runtimeOnly "io.opentelemetry:opentelemetry-exporter-sender-jdk:${otelVersion}" + runtimeOnly "io.opentelemetry:opentelemetry-sdk-extension-autoconfigure-spi:${otelVersion}" + runtimeOnly "co.elastic.apm:elastic-apm-agent-java8:1.55.0" + + testImplementation "io.opentelemetry:opentelemetry-sdk-testing:${otelVersion}" javaRestTestImplementation project(':modules:apm') javaRestTestImplementation project(':test:framework') } +tasks.named("thirdPartyAudit").configure { + ignoreMissingClasses( + 'com.fasterxml.jackson.core.JsonFactory', + 'com.fasterxml.jackson.core.JsonGenerator', + 'com.google.common.io.ByteStreams', + 'com.google.common.util.concurrent.ListenableFuture', + 'io.grpc.CallOptions', + 'io.grpc.Channel', + 'io.grpc.Drainable', + 'io.grpc.KnownLength', + 'io.grpc.ManagedChannel', + 'io.grpc.MethodDescriptor', + 'io.grpc.MethodDescriptor$Builder', + 'io.grpc.MethodDescriptor$Marshaller', + 'io.grpc.MethodDescriptor$MethodType', + 'io.grpc.stub.AbstractFutureStub', + 'io.grpc.stub.AbstractStub', + 'io.grpc.stub.ClientCalls' + ) + ignoreViolations( + // uses internal java api: sun.misc.Unsafe + 'io.opentelemetry.internal.shaded.jctools.util.UnsafeRefArrayAccess', + 'io.opentelemetry.internal.shaded.jctools.util.UnsafeAccess', + 'io.opentelemetry.exporter.internal.marshal.UnsafeAccess', + 'io.opentelemetry.exporter.internal.marshal.UnsafeAccess$UnsafeHolder', + 'io.opentelemetry.internal.shaded.jctools.queues.MpscArrayQueueProducerLimitField', + 'io.opentelemetry.internal.shaded.jctools.queues.MpscArrayQueueProducerIndexField', + 'io.opentelemetry.internal.shaded.jctools.queues.MpscArrayQueueConsumerIndexField' + ) +} + tasks.named("dependencyLicenses").configure { mapping from: /opentelemetry-.*/, to: 'opentelemetry' } diff --git a/modules/apm/src/main/java/module-info.java b/modules/apm/src/main/java/module-info.java index f179c4c7bbe0a..111db2794d24d 100644 --- a/modules/apm/src/main/java/module-info.java +++ b/modules/apm/src/main/java/module-info.java @@ -13,8 +13,13 @@ requires org.elasticsearch.xcontent; requires org.apache.logging.log4j; requires org.apache.lucene.core; - requires io.opentelemetry.context; requires io.opentelemetry.api; + requires io.opentelemetry.context; + requires io.opentelemetry.sdk; + requires io.opentelemetry.sdk.metrics; + requires io.opentelemetry.exporter.otlp; + requires io.opentelemetry.instrumentation.runtime_telemetry_java17; + requires io.opentelemetry.sdk.common; exports org.elasticsearch.telemetry.apm; exports org.elasticsearch.telemetry.apm.metrics; diff --git a/modules/apm/src/main/java/org/elasticsearch/telemetry/apm/APM.java b/modules/apm/src/main/java/org/elasticsearch/telemetry/apm/APM.java index 43447cfa21a62..b2e2b17a8a93a 100644 --- a/modules/apm/src/main/java/org/elasticsearch/telemetry/apm/APM.java +++ b/modules/apm/src/main/java/org/elasticsearch/telemetry/apm/APM.java @@ -21,6 +21,7 @@ import org.elasticsearch.telemetry.apm.internal.APMAgentSettings; import org.elasticsearch.telemetry.apm.internal.APMMeterService; import org.elasticsearch.telemetry.apm.internal.APMTelemetryProvider; +import org.elasticsearch.telemetry.apm.internal.OTelSdkSettings; import org.elasticsearch.telemetry.apm.internal.tracing.APMTracer; import java.util.Collection; @@ -88,6 +89,8 @@ public List> getSettings() { APMAgentSettings.TELEMETRY_API_KEY_SETTING, // Metrics APMAgentSettings.TELEMETRY_METRICS_ENABLED_SETTING, + OTelSdkSettings.TELEMETRY_OTEL_METRICS_ENDPOINT, + OTelSdkSettings.TELEMETRY_OTEL_METRICS_INTERVAL, // Tracing APMAgentSettings.TELEMETRY_TRACING_ENABLED_SETTING, APMAgentSettings.TELEMETRY_TRACING_NAMES_INCLUDE_SETTING, diff --git a/modules/apm/src/main/java/org/elasticsearch/telemetry/apm/internal/APMMeterService.java b/modules/apm/src/main/java/org/elasticsearch/telemetry/apm/internal/APMMeterService.java index 5a8978074d164..9e33e77549864 100644 --- a/modules/apm/src/main/java/org/elasticsearch/telemetry/apm/internal/APMMeterService.java +++ b/modules/apm/src/main/java/org/elasticsearch/telemetry/apm/internal/APMMeterService.java @@ -16,20 +16,22 @@ import org.elasticsearch.cluster.service.ClusterService; import org.elasticsearch.common.component.AbstractLifecycleComponent; import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.core.Booleans; import org.elasticsearch.telemetry.apm.APMMeterRegistry; import java.util.function.Supplier; public class APMMeterService extends AbstractLifecycleComponent { - private final APMMeterRegistry meterRegistry; + static final String OTEL_METRICS_ENABLED_SYSTEM_PROPERTY = "telemetry.otel.metrics.enabled"; + private final APMMeterRegistry meterRegistry; private final Supplier otelMeterSupplier; private final Supplier noopMeterSupplier; protected volatile boolean enabled; public APMMeterService(Settings settings) { - this(settings, APMMeterService.otelMeter(), APMMeterService.noopMeter()); + this(settings, createOtelMeterSupplier(settings), () -> OpenTelemetry.noop().getMeter("noop")); } public APMMeterService(Settings settings, Supplier otelMeterSupplier, Supplier noopMeterSupplier) { @@ -40,7 +42,14 @@ public APMMeterService(boolean enabled, Supplier otelMeterSupplier, Suppl this.enabled = enabled; this.otelMeterSupplier = otelMeterSupplier; this.noopMeterSupplier = noopMeterSupplier; - this.meterRegistry = new APMMeterRegistry(enabled ? createOtelMeter() : createNoopMeter()); + this.meterRegistry = new APMMeterRegistry(enabled ? otelMeterSupplier.get() : noopMeterSupplier.get()); + } + + private static Supplier createOtelMeterSupplier(Settings settings) { + if (Booleans.parseBoolean(System.getProperty(OTEL_METRICS_ENABLED_SYSTEM_PROPERTY, "false")) == false) { + return () -> GlobalOpenTelemetry.get().getMeter("elasticsearch"); + } + return new OTelSdkMeterSupplier(settings); } public APMMeterRegistry getMeterRegistry() { @@ -52,11 +61,7 @@ public APMMeterRegistry getMeterRegistry() { */ void setEnabled(boolean enabled) { this.enabled = enabled; - if (enabled) { - meterRegistry.setProvider(createOtelMeter()); - } else { - meterRegistry.setProvider(createNoopMeter()); - } + meterRegistry.setProvider(enabled ? otelMeterSupplier.get() : noopMeterSupplier.get()); } @Override @@ -64,29 +69,16 @@ protected void doStart() {} @Override protected void doStop() { - meterRegistry.setProvider(createNoopMeter()); + if (otelMeterSupplier instanceof AutoCloseable closeable) { + try { + closeable.close(); + } catch (Exception e) { + // TODO + } + } + meterRegistry.setProvider(noopMeterSupplier.get()); } @Override protected void doClose() {} - - protected Meter createOtelMeter() { - assert this.enabled; - return otelMeterSupplier.get(); - } - - protected Meter createNoopMeter() { - return noopMeterSupplier.get(); - } - - protected static Supplier noopMeter() { - return () -> OpenTelemetry.noop().getMeter("noop"); - } - - // to be used within doPrivileged block - private static Supplier otelMeter() { - var openTelemetry = GlobalOpenTelemetry.get(); - var meter = openTelemetry.getMeter("elasticsearch"); - return () -> meter; - } } diff --git a/modules/apm/src/main/java/org/elasticsearch/telemetry/apm/internal/MetricValidator.java b/modules/apm/src/main/java/org/elasticsearch/telemetry/apm/internal/MetricValidator.java index 24586d4f60f6d..3e93e8e9b75a4 100644 --- a/modules/apm/src/main/java/org/elasticsearch/telemetry/apm/internal/MetricValidator.java +++ b/modules/apm/src/main/java/org/elasticsearch/telemetry/apm/internal/MetricValidator.java @@ -67,7 +67,14 @@ public class MetricValidator { "es.thread_pool.searchable_snapshots_cache_fetch_async.*", "es.thread_pool.searchable_snapshots_cache_prewarming.*", "es.thread_pool.security-crypto.*", - "es.thread_pool.security-token-key.*" + "es.thread_pool.security-token-key.*", + // APM Java agent-compatible metric names (see https://www.elastic.co/docs/reference/apm/agents/java/metrics#metrics-jvm) + "system.cpu.*", + "system.memory.*", + "system.process.*", + "jvm.file_descriptor.*", + "jvm.gc.*", + "jvm.memory.*" ); /** @@ -241,7 +248,10 @@ private static class Attributes { Map.entry("es.tsdb.downsample.actions.shard.total", DOWNSAMPLE_ATTRIBUTES), Map.entry("es.tsdb.downsample.actions.total", DOWNSAMPLE_ATTRIBUTES), Map.entry("es.tsdb.downsample.latency.shard.histogram", DOWNSAMPLE_ATTRIBUTES), - Map.entry("es.tsdb.downsample.latency.total.histogram", DOWNSAMPLE_ATTRIBUTES) + Map.entry("es.tsdb.downsample.latency.total.histogram", DOWNSAMPLE_ATTRIBUTES), + // APM Java agent-compatible metrics (see https://www.elastic.co/docs/reference/apm/agents/java/metrics#metrics-jvm) + Map.entry("jvm.gc.count", Set.of("name")), + Map.entry("jvm.gc.time", Set.of("name")) ); // forbidden attributes known to cause issues due to mapping conflicts or high cardinality @@ -317,6 +327,11 @@ public static void assertValidAttributeNames(String metricName, Map, AutoCloseable { + private final Settings settings; + private SdkMeterProvider meterProvider; + private RuntimeMetrics runtimeMetrics; + + OTelSdkMeterSupplier(Settings settings) { + this.settings = settings; + } + + @Override + public Meter get() { + if (meterProvider == null) { + var exporter = createOTLPExporter(); + TimeValue intervalTimeValue = OTelSdkSettings.TELEMETRY_OTEL_METRICS_INTERVAL.get(settings); + var reader = PeriodicMetricReader.builder(exporter).setInterval(Duration.ofMillis(intervalTimeValue.millis())).build(); + meterProvider = SdkMeterProvider.builder() + .setResource(Resource.builder().put("service.name", "elasticsearch").build()) + .registerMetricReader(reader) + .build(); + var otelSdk = OpenTelemetrySdk.builder().setMeterProvider(meterProvider).build(); + runtimeMetrics = RuntimeMetrics.builder(otelSdk).disableAllFeatures().build(); + } + return meterProvider.get("elasticsearch"); + } + + private OtlpHttpMetricExporter createOTLPExporter() { + String endpoint = OTelSdkSettings.TELEMETRY_OTEL_METRICS_ENDPOINT.get(settings); + if (endpoint == null || endpoint.isEmpty()) { + throw new IllegalStateException( + APMMeterService.OTEL_METRICS_ENABLED_SYSTEM_PROPERTY + "=true requires telemetry.otel.metrics.endpoint to be configured" + ); + } + OtlpHttpMetricExporterBuilder builder = OtlpHttpMetricExporter.builder().setEndpoint(endpoint); + String authHeader = getAuthorizationHeader(); + if (authHeader != null) { + builder.addHeader("Authorization", authHeader); + } + return builder.build(); + } + + private String getAuthorizationHeader() { + try (SecureString apiKey = APMAgentSettings.TELEMETRY_API_KEY_SETTING.get(settings)) { + if (apiKey.isEmpty() == false) { + return "ApiKey " + apiKey; + } + } + try (SecureString secretToken = APMAgentSettings.TELEMETRY_SECRET_TOKEN_SETTING.get(settings)) { + if (secretToken.isEmpty() == false) { + return "Bearer " + secretToken; + } + } + return null; + } + + @Override + public void close() { + if (runtimeMetrics != null) { + runtimeMetrics.close(); + runtimeMetrics = null; + } + if (meterProvider != null) { + meterProvider.close(); + meterProvider = null; + } + } +} diff --git a/modules/apm/src/main/java/org/elasticsearch/telemetry/apm/internal/OTelSdkSettings.java b/modules/apm/src/main/java/org/elasticsearch/telemetry/apm/internal/OTelSdkSettings.java new file mode 100644 index 0000000000000..7fe9f098b9c41 --- /dev/null +++ b/modules/apm/src/main/java/org/elasticsearch/telemetry/apm/internal/OTelSdkSettings.java @@ -0,0 +1,34 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". + */ + +package org.elasticsearch.telemetry.apm.internal; + +import org.elasticsearch.common.settings.Setting; +import org.elasticsearch.core.TimeValue; + +import java.util.Locale; + +import static org.elasticsearch.common.settings.Setting.Property.NodeScope; + +public class OTelSdkSettings { + private OTelSdkSettings() {} + + public static final Setting TELEMETRY_OTEL_METRICS_ENDPOINT = new Setting<>( + "telemetry.otel.metrics.endpoint", + "", + s -> s.toLowerCase(Locale.ROOT), + NodeScope + ); + + public static final Setting TELEMETRY_OTEL_METRICS_INTERVAL = Setting.timeSetting( + "telemetry.otel.metrics.interval", + TimeValue.timeValueSeconds(10), + NodeScope + ); +} diff --git a/modules/apm/src/main/plugin-metadata/entitlement-policy.yaml b/modules/apm/src/main/plugin-metadata/entitlement-policy.yaml index 216c67c492260..c65183e939687 100644 --- a/modules/apm/src/main/plugin-metadata/entitlement-policy.yaml +++ b/modules/apm/src/main/plugin-metadata/entitlement-policy.yaml @@ -1,3 +1,4 @@ +# These can be removed once the APM Agent is fully deprecated org.elasticsearch.telemetry.apm: - create_class_loader - write_system_properties: @@ -83,3 +84,17 @@ org.elasticsearch.telemetry.apm: - elastic.apm.application_packages - elastic.apm.stack_trace_limit - elastic.apm.span_stack_trace_min_duration +io.opentelemetry.sdk.metrics: + - manage_threads + +io.opentelemetry.exporter.sender.jdk.internal: + - outbound_network + +io.opentelemetry.instrumentation.runtime_telemetry_java17: + - manage_threads + +io.opentelemetry.instrumentation.runtime_telemetry_java8: + - manage_threads + +io.opentelemetry.sdk.common: + - manage_threads diff --git a/server/src/main/java/org/elasticsearch/monitor/metrics/SystemMetrics.java b/server/src/main/java/org/elasticsearch/monitor/metrics/SystemMetrics.java new file mode 100644 index 0000000000000..7bf07a3f189e4 --- /dev/null +++ b/server/src/main/java/org/elasticsearch/monitor/metrics/SystemMetrics.java @@ -0,0 +1,230 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". + */ + +package org.elasticsearch.monitor.metrics; + +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; +import org.elasticsearch.common.component.AbstractLifecycleComponent; +import org.elasticsearch.core.Booleans; +import org.elasticsearch.monitor.jvm.SunThreadInfo; +import org.elasticsearch.monitor.os.OsProbe; +import org.elasticsearch.monitor.process.ProcessProbe; +import org.elasticsearch.telemetry.metric.DoubleWithAttributes; +import org.elasticsearch.telemetry.metric.LongWithAttributes; +import org.elasticsearch.telemetry.metric.MeterRegistry; + +import java.lang.management.GarbageCollectorMXBean; +import java.lang.management.ManagementFactory; +import java.lang.management.ThreadMXBean; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.function.LongSupplier; + +public class SystemMetrics extends AbstractLifecycleComponent { + private final Logger logger = LogManager.getLogger(SystemMetrics.class); + public static final String OTEL_METRICS_ENABLED_SYSTEM_PROPERTY = "telemetry.otel.metrics.enabled"; + + private static final AllocatedBytesMetrics ALLOCATED_BYTES_METRICS = new AllocatedBytesMetrics(); + + private final MeterRegistry registry; + private final List metrics = new ArrayList<>(); + + public SystemMetrics(MeterRegistry registry) { + this.registry = registry; + } + + @Override + protected void doStart() { + if (Booleans.parseBoolean(System.getProperty(OTEL_METRICS_ENABLED_SYSTEM_PROPERTY, "false")) == false) { + return; + } + registerFileDescriptorsMetrics(); + registerSystemMemoryMetrics(); + registerCgroupMemoryMetrics(); + registerJvmGcMetrics(); + registerSystemCpuMetrics(); + } + + @Override + protected void doStop() {} + + @Override + protected void doClose() { + for (AutoCloseable metric : metrics) { + try { + metric.close(); + } catch (Exception e) { + logger.warn("metrics close() method should not throw Exception", e); + } + } + } + + private void registerFileDescriptorsMetrics() { + registerLongGaugeUnlessNegative( + "jvm.file_descriptor.count", + "The number of opened file descriptors.", + "{file_descriptor}", + ProcessProbe::getOpenFileDescriptorCount + ); + registerLongGaugeUnlessNegative( + "jvm.file_descriptor.limit", + "The maximum number of opened file descriptors.", + "{file_descriptor}", + ProcessProbe::getMaxFileDescriptorCount + ); + } + + private void registerSystemMemoryMetrics() { + if (OsProbe.getInstance().getTotalPhysicalMemorySize() <= 0) { + return; + } + metrics.add( + registry.registerLongGauge( + "system.memory.actual.free", + "Actual free memory in bytes. It is calculated based on the OS. " + + "On Linux it consists of the free memory plus caches and buffers." + + " On OSX it is a sum of free memory and the inactive memory. On Windows, this value does not include memory " + + "consumed by system caches and buffers.", + "By", + () -> new LongWithAttributes(OsProbe.getInstance().getActualFreePhysicalMemorySize()) + ) + ); + metrics.add( + registry.registerLongGauge( + "system.memory.total", + "Total memory.", + "By", + () -> new LongWithAttributes(OsProbe.getInstance().getTotalPhysicalMemorySize()) + ) + ); + + registerLongGaugeUnlessNegative( + "system.process.memory.size", + "The total virtual memory the process has.", + "By", + ProcessProbe::getTotalVirtualMemorySize + ); + + metrics.add( + registry.registerLongGauge( + "jvm.gc.alloc", + "An approximation of the total amount of memory, in bytes, allocated in heap memory.", + "By", + ALLOCATED_BYTES_METRICS::readAllocatedBytes + ) + ); + } + + private void registerCgroupMemoryMetrics() { + metrics.add( + registry.registerLongGauge( + "system.process.cgroup.memory.mem.usage.bytes", + "Memory usage in current cgroup slice.", + "By", + () -> new LongWithAttributes(OsProbe.getInstance().getCgroupMemoryUsageInBytes().orElse(0L)) + ) + ); + + metrics.add( + registry.registerLongGauge( + "system.process.cgroup.memory.mem.limit.bytes", + "Memory limit for current cgroup slice.", + "By", + () -> new LongWithAttributes(OsProbe.getInstance().getCgroupMemoryLimitInBytes().orElse(0L)) + ) + ); + } + + private void registerJvmGcMetrics() { + List beans = ManagementFactory.getGarbageCollectorMXBeans(); + if (beans.isEmpty()) { + return; + } + + metrics.add(registry.registerLongsAsyncCounter("jvm.gc.count", "The total number of collections that have occurred.", "1", () -> { + var measurements = new ArrayList(beans.size()); + for (GarbageCollectorMXBean bean : beans) { + long count = bean.getCollectionCount(); + if (count >= 0) { + measurements.add(new LongWithAttributes(count, Map.of("name", bean.getName()))); + } + } + return measurements; + })); + + metrics.add( + registry.registerLongsAsyncCounter( + "jvm.gc.time", + "The approximate accumulated collection elapsed time in " + "milliseconds.", + "ms", + () -> { + var measurements = new ArrayList(beans.size()); + for (GarbageCollectorMXBean bean : beans) { + long timeMs = bean.getCollectionTime(); + if (timeMs >= 0) { + measurements.add(new LongWithAttributes(timeMs, Map.of("name", bean.getName()))); + } + } + return measurements; + } + ) + ); + } + + private void registerSystemCpuMetrics() { + short initial = OsProbe.getSystemCpuPercent(); + if (initial < 0) { + return; + } + metrics.add( + registry.registerDoubleGauge( + "system.cpu.total.norm.pct", + "System-wide CPU usage as a ratio.", + "1", + () -> new DoubleWithAttributes(Math.max(0, OsProbe.getSystemCpuPercent() / 100.0)) + ) + ); + } + + private void registerLongGaugeUnlessNegative(String name, String description, String unit, LongSupplier supplier) { + long initial = supplier.getAsLong(); + if (initial < 0) { + return; + } + metrics.add(registry.registerLongGauge(name, description, unit, () -> new LongWithAttributes(supplier.getAsLong()))); + } + + private static final class AllocatedBytesMetrics { + private static final ThreadMXBean THREAD_MX_BEAN = ManagementFactory.getThreadMXBean(); + private volatile SunThreadInfo sunThreadInfo; + + private AllocatedBytesMetrics() {} + + LongWithAttributes readAllocatedBytes() { + if (sunThreadInfo == null) { + if (SunThreadInfo.INSTANCE.isThreadAllocatedMemorySupported() == false + || SunThreadInfo.INSTANCE.isThreadAllocatedMemoryEnabled() == false) { + return new LongWithAttributes(0); + } + this.sunThreadInfo = SunThreadInfo.INSTANCE; + } + + long total = 0; + for (long id : THREAD_MX_BEAN.getAllThreadIds()) { + long allocated = sunThreadInfo.getThreadAllocatedBytes(id); + if (allocated > 0) { + total += allocated; + } + } + return new LongWithAttributes(total); + } + } +} diff --git a/server/src/main/java/org/elasticsearch/monitor/os/OsProbe.java b/server/src/main/java/org/elasticsearch/monitor/os/OsProbe.java index ff4b92769bc96..d1e730b36da88 100644 --- a/server/src/main/java/org/elasticsearch/monitor/os/OsProbe.java +++ b/server/src/main/java/org/elasticsearch/monitor/os/OsProbe.java @@ -30,6 +30,7 @@ import java.util.List; import java.util.Map; import java.util.Optional; +import java.util.OptionalLong; import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.stream.Stream; @@ -105,6 +106,52 @@ public long getFreePhysicalMemorySize() { } } + /** + * Returns the amount of actual free physical memory in bytes. + *

+ * This prefers {@code MemAvailable} from {@code /proc/meminfo} when available, and otherwise falls back to + * {@code MemFree + Buffers + Cached}. When {@code /proc/meminfo} can't be read or parsed, this falls back to + * {@link #getFreePhysicalMemorySize()}. + */ + public long getActualFreePhysicalMemorySize() { + if (Constants.LINUX == false) { + return getFreePhysicalMemorySize(); + } + try { + List meminfoLines = readProcMeminfo(); + long available = parseMeminfoKbToBytes(meminfoLines, "MemAvailable:"); + if (available > 0) { + return available; + } + long free = parseMeminfoKbToBytes(meminfoLines, "MemFree:"); + long buffers = parseMeminfoKbToBytes(meminfoLines, "Buffers:"); + long cached = parseMeminfoKbToBytes(meminfoLines, "Cached:"); + if (free > 0 && buffers > 0 && cached > 0) { + return free + buffers + cached; + } + } catch (Exception e) { + logger.warn("exception retrieving actual free physical memory", e); + } + return getFreePhysicalMemorySize(); + } + + private static long parseMeminfoKbToBytes(List meminfoLines, String prefix) { + for (String line : meminfoLines) { + if (line.startsWith(prefix) == false) { + continue; + } + try { + int end = line.lastIndexOf(" kB"); + String number = line.substring(prefix.length(), end).trim(); + return Long.parseLong(number) * 1024; + } catch (NumberFormatException | IndexOutOfBoundsException e) { + logger.warn("Unable to retrieve {} from meminfo line [{}]", prefix, line); + return 0; + } + } + return 0; + } + /** * Returns the total amount of physical memory in bytes. */ @@ -795,6 +842,31 @@ public static OsProbe getInstance() { private static final Logger logger = LogManager.getLogger(OsProbe.class); + public OptionalLong getCgroupMemoryUsageInBytes() { + OsStats.Cgroup cgroup = getCgroup(Constants.LINUX); + return cgroup == null ? OptionalLong.empty() : parseCgroupBytes(cgroup.getMemoryUsageInBytes()); + } + + public OptionalLong getCgroupMemoryLimitInBytes() { + OsStats.Cgroup cgroup = getCgroup(Constants.LINUX); + return cgroup == null ? OptionalLong.empty() : parseCgroupBytes(cgroup.getMemoryLimitInBytes()); + } + + private static OptionalLong parseCgroupBytes(String value) { + if (value == null || "max".equals(value)) { + return OptionalLong.empty(); + } + try { + BigInteger bigInt = new BigInteger(value); + if (bigInt.signum() < 0) { + return OptionalLong.empty(); + } + return OptionalLong.of(bigInt.longValueExact()); + } catch (NumberFormatException | ArithmeticException e) { + return OptionalLong.empty(); + } + } + OsInfo osInfo(long refreshInterval, Processors processors) throws IOException { return new OsInfo( refreshInterval, @@ -875,6 +947,9 @@ List readOsRelease() throws IOException { @SuppressForbidden(reason = "access /proc/meminfo") List readProcMeminfo() throws IOException { final List lines; + if (Constants.LINUX == false) { + return Collections.emptyList(); + } if (Files.exists(PathUtils.get("/proc/meminfo"))) { lines = Files.readAllLines(PathUtils.get("/proc/meminfo")); assert lines != null && lines.isEmpty() == false; @@ -889,28 +964,7 @@ List readProcMeminfo() throws IOException { */ long getTotalMemFromProcMeminfo() throws IOException { List meminfoLines = readProcMeminfo(); - final List memTotalLines = meminfoLines.stream().filter(line -> line.startsWith("MemTotal")).toList(); - assert memTotalLines.size() <= 1 : memTotalLines; - if (memTotalLines.size() == 1) { - final String memTotalLine = memTotalLines.get(0); - int beginIdx = memTotalLine.indexOf("MemTotal:"); - int endIdx = memTotalLine.lastIndexOf(" kB"); - if (beginIdx + 9 < endIdx) { - final String memTotalString = memTotalLine.substring(beginIdx + 9, endIdx).trim(); - try { - long memTotalInKb = Long.parseLong(memTotalString); - return memTotalInKb * 1024; - } catch (NumberFormatException e) { - logger.warn("Unable to retrieve total memory from meminfo line [" + memTotalLine + "]"); - return 0; - } - } else { - logger.warn("Unable to retrieve total memory from meminfo line [" + memTotalLine + "]"); - return 0; - } - } else { - return 0; - } + return parseMeminfoKbToBytes(meminfoLines, "MemTotal:"); } boolean isDebian8() throws IOException { diff --git a/server/src/main/java/org/elasticsearch/node/Node.java b/server/src/main/java/org/elasticsearch/node/Node.java index 49cad4f207c56..b13ee3258864f 100644 --- a/server/src/main/java/org/elasticsearch/node/Node.java +++ b/server/src/main/java/org/elasticsearch/node/Node.java @@ -66,6 +66,7 @@ import org.elasticsearch.monitor.jvm.JvmInfo; import org.elasticsearch.monitor.metrics.IndicesMetrics; import org.elasticsearch.monitor.metrics.NodeMetrics; +import org.elasticsearch.monitor.metrics.SystemMetrics; import org.elasticsearch.node.internal.TerminationHandler; import org.elasticsearch.plugins.ClusterCoordinationPlugin; import org.elasticsearch.plugins.ClusterPlugin; @@ -290,6 +291,7 @@ public Node start() throws NodeValidationException { injector.getInstance(FsHealthService.class).start(); injector.getInstance(NodeMetrics.class).start(); injector.getInstance(IndicesMetrics.class).start(); + injector.getInstance(SystemMetrics.class).start(); injector.getInstance(HealthPeriodicLogger.class).start(); injector.getInstance(SamplingService.class).start(); nodeService.getMonitorService().start(); @@ -490,6 +492,7 @@ private void stop() { stopIfStarted(TransportService.class); stopIfStarted(NodeMetrics.class); stopIfStarted(IndicesMetrics.class); + stopIfStarted(SystemMetrics.class); pluginLifecycleComponents.forEach(Node::stopIfStarted); // we should stop this last since it waits for resources to get released @@ -561,6 +564,7 @@ public synchronized void close() throws IOException { toClose.add(injector.getInstance(TransportService.class)); toClose.add(injector.getInstance(NodeMetrics.class)); toClose.add(injector.getInstance(IndicesMetrics.class)); + toClose.add(injector.getInstance(SystemMetrics.class)); if (ReadinessService.enabled(environment)) { toClose.add(injector.getInstance(ReadinessService.class)); } diff --git a/server/src/main/java/org/elasticsearch/node/NodeConstruction.java b/server/src/main/java/org/elasticsearch/node/NodeConstruction.java index 123138dea93ca..27a8587b4ab36 100644 --- a/server/src/main/java/org/elasticsearch/node/NodeConstruction.java +++ b/server/src/main/java/org/elasticsearch/node/NodeConstruction.java @@ -161,6 +161,7 @@ import org.elasticsearch.monitor.jvm.JvmInfo; import org.elasticsearch.monitor.metrics.IndicesMetrics; import org.elasticsearch.monitor.metrics.NodeMetrics; +import org.elasticsearch.monitor.metrics.SystemMetrics; import org.elasticsearch.node.internal.TerminationHandler; import org.elasticsearch.node.internal.TerminationHandlerProvider; import org.elasticsearch.persistent.PersistentTasksClusterService; @@ -1265,6 +1266,7 @@ public void sendRequest( final TimeValue metricsInterval = settings.getAsTime("telemetry.agent.metrics_interval", TimeValue.timeValueSeconds(10)); final NodeMetrics nodeMetrics = new NodeMetrics(telemetryProvider.getMeterRegistry(), nodeService, metricsInterval); final IndicesMetrics indicesMetrics = new IndicesMetrics(telemetryProvider.getMeterRegistry(), indicesService, metricsInterval); + final SystemMetrics systemMetrics = new SystemMetrics(telemetryProvider.getMeterRegistry()); OnlinePrewarmingService onlinePrewarmingService = pluginsService.loadSingletonServiceProvider( OnlinePrewarmingServiceProvider.class, @@ -1370,6 +1372,7 @@ public void sendRequest( b.bind(TransportService.class).toInstance(transportService); b.bind(NodeMetrics.class).toInstance(nodeMetrics); b.bind(IndicesMetrics.class).toInstance(indicesMetrics); + b.bind(SystemMetrics.class).toInstance(systemMetrics); b.bind(NetworkService.class).toInstance(networkService); b.bind(IndexMetadataVerifier.class).toInstance(indexMetadataVerifier); b.bind(ClusterInfoService.class).toInstance(clusterInfoService); diff --git a/server/src/test/java/org/elasticsearch/monitor/os/OsProbeTests.java b/server/src/test/java/org/elasticsearch/monitor/os/OsProbeTests.java index 3b259497d066d..bb06177270efa 100644 --- a/server/src/test/java/org/elasticsearch/monitor/os/OsProbeTests.java +++ b/server/src/test/java/org/elasticsearch/monitor/os/OsProbeTests.java @@ -323,6 +323,23 @@ public void testGetTotalMemFromProcMeminfo() throws Exception { assertThat(probe.getTotalMemFromProcMeminfo(), equalTo(memTotalInKb * 1024L)); } + public void testGetActualFreePhysicalMemory() { + int cgroupsVersion = 1; + + var meminfoLines = Arrays.asList( + "MemFree: 8467692 kB", + "MemAvailable: 39646240 kB", + "Buffers: 4699504 kB", + "Cached: 23290380 kB" + ); + OsProbe probe = buildStubOsProbe(cgroupsVersion, "", List.of(), meminfoLines); + assertThat(probe.getActualFreePhysicalMemorySize(), equalTo(39646240 * 1024L)); + + meminfoLines = Arrays.asList("MemFree: 10 kB", "Buffers: 20 kB", "Cached: 30 kB"); + probe = buildStubOsProbe(cgroupsVersion, "", List.of(), meminfoLines); + assertThat(probe.getActualFreePhysicalMemorySize(), equalTo((10 + 20 + 30) * 1024L)); + } + public void testTotalMemoryOverride() { assertThat(OsProbe.getTotalMemoryOverride("123456789"), is(123456789L)); assertThat(OsProbe.getTotalMemoryOverride("123456789123456789"), is(123456789123456789L));