Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

adds optional per table metrics #5030

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -330,6 +330,10 @@ public enum Property {
PropertyType.TIMEDURATION,
"The maximum amount of time that a Scanner should wait before retrying a failed RPC.",
"1.7.3"),
GENERAL_MICROMETER_TABLE_METRICS_ENABLED("general.micrometer.table.metrics.enabled", "false",
PropertyType.BOOLEAN,
"Enables per table metrics for a subset of meters. Turning this on will add tableId tags to some meters which will increase the cardinality of metrics.",
"4.0.0"),
GENERAL_MICROMETER_CACHE_METRICS_ENABLED("general.micrometer.cache.metrics.enabled", "false",
PropertyType.BOOLEAN, "Enables Caffeine Cache metrics functionality using Micrometer.",
"4.0.0"),
Expand Down
2 changes: 1 addition & 1 deletion pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,7 @@
<dependency>
<groupId>io.micrometer</groupId>
<artifactId>micrometer-bom</artifactId>
<version>1.12.2</version>
<version>1.13.6</version>
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

FYI that some things will have to change down in the accumulo-testing Terraform contrib code when this is merged due to the version change.

<type>pom</type>
<scope>import</scope>
</dependency>
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,203 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.accumulo.server.metrics;

import java.util.ArrayList;
import java.util.List;
import java.util.Objects;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.TimeUnit;
import java.util.function.Consumer;
import java.util.function.Supplier;

import org.apache.accumulo.core.conf.Property;
import org.apache.accumulo.core.data.TableId;
import org.apache.accumulo.core.manager.state.tables.TableState;
import org.apache.accumulo.core.metrics.MetricsProducer;
import org.apache.accumulo.core.util.Timer;
import org.apache.accumulo.server.ServerContext;
import org.slf4j.Logger;

import com.google.common.base.Preconditions;

import io.micrometer.core.instrument.Meter;
import io.micrometer.core.instrument.MeterRegistry;
import io.micrometer.core.instrument.Tag;

/**
* Common code for dealing with per table metrics. This code handles automatically creating and
* deleting per table metrics as needed. To use this class extend and implement
* {@link #newAllTablesMetrics(MeterRegistry, Consumer, List)} and
* {@link #newPerTableMetrics(MeterRegistry, TableId, Consumer, List)} to create per table metrics
* object and then use {@link #getTableMetrics(TableId)} to get those cached objects.
*/
public abstract class PerTableMetrics<T> implements MetricsProducer {

public static final String TABLE_ID_TAG_NAME = "tableId";

private final ServerContext context;

/**
 * Cache entry pairing a table metrics object with the meters that were registered for it and a
 * timer tracking how long the table has been inactive on this server.
 */
private static class TableMetricsInfo<T2> {
  // Metrics object created by the subclass for a single table.
  final T2 tableMetrics;
  // Meters registered for this table; removed from the registry when this entry is discarded.
  final List<Meter> meters;
  // Null while the table is active; set when the table is first observed to be inactive.
  volatile Timer inactiveTime;

  public TableMetricsInfo(T2 tableMetrics, List<Meter> meters) {
    this.meters = meters;
    this.tableMetrics = Objects.requireNonNull(tableMetrics);
  }
}

private final boolean perTableActive;
private final Supplier<Set<TableId>> activeTables;
private final ConcurrentHashMap<TableId,TableMetricsInfo<T>> perTableMetrics =
new ConcurrentHashMap<>();
private T allTableMetrics;
private volatile MeterRegistry registry;

/**
 * @param context server context used to read configuration and to schedule the periodic refresh
 * @param activeTableSupplier supplies the set of table ids currently active on this server;
 *        polled periodically when per table metrics are enabled
 */
public PerTableMetrics(ServerContext context, Supplier<Set<TableId>> activeTableSupplier) {
  activeTables = activeTableSupplier;
  perTableActive =
      context.getConfiguration().getBoolean(Property.GENERAL_MICROMETER_TABLE_METRICS_ENABLED);
  this.context = context;
  if (perTableActive) {
    // NOTE(review): this::refresh publishes a reference to a partially constructed object;
    // refresh() is a no-op until registerMetrics() sets the registry, which mitigates this, but
    // subclass state may not be initialized when the first run fires — consider a start() hook.
    context.getScheduledExecutor().scheduleAtFixedRate(this::refresh, 30, 30, TimeUnit.SECONDS);
  }
}

/**
 * This method exists so this class can log using the logger of the subclass.
 */
protected abstract Logger getLog();

/**
 * Subclasses should implement this method to create a table metrics object that will be used in
 * the case when per table metrics are disabled. The object returned by this method will always
 * be returned by {@link #getTableMetrics(TableId)} no matter what the table id is.
 *
 * @param registry register any meters for the table metrics in this registry
 * @param meters a consumer that accepts meters to be removed from the registry when the table
 *        metrics object is discarded. Currently this consumer does nothing with the meters; it
 *        is passed for consistency with
 *        {@link #newPerTableMetrics(MeterRegistry, TableId, Consumer, List)}
 * @param tags currently an empty collection of tags; this is passed for consistency with
 *        {@link #newPerTableMetrics(MeterRegistry, TableId, Consumer, List)}
 * @return a new object that will be cached and later returned by
 *         {@link #getTableMetrics(TableId)}
 */
protected abstract T newAllTablesMetrics(MeterRegistry registry, Consumer<Meter> meters,
    List<Tag> tags);

/**
 * Subclasses should implement this method to create per table metrics objects. This method is
 * called in the case where per table metrics are enabled. These objects will be cached and
 * returned by {@link #getTableMetrics(TableId)}. Table metrics objects in the cache that are no
 * longer needed will be automatically removed when the table is deleted or this server has not
 * hosted the table for a bit.
 *
 * @param registry register any meters for the table metrics in this registry
 * @param meters a consumer that accepts meters to be removed from the registry when the per
 *        table metrics object is discarded.
 * @param tags a list with a single tag in it which is the tableId. These tags should be used
 *        when registering meters
 * @return a new object that will be cached and later returned by
 *         {@link #getTableMetrics(TableId)}
 */
protected abstract T newPerTableMetrics(MeterRegistry registry, TableId tableId,
    Consumer<Meter> meters, List<Tag> tags);

private TableMetricsInfo<T> getOrCreateTableMetrics(TableId tableId) {
Preconditions.checkState(perTableActive);
return perTableMetrics.computeIfAbsent(tableId, tid -> {
List<Meter> meters = new ArrayList<>();
T tableMetrics = newPerTableMetrics(registry, tableId, meters::add,
List.of(Tag.of(TABLE_ID_TAG_NAME, tid.canonical())));
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm thinking it might be more useful that have a tableName tag and use the table name instead of the tableId.

getLog().debug("Created {} meters for table id {} in metrics registry.", meters.size(),
tableId);
return new TableMetricsInfo<>(tableMetrics, meters);
});
}

/**
 * Records the registry and, when per table metrics are disabled, eagerly creates the single
 * shared all-tables metrics object. Must be called exactly once before
 * {@link #getTableMetrics(TableId)} is used.
 */
@Override
public void registerMetrics(MeterRegistry registry) {
  // Guard against double registration; also makes the null check in refresh() meaningful.
  Preconditions.checkState(this.registry == null);
  this.registry = registry;
  if (!perTableActive) {
    this.allTableMetrics = newAllTablesMetrics(registry, m -> {}, List.of());
  }
}

/**
 * Obtains the metrics object for the given table. When per table metrics are enabled this is a
 * cached object specific to the table; otherwise the single shared all-tables object created in
 * {@link #registerMetrics(MeterRegistry)} is returned regardless of the table id.
 */
public T getTableMetrics(TableId tableId) {
  Preconditions.checkState(registry != null);

  if (perTableActive) {
    return getOrCreateTableMetrics(tableId).tableMetrics;
  }

  return allTableMetrics;
}

/**
* This method will create per table metrics for any tables that are active on this server and
* currently have no table metrics object in the cache. It will also remove an per table metrics
* object from the cache that have been inactive for a while or where the table was deleted.
*/
public synchronized void refresh() {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm curious if it might be better to evaluate whether per table metrics should be added or removed when a tablet is hosted or unhosted in the ScanServer and TabletServer. There are explicit mechanisms in the TabletServer for hosting and unhosting tablets. In the ScanServer we have the TabletMetadataLoader for hosting a tablet, and we could add an evictionListener to the tabletMetadataCache to handle a tablet removal. Thoughts on that?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It probably would be better to do that for all cases. Its only being done for the tablet server on tablet load for this method. The three other cases you mentioned are not done.

For the scan server I could not find a good place to register on tablet load, I will circle back and see what I can find. For now its probably ok that scan server does not register on load because it has no gauges, so when a scan happen it will touch meters which will load metrics. However that is shaky ground, if gauges were ever used then those may not be loaded until the timer task kicks in. Would also be good to push code to TabletHostingServer so that the metrics code can interact w/ the same code for each server type.

If all 4 cases are covered with callbacks then we could run the timer task less frequently. For the unload case I was completely leaving that to the timer task to catch.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Made some changes related to this in afa401f. Was able to optimize and centralize the code for detecting changes in the set of table ids. Using those changes could efficiently handle a tablet being loaded and detect if anything needed to be done. However for the case of a tablet being unloaded found that is hard to handle that efficiently because when one tablet is unloaded other tablet may still have that same tablet id, so need to scan all tablets on each tablet unload to see if anything needs to be done. Decided not to do anything for this case and leave it to the periodic timer task. Was able to centralize that timer task and make it more efficient though.

if (!perTableActive || registry == null) {
return;
}

var currentActive = activeTables.get();

currentActive.forEach(tid -> {
// This registers metrics for the table if none are currently registered and resets the
// inactiveTime if one exists
getOrCreateTableMetrics(tid).inactiveTime = null;
});

// clean up any tables that have been inactive for a bit
var iter = perTableMetrics.entrySet().iterator();
while (iter.hasNext()) {
var entry = iter.next();
var tableId = entry.getKey();
if (!currentActive.contains(tableId)) {
var tableMetricsInfo = entry.getValue();
var tableState = context.getTableManager().getTableState(tableId);
if (tableState == null || tableState == TableState.DELETING) {
// immediately remove deleted tables
iter.remove();
tableMetricsInfo.meters.forEach(registry::remove);
getLog().debug(
"Removed {} meters for table id {} from metrics registry because table was deleted.",
tableMetricsInfo.meters.size(), tableId);
} else if (tableMetricsInfo.inactiveTime == null) {
// the first time this table was seen as inactive so start a timer for removal
tableMetricsInfo.inactiveTime = Timer.startNew();
} else if (tableMetricsInfo.inactiveTime.hasElapsed(10, TimeUnit.MINUTES)) {
iter.remove();
tableMetricsInfo.meters.forEach(registry::remove);
getLog().debug(
"Removed {} meters for table id {} from metrics registry because table was inactive.",
tableMetricsInfo.meters.size(), tableId);
}
}
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,8 @@ public void run() {
}
}

server.refreshMetrics(extent.tableId());

tablet = null; // release this reference
successful = true;
} catch (Exception e) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,14 @@
package org.apache.accumulo.tserver;

import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import java.util.SortedMap;
import java.util.TreeMap;
import java.util.concurrent.atomic.AtomicReference;
import java.util.stream.Collectors;

import org.apache.accumulo.core.data.TableId;
import org.apache.accumulo.core.dataImpl.KeyExtent;
import org.apache.accumulo.tserver.tablet.Tablet;

Expand All @@ -34,19 +39,45 @@
*/
/**
 * Maintains the set of tablets hosted on this server along with immutable snapshots of that set,
 * both as a single sorted map and grouped by table id. The per table view is computed lazily and
 * cached; any mutation invalidates the cache.
 */
public class OnlineTablets {
  private volatile SortedMap<KeyExtent,Tablet> snapshot = Collections.emptySortedMap();
  // Lazily computed view of snapshot grouped by table id; null means the cache is invalid.
  private final AtomicReference<Map<TableId,SortedMap<KeyExtent,Tablet>>> perTableSnapshot =
      new AtomicReference<>(null);
  private final SortedMap<KeyExtent,Tablet> onlineTablets = new TreeMap<>();

  public synchronized void put(KeyExtent ke, Tablet t) {
    onlineTablets.put(ke, t);
    snapshot = ImmutableSortedMap.copyOf(onlineTablets);
    perTableSnapshot.set(null);
  }

  public synchronized void remove(KeyExtent ke) {
    onlineTablets.remove(ke);
    snapshot = ImmutableSortedMap.copyOf(onlineTablets);
    perTableSnapshot.set(null);
  }

  SortedMap<KeyExtent,Tablet> snapshot() {
    return snapshot;
  }

  /** Groups a snapshot of the online tablets by table id, preserving sorted order per table. */
  private static Map<TableId,SortedMap<KeyExtent,Tablet>>
      createPerTableSnapshot(SortedMap<KeyExtent,Tablet> snapshot) {
    var tables = new HashMap<TableId,Map<KeyExtent,Tablet>>();
    snapshot.forEach(((keyExtent, tablet) -> {
      tables.computeIfAbsent(keyExtent.tableId(), tableId -> new HashMap<>()).put(keyExtent,
          tablet);
    }));
    return tables.entrySet().stream().collect(Collectors.toUnmodifiableMap(Map.Entry::getKey,
        entry -> ImmutableSortedMap.copyOf(entry.getValue())));
  }

  /**
   * Returns the cached per table view of the current snapshot, computing it if a mutation has
   * invalidated the cache. Synchronized so the computation cannot race with put/remove: an
   * unsynchronized caller could read an old snapshot, lose the CPU while a mutation both updates
   * the snapshot and clears the cache, and then install its stale result — leaving a stale per
   * table view cached until the next mutation. Holding the monitor makes the read of snapshot
   * and the cache update atomic with respect to put/remove, which hold the same lock.
   */
  synchronized Map<TableId,SortedMap<KeyExtent,Tablet>> perTableSnapshot() {
    var snap = perTableSnapshot.get();
    if (snap == null) {
      snap = createPerTableSnapshot(snapshot);
      perTableSnapshot.set(snap);
    }
    return snap;
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.locks.Condition;
import java.util.concurrent.locks.ReentrantReadWriteLock;
import java.util.function.Supplier;
import java.util.stream.Collectors;
import java.util.stream.Stream;

Expand All @@ -55,6 +56,7 @@
import org.apache.accumulo.core.conf.AccumuloConfiguration;
import org.apache.accumulo.core.conf.Property;
import org.apache.accumulo.core.conf.SiteConfiguration;
import org.apache.accumulo.core.data.TableId;
import org.apache.accumulo.core.dataImpl.KeyExtent;
import org.apache.accumulo.core.dataImpl.thrift.InitialMultiScan;
import org.apache.accumulo.core.dataImpl.thrift.InitialScan;
Expand Down Expand Up @@ -409,7 +411,12 @@ public void run() {

MetricsInfo metricsInfo = getContext().getMetricsInfo();

scanMetrics = new TabletServerScanMetrics(resourceManager::getOpenFiles);
// The following will read through everything in the cache, which could update the access time
// of every entry; that is not desired for this use case. However, the cache expires after
// write rather than after access, so it is probably OK.
Supplier<Set<TableId>> activeTables = () -> tabletMetadataCache.asMap().keySet().stream()
.map(KeyExtent::tableId).collect(Collectors.toSet());
scanMetrics = new TabletServerScanMetrics(context, activeTables, resourceManager::getOpenFiles);
sessionManager.setZombieCountConsumer(scanMetrics::setZombieScanThreads);
scanServerMetrics = new ScanServerMetrics(tabletMetadataCache);
blockCacheMetrics = new BlockCacheMetrics(resourceManager.getIndexCache(),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,6 @@ public long startUpdate(TInfo tinfo, TCredentials credentials, TDurability tdura
// Make sure user is real
Durability durability = DurabilityImpl.fromThrift(tdurabilty);
security.authenticateUser(credentials, credentials);
server.updateMetrics.addPermissionErrors(0);

UpdateSession us =
new UpdateSession(new TservConstraintEnv(server.getContext(), security, credentials),
Expand Down Expand Up @@ -190,31 +189,31 @@ private void setUpdateTablet(UpdateSession us, KeyExtent keyExtent) {
// not serving tablet, so report all mutations as
// failures
us.failures.put(keyExtent, 0L);
server.updateMetrics.addUnknownTabletErrors(0);
server.updateMetrics.addUnknownTabletErrors(keyExtent.tableId(), 1);
}
} else {
log.warn("Denying access to table {} for user {}", keyExtent.tableId(), us.getUser());
long t2 = System.currentTimeMillis();
us.authTimes.addStat(t2 - t1);
us.currentTablet = null;
us.authFailures.put(keyExtent, SecurityErrorCode.PERMISSION_DENIED);
server.updateMetrics.addPermissionErrors(0);
server.updateMetrics.addPermissionErrors(keyExtent.tableId(), 1);
}
} catch (TableNotFoundException tnfe) {
log.error("Table " + tableId + " not found ", tnfe);
long t2 = System.currentTimeMillis();
us.authTimes.addStat(t2 - t1);
us.currentTablet = null;
us.authFailures.put(keyExtent, SecurityErrorCode.TABLE_DOESNT_EXIST);
server.updateMetrics.addUnknownTabletErrors(0);
server.updateMetrics.addUnknownTabletErrors(keyExtent.tableId(), 1);
} catch (ThriftSecurityException e) {
log.error("Denying permission to check user " + us.getUser() + " with user " + e.getUser(),
e);
long t2 = System.currentTimeMillis();
us.authTimes.addStat(t2 - t1);
us.currentTablet = null;
us.authFailures.put(keyExtent, e.getCode());
server.updateMetrics.addPermissionErrors(0);
server.updateMetrics.addPermissionErrors(keyExtent.tableId(), 1);
}
}

Expand Down Expand Up @@ -292,7 +291,9 @@ private void flush(UpdateSession us) {
List<Mutation> mutations = entry.getValue();
if (!mutations.isEmpty()) {
try {
server.updateMetrics.addMutationArraySize(mutations.size());
// TODO this metric seems very expensive to record because of the update frequency
server.updateMetrics.addMutationArraySize(tablet.getExtent().tableId(),
mutations.size());

PreparedMutations prepared = tablet.prepareMutationsForCommit(us.cenv, mutations);

Expand All @@ -313,7 +314,7 @@ private void flush(UpdateSession us) {

if (!prepared.getViolations().isEmpty()) {
us.violations.add(prepared.getViolations());
server.updateMetrics.addConstraintViolations(0);
server.updateMetrics.addConstraintViolations(tablet.getExtent().tableId(), 1);
}
// Use the size of the original mutation list, regardless of how many mutations
// did not violate constraints.
Expand Down
Loading
Loading