Skip to content

Commit

Permalink
Issue 397 Metric reporter to log debug messages if configured number … (
Browse files Browse the repository at this point in the history
  • Loading branch information
manmagic3 authored Jun 17, 2024
1 parent e0668f7 commit ec32f93
Show file tree
Hide file tree
Showing 16 changed files with 673 additions and 8 deletions.
2 changes: 2 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
# Changes
## Version 5.0.5

* Metric status logger for troubleshooting - Issue #397

## Version 5.0.4

* ecChronos will break if repair interval is shorter than the initial delay - Issue #667
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
import com.ericsson.bss.cassandra.ecchronos.core.JmxProxyFactoryImpl;
import com.ericsson.bss.cassandra.ecchronos.core.TableStorageStates;
import com.ericsson.bss.cassandra.ecchronos.core.TableStorageStatesImpl;
import com.ericsson.bss.cassandra.ecchronos.core.metrics.MetricInspector;
import com.ericsson.bss.cassandra.ecchronos.core.metrics.TableRepairMetrics;
import com.ericsson.bss.cassandra.ecchronos.core.metrics.TableRepairMetricsImpl;
import com.ericsson.bss.cassandra.ecchronos.core.scheduling.RunPolicy;
Expand Down Expand Up @@ -62,6 +63,8 @@ public class ECChronosInternals implements Closeable
private final CASLockFactory myLockFactory;
private final CassandraMetrics myCassandraMetrics;

private final MetricInspector myMetricInspector;

public ECChronosInternals(final Config configuration,
final NativeConnectionProvider nativeConnectionProvider,
final JmxConnectionProvider jmxConnectionProvider,
Expand Down Expand Up @@ -108,11 +111,20 @@ public ECChronosInternals(final Config configuration,
.withTableStorageStates(myTableStorageStatesImpl)
.withMeterRegistry(meterRegistry)
.build();

myMetricInspector = new MetricInspector(meterRegistry,
configuration.getStatisticsConfig().getRepairFailuresCount(),
configuration.getStatisticsConfig().getRepairFailuresTimeWindow()
.getInterval(TimeUnit.MINUTES),
configuration.getStatisticsConfig().getTriggerIntervalForMetricInspection()
.getInterval(TimeUnit.MILLISECONDS));
myMetricInspector.startInspection();
}
else
{
myTableStorageStatesImpl = null;
myTableRepairMetricsImpl = null;
myMetricInspector = null;
}
myScheduleManagerImpl = ScheduleManagerImpl.builder()
.withLockFactory(myLockFactory)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ public final void setStatisticsConfig(final StatisticsConfig statisticsConfig)
if (statisticsConfig != null)
{
myStatisticsConfig = statisticsConfig;
myStatisticsConfig.validate();
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,24 +14,32 @@
*/
package com.ericsson.bss.cassandra.ecchronos.application.config.metrics;

import com.ericsson.bss.cassandra.ecchronos.application.config.Interval;
import com.fasterxml.jackson.annotation.JsonProperty;

import java.io.File;
import java.util.concurrent.TimeUnit;

public class StatisticsConfig
{
// Default repair-failure tracking window; combined with TimeUnit.MINUTES below.
private static final int DEFAULT_FAILURES_TIME_WINDOW_IN_MINUTES = 30;
// Default trigger interval for metric inspection; combined with TimeUnit.SECONDS below.
private static final int DEFAULT_TRIGGER_INTERVAL_FOR_METRIC_INSPECTION_IN_SECONDS = 5;
// Default value of the "repair_failures_count" setting.
private static final int DEFAULT_REPAIR_FAILURES_COUNT = 5;
// Backing field for the "enabled" property; defaults to true.
private boolean myIsEnabled = true;

// Defaults to a "statistics" directory relative to the working directory.
private File myOutputDirectory = new File("./statistics");
private ReportingConfigs myReportingConfigs = new ReportingConfigs();
// Empty prefix by default.
private String myMetricsPrefix = "";
// Backing field for the "repair_failures_count" property.
private int myRepairFailuresCount = DEFAULT_REPAIR_FAILURES_COUNT;
// Backing field for the "repair_failures_time_window" property (30 minutes by default).
private Interval myRepairFailuresTimeWindow = new Interval(DEFAULT_FAILURES_TIME_WINDOW_IN_MINUTES,
TimeUnit.MINUTES);
// Backing field for the "trigger_interval_for_metric_inspection" property (5 seconds by default).
private Interval myTriggerIntervalForMetricInspection = new
Interval(DEFAULT_TRIGGER_INTERVAL_FOR_METRIC_INSPECTION_IN_SECONDS, TimeUnit.SECONDS);

/**
 * Tells whether statistics are enabled.
 *
 * The result is the configured "enabled" flag only; it does not depend on
 * which reporters (file, JMX, HTTP) are switched on.
 *
 * @return {@code true} if statistics are enabled, {@code false} otherwise.
 */
@JsonProperty("enabled")
public final boolean isEnabled()
{
    return myIsEnabled;
}

@JsonProperty("directory")
Expand All @@ -52,6 +60,24 @@ public final String getMetricsPrefix()
return myMetricsPrefix;
}

/**
 * Returns the configured repair failures count.
 *
 * @return the value of the "repair_failures_count" property.
 */
@JsonProperty("repair_failures_count")
public final int getRepairFailuresCount()
{
    return this.myRepairFailuresCount;
}

/**
 * Returns the configured repair failures time window.
 *
 * @return the value of the "repair_failures_time_window" property.
 */
@JsonProperty("repair_failures_time_window")
public final Interval getRepairFailuresTimeWindow()
{
    return this.myRepairFailuresTimeWindow;
}

/**
 * Returns the configured trigger interval for metric inspection.
 *
 * @return the value of the "trigger_interval_for_metric_inspection" property.
 */
@JsonProperty("trigger_interval_for_metric_inspection")
public final Interval getTriggerIntervalForMetricInspection()
{
    return this.myTriggerIntervalForMetricInspection;
}

@JsonProperty("enabled")
public final void setEnabled(final boolean enabled)
{
Expand All @@ -75,4 +101,37 @@ public final void setMetricsPrefix(final String metricsPrefix)
{
myMetricsPrefix = metricsPrefix;
}

/**
 * Stores the repair failures count read from the "repair_failures_count" property.
 *
 * @param repairFailuresCount the new repair failures count.
 */
@JsonProperty("repair_failures_count")
public final void setRepairFailuresCount(final int repairFailuresCount)
{
    this.myRepairFailuresCount = repairFailuresCount;
}

/**
 * Stores the repair failures time window read from the
 * "repair_failures_time_window" property.
 *
 * A {@code null} value is ignored and the current window is kept, so that
 * later reads of the window (for example in {@link #validate()}) never
 * dereference {@code null}.
 *
 * @param repairFailuresTimeWindow the new time window, or {@code null} to keep the current one.
 */
@JsonProperty("repair_failures_time_window")
public final void setRepairFailuresTimeWindow(final Interval repairFailuresTimeWindow)
{
    if (repairFailuresTimeWindow != null)
    {
        myRepairFailuresTimeWindow = repairFailuresTimeWindow;
    }
}
/**
 * Stores the trigger interval read from the
 * "trigger_interval_for_metric_inspection" property.
 *
 * A {@code null} value is ignored and the current interval is kept, so that
 * later reads of the interval (for example in {@link #validate()}) never
 * dereference {@code null}.
 *
 * @param triggerIntervalForStatusLogger the new trigger interval, or {@code null} to keep the current one.
 */
@JsonProperty("trigger_interval_for_metric_inspection")
public final void setTriggerIntervalForMetricInspection(final Interval triggerIntervalForStatusLogger)
{
    if (triggerIntervalForStatusLogger != null)
    {
        myTriggerIntervalForMetricInspection = triggerIntervalForStatusLogger;
    }
}

/**
 * Checks that the trigger interval for metric inspection is strictly shorter
 * than the repair failures time window. Both values are compared in seconds.
 *
 * @throws IllegalArgumentException if the trigger interval is greater than or
 *                                  equal to the repair failures time window.
 */
public final void validate()
{
    final long windowSeconds = getRepairFailuresTimeWindow().getInterval(TimeUnit.SECONDS);
    final long triggerSeconds = getTriggerIntervalForMetricInspection().getInterval(TimeUnit.SECONDS);

    // Valid configuration: nothing more to do.
    if (triggerSeconds < windowSeconds)
    {
        return;
    }

    throw new IllegalArgumentException(String.format("Repair window time must be greater than trigger interval."
            + " Current repair window time: %d seconds,"
            + " trigger interval for metric inspection: %d seconds",
            windowSeconds, triggerSeconds));
}

}

Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
import io.micrometer.core.instrument.Clock;
import io.micrometer.core.instrument.composite.CompositeMeterRegistry;
import io.micrometer.core.instrument.config.MeterFilter;
import io.micrometer.core.instrument.simple.SimpleMeterRegistry;
import io.micrometer.jmx.JmxConfig;
import io.micrometer.jmx.JmxMeterRegistry;
import io.micrometer.prometheus.PrometheusConfig;
Expand Down Expand Up @@ -66,6 +67,7 @@ public MetricBeans(final Config config)
createPrometheusMeterRegistry(metricConfig);
}
}
createStatusLoggerMeterRegistry();
}

private void createJmxMeterRegistry(final StatisticsConfig metricConfig)
Expand Down Expand Up @@ -103,6 +105,12 @@ private void createPrometheusMeterRegistry(final StatisticsConfig metricConfig)
myCompositeMeterRegistry.add(myPrometheusMeterRegistry);
}

/**
 * Registers a new {@link SimpleMeterRegistry} with the composite meter registry.
 */
private void createStatusLoggerMeterRegistry()
{
    myCompositeMeterRegistry.add(new SimpleMeterRegistry());
}

@Bean
public PrometheusMeterRegistry prometheusMeterRegistry()
{
Expand Down
18 changes: 18 additions & 0 deletions application/src/main/resources/ecc.yml
Original file line number Diff line number Diff line change
Expand Up @@ -276,6 +276,24 @@ statistics:
## The prefix cannot start or end with a dot or any other path separator.
##
prefix: ''
##
## Number of repair failures after which the status logger writes metrics to the debug log.
## The status logger is triggered once this number of failures is exceeded within the time window configured below.
##
repair_failures_count: 5
##
## Time window over which repair failures on the node are counted when deciding whether to trigger status logger messages in the debug log.
##
repair_failures_time_window:
time: 30
unit: minutes
##
## Trigger interval for metric inspection.
## This interval must always be shorter than repair_failures_time_window.
##
trigger_interval_for_metric_inspection:
time: 5
unit: seconds

lock_factory:
cas:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,10 @@ public void testAllValues() throws Exception
assertThat(httpReportingConfig.getExcludedMetrics()).hasSize(1);
assertThat(httpReportingConfig.getExcludedMetrics()).contains(expectedHttpExcludedMetric);

assertThat(statisticsConfig.getRepairFailuresTimeWindow().getInterval(TimeUnit.MINUTES)).isEqualTo(5);
assertThat(statisticsConfig.getTriggerIntervalForMetricInspection().getInterval(TimeUnit.SECONDS)).isEqualTo(30);
assertThat(statisticsConfig.getRepairFailuresCount()).isEqualTo(5);

LockFactoryConfig lockFactoryConfig = config.getLockFactory();
assertThat(lockFactoryConfig.getCasLockFactoryConfig().getKeyspaceName()).isEqualTo("ecc");
assertThat(lockFactoryConfig.getCasLockFactoryConfig().getConsistencySerial().equals(ConsistencyType.LOCAL)).isTrue();
Expand Down Expand Up @@ -370,7 +374,11 @@ public void testDefault() throws Exception
assertThat(httpReportingConfig.isEnabled()).isTrue();
assertThat(httpReportingConfig.getExcludedMetrics()).isEmpty();

LockFactoryConfig lockFactoryConfig = config.getLockFactory();
assertThat(statisticsConfig.getRepairFailuresTimeWindow().getInterval(TimeUnit.MINUTES)).isEqualTo(30);
assertThat(statisticsConfig.getTriggerIntervalForMetricInspection().getInterval(TimeUnit.SECONDS)).isEqualTo(5);
assertThat(statisticsConfig.getRepairFailuresCount()).isEqualTo(5);

LockFactoryConfig lockFactoryConfig = config.getLockFactory();
assertThat(lockFactoryConfig.getCasLockFactoryConfig().getKeyspaceName()).isEqualTo("ecchronos");
assertThat(lockFactoryConfig.getCasLockFactoryConfig().getConsistencySerial().equals(ConsistencyType.DEFAULT)).isTrue();

Expand Down Expand Up @@ -408,7 +416,7 @@ public void testWarnIntervalLongerThanError()
}

@Test
public void testStatisticsDisabledIfNoReporting() throws Exception
public void testStatisticsEnabledIfNoReporting() throws Exception
{
ClassLoader classLoader = Thread.currentThread().getContextClassLoader();
File file = new File(classLoader.getResource("all_reporting_disabled.yml").getFile());
Expand All @@ -421,7 +429,18 @@ public void testStatisticsDisabledIfNoReporting() throws Exception
assertThat(statisticsConfig.getReportingConfigs().isJmxReportingEnabled()).isFalse();
assertThat(statisticsConfig.getReportingConfigs().isFileReportingEnabled()).isFalse();
assertThat(statisticsConfig.getReportingConfigs().isHttpReportingEnabled()).isFalse();
assertThat(statisticsConfig.isEnabled()).isFalse();
assertThat(statisticsConfig.isEnabled()).isTrue();
}

/**
 * Reading a configuration whose trigger interval for metric inspection is
 * larger than the repair failures time window must fail with a
 * {@link JsonMappingException}.
 */
@Test
public void testTriggerIntervalBiggerThanRepairFailuresWindow()
{
    ObjectMapper yamlMapper = new ObjectMapper(new YAMLFactory());
    ClassLoader contextLoader = Thread.currentThread().getContextClassLoader();
    File invalidConfig = new File(
            contextLoader.getResource("trigger_interval_bigger_than_repair_failures_window.yml").getFile());

    assertThatExceptionOfType(JsonMappingException.class)
            .isThrownBy(() -> yamlMapper.readValue(invalidConfig, Config.class));
}

public static class TestNativeConnectionProvider implements NativeConnectionProvider
Expand Down
7 changes: 7 additions & 0 deletions application/src/test/resources/all_set.yml
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,13 @@ statistics:
excludedMetrics:
- name: '.*httpExcluded'
prefix: "unittest"
repair_failures_count: 5
repair_failures_time_window:
time: 5
unit: minutes
trigger_interval_for_metric_inspection:
time: 30
unit: seconds

lock_factory:
cas:
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
#
# Copyright 2024 Telefonaktiebolaget LM Ericsson
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

statistics:
repair_failures_time_window:
time: 30
unit: seconds
trigger_interval_for_metric_inspection:
time: 5
unit: minutes
Loading

0 comments on commit ec32f93

Please sign in to comment.