Skip to content

Commit

Permalink
feat: implement metrics (#153)
Browse files Browse the repository at this point in the history
  • Loading branch information
mpetrun5 authored Apr 5, 2023
1 parent 1e14379 commit cea714e
Show file tree
Hide file tree
Showing 15 changed files with 182 additions and 22 deletions.
11 changes: 9 additions & 2 deletions app/app.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ import (
substrateExecutor "github.com/ChainSafe/sygma-relayer/chains/substrate/executor"
substrate_listener "github.com/ChainSafe/sygma-relayer/chains/substrate/listener"
substrate_pallet "github.com/ChainSafe/sygma-relayer/chains/substrate/pallet"
"github.com/ChainSafe/sygma-relayer/metrics"
"github.com/centrifuge/go-substrate-rpc-client/v4/signature"

"github.com/ChainSafe/sygma-relayer/comm/elector"
Expand Down Expand Up @@ -236,11 +237,17 @@ func Run() error {
}
}

go jobs.StartCommunicationHealthCheckJob(host, configuration.RelayerConfig.MpcConfig.CommHealthCheckInterval)
meter, err := opentelemetry.DefaultMeter(context.Background(), configuration.RelayerConfig.OpenTelemetryCollectorURL)
if err != nil {
panic(err)
}
metrics := metrics.NewTelemetry(meter)

go jobs.StartCommunicationHealthCheckJob(host, configuration.RelayerConfig.MpcConfig.CommHealthCheckInterval, metrics)

r := relayer.NewRelayer(
chains,
&opentelemetry.ConsoleTelemetry{},
metrics,
)

errChn := make(chan error)
Expand Down
5 changes: 4 additions & 1 deletion chains/evm/chain.go
Original file line number Diff line number Diff line change
Expand Up @@ -53,11 +53,14 @@ func NewEVMChain(
}
}

func (c *EVMChain) Write(msgs []*message.Message) {
func (c *EVMChain) Write(msgs []*message.Message) error {
err := c.executor.Execute(msgs)
if err != nil {
log.Err(err).Str("domainID", string(c.domainID)).Msgf("error writing messages %+v", msgs)
return err
}

return nil
}

func (c *EVMChain) PollEvents(ctx context.Context, sysErr chan<- error, msgChan chan []*message.Message) {
Expand Down
5 changes: 4 additions & 1 deletion chains/substrate/chain.go
Original file line number Diff line number Diff line change
Expand Up @@ -69,11 +69,14 @@ func (c *SubstrateChain) PollEvents(ctx context.Context, sysErr chan<- error, ms
go c.listener.ListenToEvents(ctx, startBlock, c.DomainID(), *c.blockstore, msgChan)
}

func (c *SubstrateChain) Write(msgs []*message.Message) {
func (c *SubstrateChain) Write(msgs []*message.Message) error {
err := c.executor.Execute(msgs)
if err != nil {
c.logger.Err(err).Msgf("error writing messages %+v on network %d", msgs, c.DomainID())
return err
}

return nil
}

func (c *SubstrateChain) DomainID() uint8 {
Expand Down
17 changes: 17 additions & 0 deletions docs/general/Metrics.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Metrics
Metrics are implemented via the OpenTelemetry [stack](https://opentelemetry.io/) and exported to the Opentelemetry [collector](https://opentelemetry.io/docs/collector/) which then be configured to export to supported metrics tools like Datadog.

## Exported metrics
The following metrics are exported:
```
chainbridge.DepositEventCount (counter) - count of indexed deposits
chainbridge.ExecutionErrorCount (counter) - count of executions that failed
chainbridge.ExecutionLatencyPerRoute (histogram) - latency between indexing event and executing it per route
chainbridge.ExecutionLatency (histogram) - latency between indexing event and executing it across all routes
sygma.TotalRelayers (gauge) - number of relayers currently in the subset for MPC
sygma.availableRelayers (gauge) - number of currently available relayers from the subset
```

## Env variables
- SYG_RELAYER_OPENTELEMETERYCOLLECTORURL - url of the opentelemetry collector application that collects metrics
15 changes: 12 additions & 3 deletions example/app/app.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ import (

"github.com/ChainSafe/chainbridge-core/crypto/secp256k1"
"github.com/ChainSafe/chainbridge-core/lvldb"
"github.com/ChainSafe/chainbridge-core/opentelemetry"

"github.com/ethereum/go-ethereum/common"
"github.com/libp2p/go-libp2p/core/crypto"
Expand All @@ -29,7 +30,6 @@ import (
coreListener "github.com/ChainSafe/chainbridge-core/chains/evm/listener"
"github.com/ChainSafe/chainbridge-core/e2e/dummy"
"github.com/ChainSafe/chainbridge-core/flags"
"github.com/ChainSafe/chainbridge-core/opentelemetry"
"github.com/ChainSafe/chainbridge-core/relayer"
"github.com/ChainSafe/chainbridge-core/store"

Expand All @@ -40,6 +40,8 @@ import (
substrateExecutor "github.com/ChainSafe/sygma-relayer/chains/substrate/executor"
substrate_listener "github.com/ChainSafe/sygma-relayer/chains/substrate/listener"
substrate_pallet "github.com/ChainSafe/sygma-relayer/chains/substrate/pallet"
"github.com/ChainSafe/sygma-relayer/jobs"
"github.com/ChainSafe/sygma-relayer/metrics"

"github.com/ChainSafe/sygma-relayer/chains/evm/calls/contracts/bridge"
"github.com/ChainSafe/sygma-relayer/chains/evm/calls/events"
Expand Down Expand Up @@ -204,16 +206,23 @@ func Run() error {
substrateChain := substrate.NewSubstrateChain(substrateListener, nil, blockstore, config, executor)

chains = append(chains, substrateChain)

}
default:
panic(fmt.Errorf("type '%s' not recognized", chainConfig["type"]))
}
}

meter, err := opentelemetry.DefaultMeter(context.Background(), configuration.RelayerConfig.OpenTelemetryCollectorURL)
if err != nil {
panic(err)
}
metrics := metrics.NewTelemetry(meter)

go jobs.StartCommunicationHealthCheckJob(host, configuration.RelayerConfig.MpcConfig.CommHealthCheckInterval, metrics)

r := relayer.NewRelayer(
chains,
&opentelemetry.ConsoleTelemetry{},
metrics,
)

errChn := make(chan error)
Expand Down
3 changes: 2 additions & 1 deletion example/cfg/config_evm-evm_1.json
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@
"encryptionKey": "test-enc-key"
},
"commHealthCheckInterval": "24h"
}
},
"opentelemetryCollectorURL": "http://otel-collector:4318"
},
"domains": [
{
Expand Down
3 changes: 2 additions & 1 deletion example/cfg/config_evm-evm_2.json
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@
"encryptionKey": "test-enc-key"
},
"commHealthCheckInterval": "24h"
}
},
"opentelemetryCollectorURL": "http://otel-collector:4318"
},
"domains": [
{
Expand Down
3 changes: 2 additions & 1 deletion example/cfg/config_evm-evm_3.json
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@
"encryptionKey": "test-enc-key"
},
"commHealthCheckInterval": "24h"
}
},
"opentelemetryCollectorURL": "http://otel-collector:4318"
},
"domains": [
{
Expand Down
20 changes: 20 additions & 0 deletions example/cfg/otel-collector-config.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
receivers:
otlp:
protocols:
grpc:
http:

exporters:
prometheus:
endpoint: 0.0.0.0:8889
namespace: default

extensions:
health_check:

service:
extensions: [health_check]
pipelines:
metrics:
exporters: [prometheus]
receivers: [otlp]
10 changes: 10 additions & 0 deletions example/docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -83,3 +83,13 @@ services:
- APP_MODE=debug
- IDENTITY_KEY=71aec985a12cfcbabc7969845215ceac56038e87477c8337e373eab7b15796cf
- IDENTITY_KEY_TYPE=secp256k1


otel-collector:
container_name: otel-collector
image: otel/opentelemetry-collector
command: ["--config=/etc/otel-collector-config.yml"]
volumes:
- ./cfg/otel-collector-config.yml:/etc/otel-collector-config.yml
ports:
- "8889:8889" # Prometheus exporter metrics
9 changes: 4 additions & 5 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ module github.com/ChainSafe/sygma-relayer
go 1.18

require (
github.com/ChainSafe/chainbridge-core v1.2.0
github.com/ChainSafe/chainbridge-core v1.2.1-0.20230403091503-2ce735717171
github.com/binance-chain/tss-lib v0.0.0-00010101000000-000000000000
github.com/centrifuge/go-substrate-rpc-client v2.0.0+incompatible
github.com/centrifuge/go-substrate-rpc-client/v4 v4.0.13-0.20230328172517-bf8a95095d4d
Expand All @@ -19,6 +19,8 @@ require (
github.com/spf13/cobra v1.2.1
github.com/spf13/viper v1.9.0
github.com/stretchr/testify v1.8.0
go.opentelemetry.io/otel v1.0.1
go.opentelemetry.io/otel/metric v0.24.0
golang.org/x/exp v0.0.0-20220916125017-b168a2c6b86b
)

Expand All @@ -29,9 +31,8 @@ require (
github.com/agl/ed25519 v0.0.0-20200305024217-f36fc4b53d43 // indirect
github.com/benbjohnson/clock v1.3.0 // indirect
github.com/beorn7/perks v1.0.1 // indirect
github.com/btcsuite/btcd v0.22.1 // indirect
github.com/btcsuite/btcd v0.22.0-beta.0.20220201204404-81fbd9b67e54 // indirect
github.com/btcsuite/btcd/btcec/v2 v2.2.0 // indirect
github.com/btcsuite/btcd/chaincfg/chainhash v1.0.1 // indirect
github.com/btcsuite/btcutil v1.0.3-0.20211129182920-9c4bbabe7acd // indirect
github.com/cespare/xxhash/v2 v2.1.2 // indirect
github.com/containerd/cgroups v1.0.4 // indirect
Expand Down Expand Up @@ -136,11 +137,9 @@ require (
github.com/tklauser/go-sysconf v0.3.5 // indirect
github.com/tklauser/numcpus v0.2.2 // indirect
github.com/vedhavyas/go-subkey v1.0.3 // indirect
go.opentelemetry.io/otel v1.0.1 // indirect
go.opentelemetry.io/otel/exporters/otlp/otlpmetric v0.24.0 // indirect
go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp v0.24.0 // indirect
go.opentelemetry.io/otel/internal/metric v0.24.0 // indirect
go.opentelemetry.io/otel/metric v0.24.0 // indirect
go.opentelemetry.io/otel/sdk v1.0.1 // indirect
go.opentelemetry.io/otel/sdk/export/metric v0.24.0 // indirect
go.opentelemetry.io/otel/sdk/metric v0.24.0 // indirect
Expand Down
10 changes: 4 additions & 6 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -54,8 +54,8 @@ filippo.io/edwards25519 v1.0.0-rc.1/go.mod h1:N1IkdkCkiLB6tki+MYJoSx2JTY9NUlxZE7
git.apache.org/thrift.git v0.0.0-20180902110319-2566ecd5d999/go.mod h1:fPE2ZNJGynbRyZ4dJvy6G277gSllfV2HJqblrnkyeyg=
github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU=
github.com/BurntSushi/xgb v0.0.0-20160522181843-27f122750802/go.mod h1:IVnqGOEym/WlBOVXweHU+Q+/VP0lqqI8lqeDx9IjBqo=
github.com/ChainSafe/chainbridge-core v1.2.0 h1:5gKxTT4PcmjelYHNGJC+pAv0RhfdsCrCoPv2jd1yZoo=
github.com/ChainSafe/chainbridge-core v1.2.0/go.mod h1:j51yNYVujQPo0FkFj361cnNX0YjlBM+Qq9kvAudiKfU=
github.com/ChainSafe/chainbridge-core v1.2.1-0.20230403091503-2ce735717171 h1:ZYtNHBl0+K4TZOqpoKkX4SSWmCPUM5knnFkbc8UCCoc=
github.com/ChainSafe/chainbridge-core v1.2.1-0.20230403091503-2ce735717171/go.mod h1:j51yNYVujQPo0FkFj361cnNX0YjlBM+Qq9kvAudiKfU=
github.com/ChainSafe/go-schnorrkel v1.0.0 h1:3aDA67lAykLaG1y3AOjs88dMxC88PgUuHRrLeDnvGIM=
github.com/ChainSafe/go-schnorrkel v1.0.0/go.mod h1:dpzHYVxLZcp8pjlV+O+UR8K0Hp/z7vcchBSbMBEhCw4=
github.com/OneOfOne/xxhash v1.2.2/go.mod h1:HSdplMjZKSmBqAxg5vPj2TmRDmfkzw+cTzAElWljhcU=
Expand Down Expand Up @@ -87,12 +87,10 @@ github.com/binance-chain/edwards25519 v0.0.0-20200305024217-f36fc4b53d43/go.mod
github.com/bketelsen/crypt v0.0.4/go.mod h1:aI6NrJ0pMGgvZKL1iVgXLnfIFJtfV+bKCoqOes/6LfM=
github.com/bradfitz/go-smtpd v0.0.0-20170404230938-deb6d6237625/go.mod h1:HYsPBTaaSFSlLx/70C2HPIMNZpVV8+vt/A+FMnYP11g=
github.com/btcsuite/btcd v0.20.1-beta/go.mod h1:wVuoA8VJLEcwgqHBwHmzLRazpKxTv13Px/pDuV7OomQ=
github.com/btcsuite/btcd v0.22.1 h1:CnwP9LM/M9xuRrGSCGeMVs9iv09uMqwsVX7EeIpgV2c=
github.com/btcsuite/btcd v0.22.1/go.mod h1:wqgTSL29+50LRkmOVknEdmt8ZojIzhuWvgu/iptuN7Y=
github.com/btcsuite/btcd v0.22.0-beta.0.20220201204404-81fbd9b67e54 h1:khJx6kvXopB224O05cs6iwmX/zAh7RlkMf8MiPXVS1I=
github.com/btcsuite/btcd v0.22.0-beta.0.20220201204404-81fbd9b67e54/go.mod h1:vkwesBkYQtKXFYQYi9PyahtopbX53Tvk/O/qp2WI6Gk=
github.com/btcsuite/btcd/btcec/v2 v2.2.0 h1:fzn1qaOt32TuLjFlkzYSsBC35Q3KUjT1SwPxiMSCF5k=
github.com/btcsuite/btcd/btcec/v2 v2.2.0/go.mod h1:U7MHm051Al6XmscBQ0BoNydpOTsFAn707034b5nY8zU=
github.com/btcsuite/btcd/chaincfg/chainhash v1.0.1 h1:q0rUy8C/TYNBQS1+CGKw68tLOFYSNEs0TFnxxnS9+4U=
github.com/btcsuite/btcd/chaincfg/chainhash v1.0.1/go.mod h1:7SFka0XMvUgj3hfZtydOrQY2mwhPclbT2snogU7SQQc=
github.com/btcsuite/btclog v0.0.0-20170628155309-84c8d2346e9f/go.mod h1:TdznJufoqS23FtqVCzL0ZqgP5MqXbb4fg/WgDys70nA=
github.com/btcsuite/btcutil v0.0.0-20190425235716-9e5f4b9a998d/go.mod h1:+5NJ2+qvTyV9exUAL/rxXi3DcLg2Ts+ymUAY5y4NvMg=
github.com/btcsuite/btcutil v1.0.3-0.20211129182920-9c4bbabe7acd h1:vAwk2PCYxzUUGAXXtw66PyY2IMCwWBnm8GR5aLIxS3Q=
Expand Down
14 changes: 13 additions & 1 deletion jobs/jobs.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,17 +6,29 @@ import (
"github.com/ChainSafe/sygma-relayer/comm"
"github.com/ChainSafe/sygma-relayer/comm/p2p"
"github.com/libp2p/go-libp2p/core/host"
"github.com/libp2p/go-libp2p/core/peer"
"github.com/rs/zerolog/log"
)

func StartCommunicationHealthCheckJob(h host.Host, interval time.Duration) {
type Metrics interface {
TrackRelayerStatus(unavailable peer.IDSlice, all peer.IDSlice)
}

func StartCommunicationHealthCheckJob(h host.Host, interval time.Duration, metrics Metrics) {
healthComm := p2p.NewCommunication(h, "p2p/health")
for {
time.Sleep(interval)
log.Info().Msg("Starting communication health check")

all := h.Peerstore().Peers()
unavailable := make(peer.IDSlice, 0)

communicationErrors := comm.ExecuteCommHealthCheck(healthComm, h.Peerstore().Peers())
for _, cerr := range communicationErrors {
log.Err(cerr).Msg("communication error")
unavailable = append(unavailable, cerr.Peer)
}

metrics.TrackRelayerStatus(unavailable, all)
}
}
44 changes: 44 additions & 0 deletions metrics/metrics.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
package metrics

import (
"context"

"github.com/ChainSafe/chainbridge-core/opentelemetry"
"go.opentelemetry.io/otel/metric"
)

type Metrics struct {
opentelemetry.ChainbridgeMetrics
DepositErrorRate metric.Int64Counter
TotalRelayers metric.Int64GaugeObserver
AvailableRelayers metric.Int64GaugeObserver
ExecutionLatency metric.Int64Histogram

TotalRelayerCount *int64
AvailableRelayerCount *int64
}

// NewMetrics creates an instance of metrics
func NewMetrics(meter metric.Meter) *Metrics {
totalRelayerCount := new(int64)
availableRelayerCount := new(int64)
return &Metrics{
ChainbridgeMetrics: *opentelemetry.NewChainbridgeMetrics(meter),
TotalRelayers: metric.Must(meter).NewInt64GaugeObserver(
"sygma.TotalRelayers",
func(ctx context.Context, result metric.Int64ObserverResult) {
result.Observe(*totalRelayerCount)
},
metric.WithDescription("Total number of relayers currently in the subset"),
),
AvailableRelayers: metric.Must(meter).NewInt64GaugeObserver(
"sygma.AvailableRelayers",
func(ctx context.Context, result metric.Int64ObserverResult) {
result.Observe(*availableRelayerCount)
},
metric.WithDescription("Available number of relayers currently in the subset"),
),
TotalRelayerCount: totalRelayerCount,
AvailableRelayerCount: availableRelayerCount,
}
}
35 changes: 35 additions & 0 deletions metrics/opentelemetry.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
package metrics

import (
"context"

"github.com/ChainSafe/chainbridge-core/opentelemetry"
"github.com/libp2p/go-libp2p/core/peer"
"go.opentelemetry.io/otel/attribute"
"go.opentelemetry.io/otel/metric"
)

type Telemetry struct {
opentelemetry.OpenTelemetry
metrics *Metrics

meter metric.Meter
}

// NewTelemetry initializes OpenTelementry metrics
func NewTelemetry(meter metric.Meter) *Telemetry {
coreTelemetry := opentelemetry.NewOpenTelemetry(meter)
metrics := NewMetrics(meter)

return &Telemetry{
OpenTelemetry: *coreTelemetry,
metrics: metrics,
meter: meter,
}
}

func (t *Telemetry) TrackRelayerStatus(unavailable peer.IDSlice, all peer.IDSlice) {
*t.metrics.TotalRelayerCount = int64(len(all))
*t.metrics.AvailableRelayerCount = int64(len(all) - len(unavailable))
t.meter.RecordBatch(context.Background(), []attribute.KeyValue{})
}

0 comments on commit cea714e

Please sign in to comment.