Skip to content

Commit

Permalink
feat: support the failure/success threshold for probe (megaease#233)
Browse files Browse the repository at this point in the history
* feat: support the failure/success threshold for probe

* add more unit test

* fix the typo

* fix the jsonschema comma problem

* Apply suggestions from code review

Co-authored-by: Pantelis Roditis <[email protected]>

* rewording the log message

Co-authored-by: Pantelis Roditis <[email protected]>
  • Loading branch information
haoel and proditis authored Oct 19, 2022
1 parent 6983ee5 commit 240c658
Show file tree
Hide file tree
Showing 16 changed files with 436 additions and 42 deletions.
5 changes: 3 additions & 2 deletions cmd/easeprobe/probe.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,9 @@ import (

func configProbers(probers []probe.Prober) []probe.Prober {
gProbeConf := global.ProbeSettings{
Interval: conf.Get().Settings.Probe.Interval,
Timeout: conf.Get().Settings.Probe.Timeout,
Interval: conf.Get().Settings.Probe.Interval,
Timeout: conf.Get().Settings.Probe.Timeout,
StatusChangeThresholdSettings: conf.Get().Settings.Probe.StatusChangeThresholdSettings,
}
log.Debugf("Global Probe Configuration: %+v", gProbeConf)

Expand Down
5 changes: 3 additions & 2 deletions conf/conf.go
Original file line number Diff line number Diff line change
Expand Up @@ -94,8 +94,9 @@ type Notify struct {

// Probe is the settings of prober
type Probe struct {
Interval time.Duration `yaml:"interval" json:"interval,omitempty" jsonschema:"type=string,format=duration,title=Probe Interval,description=the interval of probe,default=1m"`
Timeout time.Duration `yaml:"timeout" json:"timeout,omitempty" jsonschema:"type=string,format=duration,title=Probe Timeout,description=the timeout of probe,default=30s"`
Interval time.Duration `yaml:"interval" json:"interval,omitempty" jsonschema:"type=string,format=duration,title=Probe Interval,description=the interval of probe,default=1m"`
Timeout time.Duration `yaml:"timeout" json:"timeout,omitempty" jsonschema:"type=string,format=duration,title=Probe Timeout,description=the timeout of probe,default=30s"`
global.StatusChangeThresholdSettings `yaml:",inline" json:",inline"`
}

// SLAReport is the settings for SLA report
Expand Down
6 changes: 5 additions & 1 deletion docs/Manual.md
Original file line number Diff line number Diff line change
Expand Up @@ -627,7 +627,8 @@ The following example configurations illustrate the EaseProbe supported features

- `timeout` - the maximum time to wait for the probe to complete. default: `30s`.
- `interval` - the interval time to run the probe. default: `1m`.

- `failure` - the number of consecutive failed probes needed to mark the status as down. default: `1`.
- `success` - the number of consecutive successful probes needed to mark the status as up. default: `1`.

## 7.1 HTTP Probe Configuration

Expand Down Expand Up @@ -1309,6 +1310,9 @@ settings:
probe:
timeout: 30s # the time out for all probes
interval: 1m # probe every minute for all probes
failure: 2 # number of consecutive failed probes needed to determine the status down, default: 1
success: 1 # number of consecutive successful probes needed to determine the status up, default: 1
# easeprobe program running log file.
log:
Expand Down
2 changes: 2 additions & 0 deletions global/global.go
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,8 @@ const (
DefaultTimeOut = time.Second * 30
// DefaultChannelName is the default wide channel name
DefaultChannelName = "__EaseProbe_Channel__"
// DefaultStatusChangeThresholdSetting is the threshold of status change
DefaultStatusChangeThresholdSetting = 1
)

const (
Expand Down
17 changes: 17 additions & 0 deletions global/probe.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,19 @@ package global

import "time"

// StatusChangeThresholdSettings is the settings for probe threshold.
//
// It holds how many consecutive probe results of the same kind are needed
// before the reported status of a probe changes. The struct is embedded
// inline in both the global probe settings and the per-probe options, so the
// `failure` and `success` keys appear directly next to `timeout`/`interval`
// in the configuration.
type StatusChangeThresholdSettings struct {
	// Failure is the number of consecutive failed probes needed to determine
	// the status down, such as 2 or 5. The schema default is 1.
	Failure int `yaml:"failure,omitempty" json:"failure,omitempty" jsonschema:"title=Failure Threshold,description=the failures threshold to change the status such as 3,default=1"`
	// Success is the number of consecutive successful probes needed to
	// determine the status up, such as 2 or 5. The schema default is 1.
	Success int `yaml:"success,omitempty" json:"success,omitempty" jsonschema:"title=Success Threshold,description=the success threshold to change the status such as 2,default=1"`
}

// ProbeSettings is the global probe setting.
//
// The values here act as the fallback for individual probes: the Normalize*
// methods combine a probe's own value with the corresponding global one.
type ProbeSettings struct {
	// Interval is the global probe interval, consumed by NormalizeInterval.
	Interval time.Duration
	// Timeout is the global probe timeout, consumed by NormalizeTimeOut.
	Timeout time.Duration
	// StatusChangeThresholdSettings carries the global failure/success
	// thresholds, consumed by NormalizeThreshold.
	StatusChangeThresholdSettings
}

// NormalizeTimeOut return a normalized timeout value
Expand All @@ -34,3 +43,11 @@ func (p *ProbeSettings) NormalizeTimeOut(t time.Duration) time.Duration {
func (p *ProbeSettings) NormalizeInterval(t time.Duration) time.Duration {
return normalize(p.Interval, t, 0, DefaultProbeInterval)
}

// NormalizeThreshold returns the effective failure/success thresholds for a
// probe whose own (local) thresholds are t.
//
// Each field is resolved independently through normalize, using the global
// value stored in p, the local value from t, 0 as the validity bound and
// DefaultStatusChangeThresholdSetting as the final fallback.
func (p *ProbeSettings) NormalizeThreshold(t StatusChangeThresholdSettings) StatusChangeThresholdSettings {
	var resolved StatusChangeThresholdSettings
	resolved.Failure = normalize(p.Failure, t.Failure, 0, DefaultStatusChangeThresholdSetting)
	resolved.Success = normalize(p.Success, t.Success, 0, DefaultStatusChangeThresholdSetting)
	return resolved
}
62 changes: 62 additions & 0 deletions global/probe_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,5 +46,67 @@ func TestProbe(t *testing.T) {
p.Interval = 20
r = p.NormalizeInterval(0)
assert.Equal(t, time.Duration(20), r)
}

// TestStatusChangeThresholdSettings checks how NormalizeThreshold combines
// the global thresholds with a probe's local thresholds.
func TestStatusChangeThresholdSettings(t *testing.T) {
	settings := ProbeSettings{}

	// verify normalizes the local thresholds against the current global
	// settings and compares the outcome with the expectation.
	verify := func(local, want StatusChangeThresholdSettings) {
		t.Helper()
		assert.Equal(t, want, settings.NormalizeThreshold(local))
	}

	// nothing configured anywhere: both fields fall back to the default
	verify(
		StatusChangeThresholdSettings{},
		StatusChangeThresholdSettings{
			Failure: DefaultStatusChangeThresholdSetting,
			Success: DefaultStatusChangeThresholdSetting,
		},
	)

	settings.Failure = 2
	settings.Success = 3

	// a local value overrides the global one, field by field
	verify(
		StatusChangeThresholdSettings{Failure: 1},
		StatusChangeThresholdSettings{Failure: 1, Success: 3},
	)
	verify(
		StatusChangeThresholdSettings{Success: 2},
		StatusChangeThresholdSettings{Failure: 2, Success: 2},
	)
	verify(
		StatusChangeThresholdSettings{Failure: 5, Success: 6},
		StatusChangeThresholdSettings{Failure: 5, Success: 6},
	)

	// zero or negative local values are ignored in favour of the global ones
	verify(
		StatusChangeThresholdSettings{Failure: 0},
		StatusChangeThresholdSettings{Failure: 2, Success: 3},
	)
	verify(
		StatusChangeThresholdSettings{Success: -1},
		StatusChangeThresholdSettings{Failure: 2, Success: 3},
	)

	// an invalid global value with no local value falls back to the default
	settings.Failure = -1
	verify(
		StatusChangeThresholdSettings{Failure: 0},
		StatusChangeThresholdSettings{Failure: 1, Success: 3},
	)
}
91 changes: 70 additions & 21 deletions probe/base/base.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ package base

import (
"fmt"
"math"
"net"
"net/url"
"os"
Expand All @@ -44,15 +45,16 @@ type ProbeFuncType func() (bool, string)

// DefaultProbe is the default options for all probe
type DefaultProbe struct {
ProbeKind string `yaml:"-" json:"-"`
ProbeTag string `yaml:"-" json:"-"`
ProbeName string `yaml:"name" json:"name" jsonschema:"required,title=Probe Name,description=the name of probe must be unique"`
ProbeChannels []string `yaml:"channels" json:"channels,omitempty" jsonschema:"title=Probe Channels,description=the channels of probe message need to send to"`
ProbeTimeout time.Duration `yaml:"timeout,omitempty" json:"timeout,omitempty" jsonschema:"type=string,format=duration,title=Probe Timeout,description=the timeout of probe"`
ProbeTimeInterval time.Duration `yaml:"interval,omitempty" json:"interval,omitempty" jsonschema:"type=string,format=duration,title=Probe Interval,description=the interval of probe"`
ProbeFunc ProbeFuncType `yaml:"-" json:"-"`
ProbeResult *probe.Result `yaml:"-" json:"-"`
metrics *metrics `yaml:"-" json:"-"`
ProbeKind string `yaml:"-" json:"-"`
ProbeTag string `yaml:"-" json:"-"`
ProbeName string `yaml:"name" json:"name" jsonschema:"required,title=Probe Name,description=the name of probe must be unique"`
ProbeChannels []string `yaml:"channels" json:"channels,omitempty" jsonschema:"title=Probe Channels,description=the channels of probe message need to send to"`
ProbeTimeout time.Duration `yaml:"timeout,omitempty" json:"timeout,omitempty" jsonschema:"type=string,format=duration,title=Probe Timeout,description=the timeout of probe"`
ProbeTimeInterval time.Duration `yaml:"interval,omitempty" json:"interval,omitempty" jsonschema:"type=string,format=duration,title=Probe Interval,description=the interval of probe"`
global.StatusChangeThresholdSettings `yaml:",inline" json:",inline"`
ProbeFunc ProbeFuncType `yaml:"-" json:"-"`
ProbeResult *probe.Result `yaml:"-" json:"-"`
metrics *metrics `yaml:"-" json:"-"`
}

// Kind return the probe kind
Expand Down Expand Up @@ -85,6 +87,46 @@ func (d *DefaultProbe) Result() *probe.Result {
return d.ProbeResult
}

// LogTitle builds the bracketed prefix used in log lines for this probe:
// "[kind / tag / name]" when the probe has a tag, "[kind / name]" otherwise.
func (d *DefaultProbe) LogTitle() string {
	if d.ProbeTag == "" {
		return fmt.Sprintf("[%s / %s]", d.ProbeKind, d.ProbeName)
	}
	return fmt.Sprintf("[%s / %s / %s]", d.ProbeKind, d.ProbeTag, d.ProbeName)
}

// CheckStatusThreshold returns the status the probe should report after the
// latest raw probe outcome has been appended to the status counter.
//
// It returns probe.StatusUp when the latest outcome is a success and the
// counter has reached the Success threshold, and probe.StatusDown when the
// latest outcome is a failure and the counter has reached the Failure
// threshold. While neither threshold is reached, the most recently reported
// status (d.ProbeResult.Status) is kept.
//
// The method is expected to be called before ProbeResult.Status is
// overwritten with the returned value. At that point ProbeResult.PreStatus
// still holds the status from two probes ago, so it must not be used as the
// "unchanged" value: doing so would revert a status that flipped on the
// previous probe after a single opposite outcome, bypassing the threshold.
func (d *DefaultProbe) CheckStatusThreshold() probe.Status {
	s := d.StatusChangeThresholdSettings
	c := d.ProbeResult.Stat.StatusCounter
	title := d.LogTitle()
	log.Debugf(" %s - Status Threshold Checking - Current[%v], StatusCnt[%d], FailureThreshold[%d], SuccessThreshold[%d]",
		title, c.CurrentStatus, c.StatusCount, s.Failure, s.Success)

	switch {
	case c.CurrentStatus && c.StatusCount >= s.Success:
		// only announce the transition, not every subsequent success
		if d.ProbeResult.Status != probe.StatusUp {
			cnt := math.Max(float64(c.StatusCount), float64(s.Success))
			log.Infof("%s - Status is UP! Threshold reached for success [%d/%d]", title, int(cnt), s.Success)
		}
		return probe.StatusUp
	case !c.CurrentStatus && c.StatusCount >= s.Failure:
		// only announce the transition, not every subsequent failure
		if d.ProbeResult.Status != probe.StatusDown {
			cnt := math.Max(float64(c.StatusCount), float64(s.Failure))
			log.Infof("%s - Status is DOWN! Threshold reached for failure [%d/%d]", title, int(cnt), s.Failure)
		}
		return probe.StatusDown
	}

	// Threshold not reached: keep reporting the status of the previous probe.
	unchanged := d.ProbeResult.Status
	if c.CurrentStatus {
		log.Infof("%s - Status unchanged [%s]! Threshold is not reached for success [%d/%d].",
			title, unchanged, c.StatusCount, s.Success)
	} else {
		log.Infof("%s - Status unchanged [%s]! Threshold is not reached for failure [%d/%d].",
			title, unchanged, c.StatusCount, s.Failure)
	}
	return unchanged
}

// Config default config
func (d *DefaultProbe) Config(gConf global.ProbeSettings,
kind, tag, name, endpoint string, fn ProbeFuncType) error {
Expand All @@ -96,19 +138,28 @@ func (d *DefaultProbe) Config(gConf global.ProbeSettings,

d.ProbeTimeout = gConf.NormalizeTimeOut(d.ProbeTimeout)
d.ProbeTimeInterval = gConf.NormalizeInterval(d.ProbeTimeInterval)
d.StatusChangeThresholdSettings = gConf.NormalizeThreshold(d.StatusChangeThresholdSettings)

d.ProbeResult = probe.NewResultWithName(name)
d.ProbeResult.Name = name
d.ProbeResult.Endpoint = endpoint

// Set the new length of the status counter
maxLen := d.StatusChangeThresholdSettings.Failure
if d.StatusChangeThresholdSettings.Success > maxLen {
maxLen = d.StatusChangeThresholdSettings.Success
}
d.ProbeResult.Stat.StatusCounter.SetMaxLen(maxLen)

// if there no channels, use the default channel
if len(d.ProbeChannels) == 0 {
d.ProbeChannels = append(d.ProbeChannels, global.DefaultChannelName)
}

if len(d.ProbeTag) > 0 {
log.Infof("Probe [%s / %s] - [%s] base options are configured!", d.ProbeKind, d.ProbeTag, d.ProbeName)
} else {
log.Infof("Probe [%s] - [%s] base options are configured!", d.ProbeKind, d.ProbeName)
log.Infof("Probe %s base options are configured!", d.LogTitle())

if d.Failure > 1 || d.Success > 1 {
log.Infof("Probe %s Status Threshold are configured! failure[%d], success[%d]", d.LogTitle(), d.Failure, d.Success)
}

d.metrics = newMetrics(kind, tag)
Expand All @@ -130,21 +181,19 @@ func (d *DefaultProbe) Probe() probe.Result {

d.ProbeResult.RoundTripTime = time.Since(now)

status := probe.StatusUp
title := "Success"
if stat != true {
status = probe.StatusDown
title = "Error"
}
// check the status threshold
d.ProbeResult.Stat.StatusCounter.AppendStatus(stat, msg)
status := d.CheckStatusThreshold()
title := status.Title()

if len(d.ProbeTag) > 0 {
d.ProbeResult.Message = fmt.Sprintf("%s (%s/%s): %s", title, d.ProbeKind, d.ProbeTag, msg)
log.Debugf("[%s / %s / %s] - %s", d.ProbeKind, d.ProbeTag, d.ProbeName, msg)
} else {
d.ProbeResult.Message = fmt.Sprintf("%s (%s): %s", title, d.ProbeKind, msg)
log.Debugf("[%s / %s] - %s", d.ProbeKind, d.ProbeName, msg)
}

log.Debugf("%s - %s", d.LogTitle(), msg)

d.ProbeResult.PreStatus = d.ProbeResult.Status
d.ProbeResult.Status = status

Expand Down
62 changes: 62 additions & 0 deletions probe/base/base_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
package base

import (
"fmt"
"math/rand"
"net"
"os"
Expand Down Expand Up @@ -147,3 +148,64 @@ func TestProxyConnection(t *testing.T) {

monkey.UnpatchAll()
}

// TestStatusThreshold drives a dummy prober through a run of successes and
// then a run of failures, checking that the reported status only flips once
// the configured threshold is reached and that the status counter is capped
// at its maximum length.
func TestStatusThreshold(t *testing.T) {
	prober := newDummyProber("probe")
	prober.StatusChangeThresholdSettings = global.StatusChangeThresholdSettings{
		Failure: 2,
		Success: 3,
	}
	prober.Config(global.ProbeSettings{
		StatusChangeThresholdSettings: global.StatusChangeThresholdSettings{
			Failure: 2,
			Success: 1,
		},
	})
	// the local settings win, and the counter is sized to the larger threshold
	assert.Equal(t, 3, prober.ProbeResult.Stat.MaxLen)
	assert.Equal(t, 2, prober.StatusChangeThresholdSettings.Failure)
	assert.Equal(t, 3, prober.StatusChangeThresholdSettings.Success)

	prober.ProbeResult.Status = probe.StatusInit

	rounds := prober.ProbeResult.Stat.MaxLen + 2

	// drive probes `rounds` times; the status must stay at `before` until the
	// i-th probe reaches `threshold`, and be `after` from then on.
	drive := func(threshold int, before, after probe.Status) {
		for i := 1; i <= rounds; i++ {
			prober.Probe()

			wantStatus := after
			if i < threshold {
				wantStatus = before
			}
			assert.Equal(t, wantStatus, prober.Result().Status)

			wantCount := prober.ProbeResult.Stat.MaxLen
			if i < wantCount {
				wantCount = i
			}
			assert.Equal(t, wantCount, prober.ProbeResult.Stat.StatusCount)
		}
	}

	calls := 0
	prober.ProbeFunc = func() (bool, string) {
		calls++
		return true, fmt.Sprintf("success - %d", calls)
	}
	drive(prober.StatusChangeThresholdSettings.Success, probe.StatusInit, probe.StatusUp)

	calls = 0
	prober.ProbeFunc = func() (bool, string) {
		calls++
		return false, fmt.Sprintf("failure - %d", calls)
	}
	drive(prober.StatusChangeThresholdSettings.Failure, probe.StatusUp, probe.StatusDown)
}
10 changes: 6 additions & 4 deletions probe/data_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,8 +51,9 @@ var testResults = []Result{
StatusUp: 70,
StatusDown: 30,
},
UpTime: 70 * time.Minute,
DownTime: 30 * time.Minute,
UpTime: 70 * time.Minute,
DownTime: 30 * time.Minute,
StatusCounter: *NewStatusCounter(1),
},
},
{
Expand All @@ -74,8 +75,9 @@ var testResults = []Result{
StatusUp: 270,
StatusDown: 30,
},
UpTime: 270 * time.Minute,
DownTime: 30 * time.Minute,
UpTime: 270 * time.Minute,
DownTime: 30 * time.Minute,
StatusCounter: *NewStatusCounter(2),
},
},
}
Expand Down
Loading

0 comments on commit 240c658

Please sign in to comment.