Skip to content

Commit

Permalink
[horus] Fix inability to run self-healing resources (#349)
Browse files Browse the repository at this point in the history
  • Loading branch information
mfordjody committed Sep 16, 2024
1 parent e4dd626 commit 4c822f5
Show file tree
Hide file tree
Showing 9 changed files with 81 additions and 66 deletions.
2 changes: 2 additions & 0 deletions app/horus/basic/config/file.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,13 +44,15 @@ type SlackConfiguration struct {
}

// RecoveryConfiguration groups recovery-related settings decoded from the YAML
// configuration; every field is bound to the key named in its yaml tag.
// NOTE(review): presumably this is the type behind the `nodeRecovery` section
// of the config — confirm against the enclosing configuration struct.
type RecoveryConfiguration struct {
// Enabled is the on/off switch read from the "enabled" key.
// NOTE(review): main appears to start the recovery manager only when
// c.NodeRecovery.Enabled is true — confirm NodeRecovery has this type.
Enabled bool `yaml:"enabled"`
// DayNumber is read from "dayNumber"; presumably a look-back window in
// days — TODO confirm against the query that consumes it.
DayNumber int `yaml:"dayNumber"`
// CheckIntervalSecond is read from "checkIntervalSecond"; presumably the
// number of seconds between two checks — TODO confirm.
CheckIntervalSecond int `yaml:"checkIntervalSecond"`
// PromQueryTimeSecond is read from "promQueryTimeSecond"; presumably a
// Prometheus query time limit in seconds — TODO confirm.
PromQueryTimeSecond int64 `yaml:"promQueryTimeSecond"`
// DingTalk points at the DingTalk settings found under the "dingTalk" key.
DingTalk *DingTalkConfiguration `yaml:"dingTalk"`
}

type ModularConfiguration struct {
Enabled bool `yaml:"enabled"`
CordonDailyLimit map[string]int `yaml:"cordonDailyLimit"`
CheckQL map[string]string `yaml:"checkQL"`
RecoveryQL map[string]string `yaml:"recoveryQL"`
Expand Down
33 changes: 16 additions & 17 deletions app/horus/basic/db/db.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,22 +25,21 @@ import (
)

// NodeDataInfo is a single node record. The json tags give the serialized
// field names and the xorm tags give the column names used for persistence.
//
// Each field is declared exactly once: Go rejects a struct that repeats a
// field name, so the list must not contain both an old and a new copy.
type NodeDataInfo struct {
	Id              int64  `json:"id"`
	NodeName        string `json:"node_name" xorm:"node_name"`
	NodeIP          string `json:"node_ip" xorm:"node_ip"`
	Sn              string `json:"sn"`
	ClusterName     string `json:"cluster_name" xorm:"cluster_name"`
	ModuleName      string `json:"module_name" xorm:"module_name"`
	Reason          string `json:"reason"`
	Restart         uint32 `json:"restart"`
	Repair          uint32 `json:"repair"`
	RepairTicketUrl string `json:"repair_ticket_url" xorm:"repair_ticket_url"`
	FirstDate       string `json:"first_date" xorm:"first_date"`
	// The "created" / "updated" tag options hand maintenance of these two
	// timestamps to xorm (NOTE(review): per xorm's tag documentation — confirm
	// for the xorm version in use).
	CreateTime   string `json:"create_time" xorm:"create_time created"`
	UpdateTime   string `json:"update_time" xorm:"update_time updated"`
	RecoveryMark int64  `json:"recovery_mark" xorm:"recovery_mark"`
	RecoveryQL   string `json:"recovery_ql" xorm:"recovery_ql"`
}

type PodDataInfo struct {
Expand Down Expand Up @@ -129,7 +128,7 @@ func GetRecoveryNodeDataInfoDate(day int) ([]*NodeDataInfo, error) {

// GetDailyLimitNodeDataInfoDate looks up the node records whose first_date
// falls on day and whose module_name and cluster_name equal module and cluster.
//
// It returns whatever rows Find loaded together with Find's error; callers
// should ignore the slice when the error is non-nil.
func GetDailyLimitNodeDataInfoDate(day, module, cluster string) ([]*NodeDataInfo, error) {
	var ndi []*NodeDataInfo
	// Pass the values as bound arguments instead of formatting them into the
	// SQL text: a quote inside a module or cluster name can then neither break
	// the statement nor inject SQL.
	session := db.Where("DATE(first_date) = ? AND module_name = ? AND cluster_name = ?", day, module, cluster)
	err := session.Find(&ndi)
	return ndi, err
}
Expand Down
47 changes: 29 additions & 18 deletions app/horus/cmd/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ func main() {

c, err := config.LoadFile(configFile)
if err != nil {
klog.Errorf("load config file failed err:%+v", c)
klog.Errorf("load config file failed err:%+v", err)
return
} else {
klog.Infof("load config file success.")
Expand All @@ -67,14 +67,22 @@ func main() {
cancel()
return nil
case <-ctx.Done():
return nil
}
}
})
group.Add(func() error {
for {
select {
case <-stopChan:
cancel()
return nil
}
}
})
group.Add(func() error {
http.Handle("/metrics", promhttp.Handler())
srv := http.Server{Addr: address}
srv := http.Server{Addr: c.Address}
err := srv.ListenAndServe()
if err != nil {
klog.Errorf("horus metrics err:%v", err)
Expand All @@ -90,18 +98,22 @@ func main() {
return nil
})
group.Add(func() error {
klog.Info("horus recovery manager start success.")
err := horus.RecoveryManager(ctx)
if err != nil {
klog.Errorf("horus recovery manager start failed error:%v", err)
if c.NodeRecovery.Enabled {
klog.Info("horus recovery manager start success.")
err := horus.RecoveryManager(ctx)
if err != nil {
klog.Errorf("horus recovery manager start failed error:%v", err)
}
}
return nil
})
group.Add(func() error {
klog.Info("horus customize modular manager start success.")
err := horus.CustomizeModularManager(ctx)
if err != nil {
klog.Errorf("horus customize modular manager start failed error:%v", err)
if c.CustomModular.Enabled {
klog.Info("horus customize modular manager start success.")
err := horus.CustomizeModularManager(ctx)
if err != nil {
klog.Errorf("horus customize modular manager start failed error:%v", err)
}
}
return nil
})
Expand All @@ -116,10 +128,7 @@ func (g *WaitGroup) Add(f func() error) {
g.wg.Add(1)
go func() {
defer g.wg.Done()
err := f()
if err != nil {
return
}
_ = f()
}()
}

Expand All @@ -131,11 +140,13 @@ func setupStopChanWithContext() (*WaitGroup, <-chan struct{}) {
stopChan := make(chan struct{})
SignalChan := make(chan os.Signal, 1)
signal.Notify(SignalChan, syscall.SIGTERM, syscall.SIGQUIT)
g := WaitGroup{}
g := &WaitGroup{}
g.Add(func() error {
<-stopChan
close(stopChan)
select {
case <-SignalChan:
close(stopChan)
}
return nil
})
return &g, stopChan
return g, stopChan
}
2 changes: 1 addition & 1 deletion app/horus/core/alert/dingtalk.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ import (
"net/http"
)

// DingTalkTitle is the fixed title string for DingTalk messages.
// NOTE(review): presumably consumed by the DingTalk sender in this package —
// confirm at the call site. It is declared once; a second declaration of the
// same constant does not compile.
const DingTalkTitle = "horus 通知"

type T struct {
At struct {
Expand Down
1 change: 0 additions & 1 deletion app/horus/core/horuser/horuser.go
Original file line number Diff line number Diff line change
Expand Up @@ -67,5 +67,4 @@ func k8sBuildConfig(kubeconfig string) (*rest.Config, error) {

// GetK8sContext builds a context derived from context.Background that expires
// after h.cc.KubeTimeSecond seconds. The returned CancelFunc should be called
// by the caller once the context is no longer needed.
func (h *Horuser) GetK8sContext() (context.Context, context.CancelFunc) {
	timeout := time.Duration(h.cc.KubeTimeSecond) * time.Second
	parent := context.Background()
	return context.WithTimeout(parent, timeout)
}
25 changes: 11 additions & 14 deletions app/horus/core/horuser/modular.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ func (h *Horuser) CustomizeModular(ctx context.Context) {
wg.Add(1)
go func(clusterName, addr string) {
defer wg.Done()
h.CustomizeModularOnCluster(clusterName, addr)
}(clusterName, addr)
}
wg.Wait()
Expand All @@ -53,11 +54,12 @@ func (h *Horuser) CustomizeModularOnCluster(clusterName, addr string) {
vecs, err := h.InstantQuery(addr, ql, clusterName, h.cc.CustomModular.PromQueryTimeSecond)
if err != nil {
klog.Errorf("CustomizeModularOnCluster InstantQuery err:%v", err)
klog.Infof("clusterName:%vec ql: %v", clusterName, ql)
klog.Infof("clusterName:%v ql: %v", clusterName, ql)
return
}
count := len(vecs)
for index, vec := range vecs {
vec := vec
labelMap := vec.Metric
nodeName := string(labelMap["node"])
if nodeName == "" {
Expand All @@ -67,7 +69,8 @@ func (h *Horuser) CustomizeModularOnCluster(clusterName, addr string) {
}
ip := string(labelMap["instance"])
value := vec.Value.String()
klog.Infof("RunCommonModuleOnCluster.QueryRes.print[clusterName:%v][moduleName:%v][%d][nodeName:%v][value:%v][count:%v]", clusterName, moduleName, index+1, nodeName, value, count)
klog.Infof("CustomizeModularOnCluster Query result clusterName:%v moduleName:%v %d nodeName:%v value:%v count:%v",
clusterName, moduleName, index+1, nodeName, value, count)
h.CustomizeModularNodes(clusterName, moduleName, nodeName, ip)
}
}
Expand All @@ -77,21 +80,19 @@ func (h *Horuser) CustomizeModularNodes(clusterName, moduleName, nodeName, ip st
today := time.Now().Format("2006-01-02")

recoveryQL := h.cc.CustomModular.RecoveryQL[moduleName]
dailyLimit := h.cc.CustomModular.CordonDailyLimit[moduleName]

data, err := db.GetDailyLimitNodeDataInfoDate(today, moduleName, clusterName)
if err != nil {
klog.Errorf("CustomizeModularNodes GetDailyLimitNodeDataInfoDate err:%v", err)
return
}
klog.Infof("%v", data)

dailyLimit := h.cc.CustomModular.CordonDailyLimit[moduleName]
if len(data) > dailyLimit {
msg := fmt.Sprintf("【日期:%v】 【集群:%v\n】 【模块今日 Cordon 节点数: %v】\n 【已达到今日上限: %v】\n [节点:%v]",
data, clusterName, moduleName, dailyLimit, nodeName)
klog.Infof(msg)
klog.Infof("Attempting to send DingTalk message (limit exceeded): %s", msg)
alert.DingTalkSend(h.cc.CustomModular.DingTalk, msg)
klog.Infof("DingTalk message sent (limit exceeded)")
return
}

Expand All @@ -113,15 +114,11 @@ func (h *Horuser) CustomizeModularNodes(clusterName, moduleName, nodeName, ip st
res := "success"
if err != nil {
res = fmt.Sprintf("failed:%v", err)
klog.Errorf("Cordon failed:%v", err)
}
if err != nil {
res = fmt.Sprintf("failed:%v", err)
klog.Errorf("Cordon failed:%v", res)
}

msg := fmt.Sprintf("【集群:%v】\n 【%s 插件 Cordon 节点:%v】\n 【结果: %v】\n 【今日操作次数:%v】",
clusterName, moduleName, nodeName, res, len(today)+1)
klog.Infof(msg)
clusterName, moduleName, nodeName, res, len(data)+1)

klog.Infof("Attempting to send DingTalk message: %s", msg)
alert.DingTalkSend(h.cc.CustomModular.DingTalk, msg)
Expand All @@ -130,7 +127,7 @@ func (h *Horuser) CustomizeModularNodes(clusterName, moduleName, nodeName, ip st
_, err = write.AddOrGet()
if err != nil {
klog.Errorf("CustomizeModularNodes AddOrGet err:%v", err)
klog.Infof("moduleName:%v nodeName:%v", moduleName, nodeName)
klog.Infof("moduleName:%v nodeName:%v", moduleName, write.NodeName)
}
klog.Infof("CustomizeModularNodes AddOrGet success moduleName:%v nodeName:%v", moduleName, nodeName)
klog.Infof("CustomizeModularNodes AddOrGet success moduleName:%v nodeName:%v", moduleName, write.NodeName)
}
1 change: 1 addition & 0 deletions app/horus/core/horuser/prome.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ func (h *Horuser) InstantQuery(address, ql, clusterName string, timeWindowsSecon
if promClient == "" && address == "" {
klog.Errorf("prometheus get PromMultiple empty")
klog.Infof("clusterName:%v ql:%v", clusterName, ql)
return nil, err
}

apiV1 := prometheusV1.NewAPI(client)
Expand Down
1 change: 1 addition & 0 deletions app/horus/core/horuser/recovery.go
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ func (h *Horuser) recoveryNodes(n *db.NodeDataInfo) {
return
}
if len(vecs) != 2 {
klog.Errorf("%v", vecs)
return
}
klog.Infof("recoveryNodes check success.")
Expand Down
35 changes: 20 additions & 15 deletions deploy/horus/horus.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ kubeTimeSecond: 5
mysql:
name: horus
addr: "root:root@tcp(127.0.0.1:3306)/horus?charset=utf8&parseTime=True"
debug: true
debug: false

dingTalk:
webhookUrl: ~
Expand All @@ -35,36 +35,41 @@ slack:
kubeMultiple:
cluster: config.1

promMultiple:
cluster: http://192.168.15.128:32608

nodeRecovery:
enabled: false
dayNumber: 1
checkIntervalSecond: 60
promQueryTimeSecond: 5
checkIntervalSecond: 5
promQueryTimeSecond: 10
dingTalk:
webhookUrl: "https://oapi.dingtalk.com/robot/send?access_token=aa2f3f74d7a2504653ca89b7a673707ba1d04b6d9d320c3572e5464d8f81471e"
title: "【Horus 通知"
webhookUrl: "https://oapi.dingtalk.com/robot/send?access_token=37f8891e60e524013275cc01efafdb5976b81ef7269ce271b769bcd025826c12"
title: "horus 通知"
atMobiles:
- 15000000

customModular:
enabled: true
cordonDailyLimit:
node_filesystem_readonly: 1
node_arp_entries: 1
filesystem_readonly: 5
arp_entries: 5
checkQL:
filesystem_readonly: |-
node_filesystem_readonly{mountpoint="/"} == 1
node_filesystem_readonly{mountpoint="/",node="%s"} == 1
arp_entries: |-
node_arp_entries{device="ens160"} > 2
node_arp_entries{device="ens160",node="master"} > 2
recoveryQL:
filesystem_readonly: |-
node_filesystem_readonly{mountpoint="/"} == 0
node_filesystem_readonly{mountpoint="/",node="%s"} == 0
arp_entries: |-
node_arp_entries{device="ens160"} < 2
checkIntervalSecond: 60
promQueryTimeSecond: 5
node_arp_entries{device="ens160",node="%s"} < 2
checkIntervalSecond: 5
promQueryTimeSecond: 10
kubeMultiple:
cluster: config.1
dingTalk:
webhookUrl: "https://oapi.dingtalk.com/robot/send?access_token=aa2f3f74d7a2504653ca89b7a673707ba1d04b6d9d320c3572e5464d8f81471e"
title: "【Horus 通知"
webhookUrl: "https://oapi.dingtalk.com/robot/send?access_token=37f8891e60e524013275cc01efafdb5976b81ef7269ce271b769bcd025826c12"
title: "horus 通知"
atMobiles:
- 15000000

0 comments on commit 4c822f5

Please sign in to comment.