Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[horus] Fix inability to run self-healing resources #349

Merged
merged 1 commit into from
Sep 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions app/horus/basic/config/file.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,13 +44,15 @@ type SlackConfiguration struct {
}

// RecoveryConfiguration groups recovery-related settings. Each field is
// populated from the YAML key named in its struct tag.
// NOTE(review): presumably this is the type behind the `nodeRecovery` section
// of horus.yaml — confirm against the parent configuration struct.
type RecoveryConfiguration struct {
// Enabled is read from the "enabled" key.
// NOTE(review): appears to be the switch that decides whether the recovery
// manager is started — verify against the caller.
Enabled bool `yaml:"enabled"`
// DayNumber is read from the "dayNumber" key.
DayNumber int `yaml:"dayNumber"`
// CheckIntervalSecond is read from the "checkIntervalSecond" key
// (presumably a duration in seconds, per the name — TODO confirm).
CheckIntervalSecond int `yaml:"checkIntervalSecond"`
// PromQueryTimeSecond is read from the "promQueryTimeSecond" key
// (presumably a duration in seconds, per the name — TODO confirm).
PromQueryTimeSecond int64 `yaml:"promQueryTimeSecond"`
// DingTalk is read from the nested "dingTalk" key; it is a pointer, so it is
// nil when no value has been set.
DingTalk *DingTalkConfiguration `yaml:"dingTalk"`
}

type ModularConfiguration struct {
Enabled bool `yaml:"enabled"`
CordonDailyLimit map[string]int `yaml:"cordonDailyLimit"`
CheckQL map[string]string `yaml:"checkQL"`
RecoveryQL map[string]string `yaml:"recoveryQL"`
Expand Down
33 changes: 16 additions & 17 deletions app/horus/basic/db/db.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,22 +25,21 @@ import (
)

type NodeDataInfo struct {
Id int64 `json:"id"`
NodeName string `json:"node_name" xorm:"node_name"`
NodeIP string `json:"node_ip" xorm:"node_ip"`
Sn string `json:"sn"`
ClusterName string `json:"cluster_name" xorm:"cluster_name"`
ModuleName string `json:"module_name" xorm:"module_name"`
Reason string `json:"reason"`
Restart uint32 `json:"restart"`
Repair uint32 `json:"repair"`
RepairTicketUrl string `json:"repair_ticket_url" xorm:"repair_ticket_url"`
FirstDate string `json:"first_date" xorm:"first_date"`
CreateTime string `json:"create_time" xorm:"create_time created"`
UpdateTime string `json:"update_time" xorm:"update_time updated"`
RecoveryMark int64 `json:"recovery_mark" xorm:"recovery_mark"`
RecoveryQL string `json:"recovery_ql" xorm:"recovery_ql"`
CustomizeRecoveryModular map[string]string `xorm:"-"`
Id int64 `json:"id"`
NodeName string `json:"node_name" xorm:"node_name"`
NodeIP string `json:"node_ip" xorm:"node_ip"`
Sn string `json:"sn"`
ClusterName string `json:"cluster_name" xorm:"cluster_name"`
ModuleName string `json:"module_name" xorm:"module_name"`
Reason string `json:"reason"`
Restart uint32 `json:"restart"`
Repair uint32 `json:"repair"`
RepairTicketUrl string `json:"repair_ticket_url" xorm:"repair_ticket_url"`
FirstDate string `json:"first_date" xorm:"first_date"`
CreateTime string `json:"create_time" xorm:"create_time created"`
UpdateTime string `json:"update_time" xorm:"update_time updated"`
RecoveryMark int64 `json:"recovery_mark" xorm:"recovery_mark"`
RecoveryQL string `json:"recovery_ql" xorm:"recovery_ql"`
}

type PodDataInfo struct {
Expand Down Expand Up @@ -129,7 +128,7 @@ func GetRecoveryNodeDataInfoDate(day int) ([]*NodeDataInfo, error) {

func GetDailyLimitNodeDataInfoDate(day, module, cluster string) ([]*NodeDataInfo, error) {
var ndi []*NodeDataInfo
session := db.Where(fmt.Sprintf("DATE(first_date)='%s' AND module_name='%s' AND cluster_name='%s", day, module, cluster))
session := db.Where(fmt.Sprintf("DATE(first_date) = '%s' AND module_name = '%s' AND cluster_name = '%s'", day, module, cluster))
err := session.Find(&ndi)
return ndi, err
}
Expand Down
47 changes: 29 additions & 18 deletions app/horus/cmd/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ func main() {

c, err := config.LoadFile(configFile)
if err != nil {
klog.Errorf("load config file failed err:%+v", c)
klog.Errorf("load config file failed err:%+v", err)
return
} else {
klog.Infof("load config file success.")
Expand All @@ -67,14 +67,22 @@ func main() {
cancel()
return nil
case <-ctx.Done():
return nil
}
}
})
group.Add(func() error {
for {
select {
case <-stopChan:
cancel()
return nil
}
}
})
group.Add(func() error {
http.Handle("/metrics", promhttp.Handler())
srv := http.Server{Addr: address}
srv := http.Server{Addr: c.Address}
err := srv.ListenAndServe()
if err != nil {
klog.Errorf("horus metrics err:%v", err)
Expand All @@ -90,18 +98,22 @@ func main() {
return nil
})
group.Add(func() error {
klog.Info("horus recovery manager start success.")
err := horus.RecoveryManager(ctx)
if err != nil {
klog.Errorf("horus recovery manager start failed error:%v", err)
if c.NodeRecovery.Enabled {
klog.Info("horus recovery manager start success.")
err := horus.RecoveryManager(ctx)
if err != nil {
klog.Errorf("horus recovery manager start failed error:%v", err)
}
}
return nil
})
group.Add(func() error {
klog.Info("horus customize modular manager start success.")
err := horus.CustomizeModularManager(ctx)
if err != nil {
klog.Errorf("horus customize modular manager start failed error:%v", err)
if c.CustomModular.Enabled {
klog.Info("horus customize modular manager start success.")
err := horus.CustomizeModularManager(ctx)
if err != nil {
klog.Errorf("horus customize modular manager start failed error:%v", err)
}
}
return nil
})
Expand All @@ -116,10 +128,7 @@ func (g *WaitGroup) Add(f func() error) {
g.wg.Add(1)
go func() {
defer g.wg.Done()
err := f()
if err != nil {
return
}
_ = f()
}()
}

Expand All @@ -131,11 +140,13 @@ func setupStopChanWithContext() (*WaitGroup, <-chan struct{}) {
stopChan := make(chan struct{})
SignalChan := make(chan os.Signal, 1)
signal.Notify(SignalChan, syscall.SIGTERM, syscall.SIGQUIT)
g := WaitGroup{}
g := &WaitGroup{}
g.Add(func() error {
<-stopChan
close(stopChan)
select {
case <-SignalChan:
close(stopChan)
}
return nil
})
return &g, stopChan
return g, stopChan
}
2 changes: 1 addition & 1 deletion app/horus/core/alert/dingtalk.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ import (
"net/http"
)

const DingTalkTitle = "项目组"
const DingTalkTitle = "horus 通知"

type T struct {
At struct {
Expand Down
1 change: 0 additions & 1 deletion app/horus/core/horuser/horuser.go
Original file line number Diff line number Diff line change
Expand Up @@ -67,5 +67,4 @@ func k8sBuildConfig(kubeconfig string) (*rest.Config, error) {

// GetK8sContext returns a context derived from context.Background() that
// expires after h.cc.KubeTimeSecond seconds, together with its cancel function.
// Callers should invoke the returned CancelFunc once they are done with the
// context so its timer is released.
func (h *Horuser) GetK8sContext() (context.Context, context.CancelFunc) {
	timeout := time.Duration(h.cc.KubeTimeSecond) * time.Second
	return context.WithTimeout(context.Background(), timeout)
}
25 changes: 11 additions & 14 deletions app/horus/core/horuser/modular.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ func (h *Horuser) CustomizeModular(ctx context.Context) {
wg.Add(1)
go func(clusterName, addr string) {
defer wg.Done()
h.CustomizeModularOnCluster(clusterName, addr)
}(clusterName, addr)
}
wg.Wait()
Expand All @@ -53,11 +54,12 @@ func (h *Horuser) CustomizeModularOnCluster(clusterName, addr string) {
vecs, err := h.InstantQuery(addr, ql, clusterName, h.cc.CustomModular.PromQueryTimeSecond)
if err != nil {
klog.Errorf("CustomizeModularOnCluster InstantQuery err:%v", err)
klog.Infof("clusterName:%vec ql: %v", clusterName, ql)
klog.Infof("clusterName:%v ql: %v", clusterName, ql)
return
}
count := len(vecs)
for index, vec := range vecs {
vec := vec
labelMap := vec.Metric
nodeName := string(labelMap["node"])
if nodeName == "" {
Expand All @@ -67,7 +69,8 @@ func (h *Horuser) CustomizeModularOnCluster(clusterName, addr string) {
}
ip := string(labelMap["instance"])
value := vec.Value.String()
klog.Infof("RunCommonModuleOnCluster.QueryRes.print[clusterName:%v][moduleName:%v][%d][nodeName:%v][value:%v][count:%v]", clusterName, moduleName, index+1, nodeName, value, count)
klog.Infof("CustomizeModularOnCluster Query result clusterName:%v moduleName:%v %d nodeName:%v value:%v count:%v",
clusterName, moduleName, index+1, nodeName, value, count)
h.CustomizeModularNodes(clusterName, moduleName, nodeName, ip)
}
}
Expand All @@ -77,21 +80,19 @@ func (h *Horuser) CustomizeModularNodes(clusterName, moduleName, nodeName, ip st
today := time.Now().Format("2006-01-02")

recoveryQL := h.cc.CustomModular.RecoveryQL[moduleName]
dailyLimit := h.cc.CustomModular.CordonDailyLimit[moduleName]

data, err := db.GetDailyLimitNodeDataInfoDate(today, moduleName, clusterName)
if err != nil {
klog.Errorf("CustomizeModularNodes GetDailyLimitNodeDataInfoDate err:%v", err)
return
}
klog.Infof("%v", data)

dailyLimit := h.cc.CustomModular.CordonDailyLimit[moduleName]
if len(data) > dailyLimit {
msg := fmt.Sprintf("【日期:%v】 【集群:%v\n】 【模块今日 Cordon 节点数: %v】\n 【已达到今日上限: %v】\n [节点:%v]",
data, clusterName, moduleName, dailyLimit, nodeName)
klog.Infof(msg)
klog.Infof("Attempting to send DingTalk message (limit exceeded): %s", msg)
alert.DingTalkSend(h.cc.CustomModular.DingTalk, msg)
klog.Infof("DingTalk message sent (limit exceeded)")
return
}

Expand All @@ -113,15 +114,11 @@ func (h *Horuser) CustomizeModularNodes(clusterName, moduleName, nodeName, ip st
res := "success"
if err != nil {
res = fmt.Sprintf("failed:%v", err)
klog.Errorf("Cordon failed:%v", err)
}
if err != nil {
res = fmt.Sprintf("failed:%v", err)
klog.Errorf("Cordon failed:%v", res)
}

msg := fmt.Sprintf("【集群:%v】\n 【%s 插件 Cordon 节点:%v】\n 【结果: %v】\n 【今日操作次数:%v】",
clusterName, moduleName, nodeName, res, len(today)+1)
klog.Infof(msg)
clusterName, moduleName, nodeName, res, len(data)+1)

klog.Infof("Attempting to send DingTalk message: %s", msg)
alert.DingTalkSend(h.cc.CustomModular.DingTalk, msg)
Expand All @@ -130,7 +127,7 @@ func (h *Horuser) CustomizeModularNodes(clusterName, moduleName, nodeName, ip st
_, err = write.AddOrGet()
if err != nil {
klog.Errorf("CustomizeModularNodes AddOrGet err:%v", err)
klog.Infof("moduleName:%v nodeName:%v", moduleName, nodeName)
klog.Infof("moduleName:%v nodeName:%v", moduleName, write.NodeName)
}
klog.Infof("CustomizeModularNodes AddOrGet success moduleName:%v nodeName:%v", moduleName, nodeName)
klog.Infof("CustomizeModularNodes AddOrGet success moduleName:%v nodeName:%v", moduleName, write.NodeName)
}
1 change: 1 addition & 0 deletions app/horus/core/horuser/prome.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ func (h *Horuser) InstantQuery(address, ql, clusterName string, timeWindowsSecon
if promClient == "" && address == "" {
klog.Errorf("prometheus get PromMultiple empty")
klog.Infof("clusterName:%v ql:%v", clusterName, ql)
return nil, err
}

apiV1 := prometheusV1.NewAPI(client)
Expand Down
1 change: 1 addition & 0 deletions app/horus/core/horuser/recovery.go
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ func (h *Horuser) recoveryNodes(n *db.NodeDataInfo) {
return
}
if len(vecs) != 2 {
klog.Errorf("%v", vecs)
return
}
klog.Infof("recoveryNodes check success.")
Expand Down
35 changes: 20 additions & 15 deletions deploy/horus/horus.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ kubeTimeSecond: 5
mysql:
name: horus
addr: "root:root@tcp(127.0.0.1:3306)/horus?charset=utf8&parseTime=True"
debug: true
debug: false

dingTalk:
webhookUrl: ~
Expand All @@ -35,36 +35,41 @@ slack:
kubeMultiple:
cluster: config.1

promMultiple:
cluster: http://192.168.15.128:32608

nodeRecovery:
enabled: false
dayNumber: 1
checkIntervalSecond: 60
promQueryTimeSecond: 5
checkIntervalSecond: 5
promQueryTimeSecond: 10
dingTalk:
webhookUrl: "https://oapi.dingtalk.com/robot/send?access_token=aa2f3f74d7a2504653ca89b7a673707ba1d04b6d9d320c3572e5464d8f81471e"
title: "【Horus 通知"
webhookUrl: "https://oapi.dingtalk.com/robot/send?access_token=37f8891e60e524013275cc01efafdb5976b81ef7269ce271b769bcd025826c12"
title: "horus 通知"
atMobiles:
- 15000000

customModular:
enabled: true
cordonDailyLimit:
node_filesystem_readonly: 1
node_arp_entries: 1
filesystem_readonly: 5
arp_entries: 5
checkQL:
filesystem_readonly: |-
node_filesystem_readonly{mountpoint="/"} == 1
node_filesystem_readonly{mountpoint="/",node="%s"} == 1
arp_entries: |-
node_arp_entries{device="ens160"} > 2
node_arp_entries{device="ens160",node="master"} > 2
recoveryQL:
filesystem_readonly: |-
node_filesystem_readonly{mountpoint="/"} == 0
node_filesystem_readonly{mountpoint="/",node="%s"} == 0
arp_entries: |-
node_arp_entries{device="ens160"} < 2
checkIntervalSecond: 60
promQueryTimeSecond: 5
node_arp_entries{device="ens160",node="%s"} < 2
checkIntervalSecond: 5
promQueryTimeSecond: 10
kubeMultiple:
cluster: config.1
dingTalk:
webhookUrl: "https://oapi.dingtalk.com/robot/send?access_token=aa2f3f74d7a2504653ca89b7a673707ba1d04b6d9d320c3572e5464d8f81471e"
title: "【Horus 通知"
webhookUrl: "https://oapi.dingtalk.com/robot/send?access_token=37f8891e60e524013275cc01efafdb5976b81ef7269ce271b769bcd025826c12"
title: "horus 通知"
atMobiles:
- 15000000
Loading