Skip to content

Commit

Permalink
[horus] Self-healing operational repair
Browse files Browse the repository at this point in the history
  • Loading branch information
mfordjody committed Sep 17, 2024
1 parent 4c822f5 commit 6ce2d75
Show file tree
Hide file tree
Showing 4 changed files with 21 additions and 17 deletions.
2 changes: 1 addition & 1 deletion app/horus/core/alert/dingtalk.go
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ type Message struct {
func DingTalkSend(dk *config.DingTalkConfiguration, msg string) {
dtm := Message{MsgType: "text"}
dtm.Text.Content = fmt.Sprintf("%s\n"+
- "日志:%s】", DingTalkTitle, msg)
+ "日志:%s】", DingTalkTitle, msg)
dtm.At.AtMobiles = dk.AtMobiles
bs, err := json.Marshal(dtm)
if err != nil {
Expand Down
4 changes: 2 additions & 2 deletions app/horus/core/horuser/modular.go
Original file line number Diff line number Diff line change
Expand Up @@ -111,13 +111,13 @@ func (h *Horuser) CustomizeModularNodes(clusterName, moduleName, nodeName, ip st
return
}
err = h.Cordon(nodeName, clusterName)
- res := "success"
+ res := "Success"
if err != nil {
res = fmt.Sprintf("failed:%v", err)
klog.Errorf("Cordon failed:%v", res)
}

- msg := fmt.Sprintf("【集群:%v】\n 【%s 插件 Cordon 节点:%v】\n 【结果: %v】\n 【今日操作次数:%v",
+ msg := fmt.Sprintf("【集群:%v】\n 【%s 插件 Cordon 节点:%v】\n 【结果: %v】\n 【今日操作次数:%v",
clusterName, moduleName, nodeName, res, len(data)+1)

klog.Infof("Attempting to send DingTalk message: %s", msg)
Expand Down
18 changes: 11 additions & 7 deletions app/horus/core/horuser/recovery.go
Original file line number Diff line number Diff line change
Expand Up @@ -60,25 +60,29 @@ func (h *Horuser) recoveryNodes(n *db.NodeDataInfo) {
klog.Infof("clusterName:%v nodeName:%v", n.ClusterName, n.NodeName)
return
}
- ql := fmt.Sprintf(n.RecoveryQL, n.NodeName)
- vecs, err := h.InstantQuery(addr, ql, n.ClusterName, h.cc.NodeRecovery.PromQueryTimeSecond)
+
+ vecs, err := h.InstantQuery(addr, n.RecoveryQL, n.ClusterName, h.cc.NodeRecovery.PromQueryTimeSecond)
if err != nil {
- klog.Errorf("recoveryNodes instantQuery err:%v ql:%v", err, ql)
+ klog.Errorf("recoveryNodes InstantQuery err:%v ql:%v", err, n.RecoveryQL)
return
}
+ if len(vecs) != 1 {
+ klog.Infof("Expected 1 result, but got: %d", len(vecs))
+ return
+ }
- if len(vecs) != 2 {
- klog.Errorf("%v", vecs)
+ if err != nil {
+ klog.Errorf("recoveryNodes instantQuery err:%v ql:%v", err, n.RecoveryQL)
return
}
klog.Infof("recoveryNodes check success.")

err = h.UnCordon(n.NodeName, n.ClusterName)
- res := "success"
+ res := "Success"
if err != nil {
res = fmt.Sprintf("failed:%v", err)
}
msg := fmt.Sprintf("【自愈检查 %v: 恢复节点调度】【集群: %v】\n【节点: %v】【日期: %v】\n"+
- "【自愈检查 QL: %v", res, n.ClusterName, n.NodeName, n.CreateTime, ql)
+ "【自愈检查 QL: %v", res, n.ClusterName, n.NodeName, n.CreateTime, n.RecoveryQL)
alert.DingTalkSend(h.cc.NodeRecovery.DingTalk, msg)

pass, err := n.RecoveryMarker()
Expand Down
14 changes: 7 additions & 7 deletions deploy/horus/horus.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -36,10 +36,10 @@ kubeMultiple:
cluster: config.1

promMultiple:
- cluster: http://192.168.15.128:32608
+ cluster: http://192.168.15.128:30201

nodeRecovery:
- enabled: false
+ enabled: true
dayNumber: 1
checkIntervalSecond: 5
promQueryTimeSecond: 10
Expand All @@ -50,20 +50,20 @@ nodeRecovery:
- 15000000

customModular:
- enabled: true
+ enabled: false
cordonDailyLimit:
filesystem_readonly: 5
arp_entries: 5
checkQL:
filesystem_readonly: |-
- node_filesystem_readonly{mountpoint="/",node="%s"} == 1
+ node_filesystem_readonly{mountpoint="/"} == 1
arp_entries: |-
- node_arp_entries{device="ens160",node="master"} > 2
+ node_arp_entries{device="ens160"} > 2
recoveryQL:
filesystem_readonly: |-
- node_filesystem_readonly{mountpoint="/",node="%s"} == 0
+ node_filesystem_readonly{mountpoint="/"} == 0
arp_entries: |-
- node_arp_entries{device="ens160",node="%s"} < 2
+ node_arp_entries{device="ens160"} > 2
checkIntervalSecond: 5
promQueryTimeSecond: 10
kubeMultiple:
Expand Down

0 comments on commit 6ce2d75

Please sign in to comment.