Skip to content

Commit

Permalink
[horus] Optimize alarm logs (#352)
Browse files (browse the repository at this point in the history)
  • Loading branch information
mfordjody committed Sep 18, 2024
1 parent 7122f53 commit 7f288da
Show file tree
Hide file tree
Showing 4 changed files with 6 additions and 13 deletions.
3 changes: 1 addition & 2 deletions app/horus/core/alert/dingtalk.go
Original file line number Diff line number Diff line change
Expand Up @@ -54,8 +54,7 @@ type Message struct {

func DingTalkSend(dk *config.DingTalkConfiguration, msg string) {
dtm := Message{MsgType: "text"}
-dtm.Text.Content = fmt.Sprintf("%s\n"+
-"日志:%s】", DingTalkTitle, msg)
+dtm.Text.Content = fmt.Sprint(DingTalkTitle, msg)
dtm.At.AtMobiles = dk.AtMobiles
bs, err := json.Marshal(dtm)
if err != nil {
Expand Down
4 changes: 2 additions & 2 deletions app/horus/core/horuser/modular.go
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ func (h *Horuser) CustomizeModularNodes(clusterName, moduleName, nodeName, ip st

dailyLimit := h.cc.CustomModular.CordonDailyLimit[moduleName]
if len(data) > dailyLimit {
-msg := fmt.Sprintf("【日期:%v】 【集群:%v\n】 【模块今日 Cordon 节点数: %v】\n 【已达到今日上限: %v】\n 【节点:%v】",
+msg := fmt.Sprintf("【日期:%v】 【集群:%v\n】 【今日 Cordon 节点数: %v】\n 【已达到今日上限: %v】\n 【节点:%v】",
data, clusterName, moduleName, dailyLimit, nodeName)
alert.DingTalkSend(h.cc.CustomModular.DingTalk, msg)
return
Expand All @@ -118,7 +118,7 @@ func (h *Horuser) CustomizeModularNodes(clusterName, moduleName, nodeName, ip st
klog.Errorf("Cordon failed:%v", res)
}

-msg := fmt.Sprintf("【集群:%v】\n%s 插件 Cordon 节点:%v】\n结果: %v】\n 【今日操作次数:%v",
+msg := fmt.Sprintf("\n【集群:%v】\n【发现 %s 异常已禁止调度】\n【已禁止调度节点:%v】\n处理结果: %v】\n 【今日操作次数:%v\n",
clusterName, moduleName, nodeName, res, len(data)+1)

klog.Infof("Attempting to send DingTalk message: %s", msg)
Expand Down
3 changes: 1 addition & 2 deletions app/horus/core/horuser/recovery.go
Original file line number Diff line number Diff line change
Expand Up @@ -80,8 +80,7 @@ func (h *Horuser) recoveryNodes(n *db.NodeDataInfo) {
if err != nil {
res = fmt.Sprintf("failed:%v", err)
}
-msg := fmt.Sprintf("【自愈检查 %v: 恢复节点调度】【集群: %v】\n【节点: %v】【日期: %v】\n"+
-"【自愈检查 QL: %v", res, n.ClusterName, n.NodeName, n.CreateTime, n.RecoveryQL)
+msg := fmt.Sprintf("\n【集群: %v】\n【异常节点恢复调度】\n【已恢复调度节点: %v】\n【处理结果:%v】\n【日期: %v】\n", n.ClusterName, n.NodeName, res, n.CreateTime)
alert.DingTalkSend(h.cc.NodeRecovery.DingTalk, msg)

pass, err := n.RecoveryMarker()
Expand Down
9 changes: 2 additions & 7 deletions deploy/horus/horus.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ kubeMultiple:
cluster: config.1

promMultiple:
-cluster: http://192.168.15.128:30201
+cluster: http://192.168.15.128:31160

nodeRecovery:
enabled: true
Expand All @@ -52,18 +52,13 @@ nodeRecovery:
customModular:
enabled: true
cordonDailyLimit:
-filesystem_readonly: 5
-arp_entries: 5
+filesystem_readonly: 1
checkQL:
filesystem_readonly: |-
node_filesystem_readonly{mountpoint="/"} != 1
-arp_entries: |-
-node_arp_entries{device="ens160"} > 2
recoveryQL:
filesystem_readonly: |-
node_filesystem_readonly{mountpoint="/",node="%s"} == 0
-arp_entries: |-
-node_arp_entries{device="ens160",node="%s"} > 2
checkIntervalSecond: 5
promQueryTimeSecond: 60
kubeMultiple:
Expand Down

0 comments on commit 7f288da

Please sign in to comment.