Skip to content

Commit

Permalink
[horus] Fix duplicate alarms (#353)
Browse files Browse the repository at this point in the history
  • Loading branch information
mfordjody committed Sep 18, 2024
1 parent 7f288da commit 68f2eb7
Show file tree
Hide file tree
Showing 5 changed files with 31 additions and 30 deletions.
10 changes: 5 additions & 5 deletions app/horus/basic/db/db.go
Original file line number Diff line number Diff line change
Expand Up @@ -119,16 +119,16 @@ func (n *NodeDataInfo) AddOrGet() (int64, error) {
return row, err
}

func GetRecoveryNodeDataInfoDate(day int) ([]*NodeDataInfo, error) {
var ndi []*NodeDataInfo
func GetRecoveryNodeDataInfoDate(day int) ([]NodeDataInfo, error) {
var ndi []NodeDataInfo
session := db.Where(fmt.Sprintf("recovery_mark = 0 AND first_date > DATE_SUB(CURDATE(), INTERVAL %d DAY)", day))
err := session.Find(&ndi)
return ndi, err
}

func GetDailyLimitNodeDataInfoDate(day, module, cluster string) ([]*NodeDataInfo, error) {
var ndi []*NodeDataInfo
session := db.Where(fmt.Sprintf("DATE(first_date) = '%s' AND module_name = '%s' AND cluster_name = '%s'", day, module, cluster))
func GetDailyLimitNodeDataInfoDate(day, module, cluster string) ([]NodeDataInfo, error) {
var ndi []NodeDataInfo
session := db.Where("DATE(first_date) = ? AND module_name = ? AND cluster_name = ?", day, module, cluster)
err := session.Find(&ndi)
return ndi, err
}
Expand Down
8 changes: 6 additions & 2 deletions app/horus/core/horuser/action.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ import (
"k8s.io/klog/v2"
)

func (h *Horuser) Cordon(nodeName, clusterName string) (err error) {
func (h *Horuser) Cordon(nodeName, clusterName, moduleName string) (err error) {
kubeClient := h.kubeClientMap[clusterName]
if kubeClient == nil {
klog.Errorf("node Cordon kubeClient by clusterName empty.")
Expand All @@ -36,9 +36,13 @@ func (h *Horuser) Cordon(nodeName, clusterName string) (err error) {
klog.Errorf("node Cordon get err nodeName:%v clusterName:%v", nodeName, clusterName)
return err
}
annotations := node.Annotations
if annotations == nil {
annotations = map[string]string{}
}
annotations["dubbo.apache.org/disable-by"] = "horus"

node.Spec.Unschedulable = true

ctxSecond, cancelSecond := h.GetK8sContext()
defer cancelSecond()
node, err = kubeClient.CoreV1().Nodes().Update(ctxSecond, node, v1.UpdateOptions{})
Expand Down
37 changes: 17 additions & 20 deletions app/horus/core/horuser/modular.go
Original file line number Diff line number Diff line change
Expand Up @@ -81,22 +81,13 @@ func (h *Horuser) CustomizeModularNodes(clusterName, moduleName, nodeName, ip st
today := time.Now().Format("2006-01-02")

recoveryQL := fmt.Sprintf(h.cc.CustomModular.RecoveryQL[moduleName], nodeName)

data, err := db.GetDailyLimitNodeDataInfoDate(today, moduleName, clusterName)
if err != nil {
klog.Errorf("CustomizeModularNodes GetDailyLimitNodeDataInfoDate err:%v", err)
return
}
klog.Infof("%v", data)

dailyLimit := h.cc.CustomModular.CordonDailyLimit[moduleName]
if len(data) > dailyLimit {
msg := fmt.Sprintf("【日期:%v】 【集群:%v\n】 【今日 Cordon 节点数: %v】\n 【已达到今日上限: %v】\n 【节点:%v】",
data, clusterName, moduleName, dailyLimit, nodeName)
alert.DingTalkSend(h.cc.CustomModular.DingTalk, msg)
return
}

err = h.Cordon(nodeName, clusterName, moduleName)

write := db.NodeDataInfo{
NodeName: nodeName,
NodeIP: ip,
Expand All @@ -106,29 +97,35 @@ func (h *Horuser) CustomizeModularNodes(clusterName, moduleName, nodeName, ip st
FirstDate: today,
RecoveryQL: recoveryQL,
}

pass, _ := write.Check()
if pass {
klog.Infof("CustomizeModularNodes already existing clusterName:%v nodeName:%v moduleName:%v", clusterName, nodeName, moduleName)
return
}
err = h.Cordon(nodeName, clusterName)

_, err = write.AddOrGet()
if err != nil {
klog.Errorf("CustomizeModularNodes AddOrGet err:%v", err)
klog.Infof("moduleName:%v nodeName:%v", moduleName, write.NodeName)
}

res := "Success"
if err != nil {
res = fmt.Sprintf("failed:%v", err)
klog.Errorf("Cordon failed:%v", res)
}

msg := fmt.Sprintf("\n【集群:%v】\n【发现 %s 异常已禁止调度】\n【已禁止调度节点:%v】\n 【处理结果: %v】\n 【今日操作次数:%v】\n",
clusterName, moduleName, nodeName, res, len(data)+1)

klog.Infof("Attempting to send DingTalk message: %s", msg)
alert.DingTalkSend(h.cc.CustomModular.DingTalk, msg)
klog.Infof("DingTalk message sent")

_, err = write.AddOrGet()
if err != nil {
klog.Errorf("CustomizeModularNodes AddOrGet err:%v", err)
klog.Infof("moduleName:%v nodeName:%v", moduleName, write.NodeName)
dailyLimit := h.cc.CustomModular.CordonDailyLimit[moduleName]
if len(data) > dailyLimit {
msg := fmt.Sprintf("【日期:%v】 【集群:%v\n】 【今日 Cordon 节点数: %v】\n 【已达到今日上限: %v】\n 【节点:%v】",
data, clusterName, len(data), dailyLimit, nodeName)
alert.DingTalkSend(h.cc.CustomModular.DingTalk, msg)
return
}

klog.Infof("CustomizeModularNodes AddOrGet success moduleName:%v nodeName:%v", moduleName, write.NodeName)
}
2 changes: 1 addition & 1 deletion app/horus/core/horuser/recovery.go
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ func (h *Horuser) recoveryCheck(ctx context.Context) {
wp.StopWait()
}

func (h *Horuser) recoveryNodes(n *db.NodeDataInfo) {
func (h *Horuser) recoveryNodes(n db.NodeDataInfo) {
addr := h.cc.PromMultiple[n.ClusterName]
if addr == "" {
klog.Errorf("recoveryNodes PromMultiple get addr empty.")
Expand Down
4 changes: 2 additions & 2 deletions deploy/horus/horus.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -50,9 +50,9 @@ nodeRecovery:
- 15000000

customModular:
enabled: true
enabled: false
cordonDailyLimit:
filesystem_readonly: 1
filesystem_readonly: 3
checkQL:
filesystem_readonly: |-
node_filesystem_readonly{mountpoint="/"} != 1
Expand Down

0 comments on commit 68f2eb7

Please sign in to comment.