Skip to content

Commit bfbff3f

Browse files
authored
fix monitor info (#5278)
* fix monitor info
* Merge remote-tracking branch 'origin/fix-monitor' into fix-monitor
* Merge remote-tracking branch 'origin/fix-monitor' into fix-monitor
1 parent 2864b0a commit bfbff3f

File tree

5 files changed

+106
-64
lines changed

5 files changed

+106
-64
lines changed

Diff for: service/exceptionmonitor/api/api.go

+6-5
Original file line number | Diff line number | Diff line change
@@ -23,8 +23,6 @@ type QueryResult struct {
2323
}
2424

2525
type Info struct {
26-
// lastStatus、recoveryStatus、lastStatusTime、recoveryStatusTime、lastStatusInfo、recoveryStatusInfo
27-
//todo 是否应该分几个状态,是否有状态不正确的地方
2826
DatabaseClusterName string
2927
Namespace string
3028
DebtLevel string
@@ -74,16 +72,19 @@ const (
7472
//StatusUpdating = "Updating"
7573
StatusUnknown = ""
7674
MonitorTypeALL = "all"
75+
DiskChinese = "磁盘"
76+
MemoryChinese = "内存"
77+
CPUChinese = "CPU"
7778
)
7879

7980
var (
8081
ClientSet *kubernetes.Clientset
8182
DynamicClient *dynamic.DynamicClient
8283
DebtNamespaceMap = make(map[string]bool)
8384
DiskFullNamespaceMap = make(map[string]bool)
84-
DiskMonitorNamespaceMap = make(map[string]bool)
85-
CPUMonitorNamespaceMap = make(map[string]bool)
86-
MemMonitorNamespaceMap = make(map[string]bool)
85+
CPUNotificationInfoMap = make(map[string]*Info)
86+
MemNotificationInfoMap = make(map[string]*Info)
87+
DiskNotificationInfoMap = make(map[string]*Info)
8788
LastBackupStatusMap = make(map[string]string)
8889
IsSendBackupStatusMap = make(map[string]string)
8990
DatabaseNotificationInfoMap = make(map[string]*Info)

Diff for: service/exceptionmonitor/helper/monitor/database_monitor.go

+3-4
Original file line number | Diff line number | Diff line change
@@ -62,7 +62,7 @@ func checkDeletedDatabases() {
6262
// DatabaseClusterUID: databaseClusterUID,
6363
// Namespace: notificationInfo.Namespace,
6464
// DatabaseClusterName: databaseClusterName,
65-
// RecoveryStatus: "Deleted",
65+
// RecoveryStatus: "Deleted",ws
6666
//}
6767
notificationInfo.RecoveryStatus = "Deleted"
6868
notificationInfo.RecoveryTime = time.Now().Format("2006-01-02 15:04:05")
@@ -109,16 +109,15 @@ func processCluster(cluster metav1unstructured.Unstructured) {
109109
getClusterDatabaseInfo(cluster, &notificationInfo)
110110
switch notificationInfo.ExceptionStatus {
111111
case api.StatusRunning, api.StatusStopped:
112-
if _, ok := api.DatabaseNotificationInfoMap[notificationInfo.DatabaseClusterUID]; ok {
113-
recoveryNotificationInfo := api.DatabaseNotificationInfoMap[notificationInfo.DatabaseClusterUID]
112+
if value, ok := api.DatabaseNotificationInfoMap[notificationInfo.DatabaseClusterUID]; ok {
113+
recoveryNotificationInfo := value
114114
recoveryNotificationInfo.RecoveryStatus, recoveryNotificationInfo.RecoveryTime = getClusterDatabaseStatus(cluster, recoveryNotificationInfo)
115115
handleClusterRecovery(recoveryNotificationInfo)
116116
}
117117
case api.StatusDeleting, api.StatusStopping:
118118
// nothing to do
119119
break
120120
case api.StatusUnknown:
121-
//一般都是在新建,应该发到新建的飞书群中
122121
if _, ok := api.DatabaseNotificationInfoMap[notificationInfo.DatabaseClusterUID]; !ok {
123122
api.DatabaseNotificationInfoMap[notificationInfo.DatabaseClusterUID] = &notificationInfo
124123
//api.LastDatabaseClusterStatus[notificationInfo.DatabaseClusterUID] = notificationInfo.ExceptionStatus

Diff for: service/exceptionmonitor/helper/monitor/database_performance_monitor.go

+74-32
Original file line number | Diff line number | Diff line change
@@ -58,17 +58,18 @@ func checkDatabasePerformanceInNamespace(namespace string) error {
5858
func monitorCluster(cluster unstructured.Unstructured) {
5959
notificationInfo := api.Info{}
6060
getClusterDatabaseInfo(cluster, &notificationInfo)
61-
//notificationInfo.DatabaseClusterName, notificationInfo.DatabaseType, notificationInfo.Namespace, notificationInfo.DatabaseClusterUID = cluster.GetName(), cluster.GetLabels()[api.DatabaseTypeLabel], cluster.GetNamespace(), string(cluster.GetUID())
62-
//status, found, err := unstructured.NestedString(cluster.Object, "status", "phase")
63-
//if err != nil || !found {
64-
// log.Printf("Unable to get %s status in ns %s: %v", notificationInfo.DatabaseClusterName, notificationInfo.Namespace, err)
65-
//}
6661
debt, _, _ := checkDebt(notificationInfo.Namespace)
6762
if !debt {
6863
return
6964
}
70-
notificationInfo.NotificationType = notification.ExceptionType
7165
notificationInfo.ExceptionType = "阀值"
66+
if value, ok := api.CPUNotificationInfoMap[notificationInfo.DatabaseClusterUID]; ok {
67+
notificationInfo = *value
68+
} else if value, ok := api.MemNotificationInfoMap[notificationInfo.DatabaseClusterUID]; ok {
69+
notificationInfo = *value
70+
} else if value, ok := api.DiskNotificationInfoMap[notificationInfo.DatabaseClusterUID]; ok {
71+
notificationInfo = *value
72+
}
7273
switch notificationInfo.ExceptionStatus {
7374
case api.StatusDeleting, api.StatusCreating, api.StatusStopping, api.StatusStopped, api.StatusUnknown:
7475
break
@@ -84,58 +85,99 @@ func monitorCluster(cluster unstructured.Unstructured) {
8485

8586
func handleCPUMemMonitor(notificationInfo *api.Info) {
8687
if cpuUsage, err := CPUMemMonitor(notificationInfo, "cpu"); err == nil {
87-
processUsage(cpuUsage, api.DatabaseCPUMonitorThreshold, "CPU", notificationInfo, api.CPUMonitorNamespaceMap)
88+
processUsage(cpuUsage, api.DatabaseCPUMonitorThreshold, api.CPUChinese, notificationInfo)
8889
} else {
8990
log.Printf("Failed to monitor CPU: %v", err)
9091
}
9192
if memUsage, err := CPUMemMonitor(notificationInfo, "memory"); err == nil {
92-
processUsage(memUsage, api.DatabaseMemMonitorThreshold, "内存", notificationInfo, api.MemMonitorNamespaceMap)
93+
processUsage(memUsage, api.DatabaseMemMonitorThreshold, api.MemoryChinese, notificationInfo)
9394
} else {
9495
log.Printf("Failed to monitor Memory: %v", err)
9596
}
9697
}
9798

9899
func handleDiskMonitor(notificationInfo *api.Info) {
99100
if maxUsage, err := checkPerformance(notificationInfo, "disk"); err == nil {
100-
processUsage(maxUsage, api.DatabaseDiskMonitorThreshold, "磁盘", notificationInfo, api.DiskMonitorNamespaceMap)
101+
processUsage(maxUsage, api.DatabaseDiskMonitorThreshold, api.DiskChinese, notificationInfo)
101102
} else {
102103
log.Printf("Failed to monitor Disk: %v", err)
103104
}
104105
}
105106

106-
func processUsage(usage float64, threshold float64, performanceType string, notificationInfo *api.Info, monitorMap map[string]bool) {
107+
func processUsage(usage float64, threshold float64, performanceType string, notificationInfo *api.Info) {
107108
notificationInfo.PerformanceType = performanceType
108109
usageStr := strconv.FormatFloat(usage, 'f', 2, 64)
109-
if performanceType == "CPU" {
110+
if notificationInfo.PerformanceType == api.CPUChinese {
110111
notificationInfo.CPUUsage = usageStr
111-
} else if performanceType == "内存" {
112+
} else if performanceType == api.MemoryChinese {
112113
notificationInfo.MemUsage = usageStr
113-
} else if performanceType == "磁盘" {
114+
} else if performanceType == api.DiskChinese {
114115
notificationInfo.DiskUsage = usageStr
115116
}
116-
if usage >= threshold && !monitorMap[notificationInfo.DatabaseClusterUID] {
117-
alertMessage := notification.GetNotificationMessage(notificationInfo)
118-
notificationInfo.FeishuWebHook = api.FeishuWebhookURLMap["FeishuWebhookURLImportant"]
119-
if err := notification.SendFeishuNotification(notificationInfo, alertMessage); err != nil {
120-
log.Printf("Failed to send notification: %v", err)
117+
if usage >= threshold {
118+
if _, ok := api.CPUNotificationInfoMap[notificationInfo.DatabaseClusterUID]; !ok && notificationInfo.PerformanceType == api.CPUChinese {
119+
processException(notificationInfo, threshold)
120+
}
121+
if _, ok := api.MemNotificationInfoMap[notificationInfo.DatabaseClusterUID]; !ok && notificationInfo.PerformanceType == api.MemoryChinese {
122+
processException(notificationInfo, threshold)
121123
}
122-
monitorMap[notificationInfo.DatabaseClusterUID] = true
123-
if performanceType != "磁盘" {
124-
return
124+
if _, ok := api.DiskNotificationInfoMap[notificationInfo.DatabaseClusterUID]; !ok && notificationInfo.PerformanceType == api.DiskChinese {
125+
processException(notificationInfo, threshold)
125126
}
126-
ZNThreshold := NumberToChinese(int(threshold))
127-
if err := notification.SendToSms(notificationInfo, api.ClusterName, "数据库"+performanceType+"超过百分之"+ZNThreshold); err != nil {
128-
log.Printf("Failed to send Sms: %v", err)
127+
} else if usage < threshold {
128+
if _, ok := api.CPUNotificationInfoMap[notificationInfo.DatabaseClusterUID]; ok && notificationInfo.PerformanceType == api.CPUChinese {
129+
processRecovery(notificationInfo)
129130
}
130-
} else if usage < threshold && monitorMap[notificationInfo.DatabaseClusterUID] {
131-
notificationInfo.NotificationType = "recovery"
132-
notificationInfo.RecoveryTime = time.Now().Add(8 * time.Hour).Format("2006-01-02 15:04:05")
133-
alertMessage := notification.GetNotificationMessage(notificationInfo)
134-
notificationInfo.FeishuWebHook = api.FeishuWebhookURLMap["FeishuWebhookURLImportant"]
135-
if err := notification.SendFeishuNotification(notificationInfo, alertMessage); err != nil {
136-
log.Printf("Failed to send notification: %v", err)
131+
if _, ok := api.MemNotificationInfoMap[notificationInfo.DatabaseClusterUID]; ok && notificationInfo.PerformanceType == api.MemoryChinese {
132+
processRecovery(notificationInfo)
137133
}
138-
delete(monitorMap, notificationInfo.DatabaseClusterUID)
134+
if _, ok := api.DiskNotificationInfoMap[notificationInfo.DatabaseClusterUID]; ok && notificationInfo.PerformanceType == api.DiskChinese {
135+
processRecovery(notificationInfo)
136+
}
137+
}
138+
}
139+
140+
func processException(notificationInfo *api.Info, threshold float64) {
141+
notificationInfo.NotificationType = notification.ExceptionType
142+
alertMessage := notification.GetNotificationMessage(notificationInfo)
143+
notificationInfo.FeishuWebHook = api.FeishuWebhookURLMap["FeishuWebhookURLImportant"]
144+
if err := notification.SendFeishuNotification(notificationInfo, alertMessage); err != nil {
145+
log.Printf("Failed to send notification: %v", err)
146+
}
147+
if notificationInfo.PerformanceType == api.CPUChinese {
148+
api.CPUNotificationInfoMap[notificationInfo.DatabaseClusterUID] = notificationInfo
149+
return
150+
}
151+
if notificationInfo.PerformanceType == api.MemoryChinese {
152+
api.MemNotificationInfoMap[notificationInfo.DatabaseClusterUID] = notificationInfo
153+
return
154+
}
155+
if notificationInfo.PerformanceType == api.DiskChinese {
156+
api.DiskNotificationInfoMap[notificationInfo.DatabaseClusterUID] = notificationInfo
157+
}
158+
ZNThreshold := NumberToChinese(int(threshold))
159+
if err := notification.SendToSms(notificationInfo, api.ClusterName, "数据库"+notificationInfo.PerformanceType+"超过百分之"+ZNThreshold); err != nil {
160+
log.Printf("Failed to send Sms: %v", err)
161+
}
162+
}
163+
164+
func processRecovery(notificationInfo *api.Info) {
165+
notificationInfo.NotificationType = "recovery"
166+
notificationInfo.RecoveryStatus = notificationInfo.ExceptionStatus
167+
notificationInfo.RecoveryTime = time.Now().Add(8 * time.Hour).Format("2006-01-02 15:04:05")
168+
alertMessage := notification.GetNotificationMessage(notificationInfo)
169+
notificationInfo.FeishuWebHook = api.FeishuWebhookURLMap["FeishuWebhookURLImportant"]
170+
if err := notification.SendFeishuNotification(notificationInfo, alertMessage); err != nil {
171+
log.Printf("Failed to send notification: %v", err)
172+
}
173+
if notificationInfo.PerformanceType == api.CPUChinese {
174+
delete(api.CPUNotificationInfoMap, notificationInfo.DatabaseClusterUID)
175+
}
176+
if notificationInfo.PerformanceType == api.MemoryChinese {
177+
delete(api.MemNotificationInfoMap, notificationInfo.DatabaseClusterUID)
178+
}
179+
if notificationInfo.PerformanceType == api.DiskChinese {
180+
delete(api.DiskNotificationInfoMap, notificationInfo.DatabaseClusterUID)
139181
}
140182
}
141183

Diff for: service/exceptionmonitor/helper/monitor/quota_monitor.go

+14-5
Original file line number | Diff line number | Diff line change
@@ -16,18 +16,27 @@ import (
1616

1717
func QuotaMonitor() {
1818
for api.QuotaMonitor {
19-
if err := checkQuota(); err != nil {
19+
if err := checkQuota(api.ClusterNS); err != nil {
2020
log.Printf("Failed to check qouta: %v", err)
2121
}
2222
time.Sleep(3 * time.Hour)
2323
}
2424
}
2525

26-
func checkQuota() error {
27-
namespaceList, _ := api.ClientSet.CoreV1().Namespaces().List(context.Background(), metav1.ListOptions{})
26+
func checkQuota(namespaces []string) error {
27+
var namespaceList []v1.Namespace
2828

29-
fmt.Println(len(namespaceList.Items))
30-
for _, ns := range namespaceList.Items {
29+
// Fetch namespaces based on MonitorType
30+
if api.MonitorType == api.MonitorTypeALL {
31+
namespaces, _ := api.ClientSet.CoreV1().Namespaces().List(context.Background(), metav1.ListOptions{})
32+
namespaceList = namespaces.Items
33+
} else {
34+
for _, ns := range namespaces {
35+
namespace, _ := api.ClientSet.CoreV1().Namespaces().Get(context.Background(), ns, metav1.GetOptions{})
36+
namespaceList = append(namespaceList, *namespace)
37+
}
38+
}
39+
for _, ns := range namespaceList {
3140
if !strings.Contains(ns.Name, "ns-") {
3241
continue
3342
}

Diff for: service/exceptionmonitor/helper/notification/feishu.go

+9-18
Original file line number | Diff line number | Diff line change
@@ -75,15 +75,14 @@ func GetNotificationMessage(notificationInfo *api.Info) string {
7575
headerTemplate := "red"
7676
titleContent := "数据库" + notificationInfo.ExceptionType + "告警"
7777
usage := ""
78-
if notificationInfo.PerformanceType == "CPU" {
78+
if notificationInfo.PerformanceType == api.CPUChinese {
7979
usage = notificationInfo.CPUUsage
80-
} else if notificationInfo.PerformanceType == "内存" {
80+
} else if notificationInfo.PerformanceType == api.MemoryChinese {
8181
usage = notificationInfo.MemUsage
82-
} else if notificationInfo.PerformanceType == "磁盘" {
82+
} else if notificationInfo.PerformanceType == api.DiskChinese {
8383
usage = notificationInfo.DiskUsage
8484
}
8585

86-
//公共部分,状态和阀值的异常、恢复过程都需要,需要判断是否首次发送信息,是的话,就用这里,不是的话,就跳过(在之前的内容上追加)
8786
commonElements := []map[string]interface{}{
8887
{
8988
"tag": "div",
@@ -117,7 +116,6 @@ func GetNotificationMessage(notificationInfo *api.Info) string {
117116

118117
if notificationInfo.NotificationType == ExceptionType && notificationInfo.ExceptionType == "状态" {
119118
exceptionElements := []map[string]interface{}{
120-
//这个异常时间需要给值
121119
{
122120
"tag": "div",
123121
"text": map[string]string{
@@ -147,7 +145,7 @@ func GetNotificationMessage(notificationInfo *api.Info) string {
147145
},
148146
}
149147
notificationInfo.FeishuInfo = append(commonElements, exceptionElements...)
150-
} else if notificationInfo.ExceptionType == "阀值" {
148+
} else if notificationInfo.NotificationType == ExceptionType && notificationInfo.ExceptionType == "阀值" {
151149
exceptionElements := []map[string]interface{}{
152150
{
153151
"tag": "div",
@@ -161,24 +159,21 @@ func GetNotificationMessage(notificationInfo *api.Info) string {
161159
}
162160

163161
if notificationInfo.NotificationType == "recovery" {
164-
// todo 拿到之前的发送信息并加上,已做状态监控,未做阀值监控
165162
headerTemplate = "blue"
166163
titleContent = "数据库" + notificationInfo.ExceptionType + "恢复通知"
167164

168-
//获取之前发送的飞书内容
169165
separatorElements := []map[string]interface{}{
170166
{
171167
"tag": "div",
172168
"text": map[string]string{
173-
"content": "-------------------------------------------",
169+
"content": "-------------------------------------数据库恢复信息-------------------------------------",
174170
"tag": "lark_md",
175171
},
176172
},
177173
}
178174
notificationInfo.FeishuInfo = append(notificationInfo.FeishuInfo, separatorElements...)
179-
//elements = commonElements
175+
180176
if notificationInfo.ExceptionType == "阀值" {
181-
//todo 数据库阀值的恢复时间怎么跟其它统一起来,需要在数据库阀值恢复中增加恢复时间
182177
usageRecoveryElements := []map[string]interface{}{
183178
{
184179
"tag": "div",
@@ -212,7 +207,6 @@ func GetNotificationMessage(notificationInfo *api.Info) string {
212207
"config": map[string]bool{
213208
"wide_screen_mode": true,
214209
},
215-
//elements替换成notificationInfo.FeishuInfo
216210
"elements": notificationInfo.FeishuInfo,
217211
"header": map[string]interface{}{
218212
"template": headerTemplate,
@@ -253,11 +247,11 @@ func SendFeishuNotification(notification *api.Info, message string) error {
253247

254248
func getMessageIDMap(performanceType string) map[string]string {
255249
switch performanceType {
256-
case "磁盘":
250+
case api.DiskChinese:
257251
return api.DatabaseDiskMessageIDMap
258-
case "内存":
252+
case api.MemoryChinese:
259253
return api.DatabaseMemMessageIDMap
260-
case "CPU":
254+
case api.CPUChinese:
261255
return api.DatabaseCPUMessageIDMap
262256
case "Backup":
263257
return api.DatabaseBackupMessageIDMap
@@ -273,8 +267,6 @@ func updateFeishuNotification(messageID, message string) error {
273267
MessageId(messageID).
274268
Body(larkim.NewPatchMessageReqBodyBuilder().
275269
Content(message).Build()).Build()
276-
277-
fmt.Println(messageID)
278270
resp, err := feiShuClient.Im.Message.Patch(context.Background(), req)
279271
if err != nil {
280272
log.Println("Error:", err)
@@ -318,7 +310,6 @@ func createFeishuNotification(notification *api.Info, message string, messageIDM
318310
} else {
319311
messageIDMap[notification.DatabaseClusterName] = messageID
320312
}
321-
fmt.Println(messageIDMap)
322313
return nil
323314
}
324315

0 commit comments

Comments
 (0)