Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add monitor info #5267

Merged
merged 27 commits into from
Dec 6, 2024
Merged
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
ccbe98c
add database feishu info and cockroach monitor
wallyxjh Dec 5, 2024
3b7ec2a
add database feishu info and cockroach monitor
wallyxjh Dec 5, 2024
8facde0
Merge branch 'main' into add-monitor-info
wallyxjh Dec 5, 2024
4714991
add monitor info
wallyxjh Dec 5, 2024
c3fb626
Merge remote-tracking branch 'origin/add-monitor-info' into add-monit…
wallyxjh Dec 5, 2024
c969613
add-monitor-info
wallyxjh Dec 5, 2024
e257b60
Merge remote-tracking branch 'origin/add-monitor-info' into add-monit…
wallyxjh Dec 5, 2024
30b9e6d
add-monitor-info
wallyxjh Dec 5, 2024
75935b8
Merge remote-tracking branch 'origin/add-monitor-info' into add-monit…
wallyxjh Dec 5, 2024
967e639
add-monitor-info
wallyxjh Dec 5, 2024
80130e9
Merge remote-tracking branch 'origin/add-monitor-info' into add-monit…
wallyxjh Dec 5, 2024
a53dfc2
Merge remote-tracking branch 'origin/add-monitor-info' into add-monit…
wallyxjh Dec 5, 2024
49ccf23
Merge remote-tracking branch 'origin/add-monitor-info' into add-monit…
wallyxjh Dec 6, 2024
43c5053
Merge remote-tracking branch 'origin/add-monitor-info' into add-monit…
wallyxjh Dec 6, 2024
18d5b45
Merge remote-tracking branch 'origin/add-monitor-info' into add-monit…
wallyxjh Dec 6, 2024
1e8ba36
Merge remote-tracking branch 'origin/add-monitor-info' into add-monit…
wallyxjh Dec 6, 2024
f6268e0
Merge remote-tracking branch 'origin/add-monitor-info' into add-monit…
wallyxjh Dec 6, 2024
b2a9180
Merge remote-tracking branch 'origin/add-monitor-info' into add-monit…
wallyxjh Dec 6, 2024
eb40546
Merge remote-tracking branch 'origin/add-monitor-info' into add-monit…
wallyxjh Dec 6, 2024
4743e28
Merge remote-tracking branch 'origin/add-monitor-info' into add-monit…
wallyxjh Dec 6, 2024
f76400d
Merge remote-tracking branch 'origin/add-monitor-info' into add-monit…
wallyxjh Dec 6, 2024
3e41405
Merge remote-tracking branch 'origin/add-monitor-info' into add-monit…
wallyxjh Dec 6, 2024
954fce4
Merge remote-tracking branch 'origin/add-monitor-info' into add-monit…
wallyxjh Dec 6, 2024
1b52bee
Merge remote-tracking branch 'origin/add-monitor-info' into add-monit…
wallyxjh Dec 6, 2024
8eaa16c
Merge remote-tracking branch 'origin/add-monitor-info' into add-monit…
wallyxjh Dec 6, 2024
3822bcc
Merge remote-tracking branch 'origin/add-monitor-info' into add-monit…
wallyxjh Dec 6, 2024
22bc50d
Merge remote-tracking branch 'origin/add-monitor-info' into add-monit…
wallyxjh Dec 6, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 55 additions & 9 deletions service/exceptionmonitor/api/api.go
Original file line number Diff line number Diff line change
@@ -22,33 +22,71 @@ type QueryResult struct {
} `json:"data"`
}

type Info struct {
// lastStatus、recoveryStatus、lastStatusTime、recoveryStatusTime、lastStatusInfo、recoveryStatusInfo
//todo 是否应该分几个状态,是否有状态不正确的地方
DatabaseClusterName string
Namespace string
DebtLevel string
DatabaseType string
Events string
Reason string
NotificationType string
DiskUsage string
CPUUsage string
MemUsage string
PerformanceType string
ExceptionType string
ExceptionStatus string
RecoveryStatus string
ExceptionStatusTime string
RecoveryTime string
DatabaseClusterUID string
FeishuWebHook string
//struct
FeishuInfo []map[string]interface{}
}

type NameSpaceQuota struct {
NameSpace string
CPULimit string
MemLimit string
GPULimit string
EphemeralStorageLimit string
ObjectStorageLimit string
NodePortLimit string
StorageLimit string
CPUUsage string
MemUsage string
GPUUsage string
EphemeralStorageUsage string
ObjectStorageUsage string
NodePortUsage string
StorageUsage string
}

const (
StatusDeleting = "Deleting"
StatusCreating = "Creating"
StatusStopping = "Stopping"
StatusStopped = "Stopped"
StatusRunning = "Running"
StatusUpdating = "Updating"
//StatusUpdating = "Updating"
StatusUnknown = ""
MonitorTypeALL = "all"
)

var (
ClientSet *kubernetes.Clientset
DynamicClient *dynamic.DynamicClient
// records the last database status
LastDatabaseClusterStatus = make(map[string]string)
// record the debt ns
ExceptionDatabaseMap = make(map[string]bool)
FeishuWebHookMap = make(map[string]string)
ClientSet *kubernetes.Clientset
DynamicClient *dynamic.DynamicClient
DebtNamespaceMap = make(map[string]bool)
DiskFullNamespaceMap = make(map[string]bool)
DiskMonitorNamespaceMap = make(map[string]bool)
CPUMonitorNamespaceMap = make(map[string]bool)
MemMonitorNamespaceMap = make(map[string]bool)
LastBackupStatusMap = make(map[string]string)
IsSendBackupStatusMap = make(map[string]string)
DatabaseNamespaceMap = make(map[string]string)
DatabaseNotificationInfoMap = make(map[string]*Info)
ExceededQuotaException = "exceeded quota"
DiskException = "Writing to log file failed"
OwnerLabel = "user.sealos.io/owner"
@@ -65,13 +103,16 @@ var (
CPUMemMonitor bool
BackupMonitor bool
QuotaMonitor bool
CockroachMonitor bool
DatabaseDiskMonitorThreshold float64
DatabaseExceptionMonitorThreshold float64
DatabaseCPUMonitorThreshold float64
DatabaseMemMonitorThreshold float64
QuotaThreshold float64
APPID string
APPSECRET string
GlobalCockroachURI string
LocalCockroachURI string
DatabaseStatusMessageIDMap = make(map[string]string)
DatabaseDiskMessageIDMap = make(map[string]string)
DatabaseCPUMessageIDMap = make(map[string]string)
@@ -90,11 +131,14 @@ func GetENV() error {
MonitorType = getEnvWithCheck("MonitorType", &missingEnvVars)
clusterNS := getEnvWithCheck("ClusterNS", &missingEnvVars)
LOCALREGION = getEnvWithCheck("LOCALREGION", &missingEnvVars)
GlobalCockroachURI = getEnvWithCheck("GlobalCockroachURI", &missingEnvVars)
LocalCockroachURI = getEnvWithCheck("LocalCockroachURI", &missingEnvVars)
DatabaseMonitor, _ = strconv.ParseBool(getEnvWithCheck("DatabaseMonitor", &missingEnvVars))
DiskMonitor, _ = strconv.ParseBool(getEnvWithCheck("DiskMonitor", &missingEnvVars))
CPUMemMonitor, _ = strconv.ParseBool(getEnvWithCheck("CPUMemMonitor", &missingEnvVars))
BackupMonitor, _ = strconv.ParseBool(getEnvWithCheck("BackupMonitor", &missingEnvVars))
QuotaMonitor, _ = strconv.ParseBool(getEnvWithCheck("QuotaMonitor", &missingEnvVars))
CockroachMonitor, _ = strconv.ParseBool(getEnvWithCheck("CockroachMonitor", &missingEnvVars))
DatabaseDiskMonitorThreshold, _ = strconv.ParseFloat(getEnvWithCheck("DatabaseDiskMonitorThreshold", &missingEnvVars), 64)
DatabaseExceptionMonitorThreshold, _ = strconv.ParseFloat(getEnvWithCheck("DatabaseExceptionMonitorThreshold", &missingEnvVars), 64)
DatabaseCPUMonitorThreshold, _ = strconv.ParseFloat(getEnvWithCheck("DatabaseCPUMonitorThreshold", &missingEnvVars), 64)
@@ -119,6 +163,8 @@ func GetENV() error {
"FeishuWebhookURLBackup",
//Quota
"FeishuWebhookURLQuota",
//CockroachDB
"FeishuWebhookURLCockroachDB",
}, FeishuWebhookURLMap, &missingEnvVars)

// Get ClusterRegionMap
3 changes: 1 addition & 2 deletions service/exceptionmonitor/dao/init.go
Original file line number Diff line number Diff line change
@@ -3,9 +3,8 @@ package dao
import (
"os"

"github.com/labring/sealos/service/exceptionmonitor/api"

"github.com/labring/sealos/controllers/pkg/database/cockroach"
"github.com/labring/sealos/service/exceptionmonitor/api"
)

var (
7 changes: 4 additions & 3 deletions service/exceptionmonitor/go.mod
Original file line number Diff line number Diff line change
@@ -31,7 +31,7 @@ require (
github.com/golang/protobuf v1.5.4 // indirect
github.com/google/gnostic-models v0.6.8 // indirect
github.com/google/gofuzz v1.2.0 // indirect
github.com/google/uuid v1.3.0 // indirect
github.com/google/uuid v1.6.0 // indirect
github.com/imdario/mergo v0.3.16 // indirect
github.com/jackc/pgpassfile v1.0.0 // indirect
github.com/jackc/pgservicefile v0.0.0-20221227161230-091c0ba34f0a // indirect
@@ -41,6 +41,7 @@ require (
github.com/josharian/intern v1.0.0 // indirect
github.com/json-iterator/go v1.1.12 // indirect
github.com/larksuite/oapi-sdk-go/v3 v3.2.9 // indirect
github.com/lib/pq v1.10.9 // indirect
github.com/mailru/easyjson v0.7.7 // indirect
github.com/matoous/go-nanoid/v2 v2.0.0 // indirect
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
@@ -51,7 +52,7 @@ require (
go.mongodb.org/mongo-driver v1.12.1 // indirect
golang.org/x/crypto v0.21.0 // indirect
golang.org/x/net v0.23.0 // indirect
golang.org/x/oauth2 v0.10.0 // indirect
golang.org/x/oauth2 v0.12.0 // indirect
golang.org/x/sys v0.18.0 // indirect
golang.org/x/term v0.18.0 // indirect
golang.org/x/text v0.14.0 // indirect
@@ -70,7 +71,7 @@ require (
k8s.io/utils v0.0.0-20230726121419-3b25d923346b // indirect
sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd // indirect
sigs.k8s.io/structured-merge-diff/v4 v4.4.1 // indirect
sigs.k8s.io/yaml v1.3.0 // indirect
sigs.k8s.io/yaml v1.4.0 // indirect
)

replace (
2 changes: 2 additions & 0 deletions service/exceptionmonitor/go.sum
Original file line number Diff line number Diff line change
@@ -99,6 +99,8 @@ github.com/larksuite/oapi-sdk-go v1.1.48 h1:RHRr5LW68AibBzXVRXObUpkbS6TXapl4TAyh
github.com/larksuite/oapi-sdk-go v1.1.48/go.mod h1:7ybKAbVdKBjXuX0YrMTfnWUyCaIe/zeI1wqjNfN9XOk=
github.com/larksuite/oapi-sdk-go/v3 v3.2.9 h1:9zQAGrzhibNwdaGRkWUP1cAd2k2dJJDpbSffcfK0wPw=
github.com/larksuite/oapi-sdk-go/v3 v3.2.9/go.mod h1:ZEplY+kwuIrj/nqw5uSCINNATcH3KdxSN7y+UxYY5fI=
github.com/lib/pq v1.10.9 h1:YXG7RB+JIjhP29X+OtkiDnYaXQwpS4JEWq7dtCCRUEw=
github.com/lib/pq v1.10.9/go.mod h1:AlVN5x4E4T544tWzH6hKfbfQvm3HdbOxrmggDNAPY9o=
github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0=
github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc=
github.com/matoous/go-nanoid v1.5.0/go.mod h1:zyD2a71IubI24efhpvkJz+ZwfwagzgSO6UNiFsZKN7U=
54 changes: 54 additions & 0 deletions service/exceptionmonitor/helper/monitor/cockroachdb_monitor.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
package monitor

import (
"fmt"
"log"
"time"

"github.com/labring/sealos/service/exceptionmonitor/api"
"github.com/labring/sealos/service/exceptionmonitor/helper/notification"
"gorm.io/driver/postgres"
"gorm.io/gorm"
"gorm.io/gorm/logger"
)

func CockroachMonitor() {
for api.CockroachMonitor {
notificationInfo := &api.Info{
FeishuWebHook: api.FeishuWebhookURLMap["FeishuWebhookURLCockroachDB"],
}
monitorCockroachDB(api.GlobalCockroachURI, "Global", notificationInfo)
monitorCockroachDB(api.LocalCockroachURI, "Local", notificationInfo)

time.Sleep(5 * time.Minute)
}
}

func monitorCockroachDB(uri, label string, notificationInfo *api.Info) {
if err := checkCockroachDB(uri); err != nil {
message := notification.GetCockroachMessage(err.Error(), label)
if sendErr := notification.SendFeishuNotification(notificationInfo, message); sendErr != nil {
log.Printf("Failed to send Feishu notification for %s: %v", label, sendErr)
}
}
}

func checkCockroachDB(CockroachConnection string) error {
db, err := gorm.Open(postgres.Open(CockroachConnection), &gorm.Config{
Logger: logger.Discard,
})
if err != nil {
return fmt.Errorf("failed to connect to CockroachDB: %v", err)
}

sqlDB, err := db.DB()
if err != nil {
return fmt.Errorf("failed to get database instance: %v", err)
}
defer sqlDB.Close()

if err := sqlDB.Ping(); err != nil {
return fmt.Errorf("failed to ping CockroachDB: %v", err)
}
return nil
}
Original file line number Diff line number Diff line change
@@ -90,17 +90,18 @@ func processBackup(backup unstructured.Unstructured) {
}

func SendBackupNotification(backupName, namespace, status, startTimestamp string) {
notificationInfo := notification.Info{
notificationInfo := api.Info{
DatabaseClusterName: backupName,
Namespace: namespace,
Status: status,
ExceptionStatus: status,
ExceptionType: "备份",
PerformanceType: "Backup",
NotificationType: "exception",
NotificationType: notification.ExceptionType,
FeishuWebHook: api.FeishuWebhookURLMap["FeishuWebhookURLBackup"],
}
if _, ok := api.LastBackupStatusMap[backupName]; !ok {
message := notification.GetBackupMessage("exception", namespace, backupName, status, startTimestamp, "")
if err := notification.SendFeishuNotification(notificationInfo, message, api.FeishuWebhookURLMap["FeishuWebhookURLBackup"]); err != nil {
message := notification.GetBackupMessage(notification.ExceptionType, namespace, backupName, status, startTimestamp, "")
if err := notification.SendFeishuNotification(&notificationInfo, message); err != nil {
log.Printf("Error sending exception notification:%v", err)
}
api.LastBackupStatusMap[backupName] = status
233 changes: 131 additions & 102 deletions service/exceptionmonitor/helper/monitor/database_monitor.go

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -56,91 +56,91 @@ func checkDatabasePerformanceInNamespace(namespace string) error {
}

func monitorCluster(cluster unstructured.Unstructured) {
databaseClusterName, databaseType, namespace, UID := cluster.GetName(), cluster.GetLabels()[api.DatabaseTypeLabel], cluster.GetNamespace(), string(cluster.GetUID())
status, found, err := unstructured.NestedString(cluster.Object, "status", "phase")
if err != nil || !found {
log.Printf("Unable to get %s status in ns %s: %v", databaseClusterName, namespace, err)
}
debt, _, _ := checkDebt(namespace)
notificationInfo := api.Info{}
getClusterDatabaseInfo(cluster, &notificationInfo)
//notificationInfo.DatabaseClusterName, notificationInfo.DatabaseType, notificationInfo.Namespace, notificationInfo.DatabaseClusterUID = cluster.GetName(), cluster.GetLabels()[api.DatabaseTypeLabel], cluster.GetNamespace(), string(cluster.GetUID())
//status, found, err := unstructured.NestedString(cluster.Object, "status", "phase")
//if err != nil || !found {
// log.Printf("Unable to get %s status in ns %s: %v", notificationInfo.DatabaseClusterName, notificationInfo.Namespace, err)
//}
debt, _, _ := checkDebt(notificationInfo.Namespace)
if !debt {
return
}
info := notification.Info{
DatabaseClusterName: databaseClusterName,
Namespace: namespace,
Status: status,
NotificationType: "exception",
ExceptionType: "阀值",
}
switch status {
notificationInfo.NotificationType = notification.ExceptionType
notificationInfo.ExceptionType = "阀值"
switch notificationInfo.ExceptionStatus {
case api.StatusDeleting, api.StatusCreating, api.StatusStopping, api.StatusStopped, api.StatusUnknown:
break
default:
if api.CPUMemMonitor {
handleCPUMemMonitor(namespace, databaseClusterName, databaseType, UID, info)
handleCPUMemMonitor(&notificationInfo)
}
if api.DiskMonitor {
handleDiskMonitor(namespace, databaseClusterName, databaseType, UID, info)
handleDiskMonitor(&notificationInfo)
}
}
}

func handleCPUMemMonitor(namespace, databaseClusterName, databaseType, UID string, info notification.Info) {
if cpuUsage, err := CPUMemMonitor(namespace, databaseClusterName, databaseType, "cpu"); err == nil {
processUsage(cpuUsage, api.DatabaseCPUMonitorThreshold, "CPU", UID, info, api.CPUMonitorNamespaceMap)
func handleCPUMemMonitor(notificationInfo *api.Info) {
if cpuUsage, err := CPUMemMonitor(notificationInfo, "cpu"); err == nil {
processUsage(cpuUsage, api.DatabaseCPUMonitorThreshold, "CPU", notificationInfo, api.CPUMonitorNamespaceMap)
} else {
log.Printf("Failed to monitor CPU: %v", err)
}
if memUsage, err := CPUMemMonitor(namespace, databaseClusterName, databaseType, "memory"); err == nil {
processUsage(memUsage, api.DatabaseMemMonitorThreshold, "内存", UID, info, api.MemMonitorNamespaceMap)
if memUsage, err := CPUMemMonitor(notificationInfo, "memory"); err == nil {
processUsage(memUsage, api.DatabaseMemMonitorThreshold, "内存", notificationInfo, api.MemMonitorNamespaceMap)
} else {
log.Printf("Failed to monitor Memory: %v", err)
}
}

func handleDiskMonitor(namespace, databaseClusterName, databaseType, UID string, info notification.Info) {
if maxUsage, err := checkPerformance(namespace, databaseClusterName, databaseType, "disk"); err == nil {
processUsage(maxUsage, api.DatabaseDiskMonitorThreshold, "磁盘", UID, info, api.DiskMonitorNamespaceMap)
func handleDiskMonitor(notificationInfo *api.Info) {
if maxUsage, err := checkPerformance(notificationInfo, "disk"); err == nil {
processUsage(maxUsage, api.DatabaseDiskMonitorThreshold, "磁盘", notificationInfo, api.DiskMonitorNamespaceMap)
} else {
log.Printf("Failed to monitor Disk: %v", err)
}
}

func processUsage(usage float64, threshold float64, performanceType, UID string, info notification.Info, monitorMap map[string]bool) {
info.PerformanceType = performanceType
func processUsage(usage float64, threshold float64, performanceType string, notificationInfo *api.Info, monitorMap map[string]bool) {
notificationInfo.PerformanceType = performanceType
usageStr := strconv.FormatFloat(usage, 'f', 2, 64)
if performanceType == "CPU" {
info.CPUUsage = usageStr
notificationInfo.CPUUsage = usageStr
} else if performanceType == "内存" {
info.MemUsage = usageStr
notificationInfo.MemUsage = usageStr
} else if performanceType == "磁盘" {
info.DiskUsage = usageStr
notificationInfo.DiskUsage = usageStr
}
if usage >= threshold && !monitorMap[UID] {
alertMessage := notification.GetNotificationMessage(info)
if err := notification.SendFeishuNotification(info, alertMessage, api.FeishuWebhookURLMap["FeishuWebhookURLImportant"]); err != nil {
if usage >= threshold && !monitorMap[notificationInfo.DatabaseClusterUID] {
alertMessage := notification.GetNotificationMessage(notificationInfo)
notificationInfo.FeishuWebHook = api.FeishuWebhookURLMap["FeishuWebhookURLImportant"]
if err := notification.SendFeishuNotification(notificationInfo, alertMessage); err != nil {
log.Printf("Failed to send notification: %v", err)
}
monitorMap[UID] = true
monitorMap[notificationInfo.DatabaseClusterUID] = true
if performanceType != "磁盘" {
return
}
ZNThreshold := NumberToChinese(int(threshold))
if err := notification.SendToSms(info.Namespace, info.DatabaseClusterName, api.ClusterName, "数据库"+performanceType+"超过百分之"+ZNThreshold); err != nil {
if err := notification.SendToSms(notificationInfo, api.ClusterName, "数据库"+performanceType+"超过百分之"+ZNThreshold); err != nil {
log.Printf("Failed to send Sms: %v", err)
}
} else if usage < threshold && monitorMap[UID] {
info.NotificationType = "recovery"
alertMessage := notification.GetNotificationMessage(info)
if err := notification.SendFeishuNotification(info, alertMessage, api.FeishuWebhookURLMap["FeishuWebhookURLImportant"]); err != nil {
} else if usage < threshold && monitorMap[notificationInfo.DatabaseClusterUID] {
notificationInfo.NotificationType = "recovery"
notificationInfo.RecoveryTime = time.Now().Add(8 * time.Hour).Format("2006-01-02 15:04:05")
alertMessage := notification.GetNotificationMessage(notificationInfo)
notificationInfo.FeishuWebHook = api.FeishuWebhookURLMap["FeishuWebhookURLImportant"]
if err := notification.SendFeishuNotification(notificationInfo, alertMessage); err != nil {
log.Printf("Failed to send notification: %v", err)
}
delete(monitorMap, UID)
delete(monitorMap, notificationInfo.DatabaseClusterUID)
}
}

func CPUMemMonitor(namespace, databaseClusterName, databaseType, checkType string) (float64, error) {
return checkPerformance(namespace, databaseClusterName, databaseType, checkType)
func CPUMemMonitor(notificationInfo *api.Info, checkType string) (float64, error) {
return checkPerformance(notificationInfo, checkType)
}

func NumberToChinese(num int) string {
10 changes: 5 additions & 5 deletions service/exceptionmonitor/helper/monitor/performance.go
Original file line number Diff line number Diff line change
@@ -15,11 +15,11 @@ import (
"github.com/labring/sealos/service/exceptionmonitor/api"
)

func checkPerformance(namespace, databaseClusterName, databaseType, checkType string) (float64, error) {
func checkPerformance(notificationInfo *api.Info, checkType string) (float64, error) {
params := url.Values{}
params.Add("namespace", namespace)
params.Add("app", databaseClusterName)
params.Add("type", databaseType)
params.Add("namespace", notificationInfo.Namespace)
params.Add("app", notificationInfo.DatabaseClusterName)
params.Add("type", notificationInfo.DatabaseType)
params.Add("query", checkType)

urlStr := api.BaseURL + "?" + params.Encode()
@@ -29,7 +29,7 @@ func checkPerformance(namespace, databaseClusterName, databaseType, checkType st
return 0.0, err
}

kubeconfig, err := getKubeConfig(namespace)
kubeconfig, err := getKubeConfig(notificationInfo.Namespace)
if err != nil {
return 0.0, err
}
11 changes: 6 additions & 5 deletions service/exceptionmonitor/helper/monitor/quota_monitor.go
Original file line number Diff line number Diff line change
@@ -39,26 +39,27 @@ func checkQuota() error {
if len(quotaList.Items) != 1 || quotaList.Items[0].Name != "quota-"+ns.Name {
continue
}
nsQuota := notification.NameSpaceQuota{
nsQuota := api.NameSpaceQuota{
NameSpace: ns.Name,
}
notificationInfo := notification.Info{
notificationInfo := api.Info{
ExceptionType: "Quota",
PerformanceType: "Quota",
NotificationType: "exception",
NotificationType: notification.ExceptionType,
}
send := processQuota(quotaList, &nsQuota)
if send {
message := notification.GetQuotaMessage(&nsQuota)
if err := notification.SendFeishuNotification(notificationInfo, message, api.FeishuWebhookURLMap["FeishuWebhookURLQuota"]); err != nil {
notificationInfo.FeishuWebHook = api.FeishuWebhookURLMap["FeishuWebhookURLQuota"]
if err := notification.SendFeishuNotification(&notificationInfo, message); err != nil {
log.Printf("Error sending exception notification:%v", err)
}
}
}
return nil
}

func processQuota(quotaList *v1.ResourceQuotaList, nsQuota *notification.NameSpaceQuota) bool {
func processQuota(quotaList *v1.ResourceQuotaList, nsQuota *api.NameSpaceQuota) bool {
send := false
for resourceName, hardQuantity := range quotaList.Items[0].Status.Hard {
usedQuantity, exists := quotaList.Items[0].Status.Used[resourceName]
10 changes: 5 additions & 5 deletions service/exceptionmonitor/helper/notification/desktop.go
Original file line number Diff line number Diff line change
@@ -31,7 +31,7 @@ func randString(n int) (string, error) {
return string(b), nil
}

func CreateNotification(namespace, name, status, notificationMessage, zhNotificationMessage string) {
func CreateNotification(notificationInfo *api.Info, notificationMessage, zhNotificationMessage string) {
gvr := schema.GroupVersionResource{
Group: "notification.sealos.io",
Version: "v1",
@@ -40,14 +40,14 @@ func CreateNotification(namespace, name, status, notificationMessage, zhNotifica

randomSuffix, _ := randString(5)
now := time.Now().UTC().Unix()
message := fmt.Sprintf("Because %s , Database %s current status : %s , Please check in time.", notificationMessage, name, status)
zhMessage := fmt.Sprintf("因为 %s , 数据库 %s 当前状态 : %s , 请及时检查.", zhNotificationMessage, name, status)
message := fmt.Sprintf("Because %s , Database %s current status : %s , Please check in time.", notificationMessage, notificationInfo.DatabaseClusterName, notificationInfo.ExceptionStatus)
zhMessage := fmt.Sprintf("因为 %s , 数据库 %s 当前状态 : %s , 请及时检查.", zhNotificationMessage, notificationInfo.DatabaseClusterName, notificationInfo.ExceptionStatus)
notification := &unstructured.Unstructured{
Object: map[string]interface{}{
"apiVersion": "notification.sealos.io/v1",
"kind": "Notification",
"metadata": map[string]interface{}{
"name": "database-exception" + name + randomSuffix,
"name": "database-exception" + notificationInfo.DatabaseClusterName + randomSuffix,
},
"spec": map[string]interface{}{
"title": "Database Exception",
@@ -67,7 +67,7 @@ func CreateNotification(namespace, name, status, notificationMessage, zhNotifica
},
}

_, err := api.DynamicClient.Resource(gvr).Namespace(namespace).Create(context.TODO(), notification, metav1.CreateOptions{})
_, err := api.DynamicClient.Resource(gvr).Namespace(notificationInfo.Namespace).Create(context.TODO(), notification, metav1.CreateOptions{})
if err != nil {
log.Printf("Failed to send desktop notification: %v", err)
}
148 changes: 95 additions & 53 deletions service/exceptionmonitor/helper/notification/feishu.go
Original file line number Diff line number Diff line change
@@ -6,7 +6,6 @@ import (
"fmt"
"log"
"regexp"
"time"

"github.com/labring/sealos/service/exceptionmonitor/api"
lark "github.com/larksuite/oapi-sdk-go/v3"
@@ -20,44 +19,59 @@ var (
feiShuClient *lark.Client
)

type Info struct {
DatabaseClusterName string
Namespace string
Status string
DebtLevel string
Events string
Reason string
NotificationType string
DiskUsage string
CPUUsage string
MemUsage string
PerformanceType string
ExceptionType string
func InitFeishuClient() {
feiShuClient = lark.NewClient(api.APPID, api.APPSECRET)
}

type NameSpaceQuota struct {
NameSpace string
CPULimit string
MemLimit string
GPULimit string
EphemeralStorageLimit string
ObjectStorageLimit string
NodePortLimit string
StorageLimit string
CPUUsage string
MemUsage string
GPUUsage string
EphemeralStorageUsage string
ObjectStorageUsage string
NodePortUsage string
StorageUsage string
}
func GetCockroachMessage(errMessage, cockroachType string) string {
headerTemplate := "red"
titleContent := "小强数据库异常告警"
elements := []map[string]interface{}{
{
"tag": "div",
"text": map[string]string{
"content": fmt.Sprintf("集群环境:%s", api.ClusterName),
"tag": "lark_md",
},
},
{
"tag": "div",
"text": map[string]string{
"content": fmt.Sprintf("数据库类型:%s", cockroachType),
"tag": "lark_md",
},
},
{
"tag": "div",
"text": map[string]string{
"content": fmt.Sprintf("异常信息:%s", errMessage),
"tag": "lark_md",
},
},
}
card := map[string]interface{}{
"config": map[string]bool{
"wide_screen_mode": true,
},
"elements": elements,
"header": map[string]interface{}{
"template": headerTemplate,
"title": map[string]string{
"content": titleContent,
"tag": "plain_text",
},
},
}

func InitFeishuClient() {
feiShuClient = lark.NewClient(api.APPID, api.APPSECRET)
databaseMessage, err := json.Marshal(card)
if err != nil {
fmt.Println("Error marshaling JSON:", err)
return ""
}
return string(databaseMessage)
}

func GetNotificationMessage(notificationInfo Info) string {
func GetNotificationMessage(notificationInfo *api.Info) string {
headerTemplate := "red"
titleContent := "数据库" + notificationInfo.ExceptionType + "告警"
usage := ""
@@ -68,8 +82,8 @@ func GetNotificationMessage(notificationInfo Info) string {
} else if notificationInfo.PerformanceType == "磁盘" {
usage = notificationInfo.DiskUsage
}
var elements []map[string]interface{}

//公共部分,状态和阀值的异常、恢复过程都需要,需要判断是否首次发送信息,是的话,就用这里,不是的话,就跳过(在之前的内容上追加)
commonElements := []map[string]interface{}{
{
"tag": "div",
@@ -95,15 +109,22 @@ func GetNotificationMessage(notificationInfo Info) string {
{
"tag": "div",
"text": map[string]string{
"content": fmt.Sprintf("数据库状态:%s", notificationInfo.Status),
"content": fmt.Sprintf("数据库状态:%s", notificationInfo.ExceptionStatus),
"tag": "lark_md",
},
},
}

if notificationInfo.NotificationType == ExceptionType && notificationInfo.ExceptionType == "状态" {
exceptionElements := []map[string]interface{}{
//这个异常时间需要给值
{
"tag": "div",
"text": map[string]string{
"content": fmt.Sprintf("数据库异常时间:%s", notificationInfo.ExceptionStatusTime),
"tag": "lark_md",
},
}, {
"tag": "div",
"text": map[string]string{
"content": fmt.Sprintf("欠费级别:%s", notificationInfo.DebtLevel),
@@ -125,7 +146,7 @@ func GetNotificationMessage(notificationInfo Info) string {
},
},
}
elements = append(commonElements, exceptionElements...)
notificationInfo.FeishuInfo = append(commonElements, exceptionElements...)
} else if notificationInfo.ExceptionType == "阀值" {
exceptionElements := []map[string]interface{}{
{
@@ -136,16 +157,29 @@ func GetNotificationMessage(notificationInfo Info) string {
},
},
}
elements = append(commonElements, exceptionElements...)
notificationInfo.FeishuInfo = append(commonElements, exceptionElements...)
}

if notificationInfo.NotificationType == "recovery" {
// todo 拿到之前的发送信息并加上,已做状态监控,未做阀值监控
headerTemplate = "blue"
titleContent = "数据库" + notificationInfo.ExceptionType + "恢复通知"

elements = commonElements
//获取之前发送的飞书内容
separatorElements := []map[string]interface{}{
{
"tag": "div",
"text": map[string]string{
"content": "-------------------------------------------",
"tag": "lark_md",
},
},
}
notificationInfo.FeishuInfo = append(notificationInfo.FeishuInfo, separatorElements...)
//elements = commonElements
if notificationInfo.ExceptionType == "阀值" {
exceptionElements := []map[string]interface{}{
//todo 数据库阀值的恢复时间怎么跟其它统一起来,需要在数据库阀值恢复中增加恢复时间
usageRecoveryElements := []map[string]interface{}{
{
"tag": "div",
"text": map[string]string{
@@ -154,24 +188,32 @@ func GetNotificationMessage(notificationInfo Info) string {
},
},
}
elements = append(elements, exceptionElements...)
notificationInfo.FeishuInfo = append(notificationInfo.FeishuInfo, usageRecoveryElements...)
}
exceptionElements := []map[string]interface{}{
recoveryTimeElements := []map[string]interface{}{
{
"tag": "div",
"text": map[string]string{
"content": fmt.Sprintf("数据库状态:%s", notificationInfo.RecoveryStatus),
"tag": "lark_md",
},
},
{
"tag": "div",
"text": map[string]string{
"content": fmt.Sprintf("数据库恢复时间:%s", time.Now().Add(8*time.Hour).Format("2006-01-02 15:04:05")),
"content": fmt.Sprintf("数据库恢复时间:%s", notificationInfo.RecoveryTime),
"tag": "lark_md",
},
},
}
elements = append(elements, exceptionElements...)
notificationInfo.FeishuInfo = append(notificationInfo.FeishuInfo, recoveryTimeElements...)
}
card := map[string]interface{}{
"config": map[string]bool{
"wide_screen_mode": true,
},
"elements": elements,
//elements替换成notificationInfo.FeishuInfo
"elements": notificationInfo.FeishuInfo,
"header": map[string]interface{}{
"template": headerTemplate,
"title": map[string]string{
@@ -189,9 +231,9 @@ func GetNotificationMessage(notificationInfo Info) string {
return string(databaseMessage)
}

func SendFeishuNotification(notification Info, message, feishuWebHook string) error {
func SendFeishuNotification(notification *api.Info, message string) error {
if api.MonitorType != "all" {
feishuWebHook = api.FeishuWebhookURLMap["FeishuWebhookURLImportant"]
notification.FeishuWebHook = api.FeishuWebhookURLMap["FeishuWebhookURLImportant"]
}

messageIDMap := getMessageIDMap(notification.PerformanceType)
@@ -202,7 +244,7 @@ func SendFeishuNotification(notification Info, message, feishuWebHook string) er
}
delete(messageIDMap, notification.DatabaseClusterName)
} else {
if err := createFeishuNotification(notification, message, feishuWebHook, messageIDMap); err != nil {
if err := createFeishuNotification(notification, message, messageIDMap); err != nil {
return err
}
}
@@ -246,11 +288,11 @@ func updateFeishuNotification(messageID, message string) error {
return nil
}

func createFeishuNotification(notification Info, message, feishuWebHook string, messageIDMap map[string]string) error {
func createFeishuNotification(notification *api.Info, message string, messageIDMap map[string]string) error {
req := larkim.NewCreateMessageReqBuilder().
ReceiveIdType("chat_id").
Body(larkim.NewCreateMessageReqBodyBuilder().
ReceiveId(feishuWebHook).
ReceiveId(notification.FeishuWebHook).
MsgType("interactive").
Content(message).Build()).Build()

@@ -317,7 +359,7 @@ func createCard(headerTemplate, headerTitle string, elements []map[string]string
return card
}

func GetQuotaMessage(nsQuota *NameSpaceQuota) string {
func GetQuotaMessage(nsQuota *api.NameSpaceQuota) string {
var card map[string]interface{}
elements := createQuotaElements(nsQuota)
card = createCard("red", "Quota阀值通知", elements)
@@ -330,7 +372,7 @@ func GetQuotaMessage(nsQuota *NameSpaceQuota) string {
return databaseMessage
}

func createQuotaElements(nsQuota *NameSpaceQuota) []map[string]string {
func createQuotaElements(nsQuota *api.NameSpaceQuota) []map[string]string {
elements := []map[string]string{
{"label": "集群环境", "value": api.ClusterName},
{"label": "命名空间", "value": nsQuota.NameSpace},
@@ -339,7 +381,7 @@ func createQuotaElements(nsQuota *NameSpaceQuota) []map[string]string {
return elements
}

func addNonEmptyFieldsToElements(nsQuota *NameSpaceQuota, elements *[]map[string]string) {
func addNonEmptyFieldsToElements(nsQuota *api.NameSpaceQuota, elements *[]map[string]string) {
fields := map[string]string{
"CPULimit": "CPU总量",
"CPUUsage": "CPU使用率",
6 changes: 3 additions & 3 deletions service/exceptionmonitor/helper/notification/sms.go
Original file line number Diff line number Diff line change
@@ -37,13 +37,13 @@ func GetPhoneNumberByNS(owner string) (string, error) {
return phone, nil
}

func SendToSms(namespace, databaseName, clusterName, content string) error {
func SendToSms(notificationInfo *api.Info, clusterName, content string) error {
smsClient, err := utils.CreateSMSClient(os.Getenv("SMSAccessKeyID"), os.Getenv("SMSAccessKeySecret"), os.Getenv("SMSEndpoint"))
if err != nil {
return err
}
name := strings.ReplaceAll(databaseName, "-", "/")
owner, _ := GetNSOwner(namespace)
name := strings.ReplaceAll(notificationInfo.DatabaseClusterName, "-", "/")
owner, _ := GetNSOwner(notificationInfo.Namespace)
phoneNumbers, err := GetPhoneNumberByNS(owner)
if err != nil {
return err
1 change: 1 addition & 0 deletions service/exceptionmonitor/main.go
Original file line number Diff line number Diff line change
@@ -19,6 +19,7 @@ func main() {
go monitor.DatabasePerformanceMonitor()
go monitor.DatabaseBackupMonitor()
go monitor.QuotaMonitor()
go monitor.CockroachMonitor()
select {}
}

103 changes: 103 additions & 0 deletions service/go.work.sum

Large diffs are not rendered by default.