diff --git a/aws-msa-reference/lma/site-values.yaml b/aws-msa-reference/lma/site-values.yaml index 2ae41ec..ba3d923 100644 --- a/aws-msa-reference/lma/site-values.yaml +++ b/aws-msa-reference/lma/site-values.yaml @@ -16,6 +16,8 @@ global: lokiHost: loki-loki-distributed-gateway lokiPort: 80 + lokiuserHost: loki-user-loki-distributed-gateway + lokiuserPort: 80 s3Service: "minio.lma.svc:9000" # depends on $lmaNameSpace (ex. minio.taco-system.svc) lmaNameSpace: lma @@ -148,19 +150,23 @@ charts: - name: taco-loki host: $(lokiHost) port: $(lokiPort) + lokiuser: + - name: taco-loki-user + host: $(lokiuserHost) + port: $(lokiuserPort) targetLogs: - tag: kube.* bufferChunkSize: 2M bufferMaxSize: 5M do_not_store_as_default: false index: container - loki_name: taco-loki + loki_name: taco-loki-user memBufLimit: 20MB multi_index: - index: platform loki_name: taco-loki key: $kubernetes['namespace_name'] - value: kube-system|$(lmaNameSpace)|taco-system|argo + value: kube-system|$(lmaNameSpace)|taco-system|gatekeeper-system|argo parser: docker path: /var/log/containers/*.log type: kubernates @@ -274,6 +280,8 @@ charts: # - --deduplication.replica-label="replica" storegateway.persistence.size: 8Gi ruler.nodeSelector: $(nodeSelector) + ruler.service.type: LoadBalancer + ruler.service.annotations: $(awsNlbAnnotation) ruler.alertmanagers: - http://alertmanager-operated:9093 ruler.persistence.size: 8Gi @@ -283,61 +291,7 @@ charts: rules: - alert: "PrometheusDown" expr: absent(up{prometheus="lma/lma-prometheus"}) - - alert: node-cpu-high-load - annotations: - message: 클러스터({{ $labels.taco_cluster }})의 노드({{ $labels.instance }})의 idle process의 cpu 점유율이 3분 동안 0% 입니다. (현재 사용률 {{$value}}) - description: 워커 노드 CPU가 과부하 상태입니다. 일시적인 서비스 Traffic 증가, Workload의 SW 오류, Server HW Fan Fail등 다양한 원인으로 인해 발생할 수 있습니다. - Checkpoint: 일시적인 Service Traffic의 증가가 관측되지 않았다면, Alert발생 노드에서 실행 되는 pod중 CPU 자원을 많이 점유하는 pod의 설정을 점검해 보시길 제안드립니다. 예를 들어 pod spec의 limit 설정으로 과도한 CPU자원 점유을 막을 수 있습니다. - summary: Cpu resources of the node {{ $labels.instance }} are running low. - discriminative: $labels.taco_cluster, $labels.instance - expr: (avg by (taco_cluster, instance) (rate(node_cpu_seconds_total{mode="idle"}[60s]))) < 0 #0.1 # 진짜 0? - for: 3m - labels: - severity: warning - - alert: node-memory-high-utilization - annotations: - message: 클러스터({{ $labels.taco_cluster }})의 노드({{ $labels.instance }})의 Memory 사용량이 3분동안 80% 를 넘어서고 있습니다. (현재 사용률 {{$value}}) - descriptioon: 워커 노드의 Memory 사용량이 80%를 넘었습니다. 일시적인 서비스 증가 및 SW 오류등 다양한 원인으로 발생할 수 있습니다. - Checkpoint: 일시적인 Service Traffic의 증가가 관측되지 않았다면, Alert발생 노드에서 실행되는 pod중 Memory 사용량이 높은 pod들에 대한 점검을 제안드립니다. - summary: Memory resources of the node {{ $labels.instance }} are running low. - discriminative: $labels.taco_cluster, $labels.instance - expr: (node_memory_MemAvailable_bytes/node_memory_MemTotal_bytes) < 0.2 - for: 3m - labels: - severity: warning - - alert: node-disk-full - annotations: - message: 지난 6시간동안의 추세로 봤을 때, 클러스터({{ $labels.taco_cluster }})의 노드({{ $labels.instance }})의 root 볼륨은 24시간 안에 Disk full이 예상됨 - description: 현재 Disk 사용 추세기준 24시간 내에 Disk 용량이 꽉 찰 것으로 예상됩니다. - Checkpoint: Disk 용량 최적화(삭제 및 Backup)을 수행하시길 권고합니다. 삭제할 내역이 없으면 증설 계획을 수립해 주십시요. - summary: Memory resources of the node {{ $labels.instance }} are running low. 
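# The fluentbit override above adds a second Loki sink ("lokiuser") and switches the
# default container index onto it, while the platform multi_index keeps logs from
# kube-system, lma, taco-system, gatekeeper-system and argo on the platform Loki.
# A minimal sketch of the rendered values, assuming the default host/port globals:
fluentbit:
  outputs:
    loki:
      - name: taco-loki
        host: loki-loki-distributed-gateway
        port: 80
    lokiuser:
      - name: taco-loki-user
        host: loki-user-loki-distributed-gateway
        port: 80
  targetLogs:
    - tag: kube.*
      index: container
      loki_name: taco-loki-user          # user workload logs go to the user-facing Loki
      multi_index:
        - index: platform
          loki_name: taco-loki           # platform namespaces stay on the platform Loki
          key: $kubernetes['namespace_name']
          value: kube-system|lma|taco-system|gatekeeper-system|argo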
- discriminative: $labels.taco_cluster, $labels.instance - expr: predict_linear(node_filesystem_free_bytes{mountpoint="/"}[6h], 24*3600) < 0 - for: 30m - labels: - severity: critical - - alert: pvc-full - annotations: - message: 지난 6시간동안의 추세로 봤을 때, 클러스터({{ $labels.taco_cluster }})의 파드({{ $labels.persistentvolumeclaim }})가 24시간 안에 Disk full이 예상됨 - description: 현재 Disk 사용 추세기준 24시간 내에 Disk 용량이 꽉 찰것으로 예상됩니다. ({{ $labels.taco_cluster }} 클러스터, {{ $labels.persistentvolumeclaim }} PVC) - Checkpoint: Disk 용량 최적화(삭제 및 Backup)을 수행하시길 권고합니다. 삭제할 내역이 없으면 증설 계획을 수립해 주십시요. - summary: Disk resources of the volume(pvc) {{ $labels.persistentvolumeclaim }} are running low. - discriminative: $labels.taco_cluster, $labels.persistentvolumeclaim - expr: predict_linear(kubelet_volume_stats_available_bytes[6h], 24*3600) < 0 # kubelet_volume_stats_capacity_bytes - for: 30m - labels: - severity: critical - - alert: pod-restart-frequently - annotations: - message: 클러스터({{ $labels.taco_cluster }})의 파드({{ $labels.pod }})가 30분 동안 5회 이상 재기동 ({{ $value }}회) - description: 특정 Pod가 빈번하게 재기동 되고 있습니다. 점검이 필요합니다. ({{ $labels.taco_cluster }} 클러스터, {{ $labels.pod }} 파드) - Checkpoint: pod spec. 에 대한 점검이 필요합니다. pod의 log 및 status를 확인해 주세요. - discriminative: $labels.taco_cluster, $labels.pod, $labels.namespace - expr: increase(kube_pod_container_status_restarts_total{namespace!="kube-system"}[60m:]) > 2 # 몇회로 할 것인지? - for: 30m - labels: - severity: critical - + - name: thanos-config override: objectStorage: @@ -393,10 +347,42 @@ charts: aws: s3: http://$(defaultUser):$(defaultPassword)@$(s3Service)/minio +- name: loki-user + override: + global.dnsService: kube-dns + # global.clusterDomain: $(clusterName) # annotate cluste because the cluster name is still cluster.local regardless cluster + gateway.service.type: LoadBalancer + gateway.service.annotations: $(awsNlbAnnotation) + ingester.persistence.storageClass: $(storageClassName) + distributor.persistence.storageClass: $(storageClassName) + queryFrontend.persistence.storageClass: $(storageClassName) + ruler.persistence.storageClass: $(storageClassName) + indexGateway.persistence.storageClass: $(storageClassName) + # select target node's label + ingester.nodeSelector: $(nodeSelector) + distributor.nodeSelector: $(nodeSelector) + querier.nodeSelector: $(nodeSelector) + queryFrontend.nodeSelector: $(nodeSelector) + queryScheduler.nodeSelector: $(nodeSelector) + tableManager.nodeSelector: $(nodeSelector) + gateway.nodeSelector: $(nodeSelector) + compactor.nodeSelector: $(nodeSelector) + ruler.nodeSelector: $(nodeSelector) + indexGateway.nodeSelector: $(nodeSelector) + memcachedChunks.nodeSelector: $(nodeSelector) + memcachedFrontend.nodeSelector: $(nodeSelector) + memcachedIndexQueries.nodeSelector: $(nodeSelector) + memcachedIndexWrites.nodeSelector: $(nodeSelector) + loki: + storageConfig: + aws: + s3: http://$(defaultUser):$(defaultPassword)@$(s3Service)/minio + - name: lma-bucket override: s3.enabled: true s3.buckets: - name: $(clusterName)-tks-thanos - name: $(clusterName)-tks-loki + - name: $(clusterName)-tks-loki-user tks.iamRoles: $(tksIamRoles) diff --git a/aws-msa-reference/policy/kustomization.yaml b/aws-msa-reference/policy/kustomization.yaml new file mode 100644 index 0000000..7c415e6 --- /dev/null +++ b/aws-msa-reference/policy/kustomization.yaml @@ -0,0 +1,5 @@ +resources: + - ../base + +transformers: + - site-values.yaml diff --git a/aws-msa-reference/policy/site-values.yaml b/aws-msa-reference/policy/site-values.yaml new file mode 100644 index 0000000..80aa10e --- 
/dev/null +++ b/aws-msa-reference/policy/site-values.yaml @@ -0,0 +1,26 @@ +apiVersion: openinfradev.github.com/v1 +kind: HelmValuesTransformer +metadata: + name: site + +global: + nodeSelector: + taco-lma: enabled + clusterName: cluster.local + storageClassName: taco-storage + repository: https://openinfradev.github.io/helm-repo/ + +charts: +- name: opa-gatekeeper + override: + postUpgrade.nodeSelector: $(nodeSelector) + postInstall.nodeSelector: $(nodeSelector) + preUninstall.nodeSelector: $(nodeSelector) + controllerManager.nodeSelector: $(nodeSelector) + audit.nodeSelector: $(nodeSelector) + crds.nodeSelector: $(nodeSelector) + + enableDeleteOperations: true + +- name: policy-resources + override: {} diff --git a/aws-msa-reference/tks-cluster/site-values.yaml b/aws-msa-reference/tks-cluster/site-values.yaml index 406fe3e..6028c31 100644 --- a/aws-msa-reference/tks-cluster/site-values.yaml +++ b/aws-msa-reference/tks-cluster/site-values.yaml @@ -27,7 +27,7 @@ charts: sshKeyName: $(sshKeyName) cluster: name: $(clusterName) - kubernetesVersion: v1.26.10 + kubernetesVersion: v1.29.8 eksEnabled: false multitenancyId: kind: AWSClusterRoleIdentity @@ -54,6 +54,8 @@ charts: kubeadmControlPlane: replicas: $(tksCpNode) controlPlaneMachineType: $(tksCpNodeType) + ami: + id: ami-02e4e8f09921cfe97 machinePool: - name: taco machineType: $(tksInfraNodeType) @@ -69,6 +71,8 @@ charts: taco-ingress-gateway: enabled roleAdditionalPolicies: - "arn:aws:iam::aws:policy/AmazonS3FullAccess" + ami: + id: ami-02e4e8f09921cfe97 machineDeployment: - name: normal numberOfAZ: 3 # ap-northeast-2 @@ -80,6 +84,8 @@ charts: rootVolume: size: 50 type: gp2 + ami: + id: ami-02e4e8f09921cfe97 - name: ingress-nginx override: diff --git a/aws-reference/lma/site-values.yaml b/aws-reference/lma/site-values.yaml index 2ae41ec..6d8f0b3 100644 --- a/aws-reference/lma/site-values.yaml +++ b/aws-reference/lma/site-values.yaml @@ -16,6 +16,8 @@ global: lokiHost: loki-loki-distributed-gateway lokiPort: 80 + lokiuserHost: loki-user-loki-distributed-gateway + lokiuserPort: 80 s3Service: "minio.lma.svc:9000" # depends on $lmaNameSpace (ex. 
minio.taco-system.svc) lmaNameSpace: lma @@ -148,19 +150,23 @@ charts: - name: taco-loki host: $(lokiHost) port: $(lokiPort) + lokiuser: + - name: taco-loki-user + host: $(lokiuserHost) + port: $(lokiuserPort) targetLogs: - tag: kube.* bufferChunkSize: 2M bufferMaxSize: 5M do_not_store_as_default: false index: container - loki_name: taco-loki + loki_name: taco-loki-user memBufLimit: 20MB multi_index: - index: platform loki_name: taco-loki key: $kubernetes['namespace_name'] - value: kube-system|$(lmaNameSpace)|taco-system|argo + value: kube-system|$(lmaNameSpace)|taco-system|gatekeeper-system|argo parser: docker path: /var/log/containers/*.log type: kubernates @@ -244,7 +250,6 @@ charts: consoleIngress.nodeSelector: $(nodeSelector) postJob.nodeSelector: $(nodeSelector) - - name: thanos override: global.storageClass: $(storageClassName) @@ -274,6 +279,8 @@ charts: # - --deduplication.replica-label="replica" storegateway.persistence.size: 8Gi ruler.nodeSelector: $(nodeSelector) + ruler.service.type: LoadBalancer + ruler.service.annotations: $(awsNlbAnnotation) ruler.alertmanagers: - http://alertmanager-operated:9093 ruler.persistence.size: 8Gi @@ -283,61 +290,7 @@ charts: rules: - alert: "PrometheusDown" expr: absent(up{prometheus="lma/lma-prometheus"}) - - alert: node-cpu-high-load - annotations: - message: 클러스터({{ $labels.taco_cluster }})의 노드({{ $labels.instance }})의 idle process의 cpu 점유율이 3분 동안 0% 입니다. (현재 사용률 {{$value}}) - description: 워커 노드 CPU가 과부하 상태입니다. 일시적인 서비스 Traffic 증가, Workload의 SW 오류, Server HW Fan Fail등 다양한 원인으로 인해 발생할 수 있습니다. - Checkpoint: 일시적인 Service Traffic의 증가가 관측되지 않았다면, Alert발생 노드에서 실행 되는 pod중 CPU 자원을 많이 점유하는 pod의 설정을 점검해 보시길 제안드립니다. 예를 들어 pod spec의 limit 설정으로 과도한 CPU자원 점유을 막을 수 있습니다. - summary: Cpu resources of the node {{ $labels.instance }} are running low. - discriminative: $labels.taco_cluster, $labels.instance - expr: (avg by (taco_cluster, instance) (rate(node_cpu_seconds_total{mode="idle"}[60s]))) < 0 #0.1 # 진짜 0? - for: 3m - labels: - severity: warning - - alert: node-memory-high-utilization - annotations: - message: 클러스터({{ $labels.taco_cluster }})의 노드({{ $labels.instance }})의 Memory 사용량이 3분동안 80% 를 넘어서고 있습니다. (현재 사용률 {{$value}}) - descriptioon: 워커 노드의 Memory 사용량이 80%를 넘었습니다. 일시적인 서비스 증가 및 SW 오류등 다양한 원인으로 발생할 수 있습니다. - Checkpoint: 일시적인 Service Traffic의 증가가 관측되지 않았다면, Alert발생 노드에서 실행되는 pod중 Memory 사용량이 높은 pod들에 대한 점검을 제안드립니다. - summary: Memory resources of the node {{ $labels.instance }} are running low. - discriminative: $labels.taco_cluster, $labels.instance - expr: (node_memory_MemAvailable_bytes/node_memory_MemTotal_bytes) < 0.2 - for: 3m - labels: - severity: warning - - alert: node-disk-full - annotations: - message: 지난 6시간동안의 추세로 봤을 때, 클러스터({{ $labels.taco_cluster }})의 노드({{ $labels.instance }})의 root 볼륨은 24시간 안에 Disk full이 예상됨 - description: 현재 Disk 사용 추세기준 24시간 내에 Disk 용량이 꽉 찰 것으로 예상됩니다. - Checkpoint: Disk 용량 최적화(삭제 및 Backup)을 수행하시길 권고합니다. 삭제할 내역이 없으면 증설 계획을 수립해 주십시요. - summary: Memory resources of the node {{ $labels.instance }} are running low. - discriminative: $labels.taco_cluster, $labels.instance - expr: predict_linear(node_filesystem_free_bytes{mountpoint="/"}[6h], 24*3600) < 0 - for: 30m - labels: - severity: critical - - alert: pvc-full - annotations: - message: 지난 6시간동안의 추세로 봤을 때, 클러스터({{ $labels.taco_cluster }})의 파드({{ $labels.persistentvolumeclaim }})가 24시간 안에 Disk full이 예상됨 - description: 현재 Disk 사용 추세기준 24시간 내에 Disk 용량이 꽉 찰것으로 예상됩니다. 
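# The ruler.service.type / ruler.service.annotations overrides above publish the Thanos
# ruler through an AWS NLB, reusing the $(awsNlbAnnotation) keys spelled out in the
# byok-reference globals later in this diff. A sketch of the resulting Service; the
# Service name and the HTTP port (10902) are assumptions based on usual Thanos defaults:
apiVersion: v1
kind: Service
metadata:
  name: thanos-ruler                     # assumed release/component name
  namespace: lma
  annotations:
    service.beta.kubernetes.io/aws-load-balancer-proxy-protocol: '*'
    service.beta.kubernetes.io/aws-load-balancer-type: nlb
spec:
  type: LoadBalancer
  ports:
    - name: http
      port: 10902                        # assumed Thanos ruler HTTP port
      targetPort: http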
({{ $labels.taco_cluster }} 클러스터, {{ $labels.persistentvolumeclaim }} PVC) - Checkpoint: Disk 용량 최적화(삭제 및 Backup)을 수행하시길 권고합니다. 삭제할 내역이 없으면 증설 계획을 수립해 주십시요. - summary: Disk resources of the volume(pvc) {{ $labels.persistentvolumeclaim }} are running low. - discriminative: $labels.taco_cluster, $labels.persistentvolumeclaim - expr: predict_linear(kubelet_volume_stats_available_bytes[6h], 24*3600) < 0 # kubelet_volume_stats_capacity_bytes - for: 30m - labels: - severity: critical - - alert: pod-restart-frequently - annotations: - message: 클러스터({{ $labels.taco_cluster }})의 파드({{ $labels.pod }})가 30분 동안 5회 이상 재기동 ({{ $value }}회) - description: 특정 Pod가 빈번하게 재기동 되고 있습니다. 점검이 필요합니다. ({{ $labels.taco_cluster }} 클러스터, {{ $labels.pod }} 파드) - Checkpoint: pod spec. 에 대한 점검이 필요합니다. pod의 log 및 status를 확인해 주세요. - discriminative: $labels.taco_cluster, $labels.pod, $labels.namespace - expr: increase(kube_pod_container_status_restarts_total{namespace!="kube-system"}[60m:]) > 2 # 몇회로 할 것인지? - for: 30m - labels: - severity: critical - + - name: thanos-config override: objectStorage: @@ -393,10 +346,42 @@ charts: aws: s3: http://$(defaultUser):$(defaultPassword)@$(s3Service)/minio +- name: loki-user + override: + global.dnsService: kube-dns + # global.clusterDomain: $(clusterName) # annotate cluste because the cluster name is still cluster.local regardless cluster + gateway.service.type: LoadBalancer + gateway.service.annotations: $(awsNlbAnnotation) + ingester.persistence.storageClass: $(storageClassName) + distributor.persistence.storageClass: $(storageClassName) + queryFrontend.persistence.storageClass: $(storageClassName) + ruler.persistence.storageClass: $(storageClassName) + indexGateway.persistence.storageClass: $(storageClassName) + # select target node's label + ingester.nodeSelector: $(nodeSelector) + distributor.nodeSelector: $(nodeSelector) + querier.nodeSelector: $(nodeSelector) + queryFrontend.nodeSelector: $(nodeSelector) + queryScheduler.nodeSelector: $(nodeSelector) + tableManager.nodeSelector: $(nodeSelector) + gateway.nodeSelector: $(nodeSelector) + compactor.nodeSelector: $(nodeSelector) + ruler.nodeSelector: $(nodeSelector) + indexGateway.nodeSelector: $(nodeSelector) + memcachedChunks.nodeSelector: $(nodeSelector) + memcachedFrontend.nodeSelector: $(nodeSelector) + memcachedIndexQueries.nodeSelector: $(nodeSelector) + memcachedIndexWrites.nodeSelector: $(nodeSelector) + loki: + storageConfig: + aws: + s3: http://$(defaultUser):$(defaultPassword)@$(s3Service)/minio + - name: lma-bucket override: s3.enabled: true s3.buckets: - name: $(clusterName)-tks-thanos - name: $(clusterName)-tks-loki + - name: $(clusterName)-tks-loki-user tks.iamRoles: $(tksIamRoles) diff --git a/aws-reference/policy/kustomization.yaml b/aws-reference/policy/kustomization.yaml new file mode 100644 index 0000000..7c415e6 --- /dev/null +++ b/aws-reference/policy/kustomization.yaml @@ -0,0 +1,5 @@ +resources: + - ../base + +transformers: + - site-values.yaml diff --git a/aws-reference/policy/site-values.yaml b/aws-reference/policy/site-values.yaml new file mode 100644 index 0000000..80aa10e --- /dev/null +++ b/aws-reference/policy/site-values.yaml @@ -0,0 +1,26 @@ +apiVersion: openinfradev.github.com/v1 +kind: HelmValuesTransformer +metadata: + name: site + +global: + nodeSelector: + taco-lma: enabled + clusterName: cluster.local + storageClassName: taco-storage + repository: https://openinfradev.github.io/helm-repo/ + +charts: +- name: opa-gatekeeper + override: + postUpgrade.nodeSelector: $(nodeSelector) + 
postInstall.nodeSelector: $(nodeSelector) + preUninstall.nodeSelector: $(nodeSelector) + controllerManager.nodeSelector: $(nodeSelector) + audit.nodeSelector: $(nodeSelector) + crds.nodeSelector: $(nodeSelector) + + enableDeleteOperations: true + +- name: policy-resources + override: {} diff --git a/aws-reference/tks-cluster/site-values.yaml b/aws-reference/tks-cluster/site-values.yaml index 406fe3e..6028c31 100644 --- a/aws-reference/tks-cluster/site-values.yaml +++ b/aws-reference/tks-cluster/site-values.yaml @@ -27,7 +27,7 @@ charts: sshKeyName: $(sshKeyName) cluster: name: $(clusterName) - kubernetesVersion: v1.26.10 + kubernetesVersion: v1.29.8 eksEnabled: false multitenancyId: kind: AWSClusterRoleIdentity @@ -54,6 +54,8 @@ charts: kubeadmControlPlane: replicas: $(tksCpNode) controlPlaneMachineType: $(tksCpNodeType) + ami: + id: ami-02e4e8f09921cfe97 machinePool: - name: taco machineType: $(tksInfraNodeType) @@ -69,6 +71,8 @@ charts: taco-ingress-gateway: enabled roleAdditionalPolicies: - "arn:aws:iam::aws:policy/AmazonS3FullAccess" + ami: + id: ami-02e4e8f09921cfe97 machineDeployment: - name: normal numberOfAZ: 3 # ap-northeast-2 @@ -80,6 +84,8 @@ charts: rootVolume: size: 50 type: gp2 + ami: + id: ami-02e4e8f09921cfe97 - name: ingress-nginx override: diff --git a/byoh-reference/lma/site-values.yaml b/byoh-reference/lma/site-values.yaml index 414a194..d976bfb 100644 --- a/byoh-reference/lma/site-values.yaml +++ b/byoh-reference/lma/site-values.yaml @@ -16,6 +16,8 @@ global: lokiHost: loki-loki-distributed-gateway lokiPort: 80 + lokiuserHost: loki-user-loki-distributed-gateway + lokiuserPort: 80 s3Service: "minio.lma.svc:9000" # depends on $lmaNameSpace (ex. minio.taco-system.svc) lmaNameSpace: lma @@ -151,19 +153,23 @@ charts: - name: taco-loki host: $(lokiHost) port: $(lokiPort) + lokiuser: + - name: taco-loki-user + host: $(lokiuserHost) + port: $(lokiuserPort) targetLogs: - tag: kube.* bufferChunkSize: 2M bufferMaxSize: 5M do_not_store_as_default: false index: container - loki_name: taco-loki + loki_name: taco-loki-user memBufLimit: 20MB multi_index: - index: platform loki_name: taco-loki key: $kubernetes['namespace_name'] - value: kube-system|$(lmaNameSpace)|taco-system|argo + value: kube-system|$(lmaNameSpace)|taco-system|gatekeeper-system|argo parser: docker path: /var/log/containers/*.log type: kubernates @@ -285,67 +291,17 @@ charts: ruler.alertmanagers: - http://alertmanager-operated:9093 ruler.persistence.size: 8Gi + ruler.service: + type: NodePort + nodePort: 30007 ruler.config: groups: - name: "tks" rules: - alert: "PrometheusDown" expr: absent(up{prometheus="lma/lma-prometheus"}) - - alert: node-cpu-high-load - annotations: - message: 클러스터({{ $labels.taco_cluster }})의 노드({{ $labels.instance }})의 idle process의 cpu 점유율이 3분 동안 0% 입니다. (현재 사용률 {{$value}}) - description: 워커 노드 CPU가 과부하 상태입니다. 일시적인 서비스 Traffic 증가, Workload의 SW 오류, Server HW Fan Fail등 다양한 원인으로 인해 발생할 수 있습니다. - Checkpoint: 일시적인 Service Traffic의 증가가 관측되지 않았다면, Alert발생 노드에서 실행 되는 pod중 CPU 자원을 많이 점유하는 pod의 설정을 점검해 보시길 제안드립니다. 예를 들어 pod spec의 limit 설정으로 과도한 CPU자원 점유을 막을 수 있습니다. - summary: Cpu resources of the node {{ $labels.instance }} are running low. - discriminative: $labels.taco_cluster, $labels.instance - expr: (avg by (taco_cluster, instance) (rate(node_cpu_seconds_total{mode="idle"}[60s]))) < 0 #0.1 # 진짜 0? - for: 3m - labels: - severity: warning - - alert: node-memory-high-utilization - annotations: - message: 클러스터({{ $labels.taco_cluster }})의 노드({{ $labels.instance }})의 Memory 사용량이 3분동안 80% 를 넘어서고 있습니다. 
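# The new policy/site-values.yaml files above drive the opa-gatekeeper chart through the
# HelmValuesTransformer: dotted override keys address nested Helm values, and
# $(nodeSelector) is filled in from the globals. An illustrative expansion of the
# overrides as plain chart values:
controllerManager:
  nodeSelector:
    taco-lma: enabled
audit:
  nodeSelector:
    taco-lma: enabled
crds:
  nodeSelector:
    taco-lma: enabled
postInstall:
  nodeSelector:
    taco-lma: enabled
postUpgrade:
  nodeSelector:
    taco-lma: enabled
preUninstall:
  nodeSelector:
    taco-lma: enabled
enableDeleteOperations: true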
(현재 사용률 {{$value}}) - descriptioon: 워커 노드의 Memory 사용량이 80%를 넘었습니다. 일시적인 서비스 증가 및 SW 오류등 다양한 원인으로 발생할 수 있습니다. - Checkpoint: 일시적인 Service Traffic의 증가가 관측되지 않았다면, Alert발생 노드에서 실행되는 pod중 Memory 사용량이 높은 pod들에 대한 점검을 제안드립니다. - summary: Memory resources of the node {{ $labels.instance }} are running low. - discriminative: $labels.taco_cluster, $labels.instance - expr: (node_memory_MemAvailable_bytes/node_memory_MemTotal_bytes) < 0.2 - for: 3m - labels: - severity: warning - - alert: node-disk-full - annotations: - message: 지난 6시간동안의 추세로 봤을 때, 클러스터({{ $labels.taco_cluster }})의 노드({{ $labels.instance }})의 root 볼륨은 24시간 안에 Disk full이 예상됨 - description: 현재 Disk 사용 추세기준 24시간 내에 Disk 용량이 꽉 찰 것으로 예상됩니다. - Checkpoint: Disk 용량 최적화(삭제 및 Backup)을 수행하시길 권고합니다. 삭제할 내역이 없으면 증설 계획을 수립해 주십시요. - summary: Memory resources of the node {{ $labels.instance }} are running low. - discriminative: $labels.taco_cluster, $labels.instance - expr: predict_linear(node_filesystem_free_bytes{mountpoint="/"}[6h], 24*3600) < 0 - for: 30m - labels: - severity: critical - - alert: pvc-full - annotations: - message: 지난 6시간동안의 추세로 봤을 때, 클러스터({{ $labels.taco_cluster }})의 파드({{ $labels.persistentvolumeclaim }})가 24시간 안에 Disk full이 예상됨 - description: 현재 Disk 사용 추세기준 24시간 내에 Disk 용량이 꽉 찰것으로 예상됩니다. ({{ $labels.taco_cluster }} 클러스터, {{ $labels.persistentvolumeclaim }} PVC) - Checkpoint: Disk 용량 최적화(삭제 및 Backup)을 수행하시길 권고합니다. 삭제할 내역이 없으면 증설 계획을 수립해 주십시요. - summary: Disk resources of the volume(pvc) {{ $labels.persistentvolumeclaim }} are running low. - discriminative: $labels.taco_cluster, $labels.persistentvolumeclaim - expr: predict_linear(kubelet_volume_stats_available_bytes[6h], 24*3600) < 0 # kubelet_volume_stats_capacity_bytes - for: 30m - labels: - severity: critical - - alert: pod-restart-frequently - annotations: - message: 클러스터({{ $labels.taco_cluster }})의 파드({{ $labels.pod }})가 30분 동안 5회 이상 재기동 ({{ $value }}회) - description: 특정 Pod가 빈번하게 재기동 되고 있습니다. 점검이 필요합니다. ({{ $labels.taco_cluster }} 클러스터, {{ $labels.pod }} 파드) - Checkpoint: pod spec. 에 대한 점검이 필요합니다. pod의 log 및 status를 확인해 주세요. - discriminative: $labels.taco_cluster, $labels.pod, $labels.namespace - expr: increase(kube_pod_container_status_restarts_total{namespace!="kube-system"}[60m:]) > 2 # 몇회로 할 것인지? 
- for: 30m - labels: - severity: critical + - name: thanos-config override: objectStorage: @@ -402,6 +358,38 @@ charts: aws: s3: http://$(defaultUser):$(defaultPassword)@$(s3Service)/minio +- name: loki-user + override: + global.dnsService: kube-dns + gateway.service: + type: NodePort + nodePort: 30006 + gateway.service.annotations: $(awsNlbAnnotation) + ingester.persistence.storageClass: $(storageClassName) + distributor.persistence.storageClass: $(storageClassName) + queryFrontend.persistence.storageClass: $(storageClassName) + ruler.persistence.storageClass: $(storageClassName) + indexGateway.persistence.storageClass: $(storageClassName) + # select target node's label + ingester.nodeSelector: $(nodeSelector) + distributor.nodeSelector: $(nodeSelector) + querier.nodeSelector: $(nodeSelector) + queryFrontend.nodeSelector: $(nodeSelector) + queryScheduler.nodeSelector: $(nodeSelector) + tableManager.nodeSelector: $(nodeSelector) + gateway.nodeSelector: $(nodeSelector) + compactor.nodeSelector: $(nodeSelector) + ruler.nodeSelector: $(nodeSelector) + indexGateway.nodeSelector: $(nodeSelector) + memcachedChunks.nodeSelector: $(nodeSelector) + memcachedFrontend.nodeSelector: $(nodeSelector) + memcachedIndexQueries.nodeSelector: $(nodeSelector) + memcachedIndexWrites.nodeSelector: $(nodeSelector) + loki: + storageConfig: + aws: + s3: http://$(defaultUser):$(defaultPassword)@$(s3Service)/minio + - name: lma-bucket override: s3.enabled: true diff --git a/byoh-reference/policy/kustomization.yaml b/byoh-reference/policy/kustomization.yaml new file mode 100644 index 0000000..7c415e6 --- /dev/null +++ b/byoh-reference/policy/kustomization.yaml @@ -0,0 +1,5 @@ +resources: + - ../base + +transformers: + - site-values.yaml diff --git a/byoh-reference/policy/site-values.yaml b/byoh-reference/policy/site-values.yaml new file mode 100644 index 0000000..80aa10e --- /dev/null +++ b/byoh-reference/policy/site-values.yaml @@ -0,0 +1,26 @@ +apiVersion: openinfradev.github.com/v1 +kind: HelmValuesTransformer +metadata: + name: site + +global: + nodeSelector: + taco-lma: enabled + clusterName: cluster.local + storageClassName: taco-storage + repository: https://openinfradev.github.io/helm-repo/ + +charts: +- name: opa-gatekeeper + override: + postUpgrade.nodeSelector: $(nodeSelector) + postInstall.nodeSelector: $(nodeSelector) + preUninstall.nodeSelector: $(nodeSelector) + controllerManager.nodeSelector: $(nodeSelector) + audit.nodeSelector: $(nodeSelector) + crds.nodeSelector: $(nodeSelector) + + enableDeleteOperations: true + +- name: policy-resources + override: {} diff --git a/byoh-reference/tks-cluster/site-values.yaml b/byoh-reference/tks-cluster/site-values.yaml index e964fca..e417a91 100644 --- a/byoh-reference/tks-cluster/site-values.yaml +++ b/byoh-reference/tks-cluster/site-values.yaml @@ -18,7 +18,7 @@ charts: override: cluster: name: $(clusterName) - kubernetesVersion: v1.25.11 + kubernetesVersion: v1.29.8 byoCluster: bundleLookupBaseRegistry: harbor.taco-cat.xyz/cluster_api_provider_bringyourownhost controlPlaneEndpoint: diff --git a/byoh-ssu-reference/lma/site-values.yaml b/byoh-ssu-reference/lma/site-values.yaml index f0a4bfd..3919589 100644 --- a/byoh-ssu-reference/lma/site-values.yaml +++ b/byoh-ssu-reference/lma/site-values.yaml @@ -172,7 +172,7 @@ charts: - tag: kube.* bufferChunkSize: 2M bufferMaxSize: 5M - do_not_store_as_default: false + do_not_store_as_default: true index: container loki_name: taco-loki memBufLimit: 20MB @@ -180,7 +180,7 @@ charts: - index: platform loki_name: taco-loki 
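# In the byoh-ssu/byoh-stage/byoh-suy overlays above, do_not_store_as_default is flipped
# to true for the kube.* target. Read literally, the flag stops logs that match no
# multi_index rule from being written to the default "container" index, so only the
# platform namespaces listed under multi_index keep flowing to taco-loki (semantics
# inferred from the flag name and its usage here, not verified against the chart source):
targetLogs:
  - tag: kube.*
    do_not_store_as_default: true
    index: container
    loki_name: taco-loki
    multi_index:
      - index: platform
        loki_name: taco-loki
        key: $kubernetes['namespace_name']
        value: kube-system|lma|taco-system|gatekeeper-system|argo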
key: $kubernetes['namespace_name'] - value: kube-system|$(lmaNameSpace)|taco-system|argo + value: kube-system|$(lmaNameSpace)|taco-system|gatekeeper-system|argo parser: docker path: /var/log/containers/*.log type: kubernates diff --git a/byoh-stage-reference/lma/site-values.yaml b/byoh-stage-reference/lma/site-values.yaml index 516ad94..7851442 100644 --- a/byoh-stage-reference/lma/site-values.yaml +++ b/byoh-stage-reference/lma/site-values.yaml @@ -171,7 +171,7 @@ charts: - tag: kube.* bufferChunkSize: 2M bufferMaxSize: 5M - do_not_store_as_default: false + do_not_store_as_default: true index: container loki_name: taco-loki memBufLimit: 20MB @@ -179,7 +179,7 @@ charts: - index: platform loki_name: taco-loki key: $kubernetes['namespace_name'] - value: kube-system|$(lmaNameSpace)|taco-system|argo + value: kube-system|$(lmaNameSpace)|taco-system|gatekeeper-system|argo parser: docker path: /var/log/containers/*.log type: kubernates diff --git a/byoh-suy-reference/lma/site-values.yaml b/byoh-suy-reference/lma/site-values.yaml index 516ad94..0d82936 100644 --- a/byoh-suy-reference/lma/site-values.yaml +++ b/byoh-suy-reference/lma/site-values.yaml @@ -171,7 +171,7 @@ charts: - tag: kube.* bufferChunkSize: 2M bufferMaxSize: 5M - do_not_store_as_default: false + do_not_store_as_default: true index: container loki_name: taco-loki memBufLimit: 20MB diff --git a/byok-reference/lma/kustomization.yaml b/byok-reference/lma/kustomization.yaml new file mode 100644 index 0000000..7c415e6 --- /dev/null +++ b/byok-reference/lma/kustomization.yaml @@ -0,0 +1,5 @@ +resources: + - ../base + +transformers: + - site-values.yaml diff --git a/byok-reference/lma/site-values.yaml b/byok-reference/lma/site-values.yaml new file mode 100644 index 0000000..5b43aaf --- /dev/null +++ b/byok-reference/lma/site-values.yaml @@ -0,0 +1,398 @@ +apiVersion: openinfradev.github.com/v1 +kind: HelmValuesTransformer +metadata: + name: site + +global: + nodeSelector: + taco-lma: enabled + clusterName: cluster.local + storageClassName: taco-storage + repository: https://openinfradev.github.io/helm-repo/ + serviceScrapeInterval: 30s + defaultPassword: password + defaultUser: taco + thanosObjstoreSecret: taco-objstore-secret + + lokiHost: loki-loki-distributed-gateway + lokiPort: 80 + lokiuserHost: loki-user-loki-distributed-gateway + lokiuserPort: 80 + s3Service: "minio.lma.svc:9000" # depends on $lmaNameSpace (ex. 
minio.taco-system.svc) + + lmaNameSpace: lma + + TksWebhookUrl: "FixItByWF" + SlackUrl: "FixItByWF" + SlackChannel: '#temporary-alert' + + grafanaDatasourceMetric: lma-prometheus:9090 + thanosQueryStores: + - thanos-storegateway:10901 + - prometheus-operated:10901 + + # servicemesh dashboard and grafana + realms: or2ru44fn + consoleUrl: tks-console.taco-cat.xyz + grafanaDomain: taco-cat.xyz + keycloakDomain: tks-console-dev.taco-cat.xyz + grafanaClientSecret: JLtsanYtrCg21RGxrcVmQP0GeuDFUhpA + + awsNlbAnnotation: + service.beta.kubernetes.io/aws-load-balancer-proxy-protocol: '*' + service.beta.kubernetes.io/aws-load-balancer-type: nlb + + tksIamRoles: [] + +charts: +- name: prometheus-operator + override: + prometheusOperator.nodeSelector: $(nodeSelector) + prometheusOperator.admissionWebhooks.patch.image.sha: "" + prometheusOperator.image.repository: tks/prometheus-operator + prometheusOperator.admissionWebhooks.patch.image.repository: tks/kube-webhook-certgen + prometheusOperator.prometheusConfigReloader.image.repository: tks/prometheus-config-reloader + prometheusOperator.thanosImage.repository: tks/thanos + +- name: prometheus + override: + kubeEtcd.enabled: true + prometheus.prometheusSpec.storageSpec.volumeClaimTemplate.spec.storageClassName: $(storageClassName) + prometheus.prometheusSpec.storageSpec.volumeClaimTemplate.spec.resources.requests.storage: 20Gi + prometheus.prometheusSpec.retention: 2d + prometheus.prometheusSpec.externalLabels.taco_cluster: $(clusterName) + prometheus.prometheusSpec.nodeSelector: $(nodeSelector) + prometheus.prometheusSpec.serviceMonitorNamespaceSelector.matchLabels.name: $(lmaNameSpace) + prometheus.prometheusSpec.podMonitorNamespaceSelector.matchLabels.name: $(lmaNameSpace) + prometheus.prometheusSpec.ruleNamespaceSelector.matchLabels.name: $(lmaNameSpace) + prometheus.thanosServiceExternal.annotations: $(awsNlbAnnotation) + prometheus.thanosServiceExternal: + type: NodePort + nodePort: 30004 + alertmanager.service.type: NodePort + alertmanager.service.nodePort: 30111 + alertmanager.alertmanagerSpec.alertmanagerConfigSelector.matchLabels.alertmanagerConfig: example + alertmanager.alertmanagerSpec.nodeSelector: $(nodeSelector) + alertmanager.alertmanagerSpec.retention: 2h + alertmanager.config: + global: + slack_api_url: $(SlackUrl) + receivers: + - name: tks-alert + webhook_configs: + - send_resolved: true + url: $(TksWebhookUrl) + route: + group_by: + - alertname + group_wait: 10s + receiver: tks-alert + repeat_interval: 1h + +- name: prometheus-node-exporter + override: + hostNetwork: false + +- name: kube-state-metrics + override: + nodeSelector: $(nodeSelector) + kubeVersion: v1.25.7 + +- name: prometheus-pushgateway + override: + nodeSelector: $(nodeSelector) + +- name: prometheus-process-exporter + override: + conf.processes: dockerd,kubelet,kube-proxy,ntpd,node + pod.hostNetwork: false + +- name: grafana + override: + adminPassword: password + persistence.storageClassName: $(storageClassName) + sidecar.dashboards.searchNamespace: ALL + # grafana oidc + service: + type: NodePort + nodePort: 30001 + service.annotations: $(awsNlbAnnotation) + grafana\.ini: + server: + domain: $(grafanaDomain) + root_url: http://$(grafanaDomain) + serve_from_sub_path: true + auth.generic_oauth: + enabled: true + name: keycloak + allow_sign_up: true + client_id: grafana + client_secret: $(grafanaClientSecret) + scopes: openid profile email + login_attribute_path: username + auth_url: https://$(keycloakDomain)/auth/realms/$(realms)/protocol/openid-connect/auth + 
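# For reference, the Keycloak endpoints assembled from the grafana auth.generic_oauth
# override above, with the byok defaults substituted
# ($(keycloakDomain)=tks-console-dev.taco-cat.xyz, $(realms)=or2ru44fn):
auth.generic_oauth:
  auth_url: https://tks-console-dev.taco-cat.xyz/auth/realms/or2ru44fn/protocol/openid-connect/auth
  token_url: https://tks-console-dev.taco-cat.xyz/auth/realms/or2ru44fn/protocol/openid-connect/token
  api_url: https://tks-console-dev.taco-cat.xyz/auth/realms/or2ru44fn/protocol/openid-connect/userinfo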
token_url: https://$(keycloakDomain)/auth/realms/$(realms)/protocol/openid-connect/token + api_url: https://$(keycloakDomain)/auth/realms/$(realms)/protocol/openid-connect/userinfo + signout_redirect_url: $(consoleUrl)/login + auth: + disable_login_form: false + oauth_auto_login: true + disable_signout_menu: true + user: + auto_assign_org: true + auto_assign_org_role: Admin + +- name: fluent-operator + +- name: fluentbit + override: + fluentbit: + clusterName: $(clusterName) + outputs: + loki: + - name: taco-loki + host: $(lokiHost) + port: $(lokiPort) + lokiuser: + - name: taco-loki-user + host: $(lokiuserHost) + port: $(lokiuserPort) + targetLogs: + - tag: kube.* + bufferChunkSize: 2M + bufferMaxSize: 5M + do_not_store_as_default: false + index: container + loki_name: taco-loki-user + memBufLimit: 20MB + multi_index: + - index: platform + loki_name: taco-loki + key: $kubernetes['namespace_name'] + value: kube-system|$(lmaNameSpace)|taco-system|gatekeeper-system|argo + parser: docker + path: /var/log/containers/*.log + type: kubernates + extraArgs: + multilineParser: docker, cri + - tag: syslog.* + loki_name: taco-loki + index: syslog + parser: taco-syslog-parser-for-ubuntu + path: /var/log/messages + type: syslog + +- name: addons + override: + SPECIAL_VALUE: SPECIAL + serviceMonitor.trident: + enabled: false + interval: $(serviceScrapeInterval) + serviceMonitor.kubelet.interval: 30s + serviceMonitor.additionalScrapeConfigs: + grafanaDashboard.istio.enabled: false + grafanaDashboard.jaeger.enabled: false + grafanaDashboard.namespace: $(lmaNameSpace) + grafanaDatasource.namespace: $(lmaNameSpace) + serviceMonitor.istio.enabled: false + serviceMonitor.jaeger.enabled: false + serviceMonitor.argocd.enabled: false + serviceMonitor.argowf.enabled: false + prometheusRules.alert.enabled: false + prometheusRules.istio.aggregation.enabled: false + prometheusRules.istio.optimization.enabled: false + grafanaDatasource.prometheus.url: $(grafanaDatasourceMetric) + # grafanaDatasource.prometheus.url: "thanos-query.lma:9090" + grafanaDatasource.loki.url: $(lokiHost):$(lokiPort) + +- name: prometheus-adapter + override: + nodeSelector: $(nodeSelector) + +- name: kubernetes-event-exporter + override: + clustername: $(clusterName) + + conf.recievers: + - name: loki + type: file + config: + path: "/tmp/kubernetes-event.log" + addons: + loki: + enabled: true + host: $(lokiHost) + port: $(lokiPort) + target_file: "/tmp/kubernetes-event.log" + conf.default.hosts: + - "https://eck-elasticsearch-es-http.lma.svc.$(clusterName):9200" + +- name: minio + override: + users: + - accessKey: $(defaultUser) + secretKey: $(defaultPassword) + policy: consoleAdmin + buckets: + - name: tks-thanos + policy: public + purge: false + versioning: true + objectlocking: false + - name: tks-loki + policy: public + purge: false + versioning: true + objectlocking: false + persistence.storageClass: $(storageClassName) + persistence.size: 500Gi + persistence.accessMode: ReadWriteOnce + service: + type: NodePort + nodePort: 30003 + service.annotations: $(awsNlbAnnotation) + # deploy target node's label + consoleIngress.nodeSelector: $(nodeSelector) + postJob.nodeSelector: $(nodeSelector) + + +- name: thanos + override: + global.storageClass: $(storageClassName) + # temporarily add annotation because a cluster is using not cluster-name but 'cluster.local' + # clusterDomain: $(clusterName) + existingObjstoreSecret: $(thanosObjstoreSecret) + query.nodeSelector: $(nodeSelector) + query.service.type: ClusterIP + query.service.annotations: 
$(awsNlbAnnotation) + queryFrontend.nodeSelector: $(nodeSelector) + queryFrontend.service: + nodePorts: + http: 30005 + type: NodePort + queryFrontend.enabled: true + queryFrontend.config: |- + type: IN-MEMORY + config: + max_size: 512MB + max_size_items: 100 + validity: 100s + queryFrontend.extraFlags: [] + querier.stores: $(thanosQueryStores) + bucketweb.nodeSelector: $(nodeSelector) + compactor.nodeSelector: $(nodeSelector) + storegateway.nodeSelector: $(nodeSelector) + compactor.persistence.size: 8Gi + # compactor.extraFlags: + # - --compact.enable-vertical-compaction + # - --deduplication.replica-label="replica" + storegateway.persistence.size: 8Gi + ruler.nodeSelector: $(nodeSelector) + ruler.alertmanagers: + - http://alertmanager-operated:9093 + ruler.persistence.size: 8Gi + ruler.config: + groups: + - name: "tks" + rules: + - alert: "PrometheusDown" + expr: absent(up{prometheus="lma/lma-prometheus"}) + ruler.service: + type: NodePort + nodePort: 30007 + +- name: thanos-config + override: + objectStorage: + secretName: $(thanosObjstoreSecret) + rawConfig: + bucket: tks-thanos + endpoint: $(s3Service) + access_key: $(defaultUser) + secret_key: $(defaultPassword) + insecure: true + sidecarsService.name: thanos-sidecars + sidecarsService.endpoints: + - 192.168.97.102 # should not be in the loopback range (127.0.0.0/8) + +- name: prepare-etcd-secret + override: + nodeSelector: + "node-role.kubernetes.io/control-plane": "" + tolerations: + - key: "node-role.kubernetes.io/control-plane" + effect: "NoSchedule" + operator: "Exists" + +- name: loki + override: + global.dnsService: kube-dns + # global.clusterDomain: $(clusterName) # annotate cluste because the cluster name is still cluster.local regardless cluster + gateway.service: + type: NodePort + nodePort: 30002 + gateway.service.annotations: $(awsNlbAnnotation) + ingester.persistence.storageClass: $(storageClassName) + distributor.persistence.storageClass: $(storageClassName) + queryFrontend.persistence.storageClass: $(storageClassName) + ruler.persistence.storageClass: $(storageClassName) + indexGateway.persistence.storageClass: $(storageClassName) + # select target node's label + ingester.nodeSelector: $(nodeSelector) + distributor.nodeSelector: $(nodeSelector) + querier.nodeSelector: $(nodeSelector) + queryFrontend.nodeSelector: $(nodeSelector) + queryScheduler.nodeSelector: $(nodeSelector) + tableManager.nodeSelector: $(nodeSelector) + gateway.nodeSelector: $(nodeSelector) + compactor.nodeSelector: $(nodeSelector) + ruler.nodeSelector: $(nodeSelector) + indexGateway.nodeSelector: $(nodeSelector) + memcachedChunks.nodeSelector: $(nodeSelector) + memcachedFrontend.nodeSelector: $(nodeSelector) + memcachedIndexQueries.nodeSelector: $(nodeSelector) + memcachedIndexWrites.nodeSelector: $(nodeSelector) + loki: + storageConfig: + aws: + s3: http://$(defaultUser):$(defaultPassword)@$(s3Service)/minio + +- name: loki-user + override: + global.dnsService: kube-dns + gateway.service: + type: NodePort + nodePort: 30006 + gateway.service.annotations: $(awsNlbAnnotation) + ingester.persistence.storageClass: $(storageClassName) + distributor.persistence.storageClass: $(storageClassName) + queryFrontend.persistence.storageClass: $(storageClassName) + ruler.persistence.storageClass: $(storageClassName) + indexGateway.persistence.storageClass: $(storageClassName) + # select target node's label + ingester.nodeSelector: $(nodeSelector) + distributor.nodeSelector: $(nodeSelector) + querier.nodeSelector: $(nodeSelector) + queryFrontend.nodeSelector: 
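# The thanos-config chart above materializes its rawConfig block as the
# $(thanosObjstoreSecret) secret that the Thanos components mount. With the byok defaults
# substituted, the object-store configuration would look roughly like this; the
# "type: S3" wrapper is the standard Thanos objstore format and is assumed to be added
# by the chart:
type: S3
config:
  bucket: tks-thanos
  endpoint: minio.lma.svc:9000
  access_key: taco
  secret_key: password
  insecure: true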
$(nodeSelector) + queryScheduler.nodeSelector: $(nodeSelector) + tableManager.nodeSelector: $(nodeSelector) + gateway.nodeSelector: $(nodeSelector) + compactor.nodeSelector: $(nodeSelector) + ruler.nodeSelector: $(nodeSelector) + indexGateway.nodeSelector: $(nodeSelector) + memcachedChunks.nodeSelector: $(nodeSelector) + memcachedFrontend.nodeSelector: $(nodeSelector) + memcachedIndexQueries.nodeSelector: $(nodeSelector) + memcachedIndexWrites.nodeSelector: $(nodeSelector) + loki: + storageConfig: + aws: + s3: http://$(defaultUser):$(defaultPassword)@$(s3Service)/minio + +- name: lma-bucket + override: + s3.enabled: true + s3.buckets: + - name: $(clusterName)-tks-thanos + - name: $(clusterName)-tks-loki + tks.iamRoles: $(tksIamRoles) diff --git a/byok-reference/policy/kustomization.yaml b/byok-reference/policy/kustomization.yaml new file mode 100644 index 0000000..7c415e6 --- /dev/null +++ b/byok-reference/policy/kustomization.yaml @@ -0,0 +1,5 @@ +resources: + - ../base + +transformers: + - site-values.yaml diff --git a/byok-reference/policy/site-values.yaml b/byok-reference/policy/site-values.yaml new file mode 100644 index 0000000..80aa10e --- /dev/null +++ b/byok-reference/policy/site-values.yaml @@ -0,0 +1,26 @@ +apiVersion: openinfradev.github.com/v1 +kind: HelmValuesTransformer +metadata: + name: site + +global: + nodeSelector: + taco-lma: enabled + clusterName: cluster.local + storageClassName: taco-storage + repository: https://openinfradev.github.io/helm-repo/ + +charts: +- name: opa-gatekeeper + override: + postUpgrade.nodeSelector: $(nodeSelector) + postInstall.nodeSelector: $(nodeSelector) + preUninstall.nodeSelector: $(nodeSelector) + controllerManager.nodeSelector: $(nodeSelector) + audit.nodeSelector: $(nodeSelector) + crds.nodeSelector: $(nodeSelector) + + enableDeleteOperations: true + +- name: policy-resources + override: {} diff --git a/byok-reference/sealed-secrets/kustomization.yaml b/byok-reference/sealed-secrets/kustomization.yaml new file mode 100644 index 0000000..7c415e6 --- /dev/null +++ b/byok-reference/sealed-secrets/kustomization.yaml @@ -0,0 +1,5 @@ +resources: + - ../base + +transformers: + - site-values.yaml diff --git a/byok-reference/sealed-secrets/site-values.yaml b/byok-reference/sealed-secrets/site-values.yaml new file mode 100644 index 0000000..6fb83a4 --- /dev/null +++ b/byok-reference/sealed-secrets/site-values.yaml @@ -0,0 +1,6 @@ +apiVersion: openinfradev.github.com/v1 +kind: HelmValuesTransformer +metadata: + name: site + +charts: [] diff --git a/byok-reference/service-mesh/kustomization.yaml b/byok-reference/service-mesh/kustomization.yaml new file mode 100644 index 0000000..7c415e6 --- /dev/null +++ b/byok-reference/service-mesh/kustomization.yaml @@ -0,0 +1,5 @@ +resources: + - ../base + +transformers: + - site-values.yaml diff --git a/byok-reference/service-mesh/site-values.yaml b/byok-reference/service-mesh/site-values.yaml new file mode 100644 index 0000000..1686981 --- /dev/null +++ b/byok-reference/service-mesh/site-values.yaml @@ -0,0 +1,284 @@ +apiVersion: openinfradev.github.com/v1 +kind: HelmValuesTransformer +metadata: + name: site + +global: + clusterName: cluster.local + namespace: tks-msa + imageRegistry: harbor.taco-cat.xyz + serviceMeshControlNodeSelector: + tks-msa: enabled + serviceMeshIngressNodeSelector: + tks-ingressgateway: enabled + serviceMeshEgressNodeSelector: + tks-egressgateway: enabled + ingressGatewayLabel: istio-ingressgateway + egressGatewayLabel: istio-egressgateway + keycloakIssuerUri: 
https://keycloak.com/auth/realms/oraganization + keycloakClientPrefix: client-prefix + gatekeeperSecret: gatekeeper-secret + +charts: +- name: cert-manager + override: + image: + repository: $(imageRegistry)/tks/cert-manager-controller + nodeSelector: + tks-msa: enabled + webhook: + image: + repository: $(imageRegistry)/tks/cert-manager-webhook + nodeSelector: + tks-msa: enabled + cainjector: + image: + repository: $(imageRegistry)/tks/cert-manager-cainjector + nodeSelector: + tks-msa: enabled + +- name: k8ssandra-operator + override: + image: + registry: $(imageRegistry) + repository: tks/k8ssandra-operator + tag: v1.6.0 + nodeSelector: + tks-msa: enabled + cleaner: + image: + registry: $(imageRegistry) + repository: tks/k8ssandra-tools + tag: latest + client: + image: + registry: $(imageRegistry) + repository: tks/k8ssandra-tools + tag: latest + cass-operator: + image: + registry: $(imageRegistry) + repository: tks/cass-operator + tag: v1.14.0 + nodeSelector: + tks-msa: enabled + +- name: servicemesh-k8ssandra-resource + override: + namespace: $(namespace) + cassandra: + jmxInitContainerImage: + name: busybox + registry: $(imageRegistry)/tks + tag: 1.34.1 + datacenters: + size: 1 + perNodeConfigInitContainerImage: $(imageRegistry)/tks/yq:4 + initContainers: + serverConfigInitImage: $(imageRegistry)/tks/cass-config-builder:1.0-ubi7 + jmxInitContainerImage: + name: busybox + registry: $(imageRegistry)/tks + tag: 1.34.1 + containers: + - name: cassandra + image: $(imageRegistry)/tks/cass-management-api:4.0.6 + - name: server-system-logger + image: $(imageRegistry)/tks/system-logger:v1.14.0 + config: + heapSize: 2048M + storageConfig: + storageClassName: taco-storage + accessModes: ReadWriteOnce + size: 300Gi + racks: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: tks-msa + operator: In + values: + - enabled + stargate: + size: 1 + heapSize: 384M + containerImage: + registry: $(imageRegistry) + repository: tks + tag: v1.0.67 + nodeSelector: + tks-msa: enabled + +- name: istiod + override: + revision: "" + pilot.autoscaleEnabled: false + pilot.traceSampling: 0.1 + pilot.nodeSelector: $(serviceMeshControlNodeSelector) + global.hub: $(imageRegistry)/tks + global.proxy.clusterDomain: $(clusterName) + global.tracer.zipkin.address: jaeger-operator-jaeger-collector.$(namespace):9411 + +- name: istio-ingressgateway + override: + revision: "" + replicaCount: 1 + image: $(imageRegistry)/tks/proxyv2:1.17.2 + autoscaling: + enabled: false + minReplicas: 1 + maxReplicas: 5 + targetCPUUtilizationPercentage: 80 + service: + type: NodePort + ports: + - name: status-port + port: 15021 + protocol: TCP + targetPort: 15021 + nodePort: 30013 + - name: http2 + port: 80 + protocol: TCP + targetPort: 80 + nodePort: 30014 + - name: https + port: 443 + protocol: TCP + targetPort: 443 + nodePort: 30015 + #resources.requests.cpu: 1000m + #resources.requests.memory: 1024Mi + #resources.limits.cpu: 2000m + #resources.limits.memory: 2048Mi + nodeSelector: $(serviceMeshIngressNodeSelector) + +- name: istio-egressgateway + override: + revision: "" + replicaCount: 1 + image: $(imageRegistry)/tks/proxyv2:1.17.2 + autoscaling.enabled: false + service.type: ClusterIP + #resources.requests.cpu: 1000m + #resources.requests.memory: 1024Mi + #resources.limits.cpu: 2000m + #resources.limits.memory: 2048Mi + nodeSelector: $(serviceMeshEgressNodeSelector) + +- name: jaeger-operator + override: + image: + repository: $(imageRegistry)/tks/jaeger-operator 
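# The istio-ingressgateway override above pins the gateway to fixed NodePorts so BYOK
# sites can front it with an external load balancer. A sketch of the Service it yields;
# the selector label is an assumption derived from $(ingressGatewayLabel):
apiVersion: v1
kind: Service
metadata:
  name: istio-ingressgateway
  namespace: tks-msa
spec:
  type: NodePort
  selector:
    istio: ingressgateway                # assumed pod label
  ports:
    - { name: status-port, port: 15021, targetPort: 15021, nodePort: 30013 }
    - { name: http2,       port: 80,    targetPort: 80,    nodePort: 30014 }
    - { name: https,       port: 443,   targetPort: 443,   nodePort: 30015 }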
+ tag: 1.35.0 + nodeSelector: $(serviceMeshControlNodeSelector) + +- name: servicemesh-jaeger-resource + override: + namespace: tks-msa + sampling.param: 10 + collector.resources.requests.cpu: 500m + collector.resources.requests.memory: 1024Mi + collector.resources.limits.cpu: 1000m + collector.resources.limits.memory: 2048Mi + collector: + image: $(imageRegistry)/tks/jaeger-collector:1.35.0 + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: tks-msa + operator: In + values: + - enabled + storage: + type: cassandra + cassandra: + options: + servers: cassandra-dc-service.tks-msa.svc + keyspace: jaeger_v1_datacenter + cassandraCreateSchema: + image: $(imageRegistry)/tks/jaeger-cassandra-schema:1.35.0 + dependencies: + enabled: true + image: $(imageRegistry)/tks/spark-dependencies:1.35.0 + query: + image: $(imageRegistry)/tks/jaeger-query:1.35.0 + basePath: / + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: tks-msa + operator: In + values: + - enabled + agent: + image: $(imageRegistry)/tks/jaeger-agent:1.35.0 + cassandra: + user: + enabled: true + username: tks + password: tksword + nodeSelector: + tks-msa: enabled + elasticsearch.user.enabled: false + +- name: kiali-operator + override: + image: + repo: $(imageRegistry)/tks/kiali-operator + tag: v1.63.0 + nodeSelector: $(serviceMeshControlNodeSelector) + +- name: servicemesh-kiali-resource + override: + namespace: tks-msa + istioNamespace: tks-msa + deployment.namespace: tks-msa + deployment.image_name: $(imageRegistry)/tks/kiali + deployment.image_version: v1.63.0 + deployment.resources.requests.cpu: 500m + deployment.resources.requests.memory: 512Mi + deployment.resources.limits.cpu: 1000m + deployment.resources.limits.memory: 1024Mi + deployment.nodeSelector: + tks-msa: enabled + deployment.serviceType: NodePort + auth.strategy: openid + auth.openid.client_id: $(keycloakClientPrefix)-kiali + auth.openid.issuer_uri: $(keycloakIssuerUri) + auth.openid.username_claim: preferred_username + auth.openid.scopes: [ "openid", "email" ] + auth.openid.disable_rbac: true + externalServices.istio.configMapName: istio + externalServices.istio.istioIdentityDomain: svc.$(clusterName) + externalServices.prometheus.url: http://lma-prometheus.lma.svc:9090 + externalServices.tracing.inClusterUrl: http://jaeger-operator-jaeger-query.tks-msa:16686 + externalServices.tracing.url: https://jaeger-v2.taco-cat.xyz + externalServices.tracing.useGrpc: false + externalServices.grafana.auth.type: basic + externalServices.grafana.auth.username: admin + externalServices.grafana.auth.password: password + externalServices.grafana.inClusterUrl: http://grafana.lma.svc:80 + externalServices.grafana.url: https://grafana-v2.taco-cat.xyz + server.webRoot: / + +- name: gatekeeper + override: + nodeSelector: + tks-msa: enabled + config: + discovery-url: $(keycloakIssuerUri) + upstream-url: http://jaeger-operator-jaeger-query.tks-msa.svc:16686 + client-id: $(keycloakClientPrefix)-gatekeeper-jaeger + client-secret: $(gatekeeperSecret) + secure-cookie: false + service: + type: NodePort + proxy: + nodePort: 30012 diff --git a/byok-reference/tks-admin-tools/kustomization.yaml b/byok-reference/tks-admin-tools/kustomization.yaml new file mode 100644 index 0000000..7c415e6 --- /dev/null +++ b/byok-reference/tks-admin-tools/kustomization.yaml @@ -0,0 +1,5 @@ +resources: + - ../base + +transformers: + - site-values.yaml diff --git 
a/byok-reference/tks-admin-tools/site-values.yaml b/byok-reference/tks-admin-tools/site-values.yaml new file mode 100644 index 0000000..a9a5bf0 --- /dev/null +++ b/byok-reference/tks-admin-tools/site-values.yaml @@ -0,0 +1,111 @@ +apiVersion: openinfradev.github.com/v1 +kind: HelmValuesTransformer +metadata: + name: site + +global: + dbHost: ${DATABASE_HOST} + commonPassword: ${COMMON_PASSWORD} + storageClass: ${STORAGE_CLASS} + storageClassHa: ${STORAGE_CLASS_HA} + +charts: +- name: keycloak + override: + replicaCount: 3 + global.storageClass: $(storageClass) + auth.adminPassword: $(commonPassword) + ingress.enabled: true + ingress.hostname: TO_BE_FIXED + ingress.annotations: + nginx.ingress.kubernetes.io/proxy-buffer-size: 20k + acme.cert-manager.io/http01-edit-in-place: "true" + cert-manager.io/cluster-issuer: http0issuer + externalDatabase.host: $(dbHost) + externalDatabase.password: $(commonPassword) + +- name: tks-apis + override: + gitBaseUrl: https://github.com + gitAccount: decapod10 + db: + dbHost: $(dbHost) + adminPassword: $(commonPassword) + dbUser: tksuser + dbPassword: $(commonPassword) + tksapi: + replicaCount: 1 + tksAccount: + password: $(commonPassword) + args: + imageRegistryUrl: "harbor.taco-cat.xyz/appserving" + gitRepositoryUrl: "github.com/openinfradev" + keycloakAddress: http://keycloak.keycloak.svc:80/auth + tksbatch: + replicaCount: 1 + tksconsole: + replicaCount: 1 + +- name: harbor + override: + expose: + ingress: + hosts: + core: TO_BE_FIXED + className: "nginx" + externalURL: TO_BE_FIXED + persistence: + persistentVolumeClaim: + registry: + storageClass: $(storageClassHa) + accessMode: ReadWriteMany + size: 200Gi + chartmuseum: + storageClass: $(storageClassHa) + accessMode: ReadWriteMany + size: 20Gi + jobservice: + jobLog: + storageClass: $(storageClassHa) + accessMode: ReadWriteMany + scanDataExports: + storageClass: $(storageClassHa) + accessMode: ReadWriteMany + redis: + storageClass: $(storageClass) + accessMode: ReadWriteOnce + trivy: + storageClass: $(storageClass) + database: + type: external + external: + host: $(dbHost) + password: $(commonPassword) + sslmode: "require" + core: + replicas: 2 + jobservice: + replicas: 2 + registry: + replicas: 2 + chartmuseum: + replicas: 2 + trivy: + replicas: 2 + portal: + replicas: 2 + harborAdminPassword: $(commonPassword) + +- name: ingress-nginx + override: + controller: + resources: + requests: + cpu: 2000m + memory: 4Gi + service: + externalTrafficPolicy: Local + type: NodePort + config: + enable-underscores-in-headers: "true" + proxy-body-size: "10m" diff --git a/byok-reference/tks-cluster/kustomization.yaml b/byok-reference/tks-cluster/kustomization.yaml new file mode 100644 index 0000000..acae49c --- /dev/null +++ b/byok-reference/tks-cluster/kustomization.yaml @@ -0,0 +1,6 @@ +resources: +- ../base +- ../infra/byoh + +transformers: +- site-values.yaml diff --git a/byok-reference/tks-cluster/site-values.yaml b/byok-reference/tks-cluster/site-values.yaml new file mode 100644 index 0000000..e964fca --- /dev/null +++ b/byok-reference/tks-cluster/site-values.yaml @@ -0,0 +1,78 @@ +apiVersion: openinfradev.github.com/v1 +kind: HelmValuesTransformer +metadata: + name: site + +global: + # These values are replaced on cluster creation by workflow + clusterName: cluster.local + clusterEndpointHost: CHANGEME + clusterEndpointPort: CHANGEME + tksCpNode: CHNAGEME + tksInfraNode: CHNAGEME + tksUserNode: CHANGEME + keycloakIssuerUri: CHANGEME + keycloakClientId: CHANGEME +charts: +- name: cluster-api-byoh + override: + 
cluster: + name: $(clusterName) + kubernetesVersion: v1.25.11 + byoCluster: + bundleLookupBaseRegistry: harbor.taco-cat.xyz/cluster_api_provider_bringyourownhost + controlPlaneEndpoint: + host: $(clusterEndpointHost) + port: $(clusterEndpointPort) + kubeadmControlPlane: + selector: + matchLabels: + role: $(clusterName)-control-plane + replicas: $(tksCpNode) + clusterConfiguration: + apiServer: + extraArgs: + oidc-client-id: $(keycloakClientId) + oidc-issuer-url: $(keycloakIssuerUri) + machineDeployment: + - name: taco + replicas: $(tksInfraNode) + selector: + matchLabels: + role: $(clusterName)-tks + labels: + servicemesh: enabled + taco-egress-gateway: enabled + taco-ingress-gateway: enabled + taco-lma: enabled + - name: normal + replicas: $(tksUserNode) + selector: + matchLabels: + role: $(clusterName)-worker + +- name: ingress-nginx + override: + controller: + nodeSelector: + taco-lma: enabled + resources: + requests: + cpu: 2000m + memory: 4Gi + service: + externalTrafficPolicy: Local + type: NodePort + config: + enable-underscores-in-headers: "true" + proxy-body-size: "10m" + +- name: cluster-autoscaler + override: + discoveryNamespace: $(clusterName) + discoveryClusterName: $(clusterName) + +- name: cluster-autoscaler-rbac + override: + deployMgmtRbacOnly: + targetNamespace: $(clusterName) diff --git a/eks-msa-reference/lma/site-values.yaml b/eks-msa-reference/lma/site-values.yaml index 23ec784..112f656 100644 --- a/eks-msa-reference/lma/site-values.yaml +++ b/eks-msa-reference/lma/site-values.yaml @@ -16,6 +16,8 @@ global: lokiHost: loki-loki-distributed-gateway lokiPort: 80 + lokiuserHost: loki-user-loki-distributed-gateway + lokiuserPort: 80 s3Service: "minio.lma.svc:9000" # depends on $lmaNameSpace (ex. minio.taco-system.svc) lmaNameSpace: lma @@ -149,19 +151,23 @@ charts: - name: taco-loki host: $(lokiHost) port: $(lokiPort) + lokiuser: + - name: taco-loki-user + host: $(lokiuserHost) + port: $(lokiuserPort) targetLogs: - tag: kube.* bufferChunkSize: 2M bufferMaxSize: 5M do_not_store_as_default: false index: container - loki_name: taco-loki + loki_name: taco-loki-user memBufLimit: 20MB multi_index: - index: platform loki_name: taco-loki key: $kubernetes['namespace_name'] - value: kube-system|$(lmaNameSpace)|taco-system|argo + value: kube-system|$(lmaNameSpace)|taco-system|gatekeeper-system|argo parser: docker path: /var/log/containers/*.log type: kubernates @@ -275,6 +281,8 @@ charts: # - --deduplication.replica-label="replica" storegateway.persistence.size: 8Gi ruler.nodeSelector: $(nodeSelector) + ruler.service.type: LoadBalancer + ruler.service.annotations: $(awsNlbAnnotation) ruler.alertmanagers: - http://alertmanager-operated:9093 ruler.persistence.size: 8Gi @@ -284,61 +292,7 @@ charts: rules: - alert: "PrometheusDown" expr: absent(up{prometheus="lma/lma-prometheus"}) - - alert: node-cpu-high-load - annotations: - message: 클러스터({{ $labels.taco_cluster }})의 노드({{ $labels.instance }})의 idle process의 cpu 점유율이 3분 동안 0% 입니다. (현재 사용률 {{$value}}) - description: 워커 노드 CPU가 과부하 상태입니다. 일시적인 서비스 Traffic 증가, Workload의 SW 오류, Server HW Fan Fail등 다양한 원인으로 인해 발생할 수 있습니다. - Checkpoint: 일시적인 Service Traffic의 증가가 관측되지 않았다면, Alert발생 노드에서 실행 되는 pod중 CPU 자원을 많이 점유하는 pod의 설정을 점검해 보시길 제안드립니다. 예를 들어 pod spec의 limit 설정으로 과도한 CPU자원 점유을 막을 수 있습니다. - summary: Cpu resources of the node {{ $labels.instance }} are running low. - discriminative: $labels.taco_cluster, $labels.instance - expr: (avg by (taco_cluster, instance) (rate(node_cpu_seconds_total{mode="idle"}[60s]))) < 0 #0.1 # 진짜 0? 
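# The clusterConfiguration.apiServer.extraArgs block in the cluster-api-byoh override
# above ends up as kube-apiserver flags on the BYOH control plane, wiring cluster
# authentication to Keycloak. An illustrative kubeadm fragment once the CHANGEME globals
# are filled in by the workflow (issuer and client id below are placeholders):
apiServer:
  extraArgs:
    oidc-issuer-url: https://keycloak.example.com/auth/realms/example   # $(keycloakIssuerUri)
    oidc-client-id: example-k8s-api                                     # $(keycloakClientId)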
- for: 3m - labels: - severity: warning - - alert: node-memory-high-utilization - annotations: - message: 클러스터({{ $labels.taco_cluster }})의 노드({{ $labels.instance }})의 Memory 사용량이 3분동안 80% 를 넘어서고 있습니다. (현재 사용률 {{$value}}) - descriptioon: 워커 노드의 Memory 사용량이 80%를 넘었습니다. 일시적인 서비스 증가 및 SW 오류등 다양한 원인으로 발생할 수 있습니다. - Checkpoint: 일시적인 Service Traffic의 증가가 관측되지 않았다면, Alert발생 노드에서 실행되는 pod중 Memory 사용량이 높은 pod들에 대한 점검을 제안드립니다. - summary: Memory resources of the node {{ $labels.instance }} are running low. - discriminative: $labels.taco_cluster, $labels.instance - expr: (node_memory_MemAvailable_bytes/node_memory_MemTotal_bytes) < 0.2 - for: 3m - labels: - severity: warning - - alert: node-disk-full - annotations: - message: 지난 6시간동안의 추세로 봤을 때, 클러스터({{ $labels.taco_cluster }})의 노드({{ $labels.instance }})의 root 볼륨은 24시간 안에 Disk full이 예상됨 - description: 현재 Disk 사용 추세기준 24시간 내에 Disk 용량이 꽉 찰 것으로 예상됩니다. - Checkpoint: Disk 용량 최적화(삭제 및 Backup)을 수행하시길 권고합니다. 삭제할 내역이 없으면 증설 계획을 수립해 주십시요. - summary: Memory resources of the node {{ $labels.instance }} are running low. - discriminative: $labels.taco_cluster, $labels.instance - expr: predict_linear(node_filesystem_free_bytes{mountpoint="/"}[6h], 24*3600) < 0 - for: 30m - labels: - severity: critical - - alert: pvc-full - annotations: - message: 지난 6시간동안의 추세로 봤을 때, 클러스터({{ $labels.taco_cluster }})의 파드({{ $labels.persistentvolumeclaim }})가 24시간 안에 Disk full이 예상됨 - description: 현재 Disk 사용 추세기준 24시간 내에 Disk 용량이 꽉 찰것으로 예상됩니다. ({{ $labels.taco_cluster }} 클러스터, {{ $labels.persistentvolumeclaim }} PVC) - Checkpoint: Disk 용량 최적화(삭제 및 Backup)을 수행하시길 권고합니다. 삭제할 내역이 없으면 증설 계획을 수립해 주십시요. - summary: Disk resources of the volume(pvc) {{ $labels.persistentvolumeclaim }} are running low. - discriminative: $labels.taco_cluster, $labels.persistentvolumeclaim - expr: predict_linear(kubelet_volume_stats_available_bytes[6h], 24*3600) < 0 # kubelet_volume_stats_capacity_bytes - for: 30m - labels: - severity: critical - - alert: pod-restart-frequently - annotations: - message: 클러스터({{ $labels.taco_cluster }})의 파드({{ $labels.pod }})가 30분 동안 5회 이상 재기동 ({{ $value }}회) - description: 특정 Pod가 빈번하게 재기동 되고 있습니다. 점검이 필요합니다. ({{ $labels.taco_cluster }} 클러스터, {{ $labels.pod }} 파드) - Checkpoint: pod spec. 에 대한 점검이 필요합니다. pod의 log 및 status를 확인해 주세요. - discriminative: $labels.taco_cluster, $labels.pod, $labels.namespace - expr: increase(kube_pod_container_status_restarts_total{namespace!="kube-system"}[60m:]) > 2 # 몇회로 할 것인지? 
-      for: 30m
-      labels:
-        severity: critical
-
+
 - name: thanos-config
   override:
     objectStorage:
@@ -394,10 +348,42 @@ charts:
         aws:
           s3: http://$(defaultUser):$(defaultPassword)@$(s3Service)/minio
+- name: loki-user
+  override:
+    global.dnsService: kube-dns
+    # global.clusterDomain: $(clusterName)  # commented out because the cluster domain stays cluster.local regardless of the cluster name
+    gateway.service.type: LoadBalancer
+    gateway.service.annotations: $(awsNlbAnnotation)
+    ingester.persistence.storageClass: $(storageClassName)
+    distributor.persistence.storageClass: $(storageClassName)
+    queryFrontend.persistence.storageClass: $(storageClassName)
+    ruler.persistence.storageClass: $(storageClassName)
+    indexGateway.persistence.storageClass: $(storageClassName)
+    # select target node's label
+    ingester.nodeSelector: $(nodeSelector)
+    distributor.nodeSelector: $(nodeSelector)
+    querier.nodeSelector: $(nodeSelector)
+    queryFrontend.nodeSelector: $(nodeSelector)
+    queryScheduler.nodeSelector: $(nodeSelector)
+    tableManager.nodeSelector: $(nodeSelector)
+    gateway.nodeSelector: $(nodeSelector)
+    compactor.nodeSelector: $(nodeSelector)
+    ruler.nodeSelector: $(nodeSelector)
+    indexGateway.nodeSelector: $(nodeSelector)
+    memcachedChunks.nodeSelector: $(nodeSelector)
+    memcachedFrontend.nodeSelector: $(nodeSelector)
+    memcachedIndexQueries.nodeSelector: $(nodeSelector)
+    memcachedIndexWrites.nodeSelector: $(nodeSelector)
+    loki:
+      storageConfig:
+        aws:
+          s3: http://$(defaultUser):$(defaultPassword)@$(s3Service)/minio
+
 - name: lma-bucket
   override:
     s3.enabled: true
     s3.buckets:
     - name: $(clusterName)-tks-thanos
     - name: $(clusterName)-tks-loki
+    - name: $(clusterName)-tks-loki-user
     tks.iamRoles: $(tksIamRoles)
diff --git a/eks-msa-reference/policy/kustomization.yaml b/eks-msa-reference/policy/kustomization.yaml
new file mode 100644
index 0000000..7c415e6
--- /dev/null
+++ b/eks-msa-reference/policy/kustomization.yaml
@@ -0,0 +1,5 @@
+resources:
+  - ../base
+
+transformers:
+  - site-values.yaml
diff --git a/eks-msa-reference/policy/site-values.yaml b/eks-msa-reference/policy/site-values.yaml
new file mode 100644
index 0000000..80aa10e
--- /dev/null
+++ b/eks-msa-reference/policy/site-values.yaml
@@ -0,0 +1,26 @@
+apiVersion: openinfradev.github.com/v1
+kind: HelmValuesTransformer
+metadata:
+  name: site
+
+global:
+  nodeSelector:
+    taco-lma: enabled
+  clusterName: cluster.local
+  storageClassName: taco-storage
+  repository: https://openinfradev.github.io/helm-repo/
+
+charts:
+- name: opa-gatekeeper
+  override:
+    postUpgrade.nodeSelector: $(nodeSelector)
+    postInstall.nodeSelector: $(nodeSelector)
+    preUninstall.nodeSelector: $(nodeSelector)
+    controllerManager.nodeSelector: $(nodeSelector)
+    audit.nodeSelector: $(nodeSelector)
+    crds.nodeSelector: $(nodeSelector)
+
+    enableDeleteOperations: true
+
+- name: policy-resources
+  override: {}
diff --git a/eks-msa-reference/tks-cluster/site-values.yaml b/eks-msa-reference/tks-cluster/site-values.yaml
index 47c3174..8704c74 100644
--- a/eks-msa-reference/tks-cluster/site-values.yaml
+++ b/eks-msa-reference/tks-cluster/site-values.yaml
@@ -27,14 +27,14 @@ charts:
       name: $(clusterName)
       region: $(clusterRegion)
       eksEnabled: true
-      kubernetesVersion: v1.25.9
+      kubernetesVersion: v1.29.8
       eksAddons:
       - name: "aws-ebs-csi-driver"
-        version: "v1.18.0-eksbuild.1"
+        version: "v1.34.0-eksbuild.1"
         conflictResolution: "overwrite"
       - name: "vpc-cni"
         conflictResolution: "overwrite"
-        version: "v1.12.6-eksbuild.2"
+        version: "v1.18.3-eksbuild.2"
     multitenancyId:
       kind: AWSClusterRoleIdentity
       name: $(cloudAccountID)-account-role
diff --git a/eks-reference/lma/site-values.yaml b/eks-reference/lma/site-values.yaml
index 23ec784..112f656 100644
--- a/eks-reference/lma/site-values.yaml
+++ b/eks-reference/lma/site-values.yaml
@@ -16,6 +16,8 @@ global:
   lokiHost: loki-loki-distributed-gateway
   lokiPort: 80
+  lokiuserHost: loki-user-loki-distributed-gateway
+  lokiuserPort: 80
   s3Service: "minio.lma.svc:9000" # depends on $lmaNameSpace (ex. minio.taco-system.svc)
   lmaNameSpace: lma
@@ -149,19 +151,23 @@ charts:
       - name: taco-loki
         host: $(lokiHost)
         port: $(lokiPort)
+      lokiuser:
+      - name: taco-loki-user
+        host: $(lokiuserHost)
+        port: $(lokiuserPort)
     targetLogs:
     - tag: kube.*
      bufferChunkSize: 2M
      bufferMaxSize: 5M
      do_not_store_as_default: false
      index: container
-      loki_name: taco-loki
+      loki_name: taco-loki-user
      memBufLimit: 20MB
      multi_index:
      - index: platform
        loki_name: taco-loki
        key: $kubernetes['namespace_name']
-        value: kube-system|$(lmaNameSpace)|taco-system|argo
+        value: kube-system|$(lmaNameSpace)|taco-system|gatekeeper-system|argo
      parser: docker
      path: /var/log/containers/*.log
      type: kubernates
@@ -275,6 +281,8 @@ charts:
     # - --deduplication.replica-label="replica"
     storegateway.persistence.size: 8Gi
     ruler.nodeSelector: $(nodeSelector)
+    ruler.service.type: LoadBalancer
+    ruler.service.annotations: $(awsNlbAnnotation)
     ruler.alertmanagers:
     - http://alertmanager-operated:9093
     ruler.persistence.size: 8Gi
@@ -284,61 +292,7 @@ charts:
     rules:
     - alert: "PrometheusDown"
      expr: absent(up{prometheus="lma/lma-prometheus"})
-    - alert: node-cpu-high-load
-      annotations:
-        message: 클러스터({{ $labels.taco_cluster }})의 노드({{ $labels.instance }})의 idle process의 cpu 점유율이 3분 동안 0% 입니다. (현재 사용률 {{$value}})
-        description: 워커 노드 CPU가 과부하 상태입니다. 일시적인 서비스 Traffic 증가, Workload의 SW 오류, Server HW Fan Fail등 다양한 원인으로 인해 발생할 수 있습니다.
-        Checkpoint: 일시적인 Service Traffic의 증가가 관측되지 않았다면, Alert발생 노드에서 실행 되는 pod중 CPU 자원을 많이 점유하는 pod의 설정을 점검해 보시길 제안드립니다. 예를 들어 pod spec의 limit 설정으로 과도한 CPU자원 점유을 막을 수 있습니다.
-        summary: Cpu resources of the node {{ $labels.instance }} are running low.
-        discriminative: $labels.taco_cluster, $labels.instance
-      expr: (avg by (taco_cluster, instance) (rate(node_cpu_seconds_total{mode="idle"}[60s]))) < 0 #0.1 # 진짜 0?
-      for: 3m
-      labels:
-        severity: warning
-    - alert: node-memory-high-utilization
-      annotations:
-        message: 클러스터({{ $labels.taco_cluster }})의 노드({{ $labels.instance }})의 Memory 사용량이 3분동안 80% 를 넘어서고 있습니다. (현재 사용률 {{$value}})
-        descriptioon: 워커 노드의 Memory 사용량이 80%를 넘었습니다. 일시적인 서비스 증가 및 SW 오류등 다양한 원인으로 발생할 수 있습니다.
-        Checkpoint: 일시적인 Service Traffic의 증가가 관측되지 않았다면, Alert발생 노드에서 실행되는 pod중 Memory 사용량이 높은 pod들에 대한 점검을 제안드립니다.
-        summary: Memory resources of the node {{ $labels.instance }} are running low.
-        discriminative: $labels.taco_cluster, $labels.instance
-      expr: (node_memory_MemAvailable_bytes/node_memory_MemTotal_bytes) < 0.2
-      for: 3m
-      labels:
-        severity: warning
-    - alert: node-disk-full
-      annotations:
-        message: 지난 6시간동안의 추세로 봤을 때, 클러스터({{ $labels.taco_cluster }})의 노드({{ $labels.instance }})의 root 볼륨은 24시간 안에 Disk full이 예상됨
-        description: 현재 Disk 사용 추세기준 24시간 내에 Disk 용량이 꽉 찰 것으로 예상됩니다.
-        Checkpoint: Disk 용량 최적화(삭제 및 Backup)을 수행하시길 권고합니다. 삭제할 내역이 없으면 증설 계획을 수립해 주십시요.
-        summary: Memory resources of the node {{ $labels.instance }} are running low.
-        discriminative: $labels.taco_cluster, $labels.instance
-      expr: predict_linear(node_filesystem_free_bytes{mountpoint="/"}[6h], 24*3600) < 0
-      for: 30m
-      labels:
-        severity: critical
-    - alert: pvc-full
-      annotations:
-        message: 지난 6시간동안의 추세로 봤을 때, 클러스터({{ $labels.taco_cluster }})의 파드({{ $labels.persistentvolumeclaim }})가 24시간 안에 Disk full이 예상됨
-        description: 현재 Disk 사용 추세기준 24시간 내에 Disk 용량이 꽉 찰것으로 예상됩니다. ({{ $labels.taco_cluster }} 클러스터, {{ $labels.persistentvolumeclaim }} PVC)
-        Checkpoint: Disk 용량 최적화(삭제 및 Backup)을 수행하시길 권고합니다. 삭제할 내역이 없으면 증설 계획을 수립해 주십시요.
-        summary: Disk resources of the volume(pvc) {{ $labels.persistentvolumeclaim }} are running low.
-        discriminative: $labels.taco_cluster, $labels.persistentvolumeclaim
-      expr: predict_linear(kubelet_volume_stats_available_bytes[6h], 24*3600) < 0 # kubelet_volume_stats_capacity_bytes
-      for: 30m
-      labels:
-        severity: critical
-    - alert: pod-restart-frequently
-      annotations:
-        message: 클러스터({{ $labels.taco_cluster }})의 파드({{ $labels.pod }})가 30분 동안 5회 이상 재기동 ({{ $value }}회)
-        description: 특정 Pod가 빈번하게 재기동 되고 있습니다. 점검이 필요합니다. ({{ $labels.taco_cluster }} 클러스터, {{ $labels.pod }} 파드)
-        Checkpoint: pod spec. 에 대한 점검이 필요합니다. pod의 log 및 status를 확인해 주세요.
-        discriminative: $labels.taco_cluster, $labels.pod, $labels.namespace
-      expr: increase(kube_pod_container_status_restarts_total{namespace!="kube-system"}[60m:]) > 2 # 몇회로 할 것인지?
-      for: 30m
-      labels:
-        severity: critical
-
+
 - name: thanos-config
   override:
     objectStorage:
@@ -394,10 +348,42 @@ charts:
         aws:
           s3: http://$(defaultUser):$(defaultPassword)@$(s3Service)/minio
+- name: loki-user
+  override:
+    global.dnsService: kube-dns
+    # global.clusterDomain: $(clusterName)  # commented out because the cluster domain stays cluster.local regardless of the cluster name
+    gateway.service.type: LoadBalancer
+    gateway.service.annotations: $(awsNlbAnnotation)
+    ingester.persistence.storageClass: $(storageClassName)
+    distributor.persistence.storageClass: $(storageClassName)
+    queryFrontend.persistence.storageClass: $(storageClassName)
+    ruler.persistence.storageClass: $(storageClassName)
+    indexGateway.persistence.storageClass: $(storageClassName)
+    # select target node's label
+    ingester.nodeSelector: $(nodeSelector)
+    distributor.nodeSelector: $(nodeSelector)
+    querier.nodeSelector: $(nodeSelector)
+    queryFrontend.nodeSelector: $(nodeSelector)
+    queryScheduler.nodeSelector: $(nodeSelector)
+    tableManager.nodeSelector: $(nodeSelector)
+    gateway.nodeSelector: $(nodeSelector)
+    compactor.nodeSelector: $(nodeSelector)
+    ruler.nodeSelector: $(nodeSelector)
+    indexGateway.nodeSelector: $(nodeSelector)
+    memcachedChunks.nodeSelector: $(nodeSelector)
+    memcachedFrontend.nodeSelector: $(nodeSelector)
+    memcachedIndexQueries.nodeSelector: $(nodeSelector)
+    memcachedIndexWrites.nodeSelector: $(nodeSelector)
+    loki:
+      storageConfig:
+        aws:
+          s3: http://$(defaultUser):$(defaultPassword)@$(s3Service)/minio
+
 - name: lma-bucket
   override:
     s3.enabled: true
     s3.buckets:
     - name: $(clusterName)-tks-thanos
     - name: $(clusterName)-tks-loki
+    - name: $(clusterName)-tks-loki-user
     tks.iamRoles: $(tksIamRoles)
diff --git a/eks-reference/policy/kustomization.yaml b/eks-reference/policy/kustomization.yaml
new file mode 100644
index 0000000..7c415e6
--- /dev/null
+++ b/eks-reference/policy/kustomization.yaml
@@ -0,0 +1,5 @@
+resources:
+  - ../base
+
+transformers:
+  - site-values.yaml
diff --git a/eks-reference/policy/site-values.yaml b/eks-reference/policy/site-values.yaml
new file mode 100644
index 0000000..80aa10e
--- /dev/null
+++ b/eks-reference/policy/site-values.yaml
@@ -0,0 +1,26 @@
+apiVersion: openinfradev.github.com/v1
+kind: HelmValuesTransformer
+metadata:
+  name: site
+
+global:
+  nodeSelector:
+    taco-lma: enabled
+  clusterName: cluster.local
+  storageClassName: taco-storage
+  repository: https://openinfradev.github.io/helm-repo/
+
+charts:
+- name: opa-gatekeeper
+  override:
+    postUpgrade.nodeSelector: $(nodeSelector)
+    postInstall.nodeSelector: $(nodeSelector)
+    preUninstall.nodeSelector: $(nodeSelector)
+    controllerManager.nodeSelector: $(nodeSelector)
+    audit.nodeSelector: $(nodeSelector)
+    crds.nodeSelector: $(nodeSelector)
+
+    enableDeleteOperations: true
+
+- name: policy-resources
+  override: {}
diff --git a/eks-reference/tks-cluster/site-values.yaml b/eks-reference/tks-cluster/site-values.yaml
index 6d88add..a3319be 100644
--- a/eks-reference/tks-cluster/site-values.yaml
+++ b/eks-reference/tks-cluster/site-values.yaml
@@ -30,14 +30,14 @@ charts:
       name: $(clusterName)
       region: $(clusterRegion)
       eksEnabled: true
-      kubernetesVersion: v1.25.9
+      kubernetesVersion: v1.29.8
       eksAddons:
       - name: "aws-ebs-csi-driver"
-        version: "v1.18.0-eksbuild.1"
+        version: "v1.34.0-eksbuild.1"
         conflictResolution: "overwrite"
       - name: "vpc-cni"
         conflictResolution: "overwrite"
-        version: "v1.12.6-eksbuild.2"
+        version: "v1.18.3-eksbuild.2"
     multitenancyId:
       kind: AWSClusterRoleIdentity
       name: $(cloudAccountID)-account-role
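
Note on the new policy overlays: the opa-gatekeeper entries above use Helm-style dotted keys plus $(...) references that the HelmValuesTransformer resolves from the global block. The snippet below is only a rough sketch of what the values handed to the opa-gatekeeper chart would look like, assuming plain $(var) substitution and the usual dotted-key expansion; the exact output depends on the transformer implementation and is not reproduced here.

    # Illustrative sketch of rendered opa-gatekeeper values (not generated output)
    postUpgrade:
      nodeSelector:
        taco-lma: enabled
    postInstall:
      nodeSelector:
        taco-lma: enabled
    preUninstall:
      nodeSelector:
        taco-lma: enabled
    controllerManager:
      nodeSelector:
        taco-lma: enabled
    audit:
      nodeSelector:
        taco-lma: enabled
    crds:
      nodeSelector:
        taco-lma: enabled
    enableDeleteOperations: true

With nodeSelector set to taco-lma: enabled everywhere, all gatekeeper workloads (including the pre/post hooks) are pinned to the same nodes as the LMA stack, which matches the node labels added in the tks-cluster machineDeployment above.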