From 2cbcfe0fe25f02bbcdd9105f4f3dc6085aa1cc1e Mon Sep 17 00:00:00 2001 From: sungil Date: Wed, 4 Oct 2023 04:51:55 +0000 Subject: [PATCH 01/12] policy: add a decapod app for policies --- aws-msa-reference/policy/kustomization.yaml | 5 ++++ aws-msa-reference/policy/site-values.yaml | 26 +++++++++++++++++++++ aws-reference/policy/kustomization.yaml | 5 ++++ aws-reference/policy/site-values.yaml | 26 +++++++++++++++++++++ eks-msa-reference/policy/kustomization.yaml | 5 ++++ eks-msa-reference/policy/site-values.yaml | 26 +++++++++++++++++++++ eks-reference/policy/kustomization.yaml | 5 ++++ eks-reference/policy/site-values.yaml | 26 +++++++++++++++++++++ 8 files changed, 124 insertions(+) create mode 100644 aws-msa-reference/policy/kustomization.yaml create mode 100644 aws-msa-reference/policy/site-values.yaml create mode 100644 aws-reference/policy/kustomization.yaml create mode 100644 aws-reference/policy/site-values.yaml create mode 100644 eks-msa-reference/policy/kustomization.yaml create mode 100644 eks-msa-reference/policy/site-values.yaml create mode 100644 eks-reference/policy/kustomization.yaml create mode 100644 eks-reference/policy/site-values.yaml diff --git a/aws-msa-reference/policy/kustomization.yaml b/aws-msa-reference/policy/kustomization.yaml new file mode 100644 index 0000000..7c415e6 --- /dev/null +++ b/aws-msa-reference/policy/kustomization.yaml @@ -0,0 +1,5 @@ +resources: + - ../base + +transformers: + - site-values.yaml diff --git a/aws-msa-reference/policy/site-values.yaml b/aws-msa-reference/policy/site-values.yaml new file mode 100644 index 0000000..80aa10e --- /dev/null +++ b/aws-msa-reference/policy/site-values.yaml @@ -0,0 +1,26 @@ +apiVersion: openinfradev.github.com/v1 +kind: HelmValuesTransformer +metadata: + name: site + +global: + nodeSelector: + taco-lma: enabled + clusterName: cluster.local + storageClassName: taco-storage + repository: https://openinfradev.github.io/helm-repo/ + +charts: +- name: opa-gatekeeper + override: + postUpgrade.nodeSelector: $(nodeSelector) + postInstall.nodeSelector: $(nodeSelector) + preUninstall.nodeSelector: $(nodeSelector) + controllerManager.nodeSelector: $(nodeSelector) + audit.nodeSelector: $(nodeSelector) + crds.nodeSelector: $(nodeSelector) + + enableDeleteOperations: true + +- name: policy-resources + override: {} diff --git a/aws-reference/policy/kustomization.yaml b/aws-reference/policy/kustomization.yaml new file mode 100644 index 0000000..7c415e6 --- /dev/null +++ b/aws-reference/policy/kustomization.yaml @@ -0,0 +1,5 @@ +resources: + - ../base + +transformers: + - site-values.yaml diff --git a/aws-reference/policy/site-values.yaml b/aws-reference/policy/site-values.yaml new file mode 100644 index 0000000..80aa10e --- /dev/null +++ b/aws-reference/policy/site-values.yaml @@ -0,0 +1,26 @@ +apiVersion: openinfradev.github.com/v1 +kind: HelmValuesTransformer +metadata: + name: site + +global: + nodeSelector: + taco-lma: enabled + clusterName: cluster.local + storageClassName: taco-storage + repository: https://openinfradev.github.io/helm-repo/ + +charts: +- name: opa-gatekeeper + override: + postUpgrade.nodeSelector: $(nodeSelector) + postInstall.nodeSelector: $(nodeSelector) + preUninstall.nodeSelector: $(nodeSelector) + controllerManager.nodeSelector: $(nodeSelector) + audit.nodeSelector: $(nodeSelector) + crds.nodeSelector: $(nodeSelector) + + enableDeleteOperations: true + +- name: policy-resources + override: {} diff --git a/eks-msa-reference/policy/kustomization.yaml b/eks-msa-reference/policy/kustomization.yaml 
new file mode 100644 index 0000000..7c415e6 --- /dev/null +++ b/eks-msa-reference/policy/kustomization.yaml @@ -0,0 +1,5 @@ +resources: + - ../base + +transformers: + - site-values.yaml diff --git a/eks-msa-reference/policy/site-values.yaml b/eks-msa-reference/policy/site-values.yaml new file mode 100644 index 0000000..80aa10e --- /dev/null +++ b/eks-msa-reference/policy/site-values.yaml @@ -0,0 +1,26 @@ +apiVersion: openinfradev.github.com/v1 +kind: HelmValuesTransformer +metadata: + name: site + +global: + nodeSelector: + taco-lma: enabled + clusterName: cluster.local + storageClassName: taco-storage + repository: https://openinfradev.github.io/helm-repo/ + +charts: +- name: opa-gatekeeper + override: + postUpgrade.nodeSelector: $(nodeSelector) + postInstall.nodeSelector: $(nodeSelector) + preUninstall.nodeSelector: $(nodeSelector) + controllerManager.nodeSelector: $(nodeSelector) + audit.nodeSelector: $(nodeSelector) + crds.nodeSelector: $(nodeSelector) + + enableDeleteOperations: true + +- name: policy-resources + override: {} diff --git a/eks-reference/policy/kustomization.yaml b/eks-reference/policy/kustomization.yaml new file mode 100644 index 0000000..7c415e6 --- /dev/null +++ b/eks-reference/policy/kustomization.yaml @@ -0,0 +1,5 @@ +resources: + - ../base + +transformers: + - site-values.yaml diff --git a/eks-reference/policy/site-values.yaml b/eks-reference/policy/site-values.yaml new file mode 100644 index 0000000..80aa10e --- /dev/null +++ b/eks-reference/policy/site-values.yaml @@ -0,0 +1,26 @@ +apiVersion: openinfradev.github.com/v1 +kind: HelmValuesTransformer +metadata: + name: site + +global: + nodeSelector: + taco-lma: enabled + clusterName: cluster.local + storageClassName: taco-storage + repository: https://openinfradev.github.io/helm-repo/ + +charts: +- name: opa-gatekeeper + override: + postUpgrade.nodeSelector: $(nodeSelector) + postInstall.nodeSelector: $(nodeSelector) + preUninstall.nodeSelector: $(nodeSelector) + controllerManager.nodeSelector: $(nodeSelector) + audit.nodeSelector: $(nodeSelector) + crds.nodeSelector: $(nodeSelector) + + enableDeleteOperations: true + +- name: policy-resources + override: {} From 49558f0d340ebb58ceceb3b70f926c21e74a5d83 Mon Sep 17 00:00:00 2001 From: sungil Date: Mon, 4 Dec 2023 20:32:09 +0000 Subject: [PATCH 02/12] fluentbit: do not store as default over every logs --- aws-msa-reference/lma/site-values.yaml | 2 +- aws-reference/lma/site-values.yaml | 2 +- byoh-reference/lma/site-values.yaml | 2 +- byoh-ssu-reference/lma/site-values.yaml | 2 +- byoh-stage-reference/lma/site-values.yaml | 2 +- byoh-suy-reference/lma/site-values.yaml | 2 +- eks-msa-reference/lma/site-values.yaml | 2 +- eks-reference/lma/site-values.yaml | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/aws-msa-reference/lma/site-values.yaml b/aws-msa-reference/lma/site-values.yaml index 2ae41ec..1558e0b 100644 --- a/aws-msa-reference/lma/site-values.yaml +++ b/aws-msa-reference/lma/site-values.yaml @@ -152,7 +152,7 @@ charts: - tag: kube.* bufferChunkSize: 2M bufferMaxSize: 5M - do_not_store_as_default: false + do_not_store_as_default: true index: container loki_name: taco-loki memBufLimit: 20MB diff --git a/aws-reference/lma/site-values.yaml b/aws-reference/lma/site-values.yaml index 2ae41ec..1558e0b 100644 --- a/aws-reference/lma/site-values.yaml +++ b/aws-reference/lma/site-values.yaml @@ -152,7 +152,7 @@ charts: - tag: kube.* bufferChunkSize: 2M bufferMaxSize: 5M - do_not_store_as_default: false + do_not_store_as_default: true index: 
container loki_name: taco-loki memBufLimit: 20MB diff --git a/byoh-reference/lma/site-values.yaml b/byoh-reference/lma/site-values.yaml index 414a194..166c147 100644 --- a/byoh-reference/lma/site-values.yaml +++ b/byoh-reference/lma/site-values.yaml @@ -155,7 +155,7 @@ charts: - tag: kube.* bufferChunkSize: 2M bufferMaxSize: 5M - do_not_store_as_default: false + do_not_store_as_default: true index: container loki_name: taco-loki memBufLimit: 20MB diff --git a/byoh-ssu-reference/lma/site-values.yaml b/byoh-ssu-reference/lma/site-values.yaml index 3683133..8eb0625 100644 --- a/byoh-ssu-reference/lma/site-values.yaml +++ b/byoh-ssu-reference/lma/site-values.yaml @@ -174,7 +174,7 @@ charts: - tag: kube.* bufferChunkSize: 2M bufferMaxSize: 5M - do_not_store_as_default: false + do_not_store_as_default: true index: container loki_name: taco-loki memBufLimit: 20MB diff --git a/byoh-stage-reference/lma/site-values.yaml b/byoh-stage-reference/lma/site-values.yaml index 3683133..8eb0625 100644 --- a/byoh-stage-reference/lma/site-values.yaml +++ b/byoh-stage-reference/lma/site-values.yaml @@ -174,7 +174,7 @@ charts: - tag: kube.* bufferChunkSize: 2M bufferMaxSize: 5M - do_not_store_as_default: false + do_not_store_as_default: true index: container loki_name: taco-loki memBufLimit: 20MB diff --git a/byoh-suy-reference/lma/site-values.yaml b/byoh-suy-reference/lma/site-values.yaml index 3683133..8eb0625 100644 --- a/byoh-suy-reference/lma/site-values.yaml +++ b/byoh-suy-reference/lma/site-values.yaml @@ -174,7 +174,7 @@ charts: - tag: kube.* bufferChunkSize: 2M bufferMaxSize: 5M - do_not_store_as_default: false + do_not_store_as_default: true index: container loki_name: taco-loki memBufLimit: 20MB diff --git a/eks-msa-reference/lma/site-values.yaml b/eks-msa-reference/lma/site-values.yaml index 23ec784..7c2430a 100644 --- a/eks-msa-reference/lma/site-values.yaml +++ b/eks-msa-reference/lma/site-values.yaml @@ -153,7 +153,7 @@ charts: - tag: kube.* bufferChunkSize: 2M bufferMaxSize: 5M - do_not_store_as_default: false + do_not_store_as_default: true index: container loki_name: taco-loki memBufLimit: 20MB diff --git a/eks-reference/lma/site-values.yaml b/eks-reference/lma/site-values.yaml index 23ec784..7c2430a 100644 --- a/eks-reference/lma/site-values.yaml +++ b/eks-reference/lma/site-values.yaml @@ -153,7 +153,7 @@ charts: - tag: kube.* bufferChunkSize: 2M bufferMaxSize: 5M - do_not_store_as_default: false + do_not_store_as_default: true index: container loki_name: taco-loki memBufLimit: 20MB From a8653f67852a4970aa54f49da50929763631c372 Mon Sep 17 00:00:00 2001 From: "taekyu.kang" Date: Fri, 19 Apr 2024 15:52:45 +0900 Subject: [PATCH 03/12] feature. 
add alert ruler for tks_policy --- aws-msa-reference/lma/site-values.yaml | 21 ++++++++++++++++++++- aws-reference/lma/site-values.yaml | 22 +++++++++++++++++++++- byoh-reference/lma/site-values.yaml | 20 ++++++++++++++++++++ eks-msa-reference/lma/site-values.yaml | 20 ++++++++++++++++++++ eks-reference/lma/site-values.yaml | 20 ++++++++++++++++++++ 5 files changed, 101 insertions(+), 2 deletions(-) diff --git a/aws-msa-reference/lma/site-values.yaml b/aws-msa-reference/lma/site-values.yaml index 1558e0b..a39b182 100644 --- a/aws-msa-reference/lma/site-values.yaml +++ b/aws-msa-reference/lma/site-values.yaml @@ -337,7 +337,26 @@ charts: for: 30m labels: severity: critical - + - alert: policy-audited + annotations: + Checkpoint: 정책위반이 발생하였습니다.({{ $labels.kind }} / {{ $labels.name }}) + description: 클러스터 ( {{ $labels.taco_cluster }})의 자원({{ $labels.violating_kind }} - {{ $labels.violating_namespace }} / {{ $labels.violating_nam }})에서 정책({{ $labels.kind }} / {{ $labels.name }})위반이 발생했습니다. 메시지 - {{ $labels.violation_msg }} + discriminative: $labels.kind,$labels.name,$labels.taco_cluster,$labels.violating_kind,$labels.violating_name,$labels.violating_namespace,$labels.violation_msg + message: 정책 위반({{ $labels.kind }} / {{ $labels.name }}) + expr: opa_scorecard_constraint_violations{namespace!='kube-system|taco-system|gatekeeper-system', violation_enforcement='warn'} == 1 + for: 1m + labels: + severity: critical + - alert: policy-blocked + annotations: + Checkpoint: "정책위반이 시도가 발생하였습니다.({{ $labels.kind }} / {{ $labels.name }})" + description: "클러스터 ( {{ $labels.taco_cluster }})의 자원({{ $labels.violating_kind }} - {{ $labels.violating_namespace }} / {{ $labels.violating_nam }})에서 정책({{ $labels.kind }} / {{ $labels.name }})위반 시도가 발생했습니다. 메시지 - {{ $labels.violation_msg }}" + discriminative: $labels.kind,$labels.name,$labels.taco_cluster,$labels.violating_kind,$labels.violating_name,$labels.violating_namespace,$labels.violation_msg + message: 정책 위반({{ $labels.kind }} / {{ $labels.name }}) 시도 + expr: opa_scorecard_constraint_violations{namespace!='kube-system|taco-system|gatekeeper-system',violation_enforcement=''} == 1 + for: 1m + labels: + severity: critical - name: thanos-config override: objectStorage: diff --git a/aws-reference/lma/site-values.yaml b/aws-reference/lma/site-values.yaml index 1558e0b..4ba03b3 100644 --- a/aws-reference/lma/site-values.yaml +++ b/aws-reference/lma/site-values.yaml @@ -337,7 +337,27 @@ charts: for: 30m labels: severity: critical - + - alert: policy-audited + annotations: + Checkpoint: 정책위반이 발생하였습니다.({{ $labels.kind }} / {{ $labels.name }}) + description: 클러스터 ( {{ $labels.taco_cluster }})의 자원({{ $labels.violating_kind }} - {{ $labels.violating_namespace }} / {{ $labels.violating_nam }})에서 정책({{ $labels.kind }} / {{ $labels.name }})위반이 발생했습니다. 
메시지 - {{ $labels.violation_msg }} + discriminative: $labels.kind,$labels.name,$labels.taco_cluster,$labels.violating_kind,$labels.violating_name,$labels.violating_namespace,$labels.violation_msg + message: 정책 위반({{ $labels.kind }} / {{ $labels.name }}) + expr: opa_scorecard_constraint_violations{namespace!='kube-system|taco-system|gatekeeper-system', violation_enforcement='warn'} == 1 + for: 1m + labels: + severity: critical + - alert: policy-blocked + annotations: + Checkpoint: "정책위반이 시도가 발생하였습니다.({{ $labels.kind }} / {{ $labels.name }})" + description: "클러스터 ( {{ $labels.taco_cluster }})의 자원({{ $labels.violating_kind }} - {{ $labels.violating_namespace }} / {{ $labels.violating_nam }})에서 정책({{ $labels.kind }} / {{ $labels.name }})위반 시도가 발생했습니다. 메시지 - {{ $labels.violation_msg }}" + discriminative: $labels.kind,$labels.name,$labels.taco_cluster,$labels.violating_kind,$labels.violating_name,$labels.violating_namespace,$labels.violation_msg + message: 정책 위반({{ $labels.kind }} / {{ $labels.name }}) 시도 + expr: opa_scorecard_constraint_violations{namespace!='kube-system|taco-system|gatekeeper-system',violation_enforcement=''} == 1 + for: 1m + labels: + severity: critical + - name: thanos-config override: objectStorage: diff --git a/byoh-reference/lma/site-values.yaml b/byoh-reference/lma/site-values.yaml index 166c147..3974685 100644 --- a/byoh-reference/lma/site-values.yaml +++ b/byoh-reference/lma/site-values.yaml @@ -345,6 +345,26 @@ charts: for: 30m labels: severity: critical + - alert: policy-audited + annotations: + Checkpoint: 정책위반이 발생하였습니다.({{ $labels.kind }} / {{ $labels.name }}) + description: 클러스터 ( {{ $labels.taco_cluster }})의 자원({{ $labels.violating_kind }} - {{ $labels.violating_namespace }} / {{ $labels.violating_nam }})에서 정책({{ $labels.kind }} / {{ $labels.name }})위반이 발생했습니다. 메시지 - {{ $labels.violation_msg }} + discriminative: $labels.kind,$labels.name,$labels.taco_cluster,$labels.violating_kind,$labels.violating_name,$labels.violating_namespace,$labels.violation_msg + message: 정책 위반({{ $labels.kind }} / {{ $labels.name }}) + expr: opa_scorecard_constraint_violations{namespace!='kube-system|taco-system|gatekeeper-system', violation_enforcement='warn'} == 1 + for: 1m + labels: + severity: critical + - alert: policy-blocked + annotations: + Checkpoint: "정책위반이 시도가 발생하였습니다.({{ $labels.kind }} / {{ $labels.name }})" + description: "클러스터 ( {{ $labels.taco_cluster }})의 자원({{ $labels.violating_kind }} - {{ $labels.violating_namespace }} / {{ $labels.violating_nam }})에서 정책({{ $labels.kind }} / {{ $labels.name }})위반 시도가 발생했습니다. 
메시지 - {{ $labels.violation_msg }}" + discriminative: $labels.kind,$labels.name,$labels.taco_cluster,$labels.violating_kind,$labels.violating_name,$labels.violating_namespace,$labels.violation_msg + message: 정책 위반({{ $labels.kind }} / {{ $labels.name }}) 시도 + expr: opa_scorecard_constraint_violations{namespace!='kube-system|taco-system|gatekeeper-system',violation_enforcement=''} == 1 + for: 1m + labels: + severity: critical - name: thanos-config override: diff --git a/eks-msa-reference/lma/site-values.yaml b/eks-msa-reference/lma/site-values.yaml index 7c2430a..baff9e8 100644 --- a/eks-msa-reference/lma/site-values.yaml +++ b/eks-msa-reference/lma/site-values.yaml @@ -338,6 +338,26 @@ charts: for: 30m labels: severity: critical + - alert: policy-audited + annotations: + Checkpoint: 정책위반이 발생하였습니다.({{ $labels.kind }} / {{ $labels.name }}) + description: 클러스터 ( {{ $labels.taco_cluster }})의 자원({{ $labels.violating_kind }} - {{ $labels.violating_namespace }} / {{ $labels.violating_nam }})에서 정책({{ $labels.kind }} / {{ $labels.name }})위반이 발생했습니다. 메시지 - {{ $labels.violation_msg }} + discriminative: $labels.kind,$labels.name,$labels.taco_cluster,$labels.violating_kind,$labels.violating_name,$labels.violating_namespace,$labels.violation_msg + message: 정책 위반({{ $labels.kind }} / {{ $labels.name }}) + expr: opa_scorecard_constraint_violations{namespace!='kube-system|taco-system|gatekeeper-system', violation_enforcement='warn'} == 1 + for: 1m + labels: + severity: critical + - alert: policy-blocked + annotations: + Checkpoint: "정책위반이 시도가 발생하였습니다.({{ $labels.kind }} / {{ $labels.name }})" + description: "클러스터 ( {{ $labels.taco_cluster }})의 자원({{ $labels.violating_kind }} - {{ $labels.violating_namespace }} / {{ $labels.violating_nam }})에서 정책({{ $labels.kind }} / {{ $labels.name }})위반 시도가 발생했습니다. 메시지 - {{ $labels.violation_msg }}" + discriminative: $labels.kind,$labels.name,$labels.taco_cluster,$labels.violating_kind,$labels.violating_name,$labels.violating_namespace,$labels.violation_msg + message: 정책 위반({{ $labels.kind }} / {{ $labels.name }}) 시도 + expr: opa_scorecard_constraint_violations{namespace!='kube-system|taco-system|gatekeeper-system',violation_enforcement=''} == 1 + for: 1m + labels: + severity: critical - name: thanos-config override: diff --git a/eks-reference/lma/site-values.yaml b/eks-reference/lma/site-values.yaml index 7c2430a..baff9e8 100644 --- a/eks-reference/lma/site-values.yaml +++ b/eks-reference/lma/site-values.yaml @@ -338,6 +338,26 @@ charts: for: 30m labels: severity: critical + - alert: policy-audited + annotations: + Checkpoint: 정책위반이 발생하였습니다.({{ $labels.kind }} / {{ $labels.name }}) + description: 클러스터 ( {{ $labels.taco_cluster }})의 자원({{ $labels.violating_kind }} - {{ $labels.violating_namespace }} / {{ $labels.violating_nam }})에서 정책({{ $labels.kind }} / {{ $labels.name }})위반이 발생했습니다. 
메시지 - {{ $labels.violation_msg }} + discriminative: $labels.kind,$labels.name,$labels.taco_cluster,$labels.violating_kind,$labels.violating_name,$labels.violating_namespace,$labels.violation_msg + message: 정책 위반({{ $labels.kind }} / {{ $labels.name }}) + expr: opa_scorecard_constraint_violations{namespace!='kube-system|taco-system|gatekeeper-system', violation_enforcement='warn'} == 1 + for: 1m + labels: + severity: critical + - alert: policy-blocked + annotations: + Checkpoint: "정책위반이 시도가 발생하였습니다.({{ $labels.kind }} / {{ $labels.name }})" + description: "클러스터 ( {{ $labels.taco_cluster }})의 자원({{ $labels.violating_kind }} - {{ $labels.violating_namespace }} / {{ $labels.violating_nam }})에서 정책({{ $labels.kind }} / {{ $labels.name }})위반 시도가 발생했습니다. 메시지 - {{ $labels.violation_msg }}" + discriminative: $labels.kind,$labels.name,$labels.taco_cluster,$labels.violating_kind,$labels.violating_name,$labels.violating_namespace,$labels.violation_msg + message: 정책 위반({{ $labels.kind }} / {{ $labels.name }}) 시도 + expr: opa_scorecard_constraint_violations{namespace!='kube-system|taco-system|gatekeeper-system',violation_enforcement=''} == 1 + for: 1m + labels: + severity: critical - name: thanos-config override: From 9d2964c94fd9a37c666882a2b2f05690cdc39b9f Mon Sep 17 00:00:00 2001 From: "taekyu.kang" Date: Wed, 24 Apr 2024 09:58:39 +0900 Subject: [PATCH 04/12] feature. remove thanos ruler from all stack_templates --- aws-msa-reference/lma/site-values.yaml | 75 +------------------------- aws-reference/lma/site-values.yaml | 74 ------------------------- byoh-reference/lma/site-values.yaml | 74 ------------------------- eks-msa-reference/lma/site-values.yaml | 74 ------------------------- eks-reference/lma/site-values.yaml | 74 ------------------------- 5 files changed, 1 insertion(+), 370 deletions(-) diff --git a/aws-msa-reference/lma/site-values.yaml b/aws-msa-reference/lma/site-values.yaml index a39b182..55597ca 100644 --- a/aws-msa-reference/lma/site-values.yaml +++ b/aws-msa-reference/lma/site-values.yaml @@ -283,80 +283,7 @@ charts: rules: - alert: "PrometheusDown" expr: absent(up{prometheus="lma/lma-prometheus"}) - - alert: node-cpu-high-load - annotations: - message: 클러스터({{ $labels.taco_cluster }})의 노드({{ $labels.instance }})의 idle process의 cpu 점유율이 3분 동안 0% 입니다. (현재 사용률 {{$value}}) - description: 워커 노드 CPU가 과부하 상태입니다. 일시적인 서비스 Traffic 증가, Workload의 SW 오류, Server HW Fan Fail등 다양한 원인으로 인해 발생할 수 있습니다. - Checkpoint: 일시적인 Service Traffic의 증가가 관측되지 않았다면, Alert발생 노드에서 실행 되는 pod중 CPU 자원을 많이 점유하는 pod의 설정을 점검해 보시길 제안드립니다. 예를 들어 pod spec의 limit 설정으로 과도한 CPU자원 점유을 막을 수 있습니다. - summary: Cpu resources of the node {{ $labels.instance }} are running low. - discriminative: $labels.taco_cluster, $labels.instance - expr: (avg by (taco_cluster, instance) (rate(node_cpu_seconds_total{mode="idle"}[60s]))) < 0 #0.1 # 진짜 0? - for: 3m - labels: - severity: warning - - alert: node-memory-high-utilization - annotations: - message: 클러스터({{ $labels.taco_cluster }})의 노드({{ $labels.instance }})의 Memory 사용량이 3분동안 80% 를 넘어서고 있습니다. (현재 사용률 {{$value}}) - descriptioon: 워커 노드의 Memory 사용량이 80%를 넘었습니다. 일시적인 서비스 증가 및 SW 오류등 다양한 원인으로 발생할 수 있습니다. - Checkpoint: 일시적인 Service Traffic의 증가가 관측되지 않았다면, Alert발생 노드에서 실행되는 pod중 Memory 사용량이 높은 pod들에 대한 점검을 제안드립니다. - summary: Memory resources of the node {{ $labels.instance }} are running low. 
- discriminative: $labels.taco_cluster, $labels.instance - expr: (node_memory_MemAvailable_bytes/node_memory_MemTotal_bytes) < 0.2 - for: 3m - labels: - severity: warning - - alert: node-disk-full - annotations: - message: 지난 6시간동안의 추세로 봤을 때, 클러스터({{ $labels.taco_cluster }})의 노드({{ $labels.instance }})의 root 볼륨은 24시간 안에 Disk full이 예상됨 - description: 현재 Disk 사용 추세기준 24시간 내에 Disk 용량이 꽉 찰 것으로 예상됩니다. - Checkpoint: Disk 용량 최적화(삭제 및 Backup)을 수행하시길 권고합니다. 삭제할 내역이 없으면 증설 계획을 수립해 주십시요. - summary: Memory resources of the node {{ $labels.instance }} are running low. - discriminative: $labels.taco_cluster, $labels.instance - expr: predict_linear(node_filesystem_free_bytes{mountpoint="/"}[6h], 24*3600) < 0 - for: 30m - labels: - severity: critical - - alert: pvc-full - annotations: - message: 지난 6시간동안의 추세로 봤을 때, 클러스터({{ $labels.taco_cluster }})의 파드({{ $labels.persistentvolumeclaim }})가 24시간 안에 Disk full이 예상됨 - description: 현재 Disk 사용 추세기준 24시간 내에 Disk 용량이 꽉 찰것으로 예상됩니다. ({{ $labels.taco_cluster }} 클러스터, {{ $labels.persistentvolumeclaim }} PVC) - Checkpoint: Disk 용량 최적화(삭제 및 Backup)을 수행하시길 권고합니다. 삭제할 내역이 없으면 증설 계획을 수립해 주십시요. - summary: Disk resources of the volume(pvc) {{ $labels.persistentvolumeclaim }} are running low. - discriminative: $labels.taco_cluster, $labels.persistentvolumeclaim - expr: predict_linear(kubelet_volume_stats_available_bytes[6h], 24*3600) < 0 # kubelet_volume_stats_capacity_bytes - for: 30m - labels: - severity: critical - - alert: pod-restart-frequently - annotations: - message: 클러스터({{ $labels.taco_cluster }})의 파드({{ $labels.pod }})가 30분 동안 5회 이상 재기동 ({{ $value }}회) - description: 특정 Pod가 빈번하게 재기동 되고 있습니다. 점검이 필요합니다. ({{ $labels.taco_cluster }} 클러스터, {{ $labels.pod }} 파드) - Checkpoint: pod spec. 에 대한 점검이 필요합니다. pod의 log 및 status를 확인해 주세요. - discriminative: $labels.taco_cluster, $labels.pod, $labels.namespace - expr: increase(kube_pod_container_status_restarts_total{namespace!="kube-system"}[60m:]) > 2 # 몇회로 할 것인지? - for: 30m - labels: - severity: critical - - alert: policy-audited - annotations: - Checkpoint: 정책위반이 발생하였습니다.({{ $labels.kind }} / {{ $labels.name }}) - description: 클러스터 ( {{ $labels.taco_cluster }})의 자원({{ $labels.violating_kind }} - {{ $labels.violating_namespace }} / {{ $labels.violating_nam }})에서 정책({{ $labels.kind }} / {{ $labels.name }})위반이 발생했습니다. 메시지 - {{ $labels.violation_msg }} - discriminative: $labels.kind,$labels.name,$labels.taco_cluster,$labels.violating_kind,$labels.violating_name,$labels.violating_namespace,$labels.violation_msg - message: 정책 위반({{ $labels.kind }} / {{ $labels.name }}) - expr: opa_scorecard_constraint_violations{namespace!='kube-system|taco-system|gatekeeper-system', violation_enforcement='warn'} == 1 - for: 1m - labels: - severity: critical - - alert: policy-blocked - annotations: - Checkpoint: "정책위반이 시도가 발생하였습니다.({{ $labels.kind }} / {{ $labels.name }})" - description: "클러스터 ( {{ $labels.taco_cluster }})의 자원({{ $labels.violating_kind }} - {{ $labels.violating_namespace }} / {{ $labels.violating_nam }})에서 정책({{ $labels.kind }} / {{ $labels.name }})위반 시도가 발생했습니다. 
메시지 - {{ $labels.violation_msg }}" - discriminative: $labels.kind,$labels.name,$labels.taco_cluster,$labels.violating_kind,$labels.violating_name,$labels.violating_namespace,$labels.violation_msg - message: 정책 위반({{ $labels.kind }} / {{ $labels.name }}) 시도 - expr: opa_scorecard_constraint_violations{namespace!='kube-system|taco-system|gatekeeper-system',violation_enforcement=''} == 1 - for: 1m - labels: - severity: critical + - name: thanos-config override: objectStorage: diff --git a/aws-reference/lma/site-values.yaml b/aws-reference/lma/site-values.yaml index 4ba03b3..976b438 100644 --- a/aws-reference/lma/site-values.yaml +++ b/aws-reference/lma/site-values.yaml @@ -283,80 +283,6 @@ charts: rules: - alert: "PrometheusDown" expr: absent(up{prometheus="lma/lma-prometheus"}) - - alert: node-cpu-high-load - annotations: - message: 클러스터({{ $labels.taco_cluster }})의 노드({{ $labels.instance }})의 idle process의 cpu 점유율이 3분 동안 0% 입니다. (현재 사용률 {{$value}}) - description: 워커 노드 CPU가 과부하 상태입니다. 일시적인 서비스 Traffic 증가, Workload의 SW 오류, Server HW Fan Fail등 다양한 원인으로 인해 발생할 수 있습니다. - Checkpoint: 일시적인 Service Traffic의 증가가 관측되지 않았다면, Alert발생 노드에서 실행 되는 pod중 CPU 자원을 많이 점유하는 pod의 설정을 점검해 보시길 제안드립니다. 예를 들어 pod spec의 limit 설정으로 과도한 CPU자원 점유을 막을 수 있습니다. - summary: Cpu resources of the node {{ $labels.instance }} are running low. - discriminative: $labels.taco_cluster, $labels.instance - expr: (avg by (taco_cluster, instance) (rate(node_cpu_seconds_total{mode="idle"}[60s]))) < 0 #0.1 # 진짜 0? - for: 3m - labels: - severity: warning - - alert: node-memory-high-utilization - annotations: - message: 클러스터({{ $labels.taco_cluster }})의 노드({{ $labels.instance }})의 Memory 사용량이 3분동안 80% 를 넘어서고 있습니다. (현재 사용률 {{$value}}) - descriptioon: 워커 노드의 Memory 사용량이 80%를 넘었습니다. 일시적인 서비스 증가 및 SW 오류등 다양한 원인으로 발생할 수 있습니다. - Checkpoint: 일시적인 Service Traffic의 증가가 관측되지 않았다면, Alert발생 노드에서 실행되는 pod중 Memory 사용량이 높은 pod들에 대한 점검을 제안드립니다. - summary: Memory resources of the node {{ $labels.instance }} are running low. - discriminative: $labels.taco_cluster, $labels.instance - expr: (node_memory_MemAvailable_bytes/node_memory_MemTotal_bytes) < 0.2 - for: 3m - labels: - severity: warning - - alert: node-disk-full - annotations: - message: 지난 6시간동안의 추세로 봤을 때, 클러스터({{ $labels.taco_cluster }})의 노드({{ $labels.instance }})의 root 볼륨은 24시간 안에 Disk full이 예상됨 - description: 현재 Disk 사용 추세기준 24시간 내에 Disk 용량이 꽉 찰 것으로 예상됩니다. - Checkpoint: Disk 용량 최적화(삭제 및 Backup)을 수행하시길 권고합니다. 삭제할 내역이 없으면 증설 계획을 수립해 주십시요. - summary: Memory resources of the node {{ $labels.instance }} are running low. - discriminative: $labels.taco_cluster, $labels.instance - expr: predict_linear(node_filesystem_free_bytes{mountpoint="/"}[6h], 24*3600) < 0 - for: 30m - labels: - severity: critical - - alert: pvc-full - annotations: - message: 지난 6시간동안의 추세로 봤을 때, 클러스터({{ $labels.taco_cluster }})의 파드({{ $labels.persistentvolumeclaim }})가 24시간 안에 Disk full이 예상됨 - description: 현재 Disk 사용 추세기준 24시간 내에 Disk 용량이 꽉 찰것으로 예상됩니다. ({{ $labels.taco_cluster }} 클러스터, {{ $labels.persistentvolumeclaim }} PVC) - Checkpoint: Disk 용량 최적화(삭제 및 Backup)을 수행하시길 권고합니다. 삭제할 내역이 없으면 증설 계획을 수립해 주십시요. - summary: Disk resources of the volume(pvc) {{ $labels.persistentvolumeclaim }} are running low. 
- discriminative: $labels.taco_cluster, $labels.persistentvolumeclaim - expr: predict_linear(kubelet_volume_stats_available_bytes[6h], 24*3600) < 0 # kubelet_volume_stats_capacity_bytes - for: 30m - labels: - severity: critical - - alert: pod-restart-frequently - annotations: - message: 클러스터({{ $labels.taco_cluster }})의 파드({{ $labels.pod }})가 30분 동안 5회 이상 재기동 ({{ $value }}회) - description: 특정 Pod가 빈번하게 재기동 되고 있습니다. 점검이 필요합니다. ({{ $labels.taco_cluster }} 클러스터, {{ $labels.pod }} 파드) - Checkpoint: pod spec. 에 대한 점검이 필요합니다. pod의 log 및 status를 확인해 주세요. - discriminative: $labels.taco_cluster, $labels.pod, $labels.namespace - expr: increase(kube_pod_container_status_restarts_total{namespace!="kube-system"}[60m:]) > 2 # 몇회로 할 것인지? - for: 30m - labels: - severity: critical - - alert: policy-audited - annotations: - Checkpoint: 정책위반이 발생하였습니다.({{ $labels.kind }} / {{ $labels.name }}) - description: 클러스터 ( {{ $labels.taco_cluster }})의 자원({{ $labels.violating_kind }} - {{ $labels.violating_namespace }} / {{ $labels.violating_nam }})에서 정책({{ $labels.kind }} / {{ $labels.name }})위반이 발생했습니다. 메시지 - {{ $labels.violation_msg }} - discriminative: $labels.kind,$labels.name,$labels.taco_cluster,$labels.violating_kind,$labels.violating_name,$labels.violating_namespace,$labels.violation_msg - message: 정책 위반({{ $labels.kind }} / {{ $labels.name }}) - expr: opa_scorecard_constraint_violations{namespace!='kube-system|taco-system|gatekeeper-system', violation_enforcement='warn'} == 1 - for: 1m - labels: - severity: critical - - alert: policy-blocked - annotations: - Checkpoint: "정책위반이 시도가 발생하였습니다.({{ $labels.kind }} / {{ $labels.name }})" - description: "클러스터 ( {{ $labels.taco_cluster }})의 자원({{ $labels.violating_kind }} - {{ $labels.violating_namespace }} / {{ $labels.violating_nam }})에서 정책({{ $labels.kind }} / {{ $labels.name }})위반 시도가 발생했습니다. 메시지 - {{ $labels.violation_msg }}" - discriminative: $labels.kind,$labels.name,$labels.taco_cluster,$labels.violating_kind,$labels.violating_name,$labels.violating_namespace,$labels.violation_msg - message: 정책 위반({{ $labels.kind }} / {{ $labels.name }}) 시도 - expr: opa_scorecard_constraint_violations{namespace!='kube-system|taco-system|gatekeeper-system',violation_enforcement=''} == 1 - for: 1m - labels: - severity: critical - name: thanos-config override: diff --git a/byoh-reference/lma/site-values.yaml b/byoh-reference/lma/site-values.yaml index 3974685..580462a 100644 --- a/byoh-reference/lma/site-values.yaml +++ b/byoh-reference/lma/site-values.yaml @@ -291,80 +291,6 @@ charts: rules: - alert: "PrometheusDown" expr: absent(up{prometheus="lma/lma-prometheus"}) - - alert: node-cpu-high-load - annotations: - message: 클러스터({{ $labels.taco_cluster }})의 노드({{ $labels.instance }})의 idle process의 cpu 점유율이 3분 동안 0% 입니다. (현재 사용률 {{$value}}) - description: 워커 노드 CPU가 과부하 상태입니다. 일시적인 서비스 Traffic 증가, Workload의 SW 오류, Server HW Fan Fail등 다양한 원인으로 인해 발생할 수 있습니다. - Checkpoint: 일시적인 Service Traffic의 증가가 관측되지 않았다면, Alert발생 노드에서 실행 되는 pod중 CPU 자원을 많이 점유하는 pod의 설정을 점검해 보시길 제안드립니다. 예를 들어 pod spec의 limit 설정으로 과도한 CPU자원 점유을 막을 수 있습니다. - summary: Cpu resources of the node {{ $labels.instance }} are running low. - discriminative: $labels.taco_cluster, $labels.instance - expr: (avg by (taco_cluster, instance) (rate(node_cpu_seconds_total{mode="idle"}[60s]))) < 0 #0.1 # 진짜 0? - for: 3m - labels: - severity: warning - - alert: node-memory-high-utilization - annotations: - message: 클러스터({{ $labels.taco_cluster }})의 노드({{ $labels.instance }})의 Memory 사용량이 3분동안 80% 를 넘어서고 있습니다. 
(현재 사용률 {{$value}}) - descriptioon: 워커 노드의 Memory 사용량이 80%를 넘었습니다. 일시적인 서비스 증가 및 SW 오류등 다양한 원인으로 발생할 수 있습니다. - Checkpoint: 일시적인 Service Traffic의 증가가 관측되지 않았다면, Alert발생 노드에서 실행되는 pod중 Memory 사용량이 높은 pod들에 대한 점검을 제안드립니다. - summary: Memory resources of the node {{ $labels.instance }} are running low. - discriminative: $labels.taco_cluster, $labels.instance - expr: (node_memory_MemAvailable_bytes/node_memory_MemTotal_bytes) < 0.2 - for: 3m - labels: - severity: warning - - alert: node-disk-full - annotations: - message: 지난 6시간동안의 추세로 봤을 때, 클러스터({{ $labels.taco_cluster }})의 노드({{ $labels.instance }})의 root 볼륨은 24시간 안에 Disk full이 예상됨 - description: 현재 Disk 사용 추세기준 24시간 내에 Disk 용량이 꽉 찰 것으로 예상됩니다. - Checkpoint: Disk 용량 최적화(삭제 및 Backup)을 수행하시길 권고합니다. 삭제할 내역이 없으면 증설 계획을 수립해 주십시요. - summary: Memory resources of the node {{ $labels.instance }} are running low. - discriminative: $labels.taco_cluster, $labels.instance - expr: predict_linear(node_filesystem_free_bytes{mountpoint="/"}[6h], 24*3600) < 0 - for: 30m - labels: - severity: critical - - alert: pvc-full - annotations: - message: 지난 6시간동안의 추세로 봤을 때, 클러스터({{ $labels.taco_cluster }})의 파드({{ $labels.persistentvolumeclaim }})가 24시간 안에 Disk full이 예상됨 - description: 현재 Disk 사용 추세기준 24시간 내에 Disk 용량이 꽉 찰것으로 예상됩니다. ({{ $labels.taco_cluster }} 클러스터, {{ $labels.persistentvolumeclaim }} PVC) - Checkpoint: Disk 용량 최적화(삭제 및 Backup)을 수행하시길 권고합니다. 삭제할 내역이 없으면 증설 계획을 수립해 주십시요. - summary: Disk resources of the volume(pvc) {{ $labels.persistentvolumeclaim }} are running low. - discriminative: $labels.taco_cluster, $labels.persistentvolumeclaim - expr: predict_linear(kubelet_volume_stats_available_bytes[6h], 24*3600) < 0 # kubelet_volume_stats_capacity_bytes - for: 30m - labels: - severity: critical - - alert: pod-restart-frequently - annotations: - message: 클러스터({{ $labels.taco_cluster }})의 파드({{ $labels.pod }})가 30분 동안 5회 이상 재기동 ({{ $value }}회) - description: 특정 Pod가 빈번하게 재기동 되고 있습니다. 점검이 필요합니다. ({{ $labels.taco_cluster }} 클러스터, {{ $labels.pod }} 파드) - Checkpoint: pod spec. 에 대한 점검이 필요합니다. pod의 log 및 status를 확인해 주세요. - discriminative: $labels.taco_cluster, $labels.pod, $labels.namespace - expr: increase(kube_pod_container_status_restarts_total{namespace!="kube-system"}[60m:]) > 2 # 몇회로 할 것인지? - for: 30m - labels: - severity: critical - - alert: policy-audited - annotations: - Checkpoint: 정책위반이 발생하였습니다.({{ $labels.kind }} / {{ $labels.name }}) - description: 클러스터 ( {{ $labels.taco_cluster }})의 자원({{ $labels.violating_kind }} - {{ $labels.violating_namespace }} / {{ $labels.violating_nam }})에서 정책({{ $labels.kind }} / {{ $labels.name }})위반이 발생했습니다. 메시지 - {{ $labels.violation_msg }} - discriminative: $labels.kind,$labels.name,$labels.taco_cluster,$labels.violating_kind,$labels.violating_name,$labels.violating_namespace,$labels.violation_msg - message: 정책 위반({{ $labels.kind }} / {{ $labels.name }}) - expr: opa_scorecard_constraint_violations{namespace!='kube-system|taco-system|gatekeeper-system', violation_enforcement='warn'} == 1 - for: 1m - labels: - severity: critical - - alert: policy-blocked - annotations: - Checkpoint: "정책위반이 시도가 발생하였습니다.({{ $labels.kind }} / {{ $labels.name }})" - description: "클러스터 ( {{ $labels.taco_cluster }})의 자원({{ $labels.violating_kind }} - {{ $labels.violating_namespace }} / {{ $labels.violating_nam }})에서 정책({{ $labels.kind }} / {{ $labels.name }})위반 시도가 발생했습니다. 
메시지 - {{ $labels.violation_msg }}" - discriminative: $labels.kind,$labels.name,$labels.taco_cluster,$labels.violating_kind,$labels.violating_name,$labels.violating_namespace,$labels.violation_msg - message: 정책 위반({{ $labels.kind }} / {{ $labels.name }}) 시도 - expr: opa_scorecard_constraint_violations{namespace!='kube-system|taco-system|gatekeeper-system',violation_enforcement=''} == 1 - for: 1m - labels: - severity: critical - name: thanos-config override: diff --git a/eks-msa-reference/lma/site-values.yaml b/eks-msa-reference/lma/site-values.yaml index baff9e8..b5c2b8b 100644 --- a/eks-msa-reference/lma/site-values.yaml +++ b/eks-msa-reference/lma/site-values.yaml @@ -284,80 +284,6 @@ charts: rules: - alert: "PrometheusDown" expr: absent(up{prometheus="lma/lma-prometheus"}) - - alert: node-cpu-high-load - annotations: - message: 클러스터({{ $labels.taco_cluster }})의 노드({{ $labels.instance }})의 idle process의 cpu 점유율이 3분 동안 0% 입니다. (현재 사용률 {{$value}}) - description: 워커 노드 CPU가 과부하 상태입니다. 일시적인 서비스 Traffic 증가, Workload의 SW 오류, Server HW Fan Fail등 다양한 원인으로 인해 발생할 수 있습니다. - Checkpoint: 일시적인 Service Traffic의 증가가 관측되지 않았다면, Alert발생 노드에서 실행 되는 pod중 CPU 자원을 많이 점유하는 pod의 설정을 점검해 보시길 제안드립니다. 예를 들어 pod spec의 limit 설정으로 과도한 CPU자원 점유을 막을 수 있습니다. - summary: Cpu resources of the node {{ $labels.instance }} are running low. - discriminative: $labels.taco_cluster, $labels.instance - expr: (avg by (taco_cluster, instance) (rate(node_cpu_seconds_total{mode="idle"}[60s]))) < 0 #0.1 # 진짜 0? - for: 3m - labels: - severity: warning - - alert: node-memory-high-utilization - annotations: - message: 클러스터({{ $labels.taco_cluster }})의 노드({{ $labels.instance }})의 Memory 사용량이 3분동안 80% 를 넘어서고 있습니다. (현재 사용률 {{$value}}) - descriptioon: 워커 노드의 Memory 사용량이 80%를 넘었습니다. 일시적인 서비스 증가 및 SW 오류등 다양한 원인으로 발생할 수 있습니다. - Checkpoint: 일시적인 Service Traffic의 증가가 관측되지 않았다면, Alert발생 노드에서 실행되는 pod중 Memory 사용량이 높은 pod들에 대한 점검을 제안드립니다. - summary: Memory resources of the node {{ $labels.instance }} are running low. - discriminative: $labels.taco_cluster, $labels.instance - expr: (node_memory_MemAvailable_bytes/node_memory_MemTotal_bytes) < 0.2 - for: 3m - labels: - severity: warning - - alert: node-disk-full - annotations: - message: 지난 6시간동안의 추세로 봤을 때, 클러스터({{ $labels.taco_cluster }})의 노드({{ $labels.instance }})의 root 볼륨은 24시간 안에 Disk full이 예상됨 - description: 현재 Disk 사용 추세기준 24시간 내에 Disk 용량이 꽉 찰 것으로 예상됩니다. - Checkpoint: Disk 용량 최적화(삭제 및 Backup)을 수행하시길 권고합니다. 삭제할 내역이 없으면 증설 계획을 수립해 주십시요. - summary: Memory resources of the node {{ $labels.instance }} are running low. - discriminative: $labels.taco_cluster, $labels.instance - expr: predict_linear(node_filesystem_free_bytes{mountpoint="/"}[6h], 24*3600) < 0 - for: 30m - labels: - severity: critical - - alert: pvc-full - annotations: - message: 지난 6시간동안의 추세로 봤을 때, 클러스터({{ $labels.taco_cluster }})의 파드({{ $labels.persistentvolumeclaim }})가 24시간 안에 Disk full이 예상됨 - description: 현재 Disk 사용 추세기준 24시간 내에 Disk 용량이 꽉 찰것으로 예상됩니다. ({{ $labels.taco_cluster }} 클러스터, {{ $labels.persistentvolumeclaim }} PVC) - Checkpoint: Disk 용량 최적화(삭제 및 Backup)을 수행하시길 권고합니다. 삭제할 내역이 없으면 증설 계획을 수립해 주십시요. - summary: Disk resources of the volume(pvc) {{ $labels.persistentvolumeclaim }} are running low. 
- discriminative: $labels.taco_cluster, $labels.persistentvolumeclaim - expr: predict_linear(kubelet_volume_stats_available_bytes[6h], 24*3600) < 0 # kubelet_volume_stats_capacity_bytes - for: 30m - labels: - severity: critical - - alert: pod-restart-frequently - annotations: - message: 클러스터({{ $labels.taco_cluster }})의 파드({{ $labels.pod }})가 30분 동안 5회 이상 재기동 ({{ $value }}회) - description: 특정 Pod가 빈번하게 재기동 되고 있습니다. 점검이 필요합니다. ({{ $labels.taco_cluster }} 클러스터, {{ $labels.pod }} 파드) - Checkpoint: pod spec. 에 대한 점검이 필요합니다. pod의 log 및 status를 확인해 주세요. - discriminative: $labels.taco_cluster, $labels.pod, $labels.namespace - expr: increase(kube_pod_container_status_restarts_total{namespace!="kube-system"}[60m:]) > 2 # 몇회로 할 것인지? - for: 30m - labels: - severity: critical - - alert: policy-audited - annotations: - Checkpoint: 정책위반이 발생하였습니다.({{ $labels.kind }} / {{ $labels.name }}) - description: 클러스터 ( {{ $labels.taco_cluster }})의 자원({{ $labels.violating_kind }} - {{ $labels.violating_namespace }} / {{ $labels.violating_nam }})에서 정책({{ $labels.kind }} / {{ $labels.name }})위반이 발생했습니다. 메시지 - {{ $labels.violation_msg }} - discriminative: $labels.kind,$labels.name,$labels.taco_cluster,$labels.violating_kind,$labels.violating_name,$labels.violating_namespace,$labels.violation_msg - message: 정책 위반({{ $labels.kind }} / {{ $labels.name }}) - expr: opa_scorecard_constraint_violations{namespace!='kube-system|taco-system|gatekeeper-system', violation_enforcement='warn'} == 1 - for: 1m - labels: - severity: critical - - alert: policy-blocked - annotations: - Checkpoint: "정책위반이 시도가 발생하였습니다.({{ $labels.kind }} / {{ $labels.name }})" - description: "클러스터 ( {{ $labels.taco_cluster }})의 자원({{ $labels.violating_kind }} - {{ $labels.violating_namespace }} / {{ $labels.violating_nam }})에서 정책({{ $labels.kind }} / {{ $labels.name }})위반 시도가 발생했습니다. 메시지 - {{ $labels.violation_msg }}" - discriminative: $labels.kind,$labels.name,$labels.taco_cluster,$labels.violating_kind,$labels.violating_name,$labels.violating_namespace,$labels.violation_msg - message: 정책 위반({{ $labels.kind }} / {{ $labels.name }}) 시도 - expr: opa_scorecard_constraint_violations{namespace!='kube-system|taco-system|gatekeeper-system',violation_enforcement=''} == 1 - for: 1m - labels: - severity: critical - name: thanos-config override: diff --git a/eks-reference/lma/site-values.yaml b/eks-reference/lma/site-values.yaml index baff9e8..b5c2b8b 100644 --- a/eks-reference/lma/site-values.yaml +++ b/eks-reference/lma/site-values.yaml @@ -284,80 +284,6 @@ charts: rules: - alert: "PrometheusDown" expr: absent(up{prometheus="lma/lma-prometheus"}) - - alert: node-cpu-high-load - annotations: - message: 클러스터({{ $labels.taco_cluster }})의 노드({{ $labels.instance }})의 idle process의 cpu 점유율이 3분 동안 0% 입니다. (현재 사용률 {{$value}}) - description: 워커 노드 CPU가 과부하 상태입니다. 일시적인 서비스 Traffic 증가, Workload의 SW 오류, Server HW Fan Fail등 다양한 원인으로 인해 발생할 수 있습니다. - Checkpoint: 일시적인 Service Traffic의 증가가 관측되지 않았다면, Alert발생 노드에서 실행 되는 pod중 CPU 자원을 많이 점유하는 pod의 설정을 점검해 보시길 제안드립니다. 예를 들어 pod spec의 limit 설정으로 과도한 CPU자원 점유을 막을 수 있습니다. - summary: Cpu resources of the node {{ $labels.instance }} are running low. - discriminative: $labels.taco_cluster, $labels.instance - expr: (avg by (taco_cluster, instance) (rate(node_cpu_seconds_total{mode="idle"}[60s]))) < 0 #0.1 # 진짜 0? - for: 3m - labels: - severity: warning - - alert: node-memory-high-utilization - annotations: - message: 클러스터({{ $labels.taco_cluster }})의 노드({{ $labels.instance }})의 Memory 사용량이 3분동안 80% 를 넘어서고 있습니다. 
(현재 사용률 {{$value}}) - descriptioon: 워커 노드의 Memory 사용량이 80%를 넘었습니다. 일시적인 서비스 증가 및 SW 오류등 다양한 원인으로 발생할 수 있습니다. - Checkpoint: 일시적인 Service Traffic의 증가가 관측되지 않았다면, Alert발생 노드에서 실행되는 pod중 Memory 사용량이 높은 pod들에 대한 점검을 제안드립니다. - summary: Memory resources of the node {{ $labels.instance }} are running low. - discriminative: $labels.taco_cluster, $labels.instance - expr: (node_memory_MemAvailable_bytes/node_memory_MemTotal_bytes) < 0.2 - for: 3m - labels: - severity: warning - - alert: node-disk-full - annotations: - message: 지난 6시간동안의 추세로 봤을 때, 클러스터({{ $labels.taco_cluster }})의 노드({{ $labels.instance }})의 root 볼륨은 24시간 안에 Disk full이 예상됨 - description: 현재 Disk 사용 추세기준 24시간 내에 Disk 용량이 꽉 찰 것으로 예상됩니다. - Checkpoint: Disk 용량 최적화(삭제 및 Backup)을 수행하시길 권고합니다. 삭제할 내역이 없으면 증설 계획을 수립해 주십시요. - summary: Memory resources of the node {{ $labels.instance }} are running low. - discriminative: $labels.taco_cluster, $labels.instance - expr: predict_linear(node_filesystem_free_bytes{mountpoint="/"}[6h], 24*3600) < 0 - for: 30m - labels: - severity: critical - - alert: pvc-full - annotations: - message: 지난 6시간동안의 추세로 봤을 때, 클러스터({{ $labels.taco_cluster }})의 파드({{ $labels.persistentvolumeclaim }})가 24시간 안에 Disk full이 예상됨 - description: 현재 Disk 사용 추세기준 24시간 내에 Disk 용량이 꽉 찰것으로 예상됩니다. ({{ $labels.taco_cluster }} 클러스터, {{ $labels.persistentvolumeclaim }} PVC) - Checkpoint: Disk 용량 최적화(삭제 및 Backup)을 수행하시길 권고합니다. 삭제할 내역이 없으면 증설 계획을 수립해 주십시요. - summary: Disk resources of the volume(pvc) {{ $labels.persistentvolumeclaim }} are running low. - discriminative: $labels.taco_cluster, $labels.persistentvolumeclaim - expr: predict_linear(kubelet_volume_stats_available_bytes[6h], 24*3600) < 0 # kubelet_volume_stats_capacity_bytes - for: 30m - labels: - severity: critical - - alert: pod-restart-frequently - annotations: - message: 클러스터({{ $labels.taco_cluster }})의 파드({{ $labels.pod }})가 30분 동안 5회 이상 재기동 ({{ $value }}회) - description: 특정 Pod가 빈번하게 재기동 되고 있습니다. 점검이 필요합니다. ({{ $labels.taco_cluster }} 클러스터, {{ $labels.pod }} 파드) - Checkpoint: pod spec. 에 대한 점검이 필요합니다. pod의 log 및 status를 확인해 주세요. - discriminative: $labels.taco_cluster, $labels.pod, $labels.namespace - expr: increase(kube_pod_container_status_restarts_total{namespace!="kube-system"}[60m:]) > 2 # 몇회로 할 것인지? - for: 30m - labels: - severity: critical - - alert: policy-audited - annotations: - Checkpoint: 정책위반이 발생하였습니다.({{ $labels.kind }} / {{ $labels.name }}) - description: 클러스터 ( {{ $labels.taco_cluster }})의 자원({{ $labels.violating_kind }} - {{ $labels.violating_namespace }} / {{ $labels.violating_nam }})에서 정책({{ $labels.kind }} / {{ $labels.name }})위반이 발생했습니다. 메시지 - {{ $labels.violation_msg }} - discriminative: $labels.kind,$labels.name,$labels.taco_cluster,$labels.violating_kind,$labels.violating_name,$labels.violating_namespace,$labels.violation_msg - message: 정책 위반({{ $labels.kind }} / {{ $labels.name }}) - expr: opa_scorecard_constraint_violations{namespace!='kube-system|taco-system|gatekeeper-system', violation_enforcement='warn'} == 1 - for: 1m - labels: - severity: critical - - alert: policy-blocked - annotations: - Checkpoint: "정책위반이 시도가 발생하였습니다.({{ $labels.kind }} / {{ $labels.name }})" - description: "클러스터 ( {{ $labels.taco_cluster }})의 자원({{ $labels.violating_kind }} - {{ $labels.violating_namespace }} / {{ $labels.violating_nam }})에서 정책({{ $labels.kind }} / {{ $labels.name }})위반 시도가 발생했습니다. 
메시지 - {{ $labels.violation_msg }}" - discriminative: $labels.kind,$labels.name,$labels.taco_cluster,$labels.violating_kind,$labels.violating_name,$labels.violating_namespace,$labels.violation_msg - message: 정책 위반({{ $labels.kind }} / {{ $labels.name }}) 시도 - expr: opa_scorecard_constraint_violations{namespace!='kube-system|taco-system|gatekeeper-system',violation_enforcement=''} == 1 - for: 1m - labels: - severity: critical - name: thanos-config override: From b7816bd53d4a1258203d72dd9c5c3e2faf78539c Mon Sep 17 00:00:00 2001 From: "taekyu.kang" Date: Thu, 25 Apr 2024 18:07:38 +0900 Subject: [PATCH 05/12] feature. change service type LoadBalancer for thanos-ruler --- aws-msa-reference/lma/site-values.yaml | 2 ++ aws-reference/lma/site-values.yaml | 2 ++ byoh-reference/lma/site-values.yaml | 2 ++ eks-msa-reference/lma/site-values.yaml | 2 ++ eks-reference/lma/site-values.yaml | 2 ++ 5 files changed, 10 insertions(+) diff --git a/aws-msa-reference/lma/site-values.yaml b/aws-msa-reference/lma/site-values.yaml index 55597ca..466de49 100644 --- a/aws-msa-reference/lma/site-values.yaml +++ b/aws-msa-reference/lma/site-values.yaml @@ -274,6 +274,8 @@ charts: # - --deduplication.replica-label="replica" storegateway.persistence.size: 8Gi ruler.nodeSelector: $(nodeSelector) + ruler.service.type: LoadBalancer + ruler.service.annotations: $(awsNlbAnnotation) ruler.alertmanagers: - http://alertmanager-operated:9093 ruler.persistence.size: 8Gi diff --git a/aws-reference/lma/site-values.yaml b/aws-reference/lma/site-values.yaml index 976b438..c998099 100644 --- a/aws-reference/lma/site-values.yaml +++ b/aws-reference/lma/site-values.yaml @@ -274,6 +274,8 @@ charts: # - --deduplication.replica-label="replica" storegateway.persistence.size: 8Gi ruler.nodeSelector: $(nodeSelector) + ruler.service.type: LoadBalancer + ruler.service.annotations: $(awsNlbAnnotation) ruler.alertmanagers: - http://alertmanager-operated:9093 ruler.persistence.size: 8Gi diff --git a/byoh-reference/lma/site-values.yaml b/byoh-reference/lma/site-values.yaml index 580462a..18765b8 100644 --- a/byoh-reference/lma/site-values.yaml +++ b/byoh-reference/lma/site-values.yaml @@ -282,6 +282,8 @@ charts: # - --deduplication.replica-label="replica" storegateway.persistence.size: 8Gi ruler.nodeSelector: $(nodeSelector) + ruler.service.type: LoadBalancer + ruler.service.annotations: $(awsNlbAnnotation) ruler.alertmanagers: - http://alertmanager-operated:9093 ruler.persistence.size: 8Gi diff --git a/eks-msa-reference/lma/site-values.yaml b/eks-msa-reference/lma/site-values.yaml index b5c2b8b..329df9e 100644 --- a/eks-msa-reference/lma/site-values.yaml +++ b/eks-msa-reference/lma/site-values.yaml @@ -275,6 +275,8 @@ charts: # - --deduplication.replica-label="replica" storegateway.persistence.size: 8Gi ruler.nodeSelector: $(nodeSelector) + ruler.service.type: LoadBalancer + ruler.service.annotations: $(awsNlbAnnotation) ruler.alertmanagers: - http://alertmanager-operated:9093 ruler.persistence.size: 8Gi diff --git a/eks-reference/lma/site-values.yaml b/eks-reference/lma/site-values.yaml index b5c2b8b..329df9e 100644 --- a/eks-reference/lma/site-values.yaml +++ b/eks-reference/lma/site-values.yaml @@ -275,6 +275,8 @@ charts: # - --deduplication.replica-label="replica" storegateway.persistence.size: 8Gi ruler.nodeSelector: $(nodeSelector) + ruler.service.type: LoadBalancer + ruler.service.annotations: $(awsNlbAnnotation) ruler.alertmanagers: - http://alertmanager-operated:9093 ruler.persistence.size: 8Gi From f321e70778c9dc53afd8e4c85fd7a9d68acc89df 
Mon Sep 17 00:00:00 2001 From: "taekyu.kang" Date: Fri, 3 May 2024 10:23:03 +0900 Subject: [PATCH 06/12] feature. add policy to byoh-reference --- byoh-reference/policy/kustomization.yaml | 5 +++++ byoh-reference/policy/site-values.yaml | 26 ++++++++++++++++++++++++ 2 files changed, 31 insertions(+) create mode 100644 byoh-reference/policy/kustomization.yaml create mode 100644 byoh-reference/policy/site-values.yaml diff --git a/byoh-reference/policy/kustomization.yaml b/byoh-reference/policy/kustomization.yaml new file mode 100644 index 0000000..7c415e6 --- /dev/null +++ b/byoh-reference/policy/kustomization.yaml @@ -0,0 +1,5 @@ +resources: + - ../base + +transformers: + - site-values.yaml diff --git a/byoh-reference/policy/site-values.yaml b/byoh-reference/policy/site-values.yaml new file mode 100644 index 0000000..80aa10e --- /dev/null +++ b/byoh-reference/policy/site-values.yaml @@ -0,0 +1,26 @@ +apiVersion: openinfradev.github.com/v1 +kind: HelmValuesTransformer +metadata: + name: site + +global: + nodeSelector: + taco-lma: enabled + clusterName: cluster.local + storageClassName: taco-storage + repository: https://openinfradev.github.io/helm-repo/ + +charts: +- name: opa-gatekeeper + override: + postUpgrade.nodeSelector: $(nodeSelector) + postInstall.nodeSelector: $(nodeSelector) + preUninstall.nodeSelector: $(nodeSelector) + controllerManager.nodeSelector: $(nodeSelector) + audit.nodeSelector: $(nodeSelector) + crds.nodeSelector: $(nodeSelector) + + enableDeleteOperations: true + +- name: policy-resources + override: {} From 7bd3a5f753d73be2a805fab7c34bd4ac49df7776 Mon Sep 17 00:00:00 2001 From: sungil Date: Mon, 20 May 2024 01:58:46 +0000 Subject: [PATCH 07/12] fluentbit: add collecting targets for policy-serving --- aws-msa-reference/lma/site-values.yaml | 2 +- aws-reference/lma/site-values.yaml | 2 +- byoh-reference/lma/site-values.yaml | 2 +- byoh-ssu-reference/lma/site-values.yaml | 2 +- byoh-stage-reference/lma/site-values.yaml | 2 +- eks-msa-reference/lma/site-values.yaml | 2 +- eks-reference/lma/site-values.yaml | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) diff --git a/aws-msa-reference/lma/site-values.yaml b/aws-msa-reference/lma/site-values.yaml index 466de49..25cf6eb 100644 --- a/aws-msa-reference/lma/site-values.yaml +++ b/aws-msa-reference/lma/site-values.yaml @@ -160,7 +160,7 @@ charts: - index: platform loki_name: taco-loki key: $kubernetes['namespace_name'] - value: kube-system|$(lmaNameSpace)|taco-system|argo + value: kube-system|$(lmaNameSpace)|taco-system|gatekeeper-system|argo parser: docker path: /var/log/containers/*.log type: kubernates diff --git a/aws-reference/lma/site-values.yaml b/aws-reference/lma/site-values.yaml index c998099..3d8d49a 100644 --- a/aws-reference/lma/site-values.yaml +++ b/aws-reference/lma/site-values.yaml @@ -160,7 +160,7 @@ charts: - index: platform loki_name: taco-loki key: $kubernetes['namespace_name'] - value: kube-system|$(lmaNameSpace)|taco-system|argo + value: kube-system|$(lmaNameSpace)|taco-system|gatekeeper-system|argo parser: docker path: /var/log/containers/*.log type: kubernates diff --git a/byoh-reference/lma/site-values.yaml b/byoh-reference/lma/site-values.yaml index 18765b8..c8aaf93 100644 --- a/byoh-reference/lma/site-values.yaml +++ b/byoh-reference/lma/site-values.yaml @@ -163,7 +163,7 @@ charts: - index: platform loki_name: taco-loki key: $kubernetes['namespace_name'] - value: kube-system|$(lmaNameSpace)|taco-system|argo + value: kube-system|$(lmaNameSpace)|taco-system|gatekeeper-system|argo parser: docker 
path: /var/log/containers/*.log type: kubernates diff --git a/byoh-ssu-reference/lma/site-values.yaml b/byoh-ssu-reference/lma/site-values.yaml index 14ba606..3919589 100644 --- a/byoh-ssu-reference/lma/site-values.yaml +++ b/byoh-ssu-reference/lma/site-values.yaml @@ -180,7 +180,7 @@ charts: - index: platform loki_name: taco-loki key: $kubernetes['namespace_name'] - value: kube-system|$(lmaNameSpace)|taco-system|argo + value: kube-system|$(lmaNameSpace)|taco-system|gatekeeper-system|argo parser: docker path: /var/log/containers/*.log type: kubernates diff --git a/byoh-stage-reference/lma/site-values.yaml b/byoh-stage-reference/lma/site-values.yaml index 0d82936..7851442 100644 --- a/byoh-stage-reference/lma/site-values.yaml +++ b/byoh-stage-reference/lma/site-values.yaml @@ -179,7 +179,7 @@ charts: - index: platform loki_name: taco-loki key: $kubernetes['namespace_name'] - value: kube-system|$(lmaNameSpace)|taco-system|argo + value: kube-system|$(lmaNameSpace)|taco-system|gatekeeper-system|argo parser: docker path: /var/log/containers/*.log type: kubernates diff --git a/eks-msa-reference/lma/site-values.yaml b/eks-msa-reference/lma/site-values.yaml index 329df9e..75946f3 100644 --- a/eks-msa-reference/lma/site-values.yaml +++ b/eks-msa-reference/lma/site-values.yaml @@ -161,7 +161,7 @@ charts: - index: platform loki_name: taco-loki key: $kubernetes['namespace_name'] - value: kube-system|$(lmaNameSpace)|taco-system|argo + value: kube-system|$(lmaNameSpace)|taco-system|gatekeeper-system|argo parser: docker path: /var/log/containers/*.log type: kubernates diff --git a/eks-reference/lma/site-values.yaml b/eks-reference/lma/site-values.yaml index 329df9e..75946f3 100644 --- a/eks-reference/lma/site-values.yaml +++ b/eks-reference/lma/site-values.yaml @@ -161,7 +161,7 @@ charts: - index: platform loki_name: taco-loki key: $kubernetes['namespace_name'] - value: kube-system|$(lmaNameSpace)|taco-system|argo + value: kube-system|$(lmaNameSpace)|taco-system|gatekeeper-system|argo parser: docker path: /var/log/containers/*.log type: kubernates From 02731237c6ca21b115938a7b1b0205f39aa880c0 Mon Sep 17 00:00:00 2001 From: sungil Date: Mon, 24 Jun 2024 06:02:14 +0000 Subject: [PATCH 08/12] user-logging: add loki for non-platform-logs as loki-user --- aws-msa-reference/lma/site-values.yaml | 45 ++++++++++++++++++++++--- aws-reference/lma/site-values.yaml | 42 +++++++++++++++++++++-- byoh-reference/lma/site-values.yaml | 46 +++++++++++++++++++++++--- eks-msa-reference/lma/site-values.yaml | 45 ++++++++++++++++++++++--- eks-reference/lma/site-values.yaml | 45 ++++++++++++++++++++++--- 5 files changed, 204 insertions(+), 19 deletions(-) diff --git a/aws-msa-reference/lma/site-values.yaml b/aws-msa-reference/lma/site-values.yaml index 25cf6eb..1f4d649 100644 --- a/aws-msa-reference/lma/site-values.yaml +++ b/aws-msa-reference/lma/site-values.yaml @@ -16,6 +16,8 @@ global: lokiHost: loki-loki-distributed-gateway lokiPort: 80 + lokiuserHost: loki-user-loki-distributed-gateway + lokiuserPort: 80 s3Service: "minio.lma.svc:9000" # depends on $lmaNameSpace (ex. 
minio.taco-system.svc) lmaNameSpace: lma @@ -148,13 +150,17 @@ charts: - name: taco-loki host: $(lokiHost) port: $(lokiPort) + lokiuser: + - name: taco-loki-user + host: $(lokiuserHost) + port: $(lokiuserPort) targetLogs: - tag: kube.* bufferChunkSize: 2M bufferMaxSize: 5M - do_not_store_as_default: true + do_not_store_as_default: false index: container - loki_name: taco-loki + loki_name: taco-loki-user memBufLimit: 20MB multi_index: - index: platform @@ -275,7 +281,7 @@ charts: storegateway.persistence.size: 8Gi ruler.nodeSelector: $(nodeSelector) ruler.service.type: LoadBalancer - ruler.service.annotations: $(awsNlbAnnotation) + ruler.service.annotations: $(awsNlbAnnotation) ruler.alertmanagers: - http://alertmanager-operated:9093 ruler.persistence.size: 8Gi @@ -285,7 +291,7 @@ charts: rules: - alert: "PrometheusDown" expr: absent(up{prometheus="lma/lma-prometheus"}) - + - name: thanos-config override: objectStorage: @@ -341,6 +347,37 @@ charts: aws: s3: http://$(defaultUser):$(defaultPassword)@$(s3Service)/minio +- name: loki-user + override: + global.dnsService: kube-dns + # global.clusterDomain: $(clusterName) # annotate cluste because the cluster name is still cluster.local regardless cluster + gateway.service.type: LoadBalancer + gateway.service.annotations: $(awsNlbAnnotation) + ingester.persistence.storageClass: $(storageClassName) + distributor.persistence.storageClass: $(storageClassName) + queryFrontend.persistence.storageClass: $(storageClassName) + ruler.persistence.storageClass: $(storageClassName) + indexGateway.persistence.storageClass: $(storageClassName) + # select target node's label + ingester.nodeSelector: $(nodeSelector) + distributor.nodeSelector: $(nodeSelector) + querier.nodeSelector: $(nodeSelector) + queryFrontend.nodeSelector: $(nodeSelector) + queryScheduler.nodeSelector: $(nodeSelector) + tableManager.nodeSelector: $(nodeSelector) + gateway.nodeSelector: $(nodeSelector) + compactor.nodeSelector: $(nodeSelector) + ruler.nodeSelector: $(nodeSelector) + indexGateway.nodeSelector: $(nodeSelector) + memcachedChunks.nodeSelector: $(nodeSelector) + memcachedFrontend.nodeSelector: $(nodeSelector) + memcachedIndexQueries.nodeSelector: $(nodeSelector) + memcachedIndexWrites.nodeSelector: $(nodeSelector) + loki: + storageConfig: + aws: + s3: http://$(defaultUser):$(defaultPassword)@$(s3Service)/minio + - name: lma-bucket override: s3.enabled: true diff --git a/aws-reference/lma/site-values.yaml b/aws-reference/lma/site-values.yaml index 3d8d49a..7d7ce3e 100644 --- a/aws-reference/lma/site-values.yaml +++ b/aws-reference/lma/site-values.yaml @@ -16,6 +16,8 @@ global: lokiHost: loki-loki-distributed-gateway lokiPort: 80 + lokiuserHost: loki-user-loki-distributed-gateway + lokiuserPort: 80 s3Service: "minio.lma.svc:9000" # depends on $lmaNameSpace (ex. 
minio.taco-system.svc) lmaNameSpace: lma @@ -148,13 +150,17 @@ charts: - name: taco-loki host: $(lokiHost) port: $(lokiPort) + lokiuser: + - name: taco-loki-user + host: $(lokiuserHost) + port: $(lokiuserPort) targetLogs: - tag: kube.* bufferChunkSize: 2M bufferMaxSize: 5M - do_not_store_as_default: true + do_not_store_as_default: false index: container - loki_name: taco-loki + loki_name: taco-loki-user memBufLimit: 20MB multi_index: - index: platform @@ -244,7 +250,6 @@ charts: consoleIngress.nodeSelector: $(nodeSelector) postJob.nodeSelector: $(nodeSelector) - - name: thanos override: global.storageClass: $(storageClassName) @@ -341,6 +346,37 @@ charts: aws: s3: http://$(defaultUser):$(defaultPassword)@$(s3Service)/minio +- name: loki-user + override: + global.dnsService: kube-dns + # global.clusterDomain: $(clusterName) # annotate cluste because the cluster name is still cluster.local regardless cluster + gateway.service.type: LoadBalancer + gateway.service.annotations: $(awsNlbAnnotation) + ingester.persistence.storageClass: $(storageClassName) + distributor.persistence.storageClass: $(storageClassName) + queryFrontend.persistence.storageClass: $(storageClassName) + ruler.persistence.storageClass: $(storageClassName) + indexGateway.persistence.storageClass: $(storageClassName) + # select target node's label + ingester.nodeSelector: $(nodeSelector) + distributor.nodeSelector: $(nodeSelector) + querier.nodeSelector: $(nodeSelector) + queryFrontend.nodeSelector: $(nodeSelector) + queryScheduler.nodeSelector: $(nodeSelector) + tableManager.nodeSelector: $(nodeSelector) + gateway.nodeSelector: $(nodeSelector) + compactor.nodeSelector: $(nodeSelector) + ruler.nodeSelector: $(nodeSelector) + indexGateway.nodeSelector: $(nodeSelector) + memcachedChunks.nodeSelector: $(nodeSelector) + memcachedFrontend.nodeSelector: $(nodeSelector) + memcachedIndexQueries.nodeSelector: $(nodeSelector) + memcachedIndexWrites.nodeSelector: $(nodeSelector) + loki: + storageConfig: + aws: + s3: http://$(defaultUser):$(defaultPassword)@$(s3Service)/minio + - name: lma-bucket override: s3.enabled: true diff --git a/byoh-reference/lma/site-values.yaml b/byoh-reference/lma/site-values.yaml index c8aaf93..1cacd0a 100644 --- a/byoh-reference/lma/site-values.yaml +++ b/byoh-reference/lma/site-values.yaml @@ -16,6 +16,8 @@ global: lokiHost: loki-loki-distributed-gateway lokiPort: 80 + lokiuserHost: loki-user-loki-distributed-gateway + lokiuserPort: 80 s3Service: "minio.lma.svc:9000" # depends on $lmaNameSpace (ex. 
minio.taco-system.svc) lmaNameSpace: lma @@ -151,13 +153,17 @@ charts: - name: taco-loki host: $(lokiHost) port: $(lokiPort) + lokiuser: + - name: taco-loki-user + host: $(lokiuserHost) + port: $(lokiuserPort) targetLogs: - tag: kube.* bufferChunkSize: 2M bufferMaxSize: 5M - do_not_store_as_default: true + do_not_store_as_default: false index: container - loki_name: taco-loki + loki_name: taco-loki-user memBufLimit: 20MB multi_index: - index: platform @@ -283,7 +289,7 @@ charts: storegateway.persistence.size: 8Gi ruler.nodeSelector: $(nodeSelector) ruler.service.type: LoadBalancer - ruler.service.annotations: $(awsNlbAnnotation) + ruler.service.annotations: $(awsNlbAnnotation) ruler.alertmanagers: - http://alertmanager-operated:9093 ruler.persistence.size: 8Gi @@ -293,7 +299,7 @@ charts: rules: - alert: "PrometheusDown" expr: absent(up{prometheus="lma/lma-prometheus"}) - + - name: thanos-config override: objectStorage: @@ -350,6 +356,38 @@ charts: aws: s3: http://$(defaultUser):$(defaultPassword)@$(s3Service)/minio +- name: loki-user + override: + global.dnsService: kube-dns + gateway.service: + type: NodePort + nodePort: 30006 + gateway.service.annotations: $(awsNlbAnnotation) + ingester.persistence.storageClass: $(storageClassName) + distributor.persistence.storageClass: $(storageClassName) + queryFrontend.persistence.storageClass: $(storageClassName) + ruler.persistence.storageClass: $(storageClassName) + indexGateway.persistence.storageClass: $(storageClassName) + # select target node's label + ingester.nodeSelector: $(nodeSelector) + distributor.nodeSelector: $(nodeSelector) + querier.nodeSelector: $(nodeSelector) + queryFrontend.nodeSelector: $(nodeSelector) + queryScheduler.nodeSelector: $(nodeSelector) + tableManager.nodeSelector: $(nodeSelector) + gateway.nodeSelector: $(nodeSelector) + compactor.nodeSelector: $(nodeSelector) + ruler.nodeSelector: $(nodeSelector) + indexGateway.nodeSelector: $(nodeSelector) + memcachedChunks.nodeSelector: $(nodeSelector) + memcachedFrontend.nodeSelector: $(nodeSelector) + memcachedIndexQueries.nodeSelector: $(nodeSelector) + memcachedIndexWrites.nodeSelector: $(nodeSelector) + loki: + storageConfig: + aws: + s3: http://$(defaultUser):$(defaultPassword)@$(s3Service)/minio + - name: lma-bucket override: s3.enabled: true diff --git a/eks-msa-reference/lma/site-values.yaml b/eks-msa-reference/lma/site-values.yaml index 75946f3..d43de8c 100644 --- a/eks-msa-reference/lma/site-values.yaml +++ b/eks-msa-reference/lma/site-values.yaml @@ -16,6 +16,8 @@ global: lokiHost: loki-loki-distributed-gateway lokiPort: 80 + lokiuserHost: loki-user-loki-distributed-gateway + lokiuserPort: 80 s3Service: "minio.lma.svc:9000" # depends on $lmaNameSpace (ex. 
minio.taco-system.svc) lmaNameSpace: lma @@ -149,13 +151,17 @@ charts: - name: taco-loki host: $(lokiHost) port: $(lokiPort) + lokiuser: + - name: taco-loki-user + host: $(lokiuserHost) + port: $(lokiuserPort) targetLogs: - tag: kube.* bufferChunkSize: 2M bufferMaxSize: 5M - do_not_store_as_default: true + do_not_store_as_default: false index: container - loki_name: taco-loki + loki_name: taco-loki-user memBufLimit: 20MB multi_index: - index: platform @@ -276,7 +282,7 @@ charts: storegateway.persistence.size: 8Gi ruler.nodeSelector: $(nodeSelector) ruler.service.type: LoadBalancer - ruler.service.annotations: $(awsNlbAnnotation) + ruler.service.annotations: $(awsNlbAnnotation) ruler.alertmanagers: - http://alertmanager-operated:9093 ruler.persistence.size: 8Gi @@ -286,7 +292,7 @@ charts: rules: - alert: "PrometheusDown" expr: absent(up{prometheus="lma/lma-prometheus"}) - + - name: thanos-config override: objectStorage: @@ -342,6 +348,37 @@ charts: aws: s3: http://$(defaultUser):$(defaultPassword)@$(s3Service)/minio +- name: loki-user + override: + global.dnsService: kube-dns + # global.clusterDomain: $(clusterName) # annotate cluste because the cluster name is still cluster.local regardless cluster + gateway.service.type: LoadBalancer + gateway.service.annotations: $(awsNlbAnnotation) + ingester.persistence.storageClass: $(storageClassName) + distributor.persistence.storageClass: $(storageClassName) + queryFrontend.persistence.storageClass: $(storageClassName) + ruler.persistence.storageClass: $(storageClassName) + indexGateway.persistence.storageClass: $(storageClassName) + # select target node's label + ingester.nodeSelector: $(nodeSelector) + distributor.nodeSelector: $(nodeSelector) + querier.nodeSelector: $(nodeSelector) + queryFrontend.nodeSelector: $(nodeSelector) + queryScheduler.nodeSelector: $(nodeSelector) + tableManager.nodeSelector: $(nodeSelector) + gateway.nodeSelector: $(nodeSelector) + compactor.nodeSelector: $(nodeSelector) + ruler.nodeSelector: $(nodeSelector) + indexGateway.nodeSelector: $(nodeSelector) + memcachedChunks.nodeSelector: $(nodeSelector) + memcachedFrontend.nodeSelector: $(nodeSelector) + memcachedIndexQueries.nodeSelector: $(nodeSelector) + memcachedIndexWrites.nodeSelector: $(nodeSelector) + loki: + storageConfig: + aws: + s3: http://$(defaultUser):$(defaultPassword)@$(s3Service)/minio + - name: lma-bucket override: s3.enabled: true diff --git a/eks-reference/lma/site-values.yaml b/eks-reference/lma/site-values.yaml index 75946f3..d43de8c 100644 --- a/eks-reference/lma/site-values.yaml +++ b/eks-reference/lma/site-values.yaml @@ -16,6 +16,8 @@ global: lokiHost: loki-loki-distributed-gateway lokiPort: 80 + lokiuserHost: loki-user-loki-distributed-gateway + lokiuserPort: 80 s3Service: "minio.lma.svc:9000" # depends on $lmaNameSpace (ex. 
minio.taco-system.svc) lmaNameSpace: lma @@ -149,13 +151,17 @@ charts: - name: taco-loki host: $(lokiHost) port: $(lokiPort) + lokiuser: + - name: taco-loki-user + host: $(lokiuserHost) + port: $(lokiuserPort) targetLogs: - tag: kube.* bufferChunkSize: 2M bufferMaxSize: 5M - do_not_store_as_default: true + do_not_store_as_default: false index: container - loki_name: taco-loki + loki_name: taco-loki-user memBufLimit: 20MB multi_index: - index: platform @@ -276,7 +282,7 @@ charts: storegateway.persistence.size: 8Gi ruler.nodeSelector: $(nodeSelector) ruler.service.type: LoadBalancer - ruler.service.annotations: $(awsNlbAnnotation) + ruler.service.annotations: $(awsNlbAnnotation) ruler.alertmanagers: - http://alertmanager-operated:9093 ruler.persistence.size: 8Gi @@ -286,7 +292,7 @@ charts: rules: - alert: "PrometheusDown" expr: absent(up{prometheus="lma/lma-prometheus"}) - + - name: thanos-config override: objectStorage: @@ -342,6 +348,37 @@ charts: aws: s3: http://$(defaultUser):$(defaultPassword)@$(s3Service)/minio +- name: loki-user + override: + global.dnsService: kube-dns + # global.clusterDomain: $(clusterName) # annotate cluste because the cluster name is still cluster.local regardless cluster + gateway.service.type: LoadBalancer + gateway.service.annotations: $(awsNlbAnnotation) + ingester.persistence.storageClass: $(storageClassName) + distributor.persistence.storageClass: $(storageClassName) + queryFrontend.persistence.storageClass: $(storageClassName) + ruler.persistence.storageClass: $(storageClassName) + indexGateway.persistence.storageClass: $(storageClassName) + # select target node's label + ingester.nodeSelector: $(nodeSelector) + distributor.nodeSelector: $(nodeSelector) + querier.nodeSelector: $(nodeSelector) + queryFrontend.nodeSelector: $(nodeSelector) + queryScheduler.nodeSelector: $(nodeSelector) + tableManager.nodeSelector: $(nodeSelector) + gateway.nodeSelector: $(nodeSelector) + compactor.nodeSelector: $(nodeSelector) + ruler.nodeSelector: $(nodeSelector) + indexGateway.nodeSelector: $(nodeSelector) + memcachedChunks.nodeSelector: $(nodeSelector) + memcachedFrontend.nodeSelector: $(nodeSelector) + memcachedIndexQueries.nodeSelector: $(nodeSelector) + memcachedIndexWrites.nodeSelector: $(nodeSelector) + loki: + storageConfig: + aws: + s3: http://$(defaultUser):$(defaultPassword)@$(s3Service)/minio + - name: lma-bucket override: s3.enabled: true From 75991bdd36a247f998f0fd69ded054fb11b426ec Mon Sep 17 00:00:00 2001 From: "taekyu.kang" Date: Tue, 16 Jul 2024 14:59:53 +0900 Subject: [PATCH 09/12] trivial. 
remove service type LoadBalaner from thanos-ruler --- byoh-reference/lma/site-values.yaml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/byoh-reference/lma/site-values.yaml b/byoh-reference/lma/site-values.yaml index 1cacd0a..d976bfb 100644 --- a/byoh-reference/lma/site-values.yaml +++ b/byoh-reference/lma/site-values.yaml @@ -288,17 +288,19 @@ charts: # - --deduplication.replica-label="replica" storegateway.persistence.size: 8Gi ruler.nodeSelector: $(nodeSelector) - ruler.service.type: LoadBalancer - ruler.service.annotations: $(awsNlbAnnotation) ruler.alertmanagers: - http://alertmanager-operated:9093 ruler.persistence.size: 8Gi + ruler.service: + type: NodePort + nodePort: 30007 ruler.config: groups: - name: "tks" rules: - alert: "PrometheusDown" expr: absent(up{prometheus="lma/lma-prometheus"}) + - name: thanos-config override: From 0b5c26e917c660e0c010b9f50fa8a74d8907471b Mon Sep 17 00:00:00 2001 From: "taekyu.kang" Date: Mon, 22 Jul 2024 16:38:58 +0900 Subject: [PATCH 10/12] feature. add byok-reference --- byok-reference/lma/kustomization.yaml | 5 + byok-reference/lma/site-values.yaml | 398 ++++++++++++++++++ byok-reference/policy/kustomization.yaml | 5 + byok-reference/policy/site-values.yaml | 26 ++ .../sealed-secrets/kustomization.yaml | 5 + .../sealed-secrets/site-values.yaml | 6 + .../service-mesh/kustomization.yaml | 5 + byok-reference/service-mesh/site-values.yaml | 284 +++++++++++++ .../tks-admin-tools/kustomization.yaml | 5 + .../tks-admin-tools/site-values.yaml | 111 +++++ byok-reference/tks-cluster/kustomization.yaml | 6 + byok-reference/tks-cluster/site-values.yaml | 78 ++++ 12 files changed, 934 insertions(+) create mode 100644 byok-reference/lma/kustomization.yaml create mode 100644 byok-reference/lma/site-values.yaml create mode 100644 byok-reference/policy/kustomization.yaml create mode 100644 byok-reference/policy/site-values.yaml create mode 100644 byok-reference/sealed-secrets/kustomization.yaml create mode 100644 byok-reference/sealed-secrets/site-values.yaml create mode 100644 byok-reference/service-mesh/kustomization.yaml create mode 100644 byok-reference/service-mesh/site-values.yaml create mode 100644 byok-reference/tks-admin-tools/kustomization.yaml create mode 100644 byok-reference/tks-admin-tools/site-values.yaml create mode 100644 byok-reference/tks-cluster/kustomization.yaml create mode 100644 byok-reference/tks-cluster/site-values.yaml diff --git a/byok-reference/lma/kustomization.yaml b/byok-reference/lma/kustomization.yaml new file mode 100644 index 0000000..7c415e6 --- /dev/null +++ b/byok-reference/lma/kustomization.yaml @@ -0,0 +1,5 @@ +resources: + - ../base + +transformers: + - site-values.yaml diff --git a/byok-reference/lma/site-values.yaml b/byok-reference/lma/site-values.yaml new file mode 100644 index 0000000..5b43aaf --- /dev/null +++ b/byok-reference/lma/site-values.yaml @@ -0,0 +1,398 @@ +apiVersion: openinfradev.github.com/v1 +kind: HelmValuesTransformer +metadata: + name: site + +global: + nodeSelector: + taco-lma: enabled + clusterName: cluster.local + storageClassName: taco-storage + repository: https://openinfradev.github.io/helm-repo/ + serviceScrapeInterval: 30s + defaultPassword: password + defaultUser: taco + thanosObjstoreSecret: taco-objstore-secret + + lokiHost: loki-loki-distributed-gateway + lokiPort: 80 + lokiuserHost: loki-user-loki-distributed-gateway + lokiuserPort: 80 + s3Service: "minio.lma.svc:9000" # depends on $lmaNameSpace (ex. 
minio.taco-system.svc) + + lmaNameSpace: lma + + TksWebhookUrl: "FixItByWF" + SlackUrl: "FixItByWF" + SlackChannel: '#temporary-alert' + + grafanaDatasourceMetric: lma-prometheus:9090 + thanosQueryStores: + - thanos-storegateway:10901 + - prometheus-operated:10901 + + # servicemesh dashboard and grafana + realms: or2ru44fn + consoleUrl: tks-console.taco-cat.xyz + grafanaDomain: taco-cat.xyz + keycloakDomain: tks-console-dev.taco-cat.xyz + grafanaClientSecret: JLtsanYtrCg21RGxrcVmQP0GeuDFUhpA + + awsNlbAnnotation: + service.beta.kubernetes.io/aws-load-balancer-proxy-protocol: '*' + service.beta.kubernetes.io/aws-load-balancer-type: nlb + + tksIamRoles: [] + +charts: +- name: prometheus-operator + override: + prometheusOperator.nodeSelector: $(nodeSelector) + prometheusOperator.admissionWebhooks.patch.image.sha: "" + prometheusOperator.image.repository: tks/prometheus-operator + prometheusOperator.admissionWebhooks.patch.image.repository: tks/kube-webhook-certgen + prometheusOperator.prometheusConfigReloader.image.repository: tks/prometheus-config-reloader + prometheusOperator.thanosImage.repository: tks/thanos + +- name: prometheus + override: + kubeEtcd.enabled: true + prometheus.prometheusSpec.storageSpec.volumeClaimTemplate.spec.storageClassName: $(storageClassName) + prometheus.prometheusSpec.storageSpec.volumeClaimTemplate.spec.resources.requests.storage: 20Gi + prometheus.prometheusSpec.retention: 2d + prometheus.prometheusSpec.externalLabels.taco_cluster: $(clusterName) + prometheus.prometheusSpec.nodeSelector: $(nodeSelector) + prometheus.prometheusSpec.serviceMonitorNamespaceSelector.matchLabels.name: $(lmaNameSpace) + prometheus.prometheusSpec.podMonitorNamespaceSelector.matchLabels.name: $(lmaNameSpace) + prometheus.prometheusSpec.ruleNamespaceSelector.matchLabels.name: $(lmaNameSpace) + prometheus.thanosServiceExternal.annotations: $(awsNlbAnnotation) + prometheus.thanosServiceExternal: + type: NodePort + nodePort: 30004 + alertmanager.service.type: NodePort + alertmanager.service.nodePort: 30111 + alertmanager.alertmanagerSpec.alertmanagerConfigSelector.matchLabels.alertmanagerConfig: example + alertmanager.alertmanagerSpec.nodeSelector: $(nodeSelector) + alertmanager.alertmanagerSpec.retention: 2h + alertmanager.config: + global: + slack_api_url: $(SlackUrl) + receivers: + - name: tks-alert + webhook_configs: + - send_resolved: true + url: $(TksWebhookUrl) + route: + group_by: + - alertname + group_wait: 10s + receiver: tks-alert + repeat_interval: 1h + +- name: prometheus-node-exporter + override: + hostNetwork: false + +- name: kube-state-metrics + override: + nodeSelector: $(nodeSelector) + kubeVersion: v1.25.7 + +- name: prometheus-pushgateway + override: + nodeSelector: $(nodeSelector) + +- name: prometheus-process-exporter + override: + conf.processes: dockerd,kubelet,kube-proxy,ntpd,node + pod.hostNetwork: false + +- name: grafana + override: + adminPassword: password + persistence.storageClassName: $(storageClassName) + sidecar.dashboards.searchNamespace: ALL + # grafana oidc + service: + type: NodePort + nodePort: 30001 + service.annotations: $(awsNlbAnnotation) + grafana\.ini: + server: + domain: $(grafanaDomain) + root_url: http://$(grafanaDomain) + serve_from_sub_path: true + auth.generic_oauth: + enabled: true + name: keycloak + allow_sign_up: true + client_id: grafana + client_secret: $(grafanaClientSecret) + scopes: openid profile email + login_attribute_path: username + auth_url: https://$(keycloakDomain)/auth/realms/$(realms)/protocol/openid-connect/auth + 
token_url: https://$(keycloakDomain)/auth/realms/$(realms)/protocol/openid-connect/token + api_url: https://$(keycloakDomain)/auth/realms/$(realms)/protocol/openid-connect/userinfo + signout_redirect_url: $(consoleUrl)/login + auth: + disable_login_form: false + oauth_auto_login: true + disable_signout_menu: true + user: + auto_assign_org: true + auto_assign_org_role: Admin + +- name: fluent-operator + +- name: fluentbit + override: + fluentbit: + clusterName: $(clusterName) + outputs: + loki: + - name: taco-loki + host: $(lokiHost) + port: $(lokiPort) + lokiuser: + - name: taco-loki-user + host: $(lokiuserHost) + port: $(lokiuserPort) + targetLogs: + - tag: kube.* + bufferChunkSize: 2M + bufferMaxSize: 5M + do_not_store_as_default: false + index: container + loki_name: taco-loki-user + memBufLimit: 20MB + multi_index: + - index: platform + loki_name: taco-loki + key: $kubernetes['namespace_name'] + value: kube-system|$(lmaNameSpace)|taco-system|gatekeeper-system|argo + parser: docker + path: /var/log/containers/*.log + type: kubernates + extraArgs: + multilineParser: docker, cri + - tag: syslog.* + loki_name: taco-loki + index: syslog + parser: taco-syslog-parser-for-ubuntu + path: /var/log/messages + type: syslog + +- name: addons + override: + SPECIAL_VALUE: SPECIAL + serviceMonitor.trident: + enabled: false + interval: $(serviceScrapeInterval) + serviceMonitor.kubelet.interval: 30s + serviceMonitor.additionalScrapeConfigs: + grafanaDashboard.istio.enabled: false + grafanaDashboard.jaeger.enabled: false + grafanaDashboard.namespace: $(lmaNameSpace) + grafanaDatasource.namespace: $(lmaNameSpace) + serviceMonitor.istio.enabled: false + serviceMonitor.jaeger.enabled: false + serviceMonitor.argocd.enabled: false + serviceMonitor.argowf.enabled: false + prometheusRules.alert.enabled: false + prometheusRules.istio.aggregation.enabled: false + prometheusRules.istio.optimization.enabled: false + grafanaDatasource.prometheus.url: $(grafanaDatasourceMetric) + # grafanaDatasource.prometheus.url: "thanos-query.lma:9090" + grafanaDatasource.loki.url: $(lokiHost):$(lokiPort) + +- name: prometheus-adapter + override: + nodeSelector: $(nodeSelector) + +- name: kubernetes-event-exporter + override: + clustername: $(clusterName) + + conf.recievers: + - name: loki + type: file + config: + path: "/tmp/kubernetes-event.log" + addons: + loki: + enabled: true + host: $(lokiHost) + port: $(lokiPort) + target_file: "/tmp/kubernetes-event.log" + conf.default.hosts: + - "https://eck-elasticsearch-es-http.lma.svc.$(clusterName):9200" + +- name: minio + override: + users: + - accessKey: $(defaultUser) + secretKey: $(defaultPassword) + policy: consoleAdmin + buckets: + - name: tks-thanos + policy: public + purge: false + versioning: true + objectlocking: false + - name: tks-loki + policy: public + purge: false + versioning: true + objectlocking: false + persistence.storageClass: $(storageClassName) + persistence.size: 500Gi + persistence.accessMode: ReadWriteOnce + service: + type: NodePort + nodePort: 30003 + service.annotations: $(awsNlbAnnotation) + # deploy target node's label + consoleIngress.nodeSelector: $(nodeSelector) + postJob.nodeSelector: $(nodeSelector) + + +- name: thanos + override: + global.storageClass: $(storageClassName) + # temporarily add annotation because a cluster is using not cluster-name but 'cluster.local' + # clusterDomain: $(clusterName) + existingObjstoreSecret: $(thanosObjstoreSecret) + query.nodeSelector: $(nodeSelector) + query.service.type: ClusterIP + query.service.annotations: 
$(awsNlbAnnotation) + queryFrontend.nodeSelector: $(nodeSelector) + queryFrontend.service: + nodePorts: + http: 30005 + type: NodePort + queryFrontend.enabled: true + queryFrontend.config: |- + type: IN-MEMORY + config: + max_size: 512MB + max_size_items: 100 + validity: 100s + queryFrontend.extraFlags: [] + querier.stores: $(thanosQueryStores) + bucketweb.nodeSelector: $(nodeSelector) + compactor.nodeSelector: $(nodeSelector) + storegateway.nodeSelector: $(nodeSelector) + compactor.persistence.size: 8Gi + # compactor.extraFlags: + # - --compact.enable-vertical-compaction + # - --deduplication.replica-label="replica" + storegateway.persistence.size: 8Gi + ruler.nodeSelector: $(nodeSelector) + ruler.alertmanagers: + - http://alertmanager-operated:9093 + ruler.persistence.size: 8Gi + ruler.config: + groups: + - name: "tks" + rules: + - alert: "PrometheusDown" + expr: absent(up{prometheus="lma/lma-prometheus"}) + ruler.service: + type: NodePort + nodePort: 30007 + +- name: thanos-config + override: + objectStorage: + secretName: $(thanosObjstoreSecret) + rawConfig: + bucket: tks-thanos + endpoint: $(s3Service) + access_key: $(defaultUser) + secret_key: $(defaultPassword) + insecure: true + sidecarsService.name: thanos-sidecars + sidecarsService.endpoints: + - 192.168.97.102 # should not be in the loopback range (127.0.0.0/8) + +- name: prepare-etcd-secret + override: + nodeSelector: + "node-role.kubernetes.io/control-plane": "" + tolerations: + - key: "node-role.kubernetes.io/control-plane" + effect: "NoSchedule" + operator: "Exists" + +- name: loki + override: + global.dnsService: kube-dns + # global.clusterDomain: $(clusterName) # annotate cluste because the cluster name is still cluster.local regardless cluster + gateway.service: + type: NodePort + nodePort: 30002 + gateway.service.annotations: $(awsNlbAnnotation) + ingester.persistence.storageClass: $(storageClassName) + distributor.persistence.storageClass: $(storageClassName) + queryFrontend.persistence.storageClass: $(storageClassName) + ruler.persistence.storageClass: $(storageClassName) + indexGateway.persistence.storageClass: $(storageClassName) + # select target node's label + ingester.nodeSelector: $(nodeSelector) + distributor.nodeSelector: $(nodeSelector) + querier.nodeSelector: $(nodeSelector) + queryFrontend.nodeSelector: $(nodeSelector) + queryScheduler.nodeSelector: $(nodeSelector) + tableManager.nodeSelector: $(nodeSelector) + gateway.nodeSelector: $(nodeSelector) + compactor.nodeSelector: $(nodeSelector) + ruler.nodeSelector: $(nodeSelector) + indexGateway.nodeSelector: $(nodeSelector) + memcachedChunks.nodeSelector: $(nodeSelector) + memcachedFrontend.nodeSelector: $(nodeSelector) + memcachedIndexQueries.nodeSelector: $(nodeSelector) + memcachedIndexWrites.nodeSelector: $(nodeSelector) + loki: + storageConfig: + aws: + s3: http://$(defaultUser):$(defaultPassword)@$(s3Service)/minio + +- name: loki-user + override: + global.dnsService: kube-dns + gateway.service: + type: NodePort + nodePort: 30006 + gateway.service.annotations: $(awsNlbAnnotation) + ingester.persistence.storageClass: $(storageClassName) + distributor.persistence.storageClass: $(storageClassName) + queryFrontend.persistence.storageClass: $(storageClassName) + ruler.persistence.storageClass: $(storageClassName) + indexGateway.persistence.storageClass: $(storageClassName) + # select target node's label + ingester.nodeSelector: $(nodeSelector) + distributor.nodeSelector: $(nodeSelector) + querier.nodeSelector: $(nodeSelector) + queryFrontend.nodeSelector: 
$(nodeSelector) + queryScheduler.nodeSelector: $(nodeSelector) + tableManager.nodeSelector: $(nodeSelector) + gateway.nodeSelector: $(nodeSelector) + compactor.nodeSelector: $(nodeSelector) + ruler.nodeSelector: $(nodeSelector) + indexGateway.nodeSelector: $(nodeSelector) + memcachedChunks.nodeSelector: $(nodeSelector) + memcachedFrontend.nodeSelector: $(nodeSelector) + memcachedIndexQueries.nodeSelector: $(nodeSelector) + memcachedIndexWrites.nodeSelector: $(nodeSelector) + loki: + storageConfig: + aws: + s3: http://$(defaultUser):$(defaultPassword)@$(s3Service)/minio + +- name: lma-bucket + override: + s3.enabled: true + s3.buckets: + - name: $(clusterName)-tks-thanos + - name: $(clusterName)-tks-loki + tks.iamRoles: $(tksIamRoles) diff --git a/byok-reference/policy/kustomization.yaml b/byok-reference/policy/kustomization.yaml new file mode 100644 index 0000000..7c415e6 --- /dev/null +++ b/byok-reference/policy/kustomization.yaml @@ -0,0 +1,5 @@ +resources: + - ../base + +transformers: + - site-values.yaml diff --git a/byok-reference/policy/site-values.yaml b/byok-reference/policy/site-values.yaml new file mode 100644 index 0000000..80aa10e --- /dev/null +++ b/byok-reference/policy/site-values.yaml @@ -0,0 +1,26 @@ +apiVersion: openinfradev.github.com/v1 +kind: HelmValuesTransformer +metadata: + name: site + +global: + nodeSelector: + taco-lma: enabled + clusterName: cluster.local + storageClassName: taco-storage + repository: https://openinfradev.github.io/helm-repo/ + +charts: +- name: opa-gatekeeper + override: + postUpgrade.nodeSelector: $(nodeSelector) + postInstall.nodeSelector: $(nodeSelector) + preUninstall.nodeSelector: $(nodeSelector) + controllerManager.nodeSelector: $(nodeSelector) + audit.nodeSelector: $(nodeSelector) + crds.nodeSelector: $(nodeSelector) + + enableDeleteOperations: true + +- name: policy-resources + override: {} diff --git a/byok-reference/sealed-secrets/kustomization.yaml b/byok-reference/sealed-secrets/kustomization.yaml new file mode 100644 index 0000000..7c415e6 --- /dev/null +++ b/byok-reference/sealed-secrets/kustomization.yaml @@ -0,0 +1,5 @@ +resources: + - ../base + +transformers: + - site-values.yaml diff --git a/byok-reference/sealed-secrets/site-values.yaml b/byok-reference/sealed-secrets/site-values.yaml new file mode 100644 index 0000000..6fb83a4 --- /dev/null +++ b/byok-reference/sealed-secrets/site-values.yaml @@ -0,0 +1,6 @@ +apiVersion: openinfradev.github.com/v1 +kind: HelmValuesTransformer +metadata: + name: site + +charts: [] diff --git a/byok-reference/service-mesh/kustomization.yaml b/byok-reference/service-mesh/kustomization.yaml new file mode 100644 index 0000000..7c415e6 --- /dev/null +++ b/byok-reference/service-mesh/kustomization.yaml @@ -0,0 +1,5 @@ +resources: + - ../base + +transformers: + - site-values.yaml diff --git a/byok-reference/service-mesh/site-values.yaml b/byok-reference/service-mesh/site-values.yaml new file mode 100644 index 0000000..1686981 --- /dev/null +++ b/byok-reference/service-mesh/site-values.yaml @@ -0,0 +1,284 @@ +apiVersion: openinfradev.github.com/v1 +kind: HelmValuesTransformer +metadata: + name: site + +global: + clusterName: cluster.local + namespace: tks-msa + imageRegistry: harbor.taco-cat.xyz + serviceMeshControlNodeSelector: + tks-msa: enabled + serviceMeshIngressNodeSelector: + tks-ingressgateway: enabled + serviceMeshEgressNodeSelector: + tks-egressgateway: enabled + ingressGatewayLabel: istio-ingressgateway + egressGatewayLabel: istio-egressgateway + keycloakIssuerUri: 
https://keycloak.com/auth/realms/oraganization + keycloakClientPrefix: client-prefix + gatekeeperSecret: gatekeeper-secret + +charts: +- name: cert-manager + override: + image: + repository: $(imageRegistry)/tks/cert-manager-controller + nodeSelector: + tks-msa: enabled + webhook: + image: + repository: $(imageRegistry)/tks/cert-manager-webhook + nodeSelector: + tks-msa: enabled + cainjector: + image: + repository: $(imageRegistry)/tks/cert-manager-cainjector + nodeSelector: + tks-msa: enabled + +- name: k8ssandra-operator + override: + image: + registry: $(imageRegistry) + repository: tks/k8ssandra-operator + tag: v1.6.0 + nodeSelector: + tks-msa: enabled + cleaner: + image: + registry: $(imageRegistry) + repository: tks/k8ssandra-tools + tag: latest + client: + image: + registry: $(imageRegistry) + repository: tks/k8ssandra-tools + tag: latest + cass-operator: + image: + registry: $(imageRegistry) + repository: tks/cass-operator + tag: v1.14.0 + nodeSelector: + tks-msa: enabled + +- name: servicemesh-k8ssandra-resource + override: + namespace: $(namespace) + cassandra: + jmxInitContainerImage: + name: busybox + registry: $(imageRegistry)/tks + tag: 1.34.1 + datacenters: + size: 1 + perNodeConfigInitContainerImage: $(imageRegistry)/tks/yq:4 + initContainers: + serverConfigInitImage: $(imageRegistry)/tks/cass-config-builder:1.0-ubi7 + jmxInitContainerImage: + name: busybox + registry: $(imageRegistry)/tks + tag: 1.34.1 + containers: + - name: cassandra + image: $(imageRegistry)/tks/cass-management-api:4.0.6 + - name: server-system-logger + image: $(imageRegistry)/tks/system-logger:v1.14.0 + config: + heapSize: 2048M + storageConfig: + storageClassName: taco-storage + accessModes: ReadWriteOnce + size: 300Gi + racks: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: tks-msa + operator: In + values: + - enabled + stargate: + size: 1 + heapSize: 384M + containerImage: + registry: $(imageRegistry) + repository: tks + tag: v1.0.67 + nodeSelector: + tks-msa: enabled + +- name: istiod + override: + revision: "" + pilot.autoscaleEnabled: false + pilot.traceSampling: 0.1 + pilot.nodeSelector: $(serviceMeshControlNodeSelector) + global.hub: $(imageRegistry)/tks + global.proxy.clusterDomain: $(clusterName) + global.tracer.zipkin.address: jaeger-operator-jaeger-collector.$(namespace):9411 + +- name: istio-ingressgateway + override: + revision: "" + replicaCount: 1 + image: $(imageRegistry)/tks/proxyv2:1.17.2 + autoscaling: + enabled: false + minReplicas: 1 + maxReplicas: 5 + targetCPUUtilizationPercentage: 80 + service: + type: NodePort + ports: + - name: status-port + port: 15021 + protocol: TCP + targetPort: 15021 + nodePort: 30013 + - name: http2 + port: 80 + protocol: TCP + targetPort: 80 + nodePort: 30014 + - name: https + port: 443 + protocol: TCP + targetPort: 443 + nodePort: 30015 + #resources.requests.cpu: 1000m + #resources.requests.memory: 1024Mi + #resources.limits.cpu: 2000m + #resources.limits.memory: 2048Mi + nodeSelector: $(serviceMeshIngressNodeSelector) + +- name: istio-egressgateway + override: + revision: "" + replicaCount: 1 + image: $(imageRegistry)/tks/proxyv2:1.17.2 + autoscaling.enabled: false + service.type: ClusterIP + #resources.requests.cpu: 1000m + #resources.requests.memory: 1024Mi + #resources.limits.cpu: 2000m + #resources.limits.memory: 2048Mi + nodeSelector: $(serviceMeshEgressNodeSelector) + +- name: jaeger-operator + override: + image: + repository: $(imageRegistry)/tks/jaeger-operator 
+ tag: 1.35.0 + nodeSelector: $(serviceMeshControlNodeSelector) + +- name: servicemesh-jaeger-resource + override: + namespace: tks-msa + sampling.param: 10 + collector.resources.requests.cpu: 500m + collector.resources.requests.memory: 1024Mi + collector.resources.limits.cpu: 1000m + collector.resources.limits.memory: 2048Mi + collector: + image: $(imageRegistry)/tks/jaeger-collector:1.35.0 + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: tks-msa + operator: In + values: + - enabled + storage: + type: cassandra + cassandra: + options: + servers: cassandra-dc-service.tks-msa.svc + keyspace: jaeger_v1_datacenter + cassandraCreateSchema: + image: $(imageRegistry)/tks/jaeger-cassandra-schema:1.35.0 + dependencies: + enabled: true + image: $(imageRegistry)/tks/spark-dependencies:1.35.0 + query: + image: $(imageRegistry)/tks/jaeger-query:1.35.0 + basePath: / + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: tks-msa + operator: In + values: + - enabled + agent: + image: $(imageRegistry)/tks/jaeger-agent:1.35.0 + cassandra: + user: + enabled: true + username: tks + password: tksword + nodeSelector: + tks-msa: enabled + elasticsearch.user.enabled: false + +- name: kiali-operator + override: + image: + repo: $(imageRegistry)/tks/kiali-operator + tag: v1.63.0 + nodeSelector: $(serviceMeshControlNodeSelector) + +- name: servicemesh-kiali-resource + override: + namespace: tks-msa + istioNamespace: tks-msa + deployment.namespace: tks-msa + deployment.image_name: $(imageRegistry)/tks/kiali + deployment.image_version: v1.63.0 + deployment.resources.requests.cpu: 500m + deployment.resources.requests.memory: 512Mi + deployment.resources.limits.cpu: 1000m + deployment.resources.limits.memory: 1024Mi + deployment.nodeSelector: + tks-msa: enabled + deployment.serviceType: NodePort + auth.strategy: openid + auth.openid.client_id: $(keycloakClientPrefix)-kiali + auth.openid.issuer_uri: $(keycloakIssuerUri) + auth.openid.username_claim: preferred_username + auth.openid.scopes: [ "openid", "email" ] + auth.openid.disable_rbac: true + externalServices.istio.configMapName: istio + externalServices.istio.istioIdentityDomain: svc.$(clusterName) + externalServices.prometheus.url: http://lma-prometheus.lma.svc:9090 + externalServices.tracing.inClusterUrl: http://jaeger-operator-jaeger-query.tks-msa:16686 + externalServices.tracing.url: https://jaeger-v2.taco-cat.xyz + externalServices.tracing.useGrpc: false + externalServices.grafana.auth.type: basic + externalServices.grafana.auth.username: admin + externalServices.grafana.auth.password: password + externalServices.grafana.inClusterUrl: http://grafana.lma.svc:80 + externalServices.grafana.url: https://grafana-v2.taco-cat.xyz + server.webRoot: / + +- name: gatekeeper + override: + nodeSelector: + tks-msa: enabled + config: + discovery-url: $(keycloakIssuerUri) + upstream-url: http://jaeger-operator-jaeger-query.tks-msa.svc:16686 + client-id: $(keycloakClientPrefix)-gatekeeper-jaeger + client-secret: $(gatekeeperSecret) + secure-cookie: false + service: + type: NodePort + proxy: + nodePort: 30012 diff --git a/byok-reference/tks-admin-tools/kustomization.yaml b/byok-reference/tks-admin-tools/kustomization.yaml new file mode 100644 index 0000000..7c415e6 --- /dev/null +++ b/byok-reference/tks-admin-tools/kustomization.yaml @@ -0,0 +1,5 @@ +resources: + - ../base + +transformers: + - site-values.yaml diff --git 
a/byok-reference/tks-admin-tools/site-values.yaml b/byok-reference/tks-admin-tools/site-values.yaml new file mode 100644 index 0000000..a9a5bf0 --- /dev/null +++ b/byok-reference/tks-admin-tools/site-values.yaml @@ -0,0 +1,111 @@ +apiVersion: openinfradev.github.com/v1 +kind: HelmValuesTransformer +metadata: + name: site + +global: + dbHost: ${DATABASE_HOST} + commonPassword: ${COMMON_PASSWORD} + storageClass: ${STORAGE_CLASS} + storageClassHa: ${STORAGE_CLASS_HA} + +charts: +- name: keycloak + override: + replicaCount: 3 + global.storageClass: $(storageClass) + auth.adminPassword: $(commonPassword) + ingress.enabled: true + ingress.hostname: TO_BE_FIXED + ingress.annotations: + nginx.ingress.kubernetes.io/proxy-buffer-size: 20k + acme.cert-manager.io/http01-edit-in-place: "true" + cert-manager.io/cluster-issuer: http0issuer + externalDatabase.host: $(dbHost) + externalDatabase.password: $(commonPassword) + +- name: tks-apis + override: + gitBaseUrl: https://github.com + gitAccount: decapod10 + db: + dbHost: $(dbHost) + adminPassword: $(commonPassword) + dbUser: tksuser + dbPassword: $(commonPassword) + tksapi: + replicaCount: 1 + tksAccount: + password: $(commonPassword) + args: + imageRegistryUrl: "harbor.taco-cat.xyz/appserving" + gitRepositoryUrl: "github.com/openinfradev" + keycloakAddress: http://keycloak.keycloak.svc:80/auth + tksbatch: + replicaCount: 1 + tksconsole: + replicaCount: 1 + +- name: harbor + override: + expose: + ingress: + hosts: + core: TO_BE_FIXED + className: "nginx" + externalURL: TO_BE_FIXED + persistence: + persistentVolumeClaim: + registry: + storageClass: $(storageClassHa) + accessMode: ReadWriteMany + size: 200Gi + chartmuseum: + storageClass: $(storageClassHa) + accessMode: ReadWriteMany + size: 20Gi + jobservice: + jobLog: + storageClass: $(storageClassHa) + accessMode: ReadWriteMany + scanDataExports: + storageClass: $(storageClassHa) + accessMode: ReadWriteMany + redis: + storageClass: $(storageClass) + accessMode: ReadWriteOnce + trivy: + storageClass: $(storageClass) + database: + type: external + external: + host: $(dbHost) + password: $(commonPassword) + sslmode: "require" + core: + replicas: 2 + jobservice: + replicas: 2 + registry: + replicas: 2 + chartmuseum: + replicas: 2 + trivy: + replicas: 2 + portal: + replicas: 2 + harborAdminPassword: $(commonPassword) + +- name: ingress-nginx + override: + controller: + resources: + requests: + cpu: 2000m + memory: 4Gi + service: + externalTrafficPolicy: Local + type: NodePort + config: + enable-underscores-in-headers: "true" + proxy-body-size: "10m" diff --git a/byok-reference/tks-cluster/kustomization.yaml b/byok-reference/tks-cluster/kustomization.yaml new file mode 100644 index 0000000..acae49c --- /dev/null +++ b/byok-reference/tks-cluster/kustomization.yaml @@ -0,0 +1,6 @@ +resources: +- ../base +- ../infra/byoh + +transformers: +- site-values.yaml diff --git a/byok-reference/tks-cluster/site-values.yaml b/byok-reference/tks-cluster/site-values.yaml new file mode 100644 index 0000000..e964fca --- /dev/null +++ b/byok-reference/tks-cluster/site-values.yaml @@ -0,0 +1,78 @@ +apiVersion: openinfradev.github.com/v1 +kind: HelmValuesTransformer +metadata: + name: site + +global: + # These values are replaced on cluster creation by workflow + clusterName: cluster.local + clusterEndpointHost: CHANGEME + clusterEndpointPort: CHANGEME + tksCpNode: CHNAGEME + tksInfraNode: CHNAGEME + tksUserNode: CHANGEME + keycloakIssuerUri: CHANGEME + keycloakClientId: CHANGEME +charts: +- name: cluster-api-byoh + override: + 
cluster: + name: $(clusterName) + kubernetesVersion: v1.25.11 + byoCluster: + bundleLookupBaseRegistry: harbor.taco-cat.xyz/cluster_api_provider_bringyourownhost + controlPlaneEndpoint: + host: $(clusterEndpointHost) + port: $(clusterEndpointPort) + kubeadmControlPlane: + selector: + matchLabels: + role: $(clusterName)-control-plane + replicas: $(tksCpNode) + clusterConfiguration: + apiServer: + extraArgs: + oidc-client-id: $(keycloakClientId) + oidc-issuer-url: $(keycloakIssuerUri) + machineDeployment: + - name: taco + replicas: $(tksInfraNode) + selector: + matchLabels: + role: $(clusterName)-tks + labels: + servicemesh: enabled + taco-egress-gateway: enabled + taco-ingress-gateway: enabled + taco-lma: enabled + - name: normal + replicas: $(tksUserNode) + selector: + matchLabels: + role: $(clusterName)-worker + +- name: ingress-nginx + override: + controller: + nodeSelector: + taco-lma: enabled + resources: + requests: + cpu: 2000m + memory: 4Gi + service: + externalTrafficPolicy: Local + type: NodePort + config: + enable-underscores-in-headers: "true" + proxy-body-size: "10m" + +- name: cluster-autoscaler + override: + discoveryNamespace: $(clusterName) + discoveryClusterName: $(clusterName) + +- name: cluster-autoscaler-rbac + override: + deployMgmtRbacOnly: + targetNamespace: $(clusterName) From 228e57344d48794ed3572f3b953ed62c20812d0a Mon Sep 17 00:00:00 2001 From: "taekyu.kang" Date: Wed, 4 Sep 2024 17:36:24 +0900 Subject: [PATCH 11/12] bugfix. add s3 bucket 'tks-loki-user' --- aws-msa-reference/lma/site-values.yaml | 1 + aws-reference/lma/site-values.yaml | 1 + eks-msa-reference/lma/site-values.yaml | 1 + eks-reference/lma/site-values.yaml | 1 + 4 files changed, 4 insertions(+) diff --git a/aws-msa-reference/lma/site-values.yaml b/aws-msa-reference/lma/site-values.yaml index 1f4d649..ba3d923 100644 --- a/aws-msa-reference/lma/site-values.yaml +++ b/aws-msa-reference/lma/site-values.yaml @@ -384,4 +384,5 @@ charts: s3.buckets: - name: $(clusterName)-tks-thanos - name: $(clusterName)-tks-loki + - name: $(clusterName)-tks-loki-user tks.iamRoles: $(tksIamRoles) diff --git a/aws-reference/lma/site-values.yaml b/aws-reference/lma/site-values.yaml index 7d7ce3e..6d8f0b3 100644 --- a/aws-reference/lma/site-values.yaml +++ b/aws-reference/lma/site-values.yaml @@ -383,4 +383,5 @@ charts: s3.buckets: - name: $(clusterName)-tks-thanos - name: $(clusterName)-tks-loki + - name: $(clusterName)-tks-loki-user tks.iamRoles: $(tksIamRoles) diff --git a/eks-msa-reference/lma/site-values.yaml b/eks-msa-reference/lma/site-values.yaml index d43de8c..112f656 100644 --- a/eks-msa-reference/lma/site-values.yaml +++ b/eks-msa-reference/lma/site-values.yaml @@ -385,4 +385,5 @@ charts: s3.buckets: - name: $(clusterName)-tks-thanos - name: $(clusterName)-tks-loki + - name: $(clusterName)-tks-loki-user tks.iamRoles: $(tksIamRoles) diff --git a/eks-reference/lma/site-values.yaml b/eks-reference/lma/site-values.yaml index d43de8c..112f656 100644 --- a/eks-reference/lma/site-values.yaml +++ b/eks-reference/lma/site-values.yaml @@ -385,4 +385,5 @@ charts: s3.buckets: - name: $(clusterName)-tks-thanos - name: $(clusterName)-tks-loki + - name: $(clusterName)-tks-loki-user tks.iamRoles: $(tksIamRoles) From 6b49313f12a897e0fa039d5e9b0156ed4cdfcf59 Mon Sep 17 00:00:00 2001 From: "taekyu.kang" Date: Fri, 6 Sep 2024 16:54:48 +0900 Subject: [PATCH 12/12] feature. 
update kubernetes version to 1.29.8 --- aws-msa-reference/tks-cluster/site-values.yaml | 8 +++++++- aws-reference/tks-cluster/site-values.yaml | 8 +++++++- byoh-reference/tks-cluster/site-values.yaml | 2 +- eks-msa-reference/tks-cluster/site-values.yaml | 6 +++--- eks-reference/tks-cluster/site-values.yaml | 6 +++--- 5 files changed, 21 insertions(+), 9 deletions(-) diff --git a/aws-msa-reference/tks-cluster/site-values.yaml b/aws-msa-reference/tks-cluster/site-values.yaml index 406fe3e..6028c31 100644 --- a/aws-msa-reference/tks-cluster/site-values.yaml +++ b/aws-msa-reference/tks-cluster/site-values.yaml @@ -27,7 +27,7 @@ charts: sshKeyName: $(sshKeyName) cluster: name: $(clusterName) - kubernetesVersion: v1.26.10 + kubernetesVersion: v1.29.8 eksEnabled: false multitenancyId: kind: AWSClusterRoleIdentity @@ -54,6 +54,8 @@ charts: kubeadmControlPlane: replicas: $(tksCpNode) controlPlaneMachineType: $(tksCpNodeType) + ami: + id: ami-02e4e8f09921cfe97 machinePool: - name: taco machineType: $(tksInfraNodeType) @@ -69,6 +71,8 @@ charts: taco-ingress-gateway: enabled roleAdditionalPolicies: - "arn:aws:iam::aws:policy/AmazonS3FullAccess" + ami: + id: ami-02e4e8f09921cfe97 machineDeployment: - name: normal numberOfAZ: 3 # ap-northeast-2 @@ -80,6 +84,8 @@ charts: rootVolume: size: 50 type: gp2 + ami: + id: ami-02e4e8f09921cfe97 - name: ingress-nginx override: diff --git a/aws-reference/tks-cluster/site-values.yaml b/aws-reference/tks-cluster/site-values.yaml index 406fe3e..6028c31 100644 --- a/aws-reference/tks-cluster/site-values.yaml +++ b/aws-reference/tks-cluster/site-values.yaml @@ -27,7 +27,7 @@ charts: sshKeyName: $(sshKeyName) cluster: name: $(clusterName) - kubernetesVersion: v1.26.10 + kubernetesVersion: v1.29.8 eksEnabled: false multitenancyId: kind: AWSClusterRoleIdentity @@ -54,6 +54,8 @@ charts: kubeadmControlPlane: replicas: $(tksCpNode) controlPlaneMachineType: $(tksCpNodeType) + ami: + id: ami-02e4e8f09921cfe97 machinePool: - name: taco machineType: $(tksInfraNodeType) @@ -69,6 +71,8 @@ charts: taco-ingress-gateway: enabled roleAdditionalPolicies: - "arn:aws:iam::aws:policy/AmazonS3FullAccess" + ami: + id: ami-02e4e8f09921cfe97 machineDeployment: - name: normal numberOfAZ: 3 # ap-northeast-2 @@ -80,6 +84,8 @@ charts: rootVolume: size: 50 type: gp2 + ami: + id: ami-02e4e8f09921cfe97 - name: ingress-nginx override: diff --git a/byoh-reference/tks-cluster/site-values.yaml b/byoh-reference/tks-cluster/site-values.yaml index e964fca..e417a91 100644 --- a/byoh-reference/tks-cluster/site-values.yaml +++ b/byoh-reference/tks-cluster/site-values.yaml @@ -18,7 +18,7 @@ charts: override: cluster: name: $(clusterName) - kubernetesVersion: v1.25.11 + kubernetesVersion: v1.29.8 byoCluster: bundleLookupBaseRegistry: harbor.taco-cat.xyz/cluster_api_provider_bringyourownhost controlPlaneEndpoint: diff --git a/eks-msa-reference/tks-cluster/site-values.yaml b/eks-msa-reference/tks-cluster/site-values.yaml index 47c3174..8704c74 100644 --- a/eks-msa-reference/tks-cluster/site-values.yaml +++ b/eks-msa-reference/tks-cluster/site-values.yaml @@ -27,14 +27,14 @@ charts: name: $(clusterName) region: $(clusterRegion) eksEnabled: true - kubernetesVersion: v1.25.9 + kubernetesVersion: v1.29.8 eksAddons: - name: "aws-ebs-csi-driver" - version: "v1.18.0-eksbuild.1" + version: "v1.34.0-eksbuild.1" conflictResolution: "overwrite" - name: "vpc-cni" conflictResolution: "overwrite" - version: "v1.12.6-eksbuild.2" + version: "v1.18.3-eksbuild.2" multitenancyId: kind: AWSClusterRoleIdentity name: 
$(cloudAccountID)-account-role diff --git a/eks-reference/tks-cluster/site-values.yaml b/eks-reference/tks-cluster/site-values.yaml index 6d88add..a3319be 100644 --- a/eks-reference/tks-cluster/site-values.yaml +++ b/eks-reference/tks-cluster/site-values.yaml @@ -30,14 +30,14 @@ charts: name: $(clusterName) region: $(clusterRegion) eksEnabled: true - kubernetesVersion: v1.25.9 + kubernetesVersion: v1.29.8 eksAddons: - name: "aws-ebs-csi-driver" - version: "v1.18.0-eksbuild.1" + version: "v1.34.0-eksbuild.1" conflictResolution: "overwrite" - name: "vpc-cni" conflictResolution: "overwrite" - version: "v1.12.6-eksbuild.2" + version: "v1.18.3-eksbuild.2" multitenancyId: kind: AWSClusterRoleIdentity name: $(cloudAccountID)-account-role
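
The patches above keep repeating one pattern: a value is declared once under `global` in a site-values.yaml and then wired into chart overrides as `$(variable)` (for example lokiuserHost/lokiuserPort feeding the new taco-loki-user output and the loki-user chart). A reference whose declaration was forgotten is easy to miss in review. The following is a minimal sanity-check sketch, not part of the patch series and not the project's own tooling; it assumes PyYAML is installed, takes a site-values.yaml path as an argument, and reports any `$(...)` reference that has no matching key under `global` (some variables may legitimately be injected later by the TKS workflow, so treat the output as advisory).

#!/usr/bin/env python3
# Illustrative sketch: flag $(var) references in chart overrides that are not
# declared under `global` in a HelmValuesTransformer site-values.yaml.
# Assumes PyYAML is available; the file path is supplied by the caller.
import re
import sys

import yaml

VAR_PATTERN = re.compile(r"\$\(([A-Za-z0-9_]+)\)")


def collect_refs(node, refs):
    # Recursively gather $(var) tokens from every string in the YAML tree.
    if isinstance(node, str):
        refs.update(VAR_PATTERN.findall(node))
    elif isinstance(node, dict):
        for value in node.values():
            collect_refs(value, refs)
    elif isinstance(node, list):
        for item in node:
            collect_refs(item, refs)


def main(path):
    with open(path) as f:
        doc = yaml.safe_load(f)

    declared = set((doc.get("global") or {}).keys())
    referenced = set()
    collect_refs(doc.get("charts"), referenced)

    missing = sorted(referenced - declared)
    if missing:
        print(f"{path}: referenced but not declared under global: {', '.join(missing)}")
        return 1
    print(f"{path}: all {len(referenced)} referenced variables are declared")
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv[1] if len(sys.argv) > 1 else "site-values.yaml"))

Run against, say, byok-reference/lma/site-values.yaml after editing it; a non-zero exit code simply means at least one reference has no `global` entry in that file.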