---
# ConfigMap holding Prometheus recording/alerting rules for the lcm-bricks chart.
# The rule file itself is embedded as a string under .data and parsed by Prometheus,
# so the {{`{{ $labels.* }}`}} escapes keep Prometheus template syntax out of Helm's hands.
apiVersion: v1
kind: ConfigMap
metadata:
  name: lcm-bricks-monitoring-rules
  namespace: monitoring
  labels:
    # Quote templated values so an empty expansion renders "" rather than null.
    app.kubernetes.io/name: "{{ include "lcm-bricks.name" . }}"
    team: lcm
    helm.sh/chart: "{{ include "lcm-bricks.chart" . }}"
    app.kubernetes.io/instance: "{{ .Release.Name }}"
    app.kubernetes.io/managed-by: "{{ .Release.Service }}"
data:
  lcm-bricks-monitoring-rules.yaml: |+
    groups:
      - name: lcm-bricks-monitoring-rules
        rules:
          # Restarts per container over the last 10 minutes, scoped to the release namespace.
          - record: "container_pod:lcm_pod_container_status_restarts:increase10m"
            expr: increase(kube_pod_container_status_restarts_total{namespace='{{ .Release.Namespace }}'}[10m])

          # Warning at the first restart within the 10m window.
          - alert: "[LCM] Pod has too many restarts on cluster={{ .Values.clusterId }}"
            expr: container_pod:lcm_pod_container_status_restarts:increase10m >= 1
            labels:
              severity: warning
              team: lcm  # switch to msf in production
              cluster_id: "{{ .Values.clusterId }}"
            annotations:
              description: "There is more than 0 restarts of {{`{{ $labels.pod }}`}} pod in the last 10 minutes"
              summary: "{{`{{ $labels.pod }}`}} pod has too many restarts"

          # Same alert name as the warning above (Prometheus distinguishes by labels);
          # escalates to critical at the second restart.
          - alert: "[LCM] Pod has too many restarts on cluster={{ .Values.clusterId }}"
            expr: container_pod:lcm_pod_container_status_restarts:increase10m >= 2
            labels:
              severity: critical
              team: lcm  # switch to msf in production
              cluster_id: "{{ .Values.clusterId }}"
            annotations:
              description: "There is more than 1 restart of {{`{{ $labels.pod }}`}} pod in the last 10 minutes"
              summary: "{{`{{ $labels.pod }}`}} pod has too many restarts"

          # OOMKill terminations per container over the last 10 minutes.
          - record: "container_pod:lcm_pod_container_status_oomkilled:increase10m"
            expr: increase(kube_pod_container_status_terminated_reason{namespace='{{ .Release.Namespace }}', reason='OOMKilled'}[10m])

          # NOTE: alert name fixed from "occured" to "occurred" — update any Alertmanager
          # routes or silences that matched the old spelling.
          - alert: "[LCM] OOMKill occurred on cluster={{ .Values.clusterId }}"
            expr: container_pod:lcm_pod_container_status_oomkilled:increase10m >= 1
            labels:
              severity: warning
              team: lcm  # switch to msf in production
              cluster_id: "{{ .Values.clusterId }}"
            annotations:
              # Window corrected to 10 minutes to match the [10m] recording rule above.
              description: "{{`{{ $labels.pod }}`}} was OOMKilled in the last 10 minutes. Investigate and/or increase memoryRequest or memoryLimit."
              summary: "{{`{{ $labels.pod }}`}} OOMKill occurred"

          - alert: "[LCM] OOMKill occurred on cluster={{ .Values.clusterId }}"
            expr: container_pod:lcm_pod_container_status_oomkilled:increase10m >= 2
            labels:
              severity: critical
              team: lcm  # switch to msf in production
              cluster_id: "{{ .Values.clusterId }}"
            annotations:
              description: "{{`{{ $labels.pod }}`}} was OOMKilled in the last 10 minutes. Investigate and/or increase memoryRequest or memoryLimit."
              summary: "{{`{{ $labels.pod }}`}} OOMKill occurred"

          # CPU throttling: container spends >1 CFS-throttled second per second for 5m.
          - alert: "[LCM] Container is being throttled on cluster={{ .Values.clusterId }}"
            expr: rate(container_cpu_cfs_throttled_seconds_total{namespace='{{ .Release.Namespace }}'}[1m]) > 1
            for: 5m
            labels:
              severity: critical
              team: lcm  # switch to msf in production
              cluster_id: "{{ .Values.clusterId }}"
            annotations:
              description: "{{`{{ $labels.pod_name }}`}} container is being throttled and probably hit CPU limit. Investigate root cause and increase limit and/or number of replicas if necessary."
              summary: "{{`{{ $labels.pod_name }}`}} Container is being throttled"

          # JVM GC pause time: more than 1 pause-second per second sustained for 5m.
          - alert: "[LCM] is doing too much pause GC on cluster={{ .Values.clusterId }}"
            expr: rate(jvm_gc_pause_seconds_sum{kubernetes_namespace='{{ .Release.Namespace }}'}[1m]) > 1
            for: 5m
            labels:
              severity: critical
              team: lcm  # switch to msf in production
              cluster_id: "{{ .Values.clusterId }}"
            annotations:
              description: "{{`{{ $labels.kubernetes_pod_name }}`}} container is spending too much time in pause garbage collector. Investigate root cause and increase heap size and/or number of replicas if necessary."
              summary: "{{`{{ $labels.kubernetes_pod_name }}`}} is doing too much pause GC"

          # Job accumulation guard. Namespace templated for consistency with the rules
          # above — assumes the chart is released into the lcm namespace (TODO confirm;
          # the previous version hard-coded namespace="lcm" here).
          - alert: "[LCM] there is more than 100 jobs on cluster={{ .Values.clusterId }}"
            expr: count(kube_job_info{namespace="{{ .Release.Namespace }}"}) > 100
            labels:
              severity: critical
              team: lcm  # switch to msf in production
              cluster_id: "{{ .Values.clusterId }}"
            annotations:
              description: "There are more than 100 jobs in LCM namespace. They are likely not deleted."
              summary: "There are more than 100 jobs in LCM namespace."