# ConfigMap that carries the Prometheus recording and alerting rules for the
# lcm-bricks chart. The rules file is embedded as a literal block under data.
apiVersion: v1
kind: ConfigMap
metadata:
  name: lcm-bricks-monitoring-rules
  # Fixed namespace, not the release namespace.
  # NOTE(review): presumably where the Prometheus rule loader looks - confirm.
  namespace: monitoring
  labels:
    # Rendered values are quoted so that anything that looks like a number or
    # a boolean (for example an all-digit release name) stays a string, which
    # is what Kubernetes requires for label values.
    app.kubernetes.io/name: {{ include "lcm-bricks.name" . | quote }}
    team: lcm
    helm.sh/chart: {{ include "lcm-bricks.chart" . | quote }}
    app.kubernetes.io/instance: {{ .Release.Name | quote }}
    app.kubernetes.io/managed-by: {{ .Release.Service | quote }}
data:
  lcm-bricks-monitoring-rules.yaml: |+
    groups:
    - name: lcm-bricks-monitoring-rules
      rules:
      # Pod restarts: record the 10-minute increase of the container restart
      # counter in the release namespace, then alert on it at two thresholds
      # (warning at >= 1, critical at >= 2). Both alerts share the same name
      # and differ only by severity.
      - record: "container_pod:lcm_pod_container_status_restarts:increase10m"
        expr: increase(kube_pod_container_status_restarts_total{namespace='{{ .Release.Namespace }}'}[10m])
      - alert: "[LCM] Pod has too many restarts on cluster={{ .Values.clusterId }}"
        expr: container_pod:lcm_pod_container_status_restarts:increase10m >= 1
        labels:
          severity: warning
          team: lcm # switch to msf in production
          # Quoted so a numeric or boolean-looking cluster id stays a string.
          cluster_id: {{ .Values.clusterId | quote }}
        annotations:
          description: "There is more than 0 restarts of {{`{{ $labels.pod }}`}} pod in the last 10 minutes"
          summary: "{{`{{ $labels.pod }}`}} pod has too many restarts"
      - alert: "[LCM] Pod has too many restarts on cluster={{ .Values.clusterId }}"
        expr: container_pod:lcm_pod_container_status_restarts:increase10m >= 2
        labels:
          severity: critical
          team: lcm # switch to msf in production
          # Quoted so a numeric or boolean-looking cluster id stays a string.
          cluster_id: {{ .Values.clusterId | quote }}
        annotations:
          description: "There is more than 1 restart of {{`{{ $labels.pod }}`}} pod in the last 10 minutes"
          summary: "{{`{{ $labels.pod }}`}} pod has too many restarts"
      # OOMKills: record the 10-minute increase of the OOMKilled
      # terminated-reason series in the release namespace, then alert at two
      # thresholds (warning at >= 1, critical at >= 2).
      # NOTE(review): kube-state-metrics documents this metric as a gauge,
      # while increase() is intended for counters - confirm the expression
      # fires as expected before relying on it.
      - record: "container_pod:lcm_pod_container_status_oomkilled:increase10m"
        expr: increase(kube_pod_container_status_terminated_reason{namespace='{{ .Release.Namespace }}', reason='OOMKilled'}[10m])
      # The alert names keep their original spelling on purpose: they may be
      # referenced by routes or silences (TODO confirm before renaming).
      - alert: "[LCM] OOMKill occured on cluster={{ .Values.clusterId }}"
        expr: container_pod:lcm_pod_container_status_oomkilled:increase10m >= 1
        labels:
          severity: warning
          team: lcm # switch to msf in production
          # Quoted so a numeric or boolean-looking cluster id stays a string.
          cluster_id: {{ .Values.clusterId | quote }}
        annotations:
          # The window in the text matches the 10m range of the recording rule.
          description: "{{`{{ $labels.pod }}`}} was OOMKilled in the last 10 minutes. Investigate and/or increase memoryRequest or memoryLimit."
          summary: "{{`{{ $labels.pod }}`}} OOMKill occurred"
      - alert: "[LCM] OOMKill occured on cluster={{ .Values.clusterId }}"
        expr: container_pod:lcm_pod_container_status_oomkilled:increase10m >= 2
        labels:
          severity: critical
          team: lcm # switch to msf in production
          # Quoted so a numeric or boolean-looking cluster id stays a string.
          cluster_id: {{ .Values.clusterId | quote }}
        annotations:
          description: "{{`{{ $labels.pod }}`}} was OOMKilled in the last 10 minutes. Investigate and/or increase memoryRequest or memoryLimit."
          summary: "{{`{{ $labels.pod }}`}} OOMKill occurred"
      # CPU throttling: fires when the per-second rate of CFS throttled time
      # in the release namespace stays above 1 for 5 minutes.
      # NOTE(review): the annotations read the pod_name label - confirm the
      # scraped series actually carries pod_name rather than pod.
      - alert: "[LCM] Container is being throttled on cluster={{ .Values.clusterId }}"
        expr: rate(container_cpu_cfs_throttled_seconds_total{namespace='{{ .Release.Namespace }}'}[1m]) > 1
        for: 5m
        labels:
          severity: critical
          team: lcm # switch to msf in production
          # Quoted so a numeric or boolean-looking cluster id stays a string.
          cluster_id: {{ .Values.clusterId | quote }}
        annotations:
          description: "{{`{{ $labels.pod_name }}`}} container is being throttled and probably hit CPU limit. Investigate root cause and increase limit and/or number of replicas if necessary."
          summary: "{{`{{ $labels.pod_name }}`}} Container is being throttled"
      # GC pauses: fires when the per-second rate of accumulated JVM GC pause
      # time in the release namespace stays above 1 for 5 minutes.
      # This rule selects on kubernetes_namespace and reports
      # kubernetes_pod_name, unlike the kube-state-metrics based rules.
      - alert: "[LCM] is doing too much pause GC on cluster={{ .Values.clusterId }}"
        expr: rate(jvm_gc_pause_seconds_sum{kubernetes_namespace='{{ .Release.Namespace }}'}[1m]) > 1
        for: 5m
        labels:
          severity: critical
          team: lcm # switch to msf in production
          # Quoted so a numeric or boolean-looking cluster id stays a string.
          cluster_id: {{ .Values.clusterId | quote }}
        annotations:
          description: "{{`{{ $labels.kubernetes_pod_name }}`}} container is spending too much time in pause garbage collector. Investigate root cause and increase heap size and/or number of replicas if necessary."
          summary: "{{`{{ $labels.kubernetes_pod_name }}`}} is doing too much pause GC"
      # Job pile-up: fires as soon as more than 100 Job objects exist in the
      # release namespace (no "for" delay).
      # The namespace is taken from the release, like every other rule in this
      # group, instead of being hard-coded.
      # NOTE(review): if jobs are deliberately created in a fixed "lcm"
      # namespace that differs from the release namespace, revert the selector.
      - alert: "[LCM] there is more than 100 jobs on cluster={{ .Values.clusterId }}"
        expr: count(kube_job_info{namespace='{{ .Release.Namespace }}'}) > 100
        labels:
          severity: critical
          team: lcm # switch to msf in production
          # Quoted so a numeric or boolean-looking cluster id stays a string.
          cluster_id: {{ .Values.clusterId | quote }}
        annotations:
          description: "There is more than 100 jobs in LCM namespace. They are likely not deleted."
          summary: "There is more than 100 jobs in LCM namespace."