# Prometheus alerting rules, embedded as a literal block scalar under the
# `rule.yml` key. Two rule groups are defined:
#   - basic-and-important: node CPU / memory / disk usage and HTTP API checks
#   - rabbitmq-monitoring: queue depth, consumer errors, connection recoveries
rule.yml: |
  groups:
    - name: basic-and-important
      rules:
        # rate() rather than irate(): irate() only looks at the last two
        # samples, so brief changes can reset the `for` clause of an alert.
        - alert: NodeCPUUsage
          expr: (100 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)) > 80
          for: 10m
          labels:
            severity: critical
          annotations:
            summary: "{{$labels.instance}}: High CPU usage detected"
            description: '{{$labels.instance}} CPU usage is above 80% (current value is {{ $value }})'

        - alert: NodeMEMUsage
          expr: ((1 - (node_memory_MemAvailable_bytes / (node_memory_MemTotal_bytes))) * 100) > 80
          for: 10m
          labels:
            severity: critical
          annotations:
            summary: "{{$labels.instance}}: High Memory usage detected"
            description: '{{$labels.instance}} MEM usage is above 80% (current value is {{ $value }})'

        # Only ext4 and xfs filesystems are evaluated.
        - alert: NodeDiskUsage
          expr: (1-(node_filesystem_free_bytes{fstype=~"ext4|xfs"} / node_filesystem_size_bytes{fstype=~"ext4|xfs"}))*100 > 80
          for: 10m
          labels:
            severity: critical
          annotations:
            summary: "{{$labels.instance}}: High Disk usage detected"
            description: '{{$labels.instance}} Disk usage is above 80% (current value is {{ $value }})'

        # Total request seconds divided by request count over the last minute,
        # excluding the health endpoint. The description templates the `uri`
        # label because that is the label the expression selects on.
        - alert: API response time per min
          expr: increase(http_server_requests_seconds_sum{uri!="/actuator/health"}[1m])/increase(http_server_requests_seconds_count{uri!="/actuator/health"}[1m])>2
          for: 1m
          labels:
            severity: critical
          annotations:
            description: '{{$labels.job}} {{$labels.uri}} response time more than 2s. current value is {{ $value }}'

        # Counts requests whose status is anything other than "200", excluding
        # the health and prometheus actuator endpoints.
        # NOTE(review): status!="200" also matches other 2xx/3xx responses -
        # confirm that treating them as errors is intended.
        - alert: Count of API request times per min
          expr: increase(http_server_requests_seconds_count{uri!="/actuator/health",uri!="/actuator/prometheus",status!="200"}[1m])>1
          for: 1m
          labels:
            severity: critical
          annotations:
            description: '{{$labels.job}} {{$labels.uri}} request error times is {{ $value }} in recent one min'

    - name: rabbitmq-monitoring
      rules:
        # Queues whose name ends in "_DL" are excluded from this check.
        - alert: rabbitmq_queue_messages
          expr: rabbitmq_queue_messages{queue!~".*_DL"} > 10
          for: 5m
          labels:
            severity: critical
          annotations:
            description: 'queue name:{{$labels.queue}} is blocked. current count is {{ $value }}'

        - alert: rabbitmq_consumer_error_total
          expr: increase(rabbitmq_consumer_error_total[1m]) > 10
          for: 1m
          labels:
            severity: critical
          annotations:
            description: 'service name:{{$labels.job}} cannot consume the queues. current count is {{ $value }}'

        - alert: rabbitmq_connection_recovery_total
          expr: increase(rabbitmq_connection_recovery_total[1m]) > 10
          for: 1m
          labels:
            severity: critical
          annotations:
            description: 'service name:{{$labels.job}} connection recovery total is {{ $value }}'
