# Prometheus alerting rules, stored as a literal block scalar under the
# `rule.yml` key. Lines starting with `#` inside the scalar are comments of
# the embedded rules document.
rule.yml: |
  groups:
    # Host-level resource alerts (node_* metrics) and HTTP API alerts
    # (http_server_requests_seconds_* metrics).
    - name: basic-and-important
      rules:
        # Average non-idle CPU per instance above 80%, sustained for 10m.
        # rate() is used instead of irate(): irate() only considers the last
        # two samples, so a brief dip can reset the `for` timer.
        - alert: NodeCPUUsage
          expr: (100 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)) > 80
          for: 10m
          labels:
            severity: critical
          annotations:
            summary: "{{$labels.instance}}: High CPU usage detected"
            description: '{{$labels.instance}} CPU usage is above 80% (current value is {{ $value }})'

        # Share of memory that is not "available" above 80%, sustained for 10m.
        - alert: NodeMEMUsage
          expr: ((1 - (node_memory_MemAvailable_bytes / (node_memory_MemTotal_bytes))) * 100) > 80
          for: 10m
          labels:
            severity: critical
          annotations:
            summary: "{{$labels.instance}}: High Memory usage detected"
            description: '{{$labels.instance}} MEM usage is above 80% (current value is {{ $value }})'

        # Used fraction of ext4/xfs filesystems above 80%, sustained for 10m.
        # One series per filesystem; the annotations only name the instance.
        - alert: NodeDiskUsage
          expr: (1-(node_filesystem_free_bytes{fstype=~"ext4|xfs"} / node_filesystem_size_bytes{fstype=~"ext4|xfs"}))*100 > 80
          for: 10m
          labels:
            severity: critical
          annotations:
            summary: "{{$labels.instance}}: High Disk usage detected"
            description: '{{$labels.instance}} Disk usage is above 80% (current value is {{ $value }})'

        # Mean request duration over the last minute (sum / count) above 2s,
        # excluding the health endpoint. The description reports the `uri`
        # label, which is the label the expression selects on.
        # NOTE(review): a 1m range needs at least two samples per series;
        # confirm the scrape interval is well below 1m.
        - alert: API response time per min
          expr: increase(http_server_requests_seconds_sum{uri!="/actuator/health"}[1m])/increase(http_server_requests_seconds_count{uri!="/actuator/health"}[1m])>2
          for: 1m
          labels:
            severity: critical
          annotations:
            description: '{{$labels.job}} {{$labels.uri}} response time more than 2s. current value is {{ $value }}'

        # More than one request with a status other than "200" in the last
        # minute, excluding the health and prometheus actuator endpoints.
        # NOTE(review): every non-200 status (e.g. 201, 204, 3xx) is treated
        # as an error here; confirm that is intended.
        - alert: Count of API request times per min
          expr: increase(http_server_requests_seconds_count{uri!="/actuator/health",uri!="/actuator/prometheus",status!="200"}[1m])>1
          for: 1m
          labels:
            severity: critical
          annotations:
            description: '{{$labels.job}} {{$labels.uri}} request error times is {{ $value }} in recent one min'

    # RabbitMQ queue depth and client-side error/recovery counters.
    - name: rabbitmq-monitoring
      rules:
        # More than 10 messages in a queue for 5m. Queues whose name ends in
        # `_DL` are excluded (presumably dead-letter queues; verify).
        - alert: rabbitmq_queue_messages
          expr: rabbitmq_queue_messages{queue!~".*_DL"} > 10
          for: 5m
          labels:
            severity: critical
          annotations:
            description: 'queue name:{{$labels.queue}} is blocked. current count is {{ $value }}'

        # Consumer error counter grew by more than 10 within one minute.
        - alert: rabbitmq_consumer_error_total
          expr: increase(rabbitmq_consumer_error_total[1m]) > 10
          for: 1m
          labels:
            severity: critical
          annotations:
            description: 'service name:{{$labels.job}} cannot consume the queues. current count is {{ $value }}'

        # Connection recovery counter grew by more than 10 within one minute.
        - alert: rabbitmq_connection_recovery_total
          expr: increase(rabbitmq_connection_recovery_total[1m]) > 10
          for: 1m
          labels:
            severity: critical
          annotations:
            description: 'service name:{{$labels.job}} connection recovery total is {{ $value }}'