告警配置

  # Prometheus alerting rules for hosts scraped by the "node-exporter" job.
  # Every rule carries a `summary` and a `description` annotation; both are
  # templated with the labels of the firing series and the expression value.
  groups:
    - name: example
      rules:
        # Fires when a node-exporter target has reported up == 0 for 1 minute.
        - alert: "实例丢失"
          expr: up{job="node-exporter"} == 0
          for: 1m
          labels:
            severity: page
          annotations:
            summary: "服务器实例 {{ $labels.instance }} 丢失"
            description: "{{ $labels.instance }} 上的任务 {{ $labels.job }} 已经停止了 1 分钟以上了"

        # Used-space percentage (100 - available%) above 95 on
        # ext2/ext3/ext4/xfs filesystems.
        - alert: "磁盘容量小于 5%"
          expr: |
            100 - (
              node_filesystem_avail_bytes{job="node-exporter",mountpoint=~".*",fstype=~"ext4|xfs|ext2|ext3"} * 100
              / node_filesystem_size_bytes{job="node-exporter",mountpoint=~".*",fstype=~"ext4|xfs|ext2|ext3"}
            ) > 95
          for: 30s
          annotations:
            summary: "服务器实例 {{ $labels.instance }} 磁盘不足 告警通知"
            description: "{{ $labels.instance }}磁盘 {{ $labels.device }} 资源 已不足 5%, 当前值: {{ $value }}"

        # Used memory is computed as MemTotal - MemFree - Buffers - Cached;
        # fires when that exceeds 80% of MemTotal.
        - alert: "内存容量小于 20%"
          expr: ((node_memory_MemTotal_bytes - node_memory_MemFree_bytes - node_memory_Buffers_bytes - node_memory_Cached_bytes) / (node_memory_MemTotal_bytes )) * 100 > 80
          for: 30s
          labels:
            severity: warning
          annotations:
            summary: "服务器实例 {{ $labels.instance }} 内存不足 告警通知"
            description: "{{ $labels.instance }}内存资源已不足 20%,当前值: {{ $value }}"

        # 5-minute load average above 4.
        - alert: "CPU 平均负载大于 4 个"
          expr: node_load5 > 4
          for: 30s
          annotations:
            summary: "服务器实例 {{ $labels.instance }} CPU 负载 告警通知"
            description: "{{ $labels.instance }}CPU 平均负载(5 分钟) 已超过 4 ,当前值: {{ $value }}"

        # Read throughput on device "sda" above 30000000 bytes/s.
        - alert: "磁盘读 I/O 超过 30MB/s"
          expr: irate(node_disk_read_bytes_total{device="sda"}[1m]) > 30000000
          for: 30s
          annotations:
            summary: "服务器实例 {{ $labels.instance }} I/O 读负载 告警通知"
            description: "{{ $labels.instance }}I/O 每分钟读已超过 30MB/s,当前值: {{ $value }}"

        # Write throughput on device "sda" above 30000000 bytes/s.
        - alert: "磁盘写 I/O 超过 30MB/s"
          expr: irate(node_disk_written_bytes_total{device="sda"}[1m]) > 30000000
          for: 30s
          annotations:
            summary: "服务器实例 {{ $labels.instance }} I/O 写负载 告警通知"
            description: "{{ $labels.instance }}I/O 每分钟写已超过 30MB/s,当前值: {{ $value }}"

        # Transmit throughput on any interface except "lo" above
        # 10000000 bytes/s. The threshold is expressed in raw bytes/s, the
        # same unit the disk I/O rules use, so it matches the 10MB/s stated
        # in the alert name and description.
        - alert: "网卡流出速率大于 10MB/s"
          expr: irate(node_network_transmit_bytes_total{device!~"lo"}[1m]) > 10000000
          for: 30s
          annotations:
            summary: "服务器实例 {{ $labels.instance }} 网卡流量负载 告警通知"
            description: "{{ $labels.instance }}网卡 {{ $labels.device }} 流量已经超过 10MB/s, 当前值: {{ $value }}"

        # 100 minus the average idle-CPU rate (as a percentage) per
        # instance/job/env, above 90.
        # NOTE(review): irate needs at least two samples inside the 30s
        # window - confirm the scrape interval is short enough for that.
        - alert: "CPU 使用率大于 90%"
          expr: 100 - ((avg by (instance,job,env)(irate(node_cpu_seconds_total{mode="idle"}[30s]))) *100) > 90
          for: 30s
          annotations:
            summary: "服务器实例 {{ $labels.instance }} CPU 使用率 告警通知"
            description: "{{ $labels.instance }}CPU 使用率已超过 90%, 当前值: {{ $value }}"
  • 配置邮件告警
    1. vi alertmanager.yml
    配置如下:

    ```yaml
    global:
      resolve_timeout: 5m
      smtp_smarthost: 'smtp.qq.com:25'  # 邮箱smtp服务器代理
      smtp_from: '450733605@qq.com'  # 发送邮箱名称
      smtp_auth_username: '450733605@qq.com'  # 邮箱名称
      smtp_auth_password: 'ktzwkmykjrbpbjjc'  # 邮箱密码或授权码
      smtp_require_tls: false
    route:
      group_by: ['alertname']
      group_wait: 10s
      group_interval: 10s
      repeat_interval: 1h
      receiver: 'mail'
    receivers:
      - name: 'mail'
        email_configs:
          - to: '450733605@qq.com'
    ```