如果日志文件长时间没有新内容写入，服务很可能已经出现问题，因此需要对日志的最后写入时间进行监控。

    创建一个测试日志文件

    1. touch /tmp/catalina.out

    编写agent.sh

    [root@bogon rules]# cat /data/scripts/agent.sh 
    #!/bin/bash
    
    #########################################################################
    # File Name: agent.sh
    # Created on: 2021-03-14 11:36:19
    # Author: Wu Kang
    # Last Modified: 2021-03-15 11:29:17
    # Description: 自定义监控
    #########################################################################
    
    # Identity labels attached to every metric line the collectors build.
    HOST=$(hostname)
    # First address printed by `ip a` on an "inet" line that does not match
    # 127.0.0.1, fe80 or "::"; the prefix length after "/" is stripped.
    IP=$(ip a \
      | grep inet \
      | egrep -v "127.0.0.1|fe80|::" \
      | awk -F'/' '{print $1}' \
      | awk '{print $2}' \
      | head -1)
    
    
    #######################################
    # Emit one process_count sample per name listed in process_list.txt.
    # Names are whitespace-separated; each one is used as a grep pattern
    # against `ps -ef`, and lines containing "grep" are not counted.
    # Globals:   HOST, IP (read)
    # Arguments: none
    # Outputs:   each sample line on stdout and appended to tmpdata.txt
    # Returns:   1 if process_list.txt is not readable, 0 otherwise
    #######################################
    function get_process_count(){
      local -a names=()
      local process_name count line

      if [[ ! -r process_list.txt ]]; then
        printf 'get_process_count: cannot read process_list.txt\n' >&2
        return 1
      fi

      # Split on whitespace WITHOUT pathname expansion, so an entry such as
      # "ngin[x]" is kept literally instead of being replaced by a matching
      # file name from the current directory. read returns non-zero at EOF
      # when no NUL delimiter is found, hence the "|| true".
      read -r -d '' -a names < process_list.txt || true

      for process_name in "${names[@]}"; do
        # "--" keeps a name starting with "-" from being parsed as an option.
        count=$(ps -ef | grep -- "$process_name" | grep -vc grep)
        printf -v line 'process_count{host="%s",process_name="%s",ip="%s"} %s' \
          "$HOST" "$process_name" "$IP" "$count"
        printf '%s\n' "$line" >>tmpdata.txt
        printf '%s\n' "$line"
      done
    }
    
    #######################################
    # Emit a log_delay sample: seconds elapsed since the log file was last
    # modified.
    # Globals:   HOST, IP (read)
    # Arguments: $1 - log file to inspect (optional, default /tmp/catalina.out)
    # Outputs:   the sample line on stdout and appended to tmpdata.txt;
    #            a diagnostic on stderr when the file cannot be inspected
    # Returns:   1 if the file cannot be stat'ed (no sample is emitted),
    #            0 otherwise
    #######################################
    function get_log_delay_second(){
      local logfile=${1:-/tmp/catalina.out}
      local now mtime delay_second line

      # Without this guard an empty mtime makes the arithmetic below a syntax
      # error, which aborts the caller's collection loop.
      if ! mtime=$(stat -c %Y -- "$logfile" 2>/dev/null); then
        printf 'get_log_delay_second: cannot stat %s\n' "$logfile" >&2
        return 1
      fi

      # Read the clock after the mtime so a write landing between the two
      # calls cannot produce a negative delay.
      now=$(date +%s)
      delay_second=$(( now - mtime ))

      printf -v line 'log_delay{host="%s",logfile="%s",ip="%s"} %s' \
        "$HOST" "$logfile" "$IP" "$delay_second"
      printf '%s\n' "$line" >>tmpdata.txt
      printf '%s\n' "$line"
    }
    
    
    #######################################
    # Emit one port_listen sample per port listed in port_list.txt. The value
    # is the number of listening TCP sockets reported by `ss -tnl` whose
    # local port is exactly that port.
    # Globals:   HOST, IP (read)
    # Arguments: none
    # Outputs:   each sample line on stdout and appended to tmpdata.txt
    # Returns:   1 if port_list.txt is not readable, 0 otherwise
    #######################################
    function get_port_listen(){
      local -a ports=()
      local port count line

      if [[ ! -r port_list.txt ]]; then
        printf 'get_port_listen: cannot read port_list.txt\n' >&2
        return 1
      fi

      # Whitespace-separated entries, read without pathname expansion. read
      # returns non-zero at EOF when no NUL delimiter is found.
      read -r -d '' -a ports < port_list.txt || true

      for port in "${ports[@]}"; do
        # Compare only the text after the last ":" of the local address
        # column. A plain substring match would count 8080, or the address
        # 192.168.0.80:22, as a listener on port 80.
        count=$(ss -tnl | awk -v want="$port" '
          { n = split($4, part, ":"); if ((part[n] "") == (want "")) hits++ }
          END { print hits + 0 }')
        printf -v line 'port_listen{host="%s",port="%s",ip="%s"} %s' \
          "$HOST" "$port" "$IP" "$count"
        printf '%s\n' "$line" >>tmpdata.txt
        printf '%s\n' "$line"
      done
    }
    
    # Run the three collectors one after another: process counts, log
    # delay, then listening ports.
    getdata() {
      get_process_count
      get_log_delay_second
      get_port_listen
    }
    
    
    #######################################
    # POST the samples collected in tmpdata.txt to the Pushgateway.
    # Globals:   PUSHGATEWAY_URL (read, optional) - overrides the target URL;
    #            defaults to http://192.168.0.15:9091/metrics/job/process
    # Arguments: none
    # Outputs:   curl error text on stderr when the push fails
    # Returns:   curl's exit status
    #######################################
    function pushdata(){
      local url=${PUSHGATEWAY_URL:-http://192.168.0.15:9091/metrics/job/process}

      # --fail:     report HTTP error replies as a non-zero exit status.
      # --max-time: bound the request so an unresponsive gateway cannot
      #             block the caller indefinitely.
      # NOTE(review): the URL carries only a job label; if several hosts push
      # to the same URL their samples may replace each other - confirm against
      # the Pushgateway grouping-key documentation.
      curl --silent --show-error --fail --max-time 10 \
        -XPOST --data-binary @tmpdata.txt "$url"
    }
    
    
    
    # Endless collection cycle: empty tmpdata.txt, gather fresh samples,
    # push them, then pause for one second before the next round.
    run() {
      while :; do
        : >tmpdata.txt
        getdata
        pushdata
        sleep 1
      done
    }
    
    # Script entry point: hand control to the collection loop.
    main() { run; }

    main
    

    编写告警规则

    [root@bogon rules]# vim /opt/prometheus/rules/log_delay.yml
    # Alerting rule for the log_delay metric pushed by agent.sh.
    groups:
    - name: process
      rules:
      # Alert name (Chinese): "log not written for a long time".
      - alert: 日志长时间无写入
        # log_delay is the number of seconds since the log file's last
        # modification, as computed by agent.sh; 120 s = 2 minutes.
        expr: log_delay >120
        # The expression must stay true for this long before the alert fires.
        for: 3s 
        labels:
          severity: warning
        annotations: 
          # The logfile, host and ip labels are the ones agent.sh attaches to
          # every log_delay sample.
          summary: "{{ $labels.logfile }} 2 分钟无写入"
          description: "host {{ $labels.host }} {{ $labels.ip }}{{ $labels.logfile }} 长时间无写入,value is {{ $value }}"
    
    systemctl reload prometheus
    

    钉钉告警相关截图
    image.png