有时候日志长时间未写入,我们认为服务可能有问题,需要监控起来
创建一个测试日志文件
touch /tmp/catalina.out
编写agent.sh
[root@bogon rules]# cat /data/scripts/agent.sh
#!/bin/bash
#########################################################################
# File Name: agent.sh
# Created on: 2021-03-14 11:36:19
# Author: Wu Kang
# Last Modified: 2021-03-15 11:29:17
# Description: 自定义监控
#########################################################################
# Label values shared by every metric line this agent emits.
HOST=$(hostname)
# First address reported by "ip a" that is not loopback, link-local (fe80)
# or written with "::". The CIDR suffix is cut off before the second
# whitespace-separated field (the address itself) is taken.
IP=$(ip a \
  | grep inet \
  | grep -Ev "127.0.0.1|fe80|::" \
  | awk -F'/' '{ split($1, fields, " "); print fields[2] }' \
  | head -n 1)
# Emit one process_count sample per name listed in process_list.txt.
#
# Globals:  HOST, IP (read) - used as label values.
# Inputs:   process_list.txt in the current directory; names are separated
#           by any whitespace (spaces, tabs or newlines). Names are taken
#           literally: no glob expansion is applied to them.
# Outputs:  every sample line is appended to tmpdata.txt and also printed
#           to stdout.
get_process_count() {
  local -a names=()
  local name count line

  # With -d '' read consumes the whole file and reports the EOF as a
  # non-zero status even though the array was filled, hence "|| true".
  # A missing list file leaves the array empty, so nothing is emitted.
  read -r -d '' -a names < process_list.txt || true

  for name in "${names[@]}"; do
    # Count "ps -ef" lines matching the name (used as a regex) after
    # dropping lines that mention grep. -e keeps a name that starts with
    # "-" from being parsed as an option; grep -c prints 0 on no match.
    count=$(ps -ef | grep -e "$name" | grep -vc grep)
    line="process_count{host=\"${HOST}\",process_name=\"${name}\",ip=\"${IP}\"} ${count}"
    printf '%s\n' "$line" >> tmpdata.txt
    printf '%s\n' "$line"
  done
}
# Emit a log_delay sample: seconds elapsed since a log file was last
# modified.
#
# Globals:    HOST, IP (read) - used as label values.
# Arguments:  $1 - optional path of the log file to inspect; defaults to
#                  /tmp/catalina.out.
# Outputs:    the sample line is appended to tmpdata.txt and also printed
#             to stdout.
# Returns:    1 (emitting no sample) when the file's modification time
#             cannot be read, 0 otherwise.
get_log_delay_second() {
  local logfile=${1:-/tmp/catalina.out}
  local now mtime delay_second line

  # Without this guard a missing file leaves the mtime empty and the
  # subtraction below turns into a shell syntax error.
  if ! mtime=$(stat -c %Y -- "$logfile"); then
    printf 'get_log_delay_second: cannot stat %s\n' "$logfile" >&2
    return 1
  fi

  now=$(date +%s)
  delay_second=$(( now - mtime ))
  line="log_delay{host=\"${HOST}\",logfile=\"${logfile}\",ip=\"${IP}\"} ${delay_second}"
  printf '%s\n' "$line" >> tmpdata.txt
  printf '%s\n' "$line"
}
# Emit one port_listen sample per port listed in port_list.txt; the value
# is the number of listening TCP sockets bound to exactly that port.
#
# Globals:  HOST, IP (read) - used as label values.
# Inputs:   port_list.txt in the current directory; ports are separated by
#           any whitespace (spaces, tabs or newlines).
# Outputs:  every sample line is appended to tmpdata.txt and also printed
#           to stdout.
get_port_listen() {
  local -a ports=()
  local port count line

  # With -d '' read consumes the whole file and reports the EOF as a
  # non-zero status even though the array was filled, hence "|| true".
  read -r -d '' -a ports < port_list.txt || true

  for port in "${ports[@]}"; do
    # Compare only the text after the last ":" of the local-address
    # column. A plain substring match would make port 80 also count 8080
    # or an address such as 192.168.0.80:22.
    count=$(ss -tnl | awk -v want="$port" '
      { n = split($4, parts, ":"); if (parts[n] == want) hits++ }
      END { print hits + 0 }')
    line="port_listen{host=\"${HOST}\",port=\"${port}\",ip=\"${IP}\"} ${count}"
    printf '%s\n' "$line" >> tmpdata.txt
    printf '%s\n' "$line"
  done
}
# Run every metric collector once, in a fixed order: process counts,
# log write delay, then listening ports.
getdata() {
  local collector
  for collector in get_process_count get_log_delay_second get_port_listen; do
    "$collector"
  done
}
# POST the collected samples in tmpdata.txt to a Pushgateway endpoint.
#
# Arguments:  $1 - optional push URL; defaults to
#                  http://192.168.0.15:9091/metrics/job/process.
# Returns:    the exit status of curl.
#
# NOTE(review): every host pushes to the same URL (job "process" with no
# further grouping labels) - confirm that pushes from different hosts do
# not replace each other's samples on the gateway.
pushdata() {
  local url=${1:-http://192.168.0.15:9091/metrics/job/process}

  # Bound both the connect phase and the whole transfer so that an
  # unresponsive gateway cannot stall the collection loop indefinitely.
  curl --request POST \
    --connect-timeout 5 \
    --max-time 10 \
    --data-binary @tmpdata.txt \
    "$url"
}
# Endless collection cycle: empty tmpdata.txt, gather all samples, push
# them, then pause for one second before starting over.
run() {
  while :; do
    : > tmpdata.txt
    getdata
    pushdata
    sleep 1
  done
}
# Script entry point: hand control to the collection loop.
main() {
  run
}

main "$@"
编写告警规则
[root@bogon rules]# vim /opt/prometheus/rules/log_delay.yml
groups:
  - name: process
    rules:
      # Fires when the log_delay value exceeds 120 (seconds since the log
      # file was last written, as pushed by agent.sh) and stays above it
      # for at least 3 seconds.
      - alert: 日志长时间无写入
        expr: log_delay > 120
        for: 3s
        labels:
          severity: warning
        annotations:
          summary: "{{ $labels.logfile }} 2 分钟无写入"
          description: "host {{ $labels.host }} {{ $labels.ip }}{{ $labels.logfile }} 长时间无写入,value is {{ $value }}"
systemctl reload prometheus
钉钉告警相关截图