nagios|icinga 监控特定端口TCP连接数

运维监控   tcp连接数  

用法

  • 将脚本放在nagios-plugins目录下
    /usr/local/nagios/libexec/check_max_cons.sh
  • 添加自定义命令到 nrpe.cfg
    command[check_conns_args]=/usr/local/nagios/libexec/check_max_cons.sh -s $ARG1$ -w $ARG2$ -c $ARG3$
  • 重启 NRPE 使配置生效(如果 NRPE 由 xinetd 托管,直接执行 service xinetd restart 即可)
    /usr/local/nagios/bin/nrpe -c /usr/local/nagios/etc/nrpe.cfg -d

效果图

  • 最大连接数超过3000报警,4000严重
  • 当前连接数864

shell 插件 (check_max_cons.sh)

#!/bin/bash
# Nagios plugin exit statuses. They are declared readonly so that no later
# code can silently change the value the monitoring server receives.
readonly STATE_OK=0
readonly STATE_WARNING=1
readonly STATE_CRITICAL=2
readonly STATE_UNKNOWN=3

#######################################
# Print the usage text to stderr and terminate the plugin.
# Globals:   STATE_UNKNOWN (read; falls back to 3 when unset)
# Arguments: none
# Outputs:   usage text on stderr, nothing on stdout
# Returns:   never returns; exits with UNKNOWN because reaching this
#            function means the command line could not be understood
#######################################
help () {
        local command
        command=$(basename -- "$0")
        cat 1>&2 <<EOF
NAME
        ${command} -- check network status
SYNOPSIS
        ${command} [OPTION]
DESCRIPTION
        -H IP ADDRESS
        -p LOCAL PORT
        -s [TIME_WAIT|FIN_WAIT|ESTABLISHED|CLOSING|SYN_SEND|TIMED_WAIT|LISTEN]
        -w warning
        -c critical
        -l append the result to /var/log/tcp/tcp_stat_<date>.log
USAGE:
Total connections:
        $0 -w 1000 -c 2000
Port:
        $0 -p PORT -w 1000 -c 2000
Host and Port:
        $0 -H HOST -p PORT -w 1000 -c 2000
Status:
        $0 -H HOST -p PORT -s ESTABLISHED -w 1000 -c 2000
EOF
        exit "${STATE_UNKNOWN:-3}"
}

#######################################
# Validate that an option value is a non-negative decimal integer.
# Globals:   STATE_UNKNOWN (read; falls back to 3 when unset)
# Arguments: $1 - value to validate
# Outputs:   an error message on stderr when the value is rejected
# Returns:   0 when the value is valid; otherwise exits the script with
#            UNKNOWN, since a malformed argument is a usage error
#######################################
check_num () {
        local num_str="$1"
        # Match inside the shell: the value is never word-split or
        # glob-expanded, so input such as "*" or "1 2" cannot slip through,
        # and no external grep/bc is needed. Digits-only also rules out
        # negative numbers, so no separate sign check is required.
        if [[ ! "${num_str}" =~ ^[0-9]+$ ]]; then
                printf '%s\n' "${num_str} not a positive integers!" 1>&2
                exit "${STATE_UNKNOWN:-3}"
        fi
}

#######################################
# Validate that an option value is a dotted-quad IPv4 address.
# Globals:   STATE_UNKNOWN (read; falls back to 3 when unset)
# Arguments: $1 - address to validate
# Outputs:   an error message on stderr when the address is rejected
# Returns:   0 when the address is valid; otherwise exits the script with
#            UNKNOWN, since a malformed argument is a usage error
#######################################
check_ip () {
        local ip_str="$1"
        local ipv4_re='^[0-9]{1,3}(\.[0-9]{1,3}){3}$'
        local valid='yes'
        local octet
        local -a octets=()

        if [[ "${ip_str}" =~ ${ipv4_re} ]]; then
                IFS='.' read -r -a octets <<< "${ip_str}"
                for octet in "${octets[@]}"; do
                        # 10# forces base 10 so "08"/"09" are not treated as
                        # invalid octal numbers by the arithmetic context.
                        if (( 10#${octet} > 255 )); then
                                valid='no'
                        fi
                done
        else
                valid='no'
        fi

        if [[ "${valid}" == 'no' ]]; then
                printf '%s\n' "${ip_str} not a ip!" 1>&2
                exit "${STATE_UNKNOWN:-3}"
        fi
}

#######################################
# Validate the requested TCP state and choose the netstat command for it.
# Globals:   cmd (written) - command line later run to list the sockets
#            STATE_UNKNOWN (read; falls back to 3 when unset)
# Arguments: $1 - TCP state name; an empty value is accepted and leaves
#                 cmd untouched
# Outputs:   an error message on stderr when the state is not supported
# Returns:   0 when the state is accepted; otherwise exits the script with
#            UNKNOWN, since a malformed argument is a usage error
#######################################
check_state () {
        local stat_str="$1"

        if [[ -z "${stat_str}" ]]; then
                return 0
        fi

        case "${stat_str}" in
                # FIN_WAIT, SYN_SEND and TIMED_WAIT are kept for backward
                # compatibility; the remaining names are the state strings
                # netstat itself documents for the State column.
                TIME_WAIT|FIN_WAIT|ESTABLISHED|CLOSING|SYN_SEND|TIMED_WAIT|\
                SYN_SENT|SYN_RECV|FIN_WAIT1|FIN_WAIT2|CLOSE_WAIT|LAST_ACK)
                        # stat_str can only be one of the fixed words above,
                        # so embedding it in the command string is safe.
                        cmd="netstat -nt|grep ${stat_str}"
                        ;;
                LISTEN)
                        # Listening sockets are only shown with -l.
                        cmd="netstat -ntl"
                        ;;
                *)
                        echo "This script only support [TIME_WAIT|FIN_WAIT|FIN_WAIT1|FIN_WAIT2|ESTABLISHED|CLOSING|CLOSE_WAIT|LAST_ACK|SYN_SEND|SYN_SENT|SYN_RECV|TIMED_WAIT|LISTEN]" 1>&2
                        exit "${STATE_UNKNOWN:-3}"
                        ;;
        esac
}

#######################################
# Append the current connection summary to the daily log file
# /var/log/tcp/tcp_stat_<YYYY-MM-DD>.log.
# Globals:   info (read) - summary text to record
# Arguments: none
# Outputs:   one line "<date> <time> <summary>" appended to the log file,
#            with every ";" removed from the line
# Returns:   status of the last command that ran (the caller ignores it)
#######################################
logging () {
        local log_path='/var/log/tcp'
        local now_date log_name log

        # Take a single timestamp so the log line and the file name can
        # never disagree when the call straddles midnight.
        now_date=$(date +"%F %T")
        log_name="${now_date%% *}"
        log="${log_path}/tcp_stat_${log_name}.log"

        # Only root can create the directory under /var/log and hand it
        # over to the nagios account.
        if [[ "$(id -u)" == '0' ]]; then
                test -d "${log_path}" || mkdir -p -- "${log_path}/"
                # "owner:group" is the supported separator; the older
                # "owner.group" spelling is deprecated in GNU chown.
                chown -R nagios:nagios "${log_path}"
        fi

        printf '%s\n' "${now_date} ${info}" | sed 's/;//g' >> "${log}"
        test -f "${log}" && chown nagios:nagios "${log}"
}

#######################################
# Print the plugin result line: human-readable status, a "|" separator,
# then the Total_connections performance data.
# Globals:   info, total_connections_int, warning, critical, min, max (read)
# Arguments: $1 - status word to show (e.g. OK, Warning, Critical)
# Outputs:   one line on stdout
#######################################
message () {
        local status_label="$1"
        local perfdata="Total_connections=${total_connections_int};${warning};${critical};${min};${max}"
        printf '%s\n' "TCP status is ${status_label} - ${info}|${perfdata}"
}

# Parse the command line. Each option value is stored in a global and
# validated immediately; any unrecognised option prints the usage text.
while getopts "w:c:p:H:s:l" opt; do
        case "${opt}" in
                w)
                        warning="${OPTARG}"
                        check_num "${warning}"
                        ;;
                c)
                        critical="${OPTARG}"
                        check_num "${critical}"
                        ;;
                p)
                        port="${OPTARG}"
                        check_num "${port}"
                        ;;
                H)
                        ip="${OPTARG}"
                        check_ip "${ip}"
                        ;;
                s)
                        state="${OPTARG}"
                        check_state "${state}"
                        ;;
                l)
                        log_status='on'
                        ;;
                *)
                        help
                        ;;
        esac
done
# Drop the parsed options so that only stray positional arguments remain.
shift $(( OPTIND - 1 ))

# ---------------------------------------------------------------------------
# Main flow: validate the thresholds, count the matching TCP connections,
# optionally log the summary, then print the result line and exit with the
# matching Nagios status.
# ---------------------------------------------------------------------------

# Stray positional arguments or a missing -w are usage errors.
if [[ $# -gt 0 || -z "${warning}" ]]; then
        help
fi

if [[ -z "${critical}" ]]; then
        # "-w 0" without -c selects the "alert when nothing is connected"
        # mode handled further down; any other -w needs an explicit -c.
        if [[ "${warning}" == "0" ]]; then
                critical="${warning}"
        else
                echo "Critical can not be empty!" 1>&2
                exit "${STATE_UNKNOWN}"
        fi
# Plain [ ] is deliberate here and below: it compares in base 10 even when
# a value carries leading zeros, whereas (( )) would read "010" as octal.
elif [ "${warning}" -ge "${critical}" ]; then
        echo "-w ${warning} must lower than -c ${critical}!" 1>&2
        exit "${STATE_UNKNOWN}"
fi

# Base listing command: plain "netstat -nt" unless check_state picked one.
if [[ -z "${state}" ]]; then
        netstat_cmd="netstat -nt"
else
        netstat_cmd="${cmd}"
fi

# Build the address filter as a grep pattern instead of splicing the
# user-supplied host/port into a string that is later eval'ed.
ip_re="${ip//./\\.}"   # dots must match literally, not "any character"
addr_filter=''
if [[ -n "${ip}" && -n "${port}" ]]; then
        # The left anchor stops 1.2.3.4 from also matching 21.2.3.4.
        addr_filter="(^|[^0-9])${ip_re}:${port}[[:space:]]"
elif [[ -n "${ip}" ]]; then
        addr_filter="(^|[^0-9])${ip_re}:"
elif [[ -n "${port}" ]]; then
        addr_filter=":${port}[[:space:]]"
fi

# netstat_cmd is either the literal above or the string that check_state
# assembled from a fixed list of state names, so eval only ever sees
# trusted text here.
netstat_output=$(eval "${netstat_cmd}")
if [[ -n "${addr_filter}" ]]; then
        netstat_output=$(printf '%s\n' "${netstat_output}" | grep -E -- "${addr_filter}")
fi

# Summarise as "Total:N; STATE:n; ..." by counting the last column of
# every tcp line.
info=$(printf '%s\n' "${netstat_output}" | awk 'BEGIN{OFS=":";ORS="; "}/^tcp/{stats[$(NF)]+=1;sum++}END{print "Total",sum;for (stat in stats) {print stat,stats[stat]}}')

# Without any tcp line awk prints "Total:" with no number; normalise it.
if [[ ! "${info}" =~ [0-9] ]]; then
        info="Total:0"
fi

min=0
max=4096
total_re='Total:([0-9]+)'
if [[ "${info}" =~ ${total_re} ]]; then
        total_connections_str="${BASH_REMATCH[1]}"
        # 10# keeps a value with leading zeros from being read as octal.
        total_connections_int=$(( 10#${total_connections_str} ))
else
        echo "${total_connections_str} not a number!"
        exit "${STATE_UNKNOWN}"
fi

if [[ "${log_status}" == 'on' ]]; then
        logging
fi

# "-w 0" mode: having no connection at all is the problem being watched.
if [[ "${warning}" == "0" ]]; then
        if [ "${total_connections_int}" -eq 0 ]; then
                message "Warning"
                exit "${STATE_WARNING}"
        fi
        message "OK"
        exit "${STATE_OK}"
fi

# warning < critical is guaranteed above, so test the stricter bound first.
if [ "${total_connections_int}" -ge "${critical}" ]; then
        message "Critical"
        exit "${STATE_CRITICAL}"
elif [ "${total_connections_int}" -ge "${warning}" ]; then
        message "Warning"
        exit "${STATE_WARNING}"
fi
message "OK"
exit "${STATE_OK}"