cacti+nagios 整合企业级监控平台(四)

运维监控   nagios配置  

上一篇:cacti+nagios 整合企业级监控平台(三)

六、nagios 监控报警配置
  • nagios QQ群消息报警
    请点击如上地址查看
  • nagios 邮件报警配置
    • 由于系统自带的邮件程序发出的邮件容易被收件方邮件服务器拦截,所以改用 msmtp+mutt 发送报警邮件
    • msmtp+mutt配置过程
  • 测试邮件发送成功后,修改nagios配置文件
    vim /usr/local/nagios/etc/objects/commands.cfg
#增加以下内容
# 'notify-host-by-email-mutt' command definition
# Formats a host notification with printf and pipes it to mutt, which mails it
# to the contact's address ($CONTACTEMAIL$).
# NOTE: command_line must stay on ONE physical line. Nagios parses object files
# line by line, so a hard-wrapped command_line is rejected as an invalid directive.
define command{
        command_name    notify-host-by-email-mutt
        command_line    /usr/bin/printf "%b" "***** Nagios *****\n\nNotification Type: $NOTIFICATIONTYPE$\nHost: $HOSTNAME$\nState: $HOSTSTATE$\nAddress: $HOSTADDRESS$\nInfo: $HOSTOUTPUT$\n\nDate/Time: $LONGDATETIME$\n" | /usr/bin/mutt -s "** $NOTIFICATIONTYPE$ Host Alert: $HOSTNAME$ is $HOSTSTATE$ **" $CONTACTEMAIL$
        }

# 'notify-service-by-email-mutt' command definition
# Formats a service notification with printf and pipes it to mutt, which mails
# it to the contact's address ($CONTACTEMAIL$).
# NOTE: command_line must stay on ONE physical line. A hard wrap splits macros
# in half (e.g. $SERVICESTATE$, $CONTACTEMAIL$) and makes the definition invalid.
define command{
        command_name    notify-service-by-email-mutt
        command_line    /usr/bin/printf "%b" "***** Nagios *****\n\nNotification Type: $NOTIFICATIONTYPE$\n\nService: $SERVICEDESC$\nHost: $HOSTALIAS$\nAddress: $HOSTADDRESS$\nState: $SERVICESTATE$\n\nDate/Time: $LONGDATETIME$\n\nAdditional Info:\n\n$SERVICEOUTPUT$\n" | /usr/bin/mutt -s "** $NOTIFICATIONTYPE$ Service Alert: $HOSTALIAS$/$SERVICEDESC$ is $SERVICESTATE$ **" $CONTACTEMAIL$
        }
  • 我这里把联系人和报警命令(通知方式)集成到了联系人模板中
    vim /usr/local/nagios/etc/objects/contact.cfg
# Contact template: notification settings shared by every contact that
# declares "use yunwei-contact". Not a real contact by itself (register 0).
# The notify-*-by-qq commands are not defined in this section; they must
# exist elsewhere in the configuration.
define contact{
        name                            yunwei-contact
        register                        0
        host_notification_period        24x7
        service_notification_period     24x7
        host_notification_options       d,u,r,f,s
        service_notification_options    w,u,c,r,f,s
        host_notification_commands      notify-host-by-email-mutt,notify-host-by-qq
        service_notification_commands   notify-service-by-email-mutt,notify-service-by-qq
        }
# Concrete contact; inherits periods, options and notification commands
# from the yunwei-contact template.
define contact{
        use                             yunwei-contact
        contact_name                    leoiceo
        alias                           lmb
        email                           leoiceo@gmail.com,158***34@139.com
}
# Contact group referenced by the cn-server / cn-service templates
# (contact_groups yunwei-contactgroup).
# "members" must list contact_name values of defined contacts; the only
# contact defined alongside this group is "leoiceo".
define contactgroup{
        contactgroup_name       yunwei-contactgroup
        alias                   Nagios Inception Contact
        members                 leoiceo
        }
# Base host template: generic on/off switches inherited by other host
# templates through "use host". Not a real host (register 0).
# NOTE(review): failure_prediction_enabled is a legacy directive; confirm the
# installed Nagios version still accepts it without warnings.
define host{
        name                            host
        register                        0
        notification_period             24x7
        notifications_enabled           1
        event_handler_enabled           1
        flap_detection_enabled          1
        failure_prediction_enabled      1
        process_perf_data               1
        retain_status_information       1
        retain_nonstatus_information    1
        }
# Host alerting template: check schedule and notification policy for hosts
# that declare "use cn-server". Inherits the switches of the "host" template.
# NOTE(review): the stock commands.cfg ships "check-host-alive" (singular);
# confirm that a command named "check-hosts-alive" is actually defined.
define host{
        name                            cn-server
        use                             host
        register                        0
        check_command                   check-hosts-alive
        check_period                    24x7                    ; time period during which the host is checked
        check_interval                  5                       ; regular check interval (5 minutes)
        retry_interval                  1                       ; interval between re-checks, in minutes
        ; Number of checks before a problem is treated as real: the host is not
        ; declared down on the first failure, because a short network congestion
        ; or similar glitch may be the cause. Here: at most 10 attempts.
        max_check_attempts              10
        notification_period             workhours               ; overrides the 24x7 inherited from "host"
        notification_interval           120                     ; re-notify every 120 minutes
        notification_options            d,u,r
        contact_groups                  yunwei-contactgroup
        }
# Base service template: generic switches plus default check/notification
# policy, inherited through "use service". Not a real service (register 0).
define service{
        name                            service
        register                        0

        ; feature switches
        active_checks_enabled           1
        passive_checks_enabled          1
        parallelize_check               1
        obsess_over_service             1
        check_freshness                 0
        notifications_enabled           1
        event_handler_enabled           1
        flap_detection_enabled          1
        failure_prediction_enabled      1
        process_perf_data               1
        retain_status_information       1
        retain_nonstatus_information    1
        is_volatile                     0

        ; check schedule
        check_period                    24x7
        max_check_attempts              3                       ; maximum number of checks for a service
        retry_check_interval            1                       ; interval between re-checks, in minutes

        ; notification policy
        ; options could be w,u,c,r; "u" (unknown state) is deliberately left out
        notification_options            w,c,r
        notification_interval           60
        notification_period             24x7

        notes_url               /nagios/cgi-bin/show.cgi?host=$HOSTNAME$&service=$SERVICEDESC$
        }

# Service alerting template: overrides the check schedule of the "service"
# template and routes notifications to yunwei-contactgroup.
define service{
        name                            cn-service
        use                             service
        register                        0
        normal_check_interval           3                       ; regular check interval: 3 minutes
        retry_check_interval            1                       ; interval between re-checks
        max_check_attempts              4                       ; number of checks after a problem is seen
        contact_groups                  yunwei-contactgroup
}
  • 定义一个服务组列表,别名用中文方便同事查看
    vim servicegroup.cfg
# Service group: system load (original tag: Check_Load)
define servicegroup{
        servicegroup_name       Current Load
        alias                   系统负载
}

# Service group: free space of the /data partition (original tag: Check_Data)
define servicegroup{
        servicegroup_name       Data Partition
        alias                   Data分区空闲率
}

# Service group: root (/) partition disk space (original tag: Check_Root)
# The alias displayed in the web UI reads "root partition free-space rate".
define servicegroup{
        servicegroup_name       Root Partition
        alias                   根分区空闲率
}

# Service group: swap space (original tag: Check_Swap)
# The alias displayed in the web UI reads "swap partition free-space rate".
define servicegroup{
        servicegroup_name       Swap Usage
        alias                   交换分区空闲率
}

# Service group: number of running processes (original tag: Check_Processes)
define servicegroup{
        servicegroup_name       Total Processes
        alias                   总进程数
}

# Service group: game port liveness checks (original tag: Check_Game)
define servicegroup{
        servicegroup_name       Game Port
        alias                   游戏端口存活状态
}

# Service group: zombie process monitoring (original tag: check_zombie_procs)
# NOTE(review): the group name is spelled "Processses" (three s). It is kept
# as-is because service definitions elsewhere may reference this exact name;
# rename it everywhere at once if the spelling is corrected.
define servicegroup{
        servicegroup_name       Zombie Processses
        alias                   僵尸进程监控
}
  • 配置主机组
    vim hostgroup.cfg
# Host group for the test servers; the alias is the long name displayed for the group.
# "members" lists host_name values that must be defined elsewhere.
define hostgroup{
        hostgroup_name  test-servers
        alias           测试服务器
        members         log,login
        }

# Host group for the development servers.
# "members" lists host_name values that must be defined elsewhere.
define hostgroup{
        hostgroup_name  dev-servers
        alias           开发服务器
        members         devtest
}
七、nagios-plugin 客户端配置
  • 安装nagios-plugins-2.0.3.tar.gz和nrpe-2.15.tar.gz 即可
# Build and install the Nagios plugins on the monitored host.
# No --prefix is given, so the default install prefix is used; the nrpe.cfg
# commands in this guide expect the checks in /usr/local/nagios/libexec.
tar zxf nagios-plugins-2.0.3.tar.gz
cd nagios-plugins-2.0.3
./configure
make && make install
cd ../

# Build and install the NRPE daemon.
# The prefix must be /usr/local/nagios: the rest of this guide writes
# /usr/local/nagios/etc/nrpe.cfg and starts /usr/local/nagios/bin/nrpe, so
# installing under any other prefix leaves those paths missing.
# Assumes the "nagios" user and group already exist on this host.
# --enable-command-args compiles in support for arguments passed from the server.
tar zxf nrpe-2.15.tar.gz
cd nrpe-2.15
./configure --prefix=/usr/local/nagios --enable-ssl --with-nrpe-user=nagios --with-nrpe-group=nagios --with-nagios-user=nagios --with-nagios-group=nagios --enable-command-args
make all
make install-plugin
make install-daemon
make install-daemon-config
# Overwrite the sample nrpe.cfg with a minimal configuration.
# - The heredoc delimiter is quoted ('EOF') so the shell does NOT expand
#   $ARG1$, $ARG2$, ... ; with an unquoted delimiter "$ARG1" is substituted
#   (normally with an empty string) and the written commands are broken.
# - The closing EOF must be alone on its line with no trailing whitespace,
#   otherwise the heredoc is never terminated.
# - Replace "nagios监控端的IP" in allowed_hosts with the IP of the Nagios
#   server before running this.
# - check_mem and check_net_traffic.sh are presumably custom scripts that must
#   be copied into /usr/local/nagios/libexec separately -- verify.
cat > /usr/local/nagios/etc/nrpe.cfg << 'EOF'
log_facility=daemon
pid_file=/var/run/nrpe.pid
server_port=5666
nrpe_user=nagios
nrpe_group=nagios
allowed_hosts=127.0.0.1,nagios监控端的IP
# Accept command arguments sent by the server (required by the *_args commands
# below). This widens the attack surface: keep allowed_hosts restrictive.
dont_blame_nrpe=1
debug=0
command_timeout=60
connection_timeout=300
command[check_uptime]=/usr/local/nagios/libexec/check_uptime
command[check_http_args]=/usr/local/nagios/libexec/check_http -H $ARG1$ -u /$ARG2$ -t 30
command[check_users_args]=/usr/local/nagios/libexec/check_users -w $ARG1$ -c $ARG2$
command[check_load_args]=/usr/local/nagios/libexec/check_load -w $ARG1$ -c $ARG2$
command[check_root_args]=/usr/local/nagios/libexec/check_disk -w $ARG1$ -c $ARG2$ -p /
command[check_data_args]=/usr/local/nagios/libexec/check_disk -w $ARG1$ -c $ARG2$ -p /data
command[check_disk_args]=/usr/local/nagios/libexec/check_disk -w $ARG1$ -c $ARG2$ -p /$ARG3$
command[check_zombie_procs_args]=/usr/local/nagios/libexec/check_procs -w $ARG1$ -c $ARG2$ -s Z
command[check_total_procs_args]=/usr/local/nagios/libexec/check_procs -w $ARG1$ -c $ARG2$
command[check_tcp_args]=/usr/local/nagios/libexec/check_tcp -p $ARG1$ -w $ARG2$ -c $ARG3$
command[check_swap_args]=/usr/local/nagios/libexec/check_swap -w $ARG1$ -c $ARG2$
command[check_mem_args]=/usr/local/nagios/libexec/check_mem -w $ARG1$ -c $ARG2$
command[check_net_args]=/usr/local/nagios/libexec/check_net_traffic.sh -d $ARG1$ -w $ARG2$ -c $ARG3$
EOF

# Start NRPE as a standalone daemon (-d) with the configuration written above.
/usr/local/nagios/bin/nrpe -c /usr/local/nagios/etc/nrpe.cfg -d