432 lines
16 KiB
Perl
432 lines
16 KiB
Perl
#!/usr/bin/perl -w
|
|
# check_nginx_status.pl
|
|
# Version : 0.10
|
|
# Author : regis.leroy at makina-corpus.com
|
|
# Licence : GPL - http://www.fsf.org/licenses/gpl.txt
|
|
#
|
|
# help : ./check_nginx_status.pl -h
|
|
#
|
|
# issues & updates: http://github.com/regilero/check_inginx_status
|
|
use strict;
|
|
use Getopt::Long;
|
|
use LWP::UserAgent;
|
|
use Time::HiRes qw(gettimeofday tv_interval);
|
|
use Digest::MD5 qw(md5 md5_hex);
|
|
use FindBin;
|
|
|
|
|
|
# Nagios specific
|
|
use lib $FindBin::Bin;
|
|
use utils qw($TIMEOUT);
|
|
|
|
# Globals
|
|
my $Version='0.9';
|
|
my $Name=$0;
|
|
|
|
my $o_host = undef; # hostname
|
|
my $o_help= undef; # want some help ?
|
|
my $o_port= undef; # port
|
|
my $o_url = undef; # url to use, if not the default
|
|
my $o_user= undef; # user for auth
|
|
my $o_pass= ''; # password for auth
|
|
my $o_realm= ''; # password for auth
|
|
my $o_version= undef; # print version
|
|
my $o_warn_a_level= -1; # Number of active connections that will cause a warning
|
|
my $o_crit_a_level= -1; # Number of active connections that will cause an error
|
|
my $o_warn_rps_level= -1; # Number of Request per second that will cause a warning
|
|
my $o_crit_rps_level= -1; # Number of request Per second that will cause an error
|
|
my $o_warn_cps_level= -1; # Number of Connections per second that will cause a warning
|
|
my $o_crit_cps_level= -1; # Number of Connections per second that will cause an error
|
|
my $o_timeout= 15; # Default 15s Timeout
|
|
my $o_warn_thresold= undef; # warning thresolds entry
|
|
my $o_crit_thresold= undef; # critical thresolds entry
|
|
my $o_debug= undef; # debug mode
|
|
my $o_servername= undef; # ServerName (host header in http request)
|
|
my $o_https= undef; # SSL (HTTPS) mode
|
|
|
|
my $TempPath = '/tmp/'; # temp path
|
|
my $MaxTimeDif = 60*30; # Maximum uptime difference (seconds), default 30 minutes
|
|
|
|
my $nginx = 'NGINX'; # Could be used to store version also
|
|
|
|
# functions
|
|
sub show_versioninfo { print "$Name version : $Version\n"; }
|
|
|
|
sub print_usage {
|
|
print "Usage: $Name -H <host ip> [-p <port>] [-s servername] [-t <timeout>] [-w <WARN_THRESOLD> -c <CRIT_THRESOLD>] [-V] [-d] [-u <url>] [-U user -P pass -r realm]\n";
|
|
}
|
|
sub nagios_exit {
|
|
my ( $nickname, $status, $message, $perfdata , $silent) = @_;
|
|
my %STATUSCODE = (
|
|
'OK' => 0
|
|
, 'WARNING' => 1
|
|
, 'CRITICAL' => 2
|
|
, 'UNKNOWN' => 3
|
|
, 'PENDING' => 4
|
|
);
|
|
if(!defined($silent)) {
|
|
my $output = undef;
|
|
$output .= sprintf('%1$s %2$s - %3$s', $nickname, $status, $message);
|
|
if ($perfdata) {
|
|
$output .= sprintf('|%1$s', $perfdata);
|
|
}
|
|
$output .= chr(10);
|
|
print $output;
|
|
}
|
|
exit $STATUSCODE{$status};
|
|
}
|
|
|
|
# Get the alarm signal
|
|
$SIG{'ALRM'} = sub {
|
|
nagios_exit($nginx,"CRITICAL","ERROR: Alarm signal (Nagios timeout)");
|
|
};
|
|
|
|
sub help {
|
|
print "Nginx Monitor for Nagios version ",$Version,"\n";
|
|
print "GPL licence, (c)2012 Leroy Regis\n\n";
|
|
print_usage();
|
|
print <<EOT;
|
|
-h, --help
|
|
print this help message
|
|
-H, --hostname=HOST
|
|
name or IP address of host to check
|
|
-p, --port=PORT
|
|
Http port
|
|
-u, --url=URL
|
|
Specific URL to use, instead of the default "http://<hostname or IP>/nginx_status"
|
|
-s, --servername=SERVERNAME
|
|
ServerName, (host header of HTTP request) use it if you specified an IP in -H to match the good Virtualhost in your target
|
|
-S, --ssl
|
|
Wether we should use HTTPS instead of HTTP
|
|
-U, --user=user
|
|
Username for basic auth
|
|
-P, --pass=PASS
|
|
Password for basic auth
|
|
-r, --realm=REALM
|
|
Realm for basic auth
|
|
-d, --debug
|
|
Debug mode (show http request response)
|
|
-m, --maxreach=MAX
|
|
Number of max processes reached (since last check) that should trigger an alert
|
|
-t, --timeout=INTEGER
|
|
timeout in seconds (Default: $o_timeout)
|
|
-w, --warn=ACTIVE_CONN,REQ_PER_SEC,CONN_PER_SEC
|
|
number of active connections, ReqPerSec or ConnPerSec that will cause a WARNING
|
|
-1 for no warning
|
|
-c, --critical=ACTIVE_CONN,REQ_PER_SEC,CONN_PER_SEC
|
|
number of active connections, ReqPerSec or ConnPerSec that will cause a CRITICAL
|
|
-1 for no CRITICAL
|
|
-V, --version
|
|
prints version number
|
|
|
|
Note :
|
|
3 items can be managed on this check, this is why -w and -c parameters are using 3 values thresolds
|
|
- ACTIVE_CONN: Number of all opened connections, including connections to backends
|
|
- REQ_PER_SEC: Average number of request per second between this check and the previous one
|
|
- CONN_PER_SEC: Average number of connections per second between this check and the previous one
|
|
|
|
Examples:
|
|
|
|
This one will generate WARNING and CRITICIAL alerts if you reach 10 000 or 20 000 active connection; or
|
|
100 or 200 request per second; or 200 or 300 connections per second
|
|
check_nginx_status.pl -H 10.0.0.10 -u /foo/nginx_status -s mydomain.example.com -t 8 -w 10000,100,200 -c 20000,200,300
|
|
|
|
this will generate WARNING and CRITICAL alerts only on the number of active connections (with low numbers for nginx)
|
|
check_nginx_status.pl -H 10.0.0.10 -s mydomain.example.com -t 8 -w 10,-1,-1 -c 20,-1,-1
|
|
|
|
theses two equivalents will not generate any alert (if the nginx_status page is reachable) but could be used for graphics
|
|
check_nginx_status.pl -H 10.0.0.10 -s mydomain.example.com -w -1,-1,-1 -c -1,-1,-1
|
|
check_nginx_status.pl -H 10.0.0.10 -s mydomain.example.com
|
|
|
|
EOT
|
|
}
|
|
|
|
sub check_options {
|
|
Getopt::Long::Configure ("bundling");
|
|
GetOptions(
|
|
'h' => \$o_help, 'help' => \$o_help,
|
|
'd' => \$o_debug, 'debug' => \$o_debug,
|
|
'H:s' => \$o_host, 'hostname:s' => \$o_host,
|
|
's:s' => \$o_servername, 'servername:s' => \$o_servername,
|
|
'S:s' => \$o_https, 'ssl:s' => \$o_https,
|
|
'u:s' => \$o_url, 'url:s' => \$o_url,
|
|
'U:s' => \$o_user, 'user:s' => \$o_user,
|
|
'P:s' => \$o_pass, 'pass:s' => \$o_pass,
|
|
'r:s' => \$o_realm, 'realm:s' => \$o_realm,
|
|
'p:i' => \$o_port, 'port:i' => \$o_port,
|
|
'V' => \$o_version, 'version' => \$o_version,
|
|
'w:s' => \$o_warn_thresold,'warn:s' => \$o_warn_thresold,
|
|
'c:s' => \$o_crit_thresold,'critical:s' => \$o_crit_thresold,
|
|
't:i' => \$o_timeout, 'timeout:i' => \$o_timeout,
|
|
);
|
|
|
|
if (defined ($o_help)) {
|
|
help();
|
|
nagios_exit($nginx,"UNKNOWN","leaving","",1);
|
|
}
|
|
if (defined($o_version)) {
|
|
show_versioninfo();
|
|
nagios_exit($nginx,"UNKNOWN","leaving","",1);
|
|
};
|
|
|
|
if (defined($o_warn_thresold)) {
|
|
($o_warn_a_level,$o_warn_rps_level,$o_warn_cps_level) = split(',', $o_warn_thresold);
|
|
}
|
|
if (defined($o_crit_thresold)) {
|
|
($o_crit_a_level,$o_crit_rps_level,$o_crit_cps_level) = split(',', $o_crit_thresold);
|
|
}
|
|
if (defined($o_debug)) {
|
|
print("\nDebug thresolds: \nWarning: ($o_warn_thresold) => Active: $o_warn_a_level ReqPerSec :$o_warn_rps_level ConnPerSec: $o_warn_cps_level");
|
|
print("\nCritical ($o_crit_thresold) => : Active: $o_crit_a_level ReqPerSec: $o_crit_rps_level ConnPerSec : $o_crit_cps_level\n");
|
|
}
|
|
if ((defined($o_warn_a_level) && defined($o_crit_a_level)) &&
|
|
(($o_warn_a_level != -1) && ($o_crit_a_level != -1) && ($o_warn_a_level >= $o_crit_a_level)) ) {
|
|
nagios_exit($nginx,"UNKNOWN","Check warning and critical values for Active Process (1st part of thresold), warning level must be < crit level!");
|
|
}
|
|
if ((defined($o_warn_rps_level) && defined($o_crit_rps_level)) &&
|
|
(($o_warn_rps_level != -1) && ($o_crit_rps_level != -1) && ($o_warn_rps_level >= $o_crit_rps_level)) ) {
|
|
nagios_exit($nginx,"UNKNOWN","Check warning and critical values for ReqPerSec (2nd part of thresold), warning level must be < crit level!");
|
|
}
|
|
if ((defined($o_warn_cps_level) && defined($o_crit_cps_level)) &&
|
|
(($o_warn_cps_level != -1) && ($o_crit_cps_level != -1) && ($o_warn_cps_level >= $o_crit_cps_level)) ) {
|
|
nagios_exit($nginx,"UNKNOWN","Check warning and critical values for ConnPerSec (3rd part of thresold), warning level must be < crit level!");
|
|
}
|
|
# Check compulsory attributes
|
|
if (!defined($o_host)) {
|
|
print_usage();
|
|
nagios_exit($nginx,"UNKNOWN","-H host argument required");
|
|
}
|
|
}
|
|
|
|
########## MAIN ##########
|
|
|
|
check_options();
|
|
|
|
my $override_ip = $o_host;
|
|
my $ua = LWP::UserAgent->new(
|
|
protocols_allowed => ['http', 'https'],
|
|
timeout => $o_timeout
|
|
);
|
|
# we need to enforce the HTTP request is made on the Nagios Host IP and
|
|
# not on the DNS related IP for that domain
|
|
@LWP::Protocol::http::EXTRA_SOCK_OPTS = ( PeerAddr => $override_ip );
|
|
# this prevent used only once warning in -w mode
|
|
my $ua_settings = @LWP::Protocol::http::EXTRA_SOCK_OPTS;
|
|
|
|
my $timing0 = [gettimeofday];
|
|
my $response = undef;
|
|
my $url = undef;
|
|
|
|
if (!defined($o_url)) {
|
|
$o_url='/nginx_status';
|
|
} else {
|
|
# ensure we have a '/' as first char
|
|
$o_url = '/'.$o_url unless $o_url =~ m(^/)
|
|
}
|
|
my $proto='http://';
|
|
if(defined($o_https)) {
|
|
$proto='https://';
|
|
if (defined($o_port) && $o_port!=443) {
|
|
if (defined ($o_debug)) {
|
|
print "\nDEBUG: Notice: port is defined at $o_port and not 443, check you really want that in SSL mode! \n";
|
|
}
|
|
}
|
|
}
|
|
if (defined($o_servername)) {
|
|
if (!defined($o_port)) {
|
|
$url = $proto . $o_servername . $o_url;
|
|
} else {
|
|
$url = $proto . $o_servername . ':' . $o_port . $o_url;
|
|
}
|
|
} else {
|
|
if (!defined($o_port)) {
|
|
$url = $proto . $o_host . $o_url;
|
|
} else {
|
|
$url = $proto . $o_host . ':' . $o_port . $o_url;
|
|
}
|
|
}
|
|
if (defined ($o_debug)) {
|
|
print "\nDEBUG: HTTP url: \n";
|
|
print $url;
|
|
}
|
|
|
|
my $req = HTTP::Request->new( GET => $url );
|
|
|
|
if (defined($o_servername)) {
|
|
$req->header('Host' => $o_servername);
|
|
}
|
|
if (defined($o_user)) {
|
|
$req->authorization_basic($o_user, $o_pass);
|
|
}
|
|
|
|
if (defined ($o_debug)) {
|
|
print "\nDEBUG: HTTP request: \n";
|
|
print "IP used (better if it's an IP):" . $override_ip . "\n";
|
|
print $req->as_string;
|
|
}
|
|
$response = $ua->request($req);
|
|
my $timeelapsed = tv_interval ($timing0, [gettimeofday]);
|
|
|
|
my $InfoData = '';
|
|
my $PerfData = '';
|
|
#my @Time = (localtime); # list context and not scalar as we want the brutal timestamp
|
|
my $Time = time;
|
|
|
|
my $webcontent = undef;
|
|
if ($response->is_success) {
|
|
$webcontent=$response->decoded_content;
|
|
if (defined ($o_debug)) {
|
|
print "\nDEBUG: HTTP response:";
|
|
print $response->status_line;
|
|
print "\n".$response->header('Content-Type');
|
|
print "\n";
|
|
print $webcontent;
|
|
}
|
|
if ($response->header('Content-Type') =~ m/text\/html/) {
|
|
nagios_exit($nginx,"CRITICAL", "We have a response page for our request, but it's an HTML page, quite certainly not the status report of nginx");
|
|
}
|
|
# example of response content expected:
|
|
#Active connections: 10
|
|
#server accepts handled requests
|
|
#38500 38500 50690
|
|
#Reading: 5 Writing: 5 Waiting: 0
|
|
|
|
# number of all open connections including connections to backends
|
|
my $ActiveConn = 0;
|
|
if($webcontent =~ m/Active connections: (.*?)\n/) {
|
|
$ActiveConn = $1;
|
|
# triming
|
|
$ActiveConn =~ s/^\s+|\s+$//g;
|
|
}
|
|
|
|
|
|
# 3 counters with a space: accepted conn, handled conn and number of requests
|
|
my $counters = '';
|
|
my $AcceptedConn = 0;
|
|
my $HandledConn = 0;
|
|
my $NbRequests = 0;
|
|
if($webcontent =~ m/\nserver accepts handled requests\n(.*?)\n/) {
|
|
$counters = $1;
|
|
# triming
|
|
$counters =~ s/^\s+|\s+$//g;
|
|
#splitting
|
|
($AcceptedConn,$HandledConn,$NbRequests) = split(' ', $counters);
|
|
# triming
|
|
$AcceptedConn =~ s/^\s+|\s+$//g;
|
|
$HandledConn =~ s/^\s+|\s+$//g;
|
|
$NbRequests =~ s/^\s+|\s+$//g;
|
|
}
|
|
|
|
# nginx reads request header
|
|
my $Reading = 0;
|
|
# nginx reads request body, processes request, or writes response to a client
|
|
my $Writing = 0;
|
|
# keep-alive connections, actually it is active - (reading + writing)
|
|
my $Waiting = 0;
|
|
if($webcontent =~ m/Reading: (.*?)Writing: (.*?)Waiting: (.*?)$/) {
|
|
$Reading = $1;
|
|
$Writing = $2;
|
|
$Waiting = $3;
|
|
# triming
|
|
$Reading =~ s/^\s+|\s+$//g;
|
|
$Writing =~ s/^\s+|\s+$//g;
|
|
$Waiting =~ s/^\s+|\s+$//g;
|
|
}
|
|
|
|
# Debug
|
|
if (defined ($o_debug)) {
|
|
print ("\nDEBUG Parse results => Active :" . $ActiveConn . "\nAcceptedConn :" . $AcceptedConn . "\nHandledConn :" . $HandledConn . "\nNbRequests :".$NbRequests . "\nReading :" .$Reading . "\nWriting :" . $Writing . "\nWaiting :" . $Waiting . "\n");
|
|
}
|
|
|
|
my $TempFile = $TempPath.$o_host.'_check_nginx_status'.md5_hex($url);
|
|
my $FH;
|
|
|
|
my $LastTime = 0;
|
|
my $LastAcceptedConn = 0;
|
|
my $LastHandledConn = 0;
|
|
my $LastNbRequests = 0;
|
|
if ((-e $TempFile) && (-r $TempFile) && (-w $TempFile)) {
|
|
open ($FH, '<',$TempFile) or nagios_exit($nginx,"UNKNOWN","unable to read temporary data from :".$TempFile);
|
|
$LastTime = <$FH>;
|
|
$LastAcceptedConn = <$FH>;
|
|
$LastHandledConn = <$FH>;
|
|
$LastNbRequests = <$FH>;
|
|
close ($FH);
|
|
if (defined ($o_debug)) {
|
|
print ("\nDebug: data from temporary file: $TempFile\n");
|
|
print (" LastTime: $LastTime LastAcceptedConn: $LastAcceptedConn LastHandledConn: $LastHandledConn LastNbRequests: $LastNbRequests \n");
|
|
}
|
|
}
|
|
|
|
open ($FH, '>'.$TempFile) or nagios_exit($nginx,"UNKNOWN","unable to write temporary data in :".$TempFile);
|
|
#print $FH (@Time),"\n";
|
|
print $FH "$Time\n";
|
|
print $FH "$AcceptedConn\n";
|
|
print $FH "$HandledConn\n";
|
|
print $FH "$NbRequests\n";
|
|
close ($FH);
|
|
|
|
my $ConnPerSec = 0;
|
|
my $ReqPerSec = 0;
|
|
my $RequestsNew = 0;
|
|
# by default the average
|
|
my $ReqPerConn = 0;
|
|
if ($AcceptedConn > 0) {
|
|
$ReqPerConn = $NbRequests/$AcceptedConn;
|
|
}
|
|
my $elapsed = $Time - $LastTime ;
|
|
if (defined ($o_debug)) {
|
|
print ("\nDebug: pre-computation\n");
|
|
print ("Average ReqPerconn: $ReqPerConn, Seconds elapsed Since last check: $elapsed\n");
|
|
}
|
|
# check only if the counters may have been incremented
|
|
# but not if it may have been too much incremented
|
|
if ( ($elapsed < $MaxTimeDif) && ($elapsed != 0) ) {
|
|
$ConnPerSec = ($AcceptedConn-$LastAcceptedConn)/$elapsed;
|
|
$RequestsNew = $NbRequests-$LastNbRequests;
|
|
$ReqPerSec = $RequestsNew/$elapsed;
|
|
# get finer value
|
|
if ( $ConnPerSec!=0 ) {
|
|
my $ReqPerConn = $ReqPerSec/$ConnPerSec;
|
|
} else {
|
|
my $ReqPerConn = 0;
|
|
}
|
|
}
|
|
if (defined ($o_debug)) {
|
|
print ("\nDebug: data computed\n");
|
|
print ("ConnPerSec: $ConnPerSec ReqPerSec: $ReqPerSec ReqPerConn: $ReqPerConn\n");
|
|
}
|
|
$InfoData = sprintf (" %.3f sec. response time, Active: %d (Writing: %d Reading: %d Waiting: %d)"
|
|
. " ReqPerSec: %.3f ConnPerSec: %.3f ReqPerConn: %.3f"
|
|
,$timeelapsed,$ActiveConn,$Writing,$Reading,$Waiting,$ReqPerSec,$ConnPerSec,$ReqPerConn);
|
|
$PerfData = sprintf ("Writing=%d;;;; Reading=%d;;;; Waiting=%d;;;; Active=%d;;;; "
|
|
. "ReqPerSec=%f;;;; ConnPerSec=%f;;;; ReqPerConn=%f;;;;"
|
|
,($Writing),($Reading),($Waiting),($ActiveConn)
|
|
,($ReqPerSec),($ConnPerSec),($ReqPerConn));
|
|
# first all critical exists by priority
|
|
if (defined($o_crit_a_level) && (-1!=$o_crit_a_level) && ($ActiveConn >= $o_crit_a_level)) {
|
|
nagios_exit($nginx,"CRITICAL", "Active Connections are critically high " . $InfoData,$PerfData);
|
|
}
|
|
if (defined($o_crit_rps_level) && (-1!=$o_crit_rps_level) && ($ReqPerSec >= $o_crit_rps_level)) {
|
|
nagios_exit($nginx,"CRITICAL", "Request per second ratios is critically high " . $InfoData,$PerfData);
|
|
}
|
|
if (defined($o_crit_cps_level) && (-1!=$o_crit_cps_level) && ($ConnPerSec >= $o_crit_cps_level)) {
|
|
nagios_exit($nginx,"CRITICAL", "Connection per second ratio is critically high " . $InfoData,$PerfData);
|
|
}
|
|
# Then WARNING exits by priority
|
|
if (defined($o_warn_a_level) && (-1!=$o_warn_a_level) && ($ActiveConn >= $o_warn_a_level)) {
|
|
nagios_exit($nginx,"WARNING", "Active Connections are high " . $InfoData,$PerfData);
|
|
}
|
|
if (defined($o_warn_rps_level) && (-1!=$o_warn_rps_level) && ($ReqPerSec >= $o_warn_rps_level)) {
|
|
nagios_exit($nginx,"WARNING", "Requests per second ratio is high " . $InfoData,$PerfData);
|
|
}
|
|
if (defined($o_warn_cps_level) && (-1!=$o_warn_cps_level) && ($ConnPerSec >= $o_warn_cps_level)) {
|
|
nagios_exit($nginx,"WARNING", "Connection per second ratio is high " . $InfoData,$PerfData);
|
|
}
|
|
|
|
nagios_exit($nginx,"OK",$InfoData,$PerfData);
|
|
|
|
} else {
|
|
nagios_exit($nginx,"CRITICAL", $response->status_line);
|
|
}
|