Adding check_smart
This commit is contained in:
		
							parent
							
								
									1b3695de71
								
							
						
					
					
						commit
						e776f51323
					
				
					 7 changed files with 352 additions and 1 deletions
				
			
		
							
								
								
									
										3
									
								
								check_smart/Makefile
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										3
									
								
								check_smart/Makefile
									
										
									
									
									
										Normal file
									
								
							| 
						 | 
				
			
			@ -0,0 +1,3 @@
 | 
			
		|||
#!/usr/bin/make -f
 | 
			
		||||
 | 
			
		||||
include ../common.mk
 | 
			
		||||
							
								
								
									
										332
									
								
								check_smart/check_smart
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										332
									
								
								check_smart/check_smart
									
										
									
									
									
										Normal file
									
								
							| 
						 | 
				
			
			@ -0,0 +1,332 @@
 | 
			
		|||
#!/usr/bin/perl -w
 | 
			
		||||
# Check SMART status of ATA/SCSI disks, returning any usable metrics as perfdata.
 | 
			
		||||
# For usage information, run ./check_smart -h
 | 
			
		||||
#
 | 
			
		||||
# This script was created under contract for the US Government and is therefore Public Domain
 | 
			
		||||
#
 | 
			
		||||
# Changes and Modifications
 | 
			
		||||
# =========================
 | 
			
		||||
# Feb 3, 2009: Kurt Yoder - initial version of script
 | 
			
		||||
# Mar 3, 2010: Giles Westwood - add support for hardware raid interfaces
 | 
			
		||||
 | 
			
		||||
use strict;
 | 
			
		||||
use Getopt::Long;
 | 
			
		||||
 | 
			
		||||
use File::Basename qw(basename);
 | 
			
		||||
my $basename = basename($0);
 | 
			
		||||
 | 
			
		||||
my $revision = '$Revision: 1.1 $';
 | 
			
		||||
 | 
			
		||||
use lib '/usr/lib/nagios/plugins/';
 | 
			
		||||
use utils qw(%ERRORS &print_revision &support &usage);
 | 
			
		||||
 | 
			
		||||
# Pin the command search path and blank the shell start-up hooks
# (BASH_ENV / ENV) before any external command is run.
@ENV{'BASH_ENV', 'ENV'} = ('', '');
$ENV{'PATH'} = '/bin:/usr/bin:/sbin:/usr/sbin';
 | 
			
		||||
 | 
			
		||||
use vars qw($opt_d $opt_debug $opt_h $opt_i $opt_v $opt_m);
 | 
			
		||||
# Command-line parsing. Every short switch has a long spelling that stores
# into the same $opt_* variable; --debug has no short form.
Getopt::Long::Configure('bundling');
my @option_spec = (
	"debug"       => \$opt_debug,
	"d=s"         => \$opt_d,
	"device=s"    => \$opt_d,
	"h"           => \$opt_h,
	"help"        => \$opt_h,
	"i=s"         => \$opt_i,
	"interface=s" => \$opt_i,
	"m=s"         => \$opt_m,
	"mode=s"      => \$opt_m,
	"v"           => \$opt_v,
	"version"     => \$opt_v,
);
GetOptions(@option_spec);

# Informational switches end the run with status OK; when both are given,
# the version banner wins over the help text.
if ($opt_v || $opt_h) {
	if ($opt_v) {
		print_revision($basename,$revision);
	}
	else {
		print_help();
	}
	exit $ERRORS{'OK'};
}
 | 
			
		||||
 | 
			
		||||
# Validate the command-line selections and derive the three values used by
# the checks that follow:
#   $device    - device handed to smartctl (from -d/--device)
#   $interface - smartctl device type (from -i/--interface)
#   $mode      - 'scsi' or 'ata' when forced with -m/--mode, else 'unused'
# Every rejected combination prints a message followed by the help text and
# exits with status UNKNOWN.
my ($device, $interface, $mode);

unless ($opt_d) {
	print "must specify a device!\n\n";
	print_help();
	exit $ERRORS{'UNKNOWN'};
}

unless ($opt_i) {
	print "must specify an interface for $opt_d using -i/--interface!\n\n";
	print_help();
	exit $ERRORS{'UNKNOWN'};
}

# Devices addressed through a 3ware/cciss/areca controller are exempt from
# the block-device test; everything else must be a block device.
# Note the use of "!~": writing "!$opt_i =~ /.../" negates $opt_i first and
# then matches the resulting empty string, so the test could never succeed.
if ($opt_i !~ /3ware|cciss|areca/ && !(-b $opt_d)) {
	print "$opt_d is not a valid block device!\n\n";
	print_help();
	exit $ERRORS{'UNKNOWN'};
}
$device = $opt_d;

# Accept every interface family named in the help text (usbjmicron included).
if ($opt_i =~
    /ata|scsi|3ware|cciss|areca|sat|marvell|usbsunplus|hpt|usbcypress|usbjmicron/) {
	$interface = $opt_i;
}
else {
	print "invalid interface $opt_i for $opt_d!\n\n";
	print_help();
	exit $ERRORS{'UNKNOWN'};
}

# Default to 'unused' so that later string comparisons never see undef,
# even when -m carries a value that is neither scsi nor ata.
# 'ata' takes precedence when the value happens to match both patterns.
$mode = 'unused';
if ($opt_m) {
	if ($opt_m =~ /ata/) {
		$mode = 'ata';
	}
	elsif ($opt_m =~ /scsi/) {
		$mode = 'scsi';
	}
}
 | 
			
		||||
 | 
			
		||||
# State shared by all checks: the smartctl invocation prefix, the list of
# problems found so far, and the plugin status (raised via escalate_status()).
my $smart_command = '/usr/bin/sudo /usr/sbin/smartctl';
my @error_messages = qw//;
my $exit_status = 'OK';


warn "###########################################################\n" if $opt_debug;
warn "(debug) CHECK 1: getting overall SMART health status\n" if $opt_debug;
warn "###########################################################\n\n\n" if $opt_debug;

# $full_command is the human-readable form shown in debug output.
my $full_command = "$smart_command -d $interface -H $device";
warn "(debug) executing:\n$full_command\n\n" if $opt_debug;

# Run smartctl through a list-form pipe open: $interface and $device are
# passed as single arguments and are never interpreted by a shell, so
# metacharacters in user-supplied values cannot inject commands under sudo.
my @output;
if (open(my $health_pipe, '-|', split(' ', $smart_command), '-d', $interface, '-H', $device)) {
	@output = <$health_pipe>;
	# the exit status of this invocation is not evaluated by this check
	close($health_pipe);
}
else {
	# an empty @output is reported below as "No health status line found"
	warn "(debug) could not run smartctl: $!\n\n" if $opt_debug;
}
warn "(debug) output:\n@output\n\n" if $opt_debug;

# parse ata output, looking for "health status: passed"
my $found_status = 0;
my $line_str = 'SMART overall-health self-assessment test result: '; # ATA SMART line
my $ok_str = 'PASSED'; # ATA SMART OK string

if ($interface eq 'scsi' || $mode eq 'scsi'){
	$line_str = 'SMART Health Status: '; # SCSI SMART line
	$ok_str = 'OK'; #SCSI SMART OK string
}

foreach my $line (@output){
	# \Q...\E: match the label literally; everything after it is the verdict
	next unless $line =~ /\Q$line_str\E(.+)/;
	my $reported_status = $1;
	$found_status = 1;
	warn "(debug) parsing line:\n$line\n\n" if $opt_debug;
	if ($reported_status eq $ok_str) {
		warn "(debug) found string '$ok_str'; status OK\n\n" if $opt_debug;
	}
	else {
		warn "(debug) no '$ok_str' status; failing\n\n" if $opt_debug;
		push(@error_messages, "Health status: $reported_status");
		escalate_status('CRITICAL');
	}
}

# No recognisable verdict at all (including a failed smartctl launch)
unless ($found_status) {
	push(@error_messages, 'No health status line found');
	escalate_status('UNKNOWN');
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
warn "###########################################################\n" if $opt_debug;
warn "(debug) CHECK 2: getting silent SMART health check\n" if $opt_debug;
warn "###########################################################\n\n\n" if $opt_debug;

# $full_command is the human-readable form shown in debug output.
$full_command = "$smart_command -d $interface -q silent -A $device";
warn "(debug) executing:\n$full_command\n\n" if $opt_debug;

# List-form system(): the arguments go straight to smartctl without a shell,
# so user-supplied $interface / $device cannot inject commands under sudo.
system(split(' ', $smart_command), '-d', $interface, '-q', 'silent', '-A', $device);
my $wait_status = $?;
my $launch_error = $!;

# $? is the raw wait status, not the exit value: -1 means the command could
# not be started, the low 7 bits carry a terminating signal, and the exit
# value proper sits in the high byte ($? >> 8).
my $return_code = 0;
if ($wait_status == -1) {
	push(@error_messages, "Could not execute smartctl: $launch_error");
	escalate_status('UNKNOWN');
}
elsif ($wait_status & 127) {
	push(@error_messages, 'smartctl terminated by signal ' . ($wait_status & 127));
	escalate_status('UNKNOWN');
}
else {
	$return_code = $wait_status >> 8;
}
warn "(debug) exit code:\n$return_code\n\n" if $opt_debug;

# Each entry maps one bit of the exit value to the message and severity this
# plugin reports for it, evaluated from the lowest bit to the highest.
my @exit_code_bits = (
	[ 0x01, 'Commandline parse failure',     'UNKNOWN'  ],
	[ 0x02, 'Device could not be opened',    'UNKNOWN'  ],
	[ 0x04, 'Checksum failure',              'WARNING'  ],
	[ 0x08, 'Disk is failing',               'CRITICAL' ],
	[ 0x10, 'Disk is in prefail',            'WARNING'  ],
	[ 0x20, 'Disk may be close to failure',  'WARNING'  ],
	[ 0x40, 'Error log contains errors',     'WARNING'  ],
	[ 0x80, 'Self-test log contains errors', 'WARNING'  ],
);
foreach my $bit_entry (@exit_code_bits) {
	my ($bit_mask, $bit_message, $bit_severity) = @{$bit_entry};
	next unless $return_code & $bit_mask;
	push(@error_messages, $bit_message);
	escalate_status($bit_severity);
}

if ($wait_status != 0) {
	warn "(debug) non-zero exit code, generating error condition\n\n" if $opt_debug;
}
else {
	warn "(debug) zero exit code, status OK\n\n" if $opt_debug;
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
warn "###########################################################\n" if $opt_debug;
warn "(debug) CHECK 3: getting detailed statistics\n" if $opt_debug;
warn "(debug) information contains a few more potential trouble spots\n" if $opt_debug;
warn "(debug) plus, we can also use the information for perfdata/graphing\n" if $opt_debug;
warn "###########################################################\n\n\n" if $opt_debug;

# $full_command is the human-readable form shown in debug output.
$full_command = "$smart_command -d $interface -A $device";
warn "(debug) executing:\n$full_command\n\n" if $opt_debug;

# Run smartctl through a list-form pipe open: $interface and $device are
# passed as single arguments and are never interpreted by a shell, so
# metacharacters in user-supplied values cannot inject commands under sudo.
@output = ();
if (open(my $attribute_pipe, '-|', split(' ', $smart_command), '-d', $interface, '-A', $device)) {
	@output = <$attribute_pipe>;
	# the exit status of this invocation is not evaluated by this check
	close($attribute_pipe);
}
else {
	warn "(debug) could not run smartctl: $!\n\n" if $opt_debug;
}
warn "(debug) output:\n@output\n\n" if $opt_debug;
my @perfdata = qw//;

# separate metric-gathering and output analysis for ATA vs SCSI SMART output
if ($interface eq 'ata' || $mode eq 'ata'){
	foreach my $line(@output){
		# get lines that look like this:
		#    9 Power_On_Minutes        0x0032   241   241   000    Old_age   Always       -       113h+12m
		# captures: attribute name, WHEN_FAILED column, leading digits of the raw value
		next unless $line =~ /^\s*\d+\s(\S+)\s+(?:\S+\s+){6}(\S+)\s+(\d+)/;
		my ($attribute_name, $when_failed, $raw_value) = ($1, $2, $3);

		# anything other than '-' in WHEN_FAILED is reported as a warning
		if ($when_failed ne '-'){
			push(@error_messages, "Attribute $attribute_name failed at $when_failed");
			escalate_status('WARNING');
			warn "(debug) parsed SMART attribute $attribute_name with error condition:\n$when_failed\n\n" if $opt_debug;
		}

		# some attributes produce questionable data; no need to graph them
		if (grep {$_ eq $attribute_name} ('Unknown_Attribute', 'Power_On_Minutes') ){
			next;
		}
		push (@perfdata, "$attribute_name=$raw_value");

		# do some manual checks
		if ( ($attribute_name eq 'Current_Pending_Sector') && $raw_value ) {
			push(@error_messages, "Sectors pending re-allocation");
			escalate_status('WARNING');
			warn "(debug) Current_Pending_Sector is non-zero ($raw_value)\n\n" if $opt_debug;
		}
	}
}
else{
	# SCSI-style output: pick individual "label: value" lines
	my ($current_temperature, $max_temperature, $current_start_stop, $max_start_stop);
	foreach my $line(@output){
		if ($line =~ /Current Drive Temperature:\s+(\d+)/){
			$current_temperature = $1;
		}
		elsif ($line =~ /Drive Trip Temperature:\s+(\d+)/){
			$max_temperature = $1;
		}
		elsif ($line =~ /Current start stop count:\s+(\d+)/){
			$current_start_stop = $1;
		}
		elsif ($line =~ /Recommended maximum start stop count:\s+(\d+)/){
			$max_start_stop = $1;
		}
		elsif ($line =~ /Elements in grown defect list:\s+(\d+)/){
			push (@perfdata, "defect_list=$1");
		}
		elsif ($line =~ /Blocks sent to initiator =\s+(\d+)/){
			push (@perfdata, "sent_blocks=$1");
		}
	}

	# temperature: the trip temperature goes into the critical slot of the perfdata
	if($current_temperature){
		if($max_temperature){
			push (@perfdata, "temperature=$current_temperature;;$max_temperature");
			if($current_temperature > $max_temperature){
				warn "(debug) Disk temperature is greater than max ($current_temperature > $max_temperature)\n\n" if $opt_debug;
				push(@error_messages, 'Disk temperature is higher than maximum');
				escalate_status('CRITICAL');
			}
		}
		else{
			push (@perfdata, "temperature=$current_temperature");
		}
	}

	# start/stop cycles: the recommended maximum goes into the warning slot
	if($current_start_stop){
		if($max_start_stop){
			push (@perfdata, "start_stop=$current_start_stop;$max_start_stop");
			if($current_start_stop > $max_start_stop){
				warn "(debug) Disk start_stop is greater than max ($current_start_stop > $max_start_stop)\n\n" if $opt_debug;
				push(@error_messages, 'Disk start_stop is higher than maximum');
				escalate_status('WARNING');
			}
		}
		else{
			push (@perfdata, "start_stop=$current_start_stop");
		}
	}
}
 | 
			
		||||
# Assemble the plugin output line ("<status text>|<perfdata>") and exit with
# the numeric code that %ERRORS holds for the final status.
if ($opt_debug) {
	warn "(debug) gathered perfdata:\n@perfdata\n\n";
}
my $perf_string = join(' ', @perfdata);

if ($opt_debug) {
	warn "###########################################################\n";
	warn "(debug) FINAL STATUS: $exit_status\n";
	warn "###########################################################\n\n\n";
	warn "(debug) final status/output:\n";
}

# Anything other than OK is reported together with the collected messages.
my $status_string = ($exit_status eq 'OK')
	? "OK: no SMART errors detected"
	: "$exit_status: ".join(', ', @error_messages);

print "$status_string|$perf_string\n";
exit $ERRORS{$exit_status};
 | 
			
		||||
 | 
			
		||||
sub print_help {
	# Print the revision banner, the usage summary with one line (or two)
	# per option, and finally the support notice from the utils module.
	print_revision($basename,$revision);
	print
		"Usage: $basename (--device=<SMART device> --interface=(ata|scsi)|-h|-v) [--debug]\n",
		"  --debug: show debugging information\n",
		"  -d/--device: a device to be SMART monitored, eg /dev/sda /dev/twa0\n",
		"  -i/--interface: ata, scsi, sat,marvell,3ware,N,areca,N,\n",
		"                  usbcypress, usbjmicron, usbsunplus, cciss,N, hpt,L/M (or hpt,L/M/N), and test\n",
		"  -m/--mode: Set mode to scsi,ata when drives are behind\n",
		"             hardware raid interface \n",
		"  -h/--help: this help\n",
		"  -v/--version: Version number\n";
	support();
}
 | 
			
		||||
 | 
			
		||||
# Raise the plugin-wide $exit_status to the requested status unless the
# current status already outranks it:
#   - WARNING never replaces CRITICAL
#   - UNKNOWN never replaces WARNING or CRITICAL
#   - any other request (e.g. CRITICAL) is applied unconditionally
sub escalate_status {
	my ($requested_status) = @_;

	# current statuses that a given request is not allowed to overwrite
	my %outranked_by = (
		'WARNING' => [ 'CRITICAL' ],
		'UNKNOWN' => [ 'WARNING', 'CRITICAL' ],
	);
	my $stronger = $outranked_by{$requested_status} || [];

	return if grep { $exit_status eq $_ } @{$stronger};
	$exit_status = $requested_status;
}
 | 
			
		||||
							
								
								
									
										5
									
								
								check_smart/check_smart.cfg
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										5
									
								
								check_smart/check_smart.cfg
									
										
									
									
									
										Normal file
									
								
							| 
						 | 
				
			
			@ -0,0 +1,5 @@
 | 
			
		|||
# 'check_smart' command definition
 | 
			
		||||
define command{
 | 
			
		||||
        command_name    check_smart
 | 
			
		||||
	command_line	/usr/lib/monitoring-plugins/check_smart $ARG1$
 | 
			
		||||
	}
 | 
			
		||||
							
								
								
									
										4
									
								
								check_smart/control
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										4
									
								
								check_smart/control
									
										
									
									
									
										Normal file
									
								
							| 
						 | 
				
			
			@ -0,0 +1,4 @@
 | 
			
		|||
Homepage: https://exchange.icinga.org/exchange/check_smart+hwraid
 | 
			
		||||
Uploaders: Jan Wagner <waja@cyconet.org>
 | 
			
		||||
Description: plugin to check SMART status of ATA/SCSI disks
 | 
			
		||||
Recommends: perl-modules, monitoring-plugins-common | nagios-plugins-common
 | 
			
		||||
							
								
								
									
										3
									
								
								check_smart/copyright
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										3
									
								
								check_smart/copyright
									
										
									
									
									
										Normal file
									
								
							| 
						 | 
				
			
			@ -0,0 +1,3 @@
 | 
			
		|||
Copyright (c) Kurt Yoder, Giles Westwood
 | 
			
		||||
 | 
			
		||||
License: public-domain
 | 
			
		||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue