From e776f51323a170068cdb21feffd3fae8b71d383a Mon Sep 17 00:00:00 2001
From: Jan Wagner
Date: Tue, 7 Oct 2014 16:44:11 +0200
Subject: [PATCH] Adding check_smart

---
 check_smart/Makefile         |   3 +
 check_smart/check_smart      | 356 +++++++++++++++++++++++++++++++++++
 check_smart/check_smart.cfg  |   5 +
 check_smart/control          |   4 +
 check_smart/copyright        |   3 +
 debian/README.Debian.plugins |   3 +
 debian/control               |   3 +-
 7 files changed, 376 insertions(+), 1 deletion(-)
 create mode 100644 check_smart/Makefile
 create mode 100644 check_smart/check_smart
 create mode 100644 check_smart/check_smart.cfg
 create mode 100644 check_smart/control
 create mode 100644 check_smart/copyright

diff --git a/check_smart/Makefile b/check_smart/Makefile
new file mode 100644
index 0000000..cf9673d
--- /dev/null
+++ b/check_smart/Makefile
@@ -0,0 +1,3 @@
+#/usr/bin/make -f
+
+include ../common.mk
diff --git a/check_smart/check_smart b/check_smart/check_smart
new file mode 100644
index 0000000..65d2672
--- /dev/null
+++ b/check_smart/check_smart
@@ -0,0 +1,356 @@
+#!/usr/bin/perl -w
+# Check SMART status of ATA/SCSI disks, returning any usable metrics as perfdata.
+# For usage information, run ./check_smart -h
+#
+# This script was created under contract for the US Government and is therefore Public Domain
+#
+# Changes and Modifications
+# =========================
+# Feb 3, 2009: Kurt Yoder - initial version of script
+# Mar 3, 2010: Giles Westwood - add support for hardware raid interfaces
+#
+# Three smartctl runs are evaluated (health status, silent exit status and the
+# attribute dump); every finding is folded into a single Nagios state and the
+# result is printed as "<STATE>: <messages>|<perfdata>".
+
+use strict;
+use warnings;
+use Getopt::Long;
+
+use File::Basename qw(basename);
+my $basename = basename($0);
+
+my $revision = '$Revision: 1.1 $';
+
+use lib '/usr/lib/nagios/plugins/';
+use utils qw(%ERRORS &print_revision &support &usage);
+
+# fixed PATH and emptied shell start-up variables for every child process
+$ENV{'PATH'}='/bin:/usr/bin:/sbin:/usr/sbin';
+$ENV{'BASH_ENV'}='';
+$ENV{'ENV'}='';
+
+my ($opt_d, $opt_debug, $opt_h, $opt_i, $opt_v, $opt_m);
+Getopt::Long::Configure('bundling');
+GetOptions(
+    "debug" => \$opt_debug,
+    "d=s" => \$opt_d, "device=s" => \$opt_d,
+    "h" => \$opt_h, "help" => \$opt_h,
+    "i=s" => \$opt_i, "interface=s" => \$opt_i,
+    "m=s" => \$opt_m, "mode=s" => \$opt_m,
+    "v" => \$opt_v, "version" => \$opt_v,
+);
+
+if ($opt_v) {
+    print_revision($basename,$revision);
+    exit $ERRORS{'OK'};
+}
+
+if ($opt_h) {
+    print_help();
+    exit $ERRORS{'OK'};
+}
+
+my ($device, $interface);
+if ($opt_d) {
+    unless($opt_i){
+        print "must specify an interface for $opt_d using -i/--interface!\n\n";
+        print_help();
+        exit $ERRORS{'UNKNOWN'};
+    }
+    # The device-node test is skipped for 3ware/cciss/areca. It has to be
+    # spelled with !~ : "!$opt_i =~ /.../" negates $opt_i first and then matches
+    # the empty string, so the test was never true and never ran. Character
+    # devices are accepted as well so that invocations which worked while the
+    # test was dead code (NOTE(review): e.g. SCSI generic nodes) keep working.
+    if ($opt_i !~ /3ware|cciss|areca/){
+        if ( -b $opt_d || -c $opt_d){
+            $device = $opt_d;
+        }
+        else{
+            print "$opt_d is not a valid block/character device!\n\n";
+            print_help();
+            exit $ERRORS{'UNKNOWN'};
+        }
+    }
+    else{
+        $device = $opt_d;
+    }
+
+    # unanchored, presumably so that suffixed forms such as "3ware,N" match;
+    # usbjmicron (listed by the help text) replaces a duplicated usbsunplus
+    if($opt_i =~
+        /ata|scsi|3ware|cciss|areca|sat|marvell|usbjmicron|hpt|usbcypress|usbsunplus/){
+        $interface = $opt_i;
+    }
+    else{
+        print "invalid interface $opt_i for $opt_d!\n\n";
+        print_help();
+        exit $ERRORS{'UNKNOWN'};
+    }
+}
+else{
+    print "must specify a device!\n\n";
+    print_help();
+    exit $ERRORS{'UNKNOWN'};
+}
+
+# $mode always holds a defined value, even when -m is given with a value that
+# names neither "scsi" nor "ata"
+my $mode = 'unused';
+if ($opt_m){
+    if ($opt_m =~ /scsi/){
+        $mode = 'scsi';
+    }
+    if ($opt_m =~ /ata/){
+        $mode = 'ata';
+    }
+}
+
+# kept as a list so that smartctl can be started without a shell
+my @smart_command = ('/usr/bin/sudo', '/usr/sbin/smartctl');
+my @error_messages;
+my $exit_status = 'OK';
+
+
+warn "###########################################################\n" if $opt_debug;
+warn "(debug) CHECK 1: getting overall SMART health status\n" if $opt_debug;
+warn "###########################################################\n\n\n" if $opt_debug;
+
+my @output = run_smartctl('-d', $interface, '-H', $device);
+
+# parse ata output, looking for "health status: passed"
+my $found_status = 0;
+my $line_str = 'SMART overall-health self-assessment test result: '; # ATA SMART line
+my $ok_str = 'PASSED'; # ATA SMART OK string
+
+if ($interface eq 'scsi' || $mode eq 'scsi'){
+    $line_str = 'SMART Health Status: '; # SCSI SMART line
+    $ok_str = 'OK'; #SCSI SMART OK string
+}
+
+foreach my $line (@output){
+    if($line =~ /$line_str(.+)/){
+        $found_status = 1;
+        warn "(debug) parsing line:\n$line\n\n" if $opt_debug;
+        if ($1 eq $ok_str) {
+            warn "(debug) found string '$ok_str'; status OK\n\n" if $opt_debug;
+        }
+        else {
+            warn "(debug) no '$ok_str' status; failing\n\n" if $opt_debug;
+            push(@error_messages, "Health status: $1");
+            escalate_status('CRITICAL');
+        }
+    }
+}
+
+unless ($found_status) {
+    push(@error_messages, 'No health status line found');
+    escalate_status('UNKNOWN');
+}
+
+
+warn "###########################################################\n" if $opt_debug;
+warn "(debug) CHECK 2: getting silent SMART health check\n" if $opt_debug;
+warn "###########################################################\n\n\n" if $opt_debug;
+
+my @silent_command = (@smart_command, '-d', $interface, '-q', 'silent', '-A', $device);
+warn "(debug) executing:\n@silent_command\n\n" if $opt_debug;
+
+# system() returns the raw wait status (the value of $?): -1 means the command
+# could not be started, the low seven bits hold a terminating signal and the
+# real exit code sits in the high byte. Testing smartctl's bit mask against
+# the unshifted value would never match.
+my $wait_status = system(@silent_command);
+my $return_code = 0;
+if ($wait_status == -1) {
+    push(@error_messages, "Could not execute smartctl: $!");
+    escalate_status('UNKNOWN');
+}
+elsif ($wait_status & 0x7f) {
+    push(@error_messages, 'smartctl was terminated by signal ' . ($wait_status & 0x7f));
+    escalate_status('UNKNOWN');
+}
+else {
+    $return_code = $wait_status >> 8;
+}
+warn "(debug) exit code:\n$return_code\n\n" if $opt_debug;
+
+# Every set bit of the exit code is reported on its own. The table covers all
+# eight bits, so a catch-all for unknown codes is not needed (the former
+# "Unknown return code" test depended on !$exit_status, which is never true).
+my @return_code_bits = (
+    [0x01, 'Commandline parse failure', 'UNKNOWN'],
+    [0x02, 'Device could not be opened', 'UNKNOWN'],
+    [0x04, 'Checksum failure', 'WARNING'],
+    [0x08, 'Disk is failing', 'CRITICAL'],
+    [0x10, 'Disk is in prefail', 'WARNING'],
+    [0x20, 'Disk may be close to failure', 'WARNING'],
+    [0x40, 'Error log contains errors', 'WARNING'],
+    [0x80, 'Self-test log contains errors', 'WARNING'],
+);
+foreach my $bit (@return_code_bits) {
+    my ($mask, $message, $status) = @$bit;
+    next unless $return_code & $mask;
+    push(@error_messages, $message);
+    escalate_status($status);
+}
+
+if ($wait_status) {
+    warn "(debug) non-zero exit code, generating error condition\n\n" if $opt_debug;
+}
+else {
+    warn "(debug) zero exit code, status OK\n\n" if $opt_debug;
+}
+
+
+warn "###########################################################\n" if $opt_debug;
+warn "(debug) CHECK 3: getting detailed statistics\n" if $opt_debug;
+warn "(debug) information contains a few more potential trouble spots\n" if $opt_debug;
+warn "(debug) plus, we can also use the information for perfdata/graphing\n" if $opt_debug;
+warn "###########################################################\n\n\n" if $opt_debug;
+
+@output = run_smartctl('-d', $interface, '-A', $device);
+my @perfdata;
+
+# separate metric-gathering and output analysis for ATA vs SCSI SMART output
+if ($interface eq 'ata' || $mode eq 'ata'){
+    foreach my $line(@output){
+        # get lines that look like this:
+        # 9 Power_On_Minutes 0x0032 241 241 000 Old_age Always - 113h+12m
+        next unless $line =~ /^\s*\d+\s(\S+)\s+(?:\S+\s+){6}(\S+)\s+(\d+)/;
+        my ($attribute_name, $when_failed, $raw_value) = ($1, $2, $3);
+        if ($when_failed ne '-'){
+            push(@error_messages, "Attribute $attribute_name failed at $when_failed");
+            escalate_status('WARNING');
+            warn "(debug) parsed SMART attribute $attribute_name with error condition:\n$when_failed\n\n" if $opt_debug;
+        }
+        # some attributes produce questionable data; no need to graph them
+        if (grep {$_ eq $attribute_name} ('Unknown_Attribute', 'Power_On_Minutes') ){
+            next;
+        }
+        push (@perfdata, "$attribute_name=$raw_value");
+
+        # do some manual checks
+        if ( ($attribute_name eq 'Current_Pending_Sector') && $raw_value ) {
+            push(@error_messages, "Sectors pending re-allocation");
+            escalate_status('WARNING');
+            warn "(debug) Current_Pending_Sector is non-zero ($raw_value)\n\n" if $opt_debug;
+        }
+    }
+}
+else{
+    my ($current_temperature, $max_temperature, $current_start_stop, $max_start_stop);
+    foreach my $line(@output){
+        if ($line =~ /Current Drive Temperature:\s+(\d+)/){
+            $current_temperature = $1;
+        }
+        elsif ($line =~ /Drive Trip Temperature:\s+(\d+)/){
+            $max_temperature = $1;
+        }
+        elsif ($line =~ /Current start stop count:\s+(\d+)/){
+            $current_start_stop = $1;
+        }
+        elsif ($line =~ /Recommended maximum start stop count:\s+(\d+)/){
+            $max_start_stop = $1;
+        }
+        elsif ($line =~ /Elements in grown defect list:\s+(\d+)/){
+            push (@perfdata, "defect_list=$1");
+        }
+        elsif ($line =~ /Blocks sent to initiator =\s+(\d+)/){
+            push (@perfdata, "sent_blocks=$1");
+        }
+    }
+    if($current_temperature){
+        if($max_temperature){
+            push (@perfdata, "temperature=$current_temperature;;$max_temperature");
+            if($current_temperature > $max_temperature){
+                warn "(debug) Disk temperature is greater than max ($current_temperature > $max_temperature)\n\n" if $opt_debug;
+                push(@error_messages, 'Disk temperature is higher than maximum');
+                escalate_status('CRITICAL');
+            }
+        }
+        else{
+            push (@perfdata, "temperature=$current_temperature");
+        }
+    }
+    if($current_start_stop){
+        if($max_start_stop){
+            push (@perfdata, "start_stop=$current_start_stop;$max_start_stop");
+            if($current_start_stop > $max_start_stop){
+                warn "(debug) Disk start_stop is greater than max ($current_start_stop > $max_start_stop)\n\n" if $opt_debug;
+                push(@error_messages, 'Disk start_stop is higher than maximum');
+                escalate_status('WARNING');
+            }
+        }
+        else{
+            push (@perfdata, "start_stop=$current_start_stop");
+        }
+    }
+}
+warn "(debug) gathered perfdata:\n@perfdata\n\n" if $opt_debug;
+my $perf_string = join(' ', @perfdata);
+
+warn "###########################################################\n" if $opt_debug;
+warn "(debug) FINAL STATUS: $exit_status\n" if $opt_debug;
+warn "###########################################################\n\n\n" if $opt_debug;
+
+warn "(debug) final status/output:\n" if $opt_debug;
+
+my $status_string = '';
+
+if($exit_status ne 'OK'){
+    $status_string = "$exit_status: ".join(', ', @error_messages);
+}
+else {
+    $status_string = "OK: no SMART errors detected";
+}
+
+print "$status_string|$perf_string\n";
+exit $ERRORS{$exit_status};
+
+sub print_help {
+    print_revision($basename,$revision);
+    print "Usage: $basename (--device= --interface=(ata|scsi)|-h|-v) [--debug]\n";
+    print " --debug: show debugging information\n";
+    print " -d/--device: a device to be SMART monitored, eg /dev/sda /dev/twa0\n";
+    print " -i/--interface: ata, scsi, sat,marvell,3ware,N,areca,N,\n";
+    print " usbcypress, usbjmicron, usbsunplus, cciss,N, hpt,L/M (or hpt,L/M/N), and test\n";
+    print " -m/--mode: Set mode to scsi,ata when drives are behind\n";
+    print " hardware raid interface \n";
+    print " -h/--help: this help\n";
+    print " -v/--version: Version number\n";
+    support();
+}
+
+# escalate an exit status IFF it's more severe than the previous exit status;
+# severity order used by this plugin: OK < UNKNOWN < WARNING < CRITICAL
+sub escalate_status {
+    my $requested_status = shift;
+    my %severity_of = (OK => 0, UNKNOWN => 1, WARNING => 2, CRITICAL => 3);
+    # unrecognised names are ignored so that $ERRORS{$exit_status} stays valid
+    return unless exists $severity_of{$requested_status};
+    return if $severity_of{$requested_status} <= $severity_of{$exit_status};
+    $exit_status = $requested_status;
+    return;
+}
+
+# Run smartctl with the given arguments and return its output lines (empty
+# list when it cannot be started). The command is executed without a shell
+# (list-form pipe open), so the device and interface strings reach smartctl
+# literally instead of being interpreted by /bin/sh.
+sub run_smartctl {
+    my @command = (@smart_command, @_);
+    warn "(debug) executing:\n@command\n\n" if $opt_debug;
+
+    my @lines;
+    if (open(my $pipe, '-|', @command)) {
+        @lines = <$pipe>;
+        close($pipe); # a non-zero smartctl exit is expected here; check 2 evaluates it
+    }
+    else {
+        warn "(debug) could not execute '@command': $!\n\n" if $opt_debug;
+    }
+    warn "(debug) output:\n@lines\n\n" if $opt_debug;
+    return @lines;
+}
diff --git a/check_smart/check_smart.cfg b/check_smart/check_smart.cfg
new file mode 100644
index 0000000..51b1ae6
--- /dev/null
+++ b/check_smart/check_smart.cfg
@@ -0,0 +1,5 @@
+# 'check_smart' command definition
+define command{
+        command_name    check_smart
+        command_line    /usr/lib/monitoring-plugins/check_smart $ARG1$
+        }
diff --git a/check_smart/control b/check_smart/control
new file mode 100644
index 0000000..3704ff1
--- /dev/null
+++ b/check_smart/control
@@ -0,0 +1,4 @@
+Homepage: https://exchange.icinga.org/exchange/check_smart+hwraid
+Uploaders: Jan Wagner
+Description: plugin to check SMART status of ATA/SCSI disks
+Recommends: perl-modules, monitoring-plugins-common | nagios-plugins-common
diff --git a/check_smart/copyright b/check_smart/copyright
new file mode 100644
index 0000000..41e4a7c
--- /dev/null
+++ b/check_smart/copyright
@@ -0,0 +1,3 @@
+Copyright (c) Kurt Yoder, Giles Westwood
+
+License: public-domain
diff --git a/debian/README.Debian.plugins b/debian/README.Debian.plugins
index 77c91a8..31715f2 100644
--- a/debian/README.Debian.plugins
+++ b/debian/README.Debian.plugins
@@ -37,6 +37,9 @@ check_redis:
 check_sentinel:
   Required Packages: ruby-redis
 
+check_smart:
+  Required Packages: perl-base, perl-modules, monitoring-plugins-common | nagios-plugins-common
+
 check_tftp:
   Required Packages: libnet-tftp-perl, monitoring-plugins-common | nagios-plugins-common
 
diff --git a/debian/control b/debian/control
index f543c3d..002a09f 100644
--- a/debian/control
+++ b/debian/control
@@ -11,7 +11,7 @@ Vcs-Browser: http://github.com/waja/monitoring-plugins-cyconet
 Package: monitoring-plugins-cyconet
 Architecture: any
 Depends: ${misc:Depends}
-Recommends: ${shlibs:Depends}, ${python:Depends}, libnet-snmp-perl, monitoring-plugins-common | nagios-plugins-common, libtime-modules-perl, libwww-perl, python-argparse, libredis-perl, ruby-redis, libnet-tftp-perl, libxml-xpath-perl, ${perl:Depends}
+Recommends: ${shlibs:Depends}, ${python:Depends}, libnet-snmp-perl, monitoring-plugins-common | nagios-plugins-common, libtime-modules-perl, libwww-perl, python-argparse, libredis-perl, ruby-redis, perl-modules, libnet-tftp-perl, libxml-xpath-perl, ${perl:Depends}
 Suggests:
 Enhances: nagios-plugins, nagios-plugins-basic, nagios-plugins-standard
 Description: Plugins for nagios compatible monitoring systems
@@ -30,6 +30,7 @@ Description: Plugins for nagios compatible monitoring systems
    * check_phpfpm_status (0.9): plugin to check the fpm-status page report from php-fpm
    * check_redis (0.72): plugin that verifies redis server is working.
    * check_sentinel (0b8e0e388a): plugin to monitor Redis sentinel
+   * check_smart: plugin to check SMART status of ATA/SCSI disks
    * check_tftp (0.11): plugin that verifies TFTP server is working.
    * check_tomcat (1.4): plugin to check the tomcat status page.
  .