diff --git a/README.md b/README.md index 7fd6b40..f51bd84 100644 --- a/README.md +++ b/README.md @@ -64,6 +64,10 @@ Which would be enough to tell sanoid to take and keep 36 hourly snapshots, 30 da This option is designed to be run by a Nagios monitoring system. It reports on the health of the zpool your filesystems are on. It only monitors filesystems that are configured in the sanoid.conf file. ++ --monitor-capacity + + This option is designed to be run by a Nagios monitoring system. It reports on the capacity of the zpool your filesystems are on. It only monitors pools that are configured in the sanoid.conf file. + + --force-update This clears out sanoid's zfs snapshot listing cache. This is normally not needed. diff --git a/sanoid b/sanoid index c07e090..d0800e6 100755 --- a/sanoid +++ b/sanoid @@ -18,7 +18,8 @@ use Time::Local; # to parse dates in reverse my %args = ("configdir" => "/etc/sanoid"); GetOptions(\%args, "verbose", "debug", "cron", "readonly", "quiet", "monitor-health", "force-update", "configdir=s", - "monitor-snapshots", "take-snapshots", "prune-snapshots" + "monitor-snapshots", "take-snapshots", "prune-snapshots", + "monitor-capacity" ) or pod2usage(2); # If only config directory (or nothing) has been specified, default to --cron --verbose @@ -52,6 +53,7 @@ my @params = ( \%config, \%snaps, \%snapsbytype, \%snapsbypath ); if ($args{'debug'}) { $args{'verbose'}=1; blabber (@params); } if ($args{'monitor-snapshots'}) { monitor_snapshots(@params); } if ($args{'monitor-health'}) { monitor_health(@params); } +if ($args{'monitor-capacity'}) { monitor_capacity(@params); } if ($args{'force-update'}) { my $snaps = getsnaps( \%config, $cacheTTL, 1 ); } if ($args{'cron'}) { @@ -174,6 +176,61 @@ sub monitor_snapshots { exit $errorlevel; } + +#################################################################################### +#################################################################################### +#################################################################################### + +sub monitor_capacity { + my ($config, $snaps, $snapsbytype, $snapsbypath) = @_; + my %pools; + my @messages; + my $errlevel=0; + + # build pool list with corresponding capacity limits + foreach my $section (keys %config) { + my @pool = split ('/',$section); + + if (scalar @pool == 1 || !defined($pools{$pool[0]}) ) { + my %capacitylimits; + + if (!check_capacity_limit($config{$section}{'capacity_warn'})) { + die "ERROR: invalid zpool capacity warning limit!\n"; + } + + if ($config{$section}{'capacity_warn'} != 0) { + $capacitylimits{'warn'} = $config{$section}{'capacity_warn'}; + } + + if (!check_capacity_limit($config{$section}{'capacity_crit'})) { + die "ERROR: invalid zpool capacity critical limit!\n"; + } + + if ($config{$section}{'capacity_crit'} != 0) { + $capacitylimits{'crit'} = $config{$section}{'capacity_crit'}; + } + + if (%capacitylimits) { + $pools{$pool[0]} = \%capacitylimits; + } + } + } + + foreach my $pool (keys %pools) { + my $capacitylimitsref = $pools{$pool}; + + my ($exitcode, $msg) = check_zpool_capacity($pool,\%$capacitylimitsref); + if ($exitcode > $errlevel) { $errlevel = $exitcode; } + chomp $msg; + push (@messages, $msg); + } + + my @warninglevels = ('','*** WARNING *** ','*** CRITICAL *** '); + my $message = $warninglevels[$errlevel] . join (', ',@messages); + print "$message\n"; + exit $errlevel; +} + #################################################################################### #################################################################################### #################################################################################### @@ -900,6 +957,74 @@ sub check_zpool() { return ($ERRORS{$state},$msg); } # end check_zpool() +sub check_capacity_limit() { + my $value = shift; + + if (!defined($value) || $value !~ /^\d+\z/) { + return undef; + } + + if ($value < 0 || $value > 100) { + return undef; + } + + return 1 +} + +sub check_zpool_capacity() { + my %ERRORS=('DEPENDENT'=>4,'UNKNOWN'=>3,'OK'=>0,'WARNING'=>1,'CRITICAL'=>2); + my $state="UNKNOWN"; + my $msg="FAILURE"; + + my $pool=shift; + my $capacitylimitsref=shift; + my %capacitylimits=%$capacitylimitsref; + + my $statcommand="/sbin/zpool list -H -o cap $pool"; + + if (! open STAT, "$statcommand|") { + print ("$state '$statcommand' command returns no result!\n"); + exit $ERRORS{$state}; + } + + my $line = ; + close(STAT); + + chomp $line; + my @row = split(/ +/, $line); + my $cap=$row[0]; + + ## check for valid capacity value + if ($cap !~ m/^[0-9]{1,3}%$/ ) { + $state = "CRITICAL"; + $msg = sprintf "ZPOOL {%s} does not exist and/or is not responding!\n", $pool; + print $state, " ", $msg; + exit ($ERRORS{$state}); + } + + $state="OK"; + + # check capacity + my $capn = $cap; + $capn =~ s/\D//g; + + if (defined($capacitylimits{"warn"})) { + if ($capn >= $capacitylimits{"warn"}) { + $state = "WARNING"; + } + } + + if (defined($capacitylimits{"crit"})) { + if ($capn >= $capacitylimits{"crit"}) { + $state = "CRITICAL"; + } + } + + $msg = sprintf "ZPOOL %s : %s\n", $pool, $cap; + $msg = "$state $msg"; + return ($ERRORS{$state},$msg); +} # end check_zpool_capacity() + ###################################################################################################### ###################################################################################################### ###################################################################################################### @@ -1079,6 +1204,7 @@ Options: --force-update Clears out sanoid's zfs snapshot cache --monitor-health Reports on zpool "health", in a Nagios compatible format + --monitor-capacity Reports on zpool capacity, in a Nagios compatible format --monitor-snapshots Reports on snapshot "health", in a Nagios compatible format --take-snapshots Creates snapshots as specified in sanoid.conf --prune-snapshots Purges expired snapshots as specified in sanoid.conf diff --git a/sanoid.defaults.conf b/sanoid.defaults.conf index 35c804d..d86cc47 100644 --- a/sanoid.defaults.conf +++ b/sanoid.defaults.conf @@ -70,3 +70,7 @@ monthly_warn = 32 monthly_crit = 35 yearly_warn = 0 yearly_crit = 0 + +# default limits for capacity checks (if set to 0, limit will not be checked) +capacity_warn = 80 +capacity_crit = 95