Logoj0ke.net Open Build Service > Projects > server:monitoring > nagios-plugins-lsi > check_percraid_sas
Sign Up | Log In

File check_percraid_sas of Package nagios-plugins-lsi (Revision 2)

Currently displaying revision 2, show latest

 
#!/usr/bin/perl -w

# check_megaraid_sas Nagios plugin
# Copyright (C) 2007  Jonathan Delgado, delgado@molbio.mgh.harvard.edu
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
#
#
# Nagios plugin to monitor the status of volumes attached to a LSI Megaraid SAS
# controller, such as the Dell PERC5/i and PERC5/e. If you have any hotspares
# attached to the controller, you can specify the number you should expect to
# find with the '-s' flag.
#
# The paths for the Nagios plugins lib and MegaCli may need to be changed.
#
# Code for correct RAID level reporting contributed by Frode Nordahl, 2009/01/12.
# Some other code contributed by Morty Abzug, 2015-05-20
#
# $Author: delgado $
# $Revision: #12 $ $Date: 2010/10/18 $

use strict;
use warnings;
use Getopt::Long;
use lib qw(/usr/lib/nagios/plugins /usr/lib64/nagios/plugins); # possible paths to your Nagios plugins and utils.pm
use utils qw(%ERRORS);

# --- Configuration -------------------------------------------------------
my $megaclibin  = '/usr/sbin/perccli';  # the full path to your MegaCli binary
my $megacli     = $megaclibin;          # how we actually call MegaCli (may get a "sudo " prefix later)
my $megapostopt = '-NoLog';             # additional options to call at the end of MegaCli arguments

# --- Values discovered while running --------------------------------------
my ($adapters);                  # controller count, parsed from -adpCount output
my $hotsparecount = 0;           # hotspares actually seen in -PdList output
my $pdbad = 0;                   # physical drives in an unexpected firmware state
my $pdcount = 0;                 # physical drives counted (Unconfigured ones excluded)
my $mediaerrors = 0;             # summed "Media Error Count" over all drives
my $prederrors = 0;              # summed "Predictive Failure Count" over all drives
my $othererrors = 0;             # summed "<anything else> Error Count" over all drives
my $result = '';                 # human readable plugin output, built incrementally
my $status = 'OK';               # worst state seen so far (see max_state)

# --- Command line options (defaults) --------------------------------------
my $hotspares = 0;               # -s: number of hotspares expected
my $mediaallow = 0;              # -m: media errors tolerated before WARNING
my $predallow = 0;               # -p: predictive failures tolerated before WARNING
my $otherallow = 0;              # -o: other errors tolerated before WARNING
my $consistency_check_is_ok = 0; # --consistency_check_is_ok
my $missing_is_ok = 0;           # --missing_is_ok
my $no_battery_is_ok = 0;        # --no_battery_is_ok
my $sudo;                        # --sudo
my $checkbbu = 0;                # -b
my $bbu_charge_no_warning = 0;   # -B
my $check_cache;                 # -c
my $do_help;                     # -h

# handle options
#
# Single-letter options may be bundled (e.g. "-bc"). GetOptions returns
# false when it meets an unknown or malformed option; in that case we stop
# with UNKNOWN rather than silently running the check with default
# thresholds. exitreport() is not used here because it is defined further
# down with a prototype, which Perl cannot apply to an earlier call.
Getopt::Long::Configure("bundling");
GetOptions(
    "b|bbu_check"             => \$checkbbu,
    "B|bbu_charge_no_warning" => \$bbu_charge_no_warning,
    "c|cache_check"           => \$check_cache,
    "h|help"                  => \$do_help,
    "m|media_allow=i"         => \$mediaallow,
    "consistency_check_is_ok" => \$consistency_check_is_ok,
    "missing_is_ok"           => \$missing_is_ok,
    "no_battery_is_ok"        => \$no_battery_is_ok,
    "o|other_allow=i"         => \$otherallow,
    "p|pred_allow=i"          => \$predallow,
    "s|hotspares=i"           => \$hotspares,
    "sudo"                    => \$sudo,
) or do {
    print STDOUT "UNKNOWN: error: invalid command line options (try --help)\n";
    exit $ERRORS{'UNKNOWN'};
};

# -h/--help: print the option summary and leave (default exit code 0).
if ( $do_help ) {
    print "Usage: $0 [-s number] [-m number] [-o number]\n";
    print "       -b check Battery Back Up status\n";
    print "       -B battery back up charging state is not a warning\n";
    print "       -c check that current cache policy matches default policy\n";
    print "       -m is the number of media errors to ignore\n";
    print "       --consistency_check_is_ok  consistency checks are OK\n";
    print "       --missing_is_ok  test returns OK if MegaCli is not present\n";
    print "       --no_battery_is_ok  lack of a battery is not a problem\n";
    print "       -p is the predictive error count to ignore\n";
    print "       -o is the number of other disk errors to ignore\n";
    print "       -s is how many hotspares are attached to the controller\n";
    print "       --sudo  should sudo be enabled\n";
    exit;
}

# max_state($current, $compare)
#
# Merge a newly observed state into the running plugin state and return the
# result. Effective ordering, as implemented below:
#   CRITICAL beats everything,
#   WARNING  replaces OK and UNKNOWN,
#   UNKNOWN  only replaces OK,
#   OK (or any unrecognised $compare) leaves $current untouched.
#
# Both arguments and the return value are state name strings.
sub max_state ($$) {
    my ($current, $compare) = @_;

    if (($compare eq 'CRITICAL') || ($current eq 'CRITICAL')) {
        return 'CRITICAL';
    } elsif ($compare eq 'OK') {
        return $current;
    } elsif ($compare eq 'WARNING') {
        return 'WARNING';
    } elsif (($compare eq 'UNKNOWN') && ($current eq 'OK')) {
        return 'UNKNOWN';
    } else {
        return $current;
    }
}

# exitreport($status, $message)
#
# Print the single plugin output line "<status>: <message>" on STDOUT and
# terminate the process with the exit code %ERRORS (from utils.pm) holds for
# $status. Never returns.
#
# If $status is not a key of %ERRORS the UNKNOWN code is used; without this
# fallback "exit undef" would end the process with code 0, i.e. OK.
sub exitreport ($$) {
    my ($status, $message) = @_;

    print STDOUT "$status: $message\n";

    my $code = exists $ERRORS{$status} ? $ERRORS{$status} : $ERRORS{'UNKNOWN'};
    exit $code;
}

# Some sanity checks that you actually have something where you think MegaCli is
if (! -e $megaclibin) {
    if ($missing_is_ok) {
        # $status is still its initial value here, so this reports the
        # absence of the binary without raising an alarm.
        exitreport($status, "$megaclibin is not present, missing_is_ok set");
    } else {
        exitreport('UNKNOWN',"error: $megaclibin does not exist");
    }
}

# The existence test above is made on the bare path; only the invocation
# gets the sudo prefix.
$megacli="sudo $megacli" if $sudo;

# Get the number of RAID controllers we have
open (my $adpcount_fh, '-|', "$megacli -adpCount $megapostopt")
    or exitreport('UNKNOWN',"error: Could not execute $megacli -adpCount $megapostopt");

while (my $line = <$adpcount_fh>) {
    if ( $line =~ m/Controller Count:\s*(\d+)/ ) {
        $adapters = $1;
        last;
    }
}
close $adpcount_fh;

# No "Controller Count:" line means we cannot know what to inspect.
exitreport('UNKNOWN',"error: unable to get controller count")
  if !defined $adapters;

# Inspect every controller in turn:
#   1. (with -b) the Battery Back Up unit,
#   2. every logical drive (state, RAID level, size, cache policy,
#      running consistency check),
#   3. every physical drive (error counters, firmware state).
# Findings are appended to $result, folded into $status through max_state(),
# and summed into the $pd*/$*errors/$hotsparecount counters that the
# reporting code after the loop uses.
ADAPTER: for ( my $adp = 0; $adp < $adapters; $adp++ ) {
    $result .= "$adp:";

    # Get the Battery Back Up state for this adapter
    if ($checkbbu) {
        open (my $bbu_fh, '-|', "$megacli -AdpBbuCmd -GetBbuStatus -a$adp $megapostopt")
            or exitreport('UNKNOWN', "error: Could not execute $megacli -AdpBbuCmd -GetBbuStatus -a$adp $megapostopt");

        my ($bbucharging, $bbufullycharged, $bburelativecharge, $bbuexitcode);
        my ($batterystate, $batteryreplacement, $issohgood);
        while (my $line = <$bbu_fh>) {
            if ( $line =~ m/Charging Status\s*:\s*(\w+)/i ) {
                $bbucharging = $1;
            } elsif ( $line =~ m/Battery State\s*:\s*(\w+.*)/i ) { # sometimes contains a space
                $batterystate = $1;
                # ".*" also swallows trailing blanks / CR, which would make
                # the comparison against 'Optimal' below fail.
                $batterystate =~ s/\s+$//;
            } elsif ( $line =~ m/Fully Charged\s*:\s*(\w+)/i ) {
                $bbufullycharged = $1;
            } elsif ( $line =~ m/Relative State of Charge\s*:\s*(\w+)/i ) {
                $bburelativecharge = $1;
            } elsif ( $line =~ m/Exit Code\s*:\s*(\w+)/i ) {
                $bbuexitcode = $1;
            } elsif ( $line =~ m/^\s*Battery Replacement required\s*:\s*(\w+)\s*$/i ) {
                # \s* after the colon, like every other field here; the
                # former single \s rejected any other amount of spacing.
                $batteryreplacement = $1;
            } elsif ( $line =~ m/^\s*isSOHGood\s*:\s*(\w+)\s*$/i ) {
                $issohgood = $1;
            }
        }
        close $bbu_fh;

        # Empty string when the charge was not reported, so the messages
        # below can be built without an uninitialized-value warning.
        my $chargepct = defined $bburelativecharge ? $bburelativecharge : '';

        # Determine the BBU state
        my $bbustate;
        if ( !defined $bbuexitcode || $bbuexitcode ne '0x00' ) {
            if (!$no_battery_is_ok) {
                $bbustate = 'NOT FOUND';
                $status = max_state($status, 'CRITICAL');
            } else {
                $bbustate = 'not found which could be ok';
            }
        } elsif ( !defined $batteryreplacement || lc($batteryreplacement) ne 'no' ) {
            # A missing "Battery Replacement required" line is treated like
            # a "yes", as before (the old code compared undef against 'no').
            $bbustate = 'battery needs replacing';
            $status = max_state($status, 'CRITICAL');
        } elsif ( defined $issohgood && lc($issohgood) ne 'yes' ) {
            $bbustate = 'battery SOH is not good';
            $status = max_state($status, 'CRITICAL');
        } elsif ( (!defined $bbucharging || $bbucharging ne 'None') && !$bbu_charge_no_warning ) {
            # A missing "Charging Status" line counts as charging, as before.
            $bbustate = 'Charging (' . $chargepct . '%)';
            $status = max_state($status, 'WARNING');
        } elsif ( defined $bbufullycharged && $bbufullycharged ne 'Yes' && !$bbu_charge_no_warning ) {
            # some adapters don't report on "Fully Charged", so
            # it's OK if it's not defined.
            $bbustate = 'Not Charging (' . $chargepct . '%)';
            $status = max_state($status, 'WARNING');
        } elsif ( defined $batterystate && $batterystate ne 'Optimal' &&
                  $batterystate ne 'Operational' ) {
            $bbustate = $batterystate;
            $status = max_state($status, 'WARNING');
        } else {
            $bbustate = 'Charged (' . $chargepct . '%)';
        }
        $result .= "BBU $bbustate:";
    }

    # Get the number of logical drives on this adapter
    open (my $ldnum_fh, '-|', "$megacli -LdGetNum -a$adp $megapostopt")
        or exitreport('UNKNOWN', "error: Could not execute $megacli -LdGetNum -a$adp $megapostopt");

    my ($ldnum);
    while (my $line = <$ldnum_fh>) {
        # \d+ so that controller indexes of 10 and above are matched too
        if ( $line =~ m/Number of Virtual drives configured on adapter \d+:\s*(\d+)/i ) {
            $ldnum = $1;
            last;
        }
    }
    close $ldnum_fh;

    if ( !defined $ldnum ) {
        # Without a count no logical drive would be looked at; say so
        # instead of letting the check pass as if all volumes were fine.
        $result .= "LD count UNKNOWN:";
        $status = max_state($status, 'UNKNOWN');
        $ldnum = 0;
    }

    # NOTE(review): this assumes logical drive ids run 0..$ldnum-1 without
    # gaps - confirm against the controller's actual target ids.
    LDISK: for ( my $ld = 0; $ld < $ldnum; $ld++ ) {
        # Get info on this particular logical drive
        open (my $ldinfo_fh, '-|', "$megacli -LdInfo -L$ld -a$adp $megapostopt")
            or exitreport('UNKNOWN', "error: Could not execute $megacli -LdInfo -L$ld -a$adp $megapostopt ");

        my $consistency_output = '';
        my ($size, $unit, $raidlevel, $ldpdcount, $state, $spandepth, $consistency_percent, $consistency_minutes);
        my $current_cache_policy;
        my $default_cache_policy;
        while (my $line = <$ldinfo_fh>) {
            if ( $line =~ m/^Size\s*:\s*((\d+\.?\d*)\s*(MB|GB|TB))/ ) {
                $size = $2;
                $unit = $3;
                # Adjust MB to GB if that's what we got
                if ( $unit eq 'MB' ) {
                    $size = sprintf( "%.0f", ($size / 1024) );
                    $unit= 'GB';
                }
            } elsif ( $line =~ m/State\s*:\s*(\w+)/ ) {
                $state = $1;
                if ( $state ne 'Optimal' ) {
                    $status = max_state($status, 'CRITICAL');
                }
            } elsif ( $line =~ m/Number Of Drives\s*(per span\s*)?:\s*(\d+)/ ) {
                $ldpdcount = $2;
            } elsif ( $line =~ m/Span Depth\s*:\s*(\d+)/ ) {
                $spandepth = $1;
            } elsif ( $line =~ m/^\s*Default Cache Policy\s*:\s*(.*)/ ) {
                $default_cache_policy=$1;
            } elsif ( $line =~ m/^\s*Current Cache Policy\s*:\s*(.*)/ ) {
                $current_cache_policy=$1;
            } elsif ( $line =~ m/RAID Level\s*: Primary-(\d)/ ) {
                $raidlevel = $1;
            } elsif ( $line =~ m/\s+Check Consistency\s+:\s+Completed\s+(\d+)%,\s+Taken\s+(\d+)\s+min/ ) {
                $consistency_percent = $1;
                $consistency_minutes = $2;
            }
        }
        close $ldinfo_fh;

        # Report correct RAID-level and number of drives in case of Span configurations
        if ($ldpdcount && defined $spandepth && $spandepth > 1) {
            $ldpdcount = $ldpdcount * $spandepth;
            if (defined $raidlevel && $raidlevel < 10) {
                $raidlevel = $raidlevel . "0";
            }
        }

        if ($consistency_percent) {
            $status = max_state($status, 'WARNING')
                if !$consistency_check_is_ok;
            $consistency_output = "CC ${consistency_percent}% ${consistency_minutes}m:";
        }

        if ($check_cache) {
            if (defined($current_cache_policy) &&
                defined($default_cache_policy) &&
                $default_cache_policy eq $current_cache_policy) {
                $result .= "cache policy $current_cache_policy:";
            } elsif (!defined($current_cache_policy)) {
                $result .= "cache policy UNKNOWN:";
                $status = max_state($status, 'UNKNOWN');
            } elsif (!defined($default_cache_policy)) {
                $result .= "cache policy $current_cache_policy, default UNKNOWN:";
                $status = max_state($status, 'UNKNOWN');
            } else {
                $result .= "cache policy $current_cache_policy, SHOULD BE $default_cache_policy:";
                $status = max_state($status, 'WARNING');
            }
        }

        if ( !defined $state ) {
            # No "State :" line at all: the volume's health could not be
            # read, so it must not be counted as healthy.
            $state = 'UNKNOWN';
            $status = max_state($status, 'UNKNOWN');
        }

        # Fields the output did not contain are printed as empty strings.
        for my $field ($size, $unit, $raidlevel, $ldpdcount) {
            $field = '' unless defined $field;
        }

        $result .= "$ld:RAID-$raidlevel:$ldpdcount drives:$size$unit:$consistency_output$state ";
    } #LDISK

    # Get info on physical disks for this adapter
    open (my $pdlist_fh, '-|', "$megacli -PdList  -a$adp $megapostopt")
        or exitreport('UNKNOWN', "error: Could not execute $megacli -PdList -a$adp $megapostopt ");

    my ($slotnumber,$fwstate);
    PDISKS: while (my $line = <$pdlist_fh>) {
        if ( $line =~ m/Slot Number\s*:\s*(\d+)/ ) {
            $slotnumber = $1;
            $pdcount++;
        } elsif ( $line =~ m/(\w+) Error Count\s*:\s*(\d+)/ ) {
            my ($errkind, $errcount) = ($1, $2);
            if ( $errkind eq 'Media') {
                $mediaerrors += $errcount;
            } else {
                $othererrors += $errcount;
            }
        } elsif ( $line =~ m/Predictive Failure Count\s*:\s*(\d+)/ ) {
            $prederrors += $1;
        } elsif ( $line =~ m/Firmware state\s*:\s*(\w+)/ ) {
            # Only the first word of the firmware state is looked at.
            $fwstate = $1;
            if ( $fwstate eq 'Hotspare' ) {
                $hotsparecount++;
            } elsif ( $fwstate eq 'Online' ) {
                # Do nothing
            } elsif ( $fwstate eq 'JBOD' ) {
                # Do nothing
            } elsif ( $fwstate eq 'Unconfigured' ) {
                # A drive not in anything, or a non drive device
                # NOTE(review): a state such as "Unconfigured(bad)" would
                # also land here because of the \w+ capture - confirm that
                # ignoring it is intended.
                $pdcount--;
            } elsif ( !defined $slotnumber || $slotnumber != 255 ) {
                # Any other state counts as a bad drive unless the entry
                # sits in slot 255.
                $pdbad++;
                $status = max_state($status, 'CRITICAL');
            }
        }
    } #PDISKS
    close $pdlist_fh;
}

# --- Summary over all controllers, then exit with the accumulated state ---
$result .= "Drives:$pdcount ";

# Any bad disks?
if ( $pdbad ) {
    $result .= "$pdbad Bad Drives ";
}

my $errorcount = $mediaerrors + $prederrors + $othererrors;
# Were there any errors?
if ( $errorcount ) {
    # The counts are always shown; they only raise a WARNING once one of
    # them exceeds its -m / -p / -o allowance.
    $result .= "($errorcount Errors: $mediaerrors media, $prederrors predictive, $othererrors other) ";
    if ( ( $mediaerrors > $mediaallow ) ||
         ( $prederrors > $predallow )   ||
         ( $othererrors > $otherallow ) ) {
        $status = max_state($status, 'WARNING');
    }
}

# Do we have as many hotspares as expected (if any)
if ( $hotspares ) {
    if ( $hotsparecount < $hotspares ) {
        $status = max_state($status, 'WARNING');
        $result .= "Hotspare(s):$hotsparecount (of $hotspares)";
    } else {
        $result .= "Hotspare(s):$hotsparecount";
    }
}

exitreport($status, $result);