#!/usr/bin/perl -w
# File check_esx_gw of Package nagios-plugins-snmp
# vim:ts=4
# check_esx Version 2.4
my($Version) = 0.6;
# Modified by GroundWork to work with ESX 3; renamed to ..._gw to indicate
# a special interim version. This is considered a temporary working copy
# until the changes are folded back into the main line of development.
#
# Check the status of a virtual machine on a VMware ESX server, via SNMP.
# Return status in standard format for either Nagios or MRTG.
#
# Steve Shipway (www.steveshipway.org) Nov 2004
# Released under GNU GPL
#
# See dohelp() below for usage, so we only need to maintain one copy
# and we don't get accidental divergence in what is documented.
#
# Version 2.0: Added SNMP agent extension to get memory split and ready time
# 2.1: Corrected some bugs. Use >0.01 instead of >0.
# 2.2: corrected opt_r bug, fa bug
# 2.3:
# 2.4: simpler guest names for list report
# gw 0.6: added some support for ESX 3
25
use strict;
26
use Net::SNMP;
27
use Getopt::Std;
28
29
# ---- File-wide configuration and shared state -------------------------------
# These lexicals are read and written by the subs below; they are the only
# channel through which the subs exchange results.

my($STATEFILE) = "/var/tmp/esx_state"; # For rate counter (if not agent)
my($VMOID) = "1.3.6.1.4.1.6876"; # VMware MIB
my($UCDOID) = "1.3.6.1.4.1.2021.1000.10"; # where to find the agent plugin
my($SYSOID) = "1.3.6.1.2.1.1.1.0"; # system object to test SNMP working

# Plugin exit codes and their printable names.
my($OK,$WARNING,$CRITICAL,$UNKNOWN) = (0,1,2,3);
my(%VisibleStatus) = ($OK => "OK", $WARNING => "WARNING", $CRITICAL => "CRITICAL", $UNKNOWN => "UNKNOWN");

my($DEBUG) = 0;
my($TIMEOUT) = 5;
my($RETRIES) = 1;
my($SWAPINCRIT) = 2; # this many bps swap in is critical (else warn)
my($SWAPPCCRIT) = 4; # this % usage of swap is critical (else warn)
my($from,$to) = (0,99999);
my($snmp,$resp,$snmperr);
my($hostname) = '';
my($community) = 'public'; # Default community string
my($vhost) = '';

# Result registers: $A/$B are the two reported values, $MSG the text,
# @perf the performance-data items, $STATUS the exit status.
my($A, $B, $MSG) = ('U','U','');
my(@perf) = ();
my($STATUS) = $UNKNOWN;
my($MODE) = 0; # 0 = Nagios, 1 = MRTG
my($VMID) = -1; # set to -1 if not running
my($VMNO) = -1; # set to -1 if not defined
my($vmGuestState) = "notRunning";
my($warn,$crit) = (70,90); # usage warn/crit: 70/90 is virtualcentre default
my($rwarn,$rcrit) = (5,10); # cpu readytime warn/crit: VMWare say to crit at 5%
my(%lookup) = ();
my(%states) = ();
my(%tmpnet) = ();
my($fa,$sa,$fb,$sb);
my($esx_version) = 3;

# Package variables filled in by Getopt::Std::getopts().
our($opt_C, $opt_H, $opt_N, $opt_M, $opt_V, $opt_h, $opt_c, $opt_t, $opt_i,
    $opt_d, $opt_w, $opt_l, $opt_v, $opt_r, $opt_R);
61
62
# base(NAME)
# Return the first whitespace-delimited word of NAME, for compact reports.
# Returns '?' when NAME is undefined or empty, and NAME unchanged when it
# does not start with a non-whitespace character.
sub base($) {
    my($name) = @_;

    # Only a genuinely missing name becomes '?'; a guest called "0" is valid.
    return '?' if(!defined $name or $name eq '');
    return $1 if( $name =~ /^(\S+)/ );
    return $name;
}
67
68
# dohelp()
# Print the usage text to standard output and exit with status 0.
# This is the single maintained copy of the usage documentation.
sub dohelp {
    print <<"END_OF_HELP";
Usage: $0 [-h] [-V] [-d] -H host [-C community] [-N | -M [-r]]
 [-l check [-v vhost] [-i interface] [-w warn -c crit]]
 [-t timeout] [-R retries]
 -h: just prints this help message
 -V: just prints the script version number
 -d: puts the script into debug mode
 -H host: ESX server machine
 -C community: the SNMP community string (default is "public")
 -N: Nagios mode (the default); need -w and -c for CPU, MEM
 -M: MRTG mode (-r specifies rate rather than counter)
 -l check: can be CPU MEM STATE LIST NET LISTNET (default is LIST)
 -v virtualhost: restrict probing to that one guest host; required for STATE;
 if not specified, probes total ESX system statistics
 -i interface: Only valid for NET
 -w warn -c crit: Nagios thresholds
 -t timeout: ([1..60] seconds) for individual SNMP queries
 -R retries: # of retries ([0..20]) for individual SNMP queries

For MRTG,
 CPU is total seconds (counter) for vhost or total over all if no vhost given.
 MEM is memory remaining in K.
 STATE is 1 for up, 0 for down.
 LIST is number of vhosts.
 NET is network throughput in bytes for specified vhost and/or interface
 (total of all if not specified).

For Nagios, specify thresholds as follows.
 CPU is percentage of allocated CPU (for vhosts) and of total CPU (if no vhost).
 MEM is active memory (for vhosts) or free phys memory (if no vhost) in K or %.
 STATE is CRITICAL if vhost is down.
 LIST is WARN if some are down, CRIT if all vhosts are down.
 NET is bytes/sec since last check, if possible (otherwise UNKNOWN).

Thresholds for MEM or LIST under Nagios, can be in K or %
 e.g.: -l MEM -w 2048K -c 1024K
 e.g.: -l MEM -v vhost -w 80% -c 90%
 e.g.: -l LIST -w 90% -c 0
 e.g.: -l LIST -w 10 -c 1
Thresholds for CPU are in % (the trailing % symbol is optional)
 e.g.: -l CPU -w 80 -c 90
Thresholds for NET are in BYTES/SEC (cannot use %)
END_OF_HELP
    exit 0;
}
109
110
# readstate()
# Load saved "key=integer" pairs from $STATEFILE into %states.
# Best effort: returns silently when the file is unreadable or cannot be
# opened. Entries already in %states are kept unless the file overrides them.
sub readstate {
    return if(! -r $STATEFILE);
    open(my $fh, '<', $STATEFILE) or return;
    flock($fh, 1); # read lock
    while( my $line = <$fh> ) {
        # Keys embed the guest name, which may contain spaces, so the key is
        # everything up to the last '=' that is followed by digits.
        $states{$1} = $2 if( $line =~ /^(.+)=(\d+)/ );
    }
    flock($fh, 8); # unlock
    close($fh);
}
118
# writestate(KEY => VALUE, ...)
# Merge the given pairs into %states and rewrite $STATEFILE with the result.
# Best effort: returns silently if the file cannot be opened.
#
# NOTE: For further discussion of the possible race conditions and security
# holes here, see Programming Perl, 3/e, pp. 571-576.
#
# The file is opened once in read/append mode ('+>>'). That mode creates the
# file when it is missing, never truncates on open, and allows reading, so the
# same handle serves both the "file exists" and "file is new" cases. We hold
# the write lock for the whole read-merge-rewrite cycle, so data written by
# another process before we got the lock is read back and preserved rather
# than overwritten with stale values.
sub writestate {
    my(%new) = @_;

    open(my $fh, '+>>', $STATEFILE) or return;
    flock($fh, 2); # write lock

    # Append mode positions the handle at end-of-file; rewind before reading.
    seek($fh, 0, 0);
    while( my $line = <$fh> ) {
        # Same format as readstate(): the key may contain spaces.
        $states{$1} = $2 if( $line =~ /^(.+)=(\d+)/ );
    }

    seek($fh, 0, 0); # rewind
    truncate($fh, 0);
    foreach my $key ( keys %new ) { $states{$key} = $new{$key}; }
    foreach my $key ( sort keys %states ) {
        print {$fh} "$key=".$states{$key}."\n";
    }
    # The flock call implicitly flushes the file before releasing the lock,
    # so we don't need to do so explicitly here.
    flock($fh, 8); # unlock
    close($fh);
}
166
167
# dooutput()
# Print the result held in $A, $B, $MSG, @perf and $STATUS, then exit.
# Never returns.
#   MRTG mode ($MODE true): prints "$A\n$B\n\n$MSG\n" and exits 0.
#   Nagios mode: prints one status line and exits with $STATUS.
sub dooutput {
    if( $MODE ) {
        # MRTG
        $A = 'U' if(!defined $A);
        $B = $A if(!defined $B);
        $MSG = "Returned values: $A, $B\n" if(!$MSG);
        print "$A\n$B\n\n$MSG\n";
        exit 0;
    }

    # Nagios
    # Here we follow the format recommended in the Nagios plug-in development
    # guidelines, plus a convention that says the status should be right up
    # front in a readable form, so end-users can see it directly in contexts
    # that might not color-highlight the message. The one thing this doesn't
    # guarantee is that the message isn't so long that either it or the
    # appended performance data gets chopped off due to overall length
    # limitations in the Nagios command pipe.
    my $label = $VisibleStatus{$STATUS} || "UNKNOWN";
    my $perfdata = scalar(@perf) ? "|" . join(" ",@perf) : "";
    print "SERVICE STATUS: ", $label, ": ", $MSG, $perfdata, "\n";
    exit $STATUS;
}
188
189
# makesnmp()
# Open the Net::SNMP session to $hostname and store it in $snmp.
# On failure, reports UNKNOWN through dooutput(), which exits.
sub makesnmp() {
    ($snmp,$snmperr) = Net::SNMP->session(
        -hostname  => $hostname,
        -community => $community,
        -timeout   => $TIMEOUT,
        -retries   => $RETRIES,
    );

    # Also treat a missing session object as failure, so callers never
    # invoke methods on an undefined value.
    return if(defined $snmp and !$snmperr);

    $snmperr = "unable to create SNMP session" if(!$snmperr);
    $A = $B = 'U';
    print "($snmperr)\n" if($DEBUG);
    $MSG = "Error: $snmperr";
    $STATUS = $UNKNOWN;
    dooutput(); # exit
    exit(0);    # not reached
}
201
###########################################################################
202
# Read detailed memory and CPU data from extended snmp daemon, if possible
203
my(%stats) = ();
204
my($donereadagent) = 0;
205
# readagent()
# Fetch the "name=value" table published under $UCDOID.101 into %stats.
# Returns "" on success (or when already loaded) and 1 when the data is not
# available. $MSG is cleared first and set only if the value at $UCDOID.2.1
# is not 'vmware'; callers use an empty $MSG after a failure to mean
# "no agent, fall back to the plain VMware MIB".
sub readagent {
    return "" if($donereadagent);
    $MSG = "";
    makesnmp() if(!$snmp);

    my $idoid = "$UCDOID.2.1";
    $resp = $snmp->get_request( -varbindlist=>[$idoid] );
    return 1 if(!$resp); # Fall back to the old way

    my $ident = $resp->{$idoid};
    $ident = '' if(!defined $ident); # avoid comparing an undefined value
    if( $ident ne 'vmware' ) {
        $MSG = "Incorrect SNMPD configuration: found '".$ident."' when expected 'vmware'";
        $STATUS = $UNKNOWN;
        return 1;
    }

    $resp = $snmp->get_table( -baseoid=>"$UCDOID.101" );
    return 1 if(!$resp); # Fall back to the old way

    # Convert the retrieved values to lookup hash
    foreach my $oid ( keys %$resp ) {
        next unless( $oid =~ /\.101\.\d+$/ );
        next unless( defined $resp->{$oid} and $resp->{$oid} =~ /^(\S+)=(.*)$/ );
        $stats{$1} = $2;
    }
    $donereadagent = 1;
    return "";
}
232
233
# getesxversion()
# Set $esx_version to the major part of the value read from $VMOID.1.2.0.
# If that request fails, fall back to the agent data: with 'has-names' set,
# assume version 2; otherwise report UNKNOWN through dooutput() and exit.
sub getesxversion {
    print "(snmp lookup)\n" if($DEBUG);
    makesnmp() if(!$snmp);

    my $veroid = "$VMOID.1.2.0";
    $resp = $snmp->get_request( -varbindlist=>[ $veroid ] );
    if($resp) {
        my $version = $resp->{$veroid};
        # Keep the default if the agent returned no value.
        if(defined $version) {
            $version =~ s/\..*//; # keep only the major number
            $esx_version = $version;
        }
        return;
    }

    if(readagent()) {
        $MSG = "Error: No VMWare SNMP sub-agent running (vmware-snmpd)"
            if(!$MSG);
        $STATUS = $UNKNOWN;
        dooutput(); # exit
        exit(0);
    }
    if(!$stats{'has-names'}) {
        $MSG = "Error: No VMWare SNMP sub-agent running (vmware-snmpd)";
        $STATUS = $UNKNOWN;
        dooutput(); # exit
        exit(0);
    }
    $esx_version = 2; # just a blind assumption
}
257
258
# Read all the VM IDs from the vmware-snmpd MIB
259
# getvmid()
# Read all the VM IDs from the vmware-snmpd MIB into %lookup and, when $vhost
# is set, resolve it to $VMNO, $VMID and $vmGuestState.
#
# %lookup as filled from the SNMP table under $VMOID.2.1.1:
#   value of column 2 (used as the guest name) -> row index
#   row index                                  -> value of column 7 (stored in $VMID)
#   value of column 7                          -> value of column 2
#   "vmGuestState-<index>"                     -> value of column 8 (ESX 3 only)
# %lookup as filled from the agent fallback (%stats):
#   id -> name, name -> "vmno-<id>", "vmno-<id>" -> id
#
# $vhost is matched exactly first, then as a case-insensitive prefix of a
# guest name. Exits via dooutput() if no agent is available or the guest is
# not defined.
sub getvmid {
    my @names = (); # guest names only, so prefix matching ignores internal keys

    print "(snmp lookup)\n" if($DEBUG);

    makesnmp() if(!$snmp);
    $resp = $snmp->get_table( -baseoid=>"$VMOID.2.1.1");
    if(!$resp) {
        $resp = $snmp->get_request( -varbindlist=>[ "$VMOID.1.1.0" ] );
        if($resp) {
            print "No guests are defined on this server\n" if($DEBUG);
            $MSG = "No guests defined on this server";
            return;
        }
        if(readagent()) {
            $MSG = "Error: No VMWare SNMP sub-agent running (vmware-snmpd)"
                if(!$MSG);
            $STATUS = $UNKNOWN;
            dooutput(); # exit
            exit(0);
        }
        if(!$stats{'has-names'}) {
            $MSG = "Error: No VMWare SNMP sub-agent running (vmware-snmpd)";
            $STATUS = $UNKNOWN;
            dooutput(); # exit
            exit(0);
        }
        foreach my $stat ( keys %stats ) {
            next unless( $stat =~ /vhost-(\d+)-name/ );
            my $id = $1;
            my $name = $stats{$stat};
            $lookup{$id} = $name;         # id->name
            $lookup{$name} = "vmno-$id";  # name->dummyOID
            $lookup{"vmno-$id"} = $id;    # dummyOID->id
            push @names, $name;
        }
    } else {
        foreach my $oid ( keys %$resp ) {
            # Last two OID components: column, row index.
            next unless( $oid =~ /(\d+)\.(\d+)$/ );
            my($col,$idx) = ($1,$2);
            next unless( $col == 2 );
            my $name = $resp->{$oid};
            my $vmid = $resp->{"$VMOID.2.1.1.7.$idx"};
            $lookup{$name} = $idx;
            $lookup{$idx} = $vmid;
            $lookup{$vmid} = $name;
            if ( $esx_version == 3 ) {
                $lookup{"vmGuestState-$idx"} = $resp->{"$VMOID.2.1.1.8.$idx"};
            }
            push @names, $name;
        }
    }
    return if(!$vhost); # we're just getting the table

    my $matched = defined $lookup{$vhost} ? $vhost : undef;
    if(!defined $matched) {
        # lets see if they just gave part of the vhost name?
        # \Q..\E keeps characters such as '(' or '.' in the name literal;
        # sorting makes the choice repeatable when several guests match.
        foreach my $name ( sort grep { defined } @names ) {
            next unless( $name =~ /^\Q$vhost\E/i );
            $matched = $name;
            last;
        }
        if(!defined $matched) {
            $STATUS = $UNKNOWN;
            $MSG = "Virtual host $vhost is not defined!";
            dooutput(); # exit
            exit(0);
        }
    }

    $VMNO = $lookup{$matched};
    if( defined $lookup{$VMNO} ) {
        $VMID = $lookup{$VMNO};
        if ( defined $lookup{"vmGuestState-$VMNO"} ) {
            $vmGuestState = $lookup{"vmGuestState-$VMNO"};
        }
        $vhost = $matched; # report the full name from here on
    } else {
        $STATUS = $CRITICAL;
        $MSG = "Virtual host $vhost($VMNO) is not running!";
    }

    print "(hostno=$VMNO, ID=$VMID)\n" if($DEBUG);
}
347
348
# listvm()
# Build the guest list report. Resets %lookup, reads the guest table (or the
# agent fallback), then sets:
#   $A    = number of guests considered up
#   $B    = number of guests with a non-empty name
#   $MSG  = "VHosts: A/B up: name(state), ..."
#   @perf = up count and up percentage
#   $STATUS = $OK (the caller applies the thresholds)
# Exits via dooutput() when no agent is available or no guests are defined.
sub listvm {
    my(@vh) = ();

    %lookup = ();
    print "(snmp lookup)\n" if($DEBUG);
    makesnmp() if(!$snmp);
    $resp = $snmp->get_table( -baseoid=>"$VMOID.2.1.1");
    if(!$resp) {
        $resp = $snmp->get_request( -varbindlist=>[ "$VMOID.1.1.0" ] );
        if($resp) {
            $A = $B = 0;
            $MSG = "No VHosts are defined on this server";
            $STATUS = $OK;
            dooutput(); # exit
            exit(0);
        }
        if(readagent()) {
            $A = $B = 'U';
            $MSG = "Error: No VMWare SNMP sub-agent running (vmware-snmpd)"
                if(!$MSG);
            $STATUS = $UNKNOWN;
            dooutput(); # exit
            exit(0);
        }
        if(!$stats{'has-names'}) {
            $MSG = "Error: No VMWare SNMP sub-agent running (vmware-snmpd)";
            $STATUS = $UNKNOWN;
            dooutput(); # exit
            exit(0);
        }
        foreach my $stat ( keys %stats ) {
            next unless( $stat =~ /vhost-(\d+)-name/ );
            my $id = $1;
            $lookup{$id} = $stats{$stat};          # id->name
            $lookup{$stats{$stat}} = "vmno-$id";   # name->dummyOID
            $lookup{"vmno-$id"} = $id;             # dummyOID->id
            push @vh, $stats{$stat};
        }
    } else {
        foreach my $oid ( sort keys %$resp ) {
            # Last two OID components: column, row index.
            next unless( $oid =~ /(\d+)\.(\d+)$/ );
            my($col,$idx) = ($1,$2);
            if( $col == 2 ) {
                $lookup{$resp->{$oid}} = $idx;
                push @vh, $resp->{$oid};
            } elsif( $esx_version == 2 && $col == 7 ) {
                $lookup{$idx} = $resp->{$oid};
            } elsif( $esx_version == 3 && $col == 8 ) {
                $lookup{$idx} = $resp->{$oid};
            }
        }
    }

    $A = $B = 0;
    # $entry aliases the array element, so the list is rewritten in place.
    foreach my $entry ( @vh ) {
        next if(!$entry);
        $B++;
        my $state = $lookup{$lookup{$entry}};
        if ( $esx_version == 2 ) {
            if( defined $state and ($state > 0)) {
                $entry = (substr $entry,0,16)."(".$state.")"; $A++;
            } else {
                $entry = (substr $entry,0,16)."(DOWN)";
            }
        } else {
            # This logic is for ESX 3.
            # We don't want to truncate the vhostnames because the substrings
            # might not be unique.
            if( defined $state and ($state eq "running")) {
                $entry = $entry."(UP)"; $A++;
            } else {
                $entry = $entry."(DOWN)";
            }
        }
        # Drop a parenthesised suffix in the name itself, keeping our state.
        $entry =~ s/ *\([^\)]+\)(\(.*\))/$1/;
    }

    # Guard the percentage: $B is zero when no guest has a usable name.
    my $up_pc = $B ? int($A/$B*10000)/100.0 : 0;
    $MSG = "VHosts: $A/$B up: ".(join ", ",@vh);
    push @perf, "allvms_up_ct=$A;;;0;$B";
    push @perf, "allvms_up_pc=". $up_pc ."%;;;0;100";
    $STATUS = $OK;
}
425
426
# readnet()
# Read the table under $VMOID.3.4.1 into %tmpnet, keyed by row index:
#   [ column 3 value (compared against $VMID), column 2 value (compared
#     against -i), column 7 value * 1024, column 9 value * 1024 ]
# For LIST-type commands it stops there. Otherwise it sums the two scaled
# values into $A and $B over the rows matching $VMID (any if negative) and
# $opt_i (any if empty). Sets $MSG/$STATUS on error or when nothing matches.
sub readnet {
    my($found) = 0;

    makesnmp() if(!$snmp);
    $resp = $snmp->get_table( -baseoid=>"$VMOID.3.4.1");
    if(!$resp) {
        # Probe the same scalar instance that getvmid/listvm use, to tell
        # "agent present but nothing to report" from "no SNMP data at all".
        $resp = $snmp->get_request( -varbindlist=>[ "$VMOID.1.1.0" ] );
        if($resp) {
            $A = $B = 0;
            $MSG = "No VHosts defined"; $STATUS = $OK;
            return;
        }
        $MSG = "Error: Unable to retrieve SNMP data";
        $STATUS = $UNKNOWN;
        return;
    }
    foreach my $oid ( keys %$resp ) {
        next unless( $oid =~ /(\d+)\.(\d+)$/ ); # Type, index.
        my($col,$idx) = ($1,$2);
        next unless( $col == 3 );
        $tmpnet{$idx} = [
            $resp->{$oid},
            $resp->{"$VMOID.3.4.1.2.$idx"},
            ($resp->{"$VMOID.3.4.1.7.$idx"}*1024),
            ($resp->{"$VMOID.3.4.1.9.$idx"}*1024)
        ];
    }
    # Case-insensitive, like the command dispatch that calls us.
    return if($opt_l =~ /LIST/i);

    # We now have all the network statistics indexed by card or VMID
    $A = $B = 0;
    foreach my $idx ( keys %tmpnet ) {
        my $row = $tmpnet{$idx};
        next unless( ($VMID<0) or ($VMID == $row->[0]) ); # vm matches
        next unless( (!$opt_i) or ($opt_i eq $row->[1]) ); # net matches
        $A += $row->[2];
        $B += $row->[3];
        $found = 1;
    }
    if(!$found) {
        $MSG = "No network interfaces exist for ";
        $MSG .= "vhost $vhost" if($VMID>-1);
        $MSG .= " and " if($VMID>-1 and $opt_i);
        $MSG .= " interface $opt_i" if ($opt_i);
        $STATUS = $UNKNOWN;
    }
}
468
469
###########################################################################
470
# Read general memory and CPU data from vmware-snmpd
471
# This is what we do if we can't get the detailed information.
472
# readcpu()
# Fallback CPU reader using the plain VMware MIB ($VMOID.3.1.2.1.3.<id>).
# With $VMID >= 0 it reads that one counter; otherwise it sums the counters
# for every all-digit key of %lookup greater than 99.
# In Nagios mode, or MRTG mode with -r, the counter is converted to a
# percentage using the previous sample kept in the state file; the first
# poll (or a stale sample) yields UNKNOWN.
# Results are left in $A, $B, $MSG and $STATUS.
sub readcpu {
    my(@oids);
    my($then,$now,$previous);
    my $wantrate = ( !$MODE or $opt_r );

    $MSG = ""; $A = 0; $B = 0;
    if( $wantrate ) {
        readstate();
        $then = $states{"$hostname-CPU-$vhost-time"};
        $previous = $states{"$hostname-CPU-$vhost"};
        $now = time;
    }

    if( $VMID < 0 ) {
        @oids = map { "$VMOID.3.1.2.1.3.".$_ }
                grep { /^\d+$/ and $_>99 } keys %lookup;
    } else {
        @oids = ( "$VMOID.3.1.2.1.3.$VMID" );
    }
    if($DEBUG) { print "(retrieving $_)\n" foreach @oids; }

    $resp = $snmp->get_request( -varbindlist=>\@oids );
    if( $resp ) {
        if($VMID<0){
            $A = 0;
            foreach my $oid ( keys %$resp ) {
                $A += $resp->{$oid};
                print "$oid: ".$resp->{$oid}."\n" if($DEBUG);
            }
        } else {
            $A = $resp->{$oids[0]};
        }
        $B = 0;
    } else {
        $A = $B = 'U';
        if($VMID<0){
            $MSG = "Unable to retrieve CPU statistics for ESX server: ".$snmp->error;
        } else {
            $MSG = "Unable to retrieve CPU statistics for $vhost: ".$snmp->error;
        }
        $STATUS = $UNKNOWN;
    }
    return if($MSG); # error already reported above

    $MSG = "CPU has used $A seconds";
    $MSG .= " on $vhost" if($vhost);
    return if(!$wantrate);

    # Refresh the saved sample at most every 30 seconds.
    writestate( "$hostname-CPU-$vhost"=>$A, "$hostname-CPU-$vhost-time"=>$now )
        if(!$then or ($now-$then)>30);

    if(!$then or !$previous or ($then >= $now) or ( ($now-$then)>1000 ) ) {
        if($vhost) {
            $MSG = "No saved state for $vhost CPU time yet - wait for next poll.";
        } else {
            $MSG = "No saved state for ESX system CPU time yet - wait for next poll.";
        }
        $A = $B = "U";
        $STATUS = $UNKNOWN;
        return;
    }

    print "Usage: $A-$previous in $now-$then = ".($A-$previous)." in ".($now-$then) if($DEBUG);
    $A = int((($A - $previous)/($now - $then))*10000)/100;
    print " = $A\n" if($DEBUG);
    $B = 0;
    $MSG = "CPU usage is $A% ";
    $MSG .= "on $vhost" if($vhost);
    $MSG .= " (".($now-$then)."s average)";
    if($A>110 or $A<0) {
        # NOTE(review): $STATUS is left untouched here, so an implausible
        # reading is reported with the status set earlier - confirm intent.
        $B = $A = 0;
        $MSG = "Error reading CPU usage information."
    }
}
543
544
# readmem()
# Fallback memory reader using the plain VMware MIB.
#   Whole server ($VMID < 0): $A = memory free, $B = total physical present.
#   One guest: $B = VM memory max * 1024, $A = $B - VM memory used
#              (memory remaining).
# On SNMP failure sets $A = $B = 'U', $MSG and $STATUS = $UNKNOWN.
sub readmem {
    my($sizeoid,$useoid);

    if($VMID < 0) {
        $sizeoid = "$VMOID.3.2.1.0"; # Total physical present
        $useoid  = "$VMOID.3.2.3.0"; # Memory free
    } else {
        $sizeoid = "$VMOID.3.2.4.1.3.$VMID"; # VM memory max
        $useoid  = "$VMOID.3.2.4.1.4.$VMID"; # VM memory used
    }
    print "(retrieving $sizeoid,$useoid)\n" if($DEBUG);

    $resp = $snmp->get_request( -varbindlist=>[$sizeoid,$useoid] );
    if( !$resp ) {
        $A = $B = 'U';
        if($VMID<0) {
            $MSG = "Unable to retrieve memory statistics for ESX server: ".$snmp->error;
        } else {
            $MSG = "Unable to retrieve memory statistics for $vhost: ".$snmp->error;
        }
        $STATUS = $UNKNOWN;
        return;
    }

    $A = $resp->{$useoid};
    if($VMID < 0 ) {
        $B = $resp->{$sizeoid};
    } else {
        $B = $resp->{$sizeoid}*1024;
        $A = $B - $A; # memory remaining
    }
}
573
# readconsolemempc()
# Return console memory as a percentage of total physical memory, truncated
# to two decimals, or 'U' when the values cannot be read or the total is
# missing or zero.
sub readconsolemempc {
    my $totaloid   = "$VMOID.3.2.1.0"; # Total physical present (enterprises.vmware.vmwResources.vmwMemory.memSize.0)
    my $consoleoid = "$VMOID.3.2.2.0"; # Memory used by console (enterprises.vmware.vmwResources.vmwMemory.memCOS.0)

    print "(retrieving $totaloid,$consoleoid)\n" if($DEBUG);
    $resp = $snmp->get_request( -varbindlist=>[$totaloid,$consoleoid] );
    return 'U' if(!$resp);

    my($total,$console) = ($resp->{$totaloid}, $resp->{$consoleoid});
    # A missing, non-numeric or zero total would otherwise abort the plugin
    # with a division-by-zero error.
    return 'U' unless( defined $total and $total =~ /^\d+(?:\.\d+)?$/ and $total > 0 );
    return 'U' if(!defined $console);

    return int( $console / $total * 10000) / 100.0;
}
586
587
# readxcpu()
# CPU check using the extended agent data in %stats; falls back to readcpu()
# (and returns) when the agent is absent without error.
#   With $vhost: $A = used %, $B = ready % for that guest.
#   Without:     $A = system used %, $B = all-guests used %; every guest is
#                also checked individually in Nagios mode.
# Except for the fallback, this sub always ends in dooutput() and so exits.
sub readxcpu {
    my($k,$C);

    $MSG = ""; $A = 0; $B = 0; $STATUS = $OK;
    if( readagent() ) {
        print "(readagent failed: $MSG)\n" if($DEBUG);
        readcpu() if(!$MSG); # no vmware agent, no error
        return;
    }

    if($vhost) {
        if ( $esx_version <= 2 ) {
            $k = "vhost-$VMID";
        } else {
            # Find the agent's id for this guest by matching its name.
            my $vmid  = $lookup{$VMNO};
            my $guest = defined $vmid ? $lookup{$vmid} : undef;
            foreach my $key ( keys %stats ) {
                # Only "vhost-<n>-name" entries may match, so the captured
                # number is always the one from this key.
                next unless( $key =~ /vhost-(\d+)-name/ );
                my $id = $1;
                next unless( defined $guest and defined $stats{$key}
                    and $stats{$key} eq $guest );
                $k = "vhost-$id";
                last;
            }
        }
        if ( defined $k ) {
            $A = $stats{"$k-cpu-used-pc"};
            $B = $stats{"$k-cpu-ready-pc"};
        } else {
            $A = undef;
            $B = undef;
        }
        $C = $A;
    } else {
        $k = "sys";
        $A = $stats{"sys-cpu-used-pc"};
        $B = $stats{"allvms-cpu-used-pc"};
        $C = $A + $B if(defined $A and defined $B);
    }

    if(!defined $A or !defined $B) {
        $A=$B='U'; $MSG="Gathering statistics, please wait.";
        $STATUS = $UNKNOWN;
        # Fill in some dummy performance data anyway, to keep downstream
        # processes somewhat happy.
        if ($vhost) {
            push @perf, "vhost_cpu_used_pc=U%;;;0;100";
            push @perf, "vhost_cpu_ready_pc=U%;;;0;100";
        } else {
            push @perf, "sys_cpu_used_pc=U%;;;0;100";
            push @perf, "allvms_cpu_used_pc=U%;;;0;100";
            push @perf, "sys_cpu_ready_pc=U%;;;0;100";
        }
        dooutput(); exit 3;
    }

    if($vhost) {
        $MSG = "vhost CPU used=$A% ready=$B%";
        push @perf, "vhost_cpu_used_pc=$A%;;;0;100";
        push @perf, "vhost_cpu_ready_pc=$B%;;;0;100";
    } else {
        $MSG = "CPU used sys=$A% vhosts=$B% readytime=".$stats{'sys-cpu-ready-pc'}."%";
        push @perf, "sys_cpu_used_pc=$A%;;;0;100";
        push @perf, "allvms_cpu_used_pc=$B%;;;0;100";
        push @perf, "sys_cpu_ready_pc=".$stats{'sys-cpu-ready-pc'}."%;;;0;100";
    }

    # MRTG only
    if($MODE) { dooutput(); exit 0; }

    # Nagios only
    $crit =~ s/[^\d\.]//g; $warn =~ s/[^\d\.]//g;
    $crit = 100 if(!$crit); $warn = 100 if(!$warn);

    # Append one alert line to $MSG and raise $STATUS accordingly.
    my $alert = sub {
        my($level,$text) = @_;
        $MSG .= "<BR>" if($MSG);
        $MSG .= $text;
        if($level == $CRITICAL) {
            $STATUS = $CRITICAL;
        } elsif($STATUS < $CRITICAL) {
            $STATUS = $level;
        }
    };

    if( $C >= $crit ) {
        $alert->($CRITICAL, "CPU usage is CRITICAL ($C%)");
    } elsif( $C >= $warn ) {
        $alert->($WARNING, "CPU usage is WARNING ($C%)");
    }

    # Ready time
    my $ready = $stats{"$k-cpu-ready-pc"};
    if( defined $ready and $ready >= $rcrit ) {
        $alert->($CRITICAL, "Ready time is CRITICAL (".$ready."%)");
    } elsif( defined $ready and $ready >= $rwarn ) {
        $alert->($WARNING, "Ready time is WARNING (".$ready."%)");
    }

    if(!$vhost) { # check all vhosts
        foreach my $id ( sort keys %lookup ) {
            my $used = $stats{"vhost-$id-cpu-used-pc"};
            next if(!defined $used);
            my $label = "'".base($lookup{$id})."'";
            if( $used >= $crit ) {
                $alert->($CRITICAL, "$label CPU CRITICAL ($used%)");
            } elsif( $used >= $warn ) {
                $alert->($WARNING, "$label CPU WARNING ($used%)");
            }
            my $vready = $stats{"vhost-$id-cpu-ready-pc"};
            next if(!defined $vready);
            if( $vready >= $rcrit ) {
                $alert->($CRITICAL, "$label Ready time CRITICAL (".$vready."%)");
            } elsif( $vready >= $rwarn ) {
                $alert->($WARNING, "$label Ready time WARNING (".$vready."%)");
            }
        }
    }
    dooutput();
    exit 3; # not reached
}
699
# readxmem()
# Memory check using the extended agent data in %stats; falls back to
# readmem() (and returns) when the agent is absent without error.
#   With $vhost: $A = active memory, $B = maximum; alert when the active
#                percentage is >= $warn / $crit.
#   Without:     $A = free memory, $B = total; alert when the free
#                percentage is <= $warn / $crit.
# In Nagios mode it also reports the private/shared/balloon/swap split and
# alerts on ballooning and swapping. Except for the fallback, this sub always
# ends in dooutput() and so exits.
sub readxmem {
    my($pc,$max,$k);

    $MSG = ""; $A = 0; $B = 0;
    if( readagent() ) {
        print "(readagent failed: $MSG)\n" if($DEBUG);
        readmem() if(!$MSG); # no vmware agent, no error
        return;
    }

    if( $vhost ) {
        if ( $esx_version <= 2 ) {
            $k = "vhost-$VMID";
        } else {
            # for ESX Version 3
            my $vmid  = $lookup{$VMNO};
            my $guest = defined $vmid ? $lookup{$vmid} : undef;
            foreach my $key ( keys %stats ) {
                # Only "vhost-<n>-name" entries may match, so the captured
                # number is always the one from this key.
                next unless( $key =~ /vhost-(\d+)-name/ );
                my $id = $1;
                next unless( defined $guest and defined $stats{$key}
                    and $stats{$key} eq $guest );
                my $memVMID = $id - 1; # why this is off by one, we don't know, but it is
                $k = "unknown-$memVMID";
                last;
            }
        }
        if(defined $k) {
            $A = $stats{"$k-mem-active"};
            $B = $stats{"$k-mem-max"};
            $max = $stats{"$k-mem-max"};
        } else {
            ($A,$B,$max) = (undef,undef,undef);
        }
        # A zero maximum is treated like missing data, since it would
        # otherwise abort the plugin with a division-by-zero error.
        if(!defined $A or !defined $B or !$B) {
            $A=$B='U';
            $MSG="Please wait, data being gathered."; $STATUS=$UNKNOWN;
            push @perf, "vhost_mem_act_pc=U%;;;0;100";
            push @perf, "vhost_mem_pvt_pc=U%;;;0;100";
            push @perf, "vhost_mem_shr_pc=U%;;;0;100";
            push @perf, "vhost_mem_bal_pc=U%;;;0;100";
            push @perf, "vhost_mem_swp_pc=U%;;;0;100";
            dooutput(); exit 0;
        }
        $pc = int($A/$B*10000)/100.0;
        $MSG = "Memory active: ".int($A/1024000)."Mb ($pc%) [Total available ".int($B/1024000)."Mb]";
        push @perf, "vhost_mem_act_pc=$pc%;;;0;100";
        if($pc>=$crit) { $STATUS=$CRITICAL; $MSG = "CRIT: $MSG"; }
        elsif($pc>=$warn) { $STATUS=$WARNING; $MSG = "WARN: $MSG"; }
    } else {
        $k = "allvms";
        $A = $stats{'mem-free'};
        $B = $stats{'mem-total'};
        $max = $stats{"$k-mem-max"};
        if(!defined $A or !defined $B or !$B) {
            $A=$B='U';
            $MSG="Please wait, data being gathered."; $STATUS=$UNKNOWN;
            push @perf, "mem_free_pc=U%;;;0;100";
            if ( $esx_version == 3 ) {
                push @perf, "console_mem_pc=U%;;;0;100";
            }
            push @perf, "allvms_mem_pvt_pc=U%;;;0;100";
            push @perf, "allvms_mem_shr_pc=U%;;;0;100";
            push @perf, "allvms_mem_bal_pc=U%;;;0;100";
            push @perf, "allvms_mem_swp_pc=U%;;;0;100";
            dooutput(); exit 0;
        }
        $pc = int($A/$B*10000)/100.0;
        $MSG = "Memory free: ".int($A/1024000)."Mb ($pc%) [Total available ".int($B/1024000)."Mb]";
        push @perf, "mem_free_pc=$pc%;;;0;100";
        if ( $esx_version == 3 ) {
            my ($consolemempc) = readconsolemempc();
            $MSG .= " [Console=".$consolemempc."%]";
            push @perf, "console_mem_pc=" . $consolemempc . "%;;;0;100";
        }
        if($pc<=$crit) { $STATUS=$CRITICAL; $MSG = "CRIT: $MSG"; }
        elsif($pc<=$warn) { $STATUS=$WARNING; $MSG = "WARN: $MSG"; }
    }

    # MRTG
    if($MODE) { dooutput(); exit 0; }

    # Nagios
    if($max) {
        # Share of $max taken by each category, truncated to two decimals.
        my %share = map { $_ => int(10000*$stats{"$k-mem-$_"}/$max)/100.0 }
                    qw(private shared balloon swap);
        my $prefix = $vhost ? "vhost" : "allvms";

        $MSG .= "<BR>Memory split: pvt/shr/bal/swp = "
            .$share{private}."%/"
            .$share{shared}."%/"
            .$share{balloon}."%/"
            .$share{swap}."%";
        push @perf, $prefix."_mem_pvt_pc=" . $share{private} . "%;;;0;100";
        push @perf, $prefix."_mem_shr_pc=" . $share{shared}  . "%;;;0;100";
        push @perf, $prefix."_mem_bal_pc=" . $share{balloon} . "%;;;0;100";
        push @perf, $prefix."_mem_swp_pc=" . $share{swap}    . "%;;;0;100";

        if($stats{"$k-mem-balloon"}) {
            $pc = int(100000*$stats{"$k-mem-balloon"}/$max)/1000.0;
            if($pc>=25) {
                $MSG .= "<BR>CRIT: Balloon drivers in action! ($pc%)";
                $STATUS = $CRITICAL;
            } elsif($pc>=0.01) {
                $MSG .= "<BR>WARN: Balloon drivers in action! ($pc%)";
                $STATUS = $WARNING if($STATUS<$CRITICAL);
            }
        }
    }

    my $swapin = $stats{"$k-swap-in-bps"};
    if($swapin and $swapin>10) {
        # NOTE(review): this branch needs more than 10 while $SWAPINCRIT is 2,
        # so the WARN arm below cannot currently be reached - confirm which
        # threshold was intended.
        if($swapin>$SWAPINCRIT) {
            $MSG .= "<BR>CRIT: VMware swapping in action! (".$swapin."Bps)";
            $STATUS = $CRITICAL;
        } else {
            $MSG .= "<BR>WARN: VMware swapping is starting!";
            $STATUS = $WARNING if($STATUS<$CRITICAL);
        }
    } elsif($max and $stats{"$k-mem-swap"}) {
        $pc = int(100000*$stats{"$k-mem-swap"}/$max)/1000.0;
        if($pc>=$SWAPPCCRIT) {
            $MSG .= "<BR>CRIT: VMWare swap space in use! ($pc%)";
            $STATUS = $CRITICAL;
        } elsif($pc>=0.01) {
            $MSG .= "<BR>WARN: VMWare swap space in use! ($pc%)";
            $STATUS = $WARNING if($STATUS<$CRITICAL);
        }
    }

    dooutput();
    exit 3; # not reached
}
824
825
###########################################################################
826
# ---- Command-line handling ---------------------------------------------------
getopts('hrdNMVH:c:t:v:w:C:l:i:R:');
$hostname = $opt_H if($opt_H);
$vhost = $opt_v if($opt_v);
# Use "given", not "true": 0 is a legitimate threshold (e.g. -l LIST -c 0)
# and a legitimate retry count (-R 0).
$warn = $opt_w if(defined $opt_w and $opt_w ne '');
$crit = $opt_c if(defined $opt_c and $opt_c ne '');
$TIMEOUT = $opt_t if($opt_t);
$RETRIES = $opt_R if(defined $opt_R and $opt_R ne '');
$MODE = 1 if($opt_M);
$community = $opt_C if($opt_C);
$DEBUG = 1 if($opt_d);
dohelp() if($opt_h);

# -V: report the version through the normal output path, then exit.
if ($opt_V) {
    $MSG = "$0 $Version";
    dooutput();
    exit 0;
}

# A target host is mandatory.
if(!$hostname) {
    $MSG = "No ESX server hostname specified with -H";
    dooutput();
    exit 0;
}

# With no -l, default to the guest list report.
if( !$opt_l ) {
    # $MSG = "You need to specify a command with -l";
    # dooutput;
    # exit 0;
    $opt_l = "LIST";
}
855
getesxversion;
856
if( $opt_l =~ /LISTNET/i ) {
857
getvmid;
858
$MSG = "";
859
readnet;
860
if(!$MSG) {
861
my($tk);
862
foreach ( keys %tmpnet ) {
863
if(!$vhost or ($VMID eq $tmpnet{$_}[0]) ) {
864
$tk=$tmpnet{$_}[1];
865
next if($MSG=~/$tk/);
866
$MSG .= ', ' if($MSG);
867
# $MSG .= $lookup{$tmpnet{$_}[0]}."/" if(!$opt_v);
868
$MSG .= $tk;
869
}
870
}
871
$STATUS = $OK;
872
}
873
dooutput;
874
exit 0;
875
}
876
if( $opt_l =~ /LIST/i ) {
877
listvm;
878
if($warn =~ /(\d+)%/) {
879
$warn = $B * $1 / 100;
880
} elsif( $warn < 0 ) { $warn = $B - 1; }
881
if($crit =~ /(\d+)%/) {
882
$crit = $B * $1 / 100;
883
} elsif( $crit < 0 ) { $crit = 0; }
884
$STATUS = $WARNING if($A<=$warn); # If SOME are down
885
$STATUS = $CRITICAL if($A<=$crit); # If NONE are up
886
$STATUS = $OK if(!$B); # No guests at all
887
dooutput;
888
exit 3;
889
}
890
if( $opt_l !~ /NET|CPU|MEM|STAT/i ) {
891
$MSG = "Bad command $opt_l!";
892
dooutput;
893
exit 3;
894
}
895
if( $opt_l =~ /MEM|CPU|NET/ and !$MODE and ($crit<0 or $warn<0)) {
896
$MSG = "Invalid warn/critical thresholds for '$opt_l' (need -w and -c)";
897
dooutput;
898
exit 3;
899
}
900
901
902
# Now, we have host, vhost, community, and command
903
getvmid; # also opens SNMP object
904
if( $opt_l =~ /STAT/i ) {
905
if(!$vhost) {
906
$MSG = "No virtual hostname specified with -v";
907
dooutput;
908
exit 0;
909
}
910
if( ( $esx_version == 2 && $VMID < 0 ) || ( $esx_version == 3 && $vmGuestState ne "running" ) ) {
911
$STATUS = $CRITICAL; ($A,$B) = (0,0);
912
$MSG = "VHost $vhost is down or undefined.";
913
} else {
914
$STATUS = $OK; ($A,$B) = (1,1);
915
$MSG = "VHost $vhost is up (ID: $VMID)";
916
}
917
push @perf, "vhost_up=$A;;;0;1";
918
dooutput;
919
exit 0;
920
}
921
if($vhost and ( $esx_version == 2 && $VMID < 0 || $esx_version == 3 && $vmGuestState ne "running" )) {
922
$STATUS = $CRITICAL;
923
$MSG = "$vhost is not running." if(!$MSG);
924
if( $opt_l =~ /CPU/i ) {
925
# Fill in some dummy performance data anyway, to keep downstream processes happy.
926
push @perf, "vhost_cpu_used_pc=U%;;;0;100";
927
push @perf, "vhost_cpu_ready_pc=U%;;;0;100";
928
}
929
if( $opt_l =~ /MEM/i ) {
930
# Fill in some dummy performance data anyway, to keep downstream processes happy.
931
push @perf, "vhost_mem_act_pc=U%;;;0;100";
932
push @perf, "vhost_mem_pvt_pc=U%;;;0;100";
933
push @perf, "vhost_mem_shr_pc=U%;;;0;100";
934
push @perf, "vhost_mem_bal_pc=U%;;;0;100";
935
push @perf, "vhost_mem_swp_pc=U%;;;0;100";
936
}
937
if( $opt_l =~ /NET/i ) {
938
# Fill in some dummy performance data anyway, to keep downstream processes happy.
939
push @perf, "vhost_net_read=U;;;0";
940
push @perf, "vhost_net_write=U;;;0";
941
}
942
dooutput;
943
exit 0;
944
}
945
946
# Main dispatch: collect the figures for the requested command.
# Each branch clears $MSG before calling its reader; a non-empty $MSG after
# the reader returns is treated as an error and the branch skips its own
# message formatting.  The readings end up in $A and $B.
$STATUS = $OK;
if( $opt_l =~ /CPU/i ) {
	$MSG = "";
	readxcpu; # attempt to use extended MIB, else use VMWare MIB
} elsif( $opt_l =~ /NET/i ) {
	# $t1/$a1/$b1: timestamp and read/write counters saved by an earlier poll.
	# $t2: timestamp of this poll.
	my($t1,$t2,$a1,$b1);
	# Unit divisors for the human-readable message (binary K and M).
	my($KILO,$MEGA) = (1024, 1024*1024);
	$opt_i = "" if(!defined $opt_i);
	$vhost = "" if(!defined $vhost);
	# Perfdata labels are per-guest when a guest was named, else aggregate.
	my $pfx = $vhost ? "vhost" : "allvms";
	if( !$MODE or $opt_r ) {
		# A rate is wanted, so fetch the counters saved by the previous poll.
		readstate;
		$t1 = $states{"$hostname-NET-$vhost-$opt_i-time"};
		$a1 = $states{"$hostname-NET-$vhost-$opt_i-r"};
		$b1 = $states{"$hostname-NET-$vhost-$opt_i-w"};
		$t2 = time;
	}
	$MSG = "";
	readnet;
	if(!$MSG){ # IE, no errors
		$MSG = "Network counters Read=$A Write=$B";
		$MSG .= " on $vhost" if($vhost);
		if( $opt_i ) {
			if( $vhost ) { $MSG .= '/'; } else { $MSG .= ' on '; }
			$MSG .= $opt_i;
		}
		if( !$MODE or $opt_r ) {
			# Refresh the saved state only when there is none yet or it is
			# more than 30 seconds old.
			writestate( "$hostname-NET-$vhost-$opt_i-r"=>$A,
				"$hostname-NET-$vhost-$opt_i-w"=>$B,
				"$hostname-NET-$vhost-$opt_i-time"=>$t2 )
				if(!$t1 or ($t2-$t1)>30);
			# No usable baseline: nothing saved, saved counters all zero,
			# clock not advanced, sample older than an hour, or a counter
			# that went backwards (reset/wrap), which would otherwise give
			# a negative rate.
			if(!$t1 or (!$a1 and !$b1) or ($t1 >= $t2) or (($t2 - $t1)>3600)
				or (defined $a1 and $A < $a1)
				or (defined $b1 and $B < $b1)) {
				$MSG = "No saved state available yet - wait for next poll.";
				$A = $B = "U";
				$STATUS = $UNKNOWN;
				push @perf, "${pfx}_net_read=U;;;0";
				push @perf, "${pfx}_net_write=U;;;0";
			} else {
				# Turn the counter deltas into per-second averages.
				$A = ($A - $a1)/($t2 - $t1);
				$B = ($B - $b1)/($t2 - $t1);
				($fa,$sa,$fb,$sb) = ( $A, "", $B, "" );
				# Scale for display only.  K is 1024, so M is 1024*1024
				# (this used to be 1024*1000, which was inconsistent).
				if($fa >= $MEGA) { $fa /= $MEGA; $sa = 'M'; }
				elsif($fa >= $KILO) { $fa /= $KILO; $sa = 'K'; }
				if($fb >= $MEGA) { $fb /= $MEGA; $sb = 'M'; }
				elsif($fb >= $KILO) { $fb /= $KILO; $sb = 'K'; }
				# Truncate to two decimal places.
				$fa = int($fa * 100)/100; $fb = int($fb * 100)/100;
				$MSG = "Network traffic $fa ".$sa."B/s read, $fb ".$sb."B/s write ";
				$MSG .= "on $vhost" if($vhost);
				if( $opt_i ) {
					if( $vhost ) { $MSG .= '/'; } else { $MSG .= 'on '; }
					$MSG .= $opt_i;
				}
				$MSG .= " (".($t2-$t1)."s average)";
				# Perfdata carries the unscaled rates, two decimal places.
				push @perf, "${pfx}_net_read=" . (int(100*$A)/100.0) . ";;;0";
				push @perf, "${pfx}_net_write=" . (int(100*$B)/100.0) . ";;;0";
			}
		}
	}
} elsif( $opt_l =~ /MEM/i ) {
	my($pc,$tot,$av,$sfx);
	$MSG = "";
	readxmem;
	if(!$MSG) {
		# $A is reported as "free" and $B as "total available"; both are
		# shown in Kb, or in Mb once they exceed 2047.
		# Percentage free, truncated to two decimals.  A zero total would
		# otherwise abort the plugin with a division-by-zero error.
		$pc = ( $B > 0 ) ? int($A/$B*10000.0)/100.0 : 0;
		$sfx = "Kb"; $av = $A;
		if($av>2047) { $av = int($av/10.24)/100.0; $sfx="Mb"; }
		$av .= $sfx;
		$sfx = "Kb"; $tot = $B;
		if($tot>2047) { $tot = int($tot/10.24)/100.0; $sfx="Mb"; }
		$tot .= $sfx;
		$MSG = "Memory free: $av ($pc\%) [Total available $tot]" ;
		$MSG .= " on vhost $vhost" if($vhost);
	}
} else {
	$MSG = "Invalid command $opt_l";
	$STATUS = $UNKNOWN;
}
1035
1036
# Nagios mode only, and only when nothing has gone wrong so far: normalise
# the -w/-c thresholds and derive the final status from the readings.
if( !$MODE and $STATUS==$OK ) {
	# Set Nagios thresholds.  The loop variable aliases $warn and $crit in
	# turn, so each assignment updates the real threshold:
	#   "N%" (MEM only) -> that percentage of $B
	#   "NM"            -> N * 1024
	#   anything else   -> the first number found in the string
	for my $limit ( $warn, $crit ) {
		if( $opt_l=~/MEM/i and $limit =~ /([\d\.]+)%/ ) {
			$limit = $B * $1 / 100.0;
		}
		elsif( $limit =~ /([\d\.]+)M/i ) {
			$limit = $1 * 1024;
		}
		elsif( $limit =~ /([\d\.]+)/i ) {
			$limit = $1;
		}
	}

	# Classify the command once, with the same precedence as before:
	# MEM first, then CPU, then NET.
	my $kind = ( $opt_l =~ /MEM/i ) ? 'MEM'
		: ( $opt_l =~ /CPU/i ) ? 'CPU'
		: ( $opt_l =~ /NET/i ) ? 'NET'
		:                        'OTHER';

	if( $kind eq 'CPU' ) {
		# CPU alerts on the sum of both readings reaching a threshold.
		my $load = $A + $B;
		if( $load >= $crit )    { $STATUS = $CRITICAL; }
		elsif( $load >= $warn ) { $STATUS = $WARNING; }
	} elsif( $kind eq 'NET' ) {
		# NET alerts when either direction reaches a threshold.
		if( $A >= $crit or $B >= $crit )    { $STATUS = $CRITICAL; }
		elsif( $A >= $warn or $B >= $warn ) { $STATUS = $WARNING; }
	} else {
		# MEM (and any other command) alerts when $A falls to or below
		# a threshold.
		if( $A <= $crit )    { $STATUS = $CRITICAL; }
		elsif( $A <= $warn ) { $STATUS = $WARNING; }
	}
}

# Release the SNMP session, emit the result and finish.
$snmp->close;
dooutput;
exit 0;
1064