root/platform/nagios-plugins/check_ps @ 10651

Revision 10651, 13.0 kB (checked in by ihag, 6 years ago)

platform/nagios-plugins/check_ps: Simple Network Management Protocol Protocol (TM)

  • Property svn:executable set to *
  • Property svn:keywords set to Id
Line 
1#!/usr/bin/perl
# check_ps -- Nagios plugin that checks, via standard SNMP, whether a
#             specified process is alive on a remote host.
4# $Id$
5
6use strict;
7use Getopt::Long;
8use lib "/usr/local/nagios/libexec";  # for use 'utils.pm'
9use utils qw($TIMEOUT %ERRORS &print_revision &support &usage);
10use SNMP;
11
12###########################################################################
13# SNMPWrapper class
14package SNMPWrapper;
15use Fcntl ':flock';
16use Storable qw(nstore store_fd nstore_fd fd_retrieve freeze thaw dclone);
17
# Constructor.
#   $conf: hash reference kept under the 'conf' key.
#   $opt:  hash reference kept under the 'option' key; its key/value
#          pairs are passed as the argument list of SNMP::Session->new.
# Returns the blessed object, with the session stored under 'snmp'.
sub new {
    my($class, $conf, $opt) = @_;
    my $self = {
        'conf'   => $conf,
        'option' => $opt,
    };

    $self->{'snmp'} = SNMP::Session->new(%{$self->{'option'}});
    return bless($self, $class);
}
25
# Print "<program name> UNKNOWN: <message>" followed by a newline on
# standard output, then terminate with the UNKNOWN status taken from
# %utils::ERRORS (3 when that entry is missing or false).
sub __die {
    my $msg = shift;
    my $status = $utils::ERRORS{'UNKNOWN'} || 3;

    print $main::PROGNAME . " UNKNOWN: " . $msg . "\n";
    exit($status);
}
31
# gettable(@opt)
# Return the SNMP table, from the local cache when a fresh copy exists
# and from the network otherwise.
#   @opt: arguments handed unchanged to the session's gettable method.
# Returns whatever the cache or the SNMP query yields; the same value
# is also kept in $self->{'cached'}.
sub gettable {
    my($self, @opt) = @_;
    my $expire = $self->{'conf'}->{'expire'};

    # An expire of 0 means "do not use the cache".  Query directly so
    # that no cache file is opened: the cache directory is not created
    # in that mode, and going through fetch() would fail on open.
    if (defined($expire) and $expire <= 0) {
        return $self->{'cached'} = $self->{'snmp'}->gettable(@opt);
    }

    # getcache() fills $self->{'cached'} when it returns a true value.
    return $self->{'cached'} if $self->getcache();
    return $self->fetch(@opt);
}
38
# Accessor: return the value stored under the object's 'snmp' key.
sub session {
    my($self) = @_;
    return $self->{'snmp'};
}
43
# __file_transaction(\%conf)
# Open a file, lock it, and run a callback on the locked handle.
# Arguments:
#   $conf->{'file'}:  filename to open.  It may start with an open mode
#                     ('<', '>', '>>', '+<', '+>' or '+>>'); without one
#                     the file is opened read-only.
#   $conf->{'flag'}:  flag to pass to flock.
#   $conf->{'count'}: number of locking attempts.
#   $conf->{'name'}:  the name of this session. It appears in error
#                     messages (default: '__file_transaction').
#  Callback functions:
#   $conf->{'do'}:    code reference, executed after the lock succeeds.
#                     It receives the locked file handle.
#   $conf->{'retry'}: code reference, executed before locking is retried.
#                     When it returns TRUE, the transaction ends
#                     immediately; when it returns FALSE, locking is
#                     attempted again after one second.
#   $conf->{'on_error'}: optional code reference, executed when 'do'
#                     fails because the stored file is broken.
#
# returns:
#   the result of 'do', or the TRUE result of 'retry'.
#   Open failures, exhausted lock attempts and errors raised by 'do'
#   end the program through __die().
sub __file_transaction {
    my($conf) = @_;
    my $name = ($conf->{'name'} ||= '__file_transaction');
    my $left = $conf->{'count'};

    # Split the optional mode prefix from the path so that the path is
    # never interpreted by open() itself (three-argument open).
    my($mode, $path) = ('<', $conf->{'file'});
    if ($conf->{'file'} =~ /\A\s*(\+?(?:>>|>|<))\s*(.+?)\s*\z/s) {
        ($mode, $path) = ($1, $2);
    }

    while (1) {
        open(my $fh, $mode, $path)
            or __die("ERROR: ${name}: Open failed " .
                     "(Can't open($conf->{'file'}): $!)");

        if (flock($fh, $conf->{'flag'})) {
            # Only the callback runs inside eval, so an error is always
            # attributed to the callback and never mistaken for an
            # open or lock failure.
            my $ret = eval { $conf->{'do'}->($fh) };
            my $err = $@;
            close($fh);
            return $ret unless $err;

            if ($err =~ /Magic number checking on storable file failed/) {
                $conf->{'on_error'}->() if $conf->{'on_error'};
                __die("ERROR: ${name}: " .
                      "fd_retrieve failed. It may broken file ($err)");
            }
            __die("ERROR: ${name}: Unknown error occurred: $err");
        }

        # Lock not obtained: remember why before close() can alter $!.
        my $lock_err = "$!";
        close($fh);

        # '<= 0' also stops a missing or zero 'count' from looping forever.
        if (--$left <= 0) {
            __die("ERROR: ${name}: lock failed " . $conf->{'count'} .
                  " times (Can't flock: ${lock_err})");
        }

        my $ret = $conf->{'retry'}->();
        return $ret if $ret;
        sleep 1;
    }
}
98
# fetch(@opt)
# Query the table over SNMP while holding an exclusive lock on the
# cache file "<cachedir>/<host>", and store a successful result there.
#   @opt: arguments handed unchanged to the session's gettable method.
# Returns the query result (also kept in $self->{'cached'}).  When the
# lock cannot be taken, the result of a concurrent writer is read
# through getcache() instead.
sub fetch {
    my($self, @opt) = @_;
    my $conf = $self->{'conf'};
    my $host = $conf->{'host'};
    my $file = $conf->{'cachedir'} . "/" . $host;

    return(__file_transaction({
            'file'  => "+>> $file",
            'flag'  => LOCK_EX|LOCK_NB,
            'count' => 5,
            'name'  => 'snmpget and write to cache',
            'do'    => sub {
                my($fh) = @_;
                my $result = $self->{'cached'}
                           = $self->{'snmp'}->gettable(@opt);

                # Query first, write afterwards: a failed query must
                # neither destroy the previous cache content nor reach
                # store_fd, which croaks when given a non-reference.
                # Empty results are not stored, so the next run asks
                # the host again instead of replaying a failure.
                if (ref($result) eq 'HASH' and %$result) {
                    seek($fh, 0, 0);
                    truncate($fh, 0);
                    store_fd($result, $fh);
                }
                return($result);
            },
            'retry' => sub { return $self->getcache() },
            'on_error' => sub { unlink $file },
        }));
}
123
# getcache()
# Load the cached table from "<cachedir>/<host>" into $self->{'cached'}.
# Returns 0 without opening the file when it is missing, empty, or its
# modification time is 'expire' seconds old or more; otherwise returns
# the result of the locked read.
sub getcache {
    my($self) = @_;
    my $conf = $self->{'conf'};
    my $file = join("/", $conf->{'cachedir'}, $conf->{'host'});

    my @st = stat($file);
    return 0 unless @st > 0;          # stat failed: no cache file

    my($size, $mtime) = @st[7, 9];
    return 0 unless $size > 0;
    return 0 unless time - $mtime < $conf->{'expire'};

    my %transaction = (
        'file'  => $file,
        'flag'  => LOCK_SH|LOCK_NB,
        'count' => 5,
        'name'  => 'read from cache',
        'do'    => sub {
            my($fh) = @_;
            seek($fh, 0, 0);
            return($self->{'cached'} = fd_retrieve($fh));
        },
        'retry' => sub { return undef },
        'on_error' => sub { unlink $file },
    );
    return(__file_transaction(\%transaction));
}
150
151
152###########################################################################
153# main
package main;
our $PROGNAME;

#
# consts
#
($PROGNAME) = $0 =~ m{([^/]+)$};            # last path component of $0
my $REVISION     = '1.02';
my $snmp_version = '2c';
my $hrSWRunTable = '.1.3.6.1.2.1.25.4.2';   # HOST-RESOURCES-MIB::hrSWRunTable
my $hrSWRunEntry = '.1.3.6.1.2.1.25.4.2.1'; # HOST-RESOURCES-MIB::hrSWRunEntry
my $def_cachedir = "/tmp/" . $PROGNAME;     # default cache directory
my $def_expire   = 50;                      # default cache expiry, in seconds
167
168
169#
170# vars
171#
172my $conf = {};
173
174
175#
176# functions
177#
# Print the three-line command synopsis on standard output.
sub print_usage {
    my @lines = (
        "Usage: ${main::PROGNAME} -H <host> [-C community] " .
            "-w <warning> -c <critical>\n",
        "       [-p <process>] [-t <timeout>] [-x <cache_expire>]\n",
        "       [-d <cache_dir>]\n",
    );
    print join('', @lines);
}
184
# Print the revision line, the usage synopsis, and a description of
# every command line option together with its default value.
sub print_help {
    print_revision($main::PROGNAME, $REVISION);
    print_usage();

    my $option_text = <<"_HELP_";
-H, --hostname=HOST
   Name or IP address of host to check
-C, --community=community
   SNMPv2c community (default: public)
-w, --warning=INTEGER
   Number of processes which a WARNING status will result
-c, --critical=INTEGER
   Number of processes which a CRITICAL status will result
-p, --process=STRING
   Name of process for watch (default: crond)
-t, --timeout=INTEGER
   Seconds before the plugin times out (default: ${TIMEOUT})
-x, --expire=INTEGER
   Keep SNMP result as cache while specified seconds (default: ${def_expire})
   '-x 0' for don't use cache. (It cause increasing network load. Be careful!)
-d, --dir=STRING
   Cache directory (default: $def_cachedir)
_HELP_
    print $option_text;
}
208
# exit_on($code, $format, @args)
# Print "<program name> <code>: <formatted message>" and a newline,
# then exit with the status that %ERRORS holds for $code.
#   $code:   a key of %ERRORS; any other value is replaced by 'UNKNOWN'.
#   $format: printf format of the message.
#   @args:   values for the format.
sub exit_on {
    my($code, @msg) = @_;
    $code = 'UNKNOWN' unless exists $ERRORS{$code};

    my($format, @args) = @msg;
    print $main::PROGNAME . " " . $code . ": ";
    printf($format, @args);
    print "\n";
    exit $ERRORS{$code};
}
218
# parse_arg(\%conf)
# Parse @ARGV and fill the configuration hash with the keys 'host',
# 'warning', 'critical', 'community', 'process', 'timeout', 'expire'
# and 'cachedir'.  Creates the cache directory when caching is enabled
# and the directory does not exist yet.
# -V and -h print their text and exit with the OK status; every invalid
# or missing option ends the program through usage() or exit_on().
# Returns the hash reference that was passed in.
sub parse_arg {
    my $conf = shift;
    my($opt_V, $opt_h, $opt_w, $opt_c, $opt_H, $opt_C, $opt_p, $opt_t,
       $opt_x, $opt_d);

    Getopt::Long::Configure('bundling');
    # GetOptions returns false for unknown or malformed options; stop
    # instead of silently running with a partly parsed command line.
    GetOptions
        ("V|version"     => \$opt_V,
         "h|help"        => \$opt_h,
         "w|warning=s"   => \$opt_w,
         "c|critical=s"  => \$opt_c,
         "H|hostname=s"  => \$opt_H,
         "C|community=s" => \$opt_C,
         "p|process=s"   => \$opt_p,
         "t|timeout=s"   => \$opt_t,
         "x|expire=s"    => \$opt_x,
         "d|dir=s"       => \$opt_d,
        ) or usage("Invalid command line option\n");

    if ($opt_V) {
        print_revision($main::PROGNAME, $REVISION);
        exit $ERRORS{'OK'};
    }

    if ($opt_h) {
        print_help();
        exit $ERRORS{'OK'};
    }

    # mandatory options
    # \A and \z anchor the whole string; '$' would also accept a
    # trailing newline, and the host name becomes part of a file name.
    ($opt_H) || usage("Host name/address not specified\n");
    $conf->{'host'} = $1 if ($opt_H =~ /\A([-.A-Za-z0-9]+)\z/);
    ($conf->{'host'}) || usage("Invalid host: $opt_H\n");

    ($opt_w) || usage("Warning threshold not specified\n");
    $conf->{'warning'} = $1 if ($opt_w =~ /\A(\d+)\z/);
    ($conf->{'warning'}) || usage("Invalid warning threshold: $opt_w\n");

    ($opt_c) || usage("Critical threshold not specified\n");
    $conf->{'critical'} = $1 if ($opt_c =~ /\A(\d+)\z/);
    ($conf->{'critical'}) || usage("Invalid critical threshold: $opt_c\n");

    # optional options
    $conf->{'community'} = $opt_C || 'public';
    $conf->{'process'} = $opt_p || 'crond';

    # NOTE(review): the SNMP module documents the session 'Timeout' in
    # microseconds, so a factor of 100000 gives one tenth of the
    # requested seconds per attempt -- confirm whether that is
    # intentional headroom for retries before changing it.
    $conf->{'timeout'} = $TIMEOUT * 100000;
    if ($opt_t) {
        usage("Invalid timeout specified\n") unless ($opt_t =~ /\A(\d+)\z/);
        $conf->{'timeout'} = $1 * 100000;
    }

    $conf->{'expire'} = $def_expire;
    if (defined $opt_x) {
        usage("Invalid expire specified\n") unless ($opt_x =~ /\A(\d+)\z/);
        $conf->{'expire'} = $1 + 0;
    }

    $conf->{'cachedir'} = $opt_d || $def_cachedir;
    if (($conf->{'expire'} > 0) and (! -d $conf->{'cachedir'})) {
        unless (mkdir($conf->{'cachedir'})) {
            my $mkdir_error = "$!";
            # Another check started at the same moment may have created
            # the directory first; that is not an error.
            unless (-d $conf->{'cachedir'}) {
                # The system error text is passed as an argument so
                # that it is never interpreted as a printf format.
                exit_on('UNKNOWN', "Cannot create cachedir (mkdir: %s)",
                        $mkdir_error);
            }
        }
    }
    return $conf;
}
281
#
# Initialize
#

# Clear the environment variables that influence command lookup and
# shell start-up.
$ENV{'PATH'} = '';
$ENV{'BASH_ENV'} = '';
$ENV{'ENV'} = '';

parse_arg($conf);


#
# Polling
#

my $snmp = SNMPWrapper->new($conf,
                            { 'DestHost'  => $conf->{'host'},
                              'Community' => $conf->{'community'},
                              'Version'   => $snmp_version,
                              'Timeout'   => $conf->{'timeout'} });
my $result = $snmp->gettable($hrSWRunTable,
                             { 'columns'   => ['hrSWRunPath'],
                               'noindexes' => 1 });

# A failed query may yield no hash at all; dereferencing an undefined
# value would abort under 'use strict', so test the reference first.
unless (ref($result) eq 'HASH' and %$result) {
    exit_on('CRITICAL', 'ERROR: %s, (%s) in %s.',
                        $snmp->session->{'ErrorStr'},
                        @{$conf}{'process', 'host'});
}


# Count the table rows whose hrSWRunPath is exactly the watched process.
my $procs = grep {
    my $path = $result->{$_}->{'hrSWRunPath'};
    defined($path) and $path eq $conf->{'process'};
} keys %$result;

# The hash slices are written @{$conf}{...}: the older @{%$conf}{...}
# form is rejected at compile time by perl 5.22 and later.
if ($procs == 0) {
    exit_on('CRITICAL', "There is no process (%s) in %s.",
                        @{$conf}{'process', 'host'});
} elsif ($procs >= $conf->{'critical'}) {
    exit_on('CRITICAL', "There is %d process (>=%s), (%s) in %s.",
                        $procs, @{$conf}{'critical', 'process', 'host'});
} elsif ($procs >= $conf->{'warning'}) {
    exit_on('WARNING',  "There is %d process (>=%s), (%s) in %s.",
                        $procs, @{$conf}{'warning', 'process', 'host'});
}
exit_on('OK', "There is %d process (%s) in %s.",
              $procs, @{$conf}{'process', 'host'});
332
333__END__
334
335=head1 NAME
336
check_ps -- Nagios plugin that checks, via SNMP, whether a specified
            process is alive on a remote host.
339
340=head1 SYNOPSIS
341
342check_ps -H <host> [-C community] -w <warning> -c <critical>
343         [-p <process>] [-t <timeout>] [-x <cache_expire>]
344         [-d <cache_dir>]
345
346=head1 DESCRIPTION
347
check_ps collects remote process information via the SNMPv2
HOST-RESOURCES-MIB (OID: .1.3.6.1.2.1.25.4.2.1.4). If the number of
instances of the specified process is too high, or the process is not
running at all, it issues a WARNING or CRITICAL state.
352
check_ps aims to replace check_procs. check_procs is more powerful,
but it needs the 'check_nspr' program to be installed on the remote
systems.  Installing such an additional program on every remote
system is painful, especially when you have a lot of systems to watch.
357
check_ps depends only on standard SNMP MIBs, so you only need to
make sure that the remote system provides SNMP.
360
Additionally, check_ps stores the result of the SNMP query in a local
cache. This decreases the network load even when watching several
processes.
363
364The options are as follows:
365
366  -H, --hostname=HOST
367     Name or IP address of host to check
368  -C, --community=community
369     SNMPv2c community
370  -w, --warning=INTEGER
371     Number of processes which a WARNING status will result
372  -c, --critical=INTEGER
373     Number of processes which a CRITICAL status will result
  -p, --process=STRING
     Name of the process to watch. It must match exactly.
376  -t, --timeout=INTEGER
377     Seconds before the plugin times out.
  -x, --expire=INTEGER
     Keep the SNMP result in the cache for the specified number of
     seconds. Use '-x 0' to disable the cache. (This increases the
     network load, especially when you are watching multiple remote
     processes. Be careful!)
382  -d, --dir=STRING
383     Cache directory.
384
385=head1 HOW TO DEPLOY IN YOUR ENVIRONMENT
386
387  1. Make sure that net-snmpd is installed in remote system.
388
  2. Determine the name of the process that you want to watch.
     e.g. If you want to watch the sshd process, type as follows:
391
392     % snmpwalk -v2c -c YOUR_SNMP_COMMUNITY_NAME HOST \
393       .1.3.6.1.2.1.25.4.2.1.4 | grep sshd
394     HOST-RESOURCES-MIB::hrSWRunPath.2056 = STRING: "/usr/sbin/sshd"
395     
     In this case, "/usr/sbin/sshd" is the string to pass to the -p
     option.
398
399  3. Dry run.
400     
401     # su - nagios
402     # ./check_ps -C YOUR_SNMP_COMMUNITY_NAME -H HOST
403                     -w 2 -c 10 -p /usr/sbin/sshd
404     check_ps OK: There is 1 process (/usr/sbin/sshd) in HOST.
405
406  4. Modify the nagios configuration file.
407     e.g.
408     
409     define command {
410       command_name check_sshd_proc
411       command_line $USER1$/check_ps -H $HOSTADDRESS$ \
412                    -C YOUR_SNMP_COMMUNITY -w $ARG1$ -c $ARG2$ \
413                    -p "/usr/sbin/sshd"
414     }
415     
416     Please remove backslashes on the end of line, and write it as
417     one line.
418     
419     Then, define services that use above command.
420
421  5. Reload nagios. Enjoy!
422
423=head1 REQUIREMENT
424
425=over 4
426
427=item * Perl 5
428
429=item * SNMP module version 5.0301 or later.
430
431=item * Nagios 3.0 or later.
432
433=back
434
435=head1 COPYRIGHT AND LICENSE
436
437AS IS.
Note: See TracBrowser for help on using the browser.