Common monitoring checks usable across multiple services or
node types.
For an overview of the monitoring check framework see
import datetime
import os
import numbers
import subprocess
import re
import random
import time
from accre.monitor import monitor_command
from accre.util import interpret_string_values, filehash
from accre.config import get_config
# Configuration returned by accre.config.get_config(); checks in this file
# look up external plugin paths under its 'monitor' section.
CONFIG = get_config()
def loadavg(opts):
    """
    Check the load average (1, 5, 15 minutes) of the server.

    An option of 'warning' and/or 'critical' may be given with
    a list of 1,5,15 min averages above which the check will return
    a warning or critical status, or just a single number which will
    return warning or critical if any of the three are above the
    specified value.

    If a cpuscaling option is given with a value of true, then
    the limits for critical and warning will be multiplied by
    the number of logical cpu cores on the node.

    Returns a tuple of (status, load averages, short description,
    long description).
    """
    opts = interpret_string_values(opts)
    data = os.getloadavg()
    cpus = os.cpu_count()

    # The option may be a string ('true') or may already have been turned
    # into a bool by interpret_string_values (helper not visible here), so
    # normalise through str() before testing the first letter.
    cpuscaling = str(opts.get('cpuscaling', '')).lower().startswith('t')
    # os.cpu_count() can return None; do not scale in that case.
    scale = cpus if (cpuscaling and cpus) else 1

    def exceeded(limits):
        """Return True if any load average is above its (scaled) limit."""
        if limits is None:
            return False
        if isinstance(limits, numbers.Number):
            limits = [limits] * 3
        # zip() ignores surplus limits instead of indexing past the
        # three available load averages.
        return any(
            load > limit * scale for load, limit in zip(data, limits)
        )

    status = 'OK'
    if exceeded(opts.get('warning')):
        status = 'WARNING'
    # Critical is evaluated last so that it overrides a warning.
    if exceeded(opts.get('critical')):
        status = 'CRITICAL'

    short_desc = (
        '({0:.1f}, {1:.1f}, {2:.1f}) [{3} logical cores]'
        .format(data[0], data[1], data[2], cpus)
    )
    long_desc = (
        '1min: {0}\n5min: {1}\n15min: {2}'
        .format(data[0], data[1], data[2])
    )
    return status, data, short_desc, long_desc
def diskusage(opts):
    """
    Check the used space on a mounted volume (default /)

    Accepts an option of mountpoint to check a volume on a different
    specified directory, and warning/critical options to alert on
    a fraction of total space used.

    Returns a tuple of (status, data, short description) where data
    holds the mountpoint, total size and available size in bytes.
    """
    opts = interpret_string_values(opts)
    mountpoint = opts.get('mountpoint', '/')
    stats = os.statvfs(mountpoint)
    blocksize = stats.f_frsize
    total = stats.f_blocks * blocksize
    avail = stats.f_bavail * blocksize
    used = total - avail
    # A volume reporting zero blocks would otherwise raise
    # ZeroDivisionError; treat it as 0% used.
    fraction = used / total if total else 0.0

    status = 'OK'
    if 'warning' in opts and fraction > opts['warning']:
        status = 'WARNING'
    # Critical is evaluated last so that it overrides a warning.
    if 'critical' in opts and fraction > opts['critical']:
        status = 'CRITICAL'

    data = {
        'mountpoint': mountpoint,
        'total_size': total,
        'available_size': avail,
    }
    short_desc = (
        '{0} {1:.1f}% used ({2:.1f}GB total)'
        .format(mountpoint, fraction * 100, total / 1024**3)
    )
    return status, data, short_desc
def certexpiry(opts):
    """
    Check the certificate file specified by the cert option (required).

    By default, report critical if the certificate is expired or
    will expire in the next three days, and warning if it will expire
    in the next 14 days.  These limits may be changed by critical
    and warning options.

    Returns (status, data, short text) on a completed check, where data
    maps the keys printed by openssl (dates, fingerprint, subject) to
    their values, or ('UNKNOWN', message) if the check could not be
    carried out.
    """
    opts = interpret_string_values(opts)
    if 'cert' not in opts:
        return 'UNKNOWN', 'Required cert option was not provided'
    warning = opts.get('warning', 14)
    critical = opts.get('critical', 3)

    proc = subprocess.Popen(
        [
            '/usr/bin/env', 'openssl', 'x509', '-in', opts['cert'],
            '-noout', '-dates', '-fingerprint', '-subject'
        ],
        stdout=subprocess.PIPE, stderr=subprocess.PIPE
    )
    try:
        stdout, stderr = proc.communicate(timeout=60)
    except subprocess.TimeoutExpired:
        # Do not leave the child running; reap it, then let the
        # timeout propagate as before.
        proc.kill()
        proc.communicate()
        raise
    if proc.returncode != 0:
        return 'UNKNOWN', 'openssl did not return 0: {0}'.format(
            stderr.decode('utf-8', errors='replace').strip()
        )

    data = {}
    for line in stdout.decode('utf-8').splitlines():
        # Split on the first '=' only so that values which themselves
        # contain '=' (such as the subject) are kept intact.
        key, _, value = line.partition('=')
        data[key] = value

    try:
        expiry = datetime.datetime.strptime(
            data['notAfter'],
            '%b %d %H:%M:%S %Y %Z'
        )
    except (KeyError, ValueError):
        return 'UNKNOWN', 'Could not determine notAfter date from openssl output'

    # strptime yields a naive datetime here, so compare against a naive
    # UTC "now".
    now = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
    remaining = (expiry - now).days

    status = 'OK'
    if remaining < warning:
        status = 'WARNING'
    if remaining < critical:
        status = 'CRITICAL'

    if remaining >= 0:
        short_text = 'Valid for {0} more days'.format(remaining)
    else:
        short_text = 'Expired {0} days ago'.format(-remaining)
    return status, data, short_text
def checkssh(opts):
    """
    Try to connect to the specified SSH server and port

    checkssh [--verbose level] [--port port] [--timeout dt] [--use-ipv4 | --use-ipv6] --host server

    --host server      Server to probe
    --port port        Port to use.  Default is 22.
    --timeout dt       How long to wait for a response in seconds.  Default is 10.
    --use-ipv4         Use IPv4 addresses
    --use-ipv6         Use IPv6 addresses
    --verbose level    Verbosity level to get extra information.  Default is 0.

    Additional data returned:
        error      Error message
        host       Server name
        port       Port used
        time       Time(seconds) to process the check
        timeout    Max wait (seconds)
        version    SSH server version string returned

    Returns a tuple of (status, data, short description).
    """
    # Get the options
    verbose = int(opts.get('verbose', '0'))
    host = opts.get('host', 'localhost')
    # Popen only accepts string arguments, so coerce numeric options.
    port = str(opts.get('port', '22'))
    timeout = str(opts.get('timeout', '10'))

    # Form the base command
    sshcmd = CONFIG['monitor'].get('checkssh_path', '/usr/local/nagios/libexec/check_ssh')
    cmd = [sshcmd, '-p', port, '-t', timeout, host]

    # Insert the address family if specified
    if 'use-ipv4' in opts:
        cmd.insert(1, '-4')
    elif 'use-ipv6' in opts:
        cmd.insert(1, '-6')

    if verbose > 0:
        print('Command: {0}'.format(' '.join(cmd)))

    proc = subprocess.Popen(
        cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE
    )
    try:
        stdout, stderr = proc.communicate(timeout=60)
    except subprocess.TimeoutExpired:
        # Do not leave the plugin running; reap it, then let the
        # timeout propagate as before.
        proc.kill()
        proc.communicate()
        raise

    # Use the return code to set the status.  NAGIOS_RETURN_MAPPING is
    # expected to be defined at module level; return codes it does not
    # know about are reported as UNKNOWN.
    try:
        status = NAGIOS_RETURN_MAPPING[proc.returncode]
    except (KeyError, IndexError):
        status = 'UNKNOWN'

    output = stdout.decode('utf-8', errors='replace')
    if verbose > 0:
        print('Return code: {0} ({1})'.format(status, proc.returncode))
        print('Command output: {0}'.format(output))

    # Text before the first '|' is the result message; anything after it
    # is the info section parsed below.
    result, _, info = output.partition('|')

    if verbose > 1:
        print('result={0} info={1}'.format(result.strip(), info.strip()))

    # Parse the info for the time
    match = re.search(r"time=([0-9.]+)s;.*", info)
    dt = match.group(1) if match else '0'

    # Munge the return variables based on the check's return code
    if proc.returncode == 0:
        # Everything after the first '-' is the SSH version text
        short_desc = result.partition('-')[2].strip()
        version = short_desc
        errormsg = ''
    else:
        # The 1st line is the error message (empty if there was no output)
        lines = result.splitlines()
        short_desc = lines[0] if lines else ''
        errormsg = short_desc
        version = ''

    data = {
        'host': host,
        'port': port,
        'version': version,
        'error': errormsg,
        'time': dt,
        'timeout': timeout,
    }
    return status, data, short_desc
def checkping(opts):
    """
    Ping the given server and decide if it's suitably alive based on the
    supplied latency and packet loss values.

    checkping [--verbose level] [--packets n] [--timeout dt] [--use-ipv4 | --use-ipv6] --warn latency,loss% --critical latency,loss% --host server

    --critical latency,loss%   CRITICAL if latency(ms) or the percent lost packets is greater than provided
    --host server              Server to ping
    --packets n                Number of packets to send.  Default is 5.
    --timeout dt               How long to wait for a response in seconds.  Default is 10.
    --use-ipv4                 Use IPv4 addresses
    --use-ipv6                 Use IPv6 addresses
    --verbose level            Verbosity level to get extra information.  Default is 0.
    --warn latency,loss%       WARN if latency(ms) or the percent lost packets is greater than provided

    Additional data returned:
        critical_latency   Critical latency specified (ms)
        critical_percent   Critical lost packets percent
        error              Error message
        host               Server name
        packets            Packets sent
        packet_loss        Percentage of packets lost
        rta                Ping round trip time average (ms)
        summary            Test summary information
        timeout            Max wait (seconds)
        warn_latency       Warning latency specified (ms)
        warn_percent       Warning lost packets percent

    Returns a tuple of (status, data, short description).
    """
    # Get the options
    verbose = int(opts.get('verbose', '0'))
    host = opts.get('host', 'localhost')
    # Popen only accepts string arguments, so coerce numeric options.
    packets = str(opts.get('packets', '5'))
    timeout = str(opts.get('timeout', '10'))
    critical = opts.get('critical', '5000,100%')
    warn = opts.get('warn', '3000,80%')

    # Form the base command
    pingcmd = CONFIG['monitor'].get('checkping_path', '/usr/local/nagios/libexec/check_ping')
    cmd = [pingcmd, '-p', packets, '-t', timeout, '-c', critical, '-w', warn, '-H', host]

    # Insert the address family if specified
    if 'use-ipv4' in opts:
        cmd.insert(1, '-4')
    elif 'use-ipv6' in opts:
        cmd.insert(1, '-6')

    if verbose > 0:
        print('Command: {0}'.format(' '.join(cmd)))

    proc = subprocess.Popen(
        cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE
    )
    try:
        stdout, stderr = proc.communicate(timeout=60)
    except subprocess.TimeoutExpired:
        # Do not leave the plugin running; reap it, then let the
        # timeout propagate as before.
        proc.kill()
        proc.communicate()
        raise

    # Use the return code to set the status.  NAGIOS_RETURN_MAPPING is
    # expected to be defined at module level; return codes it does not
    # know about are reported as UNKNOWN.
    try:
        status = NAGIOS_RETURN_MAPPING[proc.returncode]
    except (KeyError, IndexError):
        status = 'UNKNOWN'

    # Split "latency,loss%" on the first comma; partition never raises,
    # so a malformed threshold no longer crashes the check here.
    clat, _, cpct = critical.partition(',')
    wlat, _, wpct = warn.partition(',')

    output = stdout.decode('utf-8', errors='replace')
    if verbose > 0:
        print('Return code: {0} ({1})'.format(status, proc.returncode))
        print('Command output: {0}'.format(output))

    # Text before the first '|' is the result message; anything after it
    # is the info section parsed below.
    result, _, info = output.partition('|')

    if verbose > 1:
        print('result={0} info={1}'.format(result.strip(), info.strip()))

    # Parse the info for the RTA and packet loss percent; fall back to
    # the "no answer" defaults when they cannot be found.
    match = re.search(r"rta=([0-9.]+)ms.*pl=([0-9]+)%", info)
    if match:
        rta = match.group(1)
        packet_loss = match.group(2)
    else:
        rta = '0'
        packet_loss = '100'

    if 'Packet loss' in result:
        # Everything after the first '-' is the ping summary
        short_desc = result.partition('-')[2].strip()
        summary = short_desc
        errormsg = ''
    else:
        # The 1st line is the error message (empty if there was no output)
        lines = result.splitlines()
        short_desc = lines[0] if lines else ''
        errormsg = short_desc
        summary = ''

    data = {
        'critical_latency': clat,
        'critical_percent': cpct,
        'host': host,
        'packets': packets,
        'packet_loss': packet_loss,
        'summary': summary,
        'error': errormsg,
        'rta': rta,
        'timeout': timeout,
        'warn_latency': wlat,
        'warn_percent': wpct,
    }
    return status, data, short_desc
def checksum(opts):
    """
    Efficiently calculates the checksum of the target file.

    --file    Target file
    --hash    Hash function (md5, sha1, sha224, sha256, sha384, sha512)

    Returns ('UNKNOWN', message) when a required option is missing,
    ('CRITICAL', data) with an 'error' entry when the checksum could not
    be computed, and ('OK', data) with the file, algorithm and checksum
    otherwise.
    """
    if 'file' not in opts:
        return 'UNKNOWN', 'Target file required'
    if 'hash' not in opts:
        return 'UNKNOWN', 'Missing hash function'

    target = opts['file']
    algorithm = opts['hash']
    try:
        digest = filehash(target, algorithm)
    except ValueError as e:
        # presumably raised by filehash for an unsupported hash name;
        # verify against accre.util.filehash
        return 'CRITICAL', {'error': str(e)}
    except IOError as e:
        data = {
            'error': 'Unable to access file {0}'.format(target),
            'exception': str(e),
        }
        return 'CRITICAL', data

    data = {
        'file': target,
        'algorithm': algorithm,
        'checksum': digest,
    }
    return 'OK', data
def checkread(opts):
    """
    Check that it is possible to read and calculate a checksum of a randomly
    selected file in the specified directory.

    --dir    Target directory

    Returns ('UNKNOWN', message) when the dir option is missing,
    ('CRITICAL', data) when the directory cannot be listed or contains no
    regular files, and otherwise the status and data produced by running
    the sha256 checksum check on the selected file.
    """
    if 'dir' not in opts:
        return 'UNKNOWN', 'Target directory required'
    tdir = opts['dir']

    try:
        # Get a list of all the files (not directories) in the target directory
        candidates = (os.path.join(tdir, name) for name in os.listdir(tdir))
        fileslist = [path for path in candidates if os.path.isfile(path)]
    except OSError as e:
        data = {
            'error': 'Unable to access',
            'suggestion': 'Check that the target directory exists',
            'exception': str(e),
        }
        return 'CRITICAL', data

    try:
        # Randomly select a file
        tfile = random.choice(fileslist)
    except IndexError as e:
        # random.choice raises IndexError (not ValueError) on an empty
        # sequence.
        data = {
            'error': 'No files in target directory',
            'suggestion': 'Check that files are present in the target directory',
            'exception': str(e),
        }
        return 'CRITICAL', data

    # Propagate the checksum status so that an unreadable file is not
    # reported as OK.
    result = checksum({'file': tfile, 'hash': 'sha256'})
    return result[0], result[1]
def accresystemhealth(opts):
    """
    Checks the general system (not-hardware) health of an ACCRE server

    This check is an omnibus check for commonly found system problems
    that can apply to any standard, CFE-managed node on the ACCRE
    infrastructure. The following things will be checked:

    1. System drive (/) block space, warn if <20%, crit if <10%
    2. System drive (/) free inodes, warn if <20%, crit if <10%
    3. Systemd init process memory usage, warn if >50M, crit if
       >100M
    4. Recent completion of a CFE run, warn if >15mins, crit if
       >30mins

    These thresholds are not currently configurable; opts is unused.

    Returns a tuple of (status, data, message text).
    """
    warn = False
    crit = False
    data = {}
    msgs = []

    statvfs = os.statvfs('/')

    # check disk space
    if statvfs.f_blocks:
        bfree = statvfs.f_bavail / statvfs.f_blocks
        data['root_block_pct_free'] = round(bfree*100, 3)
        if bfree < 0.2:
            warn = True
            msgs.append('root filesystem space low')
        if bfree < 0.1:
            crit = True
    else:
        # A zero block total would raise ZeroDivisionError; report no
        # figure instead.
        data['root_block_pct_free'] = None

    # check disk inodes
    if statvfs.f_files:
        ffree = statvfs.f_favail / statvfs.f_files
        data['root_inode_pct_free'] = round(ffree*100, 3)
        if ffree < 0.2:
            warn = True
            msgs.append('root filesystem inodes low')
        if ffree < 0.1:
            crit = True
    else:
        # A zero inode total would raise ZeroDivisionError; report no
        # figure instead.
        data['root_inode_pct_free'] = None

    # check systemd memory usage (rss of pid 1 as printed by ps)
    arglist = ['ps', '-p', '1', '--no-headers', '-o', 'rss']
    rss = None
    returncode = None
    try:
        proc = subprocess.Popen(
            arglist, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
        )
    except OSError:
        # ps could not be started at all
        proc = None
    if proc is not None:
        try:
            stdout, stderr = proc.communicate(timeout=10)
            rss = int(stdout.strip())
        except subprocess.TimeoutExpired:
            # Do not leave ps running; reap it before continuing.
            proc.kill()
            proc.communicate()
        except ValueError:
            rss = None
        returncode = proc.returncode

    if returncode != 0 or rss is None:
        crit = True
        data['systemd_rss_usage'] = 'unknown'
        msgs.append('problem running ps')
    else:
        data['systemd_rss_usage'] = rss
        if rss > 100000:
            crit = True
            msgs.append('systemd rss usage high')
        elif rss > 50000:
            warn = True
            msgs.append('systemd rss usage high')

    # check CFE last run
    now = time.time()
    try:
        last_cfe = os.path.getmtime('/var/cfengine/promise_summary.log')
    except OSError:
        # 0 marks "could not determine" below
        last_cfe = 0
    mins = round((now - last_cfe) / 60, 2)
    data['mins_since_last_cfe'] = mins
    if last_cfe == 0:
        crit = True
        msgs.append('could not determine time of last CFE run')
    elif mins > 30:
        crit = True
        msgs.append('CFE run not recently completed')
    elif mins > 15:
        msgs.append('CFE run not recently completed')
        warn = True

    if crit:
        status = 'CRITICAL'
    elif warn:
        status = 'WARNING'
    else:
        status = 'OK'

    if not msgs:
        msgs.append('no issues found')
    pretext = ', '.join(msgs)
    return status, data, pretext