Source code for accre.monitor_checks.common

"""
Common monitoring checks usable across multiple services or
node types.

For an overview of the monitoring check framework see
:mod:`accre.monitor`.
"""
import datetime
import os
import numbers
import subprocess
import re
import random
import time

from accre.monitor import monitor_command
from accre.util import interpret_string_values, filehash
from accre.config import get_config


CONFIG = get_config()
NAGIOS_RETURN_MAPPING = ('OK', 'WARNING', 'CRITICAL', 'UNKNOWN')


@monitor_command
def loadavg(opts):
    """
    Check the load average (1, 5, 15 minutes) of the server.

    An option of 'warning' and/or 'critical' may be given with a list
    of 1, 5, 15 min averages above which the check will return a
    warning or critical status, or just a single number which will
    return warning or critical if any of the three are above the
    specified value.

    If a cpuscaling option is given with a value of true, then the
    limits for critical and warning will be multiplied by the number
    of logical cpu cores on the node (counting hyperthreading).
    """
    opts = interpret_string_values(opts)
    data = os.getloadavg()
    cpus = os.cpu_count()
    status = 'OK'
    warning = opts.get('warning', None)
    critical = opts.get('critical', None)
    cpuscaling = opts.get('cpuscaling', '').lower().startswith('t')
    if warning is not None:
        if isinstance(warning, numbers.Number):
            warning = [warning] * 3
        for idx, val in enumerate(warning):
            if cpuscaling:
                val *= cpus
            if data[idx] > val:
                status = 'WARNING'
    if critical is not None:
        if isinstance(critical, numbers.Number):
            critical = [critical] * 3
        for idx, val in enumerate(critical):
            if cpuscaling:
                val *= cpus
            if data[idx] > val:
                status = 'CRITICAL'
    short_desc = (
        '({0:.1f}, {1:.1f}, {2:.1f}) [{3} logical cores]'
        .format(data[0], data[1], data[2], cpus)
    )
    long_desc = (
        '1min: {0}\n5min: {1}\n15min: {2}'
        .format(data[0], data[1], data[2])
    )
    return status, data, short_desc, long_desc
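# Illustrative usage sketch (not part of the original module, thresholds are
# hypothetical): loadavg can be called directly with string-valued options as
# they might arrive from the monitoring configuration, assuming the
# monitor_command decorator leaves the call signature intact (the direct
# checksum() call inside checkread below suggests it does). With cpuscaling
# enabled, a warning of 1.0 means one runnable task per logical core.
#
#     status, data, short_desc, long_desc = loadavg(
#         {'warning': '1.0', 'critical': '2.0', 'cpuscaling': 'true'}
#     )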
@monitor_command
def diskusage(opts):
    """
    Check the used space on a mounted volume (default /)

    Accepts an option of mountpoint to check a volume on a different
    specified directory, and warning/critical options to alert on a
    fraction of total space used.
    """
    opts = interpret_string_values(opts)
    mountpoint = opts.get('mountpoint', '/')
    data = os.statvfs(mountpoint)
    blocksize = data.f_frsize
    total = data.f_blocks * blocksize
    avail = data.f_bavail * blocksize
    used = total - avail
    status = 'OK'
    if 'warning' in opts:
        if used / total > opts['warning']:
            status = 'WARNING'
    if 'critical' in opts:
        if used / total > opts['critical']:
            status = 'CRITICAL'
    data = {
        'mountpoint': mountpoint,
        'total_size': total,
        'available_size': avail
    }
    short_desc = (
        '{0} {1:.1f}% used ({2:.1f}GB total)'
        .format(mountpoint, used / total * 100, total / 1024**3)
    )
    return status, data, short_desc
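# Illustrative usage sketch (hypothetical mountpoint and thresholds, and
# assuming interpret_string_values converts the numeric strings to floats):
# warn at 80% used and go critical at 90% used on /scratch.
#
#     status, data, short_desc = diskusage(
#         {'mountpoint': '/scratch', 'warning': '0.8', 'critical': '0.9'}
#     )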
@monitor_command
def certexpiry(opts):
    """
    Check the certificate file specified by the cert option (required).

    By default, report critical if the certificate is expired or will
    expire in the next three days, and warning if it will expire in
    the next 14 days. These limits may be changed by critical and
    warning options.
    """
    opts = interpret_string_values(opts)
    if 'cert' not in opts:
        return 'UNKNOWN', 'Required cert option was not provided'
    warning = opts.get('warning', 14)
    critical = opts.get('critical', 3)
    proc = subprocess.Popen(
        [
            '/usr/bin/env', 'openssl', 'x509', '-in', opts['cert'],
            '-noout', '-dates', '-fingerprint', '-subject'
        ],
        stdout=subprocess.PIPE, stderr=subprocess.PIPE
    )
    stdout, stderr = proc.communicate(timeout=60)
    if proc.returncode != 0:
        return 'UNKNOWN', 'openssl did not return 0: {0}'.format(stderr)
    data = {}
    for line in stdout.decode('utf-8').splitlines():
        key = line.split('=')[0]
        value = ''.join(line.split('=')[1:])
        data[key] = value
    expiry = datetime.datetime.strptime(
        data['notAfter'], '%b %d %H:%M:%S %Y %Z'
    )
    remaining = (expiry - datetime.datetime.utcnow()).days
    status = 'OK'
    if remaining < warning:
        status = 'WARNING'
    if remaining < critical:
        status = 'CRITICAL'
    if remaining >= 0:
        short_text = 'Valid for {0} more days'.format(remaining)
    else:
        short_text = 'Expired {0} days ago'.format(-remaining)
    return status, data, short_text
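# Illustrative usage sketch (hypothetical certificate path): tighten the
# warning window to 30 days before expiry while keeping the default
# critical window of 3 days.
#
#     status, data, short_text = certexpiry(
#         {'cert': '/etc/pki/tls/certs/hostcert.pem', 'warning': '30'}
#     )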
@monitor_command
def checkssh(opts):
    """
    Try to connect to the specified SSH server and port

    checkssh [--verbose level] [--port port] [--timeout dt]
             [--use-ipv4 | --use-ipv6] --host server

    Options
        --host server      Server to probe
        --port port        Port to use. Default is 22.
        --timeout dt       How long to wait for a response in seconds.
                           Default is 10.
        --use-ipv4         Use IPv4 addresses
        --use-ipv6         Use IPv6 addresses
        --verbose level    Verbosity level to get extra information.
                           Default is 0.

    Additional data returned:
        error      Error message
        host       Server name
        port       Port used
        time       Time (seconds) to process the check
        timeout    Max wait (seconds)
        version    SSH server version string returned
    """
    # Get the options
    verbose = int(opts.get('verbose', '0'))
    host = opts.get('host', 'localhost')
    port = opts.get('port', '22')
    timeout = opts.get('timeout', '10')

    # Form the base command
    sshcmd = CONFIG['monitor'].get(
        'checkssh_path', '/usr/local/nagios/libexec/check_ssh'
    )
    cmd = [sshcmd, '-p', port, '-t', timeout, host]

    # Insert the address family if specified
    if 'use-ipv4' in opts:
        cmd.insert(1, '-4')
    elif 'use-ipv6' in opts:
        cmd.insert(1, '-6')

    if verbose > 0:
        print('Command: {0}'.format(' '.join(cmd)))

    proc = subprocess.Popen(
        cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE
    )
    stdout, stderr = proc.communicate(timeout=60)

    # Use the return code to set the status
    status = NAGIOS_RETURN_MAPPING[proc.returncode]

    if verbose > 0:
        print('Return code: {0} ({1})'.format(status, proc.returncode))
        print('Command output: {0}'.format(stdout.decode('utf-8')))

    try:
        result, info = stdout.decode('utf-8').split('|')
    except ValueError:
        info = ''
        result = stdout.decode('utf-8')

    if verbose > 1:
        print('result={0} info={1}'.format(result.strip(), info.strip()))

    # Parse the info for the time
    if info != '':
        d = re.search("time=([0-9.]+)s;.*", info)
        dt = d.group(1)
    else:
        dt = '0'

    # Set the return variables based on the check's return code
    if proc.returncode == 0:
        short_desc = '-'.join(result.split('-')[1:]).strip()
        version = short_desc  # We actually have an SSH version string
        errormsg = ''
    else:
        short_desc = result.splitlines()[0]  # The first line is the error message
        errormsg = short_desc
        version = ''

    data = {
        'host': host, 'port': port, 'version': version,
        'error': errormsg, 'time': dt, 'timeout': timeout
    }
    return status, data, short_desc
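# Illustrative usage sketch (hypothetical host): probe a non-default SSH port
# over IPv4 using the configured check_ssh plugin. Only the presence of the
# use-ipv4 key matters; its value is ignored.
#
#     status, data, short_desc = checkssh(
#         {'host': 'login.example.org', 'port': '2222', 'use-ipv4': ''}
#     )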
@monitor_command
def checkping(opts):
    """
    Ping the given server and decide if it is suitably alive based on
    the supplied latency and packet loss values.

    checkping [--verbose level] [--packets n] [--timeout dt]
              [--use-ipv4 | --use-ipv6] --warn latency,loss%
              --critical latency,loss% --host server

    Options
        --critical latency,loss%   CRITICAL if latency (ms) or the percent of
                                    lost packets is greater than provided
        --host server               Server to ping
        --packets n                 Number of packets to send. Default is 5.
        --timeout dt                How long to wait for a response in seconds.
                                    Default is 10.
        --use-ipv4                  Use IPv4 addresses
        --use-ipv6                  Use IPv6 addresses
        --verbose level             Verbosity level to get extra information.
                                    Default is 0.
        --warn latency,loss%        WARN if latency (ms) or the percent of
                                    lost packets is greater than provided

    Additional data returned:
        critical_latency   Critical latency specified (ms)
        critical_percent   Critical lost packets percent
        error              Error message
        host               Server name
        packets            Packets sent
        packet_loss        Percentage of packets lost
        rta                Ping round trip time average (ms)
        summary            Test summary information
        timeout            Max wait (seconds)
        warn_latency       Warning latency specified (ms)
        warn_percent       Warning lost packets percent
    """
    # Get the options
    verbose = int(opts.get('verbose', '0'))
    host = opts.get('host', 'localhost')
    packets = opts.get('packets', '5')
    timeout = opts.get('timeout', '10')
    critical = opts.get('critical', '5000,100%')
    warn = opts.get('warn', '3000,80%')

    # Form the base command
    pingcmd = CONFIG['monitor'].get(
        'checkping_path', '/usr/local/nagios/libexec/check_ping'
    )
    cmd = [
        pingcmd, '-p', packets, '-t', timeout,
        '-c', critical, '-w', warn, '-H', host
    ]

    # Insert the address family if specified
    if 'use-ipv4' in opts:
        cmd.insert(1, '-4')
    elif 'use-ipv6' in opts:
        cmd.insert(1, '-6')

    if verbose > 0:
        print('Command: {0}'.format(' '.join(cmd)))

    proc = subprocess.Popen(
        cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE
    )
    stdout, stderr = proc.communicate(timeout=60)

    # Use the return code to set the status
    status = NAGIOS_RETURN_MAPPING[proc.returncode]

    clat, cpct = critical.split(',')
    wlat, wpct = warn.split(',')

    if verbose > 0:
        print('Return code: {0} ({1})'.format(status, proc.returncode))
        print('Command output: {0}'.format(stdout.decode('utf-8')))

    try:
        result, info = stdout.decode('utf-8').split('|')
    except ValueError:
        info = ''
        result = stdout.decode('utf-8')

    if verbose > 1:
        print('result={0} info={1}'.format(result.strip(), info.strip()))

    # Parse the info for the RTA and packet loss percent
    if info != '':
        d = re.search("rta=([0-9.]+)ms.*pl=([0-9]+)%", info)
        rta = d.group(1)
        packet_loss = d.group(2)
    else:
        rta = '0'
        packet_loss = '100'

    if re.search('(Packet loss)', result) is not None:
        short_desc = '-'.join(result.split('-')[1:]).strip()
        summary = short_desc  # We actually have a summary
        errormsg = ''
    else:
        short_desc = result.splitlines()[0]  # The first line is the error message
        errormsg = short_desc
        summary = ''

    data = {
        'critical_latency': clat, 'critical_percent': cpct, 'host': host,
        'packets': packets, 'packet_loss': packet_loss, 'summary': summary,
        'error': errormsg, 'rta': rta, 'timeout': timeout,
        'warn_latency': wlat, 'warn_percent': wpct
    }
    return status, data, short_desc
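# Illustrative usage sketch (hypothetical host and thresholds): warn at
# 100ms round trip or 20% packet loss, critical at 500ms or 60% loss. The
# warn/critical options use the check_ping "latency,loss%" string format.
#
#     status, data, short_desc = checkping(
#         {'host': 'gw.example.org', 'warn': '100,20%', 'critical': '500,60%'}
#     )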
@monitor_command
def checksum(opts):
    """
    Efficiently calculates the checksum of the target file.

    Options:
        --file   Target file
        --hash   Hash function (md5, sha1, sha224, sha256, sha384, sha512)
    """
    if 'file' not in opts:
        return 'UNKNOWN', 'Target file required'
    elif 'hash' not in opts:
        return 'UNKNOWN', 'Missing hash function'
    try:
        data = {
            'file': opts['file'],
            'algorithm': opts['hash'],
            'checksum': filehash(opts['file'], opts['hash'])
        }
    except ValueError as e:
        data = {
            'error': str(e)
        }
        return 'CRITICAL', data
    except IOError as e:
        data = {
            'error': 'Unable to access file {0}'.format(opts['file']),
            'exception': str(e)
        }
        return 'CRITICAL', data
    return 'OK', data
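# Illustrative usage sketch (hypothetical file path):
#
#     status, data = checksum({'file': '/etc/hostname', 'hash': 'sha256'})
#     # On success, data contains 'file', 'algorithm' and 'checksum' keys;
#     # on failure, status is 'CRITICAL' and data contains an 'error' key.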
@monitor_command
def checkread(opts):
    """
    Check that it is possible to read and calculate a checksum of a
    randomly selected file in the specified directory.

    Options:
        --dir   Target directory
    """
    if 'dir' not in opts:
        return 'UNKNOWN', 'Target directory required'
    tdir = opts['dir']
    try:
        # Get a list of all the files (not directories) in the target directory
        fileslist = [
            os.path.join(tdir, f) for f in os.listdir(tdir)
            if os.path.isfile(os.path.join(tdir, f))
        ]
    except OSError as e:
        data = {
            'error': 'Unable to access',
            'suggestion': 'Check that the target directory exists',
            'exception': str(e)
        }
        return 'CRITICAL', data
    # Randomly select a file (random.choice raises IndexError on an empty list)
    try:
        tfile = random.choice(fileslist)
    except IndexError as e:
        data = {
            'error': 'No files in target directory',
            'suggestion': 'Check that files are present in the target directory',
            'exception': str(e)
        }
        return 'CRITICAL', data
    data = checksum({'file': tfile, 'hash': 'sha256'})[1]
    return 'OK', data
@monitor_command
def accresystemhealth(opts):
    """
    Checks the general system (non-hardware) health of an ACCRE server

    This check is an omnibus check for commonly found system problems
    that can apply to any standard, CFE-managed node on the ACCRE
    infrastructure. The following things will be checked:

    1. System drive (/) block space, warn if <20%, crit if <10%
    2. System drive (/) free inodes, warn if <20%, crit if <10%
    3. Systemd init process memory usage, warn if >50M, crit if >100M
    4. Recent completion of a CFE run, warn if >15mins, crit if >30mins

    These thresholds are not currently configurable.
    """
    status = 'OK'
    warn = False
    crit = False
    data = {}
    msgs = []

    # check disk space
    statvfs = os.statvfs('/')
    bfree = statvfs.f_bavail / statvfs.f_blocks
    data['root_block_pct_free'] = round(bfree * 100, 3)
    if bfree < 0.2:
        warn = True
        msgs.append('root filesystem space low')
    if bfree < 0.1:
        crit = True

    # check disk inodes
    ffree = statvfs.f_favail / statvfs.f_files
    data['root_inode_pct_free'] = round(ffree * 100, 3)
    if ffree < 0.2:
        warn = True
        msgs.append('root filesystem inodes low')
    if ffree < 0.1:
        crit = True

    # check systemd memory usage
    arglist = ['ps', '-p', '1', '--no-headers', '-o', 'rss']
    proc = subprocess.Popen(
        arglist, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
        stdin=subprocess.DEVNULL
    )
    stdout, stderr = proc.communicate(timeout=10)
    try:
        rss = int(stdout.strip())
    except Exception:
        rss = None
    if proc.returncode != 0 or rss is None:
        crit = True
        data['systemd_rss_usage'] = 'unknown'
        msgs.append('problem running ps')
    elif rss > 100000:
        data['systemd_rss_usage'] = rss
        crit = True
        msgs.append('systemd rss usage high')
    elif rss > 50000:
        data['systemd_rss_usage'] = rss
        warn = True
        msgs.append('systemd rss usage high')
    else:
        data['systemd_rss_usage'] = rss

    # check CFE last run
    now = time.time()
    try:
        last_cfe = os.path.getmtime('/var/cfengine/promise_summary.log')
    except Exception:
        last_cfe = 0
    mins = round((now - last_cfe) / 60, 2)
    data['mins_since_last_cfe'] = mins
    if last_cfe == 0:
        crit = True
        msgs.append('could not determine time of last CFE run')
    elif mins > 30:
        crit = True
        msgs.append('CFE run not recently completed')
    elif mins > 15:
        msgs.append('CFE run not recently completed')
        warn = True

    if crit:
        status = 'CRITICAL'
    elif warn:
        status = 'WARNING'
    if not msgs:
        msgs.append('no issues found')
    pretext = ', '.join(msgs)
    return status, data, pretext