Source code for accre.monitor_checks.auditor

"""
These "checks" are generally long running audits to be performed by the
ACCRE auditor. They will not be available in the ``accre-monitor`` command
unless an "auditor" field is set to true.

For an overview of the monitoring check framework see
:mod:`accre.monitor`.
"""
from collections import Counter
import datetime
from math import isclose
import socket
import subprocess
from urllib.parse import unquote

import yaml

from accre.account_management import VUNetIDValidator
from accre.gpfs import (
    run_mmlslicense,
    run_mmlsnode,
    run_gpfs_command,
    parse_gpfs_y_command,
    GPFSCommandError
)
from accre.monitor import monitor_command
from accre.nagios import retrieve_object_cache
from accre.proxmox import ACCREProxmox
from accre.ldap import ACCRELDAP, VUDS
from accre.database import VandyAdminDBClient
from accre.util import (
    interpret_string_values,
    get_primary_ip,
    byte_quantity_isclose
)
from accre.config import get_config
from accre.slurm import (
    get_slurm_associations,
    groups_by_account,
    get_default_groups,
    list_compute_nodes
)
from accre.email import (
    get_dl_addresses,
    add_dl_addresses,
    remove_dl_addresses,
    ZimbraCommandError
)

# Configuration returned by accre.config.get_config(), loaded once when this
# module is imported and shared by the checks below.
CONFIG = get_config()


@monitor_command
def auditor_hello(opts):
    """
    Declare yourself as the auditor.

    There should be only one auditor node. This will show the hostname,
    internal IP, and RSA host key of the node.

    :param opts: Mapping of check options. The optional ``keyloc`` entry
        is the path of the public host key handed to ``ssh-keygen``
        (default ``/etc/ssh/ssh_host_rsa_key.pub``).
    :returns: ``(status, data, short_desc)`` tuple; status is always
        ``'OK'``.
    :raises subprocess.TimeoutExpired: if ``ssh-keygen`` does not finish
        within 60 seconds.
    """
    keyloc = opts.get('keyloc', '/etc/ssh/ssh_host_rsa_key.pub')
    data = {}
    data['hostname'] = socket.gethostname()
    data['primary interface'] = get_primary_ip()
    # subprocess.run kills and reaps the child when the timeout expires,
    # whereas Popen.communicate(timeout=...) leaves it running.
    result = subprocess.run(
        ['/usr/bin/env', 'ssh-keygen', '-l', '-f', keyloc],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        timeout=60
    )
    # The fingerprint line is reported verbatim; the exit status of
    # ssh-keygen is not inspected.
    data['rsa host key'] = result.stdout.decode('ascii')
    short_desc = 'Meesa the auditor. Yousa should follow me now, okeeday?'
    return 'OK', data, short_desc
@monitor_command
def auditor_vandy_active(opts):
    """
    Ensure that all active users in ACCRE LDAP (defined by login shell)
    are considered active VUnetIDs in Vanderbilt LDAP.

    Notice - 2021/01 - Eric - Currently this needs cleanup and has for
    over a year so we will not send WARNING/CRITICAL for now even if
    there are problems.

    Returns a ``(status, data, pretext)`` tuple.
    """
    # Deliberately never escalated; see the notice in the docstring.
    status = 'OK'
    pretext = 'All cluster users have active VUnetIDs'

    with ACCRELDAP(caching=True) as client:
        all_users = client.list_users()
        active_users = client.list_users(active=True)
        robots = client.list_robots()
    active_robots = list(set(active_users) & set(robots))

    missing_users = []
    locked_users = []
    with VUNetIDValidator() as validator:
        for user in active_users:
            # The lock state is only queried for IDs that exist.
            if validator.exists(user):
                if validator.is_locked(user):
                    locked_users.append(user)
            else:
                missing_users.append(user)

    data = {
        'Total ACCRE Users': len(all_users),
        'Active ACCRE Users': len(active_users)
    }
    if active_robots:
        data['Active ACCRE Robots'] = ', '.join(active_robots)
    if locked_users:
        data['Active ACCRE Users with locked VUNetIDs'] = (
            ', '.join(locked_users)
        )
        pretext = 'OK - Active ACCRE users have locked VUNetIDs'
    if missing_users:
        # Checked last, so this message wins when both problems exist.
        data['Active ACCRE Users with missing VUNetIDs'] = (
            ', '.join(missing_users)
        )
        pretext = 'OK - Active ACCRE users have missing VUNetIDs'
    return status, data, pretext
@monitor_command
def auditor_accre_active(opts):
    """
    Ensure that all active users in ACCRE LDAP (defined by login shell)
    are considered active in the ACCRE database, and that all users that
    are active in the ACCRE database are active users in LDAP.

    Returns a ``(status, data, pretext)`` tuple. An LDAP-active user that
    is not active in the database yields WARNING; a database-active user
    that is inactive or absent in LDAP yields CRITICAL.
    """
    critical_text = 'CRITICAL - ACCRE LDAP user inconsistency with database'
    pretext = 'The active users in the ACCRE database match ACCRE LDAP'
    status = 'OK'

    with ACCRELDAP(caching=True) as client:
        active_ldap_users = set(client.list_users(active=True))
        ldap_users = set(client.list_users())
    inactive_ldap_users = ldap_users - active_ldap_users

    dbclient = VandyAdminDBClient()
    active_db_users = set(dbclient.list_users(active=True))

    data = {
        'Total Users (ldap)': len(ldap_users),
        'Active Users (ldap)': len(active_ldap_users),
        'Active Users (database)': len(active_db_users)
    }

    incorrectly_active = active_ldap_users - active_db_users
    if incorrectly_active:
        status = 'WARNING'
        pretext = 'WARNING - ACCRE LDAP user inconsistency with database'
        data['Active users (ldap) inactive or missing in database'] = (
            ', '.join(sorted(incorrectly_active))
        )

    # The two checks below override any WARNING raised above.
    incorrectly_inactive = active_db_users & inactive_ldap_users
    if incorrectly_inactive:
        status = 'CRITICAL'
        pretext = critical_text
        data['Active users (db) set to inactive in ldap'] = (
            ', '.join(sorted(incorrectly_inactive))
        )

    missing = active_db_users - ldap_users
    if missing:
        status = 'CRITICAL'
        pretext = critical_text
        data['Active users (db) missing in ldap'] = (
            ', '.join(sorted(missing))
        )
    return status, data, pretext
@monitor_command
def auditor_accre_groups(opts):
    """
    Ensure that all groups in ACCRE LDAP have a corresponding group
    in the database with the same GID, and that all active database
    groups are in LDAP.

    Returns a ``(status, data, pretext)`` tuple; any discrepancy makes
    the status CRITICAL.
    """
    pretext = (
        'Check the groups in the ACCRE database for consistency '
        'with ACCRE LDAP'
    )
    status = 'OK'

    with ACCRELDAP(caching=True) as client:
        ldap_groups = client.list_groups()
        gid_map = {entry[0]: entry[1] for entry in client.list_group_gids()}

    dbclient = VandyAdminDBClient()
    database_groups = dbclient.list_groups()
    active_database_groups = dbclient.list_groups(active=True, posix=True)
    active_db_group_info = {
        record['name']: record
        for record in dbclient.all_groups_info(active=True, posix=True)
    }

    # Sets are used for the membership tests only; the reported lists keep
    # the iteration order of the original sequences.
    active_db_names = set(active_database_groups)
    ldap_names = set(ldap_groups)

    missing_database_groups = []
    inconsistent_gids = []
    for group in ldap_groups:
        if group not in active_db_names:
            missing_database_groups.append(group)
            continue
        db_gid = active_db_group_info[group]['group_id']
        ldap_gid = gid_map[group]
        if db_gid != ldap_gid:
            inconsistent_gids.append(
                f'{group} db gid: {db_gid} ldap gid: {ldap_gid}'
            )

    missing_ldap_groups = [
        group for group in active_database_groups
        if group not in ldap_names
    ]

    data = {
        'Total LDAP Groups': len(ldap_groups),
        'Total Database Groups': len(database_groups),
        'Active Database Groups': len(active_database_groups)
    }
    if inconsistent_gids:
        status = 'CRITICAL'
        data['Inconsistent Group IDs'] = ', '.join(inconsistent_gids)
    if missing_database_groups:
        status = 'CRITICAL'
        data['LDAP groups missing in database'] = (
            ', '.join(missing_database_groups)
        )
    if missing_ldap_groups:
        status = 'CRITICAL'
        data['Active database groups missing in LDAP'] = (
            ', '.join(missing_ldap_groups)
        )
    return status, data, pretext
@monitor_command
def auditor_accre_group_membership(opts):
    """
    Ensure that all active LDAP users have primary and secondary groups
    matching the ACCRE database

    Returns a ``(status, data, pretext)`` tuple; any mismatch makes the
    status CRITICAL.
    """
    pretext = (
        'Check group membership consistency between LDAP and the Database'
    )
    status = 'OK'
    bad_primary = []
    ldap_missing_sec = []
    ldap_extra_sec = []

    dbclient = VandyAdminDBClient()
    posix_groups = dbclient.list_groups(active=True, posix=True)

    # The per-user loop queries LDAP, so it runs inside the client context.
    with ACCRELDAP(caching=True) as client:
        # NOTE(review): gid_map is not read below; kept so the same LDAP
        # query is still issued.
        gid_map = {x[0]: x[1] for x in client.list_group_gids()}
        group_membership = client.group_membership()
        userlist = client.list_users(active=True)
        dbusers = {
            record['vunetid']: record
            for record in dbclient.all_users_info(secondary_groups=True)
        }

        for user in userlist:
            db_info = dbusers[user]
            db_pri = db_info['group_id']
            # Only POSIX groups are expected to appear in LDAP.
            db_sec = {
                name for name in db_info['secondary_groups']
                if name in posix_groups
            }
            ldap_pri = client.posixuser(user).gid
            ldap_all = {
                name for name in group_membership
                if user in group_membership[name]
            }

            if db_pri != ldap_pri:
                bad_primary.append(
                    f'{user} has primary group {db_pri} in db '
                    f'and {ldap_pri} in ldap'
                )

            # LDAP membership in the user's own primary group is not
            # counted as an extra secondary group.
            extra = ldap_all - db_sec
            extra.discard(db_info['group'])
            missing = db_sec - ldap_all
            if extra:
                ldap_extra_sec.append(f'{user} has groups {extra}')
            if missing:
                ldap_missing_sec.append(f'{user} missing groups {missing}')

    data = {
        'Total users checked': len(userlist)
    }
    if bad_primary:
        status = 'CRITICAL'
        data['Users have mismatched primary groups'] = ', '.join(bad_primary)
    if ldap_extra_sec:
        status = 'CRITICAL'
        data['Users have additional groups in LDAP'] = (
            ', '.join(ldap_extra_sec)
        )
    if ldap_missing_sec:
        status = 'CRITICAL'
        data['Users are missing secondary groups in LDAP'] = (
            ', '.join(ldap_missing_sec)
        )
    return status, data, pretext
@monitor_command
def auditor_scheduler_accounts(opts):
    """
    Ensure that all accounts in the database have a corresponding
    account in the slurm database with the correct properties

    Returns a ``(status, data, pretext)`` tuple; any discrepancy makes
    the status CRITICAL.
    """
    pretext = (
        'Check the accounts in the ACCRE database for consistency '
        'with the slurm scheduler'
    )
    status = 'OK'
    suffix = '_account'

    # Account-level associations carry no user and end in "_account";
    # they are keyed here by the name with that suffix stripped.
    slurm_accts = {
        entry['account'][:-len(suffix)]: entry
        for entry in get_slurm_associations(ssh=True)
        if not entry['user'] and entry['account'].endswith(suffix)
    }

    dbclient = VandyAdminDBClient()
    # The explicit "== True" comparisons are kept on purpose so that the
    # selection of records is unchanged.
    db_accts = {
        record['name']: record
        for record in dbclient.all_accounts_info()
        if record['active'] == True and record['scheduler_account'] == True
    }

    inconsistent_attrs = []
    missing_db_accounts = []
    for acct, slurm_info in slurm_accts.items():
        if acct not in db_accts:
            missing_db_accounts.append(acct)
        elif not _compare_slurm_attrs(db_accts[acct], slurm_info):
            inconsistent_attrs.append(acct)

    missing_slurm_accounts = [
        acct for acct in db_accts if acct not in slurm_accts
    ]

    data = {
        'Total Slurm Accounts': len(slurm_accts),
        'Active Database Accounts': len(db_accts)
    }
    if inconsistent_attrs:
        status = 'CRITICAL'
        data['Inconsistent account attributes'] = ', '.join(inconsistent_attrs)
    if missing_db_accounts:
        status = 'CRITICAL'
        data['Slurm accounts missing in database'] = (
            ', '.join(missing_db_accounts)
        )
    if missing_slurm_accounts:
        status = 'CRITICAL'
        data['Active database accounts missing in slurm'] = (
            ', '.join(missing_slurm_accounts)
        )
    return status, data, pretext
@monitor_command
def auditor_scheduler_groups(opts):
    """
    Ensure that all scheduler groups have a corresponding
    group in the slurm database with the correct properties

    Returns a ``(status, data, pretext)`` tuple; any discrepancy makes
    the status CRITICAL.
    """
    pretext = (
        'Check the groups in the ACCRE database for consistency '
        'with the slurm scheduler'
    )
    status = 'OK'

    # Group-level associations: no user, not an "_account" entry and not
    # the root account.
    slurm_groups = {
        entry['account']: entry
        for entry in get_slurm_associations(ssh=True)
        if not entry['user']
        and not entry['account'].endswith('_account')
        and entry['account'] != 'root'
    }

    # Invert the account -> groups mapping into group -> account.
    accounts = {
        group: acct
        for acct, members in groups_by_account(ssh=True).items()
        for group in members
    }

    dbclient = VandyAdminDBClient()
    active_database_groups = dbclient.list_groups(active=True, scheduler=True)
    active_db_group_info = {
        record['name']: record
        for record in dbclient.all_groups_info(active=True, scheduler=True)
    }

    inconsistent_attrs = []
    inconsistent_accounts = []
    missing_database_groups = []
    for group, slurm_info in slurm_groups.items():
        if group not in active_database_groups:
            missing_database_groups.append(group)
            continue
        info = active_db_group_info[group]
        if not _compare_slurm_attrs(info, slurm_info):
            inconsistent_attrs.append(group)
        # A database group with no account is always inconsistent;
        # otherwise the slurm parent must be "<account>_account".
        db_account = info['account']
        if not db_account or db_account + '_account' != accounts[group]:
            inconsistent_accounts.append(group)

    missing_slurm_groups = [
        group for group in active_database_groups
        if group not in slurm_groups
    ]

    data = {
        'Total Slurm Groups': len(slurm_groups),
        'Active Database Scheduler Groups': len(active_database_groups)
    }
    if inconsistent_attrs:
        status = 'CRITICAL'
        data['Inconsistent Group attributes'] = ', '.join(inconsistent_attrs)
    if inconsistent_accounts:
        status = 'CRITICAL'
        data['Inconsistent Group accounts'] = ', '.join(inconsistent_accounts)
    if missing_database_groups:
        status = 'CRITICAL'
        data['Scheduler groups missing in database'] = (
            ', '.join(missing_database_groups)
        )
    if missing_slurm_groups:
        status = 'CRITICAL'
        data['Active database groups missing in slurm'] = (
            ', '.join(missing_slurm_groups)
        )
    return status, data, pretext
@monitor_command
def auditor_scheduler_associations(opts):
    """
    Ensure that all scheduler associations in slurm match those
    as determined by the database user, group, and partition
    records.

    Note that this check does not look at accelerated partition
    associations.

    Returns a ``(status, data, pretext)`` tuple; any difference between
    the two sets of associations makes the status CRITICAL.
    """
    pretext = (
        'Check the users/groups/partitions in the ACCRE DB for consistency '
        'with the slurm scheduler'
    )
    status = 'OK'

    # User-level associations only; root is ignored.
    slurm_assoc = set()
    for entry in get_slurm_associations(ssh=True):
        if entry['user'] and entry['user'] != 'root':
            slurm_assoc.add(
                (entry['user'], entry['account'], entry['partition'])
            )

    dbclient = VandyAdminDBClient()
    db_assoc = set(dbclient.list_scheduler_associations())

    missing_slurm_assoc = db_assoc.difference(slurm_assoc)
    missing_database_assoc = slurm_assoc.difference(db_assoc)

    data = {
        'Total Slurm Associations': len(slurm_assoc),
        'Total Database Associations': len(db_assoc)
    }
    if missing_slurm_assoc:
        status = 'CRITICAL'
        data['Database associations missing in slurm'] = (
            ', '.join(map(str, missing_slurm_assoc))
        )
    if missing_database_assoc:
        status = 'CRITICAL'
        data['Slurm associations not indicated by database'] = (
            ', '.join(map(str, missing_database_assoc))
        )
    return status, data, pretext
@monitor_command
def auditor_scheduler_acc_associations(opts):
    """
    Ensure that all accelerated scheduler associations in slurm match
    those as determined by the database user, group, and partition
    records.

    Returns a ``(status, data, pretext)`` tuple; any difference between
    the two sets of associations makes the status CRITICAL.
    """
    pretext = (
        'Check the users/groups/partitions in the ACCRE DB for consistency '
        'with the slurm scheduler for accelerated parititons'
    )
    status = 'OK'

    accel_entries = get_slurm_associations(
        ssh=True, accelerated=True, regular=False
    )
    slurm_assoc = set()
    for entry in accel_entries:
        if entry['user'] and entry['user'] != 'root':
            # Drop the trailing four characters (the "_acc" suffix) from
            # the accelerated account name in slurm.
            slurm_assoc.add(
                (entry['user'], entry['account'][:-4], entry['partition'])
            )

    dbclient = VandyAdminDBClient()
    db_assoc = set(dbclient.list_scheduler_acc_associations())

    missing_slurm_assoc = db_assoc.difference(slurm_assoc)
    missing_database_assoc = slurm_assoc.difference(db_assoc)

    data = {
        'Total Slurm Accel Associations': len(slurm_assoc),
        'Total Database Accel Associations': len(db_assoc)
    }
    if missing_slurm_assoc:
        status = 'CRITICAL'
        data['Database accel associations missing in slurm'] = (
            ', '.join(map(str, missing_slurm_assoc))
        )
    if missing_database_assoc:
        status = 'CRITICAL'
        data['Slurm accel associations not indicated by database'] = (
            ', '.join(map(str, missing_database_assoc))
        )
    return status, data, pretext
@monitor_command
def auditor_scheduler_default_groups(opts):
    """
    Ensure that all active users have a default group in the scheduler
    that matches their primary group in the database if that primary
    group is a scheduler group.

    Returns a ``(status, data, pretext)`` tuple; any user with the wrong
    default group makes the status CRITICAL.
    """
    pretext = (
        'Check default group consistency between scheduler and the Database'
    )
    status = 'OK'

    dbclient = VandyAdminDBClient()
    # The explicit "== True" comparison is kept on purpose so that the
    # selection of users is unchanged.
    users = {
        record['vunetid']: record
        for record in dbclient.all_users_info()
        if record['active'] == True
    }
    scheduler_groups = dbclient.list_groups(active=True, scheduler=True)
    default_groups = get_default_groups(ssh=True)

    count = 0
    bad_users = []
    for user, info in users.items():
        primary = info['group']
        # Only users whose primary group is a scheduler group and who are
        # known to the scheduler are checked (and counted).
        if primary not in scheduler_groups or user not in default_groups:
            continue
        count += 1
        if primary == default_groups[user]:
            continue
        # skip primary groups without partitions
        if dbclient.group_info(primary)['partitions']:
            bad_users.append(user)

    data = {
        'Total users checked': count
    }
    if bad_users:
        status = 'CRITICAL'
        data['Users have wrong default scheduler group'] = ', '.join(bad_users)
    return status, data, pretext
def _compare_slurm_attrs(a, b):
    """
    Compare fairshare, qos, max_cpu, max_mem, and max_runmins

    Both arguments are mappings holding those keys. Returns True when
    every attribute agrees and False otherwise. ``max_mem`` values are
    compared with ``byte_quantity_isclose`` unless one of them is None,
    in which case they only match if both are None.
    """
    exact_keys = ('qos', 'fairshare', 'max_cpu', 'max_runmins')
    if any(a[key] != b[key] for key in exact_keys):
        return False

    mem_a = a['max_mem']
    mem_b = b['max_mem']
    if mem_a is None or mem_b is None:
        # An unset limit only matches another unset limit.
        return mem_a is mem_b
    return bool(byte_quantity_isclose(mem_a, mem_b))
@monitor_command
def auditor_compute_node_checkin(opts):
    """
    Ensure that all compute nodes responding to the SLURM scheduler
    have checked in with configuration management in the last 24 hours,
    and report a list of all dead nodes.

    Returns a ``(status, data, pretext)`` tuple; responding nodes without
    a recent check-in make the status CRITICAL. Dead nodes are reported
    but do not change the status.
    """
    pretext = (
        'Check that all responding compute nodes have checked in with CFE'
    )
    status = 'OK'

    cutoff = datetime.datetime.now() - datetime.timedelta(days=1)
    dbclient = VandyAdminDBClient()
    checkins = set()
    for host in dbclient.all_managed_hosts_info(since=cutoff):
        checkins.add(host['hostname'])

    active_nodes = list_compute_nodes(ssh=True)
    all_nodes = list_compute_nodes(responding=False, ssh=True)
    dead_nodes = set(all_nodes).difference(active_nodes)

    # Managed host records use the ".vampire" suffixed hostname.
    stale = [
        hostname
        for hostname in (node + '.vampire' for node in active_nodes)
        if hostname not in checkins
    ]

    data = {
        'Total compute nodes': len(all_nodes)
    }
    if dead_nodes:
        data['Dead compute nodes'] = ', '.join(dead_nodes)
    if stale:
        status = 'CRITICAL'
        data['Active compute nodes have not recently run CFE'] = (
            ', '.join(stale)
        )
    return status, data, pretext
@monitor_command
def auditor_compute_node_nagios(opts):
    """
    Ensure that all compute nodes responding to the SLURM scheduler
    are currently being monitored in nagios.

    Returns a ``(status, data, pretext)`` tuple; any responding node that
    is absent from the nagios object cache makes the status CRITICAL.
    """
    pretext = (
        'Check that all responding compute nodes are monitored in nagios'
    )
    status = 'OK'

    nagios_nodes = retrieve_object_cache().managed_hosts()
    active_nodes = list_compute_nodes(ssh=True)
    missing_nodes = set(active_nodes).difference(nagios_nodes)

    monitored_count = len(active_nodes) - len(missing_nodes)
    data = {
        'Active compute nodes in nagios': monitored_count
    }
    if missing_nodes:
        data['Compute nodes missing in nagios'] = ', '.join(missing_nodes)
        status = 'CRITICAL'
    return status, data, pretext
@monitor_command
def auditor_compute_node_kernels(opts):
    """
    Determine all the kernel versions on the compute nodes that
    have checked in (over last week) and report.

    :param opts: Check options (unused).
    :returns: ``(status, data, pretext)`` tuple. The status is WARNING
        when more than one kernel version is present, otherwise OK.
    """
    pretext = (
        'Report on the kernel versions present on all compute nodes '
        'that have checked in within the last week.'
    )
    status = 'OK'
    since = datetime.datetime.now() - datetime.timedelta(days=7)
    dbclient = VandyAdminDBClient()
    hosts_info = dbclient.all_managed_hosts_info(since=since)
    compute_nodes = list_compute_nodes(ssh=True, responding=False)
    # A set keeps the per-record hostname filter constant time; managed
    # host records use the ".vampire" suffixed hostname.
    hostnames = {node + '.vampire' for node in compute_nodes}
    kernels = Counter(
        host['kernel'] for host in hosts_info
        if host['hostname'] in hostnames
    )
    data = {
        'Total compute nodes checking in': sum(kernels.values())
    }
    if len(kernels) > 1:
        status = 'WARNING'
    # The version breakdown is included whether or not versions differ.
    data['Compute node kernel versions'] = ', '.join(
        '{0} nodes with {1}'.format(count, kernel)
        for kernel, count in kernels.items()
    )
    return status, data, pretext
@monitor_command
def auditor_gfps_license_count(opts):
    """
    Determine the number of GPFS client and server licenses in
    use and compare to the configured limits.

    The purchased limits are read from ``/etc/accre_general.yaml``
    (``gpfs.licenses.server`` and ``gpfs.licenses.client``). Socket
    usage is counted as two per server, hypervisor or bare metal client.

    :param opts: Check options (unused).
    :returns: ``(status, data, pretext)`` tuple. The status is CRITICAL
        when either purchased limit is exceeded or when ``mmlslicense``
        reports missing licenses.
    """
    pretext = (
        'Report the current GPFS server and client licenses in use '
        'compared to the total available.'
    )
    status = 'OK'
    # Use a context manager so the file handle is closed promptly.
    with open('/etc/accre_general.yaml') as handle:
        gcraw = handle.read()
    # BaseLoader loads every scalar as a string, hence the int() below.
    gc = yaml.load(gcraw, Loader=yaml.BaseLoader)

    proxmox = ACCREProxmox()
    hypervisors = len(proxmox.cluster.resources.get(type='node'))
    linfo = run_mmlslicense(ssh=True)
    server_allowed = int(gc['gpfs']['licenses']['server'])
    client_allowed = int(gc['gpfs']['licenses']['client'])

    nodes = run_mmlsnode(ssh=True)['Accre']
    # Bare metal clients are the nodes that are neither NSD servers,
    # GPFS managers nor virtual machines.
    baremetal = sum(
        1 for node in nodes
        if not node.startswith(('nsd', 'gpfsmgr', 'vm-'))
    )
    virtual = sum(1 for node in nodes if node.startswith('vm-'))

    used_server_sockets = 2 * linfo['server']
    total_client_sockets = 2 * (hypervisors + baremetal)

    data = {}
    data['Purchased server sockets'] = server_allowed
    data['Used server sockets'] = used_server_sockets
    data['Purchased client sockets'] = client_allowed
    data['Used client sockets'] = total_client_sockets
    data['Hypervisor sockets'] = 2 * hypervisors
    data['Virtual machine clients'] = virtual

    if total_client_sockets > client_allowed:
        status = 'CRITICAL'
    if used_server_sockets > server_allowed:
        status = 'CRITICAL'
    if linfo['missing_server']:
        data['Missing server licenses'] = linfo['missing_server']
        status = 'CRITICAL'
    if linfo['missing_client']:
        data['Missing client licenses'] = linfo['missing_client']
        status = 'CRITICAL'
    return status, data, pretext
@monitor_command
def auditor_sync_zimbra_announce_dl(opts):
    """
    Ensure all active users and PIs are in the zimbra announce_dl
    distribution list, make any required repairs to the list and report.

    :param opts: Check options (unused).
    :returns: ``(status, data, pretext)`` tuple. The status is CRITICAL
        when any zimbra command fails. If the current list cannot be
        retrieved, no changes are attempted.
    """
    pretext = (
        'Synchronize the zimbra annouce_dl distribution list with '
        'all active ACCRE users and PIs.'
    )
    status = 'OK'
    data = {}
    client = VandyAdminDBClient()
    # Database addresses are lowercased before comparing with the list.
    dbmails = {
        u['email'].lower() for u in client.all_users_info() if u['active']
    }
    for pi in client.list_pis(active=True):
        dbmails.add(client.pi_info(pi)['email'].lower())
    data['Total active user and PI addresses'] = str(len(dbmails))

    try:
        zimbra = get_dl_addresses('announce_dl')
    except ZimbraCommandError as e:
        # Without the current membership nothing can be diffed, so stop
        # here instead of failing on an unbound name further down.
        data['Zimbra command failure'] = str(e)
        return 'CRITICAL', data, pretext
    data['Total announce email recipients before sync'] = str(len(zimbra))

    # Sorted so that the reported address lists are deterministic.
    missing = sorted(dbmails - zimbra)
    removable = sorted(zimbra - dbmails)

    if missing:
        try:
            add_dl_addresses('announce_dl', missing)
            data['Added email addresses'] = ', '.join(missing)
        except ZimbraCommandError as e:
            status = 'CRITICAL'
            data['Zimbra command failure'] = str(e)
    if removable:
        try:
            remove_dl_addresses('announce_dl', removable)
            data['Removed email addresses'] = ', '.join(removable)
        except ZimbraCommandError as e:
            status = 'CRITICAL'
            data['Zimbra command failure'] = str(e)
    return status, data, pretext
@monitor_command
def auditor_sync_vuds_accre_users(opts):
    """
    Ensure all active users and PIs that are not robot accounts
    are in the VUIT VUDS ACCRE_Users group, make any required repairs
    to the group and report.

    Users with legacy VUMC IDs will not be in VUDS. A text file with
    a list of email addresses for these users will be created and saved
    to the file /data/accre/accre_emails_non_vuds.txt on GPFS.

    :param opts: Check options (unused).
    :returns: ``(status, data, pretext)`` tuple. The status is CRITICAL
        when users cannot be removed from the group or when the emails
        file cannot be generated or copied. Users that cannot be added
        are reported without raising the status.
    """
    pretext = (
        'Synchronize the VUIT VUDS ACCRE_Users group with '
        'all non-robot active ACCRE users and PIs.'
    )
    status = 'OK'
    data = {}
    client = VandyAdminDBClient()
    users = set(client.list_users(active=True))
    users.update(client.list_pis(active=True))
    with ACCRELDAP() as a:
        robots = set(a.list_robots())
    people = users - robots
    data['Total active non-robot user and PI addresses'] = str(len(people))

    # Bound up front: the emails file below is written on every run, even
    # when nobody was missing from the group.
    fails = []
    with VUDS() as ad:
        ad_users = set(ad.list_accre_users())
        missing = people - ad_users
        removable = ad_users - people
        if missing:
            newusers = []
            # Users are added one at a time so that a single failure does
            # not hide which of the others succeeded.
            for user in missing:
                if ad.add_accre_users([user]):
                    newusers.append(user)
                else:
                    fails.append(user)
            data['Added new ACCRE users'] = ', '.join(newusers)
            if fails:
                # Reported only; the status is intentionally left alone.
                data['Failed to add users'] = ', '.join(fails)
                data['Number of users not added'] = str(len(fails))
        if removable:
            success = ad.delete_accre_users(list(removable))
            rmv_str = ', '.join(list(removable))
            if success:
                data['Removed non-ACCRE users'] = rmv_str
            else:
                data['Failed to remove users'] = rmv_str
                status = 'CRITICAL'

    # Best effort: any failure while producing or copying the emails file
    # is reported through the check result rather than raised.
    try:
        users_emails = {
            x['vunetid']: x['email'] for x in client.all_users_info()
        }
        pis_emails = {
            x['vunetid']: x['email'] for x in client.all_pis_info()
        }
        # PI records take precedence over user records for the same ID.
        all_emails = {**users_emails, **pis_emails}
        gpfs_node = 'root@{0}'.format(
            CONFIG['account-management']['gpfs-node']
        )
        tmp_file = '/tmp/accre_emails_non_vuds.txt'
        dest_file = '/data/accre/accre_emails_non_vuds.txt'
        with open(tmp_file, 'w') as emailfile:
            emailfile.writelines(all_emails[user] + '\n' for user in fails)
        # subprocess.run kills and reaps scp if the timeout expires.
        result = subprocess.run(
            ['scp', tmp_file, f'{gpfs_node}:{dest_file}'],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            timeout=60
        )
        if result.returncode != 0:
            raise Exception('Failed to move emails file to gpfs node')
    except Exception as e:
        data['Failed to generate emails file'] = str(e)
        status = 'CRITICAL'
    return status, data, pretext
def _summarize_filesets(keys, limit=10):
    """
    Format fileset keys as ``filesystem/name`` joined by commas, showing
    at most ``limit`` entries followed by ``...`` when there are more.
    """
    formatted = [f'{key[0]}/{key[1]}' for key in keys]
    if len(formatted) > limit:
        return ', '.join(formatted[:limit]) + ', ...'
    return ', '.join(formatted)


@monitor_command
def auditor_gpfs_fileset_limits(opts):
    """
    Ensure all GPFS filesets have the correct limits in GPFS as
    determined by the usage record and that the usage records have
    been updated in the last 48 hours

    :param opts: Check options (unused).
    :returns: ``(status, data, pretext)`` tuple. The status is CRITICAL
        when any fileset has a stale or missing usage record, or limits
        differing from the database by more than a relative 0.5%.
    """
    pretext = (
        'Check GPFS limits match the database and that usage '
        'records are up-to-date.'
    )
    status = 'OK'
    data = {}
    client = VandyAdminDBClient()
    filesets = {
        (f['filesystem'], f['name'], f['fileset']): f
        for f in client.all_gpfs_fileset_info(active=True)
    }
    data['Total active GPFS filesets'] = str(len(filesets))

    # Query each filesystem once, not once per fileset that lives on it.
    filesystems = {key[0] for key in filesets}
    usage = {}
    for fsystem in filesystems:
        usage_records = client.get_gpfs_usage_records(
            filesystem=fsystem, type='FILESET'
        )
        for r in usage_records:
            usage[(r['filesystem'], r['name'], r['fileset'])] = r

    limit_fields = ('block_quota', 'block_limit', 'file_quota', 'file_limit')
    # A single cutoff is used for every fileset in this run.
    stale = datetime.datetime.now() - datetime.timedelta(hours=48)
    stale_records = []
    wrong_limits = []
    for fset, expected in filesets.items():
        # skip the root fileset, this produces no record in gpfs52
        if fset[1] == 'root':
            continue
        if fset not in usage:
            stale_records.append(fset)
            continue
        record = usage[fset]
        # A stale record is flagged, but its limits are still compared.
        if record['last_check'] < stale:
            stale_records.append(fset)
        limits_ok = all(
            isclose(record[field], expected[field], rel_tol=5e-3)
            for field in limit_fields
        )
        if not limits_ok:
            wrong_limits.append(fset)

    if stale_records:
        data['Stale or missing usage records for filesets'] = (
            _summarize_filesets(stale_records)
        )
        status = 'CRITICAL'
    if wrong_limits:
        data['Incorrect quota/limits for filesets'] = (
            _summarize_filesets(wrong_limits)
        )
        status = 'CRITICAL'
    return status, data, pretext