"""
These "checks" are generally long running audits to be performed by the
ACCRE auditor. They will not be available in the ``accre-monitor`` command
unless an "auditor" field is set to true.
For an overview of the monitoring check framework see
:mod:`accre.monitor`.
"""
from collections import Counter
import datetime
from math import isclose
import socket
import subprocess
from urllib.parse import unquote
import yaml
from accre.account_management import VUNetIDValidator
from accre.gpfs import (
run_mmlslicense,
run_mmlsnode,
run_gpfs_command,
parse_gpfs_y_command,
GPFSCommandError
)
from accre.monitor import monitor_command
from accre.nagios import retrieve_object_cache
from accre.proxmox import ACCREProxmox
from accre.ldap import ACCRELDAP, VUDS
from accre.database import VandyAdminDBClient
from accre.util import (
interpret_string_values,
get_primary_ip,
byte_quantity_isclose
)
from accre.config import get_config
from accre.slurm import (
get_slurm_associations,
groups_by_account,
get_default_groups,
list_compute_nodes
)
from accre.email import (
get_dl_addresses,
add_dl_addresses,
remove_dl_addresses,
ZimbraCommandError
)
# Configuration object returned by get_config(), loaded once at import time.
# Within this module it is read for the ['account-management']['gpfs-node']
# entry (the host the non-VUDS email list is copied to).
CONFIG = get_config()
@monitor_command
def auditor_hello(opts):
    """
    Declare yourself as the auditor.

    There should be only one auditor node. This will show the hostname,
    internal IP, and RSA host key of the node.

    :param opts: check options; the optional ``keyloc`` entry overrides
        the path of the public host key file (default
        ``/etc/ssh/ssh_host_rsa_key.pub``).
    :returns: tuple of status string, data dictionary and short
        description. The status is always ``'OK'``.
    :raises subprocess.TimeoutExpired: if ``ssh-keygen`` does not finish
        within 60 seconds.
    """
    keyloc = opts.get('keyloc', '/etc/ssh/ssh_host_rsa_key.pub')
    data = {}
    data['hostname'] = socket.gethostname()
    data['primary interface'] = get_primary_ip()
    # subprocess.run kills and reaps the child when the timeout expires;
    # Popen.communicate(timeout=...) alone would leave it running.
    proc = subprocess.run(
        ['/usr/bin/env', 'ssh-keygen', '-l', '-f', keyloc],
        stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=60
    )
    # The fingerprint line ends with the free-form key comment, so a
    # strict ASCII decode could raise; replace undecodable bytes instead.
    data['rsa host key'] = proc.stdout.decode('ascii', errors='replace')
    if proc.returncode != 0:
        # Surface why the key could not be read instead of silently
        # reporting an empty fingerprint.
        data['rsa host key error'] = proc.stderr.decode(
            'ascii', errors='replace'
        )
    short_desc = 'Meesa the auditor. Yousa should follow me now, okeeday?'
    return 'OK', data, short_desc
@monitor_command
def auditor_vandy_active(opts):
    """
    Cross-check the active ACCRE LDAP users against Vanderbilt VUNetIDs.

    Every active ACCRE user is looked up with the VUNetID validator and
    recorded as missing (no such VUNetID) or locked. Active accounts that
    are also listed as robots are reported for information.

    Notice - 2021/01 - Eric - Currently this needs cleanup and has
    for over a year so we will not send WARNING/CRITICAL for now even
    if there are problems.

    :param opts: check options (not used by this check).
    :returns: tuple of status string, data dictionary and short
        description. The status is always ``'OK'``; only the description
        changes when problems are found.
    """
    with ACCRELDAP(caching=True) as ldap:
        everyone = ldap.list_users()
        active = ldap.list_users(active=True)
        robot_names = ldap.list_robots()
    active_robots = list(set(active) & set(robot_names))

    absent = []
    locked = []
    with VUNetIDValidator() as validator:
        for name in active:
            if not validator.exists(name):
                absent.append(name)
                continue
            if validator.is_locked(name):
                locked.append(name)

    data = {
        'Total ACCRE Users': len(everyone),
        'Active ACCRE Users': len(active)
    }
    if active_robots:
        data['Active ACCRE Robots'] = ', '.join(active_robots)

    summary = 'All cluster users have active VUnetIDs'
    if locked:
        data['Active ACCRE Users with locked VUNetIDs'] = ', '.join(locked)
        summary = 'OK - Active ACCRE users have locked VUNetIDs'
    if absent:
        # Checked last so that missing IDs win over locked IDs in the
        # short description.
        data['Active ACCRE Users with missing VUNetIDs'] = ', '.join(absent)
        summary = 'OK - Active ACCRE users have missing VUNetIDs'
    # Deliberately never escalated, see the notice in the docstring.
    return 'OK', data, summary
@monitor_command
def auditor_accre_active(opts):
    """
    Compare the active users in ACCRE LDAP with the ACCRE database.

    Three kinds of disagreement are reported:

    * users active in LDAP but not active in the database (WARNING)
    * users active in the database but inactive in LDAP (CRITICAL)
    * users active in the database but absent from LDAP (CRITICAL)

    :param opts: check options (not used by this check).
    :returns: tuple of status string, data dictionary and short
        description.
    """
    with ACCRELDAP(caching=True) as ldap:
        ldap_active = set(ldap.list_users(active=True))
        ldap_all = set(ldap.list_users())
    ldap_inactive = ldap_all - ldap_active
    db_active = set(VandyAdminDBClient().list_users(active=True))

    data = {
        'Total Users (ldap)': len(ldap_all),
        'Active Users (ldap)': len(ldap_active),
        'Active Users (database)': len(db_active)
    }
    status = 'OK'
    pretext = 'The active users in the ACCRE database match ACCRE LDAP'
    critical_text = 'CRITICAL - ACCRE LDAP user inconsistency with database'

    active_only_in_ldap = ldap_active - db_active
    if active_only_in_ldap:
        status = 'WARNING'
        pretext = 'WARNING - ACCRE LDAP user inconsistency with database'
        data['Active users (ldap) inactive or missing in database'] = (
            ', '.join(sorted(active_only_in_ldap))
        )

    # The critical findings are evaluated after the warning so that they
    # take precedence in the final status.
    disabled_in_ldap = db_active & ldap_inactive
    if disabled_in_ldap:
        status = 'CRITICAL'
        pretext = critical_text
        data['Active users (db) set to inactive in ldap'] = (
            ', '.join(sorted(disabled_in_ldap))
        )

    absent_from_ldap = db_active - ldap_all
    if absent_from_ldap:
        status = 'CRITICAL'
        pretext = critical_text
        data['Active users (db) missing in ldap'] = (
            ', '.join(sorted(absent_from_ldap))
        )
    return status, data, pretext
@monitor_command
def auditor_accre_groups(opts):
    """
    Ensure that all groups in ACCRE LDAP have a corresponding group
    in the database with the same GID, and that all active database
    groups are in LDAP.

    :param opts: check options (not used by this check).
    :returns: tuple of status string, data dictionary and short
        description. The status is ``'CRITICAL'`` if any GID differs or a
        group exists on only one side, otherwise ``'OK'``.
    """
    pretext = (
        'Check the groups in the ACCRE database for consistency '
        'with ACCRE LDAP'
    )
    status = 'OK'
    with ACCRELDAP(caching=True) as client:
        ldap_groups = client.list_groups()
        gid_map = {x[0]: x[1] for x in client.list_group_gids()}
    dbclient = VandyAdminDBClient()
    database_groups = dbclient.list_groups()
    active_database_groups = dbclient.list_groups(active=True, posix=True)
    active_db_group_info = {
        g['name']: g
        for g in dbclient.all_groups_info(active=True, posix=True)
    }
    # Sets give constant-time membership tests; the original sequences
    # are still the ones iterated so the reporting order is unchanged.
    active_db_names = set(active_database_groups)
    ldap_names = set(ldap_groups)

    missing_database_groups = []
    inconsistent_gids = []
    for group in ldap_groups:
        if group not in active_db_names:
            missing_database_groups.append(group)
            continue
        db_gid = active_db_group_info[group]['group_id']
        ldap_gid = gid_map[group]
        if db_gid != ldap_gid:
            inconsistent_gids.append(
                '{0} db gid: {1} ldap gid: {2}'.format(
                    group, db_gid, ldap_gid
                )
            )
    missing_ldap_groups = [
        group for group in active_database_groups
        if group not in ldap_names
    ]

    data = {
        'Total LDAP Groups': len(ldap_groups),
        'Total Database Groups': len(database_groups),
        'Active Database Groups': len(active_database_groups)
    }
    if inconsistent_gids:
        status = 'CRITICAL'
        data['Inconsistent Group IDs'] = ', '.join(inconsistent_gids)
    if missing_database_groups:
        status = 'CRITICAL'
        data['LDAP groups missing in database'] = (
            ', '.join(missing_database_groups)
        )
    if missing_ldap_groups:
        status = 'CRITICAL'
        data['Active database groups missing in LDAP'] = (
            ', '.join(missing_ldap_groups)
        )
    return status, data, pretext
@monitor_command
def auditor_accre_group_membership(opts):
    """
    Ensure that all active LDAP users have primary and secondary
    groups matching the ACCRE database.

    For every active LDAP user the primary GID is compared with the
    database record, and the LDAP group memberships are compared with the
    database secondary groups (restricted to active posix groups). The
    user's primary group is not counted as an extra LDAP membership.

    Active LDAP users that have no database record are reported instead
    of aborting the whole audit with a ``KeyError``.

    :param opts: check options (not used by this check).
    :returns: tuple of status string, data dictionary and short
        description. The status is ``'CRITICAL'`` if any discrepancy is
        found, otherwise ``'OK'``.
    """
    pretext = (
        'Check group membership consistency between LDAP and the Database'
    )
    status = 'OK'
    bad_primary = []
    ldap_missing_sec = []
    ldap_extra_sec = []
    missing_db_users = []
    dbclient = VandyAdminDBClient()
    # set for constant-time membership tests inside the per-user loop
    posix_groups = set(dbclient.list_groups(active=True, posix=True))
    dbusers = {
        u['vunetid']: u
        for u in dbclient.all_users_info(secondary_groups=True)
    }
    with ACCRELDAP(caching=True) as client:
        group_membership = client.group_membership()
        userlist = client.list_users(active=True)
        for user in userlist:
            db_info = dbusers.get(user)
            if db_info is None:
                # Nothing to compare against; report and move on.
                missing_db_users.append(user)
                continue
            db_pri = db_info['group_id']
            db_sec = {
                g for g in db_info['secondary_groups'] if g in posix_groups
            }
            ldap_pri = client.posixuser(user).gid
            ldap_all = {
                group for group in group_membership
                if user in group_membership[group]
            }
            if db_pri != ldap_pri:
                bad_primary.append(
                    '{0} has primary group {1} in db and {2} in ldap'.format(
                        user, db_pri, ldap_pri
                    )
                )
            extra = ldap_all.difference(db_sec)
            # membership in the primary group is expected, not "extra"
            extra.discard(db_info['group'])
            missing = db_sec.difference(ldap_all)
            if extra:
                ldap_extra_sec.append(
                    '{0} has groups {1}'.format(user, extra)
                )
            if missing:
                ldap_missing_sec.append(
                    '{0} missing groups {1}'.format(user, missing)
                )
    data = {
        'Total users checked': len(userlist)
    }
    if missing_db_users:
        status = 'CRITICAL'
        data['Active LDAP users missing in database'] = (
            ', '.join(missing_db_users)
        )
    if bad_primary:
        status = 'CRITICAL'
        data['Users have mismatched primary groups'] = ', '.join(bad_primary)
    if ldap_extra_sec:
        status = 'CRITICAL'
        data['Users have additional groups in LDAP'] = (
            ', '.join(ldap_extra_sec)
        )
    if ldap_missing_sec:
        status = 'CRITICAL'
        data['Users are missing secondary groups in LDAP'] = (
            ', '.join(ldap_missing_sec)
        )
    return status, data, pretext
@monitor_command
def auditor_scheduler_accounts(opts):
    """
    Compare the scheduler accounts in the database with slurm.

    Slurm accounts are the user-less associations whose account name ends
    in ``_account`` (the suffix is stripped before comparing). Database
    accounts are those flagged both active and scheduler_account.
    Accounts present on both sides must also agree on the attributes
    checked by ``_compare_slurm_attrs``.

    :param opts: check options (not used by this check).
    :returns: tuple of status string, data dictionary and short
        description. The status is ``'CRITICAL'`` on any mismatch,
        otherwise ``'OK'``.
    """
    suffix = '_account'
    slurm_accts = {}
    for entry in get_slurm_associations(ssh=True):
        if entry['user']:
            continue
        full_name = entry['account']
        if not full_name.endswith(suffix):
            continue
        slurm_accts[full_name[:-len(suffix)]] = entry

    db_accts = {}
    for record in VandyAdminDBClient().all_accounts_info():
        if record['active'] == True and record['scheduler_account'] == True:
            db_accts[record['name']] = record

    attr_mismatch = []
    only_in_slurm = []
    for name, slurm_info in slurm_accts.items():
        if name not in db_accts:
            only_in_slurm.append(name)
        elif not _compare_slurm_attrs(db_accts[name], slurm_info):
            attr_mismatch.append(name)
    only_in_db = [name for name in db_accts if name not in slurm_accts]

    data = {
        'Total Slurm Accounts': len(slurm_accts),
        'Active Database Accounts': len(db_accts)
    }
    findings = (
        ('Inconsistent account attributes', attr_mismatch),
        ('Slurm accounts missing in database', only_in_slurm),
        ('Active database accounts missing in slurm', only_in_db),
    )
    status = 'OK'
    for label, names in findings:
        if names:
            status = 'CRITICAL'
            data[label] = ', '.join(names)
    pretext = (
        'Check the accounts in the ACCRE database for consistency '
        'with the slurm scheduler'
    )
    return status, data, pretext
@monitor_command
def auditor_scheduler_groups(opts):
    """
    Ensure that all scheduler groups have a corresponding group
    in the slurm database with the correct properties.

    Slurm groups are the user-less associations whose account is neither
    ``root`` nor named ``*_account``. Each one must exist as an active
    scheduler group in the database, agree on the attributes checked by
    ``_compare_slurm_attrs``, and sit under the slurm account named
    ``<database account>_account``.

    :param opts: check options (not used by this check).
    :returns: tuple of status string, data dictionary and short
        description. The status is ``'CRITICAL'`` on any mismatch,
        otherwise ``'OK'``.
    """
    pretext = (
        'Check the groups in the ACCRE database for consistency '
        'with the slurm scheduler'
    )
    status = 'OK'
    assoc = get_slurm_associations(ssh=True)
    slurm_groups = {
        a['account']: a for a in assoc
        if not a['user']
        and not a['account'].endswith('_account')
        and a['account'] != 'root'
    }
    # Invert {account: [groups]} into {group: account}
    accounts = {
        g: acct
        for acct, gba in groups_by_account(ssh=True).items()
        for g in gba
    }
    dbclient = VandyAdminDBClient()
    active_database_groups = dbclient.list_groups(active=True, scheduler=True)
    active_db_group_info = {
        g['name']: g for g in
        dbclient.all_groups_info(active=True, scheduler=True)
    }
    # set for constant-time membership tests; the list is still the one
    # iterated below so the reporting order is unchanged
    active_db_names = set(active_database_groups)

    inconsistent_attrs = []
    inconsistent_accounts = []
    missing_database_groups = []
    for group in slurm_groups:
        if group not in active_db_names:
            missing_database_groups.append(group)
            continue
        info = active_db_group_info[group]
        if not _compare_slurm_attrs(info, slurm_groups[group]):
            inconsistent_attrs.append(group)
        # A group without any slurm account mapping is reported as
        # inconsistent rather than raising KeyError.
        if (not info['account'] or
                info['account'] + '_account' != accounts.get(group)):
            inconsistent_accounts.append(group)
    missing_slurm_groups = [
        group for group in active_database_groups
        if group not in slurm_groups
    ]

    data = {
        'Total Slurm Groups': len(slurm_groups),
        'Active Database Scheduler Groups': len(active_database_groups)
    }
    if inconsistent_attrs:
        status = 'CRITICAL'
        data['Inconsistent Group attributes'] = ', '.join(inconsistent_attrs)
    if inconsistent_accounts:
        status = 'CRITICAL'
        data['Inconsistent Group accounts'] = ', '.join(inconsistent_accounts)
    if missing_database_groups:
        status = 'CRITICAL'
        data['Scheduler groups missing in database'] = (
            ', '.join(missing_database_groups)
        )
    if missing_slurm_groups:
        status = 'CRITICAL'
        data['Active database groups missing in slurm'] = (
            ', '.join(missing_slurm_groups)
        )
    return status, data, pretext
@monitor_command
def auditor_scheduler_associations(opts):
    """
    Compare slurm user associations with those derived from the database.

    An association is the triple (user, account, partition). Slurm
    entries without a user, or belonging to ``root``, are ignored. Note
    that this check does not look at accelerated partition associations.

    :param opts: check options (not used by this check).
    :returns: tuple of status string, data dictionary and short
        description. The status is ``'CRITICAL'`` if either side has an
        association the other lacks, otherwise ``'OK'``.
    """
    slurm_assoc = set()
    for entry in get_slurm_associations(ssh=True):
        who = entry['user']
        if who and who != 'root':
            slurm_assoc.add((who, entry['account'], entry['partition']))

    db_assoc = set(VandyAdminDBClient().list_scheduler_associations())
    only_in_db = db_assoc - slurm_assoc
    only_in_slurm = slurm_assoc - db_assoc

    status = 'OK'
    data = {
        'Total Slurm Associations': len(slurm_assoc),
        'Total Database Associations': len(db_assoc)
    }
    if only_in_db:
        status = 'CRITICAL'
        data['Database associations missing in slurm'] = (
            ', '.join(map(str, only_in_db))
        )
    if only_in_slurm:
        status = 'CRITICAL'
        data['Slurm associations not indicated by database'] = (
            ', '.join(map(str, only_in_slurm))
        )
    pretext = (
        'Check the users/groups/partitions in the ACCRE DB for consistency '
        'with the slurm scheduler'
    )
    return status, data, pretext
@monitor_command
def auditor_scheduler_acc_associations(opts):
    """
    Ensure that all accelerated scheduler associations in slurm match those
    as determined by the database user, group, and partition records.

    An association is the triple (user, account, partition). Slurm
    entries without a user, or belonging to ``root``, are ignored. The
    ``_acc`` suffix is stripped from slurm account names before they are
    compared with the database.

    :param opts: check options (not used by this check).
    :returns: tuple of status string, data dictionary and short
        description. The status is ``'CRITICAL'`` if either side has an
        association the other lacks, otherwise ``'OK'``.
    """
    pretext = (
        'Check the users/groups/partitions in the ACCRE DB for consistency '
        'with the slurm scheduler for accelerated parititons'
    )
    status = 'OK'
    acc_suffix = '_acc'

    def base_account(name):
        # Only strip a suffix that is really there; blindly dropping the
        # last four characters would mangle any other account name and
        # hide it behind a misleading truncated value in the report.
        if name.endswith(acc_suffix):
            return name[:-len(acc_suffix)]
        return name

    assoc = get_slurm_associations(ssh=True, accelerated=True, regular=False)
    slurm_assoc = {
        (a['user'], base_account(a['account']), a['partition'])
        for a in assoc if a['user'] and a['user'] != 'root'
    }
    dbclient = VandyAdminDBClient()
    db_assoc = set(dbclient.list_scheduler_acc_associations())
    missing_slurm_assoc = db_assoc - slurm_assoc
    missing_database_assoc = slurm_assoc - db_assoc
    data = {
        'Total Slurm Accel Associations': len(slurm_assoc),
        'Total Database Accel Associations': len(db_assoc)
    }
    if missing_slurm_assoc:
        status = 'CRITICAL'
        data['Database accel associations missing in slurm'] = (
            ', '.join(str(a) for a in missing_slurm_assoc)
        )
    if missing_database_assoc:
        status = 'CRITICAL'
        data['Slurm accel associations not indicated by database'] = (
            ', '.join(str(a) for a in missing_database_assoc)
        )
    return status, data, pretext
@monitor_command
def auditor_scheduler_default_groups(opts):
    """
    Check each active user's default scheduler group against the database.

    A user is checked only when their database primary group is an active
    scheduler group and the scheduler reports a default group for them.
    A mismatch is ignored when the primary group has no partitions in the
    database; such users still count towards the number checked.

    :param opts: check options (not used by this check).
    :returns: tuple of status string, data dictionary and short
        description. The status is ``'CRITICAL'`` if any user has the
        wrong default group, otherwise ``'OK'``.
    """
    dbclient = VandyAdminDBClient()
    active_users = {}
    for record in dbclient.all_users_info():
        if record['active'] == True:
            active_users[record['vunetid']] = record
    scheduler_groups = dbclient.list_groups(active=True, scheduler=True)
    default_groups = get_default_groups(ssh=True)

    checked = 0
    mismatched = []
    for name, record in active_users.items():
        primary = record['group']
        if primary not in scheduler_groups or name not in default_groups:
            continue
        checked += 1
        if primary == default_groups[name]:
            continue
        # primary groups without partitions are not flagged
        if dbclient.group_info(primary)['partitions']:
            mismatched.append(name)

    status = 'OK'
    data = {'Total users checked': checked}
    if mismatched:
        status = 'CRITICAL'
        data['Users have wrong default scheduler group'] = (
            ', '.join(mismatched)
        )
    pretext = (
        'Check default group consistency between scheduler and the Database'
    )
    return status, data, pretext
def _compare_slurm_attrs(a, b):
    """
    Tell whether two records agree on their scheduler limits.

    ``qos``, ``fairshare``, ``max_cpu`` and ``max_runmins`` must be
    equal. ``max_mem`` matches when both values are ``None`` or when
    ``byte_quantity_isclose`` accepts the pair; a ``None`` on only one
    side is a mismatch.

    :param a: mapping holding the five attributes.
    :param b: mapping holding the five attributes.
    :returns: ``True`` if every attribute matches, otherwise ``False``.
    """
    exact_keys = ('qos', 'fairshare', 'max_cpu', 'max_runmins')
    if any(a[key] != b[key] for key in exact_keys):
        return False
    mem_a, mem_b = a['max_mem'], b['max_mem']
    if mem_a is None or mem_b is None:
        # only equal when the limit is unset on both sides
        return mem_a is mem_b
    return bool(byte_quantity_isclose(mem_a, mem_b))
@monitor_command
def auditor_compute_node_checkin(opts):
    """
    Verify that responding compute nodes have checked in recently.

    The responding compute nodes are compared, as ``<node>.vampire``
    hostnames, with the managed hosts recorded in the database during the
    last 24 hours. Nodes known to the scheduler but not responding are
    listed as dead for information only.

    :param opts: check options (not used by this check).
    :returns: tuple of status string, data dictionary and short
        description. The status is ``'CRITICAL'`` if a responding node has
        no recent check-in, otherwise ``'OK'``.
    """
    cutoff = datetime.datetime.now() - datetime.timedelta(days=1)
    recent_hosts = VandyAdminDBClient().all_managed_hosts_info(since=cutoff)
    checkins = {host['hostname'] for host in recent_hosts}

    active_nodes = list_compute_nodes(ssh=True)
    all_nodes = list_compute_nodes(responding=False, ssh=True)
    dead_nodes = set(all_nodes) - set(active_nodes)
    stale = [
        hostname
        for hostname in (node + '.vampire' for node in active_nodes)
        if hostname not in checkins
    ]

    status = 'OK'
    data = {'Total compute nodes': len(all_nodes)}
    if dead_nodes:
        # informational only, dead nodes do not change the status
        data['Dead compute nodes'] = ', '.join(dead_nodes)
    if stale:
        status = 'CRITICAL'
        data['Active compute nodes have not recently run CFE'] = (
            ', '.join(stale)
        )
    pretext = (
        'Check that all responding compute nodes have checked in with CFE'
    )
    return status, data, pretext
@monitor_command
def auditor_compute_node_nagios(opts):
    """
    Verify that every responding compute node is a nagios managed host.

    :param opts: check options (not used by this check).
    :returns: tuple of status string, data dictionary and short
        description. The status is ``'CRITICAL'`` if a responding node is
        absent from the nagios object cache, otherwise ``'OK'``.
    """
    nagios_nodes = retrieve_object_cache().managed_hosts()
    active_nodes = list_compute_nodes(ssh=True)
    unmonitored = set(active_nodes) - set(nagios_nodes)

    monitored_count = len(active_nodes) - len(unmonitored)
    data = {'Active compute nodes in nagios': monitored_count}
    if unmonitored:
        data['Compute nodes missing in nagios'] = ', '.join(unmonitored)
    status = 'CRITICAL' if unmonitored else 'OK'
    pretext = (
        'Check that all responding compute nodes are monitored in nagios'
    )
    return status, data, pretext
@monitor_command
def auditor_compute_node_kernels(opts):
    """
    Determine all the kernel versions on the compute nodes that have
    checked in (over last week) and report.

    Only managed hosts whose hostname is ``<node>.vampire`` for a compute
    node known to the scheduler (responding or not) are counted.

    :param opts: check options (not used by this check).
    :returns: tuple of status string, data dictionary and short
        description. The status is ``'WARNING'`` if more than one kernel
        version is present, otherwise ``'OK'``.
    """
    pretext = (
        'Report on the kernel versions present on all compute nodes '
        'that have checked in within the last week.'
    )
    status = 'OK'
    since = datetime.datetime.now() - datetime.timedelta(days=7)
    dbclient = VandyAdminDBClient()
    hosts_info = dbclient.all_managed_hosts_info(since=since)
    compute_nodes = list_compute_nodes(ssh=True, responding=False)
    # set for constant-time membership tests while filtering the hosts
    hostnames = {x + '.vampire' for x in compute_nodes}
    kernels = Counter(
        x['kernel'] for x in hosts_info if x['hostname'] in hostnames
    )
    data = {
        'Total compute nodes checking in': sum(kernels.values())
    }
    if len(kernels) > 1:
        status = 'WARNING'
    data['Compute node kernel versions'] = ', '.join(
        '{0} nodes with {1}'.format(v, k) for k, v in kernels.items()
    )
    return status, data, pretext
@monitor_command
def auditor_gfps_license_count(opts):
    """
    Determine the number of GPFS client and server licenses in
    use and compare to the configured limits.

    The purchased socket counts are read from the ``gpfs.licenses``
    section of ``/etc/accre_general.yaml``. Usage is counted as two
    sockets per licensed server, per Proxmox hypervisor node and per
    bare-metal client. Bare-metal clients are the GPFS nodes whose name
    does not start with ``nsd``, ``gpfsmgr`` or ``vm-``.

    :param opts: check options (not used by this check).
    :returns: tuple of status string, data dictionary and short
        description. The status is ``'CRITICAL'`` if usage exceeds a
        purchased limit or any license is reported missing, otherwise
        ``'OK'``.
    """
    pretext = (
        'Report the current GPFS server and client licenses in use '
        'compared to the total available.'
    )
    status = 'OK'
    # Context manager so the file handle is closed deterministically
    with open('/etc/accre_general.yaml') as gcfile:
        gcraw = gcfile.read()
    # BaseLoader yields only plain strings/lists/dicts, hence int() below
    gc = yaml.load(gcraw, Loader=yaml.BaseLoader)
    proxmox = ACCREProxmox()
    hypervisors = len(proxmox.cluster.resources.get(type='node'))
    linfo = run_mmlslicense(ssh=True)
    server_allowed = int(gc['gpfs']['licenses']['server'])
    client_allowed = int(gc['gpfs']['licenses']['client'])
    nodes = run_mmlsnode(ssh=True)['Accre']
    virtual = len([n for n in nodes if n.startswith('vm-')])
    baremetal = len([
        n for n in nodes
        if not n.startswith(('nsd', 'gpfsmgr', 'vm-'))
    ])
    used_server_sockets = 2 * linfo['server']
    total_client_sockets = 2 * (hypervisors + baremetal)
    data = {}
    data['Purchased server sockets'] = server_allowed
    data['Used server sockets'] = used_server_sockets
    data['Purchased client sockets'] = client_allowed
    data['Used client sockets'] = total_client_sockets
    data['Hypervisor sockets'] = 2 * hypervisors
    data['Virtual machine clients'] = virtual
    if total_client_sockets > client_allowed:
        status = 'CRITICAL'
    if used_server_sockets > server_allowed:
        status = 'CRITICAL'
    if linfo['missing_server']:
        data['Missing server licenses'] = linfo['missing_server']
        status = 'CRITICAL'
    if linfo['missing_client']:
        data['Missing client licenses'] = linfo['missing_client']
        status = 'CRITICAL'
    return status, data, pretext
@monitor_command
def auditor_sync_zimbra_announce_dl(opts):
    """
    Ensure all active users and PIs are in the zimbra announce_dl
    distribution list, make any required repairs to the list and
    report.

    Addresses are compared in lower case. Addresses missing from the list
    are added and addresses on the list that belong to no active user or
    PI are removed.

    If the current list membership cannot be retrieved nothing can be
    compared, so the check returns ``'CRITICAL'`` immediately without
    attempting any changes.

    :param opts: check options (not used by this check).
    :returns: tuple of status string, data dictionary and short
        description. The status is ``'CRITICAL'`` if any zimbra command
        fails, otherwise ``'OK'``.
    """
    pretext = (
        'Synchronize the zimbra annouce_dl distribution list with '
        'all active ACCRE users and PIs.'
    )
    status = 'OK'
    data = {}
    client = VandyAdminDBClient()
    dbmails = {
        u['email'].lower() for u in client.all_users_info()
        if u['active']
    }
    for pi in client.list_pis(active=True):
        info = client.pi_info(pi)
        dbmails.add(info['email'].lower())
    data['Total active user and PI addresses'] = str(len(dbmails))
    try:
        zimbra = get_dl_addresses('announce_dl')
    except ZimbraCommandError as e:
        # Previously execution fell through and hit a NameError on the
        # unbound ``zimbra``; report the failure and stop instead.
        data['Zimbra command failure'] = str(e)
        return 'CRITICAL', data, pretext
    data['Total announce email recipients before sync'] = str(len(zimbra))
    # sorted so the report (and the commands issued) are deterministic
    missing = sorted(dbmails - zimbra)
    removable = sorted(zimbra - dbmails)
    if missing:
        try:
            add_dl_addresses('announce_dl', missing)
            data['Added email addresses'] = ', '.join(missing)
        except ZimbraCommandError as e:
            status = 'CRITICAL'
            data['Zimbra command failure'] = str(e)
    if removable:
        try:
            remove_dl_addresses('announce_dl', removable)
            data['Removed email addresses'] = ', '.join(removable)
        except ZimbraCommandError as e:
            status = 'CRITICAL'
            data['Zimbra command failure'] = str(e)
    return status, data, pretext
@monitor_command
def auditor_sync_vuds_accre_users(opts):
    """
    Ensure all active users and PIs that are not robot accounts
    are in the VUIT VUDS ACCRE_Users group, make any required repairs
    to the group and report.

    Users with legacy VUMC IDs will not be in VUDS. A text file with
    a list of email addresses for these users will be created
    and saved to the file /data/accre/accre_emails_non_vuds.txt
    on GPFS. The file holds one address per user that could not be added
    to the group during this run, and is empty if there were none.

    :param opts: check options (not used by this check).
    :returns: tuple of status string, data dictionary and short
        description. The status is ``'CRITICAL'`` if users could not be
        removed from the group or the emails file could not be produced.
        Users that could not be added are reported but do not change the
        status.
    """
    pretext = (
        'Synchronize the VUIT VUDS ACCRE_Users group with '
        'all non-robot active ACCRE users and PIs.'
    )
    status = 'OK'
    data = {}
    # Bound up front: the emails file step below reads it even when no
    # user needed adding (it used to hit a NameError in that case).
    fails = []
    client = VandyAdminDBClient()
    users = set(client.list_users(active=True))
    users.update(client.list_pis(active=True))
    with ACCRELDAP() as a:
        robots = set(a.list_robots())
    people = users - robots
    data['Total active non-robot user and PI addresses'] = str(len(people))
    with VUDS() as ad:
        ad_users = set(ad.list_accre_users())
        # sorted so the report and the order of changes are deterministic
        missing = sorted(people - ad_users)
        removable = sorted(ad_users - people)
        if missing:
            newusers = []
            # one call per user so a single failure does not block the rest
            for user in missing:
                if ad.add_accre_users([user]):
                    newusers.append(user)
                else:
                    fails.append(user)
            data['Added new ACCRE users'] = ', '.join(newusers)
            if fails:
                # Reported without raising the status; presumably these
                # are the legacy VUMC IDs mentioned in the docstring.
                data['Failed to add users'] = ', '.join(fails)
                data['Number of users not added'] = str(len(fails))
        if removable:
            success = ad.delete_accre_users(removable)
            rmv_str = ', '.join(removable)
            if success:
                data['Removed non-ACCRE users'] = rmv_str
            else:
                data['Failed to remove users'] = rmv_str
                status = 'CRITICAL'
    # Reporting boundary: any failure while building or copying the file
    # is turned into a CRITICAL result rather than an exception.
    try:
        users_info = client.all_users_info()
        users_emails = {x['vunetid']: x['email'] for x in users_info}
        pis_info = client.all_pis_info()
        pis_emails = {x['vunetid']: x['email'] for x in pis_info}
        # PI records take precedence over user records for the same ID
        all_emails = {**users_emails, **pis_emails}
        gpfs_node = 'root@{0}'.format(
            CONFIG['account-management']['gpfs-node']
        )
        tmp_file = '/tmp/accre_emails_non_vuds.txt'
        dest_file = '/data/accre/accre_emails_non_vuds.txt'
        with open(tmp_file, 'w') as emailfile:
            for user in fails:
                emailfile.write(all_emails[user] + '\n')
        copy_cmd = ['scp', tmp_file, f'{gpfs_node}:{dest_file}']
        # subprocess.run kills and reaps scp if the timeout expires
        proc = subprocess.run(
            copy_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
            timeout=60
        )
        if proc.returncode != 0:
            raise Exception('Failed to move emails file to gpfs node')
    except Exception as e:
        data['Failed to generate emails file'] = str(e)
        status = 'CRITICAL'
    return status, data, pretext
@monitor_command
def auditor_gpfs_fileset_limits(opts):
    """
    Ensure all GPFS filesets have the correct limits in GPFS as
    determined by the usage record and that the usage records have
    been updated in the last 48 hours.

    Filesets are keyed by (filesystem, name, fileset). A fileset is
    reported as stale when it has no FILESET usage record or when the
    record's ``last_check`` is older than 48 hours. Its limits are
    reported as wrong when any of ``block_quota``, ``block_limit``,
    ``file_quota`` or ``file_limit`` differs from the usage record by
    more than a relative tolerance of 5e-3. At most ten filesets are
    listed per finding.

    :param opts: check options (not used by this check).
    :returns: tuple of status string, data dictionary and short
        description. The status is ``'CRITICAL'`` if any fileset is stale
        or has wrong limits, otherwise ``'OK'``.
    """
    pretext = (
        'Check GPFS limits match the database and that usage '
        'records are up-to-date.'
    )
    status = 'OK'
    data = {}
    limit_fields = ('block_quota', 'block_limit', 'file_quota', 'file_limit')

    def summarize(keys):
        # "<filesystem>/<name>" for at most ten filesets
        names = [f'{k[0]}/{k[1]}' for k in keys]
        if len(names) > 10:
            return ', '.join(names[:10]) + ', ...'
        return ', '.join(names)

    client = VandyAdminDBClient()
    filesets = {
        (f['filesystem'], f['name'], f['fileset']): f
        for f in client.all_gpfs_fileset_info(active=True)
    }
    data['Total active GPFS filesets'] = str(len(filesets))
    # Query each filesystem once rather than once per fileset on it
    filesystems = {key[0] for key in filesets}
    usage = {}
    for fsystem in filesystems:
        usage_records = client.get_gpfs_usage_records(
            filesystem=fsystem, type='FILESET'
        )
        for r in usage_records:
            usage[(r['filesystem'], r['name'], r['fileset'])] = r
    # The cutoff does not depend on the fileset, so compute it once
    stale_before = datetime.datetime.now() - datetime.timedelta(hours=48)
    stale_records = []
    wrong_limits = []
    for fset, expected in filesets.items():
        # skip the root fileset, this produces no record in gpfs52
        if fset[1] == 'root':
            continue
        record = usage.get(fset)
        if record is None:
            stale_records.append(fset)
            continue
        if record['last_check'] < stale_before:
            stale_records.append(fset)
        # a stale record is still compared, so one fileset can be
        # reported under both findings
        limits_ok = all(
            isclose(record[x], expected[x], rel_tol=5e-3)
            for x in limit_fields
        )
        if not limits_ok:
            wrong_limits.append(fset)
    if stale_records:
        data['Stale or missing usage records for filesets'] = (
            summarize(stale_records)
        )
        status = 'CRITICAL'
    if wrong_limits:
        data['Incorrect quota/limits for filesets'] = summarize(wrong_limits)
        status = 'CRITICAL'
    return status, data, pretext
@monitor_command
def auditor_gpfs_fileset_links(opts):
    """
    Compare the filesets reported by GPFS with the database records.

    For each audited filesystem the output of ``mmlsfileset -Y`` is
    compared by fileset name with the database records for that
    filesystem:

    * names reported by GPFS without a database record are listed under
      "Missing GPFS filesets"
    * names in the database that GPFS does not report are listed under
      "Spurious GPFS filesets"
    * names present on both sides whose database path differs from the
      (URL-unquoted) GPFS path are listed as incorrect links

    NOTE(review): the "Missing"/"Spurious" labels read as if they were
    swapped relative to the sets they describe - confirm the intent. The
    database query is also not restricted to active filesets although the
    count is labelled "active" - confirm the default of
    ``all_gpfs_fileset_info``.

    :param opts: check options (not used by this check).
    :returns: tuple of status string, data dictionary and short
        description. The status is ``'CRITICAL'`` on any finding,
        otherwise ``'OK'``.
    """
    # This is unlikely to change but if it does we can make it
    # a configuration variable
    FILESYSTEMS = ['gpfs51']
    pretext = (
        'Check GPFS filesets match the database and that link '
        'records are corrent.'
    )
    status = 'OK'
    data = {}
    db_records = VandyAdminDBClient().all_gpfs_fileset_info()
    for fs in FILESYSTEMS:
        listing = parse_gpfs_y_command(
            run_gpfs_command(
                ['/usr/lpp/mmfs/bin/mmlsfileset', fs, '-Y'],
                cluster='accre2',
                ssh=True
            )
        )
        gpfs_paths = {
            row['filesetName']: unquote(row['path']) for row in listing
        }
        db_paths = {
            rec['name']: rec['path']
            for rec in db_records if rec['filesystem'] == fs
        }
        data[f'Total active GPFS filesets on {fs}'] = str(len(db_paths))

        only_in_gpfs = set(gpfs_paths) - set(db_paths)
        if only_in_gpfs:
            data[f'Missing GPFS filesets on {fs}'] = ', '.join(only_in_gpfs)
            status = 'CRITICAL'

        only_in_db = set(db_paths) - set(gpfs_paths)
        if only_in_db:
            data[f'Spurious GPFS filesets on {fs}'] = ', '.join(only_in_db)
            status = 'CRITICAL'

        # only filesets known to both sides can have a wrong link
        badlinks = {
            name for name, db_path in db_paths.items()
            if name in gpfs_paths and db_path != gpfs_paths[name]
        }
        if badlinks:
            data[f'Incorrect link location for filesets on {fs}'] = (
                ', '.join(badlinks)
            )
            status = 'CRITICAL'
    return status, data, pretext