Merge "Add nagios check for radosgw-admin sync status"

This commit is contained in:
Zuul 2024-01-10 07:40:46 +00:00 committed by Gerrit Code Review
commit 0a03288a72
5 changed files with 248 additions and 0 deletions

View File

@ -260,6 +260,29 @@ options:
description: |
Whether additional checks report warning or error when their checks
are positive.
nagios_rgw_zones:
default: ""
type: string
description: |
Comma-separated list of zones that are expected to be connected to this
radosgw. Each zone is checked by searching the output of
`radosgw-admin sync status` for a line of the form
"data sync source: ... (zone-name)".
.
Example:
.
zone1,zone2
nagios_rgw_additional_checks:
default: ""
type: string
description: |
List of additional checks, written as a Python list literal. Each item is
a regular expression to search for in the output of
`radosgw-admin sync status`; a match is reported as critical. Note that
this is a list, unlike `nagios_additional_checks`, which uses a dictionary.
.
Example:
.
['data is behind on']
.
use-direct-io:
type: boolean
default: True

View File

@ -0,0 +1,191 @@
#!/usr/bin/env python3
# Copyright (C) 2005, 2006, 2007, 2012 James Troup <james.troup@canonical.com>
# Copyright (C) 2014, 2017 Canonical
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Authors: Danny Cocks <danny.cocks@canonical.com>
# Based on check_ceph_status.py and authors therein
import re
import argparse
import os
import subprocess
import sys
import time
import traceback
class CriticalError(Exception):
    """Signals a check failure that has to be reported as critical."""
class UnknownError(Exception):
    """Signals that an unknown error was encountered during a check."""
def check_file_freshness(filename, newer_than=3600):
    """Ensure ``filename`` exists, can be read and was modified recently.

    :param filename: path of the file to inspect
    :param newer_than: maximum permitted age of the file in seconds
                       (defaults to 3600)
    :raises: CriticalError if the file is missing, is not readable, is
             older than ``newer_than`` seconds, or has a modification
             time that lies in the future
    """
    # Existence and readability are verified before the timestamp is read.
    if not os.path.exists(filename):
        raise CriticalError("%s: does not exist." % (filename))
    if not os.access(filename, os.R_OK):
        raise CriticalError("%s: is not readable." % (filename))

    # Age of the file relative to the current wall clock; a negative value
    # means the modification time is ahead of "now".
    modified_at = os.stat(filename).st_mtime
    age = time.time() - modified_at
    if age > newer_than:
        too_old = "%s: was last modified on %s and is too old (> %s seconds)."
        raise CriticalError(
            too_old % (filename, time.ctime(modified_at), newer_than))
    if age < 0:
        in_future = "%s: was last modified on %s which is in the future."
        raise CriticalError(in_future % (filename, time.ctime(modified_at)))
def check_radosgw_status(args):
    """Check the sync health of a multizone RadosGW deployment.

    Evaluates the output of 'radosgw-admin sync status'. The output is read
    from `args.status_file` (generated during the separate cronjob) when
    that is set; otherwise the command is run directly.

    As this is the first iteration of this function, only very basic
    checks are done:

    * every zone named in the comma-separated `args.zones` must appear as
      a "data sync source" in the output;
    * none of the regular expressions in `args.additional_checks` (fed
      from the charm config option `nagios_rgw_additional_checks`) may
      match the output.

    :param args: argparse object formatted in the convention of generic
                 Nagios checks
    :returns: string describing the status of the radosgw sync
    :raises: UnknownError, CriticalError
    """
    if args.status_file:
        check_file_freshness(args.status_file)
        with open(args.status_file) as f:
            status_data = f.read()
    else:
        try:
            status_data = subprocess.check_output(
                ['radosgw-admin', 'sync', 'status']).decode('UTF-8')
        except (subprocess.CalledProcessError, OSError) as e:
            # OSError covers the command being absent or not executable,
            # which would otherwise surface as an unhandled traceback.
            raise UnknownError(
                "UNKNOWN: radosgw-admin sync status command "
                "failed with error: {}".format(e))

    # If the realm name is empty, i.e. the first line is
    #     realm <some-uuid> ()
    # then we assume this means this is not multizone, so exit early.
    # str.split always yields at least one element, so [0] is safe.
    first_line = status_data.split('\n')[0].strip()
    if re.match(r"realm .* \(\)", first_line):
        return "No multizone detected"

    # This is a hangover from check_ceph_status.py and not directly
    # applicable here. It is kept as an additional sanity check.
    required_strings = ['realm', 'zonegroup', 'zone']
    if not all(s in status_data for s in required_strings):
        raise UnknownError('UNKNOWN: status data is incomplete')

    # The default message if we end up with no alerts
    message_all_ok = "All OK"

    # The list to collect messages
    msgs = []

    # The always-done checks go here.
    # Currently none

    # Handle checks to do with given expected zones that should be
    # connected.
    if args.zones:
        for zone in args.zones.split(','):
            # Tolerate "a, b" and stray commas; an empty name would
            # otherwise be searched for as "()".
            zone = zone.strip()
            if not zone:
                continue
            # Zone names are literals, not patterns, so escape them.
            search_regex = (r"data sync source:.*\(" + re.escape(zone)
                            + r"\)")
            if re.search(search_regex, status_data) is None:
                msg = ("CRITICAL: Missing expected sync source '{}'"
                       .format(zone))
                msgs.append(msg)

    # For additional checks, also test these things
    if args.additional_checks:
        for check in args.additional_checks:
            m = re.search(check, status_data)
            if m is not None:
                msgs.append("CRITICAL: {}".format(m.group(0)))

    complete_output = '\n'.join(msgs)
    if any(msg.startswith("CRITICAL") for msg in msgs):
        raise CriticalError(complete_output)
    elif msgs:
        # Only reachable once a check adds a non-critical message.
        raise UnknownError(complete_output)
    else:
        return message_all_ok
def parse_args(args):
    """Parse the command line arguments of the radosgw sync status check.

    :param args: list of argument strings (typically ``sys.argv[1:]``)
    :returns: argparse namespace with the attributes ``status_file``
              (path, or False when not given), ``zones`` (comma-separated
              string or None) and ``additional_checks`` (list of patterns,
              or None when the option was never given)
    """
    parser = argparse.ArgumentParser(
        description='Check radosgw-admin sync status')
    parser.add_argument('-f', '--file', dest='status_file',
                        default=False,
                        help='Optional file with "radosgw-admin sync '
                             'status" output. Generally useful for '
                             'testing, and if the Nagios user account '
                             'does not have rights for the Ceph config '
                             'files.')
    parser.add_argument('--zones', dest='zones',
                        default=None,
                        help="Check if the given zones, as a "
                             "comma-separated list, are present in the "
                             "output. If they are missing report critical.")
    # May be repeated; each occurrence appends one pattern to the list.
    parser.add_argument('--additional_check', dest='additional_checks',
                        action='append',
                        help="Check if a given pattern exists in any status "
                             "message. If it does, report critical")
    return parser.parse_args(args)
def main(args):
    """Run the radosgw sync status check and translate it to an exit code.

    The status message (or error) is printed to stdout.

    :param args: parsed command line arguments, as produced by parse_args()
    :returns: integer NRPE return code: 0 (ok), 2 (critical) or
              3 (unknown)
    """
    # Note: leaving "warning" in here, as a reminder for the expected NRPE
    # returncodes, even though this script doesn't output any warnings.
    EXIT_CODES = {'ok': 0, 'warning': 1, 'critical': 2, 'unknown': 3}
    exitcode = 'unknown'
    try:
        output_msg = check_radosgw_status(args)
        print(output_msg)
        exitcode = 'ok'
    except UnknownError as msg:
        print(msg)
        exitcode = 'unknown'
    except CriticalError as msg:
        print(msg)
        exitcode = 'critical'
    except Exception:
        # Top-level boundary: anything unexpected is reported as unknown,
        # with the traceback on stdout.
        print("%s raised unknown exception '%s'"
              % ('check_radosgw_sync_status', sys.exc_info()[0]))
        print('=' * 60)
        traceback.print_exc(file=sys.stdout)
        print('=' * 60)
        exitcode = 'unknown'
    return EXIT_CODES[exitcode]
if __name__ == '__main__':
    # Script entry point: parse the command line, run the check and hand
    # the resulting return code back to the caller as the exit status.
    sys.exit(main(parse_args(sys.argv[1:])))

View File

@ -31,3 +31,14 @@ ceph osd tree --format json > ${TMP_FILE}
chown root:nagios ${TMP_FILE}
chmod 0640 ${TMP_FILE}
mv ${TMP_FILE} ${DATA_FILE}
# Note: radosgw-admin sync status doesn't support outputting in json at time of writing
# The raw output is written to a temporary file inside the data directory,
# given root:nagios ownership and 0640 permissions, and only then moved to
# its final name (presumably so a reader never sees a half-written file).
# Expansions are quoted so paths are not word-split or globbed.
DATA_FILE="${DATA_DIR}/current-radosgw-admin-sync-status.raw"
TMP_FILE=$(mktemp -p "${DATA_DIR}")
radosgw-admin sync status > "${TMP_FILE}"
chown root:nagios "${TMP_FILE}"
chmod 0640 "${TMP_FILE}"
mv "${TMP_FILE}" "${DATA_FILE}"

View File

@ -109,6 +109,8 @@ NAGIOS_PLUGINS = '/usr/local/lib/nagios/plugins'
NAGIOS_FILE_FOLDER = '/var/lib/nagios'
SCRIPTS_DIR = '/usr/local/bin'
STATUS_FILE = '{}/cat-ceph-status.txt'.format(NAGIOS_FILE_FOLDER)
RADOSGW_STATUS_FILE = ('{}/current-radosgw-admin-sync-status.raw'
.format(NAGIOS_FILE_FOLDER))
STATUS_CRONFILE = '/etc/cron.d/cat-ceph-health'
HOST_OSD_COUNT_REPORT = '{}/host-osd-report.json'.format(NAGIOS_FILE_FOLDER)
@ -1198,6 +1200,10 @@ def update_nrpe_config():
'check_ceph_osd_count.py'),
os.path.join(NAGIOS_PLUGINS, 'check_ceph_osd_count.py'))
rsync(os.path.join(os.getenv('CHARM_DIR'), 'files', 'nagios',
'check_radosgw_sync_status.py'),
os.path.join(NAGIOS_PLUGINS, 'check_radosgw_sync_status.py'))
script = os.path.join(SCRIPTS_DIR, 'collect_ceph_status.sh')
rsync(os.path.join(os.getenv('CHARM_DIR'), 'files',
'nagios', 'collect_ceph_status.sh'),
@ -1257,6 +1263,21 @@ def update_nrpe_config():
description='Check whether all OSDs are up and in',
check_cmd=check_cmd
)
check_cmd = ('check_radosgw_sync_status.py -f {}'
.format(RADOSGW_STATUS_FILE))
if config('nagios_rgw_zones'):
check_cmd += ' --zones "{}"'.format(config('nagios_rgw_zones'))
if config('nagios_rgw_additional_checks'):
x = ast.literal_eval(config('nagios_rgw_additional_checks'))
for check in x:
check_cmd += ' --additional_check \"{}\"'.format(check)
nrpe_setup.add_check(
shortname='radosgw_multizone',
description='Check multizone radosgw health',
check_cmd=check_cmd
)
nrpe_setup.write()

View File

@ -55,6 +55,8 @@ CHARM_CONFIG = {'config-flags': '',
'nagios_raise_nodeepscrub': True,
'nagios_additional_checks': "",
'nagios_additional_checks_critical': False,
'nagios_rgw_zones': "",
'nagios_rgw_additional_checks': "",
'nagios_check_num_osds': False,
'disable-pg-max-object-skew': False,
'rbd-stats-pools': 'foo'}