Merge "Add nagios check for radosgw-admin sync status"
This commit is contained in:
commit
0a03288a72
23
config.yaml
23
config.yaml
|
@ -260,6 +260,29 @@ options:
|
|||
description: |
|
||||
Whether additional checks report warning or error when their checks
|
||||
are positive.
|
||||
nagios_rgw_zones:
|
||||
default: ""
|
||||
type: string
|
||||
description: |
|
||||
Comma-separated list of zones that are expected to be connected to this
|
||||
radosgw. Each zone must appear in a line "data sync source: ...
|
||||
(zone-name)" in the output of `radosgw-admin sync status`.
|
||||
.
|
||||
Example:
|
||||
.
|
||||
zone1,zone2
|
||||
nagios_rgw_additional_checks:
|
||||
default: ""
|
||||
type: string
|
||||
description: |
|
||||
List describing additional checks. Each item is a regular expression to
|
||||
search in the output of radosgw-admin sync status. Note, this is a
|
||||
list, unlike `nagios_additional_checks`, which uses a dictionary.
|
||||
.
|
||||
Example:
|
||||
.
|
||||
['data is behind on']
|
||||
.
|
||||
use-direct-io:
|
||||
type: boolean
|
||||
default: True
|
||||
|
|
|
@ -0,0 +1,191 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
# Copyright (C) 2005, 2006, 2007, 2012 James Troup <james.troup@canonical.com>
|
||||
# Copyright (C) 2014, 2017 Canonical
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
# Authors: Danny Cocks <danny.cocks@canonical.com>
|
||||
# Based on check_ceph_status.py and authors therein
|
||||
|
||||
import re
|
||||
import argparse
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
import time
|
||||
import traceback
|
||||
|
||||
|
||||
class CriticalError(Exception):
    """Raised when the check must be reported to Nagios as CRITICAL."""
|
||||
|
||||
|
||||
class UnknownError(Exception):
    """Raised when an unknown error was encountered (Nagios UNKNOWN)."""
|
||||
|
||||
|
||||
def check_file_freshness(filename, newer_than=3600):
    """
    Verify that a file is present, readable and recently modified.

    :param filename: path of the file to inspect
    :param newer_than: maximum accepted age in seconds (defaults to 3600)
    :raises: CriticalError if the file is missing, unreadable, older than
             `newer_than` seconds, or carries a modification time in the
             future
    """
    # Presence and readability come first so the age test can stat the file.
    if not os.path.exists(filename):
        raise CriticalError("%s: does not exist." % (filename))
    if not os.access(filename, os.R_OK):
        raise CriticalError("%s: is not readable." % (filename))

    # Age of the file in seconds, relative to the current wall-clock time.
    modified_at = os.path.getmtime(filename)
    age = time.time() - modified_at
    if age > newer_than:
        raise CriticalError("%s: was last modified on %s and is too old "
                            "(> %s seconds)."
                            % (filename, time.ctime(modified_at), newer_than))
    if age < 0:
        raise CriticalError("%s: was last modified on %s which is in the "
                            "future."
                            % (filename, time.ctime(modified_at)))
|
||||
|
||||
|
||||
def check_radosgw_status(args):
    """
    Check the status of a multizone RadosGW Ceph deployment.

    The text output of 'radosgw-admin sync status' is read either from
    `args.status_file` (after verifying the file is fresh) or, when no file
    is given, by running the command directly. The output is then tested
    against the expected zones (`args.zones`) and against the patterns from
    the charm config option `nagios_rgw_additional_checks`, which reach this
    script via `args.additional_checks`.

    :param args: argparse object formatted in the convention of generic Nagios
                 checks; the attributes `status_file`, `zones` and
                 `additional_checks` are read
    :returns string, describing the status of the radosgw sync.
    :raises: UnknownError, CriticalError
    """

    if args.status_file:
        check_file_freshness(args.status_file)
        with open(args.status_file) as f:
            status_data = f.read()
    else:
        try:
            status_data = (subprocess.check_output(['radosgw-admin',
                                                    'sync',
                                                    'status'])
                           .decode('UTF-8'))
        except subprocess.CalledProcessError as e:
            # The trailing space after "command" matters: the two literals
            # are concatenated into a single message.
            raise UnknownError(
                "UNKNOWN: radosgw-admin sync status command "
                "failed with error: {}".format(e))

    # If the realm name is empty, i.e. the first line is
    #     realm <some-uuid> ()
    # then we assume this means this is not multizone, so exit early.
    first_line = status_data.split('\n', 1)[0].strip()
    if re.match(r"realm .* \(\)", first_line):
        return "No multizone detected"

    # This is a hangover from check_ceph_status.py and not directly applicable
    # here. It is kept as an additional sanity check on the output.
    required_strings = ['realm', 'zonegroup', 'zone']
    if not all(s in status_data for s in required_strings):
        raise UnknownError('UNKNOWN: status data is incomplete')

    # The default message if we end up with no alerts
    message_all_ok = "All OK"
    # The list to collect messages
    msgs = []

    # The always-done checks go here.
    # Currently none

    # Handle checks to do with given expected zones that should be connected.
    if args.zones:
        for raw_zone in args.zones.split(','):
            # Tolerate "zone1, zone2" and stray commas in the option value.
            zone = raw_zone.strip()
            if not zone:
                continue
            # The zone is a literal name, not a pattern, so escape it before
            # embedding it in the regular expression.
            search_regex = (r"data sync source:.*\(" + re.escape(zone)
                            + r"\)")
            if re.search(search_regex, status_data) is None:
                msg = ("CRITICAL: Missing expected sync source '{}'"
                       .format(zone))
                msgs.append(msg)

    # For additional checks, also test these things. Each entry is a regular
    # expression; a match anywhere in the output is reported as critical.
    if args.additional_checks:
        for check in args.additional_checks:
            m = re.search(check, status_data)
            if m is not None:
                msgs.append("CRITICAL: {}".format(m.group(0)))

    complete_output = '\n'.join(msgs)
    if any(msg.startswith("CRITICAL") for msg in msgs):
        raise CriticalError(complete_output)
    elif msgs:
        # Every message produced above is CRITICAL; this branch only fires if
        # a non-critical message type is added later.
        raise UnknownError(complete_output)
    else:
        return message_all_ok
|
||||
|
||||
|
||||
def parse_args(args):
    """
    Parse the command-line arguments of the radosgw sync status check.

    :param args: list of argument strings, typically `sys.argv[1:]`
    :returns: argparse namespace with `status_file` (False when not given),
              `zones` (None when not given) and `additional_checks` (a list
              of patterns, or None when the option was never passed)
    """
    parser = argparse.ArgumentParser(
        description='Check radosgw multizone sync status')
    parser.add_argument('-f', '--file', dest='status_file',
                        default=False,
                        help='Optional file with "radosgw-admin sync status" '
                             'output. Generally useful for testing, and if '
                             'the Nagios user account does not have rights '
                             'for the Ceph config files.')
    parser.add_argument('--zones', dest='zones',
                        default=None,
                        help="Check if the given zones, as a comma-separated "
                             "list, are present in the output. If they are "
                             "missing report critical.")
    # May be given several times; each occurrence adds one pattern.
    parser.add_argument('--additional_check', dest='additional_checks',
                        action='append',
                        help="Check if a given pattern exists in any status "
                             "message. If it does, report critical")

    return parser.parse_args(args)
|
||||
|
||||
|
||||
def main(args):
    """
    Run the radosgw sync status check and map the outcome to an exit code.

    The status message (or error text) is printed to stdout.

    :param args: parsed arguments as returned by `parse_args`
    :returns: integer exit code: 0 for ok, 2 for critical, 3 for unknown
    """
    # Note: leaving "warning" in here, as a reminder for the expected NRPE
    # returncodes, even though this script doesn't output any warnings.
    EXIT_CODES = {'ok': 0, 'warning': 1, 'critical': 2, 'unknown': 3}
    exitcode = 'unknown'
    try:
        output_msg = check_radosgw_status(args)
        print(output_msg)
        exitcode = 'ok'
    except UnknownError as msg:
        print(msg)
        exitcode = 'unknown'
    except CriticalError as msg:
        print(msg)
        exitcode = 'critical'
    except Exception:
        # Anything unexpected is reported as UNKNOWN, with the traceback
        # printed to stdout alongside the message.
        print("%s raised unknown exception '%s'"
              % ('check_radosgw_sync_status', sys.exc_info()[0]))
        print('=' * 60)
        traceback.print_exc(file=sys.stdout)
        print('=' * 60)
        exitcode = 'unknown'
    return EXIT_CODES[exitcode]
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Parse the command line, run the check and exit with its return code.
    sys.exit(main(parse_args(sys.argv[1:])))
|
|
@ -31,3 +31,14 @@ ceph osd tree --format json > ${TMP_FILE}
|
|||
chown root:nagios ${TMP_FILE}
|
||||
chmod 0640 ${TMP_FILE}
|
||||
mv ${TMP_FILE} ${DATA_FILE}
|
||||
|
||||
|
||||
# Note: radosgw-admin sync status doesn't support outputting in json at time of writing
# Destination for the raw (plain-text) sync status report.
DATA_FILE="${DATA_DIR}/current-radosgw-admin-sync-status.raw"
# Scratch file created inside the destination directory.
TMP_FILE=$(mktemp -p ${DATA_DIR})

# Capture the command output in the scratch file first.
radosgw-admin sync status > ${TMP_FILE}

# Owner root, group nagios, mode 0640: the group may read, others get nothing.
chown root:nagios ${TMP_FILE}
chmod 0640 ${TMP_FILE}
# Move the finished file onto the destination path last; presumably so that
# readers never see a partially written report -- TODO confirm.
mv ${TMP_FILE} ${DATA_FILE}
|
||||
|
|
|
@ -109,6 +109,8 @@ NAGIOS_PLUGINS = '/usr/local/lib/nagios/plugins'
|
|||
# Base directory for the status/report files named below.
NAGIOS_FILE_FOLDER = '/var/lib/nagios'
SCRIPTS_DIR = '/usr/local/bin'
STATUS_FILE = NAGIOS_FILE_FOLDER + '/cat-ceph-status.txt'
# Raw text output of `radosgw-admin sync status`.
RADOSGW_STATUS_FILE = (NAGIOS_FILE_FOLDER +
                       '/current-radosgw-admin-sync-status.raw')
STATUS_CRONFILE = '/etc/cron.d/cat-ceph-health'
HOST_OSD_COUNT_REPORT = NAGIOS_FILE_FOLDER + '/host-osd-report.json'
|
||||
|
||||
|
@ -1198,6 +1200,10 @@ def update_nrpe_config():
|
|||
'check_ceph_osd_count.py'),
|
||||
os.path.join(NAGIOS_PLUGINS, 'check_ceph_osd_count.py'))
|
||||
|
||||
rsync(os.path.join(os.getenv('CHARM_DIR'), 'files', 'nagios',
|
||||
'check_radosgw_sync_status.py'),
|
||||
os.path.join(NAGIOS_PLUGINS, 'check_radosgw_sync_status.py'))
|
||||
|
||||
script = os.path.join(SCRIPTS_DIR, 'collect_ceph_status.sh')
|
||||
rsync(os.path.join(os.getenv('CHARM_DIR'), 'files',
|
||||
'nagios', 'collect_ceph_status.sh'),
|
||||
|
@ -1257,6 +1263,21 @@ def update_nrpe_config():
|
|||
description='Check whether all OSDs are up and in',
|
||||
check_cmd=check_cmd
|
||||
)
|
||||
|
||||
check_cmd = ('check_radosgw_sync_status.py -f {}'
|
||||
.format(RADOSGW_STATUS_FILE))
|
||||
if config('nagios_rgw_zones'):
|
||||
check_cmd += ' --zones "{}"'.format(config('nagios_rgw_zones'))
|
||||
if config('nagios_rgw_additional_checks'):
|
||||
x = ast.literal_eval(config('nagios_rgw_additional_checks'))
|
||||
for check in x:
|
||||
check_cmd += ' --additional_check \"{}\"'.format(check)
|
||||
nrpe_setup.add_check(
|
||||
shortname='radosgw_multizone',
|
||||
description='Check multizone radosgw health',
|
||||
check_cmd=check_cmd
|
||||
)
|
||||
|
||||
nrpe_setup.write()
|
||||
|
||||
|
||||
|
|
|
@ -55,6 +55,8 @@ CHARM_CONFIG = {'config-flags': '',
|
|||
'nagios_raise_nodeepscrub': True,
|
||||
'nagios_additional_checks': "",
|
||||
'nagios_additional_checks_critical': False,
|
||||
'nagios_rgw_zones': "",
|
||||
'nagios_rgw_additional_checks': "",
|
||||
'nagios_check_num_osds': False,
|
||||
'disable-pg-max-object-skew': False,
|
||||
'rbd-stats-pools': 'foo'}
|
||||
|
|
Loading…
Reference in New Issue