Merge "Add nagios check for radosgw-admin sync status"
This commit is contained in:
commit
0a03288a72
23
config.yaml
23
config.yaml
|
@ -260,6 +260,29 @@ options:
|
||||||
description: |
|
description: |
|
||||||
Whether additional checks report warning or error when their checks
|
Whether additional checks report warning or error when their checks
|
||||||
are positive.
|
are positive.
|
||||||
|
nagios_rgw_zones:
|
||||||
|
default: ""
|
||||||
|
type: string
|
||||||
|
description: |
|
||||||
|
Comma-separated list of zones that are expected to be connected to this
|
||||||
|
radosgw. These will be checked by the line "data sync source...
|
||||||
|
(zone-name)" in the output of `radosgw-admin sync status`.
|
||||||
|
.
|
||||||
|
Example:
|
||||||
|
.
|
||||||
|
zone1,zone2
|
||||||
|
nagios_rgw_additional_checks:
|
||||||
|
default: ""
|
||||||
|
type: string
|
||||||
|
description: |
|
||||||
|
List describing additional checks. Each item is a regular expression to
|
||||||
|
search in the output of radosgw-admin sync status. Note, this is a
|
||||||
|
list unlike `nagios_additional_checks` which uses a dictionary.
|
||||||
|
.
|
||||||
|
Example:
|
||||||
|
.
|
||||||
|
['data is behind on']
|
||||||
|
.
|
||||||
use-direct-io:
|
use-direct-io:
|
||||||
type: boolean
|
type: boolean
|
||||||
default: True
|
default: True
|
||||||
|
|
|
@ -0,0 +1,191 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
# Copyright (C) 2005, 2006, 2007, 2012 James Troup <james.troup@canonical.com>
|
||||||
|
# Copyright (C) 2014, 2017 Canonical
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
#
|
||||||
|
# Authors: Danny Cocks <danny.cocks@canonical.com>
|
||||||
|
# Based on check_ceph_status.py and authors therein
|
||||||
|
|
||||||
|
import re
|
||||||
|
import argparse
|
||||||
|
import os
|
||||||
|
import subprocess
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
import traceback
|
||||||
|
|
||||||
|
|
||||||
|
class CriticalError(Exception):
    """Raised when the check result must be reported to Nagios as CRITICAL."""
|
||||||
|
|
||||||
|
|
||||||
|
class UnknownError(Exception):
    """Raised when the check result must be reported to Nagios as UNKNOWN."""
|
||||||
|
|
||||||
|
|
||||||
|
def check_file_freshness(filename, newer_than=3600):
    """
    Check a file exists, is readable and is newer than <n> seconds (where
    <n> defaults to 3600).

    :param filename: path of the file to inspect
    :param newer_than: maximum permitted age of the file, in seconds
    :raises: CriticalError if the file is missing, unreadable, cannot be
             stat'ed, is older than `newer_than` seconds, or has a
             modification time in the future
    """
    # First check the file exists and is readable
    if not os.path.exists(filename):
        raise CriticalError("%s: does not exist." % (filename))
    if not os.access(filename, os.R_OK):
        raise CriticalError("%s: is not readable." % (filename))

    # Then ensure the file is up-to-date enough. The file may be replaced or
    # removed between the checks above and this stat, so report that as a
    # critical condition instead of letting a raw OSError escape.
    try:
        mtime = os.stat(filename).st_mtime
    except OSError as e:
        raise CriticalError("%s: could not be inspected: %s" % (filename, e))

    last_modified = time.time() - mtime
    if last_modified > newer_than:
        raise CriticalError("%s: was last modified on %s and is too old "
                            "(> %s seconds)."
                            % (filename, time.ctime(mtime), newer_than))
    # A negative age means the mtime is ahead of the current clock.
    if last_modified < 0:
        raise CriticalError("%s: was last modified on %s which is in the "
                            "future."
                            % (filename, time.ctime(mtime)))
|
||||||
|
|
||||||
|
|
||||||
|
def check_radosgw_status(args):
    """
    Used to check the status of multizone RadosGW Ceph. Uses the output of
    'radosgw-admin sync status', generated during the separate cronjob, to
    determine if health is OK, and if not, should we alert on that situation.

    As this is the first iteration of this function, we will only do a very
    basic check and will rely on the charm config option
    `nagios_rgw_additional_checks` which is passed to this script via
    `args.additional_checks`

    :param args: argparse object formatted in the convention of generic Nagios
                 checks. Reads `status_file`, `zones` and
                 `additional_checks`.
    :returns string, describing the status of the ceph cluster: either
             "No multizone detected" or "All OK".
    :raises: UnknownError if the status command cannot be run or its output
             is incomplete; CriticalError if an expected zone is missing or
             an additional check pattern matches.
    """

    if args.status_file:
        check_file_freshness(args.status_file)
        with open(args.status_file) as f:
            status_data = f.read()
    else:
        try:
            status_data = (subprocess.check_output(['radosgw-admin',
                                                    'sync',
                                                    'status'])
                           .decode('UTF-8'))
        except (subprocess.CalledProcessError, OSError) as e:
            # OSError covers the radosgw-admin binary being absent or not
            # executable; report it the same way as a failed command.
            raise UnknownError(
                "UNKNOWN: radosgw-admin sync status command "
                "failed with error: {}".format(e))

    # If the realm name is empty, i.e. the first line is
    #   realm <some-uuid> ()
    # then we assume this means this is not multizone, so exit early.
    # str.split always yields at least one element, so indexing is safe.
    first_line = status_data.split('\n')[0].strip()
    if re.match(r"realm .* \(\)", first_line):
        return "No multizone detected"

    # This is a hangover from check_ceph_status.py and not directly applicable
    # here. I include it for an additional check.
    required_strings = ['realm', 'zonegroup', 'zone']
    if not all(s in status_data for s in required_strings):
        raise UnknownError('UNKNOWN: status data is incomplete')

    # The default message if we end up with no alerts
    message_all_ok = "All OK"
    # The list to collect messages
    msgs = []

    # The always-done checks go here.
    # Currently none

    # Handle checks to do with given expected zones that should be connected.
    if args.zones:
        for zone in args.zones.split(','):
            # Tolerate "zone1, zone2" and stray commas in the config value.
            zone = zone.strip()
            if not zone:
                continue
            # Zone names are literal text, not patterns, so escape them
            # before splicing them into the regular expression.
            search_regex = (r"data sync source:.*\(" + re.escape(zone)
                            + r"\)")
            if re.search(search_regex, status_data) is None:
                msg = ("CRITICAL: Missing expected sync source '{}'"
                       .format(zone))
                msgs.append(msg)

    # For additional checks, also test these things. Each check is a
    # caller-supplied regular expression; a match is an alert.
    if args.additional_checks:
        for check in args.additional_checks:
            m = re.search(check, status_data)
            if m is not None:
                msgs.append("CRITICAL: {}".format(m.group(0)))

    complete_output = '\n'.join(msgs)
    if any(msg.startswith("CRITICAL") for msg in msgs):
        raise CriticalError(complete_output)
    elif msgs:
        # Every message is currently prefixed CRITICAL, so this branch only
        # fires if a non-critical message type is added later.
        raise UnknownError(complete_output)
    else:
        return message_all_ok
|
||||||
|
|
||||||
|
|
||||||
|
def parse_args(args):
    """
    Parse the command line of the radosgw sync status check.

    :param args: list of argument strings (typically `sys.argv[1:]`)
    :returns: argparse namespace with `status_file` (False when not given),
              `zones` (None when not given) and `additional_checks` (list of
              patterns, or None when not given)
    """
    parser = argparse.ArgumentParser(
        description='Check radosgw-admin sync status')
    parser.add_argument('-f', '--file', dest='status_file',
                        default=False,
                        help='Optional file with "radosgw-admin sync status" '
                             'output. Generally useful for testing, and if '
                             'the Nagios user account does not have rights '
                             'for the Ceph config files.')
    parser.add_argument('--zones', dest='zones',
                        default=None,
                        help="Check if the given zones, as a comma-separated "
                             "list, are present in the output. If they are "
                             "missing report critical.")
    # May be given several times; each occurrence adds one pattern.
    parser.add_argument('--additional_check', dest='additional_checks',
                        action='append',
                        help="Check if a given pattern exists in any status "
                             "message. If it does, report critical")

    return parser.parse_args(args)
|
||||||
|
|
||||||
|
|
||||||
|
def main(args):
    """
    Run the radosgw sync status check and translate the outcome into an
    NRPE return code.

    :param args: parsed arguments, as produced by `parse_args`
    :returns: integer exit code: 0 (ok), 2 (critical) or 3 (unknown)
    """
    # Note: leaving "warning" in here, as a reminder for the expected NRPE
    # returncodes, even though this script doesn't output any warnings.
    EXIT_CODES = {'ok': 0, 'warning': 1, 'critical': 2, 'unknown': 3}
    exitcode = 'unknown'
    try:
        output_msg = check_radosgw_status(args)
        print(output_msg)
        exitcode = 'ok'
    except UnknownError as msg:
        print(msg)
        exitcode = 'unknown'
    except CriticalError as msg:
        print(msg)
        exitcode = 'critical'
    except Exception:
        # Top-level boundary: anything unexpected is reported as UNKNOWN
        # along with the traceback, printed on stdout.
        print("%s raised unknown exception '%s'"
              % ('check_radosgw_sync_status', sys.exc_info()[0]))
        print('=' * 60)
        traceback.print_exc(file=sys.stdout)
        print('=' * 60)
        exitcode = 'unknown'
    return EXIT_CODES[exitcode]
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Run the check on the command-line arguments and use its return code
    # as the process exit status.
    exit_status = main(parse_args(sys.argv[1:]))
    sys.exit(exit_status)
|
|
@ -31,3 +31,14 @@ ceph osd tree --format json > ${TMP_FILE}
|
||||||
chown root:nagios ${TMP_FILE}
|
chown root:nagios ${TMP_FILE}
|
||||||
chmod 0640 ${TMP_FILE}
|
chmod 0640 ${TMP_FILE}
|
||||||
mv ${TMP_FILE} ${DATA_FILE}
|
mv ${TMP_FILE} ${DATA_FILE}
|
||||||
|
|
||||||
|
|
||||||
|
# Note: radosgw-admin sync status doesn't support outputting in json at time of writing
DATA_FILE="${DATA_DIR}/current-radosgw-admin-sync-status.raw"
# Write to a temporary file in the same directory first, then move it into
# place, presumably so a reader never sees a partially written status file.
TMP_FILE=$(mktemp -p "${DATA_DIR}")

radosgw-admin sync status > "${TMP_FILE}"

# Owned by root with group nagios and mode 0640: group members can read the
# file but not modify it.
chown root:nagios "${TMP_FILE}"
chmod 0640 "${TMP_FILE}"
mv "${TMP_FILE}" "${DATA_FILE}"
|
||||||
|
|
|
@ -109,6 +109,8 @@ NAGIOS_PLUGINS = '/usr/local/lib/nagios/plugins'
|
||||||
NAGIOS_FILE_FOLDER = '/var/lib/nagios'
|
NAGIOS_FILE_FOLDER = '/var/lib/nagios'
|
||||||
SCRIPTS_DIR = '/usr/local/bin'
|
SCRIPTS_DIR = '/usr/local/bin'
|
||||||
STATUS_FILE = '{}/cat-ceph-status.txt'.format(NAGIOS_FILE_FOLDER)
|
STATUS_FILE = '{}/cat-ceph-status.txt'.format(NAGIOS_FILE_FOLDER)
|
||||||
|
RADOSGW_STATUS_FILE = ('{}/current-radosgw-admin-sync-status.raw'
|
||||||
|
.format(NAGIOS_FILE_FOLDER))
|
||||||
STATUS_CRONFILE = '/etc/cron.d/cat-ceph-health'
|
STATUS_CRONFILE = '/etc/cron.d/cat-ceph-health'
|
||||||
HOST_OSD_COUNT_REPORT = '{}/host-osd-report.json'.format(NAGIOS_FILE_FOLDER)
|
HOST_OSD_COUNT_REPORT = '{}/host-osd-report.json'.format(NAGIOS_FILE_FOLDER)
|
||||||
|
|
||||||
|
@ -1198,6 +1200,10 @@ def update_nrpe_config():
|
||||||
'check_ceph_osd_count.py'),
|
'check_ceph_osd_count.py'),
|
||||||
os.path.join(NAGIOS_PLUGINS, 'check_ceph_osd_count.py'))
|
os.path.join(NAGIOS_PLUGINS, 'check_ceph_osd_count.py'))
|
||||||
|
|
||||||
|
rsync(os.path.join(os.getenv('CHARM_DIR'), 'files', 'nagios',
|
||||||
|
'check_radosgw_sync_status.py'),
|
||||||
|
os.path.join(NAGIOS_PLUGINS, 'check_radosgw_sync_status.py'))
|
||||||
|
|
||||||
script = os.path.join(SCRIPTS_DIR, 'collect_ceph_status.sh')
|
script = os.path.join(SCRIPTS_DIR, 'collect_ceph_status.sh')
|
||||||
rsync(os.path.join(os.getenv('CHARM_DIR'), 'files',
|
rsync(os.path.join(os.getenv('CHARM_DIR'), 'files',
|
||||||
'nagios', 'collect_ceph_status.sh'),
|
'nagios', 'collect_ceph_status.sh'),
|
||||||
|
@ -1257,6 +1263,21 @@ def update_nrpe_config():
|
||||||
description='Check whether all OSDs are up and in',
|
description='Check whether all OSDs are up and in',
|
||||||
check_cmd=check_cmd
|
check_cmd=check_cmd
|
||||||
)
|
)
|
||||||
|
|
||||||
|
check_cmd = ('check_radosgw_sync_status.py -f {}'
|
||||||
|
.format(RADOSGW_STATUS_FILE))
|
||||||
|
if config('nagios_rgw_zones'):
|
||||||
|
check_cmd += ' --zones "{}"'.format(config('nagios_rgw_zones'))
|
||||||
|
if config('nagios_rgw_additional_checks'):
|
||||||
|
x = ast.literal_eval(config('nagios_rgw_additional_checks'))
|
||||||
|
for check in x:
|
||||||
|
check_cmd += ' --additional_check \"{}\"'.format(check)
|
||||||
|
nrpe_setup.add_check(
|
||||||
|
shortname='radosgw_multizone',
|
||||||
|
description='Check multizone radosgw health',
|
||||||
|
check_cmd=check_cmd
|
||||||
|
)
|
||||||
|
|
||||||
nrpe_setup.write()
|
nrpe_setup.write()
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -55,6 +55,8 @@ CHARM_CONFIG = {'config-flags': '',
|
||||||
'nagios_raise_nodeepscrub': True,
|
'nagios_raise_nodeepscrub': True,
|
||||||
'nagios_additional_checks': "",
|
'nagios_additional_checks': "",
|
||||||
'nagios_additional_checks_critical': False,
|
'nagios_additional_checks_critical': False,
|
||||||
|
'nagios_rgw_zones': "",
|
||||||
|
'nagios_rgw_additional_checks': "",
|
||||||
'nagios_check_num_osds': False,
|
'nagios_check_num_osds': False,
|
||||||
'disable-pg-max-object-skew': False,
|
'disable-pg-max-object-skew': False,
|
||||||
'rbd-stats-pools': 'foo'}
|
'rbd-stats-pools': 'foo'}
|
||||||
|
|
Loading…
Reference in New Issue