Update ceph nagios plugin

Changes the ceph plugin so that it no longer alerts critically on routine
ceph rebalancing: rebalancing returns a warning, unless a large percentage
of objects is misplaced/degraded, in which case it is still critical.

Adds config options to tweak that monitoring, and also just warn if
nodeep-scrub was deliberately set.

Includes some basic unit tests.

Change-Id: I317448cd769597068a706d3944d9d5419e0445c1
This commit is contained in:
Xav Paice 2017-05-05 21:50:56 +12:00
parent c5ac8402d0
commit e93c4a903a
11 changed files with 931 additions and 41 deletions

View File

@ -195,6 +195,22 @@ options:
description: |
A comma-separated list of nagios servicegroups. If left empty, the
nagios_context will be used as the servicegroup.
nagios_degraded_thresh:
default: 1.0
type: float
description: "Degraded object ratio above which the check is critical (0.1 = 10%)"
nagios_misplaced_thresh:
default: 10.0
type: float
description: "Misplaced object ratio above which the check is critical (0.1 = 10%)"
nagios_recovery_rate:
default: '1'
type: string
description: Recovery rate (objects per second) below which we consider recovery to be stalled
nagios_ignore_nodeepscub:
default: False
type: boolean
description: Whether to ignore the nodeep-scrub flag. If true, a set nodeep-scrub flag results in a warning rather than a critical alert.
use-direct-io:
type: boolean
default: True

View File

@ -1,53 +1,205 @@
#!/usr/bin/env python
# Copyright (C) 2014 Canonical
# All Rights Reserved
# Author: Jacek Nykis <jacek.nykis@canonical.com>
# Copyright (C) 2005, 2006, 2007, 2012 James Troup <james.troup@canonical.com>
# Copyright (C) 2014, 2017 Canonical
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Authors: Jacek Nykis <jacek.nykis@canonical.com>
# Xav Paice <xav.paice@canonical.com>
# James Troup <james.troup@canonical.com>
import re
import argparse
import json
import os
import subprocess
import nagios_plugin
import sys
import time
import traceback
class CriticalError(Exception):
    """Signals a condition that is critical."""
class WarnError(Exception):
    """Signals a condition that only merits a warning."""
class UnknownError(Exception):
    """Signals that an unknown error was encountered."""
def check_file_freshness(filename, newer_than=3600):
    """
    Verify that a file is present, readable and recently modified.

    :param filename: path of the file to inspect
    :param newer_than: maximum permitted age in seconds (default 3600)
    :raises CriticalError: if the file is missing, is not readable, was
                           modified more than ``newer_than`` seconds ago,
                           or carries a modification time in the future
    """
    # Presence and permissions are checked before the age so that the
    # message names the actual cause
    if not os.path.exists(filename):
        raise CriticalError("%s: does not exist." % (filename))
    readable = os.access(filename, os.R_OK)
    if not readable:
        raise CriticalError("%s: is not readable." % (filename))
    # The age is the distance between the modification time and "now"
    modified_at = os.stat(filename).st_mtime
    age = time.time() - modified_at
    if age > newer_than:
        raise CriticalError("%s: was last modified on %s and is too old "
                            "(> %s seconds)."
                            % (filename, time.ctime(modified_at), newer_than))
    if age < 0:
        raise CriticalError("%s: was last modified on %s which is in the "
                            "future."
                            % (filename, time.ctime(modified_at)))
def check_ceph_status(args):
    """
    Evaluate the health of a Ceph cluster from its JSON status report.

    The report is read from ``args.status_file`` when one is given (after
    checking the file is fresh), otherwise it is obtained by running
    'ceph status --format json'.

    If the overall status is HEALTH_OK the check passes straight away.
    Otherwise every health summary line is matched against a list of
    conditions that represent routine operations and don't warrant a pager
    event (backfill, degraded/recovery_wait/stuck unclean pgs, objects being
    recovered and, optionally, the nodeep-scrub flag). The result is
    CRITICAL when any summary line is not in that list, when the degraded or
    misplaced ratio exceeds its threshold, or when objects still need
    recovery but the recovery rate is below ``args.recovery_rate``. Any
    other non-OK state results in a WARNING.

    :param args: argparse namespace providing status_file, degraded_thresh,
                 misplaced_thresh, recovery_rate and ignore_nodeepscrub
    :returns: the string "All OK" (which is also printed) when healthy
    :raises UnknownError: if the status cannot be obtained or parsed, or
                          lacks the 'health', 'monmap' or 'pgmap' sections
    :raises CriticalError: if a critical condition is found
    :raises WarnError: if the cluster is not healthy but nothing is critical
    """
    ignorable = (r'\d+ pgs (?:backfill|degraded|recovery_wait|stuck unclean)|'
                 r'recovery \d+\/\d+ objects (?:degraded|misplaced)')
    if args.ignore_nodeepscrub:
        ignorable = ignorable + r'|nodeep-scrub flag\(s\) set'

    if args.status_file:
        check_file_freshness(args.status_file)
        with open(args.status_file) as f:
            tree = f.read()
    else:
        try:
            tree = subprocess.check_output(['ceph',
                                            'status',
                                            '--format', 'json'])
        except subprocess.CalledProcessError as e:
            raise UnknownError(
                "UNKNOWN: ceph status command failed with error: {}".format(e))

    # An empty or truncated report must surface as a clean UNKNOWN result
    # instead of an unhandled traceback
    try:
        status_data = json.loads(tree)
    except ValueError as e:
        raise UnknownError(
            "UNKNOWN: could not parse ceph status output: {}".format(e))

    required_keys = ['health', 'monmap', 'pgmap']
    if not all(key in status_data for key in required_keys):
        raise UnknownError('UNKNOWN: status data is incomplete')

    overall_status = status_data['health']['overall_status']
    if overall_status == 'HEALTH_OK':
        message = "All OK"
        print(message)
        return message

    # Health is not OK: any summary line that is not in our list of
    # operational tasks makes the check critical
    status_critical = False
    status_msg = []
    for status in status_data['health']['summary']:
        if not re.match(ignorable, status['summary']):
            status_critical = True
        status_msg.append(status['summary'])

    # Even when all lines are operational tasks, too much degraded or
    # misplaced data is critical
    pgmap = status_data['pgmap']
    degraded_ratio = pgmap.get('degraded_ratio', 0.0)
    if degraded_ratio > args.degraded_thresh:
        status_critical = True
        status_msg.append("Degraded ratio: {}".format(degraded_ratio))
    misplaced_ratio = pgmap.get('misplaced_ratio', 0.0)
    if misplaced_ratio > args.misplaced_thresh:
        status_critical = True
        status_msg.append("Misplaced ratio: {}".format(misplaced_ratio))

    # Recovery can only be stalled when there is something to recover.
    # Without this guard a cluster that is merely flagged (for example
    # nodeep-scrub) and reports no recovery rate would always be critical.
    needs_recovery = any(
        pgmap.get(key, 0) > 0
        for key in ('degraded_objects', 'misplaced_objects',
                    'degraded_ratio', 'misplaced_ratio'))
    recovering = pgmap.get('recovering_objects_per_sec', 0.0)
    if needs_recovery and recovering < args.recovery_rate:
        status_critical = True
        status_msg.append("Recovering objects/sec {}".format(recovering))

    if status_critical:
        msg = 'CRITICAL: ceph health: "{} {}"'.format(
            overall_status,
            ", ".join(status_msg))
        raise CriticalError(msg)
    # The status is not OK but nothing above is critical: we still want to
    # know about it, so never fall through to "All OK" here
    raise WarnError("WARNING: {}".format(", ".join(status_msg)))
def parse_args(args):
    """
    Parse the command line options of the ceph status check.

    :param args: list of command line arguments (without the program name)
    :returns: argparse namespace with status_file, degraded_thresh,
              misplaced_thresh, recovery_rate and ignore_nodeepscrub

    NOTE(review): the threshold defaults (1 and 10) are expressed as ratios,
    i.e. 100% and 1000% - confirm that these defaults are intended.
    """
    parser = argparse.ArgumentParser(description='Check ceph status')
    parser.add_argument('-f', '--file', dest='status_file',
                        default=False,
                        help='Optional file with "ceph status" output. '
                             'Generally useful for testing, and if the Nagios '
                             'user account does not have rights for the Ceph '
                             'config files.')
    # argparse %-formats help strings, so a literal percent sign has to be
    # written as '%%' or rendering the help text fails
    parser.add_argument('--degraded_thresh', dest='degraded_thresh',
                        default=1, type=float,
                        help="Threshold for degraded ratio (0.1 = 10%%)")
    parser.add_argument('--misplaced_thresh', dest='misplaced_thresh',
                        default=10, type=float,
                        help="Threshold for misplaced ratio (0.1 = 10%%)")
    # Parsed as float so that fractional rates such as 0.5 are accepted;
    # whole numbers keep working as before
    parser.add_argument('--recovery_rate', dest='recovery_rate',
                        default=1, type=float,
                        help="Recovery rate below which we consider recovery "
                             "to be stalled")
    parser.add_argument('--ignore_nodeepscrub', dest='ignore_nodeepscrub',
                        default=False, action='store_true',
                        help="Whether to ignore the nodeep-scrub flag.  If "
                             "the nodeep-scrub flag is set, the check returns "
                             "warning if this param is passed, otherwise "
                             "returns critical.")
    return parser.parse_args(args)
def main(args):
    """
    Run the ceph status check and translate its outcome to an exit code.

    The message carried by a check exception is printed; an unexpected
    exception is printed together with its traceback.

    :param args: argparse namespace, passed through to check_ceph_status
    :returns: 0 (ok), 1 (warning), 2 (critical) or 3 (unknown)
    """
    EXIT_CODES = {'ok': 0, 'warning': 1, 'critical': 2, 'unknown': 3}
    exitcode = 'ok'
    try:
        check_ceph_status(args)
    except UnknownError as msg:
        print(msg)
        exitcode = 'unknown'
    except CriticalError as msg:
        print(msg)
        exitcode = 'critical'
    except WarnError as msg:
        print(msg)
        # A warning must not be reported with the critical exit code
        exitcode = 'warning'
    except Exception:
        # Anything unexpected is reported as unknown, with the traceback on
        # stdout so that it ends up in the check output
        print("%s raised unknown exception '%s'" % ('check_ceph_status',
                                                    sys.exc_info()[0]))
        print('=' * 60)
        traceback.print_exc(file=sys.stdout)
        print('=' * 60)
        exitcode = 'unknown'
    return EXIT_CODES[exitcode]
if __name__ == '__main__':
    # Parse the command line, run the check exactly once and hand the
    # resulting exit code back to the caller
    args = parse_args(sys.argv[1:])
    status = main(args)
    sys.exit(status)

View File

@ -15,4 +15,4 @@ if [ ! -d $DATA_DIR ]; then
mkdir -p $DATA_DIR
fi
ceph status >${DATA_DIR}/cat-ceph-status.txt
ceph status --format json >${DATA_DIR}/cat-ceph-status.txt

View File

@ -654,10 +654,18 @@ def update_nrpe_config():
hostname = nrpe.get_nagios_hostname()
current_unit = nrpe.get_nagios_unit_name()
nrpe_setup = nrpe.NRPE(hostname=hostname)
check_cmd = 'check_ceph_status.py -f {} --degraded_thresh {}' \
' --misplaced_thresh {}' \
' --recovery_rate {}'.format(STATUS_FILE,
config('nagios_degraded_thresh'),
config('nagios_misplaced_thresh'),
config('nagios_recovery_rate'))
if config('nagios_ignore_nodeepscub'):
check_cmd = check_cmd + ' --ignore_nodeepscrub'
nrpe_setup.add_check(
shortname="ceph",
description='Check Ceph health {%s}' % current_unit,
check_cmd='check_ceph_status.py -f {}'.format(STATUS_FILE)
check_cmd=check_cmd
)
nrpe_setup.write()

226
unit_tests/ceph_crit.json Normal file
View File

@ -0,0 +1,226 @@
{
"health": {
"health": {
"health_services": [
{
"mons": [
{
"name": "juju-2691ab-1-lxd-1",
"kb_total": 155284096,
"kb_used": 1247744,
"kb_avail": 154036352,
"avail_percent": 99,
"last_updated": "2017-05-17 03:31:35.562497",
"store_stats": {
"bytes_total": 1012055342,
"bytes_sst": 0,
"bytes_log": 29673298,
"bytes_misc": 982382044,
"last_updated": "0.000000"
},
"health": "HEALTH_OK"
},
{
"name": "juju-2691ab-13-lxd-0",
"kb_total": 153820288,
"kb_used": 1361280,
"kb_avail": 152459008,
"avail_percent": 99,
"last_updated": "2017-05-17 03:31:04.097201",
"store_stats": {
"bytes_total": 1370003168,
"bytes_sst": 0,
"bytes_log": 29813159,
"bytes_misc": 1340190009,
"last_updated": "0.000000"
},
"health": "HEALTH_OK"
},
{
"name": "juju-2691ab-2-lxd-1",
"kb_total": 155251072,
"kb_used": 1373440,
"kb_avail": 153877632,
"avail_percent": 99,
"last_updated": "2017-05-17 03:31:20.684777",
"store_stats": {
"bytes_total": 1400974192,
"bytes_sst": 0,
"bytes_log": 1129945,
"bytes_misc": 1399844247,
"last_updated": "0.000000"
},
"health": "HEALTH_OK"
}
]
}
]
},
"timechecks": {
"epoch": 32,
"round": 24492,
"round_status": "finished",
"mons": [
{
"name": "juju-2691ab-1-lxd-1",
"skew": 0,
"latency": 0,
"health": "HEALTH_OK"
},
{
"name": "juju-2691ab-13-lxd-0",
"skew": 0.000919,
"latency": 0.001036,
"health": "HEALTH_OK"
},
{
"name": "juju-2691ab-2-lxd-1",
"skew": 0,
"latency": 0.001009,
"health": "HEALTH_OK"
}
]
},
"summary": [
{
"severity": "HEALTH_WARN",
"summary": "48 pgs backfill_wait"
},
{
"severity": "HEALTH_WARN",
"summary": "45 pgs backfilling"
},
{
"severity": "HEALTH_WARN",
"summary": "1 pgs degraded"
},
{
"severity": "HEALTH_WARN",
"summary": "1 pgs recovery_wait"
},
{
"severity": "HEALTH_WARN",
"summary": "22 pgs stuck unclean"
},
{
"severity": "HEALTH_WARN",
"summary": "recovery 14/46842755 objects degraded (0.000%)"
},
{
"severity": "HEALTH_WARN",
"summary": "recovery 448540/46842755 objects misplaced (0.958%)"
},
{
"severity": "HEALTH_CRITICAL",
"summary": "Test critical status message"
}
],
"overall_status": "HEALTH_CRITICAL",
"detail": []
},
"fsid": "ca9451f1-5c4f-4e85-bb14-a08dfc0568f7",
"election_epoch": 32,
"quorum": [
0,
1,
2
],
"quorum_names": [
"juju-2691ab-1-lxd-1",
"juju-2691ab-13-lxd-0",
"juju-2691ab-2-lxd-1"
],
"monmap": {
"epoch": 1,
"fsid": "ca9451f1-5c4f-4e85-bb14-a08dfc0568f7",
"modified": "2016-12-03 08:09:21.854671",
"created": "2016-12-03 08:09:21.854671",
"mons": [
{
"rank": 0,
"name": "juju-2691ab-1-lxd-1",
"addr": "10.182.254.221:6789/0"
},
{
"rank": 1,
"name": "juju-2691ab-13-lxd-0",
"addr": "10.182.254.229:6789/0"
},
{
"rank": 2,
"name": "juju-2691ab-2-lxd-1",
"addr": "10.182.254.242:6789/0"
}
]
},
"osdmap": {
"osdmap": {
"epoch": 141540,
"num_osds": 314,
"num_up_osds": 314,
"num_in_osds": 314,
"full": false,
"nearfull": false,
"num_remapped_pgs": 92
}
},
"pgmap": {
"pgs_by_state": [
{
"state_name": "active+clean",
"count": 9274
},
{
"state_name": "active+remapped+wait_backfill",
"count": 48
},
{
"state_name": "active+remapped+backfilling",
"count": 45
},
{
"state_name": "active+clean+scrubbing+deep",
"count": 9
},
{
"state_name": "active+remapped",
"count": 2
},
{
"state_name": "active+recovery_wait+degraded",
"count": 1
},
{
"state_name": "active+clean+scrubbing",
"count": 1
}
],
"version": 13885884,
"num_pgs": 9380,
"data_bytes": 64713222471610,
"bytes_used": 193613093122048,
"bytes_avail": 690058090491904,
"bytes_total": 883671183613952,
"degraded_objects": 14,
"degraded_total": 46842755,
"degraded_ratio": 0,
"misplaced_objects": 448540,
"misplaced_total": 46842755,
"misplaced_ratio": 0.009575,
"recovering_objects_per_sec": 389,
"recovering_bytes_per_sec": 1629711746,
"recovering_keys_per_sec": 0,
"num_objects_recovered": 218,
"num_bytes_recovered": 912252928,
"num_keys_recovered": 0,
"read_bytes_sec": 117041457,
"write_bytes_sec": 293414043,
"read_op_per_sec": 5282,
"write_op_per_sec": 5270
},
"fsmap": {
"epoch": 1,
"by_rank": []
}
}

View File

@ -0,0 +1,177 @@
{
"health": {
"health": {
"health_services": [
{
"mons": [
{
"name": "node1",
"kb_total": 140956600,
"kb_used": 15916132,
"kb_avail": 117857208,
"avail_percent": 83,
"last_updated": "2017-05-17 03:23:11.248297",
"store_stats": {
"bytes_total": 140014259,
"bytes_sst": 0,
"bytes_log": 13670758,
"bytes_misc": 126343501,
"last_updated": "0.000000"
},
"health": "HEALTH_OK"
},
{
"name": "node2",
"kb_total": 70395920,
"kb_used": 10532504,
"kb_avail": 56264436,
"avail_percent": 79,
"last_updated": "2017-05-17 03:23:16.952673",
"store_stats": {
"bytes_total": 315512452,
"bytes_sst": 0,
"bytes_log": 21691698,
"bytes_misc": 293820754,
"last_updated": "0.000000"
},
"health": "HEALTH_OK"
},
{
"name": "juju-machine-85-lxc-10",
"kb_total": 131927524,
"kb_used": 79521024,
"kb_avail": 45954016,
"avail_percent": 34,
"last_updated": "2017-05-17 03:23:13.794034",
"store_stats": {
"bytes_total": 89036349,
"bytes_sst": 0,
"bytes_log": 21055337,
"bytes_misc": 67981012,
"last_updated": "0.000000"
},
"health": "HEALTH_OK"
}
]
}
]
},
"timechecks": {
"epoch": 280,
"round": 19874,
"round_status": "finished",
"mons": [
{
"name": "node1",
"skew": "0.000000",
"latency": "0.000000",
"health": "HEALTH_OK"
},
{
"name": "node2",
"skew": "-0.000000",
"latency": "0.000866",
"health": "HEALTH_OK"
},
{
"name": "juju-machine-85-lxc-10",
"skew": "-0.000000",
"latency": "0.018848",
"health": "HEALTH_OK"
}
]
},
"summary": [
{
"severity": "HEALTH_WARN",
"summary": "nodeep-scrub flag(s) set"
}
],
"overall_status": "HEALTH_WARN",
"detail": []
},
"fsid": "some_fsid",
"election_epoch": 280,
"quorum": [
0,
1,
2
],
"quorum_names": [
"node1",
"node2",
"juju-machine-85-lxc-10"
],
"monmap": {
"epoch": 3,
"fsid": "some_fsid",
"modified": "2016-11-25 00:08:51.235813",
"created": "0.000000",
"mons": [
{
"rank": 0,
"name": "node1",
"addr": "10.24.0.15:6789/0"
},
{
"rank": 1,
"name": "node2",
"addr": "10.24.0.17:6789/0"
},
{
"rank": 2,
"name": "juju-machine-85-lxc-10",
"addr": "10.24.0.195:6789/0"
}
]
},
"osdmap": {
"osdmap": {
"epoch": 37820,
"num_osds": 46,
"num_up_osds": 46,
"num_in_osds": 46,
"full": false,
"nearfull": false
}
},
"pgmap": {
"pgs_by_state": [
{
"state_name": "active+clean",
"count": 1988
},
{
"state_name": "active+remapped+wait_backfill",
"count": 3
},
{
"state_name": "active+remapped+backfilling",
"count": 1
}
],
"version": 58873447,
"num_pgs": 1992,
"data_bytes": 35851846298041,
"bytes_used": 107730678743040,
"bytes_avail": 63413590548480,
"bytes_total": 171144269291520,
"degraded_objects": 0,
"degraded_total": 25759217,
"degraded_ratio": 0,
"recovering_objects_per_sec": 17,
"recovering_bytes_per_sec": 72552794,
"recovering_keys_per_sec": 0,
"read_bytes_sec": 23935944,
"write_bytes_sec": 7024503,
"op_per_sec": 5332
},
"mdsmap": {
"epoch": 1,
"up": 0,
"in": 0,
"max": 1,
"by_rank": []
}
}

1
unit_tests/ceph_ok.json Normal file
View File

@ -0,0 +1 @@
{"health":{"health":{"health_services":[{"mons":[{"name":"somehost-2","kb_total":384443444,"kb_used":254122936,"kb_avail":110768868,"avail_percent":28,"last_updated":"2017-06-28 07:22:57.268852","store_stats":{"bytes_total":563914940,"bytes_sst":0,"bytes_log":1201349,"bytes_misc":562713591,"last_updated":"0.000000"},"health":"HEALTH_OK"},{"name":"somehost-3","kb_total":384443444,"kb_used":181563008,"kb_avail":183328796,"avail_percent":47,"last_updated":"2017-06-28 07:22:09.013733","store_stats":{"bytes_total":584703758,"bytes_sst":0,"bytes_log":17361907,"bytes_misc":567341851,"last_updated":"0.000000"},"health":"HEALTH_OK"},{"name":"somehost-4","kb_total":384443444,"kb_used":278218520,"kb_avail":86673284,"avail_percent":22,"last_updated":"2017-06-28 07:22:31.725105","store_stats":{"bytes_total":598087748,"bytes_sst":0,"bytes_log":26273616,"bytes_misc":571814132,"last_updated":"0.000000"},"health":"HEALTH_OK"}]}]},"timechecks":{"epoch":52,"round":35412,"round_status":"finished","mons":[{"name":"somehost-2","skew":0.000000,"latency":0.000000,"health":"HEALTH_OK"},{"name":"somehost-3","skew":-0.001662,"latency":0.000531,"health":"HEALTH_OK"},{"name":"somehost-4","skew":-0.000034,"latency":0.000425,"health":"HEALTH_OK"}]},"summary":[],"overall_status":"HEALTH_OK","detail":[]},"fsid":"9486fd14-676d-481c-aa16-77b071a315d8","election_epoch":52,"quorum":[0,1,2],"quorum_names":["somehost-2","somehost-3","somehost-4"],"monmap":{"epoch":1,"fsid":"9486fd14-676d-481c-aa16-77b071a315d8","modified":"2016-08-09 06:33:15.685755","created":"2016-08-09 
06:33:15.685755","mons":[{"rank":0,"name":"somehost-2","addr":"10.28.2.21:6789\/0"},{"rank":1,"name":"somehost-3","addr":"10.28.2.22:6789\/0"},{"rank":2,"name":"somehost-4","addr":"10.28.2.23:6789\/0"}]},"osdmap":{"osdmap":{"epoch":11122,"num_osds":42,"num_up_osds":42,"num_in_osds":42,"full":false,"nearfull":false,"num_remapped_pgs":0}},"pgmap":{"pgs_by_state":[{"state_name":"active+clean","count":12350},{"state_name":"active+clean+scrubbing+deep","count":2}],"version":25999715,"num_pgs":12352,"data_bytes":13428555112092,"bytes_used":40180090028032,"bytes_avail":43795596517376,"bytes_total":83975686545408,"read_bytes_sec":92475,"write_bytes_sec":5309194,"read_op_per_sec":367,"write_op_per_sec":506},"fsmap":{"epoch":1,"by_rank":[]}}

222
unit_tests/ceph_params.json Normal file
View File

@ -0,0 +1,222 @@
{
"health": {
"health": {
"health_services": [
{
"mons": [
{
"name": "juju-2691ab-1-lxd-1",
"kb_total": 155284096,
"kb_used": 1247744,
"kb_avail": 154036352,
"avail_percent": 99,
"last_updated": "2017-05-17 03:31:35.562497",
"store_stats": {
"bytes_total": 1012055342,
"bytes_sst": 0,
"bytes_log": 29673298,
"bytes_misc": 982382044,
"last_updated": "0.000000"
},
"health": "HEALTH_OK"
},
{
"name": "juju-2691ab-13-lxd-0",
"kb_total": 153820288,
"kb_used": 1361280,
"kb_avail": 152459008,
"avail_percent": 99,
"last_updated": "2017-05-17 03:31:04.097201",
"store_stats": {
"bytes_total": 1370003168,
"bytes_sst": 0,
"bytes_log": 29813159,
"bytes_misc": 1340190009,
"last_updated": "0.000000"
},
"health": "HEALTH_OK"
},
{
"name": "juju-2691ab-2-lxd-1",
"kb_total": 155251072,
"kb_used": 1373440,
"kb_avail": 153877632,
"avail_percent": 99,
"last_updated": "2017-05-17 03:31:20.684777",
"store_stats": {
"bytes_total": 1400974192,
"bytes_sst": 0,
"bytes_log": 1129945,
"bytes_misc": 1399844247,
"last_updated": "0.000000"
},
"health": "HEALTH_OK"
}
]
}
]
},
"timechecks": {
"epoch": 32,
"round": 24492,
"round_status": "finished",
"mons": [
{
"name": "juju-2691ab-1-lxd-1",
"skew": 0,
"latency": 0,
"health": "HEALTH_OK"
},
{
"name": "juju-2691ab-13-lxd-0",
"skew": 0.000919,
"latency": 0.001036,
"health": "HEALTH_OK"
},
{
"name": "juju-2691ab-2-lxd-1",
"skew": 0,
"latency": 0.001009,
"health": "HEALTH_OK"
}
]
},
"summary": [
{
"severity": "HEALTH_WARN",
"summary": "48 pgs backfill_wait"
},
{
"severity": "HEALTH_WARN",
"summary": "45 pgs backfilling"
},
{
"severity": "HEALTH_WARN",
"summary": "1 pgs degraded"
},
{
"severity": "HEALTH_WARN",
"summary": "1 pgs recovery_wait"
},
{
"severity": "HEALTH_WARN",
"summary": "22 pgs stuck unclean"
},
{
"severity": "HEALTH_WARN",
"summary": "recovery lots/bignumber objects degraded (15%)"
},
{
"severity": "HEALTH_WARN",
"summary": "recovery 448540/46842755 objects misplaced (0.958%)"
}
],
"overall_status": "HEALTH_WARN",
"detail": []
},
"fsid": "ca9451f1-5c4f-4e85-bb14-a08dfc0568f7",
"election_epoch": 32,
"quorum": [
0,
1,
2
],
"quorum_names": [
"juju-2691ab-1-lxd-1",
"juju-2691ab-13-lxd-0",
"juju-2691ab-2-lxd-1"
],
"monmap": {
"epoch": 1,
"fsid": "ca9451f1-5c4f-4e85-bb14-a08dfc0568f7",
"modified": "2016-12-03 08:09:21.854671",
"created": "2016-12-03 08:09:21.854671",
"mons": [
{
"rank": 0,
"name": "juju-2691ab-1-lxd-1",
"addr": "10.182.254.221:6789/0"
},
{
"rank": 1,
"name": "juju-2691ab-13-lxd-0",
"addr": "10.182.254.229:6789/0"
},
{
"rank": 2,
"name": "juju-2691ab-2-lxd-1",
"addr": "10.182.254.242:6789/0"
}
]
},
"osdmap": {
"osdmap": {
"epoch": 141540,
"num_osds": 314,
"num_up_osds": 314,
"num_in_osds": 314,
"full": false,
"nearfull": false,
"num_remapped_pgs": 92
}
},
"pgmap": {
"pgs_by_state": [
{
"state_name": "active+clean",
"count": 9274
},
{
"state_name": "active+remapped+wait_backfill",
"count": 48
},
{
"state_name": "active+remapped+backfilling",
"count": 45
},
{
"state_name": "active+clean+scrubbing+deep",
"count": 9
},
{
"state_name": "active+remapped",
"count": 2
},
{
"state_name": "active+recovery_wait+degraded",
"count": 1
},
{
"state_name": "active+clean+scrubbing",
"count": 1
}
],
"version": 13885884,
"num_pgs": 9380,
"data_bytes": 64713222471610,
"bytes_used": 193613093122048,
"bytes_avail": 690058090491904,
"bytes_total": 883671183613952,
"degraded_objects": 14,
"degraded_total": 46842755,
"degraded_ratio": 0,
"misplaced_objects": 448540,
"misplaced_total": 46842755,
"misplaced_ratio": 0.15,
"recovering_objects_per_sec": 389,
"recovering_bytes_per_sec": 1629711746,
"recovering_keys_per_sec": 0,
"num_objects_recovered": 218,
"num_bytes_recovered": 912252928,
"num_keys_recovered": 0,
"read_bytes_sec": 117041457,
"write_bytes_sec": 293414043,
"read_op_per_sec": 5282,
"write_op_per_sec": 5270
},
"fsmap": {
"epoch": 1,
"by_rank": []
}
}

View File

@ -0,0 +1 @@
{"health":{"health":{"health_services":[{"mons":[{"name":"juju-2691ab-1-lxd-1","kb_total":155284096,"kb_used":1247744,"kb_avail":154036352,"avail_percent":99,"last_updated":"2017-05-17 03:31:35.562497","store_stats":{"bytes_total":1012055342,"bytes_sst":0,"bytes_log":29673298,"bytes_misc":982382044,"last_updated":"0.000000"},"health":"HEALTH_OK"},{"name":"juju-2691ab-13-lxd-0","kb_total":153820288,"kb_used":1361280,"kb_avail":152459008,"avail_percent":99,"last_updated":"2017-05-17 03:31:04.097201","store_stats":{"bytes_total":1370003168,"bytes_sst":0,"bytes_log":29813159,"bytes_misc":1340190009,"last_updated":"0.000000"},"health":"HEALTH_OK"},{"name":"juju-2691ab-2-lxd-1","kb_total":155251072,"kb_used":1373440,"kb_avail":153877632,"avail_percent":99,"last_updated":"2017-05-17 03:31:20.684777","store_stats":{"bytes_total":1400974192,"bytes_sst":0,"bytes_log":1129945,"bytes_misc":1399844247,"last_updated":"0.000000"},"health":"HEALTH_OK"}]}]},"timechecks":{"epoch":32,"round":24492,"round_status":"finished","mons":[{"name":"juju-2691ab-1-lxd-1","skew":0.000000,"latency":0.000000,"health":"HEALTH_OK"},{"name":"juju-2691ab-13-lxd-0","skew":0.000919,"latency":0.001036,"health":"HEALTH_OK"},{"name":"juju-2691ab-2-lxd-1","skew":0.000000,"latency":0.001009,"health":"HEALTH_OK"}]},"summary":[{"severity":"HEALTH_WARN","summary":"48 pgs backfill_wait"},{"severity":"HEALTH_WARN","summary":"45 pgs backfilling"},{"severity":"HEALTH_WARN","summary":"1 pgs degraded"},{"severity":"HEALTH_WARN","summary":"1 pgs recovery_wait"},{"severity":"HEALTH_WARN","summary":"22 pgs stuck unclean"},{"severity":"HEALTH_WARN","summary":"recovery 14\/46842755 objects degraded (0.000%)"},{"severity":"HEALTH_WARN","summary":"recovery 448540\/46842755 objects misplaced 
(0.958%)"}],"overall_status":"HEALTH_WARN","detail":[]},"fsid":"ca9451f1-5c4f-4e85-bb14-a08dfc0568f7","election_epoch":32,"quorum":[0,1,2],"quorum_names":["juju-2691ab-1-lxd-1","juju-2691ab-13-lxd-0","juju-2691ab-2-lxd-1"],"monmap":{"epoch":1,"fsid":"ca9451f1-5c4f-4e85-bb14-a08dfc0568f7","modified":"2016-12-03 08:09:21.854671","created":"2016-12-03 08:09:21.854671","mons":[{"rank":0,"name":"juju-2691ab-1-lxd-1","addr":"10.182.254.221:6789\/0"},{"rank":1,"name":"juju-2691ab-13-lxd-0","addr":"10.182.254.229:6789\/0"},{"rank":2,"name":"juju-2691ab-2-lxd-1","addr":"10.182.254.242:6789\/0"}]},"osdmap":{"osdmap":{"epoch":141540,"num_osds":314,"num_up_osds":314,"num_in_osds":314,"full":false,"nearfull":false,"num_remapped_pgs":92}},"pgmap":{"pgs_by_state":[{"state_name":"active+clean","count":9274},{"state_name":"active+remapped+wait_backfill","count":48},{"state_name":"active+remapped+backfilling","count":45},{"state_name":"active+clean+scrubbing+deep","count":9},{"state_name":"active+remapped","count":2},{"state_name":"active+recovery_wait+degraded","count":1},{"state_name":"active+clean+scrubbing","count":1}],"version":13885884,"num_pgs":9380,"data_bytes":64713222471610,"bytes_used":193613093122048,"bytes_avail":690058090491904,"bytes_total":883671183613952,"degraded_objects":14,"degraded_total":46842755,"degraded_ratio":0.000000,"misplaced_objects":448540,"misplaced_total":46842755,"misplaced_ratio":0.009575,"recovering_objects_per_sec":389,"recovering_bytes_per_sec":1629711746,"recovering_keys_per_sec":0,"num_objects_recovered":218,"num_bytes_recovered":912252928,"num_keys_recovered":0,"read_bytes_sec":117041457,"write_bytes_sec":293414043,"read_op_per_sec":5282,"write_op_per_sec":5270},"fsmap":{"epoch":1,"by_rank":[]}}

View File

@ -44,7 +44,11 @@ CHARM_CONFIG = {'config-flags': '',
'osd-format': 'ext4',
'monitor-hosts': '',
'prefer-ipv6': False,
'default-rbd-features': None}
'default-rbd-features': None,
'nagios_degraded_thresh': '1',
'nagios_misplaced_thresh': '10',
'nagios_recovery_rate': '1',
'nagios_ignore_nodeepscub': False}
class CephHooksTestCase(unittest.TestCase):
@ -168,7 +172,10 @@ class CephHooksTestCase(unittest.TestCase):
'use_syslog': 'true'}
self.assertEqual(ctxt, expected)
def test_nrpe_dependency_installed(self):
@patch.object(ceph_hooks, 'config')
def test_nrpe_dependency_installed(self, mock_config):
config = copy.deepcopy(CHARM_CONFIG)
mock_config.side_effect = lambda key: config[key]
with patch.multiple(ceph_hooks,
apt_install=DEFAULT,
rsync=DEFAULT,
@ -179,7 +186,12 @@ class CephHooksTestCase(unittest.TestCase):
mocks["apt_install"].assert_called_once_with(
["python-dbus", "lockfile-progs"])
def test_upgrade_charm_with_nrpe_relation_installs_dependencies(self):
@patch.object(ceph_hooks, 'config')
def test_upgrade_charm_with_nrpe_relation_installs_dependencies(
self,
mock_config):
config = copy.deepcopy(CHARM_CONFIG)
mock_config.side_effect = lambda key: config[key]
with patch.multiple(
ceph_hooks,
apt_install=DEFAULT,

View File

@ -0,0 +1,75 @@
# Copyright 2016 Canonical Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import os
import sys
from mock import patch
# import the module we want to test
os.sys.path.insert(1, os.path.join(sys.path[0], 'files/nagios'))
import check_ceph_status
@patch('subprocess.check_output')
class NagiosTestCase(unittest.TestCase):
    """Run check_ceph_status against canned 'ceph status' JSON reports."""

    def _prepare(self, mock_subprocess, fixture, cli_args):
        """
        Make the mocked subprocess.check_output return a fixture's content
        and return the namespace parsed from ``cli_args``.
        """
        with open(os.path.join('unit_tests', fixture)) as f:
            mock_subprocess.return_value = f.read()
        return check_ceph_status.parse_args(cli_args)

    def test_health_ok(self, mock_subprocess):
        args = self._prepare(mock_subprocess, 'ceph_ok.json',
                             ['--degraded_thresh', '1'])
        check_output = check_ceph_status.check_ceph_status(args)
        # assertRegexpMatches is a deprecated alias that newer Python
        # versions no longer provide; the message is a fixed string anyway
        self.assertEqual(check_output, "All OK")

    def test_health_warn(self, mock_subprocess):
        args = self._prepare(mock_subprocess, 'ceph_warn.json',
                             ['--degraded_thresh', '1'])
        self.assertRaises(check_ceph_status.WarnError,
                          check_ceph_status.check_ceph_status, args)

    def test_health_crit(self, mock_subprocess):
        args = self._prepare(mock_subprocess, 'ceph_crit.json',
                             ['--degraded_thresh', '1'])
        self.assertRaises(check_ceph_status.CriticalError,
                          check_ceph_status.check_ceph_status, args)

    def test_health_lotsdegraded(self, mock_subprocess):
        # NOTE(review): this fixture is critical because its
        # "recovery lots/bignumber objects degraded" summary line is not in
        # the ignorable list, not because a ratio threshold is exceeded
        args = self._prepare(mock_subprocess, 'ceph_params.json',
                             ['--degraded_thresh', '1'])
        self.assertRaises(check_ceph_status.CriticalError,
                          check_ceph_status.check_ceph_status, args)

    def test_health_nodeepscrub(self, mock_subprocess):
        args = self._prepare(mock_subprocess, 'ceph_nodeepscrub.json',
                             ['--degraded_thresh', '1'])
        self.assertRaises(check_ceph_status.CriticalError,
                          check_ceph_status.check_ceph_status, args)

    def test_health_nodeepscrubok(self, mock_subprocess):
        args = self._prepare(mock_subprocess, 'ceph_nodeepscrub.json',
                             ['--ignore_nodeepscrub'])
        self.assertRaises(check_ceph_status.WarnError,
                          check_ceph_status.check_ceph_status, args)