Don't return Critical when Ceph is in a warning state.

The current implementation returns Critical when Ceph is in a warning
state, apart from some known exceptions which are considered
operational tasks. However, this causes many alarms.
This patch changes the behavior to report Warning when Ceph is
in HEALTH_WARN. If known operational tasks exceed their
thresholds, Critical is returned.

Change-Id: I7a330189da8f0ba9168cedb534823c5e8f4795ba
This commit is contained in:
Marian Gasparovic 2018-11-06 12:20:25 +01:00
parent 7a362ff0a5
commit 35c8e40e83
10 changed files with 919 additions and 264 deletions

View File

@ -189,17 +189,21 @@ options:
type: float
description: "Threshold for degraded ratio (0.1 = 10%)"
nagios_misplaced_thresh:
default: 10.0
default: 1.0
type: float
description: "Threshold for misplaced ratio (0.1 = 10%)"
nagios_recovery_rate:
default: '1'
type: string
description: Recovery rate below which we consider recovery to be stalled
nagios_ignore_nodeepscub:
default: False
description: |
Recovery rate (in objects/s) below which we consider recovery
to be stalled.
nagios_raise_nodeepscrub:
default: True
type: boolean
description: Whether to ignore the nodeep-scrub flag
description: |
Whether to report Critical instead of Warning when the nodeep-scrub
flag is set.
use-direct-io:
type: boolean
default: True

View File

@ -102,10 +102,6 @@ def check_ceph_status(args):
:returns string, describing the status of the ceph cluster.
"""
ignorable = (r'\d+ pgs (?:backfill|degraded|recovery_wait|stuck unclean)|'
'recovery \d+\/\d+ objects (?:degraded|misplaced)')
if args.ignore_nodeepscrub:
ignorable = ignorable + '|nodeep-scrub flag\(s\) set'
status_critical = False
if args.status_file:
check_file_freshness(args.status_file)
@ -136,41 +132,60 @@ def check_ceph_status(args):
luminous = False
if overall_status != 'HEALTH_OK':
# Health is not OK, check if any lines are not in our list of OK
# any lines that don't match, check is critical
# Health is not OK, collect status message(s) and
# decide whether to return warning or critical
status_critical = False
status_msg = []
if luminous:
status_messages = [x['summary']['message'] for x in status_data['health'].get('checks').values()]
status_messages = [x['summary']['message']
for x in
status_data['health'].get('checks').values()]
else:
status_messages = [x['summary'] for x in status_data['health']['summary']]
status_messages = [x['summary']
for x in
status_data['health']['summary']]
for status in status_messages:
if not re.match(ignorable, status):
status_msg.append(status)
# Check if nodeep-scrub is set and whether it should raise an error
if args.raise_nodeepscrub:
if re.match("nodeep-scrub flag", status):
status_critical = True
if overall_status == 'HEALTH_CRITICAL' or \
overall_status == 'HEALTH_ERR':
# HEALTH_ERR, report critical
status_critical = True
else:
# HEALTH_WARN
# Check the threshold for a list of operational tasks,
# and return CRITICAL if exceeded
degraded_ratio = float(status_data['pgmap'].get('degraded_ratio',
0.0))
if degraded_ratio > args.degraded_thresh:
status_critical = True
status_msg.append(status)
# If we got this far, then the status is not OK but the status lines
# are all in our list of things we consider to be operational tasks.
# Check the thresholds and return CRITICAL if exceeded,
# otherwise there's something not accounted for and we want to know
# about it with a WARN alert.
degraded_ratio = status_data['pgmap'].get('degraded_ratio', 0.0)
if degraded_ratio > args.degraded_thresh:
status_critical = True
status_msg.append("Degraded ratio: {}".format(degraded_ratio))
misplaced_ratio = status_data['pgmap'].get('misplaced_ratio', 0.0)
if misplaced_ratio > args.misplaced_thresh:
status_critical = True
status_msg.append("Misplaced ratio: {}".format(misplaced_ratio))
recovering = status_data['pgmap'].get('recovering_objects_per_sec',
0.0)
if recovering < args.recovery_rate:
status_critical = True
status_msg.append("Recovering objects/sec {}".format(recovering))
if degraded_ratio > 0:
status_msg.append("Degraded ratio: {}".format(degraded_ratio))
misplaced_ratio = float(status_data['pgmap'].get('misplaced_ratio',
0.0))
if misplaced_ratio > args.misplaced_thresh:
status_critical = True
if misplaced_ratio > 0:
status_msg.append("Misplaced ratio: {}".
format(misplaced_ratio))
recovering = float(status_data['pgmap'].
get('recovering_objects_per_sec', 0.0))
if (degraded_ratio > 0 or misplaced_ratio > 0) \
and recovering > 0 \
and recovering < args.recovery_rate:
status_critical = True
if recovering > 0:
status_msg.append("Recovering objects/s {}".format(recovering))
if status_critical:
msg = 'CRITICAL: ceph health: "{} {}"'.format(
overall_status,
", ".join(status_msg))
raise CriticalError(msg)
if overall_status == 'HEALTH_WARN':
else:
# overall_status == 'HEALTH_WARN':
msg = "WARNING: {}".format(", ".join(status_msg))
raise WarnError(msg)
message = "All OK"
@ -187,21 +202,21 @@ def parse_args(args):
'user account does not have rights for the Ceph '
'config files.')
parser.add_argument('--degraded_thresh', dest='degraded_thresh',
default=1, type=float,
default=1.0, type=float,
help="Threshold for degraded ratio (0.1 = 10%)")
parser.add_argument('--misplaced_thresh', dest='misplaced_thresh',
default=10, type=float,
default=1.0, type=float,
help="Threshold for misplaced ratio (0.1 = 10%)")
parser.add_argument('--recovery_rate', dest='recovery_rate',
default=1, type=int,
help="Recovery rate below which we consider recovery "
"to be stalled")
parser.add_argument('--ignore_nodeepscrub', dest='ignore_nodeepscrub',
help="Recovery rate (in objects/s) below which we"
"consider recovery to be stalled")
parser.add_argument('--raise_nodeepscrub', dest='raise_nodeepscrub',
default=False, action='store_true',
help="Whether to ignore the nodeep-scrub flag. If "
"the nodeep-scrub flag is set, the check returns "
"warning if this param is passed, otherwise "
"returns critical.")
help="Whether to raise an error for the nodeep-scrub"
"flag. If the nodeep-scrub flag is set,"
"the check returns critical if this param is"
"passed, otherwise it returns warning.")
return parser.parse_args(args)
@ -218,7 +233,7 @@ def main(args):
exitcode = 'critical'
except WarnError as msg:
print(msg)
exitcode = 'critical'
exitcode = 'warning'
except:
print("%s raised unknown exception '%s'" % ('check_ceph_status',
sys.exc_info()[0]))

View File

@ -748,8 +748,8 @@ def update_nrpe_config():
config('nagios_degraded_thresh'),
config('nagios_misplaced_thresh'),
config('nagios_recovery_rate'))
if config('nagios_ignore_nodeepscub'):
check_cmd = check_cmd + ' --ignore_nodeepscrub'
if config('nagios_raise_nodeepscrub'):
check_cmd = check_cmd + ' --raise_nodeepscrub'
nrpe_setup.add_check(
shortname="ceph",
description='Check Ceph health {{{}}}'.format(current_unit),

View File

@ -0,0 +1,147 @@
{
"fsid": "b03a2900-e297-11e8-a7db-00163ed10659",
"health": {
"checks": {
"OSD_DOWN": {
"severity": "HEALTH_WARN",
"summary": {
"message": "3 osds down"
}
},
"OSD_HOST_DOWN": {
"severity": "HEALTH_WARN",
"summary": {
"message": "1 host (3 osds) down"
}
},
"OBJECT_MISPLACED": {
"severity": "HEALTH_WARN",
"summary": {
"message": "9883/43779 objects misplaced (22.575%)"
}
},
"PG_DEGRADED": {
"severity": "HEALTH_WARN",
"summary": {
"message": "Degraded data redundancy: 14001/43779 objects degraded (31.981%), 32 pgs degraded"
}
},
"POOL_APP_NOT_ENABLED": {
"severity": "HEALTH_WARN",
"summary": {
"message": "application not enabled on 1 pool(s)"
}
},
"TOO_FEW_PGS": {
"severity": "HEALTH_WARN",
"summary": {
"message": "too few PGs per OSD (7 < min 30)"
}
}
},
"status": "HEALTH_WARN"
},
"election_epoch": 5,
"quorum": [
0
],
"quorum_names": [
"juju-460e0f-11"
],
"monmap": {
"epoch": 1,
"fsid": "b03a2900-e297-11e8-a7db-00163ed10659",
"modified": "2018-11-07 14:17:12.324408",
"created": "2018-11-07 14:17:12.324408",
"features": {
"persistent": [
"kraken",
"luminous"
],
"optional": []
},
"mons": [
{
"rank": 0,
"name": "juju-460e0f-11",
"addr": "192.168.100.81:6789/0",
"public_addr": "192.168.100.81:6789/0"
}
]
},
"osdmap": {
"osdmap": {
"epoch": 72,
"num_osds": 9,
"num_up_osds": 6,
"num_in_osds": 9,
"full": false,
"nearfull": false,
"num_remapped_pgs": 16
}
},
"pgmap": {
"pgs_by_state": [
{
"state_name": "active+undersized+degraded",
"count": 16
},
{
"state_name": "active+undersized+degraded+remapped+backfill_wait",
"count": 14
},
{
"state_name": "active+undersized+degraded+remapped+backfilling",
"count": 2
}
],
"num_pgs": 32,
"num_pools": 1,
"num_objects": 14593,
"data_bytes": 61169729807,
"bytes_used": 14540595200,
"bytes_avail": 14889525248,
"bytes_total": 29430120448,
"degraded_objects": 14001,
"degraded_total": 43779,
"degraded_ratio": 0.319811,
"misplaced_objects": 9883,
"misplaced_total": 43779,
"misplaced_ratio": 0.225748
},
"fsmap": {
"epoch": 1,
"by_rank": []
},
"mgrmap": {
"epoch": 5,
"active_gid": 14097,
"active_name": "juju-460e0f-11",
"active_addr": "192.168.100.81:6800/204",
"available": true,
"standbys": [],
"modules": [
"balancer",
"restful",
"status"
],
"available_modules": [
"balancer",
"dashboard",
"influx",
"localpool",
"prometheus",
"restful",
"selftest",
"status",
"zabbix"
],
"services": {}
},
"servicemap": {
"epoch": 1,
"modified": "0.000000",
"services": {}
}
}

118
unit_tests/ceph_error.json Normal file
View File

@ -0,0 +1,118 @@
{
"health": {
"health": {
"health_services": [
{
"mons": [
{
"name": "juju-460e0f-12",
"kb_total": 1829760,
"kb_used": 835072,
"kb_avail": 994688,
"avail_percent": 54,
"last_updated": "2018-11-07 18:46:32.308592",
"store_stats": {
"bytes_total": 15678387,
"bytes_sst": 0,
"bytes_log": 420953,
"bytes_misc": 15257434,
"last_updated": "0.000000"
},
"health": "HEALTH_OK"
}
]
}
]
},
"timechecks": {
"epoch": 3,
"round": 0,
"round_status": "finished"
},
"summary": [
{
"severity": "HEALTH_ERR",
"summary": "6 pgs are stuck inactive for more than 300 seconds"
},
{
"severity": "HEALTH_WARN",
"summary": "7 pgs peering"
},
{
"severity": "HEALTH_WARN",
"summary": "6 pgs stuck inactive"
},
{
"severity": "HEALTH_WARN",
"summary": "6 pgs stuck unclean"
}
],
"overall_status": "HEALTH_ERR",
"detail": []
},
"fsid": "68a9ca14-e297-11e8-843c-00163e64b0c0",
"election_epoch": 3,
"quorum": [
0
],
"quorum_names": [
"juju-460e0f-12"
],
"monmap": {
"epoch": 1,
"fsid": "68a9ca14-e297-11e8-843c-00163e64b0c0",
"modified": "2018-11-07 14:17:27.659064",
"created": "2018-11-07 14:17:27.659064",
"mons": [
{
"rank": 0,
"name": "juju-460e0f-12",
"addr": "192.168.100.26:6789\/0"
}
]
},
"osdmap": {
"osdmap": {
"epoch": 28,
"num_osds": 9,
"num_up_osds": 9,
"num_in_osds": 9,
"full": false,
"nearfull": false,
"num_remapped_pgs": 0
}
},
"pgmap": {
"pgs_by_state": [
{
"state_name": "creating",
"count": 113
},
{
"state_name": "active+clean",
"count": 64
},
{
"state_name": "activating",
"count": 8
},
{
"state_name": "peering",
"count": 7
}
],
"version": 7831,
"num_pgs": 192,
"data_bytes": 1790967809,
"bytes_used": 9995157504,
"bytes_avail": 9157476352,
"bytes_total": 19152633856,
"write_bytes_sec": 89844495,
"read_op_per_sec": 0,
"write_op_per_sec": 21
},
"fsmap": {
"epoch": 1,
"by_rank": []
}
}

View File

@ -0,0 +1,147 @@
{
"fsid": "b03a2900-e297-11e8-a7db-00163ed10659",
"health": {
"checks": {
"OBJECT_MISPLACED": {
"severity": "HEALTH_WARN",
"summary": {
"message": "1560/12264 objects misplaced (12.720%)"
}
},
"PG_AVAILABILITY": {
"severity": "HEALTH_WARN",
"summary": {
"message": "Reduced data availability: 27 pgs inactive, 30 pgs peering"
}
},
"POOL_APP_NOT_ENABLED": {
"severity": "HEALTH_WARN",
"summary": {
"message": "application not enabled on 1 pool(s)"
}
},
"TOO_FEW_PGS": {
"severity": "HEALTH_WARN",
"summary": {
"message": "too few PGs per OSD (21 < min 30)"
}
}
},
"status": "HEALTH_WARN"
},
"election_epoch": 5,
"quorum": [
0
],
"quorum_names": [
"juju-460e0f-11"
],
"monmap": {
"epoch": 1,
"fsid": "b03a2900-e297-11e8-a7db-00163ed10659",
"modified": "2018-11-07 14:17:12.324408",
"created": "2018-11-07 14:17:12.324408",
"features": {
"persistent": [
"kraken",
"luminous"
],
"optional": []
},
"mons": [
{
"rank": 0,
"name": "juju-460e0f-11",
"addr": "192.168.100.81:6789/0",
"public_addr": "192.168.100.81:6789/0"
}
]
},
"osdmap": {
"osdmap": {
"epoch": 118,
"num_osds": 9,
"num_up_osds": 9,
"num_in_osds": 9,
"full": false,
"nearfull": false,
"num_remapped_pgs": 15
}
},
"pgmap": {
"pgs_by_state": [
{
"state_name": "unknown",
"count": 65
},
{
"state_name": "peering",
"count": 31
},
{
"state_name": "activating",
"count": 17
},
{
"state_name": "activating+remapped",
"count": 15
}
],
"num_pgs": 128,
"num_pools": 1,
"num_objects": 4088,
"data_bytes": 17187733578,
"bytes_used": 14360064000,
"bytes_avail": 15023263744,
"bytes_total": 29383327744,
"unknown_pgs_ratio": 0.507812,
"inactive_pgs_ratio": 0.492188,
"misplaced_objects": 1560,
"misplaced_total": 12264,
"misplaced_ratio": 0.127202,
"recovering_objects_per_sec": 14,
"recovering_bytes_per_sec": 60779755,
"recovering_keys_per_sec": 0,
"num_objects_recovered": 113,
"num_bytes_recovered": 471859200,
"num_keys_recovered": 0,
"read_bytes_sec": 0,
"write_bytes_sec": 244132150,
"read_op_per_sec": 0,
"write_op_per_sec": 116
},
"fsmap": {
"epoch": 1,
"by_rank": []
},
"mgrmap": {
"epoch": 5,
"active_gid": 14097,
"active_name": "juju-460e0f-11",
"active_addr": "192.168.100.81:6800/204",
"available": true,
"standbys": [],
"modules": [
"balancer",
"restful",
"status"
],
"available_modules": [
"balancer",
"dashboard",
"influx",
"localpool",
"prometheus",
"restful",
"selftest",
"status",
"zabbix"
],
"services": {}
},
"servicemap": {
"epoch": 1,
"modified": "0.000000",
"services": {}
}
}

View File

@ -1,177 +1,202 @@
{
"health": {
"health": {
"health_services": [
{
"mons": [
"health": {
"health_services": [
{
"mons": [
{
"name": "juju-c62a41-21-lxd-0",
"kb_total": 334602320,
"kb_used": 2127960,
"kb_avail": 315454468,
"avail_percent": 94,
"last_updated": "2018-11-08 09:47:09.932189",
"store_stats": {
"bytes_total": 34880542,
"bytes_sst": 0,
"bytes_log": 1647123,
"bytes_misc": 33233419,
"last_updated": "0.000000"
},
"health": "HEALTH_OK"
},
{
"name": "juju-c62a41-24-lxd-0",
"kb_total": 334602320,
"kb_used": 2128116,
"kb_avail": 315454312,
"avail_percent": 94,
"last_updated": "2018-11-08 09:47:16.418007",
"store_stats": {
"bytes_total": 36811676,
"bytes_sst": 0,
"bytes_log": 3574345,
"bytes_misc": 33237331,
"last_updated": "0.000000"
},
"health": "HEALTH_OK"
},
{
"name": "juju-c62a41-25-lxd-0",
"kb_total": 334602320,
"kb_used": 2128860,
"kb_avail": 315453568,
"avail_percent": 94,
"last_updated": "2018-11-08 09:47:21.198816",
"store_stats": {
"bytes_total": 37388424,
"bytes_sst": 0,
"bytes_log": 4151569,
"bytes_misc": 33236855,
"last_updated": "0.000000"
},
"health": "HEALTH_OK"
}
]
}
]
},
"timechecks": {
"epoch": 14,
"round": 4480,
"round_status": "finished",
"mons": [
{
"name": "juju-c62a41-21-lxd-0",
"skew": 0.000000,
"latency": 0.000000,
"health": "HEALTH_OK"
},
{
"name": "juju-c62a41-24-lxd-0",
"skew": 0.000282,
"latency": 0.000989,
"health": "HEALTH_OK"
},
{
"name": "juju-c62a41-25-lxd-0",
"skew": -0.001223,
"latency": 0.000776,
"health": "HEALTH_OK"
}
]
},
"summary": [
{
"name": "node1",
"kb_total": 140956600,
"kb_used": 15916132,
"kb_avail": 117857208,
"avail_percent": 83,
"last_updated": "2017-05-17 03:23:11.248297",
"store_stats": {
"bytes_total": 140014259,
"bytes_sst": 0,
"bytes_log": 13670758,
"bytes_misc": 126343501,
"last_updated": "0.000000"
},
"health": "HEALTH_OK"
"severity": "HEALTH_WARN",
"summary": "19 pgs backfill_wait"
},
{
"name": "node2",
"kb_total": 70395920,
"kb_used": 10532504,
"kb_avail": 56264436,
"avail_percent": 79,
"last_updated": "2017-05-17 03:23:16.952673",
"store_stats": {
"bytes_total": 315512452,
"bytes_sst": 0,
"bytes_log": 21691698,
"bytes_misc": 293820754,
"last_updated": "0.000000"
},
"health": "HEALTH_OK"
"severity": "HEALTH_WARN",
"summary": "4 pgs backfilling"
},
{
"name": "juju-machine-85-lxc-10",
"kb_total": 131927524,
"kb_used": 79521024,
"kb_avail": 45954016,
"avail_percent": 34,
"last_updated": "2017-05-17 03:23:13.794034",
"store_stats": {
"bytes_total": 89036349,
"bytes_sst": 0,
"bytes_log": 21055337,
"bytes_misc": 67981012,
"last_updated": "0.000000"
},
"health": "HEALTH_OK"
"severity": "HEALTH_WARN",
"summary": "1 pgs peering"
},
{
"severity": "HEALTH_WARN",
"summary": "24 pgs stuck unclean"
},
{
"severity": "HEALTH_WARN",
"summary": "recovery 17386\/112794 objects misplaced (15.414%)"
},
{
"severity": "HEALTH_WARN",
"summary": "pool pool1 has many more objects per pg than average (too few pgs?)"
},
{
"severity": "HEALTH_WARN",
"summary": "nodeep-scrub flag(s) set"
}
]
}
]
],
"overall_status": "HEALTH_WARN",
"detail": []
},
"timechecks": {
"epoch": 280,
"round": 19874,
"round_status": "finished",
"mons": [
{
"name": "node1",
"skew": "0.000000",
"latency": "0.000000",
"health": "HEALTH_OK"
},
{
"name": "node2",
"skew": "-0.000000",
"latency": "0.000866",
"health": "HEALTH_OK"
},
{
"name": "juju-machine-85-lxc-10",
"skew": "-0.000000",
"latency": "0.018848",
"health": "HEALTH_OK"
}
]
},
"summary": [
{
"severity": "HEALTH_WARN",
"summary": "nodeep-scrub flag(s) set"
}
"fsid": "66af7af5-2f60-4e0e-94dc-49f49bd37284",
"election_epoch": 14,
"quorum": [
0,
1,
2
],
"overall_status": "HEALTH_WARN",
"detail": []
},
"fsid": "some_fsid",
"election_epoch": 280,
"quorum": [
0,
1,
2
],
"quorum_names": [
"node1",
"node2",
"juju-machine-85-lxc-10"
],
"monmap": {
"epoch": 3,
"fsid": "some_fsid",
"modified": "2016-11-25 00:08:51.235813",
"created": "0.000000",
"mons": [
{
"rank": 0,
"name": "node1",
"addr": "10.24.0.15:6789/0"
},
{
"rank": 1,
"name": "node2",
"addr": "10.24.0.17:6789/0"
},
{
"rank": 2,
"name": "juju-machine-85-lxc-10",
"addr": "10.24.0.195:6789/0"
}
]
},
"osdmap": {
"quorum_names": [
"juju-c62a41-21-lxd-0",
"juju-c62a41-24-lxd-0",
"juju-c62a41-25-lxd-0"
],
"monmap": {
"epoch": 2,
"fsid": "66af7af5-2f60-4e0e-94dc-49f49bd37284",
"modified": "2018-10-31 15:37:56.902830",
"created": "2018-10-31 15:37:40.288870",
"mons": [
{
"rank": 0,
"name": "juju-c62a41-21-lxd-0",
"addr": "100.84.195.4:6789\/0"
},
{
"rank": 1,
"name": "juju-c62a41-24-lxd-0",
"addr": "100.84.196.4:6789\/0"
},
{
"rank": 2,
"name": "juju-c62a41-25-lxd-0",
"addr": "100.84.196.5:6789\/0"
}
]
},
"osdmap": {
"epoch": 37820,
"num_osds": 46,
"num_up_osds": 46,
"num_in_osds": 46,
"full": false,
"nearfull": false
"osdmap": {
"epoch": 316,
"num_osds": 48,
"num_up_osds": 48,
"num_in_osds": 48,
"full": false,
"nearfull": false,
"num_remapped_pgs": 22
}
},
"pgmap": {
"pgs_by_state": [
{
"state_name": "active+clean",
"count": 3448
},
{
"state_name": "active+remapped+wait_backfill",
"count": 19
},
{
"state_name": "active+remapped+backfilling",
"count": 4
},
{
"state_name": "peering",
"count": 1
}
],
"version": 141480,
"num_pgs": 3472,
"data_bytes": 157009583781,
"bytes_used": 487185850368,
"bytes_avail": 75282911256576,
"bytes_total": 75770097106944,
"misplaced_objects": 17386,
"misplaced_total": 112794,
"misplaced_ratio": 0.154139,
"recovering_objects_per_sec": 436,
"recovering_bytes_per_sec": 1832614589,
"recovering_keys_per_sec": 0,
"num_objects_recovered": 446,
"num_bytes_recovered": 1870659584,
"num_keys_recovered": 0
},
"fsmap": {
"epoch": 1,
"by_rank": []
}
},
"pgmap": {
"pgs_by_state": [
{
"state_name": "active+clean",
"count": 1988
},
{
"state_name": "active+remapped+wait_backfill",
"count": 3
},
{
"state_name": "active+remapped+backfilling",
"count": 1
}
],
"version": 58873447,
"num_pgs": 1992,
"data_bytes": 35851846298041,
"bytes_used": 107730678743040,
"bytes_avail": 63413590548480,
"bytes_total": 171144269291520,
"degraded_objects": 0,
"degraded_total": 25759217,
"degraded_ratio": 0,
"recovering_objects_per_sec": 17,
"recovering_bytes_per_sec": 72552794,
"recovering_keys_per_sec": 0,
"read_bytes_sec": 23935944,
"write_bytes_sec": 7024503,
"op_per_sec": 5332
},
"mdsmap": {
"epoch": 1,
"up": 0,
"in": 0,
"max": 1,
"by_rank": []
}
}

View File

@ -0,0 +1,102 @@
{
"fsid": "b03a2900-e297-11e8-a7db-00163ed10659",
"health": {
"checks": {
"OSDMAP_FLAGS": {
"severity": "HEALTH_WARN",
"summary": {
"message": "nodeep-scrub flag(s) set"
}
}
},
"status": "HEALTH_WARN"
},
"election_epoch": 5,
"quorum": [
0
],
"quorum_names": [
"juju-460e0f-11"
],
"monmap": {
"epoch": 1,
"fsid": "b03a2900-e297-11e8-a7db-00163ed10659",
"modified": "2018-11-07 14:17:12.324408",
"created": "2018-11-07 14:17:12.324408",
"features": {
"persistent": [
"kraken",
"luminous"
],
"optional": []
},
"mons": [
{
"rank": 0,
"name": "juju-460e0f-11",
"addr": "192.168.100.81:6789/0",
"public_addr": "192.168.100.81:6789/0"
}
]
},
"osdmap": {
"osdmap": {
"epoch": 518,
"num_osds": 9,
"num_up_osds": 9,
"num_in_osds": 9,
"full": false,
"nearfull": false,
"num_remapped_pgs": 0
}
},
"pgmap": {
"pgs_by_state": [
{
"state_name": "active+clean",
"count": 128
}
],
"num_pgs": 128,
"num_pools": 1,
"num_objects": 14896,
"data_bytes": 62440603919,
"bytes_used": 14225776640,
"bytes_avail": 9450938368,
"bytes_total": 23676715008
},
"fsmap": {
"epoch": 1,
"by_rank": []
},
"mgrmap": {
"epoch": 5,
"active_gid": 14097,
"active_name": "juju-460e0f-11",
"active_addr": "192.168.100.81:6800/204",
"available": true,
"standbys": [],
"modules": [
"balancer",
"restful",
"status"
],
"available_modules": [
"balancer",
"dashboard",
"influx",
"localpool",
"prometheus",
"restful",
"selftest",
"status",
"zabbix"
],
"services": {}
},
"servicemap": {
"epoch": 1,
"modified": "0.000000",
"services": {}
}
}

View File

@ -48,7 +48,7 @@ CHARM_CONFIG = {'config-flags': '',
'nagios_degraded_thresh': '1',
'nagios_misplaced_thresh': '10',
'nagios_recovery_rate': '1',
'nagios_ignore_nodeepscub': False,
'nagios_raise_nodeepscrub': True,
'disable-pg-max-object-skew': False}

View File

@ -32,6 +32,7 @@ class NagiosTestCase(unittest.TestCase):
ceph_version = check_ceph_status.get_ceph_version()
self.assertEqual(ceph_version, [10, 2, 9])
# All OK, pre-luminous
@patch('check_ceph_status.get_ceph_version')
def test_health_ok(self, mock_ceph_version, mock_subprocess):
mock_ceph_version.return_value = [10, 2, 9]
@ -42,6 +43,84 @@ class NagiosTestCase(unittest.TestCase):
check_output = check_ceph_status.check_ceph_status(args)
self.assertRegex(check_output, r"^All OK$")
# Warning, pre-luminous
@patch('check_ceph_status.get_ceph_version')
def test_health_warn(self, mock_ceph_version, mock_subprocess):
mock_ceph_version.return_value = [10, 2, 9]
with open('unit_tests/ceph_warn.json') as f:
tree = f.read()
mock_subprocess.return_value = tree.encode('UTF-8')
args = check_ceph_status.parse_args("")
self.assertRaises(check_ceph_status.WarnError,
lambda: check_ceph_status.check_ceph_status(args))
# Error, pre-luminous, health_critical status
@patch('check_ceph_status.get_ceph_version')
def test_health_err(self, mock_ceph_version, mock_subprocess):
mock_ceph_version.return_value = [10, 2, 9]
with open('unit_tests/ceph_crit.json') as f:
tree = f.read()
mock_subprocess.return_value = tree.encode('UTF-8')
args = check_ceph_status.parse_args("")
self.assertRaises(check_ceph_status.CriticalError,
lambda: check_ceph_status.check_ceph_status(args))
# Error, pre-luminous, overall HEALTH_ERR
@patch('check_ceph_status.get_ceph_version')
def test_health_crit(self, mock_ceph_version, mock_subprocess):
mock_ceph_version.return_value = [10, 2, 9]
with open('unit_tests/ceph_error.json') as f:
tree = f.read()
mock_subprocess.return_value = tree.encode('UTF-8')
args = check_ceph_status.parse_args("")
self.assertRaises(check_ceph_status.CriticalError,
lambda: check_ceph_status.check_ceph_status(args))
# Error, pre-luminous, because misplaced ratio is too big
@patch('check_ceph_status.get_ceph_version')
def test_health_crit_misplaced(self, mock_ceph_version, mock_subprocess):
mock_ceph_version.return_value = [10, 2, 9]
with open('unit_tests/ceph_params.json') as f:
tree = f.read()
mock_subprocess.return_value = tree.encode('UTF-8')
args = check_ceph_status.parse_args(['--misplaced_thresh', '0.1'])
self.assertRaises(check_ceph_status.CriticalError,
lambda: check_ceph_status.check_ceph_status(args))
# Error, pre-luminous, because recovery rate is too low
@patch('check_ceph_status.get_ceph_version')
def test_health_crit_recovery(self, mock_ceph_version, mock_subprocess):
mock_ceph_version.return_value = [10, 2, 9]
with open('unit_tests/ceph_params.json') as f:
tree = f.read()
mock_subprocess.return_value = tree.encode('UTF-8')
args = check_ceph_status.parse_args(['--recovery_rate', '400'])
self.assertRaises(check_ceph_status.CriticalError,
lambda: check_ceph_status.check_ceph_status(args))
# Warning, pre-luminous, deepscrub
@patch('check_ceph_status.get_ceph_version')
def test_health_warn_deepscrub(self, mock_ceph_version, mock_subprocess):
mock_ceph_version.return_value = [10, 2, 9]
with open('unit_tests/ceph_nodeepscrub.json') as f:
tree = f.read()
mock_subprocess.return_value = tree.encode('UTF-8')
args = check_ceph_status.parse_args("")
self.assertRaises(check_ceph_status.WarnError,
lambda: check_ceph_status.check_ceph_status(args))
# Error, pre-luminous, deepscrub
@patch('check_ceph_status.get_ceph_version')
def test_health_crit_deepscrub(self, mock_ceph_version, mock_subprocess):
mock_ceph_version.return_value = [10, 2, 9]
with open('unit_tests/ceph_nodeepscrub.json') as f:
tree = f.read()
mock_subprocess.return_value = tree.encode('UTF-8')
args = check_ceph_status.parse_args(['--raise_nodeepscrub'])
self.assertRaises(check_ceph_status.CriticalError,
lambda: check_ceph_status.check_ceph_status(args))
# All OK, luminous
@patch('check_ceph_status.get_ceph_version')
def test_health_ok_luminous(self, mock_ceph_version, mock_subprocess):
mock_ceph_version.return_value = [12, 2, 0]
@ -52,62 +131,80 @@ class NagiosTestCase(unittest.TestCase):
check_output = check_ceph_status.check_ceph_status(args)
self.assertRegex(check_output, r"^All OK$")
# Warning, luminous
@patch('check_ceph_status.get_ceph_version')
def test_health_warn(self, mock_ceph_version, mock_subprocess):
mock_ceph_version.return_value = [10, 2, 9]
with open('unit_tests/ceph_warn.json') as f:
tree = f.read()
mock_subprocess.return_value = tree.encode('UTF-8')
args = check_ceph_status.parse_args(['--degraded_thresh', '1'])
self.assertRaises(check_ceph_status.WarnError,
lambda: check_ceph_status.check_ceph_status(args))
@patch('check_ceph_status.get_ceph_version')
def test_health_crit(self, mock_ceph_version, mock_subprocess):
mock_ceph_version.return_value = [10, 2, 9]
with open('unit_tests/ceph_crit.json') as f:
tree = f.read()
mock_subprocess.return_value = tree.encode('UTF-8')
args = check_ceph_status.parse_args(['--degraded_thresh', '1'])
self.assertRaises(check_ceph_status.CriticalError,
lambda: check_ceph_status.check_ceph_status(args))
@patch('check_ceph_status.get_ceph_version')
def test_health_crit_luminous(self, mock_ceph_version, mock_subprocess):
def test_health_warn_luminous(self, mock_ceph_version, mock_subprocess):
mock_ceph_version.return_value = [12, 2, 0]
with open('unit_tests/ceph_crit_luminous.json') as f:
with open('unit_tests/ceph_many_warnings_luminous.json') as f:
tree = f.read()
mock_subprocess.return_value = tree.encode('UTF-8')
args = check_ceph_status.parse_args(['--degraded_thresh', '1'])
self.assertRaises(check_ceph_status.CriticalError,
lambda: check_ceph_status.check_ceph_status(args))
@patch('check_ceph_status.get_ceph_version')
def test_health_lotsdegraded(self, mock_ceph_version, mock_subprocess):
mock_ceph_version.return_value = [10, 2, 9]
with open('unit_tests/ceph_params.json') as f:
tree = f.read()
mock_subprocess.return_value = tree.encode('UTF-8')
args = check_ceph_status.parse_args(['--degraded_thresh', '1'])
self.assertRaises(check_ceph_status.CriticalError,
lambda: check_ceph_status.check_ceph_status(args))
@patch('check_ceph_status.get_ceph_version')
def test_health_nodeepscrub(self, mock_ceph_version, mock_subprocess):
mock_ceph_version.return_value = [10, 2, 9]
with open('unit_tests/ceph_nodeepscrub.json') as f:
tree = f.read()
mock_subprocess.return_value = tree.encode('UTF-8')
args = check_ceph_status.parse_args(['--degraded_thresh', '1'])
self.assertRaises(check_ceph_status.CriticalError,
lambda: check_ceph_status.check_ceph_status(args))
@patch('check_ceph_status.get_ceph_version')
def test_health_nodeepscrubok(self, mock_ceph_version, mock_subprocess):
mock_ceph_version.return_value = [10, 2, 9]
with open('unit_tests/ceph_nodeepscrub.json') as f:
tree = f.read()
mock_subprocess.return_value = tree.encode('UTF-8')
args = check_ceph_status.parse_args(['--ignore_nodeepscrub'])
args = check_ceph_status.parse_args("")
self.assertRaises(check_ceph_status.WarnError,
lambda: check_ceph_status.check_ceph_status(args))
# Error, luminous, because of overall status
# Error, luminous, because misplaced ratio is too big
@patch('check_ceph_status.get_ceph_version')
def test_health_critical_misplaced_luminous(self,
mock_ceph_version,
mock_subprocess):
mock_ceph_version.return_value = [12, 2, 0]
with open('unit_tests/ceph_many_warnings_luminous.json') as f:
tree = f.read()
mock_subprocess.return_value = tree.encode('UTF-8')
args = check_ceph_status.parse_args(['--misplaced_thresh', '0.1'])
self.assertRaises(check_ceph_status.CriticalError,
lambda: check_ceph_status.check_ceph_status(args))
# Error, luminous, because degraded ratio is too big
@patch('check_ceph_status.get_ceph_version')
def test_health_critical_degraded_luminous(self,
mock_ceph_version,
mock_subprocess):
mock_ceph_version.return_value = [12, 2, 0]
with open('unit_tests/ceph_degraded_luminous.json') as f:
tree = f.read()
mock_subprocess.return_value = tree.encode('UTF-8')
args = check_ceph_status.parse_args(['--degraded_thresh', '0.1'])
self.assertRaises(check_ceph_status.CriticalError,
lambda: check_ceph_status.check_ceph_status(args))
# Error, luminous, because recovery rate is too low
@patch('check_ceph_status.get_ceph_version')
def test_health_critical_recovery_luminous(self,
mock_ceph_version,
mock_subprocess):
mock_ceph_version.return_value = [12, 2, 0]
with open('unit_tests/ceph_many_warnings_luminous.json') as f:
tree = f.read()
mock_subprocess.return_value = tree.encode('UTF-8')
args = check_ceph_status.parse_args(['--recovery_rate', '20'])
self.assertRaises(check_ceph_status.CriticalError,
lambda: check_ceph_status.check_ceph_status(args))
# Warning, luminous, deepscrub
@patch('check_ceph_status.get_ceph_version')
def test_health_warn_deepscrub_luminous(self,
mock_ceph_version,
mock_subprocess):
mock_ceph_version.return_value = [12, 2, 0]
with open('unit_tests/ceph_nodeepscrub_luminous.json') as f:
tree = f.read()
mock_subprocess.return_value = tree.encode('UTF-8')
args = check_ceph_status.parse_args("")
self.assertRaises(check_ceph_status.WarnError,
lambda: check_ceph_status.check_ceph_status(args))
# Error, luminous, deepscrub
@patch('check_ceph_status.get_ceph_version')
def test_health_crit_deepscrub_luminous(self,
mock_ceph_version,
mock_subprocess):
mock_ceph_version.return_value = [12, 2, 0]
with open('unit_tests/ceph_nodeepscrub_luminous.json') as f:
tree = f.read()
mock_subprocess.return_value = tree.encode('UTF-8')
args = check_ceph_status.parse_args(['--raise_nodeepscrub'])
self.assertRaises(check_ceph_status.CriticalError,
lambda: check_ceph_status.check_ceph_status(args))