Add nagios check for expected number of OSDs

This check does not require manually setting the number of expected
OSDs.

Initially, the charm sets the expected count (per host) to the number
of OSDs present in the OSD tree. The expected count is updated (on a
per-host basis) when the number of OSDs grows, but not when it shrinks.
A charm action resets the expected count using information from
the OSD tree.

Closes-Bug: #1952985
Change-Id: Ia6a060bf151908c1d4159e6bdffa7bfe1f0a7988
This commit is contained in:
Edin Sarajlic 2021-12-17 08:45:02 +08:00 committed by Chris MacNaughton
parent fd9104907e
commit b8af44aefa
7 changed files with 434 additions and 1 deletions

View File

@ -443,3 +443,5 @@ delete-user:
required: [username]
pg-repair:
description: "Repair inconsistent placement groups, if safe to do so."
# Resets the expected per-host OSD report from the current OSD tree.
reset-osd-count-report:
description: "Update report of osds present in osd tree. Used for monitoring."

View File

@ -0,0 +1 @@
reset_osd_count_report.py

View File

@ -0,0 +1,28 @@
#!/usr/bin/env python3
#
# Copyright 2021 Canonical Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
sys.path.append("hooks")
from ceph_hooks import update_host_osd_count_report
def reset_osd_count_report():
    """Ask the charm hooks to update the host OSD count report in reset mode.

    Delegates to update_host_osd_count_report() with ``reset=True``.
    """
    update_host_osd_count_report(reset=True)
if __name__ == "__main__":
    # Executed directly as a script: perform the reset right away.
    reset_osd_count_report()

View File

@ -0,0 +1,121 @@
#!/usr/bin/env python3
# Copyright (C) 2021 Canonical
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import os
import sys
import time
# Exit statuses reported by this plugin; each value is also the index of
# its label in EXIT_CODE_TEXT.
EXIT_OK, EXIT_WARN, EXIT_CRIT, EXIT_UNKNOWN = range(4)
EXIT_CODE_TEXT = ["OK", "WARN", "CRITICAL", "UNKNOWN"]

# JSON file that get_osd_tree() parses to learn the current OSD tree.
CURRENT_OSD_COUNT_FILE = "/var/lib/nagios/current-ceph-osd-count.json"
class CriticalError(Exception):
    """Signal a critical failure detected while running a check."""
def check_file_freshness(filename, newer_than=3600):
    """Verify that a file exists, is readable and was modified recently.

    :param filename: Path of the file to check
    :type filename: str
    :param newer_than: Maximum accepted age in seconds, default 3600
    :type newer_than: int
    :raises CriticalError: If the file is missing, unreadable, older than
                           ``newer_than`` seconds, or carries a
                           modification time in the future
    """
    # The file has to be present and readable before its age matters.
    if not os.path.exists(filename):
        raise CriticalError("%s: does not exist." % (filename))
    if not os.access(filename, os.R_OK):
        raise CriticalError("%s: is not readable." % (filename))

    modified_at = os.stat(filename).st_mtime
    age = time.time() - modified_at
    if age > newer_than:
        raise CriticalError("%s: was last modified on %s and is too old "
                            "(> %s seconds)."
                            % (filename, time.ctime(modified_at), newer_than))
    # A negative age means the modification time lies ahead of the clock.
    if age < 0:
        raise CriticalError("%s: was last modified on %s which is in the "
                            "future."
                            % (filename, time.ctime(modified_at)))
def check_ceph_osd_count(host_osd_count_report):
    """Compare the expected per-host OSDs with the current OSD tree.

    A host passes when it currently has at least as many OSDs as expected,
    even if the ids differ. Hosts and OSDs that only exist in the current
    tree are ignored.

    :param host_osd_count_report: Path of the JSON report mapping each
                                  expected host to its list of OSD ids
    :type host_osd_count_report: str
    :returns: The exit code (EXIT_OK or EXIT_CRIT) and the problems found
    :rtype: Tuple[int, List[str]]
    """
    with open(host_osd_count_report, "r") as f:
        expected_osd_map = json.load(f)
    current_osd_map = get_osd_tree()

    exit_code = EXIT_OK
    err_msgs = []
    for host, expected_osds in expected_osd_map.items():
        if host not in current_osd_map:
            # A vanished host is a failure even when no OSD was expected
            # on it; otherwise the message would be reported with OK.
            err_msgs.append("Missing host {}".format(host))
            exit_code = EXIT_CRIT
        current_osds = current_osd_map.get(host, [])
        if len(expected_osds) <= len(current_osds):
            continue
        # Sorted so the same failure always produces the same message.
        missing_osds = sorted(set(expected_osds) - set(current_osds))
        if missing_osds:
            osd_ids = [str(osd) for osd in missing_osds]
            err_msgs.append("Missing osds on "
                            "{}: {}".format(host, ", ".join(osd_ids)))
            exit_code = EXIT_CRIT
    return (exit_code, err_msgs)
def get_osd_tree():
    """Build the host -> OSD ids map from CURRENT_OSD_COUNT_FILE.

    The file is first passed to check_file_freshness(); only nodes whose
    type is "host" are kept.

    :return: Map of each host name to the children listed for that host
    :rtype: Dict[str, list]
    """
    check_file_freshness(CURRENT_OSD_COUNT_FILE)
    with open(CURRENT_OSD_COUNT_FILE, "r") as tree_file:
        osd_tree = json.load(tree_file)

    hosts = {}
    for entry in osd_tree["nodes"]:
        if entry["type"] == "host":
            hosts[entry["name"]] = entry["children"]
    return hosts
if __name__ == "__main__":
    # Plugin entry point: argv[1] is the path of the expected host -> OSD
    # report. Every outcome is mapped to an explicit exit status because an
    # uncaught exception would end the process with status 1 (EXIT_WARN).
    if len(sys.argv) < 2:
        print("UNKNOWN: usage: {} <host-osd-report>".format(sys.argv[0]))
        sys.exit(EXIT_UNKNOWN)
    host_osd_report = sys.argv[1]
    if not os.path.isfile(host_osd_report):
        print("UNKNOWN: report file missing: {}".format(host_osd_report))
        sys.exit(EXIT_UNKNOWN)
    try:
        (exit_code, err_msgs) = check_ceph_osd_count(host_osd_report)
    except CriticalError as err:
        # Raised when the current OSD tree file is missing, unreadable,
        # stale or dated in the future.
        print("CRITICAL: {}".format(err))
        sys.exit(EXIT_CRIT)
    except ValueError as err:
        # json raises a ValueError subclass on malformed input.
        print("UNKNOWN: could not parse osd data: {}".format(err))
        sys.exit(EXIT_UNKNOWN)
    print("{} {}".format(EXIT_CODE_TEXT[exit_code],
                         ", ".join(err_msgs)))
    sys.exit(exit_code)

View File

@ -22,3 +22,12 @@ ceph status --format json >${TMP_FILE}
chown root:nagios ${TMP_FILE}
chmod 0640 ${TMP_FILE}
mv ${TMP_FILE} ${DATA_FILE}
# Snapshot the OSD tree as JSON. The output goes to a temporary file in
# DATA_DIR first and is then moved over the data file in a single step.
DATA_FILE="${DATA_DIR}/current-ceph-osd-count.json"
TMP_FILE=$(mktemp -p "${DATA_DIR}")
ceph osd tree --format json > "${TMP_FILE}"
chown root:nagios "${TMP_FILE}"
chmod 0640 "${TMP_FILE}"
mv "${TMP_FILE}" "${DATA_FILE}"

View File

@ -20,6 +20,7 @@ import os
import subprocess
import sys
import uuid
import pathlib
sys.path.append('lib')
import charms_ceph.utils as ceph
@ -109,9 +110,11 @@ from charmhelpers.contrib.hardening.harden import harden
hooks = Hooks()
# Filesystem locations used by the nagios integration.
NAGIOS_PLUGINS = '/usr/local/lib/nagios/plugins'
NAGIOS_FILE_FOLDER = '/var/lib/nagios'
SCRIPTS_DIR = '/usr/local/bin'
STATUS_FILE = '{}/cat-ceph-status.txt'.format(NAGIOS_FILE_FOLDER)
STATUS_CRONFILE = '/etc/cron.d/cat-ceph-health'
# Expected host -> OSD ids report kept by update_host_osd_count_report().
HOST_OSD_COUNT_REPORT = '{}/host-osd-report.json'.format(NAGIOS_FILE_FOLDER)
def check_for_upgrade():
@ -215,6 +218,44 @@ def emit_cephconf():
JOURNAL_ZAPPED = '/var/lib/ceph/journal_zapped'
def update_host_osd_count_report(reset=False):
    """Refresh the expected host -> OSD ids report used for monitoring.

    A host's entry is replaced by its current OSD list when the host is
    not in the report yet or currently has at least as many OSDs as
    recorded. Entries for hosts with fewer OSDs than recorded, and for
    hosts absent from the current tree, are left untouched.

    :param reset: Start from an empty report instead of the existing one
    :type reset: bool
    """
    # Group OSD ids by host: [CrushLocation,...] -> {<host>: [osdid],...}
    current_host_osd_map = {}
    for location in ceph.get_osd_tree('admin'):
        current_host_osd_map.setdefault(
            location.host, []).append(location.identifier)

    pathlib.Path(NAGIOS_FILE_FOLDER).mkdir(parents=True, exist_ok=True)
    # Seed an empty report when there is none yet or a reset was requested.
    if not os.path.isfile(HOST_OSD_COUNT_REPORT) or reset:
        write_file(HOST_OSD_COUNT_REPORT, '{}')
    with open(HOST_OSD_COUNT_REPORT, "r") as report:
        expected_host_osd_map = json.load(report)

    if current_host_osd_map == expected_host_osd_map:
        return

    for host, osd_ids in current_host_osd_map.items():
        # New host, grown osd list, or same count with different ids
        # (maybe hdd swap): the current list becomes the expected one.
        if (host not in expected_host_osd_map or
                len(osd_ids) >= len(expected_host_osd_map[host])):
            expected_host_osd_map[host] = osd_ids

    write_file(HOST_OSD_COUNT_REPORT,
               json.dumps(expected_host_osd_map))
@hooks.hook('config-changed')
@harden()
def config_changed():
@ -884,6 +925,9 @@ def osd_relation(relid=None, unit=None):
for relid in relation_ids('dashboard'):
dashboard_relation(relid)
if ready_for_service():
update_host_osd_count_report()
else:
log('mon cluster not in quorum - deferring fsid provision')
@ -1143,6 +1187,10 @@ def update_nrpe_config():
'check_ceph_status.py'),
os.path.join(NAGIOS_PLUGINS, 'check_ceph_status.py'))
rsync(os.path.join(os.getenv('CHARM_DIR'), 'files', 'nagios',
'check_ceph_osd_count.py'),
os.path.join(NAGIOS_PLUGINS, 'check_ceph_osd_count.py'))
script = os.path.join(SCRIPTS_DIR, 'collect_ceph_status.sh')
rsync(os.path.join(os.getenv('CHARM_DIR'), 'files',
'nagios', 'collect_ceph_status.sh'),
@ -1168,6 +1216,14 @@ def update_nrpe_config():
check_cmd=check_cmd
)
check_cmd = 'check_ceph_osd_count.py {} '.format(
HOST_OSD_COUNT_REPORT)
nrpe_setup.add_check(
shortname='ceph_osd_count',
description='Check if osd count matches expected count',
check_cmd=check_cmd
)
if config('nagios_additional_checks'):
additional_critical = config('nagios_additional_checks_critical')
x = ast.literal_eval(config('nagios_additional_checks'))

View File

@ -0,0 +1,216 @@
# Copyright 2021 Canonical Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys
import unittest
from unittest.mock import patch, mock_open
from src.ceph_hooks import update_host_osd_count_report
os.sys.path.insert(1, os.path.join(sys.path[0], 'lib'))
os.sys.path.insert(1, os.path.join(sys.path[0], 'files/nagios'))
import check_ceph_osd_count
from charms_ceph.utils import CrushLocation
class CheckCephOsdCountTestCase(unittest.TestCase):
    """Tests for the nagios OSD count check and the charm's OSD report."""

    def _check_exit_code(self, mock_get_osd_tree, current_osd_tree,
                         expected_osd_tree):
        """Run the nagios check against faked inputs.

        :param mock_get_osd_tree: Mock replacing the check's get_osd_tree
        :param current_osd_tree: Host -> OSD ids map returned by the mock
        :param expected_osd_tree: JSON text served as the expected report
        :returns: The exit code computed by check_ceph_osd_count
        """
        mock_get_osd_tree.return_value = current_osd_tree
        with patch(
            "check_ceph_osd_count.open",
            mock_open(read_data=expected_osd_tree),
        ) as file:
            (exit_code, _) = check_ceph_osd_count.check_ceph_osd_count(file)
        return exit_code

    def _update_report(self, mock_get_osd_tree, new_osd_tree,
                       expected_osd_tree):
        """Run update_host_osd_count_report against a faked report file.

        :param mock_get_osd_tree: Mock replacing charms_ceph's get_osd_tree
        :param new_osd_tree: CrushLocation list returned by the mock
        :param expected_osd_tree: JSON text served as the existing report
        """
        mock_get_osd_tree.return_value = new_osd_tree
        with patch(
            "src.ceph_hooks.open",
            mock_open(read_data=expected_osd_tree),
        ):
            update_host_osd_count_report()

    @patch("check_ceph_osd_count.get_osd_tree")
    def test_check_equal_ceph_osd_trees(self, mock_get_osd_tree):
        """Check that if current and expected osd trees match return OK exit"""
        exit_code = self._check_exit_code(
            mock_get_osd_tree, {"host1": [0]}, """{"host1": [0]}""")
        self.assertEqual(exit_code, check_ceph_osd_count.EXIT_OK)

        # change osd order
        exit_code = self._check_exit_code(
            mock_get_osd_tree, {"host1": [0, 1]}, """{"host1": [1, 0]}""")
        self.assertEqual(exit_code, check_ceph_osd_count.EXIT_OK)

    @patch("check_ceph_osd_count.get_osd_tree")
    def test_check_missing_expected_osd(self, mock_get_osd_tree):
        """Check that missing expected osd returns appropriate exit code."""
        exit_code = self._check_exit_code(
            mock_get_osd_tree, {"host1": [0]}, """{"host1": [0, 1]}""")
        self.assertEqual(exit_code, check_ceph_osd_count.EXIT_CRIT)

    @patch("check_ceph_osd_count.get_osd_tree")
    def test_check_missing_expected_host(self,
                                         mock_get_osd_tree):
        """Check that missing expected host returns appropriate exit code."""
        exit_code = self._check_exit_code(
            mock_get_osd_tree,
            {"host1": [0]},
            """{"host1": [0], "host2": [1]}""")
        self.assertEqual(exit_code, check_ceph_osd_count.EXIT_CRIT)

    @patch("check_ceph_osd_count.get_osd_tree")
    def test_check_change_osd_ids(self, mock_get_osd_tree):
        """Check that a change in osd ids (of same length) is OK."""
        exit_code = self._check_exit_code(
            mock_get_osd_tree,
            {"host1": [1], "host2": [3]},
            """{"host1": [0], "host2": [1]}""")
        self.assertEqual(exit_code, check_ceph_osd_count.EXIT_OK)

    @patch("check_ceph_osd_count.get_osd_tree")
    def test_osd_tree_current_gt_expected(self, mock_get_osd_tree):
        """Check that more osds and hosts than expected is still OK."""
        exit_code = self._check_exit_code(
            mock_get_osd_tree,
            {"host1": [0, 1], "host2": [2]},
            """{"host1": [0]}""")
        self.assertEqual(exit_code, check_ceph_osd_count.EXIT_OK)

    # NOTE: patch decorators are applied bottom-up, so the mocks arrive in
    # the order get_osd_tree, pathlib, write_file, json.dumps.

    @patch("json.dumps")
    @patch("src.ceph_hooks.write_file")
    @patch("src.ceph_hooks.pathlib")
    @patch("charms_ceph.utils.get_osd_tree")
    def test_update_report_fresh_tree(self,
                                      mock_get_osd_tree,
                                      mock_pathlib,
                                      mock_write_file,
                                      mock_json_dumps):
        """Check that an empty expected tree triggers an update to expected."""
        new_osd_tree = [CrushLocation(0, "osd.0", osd="osd.0", host="host1"),
                        CrushLocation(1, "osd.1", osd="osd.1", host="host1")]
        new_osd_dict = {"host1": [0, 1]}
        self._update_report(mock_get_osd_tree, new_osd_tree, "{}")
        mock_json_dumps.assert_called_with(new_osd_dict)

    @patch("json.dumps")
    @patch("src.ceph_hooks.write_file")
    @patch("src.ceph_hooks.pathlib")
    @patch("charms_ceph.utils.get_osd_tree")
    def test_update_report_new_host(self,
                                    mock_get_osd_tree,
                                    mock_pathlib,
                                    mock_write_file,
                                    mock_json_dumps):
        """Check that adding new host adds new host to expected tree."""
        new_osd_tree = [CrushLocation(0, "osd.0", osd="osd.0", host="host1"),
                        CrushLocation(1, "osd.1", osd="osd.1", host="host1"),
                        CrushLocation(2, "osd.2", osd="osd.2", host="host2")]
        self._update_report(
            mock_get_osd_tree, new_osd_tree, """{"host1": [0, 1]}""")
        mock_json_dumps.assert_called_with(
            {"host1": [0, 1], "host2": [2]})

    @patch("json.dumps")
    @patch("src.ceph_hooks.write_file")
    @patch("src.ceph_hooks.pathlib")
    @patch("charms_ceph.utils.get_osd_tree")
    def test_update_report_missing_host(self,
                                        mock_get_osd_tree,
                                        mock_pathlib,
                                        mock_write_file,
                                        mock_json_dumps):
        """Check that missing host is not removed from expected tree."""
        new_osd_tree = [CrushLocation(0, "osd.0", osd="osd.0", host="host1"),
                        CrushLocation(2, "osd.2", osd="osd.2", host="host1")]
        self._update_report(
            mock_get_osd_tree,
            new_osd_tree,
            """{"host1": [0], "host2": [1]}""")
        mock_json_dumps.assert_called_with(
            {"host1": [0, 2], "host2": [1]})

    @patch("json.dumps")
    @patch("src.ceph_hooks.write_file")
    @patch("src.ceph_hooks.pathlib")
    @patch("charms_ceph.utils.get_osd_tree")
    def test_update_report_fewer_osds(self,
                                      mock_get_osd_tree,
                                      mock_pathlib,
                                      mock_write_file,
                                      mock_json_dumps):
        """Check that report isn't updated when osd list shrinks."""
        new_osd_tree = [CrushLocation(0, "osd.0", osd="osd.0", host="host1")]
        self._update_report(
            mock_get_osd_tree, new_osd_tree, """{"host1": [0, 1]}""")
        mock_json_dumps.assert_called_with(
            {"host1": [0, 1]})

    @patch("json.dumps")
    @patch("src.ceph_hooks.write_file")
    @patch("src.ceph_hooks.pathlib")
    @patch("charms_ceph.utils.get_osd_tree")
    def test_update_report_diff_osd_ids(self,
                                        mock_get_osd_tree,
                                        mock_pathlib,
                                        mock_write_file,
                                        mock_json_dumps):
        """Check that new osdid list (of same length) becomes new expected."""
        new_osd_tree = [CrushLocation(2, "osd.2", osd="osd.2", host="host1"),
                        CrushLocation(3, "osd.3", osd="osd.3", host="host1")]
        self._update_report(
            mock_get_osd_tree, new_osd_tree, """{"host1": [0, 1]}""")
        mock_json_dumps.assert_called_with(
            {"host1": [2, 3]})