diff --git a/actions.yaml b/actions.yaml index 9655a527..2ba87287 100644 --- a/actions.yaml +++ b/actions.yaml @@ -443,3 +443,5 @@ delete-user: required: [username] pg-repair: description: "Repair inconsistent placement groups, if safe to do so." +reset-osd-count-report: + description: "Update report of osds present in osd tree. Used for monitoring." diff --git a/actions/reset-osd-count-report b/actions/reset-osd-count-report new file mode 120000 index 00000000..ce265d1e --- /dev/null +++ b/actions/reset-osd-count-report @@ -0,0 +1 @@ +reset_osd_count_report.py \ No newline at end of file diff --git a/actions/reset_osd_count_report.py b/actions/reset_osd_count_report.py new file mode 100755 index 00000000..0334c441 --- /dev/null +++ b/actions/reset_osd_count_report.py @@ -0,0 +1,28 @@ +#!/usr/bin/env python3 +# +# Copyright 2021 Canonical Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+import sys
+
+sys.path.append("hooks")
+from ceph_hooks import update_host_osd_count_report
+
+
+def reset_osd_count_report():
+    update_host_osd_count_report(reset=True)
+
+
+if __name__ == '__main__':
+    reset_osd_count_report()
diff --git a/files/nagios/check_ceph_osd_count.py b/files/nagios/check_ceph_osd_count.py
new file mode 100755
index 00000000..0703bfd7
--- /dev/null
+++ b/files/nagios/check_ceph_osd_count.py
@@ -0,0 +1,144 @@
+#!/usr/bin/env python3
+
+# Copyright (C) 2021 Canonical
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+import os
+import sys
+import time
+
+
+EXIT_OK = 0
+EXIT_WARN = 1
+EXIT_CRIT = 2
+EXIT_UNKNOWN = 3
+EXIT_CODE_TEXT = ["OK", "WARN", "CRITICAL", "UNKNOWN"]
+
+CURRENT_OSD_COUNT_FILE = "/var/lib/nagios/current-ceph-osd-count.json"
+
+
+class CriticalError(Exception):
+    """This indicates a critical error."""
+
+
+def check_file_freshness(filename, newer_than=3600):
+    """Check a file exists, is readable and is newer than seconds.
+
+    :param filename: The filename to check
+    :type filename: str
+    :param newer_than: The file should be newer than n seconds, default 3600
+    :type newer_than: int
+    :raises CriticalError: If file is not readable or older than seconds
+    """
+    # First check the file exists and is readable
+    if not os.path.exists(filename):
+        raise CriticalError("%s: does not exist." % (filename))
+    if not os.access(filename, os.R_OK):
+        raise CriticalError("%s: is not readable." % (filename))
+
+    # Then ensure the file is up-to-date enough
+    mtime = os.stat(filename).st_mtime
+    last_modified = time.time() - mtime
+    if last_modified > newer_than:
+        raise CriticalError("%s: was last modified on %s and is too old "
+                            "(> %s seconds)."
+                            % (filename, time.ctime(mtime), newer_than))
+    if last_modified < 0:
+        raise CriticalError("%s: was last modified on %s which is in the "
+                            "future."
+                            % (filename, time.ctime(mtime)))
+
+
+def check_ceph_osd_count(host_osd_count_report):
+    """Compare the expected host->osds report with the current osd tree.
+
+    :param host_osd_count_report: Path of the JSON report of expected osds
+    :type host_osd_count_report: str
+    :return: The nagios exit code and the list of error messages
+    :rtype: Tuple[int, List[str]]
+    :raises CriticalError: If the current osd tree file is missing or stale
+    """
+    with open(host_osd_count_report, "r") as f:
+        expected_osd_map = json.load(f)
+
+    current_osd_map = get_osd_tree()
+
+    exit_code = EXIT_OK
+    err_msgs = []
+    for host, osd_list in expected_osd_map.items():
+        current_osds = current_osd_map.get(host)
+        if current_osds is None:
+            # A host that vanished from the tree is always critical
+            err_msgs.append("Missing host {}".format(host))
+            exit_code = EXIT_CRIT
+            current_osds = []
+
+        # Same or higher osd count is fine: ids may change, e.g. disk swap
+        if len(osd_list) <= len(current_osds):
+            continue
+
+        # Sort so the alert text is stable between runs
+        missing_osds = sorted(set(osd_list) - set(current_osds))
+        if missing_osds:
+            osd_ids = [str(osd) for osd in missing_osds]
+            err_msgs.append("Missing osds on "
+                            "{}: {}".format(host,
+                                            ", ".join(osd_ids)))
+            exit_code = EXIT_CRIT
+
+    return (exit_code, err_msgs)
+
+
+def get_osd_tree():
+    """Read CURRENT_OSD_COUNT_FILE to get the host osd map.
+
+    :return: The map of host name to the osd ids below it.
+    :rtype: Dict[str, List[int]]
+    :raises CriticalError: If CURRENT_OSD_COUNT_FILE is missing or stale
+    """
+    check_file_freshness(CURRENT_OSD_COUNT_FILE)
+    with open(CURRENT_OSD_COUNT_FILE, "r") as f:
+        current_osd_counts = json.load(f)
+
+    host_osd_map = {}
+    for node in current_osd_counts["nodes"]:
+        if node["type"] != "host":
+            continue
+
+        host_osd_map[node["name"]] = node["children"]
+
+    return host_osd_map
+
+
+if __name__ == "__main__":
+    if len(sys.argv) < 2:
+        print("UNKNOWN: usage: {} REPORT_FILE".format(sys.argv[0]))
+        sys.exit(EXIT_UNKNOWN)
+
+    host_osd_report = sys.argv[1]
+    if not os.path.isfile(host_osd_report):
+        print("UNKNOWN: report file missing: {}".format(host_osd_report))
+        sys.exit(EXIT_UNKNOWN)
+
+    try:
+        (exit_code, err_msgs) = check_ceph_osd_count(host_osd_report)
+    except CriticalError as e:
+        # An uncaught exception would exit 1, which nagios reads as WARNING
+        print("CRITICAL: {}".format(e))
+        sys.exit(EXIT_CRIT)
+
+    print("{} {}".format(EXIT_CODE_TEXT[exit_code],
+                         ", ".join(err_msgs)))
+    sys.exit(exit_code)
diff --git a/files/nagios/collect_ceph_status.sh b/files/nagios/collect_ceph_status.sh
index a2e284e2..514c219c 100755
--- a/files/nagios/collect_ceph_status.sh
+++ b/files/nagios/collect_ceph_status.sh
@@ -22,3 +22,12 @@ ceph status --format json >${TMP_FILE}
 chown root:nagios ${TMP_FILE}
 chmod 0640 ${TMP_FILE}
 mv ${TMP_FILE} ${DATA_FILE}
+
+DATA_FILE="${DATA_DIR}/current-ceph-osd-count.json"
+TMP_FILE=$(mktemp -p ${DATA_DIR})
+
+ceph osd tree --format json > ${TMP_FILE}
+
+chown root:nagios ${TMP_FILE}
+chmod 0640 ${TMP_FILE}
+mv ${TMP_FILE} ${DATA_FILE}
diff --git a/src/ceph_hooks.py b/src/ceph_hooks.py
index 9b01164d..e365bcaa 100755
--- a/src/ceph_hooks.py
+++ b/src/ceph_hooks.py
@@ -20,6 +20,7 @@ import os
 import subprocess
 import sys
 import uuid
+import pathlib
 
 sys.path.append('lib')
 import charms_ceph.utils as ceph
@@ -109,9 +110,11 @@ from charmhelpers.contrib.hardening.harden import harden
 hooks = Hooks()
 
 NAGIOS_PLUGINS = '/usr/local/lib/nagios/plugins'
+NAGIOS_FILE_FOLDER = '/var/lib/nagios'
 SCRIPTS_DIR = '/usr/local/bin'
-STATUS_FILE = '/var/lib/nagios/cat-ceph-status.txt'
+STATUS_FILE = '{}/cat-ceph-status.txt'.format(NAGIOS_FILE_FOLDER)
 STATUS_CRONFILE = '/etc/cron.d/cat-ceph-health'
+HOST_OSD_COUNT_REPORT = '{}/host-osd-report.json'.format(NAGIOS_FILE_FOLDER)
 
 
 def check_for_upgrade():
@@ -215,6 +218,44 @@ def emit_cephconf():
 JOURNAL_ZAPPED = '/var/lib/ceph/journal_zapped'
 
 
+def update_host_osd_count_report(reset=False):
+    """Update report showing hosts->osds. Used for monitoring."""
+    current_osd_tree = ceph.get_osd_tree('admin')
+
+    # Convert [CrushLocation,...] -> {host: [osdid],...} for easy comparison
+    current_host_osd_map = {}
+    for osd in current_osd_tree:
+        osd_list = current_host_osd_map.get(osd.host, [])
+        osd_list.append(osd.identifier)
+        current_host_osd_map[osd.host] = osd_list
+
+    pathlib.Path(NAGIOS_FILE_FOLDER).mkdir(parents=True, exist_ok=True)
+    if not os.path.isfile(HOST_OSD_COUNT_REPORT) or reset:
+        write_file(HOST_OSD_COUNT_REPORT, '{}')
+
+    with open(HOST_OSD_COUNT_REPORT, "r") as f:
+        expected_host_osd_map = json.load(f)
+
+    if current_host_osd_map == expected_host_osd_map:
+        return
+
+    for host, osd_list in current_host_osd_map.items():
+        if host not in expected_host_osd_map:
+            expected_host_osd_map[host] = osd_list
+
+        if len(osd_list) > len(expected_host_osd_map[host]):
+            # osd list is growing, add them to the expected
+            expected_host_osd_map[host] = osd_list
+
+        if len(osd_list) == len(expected_host_osd_map[host]) and \
+                osd_list != expected_host_osd_map[host]:
+            # different osd ids, maybe hdd swap, refresh
+            expected_host_osd_map[host] = osd_list
+
+    write_file(HOST_OSD_COUNT_REPORT,
+               json.dumps(expected_host_osd_map))
+
+
 @hooks.hook('config-changed')
 @harden()
 def config_changed():
@@ -884,6 +925,9 @@ def osd_relation(relid=None, unit=None):
         for relid in relation_ids('dashboard'):
             dashboard_relation(relid)
 
+        if ready_for_service():
+            update_host_osd_count_report()
+
     else:
         log('mon cluster not in quorum - deferring fsid provision')
 
@@ -1143,6 +1187,10 @@ def update_nrpe_config():
                        'check_ceph_status.py'),
           os.path.join(NAGIOS_PLUGINS, 'check_ceph_status.py'))
 
+    rsync(os.path.join(os.getenv('CHARM_DIR'), 'files', 'nagios',
+                       'check_ceph_osd_count.py'),
+          os.path.join(NAGIOS_PLUGINS, 'check_ceph_osd_count.py'))
+
     script = os.path.join(SCRIPTS_DIR, 'collect_ceph_status.sh')
     rsync(os.path.join(os.getenv('CHARM_DIR'), 'files', 'nagios',
                        'collect_ceph_status.sh'),
@@ -1168,6 +1216,14 @@ def update_nrpe_config():
         check_cmd=check_cmd
     )
 
+    check_cmd = 'check_ceph_osd_count.py {} '.format(
+        HOST_OSD_COUNT_REPORT)
+    nrpe_setup.add_check(
+        shortname='ceph_osd_count',
+        description='Check if osd count matches expected count',
+        check_cmd=check_cmd
+    )
+
     if config('nagios_additional_checks'):
         additional_critical = config('nagios_additional_checks_critical')
         x = ast.literal_eval(config('nagios_additional_checks'))
diff --git a/unit_tests/test_check_ceph_osd_count.py b/unit_tests/test_check_ceph_osd_count.py
new file mode 100644
index 00000000..22aa382c
--- /dev/null
+++ b/unit_tests/test_check_ceph_osd_count.py
@@ -0,0 +1,216 @@
+# Copyright 2021 Canonical Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import sys
+import unittest
+
+from unittest.mock import patch, mock_open
+from src.ceph_hooks import update_host_osd_count_report
+
+os.sys.path.insert(1, os.path.join(sys.path[0], 'lib'))
+os.sys.path.insert(1, os.path.join(sys.path[0], 'files/nagios'))
+
+import check_ceph_osd_count
+
+from charms_ceph.utils import CrushLocation
+
+
+class CheckCephOsdCountTestCase(unittest.TestCase):
+
+    @patch("check_ceph_osd_count.get_osd_tree")
+    def test_check_equal_ceph_osd_trees(self, mock_get_osd_tree):
+        """Check that if current and expected osd trees match return OK exit"""
+
+        current_osd_tree = {"host1": [0]}
+        mock_get_osd_tree.return_value = current_osd_tree
+        expected_osd_tree = """{"host1": [0]}"""
+        with patch(
+                "check_ceph_osd_count.open",
+                mock_open(read_data=expected_osd_tree),
+        ) as file:
+            (exit_code, _) = check_ceph_osd_count.check_ceph_osd_count(file)
+            self.assertEqual(exit_code, check_ceph_osd_count.EXIT_OK)
+
+        # change osd order
+        current_osd_tree = {"host1": [0, 1]}
+        mock_get_osd_tree.return_value = current_osd_tree
+        expected_osd_tree = """{"host1": [1, 0]}"""
+        with patch(
+                "check_ceph_osd_count.open",
+                mock_open(read_data=expected_osd_tree),
+        ) as file:
+            (exit_code, _) = check_ceph_osd_count.check_ceph_osd_count(file)
+            self.assertEqual(exit_code, check_ceph_osd_count.EXIT_OK)
+
+    @patch("check_ceph_osd_count.get_osd_tree")
+    def test_check_missing_expected_osd(self, mock_get_osd_tree):
+        """Check that missing expected osd returns appropriate exit code."""
+        current_osd_tree = {"host1": [0]}
+        mock_get_osd_tree.return_value = current_osd_tree
+        expected_osd_tree = """{"host1": [0, 1]}"""
+        with patch(
+                "check_ceph_osd_count.open",
+                mock_open(read_data=expected_osd_tree),
+        ) as file:
+
+            (exit_code, _) = check_ceph_osd_count.check_ceph_osd_count(file)
+            self.assertEqual(exit_code, check_ceph_osd_count.EXIT_CRIT)
+
+    @patch("check_ceph_osd_count.get_osd_tree")
+    def test_check_missing_expected_host(self,
+                                         mock_get_osd_tree):
+        """Check that missing expected host returns appropriate exit code."""
+        current_osd_tree = {"host1": [0]}
+        mock_get_osd_tree.return_value = current_osd_tree
+        expected_osd_tree = """{"host1": [0], "host2": [1]}"""
+        with patch(
+                "check_ceph_osd_count.open",
+                mock_open(read_data=expected_osd_tree),
+        ) as file:
+
+            (exit_code, _) = check_ceph_osd_count.check_ceph_osd_count(file)
+            self.assertEqual(exit_code, check_ceph_osd_count.EXIT_CRIT)
+
+    @patch("check_ceph_osd_count.get_osd_tree")
+    def test_check_change_osd_ids(self, mock_get_osd_tree):
+        """Check that a change in osd ids (of same length) is OK."""
+        current_osd_tree = {"host1": [1], "host2": [3]}
+        mock_get_osd_tree.return_value = current_osd_tree
+        expected_osd_tree = """{"host1": [0], "host2": [1]}"""
+        with patch(
+                "check_ceph_osd_count.open",
+                mock_open(read_data=expected_osd_tree),
+        ) as file:
+            (exit_code, _) = check_ceph_osd_count.check_ceph_osd_count(file)
+            self.assertEqual(exit_code, check_ceph_osd_count.EXIT_OK)
+
+    @patch("check_ceph_osd_count.get_osd_tree")
+    def test_osd_tree_current_gt_expected(self, mock_get_osd_tree):
+        """Check that osds and hosts beyond the expected tree are OK."""
+        current_osd_tree = {"host1": [0, 1], "host2": [2]}
+        mock_get_osd_tree.return_value = current_osd_tree
+        expected_osd_tree = """{"host1": [0]}"""
+        with patch(
+                "check_ceph_osd_count.open",
+                mock_open(read_data=expected_osd_tree),
+        ) as file:
+            (exit_code, _) = check_ceph_osd_count.check_ceph_osd_count(file)
+            self.assertEqual(exit_code, check_ceph_osd_count.EXIT_OK)
+
+    @patch("json.dumps")
+    @patch("src.ceph_hooks.write_file")
+    @patch("src.ceph_hooks.pathlib")
+    @patch("charms_ceph.utils.get_osd_tree")
+    def test_update_report_fresh_tree(self,
+                                      mock_get_osd_tree,
+                                      mock_pathlib,
+                                      mock_write_file,
+                                      mock_json_dumps):
+        """Check that an empty expected tree triggers an update to expected."""
+        new_osd_tree = [CrushLocation(0, "osd.0", osd="osd.0", host="host1"),
+                        CrushLocation(1, "osd.1", osd="osd.1", host="host1")]
+        new_osd_dict = {"host1": [0, 1]}
+        mock_get_osd_tree.return_value = new_osd_tree
+
+        with patch(
+                "src.ceph_hooks.open",
+                mock_open(read_data="{}"),
+        ):
+            update_host_osd_count_report()
+            mock_json_dumps.assert_called_with(new_osd_dict)
+
+    @patch("json.dumps")
+    @patch("src.ceph_hooks.write_file")
+    @patch("src.ceph_hooks.pathlib")
+    @patch("charms_ceph.utils.get_osd_tree")
+    def test_update_report_new_host(self,
+                                    mock_get_osd_tree,
+                                    mock_pathlib,
+                                    mock_write_file,
+                                    mock_json_dumps):
+        """Check that adding new host adds new host to expected tree."""
+        new_osd_tree = [CrushLocation(0, "osd.0", osd="osd.0", host="host1"),
+                        CrushLocation(1, "osd.1", osd="osd.1", host="host1"),
+                        CrushLocation(2, "osd.2", osd="osd.2", host="host2")]
+        mock_get_osd_tree.return_value = new_osd_tree
+        with patch(
+                "src.ceph_hooks.open",
+                mock_open(read_data="""{"host1": [0, 1]}"""),
+        ):
+            update_host_osd_count_report()
+            mock_json_dumps.assert_called_with(
+                {"host1": [0, 1], "host2": [2]})
+
+    @patch("json.dumps")
+    @patch("src.ceph_hooks.write_file")
+    @patch("src.ceph_hooks.pathlib")
+    @patch("charms_ceph.utils.get_osd_tree")
+    def test_update_report_missing_host(self,
+                                        mock_get_osd_tree,
+                                        mock_pathlib,
+                                        mock_write_file,
+                                        mock_json_dumps):
+        """Check that missing host is not removed from expected tree."""
+        new_osd_tree = [CrushLocation(0, "osd.0", osd="osd.0", host="host1"),
+                        CrushLocation(2, "osd.2", osd="osd.2", host="host1")]
+        mock_get_osd_tree.return_value = new_osd_tree
+        with patch(
+                "src.ceph_hooks.open",
+                mock_open(read_data="""{"host1": [0], "host2": [1]}"""),
+        ):
+            update_host_osd_count_report()
+            mock_json_dumps.assert_called_with(
+                {"host1": [0, 2], "host2": [1]})
+
+    @patch("json.dumps")
+    @patch("src.ceph_hooks.write_file")
+    @patch("src.ceph_hooks.pathlib")
+    @patch("charms_ceph.utils.get_osd_tree")
+    def test_update_report_fewer_osds(self,
+                                      mock_get_osd_tree,
+                                      mock_pathlib,
+                                      mock_write_file,
+                                      mock_json_dumps):
+        """Check that report isn't updated when osd list shrinks."""
+        new_osd_tree = [CrushLocation(0, "osd.0", osd="osd.0", host="host1")]
+        mock_get_osd_tree.return_value = new_osd_tree
+        with patch(
+                "src.ceph_hooks.open",
+                mock_open(read_data="""{"host1": [0, 1]}"""),
+        ):
+            update_host_osd_count_report()
+            mock_json_dumps.assert_called_with(
+                {"host1": [0, 1]})
+
+    @patch("json.dumps")
+    @patch("src.ceph_hooks.write_file")
+    @patch("src.ceph_hooks.pathlib")
+    @patch("charms_ceph.utils.get_osd_tree")
+    def test_update_report_diff_osd_ids(self,
+                                        mock_get_osd_tree,
+                                        mock_pathlib,
+                                        mock_write_file,
+                                        mock_json_dumps):
+        """Check that new osdid list (of same length) becomes new expected."""
+        new_osd_tree = [CrushLocation(2, "osd.2", osd="osd.2", host="host1"),
+                        CrushLocation(3, "osd.3", osd="osd.3", host="host1")]
+        mock_get_osd_tree.return_value = new_osd_tree
+        with patch(
+                "src.ceph_hooks.open",
+                mock_open(read_data="""{"host1": [0, 1]}"""),
+        ):
+            update_host_osd_count_report()
+            mock_json_dumps.assert_called_with(
+                {"host1": [2, 3]})