From 11ee1ef563e3e7a3213fce6ef6151fe5283d0161 Mon Sep 17 00:00:00 2001 From: Robert Gildein Date: Wed, 4 Nov 2020 18:02:57 +0100 Subject: [PATCH] NRPE: Monitor threads connected to MySQL. Add a NRPE check to monitor the number of threads connected to the MySQL database, in proportion to the maximum number of connections. For this check, a nagios user will be created. This user does not have any permissions set, does not have access to any database and can only connect from localhost. Warning and Critical thresholds (in percentage) can be configured. Add an action to reset nagios's password. This action could only be run on the leader unit. Closes-Bug: #1816759 Change-Id: Id35b0331322c2744a9f839b3eb153eed1bc53aac --- README.md | 8 +++ actions.yaml | 5 ++ actions/actions.py | 14 ++++- actions/generate-nagios-password | 1 + config.yaml | 7 +++ hooks/percona_hooks.py | 33 ++++++++++- hooks/percona_utils.py | 94 ++++++++++++++++++++++++++++++++ templates/nagios-my.cnf | 3 + unit_tests/test_actions.py | 32 +++++++++++ unit_tests/test_percona_hooks.py | 36 +++++++++++- unit_tests/test_percona_utils.py | 69 +++++++++++++++++++++++ 11 files changed, 296 insertions(+), 6 deletions(-) create mode 120000 actions/generate-nagios-password create mode 100644 templates/nagios-my.cnf diff --git a/README.md b/README.md index 9cf5c36..542d173 100644 --- a/README.md +++ b/README.md @@ -38,6 +38,14 @@ to form its cluster. It is best practice to use this option as doing so ensures that the charm will wait until the cluster is up before accepting relations from other client applications. +#### `nrpe-threads-connected` + +The `nrpe-threads-connected` option sets Warning and Critical thresholds (in percent) +for NRPE check to monitor the number of threads connecting to the MySQL. +If the `nrpe-external-master` relationship is set, a nagios user who does +not have permission and can only connect from localhost is created before +the check is created. + ## Deployment To deploy a single percona-cluster unit: diff --git a/actions.yaml b/actions.yaml index fa993e6..cc1f7a8 100644 --- a/actions.yaml +++ b/actions.yaml @@ -64,3 +64,8 @@ mysqldump: description: | Comma delimited database names to dump. If left unset, all databases will be dumped. +generate-nagios-password: + description: | + Re-generate the password for the nagios user. This action can only be run on the leader unit, + which sets a new password and then the `leader-settings-changed` hook will be activate. + diff --git a/actions/actions.py b/actions/actions.py index 83280a6..13dc6c9 100755 --- a/actions/actions.py +++ b/actions/actions.py @@ -36,6 +36,7 @@ from charmhelpers.core.hookenv import ( from charmhelpers.core.host import ( CompareHostReleases, lsb_release, + pwgen, ) from charmhelpers.contrib.openstack.utils import ( @@ -223,6 +224,16 @@ def mysqldump(args): action_fail("mysqldump failed") +def generate_nagios_password(args): + """Regenerate nagios password.""" + if is_leader(): + leader_set({"nagios-password": pwgen()}) + percona_utils.set_nagios_user() + action_set({"output": "New password for nagios created successfully."}) + else: + action_fail("This action should only take place on the leader unit.") + + # A dictionary of all the defined actions to callables (which take # parsed arguments). ACTIONS = {"pause": pause, "resume": resume, "backup": backup, @@ -230,7 +241,8 @@ ACTIONS = {"pause": pause, "resume": resume, "backup": backup, "bootstrap-pxc": bootstrap_pxc, "notify-bootstrapped": notify_bootstrapped, "set-pxc-strict-mode": set_pxc_strict_mode, - "mysqldump": mysqldump} + "mysqldump": mysqldump, + "generate-nagios-password": generate_nagios_password} def main(args): diff --git a/actions/generate-nagios-password b/actions/generate-nagios-password new file mode 120000 index 0000000..405a394 --- /dev/null +++ b/actions/generate-nagios-password @@ -0,0 +1 @@ +actions.py \ No newline at end of file diff --git a/config.yaml b/config.yaml index 647cf0d..a055b6f 100644 --- a/config.yaml +++ b/config.yaml @@ -358,3 +358,10 @@ options: control message will be sent. The fc_limit defaults to 16 transactions. This effectively means that this is as far as a given node can be behind committing transactions from the cluster. + nrpe-threads-connected: + type: string + default: '80,90' + description: | + This configuration option represents the warning and critical percentages + that are used to check the number of threads connected to MySQL. + The value should be written as a string containing two numbers separated by commas. diff --git a/hooks/percona_hooks.py b/hooks/percona_hooks.py index 99e0b4d..50499ac 100755 --- a/hooks/percona_hooks.py +++ b/hooks/percona_hooks.py @@ -34,6 +34,7 @@ from charmhelpers.core.hookenv import ( DEBUG, INFO, WARNING, + ERROR, is_leader, network_get_primary_address, leader_get, @@ -57,6 +58,7 @@ from charmhelpers.fetch import ( apt_install, add_source, SourceConfigError, + filter_installed_packages, ) from charmhelpers.contrib.peerstorage import ( peer_echo, @@ -151,6 +153,9 @@ from percona_utils import ( delete_replication_user, list_replication_users, check_mysql_connection, + set_nagios_user, + get_nrpe_threads_connected_thresholds, + MYSQL_NAGIOS_CREDENTIAL_FILE, update_source, ADD_APT_REPOSITORY_FAILED, ) @@ -1035,16 +1040,40 @@ def leader_elected(): 'nrpe-external-master-relation-changed') def update_nrpe_config(): # python-dbus is used by check_upstart_job - apt_install('python-dbus') + # nagios-plugins-contrib add pmp-check-mysql-status check + packages = filter_installed_packages(["python-dbus", + "nagios-plugins-contrib"]) + apt_install(packages) + hostname = nrpe.get_nagios_hostname() current_unit = nrpe.get_nagios_unit_name() nrpe_setup = nrpe.NRPE(hostname=hostname) nrpe.add_init_service_checks(nrpe_setup, ['mysql'], current_unit) nrpe_setup.add_check( shortname='mysql_proc', - description='Check MySQL process {%s}' % current_unit, + description='Check MySQL process {}'.format(current_unit), check_cmd='check_procs -c 1:1 -C mysqld' ) + try: + warning_threads, critical_threads = \ + get_nrpe_threads_connected_thresholds() + except ValueError as error: + log("failed to get thresholds from nrpe-threads-connected due: " + "{}".format(error), level=ERROR) + log("the default thresholds are used") + warning_threads, critical_threads = 80, 90 + + set_nagios_user() + nrpe_setup.add_check( + shortname='mysql_threads', + description='Check MySQL connected threads', + check_cmd='pmp-check-mysql-status --defaults-file {credential_file} ' + '-x Threads_connected -o / -y max_connections -T pct ' + '-w {warning} -c {critical}'.format( + credential_file=MYSQL_NAGIOS_CREDENTIAL_FILE, + warning=warning_threads, + critical=critical_threads) + ) nrpe_setup.write() diff --git a/hooks/percona_utils.py b/hooks/percona_utils.py index e5927e6..c21bc14 100644 --- a/hooks/percona_utils.py +++ b/hooks/percona_utils.py @@ -97,6 +97,7 @@ ADD_APT_REPOSITORY_FAILED = "add-apt-repository-failed" # maintenance mode, but is currently not populated as the # charm_check_function() checks whether the unit is working properly. REQUIRED_INTERFACES = {} +MYSQL_NAGIOS_CREDENTIAL_FILE = "/etc/nagios/mysql-check.cnf" class LeaderNoBootstrapUUIDError(Exception): @@ -1053,6 +1054,8 @@ root_password = partial(_get_password, 'root-password') sst_password = partial(_get_password, 'sst-password') +nagios_password = partial(_get_password, 'nagios-password') + def pxc_installed(): '''Determine whether percona-xtradb-cluster is installed @@ -1744,3 +1747,94 @@ def update_source(source, key=None): else: apt_update() # run without retries log("apt update after adding a new package source") + + +def get_nrpe_threads_connected_thresholds(): + """This function get and verifies threshold values. + + The function confirms these conditions: + Thresholds for `nrpe-threads-connected` should be separable by `,` and + represented by an integer, where the warning threshold should be + in the range [0, 100) and the critical threshold in the range (0, 100). + At the same time, the warning threshold should be lower than + the critical one. + + :returns: warning threshold, critical threshold + :rtype: Tuple[int, int] + :raises: ValueError + """ + values = config("nrpe-threads-connected").split(",") + + if len(values) != 2: + raise ValueError("the wrong number of values was set for " + "the nrpe-threads-connected") + + try: + warning_threshold, critical_threshold = int(values[0]), int(values[1]) + except ValueError as error: + raise error + + if not (0 <= warning_threshold < 100 and 0 < critical_threshold <= 100): + raise ValueError("the warning threshold must be in the range [0,100) " + "and the critical threshold must be in the range " + "(0,100]") + + if warning_threshold >= critical_threshold: + raise ValueError("the warning threshold must be less than critical") + + return warning_threshold, critical_threshold + + +def write_nagios_my_cnf(): + """Create a MySQL configuration file with nagios user credentials.""" + context = { + "mysql_user": "nagios", "mysql_passwd": nagios_password(), + } + render("nagios-my.cnf", MYSQL_NAGIOS_CREDENTIAL_FILE, context, + owner="nagios", group="nagios", perms=0o640) + + +def create_nagios_user(): + """Create MySQL user for nagios to check the status of MySQL. + + This user does not have any permissions set, does not have access + to any database and can only connect from localhost. + + :raises: OperationalError + """ + m_helper = get_db_helper() + try: + m_helper.connect(password=m_helper.get_mysql_root_password()) + except OperationalError: + log("Could not connect to db", level=ERROR) + raise + # NOTE (rgildein): This user has access to the SHOW GLOBAL STATUS + # statement, which does not require any privilege, only the ability to + # connect to the server. + nagios_exists = False + try: + # NOTE (rgildein): This method of creation nagios user should be + # replaced by the `MySQLHelper.create_user` method after + # bug #1905586 has been fixed and merged. + nagios_exists = m_helper.select( + "SELECT EXISTS(SELECT 1 FROM mysql.user WHERE user = 'nagios')" + ) + m_helper.execute("CREATE USER 'nagios'@'localhost';") + except OperationalError as error: + if not nagios_exists: + log("Creating user nagios failed due: {}".format(error), + level="ERROR") + raise error + else: + log("User 'nagios'@'localhost' already exists.", level="WARNING") + # NOTE (rgildein): Update the user's password if it has changed. + m_helper.execute("ALTER USER 'nagios'@'localhost' IDENTIFIED BY " + "'{passwd}';".format(passwd=nagios_password())) + + +def set_nagios_user(): + """Create MySQL user and configuration file with user credentials.""" + if is_leader(): + create_nagios_user() + + write_nagios_my_cnf() diff --git a/templates/nagios-my.cnf b/templates/nagios-my.cnf new file mode 100644 index 0000000..064807a --- /dev/null +++ b/templates/nagios-my.cnf @@ -0,0 +1,3 @@ +[client] +user="{{ mysql_user }}" +password="{{ mysql_passwd }}" diff --git a/unit_tests/test_actions.py b/unit_tests/test_actions.py index 987ffc5..c25cdea 100644 --- a/unit_tests/test_actions.py +++ b/unit_tests/test_actions.py @@ -105,3 +105,35 @@ class MainTestCase(CharmTestCase): with mock.patch.dict(actions.ACTIONS, {"foo": dummy_action}): actions.main(["foo"]) self.assertEqual(dummy_calls, ["Action foo failed: uh oh"]) + + +class NagiosTestCase(CharmTestCase): + + def setUp(self): + super(NagiosTestCase, self).setUp(actions, + ["action_set", + "action_fail", + "is_leader", + "leader_set", + "pwgen", + ]) + + @patch.object(actions.percona_utils, "set_nagios_user") + def test_generate_nagios_password(self, mock_set_nagios_user): + """Test regenerate new password for nagios user.""" + self.is_leader.return_value = True + self.pwgen.return_value = "1234" + actions.generate_nagios_password([]) + self.leader_set.assert_called_once_with({"nagios-password": "1234"}) + mock_set_nagios_user.assert_called_once_with() + self.action_set.assert_called_once_with( + {"output": "New password for nagios created successfully."} + ) + + def test_generate_nagios_password_no_leader(self): + """Test regenerate new password for nagios user at no leader unit.""" + self.is_leader.return_value = False + actions.generate_nagios_password([]) + self.action_fail.assert_called_once_with( + "This action should only take place on the leader unit." + ) diff --git a/unit_tests/test_percona_hooks.py b/unit_tests/test_percona_hooks.py index e5bf83a..176304c 100644 --- a/unit_tests/test_percona_hooks.py +++ b/unit_tests/test_percona_hooks.py @@ -266,15 +266,45 @@ class TestNRPERelation(CharmTestCase): def setUp(self): patch_targets_nrpe = TO_PATCH[:] patch_targets_nrpe.remove("update_nrpe_config") - patch_targets_nrpe.append("nrpe") - patch_targets_nrpe.append("apt_install") + patch_targets_nrpe.extend(["nrpe", + "apt_install", + "config", + "set_nagios_user"]) CharmTestCase.setUp(self, hooks, patch_targets_nrpe) - def test_mysql_monitored(self): + @mock.patch("percona_utils.config") + @mock.patch("percona_utils.is_leader") + @mock.patch("percona_utils.leader_get") + def test_mysql_monitored(self, + mock_leader_get, + mock_is_leader, + mock_config): """The mysql service is monitored by Nagios.""" + self.nrpe.get_nagios_unit_name.return_value = "nagios-0" + self.test_config.set("nrpe-threads-connected", "80,90") + mock_config.side_effect = self.test_config.get + mock_is_leader.return_value = True + mock_leader_get.return_value = "1234" + nrpe_setup = mock.MagicMock() + nrpe_setup.add_check = mock.MagicMock() + self.nrpe.NRPE.return_value = nrpe_setup + hooks.update_nrpe_config() self.nrpe.add_init_service_checks.assert_called_once_with( mock.ANY, ["mysql"], mock.ANY) + nrpe_setup.add_check.assert_has_calls([ + mock.call( + shortname="mysql_proc", + description="Check MySQL process nagios-0", + check_cmd="check_procs -c 1:1 -C mysqld"), + mock.call( + shortname="mysql_threads", + description="Check MySQL connected threads", + check_cmd="pmp-check-mysql-status " + "--defaults-file /etc/nagios/mysql-check.cnf " + "-x Threads_connected -o / -y max_connections " + "-T pct -w 80 -c 90") + ]) class TestMasterRelation(CharmTestCase): diff --git a/unit_tests/test_percona_utils.py b/unit_tests/test_percona_utils.py index dd0f1ae..9443150 100644 --- a/unit_tests/test_percona_utils.py +++ b/unit_tests/test_percona_utils.py @@ -17,6 +17,7 @@ class UtilsTests(CharmTestCase): TO_PATCH = [ 'config', 'kv', + 'is_leader', 'leader_get', 'log', 'relation_ids', @@ -538,6 +539,74 @@ class UtilsTests(CharmTestCase): mock_apt_update.assert_not_called() + @mock.patch.object(percona_utils, "get_db_helper") + @mock.patch.object(percona_utils, "write_nagios_my_cnf") + def test_create_nagios_user(self, + mock_create_nagios_mysql_credential, + mock_get_db_helper): + my_mock = mock.Mock() + self.leader_get.return_value = "1234" + self.is_leader.return_value = True + mock_get_db_helper.return_value = my_mock + + percona_utils.create_nagios_user() + my_mock.select.assert_called_once_with( + "SELECT EXISTS(SELECT 1 FROM mysql.user WHERE user = 'nagios')" + ) + my_mock.execute.assert_has_calls([ + mock.call("CREATE USER 'nagios'@'localhost';"), + mock.call("ALTER USER 'nagios'@'localhost' IDENTIFIED BY '1234';"), + ]) + + class OperationalError(Exception): + pass + + percona_utils.OperationalError = OperationalError + + def mysql_create_user(*args, **kwargs): + raise OperationalError() + + my_mock.select.return_value = True + my_mock.execute.side_effect = mysql_create_user + with self.assertRaises(OperationalError): + percona_utils.create_nagios_user() + + def test_get_nrpe_threads_connected_thresholds(self): + """Test function for getting and verifying threshold values.""" + self.config.return_value = "a,1,2" + with self.assertRaises(ValueError) as context: + percona_utils.get_nrpe_threads_connected_thresholds() + self.assertEqual(ValueError("the wrong number of values was set " + "for the nrpe-threads-connected"), + context.exception) + + self.config.return_value = "a,1" + with self.assertRaises(ValueError) as context: + percona_utils.get_nrpe_threads_connected_thresholds() + self.assertEqual( + ValueError("invalid literal for int() with base 10: 'a'"), + context.exception) + + self.config.return_value = "50,200" + with self.assertRaises(ValueError) as context: + percona_utils.get_nrpe_threads_connected_thresholds() + self.assertEqual(ValueError("the warning threshold must be in the " + "range [0,100) and the critical " + "threshold must be in the range " + "(0,100]"), + context.exception) + + self.config.return_value = "90,60" + with self.assertRaises(ValueError) as context: + percona_utils.get_nrpe_threads_connected_thresholds() + self.assertEqual(ValueError("the warning threshold must be less " + "than critical"), + context.exception) + + self.config.return_value = "80,90" + thresholds = percona_utils.get_nrpe_threads_connected_thresholds() + self.assertEqual(thresholds, (80, 90)) + class UtilsTestsStatus(CharmTestCase):