From fda5176bd53f17a69f3e22b6b363bff96ff565c0 Mon Sep 17 00:00:00 2001
From: David Ames
Date: Wed, 11 Jan 2017 16:00:39 -0800
Subject: [PATCH] Fix pacemaker down crm infinite loop

On restart, corosync may take longer than a minute to come up, but the
systemd start script times out too soon. Pacemaker, which depends on
corosync, is then started immediately and fails because corosync is
still in the process of starting. The charm subsequently runs
"crm node list" to validate pacemaker, which becomes an infinite loop.

This change adds longer, configurable timeout values for the systemd
units and adds better error handling and communication to the end user.

Change-Id: I7c3d018a03fddfb1f6bfd91fd7aeed4b13879e45
Partial-Bug: #1654403
---
 README.md                          |  10 +++
 config.yaml                        |  15 +++-
 hooks/hooks.py                     |   5 +-
 hooks/pcmk.py                      |  16 +++-
 hooks/utils.py                     | 120 ++++++++++++++++++++++++-
 templates/systemd-overrides.conf   |   3 +
 tests/basic_deployment.py          |   8 +-
 unit_tests/test_hacluster_utils.py | 138 ++++++++++++++++++++++++++++-
 unit_tests/test_pcmk.py            |  14 +++
 9 files changed, 317 insertions(+), 12 deletions(-)
 create mode 100644 templates/systemd-overrides.conf

diff --git a/README.md b/README.md
index c98834b..07b45d1 100644
--- a/README.md
+++ b/README.md
@@ -29,6 +29,16 @@ To enable HA clustering support (for mysql for example):
 The principal charm must have explicit support for the hacluster interface
 in order for clustering to occur - otherwise nothing actually gets configured.
 
+# Settings
+
+It is best practice to set cluster_count to the number of expected units in the
+cluster. The charm will build the cluster without this setting; however, race
+conditions may occur in which one node is not yet aware of the total number of
+relations to other hacluster units, which can prevent the corosync and
+pacemaker services from completing startup.
+
+Setting cluster_count helps guarantee the hacluster charm waits until all
+expected peer relations are available before building the corosync cluster.
 
 # HA/Clustering
 
diff --git a/config.yaml b/config.yaml
index 7d94134..b788f03 100644
--- a/config.yaml
+++ b/config.yaml
@@ -53,6 +53,18 @@ options:
       .
       This configuration element is mandatory and the service will fail on
       install if it is not provided. The value must be base64 encoded.
+  service_start_timeout:
+    type: int
+    default: 180
+    description: |
+      Systemd override value for the corosync and pacemaker service start timeout, in seconds.
+      Set the value to -1 to turn off the timeout for the services.
+  service_stop_timeout:
+    type: int
+    default: 60
+    description: |
+      Systemd override value for the corosync and pacemaker service stop timeout, in seconds.
+      Set the value to -1 to turn off the timeout for the services.
   stonith_enabled:
     type: string
     default: 'False'
@@ -89,7 +101,8 @@
       If less than 3 is specified, the cluster will be configured to
      ignore any quorum problems; with 3 or more units, quorum will be
       enforced and services will be stopped in the event of a loss
-      of quorum.
+      of quorum. It is best practice to set this value to the expected
+      number of units to avoid potential race conditions.
   monitor_host:
     type: string
     default:
diff --git a/hooks/hooks.py b/hooks/hooks.py
index be89450..845933c 100755
--- a/hooks/hooks.py
+++ b/hooks/hooks.py
@@ -72,6 +72,7 @@ from utils import (
     set_unit_status,
     ocf_file_exists,
     kill_legacy_ocf_daemon_process,
+    try_pcmk_wait,
 )
 
 from charmhelpers.contrib.charmsupport import nrpe
@@ -152,7 +153,7 @@ def config_changed():
 
     status_set('maintenance', "Setting up corosync")
     if configure_corosync():
-        pcmk.wait_for_pcmk()
+        try_pcmk_wait()
         configure_cluster_global()
         configure_monitor_host()
         configure_stonith()
@@ -257,7 +258,7 @@ def ha_relation_changed():
         # NOTE: this should be removed in 15.04 cycle as corosync
         # configuration should be set directly on subordinate
         configure_corosync()
-        pcmk.wait_for_pcmk()
+        try_pcmk_wait()
         configure_cluster_global()
         configure_monitor_host()
         configure_stonith()
diff --git a/hooks/pcmk.py b/hooks/pcmk.py
index ea4667f..230eb54 100644
--- a/hooks/pcmk.py
+++ b/hooks/pcmk.py
@@ -15,6 +15,7 @@
 import commands
 import subprocess
 import socket
+import time
 
 from charmhelpers.core.hookenv import (
     log,
@@ -22,12 +23,23 @@ from charmhelpers.core.hookenv import (
 )
 
 
-def wait_for_pcmk():
+class ServicesNotUp(Exception):
+    pass
+
+
+def wait_for_pcmk(retries=12, sleep=10):
     crm_up = None
     hostname = socket.gethostname()
-    while not crm_up:
+    for i in range(retries):
+        if crm_up:
+            return True
         output = commands.getstatusoutput("crm node list")[1]
         crm_up = hostname in output
+        time.sleep(sleep)
+    if not crm_up:
+        raise ServicesNotUp("Pacemaker or Corosync are still down after "
+                            "waiting for {} retries. Last output: {}"
+                            "".format(retries, output))
 
 
 def commit(cmd):
diff --git a/hooks/utils.py b/hooks/utils.py
index 9064207..54ea17d 100644
--- a/hooks/utils.py
+++ b/hooks/utils.py
@@ -31,6 +31,7 @@ from charmhelpers.core.hookenv import (
     local_unit,
     log,
     DEBUG,
+    ERROR,
     INFO,
     WARNING,
     relation_get,
@@ -54,11 +55,11 @@ from charmhelpers.core.host import (
     rsync,
     service_start,
     service_stop,
-    service_restart,
     service_running,
     write_file,
     file_hash,
-    lsb_release
+    lsb_release,
+    init_is_systemd,
 )
 from charmhelpers.fetch import (
     apt_install,
@@ -104,6 +105,9 @@ COROSYNC_CONF_FILES = [
 ]
 SUPPORTED_TRANSPORTS = ['udp', 'udpu', 'multicast', 'unicast']
 
+SYSTEMD_OVERRIDES_DIR = '/etc/systemd/system/{}.service.d'
+SYSTEMD_OVERRIDES_FILE = '{}/overrides.conf'
+
 
 class MAASConfigIncomplete(Exception):
     pass
@@ -287,6 +291,41 @@ def get_corosync_conf():
     return None
 
 
+def emit_systemd_overrides_file():
+    """Generate the systemd overrides file
+    With Start and Stop timeout values
+    Note: (David Ames) Bug#1654403 Work around
+    May be removed if bug is resolved
+    If timeout value is set to -1 pass infinity
+    """
+    if not init_is_systemd():
+        return
+
+    stop_timeout = int(config('service_stop_timeout'))
+    if stop_timeout < 0:
+        stop_timeout = 'infinity'
+    start_timeout = int(config('service_start_timeout'))
+    if start_timeout < 0:
+        start_timeout = 'infinity'
+
+    systemd_overrides_context = {'service_stop_timeout': stop_timeout,
+                                 'service_start_timeout': start_timeout,
+                                 }
+
+    for service in ['corosync', 'pacemaker']:
+        overrides_dir = SYSTEMD_OVERRIDES_DIR.format(service)
+        overrides_file = SYSTEMD_OVERRIDES_FILE.format(overrides_dir)
+        if not os.path.isdir(overrides_dir):
+            os.mkdir(overrides_dir)
+
+        write_file(path=overrides_file,
+                   content=render_template('systemd-overrides.conf',
+                                            systemd_overrides_context))
+
+    # Update systemd with the new information
+    subprocess.check_call(['systemctl', 'daemon-reload'])
+
+
 def emit_corosync_conf():
     corosync_conf_context = get_corosync_conf()
     if corosync_conf_context:
@@ -534,7 +573,7 @@ def restart_corosync_on_change():
             if return_data:
                 for path in COROSYNC_CONF_FILES:
                     if checksums[path] != file_hash(path):
-                        restart_corosync()
+                        validated_restart_corosync()
                         break
 
             return return_data
@@ -542,20 +581,88 @@ def restart_corosync_on_change():
     return wrap
 
 
+def try_pcmk_wait():
+    """Try pcmk.wait_for_pcmk()
+    Log results and set status message
+    """
+    try:
+        pcmk.wait_for_pcmk()
+        log("Pacemaker is ready", DEBUG)
+    except pcmk.ServicesNotUp:
+        msg = ("Pacemaker is down. Please manually start it.")
+        log(msg, ERROR)
+        status_set('blocked', msg)
+        raise pcmk.ServicesNotUp(msg)
+
+
 @restart_corosync_on_change()
 def configure_corosync():
     log('Configuring and (maybe) restarting corosync', level=DEBUG)
+    # David Ames Bug#1654403 Work around
+    # May be removed if bug is resolved
+    emit_systemd_overrides_file()
     return emit_base_conf() and emit_corosync_conf()
 
 
+def services_running():
+    """Determine if both Corosync and Pacemaker are running
+    Both from the operating system perspective and with a functional test
+    @returns boolean
+    """
+    pacemaker_status = service_running("pacemaker")
+    corosync_status = service_running("corosync")
+    log("Pacemaker status: {}, Corosync status: {}"
+        "".format(pacemaker_status, corosync_status),
+        level=DEBUG)
+    if not (pacemaker_status and corosync_status):
+        # OS perspective
+        return False
+    else:
+        # Functional test of pacemaker
+        return pcmk.wait_for_pcmk()
+
+
+def validated_restart_corosync(retries=10):
+    """Restart and validate Corosync and Pacemaker are in fact up and running.
+
+    @param retries: number of attempts to restart the services before giving up
+    @raises pcmk.ServicesNotUp if after retries services are still not up
+    """
+    for restart in range(retries):
+        try:
+            if restart_corosync():
+                log("Corosync and Pacemaker are validated as up and running.",
+                    INFO)
+                return
+            else:
+                log("Corosync or Pacemaker not validated as up yet, retrying",
+                    WARNING)
+        except pcmk.ServicesNotUp:
+            log("Pacemaker failed to start, retrying", WARNING)
+            continue
+
+    msg = ("Corosync and/or Pacemaker failed to restart after {} retries"
+           "".format(retries))
+    log(msg, ERROR)
+    status_set('blocked', msg)
+    raise pcmk.ServicesNotUp(msg)
+
+
 def restart_corosync():
     if service_running("pacemaker"):
+        log("Stopping pacemaker", DEBUG)
         service_stop("pacemaker")
 
     if not is_unit_paused_set():
-        service_restart("corosync")
+        log("Stopping corosync", DEBUG)
+        service_stop("corosync")
+        log("Starting corosync", DEBUG)
+        service_start("corosync")
+        log("Starting pacemaker", DEBUG)
         service_start("pacemaker")
 
+    return services_running()
+
 
 def validate_dns_ha():
     """Validate the DNS HA
@@ -715,6 +822,11 @@ def assess_status_helper():
     node_count = int(config('cluster_count'))
     status = 'active'
     message = 'Unit is ready and clustered'
+    try:
+        try_pcmk_wait()
+    except pcmk.ServicesNotUp:
+        message = 'Pacemaker is down'
+        status = 'blocked'
     for relid in relation_ids('hanode'):
         if len(related_units(relid)) + 1 < node_count:
             status = 'blocked'
diff --git a/templates/systemd-overrides.conf b/templates/systemd-overrides.conf
new file mode 100644
index 0000000..2c46783
--- /dev/null
+++ b/templates/systemd-overrides.conf
@@ -0,0 +1,3 @@
+[Service]
+TimeoutStartSec={{service_start_timeout}}
+TimeoutStopSec={{service_stop_timeout}}
diff --git a/tests/basic_deployment.py b/tests/basic_deployment.py
index 633e457..b38f536 100644
--- a/tests/basic_deployment.py
+++ b/tests/basic_deployment.py
@@ -33,6 +33,9 @@ from charmhelpers.contrib.openstack.amulet.utils import (
 u = OpenStackAmuletUtils(DEBUG)
 seconds_to_wait = 600
 
+# Set number of primary units and cluster-count for hacluster
+NUM_UNITS = 3
+
 
 class HAClusterBasicDeployment(OpenStackAmuletDeployment):
@@ -64,7 +67,7 @@ class HAClusterBasicDeployment(OpenStackAmuletDeployment):
         this_service = {'name': 'hacluster'}
         other_services = [
             {'name': 'percona-cluster', 'constraints': {'mem': '3072M'}},
-            {'name': 'keystone', 'units': 3},
+            {'name': 'keystone', 'units': NUM_UNITS},
         ]
         super(HAClusterBasicDeployment, self)._add_services(this_service,
                                                             other_services)
@@ -93,7 +96,8 @@ class HAClusterBasicDeployment(OpenStackAmuletDeployment):
             'sst-password': 'ChangeMe123',
         }
         hacluster_config = {
-            'debug': 'true'
+            'debug': 'true',
+            'cluster_count': NUM_UNITS,
         }
 
         configs = {
diff --git a/unit_tests/test_hacluster_utils.py b/unit_tests/test_hacluster_utils.py
index 19fa740..2804d27 100644
--- a/unit_tests/test_hacluster_utils.py
+++ b/unit_tests/test_hacluster_utils.py
@@ -21,6 +21,7 @@ import tempfile
 import unittest
 
 import utils
+import pcmk
 
 
 def write_file(path, content, *args, **kwargs):
@@ -31,7 +32,7 @@ def write_file(path, content, *args, **kwargs):
 
 @mock.patch.object(utils, 'log', lambda *args, **kwargs: None)
 @mock.patch.object(utils, 'write_file', write_file)
-class UtilsTestCase(unittest.TestCase):
+class UtilsTestCaseWriteTmp(unittest.TestCase):
 
     def setUp(self):
         self.tmpdir = tempfile.mkdtemp()
@@ -88,6 +89,9 @@ class UtilsTestCase(unittest.TestCase):
     def test_debug_off(self):
         self.check_debug(False)
 
+
+class UtilsTestCase(unittest.TestCase):
+
     @mock.patch.object(utils, 'config')
     def test_get_transport(self, mock_config):
         mock_config.return_value = 'udp'
@@ -218,3 +222,135 @@ class UtilsTestCase(unittest.TestCase):
         check_output_mock.return_value = ps_output
         utils.kill_legacy_ocf_daemon_process('res_ceilometer_agent_central')
         call_mock.assert_called_once_with(['sudo', 'kill', '-9', '11109'])
+
+    @mock.patch.object(pcmk, 'wait_for_pcmk')
+    def test_try_pcmk_wait(self, mock_wait_for_pcmk):
+        # Returns OK
+        mock_wait_for_pcmk.side_effect = None
+        self.assertEquals(None, utils.try_pcmk_wait())
+
+        # Raises Exception
+        mock_wait_for_pcmk.side_effect = pcmk.ServicesNotUp
+        with self.assertRaises(pcmk.ServicesNotUp):
+            utils.try_pcmk_wait()
+
+    @mock.patch.object(pcmk, 'wait_for_pcmk')
+    @mock.patch.object(utils, 'service_running')
+    def test_services_running(self, mock_service_running,
+                              mock_wait_for_pcmk):
+        # OS not running
+        mock_service_running.return_value = False
+        self.assertFalse(utils.services_running())
+
+        # Functional not running
+        mock_service_running.return_value = True
+        mock_wait_for_pcmk.side_effect = pcmk.ServicesNotUp
+        with self.assertRaises(pcmk.ServicesNotUp):
+            utils.services_running()
+
+        # All running
+        mock_service_running.return_value = True
+        mock_wait_for_pcmk.side_effect = None
+        mock_wait_for_pcmk.return_value = True
+        self.assertTrue(utils.services_running())
+
+    @mock.patch.object(pcmk, 'wait_for_pcmk')
+    @mock.patch.object(utils, 'restart_corosync')
+    def test_validated_restart_corosync(self, mock_restart_corosync,
+                                        mock_wait_for_pcmk):
+        # Services are down
+        mock_restart_corosync.mock_calls = []
+        mock_restart_corosync.return_value = False
+        with self.assertRaises(pcmk.ServicesNotUp):
+            utils.validated_restart_corosync(retries=3)
+        self.assertEqual(3, len(mock_restart_corosync.mock_calls))
+
+        # Services are up
+        mock_restart_corosync.mock_calls = []
+        mock_restart_corosync.return_value = True
+        utils.validated_restart_corosync(retries=10)
+        self.assertEqual(1, len(mock_restart_corosync.mock_calls))
+
+    @mock.patch.object(utils, 'is_unit_paused_set')
+    @mock.patch.object(utils, 'services_running')
+    @mock.patch.object(utils, 'service_start')
+    @mock.patch.object(utils, 'service_stop')
+    @mock.patch.object(utils, 'service_running')
+    def test_restart_corosync(self, mock_service_running,
+                              mock_service_stop, mock_service_start,
+                              mock_services_running, mock_is_unit_paused_set):
+        # PM up, services down
+        mock_service_running.return_value = True
+        mock_is_unit_paused_set.return_value = False
+        mock_services_running.return_value = False
+        self.assertFalse(utils.restart_corosync())
+        mock_service_stop.assert_has_calls([mock.call('pacemaker'),
+                                            mock.call('corosync')])
+        mock_service_start.assert_has_calls([mock.call('corosync'),
+                                             mock.call('pacemaker')])
+
+        # PM already down, services down
+        mock_service_running.return_value = False
+        mock_is_unit_paused_set.return_value = False
+        mock_services_running.return_value = False
+        self.assertFalse(utils.restart_corosync())
+        mock_service_stop.assert_has_calls([mock.call('corosync')])
+        mock_service_start.assert_has_calls([mock.call('corosync'),
+                                             mock.call('pacemaker')])
+
+        # PM already down, services up
+        mock_service_running.return_value = True
+        mock_is_unit_paused_set.return_value = False
+        mock_services_running.return_value = True
+        self.assertTrue(utils.restart_corosync())
+        mock_service_stop.assert_has_calls([mock.call('pacemaker'),
+                                            mock.call('corosync')])
+        mock_service_start.assert_has_calls([mock.call('corosync'),
+                                             mock.call('pacemaker')])
+
+    @mock.patch.object(subprocess, 'check_call')
+    @mock.patch.object(utils.os, 'mkdir')
+    @mock.patch.object(utils.os.path, 'isdir')
+    @mock.patch.object(utils, 'render_template')
+    @mock.patch.object(utils, 'write_file')
+    @mock.patch.object(utils, 'is_unit_paused_set')
+    @mock.patch.object(utils, 'config')
+    def test_emit_systemd_overrides_file(self, mock_config,
+                                         mock_is_unit_paused_set,
+                                         mock_write_file, mock_render_template,
+                                         mock_path_isdir,
+                                         mock_mkdir, mock_check_call):
+
+        # Normal values
+        cfg = {'service_stop_timeout': 30,
+               'service_start_timeout': 90}
+        mock_config.side_effect = lambda key: cfg.get(key)
+
+        mock_is_unit_paused_set.return_value = True
+        mock_path_isdir.return_value = True
+        utils.emit_systemd_overrides_file()
+        self.assertEquals(2, len(mock_write_file.mock_calls))
+        mock_render_template.assert_has_calls(
+            [mock.call('systemd-overrides.conf', cfg),
+             mock.call('systemd-overrides.conf', cfg)])
+        mock_check_call.assert_has_calls([mock.call(['systemctl',
+                                                     'daemon-reload'])])
+        mock_write_file.mock_calls = []
+        mock_render_template.mock_calls = []
+        mock_check_call.mock_calls = []
+
+        # Disable timeout
+        cfg = {'service_stop_timeout': -1,
+               'service_start_timeout': -1}
+        expected_cfg = {'service_stop_timeout': 'infinity',
+                        'service_start_timeout': 'infinity'}
+        mock_config.side_effect = lambda key: cfg.get(key)
+        mock_is_unit_paused_set.return_value = True
+        mock_path_isdir.return_value = True
+        utils.emit_systemd_overrides_file()
+        self.assertEquals(2, len(mock_write_file.mock_calls))
+        mock_render_template.assert_has_calls(
+            [mock.call('systemd-overrides.conf', expected_cfg),
+             mock.call('systemd-overrides.conf', expected_cfg)])
+        mock_check_call.assert_has_calls([mock.call(['systemctl',
+                                                     'daemon-reload'])])
diff --git a/unit_tests/test_pcmk.py b/unit_tests/test_pcmk.py
index 5b5ce9e..43c5eca 100644
--- a/unit_tests/test_pcmk.py
+++ b/unit_tests/test_pcmk.py
@@ -34,3 +34,17 @@ class TestPcmk(unittest.TestCase):
     def test_crm_res_running_undefined(self, getstatusoutput):
         getstatusoutput.return_value = (1, "foobar")
         self.assertFalse(pcmk.crm_res_running('res_nova_consoleauth'))
+
+    @mock.patch('socket.gethostname')
+    @mock.patch('commands.getstatusoutput')
+    def test_wait_for_pcmk(self, getstatusoutput, gethostname):
+        # Pacemaker is down
+        gethostname.return_value = 'hanode-1'
+        getstatusoutput.return_value = (1, 'Not the hostname')
+        with self.assertRaises(pcmk.ServicesNotUp):
+            pcmk.wait_for_pcmk(retries=2, sleep=0)
+
+        # Pacemaker is up
+        gethostname.return_value = 'hanode-1'
+        getstatusoutput.return_value = (0, 'Hostname: hanode-1')
+        self.assertTrue(pcmk.wait_for_pcmk(retries=2, sleep=0))
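
For reference (not part of the patch): with the default config values
(service_start_timeout=180, service_stop_timeout=60), the template above is
rendered to /etc/systemd/system/corosync.service.d/overrides.conf and
/etc/systemd/system/pacemaker.service.d/overrides.conf as:

    [Service]
    TimeoutStartSec=180
    TimeoutStopSec=60

After the charm runs "systemctl daemon-reload", the effective limit can be
inspected on a unit with something like
"systemctl show corosync.service -p TimeoutStartUSec" (property name assumed
from systemd's documentation). A configured value of -1 is rendered as the
literal string "infinity", which systemd accepts for both directives.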
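
Similarly, the bounded retry behaviour that replaces the old infinite loop can
be exercised by hand. A minimal sketch, assuming it is run on a deployed unit
with the charm's hooks/ directory on sys.path (Python 2, since pcmk.py still
uses the commands module); check_pcmk.py is a hypothetical helper, not shipped
by the charm:

    # check_pcmk.py - hypothetical manual check, not part of the charm
    import sys
    sys.path.insert(0, 'hooks')  # adjust to the charm's hooks/ directory

    import pcmk

    try:
        # Poll "crm node list" up to 3 times, 5 seconds apart, instead of
        # looping forever as the old while loop did.
        pcmk.wait_for_pcmk(retries=3, sleep=5)
        print("pacemaker answered 'crm node list' for this host")
    except pcmk.ServicesNotUp as exc:
        print("corosync/pacemaker still down: %s" % exc)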