# Patch summary -- Senlin health management, NODE_STATUS_POLL_URL detection.
#
# Introduces a boolean detection option 'poll_url_conn_error_as_unhealthy'
# (schema default True) in HealthPolicy and reads it in
# HealthManager._check_url_and_recover_node via
# params['poll_url_conn_error_as_unhealthy'].
#
# senlin/engine/health_manager.py
#   * Moves the "node was updated recently" check, the "reported as down"
#     log line and the retry sleep into a nested helper,
#     stop_node_recovery().  The helper returns True (caller returns None,
#     i.e. no recovery) when node.updated_at / node.init_at is not older
#     than node_update_timeout; otherwise it logs the remaining retries,
#     sleeps retry_interval and returns False.
#   * On utils.URLFetchError: if the new option is true, the helper is
#     called and the enclosing loop continues; if it is false, the previous
#     behaviour is kept (LOG.error and return None).
#   NOTE(review): in the option-enabled branch the caught exception `ex` is
#   never logged, so the cause of the connection failure is lost -- consider
#   logging it before retrying.
#   NOTE(review): stop_node_recovery() reads `available_attemps` from the
#   enclosing scope; its assignment is outside the hunks shown here --
#   confirm it is bound before the first call.
#
# senlin/policies/health_policy.py
#   * Adds POLL_URL_CONN_ERROR_AS_UNHEALTHY to _DETECTION_OPTIONS and to the
#     schema, stores it on self.poll_url_conn_error_as_unhealthy (default
#     True) and emits it in the two dicts that already carry the other
#     poll_url_* settings.
#
# Tests
#   * Existing test params / expected dicts gain the new key.
#   * test__check_url_and_recover_node_conn_error: url_fetch always raises
#     URLFetchError with the option True; expects two fetches, two sleeps of
#     1 and a single 'node_recover' RPC call.
#   * test__check_url_and_recover_node_conn_error_noop: same failure with
#     the option False; expects None, no RPC call and no sleep.
#
diff --git a/senlin/engine/health_manager.py b/senlin/engine/health_manager.py index b51584168..68e45352f 100644 --- a/senlin/engine/health_manager.py +++ b/senlin/engine/health_manager.py @@ -312,11 +312,27 @@ class HealthManager(service.Service): url_template = params['poll_url'] verify_ssl = params['poll_url_ssl_verify'] + conn_error_as_unhealthy = params['poll_url_conn_error_as_unhealthy'] expected_resp_str = params['poll_url_healthy_response'] max_unhealthy_retry = params['poll_url_retry_limit'] retry_interval = params['poll_url_retry_interval'] node_update_timeout = params['node_update_timeout'] + def stop_node_recovery(): + node_last_updated = node.updated_at or node.init_at + if not timeutils.is_older_than( + node_last_updated, node_update_timeout): + LOG.info("Node %s was updated at %s which is less than " + "%d secs ago. Skip node recovery.", + node.id, node_last_updated, node_update_timeout) + return True + + LOG.info("Node %s is reported as down (%d retries left)", + node.id, available_attemps) + time.sleep(retry_interval) + + return False + url = self._expand_url_template(url_template, node) LOG.info("Polling node status from URL: %s", url) @@ -327,9 +343,14 @@ class HealthManager(service.Service): try: result = utils.url_fetch(url, verify=verify_ssl) except utils.URLFetchError as ex: - LOG.error("Error when requesting node health status from" - " %s: %s", url, ex) - return None + if conn_error_as_unhealthy: + if stop_node_recovery(): + return None + continue + else: + LOG.error("Error when requesting node health status from" + " %s: %s", url, ex) + return None LOG.debug("Node status returned from URL(%s): %s", url, result) @@ -342,18 +363,9 @@ class HealthManager(service.Service): "ACTIVE state", node.id) return None - node_last_updated = node.updated_at or node.init_at - if not timeutils.is_older_than( - node_last_updated, node_update_timeout): - LOG.info("Node %s was updated at %s which is less than " - "%d secs ago. 
Skip node recovery.", - node.id, node_last_updated, node_update_timeout) + if stop_node_recovery(): return None - LOG.info("Node %s is reported as down (%d retries left)", - node.id, available_attemps) - time.sleep(retry_interval) - # recover node after exhausting retries LOG.info("Requesting node recovery: %s", node.id) req = objects.NodeRecoverRequest(identity=node.id, diff --git a/senlin/policies/health_policy.py b/senlin/policies/health_policy.py index e320bc9bd..ef0d491cd 100644 --- a/senlin/policies/health_policy.py +++ b/senlin/policies/health_policy.py @@ -62,12 +62,13 @@ class HealthPolicy(base.Policy): _DETECTION_OPTIONS = ( DETECTION_INTERVAL, POLL_URL, POLL_URL_SSL_VERIFY, - POLL_URL_HEALTHY_RESPONSE, POLL_URL_RETRY_LIMIT, - POLL_URL_RETRY_INTERVAL, NODE_UPDATE_TIMEOUT, + POLL_URL_CONN_ERROR_AS_UNHEALTHY, POLL_URL_HEALTHY_RESPONSE, + POLL_URL_RETRY_LIMIT, POLL_URL_RETRY_INTERVAL, NODE_UPDATE_TIMEOUT, ) = ( 'interval', 'poll_url', 'poll_url_ssl_verify', - 'poll_url_healthy_response', 'poll_url_retry_limit', - 'poll_url_retry_interval', 'node_update_timeout', + 'poll_url_conn_error_as_unhealthy', 'poll_url_healthy_response', + 'poll_url_retry_limit', 'poll_url_retry_interval', + 'node_update_timeout', ) _RECOVERY_KEYS = ( @@ -122,6 +123,12 @@ class HealthPolicy(base.Policy): "'NODE_STATUS_POLL_URL'."), default=True, ), + POLL_URL_CONN_ERROR_AS_UNHEALTHY: schema.Boolean( + _("Whether to treat URL connection errors as an " + "indication of an unhealthy node. Only required " + "when type is 'NODE_STATUS_POLL_URL'."), + default=True, + ), POLL_URL_HEALTHY_RESPONSE: schema.String( _("String pattern in the poll URL response body " "that indicates a healthy node. 
" @@ -215,6 +222,8 @@ class HealthPolicy(base.Policy): self.interval = options.get(self.DETECTION_INTERVAL, 60) self.poll_url = options.get(self.POLL_URL, '') self.poll_url_ssl_verify = options.get(self.POLL_URL_SSL_VERIFY, True) + self.poll_url_conn_error_as_unhealthy = options.get( + self.POLL_URL_CONN_ERROR_AS_UNHEALTHY, True) self.poll_url_healthy_response = options.get( self.POLL_URL_HEALTHY_RESPONSE, '') self.poll_url_retry_limit = options.get(self.POLL_URL_RETRY_LIMIT, '') @@ -280,6 +289,8 @@ class HealthPolicy(base.Policy): 'recover_action': self.recover_actions, 'poll_url': self.poll_url, 'poll_url_ssl_verify': self.poll_url_ssl_verify, + 'poll_url_conn_error_as_unhealthy': + self.poll_url_conn_error_as_unhealthy, 'poll_url_healthy_response': self.poll_url_healthy_response, 'poll_url_retry_limit': self.poll_url_retry_limit, 'poll_url_retry_interval': self.poll_url_retry_interval, @@ -297,6 +308,8 @@ class HealthPolicy(base.Policy): 'interval': self.interval, 'poll_url': self.poll_url, 'poll_url_ssl_verify': self.poll_url_ssl_verify, + 'poll_url_conn_error_as_unhealthy': + self.poll_url_conn_error_as_unhealthy, 'poll_url_healthy_response': self.poll_url_healthy_response, 'poll_url_retry_limit': self.poll_url_retry_limit, 'poll_url_retry_interval': self.poll_url_retry_interval, diff --git a/senlin/tests/unit/engine/test_health_manager.py b/senlin/tests/unit/engine/test_health_manager.py index 3db5d2453..1d5936c31 100644 --- a/senlin/tests/unit/engine/test_health_manager.py +++ b/senlin/tests/unit/engine/test_health_manager.py @@ -719,6 +719,7 @@ class TestHealthManager(base.SenlinTestCase): params = { 'poll_url': 'FAKE_POLL_URL', 'poll_url_ssl_verify': True, + 'poll_url_conn_error_as_unhealthy': True, 'poll_url_healthy_response': 'FAKE_HEALTHY_PATTERN', 'poll_url_retry_limit': 2, 'poll_url_retry_interval': 1, @@ -753,6 +754,7 @@ class TestHealthManager(base.SenlinTestCase): params = { 'poll_url': 'FAKE_POLL_URL', 'poll_url_ssl_verify': True, + 
'poll_url_conn_error_as_unhealthy': True, 'poll_url_healthy_response': 'FAKE_HEALTHY_PATTERN', 'poll_url_retry_limit': 2, 'poll_url_retry_interval': 1, @@ -789,6 +791,7 @@ class TestHealthManager(base.SenlinTestCase): params = { 'poll_url': 'FAKE_POLL_URL', 'poll_url_ssl_verify': True, + 'poll_url_conn_error_as_unhealthy': True, 'poll_url_healthy_response': 'FAKE_HEALTHY_PATTERN', 'poll_url_retry_limit': 2, 'poll_url_retry_interval': 1, @@ -826,6 +829,7 @@ class TestHealthManager(base.SenlinTestCase): params = { 'poll_url': 'FAKE_POLL_URL', 'poll_url_ssl_verify': True, + 'poll_url_conn_error_as_unhealthy': True, 'poll_url_healthy_response': 'FAKE_HEALTHY_PATTERN', 'poll_url_retry_limit': 2, 'poll_url_retry_interval': 1, @@ -864,6 +868,7 @@ class TestHealthManager(base.SenlinTestCase): params = { 'poll_url': 'FAKE_POLL_URL', 'poll_url_ssl_verify': False, + 'poll_url_conn_error_as_unhealthy': True, 'poll_url_healthy_response': 'FAKE_HEALTHY_PATTERN', 'poll_url_retry_limit': 2, 'poll_url_retry_interval': 1, @@ -886,6 +891,89 @@ class TestHealthManager(base.SenlinTestCase): ) mock_sleep.assert_has_calls([mock.call(1), mock.call(1)]) + @mock.patch.object(time, "sleep") + @mock.patch.object(tu, "is_older_than") + @mock.patch.object(hm.HealthManager, "_expand_url_template") + @mock.patch.object(utils, 'url_fetch') + @mock.patch.object(rpc_client.EngineClient, 'call') + def test__check_url_and_recover_node_conn_error( + self, mock_rpc, mock_url_fetch, mock_expand_url, mock_time, + mock_sleep): + ctx = mock.Mock() + node = mock.Mock() + node.status = consts.NS_ACTIVE + node.id = 'FAKE_ID' + mock_time.return_value = True + mock_expand_url.return_value = 'FAKE_EXPANDED_URL' + x_action_check = {'action': 'CHECK_ID'} + mock_rpc.return_value = x_action_check + mock_url_fetch.side_effect = utils.URLFetchError("Error") + params = { + 'poll_url': 'FAKE_POLL_URL', + 'poll_url_ssl_verify': False, + 'poll_url_conn_error_as_unhealthy': True, + 'poll_url_healthy_response': 
'FAKE_HEALTHY_PATTERN', + 'poll_url_retry_limit': 2, + 'poll_url_retry_interval': 1, + 'node_update_timeout': 5, + } + + recover_action = {'operation': 'REBUILD'} + + # do it + res = self.hm._check_url_and_recover_node(ctx, node, recover_action, + params) + + self.assertEqual(mock_rpc.return_value, res) + mock_rpc.assert_called_once_with(ctx, 'node_recover', mock.ANY) + mock_url_fetch.assert_has_calls( + [ + mock.call('FAKE_EXPANDED_URL', verify=False), + mock.call('FAKE_EXPANDED_URL', verify=False) + ] + ) + mock_sleep.assert_has_calls([mock.call(1), mock.call(1)]) + + @mock.patch.object(time, "sleep") + @mock.patch.object(tu, "is_older_than") + @mock.patch.object(hm.HealthManager, "_expand_url_template") + @mock.patch.object(utils, 'url_fetch') + @mock.patch.object(rpc_client.EngineClient, 'call') + def test__check_url_and_recover_node_conn_error_noop( + self, mock_rpc, mock_url_fetch, mock_expand_url, mock_time, + mock_sleep): + ctx = mock.Mock() + node = mock.Mock() + node.status = consts.NS_ACTIVE + node.id = 'FAKE_ID' + mock_time.return_value = True + mock_expand_url.return_value = 'FAKE_EXPANDED_URL' + mock_url_fetch.side_effect = utils.URLFetchError("Error") + params = { + 'poll_url': 'FAKE_POLL_URL', + 'poll_url_ssl_verify': False, + 'poll_url_conn_error_as_unhealthy': False, + 'poll_url_healthy_response': 'FAKE_HEALTHY_PATTERN', + 'poll_url_retry_limit': 2, + 'poll_url_retry_interval': 1, + 'node_update_timeout': 5, + } + + recover_action = {'operation': 'REBUILD'} + + # do it + res = self.hm._check_url_and_recover_node(ctx, node, recover_action, + params) + + self.assertIsNone(res) + mock_rpc.assert_not_called() + mock_url_fetch.assert_has_calls( + [ + mock.call('FAKE_EXPANDED_URL', verify=False), + ] + ) + mock_sleep.assert_not_called() + @mock.patch.object(hm, "_chase_up") @mock.patch.object(hm.HealthManager, "_check_url_and_recover_node") @mock.patch.object(obj_node.Node, 'get_all_by_cluster') diff --git 
a/senlin/tests/unit/policies/test_health_policy.py b/senlin/tests/unit/policies/test_health_policy.py index 7d0a012d8..01d62e62b 100644 --- a/senlin/tests/unit/policies/test_health_policy.py +++ b/senlin/tests/unit/policies/test_health_policy.py @@ -120,6 +120,7 @@ class TestHealthPolicy(base.SenlinTestCase): 'interval': self.hp.interval, 'poll_url': '', 'poll_url_ssl_verify': True, + 'poll_url_conn_error_as_unhealthy': True, 'poll_url_healthy_response': '', 'poll_url_retry_limit': 3, 'poll_url_retry_interval': 3, @@ -141,6 +142,7 @@ class TestHealthPolicy(base.SenlinTestCase): 'recover_action': self.hp.recover_actions, 'poll_url': '', 'poll_url_ssl_verify': True, + 'poll_url_conn_error_as_unhealthy': True, 'poll_url_healthy_response': '', 'poll_url_retry_limit': 3, 'poll_url_retry_interval': 3,