Merge "Node poll URL improvements"

Zuul 2018-09-10 22:28:46 +00:00 committed by Gerrit Code Review
commit 73b31d7c34
4 changed files with 132 additions and 17 deletions


@@ -312,11 +312,27 @@ class HealthManager(service.Service):
url_template = params['poll_url']
verify_ssl = params['poll_url_ssl_verify']
conn_error_as_unhealthy = params['poll_url_conn_error_as_unhealthy']
expected_resp_str = params['poll_url_healthy_response']
max_unhealthy_retry = params['poll_url_retry_limit']
retry_interval = params['poll_url_retry_interval']
node_update_timeout = params['node_update_timeout']
def stop_node_recovery():
node_last_updated = node.updated_at or node.init_at
if not timeutils.is_older_than(
node_last_updated, node_update_timeout):
LOG.info("Node %s was updated at %s which is less than "
"%d secs ago. Skip node recovery.",
node.id, node_last_updated, node_update_timeout)
return True
LOG.info("Node %s is reported as down (%d retries left)",
node.id, available_attemps)
time.sleep(retry_interval)
return False
url = self._expand_url_template(url_template, node)
LOG.info("Polling node status from URL: %s", url)
@@ -327,9 +343,14 @@ class HealthManager(service.Service):
try:
result = utils.url_fetch(url, verify=verify_ssl)
except utils.URLFetchError as ex:
LOG.error("Error when requesting node health status from"
" %s: %s", url, ex)
return None
if conn_error_as_unhealthy:
if stop_node_recovery():
return None
continue
else:
LOG.error("Error when requesting node health status from"
" %s: %s", url, ex)
return None
LOG.debug("Node status returned from URL(%s): %s", url,
result)
@@ -342,18 +363,9 @@ class HealthManager(service.Service):
"ACTIVE state", node.id)
return None
node_last_updated = node.updated_at or node.init_at
if not timeutils.is_older_than(
node_last_updated, node_update_timeout):
LOG.info("Node %s was updated at %s which is less than "
"%d secs ago. Skip node recovery.",
node.id, node_last_updated, node_update_timeout)
if stop_node_recovery():
return None
LOG.info("Node %s is reported as down (%d retries left)",
node.id, available_attemps)
time.sleep(retry_interval)
# recover node after exhausting retries
LOG.info("Requesting node recovery: %s", node.id)
req = objects.NodeRecoverRequest(identity=node.id,

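Taken together, the hunks above move the "node was updated too recently, skip recovery" check and the retry sleep into a local stop_node_recovery() helper so the new connection-error branch can reuse them: when poll_url_conn_error_as_unhealthy is set, a URLFetchError is treated like an unhealthy response and simply consumes a retry; otherwise the error is logged and the check aborts as before. A minimal standalone sketch of that control flow (function and parameter names are illustrative, not the module's exact API):

    # Sketch only: models the retry loop after this change, assuming the
    # caller supplies a poll_once() callable and a recently_updated() check.
    import time

    def check_node_health(poll_once, recently_updated, params):
        """Return 'recover' when the node should be recovered, else None."""
        for _ in range(params['poll_url_retry_limit']):
            try:
                healthy = poll_once()  # fetch the poll URL, match healthy pattern
            except ConnectionError:
                if not params['poll_url_conn_error_as_unhealthy']:
                    return None        # previous behaviour: log and stop checking
                healthy = False        # new behaviour: count the error as "down"
            if healthy:
                return None            # node answered with the healthy pattern
            if recently_updated():
                return None            # node updated recently; skip recovery
            time.sleep(params['poll_url_retry_interval'])
        return 'recover'               # retries exhausted; request node recovery
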

@@ -62,12 +62,13 @@ class HealthPolicy(base.Policy):
_DETECTION_OPTIONS = (
DETECTION_INTERVAL, POLL_URL, POLL_URL_SSL_VERIFY,
POLL_URL_HEALTHY_RESPONSE, POLL_URL_RETRY_LIMIT,
POLL_URL_RETRY_INTERVAL, NODE_UPDATE_TIMEOUT,
POLL_URL_CONN_ERROR_AS_UNHEALTHY, POLL_URL_HEALTHY_RESPONSE,
POLL_URL_RETRY_LIMIT, POLL_URL_RETRY_INTERVAL, NODE_UPDATE_TIMEOUT,
) = (
'interval', 'poll_url', 'poll_url_ssl_verify',
'poll_url_healthy_response', 'poll_url_retry_limit',
'poll_url_retry_interval', 'node_update_timeout',
'poll_url_conn_error_as_unhealthy', 'poll_url_healthy_response',
'poll_url_retry_limit', 'poll_url_retry_interval',
'node_update_timeout',
)
_RECOVERY_KEYS = (
@@ -122,6 +123,12 @@ class HealthPolicy(base.Policy):
"'NODE_STATUS_POLL_URL'."),
default=True,
),
POLL_URL_CONN_ERROR_AS_UNHEALTHY: schema.Boolean(
_("Whether to treat URL connection errors as an "
"indication of an unhealthy node. Only required "
"when type is 'NODE_STATUS_POLL_URL'."),
default=True,
),
POLL_URL_HEALTHY_RESPONSE: schema.String(
_("String pattern in the poll URL response body "
"that indicates a healthy node. "
@@ -215,6 +222,8 @@ class HealthPolicy(base.Policy):
self.interval = options.get(self.DETECTION_INTERVAL, 60)
self.poll_url = options.get(self.POLL_URL, '')
self.poll_url_ssl_verify = options.get(self.POLL_URL_SSL_VERIFY, True)
self.poll_url_conn_error_as_unhealthy = options.get(
self.POLL_URL_CONN_ERROR_AS_UNHEALTHY, True)
self.poll_url_healthy_response = options.get(
self.POLL_URL_HEALTHY_RESPONSE, '')
self.poll_url_retry_limit = options.get(self.POLL_URL_RETRY_LIMIT, '')
@@ -280,6 +289,8 @@ class HealthPolicy(base.Policy):
'recover_action': self.recover_actions,
'poll_url': self.poll_url,
'poll_url_ssl_verify': self.poll_url_ssl_verify,
'poll_url_conn_error_as_unhealthy':
self.poll_url_conn_error_as_unhealthy,
'poll_url_healthy_response': self.poll_url_healthy_response,
'poll_url_retry_limit': self.poll_url_retry_limit,
'poll_url_retry_interval': self.poll_url_retry_interval,
@@ -297,6 +308,8 @@ class HealthPolicy(base.Policy):
'interval': self.interval,
'poll_url': self.poll_url,
'poll_url_ssl_verify': self.poll_url_ssl_verify,
'poll_url_conn_error_as_unhealthy':
self.poll_url_conn_error_as_unhealthy,
'poll_url_healthy_response': self.poll_url_healthy_response,
'poll_url_retry_limit': self.poll_url_retry_limit,
'poll_url_retry_interval': self.poll_url_retry_interval,


@@ -719,6 +719,7 @@ class TestHealthManager(base.SenlinTestCase):
params = {
'poll_url': 'FAKE_POLL_URL',
'poll_url_ssl_verify': True,
'poll_url_conn_error_as_unhealthy': True,
'poll_url_healthy_response': 'FAKE_HEALTHY_PATTERN',
'poll_url_retry_limit': 2,
'poll_url_retry_interval': 1,
@@ -753,6 +754,7 @@ class TestHealthManager(base.SenlinTestCase):
params = {
'poll_url': 'FAKE_POLL_URL',
'poll_url_ssl_verify': True,
'poll_url_conn_error_as_unhealthy': True,
'poll_url_healthy_response': 'FAKE_HEALTHY_PATTERN',
'poll_url_retry_limit': 2,
'poll_url_retry_interval': 1,
@@ -789,6 +791,7 @@ class TestHealthManager(base.SenlinTestCase):
params = {
'poll_url': 'FAKE_POLL_URL',
'poll_url_ssl_verify': True,
'poll_url_conn_error_as_unhealthy': True,
'poll_url_healthy_response': 'FAKE_HEALTHY_PATTERN',
'poll_url_retry_limit': 2,
'poll_url_retry_interval': 1,
@@ -826,6 +829,7 @@ class TestHealthManager(base.SenlinTestCase):
params = {
'poll_url': 'FAKE_POLL_URL',
'poll_url_ssl_verify': True,
'poll_url_conn_error_as_unhealthy': True,
'poll_url_healthy_response': 'FAKE_HEALTHY_PATTERN',
'poll_url_retry_limit': 2,
'poll_url_retry_interval': 1,
@@ -864,6 +868,7 @@ class TestHealthManager(base.SenlinTestCase):
params = {
'poll_url': 'FAKE_POLL_URL',
'poll_url_ssl_verify': False,
'poll_url_conn_error_as_unhealthy': True,
'poll_url_healthy_response': 'FAKE_HEALTHY_PATTERN',
'poll_url_retry_limit': 2,
'poll_url_retry_interval': 1,
@@ -886,6 +891,89 @@ class TestHealthManager(base.SenlinTestCase):
)
mock_sleep.assert_has_calls([mock.call(1), mock.call(1)])
@mock.patch.object(time, "sleep")
@mock.patch.object(tu, "is_older_than")
@mock.patch.object(hm.HealthManager, "_expand_url_template")
@mock.patch.object(utils, 'url_fetch')
@mock.patch.object(rpc_client.EngineClient, 'call')
def test__check_url_and_recover_node_conn_error(
self, mock_rpc, mock_url_fetch, mock_expand_url, mock_time,
mock_sleep):
ctx = mock.Mock()
node = mock.Mock()
node.status = consts.NS_ACTIVE
node.id = 'FAKE_ID'
mock_time.return_value = True
mock_expand_url.return_value = 'FAKE_EXPANDED_URL'
x_action_check = {'action': 'CHECK_ID'}
mock_rpc.return_value = x_action_check
mock_url_fetch.side_effect = utils.URLFetchError("Error")
params = {
'poll_url': 'FAKE_POLL_URL',
'poll_url_ssl_verify': False,
'poll_url_conn_error_as_unhealthy': True,
'poll_url_healthy_response': 'FAKE_HEALTHY_PATTERN',
'poll_url_retry_limit': 2,
'poll_url_retry_interval': 1,
'node_update_timeout': 5,
}
recover_action = {'operation': 'REBUILD'}
# do it
res = self.hm._check_url_and_recover_node(ctx, node, recover_action,
params)
self.assertEqual(mock_rpc.return_value, res)
mock_rpc.assert_called_once_with(ctx, 'node_recover', mock.ANY)
mock_url_fetch.assert_has_calls(
[
mock.call('FAKE_EXPANDED_URL', verify=False),
mock.call('FAKE_EXPANDED_URL', verify=False)
]
)
mock_sleep.assert_has_calls([mock.call(1), mock.call(1)])
@mock.patch.object(time, "sleep")
@mock.patch.object(tu, "is_older_than")
@mock.patch.object(hm.HealthManager, "_expand_url_template")
@mock.patch.object(utils, 'url_fetch')
@mock.patch.object(rpc_client.EngineClient, 'call')
def test__check_url_and_recover_node_conn_error_noop(
self, mock_rpc, mock_url_fetch, mock_expand_url, mock_time,
mock_sleep):
ctx = mock.Mock()
node = mock.Mock()
node.status = consts.NS_ACTIVE
node.id = 'FAKE_ID'
mock_time.return_value = True
mock_expand_url.return_value = 'FAKE_EXPANDED_URL'
mock_url_fetch.side_effect = utils.URLFetchError("Error")
params = {
'poll_url': 'FAKE_POLL_URL',
'poll_url_ssl_verify': False,
'poll_url_conn_error_as_unhealthy': False,
'poll_url_healthy_response': 'FAKE_HEALTHY_PATTERN',
'poll_url_retry_limit': 2,
'poll_url_retry_interval': 1,
'node_update_timeout': 5,
}
recover_action = {'operation': 'REBUILD'}
# do it
res = self.hm._check_url_and_recover_node(ctx, node, recover_action,
params)
self.assertIsNone(res)
mock_rpc.assert_not_called()
mock_url_fetch.assert_has_calls(
[
mock.call('FAKE_EXPANDED_URL', verify=False),
]
)
mock_sleep.assert_not_called()
@mock.patch.object(hm, "_chase_up")
@mock.patch.object(hm.HealthManager, "_check_url_and_recover_node")
@mock.patch.object(obj_node.Node, 'get_all_by_cluster')


@@ -120,6 +120,7 @@ class TestHealthPolicy(base.SenlinTestCase):
'interval': self.hp.interval,
'poll_url': '',
'poll_url_ssl_verify': True,
'poll_url_conn_error_as_unhealthy': True,
'poll_url_healthy_response': '',
'poll_url_retry_limit': 3,
'poll_url_retry_interval': 3,
@@ -141,6 +142,7 @@ class TestHealthPolicy(base.SenlinTestCase):
'recover_action': self.hp.recover_actions,
'poll_url': '',
'poll_url_ssl_verify': True,
'poll_url_conn_error_as_unhealthy': True,
'poll_url_healthy_response': '',
'poll_url_retry_limit': 3,
'poll_url_retry_interval': 3,