Stop deployment if one of the critical nodes fails (fail fast)

Details:
- add a new node attribute: fail_if_error (boolean), used to fail fast on
  deployment errors (a sketch follows the commit metadata below);
- add lists of critical nodes for multinode and HA modes;
- do not mark non-reported nodes as error nodes when a deployment fails.

Change-Id: I657ea6a02b20505e47527618e14c64fc9b48a6b4
Related-Bug: #1251634
Vladimir Sharshov 2014-07-22 12:13:34 +04:00
parent 51f32395ee
commit 3d38d0a11d
8 changed files with 125 additions and 7 deletions
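
For context, a minimal sketch of the fail-fast check this attribute enables.
The function name and placement are assumptions for illustration; the actual
consumption of fail_if_error happens in the orchestrator (Astute), outside
this patch:

    # Hypothetical consumer of the serialized 'fail_if_error' flag;
    # illustrative only, not part of this diff.
    def check_fail_fast(reported_nodes):
        """Raise as soon as any critical node reports an error."""
        failed = [n['uid'] for n in reported_nodes
                  if n.get('status') == 'error' and n.get('fail_if_error')]
        if failed:
            raise RuntimeError(
                'Critical node(s) failed, stopping deployment: %s'
                % ', '.join(failed))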

View File

@@ -149,8 +149,9 @@ class Node(Base):
     @property
     def needs_redeploy(self):
-        return (self.status == 'error' or len(self.pending_roles)) and \
-            not self.pending_deletion
+        return (
+            self.status in ['error', 'provisioned'] or
+            len(self.pending_roles)) and not self.pending_deletion

     @property
     def needs_redeletion(self):

View File

@@ -69,6 +69,8 @@ def get_nodes_not_for_deletion(cluster):
 class DeploymentMultinodeSerializer(object):

+    critical_roles = ['controller', 'ceph-osd', 'primary-mongo']
+
     @classmethod
     def serialize(cls, cluster, nodes):
         """Method generates facts which
@@ -78,6 +80,7 @@ class DeploymentMultinodeSerializer(object):
         common_attrs = cls.get_common_attrs(cluster)

         cls.set_deployment_priorities(nodes)
+        cls.set_critical_nodes(cluster, nodes)

         return [dict_merge(node, common_attrs) for node in nodes]
@@ -87,9 +90,7 @@ class DeploymentMultinodeSerializer(object):
         attrs = objects.Attributes.merged_attrs_values(
             cluster.attributes
         )
-        release = objects.Release.get_by_uid(cluster.pending_release_id) \
-            if cluster.status == consts.CLUSTER_STATUSES.update \
-            else cluster.release
+        release = cls.current_release(cluster)

         attrs['deployment_mode'] = cluster.mode
         attrs['deployment_id'] = cluster.id
         attrs['openstack_version'] = release.version
@@ -113,6 +114,13 @@ class DeploymentMultinodeSerializer(object):
         return attrs

+    @classmethod
+    def current_release(cls, cluster):
+        """Actual cluster release."""
+        return objects.Release.get_by_uid(cluster.pending_release_id) \
+            if cluster.status == consts.CLUSTER_STATUSES.update \
+            else cluster.release
+
     @classmethod
     def set_storage_parameters(cls, cluster, attrs):
         """Generate pg_num as the number of OSDs across the cluster
@@ -187,6 +195,14 @@ class DeploymentMultinodeSerializer(object):
                             'zabbix-server']):
                 n['priority'] = other_nodes_prior

+    @classmethod
+    def set_critical_nodes(cls, cluster, nodes):
+        """Set behavior on nodes deployment error
+        during deployment process.
+        """
+        for n in nodes:
+            n['fail_if_error'] = n['role'] in cls.critical_roles
+
     @classmethod
     def serialize_nodes(cls, nodes):
         """Serialize node for each role.
@@ -316,6 +332,11 @@ class DeploymentMultinodeSerializer(object):
 class DeploymentHASerializer(DeploymentMultinodeSerializer):
     """Serializer for ha mode."""

+    critical_roles = ['primary-controller',
+                      'primary-mongo',
+                      'primary-swift-proxy',
+                      'ceph-osd']
+
     @classmethod
     def serialize_nodes(cls, nodes):
         """Serialize nodes and set primary-controller

View File

@@ -140,7 +140,7 @@ class TaskHelper(object):
     def get_nodes_to_deployment_error(cls, cluster):
         q_nodes_to_error = db().query(Node).\
             filter(Node.cluster == cluster).\
-            filter(Node.status.in_(['provisioned', 'deploying']))
+            filter(Node.status.in_(['deploying']))

         return q_nodes_to_error
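
Dropping 'provisioned' from this filter means a failed deployment no longer
flips untouched provisioned nodes into the error state; together with the
widened needs_redeploy property in the first file, such nodes are simply
picked up again on the next deployment attempt. A reduced sketch of the
interplay (simplified stand-ins, not the real query code):

    def nodes_to_deployment_error(nodes):
        return [n for n in nodes if n['status'] == 'deploying']

    def needs_redeploy(node):
        return node['status'] in ['error', 'provisioned']

    nodes = [{'status': 'deploying'}, {'status': 'provisioned'}]
    # Only the node that was actually deploying is flipped to error ...
    assert len(nodes_to_deployment_error(nodes)) == 1
    # ... while a provisioned node still qualifies for redeployment.
    assert needs_redeploy({'status': 'provisioned'})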

View File

@@ -643,7 +643,7 @@ class CheckBeforeDeploymentTask(object):
     def _is_disk_checking_required(cls, node):
         """Disk checking required in case if node is not provisioned.
         """
-        if node.status in ('ready', 'deploying') or \
+        if node.status in ('ready', 'deploying', 'provisioned') or \
                 (node.status == 'error' and node.error_type != 'provision'):
             return False
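
Per the docstring, disk checking only matters for nodes that have not been
provisioned yet, so 'provisioned' joins the early-return list. A reduced,
self-contained sketch of the predicate (status strings as in the hunk above;
the trailing default is assumed from the tests below):

    def is_disk_checking_required(status, error_type=None):
        if status in ('ready', 'deploying', 'provisioned') or \
                (status == 'error' and error_type != 'provision'):
            return False
        return True

    assert is_disk_checking_required('discover')
    assert not is_disk_checking_required('provisioned')
    assert is_disk_checking_required('error', error_type='provision')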

View File

@@ -171,11 +171,19 @@ class TestHandlers(BaseIntegrationTest):
             'compute': 700
         }

+        critical_mapping = {
+            'primary-controller': True,
+            'controller': False,
+            'cinder': False,
+            'compute': False
+        }
+
         deployment_info = []

         for node in nodes_db:
             ips = assigned_ips[node.id]
             for role in sorted(node.roles):
                 priority = priority_mapping[role]
+                is_critical = critical_mapping[role]
                 if isinstance(priority, list):
                     priority = priority.pop()
@@ -184,6 +192,7 @@
                     'status': node.status,
                     'role': role,
                     'online': node.online,
+                    'fail_if_error': is_critical,
                     'fqdn': 'node-%d.%s' % (node.id, settings.DNS_DOMAIN),
                     'priority': priority,
@@ -219,6 +228,7 @@
             lambda node: node['role'] == 'controller',
             deployment_info)
         controller_nodes[0]['role'] = 'primary-controller'
+        controller_nodes[0]['fail_if_error'] = True

         supertask = self.env.launch_deployment()
         deploy_task_uuid = [x.uuid for x in supertask.subtasks
@@ -515,11 +525,20 @@ class TestHandlers(BaseIntegrationTest):
             'cinder': 700,
             'compute': 700
         }

+        critical_mapping = {
+            'primary-controller': True,
+            'controller': False,
+            'cinder': False,
+            'compute': False
+        }
+
         deployment_info = []

         for node in nodes_db:
             ips = assigned_ips[node.id]
             for role in sorted(node.roles):
                 priority = priority_mapping[role]
+                is_critical = critical_mapping[role]
                 if isinstance(priority, list):
                     priority = priority.pop()
@@ -528,6 +547,7 @@
                     'status': node.status,
                     'role': role,
                     'online': node.online,
+                    'fail_if_error': is_critical,
                     'fqdn': 'node-%d.%s' % (node.id, settings.DNS_DOMAIN),
                     'priority': priority,
@@ -621,6 +641,7 @@
             lambda node: node['role'] == 'controller',
             deployment_info)
         controller_nodes[0]['role'] = 'primary-controller'
+        controller_nodes[0]['fail_if_error'] = True

         supertask = self.env.launch_deployment()
         deploy_task_uuid = [x.uuid for x in supertask.subtasks

View File

@@ -281,6 +281,27 @@ class TestNovaOrchestratorSerializer(OrchestratorSerializerTestBase):
         ]
         self.assertEqual(expected_priorities, nodes)

+    def test_set_critital_node(self):
+        nodes = [
+            {'role': 'mongo'},
+            {'role': 'mongo'},
+            {'role': 'primary-mongo'},
+            {'role': 'controller'},
+            {'role': 'ceph-osd'},
+            {'role': 'other'}
+        ]
+
+        serializer = DeploymentMultinodeSerializer()
+        serializer.set_critical_nodes(self.cluster, nodes)
+
+        expected_ciritial_roles = [
+            {'role': 'mongo', 'fail_if_error': False},
+            {'role': 'mongo', 'fail_if_error': False},
+            {'role': 'primary-mongo', 'fail_if_error': True},
+            {'role': 'controller', 'fail_if_error': True},
+            {'role': 'ceph-osd', 'fail_if_error': True},
+            {'role': 'other', 'fail_if_error': False}
+        ]
+        self.assertEqual(expected_ciritial_roles, nodes)
+

 class TestNovaOrchestratorHASerializer(OrchestratorSerializerTestBase):
@@ -382,6 +403,36 @@ class TestNovaOrchestratorHASerializer(OrchestratorSerializerTestBase):
         ]
         self.assertEqual(expected_priorities, nodes)

+    def test_set_critital_node(self):
+        nodes = [
+            {'role': 'zabbix-server'},
+            {'role': 'primary-swift-proxy'},
+            {'role': 'swift-proxy'},
+            {'role': 'storage'},
+            {'role': 'mongo'},
+            {'role': 'primary-mongo'},
+            {'role': 'primary-controller'},
+            {'role': 'controller'},
+            {'role': 'controller'},
+            {'role': 'ceph-osd'},
+            {'role': 'other'}
+        ]
+
+        self.serializer.set_critical_nodes(self.cluster, nodes)
+
+        expected_ciritial_roles = [
+            {'role': 'zabbix-server', 'fail_if_error': False},
+            {'role': 'primary-swift-proxy', 'fail_if_error': True},
+            {'role': 'swift-proxy', 'fail_if_error': False},
+            {'role': 'storage', 'fail_if_error': False},
+            {'role': 'mongo', 'fail_if_error': False},
+            {'role': 'primary-mongo', 'fail_if_error': True},
+            {'role': 'primary-controller', 'fail_if_error': True},
+            {'role': 'controller', 'fail_if_error': False},
+            {'role': 'controller', 'fail_if_error': False},
+            {'role': 'ceph-osd', 'fail_if_error': True},
+            {'role': 'other', 'fail_if_error': False}
+        ]
+        self.assertEqual(expected_ciritial_roles, nodes)
+
     def test_set_primary_controller_priority_not_depend_on_nodes_order(self):
         controllers = filter(lambda n: 'controller' in n.roles, self.env.nodes)
         expected_primary_controller = sorted(

View File

@@ -194,6 +194,9 @@ class TestCheckBeforeDeploymentTask(BaseTestCase):
         self.set_node_status('discover')
         self.assertTrue(self.is_checking_required())

+        self.set_node_status('provisioned')
+        self.assertFalse(self.is_checking_required())
+
     def test_is_disk_checking_required_in_case_of_error(self):
         self.set_node_status('error')
         self.set_node_error_type('provision')

View File

@@ -98,6 +98,27 @@ class TestTaskHelpers(BaseTestCase):
         computes = self.filter_by_role(nodes, 'compute')
         self.assertEqual(len(computes), 1)

+    def test_redeploy_with_critial_roles(self):
+        cluster = self.create_env([
+            {'roles': ['controller'], 'status': 'error'},
+            {'roles': ['controller'], 'status': 'provisioned'},
+            {'roles': ['controller'], 'status': 'provisioned'},
+            {'roles': ['compute', 'cinder'], 'status': 'provisioned'},
+            {'roles': ['compute'], 'status': 'provisioned'},
+            {'roles': ['cinder'], 'status': 'provisioned'}])
+
+        nodes = TaskHelper.nodes_to_deploy(cluster)
+        self.assertEqual(len(nodes), 6)
+
+        controllers = self.filter_by_role(nodes, 'controller')
+        self.assertEqual(len(controllers), 3)
+
+        cinders = self.filter_by_role(nodes, 'cinder')
+        self.assertEqual(len(cinders), 2)
+
+        computes = self.filter_by_role(nodes, 'compute')
+        self.assertEqual(len(computes), 2)
+
     # TODO(aroma): move it to utils testing code
     def test_recalculate_deployment_task_progress(self):
         cluster = self.create_env([