From 5b63958105d73eadac9c598982e68ef35defa4a1 Mon Sep 17 00:00:00 2001 From: Emilien Macchi Date: Tue, 16 Jun 2020 10:34:44 -0400 Subject: [PATCH] overcloud_deploy: raise at the end if Ansible Runner had exception In the context of MaxFailPercentage, if a node (e.g. one of 100 computes) has any fatal error during an Ansible play and we tolerate some percentage of failure, we want to raise the error, but only at the very end of the deployment. So this patch puts the following actions in a "finally" block so they always execute: * Create overcloudrc * Set the right permissions on the clouds.yaml * Execute _deploy_postconfig * Print information such as the Keystone endpoint, Horizon URL, RC file location and deploy message (saying whether there is an error or not) And at the very end we raise the actual AnsibleRunner trace if the deployment failed. So even if a node failed and we tolerate it, we'll still finish the deployment until the end, but for UX purposes we want to raise it at the very end. Note that when it fails and it's tolerated, Ansible prints the node as "ignored": PLAY RECAP ****************************************************************** compute-0 : ok=555 (...) failed=0 skipped=484 rescued=0 ignored=0 compute-1 : ok=60 (...) failed=1 skipped=40 rescued=0 ignored=1 controller-0 : ok=960 (...) failed=0 skipped=709 rescued=0 ignored=0 controller-1 : ok=920 (...) failed=0 skipped=693 rescued=0 ignored=0 controller-2 : ok=919 (...) failed=0 skipped=693 rescued=0 ignored=0 undercloud : ok=86 (...) failed=0 skipped=57 rescued=0 ignored=0 To improve UX, we'll investigate an Ansible callback to properly tell which nodes need to be re-deployed. Note: also mock copy_clouds_yaml since it wasn't tested before and it fails to reach the files on the filesystem. 
Change-Id: I7d733499e74abe2cdf91526df608dc7c273bf19e --- .../overcloud_deploy/test_overcloud_deploy.py | 4 +- tripleoclient/v1/overcloud_deploy.py | 167 ++++++++++-------- 2 files changed, 93 insertions(+), 78 deletions(-) diff --git a/tripleoclient/tests/v1/overcloud_deploy/test_overcloud_deploy.py b/tripleoclient/tests/v1/overcloud_deploy/test_overcloud_deploy.py index 9a23f5aae..d8af0d39c 100644 --- a/tripleoclient/tests/v1/overcloud_deploy/test_overcloud_deploy.py +++ b/tripleoclient/tests/v1/overcloud_deploy/test_overcloud_deploy.py @@ -1404,6 +1404,7 @@ class TestDeployOvercloud(fakes.TestDeployOvercloud): fixture.mock_set_deployment_status.call_args[-1]['status']) mock_copy.assert_called_once() + @mock.patch('tripleoclient.utils.copy_clouds_yaml') @mock.patch('tripleoclient.v1.overcloud_deploy.DeployOvercloud.' '_get_undercloud_host_entry', autospec=True, return_value='192.168.0.1 uc.ctlplane.localhost uc.ctlplane') @@ -1419,7 +1420,8 @@ class TestDeployOvercloud(fakes.TestDeployOvercloud): mock_overcloudrc, mock_overcloud_endpoint, mock_create_tempest_deployer_input, - mock_get_undercloud_host_entry): + mock_get_undercloud_host_entry, + mock_copy): fixture = deployment.DeploymentWorkflowFixture() self.useFixture(fixture) clients = self.app.client_manager diff --git a/tripleoclient/v1/overcloud_deploy.py b/tripleoclient/v1/overcloud_deploy.py index 5d7e3ba61..4f3bc6fa9 100644 --- a/tripleoclient/v1/overcloud_deploy.py +++ b/tripleoclient/v1/overcloud_deploy.py @@ -968,6 +968,9 @@ class DeployOvercloud(command.Command): def take_action(self, parsed_args): self.log.debug("take_action(%s)" % parsed_args) + deploy_status = 'DEPLOY_SUCCESS' + deploy_message = 'without error' + self._setup_clients(parsed_args) # Swiftclient logs things like 404s at error level, which is a problem @@ -1014,94 +1017,104 @@ class DeployOvercloud(command.Command): # wont do anything. 
return - if parsed_args.config_download: - print("Deploying overcloud configuration") + try: + + if parsed_args.config_download: + print("Deploying overcloud configuration") + deployment.set_deployment_status( + clients=self.clients, + plan=stack.stack_name, + status='DEPLOYING' + ) + + if not parsed_args.config_download_only: + deployment.get_hosts_and_enable_ssh_admin( + stack, + parsed_args.overcloud_ssh_network, + parsed_args.overcloud_ssh_user, + self.get_key_pair(parsed_args), + parsed_args.overcloud_ssh_port_timeout, + verbosity=utils.playbook_verbosity(self=self) + ) + + if parsed_args.config_download_timeout: + timeout = parsed_args.config_download_timeout + else: + used = int((time.time() - start) // 60) + timeout = parsed_args.timeout - used + if timeout <= 0: + raise exceptions.DeploymentError( + 'Deployment timed out after %sm' % used) + + deployment_options = {} + if parsed_args.deployment_python_interpreter: + deployment_options['ansible_python_interpreter'] = \ + parsed_args.deployment_python_interpreter + + deployment.config_download( + self.log, + self.clients, + stack, + parsed_args.overcloud_ssh_network, + parsed_args.output_dir, + parsed_args.override_ansible_cfg, + timeout=parsed_args.overcloud_ssh_port_timeout, + verbosity=utils.playbook_verbosity(self=self), + deployment_options=deployment_options, + in_flight_validations=parsed_args.inflight, + deployment_timeout=timeout, + tags=parsed_args.tags, + skip_tags=parsed_args.skip_tags, + limit_hosts=utils.playbook_limit_parse( + limit_nodes=parsed_args.limit + ) + ) deployment.set_deployment_status( clients=self.clients, plan=stack.stack_name, - status='DEPLOYING' + status=deploy_status) + except Exception as deploy_e: + deploy_status = 'DEPLOY_FAILED' + deploy_message = 'with error' + deploy_trace = deploy_e + deployment.set_deployment_status( + clients=self.clients, + plan=stack.stack_name, + status=deploy_status ) + finally: + # Force fetching of attributes + stack.get() - try: - if not 
parsed_args.config_download_only: - deployment.get_hosts_and_enable_ssh_admin( - stack, - parsed_args.overcloud_ssh_network, - parsed_args.overcloud_ssh_user, - self.get_key_pair(parsed_args), - parsed_args.overcloud_ssh_port_timeout, - verbosity=utils.playbook_verbosity(self=self) - ) + rcpath = deployment.create_overcloudrc( + container=stack.stack_name, + no_proxy=parsed_args.no_proxy) - if parsed_args.config_download_timeout: - timeout = parsed_args.config_download_timeout - else: - used = int((time.time() - start) // 60) - timeout = parsed_args.timeout - used - if timeout <= 0: - raise exceptions.DeploymentError( - 'Deployment timed out after %sm' % used) + # Copy clouds.yaml to the cloud user directory + user = \ + getpwuid(os.stat(constants.CLOUD_HOME_DIR).st_uid).pw_name + utils.copy_clouds_yaml(user) + utils.create_tempest_deployer_input() - deployment_options = {} - if parsed_args.deployment_python_interpreter: - deployment_options['ansible_python_interpreter'] = \ - parsed_args.deployment_python_interpreter + # Run postconfig on create or force. 
Use force to makes sure + # endpoints are created with deploy reruns and upgrades + if (stack_create or parsed_args.force_postconfig + and not parsed_args.skip_postconfig): + self._deploy_postconfig(stack, parsed_args) - deployment.config_download( - self.log, - self.clients, - stack, - parsed_args.overcloud_ssh_network, - parsed_args.output_dir, - parsed_args.override_ansible_cfg, - timeout=parsed_args.overcloud_ssh_port_timeout, - verbosity=utils.playbook_verbosity(self=self), - deployment_options=deployment_options, - in_flight_validations=parsed_args.inflight, - deployment_timeout=timeout, - tags=parsed_args.tags, - skip_tags=parsed_args.skip_tags, - limit_hosts=utils.playbook_limit_parse( - limit_nodes=parsed_args.limit - ) - ) - deployment.set_deployment_status( - clients=self.clients, - plan=stack.stack_name, - status='DEPLOY_SUCCESS') - except Exception: - deployment.set_deployment_status( - clients=self.clients, - plan=stack.stack_name, - status='DEPLOY_FAILED' - ) - raise + overcloud_endpoint = utils.get_overcloud_endpoint(stack) - # Force fetching of attributes - stack.get() + horizon_url = deployment.get_horizon_url( + stack=stack.stack_name) - rcpath = deployment.create_overcloudrc(container=stack.stack_name, - no_proxy=parsed_args.no_proxy) + print("Overcloud Endpoint: {0}".format(overcloud_endpoint)) + print("Overcloud Horizon Dashboard URL: {0}".format( + horizon_url)) + print("Overcloud rc file: {0}".format(rcpath)) + print("Overcloud Deployed {0}".format(deploy_message)) - # Copy clouds.yaml to the cloud user directory - user = getpwuid(os.stat(constants.CLOUD_HOME_DIR).st_uid).pw_name - utils.copy_clouds_yaml(user) - utils.create_tempest_deployer_input() - - # Run postconfig on create or force. 
Use force to makes sure endpoints - # are created with deploy reruns and upgrades - if (stack_create or parsed_args.force_postconfig - and not parsed_args.skip_postconfig): - self._deploy_postconfig(stack, parsed_args) - - overcloud_endpoint = utils.get_overcloud_endpoint(stack) - - horizon_url = deployment.get_horizon_url(stack=stack.stack_name) - - print("Overcloud Endpoint: {0}".format(overcloud_endpoint)) - print("Overcloud Horizon Dashboard URL: {0}".format(horizon_url)) - print("Overcloud rc file: {0}".format(rcpath)) - print("Overcloud Deployed") + if deploy_status == 'DEPLOY_FAILED': + raise(deploy_trace) class GetDeploymentStatus(command.Command):