From ef2ace7f655a888a9aec6430e1d5c2071dda44f1 Mon Sep 17 00:00:00 2001
From: Anton Studenov
Date: Fri, 20 May 2016 14:15:50 +0300
Subject: [PATCH] Add timeout_msg values in fuelweb_test and system_test

- added waiters for node status
- removed redundant try/except/assert

Related blueprint: fuel-qa-failure-reason-grouping

Change-Id: I06ccac42856712d1f9afe64402d34aeba3c07af6
---
 fuelweb_test/helpers/checkers.py        |  21 +--
 fuelweb_test/helpers/fuel_actions.py    |   4 +-
 fuelweb_test/helpers/nessus.py          |   4 +-
 fuelweb_test/helpers/rally.py           |   5 +-
 fuelweb_test/models/environment.py      |  37 ++---
 fuelweb_test/models/fuel_web_client.py  | 171 ++++++++++--------------
 system_test/actions/base.py             |   3 +-
 system_test/actions/strength_actions.py |  57 ++++----
 8 files changed, 145 insertions(+), 157 deletions(-)

diff --git a/fuelweb_test/helpers/checkers.py b/fuelweb_test/helpers/checkers.py
index cc5254b1f..d9b62920d 100644
--- a/fuelweb_test/helpers/checkers.py
+++ b/fuelweb_test/helpers/checkers.py
@@ -256,7 +256,8 @@ def enable_feature_group(env, group):
         except (HTTPError, URLError):
             return False
 
-    wait(check_api_group_enabled, interval=10, timeout=60 * 20)
+    wait(check_api_group_enabled, interval=10, timeout=60 * 20,
+         timeout_msg='Failed to enable feature group - {!r}'.format(group))
 
 
 def find_backup(ip):
@@ -331,13 +332,13 @@ def check_mysql(ip, node_name):
                         " information_schema.GLOBAL_STATUS"
                         " WHERE VARIABLE_NAME"
                         " = 'wsrep_local_state_comment';\"")
-    try:
-        wait(lambda: ssh_manager.execute(ip, check_cmd)['exit_code'] == 0,
-             timeout=10 * 60)
-        logger.info('MySQL daemon is started on {0}'.format(node_name))
-    except TimeoutError:
-        logger.error('MySQL daemon is down on {0}'.format(node_name))
-        raise
+
+    wait(lambda: ssh_manager.execute(ip, check_cmd)['exit_code'] == 0,
+         timeout=10 * 60,
+         timeout_msg='MySQL daemon is down on {0}'.format(node_name))
+    logger.info('MySQL daemon is started on {0}'.format(node_name))
+
+    # TODO(astudenov): add timeout_msg
     _wait(
         lambda: assert_equal(
             ssh_manager.execute(
@@ -348,7 +349,9 @@ def check_mysql(ip, node_name):
         timeout=120)
     try:
         wait(lambda: ''.join(ssh_manager.execute(
-            ip, check_galera_cmd)['stdout']).rstrip() == 'Synced', timeout=600)
+            ip, check_galera_cmd)['stdout']).rstrip() == 'Synced', timeout=600,
+            timeout_msg='galera status != "Synced" on node {!r} with ip {}'
+                        ''.format(node_name, ip))
     except TimeoutError:
         logger.error('galera status is {0}'.format(''.join(ssh_manager.execute(
             ip, check_galera_cmd)['stdout']).rstrip()))
diff --git a/fuelweb_test/helpers/fuel_actions.py b/fuelweb_test/helpers/fuel_actions.py
index d98ffb05f..5bf3fefcb 100644
--- a/fuelweb_test/helpers/fuel_actions.py
+++ b/fuelweb_test/helpers/fuel_actions.py
@@ -200,7 +200,9 @@ class AdminActions(BaseActions):
 
     @logwrap
     def wait_for_fuel_ready(self, timeout=300):
-        wait(lambda: self.is_fuel_ready, timeout=timeout)
+        wait(lambda: self.is_fuel_ready, timeout=timeout,
+             timeout_msg="Fuel services are not ready, please check the "
+                         "output of 'fuel-utils check_all'")
 
     @logwrap
     @retry()
diff --git a/fuelweb_test/helpers/nessus.py b/fuelweb_test/helpers/nessus.py
index 26f865b29..0281775fe 100644
--- a/fuelweb_test/helpers/nessus.py
+++ b/fuelweb_test/helpers/nessus.py
@@ -168,7 +168,9 @@ class NessusClient(object):
         file_id = self.post('/scans/{0}/export'.format(scan_id),
                             body=export_def)['file']
         wait(lambda: self.check_scan_export_status(scan_id, file_id),
-             interval=10, timeout=600)
+             interval=10, timeout=600,
+             timeout_msg='Nessus export scan status != "ready" for '
+                         'scan_id={} file_id={}'.format(scan_id, file_id))
         return file_id
 
     def download_scan_result(
diff --git a/fuelweb_test/helpers/rally.py b/fuelweb_test/helpers/rally.py
index e5d7330a6..90269555c 100644
--- a/fuelweb_test/helpers/rally.py
+++ b/fuelweb_test/helpers/rally.py
@@ -297,7 +297,8 @@ class RallyTask(object):
         cmd = ("awk 'BEGIN{{retval=1}};/^Using task:/{{print $NF; retval=0}};"
                "END {{exit retval}}' {0}").format(temp_file)
         wait(lambda: self.engine.run_container_command(cmd)['exit_code'] == 0,
-             timeout=30)
+             timeout=30, timeout_msg='Rally task {!r} creation timeout'
+                                     ''.format(result))
         result = self.engine.run_container_command(cmd)
         task_uuid = ''.join(result['stdout']).strip()
         assert_true(task_uuid in self.engine.list_tasks(),
@@ -421,6 +422,6 @@ class RallyBenchmarkTest(object):
                     '{0}'.format(self.current_task.status))
         if result:
             wait(lambda: self.current_task.status == 'finished',
-                 timeout=timeout)
+                 timeout=timeout, timeout_msg='Rally benchmark test timeout')
             logger.info('Rally benchmark test is finished.')
             return RallyResult(json_results=self.current_task.get_results())
diff --git a/fuelweb_test/models/environment.py b/fuelweb_test/models/environment.py
index d212bb676..af9ac07c6 100644
--- a/fuelweb_test/models/environment.py
+++ b/fuelweb_test/models/environment.py
@@ -16,7 +16,6 @@
 import re
 import subprocess
 import time
-from devops.error import TimeoutError
 from devops.helpers.helpers import _tcp_ping
 from devops.helpers.helpers import _wait
 from devops.helpers.helpers import wait
@@ -121,7 +120,9 @@ class EnvironmentModel(object):
             time.sleep(5)
 
         with TimeStat("wait_for_nodes_to_start_and_register_in_nailgun"):
-            wait(lambda: all(self.nailgun_nodes(devops_nodes)), 15, timeout)
+            wait(lambda: all(self.nailgun_nodes(devops_nodes)), 15, timeout,
+                 timeout_msg='Bootstrap timeout for nodes: {}'
+                             ''.format([node.name for node in devops_nodes]))
 
         if not skip_timesync:
             self.sync_time()
@@ -287,14 +288,7 @@ class EnvironmentModel(object):
             # Bug: 1455753
             time.sleep(30)
 
-        for node in devops_nodes:
-            try:
-                wait(lambda:
-                     self.fuel_web.get_nailgun_node_by_devops_node(
-                         node)['online'], timeout=60 * 6)
-            except TimeoutError:
-                raise TimeoutError(
-                    "Node {0} does not become online".format(node.name))
+        self.fuel_web.wait_nodes_get_online_state(devops_nodes, timeout=60 * 6)
         return True
 
     def revert_snapshot(self, name, skip_timesync=False,
@@ -314,6 +308,7 @@ class EnvironmentModel(object):
             self.sync_time()
         try:
             with QuietLogger():
+                # TODO(astudenov): add timeout_msg
                 _wait(
                     self.fuel_web.client.get_releases,
                     expected=(
@@ -327,6 +322,7 @@ class EnvironmentModel(object):
             logger.exception(
                 'Unexpected exception while tried to get releases')
         if not skip_slaves_check:
+            # TODO(astudenov): add timeout_msg
             _wait(lambda: self.check_slaves_are_ready(), timeout=60 * 6)
         return True
 
@@ -407,7 +403,8 @@ class EnvironmentModel(object):
         self.d_env.start([admin])
 
         logger.info("Waiting for admin node to start up")
-        wait(lambda: admin.driver.node_active(admin), 60)
+        wait(lambda: admin.driver.node_active(admin), 60,
+             timeout_msg='Admin node startup timeout')
         logger.info("Proceed with installation")
         # update network parameters at boot screen
         admin.send_keys(self.get_keys(admin, custom=custom,
@@ -452,7 +449,9 @@ class EnvironmentModel(object):
         wait(lambda: (
             self.ssh_manager.execute_on_remote(
                 admin_node_ip, cmd)['stdout'][0] != 'dead'), interval=10,
-            timeout=30)
+            timeout=30,
+            timeout_msg='Nginx service is dead after trying to enable '
+                        'it with the command: {}'.format(cmd))
 
     # pylint: disable=no-self-use
     @update_rpm_packages
@@ -465,6 +464,7 @@ class EnvironmentModel(object):
     @logwrap
     def wait_for_provisioning(self,
                               timeout=settings.WAIT_FOR_PROVISIONING_TIMEOUT):
+        # TODO(astudenov): add timeout_msg
         _wait(lambda: _tcp_ping(
             self.d_env.nodes(
             ).admin.get_ip_address_by_network_name
@@ -496,14 +496,19 @@ class EnvironmentModel(object):
         wait(lambda: self.ssh_manager.exists_on_remote(
             self.ssh_manager.admin_ip,
             '/var/lock/wait_for_external_config'),
-            timeout=600)
+            timeout=600,
+            timeout_msg='wait_for_external_config lock file timeout '
+                        'while bootstrapping the Fuel master node')
 
         check_cmd = 'pkill -0 -f wait_for_external_config'
 
         wait(
             lambda: self.ssh_manager.execute(
                 ip=self.ssh_manager.admin_ip,
-                cmd=check_cmd)['exit_code'] == 0, timeout=timeout)
+                cmd=check_cmd)['exit_code'] == 0,
+            timeout=timeout,
+            timeout_msg='wait_for_external_config process timeout '
+                        'while bootstrapping the Fuel master node')
 
     @logwrap
     def kill_wait_for_external_config(self):
@@ -529,7 +534,9 @@ class EnvironmentModel(object):
                 ip=self.ssh_manager.admin_ip,
                 cmd="grep 'Fuel node deployment' '{:s}'".format(log_path)
             )['exit_code'] == 0,
-            timeout=(float(settings.ADMIN_NODE_BOOTSTRAP_TIMEOUT))
+            timeout=(float(settings.ADMIN_NODE_BOOTSTRAP_TIMEOUT)),
+            timeout_msg='Fuel master node bootstrap timeout, '
+                        'please check the log {}'.format(log_path)
         )
         result = self.ssh_manager.execute(
             ip=self.ssh_manager.admin_ip,
diff --git a/fuelweb_test/models/fuel_web_client.py b/fuelweb_test/models/fuel_web_client.py
index 6fc255416..72f82adf0 100644
--- a/fuelweb_test/models/fuel_web_client.py
+++ b/fuelweb_test/models/fuel_web_client.py
@@ -136,7 +136,9 @@ class FuelWebClient29(object):
             lambda: all([run['status'] == 'finished'
                          for run
                          in self.client.get_ostf_test_run(cluster_id)]),
-            timeout=timeout)
+            timeout=timeout,
+            timeout_msg='OSTF tests run timeout '
+                        '(cluster_id={})'.format(cluster_id))
         return self.client.get_ostf_test_run(cluster_id)
 
     @logwrap
@@ -161,6 +163,7 @@ class FuelWebClient29(object):
     def assert_cluster_ready(self, os_conn, smiles_count,
                              networks_count=2, timeout=300):
         logger.info('Assert cluster services are UP')
+        # TODO(astudenov): add timeout_msg
         _wait(
             lambda: self.get_cluster_status(
                 os_conn,
@@ -177,6 +180,7 @@ class FuelWebClient29(object):
         logger.info('Waiting {0} sec. for passed OSTF HA tests.'
                     .format(timeout))
         with QuietLogger(logging.ERROR):
+            # TODO(astudenov): add timeout_msg
             _wait(lambda:
                   self.run_ostf(cluster_id, test_sets=['ha'],
                                 should_fail=should_fail),
@@ -194,6 +198,7 @@ class FuelWebClient29(object):
         logger.info('Waiting {0} sec. for passed OSTF Sanity checks.'
                     .format(timeout))
         with QuietLogger():
+            # TODO(astudenov): add timeout_msg
             _wait(lambda:
                   self.run_ostf(cluster_id, test_sets=['sanity'],
                                 should_fail=should_fail),
@@ -398,6 +403,24 @@ class FuelWebClient29(object):
 
         checkers.fail_deploy(not_ready_transactions)
 
+    def wait_node_is_online(self, devops_node, timeout=60 * 5):
+        logger.info(
+            'Wait for node {!r} online status'.format(devops_node.name))
+        wait(lambda: self.get_nailgun_node_by_devops_node(
+            devops_node)['online'],
+            timeout=timeout,
+            timeout_msg='Node {!r} failed to become online'
+                        ''.format(devops_node.name))
+
+    def wait_node_is_offline(self, devops_node, timeout=60 * 5):
+        logger.info(
+            'Wait for node {!r} offline status'.format(devops_node.name))
+        wait(lambda: not self.get_nailgun_node_by_devops_node(
+            devops_node)['online'],
+            timeout=timeout,
+            timeout_msg='Node {!r} failed to become offline'
+                        ''.format(devops_node.name))
+
     @logwrap
     def fqdn(self, devops_node):
         logger.info('Get FQDN of a devops node %s', devops_node.name)
@@ -1270,6 +1293,14 @@ class FuelWebClient29(object):
                 node['mac'] == nailgun_node['mac'] and
                 node['status'] == 'discover', self.client.list_nodes()))
 
+    def wait_node_is_discovered(self, nailgun_node, timeout=6 * 60):
+        logger.info('Wait for node {!r} to become discovered'
+                    ''.format(nailgun_node['name']))
+        wait(lambda: self.is_node_discovered(nailgun_node),
+             timeout=timeout,
+             timeout_msg='Node {!r} failed to become discovered'
+                         ''.format(nailgun_node['name']))
+
     @logwrap
     def run_network_verify(self, cluster_id):
         logger.info('Run network verification on the cluster %s', cluster_id)
@@ -1334,17 +1365,15 @@ class FuelWebClient29(object):
         logger.info('Wait for task {0} seconds: {1}'.format(
             timeout, pretty_log(task, indent=1)))
         start = time.time()
-        try:
-            wait(
-                lambda: (self.client.get_task(task['id'])['status']
-                         not in ('pending', 'running')),
-                interval=interval,
-                timeout=timeout
-            )
-        except TimeoutError:
-            raise TimeoutError(
-                "Waiting task \"{task}\" timeout {timeout} sec "
-                "was exceeded: ".format(task=task["name"], timeout=timeout))
+
+        wait(
+            lambda: (self.client.get_task(task['id'])['status']
+                     not in ('pending', 'running')),
+            interval=interval,
+            timeout=timeout,
+            timeout_msg='Waiting task {0!r} timeout {1} sec '
+                        'was exceeded'.format(task['name'], timeout))
+
         took = time.time() - start
         task = self.client.get_task(task['id'])
         logger.info('Task finished. Took {0} seconds. {1}'.format(
             took, pretty_log(task, indent=1)))
         return task
 
     @logwrap
     def task_wait_progress(self, task, timeout, interval=5, progress=None):
-        try:
-            logger.info(
-                'start to wait with timeout {0} '
-                'interval {1}'.format(timeout, interval))
-            wait(
-                lambda: self.client.get_task(
-                    task['id'])['progress'] >= progress,
-                interval=interval,
-                timeout=timeout
-            )
-        except TimeoutError:
-            raise TimeoutError(
-                "Waiting task \"{task}\" timeout {timeout} sec "
-                "was exceeded: ".format(task=task["name"], timeout=timeout))
-
+        logger.info('start to wait with timeout {0} '
+                    'interval {1}'.format(timeout, interval))
+        wait(
+            lambda: self.client.get_task(
+                task['id'])['progress'] >= progress,
+            interval=interval,
+            timeout=timeout,
+            timeout_msg='Waiting task {0!r} timeout {1} sec '
+                        'was exceeded'.format(task["name"], timeout))
         return self.client.get_task(task['id'])
 
     # TODO(ddmitriev): this method will be replaced
@@ -1415,12 +1438,8 @@ class FuelWebClient29(object):
             node_group, node_roles = self.get_node_group_and_role(node_name,
                                                                    nodes_dict)
-            wait(lambda:
-                 self.get_nailgun_node_by_devops_node(devops_node)['online'],
-                 timeout=60 * 2)
+            self.wait_node_is_online(devops_node, timeout=60 * 2)
             node = self.get_nailgun_node_by_devops_node(devops_node)
-            assert_true(node['online'],
-                        'Node {0} is offline'.format(node['mac']))
 
             if custom_names:
                 name = custom_names.get(node_name,
@@ -1979,33 +1998,14 @@ class FuelWebClient29(object):
                 remote.check_call('/sbin/shutdown -Ph now')
 
         for node in devops_nodes:
-            logger.info('Wait a %s node offline status', node.name)
-            try:
-                wait(
-                    lambda: not self.get_nailgun_node_by_devops_node(node)[
-                        'online'], timeout=60 * 10)
-            except TimeoutError:
-                assert_false(
-                    self.get_nailgun_node_by_devops_node(node)['online'],
-                    'Node {0} has not become '
-                    'offline after warm shutdown'.format(node.name))
+            self.wait_node_is_offline(node)
             node.destroy()
 
     def warm_start_nodes(self, devops_nodes):
         logger.info('Starting nodes %s', [n.name for n in devops_nodes])
         for node in devops_nodes:
            node.create()
-        for node in devops_nodes:
-            try:
-                wait(
-                    lambda: self.get_nailgun_node_by_devops_node(
-                        node)['online'], timeout=60 * 10)
-            except TimeoutError:
-                assert_true(
-                    self.get_nailgun_node_by_devops_node(node)['online'],
-                    'Node {0} has not become online '
-                    'after warm start'.format(node.name))
-            logger.debug('Node {0} became online.'.format(node.name))
+        self.wait_nodes_get_online_state(devops_nodes)
 
     def warm_restart_nodes(self, devops_nodes):
         logger.info('Reboot (warm restart) nodes %s',
                     [n.name for n in devops_nodes])
             node.destroy()
         for node in devops_nodes:
             if wait_offline:
-                logger.info('Wait a %s node offline status', node.name)
-                try:
-                    wait(lambda: not self.get_nailgun_node_by_devops_node(
-                        node)['online'], timeout=60 * 10)
-                except TimeoutError:
-                    assert_false(
-                        self.get_nailgun_node_by_devops_node(node)['online'],
-                        'Node {0} has not become offline after '
-                        'cold restart'.format(node.name))
+                self.wait_node_is_offline(node)
+
         if wait_after_destroy:
             time.sleep(wait_after_destroy)
 
         for node in devops_nodes:
             node.create()
         if wait_online:
             for node in devops_nodes:
-                try:
-                    wait(
-                        lambda: self.get_nailgun_node_by_devops_node(
-                            node)['online'], timeout=60 * 10)
-                except TimeoutError:
-                    assert_true(
-                        self.get_nailgun_node_by_devops_node(node)['online'],
-                        'Node {0} has not become online'
-                        ' after cold start'.format(node.name))
+                self.wait_node_is_online(node)
         self.environment.sync_time()
 
     @logwrap
@@ -2168,18 +2153,7 @@ class FuelWebClient29(object):
     @logwrap
     def wait_nodes_get_online_state(self, nodes, timeout=4 * 60):
         for node in nodes:
-            logger.info('Wait for %s node online status', node.name)
-            try:
-                wait(lambda:
-                     self.get_nailgun_node_by_devops_node(node)['online'],
-                     timeout=timeout)
-            except TimeoutError:
-                assert_true(
-                    self.get_nailgun_node_by_devops_node(node)['online'],
-                    'Node {0} has not become online'.format(node.name))
-            node = self.get_nailgun_node_by_devops_node(node)
-            assert_true(node['online'],
-                        'Node {0} is online'.format(node['mac']))
+            self.wait_node_is_online(node, timeout=timeout)
 
     @logwrap
     def wait_mysql_galera_is_up(self, node_names, timeout=60 * 4):
@@ -2214,14 +2188,10 @@ class FuelWebClient29(object):
         logger.info("Waiting for all Cinder services up.")
         for node_name in node_names:
             node = self.get_nailgun_node_by_name(node_name)
-            try:
-                wait(lambda: checkers.check_cinder_status(node['ip']),
-                     timeout=300)
-                logger.info("All Cinder services up.")
-            except TimeoutError:
-                logger.error("Cinder services not ready.")
-                raise TimeoutError(
-                    "Cinder services not ready. ")
+            wait(lambda: checkers.check_cinder_status(node['ip']),
+                 timeout=300,
+                 timeout_msg='Cinder services not ready')
+            logger.info("All Cinder services up.")
         return True
 
     def run_ostf_repeatably(self, cluster_id, test_name=None,
@@ -2315,7 +2285,8 @@ class FuelWebClient29(object):
                         "on node %s", fqdn)
             ceph.restart_monitor(remote_to_mon)
 
-        wait(lambda: not ceph.is_clock_skew(remote), timeout=120)
+        wait(lambda: not ceph.is_clock_skew(remote), timeout=120,
+             timeout_msg='Ceph time skew is not fixed')
 
     @logwrap
     def check_ceph_status(self, cluster_id, offline_nodes=(),
@@ -2329,14 +2300,11 @@ class FuelWebClient29(object):
         for node in online_ceph_nodes:
             with self.environment.d_env\
                     .get_ssh_to_remote(node['ip']) as remote:
-                try:
-                    wait(lambda: ceph.check_service_ready(remote) is True,
-                         interval=20, timeout=600)
-                except TimeoutError:
-                    error_msg = 'Ceph service is not properly started' \
-                                ' on {0}'.format(node['name'])
-                    logger.error(error_msg)
-                    raise TimeoutError(error_msg)
+
+                wait(lambda: ceph.check_service_ready(remote) is True,
+                     interval=20, timeout=600,
+                     timeout_msg='Ceph service is not properly started'
+                                 ' on {0}'.format(node['name']))
 
         logger.info('Ceph service is ready. Checking Ceph Health...')
         self.check_ceph_time_skew(cluster_id, offline_nodes)
@@ -2753,7 +2721,8 @@ class FuelWebClient29(object):
         for osd_id in ids:
             remote_ceph.execute("ceph osd out {}".format(osd_id))
             wait(lambda: ceph.is_health_ok(remote_ceph),
-                 interval=30, timeout=10 * 60)
+                 interval=30, timeout=10 * 60,
+                 timeout_msg='Ceph health is not OK')
         for osd_id in ids:
             if OPENSTACK_RELEASE_UBUNTU in OPENSTACK_RELEASE:
                 remote_ceph.execute("stop ceph-osd id={}".format(osd_id))
diff --git a/system_test/actions/base.py b/system_test/actions/base.py
index 3df837429..634dadf56 100644
--- a/system_test/actions/base.py
+++ b/system_test/actions/base.py
@@ -174,7 +174,8 @@ class PrepareActions(object):
             device='cdrom').volume.upload(cloud_image_settings_path)
         self.env.d_env.start([admin])
         logger.info("Waiting for Centos node to start up")
-        wait(lambda: admin.driver.node_active(admin), 60)
+        wait(lambda: admin.driver.node_active(admin), 60,
+             timeout_msg='Centos node failed to start')
         logger.info("Waiting for Centos node ssh ready")
         self.env.wait_for_provisioning()
 
diff --git a/system_test/actions/strength_actions.py b/system_test/actions/strength_actions.py
index 0cef2ca7b..aa002adf0 100644
--- a/system_test/actions/strength_actions.py
+++ b/system_test/actions/strength_actions.py
@@ -63,7 +63,8 @@ class StrengthActions(object):
             n_nodes = [node['online'] for node in n_nodes]
             return n_nodes.count(False) == 0
 
-        wait(wait_offline_nodes, timeout=60 * 5)
+        wait(wait_offline_nodes, timeout=60 * 5,
+             timeout_msg='Nodes failed to become offline')
 
     @deferred_decorator([make_snapshot_if_step_fail])
     @action
@@ -284,25 +285,26 @@ class FillRootActions(object):
                 'resources_running'] == '0'
 
         wait(checking_health_disk_attribute,
-             "Attribute #health_disk wasn't appeared "
-             "in attributes on node {} in {} seconds".format(
-                 self.primary_controller_fqdn,
-                 self.pcs_check_timeout),
-             timeout=self.pcs_check_timeout)
+             timeout=self.pcs_check_timeout,
+             timeout_msg="Attribute #health_disk did not appear "
+                         "in attributes on node {} in {} seconds".format(
+                             self.primary_controller_fqdn,
+                             self.pcs_check_timeout))
 
         wait(checking_for_red_in_health_disk_attribute,
-             "Attribute #health_disk doesn't have 'red' value "
-             "on node {} in {} seconds".format(
-                 self.primary_controller_fqdn,
-                 self.pcs_check_timeout),
-             timeout=self.pcs_check_timeout)
+             timeout=self.pcs_check_timeout,
+             timeout_msg="Attribute #health_disk doesn't have 'red' value "
+                         "on node {} in {} seconds".format(
+                             self.primary_controller_fqdn,
+                             self.pcs_check_timeout))
 
         wait(check_stopping_resources,
-             "Attribute 'running_resources' doesn't have '0' value "
-             "on node {} in {} seconds".format(
-                 self.primary_controller_fqdn,
-                 self.pcs_check_timeout),
-             timeout=self.pcs_check_timeout)
+             timeout=self.pcs_check_timeout,
+             timeout_msg="Attribute 'running_resources' "
+                         "doesn't have '0' value "
+                         "on node {} in {} seconds".format(
+                             self.primary_controller_fqdn,
+                             self.pcs_check_timeout))
 
     @deferred_decorator([make_snapshot_if_step_fail])
     @action
@@ -363,16 +365,17 @@ class FillRootActions(object):
                 'resources_running'] == self.slave_node_running_resources
 
         wait(checking_health_disk_attribute_is_not_present,
-             "Attribute #health_disk was appeared in attributes "
-             "on node {} in {} seconds".format(
-                 self.primary_controller_fqdn,
-                 self.pcs_check_timeout),
-             timeout=self.pcs_check_timeout)
+             timeout=self.pcs_check_timeout,
+             timeout_msg="Attribute #health_disk is still present "
+                         "in attributes on node {} in {} seconds".format(
+                             self.primary_controller_fqdn,
+                             self.pcs_check_timeout))
 
         wait(check_started_resources,
-             "Attribute 'running_resources' doesn't have {} value "
-             "on node {} in {} seconds".format(
-                 self.slave_node_running_resources,
-                 self.primary_controller_fqdn,
-                 self.pcs_check_timeout),
-             self.pcs_check_timeout)
+             timeout=self.pcs_check_timeout,
+             timeout_msg="Attribute 'running_resources' "
+                         "doesn't have {} value "
+                         "on node {} in {} seconds".format(
+                             self.slave_node_running_resources,
+                             self.primary_controller_fqdn,
+                             self.pcs_check_timeout))
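
The pattern applied throughout this change is to drop the try/except TimeoutError plus assert wrappers and let wait() itself report the failure through timeout_msg. A minimal sketch of the resulting style in test code, for reference only: it assumes devops.helpers.helpers.wait raises TimeoutError carrying the supplied timeout_msg when the predicate never becomes true (which is what the hunks above rely on), and the helper name and the remote object below are illustrative, not part of the patch.

    from devops.helpers.helpers import wait

    def wait_nginx_is_up(remote, timeout=60 * 5):
        # The predicate is polled every 10 seconds; when the timeout expires,
        # the raised TimeoutError carries timeout_msg, so failures can be
        # grouped by reason (related blueprint:
        # fuel-qa-failure-reason-grouping).
        wait(lambda: remote.execute(
                 'pkill -0 -f nginx')['exit_code'] == 0,
             interval=10, timeout=timeout,
             timeout_msg='Nginx did not start in {0} seconds'.format(timeout))

Note that the timeout_msg string is built eagerly (str.format runs before wait() is called), so it may only reference variables that are already defined at the call site.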