# Copyright 2015 Mirantis, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain # a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. from distutils import version import json import logging from lxml import etree import fuel_health from fuel_health.common import ssh from fuel_health.common.utils import data_utils from fuel_health.test import BaseTestCase LOG = logging.getLogger(__name__) class RabbitSanityClass(BaseTestCase): """TestClass contains RabbitMQ sanity checks.""" @classmethod def setUpClass(cls): cls.config = fuel_health.config.FuelConfig() cls._controllers = cls.config.compute.online_controllers cls.nodes = cls.config.compute.nodes cls._usr = cls.config.compute.controller_node_ssh_user cls._pwd = cls.config.compute.controller_node_ssh_password cls._key = cls.config.compute.path_to_private_key cls._ssh_timeout = cls.config.compute.ssh_timeout cls._password = None cls._userid = None cls.messages = [] cls.queues = [] cls.release_version = \ cls.config.compute.release_version.split('-')[1] @property def password(self): if version.StrictVersion(self.release_version)\ < version.StrictVersion('7.0'): self._password = self.get_conf_values().strip() return self._password if self._password is None: self._password = self.get_hiera_values( hiera_hash='rabbit', hash_key='password' ) # FIXME(mattmymo): Remove this after merging # https://review.openstack.org/#/c/276797/ if not self._password: self._password = self.get_hiera_values( hiera_hash='rabbit_hash', hash_key='password' ) return self._password @property def amqp_hosts_name(self): amqp_hosts_name = {} if version.StrictVersion(self.release_version)\ < version.StrictVersion('7.0'): for controller_ip in self._controllers: amqp_hosts_name[controller_ip] = [controller_ip, '5673'] return amqp_hosts_name nodes = self.get_hiera_values(hiera_hash='network_metadata', hash_key='nodes', json_parse=True) for ip, port in self.get_amqp_hosts(): for node in nodes: ips = [nodes[node]['network_roles'][role] for role in nodes[node]['network_roles']] if ip in ips: nailgun_nodes = [n for n in self.nodes if nodes[node]['name'] == n['hostname'] and n['online']] if len(nailgun_nodes) == 1: amqp_hosts_name[nodes[node]['name']] = [ip, port] return amqp_hosts_name @property def userid(self): if version.StrictVersion(self.release_version)\ < version.StrictVersion('7.0'): self._userid = 'nova' return self._userid if self._userid is None: self._userid = self.get_hiera_values( hiera_hash='rabbit', hash_key='user' ) # FIXME(mattmymo): Remove this after merging # https://review.openstack.org/#/c/276797/ if not self._userid: self._userid = self.get_hiera_values( hiera_hash='rabbit_hash', hash_key='user' ) return self._userid def get_ssh_connection_to_controller(self, controller): remote = ssh.Client(host=controller, username=self._usr, password=self._pwd, key_filename=self._key, timeout=self._ssh_timeout) return remote def list_nodes(self): if not self.amqp_hosts_name: self.fail('There are no online rabbit nodes') remote = \ self.get_ssh_connection_to_controller( self.amqp_hosts_name.keys()[0]) output = remote.exec_command("rabbitmqctl cluster_status") substring_ind = output.find('{running_nodes') sub_end_ind = output.find('cluster_name') result_str = output[substring_ind: sub_end_ind] num_node = result_str.count("rabbit@") return num_node def pick_rabbit_master(self): if not self.amqp_hosts_name: self.fail('There are no online rabbit nodes') remote = \ self.get_ssh_connection_to_controller( self.amqp_hosts_name.keys()[0]) LOG.info('ssh session to node {0} was open'.format( self.amqp_hosts_name.keys()[0])) LOG.info('Try to execute command ') output = remote.exec_command( "crm resource status master_p_rabbitmq-server") LOG.debug('Output is {0}'.format(output)) substring_ind = output.find( 'resource master_p_rabbitmq-server is running on:') sub_end_ind = output.find('Master') LOG.debug('Start index is {0} end' ' index is {1}'.format(substring_ind, sub_end_ind)) result_str = output[substring_ind: sub_end_ind] LOG.debug('Result string is {0}'.format(result_str)) return result_str def list_channels(self): if not self.amqp_hosts_name: self.fail('There are no online rabbit nodes') remote = \ self.get_ssh_connection_to_controller( self.amqp_hosts_name.keys()[0]) output = remote.exec_command("rabbitmqctl list_channels") LOG.debug('Result of executing command rabbitmqctl' ' list_channels is {0}'.format(output)) return output def get_hiera_values(self, hiera_hash="rabbit", hash_key=None, conf_path="/etc/hiera.yaml", json_parse=False): if hash_key is not None: lookup_cmd = ('value = hiera.lookup("{0}", {{}}, ' '{{}}, nil, :hash)["{1}"]').format(hiera_hash, hash_key) else: lookup_cmd = ('value = hiera.lookup("{0}", {{}},' ' {{}}, nil, :hash)').format(hiera_hash) if json_parse: print_cmd = 'require "json"; puts JSON.dump(value)' else: print_cmd = 'puts value' cmd = ('ruby -e \'require "hiera"; ' 'hiera = Hiera.new(:config => "{0}"); ' '{1}; {2};\'').format(conf_path, lookup_cmd, print_cmd) LOG.debug("Try to execute cmd {0}".format(cmd)) remote = self.get_ssh_connection_to_controller(self._controllers[0]) try: res = remote.exec_command(cmd) LOG.debug("result is {0}".format(res)) if json_parse: return json.loads(res.strip()) return res.strip() except Exception: LOG.exception("Fail to get data from Hiera DB!") self.fail("Fail to get data from Hiera DB!") def get_conf_values(self, variable="rabbit_password", sections="DEFAULT", conf_path="/etc/nova/nova.conf"): cmd = ("python -c 'import ConfigParser; " "cfg=ConfigParser.ConfigParser(); " "cfg.readfp(open('\"'{0}'\"')); " "print cfg.get('\"'{1}'\"', '\"'{2}'\"')'") LOG.debug("Try to execute cmd {0}".format(cmd)) remote = self.get_ssh_connection_to_controller(self._controllers[0]) try: res = remote.exec_command(cmd.format( conf_path, sections, variable)) LOG.debug("result is {0}".format(res)) return res except Exception: LOG.exception("Fail to get data from config") self.fail("Fail to get data from config") def get_amqp_hosts(self): if not self._controllers: self.fail('There are no online controllers') remote = self.get_ssh_connection_to_controller(self._controllers[0]) cmd = 'hiera amqp_hosts' LOG.debug("Try to execute cmd '{0}' on controller...".format(cmd)) result = remote.exec_command(cmd) LOG.debug("Result: {0}".format(result)) hosts = result.strip().split(',') return [host.lstrip().split(':')[0:2] for host in hosts] def check_rabbit_connections(self): if not self._controllers: self.fail('There are no online controllers') remote = self.get_ssh_connection_to_controller(self._controllers[0]) for key in self.amqp_hosts_name.keys(): ip, port = self.amqp_hosts_name[key] cmd = ("python -c 'import kombu;" " c = kombu.Connection(\"amqp://{1}:{2}@{0}:{3}//\");" " c.connect()'".format(ip, self.userid, self.password, port)) try: LOG.debug('Checking AMQP host "{0}"...'.format(ip)) remote.exec_command(cmd) except Exception: LOG.exception("Failed to establish AMQP connection") self.fail("Failed to establish AMQP connection to {1}/tcp " "port on {0} from controller node!".format(ip, port)) def create_queue(self): if not self._controllers: self.fail('There are no online controllers') remote = self.get_ssh_connection_to_controller(self._controllers[0]) for key in self.amqp_hosts_name.keys(): ip, port = self.amqp_hosts_name[key] test_queue = 'test-rabbit-{0}-{1}'.format( data_utils.rand_name() + data_utils.generate_uuid(), ip ) cmd = ("python -c 'import kombu;" " c = kombu.Connection(\"amqp://{1}:{2}@{0}:{3}//\");" " c.connect(); ch = c.channel(); q = kombu.Qu" "eue(\"{4}\", channel=ch, durable=False, queue_arguments={{" "\"x-expires\": 15 * 60 * 1000}}); q.declare()'".format( ip, self.userid, self.password, port, test_queue)) try: LOG.debug("Declaring queue {0} on host {1}".format( test_queue, ip)) self.queues.append(test_queue) remote.exec_command(cmd) except Exception: LOG.exception("Failed to declare queue on host") self.fail("Failed to declare queue on host {0}".format(ip)) def publish_message(self): if not self._controllers: self.fail('There are no online controllers') remote = self.get_ssh_connection_to_controller(self._controllers[0]) for key in self.amqp_hosts_name.keys(): ip, port = self.amqp_hosts_name[key] queues = [q for q in self.queues if ip in q] if not len(queues) > 0: self.fail("Can't publish message, queue created on host '{0}' " "doesn't exist!".format(ip)) test_queue = queues[0] id = data_utils.generate_uuid() cmd = ("python -c 'import kombu;" " c = kombu.Connection(\"amqp://{1}:{2}@{0}:{3}//\");" " c.connect(); ch = c.channel(); producer = " "kombu.Producer(channel=ch, routing_key=\"{4}\"); " "producer.publish(\"{5}\")'".format( ip, self.userid, self.password, port, test_queue, id)) try: LOG.debug('Try to publish message {0}'.format(id)) remote.exec_command(cmd) except Exception: LOG.exception("Failed to publish message!") self.fail("Failed to publish message!") self.messages.append({'queue': test_queue, 'id': id}) def check_queue_message_replication(self): if not self._controllers: self.fail('There are no online controllers') remote = self.get_ssh_connection_to_controller(self._controllers[0]) for key in self.amqp_hosts_name.keys(): ip, port = self.amqp_hosts_name[key] for message in self.messages: if ip in message['queue']: continue cmd = ("python -c 'import kombu;" " c = kombu.Connection(\"amqp://{1}:{2}@{0}:{3}//\");" " c.connect(); " "ch = c.channel(); q = kombu.Queue(\"{4}\", channel=ch)" "; msg = q.get(True); retval = 0 if msg.body in \"{5}\"" " else 1; exit(retval)'".format( ip, self.userid, self.password, port, message['queue'], message['id'])) try: LOG.debug('Checking that message with ID "{0}" was ' 'replicated over the cluster...'.format(id)) remote.exec_command(cmd) except Exception: LOG.exception('Failed to check message replication!') self.fail('Failed to check message replication!') self.messages.remove(message) break def delete_queue(self): if not self._controllers: self.fail('There are no online controllers') remote = self.get_ssh_connection_to_controller(self._controllers[0]) LOG.debug('Try to deleting queues {0}... '.format(self.queues)) if not self.queues: return host_key = self.amqp_hosts_name.keys()[0] ip, port = self.amqp_hosts_name[host_key] for test_queue in self.queues: cmd = ("python -c 'import kombu;" " c = kombu.Connection(\"amqp://{1}:{2}@{0}:{3}//\");" " c.connect(); ch = c.channel(); q = kombu.Qu" "eue(\"{4}\", channel=ch); q.delete();'".format( ip, self.userid, self.password, port, test_queue)) try: LOG.debug("Removing queue {0} on host {1}".format( test_queue, ip)) remote.exec_command(cmd) self.queues.remove(test_queue) except Exception: LOG.exception('Failed to delete queue') self.fail('Failed to delete queue "{0}"!'.format(test_queue)) class TestPacemakerBase(BaseTestCase): """TestPacemakerStatus class base methods.""" @classmethod def setUpClass(cls): super(TestPacemakerBase, cls).setUpClass() cls.config = fuel_health.config.FuelConfig() cls.controller_names = cls.config.compute.controller_names cls.online_controller_names = ( cls.config.compute.online_controller_names) cls.offline_controller_names = list( set(cls.controller_names) - set(cls.online_controller_names)) cls.online_controller_ips = cls.config.compute.online_controllers cls.controller_key = cls.config.compute.path_to_private_key cls.controller_user = cls.config.compute.ssh_user cls.controllers_pwd = cls.config.compute.controller_node_ssh_password cls.timeout = cls.config.compute.ssh_timeout def setUp(self): super(TestPacemakerBase, self).setUp() if 'ha' not in self.config.mode: self.skipTest('Cluster is not HA mode, skipping tests') if not self.online_controller_names: self.skipTest('There are no controller nodes') def _run_ssh_cmd(self, host, cmd): """Open SSH session with host and execute command.""" try: sshclient = ssh.Client(host, self.controller_user, self.controllers_pwd, key_filename=self.controller_key, timeout=self.timeout) return sshclient.exec_longrun_command(cmd) except Exception: LOG.exception("Failed on run ssh cmd") self.fail("%s command failed." % cmd) def _register_resource(self, res, res_name, resources): if res_name not in resources: resources[res_name] = { 'master': [], 'nodes': [], 'started': 0, 'stopped': 0, 'active': False, 'managed': False, 'failed': False, } if 'true' in res.get('active'): resources[res_name]['active'] = True if 'true' in res.get('managed'): resources[res_name]['managed'] = True if 'true' in res.get('failed'): resources[res_name]['failed'] = True res_role = res.get('role') num_nodes = int(res.get('nodes_running_on')) if num_nodes: resources[res_name]['started'] += num_nodes for rnode in res.iter('node'): if 'Master' in res_role: resources[res_name]['master'].append( rnode.get('name')) resources[res_name]['nodes'].append( rnode.get('name')) else: resources[res_name]['stopped'] += 1 def get_pcs_resources(self, pcs_status): """Get pacemaker resources status to a python dict: return: { str: { # Resource name 'started': int, # count of Master/Started 'stopped': int, # count of Stopped resources 'nodes': [node_name, ...], # All node names where the # resource is started 'master': [node_name, ...], # Node names for 'Master' # ('master' is also in 'nodes') 'active': bool, # Is resource active? 'managed': bool, # Is resource managed? 'failed': bool, # Has resource failed? }, ... } """ root = etree.fromstring(pcs_status) resources = {} for res_group in root.iter('resources'): for res in res_group: res_name = res.get('id') if 'resource' in res.tag: self._register_resource(res, res_name, resources) elif 'clone' in res.tag: for r in res: self._register_resource(r, res_name, resources) elif 'group' in res.tag: for r in res: res_name_ingroup = r.get('id') self._register_resource(r, res_name_ingroup, resources) self._register_resource(r, res_name, resources) return resources def get_pcs_nodes(self, pcs_status): root = etree.fromstring(pcs_status) nodes = {'Online': [], 'Offline': []} for nodes_group in root.iter('nodes'): for node in nodes_group: if 'true' in node.get('online'): nodes['Online'].append(node.get('name')) else: nodes['Offline'].append(node.get('name')) return nodes def get_pcs_constraints(self, constraints_xml): """Parse pacemaker constraints :param constraints_xml: XML string contains pacemaker constraints :return dict: {string: # Resource name, {'attrs': list, # List of dicts for resource # attributes on each node, 'enabled': list # List of strings with node names where # the resource allowed to start, 'with-rsc': string # Name of an another resource # from which this resource depends on. } } """ root = etree.fromstring(constraints_xml) constraints = {} # 1. Get all attributes from constraints for each resource for con_group in root.iter('constraints'): for con in con_group: if 'rsc_location' in con.tag or 'rsc_colocation' in con.tag: if 'score' not in con.keys(): # TODO(ddmitriev): process resource dependences # for 'rule' section continue rsc = con.get('rsc') if rsc not in constraints: constraints[rsc] = {'attrs': [con.attrib]} else: constraints[rsc]['attrs'].append(con.attrib) # 2. Make list of nodes for each resource where it is allowed to start. # Remove from 'enabled' list all nodes with score '-INFINITY' for rsc in constraints: attrs = constraints[rsc]['attrs'] enabled = [] disabled = [] for attr in attrs: if 'with-rsc' in attr: constraints[rsc]['with-rsc'] = attr['with-rsc'] elif 'node' in attr: if attr['score'] == '-INFINITY': disabled.append(attr['node']) else: enabled.append(attr['node']) constraints[rsc]['enabled'] = list(set(enabled) - set(disabled)) return constraints def get_resource_nodes(self, rsc, constraints, cluster_resources, orig_rsc): if rsc in orig_rsc: # Constraints loop detected! msg = ('There is a dependency loop in constraints configuration: ' 'resource "{0}" depends on the resource "{1}". Please check' ' the pacemaker configuration!' .format(orig_rsc[-1], rsc)) raise fuel_health.exceptions.InvalidConfiguration(msg) else: orig_rsc.append(rsc) # Nodes where the resource is allowed to start allowed = constraints[rsc]['enabled'] # Nodes where the parent resource is actually started if rsc in cluster_resources: started = cluster_resources[rsc]['nodes'] else: started = [] if 'with-rsc' in constraints[rsc]: # Recursively get nodes for the parent resource (parent_allowed, parent_started, parent_disallowed) = self.get_resource_nodes( constraints[rsc]['with-rsc'], constraints, cluster_resources, orig_rsc) if 'score' in constraints[rsc]: if constraints[rsc]['score'] == '-INFINITY': # If resource banned to start on the same nodes where # parent resource is started, then nodes where parent # resource is started should be removed from 'allowed' allowed = (set(allowed) - set(parent_started)) else: # Reduce 'allowed' list to only those nodes where # the parent resource is allowed and running allowed = list(set(parent_started) & set(parent_allowed) & set(allowed)) # List of nodes, where resource is started, but not allowed to start disallowed = list(set(started) - set(allowed)) return allowed, started, disallowed