fuel-qa/fuelweb_test/tests/tests_strength/test_failover.py

#    Copyright 2013 Mirantis, Inc.
#
#    Licensed under the Apache License, Version 2.0 (the "License"); you may
#    not use this file except in compliance with the License. You may obtain
#    a copy of the License at
#
#         http://www.apache.org/licenses/LICENSE-2.0
#
#    Unless required by applicable law or agreed to in writing, software
#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
#    License for the specific language governing permissions and limitations
#    under the License.

import re
import time

from devops.helpers.helpers import wait
from devops.helpers.helpers import _wait
from proboscis.asserts import assert_equal
from proboscis.asserts import assert_not_equal
from proboscis.asserts import assert_true
from proboscis import test
from proboscis import SkipTest

from fuelweb_test.helpers.checkers import check_mysql
from fuelweb_test.helpers.decorators import log_snapshot_on_error
from fuelweb_test.helpers import os_actions
from fuelweb_test import logger
from fuelweb_test.settings import DEPLOYMENT_MODE
from fuelweb_test.settings import NEUTRON_SEGMENT_TYPE
from fuelweb_test.settings import NEUTRON_ENABLE
from fuelweb_test.tests.base_test_case import SetupEnvironment
from fuelweb_test.tests.base_test_case import TestBasic


@test(groups=["thread_5", "ha", "neutron_failover", "ha_nova_destructive",
              "ha_neutron_destructive"])
class TestHaFailover(TestBasic):

    @test(depends_on=[SetupEnvironment.prepare_slaves_5],
          groups=["deploy_ha"])
    @log_snapshot_on_error
    def deploy_ha(self):
        """Prepare cluster in HA mode for failover tests

        Scenario:
            1. Create cluster
            2. Add 3 nodes with controller roles
            3. Add 2 nodes with compute roles
            4. Deploy the cluster
            8. Make snapshot

        Duration 70m
        Snapshot deploy_ha

        """
        try:
            self.check_run("deploy_ha")
        except SkipTest:
            return

        self.env.revert_snapshot("ready_with_5_slaves")

        settings = None

        if NEUTRON_ENABLE:
            settings = {
                "net_provider": 'neutron',
                "net_segment_type": NEUTRON_SEGMENT_TYPE
            }
        cluster_id = self.fuel_web.create_cluster(
            name=self.__class__.__name__,
            mode=DEPLOYMENT_MODE,
            settings=settings
        )
        self.fuel_web.update_nodes(
            cluster_id,
            {
                'slave-01': ['controller'],
                'slave-02': ['controller'],
                'slave-03': ['controller'],
                'slave-04': ['compute'],
                'slave-05': ['compute']
            }
        )
        self.fuel_web.deploy_cluster_wait(cluster_id)
        public_vip = self.fuel_web.get_public_vip(cluster_id)
        os_conn = os_actions.OpenStackActions(public_vip)
        if NEUTRON_ENABLE:
            self.fuel_web.assert_cluster_ready(
                os_conn, smiles_count=14, networks_count=2, timeout=300)
        else:
            self.fuel_web.assert_cluster_ready(
                os_conn, smiles_count=16, networks_count=1, timeout=300)
        self.fuel_web.verify_network(cluster_id)

        self.fuel_web.security.verify_firewall(cluster_id)

        # Bug #1289297. Pause 5 min to make sure that all remain activity
        # on the admin node has over before creating a snapshot.
        time.sleep(5 * 60)

        self.env.make_snapshot("deploy_ha", is_make=True)

    @test(depends_on_groups=['deploy_ha'],
          groups=["ha_destroy_controllers"])
    @log_snapshot_on_error
    def ha_destroy_controllers(self):
        """Destroy two controllers and check pacemaker status is correct

        Scenario:
            1. Destroy first controller
            2. Check pacemaker status
            3. Run OSTF
            4. Revert environment
            5. Destroy second controller
            6. Check pacemaker status
            7. Run OSTF

        Duration 35m
        """

        for devops_node in self.env.get_virtual_environment(
        ).nodes().slaves[:2]:
            self.env.revert_snapshot("deploy_ha")
            devops_node.suspend(False)
            self.fuel_web.assert_pacemaker(
                self.env.get_virtual_environment().nodes(
                ).slaves[2].name,
                set(self.env.get_virtual_environment(
                ).nodes().slaves[:3]) - {devops_node},
                [devops_node])

            cluster_id = self.fuel_web.client.get_cluster_id(
                self.__class__.__name__)

            # Wait until Nailgun marked suspended controller as offline
            wait(lambda: not self.fuel_web.get_nailgun_node_by_devops_node(
                devops_node)['online'],
                timeout=60 * 5)

            # Wait until MySQL Galera is UP on online controllers
            self.fuel_web.wait_mysql_galera_is_up(
                [n.name for n in
                 set(self.env.get_virtual_environment(
                 ).nodes().slaves[:3]) - {devops_node}])

            self.fuel_web.run_ostf(
                cluster_id=cluster_id,
                test_sets=['ha', 'smoke', 'sanity'],
                should_fail=1)

    @test(depends_on_groups=['deploy_ha'],
          groups=["ha_disconnect_controllers"])
    @log_snapshot_on_error
    def ha_disconnect_controllers(self):
        """Disconnect controllers and check pacemaker status is correct

        Scenario:
            1. Disconnect eth3 of the first controller
            2. Check pacemaker status
            3. Revert environment
            4. Disconnect eth3 of the second controller
            5. Check pacemaker status
            6. Run OSTF

        Duration 45m

        """

        for devops_node in self.env.get_virtual_environment(
        ).nodes().slaves[:2]:
            self.env.revert_snapshot("deploy_ha")

            remote = self.fuel_web.get_ssh_for_node(devops_node.name)
            remote.check_call('ifconfig eth2 down')
            self.fuel_web.assert_pacemaker(
                self.env.get_virtual_environment(
                ).nodes().slaves[2].name,
                set(self.env.get_virtual_environment(
                ).nodes().slaves[:3]) - {devops_node},
                [devops_node])

        cluster_id = self.fuel_web.client.get_cluster_id(
            self.__class__.__name__)

        # Wait until MySQL Galera is UP on some controller
        self.fuel_web.wait_mysql_galera_is_up(['slave-01'])

        self.fuel_web.run_ostf(
            cluster_id=cluster_id,
            test_sets=['ha', 'smoke', 'sanity'])

    @test(depends_on_groups=['deploy_ha'],
          groups=["ha_delete_vips"])
    @log_snapshot_on_error
    def ha_delete_vips(self):
        """Delete all management and public VIPs on all controller nodes.
        Verify that they are restored.
        Verify total amount of secondary IPs. Should be 2:
        management and public

        Scenario:
            1. Delete all secondary VIP
            2. Wait while it is being restored
            3. Verify it is restored
            4. Run OSTF

        Duration 30m

        """
        logger.debug('Start reverting of deploy_ha snapshot')
        self.env.revert_snapshot("deploy_ha")
        cluster_id = \
            self.fuel_web.client.get_cluster_id(self.__class__.__name__)
        logger.debug('Cluster id is {0}'.format(cluster_id))
        interfaces = ('hapr-p', 'hapr-m')
        slaves = self.env.get_virtual_environment(
        ).nodes().slaves[:3]
        logger.debug("Current nodes are {0}".format([i.name for i in slaves]))
        ips_amount = 0
        for devops_node in slaves:
            # Verify VIPs are started.
            ret = self.fuel_web.get_pacemaker_status(devops_node.name)
            logger.debug("Pacemaker status {0} for node {1}".format
                         (ret, devops_node.name))
            assert_true(
                re.search('vip__management\s+\(ocf::fuel:ns_IPaddr2\):'
                          '\s+Started node', ret),
                'vip management not started. '
                'Current pacemaker status is {0}'.format(ret))
            assert_true(
                re.search('vip__public\s+\(ocf::fuel:ns_IPaddr2\):'
                          '\s+Started node', ret),
                'vip public not started. '
                'Current pacemaker status is {0}'.format(ret))

            for interface in interfaces:
                # Look for management and public ip in namespace and remove it
                logger.debug("Start to looking for ip of Vips")
                addresses = self.fuel_web.ip_address_show(devops_node.name,
                                                          interface=interface,
                                                          namespace='haproxy')
                logger.debug("Vip addresses is {0} for node {1} and interface"
                             " {2}".format(addresses, devops_node.name,
                                           interface))
                ip_search = re.search(
                    'inet (?P<ip>\d+\.\d+\.\d+.\d+/\d+) scope global '
                    '{0}'.format(interface), addresses)

                if ip_search is None:
                    logger.debug("Ip show output does not"
                                 " match in regex. Current value is None")
                    continue
                ip = ip_search.group('ip')
                logger.debug("Founded ip is {0}".format(ip))
                logger.debug("Start ip {0} deletion on node {1} and "
                             "interface {2} ".format(ip, devops_node.name,
                                                     interface))
                self.fuel_web.ip_address_del(
                    node_name=devops_node.name,
                    interface=interface,
                    ip=ip, namespace='haproxy')

                # The ip should be restored
                ip_assigned = lambda nodes: \
                    any([ip in self.fuel_web.ip_address_show(
                        n.name, 'haproxy', interface)
                        for n in nodes])
                logger.debug("Waiting while deleted ip restores ...")
                wait(lambda: ip_assigned(slaves), timeout=30)
                assert_true(ip_assigned(slaves),
                            "IP isn't restored restored.")
                ips_amount += 1

                time.sleep(60)

                # Run OSTF tests
                self.fuel_web.run_ostf(
                    cluster_id=cluster_id,
                    test_sets=['ha', 'smoke', 'sanity'],
                    should_fail=1)
                # Revert initial state. VIP could be moved to other controller
                self.env.revert_snapshot("deploy_ha")
        assert_equal(ips_amount, 2,
                     'Not all VIPs were found: expect - 2, found {0}'.format(
                         ips_amount))

    @test(depends_on_groups=['deploy_ha'],
          groups=["ha_mysql_termination"])
    @log_snapshot_on_error
    def ha_mysql_termination(self):
        """Terminate mysql on all controllers one by one

        Scenario:
            1. Terminate mysql
            2. Wait while it is being restarted
            3. Verify it is restarted
            4. Go to another controller
            5. Run OSTF

        Duration 15m

        """
        self.env.revert_snapshot("deploy_ha")

        for devops_node in self.env.get_virtual_environment(
        ).nodes().slaves[:3]:
            remote = self.fuel_web.get_ssh_for_node(devops_node.name)
            logger.info('Terminating MySQL on {0}'.format(devops_node.name))

            try:
                remote.check_call('pkill -9 -x "mysqld"')
            except:
                logger.error('MySQL on {0} is down after snapshot revert'.
                             format(devops_node.name))
                raise

            check_mysql(remote, devops_node.name)

        cluster_id = self.fuel_web.client.get_cluster_id(
            self.__class__.__name__)

        self.fuel_web.wait_mysql_galera_is_up(['slave-01', 'slave-02',
                                               'slave-03'])

        self.fuel_web.run_ostf(
            cluster_id=cluster_id,
            test_sets=['ha', 'smoke', 'sanity'])

    @test(depends_on_groups=['deploy_ha'],
          groups=["ha_haproxy_termination"])
    @log_snapshot_on_error
    def ha_haproxy_termination(self):
        """Terminate haproxy on all controllers one by one

        Scenario:
            1. Terminate haproxy
            2. Wait while it is being restarted
            3. Verify it is restarted
            4. Go to another controller
            5. Run OSTF

        Duration 25m

        """
        self.env.revert_snapshot("deploy_ha")

        for devops_node in self.env.get_virtual_environment(
        ).nodes().slaves[:3]:
            remote = self.fuel_web.get_ssh_for_node(devops_node.name)
            remote.check_call('kill -9 $(pidof haproxy)')

            mysql_started = lambda: \
                len(remote.check_call(
                    'ps aux | grep "/usr/sbin/haproxy"')['stdout']) == 3
            wait(mysql_started, timeout=20)
            assert_true(mysql_started(), 'haproxy restarted')

        cluster_id = self.fuel_web.client.get_cluster_id(
            self.__class__.__name__)

        self.fuel_web.run_ostf(
            cluster_id=cluster_id,
            test_sets=['ha', 'smoke', 'sanity'])

    @test(depends_on_groups=['deploy_ha'],
          groups=["ha_pacemaker_configuration"])
    @log_snapshot_on_error
    def ha_pacemaker_configuration(self):
        """Verify resources are configured

        Scenario:
            1. SSH to controller node
            2. Verify resources are configured
            3. Go to next controller

        Duration 15m

        """
        self.env.revert_snapshot("deploy_ha")

        devops_ctrls = self.env.get_virtual_environment(
        ).nodes().slaves[:3]
        pcm_nodes = ' '.join(self.fuel_web.get_pcm_nodes(
            self.env.get_virtual_environment(
            ).nodes().slaves[0].name, pure=True)['Online'])
        logger.debug("pacemaker nodes are {0}".format(pcm_nodes))
        for devops_node in devops_ctrls:
            config = self.fuel_web.get_pacemaker_config(devops_node.name)
            logger.debug("config on node {0} is {1}".format(
                devops_node.name, config))
            assert_not_equal(re.search(
                "vip__public\s+\(ocf::fuel:ns_IPaddr2\):\s+Started\s+"
                "Clone Set:\s+clone_ping_vip__public\s+\[ping_vip__public\]"
                "\s+Started:\s+\[ {0} \]".format(pcm_nodes), config), None,
                'public vip is not configured right')
            assert_true(
                'vip__management	(ocf::fuel:ns_IPaddr2):	Started'
                in config, 'vip management is not configured right')
            assert_not_equal(re.search(
                "Clone Set: clone_p_(heat|openstack-heat)-engine"
                " \[p_(heat|openstack-heat)-engine\]\s+"
                "Started: \[ {0} \]".format(
                    pcm_nodes), config), None,
                'heat engine is not configured right')
            assert_not_equal(re.search(
                "Clone Set: clone_p_mysql \[p_mysql\]\s+Started:"
                " \[ {0} \]".format(pcm_nodes), config), None,
                'mysql is not configured right')
            assert_not_equal(re.search(
                "Clone Set: clone_p_haproxy \[p_haproxy\]\s+Started:"
                " \[ {0} \]".format(pcm_nodes), config), None,
                'haproxy is not configured right')

    @test(enabled=False, depends_on_groups=['deploy_ha'],
          groups=["ha_pacemaker_restart_heat_engine"])
    @log_snapshot_on_error
    def ha_pacemaker_restart_heat_engine(self):
        """Verify heat engine service is restarted
         by pacemaker on amqp connection loss

        Scenario:
            1. SSH to any controller
            2. Check heat-engine status
            3. Block heat-engine amqp connections
            4. Check heat-engine was stopped on current controller
            5. Unblock heat-engine amqp connections
            6. Check heat-engine process is running with new pid
            7. Check amqp connection re-appears for heat-engine

        Duration 15m

        """
        self.env.revert_snapshot("deploy_ha")
        ocf_success = "DEBUG: OpenStack Orchestration Engine" \
                      " (heat-engine) monitor succeeded"
        ocf_error = "ERROR: OpenStack Heat Engine is not connected to the" \
                    " AMQP server: AMQP connection test returned 1"

        heat_name = 'heat-engine'

        ocf_status = \
            'script -q -c "OCF_ROOT=/usr/lib/ocf' \
            ' /usr/lib/ocf/resource.d/fuel/{0}' \
            ' monitor 2>&1"'.format(heat_name)

        remote = self.fuel_web.get_ssh_for_node(
            self.env.get_virtual_environment(
            ).nodes().slaves[0].name)
        pid = ''.join(remote.execute('pgrep heat-engine')['stdout'])
        get_ocf_status = ''.join(
            remote.execute(ocf_status)['stdout']).rstrip()
        assert_true(ocf_success in get_ocf_status,
                    "heat engine is not succeeded, status is {0}".format(
                        get_ocf_status))
        assert_true(len(remote.execute(
            "netstat -nap | grep {0} | grep :5673".
            format(pid))['stdout']) > 0, 'There is no amqp connections')
        remote.execute("iptables -I OUTPUT 1 -m owner --uid-owner heat -m"
                       " state --state NEW,ESTABLISHED,RELATED -j DROP")

        wait(lambda: len(remote.execute
            ("netstat -nap | grep {0} | grep :5673".
             format(pid))['stdout']) == 0, timeout=300)

        get_ocf_status = ''.join(
            remote.execute(ocf_status)['stdout']).rstrip()
        logger.info('ocf status after blocking is {0}'.format(
            get_ocf_status))
        assert_true(ocf_error in get_ocf_status,
                    "heat engine is running, status is {0}".format(
                        get_ocf_status))

        remote.execute("iptables -D OUTPUT 1 -m owner --uid-owner heat -m"
                       " state --state NEW,ESTABLISHED,RELATED")
        _wait(lambda: assert_true(ocf_success in ''.join(
            remote.execute(ocf_status)['stdout']).rstrip()), timeout=240)
        newpid = ''.join(remote.execute('pgrep heat-engine')['stdout'])
        assert_true(pid != newpid, "heat pid is still the same")
        get_ocf_status = ''.join(remote.execute(
            ocf_status)['stdout']).rstrip()
        assert_true(ocf_success in get_ocf_status,
                    "heat engine is not succeeded, status is {0}".format(
                        get_ocf_status))
        assert_true(len(
            remote.execute("netstat -nap | grep {0} | grep :5673".format(
                newpid))['stdout']) > 0)
        cluster_id = self.fuel_web.get_last_created_cluster()
        self.fuel_web.run_ostf(cluster_id=cluster_id)

    @test(depends_on_groups=['deploy_ha'],
          groups=["ha_check_monit"])
    @log_snapshot_on_error
    def ha_check_monit(self):
        """Verify monit restarted nova
         service if it was killed

        Scenario:
            1. SSH to every compute node in cluster
            2. Kill nova-compute service
            3. Check service is restarted by monit

        Duration 25m

        """
        self.env.revert_snapshot("deploy_ha")
        for devops_node in self.env.get_virtual_environment(
        ).nodes().slaves[3:5]:
            remote = self.fuel_web.get_ssh_for_node(devops_node.name)
            remote.execute("kill -9 `pgrep nova-compute`")
            wait(
                lambda: len(remote.execute('pgrep nova-compute')['stdout'])
                == 1, timeout=120)
            assert_true(len(remote.execute('pgrep nova-compute')['stdout'])
                        == 1, 'Nova service was not restarted')
            assert_true(len(remote.execute(
                "grep \"nova-compute.*trying to restart\" "
                "/var/log/monit.log")['stdout']) > 0,
                'Nova service was not restarted')