Merge "Add test to find out statistics of connectivity failures in case of neutron agents migration" into stable/5.1

This commit is contained in:
Jenkins 2014-12-01 11:04:45 +00:00 committed by Gerrit Code Review
commit 7626c5aeed
2 changed files with 230 additions and 4 deletions

View File

@ -58,7 +58,7 @@ class OpenStackActions(common.Common):
return servers
def create_server_for_migration(self, neutron=False, scenario='',
timeout=100):
timeout=100, key_name=None):
name = "test-serv" + str(random.randint(1, 0x7fffffff))
security_group = {}
try:
@ -87,6 +87,7 @@ class OpenStackActions(common.Common):
image=image_id,
flavor=1,
userdata=scenario,
key_name=key_name,
**kwargs)
try:
helpers.wait(
@ -285,3 +286,54 @@ class OpenStackActions(common.Common):
if img.name == image_name:
return img
return None
def get_neutron_router(self):
    """Return the raw router listing from the neutron API."""
    routers = self.neutron.list_routers()
    return routers
def get_routers_ids(self):
    """Return the ids of every router known to neutron."""
    routers = self.get_neutron_router()['routers']
    return [router['id'] for router in routers]
def get_l3_for_router(self, router_id):
    """Return the l3 agents hosting *router_id*, as reported by neutron."""
    hosting = self.neutron.list_l3_agent_hosting_routers(router_id)
    return hosting
def get_l3_agent_ids(self, router_id):
    """Return ids of the l3 agents hosting *router_id*."""
    agents = self.get_l3_for_router(router_id)['agents']
    return [agent['id'] for agent in agents]
def get_l3_agent_hosts(self, router_id):
    """Return host names of the l3 agents hosting *router_id*."""
    agents = self.get_l3_for_router(router_id)['agents']
    return [agent['host'] for agent in agents]
def remove_l3_from_router(self, l3_agent, router_id):
    """Unbind *router_id* from the given l3 agent."""
    response = self.neutron.remove_router_from_l3_agent(l3_agent, router_id)
    return response
def add_l3_to_router(self, l3_agent, router_id):
    """Bind *router_id* to the given l3 agent."""
    body = {"router_id": router_id}
    return self.neutron.add_router_to_l3_agent(l3_agent, body)
def list_agents(self):
    """Return the full neutron agent listing."""
    agents = self.neutron.list_agents()
    return agents
def get_available_l3_agents_ids(self, hosted_l3_agent_id):
    """Return ids of l3 agents available as migration targets.

    Collects every agent whose binary is ``neutron-l3-agent`` and
    excludes the agent that currently hosts the router.

    :param hosted_l3_agent_id: id of the l3 agent to exclude.
    :return: list of l3 agent ids without *hosted_l3_agent_id*.
    """
    result = self.list_agents()
    # Filter inside the comprehension instead of list.remove(): remove()
    # raises ValueError when the hosted agent is absent from the listing
    # (e.g. it already died), which is not an error for this lookup.
    return [agent['id'] for agent in result['agents']
            if agent['binary'] == 'neutron-l3-agent'
            and agent['id'] != hosted_l3_agent_id]
def list_dhcp_agents_for_network(self, net_id):
    """Return the dhcp agents hosting network *net_id*."""
    hosting = self.neutron.list_dhcp_agent_hosting_networks(net_id)
    return hosting
def get_node_with_dhcp_for_network(self, net_id):
    """Return host names of nodes running a dhcp agent for *net_id*."""
    agents = self.list_dhcp_agents_for_network(net_id)['agents']
    return [agent['host'] for agent in agents]
def get_dhcp_agent_ids(self, net_id):
    """Return ids of the dhcp agents hosting network *net_id*."""
    agents = self.list_dhcp_agents_for_network(net_id)['agents']
    return [agent['id'] for agent in agents]

View File

@ -16,11 +16,16 @@ import time
import re
from devops.helpers.helpers import wait
from devops.error import TimeoutError
from proboscis import asserts
from proboscis.asserts import assert_equal
from proboscis.asserts import assert_true
from proboscis import SkipTest
from proboscis import test
from fuelweb_test import logger
from fuelweb_test import settings
from fuelweb_test.helpers.decorators import log_snapshot_on_error
from fuelweb_test.helpers import os_actions
from fuelweb_test.tests import base_test_case
@ -39,13 +44,14 @@ class TestNeutronFailover(base_test_case.TestBasic):
3. Add 2 nodes with compute roles
4. Add 1 node with cinder role
5. Deploy the cluster
6. Destroy controller with running l3 agent
7. Wait for OFFLINE of the controller at fuel UI
8. Run instance connectivity OSTF tests
Snapshot deploy_ha_neutron
"""
try:
self.check_run('deploy_ha_neutron')
except SkipTest:
return
self.env.revert_snapshot("ready")
self.env.bootstrap_nodes(self.env.nodes().slaves[:6])
@ -69,6 +75,24 @@ class TestNeutronFailover(base_test_case.TestBasic):
}
)
self.fuel_web.deploy_cluster_wait(cluster_id)
self.env.make_snapshot("deploy_ha_neutron", is_make=True)
@test(depends_on=[deploy_ha_neutron],
groups=["neutron_l3_migration"])
@log_snapshot_on_error
def neutron_l3_migration(self):
"""Check l3 agent migration after destroy controller
Scenario:
1. Revert snapshot with Neutron GRE cluster
2. Destroy controller with running l3 agent
3. Wait for OFFLINE of the controller at fuel UI
4. Run instance connectivity OSTF tests
Snapshot deploy_ha_neutron
"""
self.env.revert_snapshot("deploy_ha_neutron")
# Look for controller with l3 agent
ret = self.fuel_web.get_pacemaker_status(
@ -119,3 +143,153 @@ class TestNeutronFailover(base_test_case.TestBasic):
test_sets=['ha', 'smoke', 'sanity'],
should_fail=1,
failed_test_name=['Check that required services are running'])
@test(depends_on=[deploy_ha_neutron],
      groups=["check_agents_migration"])
@log_snapshot_on_error
def check_agents_migration(self):
    """Check instance connectivity after neutron agents migration.

    Scenario:
        1. Revert snapshot with Neutron GRE cluster
        2. Create instance and assign floating ip to it
        3. Add -INF collocation between l3-agent and dhcp-agent
        4. Create agents_migration snapshot
        5. 10 times revert snapshot agents_migration
        6. Migrate manually dhcp-agent and l3-agent
        7. Check ping to floating ip
        8. In case of failure (permanent (> than 2 minute) icmp
           connectivity loss) increase number of failures
        9. Restart openvswitch-agent on any controller node
        10. Ping floating ip again
        11. In case of success (icmp connectivity restored
            within a 2 minute) increase number of workarounds on 1
        12. In case of failure (permanent (> than 2 minute) icmp
            connectivity loss) increase number of failed workarounds on 1

    Snapshot deploy_ha_neutron
    """
    self.env.revert_snapshot("deploy_ha_neutron")
    # The openvswitch-agent pacemaker clone resource is named differently
    # on Ubuntu and on the other (CentOS) release.
    if settings.OPENSTACK_RELEASE_UBUNTU in settings.OPENSTACK_RELEASE:
        restart_openvswitch = "crm resource restart" \
                              " clone_p_neutron-plugin-openvswitch-agent"
    else:
        restart_openvswitch = "crm resource restart" \
                              " clone_p_neutron-openvswitch-agent"
    cluster_id = self.fuel_web.get_last_created_cluster()
    os_con = os_actions.OpenStackActions(
        self.fuel_web.get_public_vip(cluster_id))
    remote = self.env.get_ssh_to_remote_by_name("slave-01")
    instance = os_con.create_server_for_migration(
        neutron=True)
    floating_ip = os_con.assign_floating_ip(instance)
    logger.debug('instance floating ip is {0}'.format(floating_ip.ip))
    # Single probe command: one icmp echo, 10s reply wait, 120s total
    # deadline (-w). Used both for the initial check and inside the loop.
    cmd = "ping -i 10 -c 1 -W 10 -w 120 {0}".format(floating_ip.ip)
    # Wait until the instance becomes pingable at all before migrating
    # anything; a TimeoutError here is tolerated because the assert below
    # re-checks and reports the final state.
    try:
        wait(lambda: remote.execute(cmd)['exit_code'] == 0, timeout=180)
    except TimeoutError:
        pass
    res = remote.execute(cmd)
    logger.debug("command execution {0}".format(res))
    assert_equal(0, res['exit_code'],
                 'floating ip is not pingable, exit code {0}'.format(
                     res['exit_code']))
    # -INFINITY collocation forbids l3 and dhcp agents from sharing a
    # node, so moving one resource forces a real migration of the other.
    remote.execute("pcs constraint colocation add p_neutron-l3-agent"
                   " with p_neutron-dhcp-agent score=-INFINITY")
    self.env.make_snapshot("agents_migration", is_make=True)
    self.env.get_virtual_environment().resume()
    # Statistics gathered over the 10 migration iterations.
    failures = 0
    workarounds = 0
    failed_workarounds = 0
    logger.info("start agents migration")
    for i in range(0, 10):
        logger.debug("starting iteration number {0}".format(i))
        # Re-create the client each iteration: a snapshot revert may have
        # invalidated the previous session.
        os_con = os_actions.OpenStackActions(
            self.fuel_web.get_public_vip(cluster_id))
        router_id = os_con.get_routers_ids()[0]
        node_with_l3 = os_con.get_l3_agent_hosts(router_id)[0]
        logger.debug("node with l3 agent is {0}".format(node_with_l3))
        net_id = os_con.get_network('net04')['id']
        node_with_dhcp = os_con.get_node_with_dhcp_for_network(net_id)[0]
        logger.debug("node with dhcp agent is {0}".format(node_with_dhcp))
        remote = self.env.get_ssh_to_remote_by_name('slave-01')
        # Move both agents via pacemaker and wait until neutron reports
        # them hosted on a different node.
        remote.execute("pcs resource move p_neutron-dhcp-agent")
        wait((lambda:
              not node_with_dhcp == os_con.get_node_with_dhcp_for_network(
                  net_id)[0]
              if os_con.get_node_with_dhcp_for_network(net_id) else False),
             timeout=60 * 3)
        remote.execute("pcs resource move p_neutron-l3-agent")
        wait((lambda:
              not node_with_l3 == os_con.get_l3_agent_hosts(router_id)[0]
              if os_con.get_l3_agent_hosts(router_id) else False),
             timeout=60 * 3)
        node_with_l3 = os_con.get_l3_agent_hosts(router_id)[0]
        # On Ubuntu the reported host is already the fqdn devops knows;
        # elsewhere strip the domain part.
        if settings.OPENSTACK_RELEASE_UBUNTU in \
                settings.OPENSTACK_RELEASE:
            node_fqdn = node_with_l3
        else:
            node_fqdn = node_with_l3.split(".")[0]
        logger.debug("node fqdn is {0}".format(node_fqdn))
        devops_node = self.fuel_web.find_devops_node_by_nailgun_fqdn(
            node_fqdn, self.env.nodes().slaves[0:6])
        # Ping from the node that now hosts the l3 agent.
        l3_node_remote = self.env.get_ssh_to_remote_by_name(
            devops_node.name)
        try:
            wait(
                lambda: l3_node_remote.execute(cmd)['exit_code'] == 0,
                timeout=120)
        except TimeoutError:
            pass
        res = l3_node_remote.execute(cmd)
        logger.debug("connectivity after migration is {0}".format(res))
        if res['exit_code'] != 0:
            # Permanent (>2 min) connectivity loss after migration.
            logger.debug("we have connectivity failure")
            failures += 1
            # Known workaround: restart the openvswitch agent clone and
            # re-check connectivity.
            l3_node_remote.execute(restart_openvswitch)
            try:
                wait(
                    lambda: l3_node_remote.execute(cmd)['exit_code'] == 0,
                    timeout=120)
            except TimeoutError:
                pass
            res = l3_node_remote.execute(cmd)
            logger.debug(
                "connectivity after ovs restart is {0}".format(res))
            if res['exit_code'] != 0:
                # Workaround did not help: snapshot the broken state for
                # later debugging and roll back to the clean baseline.
                failed_workarounds += 1
                self.env.make_snapshot(
                    "fail_agents_migration_{0}".format(i), is_make=True)
                self.env.revert_snapshot("agents_migration")
            else:
                workarounds += 1
        # Clear the move constraints so the next iteration can migrate
        # the agents again.
        remote.execute("pcs resource clear p_neutron-dhcp-agent")
        remote.execute("pcs resource clear p_neutron-l3-agent")
    logger.info("total number of failures is {0}".format(failures))
    logger.info("number of successful workarounds is {0}".format(
        workarounds))
    logger.info("number of unsuccessful workarounds is {0}".format(
        failed_workarounds))
    logger.debug("total number of failures is {0}".format(failures))
    logger.debug("number of successful workarounds is {0}".format(
        workarounds))
    logger.debug("number of unsuccessful workarounds is {0}".format(
        failed_workarounds))
    assert_true(failed_workarounds == 0, "there are failed workarounds")