# tripleo-ha-utils/rally/plugins/instanceha.py
#
# 459 lines
# 18 KiB
# Python

from os import path
import socket
import time
from rally.common import logging
from rally.common import sshutils
from rally import exceptions
from rally_openstack import consts
from rally_openstack import scenario
from rally_openstack.scenarios.vm import utils as vm_utils
from rally_openstack.scenarios.cinder import utils as cinder_utils
from rally.task import atomic
from rally.task import types
from rally.task import validation
from rally.task import utils as task_utils
import six
LOG = logging.getLogger(__name__)
def failover(self, host, command, port=22, username="", password="",
             key_filename=None, pkey=None):
    """Run the disruptor command on ``host`` over SSH to trigger failover.

    A positive exit status of the command is reported as a ``ScriptError``.
    An SSH timeout is only logged at debug level and otherwise ignored
    (presumably because the disruptor is expected to take the host down
    before the session can finish -- confirm with the scenario authors).

    :param host: address of the host to disrupt
    :param command: dictionary describing the command; handed unchanged to
        ``_run_command``
    :param port: SSH port
    :param username: SSH user name
    :param password: SSH password
    :param key_filename: private key file; a leading ``~`` is expanded
    :param pkey: private key used for SSH authentication
    :raises rally.exceptions.ScriptError: the command exited with a status
        greater than zero
    :returns: None
    """
    expanded_key = (path.expanduser(key_filename)
                    if key_filename else key_filename)

    LOG.info("Host: %s. Injecting Failover %s" % (host, command))

    try:
        exit_code, _stdout, stderr = _run_command(
            self,
            server_ip=host,
            port=port,
            username=username,
            password=password,
            key_filename=expanded_key,
            pkey=pkey,
            command=command)
        if exit_code and exit_code > 0:
            details = {"command": command, "code": exit_code,
                       "error": stderr}
            raise exceptions.ScriptError(
                "Error running command %(command)s. "
                "Error %(code)s: %(error)s" % details)
    except exceptions.SSHTimeout:
        LOG.debug("SSH session of disruptor command timeouted, continue...")
def _run_command(self, server_ip, port, username, password, command,
                 pkey=None, key_filename=None):
    """Open an SSH connection to a server and execute a command on it.

    The connection object is created, ``self._wait_for_ssh`` is used to wait
    until it is usable, and the actual execution is delegated to
    ``_run_command_over_ssh``.

    When no ``key_filename`` is supplied and ``pkey`` is empty, the private
    key of the keypair stored in the scenario context is used.

    Note: per the original author, this shadows
    ``vm.utils.VMScenario._run_command`` in order to support
    ``key_filename``.

    :param server_ip: server ip address
    :param port: port for the SSH connection
    :param username: str, SSH user name on the server
    :param password: password for SSH authentication
    :param command: dictionary specifying the command to execute; see the
        ``command`` parameter of ``VMTasks.boot_runcommand_delete``
    :param pkey: key for SSH authentication
    :param key_filename: private key filename for SSH authentication
    :returns: tuple (exit_status, stdout, stderr)
    """
    if not key_filename and not pkey:
        # Fall back to the keypair created for the scenario user.
        pkey = self.context["user"]["keypair"]["private"]

    connection = sshutils.SSH(
        username,
        server_ip,
        port=port,
        pkey=pkey,
        password=password,
        key_filename=key_filename)
    self._wait_for_ssh(connection)

    return _run_command_over_ssh(self, connection, command)
@atomic.action_timer("vm.run_command_over_ssh")
def _run_command_over_ssh(self, ssh, command):
    """Execute a command over an already established SSH connection.

    Kept as a separate function so that only the script execution itself is
    covered by the ``vm.run_command_over_ssh`` atomic timer.

    The command line is assembled from ``interpreter``, ``remote_path`` and
    ``command_args`` (in that order). If ``local_path`` is given together
    with ``remote_path``, the local file is uploaded to the last element of
    ``remote_path`` first. ``script_file`` or ``script_inline`` (the former
    wins) is fed to the command through stdin.

    :param ssh: SSH connection object providing ``put_file`` and ``execute``
    :param command: dictionary specifying the command to execute; see the
        ``command`` parameter of ``VMTasks.boot_runcommand_delete``
    :raises ValueError: ``interpreter`` or ``remote_path`` is neither a
        string nor a list
    :returns: tuple (exit_status, stdout, stderr)
    """
    def _as_list(key):
        # Normalise a str-or-list command entry to a list.
        value = command.get(key) or []
        if isinstance(value, six.string_types):
            return [value]
        if not isinstance(value, list):
            raise ValueError("command '%s' value must be str "
                             "or list type" % key)
        return value

    interpreter = _as_list("interpreter")
    remote_path = _as_list("remote_path")
    cmd = interpreter + remote_path

    # The upload target is the last remote_path element, so an upload is
    # only possible when remote_path is not empty.
    if remote_path and command.get("local_path"):
        # ``path`` is ``os.path`` (the file imports it via
        # ``from os import path``); the bare name ``os`` is not in scope.
        ssh.put_file(path.expanduser(command["local_path"]),
                     remote_path[-1],
                     mode=self.USER_RWX_OTHERS_RX_ACCESS_MODE)

    cmd.extend(command.get("command_args") or [])

    script_handle = None
    stdin = None
    if command.get("script_file"):
        script_handle = open(path.expanduser(command["script_file"]), "rb")
        stdin = script_handle
    elif command.get("script_inline"):
        stdin = six.moves.StringIO(command["script_inline"])

    try:
        # NOTE(review): the short timeout looks intentional -- failover()
        # tolerates SSHTimeout for disruptor commands -- confirm.
        return ssh.execute(cmd, stdin=stdin, timeout=10)
    finally:
        # Close only the file opened here; never leak the descriptor.
        if script_handle is not None:
            script_handle.close()
def one_killing_iteration(self, server, fip, computes, disruptor_cmd,
                          stop_instance):
    """Disrupt the compute hosting ``server`` and verify the failover.

    Steps:

    1. Look up the host currently running the instance and the list of all
       hypervisor hostnames (admin credentials).
    2. Run the disruptor command on that host via ``failover``.
    3. Wait until the instance's ``OS-EXT-SRV-ATTR:host`` is one of the
       other hypervisors.
    4. For a stopped instance wait for ``SHUTOFF``; otherwise, if
       ``self.wait_for_ping`` is set, wait until the instance answers ping.

    The sanity checks on the hypervisor list run *before* the host is
    disrupted, so a misconfigured environment fails without taking a
    compute node down.

    :param server: the server under test
    :param fip: dictionary holding the server address under the ``"ip"`` key
    :param computes: dictionary with SSH credentials for the compute nodes;
        the keys ``port`` (default 22), ``username``, ``password``,
        ``key_filename`` and ``pkey`` are read
    :param disruptor_cmd: command dictionary to run on the hosting compute;
        when empty, an immediate reboot through ``/proc/sysrq-trigger`` is
        used
    :param stop_instance: whether the instance was stopped before the
        disruption
    :raises rally.exceptions.RallyException: the hosting compute is not in
        the hypervisor list, or there is no other hypervisor to move to
    :raises rally.exceptions.TimeoutException: re-raised when the instance
        does not answer ping in time (console log is dumped at debug level)
    """
    nova = self.admin_clients("nova")
    server_admin = nova.servers.get(server.id)
    host_name_pre = getattr(server_admin, "OS-EXT-SRV-ATTR:host")
    # NOTE(review): "<short hostname>.external" is presumably the name the
    # deployment publishes for the compute on the external network --
    # confirm against the target environment.
    host_name_ext = host_name_pre.split('.')[0] + ".external"

    hostnames = [getattr(hypervisor, "hypervisor_hostname")
                 for hypervisor in nova.hypervisors.list()]
    if host_name_pre not in hostnames:
        raise exceptions.RallyException(
            "Host %s of server %s was not found among hypervisors %s"
            % (host_name_pre, server.id, hostnames))

    # Every hypervisor except the one about to be disrupted is an
    # acceptable evacuation target.
    target_hosts = [name for name in hostnames if name != host_name_pre]
    if not target_hosts:
        raise exceptions.RallyException(
            "No hypervisor other than %s is available to take over "
            "server %s" % (host_name_pre, server.id))

    hypervisor_ip = socket.gethostbyname(host_name_ext.strip())

    if not disruptor_cmd:
        disruptor_cmd = {
            "script_inline": "sudo sh -c \"echo b > /proc/sysrq-trigger\"",
            "interpreter": "/bin/sh"
        }

    # Trigger failover of compute node hosting the instance
    failover(self, host=hypervisor_ip,
             command=disruptor_cmd,
             port=computes.get("port", 22),
             username=computes.get("username"),
             password=computes.get("password"),
             key_filename=computes.get("key_filename"),
             pkey=computes.get("pkey"))

    # Wait for instance to be moved to different host
    task_utils.wait_for(
        server_admin,
        status_attr="OS-EXT-SRV-ATTR:host",
        ready_statuses=target_hosts,
        update_resource=task_utils.get_from_manager(),
        timeout=120,
        check_interval=5
    )

    # A stopped instance must stay SHUTOFF; a running one must be pingable.
    if stop_instance:
        task_utils.wait_for(
            server,
            ready_statuses=["SHUTOFF"],
            update_resource=task_utils.get_from_manager(),
            timeout=60,
            check_interval=2
        )
        return

    if not self.wait_for_ping:
        return

    try:
        self._wait_for_ping(fip["ip"])
    except exceptions.TimeoutException:
        console_logs = self._get_server_console_output(server, None)
        LOG.debug("VM console logs:\n%s", console_logs)
        raise
def _wait_for_server_status(server, status):
    """Poll ``server`` every 2 seconds, for up to 120 seconds, for ``status``."""
    task_utils.wait_for(
        server,
        ready_statuses=[status],
        update_resource=task_utils.get_from_manager(),
        timeout=120,
        check_interval=2
    )


def recover_instance_ha(self, image, flavor, computes,
                        volume_args=None,
                        floating_network=None,
                        use_floating_ip=True,
                        force_delete=False,
                        stop_instance=False,
                        disruptor_cmd=None,
                        iterations=1,
                        wait_for_ping=True,
                        max_log_length=None,
                        **kwargs):
    """Boot a server, disrupt its compute host and verify the instance.

    The server is booted (optionally with a volume attached), optionally
    stopped, and then ``one_killing_iteration`` is run ``iterations`` times
    with a pause after each run. Finally the server is started again if it
    had been stopped, checked to be ACTIVE and deleted.

    :param image: glance image name to use for the vm
    :param flavor: VM flavor name
    :param computes: dictionary with SSH credentials for the compute nodes;
        ``one_killing_iteration`` reads ``username``, ``password``,
        ``port``, ``key_filename`` and ``pkey`` from it.

        Examples::

            computes: {
                username: heat-admin,
                key_filename: /path/to/ssh/id_rsa
                port: 22
            }
    :param volume_args: volume args; when set, a volume of
        ``volume_args["size"]`` is created and mapped to the server as
        ``vdrally``
    :param floating_network: external network name, for floating ip
    :param use_floating_ip: bool, floating or fixed IP for SSH connection
    :param force_delete: whether to use force_delete for servers
    :param stop_instance: whether to stop instance before disruptor command
    :param disruptor_cmd: command to be sent to the hosting compute node
    :param iterations: number of compute node killing iterations
    :param wait_for_ping: whether to check connectivity of the server
    :param max_log_length: number of tail nova console-log lines to
        retrieve when the initial ping check times out
    :param **kwargs: extra arguments for booting the server
    :returns: None
    """
    # one_killing_iteration() reads this flag from the scenario instance.
    self.wait_for_ping = wait_for_ping

    if volume_args:
        volume = self.cinder.create_volume(volume_args["size"],
                                           imageRef=None)
        kwargs["block_device_mapping"] = {"vdrally": "%s:::1" % volume.id}

    server, fip = self._boot_server_with_fip(
        image,
        flavor,
        use_floating_ip=use_floating_ip,
        floating_network=floating_network,
        key_name=self.context["user"]["keypair"]["name"],
        **kwargs)
    _wait_for_server_status(server, "ACTIVE")

    if self.wait_for_ping:
        try:
            self._wait_for_ping(fip["ip"])
        except exceptions.TimeoutException:
            # Dump the console log to help diagnose the unreachable VM.
            LOG.debug("VM console logs:\n%s",
                      self._get_server_console_output(server,
                                                      max_log_length))
            raise

    if stop_instance:
        self._stop_server(server)
        _wait_for_server_status(server, "SHUTOFF")

    # Pause before killing the compute: without it the backing image was
    # reported (as a bug) to get corrupted.
    time.sleep(30)

    for _ in range(iterations):
        one_killing_iteration(self, server, fip, computes,
                              disruptor_cmd, stop_instance)
        # Let the cluster bring the disrupted compute node back.
        LOG.info("Wait for compute nodes to come online after previous disruption")
        time.sleep(360)

    if stop_instance:
        # Bring the instance back up if it was stopped for the test.
        self._start_server(server)

    _wait_for_server_status(server, "ACTIVE")
    self._delete_server_with_fip(server, fip, force_delete=force_delete)
@types.convert(image={"type": "glance_image"},
               flavor={"type": "nova_flavor"})
@validation.add("image_valid_on_flavor",
                flavor_param="flavor", image_param="image")
@validation.add("valid_command", param_name="command", required=False)
@validation.add("number", param_name="port", minval=1, maxval=65535,
                nullable=True, integer_only=True)
@validation.add("external_network_exists", param_name="floating_network")
@validation.add("required_services",
                services=[consts.Service.NOVA, consts.Service.CINDER])
@validation.add("required_platform", platform="openstack",
                users=True, admin=True)
@scenario.configure(
    context={"cleanup@openstack": ["nova", "cinder"],
             "keypair@openstack": {},
             "allow_ssh@openstack": None},
    name="InstanceHA.recover_instance_fip_and_volume",
    platform="openstack")
class InstanceHARecoverFIPAndVolume(vm_utils.VMScenario,
                                    cinder_utils.CinderBasic):
    """Scenario ``InstanceHA.recover_instance_fip_and_volume``.

    Runs ``recover_instance_ha`` with its defaults for ``stop_instance``,
    ``disruptor_cmd`` and ``iterations``.
    """

    def run(self, image, flavor, computes,
            volume_args=None,
            floating_network=None,
            use_floating_ip=True,
            force_delete=False,
            wait_for_ping=True,
            max_log_length=None,
            **kwargs):
        """Delegate to ``recover_instance_ha``.

        All arguments are forwarded unchanged; see ``recover_instance_ha``
        for their meaning.
        """
        recover_instance_ha(
            self, image, flavor, computes,
            volume_args=volume_args,
            floating_network=floating_network,
            use_floating_ip=use_floating_ip,
            force_delete=force_delete,
            wait_for_ping=wait_for_ping,
            max_log_length=max_log_length,
            **kwargs)
@types.convert(image={"type": "glance_image"},
               flavor={"type": "nova_flavor"})
@validation.add("image_valid_on_flavor",
                flavor_param="flavor", image_param="image")
@validation.add("valid_command", param_name="command", required=False)
@validation.add("number", param_name="port", minval=1, maxval=65535,
                nullable=True, integer_only=True)
@validation.add("external_network_exists", param_name="floating_network")
@validation.add("required_services",
                services=[consts.Service.NOVA, consts.Service.CINDER])
@validation.add("required_platform", platform="openstack",
                users=True, admin=True)
@scenario.configure(
    context={"cleanup@openstack": ["nova", "cinder"],
             "keypair@openstack": {},
             "allow_ssh@openstack": None},
    name="InstanceHA.recover_instance_two_cycles",
    platform="openstack")
class InstanceHARecoverTwoCycle(vm_utils.VMScenario,
                                cinder_utils.CinderBasic):
    """Scenario ``InstanceHA.recover_instance_two_cycles``.

    Runs ``recover_instance_ha`` with ``iterations=2``, i.e. the hosting
    compute is disrupted twice in a row.
    """

    def run(self, image, flavor, computes,
            volume_args=None,
            floating_network=None,
            use_floating_ip=True,
            force_delete=False,
            wait_for_ping=True,
            max_log_length=None,
            **kwargs):
        """Delegate to ``recover_instance_ha`` with two iterations.

        All other arguments are forwarded unchanged; see
        ``recover_instance_ha`` for their meaning.
        """
        recover_instance_ha(
            self, image, flavor, computes,
            volume_args=volume_args,
            floating_network=floating_network,
            use_floating_ip=use_floating_ip,
            force_delete=force_delete,
            iterations=2,
            wait_for_ping=wait_for_ping,
            max_log_length=max_log_length,
            **kwargs)
@types.convert(image={"type": "glance_image"},
               flavor={"type": "nova_flavor"})
@validation.add("image_valid_on_flavor",
                flavor_param="flavor", image_param="image")
@validation.add("valid_command", param_name="command", required=False)
@validation.add("number", param_name="port", minval=1, maxval=65535,
                nullable=True, integer_only=True)
@validation.add("external_network_exists", param_name="floating_network")
@validation.add("required_services",
                services=[consts.Service.NOVA, consts.Service.CINDER])
@validation.add("required_platform", platform="openstack",
                users=True, admin=True)
@scenario.configure(
    context={"cleanup@openstack": ["nova", "cinder"],
             "keypair@openstack": {},
             "allow_ssh@openstack": None},
    name="InstanceHA.recover_stopped_instance_fip",
    platform="openstack")
class InstanceHARecoverStopped(vm_utils.VMScenario,
                               cinder_utils.CinderBasic):
    """Scenario ``InstanceHA.recover_stopped_instance_fip``.

    Runs ``recover_instance_ha`` with ``stop_instance=True``, i.e. the
    instance is stopped before its compute host is disrupted.
    """

    def run(self, image, flavor, computes,
            volume_args=None,
            floating_network=None,
            use_floating_ip=True,
            force_delete=False,
            wait_for_ping=True,
            max_log_length=None,
            **kwargs):
        """Delegate to ``recover_instance_ha`` with a stopped instance.

        All other arguments are forwarded unchanged; see
        ``recover_instance_ha`` for their meaning.
        """
        recover_instance_ha(
            self, image, flavor, computes,
            volume_args=volume_args,
            floating_network=floating_network,
            use_floating_ip=use_floating_ip,
            force_delete=force_delete,
            stop_instance=True,
            wait_for_ping=wait_for_ping,
            max_log_length=max_log_length,
            **kwargs)
@types.convert(image={"type": "glance_image"},
               flavor={"type": "nova_flavor"})
@validation.add("image_valid_on_flavor",
                flavor_param="flavor", image_param="image")
@validation.add("valid_command", param_name="command", required=False)
@validation.add("number", param_name="port", minval=1, maxval=65535,
                nullable=True, integer_only=True)
@validation.add("external_network_exists", param_name="floating_network")
@validation.add("required_services",
                services=[consts.Service.NOVA, consts.Service.CINDER])
@validation.add("required_platform", platform="openstack",
                users=True, admin=True)
@scenario.configure(
    context={"cleanup@openstack": ["nova", "cinder"],
             "keypair@openstack": {},
             "allow_ssh@openstack": None},
    name="InstanceHA.recover_instance_nova_compute",
    platform="openstack")
class InstanceHARecoverNovaCompute(vm_utils.VMScenario,
                                   cinder_utils.CinderBasic):
    """Scenario ``InstanceHA.recover_instance_nova_compute``.

    Runs ``recover_instance_ha`` with a disruptor command that sends
    ``kill -9`` to processes selected by ``ps -ef | grep ^nova*`` on the
    hosting compute, instead of the default disruptor.
    """

    def run(self, image, flavor, computes,
            volume_args=None,
            floating_network=None,
            use_floating_ip=True,
            force_delete=False,
            wait_for_ping=True,
            max_log_length=None,
            **kwargs):
        """Delegate to ``recover_instance_ha`` with the kill disruptor.

        All other arguments are forwarded unchanged; see
        ``recover_instance_ha`` for their meaning.
        """
        kill_nova_cmd = {
            "script_inline": "sudo kill -9 $(ps -ef | grep ^nova* | awk \'{print$2}\'); echo {}",
            "interpreter": "/bin/sh"
        }
        recover_instance_ha(
            self, image, flavor, computes,
            volume_args=volume_args,
            floating_network=floating_network,
            use_floating_ip=use_floating_ip,
            force_delete=force_delete,
            disruptor_cmd=kill_nova_cmd,
            wait_for_ping=wait_for_ping,
            max_log_length=max_log_length,
            **kwargs)