Merge "Refactor evacuators"

This commit is contained in:
Jenkins 2017-07-14 09:07:26 +00:00 committed by Gerrit Code Review
commit 56efab7b54
6 changed files with 105 additions and 138 deletions

View File

@ -16,12 +16,6 @@
# Note: This option can be changed without restarting.
#debug = false
# DEPRECATED: If set to false, the logging level will be set to WARNING instead
# of the default INFO level. (boolean value)
# This option is deprecated for removal.
# Its value may be silently ignored in the future.
#verbose = true
# The name of a logging configuration file. This file is appended to any
# existing logging configuration files. For details about logging configuration
# files, see the Python logging module documentation. Note that when logging
@ -60,6 +54,12 @@
# is set. (boolean value)
#use_syslog = false
# Enable journald for logging. If running in a systemd environment you may wish
# to enable journal support. Doing so will use the journal native protocol
# which includes structured metadata in addition to log messages. This option is
# ignored if log_config_append is set. (boolean value)
#use_journal = false
# Syslog facility to receive log lines. This option is ignored if
# log_config_append is set. (string value)
#syslog_log_facility = LOG_USER
@ -88,7 +88,7 @@
# List of package logging levels in logger=LEVEL pairs. This option is ignored
# if log_config_append is set. (list value)
#default_log_levels = amqp=WARN,amqplib=WARN,boto=WARN,qpid=WARN,sqlalchemy=WARN,suds=INFO,oslo.messaging=INFO,iso8601=WARN,requests.packages.urllib3.connectionpool=WARN,urllib3.connectionpool=WARN,websocket=WARN,requests.packages.urllib3.util.retry=WARN,urllib3.util.retry=WARN,keystonemiddleware=WARN,routes.middleware=WARN,stevedore=WARN,taskflow=WARN,keystoneauth=WARN,oslo.cache=INFO,dogpile.core.dogpile=INFO
#default_log_levels = amqp=WARN,amqplib=WARN,boto=WARN,qpid=WARN,sqlalchemy=WARN,suds=INFO,oslo.messaging=INFO,oslo_messaging=INFO,iso8601=WARN,requests.packages.urllib3.connectionpool=WARN,urllib3.connectionpool=WARN,websocket=WARN,requests.packages.urllib3.util.retry=WARN,urllib3.util.retry=WARN,keystonemiddleware=WARN,routes.middleware=WARN,stevedore=WARN,taskflow=WARN,keystoneauth=WARN,oslo.cache=INFO,dogpile.core.dogpile=INFO
# Enables or disables publication of error events. (boolean value)
#publish_errors = false

View File

@ -22,72 +22,30 @@ class EvacuatorBaseDriver(object):
a unified interface
"""
def __init__(self, wait, retries, shared_storage, **kwargs):
def __init__(self, nodes, evacuator_conf, fencer):
"""
Initialize Evacuation driver with the config args
:param wait: time in seconds that the evacuator should wait before
retrying to disable the node
:param retries: Number of times the evacuator will try to disable the
compute node
:param shared_storage: Boolean; True if the compute nodes are running
on shared storage and False otherwise
:param kwargs: Dict of arguments that any future driver may need to
load it from the config file
:param nodes: A list of nodes to be evacuated.
:param evacuator_conf: A dict of arguments that got loaded from the
configuration file.
:return: None
"""
self.wait = wait
self.retries = retries
self.shared_storage = shared_storage
self.options = kwargs
self.nodes = nodes
self.evacuator_conf = evacuator_conf
self.fencer = fencer
@abc.abstractmethod
def disable_node(self, node):
"""
Disable the compute node from accepting any new VMs or requests
:param node: dict contains node's hostname
:return: True or False
def evacuate(self, enable_fencing=True):
"""Evacuate the infected node.
:return: Two lists; the first one will be the succeeded nodes and the
other is the failed nodes
"""
pass
@abc.abstractmethod
def is_node_disabled(self, node):
"""
Check if node is already disabled or not
:param node: dict contains node's hostname
:return: True or False
"""
pass
@abc.abstractmethod
def evacuate_nodes(self, nodes):
"""
Will evacuate all running VMs on the required nodes
:param nodes: list of nodes
:return: list of nodes with updated status
"""
pass
@abc.abstractmethod
def get_node_instances(self, node):
"""
List instances on a compute host
:param node: dict contains node's hostname
:return: List contains running VMs on a given node
"""
pass
def get_info(self):
"""
Get Driver Information
:return: Dict contains driver information
"""
pass
@abc.abstractmethod
def get_node_status(self, node):
"""
Check the node status and report it
:param node: dict contains node's hostname
:return: dict with key 'status': 'True or False'
"""
pass

View File

@ -25,71 +25,19 @@ LOG = log.getLogger(__name__)
class EvacuationManager(object):
def __init__(self, enable_fencing=True):
evcuation_conf = CONF.get('evacuation')
self.driver = importutils.import_object(
evcuation_conf.get('driver'),
evcuation_conf.get('wait'),
evcuation_conf.get('retries'),
evcuation_conf.get('shared_storage'),
**evcuation_conf.get('options')
)
self.enable_fencing = enable_fencing
self.wait = evcuation_conf.get('wait')
self.retires = evcuation_conf.get('retries', 1)
if self.retires <= 0:
self.retires = 1
def evacuate(self, nodes):
# try to disable node
# @todo needs more error handling like if the status didn't update or
# we are unable to disable the node ???
failed_nodes = [] # maintain nodes that are going to fail at any state
succeeded_nodes = []
for node in nodes:
for i in range(0, self.retires):
status = self._disable_node(node)
# if True ( node disabled ) break the loop
if status:
break
else:
status = False
node['status'] = status
# make sure the disable request was successful
if not self.driver.get_node_status(node):
failed_nodes.append(node)
nodes.remove(node) # if the node failed at any step no reason
# to move it to the next step
else:
succeeded_nodes.append(node)
fencer = FencerManager(nodes)
evacuation_conf = CONF.get('evacuation')
driver = importutils.import_object(
evacuation_conf['driver'],
nodes,
evacuation_conf,
fencer
)
nodes = succeeded_nodes
if self.enable_fencing:
fencer = FencerManager(nodes)
nodes = fencer.fence()
"""
@todo this code needs to be commented for the time being till we fix
nova bug found in state, which always goes up after enable or disable. We
will use get_node_details for the time being from the main script to
get nodes details before evacuating ...
succeeded_nodes = []
for node in nodes:
node['instances'] = self.driver.get_node_instances(node)
succeeded_nodes.append(node)
nodes = succeeded_nodes
"""
# Start evacuation calls ...
evacuated_nodes = []
for i in range(0, self.retires):
try:
sleep(self.wait)
nodes = self.driver.evacuate_nodes(nodes)
if not nodes:
return evacuated_nodes
evacuated_nodes = nodes
except Exception as e:
LOG.error(e)
return evacuated_nodes
return driver.evacuate(self.enable_fencing)
def get_nodes_details(self, nodes):
"""
@ -98,9 +46,3 @@ class EvacuationManager(object):
:return: list of node with more details
"""
return get_nodes_details(nodes)
def _disable_node(self, node):
if not self.driver.is_node_disabled(node):
return self.driver.disable_node(node)
else:
True

View File

@ -15,16 +15,82 @@ from oslo_config import cfg
from oslo_log import log
from freezer_dr.evacuators.common.driver import EvacuatorBaseDriver
from freezer_dr.common.utils import get_os_client
import time
CONF = cfg.CONF
LOG = log.getLogger(__name__)
class StandardEvacuator(EvacuatorBaseDriver):
def __init__(self, wait, retires, shared_storage, **kwargs):
super(StandardEvacuator, self).__init__(wait, retires, shared_storage,
**kwargs)
def __init__(self, nodes, evacuator_conf, fencer):
super(StandardEvacuator, self).__init__(nodes, evacuator_conf, fencer)
# initialize the OS client!
self.client = get_os_client()
self.wait = evacuator_conf.get('wait')
self.retires = evacuator_conf.get('retries', 1)
if self.retires <= 0:
self.retires = 1
def _disable_node(self, node):
if not self.is_node_disabled(node):
return self.disable_node(node)
else:
True
def evacuate(self, enable_fencing=True):
# try to disable node
# @todo needs more error handling like if the status didn't update or
# we are unable to disable the node ???
failed_nodes = [] # maintain nodes that are going to fail at any state
succeeded_nodes = []
for node in self.nodes:
status = False
for i in range(0, self.retires):
status = self._disable_node(node)
# if True ( node disabled ) break the loop
if status:
break
else:
status = False
node['status'] = status
# make sure the disable request was successful
if not self.get_node_status(node):
# if the node failed at any step no reason to move it to
# the next step
failed_nodes.append(node)
self.nodes.remove(node) #
else:
succeeded_nodes.append(node)
nodes = succeeded_nodes
if enable_fencing:
self.fencer.update_nodes(nodes)
nodes = self.fencer.fence()
"""
@todo this code needs to be commented for the time being till we fix
nova bug found in state, which always goes up after enable or disable. We
will use get_node_details for the time being from the main script to
get nodes details before evacuating ...
succeeded_nodes = []
for node in nodes:
node['instances'] = self.driver.get_node_instances(node)
succeeded_nodes.append(node)
nodes = succeeded_nodes
"""
# Start evacuation calls ...
evacuated_nodes = []
for i in range(0, self.retires):
try:
time.sleep(self.wait)
nodes = self.evacuate_nodes(nodes)
if not nodes:
break
evacuated_nodes = nodes
except Exception as e:
LOG.error(e)
return evacuated_nodes, failed_nodes
def get_node_instances(self, node):
return self.client.get_hypervisor_instances(node)
@ -39,7 +105,5 @@ class StandardEvacuator(EvacuatorBaseDriver):
return self.client.get_node_status(node)
def evacuate_nodes(self, nodes):
return self.client.evacuate(nodes, shared_storage=self.shared_storage)
return self.client.evacuate(
nodes, shared_storage=self.evacuator_conf['shared_storage'])

View File

@ -19,9 +19,8 @@ class DummyEvacuator(EvacuatorBaseDriver):
of Freezer-DR.
"""
def __init__(self, wait, retires, shared_storage, **kwargs):
super(DummyEvacuator, self).__init__(wait, retires, shared_storage,
**kwargs)
def __init__(self, nodes, evacuator_conf, fencer):
super(DummyEvacuator, self).__init__(nodes, evacuator_conf, fencer)
def disable_node(self, node):
return True

View File

@ -41,7 +41,11 @@ def main():
# Shutdown the node
evac = EvacuationManager()
notify_nodes = evac.get_nodes_details(nodes)
evac.evacuate(nodes)
evacuated_nodes, failed_nodes = evac.evacuate(nodes)
LOG.debug("Successfully evacuated nodes {0}".format(evacuated_nodes))
LOG.debug("Failed to evacuate nodes {0}".format(failed_nodes))
notifier.notify(notify_nodes, 'success')
failed_nodes = evac.get_nodes_details(failed_nodes)
notifier.notify(failed_nodes, 'error')
else:
print "No nodes reported to be down"