Building pluggable evacuators - still needs some work!

Change-Id: I03ab6d72cecbb05bed9366e7959cf2b573dd0047
This commit is contained in:
Saad Zaher 2016-01-06 18:22:57 +00:00
parent da9fbe0ee7
commit 043a677238
13 changed files with 358 additions and 14 deletions

9
development hints Normal file
View File

@ -0,0 +1,9 @@
# hints for Saad
We need to split the software so that a monitoring component calls an evacuation component.
We need to decide whether we will process the hosts one by one or all hosts at once.
Do we need to use threading?
In docs/diagram.pdf: "Stop monitoring this node and alert admin". How can we achieve something like that with the native driver without something that maintains the status of the previous run?
After adding the notification driver, we can handle the failed nodes by triggering the driver to send a notification directly to the admin.

View File

@ -91,6 +91,30 @@
#instance_uuid_format = "[instance: %(uuid)s] "
[evacuation]
#
# From osha
#
# Evacuation driver to load, given as the dotted Python path of the evacuator
# class (string value)
#driver = osha.evacuators.drivers.osha.standard.OshaStandardEvacuator
# Time in seconds to wait between retries to disable compute node or put it in
# maintenance mode. Default 10 seconds (integer value)
#wait = 10
# Number of retries to put node in maintenance mode before reporting failure to
# evacuate the node (integer value)
#retries = 1
# Dict contains kwargs to be passed to the evacuator driver. In case you have
# additional args needs to be passed to your evacuator please, list them as
# key0:value0, key1:value1, .... (dict value)
#options =
[fencer]
#
@ -125,7 +149,7 @@
# Openstack auth URI i.e. http://controller:5000 (string value)
#auth_uri = <None>
# Openstack auth URL i.e. http://controller:35357 (string value)
# Openstack auth URL i.e. http://controller:35357/v3 (string value)
#auth_url = <None>
# Openstack auth plugin i.e. ( password, token, ...) password is the only

View File

@ -71,7 +71,7 @@ _KEYSTONE_AUTH_TOKEN = [
help='Openstack auth URI i.e. http://controller:5000',
dest='auth_uri'),
cfg.StrOpt('auth_url',
help='Openstack auth URL i.e. http://controller:35357',
help='Openstack auth URL i.e. http://controller:35357/v3',
dest='auth_url'),
cfg.StrOpt('auth_plugin',
help='Openstack auth plugin i.e. ( password, token, ...) '
@ -106,6 +106,33 @@ _KEYSTONE_AUTH_TOKEN = [
]
# Options registered under the [evacuation] configuration section. They
# select the evacuator driver and tune how it is invoked.
_EVACUATION = [
    # Dotted path of the driver class; the default is the standard evacuator.
    cfg.StrOpt('driver',
               default='osha.evacuators.drivers.osha.standard.'
                       'OshaStandardEvacuator',
               help='Evacuation driver to load, given as the dotted Python '
                    'path of the evacuator class',
               dest='driver'),
    cfg.IntOpt('wait',
               default=10,
               help='Time in seconds to wait between retries to disable compute'
                    ' node or put it in maintenance mode. Default 10 seconds',
               dest='wait'),
    cfg.IntOpt('retries',
               default=1,
               help='Number of retries to put node in maintenance mode before '
                    'reporting failure to evacuate the node',
               dest='retries'),
    # Free-form key:value pairs forwarded to the driver as keyword arguments.
    cfg.DictOpt('options',
                default={},
                help='Dict contains kwargs to be passed to the evacuator driver'
                     '. In case you have additional args needs to be passed to '
                     'your evacuator please, list them as key0:value0, '
                     'key1:value1, ....',
                dest='options')
]
def build_os_options():
osclient_opts = [
cfg.StrOpt('os-username',
@ -200,6 +227,14 @@ def configure():
CONF.register_group(fencers_grp)
CONF.register_opts(_FENCER, group='fencer')
# Evacuation Section :)
evacuators_grp = cfg.OptGroup('evacuation',
title='Evacuation Options',
help='Evacuation Driver/plugin opts to be '
'used to Evacuate compute nodes')
CONF.register_group(evacuators_grp)
CONF.register_opts(_EVACUATION, group='evacuation')
# Osha Auth
keystone_grp = cfg.OptGroup('keystone_authtoken',
title='Keystone Auth Options',
@ -239,7 +274,8 @@ def list_opts():
None: _COMMON,
'monitoring': _MONITORS,
'keystone_authtoken': _KEYSTONE_AUTH_TOKEN,
'fencer': _FENCER
'fencer': _FENCER,
'evacuation': _EVACUATION
}
return _OPTS.items()

View File

@ -104,14 +104,26 @@ class OSClient:
endpoint_type=self.endpoint_type)
self.authSession = new_sess
evacuated_nodes = []
print "Nodes", nodes
for node in nodes:
hypervisors = nova.hypervisors.search(node.get('host'), True)
print "Hypervisor found is:", hypervisors
for hypervisor in hypervisors:
host = {'host': node.get('host'), 'servers': hypervisor.servers}
evacuated_nodes.append(host)
for server in hypervisor.servers:
output = nova.servers.evacuate(server.get('uuid'),
try:
output = nova.servers.evacuate(server.get('uuid'),
on_shared_storage=True)
except Exception as e:
print "ERRORORRRROROROROROROROROROROROROROROROROROROROR"
print e
for i in range(0, 100):
print "-",
if i == 50:
print "Evacuation Result !",
print
print output
exit()
@ -136,16 +148,17 @@ class OSClient:
auth_session = session.Session(auth=self.authSession.auth)
return auth_session
def get_node_status(self, hostname):
def get_node_status(self, node):
"""
Check the node nova-service status and if it's disabled or not
:param hostname: of the required node
:return: return dict contains the node status if it's disabled or not !
:param node: dict contains node info
:return: True or False. True => node disabled, False => node is enabled
or unknow status !
"""
nova = novaclient.Client(session=self.authSession,
endpoint_type=self.endpoint_type)
try:
node = nova.services.find(host=hostname)
node = nova.services.find(host=node.get('host'))
print node
except Exception as e:
LOG.error(e)
@ -153,14 +166,17 @@ class OSClient:
if not node:
return False
return node.to_dict()
node = node.to_dict()
if node.get('status') == 'disabled':
return True
return False
def disable_node(self, hostname):
def disable_node(self, node):
auth_session = session.Session(auth=self.authSession.auth)
nova = novaclient.Client(session=auth_session,
endpoint_type=self.endpoint_type)
try:
node = nova.services.find(host=hostname)
node = nova.services.find(host=node.get('host'))
except Exception as e:
LOG.error(e)
return False
@ -181,3 +197,14 @@ class OSClient:
return False
return True
def get_hypervisor_instances(self, node):
    """
    List the instances hosted on the hypervisor matching a node
    :param node: dict contains node info; its 'host' value is used as the
    hypervisor search pattern
    :return: the servers reported for the first matching hypervisor, or an
    empty list when no hypervisor matches or it reports no servers
    """
    auth_session = session.Session(auth=self.authSession.auth)
    nova = novaclient.Client(session=auth_session,
                             endpoint_type=self.endpoint_type)
    # Second argument asks nova to include each hypervisor's servers.
    hypervisors = nova.hypervisors.search(node.get('host'), True)
    if not hypervisors:
        return []
    # Nova leaves the 'servers' field out of the response for a hypervisor
    # that hosts no instances (compute API microversions before 2.75), so the
    # attribute may be missing; treat that as "no instances".
    return getattr(hypervisors[0], 'servers', [])

View File

@ -13,6 +13,12 @@
# limitations under the License.
import os
from osha.common.osclient import OSClient
from oslo_config import cfg
from oslo_log import log
CONF = cfg.CONF
LOG = log.getLogger(__name__)
def env(*env_vars, **kwargs):
@ -21,3 +27,24 @@ def env(*env_vars, **kwargs):
if value:
return value
return kwargs.get('default', '')
def get_os_client():
    """
    Build an OS client from the [keystone_authtoken] configuration section
    :return: OSClient instance initialized with the configured credentials
    """
    auth_conf = CONF.get('keystone_authtoken')
    # OSClient takes the auth URL as ``authurl``; every other keyword has the
    # same name as the configuration option it is read from.
    client_kwargs = {'authurl': auth_conf.get('auth_url')}
    for option in ('username',
                   'password',
                   'project_name',
                   'user_domain_id',
                   'project_domain_id',
                   'project_domain_name',
                   'user_domain_name'):
        client_kwargs[option] = auth_conf.get(option)
    return OSClient(**client_kwargs)

View File

@ -0,0 +1 @@
# Module-level metadata naming the author of this file.
__author__ = 'saad'

View File

@ -0,0 +1 @@
# Module-level metadata naming the author of this file.
__author__ = 'saad'

View File

@ -0,0 +1,90 @@
# (c) Copyright 2014,2015 Hewlett-Packard Development Company, L.P.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import abc
import six
@six.add_metaclass(abc.ABCMeta)
class EvacuatorBaseDriver(object):
    """
    Abstract base class that all evacuation drivers should implement to
    have a unified interface
    """

    def __init__(self, wait, retries, **kwargs):
        """
        Initialize Evacuation driver with the config args
        :param wait: time in seconds that the evacuator should wait before
        retrying to disable the node
        :param retries: Number of times the evacuator will try to disable the
        compute node
        :param kwargs: Dict of arguments that any future driver may need to
        load it from the config file
        :return: None
        """
        self.wait = wait
        self.retries = retries
        self.options = kwargs

    @abc.abstractmethod
    def disable_node(self, node):
        """
        Disable the compute node from accepting any new VMs or requests
        :param node: dict contains node's hostname
        :return: True or False
        """

    @abc.abstractmethod
    def is_node_disabled(self, node):
        """
        Check if node is already disabled or not
        :param node: dict contains node's hostname
        :return: True or False
        """

    @abc.abstractmethod
    def evacuate_nodes(self, nodes):
        """
        Will evacuate all running VMs on the required nodes
        :param nodes: list of nodes
        :return: list of nodes with updated status
        """

    @abc.abstractmethod
    def get_node_instances(self, node):
        """
        List instances on a compute host
        :param node: dict contains node's hostname
        :return: List contains running VMs on a given node
        """

    def get_info(self):
        """
        Get Driver Information
        Not abstract: this default reports the values the driver was
        initialized with, and drivers may override it to add more details.
        :return: Dict contains driver information (class name, wait, retries
        and the extra options)
        """
        return {
            'name': type(self).__name__,
            'wait': self.wait,
            'retries': self.retries,
            'options': self.options,
        }

    @abc.abstractmethod
    def get_node_status(self, node):
        """
        Check the node status and report it
        :param node: dict contains node's hostname
        :return: a truthy value when the node is disabled, a falsy value when
        it is enabled or its status could not be determined
        """

View File

@ -0,0 +1,83 @@
# (c) Copyright 2014,2015 Hewlett-Packard Development Company, L.P.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from oslo_config import cfg
from oslo_log import log
from oslo_utils import importutils
from osha.fencers.common.manager import FencerManager
CONF = cfg.CONF
LOG = log.getLogger(__name__)
class EvacuationManager(object):
    """
    Coordinates disabling, optional fencing and evacuation of compute nodes.
    The actual operations are delegated to the evacuator driver named by the
    'driver' option of the [evacuation] configuration section.
    """

    # Seconds to pause right before asking the driver to evacuate the nodes.
    # Kept at the value that used to be hard-coded; override it on an
    # instance or subclass to change it.
    # NOTE(review): presumably gives fencing time to take effect - confirm.
    evacuation_delay = 30

    def __init__(self, enable_fencing=True):
        """
        Load the configured evacuator driver
        :param enable_fencing: when True, nodes are passed through the
        FencerManager before their instances are evacuated
        """
        evacuation_conf = CONF.get('evacuation')
        self.driver = importutils.import_object(
            evacuation_conf.get('driver'),
            evacuation_conf.get('wait'),
            evacuation_conf.get('retries'),
            **evacuation_conf.get('options')
        )
        self.enable_fencing = enable_fencing
        self.wait = evacuation_conf.get('wait')
        # The attribute keeps its historical spelling so existing readers of
        # it keep working. At least one attempt is always made.
        self.retires = evacuation_conf.get('retries', 1)
        if self.retires <= 0:
            self.retires = 1

    def evacuate(self, nodes):
        """
        Disable, optionally fence, and then evacuate the given nodes
        :param nodes: list of node dicts; each dict gets a 'status' key with
        the result of the disable step, and nodes that reach the evacuation
        step also get an 'instances' key
        :return: whatever the driver's evacuate_nodes() returns for the nodes
        that were successfully disabled (and fenced, when enabled)
        """
        # @todo needs more error handling like if the status didn't update or
        # we are unable to disable the node ???
        disabled_nodes = []
        # Build a new list instead of removing failures from ``nodes`` while
        # looping over it: removing during iteration skips the next node.
        for node in nodes:
            node['status'] = self._disable_with_retries(node)
            # make sure the disable request was successful; a node that
            # failed at this step is not moved to the next steps
            if self.driver.get_node_status(node):
                disabled_nodes.append(node)
        nodes = disabled_nodes

        if self.enable_fencing:
            fencer = FencerManager(nodes)
            nodes = fencer.fence()

        nodes_to_evacuate = []
        for node in nodes:
            node['instances'] = self.driver.get_node_instances(node)
            nodes_to_evacuate.append(node)

        from time import sleep
        sleep(self.evacuation_delay)
        return self.driver.evacuate_nodes(nodes_to_evacuate)

    def _disable_with_retries(self, node):
        """
        Try to disable a node up to the configured number of attempts
        :param node: dict contains node's hostname
        :return: the first truthy result of the disable step, or False when
        every attempt failed
        """
        from time import sleep
        for attempt in range(self.retires):
            status = self._disable_node(node)
            if status:
                return status
            # Wait the configured interval between attempts, but not after
            # the last one.
            if self.wait and attempt < self.retires - 1:
                sleep(self.wait)
        return False

    def _disable_node(self, node):
        """
        Disable a node unless the driver reports it is already disabled
        :param node: dict contains node's hostname
        :return: True when the node was already disabled, otherwise the
        result of the driver's disable_node()
        """
        if self.driver.is_node_disabled(node):
            return True
        return self.driver.disable_node(node)

View File

@ -0,0 +1 @@
# Module-level metadata naming the author of this file.
__author__ = 'saad'

View File

@ -0,0 +1 @@
# Module-level metadata naming the author of this file.
__author__ = 'saad'

View File

@ -0,0 +1,44 @@
# (c) Copyright 2014,2015 Hewlett-Packard Development Company, L.P.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from oslo_config import cfg
from oslo_log import log
from osha.evacuators.common.driver import EvacuatorBaseDriver
from osha.common.utils import get_os_client
CONF = cfg.CONF
LOG = log.getLogger(__name__)
class OshaStandardEvacuator(EvacuatorBaseDriver):
    """
    Evacuator driver that forwards every operation to the OS client
    returned by get_os_client()
    """

    def __init__(self, wait, retires, **kwargs):
        """
        Initialize the base driver and create the OS client
        :param wait: forwarded unchanged to the base driver
        :param retires: forwarded unchanged to the base driver as its
        retries argument
        :param kwargs: extra driver options, forwarded to the base driver
        """
        super(OshaStandardEvacuator, self).__init__(wait, retires, **kwargs)
        self.client = get_os_client()

    def is_node_disabled(self, node):
        """
        :param node: dict contains node info
        :return: result of the client's get_node_status() for the node
        """
        disabled = self.client.get_node_status(node)
        return disabled

    def get_node_status(self, node):
        """
        :param node: dict contains node info
        :return: result of the client's get_node_status() for the node
        """
        status = self.client.get_node_status(node)
        return status

    def disable_node(self, node):
        """
        :param node: dict contains node info
        :return: result of the client's disable_node() for the node
        """
        outcome = self.client.disable_node(node)
        return outcome

    def get_node_instances(self, node):
        """
        :param node: dict contains node info
        :return: result of the client's get_hypervisor_instances() for the
        node
        """
        instances = self.client.get_hypervisor_instances(node)
        return instances

    def evacuate_nodes(self, nodes):
        """
        :param nodes: list of nodes
        :return: result of the client's evacuate() for the nodes
        """
        evacuated = self.client.evacuate(nodes)
        return evacuated

View File

@ -15,7 +15,7 @@ from osha.common import config
from oslo_config import cfg
from oslo_log import log
from osha.monitors.common.manager import MonitorManager
from osha.evacuate import EvacuationManager
from osha.evacuators.common.manager import EvacuationManager
CONF = cfg.CONF
LOG = log.getLogger(__name__)
@ -36,8 +36,8 @@ def main():
# deployments
# Load Fence driver
# Shutdown the node
evacuator = EvacuationManager(nodes)
evacuator.evacuate()
evac = EvacuationManager()
evac.evacuate(nodes)
exit()
print "Fenced nodes are", nodes