Building pluggable evacuators - still needs some work!
Change-Id: I03ab6d72cecbb05bed9366e7959cf2b573dd0047
This commit is contained in:
parent
da9fbe0ee7
commit
043a677238
|
@ -0,0 +1,9 @@
|
|||
# hints for Saad
|
||||
We need to split the software into components, e.g. a monitoring component that calls an evacuation component
|
||||
We need to decide whether we will process the hosts one by one or all at once.
|
||||
Do we need to use threading ??
|
||||
|
||||
In docs/diagram.pdf there is a step "Stop monitoring this node and alert admin". How can we achieve something like that with the native driver without something that maintains the status of the previous run?
|
||||
|
||||
After adding the notification driver, we can handle failed nodes by triggering the driver to send a notification directly to the admin.
|
||||
|
|
@ -91,6 +91,30 @@
|
|||
#instance_uuid_format = "[instance: %(uuid)s] "
|
||||
|
||||
|
||||
[evacuation]
|
||||
|
||||
#
|
||||
# From osha
|
||||
#
|
||||
|
||||
# Dotted Python path of the evacuator driver class to load and use to
|
||||
# evacuate compute nodes (string value)
|
||||
#driver = osha.evacuators.drivers.osha.standard.OshaStandardEvacuator
|
||||
|
||||
# Time in seconds to wait between retries to disable compute node or put it in
|
||||
# maintenance mode. Default 10 seconds (integer value)
|
||||
#wait = 10
|
||||
|
||||
# Number of retries to put node in maintenance mode before reporting failure to
|
||||
# evacuate the node (integer value)
|
||||
#retries = 1
|
||||
|
||||
# Dict contains kwargs to be passed to the evacuator driver. In case you have
|
||||
# additional args needs to be passed to your evacuator please, list them as
|
||||
# key0:value0, key1:value1, .... (dict value)
|
||||
#options =
|
||||
|
||||
|
||||
[fencer]
|
||||
|
||||
#
|
||||
|
@ -125,7 +149,7 @@
|
|||
# Openstack auth URI i.e. http://controller:5000 (string value)
|
||||
#auth_uri = <None>
|
||||
|
||||
# Openstack auth URL i.e. http://controller:35357 (string value)
|
||||
# Openstack auth URL i.e. http://controller:35357/v3 (string value)
|
||||
#auth_url = <None>
|
||||
|
||||
# Openstack auth plugin i.e. ( password, token, ...) password is the only
|
||||
|
|
|
@ -71,7 +71,7 @@ _KEYSTONE_AUTH_TOKEN = [
|
|||
help='Openstack auth URI i.e. http://controller:5000',
|
||||
dest='auth_uri'),
|
||||
cfg.StrOpt('auth_url',
|
||||
help='Openstack auth URL i.e. http://controller:35357',
|
||||
help='Openstack auth URL i.e. http://controller:35357/v3',
|
||||
dest='auth_url'),
|
||||
cfg.StrOpt('auth_plugin',
|
||||
help='Openstack auth plugin i.e. ( password, token, ...) '
|
||||
|
@ -106,6 +106,33 @@ _KEYSTONE_AUTH_TOKEN = [
|
|||
]
|
||||
|
||||
|
||||
# Options registered under the [evacuation] group of the configuration file.
_EVACUATION = [
    cfg.StrOpt('driver',
               default='osha.evacuators.drivers.osha.standard.'
                       'OshaStandardEvacuator',
               # Describes the driver itself; 'wait' below owns the
               # "time between retries" description.
               help='Dotted Python path of the evacuator driver class to '
                    'load and use to evacuate compute nodes',
               dest='driver'),
    cfg.IntOpt('wait',
               default=10,
               help='Time in seconds to wait between retries to disable compute'
                    ' node or put it in maintenance mode. Default 10 seconds',
               dest='wait'),
    cfg.IntOpt('retries',
               default=1,
               help='Number of retries to put node in maintenance mode before '
                    'reporting failure to evacuate the node',
               dest='retries'),
    cfg.DictOpt('options',
                default={},
                help='Dict contains kwargs to be passed to the evacuator driver'
                     '. In case you have additional args needs to be passed to '
                     'your evacuator please, list them as key0:value0, '
                     'key1:value1, ....',
                dest='options')
]
|
||||
|
||||
|
||||
def build_os_options():
|
||||
osclient_opts = [
|
||||
cfg.StrOpt('os-username',
|
||||
|
@ -200,6 +227,14 @@ def configure():
|
|||
CONF.register_group(fencers_grp)
|
||||
CONF.register_opts(_FENCER, group='fencer')
|
||||
|
||||
# Evacuation Section :)
|
||||
evacuators_grp = cfg.OptGroup('evacuation',
|
||||
title='Evacuation Options',
|
||||
help='Evacuation Driver/plugin opts to be '
|
||||
'used to Evacuate compute nodes')
|
||||
CONF.register_group(evacuators_grp)
|
||||
CONF.register_opts(_EVACUATION, group='evacuation')
|
||||
|
||||
# Osha Auth
|
||||
keystone_grp = cfg.OptGroup('keystone_authtoken',
|
||||
title='Keystone Auth Options',
|
||||
|
@ -239,7 +274,8 @@ def list_opts():
|
|||
None: _COMMON,
|
||||
'monitoring': _MONITORS,
|
||||
'keystone_authtoken': _KEYSTONE_AUTH_TOKEN,
|
||||
'fencer': _FENCER
|
||||
'fencer': _FENCER,
|
||||
'evacuation': _EVACUATION
|
||||
}
|
||||
|
||||
return _OPTS.items()
|
||||
|
|
|
@ -104,14 +104,26 @@ class OSClient:
|
|||
endpoint_type=self.endpoint_type)
|
||||
self.authSession = new_sess
|
||||
evacuated_nodes = []
|
||||
print "Nodes", nodes
|
||||
for node in nodes:
|
||||
hypervisors = nova.hypervisors.search(node.get('host'), True)
|
||||
print "Hypervisor found is:", hypervisors
|
||||
for hypervisor in hypervisors:
|
||||
host = {'host': node.get('host'), 'servers': hypervisor.servers}
|
||||
evacuated_nodes.append(host)
|
||||
for server in hypervisor.servers:
|
||||
output = nova.servers.evacuate(server.get('uuid'),
|
||||
try:
|
||||
output = nova.servers.evacuate(server.get('uuid'),
|
||||
on_shared_storage=True)
|
||||
except Exception as e:
|
||||
print "ERRORORRRROROROROROROROROROROROROROROROROROROROR"
|
||||
print e
|
||||
for i in range(0, 100):
|
||||
print "-",
|
||||
if i == 50:
|
||||
print "Evacuation Result !",
|
||||
print
|
||||
|
||||
print output
|
||||
exit()
|
||||
|
||||
|
@ -136,16 +148,17 @@ class OSClient:
|
|||
auth_session = session.Session(auth=self.authSession.auth)
|
||||
return auth_session
|
||||
|
||||
def get_node_status(self, hostname):
|
||||
def get_node_status(self, node):
|
||||
"""
|
||||
Check the node nova-service status and if it's disabled or not
|
||||
:param hostname: of the required node
|
||||
:return: return dict contains the node status if it's disabled or not !
|
||||
:param node: dict contains node info
|
||||
:return: True or False. True => node disabled, False => node is enabled
|
||||
or unknow status !
|
||||
"""
|
||||
nova = novaclient.Client(session=self.authSession,
|
||||
endpoint_type=self.endpoint_type)
|
||||
try:
|
||||
node = nova.services.find(host=hostname)
|
||||
node = nova.services.find(host=node.get('host'))
|
||||
print node
|
||||
except Exception as e:
|
||||
LOG.error(e)
|
||||
|
@ -153,14 +166,17 @@ class OSClient:
|
|||
|
||||
if not node:
|
||||
return False
|
||||
return node.to_dict()
|
||||
node = node.to_dict()
|
||||
if node.get('status') == 'disabled':
|
||||
return True
|
||||
return False
|
||||
|
||||
def disable_node(self, hostname):
|
||||
def disable_node(self, node):
|
||||
auth_session = session.Session(auth=self.authSession.auth)
|
||||
nova = novaclient.Client(session=auth_session,
|
||||
endpoint_type=self.endpoint_type)
|
||||
try:
|
||||
node = nova.services.find(host=hostname)
|
||||
node = nova.services.find(host=node.get('host'))
|
||||
except Exception as e:
|
||||
LOG.error(e)
|
||||
return False
|
||||
|
@ -181,3 +197,14 @@ class OSClient:
|
|||
return False
|
||||
|
||||
return True
|
||||
|
||||
def get_hypervisor_instances(self, node):
    """
    List the servers hosted on the hypervisor that matches a node
    :param node: dict contains node info; its 'host' value is used as the
    hypervisor search pattern
    :return: the 'servers' list of the first matching hypervisor, or an
    empty list when no hypervisor matches or it hosts no servers
    """
    auth_session = session.Session(auth=self.authSession.auth)
    nova = novaclient.Client(session=auth_session,
                             endpoint_type=self.endpoint_type)
    hypervisors = nova.hypervisors.search(node.get('host'), True)
    if not hypervisors:
        return []
    # Nova may omit the 'servers' key for a hypervisor that hosts no
    # instances, in which case the attribute is missing on the returned
    # object; treat that as "no instances" instead of raising.
    # NOTE(review): only the first search hit is used - confirm that the
    # search pattern cannot match more than one hypervisor.
    return getattr(hypervisors[0], 'servers', [])
|
||||
|
||||
|
||||
|
|
|
@ -13,6 +13,12 @@
|
|||
# limitations under the License.
|
||||
|
||||
import os
|
||||
from osha.common.osclient import OSClient
|
||||
from oslo_config import cfg
|
||||
from oslo_log import log
|
||||
|
||||
CONF = cfg.CONF
|
||||
LOG = log.getLogger(__name__)
|
||||
|
||||
|
||||
def env(*env_vars, **kwargs):
|
||||
|
@ -21,3 +27,24 @@ def env(*env_vars, **kwargs):
|
|||
if value:
|
||||
return value
|
||||
return kwargs.get('default', '')
|
||||
|
||||
|
||||
def get_os_client():
    """
    Build an OSClient from the [keystone_authtoken] configuration section
    :return: OSClient instance initialized with the configured credentials
    """
    keystone_conf = CONF.get('keystone_authtoken')
    # 'auth_url' is the only option handed to OSClient under a different
    # keyword ('authurl'); the remaining ones are forwarded under their
    # own names.
    client_kwargs = {'authurl': keystone_conf.get('auth_url')}
    same_name_options = (
        'username',
        'password',
        'project_name',
        'user_domain_id',
        'project_domain_id',
        'project_domain_name',
        'user_domain_name',
    )
    for option in same_name_options:
        client_kwargs[option] = keystone_conf.get(option)

    return OSClient(**client_kwargs)
|
||||
|
|
|
@ -0,0 +1 @@
|
|||
__author__ = 'saad'
|
|
@ -0,0 +1 @@
|
|||
__author__ = 'saad'
|
|
@ -0,0 +1,90 @@
|
|||
# (c) Copyright 2014,2015 Hewlett-Packard Development Company, L.P.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import abc
|
||||
import six
|
||||
|
||||
|
||||
@six.add_metaclass(abc.ABCMeta)
class EvacuatorBaseDriver(object):
    """
    Common interface that every evacuation driver has to implement, so the
    evacuation manager can use any of them interchangeably
    """

    def __init__(self, wait, retries, **kwargs):
        """
        Store the configuration values the driver was loaded with
        :param wait: time in seconds that the evacuator should wait before
        retrying to disable the node
        :param retries: number of times the evacuator will try to disable
        the compute node
        :param kwargs: extra driver-specific arguments loaded from the
        config file; kept as a dict in self.options
        :return: None
        """
        self.options = kwargs
        self.retries = retries
        self.wait = wait

    @abc.abstractmethod
    def disable_node(self, node):
        """
        Stop the compute node from accepting any new VMs or requests
        :param node: dict contains node's hostname
        :return: True or False
        """

    @abc.abstractmethod
    def is_node_disabled(self, node):
        """
        Tell whether the node is already disabled
        :param node: dict contains node's hostname
        :return: True or False
        """

    @abc.abstractmethod
    def evacuate_nodes(self, nodes):
        """
        Evacuate all running VMs on the given nodes
        :param nodes: list of nodes
        :return: list of nodes with updated status
        """

    @abc.abstractmethod
    def get_node_instances(self, node):
        """
        List the instances on a compute host
        :param node: dict contains node's hostname
        :return: list contains running VMs on the given node
        """

    def get_info(self):
        """
        Get driver information. Not abstract: this base implementation has
        no body and therefore returns None
        :return: dict contains driver information (in drivers overriding it)
        """

    @abc.abstractmethod
    def get_node_status(self, node):
        """
        Check the node status and report it
        :param node: dict contains node's hostname
        :return: dict with key 'status': 'True or False'
        """
|
|
@ -0,0 +1,83 @@
|
|||
# (c) Copyright 2014,2015 Hewlett-Packard Development Company, L.P.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
from oslo_config import cfg
|
||||
from oslo_log import log
|
||||
from oslo_utils import importutils
|
||||
from osha.fencers.common.manager import FencerManager
|
||||
|
||||
CONF = cfg.CONF
|
||||
LOG = log.getLogger(__name__)
|
||||
|
||||
|
||||
class EvacuationManager(object):
    """
    Load the configured evacuator driver and run the evacuation workflow:
    disable the nodes, optionally fence them, collect their instances and
    finally ask the driver to evacuate them
    """

    # Seconds to pause right before the driver is asked to evacuate.
    # NOTE(review): presumably gives Nova time to notice the hosts are
    # down - confirm. Override on the class or instance to tune it.
    pre_evacuation_delay = 30

    def __init__(self, enable_fencing=True):
        """
        Load the driver named in the [evacuation] config section
        :param enable_fencing: when True the disabled nodes are passed to
        FencerManager before their instances are evacuated
        """
        evacuation_conf = CONF.get('evacuation')
        self.driver = importutils.import_object(
            evacuation_conf.get('driver'),
            evacuation_conf.get('wait'),
            evacuation_conf.get('retries'),
            **(evacuation_conf.get('options') or {})
        )
        self.enable_fencing = enable_fencing
        self.wait = evacuation_conf.get('wait')
        # Attribute name (sic) kept as is for backward compatibility.
        self.retires = evacuation_conf.get('retries', 1)
        if self.retires <= 0:
            self.retires = 1
        # Nodes that could not be disabled during the last evacuate() call.
        self.failed_nodes = []

    def evacuate(self, nodes):
        """
        Disable, optionally fence, and evacuate the given nodes
        :param nodes: list of node dicts (each one carries a 'host' key);
        the list itself is not modified, the dicts get 'status' and
        'instances' keys
        :return: whatever the driver's evacuate_nodes returns for the nodes
        that were successfully disabled (and fenced, when enabled)
        """
        from time import sleep

        failed_nodes = []
        disabled_nodes = []
        # Build new lists instead of removing entries from 'nodes' while
        # looping over it: removing during iteration skips the node that
        # follows every failed one.
        for node in list(nodes):
            node['status'] = self._disable_with_retries(node)
            # make sure the disable request was successful
            if self.driver.get_node_status(node):
                disabled_nodes.append(node)
            else:
                # a node that failed at this step does not move on
                failed_nodes.append(node)
        # Keep the failures available to the caller (e.g. to alert on).
        self.failed_nodes = failed_nodes

        nodes = disabled_nodes
        if self.enable_fencing:
            fencer = FencerManager(nodes)
            nodes = fencer.fence()

        evacuable_nodes = []
        for node in nodes:
            node['instances'] = self.driver.get_node_instances(node)
            evacuable_nodes.append(node)

        sleep(self.pre_evacuation_delay)
        return self.driver.evacuate_nodes(evacuable_nodes)

    def _disable_with_retries(self, node):
        """
        Try to disable a node up to self.retires times
        :param node: dict contains node's hostname
        :return: True as soon as one attempt succeeds, False otherwise
        """
        from time import sleep

        for attempt in range(self.retires):
            # Honour the configured pause between two attempts only; the
            # first attempt starts immediately.
            if attempt and self.wait:
                sleep(self.wait)
            if self._disable_node(node):
                return True
        return False

    def _disable_node(self, node):
        """
        Disable the node unless the driver reports it as already disabled
        :param node: dict contains node's hostname
        :return: True when the node is already disabled, otherwise the
        result of the driver's disable_node
        """
        if self.driver.is_node_disabled(node):
            return True
        return self.driver.disable_node(node)
|
||||
|
|
@ -0,0 +1 @@
|
|||
__author__ = 'saad'
|
|
@ -0,0 +1 @@
|
|||
__author__ = 'saad'
|
|
@ -0,0 +1,44 @@
|
|||
# (c) Copyright 2014,2015 Hewlett-Packard Development Company, L.P.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
from oslo_config import cfg
|
||||
from oslo_log import log
|
||||
from osha.evacuators.common.driver import EvacuatorBaseDriver
|
||||
from osha.common.utils import get_os_client
|
||||
CONF = cfg.CONF
|
||||
LOG = log.getLogger(__name__)
|
||||
|
||||
|
||||
class OshaStandardEvacuator(EvacuatorBaseDriver):
    """
    Evacuator driver that forwards every operation to the client object
    returned by get_os_client()
    """

    def __init__(self, wait, retires, **kwargs):
        """
        Initialize the base driver and create the client
        :param wait: forwarded unchanged to the base driver
        :param retires: forwarded positionally to the base driver, where
        the matching parameter is named 'retries'
        :param kwargs: extra options forwarded to the base driver
        """
        super(OshaStandardEvacuator, self).__init__(wait, retires, **kwargs)
        self.client = get_os_client()

    def get_node_instances(self, node):
        # Delegates to the client's get_hypervisor_instances
        return self.client.get_hypervisor_instances(node)

    def disable_node(self, node):
        # Delegates to the client's disable_node
        return self.client.disable_node(node)

    def get_node_status(self, node):
        # Delegates to the client's get_node_status
        return self.client.get_node_status(node)

    def is_node_disabled(self, node):
        # Uses the same client call as get_node_status above
        return self.client.get_node_status(node)

    def evacuate_nodes(self, nodes):
        # Delegates to the client's evacuate
        return self.client.evacuate(nodes)
|
||||
|
||||
|
||||
|
|
@ -15,7 +15,7 @@ from osha.common import config
|
|||
from oslo_config import cfg
|
||||
from oslo_log import log
|
||||
from osha.monitors.common.manager import MonitorManager
|
||||
from osha.evacuate import EvacuationManager
|
||||
from osha.evacuators.common.manager import EvacuationManager
|
||||
|
||||
CONF = cfg.CONF
|
||||
LOG = log.getLogger(__name__)
|
||||
|
@ -36,8 +36,8 @@ def main():
|
|||
# deployments
|
||||
# Load Fence driver
|
||||
# Shutdown the node
|
||||
evacuator = EvacuationManager(nodes)
|
||||
evacuator.evacuate()
|
||||
evac = EvacuationManager()
|
||||
evac.evacuate(nodes)
|
||||
exit()
|
||||
|
||||
print "Fenced nodes are", nodes
|
Loading…
Reference in New Issue