Change-Id: I16c69cf1d2ad6ff7c5b05ec5f3528e7cd8785249
This commit is contained in:
Saad Zaher 2015-12-21 11:35:58 +00:00
parent b635db01e8
commit 0ee63ad837
10 changed files with 687 additions and 0 deletions

13
README.rst Normal file
View File

@ -0,0 +1,13 @@
Openstack Compute High Availability
Osha provides high availability for OpenStack compute nodes. It monitors all compute nodes in your deployment
and, if one of them fails, it launches the evacuation tool to evacuate that node and move all of its
instances to another compute node.
Osha has a pluggable architecture, so you can use any monitoring system you like to monitor your compute nodes
just by adding a simple plugin and adjusting your configuration file to use that plugin, or a combination of plugins.
Osha runs as a scheduler in the control plane and communicates with the monitoring system to get the status of the compute nodes.
To make Osha itself highly available, run it in active/passive mode.

165
daemon.py Normal file
View File

@ -0,0 +1,165 @@
#!/usr/bin/env python
import sys, os, time, atexit
from signal import SIGTERM
import logging as log
class Daemon(object):
    """
    A generic UNIX daemon base class.

    Usage: subclass the Daemon class and override the run() method, then
    call start(), stop(), restart() or status() on an instance.

    :param pidfile: path of the file used to record the daemon's PID
    :param stdin: file the daemon's standard input is redirected from
    :param stdout: file the daemon's standard output is appended to
    :param stderr: file the daemon's standard error is appended to
    """

    def __init__(self, pidfile, stdin='/dev/null', stdout='/dev/null',
                 stderr='/dev/null'):
        self.stdin = stdin
        self.stdout = stdout
        self.stderr = stderr
        self.pidfile = pidfile

    def _fork_and_exit_parent(self, attempt):
        """Fork once; the parent exits with 0 and only the child returns."""
        try:
            pid = os.fork()
        except OSError as e:
            sys.stderr.write("fork #%d failed: %d (%s)\n" %
                             (attempt, e.errno, e.strerror))
            log.error(e)
            sys.exit(1)
        if pid > 0:
            sys.exit(0)

    def _read_pid(self):
        """
        Return the PID recorded in the pidfile.

        :return: the PID as an int, or None when the pidfile is missing,
                 unreadable or does not contain a number
        """
        try:
            with open(self.pidfile, 'r') as pf:
                return int(pf.read().strip())
        except (IOError, ValueError):
            return None

    def daemonize(self):
        """
        do the UNIX double-fork magic, see Stevens' "Advanced
        Programming in the UNIX Environment" for details
        (ISBN 0201563177)
        http://www.erlenstar.demon.co.uk/unix/faq_2.html#SEC16
        """
        self._fork_and_exit_parent(1)

        # decouple from parent environment
        os.chdir("/")
        os.setsid()
        os.umask(0)

        # do second fork
        self._fork_and_exit_parent(2)

        # redirect standard file descriptors
        sys.stdout.flush()
        sys.stderr.flush()
        si = open(self.stdin, 'r')
        so = open(self.stdout, 'a+')
        # Unbuffered, so binary mode is required on Python 3; only the
        # file descriptor is used below, so the mode does not matter.
        se = open(self.stderr, 'ab+', 0)
        os.dup2(si.fileno(), sys.stdin.fileno())
        os.dup2(so.fileno(), sys.stdout.fileno())
        os.dup2(se.fileno(), sys.stderr.fileno())

        # write pidfile
        atexit.register(self.delpid)
        pid = str(os.getpid())
        with open(self.pidfile, 'w+') as f:
            f.write("%s\n" % pid)

    def delpid(self):
        """Remove the pidfile if it is still there (registered with atexit)."""
        if os.path.exists(self.pidfile):
            os.remove(self.pidfile)

    def start(self):
        """
        Start the daemon

        Exits with status 1 if the pidfile already holds a PID; otherwise
        daemonizes the process and calls run().
        """
        # Check for a pidfile to see if the daemon already runs
        pid = self._read_pid()
        if pid:
            message = "pidfile %s already exist. Daemon" \
                      " already running?\n"
            sys.stderr.write(message % self.pidfile)
            sys.exit(1)
        # Start the daemon
        self.daemonize()
        self.run()

    # @todo needs some enhancement like check /proc/%pid/status if it's
    # really running or not ! may be it's killed by external process
    # the PID won't be updated !
    def status(self):
        """
        Report on stdout whether a pidfile with a PID exists, then exit
        with status 0 in both cases.
        """
        pid = self._read_pid()
        if pid:
            message = "pidfile %s already exist. Daemon already " \
                      "running. PID: %d \n"
            sys.stdout.write(message % (self.pidfile, pid))
        else:
            message = "Service not running !\n"
            sys.stdout.write(message)
        sys.exit(0)

    def stop(self):
        """
        Stop the daemon

        Sends SIGTERM to the recorded PID every 0.1s until the process is
        gone, then removes the pidfile. Returns without error when no PID
        is recorded, so that restart() can be used on a stopped daemon.
        """
        import errno

        # Get the pid from the pidfile
        pid = self._read_pid()
        if not pid:
            message = "pidfile %s does not exist." \
                      " Daemon not running?\n"
            sys.stderr.write(message % self.pidfile)
            return  # not an error in a restart

        # Try killing the daemon process
        try:
            while True:
                os.kill(pid, SIGTERM)
                time.sleep(0.1)
        except OSError as err:
            # ESRCH means the process no longer exists: the stop succeeded.
            # Checking errno avoids depending on the (localised) message.
            if err.errno == errno.ESRCH:
                if os.path.exists(self.pidfile):
                    os.remove(self.pidfile)
            else:
                print(str(err))
                sys.exit(1)

    def restart(self):
        """
        Restart the daemon
        """
        self.stop()
        self.start()

    def run(self):
        """
        You should override this method when you subclass Daemon.
        It will be called after the process has been
        daemonized by start() or restart().
        """

22
evacuate.py Normal file
View File

@ -0,0 +1,22 @@
#__author__ = 'saad'
from monitor import Monitor
import osclient
import os

# Keystone credentials. Each value can be overridden through the standard
# OpenStack OS_* environment variables; the literals below are only the
# fallbacks that used to be hard-coded here.
# NOTE(review): a real password should not live in source control -- export
# OS_PASSWORD instead and drop this default once deployments are updated.
password = os.environ.get('OS_PASSWORD', 'BOMrLNGHsoBb')
user_id = os.environ.get('OS_USER_ID', 'ec2548d6acb54e7ba24f479e2f3cb1a5')
username = os.environ.get('OS_USERNAME', 'admin')
auth_url = os.environ.get('OS_AUTH_URL', 'http://192.168.245.9:35357/v3')
project_name = os.environ.get('OS_PROJECT_NAME', 'demo')
project_id = os.environ.get('OS_PROJECT_ID',
                            'f749b2874b0040aca92ea131210eb774')
user_domain_name = os.environ.get('OS_USER_DOMAIN_NAME', 'Default')
project_domain_name = os.environ.get('OS_PROJECT_DOMAIN_NAME', 'Default')

client = osclient.OSClient(authurl=auth_url,
                           username=username,
                           password=password,
                           user_domain_name=user_domain_name,
                           project_name=project_name,
                           project_domain_name=project_domain_name,
                           endpoint_type='internal')

# Run a single monitoring pass; the second argument (1) is the Monitor's
# `wait` value.
monitor = Monitor(client, 1)
monitor.monitor()

115
ipmitool.py Normal file
View File

@ -0,0 +1,115 @@
# __author__ = 'saad'
import sys
import subprocess
from distutils import spawn
class IpmiInterface(object):
    """
    Thin wrapper around the ``ipmitool`` command line utility.

    Commands are executed as an argument list without a shell, so hosts,
    usernames and passwords containing spaces or shell metacharacters are
    passed to ipmitool verbatim.

    :param host: IPMI IP address of the server
    :param username: IPMI username
    :param password: IPMI password
    :param verbose: when True, print the executed command and its output
    :param interface: IPMI interface, one of 'lan' or 'lanplus'
    :raises Exception: if the interface is not supported
    """
    _IPMI = 'ipmitool'
    _RAW_CMD = '{0} -I {1} -H {2} -U {3} -P {4} '
    _SUPPORTED_INTERFACES = ['lan', 'lanplus']

    def __init__(self, host, username, password, verbose=False,
                 interface='lanplus'):
        if interface not in self._SUPPORTED_INTERFACES:
            raise Exception("Provided Interface is not supported")
        # Fall back to the bare command name so that a missing binary fails
        # with a clear error at execution time instead of running "None".
        self._IPMI = self._find_ipmitool() or IpmiInterface._IPMI
        self._host = host
        self._username = username
        self._password = password
        self._verbose = verbose
        self._interface = interface
        self._update_cmd_credentials(
            host=host,
            username=username,
            password=password,
            interface=interface
        )

    @staticmethod
    def _find_ipmitool():
        """Return the path of the ipmitool executable, or None if not found."""
        try:
            from shutil import which as find_executable  # Python 3.3+
        except ImportError:
            find_executable = spawn.find_executable
        found = find_executable('ipmitool')
        if not found:
            # Same fallback as before: also look in the interpreter's
            # sys.path directories.
            found = find_executable('ipmitool', path=':'.join(sys.path))
        return found

    def _update_cmd_credentials(self, host, username, password, interface):
        """
        Update credentials to work with different server
        :param host: IPMI IP address of the server
        :param username: IPMI username
        :param password: IPMI password
        :param interface: IPMI Interface lan, lanplus
        """
        # Argument list actually used to run ipmitool.
        self._base_args = [self._IPMI, '-I', interface, '-H', host,
                           '-U', username, '-P', password]
        # Human readable form of the same command prefix.
        self._cmd = self._RAW_CMD.format(
            self._IPMI,
            interface,
            host,
            username,
            password
        )

    def get_power_status(self):
        """
        get the machine power status
        :return: 1 if the power is on and 0 if the power is off. otherwise it
        will return -1 for unknown state
        """
        output = self._process_request(
            self._base_args + ['chassis', 'power', 'status'])
        if self._verbose:
            print("[Debug]:  %s" % output)
        status = output.lower()
        if 'is on' in status:
            return 1
        if 'is off' in status:
            return 0
        return -1  # power status unknown

    def power_down(self):
        """
        shutdown the machine
        :return: output of the ipmitool command
        """
        return self._process_request(
            self._base_args + ['chassis', 'power', 'down'])

    def power_reset(self):
        """
        restart the machine
        :return: output of the ipmitool command
        """
        return self._process_request(
            self._base_args + ['chassis', 'power', 'reset'])

    def power_on(self):
        """
        power on the machine
        :return: output of the ipmitool command
        """
        return self._process_request(
            self._base_args + ['chassis', 'power', 'on'])

    def _process_request(self, cmd):
        """
        Run a command and return its standard output as text.

        :param cmd: the command, either as a list of arguments or as a single
        string (which is split with shell-like quoting rules, but never
        interpreted by a shell)
        :return: standard output of the command
        :raises Exception: with the command's standard error as message when
        the command exits with a non-zero status
        """
        if isinstance(cmd, (list, tuple)):
            args = list(cmd)
        else:
            import shlex
            args = shlex.split(cmd)
        if self._verbose:
            # NOTE(review): this prints the IPMI password in clear text.
            print("Executing IPMI command:  %s" % ' '.join(args))
        process = subprocess.Popen(args, shell=False,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.PIPE,
                                   universal_newlines=True)
        output, error = process.communicate()
        if self._verbose:
            print("[Debug] Process Output:  %s" % output)
            print("[Debug] Process Error:  %s" % error)
        if process.returncode:
            raise Exception(error)
        return output

    def _custom_cmd(self, cmd):
        """
        execute custom ipmitool commands
        :param cmd: string contains the command, for credentials and interface
        you should _update_cmd_credentials to update them first
        :return: output of the command you sent or raise error
        """
        import shlex
        return self._process_request(self._base_args + shlex.split(cmd))

90
monitor.py Normal file
View File

@ -0,0 +1,90 @@
#__author__ = 'saad'
from time import sleep
class Monitor(object):
    """
    Detect failed compute nodes through an OpenStack client and trigger
    their evacuation.

    :param client: object providing novacomputes(), novahypervisors(),
    neutronagents() and evacuate(nodes)
    :param wait: seconds to wait before re-checking nodes that look down
    """

    def __init__(self, client, wait):
        self.client = client
        self.wait = wait

    def get_down_nodes(self):
        """
        :return: list of hosts reported down by the nova-compute service, the
        hypervisor AND the neutron agent at the same time
        """
        # list all down nova compute (queried first, as in the original flow)
        nova_down = self.is_nova_service_down()
        # list all down hypervisors
        hypervisor_down = set(self.is_hpyervisor_down())
        # list all down openvswitch agents
        agents_down = set(self.is_neutron_agents_down())
        return [node for node in nova_down
                if node in hypervisor_down and node in agents_down]

    def monitor(self):
        """
        Run one monitoring pass: find down nodes, confirm them after the
        configured wait, evacuate the confirmed ones and notify.

        :raises RuntimeError: if nodes were selected for evacuation but the
        client reported none as evacuated
        """
        nodes_down = self.get_down_nodes()
        if not nodes_down:
            return
        nodes_to_evacuate = self.process_failed_nodes(nodes_down)
        if not nodes_to_evacuate:
            return
        evacuated_nodes = self.evacuate(nodes_to_evacuate)
        if not evacuated_nodes:
            raise RuntimeError("Error: node didn't evacuated ! %s"
                               % (nodes_to_evacuate,))
        self.notify(evacuated_nodes)

    # @todo needs to be implemented !
    def notify(self, nodes):
        """
        will be used to notify the admins that there is something went wrong !

        Currently only prints the evacuated nodes.
        :param nodes: list of dicts, each with at least a 'host' key
        """
        if not nodes:
            return
        print("These nodes %s Evacuated" % nodes[0]['host'])
        print(nodes)

    def evacuate(self, nodes):
        """
        Ask the client to evacuate the given nodes.
        :param nodes: list of host names
        :return: whatever the client's evacuate() returns
        """
        # @todo add shutdown process
        # maintence mode not working with libvirt
        # self.client.set_in_maintance(nodes)
        return self.client.evacuate(nodes)

    def process_failed_nodes(self, nodes):
        """
        Wait `self.wait` seconds, check again and keep only the nodes that
        are still down, to filter out transient failures.
        :param nodes: hosts found down by the first check
        :return: list of hosts down in both checks
        """
        sleep(self.wait)
        return [node for node in self.get_down_nodes() if node in nodes]

    def is_hpyervisor_down(self):
        """
        :return: list of hosts whose hypervisor state is 'down'
        """
        return [(hypervisor.get('service') or {}).get('host')
                for hypervisor in self.client.novahypervisors()
                if hypervisor.get('state') == 'down']

    # Correctly spelled alias; the original (misspelled) name is kept for
    # existing callers.
    is_hypervisor_down = is_hpyervisor_down

    def is_nova_service_down(self):
        """
        :return: list of hosts whose nova-compute service is enabled but down
        """
        return [node.get('host')
                for node in self.client.novacomputes()
                if node.get('state') == 'down'
                and node.get('status') == 'enabled']

    def is_neutron_agents_down(self):
        """
        :return: list of hosts whose neutron agent is administratively up but
        not alive
        """
        return [agent.get('host')
                for agent in self.client.neutronagents()
                if agent.get('admin_state_up') and not agent.get('alive')]

116
osclient.py Normal file
View File

@ -0,0 +1,116 @@
__author__ = 'saad'
from keystoneclient.auth.identity import v3
from keystoneclient import session
from novaclient.v2 import client as novaclient
from neutronclient.v2_0 import client as neutronclient
class OSClient(object):
    """
    Small facade over the keystone, nova and neutron clients used to inspect
    and evacuate compute nodes.
    """

    def __init__(self, authurl, authmethod='password', **kwargs):
        """
        Provide Openstack credentials to initalize the connection to Openstack
        :param authurl: keystone v3 authentication URL
        :param authmethod: string authmethod should be password or token but
        currently we support only password !
        :param kwargs: username, password, user_id, project_name, project_id,
        user_domain_id, user_domain_name, project_domain_name, endpoint_type
        :raises ValueError: if authmethod is not 'password'
        """
        if authmethod != 'password':
            # Previously this only printed a message and then failed later
            # with an AttributeError inside auth(); fail early and clearly.
            raise ValueError(
                "The available authmethod is password for the time being. "
                "Please, provide a password credentials :) ")
        self.authmethod = authmethod
        self.authurl = authurl
        self.username = kwargs.get('username', None)
        self.password = kwargs.get('password')
        self.project_name = kwargs.get('project_name', None)
        self.project_id = kwargs.get('project_id', None)
        self.user_id = kwargs.get('user_id', None)
        self.user_domain_id = kwargs.get('user_domain_id', None)
        self.user_domain_name = kwargs.get('user_domain_name', None)
        self.project_domain_name = kwargs.get('project_domain_name', None)
        self.endpoint_type = kwargs.get('endpoint_type', 'internal')
        # Filled by novacomputes(); None means "not fetched yet".
        self.compute_hosts = None
        self.auth()

    def auth(self):
        """Create the keystone session used by all other calls."""
        auth = v3.Password(auth_url=self.authurl,
                           username=self.username,
                           password=self.password,
                           project_name=self.project_name,
                           user_domain_id=self.user_domain_id,
                           user_domain_name=self.user_domain_name,
                           project_domain_name=self.project_domain_name)
        self.authSession = session.Session(auth=auth)

    def _renew_session(self):
        """Replace the stored session with a new one reusing the same auth."""
        # NOTE(review): the original code renews the session before some
        # calls only; the reason is not visible here, behaviour is kept.
        new_sess = session.Session(auth=self.authSession.auth)
        self.authSession = new_sess
        return new_sess

    def novacomputes(self):
        """
        List nova-compute services and remember their hosts in
        self.compute_hosts.
        :return: list of dicts, one per nova-compute service
        """
        nova = novaclient.Client(session=self.authSession,
                                 endpoint_type=self.endpoint_type)
        compute_nodes = []
        for service in nova.services.list():
            service = service.to_dict()
            if service.get('binary') == 'nova-compute':
                compute_nodes.append(service)
        self.compute_hosts = [node.get('host') for node in compute_nodes]
        return compute_nodes

    def novahypervisors(self):
        """
        :return: list of dicts, one per hypervisor known to nova
        """
        nova = novaclient.Client(session=self.authSession,
                                 endpoint_type=self.endpoint_type)
        return [hypervisor.to_dict()
                for hypervisor in nova.hypervisors.list()]

    def neutronagents(self, hosts=None):
        """
        List the neutron-openvswitch-agent entries running on given hosts.
        :param hosts: hosts to keep; defaults to the compute hosts found by
        novacomputes() (which is called first if it has not run yet)
        :return: list of agent dicts
        """
        if not hosts:
            if getattr(self, 'compute_hosts', None) is None:
                self.novacomputes()
            hosts = self.compute_hosts
        neutron = neutronclient.Client(session=self._renew_session(),
                                       endpoint_type=self.endpoint_type)
        agents = neutron.list_agents()
        return [agent for agent in agents.get('agents')
                if agent.get('host') in hosts
                and agent.get('binary') == 'neutron-openvswitch-agent']

    def evacuate(self, nodes):
        """
        Collect the servers hosted on the given nodes.

        TODO: the actual evacuation is still disabled, i.e. for each server
        nova.servers.evacuate(server.get('uuid'), on_shared_storage=True)
        is not called yet.

        :param nodes: list of hypervisor host names
        :return: list of {'host': node, 'servers': [...]} dicts
        """
        nova = novaclient.Client(session=self._renew_session(),
                                 endpoint_type=self.endpoint_type)
        evacuated_nodes = []
        for node in nodes:
            for hypervisor in nova.hypervisors.search(node, True):
                # presumably `servers` can be absent when the hypervisor
                # hosts no instance -- TODO confirm against the nova API
                servers = getattr(hypervisor, 'servers', [])
                evacuated_nodes.append({'host': node, 'servers': servers})
        return evacuated_nodes

    def set_in_maintance(self, nodes):
        """
        Try to put every given host in maintenance mode.
        :param nodes: list of host names
        :return: list with the result of each successful update
        """
        nova = novaclient.Client(session=self._renew_session(),
                                 endpoint_type=self.endpoint_type)
        output = []
        values = {"maintenance_mode": "enable"}
        for node in nodes:
            host = nova.hosts.get(node)[0]
            try:
                output.append(host.update(values))
            except Exception as e:
                # best effort: report and continue with the next host
                print("ERROR :::  %s" % e)
        return output

37
osha.py Normal file
View File

@ -0,0 +1,37 @@
#__author__ = 'saad'
from daemon import Daemon
import sys
import logging as log
import time
# Configure the root logger to write to 'osha.log'. The path is relative,
# so it is resolved against the current working directory.
log.basicConfig(filename='osha.log')
class Osha(Daemon):
    """Osha service daemon; for now it only idles once daemonized."""

    def run(self):
        """Main loop of the daemon: sleep forever, one second at a time."""
        # @todo scheduling code goes here ! may be apscheduler or just cron !
        # just as a test ...
        idle_seconds = 1
        while True:
            time.sleep(idle_seconds)
if __name__ == '__main__':
    # won't run unless the folder is already created and have the correct
    # permissions !
    osha = Osha('/var/run/osha/osha.pid')
    usage = "Usage %s start|stop|restart|status" % sys.argv[0]
    # Map each supported command to the daemon method implementing it.
    commands = {
        'start': osha.start,
        'stop': osha.stop,
        'restart': osha.restart,
        'status': osha.status,
    }
    if len(sys.argv) != 2:
        print(usage)
        # wrong invocation is an error, so do not report success
        sys.exit(2)
    action = commands.get(sys.argv[1])
    if action is None:
        print("Unknown command ")
        print(usage)
        sys.exit(2)
    action()
    sys.exit(0)

9
requirements.txt Normal file
View File

@ -0,0 +1,9 @@
pbr>=0.6,!=0.7,<1.0
python-keystoneclient>=1.2.0,<1.4.0
python-neutronclient>=2.4.0,<2.5.0
python-novaclient>=2.22.0,<2.24.0
python-openstackclient>=1.0.3,<1.1.0
PyYAML>=3.1.0
oslo.config>=1.9.3,<1.10.0 # Apache-2.0
oslo.i18n>=1.5.0,<1.6.0 # Apache-2.0
oslo.log>=1.0.0,<1.1.0 # Apache-2.0

60
servers.yml Normal file
View File

@ -0,0 +1,60 @@
#
# (c) Copyright 2015 Hewlett Packard Enterprise Development Company LP
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
#
---
servers:
- id: deployer
ip-addr: 192.168.10.254
hostname: padawan-cp1-c0-m1-mgmt
ilo-ip: 192.168.9.2
ilo-password: password
ilo-user: admin
- id: ccn-0001
ip-addr: 192.168.10.3
hostname: padawan-cp1-c1-m1-mgmt
ilo-ip: 192.168.9.3
ilo-password: password
ilo-user: admin
- id: ccn-0002
ip-addr: 192.168.10.4
hostname: padawan-cp1-c1-m2-mgmt
ilo-ip: 192.168.9.4
ilo-password: password
ilo-user: admin
- id: ccn-0003
ip-addr: 192.168.10.5
hostname: padawan-cp1-c1-m3-mgmt
ilo-ip: 192.168.9.5
ilo-password: password
ilo-user: admin
- id: COMPUTE-0001
ip-addr: 192.168.10.6
hostname: padawan-ccp-comp0001-mgmt
ilo-ip: 192.168.9.6
ilo-password: password
ilo-user: admin
- id: COMPUTE-0002
ip-addr: 192.168.10.7
hostname: padawan-ccp-comp0002-mgmt
ilo-ip: 192.168.9.7
ilo-password: password
ilo-user: admin

60
yaml_parser.py Normal file
View File

@ -0,0 +1,60 @@
# __author__ = 'saad'
import yaml
import os
class YamlParser(object):
    """
    Load a YAML inventory file and look servers up in it.
    """
    _INDEX = 'servers'

    def __init__(self, yml_file, index='servers'):
        """
        Provide Yaml file to parse it and process data
        :param yml_file: path to yaml file
        :param index: the key in the .yml file to get all servers listed under
        this key. the default is 'servers'
        :raises ValueError: if no file is given
        :raises IOError: if the file does not exist
        """
        self.file = yml_file
        self._INDEX = index
        self.data = self.parse()

    def parse(self):
        """
        Read and parse self.file.
        :return: the parsed YAML document
        :raises ValueError: if no file is given
        :raises IOError: if the path does not point to an existing file
        """
        if not self.file:
            raise ValueError("No file specified !")
        if not os.path.isfile(self.file):
            raise IOError("File doesn't exist: %s" % self.file)
        with open(self.file, 'r') as stream:
            # safe_load replaces the original yaml.load: the inventory is
            # plain data, and yaml.load can build arbitrary Python objects
            # from the file content.
            return yaml.safe_load(stream)

    def find_server_by_ip(self, ip):
        """
        get server information ilo username, password and ip
        :param ip: mgmt ip address of the server, this should be the same like
        the ip in the .yml file
        :return: dict contains server information
        """
        return self.find_server('ip-addr', ip)

    def find_server_by_hostname(self, hostname):
        """
        get server information ilo username, password and ip
        :param hostname: hostname matches one of the ones in the .yml file
        :return: dict contains the server information
        """
        return self.find_server(key='hostname', value=hostname)

    def find_server(self, key, value):
        """
        Generic function to query the .yml file to get server information by
        any key.
        :param key: name of the server attribute to compare
        :param value: value the attribute must be equal to
        :return: the first matching server dict, or None if there is none
        """
        # An empty document or a missing index key means "no servers".
        servers = (self.data or {}).get(self._INDEX) or []
        for server in servers:
            if server.get(key) == value:
                return server
        return None