HACluster refactoring

Andres Rodriguez 2012-12-11 07:54:36 -05:00
parent 1c22ba36b4
commit f2b2373497
7 changed files with 191 additions and 81 deletions

TODO

@@ -15,23 +15,13 @@ HA Cluster (pacemaker/corosync) Charm
 * TODO: Fix Disable upstart jobs
   - sudo sh -c "echo 'manual' > /etc/init/SERVICE.override"
-* BIG PROBLEM:
-  - Given that we can only deploy hacluster once, and its config defines
-    the corosync configuration options, we need to change the approach
-    to how corosync is configured. Possible solution:
-    - In the 'service/charm' that uses hacluster, it will define the corosync options.
-    - Instead of a network source, it can define interfaces to use and assume each ethX
-      interface is connected to the same network and autodetect the network address.
-* TODO: On juju destroy-service quantum, ha-relation-broken is executed.
-  We need to put nodes in standby or delete them.
-* ERROR/BUG (discuss with jamespage):
-  - On add-unit in the controller environment:
-    - the subordinate (in the added unit) gets the relation data in ha-relation-joined
-  - On add-unit in openstack:
-    - the subordinate (in the added unit) *DOESN'T* get the relation data in ha-relation-joined
-    - This is fine really, because we don't need to re-add the services.
-    - However, the problem is that the upstart jobs don't get stopped.
 update-rc.d -f pacemaker remove
 update-rc.d pacemaker start 50 1 2 3 4 5 . stop 01 0 6 .
+TODO: The problem seems to be that the peer relation gets executed before the subordinate relation.
+In that case, the peer relation would have to put nodes in standby, and then the subordinate relation
+will have to put the nodes online and configure the services. Or probably not use it at all.
+The hanode relation puts the node in standby.
+The ha relation counts the nodes in the hanode relation and, if there are >= 2, onlines them and sets up the cluster.
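
As a rough illustration of that standby/online flow, using the standby() and online() helpers this commit adds to pcmk.py; the hook names and the peer-counting shortcut below are only a sketch, not the charm's final logic:

    import pcmk
    import utils

    def hanode_relation_joined():
        # Peer relation: park this node until the cluster is ready.
        pcmk.standby(utils.get_unit_hostname())

    def ha_relation_changed():
        # Subordinate relation: once at least two peers are present,
        # bring this node back online and continue with cluster setup.
        peers = [unit for relid in utils.relation_ids('hanode')
                 for unit in utils.relation_list(relid)]
        if len(peers) + 1 >= 2:
            pcmk.online(utils.get_unit_hostname())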

config.yaml

@@ -1,9 +1,4 @@
 options:
-  corosync_bindnetaddr:
-    type: string
-    description: |
-      Network address of the interface on which corosync will communicate
-      with the other nodes of the cluster.
   corosync_mcastaddr:
     default: 226.94.1.1
     type: string
@@ -11,13 +6,6 @@ options:
       Multicast IP address to use for exchanging messages over the network.
       If multiple clusters are on the same bindnetaddr network, this value
       can be changed.
-  corosync_mcastport:
-    default: 5405
-    type: int
-    description: |
-      Multicast Port number to use for exchanging messages. If multiple
-      clusters sit on the same Multicast IP Address, this value needs to
-      be changed.
   corosync_pcmk_ver:
     default: 1
     type: int
@@ -26,6 +14,7 @@ options:
       Corosync how to start pacemaker
   corosync_key:
     type: string
+    default: corosync-key
     description: |
       This value will become the Corosync authentication key. To generate
       a suitable value use:
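
With corosync_bindnetaddr and corosync_mcastport removed from config.yaml, those values now have to arrive over the 'ha' relation from the principal charm (see get_corosync_conf() in hooks/hooks.py below). A hedged sketch of what that could look like on the principal side; the hook name and the eth0/5405 values are examples, and relation-set is the stock juju hook tool, not something this commit adds:

    import subprocess

    def ha_relation_joined():
        # Publish the interface to bind corosync to and the multicast port;
        # the hacluster subordinate turns the interface into a bindnetaddr
        # via utils.get_network_address().
        subprocess.check_call(['relation-set',
                               'corosync_bindiface=eth0',    # example value
                               'corosync_mcastport=5405'])   # example value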

(new hook symlink)

@@ -0,0 +1 @@
+hooks.py

hooks/hooks.py

@@ -21,27 +21,42 @@ import pcmk
 def install():
     utils.juju_log('INFO', 'Begin install hook.')
     utils.configure_source()
-    utils.install('corosync', 'pacemaker', 'openstack-resource-agents')
+    utils.install('corosync', 'pacemaker', 'openstack-resource-agents', 'python-netaddr')
     utils.juju_log('INFO', 'End install hook.')

+def get_corosync_conf():
+    for relid in utils.relation_ids('ha'):
+        for unit in utils.relation_list(relid):
+            conf = {
+                'corosync_bindnetaddr': utils.get_network_address(
+                        utils.relation_get('corosync_bindiface',
+                                           unit, relid)),
+                'corosync_mcastport': utils.relation_get('corosync_mcastport',
+                                                         unit, relid),
+                'corosync_mcastaddr': utils.config_get('corosync_mcastaddr'),
+                'corosync_pcmk_ver': utils.config_get('corosync_pcmk_ver'),
+                }
+            if None not in conf.itervalues():
+                return conf
+    return None

 def emit_corosync_conf():
     # read config variables
-    corosync_conf_context = {
-        'corosync_bindnetaddr': utils.config_get('corosync_bindnetaddr'),
-        'corosync_mcastaddr': utils.config_get('corosync_mcastaddr'),
-        'corosync_mcastport': utils.config_get('corosync_mcastport'),
-        'corosync_pcmk_ver': utils.config_get('corosync_pcmk_ver'),
-        }
-    # write /etc/default/corosync file
-    with open('/etc/default/corosync', 'w') as corosync_default:
-        corosync_default.write(utils.render_template('corosync', corosync_conf_context))
+    corosync_conf_context = get_corosync_conf()

     # write config file (/etc/corosync/corosync.conf
     with open('/etc/corosync/corosync.conf', 'w') as corosync_conf:
         corosync_conf.write(utils.render_template('corosync.conf', corosync_conf_context))

+def emit_base_conf():
+    corosync_default_context = {'corosync_enabled': 'yes'}
+    # write /etc/default/corosync file
+    with open('/etc/default/corosync', 'w') as corosync_default:
+        corosync_default.write(utils.render_template('corosync', corosync_default_context))
+
     # write the authkey
     corosync_key=utils.config_get('corosync_key')
     with open(corosync_key, 'w') as corosync_key_file:
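
For context, emit_base_conf() and emit_corosync_conf() both go through utils.render_template(), a thin Jinja2 wrapper. A minimal stand-alone equivalent for the /etc/default/corosync template (see templates/corosync at the end of this commit), with the template string inlined purely for illustration:

    import jinja2

    template = ("# Configuration file created by the ha charm\n"
                "# start corosync at boot [yes|no]\n"
                "START={{ corosync_enabled }}\n")
    # Renders the same output the charm writes to /etc/default/corosync.
    print(jinja2.Template(template).render({'corosync_enabled': 'yes'}))
    # last line printed: START=yes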
@@ -51,12 +66,6 @@ def emit_corosync_conf():
 def config_changed():
     utils.juju_log('INFO', 'Begin config-changed hook.')

-    # validate configuration options
-    corosync_bindnetaddr = utils.config_get('corosync_bindnetaddr')
-    if corosync_bindnetaddr == '':
-        utils.juju_log('CRITICAL', 'No bindnetaddr supplied, cannot proceed.')
-        sys.exit(1)
-
     corosync_key = utils.config_get('corosync_key')
     if corosync_key == '':
         utils.juju_log('CRITICAL',
@@ -64,7 +73,7 @@ def config_changed():
         sys.exit(1)

     # Create a new config file
-    emit_corosync_conf()
+    emit_base_conf()

     utils.juju_log('INFO', 'End config-changed hook.')
@@ -81,14 +90,15 @@ def start():
     else:
         utils.start("corosync")

-    # TODO: Only start pacemaker after making sure
+    # Only start pacemaker after making sure
     # corosync has been started
     # Wait a few seconds for corosync to start.
     time.sleep(2)
-    if utils.running("pacemaker"):
-        utils.restart("pacemaker")
-    else:
-        utils.start("pacemaker")
+    if utils.running("corosync"):
+        if utils.running("pacemaker"):
+            utils.restart("pacemaker")
+        else:
+            utils.start("pacemaker")

 def stop():
@@ -100,14 +110,23 @@ def stop():
 def ha_relation():
     utils.juju_log('INFO', 'Begin ha relation joined/changed hook')
-    pcmk.wait_for_pcmk()
+    if utils.relation_get("corosync_bindiface") is None:
+        return
+    elif utils.relation_get("corosync_mcastport") is None:
+        return
+    else:
+        emit_corosync_conf()
+        utils.restart("corosync")
+        time.sleep(2)
+        utils.restart("pacemaker")

-    cmd = "crm configure property stonith-enabled=false"
-    pcmk.commit(cmd)
-    cmd = "crm configure property no-quorum-policy=ignore"
-    pcmk.commit(cmd)
-    cmd = 'crm configure rsc_defaults $id="rsc-options" resource-stickiness="100"'
-    pcmk.commit(cmd)
+    # Check that there are enough nodes in order to perform the
+    # configuration of the HA cluster
+    if len(get_cluster_nodes()) < 2:
+        return
+    else:
+        utils.juju_log('INFO', 'hanode-relation: Waiting for PCMK to start')
+        pcmk.wait_for_pcmk()

     # Obtain relation information
     import ast
@@ -120,6 +139,7 @@ def ha_relation():
     init_services = {} if utils.relation_get("init_services") is None else ast.literal_eval(utils.relation_get("init_services"))

     # Configuring the Resource
+    utils.juju_log('INFO', 'ha-relation: Configuring Resources')
     for res_name,res_type in resources.iteritems():
         # disable the service we are going to put in HA
         if res_type.split(':')[0] == "lsb":
@@ -131,7 +151,8 @@ def ha_relation():
             if utils.running(init_services[res_name]):
                 utils.stop(init_services[res_name])

         # Put the services in HA, if not already done so
-        if not pcmk.is_resource_present(res_name):
+        #if not pcmk.is_resource_present(res_name):
+        if not pcmk.crm_opt_exists(res_name):
             if resource_params[res_name] is None:
                 cmd = 'crm -F configure primitive %s %s' % (res_name, res_type)
             else:
@@ -140,28 +161,45 @@ def ha_relation():
                 utils.juju_log('INFO', '%s' % cmd)

     # Configuring groups
+    utils.juju_log('INFO', 'ha-relation: Configuring Groups')
     for grp_name, grp_params in groups.iteritems():
-        cmd = 'crm -F configure group %s %s' % (grp_name, grp_params)
-        pcmk.commit(cmd)
-        utils.juju_log('INFO', '%s' % cmd)
+        if not pcmk.crm_opt_exists(grp_name):
+            cmd = 'crm -F configure group %s %s' % (grp_name, grp_params)
+            pcmk.commit(cmd)
+            utils.juju_log('INFO', '%s' % cmd)

     # Configuring ordering
+    utils.juju_log('INFO', 'ha-relation: Configuring Orders')
     for ord_name, ord_params in orders.iteritems():
-        cmd = 'crm -F configure order %s %s' % (ord_name, ord_params)
-        pcmk.commit(cmd)
-        utils.juju_log('INFO', '%s' % cmd)
+        if not pcmk.crm_opt_exists(ord_name):
+            cmd = 'crm -F configure order %s %s' % (ord_name, ord_params)
+            pcmk.commit(cmd)
+            utils.juju_log('INFO', '%s' % cmd)

     # Configuring colocations
+    utils.juju_log('INFO', 'ha-relation: Configuring Colocations')
     for col_name, col_params in colocations.iteritems():
-        cmd = 'crm -F configure colocation %s %s' % (col_name, col_params)
-        pcmk.commit(cmd)
-        utils.juju_log('INFO', '%s' % cmd)
+        if not pcmk.crm_opt_exists(col_name):
+            cmd = 'crm -F configure colocation %s %s' % (col_name, col_params)
+            pcmk.commit(cmd)
+            utils.juju_log('INFO', '%s' % cmd)

     # Configuring clones
+    utils.juju_log('INFO', 'ha-relation: Configuring Clones')
     for cln_name, cln_params in clones.iteritems():
-        cmd = 'crm -F configure clone %s %s' % (cln_name, cln_params)
-        pcmk.commit(cmd)
-        utils.juju_log('INFO', '%s' % cmd)
+        if not pcmk.crm_opt_exists(cln_name):
+            cmd = 'crm -F configure clone %s %s' % (cln_name, cln_params)
+            pcmk.commit(cmd)
+            utils.juju_log('INFO', '%s' % cmd)

+    for res_name,res_type in resources.iteritems():
+        # TODO: This should first check that the resource is running
+        if len(init_services) != 0 and res_name in init_services:
+            # If the resource is in HA already, and it is a service, restart
+            # the pcmk resource as the config file might have been changed by
+            # the principal charm
+            cmd = 'crm resource restart %s' % res_name
+            pcmk.commit(cmd)

     utils.juju_log('INFO', 'End ha relation joined/changed hook')
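
The resources, resource_params, groups, orders, colocations, clones and init_services values read above arrive from the principal charm as string-encoded Python dicts and are recovered with ast.literal_eval(). A hypothetical example of the shape ha_relation() expects; the resource names and parameters are made up:

    import ast

    relation_settings = {
        'resources': "{'res_mysql_vip': 'ocf:heartbeat:IPaddr2', 'res_mysql': 'lsb:mysql'}",
        'resource_params': "{'res_mysql_vip': 'params ip=192.168.1.100', 'res_mysql': None}",
        'init_services': "{'res_mysql': 'mysql'}",
    }

    # Same decoding step the hook performs on the relation data.
    resources = ast.literal_eval(relation_settings['resources'])
    init_services = ast.literal_eval(relation_settings['init_services'])
    assert resources['res_mysql'].split(':')[0] == 'lsb'
    assert init_services['res_mysql'] == 'mysql'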
@@ -171,19 +209,50 @@ def ha_relation_departed():
     # If this happens, and a new relation is created in the same machine
     # (which already has node), then check whether it is standby and put it
     # in online mode. This should be done in ha_relation_joined.
-    cmd = "crm -F node standby %s" % utils.get_unit_hostname()
-    pcmk.commit(cmd)
+    pcmk.standby(utils.get_unit_hostname())

+def get_cluster_nodes():
+    hosts = []
+    hosts.append('{}:6789'.format(utils.get_host_ip()))
+
+    for relid in utils.relation_ids('hanode'):
+        for unit in utils.relation_list(relid):
+            hosts.append(
+                '{}:6789'.format(utils.get_host_ip(
+                                     utils.relation_get('private-address',
+                                                        unit, relid)))
+                )
+
+    hosts.sort()
+    return hosts

+def hanode_relation():
+    utils.juju_log('INFO', 'Begin hanode peer relation hook')
+    if len(get_cluster_nodes()) >= 2:
+        utils.juju_log('INFO', 'hanode-relation: Waiting for PCMK to start')
+        pcmk.wait_for_pcmk()
+        utils.juju_log('INFO', 'hanode-relation: Doing global configuration')
+        cmd = "crm configure property stonith-enabled=false"
+        pcmk.commit(cmd)
+        cmd = "crm configure property no-quorum-policy=ignore"
+        pcmk.commit(cmd)
+        cmd = 'crm configure rsc_defaults $id="rsc-options" resource-stickiness="100"'
+        pcmk.commit(cmd)

 utils.do_hooks({
+        'config-changed': config_changed,
         'install': install,
-        'config-changed': config_changed,
         'start': start,
         'stop': stop,
         'upgrade-charm': upgrade_charm,
         'ha-relation-joined': ha_relation,
         'ha-relation-changed': ha_relation,
         'ha-relation-departed': ha_relation_departed,
+        'hanode-relation-joined': hanode_relation,
         #'hanode-relation-departed': hanode_relation_departed, # TODO: should probably remove nodes from the cluster
         })

hooks/pcmk.py

@@ -20,12 +20,33 @@ def commit(cmd):
     subprocess.call(cmd.split())

+#def wait_for_cluster():
+#    while (not is_running()):
+#        time.sleep(3)
+
 def is_resource_present(resource):
     (status, output) = commands.getstatusoutput("crm resource status %s" % resource)
     if status != 0:
         return False
     return True

+def standby(node=None):
+    if node is None:
+        cmd = "crm -F node standby"
+    else:
+        cmd = "crm -F node standby %s" % node
+    commit(cmd)
+
+def online(node=None):
+    if node is None:
+        cmd = "crm -F node online"
+    else:
+        cmd = "crm -F node online %s" % node
+    commit(cmd)
+
+def crm_opt_exists(opt_name):
+    (status, output) = commands.getstatusoutput("crm configure show")
+    show_re = re.compile(opt_name)
+    opt = show_re.search(output)
+    if opt:
+        return True
+    return False
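
crm_opt_exists() simply greps the output of `crm configure show` for the given name, so any primitive, group, order, colocation or clone whose definition mentions that string counts as already present; that is what makes re-runs of ha_relation() above idempotent. An illustrative use; the resource name and parameters are invented:

    import pcmk

    # Only create the primitive if nothing in the CIB already mentions it.
    if not pcmk.crm_opt_exists('res_example_vip'):
        pcmk.commit('crm -F configure primitive res_example_vip '
                    'ocf:heartbeat:IPaddr2 params ip=192.168.1.100')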

hooks/utils.py

@@ -13,6 +13,8 @@ import re
 import subprocess
 import socket
 import sys
+import fcntl
+import struct

 def do_hooks(hooks):
@@ -43,6 +45,12 @@ except ImportError:
     install('python-jinja2')
     import jinja2

+try:
+    from netaddr import *
+except:
+    install('python-netaddr')
+    from netaddr import *
+
 def render_template(template_name, context, template_dir=TEMPLATES_DIR):
     templates = jinja2.Environment(
@@ -223,3 +231,35 @@ def disable_lsb_services(*services):
 def enable_lsb_services(*services):
     for service in services:
         subprocess.call(['update-rc.d','-f',service,'defaults'])

+def get_iface_ipaddr(iface):
+    s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
+    return socket.inet_ntoa(fcntl.ioctl(
+        s.fileno(),
+        0x8919,  # SIOCGIFADDR
+        struct.pack('256s', iface[:15])
+        )[20:24])
+
+def get_iface_netmask(iface):
+    s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
+    return socket.inet_ntoa(fcntl.ioctl(
+        s.fileno(),
+        0x891b,  # SIOCGIFNETMASK
+        struct.pack('256s', iface[:15])
+        )[20:24])
+
+def get_netmask_cidr(netmask):
+    netmask = netmask.split('.')
+    binary_str = ''
+    for octet in netmask:
+        binary_str += bin(int(octet))[2:].zfill(8)
+    return str(len(binary_str.rstrip('0')))
+
+def get_network_address(iface):
+    network = "%s/%s" % (get_iface_ipaddr(iface), get_netmask_cidr(get_iface_netmask(iface)))
+    ip = IPNetwork(network)
+    return str(ip.network)
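
A worked example of the address math above, with no ioctl calls involved; the 192.168.1.0/24 figures are made up. (As an aside, on Linux 0x8919 is strictly SIOCGIFBRDADDR, the broadcast address, rather than SIOCGIFADDR, but masking either address yields the same network for corosync_bindnetaddr.)

    from netaddr import IPNetwork

    def netmask_to_cidr(netmask):
        # Same bit-counting approach as get_netmask_cidr() above.
        bits = ''.join(bin(int(octet))[2:].zfill(8) for octet in netmask.split('.'))
        return str(len(bits.rstrip('0')))

    cidr = netmask_to_cidr('255.255.255.0')             # -> '24'
    print(IPNetwork('192.168.1.42/' + cidr).network)    # -> 192.168.1.0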

templates/corosync

@@ -1,3 +1,3 @@
 # Configuration file created by the ha charm
 # start corosync at boot [yes|no]
-START=yes
+START={{ corosync_enabled }}