1042 lines
33 KiB
Python
1042 lines
33 KiB
Python
#!/usr/bin/python
|
|
#
|
|
# Copyright 2016 Canonical Ltd
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
|
|
import ast
|
|
import pcmk
|
|
import maas
|
|
import json
|
|
import os
|
|
import re
|
|
import subprocess
|
|
import socket
|
|
import fcntl
|
|
import struct
|
|
import time
|
|
import xml.etree.ElementTree as ET
|
|
|
|
from base64 import b64decode
|
|
|
|
from charmhelpers.core.hookenv import (
|
|
local_unit,
|
|
log,
|
|
DEBUG,
|
|
ERROR,
|
|
INFO,
|
|
WARNING,
|
|
relation_get,
|
|
related_units,
|
|
relation_ids,
|
|
config,
|
|
unit_get,
|
|
status_set,
|
|
)
|
|
from charmhelpers.contrib.openstack.utils import (
|
|
get_host_ip,
|
|
set_unit_paused,
|
|
clear_unit_paused,
|
|
is_unit_paused_set,
|
|
is_unit_upgrading_set,
|
|
)
|
|
from charmhelpers.contrib.openstack.ha.utils import (
|
|
assert_charm_supports_dns_ha
|
|
)
|
|
from charmhelpers.core.host import (
|
|
mkdir,
|
|
rsync,
|
|
service_start,
|
|
service_stop,
|
|
service_running,
|
|
write_file,
|
|
file_hash,
|
|
lsb_release,
|
|
init_is_systemd,
|
|
CompareHostReleases,
|
|
)
|
|
from charmhelpers.fetch import (
|
|
apt_install,
|
|
add_source,
|
|
apt_update,
|
|
)
|
|
from charmhelpers.contrib.hahelpers.cluster import (
|
|
peer_ips,
|
|
)
|
|
from charmhelpers.contrib.network import ip as utils
|
|
|
|
import netifaces
|
|
from netaddr import IPNetwork
|
|
import jinja2
|
|
|
|
|
|
TEMPLATES_DIR = 'templates'
|
|
COROSYNC_CONF = '/etc/corosync/corosync.conf'
|
|
COROSYNC_DEFAULT = '/etc/default/corosync'
|
|
COROSYNC_AUTHKEY = '/etc/corosync/authkey'
|
|
COROSYNC_HACLUSTER_ACL_DIR = '/etc/corosync/uidgid.d'
|
|
COROSYNC_HACLUSTER_ACL = COROSYNC_HACLUSTER_ACL_DIR + '/hacluster'
|
|
COROSYNC_CONF_FILES = [
|
|
COROSYNC_DEFAULT,
|
|
COROSYNC_AUTHKEY,
|
|
COROSYNC_CONF,
|
|
COROSYNC_HACLUSTER_ACL,
|
|
]
|
|
SUPPORTED_TRANSPORTS = ['udp', 'udpu', 'multicast', 'unicast']
|
|
|
|
PCMKR_AUTHKEY = '/etc/pacemaker/authkey'
|
|
PCMKR_MAX_RETRIES = 3
|
|
PCMKR_SLEEP_SECS = 5
|
|
|
|
SYSTEMD_OVERRIDES_DIR = '/etc/systemd/system/{}.service.d'
|
|
SYSTEMD_OVERRIDES_FILE = '{}/overrides.conf'
|
|
|
|
|
|
MAAS_DNS_CONF_DIR = '/etc/maas_dns'
|
|
|
|
|
|
class MAASConfigIncomplete(Exception):
    """Raised when MAAS-backed DNS HA is requested but the maas_url /
    maas_credentials charm options are not both set."""
    pass
|
|
|
|
|
|
def disable_upstart_services(*services):
    """Prevent upstart from auto-starting the given services.

    Writes an ``/etc/init/<service>.override`` file containing 'manual'
    for each service name passed.
    """
    for svc in services:
        override_path = "/etc/init/{}.override".format(svc)
        with open(override_path, "wt") as override_file:
            override_file.write("manual")
|
|
|
|
|
|
def enable_upstart_services(*services):
    """Re-enable upstart auto-start for the given services.

    Removes the ``/etc/init/<service>.override`` file written by
    disable_upstart_services(), if present.
    """
    for svc in services:
        override_path = '/etc/init/{}.override'.format(svc)
        if os.path.exists(override_path):
            os.remove(override_path)
|
|
|
|
|
|
def disable_lsb_services(*services):
    """Remove init-script links so the given LSB services do not autostart."""
    for svc in services:
        subprocess.check_call(['update-rc.d', '-f', svc, 'remove'])
|
|
|
|
|
|
def enable_lsb_services(*services):
    """Restore default init-script links so the given LSB services autostart."""
    for svc in services:
        subprocess.check_call(['update-rc.d', '-f', svc, 'defaults'])
|
|
|
|
|
|
def get_iface_ipaddr(iface):
|
|
s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
|
|
return socket.inet_ntoa(fcntl.ioctl(
|
|
s.fileno(),
|
|
0x8919, # SIOCGIFADDR
|
|
struct.pack('256s', iface[:15])
|
|
)[20:24])
|
|
|
|
|
|
def get_iface_netmask(iface):
    """Return the IPv4 netmask of *iface* as a dotted-quad string.

    Queries the kernel directly via the SIOCGIFNETMASK ioctl.

    :param iface: network interface name (str or bytes)
    :raises OSError: if the interface does not exist or has no IPv4 address
    """
    s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
    # struct.pack('256s', ...) requires a bytes argument on Python 3;
    # encode the interface name so this works on both Python 2 and 3.
    if not isinstance(iface, bytes):
        iface = iface.encode('utf-8')
    return socket.inet_ntoa(fcntl.ioctl(
        s.fileno(),
        0x891b,  # SIOCGIFNETMASK
        struct.pack('256s', iface[:15])
    )[20:24])
|
|
|
|
|
|
def get_netmask_cidr(netmask):
    """Convert a dotted-quad netmask to its CIDR prefix length.

    :param netmask: e.g. '255.255.255.0'
    :returns: prefix length as a string, e.g. '24'
    """
    bits = ''.join(bin(int(octet))[2:].zfill(8)
                   for octet in netmask.split('.'))
    # Prefix length == number of leading one-bits (trailing zeros stripped).
    return str(len(bits.rstrip('0')))
|
|
|
|
|
|
def get_network_address(iface):
    """Return the IPv4 network address for the given interface.

    :param iface: network interface name, or a falsy value
    :returns: network address string (e.g. '10.0.0.0') or None when no
        interface was supplied
    """
    if iface:
        iface = str(iface)
        network = "{}/{}".format(get_iface_ipaddr(iface),
                                 get_netmask_cidr(get_iface_netmask(iface)))
        ip = IPNetwork(network)
        return str(ip.network)
    else:
        return None
|
|
|
|
|
|
def get_ipv6_network_address(iface):
    """Return the IPv6 network address for the given interface.

    Looks up the unit's global IPv6 address on *iface* and derives the
    network address from its configured netmask.

    :param iface: network interface name, or a falsy value
    :returns: network address string, or None when no interface was given
    :raises Exception: (after setting blocked status) when the interface is
        invalid or no matching IPv6 network can be found
    """
    # Behave in same way as ipv4 get_network_address() above if iface is None.
    if not iface:
        return None

    try:
        ipv6_addr = utils.get_ipv6_addr(iface=iface)[0]
        all_addrs = netifaces.ifaddresses(iface)

        # Find the interface address entry matching the selected address so
        # we can pair it with its netmask.
        for addr in all_addrs[netifaces.AF_INET6]:
            if ipv6_addr == addr['addr']:
                network = "{}/{}".format(addr['addr'], addr['netmask'])
                return str(IPNetwork(network).network)

    except ValueError:
        msg = "Invalid interface '%s'" % iface
        status_set('blocked', msg)
        raise Exception(msg)

    msg = "No valid network found for interface '%s'" % iface
    status_set('blocked', msg)
    raise Exception(msg)
|
|
|
|
|
|
def get_corosync_id(unit_name):
    """Map a juju unit name (e.g. 'hacluster/3') to a corosync nodeid.

    Corosync reserves nodeid 0, so unit numbers are offset by 1000 to
    guarantee every node gets a non-zero id.
    """
    _, unit_number = unit_name.split('/')
    return 1000 + int(unit_number)
|
|
|
|
|
|
def nulls(data):
    """Return the keys of *data* whose values are falsy, ignoring booleans.

    Booleans are exempt because False is a legitimate configured value,
    not a missing one.
    """
    return [key for key, value in data.items()
            if not isinstance(value, bool) and not value]
|
|
|
|
|
|
def get_corosync_conf():
    """Assemble the template context for corosync.conf.

    Prefers values from this charm's own configuration; when those are
    incomplete, falls back to values published by the principle charm on
    the 'ha' relation.

    :returns: dict of template values, or None if no complete set of
        values could be assembled from either source
    """
    if config('prefer-ipv6'):
        ip_version = 'ipv6'
        bindnetaddr = get_ipv6_network_address
    else:
        ip_version = 'ipv4'
        bindnetaddr = get_network_address

    transport = get_transport()

    # NOTE(jamespage) use local charm configuration over any provided by
    # principle charm
    conf = {
        'ip_version': ip_version,
        'ha_nodes': get_ha_nodes(),
        'transport': transport,
    }

    # NOTE(jamespage): only populate multicast configuration if udp is
    # configured
    if transport == 'udp':
        conf.update({
            'corosync_bindnetaddr': bindnetaddr(config('corosync_bindiface')),
            'corosync_mcastport': config('corosync_mcastport'),
            'corosync_mcastaddr': config('corosync_mcastaddr')
        })

    if config('prefer-ipv6'):
        conf['nodeid'] = get_corosync_id(local_unit())

    if config('netmtu'):
        conf['netmtu'] = config('netmtu')

    if config('debug'):
        conf['debug'] = config('debug')

    # nulls() flags any unset (non-boolean) value; an empty result means
    # local config alone is sufficient.
    if not nulls(conf):
        log("Found sufficient values in local config to populate "
            "corosync.conf", level=DEBUG)
        return conf

    # Local config incomplete - try values from the principle charm.
    conf = {}
    for relid in relation_ids('ha'):
        for unit in related_units(relid):
            conf = {
                'ip_version': ip_version,
                'ha_nodes': get_ha_nodes(),
                'transport': transport,
            }

            # NOTE(jamespage): only populate multicast configuration if udpu is
            # configured
            if transport == 'udp':
                bindiface = relation_get('corosync_bindiface',
                                         unit, relid)
                conf.update({
                    'corosync_bindnetaddr': bindnetaddr(bindiface),
                    'corosync_mcastport': relation_get('corosync_mcastport',
                                                       unit, relid),
                    'corosync_mcastaddr': config('corosync_mcastaddr'),
                })

            if config('prefer-ipv6'):
                conf['nodeid'] = get_corosync_id(local_unit())

            if config('netmtu'):
                conf['netmtu'] = config('netmtu')

            if config('debug'):
                conf['debug'] = config('debug')

            # Values up to this point must be non-null
            if nulls(conf):
                continue

            return conf

    # No relation/unit produced a complete config; report what was missing
    # from the last candidate examined.
    missing = [k for k, v in conf.items() if v is None]
    log('Missing required configuration: %s' % missing)
    return None
|
|
|
|
|
|
def emit_systemd_overrides_file():
    """Generate the systemd overrides file

    With Start and Stop timeout values
    Note: (David Ames) Bug#1654403 Work around
    May be removed if bug is resolved
    If timeout value is set to -1 pass infinity
    """
    # No-op on non-systemd (upstart/sysv) systems.
    if not init_is_systemd():
        return

    stop_timeout = int(config('service_stop_timeout'))
    if stop_timeout < 0:
        # systemd spells "no timeout" as the literal string 'infinity'.
        stop_timeout = 'infinity'
    start_timeout = int(config('service_start_timeout'))
    if start_timeout < 0:
        start_timeout = 'infinity'

    systemd_overrides_context = {'service_stop_timeout': stop_timeout,
                                 'service_start_timeout': start_timeout,
                                 }

    # Write a drop-in override for each clustering daemon.
    for service in ['corosync', 'pacemaker']:
        overrides_dir = SYSTEMD_OVERRIDES_DIR.format(service)
        overrides_file = SYSTEMD_OVERRIDES_FILE.format(overrides_dir)
        if not os.path.isdir(overrides_dir):
            os.mkdir(overrides_dir)

        write_file(path=overrides_file,
                   content=render_template('systemd-overrides.conf',
                                           systemd_overrides_context))

    # Update systemd with the new information
    subprocess.check_call(['systemctl', 'daemon-reload'])
|
|
|
|
|
|
def emit_corosync_conf():
    """Render and write corosync.conf when enough config is available.

    :returns: True if the file was written, False when no complete
        template context could be assembled
    """
    context = get_corosync_conf()
    if not context:
        return False
    write_file(path=COROSYNC_CONF,
               content=render_template('corosync.conf', context))
    return True
|
|
|
|
|
|
def get_pcmkr_key():
    """Return the pacemaker auth key, falling back to the corosync key."""
    key = config('pacemaker_key')
    if not key:
        key = config('corosync_key')
    return key
|
|
|
|
|
|
def emit_base_conf():
    """Write the base corosync/pacemaker configuration files.

    Writes /etc/default/corosync, the hacluster ACL file and, when a key
    is configured, the corosync and pacemaker authkey files.

    :returns: True when the authkeys were written, False when no
        corosync_key is configured
    """
    if not os.path.isdir(COROSYNC_HACLUSTER_ACL_DIR):
        os.mkdir(COROSYNC_HACLUSTER_ACL_DIR)
    corosync_default_context = {'corosync_enabled': 'yes'}
    write_file(path=COROSYNC_DEFAULT,
               content=render_template('corosync',
                                       corosync_default_context))

    write_file(path=COROSYNC_HACLUSTER_ACL,
               content=render_template('hacluster.acl', {}))

    # Keys are delivered base64-encoded through charm config.
    corosync_key = config('corosync_key')
    if corosync_key:
        write_file(path=COROSYNC_AUTHKEY,
                   content=b64decode(corosync_key),
                   perms=0o400)
        pcmkr_key = get_pcmkr_key()
        # Group-readable so the haclient group (crm tools) can use it.
        write_file(path=PCMKR_AUTHKEY,
                   owner='root',
                   group='haclient',
                   content=b64decode(pcmkr_key),
                   perms=0o440)
        return True

    return False
|
|
|
|
|
|
def render_template(template_name, context, template_dir=TEMPLATES_DIR):
    """Render a jinja2 template from *template_dir* with *context*.

    :param template_name: filename of the template to load
    :param context: dict of values passed to the template
    :param template_dir: directory searched for templates
    :returns: rendered template as a string
    """
    env = jinja2.Environment(
        loader=jinja2.FileSystemLoader(template_dir)
    )
    return env.get_template(template_name).render(context)
|
|
|
|
|
|
def assert_charm_supports_ipv6():
    """Check whether we are able to support charms ipv6.

    :raises Exception: (after setting blocked status) on Ubuntu releases
        older than trusty, where IPv6 is unsupported
    """
    _release = lsb_release()['DISTRIB_CODENAME'].lower()
    if CompareHostReleases(_release) < "trusty":
        msg = "IPv6 is not supported in the charms for Ubuntu " \
              "versions less than Trusty 14.04"
        status_set('blocked', msg)
        raise Exception(msg)
|
|
|
|
|
|
def get_transport():
    """Return the corosync transport to use: 'udp' or 'udpu'.

    Deprecated option values 'multicast' and 'unicast' are mapped to
    their modern equivalents.

    :raises ValueError: (after setting blocked status) for any other
        corosync_transport value
    """
    transport = config('corosync_transport')
    _deprecated_transport_values = {"multicast": "udp", "unicast": "udpu"}
    val = _deprecated_transport_values.get(transport, transport)
    if val in ('udp', 'udpu'):
        return val

    msg = ("Unsupported corosync_transport type '%s' - supported "
           "types are: %s" % (transport, ', '.join(SUPPORTED_TRANSPORTS)))
    status_set('blocked', msg)
    raise ValueError(msg)
|
|
|
|
|
|
def get_ipv6_addr():
    """Return this unit's IPv6 address.

    Excludes any IPv6 addresses configured as ocf:heartbeat:IPv6addr
    resources on the 'ha' relation, since those are VIPs managed by
    corosync rather than host addresses.

    :returns: first non-excluded IPv6 address (str)
    """
    excludes = []
    for rid in relation_ids('ha'):
        for unit in related_units(rid):
            resources = parse_data(rid, unit, 'resources')
            for res in resources.values():
                if 'ocf:heartbeat:IPv6addr' in res:
                    res_params = parse_data(rid, unit, 'resource_params')
                    res_p = res_params.get(res)
                    if res_p:
                        # BUG FIX: iterate items(), not values() - the
                        # original unpacked (k, v) pairs from .values(),
                        # which fails for dict values.
                        for k, v in res_p.items():
                            if utils.is_ipv6(v):
                                log("Excluding '%s' from address list" % v,
                                    level=DEBUG)
                                excludes.append(v)

    return utils.get_ipv6_addr(exc_list=excludes)[0]
|
|
|
|
|
|
def get_ha_nodes():
    """Build the corosync nodeid -> address map for all cluster members.

    Includes every peer on the 'hanode' relation plus this unit itself.

    :returns: dict mapping corosync nodeid (int) to address (str)
    """
    ha_units = peer_ips(peer_relation='hanode')
    ha_nodes = {}
    for unit in ha_units:
        corosync_id = get_corosync_id(unit)
        addr = ha_units[unit]
        if config('prefer-ipv6'):
            if not utils.is_ipv6(addr):
                # Not an error since cluster may still be forming/updating
                log("Expected an ipv6 address but got %s" % (addr),
                    level=WARNING)

            ha_nodes[corosync_id] = addr
        else:
            ha_nodes[corosync_id] = get_host_ip(addr)

    # Add this unit's own entry.
    corosync_id = get_corosync_id(local_unit())
    if config('prefer-ipv6'):
        addr = get_ipv6_addr()
    else:
        addr = get_host_ip(unit_get('private-address'))

    ha_nodes[corosync_id] = addr

    return ha_nodes
|
|
|
|
|
|
def get_node_flags(flag):
    """Nodes which have advertised the given flag.

    :param flag: Flag to check peers relation data for.
    :type flag: str
    :returns: List of IPs of nodes that are ready to join the cluster
    :rtype: List
    """
    hosts = []
    # This unit always counts itself.
    if config('prefer-ipv6'):
        hosts.append(get_ipv6_addr())
    else:
        hosts.append(unit_get('private-address'))

    # Add every peer that has set the flag in its relation data.
    for relid in relation_ids('hanode'):
        for unit in related_units(relid):
            if relation_get(flag, rid=relid, unit=unit):
                hosts.append(relation_get('private-address',
                                          rid=relid,
                                          unit=unit))

    # Sorted so callers get a deterministic ordering.
    hosts.sort()
    return hosts
|
|
|
|
|
|
def get_cluster_nodes():
    """Nodes which have advertised that they are ready to join the cluster.

    :returns: List of IPs of nodes that are ready to join the cluster
    :rtype: List
    """
    return get_node_flags('ready')
|
|
|
|
|
|
def get_member_ready_nodes():
    """List of nodes which have advertised that they have joined the cluster.

    :returns: List of IPs of nodes that have joined the cluster.
    :rtype: List
    """
    return get_node_flags('member_ready')
|
|
|
|
|
|
def parse_data(relid, unit, key):
    """Fetch and decode relation data published under *key*.

    Data may be published JSON-encoded under 'json_<key>' (preferred) or
    as a python literal under the bare key (legacy); whichever is present
    is decoded accordingly.

    :returns: decoded data structure, or {} when nothing is set
    """
    raw = relation_get('json_{}'.format(key), unit, relid)
    if not raw:
        raw = relation_get(key, unit, relid)
    if not raw:
        return {}
    try:
        return json.loads(raw)
    except (TypeError, ValueError):
        # Legacy payloads are python literals rather than JSON.
        return ast.literal_eval(raw)
|
|
|
|
|
|
def configure_stonith():
    """Enable or disable STONITH according to the stonith_enabled option.

    When enabled, creates a MAAS-backed stonith primitive (and any
    constraint) for every node in the pacemaker cluster.

    :raises Exception: (after setting blocked status) when MAAS config is
        missing, the MAAS inventory cannot be fetched, or a primitive
        cannot be determined for a node
    """
    if config('stonith_enabled') not in ['true', 'True', True]:
        log('Disabling STONITH', level=INFO)
        cmd = "crm configure property stonith-enabled=false"
        pcmk.commit(cmd)
    else:
        log('Enabling STONITH for all nodes in cluster.', level=INFO)
        # configure stontih resources for all nodes in cluster.
        # note: this is totally provider dependent and requires
        # access to the MAAS API endpoint, using endpoint and credentials
        # set in config.
        url = config('maas_url')
        creds = config('maas_credentials')
        if None in [url, creds]:
            msg = 'maas_url and maas_credentials must be set ' \
                  'in config to enable STONITH.'
            status_set('blocked', msg)
            raise Exception(msg)

        nodes = maas.MAASHelper(url, creds).list_nodes()
        if not nodes:
            msg = 'Could not obtain node inventory from ' \
                  'MAAS @ %s.' % url
            status_set('blocked', msg)
            raise Exception(msg)

        cluster_nodes = pcmk.list_nodes()
        for node in cluster_nodes:
            rsc, constraint = pcmk.maas_stonith_primitive(nodes, node)
            if not rsc:
                msg = 'Failed to determine STONITH primitive for ' \
                      'node %s' % node
                status_set('blocked', msg)
                raise Exception(msg)

            # rsc looks like 'primitive <name> ...'; extract the name.
            rsc_name = str(rsc).split(' ')[1]
            if not pcmk.is_resource_present(rsc_name):
                log('Creating new STONITH primitive %s.' % rsc_name,
                    level=DEBUG)
                cmd = 'crm -F configure %s' % rsc
                pcmk.commit(cmd)
                if constraint:
                    cmd = 'crm -F configure %s' % constraint
                    pcmk.commit(cmd)
            else:
                log('STONITH primitive already exists for node.', level=DEBUG)

        # Only flip the cluster-wide switch once all primitives exist.
        pcmk.commit("crm configure property stonith-enabled=true")
|
|
|
|
|
|
def configure_monitor_host():
    """Configure extra monitor host for better network failure detection.

    Creates (or reconfigures) a cloned ocf:pacemaker:ping resource that
    pings the configured monitor_host; removes it when the option is
    unset.
    """
    log('Checking monitor host configuration', level=DEBUG)
    monitor_host = config('monitor_host')
    if monitor_host:
        if not pcmk.crm_opt_exists('ping'):
            log('Implementing monitor host configuration (host: %s)' %
                monitor_host, level=DEBUG)
            monitor_interval = config('monitor_interval')
            cmd = ('crm -w -F configure primitive ping '
                   'ocf:pacemaker:ping params host_list="%s" '
                   'multiplier="100" op monitor interval="%s" ' %
                   (monitor_host, monitor_interval))
            pcmk.commit(cmd)
            cmd = ('crm -w -F configure clone cl_ping ping '
                   'meta interleave="true"')
            pcmk.commit(cmd)
        else:
            log('Reconfiguring monitor host configuration (host: %s)' %
                monitor_host, level=DEBUG)
            cmd = ('crm -w -F resource param ping set host_list="%s"' %
                   monitor_host)
            # BUG FIX: the reconfigure command was built but never
            # committed, so host changes were silently dropped.
            pcmk.commit(cmd)
    else:
        if pcmk.crm_opt_exists('ping'):
            log('Disabling monitor host configuration', level=DEBUG)
            pcmk.commit('crm -w -F resource stop ping')
            pcmk.commit('crm -w -F configure delete ping')
|
|
|
|
|
|
def configure_cluster_global():
    """Configure global cluster options"""
    log('Applying global cluster configuration', level=DEBUG)
    # NOTE(lathiat) quorum in a two-node scenario is handled by
    # corosync two_node=1. In this case quorum is required for
    # initial cluster startup but not if a node was previously in
    # contact with the full cluster.
    log('Configuring no-quorum-policy to stop', level=DEBUG)
    cmd = "crm configure property no-quorum-policy=stop"

    pcmk.commit(cmd)
    # Keep resources where they are unless a better placement scores
    # over the stickiness value.
    cmd = ('crm configure rsc_defaults $id="rsc-options" '
           'resource-stickiness="100"')
    pcmk.commit(cmd)

    log('Configuring cluster-recheck-interval to 60 seconds', level=DEBUG)
    cmd = "crm configure property cluster-recheck-interval=60"
    pcmk.commit(cmd)
|
|
|
|
|
|
def configure_maas_stonith_resource(stonith_hostname):
    """Create stonith resource for the given hostname.

    :param stonith_hostname: The hostname that the stonith management system
                             refers to the remote node as.
    :type stonith_hostname: str
    :raises ValueError: when maas_url/maas_credentials are not configured
    """
    log('Checking for existing stonith resource', level=DEBUG)
    # Resource name derives from the short hostname, e.g. st-node1.
    stonith_res_name = 'st-{}'.format(stonith_hostname.split('.')[0])
    if not pcmk.is_resource_present(stonith_res_name):
        ctxt = {
            'url': config('maas_url'),
            'apikey': config('maas_credentials'),
            'hostnames': stonith_hostname,
            'stonith_resource_name': stonith_res_name}
        if all(ctxt.values()):
            cmd = (
                "crm configure primitive {stonith_resource_name} "
                "stonith:external/maas "
                "params url='{url}' apikey='{apikey}' hostnames={hostnames} "
                "op monitor interval=25 start-delay=25 "
                "timeout=25").format(**ctxt)
            pcmk.commit(cmd, failure_is_fatal=True)
        else:
            raise ValueError("Missing configuration: {}".format(ctxt))
    # Enable stonith cluster-wide whether the primitive was just created
    # or already existed.
    pcmk.commit(
        "crm configure property stonith-enabled=true",
        failure_is_fatal=True)
|
|
|
|
|
|
def get_ip_addr_from_resource_params(params):
    """Return the IP address embedded in a resource params string.

    :param params: resource parameter string containing ip_address="..."
    :return: the IP address in the params or None if not found
    """
    match = re.search(r'.* ip_address="([a-fA-F\d\:\.]+)".*', params)
    if match:
        return match.group(1)
    return None
|
|
|
|
|
|
def restart_corosync_on_change():
    """Simple decorator to restart corosync if any of its config changes"""
    def wrap(f):
        def wrapped_f(*args, **kwargs):
            # Snapshot config file hashes before running the wrapped call.
            checksums = {}
            for path in COROSYNC_CONF_FILES:
                checksums[path] = file_hash(path)
            return_data = f(*args, **kwargs)
            # NOTE: this assumes that this call is always done around
            # configure_corosync, which returns true if configuration
            # files where actually generated
            if return_data:
                for path in COROSYNC_CONF_FILES:
                    if checksums[path] != file_hash(path):
                        # One changed file is enough to warrant a restart.
                        validated_restart_corosync()
                        break

            return return_data
        return wrapped_f
    return wrap
|
|
|
|
|
|
def try_pcmk_wait():
    """Wait for pacemaker to come up.

    Logs the outcome; on failure sets blocked status and re-raises with a
    user-facing message.

    :raises pcmk.ServicesNotUp: when pacemaker never becomes ready
    """
    try:
        pcmk.wait_for_pcmk()
    except pcmk.ServicesNotUp:
        msg = "Pacemaker is down. Please manually start it."
        log(msg, ERROR)
        status_set('blocked', msg)
        raise pcmk.ServicesNotUp(msg)
    else:
        log("Pacemaker is ready", DEBUG)
|
|
|
|
|
|
@restart_corosync_on_change()
def configure_corosync():
    """(Re)write corosync/pacemaker configuration files.

    The decorator restarts corosync when any written file changed.

    :returns: True when both base config and corosync.conf were written
    """
    log('Configuring and (maybe) restarting corosync', level=DEBUG)
    # David Ames Bug#1654403 Work around
    # May be removed if bug is resolved
    emit_systemd_overrides_file()
    return emit_base_conf() and emit_corosync_conf()
|
|
|
|
|
|
def services_running():
    """Determine if both Corosync and Pacemaker are running

    Both from the operating system perspective and with a functional test

    @returns boolean
    """
    pacemaker_status = service_running("pacemaker")
    corosync_status = service_running("corosync")
    log("Pacemaker status: {}, Corosync status: {}"
        "".format(pacemaker_status, corosync_status),
        level=DEBUG)

    if pacemaker_status and corosync_status:
        # Both daemons up per the OS; now do a functional pacemaker check.
        return pcmk.wait_for_pcmk()
    # At least one daemon is down from the OS perspective.
    return False
|
|
|
|
|
|
def validated_restart_corosync(retries=10):
    """Restart and validate Corosync and Pacemaker are in fact up and running.

    @param retries: number of attempts to restart the services before giving up
    @raises pcmk.ServicesNotUp if after retries services are still not up
    """
    for restart in range(retries):
        try:
            if restart_corosync():
                log("Corosync and Pacemaker are validated as up and running.",
                    INFO)
                return
            else:
                log("Corosync or Pacemaker not validated as up yet, retrying",
                    WARNING)
        except pcmk.ServicesNotUp:
            log("Pacemaker failed to start, retrying", WARNING)
            continue

    # Exhausted all retries without a validated start.
    msg = ("Corosync and/or Pacemaker failed to restart after {} retries"
           "".format(retries))
    log(msg, ERROR)
    status_set('blocked', msg)
    raise pcmk.ServicesNotUp(msg)
|
|
|
|
|
|
def restart_corosync():
    """Stop pacemaker, restart corosync, then start pacemaker again.

    Skips the restart entirely while the unit is paused.

    :returns: result of services_running() - True when both services are
        up and pacemaker passes its functional check
    """
    if service_running("pacemaker"):
        log("Stopping pacemaker", DEBUG)
        service_stop("pacemaker")

    # A paused unit must not have its services brought back up.
    if not is_unit_paused_set():
        log("Stopping corosync", DEBUG)
        service_stop("corosync")
        log("Starting corosync", DEBUG)
        service_start("corosync")
        log("Starting pacemaker", DEBUG)
        service_start("pacemaker")

    return services_running()
|
|
|
|
|
|
def validate_dns_ha():
    """Validate the DNS HA

    Assert the charm will support DNS HA
    Check MAAS related configuration options are properly set

    :raises MAASConfigIncomplete: if maas_url and maas_credentials are not set
    """

    # Will raise an exception if unable to continue
    assert_charm_supports_dns_ha()

    if not (config('maas_url') and config('maas_credentials')):
        msg = ("DNS HA is requested but the maas_url or maas_credentials "
               "settings are not set")
        raise MAASConfigIncomplete(msg)
    return True
|
|
|
|
|
|
def setup_maas_api():
    """Install MAAS PPA and packages for accessing the MAAS API.
    """
    # Source comes from charm config so deployments can pin a MAAS release.
    add_source(config('maas_source'))
    apt_update(fatal=True)
    apt_install('python3-maas-client', fatal=True)
|
|
|
|
|
|
def setup_ocf_files():
    """Setup OCF resource agent files

    Copies the charm-bundled ceph/maas resource agents and the maas
    stonith plugin into their system locations.
    """

    # TODO (thedac) Eventually we want to package the OCF files.
    # Bundle with the charm until then.
    mkdir('/usr/lib/ocf/resource.d/ceph')
    mkdir('/usr/lib/ocf/resource.d/maas')
    # Xenial corosync is not creating this directory
    mkdir('/etc/corosync/uidgid.d')

    rsync('ocf/ceph/rbd', '/usr/lib/ocf/resource.d/ceph/rbd')
    rsync('ocf/maas/dns', '/usr/lib/ocf/resource.d/maas/dns')
    rsync('ocf/maas/maas_dns.py', '/usr/lib/heartbeat/maas_dns.py')
    rsync('ocf/maas/maasclient/', '/usr/lib/heartbeat/maasclient/')
    rsync(
        'ocf/maas/maas_stonith_plugin.py',
        '/usr/lib/stonith/plugins/external/maas')
|
|
|
|
|
|
def write_maas_dns_address(resource_name, resource_addr):
    """Writes the specified IP address to the resource file for MAAS dns.

    :param resource_name: the name of the resource the address belongs to.
        This is the name of the file that will be written in /etc/maas_dns.
    :param resource_addr: the IP address for the resource. This will be
        written to the resource_name file.
    """
    # mkdir is a no-op when the directory already exists.
    mkdir(MAAS_DNS_CONF_DIR)
    write_file(os.path.join(MAAS_DNS_CONF_DIR, resource_name),
               content=resource_addr)
|
|
|
|
|
|
def needs_maas_dns_migration():
    """Determines if the MAAS DNS ocf resources need migration.

    The legacy resource agent references OCF_RESOURCE_INSTANCE; its
    presence in the installed agent indicates migration is needed.

    :return: True if migration is necessary, False otherwise.
    """
    # grep exits 0 only when the string is found; any other exit status
    # (not found, file missing) means no migration is required.
    returncode = subprocess.call(['grep', 'OCF_RESOURCE_INSTANCE',
                                  '/usr/lib/ocf/resource.d/maas/dns'])
    return returncode == 0
|
|
|
|
|
|
def is_in_standby_mode(node_name):
    """Check if node is in standby mode in pacemaker

    @param node_name: The name of the node to check
    @returns boolean - True if node_name is in standby mode
    """
    # 'crm node status <name>' emits the node's XML attribute list.
    out = (subprocess
           .check_output(['crm', 'node', 'status', node_name])
           .decode('utf-8'))
    root = ET.fromstring(out)

    # Standby is represented as an nvpair name="standby" value="on".
    standby_mode = False
    for nvpair in root.iter('nvpair'):
        if (nvpair.attrib.get('name') == 'standby' and
                nvpair.attrib.get('value') == 'on'):
            standby_mode = True
    return standby_mode
|
|
|
|
|
|
def get_hostname():
    """Return the hostname of this unit

    @returns hostname
    """
    return socket.gethostname()
|
|
|
|
|
|
def enter_standby_mode(node_name, duration='forever'):
    """Put this node into standby mode in pacemaker

    @param node_name: The name of the node to put into standby
    @param duration: standby lifetime passed to crm (default 'forever')
    @returns None
    """
    subprocess.check_call(['crm', 'node', 'standby', node_name, duration])
|
|
|
|
|
|
def leave_standby_mode(node_name):
    """Take this node out of standby mode in pacemaker

    @param node_name: The name of the node to bring back online
    @returns None
    """
    subprocess.check_call(['crm', 'node', 'online', node_name])
|
|
|
|
|
|
def node_has_resources(node_name):
    """Check if this node is running resources

    @param node_name: The name of the node to check
    @returns boolean - True if node_name has resources
    """
    # crm_mon -X dumps cluster status as XML.
    out = subprocess.check_output(['crm_mon', '-X']).decode('utf-8')
    root = ET.fromstring(out)
    has_resources = False
    # A resource element lists the node(s) it is active on as children.
    for resource in root.iter('resource'):
        for child in resource:
            if child.tag == 'node' and child.attrib.get('name') == node_name:
                has_resources = True
    return has_resources
|
|
|
|
|
|
def set_unit_status():
    """Set the workload status for this unit

    Delegates the (status, message) decision to assess_status_helper().

    @returns None
    """
    status_set(*assess_status_helper())
|
|
|
|
|
|
def resume_unit():
    """Resume services on this unit and update the units status

    Takes the node out of pacemaker standby, verifies it left standby,
    then clears the paused flag and refreshes workload status.

    @raises Exception: when the node is still in standby after the attempt
    @returns None
    """
    node_name = get_hostname()
    messages = []
    leave_standby_mode(node_name)
    if is_in_standby_mode(node_name):
        messages.append("Node still in standby mode")
    if messages:
        raise Exception("Couldn't resume: {}".format("; ".join(messages)))
    else:
        clear_unit_paused()
        set_unit_status()
|
|
|
|
|
|
def pause_unit():
    """Pause services on this unit and update the units status

    Puts the node into pacemaker standby and waits for its resources to
    be migrated away before marking the unit paused.

    @raises Exception: when the pause could not complete (unless the unit
        is in the middle of a series upgrade)
    @returns None
    """
    node_name = get_hostname()
    messages = []
    enter_standby_mode(node_name)
    if not is_in_standby_mode(node_name):
        messages.append("Node not in standby mode")

    # some resources may take some time to be migrated out from the node. So 3
    # retries are made with a 5 seconds wait between each one.
    i = 0
    ready = False
    has_resources = False
    while i < PCMKR_MAX_RETRIES and not ready:
        if node_has_resources(node_name):
            has_resources = True
            i += 1
            time.sleep(PCMKR_SLEEP_SECS)
        else:
            ready = True
            has_resources = False

    if has_resources:
        messages.append("Resources still running on unit")
    status, message = assess_status_helper()
    # New status message will indicate the resume failure.
    if status != 'active':
        messages.append(message)
    if messages and not is_unit_upgrading_set():
        raise Exception("Couldn't pause: {}".format("; ".join(messages)))
    else:
        set_unit_paused()
        status_set("maintenance",
                   "Paused. Use 'resume' action to resume normal service.")
|
|
|
|
|
|
def assess_status_helper():
    """Assess status of unit

    Considers, in order: series upgrade in progress, unit paused,
    pacemaker availability, peer unit count, and pacemaker's own
    maintenance-mode property.

    @returns status, message - status is workload status and message is any
                               corresponding messages
    """

    if is_unit_upgrading_set():
        # BUG FIX: corrected user-facing typo 'do-release-upgrde'.
        return ("blocked",
                "Ready for do-release-upgrade. Set complete when finished")
    if is_unit_paused_set():
        return ("maintenance",
                "Paused. Use 'resume' action to resume normal service.")

    node_count = int(config('cluster_count'))
    status = 'active'
    message = 'Unit is ready and clustered'
    try:
        try_pcmk_wait()
    except pcmk.ServicesNotUp:
        message = 'Pacemaker is down'
        status = 'blocked'
    for relid in relation_ids('hanode'):
        # +1 accounts for this unit itself.
        if len(related_units(relid)) + 1 < node_count:
            status = 'blocked'
            message = ("Insufficient peer units for ha cluster "
                       "(require {})".format(node_count))

    # if the status was not changed earlier, we verify the maintenance status
    # (restructured so 'prop' can never be read before assignment)
    if status == 'active':
        try:
            prop = pcmk.get_property('maintenance-mode').strip()
        except pcmk.PropertyNotFound:
            # the property is not the output of 'crm configure show xml', so
            # we use the default value for this property. For crmsh>=2.2.0
            # the default value is automatically provided by show-property
            # or get-property.
            prop = 'false'

        if prop == 'true':
            # maintenance mode enabled in pacemaker
            status = 'maintenance'
            message = 'Pacemaker in maintenance mode'

    return status, message
|
|
|
|
|
|
def ocf_file_exists(res_name, resources,
                    RES_ROOT='/usr/lib/ocf/resource.d'):
    """To determine whether the ocf file exists, allow multiple ocf
    files with the same name in different directories

    @param res_name: The name of the ocf resource to check
    @param resources: ocf resources
    @return: boolean - True if the ocf resource exists
    """
    provider = None
    for name, spec in resources.items():
        if name != res_name:
            continue
        parts = spec.split(':')
        if len(parts) > 2:
            # e.g. 'ocf:heartbeat:IPaddr2' -> provider 'heartbeat'
            provider = parts[1]
        agent = res_name.replace('res_', '').replace('_', '-')
        if os.path.isfile(os.path.join(RES_ROOT, provider, agent)):
            return True
    return False
|
|
|
|
|
|
def kill_legacy_ocf_daemon_process(res_name):
    """Kill legacy ocf daemon process

    Scans the process table for a command line matching the resource's
    ocf name and kills the first match.

    @param res_name: The name of the ocf process to kill
    """
    ocf_name = res_name.replace('res_', '').replace('_', '-')
    # BUG FIX: use a raw string - '\s' in a plain literal is an invalid
    # escape sequence on Python 3 (DeprecationWarning, later SyntaxError).
    reg_expr = r'([0-9]+)\s+[^0-9]+{}'.format(ocf_name)
    cmd = ['ps', '-eo', 'pid,cmd']
    ps = subprocess.check_output(cmd).decode('utf-8')
    res = re.search(reg_expr, ps, re.MULTILINE)
    if res:
        pid = res.group(1)
        subprocess.call(['sudo', 'kill', '-9', pid])
|
|
|
|
|
|
def maintenance_mode(enable):
    """Enable/disable pacemaker's maintenance mode

    :param enable: desired state (bool); only issues a crm property
        change when the cluster's current state differs
    """

    log('Setting maintenance-mode to %s' % enable, level=INFO)

    try:
        current_state = pcmk.get_property('maintenance-mode').strip().lower()
    except pcmk.PropertyNotFound:
        # Property unset - pacemaker's default is 'false'.
        current_state = 'false'

    current_state = True if current_state == 'true' else False
    log('Is maintenance-mode currently enabled? %s' % current_state,
        level=DEBUG)
    if current_state != enable:
        pcmk.set_property('maintenance-mode', str(enable).lower())
    else:
        log('Desired value for maintenance-mode is already set', level=DEBUG)
|