Fix DNS resolution problems

There are two distinct DNS resolution problems. One, the nodename
reverse DNS resolution. And two, resolution of peers. This attempts
to fix both problems.

Nodename is only required when rabbitmq runs multiple instances on
the same host. Having this set complicates the DNS requirements.
Removing this setting may simplify the DNS problems this charm has
faced.

For peer resolution force the use of hostname rather than dynamically
attempting to resolve one from an IP. Set /etc/hosts with the local
hostname and with each peer.

Standardize the way an IPv4 or IPv6 address is selected throughout
the charm. Standardize the way a hostname is selected throughout the
charm.

Partial-Bug: 1584902
Change-Id: I105eb2684e61a553a52c5a944e8c562945e2a6eb
This commit is contained in:
David Ames 2016-10-20 15:59:38 -07:00
parent 0c93c06505
commit 0c2f3e6b1b
5 changed files with 100 additions and 115 deletions

View File

@ -21,6 +21,7 @@ import tempfile
import random
import time
import socket
from collections import OrderedDict
from rabbitmq_context import (
RabbitMQSSLContext,
@ -38,6 +39,11 @@ from charmhelpers.contrib.openstack.utils import (
is_unit_paused_set,
)
from charmhelpers.contrib.network.ip import (
get_ipv6_addr,
get_address_in_network,
)
from charmhelpers.core.hookenv import (
relation_ids,
related_units,
@ -49,6 +55,8 @@ from charmhelpers.core.hookenv import (
unit_get,
relation_set,
application_version_set,
config,
network_get_primary_address,
)
from charmhelpers.core.host import (
@ -66,9 +74,9 @@ from charmhelpers.contrib.peerstorage import (
peer_retrieve
)
from charmhelpers.fetch import get_upstream_version
from collections import OrderedDict
PACKAGES = ['rabbitmq-server', 'python-amqplib', 'lockfile-progs']
@ -337,7 +345,7 @@ def wait_app():
pid_file = run_dir + 'pid'
else:
pid_file = '/var/lib/rabbitmq/mnesia/rabbit@' \
+ get_local_nodename() + '.pid'
+ socket.gethostname() + '.pid'
log('Waiting for rabbitmq app to start: {}'.format(pid_file), DEBUG)
try:
rabbitmqctl('wait', pid_file)
@ -402,7 +410,7 @@ def cluster_with():
join_cluster(node)
# NOTE: toggle the cluster relation to ensure that any peers
# already clustered re-assess status correctly
relation_set(clustered=get_local_nodename())
relation_set(clustered=get_unit_hostname())
return True
except subprocess.CalledProcessError as e:
status_set('blocked', 'Failed to cluster with %s. Exception: %s'
@ -674,12 +682,17 @@ def running_nodes():
@cached
def leader_node():
''' Provide the leader node for clustering '''
''' Provide the leader node for clustering
@returns leader node's hostname or None
'''
# Each rabbitmq node should join_cluster with the leader
# to avoid split-brain clusters.
leader_node_ip = peer_retrieve('leader_node_ip')
if leader_node_ip:
return "rabbit@" + get_node_hostname(leader_node_ip)
leader_node_hostname = peer_retrieve('leader_node_hostname')
if leader_node_hostname:
return "rabbit@" + leader_node_hostname
else:
return None
def get_node_hostname(ip_addr):
@ -857,3 +870,45 @@ def _pause_resume_helper(f, configs):
f(assess_status_func(configs),
services=services(),
ports=None)
def get_unit_ip(amqp_relation=False):
"""Return this unit's IP.
Future proof to allow for network spaces or other more complex addresss
selection.
@raises Exception if prefer-ipv6 is configured but IPv6 unsupported.
@returns IPv6 or IPv4 address
"""
fallback = get_host_ip(unit_get('private-address'))
if config('prefer-ipv6'):
assert_charm_supports_ipv6()
return get_ipv6_addr()[0]
elif amqp_relation:
if config('access-network'):
# NOTE(jamespage)
# override private-address settings if access-network is
# configured and an appropriate network interface is
# configured.
return get_address_in_network(config('access-network'),
fallback)
else:
# NOTE(jamespage)
# Try using network spaces if access-network is not
# configured, fallback to private address if not
# supported
try:
return network_get_primary_address('amqp')
except NotImplementedError:
return fallback
else:
return fallback
def get_unit_hostname():
"""Return this unit's hostname.
@returns hostname
"""
return socket.gethostname()

View File

@ -19,7 +19,6 @@ import shutil
import sys
import subprocess
import glob
import socket
try:
import yaml # flake8: noqa
@ -42,14 +41,9 @@ from charmhelpers.contrib.hahelpers.cluster import (
is_elected_leader
)
from charmhelpers.contrib.openstack.utils import (
get_host_ip,
is_unit_paused_set,
)
from charmhelpers.contrib.network.ip import (
get_ipv6_addr
)
import charmhelpers.contrib.storage.linux.ceph as ceph
from charmhelpers.contrib.openstack.utils import save_script_rc
from charmhelpers.contrib.hardening.harden import harden
@ -74,7 +68,6 @@ from charmhelpers.core.hookenv import (
service_name,
local_unit,
config,
unit_get,
is_relation_made,
Hooks,
UnregisteredHookError,
@ -82,7 +75,6 @@ from charmhelpers.core.hookenv import (
charm_dir,
status_set,
unit_private_ip,
network_get_primary_address,
)
from charmhelpers.core.host import (
cmp_pkgrevno,
@ -102,7 +94,6 @@ from charmhelpers.contrib.peerstorage import (
leader_get,
)
from charmhelpers.contrib.network.ip import get_address_in_network
hooks = Hooks()
@ -116,7 +107,7 @@ SCRIPTS_DIR = '/usr/local/bin'
STATS_CRONFILE = '/etc/cron.d/rabbitmq-stats'
STATS_DATAFILE = os.path.join(RABBIT_DIR, 'data',
'{}_queue_stats.dat'
''.format(socket.gethostname()))
''.format(rabbit.get_unit_hostname()))
@hooks.hook('install.real')
@ -126,25 +117,6 @@ def install():
# NOTE(jamespage) install actually happens in config_changed hook
def configure_nodename():
'''Set RABBITMQ_NODENAME to something that's resolvable by my peers'''
nodename = rabbit.get_local_nodename()
log('configuring nodename', level=INFO)
if (nodename and
rabbit.get_node_name() != 'rabbit@%s' % nodename):
log('forcing nodename=%s' % nodename, level=INFO)
# would like to have used the restart_on_change decorator, but
# need to stop it under current nodename prior to updating env
log('Stopping rabbitmq-server.')
service_stop('rabbitmq-server')
rabbit.update_rmq_env_conf(hostname='rabbit@%s' % nodename,
ipv6=config('prefer-ipv6'))
if not is_unit_paused_set():
log('Starting rabbitmq-server.')
service_restart('rabbitmq-server')
rabbit.wait_app()
def configure_amqp(username, vhost, admin=False):
# get and update service password
password = rabbit.get_rabbit_password(username)
@ -165,10 +137,7 @@ def configure_amqp(username, vhost, admin=False):
@hooks.hook('amqp-relation-changed')
def amqp_changed(relation_id=None, remote_unit=None):
if config('prefer-ipv6'):
host_addr = get_ipv6_addr()[0]
else:
host_addr = unit_get('private-address')
host_addr = rabbit.get_unit_ip()
if not is_elected_leader('res_rabbitmq_vip'):
# NOTE(jamespage) clear relation to deal with data being
@ -225,30 +194,9 @@ def amqp_changed(relation_id=None, remote_unit=None):
queues[amqp]['username'],
queues[amqp]['vhost'])
if config('prefer-ipv6'):
relation_settings['private-address'] = host_addr
else:
hostname = None
fallback = unit_get('private-address')
if config('access-network'):
# NOTE(jamespage)
# override private-address settings if access-network is
# configured and an appropriate network interface is
# configured.
hostname = get_address_in_network(config('access-network'),
fallback)
else:
# NOTE(jamespage)
# Try using network spaces if access-network is not
# configured, fallback to private address if not
# supported
try:
hostname = network_get_primary_address('amqp')
except NotImplementedError:
hostname = fallback
relation_settings['hostname'] = \
relation_settings['private-address'] = hostname
relation_settings['hostname'] = \
relation_settings['private-address'] = \
rabbit.get_unit_ip(amqp_relation=True)
ssl_utils.configure_client_ssl(relation_settings)
@ -311,15 +259,10 @@ def is_sufficient_peers():
@hooks.hook('cluster-relation-joined')
def cluster_joined(relation_id=None):
relation_settings = {
'hostname': rabbit.get_local_nodename(),
'hostname': rabbit.get_unit_hostname(),
'private-address': rabbit.get_unit_ip(),
}
if config('prefer-ipv6'):
relation_settings['private-address'] = get_ipv6_addr()[0]
else:
relation_settings['private-address'] = get_host_ip(
unit_get('private-address'))
relation_set(relation_id=relation_id,
relation_settings=relation_settings)
@ -329,8 +272,6 @@ def cluster_joined(relation_id=None):
'rabbitmq cluster config.')
return
configure_nodename()
try:
if not is_leader():
log('Not the leader, deferring cookie propagation to leader')
@ -353,6 +294,7 @@ def cluster_joined(relation_id=None):
cookie = open(rabbit.COOKIE_PATH, 'r').read().strip()
peer_store('cookie', cookie)
peer_store('leader_node_ip', unit_private_ip())
peer_store('leader_node_hostname', rabbit.get_unit_hostname())
@hooks.hook('cluster-relation-changed')
@ -670,8 +612,9 @@ MAN_PLUGIN = 'rabbitmq_management'
@harden()
def config_changed():
if config('prefer-ipv6'):
rabbit.assert_charm_supports_ipv6()
# Update hosts with this unit's information
rabbit.update_hosts_file({rabbit.get_unit_ip():
rabbit.get_unit_hostname()})
# Add archive source if provided
add_source(config('source'), config('key'))
@ -690,8 +633,6 @@ def config_changed():
chown(RABBIT_DIR, rabbit.RABBIT_USER, rabbit.RABBIT_USER)
chmod(RABBIT_DIR, 0o775)
configure_nodename()
if config('management_plugin') is True:
rabbit.enable_plugin(MAN_PLUGIN)
open_port(55672)

View File

@ -380,24 +380,6 @@ class RmqBasicDeployment(OpenStackAmuletDeployment):
u.log.info('OK\n')
def test_300_rmq_config(self):
"""Verify the data in the rabbitmq conf file."""
conf = '/etc/rabbitmq/rabbitmq-env.conf'
sentry_units = self._get_rmq_sentry_units()
for unit in sentry_units:
host_name = unit.file_contents('/etc/hostname').strip()
u.log.debug('Checking rabbitmq config file data on '
'{} ({})...'.format(unit.info['unit_name'],
host_name))
expected = {
'RABBITMQ_NODENAME': 'rabbit@{}'.format(host_name)
}
file_contents = unit.file_contents(conf)
u.validate_sectionless_conf(file_contents, expected)
u.log.info('OK\n')
def test_400_rmq_cluster_running_nodes(self):
"""Verify that cluster status from each rmq juju unit shows
every cluster node as a running member in that cluster."""

View File

@ -180,11 +180,9 @@ class UtilsTests(unittest.TestCase):
'juju-devel3-machine-13')
mock_get_hostname.assert_called_with('192.168.20.50', fqdn=False)
@mock.patch('rabbit_utils.get_node_hostname')
@mock.patch('rabbit_utils.peer_retrieve')
def test_leader_node(self, mock_peer_retrieve, mock_get_node_hostname):
mock_peer_retrieve.return_value = '192.168.20.50'
mock_get_node_hostname.return_value = 'juju-devel3-machine-15'
def test_leader_node(self, mock_peer_retrieve):
mock_peer_retrieve.return_value = 'juju-devel3-machine-15'
self.assertEqual(rabbit_utils.leader_node(),
'rabbit@juju-devel3-machine-15')

View File

@ -38,20 +38,25 @@ class RelationUtil(TestCase):
super(RelationUtil, self).setUp()
@patch('rabbitmq_server_relations.peer_store_and_set')
@patch('rabbitmq_server_relations.get_ipv6_addr')
@patch('rabbitmq_server_relations.config')
@patch('rabbitmq_server_relations.relation_set')
@patch('rabbitmq_server_relations.cmp_pkgrevno')
@patch('rabbitmq_server_relations.is_clustered')
@patch('rabbitmq_server_relations.ssl_utils.configure_client_ssl')
@patch('rabbitmq_server_relations.unit_get')
@patch('rabbitmq_server_relations.rabbit.get_unit_ip')
@patch('rabbitmq_server_relations.relation_get')
@patch('rabbitmq_server_relations.is_elected_leader')
def test_amqp_changed_compare_versions_ha_queues(
self,
is_elected_leader, relation_get, unit_get, configure_client_ssl,
is_clustered, cmp_pkgrevno, relation_set, mock_config,
mock_get_ipv6_addr, mock_peer_store_and_set):
is_elected_leader,
relation_get,
get_unit_ip,
configure_client_ssl,
is_clustered,
cmp_pkgrevno,
relation_set,
mock_config,
mock_peer_store_and_set):
"""
Compare version above and below 3.0.1.
Make sure ha_queues is set correctly on each side.
@ -65,8 +70,7 @@ class RelationUtil(TestCase):
mock_config.side_effect = config
host_addr = "10.1.2.3"
unit_get.return_value = host_addr
mock_get_ipv6_addr.return_value = [host_addr]
get_unit_ip.return_value = host_addr
is_elected_leader.return_value = True
relation_get.return_value = {}
is_clustered.return_value = False
@ -87,20 +91,25 @@ class RelationUtil(TestCase):
relation_id=None)
@patch('rabbitmq_server_relations.peer_store_and_set')
@patch('rabbitmq_server_relations.get_ipv6_addr')
@patch('rabbitmq_server_relations.config')
@patch('rabbitmq_server_relations.relation_set')
@patch('rabbitmq_server_relations.cmp_pkgrevno')
@patch('rabbitmq_server_relations.is_clustered')
@patch('rabbitmq_server_relations.ssl_utils.configure_client_ssl')
@patch('rabbitmq_server_relations.unit_get')
@patch('rabbitmq_server_relations.rabbit.get_unit_ip')
@patch('rabbitmq_server_relations.relation_get')
@patch('rabbitmq_server_relations.is_elected_leader')
def test_amqp_changed_compare_versions_ha_queues_prefer_ipv6(
self,
is_elected_leader, relation_get, unit_get, configure_client_ssl,
is_clustered, cmp_pkgrevno, relation_set, mock_config,
mock_get_ipv6_addr, mock_peer_store_and_set):
is_elected_leader,
relation_get,
get_unit_ip,
configure_client_ssl,
is_clustered,
cmp_pkgrevno,
relation_set,
mock_config,
mock_peer_store_and_set):
"""
Compare version above and below 3.0.1.
Make sure ha_queues is set correctly on each side.
@ -114,9 +123,7 @@ class RelationUtil(TestCase):
mock_config.side_effect = config
ipv6_addr = "2001:db8:1:0:f816:3eff:fed6:c140"
mock_get_ipv6_addr.return_value = [ipv6_addr]
host_addr = "10.1.2.3"
unit_get.return_value = host_addr
get_unit_ip.return_value = ipv6_addr
is_elected_leader.return_value = True
relation_get.return_value = {}
is_clustered.return_value = False
@ -125,13 +132,15 @@ class RelationUtil(TestCase):
rabbitmq_server_relations.amqp_changed(None, None)
mock_peer_store_and_set.assert_called_with(
relation_settings={'private-address': ipv6_addr,
'hostname': ipv6_addr,
'ha_queues': True},
relation_id=None)
cmp_pkgrevno.return_value = 1
rabbitmq_server_relations.amqp_changed(None, None)
mock_peer_store_and_set.assert_called_with(
relation_settings={'private-address': ipv6_addr},
relation_settings={'private-address': ipv6_addr,
'hostname': ipv6_addr},
relation_id=None)
@patch.object(rabbitmq_server_relations, 'is_leader')