diff --git a/doc/source/admin/config-dhcp-ha.rst b/doc/source/admin/config-dhcp-ha.rst index 82780551b79..fcfbc87d40b 100644 --- a/doc/source/admin/config-dhcp-ha.rst +++ b/doc/source/admin/config-dhcp-ha.rst @@ -441,6 +441,38 @@ To test the HA of DHCP agent: #. Start DHCP agent on HostB. The VM gets the wanted IP again. +No HA for metadata service on isolated networks +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +All Neutron backends using the DHCP agent can also provide `metadata service +`_ in isolated +networks (i.e. networks without a router). In this case the DHCP agent manages +the metadata service (see config option `enable_isolated_metadata +`_). + +Note however that the metadata service is only redundant for IPv4, and not +IPv6, even when the DHCP service is configured to be highly available +(config option `dhcp_agents_per_network +`_ +> 1). This is because the DHCP agent will insert a route to the well known +metadata IPv4 address (`169.254.169.254`) via its own IP address, so it will +be reachable as long as the DHCP service is available at that IP address. +This also means that recovery after a failure is tied to the renewal of the +DHCP lease, since that route will only change if the DHCP server for a VM +changes. + +With IPv6, the well known metadata IPv6 address (`fe80::a9fe:a9fe`) is used, +but directly configured in the DHCP agent network namespace. +Due to the enforcement of duplicate address detection (DAD), this address +can only be configured in at most one DHCP network namespaces at any time. +See `RFC 4862 `_ for +details on the DAD process. + +For this reason, even when you have multiple DHCP agents, an arbitrary one +(where the metadata IPv6 address is not in `dadfailed` state) will serve all +metadata requests over IPv6. When that metadata service instance becomes +unreachable there is no failover and the service will become unreachable. + Disabling and removing an agent ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/neutron/agent/linux/dhcp.py b/neutron/agent/linux/dhcp.py index 84d444c659f..4f19d6cc730 100644 --- a/neutron/agent/linux/dhcp.py +++ b/neutron/agent/linux/dhcp.py @@ -41,6 +41,7 @@ from neutron.agent.linux import external_process from neutron.agent.linux import ip_lib from neutron.agent.linux import iptables_manager from neutron.cmd import runtime_checks as checks +from neutron.common import _constants as common_constants from neutron.common import utils as common_utils from neutron.ipam import utils as ipam_utils from neutron.privileged.agent.linux import dhcp as priv_dhcp @@ -1841,7 +1842,7 @@ class DeviceManager(object): if self.conf.force_metadata or self.conf.enable_isolated_metadata: ip_cidrs.append(constants.METADATA_CIDR) if netutils.is_ipv6_enabled(): - ip_cidrs.append(constants.METADATA_V6_CIDR) + ip_cidrs.append(common_constants.METADATA_V6_CIDR) self.driver.init_l3(interface_name, ip_cidrs, namespace=network.namespace) diff --git a/neutron/agent/linux/ip_lib.py b/neutron/agent/linux/ip_lib.py index 4d664381f89..99537290166 100644 --- a/neutron/agent/linux/ip_lib.py +++ b/neutron/agent/linux/ip_lib.py @@ -102,6 +102,10 @@ class AddressNotReady(exceptions.NeutronException): "become ready: %(reason)s") +class DADFailed(AddressNotReady): + pass + + InvalidArgument = privileged.InvalidArgument @@ -592,7 +596,7 @@ class IpAddrCommand(IpDeviceCommandBase): """Wait until an address is no longer marked 'tentative' or 'dadfailed' raises AddressNotReady if times out, address not present on interface - or DAD fails + raises DADFailed if Duplicate Address Detection fails """ def is_address_ready(): try: @@ -604,7 +608,7 @@ class IpAddrCommand(IpDeviceCommandBase): # Since both 'dadfailed' and 'tentative' will be set if DAD fails, # check 'dadfailed' first just to be explicit if addr_info['dadfailed']: - raise AddressNotReady( + raise DADFailed( address=address, reason=_('Duplicate address detected')) if addr_info['tentative']: return False diff --git a/neutron/agent/metadata/driver.py b/neutron/agent/metadata/driver.py index b7e69696a0d..a4a62444e23 100644 --- a/neutron/agent/metadata/driver.py +++ b/neutron/agent/metadata/driver.py @@ -33,6 +33,7 @@ from neutron.agent.l3 import namespaces from neutron.agent.linux import external_process from neutron.agent.linux import ip_lib from neutron.agent.linux import utils as linux_utils +from neutron.common import _constants as common_constants from neutron.common import coordination from neutron.common import metadata as comm_meta from neutron.common import utils as common_utils @@ -241,9 +242,30 @@ class MetadataDriver(object): # HAProxy cannot bind() until IPv6 Duplicate Address Detection # completes. We must wait until the address leaves its 'tentative' # state. - ip_lib.IpAddrCommand( - parent=ip_lib.IPDevice(name=bind_interface, namespace=ns_name) - ).wait_until_address_ready(address=bind_address_v6) + try: + ip_lib.IpAddrCommand( + parent=ip_lib.IPDevice(name=bind_interface, + namespace=ns_name) + ).wait_until_address_ready(address=bind_address_v6) + except ip_lib.DADFailed as exc: + # This failure means that another DHCP agent has already + # configured this metadata address, so all requests will + # be via that single agent. + LOG.info('DAD failed for address %(address)s on interface ' + '%(interface)s in namespace %(namespace)s on network ' + '%(network)s, deleting it. Exception: %(exception)s', + {'address': bind_address_v6, + 'interface': bind_interface, + 'namespace': ns_name, + 'network': network_id, + 'exception': str(exc)}) + try: + ip_lib.delete_ip_address(bind_address_v6, bind_interface, + namespace=ns_name) + except Exception as exc: + # do not re-raise a delete failure, just log + LOG.info('Address deletion failure: %s', str(exc)) + return pm.enable() monitor.register(uuid, METADATA_SERVICE_NAME, pm) cls.monitors[router_id] = pm @@ -338,6 +360,6 @@ def apply_metadata_nat_rules(router, proxy): if netutils.is_ipv6_enabled(): for c, r in proxy.metadata_nat_rules( proxy.metadata_port, - metadata_address=(constants.METADATA_V6_IP + '/128')): + metadata_address=(common_constants.METADATA_V6_CIDR)): router.iptables_manager.ipv6['nat'].add_rule(c, r) router.iptables_manager.apply() diff --git a/neutron/common/_constants.py b/neutron/common/_constants.py index 55fc718c497..03c07eb71a4 100644 --- a/neutron/common/_constants.py +++ b/neutron/common/_constants.py @@ -86,3 +86,6 @@ TRAIT_NETWORK_TUNNEL = 'CUSTOM_NETWORK_TUNNEL_PROVIDER' # The lowest binding index for L3 agents and DHCP agents. LOWEST_AGENT_BINDING_INDEX = 1 + +# Neutron-lib defines this with a /64 but it should be /128 +METADATA_V6_CIDR = constants.METADATA_V6_IP + '/128' diff --git a/neutron/conf/agent/database/agentschedulers_db.py b/neutron/conf/agent/database/agentschedulers_db.py index f58e0b27718..a46a18a2626 100644 --- a/neutron/conf/agent/database/agentschedulers_db.py +++ b/neutron/conf/agent/database/agentschedulers_db.py @@ -33,7 +33,9 @@ AGENTS_SCHEDULER_OPTS = [ 'network. If this number is greater than 1, the ' 'scheduler automatically assigns multiple DHCP agents ' 'for a given tenant network, providing high ' - 'availability for the DHCP service.')), + 'availability for the DHCP service. However this does ' + 'not provide high availability for the IPv6 metadata ' + 'service in isolated networks.')), cfg.BoolOpt('enable_services_on_agents_with_admin_state_down', default=False, help=_('Enable services on an agent with admin_state_up ' diff --git a/neutron/tests/unit/agent/dhcp/test_agent.py b/neutron/tests/unit/agent/dhcp/test_agent.py index ecfd37b17c7..7bf48f017f1 100644 --- a/neutron/tests/unit/agent/dhcp/test_agent.py +++ b/neutron/tests/unit/agent/dhcp/test_agent.py @@ -37,6 +37,7 @@ from neutron.agent.linux import dhcp from neutron.agent.linux import interface from neutron.agent.linux import utils as linux_utils from neutron.agent.metadata import driver as metadata_driver +from neutron.common import _constants as common_constants from neutron.common import config as common_config from neutron.common import utils from neutron.conf.agent import common as config @@ -1929,7 +1930,7 @@ class TestDeviceManager(base.BaseTestCase): expected_ips = ['172.9.9.9/24', const.METADATA_CIDR] if ipv6_enabled: - expected_ips.append(const.METADATA_V6_CIDR) + expected_ips.append(common_constants.METADATA_V6_CIDR) expected = [mock.call.get_device_name(port)] diff --git a/neutron/tests/unit/agent/linux/test_dhcp.py b/neutron/tests/unit/agent/linux/test_dhcp.py index 064461b649a..98ed27d93b5 100644 --- a/neutron/tests/unit/agent/linux/test_dhcp.py +++ b/neutron/tests/unit/agent/linux/test_dhcp.py @@ -33,6 +33,7 @@ import testtools from neutron.agent.linux import dhcp from neutron.agent.linux import ip_lib from neutron.cmd import runtime_checks as checks +from neutron.common import _constants as common_constants from neutron.common import utils as common_utils from neutron.conf.agent import common as config from neutron.conf.agent import dhcp as dhcp_config @@ -3295,7 +3296,7 @@ class TestDeviceManager(TestConfBase): if enable_isolated_metadata or force_metadata: expect_ips.extend([ constants.METADATA_CIDR, - constants.METADATA_V6_CIDR]) + common_constants.METADATA_V6_CIDR]) mgr.driver.init_l3.assert_called_with('ns-XXX', expect_ips, namespace='qdhcp-ns') diff --git a/neutron/tests/unit/agent/linux/test_ip_lib.py b/neutron/tests/unit/agent/linux/test_ip_lib.py index da754464c35..c488e90ddc2 100644 --- a/neutron/tests/unit/agent/linux/test_ip_lib.py +++ b/neutron/tests/unit/agent/linux/test_ip_lib.py @@ -792,7 +792,7 @@ class TestIpAddrCommand(TestIPCmdBase): def test_wait_until_address_dadfailed(self): self.addr_cmd.list = mock.Mock( return_value=[{'tentative': True, 'dadfailed': True}]) - with testtools.ExpectedException(ip_lib.AddressNotReady): + with testtools.ExpectedException(ip_lib.DADFailed): self.addr_cmd.wait_until_address_ready('abcd::1234') @mock.patch.object(common_utils, 'wait_until_true') diff --git a/neutron/tests/unit/agent/metadata/test_driver.py b/neutron/tests/unit/agent/metadata/test_driver.py index fc59b7fee8a..e3b0b8ef6e3 100644 --- a/neutron/tests/unit/agent/metadata/test_driver.py +++ b/neutron/tests/unit/agent/metadata/test_driver.py @@ -25,6 +25,7 @@ from oslo_utils import uuidutils from neutron.agent.l3 import agent as l3_agent from neutron.agent.l3 import router_info from neutron.agent.linux import external_process as ep +from neutron.agent.linux import ip_lib from neutron.agent.linux import iptables_manager from neutron.agent.linux import utils as linux_utils from neutron.agent.metadata import driver as metadata_driver @@ -76,6 +77,7 @@ class TestMetadataDriverProcess(base.BaseTestCase): EUNAME = 'neutron' EGNAME = 'neutron' METADATA_DEFAULT_IP = '169.254.169.254' + METADATA_DEFAULT_IPV6 = 'fe80::a9fe:a9fe' METADATA_PORT = 8080 METADATA_SOCKET = '/socket/path' PIDFILE = 'pidfile' @@ -140,7 +142,7 @@ class TestMetadataDriverProcess(base.BaseTestCase): agent._process_updated_router(router) f.assert_not_called() - def test_spawn_metadata_proxy(self): + def _test_spawn_metadata_proxy(self, dad_failed=False): router_id = _uuid() router_ns = 'qrouter-%s' % router_id service_name = 'haproxy' @@ -165,22 +167,32 @@ class TestMetadataDriverProcess(base.BaseTestCase): 'NamespaceManager.list_all', return_value={}),\ mock.patch( 'neutron.agent.linux.ip_lib.' - 'IpAddrCommand.wait_until_address_ready') as mock_wait: + 'IpAddrCommand.wait_until_address_ready') as mock_wait,\ + mock.patch( + 'neutron.agent.linux.ip_lib.' + 'delete_ip_address') as mock_del: agent = l3_agent.L3NATAgent('localhost') + agent.process_monitor = mock.Mock() cfg_file = os.path.join( metadata_driver.HaproxyConfigurator.get_config_path( agent.conf.state_path), "%s.conf" % router_id) mock_open = self.useFixture( lib_fixtures.OpenFixture(cfg_file)).mock_open - mock_wait.return_value = True + if dad_failed: + mock_wait.side_effect = ip_lib.DADFailed( + address=self.METADATA_DEFAULT_IP, reason='DAD failed') + else: + mock_wait.return_value = True agent.metadata_driver.spawn_monitored_metadata_proxy( agent.process_monitor, router_ns, self.METADATA_PORT, agent.conf, bind_address=self.METADATA_DEFAULT_IP, - router_id=router_id) + router_id=router_id, + bind_address_v6=self.METADATA_DEFAULT_IPV6, + bind_interface='fake-if') netns_execute_args = [ service_name, @@ -188,6 +200,8 @@ class TestMetadataDriverProcess(base.BaseTestCase): log_tag = ("haproxy-" + metadata_driver.METADATA_SERVICE_NAME + "-" + router_id) + bind_v6_line = 'bind %s:%s interface %s' % ( + self.METADATA_DEFAULT_IPV6, self.METADATA_PORT, 'fake-if') cfg_contents = metadata_driver._HAPROXY_CONFIG_TEMPLATE % { 'user': self.EUNAME, 'group': self.EGNAME, @@ -200,19 +214,35 @@ class TestMetadataDriverProcess(base.BaseTestCase): 'pidfile': self.PIDFILE, 'log_level': 'debug', 'log_tag': log_tag, - 'bind_v6_line': ''} + 'bind_v6_line': bind_v6_line} - mock_open.assert_has_calls([ - mock.call(cfg_file, 'w'), - mock.call().write(cfg_contents)], - any_order=True) + if dad_failed: + agent.process_monitor.register.assert_not_called() + mock_del.assert_called_once_with(self.METADATA_DEFAULT_IPV6, + 'fake-if', + namespace=router_ns) + else: + mock_open.assert_has_calls([ + mock.call(cfg_file, 'w'), + mock.call().write(cfg_contents)], any_order=True) - env = {ep.PROCESS_TAG: service_name + '-' + router_id} - ip_mock.assert_has_calls([ - mock.call(namespace=router_ns), - mock.call().netns.execute(netns_execute_args, addl_env=env, - run_as_root=True) - ]) + env = {ep.PROCESS_TAG: service_name + '-' + router_id} + ip_mock.assert_has_calls([ + mock.call(namespace=router_ns), + mock.call().netns.execute(netns_execute_args, addl_env=env, + run_as_root=True) + ]) + + agent.process_monitor.register.assert_called_once_with( + router_id, metadata_driver.METADATA_SERVICE_NAME, + mock.ANY) + mock_del.assert_not_called() + + def test_spawn_metadata_proxy(self): + self._test_spawn_metadata_proxy() + + def test_spawn_metadata_proxy_dad_failed(self): + self._test_spawn_metadata_proxy(dad_failed=True) def test_create_config_file_wrong_user(self): with mock.patch('pwd.getpwnam', side_effect=KeyError): diff --git a/releasenotes/notes/bug-1953165-6e848ea2c0398f56.yaml b/releasenotes/notes/bug-1953165-6e848ea2c0398f56.yaml new file mode 100644 index 00000000000..6c79c0daef7 --- /dev/null +++ b/releasenotes/notes/bug-1953165-6e848ea2c0398f56.yaml @@ -0,0 +1,16 @@ +--- +issues: + - | + The high availability of metadata service on isolated networks is limited + or non-existent. IPv4 metadata is redundant when the DHCP agent managing + it is redundant, but recovery is tied to the renewal of the DHCP lease, + making most recoveries very slow. IPv6 metadata is not redundant at all + as the IPv6 metadata address can only be configured in a single place at + a time as it is link-local. Multiple agents trying to configure it will + generate an IPv6 duplicate address detection failure. + + Administrators may observe the IPv6 metadata address in "dadfailed" state + in the DHCP namespace for this reason, which is only an indication it is + not highly available. Until a redesign is made to the isolated metadata + service there is not a better deployment option. See `bug 1953165 + `_ for information.