diff --git a/nova/conf/ironic.py b/nova/conf/ironic.py
index ca1655dcc24b..9c1087eef7e7 100644
--- a/nova/conf/ironic.py
+++ b/nova/conf/ironic.py
@@ -82,6 +82,17 @@ Related options:
         'service. Note that setting this to the empty string (``""``) '
         'will match the default conductor group, and is different than '
         'leaving the option unset.'),
+    cfg.StrOpt(
+        'shard',
+        default=None,
+        max_length=255,
+        regex=r'^[a-zA-Z0-9_.-]+$',
+        help='Specify which ironic shard this nova-compute will manage. '
+             'This allows you to shard Ironic nodes between compute '
+             'services across conductors and conductor groups. '
+             'When a shard is set, the peer_list configuration is ignored. '
+             'We require that there is at most one nova-compute service '
+             'for each shard.'),
     cfg.ListOpt(
         'peer_list',
         deprecated_for_removal=True,
@@ -90,7 +101,6 @@ Related options:
 We do not recommend using nova-compute HA, please use passive failover
 of a single nova-compute service instead.""",
         default=[],
-        mutable=True,
         help='List of hostnames for all nova-compute services (including '
              'this host) with this conductor_group config value. '
              'Nodes matching the conductor_group value will be distributed '
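A note on the option above: the ``max_length`` and ``regex`` arguments let oslo.config reject malformed shard names up front. A minimal standalone sketch of that validation (not part of the patch; the ``ConfigOpts`` wiring and example values are illustrative only):

    from oslo_config import cfg

    # Same constraints as the new [ironic]shard option above.
    opts = [
        cfg.StrOpt('shard',
                   default=None,
                   max_length=255,
                   regex=r'^[a-zA-Z0-9_.-]+$',
                   help='Which ironic shard this nova-compute manages.'),
    ]

    conf = cfg.ConfigOpts()
    conf.register_opts(opts, group='ironic')

    # Letters, digits, underscore, dot and hyphen are all accepted.
    conf.set_override('shard', 'rack-42.shard_1', group='ironic')
    print(conf.ironic.shard)  # rack-42.shard_1

    try:
        # Spaces fail the ^[a-zA-Z0-9_.-]+$ regex, so the value is
        # rejected with a ValueError.
        conf.set_override('shard', 'not a shard', group='ironic')
    except ValueError as exc:
        print('rejected: %s' % exc)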
diff --git a/nova/tests/unit/virt/ironic/test_driver.py b/nova/tests/unit/virt/ironic/test_driver.py
index bfea28b71490..97b4fc4cda41 100644
--- a/nova/tests/unit/virt/ironic/test_driver.py
+++ b/nova/tests/unit/virt/ironic/test_driver.py
@@ -2933,6 +2933,31 @@ class HashRingTestCase(test.NoDBTestCase):
             uncalled=['host3'])
         mock_can_send.assert_called_once_with('1.46')
 
+    @mock.patch.object(ironic_driver.IronicDriver, '_can_send_version')
+    def test__refresh_hash_ring_peer_list_shard(self, mock_can_send):
+        services = ['host1', 'host2', 'host3']
+        expected_hosts = {'host1'}
+        self.mock_is_up.return_value = True
+        self.flags(host='host1')
+        self.flags(shard='shard1', group='ironic')
+        self._test__refresh_hash_ring(services, expected_hosts,
+                                      uncalled=['host2', 'host3'])
+        mock_can_send.assert_not_called()
+
+    @mock.patch.object(ironic_driver.IronicDriver, '_can_send_version')
+    def test__refresh_hash_ring_peer_list_shard_and_cg(self, mock_can_send):
+        services = ['host1', 'host2', 'host3']
+        expected_hosts = {'host1'}
+        self.mock_is_up.return_value = True
+        self.flags(host='host1')
+        self.flags(shard='shard1', group='ironic')
+        self.flags(conductor_group='not-none', group='ironic')
+        # Note that this is getting ignored, because the shard is set
+        self.flags(peer_list=['host1', 'host2'], group='ironic')
+        self._test__refresh_hash_ring(services, expected_hosts,
+                                      uncalled=['host2', 'host3'])
+        mock_can_send.assert_not_called()
+
     @mock.patch.object(ironic_driver.IronicDriver, '_can_send_version')
     def test__refresh_hash_ring_peer_list_old_api(self, mock_can_send):
         mock_can_send.side_effect = (
@@ -3007,7 +3032,8 @@ class NodeCacheTestCase(test.NoDBTestCase):
     def _test__refresh_cache(self, instances, nodes, hosts, mock_instances,
                              mock_nodes, mock_hosts, mock_hash_ring,
                              mock_can_send, partition_key=None,
-                             can_send_146=True):
+                             can_send_146=True, shard=None,
+                             can_send_182=True):
         mock_instances.return_value = instances
         mock_nodes.return_value = nodes
         mock_hosts.side_effect = hosts
@@ -3017,12 +3043,18 @@ class NodeCacheTestCase(test.NoDBTestCase):
         if not can_send_146:
             mock_can_send.side_effect = (
                 exception.IronicAPIVersionNotAvailable(version='1.46'))
+        if not can_send_182:
+            mock_can_send.side_effect = None, (
+                exception.IronicAPIVersionNotAvailable(version='1.82'))
+
         self.driver.node_cache = {}
         self.driver.node_cache_time = None
 
         kwargs = {}
         if partition_key is not None and can_send_146:
             kwargs['conductor_group'] = partition_key
+        if shard and can_send_182:
+            kwargs["shard"] = shard
 
         self.driver._refresh_cache()
@@ -3100,6 +3132,74 @@ class NodeCacheTestCase(test.NoDBTestCase):
         expected_cache = {n.id: n for n in nodes}
         self.assertEqual(expected_cache, self.driver.node_cache)
 
+    def test__refresh_cache_shard(self):
+        # normal operation, one compute service
+        instances = []
+        nodes = [
+            _get_cached_node(
+                id=uuidutils.generate_uuid(), instance_id=None),
+            _get_cached_node(
+                id=uuidutils.generate_uuid(), instance_id=None),
+            _get_cached_node(
+                id=uuidutils.generate_uuid(), instance_id=None),
+        ]
+        hosts = [self.host, self.host, self.host]
+        shard = "shard1"
+        self.flags(shard=shard, group='ironic')
+
+        self._test__refresh_cache(instances, nodes, hosts,
+                                  shard=shard)
+
+        expected_cache = {n.id: n for n in nodes}
+        self.assertEqual(expected_cache, self.driver.node_cache)
+
+    def test__refresh_cache_shard_and_conductor_group(self):
+        # normal operation, one compute service
+        instances = []
+        nodes = [
+            _get_cached_node(
+                id=uuidutils.generate_uuid(), instance_id=None),
+            _get_cached_node(
+                id=uuidutils.generate_uuid(), instance_id=None),
+            _get_cached_node(
+                id=uuidutils.generate_uuid(), instance_id=None),
+        ]
+        hosts = [self.host, self.host, self.host]
+        shard = "shard1"
+        self.flags(shard=shard, group='ironic')
+        partition_key = 'some-group'
+        self.flags(conductor_group=partition_key, group='ironic')
+
+        self._test__refresh_cache(instances, nodes, hosts,
+                                  shard=shard, partition_key=partition_key)
+
+        expected_cache = {n.id: n for n in nodes}
+        self.assertEqual(expected_cache, self.driver.node_cache)
+
+    def test__refresh_cache_shard_and_conductor_group_skip_shard(self):
+        # normal operation, one compute service
+        instances = []
+        nodes = [
+            _get_cached_node(
+                id=uuidutils.generate_uuid(), instance_id=None),
+            _get_cached_node(
+                id=uuidutils.generate_uuid(), instance_id=None),
+            _get_cached_node(
+                id=uuidutils.generate_uuid(), instance_id=None),
+        ]
+        hosts = [self.host, self.host, self.host]
+        shard = "shard1"
+        self.flags(shard=shard, group='ironic')
+        partition_key = 'some-group'
+        self.flags(conductor_group=partition_key, group='ironic')
+
+        self._test__refresh_cache(instances, nodes, hosts,
+                                  shard=shard, partition_key=partition_key,
+                                  can_send_182=False)
+
+        expected_cache = {n.id: n for n in nodes}
+        self.assertEqual(expected_cache, self.driver.node_cache)
+
     def test__refresh_cache_partition_key_old_api(self):
         # normal operation, one compute service
        instances = []
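The hash-ring rule these tests pin down (implemented in the driver hunk below) is small enough to summarise. A simplified, hedged sketch, using plain arguments as stand-ins for the real CONF options and service records:

    def ring_members(shard, host, conductor_group, peer_list, up_services):
        """Return the hosts that end up in the ironic hash ring."""
        if shard:
            # With a shard set we assume at most one nova-compute per
            # shard, so peer_list (and the 1.46 version probe) is
            # bypassed and the ring collapses to just this host.
            return {host}
        if conductor_group is not None:
            # Only configured peers that are currently up count.
            return {h for h in peer_list if h in up_services}
        return set(up_services)

    # Mirrors test__refresh_hash_ring_peer_list_shard_and_cg: the shard
    # wins over both conductor_group and peer_list.
    assert ring_members('shard1', 'host1', 'not-none',
                        ['host1', 'host2'],
                        ['host1', 'host2', 'host3']) == {'host1'}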
diff --git a/nova/virt/ironic/driver.py b/nova/virt/ironic/driver.py
index d66e5a505dd9..446b0db60756 100644
--- a/nova/virt/ironic/driver.py
+++ b/nova/virt/ironic/driver.py
@@ -698,12 +698,16 @@ class IronicDriver(virt_driver.ComputeDriver):
         return False
 
     def _refresh_hash_ring(self, ctxt):
-        peer_list = None
+        # When requesting a shard, we assume each compute service is
+        # targeting a separate shard, so hard code peer_list to
+        # just this service
+        peer_list = None if not CONF.ironic.shard else {CONF.host}
+
         # NOTE(jroll) if this is set, we need to limit the set of other
         # compute services in the hash ring to hosts that are currently up
         # and specified in the peer_list config option, as there's no way
         # to check which conductor_group other compute services are using.
-        if CONF.ironic.conductor_group is not None:
+        if peer_list is None and CONF.ironic.conductor_group is not None:
             try:
                 # NOTE(jroll) first we need to make sure the Ironic API can
                 # filter by conductor_group. If it cannot, limiting to
@@ -766,21 +770,24 @@ class IronicDriver(virt_driver.ComputeDriver):
         # attribute. If the API isn't new enough to support conductor groups,
         # we fall back to managing all nodes. If it is new enough, we can
         # filter it in the API.
+        # NOTE(johngarbutt) similarly, if shard is set, we also limit the
+        # nodes that are returned by the shard key
         conductor_group = CONF.ironic.conductor_group
-        if conductor_group is not None:
-            try:
+        shard = CONF.ironic.shard
+        kwargs = {}
+        try:
+            if conductor_group is not None:
                 self._can_send_version('1.46')
-                nodes = _get_node_list(conductor_group=conductor_group)
-                LOG.debug('Limiting manageable ironic nodes to conductor '
-                          'group %s', conductor_group)
-            except exception.IronicAPIVersionNotAvailable:
-                LOG.error('Required Ironic API version 1.46 is not '
-                          'available to filter nodes by conductor group. '
-                          'All nodes will be eligible to be managed by '
-                          'this compute service.')
-                nodes = _get_node_list()
-        else:
-            nodes = _get_node_list()
+                kwargs['conductor_group'] = conductor_group
+            if shard:
+                self._can_send_version('1.82')
+                kwargs['shard'] = shard
+            nodes = _get_node_list(**kwargs)
+        except exception.IronicAPIVersionNotAvailable:
+            LOG.error('Required Ironic API version is not '
+                      'available to filter nodes by conductor group '
+                      'and shard.')
+            nodes = _get_node_list(**kwargs)
 
         # NOTE(saga): As _get_node_list() will take a long
         # time to return in large clusters we need to call it before
diff --git a/releasenotes/notes/ironic-shards-5641e4b1ab5bb7aa.yaml b/releasenotes/notes/ironic-shards-5641e4b1ab5bb7aa.yaml
new file mode 100644
index 000000000000..5becd46e9a86
--- /dev/null
+++ b/releasenotes/notes/ironic-shards-5641e4b1ab5bb7aa.yaml
@@ -0,0 +1,12 @@
+---
+features:
+  - |
+    Ironic nova-compute services can now target a specific shard of ironic
+    nodes by setting the config ``[ironic]shard``.
+    This is particularly useful when using active-passive methods to choose
+    on which physical host your ironic nova-compute process is running,
+    while ensuring ``[DEFAULT]host`` stays the same for each shard.
+    You can use this alongside ``[ironic]conductor_group`` to further limit
+    which ironic nodes are managed by each nova-compute service.
+    Note that when you use ``[ironic]shard`` the ``[ironic]peer_list``
+    is hard coded to a single nova-compute service.
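One behaviour of the driver hunk above worth calling out: when the ironic API is too old for a requested filter, the node query degrades to whichever filters were already validated rather than failing outright. A self-contained sketch of that kwargs-building logic (the exception class and helper name here are illustrative, not nova's):

    class IronicAPIVersionNotAvailable(Exception):
        pass

    def build_node_filters(conductor_group, shard, supported_versions):
        """Add each filter only once its minimum ironic API
        microversion is confirmed, as _refresh_cache does above."""
        kwargs = {}
        try:
            if conductor_group is not None:
                if '1.46' not in supported_versions:
                    raise IronicAPIVersionNotAvailable('1.46')
                kwargs['conductor_group'] = conductor_group
            if shard:
                if '1.82' not in supported_versions:
                    raise IronicAPIVersionNotAvailable('1.82')
                kwargs['shard'] = shard
        except IronicAPIVersionNotAvailable:
            # Keep whatever filters were validated before the failure;
            # this is the LOG.error fallback branch in the patch.
            pass
        return kwargs

    # An API that only speaks 1.46 keeps the conductor_group filter but
    # drops the shard one, matching
    # test__refresh_cache_shard_and_conductor_group_skip_shard.
    assert build_node_filters('some-group', 'shard1', {'1.46'}) == {
        'conductor_group': 'some-group'}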