From 29dc044a7aa1b1dd35ea4695c055feb5136ba1e5 Mon Sep 17 00:00:00 2001
From: Artom Lifshitz
Date: Fri, 8 Mar 2024 13:00:30 -0500
Subject: [PATCH] pwr mgmt: make API into a per-driver object

We want to test power management in our functional tests in multinode
scenarios (ex: live migration). This was previously impossible because
all the methods in nova.virt.libvirt.cpu.api were at the module level,
meaning both source and destination libvirt drivers would call the same
method to online and offline cores. This made it impossible to maintain
distinct core power state between source and destination.

This patch introduces a nova.virt.libvirt.cpu.api.API class, and gives
the libvirt driver a cpu_api attribute with an instance of that class.
Along with the tiny API.core() helper, this allows new functional tests
in the subsequent patches to stub out the core "model" code with
distinct objects on the source and destination libvirt drivers, and
enables a whole bunch of testing (and fixes!) around live migration.

Related-bug: 2056613
Change-Id: I052535249b9a3e144bb68b8c588b5995eb345b97
---
 nova/tests/unit/virt/libvirt/cpu/test_api.py |  35 ++--
 nova/virt/libvirt/cpu/api.py                 | 164 ++++++++++---------
 nova/virt/libvirt/driver.py                  |  19 ++-
 3 files changed, 121 insertions(+), 97 deletions(-)

diff --git a/nova/tests/unit/virt/libvirt/cpu/test_api.py b/nova/tests/unit/virt/libvirt/cpu/test_api.py
index 890f4566c472..e7aa67570ffd 100644
--- a/nova/tests/unit/virt/libvirt/cpu/test_api.py
+++ b/nova/tests/unit/virt/libvirt/cpu/test_api.py
@@ -24,6 +24,7 @@ class TestAPI(test.NoDBTestCase):
     def setUp(self):
         super(TestAPI, self).setUp()
         self.core_1 = api.Core(1)
+        self.api = api.API()
 
         # Create a fake instance with two pinned CPUs but only one is on the
         # dedicated set
@@ -82,7 +83,7 @@ class TestAPI(test.NoDBTestCase):
         self.flags(cpu_power_management=True, group='libvirt')
         self.flags(cpu_dedicated_set='1-2', group='compute')
 
-        api.power_up(self.fake_inst)
+        self.api.power_up(self.fake_inst)
         # only core #2 can be set as core #0 is not on the dedicated set
         # As a reminder, core(i).online calls set_online(i)
         mock_online.assert_called_once_with(2)
@@ -93,7 +94,7 @@ class TestAPI(test.NoDBTestCase):
         self.flags(cpu_power_management_strategy='governor', group='libvirt')
         self.flags(cpu_dedicated_set='1-2', group='compute')
 
-        api.power_up(self.fake_inst)
+        self.api.power_up(self.fake_inst)
         # only core #2 can be set as core #1 is not on the dedicated set
         # As a reminder, core(i).set_high_governor calls set_governor(i)
         mock_set_governor.assert_called_once_with(2, 'performance')
@@ -101,13 +102,13 @@ class TestAPI(test.NoDBTestCase):
     @mock.patch.object(core, 'set_online')
     def test_power_up_skipped(self, mock_online):
         self.flags(cpu_power_management=False, group='libvirt')
-        api.power_up(self.fake_inst)
+        self.api.power_up(self.fake_inst)
         mock_online.assert_not_called()
 
     @mock.patch.object(core, 'set_online')
     def test_power_up_skipped_if_standard_instance(self, mock_online):
         self.flags(cpu_power_management=True, group='libvirt')
-        api.power_up(objects.Instance(numa_topology=None))
+        self.api.power_up(objects.Instance(numa_topology=None))
         mock_online.assert_not_called()
 
     @mock.patch.object(core, 'set_offline')
@@ -115,7 +116,7 @@ class TestAPI(test.NoDBTestCase):
         self.flags(cpu_power_management=True, group='libvirt')
         self.flags(cpu_dedicated_set='1-2', group='compute')
 
-        api.power_down(self.fake_inst)
+        self.api.power_down(self.fake_inst)
         # only core #2 can be set as core #1 is not on the dedicated set
         # As a reminder, core(i).online calls set_online(i)
         mock_offline.assert_called_once_with(2)
@@ -126,7 +127,7 @@ class TestAPI(test.NoDBTestCase):
         self.flags(cpu_power_management_strategy='governor', group='libvirt')
         self.flags(cpu_dedicated_set='0-1', group='compute')
 
-        api.power_down(self.fake_inst)
+        self.api.power_down(self.fake_inst)
 
         # Make sure that core #0 is ignored, since it is special and cannot
         # be powered down.
@@ -138,7 +139,7 @@ class TestAPI(test.NoDBTestCase):
         self.flags(cpu_power_management_strategy='governor', group='libvirt')
         self.flags(cpu_dedicated_set='1-2', group='compute')
 
-        api.power_down(self.fake_inst)
+        self.api.power_down(self.fake_inst)
 
         # only core #2 can be set as core #0 is not on the dedicated set
         # As a reminder, core(i).set_high_governor calls set_governor(i)
@@ -147,13 +148,13 @@ class TestAPI(test.NoDBTestCase):
     @mock.patch.object(core, 'set_offline')
     def test_power_down_skipped(self, mock_offline):
         self.flags(cpu_power_management=False, group='libvirt')
-        api.power_down(self.fake_inst)
+        self.api.power_down(self.fake_inst)
         mock_offline.assert_not_called()
 
     @mock.patch.object(core, 'set_offline')
     def test_power_down_skipped_if_standard_instance(self, mock_offline):
         self.flags(cpu_power_management=True, group='libvirt')
-        api.power_down(objects.Instance(numa_topology=None))
+        self.api.power_down(objects.Instance(numa_topology=None))
         mock_offline.assert_not_called()
 
     @mock.patch.object(core, 'set_offline')
@@ -161,7 +162,7 @@ class TestAPI(test.NoDBTestCase):
         self.flags(cpu_power_management=True, group='libvirt')
         self.flags(cpu_dedicated_set='0-2', group='compute')
 
-        api.power_down_all_dedicated_cpus()
+        self.api.power_down_all_dedicated_cpus()
 
         # All dedicated CPUs are turned offline, except CPU0
         mock_offline.assert_has_calls([mock.call(1), mock.call(2)])
@@ -171,7 +172,7 @@ class TestAPI(test.NoDBTestCase):
         self.flags(cpu_power_management_strategy='governor', group='libvirt')
         self.flags(cpu_dedicated_set='0-2', group='compute')
 
-        api.power_down_all_dedicated_cpus()
+        self.api.power_down_all_dedicated_cpus()
         # All dedicated CPUs are turned offline, except CPU0
         mock_set_governor.assert_has_calls([mock.call(1, 'powersave'),
                                             mock.call(2, 'powersave')])
@@ -179,7 +180,7 @@ class TestAPI(test.NoDBTestCase):
     @mock.patch.object(core, 'set_offline')
     def test_power_down_all_dedicated_cpus_skipped(self, mock_offline):
         self.flags(cpu_power_management=False, group='libvirt')
-        api.power_down_all_dedicated_cpus()
+        self.api.power_down_all_dedicated_cpus()
         mock_offline.assert_not_called()
 
     @mock.patch.object(core, 'set_offline')
@@ -188,7 +189,7 @@ class TestAPI(test.NoDBTestCase):
     ):
         self.flags(cpu_power_management=True, group='libvirt')
         self.flags(cpu_dedicated_set=None, group='compute')
-        api.power_down_all_dedicated_cpus()
+        self.api.power_down_all_dedicated_cpus()
         mock_offline.assert_not_called()
 
     @mock.patch.object(core, 'get_governor')
     @mock.patch.object(core, 'get_online')
@@ -201,7 +202,7 @@ class TestAPI(test.NoDBTestCase):
         mock_get_governor.return_value = 'performance'
         mock_get_online.side_effect = (True, False)
         self.assertRaises(exception.InvalidConfiguration,
-                          api.validate_all_dedicated_cpus)
+                          self.api.validate_all_dedicated_cpus)
 
     @mock.patch.object(core, 'get_governor')
     @mock.patch.object(core, 'get_online')
@@ -213,7 +214,7 @@ class TestAPI(test.NoDBTestCase):
         mock_get_online.return_value = True
         mock_get_governor.side_effect = ('powersave', 'performance')
         self.assertRaises(exception.InvalidConfiguration,
-                          api.validate_all_dedicated_cpus)
+                          self.api.validate_all_dedicated_cpus)
 
     @mock.patch.object(core, 'get_governor')
     @mock.patch.object(core, 'get_online')
@@ -227,7 +228,7 @@ class TestAPI(test.NoDBTestCase):
         mock_get_online.return_value = True
         mock_get_governor.return_value = 'performance'
 
-        api.validate_all_dedicated_cpus()
+        self.api.validate_all_dedicated_cpus()
 
         # Make sure we skipped CPU0
         mock_get_online.assert_has_calls([mock.call(1), mock.call(2)])
@@ -240,6 +241,6 @@ class TestAPI(test.NoDBTestCase):
     def test_validate_all_dedicated_cpus_no_cpu(self):
         self.flags(cpu_power_management=True, group='libvirt')
         self.flags(cpu_dedicated_set=None, group='compute')
-        api.validate_all_dedicated_cpus()
+        self.api.validate_all_dedicated_cpus()
         # no assert we want to make sure the validation won't raise if
         # no dedicated cpus are configured
diff --git a/nova/virt/libvirt/cpu/api.py b/nova/virt/libvirt/cpu/api.py
index d1e1b2628b0f..02e6cfa8b978 100644
--- a/nova/virt/libvirt/cpu/api.py
+++ b/nova/virt/libvirt/cpu/api.py
@@ -75,88 +75,102 @@ class Core:
         core.set_governor(self.ident, CONF.libvirt.cpu_power_governor_low)
 
 
-def power_up(instance: objects.Instance) -> None:
-    if not CONF.libvirt.cpu_power_management:
-        return
-    if instance.numa_topology is None:
-        return
+class API(object):
 
-    cpu_dedicated_set = hardware.get_cpu_dedicated_set_nozero() or set()
-    pcpus = instance.numa_topology.cpu_pinning.union(
-        instance.numa_topology.cpuset_reserved)
-    powered_up = set()
-    for pcpu in pcpus:
-        if pcpu in cpu_dedicated_set:
-            pcpu = Core(pcpu)
-            if CONF.libvirt.cpu_power_management_strategy == 'cpu_state':
-                pcpu.online = True
-            else:
-                pcpu.set_high_governor()
-            powered_up.add(str(pcpu))
-    LOG.debug("Cores powered up : %s", powered_up)
+    def core(self, i):
+        """From a purely functional point of view, there is no need for this
+        method. However, we want to test power management in multinode
+        scenarios (ex: live migration) in our functional tests. If we
+        instantiated the Core class directly in the methods below, the
+        functional tests would not be able to distinguish between cores on
+        the source and destination hosts. In functional tests we can replace
+        this helper method by a stub that returns a fixture, allowing us to
+        maintain distinct core power state for each host.
+        See also nova.virt.libvirt.driver.LibvirtDriver.cpu_api.
+ """ + return Core(i) -def power_down(instance: objects.Instance) -> None: - if not CONF.libvirt.cpu_power_management: - return - if instance.numa_topology is None: - return + def power_up(self, instance: objects.Instance) -> None: + if not CONF.libvirt.cpu_power_management: + return + if instance.numa_topology is None: + return - cpu_dedicated_set = hardware.get_cpu_dedicated_set_nozero() or set() - pcpus = instance.numa_topology.cpu_pinning.union( - instance.numa_topology.cpuset_reserved) - powered_down = set() - for pcpu in pcpus: - if pcpu in cpu_dedicated_set: - pcpu = Core(pcpu) + cpu_dedicated_set = hardware.get_cpu_dedicated_set_nozero() or set() + pcpus = instance.numa_topology.cpu_pinning.union( + instance.numa_topology.cpuset_reserved) + powered_up = set() + for pcpu in pcpus: + if pcpu in cpu_dedicated_set: + pcpu = self.core(pcpu) + if CONF.libvirt.cpu_power_management_strategy == 'cpu_state': + pcpu.online = True + else: + pcpu.set_high_governor() + powered_up.add(str(pcpu)) + LOG.debug("Cores powered up : %s", powered_up) + + def power_down(self, instance: objects.Instance) -> None: + if not CONF.libvirt.cpu_power_management: + return + if instance.numa_topology is None: + return + + cpu_dedicated_set = hardware.get_cpu_dedicated_set_nozero() or set() + pcpus = instance.numa_topology.cpu_pinning.union( + instance.numa_topology.cpuset_reserved) + powered_down = set() + for pcpu in pcpus: + if pcpu in cpu_dedicated_set: + pcpu = self.core(pcpu) + if CONF.libvirt.cpu_power_management_strategy == 'cpu_state': + pcpu.online = False + else: + pcpu.set_low_governor() + powered_down.add(str(pcpu)) + LOG.debug("Cores powered down : %s", powered_down) + + def power_down_all_dedicated_cpus(self) -> None: + if not CONF.libvirt.cpu_power_management: + return + + cpu_dedicated_set = hardware.get_cpu_dedicated_set_nozero() or set() + for pcpu in cpu_dedicated_set: + pcpu = self.core(pcpu) if CONF.libvirt.cpu_power_management_strategy == 'cpu_state': pcpu.online = False else: pcpu.set_low_governor() - powered_down.add(str(pcpu)) - LOG.debug("Cores powered down : %s", powered_down) + LOG.debug("Cores powered down : %s", cpu_dedicated_set) - -def power_down_all_dedicated_cpus() -> None: - if not CONF.libvirt.cpu_power_management: - return - - cpu_dedicated_set = hardware.get_cpu_dedicated_set_nozero() or set() - for pcpu in cpu_dedicated_set: - pcpu = Core(pcpu) + def validate_all_dedicated_cpus(self) -> None: + if not CONF.libvirt.cpu_power_management: + return + cpu_dedicated_set = hardware.get_cpu_dedicated_set() or set() + governors = set() + cpu_states = set() + for pcpu in cpu_dedicated_set: + if (pcpu == 0 and + CONF.libvirt.cpu_power_management_strategy == 'cpu_state'): + LOG.warning('CPU0 is in cpu_dedicated_set, ' + 'but it is not eligible for state management ' + 'and will be ignored') + continue + pcpu = self.core(pcpu) + # we need to collect the governors strategy and the CPU states + governors.add(pcpu.governor) + cpu_states.add(pcpu.online) if CONF.libvirt.cpu_power_management_strategy == 'cpu_state': - pcpu.online = False - else: - pcpu.set_low_governor() - LOG.debug("Cores powered down : %s", cpu_dedicated_set) - - -def validate_all_dedicated_cpus() -> None: - if not CONF.libvirt.cpu_power_management: - return - cpu_dedicated_set = hardware.get_cpu_dedicated_set() or set() - governors = set() - cpu_states = set() - for pcpu in cpu_dedicated_set: - if (pcpu == 0 and - CONF.libvirt.cpu_power_management_strategy == 'cpu_state'): - LOG.warning('CPU0 is in cpu_dedicated_set, but 
-                        'for state management and will be ignored')
-            continue
-        pcpu = Core(pcpu)
-        # we need to collect the governors strategy and the CPU states
-        governors.add(pcpu.governor)
-        cpu_states.add(pcpu.online)
-    if CONF.libvirt.cpu_power_management_strategy == 'cpu_state':
-        # all the cores need to have the same governor strategy
-        if len(governors) > 1:
-            msg = _("All the cores need to have the same governor strategy"
-                    "before modifying the CPU states. You can reboot the "
-                    "compute node if you prefer.")
-            raise exception.InvalidConfiguration(msg)
-    elif CONF.libvirt.cpu_power_management_strategy == 'governor':
-        # all the cores need to be online
-        if False in cpu_states:
-            msg = _("All the cores need to be online before modifying the "
-                    "governor strategy.")
-            raise exception.InvalidConfiguration(msg)
+            # all the cores need to have the same governor strategy
+            if len(governors) > 1:
+                msg = _("All the cores need to have the same governor "
+                        "strategy before modifying the CPU states. You can "
+                        "reboot the compute node if you prefer.")
+                raise exception.InvalidConfiguration(msg)
+        elif CONF.libvirt.cpu_power_management_strategy == 'governor':
+            # all the cores need to be online
+            if False in cpu_states:
+                msg = _("All the cores need to be online before modifying "
+                        "the governor strategy.")
+                raise exception.InvalidConfiguration(msg)
diff --git a/nova/virt/libvirt/driver.py b/nova/virt/libvirt/driver.py
index 88969306a3b0..4c908ccf3176 100644
--- a/nova/virt/libvirt/driver.py
+++ b/nova/virt/libvirt/driver.py
@@ -549,6 +549,15 @@ class LibvirtDriver(driver.ComputeDriver):
         # events about success or failure.
         self._device_event_handler = AsyncDeviceEventsHandler()
 
+        # NOTE(artom) From a pure functionality point of view, there's no need
+        # for this to be an attribute of self. However, we want to test power
+        # management in multinode scenarios (ex: live migration) in our
+        # functional tests. If the power management code was just a bunch of
+        # module level functions, the functional tests would not be able to
+        # distinguish between cores on the source and destination hosts.
+        # See also nova.virt.libvirt.cpu.api.API.core().
+        self.cpu_api = libvirt_cpu.API()
+
     def _discover_vpmems(self, vpmem_conf=None):
         """Discover vpmems on host and configuration.
 
@@ -829,13 +838,13 @@ class LibvirtDriver(driver.ComputeDriver):
         # modified by Nova before. Note that it can provide an exception if
         # either the governor strategies are different between the cores or if
         # the cores are offline.
-        libvirt_cpu.validate_all_dedicated_cpus()
+        self.cpu_api.validate_all_dedicated_cpus()
         # NOTE(sbauza): We powerdown all dedicated CPUs but if some instances
         # exist that are pinned for some CPUs, then we'll later powerup those
         # CPUs when rebooting the instance in _init_instance()
         # Note that it can provide an exception if the config options are
         # wrongly modified.
-        libvirt_cpu.power_down_all_dedicated_cpus()
+        self.cpu_api.power_down_all_dedicated_cpus()
 
         # TODO(sbauza): Remove this code once mediated devices are persisted
         # across reboots.
@@ -1559,7 +1568,7 @@ class LibvirtDriver(driver.ComputeDriver):
         if CONF.libvirt.virt_type == 'lxc':
             self._teardown_container(instance)
         # We're sure the instance is gone, we can shutdown the core if so
-        libvirt_cpu.power_down(instance)
+        self.cpu_api.power_down(instance)
 
     def destroy(self, context, instance, network_info, block_device_info=None,
                 destroy_disks=True, destroy_secrets=True):
@@ -3301,7 +3310,7 @@ class LibvirtDriver(driver.ComputeDriver):
 
         current_power_state = guest.get_power_state(self._host)
 
-        libvirt_cpu.power_up(instance)
+        self.cpu_api.power_up(instance)
 
         # TODO(stephenfin): Any reason we couldn't use 'self.resume' here?
         guest.launch(pause=current_power_state == power_state.PAUSED)
@@ -7954,7 +7963,7 @@ class LibvirtDriver(driver.ComputeDriver):
             post_xml_callback()
 
         if power_on or pause:
-            libvirt_cpu.power_up(instance)
+            self.cpu_api.power_up(instance)
             guest.launch(pause=pause)
 
         return guest
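
To make the testing seam concrete, here is a minimal sketch of how a
functional test could exploit the new API.core() hook. The names FakeCore
and PerHostAPI are invented for this illustration; they are not part of the
patch, and the actual functional tests in the subsequent patches may use a
different fixture mechanism.

```python
# Illustrative only: FakeCore and PerHostAPI are hypothetical test doubles,
# not Nova code. They show how overriding API.core() gives each libvirt
# driver its own in-memory core "model".
from nova.virt.libvirt.cpu import api as cpu_api


class FakeCore:
    """Stands in for cpu.api.Core; keeps state in memory, not in sysfs."""

    def __init__(self, ident):
        self.ident = ident
        self.online = True  # assume every core starts online
        self.governor = 'performance'

    def set_high_governor(self):
        self.governor = 'performance'

    def set_low_governor(self):
        self.governor = 'powersave'


class PerHostAPI(cpu_api.API):
    """A cpu.api.API whose cores live in a per-instance dict.

    Because each driver holds its own PerHostAPI, powering a core down
    through the "source" driver no longer affects the "destination".
    """

    def __init__(self):
        self._cores = {}

    def core(self, i):
        # Return this host's private model of core i, creating it on
        # first use instead of touching /sys on the test machine.
        return self._cores.setdefault(i, FakeCore(i))


# In a functional test, each fake compute's driver would be given its own
# object, e.g.:
#     src_driver.cpu_api = PerHostAPI()
#     dst_driver.cpu_api = PerHostAPI()
```

With that in place, a live migration test can assert that the instance's
dedicated cores were powered down on the source and powered up on the
destination independently, which is exactly the distinction the module-level
functions could not express.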