diff --git a/doc/source/admin/cpu-topologies.rst b/doc/source/admin/cpu-topologies.rst
index 179f7bd3775d..529542d8050e 100644
--- a/doc/source/admin/cpu-topologies.rst
+++ b/doc/source/admin/cpu-topologies.rst
@@ -95,14 +95,64 @@ In all cases where NUMA awareness is used, the ``NUMATopologyFilter``
 filter must be enabled. Details on this filter are provided in
 :doc:`/admin/scheduling`.
 
+The host NUMA node(s) used by an instance are chosen according to a sorting
+logic controlled by the ``packing_host_numa_cells_allocation_strategy``
+configuration option in the ``[compute]`` section of ``nova.conf``. By
+default this option is set to ``True``, which selects the "pack" strategy:
+nova first attempts to use the host NUMA node(s) with the least amount of
+free resources, in other words the **most used** NUMA nodes. Load is packed
+onto the most used host NUMA node until it is completely exhausted, and only
+then is the most used node among the remaining host NUMA nodes chosen. The
+"spread" strategy is the reverse: the NUMA node(s) with the **most free**
+resources are used first, so the load is balanced across all host NUMA nodes
+and the amount of free resources on each node is kept as equal as possible.
+
 .. caution::
 
-   The NUMA node(s) used are normally chosen at random. However, if a PCI
-   passthrough or SR-IOV device is attached to the instance, then the NUMA
-   node that the device is associated with will be used. This can provide
-   important performance improvements. However, booting a large number of
-   similar instances can result in unbalanced NUMA node usage. Care should
-   be taken to mitigate this issue. See this `discussion`_ for more details.
+   The host NUMA nodes are placed in a list, and that list is sorted based
+   on the chosen strategy and the resources available in each NUMA node. The
+   sorts are performed on the same list one after another, so the sort
+   performed last has the highest priority.
+
+Python uses a so-called stable sort: a sort applied to the list changes the
+order of list items only if the property being sorted on differs between
+them. If that property is equal for all items, the existing order of the
+elements is preserved.
+
+The sorts are performed on the list of host NUMA nodes in the following
+order:
+
+* sort based on available memory on the node (first sort, lowest priority)
+* sort based on CPU usage (if the guest topology requests shared CPUs) or on
+  the number of free pinned CPUs otherwise
+* sort based on the number of free PCI devices on the node (last sort,
+  highest priority)
+
+The highest sorting priority therefore concerns host NUMA nodes with PCI
+devices attached. If the instance requests PCI device(s), the logic
+**always** puts the host NUMA nodes with more PCI devices at the beginning
+of the list. If no PCI devices are requested, the NUMA nodes with no (or
+fewer) available PCI devices are placed at the beginning of the list.
+
+.. caution::
+
+   The described logic for PCI devices is used for **both** the "pack" and
+   the "spread" strategies. This is done to keep backward compatibility with
+   previous nova versions.
+
+With the "pack" strategy, the remaining two sorts order the list so that
+NUMA nodes with more available resources (CPUs and memory) are moved to the
+**end** of the list. The sort based on memory is performed first and has the
+lowest priority.
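+
+The following simplified sketch (illustrative only, not the actual nova
+code) shows how the chained stable sorts produce the final ordering. Each
+host NUMA node is reduced to a ``(free_memory, free_pcpus, free_pci)``
+tuple:
+
+.. code-block:: python
+
+   # Each tuple stands for one host NUMA node:
+   # (free memory in MiB, free dedicated pCPUs, free PCI devices).
+   cells = [(2048, 2, 0), (2048, 4, 0), (3072, 2, 0), (2048, 2, 2)]
+
+   pack = True            # packing_host_numa_cells_allocation_strategy
+   pci_requested = False  # does the instance request a PCI device?
+
+   # First (lowest priority) sort: memory. With "pack" the nodes with the
+   # least free memory come first; with "spread" the order is reversed.
+   cells = sorted(cells, key=lambda c: c[0], reverse=not pack)
+   # Second sort: free dedicated CPUs (the pinned-CPU case); the shared-CPU
+   # case sorts on CPU usage instead, following the same pack/spread rule.
+   cells = sorted(cells, key=lambda c: c[1], reverse=not pack)
+   # Last (highest priority) sort: PCI devices. Regardless of the strategy,
+   # nodes with PCI devices go first only when a PCI device is requested.
+   cells = sorted(cells, key=lambda c: c[2], reverse=pci_requested)
+
+   # Each sort is stable, so the earlier orderings survive as tie-breakers
+   # among nodes that are equal on the later criteria.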
+
+With the "spread" strategy, the remaining two sorts order the list so that
+NUMA nodes with more available resources (CPUs and memory) are moved to the
+**beginning** of the list. Again, the sort based on memory is performed
+first and has the lowest priority.
+
+Finally, the resulting list (after all sorts) is passed on, and attempts to
+place the instance's NUMA cells on host NUMA nodes are made starting from
+the first host NUMA node in the list.
 
 .. caution::
 
@@ -724,5 +774,4 @@ instances with a NUMA topology.
 
 .. Links
 .. _`Image metadata`: https://docs.openstack.org/image-guide/introduction.html#image-metadata
-.. _`discussion`: http://lists.openstack.org/pipermail/openstack-dev/2016-March/090367.html
 .. _`MTTCG project`: http://wiki.qemu.org/Features/tcg-multithread
diff --git a/nova/conf/compute.py b/nova/conf/compute.py
index 5cf8c31714d1..263d77758695 100644
--- a/nova/conf/compute.py
+++ b/nova/conf/compute.py
@@ -1007,6 +1007,23 @@ Related options:
 
 * ``[scheduler]query_placement_for_image_type_support`` - enables
   filtering computes based on supported image types, which is required
  to be enabled for this to take effect.
+"""),
+    cfg.BoolOpt('packing_host_numa_cells_allocation_strategy',
+                default=True,
+                help="""
+This option controls the allocation strategy used to choose the host NUMA
+cells for placing a VM's NUMA cells (for VMs with a defined NUMA topology).
+By default, the host NUMA cell with the most resources already consumed is
+chosen first for a placement attempt, so a host cell that is already in use
+is packed with the VM's cells until it is completely exhausted, before a new
+free host cell is used. When ``packing_host_numa_cells_allocation_strategy``
+is set to ``False``, the host NUMA cell with the most resources available is
+used first.
+
+Possible values:
+
+* ``True``: Pack the VM's NUMA cells onto the most used host NUMA cells.
+* ``False``: Spread the VM's NUMA cells across the host NUMA cells with the
+  most resources available.
"""), ] diff --git a/nova/tests/unit/virt/test_hardware.py b/nova/tests/unit/virt/test_hardware.py index 80d0133c20be..b7e004f93f67 100644 --- a/nova/tests/unit/virt/test_hardware.py +++ b/nova/tests/unit/virt/test_hardware.py @@ -19,6 +19,7 @@ import ddt import mock import testtools +import nova.conf from nova import exception from nova import objects from nova.objects import fields @@ -28,6 +29,8 @@ from nova.tests.unit import fake_pci_device_pools as fake_pci from nova.tests.unit.image.fake import fake_image_obj from nova.virt import hardware as hw +CONF = nova.conf.CONF + class InstanceInfoTests(test.NoDBTestCase): @@ -2753,7 +2756,7 @@ class VirtNUMAHostTopologyTestCase(test.NoDBTestCase): # the PCI device is found on host cell 1 pci_stats = _create_pci_stats(1) - # ...threfore an instance without a PCI device should get host cell 2 + # ...therefore an instance without a PCI device should get host cell 2 instance_topology = hw.numa_fit_instance_to_host( self.host, self.instance1, pci_stats=pci_stats) self.assertIsInstance(instance_topology, objects.InstanceNUMATopology) @@ -2763,7 +2766,7 @@ class VirtNUMAHostTopologyTestCase(test.NoDBTestCase): # the PCI device is now found on host cell 2 pci_stats = _create_pci_stats(2) - # ...threfore an instance without a PCI device should get host cell 1 + # ...therefore an instance without a PCI device should get host cell 1 instance_topology = hw.numa_fit_instance_to_host( self.host, self.instance1, pci_stats=pci_stats) self.assertIsInstance(instance_topology, objects.InstanceNUMATopology) @@ -5664,3 +5667,243 @@ class RescuePropertyTestCase(test.NoDBTestCase): meta = objects.ImageMeta.from_dict({'disk_format': 'raw'}) meta.properties = objects.ImageMetaProps.from_dict(props) self.assertEqual(expected, hw.check_hw_rescue_props(meta)) + + +class HostCellsSortingTestCase(test.NoDBTestCase): + # NOTE (IPO) It is possible to test all sorting cases with one defined + # host NUMA topo. + # We have 4 NUMA cells with the following properties: + # NUMA cell 0: have most cpu usage + # NUMA cell 1: will have most PCI available + # NUMA cell 2: have most free pcpus + # NUMA cell 3: have most available memory + # So it will be enough to check order of NUMA cell in resulting instance + # topo to check particular sorting case. 
+
+    def setUp(self):
+        super(HostCellsSortingTestCase, self).setUp()
+
+        def _create_pci_stats(node, count):
+            test_dict = copy.copy(fake_pci.fake_pool_dict)
+            test_dict['numa_node'] = node
+            test_dict['vendor_id'] = '8086'
+            test_dict['product_id'] = 'fake-prod0'
+            test_dict['count'] = count
+            return stats.PciDeviceStats(
+                objects.NUMATopology(),
+                [objects.PciDevicePool.from_dict(test_dict)])
+
+        self.pci_stats = _create_pci_stats(1, 2)
+
+        self.host = objects.NUMATopology(cells=[
+            objects.NUMACell(
+                id=0,
+                cpuset=set([1, 2, 3, 4]),
+                pcpuset=set([1, 2, 3, 4]),
+                memory=4096,
+                cpu_usage=3,
+                memory_usage=2048,
+                pinned_cpus=set([1, 2]),
+                mempages=[objects.NUMAPagesTopology(
+                    size_kb=4, total=524288, used=0)],
+                siblings=[set([1]), set([2]), set([3]), set([4])]),
+            objects.NUMACell(
+                id=1,
+                cpuset=set([5, 6, 7, 8]),
+                pcpuset=set([5, 6, 7, 8]),
+                memory=4096,
+                cpu_usage=2,
+                memory_usage=2048,
+                pinned_cpus=set([5, 6]),
+                mempages=[objects.NUMAPagesTopology(
+                    size_kb=4, total=524288, used=0)],
+                siblings=[set([5]), set([6]), set([7]), set([8])]),
+            objects.NUMACell(
+                id=2,
+                cpuset=set([9, 10, 11, 12]),
+                pcpuset=set([9, 10, 11, 12]),
+                memory=4096,
+                cpu_usage=2,
+                memory_usage=2048,
+                pinned_cpus=set(),
+                mempages=[objects.NUMAPagesTopology(
+                    size_kb=4, total=524288, used=0)],
+                siblings=[set([9]), set([10]), set([11]), set([12])]),
+            objects.NUMACell(
+                id=3,
+                cpuset=set([13, 14, 15, 16]),
+                pcpuset=set([13, 14, 15, 16]),
+                memory=4096,
+                cpu_usage=2,
+                memory_usage=1024,
+                pinned_cpus=set([13, 14]),
+                mempages=[objects.NUMAPagesTopology(
+                    size_kb=4, total=524288, used=0)],
+                siblings=[set([13]), set([14]), set([15]), set([16])])
+        ])
+
+        self.instance0 = objects.InstanceNUMATopology(cells=[
+            objects.InstanceNUMACell(
+                id=0, cpuset=set([0]), pcpuset=set(), memory=2048),
+            objects.InstanceNUMACell(
+                id=1, cpuset=set([1]), pcpuset=set(), memory=2048),
+            objects.InstanceNUMACell(
+                id=2, cpuset=set([2]), pcpuset=set(), memory=2048),
+            objects.InstanceNUMACell(
+                id=3, cpuset=set([3]), pcpuset=set(), memory=2048)
+        ])
+
+        self.instance1 = objects.InstanceNUMATopology(cells=[
+            objects.InstanceNUMACell(
+                id=0, cpuset=set([0]), pcpuset=set(), memory=2048),
+            objects.InstanceNUMACell(
+                id=1, cpuset=set([1]), pcpuset=set(), memory=2048),
+            objects.InstanceNUMACell(
+                id=2, cpuset=set([2]), pcpuset=set(), memory=2048),
+        ])
+
+        self.instance2 = objects.InstanceNUMATopology(cells=[
+            objects.InstanceNUMACell(
+                id=0,
+                cpuset=set(),
+                pcpuset=set([0]),
+                memory=2048,
+                cpu_policy=fields.CPUAllocationPolicy.DEDICATED
+            ),
+            objects.InstanceNUMACell(
+                id=1,
+                cpuset=set(),
+                pcpuset=set([1]),
+                memory=2048,
+                cpu_policy=fields.CPUAllocationPolicy.DEDICATED
+            ),
+            objects.InstanceNUMACell(
+                id=2,
+                cpuset=set(),
+                pcpuset=set([2]),
+                memory=2048,
+                cpu_policy=fields.CPUAllocationPolicy.DEDICATED
+            )])
+
+    def assertInstanceNUMAcellOrder(self, list_to_check, instance_topo):
+        for cell, id in zip(instance_topo.cells, list_to_check):
+            self.assertEqual(cell.id, id)
+
+    def test_sort_host_numa_cell_num_equal_instance_cell_num(self):
+        instance_topology = hw.numa_fit_instance_to_host(
+            self.host, self.instance0)
+        self.assertInstanceNUMAcellOrder([0, 1, 2, 3], instance_topology)
+
+    def test_sort_no_pci_stats_no_shared_cpu_policy(self):
+        CONF.set_override(
+            'packing_host_numa_cells_allocation_strategy',
+            True,
+            group='compute')
+        instance_topology = hw.numa_fit_instance_to_host(
+            self.host, self.instance2)
+        self.assertInstanceNUMAcellOrder([0, 1, 3], instance_topology)
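+        # With the spread strategy the order is expected to reverse: cell 2
+        # (no pinned CPUs, i.e. the most free pCPUs) comes first, followed
+        # by cell 3, which has the most free memory.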
+        CONF.set_override(
+            'packing_host_numa_cells_allocation_strategy',
+            False,
+            group='compute')
+        instance_topology = hw.numa_fit_instance_to_host(
+            self.host, self.instance2)
+        self.assertInstanceNUMAcellOrder([2, 3, 0], instance_topology)
+
+    def test_sort_no_pci_stats_shared_cpu_policy(self):
+        CONF.set_override(
+            'packing_host_numa_cells_allocation_strategy',
+            True,
+            group='compute')
+        instance_topology = hw.numa_fit_instance_to_host(
+            self.host, self.instance1)
+        self.assertInstanceNUMAcellOrder([0, 1, 2], instance_topology)
+        CONF.set_override(
+            'packing_host_numa_cells_allocation_strategy',
+            False,
+            group='compute')
+        instance_topology = hw.numa_fit_instance_to_host(
+            self.host, self.instance1)
+        self.assertInstanceNUMAcellOrder([3, 1, 2], instance_topology)
+
+    def test_sort_pci_stats_pci_req_no_shared_cpu_policy(self):
+        CONF.set_override(
+            'packing_host_numa_cells_allocation_strategy',
+            True,
+            group='compute')
+        pci_request = objects.InstancePCIRequest(count=1,
+            spec=[{'vendor_id': '8086', 'product_id': 'fake-prod0'}])
+        pci_reqs = [pci_request]
+        instance_topology = hw.numa_fit_instance_to_host(
+            self.host, self.instance2,
+            pci_requests=pci_reqs,
+            pci_stats=self.pci_stats)
+        self.assertInstanceNUMAcellOrder([1, 0, 3], instance_topology)
+        CONF.set_override(
+            'packing_host_numa_cells_allocation_strategy',
+            False,
+            group='compute')
+        instance_topology = hw.numa_fit_instance_to_host(
+            self.host, self.instance2,
+            pci_requests=pci_reqs,
+            pci_stats=self.pci_stats)
+        self.assertInstanceNUMAcellOrder([1, 2, 3], instance_topology)
+
+    def test_sort_pci_stats_pci_req_shared_cpu_policy(self):
+        CONF.set_override(
+            'packing_host_numa_cells_allocation_strategy',
+            True,
+            group='compute')
+        pci_request = objects.InstancePCIRequest(count=1,
+            spec=[{'vendor_id': '8086', 'product_id': 'fake-prod0'}])
+        pci_reqs = [pci_request]
+        instance_topology = hw.numa_fit_instance_to_host(
+            self.host, self.instance1,
+            pci_requests=pci_reqs,
+            pci_stats=self.pci_stats)
+        self.assertInstanceNUMAcellOrder([1, 0, 2], instance_topology)
+        CONF.set_override(
+            'packing_host_numa_cells_allocation_strategy',
+            False,
+            group='compute')
+        instance_topology = hw.numa_fit_instance_to_host(
+            self.host, self.instance1,
+            pci_requests=pci_reqs,
+            pci_stats=self.pci_stats)
+        self.assertInstanceNUMAcellOrder([1, 3, 2], instance_topology)
+
+    def test_sort_pci_stats_no_pci_req_no_shared_cpu_policy(self):
+        CONF.set_override(
+            'packing_host_numa_cells_allocation_strategy',
+            True,
+            group='compute')
+        instance_topology = hw.numa_fit_instance_to_host(
+            self.host, self.instance2,
+            pci_stats=self.pci_stats)
+        self.assertInstanceNUMAcellOrder([0, 3, 2], instance_topology)
+        CONF.set_override(
+            'packing_host_numa_cells_allocation_strategy',
+            False,
+            group='compute')
+        instance_topology = hw.numa_fit_instance_to_host(
+            self.host, self.instance2,
+            pci_stats=self.pci_stats)
+        self.assertInstanceNUMAcellOrder([2, 3, 0], instance_topology)
+
+    def test_sort_pci_stats_no_pci_req_shared_cpu_policy(self):
+        CONF.set_override(
+            'packing_host_numa_cells_allocation_strategy',
+            True,
+            group='compute')
+        instance_topology = hw.numa_fit_instance_to_host(
+            self.host, self.instance1,
+            pci_stats=self.pci_stats)
+        self.assertInstanceNUMAcellOrder([0, 2, 3], instance_topology)
+        CONF.set_override(
+            'packing_host_numa_cells_allocation_strategy',
+            False,
+            group='compute')
+        instance_topology = hw.numa_fit_instance_to_host(
+            self.host, self.instance1,
+            pci_stats=self.pci_stats)
+        self.assertInstanceNUMAcellOrder([3, 2, 0], instance_topology)
diff --git a/nova/virt/hardware.py b/nova/virt/hardware.py
index df4a8be3da89..c4ebae11ca0f 100644
--- a/nova/virt/hardware.py
+++ b/nova/virt/hardware.py
@@ -2295,14 +2295,67 @@ def numa_fit_instance_to_host(
 
     host_cells = host_topology.cells
 
-    # If PCI device(s) are not required, prefer host cells that don't have
-    # devices attached. Presence of a given numa_node in a PCI pool is
-    # indicative of a PCI device being associated with that node
-    if not pci_requests and pci_stats:
-        # TODO(stephenfin): pci_stats can't be None here but mypy can't figure
-        # that out for some reason
-        host_cells = sorted(host_cells, key=lambda cell: cell.id in [
-            pool['numa_node'] for pool in pci_stats.pools])  # type: ignore
+    # Sorting is only needed when the instance has fewer cells than the
+    # host. If the numbers are equal, all host cells will be used anyway
+    # and no sorting of the cells list is needed.
+    if len(host_topology) > len(instance_topology):
+        pack = CONF.compute.packing_host_numa_cells_allocation_strategy
+        # To balance NUMA cell usage based on several parameters, a number
+        # of sorts are performed on the host_cells list to move the less
+        # used cells to the beginning of the list (when pack is set to
+        # 'False'). When pack is set to 'True', the most used cells are put
+        # at the beginning of the host_cells list.
+
+        # The first sort is based on memory usage. cell.avail_memory returns
+        # the free memory of the cell. The sort is reversed to get the cells
+        # with more free memory first when pack is 'False'.
+        host_cells = sorted(
+            host_cells,
+            reverse=not pack,
+            key=lambda cell: cell.avail_memory)
+
+        # The next sort is based on available dedicated or shared CPUs.
+        # cpu_policy is set to the same value in all instance cells, so we
+        # use the first cell in the list (it exists if instance_topology is
+        # defined) to get the cpu_policy.
+        if instance_topology.cells[0].cpu_policy in (
+                None, fields.CPUAllocationPolicy.SHARED):
+            # sort based on used CPUs
+            host_cells = sorted(
+                host_cells,
+                reverse=pack,
+                key=lambda cell: cell.cpu_usage)
+
+        else:
+            # sort based on the number of free pinned CPUs
+            host_cells = sorted(
+                host_cells,
+                reverse=not pack,
+                key=lambda cell: len(cell.free_pcpus))
+
+        # Perform the PCI sort only if pci_stats exists
+        if pci_stats:
+            # Build a dict with the NUMA cell id as key and the total
+            # number of free PCI devices as value.
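+            # For example, a single pool with numa_node=1 and count=2
+            # results in {1: 2}.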
+            total_pci_in_cell: ty.Dict[int, int] = {}
+            for pool in pci_stats.pools:
+                if pool['numa_node'] in list(total_pci_in_cell):
+                    total_pci_in_cell[pool['numa_node']] += pool['count']
+                else:
+                    total_pci_in_cell[pool['numa_node']] = pool['count']
+            # For backward compatibility we always 'spread' here: host cells
+            # with PCI devices are moved to the beginning of the list if the
+            # VM requests a PCI device, and to the end of the list if it
+            # does not.
+            if pci_requests:
+                host_cells = sorted(
+                    host_cells,
+                    reverse=True,
+                    key=lambda cell: total_pci_in_cell.get(cell.id, 0))
+            else:
+                host_cells = sorted(
+                    host_cells,
+                    key=lambda cell: total_pci_in_cell.get(cell.id, 0))
 
     for host_cell_perm in itertools.permutations(
             host_cells, len(instance_topology)):
diff --git a/releasenotes/notes/extra-sorting-for-host-cells-c03e37de1e57043b.yaml b/releasenotes/notes/extra-sorting-for-host-cells-c03e37de1e57043b.yaml
new file mode 100644
index 000000000000..ea708053146b
--- /dev/null
+++ b/releasenotes/notes/extra-sorting-for-host-cells-c03e37de1e57043b.yaml
@@ -0,0 +1,10 @@
+---
+features:
+  - |
+    Extra sorting was added to the ``numa_fit_instance_to_host`` function
+    to balance the usage of the hypervisor's NUMA cells. When the
+    configuration option ``packing_host_numa_cells_allocation_strategy``
+    is set to ``False``, the hypervisor's NUMA cells with more free
+    resources (CPU, RAM, and PCI if requested) will be used first
+    (spread strategy). The default value of
+    ``packing_host_numa_cells_allocation_strategy`` is ``True``, which
+    selects the packing strategy.
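+
+    For example, to select the spread strategy, set the following in the
+    ``nova.conf`` of the compute node::
+
+      [compute]
+      packing_host_numa_cells_allocation_strategy = False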