diff --git a/doc/source/admin/cpu-topologies.rst b/doc/source/admin/cpu-topologies.rst
index 179f7bd3775d..529542d8050e 100644
--- a/doc/source/admin/cpu-topologies.rst
+++ b/doc/source/admin/cpu-topologies.rst
@@ -95,14 +95,64 @@ In all cases where NUMA awareness is used, the ``NUMATopologyFilter``
 filter must be enabled. Details on this filter are provided in
 :doc:`/admin/scheduling`.
 
+The host NUMA node(s) used by an instance are chosen according to a sorting
+logic controlled by the ``packing_host_numa_cells_allocation_strategy``
+configuration option in the ``[compute]`` section of ``nova.conf``. By
+default this option is set to ``True``, which selects the "pack" strategy:
+nova first attempts to use the host NUMA node(s) with the least amount of
+free resources, in other words the **most used** NUMA nodes. Load is packed
+onto the most used host NUMA node until it is completely exhausted, and only
+then is the most used node among the remaining host NUMA nodes chosen. The
+"spread" strategy is the reverse: the NUMA node(s) with the **most free**
+resources are used first, so the load is balanced across all host NUMA nodes
+and the amount of free resources on each node is kept as equal as possible.
+
 .. caution::
 
-   The NUMA node(s) used are normally chosen at random. However, if a PCI
-   passthrough or SR-IOV device is attached to the instance, then the NUMA
-   node that the device is associated with will be used. This can provide
-   important performance improvements. However, booting a large number of
-   similar instances can result in unbalanced NUMA node usage. Care should
-   be taken to mitigate this issue. See this `discussion`_ for more details.
+   The host NUMA nodes are placed in a list, and that list is sorted based
+   on the chosen strategy and the resources available in each NUMA node. The
+   sorts are performed on the same list one after another, so the sort
+   performed last has the highest priority.
+
+Python uses a so-called stable sort: a sort applied to the list changes the
+order of list items only if the property being sorted on differs between
+them. If that property is equal for all items, the existing order of the
+elements is preserved.
+
+The sorts are performed on the list of host NUMA nodes in the following
+order:
+
+* sort based on available memory on the node (first sort, lowest priority)
+* sort based on CPU usage (if the guest topology requests shared CPUs) or on
+  the number of free pinned CPUs otherwise
+* sort based on the number of free PCI devices on the node (last sort,
+  highest priority)
+
+The highest sorting priority therefore concerns host NUMA nodes with PCI
+devices attached. If the instance requests PCI device(s), the logic
+**always** puts the host NUMA nodes with more PCI devices at the beginning
+of the list. If no PCI devices are requested, the NUMA nodes with no (or
+fewer) available PCI devices are placed at the beginning of the list.
+
+.. caution::
+
+   The described logic for PCI devices is used for **both** the "pack" and
+   the "spread" strategies. This is done to keep backward compatibility with
+   previous nova versions.
+
+With the "pack" strategy, the remaining two sorts order the list so that
+NUMA nodes with more available resources (CPUs and memory) are moved to the
+**end** of the list. The sort based on memory is performed first and has the
+lowest priority.
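+
+The following simplified sketch (illustrative only, not the actual nova
+code) shows how the chained stable sorts produce the final ordering. Each
+host NUMA node is reduced to a ``(free_memory, free_pcpus, free_pci)``
+tuple:
+
+.. code-block:: python
+
+   # Each tuple stands for one host NUMA node:
+   # (free memory in MiB, free dedicated pCPUs, free PCI devices).
+   cells = [(2048, 2, 0), (2048, 4, 0), (3072, 2, 0), (2048, 2, 2)]
+
+   pack = True            # packing_host_numa_cells_allocation_strategy
+   pci_requested = False  # does the instance request a PCI device?
+
+   # First (lowest priority) sort: memory. With "pack" the nodes with the
+   # least free memory come first; with "spread" the order is reversed.
+   cells = sorted(cells, key=lambda c: c[0], reverse=not pack)
+   # Second sort: free dedicated CPUs (the pinned-CPU case); the shared-CPU
+   # case sorts on CPU usage instead, following the same pack/spread rule.
+   cells = sorted(cells, key=lambda c: c[1], reverse=not pack)
+   # Last (highest priority) sort: PCI devices. Regardless of the strategy,
+   # nodes with PCI devices go first only when a PCI device is requested.
+   cells = sorted(cells, key=lambda c: c[2], reverse=pci_requested)
+
+   # Each sort is stable, so the earlier orderings survive as tie-breakers
+   # among nodes that are equal on the later criteria.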
+
+With the "spread" strategy, the remaining two sorts order the list so that
+NUMA nodes with more available resources (CPUs and memory) are moved to the
+**beginning** of the list. Again, the sort based on memory is performed
+first and has the lowest priority.
+
+Finally, the resulting list (after all sorts) is passed on, and attempts to
+place the instance's NUMA cells on host NUMA nodes are made starting from
+the first host NUMA node in the list.
 
 .. caution::
 
@@ -724,5 +774,4 @@ instances with a NUMA topology.
 
 .. Links
 .. _`Image metadata`: https://docs.openstack.org/image-guide/introduction.html#image-metadata
-.. _`discussion`: http://lists.openstack.org/pipermail/openstack-dev/2016-March/090367.html
 .. _`MTTCG project`: http://wiki.qemu.org/Features/tcg-multithread
diff --git a/nova/conf/compute.py b/nova/conf/compute.py
index 5cf8c31714d1..263d77758695 100644
--- a/nova/conf/compute.py
+++ b/nova/conf/compute.py
@@ -1007,6 +1007,23 @@ Related options:
 
 * ``[scheduler]query_placement_for_image_type_support`` - enables
   filtering computes based on supported image types, which is required
  to be enabled for this to take effect.
+"""),
+    cfg.BoolOpt('packing_host_numa_cells_allocation_strategy',
+                default=True,
+                help="""
+This option controls the allocation strategy used to choose the host NUMA
+cells for placing a VM's NUMA cells (for VMs with a defined NUMA topology).
+By default, the host NUMA cell with the most resources already consumed is
+chosen first for a placement attempt, so a host cell that is already in use
+is packed with the VM's cells until it is completely exhausted, before a new
+free host cell is used. When ``packing_host_numa_cells_allocation_strategy``
+is set to ``False``, the host NUMA cell with the most resources available is
+used first.
+
+Possible values:
+
+* ``True``: Pack the VM's NUMA cells onto the most used host NUMA cells.
+* ``False``: Spread the VM's NUMA cells across the host NUMA cells with the
+  most resources available.
"""), ] diff --git a/nova/tests/unit/virt/test_hardware.py b/nova/tests/unit/virt/test_hardware.py index 80d0133c20be..b7e004f93f67 100644 --- a/nova/tests/unit/virt/test_hardware.py +++ b/nova/tests/unit/virt/test_hardware.py @@ -19,6 +19,7 @@ import ddt import mock import testtools +import nova.conf from nova import exception from nova import objects from nova.objects import fields @@ -28,6 +29,8 @@ from nova.tests.unit import fake_pci_device_pools as fake_pci from nova.tests.unit.image.fake import fake_image_obj from nova.virt import hardware as hw +CONF = nova.conf.CONF + class InstanceInfoTests(test.NoDBTestCase): @@ -2753,7 +2756,7 @@ class VirtNUMAHostTopologyTestCase(test.NoDBTestCase): # the PCI device is found on host cell 1 pci_stats = _create_pci_stats(1) - # ...threfore an instance without a PCI device should get host cell 2 + # ...therefore an instance without a PCI device should get host cell 2 instance_topology = hw.numa_fit_instance_to_host( self.host, self.instance1, pci_stats=pci_stats) self.assertIsInstance(instance_topology, objects.InstanceNUMATopology) @@ -2763,7 +2766,7 @@ class VirtNUMAHostTopologyTestCase(test.NoDBTestCase): # the PCI device is now found on host cell 2 pci_stats = _create_pci_stats(2) - # ...threfore an instance without a PCI device should get host cell 1 + # ...therefore an instance without a PCI device should get host cell 1 instance_topology = hw.numa_fit_instance_to_host( self.host, self.instance1, pci_stats=pci_stats) self.assertIsInstance(instance_topology, objects.InstanceNUMATopology) @@ -5664,3 +5667,243 @@ class RescuePropertyTestCase(test.NoDBTestCase): meta = objects.ImageMeta.from_dict({'disk_format': 'raw'}) meta.properties = objects.ImageMetaProps.from_dict(props) self.assertEqual(expected, hw.check_hw_rescue_props(meta)) + + +class HostCellsSortingTestCase(test.NoDBTestCase): + # NOTE (IPO) It is possible to test all sorting cases with one defined + # host NUMA topo. + # We have 4 NUMA cells with the following properties: + # NUMA cell 0: have most cpu usage + # NUMA cell 1: will have most PCI available + # NUMA cell 2: have most free pcpus + # NUMA cell 3: have most available memory + # So it will be enough to check order of NUMA cell in resulting instance + # topo to check particular sorting case. 
+
+    def setUp(self):
+        super(HostCellsSortingTestCase, self).setUp()
+
+        def _create_pci_stats(node, count):
+            test_dict = copy.copy(fake_pci.fake_pool_dict)
+            test_dict['numa_node'] = node
+            test_dict['vendor_id'] = '8086'
+            test_dict['product_id'] = 'fake-prod0'
+            test_dict['count'] = count
+            return stats.PciDeviceStats(
+                objects.NUMATopology(),
+                [objects.PciDevicePool.from_dict(test_dict)])
+
+        self.pci_stats = _create_pci_stats(1, 2)
+
+        self.host = objects.NUMATopology(cells=[
+            objects.NUMACell(
+                id=0,
+                cpuset=set([1, 2, 3, 4]),
+                pcpuset=set([1, 2, 3, 4]),
+                memory=4096,
+                cpu_usage=3,
+                memory_usage=2048,
+                pinned_cpus=set([1, 2]),
+                mempages=[objects.NUMAPagesTopology(
+                    size_kb=4, total=524288, used=0)],
+                siblings=[set([1]), set([2]), set([3]), set([4])]),
+            objects.NUMACell(
+                id=1,
+                cpuset=set([5, 6, 7, 8]),
+                pcpuset=set([5, 6, 7, 8]),
+                memory=4096,
+                cpu_usage=2,
+                memory_usage=2048,
+                pinned_cpus=set([5, 6]),
+                mempages=[objects.NUMAPagesTopology(
+                    size_kb=4, total=524288, used=0)],
+                siblings=[set([5]), set([6]), set([7]), set([8])]),
+            objects.NUMACell(
+                id=2,
+                cpuset=set([9, 10, 11, 12]),
+                pcpuset=set([9, 10, 11, 12]),
+                memory=4096,
+                cpu_usage=2,
+                memory_usage=2048,
+                pinned_cpus=set(),
+                mempages=[objects.NUMAPagesTopology(
+                    size_kb=4, total=524288, used=0)],
+                siblings=[set([9]), set([10]), set([11]), set([12])]),
+            objects.NUMACell(
+                id=3,
+                cpuset=set([13, 14, 15, 16]),
+                pcpuset=set([13, 14, 15, 16]),
+                memory=4096,
+                cpu_usage=2,
+                memory_usage=1024,
+                pinned_cpus=set([13, 14]),
+                mempages=[objects.NUMAPagesTopology(
+                    size_kb=4, total=524288, used=0)],
+                siblings=[set([13]), set([14]), set([15]), set([16])])
+        ])
+
+        self.instance0 = objects.InstanceNUMATopology(cells=[
+            objects.InstanceNUMACell(
+                id=0, cpuset=set([0]), pcpuset=set(), memory=2048),
+            objects.InstanceNUMACell(
+                id=1, cpuset=set([1]), pcpuset=set(), memory=2048),
+            objects.InstanceNUMACell(
+                id=2, cpuset=set([2]), pcpuset=set(), memory=2048),
+            objects.InstanceNUMACell(
+                id=3, cpuset=set([3]), pcpuset=set(), memory=2048)
+        ])
+
+        self.instance1 = objects.InstanceNUMATopology(cells=[
+            objects.InstanceNUMACell(
+                id=0, cpuset=set([0]), pcpuset=set(), memory=2048),
+            objects.InstanceNUMACell(
+                id=1, cpuset=set([1]), pcpuset=set(), memory=2048),
+            objects.InstanceNUMACell(
+                id=2, cpuset=set([2]), pcpuset=set(), memory=2048),
+        ])
+
+        self.instance2 = objects.InstanceNUMATopology(cells=[
+            objects.InstanceNUMACell(
+                id=0,
+                cpuset=set(),
+                pcpuset=set([0]),
+                memory=2048,
+                cpu_policy=fields.CPUAllocationPolicy.DEDICATED
+            ),
+            objects.InstanceNUMACell(
+                id=1,
+                cpuset=set(),
+                pcpuset=set([1]),
+                memory=2048,
+                cpu_policy=fields.CPUAllocationPolicy.DEDICATED
+            ),
+            objects.InstanceNUMACell(
+                id=2,
+                cpuset=set(),
+                pcpuset=set([2]),
+                memory=2048,
+                cpu_policy=fields.CPUAllocationPolicy.DEDICATED
+            )])
+
+    def assertInstanceNUMAcellOrder(self, list_to_check, instance_topo):
+        for cell, id in zip(instance_topo.cells, list_to_check):
+            self.assertEqual(cell.id, id)
+
+    def test_sort_host_numa_cell_num_equal_instance_cell_num(self):
+        instance_topology = hw.numa_fit_instance_to_host(
+            self.host, self.instance0)
+        self.assertInstanceNUMAcellOrder([0, 1, 2, 3], instance_topology)
+
+    def test_sort_no_pci_stats_no_shared_cpu_policy(self):
+        CONF.set_override(
+            'packing_host_numa_cells_allocation_strategy',
+            True,
+            group='compute')
+        instance_topology = hw.numa_fit_instance_to_host(
+            self.host, self.instance2)
+        self.assertInstanceNUMAcellOrder([0, 1, 3], instance_topology)
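+        # With the spread strategy the order is expected to reverse: cell 2
+        # (no pinned CPUs, i.e. the most free pCPUs) comes first, followed
+        # by cell 3, which has the most free memory.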
+        CONF.set_override(
+            'packing_host_numa_cells_allocation_strategy',
+            False,
+            group='compute')
+        instance_topology = hw.numa_fit_instance_to_host(
+            self.host, self.instance2)
+        self.assertInstanceNUMAcellOrder([2, 3, 0], instance_topology)
+
+    def test_sort_no_pci_stats_shared_cpu_policy(self):
+        CONF.set_override(
+            'packing_host_numa_cells_allocation_strategy',
+            True,
+            group='compute')
+        instance_topology = hw.numa_fit_instance_to_host(
+            self.host, self.instance1)
+        self.assertInstanceNUMAcellOrder([0, 1, 2], instance_topology)
+        CONF.set_override(
+            'packing_host_numa_cells_allocation_strategy',
+            False,
+            group='compute')
+        instance_topology = hw.numa_fit_instance_to_host(
+            self.host, self.instance1)
+        self.assertInstanceNUMAcellOrder([3, 1, 2], instance_topology)
+
+    def test_sort_pci_stats_pci_req_no_shared_cpu_policy(self):
+        CONF.set_override(
+            'packing_host_numa_cells_allocation_strategy',
+            True,
+            group='compute')
+        pci_request = objects.InstancePCIRequest(count=1,
+            spec=[{'vendor_id': '8086', 'product_id': 'fake-prod0'}])
+        pci_reqs = [pci_request]
+        instance_topology = hw.numa_fit_instance_to_host(
+            self.host, self.instance2,
+            pci_requests=pci_reqs,
+            pci_stats=self.pci_stats)
+        self.assertInstanceNUMAcellOrder([1, 0, 3], instance_topology)
+        CONF.set_override(
+            'packing_host_numa_cells_allocation_strategy',
+            False,
+            group='compute')
+        instance_topology = hw.numa_fit_instance_to_host(
+            self.host, self.instance2,
+            pci_requests=pci_reqs,
+            pci_stats=self.pci_stats)
+        self.assertInstanceNUMAcellOrder([1, 2, 3], instance_topology)
+
+    def test_sort_pci_stats_pci_req_shared_cpu_policy(self):
+        CONF.set_override(
+            'packing_host_numa_cells_allocation_strategy',
+            True,
+            group='compute')
+        pci_request = objects.InstancePCIRequest(count=1,
+            spec=[{'vendor_id': '8086', 'product_id': 'fake-prod0'}])
+        pci_reqs = [pci_request]
+        instance_topology = hw.numa_fit_instance_to_host(
+            self.host, self.instance1,
+            pci_requests=pci_reqs,
+            pci_stats=self.pci_stats)
+        self.assertInstanceNUMAcellOrder([1, 0, 2], instance_topology)
+        CONF.set_override(
+            'packing_host_numa_cells_allocation_strategy',
+            False,
+            group='compute')
+        instance_topology = hw.numa_fit_instance_to_host(
+            self.host, self.instance1,
+            pci_requests=pci_reqs,
+            pci_stats=self.pci_stats)
+        self.assertInstanceNUMAcellOrder([1, 3, 2], instance_topology)
+
+    def test_sort_pci_stats_no_pci_req_no_shared_cpu_policy(self):
+        CONF.set_override(
+            'packing_host_numa_cells_allocation_strategy',
+            True,
+            group='compute')
+        instance_topology = hw.numa_fit_instance_to_host(
+            self.host, self.instance2,
+            pci_stats=self.pci_stats)
+        self.assertInstanceNUMAcellOrder([0, 3, 2], instance_topology)
+        CONF.set_override(
+            'packing_host_numa_cells_allocation_strategy',
+            False,
+            group='compute')
+        instance_topology = hw.numa_fit_instance_to_host(
+            self.host, self.instance2,
+            pci_stats=self.pci_stats)
+        self.assertInstanceNUMAcellOrder([2, 3, 0], instance_topology)
+
+    def test_sort_pci_stats_no_pci_req_shared_cpu_policy(self):
+        CONF.set_override(
+            'packing_host_numa_cells_allocation_strategy',
+            True,
+            group='compute')
+        instance_topology = hw.numa_fit_instance_to_host(
+            self.host, self.instance1,
+            pci_stats=self.pci_stats)
+        self.assertInstanceNUMAcellOrder([0, 2, 3], instance_topology)
+        CONF.set_override(
+            'packing_host_numa_cells_allocation_strategy',
+            False,
+            group='compute')
+        instance_topology = hw.numa_fit_instance_to_host(
+            self.host, self.instance1,
+            pci_stats=self.pci_stats)
+        self.assertInstanceNUMAcellOrder([3, 2, 0], instance_topology)
diff --git a/nova/virt/hardware.py b/nova/virt/hardware.py
index df4a8be3da89..c4ebae11ca0f 100644
--- a/nova/virt/hardware.py
+++ b/nova/virt/hardware.py
@@ -2295,14 +2295,67 @@ def numa_fit_instance_to_host(
 
     host_cells = host_topology.cells
 
-    # If PCI device(s) are not required, prefer host cells that don't have
-    # devices attached. Presence of a given numa_node in a PCI pool is
-    # indicative of a PCI device being associated with that node
-    if not pci_requests and pci_stats:
-        # TODO(stephenfin): pci_stats can't be None here but mypy can't figure
-        # that out for some reason
-        host_cells = sorted(host_cells, key=lambda cell: cell.id in [
-            pool['numa_node'] for pool in pci_stats.pools])  # type: ignore
+    # Sorting is only needed when the instance has fewer cells than the
+    # host. If the numbers are equal, all host cells will be used anyway
+    # and no sorting of the cells list is needed.
+    if len(host_topology) > len(instance_topology):
+        pack = CONF.compute.packing_host_numa_cells_allocation_strategy
+        # To balance NUMA cell usage based on several parameters, a number
+        # of sorts are performed on the host_cells list to move the less
+        # used cells to the beginning of the list (when pack is set to
+        # 'False'). When pack is set to 'True', the most used cells are put
+        # at the beginning of the host_cells list.
+
+        # The first sort is based on memory usage. cell.avail_memory returns
+        # the free memory of the cell. The sort is reversed to get the cells
+        # with more free memory first when pack is 'False'.
+        host_cells = sorted(
+            host_cells,
+            reverse=not pack,
+            key=lambda cell: cell.avail_memory)
+
+        # The next sort is based on available dedicated or shared CPUs.
+        # cpu_policy is set to the same value in all instance cells, so we
+        # use the first cell in the list (it exists if instance_topology is
+        # defined) to get the cpu_policy.
+        if instance_topology.cells[0].cpu_policy in (
+                None, fields.CPUAllocationPolicy.SHARED):
+            # sort based on used CPUs
+            host_cells = sorted(
+                host_cells,
+                reverse=pack,
+                key=lambda cell: cell.cpu_usage)
+
+        else:
+            # sort based on the number of free pinned CPUs
+            host_cells = sorted(
+                host_cells,
+                reverse=not pack,
+                key=lambda cell: len(cell.free_pcpus))
+
+        # Perform the PCI sort only if pci_stats exists
+        if pci_stats:
+            # Build a dict with the NUMA cell id as key and the total
+            # number of free PCI devices as value.
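+            # For example, a single pool with numa_node=1 and count=2
+            # results in {1: 2}.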
+            total_pci_in_cell: ty.Dict[int, int] = {}
+            for pool in pci_stats.pools:
+                if pool['numa_node'] in list(total_pci_in_cell):
+                    total_pci_in_cell[pool['numa_node']] += pool['count']
+                else:
+                    total_pci_in_cell[pool['numa_node']] = pool['count']
+            # For backward compatibility we always 'spread' here: host cells
+            # with PCI devices are moved to the beginning of the list if the
+            # VM requests a PCI device, and to the end of the list if it
+            # does not.
+            if pci_requests:
+                host_cells = sorted(
+                    host_cells,
+                    reverse=True,
+                    key=lambda cell: total_pci_in_cell.get(cell.id, 0))
+            else:
+                host_cells = sorted(
+                    host_cells,
+                    key=lambda cell: total_pci_in_cell.get(cell.id, 0))
 
     for host_cell_perm in itertools.permutations(
             host_cells, len(instance_topology)):
diff --git a/releasenotes/notes/extra-sorting-for-host-cells-c03e37de1e57043b.yaml b/releasenotes/notes/extra-sorting-for-host-cells-c03e37de1e57043b.yaml
new file mode 100644
index 000000000000..ea708053146b
--- /dev/null
+++ b/releasenotes/notes/extra-sorting-for-host-cells-c03e37de1e57043b.yaml
@@ -0,0 +1,10 @@
+---
+features:
+  - |
+    Extra sorting was added to the ``numa_fit_instance_to_host`` function
+    to balance the usage of the hypervisor's NUMA cells. When the
+    configuration option ``packing_host_numa_cells_allocation_strategy``
+    is set to ``False``, the hypervisor's NUMA cells with more free
+    resources (CPU, RAM, and PCI if requested) will be used first
+    (spread strategy). The default value of
+    ``packing_host_numa_cells_allocation_strategy`` is ``True``, which
+    selects the packing strategy.
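+
+    For example, to select the spread strategy, set the following in the
+    ``nova.conf`` of the compute node::
+
+      [compute]
+      packing_host_numa_cells_allocation_strategy = False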