Merge "fix cleaning up evacuated instances"

commit e11a8687ae
Zuul, 2017-10-17 18:08:14 +00:00 (committed by Gerrit Code Review)
3 changed files with 157 additions and 42 deletions


@@ -653,6 +653,12 @@ class ComputeManager(manager.Manager):
         local_instances = self._get_instances_on_driver(context)
         evacuated = [inst for inst in local_instances
                      if inst.uuid in evacuations]
+
+        # NOTE(gibi): We are called from init_host and at this point the
+        # compute_nodes of the resource tracker has not been populated yet so
+        # we cannot rely on the resource tracker here.
+        compute_nodes = {}
+
         for instance in evacuated:
             migration = evacuations[instance.uuid]
             LOG.info('Deleting instance as it has been evacuated from '
@@ -676,9 +682,28 @@ class ComputeManager(manager.Manager):
                                 network_info,
                                 bdi, destroy_disks)
 
-            rt = self._get_resource_tracker()
-            rt.delete_allocation_for_evacuated_instance(
-                instance, migration.source_node)
+            # delete the allocation of the evacuated instance from this host
+            if migration.source_node not in compute_nodes:
+                try:
+                    cn_uuid = objects.ComputeNode.get_by_host_and_nodename(
+                        context, self.host, migration.source_node).uuid
+                    compute_nodes[migration.source_node] = cn_uuid
+                except exception.ComputeHostNotFound:
+                    LOG.error("Failed to clean allocation of evacuated "
+                              "instance as the source node %s is not found",
+                              migration.source_node, instance=instance)
+                    continue
+            cn_uuid = compute_nodes[migration.source_node]
+
+            my_resources = scheduler_utils.resources_from_flavor(
+                instance, instance.flavor)
+            res = self.reportclient.remove_provider_from_instance_allocation(
+                instance.uuid, cn_uuid, instance.user_id,
+                instance.project_id, my_resources)
+            if not res:
+                LOG.error("Failed to clean allocation of evacuated instance "
+                          "on the source node %s",
+                          cn_uuid, instance=instance)
 
             migration.status = 'completed'
             migration.save()
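For readers who want the new flow outside of diff context, here is a minimal, self-contained sketch of the cleanup loop added above. It is illustrative only: cleanup_evacuated_allocations, get_node_uuid, and the injected reportclient/log callables are hypothetical stand-ins, not Nova APIs; only remove_provider_from_instance_allocation and its argument order come from the change itself.

class ComputeHostNotFound(Exception):
    """Stand-in for nova.exception.ComputeHostNotFound."""


def cleanup_evacuated_allocations(evacuated, evacuations, get_node_uuid,
                                  resources_from_flavor, reportclient, log):
    # Cache nodename -> compute node UUID lookups; the resource tracker
    # cannot be used here because this runs from init_host before it has
    # been populated.
    compute_nodes = {}
    for instance in evacuated:
        migration = evacuations[instance.uuid]
        if migration.source_node not in compute_nodes:
            try:
                compute_nodes[migration.source_node] = get_node_uuid(
                    migration.source_node)
            except ComputeHostNotFound:
                # The source compute node record is gone; skip this instance.
                log('source node %s not found' % migration.source_node)
                continue
        cn_uuid = compute_nodes[migration.source_node]
        resources = resources_from_flavor(instance, instance.flavor)
        removed = reportclient.remove_provider_from_instance_allocation(
            instance.uuid, cn_uuid, instance.user_id,
            instance.project_id, resources)
        if not removed:
            log('failed to clean allocation on %s' % cn_uuid)
        migration.status = 'completed'
        migration.save()

Note that on ComputeHostNotFound the continue also skips marking that instance's migration as completed, matching the behaviour of the hunk above.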


@@ -1670,44 +1670,24 @@ class ServerMovingTests(ProviderUsageBaseTestCase):
         self.assertFlavorMatchesAllocation(self.flavor1, dest_allocation)
 
         # start up the source compute
-        # NOTE(gibi): This is bug 1721652. The restart should go through
-        # without exception.
-        self.assertRaises(KeyError, self._restart_compute_service,
-                          self.compute1)
+        self._restart_compute_service(self.compute1)
         self.admin_api.put_service(
             source_compute_id, {'forced_down': 'false'})
 
-        # NOTE(gibi): This is bug 1721652. After the source host is started up
-        # the allocation should be cleaned.
         source_usages = self._get_provider_usages(source_rp_uuid)
-        self.assertFlavorMatchesAllocation(self.flavor1, source_usages)
+        self.assertEqual({'VCPU': 0,
+                          'MEMORY_MB': 0,
+                          'DISK_GB': 0},
+                         source_usages)
 
         dest_usages = self._get_provider_usages(dest_rp_uuid)
         self.assertFlavorMatchesAllocation(self.flavor1, dest_usages)
 
         allocations = self._get_allocations_by_server_uuid(server['id'])
-        self.assertEqual(2, len(allocations))
+        self.assertEqual(1, len(allocations))
         dest_allocation = allocations[dest_rp_uuid]['resources']
         self.assertFlavorMatchesAllocation(self.flavor1, dest_allocation)
-        source_allocation = allocations[source_rp_uuid]['resources']
-        self.assertFlavorMatchesAllocation(self.flavor1, source_allocation)
-
-        # NOTE(gibi): Once the bug 1721652 is fixed this should be the expected
-        # behavior
-        # source_usages = self._get_provider_usages(source_rp_uuid)
-        # self.assertEqual({'VCPU': 0,
-        #                   'MEMORY_MB': 0,
-        #                   'DISK_GB': 0},
-        #                  source_usages)
-        #
-        # dest_usages = self._get_provider_usages(dest_rp_uuid)
-        # self.assertFlavorMatchesAllocation(self.flavor1, dest_usages)
-        #
-        # allocations = self._get_allocations_by_server_uuid(server['id'])
-        # self.assertEqual(1, len(allocations))
-        # dest_allocation = allocations[dest_rp_uuid]['resources']
-        # self.assertFlavorMatchesAllocation(self.flavor1, dest_allocation)
 
         self._delete_and_check_allocations(
             server, source_rp_uuid, dest_rp_uuid)
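The functional test above drives everything through Nova's in-tree helpers (_get_provider_usages, _get_allocations_by_server_uuid). Purely as an illustration of what those assertions amount to, the same state can be inspected against the placement HTTP API roughly as sketched below; the endpoint URL, token, and microversion header value are assumptions, not something this change defines.

import requests

PLACEMENT = 'http://placement.example.test'            # assumed endpoint
HEADERS = {'X-Auth-Token': 'ADMIN_TOKEN',              # assumed credentials
           'OpenStack-API-Version': 'placement 1.10'}  # assumed microversion


def provider_usages(rp_uuid):
    # GET /resource_providers/{uuid}/usages returns the summed usage per
    # resource class on that provider.
    url = '%s/resource_providers/%s/usages' % (PLACEMENT, rp_uuid)
    return requests.get(url, headers=HEADERS).json()['usages']


def server_allocations(consumer_uuid):
    # GET /allocations/{consumer_uuid} returns one entry per resource
    # provider the consumer still holds allocations against.
    url = '%s/allocations/%s' % (PLACEMENT, consumer_uuid)
    return requests.get(url, headers=HEADERS).json()['allocations']

With the fix applied, provider_usages(source_rp_uuid) should report zeroes and server_allocations(server_id) should list only the destination provider, which is exactly what the rewritten assertions encode.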


@@ -650,8 +650,8 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
         self.compute.init_virt_events()
         self.assertFalse(mock_register.called)
 
-    @mock.patch('nova.compute.resource_tracker.ResourceTracker.'
-                'delete_allocation_for_evacuated_instance')
+    @mock.patch('nova.objects.ComputeNode.get_by_host_and_nodename')
+    @mock.patch('nova.scheduler.utils.resources_from_flavor')
     @mock.patch.object(manager.ComputeManager, '_get_instances_on_driver')
     @mock.patch.object(manager.ComputeManager, 'init_virt_events')
     @mock.patch.object(context, 'get_admin_context')
@@ -663,7 +663,8 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
     @mock.patch('nova.objects.Migration.save')
     def test_init_host_with_evacuated_instance(self, mock_save, mock_mig_get,
             mock_temp_mut, mock_init_host, mock_destroy, mock_host_get,
-            mock_admin_ctxt, mock_init_virt, mock_get_inst, mock_delete_alloc):
+            mock_admin_ctxt, mock_init_virt, mock_get_inst, mock_resources,
+            mock_get_node):
         our_host = self.compute.host
         not_our_host = 'not-' + our_host
 
@@ -674,15 +675,29 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
         mock_mig_get.return_value = [migration]
         mock_admin_ctxt.return_value = self.context
         mock_host_get.return_value = objects.InstanceList()
 
+        our_node = objects.ComputeNode(host=our_host, uuid=uuids.our_node_uuid)
+        mock_get_node.return_value = our_node
+        mock_resources.return_value = mock.sentinel.my_resources
+
         # simulate failed instance
         mock_get_inst.return_value = [deleted_instance]
-        with mock.patch.object(
+        with test.nested(
+            mock.patch.object(
                 self.compute.network_api, 'get_instance_nw_info',
                 side_effect = exception.InstanceNotFound(
-                    instance_id=deleted_instance['uuid'])) as mock_get_net:
+                    instance_id=deleted_instance['uuid'])),
+            mock.patch.object(
+                self.compute.reportclient,
+                'remove_provider_from_instance_allocation')
+        ) as (mock_get_net, mock_remove_allocation):
             self.compute.init_host()
 
+            mock_remove_allocation.assert_called_once_with(
+                deleted_instance.uuid, uuids.our_node_uuid,
+                deleted_instance.user_id, deleted_instance.project_id,
+                mock.sentinel.my_resources)
+
         mock_init_host.assert_called_once_with(host=our_host)
         mock_host_get.assert_called_once_with(self.context, our_host,
             expected_attrs=['info_cache', 'metadata'])
@@ -696,8 +711,6 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
         mock_destroy.assert_called_once_with(self.context, deleted_instance,
                                              mock.ANY, mock.ANY, mock.ANY)
         mock_save.assert_called_once_with()
-        mock_delete_alloc.assert_called_once_with(
-            deleted_instance, migration.source_node)
 
     def test_init_instance_with_binding_failed_vif_type(self):
         # this instance will plug a 'binding_failed' vif
@@ -3334,16 +3347,21 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
 
     def test_destroy_evacuated_instances(self):
         our_host = self.compute.host
-        instance_1 = objects.Instance(self.context)
+        flavor = objects.Flavor()
+        instance_1 = objects.Instance(self.context, flavor=flavor)
         instance_1.uuid = uuids.instance_1
         instance_1.task_state = None
         instance_1.vm_state = vm_states.ACTIVE
         instance_1.host = 'not-' + our_host
-        instance_2 = objects.Instance(self.context)
+        instance_1.user_id = uuids.user_id
+        instance_1.project_id = uuids.project_id
+        instance_2 = objects.Instance(self.context, flavor=flavor)
         instance_2.uuid = uuids.instance_2
         instance_2.task_state = None
         instance_2.vm_state = vm_states.ACTIVE
         instance_2.host = 'not-' + our_host
+        instance_2.user_id = uuids.user_id
+        instance_2.project_id = uuids.project_id
 
         # Only instance 2 has a migration record
         migration = objects.Migration(instance_uuid=instance_2.uuid)
@@ -3351,6 +3369,9 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
         migration.status = 'done'
         migration.source_node = 'fake-node'
 
+        our_node = objects.ComputeNode(
+            host=our_host, uuid=uuids.our_node_uuid)
+
         with test.nested(
             mock.patch.object(self.compute, '_get_instances_on_driver',
                               return_value=[instance_1,
@@ -3364,19 +3385,108 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
             mock.patch.object(self.compute.driver, 'destroy'),
             mock.patch('nova.objects.MigrationList.get_by_filters'),
             mock.patch('nova.objects.Migration.save'),
-            mock.patch('nova.compute.resource_tracker.ResourceTracker.'
-                       'delete_allocation_for_evacuated_instance')
+            mock.patch('nova.objects.ComputeNode.get_by_host_and_nodename'),
+            mock.patch('nova.scheduler.utils.resources_from_flavor'),
+            mock.patch.object(self.compute.reportclient,
+                              'remove_provider_from_instance_allocation')
         ) as (_get_instances_on_driver, get_instance_nw_info,
               _get_instance_block_device_info, _is_instance_storage_shared,
-              destroy, migration_list, migration_save, remove_allocation):
+              destroy, migration_list, migration_save, get_node,
+              get_resources, remove_allocation):
             migration_list.return_value = [migration]
+            get_node.return_value = our_node
+            get_resources.return_value = mock.sentinel.resources
+
             self.compute._destroy_evacuated_instances(self.context)
+
             # Only instance 2 should be deleted. Instance 1 is still running
             # here, but no migration from our host exists, so ignore it
             destroy.assert_called_once_with(self.context, instance_2, None,
                                             {}, True)
+            get_node.assert_called_once_with(
+                self.context, our_host, migration.source_node)
             remove_allocation.assert_called_once_with(
-                instance_2, migration.source_node)
+                instance_2.uuid, uuids.our_node_uuid, uuids.user_id,
+                uuids.project_id, mock.sentinel.resources)
+
+    def test_destroy_evacuated_instances_node_deleted(self):
+        our_host = self.compute.host
+        flavor = objects.Flavor()
+        instance_1 = objects.Instance(self.context, flavor=flavor)
+        instance_1.uuid = uuids.instance_1
+        instance_1.task_state = None
+        instance_1.vm_state = vm_states.ACTIVE
+        instance_1.host = 'not-' + our_host
+        instance_1.user_id = uuids.user_id
+        instance_1.project_id = uuids.project_id
+        instance_2 = objects.Instance(self.context, flavor=flavor)
+        instance_2.uuid = uuids.instance_2
+        instance_2.task_state = None
+        instance_2.vm_state = vm_states.ACTIVE
+        instance_2.host = 'not-' + our_host
+        instance_2.user_id = uuids.user_id
+        instance_2.project_id = uuids.project_id
+
+        migration_1 = objects.Migration(instance_uuid=instance_1.uuid)
+        # Consider the migration successful but the node was deleted while the
+        # compute was down
+        migration_1.status = 'done'
+        migration_1.source_node = 'deleted-node'
+
+        migration_2 = objects.Migration(instance_uuid=instance_2.uuid)
+        # Consider the migration successful
+        migration_2.status = 'done'
+        migration_2.source_node = 'fake-node'
+
+        our_node = objects.ComputeNode(
+            host=our_host, uuid=uuids.our_node_uuid)
+
+        with test.nested(
+            mock.patch.object(self.compute, '_get_instances_on_driver',
+                              return_value=[instance_1,
+                                            instance_2]),
+            mock.patch.object(self.compute.network_api, 'get_instance_nw_info',
+                              return_value=None),
+            mock.patch.object(self.compute, '_get_instance_block_device_info',
+                              return_value={}),
+            mock.patch.object(self.compute, '_is_instance_storage_shared',
+                              return_value=False),
+            mock.patch.object(self.compute.driver, 'destroy'),
+            mock.patch('nova.objects.MigrationList.get_by_filters'),
+            mock.patch('nova.objects.Migration.save'),
+            mock.patch('nova.objects.ComputeNode.get_by_host_and_nodename'),
+            mock.patch('nova.scheduler.utils.resources_from_flavor'),
+            mock.patch.object(self.compute.reportclient,
+                              'remove_provider_from_instance_allocation')
+        ) as (_get_instances_on_driver, get_instance_nw_info,
+              _get_instance_block_device_info, _is_instance_storage_shared,
+              destroy, migration_list, migration_save, get_node,
+              get_resources, remove_allocation):
+            migration_list.return_value = [migration_1, migration_2]
+
+            def fake_get_node(context, host, node):
+                if node == 'fake-node':
+                    return our_node
+                else:
+                    raise exception.ComputeHostNotFound(host=host)
+
+            get_node.side_effect = fake_get_node
+            get_resources.return_value = mock.sentinel.resources
+
+            self.compute._destroy_evacuated_instances(self.context)
+
+            # both instance_1 and instance_2 are destroyed in the driver
+            destroy.assert_has_calls(
+                [mock.call(self.context, instance_1, None, {}, True),
+                 mock.call(self.context, instance_2, None, {}, True)])
+
+            # but only instance_2 is deallocated as the compute node for
+            # instance_1 is already deleted
+            remove_allocation.assert_called_once_with(
+                instance_2.uuid, uuids.our_node_uuid, uuids.user_id,
+                uuids.project_id, mock.sentinel.resources)
+
+        self.assertEqual(2, get_node.call_count)
 
     @mock.patch('nova.compute.manager.ComputeManager.'
                 '_destroy_evacuated_instances')