From 91455a522738572def003dfe5e4f0bec08255074 Mon Sep 17 00:00:00 2001
From: Matt Riedemann <mriedem.os@gmail.com>
Date: Fri, 28 Sep 2018 10:58:48 -0400
Subject: [PATCH] Add functional regression test for bug 1794996

The _destroy_evacuated_instances method on compute
startup tries to cleanup guests from the hypervisor
and allocations held against that compute node resource
provider by evacuated instances, but doesn't take into
account that those evacuated instances could have been
deleted in the meantime which leads to a lazy-load
InstanceNotFound error that kills the startup of the
compute service.

This change adds a functional regression test to recreate
the bug. A subsequent change with the fix will update
the test to show the bug is fixed.

Note that assertFlavorMatchesAllocation and
_boot_and_check_allocations are redefined in the test
class because If6aa37d9b6b48791e070799ab026c816fda4441c
refactored those methods which will cause problems with
backports of this test. The redefined methods will be
removed in a follow up cleanup patch.

NOTE(mriedem): The restart_compute_service() method
needed to be added to the functional test class because
change I17f67a02b27a90658df48856963ea3fb327e81dc is not
in Pike.

Change-Id: I19b0d8baea5440f5d5bc49a6956d9a97bf031a05
Related-Bug: #1794996
(cherry picked from commit d252f81573cdfe7a0966f134608bb85d17311e33)
(cherry picked from commit 83d74dbbb6b45bfeada0c0b9ac13385b126709bb)
(cherry picked from commit c3fd5e5061b837a78a95705074239c3d2e41e644)
---
 .../regressions/test_bug_1794996.py           | 204 ++++++++++++++++++
 1 file changed, 204 insertions(+)
 create mode 100644 nova/tests/functional/regressions/test_bug_1794996.py

diff --git a/nova/tests/functional/regressions/test_bug_1794996.py b/nova/tests/functional/regressions/test_bug_1794996.py
new file mode 100644
index 000000000000..f6b20691edb7
--- /dev/null
+++ b/nova/tests/functional/regressions/test_bug_1794996.py
@@ -0,0 +1,204 @@
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+from oslo_log import log as logging
+
+from nova import exception
+from nova.tests.functional import test_servers
+
+LOG = logging.getLogger(__name__)
+
+
+class TestEvacuateDeleteServerRestartOriginalCompute(
+        test_servers.ProviderUsageBaseTestCase):
+
+    compute_driver = 'fake.SmallFakeDriver'
+
+    def setUp(self):
+        super(TestEvacuateDeleteServerRestartOriginalCompute, self).setUp()
+        self.compute1 = self._start_compute(host='host1')
+        self.compute2 = self._start_compute(host='host2')
+
+        flavors = self.api.get_flavors()
+        self.flavor1 = flavors[0]
+
+    # NOTE(mriedem): This is here for backports and should be removed later
+    # on master (Stein).
+    def assertFlavorMatchesAllocation(self, flavor, allocation):
+        self.assertEqual(flavor['vcpus'], allocation['VCPU'])
+        self.assertEqual(flavor['ram'], allocation['MEMORY_MB'])
+        self.assertEqual(flavor['disk'], allocation['DISK_GB'])
+
+    # NOTE(mriedem): This is here for backports and should be removed later
+    # on master (Stein).
+    def _boot_and_check_allocations(self, flavor, source_hostname):
+        """Boot an instance and check that the resource allocation is correct
+        After booting an instance on the given host with a given flavor it
+        asserts that both the providers usages and resource allocations match
+        with the resources requested in the flavor. It also asserts that
+        running the periodic update_available_resource call does not change the
+        resource state.
+        :param flavor: the flavor the instance will be booted with
+        :param source_hostname: the name of the host the instance will be
+                                booted on
+        :return: the API representation of the booted instance
+        """
+        server_req = self._build_minimal_create_server_request(
+            self.api, 'some-server', flavor_id=flavor['id'],
+            image_uuid='155d900f-4e14-4e4c-a73d-069cbf4541e6',
+            networks='none')
+        server_req['availability_zone'] = 'nova:%s' % source_hostname
+        LOG.info('booting on %s', source_hostname)
+        created_server = self.api.post_server({'server': server_req})
+        server = self._wait_for_state_change(
+            self.admin_api, created_server, 'ACTIVE')
+
+        # Verify that our source host is what the server ended up on
+        self.assertEqual(source_hostname, server['OS-EXT-SRV-ATTR:host'])
+
+        source_rp_uuid = self._get_provider_uuid_by_host(source_hostname)
+
+        # Before we run periodics, make sure that we have allocations/usages
+        # only on the source host
+        source_usages = self._get_provider_usages(source_rp_uuid)
+        self.assertFlavorMatchesAllocation(flavor, source_usages)
+
+        # Check that the other providers has no usage
+        for rp_uuid in [self._get_provider_uuid_by_host(hostname)
+                        for hostname in self.computes.keys()
+                        if hostname != source_hostname]:
+            usages = self._get_provider_usages(rp_uuid)
+            self.assertEqual({'VCPU': 0,
+                              'MEMORY_MB': 0,
+                              'DISK_GB': 0}, usages)
+
+        # Check that the server only allocates resource from the host it is
+        # booted on
+        allocations = self._get_allocations_by_server_uuid(server['id'])
+        self.assertEqual(1, len(allocations),
+                         'No allocation for the server on the host it '
+                         'is booted on')
+        allocation = allocations[source_rp_uuid]['resources']
+        self.assertFlavorMatchesAllocation(flavor, allocation)
+
+        self._run_periodics()
+
+        # After running the periodics but before we start any other operation,
+        # we should have exactly the same allocation/usage information as
+        # before running the periodics
+
+        # Check usages on the selected host after boot
+        source_usages = self._get_provider_usages(source_rp_uuid)
+        self.assertFlavorMatchesAllocation(flavor, source_usages)
+
+        # Check that the server only allocates resource from the host it is
+        # booted on
+        allocations = self._get_allocations_by_server_uuid(server['id'])
+        self.assertEqual(1, len(allocations),
+                         'No allocation for the server on the host it '
+                         'is booted on')
+        allocation = allocations[source_rp_uuid]['resources']
+        self.assertFlavorMatchesAllocation(flavor, allocation)
+
+        # Check that the other providers has no usage
+        for rp_uuid in [self._get_provider_uuid_by_host(hostname)
+                        for hostname in self.computes.keys()
+                        if hostname != source_hostname]:
+            usages = self._get_provider_usages(rp_uuid)
+            self.assertEqual({'VCPU': 0,
+                              'MEMORY_MB': 0,
+                              'DISK_GB': 0}, usages)
+
+        return server
+
+    @staticmethod
+    def restart_compute_service(compute):
+        """Restart a compute service in a realistic way.
+
+        :param:compute: the nova-compute service to be restarted
+        """
+
+        # NOTE(gibi): The service interface cannot be used to simulate a real
+        # service restart as the manager object will not be recreated after a
+        # service.stop() and service.start() therefore the manager state will
+        # survive. For example the resource tracker will not be recreated after
+        # a stop start. The service.kill() call cannot help as it deletes
+        # the service from the DB which is unrealistic and causes that some
+        # operation that refers to the killed host (e.g. evacuate) fails.
+        # So this helper method tries to simulate a better compute service
+        # restart by cleaning up some of the internal state of the compute
+        # manager.
+        compute.stop()
+        compute.manager._resource_tracker = None
+        compute.start()
+
+    def test_evacuate_delete_server_restart_original_compute(self):
+        """Regression test for bug 1794996 where a server is successfully
+        evacuated from a down host and then deleted. Then the source compute
+        host is brought back online and attempts to cleanup the guest from
+        the hypervisor and allocations for the evacuated (and now deleted)
+        instance. Before the bug is fixed, the original compute fails to start
+        because lazy-loading the instance.flavor on the deleted instance,
+        which is needed to cleanup allocations from the source host, raises
+        InstanceNotFound.
+        """
+        source_hostname = self.compute1.host
+        dest_hostname = self.compute2.host
+
+        server = self._boot_and_check_allocations(
+            self.flavor1, source_hostname)
+
+        source_compute_id = self.admin_api.get_services(
+            host=source_hostname, binary='nova-compute')[0]['id']
+
+        self.compute1.stop()
+        # force it down to avoid waiting for the service group to time out
+        self.admin_api.put_service(
+            source_compute_id, {'forced_down': 'true'})
+
+        # evacuate the server
+        post = {'evacuate': {}}
+        self.api.post_server_action(
+            server['id'], post)
+        expected_params = {'OS-EXT-SRV-ATTR:host': dest_hostname,
+                           'status': 'ACTIVE'}
+        server = self._wait_for_server_parameter(self.api, server,
+                                                 expected_params)
+
+        # Expect to have allocation and usages on both computes as the
+        # source compute is still down
+        source_rp_uuid = self._get_provider_uuid_by_host(source_hostname)
+        dest_rp_uuid = self._get_provider_uuid_by_host(dest_hostname)
+
+        source_usages = self._get_provider_usages(source_rp_uuid)
+        self.assertFlavorMatchesAllocation(self.flavor1, source_usages)
+
+        dest_usages = self._get_provider_usages(dest_rp_uuid)
+        self.assertFlavorMatchesAllocation(self.flavor1, dest_usages)
+
+        allocations = self._get_allocations_by_server_uuid(server['id'])
+        self.assertEqual(2, len(allocations))
+        source_allocation = allocations[source_rp_uuid]['resources']
+        self.assertFlavorMatchesAllocation(self.flavor1, source_allocation)
+        dest_allocation = allocations[dest_rp_uuid]['resources']
+        self.assertFlavorMatchesAllocation(self.flavor1, dest_allocation)
+
+        # Delete the evacuated server. The allocations should be gone from
+        # both the original evacuated-from host and the evacuated-to host.
+        self._delete_and_check_allocations(server)
+
+        # restart the source compute
+        # FIXME(mriedem): This is bug 1794996 where we try to lazy-load the
+        # instance.flavor from the deleted instance which causes the compute
+        # startup to fail.
+        self.assertRaises(exception.InstanceNotFound,
+                          self.restart_compute_service, self.compute1)