# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.

import time

from nova import exception
from nova.tests import fixtures as nova_fixtures
from nova.tests.functional import integrated_helpers
from nova.tests.unit import fake_notifier
from nova.tests.unit.image import fake as fake_image


class BuildRescheduleClaimFailsTestCase(
        integrated_helpers.ProviderUsageBaseTestCase):
    """Regression test case for bug 1837955.

    A server build fails on the primary host and nova reschedules to the
    alternate host. The alternate host is full, so the allocation claim in
    placement fails there as well, resulting in MaxRetriesExceeded and the
    server (eventually) going to ERROR status.
    """
    compute_driver = 'fake.SmallFakeDriver'

    def _wait_for_unversioned_notification(self, event_type):
        """Poll fake_notifier for a legacy (unversioned) notification.

        Polls up to 20 times with a 0.5 second pause between attempts
        (10 seconds total) and returns the first notification whose
        event_type matches; fails the test on timeout.

        :param event_type: the legacy notification event_type to wait for
        :returns: the matching notification object
        """
        for _attempt in range(20):  # wait up to 10 seconds
            match = next(
                (note for note in fake_notifier.NOTIFICATIONS
                 if note.event_type == event_type), None)
            if match is not None:
                return match
            time.sleep(.5)
        self.fail('Timed out waiting for unversioned notification %s. '
                  'Got: %s' % (event_type, fake_notifier.NOTIFICATIONS))

    def test_build_reschedule_alt_host_alloc_fails(self):
        # Start two compute services so we have one alternate host.
        # cpu_allocation_ratio=1.0 makes the placement VCPU inventory and
        # allocations easy to reason about below.
        self.flags(cpu_allocation_ratio=1.0)
        for index in range(2):
            self._start_compute('host%i' % index)

        def fake_instance_claim(_self, _context, _inst, nodename, *a, **kw):
            # Before triggering the reschedule to the other host, max out
            # the capacity on the alternate host.
            alt_host = 'host0' if nodename == 'host1' else 'host1'
            provider_uuid = self._get_provider_uuid_by_host(alt_host)
            inventory = self._get_provider_inventory(provider_uuid)
            # Fake some other consumer taking all of the VCPU on the alt
            # host. Because cpu_allocation_ratio=1.0, the inventory 'total'
            # is the full VCPU capacity of the host.
            vcpu_capacity = inventory['VCPU']['total']
            consumer_uuid = '7d32d0bc-af16-44b2-8019-a24925d76152'
            payload = {
                'allocations': {
                    provider_uuid: {
                        'resources': {
                            'VCPU': vcpu_capacity
                        }
                    }
                },
                'project_id': self.api.project_id,
                # NOTE: intentionally mirrors the project_id; the test only
                # needs a valid-looking consumer, not a real user.
                'user_id': self.api.project_id
            }
            put_resp = self.placement_api.put(
                '/allocations/%s' % consumer_uuid, payload, version='1.12')
            self.assertEqual(204, put_resp.status, put_resp.content)
            raise exception.ComputeResourcesUnavailable(reason='overhead!')

        # Stub out the instance claim (regardless of which host the
        # scheduler picks as the primary) to trigger a reschedule.
        self.stub_out('nova.compute.manager.resource_tracker.ResourceTracker.'
                      'instance_claim', fake_instance_claim)

        # Now that our stub is in place, try to create a server and wait for
        # it to go to ERROR status.
        server = self._build_minimal_create_server_request(
            self.api, 'test_build_reschedule_alt_host_alloc_fails',
            image_uuid=fake_image.get_valid_image_id(),
            networks=[{'port': nova_fixtures.NeutronFixture.port_1['id']}])
        server = self.api.post_server({'server': server})
        # FIXME(mriedem): This is bug 1837955 where the status is stuck in
        # BUILD rather than the vm_state being set to error and the
        # task_state being set to None. Uncomment this when the bug is fixed.
        # server = self._wait_for_state_change(self.api, server, 'ERROR')

        # Wait for the MaxRetriesExceeded fault to be recorded.
        # set_vm_state_and_notify sets the vm_state to ERROR before the fault
        # is recorded but after the notification is sent. So wait for the
        # unversioned notification to show up and then get the fault.
        # FIXME(mriedem): Uncomment this when bug 1837955 is fixed.
        # self._wait_for_unversioned_notification(
        #     'compute_task.build_instances')
        # server = self.api.get_server(server['id'])
        # self.assertIn('fault', server)
        # self.assertIn('Exceeded maximum number of retries',
        #               server['fault']['message'])

        # TODO(mriedem): Remove this when the bug is fixed. Until then we
        # assert the failure mode by scanning the captured logs for the
        # MaxRetriesExceeded error.
        for _attempt in range(20):
            if 'MaxRetriesExceeded' in self.stdlog.logger.output:
                break
            time.sleep(.5)
        else:
            self.fail('Timed out waiting for MaxRetriesExceeded to show up '
                      'in the logs.')