rt: soften warning case in _remove_deleted_instances_allocations

During an instance_claim during initial server create or unshelve,
the instance.host/node values will be set before the resource
tracker has the instance in the tracked_instances dict. If
_remove_deleted_instances_allocations is called with the instance
before it is being tracked, a warning like this is logged:

  Jul 31 13:12:57.455904 ubuntu-bionic-rax-iad-0009534722
  nova-compute[31951]: WARNING nova.compute.resource_tracker
  [None req-d6f2ae97-d8f7-46f6-8974-b42aeb58302d None None]
  Instance 227c23cd-aeb2-4b5a-b001-21bd920a5e77 is not being actively
  managed by this compute host but has allocations referencing this
  compute host: {u'resources': {u'MEMORY_MB': 64, u'VCPU': 1,
  u'DISK_GB': 1}}. Skipping heal of allocation because we do not know
  what to do.

This shows up quite frequently in CI runs (see the bug report for
a logstash query) which means it should not be a warning.

This change checks the instance task_state and, if it is set, logs
only a debug message rather than a warning, since we can assume we
are racing with an in-progress task that will correct itself upon
completion.

Change-Id: I6db8bea6761b68c39e6332d4698d1f8312863758
Closes-Bug: #1838541
This commit is contained in:
Matt Riedemann 2019-07-31 12:41:00 -04:00
parent cdc8b34875
commit ad4ef5af8b
2 changed files with 72 additions and 7 deletions

View File

@ -1387,12 +1387,26 @@ class ResourceTracker(object):
if (instance.host == cn.host and
instance.node == cn.hypervisor_hostname):
# The instance is supposed to be on this compute host but is
# not in the list of actively managed instances.
LOG.warning("Instance %s is not being actively managed by "
"this compute host but has allocations "
"referencing this compute host: %s. Skipping "
"heal of allocation because we do not know "
"what to do.", instance_uuid, alloc)
# not in the list of actively managed instances. This could be
# because we are racing with an instance_claim call during
# initial build or unshelve where the instance host/node is set
# before the instance is added to tracked_instances. If the
# task_state is set, then consider things in motion and log at
# debug level instead of warning.
if instance.task_state:
LOG.debug('Instance with task_state "%s" is not being '
'actively managed by this compute host but has '
'allocations referencing this compute node '
'(%s): %s. Skipping heal of allocations during '
'the task state transition.',
instance.task_state, cn.uuid, alloc,
instance=instance)
else:
LOG.warning("Instance %s is not being actively managed by "
"this compute host but has allocations "
"referencing this compute host: %s. Skipping "
"heal of allocation because we do not know "
"what to do.", instance_uuid, alloc)
continue
if instance.host != cn.host:
# The instance has been moved to another host either via a

View File

@ -3204,9 +3204,10 @@ class TestUpdateUsageFromInstance(BaseTestCase):
# instance and has allocations for it.
rc.delete_allocation_for_instance.assert_not_called()
@mock.patch('nova.compute.resource_tracker.LOG.warning')
@mock.patch('nova.objects.Instance.get_by_uuid')
def test_remove_deleted_instances_allocations_unknown_instance(
self, mock_inst_get):
self, mock_inst_get, mock_log_warning):
"""Tests the case that an instance is found with allocations for
this host/node but is not in the dict of tracked instances. The
allocations are not removed for the instance since we don't know
@ -3241,6 +3242,56 @@ class TestUpdateUsageFromInstance(BaseTestCase):
# testing the current behavior but in the future when we get smart
# and figure things out, this should actually be an error.
rc.delete_allocation_for_instance.assert_not_called()
# Assert the expected warning was logged.
mock_log_warning.assert_called_once()
self.assertIn("Instance %s is not being actively managed by "
"this compute host but has allocations "
"referencing this compute host",
mock_log_warning.call_args[0][0])
@mock.patch('nova.compute.resource_tracker.LOG.debug')
@mock.patch('nova.objects.Instance.get_by_uuid')
def test_remove_deleted_instances_allocations_state_transition_instance(
        self, mock_inst_get, mock_log_debug):
    """Tests the case that an instance is found with allocations for
    this host/node but is not in the dict of tracked instances but the
    instance.task_state is not None so we do not log a warning nor remove
    allocations since we want to let the operation play out.
    """
    # Simulate an instance mid-operation: task_state is set but the
    # resource tracker is not yet tracking it on this node.
    inst = copy.deepcopy(_INSTANCE_FIXTURES[0])
    inst.task_state = task_states.SPAWNING
    mock_inst_get.return_value = inst

    # No tracked instances on this node, yet placement reports an
    # allocation for the instance against this node's provider.
    resources = {
        'VCPU': 1,
        'MEMORY_MB': 2048,
        'DISK_GB': 20
    }
    alloc_info = report.ProviderAllocInfo(
        allocations={inst.uuid: {'resources': resources}})

    client = self.rt.reportclient
    client.get_allocations_for_resource_provider = mock.MagicMock(
        return_value=alloc_info)
    client.delete_allocation_for_instance = mock.MagicMock()

    node = self.rt.compute_nodes[_NODENAME]

    # Call the method under test.
    self.rt._remove_deleted_instances_allocations(
        mock.MagicMock(), node, [], {})

    # The allocation must be left alone: the instance is on this host
    # but is transitioning task states, so the in-flight operation is
    # expected to settle things itself.
    client.delete_allocation_for_instance.assert_not_called()

    # A debug message, not a warning, should have been emitted.
    mock_log_debug.assert_called_once()
    self.assertIn('Instance with task_state "%s" is not being '
                  'actively managed by this compute host but has '
                  'allocations referencing this compute node',
                  mock_log_debug.call_args[0][0])
def test_remove_deleted_instances_allocations_retrieval_fail(self):
"""When the report client errs or otherwise retrieves no allocations,