Fix leaked upload cleanup

The cleanup routine for leaked image uploads based its detection
on upload ids, but they are not unique except in the context of
a provider and build.  This meant that, for example, as long as
there was an upload with id 0000000001 for any image build for
the provider (very likely!) we would skip cleaning up any leaked
uploads with id 0000000001.

Correct this by using a key generated on build+upload (provider
is implied because we only consider uploads for our current
provider).

Update the tests relevant to this code to exercise this condition.

Change-Id: Ic68932b735d7439ca39e2fbfbe1f73c7942152d6
This commit is contained in:
James E. Blair 2024-03-09 13:39:39 -08:00
parent 1ac3f8bda4
commit fc95724601
2 changed files with 46 additions and 11 deletions

View File

@ -871,9 +871,9 @@ class StateMachineProvider(Provider, QuotaSupport):
known_uploads = set()
uploads = self._zk.getProviderUploads(self.provider.name)
for image in uploads.values():
for build in image.values():
for build_id, build in image.items():
for upload in build:
known_uploads.add(upload.id)
known_uploads.add(f'{build_id}-{upload.id}')
newly_leaked_nodes = {}
newly_leaked_uploads = {}
@ -882,7 +882,13 @@ class StateMachineProvider(Provider, QuotaSupport):
if pn != self.provider.name:
continue
node_id = resource.metadata.get('nodepool_node_id')
build_id = resource.metadata.get('nodepool_build_id')
upload_id = resource.metadata.get('nodepool_upload_id')
# Make a truly unique upload id
if upload_id:
build_upload_id = f'{build_id}-{upload_id}'
else:
build_upload_id = None
if node_id and node_id not in known_nodes:
newly_leaked_nodes[node_id] = resource
if node_id in self.possibly_leaked_nodes:
@ -898,9 +904,9 @@ class StateMachineProvider(Provider, QuotaSupport):
except Exception:
self.log.exception("Unable to delete leaked "
f"resource for node {node_id}")
if upload_id and upload_id not in known_uploads:
newly_leaked_uploads[upload_id] = resource
if upload_id in self.possibly_leaked_uploads:
if build_upload_id and build_upload_id not in known_uploads:
newly_leaked_uploads[build_upload_id] = resource
if build_upload_id in self.possibly_leaked_uploads:
# We've seen this twice now, so it's not a race
# condition.
try:
@ -911,8 +917,9 @@ class StateMachineProvider(Provider, QuotaSupport):
resource.plural_metric_name))
self._statsd.incr(key, 1)
except Exception:
self.log.exception("Unable to delete leaked "
f"resource for upload {upload_id}")
self.log.exception(
"Unable to delete leaked "
f"resource for upload {build_upload_id}")
self.possibly_leaked_nodes = newly_leaked_nodes
self.possibly_leaked_uploads = newly_leaked_uploads

View File

@ -1031,10 +1031,22 @@ class TestDriverAws(tests.DBTestCase):
def test_aws_resource_cleanup_import_snapshot(self):
# This tests the import_snapshot path
# Create a valid, non-leaked image to test id collisions, and
# that it is not deleted.
configfile = self.setup_config('aws/diskimage.yaml')
self.useBuilder(configfile)
current_image = self.waitForImage('ec2-us-west-2', 'fake-image')
# Assert the image exists
self.ec2.Image(current_image.external_id).state
# Start by setting up leaked resources
# Use low numbers to intentionally collide with the current
# image to ensure we test the non-uniqueness of upload ids.
image_tags = [
{'Key': 'nodepool_build_id', 'Value': '0000000042'},
{'Key': 'nodepool_upload_id', 'Value': '0000000042'},
{'Key': 'nodepool_build_id', 'Value': '0000000001'},
{'Key': 'nodepool_upload_id', 'Value': '0000000001'},
{'Key': 'nodepool_provider_name', 'Value': 'ec2-us-west-2'}
]
@ -1115,12 +1127,25 @@ class TestDriverAws(tests.DBTestCase):
# Probably not found
break
# Assert the non-leaked image still exists
self.ec2.Image(current_image.external_id).state
def test_aws_resource_cleanup_import_image(self):
# This tests the import_image path
# Create a valid, non-leaked image to test id collisions, and
# that it is not deleted.
configfile = self.setup_config('aws/diskimage.yaml')
self.useBuilder(configfile)
current_image = self.waitForImage('ec2-us-west-2', 'fake-image')
# Assert the image exists
self.ec2.Image(current_image.external_id).state
# Start by setting up leaked resources
image_tags = [
{'Key': 'nodepool_build_id', 'Value': '0000000042'},
{'Key': 'nodepool_upload_id', 'Value': '0000000042'},
{'Key': 'nodepool_build_id', 'Value': '0000000001'},
{'Key': 'nodepool_upload_id', 'Value': '0000000001'},
{'Key': 'nodepool_provider_name', 'Value': 'ec2-us-west-2'}
]
@ -1179,6 +1204,9 @@ class TestDriverAws(tests.DBTestCase):
# Probably not found
break
# Assert the non-leaked image still exists
self.ec2.Image(current_image.external_id).state
def test_aws_get_import_image_task(self):
# A unit test of the unusual error handling for missing tasks
configfile = self.setup_config('aws/diskimage.yaml')