Invalidate az cache on bad request

When getting a 400 error we not only need to clear the image and
flavor caches but the az cache as well. Otherwise we will get constant
node failures for any node request where nodepool chose that az
[1]. Currently the only way to recover from this situation is to
restart nodepool. Invalidating the cache doesn't fix the request that
failed due to this error, but it at least ensures that nodepool
recovers from this situation automatically for all further node
requests.

[1] Trace:
2018-07-05 09:09:08,477 ERROR nodepool.NodeLauncher-0000123378: Launch attempt 2/3 failed for node 0000123378:
Traceback (most recent call last):
  File "/opt/nodepool-source/nodepool/driver/openstack/handler.py", line 221, in launch
    self._launchNode()
  File "/opt/nodepool-source/nodepool/driver/openstack/handler.py", line 134, in _launchNode
    volume_size=self.label.volume_size)
  File "/opt/nodepool-source/nodepool/driver/openstack/provider.py", line 378, in createServer
    return self._client.create_server(wait=False, **create_args)
  File "<decorator-gen-106>", line 2, in create_server
  File "/usr/lib/python3.5/site-packages/shade/_utils.py", line 410, in func_wrapper
    return func(*args, **kwargs)
  File "/usr/lib/python3.5/site-packages/shade/openstackcloud.py", line 6909, in create_server
    endpoint, json=server_json)
  File "/usr/lib/python3.5/site-packages/keystoneauth1/adapter.py", line 334, in post
    return self.request(url, 'POST', **kwargs)
  File "/usr/lib/python3.5/site-packages/shade/_adapter.py", line 158, in request
    return self._munch_response(response, error_message=error_message)
  File "/usr/lib/python3.5/site-packages/shade/_adapter.py", line 114, in _munch_response
    exc.raise_from_response(response, error_message=error_message)
  File "/usr/lib/python3.5/site-packages/shade/exc.py", line 171, in raise_from_response
    raise OpenStackCloudBadRequest(msg, response=response)
shade.exc.OpenStackCloudBadRequest: (400) Client Error for url: (...) The requested availability zone is not available
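
The fix follows the invalidation pattern already used for the image and
flavor caches in OpenStackProvider.createServer(). A condensed sketch of
the resulting behavior (simplified from the diff below, not a verbatim
excerpt):

    try:
        return self._client.create_server(wait=False, **create_args)
    except openstack.exceptions.BadRequestException:
        # A 400 from nova usually means the cached az, image or flavor
        # data is stale; drop all three caches so they are re-fetched
        # on the next request, then re-raise for the normal retry path.
        self._images = {}
        self.__azs = None
        self.__flavors = {}
        raise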

Change-Id: I5f653f159b08cf086d20c2398a9345bd4caa4d1e
Tobias Henkel 2018-07-15 09:09:12 +02:00
parent fc1f80b6d1
commit 934b1eed9c
4 changed files with 104 additions and 4 deletions


@@ -19,7 +19,7 @@ import threading
 import time
 import uuid

-import openstack
+import openstack.exceptions

 from nodepool import exceptions
 from nodepool.driver.openstack.provider import OpenStackProvider
@@ -100,6 +100,7 @@ class FakeOpenStackCloud(object):
             Dummy(Dummy.FLAVOR, id='f2', ram=8192, name='Unreal Flavor',
                   vcpus=4),
         ]
+        self._azs = ['az1', 'az2']
         self._server_list = []
         self.max_cores, self.max_instances, self.max_ram = FakeOpenStackCloud.\
             _get_quota()
@@ -156,6 +157,12 @@ class FakeOpenStackCloud(object):
                 len(instance_list) >= self.max_instances):
             over_quota = True

+        az = kw.get('availability_zone')
+        if az and az not in self._azs:
+            raise openstack.exceptions.BadRequestException(
+                message='The requested availability zone is not available',
+                http_status=400)
+
         s = Dummy(instance_type,
                   id=uuid.uuid4().hex,
                   name=kw['name'],
@@ -261,7 +268,7 @@ class FakeOpenStackCloud(object):
         self._delete(name_or_id, self._server_list)

     def list_availability_zone_names(self):
-        return ['fake-az1', 'fake-az2']
+        return self._azs.copy()

     def get_compute_limits(self):
         return Dummy(


@@ -324,13 +324,15 @@ class OpenStackProvider(Provider):
         except openstack.exceptions.BadRequestException:
             # We've gotten a 400 error from nova - which means the request
             # was malformed. The most likely cause of that, unless something
-            # became functionally and systemically broken, is stale image
+            # became functionally and systemically broken, is stale az, image
             # or flavor cache. Log a message, invalidate the caches so that
             # next time we get new caches.
             self._images = {}
+            self.__azs = None
             self.__flavors = {}  # TODO(gtema): caching
             self.log.info(
-                "Clearing flavor and image caches due to 400 error from nova")
+                "Clearing az, flavor and image caches due to 400 error "
+                "from nova")
             raise

     def getServer(self, server_id):
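
Cache repopulation is lazy, which is why setting __azs back to None is
sufficient: the az list is rebuilt from the cloud on the next lookup. A
minimal sketch of that idea (accessor name illustrative, not necessarily
the exact nodepool API):

    def getAZs(self):
        # Rebuild the cached az list on first use after invalidation.
        if self.__azs is None:
            self.__azs = self._client.list_availability_zone_names()
        return self.__azs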


@@ -0,0 +1,44 @@
+elements-dir: .
+images-dir: '{images_dir}'
+build-log-dir: '{build_log_dir}'
+
+zookeeper-servers:
+  - host: {zookeeper_host}
+    port: {zookeeper_port}
+    chroot: {zookeeper_chroot}
+
+labels:
+  - name: fake-label
+    min-ready: 0
+
+providers:
+  - name: fake-provider
+    cloud: fake
+    driver: fake
+    region-name: fake-region
+    rate: 0.0001
+    diskimages:
+      - name: fake-image
+        meta:
+          key: value
+          key2: value
+    pools:
+      - name: main
+        max-servers: 96
+        labels:
+          - name: fake-label
+            diskimage: fake-image
+            min-ram: 8192
+            flavor-name: 'Fake'
+
+diskimages:
+  - name: fake-image
+    elements:
+      - fedora
+      - vm
+    release: 21
+    env-vars:
+      TMPDIR: /opt/dib_tmp
+      DIB_IMAGE_CACHE: /opt/dib_cache
+      DIB_CLOUD_IMAGES: http://download.fedoraproject.org/pub/fedora/linux/releases/test/21-Beta/Cloud/Images/x86_64/
+      BASE_IMAGE_FILE: Fedora-Cloud-Base-20141029-21_Beta.x86_64.qcow2


@@ -350,6 +350,53 @@ class TestLauncher(tests.DBTestCase):
         self.assertEqual(req.state, zk.FAILED)
         self.assertNotEqual(req.declined_by, [])

+    def test_az_change_recover(self):
+        '''
+        Test that nodepool recovers from an az change in the cloud.
+        '''
+        configfile = self.setup_config('node_az_change.yaml')
+        self.useBuilder(configfile)
+        self.waitForImage('fake-provider', 'fake-image')
+
+        pool = self.useNodepool(configfile, watermark_sleep=1)
+        pool.start()
+        self.wait_for_config(pool)
+
+        req = zk.NodeRequest()
+        req.state = zk.REQUESTED
+        req.node_types.append('fake-label')
+        self.zk.storeNodeRequest(req)
+        req = self.waitForNodeRequest(req)
+        self.assertEqual(req.state, zk.FULFILLED)
+
+        # Now change the azs in the cloud.
+        cloud = pool.getProviderManager('fake-provider')._getClient()
+        cloud._azs = ['new-az1', 'new-az2']
+
+        # Do a second request. This will fail because the cached azs are
+        # not available anymore.
+        # TODO(tobiash): Ideally we should already be able to recover
+        # this request.
+        req2 = zk.NodeRequest()
+        req2.state = zk.REQUESTED
+        req2.node_types.append('fake-label')
+        self.zk.storeNodeRequest(req2)
+        req2 = self.waitForNodeRequest(req2)
+        self.assertEqual(req2.state, zk.FAILED)
+
+        # Create a third request to test that nodepool successfully
+        # recovers from a stale az cache.
+        req3 = zk.NodeRequest()
+        req3.state = zk.REQUESTED
+        req3.node_types.append('fake-label')
+        self.zk.storeNodeRequest(req3)
+        req3 = self.waitForNodeRequest(req3)
+        self.assertEqual(req3.state, zk.FULFILLED)
+
+        node = self.zk.getNode(req3.nodes[0])
+        self.assertIn(node.az, ['new-az1', 'new-az2'])
+
     def test_fail_minready_request_at_capacity(self):
         '''
         A min-ready request to a provider that is already at capacity should