Remove all the resources when Nova VM creation fails

Backport candidate for stable/train

Story: 2006664
Task: 36926

Change-Id: If0991e0cef40ca78752fcd509d8438e90c9557bc
This commit is contained in:
Lingxian Kong 2019-10-03 16:48:09 +13:00
parent 77c409c0cb
commit 650794eaf9
9 changed files with 255 additions and 229 deletions

View File

@ -4,46 +4,8 @@
"trove_auth_url":"http://%service_host%/identity/v3/auth/tokens",
"trove_client_insecure":false,
"auth_strategy":null,
"auth_url": "http://%service_host%/identity/v3",
"trove_client_region_name": "%region_name%",
"nova_client": {
"url":"http://%service_host%:8774/v1.1",
"auth_url":"http://%service_host%/identity/v3",
"nova_service_type":"compute",
"volume_service_type":"volume"
},
"glance_client": {
"auth_url":"http://%service_host%/identity/v3"
},
"flavors": null,
"white_box":false,
"start_services": %startservices%,
"test_mgmt":false,
"use_local_ovz":false,
"use_venv":false,
"glance_code_root":"%glance_path%",
"glance_api_conf":"/vagrant/conf/glance-api.conf",
"glance_reg_conf":"/vagrant/conf/glance-reg.conf",
"glance_images_directory": "/glance_images",
"glance_image": "debian-squeeze-x86_64-openvz.tar.gz",
"report_directory":"%report_directory%",
"usr_bin_dir":"%bin_path%",
"nova_code_root":"%nova_path%",
"nova_conf":"/home/vagrant/nova.conf",
"keystone_code_root":"%keystone_path%",
"keystone_conf":"/etc/keystone/keystone.conf",
"keystone_use_combined":true,
"trove_code_root":"%trove_path%",
"trove_conf":"/tmp/trove.conf",
"trove_version":"v1.0",
"trove_api_updated":"2012-08-01T00:00:00Z",
"trove_max_accepted_volume_size": 1000,
"trove_max_instances_per_user": 55,
"trove_max_volumes_per_user": 100,
"use_reaper":false,
"users": [
{ "auth_user":"trove",
"auth_key":"%service_password%",
@ -73,6 +35,32 @@
}
}
],
"flavors": null,
"white_box":false,
"start_services": %startservices%,
"test_mgmt":false,
"use_local_ovz":false,
"use_venv":false,
"glance_code_root":"%glance_path%",
"glance_api_conf":"/vagrant/conf/glance-api.conf",
"glance_reg_conf":"/vagrant/conf/glance-reg.conf",
"glance_images_directory": "/glance_images",
"glance_image": "debian-squeeze-x86_64-openvz.tar.gz",
"report_directory":"%report_directory%",
"usr_bin_dir":"%bin_path%",
"nova_code_root":"%nova_path%",
"nova_conf":"/home/vagrant/nova.conf",
"keystone_code_root":"%keystone_path%",
"keystone_conf":"/etc/keystone/keystone.conf",
"keystone_use_combined":true,
"trove_code_root":"%trove_path%",
"trove_conf":"/tmp/trove.conf",
"trove_version":"v1.0",
"trove_api_updated":"2012-08-01T00:00:00Z",
"trove_max_accepted_volume_size": 1000,
"trove_max_instances_per_user": 55,
"trove_max_volumes_per_user": 100,
"use_reaper":false,
"root_removed_from_instance_api": true,
"root_timestamp_disabled": false,
"openvz_disabled": true,

View File

@ -170,11 +170,15 @@ class TroveCommonTraits(TroveBaseTraits):
if 'instance_type' not in self.payload:
flavor = instance.nova_client.flavors.get(instance.flavor_id)
self.payload['instance_size'] = flavor.ram
if self.server is None:
self.server = instance.nova_client.servers.get(
instance.server_id)
self.payload['availability_zone'] = getattr(
self.server, 'OS-EXT-AZ:availability_zone', None)
if self.server is None and instance.server_id:
try:
self.server = instance.nova_client.servers.get(
instance.server_id)
except Exception:
pass
if self.server:
self.payload['availability_zone'] = getattr(
self.server, 'OS-EXT-AZ:availability_zone', None)
if CONF.get(instance.datastore_version.manager).volume_support:
self.payload.update({
'volume_size': instance.volume_size,

View File

@ -21,8 +21,7 @@ from trove.cluster.views import ClusterView
from trove.common import cfg
from trove.common import server_group as srv_grp
from trove.common.strategies.cluster import base
from trove.common.strategies.cluster.experimental.cassandra.taskmanager import(
CassandraClusterTasks)
from trove.common.strategies.cluster.experimental.cassandra import taskmanager
from trove.common import utils
from trove.extensions.mgmt.clusters.views import MgmtClusterView
from trove.instance import models as inst_models
@ -133,7 +132,8 @@ class CassandraCluster(models.Cluster):
# Creating member instances.
num_instances = len(
CassandraClusterTasks.find_cluster_node_ids(cluster_id))
taskmanager.CassandraClusterTasks.find_cluster_node_ids(cluster_id)
)
new_instances = []
for instance_idx, instance in enumerate(instances, num_instances + 1):
instance_az = instance.get('availability_zone', None)

View File

@ -19,11 +19,12 @@ from datetime import datetime
from datetime import timedelta
import os.path
import re
from sqlalchemy import func
import six
from novaclient import exceptions as nova_exceptions
from oslo_config.cfg import NoSuchOptError
from oslo_log import log as logging
from sqlalchemy import func
from trove.backup.models import Backup
from trove.common import cfg
@ -31,9 +32,9 @@ from trove.common import crypto_utils as cu
from trove.common import exception
from trove.common.glance_remote import create_glance_client
from trove.common.i18n import _
import trove.common.instance as tr_instance
from trove.common import instance as tr_instance
from trove.common import neutron
from trove.common.notification import StartNotification
from trove.common import notification
from trove.common.remote import create_cinder_client
from trove.common.remote import create_dns_client
from trove.common.remote import create_guest_client
@ -664,9 +665,136 @@ class BaseInstance(SimpleInstance):
deltas,
_delete_resources)
def server_status_matches(self, expected_status, server=None):
if not server:
server = self.server
return server.status.upper() in (
status.upper() for status in expected_status)
def _delete_resources(self, deleted_at):
"""Implemented in subclass."""
pass
"""Delete the openstack resources related to an instance.
Deleting the instance should not break or raise exceptions because
the end users want their instances to be deleted anyway. Cloud operator
should consider the way to clean up orphan resources afterwards, e.g.
using the naming convention.
"""
LOG.info("Starting to delete resources for instance %s", self.id)
old_server = None
if self.server_id:
# Stop db
try:
old_server = self.nova_client.servers.get(self.server_id)
# The server may have already been marked as 'SHUTDOWN'
# but check for 'ACTIVE' in case of any race condition
# We specifically don't want to attempt to stop db if
# the server is in 'ERROR' or 'FAILED" state, as it will
# result in a long timeout
if self.server_status_matches(['ACTIVE', 'SHUTDOWN'],
server=self):
LOG.debug("Stopping datastore on instance %s before "
"deleting any resources.", self.id)
self.guest.stop_db()
except Exception as e:
LOG.warning("Failed to stop the database before attempting "
"to delete trove instance %s, error: %s", self.id,
six.text_type(e))
# Nova VM
if old_server:
try:
LOG.info("Deleting server for instance %s", self.id)
self.server.delete()
except Exception as e:
LOG.warning("Failed to delete compute server %s",
self.server_id, six.text_type(e))
# Neutron ports (floating IP)
try:
ret = self.neutron_client.list_ports(name='trove-%s' % self.id)
ports = ret.get("ports", [])
for port in ports:
LOG.info("Deleting port %s for instance %s", port["id"],
self.id)
neutron.delete_port(self.neutron_client, port["id"])
except Exception as e:
LOG.warning("Failed to delete ports for instance %s, "
"error: %s", self.id, six.text_type(e))
# Neutron security groups
try:
name = "%s-%s" % (CONF.trove_security_group_name_prefix, self.id)
ret = self.neutron_client.list_security_groups(name=name)
sgs = ret.get("security_groups", [])
for sg in sgs:
LOG.info("Deleting security group %s for instance %s",
sg["id"], self.id)
self.neutron_client.delete_security_group(sg["id"])
except Exception as e:
LOG.warning("Failed to delete security groups for instance %s, "
"error: %s", self.id, six.text_type(e))
# DNS resources, e.g. Designate
try:
dns_support = CONF.trove_dns_support
if dns_support:
dns_api = create_dns_client(self.context)
dns_api.delete_instance_entry(instance_id=self.id)
except Exception as e:
LOG.warning("Failed to delete dns entry of instance %s, error: %s",
self.id, six.text_type(e))
# Nova server group
try:
srv_grp.ServerGroup.delete(self.context, self.server_group)
except Exception as e:
LOG.warning("Failed to delete server group for %s, error: %s",
self.id, six.text_type(e))
def server_is_finished():
try:
server = self.nova_client.servers.get(self.server_id)
if not self.server_status_matches(['SHUTDOWN', 'ACTIVE'],
server=server):
LOG.warning("Server %(vm_id)s entered ERROR status "
"when deleting instance %(instance_id)s!",
{'vm_id': self.server_id,
'instance_id': self.id})
return False
except nova_exceptions.NotFound:
return True
if old_server:
try:
LOG.info("Waiting for compute server %s removal for "
"instance %s", self.server_id, self.id)
utils.poll_until(server_is_finished, sleep_time=2,
time_out=CONF.server_delete_time_out)
except exception.PollTimeOut:
LOG.warning("Failed to delete instance %(instance_id)s: "
"Timeout deleting compute server %(vm_id)s",
{'instance_id': self.id, 'vm_id': self.server_id})
# If volume has been resized it must be manually removed
try:
if self.volume_id:
volume = self.volume_client.volumes.get(self.volume_id)
if volume.status == "available":
volume.delete()
except Exception as e:
LOG.warning("Failed to delete volume for instance %s, error: %s",
self.id, six.text_type(e))
notification.TroveInstanceDelete(
instance=self,
deleted_at=timeutils.isotime(deleted_at),
server=old_server
).notify()
LOG.info("Finished to delete resources for instance %s", self.id)
def delete_async(self):
deleted_at = timeutils.utcnow()
@ -1117,7 +1245,7 @@ class Instance(BuiltInstance):
return SimpleInstance(context, db_info, service_status,
root_password, locality=locality)
with StartNotification(context, **call_args):
with notification.StartNotification(context, **call_args):
return run_with_quotas(context.project_id, deltas,
_create_resources)

View File

@ -20,10 +20,8 @@ import traceback
from cinderclient import exceptions as cinder_exceptions
from eventlet import greenthread
from eventlet.timeout import Timeout
from novaclient import exceptions as nova_exceptions
from oslo_log import log as logging
from oslo_utils import netutils
import six
from swiftclient.client import ClientException
from trove.backup import models as bkup_models
@ -54,13 +52,11 @@ from trove.common.notification import (
StartNotification,
TroveInstanceCreate,
TroveInstanceModifyVolume,
TroveInstanceModifyFlavor,
TroveInstanceDelete)
TroveInstanceModifyFlavor)
import trove.common.remote as remote
from trove.common.remote import create_cinder_client
from trove.common.remote import create_dns_client
from trove.common.remote import create_guest_client
from trove.common import server_group as srv_grp
from trove.common.strategies.cluster import strategy
from trove.common import template
from trove.common import timeutils
@ -416,21 +412,10 @@ class ClusterTasks(Cluster):
class FreshInstanceTasks(FreshInstance, NotifyMixin, ConfigurationMixin):
def _delete_resources(self, deleted_at):
LOG.debug("Begin _delete_resources for instance %s", self.id)
# If volume has "available" status, delete it manually.
try:
if self.volume_id:
volume = self.volume_client.volumes.get(self.volume_id)
if volume.status == "available":
volume.delete()
except Exception as e:
LOG.warning("Failed to delete volume for instance %s, error: %s",
self.id, six.text_type(e))
LOG.debug("End _delete_resource for instance %s", self.id)
"""
FreshInstanceTasks contains the tasks related an instance that not
associated with a compute server.
"""
def wait_for_instance(self, timeout, flavor):
# Make sure the service becomes active before sending a usage
@ -1073,125 +1058,10 @@ class FreshInstanceTasks(FreshInstance, NotifyMixin, ConfigurationMixin):
class BuiltInstanceTasks(BuiltInstance, NotifyMixin, ConfigurationMixin):
"""
Performs the various asynchronous instance related tasks.
BuiltInstanceTasks contains the tasks related an instance that already
associated with a compute server.
"""
def _delete_resources(self, deleted_at):
LOG.info("Starting to delete resources for instance %s", self.id)
# Stop db
server_id = self.db_info.compute_instance_id
old_server = self.nova_client.servers.get(server_id)
try:
# The server may have already been marked as 'SHUTDOWN'
# but check for 'ACTIVE' in case of any race condition
# We specifically don't want to attempt to stop db if
# the server is in 'ERROR' or 'FAILED" state, as it will
# result in a long timeout
if self.server_status_matches(['ACTIVE', 'SHUTDOWN'], server=self):
LOG.debug("Stopping datastore on instance %s before deleting "
"any resources.", self.id)
self.guest.stop_db()
except Exception as e:
LOG.warning("Failed to stop the datastore before attempting "
"to delete instance id %s, error: %s", self.id,
six.text_type(e))
# Nova VM
try:
LOG.info("Deleting server for instance %s", self.id)
self.server.delete()
except Exception as e:
LOG.warning("Failed to delete compute server %s", self.server.id,
six.text_type(e))
# Neutron ports
try:
ret = self.neutron_client.list_ports(name='trove-%s' % self.id)
ports = ret.get("ports", [])
for port in ports:
LOG.info("Deleting port %s for instance %s", port["id"],
self.id)
neutron.delete_port(self.neutron_client, port["id"])
except Exception as e:
LOG.warning("Failed to delete ports for instance %s, "
"error: %s", self.id, six.text_type(e))
# Neutron security groups
try:
name = "%s-%s" % (CONF.trove_security_group_name_prefix, self.id)
ret = self.neutron_client.list_security_groups(name=name)
sgs = ret.get("security_groups", [])
for sg in sgs:
LOG.info("Deleting security group %s for instance %s",
sg["id"], self.id)
self.neutron_client.delete_security_group(sg["id"])
except Exception as e:
LOG.warning("Failed to delete security groups for instance %s, "
"error: %s", self.id, six.text_type(e))
# DNS resources, e.g. Designate
try:
dns_support = CONF.trove_dns_support
if dns_support:
dns_api = create_dns_client(self.context)
dns_api.delete_instance_entry(instance_id=self.id)
except Exception as e:
LOG.warning("Failed to delete dns entry of instance %s, error: %s",
self.id, six.text_type(e))
# Nova server group
try:
srv_grp.ServerGroup.delete(self.context, self.server_group)
except Exception as e:
LOG.warning("Failed to delete server group for %s, error: %s",
self.id, six.text_type(e))
def server_is_finished():
try:
server = self.nova_client.servers.get(server_id)
if not self.server_status_matches(['SHUTDOWN', 'ACTIVE'],
server=server):
LOG.warning("Server %(server_id)s entered ERROR status "
"when deleting instance %(instance_id)s!",
{'server_id': server.id,
'instance_id': self.id})
return False
except nova_exceptions.NotFound:
return True
try:
LOG.info("Waiting for server %s removal for instance %s",
server_id, self.id)
utils.poll_until(server_is_finished, sleep_time=2,
time_out=CONF.server_delete_time_out)
except PollTimeOut:
LOG.warning("Failed to delete instance %(instance_id)s: "
"Timeout deleting compute server %(server_id)s",
{'instance_id': self.id, 'server_id': server_id})
# If volume has been resized it must be manually removed
try:
if self.volume_id:
volume = self.volume_client.volumes.get(self.volume_id)
if volume.status == "available":
volume.delete()
except Exception as e:
LOG.warning("Failed to delete volume for instance %s, error: %s",
self.id, six.text_type(e))
TroveInstanceDelete(instance=self,
deleted_at=timeutils.isotime(deleted_at),
server=old_server).notify()
LOG.info("Finished to delete resources for instance %s", self.id)
def server_status_matches(self, expected_status, server=None):
if not server:
server = self.server
return server.status.upper() in (
status.upper() for status in expected_status)
def resize_volume(self, new_size):
LOG.info("Resizing volume for instance %(instance_id)s from "
"%(old_size)s GB to %(new_size)s GB.",

View File

@ -43,13 +43,13 @@ class MgmtDataStoreVersion(object):
self.user = CONFIG.users.find_user(reqs)
self.client = create_dbaas_client(self.user)
self.images = []
if test_config.glance_client is not None:
glance_user = test_config.users.find_user(
Requirements(is_admin=True, services=["glance"]))
self.glance_client = create_glance_client(glance_user)
images = self.glance_client.images.list()
for image in images:
self.images.append(image.id)
glance_user = test_config.users.find_user(
Requirements(is_admin=True, services=["glance"]))
self.glance_client = create_glance_client(glance_user)
images = self.glance_client.images.list()
for image in images:
self.images.append(image.id)
def _find_ds_version_by_name(self, ds_version_name):
ds_versions = self.client.mgmt_datastore_versions.list()

View File

@ -120,3 +120,10 @@ class InstanceErrorCreateRunner(TestRunner):
self.assert_all_gone(delete_ids, expected_states[-1])
else:
raise SkipTest("Cleanup is not required.")
# All the neutron ports should be removed.
if self.error_inst_id:
ports = self.neutron_client.list_ports(
name='trove-%s' % self.error_inst_id
)
self.assert_equal(0, len(ports.get("ports", [])))

View File

@ -36,9 +36,9 @@ from trove.common import timeutils
from trove.common import utils
from trove.common.utils import poll_until, build_polling_task
from trove.tests.config import CONFIG
from trove.tests import util as test_util
from trove.tests.util.check import AttrCheck
from trove.tests.util import create_dbaas_client
from trove.tests.util import create_nova_client
from trove.tests.util.users import Requirements
CONF = cfg.CONF
@ -354,6 +354,7 @@ class TestRunner(object):
self._admin_client = None
self._swift_client = None
self._nova_client = None
self._neutron_client = None
self._test_helper = None
self._servers = {}
@ -492,7 +493,7 @@ class TestRunner(object):
user = CONFIG.users.find_user(requirements)
os_options = {'region_name': CONFIG.trove_client_region_name}
return swiftclient.client.Connection(
authurl=CONFIG.nova_client['auth_url'],
authurl=CONFIG.auth_url,
user=user.auth_user,
key=user.auth_key,
tenant_name=user.tenant,
@ -501,7 +502,21 @@ class TestRunner(object):
@property
def nova_client(self):
return create_nova_client(self.instance_info.admin_user)
if self._nova_client is None:
self._nova_client = test_util.create_nova_client(
self.instance_info.admin_user
)
return self._nova_client
@property
def neutron_client(self):
if self._neutron_client is None:
self._neutron_client = test_util.create_neutron_client(
self.instance_info.admin_user
)
return self._neutron_client
def register_debug_inst_ids(self, inst_ids):
"""Method to 'register' an instance ID (or list of instance IDs)

View File

@ -28,6 +28,11 @@ try:
except ImportError:
EVENT_AVAILABLE = False
import glanceclient
from keystoneauth1.identity import v3
from keystoneauth1 import session
from neutronclient.v2_0 import client as neutron_client
from novaclient import client as nova_client
from proboscis.asserts import assert_true
from proboscis.asserts import Check
from proboscis.asserts import fail
@ -141,41 +146,50 @@ def create_dbaas_client(user):
return TestClient(dbaas)
def create_nova_client(user, service_type=None):
"""Creates a rich client for the Nova API using the test config."""
if test_config.nova_client is None:
raise SkipTest("No nova_client info specified in the Test Config "
"so this test will be skipped.")
from novaclient.client import Client
if not service_type:
service_type = test_config.nova_client['nova_service_type']
openstack = Client(CONF.nova_client_version,
username=user.auth_user,
def create_keystone_session(user):
auth = v3.Password(username=user.auth_user,
password=user.auth_key,
user_domain_name='Default',
project_id=user.tenant_id,
auth_url=test_config.nova_client['auth_url'],
service_type=service_type, os_cache=False,
cacert=test_config.values.get('cacert', None))
user_domain_name='Default',
project_domain_name='Default',
auth_url=test_config.auth_url)
return session.Session(auth=auth)
def create_nova_client(user, service_type=None):
if not service_type:
service_type = CONF.nova_compute_service_type
openstack = nova_client.Client(
CONF.nova_client_version,
username=user.auth_user,
password=user.auth_key,
user_domain_name='Default',
project_id=user.tenant_id,
auth_url=CONFIG.auth_url,
service_type=service_type, os_cache=False,
cacert=test_config.values.get('cacert', None)
)
return TestClient(openstack)
def create_glance_client(user):
"""Creates a rich client for the Glance API using the test config."""
if test_config.glance_client is None:
raise SkipTest("No glance_client info specified in the Test Config "
"so this test will be skipped.")
from glanceclient import Client
from keystoneauth1.identity import v3
from keystoneauth1 import session
def create_neutron_client(user):
sess = create_keystone_session(user)
client = neutron_client.Client(
session=sess,
service_type=CONF.neutron_service_type,
region_name=CONFIG.trove_client_region_name,
insecure=CONF.neutron_api_insecure,
endpoint_type=CONF.neutron_endpoint_type
)
return TestClient(client)
def create_glance_client(user):
sess = create_keystone_session(user)
glance = glanceclient.Client(CONF.glance_client_version, session=sess)
auth = v3.Password(username=user.auth_user,
password=user.auth_key,
user_domain_name='Default',
project_id=user.tenant_id,
auth_url=test_config.glance_client['auth_url'])
session = session.Session(auth=auth)
glance = Client(CONF.glance_client_version, session=session)
return TestClient(glance)