Merge "backup init_host cleanup exception handling"

This commit is contained in:
Jenkins 2015-09-11 17:09:48 +00:00 committed by Gerrit Code Review
commit 0f6d84dcea
2 changed files with 298 additions and 117 deletions

View File

@@ -199,113 +199,147 @@ class BackupManager(manager.SchedulerDependentManager):
for mgr in self.volume_managers.values():
self._init_volume_driver(ctxt, mgr.driver)
try:
self._cleanup_incomplete_backup_operations(ctxt)
except Exception:
# Don't block startup of the backup service.
LOG.exception(_LE("Problem cleaning incomplete backup "
"operations."))
def _cleanup_incomplete_backup_operations(self, ctxt):
    """Clean up volumes and backups on this host left mid-operation.

    Each volume and each backup is handled inside its own ``try`` block:
    a failure on one item is logged and the loop moves on to the next
    item instead of aborting the whole cleanup.

    :param ctxt: context handed to the DB calls and the per-item helpers
    """
    LOG.info(_LI("Cleaning up incomplete backup operations."))
    volumes = self.db.volume_get_all_by_host(ctxt, self.host)

    for volume in volumes:
        try:
            self._cleanup_one_volume(ctxt, volume)
        except Exception:
            LOG.exception(_LE("Problem cleaning up volume %(vol)s."),
                          {'vol': volume['id']})

    # TODO(smulcahy) implement full resume of backup and restore
    # operations on restart (rather than simply resetting)
    backups = objects.BackupList.get_all_by_host(ctxt, self.host)
    for backup in backups:
        try:
            self._cleanup_one_backup(ctxt, backup)
        except Exception:
            LOG.exception(_LE("Problem cleaning up backup %(bkup)s."),
                          {'bkup': backup['id']})
        # Temp volume/snapshot cleanup is attempted even when the status
        # reset above failed; it has its own guard for the same reason.
        try:
            self._cleanup_temp_volumes_snapshots_for_one_backup(ctxt,
                                                                backup)
        except Exception:
            LOG.exception(_LE("Problem cleaning temp volumes and "
                              "snapshots for backup %(bkup)s."),
                          {'bkup': backup['id']})
def _cleanup_one_volume(self, ctxt, volume):
    """Reset a single volume that was left in a backup-related status.

    A 'backing-up' volume is put back to its recorded previous status and
    a 'restoring-backup' volume is marked 'error_restoring'. In both cases
    the volume's attachments are detached first. Volumes in any other
    status are not modified.

    :param ctxt: context handed to the DB and volume manager calls
    :param volume: volume record; read by key ('host', 'status', 'id',
                   'previous_status')
    """
    # The volume manager is resolved up front, regardless of status.
    backend_host = volume_utils.extract_host(volume['host'], 'backend')
    mgr = self._get_manager(self._get_volume_backend(host=backend_host))

    status = volume['status']
    if status == 'backing-up':
        self._detach_all_attachments(ctxt, mgr, volume)
        LOG.info(_LI('Resetting volume %(vol_id)s to previous '
                     'status %(status)s (was backing-up).'),
                 {'vol_id': volume['id'],
                  'status': volume['previous_status']})
        self.db.volume_update(ctxt, volume['id'],
                              {'status': volume['previous_status']})
        return

    if status == 'restoring-backup':
        self._detach_all_attachments(ctxt, mgr, volume)
        LOG.info(_LI('setting volume %s to error_restoring '
                     '(was restoring-backup).'), volume['id'])
        self.db.volume_update(ctxt, volume['id'],
                              {'status': 'error_restoring'})
def _cleanup_one_backup(self, ctxt, backup):
    """Reset or resume a single backup that was left mid-operation.

    * 'creating'  -> marked as errored via ``_update_backup_error``
    * 'restoring' -> status set back to 'available' and saved
    * 'deleting'  -> the delete is resumed, either on the thread pool or
      inline depending on ``CONF.backup_service_inithost_offload``

    The status is re-read before every check rather than cached.

    :param ctxt: context handed to the delete / error-update helpers
    :param backup: backup object supporting both key and attribute access
    """
    if backup['status'] == 'creating':
        LOG.info(_LI('Resetting backup %s to error (was creating).'),
                 backup['id'])
        self._update_backup_error(
            backup, ctxt, 'incomplete backup reset on manager restart')

    if backup['status'] == 'restoring':
        LOG.info(_LI('Resetting backup %s to '
                     'available (was restoring).'),
                 backup['id'])
        backup.status = 'available'
        backup.save()

    if backup['status'] != 'deleting':
        return

    LOG.info(_LI('Resuming delete on backup: %s.'), backup['id'])
    if not CONF.backup_service_inithost_offload:
        # By default, delete backups sequentially
        self.delete_backup(ctxt, backup)
        return

    # Offload all the pending backup delete operations to the
    # threadpool to prevent the main backup service thread
    # from being blocked.
    self._add_to_threadpool(self.delete_backup, ctxt, backup)
def _detach_all_attachments(self, ctxt, mgr, volume):
    """Detach the attachments of ``volume`` that belong to this host.

    Only attachments whose 'attached_host' equals ``self.host`` and whose
    'instance_uuid' is None are detached. Each detach is attempted exactly
    once; a failure is logged and the remaining attachments are still
    processed.

    :param ctxt: context passed through to ``mgr.detach_volume``
    :param mgr: volume manager providing ``detach_volume``
    :param volume: volume record; 'volume_attachment' may be None or empty
    """
    attachments = volume['volume_attachment'] or []
    for attachment in attachments:
        if (attachment['attached_host'] == self.host and
                attachment['instance_uuid'] is None):
            try:
                mgr.detach_volume(ctxt, volume['id'],
                                  attachment['id'])
            except Exception:
                # Keep going: one failed detach must not block the rest.
                LOG.exception(_LE("Detach attachment %(attach_id)s"
                                  " failed."),
                              {'attach_id': attachment['id']},
                              resource=volume)
def _cleanup_temp_volumes_snapshots(self, backups):
    """Run the temporary volume/snapshot cleanup for several backups.

    Thin wrapper kept for callers of the list-based entry point; the
    work is done per backup by
    ``_cleanup_temp_volumes_snapshots_for_one_backup``.

    :param backups: iterable of backup objects
    """
    ctxt = context.get_admin_context()
    for backup in backups:
        self._cleanup_temp_volumes_snapshots_for_one_backup(ctxt, backup)

def _cleanup_temp_volumes_snapshots_for_one_backup(self, ctxt, backup):
    """Delete the temporary volume and snapshot recorded on one backup.

    Nothing is done when the backup's source volume (or its 'host' key)
    cannot be found. The temporary resources are only removed when the
    backup status is 'error'; the corresponding id on the backup is then
    cleared and saved even if the temporary resource no longer exists.

    :param ctxt: context used for the DB and object lookups
    :param backup: backup object (attribute access: ``volume_id``,
                   ``temp_volume_id``, ``temp_snapshot_id``, ``status``)
    """
    # NOTE(xyang): If the service crashes or gets restarted during the
    # backup operation, there could be temporary volumes or snapshots
    # that are not deleted. Make sure any temporary volumes or snapshots
    # created by the backup job are deleted when service is started.
    try:
        volume = self.db.volume_get(ctxt, backup.volume_id)
        volume_host = volume_utils.extract_host(volume['host'],
                                                'backend')
        backend = self._get_volume_backend(host=volume_host)
        mgr = self._get_manager(backend)
    except (KeyError, exception.VolumeNotFound):
        LOG.debug("Could not find a volume to clean up for "
                  "backup %s.", backup.id)
        return

    if backup.temp_volume_id and backup.status == 'error':
        try:
            temp_volume = self.db.volume_get(ctxt,
                                             backup.temp_volume_id)
            # The temp volume should be deleted directly through the
            # volume driver, not through the volume manager.
            mgr.driver.delete_volume(temp_volume)
            self.db.volume_destroy(ctxt, temp_volume['id'])
        except exception.VolumeNotFound:
            LOG.debug("Could not find temp volume %(vol)s to clean up "
                      "for backup %(backup)s.",
                      {'vol': backup.temp_volume_id,
                       'backup': backup.id})
        backup.temp_volume_id = None
        backup.save()

    if backup.temp_snapshot_id and backup.status == 'error':
        try:
            temp_snapshot = objects.Snapshot.get_by_id(
                ctxt, backup.temp_snapshot_id)
            # The temp snapshot should be deleted directly through the
            # volume driver, not through the volume manager.
            mgr.driver.delete_snapshot(temp_snapshot)
            with temp_snapshot.obj_as_admin():
                self.db.volume_glance_metadata_delete_by_snapshot(
                    ctxt, temp_snapshot.id)
                temp_snapshot.destroy()
        except exception.SnapshotNotFound:
            LOG.debug("Could not find temp snapshot %(snap)s to clean "
                      "up for backup %(backup)s.",
                      {'snap': backup.temp_snapshot_id,
                       'backup': backup.id})
        backup.temp_snapshot_id = None
        backup.save()
def create_backup(self, context, backup):
"""Create volume backups using configured backup service."""

View File

@@ -222,6 +222,7 @@ class BackupTestCase(BaseBackupTest):
temp_snapshot_id=temp_snap.id)
self.backup_mgr.init_host()
vol1 = db.volume_get(self.ctxt, vol1_id)
self.assertEqual('available', vol1['status'])
vol2 = db.volume_get(self.ctxt, vol2_id)
@@ -269,55 +270,201 @@ class BackupTestCase(BaseBackupTest):
mock_add_threadpool.assert_has_calls(calls, any_order=True)
self.assertEqual(2, mock_add_threadpool.call_count)
def test_init_host_handles_exception(self):
    """init_host returns None even when the cleanup step raises."""
    # Stub out driver initialisation; only the cleanup guard is under
    # test here.
    self.mock_object(self.backup_mgr, '_init_volume_driver')
    mock_cleanup = self.mock_object(
        self.backup_mgr,
        '_cleanup_incomplete_backup_operations')
    mock_cleanup.side_effect = [Exception]

    self.assertIsNone(self.backup_mgr.init_host())
def test_cleanup_incomplete_backup_operations_with_exceptions(self):
    """Test cleanup resilience in the face of exceptions.

    Every per-item helper is made to raise on every call; the cleanup
    must still return None and must still invoke each helper once per
    volume / backup.
    """
    fake_volume_list = [{'id': 'vol1'}, {'id': 'vol2'}]
    mock_volume_get_by_host = self.mock_object(
        db, 'volume_get_all_by_host')
    mock_volume_get_by_host.return_value = fake_volume_list

    # An exception class (not a one-element list) is used as side_effect
    # so that each call raises it, rather than exhausting an iterator
    # after the first call.
    mock_volume_cleanup = self.mock_object(
        self.backup_mgr, '_cleanup_one_volume')
    mock_volume_cleanup.side_effect = Exception

    fake_backup_list = [{'id': 'bkup1'}, {'id': 'bkup2'}, {'id': 'bkup3'}]
    mock_backup_get_by_host = self.mock_object(
        objects.BackupList, 'get_all_by_host')
    mock_backup_get_by_host.return_value = fake_backup_list

    mock_backup_cleanup = self.mock_object(
        self.backup_mgr, '_cleanup_one_backup')
    mock_backup_cleanup.side_effect = Exception

    mock_temp_cleanup = self.mock_object(
        self.backup_mgr, '_cleanup_temp_volumes_snapshots_for_one_backup')
    mock_temp_cleanup.side_effect = Exception

    self.assertIsNone(
        self.backup_mgr._cleanup_incomplete_backup_operations(
            self.ctxt))

    self.assertEqual(len(fake_volume_list), mock_volume_cleanup.call_count)
    self.assertEqual(len(fake_backup_list), mock_backup_cleanup.call_count)
    self.assertEqual(len(fake_backup_list), mock_temp_cleanup.call_count)
def test_cleanup_one_backing_up_volume(self):
    """A 'backing-up' volume is returned to its previous status."""
    get_manager = self.mock_object(self.backup_mgr, '_get_manager')
    get_manager.return_value = 'fake_manager'

    vol_id = self._create_volume_db_entry(status='backing-up',
                                          previous_status='available')
    stale_volume = db.volume_get(self.ctxt, vol_id)

    self.backup_mgr._cleanup_one_volume(self.ctxt, stale_volume)

    # Re-read from the DB: the update is not reflected on the old record.
    refreshed = db.volume_get(self.ctxt, vol_id)
    self.assertEqual('available', refreshed['status'])
def test_cleanup_one_restoring_backup_volume(self):
    """A 'restoring-backup' volume ends up in 'error_restoring'."""
    get_manager = self.mock_object(self.backup_mgr, '_get_manager')
    get_manager.return_value = 'fake_manager'

    vol_id = self._create_volume_db_entry(status='restoring-backup')
    stale_volume = db.volume_get(self.ctxt, vol_id)

    self.backup_mgr._cleanup_one_volume(self.ctxt, stale_volume)

    # Re-read from the DB: the update is not reflected on the old record.
    refreshed = db.volume_get(self.ctxt, vol_id)
    self.assertEqual('error_restoring', refreshed['status'])
def test_cleanup_one_creating_backup(self):
    """Test cleanup_one_backup for backup status 'creating'."""
    creating_backup = self._create_backup_db_entry(status='creating')

    self.backup_mgr._cleanup_one_backup(self.ctxt, creating_backup)

    self.assertEqual('error', creating_backup.status)
def test_cleanup_one_restoring_backup(self):
    """Test cleanup_one_backup for backup status 'restoring'."""
    restoring_backup = self._create_backup_db_entry(status='restoring')

    self.backup_mgr._cleanup_one_backup(self.ctxt, restoring_backup)

    self.assertEqual('available', restoring_backup.status)
def test_cleanup_one_deleting_backup(self):
    """Test cleanup_one_backup for backup status 'deleting'."""
    deleting_backup = self._create_backup_db_entry(status='deleting')

    self.backup_mgr._cleanup_one_backup(self.ctxt, deleting_backup)

    # The resumed delete must have removed the backup record.
    self.assertRaises(exception.BackupNotFound,
                      db.backup_get,
                      self.ctxt,
                      deleting_backup.id)
def test_detach_all_attachments_handles_exceptions(self):
    """Test detach_all_attachments with exceptions.

    detach_volume raises for every attachment; each failure must be
    logged and must not stop the remaining attachments being processed.
    """
    mock_log = self.mock_object(manager, 'LOG')
    mock_volume_mgr = mock.Mock()
    mock_detach_volume = mock_volume_mgr.detach_volume
    # An exception class (not a one-element list) is used as side_effect
    # so that both calls raise it, rather than exhausting an iterator
    # after the first call.
    mock_detach_volume.side_effect = Exception

    fake_attachments = [
        {
            'id': 'attachment1',
            'attached_host': 'testhost',
            'instance_uuid': None,
        },
        {
            'id': 'attachment2',
            'attached_host': 'testhost',
            'instance_uuid': None,
        }
    ]
    fake_volume = {
        'id': 'fake_volume_id',
        'volume_attachment': fake_attachments
    }

    self.backup_mgr._detach_all_attachments(self.ctxt,
                                            mock_volume_mgr,
                                            fake_volume)

    self.assertEqual(len(fake_attachments), mock_log.exception.call_count)
@ddt.data(KeyError, exception.VolumeNotFound)
def test_cleanup_temp_volumes_snapshots_for_one_backup_volume_not_found(
        self, err):
    """Ensure we handle missing volume for a backup."""
    # The source-volume lookup fails with the parametrised error.
    mock_volume_get = self.mock_object(db, 'volume_get')
    mock_volume_get.side_effect = [err]

    backup = self._create_backup_db_entry(status='creating')

    self.assertIsNone(
        self.backup_mgr._cleanup_temp_volumes_snapshots_for_one_backup(
            self.ctxt,
            backup))
def test_cleanup_temp_snapshot_for_one_backup_not_found(self):
    """Ensure we handle missing temp snapshot for a backup."""
    mock_delete_snapshot = self.mock_object(
        lvm.LVMVolumeDriver, 'delete_snapshot')

    vol1_id = self._create_volume_db_entry()
    self._create_volume_attach(vol1_id)
    db.volume_update(self.ctxt, vol1_id, {'status': 'backing-up'})
    # 'fake' does not correspond to a snapshot created by this test.
    backup = self._create_backup_db_entry(status='error',
                                          volume_id=vol1_id,
                                          temp_snapshot_id='fake')

    self.assertIsNone(
        self.backup_mgr._cleanup_temp_volumes_snapshots_for_one_backup(
            self.ctxt,
            backup))

    # The driver must not be asked to delete a snapshot that was never
    # found, and the dangling id must be cleared from the backup.
    self.assertFalse(mock_delete_snapshot.called)
    self.assertIsNone(backup.temp_snapshot_id)

    backup.destroy()
    db.volume_destroy(self.ctxt, vol1_id)
def test_cleanup_temp_volume_for_one_backup_not_found(self):
    """Ensure we handle missing temp volume for a backup."""
    mock_delete_volume = self.mock_object(
        lvm.LVMVolumeDriver, 'delete_volume')

    vol1_id = self._create_volume_db_entry()
    self._create_volume_attach(vol1_id)
    db.volume_update(self.ctxt, vol1_id, {'status': 'backing-up'})
    # 'fake' does not correspond to a volume created by this test.
    backup = self._create_backup_db_entry(status='error',
                                          volume_id=vol1_id,
                                          temp_volume_id='fake')

    self.assertIsNone(
        self.backup_mgr._cleanup_temp_volumes_snapshots_for_one_backup(
            self.ctxt,
            backup))

    # The driver must not be asked to delete a volume that was never
    # found, and the dangling id must be cleared from the backup.
    self.assertFalse(mock_delete_volume.called)
    self.assertIsNone(backup.temp_volume_id)

    backup.destroy()
    db.volume_destroy(self.ctxt, vol1_id)
def test_create_backup_with_bad_volume_status(self):