Merge "Implement host polling monitor plugin"

This commit is contained in:
Zuul 2018-02-05 09:59:47 +00:00 committed by Gerrit Code Review
commit 6f43a4e43b
6 changed files with 454 additions and 9 deletions

View File

@ -24,21 +24,21 @@ def load_monitors(plugins):
monitors = []
# Setup a notification monitor
notification_plugins = []
notification_plugins = set([])
for plugin in plugins.values():
if plugin.monitor:
if plugin.monitor.is_notification_enabled():
notification_plugins.append(plugin.monitor)
notification_plugins.add(plugin.monitor)
if notification_plugins:
monitors.append(
notification_monitor.NotificationMonitor(notification_plugins))
# Setup a polling monitor
polling_plugins = []
polling_plugins = set([])
for plugin in plugins.values():
if plugin.monitor:
if plugin.monitor.is_polling_enabled():
polling_plugins.append(plugin.monitor)
polling_plugins.add(plugin.monitor)
if polling_plugins:
monitors.append(polling_monitor.PollingMonitor(polling_plugins))

View File

@ -53,7 +53,10 @@ class BaseMonitor(object):
lease_ids = set([])
for reservation_id, flags in reservation_flags.items():
db_api.reservation_update(reservation_id, flags)
LOG.debug('Reservation %s was updated: %s',
reservation_id, flags)
reservation = db_api.reservation_get(reservation_id)
lease_ids.add(reservation['lease_id'])
for lease_id in lease_ids:
LOG.debug('Lease %s was updated: {"degraded": True}', lease_id)
db_api.lease_update(lease_id, {'degraded': True})

View File

@ -86,6 +86,16 @@ class BasePlugin(object):
"""Take actions before the end of a lease"""
pass
def heal_reservations(self, failed_resources):
    """Repair reservations that are affected by failed resources.

    This base implementation provides no healing logic and always raises.

    :param failed_resources: the resources that were reported as failed
    :return: a dictionary of {reservation id: flags to update}
             e.g. {'de27786d-bd96-46bb-8363-19c13b2c6657':
                   {'missing_resources': True}}
    :raises NotImplementedError: always
    """
    raise NotImplementedError()
@six.add_metaclass(abc.ABCMeta)
class BaseMonitorPlugin():

View File

@ -465,3 +465,17 @@ class VirtualInstancePlugin(base.BasePlugin, nova.NovaClientWrapper):
server.delete()
self.cleanup_resources(instance_reservation)
def heal_reservations(self, failed_resources):
    """Heal reservations which suffer from resource failures.

    Healing is not implemented for this plugin yet: the call only logs a
    warning and reports that no reservation needs to be updated.

    :param failed_resources: a list of failed hosts.
    :return: a dictionary of {reservation id: flags to update}
             e.g. {'de27786d-bd96-46bb-8363-19c13b2c6657':
                   {'missing_resources': True}}
             Currently always an empty dictionary.
    """
    # TODO(hiro-kobayashi): Implement this method
    LOG.warning('heal_reservations() is not implemented yet.')
    return {}

View File

@ -91,6 +91,7 @@ class PhysicalHostPlugin(base.BasePlugin, nova.NovaClientWrapper):
project_name=CONF.os_admin_project_name,
project_domain_name=CONF.os_admin_user_domain_name)
self.monitor = PhysicalHostMonitorPlugin()
self.monitor.register_healing_handler(self.heal_reservations)
def reserve_resource(self, reservation_id, values):
"""Create reservation."""
@ -209,6 +210,73 @@ class PhysicalHostPlugin(base.BasePlugin, nova.NovaClientWrapper):
except manager_ex.AggregateNotFound:
pass
def heal_reservations(self, failed_resources):
    """Heal reservations which suffer from resource failures.

    Every allocation that points at a failed host is looked up. For each
    one belonging to a reservation of this plugin's resource type, an
    alternative host is searched for the remaining lease period:

    * if one is found, the allocation is re-pointed at it; for an active
      reservation the aggregate membership is swapped as well and the
      reservation is flagged with 'resources_changed';
    * otherwise the allocation is destroyed and the reservation is
      flagged with 'missing_resources'.

    :param failed_resources: a list of failed hosts.
    :return: a dictionary of {reservation id: flags to update}
             e.g. {'de27786d-bd96-46bb-8363-19c13b2c6657':
                   {'missing_resources': True}}
    """
    reservation_flags = {}

    failed_allocs = []
    for failed_host in failed_resources:
        failed_allocs += db_api.host_allocation_get_all_by_values(
            compute_host_id=failed_host['id'])

    for alloc in failed_allocs:
        reservation = db_api.reservation_get(alloc['reservation_id'])
        if reservation['resource_type'] != plugin.RESOURCE_TYPE:
            continue
        lease = db_api.lease_get(reservation['lease_id'])
        is_active = reservation['status'] == status.reservation.ACTIVE
        host_reservation = None
        pool = None

        # Remove the failed host from the aggregate.
        if is_active:
            old_host = db_api.host_get(alloc['compute_host_id'])
            host_reservation = db_api.host_reservation_get(
                reservation['resource_id'])
            with trusts.create_ctx_from_trust(lease['trust_id']):
                pool = nova.ReservationPool()
                pool.remove_computehost(host_reservation['aggregate_id'],
                                        old_host['service_name'])

        # Allocate alternative resource. A lease that already started can
        # only be healed from now on, hence the max() with the current time.
        start_date = max(datetime.datetime.utcnow(), lease['start_date'])
        new_hostids = self._matching_hosts(
            reservation['hypervisor_properties'],
            reservation['resource_properties'],
            '1-1', start_date, lease['end_date']
        )
        if not new_hostids:
            reservation_flags.setdefault(reservation['id'], {}).update(
                {'missing_resources': True})
            db_api.host_allocation_destroy(alloc['id'])
            LOG.warning('Could not find alternative host for reservation '
                        '%s (lease: %s).', reservation['id'],
                        lease['name'])
        else:
            new_hostid = new_hostids.pop()
            db_api.host_allocation_update(alloc['id'],
                                          {'compute_host_id': new_hostid})
            if is_active:
                # Add the alternative host into the aggregate.
                new_host = db_api.host_get(new_hostid)
                with trusts.create_ctx_from_trust(lease['trust_id']):
                    pool.add_computehost(host_reservation['aggregate_id'],
                                         new_host['service_name'])
                reservation_flags.setdefault(reservation['id'], {}).update(
                    {'resources_changed': True})
            LOG.warning('Resource changed for reservation %s (lease: %s).',
                        reservation['id'], lease['name'])

    return reservation_flags
def _get_extra_capabilities(self, host_id):
extra_capabilities = {}
raw_extra_capabilities = (
@ -535,8 +603,30 @@ class PhysicalHostPlugin(base.BasePlugin, nova.NovaClientWrapper):
return db_api.host_list()
class PhysicalHostMonitorPlugin(base.BaseMonitorPlugin):
class PhysicalHostMonitorPlugin(base.BaseMonitorPlugin,
nova.NovaClientWrapper):
"""Monitor plugin for physical host resource."""
# Singleton design pattern
_instance = None
def __new__(cls):
    """Return the shared instance, creating it on the first call."""
    shared = cls._instance
    if not shared:
        shared = super(PhysicalHostMonitorPlugin, cls).__new__(cls)
        cls._instance = shared
    return shared
def __init__(self):
    """Set up the Nova client credentials and the healing handler list.

    __new__ hands back one shared instance, and Python runs __init__ on
    every PhysicalHostMonitorPlugin() call. Without the guard below each
    call would reset healing_handlers and silently drop the handlers that
    were registered earlier, so the set-up is performed only once.
    """
    if getattr(self, '_initialized', False):
        return
    super(PhysicalHostMonitorPlugin, self).__init__(
        username=CONF.os_admin_username,
        password=CONF.os_admin_password,
        user_domain_name=CONF.os_admin_user_domain_name,
        project_name=CONF.os_admin_project_name,
        project_domain_name=CONF.os_admin_user_domain_name)
    self.healing_handlers = []
    self._initialized = True
def register_healing_handler(self, handler):
    """Append a callable to the list of healing handlers.

    :param handler: the callable to store in healing_handlers
    """
    handlers = self.healing_handlers
    handlers.append(handler)
def is_notification_enabled(self):
    """Return the enable_notification_monitor option of this resource type."""
    resource_conf = CONF[plugin.RESOURCE_TYPE]
    return resource_conf.enable_notification_monitor
@ -575,9 +665,64 @@ class PhysicalHostMonitorPlugin(base.BaseMonitorPlugin):
return CONF[plugin.RESOURCE_TYPE].polling_interval
def poll(self):
    """Detect and handle resource failures.

    Failed hosts are looked up first; when there are any, each one is
    logged and the failures are handed to _handle_failures().

    :return: a dictionary of {reservation id: flags to update}
             e.g. {'de27786d-bd96-46bb-8363-19c13b2c6657':
                   {'missing_resources': True}}
             Empty when no host failure was detected.
    """
    LOG.trace('Poll...')
    reservation_flags = {}

    failed_hosts = self._poll_resource_failures()
    if failed_hosts:
        for host in failed_hosts:
            LOG.warning('%s failed.', host['hypervisor_hostname'])
        reservation_flags = self._handle_failures(failed_hosts)

    return reservation_flags
def _poll_resource_failures(self):
    """Ask the Nova Hypervisors API about every reservable host.

    A host counts as failed when its hypervisor state is 'down' or its
    status is 'disabled'. A host whose check raises is logged and left out
    of the result.

    :return: a list of failed hosts.
    """
    unhealthy = []
    for candidate in db_api.reservable_host_get_all_by_queries([]):
        with trusts.create_ctx_from_trust(candidate['trust_id']):
            try:
                hypervisor = self.nova.hypervisors.get(candidate['id'])
                LOG.debug('%s: state=%s, status=%s.',
                          hypervisor.hypervisor_hostname,
                          hypervisor.state,
                          hypervisor.status)
                is_failed = (hypervisor.state == 'down' or
                             hypervisor.status == 'disabled')
                if is_failed:
                    unhealthy.append(candidate)
            except Exception as e:
                LOG.exception('Skipping health check of host %s. %s',
                              candidate['hypervisor_hostname'], str(e))

    return unhealthy
def _handle_failures(self, failed_hosts):
    """Handle resource failures.

    Each failed host is first marked as not reservable; a failure to
    update one host is logged and does not stop the others. Then every
    registered healing handler is called with the failed hosts and the
    flags they return are merged per reservation, so that flags reported
    for the same reservation by different handlers are all kept.

    :param failed_hosts: a list of failed hosts.
    :return: a dictionary of {reservation id: flags to update}
             e.g. {'de27786d-bd96-46bb-8363-19c13b2c6657':
                   {'missing_resources': True}}
    """
    # Update the computehosts table
    for host in failed_hosts:
        try:
            db_api.host_update(host['id'], {'reservable': False})
        except Exception as e:
            LOG.exception('Failed to update %s. %s',
                          host['hypervisor_hostname'], str(e))

    # Heal related reservations
    reservation_flags = {}
    for handler in self.healing_handlers:
        for reservation_id, flags in handler(failed_hosts).items():
            reservation_flags.setdefault(reservation_id, {}).update(flags)

    return reservation_flags

View File

@ -1379,6 +1379,187 @@ class PhysicalHostPluginTestCase(tests.TestCase):
delete_server.assert_not_called()
delete_pool.assert_called_with(1)
def test_heal_reservations_before_start_and_resources_changed(self):
    """A pending reservation is moved to a replacement host.

    The lease has not started at the patched current time, so the
    allocation is re-pointed and no reservation flag is returned.
    """
    replacement_id = '2'
    lease_start = datetime.datetime(2020, 1, 1, 12, 0)
    lease_end = datetime.datetime(2020, 1, 2, 12, 0)

    db = self.db_api
    self.patch(db, 'host_allocation_get_all_by_values').return_value = [
        {'id': 'alloc-1',
         'compute_host_id': '1',
         'reservation_id': 'rsrv-1'}]
    self.patch(db, 'reservation_get').return_value = {
        'id': 'rsrv-1',
        'resource_type': plugin.RESOURCE_TYPE,
        'lease_id': 'lease-1',
        'status': 'pending',
        'hypervisor_properties': [],
        'resource_properties': [],
        'resource_id': 'resource-1'}
    self.patch(db, 'host_get').return_value = {'service_name': 'compute'}
    self.patch(db, 'host_reservation_get').return_value = {
        'aggregate_id': 1}
    self.patch(db, 'lease_get').return_value = {
        'name': 'lease-name',
        'start_date': lease_start,
        'end_date': lease_end,
        'trust_id': 'trust-1'}
    destroy_alloc = self.patch(db, 'host_allocation_destroy')
    update_alloc = self.patch(db, 'host_allocation_update')
    find_hosts = self.patch(host_plugin.PhysicalHostPlugin,
                            '_matching_hosts')
    find_hosts.return_value = [replacement_id]

    # One hour before the lease starts.
    fake_datetime = mock.Mock(wraps=datetime.datetime)
    fake_datetime.utcnow.return_value = datetime.datetime(2020, 1, 1, 11, 0)
    with mock.patch.object(datetime, 'datetime', fake_datetime):
        flags = self.fake_phys_plugin.heal_reservations([{'id': '1'}])

    destroy_alloc.assert_not_called()
    find_hosts.assert_called_once_with([], [], '1-1',
                                       lease_start, lease_end)
    update_alloc.assert_called_once_with(
        'alloc-1', {'compute_host_id': replacement_id})
    self.assertEqual({}, flags)
def test_heal_reservations_before_start_and_missing_resources(self):
    """A pending reservation without a replacement host loses its allocation.

    No alternative host matches, so the allocation is destroyed and the
    reservation is flagged with 'missing_resources'.
    """
    lease_start = datetime.datetime(2020, 1, 1, 12, 0)
    lease_end = datetime.datetime(2020, 1, 2, 12, 0)

    db = self.db_api
    self.patch(db, 'host_allocation_get_all_by_values').return_value = [
        {'id': 'alloc-1',
         'compute_host_id': '1',
         'reservation_id': 'rsrv-1'}]
    self.patch(db, 'reservation_get').return_value = {
        'id': 'rsrv-1',
        'resource_type': plugin.RESOURCE_TYPE,
        'lease_id': 'lease-1',
        'status': 'pending',
        'hypervisor_properties': [],
        'resource_properties': [],
        'resource_id': 'resource-1'}
    self.patch(db, 'host_get').return_value = {'service_name': 'compute'}
    self.patch(db, 'host_reservation_get').return_value = {
        'aggregate_id': 1}
    self.patch(db, 'lease_get').return_value = {
        'name': 'lease-name',
        'start_date': lease_start,
        'end_date': lease_end,
        'trust_id': 'trust-1'}
    destroy_alloc = self.patch(db, 'host_allocation_destroy')
    update_alloc = self.patch(db, 'host_allocation_update')
    find_hosts = self.patch(host_plugin.PhysicalHostPlugin,
                            '_matching_hosts')
    find_hosts.return_value = []

    # One hour before the lease starts.
    fake_datetime = mock.Mock(wraps=datetime.datetime)
    fake_datetime.utcnow.return_value = datetime.datetime(2020, 1, 1, 11, 0)
    with mock.patch.object(datetime, 'datetime', fake_datetime):
        flags = self.fake_phys_plugin.heal_reservations([{'id': '1'}])

    destroy_alloc.assert_called_once_with('alloc-1')
    find_hosts.assert_called_once_with([], [], '1-1',
                                       lease_start, lease_end)
    update_alloc.assert_not_called()
    self.assertEqual({'rsrv-1': {'missing_resources': True}}, flags)
def test_heal_active_reservations_and_resources_changed(self):
    """An active reservation is moved to a replacement host.

    The patched current time is after the lease start, so the search
    starts from the current time; the replacement host is added to the
    aggregate and the reservation is flagged with 'resources_changed'.
    """
    replacement_id = '2'
    now = datetime.datetime(2020, 1, 1, 13, 0)
    lease_end = datetime.datetime(2020, 1, 2, 12, 0)

    db = self.db_api
    self.patch(db, 'host_allocation_get_all_by_values').return_value = [
        {'id': 'alloc-1',
         'compute_host_id': '1',
         'reservation_id': 'rsrv-1'}]
    self.patch(db, 'reservation_get').return_value = {
        'id': 'rsrv-1',
        'resource_type': plugin.RESOURCE_TYPE,
        'lease_id': 'lease-1',
        'status': 'active',
        'hypervisor_properties': [],
        'resource_properties': [],
        'resource_id': 'resource-1'}
    self.patch(db, 'host_get').return_value = {'service_name': 'compute'}
    self.patch(db, 'host_reservation_get').return_value = {
        'aggregate_id': 1}
    self.patch(db, 'lease_get').return_value = {
        'name': 'lease-name',
        'start_date': datetime.datetime(2020, 1, 1, 12, 0),
        'end_date': lease_end,
        'trust_id': 'trust-1'}
    destroy_alloc = self.patch(db, 'host_allocation_destroy')
    update_alloc = self.patch(db, 'host_allocation_update')
    find_hosts = self.patch(host_plugin.PhysicalHostPlugin,
                            '_matching_hosts')
    find_hosts.return_value = [replacement_id]

    fake_datetime = mock.Mock(wraps=datetime.datetime)
    fake_datetime.utcnow.return_value = now
    with mock.patch.object(datetime, 'datetime', fake_datetime):
        flags = self.fake_phys_plugin.heal_reservations([{'id': '1'}])

    destroy_alloc.assert_not_called()
    find_hosts.assert_called_once_with([], [], '1-1', now, lease_end)
    update_alloc.assert_called_once_with(
        'alloc-1', {'compute_host_id': replacement_id})
    self.add_compute_host.assert_called_once_with(1, 'compute')
    self.assertEqual({'rsrv-1': {'resources_changed': True}}, flags)
def test_heal_active_reservations_and_missing_resources(self):
    """An active reservation without a replacement host loses its allocation.

    The reservation status is 'active' so that the branch handling
    running reservations is really exercised: the failed host's records
    are looked up for removal from the aggregate, no alternative host
    matches, the allocation is destroyed and the reservation is flagged
    with 'missing_resources'.
    """
    failed_hosts = [{'id': '1'}]
    alloc_get = self.patch(self.db_api,
                           'host_allocation_get_all_by_values')
    alloc_get.return_value = [{'id': 'alloc-1',
                               'compute_host_id': '1',
                               'reservation_id': 'rsrv-1'}]
    alloc_destroy = self.patch(self.db_api, 'host_allocation_destroy')
    reservation_get = self.patch(self.db_api, 'reservation_get')
    reservation_get.return_value = {'id': 'rsrv-1',
                                    'resource_type': plugin.RESOURCE_TYPE,
                                    'lease_id': 'lease-1',
                                    'status': 'active',
                                    'hypervisor_properties': [],
                                    'resource_properties': [],
                                    'resource_id': 'resource-1'}
    host_get = self.patch(self.db_api, 'host_get')
    host_get.return_value = {'service_name': 'compute'}
    host_reservation_get = self.patch(self.db_api, 'host_reservation_get')
    host_reservation_get.return_value = {'aggregate_id': 1}
    lease_get = self.patch(self.db_api, 'lease_get')
    lease_get.return_value = {
        'name': 'lease-name',
        'start_date': datetime.datetime(2020, 1, 1, 12, 00),
        'end_date': datetime.datetime(2020, 1, 2, 12, 00),
        'trust_id': 'trust-1'}
    matching_hosts = self.patch(host_plugin.PhysicalHostPlugin,
                                '_matching_hosts')
    matching_hosts.return_value = []
    alloc_update = self.patch(self.db_api, 'host_allocation_update')

    with mock.patch.object(datetime, 'datetime',
                           mock.Mock(wraps=datetime.datetime)) as patched:
        patched.utcnow.return_value = datetime.datetime(2020, 1, 1,
                                                        13, 00)
        result = self.fake_phys_plugin.heal_reservations(failed_hosts)

    # Only the active path reads the failed host and its host reservation.
    host_get.assert_called_once_with('1')
    host_reservation_get.assert_called_once_with('resource-1')
    alloc_destroy.assert_called_once_with('alloc-1')
    matching_hosts.assert_called_once_with(
        [], [], '1-1',
        datetime.datetime(2020, 1, 1, 13, 00),
        datetime.datetime(2020, 1, 2, 12, 00))
    alloc_update.assert_not_called()
    self.assertEqual({'rsrv-1': {'missing_resources': True}}, result)
def test_matching_hosts_not_allocated_hosts(self):
def host_allocation_get_all_by_values(**kwargs):
if kwargs['compute_host_id'] == 'host1':
@ -1482,3 +1663,95 @@ class PhysicalHostPluginTestCase(tests.TestCase):
}
self.fake_phys_plugin._check_params(values)
self.assertEqual(values['before_end'], 'default')
class PhysicalHostMonitorPluginTestCase(tests.TestCase):
    """Unit tests for failure polling and handling of the host monitor."""

    def setUp(self):
        super(PhysicalHostMonitorPluginTestCase, self).setUp()
        # Stub out the Nova client, URL lookup, context and trust helpers.
        self.patch(nova_client, 'Client')
        self.patch(base, 'url_for').return_value = 'http://foo.bar'
        self.patch(context, 'BlazarContext')
        self.patch(trusts, 'create_ctx_from_trust')
        self.host_monitor_plugin = host_plugin.PhysicalHostMonitorPlugin()

    @staticmethod
    def _reservable_hosts():
        """Build the two host records returned by the patched DB query."""
        return [
            {'id': host_id,
             'hypervisor_hostname': 'compute-%s' % host_id,
             'trust_id': 'trust-%s' % host_id}
            for host_id in ('1', '2')
        ]

    def _poll_with_hypervisor(self, hv_state, hv_status):
        """Poll while every hypervisor reports the given state and status.

        :return: a tuple of (hosts returned by the DB query, polling result)
        """
        hosts = self._reservable_hosts()
        self.patch(
            db_api, 'reservable_host_get_all_by_queries'
        ).return_value = hosts
        self.patch(
            self.host_monitor_plugin.nova.hypervisors, 'get'
        ).return_value = mock.MagicMock(state=hv_state, status=hv_status)
        return hosts, self.host_monitor_plugin._poll_resource_failures()

    def test_poll_resource_failures_state_down(self):
        hosts, failed = self._poll_with_hypervisor('down', 'enabled')
        self.assertEqual(hosts, failed)

    def test_poll_resource_failures_status_disabled(self):
        hosts, failed = self._poll_with_hypervisor('up', 'disabled')
        self.assertEqual(hosts, failed)

    def test_poll_resource_failures_nothing(self):
        _, failed = self._poll_with_hypervisor('up', 'enabled')
        self.assertEqual([], failed)

    def test_handle_failures(self):
        failed = [{'id': '1', 'hypervisor_hostname': 'compute-1'}]
        expected_flags = {'rsrv-1': {'missing_resources': True}}
        update_host = self.patch(db_api, 'host_update')
        healer = self.patch(host_plugin.PhysicalHostPlugin,
                            'heal_reservations')
        healer.return_value = expected_flags
        self.host_monitor_plugin.healing_handlers = [healer]

        flags = self.host_monitor_plugin._handle_failures(failed)

        update_host.assert_called_once_with('1', {'reservable': False})
        self.assertEqual(expected_flags, flags)