# Copyright 2014 DreamHost, LLC
#
# Author: DreamHost, LLC
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.

from datetime import datetime
import time

from oslo_config import cfg

from astara.common.i18n import _LE, _LI
from astara.drivers import states

CONF = cfg.CONF
INSTANCE_MANAGER_OPTS = [
    cfg.IntOpt(
        'hotplug_timeout',
        default=10,
        help='The amount of time to wait for nova to hotplug/unplug '
             'networks from the instances.'),
    cfg.IntOpt(
        'boot_timeout', default=600),
    cfg.IntOpt(
        'error_state_cooldown',
        default=30,
        help='Number of seconds to ignore new events when an instance goes '
             'into ERROR state.',
    ),
]
CONF.register_opts(INSTANCE_MANAGER_OPTS)


def synchronize_driver_state(f):
    """Decorator that triggers a driver's synchronize_state() function."""
    def wrapper(self, *args, **kw):
        state = f(self, *args, **kw)
        self.driver.synchronize_state(*args, state=state)
        return state
    return wrapper
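
# Illustrative sketch (not part of the original module): any InstanceManager
# method that returns the new state can be wrapped with
# synchronize_driver_state so the driver is notified automatically, e.g.:
#
#     @synchronize_driver_state
#     def force_down(self, worker_context):   # hypothetical method
#         self.state = states.DOWN
#         return self.state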


def ensure_cache(f):
    """Decorator to wrap around any function that uses self.instance_info.

    Ensures that self.instance_info is up to date and catches instances in
    a GONE or missing state before wasting cycles trying to do something
    with them.

    NOTE: This replaces the old _ensure_cache function, reworked as a
    decorator rather than being called explicitly at the start of all those
    functions.
    """
    def wrapper(self, worker_context, *args, **kw):
        # Ensure that self.instance_info is current before doing anything.
        if not self.instance_info:
            # Attempt to populate instance_info.
            self.instance_info = (
                worker_context.nova_client.get_instance_info(self.driver.name)
            )

            if self.instance_info:
                (
                    self.instance_info.management_port,
                    self.instance_info.ports
                ) = worker_context.neutron.get_ports_for_instance(
                    self.instance_info.id_
                )

        return f(self, worker_context, *args, **kw)

    return wrapper
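
# Illustrative sketch (hypothetical method name): wrapping a method with
# ensure_cache guarantees self.instance_info has been populated, and its
# ports looked up, before the body runs, e.g.:
#
#     @ensure_cache
#     def port_count(self, worker_context):
#         return len(self.instance_info.ports)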


class BootAttemptCounter(object):
    def __init__(self):
        self._attempts = 0

    def start(self):
        self._attempts += 1

    def reset(self):
        self._attempts = 0

    @property
    def count(self):
        return self._attempts


class InstanceManager(object):

    def __init__(self, driver, resource_id, worker_context):
        """The instance manager is your interface to the running instance,
        whether it is virtual, containerized or physical.

        Service-specific code lives in the driver, which is passed in here.

        :param driver: driver object
        :param resource_id: UUID of logical resource
        :param worker_context:
        """
        self.driver = driver
        self.id = resource_id
        self.log = self.driver.log

        self.state = states.DOWN

        self.instance_info = None
        self.last_error = None
        self._boot_counter = BootAttemptCounter()
        self._last_synced_status = None

        self.state = self.update_state(worker_context, silent=True)

    @property
    def attempts(self):
        """Property which returns the boot count.

        :returns: int
        """
        return self._boot_counter.count

    def reset_boot_counter(self):
        """Resets the boot counter.

        :returns: None
        """
        self._boot_counter.reset()

    @synchronize_driver_state
    def update_state(self, worker_context, silent=False):
        """Updates the state of the instance and, by extension, its
        logical resource.

        :param worker_context:
        :param silent:
        :returns: state
        """
        self._ensure_cache(worker_context)

        if self.driver.get_state(worker_context) == states.GONE:
            self.log.debug('%s driver reported its state is GONE',
                           self.driver.RESOURCE_NAME)
            self.state = states.GONE
            return self.state

        if self.instance_info is None:
            self.log.info(_LI('no backing instance, marking as down'))
            self.state = states.DOWN
            return self.state

        addr = self.instance_info.management_address
        if not addr:
            self.log.debug('waiting for instance ports to be attached')
            self.state = states.BOOTING
            return self.state

        for i in xrange(cfg.CONF.max_retries):
            if self.driver.is_alive(self.instance_info.management_address):
                if self.state != states.CONFIGURED:
                    self.state = states.UP
                break
            if not silent:
                self.log.debug('Alive check failed. Attempt %d of %d',
                               i + 1,
                               cfg.CONF.max_retries)
            time.sleep(cfg.CONF.retry_delay)
        else:
            old_state = self.state
            self._check_boot_timeout()

            # If the instance isn't responding, make sure Nova knows about it
            instance = worker_context.nova_client.get_instance_for_obj(
                self.id)
            if instance is None and self.state != states.ERROR:
                self.log.info('No instance was found; rebooting')
                self.state = states.DOWN
                self.instance_info = None

            # update_state() is called from Alive() to check the
            # status of the router. If we can't talk to the API at
            # that point, the router should be considered missing and
            # we should reboot it, so mark it states.DOWN if we think it was
            # configured before.
            if old_state == states.CONFIGURED and self.state != states.ERROR:
                self.log.debug('Instance not alive, marking it as DOWN')
                self.state = states.DOWN

        # After the instance is all the way up, record how long it took
        # to boot and accept a configuration.
        self.instance_info = (
            worker_context.nova_client.update_instance_info(
                self.instance_info))

        if not self.instance_info.booting and self.state == states.CONFIGURED:
            # If we didn't boot the server (because we were restarted
            # while it remained running, for example), we won't have a
            # duration to log.
            if self.instance_info.time_since_boot:
                self.log.info(
                    '%s booted in %s seconds after %s attempts',
                    self.driver.RESOURCE_NAME,
                    self.instance_info.time_since_boot.total_seconds(),
                    self._boot_counter.count)

        # Always reset the boot counter, even if we didn't boot
        # the server ourself, so we don't accidentally think we
        # have an erroring router.
        self._boot_counter.reset()
        return self.state
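
    # NOTE: update_state() above relies on Python's for/else semantics: the
    # else branch runs only when the loop exhausts every retry without
    # hitting `break`, i.e. when every alive check failed.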

    def boot(self, worker_context):
        """Boots the instance with driver pre/post boot hooks.

        :returns: None
        """
        self._ensure_cache(worker_context)

        self.log.info('Booting %s', self.driver.RESOURCE_NAME)
        self.state = states.DOWN
        self._boot_counter.start()

        # driver preboot hook
        self.driver.pre_boot(worker_context)

        # try to boot the instance
        try:
            instance_info = worker_context.nova_client.boot_instance(
                resource_type=self.driver.RESOURCE_NAME,
                prev_instance_info=self.instance_info,
                name=self.driver.name,
                image_uuid=self.driver.image_uuid,
                flavor=self.driver.flavor,
                make_ports_callback=self.driver.make_ports(worker_context)
            )
            if not instance_info:
                self.log.info(_LI('Previous instance is still deleting'))
                # Reset the boot counter, causing the state machine to start
                # again with a new Instance.
                self.reset_boot_counter()
                self.instance_info = None
                return
        except Exception:
            self.log.exception(_LE('Instance failed to start boot'))
            self.driver.delete_ports(worker_context)
        else:
            # We have successfully started a (re)boot attempt so
            # record the timestamp so we can report how long it takes.
            self.state = states.BOOTING
            self.instance_info = instance_info

        # driver post boot hook
        self.driver.post_boot(worker_context)
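
    # NOTE: post_boot() runs whether or not the nova boot call above
    # succeeded, so drivers should tolerate being invoked without a
    # running instance.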

    def check_boot(self, worker_context):
        """Checks the status of the instance and, if ready, triggers
        self.configure().
        """
        state = self.update_state(worker_context, silent=True)
        if state in states.READY_STATES:
            self.log.info('Instance has booted, attempting initial config')
            self.configure(worker_context)
            if self.state != states.CONFIGURED:
                self._check_boot_timeout()
            return self.state == states.CONFIGURED

        self.log.debug('Instance is %s', self.state.upper())
        return False

    @synchronize_driver_state
    def set_error(self, worker_context, silent=False):
        """Set the internal and neutron status for the router to
        states.ERROR.

        This is called from outside when something notices the router
        is "broken". We don't use it internally because this class is
        supposed to do what it's told and not make decisions about
        whether or not the router is fatally broken.
        """
        self._ensure_cache(worker_context)
        self.state = states.ERROR
        self.last_error = datetime.utcnow()
        return self.state

    @synchronize_driver_state
    def clear_error(self, worker_context, silent=False):
        """Clear the internal error state.

        This is called from outside when something wants to force a
        router rebuild, so that the state machine that checks our
        status won't think we are broken unless we actually break
        again.
        """
        # Clear the boot counter.
        self._boot_counter.reset()
        self._ensure_cache(worker_context)
        self.state = states.DOWN
        return self.state

    @property
    def error_cooldown(self):
        """Returns True if the instance was recently put into states.ERROR.
        """
        if self.last_error and self.state == states.ERROR:
            seconds_since_error = (
                datetime.utcnow() - self.last_error
            ).total_seconds()
            if seconds_since_error < cfg.CONF.error_state_cooldown:
                return True
        return False
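
    # Worked example: with the default error_state_cooldown of 30, an
    # instance that entered ERROR 10 seconds ago reports error_cooldown ==
    # True (so new events are ignored); once 30 seconds have passed it
    # reports False again.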

    @synchronize_driver_state
    def stop(self, worker_context):
        """Attempts to destroy the instance within the configured timeout.

        :param worker_context:
        :returns:
        """
        self._ensure_cache(worker_context)
        self.log.info(_LI('Destroying instance'))

        if not self.instance_info:
            self.log.info(_LI('Instance already destroyed.'))
            return states.GONE

        worker_context.neutron.delete_vrrp_port(self.driver.id)
        worker_context.neutron.delete_vrrp_port(self.driver.id, label='MGT')

        try:
            worker_context.nova_client.destroy_instance(self.instance_info)
        except Exception:
            self.log.exception(_LE('Error deleting router instance'))

        start = time.time()
        i = 0
        while time.time() - start < cfg.CONF.boot_timeout:
            i += 1
            if not worker_context.nova_client.get_instance_by_id(
                    self.instance_info.id_):
                if self.state != states.GONE:
                    self.state = states.DOWN
                return self.state
            self.log.debug('Router has not finished stopping')
            time.sleep(cfg.CONF.retry_delay)
        self.log.error(_LE(
            'Router failed to stop within %d secs'),
            cfg.CONF.boot_timeout)
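
    # NOTE: stop() reuses boot_timeout as its deadline for the instance to
    # disappear from Nova, polling once every retry_delay seconds.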

    @synchronize_driver_state
    def configure(self, worker_context):
        """Pushes config to the instance.

        :param worker_context:
        :returns:
        """
        self.log.debug('Begin instance config')
        self.state = states.UP
        attempts = cfg.CONF.max_retries

        self._ensure_cache(worker_context)
        if self.driver.get_state(worker_context) == states.GONE:
            return states.GONE

        interfaces = self.driver.get_interfaces(
            self.instance_info.management_address)

        if not self._verify_interfaces(self.driver.ports, interfaces):
            # FIXME: Need a states.REPLUG state when we support hot-plugging
            # interfaces.
            self.log.debug("Interfaces aren't plugged as expected.")
            self.state = states.REPLUG
            return self.state

        # TODO(mark): We're in the first phase of VRRP, so we need to map
        # the interface to the network ID.
        # Eventually we'll send VRRP data and real interface data
        port_mac_to_net = {
            p.mac_address: p.network_id
            for p in self.instance_info.ports
        }
        # Add in the management port
        mgt_port = self.instance_info.management_port
        port_mac_to_net[mgt_port.mac_address] = mgt_port.network_id
        # This maps network IDs to logical interface names
        iface_map = {
            port_mac_to_net[i['lladdr']]: i['ifname']
            for i in interfaces if i['lladdr'] in port_mac_to_net
        }

        # Send all the standard config over to the driver for final updates
        config = self.driver.build_config(
            worker_context,
            mgt_port,
            iface_map
        )
        self.log.debug('preparing to update config to %r', config)

        for i in xrange(attempts):
            try:
                self.driver.update_config(
                    self.instance_info.management_address,
                    config)
            except Exception:
                if i == attempts - 1:
                    # Only log the traceback if we encounter it many times.
                    self.log.exception(_LE('failed to update config'))
                else:
                    self.log.debug(
                        'failed to update config, attempt %d',
                        i + 1
                    )
                time.sleep(cfg.CONF.retry_delay)
            else:
                self.state = states.CONFIGURED
                self.log.info('Instance config updated')
                return self.state
        else:
            self.state = states.RESTART
            return self.state
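
    # Illustrative shapes of the intermediate structures built by
    # configure() above (hypothetical values):
    #
    #     port_mac_to_net = {'fa:16:3e:aa:bb:cc': 'net-1'}
    #     interfaces = [{'lladdr': 'fa:16:3e:aa:bb:cc', 'ifname': 'eth1'}]
    #     iface_map = {'net-1': 'eth1'}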

    def replug(self, worker_context):
        """Attempts to replug the network ports for an instance.

        :param worker_context:
        :returns:
        """
        self.log.debug('Attempting to replug...')

        self.driver.pre_plug(worker_context)

        interfaces = self.driver.get_interfaces(
            self.instance_info.management_address)

        actual_macs = set((iface['lladdr'] for iface in interfaces))
        instance_macs = set(p.mac_address for p in self.instance_info.ports)
        instance_macs.add(self.instance_info.management_port.mac_address)

        if instance_macs != actual_macs:
            # Our cached copy of the ports is wrong; reboot and clean up.
            self.log.warning(
                ('Instance MACs (%s) do not match actual MACs (%s). '
                 'Instance cache appears out-of-sync'),
                instance_macs, actual_macs
            )
            self.state = states.RESTART
            return

        instance_ports = {p.network_id: p for p in self.instance_info.ports}
        instance_networks = set(instance_ports.keys())

        logical_networks = set(p.network_id for p in self.driver.ports)

        if logical_networks != instance_networks:
            instance = worker_context.nova_client.get_instance_by_id(
                self.instance_info.id_
            )

            # For each logical network not yet attached to the instance...
            for network_id in logical_networks - instance_networks:
                port = worker_context.neutron.create_vrrp_port(
                    self.driver.id,
                    network_id
                )
                self.log.debug(
                    'Net %s is missing from the router, plugging: %s',
                    network_id, port.id
                )

                try:
                    instance.interface_attach(port.id, None, None)
                except Exception:
                    self.log.exception('Interface attach failed')
                    self.state = states.RESTART
                    return
                self.instance_info.ports.append(port)

            for network_id in instance_networks - logical_networks:
                port = instance_ports[network_id]
                self.log.debug(
                    'Net %s is detached from the router, unplugging: %s',
                    network_id, port.id
                )

                try:
                    instance.interface_detach(port.id)
                except Exception:
                    self.log.exception('Interface detach failed')
                    self.state = states.RESTART
                    return

                self.instance_info.ports.remove(port)

        # The action of attaching/detaching interfaces in Nova happens via
        # the message bus and is *not* blocking. We need to wait a few
        # seconds to see if the list of tap devices on the appliance
        # actually changed. If not, assume the hotplug failed, and reboot
        # the instance.
        replug_seconds = cfg.CONF.hotplug_timeout
        while replug_seconds > 0:
            self.log.debug(
                "Waiting for interface attachments to take effect..."
            )
            interfaces = self.driver.get_interfaces(
                self.instance_info.management_address)

            if self._verify_interfaces(self.driver.ports, interfaces):
                # Replugging was successful.
                # TODO(mark) update port states
                return

            time.sleep(1)
            replug_seconds -= 1

        self.log.debug("Interfaces aren't plugged as expected, rebooting.")
        self.state = states.RESTART
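
    # NOTE: replug() polls the appliance once per second for up to
    # hotplug_timeout seconds (default 10); if the expected interfaces
    # never appear, the instance is marked for restart.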

    def _ensure_cache(self, worker_context):
        self.instance_info = (
            worker_context.nova_client.get_instance_info(self.driver.name)
        )

        if self.instance_info:
            (
                self.instance_info.management_port,
                self.instance_info.ports
            ) = worker_context.neutron.get_ports_for_instance(
                self.instance_info.id_
            )

    def _check_boot_timeout(self):
        """If the instance was created more than `boot_timeout` seconds
        ago, log an error and set the state to states.DOWN.
        """
        time_since_boot = self.instance_info.time_since_boot

        if time_since_boot:
            if time_since_boot.total_seconds() < cfg.CONF.boot_timeout:
                # Do not reset the state if we have an error
                # condition already. The state will be reset when
                # the router starts responding again, or when the
                # error is cleared from a forced rebuild.
                if self.state != states.ERROR:
                    self.state = states.BOOTING
            else:
                # If the instance was created more than `boot_timeout`
                # seconds ago, log an error and set the state to
                # states.DOWN.
                self.log.info(
                    'Router is DOWN. Created over %d secs ago.',
                    cfg.CONF.boot_timeout)
                # Do not reset the state if we have an error condition
                # already. The state will be reset when the router starts
                # responding again, or when the error is cleared from a
                # forced rebuild.
                if self.state != states.ERROR:
                    self.state = states.DOWN

    def _verify_interfaces(self, ports, interfaces):
        """Verifies the network interfaces are what they should be.
        """
        actual_macs = set((iface['lladdr'] for iface in interfaces))
        self.log.debug('MACs found: %s', ', '.join(sorted(actual_macs)))
        if not all(
            getattr(p, 'mac_address', None) for p in ports
        ):
            return False

        num_logical_ports = len(list(ports))
        num_instance_ports = len(list(self.instance_info.ports))
        if num_logical_ports != num_instance_ports:
            return False

        expected_macs = set(p.mac_address
                            for p in self.instance_info.ports)
        expected_macs.add(self.instance_info.management_port.mac_address)
        self.log.debug('MACs expected: %s', ', '.join(sorted(expected_macs)))

        return actual_macs == expected_macs
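
    # Worked example (hypothetical values): if the appliance reports
    # lladdrs {'fa:16:3e:aa:bb:cc', 'fa:16:3e:dd:ee:ff'} and the cached
    # instance ports plus the management port carry the same two MAC
    # addresses, _verify_interfaces() returns True; any mismatch in count
    # or value returns False.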