astara/akanda/rug/instance_manager.py

# Copyright 2014 DreamHost, LLC
#
# Author: DreamHost, LLC
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.

from datetime import datetime
from functools import wraps
import time

from oslo_config import cfg

from akanda.rug.api import configuration
from akanda.rug.api import akanda_client as router_api
from akanda.rug.api import neutron

DOWN = 'down'
BOOTING = 'booting'
UP = 'up'
CONFIGURED = 'configured'
RESTART = 'restart'
REPLUG = 'replug'
GONE = 'gone'
ERROR = 'error'

# Map internal states onto the status values neutron understands.  Both
# BOOTING and UP translate to BUILD; a router is only reported ACTIVE once
# it reaches CONFIGURED.
STATUS_MAP = {
    DOWN: neutron.STATUS_DOWN,
    BOOTING: neutron.STATUS_BUILD,
    UP: neutron.STATUS_BUILD,
    CONFIGURED: neutron.STATUS_ACTIVE,
    ERROR: neutron.STATUS_ERROR,
}

CONF = cfg.CONF

INSTANCE_MANAGER_OPTS = [
    cfg.IntOpt(
        'hotplug_timeout', default=10,
        help='The amount of time to wait for nova to hotplug/unplug '
             'networks from the router instances'),
    cfg.IntOpt(
        'boot_timeout', default=600),
    cfg.IntOpt(
        'error_state_cooldown',
        default=30,
        help=('Number of seconds to ignore new events when a router goes '
              'into ERROR state'),
    ),
]
CONF.register_opts(INSTANCE_MANAGER_OPTS)
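
# A minimal sketch of how these options might appear in the rug's oslo.config
# file, assuming the usual [DEFAULT] section for ungrouped options.  The
# values shown are simply the defaults defined above; max_retries,
# retry_delay and akanda_mgt_service_port are used in this module but
# registered elsewhere.
#
#     [DEFAULT]
#     hotplug_timeout = 10
#     boot_timeout = 600
#     error_state_cooldown = 30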


def synchronize_router_status(f):
    @wraps(f)
    def wrapper(self, worker_context, silent=False):
        old_status = self._last_synced_status
        val = f(self, worker_context, silent)
        if not self.router_obj:
            return val
        new_status = STATUS_MAP.get(self.state, neutron.STATUS_ERROR)
        if not old_status or old_status != new_status:
            worker_context.neutron.update_router_status(
                self.router_obj.id,
                new_status
            )
            self._last_synced_status = new_status
        return val
    return wrapper
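
# synchronize_router_status is applied below to the InstanceManager methods
# that change state (update_state, set_error and clear_error), e.g.:
#
#     @synchronize_router_status
#     def update_state(self, worker_context, silent=False):
#         ...
#
# After the wrapped call returns, the internal state is translated through
# STATUS_MAP and pushed to neutron only when it differs from the last value
# that was synced.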


class BootAttemptCounter(object):
    def __init__(self):
        self._attempts = 0

    def start(self):
        self._attempts += 1

    def reset(self):
        self._attempts = 0

    @property
    def count(self):
        return self._attempts


class InstanceManager(object):
    """Manages the Nova instance backing a single router."""

    def __init__(self, router_id, tenant_id, log, worker_context):
        self.router_id = router_id
        self.tenant_id = tenant_id
        self.log = log

        self.state = DOWN
        self.router_obj = None
        self.instance_info = None
        self.last_error = None
        self._boot_counter = BootAttemptCounter()
        self._last_synced_status = None

        self.update_state(worker_context, silent=True)

    @property
    def attempts(self):
        return self._boot_counter.count

    def reset_boot_counter(self):
        self._boot_counter.reset()

    @synchronize_router_status
    def update_state(self, worker_context, silent=False):
        self._ensure_cache(worker_context)
        if self.state == GONE:
            self.log.debug('not updating state of deleted router')
            return self.state

        if self.instance_info is None:
            self.log.debug('no backing instance, marking router as down')
            self.state = DOWN
            return self.state

        addr = self.instance_info.management_address
        for i in xrange(cfg.CONF.max_retries):
            if router_api.is_alive(addr, cfg.CONF.akanda_mgt_service_port):
                if self.state != CONFIGURED:
                    self.state = UP
                break
            if not silent:
                self.log.debug(
                    'Alive check failed. Attempt %d of %d',
                    i,
                    cfg.CONF.max_retries,
                )
            time.sleep(cfg.CONF.retry_delay)
        else:
            old_state = self.state
            self._check_boot_timeout()

            # If the router isn't responding, make sure Nova knows about it
            instance = worker_context.nova_client.get_instance_for_obj(
                self.router_id
            )
            if instance is None and self.state != ERROR:
                self.log.info('No instance was found; rebooting')
                self.state = DOWN
                self.instance_info = None

            # update_state() is called from Alive() to check the
            # status of the router. If we can't talk to the API at
            # that point, the router should be considered missing and
            # we should reboot it, so mark it down if we think it was
            # configured before.
            if old_state == CONFIGURED and self.state != ERROR:
                self.log.debug(
                    'Did not find router alive, marking it as down',
                )
                self.state = DOWN

        # After the router is all the way up, record how long it took
        # to boot and accept a configuration.
        if self.instance_info.booting and self.state == CONFIGURED:
            # If we didn't boot the server (because we were restarted
            # while it remained running, for example), we won't have a
            # duration to log.
            self.instance_info.confirm_up()
            if self.instance_info.boot_duration:
                self.log.info('Router booted in %s seconds after %s attempts',
                              self.instance_info.boot_duration.total_seconds(),
                              self._boot_counter.count)
            # Always reset the boot counter, even if we didn't boot
            # the server ourselves, so we don't accidentally think we
            # have an erroring router.
            self._boot_counter.reset()
        return self.state
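
    # Taken together, update_state() reports: DOWN when there is no backing
    # instance, when boot_timeout has elapsed without a response, or when a
    # previously CONFIGURED appliance stops answering; BOOTING while a fresh
    # instance is still within boot_timeout; UP once the REST API answers but
    # no configuration has been accepted; and CONFIGURED is left in place
    # once configure() has succeeded.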

    def boot(self, worker_context, router_image_uuid):
        self._ensure_cache(worker_context)
        if self.state == GONE:
            self.log.info('not booting deleted router')
            return

        self.log.info('Booting router')
        self.state = DOWN
        self._boot_counter.start()

        def make_vrrp_ports():
            mgt_port = worker_context.neutron.create_management_port(
                self.router_obj.id
            )

            # FIXME(mark): ideally this should be ordered and de-duped
            instance_ports = [
                worker_context.neutron.create_vrrp_port(self.router_obj.id, n)
                for n in (p.network_id for p in self.router_obj.ports)
            ]

            return mgt_port, instance_ports

        try:
            # TODO(mark): make this pluggable
            self._ensure_provider_ports(self.router_obj, worker_context)

            # TODO(mark): make this handle errors more gracefully on cb fail
            # TODO(mark): checkout from a pool - boot on demand for now
            instance_info = worker_context.nova_client.boot_instance(
                self.instance_info,
                self.router_obj.id,
                router_image_uuid,
                make_vrrp_ports
            )
            if not instance_info:
                self.log.info('Previous router is deleting')
                return
        except:
            self.log.exception('Router failed to start boot')
            # TODO(mark): attempt clean-up of failed ports
            return
        else:
            # We have successfully started a (re)boot attempt so
            # record the timestamp so we can report how long it takes.
            self.state = BOOTING
            self.instance_info = instance_info
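
    # Note that make_vrrp_ports is handed to nova_client.boot_instance as a
    # callback rather than being called here, presumably so the management
    # and VRRP ports are only created when a new instance actually needs to
    # be booted.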

    def check_boot(self, worker_context):
        ready_states = (UP, CONFIGURED)
        if self.update_state(worker_context, silent=True) in ready_states:
            self.log.info('Router has booted, attempting initial config')
            self.configure(worker_context, BOOTING, attempts=1)
            if self.state != CONFIGURED:
                self._check_boot_timeout()
            return self.state == CONFIGURED

        self.log.debug('Router is %s' % self.state.upper())
        return False

    @synchronize_router_status
    def set_error(self, worker_context, silent=False):
        """Set the internal and neutron status for the router to ERROR.

        This is called from outside when something notices the router
        is "broken". We don't use it internally because this class is
        supposed to do what it's told and not make decisions about
        whether or not the router is fatally broken.
        """
        self._ensure_cache(worker_context)
        if self.state == GONE:
            self.log.debug('not updating state of deleted router')
            return self.state
        self.state = ERROR
        self.last_error = datetime.utcnow()
        return self.state

    @synchronize_router_status
    def clear_error(self, worker_context, silent=False):
        """Clear the internal error state.

        This is called from outside when something wants to force a
        router rebuild, so that the state machine that checks our
        status won't think we are broken unless we actually break
        again.
        """
        # Clear the boot counter.
        self._boot_counter.reset()
        self._ensure_cache(worker_context)
        if self.state == GONE:
            self.log.debug('not updating state of deleted router')
            return self.state
        self.state = DOWN
        return self.state

    @property
    def error_cooldown(self):
        # Returns True if the router was recently set to ERROR state.
        if self.last_error and self.state == ERROR:
            seconds_since_error = (
                datetime.utcnow() - self.last_error
            ).total_seconds()
            if seconds_since_error < cfg.CONF.error_state_cooldown:
                return True
        return False
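
    # A rough sketch of how a caller outside this module might use the
    # cooldown; the surrounding state machine is not defined here, so the
    # names below are illustrative only:
    #
    #     if instance_mgr.error_cooldown:
    #         return  # ignore new events until error_state_cooldown expires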

    def stop(self, worker_context):
        self._ensure_cache(worker_context)
        if self.state == GONE:
            self.log.info('Destroying router neutron has deleted')
        else:
            self.log.info('Destroying router')

        try:
            nova_client = worker_context.nova_client
            nova_client.destroy_instance(self.instance_info)
        except Exception:
            self.log.exception('Error deleting router instance')

        start = time.time()
        while time.time() - start < cfg.CONF.boot_timeout:
            if not nova_client.get_instance_by_id(self.instance_info.id_):
                if self.state != GONE:
                    self.state = DOWN
                return
            self.log.debug('Router has not finished stopping')
            time.sleep(cfg.CONF.retry_delay)
        self.log.error(
            'Router failed to stop within %d secs',
            cfg.CONF.boot_timeout)

    def configure(self, worker_context, failure_state=RESTART, attempts=None):
        self.log.debug('Begin router config')
        self.state = UP
        attempts = attempts or cfg.CONF.max_retries

        # FIXME: This might raise an error, which doesn't mean the
        # *router* is broken, but does mean we can't update it.
        # Change the exception to something the caller can catch
        # safely.
        self._ensure_cache(worker_context)
        if self.state == GONE:
            return

        # FIXME: This should raise an explicit exception so the caller
        # knows that we could not talk to the router (versus the issue
        # above).
        interfaces = router_api.get_interfaces(
            self.instance_info.management_address,
            cfg.CONF.akanda_mgt_service_port
        )

        if not self._verify_interfaces(self.router_obj, interfaces):
            # FIXME: Need a REPLUG state when we support hot-plugging
            # interfaces.
            self.log.debug("Interfaces aren't plugged as expected.")
            self.state = REPLUG
            return

        # TODO(mark): We're in the first phase of VRRP, so we need to
        # map the interface to the network ID.
        # Eventually we'll send VRRP data and real interface data
        port_mac_to_net = {
            p.mac_address: p.network_id
            for p in self.instance_info.ports
        }
        # Add in the management port
        mgt_port = self.instance_info.management_port
        port_mac_to_net[mgt_port.mac_address] = mgt_port.network_id

        # this is a network to logical interface id
        iface_map = {
            port_mac_to_net[i['lladdr']]: i['ifname']
            for i in interfaces if i['lladdr'] in port_mac_to_net
        }

        # FIXME: Need to catch errors talking to neutron here.
        config = configuration.build_config(
            worker_context.neutron,
            self.router_obj,
            mgt_port,
            iface_map
        )
        self.log.debug('preparing to update config to %r', config)

        for i in xrange(attempts):
            try:
                router_api.update_config(
                    self.instance_info.management_address,
                    cfg.CONF.akanda_mgt_service_port,
                    config
                )
            except Exception:
                if i == attempts - 1:
                    # Only log the traceback if we encounter it many times.
                    self.log.exception('failed to update config')
                else:
                    self.log.debug(
                        'failed to update config, attempt %d',
                        i
                    )
                time.sleep(cfg.CONF.retry_delay)
            else:
                self.state = CONFIGURED
                self.log.info('Router config updated')
                return
        else:
            # FIXME: We failed to configure the router too many times,
            # so restart it.
            self.state = failure_state
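
    # A note on the retry loop above: the success path returns from the
    # ``else`` clause of the ``try``, and the loop contains no ``break``, so
    # the ``for ... else`` clause only runs after every attempt has failed,
    # at which point the failure_state supplied by the caller is recorded.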

    def replug(self, worker_context):
        self.log.debug('Attempting to replug...')
        self._ensure_provider_ports(self.router_obj, worker_context)

        interfaces = router_api.get_interfaces(
            self.instance_info.management_address,
            cfg.CONF.akanda_mgt_service_port
        )
        actual_macs = set((iface['lladdr'] for iface in interfaces))
        instance_macs = set(p.mac_address for p in self.instance_info.ports)
        instance_macs.add(self.instance_info.management_port.mac_address)

        if instance_macs != actual_macs:
            # Our cached copy of the ports is wrong; reboot and clean up.
            self.log.warning(
                ('Instance macs(%s) do not match actual macs (%s). Instance '
                 'cache appears out-of-sync'),
                instance_macs, actual_macs
            )
            self.state = RESTART
            return

        instance_ports = {p.network_id: p for p in self.instance_info.ports}
        instance_networks = set(instance_ports.keys())

        logical_networks = set(p.network_id for p in self.router_obj.ports)

        if logical_networks != instance_networks:
            instance = worker_context.nova_client.get_instance_by_id(
                self.instance_info.id_
            )

            # For each port that doesn't have a mac address on the instance...
            for network_id in logical_networks - instance_networks:
                port = worker_context.neutron.create_vrrp_port(
                    self.router_obj.id,
                    network_id
                )
                self.log.debug(
                    'Net %s is missing from the router, plugging: %s',
                    network_id, port.id
                )
                try:
                    instance.interface_attach(port.id, None, None)
                except:
                    self.log.exception('Interface attach failed')
                    self.state = RESTART
                    return
                self.instance_info.ports.append(port)

            for network_id in instance_networks - logical_networks:
                port = instance_ports[network_id]
                self.log.debug(
                    'Net %s is detached from the router, unplugging: %s',
                    network_id, port.id
                )
                try:
                    instance.interface_detach(port.id)
                except:
                    self.log.exception('Interface detach failed')
                    self.state = RESTART
                    return
                self.instance_info.ports.remove(port)

        # The action of attaching/detaching interfaces in Nova happens via the
        # message bus and is *not* blocking. We need to wait a few seconds to
        # see if the list of tap devices on the appliance actually changed. If
        # not, assume the hotplug failed, and reboot the Instance.
        replug_seconds = cfg.CONF.hotplug_timeout
        while replug_seconds > 0:
            self.log.debug(
                "Waiting for interface attachments to take effect..."
            )
            interfaces = router_api.get_interfaces(
                self.instance_info.management_address,
                cfg.CONF.akanda_mgt_service_port
            )
            if self._verify_interfaces(self.router_obj, interfaces):
                # replugging was successful
                # TODO(mark) update port states
                return
            time.sleep(1)
            replug_seconds -= 1

        self.log.debug("Interfaces aren't plugged as expected, rebooting.")
        self.state = RESTART

    def _ensure_cache(self, worker_context):
        try:
            self.router_obj = worker_context.neutron.get_router_detail(
                self.router_id
            )
        except neutron.RouterGone:
            # The router has been deleted; note that in our state and
            # clear the cached router object.
            self.state = GONE
            self.router_obj = None

        if not self.instance_info:
            self.instance_info = (
                worker_context.nova_client.get_instance_info_for_obj(
                    self.router_id
                )
            )
            if self.instance_info:
                (
                    self.instance_info.management_port,
                    self.instance_info.ports
                ) = worker_context.neutron.get_ports_for_instance(
                    self.instance_info.id_
                )

    def _check_boot_timeout(self):
        time_since_boot = self.instance_info.time_since_boot

        if time_since_boot:
            if time_since_boot.seconds < cfg.CONF.boot_timeout:
                # Do not reset the state if we have an error
                # condition already. The state will be reset when
                # the router starts responding again, or when the
                # error is cleared from a forced rebuild.
                if self.state != ERROR:
                    self.state = BOOTING
            else:
                # If the instance was created more than `boot_timeout` seconds
                # ago, log an error and set the state to DOWN
                self.log.info(
                    'Router is DOWN. Created over %d secs ago.',
                    cfg.CONF.boot_timeout)

                # Do not reset the state if we have an error condition
                # already. The state will be reset when the router starts
                # responding again, or when the error is cleared from a
                # forced rebuild.
                if self.state != ERROR:
                    self.state = DOWN

    def _verify_interfaces(self, logical_config, interfaces):
        router_macs = set((iface['lladdr'] for iface in interfaces))
        self.log.debug('MACs found: %s', ', '.join(sorted(router_macs)))

        if not all(
            getattr(p, 'mac_address', None) for p in logical_config.ports
        ):
            return False

        num_logical_ports = len(list(logical_config.ports))
        num_instance_ports = len(list(self.instance_info.ports))
        if num_logical_ports != num_instance_ports:
            return False

        expected_macs = set(p.mac_address
                            for p in self.instance_info.ports)
        expected_macs.add(self.instance_info.management_port.mac_address)
        self.log.debug('MACs expected: %s', ', '.join(sorted(expected_macs)))

        return router_macs == expected_macs

    def _ensure_provider_ports(self, router, worker_context):
        if router.external_port is None:
            # FIXME: Need to do some work to pick the right external
            # network for a tenant.
            self.log.debug('Adding external port to router')
            ext_port = worker_context.neutron.create_router_external_port(
                router
            )
            router.external_port = ext_port
        return router
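

# A rough lifecycle sketch, assuming the surrounding worker/state machine
# (not shown in this module) drives an InstanceManager roughly like this;
# the variable names are illustrative only:
#
#     mgr = InstanceManager(router_id, tenant_id, log, worker_context)
#     mgr.boot(worker_context, router_image_uuid)
#     while not mgr.check_boot(worker_context):
#         pass  # poll until the appliance accepts its configuration
#     mgr.update_state(worker_context)        # periodic health checks
#     if mgr.state == REPLUG:
#         mgr.replug(worker_context)          # hot-plug changed networks
#     mgr.stop(worker_context)                # tear the instance down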