astara/akanda/rug/vm_manager.py

284 lines
10 KiB
Python

# Copyright 2014 DreamHost, LLC
#
# Author: DreamHost, LLC
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
from datetime import datetime
import netaddr
import time
from oslo.config import cfg
from akanda.rug.api import configuration
from akanda.rug.api import akanda_client as router_api
from akanda.rug.api import quantum
# States recorded in VmManager.state.
DOWN = 'down'              # alive check failed, or no management port
BOOTING = 'booting'        # alive check failed, but still within boot_timeout
UP = 'up'                  # alive check passed, config not (yet) applied
CONFIGURED = 'configured'  # config was pushed to the router successfully
RESTART = 'restart'        # interfaces mismatched or config kept failing
GONE = 'gone'              # neutron reports that the router was deleted


class VmManager(object):
    """Track and drive the state of the VM backing a single router.

    The router definition fetched from neutron is cached in ``router_obj``
    and the VM status is kept in ``state`` as one of the module-level state
    constants (DOWN, BOOTING, UP, CONFIGURED, RESTART, GONE).
    """

    def __init__(self, router_id, tenant_id, log, worker_context):
        """Store the router identity and probe the VM's current state.

        :param router_id: ID used to look the router up in neutron.
        :param tenant_id: tenant ID, used by stop() when it has to build a
            stand-in router object.
        :param log: logger-like object used for all messages.
        :param worker_context: passed through to update_state().
        """
        self.router_id = router_id
        self.tenant_id = tenant_id
        self.log = log
        self.state = DOWN
        self.router_obj = None
        self.last_boot = None
        # FIXME: Probably need to pass context here
        self.update_state(worker_context, silent=True)

    @staticmethod
    def _seconds_since(moment, now=None):
        """Return the full number of seconds elapsed since ``moment``.

        ``timedelta.total_seconds()`` is used rather than ``.seconds``
        because the latter discards whole days and would make a VM booted
        more than 24 hours ago look freshly booted.

        :param moment: naive UTC datetime to measure from.
        :param now: naive UTC datetime to measure to; defaults to
            ``datetime.utcnow()``.
        """
        if now is None:
            now = datetime.utcnow()
        return (now - moment).total_seconds()

    def update_state(self, worker_context, silent=False):
        """Refresh ``state`` by polling the router VM and return it.

        The alive check is tried up to ``cfg.CONF.max_retries`` times,
        sleeping ``cfg.CONF.retry_delay`` after each failure. If every
        attempt fails the state becomes DOWN, or BOOTING when the last boot
        was started less than ``cfg.CONF.boot_timeout`` seconds ago.

        :param worker_context: object providing the ``neutron`` client.
        :param silent: when true, failed alive checks are not logged.
        :returns: the updated state constant.
        """
        self._ensure_cache(worker_context)
        if self.state == GONE:
            self.log.debug('not updating state of deleted router')
            return self.state

        if self.router_obj.management_port is None:
            self.log.debug('no management port, marking router as down')
            self.state = DOWN
            return self.state

        addr = _get_management_address(self.router_obj)
        for i in range(cfg.CONF.max_retries):
            if router_api.is_alive(addr, cfg.CONF.akanda_mgt_service_port):
                # A router that is already CONFIGURED stays that way.
                if self.state != CONFIGURED:
                    self.state = UP
                break
            if not silent:
                self.log.debug(
                    'Alive check failed. Attempt %d of %d',
                    i + 1,
                    cfg.CONF.max_retries,
                )
            time.sleep(cfg.CONF.retry_delay)
        else:
            # Every attempt failed (the loop never hit ``break``).
            self.state = DOWN
            if self.last_boot:
                seconds_since_boot = self._seconds_since(self.last_boot)
                if seconds_since_boot < cfg.CONF.boot_timeout:
                    self.state = BOOTING
                else:
                    # If the VM was created more than `boot_timeout` seconds
                    # ago, log an error and leave the state set to DOWN
                    self.last_boot = None
                    self.log.info(
                        'Router is DOWN. Created over %d secs ago.',
                        cfg.CONF.boot_timeout)

        return self.state

    def boot(self, worker_context):
        """Start (or restart) the router VM.

        Sets ``state`` to DOWN and, when the boot request was issued
        successfully, records the time in ``last_boot``. Errors raised while
        preparing or requesting the boot are logged and swallowed.

        :param worker_context: object providing the ``neutron`` and
            ``nova_client`` clients.
        """
        self._ensure_cache(worker_context)
        if self.state == GONE:
            self.log.info('not booting deleted router')
            return

        self.log.info('Booting router')
        self.state = DOWN

        try:
            self._ensure_provider_ports(self.router_obj, worker_context)

            # In the event that the current akanda instance isn't deleted
            # cleanly (which we've seen in certain circumstances, like
            # hypervisor failures), be proactive and attempt to clean up the
            # router ports manually. This helps avoid a situation where the
            # rug repeatedly attempts to plug stale router ports into the
            # newly created akanda instance (and fails).
            router = self.router_obj
            instance = worker_context.nova_client.get_instance(router)
            if instance is not None:
                for p in router.ports:
                    if p.device_id == instance.id:
                        worker_context.neutron.clear_device_id(p)
            created = worker_context.nova_client.reboot_router_instance(router)
            if not created:
                self.log.info('Previous router is deleting')
                return
        except Exception:
            # Only ordinary errors are swallowed; KeyboardInterrupt,
            # SystemExit and similar must still propagate.
            self.log.exception('Router failed to start boot')
            return
        else:
            self.last_boot = datetime.utcnow()

    def check_boot(self, worker_context):
        """Check whether a booting router is up and try to configure it.

        :param worker_context: object providing the API clients.
        :returns: True when the router ended up CONFIGURED, False otherwise.
        """
        ready_states = (UP, CONFIGURED)
        if self.update_state(worker_context, silent=True) in ready_states:
            self.log.info('Router has booted, attempting initial config')
            self.configure(worker_context, BOOTING, attempts=1)
            return self.state == CONFIGURED

        self.log.debug('Router is %s', self.state.upper())
        return False

    def stop(self, worker_context):
        """Destroy the router VM and wait for it to disappear.

        Polls nova every ``cfg.CONF.retry_delay`` seconds for up to
        ``cfg.CONF.boot_timeout`` seconds. When the instance no longer
        reports a status the state becomes DOWN (a GONE router stays GONE);
        on timeout an error is logged and the state is left unchanged.

        :param worker_context: object providing the ``neutron`` and
            ``nova_client`` clients.
        """
        self._ensure_cache(worker_context)
        if self.state == GONE:
            # We are being told to delete a router that neutron has
            # already removed. Make a fake router object to use in
            # this method.
            router_obj = quantum.Router(
                id_=self.router_id,
                tenant_id=self.tenant_id,
                name='unnamed',
                admin_state_up=False,
            )
            self.log.info('Destroying router neutron has deleted')
        else:
            router_obj = self.router_obj
            self.log.info('Destroying router')

        nova_client = worker_context.nova_client
        nova_client.destroy_router_instance(router_obj)

        start = time.time()
        while time.time() - start < cfg.CONF.boot_timeout:
            if not nova_client.get_router_instance_status(router_obj):
                if self.state != GONE:
                    self.state = DOWN
                return
            self.log.debug('Router has not finished stopping')
            time.sleep(cfg.CONF.retry_delay)
        self.log.error(
            'Router failed to stop within %d secs',
            cfg.CONF.boot_timeout)

    def configure(self, worker_context, failure_state=RESTART, attempts=None):
        """Build the router configuration and push it to the VM.

        On success ``state`` becomes CONFIGURED. If the VM's interfaces do
        not match the expected ports the state becomes RESTART; if every
        push attempt fails it becomes ``failure_state``.

        :param worker_context: object providing the ``neutron`` client.
        :param failure_state: state to set when all push attempts fail.
        :param attempts: number of push attempts; a falsy value means
            ``cfg.CONF.max_retries``.
        """
        self.log.debug('Begin router config')
        self.state = UP
        attempts = attempts or cfg.CONF.max_retries

        # FIXME: This might raise an error, which doesn't mean the
        # *router* is broken, but does mean we can't update it.
        # Change the exception to something the caller can catch
        # safely.
        self._ensure_cache(worker_context)
        if self.state == GONE:
            return

        addr = _get_management_address(self.router_obj)

        # FIXME: This should raise an explicit exception so the caller
        # knows that we could not talk to the router (versus the issue
        # above).
        interfaces = router_api.get_interfaces(
            addr,
            cfg.CONF.akanda_mgt_service_port
        )

        if not self._verify_interfaces(self.router_obj, interfaces):
            # FIXME: Need a REPLUG state when we support hot-plugging
            # interfaces.
            self.log.debug("Interfaces aren't plugged as expected, rebooting.")
            self.state = RESTART
            return

        # FIXME: Need to catch errors talking to neutron here.
        config = configuration.build_config(
            worker_context.neutron,
            self.router_obj,
            interfaces
        )
        self.log.debug('preparing to update config to %r', config)

        for i in range(attempts):
            try:
                router_api.update_config(
                    addr,
                    cfg.CONF.akanda_mgt_service_port,
                    config
                )
            except Exception:
                if i == attempts - 1:
                    # Only log the traceback if we encounter it many times.
                    self.log.exception('failed to update config')
                else:
                    self.log.debug(
                        'failed to update config, attempt %d',
                        i + 1
                    )
                time.sleep(cfg.CONF.retry_delay)
            else:
                self.state = CONFIGURED
                self.log.info('Router config updated')
                return
        else:
            # FIXME: We failed to configure the router too many times,
            # so restart it.
            self.state = failure_state

    def _ensure_cache(self, worker_context):
        """Reload ``router_obj`` from neutron.

        If neutron raises ``quantum.RouterGone`` the state is set to GONE
        and ``router_obj`` is cleared. Other exceptions propagate.
        """
        try:
            self.router_obj = worker_context.neutron.get_router_detail(
                self.router_id
            )
        except quantum.RouterGone:
            # The router has been deleted, set our state accordingly
            # and return without doing any more work.
            self.state = GONE
            self.router_obj = None

    def _verify_interfaces(self, logical_config, interfaces):
        """Tell whether the VM's interfaces match the router's ports.

        :param logical_config: router object exposing ``ports``,
            ``internal_ports``, ``management_port`` and ``external_port``.
        :param interfaces: iterable of mappings with an ``'lladdr'`` key
            holding each interface's MAC address.
        :returns: True only when every port has a MAC address and the set of
            interface MACs equals the set of expected port MACs.
        """
        router_macs = set(iface['lladdr'] for iface in interfaces)
        self.log.debug('MACs found: %s', ', '.join(sorted(router_macs)))

        # A port without a MAC address cannot be matched to an interface.
        if not all(
            getattr(p, 'mac_address', None) for p in logical_config.ports
        ):
            return False

        expected_macs = set(p.mac_address
                            for p in logical_config.internal_ports)
        expected_macs.add(logical_config.management_port.mac_address)
        expected_macs.add(logical_config.external_port.mac_address)
        self.log.debug('MACs expected: %s', ', '.join(sorted(expected_macs)))

        return router_macs == expected_macs

    def _ensure_provider_ports(self, router, worker_context):
        """Create the management and external ports if they are missing.

        Newly created ports are stored on ``router`` in place.

        :param router: router object to inspect and update.
        :param worker_context: object providing the ``neutron`` client.
        :returns: the same ``router`` object.
        """
        if router.management_port is None:
            self.log.debug('Adding management port to router')
            mgt_port = worker_context.neutron.create_router_management_port(
                router.id
            )
            router.management_port = mgt_port

        if router.external_port is None:
            # FIXME: Need to do some work to pick the right external
            # network for a tenant.
            self.log.debug('Adding external port to router')
            ext_port = worker_context.neutron.create_router_external_port(
                router
            )
            router.external_port = ext_port
        return router
def _get_management_address(router):
    """Compute the router's management address from its management MAC.

    The management port's MAC address is expanded into a 64-bit EUI-64
    style identifier (``ff:fe`` inserted in the middle, bit ``0x02`` of the
    first octet flipped) and added to the numeric value of
    ``cfg.CONF.management_prefix``.

    :param router: router object whose ``management_port.mac_address`` is a
        colon-separated hex string.
    :returns: the resulting address as a string.
    """
    prefix = netaddr.IPNetwork(cfg.CONF.management_prefix)

    # Normalise every MAC octet to two lowercase hex digits.
    mac = router.management_port.mac_address
    octets = ['%02x' % int(part, 16) for part in mac.split(':')]

    # Insert ff:fe between the first and last three octets.
    identifier_hex = ''.join(octets[0:3] + ['ff', 'fe'] + octets[3:6])
    eui64 = int(identifier_hex, 16)

    # the bit inversion is required by the RFC
    host_part = eui64 ^ 0x0200000000000000
    address = netaddr.IPAddress(prefix.value + host_part)
    return str(address)