astara/akanda/rug/vm_manager.py

# Copyright 2014 DreamHost, LLC
#
# Author: DreamHost, LLC
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.


from datetime import datetime
import netaddr
import time

from oslo.config import cfg

from akanda.rug.api import configuration
from akanda.rug.api import akanda_client as router_api
from akanda.rug.api import quantum

DOWN = 'down'
BOOTING = 'booting'
UP = 'up'
CONFIGURED = 'configured'
RESTART = 'restart'
GONE = 'gone'


class VmManager(object):
    def __init__(self, router_id, tenant_id, log, worker_context):
        self.router_id = router_id
        self.tenant_id = tenant_id
        self.log = log
        self.state = DOWN
        self.router_obj = None
        self.last_boot = None
        # FIXME: Probably need to pass context here
        self.update_state(worker_context, silent=True)

    def update_state(self, worker_context, silent=False):
        self._ensure_cache(worker_context)
        if self.state == GONE:
            self.log.debug('not updating state of deleted router')
            return self.state

        if self.router_obj.management_port is None:
            self.log.debug('no management port, marking router as down')
            self.state = DOWN
            return self.state

        addr = _get_management_address(self.router_obj)
        for i in xrange(cfg.CONF.max_retries):
            if router_api.is_alive(addr, cfg.CONF.akanda_mgt_service_port):
                if self.state != CONFIGURED:
                    self.state = UP
                break
            if not silent:
                self.log.debug(
                    'Alive check failed. Attempt %d of %d',
                    i,
                    cfg.CONF.max_retries,
                )
            time.sleep(cfg.CONF.retry_delay)
        else:
            self.state = DOWN
            if self.last_boot:
                seconds_since_boot = (
                    datetime.utcnow() - self.last_boot
                ).seconds
                if seconds_since_boot < cfg.CONF.boot_timeout:
                    self.state = BOOTING
                else:
                    # If the VM was created more than `boot_timeout` seconds
                    # ago, log an error and leave the state set to DOWN
                    self.last_boot = None
                    self.log.info(
                        'Router is DOWN.  Created over %d secs ago.',
                        cfg.CONF.boot_timeout)

        return self.state

    def boot(self, worker_context):
        self._ensure_cache(worker_context)
        if self.state == GONE:
            self.log.info('not booting deleted router')
            return

        self.log.info('Booting router')
        self.state = DOWN

        try:
            self._ensure_provider_ports(self.router_obj, worker_context)

            # In the event that the current akanda instance isn't deleted
            # cleanly (which we've seen in certain circumstances, like
            # hypervisor failures), be proactive and attempt to clean up the
            # router ports manually.  This helps avoid a situation where the
            # rug repeatedly attempts to plug stale router ports into the newly
            # created akanda instance (and fails).
            router = self.router_obj
            instance = worker_context.nova_client.get_instance(router)
            if instance is not None:
                for p in router.ports:
                    if p.device_id == instance.id:
                        worker_context.neutron.clear_device_id(p)
            created = worker_context.nova_client.reboot_router_instance(router)
            if not created:
                self.log.info('Previous router is deleting')
                return
        except:
            self.log.exception('Router failed to start boot')
            return
        else:
            self.last_boot = datetime.utcnow()

    def check_boot(self, worker_context):
        ready_states = (UP, CONFIGURED)
        if self.update_state(worker_context, silent=True) in ready_states:
            self.log.info('Router has booted, attempting initial config')
            self.configure(worker_context, BOOTING, attempts=1)
            return self.state == CONFIGURED
        self.log.debug('Router is %s' % self.state.upper())
        return False

    def stop(self, worker_context):
        self._ensure_cache(worker_context)
        if self.state == GONE:
            # We are being told to delete a router that neutron has
            # already removed. Make a fake router object to use in
            # this method.
            router_obj = quantum.Router(
                id_=self.router_id,
                tenant_id=self.tenant_id,
                name='unnamed',
                admin_state_up=False,
            )
            self.log.info('Destroying router neutron has deleted')
        else:
            router_obj = self.router_obj
            self.log.info('Destroying router')

        nova_client = worker_context.nova_client
        nova_client.destroy_router_instance(router_obj)

        start = time.time()
        while time.time() - start < cfg.CONF.boot_timeout:
            if not nova_client.get_router_instance_status(router_obj):
                if self.state != GONE:
                    self.state = DOWN
                return
            self.log.debug('Router has not finished stopping')
            time.sleep(cfg.CONF.retry_delay)
        self.log.error(
            'Router failed to stop within %d secs',
            cfg.CONF.boot_timeout)

    def configure(self, worker_context, failure_state=RESTART, attempts=None):
        self.log.debug('Begin router config')
        self.state = UP
        attempts = attempts or cfg.CONF.max_retries

        # FIXME: This might raise an error, which doesn't mean the
        # *router* is broken, but does mean we can't update it.
        # Change the exception to something the caller can catch
        # safely.
        self._ensure_cache(worker_context)
        if self.state == GONE:
            return

        addr = _get_management_address(self.router_obj)

        # FIXME: This should raise an explicit exception so the caller
        # knows that we could not talk to the router (versus the issue
        # above).
        interfaces = router_api.get_interfaces(
            addr,
            cfg.CONF.akanda_mgt_service_port
        )

        if not self._verify_interfaces(self.router_obj, interfaces):
            # FIXME: Need a REPLUG state when we support hot-plugging
            # interfaces.
            self.log.debug("Interfaces aren't plugged as expected, rebooting.")
            self.state = RESTART
            return

        # FIXME: Need to catch errors talking to neutron here.
        config = configuration.build_config(
            worker_context.neutron,
            self.router_obj,
            interfaces
        )
        self.log.debug('preparing to update config to %r', config)

        for i in xrange(attempts):
            try:
                router_api.update_config(
                    addr,
                    cfg.CONF.akanda_mgt_service_port,
                    config
                )
            except Exception:
                if i == attempts - 1:
                    # Only log the traceback if we encounter it many times.
                    self.log.exception('failed to update config')
                else:
                    self.log.debug(
                        'failed to update config, attempt %d',
                        i
                    )
                time.sleep(cfg.CONF.retry_delay)
            else:
                self.state = CONFIGURED
                self.log.info('Router config updated')
                return
        else:
            # FIXME: We failed to configure the router too many times,
            # so restart it.
            self.state = failure_state

    def _ensure_cache(self, worker_context):
        try:
            self.router_obj = worker_context.neutron.get_router_detail(
                self.router_id
            )
        except quantum.RouterGone:
            # The router has been deleted, set our state accordingly
            # and return without doing any more work.
            self.state = GONE
            self.router_obj = None

    def _verify_interfaces(self, logical_config, interfaces):
        router_macs = set((iface['lladdr'] for iface in interfaces))
        self.log.debug('MACs found: %s', ', '.join(sorted(router_macs)))

        if not all(
            getattr(p, 'mac_address', None) for p in logical_config.ports
        ):
            return False

        expected_macs = set(p.mac_address
                            for p in logical_config.internal_ports)
        expected_macs.add(logical_config.management_port.mac_address)
        expected_macs.add(logical_config.external_port.mac_address)
        self.log.debug('MACs expected: %s', ', '.join(sorted(expected_macs)))

        return router_macs == expected_macs

    def _ensure_provider_ports(self, router, worker_context):
        if router.management_port is None:
            self.log.debug('Adding management port to router')
            mgt_port = worker_context.neutron.create_router_management_port(
                router.id
            )
            router.management_port = mgt_port

        if router.external_port is None:
            # FIXME: Need to do some work to pick the right external
            # network for a tenant.
            self.log.debug('Adding external port to router')
            ext_port = worker_context.neutron.create_router_external_port(
                router
            )
            router.external_port = ext_port
        return router


def _get_management_address(router):
    network = netaddr.IPNetwork(cfg.CONF.management_prefix)

    tokens = ['%02x' % int(t, 16)
              for t in router.management_port.mac_address.split(':')]
    eui64 = int(''.join(tokens[0:3] + ['ff', 'fe'] + tokens[3:6]), 16)

    # the bit inversion is required by the RFC
    return str(netaddr.IPAddress(network.value + (eui64 ^ 0x0200000000000000)))