astara/astara/instance_manager.py

# Copyright 2014 DreamHost, LLC
#
# Author: DreamHost, LLC
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
from datetime import datetime
from functools import wraps
import time
import six
from oslo_config import cfg
from astara.drivers import states
from astara.common.i18n import _LE, _LI
from astara.common import container
CONF = cfg.CONF
INSTANCE_MANAGER_OPTS = [
cfg.IntOpt(
'hotplug_timeout',
default=10,
help='The amount of time to wait for nova to hotplug/unplug '
'networks from the instances.'),
cfg.IntOpt(
'boot_timeout', default=600),
cfg.IntOpt(
'error_state_cooldown',
default=30,
help='Number of seconds to ignore new events when an instance goes '
'into ERROR state.',
),
]
CONF.register_opts(INSTANCE_MANAGER_OPTS)
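# These options are registered in the default oslo.config group, so they can
# be overridden in the service configuration file. A minimal sketch (the
# values shown are just the defaults above, for illustration only):
#
#   [DEFAULT]
#   hotplug_timeout = 10
#   boot_timeout = 600
#   error_state_cooldown = 30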
def _generate_interface_map(instance, interfaces):
# TODO(mark): We're in the first phase of VRRP, so we need
# to map the interface to the network ID.
# Eventually we'll send VRRP data and real interface data
port_mac_to_net = {
p.mac_address: p.network_id
for p in instance.ports
}
# Add in the management port
mgt_port = instance.management_port
port_mac_to_net[mgt_port.mac_address] = mgt_port.network_id
# this is a network to logical interface id
return {
port_mac_to_net[i['lladdr']]: i['ifname']
for i in interfaces if i['lladdr'] in port_mac_to_net
}
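# Illustrative example (hypothetical IDs and interface names): given an
# instance with a management port on network 'net-mgt' and a tenant port on
# 'net-a', and appliance interfaces reporting matching MAC addresses, the
# map returned above would look like:
#
#   {'net-mgt': 'eth0', 'net-a': 'eth1'}
#
# i.e. Neutron network ID -> logical interface name on the appliance.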
def synchronize_driver_state(f):
"""Wrapper that triggers a driver's synchronize_state function"""
def wrapper(self, *args, **kw):
state = f(self, *args, **kw)
self.resource.synchronize_state(*args, state=state)
return state
return wrapper
def ensure_cache(f):
"""Decorator to wrap around any function that uses self.instance_info.
Ensures that self.instance_info is up to date and catches instances in a
GONE or missing state before wasting cycles trying to do something with it.
NOTE: This replaces the old _ensure_cache function, reworked as a decorator
rather than being called explicitly at the start of each of those functions.
"""
@wraps(f)
def wrapper(self, worker_context, *args, **kw):
self.instances.refresh(worker_context)
instances = worker_context.nova_client.get_instances_for_obj(
self.resource.name)
for inst_info in instances:
self.instances[inst_info.id_] = inst_info
self.instances.update_ports(worker_context)
return f(self, worker_context, *args, **kw)
return wrapper
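# The two decorators above are typically stacked, e.g. as done on
# InstanceManager.update_state below:
#
#   @synchronize_driver_state
#   @ensure_cache
#   def update_state(self, worker_context, silent=False):
#       ...
#
# ensure_cache refreshes self.instances before the method body runs, and
# synchronize_driver_state pushes the returned state to the driver afterwards.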
class BootAttemptCounter(object):
def __init__(self):
self._attempts = 0
def start(self):
self._attempts += 1
def reset(self):
self._attempts = 0
@property
def count(self):
return self._attempts
class InstanceGroupManager(container.ResourceContainer):
def __init__(self, log, resource):
super(InstanceGroupManager, self).__init__()
self.log = log
self.resource = resource
self._alive = set()
@property
def instances(self):
"""Returns the managed instances sorted by name"""
return sorted(self.resources.values(), key=lambda i: i.name)
def validate_ports(self):
"""Checks whether instance have management ports attached
:returns: tuple containing two lists:
(instances that have ports, instances that don't)
"""
has_ports = set()
for inst_info in set(self.resources.values()):
if inst_info.management_address:
has_ports.add(inst_info)
return has_ports, set(self.resources.values()) - has_ports
def are_alive(self):
"""Calls the check_check function all instances to ensure liveliness
:returns: tuple containing two lists (alive_instances, dead_instances)
"""
alive = set()
for i in six.moves.range(cfg.CONF.max_retries):
for inst_info in set(self.instances) - alive:
if (inst_info.management_address and
self.resource.is_alive(inst_info.management_address)):
self.log.debug(
'Instance %s found alive after %s of %s attempts',
inst_info.id_, i, cfg.CONF.max_retries)
alive.add(inst_info)
else:
self.log.debug(
'Alive check failed for instance %s. Attempt %d of %d',
inst_info.id_, i, cfg.CONF.max_retries)
# all managed instances report alive
if alive == set(self.instances):
self._alive = [i.id_ for i in alive]
return alive, []
# zero managed instances report alive
if not alive:
self.log.debug(
'Alive check failed for all instances after %s attempts.',
cfg.CONF.max_retries)
return [], self.instances
dead = set(self.resources.values()) - alive
self._alive = [i.id_ for i in alive - dead]
return list(alive), list(dead)
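# Possible return shapes (instances are InstanceInfo objects):
#   all alive  -> (set of all instances, [])
#   none alive -> ([], list of all instances)
#   mixed      -> (list of alive instances, list of dead instances)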
def update_ports(self, worker_context):
"""Refresh ports on all managed instance info objects"""
for instance_info in self.instances:
if not instance_info:
continue
(
instance_info.management_port,
instance_info.ports
) = worker_context.neutron.get_ports_for_instance(
instance_info.id_
)
def get_interfaces(self):
"""Obtain a list of interfaces from each managed instance
Skips any instance that has not already been verified as being alive.
:returns: dict of {instance: [interfaces_dict]}
"""
interfaces = {}
for inst in self.instances:
if inst.id_ not in self._alive:
self.log.debug(
'Skipping interface query on instance %s that '
'is not yet alive.', inst.id_)
continue
else:
interfaces[inst] = self.resource.get_interfaces(
inst.management_address)
return interfaces
def verify_interfaces(self, ports):
"""Verify at least one instance in group has correct ports plugged"""
for inst, interfaces in self.get_interfaces().items():
actual_macs = set((iface['lladdr'] for iface in interfaces))
self.log.debug(
'MACs found on %s: %s', inst.id_,
', '.join(sorted(actual_macs)))
if not all(
getattr(p, 'mac_address', None) for p in ports
):
return False
num_instance_ports = len(list(inst.ports))
num_logical_ports = len(list(ports))
if num_logical_ports != num_instance_ports:
self.log.debug(
'Expected %s instance ports but found %s',
num_logical_ports, num_instance_ports)
return False
expected_macs = set(p.mac_address
for p in inst.ports)
expected_macs.add(inst.management_port.mac_address)
self.log.debug(
'MACs expected on %s: %s',
inst.id_, ', '.join(sorted(expected_macs)))
if actual_macs == expected_macs:
self.log.debug('Found all expected MACs on %s', inst.id_)
return True
self.log.debug(
'Did not find all expected MACs on instance %s, '
'actual MACs: %s', inst.id_, ', '.join(actual_macs))
return False
def _update_config(self, instance, config):
self.log.debug(
'Updating config for instance %s on resource %s',
instance.id_, self.resource.id)
self.log.debug('New config: %r', config)
attempts = cfg.CONF.max_retries
for i in six.moves.range(attempts):
try:
self.resource.update_config(
instance.management_address,
config)
except Exception:
if i == attempts - 1:
# Only log the traceback if we encounter it many times.
self.log.exception(_LE('failed to update config'))
else:
self.log.debug(
'failed to update config, attempt %d',
i
)
time.sleep(cfg.CONF.retry_delay)
else:
self.log.info('Instance config updated')
return True
else:
return False
def _ha_config(self, instance):
"""Builds configuration describing the HA cluster
This informs the instance about any configuration relating to the HA
cluster it should be joining. At the moment this is primarily used to inform
an instance about the management addresses of its peers.
:param instance: InstanceInfo object
:returns: dict of HA configuration
"""
peers = [
i.management_address for i in self.instances
if i.management_address != instance.management_address]
# determine cluster priority by instance age. the older instance
# gets the higher priority
sorted_by_age = sorted(
self.instances, key=lambda i: i.time_since_boot,
reverse=True)
if sorted_by_age.index(instance) == 0:
priority = 100
else:
priority = 50
return {
'peers': peers,
'priority': priority,
}
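# Illustrative example (addresses are hypothetical): for a two-instance HA
# pair, the older member might receive
#
#   {'peers': ['fdca:3ba5:a17a:acda::2'], 'priority': 100}
#
# while the younger member receives a peers list pointing back at the older
# member and 'priority': 50.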
def configure(self, worker_context):
# XXX config update can be dispatched to threads to speed
# things up across multiple instances
failed = []
# get_interfaces() returns only instances that are up and ready
# for config
instances_interfaces = self.get_interfaces()
for inst, interfaces in instances_interfaces.items():
# sending all the standard config over to the driver for
# final updates
config = self.resource.build_config(
worker_context,
inst.management_port,
_generate_interface_map(inst, interfaces)
)
# while drivers are free to express their own ha config
# requirements, the instance manager is the only one with
# high level view of the cluster, ie knowledge of membership
if self.resource.is_ha:
config['ha_config'] = config.get('ha') or {}
config['ha_config'].update(self._ha_config(inst))
self.log.debug(
'preparing to update config for instance %s on %s resource '
'to %r', inst.id_, self.resource.RESOURCE_NAME, config)
if self._update_config(inst, config) is not True:
failed.append(inst)
if set(failed) == set(self.instances):
# all updates have failed
self.log.error(
'Could not update config for any instances on %s resource %s, '
'marking resource state %s',
self.resource.RESOURCE_NAME, self.resource.id, states.RESTART)
return states.RESTART
elif failed:
# some updates to instances we thought to be alive have failed
self.log.error(
'Could not update config for some instances on %s '
'resource %s, marking resource state %s',
self.resource.RESOURCE_NAME, self.resource.id, states.DEGRADED)
return states.DEGRADED
elif len(instances_interfaces.keys()) != len(self.instances):
# instance_interfaces contains only instances that are alive
# if we're still waiting on instances, remain degraded
self.log.debug(
'Config updated on %s of %s instances',
len(instances_interfaces.keys()), len(self.instances))
return states.DEGRADED
else:
self.log.debug(
'Config updated across all instances on %s resource %s',
self.resource.RESOURCE_NAME, self.resource.id)
return states.CONFIGURED
def delete(self, instance):
"""Removes nova server reference from manager"""
del self.resources[instance.id_]
def refresh(self, worker_context):
"""Update nova server reference for all managed instances"""
for i in self.instances:
if not worker_context.nova_client.update_instance_info(i):
self.delete(i)
def destroy(self, worker_context):
"""Destroys all nova instances and blocks until deletion"""
worker_context.nova_client.delete_instances_and_wait(
self.instances)
def remove(self, worker_context, instance):
"""Destroys the nova instance, removes instance from group manager"""
worker_context.nova_client.destroy_instance(instance)
self.delete(instance)
@property
def next_instance_index(self):
ids = [
int(i.name.split('_')[1]) for i in
self.instances]
try:
return max(ids) + 1
except ValueError:
return 0
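# Illustrative example (hypothetical resource name): with instances named
# 'ak-routerf00d_0' and 'ak-routerf00d_1' already in the group, the next
# instance booted by create() below would be named 'ak-routerf00d_2'. With
# no instances at all, max() raises ValueError and the index starts at 0.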
def create(self, worker_context):
to_boot = self.required_instance_count - len(self.instances)
self.log.debug(
'Booting an additional %s instance(s) for resource %s',
to_boot, self.resource.id)
for i in six.moves.range(to_boot):
name = '%s_%s' % (self.resource.name, self.next_instance_index)
instance = worker_context.nova_client.boot_instance(
resource_type=self.resource.RESOURCE_NAME,
prev_instance_info=None,
name=name,
image_uuid=self.resource.image_uuid,
flavor=self.resource.flavor,
make_ports_callback=self.resource.make_ports(worker_context)
)
self.add_instance(instance)
@property
def required_instance_count(self):
if self.resource.is_ha is True:
return 2
else:
return 1
@property
def instance_count(self):
return len(self.instances)
@property
def cluster_degraded(self):
return self.instance_count < self.required_instance_count
def add_instance(self, instance):
"""Adds a new instance or updates existing"""
self.resources[instance.id_] = instance
class InstanceManager(object):
def __init__(self, resource, worker_context):
"""The instance manager is your interface to the running instance.
wether it be virtual, container or physical.
Service specific code lives in the driver which is passed in here.
:param resource: An driver instance for the managed resource
:param resource_id: UUID of logical resource
:param worker_context:
"""
self.resource = resource
self.log = self.resource.log
self.state = states.DOWN
self.instance_info = None
self.instances = InstanceGroupManager(self.log, self.resource)
self.last_error = None
self._boot_counter = BootAttemptCounter()
self._boot_logged = []
self._last_synced_status = None
self.state = self.update_state(worker_context, silent=True)
@property
def attempts(self):
"""Property which returns the boot count.
:returns Int:
"""
return self._boot_counter.count
def reset_boot_counter(self):
"""Resets the boot counter.
:returns None:
"""
self._boot_counter.reset()
@synchronize_driver_state
@ensure_cache
def update_state(self, worker_context, silent=False):
"""Updates state of the instance and, by extension, its logical resource
:param worker_context:
:param silent:
:returns: state
"""
if self.resource.get_state(worker_context) == states.GONE:
self.log.debug('%s driver reported its state is %s',
self.resource.RESOURCE_NAME, states.GONE)
self.state = states.GONE
return self.state
if not self.instances:
self.log.info(_LI('no backing instance(s), marking as %s'),
states.DOWN)
self.state = states.DOWN
return self.state
elif self.instances.cluster_degraded is True:
self.log.info(_LI(
'instance cluster for resource %s reports degraded'),
self.resource.id)
self.state = states.DEGRADED
return self.state
has_ports, no_ports = self.instances.validate_ports()
# an empty has_ports set means no instances have ports attached yet
if not has_ports:
self.log.debug('waiting for instance ports to be attached')
self.state = states.BOOTING
return self.state
# XXX TODO need to account for when only a subset of the cluster have
# correct ports, kick back to Replug
alive, dead = self.instances.are_alive()
if not alive:
# alive check failed on all instances for an already configured
# resource, mark it down.
# XXX need to track timeouts per instance
# self._check_boot_timeout()
if self.state == states.CONFIGURED:
self.log.debug('No instance(s) alive, marking it as %s',
states.DOWN)
self.state = states.DOWN
return self.state
elif dead:
# some subset of instances reported not alive, mark it degraded.
if self.state == states.CONFIGURED:
for i in dead:
instance = worker_context.nova_client.get_instance_by_id(
i.id_)
if instance is None and self.state != states.ERROR:
self.log.info(
'Instance %s was not found; rebooting', i.id_)
self.instances.delete(i)
self.state = states.DEGRADED
return self.state
self.instances.refresh(worker_context)
if self.state == states.CONFIGURED:
for i in alive:
if not i.booting and i not in self._boot_logged:
self.log.info(
'%s booted in %s seconds after %s attempts',
self.resource.RESOURCE_NAME,
i.time_since_boot.total_seconds(),
self._boot_counter.count)
self._boot_logged.append(i)
self.reset_boot_counter()
else:
if alive:
self.state = states.UP
return self.state
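# Rough summary of the states update_state() can settle on:
#   GONE      - the driver reports the logical resource no longer exists
#   DOWN      - no backing instances, or none respond while CONFIGURED
#   DEGRADED  - the cluster is missing members, or only a subset is alive
#   BOOTING   - instances exist but ports are not attached yet
#   UP        - instances respond but are not yet fully configured
# (a CONFIGURED state set elsewhere is preserved while everything is alive)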
@ensure_cache
def boot(self, worker_context):
"""Boots the instances with driver pre/post boot hooks.
:returns: None
"""
self.log.info('Booting %s' % self.resource.RESOURCE_NAME)
if self.state != states.DEGRADED:
self.state = states.DOWN
self._boot_counter.start()
# driver preboot hook
self.resource.pre_boot(worker_context)
try:
self.instances.create(worker_context)
if not self.instances:
self.log.info(_LI('Previous instances are still deleting'))
# Reset the boot counter, causing the state machine to start
# again with a new Instance.
self.reset_boot_counter()
return
except:
self.log.exception(_LE('Instance(s) failed to start booting'))
else:
self.state = states.BOOTING
# driver post boot hook
self.resource.post_boot(worker_context)
@synchronize_driver_state
@ensure_cache
def set_error(self, worker_context, silent=False):
"""Set the internal and neutron status for the router to states.ERROR.
This is called from outside when something notices the router
is "broken". We don't use it internally because this class is
supposed to do what it's told and not make decisions about
whether or not the router is fatally broken.
"""
self.state = states.ERROR
self.last_error = datetime.utcnow()
return self.state
@synchronize_driver_state
@ensure_cache
def clear_error(self, worker_context, silent=False):
"""Clear the internal error state.
This is called from outside when something wants to force a
router rebuild, so that the state machine that checks our
status won't think we are broken unless we actually break
again.
"""
# Clear the boot counter.
self._boot_counter.reset()
self.state = states.DOWN
return self.state
@property
def error_cooldown(self):
"""Returns True if the instance was recently set to states.ERROR state.
"""
if self.last_error and self.state == states.ERROR:
seconds_since_error = (
datetime.utcnow() - self.last_error
).total_seconds()
if seconds_since_error < cfg.CONF.error_state_cooldown:
return True
return False
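# For example, with the default error_state_cooldown of 30 seconds, an
# instance that entered states.ERROR 10 seconds ago reports True here, and
# reports False again once 30 seconds have elapsed or the error is cleared.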
@synchronize_driver_state
@ensure_cache
def stop(self, worker_context):
"""Attempts to destroy the instance cluster
:param worker_context:
:returns:
"""
self.log.info(_LI('Destroying instance'))
self.resource.delete_ports(worker_context)
if not self.instances:
self.log.info(_LI('Instance(s) already destroyed.'))
if self.state != states.GONE:
self.state = states.DOWN
return self.state
try:
self.instances.destroy(worker_context)
if self.state != states.GONE:
self.state = states.DOWN
except Exception:
self.log.exception(_LE('Failed to stop instance(s)'))
@synchronize_driver_state
@ensure_cache
def configure(self, worker_context):
"""Pushes config to instance
:param worker_context:
:returns:
"""
self.log.debug('Begin instance config')
self.state = states.UP
if self.resource.get_state(worker_context) == states.GONE:
return states.GONE
if not self.instances:
return states.DOWN
if not self.instances.verify_interfaces(self.resource.ports):
# XXX Need to acct for degraded cluster /w subset of nodes
# having incorrect plugging.
self.log.debug("Interfaces aren't plugged as expected.")
self.state = states.REPLUG
return self.state
self.state = self.instances.configure(worker_context)
return self.state
def replug(self, worker_context):
"""Attempts to replug the network ports for an instance.
:param worker_context:
:returns:
"""
self.log.debug('Attempting to replug...')
self.resource.pre_plug(worker_context)
for instance, interfaces in self.instances.get_interfaces().items():
actual_macs = set((iface['lladdr'] for iface in interfaces))
instance_macs = set(p.mac_address for p in instance.ports)
instance_macs.add(instance.management_port.mac_address)
if instance_macs != actual_macs:
# our cached copy of the ports is wrong; reboot and clean up
self.log.warning((
'Instance MACs (%s) do not match actual MACs (%s). Instance'
' cache appears out-of-sync'),
instance_macs, actual_macs
)
self.state = states.RESTART
return
instance_ports = {p.network_id: p for p in instance.ports}
instance_networks = set(instance_ports.keys())
logical_networks = set(p.network_id for p in self.resource.ports)
if logical_networks != instance_networks:
nova_instance = worker_context.nova_client.get_instance_by_id(
instance.id_
)
# For each port that doesn't have a mac address on the instance
for network_id in logical_networks - instance_networks:
port = worker_context.neutron.create_vrrp_port(
self.resource.id,
network_id
)
self.log.debug(
'Net %s is missing from the appliance instance %s, '
'plugging: %s', network_id, instance.id_, port.id
)
try:
nova_instance.interface_attach(port.id, None, None)
instance.ports.append(port)
except:
self.log.exception(
'Interface attach failed on instance %s',
instance.id_)
self.instances.remove(worker_context, instance)
# instance has been removed for failure, do not continue with
# plugging
if instance not in self.instances.values():
continue
ports_to_delete = []
for network_id in instance_networks - logical_networks:
port = instance_ports[network_id]
self.log.debug(
'Net %s is detached from the router, unplugging: %s',
network_id, port.id
)
try:
nova_instance.interface_detach(port.id)
instance.ports.remove(port)
ports_to_delete.append(port)
except:
self.log.exception(
'Interface detach failed on instance %s',
instance.id_)
self.instances.remove(worker_context, instance)
# instance has been removed for failure, do not continue with
# plugging
if instance not in self.instances.values():
continue
if self._wait_for_interface_hotplug(instance) is not True:
self.instances.remove(worker_context, instance)
if not self.instances:
# all instances were destroyed for plugging failure
self.state = states.RESTART
elif self.instances.cluster_degraded:
# some instances were destroyed for plugging failure
self.state = states.DEGRADED
else:
# plugging was successful
for p in ports_to_delete:
worker_context.neutron.api_client.delete_port(p.id)
return
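# Outcomes of replug(): the state is left untouched when plugging succeeds,
# set to states.DEGRADED when only some instances had to be removed for a
# plugging failure, and set to states.RESTART when the MAC cache is out of
# sync or every instance in the group was removed.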
def _wait_for_interface_hotplug(self, instance):
"""Waits for instance to report interfaces for all expected ports"""
# The action of attaching/detaching interfaces in Nova happens via
# the message bus and is *not* blocking. We need to wait a few
# seconds to see if the list of tap devices on the appliance actually
# changed. If not, assume the hotplug failed, and reboot the
# Instance.
for i in six.moves.range(1, cfg.CONF.hotplug_timeout):
self.log.debug(
"Waiting for interface attachments to take effect..."
)
interfaces = self.resource.get_interfaces(
instance.management_address)
actual_macs = set((iface['lladdr'] for iface in interfaces))
instance_macs = set(p.mac_address for p in instance.ports)
instance_macs.add(instance.management_port.mac_address)
if actual_macs == instance_macs:
return True
time.sleep(1)
else:
self.log.debug(
"Interfaces aren't plugged as expected on instance %s, ",
"marking for rebooting.", instance.id_)
return False
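# With the loop above, the appliance is polled roughly once per second for
# up to hotplug_timeout - 1 attempts (9 with the default of 10) before the
# hotplug is reported as failed.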
def _check_boot_timeout(self):
"""If the instance was created more than `boot_timeout` seconds
ago, log an error and set the state to states.DOWN
"""
time_since_boot = self.instance_info.time_since_boot
if time_since_boot:
if time_since_boot.total_seconds() < cfg.CONF.boot_timeout:
# Do not reset the state if we have an error
# condition already. The state will be reset when
# the router starts responding again, or when the
# error is cleared from a forced rebuild.
if self.state != states.ERROR:
self.state = states.BOOTING
else:
# If the instance was created more than `boot_timeout` seconds
# ago, log an error and set the state to states.DOWN
self.log.info(
'Router is DOWN. Created over %d secs ago.',
cfg.CONF.boot_timeout)
# Do not reset the state if we have an error condition
# already. The state will be reset when the router starts
# responding again, or when the error is cleared from a
# forced rebuild.
if self.state != states.ERROR:
self.state = states.DOWN