# -*- coding: utf-8 -*-
# Copyright 2013 Mirantis, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
import uuid
import itertools
import traceback
import subprocess
import shlex
import json
import web
import netaddr
from sqlalchemy.orm import object_mapper, ColumnProperty
from sqlalchemy import or_
import nailgun.rpc as rpc
from nailgun.db import orm
from nailgun.logger import logger
from nailgun.settings import settings
from nailgun.notifier import notifier
from nailgun.network.manager import NetworkManager
from nailgun.api.models import Base
from nailgun.api.models import Network
from nailgun.api.models import NetworkGroup
from nailgun.api.models import Node
from nailgun.api.models import Cluster
from nailgun.api.models import IPAddr
from nailgun.api.validators import BasicValidator
from nailgun.task.fake import FAKE_THREADS
from nailgun.errors import errors
from nailgun.task.helpers import TaskHelper


def fake_cast(queue, messages, **kwargs):
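    """Emulate rpc.cast with fake task threads.

    Used when FAKE_TASKS or FAKE_TASKS_AMQP is enabled: messages are
    handled by the threads registered in FAKE_THREADS instead of being
    cast to the orchestrator. A list of messages is executed as a chain,
    each fake thread being given the previous one to join.
    """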
def make_thread(message, join_to=None):
thread = FAKE_THREADS[message['method']](
data=message,
params=kwargs,
join_to=join_to
)
logger.debug("Fake thread called: data: %s, params: %s",
message, kwargs)
thread.start()
thread.name = message['method'].upper()
return thread

    if isinstance(messages, list):
thread = None
for m in messages:
thread = make_thread(m, join_to=thread)
else:
make_thread(messages)


if settings.FAKE_TASKS or settings.FAKE_TASKS_AMQP:
rpc.cast = fake_cast


class DeploymentTask(object):
    # LOGIC
    # Use cases:
    # 1. Cluster exists, node(s) added
    #    If we add one node to an existing OpenStack cluster, other nodes may
    #    require updates (redeployment), but they don't require full system
    #    reinstallation.
    #    How to: run deployment for all nodes whose system type is 'target'.
    #    Run provisioning first and then deployment for nodes whose system
    #    type is 'discover'.
    #    Q: Should we care about node status (provisioning, error, deploying)?
    #    A: offline - the node doesn't respond (agent doesn't run, not
    #           implemented); let's say the user should remove this node
    #           from the cluster before deployment.
    #       ready - target OS is loaded and the node is OK; we redeploy
    #           ready nodes only if the cluster has pending changes, i.e.
    #           network or cluster attrs were changed.
    #       discover - in discovery mode, provisioning is required.
    #       provisioning - at the time of task execution there should be no
    #           such case. If there is, the previous provisioning has failed;
    #           a possible solution would be to try provisioning again.
    #       deploying - same as provisioning, but stuck in a previous deploy;
    #           solution: try to deploy. May lose some data if reprovisioned.
    #       error - recognized error in deployment or provisioning. We have
    #           to know where the error was. If in deployment, reprovisioning
    #           may not be a solution (data can be lost); if in provisioning,
    #           provisioning & deployment can be done again.
    # 2. New cluster, nodes just added
    #    Provision first, then deploy.
    # 3. Remove some nodes and add others
    #    The deletion task runs first and actually removes the nodes,
    #    including removal from the DB; however, removal from the DB happens
    #    when remove_nodes_resp is run. This means we have to filter nodes
    #    and not run deployment on those which are prepared for removal.

@classmethod
def message(cls, task):
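        """Build the 'deploy' message that is cast to the orchestrator.

        Assigns management, public and storage IPs to the nodes selected
        for deployment, resets their pending_addition flag (nodes in
        'ready'/'deploying' state go back to 'provisioned'), and collects
        cluster-wide attributes: network ranges, network manager settings
        and, in HA mode, the management/public VIPs.

        The resulting message has roughly this shape:
            {'method': 'deploy',
             'respond_to': 'deploy_resp',
             'args': {'task_uuid': ..., 'nodes': [...], 'attributes': {...}}}
        """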
logger.debug("DeploymentTask.message(task=%s)" % task.uuid)
task_uuid = task.uuid
cluster_id = task.cluster.id
netmanager = NetworkManager()
nodes = TaskHelper.nodes_to_deploy(task.cluster)
logger.info("Associated FQDNs to nodes: %s" %
', '.join([n.fqdn for n in nodes]))
nodes_ids = [n.id for n in nodes]
if nodes_ids:
logger.info("Assigning IP addresses to nodes..")
netmanager.assign_ips(nodes_ids, "management")
netmanager.assign_ips(nodes_ids, "public")
netmanager.assign_ips(nodes_ids, "storage")
nodes_with_attrs = []
for n in nodes:
n.pending_addition = False
if n.status in ('ready', 'deploying'):
n.status = 'provisioned'
n.progress = 0
orm().add(n)
orm().commit()
nodes_with_attrs.append(cls.__format_node_for_naily(n))
cluster_attrs = task.cluster.attributes.merged_attrs_values()
cluster_attrs['controller_nodes'] = cls.__controller_nodes(cluster_id)
nets_db = orm().query(Network).join(NetworkGroup).\
filter(NetworkGroup.cluster_id == cluster_id).all()
ng_db = orm().query(NetworkGroup).filter_by(
cluster_id=cluster_id).all()
for net in ng_db:
net_name = net.name + '_network_range'
if net.name == 'floating':
cluster_attrs[net_name] = \
cls.__get_ip_addresses_in_ranges(net)
elif net.name == 'public':
# We shouldn't pass public_network_range attribute
continue
else:
cluster_attrs[net_name] = net.cidr
cluster_attrs['network_manager'] = task.cluster.net_manager
fixed_net = orm().query(NetworkGroup).filter_by(
cluster_id=cluster_id).filter_by(name='fixed').first()
# network_size is required for all managers, otherwise
# puppet will use default (255)
cluster_attrs['network_size'] = fixed_net.network_size
if cluster_attrs['network_manager'] == 'VlanManager':
cluster_attrs['num_networks'] = fixed_net.amount
cluster_attrs['vlan_start'] = fixed_net.vlan_start
cls.__add_vlan_interfaces(nodes_with_attrs)
if task.cluster.mode == 'ha':
logger.info("HA mode chosen, creating VIP addresses for it..")
cluster_attrs['management_vip'] = netmanager.assign_vip(
cluster_id, "management")
cluster_attrs['public_vip'] = netmanager.assign_vip(
cluster_id, "public")
cluster_attrs['deployment_mode'] = task.cluster.mode
cluster_attrs['deployment_id'] = cluster_id
message = {
'method': 'deploy',
'respond_to': 'deploy_resp',
'args': {
'task_uuid': task.uuid,
'nodes': nodes_with_attrs,
'attributes': cluster_attrs
}
}
return message
@classmethod
def execute(cls, task):
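        """Cache the deployment message on the task and cast it to 'naily'."""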
logger.debug("DeploymentTask.execute(task=%s)" % task.uuid)
message = cls.message(task)
task.cache = message
orm().add(task)
orm().commit()
rpc.cast('naily', message)
@classmethod
def __format_node_for_naily(cls, n):
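        """Serialize a node into the dict format expected by naily."""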
netmanager = NetworkManager()
return {
'id': n.id, 'status': n.status, 'error_type': n.error_type,
'uid': n.id, 'ip': n.ip, 'mac': n.mac, 'role': n.role,
'fqdn': n.fqdn, 'progress': n.progress, 'meta': n.meta,
'network_data': netmanager.get_node_networks(n.id),
'online': n.online
}
@classmethod
def __add_vlan_interfaces(cls, nodes):
"""
We shouldn't pass to orchetrator fixed network
when network manager is VlanManager, but we should specify
fixed_interface (private_interface in terms of fuel) as result
we just pass vlan_interface as node attribute.
"""
netmanager = NetworkManager()
for node in nodes:
node_db = orm().query(Node).get(node['id'])
fixed_interface = netmanager._get_interface_by_network_name(
node_db, 'fixed')
node['vlan_interface'] = fixed_interface.name
@classmethod
def __controller_nodes(cls, cluster_id):
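        """Serialize the cluster's controller nodes.

        Nodes pending deletion are excluded.
        """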
nodes = orm().query(Node).filter_by(
cluster_id=cluster_id,
role='controller',
pending_deletion=False).order_by(Node.id)
return map(cls.__format_node_for_naily, nodes)
@classmethod
def __get_ip_addresses_in_ranges(cls, network_group):
"""
Get array of all possibale ip addresses in all ranges
"""
ranges = []
for ip_range in network_group.ip_ranges:
ranges += map(lambda ip: str(ip),
list(netaddr.IPRange(ip_range.first, ip_range.last)))
        # Return only unique IP addresses
return sorted(list(set(ranges)))


class ProvisionTask(object):
@classmethod
def message(cls, task):
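        """Build the 'provision' message that is cast to the orchestrator.

        For every node to be provisioned, collects Cobbler-oriented data:
        profile, SSH power settings, ks_meta (puppet and mcollective
        options, disk layout), and per-interface admin network IPs.
        Offline nodes abort the task unless fake tasks are enabled.
        """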
logger.debug("ProvisionTask.message(task=%s)" % task.uuid)
        # cluster_attrs is used to set 'auth_key' in cobbler ks_meta
cluster_attrs = task.cluster.attributes.merged_attrs_values()
nodes = TaskHelper.nodes_to_provision(task.cluster)
netmanager = NetworkManager()
USE_FAKE = settings.FAKE_TASKS or settings.FAKE_TASKS_AMQP
# TODO: For now we send nodes data to orchestrator
# which is cobbler oriented. But for future we
# need to use more abstract data structure.
nodes_data = []
for node in nodes:
if not node.online:
if not USE_FAKE:
raise Exception(
u"Node '%s' (id=%s) is offline."
" Remove it from environment and try again." %
(node.name, node.id)
)
else:
logger.warning(
u"Node '%s' (id=%s) is offline."
" Remove it from environment and try again." %
(node.name, node.id)
)
node_data = {
'profile': settings.COBBLER_PROFILE,
'power_type': 'ssh',
'power_user': 'root',
'power_address': node.ip,
'name': TaskHelper.make_slave_name(node.id, node.role),
'hostname': node.fqdn,
'name_servers': '\"%s\"' % settings.DNS_SERVERS,
'name_servers_search': '\"%s\"' % settings.DNS_SEARCH,
'netboot_enabled': '1',
'ks_meta': {
'puppet_auto_setup': 1,
'puppet_master': settings.PUPPET_MASTER_HOST,
'puppet_version': settings.PUPPET_VERSION,
'puppet_enable': 0,
'mco_auto_setup': 1,
'install_log_2_syslog': 1,
'mco_pskey': settings.MCO_PSKEY,
'mco_vhost': settings.MCO_VHOST,
'mco_host': settings.MCO_HOST,
'mco_user': settings.MCO_USER,
'mco_password': settings.MCO_PASSWORD,
'mco_connector': settings.MCO_CONNECTOR,
'mco_enable': 1,
'auth_key': "\"%s\"" % cluster_attrs.get('auth_key', ''),
'ks_spaces': "\"%s\"" % json.dumps(
node.attributes.volumes).replace("\"", "\\\"")
}
}
if node.status == "discover":
logger.info(
"Node %s seems booted with bootstrap image",
node.id
)
node_data['power_pass'] = settings.PATH_TO_BOOTSTRAP_SSH_KEY
else:
# If it's not in discover, we expect it to be booted
# in target system.
# TODO: Get rid of expectations!
logger.info(
"Node %s seems booted with real system",
node.id
)
node_data['power_pass'] = settings.PATH_TO_SSH_KEY
# FIXME: move this code (updating) into receiver.provision_resp
if not USE_FAKE:
node.status = "provisioning"
orm().add(node)
orm().commit()
# here we assign admin network IPs for node
# one IP for every node interface
netmanager.assign_admin_ips(
node.id,
len(node.meta.get('interfaces', []))
)
admin_net_id = netmanager.get_admin_network_id()
admin_ips = set([i.ip_addr for i in orm().query(IPAddr).
filter_by(node=node.id).
filter_by(network=admin_net_id)])
for i in node.meta.get('interfaces', []):
if 'interfaces' not in node_data:
node_data['interfaces'] = {}
node_data['interfaces'][i['name']] = {
'mac_address': i['mac'],
'static': '0',
'netmask': settings.ADMIN_NETWORK['netmask'],
'ip_address': admin_ips.pop(),
}
# interfaces_extra field in cobbler ks_meta
# means some extra data for network interfaces
# configuration. It is used by cobbler snippet.
# For example, cobbler interface model does not
# have 'peerdns' field, but we need this field
# to be configured. So we use interfaces_extra
# branch in order to set this unsupported field.
if 'interfaces_extra' not in node_data:
node_data['interfaces_extra'] = {}
node_data['interfaces_extra'][i['name']] = {
'peerdns': 'no',
'onboot': 'no'
}
                # We want the node to be able to PXE boot via any of its
                # interfaces. That is why we add all discovered
                # interfaces into the cobbler system. But we want the
                # assigned fqdn to be resolved into one IP address,
                # because we don't completely support multi-interface
                # configuration yet.
if i['mac'] == node.mac:
node_data['interfaces'][i['name']]['dns_name'] = node.fqdn
node_data['interfaces_extra'][i['name']]['onboot'] = 'yes'
nodes_data.append(node_data)
if not USE_FAKE:
TaskHelper.prepare_syslog_dir(node)
message = {
'method': 'provision',
'respond_to': 'provision_resp',
'args': {
'task_uuid': task.uuid,
'engine': {
'url': settings.COBBLER_URL,
'username': settings.COBBLER_USER,
'password': settings.COBBLER_PASSWORD,
},
'nodes': nodes_data
}
}
return message
@classmethod
def execute(cls, task):
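        """Cache the provisioning message on the task and cast it to 'naily'."""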
logger.debug("ProvisionTask.execute(task=%s)" % task.uuid)
message = cls.message(task)
task.cache = message
orm().add(task)
orm().commit()
rpc.cast('naily', message)


class DeletionTask(object):
@classmethod
    def execute(cls, task, respond_to='remove_nodes_resp'):
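        """Remove the nodes that are pending deletion.

        Offline nodes are deleted from the database right away; the rest
        are handed over to the orchestrator for removal from Cobbler,
        while their puppet certificates are cleaned here via
        'puppet cert clean'. In fake task mode the original node
        attributes are also passed along so the nodes can be restored.
        """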
logger.debug("DeletionTask.execute(task=%s)" % task.uuid)
task_uuid = task.uuid
logger.debug("Nodes deletion task is running")
nodes_to_delete = []
nodes_to_delete_constant = []
nodes_to_restore = []
USE_FAKE = settings.FAKE_TASKS or settings.FAKE_TASKS_AMQP
# no need to call naily if there are no nodes in cluster
if respond_to == 'remove_cluster_resp' and \
not list(task.cluster.nodes):
rcvr = rpc.receiver.NailgunReceiver()
rcvr.initialize()
rcvr.remove_cluster_resp(
task_uuid=task_uuid,
status='ready',
progress=100
)
return
for node in task.cluster.nodes:
if node.pending_deletion:
nodes_to_delete.append({
'id': node.id,
'uid': node.id,
'role': node.role
})
if USE_FAKE:
# only fake tasks
new_node = {}
keep_attrs = (
'id',
'cluster_id',
'role',
'pending_deletion',
'pending_addition'
)
for prop in object_mapper(node).iterate_properties:
if isinstance(
prop, ColumnProperty
) and prop.key not in keep_attrs:
new_node[prop.key] = getattr(node, prop.key)
nodes_to_restore.append(new_node)
# /only fake tasks
        # a copy of the list to iterate over, so that entries can be
        # removed from nodes_to_delete safely inside the loop
nodes_to_delete_constant = list(nodes_to_delete)
for node in nodes_to_delete_constant:
node_db = orm().query(Node).get(node['id'])
slave_name = TaskHelper.make_slave_name(
node['id'], node['role']
)
logger.debug("Removing node from database and pending it "
"to clean its MBR: %s", slave_name)
if not node_db.online:
logger.info(
"Node is offline, can't MBR clean: %s", slave_name)
orm().delete(node_db)
orm().commit()
nodes_to_delete.remove(node)
# only real tasks
engine_nodes = []
if not USE_FAKE:
for node in nodes_to_delete_constant:
slave_name = TaskHelper.make_slave_name(
node['id'], node['role']
)
logger.debug("Pending node to be removed from cobbler %s",
slave_name)
engine_nodes.append(slave_name)
try:
node_db = orm().query(Node).get(node['id'])
if node_db and node_db.fqdn:
node_hostname = node_db.fqdn
else:
node_hostname = TaskHelper.make_slave_fqdn(
node['id'], node['role'])
logger.info("Removing node cert from puppet: %s",
node_hostname)
cmd = "puppet cert clean {0}".format(node_hostname)
proc = subprocess.Popen(
shlex.split(cmd),
shell=False,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE
)
p_stdout, p_stderr = proc.communicate()
logger.info(
"'{0}' executed, STDOUT: '{1}',"
" STDERR: '{2}'".format(
cmd,
p_stdout,
p_stderr
)
)
                except OSError:
                    logger.warning(
                        "Failed to execute '{0}': command not found "
                        "or not executable".format(cmd)
                    )
                except Exception as e:
                    logger.warning(
                        "Exception occurred while cleaning the node cert "
                        "from puppet: '{0}'".format(e.message)
                    )
msg_delete = {
'method': 'remove_nodes',
'respond_to': respond_to,
'args': {
'task_uuid': task.uuid,
'nodes': nodes_to_delete,
'engine': {
'url': settings.COBBLER_URL,
'username': settings.COBBLER_USER,
'password': settings.COBBLER_PASSWORD,
},
'engine_nodes': engine_nodes
}
}
# only fake tasks
if USE_FAKE and nodes_to_restore:
msg_delete['args']['nodes_to_restore'] = nodes_to_restore
# /only fake tasks
logger.debug("Calling rpc remove_nodes method")
rpc.cast('naily', msg_delete)


class ClusterDeletionTask(object):
@classmethod
def execute(cls, task):
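        """Remove the whole cluster via DeletionTask."""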
logger.debug("Cluster deletion task is running")
DeletionTask.execute(task, 'remove_cluster_resp')


class VerifyNetworksTask(object):
@classmethod
    def execute(cls, task, data):
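        """Build and cast the 'verify_networks' message.

        For every node, lists which VLAN IDs should be checked on each of
        its interfaces; the FuelWeb admin network is represented by
        VLAN 0.
        """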
task_uuid = task.uuid
nodes = []
for n in task.cluster.nodes:
node_json = {'uid': n.id, 'networks': []}
for nic in n.interfaces:
vlans = []
for ng in nic.assigned_networks:
# Handle FuelWeb admin network first.
if not ng.cluster_id:
vlans.append(0)
continue
data_ng = filter(
lambda i: i['name'] == ng.name,
data
)[0]
vlans.extend(data_ng['vlans'])
if not vlans:
continue
node_json['networks'].append(
{'iface': nic.name, 'vlans': vlans}
)
nodes.append(node_json)
message = {'method': 'verify_networks',
'respond_to': 'verify_networks_resp',
'args': {'task_uuid': task.uuid,
'nodes': nodes}}
logger.debug("Network verification is called with: %s", message)
task.cache = message
orm().add(task)
orm().commit()
rpc.cast('naily', message)


class CheckNetworksTask(object):
@classmethod
    def execute(cls, task, data):
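        """Validate the network settings passed in 'data'.

        Checks every network group for intersection with the admin
        network(s), for a CIDR large enough for the requested size and
        amount, and for an 'amount' compatible with the chosen network
        manager. Collected errors are stored in task.result and raised
        as a NetworkCheckError.
        """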
# If not set in data then fetch from db
if 'net_manager' in data:
netmanager = data['net_manager']
else:
netmanager = task.cluster.net_manager
if 'networks' in data:
networks = data['networks']
else:
networks = map(lambda x: x.__dict__, task.cluster.network_groups)
result = []
err_msgs = []
for ng in networks:
net_errors = []
ng_db = orm().query(NetworkGroup).get(ng['id'])
if not ng_db:
net_errors.append("id")
err_msgs.append("Invalid network ID: {0}".format(ng['id']))
else:
if 'cidr' in ng:
fnet = netaddr.IPSet([ng['cidr']])
if fnet & netaddr.IPSet(settings.NET_EXCLUDE):
net_errors.append("cidr")
err_msgs.append(
"Intersection with admin "
"network(s) '{0}' found".format(
settings.NET_EXCLUDE
)
)
if fnet.size < ng['network_size'] * ng['amount']:
net_errors.append("cidr")
err_msgs.append(
"CIDR size for network '{0}' "
"is less than required".format(
ng.get('name') or ng_db.name or ng_db.id
)
)
if ng.get('amount') > 1 and netmanager == 'FlatDHCPManager':
net_errors.append("amount")
err_msgs.append(
"Network amount for '{0}' is more than 1 "
"while using FlatDHCP manager.".format(
ng.get('name') or ng_db.name or ng_db.id
)
)
if net_errors:
result.append({
"id": int(ng["id"]),
"errors": net_errors
})
if err_msgs:
task.result = result
orm().add(task)
orm().commit()
full_err_msg = "\n".join(err_msgs)
raise errors.NetworkCheckError(full_err_msg)


class CheckBeforeDeploymentTask(object):
@classmethod
def execute(cls, task):
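        """Run pre-deployment checks: disk space and public IP pool size."""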
cls.__check_disks(task)
cls.__check_network(task)
@classmethod
def __check_disks(cls, task):
for node in task.cluster.nodes:
node.volume_manager.check_free_space()
@classmethod
def __check_network(cls, task):
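        """Ensure the public network provides at least one IP address per node."""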
netmanager = NetworkManager()
nodes_count = len(task.cluster.nodes)
public_network = filter(
lambda ng: ng.name == 'public',
task.cluster.network_groups)[0]
public_network_size = cls.__network_size(public_network)
if public_network_size < nodes_count:
error_message = cls.__format_network_error(nodes_count)
raise errors.NetworkCheckError(error_message)
@classmethod
def __network_size(cls, network):
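        """Count the IP addresses available in all ranges of the network."""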
size = 0
for ip_range in network.ip_ranges:
size += len(netaddr.IPRange(ip_range.first, ip_range.last))
return size
@classmethod
def __format_network_error(cls, nodes_count):
        return (
            'Not enough IP addresses. Public network must have at least '
            '{nodes_count} IP addresses for the current '
            'environment.'.format(nodes_count=nodes_count)
        )