# nova/nova/compute/node.py
# vim: tabstop=4 shiftwidth=4 softtabstop=4
# Copyright [2010] [Anso Labs, LLC]
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Compute Node:
Runs on each compute node, managing the
hypervisor using libvirt.
"""
import base64
import json
import logging
import os
import random
import shutil
import sys
from nova import vendor
from twisted.internet import defer
from twisted.internet import task
from twisted.application import service
try:
    import libvirt
except Exception, err:
    logging.warning('no libvirt found')
from nova import exception
from nova import fakevirt
from nova import flags
from nova import process
from nova import utils
from nova.compute import disk
from nova.compute import model
from nova.compute import network
from nova.objectstore import image # for images_path flag

FLAGS = flags.FLAGS
flags.DEFINE_string('libvirt_xml_template',
                    utils.abspath('compute/libvirt.xml.template'),
                    'Libvirt XML Template')
flags.DEFINE_bool('use_s3', True,
                  'whether to get images from s3 or use local copy')
flags.DEFINE_string('instances_path', utils.abspath('../instances'),
                    'where instances are stored on disk')
flags.DEFINE_string('instances_prefix', 'compute-',
                    'prefix for keepers for instances')
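
# Instance sizes, keyed by EC2-style type name: memory in MB, number of
# virtual CPUs, and local (ephemeral) disk in GB used to size the partition
# built in _create_image().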
INSTANCE_TYPES = {}
INSTANCE_TYPES['m1.tiny'] = {'memory_mb': 512, 'vcpus': 1, 'local_gb': 0}
INSTANCE_TYPES['m1.small'] = {'memory_mb': 1024, 'vcpus': 1, 'local_gb': 10}
INSTANCE_TYPES['m1.medium'] = {'memory_mb': 2048, 'vcpus': 2, 'local_gb': 10}
INSTANCE_TYPES['m1.large'] = {'memory_mb': 4096, 'vcpus': 4, 'local_gb': 10}
INSTANCE_TYPES['m1.xlarge'] = {'memory_mb': 8192, 'vcpus': 4, 'local_gb': 10}
INSTANCE_TYPES['c1.medium'] = {'memory_mb': 2048, 'vcpus': 4, 'local_gb': 10}
# The number of processes to start in our process pool
# TODO(termie): this should probably be a flag and the pool should probably
# be a singleton
PROCESS_POOL_SIZE = 4


class Node(object, service.Service):
    """
    Manages the running instances.
    """
    def __init__(self):
        """ load configuration options for this node and connect to libvirt """
        super(Node, self).__init__()
        self._instances = {}
        self._conn = self._get_connection()
        self._pool = process.Pool(PROCESS_POOL_SIZE)
        self.instdir = model.InstanceDirectory()
        # TODO(joshua): This needs to ensure system state, specifically:
        #               modprobe aoe

    def _get_connection(self):
        """ returns a libvirt connection object """
        # TODO(termie): maybe lazy load after initial check for permissions
        # TODO(termie): check whether we can be disconnected
        if FLAGS.fake_libvirt:
            conn = fakevirt.FakeVirtConnection.instance()
        else:
            auth = [[libvirt.VIR_CRED_AUTHNAME, libvirt.VIR_CRED_NOECHOPROMPT],
                    'root',
                    None]
            conn = libvirt.openAuth('qemu:///system', auth, 0)
            if conn is None:
                logging.error('Failed to open connection to the hypervisor')
                sys.exit(1)
        return conn

    def noop(self):
        """ simple test of an AMQP message call """
        return defer.succeed('PONG')

    def get_instance(self, instance_id):
        # inst = self.instdir.get(instance_id)
        # return inst
        if self.instdir.exists(instance_id):
            return Instance.fromName(self._conn, self._pool, instance_id)
        return None

    @exception.wrap_exception
    def adopt_instances(self):
        """ if there are instances already running, adopt them """
        # NOTE: adoption is currently short-circuited; the lookup logic below
        # is unreachable until this early return is removed.
        return defer.succeed(0)
        instance_names = [self._conn.lookupByID(x).name()
                          for x in self._conn.listDomainsID()]
        for name in instance_names:
            try:
                new_inst = Instance.fromName(self._conn, self._pool, name)
                new_inst.update_state()
            except:
                pass
        return defer.succeed(len(self._instances))

    @exception.wrap_exception
    def describe_instances(self):
        retval = {}
        for inst in self.instdir.by_node(FLAGS.node_name):
            retval[inst['instance_id']] = (
                    Instance.fromName(self._conn, self._pool,
                                      inst['instance_id']))
        return retval

    def report_state(self):
        # NOTE: currently a stub; @defer.inlineCallbacks was dropped because
        # the body never yields, which would break when the wrapper tried to
        # iterate a non-generator.
        logging.debug("Reporting State")
        return defer.succeed(None)

    @exception.wrap_exception
    def run_instance(self, instance_id, **_kwargs):
        """ launch a new instance with specified options """
        logging.debug("Starting instance %s..." % (instance_id))
        inst = self.instdir.get(instance_id)
        inst['node_name'] = FLAGS.node_name
        inst.save()
        # TODO(vish): check to make sure the availability zone matches
        new_inst = Instance(self._conn, name=instance_id,
                            pool=self._pool, data=inst)
        if new_inst.is_running():
            raise exception.Error("Instance is already running")
        d = new_inst.spawn()
        return d

    @exception.wrap_exception
    def terminate_instance(self, instance_id):
        """ terminate an instance on this machine """
        logging.debug("Got told to terminate instance %s" % instance_id)
        instance = self.get_instance(instance_id)
        # inst = self.instdir.get(instance_id)
        if not instance:
            raise exception.Error(
                    'trying to terminate unknown instance: %s' % instance_id)
        d = instance.destroy()
        # d.addCallback(lambda x: inst.destroy())
        return d

    @exception.wrap_exception
    def reboot_instance(self, instance_id):
        """ reboot an instance on this server
        (KVM doesn't support reboot, so we terminate and restart) """
        instance = self.get_instance(instance_id)
        if not instance:
            raise exception.Error(
                    'trying to reboot unknown instance: %s' % instance_id)
        return instance.reboot()

    @defer.inlineCallbacks
    @exception.wrap_exception
    def get_console_output(self, instance_id):
        """ send the console output for an instance """
        logging.debug("Getting console output for %s" % (instance_id))
        inst = self.instdir.get(instance_id)
        instance = self.get_instance(instance_id)
        if not instance:
            raise exception.Error(
                    'trying to get console log for unknown: %s' % instance_id)
        rv = yield instance.console_output()
        # TODO(termie): this stuff belongs in the API layer, no need to
        #               munge the data we send to ourselves
        output = {"InstanceId": instance_id,
                  "Timestamp": "2",
                  "output": base64.b64encode(rv)}
        defer.returnValue(output)

    @exception.wrap_exception
    def attach_volume(self, instance_id=None, aoe_device=None,
                      mountpoint=None):
        # NOTE: @defer.inlineCallbacks was dropped here; the method never
        # yields, so callers just get the plain Deferred returned below.
        utils.runthis("Attached Volume: %s",
                      "sudo virsh attach-disk %s /dev/etherd/%s %s"
                      % (instance_id, aoe_device, mountpoint.split("/")[-1]))
        return defer.succeed(True)

    def _init_aoe(self):
        utils.runthis("Doin an AoE discover, returns %s", "sudo aoe-discover")
        utils.runthis("Doin an AoE stat, returns %s", "sudo aoe-stat")

    @exception.wrap_exception
    def detach_volume(self, instance_id, mountpoint):
        """ detach a volume from an instance """
        # despite the documentation, virsh detach-disk just wants the device
        # name without the leading /dev/
        target = mountpoint.rpartition('/dev/')[2]
        utils.runthis("Detached Volume: %s", "sudo virsh detach-disk %s %s "
                      % (instance_id, target))
        return defer.succeed(True)


class Group(object):
    def __init__(self, group_id):
        self.group_id = group_id


class ProductCode(object):
    def __init__(self, product_code):
        self.product_code = product_code


def _create_image(data, libvirt_xml):
    """ create libvirt.xml and copy files into instance path """

    def basepath(path=''):
        return os.path.abspath(os.path.join(data['basepath'], path))

    def imagepath(path=''):
        return os.path.join(FLAGS.images_path, path)

    def image_url(path):
        return "%s:%s/_images/%s" % (FLAGS.s3_host, FLAGS.s3_port, path)

    logging.info(basepath('disk'))
    try:
        os.makedirs(data['basepath'])
        os.chmod(data['basepath'], 0777)
    except OSError:
        # TODO: there is already an instance with this name, do something
        pass
    try:
        logging.info('Creating image for: %s', data['instance_id'])
        f = open(basepath('libvirt.xml'), 'w')
        f.write(libvirt_xml)
        f.close()

        if not FLAGS.fake_libvirt:
            if FLAGS.use_s3:
                if not os.path.exists(basepath('disk')):
                    utils.fetchfile(image_url("%s/image" % data['image_id']),
                                    basepath('disk-raw'))
                if not os.path.exists(basepath('kernel')):
                    utils.fetchfile(image_url("%s/image" % data['kernel_id']),
                                    basepath('kernel'))
                if not os.path.exists(basepath('ramdisk')):
                    utils.fetchfile(image_url("%s/image" % data['ramdisk_id']),
                                    basepath('ramdisk'))
            else:
                if not os.path.exists(basepath('disk')):
                    shutil.copyfile(imagepath("%s/image" % data['image_id']),
                                    basepath('disk-raw'))
                if not os.path.exists(basepath('kernel')):
                    shutil.copyfile(imagepath("%s/image" % data['kernel_id']),
                                    basepath('kernel'))
                if not os.path.exists(basepath('ramdisk')):
                    shutil.copyfile(imagepath("%s/image" % data['ramdisk_id']),
                                    basepath('ramdisk'))

            if data['key_data']:
                logging.info('Injecting key data into image %s' %
                             data['image_id'])
                disk.inject_key(data['key_data'], basepath('disk-raw'))
            if os.path.exists(basepath('disk')):
                os.remove(basepath('disk'))
            bytes = (INSTANCE_TYPES[data['instance_type']]['local_gb']
                     * 1024 * 1024 * 1024)
            disk.partition(basepath('disk-raw'), basepath('disk'), bytes)
        logging.info('Done create image for: %s', data['instance_id'])
    except Exception as ex:
        return {'exception': ex}


class Instance(object):

    NOSTATE = 0x00
    RUNNING = 0x01
    BLOCKED = 0x02
    PAUSED = 0x03
    SHUTDOWN = 0x04
    SHUTOFF = 0x05
    CRASHED = 0x06
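
    # NOTE: these values appear to mirror libvirt's virDomainState enum
    # (NOSTATE=0 through CRASHED=6), so info()['state'] can be compared to
    # them directly.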

    def is_pending(self):
        return (self.state == Instance.NOSTATE or self.state == 'pending')

    def is_destroyed(self):
        return self.state == Instance.SHUTOFF

    def is_running(self):
        logging.debug("Instance state is: %s" % self.state)
        return (self.state == Instance.RUNNING or self.state == 'running')

    def __init__(self, conn, pool, name, data):
        """ spawn an instance with a given name """
        # TODO(termie): pool should probably be a singleton instead of being
        #               passed here and in the classmethods
        self._pool = pool
        self._conn = conn
        self.datamodel = data
        logging.debug(data)

        # NOTE(termie): to be passed to multiprocess self._s must be
        #               pickle-able by cPickle
        self._s = {}

        # TODO(termie): is instance_type the actual name for this?
        size = data.get('instance_type', FLAGS.default_instance_type)
        if size not in INSTANCE_TYPES:
            raise exception.Error('invalid instance type: %s' % size)
        self._s.update(INSTANCE_TYPES[size])

        self._s['name'] = name
        self._s['instance_id'] = name
        self._s['instance_type'] = size
        self._s['mac_address'] = data.get(
                'mac_address', 'df:df:df:df:df:df')
        self._s['basepath'] = data.get(
                'basepath', os.path.abspath(
                    os.path.join(FLAGS.instances_path, self.name)))
        self._s['memory_kb'] = int(self._s['memory_mb']) * 1024
        # TODO(joshua): Get this from network directory controller later
        self._s['bridge_name'] = data.get('bridge_name', 'br0')
        self._s['image_id'] = data.get('image_id', FLAGS.default_image)
        self._s['kernel_id'] = data.get('kernel_id', FLAGS.default_kernel)
        self._s['ramdisk_id'] = data.get('ramdisk_id', FLAGS.default_ramdisk)
        self._s['owner_id'] = data.get('owner_id', '')
        self._s['node_name'] = data.get('node_name', '')
        self._s['user_data'] = data.get('user_data', '')
        self._s['ami_launch_index'] = data.get('ami_launch_index', None)
        self._s['launch_time'] = data.get('launch_time', None)
        self._s['reservation_id'] = data.get('reservation_id', None)
        # self._s['state'] = Instance.NOSTATE
        self._s['state'] = data.get('state', Instance.NOSTATE)
        self._s['key_data'] = data.get('key_data', None)

        # TODO: we may not need to save the next few
        self._s['groups'] = data.get('security_group', ['default'])
        self._s['product_codes'] = data.get('product_code', [])
        self._s['key_name'] = data.get('key_name', None)
        self._s['addressing_type'] = data.get('addressing_type', None)
        self._s['availability_zone'] = data.get('availability_zone', 'fixme')

        # TODO: put real dns items here
        self._s['private_dns_name'] = data.get('private_dns_name', 'fixme')
        self._s['dns_name'] = data.get('dns_name',
                                       self._s['private_dns_name'])
        logging.debug("Finished init of Instance with id of %s" % name)

    def toXml(self):
        # TODO(termie): cache?
        logging.debug("Starting the toXML method")
        libvirt_xml = open(FLAGS.libvirt_xml_template).read()
        xml_info = self._s.copy()
        # xml_info.update(self._s)
        # TODO(termie): lazy lazy hack because xml is annoying
        xml_info['nova'] = json.dumps(self._s)
        libvirt_xml = libvirt_xml % xml_info
        logging.debug("Finished the toXML method")
        return libvirt_xml
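
    # NOTE (illustrative): toXml() fills the template with plain Python
    # %-substitution against a dict, so libvirt.xml.template is expected to
    # use named placeholders such as %(memory_kb)s, %(mac_address)s or
    # %(basepath)s, plus %(nova)s for the serialized instance data. The
    # specific placeholder names here are examples, not a definitive list.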

    @classmethod
    def fromName(cls, conn, pool, name):
        """ use the saved data for reloading the instance """
        # if FLAGS.fake_libvirt:
        #     raise Exception('this is a bit useless, eh?')
        instdir = model.InstanceDirectory()
        instance = instdir.get(name)
        return cls(conn=conn, pool=pool, name=name, data=instance)

    @property
    def state(self):
        return self._s['state']

    @property
    def name(self):
        return self._s['name']

    def describe(self):
        return self._s

    def info(self):
        logging.debug("Getting info for dom %s" % self.name)
        virt_dom = self._conn.lookupByName(self.name)
        (state, max_mem, mem, num_cpu, cpu_time) = virt_dom.info()
        return {'state': state,
                'max_mem': max_mem,
                'mem': mem,
                'num_cpu': num_cpu,
                'cpu_time': cpu_time}

    def update_state(self):
        info = self.info()
        self._s['state'] = info['state']
        self.datamodel['state'] = info['state']
        self.datamodel['node_name'] = FLAGS.node_name
        self.datamodel.save()

    @exception.wrap_exception
    def destroy(self):
        if self.is_destroyed():
            self.datamodel.destroy()
            raise exception.Error('trying to destroy already destroyed'
                                  ' instance: %s' % self.name)

        self._s['state'] = Instance.SHUTDOWN
        self.datamodel['state'] = 'shutting_down'
        self.datamodel.save()
        try:
            virt_dom = self._conn.lookupByName(self.name)
            virt_dom.destroy()
        except Exception, _err:
            pass
            # If the instance is already terminated, we're still happy

        d = defer.Deferred()
        d.addCallback(lambda x: self.datamodel.destroy())

        # TODO(termie): short-circuit me for tests
        # NOTE: we'll want this polling again when we do a graceful shutdown
        #       instead of destroy; destroy returns immediately, so we poll
        #       until the domain reports SHUTDOWN (or disappears) before
        #       firing the deferred.
        timer = task.LoopingCall(f=None)

        def _wait_for_shutdown():
            try:
                info = self.info()
                if info['state'] == Instance.SHUTDOWN:
                    self._s['state'] = Instance.SHUTDOWN
                    # self.datamodel['state'] = 'shutdown'
                    # self.datamodel.save()
                    timer.stop()
                    d.callback(None)
            except Exception:
                self._s['state'] = Instance.SHUTDOWN
                timer.stop()
                d.callback(None)
        timer.f = _wait_for_shutdown
        timer.start(interval=0.5, now=True)
        return d

    @defer.inlineCallbacks
    @exception.wrap_exception
    def reboot(self):
        # if not self.is_running():
        #     raise exception.Error(
        #             'trying to reboot a non-running'
        #             'instance: %s (state: %s)' % (self.name, self.state))
        yield self._conn.lookupByName(self.name).destroy()

        self.datamodel['state'] = 'rebooting'
        self.datamodel.save()
        self._s['state'] = Instance.NOSTATE

        self._conn.createXML(self.toXml(), 0)
        # TODO(termie): this should actually register a callback to check
        #               for successful boot
        self.datamodel['state'] = 'running'
        self.datamodel.save()
        self._s['state'] = Instance.RUNNING
        logging.debug('rebooted instance %s' % self.name)
        defer.returnValue(None)

    @exception.wrap_exception
    def spawn(self):
        self.datamodel['state'] = "spawning"
        self.datamodel.save()
        logging.debug("Starting spawn in Instance")

        xml = self.toXml()

        def _launch(retvals):
            self.datamodel['state'] = 'launching'
            self.datamodel.save()
            try:
                logging.debug("Arrived in _launch")
                if retvals and 'exception' in retvals:
                    raise retvals['exception']
                self._conn.createXML(self.toXml(), 0)
                # TODO(termie): this should actually register
                #               a callback to check for successful boot
                self._s['state'] = Instance.RUNNING
                self.datamodel['state'] = 'running'
                self.datamodel.save()
                logging.debug("Instance is running")
            except Exception as ex:
                logging.debug(ex)
                self.datamodel['state'] = 'shutdown'
                self.datamodel.save()
            # return self

        d = self._pool.apply(_create_image, self._s, xml)
        d.addCallback(_launch)
        return d

    @exception.wrap_exception
    def console_output(self):
        if not FLAGS.fake_libvirt:
            fname = os.path.abspath(
                    os.path.join(self._s['basepath'], 'console.log'))
            with open(fname, 'r') as f:
                console = f.read()
        else:
            console = 'FAKE CONSOLE OUTPUT'
        return defer.succeed(console)
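
    # NOTE: the 00:16:3e prefix used below is the Xen/XenSource OUI, which is
    #       conventionally used for locally generated virtual machine MACs.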

    def generate_mac(self):
        mac = [0x00, 0x16, 0x3e, random.randint(0x00, 0x7f),
               random.randint(0x00, 0xff), random.randint(0x00, 0xff)]
        return ':'.join(map(lambda x: "%02x" % x, mac))


class NetworkNode(Node):
    def __init__(self, **kwargs):
        super(NetworkNode, self).__init__(**kwargs)
        self.virtNets = {}

    def add_network(self, net_dict):
        net = network.VirtNetwork(**net_dict)
        self.virtNets[net.name] = net
        self.virtNets[net.name].express()
        return defer.succeed({'retval': 'network added'})
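
    # The instance record's 'network_str' field is assumed to hold a JSON
    # object of keyword arguments for network.VirtNetwork (an empty dict when
    # absent), so the network can be expressed before the guest is launched.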
    @exception.wrap_exception
    def run_instance(self, instance_id, **kwargs):
        inst = self.instdir.get(instance_id)
        net_dict = json.loads(inst.get('network_str', "{}"))
        self.add_network(net_dict)
        return super(NetworkNode, self).run_instance(instance_id, **kwargs)