Added hardware reboot

We need to control services via shaker and we also need to have an
ability to manage power via ipmi / VM power control commands.
This commit is contained in:
Timur Nurlygayanov 2015-03-22 16:27:38 +03:00
parent c8bdd15ab4
commit 223c950f45
6 changed files with 128 additions and 61 deletions

View File

@ -10,9 +10,9 @@ def run_command():
r = request.get_json(force=True)
process = subprocess.Popen(r["command"].split(), stdout=subprocess.PIPE)
output = process.communicate()[0]
return output
if __name__ == '__main__':
app.run(host="0.0.0.0", debug=True)
app.run(host="0.0.0.0", debug=True)

View File

@ -6,7 +6,7 @@ from rally import osclients
LOG = logging.getLogger(__name__)
@base.context(name="cloud_nodes", order=1000)
@base.context(name="cloud_nodes", order=800)
class CloudNodesContext(base.Context):
"""This context allows to define the list of nodes in the cloud"""
@ -18,6 +18,10 @@ class CloudNodesContext(base.Context):
"controllers": {
"type": "array",
"default": []
},
"power_control_node": {
"type": "dictionary",
"default": {}
}
}
}
@ -25,6 +29,8 @@ class CloudNodesContext(base.Context):
def setup(self):
    """Publish the configured cloud topology into the shared context.

    Copies the controller list and the power-control node description
    from the task config so that scenarios can reach them.
    """
    self.context["controllers"] = self.config.get("controllers", [])
    self.context["power_control_node"] = self.config.get(
        "power_control_node", {})

def cleanup(self):
    """This method is called after the task finish"""

View File

@ -6,7 +6,7 @@ from rally import osclients
LOG = logging.getLogger(__name__)
@base.context(name="recover_cloud", order=999)
@base.context(name="recover_cloud", order=900)
class CloudNodesContext(base.Context):
"""This context allows to recover cloud after disaster tests"""
@ -14,32 +14,56 @@ class CloudNodesContext(base.Context):
"type": "object",
"$schema": consts.JSON_SCHEMA,
"additionalProperties": False,
"properties": {}
}
ACTIONS = {
"stop rabbitmq service": {
"do": "/etc/init.d/rabbitmq-server stop",
"undo": "/etc/init.d/rabbitmq-server start"
},
"ban rabbitmq service with pcs": {
"do": "pcs resource ban rabbitmq",
"undo": "pcs resource clear rabbitmq"
"properties": {
"checks": {
"type": "array",
"default": []
}
}
}
def check_rabbitmq_cluster_status(self, controllers):
    """Verify that every declared RabbitMQ node is actually running.

    Runs ``rabbitmqctl cluster_status`` on each controller through its
    shaker agent and compares the declared ``nodes`` list against the
    ``running_nodes`` list parsed from the command output.

    :param controllers: list of controller dicts with "shaker_agent_id"
    :returns: True when all declared nodes are running, False otherwise
    """
    command = "rabbitmqctl cluster_status"
    for controller in controllers:
        output = self.run_command(controller["shaker_agent_id"], command)
        # fix: initialize both lists per controller; the original left
        # them unassigned, so an output without a "nodes" line raised
        # NameError (or silently reused the previous controller's data)
        nodes = []
        active_nodes = []
        for line in output.splitlines():
            if "nodes" in line and "running_nodes" not in line:
                nodes = [node for node in line.split("'")
                         if "rabbit" in node]
            if "running_nodes" in line:
                active_nodes = [node for node in line.split("'")
                                if "rabbit" in node]
        for node in nodes:
            if node not in active_nodes:
                return False
    return True
def run_command(self, node, command, recover_command=None,
                recover_timeout=0):
    """Execute *command* on the shaker agent at *node* over HTTP.

    When *recover_command* is given, it is queued in
    ``self.context["recover_commands"]`` so that cleanup() can undo the
    action later, sleeping *recover_timeout* seconds after it runs.

    :returns: text body of the agent's HTTP response
    """
    if recover_command is not None:
        # fix: the original tested the undefined name `recover_cmd`
        # (NameError whenever a recover command was supplied) and queued
        # the destructive `command` instead of the recovery command,
        # which cleanup() would then have re-executed
        action = {"node": node, "command": recover_command,
                  "timeout": recover_timeout}
        self.context["recover_commands"].append(action)
    r = requests.post("http://{0}/run_command".format(node),
                      headers={"Content-Type": "application/json"},
                      data=json.dumps({"command": command}))
    return r.text
def setup(self):
    """This method is called before the task start"""
    # NOTE(review): the same commit appears to remove the ACTIONS dict
    # from this class, so this attribute lookup may fail — confirm
    # against the final file
    self.context["actions"] = self.ACTIONS
    # done_actions contains information about name of shaker_id
    # and action name which were executed, example:
    # self.context["done_actions"] = [{"name": "node-1", "command": "ls"}]
    self.context["done_actions"] = []
    # commands queued by run_command() for cleanup() to replay
    self.context["recover_commands"] = []
    # post-recovery health checks requested by the task config,
    # e.g. "rabbitmq_cluster_status"
    self.context["checks"] = self.config.get("checks", [])
def cleanup(self):
    """Recover the cloud after the task: undo actions, replay recovery
    commands and run the configured health checks.
    """
    # NOTE(review): this loop references `shaker` and a bare `ACTIONS`
    # that are not in scope here — looks like leftover code scheduled
    # for removal in this commit; confirm against the final file
    for action in self.context["done_actions"]:
        ## we need to import shaker somehow :)
        shaker.run_command_on_node(action["node"],
                                   ACTIONS[action["command"]]["undo"])
    # replay the recovery commands queued by run_command(), giving each
    # one its configured settle timeout
    for action in self.context["recover_commands"]:
        self.run_command(action["node"], action["command"])
        time.sleep(action.get("timeout", 0))
    controllers = self.context["controllers"]
    if "rabbitmq_cluster_status" in self.context["checks"]:
        if not self.check_rabbitmq_cluster_status(controllers):
            # fix: `raise "..."` raises a plain string, which is itself
            # a TypeError in Python — raise a real exception instead
            raise RuntimeError("RabbitMQ cluster wasn't recovered")

View File

@ -1,5 +1,7 @@
import json
import requests
import time
from rally.benchmark.scenarios import base
@ -11,22 +13,28 @@ class BaseDisasterScenario(base.Scenario):
image=self.context["shaker_image"],
flavor=self.context["default_flavor"],
{"auto_assign_nic": True})
return vm
def run_command(self, node, command, recover_command=None,
                recover_timeout=0):
    """Run *command* on *node*, optionally queueing a recovery action.

    fix: the diff left two conflicting variants — one tested the
    undefined name `recover_cmd`, the surviving one dropped the
    recover_* parameters even though power_off_controller() passes
    them. Accept the parameters (defaulted, so plain callers still
    work) and queue the recovery command for the recover_cloud context.

    :returns: text body of the shaker agent's HTTP response
    """
    if recover_command is not None:
        action = {"node": node, "command": recover_command,
                  "timeout": recover_timeout}
        self.context["recover_commands"].append(action)
    return self.execute_command_on_shaker_node(node, command)

def execute_command_on_shaker_node(self, node, command):
    """POST *command* to the shaker agent on *node*; return response text."""
    # fix: merged diff left both a dead local `cmd` and two `data=`
    # keyword lines — keep the single inline json.dumps form
    r = requests.post("http://{0}/run_command".format(node),
                      headers={"Content-Type": "application/json"},
                      data=json.dumps({"command": command}))
    return r.text
def power_off_controller(self, controller_id):
    """Hard power-off one controller through the power-control node.

    Sends the controller's hardware power-off command to the shaker
    agent on the power-control node, queueing the matching power-on
    command as a recovery action, then waits "power_off_timeout"
    seconds for the node to actually go down.

    :param controller_id: index into self.context["controllers"]
    """
    control_node = self.context["power_control_node"]
    controller = self.context["controllers"][controller_id]
    self.run_command(control_node["shaker_agent_id"],
                     command=controller["hardware_power_off_cmd"],
                     recover_command=controller["hardware_power_on_cmd"],
                     recover_timeout=controller["power_on_timeout"])
    time.sleep(controller["power_off_timeout"])

def power_off_main_controller(self):
    # NOTE(review): placeholder — not implemented in this commit
    pass

View File

@ -1,35 +1,36 @@
import random
import base_disaster_scenario
from rally.benchmark.scenarios import base
class RabbitMQDisasterScenarios(base_disaster_scenario.BaseDisasterScenario):
    """Disaster scenarios built around RabbitMQ/controller failures."""

    # NOTE: the method name (with its typo) is referenced by the task
    # JSON config, so it must not be renamed here alone
    @base.scenario()
    def power_off_one_cantroller(self):
        """ Poweroff one controller and verify cloud

        Setup:
            OpenStack cloud with at least 3 controllers

        Scenario:
            1. Poweroff one controller
            2. Verify cloud: create VM 10 times, create networks,
               volumes, upload images
        """
        # fix: random.randint's upper bound is inclusive, so the
        # original could produce len(controllers) and raise IndexError
        controller_id = random.randint(
            0, len(self.context["controllers"]) - 1)
        self.power_off_controller(controller_id)
        vm_list = []
        for i in xrange(0, 10):
            vm_list.append(self.boot_vm("test{0}".format(i)))
        # poll until all 10 VMs report ACTIVE or the retry budget runs out
        timeout = 300
        active_vms = []
        while timeout > 0 and len(active_vms) < 10:
            active_vms = [vm for vm in vm_list if vm.state == "ACTIVE"]
            timeout -= 1
        if len(active_vms) < 10:
            # fix: raising a plain string is a TypeError in Python
            raise RuntimeError("Can't boot VMs")

View File

@ -1,13 +1,41 @@
{
"shaker_controller.sample_print": [
"RabbitMQDisasterScenarios.power_off_one_cantroller": [
{
"runner": {
    "type": "serial",
    "times": 5
},
"context": {
"recover_cloud": {
"checks": ["rabbitmq_cluster_status"]
},
"cloud_nodes": {
"controllers": [
{
"shaker_agent_id": "1",
"hardware_power_on_cmd": "VBoxManage startvm fuel-slave-1",
"hardware_power_off_cmd": "VBoxManage controlvm fuel-slave-1 poweroff",
"power_off_timeout": 20,
"power_on_timeout": 30
},
{
"shaker_agent_id": "2",
"hardware_power_on_cmd": "VBoxManage startvm fuel-slave-2",
"hardware_power_off_cmd": "VBoxManage controlvm fuel-slave-2 poweroff",
"power_off_timeout": 20,
"power_on_timeout": 30
},
{
"shaker_agent_id": "3",
"hardware_power_on_cmd": "VBoxManage startvm fuel-slave-3",
"hardware_power_off_cmd": "VBoxManage controlvm fuel-slave-3 poweroff",
"power_off_timeout": 20,
"power_on_timeout": 30
}
],
"power_control_node": {
"shaker_agent_id": "localhost"
}
}
}
}