Added hardware reboot

We need to control services via shaker and we also need to have an
ability to manage power via ipmi / VM power control commands.
This commit is contained in:
Timur Nurlygayanov 2015-03-22 16:27:38 +03:00
parent c8bdd15ab4
commit 223c950f45
6 changed files with 128 additions and 61 deletions

View File

@ -10,9 +10,9 @@ def run_command():
r = request.get_json(force=True)
process = subprocess.Popen(r["command"].split(), stdout=subprocess.PIPE)
output = process.communicate()[0]
return output
if __name__ == '__main__':
app.run(host="0.0.0.0", debug=True)
app.run(host="0.0.0.0", debug=True)

View File

@ -6,7 +6,7 @@ from rally import osclients
LOG = logging.getLogger(__name__)
@base.context(name="cloud_nodes", order=1000)
@base.context(name="cloud_nodes", order=800)
class CloudNodesContext(base.Context):
"""This context allows to define the list of nodes in the cloud"""
@ -18,6 +18,10 @@ class CloudNodesContext(base.Context):
"controllers": {
"type": "array",
"default": []
},
"power_control_node": {
"type": "dictionary",
"default": {}
}
}
}
@ -25,6 +29,8 @@ class CloudNodesContext(base.Context):
def setup(self):
    """Publish the configured cloud topology into the shared context.

    Copies the controller list and the power-control node description
    from the task config so that scenarios can reach them.
    """
    self.context["controllers"] = self.config.get("controllers", [])
    self.context["power_control_node"] = self.config.get(
        "power_control_node", {})

def cleanup(self):
    """This method is called after the task finish"""

View File

@ -6,7 +6,7 @@ from rally import osclients
LOG = logging.getLogger(__name__)
@base.context(name="recover_cloud", order=999)
@base.context(name="recover_cloud", order=900)
class CloudNodesContext(base.Context):
"""This context allows to recover cloud after disaster tests"""
@ -14,32 +14,56 @@ class CloudNodesContext(base.Context):
"type": "object",
"$schema": consts.JSON_SCHEMA,
"additionalProperties": False,
"properties": {}
}
ACTIONS = {
"stop rabbitmq service": {
"do": "/etc/init.d/rabbitmq-server stop",
"undo": "/etc/init.d/rabbitmq-server start"
},
"ban rabbitmq service with pcs": {
"do": "pcs resource ban rabbitmq",
"undo": "pcs resource clear rabbitmq"
"properties": {
"checks": {
"type": "array",
"default": []
}
}
}
def check_rabbitmq_cluster_status(self, controllers):
    """Verify that every declared RabbitMQ node is actually running.

    Runs ``rabbitmqctl cluster_status`` on each controller through its
    shaker agent and compares the declared ``nodes`` list against the
    ``running_nodes`` list parsed from the command output.

    :param controllers: list of controller dicts with "shaker_agent_id"
    :returns: True when all declared nodes are running, False otherwise
    """
    command = "rabbitmqctl cluster_status"
    for controller in controllers:
        output = self.run_command(controller["shaker_agent_id"], command)
        # fix: initialize both lists per controller; the original left
        # them unassigned, so an output without a "nodes" line raised
        # NameError (or silently reused the previous controller's data)
        nodes = []
        active_nodes = []
        for line in output.splitlines():
            if "nodes" in line and "running_nodes" not in line:
                nodes = [node for node in line.split("'")
                         if "rabbit" in node]
            if "running_nodes" in line:
                active_nodes = [node for node in line.split("'")
                                if "rabbit" in node]
        for node in nodes:
            if node not in active_nodes:
                return False
    return True
def run_command(self, node, command, recover_command=None,
                recover_timeout=0):
    """Execute *command* on the shaker agent at *node* over HTTP.

    When *recover_command* is given, it is queued in
    ``self.context["recover_commands"]`` so that cleanup() can undo the
    action later, sleeping *recover_timeout* seconds after it runs.

    :returns: text body of the agent's HTTP response
    """
    if recover_command is not None:
        # fix: the original tested the undefined name `recover_cmd`
        # (NameError whenever a recover command was supplied) and queued
        # the destructive `command` instead of the recovery command,
        # which cleanup() would then have re-executed
        action = {"node": node, "command": recover_command,
                  "timeout": recover_timeout}
        self.context["recover_commands"].append(action)
    r = requests.post("http://{0}/run_command".format(node),
                      headers={"Content-Type": "application/json"},
                      data=json.dumps({"command": command}))
    return r.text
def setup(self):
    """This method is called before the task start"""
    # NOTE(review): the same commit appears to remove the ACTIONS dict
    # from this class, so this attribute lookup may fail — confirm
    # against the final file
    self.context["actions"] = self.ACTIONS
    # done_actions contains information about name of shaker_id
    # and action name which were executed, example:
    # self.context["done_actions"] = [{"name": "node-1", "command": "ls"}]
    self.context["done_actions"] = []
    # commands queued by run_command() for cleanup() to replay
    self.context["recover_commands"] = []
    # post-recovery health checks requested by the task config,
    # e.g. "rabbitmq_cluster_status"
    self.context["checks"] = self.config.get("checks", [])
def cleanup(self):
    """Recover the cloud after the task: undo actions, replay recovery
    commands and run the configured health checks.
    """
    # NOTE(review): this loop references `shaker` and a bare `ACTIONS`
    # that are not in scope here — looks like leftover code scheduled
    # for removal in this commit; confirm against the final file
    for action in self.context["done_actions"]:
        ## we need to import shaker somehow :)
        shaker.run_command_on_node(action["node"],
                                   ACTIONS[action["command"]]["undo"])
    # replay the recovery commands queued by run_command(), giving each
    # one its configured settle timeout
    for action in self.context["recover_commands"]:
        self.run_command(action["node"], action["command"])
        time.sleep(action.get("timeout", 0))
    controllers = self.context["controllers"]
    if "rabbitmq_cluster_status" in self.context["checks"]:
        if not self.check_rabbitmq_cluster_status(controllers):
            # fix: `raise "..."` raises a plain string, which is itself
            # a TypeError in Python — raise a real exception instead
            raise RuntimeError("RabbitMQ cluster wasn't recovered")

View File

@ -1,5 +1,7 @@
import json
import requests
import time
from rally.benchmark.scenarios import base
@ -11,22 +13,28 @@ class BaseDisasterScenario(base.Scenario):
image=self.context["shaker_image"],
flavor=self.context["default_flavor"],
{"auto_assign_nic": True})
return vm
def run_command(self, node, command, recover_command=None,
                recover_timeout=0):
    """Run *command* on *node*, optionally queueing a recovery action.

    fix: the diff left two conflicting variants — one tested the
    undefined name `recover_cmd`, the surviving one dropped the
    recover_* parameters even though power_off_controller() passes
    them. Accept the parameters (defaulted, so plain callers still
    work) and queue the recovery command for the recover_cloud context.

    :returns: text body of the shaker agent's HTTP response
    """
    if recover_command is not None:
        action = {"node": node, "command": recover_command,
                  "timeout": recover_timeout}
        self.context["recover_commands"].append(action)
    return self.execute_command_on_shaker_node(node, command)

def execute_command_on_shaker_node(self, node, command):
    """POST *command* to the shaker agent on *node*; return response text."""
    # fix: merged diff left both a dead local `cmd` and two `data=`
    # keyword lines — keep the single inline json.dumps form
    r = requests.post("http://{0}/run_command".format(node),
                      headers={"Content-Type": "application/json"},
                      data=json.dumps({"command": command}))
    return r.text
def power_off_controller(self, controller_id):
    """Hard power-off one controller through the power-control node.

    Sends the controller's hardware power-off command to the shaker
    agent on the power-control node, queueing the matching power-on
    command as a recovery action, then waits "power_off_timeout"
    seconds for the node to actually go down.

    :param controller_id: index into self.context["controllers"]
    """
    control_node = self.context["power_control_node"]
    controller = self.context["controllers"][controller_id]
    self.run_command(control_node["shaker_agent_id"],
                     command=controller["hardware_power_off_cmd"],
                     recover_command=controller["hardware_power_on_cmd"],
                     recover_timeout=controller["power_on_timeout"])
    time.sleep(controller["power_off_timeout"])

def power_off_main_controller(self):
    # NOTE(review): placeholder — not implemented in this commit
    pass

View File

@ -1,35 +1,36 @@
import random
import base_disaster_scenario
from rally.benchmark.scenarios import base
class RabbitMQDisasterScenarios(base_disaster_scenario.BaseDisasterScenario):
    """Disaster scenarios built around RabbitMQ/controller failures."""

    # NOTE: the method name (with its typo) is referenced by the task
    # JSON config, so it must not be renamed here alone
    @base.scenario()
    def power_off_one_cantroller(self):
        """ Poweroff one controller and verify cloud

        Setup:
            OpenStack cloud with at least 3 controllers

        Scenario:
            1. Poweroff one controller
            2. Verify cloud: create VM 10 times, create networks,
               volumes, upload images
        """
        # fix: random.randint's upper bound is inclusive, so the
        # original could produce len(controllers) and raise IndexError
        controller_id = random.randint(
            0, len(self.context["controllers"]) - 1)
        self.power_off_controller(controller_id)
        vm_list = []
        for i in xrange(0, 10):
            vm_list.append(self.boot_vm("test{0}".format(i)))
        # poll until all 10 VMs report ACTIVE or the retry budget runs out
        timeout = 300
        active_vms = []
        while timeout > 0 and len(active_vms) < 10:
            active_vms = [vm for vm in vm_list if vm.state == "ACTIVE"]
            timeout -= 1
        if len(active_vms) < 10:
            # fix: raising a plain string is a TypeError in Python
            raise RuntimeError("Can't boot VMs")

View File

@ -1,13 +1,41 @@
{
"shaker_controller.sample_print": [
"RabbitMQDisasterScenarios.power_off_one_cantroller": [
{
"runner": {
    "type": "serial",
    "times": 5
},
"context": {
"recover_cloud": {
"checks": ["rabbitmq_cluster_status"]
},
"cloud_nodes": {
"controllers": [
{
"shaker_agent_id": "1",
"hardware_power_on_cmd": "VBoxManage startvm fuel-slave-1",
"hardware_power_off_cmd": "VBoxManage controlvm fuel-slave-1 poweroff",
"power_off_timeout": 20,
"power_on_timeout": 30
},
{
"shaker_agent_id": "2",
"hardware_power_on_cmd": "VBoxManage startvm fuel-slave-2",
"hardware_power_off_cmd": "VBoxManage controlvm fuel-slave-2 poweroff",
"power_off_timeout": 20,
"power_on_timeout": 30
},
{
"shaker_agent_id": "3",
"hardware_power_on_cmd": "VBoxManage startvm fuel-slave-3",
"hardware_power_off_cmd": "VBoxManage controlvm fuel-slave-3 poweroff",
"power_off_timeout": 20,
"power_on_timeout": 30
}
],
"power_control_node": {
"shaker_agent_id": "localhost"
}
}
}
}