From 7a5f3375bab444add7a2a512d3fab32435fb0a12 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micha=C5=82=20Dulko?=
Date: Wed, 8 Aug 2018 12:02:51 +0200
Subject: [PATCH] A/P HA tests

This patch adds scenario tests for kuryr-controller A/P HA.

Implements: blueprint high-availability
Change-Id: I1cd88056a6f7b719b8b58128ec8fffbce3e816f3
---
 kuryr_tempest_plugin/config.py                |   5 +
 kuryr_tempest_plugin/tests/scenario/base.py   |  12 +-
 kuryr_tempest_plugin/tests/scenario/consts.py |   1 +
 .../tests/scenario/test_ha.py                 | 201 ++++++++++++++++++
 4 files changed, 217 insertions(+), 2 deletions(-)
 create mode 100644 kuryr_tempest_plugin/tests/scenario/test_ha.py

diff --git a/kuryr_tempest_plugin/config.py b/kuryr_tempest_plugin/config.py
index eadd2a57..9d03a7d3 100644
--- a/kuryr_tempest_plugin/config.py
+++ b/kuryr_tempest_plugin/config.py
@@ -52,4 +52,9 @@ kuryr_k8s_opts = [
     cfg.StrOpt("ocp_router_fip", default=None, help="OCP Router floating IP"),
     cfg.BoolOpt("kuryr_daemon_enabled", default=True, help="Whether or not "
                 "CNI should run as a daemon"),
+    cfg.BoolOpt("ap_ha", default=False,
+                help='Whether or not A/P HA of kuryr-controller is enabled'),
+    cfg.StrOpt("controller_deployment_name", default="kuryr-controller",
+               help="Name of Kubernetes Deployment running kuryr-controller "
+                    "Pods")
 ]
diff --git a/kuryr_tempest_plugin/tests/scenario/base.py b/kuryr_tempest_plugin/tests/scenario/base.py
index c37d5c52..8b327b64 100644
--- a/kuryr_tempest_plugin/tests/scenario/base.py
+++ b/kuryr_tempest_plugin/tests/scenario/base.py
@@ -95,7 +95,8 @@ class BaseKuryrScenarioTest(manager.NetworkScenarioTest):
 
     @classmethod
     def create_pod(cls, name=None, labels=None, image='kuryr/demo',
-                   namespace="default", annotations=None):
+                   namespace="default", annotations=None,
+                   wait_for_status=True):
         if not name:
             name = data_utils.rand_name(prefix='kuryr-pod')
         pod = cls.k8s_client.V1Pod()
@@ -111,7 +112,7 @@ class BaseKuryrScenarioTest(manager.NetworkScenarioTest):
         cls.k8s_client.CoreV1Api().create_namespaced_pod(namespace=namespace,
                                                          body=pod)
         status = ""
-        while status != "Running":
+        while status != "Running" and wait_for_status:
             # TODO(dmellado) add timeout config to tempest plugin
             time.sleep(1)
             status = cls.get_pod_status(name, namespace)
@@ -661,3 +662,10 @@ class BaseKuryrScenarioTest(manager.NetworkScenarioTest):
                          'Got {}'.format(unique_resps))
 
         self._run_threaded_and_assert(req, pred, fn_timeout=10)
+
+    def create_and_ping_pod(self):
+        name, pod = self.create_pod()
+        self.addCleanup(self.delete_pod, name)
+        ip = self.get_pod_ip(name)
+        self.assertIsNotNone(ip)
+        self.assertTrue(self.ping_ip_address(ip))
diff --git a/kuryr_tempest_plugin/tests/scenario/consts.py b/kuryr_tempest_plugin/tests/scenario/consts.py
index 31b620bb..c47afc41 100644
--- a/kuryr_tempest_plugin/tests/scenario/consts.py
+++ b/kuryr_tempest_plugin/tests/scenario/consts.py
@@ -12,3 +12,4 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 POD_OUTPUT = 'HELLO! I AM ALIVE!!!'
+HA_ENDPOINT_NAME = 'kuryr-controller'
diff --git a/kuryr_tempest_plugin/tests/scenario/test_ha.py b/kuryr_tempest_plugin/tests/scenario/test_ha.py
new file mode 100644
index 00000000..84fd04e7
--- /dev/null
+++ b/kuryr_tempest_plugin/tests/scenario/test_ha.py
@@ -0,0 +1,201 @@
+# Copyright 2018 Red Hat, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import datetime
+import json
+import threading
+import time
+import uuid
+
+import kubernetes
+from oslo_log import log as logging
+from tempest import config
+from tempest.lib.common.utils import test_utils
+from tempest.lib import decorators
+
+from kuryr_tempest_plugin.tests.scenario import base
+from kuryr_tempest_plugin.tests.scenario import consts
+
+LOG = logging.getLogger(__name__)
+CONF = config.CONF
+TIMEOUT = 120
+
+
+class TestHighAvailabilityScenario(base.BaseKuryrScenarioTest):
+
+    @classmethod
+    def skip_checks(cls):
+        super(TestHighAvailabilityScenario, cls).skip_checks()
+        if not (CONF.kuryr_kubernetes.ap_ha and
+                CONF.kuryr_kubernetes.containerized):
+            raise cls.skipException("kuryr-controller A/P HA must be enabled "
+                                    "and kuryr-kubernetes must run in "
+                                    "containerized mode.")
+
+    def get_kuryr_leader_annotation(self):
+        try:
+            endpoint = self.k8s_client.CoreV1Api().read_namespaced_endpoints(
+                consts.HA_ENDPOINT_NAME,
+                CONF.kuryr_kubernetes.kube_system_namespace)
+            annotation = endpoint.metadata.annotations[
+                'control-plane.alpha.kubernetes.io/leader']
+            return json.loads(annotation)
+        except kubernetes.client.rest.ApiException:
+            return None
+
+    def wait_for_deployment_scale(self, desired_replicas,
+                                  desired_state='Running'):
+        def has_scaled():
+            pods = self.k8s_client.CoreV1Api().list_namespaced_pod(
+                CONF.kuryr_kubernetes.kube_system_namespace,
+                label_selector='name=kuryr-controller')
+
+            return (len(pods.items) == desired_replicas and
+                    all([pod.status.phase == desired_state
+                         for pod in pods.items]))
+
+        self.assertTrue(test_utils.call_until_true(has_scaled, TIMEOUT, 5),
+                        'Timed out waiting for deployment to scale')
+
+    def scale_controller_deployment(self, replicas):
+        self.k8s_client.AppsV1Api().patch_namespaced_deployment(
+            'kuryr-controller', CONF.kuryr_kubernetes.kube_system_namespace,
+            {'spec': {'replicas': replicas}})
+        self.wait_for_deployment_scale(replicas)
+
+    @decorators.idempotent_id('3f09e7d1-0897-46b1-ba9d-ea4116523025')
+    def test_scale_up_controller(self):
+        controller_deployment = (
+            self.k8s_client.AppsV1Api().read_namespaced_deployment(
+                CONF.kuryr_kubernetes.controller_deployment_name,
+                CONF.kuryr_kubernetes.kube_system_namespace))
+
+        # On cleanup, scale back to the original number of replicas
+        self.addCleanup(self.scale_controller_deployment,
+                        controller_deployment.spec.replicas)
+
+        # Scale to just a single replica
+        self.scale_controller_deployment(1)
+
+        # Create a pod and check connectivity
+        self.create_and_ping_pod()
+
+        # Get the current leader annotation
+        annotation = self.get_kuryr_leader_annotation()
+        self.assertIsNotNone(annotation)
+        transitions = annotation['leaderTransitions']
+
+        # Scale the controller up and wait until it starts
+        self.scale_controller_deployment(2)
+
+        # Check that the leader hasn't switched
+        annotation = self.get_kuryr_leader_annotation()
+        self.assertEqual(transitions, annotation['leaderTransitions'])
+
+        # Create another pod and check connectivity
+        self.create_and_ping_pod()
+
+    @decorators.idempotent_id('afe75fa5-e9ca-4f7d-bc16-8f1dd7884eea')
+    def test_scale_down_controller(self):
+        controller_deployment = (
+            self.k8s_client.AppsV1Api().read_namespaced_deployment(
+                CONF.kuryr_kubernetes.controller_deployment_name,
+                CONF.kuryr_kubernetes.kube_system_namespace))
+
+        # On cleanup, scale back to the original number of replicas
+        self.addCleanup(self.scale_controller_deployment,
+                        controller_deployment.spec.replicas)
+
+        # Scale to 2 replicas
+        self.scale_controller_deployment(2)
+
+        # Create a pod and check connectivity
+        self.create_and_ping_pod()
+
+        # Scale the controller down and wait until it stops
+        self.scale_controller_deployment(1)
+
+        # Create another pod and check connectivity
+        self.create_and_ping_pod()
+
+    @decorators.idempotent_id('3b218c11-c77b-40a8-ba09-5dd5ae0f8ae3')
+    def test_auto_fencing(self):
+        controller_deployment = (
+            self.k8s_client.AppsV1Api().read_namespaced_deployment(
+                CONF.kuryr_kubernetes.controller_deployment_name,
+                CONF.kuryr_kubernetes.kube_system_namespace))
+
+        # On cleanup, scale back to the original number of replicas
+        self.addCleanup(self.scale_controller_deployment,
+                        controller_deployment.spec.replicas)
+
+        # Scale to 2 replicas
+        self.scale_controller_deployment(2)
+
+        # Create a pod and check connectivity
+        self.create_and_ping_pod()
+
+        def hostile_takeover():
+            """Malform the endpoint annotation to take over the leadership
+
+            This method runs for 3 minutes and for that time malforms the
+            endpoint annotation to simulate another kuryr-controller taking
+            over the leadership. This should make the other kuryr-controllers
+            step down and stop processing any events for those 3 minutes.
+            """
+            timeout = datetime.datetime.utcnow() + datetime.timedelta(
+                minutes=3)
+            fake_name = str(uuid.uuid4())
+            while datetime.datetime.utcnow() < timeout:
+                current = datetime.datetime.utcnow()
+                renew = current + datetime.timedelta(seconds=5)
+                malformed = {
+                    "holderIdentity": fake_name,
+                    "leaseDurationSeconds": 5,
+                    "acquireTime": current.strftime("%Y-%m-%dT%H:%M:%SZ"),
+                    "renewTime": renew.strftime("%Y-%m-%dT%H:%M:%SZ"),
+                    "leaderTransitions": 0,
+                }
+                self.k8s_client.CoreV1Api().patch_namespaced_endpoints(
+                    consts.HA_ENDPOINT_NAME,
+                    CONF.kuryr_kubernetes.kube_system_namespace,
+                    {'metadata': {'annotations': {
+                        'control-plane.alpha.kubernetes.io/leader':
+                            json.dumps(malformed)}}})
+                time.sleep(2)
+
+        t = threading.Thread(target=hostile_takeover)
+        t.start()
+
+        # Create another pod and check that it's not getting wired.
+        time.sleep(15)  # Wait a bit for the controllers to auto-fence.
+        name, pod = self.create_pod(wait_for_status=False)
+
+        def is_pod_running():
+            pod_obj = self.k8s_client.CoreV1Api().read_namespaced_pod(
+                name, 'default')
+
+            return pod_obj.status.phase == 'Running'
+
+        self.addCleanup(self.delete_pod, name)
+        self.assertFalse(test_utils.call_until_true(is_pod_running, TIMEOUT,
+                                                    5))
+
+        # Wait 120 seconds more; the malformed annotation should expire
+        time.sleep(TIMEOUT)
+
+        # Now the pod should have an IP and be pingable
+        ip = self.get_pod_ip(name)
+        self.assertIsNotNone(ip)
+        self.assertTrue(self.ping_ip_address(ip, ping_timeout=TIMEOUT))
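
These tests are skipped unless both A/P HA and containerized mode are
enabled, per skip_checks in test_ha.py. A minimal tempest.conf sketch for
running them; the [kuryr_kubernetes] group name is inferred from the
CONF.kuryr_kubernetes references in the tests, and kube_system_namespace is
a pre-existing option shown here with an assumed value:

    [kuryr_kubernetes]
    # Both flags are required by skip_checks in test_ha.py.
    containerized = True
    ap_ha = True
    # Defaults from config.py; kube-system is an assumed namespace value,
    # adjust to wherever the kuryr-controller Deployment actually runs.
    controller_deployment_name = kuryr-controller
    kube_system_namespace = kube-system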