A/P HA tests

This patch adds scenario tests for kuryr-controller A/P HA.

Implements: blueprint high-availablity
Change-Id: I1cd88056a6f7b719b8b58128ec8fffbce3e816f3
This commit is contained in:
Michał Dulko 2018-08-08 12:02:51 +02:00
parent 3432ed77c5
commit 7a5f3375ba
4 changed files with 217 additions and 2 deletions

View File

@ -52,4 +52,9 @@ kuryr_k8s_opts = [
cfg.StrOpt("ocp_router_fip", default=None, help="OCP Router floating IP"),
cfg.BoolOpt("kuryr_daemon_enabled", default=True, help="Whether or not "
"CNI should run as a daemon"),
cfg.BoolOpt("ap_ha", default=False,
help='Whether or not A/P HA of kuryr-controller is enabled'),
cfg.StrOpt("controller_deployment_name", default="kuryr-controller",
help="Name of Kubernetes Deployment running kuryr-controller "
"Pods")
]

View File

@ -95,7 +95,8 @@ class BaseKuryrScenarioTest(manager.NetworkScenarioTest):
@classmethod
def create_pod(cls, name=None, labels=None, image='kuryr/demo',
namespace="default", annotations=None):
namespace="default", annotations=None,
wait_for_status=True):
if not name:
name = data_utils.rand_name(prefix='kuryr-pod')
pod = cls.k8s_client.V1Pod()
@ -111,7 +112,7 @@ class BaseKuryrScenarioTest(manager.NetworkScenarioTest):
cls.k8s_client.CoreV1Api().create_namespaced_pod(namespace=namespace,
body=pod)
status = ""
while status != "Running":
while status != "Running" and wait_for_status:
# TODO(dmellado) add timeout config to tempest plugin
time.sleep(1)
status = cls.get_pod_status(name, namespace)
@ -661,3 +662,10 @@ class BaseKuryrScenarioTest(manager.NetworkScenarioTest):
'Got {}'.format(unique_resps))
self._run_threaded_and_assert(req, pred, fn_timeout=10)
def create_and_ping_pod(self):
    """Create a pod, verify it got an IP address and ping it."""
    pod_name, _pod = self.create_pod()
    self.addCleanup(self.delete_pod, pod_name)
    pod_ip = self.get_pod_ip(pod_name)
    self.assertIsNotNone(pod_ip)
    self.assertTrue(self.ping_ip_address(pod_ip))

View File

@ -12,3 +12,4 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# Response body served by test pods (presumably the kuryr/demo image) —
# tests compare HTTP replies against this string. TODO confirm against image.
POD_OUTPUT = 'HELLO! I AM ALIVE!!!'
# Name of the Endpoints object kuryr-controller uses for its A/P HA
# leader-election lock (read/patched by the HA scenario tests).
HA_ENDPOINT_NAME = 'kuryr-controller'

View File

@ -0,0 +1,201 @@
# Copyright 2018 Red Hat, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import datetime
import json
import threading
import time
import uuid
import kubernetes
from oslo_log import log as logging
from tempest import config
from tempest.lib.common.utils import test_utils
from tempest.lib import decorators
from kuryr_tempest_plugin.tests.scenario import base
from kuryr_tempest_plugin.tests.scenario import consts
LOG = logging.getLogger(__name__)
CONF = config.CONF
TIMEOUT = 120
class TestHighAvailabilityScenario(base.BaseKuryrScenarioTest):
    """Scenario tests for kuryr-controller Active/Passive HA.

    Exercises scaling the kuryr-controller Deployment up and down while
    checking pod wiring keeps working, and verifies that controllers
    auto-fence (stop processing events) when the leader-election
    annotation is held by someone else.
    """

    @classmethod
    def skip_checks(cls):
        """Skip unless A/P HA and containerized mode are both enabled."""
        super(TestHighAvailabilityScenario, cls).skip_checks()
        if not (CONF.kuryr_kubernetes.ap_ha and
                CONF.kuryr_kubernetes.containerized):
            raise cls.skipException("kuryr-controller A/P HA must be enabled "
                                    "and kuryr-kubernetes must run in "
                                    "containerized mode.")

    def get_kuryr_leader_annotation(self):
        """Return the parsed leader-election record, or None.

        Reads the 'control-plane.alpha.kubernetes.io/leader' annotation
        from the kuryr-controller Endpoints object and returns it as a
        dict. Returns None when the Endpoints object cannot be read
        (e.g. it does not exist yet).
        """
        try:
            endpoint = self.k8s_client.CoreV1Api().read_namespaced_endpoints(
                consts.HA_ENDPOINT_NAME,
                CONF.kuryr_kubernetes.kube_system_namespace)
            annotation = endpoint.metadata.annotations[
                'control-plane.alpha.kubernetes.io/leader']
            return json.loads(annotation)
        except kubernetes.client.rest.ApiException:
            return None

    def wait_for_deployment_scale(self, desired_replicas,
                                  desired_state='Running'):
        """Block until the controller Deployment reaches the given size.

        :param desired_replicas: expected number of kuryr-controller pods
        :param desired_state: phase every matched pod must be in
        :raises AssertionError: when scale isn't reached within TIMEOUT
        """
        def has_scaled():
            # NOTE(review): the 'name=kuryr-controller' label is assumed
            # to match the deployment's pod template — confirm it tracks
            # controller_deployment_name for non-default deployments.
            pods = self.k8s_client.CoreV1Api().list_namespaced_pod(
                CONF.kuryr_kubernetes.kube_system_namespace,
                label_selector='name=kuryr-controller')
            return (len(pods.items) == desired_replicas and
                    all([pod.status.phase == desired_state
                         for pod in pods.items]))

        self.assertTrue(test_utils.call_until_true(has_scaled, TIMEOUT, 5),
                        'Timed out waiting for deployment to scale')

    def scale_controller_deployment(self, replicas):
        """Patch the controller Deployment to *replicas* and wait.

        Uses the configured controller_deployment_name (default
        'kuryr-controller') instead of a hard-coded name, consistently
        with the read_namespaced_deployment calls in the tests below.
        """
        self.k8s_client.AppsV1Api().patch_namespaced_deployment(
            CONF.kuryr_kubernetes.controller_deployment_name,
            CONF.kuryr_kubernetes.kube_system_namespace,
            {'spec': {'replicas': replicas}})
        self.wait_for_deployment_scale(replicas)

    @decorators.idempotent_id('3f09e7d1-0897-46b1-ba9d-ea4116523025')
    def test_scale_up_controller(self):
        """Scaling 1 -> 2 replicas must not cause a leader switch."""
        controller_deployment = (
            self.k8s_client.AppsV1Api().read_namespaced_deployment(
                CONF.kuryr_kubernetes.controller_deployment_name,
                CONF.kuryr_kubernetes.kube_system_namespace))
        # On cleanup scale to original number of replicas
        self.addCleanup(self.scale_controller_deployment,
                        controller_deployment.spec.replicas)

        # Scale to just a single replica
        self.scale_controller_deployment(1)

        # Create a pod and check connectivity
        self.create_and_ping_pod()

        # Get current leader annotation
        annotation = self.get_kuryr_leader_annotation()
        self.assertIsNotNone(annotation)
        transitions = annotation['leaderTransitions']

        # Scale the controller up and wait until it starts
        self.scale_controller_deployment(2)

        # Check that the leader hasn't switched. Assert non-None first so
        # a missing annotation fails clearly instead of with a TypeError.
        annotation = self.get_kuryr_leader_annotation()
        self.assertIsNotNone(annotation)
        self.assertEqual(transitions, annotation['leaderTransitions'])

        # Create another pod and check connectivity
        self.create_and_ping_pod()

    @decorators.idempotent_id('afe75fa5-e9ca-4f7d-bc16-8f1dd7884eea')
    def test_scale_down_controller(self):
        """Scaling 2 -> 1 replicas must keep pod wiring working."""
        controller_deployment = (
            self.k8s_client.AppsV1Api().read_namespaced_deployment(
                CONF.kuryr_kubernetes.controller_deployment_name,
                CONF.kuryr_kubernetes.kube_system_namespace))
        # On cleanup scale to original number of replicas
        self.addCleanup(self.scale_controller_deployment,
                        controller_deployment.spec.replicas)

        # Scale to 2 replicas
        self.scale_controller_deployment(2)

        # Create a pod and check connectivity
        self.create_and_ping_pod()

        # Scale the controller down and wait until it stops
        self.scale_controller_deployment(1)

        # Create another pod and check connectivity
        self.create_and_ping_pod()

    @decorators.idempotent_id('3b218c11-c77b-40a8-ba09-5dd5ae0f8ae3')
    def test_auto_fencing(self):
        """Controllers must step down while a foreign leader holds the lock.

        A background thread keeps writing a fake leader record into the
        election Endpoints object; while it runs, a newly created pod
        must NOT get wired. Once the fake record expires, the pod must
        get an IP and become pingable.
        """
        controller_deployment = (
            self.k8s_client.AppsV1Api().read_namespaced_deployment(
                CONF.kuryr_kubernetes.controller_deployment_name,
                CONF.kuryr_kubernetes.kube_system_namespace))
        # On cleanup scale to original number of replicas
        self.addCleanup(self.scale_controller_deployment,
                        controller_deployment.spec.replicas)

        # Scale to 2 replicas
        self.scale_controller_deployment(2)

        # Create a pod and check connectivity
        self.create_and_ping_pod()

        stop = threading.Event()

        def hostile_takeover():
            """Malform endpoint annotation to take over the leadership.

            Runs for 3 minutes (or until *stop* is set) and for that time
            malforms the endpoint annotation to simulate another
            kuryr-controller taking over the leadership. This should make
            other kuryr-controllers step down and stop processing any
            events for those 3 minutes.
            """
            timeout = datetime.datetime.utcnow() + datetime.timedelta(
                minutes=3)
            fake_name = str(uuid.uuid4())
            while datetime.datetime.utcnow() < timeout and not stop.is_set():
                current = datetime.datetime.utcnow()
                renew = current + datetime.timedelta(seconds=5)
                malformed = {
                    "holderIdentity": fake_name,
                    "leaseDurationSeconds": 5,
                    "acquireTime": current.strftime("%Y-%m-%dT%H:%M:%SZ"),
                    "renewTime": renew.strftime("%Y-%m-%dT%H:%M:%SZ"),
                    "leaderTransitions": 0,
                }
                self.k8s_client.CoreV1Api().patch_namespaced_endpoints(
                    consts.HA_ENDPOINT_NAME,
                    CONF.kuryr_kubernetes.kube_system_namespace,
                    {'metadata': {'annotations': {
                        'control-plane.alpha.kubernetes.io/leader':
                            json.dumps(malformed)}}})
                time.sleep(2)

        t = threading.Thread(target=hostile_takeover)
        t.start()
        # Make sure the takeover thread is stopped and joined on every exit
        # path — otherwise an early assertion failure would leave it
        # malforming the annotation while cleanups rescale the deployment.
        # addCleanup is LIFO, so stop.set runs before t.join.
        self.addCleanup(t.join)
        self.addCleanup(stop.set)

        # Create another pod and check that it's not getting wired.
        time.sleep(15)  # We need to wait a bit for controller to autofence.
        name, pod = self.create_pod(wait_for_status=False)

        def is_pod_running():
            pod_obj = self.k8s_client.CoreV1Api().read_namespaced_pod(
                name, 'default')
            return pod_obj.status.phase == 'Running'

        self.addCleanup(self.delete_pod, name)
        self.assertFalse(test_utils.call_until_true(is_pod_running, TIMEOUT,
                                                    5))

        # Wait 120 seconds more, malformed annotation should get cleared
        time.sleep(TIMEOUT)

        # Now pod should have the IP and be pingable
        ip = self.get_pod_ip(name)
        self.assertIsNotNone(ip)
        self.assertTrue(self.ping_ip_address(ip, ping_timeout=TIMEOUT))