From 9f5628e55bc70bc57bcf66e456fb18af7fe47859 Mon Sep 17 00:00:00 2001
From: Lucian Petrut
Date: Tue, 9 Oct 2018 17:20:16 +0300
Subject: [PATCH] Improve clustered instance failover handling

Instances can bounce between hosts many times in a short interval,
especially when the CSVs go down (as many as 20 times in less than
2 minutes). We're not handling this properly.

The failover handling logic is prone to race conditions: multiple hosts
may attempt to claim the same instance, leaving it in an inconsistent
state.

We're introducing distributed locks, preventing races between hosts.
At the same time, we're validating the events, as the instance may have
moved again by the time we process the event.

A distributed lock backend will have to be configured.

We're also waiting for cluster groups that are in "pending" state. Such
groups may not even be registered in Hyper-V yet, so any action we take
on the VM would fail.

Closes-Bug: #1795299
Closes-Bug: #1796673
Change-Id: I3dbdcf208bb7a96bd516b41e4725a5fcb37280d6
---
 compute_hyperv/nova/cluster/clusterops.py          | 53 ++++++++++++-
 compute_hyperv/nova/cluster/driver.py              |  2 +
 compute_hyperv/nova/driver.py                      |  6 ++
 compute_hyperv/tests/test.py                       | 12 +++
 .../tests/unit/cluster/test_clusterops.py          | 77 ++++++++++++++++---
 .../tests/unit/test_coordination.py                |  2 +
 ...er-distributed-locks-5f12252af6b3913b.yaml      | 11 +++
 7 files changed, 150 insertions(+), 13 deletions(-)
 create mode 100644 releasenotes/notes/cluster-distributed-locks-5f12252af6b3913b.yaml

diff --git a/compute_hyperv/nova/cluster/clusterops.py b/compute_hyperv/nova/cluster/clusterops.py
index e7b456e0..4e5c2780 100644
--- a/compute_hyperv/nova/cluster/clusterops.py
+++ b/compute_hyperv/nova/cluster/clusterops.py
@@ -16,6 +16,7 @@
 """Management class for Cluster VM operations."""
 
 import functools
+import time
 
 from nova.compute import power_state
 from nova.compute import task_states
@@ -26,11 +27,13 @@ from nova import objects
 from nova import utils
 from nova.virt import block_device
 from nova.virt import event as virtevent
+from os_win import constants as os_win_const
 from os_win import exceptions as os_win_exc
 from os_win import utilsfactory
 from oslo_log import log as logging
 
 import compute_hyperv.nova.conf
+from compute_hyperv.nova import coordination
 from compute_hyperv.nova import hostops
 from compute_hyperv.nova import serialconsoleops
 from compute_hyperv.nova import vmops
@@ -106,6 +109,7 @@ class ClusterOps(object):
             instance.name, instance.host,
             self._this_node)
 
+    @coordination.synchronized('failover-{instance_name}')
     def _failover_migrate(self, instance_name, new_host):
         """This method will check if the generated event is a legitimate
         failover to this node. If it is, it will proceed to prepare the
@@ -126,6 +130,31 @@
                    'new_host': new_host,
                    'old_host': old_host})
 
+        # While the cluster group is in "pending" state, it may not even be
+        # registered in Hyper-V, so there's not much we can do. We'll have to
+        # wait for it to be handled by the Failover Cluster service.
+        self._wait_for_pending_instance(instance_name)
+
+        current_host = self._clustutils.get_vm_host(instance_name)
+        instance_moved_again = current_host.upper() != new_host.upper()
+        if instance_moved_again:
+            LOG.warning("While processing instance %(instance)s failover to "
+                        "%(host)s, it has moved to %(current_host)s.",
+                        dict(host=new_host,
+                             current_host=current_host,
+                             instance=instance_name))
+            new_host = current_host
+
+        host_changed = old_host.upper() != new_host.upper()
+        migrated_here = new_host.upper() == self._this_node.upper()
+        migrated_from_here = old_host.upper() == self._this_node.upper()
+
+        if not host_changed:
+            LOG.warning("The source node is the same as the destination "
+                        "node: %(host)s. The instance %(instance)s may have "
+                        "bounced between hosts due to a failure.",
+                        dict(host=old_host, instance=instance_name))
+
         if instance.task_state == task_states.MIGRATING:
             # In case of live migration triggered by the user, we get the
             # event that the instance changed host but we do not want
@@ -135,11 +164,11 @@
         nw_info = self._network_api.get_instance_nw_info(self._context,
                                                          instance)
 
-        if old_host and old_host.upper() == self._this_node.upper():
-            LOG.debug('Actions at source node.')
+        if host_changed and migrated_from_here:
+            LOG.debug('Cleaning up moved instance: %s.', instance_name)
             self._vmops.unplug_vifs(instance, nw_info)
             return
-        elif new_host.upper() != self._this_node.upper():
+        if not migrated_here:
             LOG.debug('Instance %s did not failover to this node.',
                       instance_name)
             return
@@ -148,10 +177,26 @@
                  {'instance': instance_name})
         self._nova_failover_server(instance, new_host)
-        self._failover_migrate_networks(instance, old_host)
+        if host_changed:
+            self._failover_migrate_networks(instance, old_host)
+
         self._vmops.plug_vifs(instance, nw_info)
         self._serial_console_ops.start_console_handler(instance_name)
 
+    def _wait_for_pending_instance(self, instance_name):
+        # TODO(lpetrut): switch to an event listener. We'd probably want to
+        # avoid having one event listener per failed over instance, as there
+        # can be many of them.
+        group_state = self._clustutils.get_cluster_group_state_info(
+            instance_name)['state']
+        while group_state == os_win_const.CLUSTER_GROUP_PENDING:
+            LOG.debug("Waiting for pending instance cluster group: %s",
+                      instance_name)
+            time.sleep(2)
+
+            group_state = self._clustutils.get_cluster_group_state_info(
+                instance_name)['state']
+
     def _failover_migrate_networks(self, instance, source):
         """This is called after a VM failovered to this node.
 
         This will change the owner of the neutron ports to this node.
diff --git a/compute_hyperv/nova/cluster/driver.py b/compute_hyperv/nova/cluster/driver.py index cbc1a984..d95a8512 100644 --- a/compute_hyperv/nova/cluster/driver.py +++ b/compute_hyperv/nova/cluster/driver.py @@ -21,6 +21,8 @@ from compute_hyperv.nova import driver class HyperVClusterDriver(driver.HyperVDriver): + use_coordination = True + def __init__(self, virtapi): super(HyperVClusterDriver, self).__init__(virtapi) diff --git a/compute_hyperv/nova/driver.py b/compute_hyperv/nova/driver.py index 0e4198f2..65faf8f3 100644 --- a/compute_hyperv/nova/driver.py +++ b/compute_hyperv/nova/driver.py @@ -30,6 +30,7 @@ from os_win import utilsfactory from oslo_log import log as logging import six +from compute_hyperv.nova import coordination from compute_hyperv.nova import eventhandler from compute_hyperv.nova import hostops from compute_hyperv.nova import imagecache @@ -108,6 +109,8 @@ class HyperVDriver(driver.ComputeDriver): "supports_trusted_certs": True, } + use_coordination = False + def __init__(self, virtapi): # check if the current version of Windows is supported before any # further driver initialisation. @@ -151,6 +154,9 @@ class HyperVDriver(driver.ComputeDriver): return False def init_host(self, host): + if self.use_coordination: + coordination.COORDINATOR.start() + self._serialconsoleops.start_console_handlers() self._set_event_handler_callbacks() diff --git a/compute_hyperv/tests/test.py b/compute_hyperv/tests/test.py index b5d43f27..38a9694f 100644 --- a/compute_hyperv/tests/test.py +++ b/compute_hyperv/tests/test.py @@ -85,6 +85,7 @@ class NoDBTestCase(base.BaseTestCase): """ TIMEOUT_SCALING_FACTOR = 1 + MOCK_TOOZ = True def setUp(self): """Run before each test method to initialize test environment.""" @@ -111,6 +112,11 @@ class NoDBTestCase(base.BaseTestCase): self.useFixture(nova_fixtures.PoisonFunctions()) + if self.MOCK_TOOZ: + self.patch('compute_hyperv.nova.coordination.Coordinator.start') + self.patch('compute_hyperv.nova.coordination.Coordinator.stop') + self.patch('compute_hyperv.nova.coordination.Coordinator.get_lock') + def _clear_attrs(self): # Delete attributes that don't start with _ so they don't pin # memory around unnecessarily for the duration of the test @@ -124,6 +130,12 @@ class NoDBTestCase(base.BaseTestCase): for k, v in six.iteritems(kw): CONF.set_override(k, v, group) + def patch(self, path, *args, **kwargs): + patcher = mock.patch(path, *args, **kwargs) + result = patcher.start() + self.addCleanup(patcher.stop) + return result + def assertPublicAPISignatures(self, baseinst, inst): def get_public_apis(inst): methods = {} diff --git a/compute_hyperv/tests/unit/cluster/test_clusterops.py b/compute_hyperv/tests/unit/cluster/test_clusterops.py index 88e2abed..9295d875 100644 --- a/compute_hyperv/tests/unit/cluster/test_clusterops.py +++ b/compute_hyperv/tests/unit/cluster/test_clusterops.py @@ -21,6 +21,7 @@ from nova.compute import vm_states from nova.network.neutronv2 import api as network_api from nova import objects from nova.virt import event as virtevent +from os_win import constants as os_win_const from os_win import exceptions as os_win_exc from compute_hyperv.nova.cluster import clusterops @@ -144,10 +145,12 @@ class ClusterOpsTestCase(test_base.HyperVBaseTestCase): mock_instance1.name, mock_instance1.host, self.clusterops._this_node) + @mock.patch.object(clusterops.ClusterOps, '_wait_for_pending_instance') @mock.patch.object(clusterops, 'LOG') @mock.patch.object(clusterops.ClusterOps, '_get_instance_by_name') def 
test_failover_migrate_no_instance(self, mock_get_instance_by_name, - mock_LOG): + mock_LOG, + mock_wait_pending_instance): mock_get_instance_by_name.return_value = None self.clusterops._failover_migrate(mock.sentinel.instance_name, @@ -159,35 +162,40 @@ class ClusterOpsTestCase(test_base.HyperVBaseTestCase): self.assertFalse( self.clusterops._network_api.get_instance_nw_info.called) + @mock.patch.object(clusterops.ClusterOps, '_wait_for_pending_instance') @mock.patch.object(clusterops, 'LOG') @mock.patch.object(clusterops.ClusterOps, '_get_instance_by_name') def test_failover_migrate_migrating(self, mock_get_instance_by_name, - mock_LOG): + mock_LOG, mock_wait_pending_instance): instance = mock_get_instance_by_name.return_value instance.task_state = task_states.MIGRATING self.clusterops._failover_migrate(mock.sentinel.instance_name, - mock.sentinel.new_host) + 'new_host') mock_LOG.debug.assert_called_once_with( 'Instance %s is live migrating.', mock.sentinel.instance_name) + @mock.patch.object(clusterops.ClusterOps, '_wait_for_pending_instance') @mock.patch.object(clusterops.ClusterOps, '_get_instance_by_name') - def test_failover_migrate_at_source_node(self, mock_get_instance_by_name): + def test_failover_migrate_at_source_node(self, mock_get_instance_by_name, + mock_wait_pending_instance): instance = mock_get_instance_by_name.return_value instance.host = 'old_host' self.clusterops._this_node = instance.host self.clusterops._failover_migrate(mock.sentinel.instance_name, - mock.sentinel.new_host) + 'new_host') self.clusterops._vmops.unplug_vifs.assert_called_once_with(instance, self.clusterops._network_api.get_instance_nw_info.return_value) + @mock.patch.object(clusterops.ClusterOps, '_wait_for_pending_instance') @mock.patch.object(clusterops, 'LOG') @mock.patch.object(clusterops.ClusterOps, '_get_instance_by_name') def test_failover_migrate_not_this_node(self, mock_get_instance_by_name, - mock_LOG): + mock_LOG, + mock_wait_pending_instance): self.clusterops._this_node = 'new_host' self.clusterops._failover_migrate(mock.sentinel.instance_name, @@ -197,21 +205,28 @@ class ClusterOpsTestCase(test_base.HyperVBaseTestCase): 'Instance %s did not failover to this node.', mock.sentinel.instance_name) + @mock.patch.object(clusterops.ClusterOps, '_wait_for_pending_instance') @mock.patch.object(clusterops.ClusterOps, '_failover_migrate_networks') @mock.patch.object(clusterops.ClusterOps, '_nova_failover_server') @mock.patch.object(clusterops.ClusterOps, '_get_instance_by_name') - def test_failover_migrate_this_node(self, mock_get_instance_by_name, - mock_nova_failover_server, - mock_failover_migrate_networks): + def test_failover_migrate_changed_host(self, mock_get_instance_by_name, + mock_nova_failover_server, + mock_failover_migrate_networks, + mock_wait_pending_instance): instance = mock_get_instance_by_name.return_value old_host = 'old_host' new_host = 'new_host' instance.host = old_host self.clusterops._this_node = new_host + self._clustutils.get_vm_host.return_value = new_host self.clusterops._failover_migrate(mock.sentinel.instance_name, new_host) + mock_wait_pending_instance.assert_called_once_with( + mock.sentinel.instance_name) + self._clustutils.get_vm_host.assert_called_once_with( + mock.sentinel.instance_name) mock_get_instance_by_name.assert_called_once_with( mock.sentinel.instance_name) get_inst_nw_info = self.clusterops._network_api.get_instance_nw_info @@ -225,6 +240,50 @@ class ClusterOpsTestCase(test_base.HyperVBaseTestCase): c_handler = 
self.clusterops._serial_console_ops.start_console_handler c_handler.assert_called_once_with(mock.sentinel.instance_name) + @mock.patch.object(clusterops.ClusterOps, '_wait_for_pending_instance') + @mock.patch.object(clusterops.ClusterOps, '_failover_migrate_networks') + @mock.patch.object(clusterops.ClusterOps, '_nova_failover_server') + @mock.patch.object(clusterops.ClusterOps, '_get_instance_by_name') + def test_failover_same_node(self, mock_get_instance_by_name, + mock_nova_failover_server, + mock_failover_migrate_networks, + mock_wait_pending_instance): + # In some cases, the instances may bounce between hosts. We're testing + # the case in which the instance is actually returning to the initial + # host during the time in which we're processing events. + instance = mock_get_instance_by_name.return_value + old_host = 'old_host' + new_host = 'new_host' + instance.host = old_host + self.clusterops._this_node = old_host + self._clustutils.get_vm_host.return_value = old_host + + self.clusterops._failover_migrate(mock.sentinel.instance_name, + new_host) + + get_inst_nw_info = self.clusterops._network_api.get_instance_nw_info + get_inst_nw_info.assert_called_once_with(self.clusterops._context, + instance) + mock_nova_failover_server.assert_called_once_with(instance, old_host) + self.clusterops._vmops.unplug_vifs.assert_not_called() + self.clusterops._vmops.plug_vifs.assert_called_once_with( + instance, get_inst_nw_info.return_value) + mock_failover_migrate_networks.assert_not_called() + c_handler = self.clusterops._serial_console_ops.start_console_handler + c_handler.assert_called_once_with(mock.sentinel.instance_name) + + @mock.patch('time.sleep') + def test_wait_for_pending_instance(self, mock_sleep): + self._clustutils.get_cluster_group_state_info.side_effect = [ + dict(state=os_win_const.CLUSTER_GROUP_PENDING), + dict(state=os_win_const.CLUSTER_GROUP_ONLINE)] + + self.clusterops._wait_for_pending_instance(mock.sentinel.instance_name) + + self._clustutils.get_cluster_group_state_info.assert_has_calls( + [mock.call(mock.sentinel.instance_name)] * 2) + mock_sleep.assert_called_once_with(2) + def test_failover_migrate_networks(self): mock_instance = fake_instance.fake_instance_obj(self.context) fake_source = mock.MagicMock() diff --git a/compute_hyperv/tests/unit/test_coordination.py b/compute_hyperv/tests/unit/test_coordination.py index 6dffbd6a..e818ac43 100644 --- a/compute_hyperv/tests/unit/test_coordination.py +++ b/compute_hyperv/tests/unit/test_coordination.py @@ -50,6 +50,8 @@ class MockToozLock(tooz.locking.Lock): @mock.patch('tooz.coordination.get_coordinator') class CoordinatorTestCase(test_base.HyperVBaseTestCase): + MOCK_TOOZ = False + def test_coordinator_start(self, get_coordinator): crd = get_coordinator.return_value diff --git a/releasenotes/notes/cluster-distributed-locks-5f12252af6b3913b.yaml b/releasenotes/notes/cluster-distributed-locks-5f12252af6b3913b.yaml new file mode 100644 index 00000000..758fc8bc --- /dev/null +++ b/releasenotes/notes/cluster-distributed-locks-5f12252af6b3913b.yaml @@ -0,0 +1,11 @@ +--- +upgrade: + - | + When using the cluster driver, a distributed lock backend will have to be + configured. +fixes: + - | + In order to fix race conditions that can occur when handling instance + failovers, the cluster driver is now using distributed locks. A + distributed lock backend (e.g. etcd, mysql, file based, etc) will have to + be configured.
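
A note on the upgrade impact above: the distributed locks are provided
through tooz, so every compute node in the Hyper-V cluster has to point at
the same lock backend via nova-compute's configuration. The exact option
name is not part of this diff; the snippet below assumes the Cinder-style
coordination module exposes it as "backend_url" under a [coordination]
section, and uses placeholder addresses.

    [coordination]
    # All nodes must share this backend, otherwise the failover lock is
    # effectively node-local. A file based backend (e.g.
    # file:///C:/OpenStack/locks) is only suitable for single-node testing.
    backend_url = etcd3+http://192.0.2.10:2379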
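
For reference, the 'failover-{instance_name}' lock added to
_failover_migrate ultimately maps onto a tooz lock. Below is a minimal,
standalone sketch of the underlying tooz primitives, with placeholder
backend URL, member id and lock name; the actual
compute_hyperv.nova.coordination helpers wrap these calls and may differ
in detail.

    # Illustrative only; not the compute_hyperv.nova.coordination API.
    from tooz import coordination

    coordinator = coordination.get_coordinator(
        'etcd3+http://192.0.2.10:2379',  # placeholder backend URL
        b'compute-node-1')               # unique member id per host
    coordinator.start()

    # Every host derives the same lock name for a given instance, so only
    # one of them processes a failover event for it at a time.
    with coordinator.get_lock(b'failover-instance-00000001'):
        pass  # validate the event and claim the instance here

    coordinator.stop()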