Retry ebtables on race

Calls to ebtables can race with libvirt and cause nova, or libvirt to fail to apply ebtables rules. The goal of this patch is to provide a simple fix to improve the stability of the gate. We now call ebtables in a simple loop that retries on failure. Long term we want to update nova to make use of the --concurrent flag in newer versions of ebtables. The --concurrent flag implements a lock to prevent multiple invocations of ebtables from racing. This will require a newer libvirt and the ability to timeout long running execs (--concurrent can block forever if it never gets the lock). A future patch is forthcoming to add support for --concurrent. DocImpact Add ebtables_exec_attempts option (default=3). Change-Id: I3e04782ac4678581462f9bee4bb10d5f3b223457 Partial-Bug: #1316621
2014-11-20 18:29:15 -08:00 · 2014-11-20 18:29:15 -08:00 · fb9b205805
parent 3ada19a650
commit fb9b205805
2 changed files with 80 additions and 3 deletions
--- a/nova/network/linux_net.py
+++ b/nova/network/linux_net.py
@ -126,6 +126,9 @@ linux_net_opts = [
    cfg.BoolOpt('fake_network',
                default=False,
                help='If passed, use fake network devices and addresses'),
+    cfg.IntOpt('ebtables_exec_attempts',
+               default=3,
+               help='Number of times to retry ebtables commands on failure.'),
    ]

 CONF = cfg.CONF
@ -1639,20 +1642,53 @@ class LinuxBridgeInterfaceDriver(LinuxNetInterfaceDriver):
            delete_net_dev(bridge)


+# NOTE(cfb): This is a temporary fix to LP #1316621. We really want to call
+#            ebtables with --concurrent. In order to do that though we need
+#            libvirt to support this. Additionally since ebtables --concurrent
+#            will hang indefinitely waiting on the lock we need to teach
+#            oslo.concurrency.processutils how to timeout a long running
+#            process first. Once those are complete we can replace all of this
+#            with calls to ebtables --concurrent and a reasonable timeout.
+def _exec_ebtables(*cmd, **kwargs):
+    check_exit_code = kwargs.pop('check_exit_code', True)
+
+    # We always try at least once
+    attempts = CONF.ebtables_exec_attempts
+    if attempts <= 0:
+        attempts = 1
+    while attempts > 0:
+        attempts -= 1
+        # NOTE(cfb): ebtables reports all errors with a return code of 255.
+        #            As such we can't know if we hit a locking error, or some
+        #            other error (like a rule doesn't exist) so we have to
+        #            retry on all errors.
+        try:
+            _execute(*cmd, check_exit_code=[0], **kwargs)
+        except processutils.ProcessExecutionError:
+            if not attempts and check_exit_code:
+                LOG.warning(_LW('%s failed. Not Retrying.'), ' '.join(cmd))
+                raise
+            else:
+                LOG.warning(_LW('%s failed. Retrying.'), ' '.join(cmd))
+        else:
+            # Success
+            return
+
+
@utils.synchronized('ebtables', external=True)
 def ensure_ebtables_rules(rules, table='filter'):
    for rule in rules:
        cmd = ['ebtables', '-t', table, '-D'] + rule.split()
-        _execute(*cmd, check_exit_code=False, run_as_root=True)
+        _exec_ebtables(*cmd, check_exit_code=False, run_as_root=True)
        cmd[3] = '-I'
-        _execute(*cmd, run_as_root=True)
+        _exec_ebtables(*cmd, run_as_root=True)


@utils.synchronized('ebtables', external=True)
 def remove_ebtables_rules(rules, table='filter'):
    for rule in rules:
        cmd = ['ebtables', '-t', table, '-D'] + rule.split()
-        _execute(*cmd, check_exit_code=False, run_as_root=True)
+        _exec_ebtables(*cmd, check_exit_code=False, run_as_root=True)


 def isolate_dhcp_address(interface, address):
--- a/nova/tests/unit/network/test_linux_net.py
+++ b/nova/tests/unit/network/test_linux_net.py
@ -1113,3 +1113,44 @@ class LinuxNetworkTestCase(test.NoDBTestCase):
                      check_exit_code=[0, 2, 254])
        ]
        self._create_veth_pair(calls)
+
+    def test_exec_ebtables_success(self):
+        executes = []
+
+        def fake_execute(*args, **kwargs):
+            executes.append(args)
+            return "", ""
+
+        self.stubs.Set(self.driver, '_execute', fake_execute)
+        self.driver._exec_ebtables('fake')
+        self.assertEqual(1, len(executes))
+        self.mox.UnsetStubs()
+
+    def test_exec_ebtables_fail_all(self):
+        executes = []
+
+        def fake_execute(*args, **kwargs):
+            executes.append(args)
+            raise processutils.ProcessExecutionError('error')
+
+        self.stubs.Set(self.driver, '_execute', fake_execute)
+        self.assertRaises(processutils.ProcessExecutionError,
+                          self.driver._exec_ebtables, 'fake')
+        max_calls = CONF.ebtables_exec_attempts
+        self.assertEqual(max_calls, len(executes))
+        self.mox.UnsetStubs()
+
+    def test_exec_ebtables_fail_once(self):
+        executes = []
+
+        def fake_execute(*args, **kwargs):
+            executes.append(args)
+            if len(executes) == 1:
+                raise processutils.ProcessExecutionError('error')
+            else:
+                return "", ""
+
+        self.stubs.Set(self.driver, '_execute', fake_execute)
+        self.driver._exec_ebtables('fake')
+        self.assertEqual(2, len(executes))
+        self.mox.UnsetStubs()