Retry port lists on failure in PXE filter periodic sync
These calls are subject to transient network problems, we should not abort ironic-inspector process in this case. Also due to bug 1748893 the port listing API can sometimes return HTTP 400. This change retries port listing 5 times with 1 second break before aborting the periodic task and thus the process. This change introduces a dependency on the retrying library, which is already widely used in OpenStack (including ironic). Change-Id: I92fd70ca5692ce9f6798eedf9e540d5aa7c6f1af Closes-Bug: #1748893
This commit is contained in:
parent
e090ffa369
commit
3237511cc6
|
@ -17,6 +17,7 @@ from ironicclient import client
|
||||||
from ironicclient import exceptions as ironic_exc
|
from ironicclient import exceptions as ironic_exc
|
||||||
import netaddr
|
import netaddr
|
||||||
from oslo_config import cfg
|
from oslo_config import cfg
|
||||||
|
import retrying
|
||||||
|
|
||||||
from ironic_inspector.common.i18n import _
|
from ironic_inspector.common.i18n import _
|
||||||
from ironic_inspector.common import keystone
|
from ironic_inspector.common import keystone
|
||||||
|
@ -162,3 +163,16 @@ def get_node(node_id, ironic=None, **kwargs):
|
||||||
except ironic_exc.HttpError as exc:
|
except ironic_exc.HttpError as exc:
|
||||||
raise utils.Error(_("Cannot get node %(node)s: %(exc)s") %
|
raise utils.Error(_("Cannot get node %(node)s: %(exc)s") %
|
||||||
{'node': node_id, 'exc': exc})
|
{'node': node_id, 'exc': exc})
|
||||||
|
|
||||||
|
|
||||||
|
@retrying.retry(
|
||||||
|
retry_on_exception=lambda exc: isinstance(exc, ironic_exc.ClientException),
|
||||||
|
stop_max_attempt_number=5, wait_fixed=1000)
|
||||||
|
def call_with_retries(func, *args, **kwargs):
|
||||||
|
"""Call an ironic client function retrying all errors.
|
||||||
|
|
||||||
|
If an ironic client exception is raised, try calling the func again,
|
||||||
|
at most 5 times, waiting 1 sec between each call. If on the 5th attempt
|
||||||
|
the func raises again, the exception is propagated to the caller.
|
||||||
|
"""
|
||||||
|
return func(*args, **kwargs)
|
||||||
|
|
|
@ -66,7 +66,8 @@ class DnsmasqFilter(pxe_filter.BaseFilter):
|
||||||
timestamp_start = timeutils.utcnow()
|
timestamp_start = timeutils.utcnow()
|
||||||
active_macs = node_cache.active_macs()
|
active_macs = node_cache.active_macs()
|
||||||
ironic_macs = set(port.address for port in
|
ironic_macs = set(port.address for port in
|
||||||
ironic.port.list(limit=0, fields=['address']))
|
ir_utils.call_with_retries(ironic.port.list, limit=0,
|
||||||
|
fields=['address']))
|
||||||
blacklist_macs = _get_blacklist()
|
blacklist_macs = _get_blacklist()
|
||||||
# NOTE(milan) whitelist MACs of ports not kept in ironic anymore
|
# NOTE(milan) whitelist MACs of ports not kept in ironic anymore
|
||||||
# also whitelist active MACs that are still blacklisted in the
|
# also whitelist active MACs that are still blacklisted in the
|
||||||
|
|
|
@ -19,6 +19,7 @@ from eventlet.green import subprocess
|
||||||
from oslo_config import cfg
|
from oslo_config import cfg
|
||||||
from oslo_log import log
|
from oslo_log import log
|
||||||
|
|
||||||
|
from ironic_inspector.common import ironic as ir_utils
|
||||||
from ironic_inspector import node_cache
|
from ironic_inspector import node_cache
|
||||||
from ironic_inspector.pxe_filter import base as pxe_filter
|
from ironic_inspector.pxe_filter import base as pxe_filter
|
||||||
|
|
||||||
|
@ -225,9 +226,9 @@ def _ib_mac_to_rmac_mapping(ports):
|
||||||
|
|
||||||
|
|
||||||
def _get_blacklist(ironic):
|
def _get_blacklist(ironic):
|
||||||
ports = [
|
ports = [port.address for port in
|
||||||
port.address for port in ironic.port.list(
|
ir_utils.call_with_retries(ironic.port.list, limit=0,
|
||||||
limit=0, fields=['address', 'extra'])
|
fields=['address', 'extra'])
|
||||||
if port.address not in node_cache.active_macs()]
|
if port.address not in node_cache.active_macs()]
|
||||||
_ib_mac_to_rmac_mapping(ports)
|
_ib_mac_to_rmac_mapping(ports)
|
||||||
return ports
|
return ports
|
||||||
|
|
|
@ -15,6 +15,7 @@ import socket
|
||||||
import unittest
|
import unittest
|
||||||
|
|
||||||
from ironicclient import client
|
from ironicclient import client
|
||||||
|
from ironicclient import exc as ironic_exc
|
||||||
import mock
|
import mock
|
||||||
from oslo_config import cfg
|
from oslo_config import cfg
|
||||||
|
|
||||||
|
@ -144,3 +145,41 @@ class TestCapabilities(unittest.TestCase):
|
||||||
output = ir_utils.dict_to_capabilities(capabilities_dict)
|
output = ir_utils.dict_to_capabilities(capabilities_dict)
|
||||||
self.assertIn('cat:meow', output)
|
self.assertIn('cat:meow', output)
|
||||||
self.assertIn('dog:wuff', output)
|
self.assertIn('dog:wuff', output)
|
||||||
|
|
||||||
|
|
||||||
|
class TestCallWithRetries(unittest.TestCase):
|
||||||
|
def setUp(self):
|
||||||
|
super(TestCallWithRetries, self).setUp()
|
||||||
|
self.call = mock.Mock(spec=[])
|
||||||
|
|
||||||
|
def test_no_retries_on_success(self):
|
||||||
|
result = ir_utils.call_with_retries(self.call, 'meow', answer=42)
|
||||||
|
self.assertEqual(result, self.call.return_value)
|
||||||
|
self.call.assert_called_once_with('meow', answer=42)
|
||||||
|
|
||||||
|
def test_no_retries_on_python_error(self):
|
||||||
|
self.call.side_effect = RuntimeError('boom')
|
||||||
|
self.assertRaisesRegexp(RuntimeError, 'boom',
|
||||||
|
ir_utils.call_with_retries,
|
||||||
|
self.call, 'meow', answer=42)
|
||||||
|
self.call.assert_called_once_with('meow', answer=42)
|
||||||
|
|
||||||
|
@mock.patch('time.sleep', lambda _x: None)
|
||||||
|
def test_retries_on_ironicclient_error(self):
|
||||||
|
self.call.side_effect = [
|
||||||
|
ironic_exc.ClientException('boom')
|
||||||
|
] * 3 + [mock.sentinel.result]
|
||||||
|
|
||||||
|
result = ir_utils.call_with_retries(self.call, 'meow', answer=42)
|
||||||
|
self.assertEqual(result, mock.sentinel.result)
|
||||||
|
self.call.assert_called_with('meow', answer=42)
|
||||||
|
self.assertEqual(4, self.call.call_count)
|
||||||
|
|
||||||
|
@mock.patch('time.sleep', lambda _x: None)
|
||||||
|
def test_retries_on_ironicclient_error_with_failure(self):
|
||||||
|
self.call.side_effect = ironic_exc.ClientException('boom')
|
||||||
|
self.assertRaisesRegexp(ironic_exc.ClientException, 'boom',
|
||||||
|
ir_utils.call_with_retries,
|
||||||
|
self.call, 'meow', answer=42)
|
||||||
|
self.call.assert_called_with('meow', answer=42)
|
||||||
|
self.assertEqual(5, self.call.call_count)
|
||||||
|
|
|
@ -15,6 +15,7 @@ import datetime
|
||||||
import os
|
import os
|
||||||
|
|
||||||
import fixtures
|
import fixtures
|
||||||
|
from ironicclient import exc as ironic_exc
|
||||||
import mock
|
import mock
|
||||||
from oslo_config import cfg
|
from oslo_config import cfg
|
||||||
import six
|
import six
|
||||||
|
@ -299,6 +300,29 @@ class TestSync(DnsmasqTestBase):
|
||||||
self.timestamp_end - self.timestamp_start)
|
self.timestamp_end - self.timestamp_start)
|
||||||
])
|
])
|
||||||
|
|
||||||
|
@mock.patch('time.sleep', lambda _x: None)
|
||||||
|
def test__sync_with_port_list_retries(self):
|
||||||
|
self.mock_ironic.port.list.side_effect = [
|
||||||
|
ironic_exc.ConnectionRefused('boom'),
|
||||||
|
[mock.Mock(address=address) for address in self.ironic_macs]
|
||||||
|
]
|
||||||
|
self.driver._sync(self.mock_ironic)
|
||||||
|
|
||||||
|
self.mock__whitelist_mac.assert_has_calls([mock.call('active_mac'),
|
||||||
|
mock.call('gone_mac')],
|
||||||
|
any_order=True)
|
||||||
|
self.mock__blacklist_mac.assert_has_calls([mock.call('new_mac')],
|
||||||
|
any_order=True)
|
||||||
|
self.mock_ironic.port.list.assert_called_with(limit=0,
|
||||||
|
fields=['address'])
|
||||||
|
self.mock_active_macs.assert_called_once_with()
|
||||||
|
self.mock__get_blacklist.assert_called_once_with()
|
||||||
|
self.mock_log.debug.assert_has_calls([
|
||||||
|
mock.call('Syncing the driver'),
|
||||||
|
mock.call('The dnsmasq PXE filter was synchronized (took %s)',
|
||||||
|
self.timestamp_end - self.timestamp_start)
|
||||||
|
])
|
||||||
|
|
||||||
|
|
||||||
class Test_Execute(test_base.BaseTest):
|
class Test_Execute(test_base.BaseTest):
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
|
|
|
@ -14,6 +14,7 @@
|
||||||
# under the License.
|
# under the License.
|
||||||
|
|
||||||
import fixtures
|
import fixtures
|
||||||
|
from ironicclient import exc as ironic_exc
|
||||||
import mock
|
import mock
|
||||||
from oslo_config import cfg
|
from oslo_config import cfg
|
||||||
|
|
||||||
|
@ -354,3 +355,21 @@ class TestGetBlacklist(test_base.BaseTest):
|
||||||
self.mock_ironic.port.list.assert_called_once_with(
|
self.mock_ironic.port.list.assert_called_once_with(
|
||||||
limit=0, fields=['address', 'extra'])
|
limit=0, fields=['address', 'extra'])
|
||||||
self.mock__ib_mac_to_rmac_mapping.assert_called_once_with(ports)
|
self.mock__ib_mac_to_rmac_mapping.assert_called_once_with(ports)
|
||||||
|
|
||||||
|
@mock.patch('time.sleep', lambda _x: None)
|
||||||
|
def test_retry_on_port_list_failure(self):
|
||||||
|
self.mock_ironic.port.list.side_effect = [
|
||||||
|
ironic_exc.ConnectionRefused('boom'),
|
||||||
|
[
|
||||||
|
mock.Mock(address='foo'),
|
||||||
|
mock.Mock(address='bar'),
|
||||||
|
]
|
||||||
|
]
|
||||||
|
self.mock_active_macs.return_value = {'foo'}
|
||||||
|
|
||||||
|
ports = iptables._get_blacklist(self.mock_ironic)
|
||||||
|
# foo is an active address so we expect the blacklist contains only bar
|
||||||
|
self.assertEqual(['bar'], ports)
|
||||||
|
self.mock_ironic.port.list.assert_called_with(
|
||||||
|
limit=0, fields=['address', 'extra'])
|
||||||
|
self.mock__ib_mac_to_rmac_mapping.assert_called_once_with(ports)
|
||||||
|
|
|
@ -0,0 +1,11 @@
|
||||||
|
---
|
||||||
|
fixes:
|
||||||
|
- |
|
||||||
|
The periodic PXE filter update task now retries fetching port list from
|
||||||
|
the Bare Metal service 5 times (with 1 second delay) before giving up.
|
||||||
|
This ensures that a temporary networking glitch will not result in
|
||||||
|
the ironic-inspector service stopping.
|
||||||
|
upgrade:
|
||||||
|
- |
|
||||||
|
Adds dependency on the `retrying <https://github.com/rholder/retrying>`_
|
||||||
|
python library.
|
|
@ -29,6 +29,7 @@ oslo.policy>=1.30.0 # Apache-2.0
|
||||||
oslo.rootwrap>=5.8.0 # Apache-2.0
|
oslo.rootwrap>=5.8.0 # Apache-2.0
|
||||||
oslo.serialization!=2.19.1,>=2.18.0 # Apache-2.0
|
oslo.serialization!=2.19.1,>=2.18.0 # Apache-2.0
|
||||||
oslo.utils>=3.33.0 # Apache-2.0
|
oslo.utils>=3.33.0 # Apache-2.0
|
||||||
|
retrying!=1.3.0,>=1.2.3 # Apache-2.0
|
||||||
six>=1.10.0 # MIT
|
six>=1.10.0 # MIT
|
||||||
stevedore>=1.20.0 # Apache-2.0
|
stevedore>=1.20.0 # Apache-2.0
|
||||||
SQLAlchemy!=1.1.5,!=1.1.6,!=1.1.7,!=1.1.8,>=1.0.10 # MIT
|
SQLAlchemy!=1.1.5,!=1.1.6,!=1.1.7,!=1.1.8,>=1.0.10 # MIT
|
||||||
|
|
Loading…
Reference in New Issue