[OVN] Hash Ring: Better handle Neutron worker failures

This patch implements a more resilient approach to handling the case
where Neutron API workers are killed and restarted. Instead of marking
all nodes for that host as offline, this patch removes only the worker
that was killed from the Hash Ring, leaving all other nodes for that
host online.
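
For illustration, the cleanup pattern boils down to the minimal sketch
below (remove_node_by_uuid() mirrors the new DB helper in the diff
further down; node_uuid and worker_cleanup() are hypothetical stand-ins,
and a print replaces the real DB call):

import atexit
import signal
import uuid

node_uuid = str(uuid.uuid4())  # identity this worker registered in the ring

def remove_node_by_uuid(node_uuid):
    # In the patch this deletes the matching row from the ovn_hash_ring
    # table; a print stands in for the DB call here.
    print('Node "%s" removed from the Hash Ring' % node_uuid)

def worker_cleanup(*args, **kwargs):
    # Remove only this worker's node; other workers on the same host keep
    # their Hash Ring entries and stay online.
    remove_node_by_uuid(node_uuid)

# Cover both a normal interpreter exit and an explicit SIGTERM.
atexit.register(worker_cleanup)
signal.signal(signal.SIGTERM, worker_cleanup)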

In case we fail to remove the node and another entry is added when the
worker restarts, this patch also logs a clear critical message to alert
the operator that there are more Hash Ring nodes than API workers (the
two counts are expected to be equal) and that OVSDB events could go
missing if they are routed to the stale node that failed to be removed
from the ring.
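
As a rough, self-contained sketch of that sanity check
(check_hash_ring_consistency() and its arguments are illustrative
stand-ins; the real counting helpers are added in the diff below):

import logging

logging.basicConfig(level=logging.INFO)
LOG = logging.getLogger(__name__)

def check_hash_ring_consistency(num_nodes, api_workers, host):
    # The two counts are expected to match; a surplus of ring nodes means
    # a dead worker's entry was never removed, and OVSDB events hashed to
    # it could be lost.
    if num_nodes > api_workers:
        LOG.critical(
            'The number of nodes in the Hash Ring (%d) is higher than '
            'the number of API workers (%d) for host "%s". OVSDB events '
            'could be missed, see LP #2024205.',
            num_nodes, api_workers, host)

check_hash_ring_consistency(num_nodes=3, api_workers=2, host='controller-0')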

Closes-Bug: #2024205
Change-Id: I4b7376cf7df45fcc6e487970b068d06b4e74e319
Signed-off-by: Lucas Alvares Gomes <lucasagomes@gmail.com>
Lucas Alvares Gomes 2023-06-16 13:44:07 +01:00
parent 28b2936451
commit 9e8e3a7867
6 changed files with 53 additions and 20 deletions

neutron/cmd/ovn/neutron_ovn_db_sync_util.py

@@ -58,12 +58,12 @@ class OVNMechanismDriver(mech_driver.OVNMechanismDriver):
     def ovn_client(self):
         return self._ovn_client

-    def _set_hash_ring_nodes_offline(self):
-        """Don't set hash ring nodes as offline.
+    def _remove_node_from_hash_ring(self):
+        """Don't remove the node from the Hash Ring.

         If this method was not overridden, cleanup would be performed when
-        calling the db sync and running neutron server would mark all the
-        nodes from the ring as offline.
+        calling the db sync and running neutron server would remove the
+        nodes from the Hash Ring.
         """

     # Since we are not using the ovn mechanism driver while syncing,

neutron/db/ovn_hash_ring_db.py

@@ -50,6 +50,13 @@ def remove_nodes_from_host(context, group_name):
              CONF.host, group_name)


+def remove_node_by_uuid(context, node_uuid):
+    with db_api.CONTEXT_WRITER.using(context):
+        context.session.query(ovn_models.OVNHashRing).filter(
+            ovn_models.OVNHashRing.node_uuid == node_uuid).delete()
+    LOG.info('Node "%s" removed from the Hash Ring', node_uuid)
+
+
 def _touch(context, updated_at=None, **filter_args):
     if updated_at is None:
         updated_at = timeutils.utcnow()

@@ -96,7 +103,9 @@ def count_offline_nodes(context, interval, group_name):
     return query.count()


-def set_nodes_from_host_as_offline(context, group_name):
-    timestamp = datetime.datetime(day=26, month=10, year=1985, hour=9)
-    _touch(context, updated_at=timestamp, hostname=CONF.host,
-           group_name=group_name)
+@db_api.CONTEXT_READER
+def count_nodes_from_host(context, group_name):
+    query = context.session.query(ovn_models.OVNHashRing).filter(
+        ovn_models.OVNHashRing.group_name == group_name,
+        ovn_models.OVNHashRing.hostname == CONF.host)
+    return query.count()

neutron/plugins/ml2/drivers/ovn/mech_driver/mech_driver.py

@@ -286,17 +286,17 @@ class OVNMechanismDriver(api.MechanismDriver):
                            resources.SECURITY_GROUP_RULE,
                            events.BEFORE_DELETE)

-    def _set_hash_ring_nodes_offline(self, *args, **kwargs):
+    def _remove_node_from_hash_ring(self, *args, **kwargs):
+        # The node_uuid attribute will be empty for worker types
+        # that are not added to the Hash Ring and can be skipped
+        if self.node_uuid is None:
+            return
         admin_context = n_context.get_admin_context()
-        ovn_hash_ring_db.set_nodes_from_host_as_offline(
-            admin_context, self.hash_ring_group)
-        LOG.info('Hash Ring nodes from host "%s" marked as offline',
-                 cfg.CONF.host)
+        ovn_hash_ring_db.remove_node_by_uuid(
+            admin_context, self.node_uuid)

     def pre_fork_initialize(self, resource, event, trigger, payload=None):
         """Pre-initialize the ML2/OVN driver."""
-        atexit.register(self._set_hash_ring_nodes_offline)
-        signal.signal(signal.SIGTERM, self._set_hash_ring_nodes_offline)
         ovn_utils.create_neutron_pg_drop()

     @staticmethod

@@ -314,6 +314,10 @@ class OVNMechanismDriver(api.MechanismDriver):
         thread for this host. Subsequently workers just need to register
         themselves to the hash ring.
         """
+        # Attempt to remove the node from the ring when the worker stops
+        atexit.register(self._remove_node_from_hash_ring)
+        signal.signal(signal.SIGTERM, self._remove_node_from_hash_ring)
+
         admin_context = n_context.get_admin_context()
         if not self._hash_ring_probe_event.is_set():
             # Clear existing entries

neutron/plugins/ml2/drivers/ovn/mech_driver/ovsdb/maintenance.py

@@ -41,6 +41,7 @@ from neutron.db import ovn_revision_numbers_db as revision_numbers_db
 from neutron.objects import ports as ports_obj
 from neutron.objects import router as router_obj
 from neutron.plugins.ml2.drivers.ovn.mech_driver.ovsdb import ovn_db_sync
+from neutron import service
 from neutron.services.logapi.drivers.ovn import driver as log_driver

@@ -1100,3 +1101,20 @@ class HashRingHealthCheckPeriodics(object):
         # here because we want the maintenance tasks from each instance to
         # execute this task.
         hash_ring_db.touch_nodes_from_host(self.ctx, self._group)
+
+        # Check the number of the nodes in the ring and log a message in
+        # case they are out of sync. See LP #2024205 for more information
+        # on this issue.
+        api_workers = service._get_api_workers()
+        num_nodes = hash_ring_db.count_nodes_from_host(self.ctx, self._group)
+
+        if num_nodes > api_workers:
+            LOG.critical(
+                'The number of nodes in the Hash Ring (%d) is higher than '
+                'the number of API workers (%d) for host "%s". Something is '
+                'not right and OVSDB events could be missed because of this. '
+                'Please check the status of the Neutron processes, this can '
+                'happen when the API workers are killed and restarted. '
+                'Restarting the service should fix the issue, see LP '
+                '#2024205 for more information.',
+                num_nodes, api_workers, cfg.CONF.host)

neutron/tests/functional/base.py

@@ -327,7 +327,7 @@ class TestOVNFunctionalBase(test_plugin.Ml2PluginV2TestCase,
         # NOTE(ralonsoh): do not access to the DB at exit when the SQL
         # connection is already closed, to avoid useless exception messages.
         mock.patch.object(
-            self.mech_driver, '_set_hash_ring_nodes_offline').start()
+            self.mech_driver, '_remove_node_from_hash_ring').start()
         self.mech_driver.pre_fork_initialize(
             mock.ANY, mock.ANY, trigger_cls.trigger)

neutron/tests/unit/db/test_ovn_hash_ring_db.py

@@ -270,16 +270,18 @@ class TestHashRing(testlib_api.SqlTestCaseLight):
         self.assertEqual(0, ovn_hash_ring_db.count_offline_nodes(
             self.admin_ctx, interval=60, group_name=HASH_RING_TEST_GROUP))

-    def test_set_nodes_from_host_as_offline(self):
+    def test_remove_node_by_uuid(self):
         self._add_nodes_and_assert_exists(count=3)

         active_nodes = ovn_hash_ring_db.get_active_nodes(
             self.admin_ctx, interval=60, group_name=HASH_RING_TEST_GROUP)
         self.assertEqual(3, len(active_nodes))

-        ovn_hash_ring_db.set_nodes_from_host_as_offline(
-            self.admin_ctx, HASH_RING_TEST_GROUP)
+        node_to_remove = active_nodes[0].node_uuid
+        ovn_hash_ring_db.remove_node_by_uuid(
+            self.admin_ctx, node_to_remove)

         active_nodes = ovn_hash_ring_db.get_active_nodes(
             self.admin_ctx, interval=60, group_name=HASH_RING_TEST_GROUP)
-        self.assertEqual(0, len(active_nodes))
+        self.assertEqual(2, len(active_nodes))
+        self.assertNotIn(node_to_remove, [n.node_uuid for n in active_nodes])