[OVN] Hash Ring: Set nodes as offline upon exit

This patch implements the proposed solution from LP #2024205: when a
Neutron process is killed, it can trigger the deletion of the entries
in the ovn_hash_ring table that match the server hostname. When this
happens on all controllers, the ovn_hash_ring table can end up empty,
which results in ML2/OVN not processing any OVSDB events.

Instead of removing the nodes from the ovn_hash_ring table at exit,
this patch changes the code to just mark them as offline. That way,
the nodes remain registered in the table and the heartbeat thread will
set them as online again on the next beat. If the service is stopped
properly, there will be no more heartbeats and the nodes will be seen
as offline by the Hash Ring Manager (same as if they were deleted).

For more info see LP #2024205.
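
For context, a hash ring node's liveness is derived from the age of its
updated_at column: it counts as online only while its last heartbeat falls
within the probe interval. A minimal, self-contained sketch of that rule
(illustrative names and interval, not Neutron code):

import datetime

HEARTBEAT_INTERVAL = 60  # seconds; illustrative value only

def is_online(updated_at, interval=HEARTBEAT_INTERVAL):
    # A node counts as online only if its last heartbeat is recent enough.
    limit = datetime.datetime.utcnow() - datetime.timedelta(seconds=interval)
    return updated_at >= limit

now = datetime.datetime.utcnow()
print(is_online(now))                                  # True: fresh heartbeat
print(is_online(now - datetime.timedelta(hours=1)))    # False: stale, offline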

Closes-Bug: #2024205
Change-Id: I052841c87651773c4988fcf39f9f978094297704
Signed-off-by: Lucas Alvares Gomes <lucasagomes@gmail.com>
Author: Lucas Alvares Gomes, 2023-06-16 10:16:23 +01:00
parent 42494ad6ae
commit f2e3ab3805
5 changed files with 40 additions and 13 deletions


@@ -58,12 +58,12 @@ class OVNMechanismDriver(mech_driver.OVNMechanismDriver):
     def ovn_client(self):
         return self._ovn_client
 
-    def _clean_hash_ring(self):
-        """Don't clean the hash ring.
+    def _set_hash_ring_nodes_offline(self):
+        """Don't set hash ring nodes as offline.
 
         If this method was not overridden, cleanup would be performed when
-        calling the db sync and running neutron server would lose all the nodes
-        from the ring.
+        calling the db sync and running neutron server would mark all the
+        nodes from the ring as offline.
         """
 
     # Since we are not using the ovn mechanism driver while syncing,


@@ -50,10 +50,12 @@ def remove_nodes_from_host(context, group_name):
              CONF.host, group_name)
 
 
-def _touch(context, **filter_args):
+def _touch(context, updated_at=None, **filter_args):
+    if updated_at is None:
+        updated_at = timeutils.utcnow()
     with db_api.CONTEXT_WRITER.using(context):
         context.session.query(ovn_models.OVNHashRing).filter_by(
-            **filter_args).update({'updated_at': timeutils.utcnow()})
+            **filter_args).update({'updated_at': updated_at})
 
 
 def touch_nodes_from_host(context, group_name):
@@ -92,3 +94,9 @@ def get_active_nodes(context, interval, group_name, from_host=False):
 def count_offline_nodes(context, interval, group_name):
     query = _get_nodes_query(context, interval, group_name, offline=True)
     return query.count()
+
+
+def set_nodes_from_host_as_offline(context, group_name):
+    timestamp = datetime.datetime(day=26, month=10, year=1985, hour=9)
+    _touch(context, updated_at=timestamp, hostname=CONF.host,
+           group_name=group_name)
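
The new helper just reuses _touch() with a fixed timestamp far in the past
instead of "now", so the rows stay registered but can never pass an
interval-based active check. A quick self-contained sanity check of that
assumption (plain datetime, outside Neutron):

import datetime

# Same sentinel as set_nodes_from_host_as_offline() above.
offline_ts = datetime.datetime(day=26, month=10, year=1985, hour=9)

age = datetime.datetime.utcnow() - offline_ts
print(age.total_seconds() > 3600)   # True for any realistic heartbeat interval
print(age.days > 10000)             # True: decades in the past

Because the rows are only updated, count_offline_nodes() starts reporting
them and a later touch_nodes_from_host() call makes them active again
without re-adding anything.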


@@ -286,15 +286,17 @@ class OVNMechanismDriver(api.MechanismDriver):
                            resources.SECURITY_GROUP_RULE,
                            events.BEFORE_DELETE)
 
-    def _clean_hash_ring(self, *args, **kwargs):
+    def _set_hash_ring_nodes_offline(self, *args, **kwargs):
         admin_context = n_context.get_admin_context()
-        ovn_hash_ring_db.remove_nodes_from_host(admin_context,
-                                                self.hash_ring_group)
+        ovn_hash_ring_db.set_nodes_from_host_as_offline(
+            admin_context, self.hash_ring_group)
+        LOG.info('Hash Ring nodes from host "%s" marked as offline',
+                 cfg.CONF.host)
 
     def pre_fork_initialize(self, resource, event, trigger, payload=None):
         """Pre-initialize the ML2/OVN driver."""
-        atexit.register(self._clean_hash_ring)
-        signal.signal(signal.SIGTERM, self._clean_hash_ring)
+        atexit.register(self._set_hash_ring_nodes_offline)
+        signal.signal(signal.SIGTERM, self._set_hash_ring_nodes_offline)
         ovn_utils.create_neutron_pg_drop()
 
     @staticmethod
@@ -314,7 +316,9 @@ class OVNMechanismDriver(api.MechanismDriver):
         """
         admin_context = n_context.get_admin_context()
         if not self._hash_ring_probe_event.is_set():
-            self._clean_hash_ring()
+            # Clear existing entries
+            ovn_hash_ring_db.remove_nodes_from_host(admin_context,
+                                                    self.hash_ring_group)
             self.node_uuid = ovn_hash_ring_db.add_node(admin_context,
                                                        self.hash_ring_group)
             self._hash_ring_thread = maintenance.MaintenanceThread()
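
The exit path wired up in pre_fork_initialize() is the standard
atexit/signal pattern; one handler can serve both hooks because it ignores
its arguments, just like _set_hash_ring_nodes_offline(*args, **kwargs)
above. A small runnable sketch of the pattern (the handler body is a
stand-in, not the driver code):

import atexit
import signal

def _mark_nodes_offline(*args, **kwargs):
    # Stand-in handler: atexit calls it with no arguments, while a signal
    # handler receives (signum, frame); *args absorbs both cases.
    print('marking hash ring nodes for this host as offline')

atexit.register(_mark_nodes_offline)
signal.signal(signal.SIGTERM, _mark_nodes_offline)

A clean interpreter exit runs the atexit hook and SIGTERM goes through the
signal handler; in both cases the rows are only marked stale, so a
restarted worker's heartbeat brings them back.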


@@ -326,7 +326,8 @@ class TestOVNFunctionalBase(test_plugin.Ml2PluginV2TestCase,
         self.addCleanup(self.stop)
         # NOTE(ralonsoh): do not access to the DB at exit when the SQL
         # connection is already closed, to avoid useless exception messages.
-        mock.patch.object(self.mech_driver, '_clean_hash_ring').start()
+        mock.patch.object(
+            self.mech_driver, '_set_hash_ring_nodes_offline').start()
         self.mech_driver.pre_fork_initialize(
             mock.ANY, mock.ANY, trigger_cls.trigger)
 


@@ -269,3 +269,17 @@ class TestHashRing(testlib_api.SqlTestCaseLight):
         # Assert no nodes are considered offline
         self.assertEqual(0, ovn_hash_ring_db.count_offline_nodes(
             self.admin_ctx, interval=60, group_name=HASH_RING_TEST_GROUP))
+
+    def test_set_nodes_from_host_as_offline(self):
+        self._add_nodes_and_assert_exists(count=3)
+
+        active_nodes = ovn_hash_ring_db.get_active_nodes(
+            self.admin_ctx, interval=60, group_name=HASH_RING_TEST_GROUP)
+        self.assertEqual(3, len(active_nodes))
+
+        ovn_hash_ring_db.set_nodes_from_host_as_offline(
+            self.admin_ctx, HASH_RING_TEST_GROUP)
+
+        active_nodes = ovn_hash_ring_db.get_active_nodes(
+            self.admin_ctx, interval=60, group_name=HASH_RING_TEST_GROUP)
+        self.assertEqual(0, len(active_nodes))
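
The added test only covers the offline transition; the "heartbeat brings
them back" behaviour described in the commit message could be exercised
with a companion test along these lines (a sketch reusing helpers already
visible in this file, not part of the patch):

    def test_set_nodes_from_host_as_offline_then_heartbeat(self):
        self._add_nodes_and_assert_exists(count=3)
        ovn_hash_ring_db.set_nodes_from_host_as_offline(
            self.admin_ctx, HASH_RING_TEST_GROUP)

        # A heartbeat overwrites updated_at with "now", so the same rows
        # count as active again without being re-added.
        ovn_hash_ring_db.touch_nodes_from_host(
            self.admin_ctx, HASH_RING_TEST_GROUP)
        active_nodes = ovn_hash_ring_db.get_active_nodes(
            self.admin_ctx, interval=60, group_name=HASH_RING_TEST_GROUP)
        self.assertEqual(3, len(active_nodes))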