From f2e3ab380580df8c0ab33884f2e8797e2f5b0724 Mon Sep 17 00:00:00 2001
From: Lucas Alvares Gomes
Date: Fri, 16 Jun 2023 10:16:23 +0100
Subject: [PATCH] [OVN] Hash Ring: Set nodes as offline upon exit

This patch implements the solution proposed in LP #2024205: killing a
Neutron process could trigger the deletion of every entry in the
ovn_hash_ring table that matches the server hostname. When this
happened on all controllers, the ovn_hash_ring table would be left
empty, causing ML2/OVN to stop processing any OVSDB events.

Instead of removing the nodes from the ovn_hash_ring table at exit,
this patch changes the code to mark them as offline instead. That way
the nodes remain registered in the table and the heartbeat thread will
set them as online again on the next beat. If the service is stopped
properly there will be no further heartbeats and the Hash Ring Manager
will see the nodes as offline (same as if they were deleted).

For more info see LP #2024205.

Closes-Bug: #2024205
Change-Id: I052841c87651773c4988fcf39f9f978094297704
Signed-off-by: Lucas Alvares Gomes
---
 neutron/cmd/ovn/neutron_ovn_db_sync_util.py    |  8 ++++----
 neutron/db/ovn_hash_ring_db.py                 | 12 ++++++++++--
 .../ml2/drivers/ovn/mech_driver/mech_driver.py | 16 ++++++++++------
 neutron/tests/functional/base.py               |  3 ++-
 neutron/tests/unit/db/test_ovn_hash_ring_db.py | 14 ++++++++++++++
 5 files changed, 40 insertions(+), 13 deletions(-)

diff --git a/neutron/cmd/ovn/neutron_ovn_db_sync_util.py b/neutron/cmd/ovn/neutron_ovn_db_sync_util.py
index 10816c44ad6..a4013c8d129 100644
--- a/neutron/cmd/ovn/neutron_ovn_db_sync_util.py
+++ b/neutron/cmd/ovn/neutron_ovn_db_sync_util.py
@@ -58,12 +58,12 @@ class OVNMechanismDriver(mech_driver.OVNMechanismDriver):
     def ovn_client(self):
         return self._ovn_client
 
-    def _clean_hash_ring(self):
-        """Don't clean the hash ring.
+    def _set_hash_ring_nodes_offline(self):
+        """Don't set hash ring nodes as offline.
 
         If this method was not overridden, cleanup would be performed when
-        calling the db sync and running neutron server would lose all the nodes
-        from the ring.
+        calling the db sync and running neutron server would mark all the
+        nodes from the ring as offline.
""" # Since we are not using the ovn mechanism driver while syncing, diff --git a/neutron/db/ovn_hash_ring_db.py b/neutron/db/ovn_hash_ring_db.py index fd9430d6919..59f528f7730 100644 --- a/neutron/db/ovn_hash_ring_db.py +++ b/neutron/db/ovn_hash_ring_db.py @@ -50,10 +50,12 @@ def remove_nodes_from_host(context, group_name): CONF.host, group_name) -def _touch(context, **filter_args): +def _touch(context, updated_at=None, **filter_args): + if updated_at is None: + updated_at = timeutils.utcnow() with db_api.CONTEXT_WRITER.using(context): context.session.query(ovn_models.OVNHashRing).filter_by( - **filter_args).update({'updated_at': timeutils.utcnow()}) + **filter_args).update({'updated_at': updated_at}) def touch_nodes_from_host(context, group_name): @@ -92,3 +94,9 @@ def get_active_nodes(context, interval, group_name, from_host=False): def count_offline_nodes(context, interval, group_name): query = _get_nodes_query(context, interval, group_name, offline=True) return query.count() + + +def set_nodes_from_host_as_offline(context, group_name): + timestamp = datetime.datetime(day=26, month=10, year=1985, hour=9) + _touch(context, updated_at=timestamp, hostname=CONF.host, + group_name=group_name) diff --git a/neutron/plugins/ml2/drivers/ovn/mech_driver/mech_driver.py b/neutron/plugins/ml2/drivers/ovn/mech_driver/mech_driver.py index 36b8c67e07d..c3506f38166 100644 --- a/neutron/plugins/ml2/drivers/ovn/mech_driver/mech_driver.py +++ b/neutron/plugins/ml2/drivers/ovn/mech_driver/mech_driver.py @@ -286,15 +286,17 @@ class OVNMechanismDriver(api.MechanismDriver): resources.SECURITY_GROUP_RULE, events.BEFORE_DELETE) - def _clean_hash_ring(self, *args, **kwargs): + def _set_hash_ring_nodes_offline(self, *args, **kwargs): admin_context = n_context.get_admin_context() - ovn_hash_ring_db.remove_nodes_from_host(admin_context, - self.hash_ring_group) + ovn_hash_ring_db.set_nodes_from_host_as_offline( + admin_context, self.hash_ring_group) + LOG.info('Hash Ring nodes from host "%s" marked as offline', + cfg.CONF.host) def pre_fork_initialize(self, resource, event, trigger, payload=None): """Pre-initialize the ML2/OVN driver.""" - atexit.register(self._clean_hash_ring) - signal.signal(signal.SIGTERM, self._clean_hash_ring) + atexit.register(self._set_hash_ring_nodes_offline) + signal.signal(signal.SIGTERM, self._set_hash_ring_nodes_offline) ovn_utils.create_neutron_pg_drop() @staticmethod @@ -314,7 +316,9 @@ class OVNMechanismDriver(api.MechanismDriver): """ admin_context = n_context.get_admin_context() if not self._hash_ring_probe_event.is_set(): - self._clean_hash_ring() + # Clear existing entries + ovn_hash_ring_db.remove_nodes_from_host(admin_context, + self.hash_ring_group) self.node_uuid = ovn_hash_ring_db.add_node(admin_context, self.hash_ring_group) self._hash_ring_thread = maintenance.MaintenanceThread() diff --git a/neutron/tests/functional/base.py b/neutron/tests/functional/base.py index 8ef793989eb..a8f179d1bad 100644 --- a/neutron/tests/functional/base.py +++ b/neutron/tests/functional/base.py @@ -326,7 +326,8 @@ class TestOVNFunctionalBase(test_plugin.Ml2PluginV2TestCase, self.addCleanup(self.stop) # NOTE(ralonsoh): do not access to the DB at exit when the SQL # connection is already closed, to avoid useless exception messages. 
- mock.patch.object(self.mech_driver, '_clean_hash_ring').start() + mock.patch.object( + self.mech_driver, '_set_hash_ring_nodes_offline').start() self.mech_driver.pre_fork_initialize( mock.ANY, mock.ANY, trigger_cls.trigger) diff --git a/neutron/tests/unit/db/test_ovn_hash_ring_db.py b/neutron/tests/unit/db/test_ovn_hash_ring_db.py index eb9eaab30be..6d8e874acdb 100644 --- a/neutron/tests/unit/db/test_ovn_hash_ring_db.py +++ b/neutron/tests/unit/db/test_ovn_hash_ring_db.py @@ -269,3 +269,17 @@ class TestHashRing(testlib_api.SqlTestCaseLight): # Assert no nodes are considered offline self.assertEqual(0, ovn_hash_ring_db.count_offline_nodes( self.admin_ctx, interval=60, group_name=HASH_RING_TEST_GROUP)) + + def test_set_nodes_from_host_as_offline(self): + self._add_nodes_and_assert_exists(count=3) + + active_nodes = ovn_hash_ring_db.get_active_nodes( + self.admin_ctx, interval=60, group_name=HASH_RING_TEST_GROUP) + self.assertEqual(3, len(active_nodes)) + + ovn_hash_ring_db.set_nodes_from_host_as_offline( + self.admin_ctx, HASH_RING_TEST_GROUP) + + active_nodes = ovn_hash_ring_db.get_active_nodes( + self.admin_ctx, interval=60, group_name=HASH_RING_TEST_GROUP) + self.assertEqual(0, len(active_nodes))
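
For reference, here is a minimal standalone sketch (not Neutron code) of
the mechanism this patch relies on: the Hash Ring Manager considers a
node active only while its updated_at heartbeat is newer than a
configured interval, so backdating the timestamp (the patch uses
1985-10-26 09:00) makes a node offline without deleting its row. Node,
touch(), set_nodes_from_host_as_offline() and get_active_nodes() below
are illustrative in-memory stand-ins for the SQLAlchemy-backed helpers
in neutron/db/ovn_hash_ring_db.py, and the 60-second interval is an
assumed example value.

    import datetime

    # Sentinel far enough in the past that any reasonable interval
    # check fails; the patch itself uses 1985-10-26 09:00.
    OFFLINE_TIMESTAMP = datetime.datetime(year=1985, month=10, day=26, hour=9)


    class Node:
        """Illustrative in-memory stand-in for an ovn_hash_ring row."""
        def __init__(self, hostname):
            self.hostname = hostname
            self.updated_at = datetime.datetime.utcnow()


    def touch(nodes, hostname, updated_at=None):
        # Heartbeat: refresh the timestamp of every node on this host.
        updated_at = updated_at or datetime.datetime.utcnow()
        for node in nodes:
            if node.hostname == hostname:
                node.updated_at = updated_at


    def set_nodes_from_host_as_offline(nodes, hostname):
        # Same write path as a heartbeat, but with a backdated timestamp.
        touch(nodes, hostname, updated_at=OFFLINE_TIMESTAMP)


    def get_active_nodes(nodes, interval=60):
        # A node is active iff touched within the last `interval` seconds.
        limit = datetime.datetime.utcnow() - datetime.timedelta(
            seconds=interval)
        return [n for n in nodes if n.updated_at >= limit]


    nodes = [Node('ctrl-1'), Node('ctrl-1'), Node('ctrl-2')]
    set_nodes_from_host_as_offline(nodes, 'ctrl-1')
    assert [n.hostname for n in get_active_nodes(nodes)] == ['ctrl-2']

    # A later heartbeat from a still-running 'ctrl-1' brings its nodes
    # back online:
    touch(nodes, 'ctrl-1')
    assert len(get_active_nodes(nodes)) == 3

Because the rows are never deleted, a heartbeat from a still-running
service immediately brings its nodes back online, while a properly
stopped service simply stops beating and ages out of the active set.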