From 98be6376b2ec74343e0b51eab4e5f36927cfc88e Mon Sep 17 00:00:00 2001 From: Kiran Pawar Date: Mon, 26 Sep 2022 14:24:29 +0000 Subject: [PATCH] Add 'state' column in 'services' table. When manila services are stopped or restarted via stop(), the DB entries are not deleted, they are destroyed only in kill() method. In cluster deployments, where multiple instances of manila services are deployed via PODs, unique hostname is derived from node name. However if pods are deployed again and launched on new hosts/nodes, the old entries of manila service remains as it is. Fix it by adding 'state' column in 'services' table and introducing per service cleanup function. On service stop, state is changed to 'stopped' and cleanup function will delete 'stopped' services unless they are 'up' again before cleanup periodic interval. Closes-bug: #1990839 Change-Id: I8b71c4c27ff8fcb25616a95a5ed8362a7f4ffc61 --- manila/api/v2/services.py | 3 +- manila/data/manager.py | 4 +- ...be2a413e13_add_state_column_for_service.py | 47 +++++++++++++++++++ manila/db/sqlalchemy/models.py | 1 + manila/manager.py | 6 ++- manila/scheduler/manager.py | 4 +- manila/service.py | 46 +++++++++++++++--- manila/share/manager.py | 4 +- manila/tests/api/v2/test_services.py | 7 ++- .../alembic/migrations_data_checks.py | 33 +++++++++++++ manila/tests/test_service.py | 42 ++++++++++++++++- ...e-column-for-service-c4fe2a6e312a1651.yaml | 11 +++++ 12 files changed, 193 insertions(+), 15 deletions(-) create mode 100644 manila/db/migrations/alembic/versions/aebe2a413e13_add_state_column_for_service.py create mode 100644 releasenotes/notes/bug-1990839-add-state-column-for-service-c4fe2a6e312a1651.yaml diff --git a/manila/api/v2/services.py b/manila/api/v2/services.py index 415dd724cd..337a1ded5a 100644 --- a/manila/api/v2/services.py +++ b/manila/api/v2/services.py @@ -19,7 +19,6 @@ import webob.exc from manila.api.openstack import wsgi from manila.api.views import services as services_views from manila import db 
-from manila import utils class ServiceMixin(object): @@ -47,7 +46,7 @@ class ServiceMixin(object): 'host': service['host'], 'zone': service['availability_zone']['name'], 'status': 'disabled' if service['disabled'] else 'enabled', - 'state': 'up' if utils.service_is_up(service) else 'down', + 'state': service['state'], 'updated_at': service['updated_at'], } services.append(service) diff --git a/manila/data/manager.py b/manila/data/manager.py index af3772cb43..896ccd2099 100644 --- a/manila/data/manager.py +++ b/manila/data/manager.py @@ -58,8 +58,10 @@ class DataManager(manager.Manager): def __init__(self, service_name=None, *args, **kwargs): super(DataManager, self).__init__(*args, **kwargs) self.busy_tasks_shares = {} + self.service_id = None - def init_host(self): + def init_host(self, service_id=None): + self.service_id = service_id ctxt = context.get_admin_context() shares = self.db.share_get_all(ctxt) for share in shares: diff --git a/manila/db/migrations/alembic/versions/aebe2a413e13_add_state_column_for_service.py b/manila/db/migrations/alembic/versions/aebe2a413e13_add_state_column_for_service.py new file mode 100644 index 0000000000..edc2b25fe9 --- /dev/null +++ b/manila/db/migrations/alembic/versions/aebe2a413e13_add_state_column_for_service.py @@ -0,0 +1,47 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
+ +"""add state column for service + +Revision ID: aebe2a413e13 +Revises: ac0620cbe74d +Create Date: 2023-01-10 11:43:24.741726 + +""" + +# revision identifiers, used by Alembic. +revision = 'aebe2a413e13' +down_revision = 'ac0620cbe74d' + +from alembic import op +from oslo_log import log +import sqlalchemy as sa + +LOG = log.getLogger(__name__) + + +def upgrade(): + try: + op.add_column( + 'services', + sa.Column('state', sa.String(36), nullable=True)) + except Exception: + LOG.error("services table column state not added") + raise + + +def downgrade(): + try: + op.drop_column('services', 'state') + except Exception: + LOG.error("services table column state not dropped") + raise diff --git a/manila/db/sqlalchemy/models.py b/manila/db/sqlalchemy/models.py index 5cf7fda4f9..231855015b 100644 --- a/manila/db/sqlalchemy/models.py +++ b/manila/db/sqlalchemy/models.py @@ -63,6 +63,7 @@ class Service(BASE, ManilaBase): host = Column(String(255)) # , ForeignKey('hosts.id')) binary = Column(String(255)) topic = Column(String(255)) + state = Column(String(36)) report_count = Column(Integer, nullable=False, default=0) disabled = Column(Boolean, default=False) availability_zone_id = Column(String(36), diff --git a/manila/manager.py b/manila/manager.py index ccd2898d1f..6a90c90014 100644 --- a/manila/manager.py +++ b/manila/manager.py @@ -100,23 +100,25 @@ class Manager(base.Base, PeriodicTasks): """Tasks to be run at a periodic interval.""" return self.run_periodic_tasks(context, raise_on_error=raise_on_error) - def init_host(self): + def init_host(self, service_id=None): """Handle initialization if this is a standalone service. A hook point for services to execute tasks before the services are made available (i.e. showing up on RPC and starting to accept RPC calls) to other components. Child classes should override this method. + :param service_id: ID of the service where the manager is running. 
""" pass - def init_host_with_rpc(self): + def init_host_with_rpc(self, service_id=None): """A hook for service to do jobs after RPC is ready. Like init_host(), this method is a hook where services get a chance to execute tasks that *need* RPC. Child classes should override this method. + :param service_id: ID of the service where the manager is running. """ pass diff --git a/manila/scheduler/manager.py b/manila/scheduler/manager.py index f982473f04..ae5a87f2e9 100644 --- a/manila/scheduler/manager.py +++ b/manila/scheduler/manager.py @@ -87,8 +87,10 @@ class SchedulerManager(manager.Manager): self.driver = importutils.import_object(scheduler_driver) self.message_api = message_api.API() super(SchedulerManager, self).__init__(*args, **kwargs) + self.service_id = None - def init_host_with_rpc(self): + def init_host_with_rpc(self, service_id=None): + self.service_id = service_id ctxt = context.get_admin_context() self.request_service_capabilities(ctxt) diff --git a/manila/service.py b/manila/service.py index 4f2a04cab7..5c131857bc 100644 --- a/manila/service.py +++ b/manila/service.py @@ -33,6 +33,7 @@ from manila import coordination from manila import db from manila import exception from manila import rpc +from manila import utils from manila import version osprofiler_initializer = importutils.try_import('osprofiler.initializer') @@ -45,6 +46,10 @@ service_opts = [ cfg.IntOpt('report_interval', default=10, help='Seconds between nodes reporting state to datastore.'), + cfg.IntOpt('cleanup_interval', + min=300, + default=1800, + help='Seconds between cleaning up the stopped nodes.'), cfg.IntOpt('periodic_interval', default=60, help='Seconds between running periodic tasks.'), @@ -121,6 +126,7 @@ class Service(service.Service): *args, **kwargs) self.availability_zone = self.manager.availability_zone self.report_interval = report_interval + self.cleanup_interval = CONF.cleanup_interval self.periodic_interval = periodic_interval self.periodic_fuzzy_delay = 
periodic_fuzzy_delay self.saved_args, self.saved_kwargs = args, kwargs @@ -134,20 +140,22 @@ class Service(service.Service): LOG.info('Starting %(topic)s node (version %(version_string)s)', {'topic': self.topic, 'version_string': version_string}) self.model_disconnected = False - self.manager.init_host() ctxt = context.get_admin_context() - if self.coordinator: - coordination.LOCK_COORDINATOR.start() - try: service_ref = db.service_get_by_args(ctxt, self.host, self.binary) self.service_id = service_ref['id'] + db.service_update(ctxt, self.service_id, {'state': 'down'}) except exception.NotFound: self._create_service_ref(ctxt) + self.manager.init_host(service_id=self.service_id) + + if self.coordinator: + coordination.LOCK_COORDINATOR.start() + LOG.debug("Creating RPC server for service %s.", self.topic) target = messaging.Target(topic=self.topic, server=self.host) @@ -162,6 +170,9 @@ class Service(service.Service): self.tg.add_timer(self.report_interval, self.report_state, initial_delay=self.report_interval) + self.tg.add_timer(self.cleanup_interval, self.cleanup_services, + initial_delay=self.cleanup_interval) + if self.periodic_interval: if self.periodic_fuzzy_delay: initial_delay = random.randint(0, self.periodic_fuzzy_delay) @@ -176,6 +187,7 @@ class Service(service.Service): 'host': self.host, 'binary': self.binary, 'topic': self.topic, + 'state': 'up', 'report_count': 0, 'availability_zone': self.availability_zone } @@ -242,6 +254,8 @@ class Service(service.Service): except Exception: pass + db.service_update(context.get_admin_context(), + self.service_id, {'state': 'stopped'}) if self.coordinator: try: coordination.LOCK_COORDINATOR.stop() @@ -287,8 +301,12 @@ class Service(service.Service): service_ref['availability_zone']['name']): state_catalog['availability_zone'] = self.availability_zone - db.service_update(ctxt, - self.service_id, state_catalog) + if utils.service_is_up(service_ref): + state_catalog['state'] = 'up' + else: + if service_ref['state'] != 
'stopped': + state_catalog['state'] = 'down' + db.service_update(ctxt, self.service_id, state_catalog) # TODO(termie): make this pattern be more elegant. if getattr(self, 'model_disconnected', False): @@ -301,6 +319,22 @@ class Service(service.Service): self.model_disconnected = True LOG.exception('model server went away') + def cleanup_services(self): + """Remove the stopped services of same topic from the datastore.""" + ctxt = context.get_admin_context() + try: + services = db.service_get_all(ctxt, self.topic) + except exception.NotFound: + LOG.debug('The service database object disappeared,' + 'Exiting from cleanup.') + return + + for svc in services: + if (svc['topic'] == self.topic and + svc['state'] == 'stopped' and + not utils.service_is_up(svc)): + db.service_destroy(ctxt, svc['id']) + class WSGIService(service.ServiceBase): """Provides ability to launch API from a 'paste' configuration.""" diff --git a/manila/share/manager.py b/manila/share/manager.py index 6f3735b9fe..819f321707 100644 --- a/manila/share/manager.py +++ b/manila/share/manager.py @@ -297,6 +297,7 @@ class ShareManager(manager.SchedulerDependentManager): self.driver = profiler.trace_cls("driver")(self.driver) self.hooks = [] self._init_hook_drivers() + self.service_id = None def _init_hook_drivers(self): # Try to initialize hook driver(s). 
@@ -334,9 +335,10 @@ class ShareManager(manager.SchedulerDependentManager): return pool @add_hooks - def init_host(self): + def init_host(self, service_id=None): """Initialization for a standalone service.""" + self.service_id = service_id ctxt = context.get_admin_context() driver_host_pair = "{}@{}".format( self.driver.__class__.__name__, diff --git a/manila/tests/api/v2/test_services.py b/manila/tests/api/v2/test_services.py index 819d55300c..eb56bcbdb9 100644 --- a/manila/tests/api/v2/test_services.py +++ b/manila/tests/api/v2/test_services.py @@ -37,6 +37,7 @@ fake_services_list = [ 'availability_zone': {'name': 'manila1'}, 'id': 1, 'disabled': True, + 'state': 'up', 'updated_at': datetime.datetime(2012, 10, 29, 13, 42, 2), 'created_at': datetime.datetime(2012, 9, 18, 2, 46, 27), }, @@ -46,6 +47,7 @@ fake_services_list = [ 'availability_zone': {'name': 'manila1'}, 'id': 2, 'disabled': True, + 'state': 'up', 'updated_at': datetime.datetime(2012, 10, 29, 13, 42, 5), 'created_at': datetime.datetime(2012, 9, 18, 2, 46, 27)}, { @@ -54,6 +56,7 @@ fake_services_list = [ 'availability_zone': {'name': 'manila2'}, 'id': 3, 'disabled': False, + 'state': 'down', 'updated_at': datetime.datetime(2012, 9, 19, 6, 55, 34), 'created_at': datetime.datetime(2012, 9, 18, 2, 46, 28)}, { @@ -62,6 +65,7 @@ fake_services_list = [ 'availability_zone': {'name': 'manila2'}, 'id': 4, 'disabled': True, + 'state': 'down', 'updated_at': datetime.datetime(2012, 9, 18, 8, 3, 38), 'created_at': datetime.datetime(2012, 9, 18, 2, 46, 28), }, @@ -74,7 +78,8 @@ fake_response_service_list = {'services': [ 'binary': 'manila-scheduler', 'host': 'host1', 'zone': 'manila1', - 'status': 'disabled', 'state': 'up', + 'status': 'disabled', + 'state': 'up', 'updated_at': datetime.datetime(2012, 10, 29, 13, 42, 2), }, { diff --git a/manila/tests/db/migrations/alembic/migrations_data_checks.py b/manila/tests/db/migrations/alembic/migrations_data_checks.py index 4d210f7db6..0887015dfb 100644 --- 
a/manila/tests/db/migrations/alembic/migrations_data_checks.py +++ b/manila/tests/db/migrations/alembic/migrations_data_checks.py @@ -3308,3 +3308,36 @@ class AddSubnetMetadata(BaseMigrationChecks): self.test_case.assertRaises(sa_exc.NoSuchTableError, utils.load_table, self.new_table_name, engine) + + +@map_to_migration('aebe2a413e13') +class AddServiceState(BaseMigrationChecks): + + def _get_service_data(self, options): + base_dict = { + 'binary': 'manila-share', + 'topic': 'share', + 'disabled': False, + 'report_count': '100', + } + base_dict.update(options) + return base_dict + + def setup_upgrade_data(self, engine): + service_fixture = [ + self._get_service_data({'host': 'fake1'}), + self._get_service_data({'host': 'fake2'}), + ] + services_table = utils.load_table('services', engine) + for fixture in service_fixture: + engine.execute(services_table.insert(fixture)) + + def check_upgrade(self, engine, data): + s_table = utils.load_table('services', engine) + for s in engine.execute(s_table.select()): + self.test_case.assertTrue(hasattr(s, 'state')) + + def check_downgrade(self, engine): + s_table = utils.load_table('services', engine) + for s in engine.execute(s_table.select()): + self.test_case.assertFalse(hasattr(s, 'state')) diff --git a/manila/tests/test_service.py b/manila/tests/test_service.py index e305806366..73ab3a591a 100644 --- a/manila/tests/test_service.py +++ b/manila/tests/test_service.py @@ -21,6 +21,8 @@ Unit Tests for remote procedure calls using queue """ +from datetime import datetime +from datetime import timedelta from unittest import mock import ddt @@ -120,6 +122,7 @@ service_create = { 'host': host, 'binary': binary, 'topic': topic, + 'state': 'up', 'report_count': 0, 'availability_zone': 'nova', } @@ -127,6 +130,7 @@ service_create_other_az = { 'host': host, 'binary': binary, 'topic': topic, + 'state': 'up', 'report_count': 0, 'availability_zone': 'other-zone', } @@ -134,6 +138,16 @@ service_ref = { 'host': host, 'binary': binary, 
'topic': topic, + 'state': 'up', + 'report_count': 0, + 'availability_zone': {'name': 'nova'}, + 'id': 1, +} +service_ref_stopped = { + 'host': host, + 'binary': binary, + 'topic': topic, + 'state': 'stopped', 'report_count': 0, 'availability_zone': {'name': 'nova'}, 'id': 1, @@ -192,6 +206,8 @@ class ServiceTestCase(test.TestCase): @mock.patch.object(service.db, 'service_update', mock.Mock(return_value=service_ref. update({'report_count': 1}))) + @mock.patch.object(utils, 'service_is_up', + mock.Mock(return_value=True)) def test_report_state_newly_connected(self): serv = service.Service(host, binary, topic, CONF.fake_manager) serv.start() @@ -216,6 +232,8 @@ class ServiceTestCase(test.TestCase): @mock.patch.object(service.db, 'service_update', mock.Mock(return_value=service_ref. update({'report_count': 1}))) + @mock.patch.object(utils, 'service_is_up', + mock.Mock(return_value=True)) def test_report_state_newly_connected_different_az(self): serv = service.Service(host, binary, topic, CONF.fake_manager) serv.availability_zone = 'other-zone' @@ -242,6 +260,8 @@ class ServiceTestCase(test.TestCase): @mock.patch.object(service.db, 'service_update', mock.Mock(return_value=service_ref. 
update({'report_count': 1}))) + @mock.patch.object(utils, 'service_is_up', + mock.Mock(return_value=True)) def test_report_state_newly_connected_not_found(self): serv = service.Service(host, binary, topic, CONF.fake_manager) serv.start() @@ -268,7 +288,27 @@ class ServiceTestCase(test.TestCase): serv.report_state() serv.manager.is_service_ready.assert_called_once() - mock_db.service_update.assert_not_called() + + @ddt.data(True, False) + def test_cleanup_services(self, cleanup_interval_done): + with mock.patch.object(service, 'db') as mock_db: + mock_db.service_get_all.return_value = [service_ref] + serv = service.Service(host, binary, topic, CONF.fake_manager) + serv.start() + serv.cleanup_services() + mock_db.service_destroy.assert_not_called() + + if cleanup_interval_done: + service_ref_stopped['updated_at'] = ( + datetime.utcnow() - timedelta(minutes=10)) + else: + service_ref_stopped['updated_at'] = datetime.utcnow() + mock_db.service_get_all.return_value = [service_ref_stopped] + serv.stop() + serv.cleanup_services() + if cleanup_interval_done: + mock_db.service_destroy.assert_called_once_with( + mock.ANY, service_ref_stopped['id']) class TestWSGIService(test.TestCase): diff --git a/releasenotes/notes/bug-1990839-add-state-column-for-service-c4fe2a6e312a1651.yaml b/releasenotes/notes/bug-1990839-add-state-column-for-service-c4fe2a6e312a1651.yaml new file mode 100644 index 0000000000..2f79a63002 --- /dev/null +++ b/releasenotes/notes/bug-1990839-add-state-column-for-service-c4fe2a6e312a1651.yaml @@ -0,0 +1,11 @@ +--- +fixes: + - | + In cluster deployments, where multiple instances of manila services are + deployed via PODs, unique hostname is derived from node name. However if + pods are deployed again and launched on new hosts/nodes, the old entries + of manila service remains as it is. Fixed it by adding per service cleanup + function and also introducing 'state' column in 'services' table. The + service will be in either of 'up', 'down' or 'stopped' state. 
Cleanup will + delete DB entries of 'stopped' services. For more details please refer to + `Launchpad bug 1990839 <https://launchpad.net/bugs/1990839>`_