Fix service filtering when claiming clusters

The health manager is not claiming clusters accurately when service is
restarted. The reason is that there are some dead service records to be
removed next time one of the engines gets restarted. This patch adds a
check about the service's status and filters the service records based
on the check result.

Change-Id: Id482e74068cc1cb4394c79f14f1c6442cf9f717b
Partial-Bug: #1613552
This commit is contained in:
tengqm 2016-08-16 02:19:30 -04:00
parent 7a47a3fd8d
commit ad8a538094
4 changed files with 56 additions and 4 deletions

View File

@ -1188,11 +1188,12 @@ def service_get_all(context):
# HealthRegistry
def registry_claim(context, engine_id):
with session_for_write() as session:
q_eng = session.query(models.Service)
svc_ids = [s.id for s in q_eng.all()]
engines = session.query(models.Service).all()
svc_ids = [e.id for e in engines if not utils.is_service_dead(e)]
q_reg = session.query(models.HealthRegistry)
q_reg = q_reg.filter(models.HealthRegistry.engine_id.notin_(svc_ids))
if svc_ids:
q_reg = q_reg.filter(
models.HealthRegistry.engine_id.notin_(svc_ids))
q_reg.update({'engine_id': engine_id}, synchronize_session=False)
result = q_reg.all()
return result

View File

@ -12,6 +12,9 @@
import six
from oslo_config import cfg
from oslo_utils import timeutils
def exact_filter(query, model, filters):
"""Applies exact match filtering to a query.
@ -73,3 +76,11 @@ def get_sort_params(value, default_key=None):
dirs.append('asc')
return keys, dirs
def is_service_dead(service):
"""Check if a given service is dead."""
cfg.CONF.import_opt("periodic_interval", "senlin.common.config")
max_elapse = 2 * cfg.CONF.periodic_interval
return timeutils.is_older_than(service.updated_at, max_elapse)

View File

@ -10,8 +10,10 @@
# License for the specific language governing permissions and limitations
# under the License.
import mock
from senlin.db.sqlalchemy import api as db_api
from senlin.db.sqlalchemy import utils as db_utils
from senlin.tests.unit.common import base
from senlin.tests.unit.common import utils
@ -58,6 +60,23 @@ class DBAPIRegistryTest(base.SenlinTestCase):
self.assertEqual('ENGINE_ID', registries[0].engine_id)
self.assertEqual('ENGINE_ID', registries[1].engine_id)
@mock.patch.object(db_utils, 'is_service_dead')
def test_registry_claim_with_dead_engine(self, mock_check):
db_api.service_create(self.ctx, 'SERVICE_ID_DEAD')
self._create_registry(
cluster_id='CLUSTER_1', check_type='NODE_STATUS_POLLING',
interval=60, params={}, engine_id='SERVICE_ID')
self._create_registry(
cluster_id='CLUSTER_1', check_type='NODE_STATUS_POLLING',
interval=60, params={}, engine_id='SERVICE_ID_DEAD')
mock_check.side_effect = [False, True]
registries = db_api.registry_claim(self.ctx, engine_id='ENGINE_ID')
self.assertEqual(1, len(registries))
self.assertEqual('ENGINE_ID', registries[0].engine_id)
def test_registry_delete(self):
registry = self._create_registry('CLUSTER_ID',
check_type='NODE_STATUS_POLLING',

View File

@ -11,6 +11,8 @@
# under the License.
import mock
from oslo_config import cfg
from oslo_utils import timeutils
from senlin.db.sqlalchemy import utils
from senlin.tests.unit.common import base
@ -95,3 +97,22 @@ class SortParamTest(base.SenlinTestCase):
self.assertEqual(['foo', 'bar', 'id'], keys)
self.assertEqual(['asc-nullsfirst', 'asc-nullsfirst',
'asc-nullsfirst'], dirs)
class ServiceAliveTest(base.SenlinTestCase):
def test_alive(self):
cfg.CONF.set_override('periodic_interval', 100)
service = mock.Mock(updated_at=timeutils.utcnow())
res = utils.is_service_dead(service)
self.assertFalse(res)
def test_dead(self):
cfg.CONF.set_override('periodic_interval', 0)
service = mock.Mock(updated_at=timeutils.utcnow())
res = utils.is_service_dead(service)
self.assertTrue(res)