diff --git a/nova/compute/multi_cell_list.py b/nova/compute/multi_cell_list.py index 780881e11cb3..e75cea414326 100644 --- a/nova/compute/multi_cell_list.py +++ b/nova/compute/multi_cell_list.py @@ -17,8 +17,12 @@ import itertools import six +from oslo_log import log as logging + from nova import context +LOG = logging.getLogger(__name__) + class RecordSortContext(object): def __init__(self, sort_keys, sort_dirs): @@ -243,15 +247,23 @@ class CrossCellLister(object): return (RecordWrapper(self.sort_ctx, inst) for inst in itertools.chain(local_marker_prefix, main_query_result)) - # FIXME(danms): If we raise or timeout on a cell we need to handle - # that here gracefully. The below routine will provide sentinels - # to indicate that, which will crash the merge below, but we don't - # handle this anywhere yet anyway. + # NOTE(tssurya): When the below routine provides sentinels to indicate + # a timeout on a cell, we ignore that cell to avoid the crash when + # doing the merge below and continue merging the results from the 'up' + # cells. + # TODO(tssurya): Modify this to return the minimal available info from + # the down cells. if self.cells: results = context.scatter_gather_cells(ctx, self.cells, 60, do_query) else: results = context.scatter_gather_all_cells(ctx, do_query) + for cell_uuid in list(results): + if results[cell_uuid] in (context.did_not_respond_sentinel, + context.raised_exception_sentinel): + LOG.warning("Cell %s is not responding and hence skipped " + "from the results.", cell_uuid) + results.pop(cell_uuid) # If a limit was provided, it was passed to the per-cell query # routines. That means we have NUM_CELLS * limit items across diff --git a/nova/tests/unit/compute/test_instance_list.py b/nova/tests/unit/compute/test_instance_list.py index 4ab2b2976429..b6d668b2ae69 100644 --- a/nova/tests/unit/compute/test_instance_list.py +++ b/nova/tests/unit/compute/test_instance_list.py @@ -13,6 +13,7 @@ import mock from nova.compute import instance_list +from nova.compute import multi_cell_list from nova import context as nova_context from nova import objects from nova import test @@ -137,3 +138,27 @@ class TestInstanceList(test.NoDBTestCase): None, None, cell_mappings=None) mock_cm.assert_not_called() + + @mock.patch('nova.context.scatter_gather_cells') + def test_get_instances_with_down_cells(self, mock_sg): + inst_cell0 = self.insts[uuids.cell0] + # storing the uuids of the instances from the up cell + uuid_initial = [inst['uuid'] for inst in inst_cell0] + + instances = (multi_cell_list.RecordWrapper(self.context, inst) + for inst in inst_cell0) + + # creating one up cell and two down cells + ret_val = {} + ret_val[uuids.cell0] = instances + ret_val[uuids.cell1] = nova_context.raised_exception_sentinel + ret_val[uuids.cell2] = nova_context.did_not_respond_sentinel + mock_sg.return_value = ret_val + + res = instance_list.get_instances_sorted(self.context, {}, None, None, + [], None, None) + + uuid_final = [inst['uuid'] for inst in res] + + # return the results from the up cell, ignoring the down cell. + self.assertEqual(uuid_initial, uuid_final) diff --git a/releasenotes/notes/bug-1726301-list-across-down-cells-82726cac592e9728.yaml b/releasenotes/notes/bug-1726301-list-across-down-cells-82726cac592e9728.yaml new file mode 100644 index 000000000000..db608d863d42 --- /dev/null +++ b/releasenotes/notes/bug-1726301-list-across-down-cells-82726cac592e9728.yaml @@ -0,0 +1,10 @@ +--- +fixes: + - | + Listing server and migration records used to give a 500 to users + when a cell database was unreachable. Now only records from available + cells are included to avoid the 500 error. The down cells are basically + skipped when forming the results and this solution is planned to be + further enhanced through the `blueprint handling-down-cell`_. + + .. _blueprint handling-down-cell: https://blueprints.launchpad.net/nova/+spec/handling-down-cell