diff --git a/swift/common/ring/builder.py b/swift/common/ring/builder.py
index e1b4ccd02b..d6902dc9f6 100644
--- a/swift/common/ring/builder.py
+++ b/swift/common/ring/builder.py
@@ -525,7 +525,9 @@ class RingBuilder(object):
 
         # we'll gather a few times, or until we archive the plan
         for gather_count in range(MAX_BALANCE_GATHER_COUNT):
-            self._gather_parts_for_balance(assign_parts, replica_plan)
+            self._gather_parts_for_balance(assign_parts, replica_plan,
+                                           # first attempt, go for dispersion
+                                           gather_count == 0)
             if not assign_parts:
                 # most likely min part hours
                 finish_status = 'Unable to finish'
@@ -1097,6 +1099,12 @@ class RingBuilder(object):
         :param start: offset into self.parts to begin search
         :param replica_plan: replicanth targets for tiers
         """
+        tier2children = self._build_tier2children()
+        parts_wanted_in_tier = defaultdict(int)
+        for dev in self._iter_devs():
+            wanted = max(dev['parts_wanted'], 0)
+            for tier in dev['tiers']:
+                parts_wanted_in_tier[tier] += wanted
         # Last, we gather partitions from devices that are "overweight" because
         # they have more partitions than their parts_wanted.
         for offset in range(self.parts):
@@ -1128,8 +1136,17 @@ class RingBuilder(object):
                        replicas_at_tier[tier] <
                        replica_plan[tier]['max']
                        for tier in dev['tiers']):
+                    # we're stuck by replica plan
+                    continue
+                for t in reversed(dev['tiers']):
+                    if replicas_at_tier[t] - 1 < replica_plan[t]['min']:
+                        # we're stuck at tier t
+                        break
+                    if sum(parts_wanted_in_tier[c]
+                           for c in tier2children[t]
+                           if c not in dev['tiers']) <= 0:
+                        # we're stuck by weight
                         continue
-
                 # this is the most overweight_device holding a replica
                 # of this part that can shed it according to the plan
                 dev['parts_wanted'] += 1
@@ -1141,15 +1158,19 @@ class RingBuilder(object):
                 self._replica2part2dev[replica][part] = NONE_DEV
                 for tier in dev['tiers']:
                     replicas_at_tier[tier] -= 1
+                    parts_wanted_in_tier[tier] -= 1
                 self._set_part_moved(part)
                 break
 
-    def _gather_parts_for_balance(self, assign_parts, replica_plan):
+    def _gather_parts_for_balance(self, assign_parts, replica_plan,
+                                  disperse_first):
         """
         Gather parts that look like they should move for balance reasons.
 
         A simple gathers of parts that looks dispersible normally works out,
         we'll switch strategies if things don't seem to move.
+        :param disperse_first: boolean, avoid replicas on overweight devices
+                               that need to be there for dispersion
         """
         # pick a random starting point on the other side of the ring
         quarter_turn = (self.parts // 4)
@@ -1162,10 +1183,10 @@ class RingBuilder(object):
                            'last_start': self._last_part_gather_start})
         self._last_part_gather_start = start
 
-        self._gather_parts_for_balance_can_disperse(
-            assign_parts, start, replica_plan)
-        if not assign_parts:
-            self._gather_parts_for_balance_forced(assign_parts, start)
+        if disperse_first:
+            self._gather_parts_for_balance_can_disperse(
+                assign_parts, start, replica_plan)
+        self._gather_parts_for_balance_forced(assign_parts, start)
 
     def _gather_parts_for_balance_forced(self, assign_parts, start, **kwargs):
         """
diff --git a/test/unit/common/ring/test_builder.py b/test/unit/common/ring/test_builder.py
index 54e612cf07..72bd2039b2 100644
--- a/test/unit/common/ring/test_builder.py
+++ b/test/unit/common/ring/test_builder.py
@@ -400,7 +400,7 @@ class TestRingBuilder(unittest.TestCase):
         for dev in rb._iter_devs():
             dev['tiers'] = utils.tiers_for_dev(dev)
         assign_parts = defaultdict(list)
-        rb._gather_parts_for_balance(assign_parts, replica_plan)
+        rb._gather_parts_for_balance(assign_parts, replica_plan, False)
         max_run = 0
         run = 0
         last_part = 0
@@ -1621,9 +1621,7 @@ class TestRingBuilder(unittest.TestCase):
         rb.rebalance(seed=12345)
 
         part_counts = self._partition_counts(rb, key='zone')
-        self.assertEqual(part_counts[0], 212)
-        self.assertEqual(part_counts[1], 211)
-        self.assertEqual(part_counts[2], 345)
+        self.assertEqual({0: 212, 1: 211, 2: 345}, part_counts)
 
         # Now, devices 0 and 1 take 50% more than their fair shares by
         # weight.
@@ -1633,9 +1631,7 @@ class TestRingBuilder(unittest.TestCase):
         rb.rebalance(seed=12345)
 
         part_counts = self._partition_counts(rb, key='zone')
-        self.assertEqual(part_counts[0], 256)
-        self.assertEqual(part_counts[1], 256)
-        self.assertEqual(part_counts[2], 256)
+        self.assertEqual({0: 256, 1: 256, 2: 256}, part_counts)
 
         # Devices 0 and 1 may take up to 75% over their fair share, but the
         # placement algorithm only wants to spread things out evenly between
@@ -1698,9 +1694,12 @@ class TestRingBuilder(unittest.TestCase):
         rb.rebalance(seed=12345)
 
         part_counts = self._partition_counts(rb, key='ip')
-        self.assertEqual(part_counts['127.0.0.1'], 238)
-        self.assertEqual(part_counts['127.0.0.2'], 237)
-        self.assertEqual(part_counts['127.0.0.3'], 293)
+
+        self.assertEqual({
+            '127.0.0.1': 237,
+            '127.0.0.2': 237,
+            '127.0.0.3': 294,
+        }, part_counts)
 
         # Even out the weights: balance becomes perfect
         for dev in rb.devs:
@@ -2451,6 +2450,105 @@ class TestRingBuilder(unittest.TestCase):
             (0, 0, '127.0.0.1', 3): [0, 256, 0, 0],
         })
 
+    def test_undispersable_zone_converge_on_balance(self):
+        rb = ring.RingBuilder(8, 6, 0)
+        dev_id = 0
+        # 3 regions, 2 zones per region, 1 server with only *one* device in
+        # each zone (this is an absolutely pathological case)
+        for r in range(3):
+            for z in range(2):
+                ip = '127.%s.%s.1' % (r, z)
+                dev_id += 1
+                rb.add_dev({'id': dev_id, 'region': r, 'zone': z,
+                            'weight': 1000, 'ip': ip, 'port': 10000,
+                            'device': 'd%s' % dev_id})
+        rb.rebalance(seed=7)
+
+        # sanity, all balanced and 0 dispersion
+        self.assertEqual(rb.get_balance(), 0)
+        self.assertEqual(rb.dispersion, 0)
+
+        # add one device to the z0 server in each region; N.B. when we
+        # *balance* this topology we will have very bad dispersion (too much
+        # weight in z0 compared to z1!)
+        for r in range(3):
+            z = 0
+            ip = '127.%s.%s.1' % (r, z)
+            dev_id += 1
+            rb.add_dev({'id': dev_id, 'region': r, 'zone': z,
+                        'weight': 1000, 'ip': ip, 'port': 10000,
+                        'device': 'd%s' % dev_id})
+
+        changed_part, _, _ = rb.rebalance(seed=7)
+
+        # sanity, all parts moved, but only one replica of each
+        self.assertEqual(changed_part, 2 ** 8)
+        # so the first time, rings are still unbalanced because we'll only
+        # move one replica of each part.
+        self.assertEqual(rb.get_balance(), 50.1953125)
+        self.assertEqual(rb.dispersion, 99.609375)
+
+        # N.B. since we mostly end up grabbing parts by "weight forced" some
+        # seeds given some specific ring state will randomly pick bad
+        # part-replicas that end up going back down onto the same devices
+        changed_part, _, _ = rb.rebalance(seed=7)
+        self.assertEqual(changed_part, 14)
+        # ... this isn't really "desirable" behavior, but even with bad luck,
+        # things do get better
+        self.assertEqual(rb.get_balance(), 47.265625)
+        self.assertEqual(rb.dispersion, 99.609375)
+
+        # but if you stick with it, eventually the next rebalance will get to
+        # move "the right" part-replicas, resulting in near optimal balance
+        changed_part, _, _ = rb.rebalance(seed=7)
+        self.assertEqual(changed_part, 240)
+        self.assertEqual(rb.get_balance(), 0.390625)
+        self.assertEqual(rb.dispersion, 99.609375)
+
+    def test_undispersable_server_converge_on_balance(self):
+        rb = ring.RingBuilder(8, 6, 0)
+        dev_id = 0
+        # 3 zones, 2 servers per zone, 2 devices per server
+        for z in range(3):
+            for i in range(2):
+                ip = '127.0.%s.%s' % (z, i + 1)
+                for d in range(2):
+                    dev_id += 1
+                    rb.add_dev({'id': dev_id, 'region': 1, 'zone': z,
+                                'weight': 1000, 'ip': ip, 'port': 10000,
+                                'device': 'd%s' % dev_id})
+        rb.rebalance(seed=7)
+
+        # sanity, all balanced and 0 dispersion
+        self.assertEqual(rb.get_balance(), 0)
+        self.assertEqual(rb.dispersion, 0)
+
+        # add one device to the first server in each zone
+        for z in range(3):
+            ip = '127.0.%s.1' % z
+            dev_id += 1
+            rb.add_dev({'id': dev_id, 'region': 1, 'zone': z,
+                        'weight': 1000, 'ip': ip, 'port': 10000,
+                        'device': 'd%s' % dev_id})
+
+        changed_part, _, _ = rb.rebalance(seed=7)
+
+        # sanity, all parts moved, but only one replica of each
+        self.assertEqual(changed_part, 2 ** 8)
+
+        # but the first time, things are still unbalanced because the ring
+        # builder can move only one replica of each part
+        self.assertEqual(rb.get_balance(), 16.9921875)
+        self.assertEqual(rb.dispersion, 59.765625)
+
+        rb.rebalance(seed=7)
+
+        # balance converges to within about 0~1
+        self.assertGreaterEqual(rb.get_balance(), 0)
+        self.assertLess(rb.get_balance(), 1)
+        # dispersion doesn't get any worse
+        self.assertEqual(rb.dispersion, 59.765625)
+
     def test_effective_overload(self):
         rb = ring.RingBuilder(8, 3, 1)
         # z0
@@ -3595,12 +3693,12 @@ class TestGetRequiredOverload(unittest.TestCase):
         rb.rebalance(seed=17)
         self.assertEqual(rb.get_balance(), 1581.6406249999998)
 
-        # but despite the overall trend toward imbalance, in the tier
-        # with the huge device, the small device is trying to shed parts
-        # as effectively as it can (which would be useful if it was the
-        # only small device isolated in a tier with other huge devices
-        # trying to gobble up all the replicanths in the tier - see
-        # `test_one_small_guy_does_not_spoil_his_buddy`!)
+        # but despite the overall trend toward imbalance, in the tier with the
+        # huge device we want to see the small device (d4) try to shed parts
+        # as effectively as it can to the huge device in the same tier (d5).
+        # this is useful behavior any time a device within a tier wants parts
+        # from another device already in the same tier - another example is
+        # `test_one_small_guy_does_not_spoil_his_buddy`
         expected = {
             0: 123,
             1: 123,
@@ -3691,6 +3789,45 @@ class TestGetRequiredOverload(unittest.TestCase):
         self.assertEqual(rb.get_balance(), 30.46875)
 
         # increasing overload moves towards one replica in each tier
+        rb.set_overload(0.3)
+        expected = {
+            0: 0.553443113772455,
+            1: 0.553443113772455,
+            2: 0.553443113772455,
+            3: 0.553443113772455,
+            4: 0.778443113772455,
+            5: 0.007784431137724551,
+        }
+        target_replicas = rb._build_target_replicas_by_tier()
+        self.assertEqual(expected, {t[-1]: r for (t, r) in
+                                    target_replicas.items()
+                                    if len(t) == 4})
+        # ... and as always increasing overload makes balance *worse*
+        rb.rebalance(seed=12)
+        self.assertEqual(rb.get_balance(), 30.46875)
+
+        # the little guy is really struggling to take its share though
+        expected = {
+            0: 142,
+            1: 141,
+            2: 142,
+            3: 141,
+            4: 200,
+            5: 2,
+        }
+        self.assertEqual(expected, {
+            d['id']: d['parts'] for d in rb._iter_devs()})
+        # ... and you can see it in the balance!
+        expected = {
+            0: -7.367187499999986,
+            1: -8.019531249999986,
+            2: -7.367187499999986,
+            3: -8.019531249999986,
+            4: 30.46875,
+            5: 30.46875,
+        }
+        self.assertEqual(expected, rb._build_balance_per_dev())
+
         rb.set_overload(0.5)
         expected = {
             0: 0.5232035928143712,
@@ -3705,7 +3842,7 @@ class TestGetRequiredOverload(unittest.TestCase):
                                     target_replicas.items()
                                     if len(t) == 4})
 
-        # ... and as always increasing overload makes balance *worse*
+        # because the device is so small, balance gets bad quickly
         rb.rebalance(seed=17)
         self.assertEqual(rb.get_balance(), 95.703125)
 
diff --git a/test/unit/common/ring/test_composite_builder.py b/test/unit/common/ring/test_composite_builder.py
index c81f622ca2..4bf51b3227 100644
--- a/test/unit/common/ring/test_composite_builder.py
+++ b/test/unit/common/ring/test_composite_builder.py
@@ -1115,7 +1115,7 @@ class TestCooperativeRingBuilder(BaseTestCompositeBuilder):
         rb1.rebalance()
         self.assertEqual([rb1], update_calls)
         self.assertEqual([rb1], can_part_move_calls.keys())
-        self.assertEqual(512, len(can_part_move_calls[rb1]))
+        self.assertEqual(768, len(can_part_move_calls[rb1]))
 
         # two component builders with same parent builder
         cb = CompositeRingBuilder()
@@ -1139,8 +1139,8 @@ class TestCooperativeRingBuilder(BaseTestCompositeBuilder):
 
         # rb1 is being rebalanced so gets checked, and rb2 also gets checked
         self.assertEqual(sorted([rb1, rb2]), sorted(can_part_move_calls))
-        self.assertEqual(512, len(can_part_move_calls[rb1]))
-        self.assertEqual(512, len(can_part_move_calls[rb2]))
+        self.assertEqual(768, len(can_part_move_calls[rb1]))
+        self.assertEqual(768, len(can_part_move_calls[rb2]))
 
     def test_save_then_load(self):
         cb = CompositeRingBuilder()
diff --git a/test/unit/common/ring/test_utils.py b/test/unit/common/ring/test_utils.py
index a18a8808d2..d77d5c8553 100644
--- a/test/unit/common/ring/test_utils.py
+++ b/test/unit/common/ring/test_utils.py
@@ -619,10 +619,10 @@ class TestUtils(unittest.TestCase):
 
         rb.rebalance(seed=100)
         rb.validate()
-        self.assertEqual(rb.dispersion, 39.84375)
+        self.assertEqual(rb.dispersion, 55.46875)
         report = dispersion_report(rb)
         self.assertEqual(report['worst_tier'], 'r1z1')
-        self.assertEqual(report['max_dispersion'], 39.84375)
+        self.assertEqual(report['max_dispersion'], 44.921875)
 
         def build_tier_report(max_replicas, placed_parts, dispersion,
                               replicas):
@@ -633,16 +633,17 @@ class TestUtils(unittest.TestCase):
                 'replicas': replicas,
             }
 
-        # Each node should store 256 partitions to avoid multiple replicas
+        # Each node should store at most 256 partitions to avoid holding
+        # multiple replicas of any part.
         # 2/5 of total weight * 768 ~= 307 -> 51 partitions on each node in
         # zone 1 are stored at least twice on the nodes
         expected = [
             ['r1z1', build_tier_report(
-                2, 256, 39.84375, [0, 0, 154, 102])],
+                2, 256, 44.921875, [0, 0, 141, 115])],
             ['r1z1-127.0.0.1', build_tier_report(
-                1, 256, 19.921875, [0, 205, 51, 0])],
+                1, 242, 29.33884297520661, [14, 171, 71, 0])],
             ['r1z1-127.0.0.2', build_tier_report(
-                1, 256, 19.921875, [0, 205, 51, 0])],
+                1, 243, 29.218106995884774, [13, 172, 71, 0])],
         ]
         report = dispersion_report(rb, 'r1z1[^/]*$', verbose=True)
         graph = report['graph']
@@ -667,9 +668,9 @@ class TestUtils(unittest.TestCase):
         # can't move all the part-replicas in one rebalance
         rb.rebalance(seed=100)
         report = dispersion_report(rb, verbose=True)
-        self.assertEqual(rb.dispersion, 9.375)
-        self.assertEqual(report['worst_tier'], 'r1z1-127.0.0.1')
-        self.assertEqual(report['max_dispersion'], 7.18562874251497)
+        self.assertEqual(rb.dispersion, 11.71875)
+        self.assertEqual(report['worst_tier'], 'r1z1-127.0.0.2')
+        self.assertEqual(report['max_dispersion'], 8.875739644970414)
         # do a sencond rebalance
         rb.rebalance(seed=100)
         report = dispersion_report(rb, verbose=True)
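
A minimal standalone Python sketch (not part of the patch) of the gather strategy the builder.py change implements: only the first gather pass restricts itself to replicas that can also be dispersed, and every pass now falls through to the weight-forced gather. The helper names gather_dispersible, gather_forced, gather_for_balance and rebalance_gather_loop are hypothetical stand-ins for the builder's _gather_parts_for_balance_can_disperse, _gather_parts_for_balance_forced, _gather_parts_for_balance and the rebalance() loop; the constant value is only illustrative.

    from collections import defaultdict

    MAX_BALANCE_GATHER_COUNT = 3  # illustrative; see the constant in builder.py


    def gather_dispersible(assign_parts, replica_plan):
        """Hypothetical stand-in for _gather_parts_for_balance_can_disperse."""
        # would only gather part-replicas that another failure domain wants


    def gather_forced(assign_parts):
        """Hypothetical stand-in for _gather_parts_for_balance_forced."""
        # would gather from overweight devices regardless of dispersion


    def gather_for_balance(assign_parts, replica_plan, disperse_first):
        # Mirrors the new _gather_parts_for_balance: the dispersion-aware pass
        # is optional, and the weight-forced pass always runs afterwards
        # (previously it only ran when the dispersible pass found nothing).
        if disperse_first:
            gather_dispersible(assign_parts, replica_plan)
        gather_forced(assign_parts)


    def rebalance_gather_loop(replica_plan):
        # As in the patched rebalance() loop: only the first attempt
        # (gather_count == 0) goes for dispersion.
        for gather_count in range(MAX_BALANCE_GATHER_COUNT):
            assign_parts = defaultdict(list)
            gather_for_balance(assign_parts, replica_plan,
                               disperse_first=(gather_count == 0))
            # ... reassign the gathered part-replicas here ...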
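Likewise, a hedged sketch of the bookkeeping the patch adds for the "stuck by weight" check in _gather_parts_for_balance_can_disperse: an overweight device can only usefully shed a replica at tier t if some other child of t, one not on the device's own tier path, still wants parts. The devs and tier2children structures below are simplified stand-ins for the builder's real ones, and only the summation mirrors the patched condition.

    from collections import defaultdict


    def build_parts_wanted_in_tier(devs):
        # Same bookkeeping the patch adds: per tier, sum the (non-negative)
        # parts_wanted of every device underneath it.
        parts_wanted_in_tier = defaultdict(int)
        for dev in devs:
            wanted = max(dev['parts_wanted'], 0)
            for tier in dev['tiers']:
                parts_wanted_in_tier[tier] += wanted
        return parts_wanted_in_tier


    def other_children_want_parts(t, dev, tier2children, parts_wanted_in_tier):
        # True when some sibling subtree under tier t still wants parts; when
        # this is False the patch treats gathering a replica from dev at this
        # tier as "stuck by weight" and leaves it in place.
        return sum(parts_wanted_in_tier[c]
                   for c in tier2children[t]
                   if c not in dev['tiers']) > 0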