From 23219664564d1b5a7ba02bbf8309ec699ab7a4cb Mon Sep 17 00:00:00 2001 From: Kota Tsuyuzaki Date: Fri, 30 Jun 2017 02:03:48 -0700 Subject: [PATCH] Accept a trade off of dispersion for balance ... but only if we *have* to! During the initial gather for balance we prefer to avoid replicas on over-weight devices that are already under-represented in any of its tiers (i.e. if a zone has to have at least one, but may have as many as two, don't take the only replica). Instead we hope by going for replicas on over-weight devices that are at the limits of their dispersion we might have a better than even chance we find a better place for them during placement! This normally works out - and especially so for rings which can disperse and balance. But for existing rings where we'd have to sacrifice dispersion to improve balance the existing optimistic gather will end up refusing to trade dispersion for balance - and instead get stuck without solving either! You should always be able to solve for *either* dispersion or balance. But if you can't solve *both* - we bail out on our optimistic gather much more quickly and instead just focus on improving balance. With this change, the ring can get into balanced (and un-dispersed) states much more quickly! 
Change-Id: I17ac627f94f64211afaccad15596a9fcab2fada2 Related-Change-Id: Ie6e2d116b65938edac29efa6171e2470bb3e8e12 Closes-Bug: 1699636 Closes-Bug: 1701472 --- swift/common/ring/builder.py | 35 +++- test/unit/common/ring/test_builder.py | 171 ++++++++++++++++-- .../common/ring/test_composite_builder.py | 6 +- test/unit/common/ring/test_utils.py | 19 +- 4 files changed, 195 insertions(+), 36 deletions(-) diff --git a/swift/common/ring/builder.py b/swift/common/ring/builder.py index e1b4ccd02b..d6902dc9f6 100644 --- a/swift/common/ring/builder.py +++ b/swift/common/ring/builder.py @@ -525,7 +525,9 @@ class RingBuilder(object): # we'll gather a few times, or until we archive the plan for gather_count in range(MAX_BALANCE_GATHER_COUNT): - self._gather_parts_for_balance(assign_parts, replica_plan) + self._gather_parts_for_balance(assign_parts, replica_plan, + # firsrt attempt go for disperse + gather_count == 0) if not assign_parts: # most likely min part hours finish_status = 'Unable to finish' @@ -1097,6 +1099,12 @@ class RingBuilder(object): :param start: offset into self.parts to begin search :param replica_plan: replicanth targets for tiers """ + tier2children = self._build_tier2children() + parts_wanted_in_tier = defaultdict(int) + for dev in self._iter_devs(): + wanted = max(dev['parts_wanted'], 0) + for tier in dev['tiers']: + parts_wanted_in_tier[tier] += wanted # Last, we gather partitions from devices that are "overweight" because # they have more partitions than their parts_wanted. 
for offset in range(self.parts): @@ -1128,8 +1136,17 @@ class RingBuilder(object): replicas_at_tier[tier] < replica_plan[tier]['max'] for tier in dev['tiers']): + # we're stuck by replica plan + continue + for t in reversed(dev['tiers']): + if replicas_at_tier[t] - 1 < replica_plan[t]['min']: + # we're stuck at tier t + break + if sum(parts_wanted_in_tier[c] + for c in tier2children[t] + if c not in dev['tiers']) <= 0: + # we're stuck by weight continue - # this is the most overweight_device holding a replica # of this part that can shed it according to the plan dev['parts_wanted'] += 1 @@ -1141,15 +1158,19 @@ class RingBuilder(object): self._replica2part2dev[replica][part] = NONE_DEV for tier in dev['tiers']: replicas_at_tier[tier] -= 1 + parts_wanted_in_tier[tier] -= 1 self._set_part_moved(part) break - def _gather_parts_for_balance(self, assign_parts, replica_plan): + def _gather_parts_for_balance(self, assign_parts, replica_plan, + disperse_first): """ Gather parts that look like they should move for balance reasons. A simple gathers of parts that looks dispersible normally works out, we'll switch strategies if things don't seem to move. 
+ :param disperse_first: boolean, avoid replicas on overweight devices + that need to be there for dispersion """ # pick a random starting point on the other side of the ring quarter_turn = (self.parts // 4) @@ -1162,10 +1183,10 @@ class RingBuilder(object): 'last_start': self._last_part_gather_start}) self._last_part_gather_start = start - self._gather_parts_for_balance_can_disperse( - assign_parts, start, replica_plan) - if not assign_parts: - self._gather_parts_for_balance_forced(assign_parts, start) + if disperse_first: + self._gather_parts_for_balance_can_disperse( + assign_parts, start, replica_plan) + self._gather_parts_for_balance_forced(assign_parts, start) def _gather_parts_for_balance_forced(self, assign_parts, start, **kwargs): """ diff --git a/test/unit/common/ring/test_builder.py b/test/unit/common/ring/test_builder.py index 54e612cf07..72bd2039b2 100644 --- a/test/unit/common/ring/test_builder.py +++ b/test/unit/common/ring/test_builder.py @@ -400,7 +400,7 @@ class TestRingBuilder(unittest.TestCase): for dev in rb._iter_devs(): dev['tiers'] = utils.tiers_for_dev(dev) assign_parts = defaultdict(list) - rb._gather_parts_for_balance(assign_parts, replica_plan) + rb._gather_parts_for_balance(assign_parts, replica_plan, False) max_run = 0 run = 0 last_part = 0 @@ -1621,9 +1621,7 @@ class TestRingBuilder(unittest.TestCase): rb.rebalance(seed=12345) part_counts = self._partition_counts(rb, key='zone') - self.assertEqual(part_counts[0], 212) - self.assertEqual(part_counts[1], 211) - self.assertEqual(part_counts[2], 345) + self.assertEqual({0: 212, 1: 211, 2: 345}, part_counts) # Now, devices 0 and 1 take 50% more than their fair shares by # weight. 
@@ -1633,9 +1631,7 @@ class TestRingBuilder(unittest.TestCase): rb.rebalance(seed=12345) part_counts = self._partition_counts(rb, key='zone') - self.assertEqual(part_counts[0], 256) - self.assertEqual(part_counts[1], 256) - self.assertEqual(part_counts[2], 256) + self.assertEqual({0: 256, 1: 256, 2: 256}, part_counts) # Devices 0 and 1 may take up to 75% over their fair share, but the # placement algorithm only wants to spread things out evenly between @@ -1698,9 +1694,12 @@ class TestRingBuilder(unittest.TestCase): rb.rebalance(seed=12345) part_counts = self._partition_counts(rb, key='ip') - self.assertEqual(part_counts['127.0.0.1'], 238) - self.assertEqual(part_counts['127.0.0.2'], 237) - self.assertEqual(part_counts['127.0.0.3'], 293) + + self.assertEqual({ + '127.0.0.1': 237, + '127.0.0.2': 237, + '127.0.0.3': 294, + }, part_counts) # Even out the weights: balance becomes perfect for dev in rb.devs: @@ -2451,6 +2450,105 @@ class TestRingBuilder(unittest.TestCase): (0, 0, '127.0.0.1', 3): [0, 256, 0, 0], }) + def test_undispersable_zone_converge_on_balance(self): + rb = ring.RingBuilder(8, 6, 0) + dev_id = 0 + # 3 regions, 2 zone for each region, 1 server with only *one* device in + # each zone (this is an absolutely pathological case) + for r in range(3): + for z in range(2): + ip = '127.%s.%s.1' % (r, z) + dev_id += 1 + rb.add_dev({'id': dev_id, 'region': r, 'zone': z, + 'weight': 1000, 'ip': ip, 'port': 10000, + 'device': 'd%s' % dev_id}) + rb.rebalance(seed=7) + + # sanity, all balanced and 0 dispersion + self.assertEqual(rb.get_balance(), 0) + self.assertEqual(rb.dispersion, 0) + + # add one device to the server in z1 for each region, N.B. when we + # *balance* this topology we will have very bad dispersion (too much + # weight in z1 compared to z2!) 
+ for r in range(3): + z = 0 + ip = '127.%s.%s.1' % (r, z) + dev_id += 1 + rb.add_dev({'id': dev_id, 'region': r, 'zone': z, + 'weight': 1000, 'ip': ip, 'port': 10000, + 'device': 'd%s' % dev_id}) + + changed_part, _, _ = rb.rebalance(seed=7) + + # sanity, all part but only one replica moved to new devices + self.assertEqual(changed_part, 2 ** 8) + # so the first time, rings are still unbalanced becase we'll only move + # one replica of each part. + self.assertEqual(rb.get_balance(), 50.1953125) + self.assertEqual(rb.dispersion, 99.609375) + + # N.B. since we mostly end up grabbing parts by "weight forced" some + # seeds given some specific ring state will randomly pick bad + # part-replicas that end up going back down onto the same devices + changed_part, _, _ = rb.rebalance(seed=7) + self.assertEqual(changed_part, 14) + # ... this isn't a really "desirable" behavior, but even with bad luck, + # things do get better + self.assertEqual(rb.get_balance(), 47.265625) + self.assertEqual(rb.dispersion, 99.609375) + + # but if you stick with it, eventually the next rebalance, will get to + # move "the right" part-replicas, resulting in near optimal balance + changed_part, _, _ = rb.rebalance(seed=7) + self.assertEqual(changed_part, 240) + self.assertEqual(rb.get_balance(), 0.390625) + self.assertEqual(rb.dispersion, 99.609375) + + def test_undispersable_server_converge_on_balance(self): + rb = ring.RingBuilder(8, 6, 0) + dev_id = 0 + # 3 zones, 2 server for each zone, 2 device for each server + for z in range(3): + for i in range(2): + ip = '127.0.%s.%s' % (z, i + 1) + for d in range(2): + dev_id += 1 + rb.add_dev({'id': dev_id, 'region': 1, 'zone': z, + 'weight': 1000, 'ip': ip, 'port': 10000, + 'device': 'd%s' % dev_id}) + rb.rebalance(seed=7) + + # sanity, all balanced and 0 dispersion + self.assertEqual(rb.get_balance(), 0) + self.assertEqual(rb.dispersion, 0) + + # add one device for first server for each zone + for z in range(3): + ip = '127.0.%s.1' % z + dev_id += 
1 + rb.add_dev({'id': dev_id, 'region': 1, 'zone': z, + 'weight': 1000, 'ip': ip, 'port': 10000, + 'device': 'd%s' % dev_id}) + + changed_part, _, _ = rb.rebalance(seed=7) + + # sanity, all part but only one replica moved to new devices + self.assertEqual(changed_part, 2 ** 8) + + # but the first time, those are still unbalance becase ring builder + # can move only one replica for each part + self.assertEqual(rb.get_balance(), 16.9921875) + self.assertEqual(rb.dispersion, 59.765625) + + rb.rebalance(seed=7) + + # converge into around 0~1 + self.assertGreaterEqual(rb.get_balance(), 0) + self.assertLess(rb.get_balance(), 1) + # dispersion doesn't get any worse + self.assertEqual(rb.dispersion, 59.765625) + def test_effective_overload(self): rb = ring.RingBuilder(8, 3, 1) # z0 @@ -3595,12 +3693,12 @@ class TestGetRequiredOverload(unittest.TestCase): rb.rebalance(seed=17) self.assertEqual(rb.get_balance(), 1581.6406249999998) - # but despite the overall trend toward imbalance, in the tier - # with the huge device, the small device is trying to shed parts - # as effectively as it can (which would be useful if it was the - # only small device isolated in a tier with other huge devices - # trying to gobble up all the replicanths in the tier - see - # `test_one_small_guy_does_not_spoil_his_buddy`!) 
+ # but despite the overall trend toward imbalance, in the tier with the + # huge device, we want to see the small device (d4) try to shed parts + # as effectively as it can to the huge device in the same tier (d5) + # this is a useful behavior anytime when for whatever reason a device + # w/i a tier wants parts from another device already in the same tier + # another example is `test_one_small_guy_does_not_spoil_his_buddy` expected = { 0: 123, 1: 123, @@ -3691,6 +3789,45 @@ class TestGetRequiredOverload(unittest.TestCase): self.assertEqual(rb.get_balance(), 30.46875) # increasing overload moves towards one replica in each tier + rb.set_overload(0.3) + expected = { + 0: 0.553443113772455, + 1: 0.553443113772455, + 2: 0.553443113772455, + 3: 0.553443113772455, + 4: 0.778443113772455, + 5: 0.007784431137724551, + } + target_replicas = rb._build_target_replicas_by_tier() + self.assertEqual(expected, {t[-1]: r for (t, r) in + target_replicas.items() + if len(t) == 4}) + # ... and as always increasing overload makes balance *worse* + rb.rebalance(seed=12) + self.assertEqual(rb.get_balance(), 30.46875) + + # the little guy it really struggling to take his share tho + expected = { + 0: 142, + 1: 141, + 2: 142, + 3: 141, + 4: 200, + 5: 2, + } + self.assertEqual(expected, { + d['id']: d['parts'] for d in rb._iter_devs()}) + # ... and you can see it in the balance! + expected = { + 0: -7.367187499999986, + 1: -8.019531249999986, + 2: -7.367187499999986, + 3: -8.019531249999986, + 4: 30.46875, + 5: 30.46875, + } + self.assertEqual(expected, rb._build_balance_per_dev()) + rb.set_overload(0.5) expected = { 0: 0.5232035928143712, @@ -3705,7 +3842,7 @@ class TestGetRequiredOverload(unittest.TestCase): target_replicas.items() if len(t) == 4}) - # ... 
and as always increasing overload makes balance *worse* + # because the device is so small, balance get's bad quick rb.rebalance(seed=17) self.assertEqual(rb.get_balance(), 95.703125) diff --git a/test/unit/common/ring/test_composite_builder.py b/test/unit/common/ring/test_composite_builder.py index c81f622ca2..4bf51b3227 100644 --- a/test/unit/common/ring/test_composite_builder.py +++ b/test/unit/common/ring/test_composite_builder.py @@ -1115,7 +1115,7 @@ class TestCooperativeRingBuilder(BaseTestCompositeBuilder): rb1.rebalance() self.assertEqual([rb1], update_calls) self.assertEqual([rb1], can_part_move_calls.keys()) - self.assertEqual(512, len(can_part_move_calls[rb1])) + self.assertEqual(768, len(can_part_move_calls[rb1])) # two component builders with same parent builder cb = CompositeRingBuilder() @@ -1139,8 +1139,8 @@ class TestCooperativeRingBuilder(BaseTestCompositeBuilder): # rb1 is being rebalanced so gets checked, and rb2 also gets checked self.assertEqual(sorted([rb1, rb2]), sorted(can_part_move_calls)) - self.assertEqual(512, len(can_part_move_calls[rb1])) - self.assertEqual(512, len(can_part_move_calls[rb2])) + self.assertEqual(768, len(can_part_move_calls[rb1])) + self.assertEqual(768, len(can_part_move_calls[rb2])) def test_save_then_load(self): cb = CompositeRingBuilder() diff --git a/test/unit/common/ring/test_utils.py b/test/unit/common/ring/test_utils.py index a18a8808d2..d77d5c8553 100644 --- a/test/unit/common/ring/test_utils.py +++ b/test/unit/common/ring/test_utils.py @@ -619,10 +619,10 @@ class TestUtils(unittest.TestCase): rb.rebalance(seed=100) rb.validate() - self.assertEqual(rb.dispersion, 39.84375) + self.assertEqual(rb.dispersion, 55.46875) report = dispersion_report(rb) self.assertEqual(report['worst_tier'], 'r1z1') - self.assertEqual(report['max_dispersion'], 39.84375) + self.assertEqual(report['max_dispersion'], 44.921875) def build_tier_report(max_replicas, placed_parts, dispersion, replicas): @@ -633,16 +633,17 @@ class 
TestUtils(unittest.TestCase): 'replicas': replicas, } - # Each node should store 256 partitions to avoid multiple replicas + # Each node should store less than or equal to 256 partitions to + # avoid multiple replicas. # 2/5 of total weight * 768 ~= 307 -> 51 partitions on each node in # zone 1 are stored at least twice on the nodes expected = [ ['r1z1', build_tier_report( - 2, 256, 39.84375, [0, 0, 154, 102])], + 2, 256, 44.921875, [0, 0, 141, 115])], ['r1z1-127.0.0.1', build_tier_report( - 1, 256, 19.921875, [0, 205, 51, 0])], + 1, 242, 29.33884297520661, [14, 171, 71, 0])], ['r1z1-127.0.0.2', build_tier_report( - 1, 256, 19.921875, [0, 205, 51, 0])], + 1, 243, 29.218106995884774, [13, 172, 71, 0])], ] report = dispersion_report(rb, 'r1z1[^/]*$', verbose=True) graph = report['graph'] @@ -667,9 +668,9 @@ class TestUtils(unittest.TestCase): # can't move all the part-replicas in one rebalance rb.rebalance(seed=100) report = dispersion_report(rb, verbose=True) - self.assertEqual(rb.dispersion, 9.375) - self.assertEqual(report['worst_tier'], 'r1z1-127.0.0.1') - self.assertEqual(report['max_dispersion'], 7.18562874251497) + self.assertEqual(rb.dispersion, 11.71875) + self.assertEqual(report['worst_tier'], 'r1z1-127.0.0.2') + self.assertEqual(report['max_dispersion'], 8.875739644970414) # do a sencond rebalance rb.rebalance(seed=100) report = dispersion_report(rb, verbose=True)