Accept a trade-off of dispersion for balance

... but only if we *have* to!

During the initial gather for balance we prefer to avoid replicas on
over-weight devices that are already under-represented in any of their
tiers (i.e. if a zone has to have at least one replica, but may have as
many as two, don't take the only one).  Instead, by going for replicas
on over-weight devices that are at the limits of their dispersion, we
hope we have a better-than-even chance of finding a better place for
them during placement!
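
For illustration only (these keys and values are made up, not what the
builder actually computes), a replica_plan entry bounding two zone
tiers might look something like:

    # hypothetical fragment of a replica_plan: each zone tier must hold
    # at least one replica of a part, and may hold at most two
    replica_plan = {
        (1, 1): {'min': 1, 'max': 2},  # region 1, zone 1
        (1, 2): {'min': 1, 'max': 2},  # region 1, zone 2
    }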

This normally works out - especially for rings which can both disperse
and balance.  But for existing rings where we'd have to sacrifice
dispersion to improve balance, the existing optimistic gather ends up
refusing to trade dispersion for balance - and instead gets stuck
without solving either!

You should always be able to solve for *either* dispersion or balance.
But if you can't solve for *both*, we now bail out of the optimistic
gather much more quickly and instead just focus on improving balance.
With this change, the ring can get into balanced (and un-dispersed)
states much more quickly!
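
Roughly, the gather loop in rebalance() now does the following (a
simplified sketch of the change; the real hunk is in the diff below):

    # sketch: only the first gather pass is "optimistic" and tries to
    # keep dispersion intact; later passes just chase balance
    for gather_count in range(MAX_BALANCE_GATHER_COUNT):
        self._gather_parts_for_balance(assign_parts, replica_plan,
                                       gather_count == 0)
        if not assign_parts:
            # nothing left that is allowed to move (most likely
            # min_part_hours)
            break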

Change-Id: I17ac627f94f64211afaccad15596a9fcab2fada2
Related-Change-Id: Ie6e2d116b65938edac29efa6171e2470bb3e8e12
Closes-Bug: 1699636
Closes-Bug: 1701472
Kota Tsuyuzaki 2017-06-30 02:03:48 -07:00 committed by Clay Gerrard
parent 097a9083ec
commit 2321966456
4 changed files with 195 additions and 36 deletions


@@ -525,7 +525,9 @@ class RingBuilder(object):
# we'll gather a few times, or until we achieve the plan
for gather_count in range(MAX_BALANCE_GATHER_COUNT):
self._gather_parts_for_balance(assign_parts, replica_plan)
self._gather_parts_for_balance(assign_parts, replica_plan,
# first attempt: go for dispersion
gather_count == 0)
if not assign_parts:
# most likely min part hours
finish_status = 'Unable to finish'
@@ -1097,6 +1099,12 @@ class RingBuilder(object):
:param start: offset into self.parts to begin search
:param replica_plan: replicanth targets for tiers
"""
tier2children = self._build_tier2children()
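# track how many more parts each tier still wants: a device's unmet
# parts_wanted counts toward every tier that contains it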
parts_wanted_in_tier = defaultdict(int)
for dev in self._iter_devs():
wanted = max(dev['parts_wanted'], 0)
for tier in dev['tiers']:
parts_wanted_in_tier[tier] += wanted
# Last, we gather partitions from devices that are "overweight" because
# they have more partitions than their parts_wanted.
for offset in range(self.parts):
@@ -1128,8 +1136,17 @@
replicas_at_tier[tier] <
replica_plan[tier]['max']
for tier in dev['tiers']):
# we're stuck by replica plan
continue
for t in reversed(dev['tiers']):
if replicas_at_tier[t] - 1 < replica_plan[t]['min']:
# we're stuck at tier t
break
if sum(parts_wanted_in_tier[c]
for c in tier2children[t]
if c not in dev['tiers']) <= 0:
# we're stuck by weight
continue
# this is the most over-weight device holding a replica
# of this part that can shed it according to the plan
dev['parts_wanted'] += 1
@@ -1141,15 +1158,19 @@
self._replica2part2dev[replica][part] = NONE_DEV
for tier in dev['tiers']:
replicas_at_tier[tier] -= 1
parts_wanted_in_tier[tier] -= 1
self._set_part_moved(part)
break
def _gather_parts_for_balance(self, assign_parts, replica_plan):
def _gather_parts_for_balance(self, assign_parts, replica_plan,
disperse_first):
"""
Gather parts that look like they should move for balance reasons.
A simple gather of parts that look dispersible normally works out;
we'll switch strategies if things don't seem to move.
:param disperse_first: boolean, avoid replicas on overweight devices
that need to be there for dispersion
"""
# pick a random starting point on the other side of the ring
quarter_turn = (self.parts // 4)
@@ -1162,10 +1183,10 @@
'last_start': self._last_part_gather_start})
self._last_part_gather_start = start
self._gather_parts_for_balance_can_disperse(
assign_parts, start, replica_plan)
if not assign_parts:
self._gather_parts_for_balance_forced(assign_parts, start)
if disperse_first:
self._gather_parts_for_balance_can_disperse(
assign_parts, start, replica_plan)
self._gather_parts_for_balance_forced(assign_parts, start)
def _gather_parts_for_balance_forced(self, assign_parts, start, **kwargs):
"""

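To make the new weight check concrete, here is a rough standalone
restatement of the "may this over-weight device shed its replica?" test
added above (simplified, with made-up inputs; the hunk above is the
authoritative version):

    def may_shed_replica(dev, replicas_at_tier, replica_plan,
                         parts_wanted_in_tier, tier2children):
        # walk the device's tiers from most to least specific; the first
        # tier that would drop below its dispersion minimum is where we
        # are "stuck"
        for t in reversed(dev['tiers']):
            if replicas_at_tier[t] - 1 < replica_plan[t]['min']:
                break
        # only trade dispersion for balance if some sibling of the stuck
        # tier (one this device is not part of) still wants more parts
        return sum(parts_wanted_in_tier[c]
                   for c in tier2children[t]
                   if c not in dev['tiers']) > 0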

@@ -400,7 +400,7 @@ class TestRingBuilder(unittest.TestCase):
for dev in rb._iter_devs():
dev['tiers'] = utils.tiers_for_dev(dev)
assign_parts = defaultdict(list)
rb._gather_parts_for_balance(assign_parts, replica_plan)
rb._gather_parts_for_balance(assign_parts, replica_plan, False)
max_run = 0
run = 0
last_part = 0
@@ -1621,9 +1621,7 @@ class TestRingBuilder(unittest.TestCase):
rb.rebalance(seed=12345)
part_counts = self._partition_counts(rb, key='zone')
self.assertEqual(part_counts[0], 212)
self.assertEqual(part_counts[1], 211)
self.assertEqual(part_counts[2], 345)
self.assertEqual({0: 212, 1: 211, 2: 345}, part_counts)
# Now, devices 0 and 1 take 50% more than their fair shares by
# weight.
@@ -1633,9 +1631,7 @@ class TestRingBuilder(unittest.TestCase):
rb.rebalance(seed=12345)
part_counts = self._partition_counts(rb, key='zone')
self.assertEqual(part_counts[0], 256)
self.assertEqual(part_counts[1], 256)
self.assertEqual(part_counts[2], 256)
self.assertEqual({0: 256, 1: 256, 2: 256}, part_counts)
# Devices 0 and 1 may take up to 75% over their fair share, but the
# placement algorithm only wants to spread things out evenly between
@@ -1698,9 +1694,12 @@ class TestRingBuilder(unittest.TestCase):
rb.rebalance(seed=12345)
part_counts = self._partition_counts(rb, key='ip')
self.assertEqual(part_counts['127.0.0.1'], 238)
self.assertEqual(part_counts['127.0.0.2'], 237)
self.assertEqual(part_counts['127.0.0.3'], 293)
self.assertEqual({
'127.0.0.1': 237,
'127.0.0.2': 237,
'127.0.0.3': 294,
}, part_counts)
# Even out the weights: balance becomes perfect
for dev in rb.devs:
@@ -2451,6 +2450,105 @@ class TestRingBuilder(unittest.TestCase):
(0, 0, '127.0.0.1', 3): [0, 256, 0, 0],
})
def test_undispersable_zone_converge_on_balance(self):
rb = ring.RingBuilder(8, 6, 0)
dev_id = 0
# 3 regions, 2 zones in each region, 1 server with only *one* device in
# each zone (this is an absolutely pathological case)
for r in range(3):
for z in range(2):
ip = '127.%s.%s.1' % (r, z)
dev_id += 1
rb.add_dev({'id': dev_id, 'region': r, 'zone': z,
'weight': 1000, 'ip': ip, 'port': 10000,
'device': 'd%s' % dev_id})
rb.rebalance(seed=7)
# sanity, all balanced and 0 dispersion
self.assertEqual(rb.get_balance(), 0)
self.assertEqual(rb.dispersion, 0)
# add one device to the server in the first zone (z=0) of each region;
# N.B. when we *balance* this topology we will have very bad dispersion
# (too much weight in the first zone compared to the second!)
for r in range(3):
z = 0
ip = '127.%s.%s.1' % (r, z)
dev_id += 1
rb.add_dev({'id': dev_id, 'region': r, 'zone': z,
'weight': 1000, 'ip': ip, 'port': 10000,
'device': 'd%s' % dev_id})
changed_part, _, _ = rb.rebalance(seed=7)
# sanity: every part moved, but only one replica of each, to the new devices
self.assertEqual(changed_part, 2 ** 8)
# so the first time, the ring is still unbalanced because we'll only
# move one replica of each part.
self.assertEqual(rb.get_balance(), 50.1953125)
self.assertEqual(rb.dispersion, 99.609375)
# N.B. since we mostly end up grabbing parts via the "weight forced"
# gather, some seeds, given some specific ring states, will randomly
# pick bad part-replicas that end up going right back onto the same devices
changed_part, _, _ = rb.rebalance(seed=7)
self.assertEqual(changed_part, 14)
# ... this isn't really "desirable" behavior, but even with bad luck,
# things do get better
self.assertEqual(rb.get_balance(), 47.265625)
self.assertEqual(rb.dispersion, 99.609375)
# but if you stick with it, eventually the next rebalance will get to
# move "the right" part-replicas, resulting in near-optimal balance
changed_part, _, _ = rb.rebalance(seed=7)
self.assertEqual(changed_part, 240)
self.assertEqual(rb.get_balance(), 0.390625)
self.assertEqual(rb.dispersion, 99.609375)
def test_undispersable_server_converge_on_balance(self):
rb = ring.RingBuilder(8, 6, 0)
dev_id = 0
# 3 zones, 2 servers in each zone, 2 devices on each server
for z in range(3):
for i in range(2):
ip = '127.0.%s.%s' % (z, i + 1)
for d in range(2):
dev_id += 1
rb.add_dev({'id': dev_id, 'region': 1, 'zone': z,
'weight': 1000, 'ip': ip, 'port': 10000,
'device': 'd%s' % dev_id})
rb.rebalance(seed=7)
# sanity, all balanced and 0 dispersion
self.assertEqual(rb.get_balance(), 0)
self.assertEqual(rb.dispersion, 0)
# add one device to the first server in each zone
for z in range(3):
ip = '127.0.%s.1' % z
dev_id += 1
rb.add_dev({'id': dev_id, 'region': 1, 'zone': z,
'weight': 1000, 'ip': ip, 'port': 10000,
'device': 'd%s' % dev_id})
changed_part, _, _ = rb.rebalance(seed=7)
# sanity: every part moved, but only one replica of each, to the new devices
self.assertEqual(changed_part, 2 ** 8)
# but the first time, things are still unbalanced because the ring
# builder can move only one replica of each part
self.assertEqual(rb.get_balance(), 16.9921875)
self.assertEqual(rb.dispersion, 59.765625)
rb.rebalance(seed=7)
# balance converges to around 0~1
self.assertGreaterEqual(rb.get_balance(), 0)
self.assertLess(rb.get_balance(), 1)
# dispersion doesn't get any worse
self.assertEqual(rb.dispersion, 59.765625)
def test_effective_overload(self):
rb = ring.RingBuilder(8, 3, 1)
# z0
@@ -3595,12 +3693,12 @@ class TestGetRequiredOverload(unittest.TestCase):
rb.rebalance(seed=17)
self.assertEqual(rb.get_balance(), 1581.6406249999998)
# but despite the overall trend toward imbalance, in the tier
# with the huge device, the small device is trying to shed parts
# as effectively as it can (which would be useful if it was the
# only small device isolated in a tier with other huge devices
# trying to gobble up all the replicanths in the tier - see
# `test_one_small_guy_does_not_spoil_his_buddy`!)
# but despite the overall trend toward imbalance, in the tier with the
# huge device, we want to see the small device (d4) try to shed parts
# as effectively as it can to the huge device in the same tier (d5).
# this is useful behavior any time a device within a tier wants parts
# from another device already in the same tier, for whatever reason -
# another example is `test_one_small_guy_does_not_spoil_his_buddy`
expected = {
0: 123,
1: 123,
@@ -3691,6 +3789,45 @@ class TestGetRequiredOverload(unittest.TestCase):
self.assertEqual(rb.get_balance(), 30.46875)
# increasing overload moves towards one replica in each tier
rb.set_overload(0.3)
expected = {
0: 0.553443113772455,
1: 0.553443113772455,
2: 0.553443113772455,
3: 0.553443113772455,
4: 0.778443113772455,
5: 0.007784431137724551,
}
target_replicas = rb._build_target_replicas_by_tier()
self.assertEqual(expected, {t[-1]: r for (t, r) in
target_replicas.items()
if len(t) == 4})
# ... and as always increasing overload makes balance *worse*
rb.rebalance(seed=12)
self.assertEqual(rb.get_balance(), 30.46875)
# the little guy is really struggling to take his share, though
expected = {
0: 142,
1: 141,
2: 142,
3: 141,
4: 200,
5: 2,
}
self.assertEqual(expected, {
d['id']: d['parts'] for d in rb._iter_devs()})
# ... and you can see it in the balance!
expected = {
0: -7.367187499999986,
1: -8.019531249999986,
2: -7.367187499999986,
3: -8.019531249999986,
4: 30.46875,
5: 30.46875,
}
self.assertEqual(expected, rb._build_balance_per_dev())
rb.set_overload(0.5)
expected = {
0: 0.5232035928143712,
@@ -3705,7 +3842,7 @@ class TestGetRequiredOverload(unittest.TestCase):
target_replicas.items()
if len(t) == 4})
# ... and as always increasing overload makes balance *worse*
# because the device is so small, balance gets bad quickly
rb.rebalance(seed=17)
self.assertEqual(rb.get_balance(), 95.703125)


@@ -1115,7 +1115,7 @@ class TestCooperativeRingBuilder(BaseTestCompositeBuilder):
rb1.rebalance()
self.assertEqual([rb1], update_calls)
self.assertEqual([rb1], can_part_move_calls.keys())
self.assertEqual(512, len(can_part_move_calls[rb1]))
self.assertEqual(768, len(can_part_move_calls[rb1]))
# two component builders with same parent builder
cb = CompositeRingBuilder()
@@ -1139,8 +1139,8 @@ class TestCooperativeRingBuilder(BaseTestCompositeBuilder):
# rb1 is being rebalanced so gets checked, and rb2 also gets checked
self.assertEqual(sorted([rb1, rb2]), sorted(can_part_move_calls))
self.assertEqual(512, len(can_part_move_calls[rb1]))
self.assertEqual(512, len(can_part_move_calls[rb2]))
self.assertEqual(768, len(can_part_move_calls[rb1]))
self.assertEqual(768, len(can_part_move_calls[rb2]))
def test_save_then_load(self):
cb = CompositeRingBuilder()


@@ -619,10 +619,10 @@ class TestUtils(unittest.TestCase):
rb.rebalance(seed=100)
rb.validate()
self.assertEqual(rb.dispersion, 39.84375)
self.assertEqual(rb.dispersion, 55.46875)
report = dispersion_report(rb)
self.assertEqual(report['worst_tier'], 'r1z1')
self.assertEqual(report['max_dispersion'], 39.84375)
self.assertEqual(report['max_dispersion'], 44.921875)
def build_tier_report(max_replicas, placed_parts, dispersion,
replicas):
@@ -633,16 +633,17 @@ class TestUtils(unittest.TestCase):
'replicas': replicas,
}
# Each node should store 256 partitions to avoid multiple replicas
# Each node should store less than or equal to 256 partitions to
# avoid multiple replicas.
# 2/5 of total weight * 768 ~= 307 -> 51 partitions on each node in
# zone 1 are stored at least twice on the nodes
expected = [
['r1z1', build_tier_report(
2, 256, 39.84375, [0, 0, 154, 102])],
2, 256, 44.921875, [0, 0, 141, 115])],
['r1z1-127.0.0.1', build_tier_report(
1, 256, 19.921875, [0, 205, 51, 0])],
1, 242, 29.33884297520661, [14, 171, 71, 0])],
['r1z1-127.0.0.2', build_tier_report(
1, 256, 19.921875, [0, 205, 51, 0])],
1, 243, 29.218106995884774, [13, 172, 71, 0])],
]
report = dispersion_report(rb, 'r1z1[^/]*$', verbose=True)
graph = report['graph']
@@ -667,9 +668,9 @@ class TestUtils(unittest.TestCase):
# can't move all the part-replicas in one rebalance
rb.rebalance(seed=100)
report = dispersion_report(rb, verbose=True)
self.assertEqual(rb.dispersion, 9.375)
self.assertEqual(report['worst_tier'], 'r1z1-127.0.0.1')
self.assertEqual(report['max_dispersion'], 7.18562874251497)
self.assertEqual(rb.dispersion, 11.71875)
self.assertEqual(report['worst_tier'], 'r1z1-127.0.0.2')
self.assertEqual(report['max_dispersion'], 8.875739644970414)
# do a second rebalance
rb.rebalance(seed=100)
report = dispersion_report(rb, verbose=True)