From d66e95d1d96c9f5aee52740df53d6e784c7b8194 Mon Sep 17 00:00:00 2001 From: Mark Goddard Date: Thu, 16 May 2019 17:26:45 +0100 Subject: [PATCH] Fix keystone fernet key rotation scheduling Right now every controller rotates fernet keys. This is nice because should any controller die, we know the remaining ones will rotate the keys. However, we are currently over-rotating the keys. When we over rotate keys, we get logs like this: This is not a recognized Fernet token TokenNotFound Most clients can recover and get a new token, but some clients (like Nova passing tokens to other services) can't do that because it doesn't have the password to regenerate a new token. With three controllers, in crontab in keystone-fernet we see the once a day correctly staggered across the three controllers: ssh ctrl1 sudo cat /etc/kolla/keystone-fernet/crontab 0 0 * * * /usr/bin/fernet-rotate.sh ssh ctrl2 sudo cat /etc/kolla/keystone-fernet/crontab 0 8 * * * /usr/bin/fernet-rotate.sh ssh ctrl3 sudo cat /etc/kolla/keystone-fernet/crontab 0 16 * * * /usr/bin/fernet-rotate.sh Currently with three controllers we have this keystone config: [token] expiration = 86400 (although, keystone default is one hour) allow_expired_window = 172800 (this is the keystone default) [fernet_tokens] max_active_keys = 4 Currently, kolla-ansible configures key rotation according to the following: rotation_interval = token_expiration / num_hosts This means we rotate keys more quickly the more hosts we have, which doesn't make much sense. Keystone docs state: max_active_keys = ((token_expiration + allow_expired_window) / rotation_interval) + 2 For details see: https://docs.openstack.org/keystone/stein/admin/fernet-token-faq.html Rotation is based on pushing out a staging key, so should any server start using that key, other servers will consider that valid. Then each server in turn starts using the staging key, each in term demoting the existing primary key to a secondary key. Eventually you prune the secondary keys when there is no token in the wild that would need to be decrypted using that key. So this all makes sense. This change adds new variables for fernet_token_allow_expired_window and fernet_key_rotation_interval, so that we can correctly calculate the correct number of active keys. We now set the default rotation interval so as to minimise the number of active keys to 3 - one primary, one secondary, one buffer. This change also fixes the fernet cron job generator, which was broken in the following cases: * requesting an interval of more than 1 day resulted in no jobs * requesting an interval of more than 60 minutes, unless an exact multiple of 60 minutes, resulted in no jobs It should now be possible to request any interval up to a week divided by the number of hosts. Change-Id: I10c82dc5f83653beb60ddb86d558c5602153341a Closes-Bug: #1809469 (cherry picked from commit 6c1442c385450004dd253f3f464fe4336194be99) --- ansible/group_vars/all.yml | 8 + .../files/fernet_rotate_cron_generator.py | 81 +++++---- ansible/roles/keystone/tasks/config.yml | 7 +- .../roles/keystone/templates/keystone.conf.j2 | 12 +- doc/source/reference/index.rst | 1 + doc/source/reference/keystone-guide.rst | 43 +++++ .../fernet-key-rotation-8d40041d7d783dc7.yaml | 16 ++ tests/test_fernet_rotate_cron_generator.py | 169 +++++++++++------- 8 files changed, 239 insertions(+), 98 deletions(-) create mode 100644 doc/source/reference/keystone-guide.rst create mode 100644 releasenotes/notes/fernet-key-rotation-8d40041d7d783dc7.yaml diff --git a/ansible/group_vars/all.yml b/ansible/group_vars/all.yml index 14ba79cdd9..c3f79a9924 100644 --- a/ansible/group_vars/all.yml +++ b/ansible/group_vars/all.yml @@ -643,7 +643,15 @@ default_user_domain_id: "default" # Valid options are [ fernet ] keystone_token_provider: "fernet" +# Keystone fernet token expiry in seconds. Default is 1 day. fernet_token_expiry: 86400 +# Keystone window to allow expired fernet tokens. Default is 2 days. +fernet_token_allow_expired_window: 172800 +# Keystone fernet key rotation interval in seconds. Default is sum of token +# expiry and allow expired window, 3 days. This ensures the minimum number +# of keys are active. If this interval is lower than the sum of the token +# expiry and allow expired window, multiple active keys will be necessary. +fernet_key_rotation_interval: "{{ fernet_token_expiry + fernet_token_allow_expired_window }}" keystone_default_user_role: "_member_" diff --git a/ansible/roles/keystone/files/fernet_rotate_cron_generator.py b/ansible/roles/keystone/files/fernet_rotate_cron_generator.py index c660ff34ec..fb82011a17 100644 --- a/ansible/roles/keystone/files/fernet_rotate_cron_generator.py +++ b/ansible/roles/keystone/files/fernet_rotate_cron_generator.py @@ -26,6 +26,10 @@ DAY_SPAN = 24 * HOUR_SPAN WEEK_SPAN = 7 * DAY_SPAN +class RotationIntervalTooLong(Exception): + pass + + def json_exit(msg=None, failed=False, changed=False): if type(msg) is not dict: msg = {'msg': str(msg)} @@ -35,60 +39,69 @@ def json_exit(msg=None, failed=False, changed=False): def generate(host_index, total_hosts, total_rotation_mins): - min = '*' - hour = '*' - day = '*' + min = '*' # 0-59 + hour = '*' # 0-23 + day = '*' # 0-6 (day of week) crons = [] if host_index >= total_hosts: return crons - rotation_frequency = total_rotation_mins // total_hosts - cron_min = rotation_frequency * host_index + # We need to rotate the key every total_rotation_mins minutes. + # When there are N hosts, each host should rotate once every N * + # total_rotation_mins minutes, in a round-robin manner. + # We can generate a cycle for index 0, then add an offset specific to each + # host. + # NOTE: Minor under-rotation is better than over-rotation since tokens + # may become invalid if keys are over-rotated. + host_rotation_mins = total_rotation_mins * total_hosts + host_rotation_offset = total_rotation_mins * host_index - # Build crons for a week period - if total_rotation_mins == WEEK_SPAN: - day = cron_min // DAY_SPAN - hour = (cron_min % DAY_SPAN) // HOUR_SPAN - min = cron_min % HOUR_SPAN - crons.append({'min': min, 'hour': hour, 'day': day}) + # Can't currently rotate less than once per week. + if total_rotation_mins > WEEK_SPAN: + msg = ("Unable to schedule fernet key rotation with an interval " + "greater than 1 week divided by the number of hosts") + raise RotationIntervalTooLong(msg) - # Build crons for a day period - elif total_rotation_mins == DAY_SPAN: - hour = cron_min // HOUR_SPAN - min = cron_min % HOUR_SPAN - crons.append({'min': min, 'hour': hour, 'day': day}) + # Build crons multiple of a day + elif host_rotation_mins > DAY_SPAN: + time = host_rotation_offset + while time + total_rotation_mins <= WEEK_SPAN: + day = time // DAY_SPAN + hour = time % HOUR_SPAN + min = time % HOUR_SPAN + crons.append({'min': min, 'hour': hour, 'day': day}) + + time += host_rotation_mins # Build crons for multiple of an hour - elif total_rotation_mins % HOUR_SPAN == 0: - for multiple in range(1, DAY_SPAN // total_rotation_mins + 1): - time = cron_min - if multiple > 1: - time += total_rotation_mins * (multiple - 1) - + elif host_rotation_mins > HOUR_SPAN: + time = host_rotation_offset + while time + total_rotation_mins <= DAY_SPAN: hour = time // HOUR_SPAN min = time % HOUR_SPAN crons.append({'min': min, 'hour': hour, 'day': day}) - # Build crons for multiple of a minute - elif total_rotation_mins % MINUTE_SPAN == 0: - for multiple in range(1, HOUR_SPAN // total_rotation_mins + 1): - time = cron_min - if multiple > 1: - time += total_rotation_mins * (multiple - 1) + time += host_rotation_mins + # Build crons for multiple of a minute + else: + time = host_rotation_offset + while time + total_rotation_mins <= HOUR_SPAN: min = time // MINUTE_SPAN crons.append({'min': min, 'hour': hour, 'day': day}) + time += host_rotation_mins + return crons def main(): parser = argparse.ArgumentParser(description='''Creates a list of cron intervals for a node in a group of nodes to ensure each node runs - a cron in round robbin style.''') + a cron in round robin style.''') parser.add_argument('-t', '--time', - help='Time in seconds for a token rotation cycle', + help='Time in minutes for a key rotation cycle', required=True, type=int) parser.add_argument('-i', '--index', @@ -96,11 +109,15 @@ def main(): required=True, type=int) parser.add_argument('-n', '--number', - help='Number of tokens that should exist', + help='Number of hosts', required=True, type=int) args = parser.parse_args() - json_exit({'cron_jobs': generate(args.index, args.number, args.time)}) + try: + jobs = generate(args.index, args.number, args.time) + except Exception as e: + json_exit(str(e), failed=True) + json_exit({'cron_jobs': jobs}) if __name__ == "__main__": diff --git a/ansible/roles/keystone/tasks/config.yml b/ansible/roles/keystone/tasks/config.yml index 7175eb831b..c1f6b3642c 100644 --- a/ansible/roles/keystone/tasks/config.yml +++ b/ansible/roles/keystone/tasks/config.yml @@ -182,9 +182,14 @@ - Restart keystone container - name: Generate the required cron jobs for the node - local_action: "command python {{ role_path }}/files/fernet_rotate_cron_generator.py -t {{ (fernet_token_expiry | int) // 60 }} -i {{ groups['keystone'].index(inventory_hostname) }} -n {{ (groups['keystone'] | length) }}" + command: > + python {{ role_path }}/files/fernet_rotate_cron_generator.py + -t {{ (fernet_key_rotation_interval | int) // 60 }} + -i {{ groups['keystone'].index(inventory_hostname) }} + -n {{ (groups['keystone'] | length) }} register: cron_jobs_json when: keystone_token_provider == 'fernet' + delegate_to: localhost - name: Save the returned from cron jobs for building the crontab set_fact: diff --git a/ansible/roles/keystone/templates/keystone.conf.j2 b/ansible/roles/keystone/templates/keystone.conf.j2 index 4ea8b85a56..5e33e3c76f 100644 --- a/ansible/roles/keystone/templates/keystone.conf.j2 +++ b/ansible/roles/keystone/templates/keystone.conf.j2 @@ -32,9 +32,19 @@ domain_config_dir = /etc/keystone/domains revoke_by_id = False provider = {{ keystone_token_provider }} expiration = {{ fernet_token_expiry }} +allow_expired_window = {{ fernet_token_allow_expired_window }} [fernet_tokens] -max_active_keys = {{ (groups['keystone'] | length) + 1 }} +# Keystone docs note: +# max_active_keys = +# ((token_expiration + allow_expired_window) / rotation_frequency) + 2 +# https://docs.openstack.org/keystone/stein/admin/fernet-token-faq.html +# +# Use (x + y - 1) / y to round up integer division. +max_active_keys = {{ ((fernet_token_expiry | int + + fernet_token_allow_expired_window | int + + fernet_key_rotation_interval | int - 1) // + fernet_key_rotation_interval | int) + 2 }} [cache] backend = oslo_cache.memcache_pool diff --git a/doc/source/reference/index.rst b/doc/source/reference/index.rst index 61da4607b0..1c0ea47c71 100644 --- a/doc/source/reference/index.rst +++ b/doc/source/reference/index.rst @@ -13,6 +13,7 @@ Projects Deployment References designate-guide hyperv-guide ironic-guide + keystone-guide manila-guide manila-hnas-guide monasca-guide diff --git a/doc/source/reference/keystone-guide.rst b/doc/source/reference/keystone-guide.rst new file mode 100644 index 0000000000..b15e793ee8 --- /dev/null +++ b/doc/source/reference/keystone-guide.rst @@ -0,0 +1,43 @@ +.. _keystone-guide: + +=========================== +Keystone - Identity service +=========================== + +Tokens +------ + +The Keystone token provider is configured via ``keystone_token_provider``. The +default value for this is ``fernet``. + +Fernet Tokens +~~~~~~~~~~~~~ + +Fernet tokens require the use of keys that must be synchronised between +Keystone servers. Kolla Ansible deploys two containers to handle this - +``keystone_fernet`` runs cron jobs to rotate keys via rsync when necessary. +``keystone_ssh`` is an SSH server that provides the transport for rsync. In a +multi-host control plane, these rotations are performed by the hosts in a +round-robin manner. + +The following variables may be used to configure the token expiry and key +rotation. + +``fernet_token_expiry`` + Keystone fernet token expiry in seconds. Default is 86400, which is 1 day. +``fernet_token_allow_expired_window`` + Keystone window to allow expired fernet tokens. Default is 172800, which is + 2 days. +``fernet_key_rotation_interval`` + Keystone fernet key rotation interval in seconds. Default is sum of token + expiry and allow expired window, which is 3 days. + +The default rotation interval is set up to ensure that the minimum number of +keys may be active at any time. This is one primary key, one secondary key and +a buffer key - three in total. If the rotation interval is set lower than the +sum of the token expiry and token allow expired window, more active keys will +be configured in Keystone as necessary. + +Further infomation on Fernet tokens is available in the `Keystone +documentation +`__. diff --git a/releasenotes/notes/fernet-key-rotation-8d40041d7d783dc7.yaml b/releasenotes/notes/fernet-key-rotation-8d40041d7d783dc7.yaml new file mode 100644 index 0000000000..7e624091e5 --- /dev/null +++ b/releasenotes/notes/fernet-key-rotation-8d40041d7d783dc7.yaml @@ -0,0 +1,16 @@ +--- +upgrade: + - | + The Keystone fernet key rotation scheduling algorithm has been modified to + avoid issues with over-rotation of keys. + + The variables ``fernet_token_expiry``, + ``fernet_token_allow_expired_window`` and ``fernet_key_rotation_interval`` + may be set to configure the token expiry and key rotation schedule. + + By default, ``fernet_token_expiry`` is 86400, + ``fernet_token_allow_expired_window`` is 172800, and + ``fernet_key_rotation_interval`` is the sum of these two variables. This + allows for the minimum number of active keys - 3. + + See `bug 1809469 `__ for details. diff --git a/tests/test_fernet_rotate_cron_generator.py b/tests/test_fernet_rotate_cron_generator.py index 7c60bd2f6f..6e01f12d85 100644 --- a/tests/test_fernet_rotate_cron_generator.py +++ b/tests/test_fernet_rotate_cron_generator.py @@ -36,47 +36,85 @@ class FernetRotateCronGeneratorTest(base.BaseTestCase): expected = [] self._test(1, 0, 0, expected) - # total_rotation_mins == WEEK_SPAN: + # Invalid: total_rotation_mins > WEEK_SPAN: + + def test_1_week_1_min_1_host(self): + self.assertRaises(generator.RotationIntervalTooLong, + generator.generate, + 0, 1, 7 * 24 * 60 + 1) + + def test_1_week_1_min_2_hosts(self): + self.assertRaises(generator.RotationIntervalTooLong, + generator.generate, + 0, 2, 7 * 24 * 60 + 1) + + # host_rotation_mins > DAY_SPAN: + + def test_1_day_2_hosts(self): + expected = [{"min": 0, "hour": 0, "day": day} + for day in range(0, 7, 2)] + self._test(0, 2, 24 * 60, expected) + + expected = [{"min": 0, "hour": 0, "day": day} + for day in range(1, 7, 2)] + self._test(1, 2, 24 * 60, expected) + + def test_1_day_3_hosts(self): + expected = [{"min": 0, "hour": 0, "day": day} + for day in range(0, 7, 3)] + self._test(0, 3, 24 * 60, expected) + + expected = [{"min": 0, "hour": 0, "day": day} + for day in range(1, 7, 3)] + self._test(1, 3, 24 * 60, expected) + + expected = [{"min": 0, "hour": 0, "day": day} + for day in range(2, 7, 3)] + self._test(2, 3, 24 * 60, expected) + + def test_2_days_1_host(self): + expected = [{"min": 0, "hour": 0, "day": day} + for day in range(0, 6, 2)] + self._test(0, 1, 2 * 24 * 60, expected) + + def test_2_days_2_hosts(self): + expected = [{"min": 0, "hour": 0, "day": day} + for day in range(0, 7, 4)] + self._test(0, 2, 2 * 24 * 60, expected) + + expected = [{"min": 0, "hour": 0, "day": 2}] + self._test(1, 2, 2 * 24 * 60, expected) + + def test_3_days_1_host(self): + # NOTE: This is the default config in kolla-ansible for 1 host. + expected = [{"min": 0, "hour": 0, "day": day} + for day in range(0, 6, 3)] + self._test(0, 1, 3 * 24 * 60, expected) + + def test_3_days_3_hosts(self): + # NOTE: This is the default config in kolla-ansible for 3 hosts. + expected = [{"min": 0, "hour": 0, "day": 0}] + self._test(0, 3, 3 * 24 * 60, expected) + + expected = [{"min": 0, "hour": 0, "day": 3}] + self._test(1, 3, 3 * 24 * 60, expected) + + expected = [] + self._test(2, 3, 3 * 24 * 60, expected) def test_1_week_1_host(self): expected = [{"min": 0, "hour": 0, "day": 0}] self._test(0, 1, 7 * 24 * 60, expected) - def test_1_week_2_hosts(self): - expected = [{"min": 0, "hour": 0, "day": 0}] - self._test(0, 2, 7 * 24 * 60, expected) - - expected = [{"min": 0, "hour": 12, "day": 3}] - self._test(1, 2, 7 * 24 * 60, expected) - - # total_rotation_mins == DAY_SPAN: - - def test_1_day_1_host(self): - expected = [{"min": 0, "hour": 0, "day": "*"}] - self._test(0, 1, 24 * 60, expected) - - def test_1_day_2_hosts(self): - expected = [{"min": 0, "hour": 0, "day": "*"}] - self._test(0, 2, 24 * 60, expected) - - expected = [{"min": 0, "hour": 12, "day": "*"}] - self._test(1, 2, 24 * 60, expected) - - # total_rotation_mins % HOUR_SPAN == 0: - - def test_1_hour_1_host(self): - # nit: This could be a single hour: '*'. - expected = [{"min": 0, "hour": hour, "day": "*"} - for hour in range(24)] - self._test(0, 1, 60, expected) + # total_rotation_mins > HOUR_SPAN: def test_1_hour_2_hosts(self): expected = [{"min": 0, "hour": hour, "day": "*"} - for hour in range(24)] + for hour in range(0, 24, 2)] self._test(0, 2, 60, expected) - expected = [{"min": 30, "hour": hour, "day": "*"} - for hour in range(24)] + expected = [{"min": 0, "hour": hour, "day": "*"} + for hour in range(1, 24, 2)] self._test(1, 2, 60, expected) def test_2_hours_1_host(self): @@ -86,42 +124,56 @@ class FernetRotateCronGeneratorTest(base.BaseTestCase): def test_2_hours_2_hosts(self): expected = [{"min": 0, "hour": hour, "day": "*"} - for hour in range(0, 24, 2)] + for hour in range(0, 24, 4)] self._test(0, 2, 2 * 60, expected) expected = [{"min": 0, "hour": hour, "day": "*"} - for hour in range(1, 24, 2)] + for hour in range(2, 24, 4)] self._test(1, 2, 2 * 60, expected) - def test_2_days_1_host(self): - # FIXME: Anything greater than 1 day (except 1 week) returns no jobs. - expected = [] - self._test(0, 1, 2 * 24 * 60, expected) + def test_61_minutes_1_host(self): + # FIXME: Anything greater than 1 hour (unless an integer number of + # hours) returns no jobs. + expected = [{"min": hour, "hour": hour, "day": "*"} + for hour in range(0, 23)] + self._test(0, 1, 61, expected) - def test_2_days_2_hosts(self): - # FIXME: Anything greater than 1 day (except 1 week) returns no jobs. - expected = [] - self._test(0, 2, 2 * 24 * 60, expected) + def test_61_minutes_2_hosts(self): + # FIXME: Anything greater than 1 hour (unless an integer number of + # hours) returns no jobs. + expected = [{"min": hour, "hour": hour, "day": "*"} + for hour in range(0, 24, 2)] + self._test(0, 2, 61, expected) - expected = [] - self._test(1, 2, 2 * 24 * 60, expected) + expected = [{"min": hour, "hour": hour, "day": "*"} + for hour in range(1, 23, 2)] + self._test(1, 2, 61, expected) - # total_rotation_mins % MINUTE_SPAN == 0: + def test_1_day_1_host(self): + expected = [{"min": 0, "hour": 0, "day": "*"}] + self._test(0, 1, 24 * 60, expected) + + def test_12_hours_2_hosts(self): + expected = [{"min": 0, "hour": 0, "day": "*"}] + self._test(0, 2, 12 * 60, expected) + + expected = [{"min": 0, "hour": 12, "day": "*"}] + self._test(1, 2, 12 * 60, expected) + + # else: def test_1_minute_1_host(self): - # This could be a single hour: '*'. expected = [{"min": min, "hour": "*", "day": "*"} for min in range(60)] self._test(0, 1, 1, expected) def test_1_minute_2_hosts(self): - # This is kind of broken, but its an edge case so nevermind. expected = [{"min": min, "hour": "*", "day": "*"} - for min in range(60)] + for min in range(0, 60, 2)] self._test(0, 2, 1, expected) expected = [{"min": min, "hour": "*", "day": "*"} - for min in range(60)] + for min in range(1, 60, 2)] self._test(1, 2, 1, expected) def test_2_minutes_1_host(self): @@ -131,24 +183,13 @@ class FernetRotateCronGeneratorTest(base.BaseTestCase): def test_2_minutes_2_hosts(self): expected = [{"min": min, "hour": "*", "day": "*"} - for min in range(0, 60, 2)] + for min in range(0, 60, 4)] self._test(0, 2, 2, expected) expected = [{"min": min, "hour": "*", "day": "*"} - for min in range(1, 60, 2)] + for min in range(2, 60, 4)] self._test(1, 2, 2, expected) - def test_61_minutes_1_host(self): - # FIXME: Anything greater than 1 hour (unless an integer number of - # hours) returns no jobs. - expected = [] - self._test(0, 1, 61, expected) - - def test_61_minutes_2_hosts(self): - # FIXME: Anything greater than 1 hour (unless an integer number of - # hours) returns no jobs. - expected = [] - self._test(0, 1, 61, expected) - - expected = [] - self._test(1, 2, 61, expected) + def test_1_hour_1_host(self): + expected = [{"min": 0, "hour": "*", "day": "*"}] + self._test(0, 1, 60, expected)