summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAleksandr Mogylchenko <amogylchenko@mirantis.com>2017-02-17 11:30:10 +0100
committerAleksandr Mogylchenko <amogylchenko@mirantis.com>2017-03-02 14:25:57 +0100
commit94b4940afa70ff1ba2b1f4f1bb7e1622391e84c5 (patch)
tree922f362f3c81c886cc98af665bedd57fb95c183b
parent3eb74a9dbd0b8456a280bbea82ee06b617fd962c (diff)
HA for etcd based on Statefulsets
- no external monitor, thus after a certain number of partial outages etcd might lose quorum; - allows specifying any etcd command-line arguments in config; Change-Id: Ib2b2b8bd9da2db4fb16914b6bb014fb38834c8e8
Notes
Notes (review): Verified+1: Mirantis CCP CI <mirantis-fuel-ccp-ci@mirantis.com> Code-Review+2: Proskurin Kirill <kproskurin@mirantis.com> Code-Review+2: Sergey Reshetnyak <sreshetniak@mirantis.com> Workflow+1: Sergey Reshetnyak <sreshetniak@mirantis.com> Verified+2: Jenkins Submitted-by: Jenkins Submitted-at: Tue, 07 Mar 2017 11:24:27 +0000 Reviewed-on: https://review.openstack.org/436490 Project: openstack/fuel-ccp-etcd Branch: refs/heads/master
-rw-r--r--service/etcd.yaml7
-rw-r--r--service/files/defaults.yaml4
-rw-r--r--service/files/entrypoint.py211
-rw-r--r--service/files/entrypoint.sh.j212
4 files changed, 219 insertions, 15 deletions
diff --git a/service/etcd.yaml b/service/etcd.yaml
index c46d998..e1707bd 100644
--- a/service/etcd.yaml
+++ b/service/etcd.yaml
@@ -1,6 +1,7 @@
1dsl_version: 0.4.0 1dsl_version: 0.4.0
2service: 2service:
3 name: etcd 3 name: etcd
4 kind: StatefulSet
4 ports: 5 ports:
5 - {{ etcd.client_port }} 6 - {{ etcd.client_port }}
6 - {{ etcd.server_port }} 7 - {{ etcd.server_port }}
@@ -12,7 +13,7 @@ service:
12 - name: etcd 13 - name: etcd
13 image: etcd 14 image: etcd
14 daemon: 15 daemon:
15 command: /opt/ccp/bin/entrypoint.sh 16 command: /opt/ccp/bin/entrypoint.py
16 files: 17 files:
17 - entrypoint 18 - entrypoint
18 # {% if etcd.tls.enabled %} 19 # {% if etcd.tls.enabled %}
@@ -22,8 +23,8 @@ service:
22 23
23files: 24files:
24 entrypoint: 25 entrypoint:
25 path: /opt/ccp/bin/entrypoint.sh 26 path: /opt/ccp/bin/entrypoint.py
26 content: entrypoint.sh.j2 27 content: entrypoint.py
27 perm: "0755" 28 perm: "0755"
28# {% if etcd.tls.enabled %} 29# {% if etcd.tls.enabled %}
29 server_certificate: 30 server_certificate:
diff --git a/service/files/defaults.yaml b/service/files/defaults.yaml
index b98955e..ae53c4d 100644
--- a/service/files/defaults.yaml
+++ b/service/files/defaults.yaml
@@ -8,6 +8,10 @@ configs:
8 cont: 2380 8 cont: 2380
9 tls: 9 tls:
10 enabled: true 10 enabled: true
11 token: cluster
12 additional_arguments:
13 election-timeout: 5000
14 heartbeat-interval: 250
11 15
12versions: 16versions:
13 etcd_version: v3.0.12 17 etcd_version: v3.0.12
diff --git a/service/files/entrypoint.py b/service/files/entrypoint.py
new file mode 100644
index 0000000..d20c938
--- /dev/null
+++ b/service/files/entrypoint.py
@@ -0,0 +1,211 @@
1#!/usr/bin/env python
2
3import functools
4import json
5import logging
6import requests
7import socket
8import subprocess
9import time
10import urlparse
11
12from requests.exceptions import RequestException, ConnectionError
# Logging setup: timestamps carry milliseconds and every record from DEBUG
# upwards is emitted.
LOG_FORMAT = "%(asctime)s.%(msecs)03d - %(levelname)s - %(message)s"
LOG_DATEFMT = "%Y-%m-%d %H:%M:%S"
logging.basicConfig(level=logging.DEBUG,
                    datefmt=LOG_DATEFMT,
                    format=LOG_FORMAT)
LOG = logging.getLogger(__name__)

# JSON file with global variables; it is handed to Configuration at startup.
GLOBALS_PATH = '/etc/ccp/globals/globals.json'
21
22
def retry(f):
    """Decorate ``f`` so that failed HTTP requests are attempted again.

    The number of attempts and the pause between them are read from the
    module-level ``config`` object (``connection_attempts`` and
    ``connection_delay``) every time the wrapped function is called.  Only
    ``requests`` exceptions trigger another attempt.  The last attempt is
    made outside of the ``try`` block, so its exception, if any, reaches the
    caller.
    """
    @functools.wraps(f)
    def wrap(*args, **kwargs):
        guarded_attempts = config.connection_attempts - 1
        delay = config.connection_delay
        for _ in range(guarded_attempts):
            try:
                return f(*args, **kwargs)
            except (RequestException, ConnectionError) as err:
                LOG.warning('Retrying in %d seconds because of %s', delay, err)
                time.sleep(delay)
        # Final, unguarded attempt.
        return f(*args, **kwargs)
    return wrap
37
38
class Configuration():
    """Settings needed to bootstrap or join the etcd cluster.

    Combines values from the CCP globals file with the local hostname and IP
    address and a few constants defined in ``__init__``.
    """

    def __init__(self, config_file):
        """Read ``config_file`` (JSON) and derive names, ports and URLs.

        A ``KeyError`` is raised when one of the top-level keys ``etcd``,
        ``namespace``, ``security`` or ``cluster_domain`` is absent.
        """
        LOG.info("Getting global variables from %s", config_file)
        with open(config_file) as f:
            global_conf = json.load(f)
        required = ('etcd', 'namespace', 'security', 'cluster_domain')
        values = dict((key, global_conf[key]) for key in required)
        etcd_conf = values['etcd']

        hostname = socket.gethostname()
        ipaddr = socket.gethostbyname(hostname)

        # Hard-coded settings.
        self.etcd_binary = '/usr/local/bin/etcd'
        self.connection_delay = 2
        self.connection_attempts = 5
        self.api_version = 'v2'

        # Settings taken from the globals file.
        self.client_port = int(etcd_conf['client_port']['cont'])
        self.server_port = int(etcd_conf['server_port']['cont'])
        self.tls = etcd_conf['tls']['enabled']
        self.token = etcd_conf['token']
        self.namespace = values['namespace']
        self.cluster_domain = values['cluster_domain']
        self.arguments = etcd_conf.get('additional_arguments', None)

        if not self.tls:
            self.host_template = 'http://%s:%d'
            self.verify_connectivity = False
        else:
            self.host_template = 'https://%s:%d'
            self.cert_file = '/opt/ccp/etc/tls/etcd_server_certificate.pem'
            self.key_file = '/opt/ccp/etc/tls/etcd_server_key.pem'
            self.ca_file = '/opt/ccp/etc/tls/ca.pem'
            # The CA file doubles as the ``verify`` argument for requests.
            self.verify_connectivity = self.ca_file

        # Represents fqdn service endpoint for etcd
        svc = "%s.%s.svc.%s" % ('etcd', self.namespace, self.cluster_domain)
        self.service = self.host_template % (svc, self.client_port)
        # URL to query when accessing etcd members api
        self.members_api = urlparse.urljoin(
            self.service, '%s/members/' % self.api_version)
        # When joining etcd cluster, members list is special:
        # <name>=<peerURL>,<name2>=<peerURL2>,...
        self.name = "%s.%s" % (hostname, svc)
        self.peer_url = self.host_template % (ipaddr, self.server_port)
        self.member_name = "%s=%s" % (self.name, self.peer_url)
81
82
def start_etcd(config, bootstrap=False, initial_members=None):
    """Assemble the etcd command line from ``config`` and run it.

    :param config: object providing name, ports, URL template, TLS settings,
        cluster token, optional extra arguments and the etcd binary path.
    :param bootstrap: when true, pass ``--initial-cluster`` with this node
        as the only member.
    :param initial_members: ``<name>=<peerURL>,...`` string; when given the
        node is started with ``--initial-cluster-state=existing``.

    The call goes through ``subprocess.check_call``, so it returns only once
    etcd exits and raises ``CalledProcessError`` on a non-zero exit code.
    """
    name = config.name
    client_host = config.host_template % (name, config.client_port)
    server_host = config.host_template % (name, config.server_port)

    listen_client_urls = client_host
    if config.tls:
        # We add insecure listener for checks
        listen_client_urls += ",http://%s:%s" % ('127.0.0.1',
                                                 config.client_port)

    args = [
        '--name=%s' % name,
        '--listen-peer-urls=%s' % server_host,
        '--listen-client-urls=%s' % listen_client_urls,
        '--advertise-client-urls=%s' % client_host,
        '--initial-advertise-peer-urls=%s' % server_host,
        '--initial-cluster-token=%s' % config.token,
    ]
    if config.tls:
        args.extend(['--peer-auto-tls',
                     '--cert-file=%s' % config.cert_file,
                     '--key-file=%s' % config.key_file])
    if bootstrap:
        args.append("--initial-cluster=%s=%s" % (name, server_host))
    if initial_members:
        args.extend(["--initial-cluster-state=existing",
                     "--initial-cluster=%s" % initial_members])
    if config.arguments:
        LOG.debug("Additional arguments are %s" % config.arguments)
        # Every key/value pair from the config becomes a --key=value flag.
        for key, value in config.arguments.iteritems():
            args.append("--%s=%s" % (key, value))

    cmd = [config.etcd_binary] + args
    LOG.info("Launching etcd with %s" % cmd)
    subprocess.check_call(cmd, shell=False)
116
117
@retry
def _add_etcd_member(members_api, peer_url):
    """Register ``peer_url`` as a new member through the etcd members API.

    Returns ``peer_url`` when etcd answers with 201.  For any other status
    ``raise_for_status()`` is called, so HTTP error codes surface as
    ``requests`` exceptions (which ``retry`` reacts to); a non-error status
    other than 201 results in ``None``.
    """
    response = requests.post(members_api,
                             json={'peerURLs': [peer_url]},
                             headers={'content-type': 'application/json'},
                             verify=config.verify_connectivity)
    # https://coreos.com/etcd/docs/latest/v2/members_api.html
    if response.status_code == 201:
        return peer_url
    if response.status_code == 500:
        # Request failed, but might be processed later, not sure how to handle
        LOG.debug('Etcd cluster returned 500, might be busy...')
    response.raise_for_status()
133
134
@retry
def _delete_etcd_member(members_api, name):
    """Remove the member called ``name`` from the etcd cluster.

    The HTTP API needs the id of the member to delete it, so the member list
    is fetched first and the id is looked up by name - 2 calls total.

    :param members_api: URL of the etcd members collection.
    :param name: member name to remove.
    :returns: the fetched peers without the one called ``name``.  ``None``
        if etcd answers the DELETE with a non-error status other than 204.
    :raises requests.exceptions.RequestException: on HTTP error statuses
        (via ``raise_for_status()``).
    """
    peers = _get_etcd_members(members_api)
    _id = _get_etcd_member_id(peers, name)
    remaining = [p for p in peers if p['name'] != name]
    if _id is None:
        # Nothing to delete.  Without this guard urljoin() would hand back
        # the collection URL unchanged and the DELETE would be sent to the
        # members collection itself, failing on every retry.
        LOG.debug("%s not found among etcd members, nothing to delete", name)
        return remaining
    LOG.debug("Deleting %s with id %s from etcd cluster...", name, _id)
    url = urlparse.urljoin(members_api, _id)
    r = requests.delete(url, verify=config.verify_connectivity)
    if r.status_code == 204:
        return remaining
    LOG.debug("Delete failed with error %i", r.status_code)
    r.raise_for_status()
150
151
@retry
def _get_etcd_members(members_api):
    """Fetch the member list from the etcd members API.

    Returns the ``members`` entry of the JSON answer on HTTP 200.  Any other
    status goes through ``raise_for_status()``; if that does not raise, the
    result is ``None``.
    """
    response = requests.get(members_api, verify=config.verify_connectivity)
    if response.status_code != 200:
        response.raise_for_status()
        return None
    return response.json()['members']
161
162
def _etcd_members_as_string(peers):
    """Render ``peers`` as ``<name>=<peerURL>,<name2>=<peerURL2>,...``.

    Peers with an empty name are left out; only the first peer URL of each
    remaining member is used.
    """
    return ",".join("%s=%s" % (peer['name'], peer['peerURLs'][0])
                    for peer in peers if peer['name'])
170
171
def _get_etcd_member_id(peers, name):
    """Return the id of the first peer called ``name``, or ``None``."""
    matching_ids = [peer['id'] for peer in peers if peer['name'] == name]
    return matching_ids[0] if matching_ids else None
179
180
if __name__ == "__main__":
    # Entry point: either bootstrap a new etcd cluster or join the existing
    # one, depending on whether the etcd service answers.
    #
    # NOTE: ``config`` has to stay a module-level name, the ``retry``
    # decorator and the HTTP helpers read it as a global.
    config = Configuration(GLOBALS_PATH)
    etcd_members_api = config.members_api
    try:
        # The only reliable way to determine if etcd cluster exists is to query
        # service.
        peers = _get_etcd_members(etcd_members_api)
        members = _etcd_members_as_string(peers)
    except ConnectionError:
        LOG.debug("No one seems to be alive...")
        members = ""
    if not members:
        # TODO(amnk): add recovery from complete disaster (e.g. restore data
        # from data-dir if it is available)
        LOG.debug("I'm a leader, starting...")
        start_etcd(config, bootstrap=True)
    else:
        if config.name in members:
            # If we find our hostname in existing members, we are recovering
            # from some failure. Since we cannot guarantee having all needed
            # data on new node, we need to delete ourselves before joining.
            LOG.debug("Found myself in members...")
            new_peers = _delete_etcd_member(etcd_members_api, config.name)
            new_members = _etcd_members_as_string(new_peers)
        else:
            new_members = members
        LOG.debug("Adding myself to cluster %s...", etcd_members_api)
        _add_etcd_member(etcd_members_api, config.peer_url)
        # Drop empty parts so the list never starts with a stray comma when
        # no other named member is left after the deletion above.
        all_members = ",".join(
            part for part in (new_members, config.member_name) if part)
        # Log the list that is actually passed to etcd.
        LOG.debug("Joining %s", all_members)
        start_etcd(config, initial_members=all_members)
diff --git a/service/files/entrypoint.sh.j2 b/service/files/entrypoint.sh.j2
deleted file mode 100644
index 75d4c3d..0000000
--- a/service/files/entrypoint.sh.j2
+++ /dev/null
@@ -1,12 +0,0 @@
1#!/usr/bin/env bash
2
3{% if etcd.tls.enabled %}
4etcd --listen-client-urls=https://{{ network_topology["private"]["address"] }}:{{ etcd.client_port.cont }},http://127.0.0.1:{{ etcd.client_port.cont }}\
5 --advertise-client-urls=https://{{ address("etcd", etcd.client_port, with_scheme=False) }}\
6 --peer-auto-tls\
7 --cert-file=/opt/ccp/etc/tls/etcd_server_certificate.pem\
8 --key-file=/opt/ccp/etc/tls/etcd_server_key.pem\
9{% else %}
10etcd --listen-client-urls http://0.0.0.0:{{ etcd.client_port.cont }}\
11 --advertise-client-urls {{ address("etcd", etcd.client_port, with_scheme=True) }}
12{% endif %}