462 lines
15 KiB
Python
462 lines
15 KiB
Python
# Licensed under the Apache License, Version 2.0 (the "License"); you may
|
|
# not use this file except in compliance with the License. You may obtain
|
|
# a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
|
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
|
# License for the specific language governing permissions and limitations
|
|
# under the License.
|
|
|
|
import collections
|
|
import contextlib
|
|
import errno
|
|
import logging
|
|
import os
|
|
import random
|
|
import signal
|
|
import socket
|
|
import sys
|
|
import threading
|
|
import time
|
|
|
|
import setproctitle
|
|
|
|
LOG = logging.getLogger(__name__)
|
|
|
|
SIGNAL_TO_NAME = dict((getattr(signal, name), name) for name in dir(signal)
|
|
if name.startswith("SIG") and name not in ('SIG_DFL',
|
|
'SIG_IGN'))
|
|
|
|
|
|
class _ServiceConfig(object):
|
|
def __init__(self, service, workers, args, kwargs):
|
|
self.service = service
|
|
self.workers = workers
|
|
self.args = args
|
|
self.kwargs = kwargs
|
|
|
|
|
|
def _spawn(target):
|
|
t = threading.Thread(target=target)
|
|
t.daemon = True
|
|
t.start()
|
|
return t
|
|
|
|
|
|
@contextlib.contextmanager
|
|
def _exit_on_exception():
|
|
try:
|
|
yield
|
|
except SystemExit as exc:
|
|
os._exit(exc.code)
|
|
except BaseException:
|
|
LOG.exception('Unhandled exception')
|
|
os._exit(2)
|
|
|
|
|
|
class Service(object):
|
|
"""Base class for a service
|
|
|
|
This class will be executed in a new child process of a
|
|
:py:class:`ServiceRunner`. It registers signals to manager the reloading
|
|
and the ending of the process.
|
|
|
|
Methods :py:meth:`run`, :py:meth:`terminate` and :py:meth:`reload` are
|
|
optional.
|
|
"""
|
|
|
|
name = None
|
|
"""Service name used in the process title and the log messages in additionnal
|
|
of the worker_id."""
|
|
|
|
def __init__(self, worker_id):
|
|
"""Create a new Service
|
|
|
|
:param worker_id: the identifier of this service instance
|
|
:type worker_id: int
|
|
"""
|
|
super(Service, self).__init__()
|
|
if self.name is None:
|
|
self.name = self.__class__.__name__
|
|
self.worker_id = worker_id
|
|
self.pid = os.getpid()
|
|
|
|
pname = os.path.basename(sys.argv[0])
|
|
self._title = "%(name)s(%(worker_id)d) [%(pid)d]" % dict(
|
|
name=self.name, worker_id=self.worker_id, pid=self.pid)
|
|
|
|
# Set process title
|
|
setproctitle.setproctitle(
|
|
"%(pname)s - %(name)s(%(worker_id)d)" % dict(
|
|
pname=pname, name=self.name,
|
|
worker_id=self.worker_id))
|
|
|
|
def terminate(self):
|
|
"""Gracefully shutdown the service
|
|
|
|
This method will be executed when the Service has to shutdown cleanly.
|
|
|
|
If not implemented the process will just end with status 0.
|
|
|
|
To customize the exit code, the :py:class:`SystemExit` exception can be
|
|
used.
|
|
|
|
"""
|
|
|
|
def reload(self):
|
|
"""Reloading of the service
|
|
|
|
This method will be executed when the Service receives a SIGHUP.
|
|
|
|
If not implemented the process will just end with status 0 and
|
|
:py:class:`ServiceRunner` will start a new fresh process for this
|
|
service with the same worker_id.
|
|
"""
|
|
self._clean_exit()
|
|
|
|
def run(self):
|
|
"""Method representing the service activity
|
|
|
|
If not implemented the process will just wait to receive an ending
|
|
signal.
|
|
"""
|
|
|
|
def _run(self):
|
|
LOG.debug("Run service %s" % self._title)
|
|
with _exit_on_exception():
|
|
self.run()
|
|
|
|
def _reload(self, sig, frame):
|
|
with _exit_on_exception():
|
|
self.reload()
|
|
|
|
def _clean_exit(self, *args, **kwargs):
|
|
signal.signal(signal.SIGTERM, signal.SIG_IGN)
|
|
LOG.info('Caught SIGTERM signal, '
|
|
'graceful exiting of service %s' % self._title)
|
|
with _exit_on_exception():
|
|
self.terminate()
|
|
sys.exit(0)
|
|
|
|
|
|
class ServiceManager(object):
|
|
"""Manage lifetimes of services
|
|
|
|
:py:class:`ServiceManager` acts as a master process that controls the
|
|
lifetime of children processes and restart them if they die unexpectedly.
|
|
It also propagate some signals (SIGTERM, SIGALRM, SIGINT and SIGHUP) to
|
|
them.
|
|
|
|
Each child process runs an instance of a :py:class:`Service`.
|
|
|
|
An application must create only one :py:class:`ServiceManager` class and
|
|
use :py:meth:`ServiceManager.run()` as main loop of the application.
|
|
|
|
|
|
|
|
Usage::
|
|
|
|
class MyService(Service):
|
|
def __init__(self, worker_id, myconf):
|
|
super(MyService, self).__init__(worker_id)
|
|
preparing_my_job(myconf)
|
|
self.running = True
|
|
|
|
def run(self):
|
|
while self.running:
|
|
do_my_job()
|
|
|
|
def terminate(self):
|
|
self.running = False
|
|
gracefully_stop_my_jobs()
|
|
|
|
def reload(self):
|
|
restart_my_job()
|
|
|
|
conf = {'foobar': 2}
|
|
sr = ServiceManager()
|
|
sr.add(MyService, 5, conf)
|
|
sr.run()
|
|
|
|
This will create 5 children processes running the service MyService.
|
|
|
|
"""
|
|
|
|
_marker = object()
|
|
_process_runner_already_created = False
|
|
|
|
def __init__(self, wait_interval=0.01):
|
|
"""Creates the ServiceManager object
|
|
|
|
:param wait_interval: time between each new process spawn
|
|
:type wait_interval: float
|
|
|
|
"""
|
|
|
|
if self._process_runner_already_created:
|
|
raise RuntimeError("Only one instance of ProcessRunner per "
|
|
"application is allowed")
|
|
ServiceManager._process_runner_already_created = True
|
|
|
|
self._wait_interval = wait_interval
|
|
self._shutdown = threading.Event()
|
|
|
|
self._running_services = collections.defaultdict(dict)
|
|
self._services = []
|
|
self._forktimes = []
|
|
self._current_process = None
|
|
|
|
# Try to create a session id if possible
|
|
try:
|
|
os.setsid()
|
|
except OSError:
|
|
pass
|
|
|
|
self.readpipe, self.writepipe = os.pipe()
|
|
|
|
signal.signal(signal.SIGTERM, self._clean_exit)
|
|
signal.signal(signal.SIGINT, self._fast_exit)
|
|
signal.signal(signal.SIGALRM, self._alarm_exit)
|
|
signal.signal(signal.SIGHUP, self._reload_services)
|
|
|
|
def add(self, service, workers=1, args=None, kwargs=None):
|
|
"""Add a new service to the ServiceManager
|
|
|
|
:param service: callable that return an instance of :py:class:`Service`
|
|
:type service: callable
|
|
:param workers: number of processes/workers for this service
|
|
:type workers: int
|
|
:param args: additional positional arguments for this service
|
|
:type args: tuple
|
|
:param kwargs: additional keywoard arguments for this service
|
|
:type kwargs: dict
|
|
"""
|
|
self._services.append(_ServiceConfig(service, workers, args, kwargs))
|
|
|
|
def run(self):
|
|
"""Start and supervise services
|
|
|
|
This method will start and supervise all children processes
|
|
until the master process asked to shutdown by a SIGTERM.
|
|
|
|
All spawned processes are part of the same unix process group.
|
|
"""
|
|
|
|
self._systemd_notify_once()
|
|
while not self._shutdown.is_set():
|
|
info = self._wait_service()
|
|
if info is not None:
|
|
# Restart this particular service
|
|
conf, worker_id = info
|
|
else:
|
|
for conf in self._services:
|
|
if len(self._running_services[conf]) < conf.workers:
|
|
worker_id = len(self._running_services[conf])
|
|
break
|
|
else:
|
|
time.sleep(self._wait_interval)
|
|
continue
|
|
|
|
pid = self._start_service(conf, worker_id)
|
|
self._running_services[conf][pid] = worker_id
|
|
|
|
LOG.debug("Killing services with signal SIGTERM")
|
|
os.killpg(0, signal.SIGTERM)
|
|
|
|
LOG.debug("Waiting services to terminate")
|
|
# NOTE(sileht): We follow the termination of our children only
|
|
# so we can't use waitpid(0, 0)
|
|
for conf in self._services:
|
|
for pid in self._running_services[conf]:
|
|
try:
|
|
os.waitpid(pid, 0)
|
|
except OSError as e:
|
|
if e.errno == errno.ECHILD:
|
|
pass
|
|
else:
|
|
raise
|
|
|
|
LOG.debug("Shutdown finish")
|
|
sys.exit(0)
|
|
|
|
def _wait_service(self):
|
|
"""Return the last died service or None"""
|
|
try:
|
|
# Don't block if no child processes have exited
|
|
pid, status = os.waitpid(0, os.WNOHANG)
|
|
if not pid:
|
|
return None
|
|
except OSError as exc:
|
|
if exc.errno not in (errno.EINTR, errno.ECHILD):
|
|
raise
|
|
return None
|
|
|
|
if os.WIFSIGNALED(status):
|
|
sig = SIGNAL_TO_NAME.get(os.WTERMSIG(status))
|
|
LOG.info('Child %(pid)d killed by signal %(sig)s',
|
|
dict(pid=pid, sig=sig))
|
|
else:
|
|
code = os.WEXITSTATUS(status)
|
|
LOG.info('Child %(pid)d exited with status %(code)d',
|
|
dict(pid=pid, code=code))
|
|
|
|
for conf in self._running_services:
|
|
if pid in self._running_services[conf]:
|
|
return conf, self._running_services[conf].pop(pid)
|
|
|
|
LOG.error('pid %d not in service list', pid)
|
|
|
|
def _reload_services(self, *args, **kwargs):
|
|
if self._shutdown.is_set():
|
|
# NOTE(sileht): We are in shutdown process no need
|
|
# to reload anything
|
|
return
|
|
|
|
# Reset forktimes to respawn services quickly
|
|
self._forktimes = []
|
|
signal.signal(signal.SIGHUP, signal.SIG_IGN)
|
|
os.killpg(0, signal.SIGHUP)
|
|
signal.signal(signal.SIGHUP, self._reload_services)
|
|
|
|
def _clean_exit(self, *args, **kwargs):
|
|
# Don't need to be called more.
|
|
signal.signal(signal.SIGTERM, signal.SIG_IGN)
|
|
LOG.info('Caught SIGTERM signal, graceful exiting of master process')
|
|
self._shutdown.set()
|
|
|
|
def _fast_exit(self, signo, frame,
|
|
reason='Caught SIGINT signal, instantaneous exiting'):
|
|
signal.signal(signal.SIGINT, signal.SIG_IGN)
|
|
signal.signal(signal.SIGALRM, signal.SIG_IGN)
|
|
LOG.info(reason)
|
|
os.killpg(0, signal.SIGINT)
|
|
os._exit(1)
|
|
|
|
def _alarm_exit(self, signo, frame):
|
|
self._fast_exit(signo, frame,
|
|
reason='Graceful shutdown timeout exceeded, '
|
|
'instantaneous exiting of master process')
|
|
|
|
def _slowdown_respawn_if_needed(self):
|
|
# Limit ourselves to one process a second (over the period of
|
|
# number of workers * 1 second). This will allow workers to
|
|
# start up quickly but ensure we don't fork off children that
|
|
# die instantly too quickly.
|
|
|
|
expected_children = sum(s.workers for s in self._services)
|
|
if len(self._forktimes) > expected_children:
|
|
if time.time() - self._forktimes[0] < expected_children:
|
|
LOG.info('Forking too fast, sleeping')
|
|
time.sleep(1)
|
|
self._forktimes.pop(0)
|
|
self._forktimes.append(time.time())
|
|
|
|
def _start_service(self, config, worker_id):
|
|
self._slowdown_respawn_if_needed()
|
|
|
|
pid = os.fork()
|
|
if pid != 0:
|
|
return pid
|
|
|
|
# reset parent signals
|
|
signal.signal(signal.SIGINT, signal.SIG_DFL)
|
|
signal.signal(signal.SIGALRM, signal.SIG_DFL)
|
|
signal.signal(signal.SIGTERM, signal.SIG_DFL)
|
|
signal.signal(signal.SIGHUP, signal.SIG_DFL)
|
|
|
|
# Close write to ensure only parent has it open
|
|
os.close(self.writepipe)
|
|
|
|
_spawn(self._watch_parent_process)
|
|
|
|
# Reseed random number generator
|
|
random.seed()
|
|
|
|
# Create and run a new service
|
|
with _exit_on_exception():
|
|
catched_signals = {
|
|
signal.SIGHUP: None,
|
|
signal.SIGTERM: None,
|
|
}
|
|
|
|
def signal_delayer(sig, frame):
|
|
signal.signal(signal.SIGTERM, signal.SIG_IGN)
|
|
LOG.info('Caught signal (%s) during service initialisation, '
|
|
'delaying it' % sig)
|
|
catched_signals[sig] = frame
|
|
|
|
# Setup temporary signals
|
|
signal.signal(signal.SIGHUP, signal_delayer)
|
|
signal.signal(signal.SIGTERM, signal_delayer)
|
|
|
|
# Initialize the service process
|
|
args = tuple() if config.args is None else config.args
|
|
kwargs = dict() if config.kwargs is None else config.kwargs
|
|
self._current_process = config.service(worker_id, *args, **kwargs)
|
|
|
|
# Setup final signals
|
|
if catched_signals[signal.SIGTERM] is not None:
|
|
self._current_process._clean_exit(
|
|
signal.SIGTERM, catched_signals[signal.SIGTERM])
|
|
signal.signal(signal.SIGTERM, self._current_process._clean_exit)
|
|
|
|
if catched_signals[signal.SIGHUP] is not None:
|
|
self._current_process._reload(
|
|
signal.SIGHUP, catched_signals[signal.SIGHUP])
|
|
signal.signal(signal.SIGHUP, self._current_process._reload)
|
|
|
|
# Start the main thread
|
|
_spawn(self._current_process._run)
|
|
|
|
# Wait forever
|
|
# NOTE(sileht): we cannot use threading.Event().wait() or
|
|
# threading.Thread().join() because of
|
|
# https://bugs.python.org/issue5315
|
|
while True:
|
|
time.sleep(1000000000)
|
|
|
|
def _watch_parent_process(self):
|
|
# This will block until the write end is closed when the parent
|
|
# dies unexpectedly
|
|
try:
|
|
os.read(self.readpipe, 1)
|
|
except EnvironmentError:
|
|
pass
|
|
|
|
if self._current_process is not None:
|
|
LOG.info('Parent process has died unexpectedly, %s exiting'
|
|
% self._current_process._title)
|
|
with _exit_on_exception():
|
|
self._current_process.terminate()
|
|
sys.exit(0)
|
|
|
|
else:
|
|
os._exit(0)
|
|
|
|
@staticmethod
|
|
def _systemd_notify_once():
|
|
"""Send notification once to Systemd that service is ready.
|
|
|
|
Systemd sets NOTIFY_SOCKET environment variable with the name of the
|
|
socket listening for notifications from services.
|
|
This method removes the NOTIFY_SOCKET environment variable to ensure
|
|
notification is sent only once.
|
|
"""
|
|
|
|
notify_socket = os.getenv('NOTIFY_SOCKET')
|
|
if notify_socket:
|
|
if notify_socket.startswith('@'):
|
|
# abstract namespace socket
|
|
notify_socket = '\0%s' % notify_socket[1:]
|
|
sock = socket.socket(socket.AF_UNIX, socket.SOCK_DGRAM)
|
|
with contextlib.closing(sock):
|
|
try:
|
|
sock.connect(notify_socket)
|
|
sock.sendall(b'READY=1')
|
|
del os.environ['NOTIFY_SOCKET']
|
|
except EnvironmentError:
|
|
LOG.debug("Systemd notification failed", exc_info=True)
|