Added zookeeper based locking so only one process runs at a time.

Tim Kuhlman 2014-03-06 17:24:29 -07:00
parent dd5bb8984a
commit 91bf693882
8 changed files with 110 additions and 30 deletions

View File

@@ -18,6 +18,10 @@ There are three internal queues:
Notification classes inherit from the notification abstract class and implement their specific notification method.
There is a special KafkaCommitTracker class that keeps track of the last committed message and of the messages available
for commit; it periodically commits all progress. This allows us to handle the situation where alarms that are not
acted on are quickly ready for commit while others earlier in the kafka order are still in progress.
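As a minimal sketch of that bookkeeping (illustrative names, not code from this commit): for each partition, only the
longest contiguous run of finished offsets past the last commit is safe to commit.

def highest_committable(last_committed, finished_offsets):
    """ Return the highest offset safe to commit for one partition.
    Offsets past a gap stay uncommitted: a finished alarm cannot be
    committed ahead of an earlier alarm that is still in progress.
    """
    offset = last_committed
    while offset + 1 in finished_offsets:
        offset += 1
    return offset

# Offsets 5 and 7 are finished but 6 is still in progress, so only 5 commits
assert highest_committable(4, set([5, 7])) == 5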
## High Availability
HA is handled by utilizing multiple partitions within kafka. When multiple notification engines are running, the partitions
are spread out among them; as engines die or restart, the partitions are reshuffled.
@@ -33,6 +37,8 @@ It is assumed the notification engine will be run by a process supervisor which
The YAML config file is read from '/etc/mon/notification.yaml' by default; the process runs via an upstart script.
# Future Considerations
- Currently I lock the topic rather than the partitions. This effectively means there is only one active notification
engine at any given time. In the future, to share the load among multiple daemons, we could lock by partition (see the
sketch after this list).
- How fast is the mysql db? How much load do we put on it? Initially I think it makes most sense to read notification
details for each alarm, but eventually I may want to cache that info.
- I am starting with a single KafkaConsumer and a single SentNotificationProcessor; depending on load this may need
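A hypothetical sketch of the per-partition locking idea from the first bullet, assuming kazoo and one ephemeral znode
per partition; the names are illustrative and none of this is part of the commit:

from kazoo.client import KazooClient
from kazoo.exceptions import NodeExistsError

def lock_partitions(zookeeper_url, topic, partitions):
    """ Try to lock individual partitions instead of the whole topic.
    Returns the set of partitions this process now owns; other engines
    pick up the remainder, spreading the load.
    """
    zookeeper = KazooClient(zookeeper_url)
    zookeeper.start()
    owned = set()
    for partition in partitions:
        path = '/consumers/mon-notification/%s/%d' % (topic, partition)
        try:
            # Ephemeral znodes are released automatically if this process dies
            zookeeper.create(path, ephemeral=True, makepath=True)
            owned.add(partition)
        except NodeExistsError:
            continue  # another engine already owns this partition
    return owned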

32
commit_tracker.py Normal file
View File

@@ -0,0 +1,32 @@
class KafkaCommitTracker(object):
""" Tracks message offsets for a kafka topic and partitions.
Uses zookeeper to keep track of the last committed offset.
As messages finish processing, the committed offset is updated.
"""
# todo how do I have one object shared across processes? Maybe I don't; maybe I use another internal
# queue for finished notifications. This then runs in a thread that reads that queue and acts
# appropriately.
def __init__(self, url, topic):
""" Setup the tracker
url is the zookeeper hostname:port
topic is the kafka topic to track
"""
self.zookeeper = url # todo
self.topic = topic
def finish(self, partition, offset):
""" Mark a message as finished.
"""
pass
# todo what to do if a single alarm is holding up committing others for a long time?
def get_offsets(self, partitions=None):
""" Return the offsets for specified partitions or all if partitions is None
The return is a dictionary with key name being partition # and value the offset
"""
return {}
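The stubs above leave the zookeeper persistence open (see the todos); one possible shape for it, assuming kazoo and a
single json znode per topic (illustrative only, not what this commit implements):

import json

from kazoo.client import KazooClient

class ZookeeperOffsets(object):
    """ Illustrative offset store: one znode per topic holding a json map. """
    def __init__(self, url, topic):
        self.zookeeper = KazooClient(url)
        self.zookeeper.start()
        self.path = '/consumers/mon-notification/%s/offsets' % topic
        self.zookeeper.ensure_path(self.path)

    def commit(self, offsets):
        """ Persist a {partition: offset} dict. """
        self.zookeeper.set(self.path, json.dumps(offsets).encode('utf-8'))

    def get_offsets(self):
        data, _stat = self.zookeeper.get(self.path)
        if not data:
            return {}
        # json keys come back as strings; the consumer wants partition numbers
        return dict((int(k), v) for k, v in json.loads(data.decode('utf-8')).items())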

2
debian/control vendored
View File

@@ -11,6 +11,6 @@ X-Python-Version: >= 2.6
Package: mon-notification
Architecture: all
Section: python
Depends: ${misc:Depends}, ${python:Depends}, libpython2.7, python-pkg-resources, kafka-python, python-yaml, python-mysqldb
Depends: ${misc:Depends}, ${python:Depends}, libpython2.7, python-pkg-resources, kafka-python, python-yaml, python-mysqldb, python-kazoo
Description: Notification engine for monitoring.
Consumes alarms from Kafka and sends notifications appropriately.

View File

@@ -1,4 +1,5 @@
kafka-python
kazoo
pyodbc
pkg-resources
yaml

View File

@@ -3,6 +3,8 @@ import logging
from kafka.client import KafkaClient
from kafka.consumer import SimpleConsumer
from commit_tracker import KafkaCommitTracker
log = logging.getLogger(__name__)
class KafkaConsumer(object):
@@ -15,24 +17,24 @@ class KafkaConsumer(object):
being consumed from.
For more information, see https://github.com/mumrah/kafka-python/issues/112
"""
def __init__(self, url, group, topic, queue):
def __init__(self, kafka_url, group, topic, queue, zookeeper_url):
"""
url, group, topic - kafka connection details
kafka_url, group, topic - kafka connection details
queue - a queue to publish alarm objects to
"""
self.kafka = KafkaClient(url)
# Todo connect to zookeeper and make a znode where I can track which consumers to consume from.
# Initially just grab them all, in the future probably should be spread out more, add this to future considerations
self.kafka = KafkaClient(kafka_url)
# No autocommit, it does not work with kafka 0.8.0 - see https://github.com/mumrah/kafka-python/issues/118
self.consumer = SimpleConsumer(self.kafka, group, topic, auto_commit=False)
self.queue = queue
self.tracker = KafkaCommitTracker(zookeeper_url, topic)
def run(self):
""" Consume from kafka and place alarm objects on the queue
This quite intentionally does not ack
"""
# Set current offsets to the last known position
self.consumer.offsets.update(self.tracker.get_offsets())
for message in self.consumer:
log.debug("Consuming message from kafka - value = %s" % message.message.value)
if self.queue.full():

57
main.py
View File

@@ -10,7 +10,12 @@ from multiprocessing import Process, Queue
import os
import signal
import sys
import time
from kazoo.client import KazooClient
from kazoo.exceptions import KazooException
from commit_tracker import KafkaCommitTracker
from kafka_consumer import KafkaConsumer
from processors.alarm import AlarmProcessor
from processors.notification import NotificationProcessor
@@ -20,12 +25,45 @@ log = logging.getLogger(__name__)
processors = [] # global list to facilitate clean signal handling
def clean_exit(signum, frame):
""" Exit cleanly on defined signals
def clean_exit(signum, frame=None):
""" Exit all processes cleanly
Can be called on an os signal or on zookeeper losing its connection.
"""
# todo - Figure out good exiting. For most situations, make sure it all shuts down nicely, finishing up anything in the queue
for process in processors:
process.terminate()
sys.exit(0)
def get_zookeeper_lock(url, topic):
""" Grab a lock in zookeeper or if not available retry in 30x
Add a listener to stop processes on
"""
topic_path = '/consumers/mon-notification/%s' % topic
zookeeper = KazooClient(url)
zookeeper.start()
while True:
# The znode is ephemeral, so if it exists wait and then cycle again
if zookeeper.exists(topic_path):
log.info('Another process has the lock for topic %s, waiting then retrying.' % topic)
time.sleep(30)
continue
try:
zookeeper.create(topic_path, ephemeral=True, makepath=True)
except KazooException as e:
# If creating the path fails, most likely another process beat us to it; try again
log.debug('Error creating lock path %s\n%s' % (topic_path, e))
continue
else:
# Succeeded in grabbing the lock; continue
log.info('Grabbed lock for topic %s' % topic)
break
# Set up a listener to exit if we lose the connection. This always exits even if the zookeeper connection is only
# suspended; the process should be supervised so it starts right back up again.
zookeeper.add_listener(clean_exit)
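For comparison, kazoo also ships a Lock recipe that handles the exists/create race in the loop above internally; a
minimal sketch of the same topic lock using it (an alternative, not what this commit does):

from kazoo.client import KazooClient

def get_zookeeper_lock_with_recipe(url, topic):
    zookeeper = KazooClient(url)
    zookeeper.start()
    lock = zookeeper.Lock('/consumers/mon-notification/%s' % topic)
    lock.acquire()  # blocks until the lock is ours; released if the session dies
    return zookeeper, lock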
def main(argv=None):
@@ -48,6 +86,8 @@ def main(argv=None):
logging.basicConfig(level=logging.DEBUG)
# logging.basicConfig(format='%(asctime)s %(message)s', filename=log_path, level=logging.INFO)
# Todo review the code structure, is there a better design I could use?
#Create the queues
alarms = Queue(config['queues']['alarms_size'])
notifications = Queue(config['queues']['notifications_size'])
@@ -55,7 +95,7 @@
## Define processes
#start KafkaConsumer
kafka = Process(target=KafkaConsumer(config['kafka']['url'], config['kafka']['group'], config['kafka']['alarm_topic'], alarms).run)
kafka = Process(target=KafkaConsumer(config['kafka']['url'], config['kafka']['group'], config['kafka']['alarm_topic'], alarms, config['zookeeper']['url']).run)
processors.append(kafka)
# #Define AlarmProcessors
@@ -71,21 +111,22 @@
# processors.extend(notification_processors)
#
#Define SentNotificationProcessor
tracker = KafkaCommitTracker(config['zookeeper']['url'], config['kafka']['alarm_topic'])
# todo temp setup with the wrong queue to just test kafka basics
sent_notification_processor = Process(target=SentNotificationProcessor(config['kafka']['url'], config['kafka']['group'], config['kafka']['alarm_topic'], config['kafka']['notification_topic'], alarms).run)
sent_notification_processor = Process(target=SentNotificationProcessor(config['kafka']['url'], config['kafka']['notification_topic'], alarms, tracker).run)
# sent_notification_processor = Process(
# target=SentNotificationProcessor(
# config['kafka']['url'],
# config['kafka']['group'],
# config['kafka']['alarm_topic'],
# config['kafka']['notification_topic'],
# sent_notifications
# sent_notifications,
# tracker
# ).run
# )
processors.append(sent_notification_processor)
## Start
signal.signal(signal.SIGTERM, clean_exit)
get_zookeeper_lock(config['zookeeper']['url'], config['kafka']['alarm_topic'])
try:
log.info('Starting processes')
for process in processors:
@@ -95,7 +136,7 @@
for process in processors:
process.terminate()
if __name__ == "__main__":
sys.exit(main())

View File

@@ -16,3 +16,6 @@ queues:
alarms_size: 1024
notifications_size: 1024
sent_notifications_size: 1024
zookeeper:
url: localhost:2181

View File

@@ -9,29 +9,25 @@ log = logging.getLogger(__name__)
class SentNotificationProcessor(object):
""" Processes notifications which have been sent
This involves adding them into a kafka topic for persisting by another process and updating the offset of the
last processed alarm.
This involves adding them into a kafka topic for persisting by another process and marking that alarm as finished.
"""
def __init__(self, url, group, alarm_topic, notification_topic, queue):
def __init__(self, url, topic, queue, tracker):
"""
url, group - kafka connection details
alarm_topic - kafka topic to commit reads from, completes cycle started in kafka_consumer
notification_topic - kafka topic to publish notifications to
topic - kafka topic to publish notifications to
queue - a queue to read notifications from
"""
self.kafka = KafkaClient(url)
self.alarm_consumer = SimpleConsumer(self.kafka, group, alarm_topic, auto_commit=False)
#alarm_consumer = SimpleConsumer(self.kafka, group, alarm_topic, auto_commit=False)
# self.alarms = KafkaCommitTracker(alarm_consumer)
self.notification_producer = SimpleProducer(
self.producer = SimpleProducer(
self.kafka,
async=False,
req_acks=SimpleProducer.ACK_AFTER_LOCAL_WRITE,
ack_timeout=2000
)
self.notification_topic = notification_topic
self.topic = topic
self.tracker = tracker
self.queue = queue
def run(self):
@@ -40,10 +36,9 @@ class SentNotificationProcessor(object):
while True:
# todo I expect the message format to change, the .message.value is just for test
msg = self.queue.get()
message = msg.message.value
responses = self.notification_producer.send_messages(self.notification_topic, message)
responses = self.producer.send_messages(self.topic, message)
for resp in responses:
if resp.error != 0:
log.error('Error publishing to %s topic, error message %s, offset #%d' %
(self.notification_topic, resp.error, resp.offset))
# self.alarms.finished(message.id)
(self.topic, resp.error, resp.offset))
self.tracker.finish(msg.partition, msg.offset)