Added zookeeper based locking so only one process runs at a time.
This commit is contained in:
parent
dd5bb8984a
commit
91bf693882
|
@ -18,6 +18,10 @@ There are three internal queues:
|
|||
|
||||
Notification classes inherit from the notification abstract class and implement their specific notification method.
|
||||
|
||||
There is a special KafkaCommitTracker class that keeps track of the last committed message and ones available for
|
||||
commit, it then periodically commits all progress. This allows us to handle the situation where alarms that are not
|
||||
acted on are quickly ready for commit but others which are prior to them in the kafka order are still in progress.
|
||||
|
||||
## High Availability
|
||||
HA is handled by utilizing multiple partitions withing kafka. When multiple notification engines are running the partitions
|
||||
are spread out among them, as engines die/restart things reshuffle.
|
||||
|
@ -33,6 +37,8 @@ It is assumed the notification engine will be run by a process supervisor which
|
|||
Yaml config file by default in '/etc/mon/notification.yaml' process runs via upstart script.
|
||||
|
||||
# Future Considerations
|
||||
- Currently I lock the topic rather than the partitions. This effectively means there is only one active notification
|
||||
engine at any given time. In the future to share the load among multiple daemons we could lock by partition.
|
||||
- How fast is the mysql db? How much load do we put on it. Initially I think it makes most sense to read notification
|
||||
details for each alarm but eventually I may want to cache that info.
|
||||
- I am starting with a single KafkaConsumer and a single SentNotificationProcessor depending on load this may need
|
||||
|
|
|
@ -0,0 +1,32 @@
|
|||
|
||||
class KafkaCommitTracker(object):
|
||||
""" Tracks message offsets for a kafka topic and partitions.
|
||||
Uses zookeeper to keep track of the last committed offset.
|
||||
As messages are finished with processing the committed offset is updated
|
||||
"""
|
||||
|
||||
|
||||
# todo how do I have one object shared across processes? Maybe I don't maybe I use another internal
|
||||
# queue for finished notifications. This then runs in a thread that reads that queue and acts
|
||||
# appropriately.
|
||||
|
||||
|
||||
def __init__(self, url, topic):
|
||||
""" Setup the tracker
|
||||
url is the zookeeper hostname:port
|
||||
topic is the kafka topic to track
|
||||
"""
|
||||
self.zookeeper = url # todo
|
||||
self.topic = topic
|
||||
|
||||
def finish(self, partition, offset):
|
||||
""" Mark a message as finished.
|
||||
"""
|
||||
pass
|
||||
# todo what to do if a single alarm is holding up committing others for a long time?
|
||||
|
||||
def get_offsets(self, partitions=None):
|
||||
""" Return the offsets for specified partitions or all if partitions is None
|
||||
The return is a dictionary with key name being partition # and value the offset
|
||||
"""
|
||||
return {}
|
|
@ -11,6 +11,6 @@ X-Python-Version: >= 2.6
|
|||
Package: mon-notification
|
||||
Architecture: all
|
||||
Section: python
|
||||
Depends: ${misc:Depends}, ${python:Depends}, libpython2.7, python-pkg-resources, kafka-python, python-yaml, python-mysqldb
|
||||
Depends: ${misc:Depends}, ${python:Depends}, libpython2.7, python-pkg-resources, kafka-python, python-yaml, python-mysqldb, python-kazoo
|
||||
Description: Notification engine for monitoring.
|
||||
Consumes alarms from Kafka and sends notifications appropriately.
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
kafka-python
|
||||
kazoo
|
||||
pyodbc
|
||||
pkg-resources
|
||||
yaml
|
||||
|
|
|
@ -3,6 +3,8 @@ import logging
|
|||
from kafka.client import KafkaClient
|
||||
from kafka.consumer import SimpleConsumer
|
||||
|
||||
from commit_tracker import KafkaCommitTracker
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
class KafkaConsumer(object):
|
||||
|
@ -15,24 +17,24 @@ class KafkaConsumer(object):
|
|||
being consumed from.
|
||||
For more information see, https://github.com/mumrah/kafka-python/issues/112
|
||||
"""
|
||||
def __init__(self, url, group, topic, queue):
|
||||
def __init__(self, kafka_url, group, topic, queue, zookeeper_url):
|
||||
"""
|
||||
url, group, topic - kafka connection details
|
||||
kafka_url, group, topic - kafka connection details
|
||||
queue - a queue to publish log entries to
|
||||
"""
|
||||
self.kafka = KafkaClient(url)
|
||||
|
||||
# Todo connect to zookeeper and make a znode where I can track which consumers to consume from.
|
||||
# Initially just grab them all, in the future probably should be spread out more, add this to future considerations
|
||||
self.kafka = KafkaClient(kafka_url)
|
||||
|
||||
# No autocommit, it does not work with kafka 0.8.0 - see https://github.com/mumrah/kafka-python/issues/118
|
||||
self.consumer = SimpleConsumer(self.kafka, group, topic, auto_commit=False)
|
||||
self.queue = queue
|
||||
self.tracker = KafkaCommitTracker(zookeeper_url, topic)
|
||||
|
||||
def run(self):
|
||||
""" Consume from kafka and place alarm objects on the queue
|
||||
This quite intentionally does not ack
|
||||
"""
|
||||
# Set current offsets to the last known position
|
||||
self.consumer.offsets.update(self.tracker.get_offsets())
|
||||
|
||||
for message in self.consumer:
|
||||
log.debug("Consuming message from kafka - value = %s" % message.message.value)
|
||||
if self.queue.full():
|
||||
|
|
57
main.py
57
main.py
|
@ -10,7 +10,12 @@ from multiprocessing import Process, Queue
|
|||
import os
|
||||
import signal
|
||||
import sys
|
||||
import time
|
||||
|
||||
from kazoo.client import KazooClient
|
||||
from kazoo.exceptions import KazooException
|
||||
|
||||
from commit_tracker import KafkaCommitTracker
|
||||
from kafka_consumer import KafkaConsumer
|
||||
from processors.alarm import AlarmProcessor
|
||||
from processors.notification import NotificationProcessor
|
||||
|
@ -20,12 +25,45 @@ log = logging.getLogger(__name__)
|
|||
processors = [] # global list to facilitate clean signal handling
|
||||
|
||||
|
||||
def clean_exit(signum, frame):
|
||||
""" Exit cleanly on defined signals
|
||||
def clean_exit(signum, frame=None):
|
||||
""" Exit all processes cleanly
|
||||
Can be called on an os signal or no zookeeper loosing connection.
|
||||
"""
|
||||
# todo - Figure out good exiting. For most situations, make sure it all shuts down nicely, finishing up anything in the queue
|
||||
for process in processors:
|
||||
process.terminate()
|
||||
sys.exit(0)
|
||||
|
||||
|
||||
def get_zookeeper_lock(url, topic):
|
||||
""" Grab a lock in zookeeper or if not available retry in 30x
|
||||
Add a listener to stop processes on
|
||||
"""
|
||||
topic_path = '/consumers/mon-notification/%s' % topic
|
||||
zookeeper = KazooClient(url)
|
||||
zookeeper.start()
|
||||
|
||||
while True:
|
||||
# The path is ephemeral so if it exists wait then cycle again
|
||||
if zookeeper.exists(topic_path):
|
||||
log.info('Another process has the lock for topic %s, waiting then retrying.' % topic)
|
||||
time.sleep(30)
|
||||
continue
|
||||
|
||||
try:
|
||||
zookeeper.create(topic_path, ephemeral=True, makepath=True)
|
||||
except KazooException, e:
|
||||
# If creating the path fails something beat us to it most likely, try again
|
||||
log.debug('Error creating lock path %s\n%s' % (topic_path, e))
|
||||
continue
|
||||
else:
|
||||
# Succeeded in grabbing the lock continue
|
||||
log.info('Grabbed lock for topic %s' % topic)
|
||||
break
|
||||
|
||||
# Set up a listener to exit if we loose connection, this always exits even if the zookeeper connection is only
|
||||
# suspended, the process should be supervised so it starts right back up again.
|
||||
zookeeper.add_listener(clean_exit)
|
||||
|
||||
|
||||
def main(argv=None):
|
||||
|
@ -48,6 +86,8 @@ def main(argv=None):
|
|||
logging.basicConfig(level=logging.DEBUG)
|
||||
# logging.basicConfig(format='%(asctime)s %(message)s', filename=log_path, level=logging.INFO)
|
||||
|
||||
# Todo review the code structure, is there a better design I could use?
|
||||
|
||||
#Create the queues
|
||||
alarms = Queue(config['queues']['alarms_size'])
|
||||
notifications = Queue(config['queues']['notifications_size'])
|
||||
|
@ -55,7 +95,7 @@ def main(argv=None):
|
|||
|
||||
## Define processes
|
||||
#start KafkaConsumer
|
||||
kafka = Process(target=KafkaConsumer(config['kafka']['url'], config['kafka']['group'], config['kafka']['alarm_topic'], alarms).run)
|
||||
kafka = Process(target=KafkaConsumer(config['kafka']['url'], config['kafka']['group'], config['kafka']['alarm_topic'], alarms, config['zookeeper']['url']).run)
|
||||
processors.append(kafka)
|
||||
|
||||
# #Define AlarmProcessors
|
||||
|
@ -71,21 +111,22 @@ def main(argv=None):
|
|||
# processors.extend(notification_processors)
|
||||
#
|
||||
#Define SentNotificationProcessor
|
||||
tracker = KafkaCommitTracker(config['zookeeper']['url'], config['kafka']['alarm_topic'])
|
||||
# todo temp setup with the wrong queue to just test kafka basics
|
||||
sent_notification_processor = Process(target=SentNotificationProcessor(config['kafka']['url'], config['kafka']['group'], config['kafka']['alarm_topic'], config['kafka']['notification_topic'], alarms).run)
|
||||
sent_notification_processor = Process(target=SentNotificationProcessor(config['kafka']['url'], config['kafka']['notification_topic'], alarms, tracker).run)
|
||||
# sent_notification_processor = Process(
|
||||
# target=SentNotificationProcessor(
|
||||
# config['kafka']['url'],
|
||||
# config['kafka']['group'],
|
||||
# config['kafka']['alarm_topic'],
|
||||
# config['kafka']['notification_topic'],
|
||||
# sent_notifications
|
||||
# sent_notifications,
|
||||
# tracker
|
||||
# ).run
|
||||
# )
|
||||
processors.append(sent_notification_processor)
|
||||
|
||||
## Start
|
||||
signal.signal(signal.SIGTERM, clean_exit)
|
||||
get_zookeeper_lock(config['zookeeper']['url'], config['kafka']['alarm_topic'])
|
||||
try:
|
||||
log.info('Starting processes')
|
||||
for process in processors:
|
||||
|
@ -95,7 +136,7 @@ def main(argv=None):
|
|||
for process in processors:
|
||||
process.terminate()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(main())
|
||||
|
||||
|
||||
|
|
|
@ -16,3 +16,6 @@ queues:
|
|||
alarms_size: 1024
|
||||
notifications_size: 1024
|
||||
sent_notifications_size: 1024
|
||||
|
||||
zookeeper:
|
||||
url: localhost:2181
|
||||
|
|
|
@ -9,29 +9,25 @@ log = logging.getLogger(__name__)
|
|||
|
||||
class SentNotificationProcessor(object):
|
||||
""" Processes notifications which have been sent
|
||||
This involves adding them into a kafka topic for persisting by another process and updating the offset of the
|
||||
last processed alarm.
|
||||
This involves adding them into a kafka topic for persisting by another process and marking that alarm as finished.
|
||||
"""
|
||||
|
||||
def __init__(self, url, group, alarm_topic, notification_topic, queue):
|
||||
def __init__(self, url, topic, queue, tracker):
|
||||
"""
|
||||
url, group - kafka connection details
|
||||
alarm_topic - kafka topic to commit reads from, completes cycle started in kafka_consumer
|
||||
notification_topic - kafka topic to publish notifications to
|
||||
topic - kafka topic to publish notifications to
|
||||
queue - a queue to read notifications from
|
||||
"""
|
||||
self.kafka = KafkaClient(url)
|
||||
self.alarm_consumer = SimpleConsumer(self.kafka, group, alarm_topic, auto_commit=False)
|
||||
#alarm_consumer = SimpleConsumer(self.kafka, group, alarm_topic, auto_commit=False)
|
||||
# self.alarms = KafkaCommitTracker(alarm_consumer)
|
||||
self.notification_producer = SimpleProducer(
|
||||
self.producer = SimpleProducer(
|
||||
self.kafka,
|
||||
async=False,
|
||||
req_acks=SimpleProducer.ACK_AFTER_LOCAL_WRITE,
|
||||
ack_timeout=2000
|
||||
)
|
||||
|
||||
self.notification_topic = notification_topic
|
||||
self.topic = topic
|
||||
self.tracker = tracker
|
||||
self.queue = queue
|
||||
|
||||
def run(self):
|
||||
|
@ -40,10 +36,9 @@ class SentNotificationProcessor(object):
|
|||
while True:
|
||||
# todo I expect the message format to change, the .message.value is just for test
|
||||
message = self.queue.get().message.value
|
||||
responses = self.notification_producer.send_messages(self.notification_topic, message)
|
||||
responses = self.producer.send_messages(self.topic, message)
|
||||
for resp in responses:
|
||||
if resp.error != 0:
|
||||
log.error('Error publishing to %s topic, error message %s, offset #%d' %
|
||||
(self.notification_topic, resp.error, resp.offset))
|
||||
|
||||
# self.alarms.finished(message.id)
|
||||
(self.topic, resp.error, resp.offset))
|
||||
self.tracker.finish(message.partition, message.offset)
|
||||
|
|
Loading…
Reference in New Issue