Better event checking timeouts

Previously we gave every event a 20 minute timeout. This meant that we
could eventually rollover on the day and start querying against current
indexes for data in older indexes. If this happens every query would
fail because we are looking in the wrong index. Every query failing
means we run the 20 minute timeout every time.

All this results in snowballing never being able to check if events are
indexed.

Address this by using the gerrit eventCreatedOn timestamp to determine
when our timeout is hit. We will timeout 20 minutes from that timestamp
regardless of how long interim processing has taken us. This should over
longer periods of time ensure we query the current index for current
events.

Change-Id: Ic9ed7fefae37d2668de5d89e0d06b8326eadfbb9
This commit is contained in:
Clark Boylan 2018-11-29 14:40:20 -08:00 committed by Matt Riedemann
parent f8c8cc1cda
commit 02d0651f29
2 changed files with 44 additions and 35 deletions

View File

@ -115,6 +115,7 @@ class FailEvent(object):
self.project = event['change']['project']
self.url = event['change']['url']
self.comment = event["comment"]
self.created_on = event["eventCreatedOn"]
# TODO(jogo): make FailEvent generate the jobs
self.failed_jobs = failed_jobs
self.config = config or er_conf.Config()
@ -257,12 +258,18 @@ class Stream(object):
def _does_es_have_data(self, event):
"""Wait till ElasticSearch is ready, but return False if timeout."""
NUMBER_OF_RETRIES = 30
SLEEP_TIME = 40
started_at = datetime.datetime.now()
# this checks that we've got the console log uploaded, need to retry
# We wait 20 minutes wall time since receiving the event until we
# treat the logs as missing
timeout = 1200
# Wait 40 seconds between queries.
sleep_time = 40
timed_out = False
# This checks that we've got the console log uploaded, need to retry
# in case ES goes bonkers on cold data, which it does some times.
for i in range(NUMBER_OF_RETRIES):
# We check at least once so that we can return success if data is
# there. But then only check again until we reach a timeout since
# the event was received.
while True:
try:
for job in event.failed_jobs:
# TODO(jogo): if there are three failed jobs and only the
@ -271,23 +278,32 @@ class Stream(object):
self._job_console_uploaded(
event.change, event.rev, job.name,
job.build_short_uuid)
self._has_required_files(
event.change, event.rev, job.name,
job.build_short_uuid)
break
except ConsoleNotReady as e:
self.log.debug(e)
time.sleep(SLEEP_TIME)
continue
except FilesNotReady as e:
self.log.info(e)
except pyelasticsearch.exceptions.InvalidJsonResponseError:
# If ElasticSearch returns an error code, sleep and retry
# TODO(jogo): if this works pull out search into a helper
# function that does this.
self.log.exception(
"Elastic Search not responding on attempt %d" % i)
time.sleep(SLEEP_TIME)
continue
else:
elapsed = format_timedelta(datetime.datetime.now() - started_at)
msg = ("Console logs not available after %ss for %s %d,%d,%s" %
"Elastic Search not responding")
# If we fall through then we had a failure of some sort.
# Wait until timeout is exceeded.
now = time.time()
if now > event.created_on + timeout:
# We've waited too long for this event, move on.
timed_out = True
break
time.sleep(sleep_time)
if timed_out:
elapsed = now - event.created_on
msg = ("Required files not ready after %ss for %s %d,%d,%s" %
(elapsed, job.name, event.change, event.rev,
job.build_short_uuid))
raise ResultTimedOut(msg)
@ -295,28 +311,10 @@ class Stream(object):
self.log.debug(
"Found hits for change_number: %d, patch_number: %d"
% (event.change, event.rev))
for i in range(NUMBER_OF_RETRIES):
try:
for job in event.failed_jobs:
self._has_required_files(
event.change, event.rev, job.name,
job.build_short_uuid)
self.log.info(
"All files present for change_number: %d, patch_number: %d"
% (event.change, event.rev))
time.sleep(10)
return True
except FilesNotReady as e:
self.log.info(e)
time.sleep(SLEEP_TIME)
# if we get to the end, we're broken
elapsed = format_timedelta(datetime.datetime.now() - started_at)
msg = ("Required files not ready after %ss for %s %d,%d,%s" %
(elapsed, job.name, event.change, event.rev,
job.build_short_uuid))
raise ResultTimedOut(msg)
self.log.info(
"All files present for change_number: %d, patch_number: %d"
% (event.change, event.rev))
return True
def get_failed_tempest(self):
self.log.debug("entering get_failed_tempest")

View File

@ -24,6 +24,7 @@
"revision": "74d75a1050c96eef498e9daaa80fcb9652003a20"
},
"type": "comment-added",
"eventCreatedOn": 1389371115,
"change": {
"topic": "bp\/mapping",
"url": "https:\/\/review.openstack.org\/60244",
@ -64,6 +65,7 @@
"revision": "10d31584a0696cd77200a8fee45900d2343e295e"
},
"type": "comment-added",
"eventCreatedOn": 1389384794,
"change": {
"topic": "65361",
"url": "https:\/\/review.openstack.org\/65361",
@ -104,6 +106,7 @@
"revision": "2e5a51c3f4e0121dbaf47c13b12809f6e7820d09"
},
"type": "comment-added",
"eventCreatedOn": 1389383805,
"change": {
"topic": "bp\/image-flavor-selection-ux-enhancement",
"url": "https:\/\/review.openstack.org\/63078",
@ -144,6 +147,7 @@
"revision": "3ca110f8dc6a552e951074faaca3a11ce65e40f3"
},
"type": "comment-added",
"eventCreatedOn": 1389306576,
"change": {
"topic": "slow-down-there-cowboy",
"url": "https:\/\/review.openstack.org\/65805",
@ -184,6 +188,7 @@
"revision": "4fe4f8b6dda390c432b2c6d2f6cff2152d3475f3"
},
"type": "comment-added",
"eventCreatedOn": 1389391347,
"change": {
"topic": "1187119-a",
"url": "https:\/\/review.openstack.org\/65378",
@ -224,6 +229,7 @@
"revision": "06b14616cb2a11514b0c151cc213c1ab5cea324b"
},
"type": "comment-added",
"eventCreatedOn": 1389378289,
"change": {
"topic": "bug\/1265108",
"url": "https:\/\/review.openstack.org\/64749",
@ -264,6 +270,7 @@
"revision": "06b14616cb2a11514b0c151cc213c1ab5cea324b"
},
"type": "comment-added",
"eventCreatedOn": 1389378289,
"change": {
"topic": "bug\/1265108",
"url": "https:\/\/review.openstack.org\/64750",
@ -304,6 +311,7 @@
"revision": "8044bcbfe8f10aa3f965992e30e9548258d6c736"
},
"type": "comment-added",
"eventCreatedOn": 1389387264,
"change": {
"topic": "bugs\/1267682",
"url": "https:\/\/review.openstack.org\/65902",
@ -344,6 +352,7 @@
"revision": "b05c9313f6873239897d7c88babdfa8ebe5fefed"
},
"type": "comment-added",
"eventCreatedOn": 1389388172,
"change": {
"topic": "bug\/1253497",
"url": "https:\/\/review.openstack.org\/66064",
@ -384,6 +393,7 @@
"revision": "fd9bee400259d370e837799236b0a4a6f0343438"
},
"type": "comment-added",
"eventCreatedOn": 1389354908,
"change": {
"topic": "del-param",
"url": "https:\/\/review.openstack.org\/65925",
@ -424,6 +434,7 @@
"revision": "af064c7a5304c672f7d71e4c8df5f69c195c739a"
},
"type": "comment-added",
"eventCreatedOn": 1387806669,
"change": {
"topic": "bug\/1259078",
"url": "https:\/\/review.openstack.org\/61074",