Stop workflow on error

Enhance error handling and fail the maintenance session with state
MAINTENANCE_FAILED.

Story: 2003830
Task: #26600
Change-Id: I6fd1821aa42efce0ddbd1bc1f780c640c026d380
Signed-off-by: Tomi Juvonen <tomi.juvonen@nokia.com>
commit 36af47855e
parent e3ceda7232
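All the hunks below serve one pattern: instead of raising out of the workflow thread or using the loose 'FAILED' state, errors now land the session in MAINTENANCE_FAILED and the run loop idles. A minimal, self-contained sketch of that pattern (the constructor signature and the failing `maintenance` handler are illustrative, not the project's actual code):

    import logging
    import time
    from threading import Thread

    logging.basicConfig(level=logging.INFO)
    LOG = logging.getLogger(__name__)


    class BaseWorkflow(Thread):

        def __init__(self, session_id):
            super(BaseWorkflow, self).__init__()
            self.session_id = session_id
            self.stopped = False
            self.state = 'MAINTENANCE'
            # Map each state to the method implementing it
            self.states_methods = {'MAINTENANCE': 'maintenance'}

        def maintenance(self):
            # Illustrative handler: an unhandled error here used to kill
            # the thread; now it fails the session instead
            raise Exception('host evacuation failed')

        def run(self):
            LOG.info('%s: started' % self.session_id)
            while not self.stopped:
                if self.state not in ['MAINTENANCE_DONE', 'MAINTENANCE_FAILED']:
                    try:
                        statefunc = getattr(self, self.states_methods[self.state])
                        statefunc()
                    except Exception as e:
                        LOG.error('%s: %s raised exception: %s'
                                  % (self.session_id, statefunc, e))
                        self.state = 'MAINTENANCE_FAILED'
                else:
                    # Session finished or failed; idle until it is removed
                    time.sleep(1)


    if __name__ == '__main__':
        wf = BaseWorkflow('session-1')
        wf.start()
        time.sleep(2)
        assert wf.state == 'MAINTENANCE_FAILED'
        wf.stopped = True
        wf.join()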
@@ -178,7 +178,7 @@ class BaseWorkflow(Thread):
                                'PLANNED_MAINTENANCE': 'planned_maintenance',
                                'MAINTENANCE_COMPLETE': 'maintenance_complete',
                                'MAINTENANCE_DONE': 'maintenance_done',
-                               'FAILED': 'maintenance_failed'}
+                               'MAINTENANCE_FAILED': 'maintenance_failed'}
         self.url = "http://%s:%s" % (conf.host, conf.port)
         self.auth = get_identity_auth(conf.workflow_user,
                                       conf.workflow_password,
@@ -244,9 +244,14 @@ class BaseWorkflow(Thread):
     def run(self):
         LOG.info("%s: started" % self.session_id)
         while not self.stopped:
-            if self.state != "MAINTENANCE_DONE" and self.state != "FAILED":
-                statefunc = getattr(self, self.states_methods[self.state])
-                statefunc()
+            if self.state not in ["MAINTENANCE_DONE", "MAINTENANCE_FAILED"]:
+                try:
+                    statefunc = getattr(self, self.states_methods[self.state])
+                    statefunc()
+                except Exception as e:
+                    LOG.error("%s: %s Raised exception: %s" % (self.session_id,
+                              statefunc, e), exc_info=True)
+                    self.state = "MAINTENANCE_FAILED"
             else:
                 time.sleep(1)
                 # IDLE while session removed
@@ -106,9 +106,10 @@ class Workflow(BaseWorkflow):
                                                              project)
         reply_at = reply_time_str(self.conf.project_maintenance_reply)
         if is_time_after_time(reply_at, actions_at):
-            raise Exception('%s: No time for project to'
-                            ' answer in state: %s' %
-                            (self.session_id, state))
+            LOG.error('%s: No time for project to answer in state: %s' %
+                      (self.session_id, state))
+            self.state = "MAINTENANCE_FAILED"
+            return False
         metadata = self.session_data.metadata
         self._project_notify(project, instance_ids, allowed_actions,
                              actions_at, reply_at, state, metadata)
@@ -141,9 +142,11 @@ class Workflow(BaseWorkflow):
         prev_hostname = ''
         LOG.info('checking hypervisors for VCPU capacity')
         for hvisor in hvisors:
+            hostname = hvisor.__getattr__('hypervisor_hostname')
+            if hostname not in self.session_data.hosts:
+                continue
             vcpus = hvisor.__getattr__('vcpus')
             vcpus_used = hvisor.__getattr__('vcpus_used')
-            hostname = hvisor.__getattr__('hypervisor_hostname')
             if prev_vcpus != 0 and prev_vcpus != vcpus:
                 raise Exception('%s: %d vcpus on %s does not match to'
                                 '%d on %s'
@@ -315,7 +318,7 @@ class Workflow(BaseWorkflow):
             vm_state = server.__dict__.get('OS-EXT-STS:vm_state')
             LOG.info('server %s state %s' % (server_id, vm_state))
             last_vm_state = vm_state
-            retry_migrate = 5
+            retry_migrate = 2
             while True:
                 try:
                     server.migrate()
@@ -345,13 +348,16 @@ class Workflow(BaseWorkflow):

                 except BadRequest:
                     if retry_migrate == 0:
-                        raise Exception('server %s migrate failed' % server_id)
+                        LOG.error('server %s migrate failed after retries' %
+                                  server_id)
+                        return False
                     # Might take time for scheduler to sync inconsistent instance
                     # list for host
-                    retry_time = 180 - (retry_migrate * 30)
+                    # TBD Retry doesn't help, need investigating if reproduces
+                    retry_timeout = 150 - (retry_migrate * 60)
                     LOG.info('server %s migrate failed, retry in %s sec'
-                             % (server_id, retry_time))
-                    time.sleep(retry_time)
+                             % (server_id, retry_timeout))
+                    time.sleep(retry_timeout)
                 except Exception as e:
                     LOG.error('server %s migration failed, Exception=%s' %
                               (server_id, e))
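With retry_migrate now starting at 2, and assuming it is decremented once per attempt (outside this hunk), the retry waits become 150 - 2*60 = 30 s and then 150 - 1*60 = 90 s, after which the migration is reported failed and the workflow returns False instead of raising.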
@@ -373,11 +379,11 @@ class Workflow(BaseWorkflow):
         self.initialize_server_info()

         if not self.projects_listen_alarm('maintenance.scheduled'):
-            self.state = 'FAILED'
+            self.state = 'MAINTENANCE_FAILED'
             return

         if not self.confirm_maintenance():
-            self.state = 'FAILED'
+            self.state = 'MAINTENANCE_FAILED'
             return

         maintenance_empty_hosts = self.session_data.get_empty_hosts()
@@ -412,7 +418,7 @@ class Workflow(BaseWorkflow):
             LOG.info("%s: scale in" % self.session_id)

             if not self.confirm_scale_in():
-                self.state = 'FAILED'
+                self.state = 'MAINTENANCE_FAILED'
                 return
         # TBD it takes time to have proper infromation updated about free
         # capacity. Should make sure instances removed has also VCPUs removed
@@ -436,7 +442,7 @@ class Workflow(BaseWorkflow):
         LOG.info("%s: prepare_maintenance called" % self.session_id)
         host = self.find_host_to_be_empty()
         if not self.confirm_host_to_be_emptied(host, 'PREPARE_MAINTENANCE'):
-            self.state = 'FAILED'
+            self.state = 'MAINTENANCE_FAILED'
             return
         if not self.actions_to_have_empty_host(host):
             # TBD we found the hard way that we couldn't make host empty and
@@ -455,7 +461,7 @@ class Workflow(BaseWorkflow):
         empty_hosts = self.session_data.get_empty_hosts()
         if not empty_hosts:
             LOG.info("%s: No empty host to be maintained" % self.session_id)
-            self.state = 'FAILED'
+            self.state = 'MAINTENANCE_FAILED'
             return
         maintained_hosts = self.session_data.maintained_hosts
         if not maintained_hosts:
@@ -508,13 +514,13 @@ class Workflow(BaseWorkflow):
                      not_maintained_hosts))
             host = not_maintained_hosts[0]
             if not self.confirm_host_to_be_emptied(host, 'PLANNED_MAINTENANCE'):
-                self.state = 'FAILED'
+                self.state = 'MAINTENANCE_FAILED'
                 return
             if not self.actions_to_have_empty_host(host):
                 # Failure in here might indicate action to move instance failed.
                 # This might be as Nova VCPU capacity was not yet emptied from
                 # expected target hosts
-                self.state = 'FAILED'
+                self.state = 'MAINTENANCE_FAILED'
                 return
             self.update_server_info()
         self.state = 'START_MAINTENANCE'
@@ -523,7 +529,9 @@ class Workflow(BaseWorkflow):
         LOG.info("%s: maintenance_complete called" % self.session_id)
         LOG.info('Projects may still need to up scale back to full '
                  'capcity')
-        self.confirm_maintenance_complete()
+        if not self.confirm_maintenance_complete():
+            self.state = 'MAINTENANCE_FAILED'
+            return
         self.update_server_info()
         self.state = 'MAINTENANCE_DONE'