Stop workflow on error

Enhance error handling and fail the maintenance session with state
MAINTENANCE_FAILED.

Story: 2003830
Task: #26600
Change-Id: I6fd1821aa42efce0ddbd1bc1f780c640c026d380
Signed-off-by: Tomi Juvonen <tomi.juvonen@nokia.com>
commit 36af47855e
parent e3ceda7232
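All the hunks below serve one pattern: instead of raising out of the workflow thread or using the loose 'FAILED' state, errors now land the session in MAINTENANCE_FAILED and the run loop idles. A minimal, self-contained sketch of that pattern (the constructor signature and the failing `maintenance` handler are illustrative, not the project's actual code):

    import logging
    import time
    from threading import Thread

    logging.basicConfig(level=logging.INFO)
    LOG = logging.getLogger(__name__)


    class BaseWorkflow(Thread):

        def __init__(self, session_id):
            super(BaseWorkflow, self).__init__()
            self.session_id = session_id
            self.stopped = False
            self.state = 'MAINTENANCE'
            # Map each state to the method implementing it
            self.states_methods = {'MAINTENANCE': 'maintenance'}

        def maintenance(self):
            # Illustrative handler: an unhandled error here used to kill
            # the thread; now it fails the session instead
            raise Exception('host evacuation failed')

        def run(self):
            LOG.info('%s: started' % self.session_id)
            while not self.stopped:
                if self.state not in ['MAINTENANCE_DONE', 'MAINTENANCE_FAILED']:
                    try:
                        statefunc = getattr(self, self.states_methods[self.state])
                        statefunc()
                    except Exception as e:
                        LOG.error('%s: %s raised exception: %s'
                                  % (self.session_id, statefunc, e))
                        self.state = 'MAINTENANCE_FAILED'
                else:
                    # Session finished or failed; idle until it is removed
                    time.sleep(1)


    if __name__ == '__main__':
        wf = BaseWorkflow('session-1')
        wf.start()
        time.sleep(2)
        assert wf.state == 'MAINTENANCE_FAILED'
        wf.stopped = True
        wf.join()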
@@ -178,7 +178,7 @@ class BaseWorkflow(Thread):
                                'PLANNED_MAINTENANCE': 'planned_maintenance',
                                'MAINTENANCE_COMPLETE': 'maintenance_complete',
                                'MAINTENANCE_DONE': 'maintenance_done',
-                               'FAILED': 'maintenance_failed'}
+                               'MAINTENANCE_FAILED': 'maintenance_failed'}
         self.url = "http://%s:%s" % (conf.host, conf.port)
         self.auth = get_identity_auth(conf.workflow_user,
                                       conf.workflow_password,
@@ -244,9 +244,14 @@ class BaseWorkflow(Thread):
     def run(self):
         LOG.info("%s: started" % self.session_id)
         while not self.stopped:
-            if self.state != "MAINTENANCE_DONE" and self.state != "FAILED":
-                statefunc = getattr(self, self.states_methods[self.state])
-                statefunc()
+            if self.state not in ["MAINTENANCE_DONE", "MAINTENANCE_FAILED"]:
+                try:
+                    statefunc = getattr(self, self.states_methods[self.state])
+                    statefunc()
+                except Exception as e:
+                    LOG.error("%s: %s Raised exception: %s" % (self.session_id,
+                              statefunc, e), exc_info=True)
+                    self.state = "MAINTENANCE_FAILED"
             else:
                 time.sleep(1)
                 # IDLE while session removed
@@ -106,9 +106,10 @@ class Workflow(BaseWorkflow):
                                                              project)
         reply_at = reply_time_str(self.conf.project_maintenance_reply)
         if is_time_after_time(reply_at, actions_at):
-            raise Exception('%s: No time for project to'
-                            ' answer in state: %s' %
-                            (self.session_id, state))
+            LOG.error('%s: No time for project to answer in state: %s' %
+                      (self.session_id, state))
+            self.state = "MAINTENANCE_FAILED"
+            return False
         metadata = self.session_data.metadata
         self._project_notify(project, instance_ids, allowed_actions,
                              actions_at, reply_at, state, metadata)
@@ -141,9 +142,11 @@ class Workflow(BaseWorkflow):
         prev_hostname = ''
         LOG.info('checking hypervisors for VCPU capacity')
         for hvisor in hvisors:
+            hostname = hvisor.__getattr__('hypervisor_hostname')
+            if hostname not in self.session_data.hosts:
+                continue
             vcpus = hvisor.__getattr__('vcpus')
             vcpus_used = hvisor.__getattr__('vcpus_used')
-            hostname = hvisor.__getattr__('hypervisor_hostname')
             if prev_vcpus != 0 and prev_vcpus != vcpus:
                 raise Exception('%s: %d vcpus on %s does not match to'
                                 '%d on %s'
@@ -315,7 +318,7 @@ class Workflow(BaseWorkflow):
             vm_state = server.__dict__.get('OS-EXT-STS:vm_state')
             LOG.info('server %s state %s' % (server_id, vm_state))
             last_vm_state = vm_state
-            retry_migrate = 5
+            retry_migrate = 2
             while True:
                 try:
                     server.migrate()
@@ -345,13 +348,16 @@ class Workflow(BaseWorkflow):

                 except BadRequest:
                     if retry_migrate == 0:
-                        raise Exception('server %s migrate failed' % server_id)
+                        LOG.error('server %s migrate failed after retries' %
+                                  server_id)
+                        return False
                     # Might take time for scheduler to sync inconsistent instance
                     # list for host
-                    retry_time = 180 - (retry_migrate * 30)
+                    # TBD Retry doesn't help, need investigating if reproduces
+                    retry_timeout = 150 - (retry_migrate * 60)
                     LOG.info('server %s migrate failed, retry in %s sec'
-                             % (server_id, retry_time))
-                    time.sleep(retry_time)
+                             % (server_id, retry_timeout))
+                    time.sleep(retry_timeout)
                 except Exception as e:
                     LOG.error('server %s migration failed, Exception=%s' %
                               (server_id, e))
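With retry_migrate now starting at 2, and assuming it is decremented once per attempt (outside this hunk), the retry waits become 150 - 2*60 = 30 s and then 150 - 1*60 = 90 s, after which the migration is reported failed and the workflow returns False instead of raising.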
@@ -373,11 +379,11 @@ class Workflow(BaseWorkflow):
         self.initialize_server_info()

         if not self.projects_listen_alarm('maintenance.scheduled'):
-            self.state = 'FAILED'
+            self.state = 'MAINTENANCE_FAILED'
             return

         if not self.confirm_maintenance():
-            self.state = 'FAILED'
+            self.state = 'MAINTENANCE_FAILED'
             return

         maintenance_empty_hosts = self.session_data.get_empty_hosts()
@@ -412,7 +418,7 @@ class Workflow(BaseWorkflow):
             LOG.info("%s: scale in" % self.session_id)

             if not self.confirm_scale_in():
-                self.state = 'FAILED'
+                self.state = 'MAINTENANCE_FAILED'
                 return
         # TBD it takes time to have proper infromation updated about free
         # capacity. Should make sure instances removed has also VCPUs removed
@@ -436,7 +442,7 @@ class Workflow(BaseWorkflow):
         LOG.info("%s: prepare_maintenance called" % self.session_id)
         host = self.find_host_to_be_empty()
         if not self.confirm_host_to_be_emptied(host, 'PREPARE_MAINTENANCE'):
-            self.state = 'FAILED'
+            self.state = 'MAINTENANCE_FAILED'
             return
         if not self.actions_to_have_empty_host(host):
             # TBD we found the hard way that we couldn't make host empty and
@@ -455,7 +461,7 @@ class Workflow(BaseWorkflow):
         empty_hosts = self.session_data.get_empty_hosts()
         if not empty_hosts:
             LOG.info("%s: No empty host to be maintained" % self.session_id)
-            self.state = 'FAILED'
+            self.state = 'MAINTENANCE_FAILED'
             return
         maintained_hosts = self.session_data.maintained_hosts
         if not maintained_hosts:
@@ -508,13 +514,13 @@ class Workflow(BaseWorkflow):
                      not_maintained_hosts))
             host = not_maintained_hosts[0]
             if not self.confirm_host_to_be_emptied(host, 'PLANNED_MAINTENANCE'):
-                self.state = 'FAILED'
+                self.state = 'MAINTENANCE_FAILED'
                 return
             if not self.actions_to_have_empty_host(host):
                 # Failure in here might indicate action to move instance failed.
                 # This might be as Nova VCPU capacity was not yet emptied from
                 # expected target hosts
-                self.state = 'FAILED'
+                self.state = 'MAINTENANCE_FAILED'
                 return
             self.update_server_info()
         self.state = 'START_MAINTENANCE'
@@ -523,7 +529,9 @@ class Workflow(BaseWorkflow):
         LOG.info("%s: maintenance_complete called" % self.session_id)
         LOG.info('Projects may still need to up scale back to full '
                  'capcity')
-        self.confirm_maintenance_complete()
+        if not self.confirm_maintenance_complete():
+            self.state = 'MAINTENANCE_FAILED'
+            return
         self.update_server_info()
         self.state = 'MAINTENANCE_DONE'