diff --git a/ironic/conductor/base_manager.py b/ironic/conductor/base_manager.py
index 47f2081ac7..7287ea4ebf 100644
--- a/ironic/conductor/base_manager.py
+++ b/ironic/conductor/base_manager.py
@@ -125,9 +125,12 @@ class BaseConductorManager(object):
         self._keepalive_evt = threading.Event()
         """Event for the keepalive thread."""
 
-        # TODO(dtantsur): make the threshold configurable?
-        rejection_func = rejection.reject_when_reached(
-            CONF.conductor.workers_pool_size)
+        # NOTE(dtantsur): do not allow queuing work. Given our model, it's
+        # better to reject an incoming request with HTTP 503 or reschedule
+        # a periodic task than to end up with a hidden backlog that is hard
+        # to track and debug. Using 1 instead of 0 because of how things are
+        # ordered in futurist (it checks for rejection first).
+        rejection_func = rejection.reject_when_reached(1)
         self._executor = futurist.GreenThreadPoolExecutor(
             max_workers=CONF.conductor.workers_pool_size,
             check_and_reject=rejection_func)
diff --git a/ironic/conf/conductor.py b/ironic/conf/conductor.py
index 25b0453da3..416d30ccb0 100644
--- a/ironic/conf/conductor.py
+++ b/ironic/conf/conductor.py
@@ -22,7 +22,7 @@ from ironic.common.i18n import _
 
 opts = [
     cfg.IntOpt('workers_pool_size',
-               default=100, min=3,
+               default=300, min=3,
                help=_('The size of the workers greenthread pool. '
                       'Note that 2 threads will be reserved by the conductor '
                       'itself for handling heart beats and periodic tasks. '
diff --git a/releasenotes/notes/workers-20ca5c225c1474e0.yaml b/releasenotes/notes/workers-20ca5c225c1474e0.yaml
new file mode 100644
index 0000000000..3b55be736b
--- /dev/null
+++ b/releasenotes/notes/workers-20ca5c225c1474e0.yaml
@@ -0,0 +1,25 @@
+---
+issues:
+  - |
+    When configuring a single-conductor environment, make sure the size
+    of the worker pool (``[conductor]workers_pool_size``) is larger than
+    the maximum parallel deployments (``[conductor]max_concurrent_deploy``).
+    This was not the case by default previously (the options used to be set
+    to 100 and 250 respectively).
+upgrade:
+  - |
+    Because of a fix in the internal worker pool handling, you may now start
+    seeing requests rejected with HTTP 503 under a very high load earlier than
+    before. In this case, try increasing the ``[conductor]workers_pool_size``
+    option or consider adding more conductors.
+  - |
+    The default worker pool size (the ``[conductor]workers_pool_size`` option)
+    has been increased from 100 to 300. You may want to consider increasing
+    it even further if your environment allows that.
+fixes:
+  - |
+    Fixes handling new requests when the maximum number of internal workers
+    is reached. Previously, after reaching the maximum number of workers
+    (100 by default), we would queue the same number of requests (100 again).
+    This was not intentional, and now Ironic no longer queues requests if
+    there are no free threads to run them.