From cdd60504ecdfb97a3f85dffdb7ad18aedbf2021d Mon Sep 17 00:00:00 2001
From: David Shrewsbury
Date: Thu, 11 Oct 2018 17:38:31 -0400
Subject: [PATCH] Clean up down ports

Cleanup will be periodic (every 3 minutes by default, not yet
configurable) and will be logged and reported via statsd.

Change-Id: I81b57d6f6142e64dd0ebf31531ca6489d6c46583
---
 doc/source/operation.rst                      |  6 ++
 nodepool/driver/fake/provider.py              | 21 +++++
 nodepool/driver/openstack/provider.py         | 79 ++++++++++++++++++-
 nodepool/tests/test_launcher.py               | 21 +++++
 .../notes/port-cleanup-667d476437f03358.yaml  |  7 ++
 5 files changed, 133 insertions(+), 1 deletion(-)
 create mode 100644 releasenotes/notes/port-cleanup-667d476437f03358.yaml

diff --git a/doc/source/operation.rst b/doc/source/operation.rst
index 41950d44e..bf75ee1ae 100644
--- a/doc/source/operation.rst
+++ b/doc/source/operation.rst
@@ -329,6 +329,12 @@ Nodepool launcher
    * ready
    * used
 
+.. zuul:stat:: nodepool.provider.<provider>.downPorts
+   :type: counter
+
+   Number of ports in the DOWN state that have been removed automatically
+   in the cleanup resources phase of the OpenStack driver.
+
 .. zuul:stat:: nodepool.provider.<provider>.nodes.<state>
    :type: gauge
 
diff --git a/nodepool/driver/fake/provider.py b/nodepool/driver/fake/provider.py
index 6e6fdcd4d..ed83be6c9 100644
--- a/nodepool/driver/fake/provider.py
+++ b/nodepool/driver/fake/provider.py
@@ -31,6 +31,7 @@ class Dummy(object):
     INSTANCE = 'Instance'
     FLAVOR = 'Flavor'
     LOCATION = 'Server.Location'
+    PORT = 'Port'
 
     def __init__(self, kind, **kw):
         self.__kind = kind
@@ -104,6 +105,12 @@ class FakeOpenStackCloud(object):
         self._server_list = []
         self.max_cores, self.max_instances, self.max_ram = FakeOpenStackCloud.\
             _get_quota()
+        self._down_ports = [
+            Dummy(Dummy.PORT, id='1a', status='DOWN',
+                  device_owner="compute:nova"),
+            Dummy(Dummy.PORT, id='2b', status='DOWN',
+                  device_owner=None),
+        ]
 
     def _get(self, name_or_id, instance_list):
         self.log.debug("Get %s in %s" % (name_or_id, repr(instance_list)))
@@ -281,6 +288,20 @@ class FakeOpenStackCloud(object):
             total_ram_used=8192 * len(self._server_list)
         )
 
+    def list_ports(self, filters=None):
+        if filters and filters.get('status') == 'DOWN':
+            return self._down_ports
+        return []
+
+    def delete_port(self, port_id):
+        tmp_ports = []
+        for port in self._down_ports:
+            if port.id != port_id:
+                tmp_ports.append(port)
+            else:
+                self.log.debug("Deleted port ID: %s", port_id)
+        self._down_ports = tmp_ports
+
 
 class FakeUploadFailCloud(FakeOpenStackCloud):
     log = logging.getLogger("nodepool.FakeUploadFailCloud")
diff --git a/nodepool/driver/openstack/provider.py b/nodepool/driver/openstack/provider.py
index d202f9f38..de479fcc3 100755
--- a/nodepool/driver/openstack/provider.py
+++ b/nodepool/driver/openstack/provider.py
@@ -26,6 +26,7 @@ from nodepool.driver import Provider
 from nodepool.driver.utils import QuotaInformation
 from nodepool.nodeutils import iterate_timeout
 from nodepool.task_manager import TaskManager
+from nodepool import stats
 from nodepool import version
 from nodepool import zk
 
@@ -50,6 +51,10 @@ class OpenStackProvider(Provider):
         self._taskmanager = None
         self._current_nodepool_quota = None
         self._zk = None
+        self._down_ports = set()
+        self._last_port_cleanup = None
+        self._port_cleanup_interval_secs = 180
+        self._statsd = stats.get_client()
 
     def start(self, zk_conn):
         if self._use_taskmanager:
@@ -417,6 +422,21 @@ class OpenStackProvider(Provider):
                                       **meta)
         return image.id
 
+    def listPorts(self, status=None):
+        '''
+        List known ports.
+
+        :param str status: A valid port status. E.g., 'ACTIVE' or 'DOWN'.
+        '''
+        if status:
+            ports = self._client.list_ports(filters={'status': status})
+        else:
+            ports = self._client.list_ports()
+        return ports
+
+    def deletePort(self, port_id):
+        self._client.delete_port(port_id)
+
     def listImages(self):
         return self._client.list_images()
 
@@ -444,7 +464,7 @@ class OpenStackProvider(Provider):
             self.log.debug('Deleting server %s' % server_id)
             self.deleteServer(server_id)
 
-    def cleanupLeakedResources(self):
+    def cleanupLeakedInstances(self):
         '''
         Delete any leaked server instances.
 
@@ -492,6 +512,63 @@ class OpenStackProvider(Provider):
                 node.state = zk.DELETING
                 self._zk.storeNode(node)
 
+    def filterComputePorts(self, ports):
+        '''
+        Return a list of compute ports (or ports with no device owner).
+
+        We are not interested in ports for routers or DHCP.
+        '''
+        ret = []
+        for p in ports:
+            if p.device_owner is None or p.device_owner.startswith("compute:"):
+                ret.append(p)
+        return ret
+
+    def cleanupLeakedPorts(self):
+        if not self._last_port_cleanup:
+            self._last_port_cleanup = time.monotonic()
+            ports = self.listPorts(status='DOWN')
+            ports = self.filterComputePorts(ports)
+            self._down_ports = set([p.id for p in ports])
+            return
+
+        # Return if not enough time has passed since the last cleanup
+        last_check_in_secs = int(time.monotonic() - self._last_port_cleanup)
+        if last_check_in_secs <= self._port_cleanup_interval_secs:
+            return
+
+        ports = self.listPorts(status='DOWN')
+        ports = self.filterComputePorts(ports)
+        current_set = set([p.id for p in ports])
+        remove_set = current_set & self._down_ports
+
+        removed_count = 0
+        for port_id in remove_set:
+            try:
+                self.deletePort(port_id)
+            except Exception:
+                self.log.exception("Exception deleting port %s in %s:",
+                                   port_id, self.provider.name)
+            else:
+                removed_count += 1
+                self.log.debug("Removed DOWN port %s in %s",
+                               port_id, self.provider.name)
+
+        if self._statsd and removed_count:
+            key = 'nodepool.provider.%s.downPorts' % (self.provider.name)
+            self._statsd.incr(key, removed_count)
+
+        self._last_port_cleanup = time.monotonic()
+
+        # Rely on OpenStack to tell us the down ports rather than doing our
+        # own set adjustment.
+        ports = self.listPorts(status='DOWN')
+        ports = self.filterComputePorts(ports)
+        self._down_ports = set([p.id for p in ports])
+
+    def cleanupLeakedResources(self):
+        self.cleanupLeakedInstances()
+        self.cleanupLeakedPorts()
 
         if self.provider.clean_floating_ips:
             self._client.delete_unattached_floating_ips()
diff --git a/nodepool/tests/test_launcher.py b/nodepool/tests/test_launcher.py
index 492f84ed6..f06107ff5 100644
--- a/nodepool/tests/test_launcher.py
+++ b/nodepool/tests/test_launcher.py
@@ -1751,3 +1751,24 @@ class TestLauncher(tests.DBTestCase):
 
         req3 = self.waitForNodeRequest(req3)
         self.assertEqual(req3.state, zk.FAILED)
+
+    def test_leaked_port_cleanup(self):
+        configfile = self.setup_config('node.yaml')
+        self.useBuilder(configfile)
+        pool = self.useNodepool(configfile, watermark_sleep=1)
+        pool.cleanup_interval = 1
+        pool.start()
+        self.waitForNodes('fake-label')
+
+        manager = pool.getProviderManager('fake-provider')
+        down_ports = manager.listPorts(status='DOWN')
+        self.assertEqual(2, len(down_ports))
+        self.log.debug("Down ports: %s", down_ports)
+
+        # Change the port cleanup interval to happen quicker
+        manager._port_cleanup_interval_secs = 2
+        while manager.listPorts(status='DOWN'):
+            time.sleep(1)
+
+        self.assertReportedStat('nodepool.provider.fake-provider.downPorts',
+                                value='2', kind='c')
diff --git a/releasenotes/notes/port-cleanup-667d476437f03358.yaml b/releasenotes/notes/port-cleanup-667d476437f03358.yaml
new file mode 100644
index 000000000..e89e375ad
--- /dev/null
+++ b/releasenotes/notes/port-cleanup-667d476437f03358.yaml
@@ -0,0 +1,7 @@
+---
+features:
+  - |
+    Added a new routine to the OpenStack driver cleanup resources phase that
+    will remove any ports reported to be in the DOWN state. Ports will have
+    to be seen as DOWN for at least three minutes before they will be removed.
+    The number of ports removed will be reported to statsd.
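
Note for reviewers (illustration only, not part of the patch): the core of
cleanupLeakedPorts() is a two-pass rule. A port is deleted only if it is
reported DOWN on two consecutive scans separated by at least the cleanup
interval, so a port that only briefly reports DOWN is not removed. A minimal
standalone sketch of that rule follows; DownPortTracker and leaked_port_ids()
are hypothetical names used for illustration, not nodepool APIs, and the
sketch skips the actual deletion, logging and statsd reporting.

    import time

    class DownPortTracker:
        """Track DOWN port ids across scans run at a fixed cleanup interval."""

        def __init__(self, interval_secs=180):
            self.interval_secs = interval_secs
            self.last_cleanup = None
            self.seen_down = set()

        def leaked_port_ids(self, down_port_ids):
            """Return the port ids considered leaked on this scan."""
            now = time.monotonic()
            if self.last_cleanup is None:
                # First scan: only record what is DOWN right now.
                self.last_cleanup = now
                self.seen_down = set(down_port_ids)
                return set()
            if now - self.last_cleanup <= self.interval_secs:
                # Not enough time has passed since the previous scan.
                return set()
            self.last_cleanup = now
            current = set(down_port_ids)
            # Leaked means DOWN on the previous scan and still DOWN now.
            leaked = current & self.seen_down
            self.seen_down = current
            return leaked

    tracker = DownPortTracker(interval_secs=0)    # 0 so the example runs instantly
    print(tracker.leaked_port_ids(['1a', '2b']))  # set(): first scan only records
    time.sleep(0.01)
    print(tracker.leaked_port_ids(['2b', '3c']))  # {'2b'}: still DOWN, so leaked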