Add cgroup support to ram sensor
When running within k8s, the system memory statistics are useless as soon as limits are configured (which is strongly advised). In this case we additionally need to check the cgroups.

Change-Id: Idebe5d7e60dc862e89d012594ab362a19f18708d
parent 5f5032cf82
commit 145e62b568
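As a rough illustration of the problem (not part of the change, and assuming a cgroup v1 container where /sys/fs/cgroup/memory/memory.stat is readable), psutil reports the host's memory while the cgroup enforces a much smaller limit:

import psutil

# Host view: total RAM of the machine, regardless of container limits.
host_total = psutil.virtual_memory().total

# Cgroup view: the limit actually enforced for this container (cgroup v1).
with open('/sys/fs/cgroup/memory/memory.stat') as f:
    stat = dict((key, int(value)) for key, value in
                (line.split(' ', 1) for line in f))
cgroup_limit = stat.get('hierarchical_memory_limit', host_total)

print(host_total, cgroup_limit)  # e.g. 64 GiB host vs. a 5 GiB cgroup limit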
@@ -188,6 +188,12 @@ These metrics are emitted by the Zuul :ref:`scheduler`:

         The used RAM (excluding buffers and cache) on this executor, as
         a percentage multiplied by 100.

      .. stat:: pct_used_ram_cgroup
         :type: gauge

         The used RAM (excluding buffers and cache) on this executor,
         relative to the limit allowed by the cgroup, as a percentage
         multiplied by 100.

   .. stat:: zuul.nodepool.requests

      Holds metrics related to Zuul requests and responses from Nodepool.
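Since both gauges are a percentage multiplied by 100, decoding them is just a division by 100; for example (the value below is illustrative):

raw = 9650              # sample value of ...pct_used_ram_cgroup
pct_used = raw / 100.0  # 96.5% of the cgroup limit is in use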
@@ -0,0 +1,6 @@
---
features:
  - |
    The :attr:`executor.min_avail_mem` setting now takes cgroup limits
    into account. There is also a new metric
    `zuul.executor.<executor>.pct_used_ram_cgroup` available.
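For reference, the option mentioned in the note lives in the [executor] section of zuul.conf; a minimal excerpt (the value is only an example, 5.0 is the default used by the sensor code below):

[executor]
# Stop accepting new builds when available memory (host or cgroup) drops below 5%.
min_avail_mem=5.0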
@@ -0,0 +1,34 @@
cache 0
rss 561152
rss_huge 0
mapped_file 0
dirty 0
writeback 0
swap 0
pgpgin 654
pgpgout 517
pgfault 1089
pgmajfault 0
inactive_anon 0
active_anon 454656
inactive_file 0
active_file 0
unevictable 0
hierarchical_memory_limit 5368709120
hierarchical_memsw_limit 5368709120
total_cache 0
total_rss 5153960755
total_rss_huge 0
total_mapped_file 0
total_dirty 0
total_writeback 0
total_swap 0
total_pgpgin 654
total_pgpgout 517
total_pgfault 1089
total_pgmajfault 0
total_inactive_anon 0
total_active_anon 454656
total_inactive_file 0
total_active_file 0
total_unevictable 0
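Judging by the test comments further down, this fixture appears to model the over-limit case: hierarchical_memory_limit is 5 GiB and total_rss works out to roughly 96% of it.

5153960755 / 5368709120  # ≈ 0.96, i.e. ~96% of the 5 GiB cgroup limit used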
@@ -0,0 +1,34 @@
cache 0
rss 561152
rss_huge 0
mapped_file 0
dirty 0
writeback 0
swap 0
pgpgin 654
pgpgout 517
pgfault 1089
pgmajfault 0
inactive_anon 0
active_anon 454656
inactive_file 0
active_file 0
unevictable 0
hierarchical_memory_limit 9223372036854771712
hierarchical_memsw_limit 9223372036854771712
total_cache 0
total_rss 561152
total_rss_huge 0
total_mapped_file 0
total_dirty 0
total_writeback 0
total_swap 0
total_pgpgin 654
total_pgpgout 517
total_pgfault 1089
total_pgmajfault 0
total_inactive_anon 0
total_active_anon 454656
total_inactive_file 0
total_active_file 0
total_unevictable 0
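This fixture appears to model the "no limit" case: the limit equals 2**63 - 4096, which is the value cgroup v1 reports when no memory limit is configured, so it exceeds any real host total and the sensor treats it as unlimited.

9223372036854771712 == 2**63 - 4096  # True: effectively unlimited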
@@ -0,0 +1,34 @@
cache 0
rss 561152
rss_huge 0
mapped_file 0
dirty 0
writeback 0
swap 0
pgpgin 654
pgpgout 517
pgfault 1089
pgmajfault 0
inactive_anon 0
active_anon 454656
inactive_file 0
active_file 0
unevictable 0
hierarchical_memory_limit 5368709120
hierarchical_memsw_limit 5368709120
total_cache 0
total_rss 1073741824
total_rss_huge 0
total_mapped_file 0
total_dirty 0
total_writeback 0
total_swap 0
total_pgpgin 654
total_pgpgout 517
total_pgfault 1089
total_pgmajfault 0
total_inactive_anon 0
total_active_anon 454656
total_inactive_file 0
total_active_file 0
total_unevictable 0
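This fixture appears to model the healthy case: total_rss is exactly 1 GiB against the 5 GiB limit, i.e. 20% used, matching the "ram usage 20%" comment in the test further down.

1073741824 / 5368709120  # = 0.2, i.e. 20% of the 5 GiB cgroup limit used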
@@ -31,6 +31,7 @@ from tests.base import (
)

from zuul.executor.sensors.startingbuilds import StartingBuildsSensor
from zuul.executor.sensors.ram import RAMSensor


class TestExecutorRepos(ZuulTestCase):
@@ -466,12 +467,59 @@ class TestGovernor(ZuulTestCase):
            pass
        ram = Dummy()
        ram.percent = 20.0  # 20% used
        ram.total = 8 * 1024 * 1024 * 1024  # 8GiB
        vm_mock.return_value = ram
        loadavg_mock.return_value = (0.0, 0.0, 0.0)
        self.executor_server.manageLoad()
        self.assertTrue(self.executor_server.accepting_work)
        loadavg_mock.return_value = (100.0, 100.0, 100.0)
        self.executor_server.manageLoad()
        self.assertFalse(self.executor_server.accepting_work)

    @mock.patch('os.getloadavg')
    @mock.patch('psutil.virtual_memory')
    def test_ram_governor(self, vm_mock, loadavg_mock):
        class Dummy(object):
            pass
        ram = Dummy()
        ram.percent = 20.0  # 20% used
        ram.total = 8 * 1024 * 1024 * 1024  # 8GiB
        vm_mock.return_value = ram
        loadavg_mock.return_value = (0.0, 0.0, 0.0)
        self.executor_server.manageLoad()
        self.assertTrue(self.executor_server.accepting_work)
        ram.percent = 99.0  # 99% used
        loadavg_mock.return_value = (100.0, 100.0, 100.0)
        self.executor_server.manageLoad()
        self.assertFalse(self.executor_server.accepting_work)

    @mock.patch('os.getloadavg')
    @mock.patch('psutil.virtual_memory')
    def test_ram_cgroup_governor(self, vm_mock, loadavg_mock):
        class Dummy(object):
            pass
        ram = Dummy()
        ram.percent = 20.0  # 20% used
        ram.total = 8 * 1024 * 1024 * 1024  # 8GiB
        vm_mock.return_value = ram
        loadavg_mock.return_value = (0.0, 0.0, 0.0)

        # Set no cgroup limit
        ram_sensor = [x for x in self.executor_server.sensors
                      if isinstance(x, RAMSensor)][0]
        ram_sensor.cgroup_stats_file = os.path.join(
            FIXTURE_DIR, 'cgroup', 'memory.stat.nolimit')
        self.executor_server.manageLoad()
        self.assertTrue(self.executor_server.accepting_work)

        # Set cgroup limit 5GiB and ram usage 20%
        ram_sensor.cgroup_stats_file = os.path.join(
            FIXTURE_DIR, 'cgroup', 'memory.stat.ok')
        self.executor_server.manageLoad()
        self.assertTrue(self.executor_server.accepting_work)

        # Set cgroup limit 5GiB and ram usage 96%
        ram_sensor.cgroup_stats_file = os.path.join(
            FIXTURE_DIR, 'cgroup', 'memory.stat.bad')
        self.executor_server.manageLoad()
        self.assertFalse(self.executor_server.accepting_work)
@@ -13,11 +13,14 @@
# under the License.

import logging
import math
import psutil

from zuul.executor.sensors import SensorInterface
from zuul.lib.config import get_default

CGROUP_STATS_FILE = '/sys/fs/cgroup/memory/memory.stat'


def get_avail_mem_pct():
    avail_mem_pct = 100.0 - psutil.virtual_memory().percent
@@ -30,6 +33,38 @@ class RAMSensor(SensorInterface):
    def __init__(self, config=None):
        self.min_avail_mem = float(get_default(config, 'executor',
                                                'min_avail_mem', '5.0'))
        self.cgroup_stats_file = CGROUP_STATS_FILE

    def _read_cgroup_stat(self):
        stat = {}
        try:
            with open(self.cgroup_stats_file) as f:
                for line in f.readlines():
                    key, value = line.split(' ')
                    stat[key] = int(value.strip())
        except Exception:
            pass
        return stat

    def _get_cgroup_limit(self):
        stat = self._read_cgroup_stat()
        limit = stat.get('hierarchical_memory_limit', math.inf)
        mem_total = psutil.virtual_memory().total
        if limit < mem_total:
            return limit
        else:
            return math.inf

    def _get_avail_mem_pct_cgroup(self):
        stat = self._read_cgroup_stat()
        limit = stat.get('hierarchical_memory_limit', math.inf)
        usage = stat.get('total_rss', math.inf)

        if math.isinf(limit) or math.isinf(usage):
            # pretend we have all memory available if we got infs
            return 100

        return 100.0 - usage / limit * 100

    def isOk(self):
        avail_mem_pct = get_avail_mem_pct()
@@ -38,10 +73,27 @@
            return False, "low memory {:3.1f}% < {}".format(
                avail_mem_pct, self.min_avail_mem)

        return True, "{:3.1f}% <= {}".format(avail_mem_pct, self.min_avail_mem)
        if math.isinf(self._get_cgroup_limit()):
            # we have no cgroup defined limit so we're done now
            return True, "{:3.1f}% <= {}".format(
                avail_mem_pct, self.min_avail_mem)

        avail_mem_pct_cgroup = self._get_avail_mem_pct_cgroup()
        if avail_mem_pct_cgroup < self.min_avail_mem:
            return False, "low memory cgroup {:3.1f}% < {}".format(
                avail_mem_pct_cgroup, self.min_avail_mem)

        return True, "{:3.1f}% <= {}, {:3.1f}% <= {}".format(
            avail_mem_pct, self.min_avail_mem,
            avail_mem_pct_cgroup, self.min_avail_mem)

    def reportStats(self, statsd, base_key):
        avail_mem_pct = get_avail_mem_pct()

        statsd.gauge(base_key + '.pct_used_ram',
                     int((100.0 - avail_mem_pct) * 100))

        if math.isfinite(self._get_cgroup_limit()):
            avail_mem_pct_cgroup = self._get_avail_mem_pct_cgroup()
            statsd.gauge(base_key + '.pct_used_ram_cgroup',
                         int((100.0 - avail_mem_pct_cgroup) * 100))
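A minimal sketch of driving the sensor above by hand, e.g. against one of the fixture files shown earlier; the FakeStatsd class and the file path are illustrative assumptions, not part of the change:

from zuul.executor.sensors.ram import RAMSensor

class FakeStatsd:
    # Stand-in for the statsd client: just print what would be reported.
    def gauge(self, key, value):
        print(key, value)

sensor = RAMSensor()
sensor.cgroup_stats_file = '/tmp/memory.stat'  # hypothetical path to a stats file
ok, reason = sensor.isOk()  # e.g. (False, 'low memory cgroup 4.0% < 5.0')
sensor.reportStats(FakeStatsd(), 'zuul.executor.example')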