monasca-agent/monasca_agent/collector/checks_d/ovs.py

681 lines
27 KiB
Python

#!/bin/env python
# (C) Copyright 2016 Hewlett Packard Enterprise Development Company LP
# Copyright 2017 Fujitsu LIMITED
from copy import deepcopy
import json
import math
import os
import re
import socket
import stat
import subprocess
import time
from neutronclient.v2_0 import client as neutron_client
from novaclient import client as nova_client
from monasca_agent.collector.checks import AgentCheck
import monasca_agent.collector.checks.utils as utils
from monasca_agent.common import keystone
from monasca_agent import version as ma_version
OVS_CMD = """\
%s --columns=name,external_ids,statistics,options \
--format=json --data=json list Interface\
"""
DPDK_PORT_PREFIX = 'vhu'
HEALTH_METRICS = ['tx_errors', 'rx_errors', 'tx_dropped', 'rx_dropped']
"""Monasca Agent interface for ovs router port and dhcp metrics"""
class OvsCheck(AgentCheck):
"""This check gathers open vswitch router and dhcp metrics.
"""
def __init__(self, name, init_config, agent_config, instances=None):
AgentCheck.__init__(self, name, init_config,
agent_config, instances=[{}])
cache_dir = self.init_config.get('cache_dir')
self.ctr_cache_file = os.path.join(cache_dir, 'ovs_metrics.json')
self.port_cache_file = os.path.join(cache_dir, 'ovs_ports.json')
self.use_bits = self.init_config.get('network_use_bits')
self.check_router_ha = self.init_config.get('check_router_ha')
self.ovs_cmd = OVS_CMD % self.init_config.get('ovs_cmd')
include_re = self.init_config.get('included_interface_re', None)
self.use_absolute_metrics = self.init_config.get('use_absolute_metrics')
self.use_rate_metrics = self.init_config.get('use_rate_metrics')
self.use_health_metrics = self.init_config.get('use_health_metrics')
self.publish_router_capacity = self.init_config.get('publish_router_capacity')
if include_re is None:
include_re = 'qg.*'
else:
include_re = include_re + '|' + 'qg.*'
self.include_iface_re = re.compile(include_re)
self.session = keystone.get_session(**self.init_config)
def check(self, instance):
time_start = time.time()
interface_data = self._get_ovs_data()
sample_time = float("{:9f}".format(time.time()))
measure = self._get_measure()
if not interface_data:
#
# No OVS data, nothing to do here.
#
return
port_cache = self._load_port_cache()
ctr_cache = self._load_counter_cache()
dims_base = self._set_dimensions({'service': 'networking',
'component': 'ovs'},
instance)
ifx_deltas = {}
for ifx in interface_data:
if not re.match(self.include_iface_re, ifx):
self.log.debug("include_iface_re {0} does not match with "
"ovs-vsctl interface {1} ".format(self.include_iface_re.pattern,
ifx))
continue
if ifx not in ctr_cache:
ctr_cache[ifx] = {}
for metric_name, idx in self._get_metrics_map(measure).items():
interface_stats_key = self._get_interface_stats_key(idx, metric_name, measure, ifx)
statistics_dict = interface_data[ifx]['statistics']
value = statistics_dict[interface_stats_key] \
if interface_stats_key in statistics_dict else 0
if metric_name in ctr_cache[ifx]:
cache_time = ctr_cache[ifx][metric_name]['timestamp']
time_diff = sample_time - float(cache_time)
try:
cache_value = ctr_cache[ifx][metric_name]['value']
val_diff = (value - cache_value) / time_diff
except ZeroDivisionError:
#
# Being extra safe here, in case we divide by zero
# just skip this reading with check below.
#
val_diff = -1
if val_diff < 0:
#
# Bad value, save current reading and skip
#
self.log.warn("Ignoring negative router sample for: "
"{0} new value: {1} old value: {2}"
.format(ifx, value,
ctr_cache[ifx][metric_name]))
ctr_cache[ifx][metric_name] = {
'timestamp': sample_time,
'value': value}
continue
if ifx not in ifx_deltas:
uuid = interface_data[ifx]['external_ids']['iface-id']
ifx_deltas.update({ifx: {'port_uuid': uuid}})
if self.use_bits and 'bytes' in interface_stats_key:
val_diff = val_diff * 8
ifx_deltas[ifx][interface_stats_key] = val_diff
#
# Save the current counter for this metric to the cache
# for the next time we wake up
#
ctr_cache[ifx][metric_name] = {
'timestamp': sample_time,
'value': value}
# Done collecting current rates and updating the cache file,
# let's publish.
#
tried_one_update = False
host_router_max_bw = 0
active_routers = 0
for ifx, value in ifx_deltas.items():
port_uuid = value['port_uuid']
if port_uuid not in port_cache and not tried_one_update:
#
# Only attempt to update port cache
# file for a missing port uuid once per wakeup.
#
tried_one_update = True
log_msg = "port_uuid {0} not in port cache -- updating."
self.log.info(log_msg.format(port_uuid))
port_cache = self._update_port_cache()
if not port_cache:
self.log.error("port_cache is empty.")
continue
port_info = port_cache.get(port_uuid)
if not port_info:
log_msg = "port_uuid {0} not known to neutron -- ghost port?"
self.log.error(log_msg.format(port_uuid))
continue
device_uuid = port_info['device_uuid']
is_router_port = port_info['is_router_port']
tenant_id = port_info['tenant_id']
tenant_name = None
if 'tenant_name' in port_info:
tenant_name = port_info['tenant_name']
if is_router_port and not self._is_active_router(device_uuid):
continue
if is_router_port:
router_name = port_info['router_name']
if not router_name:
log_msg = "Missing router name for router_uuid {0} -- skipping."
self.log.error(log_msg.format(device_uuid))
continue
ifx_dimensions = {'resource_id': device_uuid,
'port_id': port_uuid,
'router_name': router_name}
active_routers += 1
else:
ifx_dimensions = {'resource_id': device_uuid,
'port_id': port_uuid}
this_dimensions = dims_base.copy()
this_dimensions.update(ifx_dimensions)
customer_dimensions = this_dimensions.copy()
del customer_dimensions['hostname']
ops_dimensions = this_dimensions.copy()
ops_dimensions.update({'tenant_id': tenant_id})
if tenant_name:
ops_dimensions.update({'tenant_name': tenant_name})
for metric_name, idx in self._get_metrics_map(measure).items():
# POST to customer project
interface_stats_key = self._get_interface_stats_key(idx, metric_name, measure, ifx)
if interface_stats_key not in value:
#
# If we've skipped a given metric above due to
# counter rollover/negative value, we won't have
# a value to publish for that metric this round.
#
continue
if is_router_port:
metric_name_rate = "vrouter.{0}_sec".format(metric_name)
metric_name_abs = "vrouter.{0}".format(metric_name)
else:
metric_name_rate = "vswitch.{0}_sec".format(metric_name)
metric_name_abs = "vswitch.{0}".format(metric_name)
if not self.use_health_metrics and interface_stats_key in HEALTH_METRICS:
continue
if self.use_rate_metrics:
self.gauge(metric_name_rate, value[interface_stats_key],
dimensions=customer_dimensions,
delegated_tenant=tenant_id,
hostname='SUPPRESS')
# POST to operations project with "ovs." prefix
self.gauge("ovs.{0}".format(metric_name_rate), value[interface_stats_key],
dimensions=ops_dimensions)
if self.use_absolute_metrics:
statistics_dict = interface_data[ifx]['statistics']
abs_value = statistics_dict[interface_stats_key] \
if interface_stats_key in statistics_dict else 0
if self.use_bits and 'bytes' in interface_stats_key:
abs_value = abs_value * 8
# POST to customer
self.gauge(metric_name_abs, abs_value,
dimensions=customer_dimensions,
delegated_tenant=tenant_id,
hostname='SUPPRESS')
# POST to operations project
self.gauge("ovs.{0}".format(metric_name_abs), abs_value,
dimensions=ops_dimensions)
self._publish_max_bw_metrics(port_info, customer_dimensions,
ops_dimensions)
host_router_max_bw += self._get_port_cache_max_bw(port_info)
if host_router_max_bw > 0:
self.gauge('ovs.vrouter.host_max_bw_kb', host_router_max_bw,
dimensions=dims_base)
self.gauge('ovs.vrouter.active_routers', active_routers, dimensions=dims_base)
self._update_counter_cache(ctr_cache,
math.ceil(time.time() - time_start), measure)
def _get_ovs_data(self):
data_columns = ['name', 'external_ids', 'statistics', 'options']
output = self._process_command(self.ovs_cmd)
parsed_ovs_data = {}
if not output:
return parsed_ovs_data
try:
raw_ovs_json_data = json.loads(output)
except ValueError:
#
# Make sure we got valid json
#
return {}
# There are multiple lines returned, one for each device
for line in raw_ovs_json_data['data']:
ifx_data = {}
ifx_name = None
# There is one entry in each line for every data_column
# print "Row:"
for column_num in range(0, len(data_columns)):
# Two choices here, it's either a value or a (sub) map
# If it's a value it becomes a key in the hierarchy
if line[column_num][0] == "map":
map_data = {}
for value in line[column_num][1]:
map_data.update({value[0]: value[1]})
ifx_data.update({data_columns[column_num]: map_data})
else:
# In this specific case name (interface) is our main key
# How this would generalize, I don't know.
if data_columns[column_num] == "name":
ifx_name = line[column_num]
# Put the interface data in the main dictionary,
# ensuring we actually built something with the output.
if ifx_name and ifx_data:
parsed_ovs_data.update({ifx_name: ifx_data})
return parsed_ovs_data
def _get_os_info(self, uuid, all_data):
for data in all_data:
if data['id'] == uuid:
return data
return None
def _get_nova_client(self):
region_name = self.init_config.get('region_name')
endpoint_type = self.init_config.get("endpoint_type", "publicURL")
nc = nova_client.Client(2, session=self.session,
endpoint_type=endpoint_type,
service_type="compute",
region_name=region_name,
client_name='monasca-agent[ovs]',
client_version=ma_version.version_string)
return nc
def _get_neutron_client(self):
region_name = self.init_config.get('region_name')
endpoint_type = self.init_config.get("endpoint_type", "publicURL")
return neutron_client.Client(session=self.session,
region_name=region_name,
endpoint_type=endpoint_type,
client_name='monasca-agent[ovs]',
client_version=ma_version.version_string)
def _run_command(self, command, input=None):
self.log.debug("Executing command - {0}".format(command))
errcode = None
stdout = None
stderr = None
try:
process = subprocess.Popen(command,
shell=True,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
stdin=subprocess.PIPE)
stdout, stderr = process.communicate(input=input)
errcode = process.returncode
self.log.debug('errcode - {0}, stdout - {1}, stderr - {2}'.
format(errcode,
stdout,
stderr))
except Exception:
self.log.error("Failure while executing command - {0}".
format(command))
return errcode, stdout, stderr
def _process_command(self, command):
"""Runs the command and returns json output
"""
errcode, stdout, stderr = self._run_command(command)
cmd_output = ''
if stdout:
for line in stdout:
cmd_output = cmd_output + line
return cmd_output
def _load_counter_cache(self):
"""Load the counter metrics from the previous collection iteration
"""
ctr_cache = {}
try:
with open(self.ctr_cache_file, 'r') as cache_json:
ctr_cache = json.load(cache_json)
except (IOError, TypeError, ValueError):
#
# Couldn't load a cache file, or it's invalid json
# and is corrupt. By returning an empty dict we'll ensure
# that the cache gets rebuilt.
#
self.log.warning("Counter cache missing or corrupt, rebuilding.")
ctr_cache = {}
return ctr_cache
def _update_counter_cache(self, ctr_cache, run_time, measure):
# Remove migrated or deleted ports from the counter cache
write_ctr_cache = deepcopy(ctr_cache)
#
# Grab the first metric name and see if it's in the cache.
#
metric_name = self._get_metrics_map(measure).keys()[0]
for ifx in ctr_cache:
if metric_name not in ctr_cache[ifx]:
self.log.info("Expiring old/empty {0} from cache".format(ifx))
del(write_ctr_cache[ifx])
try:
with open(self.ctr_cache_file, 'w') as cache_json:
json.dump(write_ctr_cache, cache_json)
if stat.S_IMODE(os.stat(self.ctr_cache_file).st_mode) != 0o600:
os.chmod(self.ctr_cache_file, 0o600)
except IOError as e:
self.log.error("Cannot write to {0}: {1}".
format(self.ctr_cache_file, e))
def _get_metrics_map(self, measure):
metrics_map = {"out_%s" % measure: "tx_bytes",
"in_%s" % measure: "rx_bytes",
"in_packets": "rx_packets",
"out_packets": "tx_packets",
"in_dropped": "rx_dropped",
"out_dropped": "tx_dropped",
"in_errors": "rx_errors",
"out_errors": "tx_errors"}
return metrics_map
def _update_port_cache(self):
"""Collect port_uuid, device_uuid, router_name, and tenant_id
for all routers.
"""
if not hasattr(self, 'neutron_client'):
self.neutron_client = self._get_neutron_client()
port_cache = {}
try:
self.log.debug("Retrieving Neutron port data")
all_ports_data = self.neutron_client.list_ports()
self.log.debug("Retrieving Neutron router data")
all_routers_data = self.neutron_client.list_routers()
except Exception as e:
self.log.exception("Unable to get neutron data: %s", str(e))
return port_cache
all_ports_data = all_ports_data['ports']
all_routers_data = all_routers_data['routers']
#
# Only make the keystone call to get the tenant list
# if we are configured to publish tenant names.
#
if self.init_config.get('metadata') and 'tenant_name' in self.init_config.get('metadata'):
tenants = utils.get_tenant_list(self.init_config, self.log)
else:
tenants = []
for port_data in all_ports_data:
port_uuid = port_data['id']
device_uuid = port_data['device_id']
router_info = self._get_os_info(device_uuid, all_routers_data)
if router_info:
tenant_id = router_info['tenant_id']
is_router_port = True
router_name = router_info['name']
else:
tenant_id = port_data['tenant_id']
is_router_port = False
router_name = ""
port_cache[port_uuid] = {'device_uuid': device_uuid,
'router_name': router_name,
'is_router_port': is_router_port,
'tenant_id': tenant_id}
tenant_name = utils.get_tenant_name(tenants, tenant_id)
if tenant_name:
port_cache[port_uuid]['tenant_name'] = tenant_name
port_cache = self._add_max_bw_to_port_cache(port_cache,
all_ports_data)
port_cache['last_update'] = int(time.time())
# Write the updated cache
try:
with open(self.port_cache_file, 'w') as cache_json:
json.dump(port_cache, cache_json)
if stat.S_IMODE(os.stat(self.port_cache_file).st_mode) != 0o600:
os.chmod(self.port_cache_file, 0o600)
except IOError as e:
self.log.error("Cannot write to {0}: {1}".
format(self.port_cache_file, e))
return port_cache
def _get_port_cache_max_bw(self, port_info):
if port_info['is_router_port'] and 'max_bw_kb' in port_info:
return port_info['max_bw_kb']
else:
return 0
def _publish_max_bw_metrics(self, port_info, cust_dims, ops_dims):
max_bw_kb = self._get_port_cache_max_bw(port_info)
if not self.publish_router_capacity or max_bw_kb == 0:
return
metric_name = 'vrouter.max_bw_kb'
self.gauge(metric_name, max_bw_kb, dimensions=cust_dims,
delegated_tenant=ops_dims['tenant_id'],
hostname='SUPPRESS')
self.gauge("ovs.{0}".format(metric_name), max_bw_kb,
dimensions=ops_dims)
def _get_max_flavor_bw(self, flavor_keys):
avg_bw = 0
peak_bw = 0
burst_bw = 0
#
# we'll sum inbound and outbound for the max possible throughput
#
avg_re = re.compile('quota:vif_.*bound_average')
peak_re = re.compile('quota:vif_.*bound_peak')
burst_re = re.compile('quota:vif_.*bound_burst')
for key in flavor_keys:
if re.match(avg_re, key):
avg_bw += int(flavor_keys[key])
elif re.match(peak_re, key):
peak_bw += int(flavor_keys[key])
elif re.match(burst_re, key):
burst_bw += int(flavor_keys[key])
return max(avg_bw, peak_bw, burst_bw)
def _add_max_bw_to_port_cache(self, port_cache, all_ports_data):
if not self.publish_router_capacity:
return port_cache
tmp_port_cache = deepcopy(port_cache)
#
# No need to do a flavor get multiple times
# for the same flavor when rebuilding the cache.
#
flavor_cache = {}
try:
if not hasattr(self, 'nova_client'):
self.nova_client = self._get_nova_client()
for uuid in port_cache:
router_max_bw_kb = 0
port = port_cache[uuid]
if not port['is_router_port']:
continue
inst_ids = self._get_instance_ids(all_ports_data, port['device_uuid'])
for instance_id in inst_ids:
instance = self.nova_client.servers.get(instance_id)
flavor_id = instance.flavor['id']
if flavor_id not in flavor_cache:
flavor = self.nova_client.flavors.get(instance.flavor['id'])
flavor_cache[flavor_id] = flavor.get_keys()
router_max_bw_kb += self._get_max_flavor_bw(flavor_cache[flavor_id])
if router_max_bw_kb > 0:
tmp_port_cache[uuid]['max_bw_kb'] = router_max_bw_kb
except Exception as e:
msg = "Unable to get the nova instance and flavor info: {0}"
self.log.error(msg.format(e))
return tmp_port_cache
def _get_instance_ids(self, ports, router_uuid):
subnet_ids = self._get_port_ids(ports,
'network:router_interface',
[router_uuid],
'device_id',
'network_id')
instance_ids = self._get_port_ids(ports,
'compute:',
subnet_ids,
'network_id',
'device_id')
return instance_ids
def _get_port_ids(self, ports, owner, uuids, match_field, return_field):
return_uuids = []
if len(uuids) == 0:
return return_uuids
for port in ports:
if port[match_field] in uuids and owner in port['device_owner']:
return_uuids.append(port[return_field])
return return_uuids
def _load_port_cache(self):
"""Load the cache map of router/dhcp port uuids to router uuid, name,
and tenant name.
"""
port_cache = {}
try:
with open(self.port_cache_file, 'r') as cache_json:
port_cache = json.load(cache_json)
# Is it time to force a refresh of this data?
if self.init_config.get('neutron_refresh') is not None:
time_diff = time.time() - port_cache['last_update']
if time_diff > self.init_config.get('neutron_refresh'):
self.log.warning("Time to update neutron cache file")
self._update_port_cache()
except (IOError, TypeError, ValueError):
# The file may not exist yet, or is corrupt. Rebuild it now.
self.log.warning("Port cache doesn't exists , rebuilding.")
port_cache = self._update_port_cache()
return port_cache
def _is_active_router(self, uuid):
if not self.check_router_ha:
#
# We're not configured to check router ha -- let's not bother
# making the additional neutron calls.
#
return True
active_host = None
local_host = socket.gethostname()
try:
if not hasattr(self, 'neutron_client'):
self.neutron_client = self._get_neutron_client()
result = self.neutron_client.list_l3_agent_hosting_routers(uuid)
except Exception as e:
#
# Failed to get the hosting l3 agent, we'll default to calling
# the router active.
#
self.log.error("Unable to get the hosting agent: {0}".format(e))
return False
for agent in result['agents']:
if 'ha_state' not in agent.keys() or agent['ha_state'] is None:
#
# HA isn't enabled for this router,
# so it's active if we've made it this far.
#
return True
if agent['ha_state'] == 'active':
active_host = agent['host']
if not active_host:
#
# Somehow we didn't find the host above, assume
# it is active.
#
return True
if active_host != local_host:
return False
return True
def _get_interface_stats_key_for_dpdk(self, metric_name, measure):
""""Get the interface statistics keys value based on type of port."""
# The reason for swapping the metric_map value is in DPDK the tx counters
# given by ovs-vsctl is actually rx counter and vice-versa.
metrics_map = {"out_%s" % measure: "rx_bytes",
"in_%s" % measure: "tx_bytes",
"in_packets": "tx_packets",
"out_packets": "rx_packets",
"out_dropped": "tx_dropped",
"in_dropped": "rx_dropped",
"out_errors": "tx_errors",
"in_errors": "rx_errors"}
return metrics_map[metric_name]
def _get_measure(self):
if self.use_bits:
measure = 'bits'
else:
measure = 'bytes'
return measure
def _get_interface_stats_key(self, idx, metric_name, measure, interface):
"""Get the interface statistics key based on the interface."""
if re.match(r"^" + DPDK_PORT_PREFIX, interface):
# ovs-vsctl does not give rx_dropped and tx_error statistics for DPDK ports
# This check is for DPDK ports.
return self._get_interface_stats_key_for_dpdk(metric_name,
measure)
else:
# For non DPDK ports return the same idx value.
return idx