monasca-transform/monasca_transform/processor/pre_hourly_processor.py

# Copyright 2016 Hewlett Packard Enterprise Development Company LP
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
import datetime
import logging

from kafka.common import OffsetRequestPayload
from kafka import KafkaClient
from oslo_config import cfg
from pyspark.sql import SQLContext
from pyspark.streaming.kafka import KafkaUtils
from pyspark.streaming.kafka import OffsetRange
import simport

from monasca_transform.component.insert.kafka_insert import KafkaInsert
from monasca_transform.component.setter.rollup_quantity import RollupQuantity
from monasca_transform.data_driven_specs.data_driven_specs_repo \
    import DataDrivenSpecsRepo
from monasca_transform.data_driven_specs.data_driven_specs_repo \
    import DataDrivenSpecsRepoFactory
from monasca_transform.processor import Processor
from monasca_transform.transform.storage_utils import StorageUtils
from monasca_transform.transform.transform_utils import InstanceUsageUtils
from monasca_transform.transform import TransformContextUtils

LOG = logging.getLogger(__name__)


class PreHourlyProcessor(Processor):
"""Processor to process usage data published to metrics_pre_hourly topic a
and publish final rolled up metrics to metrics topic in kafka.
"""
@staticmethod
def log_debug(message):
LOG.debug(message)
@staticmethod
def save_kafka_offsets(current_offsets,
batch_time_info):
"""save current offsets to offset specification."""
offset_specs = simport.load(cfg.CONF.repositories.offsets)()
app_name = PreHourlyProcessor.get_app_name()
for o in current_offsets:
PreHourlyProcessor.log_debug(
"saving: OffSetRanges: %s %s %s %s, "
"batch_time_info: %s" % (
o.topic, o.partition, o.fromOffset, o.untilOffset,
str(batch_time_info)))
# add new offsets, update revision
offset_specs.add_all_offsets(app_name,
current_offsets,
batch_time_info)
@staticmethod
def reset_kafka_offsets(app_name):
"""delete all offsets from the offset specification."""
        # load the configured offsets repository
offset_specs = simport.load(cfg.CONF.repositories.offsets)()
offset_specs.delete_all_kafka_offsets(app_name)
@staticmethod
def get_app_name():
"""get name of this application. Will be used to
store offsets in database
"""
return "mon_metrics_kafka_pre_hourly"
@staticmethod
def get_kafka_topic():
"""get name of kafka topic for transformation."""
return "metrics_pre_hourly"
@staticmethod
def is_time_to_run(check_time):
"""return True if its time to run this processor.
For now it just checks to see if its start of the hour
i.e. the minute is 00.
"""
this_min = int(datetime.datetime.strftime(check_time, '%M'))
# run pre hourly processor only at top of the hour
if this_min == 0:
return True
else:
return False
@staticmethod
def _get_offsets_from_kafka(brokers,
topic,
offset_time):
"""get dict representing kafka
offsets.
"""
# get client
client = KafkaClient(brokers)
# get partitions for a topic
partitions = client.topic_partitions[topic]
# https://cwiki.apache.org/confluence/display/KAFKA/
# A+Guide+To+The+Kafka+Protocol#
# AGuideToTheKafkaProtocol-OffsetRequest
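        # an offset_time of -1 asks kafka for the latest offset of each
        # partition, -2 for the earliest; MAX_OFFSETS limits the reply
        # to a single offset per partition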
MAX_OFFSETS = 1
offset_requests = [OffsetRequestPayload(topic,
part_name,
offset_time,
MAX_OFFSETS) for part_name
in partitions.keys()]
offsets_responses = client.send_offset_request(offset_requests)
offset_dict = {}
for response in offsets_responses:
key = "_".join((response.topic,
str(response.partition)))
offset_dict[key] = response
return offset_dict
@staticmethod
def _parse_saved_offsets(app_name, topic, saved_offset_spec):
"""get dict representing saved
offsets.
"""
offset_dict = {}
for key, value in saved_offset_spec.items():
if key.startswith("%s_%s" % (app_name, topic)):
spec_app_name = value.get_app_name()
spec_topic = value.get_topic()
spec_partition = int(value.get_partition())
spec_from_offset = value.get_from_offset()
spec_until_offset = value.get_until_offset()
key = "_".join((spec_topic,
str(spec_partition)))
offset_dict[key] = (spec_app_name,
spec_topic,
spec_partition,
spec_from_offset,
spec_until_offset)
return offset_dict
@staticmethod
def _get_new_offset_range_list(brokers, topic):
"""get offset range from earliest to latest."""
offset_range_list = []
# https://cwiki.apache.org/confluence/display/KAFKA/
# A+Guide+To+The+Kafka+Protocol#
# AGuideToTheKafkaProtocol-OffsetRequest
GET_LATEST_OFFSETS = -1
latest_dict = PreHourlyProcessor.\
_get_offsets_from_kafka(brokers, topic,
GET_LATEST_OFFSETS)
GET_EARLIEST_OFFSETS = -2
earliest_dict = PreHourlyProcessor.\
_get_offsets_from_kafka(brokers, topic,
GET_EARLIEST_OFFSETS)
for item in latest_dict:
until_offset = latest_dict[item].offsets[0]
from_offset = earliest_dict[item].offsets[0]
partition = latest_dict[item].partition
topic = latest_dict[item].topic
offset_range_list.append(OffsetRange(topic,
partition,
from_offset,
until_offset))
return offset_range_list
@staticmethod
def _get_offset_range_list(brokers,
topic,
app_name,
saved_offset_spec):
"""get offset range from saved offset to latest.
"""
offset_range_list = []
# https://cwiki.apache.org/confluence/display/KAFKA/
# A+Guide+To+The+Kafka+Protocol#
# AGuideToTheKafkaProtocol-OffsetRequest
GET_LATEST_OFFSETS = -1
latest_dict = PreHourlyProcessor.\
_get_offsets_from_kafka(brokers, topic,
GET_LATEST_OFFSETS)
GET_EARLIEST_OFFSETS = -2
earliest_dict = PreHourlyProcessor.\
_get_offsets_from_kafka(brokers, topic,
GET_EARLIEST_OFFSETS)
saved_dict = PreHourlyProcessor.\
_parse_saved_offsets(app_name, topic, saved_offset_spec)
for item in latest_dict:
# saved spec
(spec_app_name,
spec_topic_name,
spec_partition,
spec_from_offset,
spec_until_offset) = saved_dict[item]
# until
until_offset = latest_dict[item].offsets[0]
# from
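            # resume from the saved until_offset when one exists,
            # otherwise fall back to the earliest offset in kafka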
if (spec_until_offset is not None and int(spec_until_offset) >= 0):
from_offset = spec_until_offset
else:
from_offset = earliest_dict[item].offsets[0]
partition = latest_dict[item].partition
topic = latest_dict[item].topic
offset_range_list.append(OffsetRange(topic,
partition,
from_offset,
until_offset))
return offset_range_list
@staticmethod
def get_processing_offset_range_list(processing_time):
"""get offset range to fetch data from. The
range will last from the last saved offsets to current offsets
available. If there are no last saved offsets available in the
database the starting offsets will be set to the earliest
available in kafka.
"""
offset_specifications = simport.load(cfg.CONF.repositories.offsets)()
# get application name, will be used to get offsets from database
app_name = PreHourlyProcessor.get_app_name()
saved_offset_spec = offset_specifications.get_kafka_offsets(app_name)
# get kafka topic to fetch data
topic = PreHourlyProcessor.get_kafka_topic()
offset_range_list = []
if len(saved_offset_spec) < 1:
PreHourlyProcessor.log_debug(
"No saved offsets available..."
"connecting to kafka and fetching "
"from earliest available offset ...")
offset_range_list = PreHourlyProcessor._get_new_offset_range_list(
cfg.CONF.messaging.brokers,
topic)
else:
PreHourlyProcessor.log_debug(
"Saved offsets available..."
"connecting to kafka and fetching from saved offset ...")
offset_range_list = PreHourlyProcessor._get_offset_range_list(
cfg.CONF.messaging.brokers,
topic,
app_name,
saved_offset_spec)
return offset_range_list
@staticmethod
def fetch_pre_hourly_data(spark_context,
offset_range_list):
"""get metrics pre hourly data from offset range list."""
        # read the batch of messages covered by the offset range list
pre_hourly_rdd = KafkaUtils.createRDD(spark_context,
{"metadata.broker.list":
cfg.CONF.messaging.brokers},
offset_range_list)
return pre_hourly_rdd
@staticmethod
def pre_hourly_to_instance_usage_df(pre_hourly_rdd):
"""convert raw pre hourly data into instance usage dataframe."""
#
# extract second column containing instance usage data
#
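        # each kafka record is a (key, value) tuple; the value holds the
        # instance usage data as json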
instance_usage_rdd = pre_hourly_rdd.map(
lambda iud: iud[1])
#
# convert usage data rdd to instance usage df
#
sqlc = SQLContext.getOrCreate(pre_hourly_rdd.context)
instance_usage_df = \
InstanceUsageUtils.create_df_from_json_rdd(
sqlc,
instance_usage_rdd)
return instance_usage_df
@staticmethod
def process_instance_usage(transform_context, instance_usage_df):
"""second stage aggregation. Aggregate instance usage rdd
data and write results to metrics topic in kafka.
"""
transform_spec_df = transform_context.transform_spec_df_info
#
# do a rollup operation
#
agg_params = transform_spec_df.select(
"aggregation_params_map.pre_hourly_group_by_list")\
.collect()[0].asDict()
pre_hourly_group_by_list = agg_params["pre_hourly_group_by_list"]
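        # a group by list of just "default" expands to the standard set
        # of instance usage dimensions below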
if (len(pre_hourly_group_by_list) == 1 and
pre_hourly_group_by_list[0] == "default"):
pre_hourly_group_by_list = ["tenant_id", "user_id",
"resource_uuid",
"geolocation", "region", "zone",
"host", "project_id",
"aggregated_metric_name",
"aggregation_period"]
# get aggregation period
agg_params = transform_spec_df.select(
"aggregation_params_map.aggregation_period").collect()[0].asDict()
aggregation_period = agg_params["aggregation_period"]
        # get the second stage (pre hourly) operation
agg_params = transform_spec_df.select(
"aggregation_params_map.pre_hourly_operation")\
.collect()[0].asDict()
pre_hourly_operation = agg_params["pre_hourly_operation"]
instance_usage_df = \
RollupQuantity.do_rollup(pre_hourly_group_by_list,
aggregation_period,
pre_hourly_operation,
instance_usage_df)
# insert metrics
instance_usage_df = KafkaInsert.insert(transform_context,
instance_usage_df)
return instance_usage_df
@staticmethod
def do_transform(instance_usage_df):
"""start processing (aggregating) metrics
"""
#
# look in instance_usage_df for list of metrics to be processed
#
metric_ids_df = instance_usage_df.select(
"processing_meta.metric_id").distinct()
metric_ids_to_process = [row.metric_id
for row in metric_ids_df.collect()]
data_driven_specs_repo = DataDrivenSpecsRepoFactory.\
get_data_driven_specs_repo()
sqlc = SQLContext.getOrCreate(instance_usage_df.rdd.context)
transform_specs_df = data_driven_specs_repo.get_data_driven_specs(
sql_context=sqlc,
data_driven_spec_type=DataDrivenSpecsRepo.transform_specs_type)
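        # process each metric id against its own transform spec, using
        # only the slice of instance usage data for that metric id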
for metric_id in metric_ids_to_process:
transform_spec_df = transform_specs_df.select(
["aggregation_params_map", "metric_id"]
).where(transform_specs_df.metric_id == metric_id)
source_instance_usage_df = instance_usage_df.select("*").where(
instance_usage_df.processing_meta.metric_id == metric_id)
# set transform_spec_df in TransformContext
transform_context = \
TransformContextUtils.get_context(
transform_spec_df_info=transform_spec_df)
PreHourlyProcessor.process_instance_usage(
transform_context, source_instance_usage_df)
@staticmethod
def run_processor(spark_context, processing_time):
"""process data in metrics_pre_hourly queue, starting
from the last saved offsets, else start from earliest
offsets available
"""
offset_range_list = \
PreHourlyProcessor.get_processing_offset_range_list(
processing_time)
# get pre hourly data
pre_hourly_rdd = PreHourlyProcessor.fetch_pre_hourly_data(
spark_context, offset_range_list)
# get instance usage df
instance_usage_df = PreHourlyProcessor.pre_hourly_to_instance_usage_df(
pre_hourly_rdd)
#
# cache instance usage df
#
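        # do_transform filters the dataframe once per metric id, so
        # caching avoids re-reading the batch from kafka for every metric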
if cfg.CONF.pre_hourly_processor.enable_instance_usage_df_cache:
storage_level_prop = \
cfg.CONF.pre_hourly_processor\
.instance_usage_df_cache_storage_level
storage_level = StorageUtils.get_storage_level(
storage_level_prop)
instance_usage_df.persist(storage_level)
# aggregate pre hourly data
PreHourlyProcessor.do_transform(instance_usage_df)
# remove cache
if cfg.CONF.pre_hourly_processor.enable_instance_usage_df_cache:
instance_usage_df.unpersist()
# save latest metrics_pre_hourly offsets in the database
PreHourlyProcessor.save_kafka_offsets(offset_range_list,
processing_time)