# Copyright 2016 Hewlett Packard Enterprise Development Company LP
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.

from monasca_transform.transform.grouping import Grouping
from monasca_transform.transform.grouping import GroupingResults
from monasca_transform.transform.grouping import RecordStoreWithGroupBy


class GroupSortbyTimestamp(Grouping):

    @staticmethod
    def log_debug(logStr):
        # print the message itself, not the str builtin
        print(logStr)
        # LOG.debug(logStr)

    @staticmethod
    def _get_groupby_key(row_decorated):
        """Build a group by key using the group by column list.

        row_decorated: [[Rows(a=1, b=1, c=2, d=3)],[groupby_a,groupby_b]]
        """
        # LOG.debug(whoami(row_decorated))
        # LOG.debug(row_decorated)

        row = row_decorated[0]
        groupby_columns_list = row_decorated[1]
        groupby_key = ""
        for gcol in groupby_columns_list:
            # look the column up by name with getattr rather than eval'ing
            # a built-up string, which referenced an undefined name and is
            # both unsafe and slow
            groupby_key = "^".join((groupby_key, getattr(row, gcol)))
        return groupby_key

    @staticmethod
    def _prepare_for_groupby(record_store_with_groupby_rdd):
        """Create a new rdd of (grouping key, record) pairs.

        The first element of each returned row is the grouping key built
        from the group by columns. The grouping key and event timestamp
        fields are used by the partitioning and sorting functions to
        partition the data by grouping key and then sort the elements
        within each group by timestamp.
        """
        # get the record store data and group by columns
        record_store_data = record_store_with_groupby_rdd.record_store_data
        groupby_columns_list = \
            record_store_with_groupby_rdd.groupby_columns_list

        # construct a group by key of the form
        # ^key1=value1^key2=value2^...
        groupby_key_value = ""
        for gcol in groupby_columns_list:
            groupby_key_value = \
                "^".join((groupby_key_value,
                          "=".join((gcol,
                                    getattr(record_store_data, gcol)))))

        # return a key-value pair
        return [groupby_key_value, record_store_data]
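
    # Illustration (hypothetical values, not from the source): with
    # groupby_columns_list = ["tenant_id", "host"] and a record whose
    # tenant_id is "t1" and host is "h1", _prepare_for_groupby returns
    #
    #     ["^tenant_id=t1^host=h1", record_store_data]
    #
    # i.e. the key starts with the "^" separator and pairs each group by
    # column with its value, which is the form Grouping._parse_grouping_key
    # decodes back into a dict further below.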

    @staticmethod
    def _sortby_timestamp(result_iterable):
        # LOG.debug(whoami(result_iterable.data[0]))

        # sorting the list in memory might cause OOM if the group has many
        # items; use the group_sortby_timestamp_partitions module instead
        # if you run into OOM
        sorted_list = sorted(result_iterable.data,
                             key=lambda row: row.event_timestamp_string)
        return sorted_list

    @staticmethod
    def _group_sortby_timestamp(record_store_df, groupby_columns_list):
        # convert the dataframe rdd to a plain rdd and attach the group by
        # column list to each row
        record_store_with_groupby_rdd = record_store_df.rdd.\
            map(lambda x: RecordStoreWithGroupBy(x, groupby_columns_list))

        # convert the rdd into a key-value rdd
        record_store_with_groupby_rdd_key_val = \
            record_store_with_groupby_rdd.\
            map(GroupSortbyTimestamp._prepare_for_groupby)

        # group by key, then sort the rows within each group by timestamp
        first_step = record_store_with_groupby_rdd_key_val.groupByKey()
        record_store_rdd_grouped_sorted = first_step.mapValues(
            GroupSortbyTimestamp._sortby_timestamp)
        return record_store_rdd_grouped_sorted

    @staticmethod
    def _get_group_first_last_quantity_udf(grouplistiter):
        """Return stats for a group.

        The stats include the group key, the first event timestamp and
        quantity, the last event timestamp and quantity, and the record
        count.
        """
        first_row = None
        last_row = None

        # extract the key and the list of grouped values
        group_key = grouplistiter[0]
        grouped_values = grouplistiter[1]

        count = 0.0
        for row in grouped_values:
            # set the first row
            if first_row is None:
                first_row = row

            # set the last row
            last_row = row
            count = count + 1

        first_event_timestamp_unix = None
        first_event_timestamp_string = None
        first_event_quantity = None
        if first_row is not None:
            first_event_timestamp_unix = first_row.event_timestamp_unix
            first_event_timestamp_string = first_row.event_timestamp_string
            first_event_quantity = first_row.event_quantity

        last_event_timestamp_unix = None
        last_event_timestamp_string = None
        last_event_quantity = None
        if last_row is not None:
            last_event_timestamp_unix = last_row.event_timestamp_unix
            last_event_timestamp_string = last_row.event_timestamp_string
            last_event_quantity = last_row.event_quantity

        results_dict = {
            "firstrecord_timestamp_unix": first_event_timestamp_unix,
            "firstrecord_timestamp_string": first_event_timestamp_string,
            "firstrecord_quantity": first_event_quantity,
            "lastrecord_timestamp_unix": last_event_timestamp_unix,
            "lastrecord_timestamp_string": last_event_timestamp_string,
            "lastrecord_quantity": last_event_quantity,
            "record_count": count}

        group_key_dict = Grouping._parse_grouping_key(group_key)

        return GroupingResults(group_key, results_dict, group_key_dict)

    @staticmethod
    def fetch_group_latest_oldest_quantity(record_store_df,
                                           transform_spec_df,
                                           groupby_columns_list):
        """Group record store data and sort by timestamp within each group.

        Returns the first and last timestamp along with the quantity for
        each group. This function uses the key-value pair rdd's
        groupByKey function to do the grouping.
        """
        # group, then order the elements within each group
        record_store_grouped_data_rdd = \
            GroupSortbyTimestamp._group_sortby_timestamp(
                record_store_df, groupby_columns_list)

        # find the stats for each group
        record_store_grouped_rows = \
            record_store_grouped_data_rdd.\
            map(GroupSortbyTimestamp.
                _get_group_first_last_quantity_udf)

        return record_store_grouped_rows
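

# ---------------------------------------------------------------------------
# Minimal usage sketch, not part of the original module. Assumptions: a
# local pyspark (2.x or later, for SparkSession) and a toy record store
# DataFrame whose column names match the row attributes read above
# (event_timestamp_unix, event_timestamp_string, event_quantity, plus a
# single group by column, tenant_id). transform_spec_df is accepted but
# never used by fetch_group_latest_oldest_quantity, so None is passed.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    from pyspark.sql import Row
    from pyspark.sql import SparkSession

    spark = SparkSession.builder.master("local[1]").getOrCreate()

    toy_rows = [
        Row(tenant_id="t1", event_timestamp_unix=1.0,
            event_timestamp_string="2016-01-01 00:00:01",
            event_quantity=10.0),
        Row(tenant_id="t1", event_timestamp_unix=2.0,
            event_timestamp_string="2016-01-01 00:00:02",
            event_quantity=20.0),
        Row(tenant_id="t2", event_timestamp_unix=3.0,
            event_timestamp_string="2016-01-01 00:00:03",
            event_quantity=5.0),
    ]
    record_store_df = spark.createDataFrame(toy_rows)

    grouped = GroupSortbyTimestamp.fetch_group_latest_oldest_quantity(
        record_store_df, None, ["tenant_id"])

    # each element is a GroupingResults object; print the repr since its
    # exact attribute names live in the companion grouping module
    for result in grouped.collect():
        print(result)

    spark.stop()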