Properly interpolate missing data for test_runs

This commit modifies the data interpolation of the test_run_aggregator method used by the per-test view. Previously the code just removed rows after the resample, and d3 would do a linear interpolation between points for missing data points. However, this is disingenuous because it gives the illusion that there were results for that period. This commit switches to using pandas' index-aware interpolation with a limit of 20 (the same as our sample size for the rolling mean and std dev calculations) consecutive interpolated points. This way we show large gaps in the data properly, but smaller gaps are treated like they were previously. Change-Id: I8998faabe4e7fb523e2a8b5cdddb2b9e16e46e47
2015-12-11 15:35:35 -05:00 · 2015-12-11 15:35:35 -05:00 · ceb96144cc
parent 2b17af27d1
commit ceb96144cc
2 changed files with 15 additions and 14 deletions
--- a/openstack_health/base_aggregator.py
+++ b/openstack_health/base_aggregator.py
@ -12,6 +12,13 @@
 # License for the specific language governing permissions and limitations
 # under the License.

+resample_matrix = {
+    'day': 'D',
+    'hour': '1H',
+    'min': '1T',
+    'sec': '1S',
+}
+

 class BaseAggregator(object):
    def _update_datetime_to_fit_resolution(self,
--- a/openstack_health/test_run_aggregator.py
+++ b/openstack_health/test_run_aggregator.py
@ -15,7 +15,7 @@
 import pandas as pd
 from subunit2sql import read_subunit

-from base_aggregator import BaseAggregator
+import base_aggregator as base


 def convert_test_runs_list_to_time_series_dict(test_runs_list, resample):
@ -49,22 +49,16 @@ def convert_test_runs_list_to_time_series_dict(test_runs_list, resample):
    df['stddev_run_time'] = pd.rolling_std(df['run_time'], 20)

    # Resample numeric data for the run_time graph from successful runs
-    resample_matrix = {
-        'day': 'D',
-        'hour': '1H',
-        'min': '1T',
-        'sec': '1S',
-    }
    numeric_df = df[df['status'] == 'success'].resample(
-        resample_matrix[resample], how='mean')
+        base.resample_matrix[resample], how='mean')
    # Drop duplicate or invalid colums
    del(numeric_df['run_id'])
    del(df['run_time'])
-    del(df['avg_run_time'])
-    del(df['stddev_run_time'])
-
-    # Drop missing data from the resample
-    numeric_df = numeric_df.dropna(how='all')
+    # Interpolate missing data
+    numeric_df['run_time'] = numeric_df.interpolate(method='time', limit=20)
+    # Add rolling mean and std dev of run_time to dataframe
+    numeric_df['avg_run_time'] = pd.rolling_mean(numeric_df['run_time'], 20)
+    numeric_df['stddev_run_time'] = pd.rolling_std(numeric_df['run_time'], 20)

    # Convert the dataframes to a dict
    numeric_dict = dict(
@ -131,7 +125,7 @@ class Counter(object):
        return (self.passes, self.failures, self.skips)


-class TestRunAggregator(BaseAggregator):
+class TestRunAggregator(base.BaseAggregator):
    def __init__(self, test_runs):
        self.test_runs = test_runs