From ceb96144cc72674c8429693c11cf9399955811fc Mon Sep 17 00:00:00 2001 From: Matthew Treinish Date: Fri, 11 Dec 2015 15:35:35 -0500 Subject: [PATCH] Properly interpolate missing data for test_runs This commit modifies the data interpolation of the test_run_aggregator method used by the per test view. Previously the commit just removed rows post resample and d3 would do a linear interpolation between points for missing data points. However, this is disingenuous because it gives the illusion that there were results for that period. This commit switches to use pandas' index aware interpolation with a limit of 20 (the same as our sample size for the rolling mean and std dev calculations) consecutive interpolated points. This way we show large gaps in the data properly, but smaller gaps are treated like they were previously. Change-Id: I8998faabe4e7fb523e2a8b5cdddb2b9e16e46e47 --- openstack_health/base_aggregator.py | 7 +++++++ openstack_health/test_run_aggregator.py | 22 ++++++++-------------- 2 files changed, 15 insertions(+), 14 deletions(-) diff --git a/openstack_health/base_aggregator.py b/openstack_health/base_aggregator.py index 452efdc6..85c59a8d 100644 --- a/openstack_health/base_aggregator.py +++ b/openstack_health/base_aggregator.py @@ -12,6 +12,13 @@ # License for the specific language governing permissions and limitations # under the License. 
+resample_matrix = { + 'day': 'D', + 'hour': '1H', + 'min': '1T', + 'sec': '1S', +} + class BaseAggregator(object): def _update_datetime_to_fit_resolution(self, diff --git a/openstack_health/test_run_aggregator.py b/openstack_health/test_run_aggregator.py index adc9a935..fb945ca9 100644 --- a/openstack_health/test_run_aggregator.py +++ b/openstack_health/test_run_aggregator.py @@ -15,7 +15,7 @@ import pandas as pd from subunit2sql import read_subunit -from base_aggregator import BaseAggregator +import base_aggregator as base def convert_test_runs_list_to_time_series_dict(test_runs_list, resample): @@ -49,22 +49,16 @@ def convert_test_runs_list_to_time_series_dict(test_runs_list, resample): df['stddev_run_time'] = pd.rolling_std(df['run_time'], 20) # Resample numeric data for the run_time graph from successful runs - resample_matrix = { - 'day': 'D', - 'hour': '1H', - 'min': '1T', - 'sec': '1S', - } numeric_df = df[df['status'] == 'success'].resample( - resample_matrix[resample], how='mean') + base.resample_matrix[resample], how='mean') # Drop duplicate or invalid colums del(numeric_df['run_id']) del(df['run_time']) - del(df['avg_run_time']) - del(df['stddev_run_time']) - - # Drop missing data from the resample - numeric_df = numeric_df.dropna(how='all') + # Interpolate missing data + numeric_df['run_time'] = numeric_df.interpolate(method='time', limit=20) + # Add rolling mean and std dev of run_time to dataframe + numeric_df['avg_run_time'] = pd.rolling_mean(numeric_df['run_time'], 20) + numeric_df['stddev_run_time'] = pd.rolling_std(numeric_df['run_time'], 20) # Convert the dataframes to a dict numeric_dict = dict( @@ -131,7 +125,7 @@ class Counter(object): return (self.passes, self.failures, self.skips) -class TestRunAggregator(BaseAggregator): +class TestRunAggregator(base.BaseAggregator): def __init__(self, test_runs): self.test_runs = test_runs