Properly interpolate missing data for test_runs

This commit modifies the data interpolation of the test_run_aggregator
method used by the per test view. Previously the code just removed
rows post resample and d3 would do a linear interpolation between
points for missing data points. However, this is disingenuous because
it gives the illusion that there were results for that period. This
commit switches to use pandas' index aware interpolation with a limit
of 20 (the same as our sample size for the rolling mean and std dev
calculations) consecutive interpolated points. This way we show large
gaps in the data properly, but smaller gaps are treated like they
were previously.

Change-Id: I8998faabe4e7fb523e2a8b5cdddb2b9e16e46e47
This commit is contained in:
Matthew Treinish 2015-12-11 15:35:35 -05:00
parent 2b17af27d1
commit ceb96144cc
No known key found for this signature in database
GPG Key ID: FD12A0F214C9E177
2 changed files with 15 additions and 14 deletions

View File

@ -12,6 +12,13 @@
# License for the specific language governing permissions and limitations
# under the License.
resample_matrix = {
'day': 'D',
'hour': '1H',
'min': '1T',
'sec': '1S',
}
class BaseAggregator(object):
def _update_datetime_to_fit_resolution(self,

View File

@ -15,7 +15,7 @@
import pandas as pd
from subunit2sql import read_subunit
from base_aggregator import BaseAggregator
import base_aggregator as base
def convert_test_runs_list_to_time_series_dict(test_runs_list, resample):
@ -49,22 +49,16 @@ def convert_test_runs_list_to_time_series_dict(test_runs_list, resample):
df['stddev_run_time'] = pd.rolling_std(df['run_time'], 20)
# Resample numeric data for the run_time graph from successful runs
resample_matrix = {
'day': 'D',
'hour': '1H',
'min': '1T',
'sec': '1S',
}
numeric_df = df[df['status'] == 'success'].resample(
resample_matrix[resample], how='mean')
base.resample_matrix[resample], how='mean')
# Drop duplicate or invalid columns
del(numeric_df['run_id'])
del(df['run_time'])
del(df['avg_run_time'])
del(df['stddev_run_time'])
# Drop missing data from the resample
numeric_df = numeric_df.dropna(how='all')
# Interpolate missing data
numeric_df['run_time'] = numeric_df.interpolate(method='time', limit=20)
# Add rolling mean and std dev of run_time to dataframe
numeric_df['avg_run_time'] = pd.rolling_mean(numeric_df['run_time'], 20)
numeric_df['stddev_run_time'] = pd.rolling_std(numeric_df['run_time'], 20)
# Convert the dataframes to a dict
numeric_dict = dict(
@ -131,7 +125,7 @@ class Counter(object):
return (self.passes, self.failures, self.skips)
class TestRunAggregator(BaseAggregator):
class TestRunAggregator(base.BaseAggregator):
def __init__(self, test_runs):
self.test_runs = test_runs