From 8009742c0f4fd7870698dc5eb59f9f94fd02789d Mon Sep 17 00:00:00 2001 From: Hiroyuki Eguchi Date: Wed, 8 Feb 2017 16:37:37 +0900 Subject: [PATCH] Add split dataset method In machine learning, user splits the dataset in general. One is for creating a prediction model, the other is for evaluating model. Add a split dataset command to make it possible. implements blueprint split-dataset Change-Id: Idb323a1240d790d5628f9eb31cba8d3a29ad64e8 --- meteos/api/v1/datasets.py | 10 +++++- meteos/cluster/binary/meteos-script-1.6.0.py | 18 +++++++++++ meteos/engine/api.py | 33 ++++++++++++++++++-- meteos/engine/drivers/generic.py | 5 ++- meteos/engine/manager.py | 8 +++++ 5 files changed, 70 insertions(+), 4 deletions(-) diff --git a/meteos/api/v1/datasets.py b/meteos/api/v1/datasets.py index dc670a9..5927b50 100644 --- a/meteos/api/v1/datasets.py +++ b/meteos/api/v1/datasets.py @@ -128,6 +128,12 @@ class DatasetController(wsgi.Controller, wsgi.AdminActionsMixin): swift_tenant = dataset.get('swift_tenant') swift_username = dataset.get('swift_username') swift_password = dataset.get('swift_password') + percent_train = dataset.get('percent_train', '0.7') + percent_test = dataset.get('percent_test', '0.3') + + if (method == 'split' + and not float(percent_train) + float(percent_test) == 1.0): + raise exc.HTTPUnprocessableEntity() new_dataset = self.engine_api.create_dataset(context, display_name, @@ -141,7 +147,9 @@ class DatasetController(wsgi.Controller, wsgi.AdminActionsMixin): experiment.cluster_id, swift_tenant, swift_username, - swift_password) + swift_password, + percent_train, + percent_test) return self._view_builder.detail(req, new_dataset) diff --git a/meteos/cluster/binary/meteos-script-1.6.0.py b/meteos/cluster/binary/meteos-script-1.6.0.py index 3fa285d..0d029c8 100644 --- a/meteos/cluster/binary/meteos-script-1.6.0.py +++ b/meteos/cluster/binary/meteos-script-1.6.0.py @@ -521,6 +521,24 @@ class MeteosSparkController(object): exec('self.data = self.data' + cmd) self.save_data() + def split_dataset(self): + + self.load_data() + + percent_train = self.job_args['dataset']['percent_train'] + percent_test = self.job_args['dataset']['percent_test'] + + # Split the data into training and test sets + (trainData, testData) = self.data.randomSplit([float(percent_train), + float(percent_test)]) + self.data = trainData + self.save_data() + + # save testData + testData.collect() + datapath = 'data-' + self.job_args['dataset']['test_dataset']['id'] + testData.saveAsTextFile(datapath) + def create_model(self): self.load_data() diff --git a/meteos/engine/api.py b/meteos/engine/api.py index 9a91c07..34ec294 100644 --- a/meteos/engine/api.py +++ b/meteos/engine/api.py @@ -20,6 +20,7 @@ Handles all requests relating to learnings. """ +import copy from oslo_log import log from oslo_utils import excutils import six @@ -230,7 +231,8 @@ class API(base.Base): def create_dataset(self, context, name, description, method, source_dataset_url, params, template_id, job_template_id, experiment_id, cluster_id, - swift_tenant, swift_username, swift_password): + swift_tenant, swift_username, swift_password, + percent_train, percent_test): """Create a Dataset""" policy.check_policy(context, 'dataset', 'create') @@ -247,6 +249,24 @@ class API(base.Base): 'cluster_id': cluster_id } + test_dataset = {} + + if method == 'split': + train_data_name = name + '_train_' + percent_train + test_data_name = name + '_test_' + percent_test + + tmp_dataset = copy.deepcopy(dataset) + + dataset['display_name'] = train_data_name + + tmp_dataset['display_name'] = test_data_name + + try: + test_dataset = self.db.dataset_create(context, tmp_dataset) + except Exception: + with excutils.save_and_reraise_exception(): + self.db.dataset_delete(context, test_dataset['id']) + try: result = self.db.dataset_create(context, dataset) result['template_id'] = template_id @@ -254,11 +274,20 @@ class API(base.Base): result['swift_tenant'] = swift_tenant result['swift_username'] = swift_username result['swift_password'] = swift_password + result['test_dataset'] = test_dataset + result['percent_train'] = percent_train + result['percent_test'] = percent_test + self.engine_rpcapi.create_dataset(context, result) updates = {'status': constants.STATUS_CREATING} - LOG.info(_LI("Accepted parsing of dataset %s."), result['id']) self.db.dataset_update(context, result['id'], updates) + + if result['test_dataset']: + self.db.dataset_update(context, + result['test_dataset']['id'], + updates) + except Exception: with excutils.save_and_reraise_exception(): self.db.dataset_delete(context, result['id']) diff --git a/meteos/engine/drivers/generic.py b/meteos/engine/drivers/generic.py index 195b6d8..d620b28 100644 --- a/meteos/engine/drivers/generic.py +++ b/meteos/engine/drivers/generic.py @@ -162,7 +162,10 @@ class GenericLearningDriver(driver.LearningDriver): job_args['source_dataset_url'] = request_specs\ .get('source_dataset_url') job_args['dataset_format'] = request_specs.get('dataset_format') - dataset_args = {'params': request_specs.get('params')} + dataset_args = {'params': request_specs.get('params'), + 'test_dataset': request_specs.get('test_dataset'), + 'percent_train': request_specs.get('percent_train'), + 'percent_test': request_specs.get('percent_test')} job_args['dataset'] = dataset_args # Set parameters of Swift diff --git a/meteos/engine/manager.py b/meteos/engine/manager.py index d5e078a..e31637a 100644 --- a/meteos/engine/manager.py +++ b/meteos/engine/manager.py @@ -227,6 +227,14 @@ class LearningManager(manager.Manager): self._update_status(context, 'DataSet', request_spec['id'], job_id, stdout, stderr) + if request_spec['test_dataset']: + self._update_status(context, + 'DataSet', + request_spec['test_dataset']['id'], + job_id, + None, + stderr) + def delete_dataset(self, context, cluster_id=None, job_id=None, id=None): """Deletes a Dataset.""" context = context.elevated()