class LogisticRegression(BaseSML):
    """Anomaly detection based on the LogisticRegression algorithm.

    Labelled samples are split into a training part and a testing part;
    a scikit-learn ``linear_model.LogisticRegression`` is fitted on the
    former, evaluated on the latter, and the fitted classifier is returned.
    """

    def __init__(self, _id, _config):
        """Store the number of samples needed before learning.

        :param _id: identifier forwarded to the base class.
        :param _config: configuration dictionary; its 'nb_samples' entry
                        is coerced to ``int``.
        """
        super(LogisticRegression, self).__init__(_id, _config)
        self._nb_samples = int(_config['nb_samples'])

    @staticmethod
    def validate_config(_config):
        """Validate ``_config`` against the schema of this component.

        Both 'module' and 'nb_samples' are required.

        :param _config: configuration dictionary to check.
        :returns: the validated configuration, as returned by voluptuous.
        """
        log_reg_schema = voluptuous.Schema({
            'module': voluptuous.And(
                basestring, NoSpaceCharacter()),
            'nb_samples': voluptuous.Or(float, int)
        }, required=True)
        return log_reg_schema(_config)

    @staticmethod
    def get_default_config():
        """Return a configuration using the default number of samples."""
        return {
            'module': LogisticRegression.__name__,
            'nb_samples': N_SAMPLES
        }

    @staticmethod
    def get_params():
        """Describe the parameters accepted by this component.

        Declared static like ``validate_config`` and
        ``get_default_config``: the function takes no ``self``, so without
        the decorator calling it on an instance raises ``TypeError``.
        """
        # NOTE(review): ``params`` and ``type_util`` are not among the
        # imports visible in this module -- confirm they are imported at
        # module level, otherwise this raises NameError when called.
        return [
            params.ParamDescriptor('nb_samples', type_util.Number(), N_SAMPLES)
        ]

    def number_of_samples_required(self):
        """Return the configured number of samples."""
        return self._nb_samples

    def _generate_train_test_sets(self, samples, ratio_train):
        """Split labelled samples into training and testing sets.

        :param samples: 2-D array; the last column holds the labels and
                        the remaining columns hold the features.
        :param ratio_train: fraction of the rows (taken from the start)
                            that goes into the training set.
        :returns: tuple ``(X_train, X_train_label, X_test, X_test_label)``
                  where the label entries are flattened to 1-D.
        """
        num_samples_train = int(len(samples) * ratio_train)

        # Split off the last column: everything before it is feature data.
        data, labels = np.hsplit(samples, [-1])
        X_train = np.array(data[:num_samples_train])
        X_train_label = np.array(labels[:num_samples_train]).ravel()
        X_test = np.array(data[num_samples_train:])
        X_test_label = np.array(labels[num_samples_train:]).ravel()
        return X_train, X_train_label, X_test, X_test_label

    def _get_best_detector(self, train, label):
        """Fit a logistic-regression classifier with default settings.

        :param train: training feature rows.
        :param label: labels matching ``train``.
        :returns: the fitted ``linear_model.LogisticRegression``.
        """
        detector = linear_model.LogisticRegression()
        detector.fit(train, label)
        return detector

    def learn_structure(self, samples):
        """Train a classifier on 75% of ``samples`` and log its quality.

        The remaining 25% of the rows are used to log the number of
        predicted anomalies and a classification report.

        :param samples: 2-D array whose last column holds the labels.
        :returns: the fitted ``linear_model.LogisticRegression``.
        """
        X_train, X_train_label, X_test, X_test_label = \
            self._generate_train_test_sets(samples, 0.75)
        # Lazy %-style arguments: the message is only built if emitted.
        logger.info('Training with %d samples; testing with %d samples.',
                    len(X_train), len(X_test))

        lr_detector = self._get_best_detector(X_train, X_train_label)
        Y_test = lr_detector.predict(X_test)

        # Count predictions equal to ANOMALY without copying the matches.
        num_anomalies = np.count_nonzero(Y_test == ANOMALY)
        logger.info('Found %d anomalies in testing set', num_anomalies)

        logger.info('Confusion Matrix: \n%s',
                    classification_report(
                        X_test_label,
                        Y_test,
                        target_names=['no', 'yes']))
        return lr_detector
class TestLogisticRegression(MonanasTestCase):
    """Unit tests for the logistic-regression SML component."""

    def setUp(self):
        super(TestLogisticRegression, self).setUp()
        config = {"module": "fake", "nb_samples": 1000}
        self.lr_sml = logistic_regression.LogisticRegression(
            "fakeid", config)

    def tearDown(self):
        super(TestLogisticRegression, self).tearDown()

    def get_testing_data(self):
        """Return a random 1000x5 array: 4 feature columns + 0/1 labels."""
        # Four uniform feature vectors are drawn first, then the labels.
        columns = [np.random.uniform(size=1000) for _ in range(4)]
        columns.append(np.random.randint(2, size=1000))
        return np.array(columns).T

    def test_generate_train_test_sets(self):
        """A 0.6 ratio must yield 600 training and 400 testing rows."""
        samples = self.get_testing_data()
        train, train_labels, test, test_labels = \
            self.lr_sml._generate_train_test_sets(samples, 0.6)
        self.assertEqual(600, len(train))
        self.assertEqual(600, len(train_labels))
        self.assertEqual(400, len(test))
        self.assertEqual(400, len(test_labels))

    def test_learn_structure(self):
        """learn_structure must hand back the fitted sklearn classifier."""
        samples = self.get_testing_data()
        classifier = self.lr_sml.learn_structure(samples)
        self.assertIsInstance(classifier, linear_model.LogisticRegression)
"EllipticEnvelope", - "DecisionTreeClassifier"], + "DecisionTreeClassifier", + "LogisticRegression"], names) def test_get_voter_class_by_name(self):