monasca-analytics/monasca_analytics/sml/lingam.py

149 lines
4.6 KiB
Python

#!/usr/bin/env python
# Copyright (c) 2016 Hewlett Packard Enterprise Development Company, L.P.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
import logging
import math
import numpy as np
import schema
from sklearn import decomposition
from monasca_analytics.sml import base
logger = logging.getLogger(__name__)
class LiNGAM(base.BaseSML):
"""Causality discovery assuming a linear non gaussian acyclic data model"""
def __init__(self, _id, _config):
super(LiNGAM, self).__init__(_id, _config)
self._threshold = _config["params"]["threshold"]
self._threshold = 0.1
@staticmethod
def validate_config(_config):
schema.Schema({
"module": schema.And(basestring,
lambda i: not any(c.isspace() for c in i)),
"params": {
"threshold": float
}
}).validate(_config)
@staticmethod
def get_default_config():
return {
"module": LiNGAM.__name__,
"params": {
"threshold": 0.1
}
}
def number_of_samples_required(self):
return 5000
def learn_structure(self, samples):
logger.debug(samples.shape)
logger.debug(samples)
threshold = self._threshold
causality_matrix, _ = LiNGAM._discover_structure(samples)
structure = causality_matrix > threshold
logger.info("Causality Matrix: {0}".format(causality_matrix))
logger.info("Learned causality: {0}".format(structure))
return structure
@staticmethod
def _discover_structure(data):
# Add a random noise uniformly distributed to avoid singularity
# when performing the ICA
data += np.random.random_sample(data.shape)
# Create the ICA node to get the inverse of the mixing matrix
k, w, _ = decomposition.fastica(data)
w = np.dot(w, k)
n = w.shape[0]
best_nzd = float("inf")
best_slt = float("inf")
best_w_permuted = w
causality_matrix = None
causal_perm = None
if n < 9:
perm = LiNGAM._perms(n)
for i in range(perm.shape[1]):
perm_matrix = np.eye(n)
perm_matrix = perm_matrix[:, perm[:, i]]
w_permuted = perm_matrix.dot(w)
cost = LiNGAM._cost_non_zero_diag(w_permuted)
if cost < best_nzd:
best_nzd = cost
best_w_permuted = w_permuted
w_opt = best_w_permuted
w_opt = w_opt / np.diag(w_opt).reshape((n, 1))
b_matrix = np.eye(n) - w_opt
best_b_permuted = b_matrix
best_i = 0
for i in range(perm.shape[1]):
b_permuted = b_matrix[:, perm[:, i]][perm[:, i], :]
cost = LiNGAM._cost_strictly_lower_triangular(
b_permuted)
if cost < best_slt:
best_slt = cost
best_i = i
best_b_permuted = b_permuted
causal_perm = perm[:, best_i]
causality_matrix = b_matrix
percent_upper = best_slt / np.sum(best_b_permuted ** 2)
if percent_upper > 0.2:
# TODO(David): Change that code to raise an exception instead
logger.error("LiNGAM failed to run on the data set")
logger.error(
"--> B permuted matrix is at best {}% lower triangular"
.format(percent_upper))
return causality_matrix, causal_perm
@staticmethod
def _cost_strictly_lower_triangular(b):
return np.sum((np.tril(b, -1) - b) ** 2)
@staticmethod
def _cost_non_zero_diag(w):
return np.sum(1 / np.abs(np.diag(w)))
@staticmethod
def _perms(n):
k = 1
p = np.empty((2 * n - 1, math.factorial(n)), np.uint8)
for i in range(n):
p[i, :k] = i
p[i + 1:2 * i + 1, :k] = p[:i, :k]
for j in range(i):
p[:i + 1, k * (j + 1):k * (j + 2)] = p[j + 1:j + i + 2, :k]
k *= i + 1
return p[:n, :]