# fuel-astute/lib/fuel_deployment/cluster.rb


# Copyright 2015 Mirantis, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
require 'erb'
require 'open3'
module Deployment
# The Cluster object contains nodes and controls the deployment flow.
# It loops through the nodes and runs tasks on them
# when the node is ready and the task is available.
#
# @attr [Object] uid Misc identifier of this process
# @attr_reader [Hash<Symbol => Deployment::Node>] nodes The nodes of this cluster
# @attr [Deployment::Concurrency::Counter] node_concurrency Controls the
# maximum number of nodes running tasks at the same time
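# @example A minimal usage sketch (the node name below is an illustrative
#   assumption, and tasks are expected to be added to each node's graph
#   before running):
#   cluster = Deployment::Cluster.new 'example'
#   node = cluster.node_create 'node-1'
#   result = cluster.run
#   result[:success]  # => true or false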
class Cluster
# @param [String] id Cluster name
def initialize(id=nil)
@nodes = {}
@uid = id
@node_concurrency = Deployment::Concurrency::Counter.new
@task_concurrency = Deployment::Concurrency::Group.new
@emergency_brake = false
@fault_tolerance_groups = []
@subgraphs = []
@dot_task_filter = nil
@dot_node_filter = nil
@dot_plot_number = 0
end
include Enumerable
include Deployment::Log
attr_accessor :uid
attr_accessor :gracefully_stop_mark
attr_accessor :subgraphs
attr_reader :emergency_brake
attr_reader :nodes
attr_reader :node_concurrency
attr_reader :task_concurrency
attr_reader :fault_tolerance_groups
attr_accessor :dot_node_filter
attr_accessor :dot_task_filter
attr_accessor :dot_plot_number
# Add an existing node object to the cluster
# @param [Deployment::Node] node a new node object
# @raise [Deployment::InvalidArgument] If the object is not a node
# @return [Deployment::Node]
def node_add(node)
raise Deployment::InvalidArgument.new self, 'Cluster can add only nodes!', node unless node.is_a? Deployment::Node
return node_get node if node_present? node
unless node.cluster == self
node.cluster.node_remove node if node.cluster
end
nodes.store prepare_key(node), node
node.cluster = self
node
end
alias :add_node :node_add
# Create a new node object by its name and add it to the cluster.
# Or, if the node already exists, return the existing node object.
# @param [String, Symbol] node The name of the new node
# @param [Class] node_class Optional custom node class
# @return [Deployment::Node]
def node_create(node, node_class=Deployment::Node)
if node_present? node
node = node_get node
elsif node.is_a? Deployment::Node
node = node_add node
else
node = node_class.new node, self
node = node_add node unless node_present? node
end
node
end
alias :create_node :node_create
alias :new_node :node_create
alias :node_new :node_create
# Remove a node from this cluster
# @param [Deployment::Node, String, Symbol] node
# @return [void]
def node_remove(node)
return unless node_present? node
nodes.delete prepare_key(node)
end
alias :remove_node :node_remove
# Retrieve a node object from the cluster
# @param [String, Symbol] node The name of the node to retrieve
# @return [Deployment::Node, nil]
def node_get(node)
nodes.fetch prepare_key(node), nil
end
alias :get_node :node_get
alias :[] :node_get
# Check if a node is present in the cluster
# @param [Deployment::Node, String, Symbol] node
# @return [true, false]
def node_present?(node)
nodes.key? prepare_key(node)
end
alias :has_node? :node_present?
alias :key? :node_present?
# Prepare the hash key from the node
# @param [Deployment::Node, String, Symbol] node
# @return [Symbol]
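# @example A small illustration (the node name is a hypothetical value):
#   prepare_key 'node-1'  # => :"node-1"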
def prepare_key(node)
node = node.name if node.is_a? Deployment::Node
node.to_s.to_sym
end
# Iterates through all cluster nodes
# @yield Deployment::Node
def each_node(&block)
nodes.each_value(&block)
end
alias :each :each_node
# Iterates through all the tasks on all nodes
# @yield Deployment::Task
def each_task
return to_enum(:each_task) unless block_given?
each_node do |node|
node.each_task do |task|
yield task
end
end
end
# Sets up subgraphs for execution
# e.g. a user might want to run only a subset
# of tasks: in this case an array of subgraphs
# to be executed is passed in.
# Each subgraph consists of starting vertices
# and ending vertices. These vertices are then
# traversed forward or backward
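# @example A sketch of the expected subgraph structure (the task names
#   are illustrative assumptions):
#   cluster.subgraphs = [
#     { 'start' => ['netconfig'], 'end' => ['deploy-haproxy'] },
#     { 'start' => ['upload-config'] }
#   ]
#   cluster.setup_start_end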
def setup_start_end
cluster_tasks_set = Set.new each_task
tasks_to_include = Set.new
self.subgraphs.each do |subgraph|
setup_start_end_piece(subgraph, cluster_tasks_set).each do |piece|
tasks_to_include.add piece
end
end
to_skip_tasks = cluster_tasks_set - tasks_to_include
to_skip_tasks.each do |task|
warn "Skipping task #{task} due to subgraph evaluation"
task.skip!
end
end
def setup_start_end_piece(subgraph, cluster)
start_tasks = Set.new
end_tasks = Set.new
subgraph.fetch('start', []).each do |task|
visit(task).each do |t|
start_tasks.add t
end
end
subgraph.fetch('end', []).each do |task|
visit(task, direction: :backward).each do |t|
end_tasks.add t
end
end
start_tasks = start_tasks.empty? ? cluster : start_tasks
end_tasks = end_tasks.empty? ? cluster : end_tasks
start_tasks & end_tasks
end
# Iterates through the tasks that are ready to be run
# @yield Deployment::Task
def each_ready_task
return to_enum(:each_ready_task) unless block_given?
each_task do |task|
yield task if task.ready?
end
end
# Check if graphs have a closed loop
# @return [true, false]
def has_loop?
begin
topology_sort
false
rescue Deployment::LoopDetected
true
end
end
# Topology sort all tasks in all graphs
# Tarjan's algorithm
# @return [Array<Deployment::Task>]
# @raise [Deployment::LoopDetected] If a loop is detected in the graph
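# @example A hedged sketch of the call (the returned task objects are
#   whatever has been added to the nodes):
#   cluster.topology_sort  # => tasks in dependency order
#   # raises Deployment::LoopDetected if the graph contains a cycle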
def topology_sort
topology = []
permanently_visited = Set.new
temporary_visited = []
loop do
next_task = each_task.find do |task|
not (permanently_visited.include? task or temporary_visited.include? task)
end
return topology unless next_task
visit(next_task, permanently_visited, temporary_visited).each do |task|
topology.insert 0, task
end
end
topology
end
# Tarjan's Algorithm visit function
# @return [Array<Deployment::Task>]
# @raise [Deployment::LoopDetected] If a loop is detected in the graph
# @param [Deployment::Task] task The task to visit
# These parameters are carried through the recursion calls:
# @param [Set<Deployment::Task>] permanently_visited Set of permanently visited tasks
# @param [Array<Deployment::Task>] temporary_visited List of temporarily visited tasks
# @param [Symbol] direction Which direction to traverse things: :forward or :backward
# @yield [Deployment::Task]
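# @example A hedged illustration of the enumerator form (the task variable
#   is a placeholder for any task in the graph):
#   visit(task).to_a                         # tasks reachable forward from task
#   visit(task, direction: :backward).to_a   # tasks reachable backward from task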
def visit(task, permanently_visited = Set.new, temporary_visited = [], direction: :forward, &block)
unless block_given?
return to_enum(:visit, task, permanently_visited, temporary_visited, direction: direction)
end
if temporary_visited.include? task
# This node has already been visited in this small iteration,
# which means that there is a loop.
temporary_visited << task
raise Deployment::LoopDetected.new self, 'Loop detected!', temporary_visited
end
if permanently_visited.include? task
# We have already checked this node for loops in
# its forward dependencies. Skip it.
return
end
# Start a small iteration over this node's forward dependencies
# add this node to the last iteration visit list and run recursion
# on the forward dependencies
temporary_visited << task
task_method = "each_#{direction}_dependency"
task.send(task_method.to_sym) do |_task|
visit(_task, permanently_visited, temporary_visited, direction: direction, &block)
end
# The small iteration has completed without loops.
# Add this node to the list of permanently marked nodes and
# remove it from the temporarily marked nodes list.
permanently_visited.add task
temporary_visited.delete task
# Yield this node so it can be inserted at the head of the topology sort list.
yield task
end
# Process a single node when it's visited.
# First, poll the node's status and leave if the node is not ready.
# Then try to get the next task from the node and run it, or leave
# if there is none available.
# @param [Deployment::Node] node
# @return [void]
def process_node(node)
debug "Process node: #{node}"
hook 'pre_node', node
return if node.skipped?
node.poll
hook 'internal_post_node_poll', node
hook 'post_node_poll', node
return unless node.ready?
ready_task = node.ready_task
return unless ready_task
ready_task.run
hook 'post_node', node
end
# Run a hook method if such a method is defined
# @param [String, Symbol] name Hook name
# @param [Object] args Hook arguments
def hook(name, *args)
name = ('hook_' + name.to_s).to_sym
send name, *args if respond_to? name
end
# Loops once through all nodes and processes each one
# @return [void]
def process_all_nodes
debug 'Start processing all nodes'
hook 'pre_all'
each_node do |node|
process_node node
end
hook 'post_all'
end
# Run this deployment process.
# It loops through all nodes, running tasks,
# until the deployment is considered finished.
# The deployment is finished when all the nodes have finished all of
# their tasks, either successfully or with other statuses.
# In practice, it's enough to check only for finished nodes.
# @return [Hash] the deployment result
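# @example A sketch of the result shape (the values shown are illustrative):
#   result = cluster.run
#   result[:success]  # => true or false
#   result[:status]   # => human-readable summary string
#   # when the run finishes with failures the hash also carries
#   # :failed_nodes, :skipped_nodes and :failed_tasks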
def run
ready_tasks = each_ready_task.to_a.join ', '
info "Starting the deployment process. Starting tasks: #{ready_tasks}"
hook 'internal_pre_run'
hook 'pre_run'
topology_sort
result = loop do
if all_nodes_are_successful?
status = 'All nodes are deployed successfully. '\
'Stopping the deployment process!'
result = {
:success => true,
:status => status,
}
break result
end
gracefully_stop! if has_failed_critical_nodes?
if all_nodes_are_finished?
status = "All nodes are finished. Failed tasks: "\
"#{failed_tasks.join ', '} Stopping the "\
"deployment process!"
result = if has_failed_critical_nodes?
{
:success => false,
:status => status,
:failed_nodes => failed_nodes,
:skipped_nodes => skipped_nodes,
:failed_tasks => failed_tasks
}
else
{
:success => true,
:status => status,
:failed_nodes => failed_nodes,
:skipped_nodes => skipped_nodes,
:failed_tasks => failed_tasks
}
end
break result
end
# run loop over all nodes
process_all_nodes
end
info result[:status]
hook 'post_run', result
result
end
alias :deploy :run
# Get the list of critical nodes
# @return [Array<Deployment::Node>]
def critical_nodes
select do |node|
node.critical?
end
end
# Get the list of critical nodes that have failed
# @return [Array<Deployment::Node>]
def failed_critical_nodes
critical_nodes.select do |node|
node.failed? && !node.skipped?
end
end
# Check if there are some critical nodes
# that have failed
# @return [true, false]
def has_failed_critical_nodes?
failed_critical_nodes.any?
end
# Get the list of the failed nodes
# @return [Array<Deployment::Node>]
def failed_nodes
select do |node|
node.failed? && !node.skipped?
end
end
# Get the list of the skipped nodes
# @return [Array<Deployment::Node>]
def skipped_nodes
select do |node|
node.skipped?
end
end
# Get the list of the failed tasks
# @return [Array<Deployment::Task>]
def failed_tasks
each_task.select do |task|
task.status == :failed
end
end
# Get the list of tasks that have no forward dependencies
# They are the ending points of the deployment.
# @return [Array<Deployment::Task>]
def ending_tasks
each_task.reject do |task|
task.dependency_forward_any?
end
end
# Get the list of tasks that have no backward dependencies
# They are the starting points of the deployment.
# @return [Array<Deployment::Task>]
def starting_tasks
each_task.reject do |task|
task.dependency_backward_any?
end
end
# Get the list of tasks that have no dependencies at all.
# They have most likely been lost for some reason.
# @return [Array<Deployment::Task>]
def orphan_tasks
each_task.reject do |task|
task.dependency_backward_any? or task.dependency_forward_any?
end
end
# Check if some nodes have failed
# @return [true, false]
def has_failed_nodes?
failed_nodes.any?
end
# Check if all nodes are finished
# @return [true, false]
def all_nodes_are_finished?
all? do |node|
node.finished?
end
end
# Check if all nodes are successful
# @return [true, false]
def all_nodes_are_successful?
all? do |node|
node.successful?
end
end
# Count the total task number on all nodes
# @return [Integer]
def tasks_total_count
inject(0) do |sum, node|
sum + node.graph.tasks_total_count
end
end
# Count the total number of the failed tasks
# @return [Integer]
def tasks_failed_count
inject(0) do |sum, node|
sum + node.graph.tasks_failed_count
end
end
# Count the total number of the successful tasks
# @return [Integer]
def tasks_successful_count
inject(0) do |sum, node|
sum + node.graph.tasks_successful_count
end
end
# Count the total number of the finished tasks
# @return [Integer]
def tasks_finished_count
inject(0) do |sum, node|
sum + node.graph.tasks_finished_count
end
end
# Count the total number of the pending tasks
# @return [Integer]
def tasks_pending_count
inject(0) do |sum, node|
sum + node.graph.tasks_pending_count
end
end
# Generate the deployment graph representation
# in the DOT language
# @return [String]
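# @example A hedged sample of the generated output (the task names and
#   the fill color are illustrative):
#   puts cluster.to_dot
#   # digraph "example" {
#   #   node[ style = "filled, solid"];
#   #   "task-a" [label = "task-a", fillcolor = "yellow"];
#   #   "task-a" -> "task-b";
#   # }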
def to_dot
template = <<-eos
digraph "<%= uid || 'graph' %>" {
node[ style = "filled, solid"];
<% each_task do |task| -%>
<% next unless task.name =~ dot_task_filter if dot_task_filter -%>
<% next unless task.node.name =~ dot_node_filter if dot_node_filter and task.node -%>
"<%= task %>" [label = "<%= task %>", fillcolor = "<%= task.color %>"];
<% end -%>
<% each_task do |task| -%>
<% task.each_forward_dependency do |forward_task| -%>
<% next unless task.name =~ dot_task_filter if dot_task_filter -%>
<% next unless task.node.name =~ dot_node_filter if dot_node_filter and task.node -%>
<% next unless forward_task.name =~ dot_task_filter if dot_task_filter -%>
<% next unless forward_task.node.name =~ dot_node_filter if dot_node_filter and forward_task.node -%>
"<%= task %>" -> "<%= forward_task %>";
<% end -%>
<% end -%>
}
eos
ERB.new(template, nil, '-').result(binding)
end
# Plot the graph using the 'dot' binary.
# Uses an auto-generated file name in the current folder unless a file
# is provided, and an incrementing number unless a suffix is provided.
# @param [Hash] options
# @return [true, false] Successful?
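# @example A hedged sketch of the supported options (the file name is an
#   illustrative assumption):
#   cluster.make_image                        # auto-named SVG in the current folder
#   cluster.make_image(:file => 'plan.svg')
#   cluster.make_image(:type => 'png', :suffix => 'final')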
def make_image(options={})
file = options.fetch :file, nil
suffix = options.fetch :suffix, nil
type = options.fetch :type, 'svg'
unless file
unless suffix
suffix = dot_plot_number
self.dot_plot_number += 1
end
if suffix.is_a? Integer
suffix = suffix.to_s.rjust 5, '0'
end
graph_name = uid || 'graph'
file = "#{graph_name}-#{suffix}.#{type}"
end
info "Writing the graph image: '#{suffix}' to the file: '#{file}'"
command = ['dot', '-T', type, '-o', file]
Open3.popen2e(*command) do |stdin, out, process|
stdin.puts to_dot
stdin.close
output = out.read
debug output unless output.empty?
process.value.exitstatus == 0
end
end
# Get the array of this cluster's node names.
# They can be used for reference.
# @return [Array<String>]
def node_names
map do |node|
node.name
end.sort
end
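# Register a block that is evaluated during the run to decide whether
# the deployment should be gracefully stopped.
# @example A hedged sketch (external_stop_requested? is a hypothetical helper):
#   cluster.stop_condition { external_stop_requested? }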
def stop_condition(&block)
self.gracefully_stop_mark = block
end
# Internal hook: after each node poll, apply the graceful stop logic
# and the fault tolerance validation to the polled node.
def hook_internal_post_node_poll(*args)
gracefully_stop(args[0])
validate_fault_tolerance(args[0])
end
# Internal hook: before the run starts, validate fault tolerance
# for nodes that have already failed.
def hook_internal_pre_run(*args)
return unless has_failed_nodes?
failed_nodes.each { |node| validate_fault_tolerance(node) }
end
# Check if the deployment process should stop
# @return [true, false]
def gracefully_stop?
return true if @emergency_brake
if gracefully_stop_mark && gracefully_stop_mark.call
info "Stop deployment by stop condition (external reason)"
@emergency_brake = true
end
@emergency_brake
end
# Skip a node that is still ready when the deployment
# is being gracefully stopped.
def gracefully_stop(node)
if gracefully_stop? && node.ready?
node.set_status_skipped
hook 'post_gracefully_stop', node
end
end
# Request a graceful stop of the deployment for an internal reason.
def gracefully_stop!
return if @emergency_brake
info "Stop deployment by internal reason"
@emergency_brake = true
end
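# Set the fault tolerance groups. Groups without node ids are dropped and
# each kept group gets an empty list of failed node ids.
# @example A hedged sketch of the expected group structure (the name and
#   node ids are illustrative assumptions):
#   cluster.fault_tolerance_groups = [
#     {
#       'name' => 'controllers',
#       'node_ids' => ['1', '2', '3'],
#       'fault_tolerance' => 1
#     }
#   ]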
def fault_tolerance_groups=(groups=[])
@fault_tolerance_groups = groups.select { |group| group['node_ids'].present? }
@fault_tolerance_groups.each { |group| group['failed_node_ids'] = [] }
debug "Setup fault tolerance groups: #{@fault_tolerance_groups}"
end
# Check a failed node against the fault tolerance groups and request
# a graceful stop when a group's tolerance is exceeded.
def validate_fault_tolerance(node)
return if gracefully_stop?
if node.failed?
count_tolerance_fail(node)
gracefully_stop! if fault_tolerance_excess?
end
end
# Register a failed node in every fault tolerance group it belongs to.
def count_tolerance_fail(node)
fault_tolerance_groups.select do |g|
g['node_ids'].include?(node.name)
end.each do |group|
debug "Count failed node #{node.name} for group #{group['name']}"
group['fault_tolerance'] -= 1
group['node_ids'].delete(node.name)
group['failed_node_ids'] << node.name
end
end
# Check if any fault tolerance group has exceeded its allowed
# number of failed nodes.
# @return [true, false]
def fault_tolerance_excess?
failed_groups = fault_tolerance_groups.select { |group| group['fault_tolerance'] < 0 }
return false if failed_groups.empty?
warn "Fault tolerance exceeded the stop conditions #{failed_groups}"
true
end
# @return [String]
def to_s
"Cluster[#{uid}]"
end
# @return [String]
def inspect
message = "#{self}"
message += "{Tasks: #{tasks_finished_count}/#{tasks_total_count} Nodes: #{node_names.join ', '}}" if nodes.any?
message
end
end
end