#!/usr/local/ruby-current/bin/ruby
# Copyright:: Copyright (c) 2014 eGlobalTech, Inc., all rights reserved
#
# Licensed under the BSD-3 license (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License in the root of the project or at
#
# http://egt-labs.com/mu/LICENSE.html
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Admin tool for operating on deployed Mu nodes: list them, re-run the groomer
# (Chef), execute ad-hoc shell commands, refresh SSL certs, upgrade Chef, or
# rewrite AWS userdata/launch configurations. Node/deploy selection comes from
# ARGV plus the -a/-d/-p/-e flags; actions fork one child per node, throttled
# by --concurrent.

require 'optimist'
require 'json'
require File.realpath(File.expand_path(File.dirname(__FILE__)+"/mu-load-config.rb"))
require 'mu'

$opts = Optimist::options do
  banner <<-EOS
Usage:
#{$0} [-c] [-w] [-l] [-d] [-a] [-e ] [-p ] [-m ] [-o ] [-x ] [ deploy_id|node_name [ ... ] ]
  EOS
  opt :concurrent, "Max number of processes to run concurrently when invoking Chef or MommaCat on multiple nodes.", :require => false, :default => 10, :type => :integer
  opt :list, "Perform no action, but instead return a list of matching hosts. Default behavior with no other flags.", :require => false, :default => false, :type => :boolean
  opt :deploys, "Operate on matching deploy IDs instead of node names.", :require => false, :default => false, :type => :boolean
  opt :all, "Operate on all nodes/deploys. Use with caution.", :require => false, :default => false, :type => :boolean
  opt :platform, "Operate exclusively on one nodes of a particular operating system. Can be used in conjunction with -a or -d. Valid platforms: linux, windows", :require => false, :type => :string
  opt :environment, "Operate exclusively on one nodes with a particular environment (e.g. dev, prod). Can be used in conjunction with -a or -d.", :require => false, :type => :string
  opt :override_chef_runlist, "An alternate runlist to pass to Chef, in groomeronly mode.", :require => false, :type => :string
  opt :xecute, "Run a shell command on matching nodes. Overrides --mode and suppresses some informational output in favor of scriptability.", :require => false, :type => :string
  opt :mode, "Action to perform on matching nodes. Valid actions: groom, groomeronly, awsmeta, vaults, certs, chefupgrade", :require => false, :default => "groomeronly", :type => :string
  opt :verbose, "Show output from Chef runs, etc", :require => false, :default => false, :type => :boolean
  opt :winrm, "Force WinRM connection. Disable SSH fallback", :require => false, :default => false, :type => :boolean
  opt :info, "List a particular node attribute", :require => false, :default => 'nodename', :type => :string
end
MU.setLogging(MU::Logger::LOUD) if $opts[:verbose]

# Legacy alias: "chefrun" is the old name for groomeronly mode.
$opts[:mode] = "groomeronly" if $opts[:mode] == "chefrun"

if !["groom", "groomeronly", "vaults", "userdata", "awsmeta", "certs", "chefupgrade"].include?($opts[:mode])
  Optimist::die(:mode, "--mode must be one of: groom, groomeronly, awsmeta, vaults, certs, chefupgrade")
end
if $opts[:platform] and !["linux", "windows"].include?($opts[:platform])
  Optimist::die(:platform, "--platform must be one of: linux, windows")
end
# With no action flags at all, default to listing matching hosts.
if !$opts[:xecute] and !$opts[:override_chef_runlist] and !$opts[:mode_given]
  $opts[:list] = true
end
# An explicit runlist override implies the (default) groomeronly mode was
# chosen on purpose, so treat --mode as if it had been given.
if $opts[:override_chef_runlist] and !$opts[:mode_given]
  $opts[:mode_given] = true
end
if ARGV.empty? and !$opts[:all] and !$opts[:platform] and !$opts[:environment] and !$opts[:list]
  Optimist::educate
  exit 1
end

Thread.abort_on_exception = true

# Signal plumbing: the master process forwards signals to its forked children
# (tracked in the global $children pid => nodename map) so Ctrl-C etc. doesn't
# orphan in-flight Chef runs.
master_pid = Process.pid
$children = {}
signals = Signal.list
signals.keys.each { |sig|
  # Ruby 2.3 doesn't want to trap these
  next if ["ILL", "FPE", "KILL", "BUS", "SEGV", "STOP", "VTALRM"].include?(sig)
  Signal.trap(signals[sig]) do
    if Process.pid == master_pid
      $children.each_pair { |pid, node|
        if ["INT", "TERM", "EXIT", "ABRT"].include?(sig)
          Process.kill("KILL", pid) # aka --dammit
        else
          begin
            Process.kill(sig, pid)
          rescue Errno::ESRCH
          end
        end
      }
      if ["INT", "TERM", "EXIT"].include?(sig)
        Process.waitall
      end
    end
  end
}

# Run through our filters so we can pass flat lists into our methods that
# actually do things.
avail_deploys = MU::MommaCat.listDeploys
do_deploys = []
do_nodes = []
# NOTE(review): ok is never set to false anywhere in this file, so the
# "exit 1 if !ok" below can never fire — confirm whether a check went missing.
ok = true
if $opts[:all] or (ARGV.size == 0 and !$opts[:deploys])
  do_deploys = avail_deploys
else
  if $opts[:deploys] and !$opts[:all]
    # Treat each ARGV entry as a case-insensitive substring pattern over
    # deploy IDs.
    ARGV.each { |arg|
      matched = avail_deploys.select { |deploy| deploy.match(/#{Regexp.quote(arg)}/i) }
      if matched.size == 0
        MU.log "Deploy pattern '#{arg}' doesn't appear to match anything", MU::ERR
      end
      do_deploys = do_deploys.concat(matched).uniq
    }
  else
    do_nodes = ARGV
    do_deploys = []
    matched = 0
    if do_nodes.size > 0
      # Just load the deploys we need
      do_nodes.each { |node|
        # Full node names embed their deploy ID (e.g. FOO-DEV-1234567890-XY-...);
        # extract it so we only open the deployments we actually need.
        if node.match(/^(.*?-[^\-]+?-\d{10}-[A-Z]{2})-.*/)
          matched += 1
          do_deploys << node.sub(/^(.*?-[^\-]+?-\d{10}-[A-Z]{2})-.*/, '\1')
        end
      }
      do_deploys.uniq!
    end
    # Fall back to scanning every deploy when the patterns weren't full names.
    if do_deploys.size == 0 and do_nodes.size > 0 and (matched > 0 or ARGV.size > 0)
      do_deploys = avail_deploys
    end
  end
end

# Expand the deploy list into concrete node names, applying the -p/-e filters.
avail_nodes = []
@avail_node_attributes = []
do_deploys.each { |muid|
  mommacat = MU::MommaCat.new(muid, skip_resource_objects: true)
  mommacat.listNodes.each_pair { |nodename, server|
    next if server.nil? or server['conf'].nil?
    # NOTE(review): id is assigned but never used in this loop — leftover?
    id = server['instance_id']
    # Normalize assorted distro identifiers down to "linux"/"windows".
    server['conf']["platform"] = "linux" if !server['conf'].has_key?("platform") or %w{centos centos6 centos7 ubuntu ubuntu14 rhel rhel7 rhel71 linux amazon}.include?(server['conf']["platform"])
    server['conf']["platform"] = "windows" if %w{win2k12r2 win2k12 win2k8 win2k8r2}.include?(server['conf']["platform"])
    next if !$opts[:platform].nil? and server['conf']["platform"] != $opts[:platform]
    # NOTE(review): this compares the *global* MU.environment, not anything
    # from this particular server's record — verify -e filters per-node.
    next if !$opts[:environment].nil? and MU.environment.upcase != $opts[:environment].upcase
    avail_nodes << nodename
    @avail_node_attributes << server
  }
}
if do_nodes.size > 0
  # Keep only the available nodes matching at least one ARGV pattern.
  matching = avail_nodes.select { |node|
    matched = false
    do_nodes.each { |pattern|
      if node.match(/#{Regexp.quote(pattern)}/i)
        matched = true
        break
      end
    }
    matched
  }
  do_nodes = matching
else
  do_nodes = avail_nodes
end
do_nodes.sort!
#do_nodes.sort!{ |x,y| (x[$opts[:info]] <=> y[$opts[:info]]) }
#puts node_attributes[0]
if $opts[:list]
  if $opts[:info].eql? 'nodename'
    puts do_nodes
  else
    # Print "nodename: <attribute>" for whichever attribute --info named.
    do_nodes.each do |node|
      @avail_node_attributes.each do |attr|
        if attr['nodename'].eql? node
          puts "#{attr['nodename']}: #{attr[$opts[:info]]}"
        end
      end
    end
  end
  exit
end
exit 1 if !ok

# Re-run MommaCat grooming (or just the groomer, or just vault grants) on the
# given nodes, forking one child per node and throttling to --concurrent.
# @param deploys [Array<String>] deploy IDs to walk
# @param nodes [Array<String>] node mu_names to operate on; empty means all
# @param vaults_only [Boolean] only re-grant Chef vault access, don't groom
# @param groomeronly [Boolean] invoke the node's groomer directly instead of a
#   full MommaCat groomNode pass
def reGroom(deploys = MU::MommaCat.listDeploys, nodes = [], vaults_only: false, groomeronly: false)
  badnodes = []
  count = 0
  deploys.each { |muid|
    mommacat = MU::MommaCat.new(muid)
    next if mommacat.kittens.nil? or mommacat.kittens['servers'].nil?
    mommacat.kittens['servers'].each_pair { |habitat, nodeclasses|
      nodeclasses.each_pair { |nodeclass, servers|
        servers.each_pair { |mu_name, server|
          next if nodes.size > 0 and !nodes.include?(mu_name)
          count = count + 1
          child = Process.fork {
            begin
              # A "basis" key marks an autoscale pool member rather than a
              # standalone server.
              type = "server"
              type = "server_pool" if server.config.has_key?("basis")
              if vaults_only
                next if !server.config.has_key?("vault_access")
                server.config["vault_access"].each { |v|
                  MU::Groomer::Chef.grantSecretAccess(mu_name, v['vault'], v['item'])
                }
              elsif groomeronly
                server.groomer.run
              else
                mommacat.groomNode(server.cloud_id, nodeclass, type, mu_name: mu_name)
              end
            # NOTE(review): rescue Exception also traps SystemExit/SignalException
            # inside the child — confirm catching StandardError wouldn't suffice.
            rescue Exception => e
              MU.log e.inspect, MU::ERR, details: e.backtrace
              exit 1
            end
          }
          $children[child] = mu_name
        }
        # Throttle: block until we're back under the concurrency ceiling,
        # recording any children that exited non-zero.
        while $children.size >= $opts[:concurrent]-1
          child = Process.wait
          if !$?.success?
            badnodes << $children[child]
          end
          $children.delete(child)
        end
      }
    }
  }
  # Reap whatever's still running and collect stragglers' failures.
  Process.waitall.each { |child|
    if !child[1].success?
      badnodes << $children[child[0]]
    end
  }
  if badnodes.size > 0
    MU.log "Not all Momma Cat runs exited cleanly", MU::WARN, details: badnodes
  end
end

# Run an arbitrary shell command on the given nodes, via WinRM for Windows
# (with SSH fallback unless --winrm) and plain ssh otherwise. One forked child
# per node, throttled to --concurrent; children retry on a known transient
# Windows/Chef error.
# @param deploys [Array<String>] deploy IDs to walk
# @param nodes [Array<String>] node names to operate on; empty means all
# @param cmd [String] the shell command to execute remotely
# @param print_output [Boolean] stream remote output to stdout
# @param noop [Boolean] NOTE(review): accepted but never referenced in the body
def runCommand(deploys = MU::MommaCat.listDeploys, nodes = [], cmd = nil, print_output: $opts[:verbose], noop: false)
  badnodes = []
  count = 0
  deploys.each { |muid|
    mommacat = MU::MommaCat.new(muid)
    mommacat.listNodes.each_pair { |nodename, server|
      next if server['conf'].nil?
      server['conf']["platform"] = "linux" if !server['conf'].has_key?("platform")
      next if nodes.size > 0 and !nodes.include?(nodename)
      count = count + 1
      child = Process.fork {
        done = false
        # begin/end until done: retry loop for the transient Windows error below.
        begin
          serverobj = mommacat.findLitterMate(type: "server", mu_name: nodename)
          if !serverobj
            MU.log "Failed to load server object for #{nodename}", MU::ERR
            next
          end
          MU.log "Running '#{cmd}' on #{nodename} (##{count})" if !print_output
          # Set Variables to catch the output and exit code of the execution
          output = nil
          exitcode = -1
          # Determine which protocols to attempt
          if serverobj.windows?
            attempt_winrm = true
            if $opts[:winrm]
              attempt_ssh = false
            else
              attempt_ssh = true
            end
          else
            attempt_winrm = false
            attempt_ssh = true
          end

          # Attempt WinRM Connection, and Fall back to SSH
          if attempt_winrm
            exec_protocol = 'WinRM'
            # Attempt to make a connection and exec the command
            resp = nil
            begin
              shell = serverobj.getWinRMSession(0, timeout: 10, winrm_retries: 1)
              resp = shell.run(cmd)
            # Deliberate best-effort: a failed WinRM session just leaves resp
            # nil so we fall through to SSH (when allowed).
            rescue MU::MuError => e
            end
            if resp
              # WINRM CONNECTION AND EXECUTION SUCCESS
              output = resp.stdout if resp.stdout
              exitcode = resp.exitcode if resp.exitcode
              if exitcode.eql? 0
                attempt_ssh = false
              else
                puts resp.stderr if resp.stderr
                puts output
              end
            end
            if exitcode != 0
              if attempt_ssh
                MU.log "#{nodename} WinRM exec failed, trying SSH", MU::NOTICE
              else
                MU.log "#{nodename} WinRM exec failed, NOT trying SSH", MU::WARN
              end
            end
          end

          if attempt_ssh
            exec_protocol = 'SSH'
            # this should use getSSHSession, for the sake of symmetry
            output = `ssh -q #{nodename} "#{cmd}" 2>&1 < /dev/null`
            exitcode = $?.exitstatus
          end

          if exitcode != 0
            if output
              # Known-transient Windows/Chef failure: leave done false so the
              # enclosing begin/until loop reruns the command.
              if serverobj.windows? and output.match(/NoMethodError: unknown property or method: `ConnectServer'/)
                MU.log "#{nodename} encountered transient Windows/Chef ConnectServer error, retrying", MU::WARN
              elsif print_output
                done = true
                puts "#{nodename} - #{output}" if output.match(/[^\s]/)
                MU.log "#{nodename} did not exit cleanly", MU::WARN
              else
                done = true
                MU.log "#{nodename} did not exit cleanly", MU::WARN, details: output.slice(-2000, 2000)
              end
            else
              done = true
              MU.log "#{nodename} did not exit cleanly", MU::WARN
            end
            # Child's exit status is how the parent detects failure (see wait
            # loops below).
            exit exitcode if done
          else
            MU.log "#{nodename} complete via #{exec_protocol}"
            done = true
          end
        end until done
        puts "#{nodename} - #{output}" if print_output and output.match(/[^\s]/)
      }
      $children[child] = nodename
      # Throttle to the concurrency ceiling, collecting failed nodes.
      while $children.size >= $opts[:concurrent] - 1
        child = Process.wait
        if !$?.success?
          badnodes << $children[child]
        end
        $children.delete(child)
      end
    }
  }
  Process.waitall.each { |child|
    if !child[1].success?
      badnodes << $children[child[0]]
    end
  }
  if badnodes.size > 0
    # In groomeronly mode the "command" we report on is really Chef.
    cmd = "Chef" if $opts[:mode] == "groomeronly"
    if !print_output
      MU.log "Not all `#{cmd}` runs exited cleanly", MU::WARN, details: badnodes
    else
      MU.log "Not all `#{cmd}` runs exited cleanly", MU::WARN
    end
  end
end

# Regenerate AWS userdata for matching nodes. For autoscale pools this swaps
# the group onto a temporary Launch Configuration, deletes and recreates the
# real one with fresh userdata, then swaps back. For live standalone instances
# it rewrites the userData attribute directly — which AWS only permits while
# the instance is stopped.
# @param deploys [Array<String>] deploy IDs to walk
# @param nodes [Array<String>] node names to operate on; empty means all
def updateAWSMetaData(deploys = MU::MommaCat.listDeploys, nodes = [])
  deploys.each { |muid|
    mommacat = MU::MommaCat.new(muid)
    if mommacat.original_config.nil?
      MU.log "Failed to locate original config data for #{muid}", MU::WARN
      next
    end

    # Clean up the userdata of matching Autoscale groups by replacing their
    # Launch Configurations with new ones,
    if mommacat.original_config.has_key?("server_pools")
      mommacat.original_config['server_pools'].each { |server|
        svr_class = server['name']
        server["platform"] = "linux" if !server.has_key?("platform")
        pool_name = mommacat.getResourceName(svr_class)
        if nodes.size > 0
          # Pool member names are the pool name plus a 3-char suffix.
          matched = false
          nodes.each { |n|
            if n.match(/^#{Regexp.quote(pool_name)}-[a-z0-9]{3}$/i)
              matched = true
            end
          }
          next if !matched
        end
#        MU::Cloud::AWS::Server.createIAMProfile(pool_name, base_profile: server['iam_role'], extra_policies: server['iam_policies'])
        pool_obj = mommacat.findLitterMate(type: "server_pool", mu_name: pool_name)
        pool_obj.groom
        resp = MU::Cloud::AWS.autoscale.describe_auto_scaling_groups(
          auto_scaling_group_names: [pool_name]
        )
        if resp.nil?
          MU.log "Failed to locate any Autoscale Groups named #{pool_name}", MU::WARN
          next
        end
        resp.auto_scaling_groups.each { |asg|
          launch = MU::Cloud::AWS.autoscale.describe_launch_configurations(
            launch_configuration_names: [asg.launch_configuration_name]
          ).launch_configurations.first
          olduserdata = Base64.decode64(launch.user_data)
          userdata = MU::Cloud::AWS::Server.fetchUserdata(
            platform: server["platform"],
            template_variables: {
              "deployKey" => Base64.urlsafe_encode64(mommacat.public_key),
              "deploySSHKey" => mommacat.ssh_public_key,
              "muID" => muid,
              "muUser" => MU.chef_user,
              "mommaCatPort" => MU.mommaCatPort,
              "publicIP" => MU.mu_public_ip,
              "resourceName" => svr_class,
              "windowsAdminName" => server['windows_admin_username'],
              "skipApplyUpdates" => server['skipinitialupdates'],
              "resourceType" => "server_pool"
            },
            custom_append: server['userdata_script']
          )

          # Figure out which devices are embedded in the AMI already.
          image = MU::Cloud::AWS.ec2.describe_images(image_ids: [server["basis"]["launch_config"]["ami_id"]]).images.first
          if image.nil?
            MU.log "#{server["basis"]["launch_config"]["ami_id"]} does not exist, skipping launch config #{asg.launch_configuration_name}", MU::ERR
            next
          end
          ext_disks = {}
          if !image.block_device_mappings.nil?
            image.block_device_mappings.each { |disk|
              if !disk.device_name.nil? and !disk.device_name.empty? and !disk.ebs.nil? and !disk.ebs.empty?
                ext_disks[disk.device_name] = MU.structToHash(disk.ebs)
                # Snapshot-backed devices carry their own encryption flag;
                # passing :encrypted alongside a snapshot_id is rejected by AWS.
                if ext_disks[disk.device_name].has_key?(:snapshot_id)
                  ext_disks[disk.device_name].delete(:encrypted)
                end
              end
            }
          end

          storage = []
          if !server["basis"]["launch_config"]["storage"].nil?
            server["basis"]["launch_config"]["storage"].each { |vol|
              if ext_disks.has_key?(vol["device"])
                if ext_disks[vol["device"]].has_key?(:snapshot_id)
                  vol.delete("encrypted")
                end
              end
              mapping, cfm_mapping = MU::Cloud::AWS::Server.convertBlockDeviceMapping(vol)
              storage << mapping
            }
          end
          storage.concat(MU::Cloud::AWS::Server.ephemeral_mappings)

          # need_update stays nil (falsy) when nothing differs, so the "next"
          # below skips unchanged launch configs.
          if userdata != olduserdata or
             launch.image_id != server["basis"]["launch_config"]["ami_id"] or
             launch.ebs_optimized != server["basis"]["launch_config"]["ebs_optimized"] or
             launch.instance_type != server["basis"]["launch_config"]["size"] or
             launch.instance_monitoring.enabled != server["basis"]["launch_config"]["monitoring"]
#             launch.block_device_mappings != storage
# XXX block device comparison isn't this simple
            need_update = true
          end
          next if !need_update

          # Put our Autoscale group onto a temporary launch config
          begin
            MU::Cloud::AWS.autoscale.create_launch_configuration(
              launch_configuration_name: pool_name+"-TMP",
              user_data: Base64.encode64(userdata),
              image_id: server["basis"]["launch_config"]["ami_id"],
              key_name: launch.key_name,
              security_groups: launch.security_groups,
              instance_type: server["basis"]["launch_config"]["size"],
              block_device_mappings: storage,
              instance_monitoring: {:enabled => server["basis"]["launch_config"]["monitoring"]},
              iam_instance_profile: launch.iam_instance_profile,
              ebs_optimized: server["basis"]["launch_config"]["ebs_optimized"],
              associate_public_ip_address: launch.associate_public_ip_address
            )
          rescue ::Aws::AutoScaling::Errors::ValidationError => e
            if e.message.match(/Member must have length less than or equal to (\d+)/)
              MU.log "Userdata script too long updating #{pool_name} Launch Config (#{Base64.encode64(userdata).size.to_s}/#{Regexp.last_match[1]} bytes)", MU::ERR
            else
              MU.log "Error updating #{pool_name} Launch Config", MU::ERR, details: e.message
            end
            next
          end

          MU::Cloud::AWS.autoscale.update_auto_scaling_group(
            auto_scaling_group_name: pool_name,
            launch_configuration_name: pool_name+"-TMP"
          )
          # ...now back to an identical one with the "real" name
          MU::Cloud::AWS.autoscale.delete_launch_configuration(
            launch_configuration_name: pool_name
          )
          MU::Cloud::AWS.autoscale.create_launch_configuration(
            launch_configuration_name: pool_name,
            user_data: Base64.encode64(userdata),
            image_id: server["basis"]["launch_config"]["ami_id"],
            key_name: launch.key_name,
            security_groups: launch.security_groups,
            instance_type: server["basis"]["launch_config"]["size"],
            block_device_mappings: storage,
            instance_monitoring: {:enabled => server["basis"]["launch_config"]["monitoring"]},
            iam_instance_profile: launch.iam_instance_profile,
            ebs_optimized: server["basis"]["launch_config"]["ebs_optimized"],
            associate_public_ip_address: launch.associate_public_ip_address
          )
          MU::Cloud::AWS.autoscale.update_auto_scaling_group(
            auto_scaling_group_name: pool_name,
            launch_configuration_name: pool_name
          )
          MU::Cloud::AWS.autoscale.delete_launch_configuration(
            launch_configuration_name: pool_name+"-TMP"
          )
          MU.log "Launch Configuration #{asg.launch_configuration_name} replaced"
        }
      }
    end

    # Update the userdata of live nodes. They must be in the Stopped state for
    # us to do so.
    mommacat.listNodes.each_pair { |nodename, server|
      if server['conf'].nil?
        MU.log "Failed to find config data for server #{nodename}", MU::WARN
        next
      end
      id = server['cloud_id']
      id = server['instance_id'] if id.nil?
      desc = MU::Cloud::AWS.ec2(region: server['region']).describe_instances(instance_ids: [id]).reservations.first.instances.first
      server['conf']["platform"] = "linux" if !server['conf'].has_key?("platform")
      next if nodes.size > 0 and !nodes.include?(nodename)
      mytype = "server"
      if server['conf'].has_key?("basis") or server['conf']['#TYPENAME'] == "ServerPool" or server['conf']["#MU_CLASS"] == "MU::Cloud::AWS::ServerPool"
        mytype = "server_pool"
      else
        server_obj = mommacat.findLitterMate(type: "server", mu_name: nodename)
        server_obj.groom
      end
      olduserdata = Base64.decode64(MU::Cloud::AWS.ec2(region: server['region']).describe_instance_attribute(
        instance_id: id,
        attribute: "userData"
      ).user_data.value)
      userdata = MU::Cloud::AWS::Server.fetchUserdata(
        platform: server['conf']["platform"],
        template_variables: {
          "deployKey" => Base64.urlsafe_encode64(mommacat.public_key),
          "deploySSHKey" => mommacat.ssh_public_key,
          "muID" => muid,
          "muUser" => MU.chef_user,
          "publicIP" => MU.mu_public_ip,
          "resourceName" => server['conf']['name'],
          "windowsAdminName" => server['conf']['windows_admin_username'],
          "skipApplyUpdates" => server['conf']['skipinitialupdates'],
          "resourceType" => mytype
        },
        # NOTE(review): the pool branch above reads userdata_script from the
        # config hash, but here the config lives under server['conf'] — verify
        # this shouldn't be server['conf']['userdata_script'].
        custom_append: server['userdata_script']
      )
      if userdata == olduserdata
        MU.log "#{nodename} has up-to-date userdata, skipping", MU::DEBUG
        next
      end
      if desc.state.name != "stopped"
        MU.log "#{nodename} needs a userdata update, but is not in Stopped state", MU::NOTICE
        if mytype == "server_pool"
          pool_name = mommacat.getResourceName(server['conf']['name'])
          MU.log "Note: Be sure to pause Autoscaling for this group before stopping this instance, e.g. with: aws autoscaling suspend-processes --auto-scaling-group-name #{pool_name}", MU::WARN
        end
        next
      end
      MU.log "Updating #{nodename} userdata (#{server["conf"]["platform"]})"
      begin
        MU::Cloud::AWS.ec2(region: server['region']).modify_instance_attribute(
          instance_id: id,
          attribute: "userData",
          value: Base64.encode64(userdata)
        )
      rescue ::Aws::EC2::Errors::InvalidParameterValue => e
        if e.message.match(/User data is limited to (\d+)/)
          MU.log "Userdata script too long updating #{nodename} (#{userdata.size.to_s}/#{Regexp.last_match[1]} bytes)", MU::ERR
        else
          MU.log "Error replacing userData on #{nodename}", MU::ERR, details: e.message
        end
      end
    }
  }
end

# Re-issue deploy SSL certificates for matching nodes via MommaCat.
# @param deploys [Array<String>] deploy IDs to walk
# @param nodes [Array<String>] node names to operate on; empty means all
# @param vaults_only [Boolean] NOTE(review): accepted but never used here
# NOTE(review): badnodes and count are initialized but never used in this method.
def sslCerts(deploys = MU::MommaCat.listDeploys, nodes = [], vaults_only: false)
  badnodes = []
  count = 0
  deploys.each { |muid|
    mommacat = MU::MommaCat.new(muid)
    mommacat.listNodes.each_pair { |nodename, server|
      next if server['conf'].nil?
      server['conf']["platform"] = "linux" if !server['conf'].has_key?("platform")
      next if nodes.size > 0 and !nodes.include?(nodename)
      if server['conf'].nil?
        MU.log "Failed to find config data for server #{nodename}", MU::WARN
        next
      end
      server_obj = mommacat.findLitterMate(type: "server", mu_name: nodename)
      mommacat.nodeSSLCerts(server_obj)
    }
  }
end

# Reinstall the groomer (Chef client) on matching nodes, one forked child per
# node, throttled to --concurrent.
# @param deploys [Array<String>] deploy IDs to walk
# @param nodes [Array<String>] node names to operate on; empty means all
def chefUpgrade(deploys = MU::MommaCat.listDeploys, nodes = [])
  badnodes = []
  deploys.each { |muid|
    mommacat = MU::MommaCat.new(muid)
    mommacat.listNodes.each_pair { |nodename, server|
      next if server['conf'].nil?
      server['conf']["platform"] = "linux" if !server['conf'].has_key?("platform")
      next if nodes.size > 0 and !nodes.include?(nodename)
      if server['conf'].nil?
        MU.log "Failed to find config data for server #{nodename}", MU::WARN
        next
      end
      child = Process.fork {
        server_obj = mommacat.findLitterMate(type: "server", mu_name: nodename)
        begin
          server_obj.groomer.reinstall
        # NOTE(review): bare rescue Exception silently swallows *all* failures,
        # including SystemExit — confirm this best-effort behavior is intended.
        rescue Exception
        end
      }
      $children[child] = nodename
      # Throttle to the concurrency ceiling, collecting failed nodes.
      while $children.size >= $opts[:concurrent]-1
        child = Process.wait
        if !$?.success?
          badnodes << $children[child]
        end
        $children.delete(child)
      end
    }
  }
  Process.waitall.each { |child|
    if !child[1].success?
      badnodes << $children[child[0]]
    end
  }
  if badnodes.size > 0
    MU.log "Not all Chef upgrades exited cleanly", MU::WARN, details: badnodes
  end
end

# Dispatch: --xecute trumps --mode; otherwise route on the validated mode.
if $opts[:xecute]
  runCommand(do_deploys, do_nodes, $opts[:xecute], print_output: true)
elsif $opts[:mode] == "certs"
  sslCerts(do_deploys, do_nodes)
elsif $opts[:mode] == "groom"
  reGroom(do_deploys, do_nodes)
elsif $opts[:mode] == "vaults"
  reGroom(do_deploys, do_nodes, vaults_only: true)
elsif $opts[:mode] == "chefupgrade"
  chefUpgrade(do_deploys, do_nodes)
elsif $opts[:mode] == "groomeronly"
  print_output = $opts[:verbose] || do_nodes.size == 1
  # NOTE(review): the --override_chef_runlist branch is entirely commented out,
  # so supplying that flag currently does nothing — confirm this is known.
  if $opts[:override_chef_runlist]
#    runCommand(do_deploys, do_nodes, chef_runlist: $opts[:override_chef_runlist], groomeronly: true, print_output: print_output)
  else
#    runCommand(do_deploys, do_nodes, groomeronly: true, print_output: print_output)
    reGroom(do_deploys, do_nodes, groomeronly: true)
  end
elsif $opts[:mode] == "userdata" or $opts[:mode] == "awsmeta" # Need Google equiv and to select nodes correctly based on what cloud they're in
  updateAWSMetaData(do_deploys, do_nodes)
end