require 'restclient' require 'yaml' require 'json' require 'ipaddress' require 'uri' module Cute module G5K # = {Cute::G5K} exceptions # # The generated exceptions are divided in 5 groups: # # - {Cute::G5K::BadRequest BadRequest} it means that the syntax you passed to some {Cute::G5K::API G5K::API} method is not correct from # the Grid'5000 services point of view. # - {Cute::G5K::RequestFailed RequestFailed} it means that there is a server problem or there is nothing the user can do to solve the problem. # - {Cute::G5K::NotFound} it means that the requested resources do not exist. # - {Cute::G5K::Unauthorized} it means that there is an authentication problem. # - {Cute::G5K::EventTimeout} this exception is triggered by the methods that wait for events such as: # job submission and environment deployment. class Error < Exception attr_accessor :orig # Original exception def initialize(message = nil, object = nil) super(message) self.orig = object end def method_missing(method) return orig.send(method) end end # It wraps the http response 400 that corresponds to a bad request. # When using the {Cute::G5K::API#reserve reserve} or {Cute::G5K::API#reserve deploy} methods this could mean: # a bad syntax in the request, not valid properties in the request, # not enough resources to supply the request, non existing environment, etc. # # = Example # # You can handle this exception and decide what to do with your experiment. # In the example below, we iterate over all sites until a site has resources with the property 'ib20g' set to 'YES'. 
#
#     require 'cute'
#
#     g5k = Cute::G5K::API.new()
#
#     sites = g5k.site_uids
#
#     sites.each do |site|
#
#       begin
#         job = g5k.reserve(:site => site, :resources => "{ib20g='YES'}/nodes=2/core=1",:walltime => '00:30:00', :keys => "~/my_ssh_jobkey" )
#         break
#       rescue Cute::G5K::BadRequest
#         puts "Resource not available in this site, trying with another one"
#       end
#
#     end
class BadRequest < Error
end

# It wraps all Restclient exceptions with http codes: 403, 405,406, 412, 415, 500, 502, 503 and 504.
class RequestFailed < Error
end

# It wraps the exceptions generated by Timeout::Error
class EventTimeout < Error
end

# It wraps the Restclient exception 404
class NotFound < Error
end

# It wraps the Restclient exception RestClient::Unauthorized
class Unauthorized < Error
end

# Array type used when parsing the JSON lists returned by the API.
# @api private
class G5KArray < Array

  # @return [Array] the 'uid' field of every element
  def uids
    map { |item| item['uid'] }
  end

  # @return the 'href' of the link whose relation is 'self'
  def rel_self
    rel('self')
  end

  # @param r [String] name of the link relation to look for
  # @return the 'href' of the first link whose 'rel' equals r
  #
  # NOTE(review): this indexes the Array itself with the String 'links'; Array#[] raises
  # TypeError for a String index, so this method (and rel_self) presumably never worked
  # on a plain list -- confirm the intended behavior before relying on it.
  def rel(r)
    self['links'].detect { |x| x['rel'] == r }['href']
  end

end

# Provides an abstraction for handling G5K responses.
# @api private
# @see https://api.grid5000.fr/doc/3.0/reference/grid5000-media-types.html
# When this structure is used to describe jobs, it is expected to have the
# following fields which depend on the version of the API.
#      {"uid"=>604692,
#       "user_uid"=>"name",
#       "user"=>"name",
#       "walltime"=>3600,
#       "queue"=>"default",
#       "state"=>"running",
#       "project"=>"default",
#       "name"=>"rubyCute job",
#       "types"=>["deploy"],
#       "mode"=>"PASSIVE",
#       "command"=>"./oarapi.subscript.ZzvnM",
#       "submitted_at"=>1423575384,
#       "scheduled_at"=>1423575386,
#       "started_at"=>1423575386,
#       "message"=>"FIFO scheduling OK",
#       "properties"=>"(deploy = 'YES') AND maintenance = 'NO'",
#       "directory"=>"/home/name",
#       "events"=>[],
#       "links"=>[{"rel"=>"self", "href"=>"/sid/sites/nancy/jobs/604692", "type"=>"application/vnd.grid5000.item+json"},
#                 {"rel"=>"parent", "href"=>"/sid/sites/nancy", "type"=>"application/vnd.grid5000.item+json"}],
#       "resources_by_type"=>
#        {"cores"=>
#          ["griffon-8.nancy.grid5000.fr",
#           "griffon-8.nancy.grid5000.fr",
#           "griffon-8.nancy.grid5000.fr",
#           "griffon-8.nancy.grid5000.fr",
#           "griffon-9.nancy.grid5000.fr",
#           "griffon-9.nancy.grid5000.fr",
#           "griffon-9.nancy.grid5000.fr",
#           "griffon-9.nancy.grid5000.fr",
#           "griffon-77.nancy.grid5000.fr",
#           "griffon-77.nancy.grid5000.fr",
#           "griffon-77.nancy.grid5000.fr",
#           "griffon-77.nancy.grid5000.fr"],
#         "vlans"=>["5"]},
#       "assigned_nodes"=>["griffon-8.nancy.grid5000.fr", "griffon-9.nancy.grid5000.fr", "griffon-77.nancy.grid5000.fr"],
#       "deploy"=>
#        {"created_at"=>1423575401,
#         "environment"=>"http://public.sophia.grid5000.fr/~nniclausse/openmx.dsc",
#         "key"=>"https://api.grid5000.fr/sid/sites/nancy/files/cruizsanabria-key-84f3f1dbb1279bc1bddcd618e26c960307d653c5",
#         "nodes"=>["griffon-8.nancy.grid5000.fr", "griffon-9.nancy.grid5000.fr", "griffon-77.nancy.grid5000.fr"],
#         "site_uid"=>"nancy",
#         "status"=>"processing",
#         "uid"=>"D-751096de-0c33-461a-9d27-56be1b2dd980",
#         "updated_at"=>1423575401,
#         "user_uid"=>"cruizsanabria",
#         "vlan"=>5,
#         "links"=>
#          [{"rel"=>"self", "href"=>"/sid/sites/nancy/deployments/D-751096de-0c33-461a-9d27-56be1b2dd980", "type"=>"application/vnd.grid5000.item+json"},
#           ...]}}
class G5KJSON < Hash

  # @return the value stored under 'items'
  def items
    self['items']
  end

  # @return the value stored under 'nodes'
  def nodes
    self['nodes']
  end

  # @return [Hash] the value stored under 'resources_by_type', or an empty Hash when it is absent
  def resources
    by_type = self['resources_by_type']
    by_type.nil? ? Hash.new : by_type
  end

  # @param r [String] name of the link relation to look for
  # @return the 'href' of the first entry of 'links' whose 'rel' equals r
  def rel(r)
    self['links'].detect { |x| x['rel'] == r }['href']
  end

  # @return the value stored under 'uid'
  def uid
    self['uid']
  end

  # @return the 'href' of the 'self' link
  def rel_self
    rel('self')
  end

  # @return the 'href' of the 'parent' link
  def rel_parent
    rel('parent')
  end

  # Fetches this resource again through its 'self' link.
  # @param g5k an object answering get_json(path), e.g. {G5KRest}
  def refresh(g5k)
    g5k.get_json(rel_self)
  end

  # Parses a JSON document, building {G5KJSON} objects for JSON objects and {G5KArray} for JSON arrays.
  # @param s [String] JSON text
  def self.parse(s)
    JSON.parse(s, :object_class => G5KJSON, :array_class => G5KArray)
  end

end

# Manages the low level operations for communicating with the REST API.
# @api private
class G5KRest

  attr_reader :user

  # Initializes a REST connection
  # @param uri [String] resource identifier which normally is the URL of the Rest API
  # @param api_version [String] version of the API to use ("stable" when nil)
  # @param user [String] user if authentication is needed
  # @param pass [String] password if authentication is needed
  # @param on_error [Symbol] option to deactivate the {Cute::G5K::RequestFailed RequestFailed} exceptions
  def initialize(uri,api_version,user,pass,on_error)
    @user = user
    @pass = pass
    @api_version = api_version.nil? ? "stable" : api_version
    if (user.nil? or pass.nil?)
      @endpoint = uri # Inside Grid'5000
    else
      user_escaped = CGI.escape(user)
      pass_escaped = CGI.escape(pass)
      # Drop whatever scheme prefix is present (or none at all) so the credentials can be
      # injected; the authenticated endpoint is always contacted through https.
      location = uri.sub(%r{\A[a-z][a-z0-9+.\-]*://}i, '')
      @endpoint = "https://#{user_escaped}:#{pass_escaped}@#{location}"
    end

    machine =`uname -ov`.chop
    @user_agent = "ruby-cute/#{VERSION} (#{machine}) Ruby #{RUBY_VERSION}"
    # some versions of restclient do not verify by default SSL certificates , :verify_ssl => true)
    # SSL verify is disabled due to Grid'5000 API certificate problem
    # NOTE(review): with :verify_ssl => false the server certificate is not checked.
    @api = RestClient::Resource.new(@endpoint, :timeout => 30,:verify_ssl => false)
    @on_error = on_error
    test_connection
  end

  # Returns a resource object
  # @param path [String] this complements the URI to address to a specific resource
  def resource(path)
    path = path[1..-1] if path.start_with?('/')
    return @api[path]
  end

  # @return [Hash] the HTTP response, or nil when the request failed and errors are ignored
  # @param path [String] this complements the URI to address to a specific resource
  def get_json(path)
    response = begin
      resource(path).get(:content_type => "application/json", :user_agent => @user_agent)
    rescue => e
      handle_exception(e)
    end
    # handle_exception returns nil when :on_error is :ignore; there is nothing to parse then
    response.nil? ? nil : G5KJSON.parse(response)
  end

  # Creates a resource on the server
  # @param path [String] this complements the URI to address to a specific resource
  # @param json [Hash] contains the characteristics of the resources to be created.
  # @return [Hash] the HTTP response, or nil when the request failed and errors are ignored
  def post_json(path, json)
    response = begin
      resource(path).post(json.to_json,
                          :content_type => "application/json",
                          :accept => "application/json",
                          :user_agent => @user_agent)
    rescue => e
      handle_exception(e)
    end
    # handle_exception returns nil when :on_error is :ignore; there is nothing to parse then
    response.nil? ? nil : G5KJSON.parse(response)
  end

  # Deletes a resource on the server
  # @param path [String] this complements the URI to address to a specific resource
  # @raise [RequestFailed] when the server answers with an internal error
  def delete_json(path)
    begin
      return resource(path).delete()
    rescue RestClient::InternalServerError => e
      raise RequestFailed.new("Service internal error", e)
    end
  end

  # @return the parent link
  def follow_parent(obj)
    get_json(obj.rel_parent)
  end

  private

  # Tests the connection and raises an error in case of a problem
  def test_connection
    begin
      return get_json("/#{@api_version}/")
    rescue Cute::G5K::Unauthorized
      raise "Your Grid'5000 credentials are not recognized"
    end
  end

  # Issues a Cute::G5K exception according to the http status code.
  # Exceptions that do not carry an http code are re-raised untouched.
  # @return [nil] when the error is not a 400, 401 or 404 and :on_error is :ignore
  def handle_exception(e)
    raise e unless e.respond_to? :http_code

    # Handling G5k API errors
    case e.http_code
    when 400
      raise BadRequest.new("Bad request", e)
    when 404
      raise NotFound.new("Resource not found", e)
    when 401
      raise Unauthorized.new("Authentication problem",e)
    else
      return nil if @on_error == :ignore
      raise RequestFailed.new("Grid5000 API internal error", e)
    end
  end

end

# This class helps you to access Grid'5000 REST API.
# Thus, the most common actions such as reservation of nodes and deployment can be easily scripted.
# To simplify the use of the module, it is better to create a file with the following information:
#
#     $ cat > ~/.grid5000_api.yml << EOF
#     uri: https://api.grid5000.fr/
#     username: user
#     password: **********
#     version: sid
#     EOF
#
# The *username* and *password* are not necessary if you are using the module from inside Grid'5000.
# You can take a look at the {Cute::G5K::API#initialize G5K::API constructor} to see more details of
# this configuration.
# # = Getting started # # As already said, the goal of {Cute::G5K::API G5K::API} class is to present a high level abstraction to manage the most common activities # in Grid'5000 such as: the reservation of resources and the deployment of environments. # Consequently, these activities can be easily scripted using Ruby. # The advantage of this is that you can use all Ruby constructs (e.g., loops, conditionals, blocks, iterators, etc) to script your experiments. # In the presence of error, {Cute::G5K::API G5K::API} raises exceptions (see {Cute::G5K::Error G5K exceptions}), # that you can handle to decide the workflow of your experiment # (see {Cute::G5K::API#wait_for_deploy wait_for_deploy} and {Cute::G5K::API#wait_for_deploy wait_for_job}). # In the following example it is shown how {Cute::G5K::API G5K::API} is used. The example represents # the reservation of 3 nodes in Nancy site for 1 hour: # # require 'cute' # # g5k = Cute::G5K::API.new() # # job = g5k.reserve(:nodes => 3, :site => 'nancy', :walltime => '01:00:00') # # puts "Assigned nodes : #{job['assigned_nodes']}" # # If that is all you want to do, you can write that into a file, let's say *example.rb* and execute it using the Ruby interpreter. # # $ ruby example.rb # # The execution will block until you got the reservation. Then, you can interact with the nodes you reserved in the way you used to or # add more code to the previous script for controlling your experiment with Ruby-Cute as shown in this # {http://www.rubydoc.info/github/ruby-cute/ruby-cute/master/file/examples/g5k_exp_virt.rb example}. # We have just used the method {Cute::G5K::API#reserve reserve} that allow us to reserve resources in Grid'5000. # This method can be used to reserve resources in deployment mode and deploy our own software environment on them using # {http://kadeploy3.gforge.inria.fr/ Kadeploy}. For this we use the option *:env* of the {Cute::G5K::API#reserve reserve} method. 
# Therefore, it will first reserve the resources and then deploy the specified environment. # The method {Cute::G5K::API#reserve reserve} will block until the deployment is done. # The following Ruby script illustrates all we have just said. # # require 'cute' # # g5k = Cute::G5K::API.new() # # job = g5k.reserve(:nodes => 1, :site => 'grenoble', :walltime => '00:40:00', :env => 'wheezy-x64-base') # # puts "Assigned nodes : #{job['assigned_nodes']}" # # By default your public ssh key '~/.ssh/id_rsa.pub' will be copied on the deployed machines, # you can specify another path for your keys with the option *:keys*. # In order to deploy your own environment, you have to put the tar file that contains the operating system you want to deploy and # the environment description file, under the public directory of a given site. # *VLANS* are supported by adding the parameter :vlan => type where type can be: *:routed*, *:local*, *:global*. # The following example, reserves 10 nodes in the Lille site, starts the deployment of a custom environment over the nodes # and puts the nodes under a routed VLAN. We used the method {Cute::G5K::API#get_vlan_nodes get_vlan_nodes} to get the # new hostnames assigned to your nodes. # # require 'cute' # # g5k = Cute::G5K::API.new() # # job = g5k.reserve(:site => "lille", :nodes => 10, # :env => 'https://public.lyon.grid5000.fr/~user/debian_custom_img.yaml', # :vlan => :routed, :keys => "~/my_ssh_key") # # # puts "Log in into the nodes using the following hostnames: #{g5k.get_vlan_nodes(job)}" # # If you do not want that the method {Cute::G5K::API#reserve reserve} perform the deployment for you, you have to use the option :type => :deploy. # This can be useful when deploying different environments in your reserved nodes, for example: deploying the environments for a small HPC cluster. # You have to use the method {Cute::G5K::API#deploy deploy} for performing the deploy. 
# This method do not block by default, that is why you have to use the method {Cute::G5K::API#wait_for_deploy wait_for_deploy} in order to block the execution # until the deployment is done. # # require 'cute' # # g5k = Cute::G5K::API.new() # # job = g5k.reserve(:site => "lyon", :nodes => 5, :walltime => "03:00:00", :type => :deploy) # # nodes = job["assigned_nodes"] # # slaves = nodes[1..4] # master = nodes-slaves # # g5k.deploy(job,:nodes => master, :env => 'https://public.lyon.grid5000.fr/~user/debian_master_img.yaml') # g5k.deploy(job,:nodes => slaves, :env => 'https://public.lyon.grid5000.fr/~user/debian_slaves_img.yaml') # # g5k.wait_for_deploy(job) # # puts "master node: #{master}" # puts "slaves nodes: #{slaves}" # # You can check out the documentation of {Cute::G5K::API#reserve reserve} and {Cute::G5K::API#deploy deploy} methods # to know all the parameters supported and more complex uses. # # == Another useful methods # # Let's use *pry* to show other useful methods. As shown in {file:README.md Ruby Cute} the *cute* command will open a # pry shell with some modules preloaded and it will create the variable $g5k to access {Cute::G5K::API G5K::API} class. # Therefore, we can consult the name of the cluster available in a specific site. # # [4] pry(main)> $g5k.cluster_uids("grenoble") # => ["adonis", "edel", "genepi"] # # As well as the deployable environments: # # [6] pry(main)> $g5k.environment_uids("grenoble") # => ["squeeze-x64-base", "squeeze-x64-big", "squeeze-x64-nfs", "wheezy-x64-base", "wheezy-x64-big", "wheezy-x64-min", "wheezy-x64-nfs", "wheezy-x64-xen"] # # For getting a list of sites available in Grid'5000 you can use: # # [7] pry(main)> $g5k.site_uids() # => ["grenoble", "lille", "luxembourg", "lyon",...] 
# # We can get the status of nodes in a given site by using: # # [8] pry(main)> $g5k.nodes_status("lyon") # => {"taurus-2.lyon.grid5000.fr"=>"besteffort", "taurus-16.lyon.grid5000.fr"=>"besteffort", "taurus-15.lyon.grid5000.fr"=>"besteffort", ...} # # We can get information about our submitted jobs by using: # # [11] pry(main)> $g5k.get_my_jobs("grenoble") # => [{"uid"=>1679094, # "user_uid"=>"cruizsanabria", # "user"=>"cruizsanabria", # "walltime"=>3600, # "queue"=>"default", # "state"=>"running", ...}, ...] # # If we are done with our experiment, we can release the submitted job or all jobs in a given site as follows: # # [12] pry(main)> $g5k.release(job) # [13] pry(main)> $g5k.release_all("grenoble") class API # Assigns a logger # # = Examples # You can use this attribute to control how to log all messages produce by {Cute::G5K::API G5K::API}. # For example, below we use the logger available in Ruby standard library. # # require 'cute' # require 'logger' # # g5k = Cute::G5K::API.new() # # g5k.logger = Logger.new(File.new('experiment_1.log')) attr_accessor :logger # Initializes a REST connection for Grid'5000 API # # = Example # You can specify another configuration file using the option *:conf_file*, for example: # # g5k = Cute::G5K::API.new(:conf_file =>"config file path") # # You can specify other parameters to use: # # g5k = Cute::G5K::API.new(:uri => "https://api.grid5000.fr", :version => "sid") # # If you want to ignore {Cute::G5K::RequestFailed ResquestFailed} exceptions you can use: # # g5k = Cute::G5K::API.new(:on_error => :ignore) # # @param [Hash] params Contains initialization parameters. 
# @option params [String] :conf_file Path for configuration file
# @option params [String] :uri REST API URI to contact
# @option params [String] :version Version of the REST API to use
# @option params [String] :username Username to access the REST API
# @option params [String] :password Password to access the REST API
# @option params [Symbol] :on_error Set to :ignore if you want to ignore {Cute::G5K::RequestFailed RequestFailed} exceptions.
def initialize(params={})
  default_file = "#{ENV['HOME']}/.grid5000_api.yml"

  # An explicit :conf_file wins; otherwise fall back to the default file when it exists.
  # The caller's hash is only read, never modified.
  conf_file = params[:conf_file]
  conf_file = default_file if conf_file.nil? && File.exist?(default_file)

  # :user and :pass are accepted for backward compatibility
  username = params[:username] || params[:user]
  password = params[:password] || params[:pass]

  # load_file closes the file once read; an empty file does not yield a Hash
  config = conf_file.nil? ? {} : YAML.load_file(conf_file)
  config = {} unless config.is_a?(Hash)

  @user = username || config["username"]
  @pass = password || config["password"]
  @uri = params[:uri] || config["uri"] || "https://api.grid5000.fr/"
  @api_version = params[:version] || config["version"] || "stable"
  @logger = nil

  begin
    @g5k_connection = G5KRest.new(@uri,@api_version,@user,@pass,params[:on_error])
  rescue => e
    msg_create_file = ""
    if conf_file.nil? # neither :conf_file was given nor the default file exists
      msg_create_file = "Please create the file: ~/.grid5000_api.yml and
                      put the necessary credentials or use the option
                      :conf_file to indicate another file for the credentials"
    end
    raise "Unable to authorize against the Grid'5000 API.
           #{e.message}
           #{msg_create_file}"
  end
end

# It returns the site name. Example:
#     site #=> "rennes"
# This will only work when {Cute::G5K::API G5K::API} is used within Grid'5000.
# In the other cases it will return *nil*
# @return [String] the site name where the method is called on
def site
  hostname = `hostname`.chop
  matched = /^.*\.(.*).*\.grid5000\.fr/.match(hostname)
  matched[1] unless matched.nil?
end

# @api private
# @return the rest point for performing low level REST requests
def rest
  @g5k_connection
end

# @return [String] Grid'5000 user
def g5k_user
  @user.nil? ? ENV['USER'] : @user
end

# Returns all sites identifiers
#
# = Example:
#     site_uids #=> ["grenoble", "lille", "luxembourg", "lyon",...]
#
# @return [Array] all site identifiers
def site_uids
  sites.uids
end

# Returns all cluster identifiers
#
# = Example:
#     cluster_uids("grenoble") #=> ["adonis", "edel", "genepi"]
#
# @return [Array] cluster identifiers
def cluster_uids(site)
  clusters(site).uids
end

# Returns the name of the environments deployable in a given site.
# These can be used with {Cute::G5K::API#reserve reserve} and {Cute::G5K::API#deploy deploy} methods
#
# = Example:
#     environment_uids("nancy") #=> ["squeeze-x64-base", "squeeze-x64-big", "squeeze-x64-nfs", ...]
#
# @return [Array] environment identifiers
def environment_uids(site)
  # environments are returned by the API following the format squeeze-x64-big-1.8
  # it returns environments without the version
  names = environments(site).uids.map do |versioned_name|
    matched = /(.*)-(.*)/.match(versioned_name)
    matched.nil? ? "" : matched[1]
  end
  names.uniq
end

# @return [Hash] all the status information of a given Grid'5000 site
# @param site [String] a valid Grid'5000 site name
def site_status(site)
  @g5k_connection.get_json(api_uri("sites/#{site}/status"))
end

# @return [Hash] the nodes state (e.g, free, busy, etc) that belong to a given Grid'5000 site
# @param site [String] a valid Grid'5000 site name
def nodes_status(site)
  nodes = {}
  site_status(site).nodes.each do |name, details|
    nodes[name] = details["soft"]
  end
  nodes
end

# @return [Array] the description of all Grid'5000 sites
def sites
  @g5k_connection.get_json(api_uri("sites")).items
end

# @return [Array] the description of clusters that belong to a given Grid'5000 site
# @param site [String] a valid Grid'5000 site name
def clusters(site)
  @g5k_connection.get_json(api_uri("sites/#{site}/clusters")).items
end

# @return [Array] the description of all environments registered in a Grid'5000 site
def environments(site)
  @g5k_connection.get_json(api_uri("sites/#{site}/environments")).items
end

# @return [Hash] all the jobs submitted in a given Grid'5000 site,
#         if a uid is provided only the jobs owned by the user are shown.
# @param site [String] a valid Grid'5000 site name
# @param uid [String] user name in Grid'5000
# @param states [Array] or [String] jobs state: running, waiting (multiple states can be specified)
def get_jobs(site, uid = nil, states = nil)
  parameters = []
  parameters.push("state=#{Array(states).join(",")}") if states
  parameters.push("user=#{uid}") if uid
  # Requesting every job can take a while (e.g., 23 seconds): the API returns 50 jobs
  # by default, so the limit is lowered to 25 when no filter is given.
  parameters.push("limit=25") if (states.nil? and uid.nil?)

  jobs = @g5k_connection.get_json(api_uri("/sites/#{site}/jobs?#{parameters.join("&")}")).items
  # the listing is followed by one request per job, through its 'self' link
  jobs.map{ |j| @g5k_connection.get_json(j.rel_self)}
end

# @return [Hash] the last 50 deployments performed in a Grid'5000 site
# @param site [String] a valid Grid'5000 site name
# @param uid [String] user name in Grid'5000
def get_deployments(site, uid = nil)
  @g5k_connection.get_json(api_uri("sites/#{site}/deployments/?user=#{uid}")).items
end

# @return [Hash] information concerning a given job submitted in a Grid'5000 site
# @param site [String] a valid Grid'5000 site name
# @param jid [Fixnum] a valid job identifier
def get_job(site, jid)
  @g5k_connection.get_json(api_uri("/sites/#{site}/jobs/#{jid}"))
end

# @return [Hash] switches information available in a given Grid'5000 site.
# @param site [String] a valid Grid'5000 site name
def get_switches(site)
  equipments = @g5k_connection.get_json(api_uri("/sites/#{site}/network_equipments")).items
  switches = equipments.select { |x| x['kind'] == 'switch' }
  # extract nodes connected to those switches
  switches.each do |switch|
    conns = switch['linecards'].detect { |c| c['kind'] == 'node' }
    next if conns.nil? # IB switches for example
    switch['nodes'] = conns['ports'].reject { |port| port == {} }.map do |port|
      "#{port['uid']}.#{site}.grid5000.fr"
    end
  end
  switches
end

# @return [Hash] information of a specific switch available in a given Grid'5000 site.
# @param site [String] a valid Grid'5000 site name
# @param name [String] a valid switch name
# @raise [RuntimeError] when the site has no switch with that name
def get_switch(site, name)
  s = get_switches(site).detect { |x| x.uid == name }
  raise "Unknown switch '#{name}'" if s.nil?
  return s
end

# Returns information using the Metrology API.
#
# = Example
#
# You can get detailed information of available metrics in a given site:
#     get_metric("rennes")
#
# If you are only interested in the names of the available metrics:
#     get_metric("rennes").uids #=> ["cpu_nice", "boottime", "bytes_in", ...]
#
# Then, you can get information about the probes available for a specific metric:
#     get_metric("rennes",:metric => "network_in")
#
# Finally, you can query on a specific probe:
#     get_metric("rennes",:metric => "network_in",:query => {:from => 1450374553, :to => 1450374553, :only => "parasilo-11-eth0"})
#
# @return [Array] information of a specific metric in a given Grid'5000 site.
# @param site [String] a valid Grid'5000 site name
# @param [Hash] opts Options for metric query
# @option opts [String] :metric specific metric to query on
# @option opts [Hash] :query timeseries parameters (e.g. only, resolution, from, to)
def get_metric(site,opts ={})
  params = opts[:metric].nil? ? "" : "/#{opts[:metric]}/timeseries"
  query = opts[:query]
  # encode_www_form escapes keys and values and joins the pairs with '&'
  params += "?#{URI.encode_www_form(query)}" if query && !query.empty?
  @g5k_connection.get_json(api_uri("sites/#{site}/metrics#{params}")).items
end

# Returns information of all my jobs submitted in a given site.
# By default it only shows the jobs in state *running*.
# You can specify another state like this:
#
# = Examples
#     get_my_jobs("nancy", "waiting")
# Getting several states:
#     get_my_jobs("nancy", ["waiting","running"])
# Valid states are specified in {https://api.grid5000.fr/doc/4.0/reference/spec.html Grid'5000 API spec}
# @return [Array] all my submitted jobs to a given site and their associated deployments.
# @param site [String] a valid Grid'5000 site name
# @param states [String/Array] possible job state values (waiting, launching, running, hold, error, terminated)
def get_my_jobs(site, states = "running")
  # raise ArgumentError,"States parameter should be an Array" unless states.is_a?(Array)
  jobs = get_jobs(site, g5k_user, states)
  deployments = get_deployments(site, g5k_user)
  # filtering deployments only the job in state running make sense:
  # a running job gets every deployment created after the job started
  jobs.each do |job|
    next unless job["state"] == "running"
    job["deploy"] = deployments.select { |d| d["created_at"] > job["started_at"] }
  end
  jobs
end

# Returns an Array with all subnets reserved by a given job.
# Each element of the Array is a {https://github.com/bluemonk/ipaddress IPAddress::IPv4} object which we can interact with to obtain
# the details of our reserved subnets:
#
# = Example
#     require 'cute'
#
#     g5k = Cute::G5K::API.new()
#
#     job = g5k.reserve(:site => "lyon", :resources => "/slash_22=1+{virtual!='none'}/nodes=1")
#
#     subnet = g5k.get_subnets(job).first #=> we use 'first' because it is an array and we only reserved one subnet.
#
#     ips = subnet.map{ |ip| ip.to_s }
#
# @return [Array] all the subnets defined in a given job, nil when the job has no subnets
# @param job [G5KJSON] as described in {Cute::G5K::G5KJSON job}
def get_subnets(job)
  subnets = job.resources["subnets"]
  return nil if subnets.nil?
  subnets.map { |s| IPAddress::IPv4.new s }
end

# @return [Array] all the nodes in the VLAN, nil when the job has no VLAN.
#         Node names that do not look like "name-number.domain" yield a nil entry.
# @param job [G5KJSON] as described in {Cute::G5K::G5KJSON job}
def get_vlan_nodes(job)
  vlans = job.resources["vlans"]
  return nil if vlans.nil?

  vlan_id = vlans.first
  reg = /^(\w+-\d+)(\..*)$/
  job["assigned_nodes"].map do |name|
    matched = reg.match(name)
    # insert "-kavlan-<id>" between the host part and the domain part
    "#{matched[1]}-kavlan-#{vlan_id}#{matched[2]}" unless matched.nil?
  end
end

# Releases all jobs on a site
# @param site [String] a valid Grid'5000 site name
# @return [true]
# @raise [ArgumentError] when site is not a String
# @raise [Timeout::Error] when the whole operation takes more than 20 seconds
def release_all(site)
  raise ArgumentError, "parameter should be a string" unless site.is_a?(String)
  Timeout.timeout(20) do
    # a single listing covers both states
    jobs = get_my_jobs(site, ["running", "waiting"])
    begin
      jobs.each { |j| release(j) }
    rescue Cute::G5K::RequestFailed => e
      raise unless e.response.include?('already killed')
    end
  end
  return true
end

# Releases a resource, it can be a job or a deploy.
# @param resource [G5KJSON] as described in {Cute::G5K::G5KJSON job}
# @return the response of the delete request, or nil when the resource was already killed
# @raise [ArgumentError] when resource is not a G5KJSON
# @raise [RequestFailed] when the request fails for a reason other than 'already killed'
def release(resource)
  raise ArgumentError, "parameter should be a G5KJSON data type" unless resource.is_a?(Cute::G5K::G5KJSON)
  begin
    return @g5k_connection.delete_json(resource.rel_self)
  rescue Cute::G5K::RequestFailed => e
    # releasing something that is already gone is not an error
    raise unless e.response.include?('already killed')
  end
end

# Performs a reservation in Grid'5000.
#
# = Examples
#
# By default this method blocks until the reservation is ready,
# if we want this method to return after creating the reservation we set the option *:wait* to *false*.
# Then, you can use the method {Cute::G5K::API#wait_for_job wait_for_job} to wait for the reservation.
#
#     job = g5k.reserve(:nodes => 25, :site => 'luxembourg', :walltime => '01:00:00', :wait => false)
#
#     job = g5k.wait_for_job(job, :wait_time => 100)
#
# == Reserving with properties
#
#     job = g5k.reserve(:site => 'lyon', :nodes => 2, :properties => "wattmeter='YES'")
#
#     job = g5k.reserve(:site => 'nancy', :nodes => 1, :properties => "switch='sgraphene1'")
#
#     job = g5k.reserve(:site => 'nancy', :nodes => 1, :properties => "cputype='Intel Xeon E5-2650'")
#
# == Subnet reservation
#
# The example below reserves 2 nodes in the cluster *chirloute* located in Lille for 1 hour as well as 2 /22 subnets.
# We will get 2048 IP addresses that can be used, for example, in virtual machines.
# If walltime is not specified, 1 hour walltime will be assigned to the reservation.
#
#     job = g5k.reserve(:site => 'lille', :cluster => 'chirloute', :nodes => 2,
#                       :env => 'wheezy-x64-xen', :keys => "~/my_ssh_jobkey",
#                       :subnets => [22,2])
#
# == Before using OAR hierarchy
# All non-deploy reservations are submitted by default with the OAR option "-allow_classic_ssh"
# which does not take advantage of the CPU/core management level.
# Therefore, in order to take advantage of this capability, SSH keys have to be specified at the moment of reserving resources.
# This has to be used whenever we perform a reservation with cpu and core hierarchy. # Users are encouraged to create a pair of SSH keys for managing jobs, for instance the following command can be used: # # ssh-keygen -N "" -t rsa -f ~/my_ssh_jobkey # # The reserved nodes can be accessed using "oarsh" or by configuring the SSH connection as shown in {https://www.grid5000.fr/mediawiki/index.php/OAR2 OAR2}. # You have to specify different keys per reservation if you want several jobs running at the same time in the same site. # Example using the OAR hierarchy: # # job = g5k.reserve(:site => "grenoble", :switches => 3, :nodes => 1, :cpus => 1, :cores => 1, :keys => "~/my_ssh_jobkey") # # == Using OAR syntax # # The parameter *:resources* can be used instead of parameters such as: *:cluster*, *:nodes*, *:cpus*, *:walltime*, *:vlan*, *:subnets*, *:properties*, etc, # which are shortcuts for OAR syntax. These shortcuts are ignored if the the parameter *:resources* is used. # Using the parameter *:resources* allows to express more flexible and complex reservations by using directly the OAR syntax. # Therefore, the two examples shown below are equivalent: # # job = g5k.reserve(:site => "grenoble", :switches => 3, :nodes => 1, :cpus => 1, :cores => 1, :keys => "~/my_ssh_jobkey") # job = g5k.reserve(:site => "grenoble", :resources => "/switch=3/nodes=1/cpu=1/core=1", :keys => "~/my_ssh_jobkey") # # Combining OAR hierarchy with properties: # # job = g5k.reserve(:site => "grenoble", :resources => "{ib10g='YES' and memnode=24160}/cluster=1/nodes=2/core=1", :keys => "~/my_ssh_jobkey") # # If we want 2 nodes with the following constraints: # 1) nodes on 2 different clusters of the same site, 2) nodes with virtualization capability enabled # 3) 1 /22 subnet. 
# The reservation will be like:
#
#     job = g5k.reserve(:site => "rennes", :resources => "/slash_22=1+{virtual!='none'}/cluster=2/nodes=1")
#
# Another reservation for two clusters:
#
#     job = g5k.reserve(:site => "nancy", :resources => "{cluster='graphene'}/nodes=2+{cluster='griffon'}/nodes=3")
#
# Reservation using a local VLAN
#
#     job = g5k.reserve(:site => 'nancy', :resources => "{type='kavlan-local'}/vlan=1,nodes=1", :env => 'wheezy-x64-xen')
#
# @return [G5KJSON] as described in {Cute::G5K::G5KJSON job}
# @param [Hash] opts Options for reservation in Grid'5000
# @option opts [Numeric] :nodes Number of nodes to reserve
# @option opts [String] :walltime Walltime of the reservation
# @option opts [String] :site Grid'5000 site
# @option opts [Symbol] :type Type of reservation: :deploy, :allow_classic_ssh
# @option opts [String] :name Reservation name
# @option opts [String] :cmd The command to execute when the job starts (e.g. ./my-script.sh).
# @option opts [String] :cluster Valid Grid'5000 cluster
# @option opts [String] :queue A specific job queue
# @option opts [Array] :subnets 1) prefix_size, 2) number of subnets
# @option opts [String] :env Environment name for {http://kadeploy3.gforge.inria.fr/ Kadeploy}
# @option opts [Symbol] :vlan Vlan type: :routed, :local, :global
# @option opts [String] :properties OAR properties defined in the cluster
# @option opts [String] :resources OAR syntax for complex submissions
# @option opts [String] :reservation Request a job to be scheduled at a specific date.
#                                    The date format is "YYYY-MM-DD HH:MM:SS".
# @option opts [Boolean] :wait Whether or not to wait until the job is running (default is true)
# @raise [ArgumentError] when an option or a vlan type is not recognized
# @raise [RuntimeError] when the site is missing or :nodes is neither an integer nor "ALL"/"BEST"
def reserve(opts)

  # checking valid options
  valid_opts = [:site, :cluster, :switches, :cpus, :cores, :nodes, :walltime, :cmd,
                :type, :name, :subnets, :env, :vlan, :properties, :resources,
                :reservation, :wait, :keys, :queue, :env_user]
  unre_opts = opts.keys - valid_opts
  raise ArgumentError, "Unrecognized option #{unre_opts}" unless unre_opts.empty?

  # Work on a copy: defaults are filled in and :nodes is removed further down,
  # none of which should be visible in the caller's hash.
  opts = opts.dup

  nodes = opts.fetch(:nodes, 1)
  walltime = opts.fetch(:walltime, '01:00:00')
  site = opts[:site]
  name = opts.fetch(:name, 'rubyCute job')
  command = opts[:cmd]
  opts[:wait] = true if opts[:wait].nil?
  cluster = opts[:cluster]
  switches = opts[:switches]
  cpus = opts[:cpus]
  cores = opts[:cores]
  subnets = opts[:subnets]
  properties = opts[:properties]
  reservation = opts[:reservation]
  resources = opts.fetch(:resources, "")
  keys = opts[:keys]
  queue = opts[:queue]

  # Job types as symbols; asking for an environment implies a deploy job.
  # Array() accepts a single Symbol/String, an Array or nil without touching the caller's object.
  type = opts[:env] ? [:deploy] : Array(opts[:type]).map { |t| t.to_sym }

  vlan_opts = {:routed => "kavlan",:global => "kavlan-global",:local => "kavlan-local"}
  vlan = nil
  unless opts[:vlan].nil?
    vlan = vlan_opts.fetch(opts[:vlan]) { raise ArgumentError, 'Option for vlan not recognized' }
  end

  raise 'At least nodes, time and site must be given' if [nodes, walltime, site].any? { |x| x.nil? }

  # Integer instead of Fixnum: Fixnum no longer exists in recent Ruby versions
  raise 'nodes should be an integer or a string containing either ALL or BEST' unless (nodes.is_a?(Integer) or ["ALL","BEST"].include?(nodes))

  secs = walltime.to_secs
  walltime = walltime.to_time

  command = "sleep #{secs}" if command.nil?

  # Build the OAR resource string from the shortcut options, unless :resources was given
  if resources == ""
    resources = "/switch=#{switches}" unless switches.nil?
    resources += "/nodes=#{nodes}"
    resources += "/cpu=#{cpus}" unless cpus.nil?
    resources += "/core=#{cores}" unless cores.nil?

    if cluster
      resources = (cluster.is_a?(Integer) ? "/cluster=#{cluster}" : "{cluster='#{cluster}'}") + resources
    end

    resources = "{type='#{vlan}'}/vlan=1+" + resources unless vlan.nil?
    resources = "slash_#{subnets[0]}=#{subnets[1]}+" + resources unless subnets.nil?
  end

  resources += ",walltime=#{walltime}" unless resources.include?("walltime")

  payload = {
    'resources' => resources,
    'name' => name,
    'command' => command
  }

  info "Reserving resources: #{resources} (type: #{type}) (in #{site})"

  payload['properties'] = properties unless properties.nil?
  payload['types'] = type.map { |t| t.to_s }
  payload['queue'] = queue if queue

  unless type.include?(:deploy)
    if opts[:keys]
      payload['import-job-key-from-file'] = [ File.expand_path(keys) ]
    else
      # keep the types requested by the caller and add the classic ssh access
      payload['types'] |= [ 'allow_classic_ssh' ]
    end
  end

  if reservation
    payload['reservation'] = reservation
    info "Starting this reservation at #{reservation}"
  end

  begin
    # Support for the option "import-job-key-from-file"
    # The request has to be redirected to the OAR API given that Grid'5000 API
    # does not support some OAR options.
    if payload['import-job-key-from-file'] then
      temp = @g5k_connection.post_json(api_uri("sites/#{site}/internal/oarapi/jobs"),payload)
      sleep 1 # This is for being sure that our job appears on the list
      r = get_my_jobs(site,nil).detect { |j| j["uid"] == temp["id"] }
    else
      r = @g5k_connection.post_json(api_uri("sites/#{site}/jobs"),payload) # This makes reference to the same class
    end
  rescue Error => e
    info "Fail to submit job"
    info e.message
    e.http_body.split("\\n").each{ |line| info line}
    raise
  end

  job = @g5k_connection.get_json(r.rel_self)
  job = wait_for_job(job) if opts[:wait] == true
  opts.delete(:nodes) # to not collapse with deploy options
  deploy(job,opts) unless opts[:env].nil? #type == :deploy
  return job

end

# Blocks until job is in *running* state
#
# = Example
# You can pass the parameter *:wait_time* that allows you to timeout the submission (by default is 10h).
# The method will throw a {Cute::G5K::EventTimeout Timeout} exception
# that you can catch and react upon.
# The following example shows how it can be used. Let's suppose we want to find 5 nodes available for
# 3 hours. We can try in each site using the script below.
#
#     require 'cute'
#
#     g5k = Cute::G5K::API.new()
#
#     sites = g5k.site_uids
#
#     sites.each{ |site|
#       job = g5k.reserve(:site => site, :nodes => 5, :wait => false, :walltime => "03:00:00")
#       begin
#         job = g5k.wait_for_job(job, :wait_time => 60)
#         puts "Nodes assigned #{job['assigned_nodes']}"
#         break
#       rescue Cute::G5K::EventTimeout
#         puts "We waited too long in site #{site} let's release the job and try in another site"
#         g5k.release(job)
#       end
#     }
#
# @param job [G5KJSON] as described in {Cute::G5K::G5KJSON job}
# @param [Hash] opts Options
# @option opts [Numeric] :wait_time Number of seconds to wait before triggering a timeout
#   (set to 36000 in *opts* when it is nil)
# @return [G5KJSON] the last refreshed version of the job, whose state is 'running'
# @raise [EventTimeout] if the job is not running before *:wait_time* seconds have elapsed
# @raise [RuntimeError] if the job is found in the 'finishing' state
def wait_for_job(job,opts={})
  opts[:wait_time] = 36000 if opts[:wait_time].nil?

  reservation_id = job['uid']
  info "Waiting for reservation #{reservation_id}"

  begin
    Timeout.timeout(opts[:wait_time]) do
      loop do
        job = job.refresh(@g5k_connection)

        scheduled_at = job['scheduled_at']
        unless scheduled_at.nil?
          start_time = Time.at(scheduled_at)
          # Never report a negative delay when the scheduled time is already past.
          remaining = [start_time - Time.now, 0].max.to_i
          info "Reservation #{reservation_id} should be available at #{start_time} (#{remaining} s)"
        end

        case job['state']
        when 'running' then break
        when 'finishing' then raise "Job is finishing."
        end

        # Poll again after 5 seconds.
        Kernel.sleep(5)
      end
    end
  rescue Timeout::Error
    raise EventTimeout.new("Event timeout")
  end

  info "Reservation #{reservation_id} ready"
  job
end

# Deploys an environment in a set of reserved nodes using {http://kadeploy3.gforge.inria.fr/ Kadeploy}.
# A job structure returned by {Cute::G5K::API#reserve reserve} or {Cute::G5K::API#get_my_jobs get_my_jobs} methods
# is mandatory as a parameter as well as the environment to deploy.
# By default this method does not block, for that you have to set the option *:wait* to *true*.
# # = Examples # Deploying the production environment *wheezy-x64-base* on all the reserved nodes and wait until the deployment is done: # # deploy(job, :env => "wheezy-x64-base", :wait => true) # # Other parameters you can specify are *:nodes* [Array] for deploying on specific nodes within a job and # *:keys* [String] to specify the public key to use during the deployment. # # deploy(job, :nodes => ["genepi-2.grid5000.fr"], :env => "wheezy-x64-xen", :keys => "~/my_key") # # The parameter *:keys* [String] can be a string specifying the path of the key (as the previous case) # or the contents of the public ssh key as the example given below: # # deploy(job,:env => "jessie-x64-big", :keys => File.read("/tmp/test_key/test_key.pub")) # # @param job [G5KJSON] as described in {Cute::G5K::G5KJSON job} # @param [Hash] opts Deploy options # @option opts [String] :env {http://kadeploy3.gforge.inria.fr/ Kadeploy} environment to deploy # @option opts [Array] :nodes Specifies the nodes to deploy on # @option opts [String] :keys Specifies the SSH keys to copy for the deployment # @option opts [Boolean] :wait Whether or not to wait until the deployment is done (default is false) # @return [G5KJSON] a job with deploy information as described in {Cute::G5K::G5KJSON job} def deploy(job, opts = {}) # checking valid options, same as reserve option even though some option dont make any sense valid_opts = [:site, :cluster, :switches, :cpus, :cores, :nodes, :walltime, :cmd, :type, :name, :subnets, :env, :vlan, :properties, :resources, :reservation, :wait, :keys, :queue, :env_user] unre_opts = opts.keys - valid_opts raise ArgumentError, "Unrecognized option #{unre_opts}" unless unre_opts.empty? raise ArgumentError, "Unrecognized job format" unless job.is_a?(G5KJSON) env = opts[:env] raise ArgumentError, "Environment must be given" if env.nil? nodes = opts[:nodes].nil? ? 
job['assigned_nodes'] : opts[:nodes] raise "Unrecognized nodes format, use an Array" unless nodes.is_a?(Array) site = @g5k_connection.follow_parent(job).uid if opts[:keys].nil? then public_key_path = File.expand_path("~/.ssh/id_rsa.pub") if File.exist?(public_key_path) then public_key_file = File.read(public_key_path) else raise ArgumentError, "No public ssh key found" end else # We check if the string passed contains the ssh public key if (opts[:keys].length < 300 && (opts[:keys] =~ /^ssh.*/).nil?) public_key_file = File.read("#{File.expand_path(opts[:keys])}.pub").chop else public_key_file = opts[:keys] end end payload = { 'nodes' => nodes, 'environment' => env, 'key' => public_key_file, } if !job.resources["vlans"].nil? vlan = job.resources["vlans"].first payload['vlan'] = vlan info "Found VLAN with uid = #{vlan}" end payload['user'] = opts[:env_user] unless opts[:env_user].nil? info "Creating deployment" begin r = @g5k_connection.post_json(api_uri("sites/#{site}/deployments"), payload) rescue Error => e info "Fail to deploy" info e.message e.http_body.split("\\n").each{ |line| info line} raise end job["deploy"] = [] if job["deploy"].nil? job["deploy"].push(r) job = wait_for_deploy(job) if opts[:wait] == true return job end # Returns the status of all deployments performed within a job. # The results can be filtered using a Hash with valid deployment properties # described in {https://api.grid5000.fr/doc/4.0/reference/spec.html Grid'5000 API spec}. # # = Example # # deploy_status(job, :nodes => ["adonis-10.grenoble.grid5000.fr"], :status => "terminated") # # @return [Array] status of deploys within a job # @param job [G5KJSON] as described in {Cute::G5K::G5KJSON job} # @param filter [Hash] filter the deployments to be returned. def deploy_status(job,filter = {}) job["deploy"].map!{ |d| d.refresh(@g5k_connection) } filter.keep_if{ |k,v| v} # removes nil values if filter.empty? 
status = job["deploy"].map{ |d| d["status"] } else status = job["deploy"].map{ |d| d["status"] if filter.select{ |k,v| d[k.to_s] != v }.empty? } end return status.compact end # Blocks until deployments have *terminated* status # # = Examples # This method requires a job as a parameter and it will blocks by default until all deployments # within the job pass form *processing* status to *terminated* status. # # wait_for_deploy(job) # # You can wait for specific deployments using the option *:nodes*. This can be useful when performing different deployments on the reserved resources. # # wait_for_deploy(job, :nodes => ["adonis-10.grenoble.grid5000.fr"]) # # Another parameter you can specify is *:wait_time* that allows you to timeout the deployment (by default is 10h). # The method will throw a {Cute::G5K::EventTimeout Timeout} exception # that you can catch and react upon. This example illustrates how this can be used. # # require 'cute' # # g5k = Cute::G5K::API.new() # # job = g5k.reserve(:nodes => 1, :site => 'lyon', :type => :deploy) # # begin # g5k.deploy(job,:env => 'wheezy-x64-base') # g5k.wait_for_deploy(job,:wait_time => 100) # rescue Cute::G5K::EventTimeout # puts "We waited too long let's release the job" # g5k.release(job) # end # # @param job [G5KJSON] as described in {Cute::G5K::G5KJSON job} # @param opts [Hash] options def wait_for_deploy(job,opts = {}) raise "Deploy information not present in the given job" if job["deploy"].nil? opts.merge!({:wait_time => 36000}) if opts[:wait_time].nil? nodes = opts[:nodes] begin Timeout.timeout(opts[:wait_time]) do # it will ask just for processing status status = deploy_status(job,{:nodes => nodes, :status => "processing"}) until status.empty? 
do info "Waiting for #{status.length} deployment" sleep 4 status = deploy_status(job,{:nodes => nodes, :status => "processing"}) end info "Deployment finished" return job end rescue Timeout::Error raise EventTimeout.new("Timeout triggered") end end # It returns an array of machines that did not deploy successfully # = Example # It can be used to try a new deploy: # # badnodes = g5k.check_deployment(job["deploy"].last) # g5k.deploy(job,:nodes => badnodes, :env => 'wheezy-x64-base') # g5k.wait_for_deploy(job) # # @return [Array] machines that did not deploy successfully # @param deploy_info [Hash] deployment structure information def check_deployment(deploy_info) deploy_info["result"].select{ |p,v| v["state"] == "KO"}.keys end private # Handles the output of messages within the module # @param msg [String] message to show def info(msg) if @logger.nil? then t = Time.now s = t.strftime('%Y-%m-%d %H:%M:%S.%L') puts "#{s} => #{msg}" else @logger.info(msg) end end # @return a valid Grid'5000 resource # it avoids "//" def api_uri(path) path = path[1..-1] if path.start_with?('/') return "#{@api_version}/#{path}" end end end end