lib/origen/application/lsf.rb in origen-0.60.7 vs lib/origen/application/lsf.rb in origen-0.60.8

- old
+ new

@@ -32,18 +32,28 @@ # - prio High prio queue with low slot count, useful if you don't have slots available in normal queue. See PrioritizingMyJobs. # - ondemand On-Demand Servers to satisfy urgent and short-term (2 weeks or less) customer compute requirements. # - wam WAM cron processing # - grid Low-priority batch jobs (random sim, regressions, etc). Access to all spare CPU cycles. attr_accessor :cores + # default 400. Set number of max jobs (- 100) to be able to run by this user before lsf submission pauses. + # eg if max jobs you want ran is 500, set to 400, which will block until local submissions + # is less than 400 at which point it can batch submit up to 100 more jobs + attr_accessor :max_jobs + # default false. used when calculating remote job count for comparison with the max jobs parameter + # If set to true, then only jobs of the specified queue will be counted, effectively making + # the max_jobs value a max_jobs per queue + attr_accessor :queue_count_only def initialize @group = Origen.site_config.lsf_group @project = Origen.site_config.lsf_project @resource = Origen.site_config.lsf_resource @queue = Origen.site_config.lsf_queue @debug = Origen.site_config.lsf_debug @cores = Origen.site_config.lsf_cores + @max_jobs = Origen.site_config.lsf_max_jobs || 400 + @queue_count_only = Origen.site_config.lsf_queue_count_only || false end end # Accessor for the global LSF configuration, use this to modify the default # LSF configuration for a given setup. Typically an alternate configuration would @@ -78,11 +88,11 @@ # Submits the given command to the LSF, returns the LSF job ID def submit(command, options = {}) options = { dependents: [], - rerunnable: true, # Will rerun automatically if the execution host fails + rerunnable: true # Will rerun automatically if the execution host fails }.merge(options) limit_job_submissions do group = options[:group] || config.group group = group ? "-G #{group}" : '' project = options[:project] || config.project @@ -101,11 +111,11 @@ dependents = "-w '#{dependents}'" end cmd = "bsub -oo /dev/null #{dependents} #{rerunnable} #{group} #{project} #{resource} #{queue} #{cores} '#{command}'" if config.debug puts cmd - '496212' # Return a dummy ID to keep the caller happy + '496212' # Return a dummy ID to keep the caller happy else output = `#{cmd}` Origen.log.info output.strip if output.split("\n").last =~ /Job <(\d+)> is submitted/ Regexp.last_match[1] @@ -138,11 +148,20 @@ def remote_jobs_count i = 0 `bjobs 2>&1`.split("\n").each do |line| if line =~ /^(\d+).*(RUN|PEND)/ - i += 1 + if @queue_count_only && @queue + # only count jobs for current queue, helpful for when + # you have a service account user that runs lsf for a + # lot of jobs in addition to origen jobs + if line =~ /#{@queue}/ + i += 1 + end + else + i += 1 + end end end i end @@ -150,10 +169,10 @@ # to warn if a single users current job count gets above 500. # This method prevents that stage from being reached. def limit_job_submissions @local_job_count ||= 0 if @local_job_count == 100 - while remote_jobs_count > 400 + while remote_jobs_count > @max_jobs puts 'Waiting for submitted jobs count to fall below limit...' sleep 5 end @local_job_count = 0 yield