lib/origen/application/lsf.rb in origen-0.60.7 vs lib/origen/application/lsf.rb in origen-0.60.8
- old
+ new
@@ -32,18 +32,28 @@
# - prio High prio queue with low slot count, useful if you don't have slots available in normal queue. See PrioritizingMyJobs.
# - ondemand On-Demand Servers to satisfy urgent and short-term (2 weeks or less) customer compute requirements.
# - wam WAM cron processing
# - grid Low-priority batch jobs (random sim, regressions, etc). Access to all spare CPU cycles.
attr_accessor :cores
+ # Default 400. Threshold on this user's running/pending job count above which LSF submission pauses.
+ # Set it to the maximum number of jobs you want minus 100: e.g. for a maximum of 500, set 400, so that
+ # submission blocks until the job count is 400 or lower, at which point up to 100 more jobs can be batch submitted.
+ attr_accessor :max_jobs
+ # Default false. Used when calculating the remote job count that is compared with max_jobs.
+ # If set to true, only jobs in the configured queue are counted, effectively making
+ # max_jobs a per-queue limit.
+ attr_accessor :queue_count_only
# Populates every LSF setting from the corresponding lsf_* value in Origen.site_config.
# max_jobs falls back to 400 and queue_count_only to false when the site config
# returns nil/false for them; the remaining settings are stored exactly as returned.
def initialize
@group = Origen.site_config.lsf_group
@project = Origen.site_config.lsf_project
@resource = Origen.site_config.lsf_resource
@queue = Origen.site_config.lsf_queue
@debug = Origen.site_config.lsf_debug
@cores = Origen.site_config.lsf_cores
+ @max_jobs = Origen.site_config.lsf_max_jobs || 400
+ @queue_count_only = Origen.site_config.lsf_queue_count_only || false
end
end
# Accessor for the global LSF configuration, use this to modify the default
# LSF configuration for a given setup. Typically an alternate configuration would
@@ -78,11 +88,11 @@
# Submits the given command to the LSF, returns the LSF job ID
def submit(command, options = {})
options = {
dependents: [],
- rerunnable: true, # Will rerun automatically if the execution host fails
+ rerunnable: true # Will rerun automatically if the execution host fails
}.merge(options)
limit_job_submissions do
group = options[:group] || config.group
group = group ? "-G #{group}" : ''
project = options[:project] || config.project
@@ -101,11 +111,11 @@
dependents = "-w '#{dependents}'"
end
cmd = "bsub -oo /dev/null #{dependents} #{rerunnable} #{group} #{project} #{resource} #{queue} #{cores} '#{command}'"
if config.debug
puts cmd
- '496212' # Return a dummy ID to keep the caller happy
+ '496212' # Return a dummy ID to keep the caller happy
else
output = `#{cmd}`
Origen.log.info output.strip
if output.split("\n").last =~ /Job <(\d+)> is submitted/
Regexp.last_match[1]
@@ -138,11 +148,20 @@
# Returns the number of jobs that `bjobs` lists in the RUN or PEND state.
#
# When config.queue_count_only is set and a queue is configured, only jobs
# whose bjobs line mentions that queue are counted. This is helpful when a
# service account runs a lot of LSF jobs in addition to Origen jobs.
#
# The settings are read through config (as submit does) because they live on
# the configuration object rather than in instance variables of this object.
def remote_jobs_count
  # nil/false when no per-queue filtering is requested
  queue = config.queue_count_only && config.queue
  `bjobs 2>&1`.split("\n").count do |line|
    # Job lines start with the numeric job ID; headers and messages do not
    next false unless line =~ /^(\d+).*(RUN|PEND)/

    # Literal match, so characters such as '.' in a queue name are not
    # treated as regexp syntax
    !queue || line.include?(queue.to_s)
  end
end
@@ -150,10 +169,10 @@
# to warn if a single user's current job count gets above 500.
# This method prevents that stage from being reached.
def limit_job_submissions
@local_job_count ||= 0
if @local_job_count == 100
- while remote_jobs_count > 400
+ while remote_jobs_count > @max_jobs
puts 'Waiting for submitted jobs count to fall below limit...'
sleep 5
end
@local_job_count = 0
yield