lib/macroape/cli/preprocess_collection.rb in macroape-3.3.7 vs lib/macroape/cli/preprocess_collection.rb in macroape-3.3.8

- old
+ new

@@ -3,91 +3,80 @@ require 'shellwords' module Macroape module CLI module PreprocessCollection - + def self.main(argv) - help_string = %q{ - Command-line format: - ruby preprocess_collection.rb <file or folder with PWMs or .stdin with filenames> [options] + doc = <<-EOS.strip_doc + Command-line format: + #{run_tool_cmd} <file or folder with PWMs or .stdin with filenames> <output file> [options] - Options: - [-p <list of P-values>] - [-d <rough discretization> <precise discretization>] - [-b <background probabilities, ACGT - 4 numbers, space-delimited, sum should be equal to 1>] - [-o <output file>] - [-n <name>] - specify name for a collection. Default filename is based on this parameter - [--silent] - don't show current progress information during scan (by default this information's written into stderr) - [--pcm] - treats your input motifs as PCM-s. Motifs are converted to PWMs internally so output is the same as for according PWMs + Options: + [-p <list of P-values>] - comma separated(no spaces allowed) list of P-values to precalculate thresholds + [-d <rough discretization>,<precise discretization>] - set discretization rates, comma delimited (no spaces allowed), order doesn't matter + [--silent] - hide current progress information during scan (printed to stderr by default) + [--pcm] - treat the input file as Position Count Matrix. PCM-to-PWM transformation to be done internally. + [--boundary lower|upper] Upper boundary (default) means that the obtained P-value is greater than or equal to the requested P-value + [-b <background probabilities] ACGT - 4 numbers, comma-delimited(spaces not allowed), sum should be equal to 1, like 0.25,0.24,0.26,0.25 - The tool stores preprocessed Macroape collection to the specified YAML-file. + The tool preprocesses and stores Macroape motif collection in the specified YAML-file. - Example: - ruby preprocess_collection.rb ./motifs -p 0.001 0.0005 0.0001 -d 1 10 -b 0.2 0.3 0.2 0.3 -o collection.yaml - } + Example: + #{run_tool_cmd} ./motifs collection.yaml -p 0.001,0.0005,0.0001 -d 1,10 -b 0.2,0.3,0.3,0.2 + EOS - if ['-h', '--h', '-help', '--help'].any?{|help_option| argv.include?(help_option)} - STDERR.puts help_string + if argv.empty? || ['-h', '--h', '-help', '--help'].any?{|help_option| argv.include?(help_option)} + STDERR.puts doc exit end data_model = argv.delete('--pcm') ? Bioinform::PCM : Bioinform::PWM - + default_pvalues = [0.0005] background = [1,1,1,1] rough_discretization = 1 precise_discretization = 10 - output_file = 'collection.yaml' - max_hash_size = 1000000 - + max_hash_size = 10000000 + data_source = argv.shift - - raise "No input. You'd specify file or folder with pwms" unless data_source + output_file = argv.shift + + raise 'No input. You should specify file or folder with pwms' unless data_source raise "Error! File or folder #{data_source} doesn't exist" unless Dir.exist?(data_source) || File.exist?(data_source) || data_source == '.stdin' + raise 'You should specify output file' unless output_file pvalues = [] silent = false - output_file_specified = false + pvalue_boundary = :upper + until argv.empty? case argv.shift when '-b' - background = argv.shift(4).map(&:to_f) + background = argv.shift.split(',').map(&:to_f) raise 'background should be symmetric: p(A)=p(T) and p(G) = p(C)' unless background == background.reverse when '-p' - loop do - begin - Float(argv.first) - pvalues << argv.shift.to_f - rescue - raise StopIteration - end - end + pvalues = argv.shift.split(',').map(&:to_f) when '-d' - rough_discretization, precise_discretization = argv.shift(2).map(&:to_f).sort - when '-o' - output_file = argv.shift - output_file_specified = true - when '-m' + rough_discretization, precise_discretization = argv.shift.split(',').map(&:to_f).sort + when '--max-hash-size' max_hash_size = argv.shift.to_i - when '-n' - collection_name = argv.shift when '--silent' silent = true + when '--boundary' + pvalue_boundary = argv.shift.to_sym + raise 'boundary should be either lower or upper' unless pvalue_boundary == :lower || pvalue_boundary == :upper end end pvalues = default_pvalues if pvalues.empty? - collection = Bioinform::Collection.new(rough_discretization: rough_discretization, + collection = Bioinform::Collection.new(rough_discretization: rough_discretization, precise_discretization: precise_discretization, background: background, pvalues: pvalues) - if collection_name - collection.name = collection_name - output_file = "#{collection_name}.yaml" if !output_file_specified - end - + + data_source = data_source.gsub("\\",'/') if File.directory?(data_source) motifs = Dir.glob(File.join(data_source,'*')).sort.map do |filename| pwm = data_model.new(File.read(filename)) pwm.name ||= File.basename(filename, File.extname(filename)) pwm @@ -104,46 +93,67 @@ motifs << motif end else raise "Specified data source `#{data_source}` is neither directory nor file nor even .stdin" end - + pwms = motifs.map(&:to_pwm) - + pwms.each_with_index do |pwm,index| - STDERR.puts "#{index + 1} -- Name: #{pwm.name}, Length: #{pwm.length}" unless silent - + STDERR.puts "Motif #{pwm.name}, length: #{pwm.length} (#{index+1} of #{pwms.size}, #{index*100/pwms.size}% complete)" unless silent + # When support of onefile collections is introduced - then here should be check if name exists. # Otherwise it should skip motif and tell you about this # Also two command line options to fail on skipping or to skip silently should be included info = OpenStruct.new(rough: {}, precise: {}) pwm.set_parameters(background: background, max_hash_size: max_hash_size) skip_motif = false - pwm.discrete(rough_discretization).thresholds(*pvalues) do |pvalue, threshold, real_pvalue| + + fill_rough_infos = ->(pvalue, threshold, real_pvalue) do if real_pvalue == 0 $stderr.puts "#{pwm.name} at pvalue #{pvalue} has threshold that yields real-pvalue 0 in rough mode. Rough calculation will be skipped" else info.rough[pvalue] = threshold / rough_discretization end end - pwm.discrete(precise_discretization).thresholds(*pvalues) do |pvalue, threshold, real_pvalue| + fill_precise_infos = ->(pvalue, threshold, real_pvalue) do if real_pvalue == 0 $stderr.puts "#{pwm.name} at pvalue #{pvalue} has threshold that yields real-pvalue 0 in precise mode. Motif will be excluded from collection" skip_motif = true else info.precise[pvalue] = threshold / precise_discretization end end + + if pvalue_boundary == :lower + pwm.discrete(rough_discretization).thresholds(*pvalues, &fill_rough_infos) + else + pwm.discrete(rough_discretization).weak_thresholds(*pvalues, &fill_rough_infos) + end + + if pvalue_boundary == :lower + pwm.discrete(precise_discretization).thresholds(*pvalues, &fill_precise_infos) + else + pwm.discrete(precise_discretization).weak_thresholds(*pvalues,&fill_precise_infos) + end collection.add_pm(pwm, info) unless skip_motif end + STDERR.puts "100% complete. Saving results" unless silent File.open(output_file, 'w') do |f| f.puts(collection.to_yaml) end + puts OutputInformation.new{|infos| + infos.add_parameter('P', 'P-value list', pvalues.join(',')) + infos.add_parameter('VR', 'discretization value, rough', rough_discretization) + infos.add_parameter('VP', 'discretization value, precise', precise_discretization) + infos.add_parameter('PB', 'P-value boundary', pvalue_boundary) + infos.background_parameter('B', 'background', background) + }.result rescue => err - STDERR.puts "\n#{err}\n#{err.backtrace.first(5).join("\n")}\n\nUse -help option for help\n" + STDERR.puts "\n#{err}\n#{err.backtrace.first(5).join("\n")}\n\nUse --help option for help\n\n#{doc}" end end end end \ No newline at end of file