lib/macroape/cli/preprocess_collection.rb in macroape-3.3.6 vs lib/macroape/cli/preprocess_collection.rb in macroape-3.3.7

- old
+ new

@@ -1,52 +1,55 @@ -require 'macroape' +require_relative '../../macroape' require 'yaml' +require 'shellwords' module Macroape module CLI module PreprocessCollection def self.main(argv) help_string = %q{ Command-line format: - ruby preprocess_collection.rb <file or folder with PWMs or .stdin with PWMs> [options] + ruby preprocess_collection.rb <file or folder with PWMs or .stdin with filenames> [options] Options: [-p <list of P-values>] [-d <rough discretization> <precise discretization>] [-b <background probabilities, ACGT - 4 numbers, space-delimited, sum should be equal to 1>] [-o <output file>] + [-n <name>] - specify name for a collection. Default filename is based on this parameter [--silent] - don't show current progress information during scan (by default this information's written into stderr) [--pcm] - treats your input motifs as PCM-s. Motifs are converted to PWMs internally so output is the same as for according PWMs The tool stores preprocessed Macroape collection to the specified YAML-file. Example: ruby preprocess_collection.rb ./motifs -p 0.001 0.0005 0.0001 -d 1 10 -b 0.2 0.3 0.2 0.3 -o collection.yaml } - if argv.empty? || ['-h', '--h', '-help', '--help'].any?{|help_option| argv.include?(help_option)} + if ['-h', '--h', '-help', '--help'].any?{|help_option| argv.include?(help_option)} STDERR.puts help_string exit end data_model = argv.delete('--pcm') ? Bioinform::PCM : Bioinform::PWM default_pvalues = [0.0005] background = [1,1,1,1] rough_discretization = 1 precise_discretization = 10 - output_file = 'collection.yaml' + output_file = 'collection.yaml' max_hash_size = 1000000 data_source = argv.shift raise "No input. You'd specify file or folder with pwms" unless data_source raise "Error! File or folder #{data_source} doesn't exist" unless Dir.exist?(data_source) || File.exist?(data_source) || data_source == '.stdin' pvalues = [] silent = false + output_file_specified = false until argv.empty? case argv.shift when '-b' background = argv.shift(4).map(&:to_f) raise 'background should be symmetric: p(A)=p(T) and p(G) = p(C)' unless background == background.reverse @@ -61,35 +64,47 @@ end when '-d' rough_discretization, precise_discretization = argv.shift(2).map(&:to_f).sort when '-o' output_file = argv.shift + output_file_specified = true when '-m' max_hash_size = argv.shift.to_i + when '-n' + collection_name = argv.shift when '--silent' silent = true end end pvalues = default_pvalues if pvalues.empty? collection = Bioinform::Collection.new(rough_discretization: rough_discretization, precise_discretization: precise_discretization, background: background, pvalues: pvalues) + if collection_name + collection.name = collection_name + output_file = "#{collection_name}.yaml" if !output_file_specified + end if File.directory?(data_source) motifs = Dir.glob(File.join(data_source,'*')).sort.map do |filename| pwm = data_model.new(File.read(filename)) pwm.name ||= File.basename(filename, File.extname(filename)) pwm end elsif File.file?(data_source) input = File.read(data_source) - motifs = data_model.choose_parser(input).split_on_motifs(input, data_model) + motifs = data_model.split_on_motifs(input) elsif data_source == '.stdin' - input = $stdin.read - motifs = data_model.choose_parser(input).split_on_motifs(input, data_model) + filelist = $stdin.read.shellsplit + motifs = [] + filelist.each do |filename| + motif = data_model.new(File.read(filename)) + motif.name ||= File.basename(filename, File.extname(filename)) + motifs << motif + end else raise "Specified data source `#{data_source}` is neither directory nor file nor even .stdin" end pwms = motifs.map(&:to_pwm) @@ -100,22 +115,31 @@ # When support of onefile collections is introduced - then here should be check if name exists. # Otherwise it should skip motif and tell you about this # Also two command line options to fail on skipping or to skip silently should be included info = OpenStruct.new(rough: {}, precise: {}) - pwm.background!(background).max_hash_size!(max_hash_size) + pwm.set_parameters(background: background, max_hash_size: max_hash_size) + skip_motif = false pwm.discrete(rough_discretization).thresholds(*pvalues) do |pvalue, threshold, real_pvalue| - info.rough[pvalue] = threshold / rough_discretization + if real_pvalue == 0 + $stderr.puts "#{pwm.name} at pvalue #{pvalue} has threshold that yields real-pvalue 0 in rough mode. Rough calculation will be skipped" + else + info.rough[pvalue] = threshold / rough_discretization + end end pwm.discrete(precise_discretization).thresholds(*pvalues) do |pvalue, threshold, real_pvalue| - info.precise[pvalue] = threshold / precise_discretization + if real_pvalue == 0 + $stderr.puts "#{pwm.name} at pvalue #{pvalue} has threshold that yields real-pvalue 0 in precise mode. Motif will be excluded from collection" + skip_motif = true + else + info.precise[pvalue] = threshold / precise_discretization + end end - - collection.add_pm(pwm, info) + collection.add_pm(pwm, info) unless skip_motif end - File.open(output_file,'w') do |f| + File.open(output_file, 'w') do |f| f.puts(collection.to_yaml) end rescue => err STDERR.puts "\n#{err}\n#{err.backtrace.first(5).join("\n")}\n\nUse -help option for help\n" end \ No newline at end of file