#!/usr/bin/env ruby
# encoding: utf-8

# SeqTrimNext: Next generation sequencing preprocessor
# Copyright (C) <2011>
# Authors: Almudena Bocinos Rioboo, Diego Dario Guerrero Fernandez,
# Rocio Bautista Moreno, Juan Falgueras Cano & M. Gonzalo Claros
# email: soporte@scbi.uma.es - http://www.scbi.uma.es
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

#= SEQTRIM II
#
#== Running
#
# Seqtrim can be run locally or in a parallel/distributed environment.
#
#=== Running locally
#* list
#
#=== Running in a distributed environment
#
#== SEC 2
#
#=== SUB 2.1
#

# Legacy load-path setup (disabled, kept for reference):
#
# #finds the classes that were in the folder 'classes'
# ROOT_PATH=File.dirname(__FILE__)
# $: << File.expand_path(File.join(ROOT_PATH, 'classes'))
#
# #finds the classes that were in the folder 'plugins'
# $: << File.expand_path(File.join(ROOT_PATH, 'plugins'))
#
# #finds the classes that were in the folder 'actions'
# $: << File.expand_path(File.join(ROOT_PATH, 'actions'))
#
# #finds the classes that were in the folder 'utils'
# $: << File.expand_path(File.join(ROOT_PATH, 'utils'))
#
# $: << File.expand_path(File.join(ROOT_PATH, 'classes','em_classes'))

# to test scbi_drb gem locally
# $: << File.expand_path('~/progs/ruby/gems/scbi_drb/lib/')

# $: << File.expand_path(ROOT_PATH)

# NOTE(review): developer-specific path; looks like a leftover from local
# gem testing -- confirm whether it is still needed.
$LOAD_PATH << File.expand_path('~/progs/ruby/gems/seqtrimnext/lib/')
# $: << File.expand_path('~/progs/ruby/gems/scbi_mapreduce/lib/')

require 'seqtrimnext'
require 'scbi_headers'

# Prints the program banner (name, version, description, copyright year and
# authors) to standard output, built with ScbiHeader.
def put_header
  header = ScbiHeader.new('SeqTrimNEXT', Seqtrimnext::SEQTRIM_VERSION)

  header.description = "SeqtrimNEXT is a customizable and distributed pre-processing software for NGS (Next Generation Sequencing) biological data. It makes use of scbi_mapreduce gem to be able to run in parallel and distributed environments. It is specially suited for Roche 454 (normal and paired-end) & Ilumina datasets, although it could be easyly adapted to any other situation."

  header.copyright = '2011'

  header.authors << "Darío Guerrero"
  header.authors << "Almudena Bocinos"
  header.authors << "Rocío Bautista"
  header.authors << "Noé Fernández"
  header.authors << "Juan Falgueras"
  header.authors << "M. Gonzalo Claros"

  # header.articles<< "Article one: with one description line"
  # header.articles<< "Article two: with one description line"

  # To output the header
  puts header
end

put_header

############ PATHS #######################

# ROOT_PATH is not assigned in this file (the assignment above is commented
# out); presumably it is defined by `require 'seqtrimnext'` -- verify.
$SEQTRIM_PATH = ROOT_PATH

# Use the init file named by SEQTRIMNEXT_INIT when it exists, otherwise fall
# back to 'init_env' inside the installation directory.
if ENV['SEQTRIMNEXT_INIT'] && File.exist?(ENV['SEQTRIMNEXT_INIT'])
  $SEQTRIMNEXT_INIT = File.expand_path(ENV['SEQTRIMNEXT_INIT'])
else
  $SEQTRIMNEXT_INIT = File.join($SEQTRIM_PATH, 'init_env')
end

# if there is a BLASTDB environment var, then use it
if ENV['BLASTDB'] # && Dir.exists?(ENV['BLASTDB'])
  $FORMATTED_DB_PATH = ENV['BLASTDB']
  $DB_PATH = File.dirname($FORMATTED_DB_PATH)
else
  # otherwise use ROOT_PATH + DB
  $FORMATTED_DB_PATH = File.expand_path(File.join(ROOT_PATH, "DB", 'formatted'))
  $DB_PATH = File.expand_path(File.join(ROOT_PATH, "DB"))
end

# Export the resolved path so that child processes see the same BLASTDB.
ENV['BLASTDB'] = $FORMATTED_DB_PATH

OUTPUT_PATH = 'output_files'

# TODO: document all classes and methods so that their descriptions show up
# when rdoc is run from the terminal.

# Checks install requirements; abort when they are not satisfied.
require 'install_requirements'

ins = InstallRequirements.new
exit(-1) unless ins.check_install_requirements

require "logger"
require 'optparse'
require "global_match"
require "seqtrim"
require "params.rb"
require "plugin.rb"
require "sequence.rb"
require "plugin_manager.rb"
require "make_blast_db"
require 'hash_stats'
require 'list_db'
require 'install_database'
require 'socket'

# Prints usage examples plus the lists of bundled templates and available
# databases, then terminates the process with `exit` (status 0).
def show_additional_help
  ruler = ' ========================================================================================================'

  puts "\n" * 3
  puts "E.g.: processing a fastq sequences file"
  puts "#{File.basename($0)} -t genomics_454.txt -Q sequences.fastq"
  puts "\n" * 2
  puts "E.g.: processing a fasta file with qual"
  puts "#{File.basename($0)} -t genomics_454.txt -f sequences.fasta -q sequences.qual"

  templates = Dir.glob(File.join($SEQTRIM_PATH, 'templates', '*.txt')).map { |t| File.basename(t) }

  puts "\n\n#{ruler}"
  puts " Available templates to use with -t option (you can also use your own template):"
  puts " Templates at: #{File.join($SEQTRIM_PATH,'templates')}"
  puts "#{ruler}\n\n"

  templates.each { |name| puts " " + name }

  puts "\n\n#{ruler}"
  puts " Available databases to use in custom template files (you can also use your own database):"
  puts " Databases at: #{$DB_PATH}"
  puts "#{ruler}\n\n"

  ListDb.list_databases($DB_PATH).each { |name| puts " " + name }

  #
  # ip_list = Socket.ip_address_list.select{|e| e.ipv4?}.map{|e| e.ip_address}
  #
  # puts ip_list

  exit
end

# Command-line parsing. Each option stores its default in `options` just
# before it is declared, so the hash is fully populated even when no flag is
# given on the command line.
options = {}

optparse = OptionParser.new do |opts|
  # Set a banner, displayed at the top
  # of the help screen.
  opts.banner = "Usage: #{$0} -t template_file \{-Q fastaQ_file | -f fasta_file -q qual_file\} [options]"

  # Define the options, and what they do

  # options[:server_ip] = '127.0.0.1'
  options[:server_ip] = '0.0.0.0'
  opts.on('-s', '--server IP', 'Server ip. Can use a partial ip to select the apropriate interface') do |server_ip|
    # Pick the first local IPv4 address that starts with the given prefix;
    # fall back to 0.0.0.0 when none matches. Nothing is logged here because
    # $LOG is not assigned anywhere before the options are parsed.
    ip_list = Socket.ip_address_list.select(&:ipv4?).map(&:ip_address)
    options[:server_ip] = ip_list.find { |candidate| candidate.start_with?(server_ip) } || '0.0.0.0'
  end

  options[:port] = 0 # 50000
  opts.on('-p', '--port PORT', 'Server port. If set to 0, an arbitrary empty port will be used') do |port|
    options[:port] = port.to_i
  end

  options[:workers] = 2
  opts.on('-w', '--workers COUNT', 'Number of workers, or file containing machine names to launch workers with ssh') do |workers|
    if File.exist?(workers)
      # use workers file: one machine name per line
      options[:workers] = File.read(workers).split("\n").map(&:chomp)
    else
      begin
        options[:workers] = Integer(workers)
      rescue ArgumentError
        # Report the rejected value, not the default still stored in options.
        STDERR.puts "ERROR:Invalid workers parameter #{workers}"
        exit(-1)
      end
    end
  end

  options[:only_workers] = false
  opts.on('-o', '--only_workers', 'Only launch workers') do
    options[:only_workers] = true
  end

  options[:check_db] = false
  opts.on('-c', '--check_databases', 'Check Blast databases and reformat if necessary') do
    options[:check_db] = true
  end

  options[:use_checkpoint] = false
  opts.on('-C', '--use_checkpoint', 'Restore at checkpoint if scbi_mapreduce_checkpoint file is available') do
    options[:use_checkpoint] = true
  end

  # options[:skip_initial_stats] = false
  # opts.on( '-k', '--skip_initial_stats', 'Skip initial stats' ) do
  #   options[:skip_initial_stats] = true
  # end

  options[:install_db] = nil
  opts.on('-i', '--install_databases TYPE', 'Install base databases and reformat them if necessary') do |db_type|
    options[:install_db] = db_type
  end

  options[:logfile] = STDOUT
  opts.on('-l', '--logfile FILE', 'Write log to FILE') do |file|
    options[:logfile] = file
  end

  options[:fastq] = nil
  opts.on('-Q', '--fastq FILE1,FILE2', Array, 'Fastq input file. Use - for <STDIN>') do |file|
    options[:fastq] = file
    # NOTE(review): looks like leftover debug output -- confirm before removing.
    puts "FILES:", file, file.class
  end

  options[:format] = nil
  opts.on('-F', '--fastq_quality_format FORMAT', 'Fastq input quality format use sanger or illumina18 for phred+33 based scores. Use illumina15 for phred+64 based scores (default is sanger)') do |value|
    options[:format] = value

    unless ['sanger', 'illumina15', 'illumina18'].include?(value)
      STDERR.puts "ERROR: Invalid FASTQ format parameter #{value}"
      exit(-1)
    end
  end

  options[:fasta] = nil
  opts.on('-f', '--fasta FILE', 'Fasta input file') do |file|
    options[:fasta] = file
  end

  options[:qual] = nil
  opts.on('-q', '--qual FILE', 'Qual input file') do |file|
    options[:qual] = file
  end

  options[:list_db] = nil
  options[:list_db_name] = 'ALL'
  opts.on('-L', '--list_db [DB_NAME]', 'List entries IDs in DB_NAME. Use "-L all" to view all available databases') do |value|
    options[:list_db] = true
    # DB_NAME is optional; keep the 'ALL' default when it is omitted.
    options[:list_db_name] = value if value
  end

  options[:gen_params] = false
  opts.on('-G', '--generate_template', 'Generates a sample template file with default parameters') do
    options[:gen_params] = true
  end

  options[:template] = nil
  opts.on('-t', '--template TEMPLATE_FILE', 'Use TEMPLATE_FILE instead of default parameters') do |file|
    options[:template] = file
  end

  options[:chunk_size] = 5000
  opts.on('-g', '--group_size chunk_size', 'Group sequences in chunks of size <chunk_size>') do |cs|
    options[:chunk_size] = cs.to_i
  end

  options[:json] = nil
  opts.on('-j', '--json', 'Save results in json file') do
    options[:json] = true
  end

  options[:skip_output] = false
  opts.on('-K', '--no-verbose', 'Change to no verbose mode. Every sequence will not be written to output log') do
    options[:skip_output] = true
  end

  options[:skip_report] = false
  opts.on('-R', '--no-report', 'Do not generate final PDF report (gem scbi_seqtrimnext_report required if you want to generate PDF report).') do
    options[:skip_report] = true
  end

  options[:write_in_gzip] = false
  opts.on('-z', '--gzip', 'Generate output files in gzip format.') do
    options[:write_in_gzip] = true
  end

  # This displays the help screen, all programs are
  # assumed to have this option.
  opts.on_tail('-h', '--help', 'Display this screen') do
    puts opts
    # show_additional_help ends the process itself (it calls `exit`), so
    # nothing placed after this call would ever run.
    show_additional_help
  end
end

# parse options and remove them from ARGV
optparse.parse!
# Main flow: handle the "do one thing and quit" options, set up logging,
# validate databases and input files, run Seqtrim and optionally build the
# PDF report.

# Standard libraries used below (FileUtils.mkdir, Hash#to_json). Requiring
# them here is idempotent and keeps this section from depending on another
# library having loaded them first.
require 'fileutils'
require 'json'

# NOTE(review): the two shortcuts below exit with status -1 even though the
# requested action ran; kept as-is because callers may rely on it.
if options[:list_db]
  # List database entries in a database
  ListDb.new($DB_PATH, options[:list_db_name])
  exit(-1)
end

if options[:gen_params]
  # Generates a sample params file in current directory
  Params.generate_sample_params
  exit(-1)
end

# set logger
# system('rm logs/*')
FileUtils.mkdir('logs') unless File.exist?('logs')

$LOG = Logger.new(options[:logfile])
$LOG.datetime_format = "%Y-%m-%d %H:%M:%S"

# logger.level = Logger::INFO
# DEBUG < INFO < WARN < ERROR < FATAL < UNKNOWN

$LOG.info("SeqTrimNext version #{Seqtrimnext::SEQTRIM_VERSION}")
$LOG.info("STN command: #{$0}")
$LOG.info("Using BLASTDB: " + $FORMATTED_DB_PATH)
$LOG.info("Using options: " + options.to_json)

if options[:install_db]
  # install databases
  InstallDatabase.new(options[:install_db], $DB_PATH)

  # reformat databases
  MakeBlastDb.new($DB_PATH)
  exit
end

unless File.exist?($FORMATTED_DB_PATH)
  STDERR.puts "Database path not found: #{$FORMATTED_DB_PATH}. \n\n\nInstall databases to this path or set your BLASTDB environment variable (eg.: export BLASTDB=new_path)"
  exit(-1)
end

if options[:check_db]
  # check and format blast databases
  MakeBlastDb.new($DB_PATH)
  exit
end

# A template plus at least one sequence input (fastq or fasta) is mandatory.
required_options = options[:template] && (options[:fastq] || options[:fasta])

# Reject leftover positional arguments as well as missing required options.
if !ARGV.empty? || !required_options
  puts "You must provide all required options"
  puts ""
  puts optparse.help
  exit(-1)
end

# check for template: accept a path as given, or the name of a template
# bundled in the installation's 'templates' directory
unless File.exist?(options[:template])
  bundled_template = File.join($SEQTRIM_PATH, 'templates', options[:template])

  if File.exist?(bundled_template)
    options[:template] = bundled_template
  else
    $LOG.info "Params file: #{options[:template]} doesn't exists. \n\nYou can use your own template or specify one from this list:\n============================="
    puts Dir.glob(File.join($SEQTRIM_PATH, 'templates', '*.txt')).map { |t| File.basename(t) }
    exit(-1)
  end
end

$LOG.info "Using init file: #{$SEQTRIMNEXT_INIT}"
$LOG.info "Using params file: #{options[:template]}"

# check file existence

# fastq files ('-' is exempt from the existence check)
if options[:fastq]
  options[:fastq].each do |fastq_file|
    next if fastq_file.nil? || fastq_file == '-'
    next if File.exist?(File.expand_path(fastq_file))

    $LOG.error "Input file: #{fastq_file} doesn't exists"
    exit(-1)
  end
end

# fasta file
if !options[:fasta].nil? && !File.exist?(options[:fasta])
  $LOG.error "Input file: #{options[:fasta]} doesn't exists"
  exit(-1)
end

# qual file
if !options[:qual].nil? && !File.exist?(options[:qual])
  $LOG.error "Input file: #{options[:qual]} doesn't exists"
  exit(-1)
end

# Constructing Seqtrim starts the run; the instance is not used afterwards
# in this script.
Seqtrim.new(options)

# generate report
if !options[:skip_report] && system("which generate_report.rb > /dev/null ")
  cmd = "generate_report.rb output_files 2> report_generation_errors.log"
  $LOG.info "Generating report #{cmd}"
  `#{cmd}`
else
  skip_text = options[:skip_report] ? ' and remove the -R option from the command line.' : '.'
  $LOG.info "If you want a detailed report in PDF format, be sure you have installed the optional seqtrimnext_report gem (gem install seqtrimnext_report)#{skip_text}"
end

exit(Seqtrim.exit_status)