#!/usr/bin/env ruby
# encoding: utf-8
# SeqTrimNext: Next generation sequencing preprocessor
# Copyright (C) <2011>
# Authors: Almudena Bocinos Rioboo, Diego Dario Guerrero Fernandez,
# Rocio Bautista Moreno, Juan Falgueras Cano & M. Gonzalo Claros
# email: soporte@scbi.uma.es - http://www.scbi.uma.es
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#= SEQTRIM II
#
#== Running
#
# Seqtrim can be run locally or in a parallel/distributed environment.
#
#=== Running locally
#* list
#
#=== Running in a distributed environment
#
#== SEC 2
#
#=== SUB 2.1
#
# #finds the classes that were in the folder 'classes'
# ROOT_PATH=File.dirname(__FILE__)
# $: << File.expand_path(File.join(ROOT_PATH, 'classes'))
#
# #finds the classes that were in the folder 'plugins'
# $: << File.expand_path(File.join(ROOT_PATH, 'plugins'))
#
#
# #finds the classes that were in the folder 'actions'
# $: << File.expand_path(File.join(ROOT_PATH, 'actions'))
#
# #finds the classes that were in the folder 'utils'
# $: << File.expand_path(File.join(ROOT_PATH, 'utils'))
#
# $: << File.expand_path(File.join(ROOT_PATH, 'classes','em_classes'))
# to test scbi_drb gem locally
# $: << File.expand_path('~/progs/ruby/gems/scbi_drb/lib/')
# $: << File.expand_path(ROOT_PATH)
$: << File.expand_path('~/progs/ruby/gems/seqtrimnext/lib/')
# $: << File.expand_path('~/progs/ruby/gems/scbi_mapreduce/lib/')
require 'seqtrimnext'
require 'scbi_headers'
# Prints the program banner to standard output.
#
# The banner is a ScbiHeader built from the program name, the version taken
# from Seqtrimnext::SEQTRIM_VERSION, a description, the copyright year and
# the list of authors.
def put_header
  banner = ScbiHeader.new('SeqTrimNEXT', Seqtrimnext::SEQTRIM_VERSION)

  banner.description = "SeqtrimNEXT is a customizable and distributed pre-processing software for NGS (Next Generation Sequencing) biological data. It makes use of scbi_mapreduce gem to be able to run in parallel and distributed environments. It is specially suited for Roche 454 (normal and paired-end) & Ilumina datasets, although it could be easyly adapted to any other situation."
  banner.copyright = '2011'

  # Authors are appended one by one, in this order.
  [
    "Darío Guerrero",
    "Almudena Bocinos",
    "Rocío Bautista",
    "Noé Fernández",
    "Juan Falgueras",
    "M. Gonzalo Claros"
  ].each { |author| banner.authors << author }

  # Articles can be listed in the same way, e.g.:
  #   banner.articles << "Article one: with one description line"

  # Output the header
  puts banner
end

# Show the banner as soon as the script starts.
put_header
############ PATHS #######################
# Global path configuration used by the rest of the script.
# NOTE(review): ROOT_PATH is not defined in this section; presumably it is
# provided by the 'seqtrimnext' library required above - confirm.
$SEQTRIM_PATH = ROOT_PATH

# Init file: use the SEQTRIMNEXT_INIT environment variable when it points to
# an existing path, otherwise fall back to 'init_env' inside $SEQTRIM_PATH.
# File.exist? is used because File.exists? was removed in Ruby 3.2.
$SEQTRIMNEXT_INIT =
  if ENV['SEQTRIMNEXT_INIT'] && File.exist?(ENV['SEQTRIMNEXT_INIT'])
    File.expand_path(ENV['SEQTRIMNEXT_INIT'])
  else
    File.join($SEQTRIM_PATH, 'init_env')
  end

# Database locations: if there is a BLASTDB environment variable, use it as
# the formatted database path and take its parent directory as the database
# path; otherwise use ROOT_PATH/DB/formatted and ROOT_PATH/DB.
if ENV['BLASTDB']
  $FORMATTED_DB_PATH = ENV['BLASTDB']
  $DB_PATH = File.dirname($FORMATTED_DB_PATH)
else
  $FORMATTED_DB_PATH = File.expand_path(File.join(ROOT_PATH, "DB", 'formatted'))
  $DB_PATH = File.expand_path(File.join(ROOT_PATH, "DB"))
end

# Export the effective value back to the environment of this process.
ENV['BLASTDB'] = $FORMATTED_DB_PATH

OUTPUT_PATH = 'output_files'
# TODO - Comment every class and method so that their description shows up
# when rdoc is run in the terminal.

# Check the installation requirements and stop right away if they are not met.
require 'install_requirements'
ins = InstallRequirements.new
exit unless ins.check_install_requirements
require "logger"
require 'optparse'
require "global_match"
require "seqtrim"
require "params.rb"
require "plugin.rb"
require "sequence.rb"
require "plugin_manager.rb"
require "make_blast_db"
require 'hash_stats'
require 'list_db'
require 'install_database'
require 'socket'
# Prints usage examples, the template files found under
# $SEQTRIM_PATH/templates and the databases reported by
# ListDb.list_databases($DB_PATH), then terminates the program.
def show_additional_help
  templates_dir = File.join($SEQTRIM_PATH, 'templates')

  # Usage examples
  puts "\n" * 3
  puts "E.g.: processing a fastq sequences file"
  puts "#{$0} -t genomics_454.txt -Q sequences.fastq"
  puts "\n" * 2
  puts "E.g.: processing a fasta file with qual"
  puts "#{$0} -t genomics_454.txt -f sequences.fasta -q sequences.qual"

  # Templates section
  template_names = Dir.glob(File.join(templates_dir, '*.txt')).map { |path| File.basename(path) }
  puts "\n\n ========================================================================================================"
  puts " Available templates to use with -t option (you can also use your own template):"
  puts " Templates at: #{templates_dir}"
  puts " ========================================================================================================\n\n"
  template_names.each { |template_name| puts " " + template_name }

  # Databases section
  puts "\n\n ========================================================================================================"
  puts " Available databases to use in custom template files (you can also use your own database):"
  puts " Databases at: #{$DB_PATH}"
  puts " ========================================================================================================\n\n"
  ListDb.list_databases($DB_PATH).each { |database| puts " " + database }

  exit
end
# Command-line option definitions.
#
# `options` holds one entry per switch. Each default is assigned right before
# the switch is declared, and the switch handler overwrites it when `optparse`
# later parses the command line (ARGV).
options = {}
optparse = OptionParser.new do |opts|
  # Banner displayed at the top of the help screen.
  opts.banner = "Usage: #{$0} -t template_file \{-Q fastaQ_file | -f fasta_file -q qual_file\} [options]"

  # --- Server / workers -----------------------------------------------------
  options[:server_ip] = '0.0.0.0'
  opts.on('-s', '--server IP', 'Server ip. Can use a partial ip to select the apropriate interface') do |server_ip|
    # Use the first local IPv4 address that starts with the given (possibly
    # partial) ip; fall back to 0.0.0.0 when no address matches.
    local_ips = Socket.ip_address_list.select { |addr| addr.ipv4? }.map { |addr| addr.ip_address }
    matching_ip = local_ips.find { |candidate| candidate.start_with?(server_ip) }
    options[:server_ip] = matching_ip || '0.0.0.0'
  end

  options[:port] = 0 # 50000
  opts.on('-p', '--port PORT', 'Server port. If set to 0, an arbitrary empty port will be used') do |port|
    options[:port] = port.to_i
  end

  options[:workers] = 2
  opts.on('-w', '--workers COUNT', 'Number of workers, or file containing machine names to launch workers with ssh') do |workers|
    if File.exist?(workers)
      # The argument names an existing file: one machine name per line.
      options[:workers] = File.read(workers).split("\n").map { |w| w.chomp }
    else
      begin
        options[:workers] = Integer(workers)
      rescue ArgumentError
        # Report the rejected value itself (not the current default).
        STDERR.puts "ERROR:Invalid workers parameter #{workers}"
        exit
      end
    end
  end

  options[:only_workers] = false
  opts.on('-o', '--only_workers', 'Only launch workers') do
    options[:only_workers] = true
  end

  # --- Databases / checkpoint ----------------------------------------------
  options[:check_db] = false
  opts.on('-c', '--check_databases', 'Check Blast databases and reformat if necessary') do
    options[:check_db] = true
  end

  options[:use_checkpoint] = false
  opts.on('-C', '--use_checkpoint', 'Restore at checkpoint if scbi_mapreduce_checkpoint file is available') do
    options[:use_checkpoint] = true
  end

  options[:install_db] = nil
  opts.on('-i', '--install_databases TYPE', 'Install base databases and reformat them if necessary') do |db_type|
    options[:install_db] = db_type
  end

  # --- Logging --------------------------------------------------------------
  options[:logfile] = STDOUT
  opts.on('-l', '--logfile FILE', 'Write log to FILE') do |file|
    options[:logfile] = file
  end

  # --- Input files ----------------------------------------------------------
  options[:fastq] = nil
  opts.on('-Q', '--fastq FILE1,FILE2', Array, 'Fastq input file. Use - for ') do |file|
    # `file` is an Array because of the Array coercion declared above.
    options[:fastq] = file
    # NOTE(review): looks like leftover debug output - confirm before removing.
    puts "FILES:", file, file.class
  end

  options[:format] = nil
  opts.on('-F', '--fastq_quality_format FORMAT', 'Fastq input quality format use sanger or illumina18 for phred+33 based scores. Use illumina15 for phred+64 based scores (default is sanger) file. Use - for ') do |value|
    options[:format] = value
    unless ['sanger', 'illumina15', 'illumina18'].include?(value)
      STDERR.puts "ERROR: Invalid FASTQ format parameter #{value}"
      exit
    end
  end

  options[:fasta] = nil
  opts.on('-f', '--fasta FILE', 'Fasta input file') do |file|
    options[:fasta] = file
  end

  options[:qual] = nil
  opts.on('-q', '--qual FILE', 'Qual input file') do |file|
    options[:qual] = file
  end

  # --- Listing / templates --------------------------------------------------
  options[:list_db] = nil
  options[:list_db_name] = 'ALL'
  opts.on('-L', '--list_db [DB_NAME]', 'List entries IDs in DB_NAME. Use "-L all" to view all available databases') do |value|
    options[:list_db] = true
    # DB_NAME is optional; keep the 'ALL' default when it is omitted.
    options[:list_db_name] = value if value
  end

  options[:gen_params] = false
  opts.on('-G', '--generate_template', 'Generates a sample template file with default parameters') do
    options[:gen_params] = true
  end

  options[:template] = nil
  opts.on('-t', '--template TEMPLATE_FILE', 'Use TEMPLATE_FILE instead of default parameters') do |file|
    options[:template] = file
  end

  options[:chunk_size] = 100
  opts.on('-g', '--group_size chunk_size', 'Group sequences in chunks of size ') do |cs|
    options[:chunk_size] = cs.to_i
  end

  # --- Output ---------------------------------------------------------------
  options[:json] = nil
  opts.on('-j', '--json', 'Save results in json file') do
    options[:json] = true
  end

  options[:skip_output] = false
  opts.on('-K', '--no-verbose', 'Change to no verbose mode. Every sequence will not be written to output log') do
    options[:skip_output] = true
  end

  options[:skip_report] = false
  opts.on('-R', '--no-report', 'Do not generate final PDF report (gem scbi_seqtrimnext_report required if you want to generate PDF report).') do
    options[:skip_report] = true
  end

  options[:write_in_gzip] = false
  opts.on('-z', '--gzip', 'Generate output files in gzip format.') do
    options[:write_in_gzip] = true
  end

  # Help screen: prints the option summary followed by the additional help
  # (show_additional_help terminates the program itself).
  opts.on_tail('-h', '--help', 'Display this screen') do
    puts opts
    show_additional_help
    exit
  end
end
# Main flow of the launcher: parse the command line, handle the informational
# and maintenance switches, validate the inputs, run Seqtrim and finally try
# to generate the report.

# Parse options and remove them from ARGV.
optparse.parse!

if options[:list_db]
  # List database entries in a database
  ListDb.new($DB_PATH, options[:list_db_name])
  exit
end

if options[:gen_params]
  # Generates a sample params file in current directory
  Params.generate_sample_params
  exit
end

# Set up the logger.
# Dir.mkdir is part of Ruby core, so no extra library is needed to create the
# 'logs' directory; File.exist? replaces File.exists?, removed in Ruby 3.2.
Dir.mkdir('logs') unless File.exist?('logs')
$LOG = Logger.new(options[:logfile])
$LOG.datetime_format = "%Y-%m-%d %H:%M:%S"
# Logger levels: DEBUG < INFO < WARN < ERROR < FATAL < UNKNOWN
$LOG.info("SeqTrimNext version #{Seqtrimnext::SEQTRIM_VERSION}")
$LOG.info("Using BLASTDB: " + $FORMATTED_DB_PATH)
$LOG.info("Using options: " + options.to_json)

if options[:install_db]
  # install databases
  InstallDatabase.new(options[:install_db], $DB_PATH)
  # reformat databases
  MakeBlastDb.new($DB_PATH)
  exit
end

unless File.exist?($FORMATTED_DB_PATH)
  STDERR.puts "Database path not found: #{$FORMATTED_DB_PATH}. \n\n\nInstall databases to this path or set your BLASTDB environment variable (eg.: export BLASTDB=new_path)"
  exit
end

if options[:check_db]
  # check and format blast databases
  MakeBlastDb.new($DB_PATH)
  exit
end

# A template plus either a fastq or a fasta input is mandatory.
required_options = options[:template] && (options[:fastq] || options[:fasta])

# Any argument still left in ARGV after parsing is unexpected.
if !ARGV.empty? || !required_options
  puts "You must provide all required options"
  puts ""
  puts optparse.help
  exit
end

# Check for the template: accept a path as given, otherwise look it up by
# name in the templates folder under $SEQTRIM_PATH.
unless File.exist?(options[:template])
  bundled_template = File.join($SEQTRIM_PATH, 'templates', options[:template])
  if File.exist?(bundled_template)
    options[:template] = bundled_template
  else
    $LOG.info "Params file: #{options[:template]} doesn't exists. \n\nYou can use your own template or specify one from this list:\n============================="
    puts Dir.glob(File.join($SEQTRIM_PATH, 'templates', '*.txt')).map { |t| File.basename(t) }
    exit
  end
end

$LOG.info "Using init file: #{$SEQTRIMNEXT_INIT}"
$LOG.info "Using params file: #{options[:template]}"

# Check input file existence.
if options[:fastq]
  options[:fastq].each do |fastq_file|
    # '-' is accepted without an existence check.
    if !fastq_file.nil? && fastq_file != '-' && !File.exist?(File.expand_path(fastq_file))
      $LOG.error "Input file: #{fastq_file} doesn't exists"
      exit
    end
  end
end

# fasta file
if !options[:fasta].nil? && !File.exist?(options[:fasta])
  $LOG.error "Input file: #{options[:fasta]} doesn't exists"
  exit
end

# qual file
if !options[:qual].nil? && !File.exist?(options[:qual])
  $LOG.error "Input file: #{options[:qual]} doesn't exists"
  exit
end

# Run the processing with the collected options.
Seqtrim.new(options)

# Generate the report when it was not disabled and generate_report.rb is
# found on the PATH; otherwise log a hint about how to get it.
if !options[:skip_report] && system("which generate_report.rb > /dev/null ")
  cmd = "generate_report.rb output_files 2> report_generation_errors.log"
  $LOG.info "Generating report #{cmd}"
  `#{cmd}`
else
  skip_text = '.'
  if options[:skip_report]
    skip_text = ' and remove the -R option from the command line.'
  end
  $LOG.info "If you want a detailed report in PDF format, be sure you have installed the optional seqtrimnext_report gem (gem install seqtrimnext_report)#{skip_text}"
end