#!/usr/bin/env ruby
#
# BioRuby bio-blastxmlparser Plugin
# Author:: Pjotr Prins
# Copyright:: 2011
# License:: MIT License
#
# Copyright (C) 2010,2011 Pjotr Prins <pjotr.prins@thebird.nl>

rootpath = File.dirname(File.dirname(__FILE__))
$: << File.join(rootpath,'lib')

BLASTXML_VERSION = File.new(File.join(rootpath,'VERSION')).read.chomp

$stderr.print "BioRuby BLAST XML Parser "+BLASTXML_VERSION+" Copyright (C) 2011 Pjotr Prins <pjotr.prins@thebird.nl>\n\n"

USAGE = <<EOM

  bioblastxmlparser filename(s)

    Use --help switch for more information

== Examples

Print result fields of iterations containing 'lcl', using a regex

  blastxmlparser -e 'iter.query_id=~/lcl/' test/data/nt_example_blastn.m7

Print fields where bit_score > 145

  blastxmlparser -e 'hsp.bit_score>145' test/data/nt_example_blastn.m7

prints a tab delimited

  1       1       lcl|1_0 lcl|I_74685     1       5.82208e-34
  2       1       lcl|1_0 lcl|I_1 1       5.82208e-34
  3       2       lcl|2_0 lcl|I_2 1       6.05436e-59
  4       3       lcl|3_0 lcl|I_3 1       2.03876e-56

The second and third column show the BLAST iteration, and the others
relate to the hits.

As this is evaluated Ruby, it is also possible to use the XML element
names directly

  blastxmlparser -e 'hsp["Hsp_bit-score"].to_i>145' test/data/nt_example_blastn.m7

And it is possible to print (non default) named fields where E-value < 0.001 
and hit length > 100. E.g.

  blastxmlparser -n 'hsp.evalue,hsp.qseq' -e 'hsp.evalue<0.01 and hit.len>100' test/data/nt_example_blastn.m7

  1       5.82208e-34     AGTGAAGCTTCTAGATATTTGGCGGGTACCTCTAATTTTGCCT...
  2       5.82208e-34     AGTGAAGCTTCTAGATATTTGGCGGGTACCTCTAATTTTGCCT...
  3       2.76378e-11     AATATGGTAGCTACAGAAACGGTAGTACACTCTTC     
  4       1.13373e-13     CTAAACACAGGAGCATATAGGTTGGCAGGCAGGCAAAAT 
  5       2.76378e-11     GAAGAGTGTACTACCGTTTCTGTAGCTACCATATT     
  etc. etc.

prints the evalue and qseq columns. To output FASTA use --output-fasta

  blastxmlparser --output-fasta -e 'hsp.evalue<0.01 and hit.len>100' test/data/nt_example_blastn.m7

which prints matching sequences, where the first field is the accession, followed
by query iteration id, and hit_id. E.g.

  >I_74685 1|lcl|1_0 lcl|I_74685 [57809 - 57666] (REVERSE SENSE) 
  AGTGAAGCTTCTAGATATTTGGCGGGTACCTCTAATTTTGCCTGCCTGCCAACCTATATGCTCCTGTGTTTAG
  >I_1 1|lcl|1_0 lcl|I_1 [477 - 884] 
  AGTGAAGCTTCTAGATATTTGGCGGGTACCTCTAATTTTGCCTGCCTGCCAACCTATATGCTCCTGTGTTTAG
  etc. etc.

To use the low-mem (iterated slower) version of the parser use

  blastxmlparser --parser split -n 'hsp.evalue,hsp.qseq' -e 'hsp.evalue<0.01 and hit.len>100' test/data/nt_example_blastn.m7

== URL

The project lives at http://github.com/pjotrp/blastxmlparser. If you use this software, please cite http://dx.doi.org/10.1093/bioinformatics/btq475

== Copyright

Copyright (c) 2011 Pjotr Prins under the MIT licence.  See LICENSE.txt and http://www.opensource.org/licenses/mit-license.html for further details.

EOM

if ARGV.size == 0
  print USAGE
  exit 1
end

require 'bio-blastxmlparser'
require 'optparse'
require 'ostruct'

require 'bio-logger'
Bio::Log::CLI.logger('stderr')
Bio::Log::CLI.trace('info')

options = OpenStruct.new()

opts = OptionParser.new do |o|

  o.on_tail("-h", "--help", "Show help and examples") {
    print(opts)
    print USAGE
    exit()
  }

  o.banner = "== Usage\n  #{File.basename($0)} [options] file(s)"

  o.separator ""

  o.on("-p name", "--parser name", "Use full|split parser (default full)") do |p|
    options.parser = p.to_sym
  end
  
  o.on("--output-fasta","Output FASTA") do |b|
    options.output_fasta = true
  end
  
  o.on("-n fields","--named fields",String, "Set named fields") do |s|
    options.fields = s.split(/,/)
  end
  
  o.on("-e filter","--exec filter",String, "Execute filter") do |s|
    options.exec = s
  end
  
  o.separator ""

  o.on("--logger filename",String,"Log to file (default stderr)") do | name |
    Bio::Log::CLI.logger(name)
  end
 
  o.on("--trace options",String,"Set log level (default INFO, see bio-logger)") do | s |
    Bio::Log::CLI.trace(s)
  end
  
  o.on("-q", "--quiet", "Run quietly") do |q|
    Bio::Log::CLI.trace('error')
  end
  
  o.on("-v", "--verbose", "Run verbosely") do |v|
    Bio::Log::CLI.trace('info')
  end
  
  o.on("--debug", "Show debug messages") do |v|
    Bio::Log::CLI.trace('debug')
  end

end

begin
  opts.parse!(ARGV)

  Bio::Log::CLI.configure('bio-blastxmlparser')
  logger = Bio::Log::LoggerPlus['bio-blastxmlparser']

  ARGV.each do | fn |
    logger.info("XML parsing #{fn}")
    n = if options.parser == :split
      Bio::Blast::XmlSplitterIterator.new(fn).to_enum
    else
      Bio::Blast::XmlIterator.new(fn).to_enum
    end
    i = 1
    n.each do | iter |
      iter.each do | hit |
        hit.each do | hsp |
          do_print = if options.exec
                       eval(options.exec)
                     else
                       true
                     end
          if do_print
            if options.output_fasta
              print ">"+hit.accession+' '+iter.iter_num.to_s+'|'+iter.query_id+' '+hit.hit_id+' '+hit.hit_def+"\n"
              print hsp.qseq+"\n"
            else
              if options.fields
                print i,"\t"
                options.fields.each do | f |
                  print eval(f),"\t"
                end
                print "\n"
                else
                  print [i,iter.iter_num,iter.query_id,hit.hit_id,hsp.hsp_num,hsp.evalue].join("\t"),"\n" 
              end
            end
            i += 1
          end
        end
      end
    end
  end
rescue OptionParser::InvalidOption => e
  opts[:invalid_argument] = e.message
end