#!/usr/bin/env ruby # # BioRuby bio-blastxmlparser Plugin # Author:: Pjotr Prins # Copyright:: 2011 # License:: MIT License # # Copyright (C) 2010,2011 Pjotr Prins <pjotr.prins@thebird.nl> rootpath = File.dirname(File.dirname(__FILE__)) $: << File.join(rootpath,'lib') BLASTXML_VERSION = File.new(File.join(rootpath,'VERSION')).read.chomp $stderr.print "BioRuby BLAST XML Parser "+BLASTXML_VERSION+" Copyright (C) 2011 Pjotr Prins <pjotr.prins@thebird.nl>\n\n" USAGE = <<EOM bioblastxmlparser filename(s) Use --help switch for more information == Examples Print result fields of iterations containing 'lcl', using a regex blastxmlparser -e 'iter.query_id=~/lcl/' test/data/nt_example_blastn.m7 Print fields where bit_score > 145 blastxmlparser -e 'hsp.bit_score>145' test/data/nt_example_blastn.m7 prints a tab delimited 1 1 lcl|1_0 lcl|I_74685 1 5.82208e-34 2 1 lcl|1_0 lcl|I_1 1 5.82208e-34 3 2 lcl|2_0 lcl|I_2 1 6.05436e-59 4 3 lcl|3_0 lcl|I_3 1 2.03876e-56 The second and third column show the BLAST iteration, and the others relate to the hits. As this is evaluated Ruby, it is also possible to use the XML element names directly blastxmlparser -e 'hsp["Hsp_bit-score"].to_i>145' test/data/nt_example_blastn.m7 And it is possible to print (non default) named fields where E-value < 0.001 and hit length > 100. E.g. blastxmlparser -n 'hsp.evalue,hsp.qseq' -e 'hsp.evalue<0.01 and hit.len>100' test/data/nt_example_blastn.m7 1 5.82208e-34 AGTGAAGCTTCTAGATATTTGGCGGGTACCTCTAATTTTGCCT... 2 5.82208e-34 AGTGAAGCTTCTAGATATTTGGCGGGTACCTCTAATTTTGCCT... 3 2.76378e-11 AATATGGTAGCTACAGAAACGGTAGTACACTCTTC 4 1.13373e-13 CTAAACACAGGAGCATATAGGTTGGCAGGCAGGCAAAAT 5 2.76378e-11 GAAGAGTGTACTACCGTTTCTGTAGCTACCATATT etc. etc. prints the evalue and qseq columns. To output FASTA use --output-fasta blastxmlparser --output-fasta -e 'hsp.evalue<0.01 and hit.len>100' test/data/nt_example_blastn.m7 which prints matching sequences, where the first field is the accession, followed by query iteration id, and hit_id. E.g. >I_74685 1|lcl|1_0 lcl|I_74685 [57809 - 57666] (REVERSE SENSE) AGTGAAGCTTCTAGATATTTGGCGGGTACCTCTAATTTTGCCTGCCTGCCAACCTATATGCTCCTGTGTTTAG >I_1 1|lcl|1_0 lcl|I_1 [477 - 884] AGTGAAGCTTCTAGATATTTGGCGGGTACCTCTAATTTTGCCTGCCTGCCAACCTATATGCTCCTGTGTTTAG etc. etc. To use the low-mem (iterated slower) version of the parser use blastxmlparser --parser split -n 'hsp.evalue,hsp.qseq' -e 'hsp.evalue<0.01 and hit.len>100' test/data/nt_example_blastn.m7 == URL The project lives at http://github.com/pjotrp/blastxmlparser. If you use this software, please cite http://dx.doi.org/10.1093/bioinformatics/btq475 == Copyright Copyright (c) 2011 Pjotr Prins under the MIT licence. See LICENSE.txt and http://www.opensource.org/licenses/mit-license.html for further details. EOM if ARGV.size == 0 print USAGE exit 1 end require 'bio-blastxmlparser' require 'optparse' require 'ostruct' require 'bio-logger' Bio::Log::CLI.logger('stderr') Bio::Log::CLI.trace('info') options = OpenStruct.new() opts = OptionParser.new do |o| o.on_tail("-h", "--help", "Show help and examples") { print(opts) print USAGE exit() } o.banner = "== Usage\n #{File.basename($0)} [options] file(s)" o.separator "" o.on("-p name", "--parser name", "Use full|split parser (default full)") do |p| options.parser = p.to_sym end o.on("--output-fasta","Output FASTA") do |b| options.output_fasta = true end o.on("-n fields","--named fields",String, "Set named fields") do |s| options.fields = s.split(/,/) end o.on("-e filter","--exec filter",String, "Execute filter") do |s| options.exec = s end o.separator "" o.on("--logger filename",String,"Log to file (default stderr)") do | name | Bio::Log::CLI.logger(name) end o.on("--trace options",String,"Set log level (default INFO, see bio-logger)") do | s | Bio::Log::CLI.trace(s) end o.on("-q", "--quiet", "Run quietly") do |q| Bio::Log::CLI.trace('error') end o.on("-v", "--verbose", "Run verbosely") do |v| Bio::Log::CLI.trace('info') end o.on("--debug", "Show debug messages") do |v| Bio::Log::CLI.trace('debug') end end begin opts.parse!(ARGV) Bio::Log::CLI.configure('bio-blastxmlparser') logger = Bio::Log::LoggerPlus['bio-blastxmlparser'] ARGV.each do | fn | logger.info("XML parsing #{fn}") n = if options.parser == :split Bio::Blast::XmlSplitterIterator.new(fn).to_enum else Bio::Blast::XmlIterator.new(fn).to_enum end i = 1 n.each do | iter | iter.each do | hit | hit.each do | hsp | do_print = if options.exec eval(options.exec) else true end if do_print if options.output_fasta print ">"+hit.accession+' '+iter.iter_num.to_s+'|'+iter.query_id+' '+hit.hit_id+' '+hit.hit_def+"\n" print hsp.qseq+"\n" else if options.fields print i,"\t" options.fields.each do | f | print eval(f),"\t" end print "\n" else print [i,iter.iter_num,iter.query_id,hit.hit_id,hsp.hsp_num,hsp.evalue].join("\t"),"\n" end end i += 1 end end end end end rescue OptionParser::InvalidOption => e opts[:invalid_argument] = e.message end