#!/usr/bin/env ruby require 'rubygems' require 'trollop' SCRIPT_NAME = "phyta-assign" #parse command line arguments opts = Trollop::options do opt :input_file, "The output of the BLASTplus alignment in XML format", :type => String opt :output_file, "The name of the output table in CSV format", :type => String opt :database_server, "Optional: The address of the MySQL database server", :type => String, :default => "localhost" opt :database_user, "Optional: The name of the database user", :type => String, :default => "root", :short => "-u" opt :database_password, "Optional: The password of the database user", :type => String, :default => "no password", :short => "-p" opt :database_name, "Optional: The name of the NCBI taxonomy database", :type => String, :default => "kingdom_assignment_taxonomy", :short => "-n" opt :filter, "A file in YAML format containing a list of taxa to be considered contaminants", :type => String, :default => "Use builtin filter capturing Bacteria, Archaea, Viruses and NONE. To learn how to write your own filters, visit https://github.com/PalMuc/bio-phyta/wiki/Custom-filters ", :short => "-f" end unless opts[:input_file_given] && opts[:output_file_given] puts "Invalid arguments, see --help for more information." abort end unless opts[:database_password_given] opts[:database_password] = nil end #Use the correct database connector if RUBY_PLATFORM =~ /java/ puts "You are running JRuby, the jdbc/mysql database connector will be used." require 'jdbc/mysql' else require 'mysql' end require 'sequel' require 'nokogiri' require 'bio' require 'yaml' require 'csv' if CSV.const_defined? :Reader require 'fastercsv' INSTALLED_CSV = FasterCSV else INSTALLED_CSV = CSV end $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib')) $LOAD_PATH.unshift(File.dirname(__FILE__)) require 'kingdom_db' require 'blast_string_parser' rootpath = File.dirname(File.dirname(__FILE__)) PHYTA_VERSION = File.new(File.join(rootpath,'VERSION')).read.chomp puts "Running #{SCRIPT_NAME} #{PHYTA_VERSION}" puts "Settings: " + opts.inspect filter_array = nil if opts[:filter_given] begin filter_array = YAML::load(File.open(opts[:filter], 'r')) rescue Exception => e puts "Error: #{e.message}" puts e.backtrace.join("\n") puts "Please see https://github.com/PalMuc/bio-phyta/wiki/Custom-filters for instructions on how to write filters" abort end unless filter_array.is_a? Array puts "Error: Invalid filter format.\nPlease see https://github.com/PalMuc/bio-phyta/wiki/Custom-filters for instructions on how to write filters" abort end else filter_array = KingdomDB::DEFAULT_FILTER end #Initialize auxiliary classes blast_parser = BlastStringParser.new() #Open input file unless File.exists?(opts[:input_file]) puts "The input file at " + File.expand_path(opts[:input_file]) + " could not be opened!" exit end file = File.new(opts[:input_file]) reader = Nokogiri::XML::Reader(file) #Initialize database db = KingdomDB.new(opts[:database_server], opts[:database_user], opts[:database_password], opts[:database_name]) #Initialize output file if File.exists?(opts[:output_file]) puts "The output file at " + File.expand_path(opts[:output_file]) + " already exists!" exit end output = INSTALLED_CSV.open(opts[:output_file], "w", { :col_sep => ";", :headers => ["query sequence id", "hit accession number", "sgi", "evalue", "species", "subject annotation", "subject score", "kingdom"], :write_headers => true}) filter_hash = db.get_filter(filter_array) current_query = "" hit_id = "" hit_def = "" hit_accession = "" hsp_evalue = "" subject_score = "" kingdom = "" #Go through the XML with a pull-parser reader.each do |elem| if elem.name == "Iteration_query-def"&& elem.node_type == Nokogiri::XML::Node::ELEMENT_NODE #We are at the beginning of an iteration current_query = elem.inner_xml elsif elem.name == "Hit" && elem.node_type == Nokogiri::XML::Node::ELEMENT_NODE #We are at the beginning of a Hit #Load the node representing this hit into memory and extract required information hit = Nokogiri::XML(elem.outer_xml) hit_id = hit.xpath("//Hit_id").inner_text hit_def = hit.xpath("//Hit_def").inner_text hit_accession = hit.xpath("//Hit_accession").inner_text hsp_evalue = hit.xpath("//Hsp[1]/Hsp_evalue").inner_text.to_f #Yep, the first element really has number 1 subject_score = hit.xpath("//Hsp[1]/Hsp_bit-score").inner_text.to_f #Set the subject score to be the bit-score of the first HSP subject_gi = blast_parser.get_sgi_info(hit_id) species_name = nil begin species_name = blast_parser.get_species_name(hit_def) rescue RuntimeError puts "Can not find " + hit_def.to_s[0..20] + "..." begin species_name = db.name_from_gi(subject_gi) rescue RuntimeError puts "ERROR: Could not find gi " + subject_gi.to_s end end begin kingdom = db.match_filter(species_name, filter_hash) rescue RuntimeError kingdom = "NOT FOUND" end if kingdom.nil? kingdom = "NONE" end output << [blast_parser.get_query_seq(current_query), hit_accession, subject_gi, hsp_evalue, species_name, blast_parser.get_subject_annotation(hit_def), subject_score, kingdom] end end output.close puts "Parsing finished!"