require 'ox' Ox.default_options = { skip: :skip_none } require 'sequenceserver/report' require 'sequenceserver/links' require_relative 'formatter' require_relative 'query' require_relative 'hit' require_relative 'hsp' module SequenceServer module BLAST # Captures results of a BLAST search. # # A report is constructed from a search id. Search id is simply the # basename of the temporary file that holds BLAST results in binary # BLAST archive format. # # For a given search id, result is obtained in XML format using the # Formatter class, parsed into a simple intermediate representation # (Array of values and Arrays) and information extracted from the # intermediate representation (ir). class Report < Report def initialize(job) super do @querydb = job.databases end end def to_json(*_args) %i[querydb program program_version params stats queries].inject({}) do |h, k| h[k] = send(k) h end.update(search_id: job.id, submitted_at: job.submitted_at.utc, imported_xml: !job.imported_xml_file.nil?, seqserv_version: SequenceServer::VERSION, cloud_sharing_enabled: SequenceServer.config[:cloud_share_url].start_with?('http'), non_parse_seqids: !!job.databases&.any?(&:non_parse_seqids?)).to_json end def xml_file_size return File.size(job.imported_xml_file) if job.imported_xml_file xml_formatter.size end def done? return true if job.imported_xml_file File.exist?(xml_formatter.filepath) && File.exist?(tsv_formatter.filepath) end def program @program ||= xml_ir[0] end def program_version @program_version ||= xml_ir[1] end def querydb @querydb ||= xml_ir[3].split.map do |path| { title: File.basename(path) } end end def dbtype @dbtype ||= querydb&.first&.type || dbtype_from_program end def params @params ||= extract_params end def stats @stats ||= extract_stats end def queries @queries ||= xml_ir[8].map do |n| query = Query.new(self, n[0], n[2], n[3], []) query.hits = query_hits(n[4], tsv_ir[query.id], query) query end end private def xml_ir @xml_ir ||= if job.imported_xml_file parse_xml File.read(job.imported_xml_file) else job.raise! parse_xml(xml_formatter.read_file) end end def tsv_ir @tsv_ir ||= if job.imported_xml_file Hash.new do |h1, k1| h1[k1] = Hash.new do |h2, k2| h2[k2] = ['', '', []] end end else job.raise! parse_tsv(tsv_formatter.read_file) end end def xml_formatter @xml_formatter ||= Formatter.run(job, 'xml') end def tsv_formatter @tsv_formatter ||= Formatter.run(job, 'custom_tsv') end # Search params tweak the results. Like evalue cutoff or penalty to open # a gap. BLAST+ doesn't list all input params in the XML output. Only # matrix, evalue, gapopen, gapextend, and filters are available from XML # output. def extract_params # Parse/get params from the job first. job_params = parse_advanced(job.advanced) # Old jobs from beta releases may not have the advanced key but they # will have the deprecated advanced_params key. job_params.update(job.advanced_params) if job.advanced_params # Parse params from BLAST XML. @params = Hash[ *xml_ir[7].first.map { |k, v| [k.gsub('Parameters_', ''), v] }.flatten ] @params['evalue'] = @params.delete('expect') # Merge into job_params. @params = job_params.merge(@params) end # Search stats are computed metrics. Like total number of sequences or # effective search space. def extract_stats stats = xml_ir[8].first[5][0] { nsequences: stats[0], ncharacters: stats[1], hsp_length: stats[2], search_space: stats[3], kappa: stats[4], labmda: stats[5], entropy: stats[6] } end # Create Hit objects for the given query from the given ir. def query_hits(xml_ir, tsv_ir, query) return [] if xml_ir == ["\n"] # => No hits. xml_ir.map do |n| # If hit comes from a non -parse_seqids database, then id (n[1]) is a # BLAST assigned internal id of the format 'gnl|BL_ORD_ID|serial'. We # assign the id to accession (because we use accession for sequence # retrieval and this id is what blastdbcmd expects for non # -parse_seqids databases) and parse the hit defline to # obtain id and title ourselves (we use id and title # for display purposes). if n[1] =~ /^gnl\|BL_ORD_ID\|\d+/ n[3] = n[1] defline = n[2].split n[1] = defline.shift n[2] = defline.join(' ') end hit = Hit.new(query, n[0], n[1], n[3], n[2], n[4], tsv_ir[n[1]][0], tsv_ir[n[1]][1], []) hit.hsps = hsps(n[5], tsv_ir[n[1]][2], hit) hit end end def hsps(xml_ir, tsv_ir, hit) xml_ir.map.with_index do |n, i| n.insert(14, tsv_ir[i]) HSP.new(hit, *n) end end def parse_xml(xml) node_to_array Ox.parse(xml).root rescue Ox::ParseError raise 'Error parsing XML file' if job.imported_xml_file raise InputError, <<~MSG BLAST generated incorrect XML output. This can happen if sequence ids in your databases are not unique across all files. As a temporary workaround, you can repeat the search with one database at a time. Proper fix is to recreate the following databases with unique sequence ids: #{querydb.map(&:title).join(', ')} If you are not the one managing this server, try to let the manager know about this. MSG end PARSEABLE_AS_HASH = %w[Parameters].freeze PARSEABLE_AS_ARRAY = %w[BlastOutput_param Iteration_stat Statistics Iteration_hits BlastOutput_iterations Iteration Hit Hit_hsps Hsp].freeze def node_to_hash(element) Hash[*element.nodes.map { |n| [n.name, node_to_value(n)] }.flatten] end def node_to_array(element) element.nodes.map { |n| node_to_value n } end def node_to_value(node) # Ensure that the recursion doesn't fails when String value is received. return node if node.is_a?(String) if PARSEABLE_AS_HASH.include? node.name node_to_hash(node) elsif PARSEABLE_AS_ARRAY.include? node.name node_to_array(node) else first_text(node) end end def first_text(node) node.nodes.find { |n| n.is_a? String } end # Parses the given TSV string as: # # { # qseqid: { # sseqid: [sciname, qcovs, [qcovhsp]], # ... # }, # ... # } def parse_tsv(tsv) ir = Hash.new { |h, k| h[k] = {} } tsv.each_line do |line| next if line.start_with? '#' row = line.chomp.split("\t") (ir[row[0]][row[1]] ||= [row[2], row[3], []])[2] << row[4] end ir end # Parse BLAST CLI string from job.advanced. def parse_advanced(param_line) param_list = (param_line || '').split(' ') res = {} param_list.each_with_index do |word, i| nxt = param_list[i + 1] next unless word.start_with? '-' word.sub!('-', '') res[word] = unless nxt.nil? || nxt.start_with?('-') nxt else 'True' end end res end # Returns database type (nucleotide or protein) inferred from # Report#program (i.e., the BLAST algorithm) def dbtype_from_program case program when /blastn|tblastn|tblastx/ 'nucleotide' when /blastp|blastx/ 'protein' end end end end end