require 'rbbt'
require 'rbbt/tsv'
require 'rbbt/tsv/attach'
require 'rbbt/util/log'
require 'cgi'

# This module interacts with BioMart. It performs queries against the online
# BioMart web service and stores the results in TSV files. Requests go through
# Open (see 'rbbt/util/open'), which offers caching by default; to obtain
# up-to-date results you may need to clear the cache from previous queries.
module BioMart

  # Raised when BioMart answers with an empty body, an explicit
  # "Query ERROR:" message, or a body lacking the "[success]" stamp.
  class BioMart::QueryError < StandardError; end

  BIOMART_URL = 'http://www.ensembl.org/biomart/martservice?query='

  # Attribute codes to skip or rename, loaded from the
  # etc/biomart/missing_in_archive YAML file when it exists. Keys used by
  # BioMart.tsv are 'all', an archive name, or an archive name prefixed with
  # '<' or '>'.
  MISSING_IN_ARCHIVE = Rbbt.etc.biomart.missing_in_archive.exists? ? Rbbt.etc.biomart.missing_in_archive.yaml : {}

  # NOTE: `private` / `public` do not change the visibility of methods
  # defined with `def self.`; they are kept only to mark intent.
  private

  # Template of the XML query sent to BioMart.
  # NOTE(review): the heredoc body, the placeholder patterns used in
  # BioMart.get (`//`) and the generated tag strings (`""`) are all empty in
  # this copy of the file -- it looks like markup was stripped. Restore them
  # from the canonical source before relying on this module.
  @@biomart_query_xml = <<-EOT
  EOT

  # Makes the current thread query the Ensembl archive named +date+ instead
  # of the current release. The archive URL is derived from BIOMART_URL by
  # replacing "www" with "<date>.archive".
  #
  # Raises a RuntimeError when the etc/allowed_biomart_archives file exists
  # and does not list +date+.
  def self.set_archive(date)
    # Parentheses are required: `defined? Rbbt && x` would test the whole
    # `Rbbt && x` expression rather than just the constant.
    if defined?(Rbbt) && Rbbt.etc.allowed_biomart_archives.exists?
      allowed = Rbbt.etc.allowed_biomart_archives.read.split("\n")
      raise "Biomart archive #{ date } is not allowed in this installation" unless allowed.include? date
    end

    Thread.current['archive'] = date
    Thread.current['archive_url'] = BIOMART_URL.sub(/www/, date + '.archive')
    Log.debug "Using Archive URL #{ Thread.current['archive_url'] }"
  end

  # Clears the thread-local archive selection so that later queries use
  # BIOMART_URL again.
  def self.unset_archive
    Log.debug "Restoring current version URL #{BIOMART_URL}"
    Thread.current['archive'] = nil
    Thread.current['archive_url'] = nil
  end

  # Runs the given block with the archive +data+ selected and always clears
  # the selection afterwards, even when the block raises.
  def self.with_archive(data)
    set_archive(data)
    yield
  ensure
    unset_archive
  end

  # Performs a single BioMart request and returns the path of a temporary
  # file holding the result.
  #
  # database     - substituted into the query template.
  # main         - main attribute; the default filter is "with_<main>".
  # attrs        - additional attributes (defaults to none).
  # filters      - list of filter names or [name, value] pairs.
  # data         - path of a file produced by a previous call, or nil. When
  #                given it is merged with the new result through
  #                TSV.merge_different_fields and then deleted.
  # open_options - options handed to Open.read. A :chunk_filter entry of the
  #                form [filter, values] triggers one request per value, with
  #                the outputs concatenated into a single file.
  #
  # Raises BioMart::QueryError when the response is empty, contains
  # "Query ERROR:" or lacks the trailing "[success]" stamp. In every failure
  # case the URL is removed from the Open cache first.
  def self.get(database, main, attrs = nil, filters = nil, data = nil, open_options = {})
    open_options = Misc.add_defaults open_options, :wget_options => {"--read-timeout=" => 9000, "--tries=" => 1}
    attrs   ||= []
    filters ||= ["with_#{main}"]

    if chunk_filter = open_options.delete(:chunk_filter)
      filter, values = chunk_filter
      merged_file = TmpFile.tmp_file
      # Block form guarantees the handle is closed if a request raises.
      File.open(merged_file, 'w') do |f|
        values.each do |value|
          # NOTE(review): the file produced for one value is passed as +data+
          # to the request for the next value (and therefore merged with it
          # and deleted). Kept as in the original; confirm this is intended.
          data = get(database, main, attrs, filters + [[filter, value]], data, open_options)
          f.write Open.read(data)
        end
      end
      return merged_file
    end

    # See the NOTE(review) on @@biomart_query_xml about the empty patterns
    # and strings below.
    query = @@biomart_query_xml.dup
    query.sub!(//,database)
    query.sub!(//, filters.collect{|name, v| v.nil? ? "" : "" }.join("\n") )
    query.sub!(//,"")
    query.sub!(//, attrs.collect{|name| ""}.join("\n") )

    base_url = Thread.current['archive_url'] || BIOMART_URL
    url = base_url + query.gsub(/\n/,' ')

    begin
      response = Open.read(url, open_options.dup)
    rescue
      # Do not leave a failed download in the cache.
      Open.remove_from_cache url, open_options
      raise
    end

    if response.empty? || response =~ /Query ERROR:/
      Open.remove_from_cache url, open_options
      raise BioMart::QueryError, response
    end

    # The patterns deliberately carry no `s` flag: in Ruby `s` selects the
    # Windows-31J encoding, which makes matching a non-ASCII UTF-8 response
    # raise Encoding::CompatibilityError.
    unless response =~ /\[success\]$/m
      Open.remove_from_cache url, open_options
      raise BioMart::QueryError, "Uncomplete result"
    end

    response.sub!(/\n\[success\]$/m,'')

    result_file = TmpFile.tmp_file
    Open.write(result_file, response)

    new_datafile = TmpFile.tmp_file
    if data.nil?
      TSV.merge_row_fields Open.open(result_file), new_datafile
    else
      TSV.merge_different_fields data, result_file, new_datafile
      FileUtils.rm data
    end
    FileUtils.rm result_file

    new_datafile
  end

  public

  # Queries BioMart for +main+ plus the attributes in +attrs+.
  #
  # The attributes are requested two at a time (each request also carries the
  # main attribute) and the partial results are accumulated through the
  # +data+ parameter of BioMart.get; users should normally leave +data+
  # unspecified or nil.
  #
  # If +filters+ is nil, BioMart.get adds a "with_<main>" filter; this may
  # cause an error if the BioMart WS does not allow filtering with that
  # attribute.
  #
  # Recognised entries in +open_options+ include :filename and :field_names;
  # the remaining options are passed on to BioMart.get and TSV.open.
  #
  # Returns the TSV opened from the accumulated data, with +main+ as key
  # field and +attrs+ as fields. When :filename is given, the data is instead
  # written to that file, preceded by an options line and a header line
  # (+field_names+ if given, otherwise +main+ and +attrs+), and the file name
  # is returned.
  def self.query(database, main, attrs = nil, filters = nil, data = nil, open_options = {})
    IndiferentHash.setup(open_options)
    open_options = Misc.add_defaults open_options, :nocache => false, :filename => nil, :field_names => nil, :by_chr => false
    filename, field_names, _by_chr = Misc.process_options open_options, :filename, :field_names, :by_chr
    attrs ||= []
    open_options = Misc.add_defaults open_options, :keep_empty => false, :merge => true

    IndiferentHash.setup(open_options)

    Log.low "BioMart query: '#{main}' [#{attrs * ', '}] [#{(filters || []) * ', '}] #{open_options.inspect}"

    max_items = 2
    chunks = attrs.each_slice(max_items).to_a
    # With no extra attributes, still issue one request for the main one.
    chunks << [] if chunks.empty?

    Log.low "Chunks: #{chunks.length}"
    chunks.each_with_index do |chunk, i|
      Log.low "Chunk #{ i + 1 } / #{chunks.length}: [#{chunk * ", "}]"
      data = get(database, main, chunk, filters, data, open_options.dup)
    end

    open_options[:filename] = "BioMart[#{main}+#{attrs.length}]"

    if filename.nil?
      results = TSV.open data, open_options
      results.key_field = main
      results.fields = attrs
      results
    else
      header = field_names.nil? ? [main, attrs].flatten : field_names
      Open.write(filename) do |f|
        f.puts "#: " + Misc.hash2string(TSV::ENTRIES.collect{|key| [key, open_options[key]]})
        f.puts "#" + header * "\t"
        f.write Open.read(data)
      end
      FileUtils.rm data
      filename
    end
  end

  # Like BioMart.query, but +main+ and every entry of +attrs+ are
  # [name, code] pairs: the codes are sent to BioMart and the names are used
  # as key field / field names of the result.
  #
  # Codes listed in MISSING_IN_ARCHIVE are dropped: those under 'all', those
  # under the current archive name, and those under keys of the form
  # "<archive" / ">archive" when Organism.compare_archives(current, archive)
  # returns -1 / 1 respectively. Entries of the form "old~new" rename a code
  # instead; renames are applied repeatedly until nothing changes.
  #
  # Returns what BioMart.query returns: a TSV, or the file name when
  # open_options[:filename] is set.
  def self.tsv(database, main, attrs = nil, filters = nil, data = nil, open_options = {})
    attrs ||= []

    current_archive = Thread.current['archive']
    missing = MISSING_IN_ARCHIVE['all'] || []
    missing += MISSING_IN_ARCHIVE[current_archive] || [] if current_archive

    # NOTE(review): the comparisons below also run when no archive is set
    # (current_archive is nil); confirm Organism.compare_archives copes.
    MISSING_IN_ARCHIVE.each do |k, v|
      case k
      when /^<(.*)/ then expected = -1
      when /^>(.*)/ then expected = 1
      else next
      end
      t = $1.strip
      missing += v if Organism.compare_archives(current_archive, t) == expected
    end

    attrs = attrs.uniq.reject{|attr| missing.include? attr[1]}

    changes = {}
    missing.select{|m| m.include? "~" }.each do |str|
      orig, _sep, new = str.partition "~"
      changes[orig] = new
    end

    # Follow chains of renames (a~b, b~c) until a fixed point is reached.
    changed = true
    while changed
      new_attrs = attrs.collect{|n, k| [n, changes[k] || k] }
      changed = new_attrs != attrs
      attrs = new_attrs
    end

    codes = attrs.collect{|attr| attr[1]}
    names = attrs.collect{|attr| attr.first}

    if open_options[:filename].nil?
      tsv = query(database, main.last, codes, filters, data, open_options)
      tsv.key_field = main.first
      tsv.fields = names
      tsv
    else
      query(database, main.last, codes, filters, data, open_options.merge(:field_names => [main.first, names].flatten))
    end
  end
end