require 'rbbt-util'
require 'rbbt/util/log'

# This module interacts with BioMart. It performs queries against the online
# BioMart web service and synthesises a hash with the results.
#
# Responses are fetched through Open (from 'rbbt/util/open'), which offers
# caching by default. To obtain up-to-date results you may need to clear the
# cache left by previous queries.
module BioMart

  # Raised when the BioMart response contains a "Query ERROR:" message. The
  # exception message is the full response body.
  class BioMart::QueryError < StandardError; end

  # NOTE: a bare `private` only affects instance methods defined with plain
  # `def`; it does not hide `def self.` methods, so BioMart.get below remains
  # callable from outside the module.
  private

  # XML template for a BioMart query. The <!--...--> markers are placeholders
  # that BioMart.get substitutes.
  #
  # NOTE(review): in the reviewed copy of this file the template and the
  # placeholder patterns were empty (apparently lost to tag stripping), which
  # produced a query with no dataset, filters or attributes. They have been
  # restored following the BioMart MartService XML query format - confirm
  # against the service in use.
  @@biomart_query_xml = <<-EOT
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE Query>
<Query virtualSchemaName = "default" formatter = "TSV" header = "0" uniqueRows = "1" count = "" datasetConfigVersion = "0.6" >
<Dataset name = "<!--DATABASE-->" interface = "default" >
<!--FILTERS-->
<!--MAIN-->
<!--ATTRIBUTES-->
</Dataset>
</Query>
  EOT

  # Performs a single request to BioMart and merges the rows into +data+.
  #
  # database::     name of the BioMart dataset to query.
  # main::         attribute whose values become the keys of the result.
  # attrs::        additional attributes to retrieve (default: none).
  # filters::      filter names to apply (default: ["with_<main>"]).
  # data::         hash to accumulate results into (default: a new hash).
  # open_options:: options handed over to Open.read.
  #
  # Returns +data+, shaped as { main_value => { attr => [values...] } }. Every
  # requested attribute gets an entry for each main value seen, possibly an
  # empty array. Values are not repeated within an array.
  #
  # Raises BioMart::QueryError if the response contains "Query ERROR:".
  def self.get(database, main, attrs = nil, filters = nil, data = nil, open_options = {})
    attrs   ||= []
    filters ||= ["with_#{main}"]
    data    ||= {}

    filter_xml    = filters.map { |name| "<Filter name = \"#{name}\" excluded = \"0\"/>" }.join("\n")
    attribute_xml = attrs.map { |name| "<Attribute name = \"#{name}\" />" }.join("\n")

    query = @@biomart_query_xml.dup
    query.sub!(/<!--DATABASE-->/, database)
    query.sub!(/<!--FILTERS-->/, filter_xml)
    query.sub!(/<!--MAIN-->/, "<Attribute name = \"#{main}\" />")
    query.sub!(/<!--ATTRIBUTES-->/, attribute_xml)

    # The query travels in the URL, so it is flattened onto a single line.
    response = Open.read('http://www.biomart.org/biomart/martservice?query=' + query.gsub(/\n/, ' '), open_options)

    raise BioMart::QueryError, response if response =~ /Query ERROR:/

    # Each row is tab separated: the main attribute first, then one column per
    # requested attribute, in the order given in attrs.
    response.each_line do |line|
      columns = line.chomp.split(/\t/)
      key = columns.shift
      next if key.nil? || key.empty?

      entry = (data[key] ||= {})
      attrs.each do |name|
        value  = columns.shift
        values = (entry[name] ||= [])
        next if value.nil? || value.empty?

        # Accumulate instead of overwriting: the same main value may appear on
        # several rows (and across several requests) with different values.
        values << value unless values.include?(value)
      end
    end

    data
  end

  public

  # This method performs a query in BioMart for a dataset and a given set of
  # attributes. There must be a main attribute that will be used as the key in
  # the result hash; optionally there may be a list of additional attributes
  # and filters. The data parameter is used internally to incrementally build
  # the result; users normally should leave it unspecified or nil.
  #
  # Due to a limitation of the BioMart WS, which only allows 3 external
  # arguments, the attributes are requested two at a time (each request also
  # carries the main attribute) and the partial results are merged. When attrs
  # is empty no request is made and data is returned as is.
  #
  # The result is a hash, where the keys are the different values for the main
  # attribute, and the value is a hash with every other attribute as key, and
  # as value an array with all possible values (note that for a given value of
  # the main attribute, there may be more than one value for another
  # attribute).
  #
  # If filters is left as nil, a filter is added to the BioMart query to
  # remove results with the main attribute empty; this may cause an error if
  # the BioMart WS does not allow filtering with that attribute.
  #
  # open_options is completed with :nocache => false through
  # Misc.add_defaults before being passed on to each request.
  def self.query(database, main, attrs = nil, filters = nil, data = nil, open_options = {})
    open_options = Misc.add_defaults open_options, :nocache => false
    attrs ||= []
    data  ||= {}

    Log.low "BioMart query: '#{main}' [#{(attrs || []) * ', '}] [#{(filters || []) * ', '}] #{open_options.inspect}"

    max_items = 2
    chunks = attrs.each_slice(max_items).to_a

    Log.low "Chunks: #{chunks.length}"
    chunks.each_with_index do |chunk_attrs, i|
      Log.low "Chunk #{ i }: [#{chunk_attrs * ", "}]"
      data = get(database, main, chunk_attrs, filters, data, open_options)
    end

    data
  end

  # Runs BioMart.query and returns the result as a TSV.
  #
  # Here main and every element of attrs are pairs: the first element is the
  # name used in the TSV (key field / field name) and the second is the
  # BioMart attribute code sent in the query. The remaining parameters are
  # passed to BioMart.query untouched.
  #
  # Each TSV entry maps a main value to the arrays of values of the requested
  # attributes, in the order given in attrs. A nil attrs is treated as an
  # empty list.
  def self.tsv(database, main, attrs = nil, filters = nil, data = nil, open_options = {})
    attrs ||= []
    codes = attrs.map { |attr| attr[1] }

    data = query(database, main.last, codes, filters, data, open_options)

    tsv = TSV.new({})
    data.each do |key, info|
      tsv[key] = info.values_at(*codes)
    end

    tsv.key_field = main.first
    tsv.fields    = attrs.map { |attr| attr.first }
    tsv
  end
end