lib/biomart/dataset.rb in biomart-0.1.5 vs lib/biomart/dataset.rb in biomart-0.2.0

- old
+ new

@@ -60,14 +60,24 @@ # Function to perform a Biomart count. Returns an integer value for # the result of the count query. # # optional arguments: - # - # :filters:: hash of key-value pairs (filter => search term) - # :timeout:: set a timeout length for the request (secs) + # + # { + # :timeout => integer, # set a timeout length for the request (secs) + # :filters => {} # hash of key-value pairs (filter => search term) + # } def count( args={} ) + if args[:federate] + raise Biomart::ArgumentError, "You cannot federate a count query." + end + + if args[:required_attributes] + raise Biomart::ArgumentError, "The :required_attributes option is not allowed on count queries." + end + result = request( :method => 'post', :url => @url, :timeout => args[:timeout], :query => generate_xml( @@ -80,34 +90,55 @@ end # Function to perform a Biomart search. # # optional arguments: + # + # { + # :process_results => true/false, # convert search results to object + # :timeout => integer, # set a timeout length for the request (secs) + # :filters => {}, # hash of key-value pairs (filter => search term) + # :attributes => [], # array of attributes to retrieve + # :required_attributes => [], # array of attributes that are required + # :federate => [ + # { + # :dataset => Biomart::Dataset, # A dataset object to federate with + # :filters => {}, # hash of key-value pairs (filter => search term) + # :attributes => [] # array of attributes to retrieve + # } + # ] + # } # - # :filters:: hash of key-value pairs (filter => search term) - # :attributes:: array of attributes to retrieve - # :process_results:: true/false - convert search results to object - # :timeout:: set a timeout length for the request (secs) + # Note, if you do not pass any filters or attributes arguments, the defaults + # for the dataset shall be used. # + # Also, using the :required_attributes option - this performs AND logic and will require + # data to be returned in all of the listed attributes in order for it to be returned. + # # By default will return a hash with the following: # - # :headers:: array of headers - # :data:: array of arrays containing search results + # { + # :headers => [], # array of headers + # :data => [] # array of arrays containing search results + # } # # But with the :process_results option will return an array of hashes, # where each hash represents a row of results (keyed by the attribute name). def search( args={} ) + if args[:required_attributes] and !args[:required_attributes].is_a?(Array) + raise Biomart::ArgumentError, "The :required_attributes option must be passed as an array." + end + response = request( :method => 'post', :url => @url, :timeout => args[:timeout], - :query => generate_xml( - :filters => args[:filters], - :attributes => args[:attributes] - ) + :query => generate_xml( process_xml_args(args) ) ) + result = process_tsv( args, response ) + result = filter_data_rows( args, result ) if args[:required_attributes] result = conv_results_to_a_of_h( result ) if args[:process_results] return result end # Utility function to build the Biomart query XML @@ -116,42 +147,26 @@ xml = Builder::XmlMarkup.new( :target => biomart_xml, :indent => 2 ) xml.instruct! xml.declare!( :DOCTYPE, :Query ) xml.Query( :virtualSchemaName => "default", :formatter => "TSV", :header => "0", :uniqueRows => "1", :count => args[:count], :datasetConfigVersion => "0.6" ) { - xml.Dataset( :name => @name, :interface => "default" ) { - - if args[:filters] - args[:filters].each do |name,value| - if value.is_a? Array - value = value.join(",") - end - xml.Filter( :name => name, :value => value ) + dataset_xml( xml, self, { :filters => args[:filters], :attributes => args[:attributes] } ) + + if args[:federate] + args[:federate].each do |joined_dataset| + unless joined_dataset[:dataset].is_a?(Biomart::Dataset) + raise Biomart::ArgumentError, "You must pass a Biomart::Dataset object to the :federate[:dataset] option." end - else - self.filters.each do |name,filter| - if filter.default - xml.Filter( :name => name, :value => filter.default_value ) - end - end + + dataset_xml( + xml, + joined_dataset[:dataset], + { :filters => joined_dataset[:filters], :attributes => joined_dataset[:attributes] } + ) end - - unless args[:count] - if args[:attributes] - args[:attributes].each do |name| - xml.Attribute( :name => name ) - end - else - self.attributes.each do |name,attribute| - if attribute.default - xml.Attribute( :name => name ) - end - end - end - end - - } + end + } return biomart_xml end @@ -188,26 +203,82 @@ REXML::XPath.each( document, '//AttributeDescription' ) do |a| @attributes[ a.attributes["internalName"] ] = Attribute.new( a.attributes ) end end + # Utility function to process and test the arguments passed for + # the xml query. + def process_xml_args( args={} ) + xml_args = { + :filters => args[:filters], + :attributes => args[:attributes] + } + + if args[:federate] + unless args[:federate].is_a?(Array) + raise Biomart::ArgumentError, "The :federate option must be passed as an array." + end + + unless args[:federate].size == 1 + raise Biomart::ArgumentError, "Sorry, we can only federate two datasets at present. This limitation shall be lifted in version 0.8 of biomart." + end + + xml_args[:federate] = args[:federate] + end + + return xml_args + end + + # Helper function to produce the portion of the biomart xml for + # a dataset query. + def dataset_xml( xml, dataset, args ) + xml.Dataset( :name => dataset.name, :interface => "default" ) { + + if args[:filters] + args[:filters].each do |name,value| + if value.is_a? Array + value = value.join(",") + end + xml.Filter( :name => name, :value => value ) + end + else + dataset.filters.each do |name,filter| + if filter.default? + xml.Filter( :name => name, :value => filter.default_value ) + end + end + end + + unless args[:count] + if args[:attributes] + args[:attributes].each do |name| + xml.Attribute( :name => name ) + end + else + dataset.attributes.each do |name,attribute| + if attribute.default? + xml.Attribute( :name => name ) + end + end + end + end + + } + end + # Utility function to transform the tab-separated data retrieved # from the Biomart search query into a ruby object. def process_tsv( args, tsv ) headers = [] parsed_data = [] + + append_header_attributes_for_tsv( headers, self, args[:attributes] ) - if args[:attributes] - args[:attributes].each do |attribute| - headers.push(attribute) + if args[:federate] + args[:federate].each do |joined_dataset| + append_header_attributes_for_tsv( headers, joined_dataset[:dataset], joined_dataset[:attributes] ) end - else - self.attributes.each do |name,attribute| - if attribute.default - headers.push(name) - end - end end parsed_data = [] if CSV.const_defined? :Reader # Ruby < 1.9 CSV code @@ -229,10 +300,26 @@ :headers => headers, :data => parsed_data } end + # Helper function to append the attribute names to the 'headers' array + # for processing the returned results. + def append_header_attributes_for_tsv( headers, dataset, attributes ) + if attributes + attributes.each do |attribute| + headers.push(attribute) + end + else + dataset.attributes.each do |name,attribute| + if attribute.default? + headers.push(name) + end + end + end + end + # Utility function to process TSV formatted data that raises errors. (Biomart # has a habit of serving out this...) First attempts to use the CSV modules # 'parse_line' function to read in the data, if that fails, tries to use split # to recover the data. def parse_tsv_line_by_line( expected_row_size, tsv ) @@ -299,8 +386,48 @@ result_objects.push(tmp) end return result_objects end - + + # Utility function to remove data rows from a search result that do not include + # the :required_attributes. + def filter_data_rows( args, result ) + # Get the list of attributes searched for... + attributes = args[:attributes] ? args[:attributes] : [] + if attributes.empty? + self.attributes.each do |name,attribute| + if attribute.default? + attributes.push(name) + end + end + end + + # Work out which attribute positions we need to test... + positions_to_test = [] + attributes.each_index do |index| + if args[:required_attributes].include?(attributes[index]) + positions_to_test.push(index) + end + end + + # Now go through the results and filter out the unwanted data... + filtered_data = [] + result[:data].each do |data_row| + save_row_count = 0 + + positions_to_test.each do |position| + save_row_count = save_row_count + 1 unless data_row[position].nil? + end + + if save_row_count == positions_to_test.size + filtered_data.push(data_row) + end + end + + return { + :headers => result[:headers], + :data => filtered_data + } + end end end \ No newline at end of file