lib/biomart/dataset.rb in biomart-0.1.5 vs lib/biomart/dataset.rb in biomart-0.2.0
- old
+ new
@@ -60,14 +60,24 @@
# Function to perform a Biomart count. Returns an integer value for
# the result of the count query.
#
# optional arguments:
- #
- # :filters:: hash of key-value pairs (filter => search term)
- # :timeout:: set a timeout length for the request (secs)
+ #
+ # {
+ # :timeout => integer, # set a timeout length for the request (secs)
+ # :filters => {} # hash of key-value pairs (filter => search term)
+ # }
def count( args={} )
+ if args[:federate]
+ raise Biomart::ArgumentError, "You cannot federate a count query."
+ end
+
+ if args[:required_attributes]
+ raise Biomart::ArgumentError, "The :required_attributes option is not allowed on count queries."
+ end
+
result = request(
:method => 'post',
:url => @url,
:timeout => args[:timeout],
:query => generate_xml(
@@ -80,34 +90,55 @@
end
# Function to perform a Biomart search.
#
# optional arguments:
+ #
+ # {
+ # :process_results => true/false, # convert search results to object
+ # :timeout => integer, # set a timeout length for the request (secs)
+ # :filters => {}, # hash of key-value pairs (filter => search term)
+ # :attributes => [], # array of attributes to retrieve
+ # :required_attributes => [], # array of attributes that are required
+ # :federate => [
+ # {
+ # :dataset => Biomart::Dataset, # A dataset object to federate with
+ # :filters => {}, # hash of key-value pairs (filter => search term)
+ # :attributes => [] # array of attributes to retrieve
+ # }
+ # ]
+ # }
#
- # :filters:: hash of key-value pairs (filter => search term)
- # :attributes:: array of attributes to retrieve
- # :process_results:: true/false - convert search results to object
- # :timeout:: set a timeout length for the request (secs)
+ # Note, if you do not pass any filters or attributes arguments, the defaults
+ # for the dataset shall be used.
#
+ # The :required_attributes option applies AND logic: a row is only returned if it
+ # contains data for every one of the listed attributes.
+ #
# By default will return a hash with the following:
#
- # :headers:: array of headers
- # :data:: array of arrays containing search results
+ # {
+ # :headers => [], # array of headers
+ # :data => [] # array of arrays containing search results
+ # }
#
# But with the :process_results option will return an array of hashes,
# where each hash represents a row of results (keyed by the attribute name).
def search( args={} )
+ if args[:required_attributes] and !args[:required_attributes].is_a?(Array) # validated before any request is sent
+ raise Biomart::ArgumentError, "The :required_attributes option must be passed as an array."
+ end
+
response = request(
:method => 'post',
:url => @url,
:timeout => args[:timeout],
- :query => generate_xml(
- :filters => args[:filters],
- :attributes => args[:attributes]
- )
+ :query => generate_xml( process_xml_args(args) ) # process_xml_args checks :federate and picks the query arguments
)
+
result = process_tsv( args, response )
+ result = filter_data_rows( args, result ) if args[:required_attributes] # drop rows lacking a value for a required attribute
result = conv_results_to_a_of_h( result ) if args[:process_results]
return result
end
# Utility function to build the Biomart query XML
@@ -116,42 +147,26 @@
xml = Builder::XmlMarkup.new( :target => biomart_xml, :indent => 2 )
xml.instruct!
xml.declare!( :DOCTYPE, :Query )
xml.Query( :virtualSchemaName => "default", :formatter => "TSV", :header => "0", :uniqueRows => "1", :count => args[:count], :datasetConfigVersion => "0.6" ) {
- xml.Dataset( :name => @name, :interface => "default" ) {
-
- if args[:filters]
- args[:filters].each do |name,value|
- if value.is_a? Array
- value = value.join(",")
- end
- xml.Filter( :name => name, :value => value )
+ dataset_xml( xml, self, { :filters => args[:filters], :attributes => args[:attributes] } )
+
+ if args[:federate]
+ args[:federate].each do |joined_dataset|
+ unless joined_dataset[:dataset].is_a?(Biomart::Dataset)
+ raise Biomart::ArgumentError, "You must pass a Biomart::Dataset object to the :federate[:dataset] option."
end
- else
- self.filters.each do |name,filter|
- if filter.default
- xml.Filter( :name => name, :value => filter.default_value )
- end
- end
+
+ dataset_xml(
+ xml,
+ joined_dataset[:dataset],
+ { :filters => joined_dataset[:filters], :attributes => joined_dataset[:attributes] }
+ )
end
-
- unless args[:count]
- if args[:attributes]
- args[:attributes].each do |name|
- xml.Attribute( :name => name )
- end
- else
- self.attributes.each do |name,attribute|
- if attribute.default
- xml.Attribute( :name => name )
- end
- end
- end
- end
-
- }
+ end
+
}
return biomart_xml
end
@@ -188,26 +203,82 @@
REXML::XPath.each( document, '//AttributeDescription' ) do |a|
@attributes[ a.attributes["internalName"] ] = Attribute.new( a.attributes )
end
end
+ # Utility function to process and test the arguments passed for
+ # the xml query.
+ def process_xml_args( args={} )
+   # Filters and attributes are always forwarded; :federate is only added
+   # further down, once it has passed validation.
+   federated = args[:federate]
+   selected  = { :filters => args[:filters], :attributes => args[:attributes] }
+   return selected unless federated
+
+   if !federated.is_a?(Array)
+     raise Biomart::ArgumentError, "The :federate option must be passed as an array."
+   elsif federated.size != 1
+     # Exactly one joined dataset is accepted (this dataset plus one other).
+     raise Biomart::ArgumentError, "Sorry, we can only federate two datasets at present. This limitation shall be lifted in version 0.8 of biomart."
+   end
+
+   selected[:federate] = federated
+   selected
+ end
+
+ # Helper function to produce the portion of the biomart xml for
+ # a dataset query.
+ def dataset_xml( xml, dataset, args )
+   # Emits one Dataset element with its Filter and Attribute children.
+   xml.Dataset( :name => dataset.name, :interface => "default" ) do
+     if args[:filters]
+       args[:filters].each do |filter_name, term|
+         # Multiple search terms are sent as a single comma-separated value.
+         term = term.join(",") if term.is_a?(Array)
+         xml.Filter( :name => filter_name, :value => term )
+       end
+     else
+       # No filters given: fall back to the filters the dataset flags as default.
+       dataset.filters.each do |filter_name, filter|
+         xml.Filter( :name => filter_name, :value => filter.default_value ) if filter.default?
+       end
+     end
+
+     # Attributes are left out entirely when the caller passes :count.
+     unless args[:count]
+       if args[:attributes]
+         args[:attributes].each { |attribute_name| xml.Attribute( :name => attribute_name ) }
+       else
+         dataset.attributes.each do |attribute_name, attribute|
+           xml.Attribute( :name => attribute_name ) if attribute.default?
+         end
+       end
+     end
+   end
+ end
+
# Utility function to transform the tab-separated data retrieved
# from the Biomart search query into a ruby object.
def process_tsv( args, tsv )
headers = []
parsed_data = []
+
+ append_header_attributes_for_tsv( headers, self, args[:attributes] )
- if args[:attributes]
- args[:attributes].each do |attribute|
- headers.push(attribute)
+ if args[:federate]
+ args[:federate].each do |joined_dataset|
+ append_header_attributes_for_tsv( headers, joined_dataset[:dataset], joined_dataset[:attributes] )
end
- else
- self.attributes.each do |name,attribute|
- if attribute.default
- headers.push(name)
- end
- end
end
parsed_data = []
if CSV.const_defined? :Reader
# Ruby < 1.9 CSV code
@@ -229,10 +300,26 @@
:headers => headers,
:data => parsed_data
}
end
+ # Helper function to append the attribute names to the 'headers' array
+ # for processing the returned results.
+ def append_header_attributes_for_tsv( headers, dataset, attributes )
+   if !attributes
+     # Nothing requested explicitly: use the attributes the dataset flags as default.
+     dataset.attributes.each { |name, attribute| headers << name if attribute.default? }
+   else
+     # Requested attribute names are appended in the order they were given.
+     attributes.each { |requested| headers << requested }
+   end
+ end
+
# Utility function to process TSV formatted data that raises errors. (Biomart
# has a habit of serving out this...) First attempts to use the CSV modules
# 'parse_line' function to read in the data, if that fails, tries to use split
# to recover the data.
def parse_tsv_line_by_line( expected_row_size, tsv )
@@ -299,8 +386,48 @@
result_objects.push(tmp)
end
return result_objects
end
-
+
+ # Utility function to remove data rows from a search result that do not include
+ # the :required_attributes.
+ def filter_data_rows( args, result )
+   # Returns a new { :headers, :data } hash that keeps only the rows holding a
+   # non-nil value for every attribute named in args[:required_attributes].
+   #
+   # NOTE(review): only this dataset's own attributes are inspected here; required
+   # attributes that belong to a federated dataset appear to be ignored - confirm
+   # whether that is intended.
+
+   # Work on a copy so the default-attribute fallback below never appends to an
+   # (empty) array owned by the caller.
+   attributes = ( args[:attributes] || [] ).dup
+   if attributes.empty?
+     self.attributes.each do |name, attribute|
+       attributes.push(name) if attribute.default?
+     end
+   end
+
+   # Column positions (within each data row) of the required attributes.
+   # Required names that were not searched for yield no position and are skipped.
+   positions_to_test = []
+   attributes.each_with_index do |name, index|
+     positions_to_test.push(index) if args[:required_attributes].include?(name)
+   end
+
+   # Keep a row only when every tested position holds a value.
+   filtered_data = result[:data].select do |data_row|
+     positions_to_test.all? { |position| !data_row[position].nil? }
+   end
+
+   { :headers => result[:headers], :data => filtered_data }
+ end
end
end
\ No newline at end of file