lib/dataset.rb in opentox-ruby-0.0.1 vs lib/dataset.rb in opentox-ruby-0.0.2

- old
+ new

@@ -1,226 +1,330 @@
 module OpenTox
+  # Ruby wrapper for OpenTox Dataset Webservices (http://opentox.org/dev/apis/api-1.2/dataset).
   class Dataset
-    attr_accessor :uri, :title, :creator, :data, :features, :compounds
+    include OpenTox
-    def initialize( owl=nil )
-      @data = {}
-      @features = []
+    attr_reader :features, :compounds, :data_entries, :metadata
+
+    # Create dataset with optional URI. Does not load data into the dataset - you will need to execute one of the load_* methods to pull data from a service or to insert it from other representations.
+    # @example Create an empty dataset
+    #   dataset = OpenTox::Dataset.new
+    # @example Create an empty dataset with URI
+    #   dataset = OpenTox::Dataset.new("http://webservices.in-silico.ch/dataset/1")
+    # @param [optional, String] uri Dataset URI
+    # @return [OpenTox::Dataset] Dataset object
+    def initialize(uri=nil)
+      super uri
+      @features = {}
       @compounds = []
-
-      # creates dataset object from Opentox::Owl object
-      # use Dataset.find( <uri> ) to load dataset from rdf-supporting datasetservice
-      # note: does not load all feature values, as this is time consuming
-      if owl
-        raise "invalid param" unless owl.is_a?(OpenTox::Owl)
-        @title = owl.get("title")
-        @creator = owl.get("creator")
-        @uri = owl.uri
-        # when loading a dataset from owl, only compound- and feature-uris are loaded
-        owl.load_dataset(@compounds, @features)
-        # all features are marked as dirty
-        # as soon as a feature-value is requested all values for this feature are loaded from the rdf
-        @dirty_features = @features.dclone
-        @owl = owl
-      end
+      @data_entries = {}
     end
-
-    def self.find(uri, accept_header=nil)
-
-      unless accept_header
-        if (@@config[:yaml_hosts].include?(URI.parse(uri).host))
-          accept_header = 'application/x-yaml'
-        else
-          accept_header = "application/rdf+xml"
-        end
-      end
-
-      case accept_header
-      when "application/x-yaml"
-        d = YAML.load RestClientWrapper.get(uri.to_s.strip, :accept => 'application/x-yaml').to_s
-        d.uri = uri unless d.uri
-      when "application/rdf+xml"
-        owl = OpenTox::Owl.from_uri(uri.to_s.strip, "Dataset")
-        d = Dataset.new(owl)
-      else
-        raise "cannot get datset with accept header: "+accept_header.to_s
-      end
-      d
+
+    # Create an empty dataset and save it at the dataset service (assigns URI to dataset)
+    # @example Create new dataset and save it to obtain a URI
+    #   dataset = OpenTox::Dataset.create
+    # @param [optional, String] uri Dataset URI
+    # @return [OpenTox::Dataset] Dataset object
+    def self.create(uri=CONFIG[:services]["opentox-dataset"])
+      dataset = Dataset.new
+      dataset.save
+      dataset
     end
-
-    # converts a dataset represented in owl to yaml
-    # (uses a temporary dataset)
-    # note: to_yaml is overwritten, loads complete owl dataset values
-    def self.owl_to_yaml( owl_data, uri)
-      owl = OpenTox::Owl.from_data(owl_data, uri, "Dataset")
-      d = Dataset.new(owl)
-      d.to_yaml
+
+    # Create dataset from CSV file (format specification: http://toxcreate.org/help)
+    # - loads data_entries, compounds, features
+    # - sets metadata (warnings) for parser errors
+    # - you will have to set remaining metadata manually
+    # @param [String] file CSV file path
+    # @return [OpenTox::Dataset] Dataset object with CSV data
+    def self.create_from_csv_file(file)
+      dataset = Dataset.create
+      parser = Parser::Spreadsheets.new
+      parser.dataset = dataset
+      parser.load_csv(File.open(file).read)
+      dataset.save
+      dataset
     end
-
-    # creates a new dataset, using only those compounsd specified in new_compounds
-    # returns uri of new dataset
-    def create_new_dataset( new_compounds, new_features, new_title, new_creator )
-
-      LOGGER.debug "create new dataset with "+new_compounds.size.to_s+"/"+compounds.size.to_s+" compounds"
-      raise "no new compounds selected" unless new_compounds and new_compounds.size>0
-
-      # load require features
-      if ((defined? @dirty_features) && (@dirty_features & new_features).size > 0)
-        (@dirty_features & new_features).each{|f| load_feature_values(f)}
-      end
-
-      dataset = OpenTox::Dataset.new
-      dataset.title = new_title
-      dataset.creator = new_creator
-      dataset.features = new_features
-      dataset.compounds = new_compounds
-
-      # Copy dataset data for compounds and features
-      # PENDING: why storing feature values in an array?
-      new_compounds.each do |c|
-        data_c = []
-        raise "no data for compound '"+c.to_s+"'" if @data[c]==nil
-        @data[c].each do |d|
-          m = {}
-          new_features.each do |f|
-            m[f] = d[f]
-          end
-          data_c << m
-        end
-        dataset.data[c] = data_c
-      end
-      return dataset.save
+
+    # Find a dataset and load all data. This can be time consuming; use Dataset.new together with one of the load_* methods for fine-grained control over data loading.
+    # @param [String] uri Dataset URI
+    # @return [OpenTox::Dataset] Dataset object with all data
+    def self.find(uri)
+      dataset = Dataset.new(uri)
+      dataset.load_all
+      dataset
     end
+
+    # Get all datasets from a service
+    # @param [optional,String] uri URI of the dataset service, defaults to service specified in configuration
+    # @return [Array] Array of dataset objects without data (use one of the load_* methods to pull data from the server)
+    def self.all(uri=CONFIG[:services]["opentox-dataset"])
+      RestClientWrapper.get(uri,:accept => "text/uri-list").to_s.each_line.collect{|u| Dataset.new(u)}
+    end
+
+    # Load YAML representation into the dataset
+    # @param [String] yaml YAML representation of the dataset
+    # @return [OpenTox::Dataset] Dataset object with YAML data
+    def load_yaml(yaml)
+      copy YAML.load(yaml)
+    end
+
+    def load_rdfxml(rdfxml)
+      load_rdfxml_file Tempfile.open("ot-rdfxml"){|f| f.write(rdfxml)}.path
+    end
+
+    # Load RDF/XML representation from a file
+    # @param [String] file File with RDF/XML representation of the dataset
+    # @return [OpenTox::Dataset] Dataset object with RDF/XML data
+    def load_rdfxml_file(file)
+      parser = Parser::Owl::Dataset.new @uri
+      parser.uri = file.path
+      copy parser.load_uri
+    end
+
+    # Load CSV string (format specification: http://toxcreate.org/help)
+    # - loads data_entries, compounds, features
+    # - sets metadata (warnings) for parser errors
+    # - you will have to set remaining metadata manually
+    # @param [String] csv CSV representation of the dataset
+    # @return [OpenTox::Dataset] Dataset object with CSV data
+    def load_csv(csv)
+      save unless @uri # get a uri for creating features
+      parser = Parser::Spreadsheets.new
+      parser.dataset = self
+      parser.load_csv(csv)
+    end
+
+    # Load Spreadsheet book (created with roo gem http://roo.rubyforge.org/, excel format specification: http://toxcreate.org/help)
+    # - loads data_entries, compounds, features
+    # - sets metadata (warnings) for parser errors
+    # - you will have to set remaining metadata manually
+    # @param [Excel] book Excel workbook object (created with roo gem)
+    # @return [OpenTox::Dataset] Dataset object with Excel data
+    def load_spreadsheet(book)
+      save unless @uri # get a uri for creating features
+      parser = Parser::Spreadsheets.new
+      parser.dataset = self
+      parser.load_spreadsheet(book)
+    end
-    # returns classification value
-    def get_predicted_class(compound, feature)
-      v = get_value(compound, feature)
-      if v.is_a?(Hash)
-        k = v.keys.grep(/classification/).first
-        unless k.empty?
-          #if v.has_key?(:classification)
-          return v[k]
-        else
-          return "no classification key"
-        end
-      elsif v.is_a?(Array)
-        raise "predicted class value is an array\n"+
-          "value "+v.to_s+"\n"+
-          "value-class "+v.class.to_s+"\n"+
-          "dataset "+@uri.to_s+"\n"+
-          "compound "+compound.to_s+"\n"+
-          "feature "+feature.to_s+"\n"
+    # Load and return only metadata of a Dataset object
+    # @return [Hash] Metadata of the dataset
+    def load_metadata
+      add_metadata Parser::Owl::Dataset.new(@uri).load_metadata
+      self.uri = @uri if @uri # keep uri
+      @metadata
+    end
+
+    # Load all data (metadata, data_entries, compounds and features) from URI
+    def load_all
+      if (CONFIG[:yaml_hosts].include?(URI.parse(@uri).host))
+        copy YAML.load(RestClientWrapper.get(@uri, :accept => "application/x-yaml"))
       else
-        return v
+        parser = Parser::Owl::Dataset.new(@uri)
+        copy parser.load_uri
      end
    end
-
-    # returns regression value
-    def get_predicted_regression(compound, feature)
-      v = get_value(compound, feature)
-      if v.is_a?(Hash)
-        k = v.keys.grep(/regression/).first
-        unless k.empty?
-          return v[k]
-        else
-          return "no regression key"
-        end
-      elsif v.is_a?(Array)
-        raise "predicted regression value is an array\n"+
-          "value "+v.to_s+"\n"+
-          "value-class "+v.class.to_s+"\n"+
-          "dataset "+@uri.to_s+"\n"+
-          "compound "+compound.to_s+"\n"+
-          "feature "+feature.to_s+"\n"
-      else
-        return v
+
+    # Load and return only compound URIs from the dataset service
+    # @return [Array] Compound URIs in the dataset
+    def load_compounds
+      RestClientWrapper.get(File.join(uri,"compounds"),:accept=> "text/uri-list").to_s.each_line do |compound_uri|
+        @compounds << compound_uri.chomp
      end
+      @compounds.uniq!
    end
-
-    # returns prediction confidence if available
-    def get_prediction_confidence(compound, feature)
-      v = get_value(compound, feature)
-      if v.is_a?(Hash)
-        k = v.keys.grep(/confidence/).first
-        unless k.empty?
-          #if v.has_key?(:confidence)
-          return v[k].abs
-          #return v["http://ot-dev.in-silico.ch/model/lazar#confidence"].abs
+
+    # Load and return only features from the dataset service
+    # @return [Hash] Features of the dataset
+    def load_features
+      parser = Parser::Owl::Dataset.new(@uri)
+      @features = parser.load_features
+      @features
+    end
+
+    # Detect feature type(s) in the dataset
+    # @return [String] "classification", "regression", "mixed" or "unknown"
+    def feature_type
+      feature_types = @features.collect{|f,metadata| metadata[OT.isA]}.uniq
+      if feature_types.size > 1
+        "mixed"
+      else
+        case feature_types.first
+        when /NominalFeature/
+          "classification"
+        when /NumericFeature/
+          "regression"
        else
-          # PENDING: return nil isntead of raising an exception
-          raise "no confidence key"
+          "unknown"
        end
-      else
-        LOGGER.warn "no confidence for compound: "+compound.to_s+", feature: "+feature.to_s
-        return 1
      end
    end
-
-    # return compound-feature value
-    def get_value(compound, feature)
-      if (defined? @dirty_features) && @dirty_features.include?(feature)
-        load_feature_values(feature)
-      end
-
-      v = @data[compound]
-      return nil if v == nil # missing values for all features
-      if v.is_a?(Array)
-        # PENDING: why using an array here?
-        v.each do |e|
-          if e.is_a?(Hash)
-            if e.has_key?(feature)
-              return e[feature]
-            end
-          else
-            raise "invalid internal value type"
-          end
+
+    # Get Spreadsheet representation
+    # @return [Spreadsheet::Workbook] Workbook which can be written with the spreadsheet gem (data_entries only, metadata will be discarded)
+    def to_spreadsheet
+      Serializer::Spreadsheets.new(self).to_spreadsheet
+    end
+
+    # Get Excel representation (alias for to_spreadsheet)
+    # @return [Spreadsheet::Workbook] Workbook which can be written with the spreadsheet gem (data_entries only, metadata will be discarded)
+    def to_xls
+      to_spreadsheet
+    end
+
+    # Get CSV string representation (data_entries only, metadata will be discarded)
+    # @return [String] CSV representation
+    def to_csv
+      Serializer::Spreadsheets.new(self).to_csv
+    end
+
+    # Get OWL-DL in ntriples format
+    # @return [String] N-Triples representation
+    def to_ntriples
+      s = Serializer::Owl.new
+      s.add_dataset(self)
+      s.to_ntriples
+    end
+
+    # Get OWL-DL in RDF/XML format
+    # @return [String] RDF/XML representation
+    def to_rdfxml
+      s = Serializer::Owl.new
+      s.add_dataset(self)
+      s.to_rdfxml
+    end
+
+    # Get name (DC.title) of a feature
+    # @param [String] feature Feature URI
+    # @return [String] Feature title
+    def feature_name(feature)
+      @features[feature][DC.title]
+    end
+
+    def title
+      @metadata[DC.title]
+    end
+
+    # Insert a statement (compound_uri,feature_uri,value)
+    # @example Insert a statement (compound_uri,feature_uri,value)
+    #   dataset.add "http://webservices.in-silico.ch/compound/InChI=1S/C6Cl6/c7-1-2(8)4(10)6(12)5(11)3(1)9", "http://webservices.in-silico.ch/dataset/1/feature/hamster_carcinogenicity", true
+    # @param [String] compound Compound URI
+    # @param [String] feature Feature URI
+    # @param [Boolean,Float] value Feature value
+    def add (compound,feature,value)
+      @compounds << compound unless @compounds.include? compound
+      @features[feature] = {} unless @features[feature]
+      @data_entries[compound] = {} unless @data_entries[compound]
+      @data_entries[compound][feature] = [] unless @data_entries[compound][feature]
+      @data_entries[compound][feature] << value
+    end
+
+    # Add/modify metadata, existing entries will be overwritten
+    # @example
+    #   dataset.add_metadata({DC.title => "any_title", DC.creator => "my_email"})
+    # @param [Hash] metadata Hash mapping predicate_uris to values
+    def add_metadata(metadata)
+      metadata.each { |k,v| @metadata[k] = v }
+    end
+
+    # Add a feature
+    # @param [String] feature Feature URI
+    # @param [Hash] metadata Hash with feature metadata
+    def add_feature(feature,metadata={})
+      @features[feature] = metadata
+    end
+
+    # Add/modify metadata for a feature
+    # @param [String] feature Feature URI
+    # @param [Hash] metadata Hash with feature metadata
+    def add_feature_metadata(feature,metadata)
+      metadata.each { |k,v| @features[feature][k] = v }
+    end
+
+    # Save dataset at the dataset service
+    # - creates a new dataset if uri is not set
+    # - overwrites dataset if uri exists
+    # @return [String] Dataset URI
+    def save
+      # TODO: rewrite feature URI's ??
+      @compounds.uniq!
+      if @uri
+        if (CONFIG[:yaml_hosts].include?(URI.parse(@uri).host))
+          RestClientWrapper.post(@uri,{:content_type => "application/x-yaml"},self.to_yaml)
+        else
+          File.open("ot-post-file.rdf","w+") { |f| f.write(self.to_rdfxml); @path = f.path }
+          task_uri = RestClient.post(@uri, {:file => File.new(@path)},{:accept => "text/uri-list"}).to_s.chomp
+          #task_uri = `curl -X POST -H "Accept:text/uri-list" -F "file=@#{@path};type=application/rdf+xml" http://apps.ideaconsult.net:8080/ambit2/dataset`
+          Task.find(task_uri).wait_for_completion
+          self.uri = RestClientWrapper.get(task_uri,:accept => 'text/uri-list')
        end
-        return nil #missing value
      else
-        raise "value is not an array\n"+
-          "value "+v.to_s+"\n"+
-          "value-class "+v.class.to_s+"\n"+
-          "dataset "+@uri.to_s+"\n"+
-          "compound "+compound.to_s+"\n"+
-          "feature "+feature.to_s+"\n"
+        # create dataset if uri is empty
+        self.uri = RestClientWrapper.post(CONFIG[:services]["opentox-dataset"],{}).to_s.chomp
      end
+      @uri
    end
-    # loads specified feature and removes dirty-flag, loads all features if feature is nil
-    def load_feature_values(feature=nil)
-      if feature
-        raise "feature already loaded" unless @dirty_features.include?(feature)
-        @owl.load_dataset_feature_values(@compounds, @data, [feature])
-        @dirty_features.delete(feature)
+    # Delete dataset at the dataset service
+    def delete
+      RestClientWrapper.delete @uri
+    end
+
+    private
+    # Copy a dataset (rewrites URI)
+    def copy(dataset)
+      @metadata = dataset.metadata
+      @data_entries = dataset.data_entries
+      @compounds = dataset.compounds
+      @features = dataset.features
+      if @uri
+        self.uri = @uri
      else
-        @data = {} unless @data
-        @owl.load_dataset_feature_values(@compounds, @data, @dirty_features)
-        @dirty_features.clear
+        @uri = dataset.metadata[XSD.anyURI]
      end
    end
-
-    # overwrite to yaml:
-    # in case dataset is loaded from owl:
-    # * load all values
-    def to_yaml
-      # loads all features
-      if ((defined? @dirty_features) && @dirty_features.size > 0)
-        load_feature_values
-      end
-      super
+  end
+
+  # Class with special methods for lazar prediction datasets
+  class LazarPrediction < Dataset
+
+    # Find a prediction dataset and load all data.
+    # @param [String] uri Prediction dataset URI
+    # @return [OpenTox::Dataset] Prediction dataset object with all data
+    def self.find(uri)
+      prediction = LazarPrediction.new(uri)
+      prediction.load_all
+      prediction
    end
-
-    # * remove @owl from yaml, not necessary
-    def to_yaml_properties
-      super - ["@owl"]
-    end
+
+    def value(compound)
+      @data_entries[compound.uri].collect{|f,v| v.first if f.match(/prediction/)}.compact.first
    end
-    # saves (changes) as new dataset in dataset service
-    # returns uri
-    # uses to yaml method (which is overwritten)
-    def save
-      OpenTox::RestClientWrapper.post(@@config[:services]["opentox-dataset"],{:content_type => "application/x-yaml"},self.to_yaml).strip
+    def confidence(compound)
+      feature_uri = @data_entries[compound.uri].collect{|f,v| f if f.match(/prediction/)}.compact.first
+      @features[feature_uri][OT.confidence]
    end
+
+    def descriptors(compound)
+      @data_entries[compound.uri].collect{|f,v| @features[f] if f.match(/descriptor/)}.compact if @data_entries[compound.uri]
+    end
+
+    def measured_activities(compound)
+      source = @metadata[OT.hasSource]
+      @data_entries[compound.uri].collect{|f,v| v if f.match(/#{source}/)}.compact.flatten
+    end
+
+    def neighbors(compound)
+      @data_entries[compound.uri].collect{|f,v| @features[f] if f.match(/neighbor/)}.compact
+    end
+
+#    def errors(compound)
+#      features = @data_entries[compound.uri].keys
+#      features.collect{|f| @features[f][OT.error]}.join(" ") if features
+#    end
+  end
 end
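
The 0.0.2 API replaces the OWL-backed accessors with explicit load_*, add_* and to_* methods. The sketch below is not taken from the gem; it only illustrates how the methods shown above might be combined. The compound, feature and prediction-dataset URIs and the CSV file name are placeholders, and a dataset service configured under CONFIG[:services]["opentox-dataset"], the DC/OT helper constants and an OpenTox::Compound wrapper that accepts a URI are assumed.

  require 'rubygems'
  require 'opentox-ruby'   # assumed gem entry point

  # Build a small dataset by hand and store it at the dataset service
  dataset = OpenTox::Dataset.create                              # empty dataset, URI assigned by the service
  dataset.add_metadata({DC.title => "Hamster carcinogenicity", DC.creator => "me@example.org"})
  feature = "http://example.org/feature/hamster_carcinogenicity" # placeholder feature URI
  dataset.add_feature(feature, {DC.title => "hamster_carcinogenicity"})
  dataset.add("http://webservices.in-silico.ch/compound/InChI=1S/C6Cl6/c7-1-2(8)4(10)6(12)5(11)3(1)9",
              feature, true)
  dataset.save                                                   # overwrites the dataset at its URI
  puts dataset.to_csv                                            # data_entries only, metadata is discarded

  # Or create everything from a CSV file in one step
  csv_dataset = OpenTox::Dataset.create_from_csv_file("hamster.csv")  # placeholder file
  puts csv_dataset.feature_type                                  # "classification", "regression", "mixed" or "unknown"

  # Read back a lazar prediction dataset (placeholder URI)
  prediction = OpenTox::LazarPrediction.find("http://example.org/dataset/prediction/1")
  prediction.compounds.each do |compound_uri|
    compound = OpenTox::Compound.new(compound_uri)               # assumes the Compound wrapper accepts a URI
    puts "#{compound_uri}: #{prediction.value(compound)} (confidence: #{prediction.confidence(compound)})"
  end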