# BioInterchange converts non-RDF data formats into RDF.
#
# Convert TSV, XML, GFF3, GVF and other files into RDF triples using
# BioInterchange's command-line tool, its web-services or make use
# of it as a gem in your own Ruby implementation.
module BioInterchange

  ### Global behaviour settings, which can be altered programmatically or via the CLI:

  # If true, then RDF::Graph's "insert" function will be overwritten so that it
  # immediately outputs N-Triples. This reduces memory requirements (since no RDF
  # graph is kept in memory) and improves performance (since no looping through an
  # RDF graph is necessary).
  @@skip_rdf_graph = true

  # Returns the current value of the "skip RDF graph" optimization flag.
  def self.skip_rdf_graph
    @@skip_rdf_graph
  end

  # Custom Exceptions and Errors
  require 'biointerchange/exceptions'

  # Ontologies (besides the ones from the 'rdf' gem)
  require 'biointerchange/gff3o'
  require 'biointerchange/goxref'
  require 'biointerchange/gvf1o'
  require 'biointerchange/sio'
  require 'biointerchange/so'
  require 'biointerchange/sofa'

  # Reader/writer interfaces
  require 'biointerchange/reader'
  require 'biointerchange/writer'

  #
  # TEXT MINING
  #

  # Text mining readers
  require 'biointerchange/textmining/text_mining_reader'
  require 'biointerchange/textmining/pubannos_json_reader'
  require 'biointerchange/textmining/pdfx_xml_reader'

  # Text mining model
  require 'biointerchange/textmining/document'
  require 'biointerchange/textmining/content'
  require 'biointerchange/textmining/content_connection'
  require 'biointerchange/textmining/process'

  # Text mining writers
  require 'biointerchange/textmining/text_mining_rdf_ntriples'

  #
  # GENOMICS
  #

  ### GFF3 ###

  # Reader
  require 'biointerchange/genomics/gff3_reader'

  # Feature base model
  require 'biointerchange/genomics/gff3_pragmas'
  require 'biointerchange/genomics/gff3_feature_set'
  require 'biointerchange/genomics/gff3_feature'

  # Writer
  require 'biointerchange/genomics/gff3_rdf_ntriples'

  ### GVF ###

  # Reader
  require 'biointerchange/genomics/gvf_reader'

  # Feature base model
  require 'biointerchange/genomics/gvf_pragmas'
  require 'biointerchange/genomics/gvf_feature_set'
  require 'biointerchange/genomics/gvf_feature'

  # Writer
  # ...same GFF3 writer

  #
  # ACTUAL COMMAND LINE IMPLEMENTATION
  #

  # Option parsing
  require 'getopt/long'

  # Entry point of the command-line tool.
  #
  # Parses the command-line switches, validates the requested input/output
  # format combination, instantiates the matching reader and writer, and then
  # alternates between deserializing and serializing until the reader reports
  # that nothing is postponed anymore.
  #
  # Exits with status 1 after printing the usage text, on an ArgumentError or
  # on an option parsing error; exits with status 2 on an input format error.
  def self.cli
    # Tracked from the start so that the +ensure+ clause can close whatever was opened.
    input_source = nil
    output_sink = nil

    opt = Getopt::Long.getopts(
      ["--help", "-h", Getopt::BOOLEAN],
      ["--debug", "-d", Getopt::BOOLEAN],  # set debug mode => print stack traces
      ["--no_rdf_graph_optimization", "-n", Getopt::BOOLEAN], # set self.skip_rdf_graph to false
      ["--batchsize", "-b", Getopt::OPTIONAL], # batchsize for readers/writers that support +postpone?+
      ["--input", "-i", Getopt::REQUIRED], # input file format
      ["--rdf", "-r", Getopt::REQUIRED], # output file format
      ["--annotate_name", Getopt::OPTIONAL], # name of resource/tool/person
      ["--annotate_name_id", Getopt::OPTIONAL], # uri of resource/tool/person
      ["--annotate_date", Getopt::OPTIONAL], # date of processing/annotation
      ["--annotate_version", Getopt::OPTIONAL], # version number of resource
      ["--file", "-f", Getopt::OPTIONAL], # file to read, will read from STDIN if not supplied
      ["--out", "-o", Getopt::OPTIONAL], # output file, will out to STDOUT if not supplied
      ["--version", "-v", Getopt::BOOLEAN] # output the version number of the gem and exit
    )

    # Show the usage text when asked for it, or when neither a complete
    # input/output format pair nor the version switch was given.
    if opt['help'] || !((opt['input'] && opt['rdf']) || opt['version'])
      print_usage
      exit 1
    end

    # Print version number and exit:
    if opt['version']
      puts "BioInterchange #{Gem.loaded_specs["biointerchange"].version}"
      exit
    end

    # Turn off optimization, if requested. This will generate an RDF graph in memory and
    # at least double memory requirements and runtime.
    @@skip_rdf_graph = false if opt['no_rdf_graph_optimization']

    # Check if the input/rdf options are supported:
    # NOTE(review): the usage text pairs the formats slightly differently than
    # the combinations accepted here (GFF3 vs. GVF output) -- confirm which is intended.
    case opt['input']
    when 'dbcls.catanns.json', 'uk.ac.man.pdfx'
      unsupported_combination unless opt['rdf'] == 'rdf.bh12.sio'
      # The switches are registered as --annotate_name/--annotate_name_id above,
      # so those are the keys that have to be checked.
      unless opt['annotate_name'] && opt['annotate_name_id']
        raise ArgumentError, 'Require --annotate_name and --annotate_name_id options to specify source of annotations (e.g., a manual annotators name, or software tool name) and their associated URI (e.g., email address, or webaddress).'
      end
    when 'biointerchange.gff3'
      # Okay if matched. No further arguments required.
      unsupported_combination unless opt['rdf'] == 'rdf.biointerchange.gff3'
    when 'biointerchange.gvf'
      # Okay if matched. No further arguments required.
      unsupported_combination unless opt['rdf'] == 'rdf.biointerchange.gff3' || opt['rdf'] == 'rdf.biointerchange.gvf'
    else
      unsupported_combination
    end

    if opt['batchsize']
      # +to_s+ guards against a non-string value (presumably possible when the
      # optional switch is given without an argument -- TODO confirm); \A and \z
      # anchor the whole value rather than a single line of it.
      wrong_type('batchsize', 'a positive integer') unless opt['batchsize'].to_s =~ /\A[1-9][0-9]*\z/
      opt['batchsize'] = opt['batchsize'].to_i
    end

    # Generate model from file (deserialization).
    # Note: when-clauses are lexicographically ordered.
    reader = case opt['input']
             when 'biointerchange.gff3'
               BioInterchange::Genomics::GFF3Reader.new(opt['annotate_name'], opt['annotate_name_id'], opt['annotate_date'], opt['batchsize'])
             when 'biointerchange.gvf'
               BioInterchange::Genomics::GVFReader.new(opt['annotate_name'], opt['annotate_name_id'], opt['annotate_date'], opt['batchsize'])
             when 'dbcls.catanns.json'
               # The resource version comes from --annotate_version; 'version' is the
               # boolean "print gem version" switch and is never set at this point.
               BioInterchange::TextMining::PubannosJsonReader.new(opt['annotate_name'], opt['annotate_name_id'], opt['annotate_date'], BioInterchange::TextMining::Process::UNSPECIFIED, opt['annotate_version'])
             when 'uk.ac.man.pdfx'
               BioInterchange::TextMining::PdfxXmlReader.new(opt['annotate_name'], opt['annotate_name_id'], opt['annotate_date'], BioInterchange::TextMining::Process::UNSPECIFIED, opt['annotate_version'])
             end

    input_source = opt['file'] ? File.new(opt['file'], 'r') : STDIN
    output_sink = opt['out'] ? File.new(opt['out'], 'w') : STDOUT

    # Generate rdf from model (serialization).
    # Note: when-clauses are lexicographically ordered.
    writer = case opt['rdf']
             when 'rdf.bh12.sio'
               BioInterchange::TextMining::RDFWriter.new(output_sink)
             when 'rdf.biointerchange.gff3', 'rdf.biointerchange.gvf'
               BioInterchange::Genomics::RDFWriter.new(output_sink)
             end

    # Run at least once; repeat for as long as the reader reports postponed work.
    begin
      model = reader.deserialize(input_source)
      writer.serialize(model)
    end while reader.postponed?
  rescue ArgumentError => e
    $stderr.puts e.message
    # +opt+ is nil when the error was raised before option parsing completed.
    $stderr.puts e.backtrace if opt && opt['debug']
    exit 1
  rescue Getopt::Long::Error => e
    # No backtrace here: the parsed options (and thus --debug) are not available.
    $stderr.puts e.message
    exit 1
  rescue BioInterchange::Exceptions::InputFormatError => e
    $stderr.puts e.message
    $stderr.puts e.backtrace if opt && opt['debug']
    exit 2
  ensure
    # Close only the files opened here; STDIN/STDOUT are left untouched.
    input_source.close if input_source && !input_source.equal?(STDIN) && !input_source.closed?
    output_sink.close if output_sink && !output_sink.equal?(STDOUT) && !output_sink.closed?
  end

  #
  # Helper functions
  #

  # Prints the command-line usage text to standard output.
  #
  # NOTE(review): several lines read as if placeholder names after the
  # switches (e.g. after "-i", "-r", "-f", "-o") were lost -- confirm against
  # the intended help text.
  def self.print_usage
    puts "Usage: ruby #{$0} -i -r [options]"
    puts ''
    puts 'Supported input formats (--input /-i ):'
    puts ' biointerchange.gff3 : GFF3'
    puts ' biointerchange.gvf : GVF'
    puts ' dbcls.catanns.json : PubAnnotation JSON'
    puts ' uk.ac.man.pdfx : PDFx XML'
    puts ''
    puts 'Supported output formats (--rdf /-r )'
    puts ' rdf.biointerchange.gff3 : RDF N-Triples for the following input'
    puts ' biointerchange.gff3'
    puts ' rdf.biointerchange.gvf : RDF N-Triples for the following input'
    puts ' biointerchange.gff3'
    puts ' biointerchange.gvf'
    puts ' rdf.bh12.sio : RDF N-Triples for the following inputs'
    puts ' dbcls.catanns.json'
    puts ' uk.ac.man.pdfx'
    puts ''
    puts 'I/O options:'
    puts ' -f /--file : file to read; STDIN used if not supplied'
    puts ' -o /--out : output file; STDOUT used if not supplied'
    puts ''
    puts 'Input-/RDF-format specific options:'
    puts ' Input: dbcls.catanns.json, uk.ac.man.pdfx'
    puts ' Output: rdf.bh12.sio'
    puts ' Options:'
    puts ' --annotate_date : date of processing/annotation (optional)'
    puts ' --annotate_version : version number of resource (optional)'
    puts ' --annotate_name : name of resource/tool/person (required)'
    puts ' --annotate_name_id : URI of resource/tool/person (required)'
    puts ''
    puts 'Input-/RDF-format specific options:'
    puts ' Input: biointerchange.gff3 or biointerchange.gvf'
    puts ' Output: rdf.biointerchange.gff3 or rdf.biointerchange.gvf'
    puts ' Options:'
    puts ' -b /--batchsize : process features in batches of the given size (optional)'
    # The switches below are the ones the option parser actually accepts and
    # whose values are handed to the GFF3/GVF readers.
    puts ' --annotate_date : date when the GFF3/GVF file was created (optional)'
    puts ' --annotate_name : name of the GFF3/GVF file creator (optional)'
    puts ' --annotate_name_id : email address of the GFF3/GVF file creator (optional)'
    puts ''
    puts 'Other options:'
    puts ' -v / --version : print the Gem\'s version number and exit'
    puts ' -d / --debug : turn on debugging output (for stacktraces)'
    puts ' -h --help : this message'
  end

  # Returns the values of several named parameters.
  #
  # An entry of +parameters+ that is an Array is treated as a call
  # description: its first element is invoked via +call+ with the (recursively
  # resolved) values of the remaining elements as arguments.
  #
  # +map+:: a map of named parameters and their values
  # +parameters+:: the names of the parameter values we are interested in
  def self.get_parameters(map, parameters)
    parameters.map { |parameter|
      if parameter.instance_of? Array then
        parameter[0].call(*BioInterchange::get_parameters(map, parameter[1..-1]))
      else
        map[parameter]
      end
    }
  end

  # Returns a "safe" version of a label that can be used as a Ruby method name.
  #
  # NOTE(review): two details look unintended but are deliberately left as they
  # are, since previously generated names may depend on the current output --
  # confirm before changing:
  # * <tt>'-.</tt> inside the character class is a range (apostrophe through
  #   full stop) that also covers parentheses, so parentheses are already turned
  #   into underscores before the parenthesis-removing substitution runs.
  # * <tt>"a_#{$1}"</tt> is interpolated before +sub+ performs its match, so
  #   <tt>$1</tt> does not refer to the digits captured by that +sub+.
  #
  # +label+:: string that should be converted into a "safe" string that can be used as a Ruby method name
  def self.make_safe_label(label)
    label.gsub(/[ '-.<>\/]/, '_').gsub(/\([^\)]*?\)/, '').sub(/^(\d+)/, "a_#{$1}").gsub(/^_+|_+$/, '').gsub(/_+/, '_')
  end

  # NOTE: +private+ does not change the visibility of methods defined with
  # +def self.+; the helpers below therefore remain publicly callable.
  private

  # Raises an ArgumentError stating that the requested input/output format
  # combination is not supported.
  def self.unsupported_combination
    raise ArgumentError, 'This input/output format combination is not supported.'
  end

  # Raises an ArgumentError stating that a parameter has the wrong type.
  #
  # +parameter+:: name of the offending parameter
  # +expected_type+:: human readable description of the expected value
  def self.wrong_type(parameter, expected_type)
    raise ArgumentError, "The parameter '#{parameter}' needs to be #{expected_type}."
  end

end

# Overwrite RDF::Graph implementation, in case we do not want to keep
# the complete graph in memory. If the implementing writer does not
# set an output stream via +fast_ostream+, then fall back to the original
# implementation.
module RDF
  class Graph

    # DO NOT keep old insert implementation due to infinite recursion caused by module loading dependencies!
    # alias_method :graph_building_insert, :insert

    # Set an output stream for writing in +insert+.
    #
    # +ostream+:: Output stream that is populated by +insert+, if optimization can be carried out.
    def fast_ostream(ostream)
      @ostream = ostream
    end

    # Alternative implementation to +insert+, which can immediately output N-Triples instead
    # of building an in-memory graph first. The statement is written to the stream only
    # when the optimization is enabled and a stream was set via +fast_ostream+; otherwise
    # it is handed to +insert_statement+.
    #
    # +statement+:: RDF statement that should be serialized.
    def insert(statement)
      if BioInterchange::skip_rdf_graph and @ostream then
        @ostream.puts(statement.to_ntriples)
      else
        insert_statement(statement)
      end
    end

  end
end