require 'open-uri' require 'rexml/document' require 'rexml/streamlistener' $ANNOTS = [] class GIListener include REXML include StreamListener attr_accessor :annotations def initialize @get_title = false @annotations = [] end def tag_start(name, attributes) #puts "NAME" + name #p attributes if name == "Item" && attributes["Name"] == "Title" @get_title = true end end def text(text) #puts "TEXT: " + text + @get_title.to_s if @get_title #puts "GETTING TITLE!" @annotations.push text.chomp @get_title = false end end end class GI BATCH_SIZE = 500 # takes an array of gi numbers and returns an array of annotation # This allows use of the batch search mode on NCBI # returns nil if no internet connection def self.gi2annot(list_of_gi_numbers) annots = [] loop do batch = list_of_gi_numbers.slice!(0..BATCH_SIZE) if batch.size == 0 then break end string = batch.join(",") url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=protein&retmode=xml&id=#{string}" #puts url begin open(url) do |handle| annots.push( *(parse_etool_output(handle)) ) end rescue SocketError return nil end end annots end protected # Returns a list of Annotation strings def self.parse_etool_output(handle) listener = GIListener.new parser = REXML::Parsers::StreamParser.new(handle, listener) parser.parse listener.annotations end end =begin 24115498 NP_710008 chaperonin GroEL [Shigella flexneri 2a str. 301] gi|24115498|ref|NP_710008.1|[24115498] 24115498 2002/10/16 2006/04/03 512 198214 live 434011 CAA24741 unnamed protein product [Escherichia coli] gi|434011|emb|CAA24741.1|[434011] 434011 1983/12/06 2005/04/18 0 562 live =end