lib/bio/db/embl/sptr.rb in bio-1.4.1 vs lib/bio/db/embl/sptr.rb in bio-1.4.2

- old
+ new

@@ -48,11 +48,22 @@ # returns a Hash of the ID line. # # returns a content (Int or String) of the ID line by a given key. # Hash keys: ['ENTRY_NAME', 'DATA_CLASS', 'MODECULE_TYPE', 'SEQUENCE_LENGTH'] # - # === ID Line + # === ID Line (since UniProtKB release 9.0 of 31-Oct-2006) + # ID P53_HUMAN Reviewed; 393 AA. + # #"ID #{ENTRY_NAME} #{DATA_CLASS}; #{SEQUENCE_LENGTH}." + # + # === Examples + # obj.id_line #=> {"ENTRY_NAME"=>"P53_HUMAN", "DATA_CLASS"=>"Reviewed", + # "SEQUENCE_LENGTH"=>393, "MOLECULE_TYPE"=>nil} + # + # obj.id_line('ENTRY_NAME') #=> "P53_HUMAN" + # + # + # === ID Line (older style) # ID P53_HUMAN STANDARD; PRT; 393 AA. # #"ID #{ENTRY_NAME} #{DATA_CLASS}; #{MOLECULE_TYPE}; #{SEQUENCE_LENGTH}." # # === Examples # obj.id_line #=> {"ENTRY_NAME"=>"P53_HUMAN", "DATA_CLASS"=>"STANDARD", @@ -63,15 +74,24 @@ def id_line(key = nil) return id_line[key] if key return @data['ID'] if @data['ID'] part = @orig['ID'].split(/ +/) + if part[4].to_s.chomp == 'AA.' then + # after UniProtKB release 9.0 of 31-Oct-2006 + # (http://www.uniprot.org/docs/sp_news.htm) + molecule_type = nil + sequence_length = part[3].to_i + else + molecule_type = part[3].sub(/;/,'') + sequence_length = part[4].to_i + end @data['ID'] = { 'ENTRY_NAME' => part[1], 'DATA_CLASS' => part[2].sub(/;/,''), - 'MOLECULE_TYPE' => part[3].sub(/;/,''), - 'SEQUENCE_LENGTH' => part[4].to_i + 'MOLECULE_TYPE' => molecule_type, + 'SEQUENCE_LENGTH' => sequence_length } end # returns a ENTRY_NAME in the ID line. @@ -109,16 +129,31 @@ # returns a Hash of information in the DT lines. # hash keys: # ['created', 'sequence', 'annotation'] + #-- # also Symbols acceptable (ASAP): # [:created, :sequence, :annotation] + #++ # - # returns a String of information in the DT lines by a given key.. + # Since UniProtKB release 7.0 of 07-Feb-2006, the DT line format is + # changed, and the word "annotation" is no longer used in DT lines. + # Despite the change, the word "annotation" is still used for keeping + # compatibility. # + # returns a String of information in the DT lines by a given key. + # # === DT Line; date (3/entry) + # DT DD-MMM-YYY (integrated into UniProtKB/XXXXX.) + # DT DD-MMM-YYY (sequence version NN) + # DT DD-MMM-YYY (entry version NN) + # + # The format have been changed in UniProtKB release 7.0 of 07-Feb-2006. + # Below is the older format. + # + # === Old format of DT Line; date (3/entry) # DT DD-MMM-YYY (rel. NN, Created) # DT DD-MMM-YYY (rel. NN, Last sequence update) # DT DD-MMM-YYY (rel. NN, Last annotation update) def dt(key = nil) return dt[key] if key @@ -131,40 +166,168 @@ 'annotation' => part[2].sub(/\w{2} /,'').strip } end + # (private) parses DE line (description lines) + # since UniProtKB release 14.0 of 22-Jul-2008 + # + # Return array containing array. + # + # http://www.uniprot.org/docs/sp_news.htm + def parse_DE_line_rel14(str) + # Retruns if it is not the new format since Rel.14 + return nil unless /^DE (RecName|AltName|SubName)\: / =~ str + ret = [] + cur = nil + str.each_line do |line| + case line + when /^DE (Includes|Contains)\: *$/ + cur = [ $1 ] + ret.push cur + cur = nil + #subcat_and_desc = nil + next + when /^DE *(RecName|AltName|SubName)\: +(.*)/ + category = $1 + subcat_and_desc = $2 + cur = [ category ] + ret.push cur + when /^DE *(Flags)\: +(.*)/ + category = $1 + desc = $2 + flags = desc.strip.split(/\s*\;\s*/) || [] + cur = [ category, flags ] + ret.push cur + cur = nil + #subcat_and_desc = nil + next + when /^DE *(.*)/ + subcat_and_desc = $1 + else + warn "Warning: skipped DE line in unknown format: #{line.inspect}" + #subcat_and_desc = nil + next + end + case subcat_and_desc + when nil + # does nothing + when /\A([^\=]+)\=(.*)/ + subcat = $1 + desc = $2 + desc.sub!(/\;\s*\z/, '') + unless cur + warn "Warning: unknown category in DE line: #{line.inspect}" + cur = [ '' ] + ret.push cur + end + cur.push [ subcat, desc ] + else + warn "Warning: skipped DE line description in unknown format: #{line.inspect}" + end + end + ret + end + private :parse_DE_line_rel14 + # returns the proposed official name of the protein. + # Returns a String. + # + # Since UniProtKB release 14.0 of 22-Jul-2008, the DE line format have + # been changed. The method returns the full name which is taken from + # "RecName: Full=" or "SubName: Full=" line normally in the beginning of + # the DE lines. + # Unlike parser for old format, no special treatments for fragment or + # precursor. + # + # For old format, the method parses the DE lines and returns the protein + # name as a String. # # === DE Line; description (>=1) # "DE #{OFFICIAL_NAME} (#{SYNONYM})" # "DE #{OFFICIAL_NAME} (#{SYNONYM}) [CONTEINS: #1; #2]." # OFFICIAL_NAME 1/entry # SYNONYM >=0 # CONTEINS >=0 def protein_name - name = "" - if de_line = fetch('DE') then - str = de_line[/^[^\[]*/] # everything preceding the first [ (the "contains" part) - name = str[/^[^(]*/].strip - name << ' (Fragment)' if str =~ /fragment/i + @data['DE'] ||= parse_DE_line_rel14(get('DE')) + parsed_de_line = @data['DE'] + if parsed_de_line then + # since UniProtKB release 14.0 of 22-Jul-2008 + name = nil + parsed_de_line.each do |a| + case a[0] + when 'RecName', 'SubName' + if name_pair = a[1..-1].find { |b| b[0] == 'Full' } then + name = name_pair[1] + break + end + end + end + name = name.to_s + else + # old format (before Rel. 13.x) + name = "" + if de_line = fetch('DE') then + str = de_line[/^[^\[]*/] # everything preceding the first [ (the "contains" part) + name = str[/^[^(]*/].strip + name << ' (Fragment)' if str =~ /fragment/i + end end return name end - # returns an array of synonyms (unofficial names). + # returns synonyms (unofficial and/or alternative names). + # Returns an Array containing String objects. # + # Since UniProtKB release 14.0 of 22-Jul-2008, the DE line format have + # been changed. The method returns the full or short names which are + # taken from "RecName: Short=", "RecName: EC=", and AltName lines, + # except after "Contains:" or "Includes:". + # For keeping compatibility with old format parser, "RecName: EC=N.N.N.N" + # is reported as "EC N.N.N.N". + # In addition, to prevent confusion, "Allergen=" and "CD_antigen=" + # prefixes are added for the corresponding fields. + # + # For old format, the method parses the DE lines and returns synonyms. # synonyms are each placed in () following the official name on the DE line. def synonyms ary = Array.new - if de_line = fetch('DE') then - line = de_line.sub(/\[.*\]/,'') # ignore stuff between [ and ]. That's the "contains" part + @data['DE'] ||= parse_DE_line_rel14(get('DE')) + parsed_de_line = @data['DE'] + if parsed_de_line then + # since UniProtKB release 14.0 of 22-Jul-2008 + parsed_de_line.each do |a| + case a[0] + when 'Includes', 'Contains' + break #the each loop + when 'RecName', 'SubName', 'AltName' + a[1..-1].each do |b| + if name = b[1] and b[1] != self.protein_name then + case b[0] + when 'EC' + name = "EC " + b[1] + when 'Allergen', 'CD_antigen' + name = b[0] + '=' + b[1] + else + name = b[1] + end + ary.push name + end + end + end #case a[0] + end #parsed_de_line.each + else + # old format (before Rel. 13.x) + if de_line = fetch('DE') then + line = de_line.sub(/\[.*\]/,'') # ignore stuff between [ and ]. That's the "contains" part line.scan(/\([^)]+/) do |synonym| unless synonym =~ /fragment/i then ary << synonym[1..-1].strip # index to remove the leading ( end + end end end return ary end @@ -917,28 +1080,37 @@ } }[0] end private :cc_subcellular_location - - # CC -!- WEB RESOURCE: NAME=ResourceName[; NOTE=FreeText][; URL=WWWAddress]. + + #-- + # Since UniProtKB release 12.2 of 11-Sep-2007: + # CC -!- WEB RESOURCE: Name=ResourceName[; Note=FreeText][; URL=WWWAddress]. # Old format: + # CC -!- WEB RESOURCE: NAME=ResourceName[; NOTE=FreeText][; URL=WWWAddress]. + #++ + def cc_web_resource(data) data.map {|x| - entry = {'NAME' => nil, 'NOTE' => nil, 'URL' => nil} + entry = {'Name' => nil, 'Note' => nil, 'URL' => nil} x.split(';').each do |y| case y - when /NAME=(.+)/ - entry['NAME'] = $1.strip - when /NOTE=(.+)/ - entry['NOTE'] = $1.strip - when /URL="(.+)"/ + when /(Name|Note)\=(.+)/ + key = $1 + val = $2.strip + entry[key] = val + when /(NAME|NOTE)\=(.+)/ + key = $1.downcase.capitalize + val = $2.strip + entry[key] = val + when /URL\=\"(.+)\"/ entry['URL'] = $1.strip end end entry } end - + private :cc_web_resource # returns databases cross-references in the DR lines. # * Bio::SPTR#dr -> Hash w/in Array # # === DR Line; defabases cross-reference (>=0)