sptr.rb in bio-1.4.2

- old
+ new

@@ -48,11 +48,22 @@
   # returns a Hash of the ID line.
   #
   # returns a content (Int or String) of the ID line by a given key.
   # Hash keys: ['ENTRY_NAME', 'DATA_CLASS', 'MODECULE_TYPE', 'SEQUENCE_LENGTH']
   #
-  # === ID Line
+  # === ID Line (since UniProtKB release 9.0 of 31-Oct-2006)
+  #   ID   P53_HUMAN               Reviewed;         393 AA.
+  #   #"ID  #{ENTRY_NAME} #{DATA_CLASS}; #{SEQUENCE_LENGTH}."
+  #
+  # === Examples
+  #   obj.id_line  #=> {"ENTRY_NAME"=>"P53_HUMAN", "DATA_CLASS"=>"Reviewed", 
+  #                     "SEQUENCE_LENGTH"=>393, "MOLECULE_TYPE"=>nil}
+  #
+  #   obj.id_line('ENTRY_NAME') #=> "P53_HUMAN"
+  #
+  # 
+  # === ID Line (older style)
   #   ID   P53_HUMAN      STANDARD;      PRT;   393 AA.
   #   #"ID  #{ENTRY_NAME} #{DATA_CLASS}; #{MOLECULE_TYPE}; #{SEQUENCE_LENGTH}."
   #
   # === Examples
   #   obj.id_line  #=> {"ENTRY_NAME"=>"P53_HUMAN", "DATA_CLASS"=>"STANDARD", 
@@ -63,15 +74,24 @@
   def id_line(key = nil)
     return id_line[key] if key
     return @data['ID'] if @data['ID']
 
     part = @orig['ID'].split(/ +/)         
+    if part[4].to_s.chomp == 'AA.' then
+      # after UniProtKB release 9.0 of 31-Oct-2006
+      # (http://www.uniprot.org/docs/sp_news.htm)
+      molecule_type   = nil
+      sequence_length = part[3].to_i
+    else
+      molecule_type   = part[3].sub(/;/,'')
+      sequence_length = part[4].to_i
+    end
     @data['ID'] = {
       'ENTRY_NAME'      => part[1],
       'DATA_CLASS'      => part[2].sub(/;/,''),
-      'MOLECULE_TYPE'   => part[3].sub(/;/,''),
-      'SEQUENCE_LENGTH' => part[4].to_i 
+      'MOLECULE_TYPE'   => molecule_type,
+      'SEQUENCE_LENGTH' => sequence_length
     }
   end
 
 
   # returns a ENTRY_NAME in the ID line. 
@@ -109,16 +129,31 @@
 
 
   # returns a Hash of information in the DT lines.
   #  hash keys: 
   #    ['created', 'sequence', 'annotation']
+  #--
   #  also Symbols acceptable (ASAP):
   #    [:created, :sequence, :annotation]
+  #++
   #
-  # returns a String of information in the DT lines by a given key..
+  # Since UniProtKB release 7.0 of 07-Feb-2006, the DT line format is
+  # changed, and the word "annotation" is no longer used in DT lines.
+  # Despite the change, the word "annotation" is still used for keeping
+  # compatibility.
   #
+  # returns a String of information in the DT lines by a given key.
+  #
   # === DT Line; date (3/entry)
+  #   DT DD-MMM-YYY (integrated into UniProtKB/XXXXX.)
+  #   DT DD-MMM-YYY (sequence version NN)
+  #   DT DD-MMM-YYY (entry version NN)
+  #
+  # The format have been changed in UniProtKB release 7.0 of 07-Feb-2006.
+  # Below is the older format.
+  #
+  # === Old format of DT Line; date (3/entry)
   #   DT DD-MMM-YYY (rel. NN, Created)
   #   DT DD-MMM-YYY (rel. NN, Last sequence update)
   #   DT DD-MMM-YYY (rel. NN, Last annotation update)
   def dt(key = nil)
     return dt[key] if key
@@ -131,40 +166,168 @@
       'annotation' => part[2].sub(/\w{2}   /,'').strip
     }
   end
 
 
+  # (private) parses DE line (description lines)
+  # since UniProtKB release 14.0 of 22-Jul-2008
+  #
+  # Return array containing array.
+  #
+  # http://www.uniprot.org/docs/sp_news.htm
+  def parse_DE_line_rel14(str)
+    # Retruns if it is not the new format since Rel.14
+    return nil unless /^DE   (RecName|AltName|SubName)\: / =~ str
+    ret = []
+    cur = nil
+    str.each_line do |line|
+      case line
+      when /^DE   (Includes|Contains)\: *$/
+        cur = [ $1 ]
+        ret.push cur
+        cur = nil
+        #subcat_and_desc = nil
+        next
+      when /^DE   *(RecName|AltName|SubName)\: +(.*)/
+        category = $1
+        subcat_and_desc = $2
+        cur = [ category ]
+        ret.push cur
+      when /^DE   *(Flags)\: +(.*)/
+        category = $1
+        desc = $2
+        flags = desc.strip.split(/\s*\;\s*/) || []
+        cur = [ category, flags ]
+        ret.push cur
+        cur = nil
+        #subcat_and_desc = nil
+        next
+      when /^DE   *(.*)/
+        subcat_and_desc = $1
+      else
+        warn "Warning: skipped DE line in unknown format: #{line.inspect}"
+        #subcat_and_desc = nil
+        next
+      end
+      case subcat_and_desc
+      when nil
+        # does nothing
+      when /\A([^\=]+)\=(.*)/
+        subcat = $1
+        desc = $2
+        desc.sub!(/\;\s*\z/, '')
+        unless cur
+          warn "Warning: unknown category in DE line: #{line.inspect}"
+          cur = [ '' ]
+          ret.push cur
+        end
+        cur.push [ subcat, desc ]
+      else
+        warn "Warning: skipped DE line description in unknown format: #{line.inspect}"
+      end
+    end
+    ret
+  end
+  private :parse_DE_line_rel14
+
   # returns the proposed official name of the protein.
+  # Returns a String.
+  #
+  # Since UniProtKB release 14.0 of 22-Jul-2008, the DE line format have
+  # been changed. The method returns the full name which is taken from
+  # "RecName: Full=" or "SubName: Full=" line normally in the beginning of
+  # the DE lines. 
+  # Unlike parser for old format, no special treatments for fragment or
+  # precursor.
+  #
+  # For old format, the method parses the DE lines and returns the protein
+  # name as a String.
   # 
   # === DE Line; description (>=1)
   #  "DE #{OFFICIAL_NAME} (#{SYNONYM})"
   #  "DE #{OFFICIAL_NAME} (#{SYNONYM}) [CONTEINS: #1; #2]."
   #  OFFICIAL_NAME  1/entry
   #  SYNONYM        >=0
   #  CONTEINS       >=0
   def protein_name
-    name = ""
-    if de_line = fetch('DE') then
-      str = de_line[/^[^\[]*/] # everything preceding the first [ (the "contains" part)
-      name = str[/^[^(]*/].strip
-      name << ' (Fragment)' if str =~ /fragment/i
+    @data['DE'] ||= parse_DE_line_rel14(get('DE'))
+    parsed_de_line = @data['DE']
+    if parsed_de_line then
+      # since UniProtKB release 14.0 of 22-Jul-2008
+      name = nil
+      parsed_de_line.each do |a|
+        case a[0]
+        when 'RecName', 'SubName'
+          if name_pair = a[1..-1].find { |b| b[0] == 'Full' } then
+            name = name_pair[1]
+            break
+          end
+        end
+      end
+      name = name.to_s
+    else
+      # old format (before Rel. 13.x)
+      name = ""
+      if de_line = fetch('DE') then
+        str = de_line[/^[^\[]*/] # everything preceding the first [ (the "contains" part)
+        name = str[/^[^(]*/].strip
+        name << ' (Fragment)' if str =~ /fragment/i
+      end
     end
     return name
   end
 
 
-  # returns an array of synonyms (unofficial names).
+  # returns synonyms (unofficial and/or alternative names).
+  # Returns an Array containing String objects.
   #
+  # Since UniProtKB release 14.0 of 22-Jul-2008, the DE line format have
+  # been changed. The method returns the full or short names which are
+  # taken from "RecName: Short=", "RecName: EC=", and AltName lines,
+  # except after "Contains:" or "Includes:".
+  # For keeping compatibility with old format parser, "RecName: EC=N.N.N.N"
+  # is reported as "EC N.N.N.N".
+  # In addition, to prevent confusion, "Allergen=" and "CD_antigen=" 
+  # prefixes are added for the corresponding fields.
+  #
+  # For old format, the method parses the DE lines and returns synonyms.
   # synonyms are each placed in () following the official name on the DE line.
   def synonyms
     ary = Array.new
-    if de_line = fetch('DE') then
-      line = de_line.sub(/\[.*\]/,'') # ignore stuff between [ and ].  That's the "contains" part
+    @data['DE'] ||= parse_DE_line_rel14(get('DE'))
+    parsed_de_line = @data['DE']
+    if parsed_de_line then
+      # since UniProtKB release 14.0 of 22-Jul-2008
+      parsed_de_line.each do |a|
+        case a[0]
+        when 'Includes', 'Contains'
+          break #the each loop
+        when 'RecName', 'SubName', 'AltName'
+          a[1..-1].each do |b|
+            if name = b[1] and b[1] != self.protein_name then
+              case b[0]
+              when 'EC'
+                name = "EC " + b[1]
+              when 'Allergen', 'CD_antigen'
+                name = b[0] + '=' + b[1]
+              else
+                name = b[1]
+              end
+              ary.push name
+            end
+          end
+        end #case a[0]
+      end #parsed_de_line.each
+    else
+      # old format (before Rel. 13.x)
+      if de_line = fetch('DE') then
+        line = de_line.sub(/\[.*\]/,'') # ignore stuff between [ and ].  That's the "contains" part
       line.scan(/\([^)]+/) do |synonym| 
         unless synonym =~ /fragment/i then 
           ary << synonym[1..-1].strip # index to remove the leading (  
         end
+        end
       end
     end
     return ary
   end
 
@@ -917,28 +1080,37 @@
       } 
     }[0]
   end
   private :cc_subcellular_location
 
-  
-  # CC   -!- WEB RESOURCE: NAME=ResourceName[; NOTE=FreeText][; URL=WWWAddress].  
+
+  #--
+  # Since UniProtKB release 12.2 of 11-Sep-2007:
+  # CC   -!- WEB RESOURCE: Name=ResourceName[; Note=FreeText][; URL=WWWAddress].  # Old format:
+  # CC   -!- WEB RESOURCE: NAME=ResourceName[; NOTE=FreeText][; URL=WWWAddress].
+  #++
+
   def cc_web_resource(data)
     data.map {|x|
-      entry = {'NAME' => nil, 'NOTE' => nil, 'URL' => nil}
+      entry = {'Name' => nil, 'Note' => nil, 'URL' => nil}
       x.split(';').each do |y|
         case y
-        when /NAME=(.+)/
-          entry['NAME'] = $1.strip
-        when /NOTE=(.+)/
-          entry['NOTE'] = $1.strip
-        when /URL="(.+)"/
+        when /(Name|Note)\=(.+)/
+          key = $1
+          val = $2.strip
+          entry[key] = val
+        when /(NAME|NOTE)\=(.+)/
+          key = $1.downcase.capitalize
+          val = $2.strip
+          entry[key] = val
+        when /URL\=\"(.+)\"/
           entry['URL'] = $1.strip
         end
       end
       entry
     }
   end
-  
+  private :cc_web_resource
 
   # returns databases cross-references in the DR lines.
   # * Bio::SPTR#dr  -> Hash w/in Array
   #
   # === DR Line; defabases cross-reference (>=0)