lib/bio/db/embl/uniprotkb.rb in bio-2.0.3 vs lib/bio/db/embl/uniprotkb.rb in bio-2.0.4

- old
+ new

# Returns the parsed DE line: an Array (for the new format since rel 14)
# or a String (for the old format before rel 14).
#
# The value kept in @data['DE'] is returned directly when already present,
# so the DE line is parsed at most once per entry.
def de
  cached = @data['DE']
  return cached if cached

  parsed = parse_DE_line_rel14(get('DE'))
  if parsed.kind_of?(Array)
    # new format since rel14: keep the structured result
    @data['DE'] ||= parsed
  else
    # old format: defer to the superclass implementation.
    # NOTE(review): this relies on the superclass storing its result in
    # @data['DE'], since the return value of super is not used here -- confirm.
    super
  end
  @data['DE']
end
# Returns the feature table parsed from the FT lines.
#
# Without an argument, returns a Hash that maps each feature name to an
# Array of feature Hashes; the result is memoized in @data['FT'].
# With +feature_key+, returns ft[feature_key], i.e. only the features
# stored under that name.
#
# The FT line format changed in UniProt release 2019_11. The format is
# detected by looking, among the first FT lines (indexes 0..11), for a
# feature line that is immediately followed by a "/qualifier" line. When
# such a pair is found the 2019_11 parser is used, otherwise the legacy one.
def ft(feature_key = nil)
  return ft[feature_key] if feature_key
  return @data['FT'] if @data['FT']

  ftlines = get('FT').split("\n")

  # A missing line (nil) simply does not match, so short tables are safe.
  fmt_2019_11 = (0..10).any? do |i|
    /^FT +([^\s]+) +(([^\s]+)\:)?([\<\?]?[0-9]+|\?)(?:\.\.([\>\?]?[0-9]+|\?))?\s*$/ =~ ftlines[i] &&
      /^FT +\/([^\s\=]+)(?:\=(\")?(.+)(\")?)?\s*$/ =~ ftlines[i + 1]
  end

  @data['FT'] = if fmt_2019_11
                  ft_2019_11_parser(ftlines)
                else
                  ft_legacy_parser(ftlines)
                end
end

# FT parser since UniProt release 2019_11
# https://www.uniprot.org/release-notes/2019-12-18-release#text%5Fft
#
# +ftlines+ is an Array of FT lines (one String per line).
#
# Returns a Hash mapping each feature name to an Array of Hashes with keys:
# * 'From', 'To'  -- Integer endpoints; '<', '>' and '?' markers and an
#   "ISOFORM-ID:" prefix are dropped, and an unknown or absent endpoint
#   becomes 0.
# * 'diff'        -- [original, changed] residues for VARSPLIC, VARIANT,
#   VAR_SEQ and CONFLICT features, otherwise [].
# * 'original'    -- the raw [name, from, to, qualifiers] record.
# * one entry per qualifier (e.g. 'note', 'evidence', 'id'); a qualifier
#   written without "=value" is stored with a nil value. 'FTId' mirrors 'id'.
#
# Raises RuntimeError ("Invalid FT Lines...") when the lines cannot be parsed.
def ft_2019_11_parser(ftlines)
  table = []
  cur_ft = nil
  cont = false # true while a quoted qualifier value is still open
  begin
    ftlines.each do |line|
      if cont && /^FT {19}/ =~ line
        # Continuation of a quoted value. This is tested before the feature
        # pattern so that wrapped text such as "isoform 2" is not mistaken
        # for a new feature line.
        str = Regexp.last_match.post_match.rstrip
        orig = cur_ft[3][-1][1].to_s
        if orig.size > 0 && orig[-1] != ' ' &&
           str.length > 0 && str[0] != ' '
          orig.concat ' '
        end
        orig.concat str
        cur_ft[3][-1][1] = orig
        if orig[-1] == "\""
          orig.chop!
          cont = false
        end
      elsif /^FT +([^\s]+) +(([^\s]+)\:)?([\<\?]?[0-9]+|\?)(?:\.\.([\>\?]?[0-9]+|\?))?\s*$/ =~ line
        cur_ft = [$1.to_s,      # Feature Name
                  "#{$2}#{$4}", # From (with optional "ISOFORM-ID:" prefix)
                  $5.to_s,      # To
                  []]           # Qualifiers
        table.push cur_ft
        cont = false
      elsif /^FT +\/([^\s\=]+)(?:\=(\")?(.+))?\s*$/ =~ line
        key = $1
        quoted = ($2 == "\"")
        val = $3
        val = val.rstrip if val # nil when the qualifier has no "=value"
        cur_ft[3].push [key, val]
        cont = false
        if quoted
          if val[-1] == "\""
            val.chop!
          else
            cont = true
          end
        end
      else
        raise "FT parse error: #{line.inspect}"
      end
    end

    features = {}
    table.each do |feature|
      # Drop the "ISOFORM-ID:" prefix, then the '<', '>' or '?' marker
      # of the FROM/TO endpoint.
      from_pos = feature[1].sub(/\A.*:/, '')
      cur_h = {
        'From' => from_pos.sub(/\D/, '').to_i,
        'To' => feature[2].sub(/\D/, '').to_i,
        'diff' => [],
        'original' => feature
      }
      features[feature[0]] ||= []
      features[feature[0]].push cur_h
      feature[3].each do |a|
        case a[0]
        when 'From', 'To', 'Description', 'FTId', 'diff', 'original'
          # reserved keys: never overwritten by a qualifier of the same name
        else
          cur_h[a[0]] = a[1]
        end
      end
      cur_h['FTId'] = cur_h['id'] if cur_h['id']

      case feature[0]
      when 'VARSPLIC', 'VARIANT', 'VAR_SEQ', 'CONFLICT'
        original_res = nil
        changed_res = nil
        case cur_h['note'].to_s
        when /(\w[\w ]*\w*) - ?> (\w[\w ]*\w*)/
          # Read both captures before gsub resets the match data.
          m = Regexp.last_match
          original_res = m[1].gsub(/ /, '').strip
          changed_res = m[2].gsub(/ /, '').strip
        when /Missing/i
          original_res = seq.subseq(cur_h['From'],
                                    cur_h['To'])
          changed_res = ''
        end
        cur_h['diff'] = [original_res, changed_res]
      end
    end
  rescue
    raise "Invalid FT Lines(#{$!}) in #{entry_id}:, \n'#{self.get('FT')}'\n"
  end

  features
end
private :ft_2019_11_parser