lib/bio/db/embl/uniprotkb.rb in bio-2.0.3 vs lib/bio/db/embl/uniprotkb.rb in bio-2.0.4
- old
+ new
@@ -246,13 +246,12 @@
# "DE #{OFFICIAL_NAME} (#{SYNONYM}) [CONTEINS: #1; #2]."
# OFFICIAL_NAME 1/entry
# SYNONYM >=0
# CONTEINS >=0
def protein_name
- @data['DE'] ||= parse_DE_line_rel14(get('DE'))
- parsed_de_line = @data['DE']
- if parsed_de_line then
+ parsed_de_line = self.de
+ if parsed_de_line.kind_of?(Array) then
# since UniProtKB release 14.0 of 22-Jul-2008
name = nil
parsed_de_line.each do |a|
case a[0]
when 'RecName', 'SubName'
@@ -273,11 +272,10 @@
end
end
return name
end
-
# returns synonyms (unofficial and/or alternative names).
# Returns an Array containing String objects.
#
# Since UniProtKB release 14.0 of 22-Jul-2008, the DE line format has
# been changed. The method returns the full or short names which are
@@ -290,13 +288,12 @@
#
# For old format, the method parses the DE lines and returns synonyms.
# synonyms are each placed in () following the official name on the DE line.
def synonyms
ary = Array.new
- @data['DE'] ||= parse_DE_line_rel14(get('DE'))
- parsed_de_line = @data['DE']
- if parsed_de_line then
+ parsed_de_line = self.de
+ if parsed_de_line.kind_of?(Array) then
# since UniProtKB release 14.0 of 22-Jul-2008
parsed_de_line.each do |a|
case a[0]
when 'Includes', 'Contains'
break #the each loop
@@ -328,10 +325,24 @@
end
end
return ary
end
# Returns the DE line data: an Array (new format, since rel 14)
# or a String (old format, before rel 14).
#
# The value is memoized in @data['DE']. When the DE text does not parse
# as the rel-14 format, the superclass implementation is invoked and is
# relied upon to fill @data['DE'].
def de
  cached = @data['DE']
  return cached if cached

  rel14 = parse_DE_line_rel14(get('DE'))
  if rel14.is_a?(Array)
    # new format since rel14
    @data['DE'] ||= rel14
  else
    super
  end
  @data['DE']
end
# returns gene names in the GN line.
#
# New UniProt/SwissProt format:
# * Bio::UniProtKB#gn -> [ <gene record>* ]
@@ -1195,13 +1206,128 @@
#
def ft(feature_key = nil)
  # With a key, return only that feature's entries from the full table.
  return ft[feature_key] if feature_key
  # The parsed table is memoized in @data['FT'].
  return @data['FT'] if @data['FT']

  ftlines = get('FT').split("\n")

  # Format sniffing: the layout used since UniProt release 2019_11 puts
  # "KEY  location" on one line and a "/qualifier" on the next. Only the
  # first 12 lines (adjacent pairs 0-1 .. 10-11) are inspected.
  location_line = /^FT +([^\s]+) +(([^\s]+)\:)?([\<\?]?[0-9]+|\?)(?:\.\.([\>\?]?[0-9]+|\?))?\s*$/
  qualifier_line = /^FT +\/([^\s\=]+)(?:\=(\")?(.+)(\")?)?\s*$/
  fmt_2019_11 = ftlines.first(12).each_cons(2).any? { |cur, nxt|
    location_line =~ cur && qualifier_line =~ nxt
  }

  hash = if fmt_2019_11
           ft_2019_11_parser(ftlines)
         else
           ft_legacy_parser(ftlines)
         end
  @data['FT'] = hash
end
+
# FT parser since UniProt release 2019_11
# https://www.uniprot.org/release-notes/2019-12-18-release#text%5Fft
#
# Converts FT lines in the 2019_11 layout (a feature line holding the key
# and location, followed by "/qualifier=value" lines) into a Hash.
#
# ---
# *Arguments*:
# * (required) _ftlines_: Array of String, one FT line per element
# *Returns*:: Hash { feature key => [ Hash, ... ] }. Each inner Hash has
#             'From', 'To', 'diff' and 'original', one entry per
#             qualifier, and 'FTId' when an "id" qualifier is present.
#             A single-position feature has an empty "To" field, so its
#             'To' is 0.
# *Raises*:: RuntimeError ("Invalid FT Lines...") on any failure while
#            parsing; the underlying error message is embedded in it.
def ft_2019_11_parser(ftlines)
  table = []
  cur_ft = nil
  cont = false # true while inside a quoted qualifier value that wraps
  hash = {}
  begin
    ftlines.each do |line|
      if cont && /^FT {19}/ =~ line
        # Continuation of a quoted value. Checked before the feature
        # pattern so that wrapped text such as "K12 2" is not mistaken
        # for a new "KEY location" line.
        str = $'.rstrip
        orig = cur_ft[3][-1][1].to_s
        # Re-insert the single space lost at the line wrap.
        if !orig.empty? && !orig.end_with?(' ') &&
           !str.empty? && !str.start_with?(' ')
          orig.concat ' '
        end
        orig.concat str
        cur_ft[3][-1][1] = orig
        if orig.end_with?("\"")
          # closing quote reached
          orig.chop!
          cont = false
        end
      elsif /^FT +([^\s]+) +(([^\s]+)\:)?([\<\?]?[0-9]+|\?)(?:\.\.([\>\?]?[0-9]+|\?))?\s*$/ =~ line
        cur_ft = [$1.to_s,      # Feature Name
                  "#{$2}#{$4}", # From (may carry an "ISOFORM:" prefix)
                  $5.to_s,      # To
                  []            # Qualifiers
                 ]
        table.push cur_ft
        cont = false
      elsif /^FT +\/([^\s\=]+)(?:\=(\")?(.+))?\s*$/ =~ line
        key = $1
        quoted = ($2 == "\"")
        val = $3
        # A qualifier may have no "=value" part; its value stays nil.
        val = val.rstrip if val
        cur_ft[3].push [ key, val ]
        cont = false
        if quoted
          if val.end_with?("\"")
            val.chop!
          else
            cont = true
          end
        end
      else
        raise "FT parse error: #{line.inspect}"
      end
    end

    reserved = ['From', 'To', 'Description', 'FTId', 'diff', 'original']
    table.each do |feature|
      # Drop an isoform prefix ("ACC-2:") so only the position is left.
      from_str = feature[1].sub(/\A.*:/, '')
      cur_h = {
        # Removing '<', '>' or '?' in FROM/TO endpoint.
        'From' => from_str.sub(/\D/, '').to_i,
        'To'   => feature[2].sub(/\D/, '').to_i,
        'diff' => [],
        'original' => feature
      }
      (hash[feature[0]] ||= []).push cur_h

      # Qualifiers become plain entries; reserved keys are never overwritten.
      feature[3].each do |name, value|
        cur_h[name] = value unless reserved.include?(name)
      end
      cur_h['FTId'] = cur_h['id'] if cur_h['id']

      case feature[0]
      when 'VARSPLIC', 'VARIANT', 'VAR_SEQ', 'CONFLICT'
        original_res = nil
        changed_res = nil
        case cur_h['note'].to_s
        when /(\w[\w ]*\w*) - ?> (\w[\w ]*\w*)/
          # Capture both groups before gsub resets the match variables.
          original_res = $1
          changed_res = $2
          original_res = original_res.gsub(/ /, '').strip
          changed_res = changed_res.gsub(/ /, '').strip
        when /Missing/i
          original_res = seq.subseq(cur_h['From'],
                                    cur_h['To'])
          changed_res = ''
        end
        cur_h['diff'] = [original_res, changed_res]
      end
    end
  rescue
    raise "Invalid FT Lines(#{$!}) in #{entry_id}:, \n'#{self.get('FT')}'\n"
  end

  hash
end
private :ft_2019_11_parser
+
+ # FT parser for the format before Uniprot release 2019_11
+ def ft_legacy_parser(ftlines)
+ table = []
+ begin
+ ftlines.each do |line|
if line =~ /^FT \w/
feature = line.chomp.ljust(74)
table << [feature[ 5..12].strip, # Feature Name
feature[14..19].strip, # From
feature[21..26].strip, # To
@@ -1254,13 +1380,12 @@
end
rescue
raise "Invalid FT Lines(#{$!}) in #{entry_id}:, \n'#{self.get('FT')}'\n"
end
- @data['FT'] = hash
+ hash
end
-
-
+ private :ft_legacy_parser
# returns a Hash of contents in the SQ lines.
# * Bio::UniProtKB#sq -> hsh
#
# returns a value of a key given in the SQ lines.