lib/bio/location.rb in bio-1.0.0 vs lib/bio/location.rb in bio-1.1.0
- old
+ new
@@ -1,264 +1,48 @@
#
# = bio/location.rb - Locations/Location class (GenBank location format)
#
-# Copyright:: Copyright (C) 2001, 2005
-# KATAYAMA Toshiaki <k@bioruby.org>
-# License:: LGPL
+# Copyright:: Copyright (C) 2001, 2005 Toshiaki Katayama <k@bioruby.org>
+# Copyright:: Copyright (C) 2006 Jan Aerts <jan.aerts@bbsrc.ac.uk>
+# License:: The Ruby License
#
-# $Id: location.rb,v 0.22 2005/12/18 15:50:06 k Exp $
+# $Id: location.rb,v 0.28 2007/04/05 23:35:39 trevor Exp $
#
-# == Appendix : GenBank location descriptor classification
-#
-# === Definition of the position notation of the GenBank location format
-#
-# According to the GenBank manual 'gbrel.txt', I classified position notations
-# into 10 patterns - (A) to (J).
-#
-# 3.4.12.2 Feature Location
-#
-# The second column of the feature descriptor line designates the
-# location of the feature in the sequence. The location descriptor
-# begins at position 22. Several conventions are used to indicate
-# sequence location.
-#
-# Base numbers in location descriptors refer to numbering in the entry,
-# which is not necessarily the same as the numbering scheme used in the
-# published report. The first base in the presented sequence is numbered
-# base 1. Sequences are presented in the 5 to 3 direction.
-#
-# Location descriptors can be one of the following:
-#
-# (A) 1. A single base;
-#
-# (B) 2. A contiguous span of bases;
-#
-# (C) 3. A site between two bases;
-#
-# (D) 4. A single base chosen from a range of bases;
-#
-# (E) 5. A single base chosen from among two or more specified bases;
-#
-# (F) 6. A joining of sequence spans;
-#
-# (G) 7. A reference to an entry other than the one to which the feature
-# belongs (i.e., a remote entry), followed by a location descriptor
-# referring to the remote sequence;
-#
-# (H) 8. A literal sequence (a string of bases enclosed in quotation marks).
-#
-#
-# (C) A site between two residues, such as an endonuclease cleavage site, is
-# indicated by listing the two bases separated by a carat (e.g., 23^24).
-#
-# (D) A single residue chosen from a range of residues is indicated by the
-# number of the first and last bases in the range separated by a single
-# period (e.g., 23.79). The symbols < and > indicate that the end point
-# (I) of the range is beyond the specified base number.
-#
-# (B) A contiguous span of bases is indicated by the number of the first and
-# last bases in the range separated by two periods (e.g., 23..79). The
-# (I) symbols < and > indicate that the end point of the range is beyond the
-# specified base number. Starting and ending positions can be indicated
-# by base number or by one of the operators described below.
-#
-# Operators are prefixes that specify what must be done to the indicated
-# sequence to locate the feature. The following are the operators
-# available, along with their most common format and a description.
-#
-# (J) complement (location): The feature is complementary to the location
-# indicated. Complementary strands are read 5 to 3.
-#
-# (F) join (location, location, .. location): The indicated elements should
-# be placed end to end to form one contiguous sequence.
-#
-# (F) order (location, location, .. location): The elements are found in the
-# specified order in the 5 to 3 direction, but nothing is implied about
-# the rationality of joining them.
-#
-# (F) group (location, location, .. location): The elements are related and
-# should be grouped together, but no order is implied.
-#
-# (E) one-of (location, location, .. location): The element can be any one,
-# but only one, of the items listed.
-#
-# === Reduction strategy of the position notations
-#
-# (A) Location n
-#
-# (B) Location n..m
-#
-# (C) Location n^m
-#
-# (D) (n.m) => Location n
-#
-# (E) one-of(n,m,..) => Location n
-# one-of(n..m,..) => Location n..m
-#
-# (F) order(loc,loc,..) => join(loc, loc,..)
-# group(loc,loc,..) => join(loc, loc,..)
-# join(loc,loc,..) => Sequence
-#
-# (G) ID:loc => Location with ID
-#
-# (H) "atgc" => Location only with Sequence
-#
-# (I) <n => Location n with lt flag
-# >n => Location n with gt flag
-# <n..m => Location n..m with lt flag
-# n..>m => Location n..m with gt flag
-# <n..>m => Location n..m with lt, gt flag
-#
-# (J) complement(loc) => Sequence
-#
-# (K) replace(loc, str) => Location with replacement Sequence
-#
-# === GenBank location examples
-#
-# (C) n^m
-#
-# * [AB015179] 754^755
-# * [AF179299] complement(53^54)
-# * [CELXOL1ES] replace(4480^4481,"")
-# * [ECOUW87] replace(4792^4793,"a")
-# * [APLPCII] replace(1905^1906,"acaaagacaccgccctacgcc")
-#
-# (D) (n.m)
-#
-# * [HACSODA] 157..(800.806)
-# * [HALSODB] (67.68)..(699.703)
-# * [AP001918] (45934.45974)..46135
-# * [BACSPOJ] <180..(731.761)
-# * [BBU17998] (88.89)..>1122
-# * [ECHTGA] complement((1700.1708)..(1715.1721))
-# * [ECPAP17] complement(<22..(255.275))
-# * [LPATOVGNS] complement((64.74)..1525)
-# * [PIP404CG] join((8298.8300)..10206,1..855)
-# * [BOVMHDQBY4] join(M30006.1:(392.467)..575,M30005.1:415..681,M30004.1:129..410,M30004.1:907..1017,521..534)
-# * [HUMMIC2A] replace((651.655)..(651.655),"")
-# * [HUMSOD102] order(L44135.1:(454.445)..>538,<1..181)
-#
-# (E) one-of
-#
-# * [ECU17136] one-of(898,900)..983
-# * [CELCYT1A] one-of(5971..6308,5971..6309)
-# * [DMU17742] 8050..one-of(10731,10758,10905,11242)
-# * [PFU27807] one-of(623,627,632)..one-of(628,633,637)
-# * [BTBAINH1] one-of(845,953,963,1078,1104)..1354
-# * [ATU39449] join(one-of(969..1094,970..1094,995..1094,1018..1094),1518..1587,1726..2119,2220..2833,2945..3215)
-#
-# (F) join, order, group
-#
-# * [AB037374S2] join(AB037374.1:1..177,1..807)
-# * [AP000001] join(complement(1..61),complement(AP000007.1:252907..253505))
-# * [ASNOS11] join(AF130124.1:<2563..2964,AF130125.1:21..157,AF130126.1:12..174,AF130127.1:21..112,AF130128.1:21..162,AF130128.1:281..595,AF130128.1:661..842,AF130128.1:916..1030,AF130129.1:21..115,AF130130.1:21..165,AF130131.1:21..125,AF130132.1:21..428,AF130132.1:492..746,AF130133.1:21..168,AF130133.1:232..401,AF130133.1:475..906,AF130133.1:970..1107,AF130133.1:1176..1367,21..>128)
-#
-# * [AARPOB2] order(AF194507.1:<1..510,1..>871)
-# * [AF006691] order(912..1918,20410..21416)
-# * [AF024666] order(complement(18919..19224),complement(13965..14892))
-# * [AF264948] order(27066..27076,27089..27099,27283..27314,27330..27352)
-# * [D63363] order(3..26,complement(964..987))
-# * [ECOCURLI2] order(complement(1009..>1260),complement(AF081827.1:<1..177))
-# * [S72388S2] order(join(S72388.1:757..911,S72388.1:609..1542),1..>139)
-# * [HEYRRE07] order(complement(1..38),complement(M82666.1:1..140),complement(M82665.1:1..176),complement(M82664.1:1..215),complement(M82663.1:1..185),complement(M82662.1:1..49),complement(M82661.1:1..133))
-# * [COL11A1G34] order(AF101079.1:558..1307,AF101080.1:1..749,AF101081.1:1..898,AF101082.1:1..486,AF101083.1:1..942,AF101084.1:1..1734,AF101085.1:1..2385,AF101086.1:1..1813,AF101087.1:1..2287,AF101088.1:1..1073,AF101089.1:1..989,AF101090.1:1..5017,AF101091.1:1..3401,AF101092.1:1..1225,AF101093.1:1..1072,AF101094.1:1..989,AF101095.1:1..1669,AF101096.1:1..918,AF101097.1:1..1114,AF101098.1:1..1074,AF101099.1:1..1709,AF101100.1:1..986,AF101101.1:1..1934,AF101102.1:1..1699,AF101103.1:1..940,AF101104.1:1..2330,AF101105.1:1..4467,AF101106.1:1..1876,AF101107.1:1..2465,AF101108.1:1..1150,AF101109.1:1..1170,AF101110.1:1..1158,AF101111.1:1..1193,1..611)
-#
-# group() are found in the COMMENT field only (in GenBank 122.0)
-#
-# gbpat2.seq: FT repeat_region group(598..606,611..619)
-# gbpat2.seq: FT repeat_region group(8..16,1457..1464).
-# gbpat2.seq: FT variation group(t1,t2)
-# gbpat2.seq: FT variation group(t1,t3)
-# gbpat2.seq: FT variation group(t1,t2,t3)
-# gbpat2.seq: FT repeat_region group(11..202,203..394)
-# gbpri9.seq:COMMENT Residues reported = 'group(1..2145);'.
-#
-# (G) ID:location
-#
-# * [AARPOB2] order(AF194507.1:<1..510,1..>871)
-# * [AF178221S4] join(AF178221.1:<1..60,AF178222.1:1..63,AF178223.1:1..42,1..>90)
-# * [BOVMHDQBY4] join(M30006.1:(392.467)..575,M30005.1:415..681,M30004.1:129..410,M30004.1:907..1017,521..534)
-# * [HUMSOD102] order(L44135.1:(454.445)..>538,<1..181)
-# * [SL16SRRN1] order(<1..>267,X67092.1:<1..>249,X67093.1:<1..>233)
-#
-# (I) <, >
-#
-# * [A5U48871] <1..>318
-# * [AA23SRRNP] <1..388
-# * [AA23SRRNP] 503..>1010
-# * [AAM5961] complement(<1..229)
-# * [AAM5961] complement(5231..>5598)
-# * [AF043934] join(<1,60..99,161..241,302..370,436..594,676..887,993..1141,1209..1329,1387..1559,1626..1646,1708..>1843)
-# * [BACSPOJ] <180..(731.761)
-# * [BBU17998] (88.89)..>1122
-# * [AARPOB2] order(AF194507.1:<1..510,1..>871)
-# * [SL16SRRN1] order(<1..>267,X67092.1:<1..>249,X67093.1:<1..>233)
-#
-# (J) complement
-#
-# * [AF179299] complement(53^54) <= hoge insertion site etc.
-# * [AP000001] join(complement(1..61),complement(AP000007.1:252907..253505))
-# * [AF209868S2] order(complement(1..>308),complement(AF209868.1:75..336))
-# * [AP000001] join(complement(1..61),complement(AP000007.1:252907..253505))
-# * [CPPLCG] complement(<1..(1093.1098))
-# * [D63363] order(3..26,complement(964..987))
-# * [ECHTGA] complement((1700.1708)..(1715.1721))
-# * [ECOUXW] order(complement(1658..1663),complement(1636..1641))
-# * [LPATOVGNS] complement((64.74)..1525)
-# * [AF129075] complement(join(71606..71829,75327..75446,76039..76203,76282..76353,76914..77029,77114..77201,77276..77342,78138..78316,79755..79892,81501..81562,81676..81856,82341..82490,84208..84287,85032..85122,88316..88403))
-# * [ZFDYST2] join(AF137145.1:<1..18,complement(<1..99))
-#
-# (K) replace
-#
-# * [CSU27710] replace(64,"A")
-# * [CELXOL1ES] replace(5256,"t")
-# * [ANICPC] replace(1..468,"")
-# * [CSU27710] replace(67..68,"GC")
-# * [CELXOL1ES] replace(4480^4481,"") <= ? only one case in GenBank 122.0
-# * [ECOUW87] replace(4792^4793,"a")
-# * [CEU34893] replace(1..22,"ggttttaacccagttactcaag")
-# * [APLPCII] replace(1905^1906,"acaaagacaccgccctacgcc")
-# * [MBDR3S1] replace(1400..>9281,"")
-# * [HUMMHDPB1F] replace(complement(36..37),"ttc")
-# * [HUMMIC2A] replace((651.655)..(651.655),"")
-# * [LEIMDRPGP] replace(1..1554,"L01572")
-# * [TRBND3] replace(376..395,"atttgtgtgtggtaatta")
-# * [TRBND3] replace(376..395,"atttgtgtgggtaatttta")
-# * [TRBND3] replace(376..395,"attttgttgttgttttgttttgaatta")
-# * [TRBND3] replace(376..395,"atgtgtggtgaatta")
-# * [TRBND3] replace(376..395,"atgtgtgtggtaatta")
-# * [TRBND3] replace(376..395,"gatttgttgtggtaatttta")
-# * [MSU09460] replace(193, <= replace(193, "t")
-# * [HUMMAGE12X] replace(3002..3003, <= replace(3002..3003, "GC")
-# * [ADR40FIB] replace(510..520, <= replace(510..520, "taatcctaccg")
-# * [RATDYIIAAB] replace(1306..1443,"aagaacatccacggagtcagaactgggctcttcacgccggatttggcgttcgaggccattgtgaaaaagcaggcaatgcaccagcaagctcagttcctacccctgcgtggacctggttatccaggagctaatcagtacagttaggtggtcaagctgaaagagccctgtctgaaa")
-#
-#--
+
+module Bio
+
+# == Description
#
-# This library is free software; you can redistribute it and/or
-# modify it under the terms of the GNU Lesser General Public
-# License as published by the Free Software Foundation; either
-# version 2 of the License, or (at your option) any later version.
+# The Bio::Location class describes the position of a genomic locus.
+# Typically, Bio::Location objects are created automatically when the
+# user creates a Bio::Locations object, instead of initialized directly.
#
-# This library is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-# Lesser General Public License for more details.
+# == Usage
#
-# You should have received a copy of the GNU Lesser General Public
-# License along with this library; if not, write to the Free Software
-# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+# location = Bio::Location.new('500..550')
+# puts "start=" + location.from.to_s + ";end=" + location.to.to_s
#
-#++
+# #, or better: through Bio::Locations
+# locations = Bio::Locations.new('500..550')
+# locations.each do |location|
+# puts "start=" + location.from.to_s + ";end=" + location.to.to_s
+# end
#
-
-module Bio
-
class Location
- # Pass a range of the 'location' segment. The 'location' segment can be
- # 'ID:' + ('n' or 'n..m' or 'n^m' or "seq") with '<' or '>'.
+ include Comparable
+
+ # Parses a'location' segment, which can be 'ID:' + ('n' or 'n..m' or 'n^m'
+ # or "seq") with '<' or '>', and returns a Bio::Location object.
+ #
+ # location = Bio::Location.new('500..550')
+ #
+ # ---
+ # *Arguments*:
+ # * (required) _str_: GenBank style position string (see Bio::Locations
+ # documentation)
+ # *Returns*:: the Bio::Location object
def initialize(location = nil)
if location
if location =~ /:/ # (G) ID:location
xref_id, location = location.split(':')
@@ -271,11 +55,11 @@
end
end
# s : start base, e : end base => from, to
case location
- when /^[<>]?(\d+)$/ # (A, I) n
+ when /^[<>]?(\d+)$/ # (A, I) n
s = e = $1.to_i
when /^[<>]?(\d+)\.\.[<>]?(\d+)$/ # (B, I) n..m
s = $1.to_i
e = $2.to_i
if e - s < 0
@@ -308,70 +92,256 @@
@xref_id = xref_id # link to the external entry as GenBank ID
end
attr_accessor :from, :to, :strand, :sequence, :lt, :gt, :xref_id
- # Complement the sequence from outside.
+ # Complements the sequence (i.e. alternates the strand).
+ # ---
+ # *Returns*:: the Bio::Location object
def complement
@strand *= -1
self # return Location object
end
- # Replace the sequence from outside.
+ # Replaces the sequence of the location.
+ # ---
+ # *Arguments*:
+ # * (required) _sequence_: sequence to be used to replace the sequence
+ # at the location
+ # *Returns*:: the Bio::Location object
def replace(sequence)
- @sequence = sequence.downcase
+ @sequence = sequence.downcase
self # return Location object
end
- # Returns a range (from..to) of the segment as a Range object.
+ # Returns the range (from..to) of the location as a Range object.
def range
@from..@to
end
-end # class location
+ # Check where a Bio::Location object is located compared to another
+ # Bio::Location object (mainly to facilitate the use of Comparable).
+ # A location A is upstream of location B if the start position of
+ # location A is smaller than the start position of location B. If
+ # they're the same, the end positions are checked.
+ # ---
+ # *Arguments*:
+ # * (required) _other location_: a Bio::Location object
+ # *Returns*::
+ # * 1 if self < other location
+ # * -1 if self > other location
+ # * 0 if both location are the same
+ # * nil if the argument is not a Bio::Location object
+ def <=>(other)
+ if ! other.kind_of?(Bio::Location)
+ return nil
+ end
+ if @from.to_f < other.from.to_f
+ return -1
+ elsif @from.to_f > other.from.to_f
+ return 1
+ end
+ if @to.to_f < other.to.to_f
+ return -1
+ elsif @to.to_f > other.to.to_f
+ return 1
+ end
+ return 0
+ end
+
+end # Location
+
+# == Description
+#
+# The Bio::Locations class is a container for Bio::Location objects:
+# creating a Bio::Locations object (based on a GenBank style position string)
+# will spawn an array of Bio::Location objects.
+#
+# == Usage
+#
+# locations = Bio::Locations.new('join(complement(500..550), 600..625)')
+# locations.each do |loc|
+# puts "class = " + loc.class.to_s
+# puts "range = #{loc.from}..#{loc.to} (strand = #{loc.strand})"
+# end
+# # Output would be:
+# # class = Bio::Location
+# # range = 500..550 (strand = -1)
+# # class = Bio::Location
+# # range = 600..625 (strand = 1)
+#
+# # For the following three location strings, print the span and range
+# ['one-of(898,900)..983',
+# 'one-of(5971..6308,5971..6309)',
+# '8050..one-of(10731,10758,10905,11242)'].each do |loc|
+# location = Bio::Locations.new(loc)
+# puts location.span
+# puts location.range
+# end
+#
+# === GenBank location descriptor classification
+#
+# ==== Definition of the position notation of the GenBank location format
+#
+# According to the GenBank manual 'gbrel.txt', position notations were
+# classified into 10 patterns - (A) to (J).
+#
+# 3.4.12.2 Feature Location
+#
+# The second column of the feature descriptor line designates the
+# location of the feature in the sequence. The location descriptor
+# begins at position 22. Several conventions are used to indicate
+# sequence location.
+#
+# Base numbers in location descriptors refer to numbering in the entry,
+# which is not necessarily the same as the numbering scheme used in the
+# published report. The first base in the presented sequence is numbered
+# base 1. Sequences are presented in the 5 to 3 direction.
+#
+# Location descriptors can be one of the following:
+#
+# (A) 1. A single base;
+#
+# (B) 2. A contiguous span of bases;
+#
+# (C) 3. A site between two bases;
+#
+# (D) 4. A single base chosen from a range of bases;
+#
+# (E) 5. A single base chosen from among two or more specified bases;
+#
+# (F) 6. A joining of sequence spans;
+#
+# (G) 7. A reference to an entry other than the one to which the feature
+# belongs (i.e., a remote entry), followed by a location descriptor
+# referring to the remote sequence;
+#
+# (H) 8. A literal sequence (a string of bases enclosed in quotation marks).
+#
+# ==== Description commented with pattern IDs.
+#
+# (C) A site between two residues, such as an endonuclease cleavage site, is
+# indicated by listing the two bases separated by a carat (e.g., 23^24).
+#
+# (D) A single residue chosen from a range of residues is indicated by the
+# number of the first and last bases in the range separated by a single
+# period (e.g., 23.79). The symbols < and > indicate that the end point
+# (I) of the range is beyond the specified base number.
+#
+# (B) A contiguous span of bases is indicated by the number of the first and
+# last bases in the range separated by two periods (e.g., 23..79). The
+# (I) symbols < and > indicate that the end point of the range is beyond the
+# specified base number. Starting and ending positions can be indicated
+# by base number or by one of the operators described below.
+#
+# Operators are prefixes that specify what must be done to the indicated
+# sequence to locate the feature. The following are the operators
+# available, along with their most common format and a description.
+#
+# (J) complement (location): The feature is complementary to the location
+# indicated. Complementary strands are read 5 to 3.
+#
+# (F) join (location, location, .. location): The indicated elements should
+# be placed end to end to form one contiguous sequence.
+#
+# (F) order (location, location, .. location): The elements are found in the
+# specified order in the 5 to 3 direction, but nothing is implied about
+# the rationality of joining them.
+#
+# (F) group (location, location, .. location): The elements are related and
+# should be grouped together, but no order is implied.
+#
+# (E) one-of (location, location, .. location): The element can be any one,
+# but only one, of the items listed.
+#
+# === Reduction strategy of the position notations
+#
+# * (A) Location n
+# * (B) Location n..m
+# * (C) Location n^m
+# * (D) (n.m) => Location n
+# * (E)
+# * one-of(n,m,..) => Location n
+# * one-of(n..m,..) => Location n..m
+# * (F)
+# * order(loc,loc,..) => join(loc, loc,..)
+# * group(loc,loc,..) => join(loc, loc,..)
+# * join(loc,loc,..) => Sequence
+# * (G) ID:loc => Location with ID
+# * (H) "atgc" => Location only with Sequence
+# * (I)
+# * <n => Location n with lt flag
+# * >n => Location n with gt flag
+# * <n..m => Location n..m with lt flag
+# * n..>m => Location n..m with gt flag
+# * <n..>m => Location n..m with lt, gt flag
+# * (J) complement(loc) => Sequence
+# * (K) replace(loc, str) => Location with replacement Sequence
+#
class Locations
include Enumerable
- # Parse a GenBank style position string and returns a Locations object,
- # which contains a list of Location objects.
+ # Parses a GenBank style position string and returns a Bio::Locations
+ # object, which contains a list of Bio::Location objects.
+ #
+ # locations = Bio::Locations.new('join(complement(500..550), 600..625)')
+ #
+ # ---
+ # *Arguments*:
+ # * (required) _str_: GenBank style position string
+ # *Returns*:: Bio::Locations object
def initialize(position)
if position.is_a? Array
@locations = position
else
position = gbl_cleanup(position) # preprocessing
- @locations = gbl_pos2loc(position) # create an Array of Location
+ @locations = gbl_pos2loc(position) # create an Array of Bio::Location objects
end
end
+
+ # An Array of Bio::Location objects
attr_accessor :locations
- # Iterates on each Location object.
+ # Evaluate equality of Bio::Locations object.
+ def equals?(other)
+ if ! other.kind_of?(Bio::Locations)
+ return nil
+ end
+ if self.sort == other.sort
+ return true
+ else
+ return false
+ end
+ end
+
+ # Iterates on each Bio::Location object.
def each
@locations.each do |x|
yield(x)
end
end
- # Returns nth Location object.
+ # Returns nth Bio::Location object.
def [](n)
@locations[n]
end
- # Returns first Location object.
+ # Returns first Bio::Location object.
def first
@locations.first
end
- # Returns last Location object.
+ # Returns last Bio::Location object.
def last
@locations.last
end
# Returns an Array containing overall min and max position [min, max]
- # of this Locations object.
+ # of this Bio::Locations object.
def span
span_min = @locations.min { |a,b| a.from <=> b.from }
span_max = @locations.max { |a,b| a.to <=> b.to }
return span_min.from, span_max.to
end
@@ -394,13 +364,26 @@
end
len
end
alias size length
- # Convert absolute position in DNA (na) to relative position in RNA (na).
- # If type == :aa,
- # convert absolute position in DNA (na) to relative position in Protein (aa).
+ # Converts absolute position in the whole of the DNA sequence to relative
+ # position in the locus.
+ #
+ # This method can for example be used to relate positions in a DNA-sequence
+ # with those in RNA. In this use, the optional ':aa'-flag returns the
+ # position of the associated amino-acid rather than the nucleotide.
+ #
+ # loc = Bio::Locations.new('complement(12838..13533)')
+ # puts loc.relative(13524) # => 10
+ # puts loc.relative(13506, :aa) # => 3
+ #
+ # ---
+ # *Arguments*:
+ # * (required) _position_: nucleotide position within whole of the sequence
+ # * _:aa_: flag that lets method return position in aminoacid coordinates
+ # *Returns*:: position within the location
def relative(n, type = nil)
case type
when :location
;
when :aa
@@ -412,22 +395,27 @@
else
abs2rel(n)
end
end
- # Convert relative position in RNA (na) to absolute position in DNA (na).
- # If type == :aa,
- # convert relative position in Protein (aa) -> absolute position in DNA (na).
+ # Converts relative position in the locus to position in the whole of the
+ # DNA sequence.
+ #
+ # This method can for example be used to relate positions in a DNA-sequence
+ # with those in RNA. In this use, the optional ':aa'-flag returns the
+ # position of the associated amino-acid rather than the nucleotide.
#
- # * Examples
+ # loc = Bio::Locations.new('complement(12838..13533)')
+ # puts loc.absolute(10) # => 13524
+ # puts loc.absolute(10, :aa) # => 13506
#
- # loc = Bio::Locations.new('complement(12838..13533)')
- # loc.absolute(10) #=> 13524 (rel2abs)
- # loc.relative(13524) #=> 10 (abs2rel)
- # loc.absolute(10, :aa) #=> 13506 (rel2abs)
- # loc.relative(13506, :aa) #=> 10 (abs2rel)
- #
+ # ---
+ # *Arguments*:
+ # * (required) _position_: nucleotide position within locus
+ # * _:aa_: flag to be used if _position_ is a aminoacid position rather than
+ # a nucleotide position
+ # *Returns*:: position within the whole of the sequence
def absolute(n, type = nil)
case type
when :location
;
when :aa
@@ -450,23 +438,23 @@
# select one base # (D) n.m
# .. n m :
# <match> $1 ( $2 $3 not )
position.gsub!(/(\.{2})?\(?([<>\d]+)\.([<>\d]+)(?!:)\)?/) do |match|
if $1
- $1 + $3 # ..(n.m) => ..m
+ $1 + $3 # ..(n.m) => ..m
else
- $2 # (?n.m)? => n
+ $2 # (?n.m)? => n
end
end
# select the 1st location # (E) one-of()
# <match> .. one-of ($2 ,$3 )
position.gsub!(/(\.{2})?one-of\(([^,]+),([^)]+)\)/) do |match|
if $1
- $1 + $3.gsub(/.*,(.*)/, '\1') # ..one-of(n,m) => ..m
+ $1 + $3.gsub(/.*,(.*)/, '\1') # ..one-of(n,m) => ..m
else
- $2 # one-of(n,m) => n
+ $2 # one-of(n,m) => n
end
end
# substitute order(), group() by join() # (F) group(), order()
position.gsub!(/(order|group)/, 'join')
@@ -512,11 +500,11 @@
join_list.each do |position|
ary << gbl_pos2loc(position)
end
- when /^complement\((.*)\)$/ # (J) complement()
+ when /^complement\((.*)\)$/ # (J) complement()
position = $1
gbl_pos2loc(position).reverse_each do |location|
ary << location.complement
end
@@ -577,21 +565,148 @@
else
return n + cursor + 1 - x.from
end
end
end
- return nil # out of range
+ return nil # out of range
end
-end # class Locations
+end # Locations
-end # module Bio
+end # Bio
+
+# === GenBank location examples
+#
+# (C) n^m
+#
+# * [AB015179] 754^755
+# * [AF179299] complement(53^54)
+# * [CELXOL1ES] replace(4480^4481,"")
+# * [ECOUW87] replace(4792^4793,"a")
+# * [APLPCII] replace(1905^1906,"acaaagacaccgccctacgcc")
+#
+# (D) (n.m)
+#
+# * [HACSODA] 157..(800.806)
+# * [HALSODB] (67.68)..(699.703)
+# * [AP001918] (45934.45974)..46135
+# * [BACSPOJ] <180..(731.761)
+# * [BBU17998] (88.89)..>1122
+# * [ECHTGA] complement((1700.1708)..(1715.1721))
+# * [ECPAP17] complement(<22..(255.275))
+# * [LPATOVGNS] complement((64.74)..1525)
+# * [PIP404CG] join((8298.8300)..10206,1..855)
+# * [BOVMHDQBY4] join(M30006.1:(392.467)..575,M30005.1:415..681,M30004.1:129..410,M30004.1:907..1017,521..534)
+# * [HUMMIC2A] replace((651.655)..(651.655),"")
+# * [HUMSOD102] order(L44135.1:(454.445)..>538,<1..181)
+#
+# (E) one-of
+#
+# * [ECU17136] one-of(898,900)..983
+# * [CELCYT1A] one-of(5971..6308,5971..6309)
+# * [DMU17742] 8050..one-of(10731,10758,10905,11242)
+# * [PFU27807] one-of(623,627,632)..one-of(628,633,637)
+# * [BTBAINH1] one-of(845,953,963,1078,1104)..1354
+# * [ATU39449] join(one-of(969..1094,970..1094,995..1094,1018..1094),1518..1587,1726..2119,2220..2833,2945..3215)
+#
+# (F) join, order, group
+#
+# * [AB037374S2] join(AB037374.1:1..177,1..807)
+# * [AP000001] join(complement(1..61),complement(AP000007.1:252907..253505))
+# * [ASNOS11] join(AF130124.1:<2563..2964,AF130125.1:21..157,AF130126.1:12..174,AF130127.1:21..112,AF130128.1:21..162,AF130128.1:281..595,AF130128.1:661..842,AF130128.1:916..1030,AF130129.1:21..115,AF130130.1:21..165,AF130131.1:21..125,AF130132.1:21..428,AF130132.1:492..746,AF130133.1:21..168,AF130133.1:232..401,AF130133.1:475..906,AF130133.1:970..1107,AF130133.1:1176..1367,21..>128)
+#
+# * [AARPOB2] order(AF194507.1:<1..510,1..>871)
+# * [AF006691] order(912..1918,20410..21416)
+# * [AF024666] order(complement(18919..19224),complement(13965..14892))
+# * [AF264948] order(27066..27076,27089..27099,27283..27314,27330..27352)
+# * [D63363] order(3..26,complement(964..987))
+# * [ECOCURLI2] order(complement(1009..>1260),complement(AF081827.1:<1..177))
+# * [S72388S2] order(join(S72388.1:757..911,S72388.1:609..1542),1..>139)
+# * [HEYRRE07] order(complement(1..38),complement(M82666.1:1..140),complement(M82665.1:1..176),complement(M82664.1:1..215),complement(M82663.1:1..185),complement(M82662.1:1..49),complement(M82661.1:1..133))
+# * [COL11A1G34] order(AF101079.1:558..1307,AF101080.1:1..749,AF101081.1:1..898,AF101082.1:1..486,AF101083.1:1..942,AF101084.1:1..1734,AF101085.1:1..2385,AF101086.1:1..1813,AF101087.1:1..2287,AF101088.1:1..1073,AF101089.1:1..989,AF101090.1:1..5017,AF101091.1:1..3401,AF101092.1:1..1225,AF101093.1:1..1072,AF101094.1:1..989,AF101095.1:1..1669,AF101096.1:1..918,AF101097.1:1..1114,AF101098.1:1..1074,AF101099.1:1..1709,AF101100.1:1..986,AF101101.1:1..1934,AF101102.1:1..1699,AF101103.1:1..940,AF101104.1:1..2330,AF101105.1:1..4467,AF101106.1:1..1876,AF101107.1:1..2465,AF101108.1:1..1150,AF101109.1:1..1170,AF101110.1:1..1158,AF101111.1:1..1193,1..611)
+#
+# group() are found in the COMMENT field only (in GenBank 122.0)
+#
+# gbpat2.seq: FT repeat_region group(598..606,611..619)
+# gbpat2.seq: FT repeat_region group(8..16,1457..1464).
+# gbpat2.seq: FT variation group(t1,t2)
+# gbpat2.seq: FT variation group(t1,t3)
+# gbpat2.seq: FT variation group(t1,t2,t3)
+# gbpat2.seq: FT repeat_region group(11..202,203..394)
+# gbpri9.seq:COMMENT Residues reported = 'group(1..2145);'.
+#
+# (G) ID:location
+#
+# * [AARPOB2] order(AF194507.1:<1..510,1..>871)
+# * [AF178221S4] join(AF178221.1:<1..60,AF178222.1:1..63,AF178223.1:1..42,1..>90)
+# * [BOVMHDQBY4] join(M30006.1:(392.467)..575,M30005.1:415..681,M30004.1:129..410,M30004.1:907..1017,521..534)
+# * [HUMSOD102] order(L44135.1:(454.445)..>538,<1..181)
+# * [SL16SRRN1] order(<1..>267,X67092.1:<1..>249,X67093.1:<1..>233)
+#
+# (I) <, >
+#
+# * [A5U48871] <1..>318
+# * [AA23SRRNP] <1..388
+# * [AA23SRRNP] 503..>1010
+# * [AAM5961] complement(<1..229)
+# * [AAM5961] complement(5231..>5598)
+# * [AF043934] join(<1,60..99,161..241,302..370,436..594,676..887,993..1141,1209..1329,1387..1559,1626..1646,1708..>1843)
+# * [BACSPOJ] <180..(731.761)
+# * [BBU17998] (88.89)..>1122
+# * [AARPOB2] order(AF194507.1:<1..510,1..>871)
+# * [SL16SRRN1] order(<1..>267,X67092.1:<1..>249,X67093.1:<1..>233)
+#
+# (J) complement
+#
+# * [AF179299] complement(53^54) <= hoge insertion site etc.
+# * [AP000001] join(complement(1..61),complement(AP000007.1:252907..253505))
+# * [AF209868S2] order(complement(1..>308),complement(AF209868.1:75..336))
+# * [AP000001] join(complement(1..61),complement(AP000007.1:252907..253505))
+# * [CPPLCG] complement(<1..(1093.1098))
+# * [D63363] order(3..26,complement(964..987))
+# * [ECHTGA] complement((1700.1708)..(1715.1721))
+# * [ECOUXW] order(complement(1658..1663),complement(1636..1641))
+# * [LPATOVGNS] complement((64.74)..1525)
+# * [AF129075] complement(join(71606..71829,75327..75446,76039..76203,76282..76353,76914..77029,77114..77201,77276..77342,78138..78316,79755..79892,81501..81562,81676..81856,82341..82490,84208..84287,85032..85122,88316..88403))
+# * [ZFDYST2] join(AF137145.1:<1..18,complement(<1..99))
+#
+# (K) replace
+#
+# * [CSU27710] replace(64,"A")
+# * [CELXOL1ES] replace(5256,"t")
+# * [ANICPC] replace(1..468,"")
+# * [CSU27710] replace(67..68,"GC")
+# * [CELXOL1ES] replace(4480^4481,"") <= ? only one case in GenBank 122.0
+# * [ECOUW87] replace(4792^4793,"a")
+# * [CEU34893] replace(1..22,"ggttttaacccagttactcaag")
+# * [APLPCII] replace(1905^1906,"acaaagacaccgccctacgcc")
+# * [MBDR3S1] replace(1400..>9281,"")
+# * [HUMMHDPB1F] replace(complement(36..37),"ttc")
+# * [HUMMIC2A] replace((651.655)..(651.655),"")
+# * [LEIMDRPGP] replace(1..1554,"L01572")
+# * [TRBND3] replace(376..395,"atttgtgtgtggtaatta")
+# * [TRBND3] replace(376..395,"atttgtgtgggtaatttta")
+# * [TRBND3] replace(376..395,"attttgttgttgttttgttttgaatta")
+# * [TRBND3] replace(376..395,"atgtgtggtgaatta")
+# * [TRBND3] replace(376..395,"atgtgtgtggtaatta")
+# * [TRBND3] replace(376..395,"gatttgttgtggtaatttta")
+# * [MSU09460] replace(193, <= replace(193, "t")
+# * [HUMMAGE12X] replace(3002..3003, <= replace(3002..3003, "GC")
+# * [ADR40FIB] replace(510..520, <= replace(510..520, "taatcctaccg")
+# * [RATDYIIAAB] replace(1306..1443,"aagaacatccacggagtcagaactgggctcttcacgccggatttggcgttcgaggccattgtgaaaaagcaggcaatgcaccagcaagctcagttcctacccctgcgtggacctggttatccaggagctaatcagtacagttaggtggtcaagctgaaagagccctgtctgaaa")
+#
+
if __FILE__ == $0
puts "Test new & span methods"
[
+ '450',
+ '500..600',
+ 'join(500..550, 600..625)',
+ 'complement(join(500..550, 600..625))',
+ 'join(complement(500..550), 600..625)',
'754^755',
'complement(53^54)',
'replace(4792^4793,"a")',
'replace(1905^1906,"acaaagacaccgccctacgcc")',
'157..(800.806)',
@@ -615,13 +730,18 @@
'order(3..26,complement(964..987))',
'order(L44135.1:(454.445)..>538,<1..181)',
'<200001..<318389',
].each do |pos|
p pos
- p Bio::Locations.new(pos).span
- p Bio::Locations.new(pos).range
- p Bio::Locations.new(pos)
+# p Bio::Locations.new(pos)
+# p Bio::Locations.new(pos).span
+# p Bio::Locations.new(pos).range
+ Bio::Locations.new(pos).each do |location|
+ puts "class=" + location.class.to_s
+ puts "start=" + location.from.to_s + "\tend=" + location.to.to_s + "\tstrand=" + location.strand.to_s
+ end
+
end
puts "Test rel2abs/abs2rel method"
[
'6..15',
@@ -644,7 +764,9 @@
pos = 'join(complement(6..10),complement(16..30))'
loc = Bio::Locations.new(pos)
print "pos : "; p pos
print "`- loc[1] : "; p loc[1]
print " `- range : "; p loc[1].range
+
+ puts Bio::Location.new('5').<=>(Bio::Location.new('3'))
end