lib/parse_fasta/sequence.rb in parse_fasta-1.3.0 vs lib/parse_fasta/sequence.rb in parse_fasta-1.4.0
- old
+ new
@@ -23,11 +23,12 @@
# Calculates GC content
#
# Calculates GC content by dividing count of G + C divided by count
# of G + C + T + A + U. If there are both T's and U's in the
# Sequence, things will get weird, but then again, that wouldn't
- # happen, now would it!
+ # happen, now would it! Ambiguous bases are ignored similar to
+ # BioRuby.
#
# @example Get GC of a Sequence
# Sequence.new('ACTg').gc #=> 0.5
# @example Using with FastaFile#each_record
# FastaFile.open('reads.fna', 'r').each_record do |header, sequence|
@@ -44,9 +45,79 @@
t = s.count('t')
a = s.count('a')
u = s.count('u')
return 0 if c + g + t + a + u == 0
- return (c + g).quo(c + g + t + a + u).to_f
+ return (c + g) / (c + g + t + a + u).to_f
end
+ # Returns a map of base counts
+ #
+ # This method will check if the sequence is DNA or RNA and return a
+ # count map appropriate for each. If a truthy argument is given, the
+ # count of ambiguous bases will be returned as well.
+ #
+ # If a sequence has both T and U present, will warn the user and
+ # keep going. Will return a map with counts of both, however.
+ #
+ # @example Get base counts of DNA sequence without ambiguous bases
+ # Sequence.new('AcTGn').base_counts
+ # #=> { a: 1, c: 1, t: 1, g: 1 }
+ # @example Get base counts of DNA sequence with ambiguous bases
+ # Sequence.new('AcTGn').base_counts(true)
+ # #=> { a: 1, c: 1, t: 1, g: 1, n: 1 }
+ # @example Get base counts of RNA sequence without ambiguous bases
+ # Sequence.new('AcUGn').base_counts
+ # #=> { a: 1, c: 1, u: 1, g: 1 }
+ # @example Get base counts of DNA sequence with ambiguous bases
+ # Sequence.new('AcUGn').base_counts(true)
+ # #=> { a: 1, c: 1, u: 1, g: 1, n: 1 }
+ #
+ # @return [Hash] A hash with base as key, count as value
+ def base_counts(count_ambiguous_bases=nil)
+ s = self.downcase
+ t = s.count('t')
+ u = s.count('u')
+ counts = { a: s.count('a'), c: s.count('c'), g: s.count('g') }
+
+ if t > 0 && u == 0
+ counts[:t] = t
+ elsif t == 0 && u > 0
+ counts[:u] = u
+ elsif t > 0 && u > 0
+ warn('ERROR: A sequence contains both T and U')
+ counts[:t], counts[:u] = t, u
+ end
+
+ counts[:n] = s.count('n') if count_ambiguous_bases
+
+ counts
+ end
+
+ # Returns a map of base frequencies
+ #
+ # Counts bases with the `base_counts` method, then divides each
+ # count by the total bases counted to give frequency for each
+ # base. If a truthy argument is given, ambiguous bases will be
+ # included in the total and their frequency reported. Can discern
+ # between DNA and RNA.
+ #
+ # If default or falsy argument is given, ambiguous bases will not be
+ # counted in the total base count and their frequency will not be
+ # given.
+ #
+ # @example Get base frequencies of DNA sequence without ambiguous bases
+ # Sequence.new('AcTGn').base_counts
+ # #=> { a: 0.25, c: 0.25, t: 0.25, g: 0.25 }
+ # @example Get base counts of DNA sequence with ambiguous bases
+ # Sequence.new('AcTGn').base_counts(true)
+ # #=> { a: 0.2, c: 0.2, t: 0.2, g: 0.2, n: 0.2 }
+ #
+ # @return [Hash] A hash with base as key, frequency as value
+ def base_frequencies(count_ambiguous_bases=nil)
+ base_counts = self.base_counts(count_ambiguous_bases)
+ total_bases = base_counts.values.reduce(:+).to_f
+ base_freqs =
+ base_counts.map { |base, count| [base, count/total_bases] }.flatten
+ Hash[*base_freqs]
+ end
end