lib/parse_fasta/sequence.rb in parse_fasta-1.3.0 vs lib/parse_fasta/sequence.rb in parse_fasta-1.4.0

- old
+ new

@@ -23,11 +23,12 @@ # Calculates GC content # # Calculates GC content by dividing count of G + C divided by count # of G + C + T + A + U. If there are both T's and U's in the # Sequence, things will get weird, but then again, that wouldn't - # happen, now would it! + # happen, now would it! Ambiguous bases are ignored similar to + # BioRuby. # # @example Get GC of a Sequence # Sequence.new('ACTg').gc #=> 0.5 # @example Using with FastaFile#each_record # FastaFile.open('reads.fna', 'r').each_record do |header, sequence| @@ -44,9 +45,79 @@ t = s.count('t') a = s.count('a') u = s.count('u') return 0 if c + g + t + a + u == 0 - return (c + g).quo(c + g + t + a + u).to_f + return (c + g) / (c + g + t + a + u).to_f end + # Returns a map of base counts + # + # This method will check if the sequence is DNA or RNA and return a + # count map appropriate for each. If a truthy argument is given, the + # count of ambiguous bases will be returned as well. + # + # If a sequence has both T and U present, will warn the user and + # keep going. Will return a map with counts of both, however. + # + # @example Get base counts of DNA sequence without ambiguous bases + # Sequence.new('AcTGn').base_counts + # #=> { a: 1, c: 1, t: 1, g: 1 } + # @example Get base counts of DNA sequence with ambiguous bases + # Sequence.new('AcTGn').base_counts(true) + # #=> { a: 1, c: 1, t: 1, g: 1, n: 1 } + # @example Get base counts of RNA sequence without ambiguous bases + # Sequence.new('AcUGn').base_counts + # #=> { a: 1, c: 1, u: 1, g: 1 } + # @example Get base counts of DNA sequence with ambiguous bases + # Sequence.new('AcUGn').base_counts(true) + # #=> { a: 1, c: 1, u: 1, g: 1, n: 1 } + # + # @return [Hash] A hash with base as key, count as value + def base_counts(count_ambiguous_bases=nil) + s = self.downcase + t = s.count('t') + u = s.count('u') + counts = { a: s.count('a'), c: s.count('c'), g: s.count('g') } + + if t > 0 && u == 0 + counts[:t] = t + elsif t == 0 && u > 0 + counts[:u] = u + elsif t > 0 && u > 0 + warn('ERROR: A sequence contains both T and U') + counts[:t], counts[:u] = t, u + end + + counts[:n] = s.count('n') if count_ambiguous_bases + + counts + end + + # Returns a map of base frequencies + # + # Counts bases with the `base_counts` method, then divides each + # count by the total bases counted to give frequency for each + # base. If a truthy argument is given, ambiguous bases will be + # included in the total and their frequency reported. Can discern + # between DNA and RNA. + # + # If default or falsy argument is given, ambiguous bases will not be + # counted in the total base count and their frequency will not be + # given. + # + # @example Get base frequencies of DNA sequence without ambiguous bases + # Sequence.new('AcTGn').base_counts + # #=> { a: 0.25, c: 0.25, t: 0.25, g: 0.25 } + # @example Get base counts of DNA sequence with ambiguous bases + # Sequence.new('AcTGn').base_counts(true) + # #=> { a: 0.2, c: 0.2, t: 0.2, g: 0.2, n: 0.2 } + # + # @return [Hash] A hash with base as key, frequency as value + def base_frequencies(count_ambiguous_bases=nil) + base_counts = self.base_counts(count_ambiguous_bases) + total_bases = base_counts.values.reduce(:+).to_f + base_freqs = + base_counts.map { |base, count| [base, count/total_bases] }.flatten + Hash[*base_freqs] + end end