trimmer.rb in lederhosen-1.7.0

- old
+ new

@@ -1,59 +1,42 @@
 module Lederhosen
 module Trimmer
 
-# Base class for trimming paired-end reads
-class PairedTrimmer < Enumerator
-  def initialize(paired_iterator, args = {})
-    @paired_iterator = paired_iterator
-    @pretrim         = args[:pretrim]
-    @min_length      = args[:min_length] || 70
-    @min             = args[:min] || 20
-    @offset          = args[:cutoff] || 64 # XXX should both be called 'cutoff'
-    @pretrim         = args[:pretrim] || false
-  end
+##
+# Code used for sequence trimming
+#
+# - PairedTrimmer
+# - HuangTrimmer
+# - ProbabilityTrimmer
+# - QSEQTrimmer
+#
+# Some major refactoring needs to get done here
+#
 
-  def each(&block)
-    @paired_iterator.each_with_index do |a, i|
-      seqa = trim_seq a[0], :pretrim => @pretrim
-      seqb = trim_seq a[1], :pretrim => @pretrim
-      unless [seqa, seqb].include? nil
-        if seqb.length >= @min_length && seqa.length >= @min_length
-          seqb = reverse_complement(seqb) # experiment-specific?
-          a = Fasta.new :name => "#{i}:0", :sequence => seqa
-          b = Fasta.new :name => "#{i}:1", :sequence => seqb
-          block.yield a
-          block.yield b
-        else # we just skip bad reads entirely
-          next
-        end
-      else
-        next
-      end
-    end
-  end
+# HaungTrimmer
+#
+# class that has the trim function. Used in mixins
+# this trim function is based on the function documented
+# in the paper:
+#   Huang X, Wang J, Aluru S, Yang SP, Hillier L. (2003). PCAP:
+#   a whole-genome assembly program. Genome Res 13:
+#   2164–2170.
+#
+# The implementation is a direct copy from the perl implementation
+# implemented in Pangea 1.0:
+#   PANGEA: pipeline for analysis of next generation amplicons
+#   A Giongo, DB Crabb, AG Davis-Richardson - ISME , 2010
+#
+class HuangTrimmer
 
-  # reverse complement a DNA sequence
-  # assumes only GATCN nucleotides
-  def reverse_complement(s)
-    s.reverse.tr('GATCNgatcn','CTAGNctagn')
+  def initialize(args={})
+    @min = args[:min]
+    @offset = args[:offset]
   end
 
-  # this method does the actual trimming. It is a class method
-  # so you can use it if you don't want to initialize a PairedTrimmer
-  def trim_seq(dna, args={})
+  def trim_seq(dna)
 
-    # trim primers off of sequence
-    # XXX this is experiment-specific and needs to be made
-    # into a parameter
-    if @pretrim
-      dna.sequence = dna.sequence[@pretrim..-1]
-      dna.quality  = dna.quality[@pretrim..-1]
-    end
-
-    dna.sequence.gsub! '.', 'N'
-
     _sum, _max, first, last, start, _end = 0, 0, 0, 0, 0
 
     dna.quality.each_byte.each_with_index do |b, a|
       _sum += (b - @offset - @min)
       if _sum > _max
@@ -64,18 +47,134 @@
         _sum = 0
         first = a
       end
     end
 
-    dna.sequence[start, _end - start] rescue nil
+    begin
+      dna.sequence[start, _end - start].gsub('.', 'N')
+    rescue
+      nil
+    end
   end
+end
 
+#
+# return the longest string starting from the left side
+# where the PROBABILITY OF ERROR as computed from the PHRED
+# scores does not go above a certain cutoff
+# (default is 0.005)
+#
+class ProbabilityTrimmer
+
+  def initialize(args = {})
+    @cutoff = args[:cutoff] || 0.005
+    @min = args[:min]
+    @seqtech = args[:seq_tech] || fail
+    # must be illumina, sanger or solexa
+  end
+
+  def trim_seq(dna)
+    trim_coord = dna.sequence.size
+    probabilities = dna.send(:"#{@seqtech}_probabilities")
+    probabilities.each_with_index do |q, i|
+      if q > @cutoff
+        trim_coord = i
+        break
+      end
+    end
+
+    begin
+      dna.sequence[0..trim_coord].gsub('.', 'N')
+    rescue
+      nil
+    end
+  end
 end
 
 #
+# Base class for trimming paired-end reads
+#
+class PairedTrimmer < Enumerator
+
+  def initialize(args = {})
+    @pretrim    = args[:pretrim]
+    # TODO
+    # need to be able to trim from left, right of pairs
+    # thinking about specifying a "trimming language"
+    #
+    # Something like:
+    #
+    # --trim="5L0 0L3"
+    # --trim="0L4 2L6"
+    #
+    # also thinking about breaking all of this trimming stuff
+    # out into its own package. (to be more unixy and stuff ;)
+    #
+    @min_length = args[:min_length] || 70
+    @min         = args[:min] || 20
+    @offset      = args[:cutoff] || 64 # XXX should both be called 'cutoff'
+    @left_trim   = args[:left_trim] || 0 # trim adapter sequence
+    @skip_ambig  = args[:skip_ambiguous] || false
+    @trimmer     = args[:trimmer] || ProbabilityTrimmer.new(:min => @min,
+                                                           :offset => @offset,
+                                                           :seq_tech =>
+                                                           :illumina)
+  end
+
+  def each(&block)
+
+    skipped_because_singleton = 0
+    skipped_because_length = 0
+    skipped_because_ambig = 0
+
+    @paired_iterator.each_with_index do |a, i|
+      seqa = @trimmer.trim_seq(a[0])[@left_trim..-1] rescue nil # trim adapter sequence
+      seqb = @trimmer.trim_seq a[1]
+
+      # make sure sequences are good
+      # (both pairs survived and both are at least min_length long)
+      # optionally skip reads that contain ambiguous nucleotides (N)
+      if [seqa, seqb].include? nil
+        skipped_because_singleton += 1
+      elsif !(seqb.length >= @min_length && seqa.length >= @min_length)
+        skipped_because_length += 1
+      elsif @skip_ambig and (seqb =~ /N/ or seqa =~ /N/)
+        skipped_because_ambig
+      else # reads are good
+        #
+        # TODO
+        # this is experiment specific. I save memory down the road
+        # by having both of the reads in the forward orientation
+        # but depending on the sequencing technology/pipeline
+        # this may change.
+        #
+        # I'm planning on removing the trimming steps from lederhosen
+        # for their own gem. With that, this will go too.
+        #
+        seqb = reverse_complement(seqb)
+        
+        # Create and yield new fasta objects
+        # Perhaps this is slow?
+        a = Fasta.new :name => "#{i}:0", :sequence => seqa
+        b = Fasta.new :name => "#{i}:1", :sequence => seqb
+        block.yield a
+        block.yield b
+      end
+    end
+  end
+
+  # reverse complement a DNA sequence
+  # assumes only GATCN nucleotides
+  def reverse_complement(s)
+    s.reverse.tr('GATCNgatcn','CTAGNctagn')
+  end
+end
+
+#
 # Yields trimmed fasta records given an input
 # interleaved, paired-end fastq file
+#
 class InterleavedTrimmer < PairedTrimmer
 
   def initialize(interleaved_file, args = {})
     # create an iterator that yields paired records
     # as an array
@@ -86,19 +185,20 @@
       rescue Zlib::GzipFile::Error
         File.open(interleaved_file)
       end
 
     reads = Dna.new handle
-    iterator = reads.each_slice(2)
+    @paired_iterator = reads.each_slice(2)
 
-    super(iterator, args)
-
+    super(args)
   end
 end
 
+#
 # Yield trimmed fasta records given an two separate
 # paired QSEQ files
+#
 class QSEQTrimmer < PairedTrimmer
   def initialize(left_file, right_file, args = {})
     # create an iterator that yields paired records
     # as an array
 
@@ -110,12 +210,12 @@
       end
 
     left_file_reads  = Dna.new left_handle
     right_reads = Dna.new right_handle
 
-    iterator = left_file_reads.zip(right_reads)
+    @paired_iterator = left_file_reads.zip(right_reads)
 
-    super(iterator, args)
+    super(args)
 
     left_handle.close
     right_handle.close
   end
 end