lib/lederhosen/trimmer.rb in lederhosen-1.6.2 vs lib/lederhosen/trimmer.rb in lederhosen-1.7.0
- old
+ new
@@ -1,59 +1,42 @@
module Lederhosen
module Trimmer
-# Base class for trimming paired-end reads
-class PairedTrimmer < Enumerator
- def initialize(paired_iterator, args = {})
- @paired_iterator = paired_iterator
- @pretrim = args[:pretrim]
- @min_length = args[:min_length] || 70
- @min = args[:min] || 20
- @offset = args[:cutoff] || 64 # XXX should both be called 'cutoff'
- @pretrim = args[:pretrim] || false
- end
+##
+# Code used for sequence trimming
+#
+# - PairedTrimmer
+# - HuangTrimmer
+# - ProbabilityTrimmer
+# - QSEQTrimmer
+#
+# Some major refactoring needs to get done here
+#
- def each(&block)
- @paired_iterator.each_with_index do |a, i|
- seqa = trim_seq a[0], :pretrim => @pretrim
- seqb = trim_seq a[1], :pretrim => @pretrim
- unless [seqa, seqb].include? nil
- if seqb.length >= @min_length && seqa.length >= @min_length
- seqb = reverse_complement(seqb) # experiment-specific?
- a = Fasta.new :name => "#{i}:0", :sequence => seqa
- b = Fasta.new :name => "#{i}:1", :sequence => seqb
- block.yield a
- block.yield b
- else # we just skip bad reads entirely
- next
- end
- else
- next
- end
- end
- end
+# HaungTrimmer
+#
+# class that has the trim function. Used in mixins
+# this trim function is based on the function documented
+# in the paper:
+# Huang X, Wang J, Aluru S, Yang SP, Hillier L. (2003). PCAP:
+# a whole-genome assembly program. Genome Res 13:
+# 2164–2170.
+#
+# The implementation is a direct copy from the perl implementation
+# implemented in Pangea 1.0:
+# PANGEA: pipeline for analysis of next generation amplicons
+# A Giongo, DB Crabb, AG Davis-Richardson - ISME , 2010
+#
+class HuangTrimmer
- # reverse complement a DNA sequence
- # assumes only GATCN nucleotides
- def reverse_complement(s)
- s.reverse.tr('GATCNgatcn','CTAGNctagn')
+ def initialize(args={})
+ @min = args[:min]
+ @offset = args[:offset]
end
- # this method does the actual trimming. It is a class method
- # so you can use it if you don't want to initialize a PairedTrimmer
- def trim_seq(dna, args={})
+ def trim_seq(dna)
- # trim primers off of sequence
- # XXX this is experiment-specific and needs to be made
- # into a parameter
- if @pretrim
- dna.sequence = dna.sequence[@pretrim..-1]
- dna.quality = dna.quality[@pretrim..-1]
- end
-
- dna.sequence.gsub! '.', 'N'
-
_sum, _max, first, last, start, _end = 0, 0, 0, 0, 0
dna.quality.each_byte.each_with_index do |b, a|
_sum += (b - @offset - @min)
if _sum > _max
@@ -64,18 +47,134 @@
_sum = 0
first = a
end
end
- dna.sequence[start, _end - start] rescue nil
+ begin
+ dna.sequence[start, _end - start].gsub('.', 'N')
+ rescue
+ nil
+ end
end
+end
+#
+# return the longest string starting from the left side
+# where the PROBABILITY OF ERROR as computed from the PHRED
+# scores does not go above a certain cutoff
+# (default is 0.005)
+#
+class ProbabilityTrimmer
+
+ def initialize(args = {})
+ @cutoff = args[:cutoff] || 0.005
+ @min = args[:min]
+ @seqtech = args[:seq_tech] || fail
+ # must be illumina, sanger or solexa
+ end
+
+ def trim_seq(dna)
+ trim_coord = dna.sequence.size
+ probabilities = dna.send(:"#{@seqtech}_probabilities")
+ probabilities.each_with_index do |q, i|
+ if q > @cutoff
+ trim_coord = i
+ break
+ end
+ end
+
+ begin
+ dna.sequence[0..trim_coord].gsub('.', 'N')
+ rescue
+ nil
+ end
+ end
end
#
+# Base class for trimming paired-end reads
+#
+class PairedTrimmer < Enumerator
+
+ def initialize(args = {})
+ @pretrim = args[:pretrim]
+ # TODO
+ # need to be able to trim from left, right of pairs
+ # thinking about specifying a "trimming language"
+ #
+ # Something like:
+ #
+ # --trim="5L0 0L3"
+ # --trim="0L4 2L6"
+ #
+ # also thinking about breaking all of this trimming stuff
+ # out into its own package. (to be more unixy and stuff ;)
+ #
+ @min_length = args[:min_length] || 70
+ @min = args[:min] || 20
+ @offset = args[:cutoff] || 64 # XXX should both be called 'cutoff'
+ @left_trim = args[:left_trim] || 0 # trim adapter sequence
+ @skip_ambig = args[:skip_ambiguous] || false
+ @trimmer = args[:trimmer] || ProbabilityTrimmer.new(:min => @min,
+ :offset => @offset,
+ :seq_tech =>
+ :illumina)
+ end
+
+ def each(&block)
+
+ skipped_because_singleton = 0
+ skipped_because_length = 0
+ skipped_because_ambig = 0
+
+ @paired_iterator.each_with_index do |a, i|
+ seqa = @trimmer.trim_seq(a[0])[@left_trim..-1] rescue nil # trim adapter sequence
+ seqb = @trimmer.trim_seq a[1]
+
+ # make sure sequences are good
+ # (both pairs survived and both are at least min_length long)
+ # optionally skip reads that contain ambiguous nucleotides (N)
+ if [seqa, seqb].include? nil
+ skipped_because_singleton += 1
+ elsif !(seqb.length >= @min_length && seqa.length >= @min_length)
+ skipped_because_length += 1
+ elsif @skip_ambig and (seqb =~ /N/ or seqa =~ /N/)
+ skipped_because_ambig
+ else # reads are good
+ #
+ # TODO
+ # this is experiment specific. I save memory down the road
+ # by having both of the reads in the forward orientation
+ # but depending on the sequencing technology/pipeline
+ # this may change.
+ #
+ # I'm planning on removing the trimming steps from lederhosen
+ # for their own gem. With that, this will go too.
+ #
+ seqb = reverse_complement(seqb)
+
+ # Create and yield new fasta objects
+ # Perhaps this is slow?
+ a = Fasta.new :name => "#{i}:0", :sequence => seqa
+ b = Fasta.new :name => "#{i}:1", :sequence => seqb
+ block.yield a
+ block.yield b
+ end
+ end
+ end
+
+ # reverse complement a DNA sequence
+ # assumes only GATCN nucleotides
+ def reverse_complement(s)
+ s.reverse.tr('GATCNgatcn','CTAGNctagn')
+ end
+end
+
+#
# Yields trimmed fasta records given an input
# interleaved, paired-end fastq file
+#
class InterleavedTrimmer < PairedTrimmer
def initialize(interleaved_file, args = {})
# create an iterator that yields paired records
# as an array
@@ -86,19 +185,20 @@
rescue Zlib::GzipFile::Error
File.open(interleaved_file)
end
reads = Dna.new handle
- iterator = reads.each_slice(2)
+ @paired_iterator = reads.each_slice(2)
- super(iterator, args)
-
+ super(args)
end
end
+#
# Yield trimmed fasta records given an two separate
# paired QSEQ files
+#
class QSEQTrimmer < PairedTrimmer
def initialize(left_file, right_file, args = {})
# create an iterator that yields paired records
# as an array
@@ -110,12 +210,12 @@
end
left_file_reads = Dna.new left_handle
right_reads = Dna.new right_handle
- iterator = left_file_reads.zip(right_reads)
+ @paired_iterator = left_file_reads.zip(right_reads)
- super(iterator, args)
+ super(args)
left_handle.close
right_handle.close
end
end