module Lederhosen
module Trimmer

##
# Code used for sequence trimming
#
# - PairedTrimmer
# - HuangTrimmer
# - ProbabilityTrimmer
# - QSEQTrimmer
#
# Some major refactoring needs to get done here
#

# HaungTrimmer
#
# class that has the trim function. Used in mixins
# this trim function is based on the function documented
# in the paper:
#   Huang X, Wang J, Aluru S, Yang SP, Hillier L. (2003). PCAP:
#   a whole-genome assembly program. Genome Res 13:
#   2164–2170.
#
# The implementation is a direct copy from the perl implementation
# implemented in Pangea 1.0:
#   PANGEA: pipeline for analysis of next generation amplicons
#   A Giongo, DB Crabb, AG Davis-Richardson - ISME , 2010
#
class HuangTrimmer

  def initialize(args={})
    @min = args[:min]
    @offset = args[:offset]
  end

  def trim_seq(dna)

    _sum, _max, first, last, start, _end = 0, 0, 0, 0, 0

    dna.quality.each_byte.each_with_index do |b, a|
      _sum += (b - @offset - @min)
      if _sum > _max
        _max = _sum
        _end = a
        start = first
      elsif _sum < 0
        _sum = 0
        first = a
      end
    end

    begin
      dna.sequence[start, _end - start].gsub('.', 'N')
    rescue
      nil
    end
  end
end

#
# return the longest string starting from the left side
# where the PROBABILITY OF ERROR as computed from the PHRED
# scores does not go above a certain cutoff
# (default is 0.005)
#
class ProbabilityTrimmer

  def initialize(args = {})
    @cutoff = args[:cutoff] || 0.005
    @min = args[:min]
    @seqtech = args[:seq_tech] || fail
    # must be illumina, sanger or solexa
  end

  def trim_seq(dna)
    trim_coord = dna.sequence.size
    probabilities = dna.send(:"#{@seqtech}_probabilities")
    probabilities.each_with_index do |q, i|
      if q > @cutoff
        trim_coord = i
        break
      end
    end

    begin
      dna.sequence[0..trim_coord].gsub('.', 'N')
    rescue
      nil
    end
  end
end

#
# Base class for trimming paired-end reads
#
class PairedTrimmer < Enumerator

  def initialize(args = {})
    @pretrim    = args[:pretrim]
    # TODO
    # need to be able to trim from left, right of pairs
    # thinking about specifying a "trimming language"
    #
    # Something like:
    #
    # --trim="5L0 0L3"
    # --trim="0L4 2L6"
    #
    # also thinking about breaking all of this trimming stuff
    # out into its own package. (to be more unixy and stuff ;)
    #
    @min_length = args[:min_length] || 70
    @min         = args[:min] || 20
    @offset      = args[:cutoff] || 64 # XXX should both be called 'cutoff'
    @left_trim   = args[:left_trim] || 0 # trim adapter sequence
    @skip_ambig  = args[:skip_ambiguous] || false
    @trimmer     = args[:trimmer] || ProbabilityTrimmer.new(:min => @min,
                                                           :offset => @offset,
                                                           :seq_tech =>
                                                           :illumina)
  end

  def each(&block)

    skipped_because_singleton = 0
    skipped_because_length = 0
    skipped_because_ambig = 0

    @paired_iterator.each_with_index do |a, i|
      seqa = @trimmer.trim_seq(a[0])[@left_trim..-1] rescue nil # trim adapter sequence
      seqb = @trimmer.trim_seq a[1]

      # make sure sequences are good
      # (both pairs survived and both are at least min_length long)
      # optionally skip reads that contain ambiguous nucleotides (N)
      if [seqa, seqb].include? nil
        skipped_because_singleton += 1
      elsif !(seqb.length >= @min_length && seqa.length >= @min_length)
        skipped_because_length += 1
      elsif @skip_ambig and (seqb =~ /N/ or seqa =~ /N/)
        skipped_because_ambig
      else # reads are good
        #
        # TODO
        # this is experiment specific. I save memory down the road
        # by having both of the reads in the forward orientation
        # but depending on the sequencing technology/pipeline
        # this may change.
        #
        # I'm planning on removing the trimming steps from lederhosen
        # for their own gem. With that, this will go too.
        #
        seqb = reverse_complement(seqb)
        
        # Create and yield new fasta objects
        # Perhaps this is slow?
        a = Fasta.new :name => "#{i}:0", :sequence => seqa
        b = Fasta.new :name => "#{i}:1", :sequence => seqb
        block.yield a
        block.yield b
      end
    end
  end

  # reverse complement a DNA sequence
  # assumes only GATCN nucleotides
  def reverse_complement(s)
    s.reverse.tr('GATCNgatcn','CTAGNctagn')
  end
end

#
# Yields trimmed fasta records given an input
# interleaved, paired-end fastq file
#
class InterleavedTrimmer < PairedTrimmer

  def initialize(interleaved_file, args = {})
    # create an iterator that yields paired records
    # as an array

    handle =
      begin
        Zlib::GzipReader.open(interleaved_file)
      rescue Zlib::GzipFile::Error
        File.open(interleaved_file)
      end

    reads = Dna.new handle
    @paired_iterator = reads.each_slice(2)

    super(args)
  end
end

#
# Yield trimmed fasta records given an two separate
# paired QSEQ files
#
class QSEQTrimmer < PairedTrimmer
  def initialize(left_file, right_file, args = {})
    # create an iterator that yields paired records
    # as an array

    left_handle, right_handle =
      begin
        [ Zlib::GzipReader.open(left_file), Zlib::GzipReader.open(right_file)]
      rescue Zlib::GzipFile::Error
        [ File.open(left_file), File.open(right_file) ]
      end

    left_file_reads  = Dna.new left_handle
    right_reads = Dna.new right_handle

    @paired_iterator = left_file_reads.zip(right_reads)

    super(args)

    left_handle.close
    right_handle.close
  end
end

end # module Trimmer
end # module Lederhosen