Sha256: fc2ab34c635bab7c1fa4422c3c6b17e10417e1d45d5780f30d32a4c223eb04a7

Contents?: true

Size: 1.93 KB

Versions: 6

Compression:

Stored size: 1.93 KB

Contents

##
# FILTER READS WITH LOW ABUNDANCE KMERS
#

module Lederhosen
  class CLI

    desc "k_filter",
         "filter novel reads likely to form small/singleton clusters (experimental)"

    method_option :input,    :type => :string,  :required => true
    method_option :output,   :type => :string,  :required => true
    method_option :k,        :type => :numeric, :required => true
    method_option :cutoff,   :type => :numeric, :required => true

    def k_filter
      input  = options[:input]
      output = options[:output]
      k_len  = options[:k].to_i
      cutoff = options[:cutoff]

      counting_table = Hash.new { |h, k| h[k] = 0 }
      total_reads = 0

      ohai "counting kmers"
      File.open(input) do |handle|
        records = Dna.new handle
        records.each do |r|
          total_reads += 1
          kmers = r.sequence.to_kmers(k_len)
          kmers.each { |x| counting_table[x] += 1 }
        end
      end

      sum_of_kmers = counting_table.values.inject(:+)

      ohai "total reads = #{total_reads}"
      ohai "sum of kmers = #{sum_of_kmers}"

      kept = 0
      total_reads = total_reads.to_f

      pbar = ProgressBar.new "saving", total_reads.to_i
      output = File.open(output, 'w')
      File.open(input) do |handle|
        records = Dna.new handle

        records.each do |r|
          kmers = r.sequence.to_kmers(k_len)

          # check if any of the kmers are rare
          keep = true
          coverage = 0
          kmers.each do |kmer|
            # if any of the kmers are rare, don't print the read
            c = counting_table[kmer]
            coverage += c
            if c < cutoff
              keep = false
              break
            end
          end

          if keep
            kept += 1
            output.puts r
          end
          pbar.inc      
        end
      end

      pbar.finish

      ohai "survivors = #{kept} (#{kept/total_reads.to_f})"
      output.close
    end
  end

end

Version data entries

6 entries across 6 versions & 1 rubygems

Version Path
lederhosen-0.2.0 lib/lederhosen/tasks/k_filter.rb
lederhosen-0.1.9 lib/lederhosen/tasks/k_filter.rb
lederhosen-0.1.8 lib/lederhosen/tasks/k_filter.rb
lederhosen-0.1.7 lib/lederhosen/tasks/k_filter.rb
lederhosen-0.1.6 lib/lederhosen/tasks/k_filter.rb
lederhosen-0.1.5 lib/lederhosen/tasks/k_filter.rb