# -*- coding: utf-8 -*-

module Bio
  module FastQC
    # Parser for the plain-text data report written by FastQC
    # (presumably the "fastqc_data.txt" file, going by the constructor
    # argument name).
    #
    # The report is a sequence of modules. Each module starts with a
    # ">>Module name<TAB>status" line, is followed by tab-separated rows and
    # is terminated by a ">>END_MODULE" line. Lines starting with "#" are
    # column headers and are dropped, except for the
    # "#Total Duplicate Percentage" line which carries a value.
    #
    # Module accessors return the data rows of a module as an Array of
    # Arrays of Strings, or an empty Array when the module is not present in
    # the report. Derived statistics return nil when the data they are
    # computed from is missing.
    class Parser
      # @param fastqc_data_txt [String] complete text of the FastQC report
      def initialize(fastqc_data_txt)
        @data = fastqc_data_txt
        @object = parse(@data)
        @base = self.basic_statistics
      end

      # Splits the report text into modules and each module into rows.
      #
      # @param data [String] complete text of the FastQC report
      # @return [Array<Array<Array<String>>>] one entry per module; each
      #   entry is the list of kept lines split on TAB. The first row of a
      #   module is its ">>Name<TAB>status" header line.
      def parse(data)
        data.split(">>END_MODULE\n").map do |chunk|
          kept = chunk.split("\n").select do |line|
            # Drop "#" header lines, but keep the one that carries a value.
            line !~ /^\#/ || line =~ /^#Total Duplicate Percentage/
          end
          kept.map { |line| line.split("\t") }
        end
      end

      # @return [String, nil] last tab-separated field of the first line of
      #   the report, or nil when the report text is empty
      def fastqc_version
        first_line = @data.split("\n").first
        first_line && first_line.split("\t").last
      end

      # Key/value pairs of the "Basic Statistics" module. The module header
      # row is included as well (key ">>Basic Statistics", value = status).
      #
      # @return [Hash{String => String, nil}] empty when the module is absent
      def basic_statistics
        rows = find_module(">>Basic Statistics") || []
        rows.each_with_object({}) do |row, stats|
          next if row.empty?
          # A row whose value is empty splits into a single field; store nil
          # for it instead of shifting every following key/value pair.
          stats[row[0]] = row[1]
        end
      end

      def filename
        @base["Filename"]
      end

      def file_type
        @base["File type"]
      end

      def encoding
        @base["Encoding"]
      end

      # @return [Integer] 0 when the value is missing
      def total_sequences
        @base["Total Sequences"].to_i
      end

      # @return [Integer] 0 when the value is missing
      def filtered_sequences
        @base["Filtered Sequences"].to_i
      end

      # @return [String, nil] raw value, either a single length ("101") or a
      #   range ("30-101")
      def sequence_length
        @base["Sequence length"]
      end

      # @return [Integer] lower bound of the "Sequence length" value
      def min_length
        l = @base["Sequence length"]
        l =~ /\d-\d/ ? l.sub(/-\d+$/, "").to_i : l.to_i
      end

      # @return [Integer] upper bound of the "Sequence length" value
      def max_length
        l = @base["Sequence length"]
        l =~ /\d-\d/ ? l.sub(/^\d+-/, "").to_i : l.to_i
      end

      # @return [Integer] 0 when the value is missing
      def percent_gc
        @base["%GC"].to_i
      end

      def per_base_sequence_quality
        module_rows(">>Per base sequence quality")
      end

      ## Custom module: overall mean base call quality indicator
      # Averages the second column of "Per base sequence quality" in
      # error-probability space and converts the result back to a score.
      #
      # @return [Float, nil] nil when the module has no rows
      def overall_mean_quality_score
        averaged_quality_score(1)
      end

      ## Custom module: overall median base call quality indicator
      # Same computation as #overall_mean_quality_score on the third column.
      #
      # @return [Float, nil] nil when the module has no rows
      def overall_median_quality_score
        averaged_quality_score(2)
      end

      def per_tile_sequence_quality
        module_rows(">>Per tile sequence quality")
      end

      def per_sequence_quality_scores
        module_rows(">>Per sequence quality scores")
      end

      def per_base_sequence_content
        module_rows(">>Per base sequence content")
      end

      def per_sequence_gc_content
        module_rows(">>Per sequence GC content")
      end

      def per_base_n_content
        module_rows(">>Per base N content")
      end

      ## Custom module: overall N content
      # Arithmetic mean of the second column of "Per base N content".
      #
      # @return [Float, nil] nil when the module has no rows
      def overall_n_content
        values = per_base_n_content.map { |row| row[1].to_f }
        return nil if values.empty?
        values.reduce(:+) / values.size
      end

      def sequence_length_distribution
        module_rows(">>Sequence Length Distribution")
      end

      ## Custom module: mean sequence length calculated from distribution
      # Each bin contributes (bin length * count); the total is divided by
      # the "Total Sequences" value of the basic statistics.
      #
      # @return [Float, nil] nil when the distribution has no rows
      def mean_sequence_length
        weighted = sequence_length_distribution.map do |length, count|
          bin_length(length) * count.to_f
        end
        return nil if weighted.empty?
        weighted.reduce(:+) / self.total_sequences
      end

      ## Custom module: median sequence length calculated from distribution
      # Walks the bins in ascending length order using cumulative counts, so
      # no per-sequence array has to be built.
      #
      # @return [Float, nil] nil when the distribution holds no sequences
      def median_sequence_length
        bins = sequence_length_distribution.map do |length, count|
          # Go through Float first: String#to_i would read a count written
          # in exponent notation such as "1.0E7" as 1.
          [bin_length(length), count.to_f.to_i]
        end
        bins = bins.select { |_, count| count > 0 }.sort_by { |length, _| length }
        total = bins.inject(0) { |sum, (_, count)| sum + count }
        return nil if total.zero?

        upper = length_at_rank(bins, total / 2)
        return upper if total.odd?
        (length_at_rank(bins, total / 2 - 1) + upper) / 2
      end

      # Rows of "Sequence Duplication Levels" without the
      # "#Total Duplicate Percentage" line.
      def sequence_duplication_levels
        module_rows(">>Sequence Duplication Levels").reject do |row|
          row.first == "\#Total Duplicate Percentage"
        end
      end

      # @return [Float] value of the "#Total Duplicate Percentage" line;
      #   0.0 when the line or the whole module is absent
      def total_duplicate_percentage
        rows = find_module(">>Sequence Duplication Levels") || []
        row = rows.find { |r| r.first == "\#Total Duplicate Percentage" }
        row ? row[1].to_f : 0.0
      end

      def overrepresented_sequences
        module_rows(">>Overrepresented sequences")
      end

      def adapter_content
        module_rows(">>Adapter Content")
      end

      def kmer_content
        module_rows(">>Kmer Content")
      end

      # @return [Hash{Symbol => Object}] every value exposed by this parser
      def summary
        {
          fastqc_version: self.fastqc_version,
          filename: self.filename,
          file_type: self.file_type,
          encoding: self.encoding,
          total_sequences: self.total_sequences,
          filtered_sequences: self.filtered_sequences,
          sequence_length: self.sequence_length,
          percent_gc: self.percent_gc,
          per_base_sequence_quality: self.per_base_sequence_quality,
          per_tile_sequence_quality: self.per_tile_sequence_quality,
          per_sequence_quality_scores: self.per_sequence_quality_scores,
          per_base_sequence_content: self.per_base_sequence_content,
          per_sequence_gc_content: self.per_sequence_gc_content,
          per_base_n_content: self.per_base_n_content,
          sequence_length_distribution: self.sequence_length_distribution,
          total_duplicate_percentage: self.total_duplicate_percentage,
          sequence_duplication_levels: self.sequence_duplication_levels,
          overrepresented_sequences: self.overrepresented_sequences,
          adapter_content: self.adapter_content,
          kmer_content: self.kmer_content,
          min_length: self.min_length,
          max_length: self.max_length,
          overall_mean_quality_score: self.overall_mean_quality_score,
          overall_median_quality_score: self.overall_median_quality_score,
          overall_n_content: self.overall_n_content,
          mean_sequence_length: self.mean_sequence_length,
          median_sequence_length: self.median_sequence_length,
        }
      end

      private

      # First parsed module whose header row starts with +header+, or nil.
      # Chunks without any row (e.g. trailing blank lines) are skipped.
      def find_module(header)
        @object.find { |rows| !rows.empty? && rows.first.first == header }
      end

      # Data rows of the module named +header+ (header row removed), or an
      # empty Array when the report does not contain that module.
      def module_rows(header)
        rows = find_module(header)
        return [] unless rows
        rows.reject { |row| row.first == header }
      end

      # Representative length of a distribution bin: the midpoint of a
      # "from-to" range, otherwise the value itself.
      def bin_length(length)
        if length =~ /\d-\d/
          lower = length.sub(/-\d+$/, "").to_i
          upper = length.sub(/^\d+-/, "").to_i
          # Float division keeps the half for ranges with an odd sum.
          (lower + upper) / 2.0
        else
          length.to_f
        end
      end

      # Length found at 0-based position +rank+ of the expanded, sorted
      # distribution. +bins+ is a list of [length, count] sorted by length.
      def length_at_rank(bins, rank)
        seen = 0
        bins.each do |length, count|
          seen += count
          return length if rank < seen
        end
        nil
      end

      # Converts one column of "Per base sequence quality" to error
      # probabilities (10 ** (-score / 10)), averages them and converts the
      # average back to a score.
      def averaged_quality_score(column)
        probabilities = per_base_sequence_quality.map do |row|
          10**(row[column].to_f / -10)
        end
        return nil if probabilities.empty?
        -10 * Math.log10(probabilities.reduce(:+) / probabilities.size)
      end
    end
  end
end