require 'json'
require 'genevalidator/validation_report'
require 'genevalidator/ext/array'
module GeneValidator
  ##
  # Class that stores the validation output information
  class GeneMergeValidationOutput < ValidationReport
    attr_reader :slope
    attr_reader :threshold_down
    attr_reader :threshold_up
    attr_reader :unimodality
    attr_reader :result

    # These thresholds are emperically chosen.
    UPPER_THRESHOLD = 1.2 # radians
    LOWER_THRESHOLD = 0.4 # radians

    def initialize(short_header, header, description, slope, unimodality,
                   expected = :no)
      @short_header, @header, @description = short_header, header, description
      @slope          = slope.round(1)
      @slope          = @slope.abs if @slope == -0.0
      @unimodality    = unimodality
      @threshold_down = LOWER_THRESHOLD
      @threshold_up   = UPPER_THRESHOLD
      @result         = validation
      @expected       = expected
      @plot_files     = []
      @approach       = 'We expect the query sequence to encode a single' \
                        ' protein-coding gene. Here, we analyse the' \
                        ' High-scoring Segment Pairs (HSPs) identified by' \
                        ' BLAST to determine whether the query includes' \
                        ' sequence from two or more genes.'
      @explanation    = explain
      @conclusion     = conclude
    end

    def explain
      if @unimodality
        'The start coordinates and the end coordinates of HSPs are unimodally' \
        ' distributed.'
      else
        'The distribution of start and/or end-coordinates of HSPs are' \
        ' multi-modal. To detect potential problems we performed a linear'\
        ' regression (with coordinates weighted inversely proportionally to '\
        " hit strength). The resulting slope is #{@slope}."
      end
    end

    def conclude
      if @unimodality
        'This suggest that the query sequence represents a single gene.'
      else
        diff = (@result == :yes) ? ' within' : ' outside'
        t = "This slope is #{diff} our empirically calculated thresholds" \
            ' (0.4 and 1.2).'
        if @result == :yes
          t << ' This suggests the query contains sequence from two or more' \
               ' different genes.'
        else
          t << ' There is no evidence that the query contains sequence from' \
               ' multiple genes.'
        end
        t
      end
    end

    def print
      (@slope.nan?) ? 'Inf' : "#{@slope}"
    end

    def validation
      (@slope > threshold_down && @slope < threshold_up) ? :yes : :no
    end

    def color
      (validation == :no) ? 'success' : 'danger'
    end
  end

  ##
  # This class contains the methods necessary for
  # checking whether there is evidence that the
  # prediction is a merge of multiple genes
  class GeneMergeValidation < ValidationTest
    attr_reader :hits
    attr_reader :prediction
    attr_reader :filename

    ##
    # Initilizes the object
    # Params:
    # +type+: type of the predicted sequence (:nucleotide or :protein)
    # +prediction+: a +Sequence+ object representing the blast query
    # +hits+: a vector of +Sequence+ objects (representing blast hits)
    # +filename+: name of the input file, used when generatig the plot files
    # +boundary+: the offset of the hit from which we start analysing the hit
    def initialize(type, prediction, hits, filename, boundary = 10)
      super
      @short_header = 'Gene_Merge'
      @header       = 'Gene Merge'
      @description  = 'Check whether BLAST hits make evidence about a merge' \
                      ' of two genes that match the predicted gene.'
      @cli_name     = 'merge'
      @filename     = filename
      @boundary     = boundary
    end

    ##
    # Validation test for gene merge
    # Output:
    # +GeneMergeValidationOutput+ object
    def run
      fail NotEnoughHitsError unless hits.length >= 5
      fail Exception unless prediction.is_a?(Sequence) && hits[0].is_a?(Sequence)

      start = Time.now

      pairs = hits.map { |hit| Pair.new(hit.hsp_list.map{ |hsp| hsp.match_query_from }.min,
                                        hit.hsp_list.map{ |hsp| hsp.match_query_to }.max) }
      xx_0 = pairs.map(&:x)
      yy_0 = pairs.map(&:y)

      # minimum start shoud be at 'boundary' residues
      xx = xx_0.map { |x| (x < @boundary) ? @boundary : x }

      # maximum end should be at length - 'boundary' residues
      yy = yy_0.map do |y|
        if y > @prediction.raw_sequence.length - @boundary
          @prediction.raw_sequence.length - @boundary
        else
          y
        end
      end

      line_slope = slope(xx, yy, (1..hits.length).map{ |x| 1 / (x + 0.0) })
      ## YW - what is this weighting?

      unimodality = false
      if unimodality_test(xx, yy)
        unimodality = true
        lm_slope = 0.0
      else
        lm_slope = line_slope[1]
      end

      y_intercept = line_slope[0]

      @validation_report = GeneMergeValidationOutput.new(@short_header, @header,
                                                         @description, lm_slope,
                                                         unimodality)
      if unimodality
        plot1 = plot_2d_start_from
      else
        plot1 = plot_2d_start_from(lm_slope, y_intercept)
      end

      @validation_report.plot_files.push(plot1)
      plot2 = plot_matched_regions
      @validation_report.plot_files.push(plot2)
      @validation_report.running_time = Time.now - start
      @validation_report

    rescue NotEnoughHitsError
      @validation_report = ValidationReport.new('Not enough evidence', :warning,
                                                @short_header, @header,
                                                @description, @approach,
                                                @explanation, @conclusion)
    end

    ##
    # Generates a json file containing data used for
    # plotting the matched region of the prediction for each hit
    # Param
    # +output+: location where the plot will be saved in jped file format
    # +hits+: array of Sequence objects
    # +prediction+: Sequence objects
    def plot_matched_regions(output = "#{filename}_match.json", hits = @hits)

      colors   = ['orange', 'blue']  ##{colors[i%2]
      f        = File.open(output, 'w')
      no_lines = hits.length

      hits_less = hits[0..[no_lines, hits.length - 1].min]

      f.write((hits_less.each_with_index.map { |hit, i|
        { 'y' => i,
          'start' => hit.hsp_list.map(&:match_query_from).min,
          'stop' => hit.hsp_list.map(&:match_query_to).max,
          'color'=>'black',
          'dotted'=>'true'}}.flatten +
        hits_less.each_with_index.map { |hit, i|
          hit.hsp_list.map { |hsp|
            { 'y' => i,
              'start' => hsp.match_query_from,
              'stop' => hsp.match_query_to,
              'color' => 'orange'} } }.flatten).to_json)
      f.close

      Plot.new(output.scan(%r{([^/]+)$})[0][0],
               :lines,
               'Gene Merge Validation: Query coord covered by blast hit (1 line/hit)',
               '',
               'Offset in Prediction',
               'Hit Number',
               hits_less.length)
    end

    ##
    # Generates a json file containing data used for
    # plotting the start/end of the matched region offsets in the prediction
    # Param
    # +slope+: slope of the linear regression line
    # +y_intercept+: the ecuation of the line is y= slope*x + y_intercept
    # +output+: location where the plot will be saved in jped file format
    # +hits+: array of Sequence objects
    def plot_2d_start_from(slope = nil, y_intercept = nil,
                           output = "#{filename}_match_2d.json", hits = @hits)
      pairs = hits.map do |hit|
        Pair.new(hit.hsp_list.map(&:match_query_from).min,
                 hit.hsp_list.map(&:match_query_to).max)
      end

      xx = pairs.map(&:x)
      yy = pairs.map(&:y)

      freq_x = xx.inject(Hash.new(0)) { |h, v| h[v] += 1; h }
      filename_x = "#{filename}_merge_x.json"
      f = File.open(filename_x, 'w')
      f.write([freq_x.collect { |k,v|
          { 'key' => k, 'value' => v, 'main' => (1==2) }
        }].to_json)
      f.close
      plot3 = Plot.new(filename_x.scan(%r{([^/]+)$})[0][0],
                       :simplebars,
                       '[Gene Merge] X projection',
                       '',
                       'x projection',
                       'number of sequences')
#       @validation_report.plot_files.push(plot3)

      freq_y = yy.inject(Hash.new(0)) { |h,v| h[v] += 1; h }
      filename_y = "#{filename}_merge_y.json"
      f = File.open(filename_y, 'w')
      f.write([freq_y.collect { |k, v|
          { 'key' => k, 'value' => v, 'main' => (1 == 2) }
        }].to_json)
      f.close
      plot4 = Plot.new(filename_y.scan(%r{([^/]+)$})[0][0],
                       :simplebars,
                       '[Gene Merge] Y projection',
                       '',
                       'y projection',
                       'number of sequences')
#       @validation_report.plot_files.push(plot4)

      f = File.open(output, 'w')
      f.write(hits.map { |hit| {'x' => hit.hsp_list.map(&:match_query_from).min,
                                'y' => hit.hsp_list.map(&:match_query_to).max,
                                'color' => 'red'}}.to_json)
      f.close

      Plot.new(output.scan(%r{([^/]+)$})[0][0],
               :scatter,
               'Gene Merge Validation: Start/end of matching hit coord. on query (1 point/hit)',
               '',
               'Start Offset (most left hsp)',
               'End Offset (most right hsp)',
               y_intercept,
               slope)
    end

    ##
    # Caclulates the slope of the regression line
    # give a set of 2d coordonates of the start/stop offests of the hits
    # Param
    # xx: +Array+ of integers
    # yy : +Array+ of integers
    # weights: +Array+ of integers
    # Output:
    # The ecuation of the regression line: [y slope]
    def slope(xx, yy, weights = nil)
      weights = Array.new(hits.length, 1) if weights.nil?

      # calculate the slope
      xx_weighted = xx.each_with_index.map { |x, i| x * weights[i] }
      yy_weighted = yy.each_with_index.map { |y, i| y * weights[i] }

      denominator = weights.reduce(0) { |a, e| a + e }

      x_mean = xx_weighted.reduce(0) { |a, e| a + e } / (denominator + 0.0)
      y_mean = yy_weighted.reduce(0) { |a, e| a + e } / (denominator + 0.0)

      numerator = (0...xx.length).reduce(0) do |a, e|
        a + (weights[e] * (xx[e] - x_mean) * (yy[e] - y_mean))
      end

      denominator = (0...xx.length).reduce(0) do |a, e|
        a + (weights[e] * ((xx[e] - x_mean)**2))
      end

      slope = numerator / (denominator + 0.0)
      y_intercept = y_mean - (slope * x_mean)

      [y_intercept, slope]
    end

    ##
    # Caclulates the slope of the regression line
    # give a set of 2d coordonates of the start/stop offests of the hits
    # Param
    # xx : +Array+ of integers
    # yy : +Array+ of integers
    # Output:
    # The ecuation of the regression line: [y slope]
    def slope_statsample(xx, yy)
      require 'statsample'
      sr = Statsample::Regression.simple(xx.to_scale, yy.to_scale)
      [sr.a, sr.b]
    end

    ##
    # xx and yy are the projections of the 2-d data on the two axes
    def unimodality_test(xx, yy)
      mean_x = xx.mean
      median_x = xx.median
      mode_x = xx.mode
      sd_x = xx.standard_deviation

      if sd_x == 0
        cond1_x = true
        cond2_x = true
        cond3_x = true
      else
        cond1_x = ((mean_x - median_x).abs / (sd_x + 0.0)) < Math.sqrt(0.6)
        cond2_x = ((mean_x - mode_x).abs / (sd_x + 0.0)) < Math.sqrt(0.3)
        cond3_x = ((median_x - mode_x).abs / (sd_x + 0.0)) < Math.sqrt(0.3)
      end

      mean_y = yy.mean
      median_y = yy.median
      mode_y = yy.mode
      sd_y = yy.standard_deviation

      if sd_y == 0
        cond1_y = true
        cond2_y = true
        cond3_y = true
      else
        cond1_y = ((mean_y - median_y).abs / (sd_y + 0.0)) < Math.sqrt(0.6)
        cond2_y = ((mean_y - mode_y).abs / (sd_y + 0.0)) < Math.sqrt(0.3)
        cond3_y = ((median_y - mode_y).abs / (sd_y + 0.0)) < Math.sqrt(0.3)
      end

      if cond1_x && cond2_x && cond3_x && cond1_y && cond2_y && cond3_y
        true
      else
        false
      end
    end
  end
end