require 'ostruct'
require 'set'
require 'hash_by'
require 'spec_id/precision'
require 'roc'
require 'sample_enzyme'  # for others
require 'spec_id/bioworks'
require 'spec_id/sequest'
require 'spec_id/proph'
require 'spec_id_xml'

class SampleEnzyme ; end

class Mass
  # http://expasy.org/tools/findmod/findmod_masses.html
  # still need to add the modifications
  MONO = {
    :A => 71.03711,
    :R => 156.10111,
    :N => 114.04293,
    :D => 115.02694,
    :C => 103.00919,
    :E => 129.04259,
    :Q => 128.05858,
    :G => 57.02146,
    :H => 137.05891,
    :I => 113.08406,
    :L => 113.08406,
    :K => 128.09496,
    :M => 131.04049,
    :F => 147.06841,
    :P => 97.05276,
    :S => 87.03203,
    :T => 101.04768,
    :W => 186.07931,
    :Y => 163.06333,
    :V => 99.06841,

    :h => 1.00783,
    :h_plus => 1.00728,
    :o => 15.9949146,
    :h2o => 18.01056,

  }
  AVG = {
    :A => 71.0788,
    :R => 156.1875,
    :N => 114.1038,
    :D => 115.0886,
    :C => 103.1388,
    :E => 129.1155,
    :Q => 128.1307,
    :G => 57.0519,
    :H => 137.1411,
    :I => 113.1594,
    :L => 113.1594,
    :K => 128.1741,
    :M => 131.1926,
    :F => 147.1766,
    :P => 97.1167,
    :S => 87.0782,
    :T => 101.1051,
    :W => 186.2132,
    :Y => 163.1760,
    :V => 99.1326,

    :h => 1.00794,
    :h_plus => 1.00739,
    :o => 15.9994,
    :h2o => 18.01524,
  }
end

module SpecID ; end

class GenericSpecID ; include SpecID ; end

module SpecID
  MONO = Mass::MONO
  AVG = Mass::AVG
   
  attr_accessor :peps, :prots
  # True if a high protein/peptide score is better than low, false otherwise
  # This is set automatically for known file types
  attr_accessor :hi_prob_best

  # A relative pathname of the file the specid object is derived from
  attr_accessor :filename

  # tp = file_type
  # Will return a SpecID object (really, the object corresponding to the
  # file type which mixes in SpecID [is_a?(SpecID) == true])
  # If no file is given, will return a GenericSpecID object.
  def self.new(file=nil, tp=nil)
    if file 
      from_file(file, tp)
    else
      GenericSpecID.new      
    end
  end

  # tp = file_type
  # only takes an array if they are srf files!
  def self.from_file(file, tp=nil)
    obj = nil
    unless tp
      tp = file_type(file)
    end
    obj = case tp
    when 'srg'
      @hi_prob_best = false
      SRFGroup.new(file)
    when 'bioworks'
      @hi_prob_best = false
      Bioworks.new(file)
    when 'protproph'
      @hi_prob_best = true
      Proph::ProtSummary.new(file)
    else
      abort "UNRECOGNIZED file type for #{file}"
    end
    obj
  end

  def inspect
    "<#{self.class} #peps=\"#{peps.size}\">"
  end

  # takes a comma separated list  or array and extends the last to create an
  # array of desired size
  def self.extend_args(arg, desired_size)
    arg_arr = arg
    if arg.is_a? String
      arg_arr = arg.split(',')
    end
    new_arr = []
    last_arg = arg_arr[0]
    desired_size.times do |i|
      if arg_arr[i]
        new_arr[i] = arg_arr[i]
        last_arg = new_arr[i]
      else
        new_arr[i] = last_arg
      end
    end
    new_arr
  end

  # takes an array of proteins, each having peps
  # peptide grouping is done
  # by-
  # the protein with the most unique peptides ends up taking any
  # degenerate peptides, tie goes to one with most hits total, then the one
  # that had the top xcorr(s) (before removing any peptides).All other
  # proteins with identical peptides will lose those peptides.  So, the rich
  # stay rich, and the poor get poorer.
  # returns an array of triplets where each is [prot, pep_hits,
  # uniq_aaseqs] (uniq_aaseqs is an array) where the protein contains >= 1
  # peptide.  The internal links (prot.peps and pep.prots) is NOT modified!!
  # update_prots == true will set each protein with the peptides found
  def self.occams_razor(array_of_prots, update_prots=false)
    peps_found = Set.new 

    to_sort = array_of_prots.map do |prot|
      pps = prot.peps

      peps_by_uniq_aaseq = pps.hash_by(:aaseq)
      uniq_aaseqs = Set.new( pps.map {|pep| pep.aaseq } )
      xcorrs = pps.map {|pep| pep.xcorr }

      silly = OpenStruct.new
      # 0                1         2            3     4            5
      [uniq_aaseqs.size, pps.size, xcorrs.sort, prot, uniq_aaseqs, peps_by_uniq_aaseq] 
    end
    prot_triplets = []
    to_sort.sort.reverse.each do |ar|
      prot = ar[3]
      ## overlapping set:
      common = peps_found & ar[4]
      ## find the uniq ones in our little set of peptides:
      uniq = ar[4] - common
      pep_hits = []
      if uniq.size != 0
        ## add to the found list:
        peps_found.merge(uniq)
        uniq.each do |seq|
          pep_hits.push( *(ar[5][seq]) ) 
        end
        prot_triplets << [prot, pep_hits, uniq.to_a]
        prot.peps = pep_hits if update_prots 
      end
    end
    prot_triplets
  end


  ## basically, this is the command line wrapper
  def self.precision(argv)
    Prec.new.run_cmd_line(argv)
  end


  # returns number of true positives (array) and the specified output (as
  # parallel array).  Requires the classification method and a sorted array of
  # tp values and an array fp values.
  # (This is simply a wrapper around ROC#by_tps method!)
  def by_tps(classification_method, tp, fp)
    ROC.new.by_tps(classification_method, tp, fp)
  end
  
  # from the unique set of peptide hits, create a separate peptide hit for
  # each protein reference where that peptide only references that protein
  # e.g. pep.prots = [(a single protein)]
  def pep_prots
    pps = []
    peps.each do |pep|
      pep.prots.map do |prt|
        pep.dup     
        pep.prots = [prt]
        pps << pep
      end
    end
    pps
  end

  def classify_by_regex(items, regex, fp_on_match=true)
    case items
    when :prots
      myproc = proc { |prt| 
        if prt.reference =~ regex ; !fp_on_match
        else ; fp_on_match end 
      }
      return classify(items, myproc)
    when :peps
      match = [] ; nomatch = []
      peps.each do |pep| 
        match_prots = [] ; nomatch_prots = []
        (hit, nohit) = pep.prots.partition do |prot|
          prot.reference =~ regex
        end
        if hit.size == 0
          nomatch << pep
        elsif nohit.size == 0
          match << pep
        else ## both have hits
          pep.prots = match_prots
          nomatch_pep = pep.dup
          nomatch_pep.prots = nomatch_prots
          match << pep
          nomatch << pep
        end
      end
      if fp_on_match
        return [nomatch , match]
      else
        return [match, nomatch]
      end
    else
      abort "don't recognize "
    end
  end

  # returns [tp, fp] based on the protein prefix for items where items =
  # (:prot|:peps)
  # this may result in a duplication of some peptides if they match both
  # normal and decoy proteins.  In this case, the protein arrays are split,
  # too, so that each points only to its breed of protein. 
  def classify_by_false_flag(items, flag, fp_on_match=true, prefix=false)
    if prefix
      regex = /^#{Regexp.escape(flag)}/ 
    else
      regex = /#{Regexp.escape(flag)}/ 
    end
    classify_by_regex(items, regex, fp_on_match)
  end

  # Returns (match, nomatch)
  # items = symbol (:prots, :peps)
  # Returns two arrays, those returning true from classify_item_by and those
  # returning false 
  def classify(items, classify_item_by)
    its = send(items)
    f = []; t = []
    its.each do |it|
      if classify_item_by.call(it)
        t << it
      else
        f << it
      end
    end
    [t,f]
  end

  # returns two arrays, true positives and false positives (determined by proc
  # classify_item_by) sorted by proc rank_item_by.  Items will be ranked from
  # lowest to highest based on the return value of rank_item_by. items is a
  # symbol (:prots or :peps)
  def rank_and_classify(items, rank_item_by, classify_item_by)
    its = send(items)
    #its.each do |it| puts it.probability.to_s ; puts it.reference end
    doublets = its.collect do |item|
      [ rank_item_by.call(item),
        classify_item_by.call(item) ]
    end
    roc = ROC.new
    tp, fp = roc.prep_list(doublets)
    return tp, fp
  end


  # returns a proc for getting all probabilities so that an ascending sort
  # will put the best scores first
  def probability_proc
    if hi_prob_best
      get_prob_proc = proc {|prt| prt.probability * -1 }
    else
      get_prob_proc = proc {|prt| prt.probability }
    end
    get_prob_proc
  end

  def separate_by_prefix(items, fp_prefix)
    its = send(items)

    if items == :prots
    elsif items == :peps
      abort "not implemented yet"
    else
      abort "no other items recognized yet"
    end
  end

  # sorts the probabilities and then
  # calcs predicted number hits and precision for protein probabilities
  # (summing probabilities)
  # one_minus_ppv = SUM(1-probX)/#prots = what is commonly and mistakenly
  # called false positive rate
  # SUM(1-probX)/#prots
  def num_hits_and_ppv_for_protein_prophet_probabilities
    current_sum_one_minus_prob = 0.0
    num_prots = []
    ppv = []
    prot_cnt = 0
    probs = prots.map {|v| v.probability}
    sorted = probs.sort.reverse
    sorted.each do |prob|
      prot_cnt += 1
      num_prots << prot_cnt
      current_sum_one_minus_prob += 1.0 - prob
      ppv << 1.0 - ( current_sum_one_minus_prob / prot_cnt )
      # current_fpr_ratio = current_sum_one_minus_prob / prot_cnt
    end
    [num_prots, ppv]
  end

  # convenience method for the common task of determining precision for
  # proteins (with decoy proteins found by false_flag)
  # returns (num_hits, precision)
  def num_hits_and_ppv_for_prob(false_flag, prefix=false)
    if prefix
      regex = /^#{Regexp.escape(false_flag)}/ 
    else
      regex = /#{Regexp.escape(false_flag)}/ 
    end
    prob_proc = probability_proc
    myproc = proc { |prt| 
      if prt.reference =~ regex ; false
      else ; true end 
    }

    real_hits, decoy_hits = rank_and_classify(:prots, prob_proc, myproc)
    
    (num_hits, num_tps, precision) = DecoyROC.new.pred_and_tps_and_ppv(real_hits, decoy_hits)
    [num_hits, precision]
  end

#  # takes the existing spec_id object and marshals it into "file.msh"
#  # a new file will always look for a file.msh to load
#  def marshal(force=false)
#    if !(File.exist? @marshal_file)| force
#      File.open(@marshal_file, 'w') {|out| Marshal.dump(@obj, out) }
#    end
#  end

  # Returns 'bioworks' if bioworks xml, 'protproph' if Protein prophet
  # 'srf' if SRF file, 'srg' if search results group file.
  def self.file_type(file)
    if file =~ /\.srg$/
      return 'srg'
    end
    if IO.read(file, 7,438) == 'Enzyme:'
      return 'srf'
    end
    File.open(file) do |fh|
      lines = ""
      4.times { lines << fh.readline }
      if lines =~ /<bioworksinfo>/
        return 'bioworks'
      elsif lines =~ /<protein_summary/ && lines =~ /xmlns="http:\/\/regis-web.systemsbiology.net\/protXML"/
        return 'protproph'
      end
    end
  end


  ##############################################
  # These are pretty specific to Smriti's needs:

  # Given a hash of peptide arrays by some attribute key
  # Return two sorted arrays of sorted probabilities
  # The first of the min and second of the best 10 of each peptide array
  def min_and_best10(hash)
    ## choose the min probability and sort by prob
    min_peptides = hash.collect do |k,v|
      v.min {|a,b| a.peptide_probability <=> b.peptide_probability }
    end
    #puts min_peptides[0] # -> Bioworks::Pep
    min_sorted_peps = sorted_probabilities(min_peptides)
    #puts min_sorted_peps[0] # -> probability (Float)

    peptides_by_tens = [] 
    hash.each do |k,v|
      arr = v.sort_by {|pep| pep.peptide_probability }.slice(0,10)
      peptides_by_tens.push(*arr)
    end

    top_10_sorted_peps = sorted_probabilities(peptides_by_tens)
    #puts top_10_sorted_peps[0] # -> float
    #puts "size: top_10_sorted_peps.size : #{top_10_sorted_peps.size}"
    #puts "size: min_sorted_peps.size : #{min_sorted_peps.size}"
    #p top_10_sorted_peps
    #p min_sorted_peps 
    return min_sorted_peps, top_10_sorted_peps
  end

  # Returns a list of sorted probabilities given the array of peptides
  def sorted_probabilities(peptides)
    #puts peptides.first.peptide_probability.class
    #peptides.each do |pep| print pep.class.to_s + " " end
    #puts peptides.first.is_a? Array
    #abort "DFHDFD"
    peptides.collect{|pep| pep.probability }.sort
  end

  # returns a sorted lists of probabilities based on all pepprots (a peptide
  # associated with a protein)
  def pep_probs_by_pep_prots
    sorted_probabilities(peps)
  end

  ##########################################################################
  # WARNING! These might be dangerous to your health if there are multiple
  # files collected in your bioworks file
  ##########################################################################
  
  # (prob_list_by_min, prob_list_by_best10)
  # returns 2 sorted lists of probabilities based on:
  #   1. best peptide hit
  #   2. top 10 peptide hits
  # on a per scan basis
  # NOTE: you may want to hash on base_name first!
  def pep_probs_by_scan
    hash = peps.hash_by(:first_scan, :last_scan)
    return min_and_best10(hash)
  end


  #(prob_list_by_min, prob_list_by_best10)
  # same as pep_probs_by_scan but per charge state
  # NOTE: you may want to hash on base_name first!
  def pep_probs_by_scan_charge
    hash = peps.hash_by(:first_scan, :last_scan, :charge)
    return min_and_best10(hash)
  end

  # (prob_list_by_min)
  # hashes on seq-charge and returns the sorted list of probabilities of top
  # hit per seq-charge
  # NOTE: you may want to hash on base_name first!
  def pep_probs_by_seq_charge
    hash = peps.hash_by(:sequence, :charge)
    min_peptides = hash.collect do |k,v|
      v.min {|a,b| a.peptide_probability <=> b.peptide_probability }
    end
    sorted_probabilities(min_peptides)
  end

  ##########################################################################
  # USE these if you have multiple files in your bioworks.xml file 
  ##########################################################################
  # (prob_list_by_min, prob_list_by_best10)
  # returns 2 sorted lists of probabilities based on:
  #   1. best peptide hit
  #   2. top 10 peptide hits
  # on a per scan basis
  # NOTE: you may want to hash on base_name first!
  def pep_probs_by_bn_scan
    hash = peps.hash_by(:base_name, :first_scan, :last_scan)
    return min_and_best10(hash)
  end


  #(prob_list_by_min, prob_list_by_best10)
  # same as pep_probs_by_scan but per charge state
  # NOTE: you may want to hash on base_name first!
  def pep_probs_by_bn_scan_charge
    hash = peps.hash_by(:base_name, :first_scan, :last_scan, :charge)
    return min_and_best10(hash)
  end

  # (prob_list_by_min)
  # hashes on seq-charge and returns the sorted list of probabilities of top
  # hit per seq-charge
  # NOTE: you may want to hash on base_name first!
  def pep_probs_by_bn_seq_charge
    hash = peps.hash_by(:base_name, :sequence, :charge)
    min_peptides = hash.collect do |k,v|
      v.min {|a,b| a.peptide_probability <=> b.peptide_probability }
    end
    sorted_probabilities(min_peptides)
  end
end


# A Generic spectraID protein
module SpecID::Prot
  # probability is always a float!
  attr_accessor :probability, :reference, :peps

  def <=> (other)
    self.reference <=> other.reference
  end

end

module SpecID::Pep

   Non_standard_amino_acid_char_re = /[^A-Z\.\-]/

  attr_accessor :prots
  attr_accessor :probability
  # full sequence: (<firstAA>.<sequence>.<last>) with '-' for no first
  # or last.
  attr_accessor :sequence

  # the basic amino acid sequence (no leading or trailing '.' or amino acids)
  # should not contain any special symbols, etc.
  attr_accessor :aaseq
  attr_accessor :charge

  # removes nonstandard chars with Non_standard_amino_acid_char_re
  # preserves A-Z and '.' and '-'
  def self.remove_non_amino_acids(sequence)
    sequence.gsub(Non_standard_amino_acid_char_re, '')
  end

  # remove_non_amino_acids && split_sequence
  def self.prepare_sequence(val)
    nv = remove_non_amino_acids(val)
    split_sequence(nv)
  end

  def <=>(other)
    aaseq <=> other.aaseq
  end

  # Returns prev, peptide, next from sequence.  Parse errors return
  # nil,nil,nil
  #   R.PEPTIDE.A  # -> R, PEPTIDE, A
  #   R.PEPTIDE.-  # -> R, PEPTIDE, -
  #   PEPTIDE.A    # -> -, PEPTIDE, A
  #   A.PEPTIDE    # -> A, PEPTIDE, -
  #   PEPTIDE      # -> nil,nil,nil
  def self.split_sequence(val)
    peptide_prev_aa = ""; peptide = ""; peptide_next_aa = ""
    pieces = val.split('.') 
    case pieces.size
    when 3
      peptide_prev_aa, peptide, peptide_next_aa = *pieces
    when 2
      if pieces[0].size > 1  ## N termini
        peptide_prev_aa, peptide, peptide_next_aa = '-', pieces[0], pieces[1]
      else  ## C termini
        peptide_prev_aa, peptide, peptide_next_aa = pieces[0], pieces[1], '-'
      end
    when 1  ## this must be a parse error!
      peptide_prev_aa, peptide, peptide_next_aa = nil,nil,nil
    when 0
      peptide_prev_aa, peptide, peptide_next_aa = nil,nil,nil
    end
    return peptide_prev_aa, peptide, peptide_next_aa
  end

  ## 
  def self.sequence_to_aaseq(sequence)
    after_removed = remove_non_amino_acids(sequence)
    pieces = after_removed.split('.') 
    case pieces.size
    when 3
      pieces[1]
    when 2
      if pieces[0].size > 1  ## N termini
        pieces[0]
      else  ## C termini
        pieces[1]
      end
    when 1  ## this must be a parse error!
      pieces[0] ## which is the peptide itself  
    else
      abort "bad peptide sequence: #{sequence}"
    end
  end

  # This will rapidly determine the list of proteins for which given
  # peptides belong.  It is meant to be low level and fast (eventually),
  # so it asks for the data in a format amenable to this.
  # returns a mirror array where each entry is an array of Fasta::Prot
  # objects where each protein contains the sequence
  def self.protein_groups_by_sequence(peptide_strings_list, fasta_obj)
    prots = fasta_obj.prots
    prot_seqs = prots.map do |prot|
      prot.aaseq
    end

    groups = peptide_strings_list.map do |pep_seq|
      prot_index = 0
      protein_group = []
      prot_seqs.each do |prot_seq|
        if prot_seq.include? pep_seq
          protein_group << prots[prot_index]
        end
        prot_index += 1
      end
      protein_group
    end

    groups
  end

  # units can be :mmu, :amu, :ppm
  def mass_accuracy(pep, unit=:ppm, mono=true)
    # 10^6 * deltam accuracy/ m[measured]
    # i.e., theoretical mass 1000, measured 999.9: 100ppm
    # http://www.waters.com/WatersDivision/ContentD.asp?watersit=EGOO-66LRQD
    # pep.mass is the theoretical M+H of the peptide
    # this assumes that the deltacn value we're being told is correct, but I
    # have my suspicions (since the <mass> value is not accurate...)

    ######## TO COMPLETE (and add to spec_id..?)
    case unit
    when :ppm
    when :amu
    when :mmu
    end
  end
end

class SpecID::GenericProt
  include SpecID::Prot
end

class SpecID::GenericPep 
  include SpecID::Pep
end