#require 'ms/parser'
#require 'ms/parser/mzxml'
require 'ms/msrun'
require 'spec_id/proph'
require 'vec'

require 'pp'

class Align

  # Returns an array of peptides where each peptide passes threshold criteria
  # and each has been updated with scans, and dta filenames dependent on
  # matching with the basename_noext of the mztimes files.
  # Each peptide is guaranteed unique by sequence+charge
  # mztimes is an array of mzXML or .timeIndex files
  # if a peptide has no scans in the given mztimes set, it is discarded
  def peps_with_scans(mztimes, prot_xml, pep_proph_xml, prot_prob=1.0, pep_init_prob=1.0, pep_nsp_prob=1.0)

    ## Create scan indices on msrun name
    if mztimes.class != Array ; mztimes = [mztimes] end
    msrun_indices = mztimes.collect do |file| MS::MSRunIndex.new(file) end
    scanindex_by_basename_noext = {}
    msrun_indices.each do |runindex|
      scanindex_by_basename_noext[runindex.basename_noext] = runindex.scans_by_num
    end

    dta_filenames = Proph::Pep::Parser.new.dta_filenames_by_seq_charge(pep_proph_xml, "regex")

    parser = Proph::Prot::Parser.new
    parser.get_prots_and_peps(prot_xml, prot_prob, pep_init_prob, pep_nsp_prob, "regex")
    peptides = parser.peps
    peptides = Proph::Pep.uniq_by_seqcharge(peptides)
    ## we update each peptide with a list of dtafilenames
    ## then we update with a parallel list of scans (one for each dtafn...
    ## unless there are multiple scans associated with each filename
    ## in which case it will be an array
    _update_filenames(peptides, dta_filenames)
    peptides = _update_and_filter_by_scans(peptides, scanindex_by_basename_noext)
    return peptides
  end

  # takes the list of filenames for each peptide, and adds a scan
  # indexed from by scanindex
  # If keys are not in scanindex_by_basename_noext, then the scan is not
  # in the peptide!
  # if a peptide has no scans, it is not returned
  # if a filename is not recognized, it is dropped from the list
  def _update_and_filter_by_scans(peptides, scanindex_by_basename_noext)
    newpeps = []
    peptides.each do |pep|
      newfilenames = []
      pep.filenames.each do |dtafilename|
        (dtabase,first,last,charge) = dtafilename.split('.')
        if scanindex_by_basename_noext.key?(dtabase)
          newfilenames << dtafilename
          if first == last
            pep.scans << scanindex_by_basename_noext[dtabase][first.to_i]
          else
            scans = (first.to_i...last.to_i).collect do |index|
              scanindex_by_basename_noext[dtabase].scans_by_num[index]
            end
            pep.scans << scans
          end
        else
        end
      end
      pep.filenames = newfilenames
      if pep.scans.size > 0
        newpeps << pep
      end
    end
    newpeps
  end

  # takes an array of peptide arrays
  # will find the overlapping set
  # returns an array of peptide arrays
  # assumes that each pep_group is unique on sequence+charge
  def overlapping_peps_by_seqcharge(pep_groups)
    ## CREATE overlapping set:
    hashes = pep_groups.collect do |group|
      group.hash_uniq_by(:sequence, :charge)
    end
    pep_keys = hashes.collect do |hash|
      hash.collect do |k,v| k end
    end
    olapping_keys = pep_keys.inject do |olap,obj|
      olap & obj
    end
    pep_arrays = hashes.collect do |hash|
      pep_array = olapping_keys.collect do |k|
        hash[k]
      end
    end
  end

  # tosses out any peptides from pep_groups where the
  # arithmetic_avg_scan_by_parent_time.time is greater than 'deviations' from
  # the least squares regression line assumes that each peptide is parallel
  # (performed iteratively)
  def toss_outliers(pep_groups, deviations=0.0)
    arr_of_vecs = pep_groups.collect do |peps|
      time_arr = peps.collect do |pep| 
        pep.arithmetic_avg_scan_by_parent_time.time
      end
      VecD.new(time_arr)
    end

    # in the future this could be expanded for multiple dimensions
    indices = arr_of_vecs.first.outliers_iteratively(deviations, arr_of_vecs[1])

    # remove the peptides that are outliers
    #pep_groups.each do |peps| puts peps.size.to_s end
    pep_groups.each do |peps|
      indices.each do |i| peps.delete_at(i) end
    end
    #pep_groups.each do |peps| puts peps.size.to_s end
    pep_groups
  end

  # max_dups will toss out any peptides having > max_dups dtafilenames
  # Currently, this will only take 2 groups of peptides
  def overlapping_peps_by_seqcharge_with_filter(pep_groups, max_dups=nil, outlier_cutoff=0.0)
    pep_groups.collect! do |pep_group|
      pep_group.first.class.filter_by_max_dup_scans(max_dups, pep_group)
    end
    pep_groups = overlapping_peps_by_seqcharge(pep_groups)
    toss_outliers(pep_groups, outlier_cutoff)
    pep_groups
  end

  def _update_filenames(peptides, dta_filenames_by_seq_charge)
    peptides.each do |pep|
      pep.filenames = dta_filenames_by_seq_charge[[pep.sequence, pep.charge]]
    end
  end

  # Returns a hash[dtabase] -> [pep, ...]
  # Proteins must have peptides
  def _peps_by_dtabase(peptides)
    ## organize peptides based on filenames
    peptides_by_dtabase = Hash.new{|h,k| h[k] = [] }
    peptides.each do |k,pep|
      pep.filenames.each do |fn|
        file = fn.split(".")[0]
        peptides_by_file[file] << pep
      end
    end
    peptides_by_dtabase
  end


end