require 'numo/narray'
require 'semtools'

HPOS = 0
CHR = 1
START = 2
STOP = 3

def load_hpo_ontology(hpo_file, excluded_hpo_file)
  hpo = nil
  if !hpo_file.include?('.json')
    if !excluded_hpo_file.nil?
      hpo = Ontology.new(file: hpo_file, load_file: true, removable_terms: read_excluded_hpo_file(excluded_hpo_file))
    else
      hpo = Ontology.new(file: hpo_file, load_file: true)
    end
  else
    hpo = Ontology.new
    hpo.read(hpo_file)
    if !excluded_hpo_file.nil?
      hpo.add_removable_terms(read_excluded_hpo_file(excluded_hpo_file))
      hpo.remove_removable()
      hpo.build_index()
    end
  end
  return hpo
end

def format_patient_data(patient_data, options, hpo)
  rejected_hpos = []
  rejected_patients = []
  patient_data.each do |pat_id, patient_record|
    hpos, chr, start, stop = patient_record

    if options[:hpo_names]
      hpos, pat_rejected_hpos = hpo.translate_names(hpos)
      if !pat_rejected_hpos.empty?
        STDERR.puts "WARNING: patient #{pat_id} has the unknown hpo NAMES '#{pat_rejected_hpos.join(',')}'. Rejected."
        rejected_hpos.concat(pat_rejected_hpos)
      end
    end

    hpos, pat_rejected_hpos = hpo.check_ids(hpos.map{|a| a.to_sym})
    if !pat_rejected_hpos.empty?
      STDERR.puts "WARNING: patient #{pat_id} has the unknown hpo CODES '#{pat_rejected_hpos.join(',')}'. Rejected."
      rejected_hpos.concat(pat_rejected_hpos)
    end
    if hpos.empty?
      rejected_patients << pat_id
    else
      patient_record[HPOS] = hpos
    end
  end
  return rejected_hpos.uniq, rejected_patients
end

def compute_hpo_list_and_childs(patient_data, hpo)
  all_hpo = []
  suggested_childs = {}
  total_terms = 0
  terms_with_more_specific_childs = 0
  patient_data.each do |pat_id, hpos|
    total_terms += hpos.length
    more_specific_childs = hpo.get_childs_table(hpos, true)
    terms_with_more_specific_childs += more_specific_childs.select{|hpo_record| !hpo_record.last.empty?}.length #Exclude phenotypes with no childs
    suggested_childs[pat_id] = more_specific_childs  
    all_hpo.concat(hpos)
  end
  return all_hpo.uniq, suggested_childs, terms_with_more_specific_childs.fdiv(total_terms)
end

def clean_patient_profiles(hpo, patient_profiles)
  rejected_patients = []
  patient_profiles.each do |pat, prof|
    phens = hpo.clean_profile_hard(prof)
    if phens.empty?
      rejected_patients << pat
    else
      patient_profiles[pat] = phens
    end
  end
  patient_profiles.select!{|pat_id, patient_record| !rejected_patients.include?(pat_id)}
  hpo.profiles = {}
  hpo.load_profiles(patient_profiles)

end

def generate_patient_hpo_matrix(patient_data, cohort_hpos)
  matrix = []
  n = cohort_hpos.length
  patient_data.each do |pat_id, pat_hpos|
    vector = Array.new(n, 0)
    pat_hpos.each do |hpo|
      vector[cohort_hpos.index(hpo)] = 1
    end
    matrix << vector
  end
  return matrix
end

def generate_patient_hpo_matrix_numo(patient_data, cohort_hpos)
  y_names = patient_data.keys
  x_names = cohort_hpos
  x_names_indx = {}
  cohort_hpos.each_with_index{|hp,x| x_names_indx[hp]=x}
  # row (y), cols (x)
  matrix = Numo::DFloat.zeros(patient_data.length, cohort_hpos.length)
  i = 0
  patient_data.each do |pat_id, pat_hpos|
    pat_hpos.each do |hp|
      matrix[i, x_names_indx[hp]] = 1
    end
    i += 1
  end
  return matrix, y_names, x_names
end

def write_matrix_for_R(matrix, x_names, y_names, file)
  File.open(file, 'w') do |f|
    f.puts x_names.join("\t")
    matrix.each_with_index do |row, i|
      f.puts [y_names[i]].concat(row).join("\t")
    end
  end
end

def process_clustered_patients(options, clustered_patients, patient_uniq_profiles, patient_data, equivalence, hpo, phenotype_ic, patient_id_type) # get ic and chromosomes
  all_ics = []
  all_lengths = []
  top_cluster_phenotypes = []
  cluster_data_by_chromosomes = []
  multi_chromosome_patients = 0
  processed_clusters = 0
  clustered_patients.sort_by{|cl_id, pat_ids| pat_ids.length }.reverse.each do |cluster_id, patient_ids|
    next if patient_ids.length == 1
    chrs = Hash.new(0)
    all_phens = []
    profile_ics = []
    profile_lengths = []
    processed_patients = []
    patient_ids.each do |pat_id|
      phenotypes = patient_uniq_profiles[pat_id]  
      #pat_id = pat_id.gsub(/_i\d+$/,'') if patient_id_type != 'generated'
      processed_patients << pat_id 
      profile_ics << get_profile_ic(phenotypes, phenotype_ic)
      profile_lengths << phenotypes.length
      if processed_clusters < options[:clusters2show_detailed_phen_data]
        phen_names, rejected_codes = hpo.translate_ids(phenotypes) #optional
        all_phens << phen_names
      end
      variants = equivalence[pat_id]
      variants.each do |variant|
        variant_data = patient_data[variant]
        chrs[variant_data[CHR]] += 1 if !options[:chromosome_col].nil? && variant_data[CHR] != '-'
      end
    end
    num_of_patients = processed_patients.length
    next if num_of_patients == 1 # Check that current cluster only has one patient with several mutations
    top_cluster_phenotypes << all_phens if processed_clusters < options[:clusters2show_detailed_phen_data]
    all_ics << profile_ics
    all_lengths << profile_lengths
    if !options[:chromosome_col].nil?
      multi_chromosome_patients += num_of_patients if chrs.length > 1
      chrs.each do |chr, count|
        cluster_data_by_chromosomes << [cluster_id, num_of_patients, chr, count]
      end
    end
    processed_clusters += 1
  end
  return all_ics, all_lengths, cluster_data_by_chromosomes, top_cluster_phenotypes, multi_chromosome_patients
end

def get_profile_ic(hpo_names, phenotype_ic)
  ic = 0
  profile_length = 0
  hpo_names.each do |hpo_id|
    hpo_ic = phenotype_ic[hpo_id]
    # STDERR.puts phenotype_ic.inspect
    ic += hpo_ic if !hpo_ic.nil?
    profile_length += 1
  end
  profile_length = 1 if profile_length == 0
  return ic.fdiv(profile_length)
end

def write_cluster_ic_data(all_ics, profile_lengths, cluster_ic_data_file, limit)
  File.open(cluster_ic_data_file, 'w') do |f|
    f.puts %w[cluster_id ic Plen].join("\t")
    all_ics.each_with_index do |cluster_ics, i|
      break if i == limit
      cluster_length = cluster_ics.length
      cluster_ics.each_with_index do |clust_ic, j|
        f.puts "#{cluster_length}_#{i}\t#{clust_ic}\t#{profile_lengths[i][j]}"
      end
    end
  end
end

def write_cluster_chromosome_data(cluster_data, cluster_chromosome_data_file, limit)
  File.open(cluster_chromosome_data_file, 'w') do |f|
    f.puts %w[cluster_id chr count].join("\t")
    index = 0
    last_id = cluster_data.first.first unless cluster_data.empty?
    cluster_data.each do |cluster_id, patient_number, chr, count|
      index += 1 if cluster_id != last_id 
      break if index == limit
      f.puts ["#{patient_number}_#{index}", chr, count].join("\t")
      last_id = cluster_id
    end
  end
end

def write_coverage_data(coverage_to_plot, coverage_to_plot_file)
  File.open(coverage_to_plot_file, 'w') do |f|
    coverage_to_plot.each do |chr, position, freq|
     f.puts "#{chr}\t#{position}\t#{freq}"
   end
  end
end

def get_uniq_hpo_profiles(patient_data) # To avoid duplications due to more one mutation in the same patient
  hpo_profiles = {}
  equivalence = {}
  patient_data.each do |variant_id, patient_rec|
    pat_id, count = variant_id.split('_i')
    hpo_profiles[pat_id] = patient_rec[HPOS]
    query =  equivalence[pat_id]
    if query.nil?
      equivalence[pat_id] = [variant_id]
    else
      query << variant_id
    end
  end
  return hpo_profiles, equivalence
end

def get_patient_ids(patient_data) # To aviod duplications due to more one mutation in the same patient
  ids = []
  patient_data.each do |pat_id, hpos|
    id, count = pat_id.split('_i')
    ids << id
  end
  return  ids.uniq
end

def get_summary_stats(patient_data, rejected_patients, cohort_hpos, hpo)
  stats = []
  stats << ['Unique HPO terms', cohort_hpos.length]
  stats << ['Cohort size', get_patient_ids(patient_data).length]
  stats << ['Rejected patients by empty profile', rejected_patients.length]
  # stats << ['HPOs per patient (average)', hpo.get_profile_mean_length]
  stats << ['HPOs per patient (average)', hpo.get_profiles_mean_size]
  stats << ['HPO terms per patient: percentile 90', hpo.get_profile_length_at_percentile(perc=90)]
  return stats
end

def cluster_patients(patient_data, cohort_hpos, matrix_file, clustered_patients_file)
  if !File.exists?(matrix_file)
    pat_hpo_matrix, pat_id, hp_id  = generate_patient_hpo_matrix_numo(patient_data, cohort_hpos)
    x_axis_file = matrix_file.gsub('.npy','_x.lst')
    File.open(x_axis_file, 'w'){|f| f.print hp_id.join("\n") }  
    y_axis_file = matrix_file.gsub('.npy','_y.lst')
    File.open(y_axis_file, 'w'){|f| f.print pat_id.join("\n") }
    Npy.save(matrix_file, pat_hpo_matrix)
  end
  system("#{File.join(EXTERNAL_CODE, 'get_clusters.R')} -d #{matrix_file} -o #{clustered_patients_file} -y #{matrix_file.gsub('.npy','')}") if !File.exists?(clustered_patients_file)
  clustered_patients = load_clustered_patients(clustered_patients_file)
  return(clustered_patients)
end

def get_profile_ontology_distribution_tables(hpo)
  ontology_levels, distribution_percentage = hpo.get_profile_ontology_distribution_tables
  ontology_levels.unshift(["level", "ontology", "cohort"])
  distribution_percentage.unshift(["level", "ontology", "weighted cohort", "uniq terms cohort"])
  return ontology_levels, distribution_percentage
end


def write_detailed_hpo_profile_evaluation(suggested_childs, detailed_profile_evaluation_file, summary_stats)
  CSV.open(detailed_profile_evaluation_file, "wb") do |csv|
    suggested_childs.each do |pat_id, suggestions|
      warning = nil
      warning = 'WARNING: Very few phenotypes' if suggestions.length < 4
      csv << ["PATIENT #{pat_id}", "#{warning}"]
      csv << ["CURRENT PHENOTYPES", "PUTATIVE MORE SPECIFIC PHENOTYPES"]
      suggestions.each do |parent, childs|
        parent_code, parent_name = parent
        if childs.empty?
          csv << ["#{parent_name} (#{parent_code})", '-']
        else
          parent_writed = false
          childs.each do |child_code, child_name|
            if !parent_writed
              parent_field = "#{parent_name} (#{parent_code})"
              parent_writed = true
            else
              parent_field = ""
            end
            csv << [parent_field, "#{child_name} (#{child_code})"]
          end
        end
      end
      csv << ["", ""]
    end
  end
end

def write_arrays4scatterplot(x_axis_value, y_axis_value, filename, x_axis_name, y_axis_name)
  File.open(filename, 'w') do |f|
    f.puts "#{x_axis_name}\t#{y_axis_name}"
    x_axis_value.each_with_index do |value,i|
      y_value = y_axis_value[i]
      raise("The #{i} position is not presented in y_axis_value") if y_value.nil?
      f.puts [value, y_value].join("\t")
    end
  end
end

def process_patient_data(patient_data)
	parsed_patient_data = {}
	patient_data.each do |patientID, metadata|
		phenotypes, chr, start, stop = metadata
    next if chr == '-'
		info = [patientID, start.to_i, stop.to_i]
		query = parsed_patient_data[chr]
		if query.nil?
			parsed_patient_data[chr] = [info]
		else
			query << info
		end
	end
	return parsed_patient_data
end

def get_final_coverage(raw_coverage, bin_size)
	coverage_to_plot = []
	raw_coverage.each do |chr, coverages|
		coverages.each do |start, stop, coverage|
			bin_start = start - start % bin_size
			bin_stop = stop - stop%bin_size
			while bin_start < bin_stop
				coverage_to_plot << [chr, bin_start, coverage]
				bin_start += bin_size
			end
		end
	end
	return coverage_to_plot
end

def get_sor_length_distribution(raw_coverage)
	all_cnvs_length = []
	cnvs_count = []
	raw_coverage.each do |chr, coords_info|
		coords_info.each do |start, stop, pat_records|
			region_length = stop - start + 1
			all_cnvs_length << [region_length, pat_records]
		end
	end
	all_cnvs_length.sort!{|c1, c2| c1[1] <=> c2[1]}
	return all_cnvs_length
end

def get_cnvs_length(patient_data)
	length_stats = Hash.new(0)
	patient_data.each do |pat_id, patient_record|
    	string_hpos, chr, start, stop = patient_record
    	length_stats[stop - start] += 1
    end
    return length_stats.to_a.sort!{|stat| stat[1] <=> stat[1] }
end


def calculate_coverage(regions_data, delete_thresold = 0)
	raw_coverage = {}
	n_regions = 0
	patients = 0
	nt = 0
	regions_data.each do |start, stop, chr, node|
		number_of_patients = node.split('.').last.to_i
		if number_of_patients <= delete_thresold
			number_of_patients = 0
		else
			n_regions += 1
			patients += number_of_patients
			nt += stop - start			
		end
		coords = [start, stop, number_of_patients]
		query = raw_coverage[chr]
		if query.nil?
			raw_coverage[chr] = [coords]
		else
			query << coords
		end
	end
	return raw_coverage, n_regions, nt, patients.fdiv(n_regions)
end

def get_profile_redundancy(hpo)
  #TODO: sort both arrays consequently
  #TODO: bear in mind join both arrays with zip and sort by one, to get an [a[a]]
  # profile_sizes = hpo.get_profile_sizes
  profile_sizes = hpo.get_profiles_sizes
  # parental_hpos_per_profile = hpo.compute_redundant_parental_terms_per_profile
  parental_hpos_per_profile = hpo.parentals_per_profile# clean_profiles
  parental_hpos_per_profile = parental_hpos_per_profile.map{|item| item[0]}
  profile_sizes, parental_hpos_per_profile = profile_sizes.zip(parental_hpos_per_profile).sort_by{|i| i.first}.reverse.transpose
  return profile_sizes, parental_hpos_per_profile
end

def format_profiles_similarity_data(profiles_similarity)
  matrix = []
  element_names = profiles_similarity.keys
  matrix << element_names
  profiles_similarity.each do |elementA, relations|
    row = [elementA]
    element_names.each do |elementB|
      if elementA == elementB
        row << 'NA'
      else
        query = relations[elementB]
        if !query.nil?
          row << query
        else
          row << profiles_similarity[elementB][elementA]
        end
      end
    end
    matrix << row
  end
  matrix[0].unshift('pat')
  return matrix
end

def format_profiles_similarity_data_pairs(profiles_similarity)
  pairs = []
  element_names = profiles_similarity.keys
  profiles_similarity.each do |elementA, relations|
    element_names.each do |elementB|
      if elementA != elementB
        pair = [elementA, elementB]
        query = relations[elementB]
        if !query.nil?
          pair << query
        else
          pair << profiles_similarity[elementB][elementA]
        end
        pairs << pair
      end
    end
  end
  return pairs
end

def format_profiles_similarity_data_numo(profiles_similarity)
  element_names = profiles_similarity.keys
  matrix = Numo::DFloat.zeros(element_names.length, element_names.length)
  i = 0
  profiles_similarity.each do |elementA, relations|
    element_names.each_with_index do |elementB, j|
      if elementA != elementB
        query = relations[elementB]
        if !query.nil?
          matrix[i, j] = query
        else
          matrix[i, j] = profiles_similarity[elementB][elementA]
        end
      end
    end
    i += 1
  end
  return matrix, element_names
end

def write_similarity_matrix(similarity_matrix, similarity_matrix_file)  
  File.open(similarity_matrix_file, 'w') do |f|
    similarity_matrix.each do |row|
      f.puts row.join("\t")
    end
  end
end

def write_profile_pairs(similarity_pairs, filename)
  File.open(filename, 'w') do |f|
    similarity_pairs.each do |pairsA, pairsB_and_values|
      pairsB_and_values.each do |pairsB, values|
        f.puts "#{pairsA}\t#{pairsB}\t#{values}"
      end
    end
  end
end

def parse_clusters_file(clusters_file, patient_profiles)
  clusters_info = {}
  clusters_table = []
  File.open(clusters_file).each do |line|
    line.chomp!
    patientID, clusterID = line.split("\t")
    patientHPOProfile = patient_profiles[patientID]
    query = clusters_info[clusterID]
    if query.nil? 
      clusters_info[clusterID] = {patientID => patientHPOProfile}
    else
      query[patientID] = patientHPOProfile
    end
  end
  clusters_info.each do |clusterID, patients_info|
    patients_per_cluster = patients_info.keys.length
    clusters_table << [clusterID, patients_per_cluster, patients_info.keys, patients_info.values]
  end
  return clusters_table, clusters_info
end

def get_patient_hpo_frequency(patient_uniq_profiles, hpo_frequency_file)
  hpo_frequency = Hash.new(0)
  patient_uniq_profiles.values.each do |hpos|
    hpos.each do |hpo|
      hpo_frequency[hpo] += 1
    end
  end
  File.open(hpo_frequency_file, 'w') do |f|
    hpo_frequency.each do |hpo_code, freq|
      f.puts "#{hpo_code.to_s}\t#{freq}"
    end
  end
end

def get_cluster_metadata(clusters_info, output_file)
  average_hp_per_pat_distribution = []
  tmp = []
  clusters_info.each do |cl_id, pat_info|
      hp_per_pat_in_clust = pat_info.values.map{|a| a.length}
      hp_per_pat_ave = hp_per_pat_in_clust.sum.fdiv(hp_per_pat_in_clust.length)
      average_hp_per_pat_distribution << [pat_info.length, hp_per_pat_ave]
      tmp << hp_per_pat_in_clust
  end
  total_clusters = clusters_info.length
  average_phenotypes_by_cluster = tmp.flatten.sum.fdiv(total_clusters)
  File.open(output_file, 'w') do |f|
    f.puts "#{'PatientsNumber'}\t#{'HPOAverage'}"
    average_hp_per_pat_distribution.each do |patient_num, ave|
      f.puts "#{patient_num}\t#{ave}"
    end
  end
end