#! /usr/bin/env ruby

ROOT_PATH = File.dirname(__FILE__)
$: << File.expand_path(File.join(ROOT_PATH, '..', 'lib', 'pets'))

require 'benchmark'
require 'parallel'
require 'optparse'
require 'report_html'
require 'pets'

##########################
#OPT-PARSER
##########################

options = {}
OptionParser.new do |opts|
  opts.banner = "Usage: #{__FILE__} [options]"

  options[:coverage_analysis] = true
  opts.on("-a", "--coverage_analysis", "Deactivate genome coverage analysis. Default true") do 
    options[:coverage_analysis] = false
  end

  options[:bin_size] = 50000
  opts.on("-b", "--bin_size INTEGER", "Maximum number of bins to plot the coverage") do |data|
    options[:bin_size] = data.to_i
  end

  options[:clusters2show_detailed_phen_data] = 3
  opts.on("-C", "--clusters2show INTEGER", "How many patient clusters are show in detailed phenotype cluster data section. Default 3") do |data|
    options[:clusters2show_detailed_phen_data] = data.to_i
  end

  options[:chromosome_col] = nil
  opts.on("-c", "--chromosome_col INTEGER/STRING", "Column name if header is true, otherwise 0-based position of the column with the chromosome") do |data|
    options[:chromosome_col] = data
  end

  options[:id_col] = nil
  opts.on("-d", "--pat_id_col INTEGER/STRING", "Column name if header is true, otherwise 0-based position of the column with the patient id") do |data|
    options[:id_col] = data
  end

  options[:excluded_hpo] = nil
  opts.on("-E", "--excluded_hpo PATH", "List of HPO phenotypes to exclude (low informative)") do |excluded_hpo|
    options[:excluded_hpo] = excluded_hpo
  end

  options[:end_col] = nil
  opts.on("-e", "--end_col INTEGER/STRING", "Column name if header is true, otherwise 0-based position of the column with the end mutation coordinate") do |data|
    options[:end_col] = data
  end

  options[:minClusterProportion] = 0.01
  opts.on("-M", "--minClusterProportion FLOAT", "Minimum percentage of patients per cluster") do |data|
    options[:minClusterProportion] = data.to_f
  end

  options[:patients_filter] = 2
  opts.on("-f", "--patients_filter INTEGER", "Minimum number of patients sharing SORs. Default 0") do |data|
    options[:patients_filter] = data.to_i
  end

  options[:clusters2graph] = 30
  opts.on("-g", "--clusters2graph INTEGER", "How may patient clusters are plotted in cluster plots. Default 30") do |data|
    options[:clusters2graph] = data.to_i
  end

  options[:genome_assembly] = 'hg38'
  opts.on("-G", "--genome_assembly STRING", "Genome assembly version. Please choose between hg18, hg19 and hg38. Default hg38") do |data|
    options[:genome_assembly] = data
  end

  options[:header] = true
  #chr\tstart\tstop
  opts.on("-H", "--header", "Set if the file has a line header. Default true") do 
    options[:header] = false
  end

  options[:input_file] = nil
  opts.on("-i", "--input_file PATH", "Input file with patient data") do |data|
    options[:input_file] = data
  end

  options[:clustering_methods] = ['resnik', 'jiang_conrath', 'lin']
  opts.on("-m", "--clustering_methods ARRAY", "Clustering methods") do |data|
    options[:clustering_methods] = data.split(',')
  end

  options[:names] = false
  opts.on("-n", "--hpo_names", "Define if the input HPO are human readable names. Default false") do
    options[:names] = true
  end

  options[:output_file] = nil
  opts.on("-o", "--output_file PATH", "Output file with patient data") do |data|
    options[:output_file] = data
  end

  options[:hpo_file] = nil
  opts.on("-P", "--hpo_file PATH", "Input HPO file for extracting HPO codes") do |value|
    options[:hpo_file] = value
  end

  options[:ont_col] = nil
  opts.on("-p", "--hpo_term_col INTEGER/STRING", "Column name if header true or 0-based position of the column with the HPO terms") do |data|
  	options[:ont_col] = data
  end

  options[:separator] = '|'
  opts.on("-S", "--hpo_separator STRING", "Set which character must be used to split the HPO profile. Default '|'") do |data|
  	options[:separator] = data
  end

  options[:start_col] = nil
  opts.on("-s", "--start_col INTEGER/STRING", "Column name if header is true, otherwise 0-based position of the column with the start mutation coordinate") do |data|
  	options[:start_col] = data
  end

  options[:root_node] = "HP:0000118"
  opts.on("-r", "--root_node", "Root node from which the ontology will be represented ") do |root_node|
    options[:root_node] = root_node
  end

  options[:ic_stats] = 'freq'
  opts.on("-t", "--ic_stats STRING", "'freq' to compute IC based en hpo frequency in the input cohort. 'freq_internal' to use precomputed internal IC stats. 'onto' to compute ic based on hpo ontology structure.. Default freq") do |data|
    options[:ic_stats] = data
  end

  options[:threads] = 1
  opts.on("-T", "--threads INTEGER", "Number of threads to be used in calculations. Default 1") do |data|
    options[:threads] = data.to_i
  end

  options[:reference_profiles] = nil
  opts.on("--reference_profiles PATH", "Path to file tabulated file with first column as id profile and second column with ontology terms separated by separator. ") do |opt|
    options[:reference_profiles] = opt
  end

  options[:sim_thr] = nil
  opts.on("--sim_thr FLOAT", "Keep pairs with similarity value >= FLOAT. ") do |opt|
    options[:sim_thr] = opt.to_f
  end

  opts.on_tail("-h", "--help", "Show this message") do
    puts opts
    exit
  end

end.parse!


##########################
#MAIN
##########################

if options[:genome_assembly] == 'hg19' || options[:genome_assembly] == 'hg37'
  CHR_SIZE = File.join(EXTERNAL_DATA, 'chromosome_sizes_hg19.txt')
elsif options[:genome_assembly] == 'hg38'
  CHR_SIZE = File.join(EXTERNAL_DATA, 'chromosome_sizes_hg38.txt')
elsif options[:genome_assembly] == 'hg18'
  CHR_SIZE = File.join(EXTERNAL_DATA, 'chromosome_sizes_hg18.txt')
else
  abort('Wrong human genome assembly. Please choose between hg19, hg18 or hg38.')
end

output_folder = File.dirname(options[:output_file])
detailed_profile_evaluation_file = File.join(output_folder, 'detailed_hpo_profile_evaluation.csv')
rejected_file = File.join(output_folder, 'rejected_records.txt')
temp_folder = File.join(output_folder, 'temp')
matrix_file = File.join(temp_folder, 'pat_hpo_matrix.npy')
hpo_ic_file = File.join(temp_folder, 'hpo_ic.txt')
hpo_profile_ic_file = File.join(temp_folder, 'hpo_ic_profile.txt')
hpo_frequency_file = File.join(temp_folder, 'hpo_cohort_frequency.txt')
parents_per_term_file = File.join(temp_folder, 'parents_per_term.txt')
clustered_patients_file = File.join(temp_folder, 'cluster_asignation')
cluster_ic_data_file = File.join(temp_folder, 'cluster_ic_data.txt')
cluster_chromosome_data_file = File.join(temp_folder, 'cluster_chromosome_data.txt')
coverage_to_plot_file = File.join(temp_folder, 'coverage_data.txt')
sor_coverage_to_plot_file = File.join(temp_folder, 'sor_coverage_data.txt')
ronto_file = File.join(temp_folder, 'hpo_freq_colour')


Dir.mkdir(temp_folder) if !File.exists?(temp_folder)

hpo_file = !ENV['hpo_file'].nil? ? ENV['hpo_file'] : HPO_FILE
Cohort.load_ontology(:hpo, hpo_file, options[:excluded_hpo])
Cohort.act_ont = :hpo

patient_data, rejected_hpos_L, rejected_patients_L = Cohort_Parser.load(options)
rejected_hpos_C, rejected_patients_C = patient_data.check
rejected_hpos = rejected_hpos_L | rejected_hpos_C
rejected_patients = rejected_patients_L + rejected_patients_C
File.open(rejected_file, 'w'){|f| f.puts (rejected_patients).join("\n")}

patient_data.link2ont(Cohort.act_ont) # TODO: check if method load should call to this and use the semtools checking methods (take care to only remove invalid terms)

profile_sizes, parental_hpos_per_profile = patient_data.get_profile_redundancy
patient_data.check(hard=true)
hpo_stats = patient_data.get_profiles_terms_frequency() # hpo NAME, freq
hpo_stats.each{ |stat| stat[1] = stat[1]*100}
File.open(hpo_frequency_file, 'w') do |f|
  patient_data.get_profiles_terms_frequency(translate: false).each do |hpo_code, freq| # hpo CODE, freq
    f.puts "#{hpo_code.to_s}\t#{freq}"
  end
end
suggested_childs, fraction_terms_specific_childs = patient_data.compute_term_list_and_childs()
ontology_levels, distribution_percentage = patient_data.get_profile_ontology_distribution_tables()
onto_ic, freq_ic, onto_ic_profile, freq_ic_profile = patient_data.get_ic_analysis()

if options[:ic_stats] == 'freq_internal'
  ic_file = !ENV['ic_file'].nil? ? ENV['ic_file'] : IC_FILE
  freq_ic = load_hpo_ci_values(ic_file)
  phenotype_ic = freq_ic
  freq_ic_profile = {}
  patient_data.each_profile do |pat_id, phenotypes|
    freq_ic_profile[pat_id] = get_profile_ic(phenotypes, phenotype_ic)
  end
elsif options[:ic_stats] == 'freq'
  phenotype_ic = freq_ic
elsif options[:ic_stats] == 'onto'
  phenotype_ic = onto_ic
end

clustered_patients = dummy_cluster_patients(patient_data.profiles, matrix_file, clustered_patients_file) 
all_ics, prof_lengths, clust_by_chr, top_clust_phen, multi_chr_pats = process_dummy_clustered_patients(options, clustered_patients, patient_data, phenotype_ic)

summary_stats = get_summary_stats(patient_data, rejected_patients, hpo_stats, fraction_terms_specific_childs, rejected_hpos)

all_cnvs_length = []
if !options[:chromosome_col].nil?
  summary_stats << ['Number of clusters with mutations accross > 1 chromosomes', multi_chr_pats]
  
  #----------------------------------
  # Prepare data to plot coverage
  #----------------------------------
  if options[:coverage_analysis]
    patient_data.index_vars
    all_cnvs_length = patient_data.get_vars_sizes(true)
    cnv_size_average = get_mean_size(all_cnvs_length)
    patients_by_cluster, sors = patient_data.generate_cluster_regions(:reg_overlap, 'A', 0)

    ###1. Process CNVs
    raw_coverage, n_cnv, nt, pats_per_region = calculate_coverage(sors)
    summary_stats << ['Average variant size', cnv_size_average.round(4)]
    summary_stats << ['Nucleotides affected by mutations', nt]
    summary_stats << ['Number of genome windows', n_cnv]
    summary_stats << ['Mean patients per genome window', pats_per_region.round(4)]
    coverage_to_plot = get_final_coverage(raw_coverage, options[:bin_size])

    ###2. Process SORs
    raw_sor_coverage, n_sor, nt, pats_per_region = calculate_coverage(sors, options[:patients_filter] - 1)
    summary_stats << ["Number of genome window shared by >= #{options[:patients_filter]} patients", n_sor]
    summary_stats << ["Number of patients with at least 1 SOR", patients_by_cluster.length]
    summary_stats << ['Nucleotides affected by mutations', nt]
    # summary_stats << ['Patient average per region', pats_per_region]
    sor_coverage_to_plot = get_final_coverage(raw_sor_coverage, options[:bin_size])

    all_sor_length = get_sor_length_distribution(raw_sor_coverage)  
  end
end

#----------------------------------
# Write files for report
#----------------------------------
write_detailed_hpo_profile_evaluation(suggested_childs, detailed_profile_evaluation_file, summary_stats)
write_arrays4scatterplot(onto_ic.values, freq_ic.values, hpo_ic_file, 'OntoIC', 'FreqIC') # hP terms
write_arrays4scatterplot(onto_ic_profile.values, freq_ic_profile.values, hpo_profile_ic_file, 'OntoIC', 'FreqIC') #HP profiles
write_arrays4scatterplot(profile_sizes, parental_hpos_per_profile, parents_per_term_file, 'ProfileSize', 'ParentTerms')
write_cluster_ic_data(all_ics, prof_lengths, cluster_ic_data_file, options[:clusters2graph])

system_call(EXTERNAL_CODE, 'plot_scatterplot_simple.R', "-i #{hpo_ic_file} -o #{File.join(temp_folder, 'hpo_ics.pdf')} -x 'OntoIC' -y 'FreqIC' --x_tag 'HP Ontology IC' --y_tag 'HP Frequency based IC' --x_lim '0,4.5' --y_lim '0,4.5'") if !File.exists?(File.join(temp_folder, 'hpo_ics.pdf'))
system_call(EXTERNAL_CODE, 'plot_scatterplot_simple.R', "-i #{hpo_profile_ic_file} -o #{File.join(temp_folder, 'hpo_profile_ics.pdf')} -x 'OntoIC' -y 'FreqIC' --x_tag 'HP Ontology Profile IC' --y_tag 'HP Frequency based Profile IC' --x_lim '0,4.5' --y_lim '0,4.5'") if !File.exists?(File.join(temp_folder, 'hpo_profile_ics.pdf'))
system_call(EXTERNAL_CODE, 'plot_scatterplot_simple.R', "-i #{parents_per_term_file} -o #{File.join(temp_folder, 'parents_per_term.pdf')} -x 'ProfileSize' -y 'ParentTerms' --x_tag 'Patient HPO profile size' --y_tag 'Parent HPO terms within the profile'")
system_call(EXTERNAL_CODE, 'ronto_plotter.R', "-i #{hpo_frequency_file} -o #{ronto_file} --root_node #{options[:root_node]} -O #{hpo_file.gsub('.json','.obo')}") if !File.exist?(ronto_file + '.png') ###Cohort frequency calculation
system_call(EXTERNAL_CODE, 'plot_boxplot.R', "#{cluster_ic_data_file} #{temp_folder} cluster_id ic 'Cluster size/id' 'Information coefficient' 'Plen' 'Profile size'")

if !options[:chromosome_col].nil?
  write_cluster_chromosome_data(clust_by_chr, cluster_chromosome_data_file, options[:clusters2graph])
  system_call(EXTERNAL_CODE, 'plot_scatterplot.R', "#{cluster_chromosome_data_file} #{temp_folder} cluster_id chr count 'Cluster size/id' 'Chromosome' 'Patients'")
  if options[:coverage_analysis]
    ###1. Process CNVs
    write_coverage_data(coverage_to_plot, coverage_to_plot_file)
    system_call(EXTERNAL_CODE, 'plot_area.R', "-d #{coverage_to_plot_file} -o #{temp_folder}/coverage_plot -x V2 -y V3 -f V1 -H -m #{CHR_SIZE} -t CNV")
    ###2. Process SORs
    write_coverage_data(sor_coverage_to_plot, sor_coverage_to_plot_file)
    system_call(EXTERNAL_CODE, 'plot_area.R', "-d #{sor_coverage_to_plot_file} -o #{temp_folder}/sor_coverage_plot -x V2 -y V3 -f V1 -H -m #{CHR_SIZE} -t SOR")
  end
end

#----------------------------------
# CLUSTER COHORT ANALYZER REPORT
#----------------------------------
get_semantic_similarity_clustering(options, patient_data, temp_folder)

#----------------------------------
# GENERAL COHORT ANALYZER REPORT
#----------------------------------
new_cluster_phenotypes = get_top_dummy_clusters_stats(top_clust_phen)

container = {
  :temp_folder => temp_folder,
  # :top_clust_phen => top_clust_phen.length,
  :summary_stats => summary_stats,
  :clustering_methods => options[:clustering_methods],
  :hpo_stats => hpo_stats,
  :all_cnvs_length => all_cnvs_length,
  :all_sor_length => all_sor_length,
  :new_cluster_phenotypes => new_cluster_phenotypes.keys.length,
  :ontology_levels => ontology_levels,
  :distribution_percentage => distribution_percentage
}

clust_info = []
new_cluster_phenotypes.each do |clusterID, info|
    phens = info[1].join(', ')
    freqs = info[2].map{|a| a.round(4)}.join(', ')
    clust_info << [info[0], phens, freqs]
    container["clust_#{clusterID}"] = clust_info
    clust_info = []
end
template = File.open(File.join(REPORT_FOLDER, 'cohort_report.erb')).read
report = Report_html.new(container, 'Cohort quality report')
report.build(template)
report.write(options[:output_file]+'.html')