#! /usr/bin/env ruby ROOT_PATH = File.dirname(__FILE__) REPORT_FOLDER = File.expand_path(File.join(ROOT_PATH, '..', 'templates')) EXTERNAL_DATA = File.expand_path(File.join(ROOT_PATH, '..', 'external_data')) EXTERNAL_CODE = File.expand_path(File.join(ROOT_PATH, '..', 'external_code')) HPO_FILE = File.join(EXTERNAL_DATA, 'hp.json') IC_FILE = File.join(EXTERNAL_DATA, 'uniq_hpo_with_CI.txt') $: << File.expand_path(File.join(ROOT_PATH, '..', 'lib', 'pets')) require 'benchmark' require 'parallel' require 'optparse' require 'csv' require 'npy' require 'generalMethods.rb' require 'coPatReporterMethods.rb' require 'report_html' require 'semtools' #Expand class (semtools modifications if necessary): class Ontology end ########################## # FUNCTIONS ########################## def translate_codes(clusters, hpo) translated_clusters = [] clusters.each do |clusterID, num_of_pats, patientIDs_ary, patient_hpos_ary| translate_codes = patient_hpos_ary.map{|patient_hpos| patient_hpos.map{|hpo_code| hpo.translate_id(hpo_code)}} translated_clusters << [clusterID, num_of_pats, patientIDs_ary, patient_hpos_ary, translate_codes ] end return translated_clusters end ########################## #OPT-PARSER ########################## options = {} OptionParser.new do |opts| opts.banner = "Usage: #{__FILE__} [options]" options[:coverage_analysis] = true opts.on("-a", "--coverage_analysis", "Deactivate genome coverage analysis. Default true") do options[:coverage_analysis] = false end options[:bin_size] = 50000 opts.on("-b", "--bin_size INTEGER", "Maximum number of bins to plot the coverage") do |data| options[:bin_size] = data.to_i end options[:clusters2show_detailed_phen_data] = 3 opts.on("-C", "--clusters2show INTEGER", "How many patient clusters are show in detailed phenotype cluster data section. Default 3") do |data| options[:clusters2show_detailed_phen_data] = data.to_i end options[:chromosome_col] = nil opts.on("-c", "--chromosome_col INTEGER/STRING", "Column name if header is true, otherwise 0-based position of the column with the chromosome") do |data| options[:chromosome_col] = data end options[:pat_id_col] = nil opts.on("-d", "--pat_id_col INTEGER/STRING", "Column name if header is true, otherwise 0-based position of the column with the patient id") do |data| options[:pat_id_col] = data end options[:excluded_hpo] = nil opts.on("-E", "--excluded_hpo PATH", "List of HPO phenotypes to exclude (low informative)") do |excluded_hpo| options[:excluded_hpo] = excluded_hpo end options[:end_col] = nil opts.on("-e", "--end_col INTEGER/STRING", "Column name if header is true, otherwise 0-based position of the column with the end mutation coordinate") do |data| options[:end_col] = data end options[:minClusterProportion] = 0.01 opts.on("-M", "--minClusterProportion FLOAT", "Minimum percentage of patients per cluster") do |data| options[:minClusterProportion] = data.to_f end options[:patients_filter] = 2 opts.on("-f", "--patients_filter INTEGER", "Minimum number of patients sharing SORs. Default 0") do |data| options[:patients_filter] = data.to_i end options[:clusters2graph] = 30 opts.on("-g", "--clusters2graph INTEGER", "How may patient clusters are plotted in cluster plots. Default 30") do |data| options[:clusters2graph] = data.to_i end options[:genome_assembly] = 'hg38' opts.on("-G", "--genome_assembly STRING", "Genome assembly version. Please choose between hg18, hg19 and hg38. Default hg38") do |data| options[:genome_assembly] = data end options[:header] = true #chr\tstart\tstop opts.on("-H", "--header", "Set if the file has a line header. Default true") do options[:header] = false end options[:input_file] = nil opts.on("-i", "--input_file PATH", "Input file with patient data") do |data| options[:input_file] = data end options[:clustering_methods] = ['resnik', 'jiang_conrath', 'lin'] opts.on("-m", "--clustering_methods ARRAY", "Clustering methods") do |data| options[:clustering_methods] = data.split(',') end options[:hpo_names] = false opts.on("-n", "--hpo_names", "Define if the input HPO are human readable names. Default false") do options[:hpo_names] = true end options[:output_file] = nil opts.on("-o", "--output_file PATH", "Output file with patient data") do |data| options[:output_file] = data end options[:hpo_file] = nil opts.on("-P", "--hpo_file PATH", "Input HPO file for extracting HPO codes") do |value| options[:hpo_file] = value end options[:hpo_col] = nil opts.on("-p", "--hpo_term_col INTEGER/STRING", "Column name if header true or 0-based position of the column with the HPO terms") do |data| options[:hpo_col] = data end options[:hpo_separator] = '|' opts.on("-S", "--hpo_separator STRING", "Set which character must be used to split the HPO profile. Default '|'") do |data| options[:hpo_separator] = data end options[:start_col] = nil opts.on("-s", "--start_col INTEGER/STRING", "Column name if header is true, otherwise 0-based position of the column with the start mutation coordinate") do |data| options[:start_col] = data end options[:root_node] = "HP:0000118" opts.on("-r", "--root_node", "Root node from which the ontology will be represented ") do |root_node| options[:root_node] = root_node end options[:ic_stats] = 'freq' opts.on("-t", "--ic_stats STRING", "'freq' to compute IC based en hpo frequency in the input cohort. 'freq_internal' to use precomputed internal IC stats. 'onto' to compute ic based on hpo ontology structure.. Default freq") do |data| options[:ic_stats] = data end options[:threads] = 1 opts.on("-T", "--threads INTEGER", "Number of threads to be used in calculations. Default 1") do |data| options[:threads] = data.to_i end opts.on_tail("-h", "--help", "Show this message") do puts opts exit end end.parse! ########################## #MAIN ########################## if options[:genome_assembly] == 'hg19' || options[:genome_assembly] == 'hg37' CHR_SIZE = File.join(EXTERNAL_DATA, 'chromosome_sizes_hg19.txt') elsif options[:genome_assembly] == 'hg38' CHR_SIZE = File.join(EXTERNAL_DATA, 'chromosome_sizes_hg38.txt') elsif options[:genome_assembly] == 'hg18' CHR_SIZE = File.join(EXTERNAL_DATA, 'chromosome_sizes_hg18.txt') else abort('Wrong human genome assembly. Please choose between hg19, hg18 or hg38.') end output_folder = File.dirname(options[:output_file]) detailed_profile_evaluation_file = File.join(output_folder, 'detailed_hpo_profile_evaluation.csv') rejected_file = File.join(output_folder, 'rejected_records.txt') temp_folder = File.join(output_folder, 'temp') matrix_file = File.join(temp_folder, 'pat_hpo_matrix.npy') hpo_ic_file = File.join(temp_folder, 'hpo_ic.txt') hpo_profile_ic_file = File.join(temp_folder, 'hpo_ic_profile.txt') hpo_frequency_file = File.join(temp_folder, 'hpo_cohort_frequency.txt') parents_per_term_file = File.join(temp_folder, 'parents_per_term.txt') clustered_patients_file = File.join(temp_folder, 'cluster_asignation') cluster_ic_data_file = File.join(temp_folder, 'cluster_ic_data.txt') cluster_chromosome_data_file = File.join(temp_folder, 'cluster_chromosome_data.txt') coverage_to_plot_file = File.join(temp_folder, 'coverage_data.txt') sor_coverage_to_plot_file = File.join(temp_folder, 'sor_coverage_data.txt') Dir.mkdir(temp_folder) if !File.exists?(temp_folder) hpo_file = !ENV['hpo_file'].nil? ? ENV['hpo_file'] : HPO_FILE hpo = load_hpo_ontology(hpo_file, options[:excluded_hpo]) patient_data = load_patient_cohort(options) rejected_hpos, rejected_patients = format_patient_data(patient_data, options, hpo) File.open(rejected_file, 'w'){|f| f.puts rejected_patients.join("\n")} patient_data.select!{|pat_id, patient_record| !rejected_patients.include?(pat_id)} patient_uniq_profiles, equivalence = get_uniq_hpo_profiles(patient_data) hpo.load_profiles(patient_uniq_profiles) profile_sizes, parental_hpos_per_profile = get_profile_redundancy(hpo) clean_patient_profiles(hpo, patient_uniq_profiles) cohort_hpos, suggested_childs, fraction_terms_specific_childs = compute_hpo_list_and_childs(patient_uniq_profiles, hpo) ontology_levels, distribution_percentage = get_profile_ontology_distribution_tables(hpo) onto_ic, freq_ic = hpo.get_observed_ics_by_onto_and_freq # IC for TERMS onto_ic_profile, freq_ic_profile = hpo.get_profiles_resnik_dual_ICs # IC for PROFILES if options[:ic_stats] == 'freq_internal' ic_file = ENV['ic_file'] ic_file = IC_FILE if ic_file.nil? freq_ic = load_hpo_ci_values(ic_file) phenotype_ic = freq_ic freq_ic_profile = {} patient_uniq_profiles.each do |pat_id, phenotypes| freq_ic_profile[pat_id] = get_profile_ic(phenotypes, phenotype_ic) end else if options[:ic_stats] == 'freq' phenotype_ic = freq_ic elsif options[:ic_stats] == 'onto' phenotype_ic = onto_ic end end clustered_patients = cluster_patients(patient_uniq_profiles, cohort_hpos, matrix_file, clustered_patients_file) all_ics, profile_lengths, cluster_data_by_chromosomes, top_cluster_phenotypes, multi_chromosome_patients = process_clustered_patients(options, clustered_patients, patient_uniq_profiles, patient_data, equivalence, hpo, phenotype_ic, options[:pat_id_col]) get_patient_hpo_frequency(patient_uniq_profiles, hpo_frequency_file) summary_stats = get_summary_stats(patient_uniq_profiles, rejected_patients, cohort_hpos, hpo) summary_stats << ['Percentage of HPO with more specific children', (fraction_terms_specific_childs * 100).round(4)] summary_stats << ['DsI for uniq HP terms', hpo.get_dataset_specifity_index('uniq')] summary_stats << ['DsI for frequency weigthed HP terms', hpo.get_dataset_specifity_index('weigthed')] hpo_stats = hpo.get_profiles_terms_frequency() hpo_stats.each{ |stat| stat[1] = stat[1]*100} summary_stats << ['Number of unknown phenotypes', rejected_hpos.length] all_cnvs_length = [] if !options[:chromosome_col].nil? summary_stats << ['Number of clusters with mutations accross > 1 chromosomes', multi_chromosome_patients] #---------------------------------- # Prepare data to plot coverage #---------------------------------- if options[:coverage_analysis] processed_patient_data = process_patient_data(patient_data) cnv_sizes = [] processed_patient_data.each do |chr, metadata| metadata.each do |patientID, start, stop| cnv_sizes << stop - start end end cnv_size_average = cnv_sizes.inject{ |sum, el| sum + el }.fdiv(cnv_sizes.length.to_f) patients_by_cluster, sors = generate_cluster_regions(processed_patient_data, 'A', 0) total_patients_sharing_sors = [] all_patients = patients_by_cluster.keys all_patients.each do |identifier| total_patients_sharing_sors << identifier.split('_i').first end all_cnvs_length = get_cnvs_length(patient_data) ###1. Process CNVs raw_coverage, n_cnv, nt, pats_per_region = calculate_coverage(sors) summary_stats << ['Average variant size', cnv_size_average.round(4)] summary_stats << ['Nucleotides affected by mutations', nt] summary_stats << ['Number of genome windows', n_cnv] summary_stats << ['Mean patients per genome window', pats_per_region.round(4)] coverage_to_plot = get_final_coverage(raw_coverage, options[:bin_size]) ###2. Process SORs raw_sor_coverage, n_sor, nt, pats_per_region = calculate_coverage(sors, options[:patients_filter] - 1) summary_stats << ["Number of genome window shared by >= #{options[:patients_filter]} patients", n_sor] summary_stats << ["Number of patients with at least 1 SOR", total_patients_sharing_sors.uniq.length] summary_stats << ['Nucleotides affected by mutations', nt] # summary_stats << ['Patient average per region', pats_per_region] sor_coverage_to_plot = get_final_coverage(raw_sor_coverage, options[:bin_size]) all_sor_length = get_sor_length_distribution(raw_sor_coverage) end end #---------------------------------- # Write files for report #---------------------------------- write_detailed_hpo_profile_evaluation(suggested_childs, detailed_profile_evaluation_file, summary_stats) write_arrays4scatterplot(onto_ic.values, freq_ic.values, hpo_ic_file, 'OntoIC', 'FreqIC') # hP terms write_arrays4scatterplot(onto_ic_profile.values, freq_ic_profile.values, hpo_profile_ic_file, 'OntoIC', 'FreqIC') #HP profiles write_arrays4scatterplot(profile_sizes, parental_hpos_per_profile, parents_per_term_file, 'ProfileSize', 'ParentTerms') system_call(EXTERNAL_CODE, 'plot_scatterplot_simple.R', "-i #{hpo_ic_file} -o #{File.join(temp_folder, 'hpo_ics.pdf')} -x 'OntoIC' -y 'FreqIC' --x_tag 'HP Ontology IC' --y_tag 'HP Frequency based IC' --x_lim '0,4.5' --y_lim '0,4.5'") if !File.exists?(File.join(temp_folder, 'hpo_ics.pdf')) system_call(EXTERNAL_CODE, 'plot_scatterplot_simple.R', "-i #{hpo_profile_ic_file} -o #{File.join(temp_folder, 'hpo_profile_ics.pdf')} -x 'OntoIC' -y 'FreqIC' --x_tag 'HP Ontology Profile IC' --y_tag 'HP Frequency based Profile IC' --x_lim '0,4.5' --y_lim '0,4.5'") if !File.exists?(File.join(temp_folder, 'hpo_profile_ics.pdf')) system_call(EXTERNAL_CODE, 'plot_scatterplot_simple.R', "-i #{parents_per_term_file} -o #{File.join(temp_folder, 'parents_per_term.pdf')} -x 'ProfileSize' -y 'ParentTerms' --x_tag 'Patient HPO profile size' --y_tag 'Parent HPO terms within the profile'") ###Cohort frequency calculation ronto_file = File.join(temp_folder, 'hpo_freq_colour') system_call(EXTERNAL_CODE, 'ronto_plotter.R', "-i #{hpo_frequency_file} -o #{ronto_file} --root_node #{options[:root_node]} -O #{hpo_file.gsub('.json','.obo')}") if !File.exist?(ronto_file + '.png') write_cluster_ic_data(all_ics, profile_lengths, cluster_ic_data_file, options[:clusters2graph]) system_call(EXTERNAL_CODE, 'plot_boxplot.R', "#{cluster_ic_data_file} #{temp_folder} cluster_id ic 'Cluster size/id' 'Information coefficient' 'Plen' 'Profile size'") if !options[:chromosome_col].nil? write_cluster_chromosome_data(cluster_data_by_chromosomes, cluster_chromosome_data_file, options[:clusters2graph]) system_call(EXTERNAL_CODE, 'plot_scatterplot.R', "#{cluster_chromosome_data_file} #{temp_folder} cluster_id chr count 'Cluster size/id' 'Chromosome' 'Patients'") if options[:coverage_analysis] ###1. Process CNVs write_coverage_data(coverage_to_plot, coverage_to_plot_file) system_call(EXTERNAL_CODE, 'plot_area.R', "-d #{coverage_to_plot_file} -o #{temp_folder}/coverage_plot -x V2 -y V3 -f V1 -H -m #{CHR_SIZE} -t CNV") ###2. Process SORs write_coverage_data(sor_coverage_to_plot, sor_coverage_to_plot_file) system_call(EXTERNAL_CODE, 'plot_area.R', "-d #{sor_coverage_to_plot_file} -o #{temp_folder}/sor_coverage_plot -x V2 -y V3 -f V1 -H -m #{CHR_SIZE} -t SOR") end end #---------------------------------- # CLUSTER COHORT ANALYZER REPORT #---------------------------------- Parallel.each(options[:clustering_methods], in_processes: options[:threads] ) do |method_name| matrix_filename = File.join(temp_folder, "similarity_matrix_#{method_name}.npy") axis_file = matrix_filename.gsub('.npy','.lst') profiles_similarity_filename = File.join(temp_folder, ['profiles_similarity', method_name].join('_').concat('.txt')) clusters_distribution_filename = File.join(temp_folder, ['clusters_distribution', method_name].join('_').concat('.txt')) if !File.exists?(matrix_filename) profiles_similarity = hpo.compare_profiles(sim_type: method_name.to_sym) write_profile_pairs(profiles_similarity, profiles_similarity_filename) similarity_matrix, axis_names = format_profiles_similarity_data_numo(profiles_similarity) File.open(axis_file, 'w'){|f| f.print axis_names.join("\n") } Npy.save(matrix_filename, similarity_matrix) end ext_var = '' if method_name == 'resnik' ext_var = '-m max' elsif method_name == 'lin' ext_var = '-m comp1' end out_file = File.join(temp_folder, method_name) system_call(EXTERNAL_CODE, 'plot_heatmap.R', "-y #{axis_file} -d #{matrix_filename} -o #{out_file} -M #{options[:minClusterProportion]} -t dynamic -H #{ext_var}") if !File.exists?(out_file + '_heatmap.png') clusters_codes, clusters_info = parse_clusters_file(File.join(temp_folder, "#{method_name}_clusters.txt"), patient_uniq_profiles) get_cluster_metadata(clusters_info, clusters_distribution_filename) out_file = File.join(temp_folder, ['clusters_distribution', method_name].join('_')) system_call(EXTERNAL_CODE, 'xyplot_graph.R', "-d #{clusters_distribution_filename} -o #{out_file} -x PatientsNumber -y HPOAverage") if !File.exists?(out_file) clusters = translate_codes(clusters_codes, hpo) container = { :temp_folder => temp_folder, :cluster_name => method_name, :clusters => clusters, :hpo => hpo } template = File.open(File.join(REPORT_FOLDER, 'cluster_report.erb')).read report = Report_html.new(container, 'Patient clusters report') report.build(template) report.write(options[:output_file]+"_#{method_name}_clusters.html") end system_call(EXTERNAL_CODE, 'generate_boxpot.R', "-i #{temp_folder} -o #{File.join(temp_folder, 'sim_boxplot')}") if !File.exists?(File.join(temp_folder, 'sim_boxplot.png')) #---------------------------------- # GENERAL COHORT ANALYZER REPORT #---------------------------------- total_patients = 0 new_cluster_phenotypes = {} phenotypes_frequency = Hash.new(0) top_cluster_phenotypes.each_with_index do |cluster, clusterID| total_patients = cluster.length cluster.each do |phenotypes| phenotypes.each do |p| phenotypes_frequency[p] += 1 end end new_cluster_phenotypes[clusterID] = [total_patients, phenotypes_frequency.keys, phenotypes_frequency.values.map{|v| v.fdiv(total_patients) * 100}] phenotypes_frequency = Hash.new(0) end container = { :temp_folder => temp_folder, # :top_cluster_phenotypes => top_cluster_phenotypes.length, :summary_stats => summary_stats, :clustering_methods => options[:clustering_methods], :hpo_stats => hpo_stats, :all_cnvs_length => all_cnvs_length, :all_sor_length => all_sor_length, :new_cluster_phenotypes => new_cluster_phenotypes.keys.length, :ontology_levels => ontology_levels, :distribution_percentage => distribution_percentage } clust_info = [] new_cluster_phenotypes.each do |clusterID, info| phens = info[1].join(', ') freqs = info[2].map{|a| a.round(4)}.join(', ') clust_info << [info[0], phens, freqs] container["clust_#{clusterID}"] = clust_info clust_info = [] end template = File.open(File.join(REPORT_FOLDER, 'cohort_report.erb')).read report = Report_html.new(container, 'Cohort quality report') report.build(template) report.write(options[:output_file]+'.html')