#! /usr/bin/env ruby ROOT_PATH = File.dirname(__FILE__) REPORT_FOLDER = File.expand_path(File.join(ROOT_PATH, '..', 'templates')) EXTERNAL_DATA = File.expand_path(File.join(ROOT_PATH, '..', 'external_data')) HPO_FILE = File.join(EXTERNAL_DATA, 'hp.obo') IC_FILE = File.join(EXTERNAL_DATA, 'uniq_hpo_with_CI.txt') CHR_SIZE = File.join(EXTERNAL_DATA, 'chromosome_sizes_hg19.txt') $: << File.expand_path(File.join(ROOT_PATH, '..', 'lib', 'gephepred')) require 'optparse' require 'csv' require 'generalMethods.rb' require 'coPatReporterMethods.rb' require 'report_html' ########################## #METHODS ########################## HPOS = 0 CHR = 1 START = 2 STOP = 3 def format_patient_data(patient_data, options, name2code_dictionary, hpo_storage, hpo_parent_child_relations) all_hpo = [] rejected_hpos = [] suggested_childs = {} patient_data.each do |pat_id, patient_record| string_hpos, chr, start, stop = patient_record hpos = string_hpos.split(options[:hpo_separator]) translate_hpo_names2codes(hpos, name2code_dictionary, pat_id, rejected_hpos) if options[:hpo_names] suggested_childs[pat_id] = check_hpo_codes(hpos, hpo_storage, hpo_parent_child_relations, pat_id, rejected_hpos) all_hpo.concat(hpos) patient_record[HPOS] = hpos patient_record[START] = start.to_i if !start.nil? patient_record[STOP] = stop.to_i if !stop.nil? end return all_hpo.uniq, suggested_childs, rejected_hpos.uniq end def translate_hpo_names2codes(hpos, hpo_dictionary, pat_id, rejected_hpos) hpo_codes = [] hpos.each_with_index do |hpo_name, i| hpo_code = hpo_dictionary[hpo_name] if hpo_code.nil? STDERR.puts "WARNING: patient #{pat_id} has the unknown hpo NAME '#{hpo_name}'. Rejected." rejected_hpos << hpo_name else hpo_codes << hpo_code end end hpos.clear hpos.concat(hpo_codes) end def check_hpo_codes(hpos, hpo_storage, hpo_parent_child_relations, pat_id, rejected_hpos) more_specific_hpo = [] hpos.each_with_index do |hpo_code, i| hpo_data = hpo_storage[hpo_code] if hpo_data.nil? hpos[i] = nil STDERR.puts "WARNING: patient #{pat_id} has the unknown hpo CODE '#{hpo_code}'. Rejected." rejected_hpos << hpo_code else main_hpo_code, name = hpo_data hpos[i] = main_hpo_code # change from alternate hpo codes to the main ones childs = hpo_parent_child_relations[main_hpo_code] if childs.nil? specific_childs = [] else specific_childs = childs end more_specific_hpo << [[main_hpo_code, name], specific_childs] end end hpos.compact! return more_specific_hpo end def generate_patient_hpo_matrix(patient_data, cohort_hpos) matrix = [] n = cohort_hpos.length patient_data.each do |pat_id, patient_record| pat_hpos = patient_record[HPOS] vector = Array.new(n, 0) pat_hpos.each do |hpo| vector[cohort_hpos.index(hpo)] = 1 end matrix << vector end return matrix end def write_matrix_for_R(matrix, x_names, y_names, file) File.open(file, 'w') do |f| f.puts x_names.join("\t") matrix.each_with_index do |row, i| f.puts [y_names[i]].concat(row).join("\t") end end end def process_clustered_patients(options, clustered_patients, patient_data) # get ic and chromosomes if options[:ic_stats] ic_file = ENV['ic_file'] ic_file = IC_FILE if ic_file.nil? phenotype_ic = load_hpo_ci_values(ic_file) else phenotype_ic = compute_IC_values(patient_data, $patient_number) end all_ics = [] top_cluster_phenotypes = [] cluster_data_by_chromosomes = [] multi_chromosome_patients = 0 processed_clusters = 0 clustered_patients.sort_by{|cl_id, pat_ids| pat_ids.length }.reverse.each do |cluster_id, patient_ids| num_of_patients = patient_ids.length next if num_of_patients == 1 chrs = Hash.new(0) all_phens = [] profile_ics = [] patient_ids.each do |pat_id| patient = patient_data[pat_id] phenotypes = patient[HPOS] profile_ics << get_profile_ic(phenotypes, phenotype_ic) #optional all_phens << phenotypes if processed_clusters < options[:clusters2show_detailed_phen_data] chrs[patient[CHR]] += 1 if !options[:chromosome_col].nil? end top_cluster_phenotypes << all_phens if processed_clusters < options[:clusters2show_detailed_phen_data] all_ics << profile_ics # STDERR.puts [cluster_id, num_of_patients, chr, count].inspect if !options[:chromosome_col].nil? multi_chromosome_patients += num_of_patients if chrs.length > 1 chrs.each do |chr, count| cluster_data_by_chromosomes << [cluster_id, num_of_patients, chr, count] end end processed_clusters += 1 end # STDERR.puts cluster_data_by_chromosomes.inspect return all_ics, cluster_data_by_chromosomes, top_cluster_phenotypes, multi_chromosome_patients end def get_profile_ic(hpo_names, phenotype_ic) ic = 0 profile_length = 0 hpo_names.each do |hpo_id| hpo_ic = phenotype_ic[hpo_id] # STDERR.puts phenotype_ic.inspect ic += hpo_ic if !hpo_ic.nil? profile_length += 1 end profile_length = 1 if profile_length == 0 return ic.fdiv(profile_length) end def write_cluster_ic_data(all_ics, cluster_ic_data_file, limit) File.open(cluster_ic_data_file, 'w') do |f| f.puts %w[cluster_id ic].join("\t") all_ics.each_with_index do |cluster_ics, i| break if i == limit cluster_length = cluster_ics.length cluster_ics.each do |clust_ic| f.puts "#{cluster_length}_#{i}\t#{clust_ic}" end end end end def write_cluster_chromosome_data(cluster_data, cluster_chromosome_data_file, limit) File.open(cluster_chromosome_data_file, 'w') do |f| f.puts %w[cluster_id chr count].join("\t") index = 0 last_id = cluster_data.first.first unless cluster_data.empty? cluster_data.each do |cluster_id, patient_number, chr, count| index += 1 if cluster_id != last_id break if index == limit f.puts ["#{patient_number}_#{index}", chr, count].join("\t") last_id = cluster_id end end end def write_coverage_data(coverage_to_plot, coverage_to_plot_file) File.open(coverage_to_plot_file, 'w') do |f| coverage_to_plot.each do |chr, position, freq| f.puts "#{chr}\t#{position}\t#{freq}" end end end def get_hpo_profile(patient_data) hpo_profiles = [] patient_data.each do |pat_id, pat_data| hpo_profiles << pat_data[HPOS] end return hpo_profiles end def get_summary_stats(patient_data, cohort_hpos, all_hpo_profiles) stats = [] ids = [] stats << ['Unique HPOs', cohort_hpos.length] patient_ids = patient_data.keys patient_ids.each do |pat_id| id, count = pat_id.split('_i') ids << id end n_pat = ids.uniq.length stats << ['Number of patients in the cohort', n_pat] all_hpo_prof_lengths = all_hpo_profiles.map{|p| p.length}.sort stats << ['HPOs per patient (average)', all_hpo_prof_lengths.inject(0){|sum, n| sum + n}.fdiv(n_pat).round(4)] hpo_pat90 = nil rate = 0 count = 0 while rate <= 0.1 hpo_pat90 = all_hpo_prof_lengths[count+1] rate = count.fdiv(n_pat) count += 1 end stats << ['HPOs for patient in percentile 90', hpo_pat90] return stats end def hpo_stats(all_hpo_profiles) stats = Hash.new(0) all_hpo_profiles.each do |profile| profile.each do |hpo| stats[hpo] += 1 end end n_profiles = all_hpo_profiles.length hpo_stats = [] stats.each do |hpo, count| hpo_stats << [hpo, count.fdiv(n_profiles)*100] end hpo_stats.sort!{|h1, h2| h2[1] <=> h1[1]} return hpo_stats[0..20] end def translate_hpo_codes2names(all_hpo_profiles, hpo_storage) all_hpo_profiles.each do |profile| profile.each_with_index do |hpo, i| hpo_data = hpo_storage[hpo] if hpo_data.nil? STDERR.puts "WARNING: hpo code '#{hpo}' not exists." else profile[i] = hpo_data[1] end end end end def write_detailed_hpo_profile_evaluation(suggested_childs, detailed_profile_evaluation_file, summary_stats) hpo_count = 0 parent_hpo_count = 0 CSV.open(detailed_profile_evaluation_file, "wb") do |csv| suggested_childs.each do |pat_id, suggestions| warning = nil warning = 'WARNING: Very few phenotypes' if suggestions.length < 4 csv << ["PATIENT #{pat_id}", "#{warning}"] csv << ["CURRENT PHENOTYPES", "PUTATIVE MORE SPECIFIC PHENOTYPES"] suggestions.each do |parent, childs| hpo_count += 1 parent_code, parent_name = parent if childs.empty? csv << ["#{parent_name} (#{parent_code})", '-'] else parent_hpo_count += 1 parent_writed = false childs.each do |child_code, child_name| if !parent_writed parent_field = "#{parent_name} (#{parent_code})" parent_writed = true else parent_field = "" end csv << [parent_field, "#{child_name} (#{child_code})"] end end end csv << ["", ""] end end summary_stats << ['Percentage of defined HPOs that have more specific childs', (parent_hpo_count.fdiv(hpo_count) * 100).round(4)] end ########################## #OPT-PARSER ########################## options = {} OptionParser.new do |opts| opts.banner = "Usage: #{__FILE__} [options]" options[:coverage_analysis] = true opts.on("-a", "--coverage_analysis", "Deactivate genome coverage analysis. Default true") do options[:coverage_analysis] = false end options[:bin_size] = 50000 opts.on("-b", "--bin_size INTEGER", "Maximum number of bins to plot the coverage") do |data| options[:bin_size] = data.to_i end options[:clusters2show_detailed_phen_data] = 3 opts.on("-C", "--clusters2show INTEGER", "How many patient clusters are show in detailed phenotype cluster data section. Default 3") do |data| options[:clusters2show_detailed_phen_data] = data.to_i end options[:chromosome_col] = nil opts.on("-c", "--chromosome_col INTEGER/STRING", "Column name if header is true, otherwise 0-based position of the column with the chromosome") do |data| options[:chromosome_col] = data end options[:pat_id_col] = nil opts.on("-d", "--pat_id_col INTEGER/STRING", "Column name if header is true, otherwise 0-based position of the column with the patient id") do |data| options[:pat_id_col] = data end options[:excluded_hpo] = nil opts.on("-E", "--excluded_hpo PATH", "List of HPO phenotypes to exclude (low informative)") do |excluded_hpo| options[:excluded_hpo] = excluded_hpo end options[:end_col] = nil opts.on("-e", "--end_col INTEGER/STRING", "Column name if header is true, otherwise 0-based position of the column with the end mutation coordinate") do |data| options[:end_col] = data end options[:patients_filter] = 2 opts.on("-f", "--patients_filter INTEGER", "Minimum number of patients sharing SORs. Default 0") do |data| options[:patients_filter] = data.to_i end options[:clusters2graph] = 30 opts.on("-g", "--clusters2graph INTEGER", "How may patient clusters are plotted in cluster plots. Default 30") do |data| options[:clusters2graph] = data.to_i end options[:header] = true #chr\tstart\tstop opts.on("-H", "--header", "Set if the file has a line header. Default true") do options[:header] = false end options[:input_file] = nil opts.on("-i", "--input_file PATH", "Input file with patient data") do |data| options[:input_file] = data end options[:hpo_names] = false opts.on("-n", "--hpo_names", "Define if the input HPO are human readable names. Default false") do options[:hpo_names] = true end options[:output_file] = nil opts.on("-o", "--output_file PATH", "Output file with patient data") do |data| options[:output_file] = data end options[:hpo_file] = nil opts.on("-P", "--hpo_file PATH", "Input HPO file for extracting HPO codes") do |value| options[:hpo_file] = value end options[:hpo_col] = nil opts.on("-p", "--hpo_term_col INTEGER/STRING", "Column name if header true or 0-based position of the column with the HPO terms") do |data| options[:hpo_col] = data end options[:hpo_separator] = '|' opts.on("-S", "--hpo_separator STRING", "Set which character must be used to split the HPO profile. Default '|'") do |data| options[:hpo_separator] = data end options[:start_col] = nil opts.on("-s", "--start_col INTEGER/STRING", "Column name if header is true, otherwise 0-based position of the column with the start mutation coordinate") do |data| options[:start_col] = data end options[:ic_stats] = false opts.on("-t", "--ic_stats", "Use internal IC stats. Default false") do options[:ic_stats] = true end opts.on_tail("-h", "--help", "Show this message") do puts opts exit end end.parse! ########################## #MAIN ########################## output_folder = File.dirname(options[:output_file]) detailed_profile_evaluation_file = File.join(output_folder, 'detailed_hpo_profile_evaluation.csv') temp_folder = File.join(output_folder, 'temp') matrix_file = File.join(temp_folder, 'pat_hpo_matrix.txt') clustered_patients_file = File.join(temp_folder, 'cluster_asignation') cluster_ic_data_file = File.join(temp_folder, 'cluster_ic_data.txt') cluster_chromosome_data_file = File.join(temp_folder, 'cluster_chromosome_data.txt') coverage_to_plot_file = File.join(temp_folder, 'coverage_data.txt') sor_coverage_to_plot_file = File.join(temp_folder, 'sor_coverage_data.txt') # cnvs_lenght_to_plot_file = File.join(temp_folder, 'cnvs_lenght.txt') Dir.mkdir(temp_folder) if !File.exists?(temp_folder) # LOAD HPO DATA #------------------------- # #load hpo dictionaries hpo_black_list = [] hpo_black_list = load_hpo_black_list(options[:excluded_hpo]) if !options[:excluded_hpo].nil? hpo_file = ENV['hpo_file'] hpo_file = HPO_FILE if hpo_file.nil? hpo_storage = load_hpo_file(hpo_file, hpo_black_list) hpo_parent_child_relations = get_child_parent_relations(hpo_storage) name2code_dictionary = create_hpo_dictionary(hpo_storage) if options[:hpo_names] patient_data, $patient_number = load_patient_cohort(options) cohort_hpos, suggested_childs, rejected_hpos = format_patient_data(patient_data, options, name2code_dictionary, hpo_storage, hpo_parent_child_relations) pat_hpo_matrix = generate_patient_hpo_matrix(patient_data, cohort_hpos) write_matrix_for_R(pat_hpo_matrix, cohort_hpos, patient_data.keys, matrix_file) system("get_clusters.R #{matrix_file} #{temp_folder}") if !File.exists?(clustered_patients_file) clustered_patients = load_clustered_patients(clustered_patients_file) all_ics, cluster_data_by_chromosomes, top_cluster_phenotypes, multi_chromosome_patients = process_clustered_patients(options, clustered_patients, patient_data) write_cluster_ic_data(all_ics, cluster_ic_data_file, options[:clusters2graph]) system("plot_boxplot.R #{cluster_ic_data_file} #{temp_folder} cluster_id ic 'Cluster size/id' 'Information coefficient'") all_hpo_profiles = get_hpo_profile(patient_data) translate_hpo_codes2names(all_hpo_profiles, hpo_storage) summary_stats = get_summary_stats(patient_data, cohort_hpos, all_hpo_profiles) write_detailed_hpo_profile_evaluation(suggested_childs, detailed_profile_evaluation_file, summary_stats) hpo_stats = hpo_stats(all_hpo_profiles) summary_stats << ['Number of unknown phenotypes', rejected_hpos.length] all_cnvs_length = [] if !options[:chromosome_col].nil? summary_stats << ['Number of clusters with mutations accross > 1 chromosomes', multi_chromosome_patients] write_cluster_chromosome_data(cluster_data_by_chromosomes, cluster_chromosome_data_file, options[:clusters2graph]) system("plot_scatterplot.R #{cluster_chromosome_data_file} #{temp_folder} cluster_id chr count 'Cluster size/id' 'Chromosome' 'Patients'") #---------------------------------- #Prepare data to plot coverage if options[:coverage_analysis] processed_patient_data = process_patient_data(patient_data) cnv_sizes = [] processed_patient_data.each do |chr, metadata| metadata.each do |patientID, start, stop| cnv_sizes << stop - start end end cnv_size_average = cnv_sizes.inject{ |sum, el| sum + el }.fdiv(cnv_sizes.length.to_f) patients_by_cluster, sors = generate_cluster_regions(processed_patient_data, 'A', 0) total_patients_sharing_sors = [] all_patients = patients_by_cluster.keys all_patients.each do |identifier| total_patients_sharing_sors << identifier.split('_i').first end all_cnvs_length = get_cnvs_length(patient_data) ###1. Process CNVs raw_coverage, n_cnv, nt, pats_per_region = calculate_coverage(sors) summary_stats << ['Number of genome windows', n_cnv] summary_stats << ['Nucleotides affected by mutations', nt] summary_stats << ['Patient average per region', pats_per_region.round(4)] summary_stats << ['CNV size average', cnv_size_average.round(4)] coverage_to_plot = get_final_coverage(raw_coverage, options[:bin_size]) write_coverage_data(coverage_to_plot, coverage_to_plot_file) cmd = "plot_area.R -d #{coverage_to_plot_file} -o #{temp_folder}/coverage_plot -x V2 -y V3 -f V1 -H -m #{CHR_SIZE} -t CNV" system(cmd) ###2. Process SORs raw_sor_coverage, n_sor, nt, pats_per_region = calculate_coverage(sors, options[:patients_filter] - 1) summary_stats << ["Number of patients with at least 1 SOR", total_patients_sharing_sors.uniq.length] summary_stats << ["Number of SORs with >= #{options[:patients_filter]} patients", n_sor] summary_stats << ['Nucleotides affected by mutations', nt] # summary_stats << ['Patient average per region', pats_per_region] sor_coverage_to_plot = get_final_coverage(raw_sor_coverage, options[:bin_size]) write_coverage_data(sor_coverage_to_plot, sor_coverage_to_plot_file) system("plot_area.R -d #{sor_coverage_to_plot_file} -o #{temp_folder}/sor_coverage_plot -x V2 -y V3 -f V1 -H -m #{CHR_SIZE} -t SOR") all_sor_length = get_sor_length_distribution(raw_sor_coverage) end end #---------------------------------- #Report total_patients = 0 new_cluster_phenotypes = {} phenotypes_frequency = Hash.new(0) top_cluster_phenotypes.each_with_index do |cluster, clusterID| total_patients = cluster.length cluster.each do |phenotypes| phenotypes.each do |p| phenotypes_frequency[p] += 1 end end new_cluster_phenotypes[clusterID] = [total_patients, phenotypes_frequency.keys, phenotypes_frequency.values.map{|v| v.fdiv(total_patients) * 100}] phenotypes_frequency = Hash.new(0) end container = { :temp_folder => temp_folder, # :top_cluster_phenotypes => top_cluster_phenotypes.length, :summary_stats => summary_stats, :hpo_stats => hpo_stats, :all_cnvs_length => all_cnvs_length, :all_sor_length => all_sor_length, :new_cluster_phenotypes => new_cluster_phenotypes.keys.length } # top_cluster_phenotypes.each_with_index do |cluster, i| # clust_pr = cluster.map{|pr| [pr.join(', ')] } # container["clust_#{i}"] = clust_pr # end clust_info = [] new_cluster_phenotypes.each do |clusterID, info| phens = info[1].join(', ') freqs = info[2].map{|a| a.round(4)}.join(', ') clust_info << [info[0], phens, freqs] container["clust_#{clusterID}"] = clust_info clust_info = [] end template = File.open(File.join(REPORT_FOLDER, 'cohort_report.erb')).read report = Report_html.new(container, 'Cohort quality report') report.build(template) report.write(options[:output_file]+'.html')