generate_boxpot.R in pets-0.2.4

- old
+ new

@@ -1,20 +1,39 @@
 #!/usr/bin/env Rscript
 suppressMessages(library(dplyr))
 
-load_file <- function(file_path, cluster_sim_out = NULL){ 
+#####################
+## FUNCTIONS
+#####################
+
+load_file <- function(file_path, cluster_sim_out = NULL, sim_method = 'lin'){ 
 	# sim_matrix <- read.table(file = file.path(file_path), sep = "\t", stringsAsFactors = FALSE, header = FALSE)
-	sim_matrix <- RcppCNPy::npyLoad(file.path(file_path, "similarity_matrix_lin.npy"))
-	axis_labels <- read.table(file.path(file_path, "similarity_matrix_lin.lst"), header=FALSE, stringsAsFactors=FALSE)
- 	colnames(sim_matrix) <- axis_labels$V1
- 	rownames(sim_matrix) <- axis_labels$V1
- 	diag(sim_matrix) <- NA
+	file_name <- file.path(file_path,paste0('similarity_matrix_',sim_method,'.npy'))
+	sim_matrix <- RcppCNPy::npyLoad(file_name)
+	file_name <- file.path(file_path,paste0('similarity_matrix_',sim_method,'.lst'))
+	if(file.exists(file_name)){ # squared matrix
 
- 	groups <- read.table(file.path(file_path, "lin_clusters.txt"), header=FALSE)
+		axis_labels <- read.table(file_name, header=FALSE, stringsAsFactors=FALSE, sep="\t")
+	 	colnames(sim_matrix) <- axis_labels$V1
+	 	rownames(sim_matrix) <- axis_labels$V1
+	 	diag(sim_matrix) <- NA
+		file_name <- paste0(sim_method,'_clusters.txt')
+		split_mode = "byboth"
+		
+	}else{ # rectangular matrix
+		axis_labels_x <- read.table(file.path(file_path, paste0('similarity_matrix_',sim_method,'_x.lst')), header=FALSE, stringsAsFactors=FALSE, sep="\t")
+		axis_labels_y <- read.table(file.path(file_path, paste0('similarity_matrix_',sim_method,'_y.lst')), header=FALSE, stringsAsFactors=FALSE, sep="\t")
+	 	colnames(sim_matrix) <- axis_labels_x$V1
+	 	rownames(sim_matrix) <- axis_labels_y$V1
+		file_name <- paste0(sim_method,'_clusters_rows.txt')
+		split_mode = "byrows"
+	}
+	
+ 	groups <- read.table(file.path(file_path, file_name), header=FALSE, sep="\t")
  	groups_vec <- groups[,2]
 	names(groups_vec) <- groups[,1]
- 	sim_within_groups <- calc_sim_within_groups(sim_matrix, groups_vec)
+ 	sim_within_groups <- calc_sim_within_groups(sim_matrix, groups_vec, split_mode = split_mode)
  	if (!is.null(cluster_sim_out))
  	write.table(sim_within_groups, cluster_sim_out, quote=FALSE, row.names=TRUE, sep="\t", col.names = FALSE)
 	sim_matrix <- sim_matrix %>% as.data.frame %>% tibble::rownames_to_column() %>% 
 	    tidyr::pivot_longer(-rowname) %>% dplyr::filter(rowname != name)
 	colnames(sim_matrix) <- c("pat_c", "pat_r", "Similarity")
@@ -24,45 +43,58 @@
 		data.frame(Similarity = sim_within_groups, sim_type = "Clustered_patients"))
 	return(tagged_data)
 }
 
 
-get_group_submatrix_mean <- function(group, matrix_transf, groups=groups) {
-  mean(matrix_transf[
-		names(groups)[groups %in% group],
-		names(groups)[groups %in% group]
-      ], na.rm=TRUE
-  )
+get_group_submatrix_mean <- function(group, matrix_transf, groups=groups, split_mode = "byboth") {
+	submatrix <- matrix_transf
+	if (split_mode %in% c("byboth", "bycols")){
+		submatrix <- submatrix[,names(groups)[groups %in% group]]
+	}
+
+	if (split_mode %in% c("byboth", "byrows")){
+		submatrix <- submatrix[names(groups)[groups %in% group],]
+	}
+  mean(submatrix, na.rm=TRUE)
 }
 
-calc_sim_within_groups <- function(matrix_transf, groups) {
+calc_sim_within_groups <- function(matrix_transf, groups, split_mode = "byboth") {
 	unique_groups <- unique(groups)
-	group_mean_sim <- sapply(unique_groups, get_group_submatrix_mean, matrix_transf=matrix_transf, groups=groups)
+	group_mean_sim <- sapply(unique_groups, get_group_submatrix_mean, matrix_transf=matrix_transf, groups=groups, split_mode = split_mode)
 	names(group_mean_sim) <- unique_groups
 	group_mean_sim
 }
 
+#####################
+## OPTPARSE
+#####################
+
 option_list <- list(
   optparse::make_option(c("-i", "--input_paths"), type="character", default=NULL,
-    help="Path to Npy and names."),
+    help="Path to Npy and names"),
+  optparse::make_option(c("-m", "--sim_method"), type="character", default='lin',
+    help="Similarity method"),
   optparse::make_option(c("-o", "--output_file"), type="character", default=NULL,
     help="Output graph file name"),
   optparse::make_option(c("-t", "--tags"), type="character", default=NULL,
     help="Comma separate tags in the same order than files")
 )
 opt <- optparse::parse_args(optparse::OptionParser(option_list=option_list))
 
+#####################
+## MAIN
+#####################
 
 all_files <- unlist(strsplit(opt$input_paths, ","))
 tags <- seq(length(all_files))
 if (!is.null(opt$tags)){
 	tags <- unlist(strsplit(opt$tags, ","))
 }
 
 similarity_dist <- list()
 for (file_i in seq(length(all_files))) {
-	similarity_dist[[tags[file_i]]] <- load_file(all_files[file_i], cluster_sim_out = paste0(opt$output_file,"_", tags[file_i],"_cluster_sim"))
+	similarity_dist[[tags[file_i]]] <- load_file(all_files[file_i], cluster_sim_out = paste0(opt$output_file,"_", tags[file_i],"_cluster_sim"), sim_method = opt$sim_method)
 }
 similarity_dist[["enod"]] <- NULL
 for (tag in names(similarity_dist)){
 	print(tag)
 	sim_df <- similarity_dist[[tag]]
@@ -81,10 +113,8 @@
 	ggplot2::theme(axis.text = ggplot2::element_text(size =14), 
 				   axis.title = ggplot2::element_text(size=18, face="bold"),
 				   legend.position = "top",
 				   legend.title = ggplot2::element_text(size = 14),
   					legend.text = ggplot2::element_text(size = 14)) +
-	ggplot2::labs(fill = "Lin similarity")
+	ggplot2::labs(fill = paste0(opt$sim_method, " similarity"))
 
 ggplot2::ggsave(filename = paste0(opt$output_file,".png"),pp,width = 20, height = 18, dpi = 200, units = "cm", device='png')
-
-