library(COTAN)
library(pROC)
options(parallelly.fork.enable = TRUE)
library(Seurat)
library(monocle3)
library(reticulate)
library(cowplot)
library(stringr)
library(dplyr)
library(tidyr)
library(ggplot2)
library(ggpubr)
<- c("COTAN"="#F73604","Seurat"="#ABD9E9","Seurat_scTr"="#74ADD1","Seurat_Bimod"="#4575B4", "Monocle"="#DAABE9", "Scanpy"="#7F9B5C" )
methods.color
<- "Results/FDR/"
dirOut if (!dir.exists(dirOut)) {
dir.create(dirOut)
}
<- "Data/MouseCortexFromLoom/FDR/MergedClusters_For_FDR/" dataSetDir
FDR analysis - Results - thresholds true 3% 10%
Preamble
Dataset composition
<- read.csv(file.path(dataSetDir,"Cells_Usage_DataFrame.csv"),
datasets_csv row.names = 1
)
1:3,] datasets_csv[
Group Collection E13.5.432 E13.5.187 E13.5.434
1 2_Clusters_even_near E13.5-434_+_E15.0-428 0 0 318
2 2_Clusters_even_near E15.0-432_+_E13.5-432 536 0 0
3 2_Clusters_even_near E15.0-508_+_E15.0-509 0 0 0
E13.5.184 E13.5.437 E13.5.510 E15.0.432 E15.0.509 E15.0.510 E15.0.508
1 0 0 0 0 0 0 0
2 0 0 0 536 0 0 0
3 0 0 0 0 397 0 397
E15.0.428 E15.0.434 E15.0.437 E17.5.516 E17.5.505
1 318 0 0 0 0
2 0 0 0 0 0
3 0 0 0 0 0
Define which genes are expressed
For each data set we need to define, independently from the DEA methods, which genes are specific for each cluster. So we need to define first which genes are expressed and which are not expressed. To do so we can take advantage from the fact that we have the original clusters from which the cells were sampled to create the artificial datasets. So looking to the original cluster we define as expressed all genes present in at least the 10% of cells and we define as not expressed the genes completely absent or expressed in less than 3% of cells.
Since these two thresholds can have a big influence on the tools performances we will test also others in other pages.
<- readRDS("Data/MouseCortexFromLoom/FDR/Results/GenePresence_PerCluster.RDS")
file.presence
for (file in list.files("Data/MouseCortexFromLoom/SingleClusterRawData/")) {
# print(file)
<- str_split(file,pattern = "_",simplify = T)[1]
Code <- str_split(Code,pattern = "e",simplify = T)[2]
Time <- str_split(Code,pattern = "e",simplify = T)[1]
Cluster <- str_remove(Cluster,pattern = "Cl")
Cluster <- paste0("E",Time,"-",Cluster)
Cluster <- "Absent"
file.presence[,Cluster] <- readRDS(file.path("Data/MouseCortexFromLoom/SingleClusterRawData/",
dataset.cl
file))<- rowSums(dataset.cl > 0)
number.cell.expressing <- round(0.03*dim(dataset.cl)[2],digits = 0)
AbsentThreshold <- round(0.1*dim(dataset.cl)[2],digits = 0)
PresenceThreshold names(number.cell.expressing[number.cell.expressing > AbsentThreshold]),Cluster] <- "Uncertain"
file.presence[
names(number.cell.expressing[number.cell.expressing >= PresenceThreshold]),Cluster] <- "Present"
file.presence[print(Cluster)
print(table(file.presence[,Cluster]))
}
[1] "E13.5-184"
Absent Present Uncertain
5519 6027 3149
[1] "E13.5-187"
Absent Present Uncertain
4926 7151 2618
[1] "E15.0-428"
Absent Present Uncertain
6386 5253 3056
[1] "E13.5-432"
Absent Present Uncertain
5765 5975 2955
[1] "E15.0-432"
Absent Present Uncertain
5912 5843 2940
[1] "E13.5-434"
Absent Present Uncertain
6124 5487 3084
[1] "E15.0-434"
Absent Present Uncertain
6301 5396 2998
[1] "E13.5-437"
Absent Present Uncertain
5943 5783 2969
[1] "E15.0-437"
Absent Present Uncertain
6017 5713 2965
[1] "E17.5-505"
Absent Present Uncertain
6048 5662 2985
[1] "E15.0-508"
Absent Present Uncertain
5625 6240 2830
[1] "E15.0-509"
Absent Present Uncertain
5452 6454 2789
[1] "E13.5-510"
Absent Present Uncertain
4924 6958 2813
[1] "E15.0-510"
Absent Present Uncertain
5070 6952 2673
[1] "E17.5-516"
Absent Present Uncertain
5821 5847 3027
2 Clusters even
2_Clusters_even_near
True vector
<-datasets_csv[datasets_csv$Group == "2_Clusters_even_near",]
subset.datasets_csv
<- NA
ground_truth_tot
for (ind in 1:dim(subset.datasets_csv)[1]) {
<- str_split(subset.datasets_csv$Collection[ind],pattern = "_[+]_",simplify = T)
clusters
<- file.presence[,clusters]
file.presence.subset
#file.presence.subset <- as.matrix(file.presence.subset)
<- as.data.frame(matrix(nrow = nrow(file.presence.subset),
ground_truth ncol = ncol(file.presence.subset)))
rownames(ground_truth) <- rownames(file.presence.subset)
colnames(ground_truth) <- colnames(file.presence.subset)
== "Absent"] <- 0
ground_truth[file.presence.subset == "Present"] <- 1
ground_truth[file.presence.subset == "Uncertain"] <- 0.6
ground_truth[file.presence.subset
<- ground_truth
file.presence.subset
for (col in 1:ncol(ground_truth)) {
<- FALSE
ground_truth[,col] == 1 & rowMeans(file.presence.subset[,-col,drop = FALSE]) < 0.555 ,col] <- TRUE
ground_truth[file.presence.subset[,col]
}
$genes <- rownames(ground_truth)
ground_truth<- pivot_longer(ground_truth,
ground_truth cols = 1:(ncol(ground_truth)-1),
names_to = "clusters")
$data_set <- subset.datasets_csv[ind,1]
ground_truth$set_number <- ind
ground_truth<- rbind(ground_truth_tot, ground_truth)
ground_truth_tot
}<- ground_truth_tot[2:nrow(ground_truth_tot),]
ground_truth_tot
head(ground_truth_tot,20)
# A tibble: 20 × 5
genes clusters value data_set set_number
<chr> <chr> <lgl> <chr> <int>
1 Neil2 E13.5-434 FALSE 2_Clusters_even_near 1
2 Neil2 E15.0-428 FALSE 2_Clusters_even_near 1
3 Lamc1 E13.5-434 FALSE 2_Clusters_even_near 1
4 Lamc1 E15.0-428 FALSE 2_Clusters_even_near 1
5 Lama1 E13.5-434 FALSE 2_Clusters_even_near 1
6 Lama1 E15.0-428 FALSE 2_Clusters_even_near 1
7 Hs3st1 E13.5-434 FALSE 2_Clusters_even_near 1
8 Hs3st1 E15.0-428 FALSE 2_Clusters_even_near 1
9 Fabp3 E13.5-434 FALSE 2_Clusters_even_near 1
10 Fabp3 E15.0-428 FALSE 2_Clusters_even_near 1
11 Nrg2 E13.5-434 FALSE 2_Clusters_even_near 1
12 Nrg2 E15.0-428 FALSE 2_Clusters_even_near 1
13 Kdelr3 E13.5-434 FALSE 2_Clusters_even_near 1
14 Kdelr3 E15.0-428 FALSE 2_Clusters_even_near 1
15 Bend4 E13.5-434 FALSE 2_Clusters_even_near 1
16 Bend4 E15.0-428 FALSE 2_Clusters_even_near 1
17 Gjb4 E13.5-434 FALSE 2_Clusters_even_near 1
18 Gjb4 E15.0-428 FALSE 2_Clusters_even_near 1
19 Mogs E13.5-434 FALSE 2_Clusters_even_near 1
20 Mogs E15.0-428 FALSE 2_Clusters_even_near 1
length(unique(ground_truth_tot$genes))
[1] 14695
sum(ground_truth_tot$value)
[1] 24
ROC for COTAN
<- NA
onlyPositive.pVal.Cotan_tot
for (ind in 1:dim(subset.datasets_csv)[1]) {
#print(ind)
<- paste0(subset.datasets_csv$Group[ind],"_",
file.code $Collection[ind])
subset.datasets_csv<- readRDS(file = file.path(dataSetDir,paste0(file.code,".RDS")))
dataset <- getClusterizationData(dataset,clName = "mergedClusters")[[2]]
deaCOTAN <- pValueFromDEA(deaCOTAN,
pvalCOTAN numCells = getNumCells(dataset),method = "none")
<- getClusterizationData(dataset, clName = "mergedClusters")[[1]]
clusterization <- NA
cl.names
for (cl.val in unique(clusterization)) {
#print(cl.val)
<- c(cl.names,
cl.names str_split(names(clusterization[clusterization == cl.val])[1],
pattern = "_",simplify = T)[1])
<- cl.names[!is.na(cl.names)]
cl.names
}colnames(deaCOTAN) <- cl.names
colnames(pvalCOTAN) <- cl.names
<- str_split(datasets_csv$Collection[ind],pattern = "_[+]_",simplify = T)
clusters
<- pvalCOTAN
onlyPositive.pVal.Cotan
for (cl in cl.names) {
print(cl)
#temp.DEA.CotanSign <- deaCOTAN[rownames(pvalCOTAN[pvalCOTAN[,cl] < 0.05,]) ,]
rownames(deaCOTAN[deaCOTAN[,cl] < 0,]),cl] <- 1 #onlyPositive.pVal.Cotan[rownames(deaCOTAN[deaCOTAN[,cl] < 0,]),cl]+1
onlyPositive.pVal.Cotan[
}
$genes <- rownames(onlyPositive.pVal.Cotan)
onlyPositive.pVal.Cotan<- pivot_longer(onlyPositive.pVal.Cotan,
onlyPositive.pVal.Cotan values_to = "p_values",
cols = 1:(ncol(onlyPositive.pVal.Cotan)-1),
names_to = "clusters")
$data_set <- subset.datasets_csv[ind,1]
onlyPositive.pVal.Cotan$set_number <- ind
onlyPositive.pVal.Cotan<- rbind(onlyPositive.pVal.Cotan_tot, onlyPositive.pVal.Cotan)
onlyPositive.pVal.Cotan_tot
}
[1] "E13.5-434"
[1] "E15.0-428"
[1] "E15.0-432"
[1] "E13.5-432"
[1] "E15.0-509"
[1] "E15.0-508"
<- onlyPositive.pVal.Cotan_tot[2:nrow(onlyPositive.pVal.Cotan_tot),]
onlyPositive.pVal.Cotan_tot
<- merge.data.frame(onlyPositive.pVal.Cotan_tot,
onlyPositive.pVal.Cotan_tot by = c("genes","clusters","data_set","set_number"),all.x = T,all.y = F)
ground_truth_tot,
#onlyPositive.pVal.Cotan_tot <- onlyPositive.pVal.Cotan_tot[order(onlyPositive.pVal.Cotan_tot$p_values,
# decreasing = F),]
# df <- as.data.frame(matrix(nrow = nrow(onlyPositive.pVal.Cotan_tot),ncol = 3))
# colnames(df) <- c("TPR","FPR","Method")
# df$Method <- "COTAN"
#
# Positive <- sum(onlyPositive.pVal.Cotan_tot$value)
# Negative <- sum(!onlyPositive.pVal.Cotan_tot$value)
#
# for (i in 1:nrow(onlyPositive.pVal.Cotan_tot)) {
# df[i,"TPR"] <- sum(onlyPositive.pVal.Cotan_tot[1:i,"value"])/Positive
# df[i,"FPR"] <- (i-sum(onlyPositive.pVal.Cotan_tot[1:i,"value"]))/Negative
# Convert TRUE/FALSE to 1/0
$value <- as.numeric(onlyPositive.pVal.Cotan_tot$value)
onlyPositive.pVal.Cotan_tot
# Compute the ROC curve - note that we invert the p-values with 1 - p_values
<- roc(onlyPositive.pVal.Cotan_tot$value, 1 - onlyPositive.pVal.Cotan_tot$p_values)
roc_resultCOTAN
# Plot the ROC curve
#plot(roc_resultCOTAN)
ROC for Seurat
<- NA
deaSeurat_tot
for (ind in 1:dim(subset.datasets_csv)[1]) {
#print(ind)
<- paste0(subset.datasets_csv$Group[ind],"_",subset.datasets_csv$Collection[ind])
file.code
<- readRDS(file = file.path(dataSetDir,paste0(file.code,".RDS")))
dataset
<- getClusterizationData(dataset, clName = "mergedClusters")[[1]]
clusterization
<- read.csv(file.path(dirOut,paste0(file.code,"Seurat_DEA_genes.csv")), row.names = 1)
deaSeurat
<- NA
cl.names
for (cl.val in unique(clusterization)) {
#print(cl.val)
<- str_split(names(clusterization[clusterization == cl.val])[1],
cl.name pattern = "_",simplify = T)[1]
<- c(cl.names,cl.name)
cl.names <- cl.names[!is.na(cl.names)]
cl.names
$cluster == cl.val,]$cluster <- cl.name
deaSeurat[deaSeurat
}
<- str_split(datasets_csv$Collection[ind],pattern = "_[+]_",simplify = T)
clusters
colnames(deaSeurat) <- str_replace(colnames(deaSeurat),
pattern = "cluster",
replacement = "clusters")
colnames(deaSeurat) <- str_replace(colnames(deaSeurat),
pattern = "gene",
replacement = "genes")
$data_set <- subset.datasets_csv[ind,1]
deaSeurat$set_number <- ind
deaSeurat<- rbind(deaSeurat_tot, deaSeurat)
deaSeurat_tot
}<- deaSeurat_tot[2:nrow(deaSeurat_tot),]
deaSeurat_tot
<- merge.data.frame(deaSeurat_tot,
deaSeurat_tot
ground_truth_tot,by = c("genes","clusters","data_set","set_number"),
all.x = T,all.y = F)
# Convert TRUE/FALSE to 1/0
$value <- as.numeric(deaSeurat_tot$value)
deaSeurat_tot
# Compute the ROC curve - note that we invert the p-values with 1 - p_values
<- roc(deaSeurat_tot$value, 1 - deaSeurat_tot$p_val)
roc_resultSeurat
# Plot the ROC curve
#plot(roc_resultSeurat)
ROC for Seurat scTransform
<- NA
deaSeurat_tot
for (ind in 1:dim(subset.datasets_csv)[1]) {
#print(ind)
<- paste0(subset.datasets_csv$Group[ind],"_",subset.datasets_csv$Collection[ind])
file.code
<- readRDS(file = file.path(dataSetDir,paste0(file.code,".RDS")))
dataset
<- getClusterizationData(dataset, clName = "mergedClusters")[[1]]
clusterization
<- read.csv(file.path(dirOut,paste0(file.code,"Seurat_DEA_ScTransform_genes.csv")), row.names = 1)
deaSeurat
<- NA
cl.names
for (cl.val in unique(clusterization)) {
#print(cl.val)
<- str_split(names(clusterization[clusterization == cl.val])[1],
cl.name pattern = "_",simplify = T)[1]
<- c(cl.names,cl.name)
cl.names <- cl.names[!is.na(cl.names)]
cl.names
$cluster == cl.val,]$cluster <- cl.name
deaSeurat[deaSeurat
}
<- str_split(datasets_csv$Collection[ind],pattern = "_[+]_",simplify = T)
clusters
colnames(deaSeurat) <- str_replace(colnames(deaSeurat),
pattern = "cluster",
replacement = "clusters")
colnames(deaSeurat) <- str_replace(colnames(deaSeurat),
pattern = "gene",
replacement = "genes")
$data_set <- subset.datasets_csv[ind,1]
deaSeurat$set_number <- ind
deaSeurat<- rbind(deaSeurat_tot, deaSeurat)
deaSeurat_tot
}<- deaSeurat_tot[2:nrow(deaSeurat_tot),]
deaSeurat_tot
<- merge.data.frame(deaSeurat_tot,
deaSeurat_tot
ground_truth_tot,by = c("genes","clusters","data_set","set_number"),
all.x = T,all.y = F)
# Convert TRUE/FALSE to 1/0
$value <- as.numeric(deaSeurat_tot$value)
deaSeurat_tot
# Compute the ROC curve - note that we invert the p-values with 1 - p_values
<- roc(deaSeurat_tot$value, 1 - deaSeurat_tot$p_val) roc_resultSeurat_scTr
ROC for Seurat Bimod
<- NA
deaSeurat_tot
for (ind in 1:dim(subset.datasets_csv)[1]) {
#print(ind)
<- paste0(subset.datasets_csv$Group[ind],"_",subset.datasets_csv$Collection[ind])
file.code
<- readRDS(file = file.path(dataSetDir,paste0(file.code,".RDS")))
dataset
<- getClusterizationData(dataset, clName = "mergedClusters")[[1]]
clusterization
<- read.csv(file.path(dirOut,paste0(file.code,"Seurat_DEA_Bimod_genes.csv")), row.names = 1)
deaSeurat
<- NA
cl.names
for (cl.val in unique(clusterization)) {
#print(cl.val)
<- str_split(names(clusterization[clusterization == cl.val])[1],
cl.name pattern = "_",simplify = T)[1]
<- c(cl.names,cl.name)
cl.names <- cl.names[!is.na(cl.names)]
cl.names
$cluster == cl.val,]$cluster <- cl.name
deaSeurat[deaSeurat
}
<- str_split(datasets_csv$Collection[ind],pattern = "_[+]_",simplify = T)
clusters
colnames(deaSeurat) <- str_replace(colnames(deaSeurat),
pattern = "cluster",
replacement = "clusters")
colnames(deaSeurat) <- str_replace(colnames(deaSeurat),
pattern = "gene",
replacement = "genes")
$data_set <- subset.datasets_csv[ind,1]
deaSeurat$set_number <- ind
deaSeurat<- rbind(deaSeurat_tot, deaSeurat)
deaSeurat_tot
}<- deaSeurat_tot[2:nrow(deaSeurat_tot),]
deaSeurat_tot
<- merge.data.frame(deaSeurat_tot,
deaSeurat_tot
ground_truth_tot,by = c("genes","clusters","data_set","set_number"),
all.x = T,all.y = F)
# Convert TRUE/FALSE to 1/0
$value <- as.numeric(deaSeurat_tot$value)
deaSeurat_tot
# Compute the ROC curve - note that we invert the p-values with 1 - p_values
<- roc(deaSeurat_tot$value, 1 - deaSeurat_tot$p_val) roc_resultSeurat_Bimod
ROC for Monocle
<- NA
deaMonocle_tot for (ind in 1:dim(subset.datasets_csv)[1]) {
<- paste0(subset.datasets_csv$Group[ind],"_",subset.datasets_csv$Collection[ind])
file.code <- readRDS(file = file.path(dataSetDir,paste0(file.code,".RDS")))
dataset
<- getClusterizationData(dataset, clName = "mergedClusters")[[1]]
clusterization #print(file.code)
<- read.csv(file.path(dirOut,paste0(file.code,"Monocle_DEA_genes.csv")),row.names = 1)
deaMonocle
<- NA
cl.names
for (cl.val in unique(clusterization)) {
#print(cl.val)
<- str_split(names(clusterization[clusterization == cl.val])[1],
cl.name pattern = "_",simplify = T)[1]
<- c(cl.names,cl.name)
cl.names <- cl.names[!is.na(cl.names)]
cl.names
$cell_group == cl.val,"cell_group"] <- cl.name
deaMonocle[deaMonocle
}
<- str_split(datasets_csv$Collection[ind],pattern = "_[+]_",simplify = T)
clusters
colnames(deaMonocle) <- str_replace(colnames(deaMonocle),
pattern = "cell_group",
replacement = "clusters")
colnames(deaMonocle) <- str_replace(colnames(deaMonocle),
pattern = "gene_id",
replacement = "genes")
$data_set <- subset.datasets_csv[ind,1]
deaMonocle$set_number <- ind
deaMonocle<- as.data.frame(deaMonocle)
deaMonocle <- rbind(deaMonocle_tot, deaMonocle)
deaMonocle_tot
}<- deaMonocle_tot[2:nrow(deaMonocle_tot),]
deaMonocle_tot
<- merge.data.frame(deaMonocle_tot,
deaMonocle_tot
ground_truth_tot,by = c("genes","clusters","data_set","set_number"),
all.x = T,all.y = F)
# Convert TRUE/FALSE to 1/0
$value <- as.numeric(deaMonocle_tot$value)
deaMonocle_tot
# Compute the ROC curve - note that we invert the p-values with 1 - p_values
<- roc(deaMonocle_tot$value, 1 - deaMonocle_tot$marker_test_p_value)
roc_resultMonocle
# Plot the ROC curve
#plot(roc_resultMonocle)
ROC from Scanpy
<- NA
deaScanpy_tot for (ind in 1:dim(subset.datasets_csv)[1]) {
<- paste0(subset.datasets_csv$Group[ind],"_",subset.datasets_csv$Collection[ind])
file.code <- readRDS(file = file.path(dataSetDir,paste0(file.code,".RDS")))
dataset
<- getClusterizationData(dataset, clName = "mergedClusters")[[1]]
clusterization #print(file.code)
<- read.csv(file.path(dirOut,paste0(file.code,"Scanpy_DEA_genes.csv")),
deaScanpy row.names = 1)
<- NA
cl.names
for (cl.val in unique(clusterization)) {
#print(cl.val)
<- str_split(names(clusterization[clusterization == cl.val])[1],
cl.name pattern = "_",simplify = T)[1]
<- c(cl.names,cl.name)
cl.names <- cl.names[!is.na(cl.names)]
cl.names
$clusters == paste0("cl",cl.val),"clusters"] <- cl.name
deaScanpy[deaScanpy
}
<- str_split(datasets_csv$Collection[ind],pattern = "_[+]_",simplify = T)
clusters
$data_set <- subset.datasets_csv[ind,1]
deaScanpy$set_number <- ind
deaScanpy<- rbind(deaScanpy_tot, deaScanpy)
deaScanpy_tot
}<- deaScanpy_tot[2:nrow(deaScanpy_tot),]
deaScanpy_tot
<- merge.data.frame(deaScanpy_tot,
deaScanpy_tot
ground_truth_tot,by = c("genes","clusters","data_set","set_number"),
all.x = T,all.y = F)
# Convert TRUE/FALSE to 1/0
$value <- as.numeric(deaScanpy_tot$value)
deaScanpy_tot
# Compute the ROC curve - note that we invert the p-values with 1 - p_values
<- roc(deaScanpy_tot$value, 1 - deaScanpy_tot$pval)
roc_resultScanpy
# Plot the ROC curve
#plot(roc_resultScanpy)
Summary ROC for all methods
<- ggroc(list(COTAN=roc_resultCOTAN,
TwoClusters_even_near Seurat=roc_resultSeurat,
Seurat_scTr = roc_resultSeurat_scTr,
Seurat_Bimod = roc_resultSeurat_Bimod,
Monocle=roc_resultMonocle,
Scanpy=roc_resultScanpy),
aes = c("colour"),
legacy.axes = T,
linewidth = 1)+theme_light()+
scale_colour_manual("",
values = methods.color)
<- TwoClusters_even_near +
TwoClusters_even_nearPL xlab("FPR") +
ylab("TPR")
TwoClusters_even_nearPL
2_Clusters_even_medium
True vector
<-datasets_csv[datasets_csv$Group == "2_Clusters_even_medium",]
subset.datasets_csv
<- NA
ground_truth_tot
for (ind in 1:dim(subset.datasets_csv)[1]) {
<- str_split(subset.datasets_csv$Collection[ind],pattern = "_[+]_",simplify = T)
clusters
<- file.presence[,clusters]
file.presence.subset
#file.presence.subset <- as.matrix(file.presence.subset)
<- as.data.frame(matrix(nrow = nrow(file.presence.subset),
ground_truth ncol = ncol(file.presence.subset)))
rownames(ground_truth) <- rownames(file.presence.subset)
colnames(ground_truth) <- colnames(file.presence.subset)
== "Absent"] <- 0
ground_truth[file.presence.subset == "Present"] <- 1
ground_truth[file.presence.subset == "Uncertain"] <- 0.6
ground_truth[file.presence.subset
<- ground_truth
file.presence.subset
for (col in 1:ncol(ground_truth)) {
<- FALSE
ground_truth[,col] == 1 & rowMeans(file.presence.subset[,-col,drop = FALSE]) < 0.555 ,col] <- TRUE
ground_truth[file.presence.subset[,col]
}
$genes <- rownames(ground_truth)
ground_truth<- pivot_longer(ground_truth,
ground_truth cols = 1:(ncol(ground_truth)-1),
names_to = "clusters")
$data_set <- subset.datasets_csv[ind,1]
ground_truth$set_number <- ind
ground_truth<- rbind(ground_truth_tot, ground_truth)
ground_truth_tot
}<- ground_truth_tot[2:nrow(ground_truth_tot),]
ground_truth_tot
head(ground_truth_tot)
# A tibble: 6 × 5
genes clusters value data_set set_number
<chr> <chr> <lgl> <chr> <int>
1 Neil2 E13.5-187 FALSE 2_Clusters_even_medium 1
2 Neil2 E13.5-184 FALSE 2_Clusters_even_medium 1
3 Lamc1 E13.5-187 FALSE 2_Clusters_even_medium 1
4 Lamc1 E13.5-184 FALSE 2_Clusters_even_medium 1
5 Lama1 E13.5-187 FALSE 2_Clusters_even_medium 1
6 Lama1 E13.5-184 FALSE 2_Clusters_even_medium 1
length(unique(ground_truth_tot$genes))
[1] 14695
sum(ground_truth_tot$value)
[1] 771
ROC for COTAN
<- NA
onlyPositive.pVal.Cotan_tot
for (ind in 1:dim(subset.datasets_csv)[1]) {
#print(ind)
<- paste0(subset.datasets_csv$Group[ind],"_",
file.code $Collection[ind])
subset.datasets_csv<- readRDS(file = file.path(dataSetDir,paste0(file.code,".RDS")))
dataset <- getClusterizationData(dataset,clName = "mergedClusters")[[2]]
deaCOTAN <- pValueFromDEA(deaCOTAN,
pvalCOTAN numCells = getNumCells(dataset),method = "none")
<- getClusterizationData(dataset, clName = "mergedClusters")[[1]]
clusterization <- NA
cl.names
for (cl.val in unique(clusterization)) {
#print(cl.val)
<- c(cl.names,
cl.names str_split(names(clusterization[clusterization == cl.val])[1],
pattern = "_",simplify = T)[1])
<- cl.names[!is.na(cl.names)]
cl.names
}colnames(deaCOTAN) <- cl.names
colnames(pvalCOTAN) <- cl.names
<- str_split(datasets_csv$Collection[ind],pattern = "_[+]_",simplify = T)
clusters
<- pvalCOTAN
onlyPositive.pVal.Cotan
for (cl in cl.names) {
print(cl)
#temp.DEA.CotanSign <- deaCOTAN[rownames(pvalCOTAN[pvalCOTAN[,cl] < 0.05,]) ,]
rownames(deaCOTAN[deaCOTAN[,cl] < 0,]),cl] <- 1 #onlyPositive.pVal.Cotan[rownames(deaCOTAN[deaCOTAN[,cl] < 0,]),cl]+1
onlyPositive.pVal.Cotan[
}
$genes <- rownames(onlyPositive.pVal.Cotan)
onlyPositive.pVal.Cotan<- pivot_longer(onlyPositive.pVal.Cotan,
onlyPositive.pVal.Cotan values_to = "p_values",
cols = 1:(ncol(onlyPositive.pVal.Cotan)-1),
names_to = "clusters")
$data_set <- subset.datasets_csv[ind,1]
onlyPositive.pVal.Cotan$set_number <- ind
onlyPositive.pVal.Cotan<- rbind(onlyPositive.pVal.Cotan_tot, onlyPositive.pVal.Cotan)
onlyPositive.pVal.Cotan_tot
}
[1] "E13.5-187"
[1] "E13.5-184"
[1] "E17.5-516"
[1] "E15.0-434"
[1] "E15.0-508"
[1] "E15.0-437"
<- onlyPositive.pVal.Cotan_tot[2:nrow(onlyPositive.pVal.Cotan_tot),]
onlyPositive.pVal.Cotan_tot
<- merge.data.frame(onlyPositive.pVal.Cotan_tot,
onlyPositive.pVal.Cotan_tot by = c("genes","clusters","data_set","set_number"),all.x = T,all.y = F)
ground_truth_tot,
#onlyPositive.pVal.Cotan_tot <- onlyPositive.pVal.Cotan_tot[order(onlyPositive.pVal.Cotan_tot$p_values,
# decreasing = F),]
# df <- as.data.frame(matrix(nrow = nrow(onlyPositive.pVal.Cotan_tot),ncol = 3))
# colnames(df) <- c("TPR","FPR","Method")
# df$Method <- "COTAN"
#
# Positive <- sum(onlyPositive.pVal.Cotan_tot$value)
# Negative <- sum(!onlyPositive.pVal.Cotan_tot$value)
#
# for (i in 1:nrow(onlyPositive.pVal.Cotan_tot)) {
# df[i,"TPR"] <- sum(onlyPositive.pVal.Cotan_tot[1:i,"value"])/Positive
# df[i,"FPR"] <- (i-sum(onlyPositive.pVal.Cotan_tot[1:i,"value"]))/Negative
# Convert TRUE/FALSE to 1/0
$value <- as.numeric(onlyPositive.pVal.Cotan_tot$value)
onlyPositive.pVal.Cotan_tot
# Compute the ROC curve - note that we invert the p-values with 1 - p_values
<- roc(onlyPositive.pVal.Cotan_tot$value, 1 - onlyPositive.pVal.Cotan_tot$p_values)
roc_resultCOTAN
# Plot the ROC curve
#plot(roc_resultCOTAN)
ROC for Seurat
<- NA
deaSeurat_tot
for (ind in 1:dim(subset.datasets_csv)[1]) {
#print(ind)
<- paste0(subset.datasets_csv$Group[ind],"_",subset.datasets_csv$Collection[ind])
file.code
<- readRDS(file = file.path(dataSetDir,paste0(file.code,".RDS")))
dataset
<- getClusterizationData(dataset, clName = "mergedClusters")[[1]]
clusterization
<- read.csv(file.path(dirOut,paste0(file.code,"Seurat_DEA_genes.csv")), row.names = 1)
deaSeurat
<- NA
cl.names
for (cl.val in unique(clusterization)) {
#print(cl.val)
<- str_split(names(clusterization[clusterization == cl.val])[1],
cl.name pattern = "_",simplify = T)[1]
<- c(cl.names,cl.name)
cl.names <- cl.names[!is.na(cl.names)]
cl.names
$cluster == cl.val,]$cluster <- cl.name
deaSeurat[deaSeurat
}
<- str_split(datasets_csv$Collection[ind],pattern = "_[+]_",simplify = T)
clusters
colnames(deaSeurat) <- str_replace(colnames(deaSeurat),
pattern = "cluster",
replacement = "clusters")
colnames(deaSeurat) <- str_replace(colnames(deaSeurat),
pattern = "gene",
replacement = "genes")
$data_set <- subset.datasets_csv[ind,1]
deaSeurat$set_number <- ind
deaSeurat<- rbind(deaSeurat_tot, deaSeurat)
deaSeurat_tot
}<- deaSeurat_tot[2:nrow(deaSeurat_tot),]
deaSeurat_tot
<- merge.data.frame(deaSeurat_tot,
deaSeurat_tot
ground_truth_tot,by = c("genes","clusters","data_set","set_number"),
all.x = T,all.y = F)
# Convert TRUE/FALSE to 1/0
$value <- as.numeric(deaSeurat_tot$value)
deaSeurat_tot
# Compute the ROC curve - note that we invert the p-values with 1 - p_values
<- roc(deaSeurat_tot$value, 1 - deaSeurat_tot$p_val)
roc_resultSeurat
# Plot the ROC curve
#plot(roc_resultSeurat)
ROC for Seurat scTransform
<- NA
deaSeurat_tot
for (ind in 1:dim(subset.datasets_csv)[1]) {
#print(ind)
<- paste0(subset.datasets_csv$Group[ind],"_",subset.datasets_csv$Collection[ind])
file.code
<- readRDS(file = file.path(dataSetDir,paste0(file.code,".RDS")))
dataset
<- getClusterizationData(dataset, clName = "mergedClusters")[[1]]
clusterization
<- read.csv(file.path(dirOut,paste0(file.code,"Seurat_DEA_ScTransform_genes.csv")), row.names = 1)
deaSeurat
<- NA
cl.names
for (cl.val in unique(clusterization)) {
#print(cl.val)
<- str_split(names(clusterization[clusterization == cl.val])[1],
cl.name pattern = "_",simplify = T)[1]
<- c(cl.names,cl.name)
cl.names <- cl.names[!is.na(cl.names)]
cl.names
$cluster == cl.val,]$cluster <- cl.name
deaSeurat[deaSeurat
}
<- str_split(datasets_csv$Collection[ind],pattern = "_[+]_",simplify = T)
clusters
colnames(deaSeurat) <- str_replace(colnames(deaSeurat),
pattern = "cluster",
replacement = "clusters")
colnames(deaSeurat) <- str_replace(colnames(deaSeurat),
pattern = "gene",
replacement = "genes")
$data_set <- subset.datasets_csv[ind,1]
deaSeurat$set_number <- ind
deaSeurat<- rbind(deaSeurat_tot, deaSeurat)
deaSeurat_tot
}<- deaSeurat_tot[2:nrow(deaSeurat_tot),]
deaSeurat_tot
<- merge.data.frame(deaSeurat_tot,
deaSeurat_tot
ground_truth_tot,by = c("genes","clusters","data_set","set_number"),
all.x = T,all.y = F)
# Convert TRUE/FALSE to 1/0
$value <- as.numeric(deaSeurat_tot$value)
deaSeurat_tot
# Compute the ROC curve - note that we invert the p-values with 1 - p_values
<- roc(deaSeurat_tot$value, 1 - deaSeurat_tot$p_val) roc_resultSeurat_scTr
ROC for Seurat Bimod
<- NA
deaSeurat_tot
for (ind in 1:dim(subset.datasets_csv)[1]) {
#print(ind)
<- paste0(subset.datasets_csv$Group[ind],"_",subset.datasets_csv$Collection[ind])
file.code
<- readRDS(file = file.path(dataSetDir,paste0(file.code,".RDS")))
dataset
<- getClusterizationData(dataset, clName = "mergedClusters")[[1]]
clusterization
<- read.csv(file.path(dirOut,paste0(file.code,"Seurat_DEA_Bimod_genes.csv")), row.names = 1)
deaSeurat
<- NA
cl.names
for (cl.val in unique(clusterization)) {
#print(cl.val)
<- str_split(names(clusterization[clusterization == cl.val])[1],
cl.name pattern = "_",simplify = T)[1]
<- c(cl.names,cl.name)
cl.names <- cl.names[!is.na(cl.names)]
cl.names
$cluster == cl.val,]$cluster <- cl.name
deaSeurat[deaSeurat
}
<- str_split(datasets_csv$Collection[ind],pattern = "_[+]_",simplify = T)
clusters
colnames(deaSeurat) <- str_replace(colnames(deaSeurat),
pattern = "cluster",
replacement = "clusters")
colnames(deaSeurat) <- str_replace(colnames(deaSeurat),
pattern = "gene",
replacement = "genes")
$data_set <- subset.datasets_csv[ind,1]
deaSeurat$set_number <- ind
deaSeurat<- rbind(deaSeurat_tot, deaSeurat)
deaSeurat_tot
}<- deaSeurat_tot[2:nrow(deaSeurat_tot),]
deaSeurat_tot
<- merge.data.frame(deaSeurat_tot,
deaSeurat_tot
ground_truth_tot,by = c("genes","clusters","data_set","set_number"),
all.x = T,all.y = F)
# Convert TRUE/FALSE to 1/0
$value <- as.numeric(deaSeurat_tot$value)
deaSeurat_tot
# Compute the ROC curve - note that we invert the p-values with 1 - p_values
<- roc(deaSeurat_tot$value, 1 - deaSeurat_tot$p_val) roc_resultSeurat_Bimod
ROC for Monocle
<- NA
deaMonocle_tot for (ind in 1:dim(subset.datasets_csv)[1]) {
<- paste0(subset.datasets_csv$Group[ind],"_",subset.datasets_csv$Collection[ind])
file.code <- readRDS(file = file.path(dataSetDir,paste0(file.code,".RDS")))
dataset
<- getClusterizationData(dataset, clName = "mergedClusters")[[1]]
clusterization #print(file.code)
<- read.csv(file.path(dirOut,paste0(file.code,"Monocle_DEA_genes.csv")),row.names = 1)
deaMonocle
<- NA
cl.names
for (cl.val in unique(clusterization)) {
#print(cl.val)
<- str_split(names(clusterization[clusterization == cl.val])[1],
cl.name pattern = "_",simplify = T)[1]
<- c(cl.names,cl.name)
cl.names <- cl.names[!is.na(cl.names)]
cl.names
$cell_group == cl.val,"cell_group"] <- cl.name
deaMonocle[deaMonocle
}
<- str_split(datasets_csv$Collection[ind],pattern = "_[+]_",simplify = T)
clusters
colnames(deaMonocle) <- str_replace(colnames(deaMonocle),
pattern = "cell_group",
replacement = "clusters")
colnames(deaMonocle) <- str_replace(colnames(deaMonocle),
pattern = "gene_id",
replacement = "genes")
$data_set <- subset.datasets_csv[ind,1]
deaMonocle$set_number <- ind
deaMonocle<- as.data.frame(deaMonocle)
deaMonocle <- rbind(deaMonocle_tot, deaMonocle)
deaMonocle_tot
}<- deaMonocle_tot[2:nrow(deaMonocle_tot),]
deaMonocle_tot
<- merge.data.frame(deaMonocle_tot,
deaMonocle_tot
ground_truth_tot,by = c("genes","clusters","data_set","set_number"),
all.x = T,all.y = F)
# Convert TRUE/FALSE to 1/0
$value <- as.numeric(deaMonocle_tot$value)
deaMonocle_tot
# Compute the ROC curve - note that we invert the p-values with 1 - p_values
<- roc(deaMonocle_tot$value, 1 - deaMonocle_tot$marker_test_p_value)
roc_resultMonocle
# Plot the ROC curve
#plot(roc_resultMonocle)
ROC from Scanpy
<- NA
deaScanpy_tot for (ind in 1:dim(subset.datasets_csv)[1]) {
<- paste0(subset.datasets_csv$Group[ind],"_",subset.datasets_csv$Collection[ind])
file.code <- readRDS(file = file.path(dataSetDir,paste0(file.code,".RDS")))
dataset
<- getClusterizationData(dataset, clName = "mergedClusters")[[1]]
clusterization #print(file.code)
<- read.csv(file.path(dirOut,paste0(file.code,"Scanpy_DEA_genes.csv")),
deaScanpy row.names = 1)
<- NA
cl.names
for (cl.val in unique(clusterization)) {
#print(cl.val)
<- str_split(names(clusterization[clusterization == cl.val])[1],
cl.name pattern = "_",simplify = T)[1]
<- c(cl.names,cl.name)
cl.names <- cl.names[!is.na(cl.names)]
cl.names
$clusters == paste0("cl",cl.val),"clusters"] <- cl.name
deaScanpy[deaScanpy
}
<- str_split(datasets_csv$Collection[ind],pattern = "_[+]_",simplify = T)
clusters
$data_set <- subset.datasets_csv[ind,1]
deaScanpy$set_number <- ind
deaScanpy<- rbind(deaScanpy_tot, deaScanpy)
deaScanpy_tot
}<- deaScanpy_tot[2:nrow(deaScanpy_tot),]
deaScanpy_tot
<- merge.data.frame(deaScanpy_tot,
deaScanpy_tot
ground_truth_tot,by = c("genes","clusters","data_set","set_number"),
all.x = T,all.y = F)
# Convert TRUE/FALSE to 1/0
$value <- as.numeric(deaScanpy_tot$value)
deaScanpy_tot
# Compute the ROC curve - note that we invert the p-values with 1 - p_values
<- roc(deaScanpy_tot$value, 1 - deaScanpy_tot$pval)
roc_resultScanpy
# Plot the ROC curve
#plot(roc_resultScanpy)
Summary ROC for all methods
<- ggroc(list(COTAN=roc_resultCOTAN,
Two_Clusters_even_medium Seurat=roc_resultSeurat,
Seurat_scTr = roc_resultSeurat_scTr,
Seurat_Bimod = roc_resultSeurat_Bimod,
Monocle=roc_resultMonocle,
Scanpy=roc_resultScanpy),
aes = c("colour"),
legacy.axes = T,
linewidth = 1)+theme_light()+
scale_colour_manual("",
values = methods.color)
<- Two_Clusters_even_medium + xlab("FPR") + ylab("TPR")
Two_Clusters_even_mediumPL Two_Clusters_even_mediumPL
2_Clusters_even_far
True vector
<-datasets_csv[datasets_csv$Group == "2_Clusters_even_far",]
subset.datasets_csv
<- NA
ground_truth_tot
for (ind in 1:dim(subset.datasets_csv)[1]) {
<- str_split(subset.datasets_csv$Collection[ind],pattern = "_[+]_",simplify = T)
clusters
<- file.presence[,clusters]
file.presence.subset
#file.presence.subset <- as.matrix(file.presence.subset)
<- as.data.frame(matrix(nrow = nrow(file.presence.subset),
ground_truth ncol = ncol(file.presence.subset)))
rownames(ground_truth) <- rownames(file.presence.subset)
colnames(ground_truth) <- colnames(file.presence.subset)
== "Absent"] <- 0
ground_truth[file.presence.subset == "Present"] <- 1
ground_truth[file.presence.subset == "Uncertain"] <- 0.6
ground_truth[file.presence.subset
<- ground_truth
file.presence.subset
for (col in 1:ncol(ground_truth)) {
<- FALSE
ground_truth[,col] == 1 & rowMeans(file.presence.subset[,-col,drop = FALSE]) < 0.555 ,col] <- TRUE
ground_truth[file.presence.subset[,col]
}
$genes <- rownames(ground_truth)
ground_truth<- pivot_longer(ground_truth,
ground_truth cols = 1:(ncol(ground_truth)-1),
names_to = "clusters")
$data_set <- subset.datasets_csv[ind,1]
ground_truth$set_number <- ind
ground_truth<- rbind(ground_truth_tot, ground_truth)
ground_truth_tot
}<- ground_truth_tot[2:nrow(ground_truth_tot),]
ground_truth_tot
head(ground_truth_tot)
# A tibble: 6 × 5
genes clusters value data_set set_number
<chr> <chr> <lgl> <chr> <int>
1 Neil2 E17.5-516 FALSE 2_Clusters_even_far 1
2 Neil2 E13.5-187 FALSE 2_Clusters_even_far 1
3 Lamc1 E17.5-516 FALSE 2_Clusters_even_far 1
4 Lamc1 E13.5-187 FALSE 2_Clusters_even_far 1
5 Lama1 E17.5-516 FALSE 2_Clusters_even_far 1
6 Lama1 E13.5-187 TRUE 2_Clusters_even_far 1
length(unique(ground_truth_tot$genes))
[1] 14695
sum(ground_truth_tot$value)
[1] 2325
ROC for COTAN
<- NA
onlyPositive.pVal.Cotan_tot
for (ind in 1:dim(subset.datasets_csv)[1]) {
#print(ind)
<- paste0(subset.datasets_csv$Group[ind],"_",
file.code $Collection[ind])
subset.datasets_csv<- readRDS(file = file.path(dataSetDir,paste0(file.code,".RDS")))
dataset <- getClusterizationData(dataset,clName = "mergedClusters")[[2]]
deaCOTAN <- pValueFromDEA(deaCOTAN,
pvalCOTAN numCells = getNumCells(dataset),method = "none")
<- getClusterizationData(dataset, clName = "mergedClusters")[[1]]
clusterization <- NA
cl.names
for (cl.val in unique(clusterization)) {
#print(cl.val)
<- c(cl.names,
cl.names str_split(names(clusterization[clusterization == cl.val])[1],
pattern = "_",simplify = T)[1])
<- cl.names[!is.na(cl.names)]
cl.names
}colnames(deaCOTAN) <- cl.names
colnames(pvalCOTAN) <- cl.names
<- str_split(datasets_csv$Collection[ind],pattern = "_[+]_",simplify = T)
clusters
<- pvalCOTAN
onlyPositive.pVal.Cotan
for (cl in cl.names) {
print(cl)
#temp.DEA.CotanSign <- deaCOTAN[rownames(pvalCOTAN[pvalCOTAN[,cl] < 0.05,]) ,]
rownames(deaCOTAN[deaCOTAN[,cl] < 0,]),cl] <- 1 #onlyPositive.pVal.Cotan[rownames(deaCOTAN[deaCOTAN[,cl] < 0,]),cl]+1
onlyPositive.pVal.Cotan[
}
$genes <- rownames(onlyPositive.pVal.Cotan)
onlyPositive.pVal.Cotan<- pivot_longer(onlyPositive.pVal.Cotan,
onlyPositive.pVal.Cotan values_to = "p_values",
cols = 1:(ncol(onlyPositive.pVal.Cotan)-1),
names_to = "clusters")
$data_set <- subset.datasets_csv[ind,1]
onlyPositive.pVal.Cotan$set_number <- ind
onlyPositive.pVal.Cotan<- rbind(onlyPositive.pVal.Cotan_tot, onlyPositive.pVal.Cotan)
onlyPositive.pVal.Cotan_tot
}
[1] "E13.5-187"
[1] "E17.5-516"
[1] "E15.0-510"
[1] "E13.5-437"
[1] "E15.0-509"
[1] "E13.5-184"
<- onlyPositive.pVal.Cotan_tot[2:nrow(onlyPositive.pVal.Cotan_tot),]
onlyPositive.pVal.Cotan_tot
<- merge.data.frame(onlyPositive.pVal.Cotan_tot,
onlyPositive.pVal.Cotan_tot by = c("genes","clusters","data_set","set_number"),all.x = T,all.y = F)
ground_truth_tot,
#onlyPositive.pVal.Cotan_tot <- onlyPositive.pVal.Cotan_tot[order(onlyPositive.pVal.Cotan_tot$p_values,
# decreasing = F),]
# df <- as.data.frame(matrix(nrow = nrow(onlyPositive.pVal.Cotan_tot),ncol = 3))
# colnames(df) <- c("TPR","FPR","Method")
# df$Method <- "COTAN"
#
# Positive <- sum(onlyPositive.pVal.Cotan_tot$value)
# Negative <- sum(!onlyPositive.pVal.Cotan_tot$value)
#
# for (i in 1:nrow(onlyPositive.pVal.Cotan_tot)) {
# df[i,"TPR"] <- sum(onlyPositive.pVal.Cotan_tot[1:i,"value"])/Positive
# df[i,"FPR"] <- (i-sum(onlyPositive.pVal.Cotan_tot[1:i,"value"]))/Negative
# Convert TRUE/FALSE to 1/0
$value <- as.numeric(onlyPositive.pVal.Cotan_tot$value)
onlyPositive.pVal.Cotan_tot
# Compute the ROC curve - note that we invert the p-values with 1 - p_values
<- roc(onlyPositive.pVal.Cotan_tot$value, 1 - onlyPositive.pVal.Cotan_tot$p_values)
roc_resultCOTAN
# Plot the ROC curve
#plot(roc_resultCOTAN)
ROC for Seurat
<- NA
deaSeurat_tot
for (ind in 1:dim(subset.datasets_csv)[1]) {
#print(ind)
<- paste0(subset.datasets_csv$Group[ind],"_",subset.datasets_csv$Collection[ind])
file.code
<- readRDS(file = file.path(dataSetDir,paste0(file.code,".RDS")))
dataset
<- getClusterizationData(dataset, clName = "mergedClusters")[[1]]
clusterization
<- read.csv(file.path(dirOut,paste0(file.code,"Seurat_DEA_genes.csv")), row.names = 1)
deaSeurat
<- NA
cl.names
for (cl.val in unique(clusterization)) {
#print(cl.val)
<- str_split(names(clusterization[clusterization == cl.val])[1],
cl.name pattern = "_",simplify = T)[1]
<- c(cl.names,cl.name)
cl.names <- cl.names[!is.na(cl.names)]
cl.names
$cluster == cl.val,]$cluster <- cl.name
deaSeurat[deaSeurat
}
<- str_split(datasets_csv$Collection[ind],pattern = "_[+]_",simplify = T)
clusters
colnames(deaSeurat) <- str_replace(colnames(deaSeurat),
pattern = "cluster",
replacement = "clusters")
colnames(deaSeurat) <- str_replace(colnames(deaSeurat),
pattern = "gene",
replacement = "genes")
$data_set <- subset.datasets_csv[ind,1]
deaSeurat$set_number <- ind
deaSeurat<- rbind(deaSeurat_tot, deaSeurat)
deaSeurat_tot
}<- deaSeurat_tot[2:nrow(deaSeurat_tot),]
deaSeurat_tot
<- merge.data.frame(deaSeurat_tot,
deaSeurat_tot
ground_truth_tot,by = c("genes","clusters","data_set","set_number"),
all.x = T,all.y = F)
# Convert TRUE/FALSE to 1/0
$value <- as.numeric(deaSeurat_tot$value)
deaSeurat_tot
# Compute the ROC curve - note that we invert the p-values with 1 - p_values
<- roc(deaSeurat_tot$value, 1 - deaSeurat_tot$p_val)
roc_resultSeurat
# Plot the ROC curve
#plot(roc_resultSeurat)
ROC for Seurat scTransform
<- NA
deaSeurat_tot
for (ind in 1:dim(subset.datasets_csv)[1]) {
#print(ind)
<- paste0(subset.datasets_csv$Group[ind],"_",subset.datasets_csv$Collection[ind])
file.code
<- readRDS(file = file.path(dataSetDir,paste0(file.code,".RDS")))
dataset
<- getClusterizationData(dataset, clName = "mergedClusters")[[1]]
clusterization
<- read.csv(file.path(dirOut,paste0(file.code,"Seurat_DEA_ScTransform_genes.csv")), row.names = 1)
deaSeurat
<- NA
cl.names
for (cl.val in unique(clusterization)) {
#print(cl.val)
<- str_split(names(clusterization[clusterization == cl.val])[1],
cl.name pattern = "_",simplify = T)[1]
<- c(cl.names,cl.name)
cl.names <- cl.names[!is.na(cl.names)]
cl.names
$cluster == cl.val,]$cluster <- cl.name
deaSeurat[deaSeurat
}
<- str_split(datasets_csv$Collection[ind],pattern = "_[+]_",simplify = T)
clusters
colnames(deaSeurat) <- str_replace(colnames(deaSeurat),
pattern = "cluster",
replacement = "clusters")
colnames(deaSeurat) <- str_replace(colnames(deaSeurat),
pattern = "gene",
replacement = "genes")
$data_set <- subset.datasets_csv[ind,1]
deaSeurat$set_number <- ind
deaSeurat<- rbind(deaSeurat_tot, deaSeurat)
deaSeurat_tot
}<- deaSeurat_tot[2:nrow(deaSeurat_tot),]
deaSeurat_tot
<- merge.data.frame(deaSeurat_tot,
deaSeurat_tot
ground_truth_tot,by = c("genes","clusters","data_set","set_number"),
all.x = T,all.y = F)
# Convert TRUE/FALSE to 1/0
$value <- as.numeric(deaSeurat_tot$value)
deaSeurat_tot
# Compute the ROC curve - note that we invert the p-values with 1 - p_values
<- roc(deaSeurat_tot$value, 1 - deaSeurat_tot$p_val) roc_resultSeurat_scTr
ROC for Seurat Bimod
<- NA
deaSeurat_tot
for (ind in 1:dim(subset.datasets_csv)[1]) {
#print(ind)
<- paste0(subset.datasets_csv$Group[ind],"_",subset.datasets_csv$Collection[ind])
file.code
<- readRDS(file = file.path(dataSetDir,paste0(file.code,".RDS")))
dataset
<- getClusterizationData(dataset, clName = "mergedClusters")[[1]]
clusterization
<- read.csv(file.path(dirOut,paste0(file.code,"Seurat_DEA_Bimod_genes.csv")), row.names = 1)
deaSeurat
<- NA
cl.names
for (cl.val in unique(clusterization)) {
#print(cl.val)
<- str_split(names(clusterization[clusterization == cl.val])[1],
cl.name pattern = "_",simplify = T)[1]
<- c(cl.names,cl.name)
cl.names <- cl.names[!is.na(cl.names)]
cl.names
$cluster == cl.val,]$cluster <- cl.name
deaSeurat[deaSeurat
}
<- str_split(datasets_csv$Collection[ind],pattern = "_[+]_",simplify = T)
clusters
colnames(deaSeurat) <- str_replace(colnames(deaSeurat),
pattern = "cluster",
replacement = "clusters")
colnames(deaSeurat) <- str_replace(colnames(deaSeurat),
pattern = "gene",
replacement = "genes")
$data_set <- subset.datasets_csv[ind,1]
deaSeurat$set_number <- ind
deaSeurat<- rbind(deaSeurat_tot, deaSeurat)
deaSeurat_tot
}<- deaSeurat_tot[2:nrow(deaSeurat_tot),]
deaSeurat_tot
<- merge.data.frame(deaSeurat_tot,
deaSeurat_tot
ground_truth_tot,by = c("genes","clusters","data_set","set_number"),
all.x = T,all.y = F)
# Convert TRUE/FALSE to 1/0
$value <- as.numeric(deaSeurat_tot$value)
deaSeurat_tot
# Compute the ROC curve - note that we invert the p-values with 1 - p_values
<- roc(deaSeurat_tot$value, 1 - deaSeurat_tot$p_val) roc_resultSeurat_Bimod
ROC for Monocle
<- NA
deaMonocle_tot for (ind in 1:dim(subset.datasets_csv)[1]) {
<- paste0(subset.datasets_csv$Group[ind],"_",subset.datasets_csv$Collection[ind])
file.code <- readRDS(file = file.path(dataSetDir,paste0(file.code,".RDS")))
dataset
<- getClusterizationData(dataset, clName = "mergedClusters")[[1]]
clusterization #print(file.code)
<- read.csv(file.path(dirOut,paste0(file.code,"Monocle_DEA_genes.csv")),row.names = 1)
deaMonocle
<- NA
cl.names
for (cl.val in unique(clusterization)) {
#print(cl.val)
<- str_split(names(clusterization[clusterization == cl.val])[1],
cl.name pattern = "_",simplify = T)[1]
<- c(cl.names,cl.name)
cl.names <- cl.names[!is.na(cl.names)]
cl.names
$cell_group == cl.val,"cell_group"] <- cl.name
deaMonocle[deaMonocle
}
<- str_split(datasets_csv$Collection[ind],pattern = "_[+]_",simplify = T)
clusters
colnames(deaMonocle) <- str_replace(colnames(deaMonocle),
pattern = "cell_group",
replacement = "clusters")
colnames(deaMonocle) <- str_replace(colnames(deaMonocle),
pattern = "gene_id",
replacement = "genes")
$data_set <- subset.datasets_csv[ind,1]
deaMonocle$set_number <- ind
deaMonocle<- as.data.frame(deaMonocle)
deaMonocle <- rbind(deaMonocle_tot, deaMonocle)
deaMonocle_tot
}<- deaMonocle_tot[2:nrow(deaMonocle_tot),]
deaMonocle_tot
<- merge.data.frame(deaMonocle_tot,
deaMonocle_tot
ground_truth_tot,by = c("genes","clusters","data_set","set_number"),
all.x = T,all.y = F)
# Convert TRUE/FALSE to 1/0
$value <- as.numeric(deaMonocle_tot$value)
deaMonocle_tot
# Compute the ROC curve - note that we invert the p-values with 1 - p_values
<- roc(deaMonocle_tot$value, 1 - deaMonocle_tot$marker_test_p_value)
roc_resultMonocle
# Plot the ROC curve
#plot(roc_resultMonocle)
ROC from Scanpy
<- NA
deaScanpy_tot for (ind in 1:dim(subset.datasets_csv)[1]) {
<- paste0(subset.datasets_csv$Group[ind],"_",subset.datasets_csv$Collection[ind])
file.code <- readRDS(file = file.path(dataSetDir,paste0(file.code,".RDS")))
dataset
<- getClusterizationData(dataset, clName = "mergedClusters")[[1]]
clusterization #print(file.code)
<- read.csv(file.path(dirOut,paste0(file.code,"Scanpy_DEA_genes.csv")),
deaScanpy row.names = 1)
<- NA
cl.names
for (cl.val in unique(clusterization)) {
#print(cl.val)
<- str_split(names(clusterization[clusterization == cl.val])[1],
cl.name pattern = "_",simplify = T)[1]
<- c(cl.names,cl.name)
cl.names <- cl.names[!is.na(cl.names)]
cl.names
$clusters == paste0("cl",cl.val),"clusters"] <- cl.name
deaScanpy[deaScanpy
}
<- str_split(datasets_csv$Collection[ind],pattern = "_[+]_",simplify = T)
clusters
$data_set <- subset.datasets_csv[ind,1]
deaScanpy$set_number <- ind
deaScanpy<- rbind(deaScanpy_tot, deaScanpy)
deaScanpy_tot
}<- deaScanpy_tot[2:nrow(deaScanpy_tot),]
deaScanpy_tot
<- merge.data.frame(deaScanpy_tot,
deaScanpy_tot
ground_truth_tot,by = c("genes","clusters","data_set","set_number"),
all.x = T,all.y = F)
# Convert TRUE/FALSE to 1/0
$value <- as.numeric(deaScanpy_tot$value)
deaScanpy_tot
# Compute the ROC curve - note that we invert the p-values with 1 - p_values
<- roc(deaScanpy_tot$value, 1 - deaScanpy_tot$pval)
roc_resultScanpy
# Plot the ROC curve
#plot(roc_resultScanpy)
Summary ROC for all methods
<- ggroc(list(COTAN=roc_resultCOTAN,
Two_Clusters_even_far Seurat=roc_resultSeurat,
Seurat_scTr = roc_resultSeurat_scTr,
Seurat_Bimod = roc_resultSeurat_Bimod,
Monocle=roc_resultMonocle,
Scanpy=roc_resultScanpy),
aes = c("colour"),
legacy.axes = T,
linewidth = 1)+theme_light()+
scale_colour_manual("",
values = methods.color)
<- Two_Clusters_even_far + xlab("FPR") + ylab("TPR")
Two_Clusters_even_farPL Two_Clusters_even_farPL
2 clusters uneven
2_Clusters_uneven_near
True vector
<-datasets_csv[datasets_csv$Group == "2_Clusters_uneven_near",]
subset.datasets_csv
<- NA
ground_truth_tot
for (ind in 1:dim(subset.datasets_csv)[1]) {
<- str_split(subset.datasets_csv$Collection[ind],pattern = "_[+]_",simplify = T)
clusters
<- file.presence[,clusters]
file.presence.subset
#file.presence.subset <- as.matrix(file.presence.subset)
<- as.data.frame(matrix(nrow = nrow(file.presence.subset),
ground_truth ncol = ncol(file.presence.subset)))
rownames(ground_truth) <- rownames(file.presence.subset)
colnames(ground_truth) <- colnames(file.presence.subset)
== "Absent"] <- 0
ground_truth[file.presence.subset == "Present"] <- 1
ground_truth[file.presence.subset == "Uncertain"] <- 0.6
ground_truth[file.presence.subset
<- ground_truth
file.presence.subset
for (col in 1:ncol(ground_truth)) {
<- FALSE
ground_truth[,col] == 1 & rowMeans(file.presence.subset[,-col,drop = FALSE]) < 0.555 ,col] <- TRUE
ground_truth[file.presence.subset[,col]
}
$genes <- rownames(ground_truth)
ground_truth<- pivot_longer(ground_truth,
ground_truth cols = 1:(ncol(ground_truth)-1),
names_to = "clusters")
$data_set <- subset.datasets_csv[ind,1]
ground_truth$set_number <- ind
ground_truth<- rbind(ground_truth_tot, ground_truth)
ground_truth_tot
}<- ground_truth_tot[2:nrow(ground_truth_tot),]
ground_truth_tot
head(ground_truth_tot)
# A tibble: 6 × 5
genes clusters value data_set set_number
<chr> <chr> <lgl> <chr> <int>
1 Neil2 E13.5-434 FALSE 2_Clusters_uneven_near 1
2 Neil2 E15.0-428 FALSE 2_Clusters_uneven_near 1
3 Lamc1 E13.5-434 FALSE 2_Clusters_uneven_near 1
4 Lamc1 E15.0-428 FALSE 2_Clusters_uneven_near 1
5 Lama1 E13.5-434 FALSE 2_Clusters_uneven_near 1
6 Lama1 E15.0-428 FALSE 2_Clusters_uneven_near 1
length(unique(ground_truth_tot$genes))
[1] 14695
sum(ground_truth_tot$value)
[1] 24
ROC for COTAN
<- NA
onlyPositive.pVal.Cotan_tot
for (ind in 1:dim(subset.datasets_csv)[1]) {
#print(ind)
<- paste0(subset.datasets_csv$Group[ind],"_",
file.code $Collection[ind])
subset.datasets_csv<- readRDS(file = file.path(dataSetDir,paste0(file.code,".RDS")))
dataset <- getClusterizationData(dataset,clName = "mergedClusters")[[2]]
deaCOTAN <- pValueFromDEA(deaCOTAN,
pvalCOTAN numCells = getNumCells(dataset),method = "none")
<- getClusterizationData(dataset, clName = "mergedClusters")[[1]]
clusterization <- NA
cl.names
for (cl.val in unique(clusterization)) {
#print(cl.val)
<- c(cl.names,
cl.names str_split(names(clusterization[clusterization == cl.val])[1],
pattern = "_",simplify = T)[1])
<- cl.names[!is.na(cl.names)]
cl.names
}colnames(deaCOTAN) <- cl.names
colnames(pvalCOTAN) <- cl.names
<- str_split(datasets_csv$Collection[ind],pattern = "_[+]_",simplify = T)
clusters
<- pvalCOTAN
onlyPositive.pVal.Cotan
for (cl in cl.names) {
print(cl)
#temp.DEA.CotanSign <- deaCOTAN[rownames(pvalCOTAN[pvalCOTAN[,cl] < 0.05,]) ,]
rownames(deaCOTAN[deaCOTAN[,cl] < 0,]),cl] <- 1 #onlyPositive.pVal.Cotan[rownames(deaCOTAN[deaCOTAN[,cl] < 0,]),cl]+1
onlyPositive.pVal.Cotan[
}
$genes <- rownames(onlyPositive.pVal.Cotan)
onlyPositive.pVal.Cotan<- pivot_longer(onlyPositive.pVal.Cotan,
onlyPositive.pVal.Cotan values_to = "p_values",
cols = 1:(ncol(onlyPositive.pVal.Cotan)-1),
names_to = "clusters")
$data_set <- subset.datasets_csv[ind,1]
onlyPositive.pVal.Cotan$set_number <- ind
onlyPositive.pVal.Cotan<- rbind(onlyPositive.pVal.Cotan_tot, onlyPositive.pVal.Cotan)
onlyPositive.pVal.Cotan_tot
}
[1] "E13.5-434"
[1] "E15.0-428"
[1] "E15.0-432"
[1] "E13.5-432"
[1] "E15.0-509"
[1] "E15.0-508"
<- onlyPositive.pVal.Cotan_tot[2:nrow(onlyPositive.pVal.Cotan_tot),]
onlyPositive.pVal.Cotan_tot
<- merge.data.frame(onlyPositive.pVal.Cotan_tot,
onlyPositive.pVal.Cotan_tot by = c("genes","clusters","data_set","set_number"),all.x = T,all.y = F)
ground_truth_tot,
#onlyPositive.pVal.Cotan_tot <- onlyPositive.pVal.Cotan_tot[order(onlyPositive.pVal.Cotan_tot$p_values,
# decreasing = F),]
# df <- as.data.frame(matrix(nrow = nrow(onlyPositive.pVal.Cotan_tot),ncol = 3))
# colnames(df) <- c("TPR","FPR","Method")
# df$Method <- "COTAN"
#
# Positive <- sum(onlyPositive.pVal.Cotan_tot$value)
# Negative <- sum(!onlyPositive.pVal.Cotan_tot$value)
#
# for (i in 1:nrow(onlyPositive.pVal.Cotan_tot)) {
# df[i,"TPR"] <- sum(onlyPositive.pVal.Cotan_tot[1:i,"value"])/Positive
# df[i,"FPR"] <- (i-sum(onlyPositive.pVal.Cotan_tot[1:i,"value"]))/Negative
# Convert TRUE/FALSE to 1/0
$value <- as.numeric(onlyPositive.pVal.Cotan_tot$value)
onlyPositive.pVal.Cotan_tot
# Compute the ROC curve - note that we invert the p-values with 1 - p_values
<- roc(onlyPositive.pVal.Cotan_tot$value, 1 - onlyPositive.pVal.Cotan_tot$p_values)
roc_resultCOTAN
# Plot the ROC curve
#plot(roc_resultCOTAN)
ROC for Seurat
<- NA
deaSeurat_tot
for (ind in 1:dim(subset.datasets_csv)[1]) {
#print(ind)
<- paste0(subset.datasets_csv$Group[ind],"_",subset.datasets_csv$Collection[ind])
file.code
<- readRDS(file = file.path(dataSetDir,paste0(file.code,".RDS")))
dataset
<- getClusterizationData(dataset, clName = "mergedClusters")[[1]]
clusterization
<- read.csv(file.path(dirOut,paste0(file.code,"Seurat_DEA_genes.csv")), row.names = 1)
deaSeurat
<- NA
cl.names
for (cl.val in unique(clusterization)) {
#print(cl.val)
<- str_split(names(clusterization[clusterization == cl.val])[1],
cl.name pattern = "_",simplify = T)[1]
<- c(cl.names,cl.name)
cl.names <- cl.names[!is.na(cl.names)]
cl.names
$cluster == cl.val,]$cluster <- cl.name
deaSeurat[deaSeurat
}
<- str_split(datasets_csv$Collection[ind],pattern = "_[+]_",simplify = T)
clusters
colnames(deaSeurat) <- str_replace(colnames(deaSeurat),
pattern = "cluster",
replacement = "clusters")
colnames(deaSeurat) <- str_replace(colnames(deaSeurat),
pattern = "gene",
replacement = "genes")
$data_set <- subset.datasets_csv[ind,1]
deaSeurat$set_number <- ind
deaSeurat<- rbind(deaSeurat_tot, deaSeurat)
deaSeurat_tot
}<- deaSeurat_tot[2:nrow(deaSeurat_tot),]
deaSeurat_tot
<- merge.data.frame(deaSeurat_tot,
deaSeurat_tot
ground_truth_tot,by = c("genes","clusters","data_set","set_number"),
all.x = T,all.y = F)
# Convert TRUE/FALSE to 1/0
$value <- as.numeric(deaSeurat_tot$value)
deaSeurat_tot
# Compute the ROC curve - note that we invert the p-values with 1 - p_values
<- roc(deaSeurat_tot$value, 1 - deaSeurat_tot$p_val)
roc_resultSeurat
# Plot the ROC curve
plot(roc_resultSeurat)
ROC for Seurat scTransform
<- NA
deaSeurat_tot
for (ind in 1:dim(subset.datasets_csv)[1]) {
#print(ind)
<- paste0(subset.datasets_csv$Group[ind],"_",subset.datasets_csv$Collection[ind])
file.code
<- readRDS(file = file.path(dataSetDir,paste0(file.code,".RDS")))
dataset
<- getClusterizationData(dataset, clName = "mergedClusters")[[1]]
clusterization
<- read.csv(file.path(dirOut,paste0(file.code,"Seurat_DEA_ScTransform_genes.csv")), row.names = 1)
deaSeurat
<- NA
cl.names
for (cl.val in unique(clusterization)) {
#print(cl.val)
<- str_split(names(clusterization[clusterization == cl.val])[1],
cl.name pattern = "_",simplify = T)[1]
<- c(cl.names,cl.name)
cl.names <- cl.names[!is.na(cl.names)]
cl.names
$cluster == cl.val,]$cluster <- cl.name
deaSeurat[deaSeurat
}
<- str_split(datasets_csv$Collection[ind],pattern = "_[+]_",simplify = T)
clusters
colnames(deaSeurat) <- str_replace(colnames(deaSeurat),
pattern = "cluster",
replacement = "clusters")
colnames(deaSeurat) <- str_replace(colnames(deaSeurat),
pattern = "gene",
replacement = "genes")
$data_set <- subset.datasets_csv[ind,1]
deaSeurat$set_number <- ind
deaSeurat<- rbind(deaSeurat_tot, deaSeurat)
deaSeurat_tot
}<- deaSeurat_tot[2:nrow(deaSeurat_tot),]
deaSeurat_tot
<- merge.data.frame(deaSeurat_tot,
deaSeurat_tot
ground_truth_tot,by = c("genes","clusters","data_set","set_number"),
all.x = T,all.y = F)
# Convert TRUE/FALSE to 1/0
$value <- as.numeric(deaSeurat_tot$value)
deaSeurat_tot
# Compute the ROC curve - note that we invert the p-values with 1 - p_values
<- roc(deaSeurat_tot$value, 1 - deaSeurat_tot$p_val) roc_resultSeurat_scTr
ROC for Seurat Bimod
<- NA
deaSeurat_tot
for (ind in 1:dim(subset.datasets_csv)[1]) {
#print(ind)
<- paste0(subset.datasets_csv$Group[ind],"_",subset.datasets_csv$Collection[ind])
file.code
<- readRDS(file = file.path(dataSetDir,paste0(file.code,".RDS")))
dataset
<- getClusterizationData(dataset, clName = "mergedClusters")[[1]]
clusterization
<- read.csv(file.path(dirOut,paste0(file.code,"Seurat_DEA_Bimod_genes.csv")), row.names = 1)
deaSeurat
<- NA
cl.names
for (cl.val in unique(clusterization)) {
#print(cl.val)
<- str_split(names(clusterization[clusterization == cl.val])[1],
cl.name pattern = "_",simplify = T)[1]
<- c(cl.names,cl.name)
cl.names <- cl.names[!is.na(cl.names)]
cl.names
$cluster == cl.val,]$cluster <- cl.name
deaSeurat[deaSeurat
}
<- str_split(datasets_csv$Collection[ind],pattern = "_[+]_",simplify = T)
clusters
colnames(deaSeurat) <- str_replace(colnames(deaSeurat),
pattern = "cluster",
replacement = "clusters")
colnames(deaSeurat) <- str_replace(colnames(deaSeurat),
pattern = "gene",
replacement = "genes")
$data_set <- subset.datasets_csv[ind,1]
deaSeurat$set_number <- ind
deaSeurat<- rbind(deaSeurat_tot, deaSeurat)
deaSeurat_tot
}<- deaSeurat_tot[2:nrow(deaSeurat_tot),]
deaSeurat_tot
<- merge.data.frame(deaSeurat_tot,
deaSeurat_tot
ground_truth_tot,by = c("genes","clusters","data_set","set_number"),
all.x = T,all.y = F)
# Convert TRUE/FALSE to 1/0
$value <- as.numeric(deaSeurat_tot$value)
deaSeurat_tot
# Compute the ROC curve - note that we invert the p-values with 1 - p_values
<- roc(deaSeurat_tot$value, 1 - deaSeurat_tot$p_val) roc_resultSeurat_Bimod
ROC for Monocle
<- NA
deaMonocle_tot for (ind in 1:dim(subset.datasets_csv)[1]) {
<- paste0(subset.datasets_csv$Group[ind],"_",subset.datasets_csv$Collection[ind])
file.code <- readRDS(file = file.path(dataSetDir,paste0(file.code,".RDS")))
dataset
<- getClusterizationData(dataset, clName = "mergedClusters")[[1]]
clusterization #print(file.code)
<- read.csv(file.path(dirOut,paste0(file.code,"Monocle_DEA_genes.csv")),row.names = 1)
deaMonocle
<- NA
cl.names
for (cl.val in unique(clusterization)) {
#print(cl.val)
<- str_split(names(clusterization[clusterization == cl.val])[1],
cl.name pattern = "_",simplify = T)[1]
<- c(cl.names,cl.name)
cl.names <- cl.names[!is.na(cl.names)]
cl.names
$cell_group == cl.val,"cell_group"] <- cl.name
deaMonocle[deaMonocle
}
<- str_split(datasets_csv$Collection[ind],pattern = "_[+]_",simplify = T)
clusters
colnames(deaMonocle) <- str_replace(colnames(deaMonocle),
pattern = "cell_group",
replacement = "clusters")
colnames(deaMonocle) <- str_replace(colnames(deaMonocle),
pattern = "gene_id",
replacement = "genes")
$data_set <- subset.datasets_csv[ind,1]
deaMonocle$set_number <- ind
deaMonocle<- as.data.frame(deaMonocle)
deaMonocle <- rbind(deaMonocle_tot, deaMonocle)
deaMonocle_tot
}<- deaMonocle_tot[2:nrow(deaMonocle_tot),]
deaMonocle_tot
<- merge.data.frame(deaMonocle_tot,
deaMonocle_tot
ground_truth_tot,by = c("genes","clusters","data_set","set_number"),
all.x = T,all.y = F)
# Convert TRUE/FALSE to 1/0
$value <- as.numeric(deaMonocle_tot$value)
deaMonocle_tot
# Compute the ROC curve - note that we invert the p-values with 1 - p_values
<- roc(deaMonocle_tot$value, 1 - deaMonocle_tot$marker_test_p_value)
roc_resultMonocle
# Plot the ROC curve
#plot(roc_resultMonocle)
ROC from Scanpy
<- NA
deaScanpy_tot for (ind in 1:dim(subset.datasets_csv)[1]) {
<- paste0(subset.datasets_csv$Group[ind],"_",subset.datasets_csv$Collection[ind])
file.code <- readRDS(file = file.path(dataSetDir,paste0(file.code,".RDS")))
dataset
<- getClusterizationData(dataset, clName = "mergedClusters")[[1]]
clusterization #print(file.code)
<- read.csv(file.path(dirOut,paste0(file.code,"Scanpy_DEA_genes.csv")),
deaScanpy row.names = 1)
<- NA
cl.names
for (cl.val in unique(clusterization)) {
#print(cl.val)
<- str_split(names(clusterization[clusterization == cl.val])[1],
cl.name pattern = "_",simplify = T)[1]
<- c(cl.names,cl.name)
cl.names <- cl.names[!is.na(cl.names)]
cl.names
$clusters == paste0("cl",cl.val),"clusters"] <- cl.name
deaScanpy[deaScanpy
}
<- str_split(datasets_csv$Collection[ind],pattern = "_[+]_",simplify = T)
clusters
$data_set <- subset.datasets_csv[ind,1]
deaScanpy$set_number <- ind
deaScanpy<- rbind(deaScanpy_tot, deaScanpy)
deaScanpy_tot
}<- deaScanpy_tot[2:nrow(deaScanpy_tot),]
deaScanpy_tot
<- merge.data.frame(deaScanpy_tot,
deaScanpy_tot
ground_truth_tot,by = c("genes","clusters","data_set","set_number"),
all.x = T,all.y = F)
# Convert TRUE/FALSE to 1/0
$value <- as.numeric(deaScanpy_tot$value)
deaScanpy_tot
# Compute the ROC curve - note that we invert the p-values with 1 - p_values
<- roc(deaScanpy_tot$value, 1 - deaScanpy_tot$pval)
roc_resultScanpy
# Plot the ROC curve
#plot(roc_resultScanpy)
Summary ROC for all methods
<- ggroc(list(COTAN=roc_resultCOTAN,
Two_Clusters_uneven_near Seurat=roc_resultSeurat,
Seurat_scTr = roc_resultSeurat_scTr,
Seurat_Bimod = roc_resultSeurat_Bimod,
Monocle=roc_resultMonocle,
Scanpy=roc_resultScanpy),
aes = c("colour"),
legacy.axes = T,
linewidth = 1)+theme_light()+
scale_colour_manual("",
values = methods.color)
<- Two_Clusters_uneven_near + xlab("FPR") + ylab("TPR")
Two_Clusters_uneven_nearPL Two_Clusters_uneven_nearPL
2_Clusters_uneven_medium
True vector
<-datasets_csv[datasets_csv$Group == "2_Clusters_uneven_medium",]
subset.datasets_csv
<- NA
ground_truth_tot
for (ind in 1:dim(subset.datasets_csv)[1]) {
<- str_split(subset.datasets_csv$Collection[ind],pattern = "_[+]_",simplify = T)
clusters
<- file.presence[,clusters]
file.presence.subset
#file.presence.subset <- as.matrix(file.presence.subset)
<- as.data.frame(matrix(nrow = nrow(file.presence.subset),
ground_truth ncol = ncol(file.presence.subset)))
rownames(ground_truth) <- rownames(file.presence.subset)
colnames(ground_truth) <- colnames(file.presence.subset)
== "Absent"] <- 0
ground_truth[file.presence.subset == "Present"] <- 1
ground_truth[file.presence.subset == "Uncertain"] <- 0.6
ground_truth[file.presence.subset
<- ground_truth
file.presence.subset
for (col in 1:ncol(ground_truth)) {
<- FALSE
ground_truth[,col] == 1 & rowMeans(file.presence.subset[,-col,drop = FALSE]) < 0.555 ,col] <- TRUE
ground_truth[file.presence.subset[,col]
}
$genes <- rownames(ground_truth)
ground_truth<- pivot_longer(ground_truth,
ground_truth cols = 1:(ncol(ground_truth)-1),
names_to = "clusters")
$data_set <- subset.datasets_csv[ind,1]
ground_truth$set_number <- ind
ground_truth<- rbind(ground_truth_tot, ground_truth)
ground_truth_tot
}<- ground_truth_tot[2:nrow(ground_truth_tot),]
ground_truth_tot
head(ground_truth_tot)
# A tibble: 6 × 5
genes clusters value data_set set_number
<chr> <chr> <lgl> <chr> <int>
1 Neil2 E13.5-187 FALSE 2_Clusters_uneven_medium 1
2 Neil2 E13.5-184 FALSE 2_Clusters_uneven_medium 1
3 Lamc1 E13.5-187 FALSE 2_Clusters_uneven_medium 1
4 Lamc1 E13.5-184 FALSE 2_Clusters_uneven_medium 1
5 Lama1 E13.5-187 FALSE 2_Clusters_uneven_medium 1
6 Lama1 E13.5-184 FALSE 2_Clusters_uneven_medium 1
length(unique(ground_truth_tot$genes))
[1] 14695
sum(ground_truth_tot$value)
[1] 771
ROC for COTAN
<- NA
onlyPositive.pVal.Cotan_tot
for (ind in 1:dim(subset.datasets_csv)[1]) {
#print(ind)
<- paste0(subset.datasets_csv$Group[ind],"_",
file.code $Collection[ind])
subset.datasets_csv<- readRDS(file = file.path(dataSetDir,paste0(file.code,".RDS")))
dataset <- getClusterizationData(dataset,clName = "mergedClusters")[[2]]
deaCOTAN <- pValueFromDEA(deaCOTAN,
pvalCOTAN numCells = getNumCells(dataset),method = "none")
<- getClusterizationData(dataset, clName = "mergedClusters")[[1]]
clusterization <- NA
cl.names
for (cl.val in unique(clusterization)) {
#print(cl.val)
<- c(cl.names,
cl.names str_split(names(clusterization[clusterization == cl.val])[1],
pattern = "_",simplify = T)[1])
<- cl.names[!is.na(cl.names)]
cl.names
}colnames(deaCOTAN) <- cl.names
colnames(pvalCOTAN) <- cl.names
<- str_split(datasets_csv$Collection[ind],pattern = "_[+]_",simplify = T)
clusters
<- pvalCOTAN
onlyPositive.pVal.Cotan
for (cl in cl.names) {
print(cl)
#temp.DEA.CotanSign <- deaCOTAN[rownames(pvalCOTAN[pvalCOTAN[,cl] < 0.05,]) ,]
rownames(deaCOTAN[deaCOTAN[,cl] < 0,]),cl] <- 1 #onlyPositive.pVal.Cotan[rownames(deaCOTAN[deaCOTAN[,cl] < 0,]),cl]+1
onlyPositive.pVal.Cotan[
}
$genes <- rownames(onlyPositive.pVal.Cotan)
onlyPositive.pVal.Cotan<- pivot_longer(onlyPositive.pVal.Cotan,
onlyPositive.pVal.Cotan values_to = "p_values",
cols = 1:(ncol(onlyPositive.pVal.Cotan)-1),
names_to = "clusters")
$data_set <- subset.datasets_csv[ind,1]
onlyPositive.pVal.Cotan$set_number <- ind
onlyPositive.pVal.Cotan<- rbind(onlyPositive.pVal.Cotan_tot, onlyPositive.pVal.Cotan)
onlyPositive.pVal.Cotan_tot
}
[1] "E13.5-187"
[1] "E13.5-184"
[1] "E17.5-516"
[1] "E15.0-434"
[1] "E15.0-508"
[1] "E15.0-437"
<- onlyPositive.pVal.Cotan_tot[2:nrow(onlyPositive.pVal.Cotan_tot),]
onlyPositive.pVal.Cotan_tot
<- merge.data.frame(onlyPositive.pVal.Cotan_tot,
onlyPositive.pVal.Cotan_tot by = c("genes","clusters","data_set","set_number"),all.x = T,all.y = F)
ground_truth_tot,
#onlyPositive.pVal.Cotan_tot <- onlyPositive.pVal.Cotan_tot[order(onlyPositive.pVal.Cotan_tot$p_values,
# decreasing = F),]
# df <- as.data.frame(matrix(nrow = nrow(onlyPositive.pVal.Cotan_tot),ncol = 3))
# colnames(df) <- c("TPR","FPR","Method")
# df$Method <- "COTAN"
#
# Positive <- sum(onlyPositive.pVal.Cotan_tot$value)
# Negative <- sum(!onlyPositive.pVal.Cotan_tot$value)
#
# for (i in 1:nrow(onlyPositive.pVal.Cotan_tot)) {
# df[i,"TPR"] <- sum(onlyPositive.pVal.Cotan_tot[1:i,"value"])/Positive
# df[i,"FPR"] <- (i-sum(onlyPositive.pVal.Cotan_tot[1:i,"value"]))/Negative
# Convert TRUE/FALSE to 1/0
$value <- as.numeric(onlyPositive.pVal.Cotan_tot$value)
onlyPositive.pVal.Cotan_tot
# Compute the ROC curve - note that we invert the p-values with 1 - p_values
<- roc(onlyPositive.pVal.Cotan_tot$value, 1 - onlyPositive.pVal.Cotan_tot$p_values)
roc_resultCOTAN
# Plot the ROC curve
#plot(roc_resultCOTAN)
ROC for Seurat
<- NA
deaSeurat_tot
for (ind in 1:dim(subset.datasets_csv)[1]) {
#print(ind)
<- paste0(subset.datasets_csv$Group[ind],"_",subset.datasets_csv$Collection[ind])
file.code
<- readRDS(file = file.path(dataSetDir,paste0(file.code,".RDS")))
dataset
<- getClusterizationData(dataset, clName = "mergedClusters")[[1]]
clusterization
<- read.csv(file.path(dirOut,paste0(file.code,"Seurat_DEA_genes.csv")), row.names = 1)
deaSeurat
<- NA
cl.names
for (cl.val in unique(clusterization)) {
#print(cl.val)
<- str_split(names(clusterization[clusterization == cl.val])[1],
cl.name pattern = "_",simplify = T)[1]
<- c(cl.names,cl.name)
cl.names <- cl.names[!is.na(cl.names)]
cl.names
$cluster == cl.val,]$cluster <- cl.name
deaSeurat[deaSeurat
}
<- str_split(datasets_csv$Collection[ind],pattern = "_[+]_",simplify = T)
clusters
colnames(deaSeurat) <- str_replace(colnames(deaSeurat),
pattern = "cluster",
replacement = "clusters")
colnames(deaSeurat) <- str_replace(colnames(deaSeurat),
pattern = "gene",
replacement = "genes")
$data_set <- subset.datasets_csv[ind,1]
deaSeurat$set_number <- ind
deaSeurat<- rbind(deaSeurat_tot, deaSeurat)
deaSeurat_tot
}<- deaSeurat_tot[2:nrow(deaSeurat_tot),]
deaSeurat_tot
<- merge.data.frame(deaSeurat_tot,
deaSeurat_tot
ground_truth_tot,by = c("genes","clusters","data_set","set_number"),
all.x = T,all.y = F)
# Convert TRUE/FALSE to 1/0
$value <- as.numeric(deaSeurat_tot$value)
deaSeurat_tot
# Compute the ROC curve - note that we invert the p-values with 1 - p_values
<- roc(deaSeurat_tot$value, 1 - deaSeurat_tot$p_val)
roc_resultSeurat
# Plot the ROC curve
#plot(roc_resultSeurat)
ROC for Seurat scTransform
<- NA
deaSeurat_tot
for (ind in 1:dim(subset.datasets_csv)[1]) {
#print(ind)
<- paste0(subset.datasets_csv$Group[ind],"_",subset.datasets_csv$Collection[ind])
file.code
<- readRDS(file = file.path(dataSetDir,paste0(file.code,".RDS")))
dataset
<- getClusterizationData(dataset, clName = "mergedClusters")[[1]]
clusterization
<- read.csv(file.path(dirOut,paste0(file.code,"Seurat_DEA_ScTransform_genes.csv")), row.names = 1)
deaSeurat
<- NA
cl.names
for (cl.val in unique(clusterization)) {
#print(cl.val)
<- str_split(names(clusterization[clusterization == cl.val])[1],
cl.name pattern = "_",simplify = T)[1]
<- c(cl.names,cl.name)
cl.names <- cl.names[!is.na(cl.names)]
cl.names
$cluster == cl.val,]$cluster <- cl.name
deaSeurat[deaSeurat
}
<- str_split(datasets_csv$Collection[ind],pattern = "_[+]_",simplify = T)
clusters
colnames(deaSeurat) <- str_replace(colnames(deaSeurat),
pattern = "cluster",
replacement = "clusters")
colnames(deaSeurat) <- str_replace(colnames(deaSeurat),
pattern = "gene",
replacement = "genes")
$data_set <- subset.datasets_csv[ind,1]
deaSeurat$set_number <- ind
deaSeurat<- rbind(deaSeurat_tot, deaSeurat)
deaSeurat_tot
}<- deaSeurat_tot[2:nrow(deaSeurat_tot),]
deaSeurat_tot
<- merge.data.frame(deaSeurat_tot,
deaSeurat_tot
ground_truth_tot,by = c("genes","clusters","data_set","set_number"),
all.x = T,all.y = F)
# Convert TRUE/FALSE to 1/0
$value <- as.numeric(deaSeurat_tot$value)
deaSeurat_tot
# Compute the ROC curve - note that we invert the p-values with 1 - p_values
<- roc(deaSeurat_tot$value, 1 - deaSeurat_tot$p_val) roc_resultSeurat_scTr
ROC for Seurat Bimod
<- NA
deaSeurat_tot
for (ind in 1:dim(subset.datasets_csv)[1]) {
#print(ind)
<- paste0(subset.datasets_csv$Group[ind],"_",subset.datasets_csv$Collection[ind])
file.code
<- readRDS(file = file.path(dataSetDir,paste0(file.code,".RDS")))
dataset
<- getClusterizationData(dataset, clName = "mergedClusters")[[1]]
clusterization
<- read.csv(file.path(dirOut,paste0(file.code,"Seurat_DEA_Bimod_genes.csv")), row.names = 1)
deaSeurat
<- NA
cl.names
for (cl.val in unique(clusterization)) {
#print(cl.val)
<- str_split(names(clusterization[clusterization == cl.val])[1],
cl.name pattern = "_",simplify = T)[1]
<- c(cl.names,cl.name)
cl.names <- cl.names[!is.na(cl.names)]
cl.names
$cluster == cl.val,]$cluster <- cl.name
deaSeurat[deaSeurat
}
<- str_split(datasets_csv$Collection[ind],pattern = "_[+]_",simplify = T)
clusters
colnames(deaSeurat) <- str_replace(colnames(deaSeurat),
pattern = "cluster",
replacement = "clusters")
colnames(deaSeurat) <- str_replace(colnames(deaSeurat),
pattern = "gene",
replacement = "genes")
$data_set <- subset.datasets_csv[ind,1]
deaSeurat$set_number <- ind
deaSeurat<- rbind(deaSeurat_tot, deaSeurat)
deaSeurat_tot
}<- deaSeurat_tot[2:nrow(deaSeurat_tot),]
deaSeurat_tot
<- merge.data.frame(deaSeurat_tot,
deaSeurat_tot
ground_truth_tot,by = c("genes","clusters","data_set","set_number"),
all.x = T,all.y = F)
# Convert TRUE/FALSE to 1/0
$value <- as.numeric(deaSeurat_tot$value)
deaSeurat_tot
# Compute the ROC curve - note that we invert the p-values with 1 - p_values
<- roc(deaSeurat_tot$value, 1 - deaSeurat_tot$p_val) roc_resultSeurat_Bimod
ROC for Monocle
<- NA
deaMonocle_tot for (ind in 1:dim(subset.datasets_csv)[1]) {
<- paste0(subset.datasets_csv$Group[ind],"_",subset.datasets_csv$Collection[ind])
file.code <- readRDS(file = file.path(dataSetDir,paste0(file.code,".RDS")))
dataset
<- getClusterizationData(dataset, clName = "mergedClusters")[[1]]
clusterization #print(file.code)
<- read.csv(file.path(dirOut,paste0(file.code,"Monocle_DEA_genes.csv")),row.names = 1)
deaMonocle
<- NA
cl.names
for (cl.val in unique(clusterization)) {
#print(cl.val)
<- str_split(names(clusterization[clusterization == cl.val])[1],
cl.name pattern = "_",simplify = T)[1]
<- c(cl.names,cl.name)
cl.names <- cl.names[!is.na(cl.names)]
cl.names
$cell_group == cl.val,"cell_group"] <- cl.name
deaMonocle[deaMonocle
}
<- str_split(datasets_csv$Collection[ind],pattern = "_[+]_",simplify = T)
clusters
colnames(deaMonocle) <- str_replace(colnames(deaMonocle),
pattern = "cell_group",
replacement = "clusters")
colnames(deaMonocle) <- str_replace(colnames(deaMonocle),
pattern = "gene_id",
replacement = "genes")
$data_set <- subset.datasets_csv[ind,1]
deaMonocle$set_number <- ind
deaMonocle<- as.data.frame(deaMonocle)
deaMonocle <- rbind(deaMonocle_tot, deaMonocle)
deaMonocle_tot
}<- deaMonocle_tot[2:nrow(deaMonocle_tot),]
deaMonocle_tot
<- merge.data.frame(deaMonocle_tot,
deaMonocle_tot
ground_truth_tot,by = c("genes","clusters","data_set","set_number"),
all.x = T,all.y = F)
# Convert TRUE/FALSE to 1/0
$value <- as.numeric(deaMonocle_tot$value)
deaMonocle_tot
# Compute the ROC curve - note that we invert the p-values with 1 - p_values
<- roc(deaMonocle_tot$value, 1 - deaMonocle_tot$marker_test_p_value)
roc_resultMonocle
# Plot the ROC curve
#plot(roc_resultMonocle)
ROC from Scanpy
<- NA
deaScanpy_tot for (ind in 1:dim(subset.datasets_csv)[1]) {
<- paste0(subset.datasets_csv$Group[ind],"_",subset.datasets_csv$Collection[ind])
file.code <- readRDS(file = file.path(dataSetDir,paste0(file.code,".RDS")))
dataset
<- getClusterizationData(dataset, clName = "mergedClusters")[[1]]
clusterization #print(file.code)
<- read.csv(file.path(dirOut,paste0(file.code,"Scanpy_DEA_genes.csv")),
deaScanpy row.names = 1)
<- NA
cl.names
for (cl.val in unique(clusterization)) {
#print(cl.val)
<- str_split(names(clusterization[clusterization == cl.val])[1],
cl.name pattern = "_",simplify = T)[1]
<- c(cl.names,cl.name)
cl.names <- cl.names[!is.na(cl.names)]
cl.names
$clusters == paste0("cl",cl.val),"clusters"] <- cl.name
deaScanpy[deaScanpy
}
<- str_split(datasets_csv$Collection[ind],pattern = "_[+]_",simplify = T)
clusters
$data_set <- subset.datasets_csv[ind,1]
deaScanpy$set_number <- ind
deaScanpy<- rbind(deaScanpy_tot, deaScanpy)
deaScanpy_tot
}<- deaScanpy_tot[2:nrow(deaScanpy_tot),]
deaScanpy_tot
<- merge.data.frame(deaScanpy_tot,
deaScanpy_tot
ground_truth_tot,by = c("genes","clusters","data_set","set_number"),
all.x = T,all.y = F)
# Convert TRUE/FALSE to 1/0
$value <- as.numeric(deaScanpy_tot$value)
deaScanpy_tot
# Compute the ROC curve - note that we invert the p-values with 1 - p_values
<- roc(deaScanpy_tot$value, 1 - deaScanpy_tot$pval)
roc_resultScanpy
# Plot the ROC curve
#plot(roc_resultScanpy)
Summary ROC for all methods
<- ggroc(list(COTAN=roc_resultCOTAN,
Two_Clusters_uneven_medium Seurat=roc_resultSeurat,
Seurat_scTr = roc_resultSeurat_scTr,
Seurat_Bimod = roc_resultSeurat_Bimod,
Monocle=roc_resultMonocle,
Scanpy=roc_resultScanpy),
aes = c("colour"),
legacy.axes = T,
linewidth = 1)+theme_light()+
scale_colour_manual("",
values = methods.color)
<- Two_Clusters_uneven_medium + xlab("FPR") + ylab("TPR")
Two_Clusters_uneven_mediumPL
Two_Clusters_uneven_mediumPL
2_Clusters_uneven_far
True vector
<-datasets_csv[datasets_csv$Group == "2_Clusters_uneven_far",]
subset.datasets_csv
<- NA
ground_truth_tot
for (ind in 1:dim(subset.datasets_csv)[1]) {
<- str_split(subset.datasets_csv$Collection[ind],pattern = "_[+]_",simplify = T)
clusters
<- file.presence[,clusters]
file.presence.subset
#file.presence.subset <- as.matrix(file.presence.subset)
<- as.data.frame(matrix(nrow = nrow(file.presence.subset),
ground_truth ncol = ncol(file.presence.subset)))
rownames(ground_truth) <- rownames(file.presence.subset)
colnames(ground_truth) <- colnames(file.presence.subset)
== "Absent"] <- 0
ground_truth[file.presence.subset == "Present"] <- 1
ground_truth[file.presence.subset == "Uncertain"] <- 0.6
ground_truth[file.presence.subset
<- ground_truth
file.presence.subset
for (col in 1:ncol(ground_truth)) {
<- FALSE
ground_truth[,col] == 1 & rowMeans(file.presence.subset[,-col,drop = FALSE]) < 0.555 ,col] <- TRUE
ground_truth[file.presence.subset[,col]
}
$genes <- rownames(ground_truth)
ground_truth<- pivot_longer(ground_truth,
ground_truth cols = 1:(ncol(ground_truth)-1),
names_to = "clusters")
$data_set <- subset.datasets_csv[ind,1]
ground_truth$set_number <- ind
ground_truth<- rbind(ground_truth_tot, ground_truth)
ground_truth_tot
}<- ground_truth_tot[2:nrow(ground_truth_tot),]
ground_truth_tot
head(ground_truth_tot)
# A tibble: 6 × 5
genes clusters value data_set set_number
<chr> <chr> <lgl> <chr> <int>
1 Neil2 E17.5-516 FALSE 2_Clusters_uneven_far 1
2 Neil2 E13.5-187 FALSE 2_Clusters_uneven_far 1
3 Lamc1 E17.5-516 FALSE 2_Clusters_uneven_far 1
4 Lamc1 E13.5-187 FALSE 2_Clusters_uneven_far 1
5 Lama1 E17.5-516 FALSE 2_Clusters_uneven_far 1
6 Lama1 E13.5-187 TRUE 2_Clusters_uneven_far 1
length(unique(ground_truth_tot$genes))
[1] 14695
sum(ground_truth_tot$value)
[1] 2325
ROC for COTAN
<- NA
onlyPositive.pVal.Cotan_tot
for (ind in 1:dim(subset.datasets_csv)[1]) {
#print(ind)
<- paste0(subset.datasets_csv$Group[ind],"_",
file.code $Collection[ind])
subset.datasets_csv<- readRDS(file = file.path(dataSetDir,paste0(file.code,".RDS")))
dataset <- getClusterizationData(dataset,clName = "mergedClusters")[[2]]
deaCOTAN <- pValueFromDEA(deaCOTAN,
pvalCOTAN numCells = getNumCells(dataset),method = "none")
<- getClusterizationData(dataset, clName = "mergedClusters")[[1]]
clusterization <- NA
cl.names
for (cl.val in unique(clusterization)) {
#print(cl.val)
<- c(cl.names,
cl.names str_split(names(clusterization[clusterization == cl.val])[1],
pattern = "_",simplify = T)[1])
<- cl.names[!is.na(cl.names)]
cl.names
}colnames(deaCOTAN) <- cl.names
colnames(pvalCOTAN) <- cl.names
<- str_split(datasets_csv$Collection[ind],pattern = "_[+]_",simplify = T)
clusters
<- pvalCOTAN
onlyPositive.pVal.Cotan
for (cl in cl.names) {
print(cl)
#temp.DEA.CotanSign <- deaCOTAN[rownames(pvalCOTAN[pvalCOTAN[,cl] < 0.05,]) ,]
rownames(deaCOTAN[deaCOTAN[,cl] < 0,]),cl] <- 1 #onlyPositive.pVal.Cotan[rownames(deaCOTAN[deaCOTAN[,cl] < 0,]),cl]+1
onlyPositive.pVal.Cotan[
}
$genes <- rownames(onlyPositive.pVal.Cotan)
onlyPositive.pVal.Cotan<- pivot_longer(onlyPositive.pVal.Cotan,
onlyPositive.pVal.Cotan values_to = "p_values",
cols = 1:(ncol(onlyPositive.pVal.Cotan)-1),
names_to = "clusters")
$data_set <- subset.datasets_csv[ind,1]
onlyPositive.pVal.Cotan$set_number <- ind
onlyPositive.pVal.Cotan<- rbind(onlyPositive.pVal.Cotan_tot, onlyPositive.pVal.Cotan)
onlyPositive.pVal.Cotan_tot
}
[1] "E13.5-187"
[1] "E17.5-516"
[1] "E15.0-510"
[1] "E13.5-437"
[1] "E15.0-509"
[1] "E13.5-184"
<- onlyPositive.pVal.Cotan_tot[2:nrow(onlyPositive.pVal.Cotan_tot),]
onlyPositive.pVal.Cotan_tot
<- merge.data.frame(onlyPositive.pVal.Cotan_tot,
onlyPositive.pVal.Cotan_tot by = c("genes","clusters","data_set","set_number"),all.x = T,all.y = F)
ground_truth_tot,
#onlyPositive.pVal.Cotan_tot <- onlyPositive.pVal.Cotan_tot[order(onlyPositive.pVal.Cotan_tot$p_values,
# decreasing = F),]
# df <- as.data.frame(matrix(nrow = nrow(onlyPositive.pVal.Cotan_tot),ncol = 3))
# colnames(df) <- c("TPR","FPR","Method")
# df$Method <- "COTAN"
#
# Positive <- sum(onlyPositive.pVal.Cotan_tot$value)
# Negative <- sum(!onlyPositive.pVal.Cotan_tot$value)
#
# for (i in 1:nrow(onlyPositive.pVal.Cotan_tot)) {
# df[i,"TPR"] <- sum(onlyPositive.pVal.Cotan_tot[1:i,"value"])/Positive
# df[i,"FPR"] <- (i-sum(onlyPositive.pVal.Cotan_tot[1:i,"value"]))/Negative
# Convert TRUE/FALSE to 1/0
$value <- as.numeric(onlyPositive.pVal.Cotan_tot$value)
onlyPositive.pVal.Cotan_tot
# Compute the ROC curve - note that we invert the p-values with 1 - p_values
<- roc(onlyPositive.pVal.Cotan_tot$value, 1 - onlyPositive.pVal.Cotan_tot$p_values)
roc_resultCOTAN
# Plot the ROC curve
#plot(roc_resultCOTAN)
ROC for Seurat
<- NA
deaSeurat_tot
for (ind in 1:dim(subset.datasets_csv)[1]) {
#print(ind)
<- paste0(subset.datasets_csv$Group[ind],"_",subset.datasets_csv$Collection[ind])
file.code
<- readRDS(file = file.path(dataSetDir,paste0(file.code,".RDS")))
dataset
<- getClusterizationData(dataset, clName = "mergedClusters")[[1]]
clusterization
<- read.csv(file.path(dirOut,paste0(file.code,"Seurat_DEA_genes.csv")), row.names = 1)
deaSeurat
<- NA
cl.names
for (cl.val in unique(clusterization)) {
#print(cl.val)
<- str_split(names(clusterization[clusterization == cl.val])[1],
cl.name pattern = "_",simplify = T)[1]
<- c(cl.names,cl.name)
cl.names <- cl.names[!is.na(cl.names)]
cl.names
$cluster == cl.val,]$cluster <- cl.name
deaSeurat[deaSeurat
}
<- str_split(datasets_csv$Collection[ind],pattern = "_[+]_",simplify = T)
clusters
colnames(deaSeurat) <- str_replace(colnames(deaSeurat),
pattern = "cluster",
replacement = "clusters")
colnames(deaSeurat) <- str_replace(colnames(deaSeurat),
pattern = "gene",
replacement = "genes")
$data_set <- subset.datasets_csv[ind,1]
deaSeurat$set_number <- ind
deaSeurat<- rbind(deaSeurat_tot, deaSeurat)
deaSeurat_tot
}<- deaSeurat_tot[2:nrow(deaSeurat_tot),]
deaSeurat_tot
<- merge.data.frame(deaSeurat_tot,
deaSeurat_tot
ground_truth_tot,by = c("genes","clusters","data_set","set_number"),
all.x = T,all.y = F)
# Convert TRUE/FALSE to 1/0
$value <- as.numeric(deaSeurat_tot$value)
deaSeurat_tot
# Compute the ROC curve - note that we invert the p-values with 1 - p_values
<- roc(deaSeurat_tot$value, 1 - deaSeurat_tot$p_val)
roc_resultSeurat
# Plot the ROC curve
#plot(roc_resultSeurat)
ROC for Seurat scTransform
<- NA
deaSeurat_tot
for (ind in 1:dim(subset.datasets_csv)[1]) {
#print(ind)
<- paste0(subset.datasets_csv$Group[ind],"_",subset.datasets_csv$Collection[ind])
file.code
<- readRDS(file = file.path(dataSetDir,paste0(file.code,".RDS")))
dataset
<- getClusterizationData(dataset, clName = "mergedClusters")[[1]]
clusterization
<- read.csv(file.path(dirOut,paste0(file.code,"Seurat_DEA_ScTransform_genes.csv")), row.names = 1)
deaSeurat
<- NA
cl.names
for (cl.val in unique(clusterization)) {
#print(cl.val)
<- str_split(names(clusterization[clusterization == cl.val])[1],
cl.name pattern = "_",simplify = T)[1]
<- c(cl.names,cl.name)
cl.names <- cl.names[!is.na(cl.names)]
cl.names
$cluster == cl.val,]$cluster <- cl.name
deaSeurat[deaSeurat
}
<- str_split(datasets_csv$Collection[ind],pattern = "_[+]_",simplify = T)
clusters
colnames(deaSeurat) <- str_replace(colnames(deaSeurat),
pattern = "cluster",
replacement = "clusters")
colnames(deaSeurat) <- str_replace(colnames(deaSeurat),
pattern = "gene",
replacement = "genes")
$data_set <- subset.datasets_csv[ind,1]
deaSeurat$set_number <- ind
deaSeurat<- rbind(deaSeurat_tot, deaSeurat)
deaSeurat_tot
}<- deaSeurat_tot[2:nrow(deaSeurat_tot),]
deaSeurat_tot
<- merge.data.frame(deaSeurat_tot,
deaSeurat_tot
ground_truth_tot,by = c("genes","clusters","data_set","set_number"),
all.x = T,all.y = F)
# Convert TRUE/FALSE to 1/0
$value <- as.numeric(deaSeurat_tot$value)
deaSeurat_tot
# Compute the ROC curve - note that we invert the p-values with 1 - p_values
<- roc(deaSeurat_tot$value, 1 - deaSeurat_tot$p_val) roc_resultSeurat_scTr
ROC for Seurat Bimod
<- NA
deaSeurat_tot
for (ind in 1:dim(subset.datasets_csv)[1]) {
#print(ind)
<- paste0(subset.datasets_csv$Group[ind],"_",subset.datasets_csv$Collection[ind])
file.code
<- readRDS(file = file.path(dataSetDir,paste0(file.code,".RDS")))
dataset
<- getClusterizationData(dataset, clName = "mergedClusters")[[1]]
clusterization
<- read.csv(file.path(dirOut,paste0(file.code,"Seurat_DEA_Bimod_genes.csv")), row.names = 1)
deaSeurat
<- NA
cl.names
for (cl.val in unique(clusterization)) {
#print(cl.val)
<- str_split(names(clusterization[clusterization == cl.val])[1],
cl.name pattern = "_",simplify = T)[1]
<- c(cl.names,cl.name)
cl.names <- cl.names[!is.na(cl.names)]
cl.names
$cluster == cl.val,]$cluster <- cl.name
deaSeurat[deaSeurat
}
<- str_split(datasets_csv$Collection[ind],pattern = "_[+]_",simplify = T)
clusters
colnames(deaSeurat) <- str_replace(colnames(deaSeurat),
pattern = "cluster",
replacement = "clusters")
colnames(deaSeurat) <- str_replace(colnames(deaSeurat),
pattern = "gene",
replacement = "genes")
$data_set <- subset.datasets_csv[ind,1]
deaSeurat$set_number <- ind
deaSeurat<- rbind(deaSeurat_tot, deaSeurat)
deaSeurat_tot
}<- deaSeurat_tot[2:nrow(deaSeurat_tot),]
deaSeurat_tot
<- merge.data.frame(deaSeurat_tot,
deaSeurat_tot
ground_truth_tot,by = c("genes","clusters","data_set","set_number"),
all.x = T,all.y = F)
# Convert TRUE/FALSE to 1/0
$value <- as.numeric(deaSeurat_tot$value)
deaSeurat_tot
# Compute the ROC curve - note that we invert the p-values with 1 - p_values
<- roc(deaSeurat_tot$value, 1 - deaSeurat_tot$p_val) roc_resultSeurat_Bimod
ROC for Monocle
<- NA
deaMonocle_tot for (ind in 1:dim(subset.datasets_csv)[1]) {
<- paste0(subset.datasets_csv$Group[ind],"_",subset.datasets_csv$Collection[ind])
file.code <- readRDS(file = file.path(dataSetDir,paste0(file.code,".RDS")))
dataset
<- getClusterizationData(dataset, clName = "mergedClusters")[[1]]
clusterization #print(file.code)
<- read.csv(file.path(dirOut,paste0(file.code,"Monocle_DEA_genes.csv")),row.names = 1)
deaMonocle
<- NA
cl.names
for (cl.val in unique(clusterization)) {
#print(cl.val)
<- str_split(names(clusterization[clusterization == cl.val])[1],
cl.name pattern = "_",simplify = T)[1]
<- c(cl.names,cl.name)
cl.names <- cl.names[!is.na(cl.names)]
cl.names
$cell_group == cl.val,"cell_group"] <- cl.name
deaMonocle[deaMonocle
}
<- str_split(datasets_csv$Collection[ind],pattern = "_[+]_",simplify = T)
clusters
colnames(deaMonocle) <- str_replace(colnames(deaMonocle),
pattern = "cell_group",
replacement = "clusters")
colnames(deaMonocle) <- str_replace(colnames(deaMonocle),
pattern = "gene_id",
replacement = "genes")
$data_set <- subset.datasets_csv[ind,1]
deaMonocle$set_number <- ind
deaMonocle<- as.data.frame(deaMonocle)
deaMonocle <- rbind(deaMonocle_tot, deaMonocle)
deaMonocle_tot
}<- deaMonocle_tot[2:nrow(deaMonocle_tot),]
deaMonocle_tot
<- merge.data.frame(deaMonocle_tot,
deaMonocle_tot
ground_truth_tot,by = c("genes","clusters","data_set","set_number"),
all.x = T,all.y = F)
# Convert TRUE/FALSE to 1/0
$value <- as.numeric(deaMonocle_tot$value)
deaMonocle_tot
# Compute the ROC curve - note that we invert the p-values with 1 - p_values
<- roc(deaMonocle_tot$value, 1 - deaMonocle_tot$marker_test_p_value)
roc_resultMonocle
# Plot the ROC curve
#plot(roc_resultMonocle)
ROC from Scanpy
<- NA
deaScanpy_tot for (ind in 1:dim(subset.datasets_csv)[1]) {
<- paste0(subset.datasets_csv$Group[ind],"_",subset.datasets_csv$Collection[ind])
file.code <- readRDS(file = file.path(dataSetDir,paste0(file.code,".RDS")))
dataset
<- getClusterizationData(dataset, clName = "mergedClusters")[[1]]
clusterization #print(file.code)
<- read.csv(file.path(dirOut,paste0(file.code,"Scanpy_DEA_genes.csv")),
deaScanpy row.names = 1)
<- NA
cl.names
for (cl.val in unique(clusterization)) {
#print(cl.val)
<- str_split(names(clusterization[clusterization == cl.val])[1],
cl.name pattern = "_",simplify = T)[1]
<- c(cl.names,cl.name)
cl.names <- cl.names[!is.na(cl.names)]
cl.names
$clusters == paste0("cl",cl.val),"clusters"] <- cl.name
deaScanpy[deaScanpy
}
<- str_split(datasets_csv$Collection[ind],pattern = "_[+]_",simplify = T)
clusters
$data_set <- subset.datasets_csv[ind,1]
deaScanpy$set_number <- ind
deaScanpy<- rbind(deaScanpy_tot, deaScanpy)
deaScanpy_tot
}<- deaScanpy_tot[2:nrow(deaScanpy_tot),]
deaScanpy_tot
<- merge.data.frame(deaScanpy_tot,
deaScanpy_tot
ground_truth_tot,by = c("genes","clusters","data_set","set_number"),
all.x = T,all.y = F)
# Convert TRUE/FALSE to 1/0
$value <- as.numeric(deaScanpy_tot$value)
deaScanpy_tot
# Compute the ROC curve - note that we invert the p-values with 1 - p_values
<- roc(deaScanpy_tot$value, 1 - deaScanpy_tot$pval)
roc_resultScanpy
# Plot the ROC curve
#plot(roc_resultScanpy)
Summary ROC for all methods
<- ggroc(list(COTAN=roc_resultCOTAN,
Two_Clusters_uneven_far Seurat=roc_resultSeurat,
Seurat_scTr = roc_resultSeurat_scTr,
Seurat_Bimod = roc_resultSeurat_Bimod,
Monocle=roc_resultMonocle,
Scanpy=roc_resultScanpy),
aes = c("colour"),
legacy.axes = T,
linewidth = 1)+theme_light()+
scale_colour_manual("",
values = methods.color)
<- Two_Clusters_uneven_far + xlab("FPR") + ylab("TPR")
Two_Clusters_uneven_farPL
<- get_legend(Two_Clusters_uneven_farPL)
lg
Two_Clusters_uneven_farPL
3 clusters
3_Clusters_even
True vector
<-datasets_csv[datasets_csv$Group == "3_Clusters_even",]
subset.datasets_csv
<- NA
ground_truth_tot
for (ind in 1:dim(subset.datasets_csv)[1]) {
<- str_split(subset.datasets_csv$Collection[ind],pattern = "_[+]_",simplify = T)
clusters
<- file.presence[,clusters]
file.presence.subset
#file.presence.subset <- as.matrix(file.presence.subset)
<- as.data.frame(matrix(nrow = nrow(file.presence.subset),
ground_truth ncol = ncol(file.presence.subset)))
rownames(ground_truth) <- rownames(file.presence.subset)
colnames(ground_truth) <- colnames(file.presence.subset)
== "Absent"] <- 0
ground_truth[file.presence.subset == "Present"] <- 1
ground_truth[file.presence.subset == "Uncertain"] <- 0.6
ground_truth[file.presence.subset
<- ground_truth
file.presence.subset
for (col in 1:ncol(ground_truth)) {
<- FALSE
ground_truth[,col] == 1 & rowMeans(file.presence.subset[,-col,drop = FALSE]) < 0.555 ,col] <- TRUE
ground_truth[file.presence.subset[,col]
}
$genes <- rownames(ground_truth)
ground_truth<- pivot_longer(ground_truth,
ground_truth cols = 1:(ncol(ground_truth)-1),
names_to = "clusters")
$data_set <- subset.datasets_csv[ind,1]
ground_truth$set_number <- ind
ground_truth<- rbind(ground_truth_tot, ground_truth)
ground_truth_tot
}<- ground_truth_tot[2:nrow(ground_truth_tot),]
ground_truth_tot
head(ground_truth_tot)
# A tibble: 6 × 5
genes clusters value data_set set_number
<chr> <chr> <lgl> <chr> <int>
1 Neil2 E15.0-437 FALSE 3_Clusters_even 1
2 Neil2 E13.5-510 FALSE 3_Clusters_even 1
3 Neil2 E13.5-437 FALSE 3_Clusters_even 1
4 Lamc1 E15.0-437 FALSE 3_Clusters_even 1
5 Lamc1 E13.5-510 FALSE 3_Clusters_even 1
6 Lamc1 E13.5-437 FALSE 3_Clusters_even 1
length(unique(ground_truth_tot$genes))
[1] 14695
sum(ground_truth_tot$value)
[1] 2512
ROC for COTAN
<- NA
onlyPositive.pVal.Cotan_tot
for (ind in 1:dim(subset.datasets_csv)[1]) {
#print(ind)
<- paste0(subset.datasets_csv$Group[ind],"_",
file.code $Collection[ind])
subset.datasets_csv<- readRDS(file = file.path(dataSetDir,paste0(file.code,".RDS")))
dataset <- getClusterizationData(dataset,clName = "mergedClusters")[[2]]
deaCOTAN <- pValueFromDEA(deaCOTAN,
pvalCOTAN numCells = getNumCells(dataset),method = "none")
<- getClusterizationData(dataset, clName = "mergedClusters")[[1]]
clusterization <- NA
cl.names
for (cl.val in unique(clusterization)) {
#print(cl.val)
<- c(cl.names,
cl.names str_split(names(clusterization[clusterization == cl.val])[1],
pattern = "_",simplify = T)[1])
<- cl.names[!is.na(cl.names)]
cl.names
}colnames(deaCOTAN) <- cl.names
colnames(pvalCOTAN) <- cl.names
<- str_split(datasets_csv$Collection[ind],pattern = "_[+]_",simplify = T)
clusters
<- pvalCOTAN
onlyPositive.pVal.Cotan
for (cl in cl.names) {
print(cl)
#temp.DEA.CotanSign <- deaCOTAN[rownames(pvalCOTAN[pvalCOTAN[,cl] < 0.05,]) ,]
rownames(deaCOTAN[deaCOTAN[,cl] < 0,]),cl] <- 1 #onlyPositive.pVal.Cotan[rownames(deaCOTAN[deaCOTAN[,cl] < 0,]),cl]+1
onlyPositive.pVal.Cotan[
}
$genes <- rownames(onlyPositive.pVal.Cotan)
onlyPositive.pVal.Cotan<- pivot_longer(onlyPositive.pVal.Cotan,
onlyPositive.pVal.Cotan values_to = "p_values",
cols = 1:(ncol(onlyPositive.pVal.Cotan)-1),
names_to = "clusters")
$data_set <- subset.datasets_csv[ind,1]
onlyPositive.pVal.Cotan$set_number <- ind
onlyPositive.pVal.Cotan<- rbind(onlyPositive.pVal.Cotan_tot, onlyPositive.pVal.Cotan)
onlyPositive.pVal.Cotan_tot
}
[1] "E13.5-437"
[1] "E15.0-437"
[1] "E13.5-510"
[1] "E17.5-516"
[1] "E13.5-437"
[1] "E17.5-505"
[1] "E15.0-510"
[1] "E15.0-428"
[1] "E13.5-510"
<- onlyPositive.pVal.Cotan_tot[2:nrow(onlyPositive.pVal.Cotan_tot),]
onlyPositive.pVal.Cotan_tot
<- merge.data.frame(onlyPositive.pVal.Cotan_tot,
onlyPositive.pVal.Cotan_tot by = c("genes","clusters","data_set","set_number"),all.x = T,all.y = F)
ground_truth_tot,
#onlyPositive.pVal.Cotan_tot <- onlyPositive.pVal.Cotan_tot[order(onlyPositive.pVal.Cotan_tot$p_values,
# decreasing = F),]
# df <- as.data.frame(matrix(nrow = nrow(onlyPositive.pVal.Cotan_tot),ncol = 3))
# colnames(df) <- c("TPR","FPR","Method")
# df$Method <- "COTAN"
#
# Positive <- sum(onlyPositive.pVal.Cotan_tot$value)
# Negative <- sum(!onlyPositive.pVal.Cotan_tot$value)
#
# for (i in 1:nrow(onlyPositive.pVal.Cotan_tot)) {
# df[i,"TPR"] <- sum(onlyPositive.pVal.Cotan_tot[1:i,"value"])/Positive
# df[i,"FPR"] <- (i-sum(onlyPositive.pVal.Cotan_tot[1:i,"value"]))/Negative
# Convert TRUE/FALSE to 1/0
$value <- as.numeric(onlyPositive.pVal.Cotan_tot$value)
onlyPositive.pVal.Cotan_tot
# Compute the ROC curve - note that we invert the p-values with 1 - p_values
<- roc(onlyPositive.pVal.Cotan_tot$value, 1 - onlyPositive.pVal.Cotan_tot$p_values)
roc_resultCOTAN
# Plot the ROC curve
#plot(roc_resultCOTAN)
ROC for Seurat
<- NA
deaSeurat_tot
for (ind in 1:dim(subset.datasets_csv)[1]) {
#print(ind)
<- paste0(subset.datasets_csv$Group[ind],"_",subset.datasets_csv$Collection[ind])
file.code
<- readRDS(file = file.path(dataSetDir,paste0(file.code,".RDS")))
dataset
<- getClusterizationData(dataset, clName = "mergedClusters")[[1]]
clusterization
<- read.csv(file.path(dirOut,paste0(file.code,"Seurat_DEA_genes.csv")), row.names = 1)
deaSeurat
<- NA
cl.names
for (cl.val in unique(clusterization)) {
#print(cl.val)
<- str_split(names(clusterization[clusterization == cl.val])[1],
cl.name pattern = "_",simplify = T)[1]
<- c(cl.names,cl.name)
cl.names <- cl.names[!is.na(cl.names)]
cl.names
$cluster == cl.val,]$cluster <- cl.name
deaSeurat[deaSeurat
}
<- str_split(datasets_csv$Collection[ind],pattern = "_[+]_",simplify = T)
clusters
colnames(deaSeurat) <- str_replace(colnames(deaSeurat),
pattern = "cluster",
replacement = "clusters")
colnames(deaSeurat) <- str_replace(colnames(deaSeurat),
pattern = "gene",
replacement = "genes")
$data_set <- subset.datasets_csv[ind,1]
deaSeurat$set_number <- ind
deaSeurat<- rbind(deaSeurat_tot, deaSeurat)
deaSeurat_tot
}<- deaSeurat_tot[2:nrow(deaSeurat_tot),]
deaSeurat_tot
<- merge.data.frame(deaSeurat_tot,
deaSeurat_tot
ground_truth_tot,by = c("genes","clusters","data_set","set_number"),
all.x = T,all.y = F)
# Convert TRUE/FALSE to 1/0
$value <- as.numeric(deaSeurat_tot$value)
deaSeurat_tot
# Compute the ROC curve - note that we invert the p-values with 1 - p_values
<- roc(deaSeurat_tot$value, 1 - deaSeurat_tot$p_val)
roc_resultSeurat
# Plot the ROC curve
#plot(roc_resultSeurat)
ROC for Seurat scTransform
<- NA
deaSeurat_tot
for (ind in 1:dim(subset.datasets_csv)[1]) {
#print(ind)
<- paste0(subset.datasets_csv$Group[ind],"_",subset.datasets_csv$Collection[ind])
file.code
<- readRDS(file = file.path(dataSetDir,paste0(file.code,".RDS")))
dataset
<- getClusterizationData(dataset, clName = "mergedClusters")[[1]]
clusterization
<- read.csv(file.path(dirOut,paste0(file.code,"Seurat_DEA_ScTransform_genes.csv")), row.names = 1)
deaSeurat
<- NA
cl.names
for (cl.val in unique(clusterization)) {
#print(cl.val)
<- str_split(names(clusterization[clusterization == cl.val])[1],
cl.name pattern = "_",simplify = T)[1]
<- c(cl.names,cl.name)
cl.names <- cl.names[!is.na(cl.names)]
cl.names
$cluster == cl.val,]$cluster <- cl.name
deaSeurat[deaSeurat
}
<- str_split(datasets_csv$Collection[ind],pattern = "_[+]_",simplify = T)
clusters
colnames(deaSeurat) <- str_replace(colnames(deaSeurat),
pattern = "cluster",
replacement = "clusters")
colnames(deaSeurat) <- str_replace(colnames(deaSeurat),
pattern = "gene",
replacement = "genes")
$data_set <- subset.datasets_csv[ind,1]
deaSeurat$set_number <- ind
deaSeurat<- rbind(deaSeurat_tot, deaSeurat)
deaSeurat_tot
}<- deaSeurat_tot[2:nrow(deaSeurat_tot),]
deaSeurat_tot
<- merge.data.frame(deaSeurat_tot,
deaSeurat_tot
ground_truth_tot,by = c("genes","clusters","data_set","set_number"),
all.x = T,all.y = F)
# Convert TRUE/FALSE to 1/0
$value <- as.numeric(deaSeurat_tot$value)
deaSeurat_tot
# Compute the ROC curve - note that we invert the p-values with 1 - p_values
<- roc(deaSeurat_tot$value, 1 - deaSeurat_tot$p_val) roc_resultSeurat_scTr
ROC for Seurat Bimod
<- NA
deaSeurat_tot
for (ind in 1:dim(subset.datasets_csv)[1]) {
#print(ind)
<- paste0(subset.datasets_csv$Group[ind],"_",subset.datasets_csv$Collection[ind])
file.code
<- readRDS(file = file.path(dataSetDir,paste0(file.code,".RDS")))
dataset
<- getClusterizationData(dataset, clName = "mergedClusters")[[1]]
clusterization
<- read.csv(file.path(dirOut,paste0(file.code,"Seurat_DEA_Bimod_genes.csv")), row.names = 1)
deaSeurat
<- NA
cl.names
for (cl.val in unique(clusterization)) {
#print(cl.val)
<- str_split(names(clusterization[clusterization == cl.val])[1],
cl.name pattern = "_",simplify = T)[1]
<- c(cl.names,cl.name)
cl.names <- cl.names[!is.na(cl.names)]
cl.names
$cluster == cl.val,]$cluster <- cl.name
deaSeurat[deaSeurat
}
<- str_split(datasets_csv$Collection[ind],pattern = "_[+]_",simplify = T)
clusters
colnames(deaSeurat) <- str_replace(colnames(deaSeurat),
pattern = "cluster",
replacement = "clusters")
colnames(deaSeurat) <- str_replace(colnames(deaSeurat),
pattern = "gene",
replacement = "genes")
$data_set <- subset.datasets_csv[ind,1]
deaSeurat$set_number <- ind
deaSeurat<- rbind(deaSeurat_tot, deaSeurat)
deaSeurat_tot
}<- deaSeurat_tot[2:nrow(deaSeurat_tot),]
deaSeurat_tot
<- merge.data.frame(deaSeurat_tot,
deaSeurat_tot
ground_truth_tot,by = c("genes","clusters","data_set","set_number"),
all.x = T,all.y = F)
# Convert TRUE/FALSE to 1/0
$value <- as.numeric(deaSeurat_tot$value)
deaSeurat_tot
# Compute the ROC curve - note that we invert the p-values with 1 - p_values
<- roc(deaSeurat_tot$value, 1 - deaSeurat_tot$p_val) roc_resultSeurat_Bimod
ROC for Monocle
<- NA
deaMonocle_tot for (ind in 1:dim(subset.datasets_csv)[1]) {
<- paste0(subset.datasets_csv$Group[ind],"_",subset.datasets_csv$Collection[ind])
file.code <- readRDS(file = file.path(dataSetDir,paste0(file.code,".RDS")))
dataset
<- getClusterizationData(dataset, clName = "mergedClusters")[[1]]
clusterization #print(file.code)
<- read.csv(file.path(dirOut,paste0(file.code,"Monocle_DEA_genes.csv")),row.names = 1)
deaMonocle
<- NA
cl.names
for (cl.val in unique(clusterization)) {
#print(cl.val)
<- str_split(names(clusterization[clusterization == cl.val])[1],
cl.name pattern = "_",simplify = T)[1]
<- c(cl.names,cl.name)
cl.names <- cl.names[!is.na(cl.names)]
cl.names
$cell_group == cl.val,"cell_group"] <- cl.name
deaMonocle[deaMonocle
}
<- str_split(datasets_csv$Collection[ind],pattern = "_[+]_",simplify = T)
clusters
colnames(deaMonocle) <- str_replace(colnames(deaMonocle),
pattern = "cell_group",
replacement = "clusters")
colnames(deaMonocle) <- str_replace(colnames(deaMonocle),
pattern = "gene_id",
replacement = "genes")
$data_set <- subset.datasets_csv[ind,1]
deaMonocle$set_number <- ind
deaMonocle<- as.data.frame(deaMonocle)
deaMonocle <- rbind(deaMonocle_tot, deaMonocle)
deaMonocle_tot
}<- deaMonocle_tot[2:nrow(deaMonocle_tot),]
deaMonocle_tot
<- merge.data.frame(deaMonocle_tot,
deaMonocle_tot
ground_truth_tot,by = c("genes","clusters","data_set","set_number"),
all.x = T,all.y = F)
# Convert TRUE/FALSE to 1/0
$value <- as.numeric(deaMonocle_tot$value)
deaMonocle_tot
# Compute the ROC curve - note that we invert the p-values with 1 - p_values
<- roc(deaMonocle_tot$value, 1 - deaMonocle_tot$marker_test_p_value)
roc_resultMonocle
# Plot the ROC curve
#plot(roc_resultMonocle)
ROC from Scanpy
<- NA
deaScanpy_tot for (ind in 1:dim(subset.datasets_csv)[1]) {
<- paste0(subset.datasets_csv$Group[ind],"_",subset.datasets_csv$Collection[ind])
file.code <- readRDS(file = file.path(dataSetDir,paste0(file.code,".RDS")))
dataset
<- getClusterizationData(dataset, clName = "mergedClusters")[[1]]
clusterization #print(file.code)
<- read.csv(file.path(dirOut,paste0(file.code,"Scanpy_DEA_genes.csv")),
deaScanpy row.names = 1)
<- NA
cl.names
for (cl.val in unique(clusterization)) {
#print(cl.val)
<- str_split(names(clusterization[clusterization == cl.val])[1],
cl.name pattern = "_",simplify = T)[1]
<- c(cl.names,cl.name)
cl.names <- cl.names[!is.na(cl.names)]
cl.names
$clusters == paste0("cl",cl.val),"clusters"] <- cl.name
deaScanpy[deaScanpy
}
<- str_split(datasets_csv$Collection[ind],pattern = "_[+]_",simplify = T)
clusters
$data_set <- subset.datasets_csv[ind,1]
deaScanpy$set_number <- ind
deaScanpy<- rbind(deaScanpy_tot, deaScanpy)
deaScanpy_tot
}<- deaScanpy_tot[2:nrow(deaScanpy_tot),]
deaScanpy_tot
<- merge.data.frame(deaScanpy_tot,
deaScanpy_tot
ground_truth_tot,by = c("genes","clusters","data_set","set_number"),
all.x = T,all.y = F)
# Convert TRUE/FALSE to 1/0
$value <- as.numeric(deaScanpy_tot$value)
deaScanpy_tot
# Compute the ROC curve - note that we invert the p-values with 1 - p_values
<- roc(deaScanpy_tot$value, 1 - deaScanpy_tot$pval)
roc_resultScanpy
# Plot the ROC curve
#plot(roc_resultScanpy)
Summary ROC for all methods
<- ggroc(list(COTAN=roc_resultCOTAN,
Three_Clusters_even Seurat=roc_resultSeurat,
Seurat_scTr = roc_resultSeurat_scTr,
Seurat_Bimod = roc_resultSeurat_Bimod,
Monocle=roc_resultMonocle,
Scanpy=roc_resultScanpy),
aes = c("colour"),
legacy.axes = T,
linewidth = 1)+theme_light()+
scale_colour_manual("",
values = methods.color)
<- Three_Clusters_even +
Three_Clusters_evenPL xlab("FPR") + ylab("TPR")+theme(legend.position="none")
Three_Clusters_even
3_Clusters_uneven
True vector
<-datasets_csv[datasets_csv$Group == "3_Clusters_uneven",]
subset.datasets_csv
<- NA
ground_truth_tot
for (ind in 1:dim(subset.datasets_csv)[1]) {
<- str_split(subset.datasets_csv$Collection[ind],pattern = "_[+]_",simplify = T)
clusters
<- file.presence[,clusters]
file.presence.subset
#file.presence.subset <- as.matrix(file.presence.subset)
<- as.data.frame(matrix(nrow = nrow(file.presence.subset),
ground_truth ncol = ncol(file.presence.subset)))
rownames(ground_truth) <- rownames(file.presence.subset)
colnames(ground_truth) <- colnames(file.presence.subset)
== "Absent"] <- 0
ground_truth[file.presence.subset == "Present"] <- 1
ground_truth[file.presence.subset == "Uncertain"] <- 0.6
ground_truth[file.presence.subset
<- ground_truth
file.presence.subset
for (col in 1:ncol(ground_truth)) {
<- FALSE
ground_truth[,col] == 1 & rowMeans(file.presence.subset[,-col,drop = FALSE]) < 0.555 ,col] <- TRUE
ground_truth[file.presence.subset[,col]
}
$genes <- rownames(ground_truth)
ground_truth<- pivot_longer(ground_truth,
ground_truth cols = 1:(ncol(ground_truth)-1),
names_to = "clusters")
$data_set <- subset.datasets_csv[ind,1]
ground_truth$set_number <- ind
ground_truth<- rbind(ground_truth_tot, ground_truth)
ground_truth_tot
}<- ground_truth_tot[2:nrow(ground_truth_tot),]
ground_truth_tot
head(ground_truth_tot)
# A tibble: 6 × 5
genes clusters value data_set set_number
<chr> <chr> <lgl> <chr> <int>
1 Neil2 E15.0-428 FALSE 3_Clusters_uneven 1
2 Neil2 E13.5-434 FALSE 3_Clusters_uneven 1
3 Neil2 E15.0-510 FALSE 3_Clusters_uneven 1
4 Lamc1 E15.0-428 FALSE 3_Clusters_uneven 1
5 Lamc1 E13.5-434 FALSE 3_Clusters_uneven 1
6 Lamc1 E15.0-510 FALSE 3_Clusters_uneven 1
length(unique(ground_truth_tot$genes))
[1] 14695
sum(ground_truth_tot$value)
[1] 2816
ROC for COTAN
<- NA
onlyPositive.pVal.Cotan_tot
for (ind in 1:dim(subset.datasets_csv)[1]) {
#print(ind)
<- paste0(subset.datasets_csv$Group[ind],"_",
file.code $Collection[ind])
subset.datasets_csv<- readRDS(file = file.path(dataSetDir,paste0(file.code,".RDS")))
dataset <- getClusterizationData(dataset,clName = "mergedClusters")[[2]]
deaCOTAN <- pValueFromDEA(deaCOTAN,
pvalCOTAN numCells = getNumCells(dataset),method = "none")
<- getClusterizationData(dataset, clName = "mergedClusters")[[1]]
clusterization <- NA
cl.names
for (cl.val in unique(clusterization)) {
#print(cl.val)
<- c(cl.names,
cl.names str_split(names(clusterization[clusterization == cl.val])[1],
pattern = "_",simplify = T)[1])
<- cl.names[!is.na(cl.names)]
cl.names
}colnames(deaCOTAN) <- cl.names
colnames(pvalCOTAN) <- cl.names
<- str_split(datasets_csv$Collection[ind],pattern = "_[+]_",simplify = T)
clusters
<- pvalCOTAN
onlyPositive.pVal.Cotan
for (cl in cl.names) {
print(cl)
#temp.DEA.CotanSign <- deaCOTAN[rownames(pvalCOTAN[pvalCOTAN[,cl] < 0.05,]) ,]
rownames(deaCOTAN[deaCOTAN[,cl] < 0,]),cl] <- 1 #onlyPositive.pVal.Cotan[rownames(deaCOTAN[deaCOTAN[,cl] < 0,]),cl]+1
onlyPositive.pVal.Cotan[
}
$genes <- rownames(onlyPositive.pVal.Cotan)
onlyPositive.pVal.Cotan<- pivot_longer(onlyPositive.pVal.Cotan,
onlyPositive.pVal.Cotan values_to = "p_values",
cols = 1:(ncol(onlyPositive.pVal.Cotan)-1),
names_to = "clusters")
$data_set <- subset.datasets_csv[ind,1]
onlyPositive.pVal.Cotan$set_number <- ind
onlyPositive.pVal.Cotan<- rbind(onlyPositive.pVal.Cotan_tot, onlyPositive.pVal.Cotan)
onlyPositive.pVal.Cotan_tot
}
[1] "E15.0-510"
[1] "E13.5-434"
[1] "E15.0-428"
[1] "E15.0-432"
[1] "E13.5-432"
[1] "E13.5-187"
[1] "E15.0-509"
[1] "E15.0-508"
[1] "E13.5-184"
<- onlyPositive.pVal.Cotan_tot[2:nrow(onlyPositive.pVal.Cotan_tot),]
onlyPositive.pVal.Cotan_tot
<- merge.data.frame(onlyPositive.pVal.Cotan_tot,
onlyPositive.pVal.Cotan_tot by = c("genes","clusters","data_set","set_number"),all.x = T,all.y = F)
ground_truth_tot,
#onlyPositive.pVal.Cotan_tot <- onlyPositive.pVal.Cotan_tot[order(onlyPositive.pVal.Cotan_tot$p_values,
# decreasing = F),]
# df <- as.data.frame(matrix(nrow = nrow(onlyPositive.pVal.Cotan_tot),ncol = 3))
# colnames(df) <- c("TPR","FPR","Method")
# df$Method <- "COTAN"
#
# Positive <- sum(onlyPositive.pVal.Cotan_tot$value)
# Negative <- sum(!onlyPositive.pVal.Cotan_tot$value)
#
# for (i in 1:nrow(onlyPositive.pVal.Cotan_tot)) {
# df[i,"TPR"] <- sum(onlyPositive.pVal.Cotan_tot[1:i,"value"])/Positive
# df[i,"FPR"] <- (i-sum(onlyPositive.pVal.Cotan_tot[1:i,"value"]))/Negative
# Convert TRUE/FALSE to 1/0
$value <- as.numeric(onlyPositive.pVal.Cotan_tot$value)
onlyPositive.pVal.Cotan_tot
# Compute the ROC curve - note that we invert the p-values with 1 - p_values
<- roc(onlyPositive.pVal.Cotan_tot$value, 1 - onlyPositive.pVal.Cotan_tot$p_values)
roc_resultCOTAN
# Plot the ROC curve
#plot(roc_resultCOTAN)
ROC for Seurat
<- NA
deaSeurat_tot
for (ind in 1:dim(subset.datasets_csv)[1]) {
#print(ind)
<- paste0(subset.datasets_csv$Group[ind],"_",subset.datasets_csv$Collection[ind])
file.code
<- readRDS(file = file.path(dataSetDir,paste0(file.code,".RDS")))
dataset
<- getClusterizationData(dataset, clName = "mergedClusters")[[1]]
clusterization
<- read.csv(file.path(dirOut,paste0(file.code,"Seurat_DEA_genes.csv")), row.names = 1)
deaSeurat
<- NA
cl.names
for (cl.val in unique(clusterization)) {
#print(cl.val)
<- str_split(names(clusterization[clusterization == cl.val])[1],
cl.name pattern = "_",simplify = T)[1]
<- c(cl.names,cl.name)
cl.names <- cl.names[!is.na(cl.names)]
cl.names
$cluster == cl.val,]$cluster <- cl.name
deaSeurat[deaSeurat
}
<- str_split(datasets_csv$Collection[ind],pattern = "_[+]_",simplify = T)
clusters
colnames(deaSeurat) <- str_replace(colnames(deaSeurat),
pattern = "cluster",
replacement = "clusters")
colnames(deaSeurat) <- str_replace(colnames(deaSeurat),
pattern = "gene",
replacement = "genes")
$data_set <- subset.datasets_csv[ind,1]
deaSeurat$set_number <- ind
deaSeurat<- rbind(deaSeurat_tot, deaSeurat)
deaSeurat_tot
}<- deaSeurat_tot[2:nrow(deaSeurat_tot),]
deaSeurat_tot
<- merge.data.frame(deaSeurat_tot,
deaSeurat_tot
ground_truth_tot,by = c("genes","clusters","data_set","set_number"),
all.x = T,all.y = F)
# Convert TRUE/FALSE to 1/0
$value <- as.numeric(deaSeurat_tot$value)
deaSeurat_tot
# Compute the ROC curve - note that we invert the p-values with 1 - p_values
<- roc(deaSeurat_tot$value, 1 - deaSeurat_tot$p_val)
roc_resultSeurat
# Plot the ROC curve
#plot(roc_resultSeurat)
ROC for Seurat scTransform
<- NA
deaSeurat_tot
for (ind in 1:dim(subset.datasets_csv)[1]) {
#print(ind)
<- paste0(subset.datasets_csv$Group[ind],"_",subset.datasets_csv$Collection[ind])
file.code
<- readRDS(file = file.path(dataSetDir,paste0(file.code,".RDS")))
dataset
<- getClusterizationData(dataset, clName = "mergedClusters")[[1]]
clusterization
<- read.csv(file.path(dirOut,paste0(file.code,"Seurat_DEA_ScTransform_genes.csv")), row.names = 1)
deaSeurat
<- NA
cl.names
for (cl.val in unique(clusterization)) {
#print(cl.val)
<- str_split(names(clusterization[clusterization == cl.val])[1],
cl.name pattern = "_",simplify = T)[1]
<- c(cl.names,cl.name)
cl.names <- cl.names[!is.na(cl.names)]
cl.names
$cluster == cl.val,]$cluster <- cl.name
deaSeurat[deaSeurat
}
<- str_split(datasets_csv$Collection[ind],pattern = "_[+]_",simplify = T)
clusters
colnames(deaSeurat) <- str_replace(colnames(deaSeurat),
pattern = "cluster",
replacement = "clusters")
colnames(deaSeurat) <- str_replace(colnames(deaSeurat),
pattern = "gene",
replacement = "genes")
$data_set <- subset.datasets_csv[ind,1]
deaSeurat$set_number <- ind
deaSeurat<- rbind(deaSeurat_tot, deaSeurat)
deaSeurat_tot
}<- deaSeurat_tot[2:nrow(deaSeurat_tot),]
deaSeurat_tot
<- merge.data.frame(deaSeurat_tot,
deaSeurat_tot
ground_truth_tot,by = c("genes","clusters","data_set","set_number"),
all.x = T,all.y = F)
# Convert TRUE/FALSE to 1/0
$value <- as.numeric(deaSeurat_tot$value)
deaSeurat_tot
# Compute the ROC curve - note that we invert the p-values with 1 - p_values
<- roc(deaSeurat_tot$value, 1 - deaSeurat_tot$p_val) roc_resultSeurat_scTr
ROC for Seurat Bimod
<- NA
deaSeurat_tot
for (ind in 1:dim(subset.datasets_csv)[1]) {
#print(ind)
<- paste0(subset.datasets_csv$Group[ind],"_",subset.datasets_csv$Collection[ind])
file.code
<- readRDS(file = file.path(dataSetDir,paste0(file.code,".RDS")))
dataset
<- getClusterizationData(dataset, clName = "mergedClusters")[[1]]
clusterization
<- read.csv(file.path(dirOut,paste0(file.code,"Seurat_DEA_Bimod_genes.csv")), row.names = 1)
deaSeurat
<- NA
cl.names
for (cl.val in unique(clusterization)) {
#print(cl.val)
<- str_split(names(clusterization[clusterization == cl.val])[1],
cl.name pattern = "_",simplify = T)[1]
<- c(cl.names,cl.name)
cl.names <- cl.names[!is.na(cl.names)]
cl.names
$cluster == cl.val,]$cluster <- cl.name
deaSeurat[deaSeurat
}
<- str_split(datasets_csv$Collection[ind],pattern = "_[+]_",simplify = T)
clusters
colnames(deaSeurat) <- str_replace(colnames(deaSeurat),
pattern = "cluster",
replacement = "clusters")
colnames(deaSeurat) <- str_replace(colnames(deaSeurat),
pattern = "gene",
replacement = "genes")
$data_set <- subset.datasets_csv[ind,1]
deaSeurat$set_number <- ind
deaSeurat<- rbind(deaSeurat_tot, deaSeurat)
deaSeurat_tot
}<- deaSeurat_tot[2:nrow(deaSeurat_tot),]
deaSeurat_tot
<- merge.data.frame(deaSeurat_tot,
deaSeurat_tot
ground_truth_tot,by = c("genes","clusters","data_set","set_number"),
all.x = T,all.y = F)
# Convert TRUE/FALSE to 1/0
$value <- as.numeric(deaSeurat_tot$value)
deaSeurat_tot
# Compute the ROC curve - note that we invert the p-values with 1 - p_values
<- roc(deaSeurat_tot$value, 1 - deaSeurat_tot$p_val) roc_resultSeurat_Bimod
ROC for Monocle
<- NA
deaMonocle_tot for (ind in 1:dim(subset.datasets_csv)[1]) {
<- paste0(subset.datasets_csv$Group[ind],"_",subset.datasets_csv$Collection[ind])
file.code <- readRDS(file = file.path(dataSetDir,paste0(file.code,".RDS")))
dataset
<- getClusterizationData(dataset, clName = "mergedClusters")[[1]]
clusterization #print(file.code)
<- read.csv(file.path(dirOut,paste0(file.code,"Monocle_DEA_genes.csv")),row.names = 1)
deaMonocle
<- NA
cl.names
for (cl.val in unique(clusterization)) {
#print(cl.val)
<- str_split(names(clusterization[clusterization == cl.val])[1],
cl.name pattern = "_",simplify = T)[1]
<- c(cl.names,cl.name)
cl.names <- cl.names[!is.na(cl.names)]
cl.names
$cell_group == cl.val,"cell_group"] <- cl.name
deaMonocle[deaMonocle
}
<- str_split(datasets_csv$Collection[ind],pattern = "_[+]_",simplify = T)
clusters
colnames(deaMonocle) <- str_replace(colnames(deaMonocle),
pattern = "cell_group",
replacement = "clusters")
colnames(deaMonocle) <- str_replace(colnames(deaMonocle),
pattern = "gene_id",
replacement = "genes")
$data_set <- subset.datasets_csv[ind,1]
deaMonocle$set_number <- ind
deaMonocle<- as.data.frame(deaMonocle)
deaMonocle <- rbind(deaMonocle_tot, deaMonocle)
deaMonocle_tot
}<- deaMonocle_tot[2:nrow(deaMonocle_tot),]
deaMonocle_tot
<- merge.data.frame(deaMonocle_tot,
deaMonocle_tot
ground_truth_tot,by = c("genes","clusters","data_set","set_number"),
all.x = T,all.y = F)
# Convert TRUE/FALSE to 1/0
$value <- as.numeric(deaMonocle_tot$value)
deaMonocle_tot
# Compute the ROC curve - note that we invert the p-values with 1 - p_values
<- roc(deaMonocle_tot$value, 1 - deaMonocle_tot$marker_test_p_value)
roc_resultMonocle
# Plot the ROC curve
#plot(roc_resultMonocle)
ROC from Scanpy
<- NA
deaScanpy_tot for (ind in 1:dim(subset.datasets_csv)[1]) {
<- paste0(subset.datasets_csv$Group[ind],"_",subset.datasets_csv$Collection[ind])
file.code <- readRDS(file = file.path(dataSetDir,paste0(file.code,".RDS")))
dataset
<- getClusterizationData(dataset, clName = "mergedClusters")[[1]]
clusterization #print(file.code)
<- read.csv(file.path(dirOut,paste0(file.code,"Scanpy_DEA_genes.csv")),
deaScanpy row.names = 1)
<- NA
cl.names
for (cl.val in unique(clusterization)) {
#print(cl.val)
<- str_split(names(clusterization[clusterization == cl.val])[1],
cl.name pattern = "_",simplify = T)[1]
<- c(cl.names,cl.name)
cl.names <- cl.names[!is.na(cl.names)]
cl.names
$clusters == paste0("cl",cl.val),"clusters"] <- cl.name
deaScanpy[deaScanpy
}
<- str_split(datasets_csv$Collection[ind],pattern = "_[+]_",simplify = T)
clusters
$data_set <- subset.datasets_csv[ind,1]
deaScanpy$set_number <- ind
deaScanpy<- rbind(deaScanpy_tot, deaScanpy)
deaScanpy_tot
}<- deaScanpy_tot[2:nrow(deaScanpy_tot),]
deaScanpy_tot
<- merge.data.frame(deaScanpy_tot,
deaScanpy_tot
ground_truth_tot,by = c("genes","clusters","data_set","set_number"),
all.x = T,all.y = F)
# Convert TRUE/FALSE to 1/0
$value <- as.numeric(deaScanpy_tot$value)
deaScanpy_tot
# Compute the ROC curve - note that we invert the p-values with 1 - p_values
<- roc(deaScanpy_tot$value, 1 - deaScanpy_tot$pval)
roc_resultScanpy
# Plot the ROC curve
#plot(roc_resultScanpy)
Summary ROC for all methods
<- ggroc(list(COTAN=roc_resultCOTAN,
Three_Clusters_uneven Seurat=roc_resultSeurat,
Seurat_scTr = roc_resultSeurat_scTr,
Seurat_Bimod = roc_resultSeurat_Bimod,
Monocle=roc_resultMonocle,
Scanpy=roc_resultScanpy),
aes = c("colour"),
legacy.axes = T,
linewidth = 1)+theme_light()+
scale_colour_manual("",
values = methods.color)
<- Three_Clusters_uneven + xlab("FPR") + ylab("TPR")+theme(legend.position="none")
Three_Clusters_unevenPL
Three_Clusters_uneven
5 clusters
5_Clusters_uneven
True vector
<-datasets_csv[datasets_csv$Group == "5_Clusters_uneven",]
subset.datasets_csv
<- NA
ground_truth_tot
for (ind in 1:dim(subset.datasets_csv)[1]) {
<- str_split(subset.datasets_csv$Collection[ind],pattern = "_[+]_",simplify = T)
clusters
<- file.presence[,clusters]
file.presence.subset
#file.presence.subset <- as.matrix(file.presence.subset)
<- as.data.frame(matrix(nrow = nrow(file.presence.subset),
ground_truth ncol = ncol(file.presence.subset)))
rownames(ground_truth) <- rownames(file.presence.subset)
colnames(ground_truth) <- colnames(file.presence.subset)
== "Absent"] <- 0
ground_truth[file.presence.subset == "Present"] <- 1
ground_truth[file.presence.subset == "Uncertain"] <- 0.6
ground_truth[file.presence.subset
<- ground_truth
file.presence.subset
for (col in 1:ncol(ground_truth)) {
<- FALSE
ground_truth[,col] == 1 & rowMeans(file.presence.subset[,-col,drop = FALSE]) < 0.555 ,col] <- TRUE
ground_truth[file.presence.subset[,col]
}
$genes <- rownames(ground_truth)
ground_truth<- pivot_longer(ground_truth,
ground_truth cols = 1:(ncol(ground_truth)-1),
names_to = "clusters")
$data_set <- subset.datasets_csv[ind,1]
ground_truth$set_number <- ind
ground_truth<- rbind(ground_truth_tot, ground_truth)
ground_truth_tot
}<- ground_truth_tot[2:nrow(ground_truth_tot),]
ground_truth_tot
head(ground_truth_tot)
# A tibble: 6 × 5
genes clusters value data_set set_number
<chr> <chr> <lgl> <chr> <int>
1 Neil2 E13.5-510 FALSE 5_Clusters_uneven 1
2 Neil2 E15.0-437 FALSE 5_Clusters_uneven 1
3 Neil2 E15.0-510 FALSE 5_Clusters_uneven 1
4 Neil2 E13.5-432 FALSE 5_Clusters_uneven 1
5 Neil2 E13.5-437 FALSE 5_Clusters_uneven 1
6 Lamc1 E13.5-510 FALSE 5_Clusters_uneven 1
length(unique(ground_truth_tot$genes))
[1] 14695
sum(ground_truth_tot$value)
[1] 3544
ROC for COTAN
<- NA
onlyPositive.pVal.Cotan_tot
for (ind in 1:dim(subset.datasets_csv)[1]) {
#print(ind)
<- paste0(subset.datasets_csv$Group[ind],"_",
file.code $Collection[ind])
subset.datasets_csv<- readRDS(file = file.path(dataSetDir,paste0(file.code,".RDS")))
dataset <- getClusterizationData(dataset,clName = "mergedClusters")[[2]]
deaCOTAN <- pValueFromDEA(deaCOTAN,
pvalCOTAN numCells = getNumCells(dataset),method = "none")
<- getLambda(dataset)
genesLambda <- as.data.frame(genesLambda)
genesLambda $genes <- rownames(genesLambda)
genesLambda
<- getClusterizationData(dataset, clName = "mergedClusters")[[1]]
clusterization <- NA
cl.names
for (cl.val in unique(clusterization)) {
#print(cl.val)
<- c(cl.names,
cl.names str_split(names(clusterization[clusterization == cl.val])[1],
pattern = "_",simplify = T)[1])
<- cl.names[!is.na(cl.names)]
cl.names
}colnames(deaCOTAN) <- cl.names
colnames(pvalCOTAN) <- cl.names
<- str_split(datasets_csv$Collection[ind],pattern = "_[+]_",simplify = T)
clusters
<- pvalCOTAN
onlyPositive.pVal.Cotan
for (cl in cl.names) {
print(cl)
#temp.DEA.CotanSign <- deaCOTAN[rownames(pvalCOTAN[pvalCOTAN[,cl] < 0.05,]) ,]
rownames(deaCOTAN[deaCOTAN[,cl] < 0,]),cl] <- 1 #onlyPositive.pVal.Cotan[rownames(deaCOTAN[deaCOTAN[,cl] < 0,]),cl]+1
onlyPositive.pVal.Cotan[
}
$genes <- rownames(onlyPositive.pVal.Cotan)
onlyPositive.pVal.Cotan<- pivot_longer(onlyPositive.pVal.Cotan,
onlyPositive.pVal.Cotan values_to = "p_values",
cols = 1:(ncol(onlyPositive.pVal.Cotan)-1),
names_to = "clusters")
$data_set <- subset.datasets_csv[ind,1]
onlyPositive.pVal.Cotan$set_number <- ind
onlyPositive.pVal.Cotan
<- merge(onlyPositive.pVal.Cotan,genesLambda,by="genes")
onlyPositive.pVal.Cotan
<- rbind(onlyPositive.pVal.Cotan_tot, onlyPositive.pVal.Cotan)
onlyPositive.pVal.Cotan_tot
}
[1] "E13.5-432"
[1] "E15.0-510"
[1] "E13.5-437"
[1] "E15.0-437"
[1] "E13.5-510"
[1] "E13.5-434"
[1] "E15.0-428"
[1] "E13.5-184"
[1] "E15.0-434"
[1] "E17.5-505"
[1] "E15.0-432"
[1] "E13.5-432"
[1] "E15.0-509"
[1] "E15.0-508"
[1] "E13.5-187"
<- onlyPositive.pVal.Cotan_tot[2:nrow(onlyPositive.pVal.Cotan_tot),]
onlyPositive.pVal.Cotan_tot
<- merge.data.frame(onlyPositive.pVal.Cotan_tot,
onlyPositive.pVal.Cotan_tot by = c("genes","clusters","data_set","set_number"),all.x = T,all.y = F)
ground_truth_tot,
# Convert TRUE/FALSE to 1/0
$value <- as.numeric(onlyPositive.pVal.Cotan_tot$value)
onlyPositive.pVal.Cotan_tot
# Compute the ROC curve - note that we invert the p-values with 1 - p_values
<- roc(onlyPositive.pVal.Cotan_tot$value, 1 - onlyPositive.pVal.Cotan_tot$p_values)
roc_resultCOTAN
<- quantile(onlyPositive.pVal.Cotan_tot[onlyPositive.pVal.Cotan_tot$value == 0,]$p_values,probs = 0.05)
cotan.threshold.pval
# Plot the ROC curve
#plot(roc_resultCOTAN)
ROC for Seurat
<- NA
deaSeurat_tot
for (ind in 1:dim(subset.datasets_csv)[1]) {
#print(ind)
<- paste0(subset.datasets_csv$Group[ind],"_",subset.datasets_csv$Collection[ind])
file.code
<- readRDS(file = file.path(dataSetDir,paste0(file.code,".RDS")))
dataset
<- getClusterizationData(dataset, clName = "mergedClusters")[[1]]
clusterization
<- read.csv(file.path(dirOut,paste0(file.code,"Seurat_DEA_genes.csv")), row.names = 1)
deaSeurat
<- NA
cl.names
for (cl.val in unique(clusterization)) {
#print(cl.val)
<- str_split(names(clusterization[clusterization == cl.val])[1],
cl.name pattern = "_",simplify = T)[1]
<- c(cl.names,cl.name)
cl.names <- cl.names[!is.na(cl.names)]
cl.names
$cluster == cl.val,]$cluster <- cl.name
deaSeurat[deaSeurat
}
<- str_split(datasets_csv$Collection[ind],pattern = "_[+]_",simplify = T)
clusters
colnames(deaSeurat) <- str_replace(colnames(deaSeurat),
pattern = "cluster",
replacement = "clusters")
colnames(deaSeurat) <- str_replace(colnames(deaSeurat),
pattern = "gene",
replacement = "genes")
$data_set <- subset.datasets_csv[ind,1]
deaSeurat$set_number <- ind
deaSeurat<- rbind(deaSeurat_tot, deaSeurat)
deaSeurat_tot
}<- deaSeurat_tot[2:nrow(deaSeurat_tot),]
deaSeurat_tot
<- merge.data.frame(deaSeurat_tot,
deaSeurat_tot
ground_truth_tot,by = c("genes","clusters","data_set","set_number"),
all.x = T,all.y = F)
# Convert TRUE/FALSE to 1/0
$value <- as.numeric(deaSeurat_tot$value)
deaSeurat_tot
<- deaSeurat_tot
deaSeurat_tot_base
# Compute the ROC curve - note that we invert the p-values with 1 - p_values
<- roc(deaSeurat_tot$value, 1 - deaSeurat_tot$p_val)
roc_resultSeurat
<- quantile(deaSeurat_tot_base[deaSeurat_tot_base$value == 0,]$p_val,probs = 0.05)
seurat.threshold.pval # Plot the ROC curve
#plot(roc_resultSeurat)
ROC for Seurat scTransform
<- NA
deaSeurat_tot
for (ind in 1:dim(subset.datasets_csv)[1]) {
#print(ind)
<- paste0(subset.datasets_csv$Group[ind],"_",subset.datasets_csv$Collection[ind])
file.code
<- readRDS(file = file.path(dataSetDir,paste0(file.code,".RDS")))
dataset
<- getClusterizationData(dataset, clName = "mergedClusters")[[1]]
clusterization
<- read.csv(file.path(dirOut,paste0(file.code,"Seurat_DEA_ScTransform_genes.csv")), row.names = 1)
deaSeurat
<- NA
cl.names
for (cl.val in unique(clusterization)) {
#print(cl.val)
<- str_split(names(clusterization[clusterization == cl.val])[1],
cl.name pattern = "_",simplify = T)[1]
<- c(cl.names,cl.name)
cl.names <- cl.names[!is.na(cl.names)]
cl.names
$cluster == cl.val,]$cluster <- cl.name
deaSeurat[deaSeurat
}
<- str_split(datasets_csv$Collection[ind],pattern = "_[+]_",simplify = T)
clusters
colnames(deaSeurat) <- str_replace(colnames(deaSeurat),
pattern = "cluster",
replacement = "clusters")
colnames(deaSeurat) <- str_replace(colnames(deaSeurat),
pattern = "gene",
replacement = "genes")
$data_set <- subset.datasets_csv[ind,1]
deaSeurat$set_number <- ind
deaSeurat<- rbind(deaSeurat_tot, deaSeurat)
deaSeurat_tot
}<- deaSeurat_tot[2:nrow(deaSeurat_tot),]
deaSeurat_tot
<- merge.data.frame(deaSeurat_tot,
deaSeurat_tot
ground_truth_tot,by = c("genes","clusters","data_set","set_number"),
all.x = T,all.y = F)
# Convert TRUE/FALSE to 1/0
$value <- as.numeric(deaSeurat_tot$value)
deaSeurat_tot
<- deaSeurat_tot
deaSeurat_tot_scTr
<- quantile(deaSeurat_tot_scTr[deaSeurat_tot_scTr$value == 0,]$p_val,probs = 0.05)
Seurat_scTr.threshold.pval
# Compute the ROC curve - note that we invert the p-values with 1 - p_values
<- roc(deaSeurat_tot$value, 1 - deaSeurat_tot$p_val) roc_resultSeurat_scTr
ROC for Seurat Bimod
<- NA
deaSeurat_tot
for (ind in 1:dim(subset.datasets_csv)[1]) {
#print(ind)
<- paste0(subset.datasets_csv$Group[ind],"_",subset.datasets_csv$Collection[ind])
file.code
<- readRDS(file = file.path(dataSetDir,paste0(file.code,".RDS")))
dataset
<- getClusterizationData(dataset, clName = "mergedClusters")[[1]]
clusterization
<- read.csv(file.path(dirOut,paste0(file.code,"Seurat_DEA_Bimod_genes.csv")), row.names = 1)
deaSeurat
<- NA
cl.names
for (cl.val in unique(clusterization)) {
#print(cl.val)
<- str_split(names(clusterization[clusterization == cl.val])[1],
cl.name pattern = "_",simplify = T)[1]
<- c(cl.names,cl.name)
cl.names <- cl.names[!is.na(cl.names)]
cl.names
$cluster == cl.val,]$cluster <- cl.name
deaSeurat[deaSeurat
}
<- str_split(datasets_csv$Collection[ind],pattern = "_[+]_",simplify = T)
clusters
colnames(deaSeurat) <- str_replace(colnames(deaSeurat),
pattern = "cluster",
replacement = "clusters")
colnames(deaSeurat) <- str_replace(colnames(deaSeurat),
pattern = "gene",
replacement = "genes")
$data_set <- subset.datasets_csv[ind,1]
deaSeurat$set_number <- ind
deaSeurat<- rbind(deaSeurat_tot, deaSeurat)
deaSeurat_tot
}<- deaSeurat_tot[2:nrow(deaSeurat_tot),]
deaSeurat_tot
<- merge.data.frame(deaSeurat_tot,
deaSeurat_tot
ground_truth_tot,by = c("genes","clusters","data_set","set_number"),
all.x = T,all.y = F)
# Convert TRUE/FALSE to 1/0
$value <- as.numeric(deaSeurat_tot$value)
deaSeurat_tot
<- deaSeurat_tot
deaSeurat_tot_Bimod
<- quantile(deaSeurat_tot_Bimod[deaSeurat_tot_Bimod$value == 0,]$p_val,probs = 0.05)
seurat_bimod.threshold.pval
# Compute the ROC curve - note that we invert the p-values with 1 - p_values
<- roc(deaSeurat_tot$value, 1 - deaSeurat_tot$p_val) roc_resultSeurat_Bimod
ROC for Monocle
<- NA
deaMonocle_tot for (ind in 1:dim(subset.datasets_csv)[1]) {
<- paste0(subset.datasets_csv$Group[ind],"_",subset.datasets_csv$Collection[ind])
file.code <- readRDS(file = file.path(dataSetDir,paste0(file.code,".RDS")))
dataset
<- getClusterizationData(dataset, clName = "mergedClusters")[[1]]
clusterization #print(file.code)
<- read.csv(file.path(dirOut,paste0(file.code,"Monocle_DEA_genes.csv")),row.names = 1)
deaMonocle
<- NA
cl.names
for (cl.val in unique(clusterization)) {
#print(cl.val)
<- str_split(names(clusterization[clusterization == cl.val])[1],
cl.name pattern = "_",simplify = T)[1]
<- c(cl.names,cl.name)
cl.names <- cl.names[!is.na(cl.names)]
cl.names
$cell_group == cl.val,"cell_group"] <- cl.name
deaMonocle[deaMonocle
}
<- str_split(datasets_csv$Collection[ind],pattern = "_[+]_",simplify = T)
clusters
colnames(deaMonocle) <- str_replace(colnames(deaMonocle),
pattern = "cell_group",
replacement = "clusters")
colnames(deaMonocle) <- str_replace(colnames(deaMonocle),
pattern = "gene_id",
replacement = "genes")
$data_set <- subset.datasets_csv[ind,1]
deaMonocle$set_number <- ind
deaMonocle<- as.data.frame(deaMonocle)
deaMonocle <- rbind(deaMonocle_tot, deaMonocle)
deaMonocle_tot
}<- deaMonocle_tot[2:nrow(deaMonocle_tot),]
deaMonocle_tot
<- merge.data.frame(deaMonocle_tot,
deaMonocle_tot
ground_truth_tot,by = c("genes","clusters","data_set","set_number"),
all.x = T,all.y = F)
# Convert TRUE/FALSE to 1/0
$value <- as.numeric(deaMonocle_tot$value)
deaMonocle_tot
<- quantile(deaMonocle_tot[deaMonocle_tot$value == 0,]$marker_test_p_value,probs = 0.05)
monocle.threshold.pval
# Compute the ROC curve - note that we invert the p-values with 1 - p_values
<- roc(deaMonocle_tot$value, 1 - deaMonocle_tot$marker_test_p_value)
roc_resultMonocle
# Plot the ROC curve
#plot(roc_resultMonocle)
ROC from Scanpy
<- NA
deaScanpy_tot for (ind in 1:dim(subset.datasets_csv)[1]) {
<- paste0(subset.datasets_csv$Group[ind],"_",subset.datasets_csv$Collection[ind])
file.code <- readRDS(file = file.path(dataSetDir,paste0(file.code,".RDS")))
dataset
<- getClusterizationData(dataset, clName = "mergedClusters")[[1]]
clusterization #print(file.code)
<- read.csv(file.path(dirOut,paste0(file.code,"Scanpy_DEA_genes.csv")),
deaScanpy row.names = 1)
<- NA
cl.names
for (cl.val in unique(clusterization)) {
#print(cl.val)
<- str_split(names(clusterization[clusterization == cl.val])[1],
cl.name pattern = "_",simplify = T)[1]
<- c(cl.names,cl.name)
cl.names <- cl.names[!is.na(cl.names)]
cl.names
$clusters == paste0("cl",cl.val),"clusters"] <- cl.name
deaScanpy[deaScanpy
}
<- str_split(datasets_csv$Collection[ind],pattern = "_[+]_",simplify = T)
clusters
$data_set <- subset.datasets_csv[ind,1]
deaScanpy$set_number <- ind
deaScanpy<- rbind(deaScanpy_tot, deaScanpy)
deaScanpy_tot
}<- deaScanpy_tot[2:nrow(deaScanpy_tot),]
deaScanpy_tot
<- merge.data.frame(deaScanpy_tot,
deaScanpy_tot
ground_truth_tot,by = c("genes","clusters","data_set","set_number"),
all.x = T,all.y = F)
# Convert TRUE/FALSE to 1/0
$value <- as.numeric(deaScanpy_tot$value)
deaScanpy_tot
<- quantile(deaScanpy_tot[deaScanpy_tot$value == 0,]$pval,probs = 0.05)
scanpy.threshold.pval
# Compute the ROC curve - note that we invert the p-values with 1 - p_values
<- roc(deaScanpy_tot$value, 1 - deaScanpy_tot$pval)
roc_resultScanpy
# Plot the ROC curve
#plot(roc_resultScanpy)
Summary ROC for all methods
<- ggroc(list(COTAN=roc_resultCOTAN,
Five_Clusters Seurat=roc_resultSeurat,
Seurat_scTr = roc_resultSeurat_scTr,
Seurat_Bimod = roc_resultSeurat_Bimod,
Monocle=roc_resultMonocle,
Scanpy=roc_resultScanpy),
aes = c("colour"),
legacy.axes = T,
linewidth = 1)+theme_light()+
scale_colour_manual("",
values = methods.color)
<- Five_Clusters + xlab("FPR") + ylab("TPR")
Five_Clusters_unevenPL
Five_Clusters_unevenPL
Global Summary
2 Clusters
<- ggarrange(TwoClusters_even_nearPL,Two_Clusters_even_mediumPL, Two_Clusters_even_farPL,Two_Clusters_uneven_nearPL, Two_Clusters_uneven_mediumPL,Two_Clusters_uneven_farPL,
plot_2cl labels = c("Even_Near", "Even_Medium", "Even_Far", "Uneven_Near","Uneven_Medium","Uneven_Far"),
ncol = 3, nrow = 2, common.legend = T, legend = "bottom")
pdf(paste0(dirOut,"ROC_two_supp.pdf"),width = 15,height = 10)
plot(plot_2cl)
dev.off()
png
2
plot_2cl
3 and 5 Clusters
<- ggarrange(Three_Clusters_evenPL,Three_Clusters_unevenPL, NULL, Five_Clusters_unevenPL,
plot_3_5 labels = c("3_Even", "3_Uneven", "", "5_Uneven"),
ncol = 2, nrow = 2, common.legend = T, legend = "bottom")
pdf(paste0(dirOut,"ROC_three_five_supp.pdf"),width = 10,height = 10)
plot(plot_3_5)
dev.off()
png
2
plot_3_5
<- ggarrange(Two_Clusters_uneven_farPL,Three_Clusters_unevenPL, Five_Clusters_unevenPL,
plot3 labels = c("A", "B", "C"),
ncol = 3, nrow = 1, common.legend = T, legend = "none")
pdf(paste0(dirOut,"ROC_three_main.pdf"),width = 15,height = 5)
plot(plot3)
dev.off()
png
2
plot3
Histogram with gene expression level
<- as.data.frame(ground_truth_tot[ground_truth_tot$value ==1,])
ground_truth_tot_subset
<- onlyPositive.pVal.Cotan_tot[,c("genes","set_number","clusters","p_values","genesLambda")]
COTAN_data
colnames(COTAN_data) <- c("genes","set_number","clusters","p_value_COTAN","genesLambda")
<- merge(ground_truth_tot_subset,
ground_truth_tot_subset
COTAN_data,by=c("genes","set_number", "clusters"),
all.x = T, all.y= FALSE)
<- deaSeurat_tot_base[,c("genes","set_number","clusters", "p_val")]
Seurat_data_base colnames(Seurat_data_base) <- c("genes","set_number","clusters","p_value_Seurat")
<- merge(ground_truth_tot_subset,
ground_truth_tot_subset
Seurat_data_base,by=c("genes","set_number","clusters"),
all.x = T, all.y= FALSE)
<- deaSeurat_tot_scTr[,c("genes","set_number","clusters", "p_val")]
Seurat_data_scTr colnames(Seurat_data_scTr) <- c("genes","set_number","clusters","p_value_Seurat_scTr")
<- merge(ground_truth_tot_subset,
ground_truth_tot_subset
Seurat_data_scTr,by=c("genes","set_number","clusters"),
all.x = T, all.y= FALSE)
<- deaSeurat_tot_Bimod[,c("genes","set_number","clusters", "p_val")]
Seurat_data_Bimod colnames(Seurat_data_Bimod) <- c("genes","set_number","clusters","p_value_Seurat_Bimod")
<- merge(ground_truth_tot_subset,
ground_truth_tot_subset
Seurat_data_Bimod,by=c("genes","set_number","clusters"),
all.x = T, all.y= FALSE)
<- deaMonocle_tot[,c("genes","set_number","clusters", "marker_test_p_value")]
data_Monocle colnames(data_Monocle) <- c("genes","set_number","clusters","p_value_Monocle")
<- merge(ground_truth_tot_subset,
ground_truth_tot_subset
data_Monocle,by=c("genes","set_number","clusters"),
all.x = T, all.y= FALSE)
<- deaScanpy_tot[,c("genes","set_number","clusters", "pval")]
data_Scanpy colnames(data_Scanpy) <- c("genes","set_number","clusters","p_value_Scanpy")
<- merge(ground_truth_tot_subset,
ground_truth_tot_subset
data_Scanpy,by=c("genes","set_number","clusters"),
all.x = T, all.y= FALSE)
<- ground_truth_tot_subset %>% mutate(lambda_bin = cut(genesLambda, breaks=50))
ground_truth_tot_subset
is.na(ground_truth_tot_subset)] <- 1
ground_truth_tot_subset[
head(ground_truth_tot_subset)
genes set_number clusters value data_set p_value_COTAN
1 A330008L17Rik 1 E13.5-432 TRUE 5_Clusters_uneven 2.397295e-10
2 A330084C13Rik 1 E15.0-510 TRUE 5_Clusters_uneven 1.517987e-01
3 A330084C13Rik 2 E17.5-505 TRUE 5_Clusters_uneven 1.871017e-01
4 A330102I10Rik 1 E15.0-510 TRUE 5_Clusters_uneven 4.020632e-05
5 A330102I10Rik 2 E17.5-505 TRUE 5_Clusters_uneven 3.580400e-01
6 A630089N07Rik 2 E13.5-184 TRUE 5_Clusters_uneven 1.152963e-10
genesLambda p_value_Seurat p_value_Seurat_scTr p_value_Seurat_Bimod
1 0.07870370 8.273625e-09 9.966968e-09 6.141385e-08
2 0.09027778 1.000000e+00 8.095134e-03 8.676023e-05
3 0.04289216 1.000000e+00 1.000000e+00 1.000000e+00
4 0.08024691 2.547529e-07 1.579015e-07 1.104961e-05
5 0.03431373 1.000000e+00 1.000000e+00 2.718294e-04
6 0.06617647 1.632048e-12 7.812652e-13 1.731541e-09
p_value_Monocle p_value_Scanpy lambda_bin
1 7.760581e-09 0.012422646 (0.069,0.134]
2 5.065628e-02 0.266934766 (0.069,0.134]
3 2.001536e-01 0.671631728 (0.000846,0.069]
4 1.989451e-06 0.028717392 (0.069,0.134]
5 1.935776e-01 0.793509146 (0.000846,0.069]
6 1.328598e-09 0.004754579 (0.000846,0.069]
<- levels(ground_truth_tot_subset$lambda_bin)
bins <- bins[!is.na(bins)]
bins
<- as.data.frame(matrix(nrow = length(bins),ncol = 7,data = 0))
forBarChart
colnames(forBarChart) <- c("bins","COTAN","Seurat","Seurat_Bimod","Seurat_scTr","Monocle", "Scanpy")
$bins <- bins
forBarChartrownames(forBarChart) <- forBarChart$bins
#p_val_threshold <- 0.05
for (i in as.vector(forBarChart$bins)) {
"COTAN"] <- sum(ground_truth_tot_subset$lambda_bin == i & ground_truth_tot_subset$p_value_COTAN <= cotan.threshold.pval)
forBarChart[ i,
"Seurat"] <- sum(ground_truth_tot_subset$lambda_bin == i & ground_truth_tot_subset$p_value_Seurat <= seurat.threshold.pval)
forBarChart[ i,
"Seurat_Bimod"] <- sum(ground_truth_tot_subset$lambda_bin == i & ground_truth_tot_subset$p_value_Seurat_Bimod <= seurat_bimod.threshold.pval)
forBarChart[ i,
"Seurat_scTr"] <- sum(ground_truth_tot_subset$lambda_bin == i & ground_truth_tot_subset$p_value_Seurat_scTr <= Seurat_scTr.threshold.pval)
forBarChart[ i,
"Monocle"] <- sum(ground_truth_tot_subset$lambda_bin == i & ground_truth_tot_subset$p_value_Monocle <= monocle.threshold.pval)
forBarChart[ i,
"Scanpy"] <- sum(ground_truth_tot_subset$lambda_bin == i & ground_truth_tot_subset$p_value_Scanpy <= scanpy.threshold.pval)
forBarChart[ i,
}
<- as.data.frame(str_split(forBarChart$bins,pattern = ",",simplify = T))
temp.bin 1] <- as.numeric(str_remove(temp.bin[,1],pattern = "[(]"))
temp.bin[,2] <- as.numeric(str_remove(temp.bin[,2],pattern = "[]]"))
temp.bin[,
$bins <- round(rowMeans(temp.bin),digits = 2)
forBarChart
<- forBarChart[rowSums(forBarChart[,2:ncol(forBarChart)]) > 0,]
forBarChart
<- forBarChart[1:8,]
forBarChart.perc
$tot <- rowSums(forBarChart.perc[,2:ncol(forBarChart.perc)])
forBarChart.perc
2:(ncol(forBarChart.perc)-1)] <- round(forBarChart.perc[,2:(ncol(forBarChart.perc)-1)]/forBarChart.perc$tot,digits = 4)
forBarChart.perc[,
<- pivot_longer(forBarChart.perc,
forBarChart.perc cols = 2:(ncol(forBarChart.perc)-1),
names_to = "Methods")
<- pivot_longer(forBarChart[1:8,],
forBarChart.number cols = 2:(ncol(forBarChart)),
names_to = "Methods")
colnames(forBarChart.number) <- c("bins","Methods", "valueOriginal")
<- merge(forBarChart.perc,forBarChart.number, by = c("bins","Methods"))
forBarChart.perc
<- ggplot(forBarChart.perc,aes(x=bins,y=value,fill=Methods))+
barChart geom_bar(stat="identity")+
geom_text(aes(label = valueOriginal),
position = position_stack(vjust = 0.5), size=3)+
scale_fill_manual(
values = methods.color)+
theme_light()+
scale_x_continuous(breaks =forBarChart.perc$bins)+
theme(axis.title.y=element_blank(),axis.title.x=element_blank(),
legend.position="none",axis.text.x = element_text(angle = 45, hjust=1)
+
)scale_y_continuous(labels = scales::percent_format())
pdf(paste0(dirOut,"EnrichedGenesForExpressionLevel.pdf"),height = 3,width = 5)
barChartdev.off()
png
2
plot(barChart)
<-
plot.with.inset ggdraw() +
draw_plot(plot3) +
draw_plot(as_ggplot(lg), x = 0.18, y = 0.1, width = 0.2, height = .45)+
draw_plot(barChart, x = 0.78, y = 0.09, width = 0.22, height = .5)
pdf(paste0(dirOut,"Plot3WihtGenesExLevel.pdf"),height = 5,width = 15)
plot(plot.with.inset)
dev.off()
png
2
plot(plot.with.inset)