library(COTAN)
options(parallelly.fork.enable = TRUE)
library(Seurat)
library(monocle3)
library(reticulate)
library(stringr)
library(dplyr)
library(tidyr)
library(ggplot2)
library(viridis)Differential expression analisys: type I error
dirOut <- "Results/TypeIError/"
dataSetDir <- "Data/MouseCortexFromLoom/SingleClusterRawData/"Automatic functions
COTAN.DEA <- function(dataSet,clusters.list, GEO.code, sequencingMethod,sampleCondition, clName,dirOut,percentage){
obj <- automaticCOTANObjectCreation(raw = dataSet,
calcCoex = F,
cores = 10,
saveObj = F,
GEO = GEO.code,
sequencingMethod = sequencingMethod,
sampleCondition = sampleCondition)
obj <- addClusterization(obj,clName = clName,clusters = clusters.list)
DF.DEA <- DEAOnClusters(obj,clName = clName )
pval.DEA <- pValueFromDEA(DF.DEA,numCells = getNumCells(obj))
adj.pval.DEA <- pval.DEA
for (col in colnames(pval.DEA)) {
adj.pval.DEA[,col] <- p.adjust(adj.pval.DEA[,col],method = "bonferroni")
}
n.genes.DEA <- sum(adj.pval.DEA < 0.05)
res <- list("n.genes.DEA"=n.genes.DEA,"DF.DEA"=DF.DEA,"adj.pval.DEA" = adj.pval.DEA)
write.csv(res$adj.pval.DEA, file = paste0(dirOut,clName,"_de_genes_COTAN_",percentage,".csv"))
return(res)
}
Seurat.DEA <- function(dataSet,clusters.list, project, dirOut,percentage){
pbmc <- CreateSeuratObject(counts = dataSet, project = project, min.cells = 3, min.features = 20)
stopifnot(length(clusters.list)==length(pbmc$orig.ident))
pbmc <- NormalizeData(pbmc, normalization.method = "LogNormalize", scale.factor = 10000)
pbmc <- FindVariableFeatures(pbmc, selection.method = "vst", nfeatures = 2000)
all.genes <- rownames(pbmc)
pbmc <- ScaleData(pbmc, features = all.genes)
pbmc <- RunPCA(pbmc)
pbmc <- RunUMAP(pbmc, dims = 1:20)
pbmc@meta.data$TestCl <- NA
pbmc@meta.data[names(clusters.list),]$TestCl <- factor(clusters.list)
pbmc <- SetIdent(pbmc,value = "TestCl")
pbmc.markers <- FindAllMarkers(pbmc, only.pos = TRUE)
n.genes.DEA <- sum(pbmc.markers$p_val_adj < 0.05)
print(n.genes.DEA)
write.csv(pbmc.markers, file = paste0(dirOut,project,"_de_genes_Seurat_",percentage,".csv"))
return(list("n.genes.DEA"=n.genes.DEA,"markers"= pbmc.markers))
}
Monocle.DEA <- function(dataSet,clusters.list,project, dirOut,percentage){
cell_metadata = as.data.frame(clusters.list[colnames(dataSet)])
colnames(cell_metadata) <- "Clusters"
cds <- new_cell_data_set(dataSet[rowSums(dataSet) > 3,],
cell_metadata = cell_metadata
)
colData(cds)$cluster <- clusters.list[rownames(colData(cds))]
#cds <- preprocess_cds(cds, num_dim = 100)
#cds <- reduce_dimension(cds)
#cds <- cluster_cells(cds, resolution=1e-5)
marker_test_res <- top_markers(cds,
group_cells_by="Clusters",
genes_to_test_per_group = dim(cds)[1],
cores=10)
# de_results <- fit_models(cds,model_formula_str = " ~ cluster",cores = 10,verbose = T)
# fit_coefs <- coefficient_table(de_results)
#
# fit_coefs <- fit_coefs %>% filter(term == "cluster")
write.csv(marker_test_res, file = paste0(dirOut,project,"_de_genes_Monocle_",percentage,".csv"))
# return(list("n.genes.DEA"=sum(fit_coefs$q_value < 0.05, na.rm = T),
# "fit_coefs"= fit_coefs,
# "de_results"=de_results))
return(list("n.genes.DEA"=sum(marker_test_res$marker_test_q_value < 0.05, na.rm = T),
"marker_test_res"= marker_test_res
))
}
ScanPy.DEA <- function(dataSet,
clusters.list,
project,
dirOut,
percentage){
pbmc <- CreateSeuratObject(counts = dataSet, project = project, min.cells = 3, min.features = 20)
pbmc@meta.data$TestCl <- NA
pbmc@meta.data[names(clusters.list),]$TestCl <- clusters.list
exprs <- pbmc@assays$RNA$counts
meta <- pbmc[[]]
#feature_meta <- GetAssay(pbmc)[[]]
tmp <- as.data.frame(matrix(data = NA,
ncol = 1,
nrow = nrow(pbmc@assays$RNA$counts)))
rownames(tmp) <- rownames(pbmc@assays$RNA$counts)
feature_meta <- tmp
#embedding <- Embeddings(pbmc, "umap")
Sys.setenv(RETICULATE_PYTHON = "../../../bin/python3")
py <- import("sys")
source_python("src/scanpyTypeIError.py")
scanpyTypeIError(exprs,
meta,
feature_meta,
"mt",
dirOut,
percentage,project)
out <- read.csv(file = paste0(dirOut,
project,
"_Scanpy_de_genes_",
percentage,
".csv"),
header = T,
row.names = 1)
gc()
return(out)
}We would evaluate the Type I error and to do so, we consider some transcriptionally uniform clusters from the Loom dataset. We splitted these cluster in two partitions (with three different dimensions: 1/2, 1/3 and 1/4) one containing the cells with higher library size and the other with the lower library size. On these two clusters we than tested the four different software.
for (perc in c(0.75)) { #0.5, 0.33, 0.25,0.67,
outPutMatrix <- as.data.frame(matrix(nrow = 1,ncol = 4))
colnames(outPutMatrix) <- c("COTAN","Seurat", "Monocle", "Scanpy")
for (dataSet in list.files(dataSetDir)) {
cluster.name <- str_split(dataSet,pattern = "_", simplify = T)[1]
print(cluster.name)
outTemp <- NA
dataSet <- readRDS(paste0(dataSetDir,dataSet))
cl1 <- colnames(dataSet)[order(colSums(dataSet),decreasing = T)[1:round(ncol(dataSet)*perc,digits = 0)]]
clusters.list <- list("cl1"=cl1,
"cl2"=colnames(dataSet)[!colnames(dataSet) %in% cl1])
clusters.list <- setNames(rep(1,ncol(dataSet)),
colnames(dataSet))
clusters.list[colnames(dataSet)[!colnames(dataSet) %in% cl1]] <- 2
cotan.dea.out <- COTAN.DEA(dataSet = dataSet,
clusters.list = clusters.list,
GEO.code = "",
sequencingMethod = "10x",
sampleCondition = "Temp",
clName = cluster.name,
dirOut, percentage = perc)
outTemp <- c(outTemp,cotan.dea.out$n.genes.DEA)
rm(cotan.dea.out)
gc()
seurat.dea.out <- Seurat.DEA(dataSet = dataSet,
clusters.list = clusters.list,
project = cluster.name,
dirOut, percentage = perc)
outTemp <- c(outTemp,seurat.dea.out$n.genes.DEA)
rm(seurat.dea.out)
gc()
monocle.dea.out <- Monocle.DEA(dataSet = dataSet,
clusters.list = clusters.list,
project = cluster.name,
dirOut = dirOut, percentage = perc)
outTemp <- c(outTemp,monocle.dea.out$n.genes.DEA)
rm(monocle.dea.out)
gc()
ScanPy.dea.out <- ScanPy.DEA(dataSet = dataSet,
clusters.list = clusters.list,
project = cluster.name,
dirOut = dirOut, percentage = perc
)
ScanPy.dea.out.filterd <- ScanPy.dea.out[ScanPy.dea.out$pval_adj < 0.05
& ScanPy.dea.out$clusters == "cl1.0",]
outTemp <- c(outTemp,dim(ScanPy.dea.out.filterd)[1])
rm(ScanPy.dea.out.filterd)
gc()
outTemp <- outTemp[2:length(outTemp)]
outPutMatrix <- rbind(outPutMatrix,outTemp)
rownames(outPutMatrix)[nrow(outPutMatrix)] <- cluster.name
write.csv(outPutMatrix,paste0(dirOut,"Complete_outPut_",perc,".csv"))
}
outPutMatrix <- outPutMatrix[2:nrow(outPutMatrix),]
write.csv(outPutMatrix,paste0(dirOut,"Complete_outPut_",perc,".csv"))
}Summarize the output
methods.color <- c("COTAN"="#F73604","Seurat"="#ABD9E9","Seurat_scTr"="#74ADD1","Seurat_Bimod"="#4575B4", "Monocle"="#DAABE9", "Scanpy"="#7F9B5C" )
df_plot <- NA
for (perc in c(0.5, 0.33, 0.25,0.67, 0.75)) {
outPutMatrix <- read.csv(paste0(dirOut,"Complete_outPut_",perc,".csv"))
outPutMatrix$Division <- perc
df_plot <- rbind(df_plot,outPutMatrix)
}
df_plot <- df_plot[2:nrow(df_plot),]
df_plot <- as.data.frame(pivot_longer(df_plot,cols = c(2:5),values_to = "N.Genes",names_to = "Method"))
df_plot[df_plot$Method == "Cotan",]$Method <- "COTAN"
df_plot[df_plot$Method == "ScanPy",]$Method <- "Scanpy"
typeIerrorPlot <- ggplot(df_plot,aes(x = Method, y=N.Genes,fill=Method))+geom_boxplot(outliers = F)+
scale_fill_manual(
values = methods.color)+ facet_wrap(~Division,scales = "free",nrow = 1)+
geom_jitter(color="black", alpha=0.5,width = 0.2) +
theme_light()+
theme(axis.text.x=element_blank(),axis.title.x=element_blank(),
legend.position="bottom",
plot.title = element_text(size=11)
)
pdf(paste0(dirOut,"Type1ErrorPlot.pdf"),height = 3,width = 15)
plot(typeIerrorPlot)
dev.off()png
2
plot(typeIerrorPlot)
sessionInfo()R version 4.4.0 (2024-04-24)
Platform: x86_64-pc-linux-gnu
Running under: Ubuntu 20.04.6 LTS
Matrix products: default
BLAS: /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/liblapack.so.3; LAPACK version 3.9.0
locale:
[1] LC_CTYPE=C.UTF-8 LC_NUMERIC=C LC_TIME=C.UTF-8
[4] LC_COLLATE=C.UTF-8 LC_MONETARY=C.UTF-8 LC_MESSAGES=C.UTF-8
[7] LC_PAPER=C.UTF-8 LC_NAME=C LC_ADDRESS=C
[10] LC_TELEPHONE=C LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C
time zone: Europe/Rome
tzcode source: system (glibc)
attached base packages:
[1] stats4 stats graphics grDevices utils datasets methods
[8] base
other attached packages:
[1] viridis_0.6.5 viridisLite_0.4.2
[3] ggplot2_3.5.1 tidyr_1.3.1
[5] dplyr_1.1.4 stringr_1.5.1
[7] reticulate_1.37.0 monocle3_1.3.4
[9] SingleCellExperiment_1.26.0 SummarizedExperiment_1.34.0
[11] GenomicRanges_1.56.0 GenomeInfoDb_1.40.0
[13] IRanges_2.38.0 S4Vectors_0.42.0
[15] MatrixGenerics_1.16.0 matrixStats_1.3.0
[17] Biobase_2.64.0 BiocGenerics_0.50.0
[19] Seurat_5.1.0 SeuratObject_5.0.2
[21] sp_2.1-4 COTAN_2.5.2
loaded via a namespace (and not attached):
[1] RcppAnnoy_0.0.22 splines_4.4.0
[3] later_1.3.2 tibble_3.2.1
[5] polyclip_1.10-6 fastDummies_1.7.3
[7] lifecycle_1.0.4 doParallel_1.0.17
[9] globals_0.16.3 lattice_0.22-6
[11] MASS_7.3-60.2 dendextend_1.17.1
[13] magrittr_2.0.3 plotly_4.10.4
[15] rmarkdown_2.27 yaml_2.3.8
[17] httpuv_1.6.15 sctransform_0.4.1
[19] spam_2.10-0 askpass_1.2.0
[21] spatstat.sparse_3.0-3 minqa_1.2.7
[23] cowplot_1.1.3 pbapply_1.7-2
[25] RColorBrewer_1.1-3 abind_1.4-5
[27] zlibbioc_1.50.0 Rtsne_0.17
[29] purrr_1.0.2 GenomeInfoDbData_1.2.12
[31] circlize_0.4.16 ggrepel_0.9.5
[33] irlba_2.3.5.1 listenv_0.9.1
[35] spatstat.utils_3.0-4 terra_1.7-78
[37] umap_0.2.10.0 goftest_1.2-3
[39] RSpectra_0.16-1 spatstat.random_3.2-3
[41] dqrng_0.4.0 fitdistrplus_1.1-11
[43] parallelly_1.37.1 DelayedMatrixStats_1.26.0
[45] leiden_0.4.3.1 codetools_0.2-20
[47] DelayedArray_0.30.1 tidyselect_1.2.1
[49] shape_1.4.6.1 farver_2.1.2
[51] UCSC.utils_1.0.0 lme4_1.1-35.3
[53] ScaledMatrix_1.12.0 spatstat.explore_3.2-7
[55] jsonlite_1.8.8 GetoptLong_1.0.5
[57] progressr_0.14.0 ggridges_0.5.6
[59] survival_3.6-4 iterators_1.0.14
[61] foreach_1.5.2 tools_4.4.0
[63] ica_1.0-3 Rcpp_1.0.12
[65] glue_1.7.0 gridExtra_2.3
[67] SparseArray_1.4.5 xfun_0.44
[69] ggthemes_5.1.0 withr_3.0.0
[71] fastmap_1.2.0 boot_1.3-30
[73] fansi_1.0.6 openssl_2.2.0
[75] digest_0.6.35 rsvd_1.0.5
[77] parallelDist_0.2.6 R6_2.5.1
[79] mime_0.12 colorspace_2.1-0
[81] scattermore_1.2 tensor_1.5
[83] spatstat.data_3.0-4 utf8_1.2.4
[85] generics_0.1.3 data.table_1.15.4
[87] httr_1.4.7 htmlwidgets_1.6.4
[89] S4Arrays_1.4.1 uwot_0.2.2
[91] pkgconfig_2.0.3 gtable_0.3.5
[93] ComplexHeatmap_2.20.0 lmtest_0.9-40
[95] XVector_0.44.0 htmltools_0.5.8.1
[97] dotCall64_1.1-1 clue_0.3-65
[99] scales_1.3.0 png_0.1-8
[101] knitr_1.46 rstudioapi_0.16.0
[103] reshape2_1.4.4 rjson_0.2.21
[105] nloptr_2.0.3 nlme_3.1-164
[107] zoo_1.8-12 GlobalOptions_0.1.2
[109] KernSmooth_2.23-24 parallel_4.4.0
[111] miniUI_0.1.1.1 RcppZiggurat_0.1.6
[113] pillar_1.9.0 grid_4.4.0
[115] vctrs_0.6.5 RANN_2.6.1
[117] promises_1.3.0 BiocSingular_1.20.0
[119] beachmat_2.20.0 xtable_1.8-4
[121] cluster_2.1.6 evaluate_0.23
[123] zeallot_0.1.0 cli_3.6.2
[125] compiler_4.4.0 rlang_1.1.3
[127] crayon_1.5.2 future.apply_1.11.2
[129] labeling_0.4.3 plyr_1.8.9
[131] stringi_1.8.4 deldir_2.0-4
[133] BiocParallel_1.38.0 assertthat_0.2.1
[135] munsell_0.5.1 lazyeval_0.2.2
[137] spatstat.geom_3.2-9 PCAtools_2.16.0
[139] Matrix_1.7-0 RcppHNSW_0.6.0
[141] patchwork_1.2.0 sparseMatrixStats_1.16.0
[143] future_1.33.2 shiny_1.8.1.1
[145] ROCR_1.0-11 Rfast_2.1.0
[147] igraph_2.0.3 RcppParallel_5.1.7