library(COTAN)
options(parallelly.fork.enable = TRUE)
library(Seurat)
library(monocle3)
library(reticulate)
library(stringr)
library(dplyr)
library(tidyr)
library(ggplot2)
library(viridis)
Differential expression analisys: type I error
<- "Results/TypeIError/"
dirOut <- "Data/MouseCortexFromLoom/SingleClusterRawData/" dataSetDir
Automatic functions
<- function(dataSet,clusters.list, GEO.code, sequencingMethod,sampleCondition, clName,dirOut,percentage){
COTAN.DEA <- automaticCOTANObjectCreation(raw = dataSet,
obj calcCoex = F,
cores = 10,
saveObj = F,
GEO = GEO.code,
sequencingMethod = sequencingMethod,
sampleCondition = sampleCondition)
<- addClusterization(obj,clName = clName,clusters = clusters.list)
obj
<- DEAOnClusters(obj,clName = clName )
DF.DEA <- pValueFromDEA(DF.DEA,numCells = getNumCells(obj))
pval.DEA <- pval.DEA
adj.pval.DEA for (col in colnames(pval.DEA)) {
<- p.adjust(adj.pval.DEA[,col],method = "bonferroni")
adj.pval.DEA[,col]
}<- sum(adj.pval.DEA < 0.05)
n.genes.DEA <- list("n.genes.DEA"=n.genes.DEA,"DF.DEA"=DF.DEA,"adj.pval.DEA" = adj.pval.DEA)
res
write.csv(res$adj.pval.DEA, file = paste0(dirOut,clName,"_de_genes_COTAN_",percentage,".csv"))
return(res)
}
<- function(dataSet,clusters.list, project, dirOut,percentage){
Seurat.DEA <- CreateSeuratObject(counts = dataSet, project = project, min.cells = 3, min.features = 20)
pbmc
stopifnot(length(clusters.list)==length(pbmc$orig.ident))
<- NormalizeData(pbmc, normalization.method = "LogNormalize", scale.factor = 10000)
pbmc <- FindVariableFeatures(pbmc, selection.method = "vst", nfeatures = 2000)
pbmc <- rownames(pbmc)
all.genes <- ScaleData(pbmc, features = all.genes)
pbmc
<- RunPCA(pbmc)
pbmc <- RunUMAP(pbmc, dims = 1:20)
pbmc
@meta.data$TestCl <- NA
pbmc@meta.data[names(clusters.list),]$TestCl <- factor(clusters.list)
pbmc
<- SetIdent(pbmc,value = "TestCl")
pbmc
<- FindAllMarkers(pbmc, only.pos = TRUE)
pbmc.markers
<- sum(pbmc.markers$p_val_adj < 0.05)
n.genes.DEA
print(n.genes.DEA)
write.csv(pbmc.markers, file = paste0(dirOut,project,"_de_genes_Seurat_",percentage,".csv"))
return(list("n.genes.DEA"=n.genes.DEA,"markers"= pbmc.markers))
}
<- function(dataSet,clusters.list,project, dirOut,percentage){
Monocle.DEA
= as.data.frame(clusters.list[colnames(dataSet)])
cell_metadata colnames(cell_metadata) <- "Clusters"
<- new_cell_data_set(dataSet[rowSums(dataSet) > 3,],
cds cell_metadata = cell_metadata
)colData(cds)$cluster <- clusters.list[rownames(colData(cds))]
#cds <- preprocess_cds(cds, num_dim = 100)
#cds <- reduce_dimension(cds)
#cds <- cluster_cells(cds, resolution=1e-5)
<- top_markers(cds,
marker_test_res group_cells_by="Clusters",
genes_to_test_per_group = dim(cds)[1],
cores=10)
# de_results <- fit_models(cds,model_formula_str = " ~ cluster",cores = 10,verbose = T)
# fit_coefs <- coefficient_table(de_results)
#
# fit_coefs <- fit_coefs %>% filter(term == "cluster")
write.csv(marker_test_res, file = paste0(dirOut,project,"_de_genes_Monocle_",percentage,".csv"))
# return(list("n.genes.DEA"=sum(fit_coefs$q_value < 0.05, na.rm = T),
# "fit_coefs"= fit_coefs,
# "de_results"=de_results))
return(list("n.genes.DEA"=sum(marker_test_res$marker_test_q_value < 0.05, na.rm = T),
"marker_test_res"= marker_test_res
))
}
<- function(dataSet,
ScanPy.DEA
clusters.list,
project,
dirOut,
percentage){<- CreateSeuratObject(counts = dataSet, project = project, min.cells = 3, min.features = 20)
pbmc
@meta.data$TestCl <- NA
pbmc@meta.data[names(clusters.list),]$TestCl <- clusters.list
pbmc
<- pbmc@assays$RNA$counts
exprs
<- pbmc[[]]
meta #feature_meta <- GetAssay(pbmc)[[]]
<- as.data.frame(matrix(data = NA,
tmp ncol = 1,
nrow = nrow(pbmc@assays$RNA$counts)))
rownames(tmp) <- rownames(pbmc@assays$RNA$counts)
<- tmp
feature_meta #embedding <- Embeddings(pbmc, "umap")
Sys.setenv(RETICULATE_PYTHON = "../../../bin/python3")
<- import("sys")
py
source_python("src/scanpyTypeIError.py")
scanpyTypeIError(exprs,
meta,
feature_meta, "mt",
dirOut,
percentage,project)
<- read.csv(file = paste0(dirOut,
out
project,"_Scanpy_de_genes_",
percentage,".csv"),
header = T,
row.names = 1)
gc()
return(out)
}
We would evaluate the Type I error and to do so, we consider some transcriptionally uniform clusters from the Loom dataset. We splitted these cluster in two partitions (with three different dimensions: 1/2, 1/3 and 1/4) one containing the cells with higher library size and the other with the lower library size. On these two clusters we than tested the four different software.
for (perc in c(0.75)) { #0.5, 0.33, 0.25,0.67,
<- as.data.frame(matrix(nrow = 1,ncol = 4))
outPutMatrix colnames(outPutMatrix) <- c("COTAN","Seurat", "Monocle", "Scanpy")
for (dataSet in list.files(dataSetDir)) {
<- str_split(dataSet,pattern = "_", simplify = T)[1]
cluster.name
print(cluster.name)
<- NA
outTemp
<- readRDS(paste0(dataSetDir,dataSet))
dataSet
<- colnames(dataSet)[order(colSums(dataSet),decreasing = T)[1:round(ncol(dataSet)*perc,digits = 0)]]
cl1
<- list("cl1"=cl1,
clusters.list "cl2"=colnames(dataSet)[!colnames(dataSet) %in% cl1])
<- setNames(rep(1,ncol(dataSet)),
clusters.list colnames(dataSet))
colnames(dataSet)[!colnames(dataSet) %in% cl1]] <- 2
clusters.list[
<- COTAN.DEA(dataSet = dataSet,
cotan.dea.out clusters.list = clusters.list,
GEO.code = "",
sequencingMethod = "10x",
sampleCondition = "Temp",
clName = cluster.name,
percentage = perc)
dirOut,
<- c(outTemp,cotan.dea.out$n.genes.DEA)
outTemp
rm(cotan.dea.out)
gc()
<- Seurat.DEA(dataSet = dataSet,
seurat.dea.out clusters.list = clusters.list,
project = cluster.name,
percentage = perc)
dirOut,
<- c(outTemp,seurat.dea.out$n.genes.DEA)
outTemp
rm(seurat.dea.out)
gc()
<- Monocle.DEA(dataSet = dataSet,
monocle.dea.out clusters.list = clusters.list,
project = cluster.name,
dirOut = dirOut, percentage = perc)
<- c(outTemp,monocle.dea.out$n.genes.DEA)
outTemp rm(monocle.dea.out)
gc()
<- ScanPy.DEA(dataSet = dataSet,
ScanPy.dea.out clusters.list = clusters.list,
project = cluster.name,
dirOut = dirOut, percentage = perc
)
<- ScanPy.dea.out[ScanPy.dea.out$pval_adj < 0.05
ScanPy.dea.out.filterd & ScanPy.dea.out$clusters == "cl1.0",]
<- c(outTemp,dim(ScanPy.dea.out.filterd)[1])
outTemp rm(ScanPy.dea.out.filterd)
gc()
<- outTemp[2:length(outTemp)]
outTemp
<- rbind(outPutMatrix,outTemp)
outPutMatrix rownames(outPutMatrix)[nrow(outPutMatrix)] <- cluster.name
write.csv(outPutMatrix,paste0(dirOut,"Complete_outPut_",perc,".csv"))
}
<- outPutMatrix[2:nrow(outPutMatrix),]
outPutMatrix write.csv(outPutMatrix,paste0(dirOut,"Complete_outPut_",perc,".csv"))
}
Summarize the output
<- c("COTAN"="#F73604","Seurat"="#ABD9E9","Seurat_scTr"="#74ADD1","Seurat_Bimod"="#4575B4", "Monocle"="#DAABE9", "Scanpy"="#7F9B5C" )
methods.color <- NA
df_plot for (perc in c(0.5, 0.33, 0.25,0.67, 0.75)) {
<- read.csv(paste0(dirOut,"Complete_outPut_",perc,".csv"))
outPutMatrix
$Division <- perc
outPutMatrix
<- rbind(df_plot,outPutMatrix)
df_plot
}<- df_plot[2:nrow(df_plot),]
df_plot
<- as.data.frame(pivot_longer(df_plot,cols = c(2:5),values_to = "N.Genes",names_to = "Method"))
df_plot
$Method == "Cotan",]$Method <- "COTAN"
df_plot[df_plot$Method == "ScanPy",]$Method <- "Scanpy"
df_plot[df_plot
<- ggplot(df_plot,aes(x = Method, y=N.Genes,fill=Method))+geom_boxplot(outliers = F)+
typeIerrorPlot scale_fill_manual(
values = methods.color)+ facet_wrap(~Division,scales = "free",nrow = 1)+
geom_jitter(color="black", alpha=0.5,width = 0.2) +
theme_light()+
theme(axis.text.x=element_blank(),axis.title.x=element_blank(),
legend.position="bottom",
plot.title = element_text(size=11)
)
pdf(paste0(dirOut,"Type1ErrorPlot.pdf"),height = 3,width = 15)
plot(typeIerrorPlot)
dev.off()
png
2
plot(typeIerrorPlot)
sessionInfo()
R version 4.4.0 (2024-04-24)
Platform: x86_64-pc-linux-gnu
Running under: Ubuntu 20.04.6 LTS
Matrix products: default
BLAS: /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/liblapack.so.3; LAPACK version 3.9.0
locale:
[1] LC_CTYPE=C.UTF-8 LC_NUMERIC=C LC_TIME=C.UTF-8
[4] LC_COLLATE=C.UTF-8 LC_MONETARY=C.UTF-8 LC_MESSAGES=C.UTF-8
[7] LC_PAPER=C.UTF-8 LC_NAME=C LC_ADDRESS=C
[10] LC_TELEPHONE=C LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C
time zone: Europe/Rome
tzcode source: system (glibc)
attached base packages:
[1] stats4 stats graphics grDevices utils datasets methods
[8] base
other attached packages:
[1] viridis_0.6.5 viridisLite_0.4.2
[3] ggplot2_3.5.1 tidyr_1.3.1
[5] dplyr_1.1.4 stringr_1.5.1
[7] reticulate_1.37.0 monocle3_1.3.4
[9] SingleCellExperiment_1.26.0 SummarizedExperiment_1.34.0
[11] GenomicRanges_1.56.0 GenomeInfoDb_1.40.0
[13] IRanges_2.38.0 S4Vectors_0.42.0
[15] MatrixGenerics_1.16.0 matrixStats_1.3.0
[17] Biobase_2.64.0 BiocGenerics_0.50.0
[19] Seurat_5.1.0 SeuratObject_5.0.2
[21] sp_2.1-4 COTAN_2.5.2
loaded via a namespace (and not attached):
[1] RcppAnnoy_0.0.22 splines_4.4.0
[3] later_1.3.2 tibble_3.2.1
[5] polyclip_1.10-6 fastDummies_1.7.3
[7] lifecycle_1.0.4 doParallel_1.0.17
[9] globals_0.16.3 lattice_0.22-6
[11] MASS_7.3-60.2 dendextend_1.17.1
[13] magrittr_2.0.3 plotly_4.10.4
[15] rmarkdown_2.27 yaml_2.3.8
[17] httpuv_1.6.15 sctransform_0.4.1
[19] spam_2.10-0 askpass_1.2.0
[21] spatstat.sparse_3.0-3 minqa_1.2.7
[23] cowplot_1.1.3 pbapply_1.7-2
[25] RColorBrewer_1.1-3 abind_1.4-5
[27] zlibbioc_1.50.0 Rtsne_0.17
[29] purrr_1.0.2 GenomeInfoDbData_1.2.12
[31] circlize_0.4.16 ggrepel_0.9.5
[33] irlba_2.3.5.1 listenv_0.9.1
[35] spatstat.utils_3.0-4 terra_1.7-78
[37] umap_0.2.10.0 goftest_1.2-3
[39] RSpectra_0.16-1 spatstat.random_3.2-3
[41] dqrng_0.4.0 fitdistrplus_1.1-11
[43] parallelly_1.37.1 DelayedMatrixStats_1.26.0
[45] leiden_0.4.3.1 codetools_0.2-20
[47] DelayedArray_0.30.1 tidyselect_1.2.1
[49] shape_1.4.6.1 farver_2.1.2
[51] UCSC.utils_1.0.0 lme4_1.1-35.3
[53] ScaledMatrix_1.12.0 spatstat.explore_3.2-7
[55] jsonlite_1.8.8 GetoptLong_1.0.5
[57] progressr_0.14.0 ggridges_0.5.6
[59] survival_3.6-4 iterators_1.0.14
[61] foreach_1.5.2 tools_4.4.0
[63] ica_1.0-3 Rcpp_1.0.12
[65] glue_1.7.0 gridExtra_2.3
[67] SparseArray_1.4.5 xfun_0.44
[69] ggthemes_5.1.0 withr_3.0.0
[71] fastmap_1.2.0 boot_1.3-30
[73] fansi_1.0.6 openssl_2.2.0
[75] digest_0.6.35 rsvd_1.0.5
[77] parallelDist_0.2.6 R6_2.5.1
[79] mime_0.12 colorspace_2.1-0
[81] scattermore_1.2 tensor_1.5
[83] spatstat.data_3.0-4 utf8_1.2.4
[85] generics_0.1.3 data.table_1.15.4
[87] httr_1.4.7 htmlwidgets_1.6.4
[89] S4Arrays_1.4.1 uwot_0.2.2
[91] pkgconfig_2.0.3 gtable_0.3.5
[93] ComplexHeatmap_2.20.0 lmtest_0.9-40
[95] XVector_0.44.0 htmltools_0.5.8.1
[97] dotCall64_1.1-1 clue_0.3-65
[99] scales_1.3.0 png_0.1-8
[101] knitr_1.46 rstudioapi_0.16.0
[103] reshape2_1.4.4 rjson_0.2.21
[105] nloptr_2.0.3 nlme_3.1-164
[107] zoo_1.8-12 GlobalOptions_0.1.2
[109] KernSmooth_2.23-24 parallel_4.4.0
[111] miniUI_0.1.1.1 RcppZiggurat_0.1.6
[113] pillar_1.9.0 grid_4.4.0
[115] vctrs_0.6.5 RANN_2.6.1
[117] promises_1.3.0 BiocSingular_1.20.0
[119] beachmat_2.20.0 xtable_1.8-4
[121] cluster_2.1.6 evaluate_0.23
[123] zeallot_0.1.0 cli_3.6.2
[125] compiler_4.4.0 rlang_1.1.3
[127] crayon_1.5.2 future.apply_1.11.2
[129] labeling_0.4.3 plyr_1.8.9
[131] stringi_1.8.4 deldir_2.0-4
[133] BiocParallel_1.38.0 assertthat_0.2.1
[135] munsell_0.5.1 lazyeval_0.2.2
[137] spatstat.geom_3.2-9 PCAtools_2.16.0
[139] Matrix_1.7-0 RcppHNSW_0.6.0
[141] patchwork_1.2.0 sparseMatrixStats_1.16.0
[143] future_1.33.2 shiny_1.8.1.1
[145] ROCR_1.0-11 Rfast_2.1.0
[147] igraph_2.0.3 RcppParallel_5.1.7