Differential expression analisys: type I error

Author

Silvia Giulia Galfrè

Published

June 2, 2024

library(COTAN)
options(parallelly.fork.enable = TRUE)
library(Seurat)
library(monocle3)
library(reticulate)
library(stringr)
library(dplyr)
library(tidyr)
library(ggplot2)
library(viridis)

dirOut <- "Results/TypeIError/"
dataSetDir <- "Data/MouseCortexFromLoom/SingleClusterRawData/"

Automatic functions

COTAN.DEA <- function(dataSet,clusters.list, GEO.code, sequencingMethod,sampleCondition, clName,dirOut,percentage){
  obj <- automaticCOTANObjectCreation(raw = dataSet,
                                      calcCoex = F,
                                      cores = 10,
                                      saveObj = F,
                                      GEO = GEO.code,
                                      sequencingMethod = sequencingMethod,
                                      sampleCondition = sampleCondition)
  obj <- addClusterization(obj,clName = clName,clusters = clusters.list)
  
  DF.DEA <- DEAOnClusters(obj,clName = clName )
  pval.DEA <- pValueFromDEA(DF.DEA,numCells = getNumCells(obj))
  adj.pval.DEA <- pval.DEA
  for (col in colnames(pval.DEA)) {
    adj.pval.DEA[,col] <- p.adjust(adj.pval.DEA[,col],method = "bonferroni")
  }
  n.genes.DEA <- sum(adj.pval.DEA < 0.05)
  res <- list("n.genes.DEA"=n.genes.DEA,"DF.DEA"=DF.DEA,"adj.pval.DEA" = adj.pval.DEA)
  
  write.csv(res$adj.pval.DEA, file = paste0(dirOut,clName,"_de_genes_COTAN_",percentage,".csv"))
  
  return(res)

}

Seurat.DEA <- function(dataSet,clusters.list, project, dirOut,percentage){
  pbmc <- CreateSeuratObject(counts = dataSet, project = project, min.cells = 3, min.features = 20)
  
  stopifnot(length(clusters.list)==length(pbmc$orig.ident))
  
  
  pbmc <- NormalizeData(pbmc, normalization.method = "LogNormalize", scale.factor = 10000)
  pbmc <- FindVariableFeatures(pbmc, selection.method = "vst", nfeatures = 2000)
  all.genes <- rownames(pbmc)
  pbmc <- ScaleData(pbmc, features = all.genes)
  
  pbmc <- RunPCA(pbmc)
  pbmc <- RunUMAP(pbmc, dims = 1:20)
  
  pbmc@meta.data$TestCl <- NA
  pbmc@meta.data[names(clusters.list),]$TestCl <- factor(clusters.list)
  
  pbmc <- SetIdent(pbmc,value = "TestCl")
  
  pbmc.markers <- FindAllMarkers(pbmc, only.pos = TRUE)
  
  
  
  n.genes.DEA <- sum(pbmc.markers$p_val_adj < 0.05)
  
  print(n.genes.DEA)
  
  write.csv(pbmc.markers, file = paste0(dirOut,project,"_de_genes_Seurat_",percentage,".csv"))
  
  return(list("n.genes.DEA"=n.genes.DEA,"markers"= pbmc.markers))
  }

Monocle.DEA <- function(dataSet,clusters.list,project, dirOut,percentage){
  
  cell_metadata = as.data.frame(clusters.list[colnames(dataSet)])
  colnames(cell_metadata) <- "Clusters"
  cds <- new_cell_data_set(dataSet[rowSums(dataSet) > 3,],
                         cell_metadata = cell_metadata
                         )
  colData(cds)$cluster <- clusters.list[rownames(colData(cds))]

  #cds <- preprocess_cds(cds, num_dim = 100)
  #cds <- reduce_dimension(cds)
  #cds <- cluster_cells(cds, resolution=1e-5)

  marker_test_res <- top_markers(cds, 
                                 group_cells_by="Clusters", 
                                 genes_to_test_per_group = dim(cds)[1],
                                  cores=10)
  
  # de_results <- fit_models(cds,model_formula_str = " ~ cluster",cores = 10,verbose = T)
  # fit_coefs <- coefficient_table(de_results)
  # 
  # fit_coefs <- fit_coefs %>% filter(term == "cluster")
  
  write.csv(marker_test_res, file = paste0(dirOut,project,"_de_genes_Monocle_",percentage,".csv"))
  
  # return(list("n.genes.DEA"=sum(fit_coefs$q_value < 0.05, na.rm = T),
  #             "fit_coefs"= fit_coefs,
  #             "de_results"=de_results))
  return(list("n.genes.DEA"=sum(marker_test_res$marker_test_q_value < 0.05, na.rm = T),
              "marker_test_res"= marker_test_res
              ))
}


ScanPy.DEA <- function(dataSet,
                       clusters.list, 
                       project, 
                       dirOut,
                       percentage){
  pbmc <- CreateSeuratObject(counts = dataSet, project = project, min.cells = 3, min.features = 20)
  
  
  pbmc@meta.data$TestCl <- NA
  pbmc@meta.data[names(clusters.list),]$TestCl <- clusters.list
  
  exprs <- pbmc@assays$RNA$counts
  
  meta <- pbmc[[]]
  #feature_meta <- GetAssay(pbmc)[[]]
  tmp <- as.data.frame(matrix(data = NA, 
                              ncol = 1, 
                              nrow = nrow(pbmc@assays$RNA$counts)))
  rownames(tmp) <- rownames(pbmc@assays$RNA$counts)
  
  feature_meta <- tmp
  #embedding <- Embeddings(pbmc, "umap")
  
  Sys.setenv(RETICULATE_PYTHON = "../../../bin/python3")
  py <- import("sys")
  
  source_python("src/scanpyTypeIError.py")
  scanpyTypeIError(exprs, 
                   meta, 
                   feature_meta, 
                   "mt", 
                   dirOut, 
                   percentage,project)

  out <- read.csv(file = paste0(dirOut,
                                project,
                                "_Scanpy_de_genes_",
                                percentage,
                                ".csv"),
                  header = T, 
                  row.names = 1)
  
  gc()

  return(out)
}

We would evaluate the Type I error and to do so, we consider some transcriptionally uniform clusters from the Loom dataset. We splitted these cluster in two partitions (with three different dimensions: 1/2, 1/3 and 1/4) one containing the cells with higher library size and the other with the lower library size. On these two clusters we than tested the four different software.

for (perc in c(0.75)) { #0.5, 0.33, 0.25,0.67, 
  outPutMatrix <- as.data.frame(matrix(nrow = 1,ncol = 4))
  colnames(outPutMatrix) <- c("COTAN","Seurat", "Monocle", "Scanpy")

  for (dataSet in list.files(dataSetDir)) {
  
    cluster.name <- str_split(dataSet,pattern = "_", simplify = T)[1]
    
    print(cluster.name)
    outTemp <- NA
     
    dataSet <- readRDS(paste0(dataSetDir,dataSet))
    

    cl1 <- colnames(dataSet)[order(colSums(dataSet),decreasing = T)[1:round(ncol(dataSet)*perc,digits = 0)]]
    
    clusters.list <- list("cl1"=cl1,
                          "cl2"=colnames(dataSet)[!colnames(dataSet) %in% cl1])
  
    clusters.list <- setNames(rep(1,ncol(dataSet)),
                              colnames(dataSet))
    clusters.list[colnames(dataSet)[!colnames(dataSet) %in% cl1]] <- 2 
  
    cotan.dea.out <- COTAN.DEA(dataSet = dataSet,
                               clusters.list = clusters.list,
                               GEO.code = "",
                               sequencingMethod = "10x",
                               sampleCondition = "Temp",
                               clName =  cluster.name,
                               dirOut, percentage = perc)
  
    outTemp <- c(outTemp,cotan.dea.out$n.genes.DEA)
    
    rm(cotan.dea.out)
    gc()
  
    seurat.dea.out <- Seurat.DEA(dataSet = dataSet,
                               clusters.list = clusters.list,
                               project = cluster.name,
                               dirOut, percentage = perc)
  
    outTemp <- c(outTemp,seurat.dea.out$n.genes.DEA)
    
    rm(seurat.dea.out)
    gc()
  
  
    monocle.dea.out <- Monocle.DEA(dataSet = dataSet,
                                   clusters.list = clusters.list,
                                   project = cluster.name,
                                   dirOut = dirOut, percentage = perc)
  
    outTemp <- c(outTemp,monocle.dea.out$n.genes.DEA)
    rm(monocle.dea.out)
    gc()
  
    ScanPy.dea.out <- ScanPy.DEA(dataSet = dataSet, 
                                 clusters.list = clusters.list, 
                                 project = cluster.name, 
                                 dirOut = dirOut, percentage = perc
                                  )
    
  
    ScanPy.dea.out.filterd <- ScanPy.dea.out[ScanPy.dea.out$pval_adj < 0.05
                                             & ScanPy.dea.out$clusters == "cl1.0",]
    
    outTemp <- c(outTemp,dim(ScanPy.dea.out.filterd)[1])
    rm(ScanPy.dea.out.filterd)
    gc()
    
    outTemp <- outTemp[2:length(outTemp)]
    
    outPutMatrix <- rbind(outPutMatrix,outTemp)
    rownames(outPutMatrix)[nrow(outPutMatrix)] <- cluster.name
    
    write.csv(outPutMatrix,paste0(dirOut,"Complete_outPut_",perc,".csv"))
  }
  
  outPutMatrix <-   outPutMatrix[2:nrow(outPutMatrix),]
  write.csv(outPutMatrix,paste0(dirOut,"Complete_outPut_",perc,".csv"))

}

Summarize the output

methods.color <- c("COTAN"="#F73604","Seurat"="#ABD9E9","Seurat_scTr"="#74ADD1","Seurat_Bimod"="#4575B4", "Monocle"="#DAABE9", "Scanpy"="#7F9B5C" )
df_plot <- NA
for (perc in c(0.5, 0.33, 0.25,0.67, 0.75)) {
  outPutMatrix <- read.csv(paste0(dirOut,"Complete_outPut_",perc,".csv"))
  
  outPutMatrix$Division <- perc
  
  df_plot <- rbind(df_plot,outPutMatrix)
  
}
df_plot <- df_plot[2:nrow(df_plot),]

df_plot <- as.data.frame(pivot_longer(df_plot,cols = c(2:5),values_to = "N.Genes",names_to = "Method"))

df_plot[df_plot$Method == "Cotan",]$Method <- "COTAN"
df_plot[df_plot$Method == "ScanPy",]$Method <- "Scanpy"


typeIerrorPlot <- ggplot(df_plot,aes(x = Method, y=N.Genes,fill=Method))+geom_boxplot(outliers = F)+
scale_fill_manual(
    values = methods.color)+ facet_wrap(~Division,scales = "free",nrow = 1)+
    geom_jitter(color="black",  alpha=0.5,width = 0.2) +
    theme_light()+
    theme(axis.text.x=element_blank(),axis.title.x=element_blank(),
      legend.position="bottom",
      plot.title = element_text(size=11)
    )

pdf(paste0(dirOut,"Type1ErrorPlot.pdf"),height = 3,width = 15)

plot(typeIerrorPlot)
dev.off()

png 
  2

plot(typeIerrorPlot)

sessionInfo()

R version 4.4.0 (2024-04-24)
Platform: x86_64-pc-linux-gnu
Running under: Ubuntu 20.04.6 LTS

Matrix products: default
BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3 
LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/liblapack.so.3;  LAPACK version 3.9.0

locale:
 [1] LC_CTYPE=C.UTF-8       LC_NUMERIC=C           LC_TIME=C.UTF-8       
 [4] LC_COLLATE=C.UTF-8     LC_MONETARY=C.UTF-8    LC_MESSAGES=C.UTF-8   
 [7] LC_PAPER=C.UTF-8       LC_NAME=C              LC_ADDRESS=C          
[10] LC_TELEPHONE=C         LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C   

time zone: Europe/Rome
tzcode source: system (glibc)

attached base packages:
[1] stats4    stats     graphics  grDevices utils     datasets  methods  
[8] base     

other attached packages:
 [1] viridis_0.6.5               viridisLite_0.4.2          
 [3] ggplot2_3.5.1               tidyr_1.3.1                
 [5] dplyr_1.1.4                 stringr_1.5.1              
 [7] reticulate_1.37.0           monocle3_1.3.4             
 [9] SingleCellExperiment_1.26.0 SummarizedExperiment_1.34.0
[11] GenomicRanges_1.56.0        GenomeInfoDb_1.40.0        
[13] IRanges_2.38.0              S4Vectors_0.42.0           
[15] MatrixGenerics_1.16.0       matrixStats_1.3.0          
[17] Biobase_2.64.0              BiocGenerics_0.50.0        
[19] Seurat_5.1.0                SeuratObject_5.0.2         
[21] sp_2.1-4                    COTAN_2.5.2                

loaded via a namespace (and not attached):
  [1] RcppAnnoy_0.0.22          splines_4.4.0            
  [3] later_1.3.2               tibble_3.2.1             
  [5] polyclip_1.10-6           fastDummies_1.7.3        
  [7] lifecycle_1.0.4           doParallel_1.0.17        
  [9] globals_0.16.3            lattice_0.22-6           
 [11] MASS_7.3-60.2             dendextend_1.17.1        
 [13] magrittr_2.0.3            plotly_4.10.4            
 [15] rmarkdown_2.27            yaml_2.3.8               
 [17] httpuv_1.6.15             sctransform_0.4.1        
 [19] spam_2.10-0               askpass_1.2.0            
 [21] spatstat.sparse_3.0-3     minqa_1.2.7              
 [23] cowplot_1.1.3             pbapply_1.7-2            
 [25] RColorBrewer_1.1-3        abind_1.4-5              
 [27] zlibbioc_1.50.0           Rtsne_0.17               
 [29] purrr_1.0.2               GenomeInfoDbData_1.2.12  
 [31] circlize_0.4.16           ggrepel_0.9.5            
 [33] irlba_2.3.5.1             listenv_0.9.1            
 [35] spatstat.utils_3.0-4      terra_1.7-78             
 [37] umap_0.2.10.0             goftest_1.2-3            
 [39] RSpectra_0.16-1           spatstat.random_3.2-3    
 [41] dqrng_0.4.0               fitdistrplus_1.1-11      
 [43] parallelly_1.37.1         DelayedMatrixStats_1.26.0
 [45] leiden_0.4.3.1            codetools_0.2-20         
 [47] DelayedArray_0.30.1       tidyselect_1.2.1         
 [49] shape_1.4.6.1             farver_2.1.2             
 [51] UCSC.utils_1.0.0          lme4_1.1-35.3            
 [53] ScaledMatrix_1.12.0       spatstat.explore_3.2-7   
 [55] jsonlite_1.8.8            GetoptLong_1.0.5         
 [57] progressr_0.14.0          ggridges_0.5.6           
 [59] survival_3.6-4            iterators_1.0.14         
 [61] foreach_1.5.2             tools_4.4.0              
 [63] ica_1.0-3                 Rcpp_1.0.12              
 [65] glue_1.7.0                gridExtra_2.3            
 [67] SparseArray_1.4.5         xfun_0.44                
 [69] ggthemes_5.1.0            withr_3.0.0              
 [71] fastmap_1.2.0             boot_1.3-30              
 [73] fansi_1.0.6               openssl_2.2.0            
 [75] digest_0.6.35             rsvd_1.0.5               
 [77] parallelDist_0.2.6        R6_2.5.1                 
 [79] mime_0.12                 colorspace_2.1-0         
 [81] scattermore_1.2           tensor_1.5               
 [83] spatstat.data_3.0-4       utf8_1.2.4               
 [85] generics_0.1.3            data.table_1.15.4        
 [87] httr_1.4.7                htmlwidgets_1.6.4        
 [89] S4Arrays_1.4.1            uwot_0.2.2               
 [91] pkgconfig_2.0.3           gtable_0.3.5             
 [93] ComplexHeatmap_2.20.0     lmtest_0.9-40            
 [95] XVector_0.44.0            htmltools_0.5.8.1        
 [97] dotCall64_1.1-1           clue_0.3-65              
 [99] scales_1.3.0              png_0.1-8                
[101] knitr_1.46                rstudioapi_0.16.0        
[103] reshape2_1.4.4            rjson_0.2.21             
[105] nloptr_2.0.3              nlme_3.1-164             
[107] zoo_1.8-12                GlobalOptions_0.1.2      
[109] KernSmooth_2.23-24        parallel_4.4.0           
[111] miniUI_0.1.1.1            RcppZiggurat_0.1.6       
[113] pillar_1.9.0              grid_4.4.0               
[115] vctrs_0.6.5               RANN_2.6.1               
[117] promises_1.3.0            BiocSingular_1.20.0      
[119] beachmat_2.20.0           xtable_1.8-4             
[121] cluster_2.1.6             evaluate_0.23            
[123] zeallot_0.1.0             cli_3.6.2                
[125] compiler_4.4.0            rlang_1.1.3              
[127] crayon_1.5.2              future.apply_1.11.2      
[129] labeling_0.4.3            plyr_1.8.9               
[131] stringi_1.8.4             deldir_2.0-4             
[133] BiocParallel_1.38.0       assertthat_0.2.1         
[135] munsell_0.5.1             lazyeval_0.2.2           
[137] spatstat.geom_3.2-9       PCAtools_2.16.0          
[139] Matrix_1.7-0              RcppHNSW_0.6.0           
[141] patchwork_1.2.0           sparseMatrixStats_1.16.0 
[143] future_1.33.2             shiny_1.8.1.1            
[145] ROCR_1.0-11               Rfast_2.1.0              
[147] igraph_2.0.3              RcppParallel_5.1.7