FDR analysis - define DE genes

Author

Silvia Giulia Galfrè

Published

February 27, 2024

Automatic functions

library(COTAN)
options(parallelly.fork.enable = TRUE)
library(Seurat)
library(monocle3)
library(reticulate)
library(stringr)
library(dplyr)

dirOut <- "Results/FDR/"
if (!dir.exists(dirOut)) {
  dir.create(dirOut)
}

dataSetDir <- "Data/MouseCortexFromLoom/FDR/MergedClusters_For_FDR/"

Seurat.DEA <- function(dataSet,clusters.list, project, dirOut#,percentage
                       ){
  pbmc <- CreateSeuratObject(counts = dataSet,
                             project = project, min.cells = 3, min.features = 20)
  
  stopifnot(length(clusters.list)==length(pbmc$orig.ident))
  
  
  pbmc <- NormalizeData(pbmc, normalization.method = "LogNormalize", scale.factor = 10000)
  pbmc <- FindVariableFeatures(pbmc, selection.method = "vst", nfeatures = 2000)
  all.genes <- rownames(pbmc)
  pbmc <- ScaleData(pbmc, features = all.genes)
  
  pbmc <- RunPCA(pbmc)
  pbmc <- RunUMAP(pbmc, dims = 1:20)
  
  pbmc@meta.data$TestCl <- NA
  pbmc@meta.data[names(clusters.list),]$TestCl <- factor(clusters.list)
  
  pbmc <- SetIdent(pbmc,value = "TestCl")
  
  pbmc.markers <- FindAllMarkers(pbmc, only.pos = TRUE)
  
  
  
  n.genes.DEA <- sum(pbmc.markers$p_val_adj < 0.05)
  
  print(n.genes.DEA)
  
  #write.csv(pbmc.markers, file = paste0(dirOut,project,"_de_genes_Seurat_",percentage,".csv"))
  
  return(list("n.genes.DEA"=n.genes.DEA,"markers"= pbmc.markers))
}


Seurat.DEA.bimod <- function(dataSet,clusters.list, project, dirOut#,percentage
                       ){
  pbmc <- CreateSeuratObject(counts = dataSet,
                             project = project, min.cells = 3, min.features = 20)
  
  stopifnot(length(clusters.list)==length(pbmc$orig.ident))
  
  
  pbmc <- NormalizeData(pbmc, normalization.method = "LogNormalize", scale.factor = 10000)
  pbmc <- FindVariableFeatures(pbmc, selection.method = "vst", nfeatures = 2000)
  all.genes <- rownames(pbmc)
  pbmc <- ScaleData(pbmc, features = all.genes)
  
  pbmc <- RunPCA(pbmc)
  pbmc <- RunUMAP(pbmc, dims = 1:20)
  
  pbmc@meta.data$TestCl <- NA
  pbmc@meta.data[names(clusters.list),]$TestCl <- factor(clusters.list)
  
  pbmc <- SetIdent(pbmc,value = "TestCl")
  
  pbmc.markers <- FindAllMarkers(pbmc, only.pos = TRUE,test.use = "bimod" )
  
  
  
  n.genes.DEA <- sum(pbmc.markers$p_val_adj < 0.05)
  
  print(n.genes.DEA)
  
  #write.csv(pbmc.markers, file = paste0(dirOut,project,"_de_genes_Seurat_",percentage,".csv"))
  
  return(list("n.genes.DEA"=n.genes.DEA,"markers"= pbmc.markers))
}

Seurat.DEAScTransform <- function(dataSet,clusters.list, project, dirOut#,percentage
                       ){
  pbmc <- CreateSeuratObject(counts = dataSet,
                             project = project, min.cells = 3, min.features = 20)
  
  stopifnot(length(clusters.list)==length(pbmc$orig.ident))

  # store mitochondrial percentage in object meta data
  pbmc <- PercentageFeatureSet(pbmc, pattern = "^mt-", col.name = "percent.mt")

# run sctransform
  pbmc <- SCTransform(pbmc, vars.to.regress = "percent.mt", verbose = FALSE)

  
  pbmc <- RunPCA(pbmc)
  pbmc <- RunUMAP(pbmc, dims = 1:20)
  
  pbmc@meta.data$TestCl <- NA
  pbmc@meta.data[names(clusters.list),]$TestCl <- factor(clusters.list)
  
  pbmc <- SetIdent(pbmc,value = "TestCl")
  
  pbmc.markers <- FindAllMarkers(pbmc, only.pos = TRUE)
  
  
  
  n.genes.DEA <- sum(pbmc.markers$p_val_adj < 0.05)
  
  print(n.genes.DEA)
  
  #write.csv(pbmc.markers, file = paste0(dirOut,project,"_de_genes_Seurat_",percentage,".csv"))
  
  return(list("n.genes.DEA"=n.genes.DEA,"markers"= pbmc.markers))
}




Monocle.DEA <- function(dataSet,clusters.list,project, dirOut#,percentage
                        ){
  
  cell_metadata = as.data.frame(clusters.list[colnames(dataSet)])
  colnames(cell_metadata) <- "Clusters"
  cds <- new_cell_data_set(dataSet[rowSums(dataSet) > 3,],
                         cell_metadata = cell_metadata
                         )
  colData(cds)$cluster <- clusters.list[rownames(colData(cds))]

  #cds <- preprocess_cds(cds, num_dim = 100)
  #cds <- reduce_dimension(cds)
  #cds <- cluster_cells(cds, resolution=1e-5)
  marker_test_res <- top_markers(cds, 
                                 group_cells_by="Clusters", 
                                 genes_to_test_per_group = dim(cds)[1],
                                  cores=10)
  
  # de_results <- fit_models(cds,model_formula_str = " ~ cluster",cores = 10,verbose = T)
  # fit_coefs <- coefficient_table(de_results)
  # 
  # fit_coefs <- fit_coefs %>% filter(grepl("cluster",term))
  # fit_coefs <- as.data.frame(fit_coefs)
  #write.csv(as.data.frame(fit_coefs[,c("num_cells_expressed" ,"gene_id","p_value","q_value")]), file = paste0(dirOut,project,"_de_genes_Monocle_",percentage,".csv"))
  
  return(list("n.genes.DEA"=sum(marker_test_res$marker_test_q_value < 0.05, na.rm = T),
              "marker_test_res"= marker_test_res
              ))
}


ScamPy.DEA <- function(dataSet,
                       clusters.list, 
                       project, 
                       dirOut,
                       percentage){
  pbmc <- CreateSeuratObject(counts = dataSet, project = project, min.cells = 3, min.features = 20)
  
  
  pbmc@meta.data$TestCl <- NA
  pbmc@meta.data[names(clusters.list),]$TestCl <- clusters.list
  
  exprs <- pbmc@assays$RNA$counts
  
  meta <- pbmc[[]]
  #feature_meta <- GetAssay(pbmc)[[]]
  tmp <- as.data.frame(matrix(data = NA, 
                              ncol = 1, 
                              nrow = nrow(pbmc@assays$RNA$counts)))
  rownames(tmp) <- rownames(pbmc@assays$RNA$counts)
  
  feature_meta <- tmp
  #embedding <- Embeddings(pbmc, "umap")
  
  Sys.setenv(RETICULATE_PYTHON = "../../../bin/python3")
  py <- import("sys")
  
  source_python("src/scanpyFDR.py")
  scanpyFDR(exprs, 
                   meta, 
                   feature_meta, 
                   "mt", 
                   dirOut, 
                   project)

  out <- read.csv(file = paste0(dirOut,
                                project,
                                "_Scampy_DEA_all_genes.csv"
                               ),
                  header = T,
                  row.names = 1)

  gc()

  return(out)
}

Cluster Gene Enrichment

datasets_csv <- read.csv(file.path(dataSetDir,"Cells_Usage_DataFrame.csv"),
                         row.names = 1
                        ) 

datasets_csv

                      Group
1      2_Clusters_even_near
2      2_Clusters_even_near
3      2_Clusters_even_near
4    2_Clusters_even_medium
5    2_Clusters_even_medium
6    2_Clusters_even_medium
7       2_Clusters_even_far
8       2_Clusters_even_far
9       2_Clusters_even_far
10   2_Clusters_uneven_near
11   2_Clusters_uneven_near
12   2_Clusters_uneven_near
13 2_Clusters_uneven_medium
14 2_Clusters_uneven_medium
15 2_Clusters_uneven_medium
16    2_Clusters_uneven_far
17    2_Clusters_uneven_far
18    2_Clusters_uneven_far
19          3_Clusters_even
20          3_Clusters_even
21          3_Clusters_even
22        3_Clusters_uneven
23        3_Clusters_uneven
24        3_Clusters_uneven
25        5_Clusters_uneven
26        5_Clusters_uneven
27        5_Clusters_uneven
                                                  Collection E13.5.432
1                                      E13.5-434_+_E15.0-428         0
2                                      E15.0-432_+_E13.5-432       536
3                                      E15.0-508_+_E15.0-509         0
4                                      E13.5-187_+_E13.5-184         0
5                                      E15.0-434_+_E17.5-516         0
6                                      E15.0-437_+_E15.0-508         0
7                                      E17.5-516_+_E13.5-187         0
8                                      E15.0-510_+_E13.5-437         0
9                                      E15.0-509_+_E13.5-184         0
10                                     E13.5-434_+_E15.0-428         0
11                                     E15.0-432_+_E13.5-432        66
12                                     E15.0-508_+_E15.0-509         0
13                                     E13.5-187_+_E13.5-184         0
14                                     E15.0-434_+_E17.5-516         0
15                                     E15.0-437_+_E15.0-508         0
16                                     E17.5-516_+_E13.5-187         0
17                                     E15.0-510_+_E13.5-437         0
18                                     E15.0-509_+_E13.5-184         0
19                         E15.0-437_+_E13.5-510_+_E13.5-437         0
20                         E17.5-505_+_E17.5-516_+_E13.5-437         0
21                         E15.0-510_+_E15.0-428_+_E13.5-510         0
22                         E15.0-428_+_E13.5-434_+_E15.0-510         0
23                         E13.5-187_+_E13.5-432_+_E15.0-432       168
24                         E15.0-509_+_E13.5-184_+_E15.0-508         0
25 E13.5-510_+_E15.0-437_+_E15.0-510_+_E13.5-432_+_E13.5-437       518
26 E15.0-428_+_E13.5-434_+_E15.0-434_+_E17.5-505_+_E13.5-184         0
27 E13.5-432_+_E15.0-509_+_E15.0-432_+_E13.5-187_+_E15.0-508       440
   E13.5.187 E13.5.434 E13.5.184 E13.5.437 E13.5.510 E15.0.432 E15.0.509
1          0       318         0         0         0         0         0
2          0         0         0         0         0       536         0
3          0         0         0         0         0         0       397
4        292         0       292         0         0         0         0
5          0         0         0         0         0         0         0
6          0         0         0         0         0         0         0
7        297         0         0         0         0         0         0
8          0         0         0       259         0         0         0
9          0         0       292         0         0         0       292
10         0       326         0         0         0         0         0
11         0         0         0         0         0       586         0
12         0         0         0         0         0         0       402
13       334         0        38         0         0         0         0
14         0         0         0         0         0         0         0
15         0         0         0         0         0         0         0
16       334         0         0         0         0         0         0
17         0         0         0        45         0         0         0
18         0         0        45         0         0         0       402
19         0         0         0       248       248         0         0
20         0         0         0       203         0         0         0
21         0         0         0         0       248         0         0
22         0       115         0         0         0         0         0
23        84         0         0         0         0       586         0
24         0         0        58         0         0         0       402
25         0         0         0       259        65         0         0
26         0       326       163         0         0         0         0
27        74         0         0         0         0       586       293
   E15.0.510 E15.0.508 E15.0.428 E15.0.434 E15.0.437 E17.5.516 E17.5.505
1          0         0       318         0         0         0         0
2          0         0         0         0         0         0         0
3          0       397         0         0         0         0         0
4          0         0         0         0         0         0         0
5          0         0         0       273         0       273         0
6          0       258         0         0       258         0         0
7          0         0         0         0         0       297         0
8        259         0         0         0         0         0         0
9          0         0         0         0         0         0         0
10         0         0        37         0         0         0         0
11         0         0         0         0         0         0         0
12         0        45         0         0         0         0         0
13         0         0         0         0         0         0         0
14         0         0         0        33         0       297         0
15         0       397         0         0        45         0         0
16         0         0         0         0         0        38         0
17       402         0         0         0         0         0         0
18         0         0         0         0         0         0         0
19         0         0         0         0       248         0         0
20         0         0         0         0         0       203       203
21       248         0       248         0         0         0         0
22       402         0        58         0         0         0         0
23         0         0         0         0         0         0         0
24         0       115         0         0         0         0         0
25       389         0         0         0        65         0         0
26         0         0       245        41         0         0        41
27         0        74         0         0         0         0         0

DEA genes for COTAN

for (ind in 1:dim(datasets_csv)[1]) {
  #print(ind)
  file.code <- paste0(datasets_csv$Group[ind],"_",datasets_csv$Collection[ind])
  dataset <- readRDS(file = file.path(dataSetDir,paste0(file.code,".RDS")))
  deaCOTAN <- getClusterizationData(dataset,clName = "mergedClusters")[[2]]
  pvalCOTAN <- pValueFromDEA(deaCOTAN,
              numCells = getNumCells(dataset),method = "bonferroni")

  df.genes <- deaCOTAN[rowMin(as.matrix(pvalCOTAN)) < 0.05,]
  
  write.csv(df.genes,file.path(dirOut,paste0(file.code,"COTAN_DEA_genes.csv")))
  
  }

DEA genes for Seurat

for (ind in 1:dim(datasets_csv)[1]) {
  #print(ind)
  file.code <- paste0(datasets_csv$Group[ind],"_",datasets_csv$Collection[ind])
  dataset <- readRDS(file = file.path(dataSetDir,paste0(file.code,".RDS")))

  clusters <- getClusterizationData(dataset,clName = "mergedClusters")[[1]]
  deaSeurat <- Seurat.DEA(getRawData(dataset),
                          clusters.list = clusters,
                          project = file.code,
                          dirOut = dirOut)
  write.csv(deaSeurat$markers,
          file.path(dirOut,paste0(file.code,"Seurat_DEA_genes.csv")))
}

DEA genes for Seurat scTransform

for (ind in 1:dim(datasets_csv)[1]) {
  #print(ind)
  file.code <- paste0(datasets_csv$Group[ind],"_",datasets_csv$Collection[ind])
  dataset <- readRDS(file = file.path(dataSetDir,paste0(file.code,".RDS")))

  clusters <- getClusterizationData(dataset,clName = "mergedClusters")[[1]]
  deaSeurat <- Seurat.DEAScTransform(getRawData(dataset),
                          clusters.list = clusters,
                          project = file.code,
                          dirOut = dirOut)
  write.csv(deaSeurat$markers,
          file.path(dirOut,paste0(file.code,"Seurat_DEA_ScTransform_genes.csv")))
}

DEA genes for Seurat test: bimod

for (ind in 1:dim(datasets_csv)[1]) {
  #print(ind)
  file.code <- paste0(datasets_csv$Group[ind],"_",datasets_csv$Collection[ind])
  dataset <- readRDS(file = file.path(dataSetDir,paste0(file.code,".RDS")))

  clusters <- getClusterizationData(dataset,clName = "mergedClusters")[[1]]
  deaSeurat <- Seurat.DEA.bimod(getRawData(dataset),
                          clusters.list = clusters,
                          project = file.code,
                          dirOut = dirOut)
  write.csv(deaSeurat$markers,
          file.path(dirOut,paste0(file.code,"Seurat_DEA_Bimod_genes.csv")))
}

DEA from Monocle

for (ind in 1:dim(datasets_csv)[1]) {
  file.code <- paste0(datasets_csv$Group[ind],"_",datasets_csv$Collection[ind])
  dataset <- readRDS(file = file.path(dataSetDir,paste0(file.code,".RDS")))
  print(file.code)
  clusters <- getClusterizationData(dataset,clName = "mergedClusters")[[1]]
  
  deaMonocle <- Monocle.DEA(dataSet = getRawData(dataset),
                          clusters.list = clusters,
                          project = file.code,
                          dirOut = dirOut)
  
  # deaMonocleOut <- deaMonocle$fit_coefs[,c("num_cells_expressed", 
  #                                         "gene_id","term","estimate",
  #                                         "std_err", "test_val","p_value", 
  #                                         "normalized_effect","q_value")]
  
  
  write.csv(deaMonocle$marker_test_res,
          file.path(dirOut,paste0(file.code,"Monocle_DEA_genes.csv"))) 
  
  #rm(deaMonocleOut)
  rm(deaMonocle)
  gc()
}

DEA from ScamPy

for (ind in 1:dim(datasets_csv)[1]) {
  file.code <- paste0(datasets_csv$Group[ind],"_",datasets_csv$Collection[ind])
  dataset <- readRDS(file = file.path(dataSetDir,paste0(file.code,".RDS")))

  clusters <- getClusterizationData(dataset,clName = "mergedClusters")[[1]]
  
  deaScamPy <- ScamPy.DEA(dataSet = getRawData(dataset),
                          clusters.list = clusters,
                          project = file.code,
                          dirOut = dirOut)
  
  write.csv(deaScamPy,
          file.path(dirOut,paste0(file.code,"ScamPy_DEA_genes.csv"))) 
  
}

Footer

sessionInfo()

R version 4.3.2 (2023-10-31)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 20.04.6 LTS

Matrix products: default
BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0 
LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0

locale:
 [1] LC_CTYPE=C.UTF-8       LC_NUMERIC=C           LC_TIME=C.UTF-8       
 [4] LC_COLLATE=C.UTF-8     LC_MONETARY=C.UTF-8    LC_MESSAGES=C.UTF-8   
 [7] LC_PAPER=C.UTF-8       LC_NAME=C              LC_ADDRESS=C          
[10] LC_TELEPHONE=C         LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C   

time zone: Europe/Rome
tzcode source: system (glibc)

attached base packages:
[1] stats4    stats     graphics  grDevices utils     datasets  methods  
[8] base     

other attached packages:
 [1] dplyr_1.1.2                 stringr_1.5.0              
 [3] reticulate_1.35.0           monocle3_1.3.4             
 [5] SingleCellExperiment_1.22.0 SummarizedExperiment_1.30.2
 [7] GenomicRanges_1.52.0        GenomeInfoDb_1.36.1        
 [9] IRanges_2.34.1              S4Vectors_0.38.1           
[11] MatrixGenerics_1.12.3       matrixStats_1.2.0          
[13] Biobase_2.60.0              BiocGenerics_0.46.0        
[15] Seurat_5.0.0                SeuratObject_5.0.0         
[17] sp_2.1-1                    COTAN_2.3.3                

loaded via a namespace (and not attached):
  [1] RcppAnnoy_0.0.21          splines_4.3.2            
  [3] later_1.3.1               bitops_1.0-7             
  [5] tibble_3.2.1              polyclip_1.10-4          
  [7] fastDummies_1.7.3         lifecycle_1.0.3          
  [9] doParallel_1.0.17         globals_0.16.2           
 [11] lattice_0.22-5            MASS_7.3-60              
 [13] dendextend_1.17.1         magrittr_2.0.3           
 [15] plotly_4.10.2             rmarkdown_2.24           
 [17] yaml_2.3.7                httpuv_1.6.11            
 [19] sctransform_0.4.1         spam_2.10-0              
 [21] askpass_1.2.0             spatstat.sparse_3.0-2    
 [23] minqa_1.2.5               cowplot_1.1.1            
 [25] pbapply_1.7-2             RColorBrewer_1.1-3       
 [27] zlibbioc_1.46.0           abind_1.4-5              
 [29] Rtsne_0.16                purrr_1.0.1              
 [31] RCurl_1.98-1.12           GenomeInfoDbData_1.2.10  
 [33] circlize_0.4.15           ggrepel_0.9.3            
 [35] irlba_2.3.5.1             listenv_0.9.0            
 [37] spatstat.utils_3.0-3      terra_1.7-39             
 [39] umap_0.2.10.0             goftest_1.2-3            
 [41] RSpectra_0.16-1           spatstat.random_3.2-1    
 [43] dqrng_0.3.0               fitdistrplus_1.1-11      
 [45] parallelly_1.36.0         DelayedMatrixStats_1.22.5
 [47] leiden_0.4.3              codetools_0.2-19         
 [49] DelayedArray_0.26.7       tidyselect_1.2.0         
 [51] shape_1.4.6               lme4_1.1-34              
 [53] ScaledMatrix_1.8.1        viridis_0.6.4            
 [55] spatstat.explore_3.2-1    jsonlite_1.8.7           
 [57] GetoptLong_1.0.5          ellipsis_0.3.2           
 [59] progressr_0.14.0          ggridges_0.5.4           
 [61] survival_3.5-7            iterators_1.0.14         
 [63] foreach_1.5.2             tools_4.3.2              
 [65] ica_1.0-3                 Rcpp_1.0.11              
 [67] glue_1.7.0                gridExtra_2.3            
 [69] xfun_0.39                 ggthemes_5.0.0           
 [71] withr_3.0.0               fastmap_1.1.1            
 [73] boot_1.3-28               fansi_1.0.4              
 [75] openssl_2.1.0             digest_0.6.33            
 [77] rsvd_1.0.5                parallelDist_0.2.6       
 [79] R6_2.5.1                  mime_0.12                
 [81] colorspace_2.1-0          scattermore_1.2          
 [83] tensor_1.5                spatstat.data_3.0-1      
 [85] utf8_1.2.3                tidyr_1.3.0              
 [87] generics_0.1.3            data.table_1.15.0        
 [89] httr_1.4.6                htmlwidgets_1.6.2        
 [91] S4Arrays_1.2.0            uwot_0.1.16              
 [93] pkgconfig_2.0.3           gtable_0.3.3             
 [95] ComplexHeatmap_2.16.0     lmtest_0.9-40            
 [97] XVector_0.40.0            htmltools_0.5.7          
 [99] dotCall64_1.1-0           clue_0.3-64              
[101] scales_1.3.0              png_0.1-8                
[103] knitr_1.43                rstudioapi_0.15.0        
[105] reshape2_1.4.4            rjson_0.2.21             
[107] nloptr_2.0.3              nlme_3.1-163             
[109] zoo_1.8-12                GlobalOptions_0.1.2      
[111] KernSmooth_2.23-22        parallel_4.3.2           
[113] miniUI_0.1.1.1            RcppZiggurat_0.1.6       
[115] pillar_1.9.0              grid_4.3.2               
[117] vctrs_0.6.3               RANN_2.6.1               
[119] promises_1.2.0.1          BiocSingular_1.16.0      
[121] beachmat_2.16.0           xtable_1.8-4             
[123] cluster_2.1.6             evaluate_0.21            
[125] zeallot_0.1.0             cli_3.6.1                
[127] compiler_4.3.2            rlang_1.1.1              
[129] crayon_1.5.2              future.apply_1.11.0      
[131] plyr_1.8.8                stringi_1.8.1            
[133] viridisLite_0.4.2         deldir_2.0-2             
[135] BiocParallel_1.34.2       assertthat_0.2.1         
[137] munsell_0.5.0             lazyeval_0.2.2           
[139] spatstat.geom_3.2-4       PCAtools_2.14.0          
[141] Matrix_1.6-3              RcppHNSW_0.6.0           
[143] patchwork_1.2.0           sparseMatrixStats_1.12.2 
[145] future_1.33.0             ggplot2_3.4.2            
[147] shiny_1.8.0               ROCR_1.0-11              
[149] Rfast_2.1.0               igraph_1.6.0             
[151] RcppParallel_5.1.7