FDR analysis - define DE genes

Author

Silvia Giulia Galfrè

Published

February 27, 2024

Automatic functions

library(COTAN)
options(parallelly.fork.enable = TRUE)
library(Seurat)
library(monocle3)
library(reticulate)
library(stringr)
library(dplyr)

dirOut <- "Results/FDR/"
if (!dir.exists(dirOut)) {
  dir.create(dirOut)
}

dataSetDir <- "Data/MouseCortexFromLoom/FDR/MergedClusters_For_FDR/"
Seurat.DEA <- function(dataSet,clusters.list, project, dirOut#,percentage
                       ){
  pbmc <- CreateSeuratObject(counts = dataSet,
                             project = project, min.cells = 3, min.features = 20)
  
  stopifnot(length(clusters.list)==length(pbmc$orig.ident))
  
  
  pbmc <- NormalizeData(pbmc, normalization.method = "LogNormalize", scale.factor = 10000)
  pbmc <- FindVariableFeatures(pbmc, selection.method = "vst", nfeatures = 2000)
  all.genes <- rownames(pbmc)
  pbmc <- ScaleData(pbmc, features = all.genes)
  
  pbmc <- RunPCA(pbmc)
  pbmc <- RunUMAP(pbmc, dims = 1:20)
  
  pbmc@meta.data$TestCl <- NA
  pbmc@meta.data[names(clusters.list),]$TestCl <- factor(clusters.list)
  
  pbmc <- SetIdent(pbmc,value = "TestCl")
  
  pbmc.markers <- FindAllMarkers(pbmc, only.pos = TRUE)
  
  
  
  n.genes.DEA <- sum(pbmc.markers$p_val_adj < 0.05)
  
  print(n.genes.DEA)
  
  #write.csv(pbmc.markers, file = paste0(dirOut,project,"_de_genes_Seurat_",percentage,".csv"))
  
  return(list("n.genes.DEA"=n.genes.DEA,"markers"= pbmc.markers))
}


Seurat.DEA.bimod <- function(dataSet,clusters.list, project, dirOut#,percentage
                       ){
  pbmc <- CreateSeuratObject(counts = dataSet,
                             project = project, min.cells = 3, min.features = 20)
  
  stopifnot(length(clusters.list)==length(pbmc$orig.ident))
  
  
  pbmc <- NormalizeData(pbmc, normalization.method = "LogNormalize", scale.factor = 10000)
  pbmc <- FindVariableFeatures(pbmc, selection.method = "vst", nfeatures = 2000)
  all.genes <- rownames(pbmc)
  pbmc <- ScaleData(pbmc, features = all.genes)
  
  pbmc <- RunPCA(pbmc)
  pbmc <- RunUMAP(pbmc, dims = 1:20)
  
  pbmc@meta.data$TestCl <- NA
  pbmc@meta.data[names(clusters.list),]$TestCl <- factor(clusters.list)
  
  pbmc <- SetIdent(pbmc,value = "TestCl")
  
  pbmc.markers <- FindAllMarkers(pbmc, only.pos = TRUE,test.use = "bimod" )
  
  
  
  n.genes.DEA <- sum(pbmc.markers$p_val_adj < 0.05)
  
  print(n.genes.DEA)
  
  #write.csv(pbmc.markers, file = paste0(dirOut,project,"_de_genes_Seurat_",percentage,".csv"))
  
  return(list("n.genes.DEA"=n.genes.DEA,"markers"= pbmc.markers))
}

Seurat.DEAScTransform <- function(dataSet,clusters.list, project, dirOut#,percentage
                       ){
  pbmc <- CreateSeuratObject(counts = dataSet,
                             project = project, min.cells = 3, min.features = 20)
  
  stopifnot(length(clusters.list)==length(pbmc$orig.ident))

  # store mitochondrial percentage in object meta data
  pbmc <- PercentageFeatureSet(pbmc, pattern = "^mt-", col.name = "percent.mt")

# run sctransform
  pbmc <- SCTransform(pbmc, vars.to.regress = "percent.mt", verbose = FALSE)

  
  pbmc <- RunPCA(pbmc)
  pbmc <- RunUMAP(pbmc, dims = 1:20)
  
  pbmc@meta.data$TestCl <- NA
  pbmc@meta.data[names(clusters.list),]$TestCl <- factor(clusters.list)
  
  pbmc <- SetIdent(pbmc,value = "TestCl")
  
  pbmc.markers <- FindAllMarkers(pbmc, only.pos = TRUE)
  
  
  
  n.genes.DEA <- sum(pbmc.markers$p_val_adj < 0.05)
  
  print(n.genes.DEA)
  
  #write.csv(pbmc.markers, file = paste0(dirOut,project,"_de_genes_Seurat_",percentage,".csv"))
  
  return(list("n.genes.DEA"=n.genes.DEA,"markers"= pbmc.markers))
}




Monocle.DEA <- function(dataSet,clusters.list,project, dirOut#,percentage
                        ){
  
  cell_metadata = as.data.frame(clusters.list[colnames(dataSet)])
  colnames(cell_metadata) <- "Clusters"
  cds <- new_cell_data_set(dataSet[rowSums(dataSet) > 3,],
                         cell_metadata = cell_metadata
                         )
  colData(cds)$cluster <- clusters.list[rownames(colData(cds))]

  #cds <- preprocess_cds(cds, num_dim = 100)
  #cds <- reduce_dimension(cds)
  #cds <- cluster_cells(cds, resolution=1e-5)
  marker_test_res <- top_markers(cds, 
                                 group_cells_by="Clusters", 
                                 genes_to_test_per_group = dim(cds)[1],
                                  cores=10)
  
  # de_results <- fit_models(cds,model_formula_str = " ~ cluster",cores = 10,verbose = T)
  # fit_coefs <- coefficient_table(de_results)
  # 
  # fit_coefs <- fit_coefs %>% filter(grepl("cluster",term))
  # fit_coefs <- as.data.frame(fit_coefs)
  #write.csv(as.data.frame(fit_coefs[,c("num_cells_expressed" ,"gene_id","p_value","q_value")]), file = paste0(dirOut,project,"_de_genes_Monocle_",percentage,".csv"))
  
  return(list("n.genes.DEA"=sum(marker_test_res$marker_test_q_value < 0.05, na.rm = T),
              "marker_test_res"= marker_test_res
              ))
}


ScamPy.DEA <- function(dataSet,
                       clusters.list, 
                       project, 
                       dirOut,
                       percentage){
  pbmc <- CreateSeuratObject(counts = dataSet, project = project, min.cells = 3, min.features = 20)
  
  
  pbmc@meta.data$TestCl <- NA
  pbmc@meta.data[names(clusters.list),]$TestCl <- clusters.list
  
  exprs <- pbmc@assays$RNA$counts
  
  meta <- pbmc[[]]
  #feature_meta <- GetAssay(pbmc)[[]]
  tmp <- as.data.frame(matrix(data = NA, 
                              ncol = 1, 
                              nrow = nrow(pbmc@assays$RNA$counts)))
  rownames(tmp) <- rownames(pbmc@assays$RNA$counts)
  
  feature_meta <- tmp
  #embedding <- Embeddings(pbmc, "umap")
  
  Sys.setenv(RETICULATE_PYTHON = "../../../bin/python3")
  py <- import("sys")
  
  source_python("src/scanpyFDR.py")
  scanpyFDR(exprs, 
                   meta, 
                   feature_meta, 
                   "mt", 
                   dirOut, 
                   project)

  out <- read.csv(file = paste0(dirOut,
                                project,
                                "_Scampy_DEA_all_genes.csv"
                               ),
                  header = T,
                  row.names = 1)

  gc()

  return(out)
}

Cluster Gene Enrichment

datasets_csv <- read.csv(file.path(dataSetDir,"Cells_Usage_DataFrame.csv"),
                         row.names = 1
                        ) 

datasets_csv
                      Group
1      2_Clusters_even_near
2      2_Clusters_even_near
3      2_Clusters_even_near
4    2_Clusters_even_medium
5    2_Clusters_even_medium
6    2_Clusters_even_medium
7       2_Clusters_even_far
8       2_Clusters_even_far
9       2_Clusters_even_far
10   2_Clusters_uneven_near
11   2_Clusters_uneven_near
12   2_Clusters_uneven_near
13 2_Clusters_uneven_medium
14 2_Clusters_uneven_medium
15 2_Clusters_uneven_medium
16    2_Clusters_uneven_far
17    2_Clusters_uneven_far
18    2_Clusters_uneven_far
19          3_Clusters_even
20          3_Clusters_even
21          3_Clusters_even
22        3_Clusters_uneven
23        3_Clusters_uneven
24        3_Clusters_uneven
25        5_Clusters_uneven
26        5_Clusters_uneven
27        5_Clusters_uneven
                                                  Collection E13.5.432
1                                      E13.5-434_+_E15.0-428         0
2                                      E15.0-432_+_E13.5-432       536
3                                      E15.0-508_+_E15.0-509         0
4                                      E13.5-187_+_E13.5-184         0
5                                      E15.0-434_+_E17.5-516         0
6                                      E15.0-437_+_E15.0-508         0
7                                      E17.5-516_+_E13.5-187         0
8                                      E15.0-510_+_E13.5-437         0
9                                      E15.0-509_+_E13.5-184         0
10                                     E13.5-434_+_E15.0-428         0
11                                     E15.0-432_+_E13.5-432        66
12                                     E15.0-508_+_E15.0-509         0
13                                     E13.5-187_+_E13.5-184         0
14                                     E15.0-434_+_E17.5-516         0
15                                     E15.0-437_+_E15.0-508         0
16                                     E17.5-516_+_E13.5-187         0
17                                     E15.0-510_+_E13.5-437         0
18                                     E15.0-509_+_E13.5-184         0
19                         E15.0-437_+_E13.5-510_+_E13.5-437         0
20                         E17.5-505_+_E17.5-516_+_E13.5-437         0
21                         E15.0-510_+_E15.0-428_+_E13.5-510         0
22                         E15.0-428_+_E13.5-434_+_E15.0-510         0
23                         E13.5-187_+_E13.5-432_+_E15.0-432       168
24                         E15.0-509_+_E13.5-184_+_E15.0-508         0
25 E13.5-510_+_E15.0-437_+_E15.0-510_+_E13.5-432_+_E13.5-437       518
26 E15.0-428_+_E13.5-434_+_E15.0-434_+_E17.5-505_+_E13.5-184         0
27 E13.5-432_+_E15.0-509_+_E15.0-432_+_E13.5-187_+_E15.0-508       440
   E13.5.187 E13.5.434 E13.5.184 E13.5.437 E13.5.510 E15.0.432 E15.0.509
1          0       318         0         0         0         0         0
2          0         0         0         0         0       536         0
3          0         0         0         0         0         0       397
4        292         0       292         0         0         0         0
5          0         0         0         0         0         0         0
6          0         0         0         0         0         0         0
7        297         0         0         0         0         0         0
8          0         0         0       259         0         0         0
9          0         0       292         0         0         0       292
10         0       326         0         0         0         0         0
11         0         0         0         0         0       586         0
12         0         0         0         0         0         0       402
13       334         0        38         0         0         0         0
14         0         0         0         0         0         0         0
15         0         0         0         0         0         0         0
16       334         0         0         0         0         0         0
17         0         0         0        45         0         0         0
18         0         0        45         0         0         0       402
19         0         0         0       248       248         0         0
20         0         0         0       203         0         0         0
21         0         0         0         0       248         0         0
22         0       115         0         0         0         0         0
23        84         0         0         0         0       586         0
24         0         0        58         0         0         0       402
25         0         0         0       259        65         0         0
26         0       326       163         0         0         0         0
27        74         0         0         0         0       586       293
   E15.0.510 E15.0.508 E15.0.428 E15.0.434 E15.0.437 E17.5.516 E17.5.505
1          0         0       318         0         0         0         0
2          0         0         0         0         0         0         0
3          0       397         0         0         0         0         0
4          0         0         0         0         0         0         0
5          0         0         0       273         0       273         0
6          0       258         0         0       258         0         0
7          0         0         0         0         0       297         0
8        259         0         0         0         0         0         0
9          0         0         0         0         0         0         0
10         0         0        37         0         0         0         0
11         0         0         0         0         0         0         0
12         0        45         0         0         0         0         0
13         0         0         0         0         0         0         0
14         0         0         0        33         0       297         0
15         0       397         0         0        45         0         0
16         0         0         0         0         0        38         0
17       402         0         0         0         0         0         0
18         0         0         0         0         0         0         0
19         0         0         0         0       248         0         0
20         0         0         0         0         0       203       203
21       248         0       248         0         0         0         0
22       402         0        58         0         0         0         0
23         0         0         0         0         0         0         0
24         0       115         0         0         0         0         0
25       389         0         0         0        65         0         0
26         0         0       245        41         0         0        41
27         0        74         0         0         0         0         0

DEA genes for COTAN

for (ind in 1:dim(datasets_csv)[1]) {
  #print(ind)
  file.code <- paste0(datasets_csv$Group[ind],"_",datasets_csv$Collection[ind])
  dataset <- readRDS(file = file.path(dataSetDir,paste0(file.code,".RDS")))
  deaCOTAN <- getClusterizationData(dataset,clName = "mergedClusters")[[2]]
  pvalCOTAN <- pValueFromDEA(deaCOTAN,
              numCells = getNumCells(dataset),method = "bonferroni")

  df.genes <- deaCOTAN[rowMin(as.matrix(pvalCOTAN)) < 0.05,]
  
  write.csv(df.genes,file.path(dirOut,paste0(file.code,"COTAN_DEA_genes.csv")))
  
  }

DEA genes for Seurat

for (ind in 1:dim(datasets_csv)[1]) {
  #print(ind)
  file.code <- paste0(datasets_csv$Group[ind],"_",datasets_csv$Collection[ind])
  dataset <- readRDS(file = file.path(dataSetDir,paste0(file.code,".RDS")))

  clusters <- getClusterizationData(dataset,clName = "mergedClusters")[[1]]
  deaSeurat <- Seurat.DEA(getRawData(dataset),
                          clusters.list = clusters,
                          project = file.code,
                          dirOut = dirOut)
  write.csv(deaSeurat$markers,
          file.path(dirOut,paste0(file.code,"Seurat_DEA_genes.csv")))
}

DEA genes for Seurat scTransform

for (ind in 1:dim(datasets_csv)[1]) {
  #print(ind)
  file.code <- paste0(datasets_csv$Group[ind],"_",datasets_csv$Collection[ind])
  dataset <- readRDS(file = file.path(dataSetDir,paste0(file.code,".RDS")))

  clusters <- getClusterizationData(dataset,clName = "mergedClusters")[[1]]
  deaSeurat <- Seurat.DEAScTransform(getRawData(dataset),
                          clusters.list = clusters,
                          project = file.code,
                          dirOut = dirOut)
  write.csv(deaSeurat$markers,
          file.path(dirOut,paste0(file.code,"Seurat_DEA_ScTransform_genes.csv")))
}

DEA genes for Seurat test: bimod

for (ind in 1:dim(datasets_csv)[1]) {
  #print(ind)
  file.code <- paste0(datasets_csv$Group[ind],"_",datasets_csv$Collection[ind])
  dataset <- readRDS(file = file.path(dataSetDir,paste0(file.code,".RDS")))

  clusters <- getClusterizationData(dataset,clName = "mergedClusters")[[1]]
  deaSeurat <- Seurat.DEA.bimod(getRawData(dataset),
                          clusters.list = clusters,
                          project = file.code,
                          dirOut = dirOut)
  write.csv(deaSeurat$markers,
          file.path(dirOut,paste0(file.code,"Seurat_DEA_Bimod_genes.csv")))
}

DEA from Monocle

for (ind in 1:dim(datasets_csv)[1]) {
  file.code <- paste0(datasets_csv$Group[ind],"_",datasets_csv$Collection[ind])
  dataset <- readRDS(file = file.path(dataSetDir,paste0(file.code,".RDS")))
  print(file.code)
  clusters <- getClusterizationData(dataset,clName = "mergedClusters")[[1]]
  
  deaMonocle <- Monocle.DEA(dataSet = getRawData(dataset),
                          clusters.list = clusters,
                          project = file.code,
                          dirOut = dirOut)
  
  # deaMonocleOut <- deaMonocle$fit_coefs[,c("num_cells_expressed", 
  #                                         "gene_id","term","estimate",
  #                                         "std_err", "test_val","p_value", 
  #                                         "normalized_effect","q_value")]
  
  
  write.csv(deaMonocle$marker_test_res,
          file.path(dirOut,paste0(file.code,"Monocle_DEA_genes.csv"))) 
  
  #rm(deaMonocleOut)
  rm(deaMonocle)
  gc()
}

DEA from ScamPy

for (ind in 1:dim(datasets_csv)[1]) {
  file.code <- paste0(datasets_csv$Group[ind],"_",datasets_csv$Collection[ind])
  dataset <- readRDS(file = file.path(dataSetDir,paste0(file.code,".RDS")))

  clusters <- getClusterizationData(dataset,clName = "mergedClusters")[[1]]
  
  deaScamPy <- ScamPy.DEA(dataSet = getRawData(dataset),
                          clusters.list = clusters,
                          project = file.code,
                          dirOut = dirOut)
  
  write.csv(deaScamPy,
          file.path(dirOut,paste0(file.code,"ScamPy_DEA_genes.csv"))) 
  
}