CD14 Data-set Anaysis with cleaning

#library(SingleCellExperiment)
#library(DuoClustering2018)
#library(tidyr)
library(ggplot2)
#library(ggsankey) # remotes::install_github("davidsjoberg/ggsankey")
library(tibble)
library(zeallot)
library(COTAN)
options(parallelly.fork.enable = TRUE)

outDir <- "Data/CD14Cleaned/"

setLoggingLevel(2)
setLoggingFile(file.path(outDir, "cd14_analysis.log"))
library(Seurat)
dataset <- Read10X(file.path(outDir, "/OrigialDatahg19"))
sampleCondition <- "CD14_Monocytes"
cd14Obj <- COTAN(raw = dataset)
cd14Obj <- initializeMetaDataset(cd14Obj,
                                 GEO = "10X CD14+ Monocytes",
                                 sequencingMethod = "10X",
                                 sampleCondition = sampleCondition)

sampleCondition
[1] "CD14_Monocytes"

Inspect cells’ sizes

cellSizePlot(cd14Obj, splitPattern = "-", numCol = 2)

Drop cells with too many reads as they are probably doublets

cellsSizeThr <- 2500
cd14Obj <- addElementToMetaDataset(cd14Obj, "Cells size threshold", cellsSizeThr)

cells_to_rem <- getCells(cd14Obj)[getCellsSize(cd14Obj) > cellsSizeThr]
cd14Obj <- dropGenesCells(cd14Obj, cells = cells_to_rem)

cellSizePlot(cd14Obj, splitPattern = "-", numCol = 2)

Inspect the number of expressed genes per cell

genesSizePlot(cd14Obj, splitPattern = "-", numCol = 2)

Drop cells with too high genes expression as they are probably doublets

genesSizeThr <- 800
cd14Obj <- addElementToMetaDataset(cd14Obj, "Num genes threshold", genesSizeThr)

numExprGenes <- getNumExpressedGenes(cd14Obj)
cells_to_rem <- names(numExprGenes)[numExprGenes > genesSizeThr]
cd14Obj <- dropGenesCells(cd14Obj, cells = cells_to_rem)

genesSizePlot(cd14Obj, splitPattern = "-", numCol = 2)

Check number of mitochondrial genes expressed in each cell

mitGenesPattern <- "^[Mm][Tt]-"
getGenes(cd14Obj)[grep(mitGenesPattern, getGenes(cd14Obj))]
 [1] "MT-ND1"  "MT-ND2"  "MT-CO1"  "MT-CO2"  "MT-ATP8" "MT-ATP6" "MT-CO3" 
 [8] "MT-ND3"  "MT-ND4L" "MT-ND4"  "MT-ND5"  "MT-ND6"  "MT-CYB" 
c(mitPlot, mitSizes) %<-%
  mitochondrialPercentagePlot(cd14Obj, genePrefix = mitGenesPattern,
                              splitPattern = "-", numCol = 2)

plot(mitPlot)

Cells with a too high percentage of mitochondrial genes are likely dead (or at the last problematic) cells. So we drop them!

mitPercThr <- 7
cd14Obj <- addElementToMetaDataset(cd14Obj, "Mitoc. perc. threshold", mitPercThr)

cells_to_rem <- rownames(mitSizes)[mitSizes[["mit.percentage"]] > mitPercThr]

cd14Obj <- dropGenesCells(cd14Obj, cells = cells_to_rem)

c(mitPlot, mitSizes) %<-%
  mitochondrialPercentagePlot(cd14Obj, genePrefix = mitGenesPattern,
                              splitPattern = "-", numCol = 2)

plot(mitPlot)

Check the number of ribosomal genes expressed in each cell

ribGenesPattern <- "^RP[SL]\\d+"
getGenes(cd14Obj)[grep(ribGenesPattern, getGenes(cd14Obj))]
  [1] "RPL22"          "RPL11"          "RPS6KA1"        "RPS8"          
  [5] "RPL5"           "RPS27"          "RPS10P7"        "RPS6KC1"       
  [9] "RPS7"           "RPS27A"         "RPL31"          "RPL37A"        
 [13] "RPL32"          "RPL15"          "RPL14"          "RPL29"         
 [17] "RPL24"          "RPL22L1"        "RPL39L"         "RPL35A"        
 [21] "RPL9"           "RPL34-AS1"      "RPL34"          "RPS3A"         
 [25] "RPL37"          "RPS23"          "RPS14"          "RPL26L1"       
 [29] "RPS18"          "RPS10-NUDT3"    "RPS10"          "RPL10A"        
 [33] "RPL7L1"         "RPS12"          "RPS6KA2"        "RPS6KA2-AS1"   
 [37] "RPS6KA3"        "RPS4X"          "RPS6KA6"        "RPL36A"        
 [41] "RPL36A-HNRNPH2" "RPL39"          "RPL10"          "RPS20"         
 [45] "RPL7"           "RPL30"          "RPL8"           "RPS6"          
 [49] "RPL35"          "RPL12"          "RPL7A"          "RPS24"         
 [53] "RPL27A"         "RPS13"          "RPS6KA4"        "RPS6KB2"       
 [57] "RPS3"           "RPS25"          "RPS26"          "RPL41"         
 [61] "RPL6"           "RPL21"          "RPL10L"         "RPS29"         
 [65] "RPL36AL"        "RPS6KL1"        "RPS6KA5"        "RPS27L"        
 [69] "RPL4"           "RPS17"          "RPS17L"         "RPL3L"         
 [73] "RPS2"           "RPS15A"         "RPL13"          "RPL26"         
 [77] "RPL23A"         "RPL23"          "RPL19"          "RPL27"         
 [81] "RPS6KB1"        "RPL38"          "RPL17-C18orf32" "RPL17"         
 [85] "RPS21"          "RPS15"          "RPL36"          "RPS28"         
 [89] "RPL18A"         "RPS16"          "RPS19"          "RPL18"         
 [93] "RPL13A"         "RPS11"          "RPS9"           "RPL28"         
 [97] "RPS5"           "RPS4Y1"         "RPS4Y2"         "RPL3"          
[101] "RPS19BP1"      
c(ribPlot, ribSizes) %<-%
  mitochondrialPercentagePlot(cd14Obj, genePrefix = mitGenesPattern,
                              splitPattern = "-", numCol = 2)

plot(ribPlot)

Check no further outliers after all the culling

cellSizePlot(cd14Obj, splitPattern = "-", numCol = 2)

genesSizePlot(cd14Obj, splitPattern = "-", numCol = 2)

Clean: round 1

cd14Obj <- clean(cd14Obj)

c(pcaCellsPlot, pcaCellsData, genesPlot, UDEPlot, nuPlot, zoomedNuPlot) %<-% cleanPlots(cd14Obj)

plot(pcaCellsPlot)

plot(genesPlot)

cd14Obj <- addElementToMetaDataset(cd14Obj, "Num drop B group", 0)

Visualize if all is ok:

plot(UDEPlot)

plot(nuPlot)

plot(zoomedNuPlot)

Final cleaning to check all is OK

cd14Obj <- clean(cd14Obj)

c(pcaCellsPlot, pcaCellsData, genesPlot, UDEPlot, nuPlot, zoomedNuPlot) %<-% cleanPlots(cd14Obj)

plot(pcaCellsPlot)

plot(pcaCellsData)

plot(genesPlot)

plot(UDEPlot)

plot(nuPlot)

plot(zoomedNuPlot)

plot(cellSizePlot(cd14Obj, splitPattern = "-", numCol = 2))

plot(genesSizePlot(cd14Obj, splitPattern = "-", numCol = 2))

Calculate genes’ COEX

Sys.time()

cd14Obj <- proceedToCoex(cd14Obj, calcCoex = TRUE, cores = 12,
                         saveObj = TRUE, outDir = outDir)
cd14Obj <- readRDS(file = file.path(outDir, paste0(sampleCondition, ".cotan.RDS")))
gdiData <- calculateGDI(cd14Obj)

genesToLabel <- head(rownames(gdiData[order(gdiData[["GDI"]],
                                            decreasing = TRUE), ]), n = 50L)

sort(genesToLabel)
 [1] "ABI3"      "ARHGDIB"   "ATP5A1"    "ATP6V1G1"  "CALM2"     "CD53"     
 [7] "CD7"       "CLEC10A"   "CPVL"      "CTSW"      "FCER1A"    "FKBP1A"   
[13] "H2AFY"     "HLA-DMB"   "HLA-DPA1"  "HLA-DPB1"  "HLA-DQA1"  "HLA-DQA2" 
[19] "HLA-DQB1"  "HLA-DRA"   "HMGN1"     "HNRNPA0"   "HNRNPA1"   "HNRNPA2B1"
[25] "IFITM1"    "IL2RG"     "IL32"      "LDHA"      "LDHB"      "MYL12A"   
[31] "NPM1"      "PARK7"     "PDIA6"     "PGK1"      "PTPRCAP"   "RAB7A"    
[37] "RBM3"      "RHOA"      "RHOC"      "RPL5"      "S100A6"    "S100A8"   
[43] "S100A9"    "SELL"      "SLC25A5"   "TMEM230"   "TMEM66"    "TUBA1B"   
[49] "TYROBP"    "YBX1"     
gdiData[genesToLabel[[50L]], "GDI"]
[1] 1.606624
gdiPlot <- GDIPlot(cd14Obj, GDIIn = gdiData, GDIThreshold = 1.4,
                   genes = list("Top 10 GDI genes" = genesToLabel[1L:10L]))

plot(gdiPlot)

Sys.time()
[1] "2024-05-10 19:18:01 CEST"

Save the COTAN object

saveRDS(cd14Obj, file = file.path(outDir, paste0(sampleCondition, ".cotan.RDS")))
cd14Obj <- readRDS(file = file.path(outDir, paste0(sampleCondition, ".cotan.RDS")))
c(splitClusters, splitCoexDF) %<-%
  cellsUniformClustering(cd14Obj, GDIThreshold = 1.43, cores = 13,
                         saveObj = TRUE, outDir = outDir)

cd14Obj <- addClusterization(cd14Obj, clName = "split",
                             clusters = splitClusters,
                             coexDF = splitCoexDF, override = TRUE)
splitClusters <- getClusterizationData(cd14Obj, clName = "split")[[1]]

table(splitClusters)
splitClusters
  1   2   3   4   5 
878  38 637 837  48 
saveRDS(cd14Obj, file = file.path(outDir, paste0(sampleCondition, ".cotan.RDS")))
c(mergedClusters, mergedCoexDF) %<-%
  mergeUniformCellsClusters(cd14Obj, clusters = splitClusters,
                            GDIThreshold = 1.43, cores = 13,
                            saveObj = TRUE, outDir = outDir)

cd14Obj <- addClusterization(cd14Obj, clName = "merge",
                             clusters = mergedClusters,
                             coexDF = mergedCoexDF,
                             override = TRUE)
mergedClusters <- getClusterizationData(cd14Obj, clName = "merge")[[1]]

table(mergedClusters)
mergedClusters
   1    2    3 
1522  878   38 
saveRDS(cd14Obj, file = file.path(outDir, paste0(sampleCondition, ".cotan.RDS")))
sessionInfo()
R version 4.3.2 (2023-10-31)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 20.04.6 LTS

Matrix products: default
BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3 
LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/liblapack.so.3;  LAPACK version 3.9.0

locale:
 [1] LC_CTYPE=C.UTF-8       LC_NUMERIC=C           LC_TIME=C.UTF-8       
 [4] LC_COLLATE=C.UTF-8     LC_MONETARY=C.UTF-8    LC_MESSAGES=C.UTF-8   
 [7] LC_PAPER=C.UTF-8       LC_NAME=C              LC_ADDRESS=C          
[10] LC_TELEPHONE=C         LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C   

time zone: Europe/Rome
tzcode source: system (glibc)

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] Seurat_5.0.0       SeuratObject_5.0.0 sp_2.1-1           COTAN_2.5.0       
[5] zeallot_0.1.0      tibble_3.2.1       ggplot2_3.5.0     

loaded via a namespace (and not attached):
  [1] RcppAnnoy_0.0.21          splines_4.3.2            
  [3] later_1.3.1               R.oo_1.26.0              
  [5] polyclip_1.10-4           fastDummies_1.7.3        
  [7] lifecycle_1.0.3           doParallel_1.0.17        
  [9] globals_0.16.2            lattice_0.22-5           
 [11] MASS_7.3-60               dendextend_1.17.1        
 [13] magrittr_2.0.3            plotly_4.10.2            
 [15] rmarkdown_2.24            yaml_2.3.7               
 [17] httpuv_1.6.11             sctransform_0.4.1        
 [19] spam_2.10-0               askpass_1.2.0            
 [21] spatstat.sparse_3.0-2     reticulate_1.36.1        
 [23] cowplot_1.1.1             pbapply_1.7-2            
 [25] RColorBrewer_1.1-3        abind_1.4-5              
 [27] Rtsne_0.17                purrr_1.0.1              
 [29] R.utils_2.12.2            BiocGenerics_0.46.0      
 [31] circlize_0.4.15           IRanges_2.34.1           
 [33] S4Vectors_0.38.1          ggrepel_0.9.5            
 [35] irlba_2.3.5.1             listenv_0.9.0            
 [37] spatstat.utils_3.0-3      umap_0.2.10.0            
 [39] goftest_1.2-3             RSpectra_0.16-1          
 [41] spatstat.random_3.2-1     dqrng_0.3.0              
 [43] fitdistrplus_1.1-11       parallelly_1.37.1        
 [45] DelayedMatrixStats_1.22.5 leiden_0.4.3             
 [47] codetools_0.2-19          DelayedArray_0.26.7      
 [49] tidyselect_1.2.0          shape_1.4.6              
 [51] farver_2.1.1              ScaledMatrix_1.8.1       
 [53] viridis_0.6.4             matrixStats_1.2.0        
 [55] stats4_4.3.2              spatstat.explore_3.2-1   
 [57] jsonlite_1.8.7            GetoptLong_1.0.5         
 [59] ellipsis_0.3.2            progressr_0.14.0         
 [61] ggridges_0.5.4            survival_3.5-8           
 [63] iterators_1.0.14          foreach_1.5.2            
 [65] tools_4.3.2               ica_1.0-3                
 [67] Rcpp_1.0.11               glue_1.7.0               
 [69] gridExtra_2.3             xfun_0.39                
 [71] MatrixGenerics_1.12.3     ggthemes_5.1.0           
 [73] dplyr_1.1.2               withr_3.0.0              
 [75] fastmap_1.1.1             fansi_1.0.4              
 [77] openssl_2.1.0             digest_0.6.33            
 [79] rsvd_1.0.5                parallelDist_0.2.6       
 [81] R6_2.5.1                  mime_0.12                
 [83] colorspace_2.1-0          scattermore_1.2          
 [85] tensor_1.5                spatstat.data_3.0-1      
 [87] R.methodsS3_1.8.2         utf8_1.2.3               
 [89] tidyr_1.3.0               generics_0.1.3           
 [91] data.table_1.15.0         httr_1.4.6               
 [93] htmlwidgets_1.6.2         S4Arrays_1.2.0           
 [95] uwot_0.1.16               pkgconfig_2.0.3          
 [97] gtable_0.3.3              ComplexHeatmap_2.16.0    
 [99] lmtest_0.9-40             htmltools_0.5.8          
[101] dotCall64_1.1-0           clue_0.3-64              
[103] scales_1.3.0              png_0.1-8                
[105] knitr_1.43                rstudioapi_0.15.0        
[107] reshape2_1.4.4            rjson_0.2.21             
[109] nlme_3.1-163              zoo_1.8-12               
[111] GlobalOptions_0.1.2       stringr_1.5.0            
[113] KernSmooth_2.23-22        parallel_4.3.2           
[115] miniUI_0.1.1.1            RcppZiggurat_0.1.6       
[117] pillar_1.9.0              grid_4.3.2               
[119] vctrs_0.6.3               RANN_2.6.1               
[121] promises_1.2.0.1          BiocSingular_1.16.0      
[123] beachmat_2.16.0           xtable_1.8-4             
[125] cluster_2.1.6             evaluate_0.21            
[127] cli_3.6.1                 compiler_4.3.2           
[129] rlang_1.1.1               crayon_1.5.2             
[131] future.apply_1.11.0       labeling_0.4.2           
[133] plyr_1.8.8                stringi_1.8.1            
[135] viridisLite_0.4.2         deldir_2.0-2             
[137] BiocParallel_1.34.2       assertthat_0.2.1         
[139] munsell_0.5.0             lazyeval_0.2.2           
[141] spatstat.geom_3.2-4       PCAtools_2.14.0          
[143] Matrix_1.6-3              RcppHNSW_0.6.0           
[145] patchwork_1.2.0           sparseMatrixStats_1.12.2 
[147] future_1.33.0             shiny_1.8.0              
[149] ROCR_1.0-11               Rfast_2.1.0              
[151] igraph_2.0.3              RcppParallel_5.1.7