Filtering of PBMC2 using COTAN

Published

January 19, 2024

Library import

library(dplyr)
library(COTAN)
library(Seurat)
library(tibble)
library(ggplot2)
library(zeallot)
library(DropletUtils)

Settings

datasetName = 'PBMC2'
datasetFolder = './Data/'

inDir  = paste(datasetFolder, datasetName, '/raw/10X/', sep='')
outDir = paste(datasetFolder, datasetName, '/filtered/', sep='')
dir10X = paste(outDir, '10X/', sep='')

if (!dir.exists(outDir)) {
  dir.create(outDir, recursive = TRUE, showWarnings = FALSE)
}

setLoggingLevel(2)
setLoggingFile(paste(outDir, "logfile.log", sep=""))
options(parallelly.fork.enable = TRUE)

Data loading

dataset = Read10X(data.dir = inDir, strip.suffix = TRUE)
dataset = dataset[[1]]
sampleCond <- datasetName
PBMC2 <- COTAN(raw = dataset)
PBMC2 <- initializeMetaDataset(
  PBMC2,
  GEO = paste("10X ", datasetName, sep=""),
  sequencingMethod = "10X",
  sampleCond = sampleCond
)

Inspect cells’ sizes

cellSizePlot(PBMC2)

Drop cells with too many reads as they are probably doublets

cellsSizeThr <- 20000
PBMC2 <- addElementToMetaDataset(PBMC2, "Cells size threshold", cellsSizeThr)

cellsToRem <- getCells(PBMC2)[getCellsSize(PBMC2) > cellsSizeThr]
PBMC2 <- dropGenesCells(PBMC2, cells = cellsToRem)

cellSizePlot(PBMC2, splitPattern = "-", numCol = 2)

Inspect the number of expressed genes per cell

genesSizePlot(PBMC2, splitPattern = "-", numCol = 2)

Drop cells with too high genes expression as they are probably doublets

geneSizeThr <- 3500
PBMC2 <- addElementToMetaDataset(PBMC2, "Num genes threshold", geneSizeThr)

numExprGenes <- getNumExpressedGenes(PBMC2)
cellsToRem <- names(numExprGenes)[numExprGenes > geneSizeThr]
PBMC2 <- dropGenesCells(PBMC2, cells = cellsToRem)

genesSizePlot(PBMC2, splitPattern = "-", numCol = 2)

Check number of mithocondrial genes expressed in each cell

mitGenesPattern <- "^[Mm][Tt]-"
getGenes(PBMC2)[grep(mitGenesPattern, getGenes(PBMC2))]
 [1] "MT-ND1"  "MT-ND2"  "MT-CO1"  "MT-CO2"  "MT-ATP8" "MT-ATP6" "MT-CO3" 
 [8] "MT-ND3"  "MT-ND4L" "MT-ND4"  "MT-ND5"  "MT-ND6"  "MT-CYB" 
c(mitPlot, mitSizes) %<-%
  mitochondrialPercentagePlot(PBMC2, genePrefix = mitGenesPattern,
                              splitPattern = "-", numCol = 2)

plot(mitPlot)

We drop cells with a too high percentage of mitocondrial genes (are likely dead)

mitPercThr <- 10
PBMC2 <- addElementToMetaDataset(PBMC2, "Mitoc. perc. threshold", mitPercThr)

cellsToRem <- rownames(mitSizes)[mitSizes[["mit.percentage"]] > mitPercThr]

PBMC2 <- dropGenesCells(PBMC2, cells = cellsToRem)

c(mitPlot, mitSizes) %<-%
  mitochondrialPercentagePlot(PBMC2, genePrefix = mitGenesPattern,
                              splitPattern = "-", numCol = 2)

plot(mitPlot)

Check number of ribosomial genes expressed in each cell

ribGenesPattern <- "^RP[SL]\\d+"
getGenes(PBMC2)[grep(ribGenesPattern, getGenes(PBMC2))]
 [1] "RPL22"       "RPL11"       "RPS6KA1"     "RPS8"        "RPL5"       
 [6] "RPS27"       "RPS6KC1"     "RPS7"        "RPS27A"      "RPL31"      
[11] "RPL37A"      "RPL32"       "RPL15"       "RPL14"       "RPL29"      
[16] "RPL24"       "RPL22L1"     "RPL39L"      "RPL35A"      "RPL9"       
[21] "RPL34-AS1"   "RPL34"       "RPS3A"       "RPL37"       "RPS23"      
[26] "RPS14"       "RPL26L1"     "RPS18"       "RPS10-NUDT3" "RPS10"      
[31] "RPL10A"      "RPL7L1"      "RPS12"       "RPS6KA2"     "RPS6KA2-IT1"
[36] "RPS6KA2-AS1" "RPS20"       "RPL7"        "RPL30"       "RPL8"       
[41] "RPS6"        "RPL35"       "RPL12"       "RPL7A"       "RPS24"      
[46] "RPL27A"      "RPS13"       "RPS6KA4"     "RPS6KB2"     "RPS6KB2-AS1"
[51] "RPS3"        "RPS25"       "RPS26"       "RPL41"       "RPL6"       
[56] "RPL21"       "RPL10L"      "RPS29"       "RPL36AL"     "RPS6KL1"    
[61] "RPS6KA5"     "RPS27L"      "RPL4"        "RPS17"       "RPL3L"      
[66] "RPS2"        "RPS15A"      "RPL13"       "RPL26"       "RPL23A"     
[71] "RPL23"       "RPL19"       "RPL27"       "RPS6KB1"     "RPL38"      
[76] "RPL17"       "RPS15"       "RPL36"       "RPS28"       "RPL18A"     
[81] "RPS16"       "RPS19"       "RPL18"       "RPL13A"      "RPS11"      
[86] "RPS9"        "RPL28"       "RPS5"        "RPS21"       "RPL3"       
[91] "RPS19BP1"    "RPS6KA3"     "RPS4X"       "RPS6KA6"     "RPL36A"     
[96] "RPL39"       "RPL10"       "RPS4Y1"      "RPS4Y2"     
c(ribPlot, ribSizes) %<-%
  mitochondrialPercentagePlot(PBMC2, genePrefix = ribGenesPattern,
                              splitPattern = "-", numCol = 2)

plot(ribPlot)

Check no further outliers after all the culling

cellSizePlot(PBMC2, splitPattern = "-", numCol = 2)

genesSizePlot(PBMC2, splitPattern = "-", numCol = 2)

Cleaning, round 1

PBMC2 <- clean(PBMC2)

c(pcaCellsPlot, pcaCellsData, genesPlot, UDEPlot, nuPlot, zoomedNuPlot) %<-% cleanPlots(PBMC2)

plot(pcaCellsPlot)

plot(genesPlot)

PBMC2 <- addElementToMetaDataset(PBMC2, "Num drop B group", 0)
plot(UDEPlot)

plot(nuPlot)

plot(zoomedNuPlot)  

yset=0.16
nuDf <- data.frame("nu" = sort(getNu(PBMC2)), "n" = seq_along(getNu(PBMC2)))
PBMC2 <- addElementToMetaDataset(PBMC2, "Threshold low UDE cells:", yset)
cellsToRem <-rownames(nuDf)[nuDf[["nu"]] < yset]
PBMC2 <- dropGenesCells(PBMC2, cells = cellsToRem)

Cleaning, round 2

PBMC2 <- clean(PBMC2)

c(pcaCellsPlot, pcaCellsData, genesPlot, UDEPlot, nuPlot, zoomedNuPlot) %<-% cleanPlots(PBMC2)

plot(pcaCellsPlot)

plot(pcaCellsData)

plot(genesPlot)

plot(UDEPlot)

plot(nuPlot)

plot(zoomedNuPlot)

plot(cellSizePlot(PBMC2, splitPattern = "-", numCol = 2))

plot(genesSizePlot(PBMC2, splitPattern = "-", numCol = 2))

Save the filtered dataset

if (!dir.exists(dir10X)) {
  write10xCounts(dir10X, getRawData(PBMC2))
}
saveRDS(PBMC2, file = paste0(outDir, sampleCond, ".cotan.RDS"))
sessionInfo()
R version 4.3.2 (2023-10-31)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 20.04.6 LTS

Matrix products: default
BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0 
LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0

locale:
 [1] LC_CTYPE=C.UTF-8       LC_NUMERIC=C           LC_TIME=C.UTF-8       
 [4] LC_COLLATE=C.UTF-8     LC_MONETARY=C.UTF-8    LC_MESSAGES=C.UTF-8   
 [7] LC_PAPER=C.UTF-8       LC_NAME=C              LC_ADDRESS=C          
[10] LC_TELEPHONE=C         LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C   

time zone: Europe/Rome
tzcode source: system (glibc)

attached base packages:
[1] stats4    stats     graphics  grDevices utils     datasets  methods  
[8] base     

other attached packages:
 [1] DropletUtils_1.20.0         SingleCellExperiment_1.22.0
 [3] SummarizedExperiment_1.30.2 Biobase_2.60.0             
 [5] GenomicRanges_1.52.0        GenomeInfoDb_1.36.1        
 [7] IRanges_2.34.1              S4Vectors_0.38.1           
 [9] BiocGenerics_0.46.0         MatrixGenerics_1.12.3      
[11] matrixStats_1.2.0           zeallot_0.1.0              
[13] ggplot2_3.4.2               tibble_3.2.1               
[15] Seurat_5.0.0                SeuratObject_5.0.0         
[17] sp_2.1-1                    COTAN_2.3.0                
[19] dplyr_1.1.2                

loaded via a namespace (and not attached):
  [1] RcppAnnoy_0.0.21          splines_4.3.2            
  [3] later_1.3.1               bitops_1.0-7             
  [5] R.oo_1.25.0               polyclip_1.10-4          
  [7] fastDummies_1.7.3         lifecycle_1.0.3          
  [9] edgeR_3.42.4              doParallel_1.0.17        
 [11] globals_0.16.2            lattice_0.22-5           
 [13] MASS_7.3-60               dendextend_1.17.1        
 [15] magrittr_2.0.3            limma_3.56.2             
 [17] plotly_4.10.2             rmarkdown_2.24           
 [19] yaml_2.3.7                httpuv_1.6.11            
 [21] sctransform_0.4.1         spam_2.10-0              
 [23] askpass_1.2.0             spatstat.sparse_3.0-2    
 [25] reticulate_1.34.0         cowplot_1.1.1            
 [27] pbapply_1.7-2             RColorBrewer_1.1-3       
 [29] zlibbioc_1.46.0           abind_1.4-5              
 [31] Rtsne_0.16                R.utils_2.12.2           
 [33] purrr_1.0.1               RCurl_1.98-1.12          
 [35] circlize_0.4.15           GenomeInfoDbData_1.2.10  
 [37] ggrepel_0.9.3             irlba_2.3.5.1            
 [39] listenv_0.9.0             spatstat.utils_3.0-3     
 [41] umap_0.2.10.0             goftest_1.2-3            
 [43] RSpectra_0.16-1           spatstat.random_3.2-1    
 [45] dqrng_0.3.0               fitdistrplus_1.1-11      
 [47] parallelly_1.36.0         DelayedMatrixStats_1.22.5
 [49] leiden_0.4.3              codetools_0.2-19         
 [51] DelayedArray_0.26.7       scuttle_1.10.2           
 [53] tidyselect_1.2.0          shape_1.4.6              
 [55] farver_2.1.1              ScaledMatrix_1.8.1       
 [57] viridis_0.6.4             spatstat.explore_3.2-1   
 [59] jsonlite_1.8.7            GetoptLong_1.0.5         
 [61] ellipsis_0.3.2            progressr_0.14.0         
 [63] ggridges_0.5.4            survival_3.5-7           
 [65] iterators_1.0.14          foreach_1.5.2            
 [67] tools_4.3.2               ica_1.0-3                
 [69] Rcpp_1.0.11               glue_1.6.2               
 [71] gridExtra_2.3             xfun_0.39                
 [73] ggthemes_5.0.0            HDF5Array_1.28.1         
 [75] withr_2.5.0               fastmap_1.1.1            
 [77] rhdf5filters_1.12.1       fansi_1.0.4              
 [79] openssl_2.1.0             digest_0.6.33            
 [81] rsvd_1.0.5                parallelDist_0.2.6       
 [83] R6_2.5.1                  mime_0.12                
 [85] colorspace_2.1-0          scattermore_1.2          
 [87] tensor_1.5                spatstat.data_3.0-1      
 [89] R.methodsS3_1.8.2         utf8_1.2.3               
 [91] tidyr_1.3.0               generics_0.1.3           
 [93] data.table_1.14.8         httr_1.4.6               
 [95] htmlwidgets_1.6.2         S4Arrays_1.2.0           
 [97] uwot_0.1.16               pkgconfig_2.0.3          
 [99] gtable_0.3.3              ComplexHeatmap_2.16.0    
[101] lmtest_0.9-40             XVector_0.40.0           
[103] htmltools_0.5.7           dotCall64_1.1-0          
[105] clue_0.3-64               scales_1.3.0             
[107] png_0.1-8                 knitr_1.43               
[109] rstudioapi_0.15.0         reshape2_1.4.4           
[111] rjson_0.2.21              nlme_3.1-163             
[113] rhdf5_2.44.0              zoo_1.8-12               
[115] GlobalOptions_0.1.2       stringr_1.5.0            
[117] KernSmooth_2.23-22        parallel_4.3.2           
[119] miniUI_0.1.1.1            RcppZiggurat_0.1.6       
[121] pillar_1.9.0              grid_4.3.2               
[123] vctrs_0.6.3               RANN_2.6.1               
[125] promises_1.2.0.1          BiocSingular_1.16.0      
[127] beachmat_2.16.0           xtable_1.8-4             
[129] cluster_2.1.6             evaluate_0.21            
[131] locfit_1.5-9.8            cli_3.6.1                
[133] compiler_4.3.2            rlang_1.1.1              
[135] crayon_1.5.2              future.apply_1.11.0      
[137] labeling_0.4.2            plyr_1.8.8               
[139] stringi_1.8.1             viridisLite_0.4.2        
[141] deldir_2.0-2              BiocParallel_1.34.2      
[143] assertthat_0.2.1          munsell_0.5.0            
[145] lazyeval_0.2.2            spatstat.geom_3.2-4      
[147] PCAtools_2.14.0           Matrix_1.6-3             
[149] RcppHNSW_0.5.0            patchwork_1.1.2          
[151] sparseMatrixStats_1.12.2  future_1.33.0            
[153] Rhdf5lib_1.22.0           shiny_1.8.0              
[155] ROCR_1.0-11               Rfast_2.1.0              
[157] igraph_1.6.0              RcppParallel_5.1.7