Filtering of PBMC2 using COTAN

Published

January 20, 2026

Library import

library(dplyr)
library(COTAN)
library(Seurat)
library(tibble)
library(ggplot2)
library(zeallot)
library(DropletUtils)

Settings

datasetName = 'PBMC2'
datasetFolder = './Data/'

inDir  = paste(datasetFolder, datasetName, '/raw/10X/', sep='')
outDir = paste(datasetFolder, datasetName, '/filtered/', sep='')
dir10X = paste(outDir, '10X/', sep='')

if (!dir.exists(outDir)) {
  dir.create(outDir, recursive = TRUE, showWarnings = FALSE)
}

setLoggingLevel(2)
setLoggingFile(paste(outDir, "logfile.log", sep=""))
options(parallelly.fork.enable = TRUE)

Data loading

dataset = Read10X(data.dir = inDir, strip.suffix = TRUE)
dataset = dataset[[1]]
sampleCond <- datasetName
PBMC2 <- COTAN(raw = dataset)
PBMC2 <- initializeMetaDataset(
  PBMC2,
  GEO = paste("10X ", datasetName, sep=""),
  sequencingMethod = "10X",
  sampleCond = sampleCond
)

Inspect cells’ sizes

cellSizePlot(PBMC2)

Drop cells with too many reads as they are probably doublets

cellsSizeThr <- 20000
PBMC2 <- addElementToMetaDataset(PBMC2, "Cells size threshold", cellsSizeThr)

cellsToRem <- getCells(PBMC2)[getCellsSize(PBMC2) > cellsSizeThr]
PBMC2 <- dropGenesCells(PBMC2, cells = cellsToRem)

Inspect the number of expressed genes per cell

genesSizePlot(PBMC2)

Drop cells with too high genes expression as they are probably doublets

geneSizeThr <- 3500
PBMC2 <- addElementToMetaDataset(PBMC2, "Num genes threshold", geneSizeThr)

numExprGenes <- getNumExpressedGenes(PBMC2)
cellsToRem <- names(numExprGenes)[numExprGenes > geneSizeThr]
PBMC2 <- dropGenesCells(PBMC2, cells = cellsToRem)

genesSizePlot(PBMC2)

cellsToRem <- names(numExprGenes)[numExprGenes < 100]
PBMC2 <- dropGenesCells(PBMC2, cells = cellsToRem)

genesSizePlot(PBMC2)

Check number of mithocondrial genes expressed in each cell

mitGenesPattern <- "^[Mm][Tt]-"
getGenes(PBMC2)[grep(mitGenesPattern, getGenes(PBMC2))]
 [1] "MT-ND1"  "MT-ND2"  "MT-CO1"  "MT-CO2"  "MT-ATP8" "MT-ATP6" "MT-CO3" 
 [8] "MT-ND3"  "MT-ND4L" "MT-ND4"  "MT-ND5"  "MT-ND6"  "MT-CYB" 
c(mitPlot, mitSizes) %<-%
  mitochondrialPercentagePlot(PBMC2, genePrefix = mitGenesPattern)

plot(mitPlot)

We drop cells with a too high percentage of mitocondrial genes (are likely dead)

mitPercThr <- 10
PBMC2 <- addElementToMetaDataset(PBMC2, "Mitoc. perc. threshold", mitPercThr)

cellsToRem <- rownames(mitSizes)[mitSizes[["mit.percentage"]] > mitPercThr]

PBMC2 <- dropGenesCells(PBMC2, cells = cellsToRem)

c(mitPlot, mitSizes) %<-%
  mitochondrialPercentagePlot(PBMC2, genePrefix = mitGenesPattern)

plot(mitPlot)

Check number of ribosomial genes expressed in each cell

ribGenesPattern <- "^RP[SL]\\d+"
getGenes(PBMC2)[grep(ribGenesPattern, getGenes(PBMC2))]
 [1] "RPL22"       "RPL11"       "RPS6KA1"     "RPS8"        "RPL5"       
 [6] "RPS27"       "RPS6KC1"     "RPS7"        "RPS27A"      "RPL31"      
[11] "RPL37A"      "RPL32"       "RPL15"       "RPL14"       "RPL29"      
[16] "RPL24"       "RPL22L1"     "RPL39L"      "RPL35A"      "RPL9"       
[21] "RPL34-AS1"   "RPL34"       "RPS3A"       "RPL37"       "RPS23"      
[26] "RPS14"       "RPL26L1"     "RPS18"       "RPS10-NUDT3" "RPS10"      
[31] "RPL10A"      "RPL7L1"      "RPS12"       "RPS6KA2"     "RPS6KA2-IT1"
[36] "RPS6KA2-AS1" "RPS20"       "RPL7"        "RPL30"       "RPL8"       
[41] "RPS6"        "RPL35"       "RPL12"       "RPL7A"       "RPS24"      
[46] "RPL27A"      "RPS13"       "RPS6KA4"     "RPS6KB2"     "RPS6KB2-AS1"
[51] "RPS3"        "RPS25"       "RPS26"       "RPL41"       "RPL6"       
[56] "RPL21"       "RPL10L"      "RPS29"       "RPL36AL"     "RPS6KL1"    
[61] "RPS6KA5"     "RPS27L"      "RPL4"        "RPS17"       "RPL3L"      
[66] "RPS2"        "RPS15A"      "RPL13"       "RPL26"       "RPL23A"     
[71] "RPL23"       "RPL19"       "RPL27"       "RPS6KB1"     "RPL38"      
[76] "RPL17"       "RPS15"       "RPL36"       "RPS28"       "RPL18A"     
[81] "RPS16"       "RPS19"       "RPL18"       "RPL13A"      "RPS11"      
[86] "RPS9"        "RPL28"       "RPS5"        "RPS21"       "RPL3"       
[91] "RPS19BP1"    "RPS6KA3"     "RPS4X"       "RPS6KA6"     "RPL36A"     
[96] "RPL39"       "RPL10"       "RPS4Y1"      "RPS4Y2"     
c(ribPlot, ribSizes) %<-%
  mitochondrialPercentagePlot(PBMC2, genePrefix = ribGenesPattern)

plot(ribPlot)

Check no further outliers after all the culling

cellSizePlot(PBMC2)

genesSizePlot(PBMC2)

Cleaning, round 1

PBMC2 <- clean(PBMC2)

c(pcaCellsPlot, pcaCellsData, genesPlot, UDEPlot, nuPlot, zoomedNuPlot) %<-% cleanPlots(PBMC2)

plot(pcaCellsPlot)

plot(genesPlot)

plot(pcaCellsPlot)

PBMC2 <- addElementToMetaDataset(PBMC2, "Num drop B group", 0)
plot(UDEPlot)

plot(nuPlot)

plot(zoomedNuPlot)  

Save the filtered dataset

if (!dir.exists(dir10X)) {
  write10xCounts(dir10X, getRawData(PBMC2))
}
PBMC2 <- proceedToCoex(PBMC2, calcCoex = TRUE, cores = 5L, saveObj = FALSE)
saveRDS(PBMC2, file = paste0(outDir, sampleCond, ".cotan.RDS"))

Sys.time()
[1] "2026-01-20 10:06:25 CET"
sessionInfo()
R version 4.5.2 (2025-10-31)
Platform: x86_64-pc-linux-gnu
Running under: Ubuntu 22.04.5 LTS

Matrix products: default
BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.10.0 
LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0  LAPACK version 3.10.0

locale:
 [1] LC_CTYPE=C.UTF-8       LC_NUMERIC=C           LC_TIME=C.UTF-8       
 [4] LC_COLLATE=C.UTF-8     LC_MONETARY=C.UTF-8    LC_MESSAGES=C.UTF-8   
 [7] LC_PAPER=C.UTF-8       LC_NAME=C              LC_ADDRESS=C          
[10] LC_TELEPHONE=C         LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C   

time zone: Europe/Rome
tzcode source: system (glibc)

attached base packages:
[1] stats4    stats     graphics  grDevices utils     datasets  methods  
[8] base     

other attached packages:
 [1] DropletUtils_1.28.1         SingleCellExperiment_1.32.0
 [3] SummarizedExperiment_1.38.1 Biobase_2.70.0             
 [5] GenomicRanges_1.62.1        Seqinfo_1.0.0              
 [7] IRanges_2.44.0              S4Vectors_0.48.0           
 [9] BiocGenerics_0.56.0         generics_0.1.3             
[11] MatrixGenerics_1.22.0       matrixStats_1.5.0          
[13] zeallot_0.2.0               ggplot2_4.0.1              
[15] tibble_3.3.0                Seurat_5.2.1               
[17] SeuratObject_5.1.0          sp_2.2-0                   
[19] COTAN_2.11.1                dplyr_1.1.4                

loaded via a namespace (and not attached):
  [1] RcppAnnoy_0.0.22          splines_4.5.2            
  [3] later_1.4.2               R.oo_1.27.1              
  [5] polyclip_1.10-7           fastDummies_1.7.5        
  [7] lifecycle_1.0.4           edgeR_4.6.3              
  [9] doParallel_1.0.17         globals_0.18.0           
 [11] lattice_0.22-7            MASS_7.3-65              
 [13] ggdist_3.3.3              dendextend_1.19.0        
 [15] magrittr_2.0.4            limma_3.64.0             
 [17] plotly_4.11.0             rmarkdown_2.29           
 [19] yaml_2.3.10               httpuv_1.6.16            
 [21] sctransform_0.4.2         spam_2.11-1              
 [23] spatstat.sparse_3.1-0     reticulate_1.42.0        
 [25] cowplot_1.2.0             pbapply_1.7-2            
 [27] RColorBrewer_1.1-3        abind_1.4-8              
 [29] Rtsne_0.17                R.utils_2.13.0           
 [31] purrr_1.2.0               circlize_0.4.16          
 [33] GenomeInfoDbData_1.2.14   ggrepel_0.9.6            
 [35] irlba_2.3.5.1             listenv_0.9.1            
 [37] spatstat.utils_3.1-4      goftest_1.2-3            
 [39] RSpectra_0.16-2           dqrng_0.4.1              
 [41] spatstat.random_3.4-1     fitdistrplus_1.2-2       
 [43] parallelly_1.46.0         DelayedMatrixStats_1.30.0
 [45] codetools_0.2-20          DelayedArray_0.36.0      
 [47] scuttle_1.18.0            tidyselect_1.2.1         
 [49] shape_1.4.6.1             UCSC.utils_1.4.0         
 [51] farver_2.1.2              ScaledMatrix_1.16.0      
 [53] viridis_0.6.5             spatstat.explore_3.4-2   
 [55] jsonlite_2.0.0            GetoptLong_1.0.5         
 [57] progressr_0.15.1          ggridges_0.5.6           
 [59] survival_3.8-3            iterators_1.0.14         
 [61] foreach_1.5.2             tools_4.5.2              
 [63] ica_1.0-3                 Rcpp_1.1.0               
 [65] glue_1.8.0                gridExtra_2.3            
 [67] SparseArray_1.10.8        xfun_0.52                
 [69] distributional_0.5.0      ggthemes_5.2.0           
 [71] GenomeInfoDb_1.44.0       HDF5Array_1.36.0         
 [73] withr_3.0.2               fastmap_1.2.0            
 [75] rhdf5filters_1.20.0       digest_0.6.37            
 [77] rsvd_1.0.5                parallelDist_0.2.6       
 [79] R6_2.6.1                  mime_0.13                
 [81] colorspace_2.1-1          scattermore_1.2          
 [83] tensor_1.5                spatstat.data_3.1-6      
 [85] R.methodsS3_1.8.2         h5mread_1.0.0            
 [87] tidyr_1.3.1               data.table_1.17.0        
 [89] httr_1.4.7                htmlwidgets_1.6.4        
 [91] S4Arrays_1.10.1           uwot_0.2.3               
 [93] pkgconfig_2.0.3           gtable_0.3.6             
 [95] ComplexHeatmap_2.26.0     lmtest_0.9-40            
 [97] S7_0.2.1                  XVector_0.50.0           
 [99] htmltools_0.5.8.1         dotCall64_1.2            
[101] zigg_0.0.2                clue_0.3-66              
[103] scales_1.4.0              png_0.1-8                
[105] spatstat.univar_3.1-3     knitr_1.50               
[107] reshape2_1.4.4            rjson_0.2.23             
[109] nlme_3.1-168              rhdf5_2.52.0             
[111] proxy_0.4-27              zoo_1.8-14               
[113] GlobalOptions_0.1.2       stringr_1.6.0            
[115] KernSmooth_2.23-26        parallel_4.5.2           
[117] miniUI_0.1.2              pillar_1.10.2            
[119] grid_4.5.2                vctrs_0.6.5              
[121] RANN_2.6.2                promises_1.3.2           
[123] BiocSingular_1.26.1       beachmat_2.26.0          
[125] xtable_1.8-4              cluster_2.1.8.1          
[127] evaluate_1.0.3            locfit_1.5-9.12          
[129] cli_3.6.5                 compiler_4.5.2           
[131] rlang_1.1.6               crayon_1.5.3             
[133] future.apply_1.20.0       labeling_0.4.3           
[135] plyr_1.8.9                stringi_1.8.7            
[137] viridisLite_0.4.2         deldir_2.0-4             
[139] BiocParallel_1.44.0       assertthat_0.2.1         
[141] lazyeval_0.2.2            spatstat.geom_3.4-1      
[143] Matrix_1.7-4              RcppHNSW_0.6.0           
[145] patchwork_1.3.2           sparseMatrixStats_1.20.0 
[147] future_1.58.0             Rhdf5lib_1.30.0          
[149] statmod_1.5.0             shiny_1.11.0             
[151] ROCR_1.0-11               Rfast_2.1.5.1            
[153] igraph_2.1.4              RcppParallel_5.1.10