DatasetCleaning_Run41

Preamble

# Util libs
library(assertthat)
library(ggplot2)
library(zeallot)
library(conflicted)
library(Matrix)

# Data processing libs
if (!suppressWarnings(require(COTAN))) {
  devtools::load_all("~/dev/COTAN/COTAN/")
}

conflicts_prefer(zeallot::`%->%`, zeallot::`%<-%`)

options(parallelly.fork.enable = TRUE)

setLoggingLevel(2L)

List input files

inDir <- file.path("..")
outDir <- file.path(".")

list.files(path = inDir, pattern = "\\.RDS$")
[1] "capillary_blood_samples_pbmcs-Run_28.RDS"        
[2] "capillary_blood_samples_pbmcs-Run_40-Cleaned.RDS"
[3] "capillary_blood_samples_pbmcs-Run_40.RDS"        
[4] "capillary_blood_samples_pbmcs-Run_41-Cleaned.RDS"
[5] "capillary_blood_samples_pbmcs-Run_41.RDS"        
[6] "capillary_blood_samples_pbmcs-Run_62.RDS"        
[7] "capillary_blood_samples_pbmcs-Run_77-Cleaned.RDS"
[8] "capillary_blood_samples_pbmcs-Run_77.RDS"        
[9] "capillary_blood_samples_pbmcs.RDS"               
globalCondition <- "capillary_blood_samples_pbmcs"

thisRun <- "41"
fileNameIn <- paste0(globalCondition, "-Run_", thisRun, ".RDS")

setLoggingFile(file.path(outDir, paste0("DatasetCleaning_Run", thisRun, ".log")))

Load dataset

aRunObj <- readRDS(file = file.path(inDir, fileNameIn))

getAllConditions(aRunObj)
[1] "Sample.IDs"              "Participant.IDs"        
[3] "Cell.Barcoding.Runs"     "Lane"                   
[5] "cell_barcoding_protocol" "run_lane_batch"         
[7] "original_sample_id"     
getClusterizations(aRunObj)
[1] "cell_type_level_1" "cell_type_level_2" "cell_type_level_3"
[4] "cell_type_level_4" "c1"                "c2"               
[7] "c3"                "c4"               

Cleaning

clean() using standard thresholds

cellsToDrop <- names(getClusterizationData(aRunObj,"c4")[[1]][is.na(getClusterizationData(aRunObj,"c4")[[1]])])

aRunObj <- dropGenesCells(aRunObj, cells = cellsToDrop)

aRunObj <- clean(aRunObj)

Check the initial plots

cellSizePlot(aRunObj, condName = "Participant.IDs")

genesSizePlot(aRunObj, condName = "Participant.IDs")

scatterPlot(aRunObj, condName = "Participant.IDs")

c(mitPerPlot, mitPerDf) %<-%
  mitochondrialPercentagePlot(aRunObj, condName = "Participant.IDs",
                              genePrefix = "^MT-")
mitPerPlot

Remove too high mitochondrial percentage cells

Expected percentages are around 6%-10%: too high percentage imply dead cells

sum(mitPerDf[["mit.percentage"]] > 20.0)
[1] 0
sum(mitPerDf[["mit.percentage"]] > 15.0)
[1] 6
sum(mitPerDf[["mit.percentage"]] > 14.0)
[1] 12
sum(mitPerDf[["mit.percentage"]] > 10.0)
[1] 377
# drop above 14%
mitPerThr <- 14.0
aRunObj <-
  addElementToMetaDataset(
    aRunObj,
    "Mitochondrial percentage threshold", mitPerThr)

cellsToDrop <- getCells(aRunObj)[which(mitPerDf[["mit.percentage"]] > mitPerThr)]

aRunObj <- dropGenesCells(aRunObj, cells = cellsToDrop)

Redo the plots

cellSizePlot(aRunObj, condName = "Participant.IDs")

genesSizePlot(aRunObj, condName = "Participant.IDs")

scatterPlot(aRunObj, condName = "Participant.IDs")

c(mitPerPlot, mitPerDf) %<-%
  mitochondrialPercentagePlot(aRunObj, condName = "Participant.IDs",
                              genePrefix = "^MT-")
mitPerPlot

Remove cells with too high size

librarySize <- getCellsSize(aRunObj)

sum(librarySize > 25000)
[1] 25
sum(librarySize > 20000)
[1] 81
sum(librarySize > 18000)
[1] 120
sum(librarySize > 15000)
[1] 233
# drop above 11%
cellSizeThr <- 25000
aRunObj <-
  addElementToMetaDataset(
    aRunObj,
    "Cell size threshold", cellSizeThr)

cellsToDrop <- getCells(aRunObj)[which(librarySize > cellSizeThr)]

aRunObj <- dropGenesCells(aRunObj, cells = cellsToDrop)

Redo the plots

cellSizePlot(aRunObj, condName = "Participant.IDs")

genesSizePlot(aRunObj, condName = "Participant.IDs")

scatterPlot(aRunObj, condName = "Participant.IDs")

c(mitPerPlot, mitPerDf) %<-%
  mitochondrialPercentagePlot(aRunObj, condName = "Participant.IDs",
                              genePrefix = "^MT-")
mitPerPlot

Check for spurious clusters

aRunObj <- clean(aRunObj)

c(pcaCells, pcaCellsData, genes, UDE, nu, zoomedNu) %<-%
  cleanPlots(aRunObj, includePCA = TRUE)

Plot PCA and Nu

plot(pcaCells)

plot(UDE)

plot(pcaCellsData)

plot(genes)

plot(nu)

plot(zoomedNu)

Remove cells below nu elbow point

nu <- getNu(aRunObj)

sum(nu < 0.35)
[1] 3
sum(nu < 0.38)
[1] 7
sum(nu < 0.4)
[1] 11
sum(nu < 0.43)
[1] 24
# drop above 11%
lowNuThr <- 0.4
aRunObj <-
  addElementToMetaDataset(
    aRunObj,
    "Low Nu threshold", lowNuThr)

cellsToDrop <- getCells(aRunObj)[which(nu < lowNuThr)]

cells_to_rem <- rownames(pcaCellsData)[pcaCellsData[["groups"]] == "B"]

aRunObj <- dropGenesCells(aRunObj, cells = c(cellsToDrop,cells_to_rem))

Redo all the plots

aRunObj <- clean(aRunObj)

c(pcaCells, pcaCellsData, genes, UDE, nu, zoomedNu) %<-%
  cleanPlots(aRunObj, includePCA = TRUE)
plot(pcaCells)

plot(UDE)

#plot(pcaCellsData)

plot(genes)

plot(nu)

plot(zoomedNu)

cellSizePlot(aRunObj,conditions = getClusterizationData(aRunObj,clName = "c1")[[1]])

genesSizePlot(aRunObj,conditions = getClusterizationData(aRunObj,clName = "c1")[[1]])

scatterPlot(aRunObj,conditions = getClusterizationData(aRunObj,clName = "c1")[[1]])

cellSizePlot(aRunObj,conditions = getClusterizationData(aRunObj,clName = "c2")[[1]])

genesSizePlot(aRunObj,conditions = getClusterizationData(aRunObj,clName = "c2")[[1]])

scatterPlot(aRunObj,conditions = getClusterizationData(aRunObj,clName = "c2")[[1]])

c(mitPerPlotc1, mitPerDfc1) %<-%
  mitochondrialPercentagePlot(aRunObj, conditions = getClusterizationData(aRunObj,clName = "c1")[[1]],
                              genePrefix = "^MT-")

c(mitPerPlotc2, mitPerDfc2) %<-%
  mitochondrialPercentagePlot(aRunObj, conditions = getClusterizationData(aRunObj,clName = "c2")[[1]],
                              genePrefix = "^MT-")
mitPerPlotc1

mitPerPlotc2

Finalize object and save

aRunObj <- proceedToCoex(aRunObj, calcCoex = FALSE, cores = 5L, saveObj = FALSE)
fileNameOut <- paste0(globalCondition, "-Run_", thisRun, "-Cleaned", ".RDS")

saveRDS(aRunObj, file = file.path(inDir, fileNameOut))
cellSizePlot(aRunObj,conditions = getClusterizationData(aRunObj,"c2")[[1]])

genesSizePlot(aRunObj,conditions = getClusterizationData(aRunObj,"c2")[[1]])

c(mitPerPlot, mitPerDf) %<-%
  mitochondrialPercentagePlot(aRunObj, conditions = getClusterizationData(aRunObj,"c2")[[1]],
                              genePrefix = "^MT-")
mitPerPlot

cellSizePlot(aRunObj,conditions = getClusterizationData(aRunObj,"c3")[[1]])

genesSizePlot(aRunObj,conditions = getClusterizationData(aRunObj,"c3")[[1]])

c(mitPerPlot, mitPerDf) %<-%
  mitochondrialPercentagePlot(aRunObj, conditions = getClusterizationData(aRunObj,"c3")[[1]],
                              genePrefix = "^MT-")
mitPerPlot

table(getClusterizationData(aRunObj,"c3")[[1]])

                                 pDC                          CD4 T Cells 
                                   7                                 2473 
                 Gamma-Delta T Cells                                 cDC2 
                                 273                                   26 
              Intermediate Monocytes                                 cDC3 
                                  86                                   11 
                CD56 Bright NK Cells             Classical Memory B Cells 
                                  26                                  115 
Mucosal-Associated Invariant T Cells                    CD56 Dim NK Cells 
                                 463                                  274 
                 Classical Monocytes                          CD8 T Cells 
                                 252                                 1886 
              Nonclassical Monocytes                        Naive B Cells 
                                  86                                  250 
                                asDC                   IgM Memory B Cells 
                                   1                                  307 
  Classical Monocytes - HSP artifact               Age-associated B Cells 
                                   1                                    7 
                   Adaptive NK Cells 
                                  14 

Check uniform clusters

getClData <- function(objCOTAN, clList, cl) {
  checker <- new("SimpleGDIUniformityCheck")
  checker@check@maxRatioBeyond  <- 0.02
  cluster <- names(clList)[[cl]]
  cells <- clList[[cl]]
  res <- checkClusterUniformity(objCOTAN, cluster = cluster, cells = cells,checker = checker, 
                                 cores = 6, saveObj = FALSE)
  return(res)
}
originalC3Cl <- getClusters(aRunObj, clName = "c3")

sort(table(originalC3Cl), decreasing = TRUE)
originalC3Cl
                         CD4 T Cells                          CD8 T Cells 
                                2473                                 1886 
Mucosal-Associated Invariant T Cells                   IgM Memory B Cells 
                                 463                                  307 
                   CD56 Dim NK Cells                  Gamma-Delta T Cells 
                                 274                                  273 
                 Classical Monocytes                        Naive B Cells 
                                 252                                  250 
            Classical Memory B Cells               Intermediate Monocytes 
                                 115                                   86 
              Nonclassical Monocytes                                 cDC2 
                                  86                                   26 
                CD56 Bright NK Cells                    Adaptive NK Cells 
                                  26                                   14 
                                cDC3                                  pDC 
                                  11                                    7 
              Age-associated B Cells                                 asDC 
                                   7                                    1 
  Classical Monocytes - HSP artifact 
                                   1 
largeCl <- which(table(originalC3Cl) >= 50)

largeOrigCl <- originalC3Cl[(originalC3Cl %in% names(largeCl)),
                                drop = TRUE]

largeOrigCl <- toClustersList(largeOrigCl)
vapply(largeOrigCl, length, integer(1))
                         CD4 T Cells                  Gamma-Delta T Cells 
                                2473                                  273 
              Intermediate Monocytes             Classical Memory B Cells 
                                  86                                  115 
Mucosal-Associated Invariant T Cells                    CD56 Dim NK Cells 
                                 463                                  274 
                 Classical Monocytes                          CD8 T Cells 
                                 252                                 1886 
              Nonclassical Monocytes                        Naive B Cells 
                                  86                                  250 
                  IgM Memory B Cells 
                                 307 
resDataC3 <- lapply(seq_along(largeOrigCl), FUN = getClData,
                       objCOTAN = aRunObj, clList = largeOrigCl)

clDataC3 <- checkersToDF(resDataC3)
rownames(clDataC3) <- names(largeOrigCl)

clDataC3
                                                        class
CD4 T Cells                          SimpleGDIUniformityCheck
Gamma-Delta T Cells                  SimpleGDIUniformityCheck
Intermediate Monocytes               SimpleGDIUniformityCheck
Classical Memory B Cells             SimpleGDIUniformityCheck
Mucosal-Associated Invariant T Cells SimpleGDIUniformityCheck
CD56 Dim NK Cells                    SimpleGDIUniformityCheck
Classical Monocytes                  SimpleGDIUniformityCheck
CD8 T Cells                          SimpleGDIUniformityCheck
Nonclassical Monocytes               SimpleGDIUniformityCheck
Naive B Cells                        SimpleGDIUniformityCheck
IgM Memory B Cells                   SimpleGDIUniformityCheck
                                     check.isCheckAbove check.GDIThreshold
CD4 T Cells                                       FALSE                1.4
Gamma-Delta T Cells                               FALSE                1.4
Intermediate Monocytes                            FALSE                1.4
Classical Memory B Cells                          FALSE                1.4
Mucosal-Associated Invariant T Cells              FALSE                1.4
CD56 Dim NK Cells                                 FALSE                1.4
Classical Monocytes                               FALSE                1.4
CD8 T Cells                                       FALSE                1.4
Nonclassical Monocytes                            FALSE                1.4
Naive B Cells                                     FALSE                1.4
IgM Memory B Cells                                FALSE                1.4
                                     check.maxRatioBeyond check.maxRankBeyond
CD4 T Cells                                          0.02                   0
Gamma-Delta T Cells                                  0.02                   0
Intermediate Monocytes                               0.02                   0
Classical Memory B Cells                             0.02                   0
Mucosal-Associated Invariant T Cells                 0.02                   0
CD56 Dim NK Cells                                    0.02                   0
Classical Monocytes                                  0.02                   0
CD8 T Cells                                          0.02                   0
Nonclassical Monocytes                               0.02                   0
Naive B Cells                                        0.02                   0
IgM Memory B Cells                                   0.02                   0
                                     check.fractionBeyond check.thresholdRank
CD4 T Cells                                  3.912110e-01                   0
Gamma-Delta T Cells                          1.018973e-01                   0
Intermediate Monocytes                       7.077642e-05                   0
Classical Memory B Cells                     3.111619e-02                   0
Mucosal-Associated Invariant T Cells         7.696423e-02                   0
CD56 Dim NK Cells                            3.321295e-02                   0
Classical Monocytes                          1.807822e-02                   0
CD8 T Cells                                  4.241862e-01                   0
Nonclassical Monocytes                       2.232498e-02                   0
Naive B Cells                                8.345291e-02                   0
IgM Memory B Cells                           1.488212e-01                   0
                                     check.quantileAtRatio check.quantileAtRank
CD4 T Cells                                       2.119466                  NaN
Gamma-Delta T Cells                               1.527186                  NaN
Intermediate Monocytes                            1.287027                  NaN
Classical Memory B Cells                          1.424139                  NaN
Mucosal-Associated Invariant T Cells              1.495065                  NaN
CD56 Dim NK Cells                                 1.421830                  NaN
Classical Monocytes                               1.394924                  NaN
CD8 T Cells                                       2.233193                  NaN
Nonclassical Monocytes                            1.406190                  NaN
Naive B Cells                                     1.500825                  NaN
IgM Memory B Cells                                1.593955                  NaN
                                     isUniform clusterSize
CD4 T Cells                              FALSE        2473
Gamma-Delta T Cells                      FALSE         273
Intermediate Monocytes                    TRUE          86
Classical Memory B Cells                 FALSE         115
Mucosal-Associated Invariant T Cells     FALSE         463
CD56 Dim NK Cells                        FALSE         274
Classical Monocytes                       TRUE         252
CD8 T Cells                              FALSE        1886
Nonclassical Monocytes                   FALSE          86
Naive B Cells                            FALSE         250
IgM Memory B Cells                       FALSE         307
Sys.time()
[1] "2026-01-05 12:59:56 CET"
sessionInfo()
R version 4.5.2 (2025-10-31)
Platform: x86_64-pc-linux-gnu
Running under: Ubuntu 22.04.5 LTS

Matrix products: default
BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.10.0 
LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0  LAPACK version 3.10.0

locale:
 [1] LC_CTYPE=C.UTF-8       LC_NUMERIC=C           LC_TIME=C.UTF-8       
 [4] LC_COLLATE=C.UTF-8     LC_MONETARY=C.UTF-8    LC_MESSAGES=C.UTF-8   
 [7] LC_PAPER=C.UTF-8       LC_NAME=C              LC_ADDRESS=C          
[10] LC_TELEPHONE=C         LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C   

time zone: Europe/Rome
tzcode source: system (glibc)

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] COTAN_2.11.1     Matrix_1.7-4     conflicted_1.2.0 zeallot_0.2.0   
[5] ggplot2_4.0.1    assertthat_0.2.1

loaded via a namespace (and not attached):
  [1] RcppAnnoy_0.0.22            splines_4.5.2              
  [3] later_1.4.2                 tibble_3.3.0               
  [5] polyclip_1.10-7             fastDummies_1.7.5          
  [7] lifecycle_1.0.4             doParallel_1.0.17          
  [9] processx_3.8.6              globals_0.18.0             
 [11] lattice_0.22-7              MASS_7.3-65                
 [13] ggdist_3.3.3                dendextend_1.19.0          
 [15] magrittr_2.0.4              plotly_4.11.0              
 [17] rmarkdown_2.29              yaml_2.3.10                
 [19] httpuv_1.6.16               Seurat_5.2.1               
 [21] sctransform_0.4.2           spam_2.11-1                
 [23] sp_2.2-0                    spatstat.sparse_3.1-0      
 [25] reticulate_1.42.0           cowplot_1.2.0              
 [27] pbapply_1.7-2               RColorBrewer_1.1-3         
 [29] abind_1.4-8                 Rtsne_0.17                 
 [31] GenomicRanges_1.60.0        purrr_1.2.0                
 [33] BiocGenerics_0.54.0         coro_1.1.0                 
 [35] torch_0.14.2                circlize_0.4.16            
 [37] GenomeInfoDbData_1.2.14     IRanges_2.44.0             
 [39] S4Vectors_0.48.0            ggrepel_0.9.6              
 [41] irlba_2.3.5.1               listenv_0.9.1              
 [43] spatstat.utils_3.1-4        goftest_1.2-3              
 [45] RSpectra_0.16-2             spatstat.random_3.4-1      
 [47] fitdistrplus_1.2-2          parallelly_1.46.0          
 [49] codetools_0.2-20            DelayedArray_0.34.1        
 [51] tidyselect_1.2.1            shape_1.4.6.1              
 [53] UCSC.utils_1.4.0            farver_2.1.2               
 [55] ScaledMatrix_1.16.0         viridis_0.6.5              
 [57] matrixStats_1.5.0           stats4_4.5.2               
 [59] spatstat.explore_3.4-2      jsonlite_2.0.0             
 [61] GetoptLong_1.0.5            progressr_0.15.1           
 [63] ggridges_0.5.6              survival_3.8-3             
 [65] iterators_1.0.14            foreach_1.5.2              
 [67] tools_4.5.2                 ica_1.0-3                  
 [69] Rcpp_1.0.14                 glue_1.8.0                 
 [71] gridExtra_2.3               SparseArray_1.10.8         
 [73] xfun_0.52                   MatrixGenerics_1.20.0      
 [75] distributional_0.5.0        ggthemes_5.2.0             
 [77] GenomeInfoDb_1.44.0         dplyr_1.1.4                
 [79] withr_3.0.2                 fastmap_1.2.0              
 [81] callr_3.7.6                 digest_0.6.37              
 [83] rsvd_1.0.5                  parallelDist_0.2.6         
 [85] R6_2.6.1                    mime_0.13                  
 [87] colorspace_2.1-1            scattermore_1.2            
 [89] tensor_1.5                  spatstat.data_3.1-6        
 [91] tidyr_1.3.1                 generics_0.1.3             
 [93] data.table_1.17.0           httr_1.4.7                 
 [95] htmlwidgets_1.6.4           S4Arrays_1.10.1            
 [97] uwot_0.2.3                  pkgconfig_2.0.3            
 [99] gtable_0.3.6                ComplexHeatmap_2.26.0      
[101] lmtest_0.9-40               S7_0.2.1                   
[103] SingleCellExperiment_1.32.0 XVector_0.50.0             
[105] htmltools_0.5.8.1           dotCall64_1.2              
[107] zigg_0.0.2                  clue_0.3-66                
[109] SeuratObject_5.1.0          scales_1.4.0               
[111] Biobase_2.68.0              png_0.1-8                  
[113] spatstat.univar_3.1-3       knitr_1.50                 
[115] reshape2_1.4.4              rjson_0.2.23               
[117] nlme_3.1-168                proxy_0.4-27               
[119] cachem_1.1.0                zoo_1.8-14                 
[121] GlobalOptions_0.1.2         stringr_1.6.0              
[123] KernSmooth_2.23-26          parallel_4.5.2             
[125] miniUI_0.1.2                pillar_1.10.2              
[127] grid_4.5.2                  vctrs_0.6.5                
[129] RANN_2.6.2                  promises_1.3.2             
[131] BiocSingular_1.26.1         beachmat_2.26.0            
[133] xtable_1.8-4                cluster_2.1.8.1            
[135] evaluate_1.0.3              cli_3.6.5                  
[137] compiler_4.5.2              rlang_1.1.6                
[139] crayon_1.5.3                future.apply_1.20.0        
[141] labeling_0.4.3              ps_1.9.1                   
[143] plyr_1.8.9                  stringi_1.8.7              
[145] viridisLite_0.4.2           deldir_2.0-4               
[147] BiocParallel_1.44.0         lazyeval_0.2.2             
[149] spatstat.geom_3.4-1         RcppHNSW_0.6.0             
[151] patchwork_1.3.2             bit64_4.6.0-1              
[153] future_1.58.0               shiny_1.11.0               
[155] SummarizedExperiment_1.38.1 ROCR_1.0-11                
[157] Rfast_2.1.5.1               igraph_2.1.4               
[159] memoise_2.0.1               RcppParallel_5.1.10        
[161] bit_4.6.0