# Util libs
library(assertthat)
library(ggplot2)
library(zeallot)
library(conflicted)
library(Matrix)
# Data processing libs
if (!suppressWarnings(require(COTAN))) {
devtools::load_all("~/dev/COTAN/COTAN/")
}
conflicts_prefer(zeallot::`%->%`, zeallot::`%<-%`)
options(parallelly.fork.enable = TRUE)
setLoggingLevel(2L)DatasetCleaning_Run41
Preamble
List input files
inDir <- file.path("..")
outDir <- file.path(".")
list.files(path = inDir, pattern = "\\.RDS$")[1] "capillary_blood_samples_pbmcs-Run_28.RDS"
[2] "capillary_blood_samples_pbmcs-Run_40-Cleaned.RDS"
[3] "capillary_blood_samples_pbmcs-Run_40.RDS"
[4] "capillary_blood_samples_pbmcs-Run_41-Cleaned.RDS"
[5] "capillary_blood_samples_pbmcs-Run_41.RDS"
[6] "capillary_blood_samples_pbmcs-Run_62.RDS"
[7] "capillary_blood_samples_pbmcs-Run_77-Cleaned.RDS"
[8] "capillary_blood_samples_pbmcs-Run_77.RDS"
[9] "capillary_blood_samples_pbmcs.RDS"
globalCondition <- "capillary_blood_samples_pbmcs"
thisRun <- "41"
fileNameIn <- paste0(globalCondition, "-Run_", thisRun, ".RDS")
setLoggingFile(file.path(outDir, paste0("DatasetCleaning_Run", thisRun, ".log")))Load dataset
aRunObj <- readRDS(file = file.path(inDir, fileNameIn))
getAllConditions(aRunObj)[1] "Sample.IDs" "Participant.IDs"
[3] "Cell.Barcoding.Runs" "Lane"
[5] "cell_barcoding_protocol" "run_lane_batch"
[7] "original_sample_id"
getClusterizations(aRunObj)[1] "cell_type_level_1" "cell_type_level_2" "cell_type_level_3"
[4] "cell_type_level_4" "c1" "c2"
[7] "c3" "c4"
Cleaning
clean() using standard thresholds
cellsToDrop <- names(getClusterizationData(aRunObj,"c4")[[1]][is.na(getClusterizationData(aRunObj,"c4")[[1]])])
aRunObj <- dropGenesCells(aRunObj, cells = cellsToDrop)
aRunObj <- clean(aRunObj)Check the initial plots
cellSizePlot(aRunObj, condName = "Participant.IDs")
genesSizePlot(aRunObj, condName = "Participant.IDs")
scatterPlot(aRunObj, condName = "Participant.IDs")
c(mitPerPlot, mitPerDf) %<-%
mitochondrialPercentagePlot(aRunObj, condName = "Participant.IDs",
genePrefix = "^MT-")
mitPerPlot
Remove too high mitochondrial percentage cells
Expected percentages are around 6%-10%: too high percentage imply dead cells
sum(mitPerDf[["mit.percentage"]] > 20.0)[1] 0
sum(mitPerDf[["mit.percentage"]] > 15.0)[1] 6
sum(mitPerDf[["mit.percentage"]] > 14.0)[1] 12
sum(mitPerDf[["mit.percentage"]] > 10.0)[1] 377
# drop above 14%
mitPerThr <- 14.0
aRunObj <-
addElementToMetaDataset(
aRunObj,
"Mitochondrial percentage threshold", mitPerThr)
cellsToDrop <- getCells(aRunObj)[which(mitPerDf[["mit.percentage"]] > mitPerThr)]
aRunObj <- dropGenesCells(aRunObj, cells = cellsToDrop)Redo the plots
cellSizePlot(aRunObj, condName = "Participant.IDs")
genesSizePlot(aRunObj, condName = "Participant.IDs")
scatterPlot(aRunObj, condName = "Participant.IDs")
c(mitPerPlot, mitPerDf) %<-%
mitochondrialPercentagePlot(aRunObj, condName = "Participant.IDs",
genePrefix = "^MT-")
mitPerPlot
Remove cells with too high size
librarySize <- getCellsSize(aRunObj)
sum(librarySize > 25000)[1] 25
sum(librarySize > 20000)[1] 81
sum(librarySize > 18000)[1] 120
sum(librarySize > 15000)[1] 233
# drop above 11%
cellSizeThr <- 25000
aRunObj <-
addElementToMetaDataset(
aRunObj,
"Cell size threshold", cellSizeThr)
cellsToDrop <- getCells(aRunObj)[which(librarySize > cellSizeThr)]
aRunObj <- dropGenesCells(aRunObj, cells = cellsToDrop)Redo the plots
cellSizePlot(aRunObj, condName = "Participant.IDs")
genesSizePlot(aRunObj, condName = "Participant.IDs")
scatterPlot(aRunObj, condName = "Participant.IDs")
c(mitPerPlot, mitPerDf) %<-%
mitochondrialPercentagePlot(aRunObj, condName = "Participant.IDs",
genePrefix = "^MT-")
mitPerPlot
Check for spurious clusters
aRunObj <- clean(aRunObj)
c(pcaCells, pcaCellsData, genes, UDE, nu, zoomedNu) %<-%
cleanPlots(aRunObj, includePCA = TRUE)Plot PCA and Nu
plot(pcaCells)
plot(UDE)
plot(pcaCellsData)
plot(genes)
plot(nu)
plot(zoomedNu)
Remove cells below nu elbow point
nu <- getNu(aRunObj)
sum(nu < 0.35)[1] 3
sum(nu < 0.38)[1] 7
sum(nu < 0.4)[1] 11
sum(nu < 0.43)[1] 24
# drop above 11%
lowNuThr <- 0.4
aRunObj <-
addElementToMetaDataset(
aRunObj,
"Low Nu threshold", lowNuThr)
cellsToDrop <- getCells(aRunObj)[which(nu < lowNuThr)]
cells_to_rem <- rownames(pcaCellsData)[pcaCellsData[["groups"]] == "B"]
aRunObj <- dropGenesCells(aRunObj, cells = c(cellsToDrop,cells_to_rem))Redo all the plots
aRunObj <- clean(aRunObj)
c(pcaCells, pcaCellsData, genes, UDE, nu, zoomedNu) %<-%
cleanPlots(aRunObj, includePCA = TRUE)plot(pcaCells)
plot(UDE)
#plot(pcaCellsData)
plot(genes)
plot(nu)
plot(zoomedNu)
cellSizePlot(aRunObj,conditions = getClusterizationData(aRunObj,clName = "c1")[[1]])
genesSizePlot(aRunObj,conditions = getClusterizationData(aRunObj,clName = "c1")[[1]])
scatterPlot(aRunObj,conditions = getClusterizationData(aRunObj,clName = "c1")[[1]])
cellSizePlot(aRunObj,conditions = getClusterizationData(aRunObj,clName = "c2")[[1]])
genesSizePlot(aRunObj,conditions = getClusterizationData(aRunObj,clName = "c2")[[1]])
scatterPlot(aRunObj,conditions = getClusterizationData(aRunObj,clName = "c2")[[1]])
c(mitPerPlotc1, mitPerDfc1) %<-%
mitochondrialPercentagePlot(aRunObj, conditions = getClusterizationData(aRunObj,clName = "c1")[[1]],
genePrefix = "^MT-")
c(mitPerPlotc2, mitPerDfc2) %<-%
mitochondrialPercentagePlot(aRunObj, conditions = getClusterizationData(aRunObj,clName = "c2")[[1]],
genePrefix = "^MT-")
mitPerPlotc1
mitPerPlotc2
Finalize object and save
aRunObj <- proceedToCoex(aRunObj, calcCoex = FALSE, cores = 5L, saveObj = FALSE)fileNameOut <- paste0(globalCondition, "-Run_", thisRun, "-Cleaned", ".RDS")
saveRDS(aRunObj, file = file.path(inDir, fileNameOut))cellSizePlot(aRunObj,conditions = getClusterizationData(aRunObj,"c2")[[1]])
genesSizePlot(aRunObj,conditions = getClusterizationData(aRunObj,"c2")[[1]])
c(mitPerPlot, mitPerDf) %<-%
mitochondrialPercentagePlot(aRunObj, conditions = getClusterizationData(aRunObj,"c2")[[1]],
genePrefix = "^MT-")
mitPerPlot
cellSizePlot(aRunObj,conditions = getClusterizationData(aRunObj,"c3")[[1]])
genesSizePlot(aRunObj,conditions = getClusterizationData(aRunObj,"c3")[[1]])
c(mitPerPlot, mitPerDf) %<-%
mitochondrialPercentagePlot(aRunObj, conditions = getClusterizationData(aRunObj,"c3")[[1]],
genePrefix = "^MT-")
mitPerPlot
table(getClusterizationData(aRunObj,"c3")[[1]])
pDC CD4 T Cells
7 2473
Gamma-Delta T Cells cDC2
273 26
Intermediate Monocytes cDC3
86 11
CD56 Bright NK Cells Classical Memory B Cells
26 115
Mucosal-Associated Invariant T Cells CD56 Dim NK Cells
463 274
Classical Monocytes CD8 T Cells
252 1886
Nonclassical Monocytes Naive B Cells
86 250
asDC IgM Memory B Cells
1 307
Classical Monocytes - HSP artifact Age-associated B Cells
1 7
Adaptive NK Cells
14
Check uniform clusters
getClData <- function(objCOTAN, clList, cl) {
checker <- new("SimpleGDIUniformityCheck")
checker@check@maxRatioBeyond <- 0.02
cluster <- names(clList)[[cl]]
cells <- clList[[cl]]
res <- checkClusterUniformity(objCOTAN, cluster = cluster, cells = cells,checker = checker,
cores = 6, saveObj = FALSE)
return(res)
}originalC3Cl <- getClusters(aRunObj, clName = "c3")
sort(table(originalC3Cl), decreasing = TRUE)originalC3Cl
CD4 T Cells CD8 T Cells
2473 1886
Mucosal-Associated Invariant T Cells IgM Memory B Cells
463 307
CD56 Dim NK Cells Gamma-Delta T Cells
274 273
Classical Monocytes Naive B Cells
252 250
Classical Memory B Cells Intermediate Monocytes
115 86
Nonclassical Monocytes cDC2
86 26
CD56 Bright NK Cells Adaptive NK Cells
26 14
cDC3 pDC
11 7
Age-associated B Cells asDC
7 1
Classical Monocytes - HSP artifact
1
largeCl <- which(table(originalC3Cl) >= 50)
largeOrigCl <- originalC3Cl[(originalC3Cl %in% names(largeCl)),
drop = TRUE]
largeOrigCl <- toClustersList(largeOrigCl)vapply(largeOrigCl, length, integer(1)) CD4 T Cells Gamma-Delta T Cells
2473 273
Intermediate Monocytes Classical Memory B Cells
86 115
Mucosal-Associated Invariant T Cells CD56 Dim NK Cells
463 274
Classical Monocytes CD8 T Cells
252 1886
Nonclassical Monocytes Naive B Cells
86 250
IgM Memory B Cells
307
resDataC3 <- lapply(seq_along(largeOrigCl), FUN = getClData,
objCOTAN = aRunObj, clList = largeOrigCl)
clDataC3 <- checkersToDF(resDataC3)
rownames(clDataC3) <- names(largeOrigCl)
clDataC3 class
CD4 T Cells SimpleGDIUniformityCheck
Gamma-Delta T Cells SimpleGDIUniformityCheck
Intermediate Monocytes SimpleGDIUniformityCheck
Classical Memory B Cells SimpleGDIUniformityCheck
Mucosal-Associated Invariant T Cells SimpleGDIUniformityCheck
CD56 Dim NK Cells SimpleGDIUniformityCheck
Classical Monocytes SimpleGDIUniformityCheck
CD8 T Cells SimpleGDIUniformityCheck
Nonclassical Monocytes SimpleGDIUniformityCheck
Naive B Cells SimpleGDIUniformityCheck
IgM Memory B Cells SimpleGDIUniformityCheck
check.isCheckAbove check.GDIThreshold
CD4 T Cells FALSE 1.4
Gamma-Delta T Cells FALSE 1.4
Intermediate Monocytes FALSE 1.4
Classical Memory B Cells FALSE 1.4
Mucosal-Associated Invariant T Cells FALSE 1.4
CD56 Dim NK Cells FALSE 1.4
Classical Monocytes FALSE 1.4
CD8 T Cells FALSE 1.4
Nonclassical Monocytes FALSE 1.4
Naive B Cells FALSE 1.4
IgM Memory B Cells FALSE 1.4
check.maxRatioBeyond check.maxRankBeyond
CD4 T Cells 0.02 0
Gamma-Delta T Cells 0.02 0
Intermediate Monocytes 0.02 0
Classical Memory B Cells 0.02 0
Mucosal-Associated Invariant T Cells 0.02 0
CD56 Dim NK Cells 0.02 0
Classical Monocytes 0.02 0
CD8 T Cells 0.02 0
Nonclassical Monocytes 0.02 0
Naive B Cells 0.02 0
IgM Memory B Cells 0.02 0
check.fractionBeyond check.thresholdRank
CD4 T Cells 3.912110e-01 0
Gamma-Delta T Cells 1.018973e-01 0
Intermediate Monocytes 7.077642e-05 0
Classical Memory B Cells 3.111619e-02 0
Mucosal-Associated Invariant T Cells 7.696423e-02 0
CD56 Dim NK Cells 3.321295e-02 0
Classical Monocytes 1.807822e-02 0
CD8 T Cells 4.241862e-01 0
Nonclassical Monocytes 2.232498e-02 0
Naive B Cells 8.345291e-02 0
IgM Memory B Cells 1.488212e-01 0
check.quantileAtRatio check.quantileAtRank
CD4 T Cells 2.119466 NaN
Gamma-Delta T Cells 1.527186 NaN
Intermediate Monocytes 1.287027 NaN
Classical Memory B Cells 1.424139 NaN
Mucosal-Associated Invariant T Cells 1.495065 NaN
CD56 Dim NK Cells 1.421830 NaN
Classical Monocytes 1.394924 NaN
CD8 T Cells 2.233193 NaN
Nonclassical Monocytes 1.406190 NaN
Naive B Cells 1.500825 NaN
IgM Memory B Cells 1.593955 NaN
isUniform clusterSize
CD4 T Cells FALSE 2473
Gamma-Delta T Cells FALSE 273
Intermediate Monocytes TRUE 86
Classical Memory B Cells FALSE 115
Mucosal-Associated Invariant T Cells FALSE 463
CD56 Dim NK Cells FALSE 274
Classical Monocytes TRUE 252
CD8 T Cells FALSE 1886
Nonclassical Monocytes FALSE 86
Naive B Cells FALSE 250
IgM Memory B Cells FALSE 307
Sys.time()[1] "2026-01-05 12:59:56 CET"
sessionInfo()R version 4.5.2 (2025-10-31)
Platform: x86_64-pc-linux-gnu
Running under: Ubuntu 22.04.5 LTS
Matrix products: default
BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.10.0
LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0 LAPACK version 3.10.0
locale:
[1] LC_CTYPE=C.UTF-8 LC_NUMERIC=C LC_TIME=C.UTF-8
[4] LC_COLLATE=C.UTF-8 LC_MONETARY=C.UTF-8 LC_MESSAGES=C.UTF-8
[7] LC_PAPER=C.UTF-8 LC_NAME=C LC_ADDRESS=C
[10] LC_TELEPHONE=C LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C
time zone: Europe/Rome
tzcode source: system (glibc)
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] COTAN_2.11.1 Matrix_1.7-4 conflicted_1.2.0 zeallot_0.2.0
[5] ggplot2_4.0.1 assertthat_0.2.1
loaded via a namespace (and not attached):
[1] RcppAnnoy_0.0.22 splines_4.5.2
[3] later_1.4.2 tibble_3.3.0
[5] polyclip_1.10-7 fastDummies_1.7.5
[7] lifecycle_1.0.4 doParallel_1.0.17
[9] processx_3.8.6 globals_0.18.0
[11] lattice_0.22-7 MASS_7.3-65
[13] ggdist_3.3.3 dendextend_1.19.0
[15] magrittr_2.0.4 plotly_4.11.0
[17] rmarkdown_2.29 yaml_2.3.10
[19] httpuv_1.6.16 Seurat_5.2.1
[21] sctransform_0.4.2 spam_2.11-1
[23] sp_2.2-0 spatstat.sparse_3.1-0
[25] reticulate_1.42.0 cowplot_1.2.0
[27] pbapply_1.7-2 RColorBrewer_1.1-3
[29] abind_1.4-8 Rtsne_0.17
[31] GenomicRanges_1.60.0 purrr_1.2.0
[33] BiocGenerics_0.54.0 coro_1.1.0
[35] torch_0.14.2 circlize_0.4.16
[37] GenomeInfoDbData_1.2.14 IRanges_2.44.0
[39] S4Vectors_0.48.0 ggrepel_0.9.6
[41] irlba_2.3.5.1 listenv_0.9.1
[43] spatstat.utils_3.1-4 goftest_1.2-3
[45] RSpectra_0.16-2 spatstat.random_3.4-1
[47] fitdistrplus_1.2-2 parallelly_1.46.0
[49] codetools_0.2-20 DelayedArray_0.34.1
[51] tidyselect_1.2.1 shape_1.4.6.1
[53] UCSC.utils_1.4.0 farver_2.1.2
[55] ScaledMatrix_1.16.0 viridis_0.6.5
[57] matrixStats_1.5.0 stats4_4.5.2
[59] spatstat.explore_3.4-2 jsonlite_2.0.0
[61] GetoptLong_1.0.5 progressr_0.15.1
[63] ggridges_0.5.6 survival_3.8-3
[65] iterators_1.0.14 foreach_1.5.2
[67] tools_4.5.2 ica_1.0-3
[69] Rcpp_1.0.14 glue_1.8.0
[71] gridExtra_2.3 SparseArray_1.10.8
[73] xfun_0.52 MatrixGenerics_1.20.0
[75] distributional_0.5.0 ggthemes_5.2.0
[77] GenomeInfoDb_1.44.0 dplyr_1.1.4
[79] withr_3.0.2 fastmap_1.2.0
[81] callr_3.7.6 digest_0.6.37
[83] rsvd_1.0.5 parallelDist_0.2.6
[85] R6_2.6.1 mime_0.13
[87] colorspace_2.1-1 scattermore_1.2
[89] tensor_1.5 spatstat.data_3.1-6
[91] tidyr_1.3.1 generics_0.1.3
[93] data.table_1.17.0 httr_1.4.7
[95] htmlwidgets_1.6.4 S4Arrays_1.10.1
[97] uwot_0.2.3 pkgconfig_2.0.3
[99] gtable_0.3.6 ComplexHeatmap_2.26.0
[101] lmtest_0.9-40 S7_0.2.1
[103] SingleCellExperiment_1.32.0 XVector_0.50.0
[105] htmltools_0.5.8.1 dotCall64_1.2
[107] zigg_0.0.2 clue_0.3-66
[109] SeuratObject_5.1.0 scales_1.4.0
[111] Biobase_2.68.0 png_0.1-8
[113] spatstat.univar_3.1-3 knitr_1.50
[115] reshape2_1.4.4 rjson_0.2.23
[117] nlme_3.1-168 proxy_0.4-27
[119] cachem_1.1.0 zoo_1.8-14
[121] GlobalOptions_0.1.2 stringr_1.6.0
[123] KernSmooth_2.23-26 parallel_4.5.2
[125] miniUI_0.1.2 pillar_1.10.2
[127] grid_4.5.2 vctrs_0.6.5
[129] RANN_2.6.2 promises_1.3.2
[131] BiocSingular_1.26.1 beachmat_2.26.0
[133] xtable_1.8-4 cluster_2.1.8.1
[135] evaluate_1.0.3 cli_3.6.5
[137] compiler_4.5.2 rlang_1.1.6
[139] crayon_1.5.3 future.apply_1.20.0
[141] labeling_0.4.3 ps_1.9.1
[143] plyr_1.8.9 stringi_1.8.7
[145] viridisLite_0.4.2 deldir_2.0-4
[147] BiocParallel_1.44.0 lazyeval_0.2.2
[149] spatstat.geom_3.4-1 RcppHNSW_0.6.0
[151] patchwork_1.3.2 bit64_4.6.0-1
[153] future_1.58.0 shiny_1.11.0
[155] SummarizedExperiment_1.38.1 ROCR_1.0-11
[157] Rfast_2.1.5.1 igraph_2.1.4
[159] memoise_2.0.1 RcppParallel_5.1.10
[161] bit_4.6.0