DatasetCleaning_Run77

Preamble

# Util libs
library(assertthat)
library(ggplot2)
library(zeallot)
library(conflicted)
library(Matrix)

# Data processing libs
if (!suppressWarnings(require(COTAN))) {
  devtools::load_all("~/dev/COTAN/COTAN/")
}

conflicts_prefer(zeallot::`%->%`, zeallot::`%<-%`)

options(parallelly.fork.enable = TRUE)

setLoggingLevel(2L)

List input files

inDir <- file.path("..")
outDir <- file.path(".")

list.files(path = inDir, pattern = "\\.RDS$")
[1] "capillary_blood_samples_pbmcs-Run_28.RDS"        
[2] "capillary_blood_samples_pbmcs-Run_40-Cleaned.RDS"
[3] "capillary_blood_samples_pbmcs-Run_40.RDS"        
[4] "capillary_blood_samples_pbmcs-Run_41-Cleaned.RDS"
[5] "capillary_blood_samples_pbmcs-Run_41.RDS"        
[6] "capillary_blood_samples_pbmcs-Run_62.RDS"        
[7] "capillary_blood_samples_pbmcs-Run_77-Cleaned.RDS"
[8] "capillary_blood_samples_pbmcs-Run_77.RDS"        
[9] "capillary_blood_samples_pbmcs.RDS"               
globalCondition <- "capillary_blood_samples_pbmcs"

thisRun <- "77"
fileNameIn <- paste0(globalCondition, "-Run_", thisRun, ".RDS")

setLoggingFile(file.path(outDir, paste0("DatasetCleaning_Run", thisRun, ".log")))

Load dataset

aRunObj <- readRDS(file = file.path(inDir, fileNameIn))

getAllConditions(aRunObj)
[1] "Sample.IDs"              "Participant.IDs"        
[3] "Cell.Barcoding.Runs"     "Lane"                   
[5] "cell_barcoding_protocol" "run_lane_batch"         
[7] "original_sample_id"     
getClusterizations(aRunObj)
[1] "cell_type_level_1" "cell_type_level_2" "cell_type_level_3"
[4] "cell_type_level_4" "c1"                "c2"               
[7] "c3"                "c4"               

Cleaning

clean() using standard thresholds

Compare to the other runs, this one does not have the cell labelling!

#cellsToDrop <- names(getClusterizationData(aRunObj,"c4")[[1]][is.na(getClusterizationData(aRunObj,"c4")[[1]])])

#aRunObj <- dropGenesCells(aRunObj, cells = cellsToDrop)


aRunObj <- clean(aRunObj)

Check the initial plots

cellSizePlot(aRunObj, condName = "Participant.IDs")

genesSizePlot(aRunObj, condName = "Participant.IDs")

scatterPlot(aRunObj, condName = "Participant.IDs")

c(mitPerPlot, mitPerDf) %<-%
  mitochondrialPercentagePlot(aRunObj, condName = "Participant.IDs",
                              genePrefix = "^MT-")
mitPerPlot

Remove too high mitochondrial percentage cells

Expected percentages are around 6%-10%: too high percentage imply dead cells

sum(mitPerDf[["mit.percentage"]] > 20.0)
[1] 126
sum(mitPerDf[["mit.percentage"]] > 15.0)
[1] 219
sum(mitPerDf[["mit.percentage"]] > 14.0)
[1] 250
sum(mitPerDf[["mit.percentage"]] > 10.0)
[1] 537
# drop above 14%
mitPerThr <- 20.0
aRunObj <-
  addElementToMetaDataset(
    aRunObj,
    "Mitochondrial percentage threshold", mitPerThr)

cellsToDrop <- getCells(aRunObj)[which(mitPerDf[["mit.percentage"]] > mitPerThr)]

aRunObj <- dropGenesCells(aRunObj, cells = cellsToDrop)

Redo the plots

cellSizePlot(aRunObj, condName = "Participant.IDs")

genesSizePlot(aRunObj, condName = "Participant.IDs")

scatterPlot(aRunObj, condName = "Participant.IDs")

c(mitPerPlot, mitPerDf) %<-%
  mitochondrialPercentagePlot(aRunObj, condName = "Participant.IDs",
                              genePrefix = "^MT-")
mitPerPlot

Remove cells with too high size

librarySize <- getCellsSize(aRunObj)

sum(librarySize > 25000)
[1] 24
sum(librarySize > 20000)
[1] 34
sum(librarySize > 18000)
[1] 47
sum(librarySize > 15000)
[1] 99
sum(librarySize > 14000)
[1] 118
# drop above 15K
cellSizeThr <- 15000
aRunObj <-
  addElementToMetaDataset(
    aRunObj,
    "Cell size threshold", cellSizeThr)

cellsToDrop <- getCells(aRunObj)[which(librarySize > cellSizeThr)]

aRunObj <- dropGenesCells(aRunObj, cells = cellsToDrop)

Redo the plots

cellSizePlot(aRunObj, condName = "Participant.IDs")

genesSizePlot(aRunObj, condName = "Participant.IDs")

scatterPlot(aRunObj, condName = "Participant.IDs")

c(mitPerPlot, mitPerDf) %<-%
  mitochondrialPercentagePlot(aRunObj, condName = "Participant.IDs",
                              genePrefix = "^MT-")
mitPerPlot

Check for spurious clusters

aRunObj <- clean(aRunObj)

c(pcaCells, pcaCellsData, genes, UDE, nu, zoomedNu) %<-%
  cleanPlots(aRunObj, includePCA = TRUE)

Plot PCA and Nu

plot(pcaCells)

plot(UDE)

plot(pcaCellsData)

plot(genes)

plot(nu)

plot(zoomedNu)

Remove cells below nu elbow point

nu <- getNu(aRunObj)

sum(nu < 0.15)
[1] 15
sum(nu < 0.18)
[1] 19
sum(nu < 0.20)
[1] 21
# drop above 11%
lowNuThr <- 0.19
aRunObj <-
  addElementToMetaDataset(
    aRunObj,
    "Low Nu threshold", lowNuThr)

cellsToDrop <- getCells(aRunObj)[which(nu < lowNuThr)]
cells_to_rem <- rownames(pcaCellsData)[pcaCellsData[["groups"]] == "B"]

aRunObj <- dropGenesCells(aRunObj, cells = c(cellsToDrop,cells_to_rem))

Redo all the plots

aRunObj <- clean(aRunObj)

c(pcaCells, pcaCellsData, genes, UDE, nu, zoomedNu) %<-%
  cleanPlots(aRunObj, includePCA = TRUE)
plot(pcaCells)

plot(UDE)

plot(pcaCellsData)

plot(genes)

plot(nu)

plot(zoomedNu)

cellSizePlot(aRunObj, condName = "Sample.IDs")

genesSizePlot(aRunObj, condName = "Sample.IDs")

scatterPlot(aRunObj, condName = "Sample.IDs")

cellSizePlot(aRunObj, condName = "Participant.IDs")

genesSizePlot(aRunObj, condName = "Participant.IDs")

scatterPlot(aRunObj, condName = "Participant.IDs")

c(mitPerPlot, mitPerDf) %<-%
  mitochondrialPercentagePlot(aRunObj, condName = "Participant.IDs",
                              genePrefix = "^MT-")
mitPerPlot

Finalize object and save

aRunObj <- proceedToCoex(aRunObj, calcCoex = FALSE, cores = 1L, saveObj = FALSE)
fileNameOut <- paste0(globalCondition, "-Run_", thisRun, "-Cleaned", ".RDS")

saveRDS(aRunObj, file = file.path(inDir, fileNameOut))
df <- UDE@data
df$Participant.IDs <- NA
df$Sample.IDs <- NA
df$lane <- NA
df$Participant.IDs <-getCondition(aRunObj,condName = "Participant.IDs")
df$Sample.IDs <- getCondition(aRunObj,condName = "Sample.IDs")
df$lane <-getCondition(aRunObj,condName = "Lane")


df$Sample.IDs <- as.character(df$Sample.IDs)
df$Participant.IDs <- as.character(df$Participant.IDs)
df$lane <- as.character(df$lane)


ggplot(df,aes(x=PC1,y=PC2,color = Participant.IDs)) +geom_point()

ggplot(df,aes(x=PC1,y=PC2,color = Sample.IDs)) +geom_point()

ggplot(df,aes(x=PC1,y=PC2,color = lane)) +geom_point()

ggplot(df,aes(x=PC1,y=PC3,color = Participant.IDs)) +geom_point()

ggplot(df,aes(x=PC1,y=PC3,color = Sample.IDs)) +geom_point()

ggplot(df,aes(x=PC1,y=PC3,color = lane)) +geom_point()

ggplot(df,aes(x=PC1,y=PC4,color = Participant.IDs)) +geom_point()

ggplot(df,aes(x=PC1,y=PC4,color = Sample.IDs)) +geom_point()

ggplot(df,aes(x=PC1,y=PC4,color = lane)) +geom_point()

Sys.time()
[1] "2026-01-05 12:34:22 CET"
sessionInfo()
R version 4.5.2 (2025-10-31)
Platform: x86_64-pc-linux-gnu
Running under: Ubuntu 22.04.5 LTS

Matrix products: default
BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.10.0 
LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0  LAPACK version 3.10.0

locale:
 [1] LC_CTYPE=C.UTF-8       LC_NUMERIC=C           LC_TIME=C.UTF-8       
 [4] LC_COLLATE=C.UTF-8     LC_MONETARY=C.UTF-8    LC_MESSAGES=C.UTF-8   
 [7] LC_PAPER=C.UTF-8       LC_NAME=C              LC_ADDRESS=C          
[10] LC_TELEPHONE=C         LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C   

time zone: Europe/Rome
tzcode source: system (glibc)

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] COTAN_2.11.1     Matrix_1.7-4     conflicted_1.2.0 zeallot_0.2.0   
[5] ggplot2_4.0.1    assertthat_0.2.1

loaded via a namespace (and not attached):
  [1] RcppAnnoy_0.0.22            splines_4.5.2              
  [3] later_1.4.2                 tibble_3.3.0               
  [5] polyclip_1.10-7             fastDummies_1.7.5          
  [7] lifecycle_1.0.4             doParallel_1.0.17          
  [9] globals_0.18.0              lattice_0.22-7             
 [11] MASS_7.3-65                 ggdist_3.3.3               
 [13] dendextend_1.19.0           magrittr_2.0.4             
 [15] plotly_4.11.0               rmarkdown_2.29             
 [17] yaml_2.3.10                 httpuv_1.6.16              
 [19] Seurat_5.2.1                sctransform_0.4.2          
 [21] spam_2.11-1                 sp_2.2-0                   
 [23] spatstat.sparse_3.1-0       reticulate_1.42.0          
 [25] cowplot_1.2.0               pbapply_1.7-2              
 [27] RColorBrewer_1.1-3          abind_1.4-8                
 [29] Rtsne_0.17                  GenomicRanges_1.60.0       
 [31] purrr_1.2.0                 BiocGenerics_0.54.0        
 [33] circlize_0.4.16             GenomeInfoDbData_1.2.14    
 [35] IRanges_2.44.0              S4Vectors_0.48.0           
 [37] ggrepel_0.9.6               irlba_2.3.5.1              
 [39] listenv_0.9.1               spatstat.utils_3.1-4       
 [41] goftest_1.2-3               RSpectra_0.16-2            
 [43] spatstat.random_3.4-1       fitdistrplus_1.2-2         
 [45] parallelly_1.46.0           codetools_0.2-20           
 [47] DelayedArray_0.34.1         tidyselect_1.2.1           
 [49] shape_1.4.6.1               UCSC.utils_1.4.0           
 [51] farver_2.1.2                ScaledMatrix_1.16.0        
 [53] viridis_0.6.5               matrixStats_1.5.0          
 [55] stats4_4.5.2                spatstat.explore_3.4-2     
 [57] jsonlite_2.0.0              GetoptLong_1.0.5           
 [59] progressr_0.15.1            ggridges_0.5.6             
 [61] survival_3.8-3              iterators_1.0.14           
 [63] foreach_1.5.2               tools_4.5.2                
 [65] ica_1.0-3                   Rcpp_1.0.14                
 [67] glue_1.8.0                  gridExtra_2.3              
 [69] SparseArray_1.10.8          xfun_0.52                  
 [71] MatrixGenerics_1.20.0       distributional_0.5.0       
 [73] ggthemes_5.2.0              GenomeInfoDb_1.44.0        
 [75] dplyr_1.1.4                 withr_3.0.2                
 [77] fastmap_1.2.0               digest_0.6.37              
 [79] rsvd_1.0.5                  parallelDist_0.2.6         
 [81] R6_2.6.1                    mime_0.13                  
 [83] colorspace_2.1-1            scattermore_1.2            
 [85] tensor_1.5                  spatstat.data_3.1-6        
 [87] tidyr_1.3.1                 generics_0.1.3             
 [89] data.table_1.17.0           httr_1.4.7                 
 [91] htmlwidgets_1.6.4           S4Arrays_1.10.1            
 [93] uwot_0.2.3                  pkgconfig_2.0.3            
 [95] gtable_0.3.6                ComplexHeatmap_2.26.0      
 [97] lmtest_0.9-40               S7_0.2.1                   
 [99] SingleCellExperiment_1.32.0 XVector_0.50.0             
[101] htmltools_0.5.8.1           dotCall64_1.2              
[103] zigg_0.0.2                  clue_0.3-66                
[105] SeuratObject_5.1.0          scales_1.4.0               
[107] Biobase_2.68.0              png_0.1-8                  
[109] spatstat.univar_3.1-3       knitr_1.50                 
[111] reshape2_1.4.4              rjson_0.2.23               
[113] nlme_3.1-168                proxy_0.4-27               
[115] cachem_1.1.0                zoo_1.8-14                 
[117] GlobalOptions_0.1.2         stringr_1.6.0              
[119] KernSmooth_2.23-26          parallel_4.5.2             
[121] miniUI_0.1.2                pillar_1.10.2              
[123] grid_4.5.2                  vctrs_0.6.5                
[125] RANN_2.6.2                  promises_1.3.2             
[127] BiocSingular_1.26.1         beachmat_2.26.0            
[129] xtable_1.8-4                cluster_2.1.8.1            
[131] evaluate_1.0.3              cli_3.6.5                  
[133] compiler_4.5.2              rlang_1.1.6                
[135] crayon_1.5.3                future.apply_1.20.0        
[137] labeling_0.4.3              plyr_1.8.9                 
[139] stringi_1.8.7               viridisLite_0.4.2          
[141] deldir_2.0-4                BiocParallel_1.44.0        
[143] lazyeval_0.2.2              spatstat.geom_3.4-1        
[145] RcppHNSW_0.6.0              patchwork_1.3.2            
[147] future_1.58.0               shiny_1.11.0               
[149] SummarizedExperiment_1.38.1 ROCR_1.0-11                
[151] Rfast_2.1.5.1               igraph_2.1.4               
[153] memoise_2.0.1               RcppParallel_5.1.10