Dataset Cleaning

Preamble

# Util libs
library(assertthat)
library(ggplot2)
library(zeallot)
library(conflicted)
library(Matrix)

# Data processing libs
if (!suppressWarnings(require(COTAN))) {
  devtools::load_all("~/dev/COTAN/COTAN/")
}

conflicts_prefer(zeallot::`%->%`, zeallot::`%<-%`)

options(parallelly.fork.enable = TRUE)

setLoggingLevel(2L)

List input files

#setwd("d:/COTAN_Datasets_analysis/Data/NewDataRevision/")
inDir <- file.path("MouseBrainOpenProblem")
outDir <- file.path("MouseBrainOpenProblem")

list.files(path = inDir, pattern = "\\.RDS$")
[1] "MouseBrainOP_COTAN_raw.RDS"    "MouseBrainOPCOTAN-Cleaned.RDS"
globalCondition <- "MouseBrainOP"

fileNameIn <- paste0(globalCondition, "_COTAN_raw.RDS")

setLoggingFile(file.path(outDir, paste0("DatasetCleaning.log")))

Load dataset

CotanObj <- readRDS(file = file.path(inDir, fileNameIn))

getAllConditions(CotanObj)
 [1] "metadata_cells"        "age_days"              "brain_hemisphere"     
 [4] "brain_region"          "brain_subregion"       "class"                
 [7] "cluster"               "driver_lines"          "eye_condition"        
[10] "facs_container"        "facs_date"             "facs_sort_criteria"   
[13] "genotype"              "gfp_cpm"               "reporter_lines"       
[16] "rna_amplification_set" "sample_id"             "sample_type"          
[19] "seq_batch"             "seq_name"              "seq_tube"             
[22] "sex"                  
getClusterizations(CotanObj)
[1] "cell_type"        "metadata_cells"   "age_days"         "brain_hemisphere"
[5] "brain_region"     "brain_subregion"  "class"            "cluster"         
dim(getRawData(CotanObj))
[1] 34617 14249
table(getCondition(CotanObj, condName = "class"))

  Endothelial     GABAergic Glutamatergic  Non-Neuronal 
          181          6125          7366           577 

KEeping only nuronal cells

cellsToDrop <- names(getCondition(CotanObj, condName = "class")[getCondition(CotanObj, condName = "class") %in% c("Endothelial","Non-Neuronal")])

CotanObj <- dropGenesCells(CotanObj, cells = cellsToDrop)

Cleaning

clean() using standard thresholds

CotanObj <- clean(CotanObj)

Check the initial plots

cellSizePlot(CotanObj, condName = "facs_sort_criteria")

genesSizePlot(CotanObj, condName = "facs_sort_criteria")

scatterPlot(CotanObj, condName = "facs_sort_criteria")

Since the canonical prefix for mitocondiral genes is not found, we will use the genes defined by MitoCarta3.0

mitGenes <- readxl::read_excel("MouseBrainOpenProblem/Mouse.MitoCarta3.0.xls",sheet =2)
mitGenes <- unique(mitGenes[mitGenes$MSMS_NUM_TISSUES == 14,]$Symbol)

c(mitPerPlot, mitPerDf) %<-%
  genesPercentagePlot(CotanObj, condName = "facs_sort_criteria",title = "Mit %",
  genes = mitGenes)

mitPerPlot

Remove too high mitochondrial percentage cells

Expected percentages are around 6%-10%: too high percentage imply dead cells

sum(mitPerDf[["percentage"]] > 15)
[1] 1
sum(mitPerDf[["percentage"]] > 14.0)
[1] 4
sum(mitPerDf[["percentage"]] > 13.0)
[1] 9
sum(mitPerDf[["percentage"]] > 12.0)
[1] 78
sum(mitPerDf[["percentage"]] > 11.0)
[1] 310
sum(mitPerDf[["percentage"]] > 10.0)
[1] 793
# drop above 14%
mitPerThr <- 12.0
CotanObj <-
  addElementToMetaDataset(
    CotanObj,
    "Mitochondrial percentage threshold", mitPerThr)

cellsToDrop <- getCells(CotanObj)[which(mitPerDf[["percentage"]] > mitPerThr)]

CotanObj <- dropGenesCells(CotanObj, cells = cellsToDrop)

Redo the plots

cellSizePlot(CotanObj, condName = "facs_sort_criteria")

genesSizePlot(CotanObj, condName = "facs_sort_criteria")

scatterPlot(CotanObj, condName = "facs_sort_criteria")

c(mitPerPlot, mitPerDf) %<-%
  genesPercentagePlot(CotanObj, condName = "facs_sort_criteria",title = "Mit %",
  genes = mitGenes)
mitPerPlot

Remove cells with too high size

librarySize <- getCellsSize(CotanObj)

sum(librarySize > 4000000)
[1] 32
sum(librarySize > 3500000)
[1] 45
sum(librarySize > 3000000)
[1] 76
# drop above 11%
cellSizeThr <- 3500000
CotanObj <-
  addElementToMetaDataset(
    CotanObj,
    "Cell size threshold", cellSizeThr)

cellsToDrop <- getCells(CotanObj)[which(librarySize > cellSizeThr)]

CotanObj <- dropGenesCells(CotanObj, cells = cellsToDrop)

Redo the plots

cellSizePlot(CotanObj, condName = "facs_sort_criteria")

genesSizePlot(CotanObj, condName = "facs_sort_criteria")

scatterPlot(CotanObj, condName = "facs_sort_criteria")

c(mitPerPlot, mitPerDf) %<-%
  genesPercentagePlot(CotanObj, condName = "facs_sort_criteria",title = "Mit %",
  genes = mitGenes)

mitPerPlot

Check for spurious clusters

CotanObj <- clean(CotanObj)

c(pcaCells, pcaCellsData, genes, UDE, nu, zoomedNu) %<-%
  cleanPlots(CotanObj, includePCA = TRUE)

Plot PCA and Nu

plot(pcaCells)

plot(UDE)

plot(pcaCellsData)

plot(genes)

plot(nu)

plot(zoomedNu)

Remove cells below nu elbow point

nu <- getNu(CotanObj)

sum(nu < 0.1)
[1] 13
sum(nu < 0.11)
[1] 17
sum(nu < 0.12)
[1] 19
sum(nu < 0.13)
[1] 20
# drop above 11%
lowNuThr <- 0.13
CotanObj <-
  addElementToMetaDataset(
    CotanObj,
    "Low Nu threshold", lowNuThr)

cellsToDrop <- getCells(CotanObj)[which(nu < lowNuThr)]
#cells_to_rem <- rownames(pcaCellsData)[pcaCellsData[["groups"]] == "B"]

CotanObj <- dropGenesCells(CotanObj, cells = cellsToDrop)

Redo all the plots

CotanObj <- clean(CotanObj)

c(pcaCells, pcaCellsData, genes, UDE, nu, zoomedNu) %<-%
  cleanPlots(CotanObj, includePCA = TRUE)
plot(pcaCells)

plot(UDE)

#plot(pcaCellsData)

plot(genes)

plot(nu)

plot(zoomedNu)

# cellSizePlot(CotanObj,conditions = getClusterizationData(CotanObj,clName = "c1")[[1]])

# genesSizePlot(CotanObj,conditions = getClusterizationData(CotanObj,clName = "c1")[[1]])
# scatterPlot(CotanObj,conditions = getClusterizationData(CotanObj,clName = "c1")[[1]])

# cellSizePlot(CotanObj,conditions = getClusterizationData(CotanObj,clName = "c2")[[1]])
# genesSizePlot(CotanObj,conditions = getClusterizationData(CotanObj,clName = "c2")[[1]])
# scatterPlot(CotanObj,conditions = getClusterizationData(CotanObj,clName = "c2")[[1]])


# c(mitPerPlotc1, mitPerDfc1) %<-%
#   mitochondrialPercentagePlot(CotanObj, conditions = getClusterizationData(CotanObj,clName = "c1")[[1]],
#                               genePrefix = "^MT-")

# c(mitPerPlotc2, mitPerDfc2) %<-%
#   mitochondrialPercentagePlot(CotanObj, conditions = getClusterizationData(CotanObj,clName = "c2")[[1]],
#                               genePrefix = "^MT-")
# mitPerPlotc1
# mitPerPlotc2

Finalize object and save

CotanObj <- proceedToCoex(CotanObj, calcCoex = FALSE, cores = 5L, saveObj = FALSE)
fileNameOut <- paste0(globalCondition, "COTAN-Cleaned", ".RDS")
saveRDS(CotanObj, file = file.path(inDir, fileNameOut))
Sys.time()
[1] "2026-01-09 18:01:34 CET"
sessionInfo()
R version 4.5.2 (2025-10-31)
Platform: x86_64-pc-linux-gnu
Running under: Ubuntu 22.04.5 LTS

Matrix products: default
BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.10.0 
LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0  LAPACK version 3.10.0

locale:
 [1] LC_CTYPE=C.UTF-8       LC_NUMERIC=C           LC_TIME=C.UTF-8       
 [4] LC_COLLATE=C.UTF-8     LC_MONETARY=C.UTF-8    LC_MESSAGES=C.UTF-8   
 [7] LC_PAPER=C.UTF-8       LC_NAME=C              LC_ADDRESS=C          
[10] LC_TELEPHONE=C         LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C   

time zone: Europe/Rome
tzcode source: system (glibc)

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] COTAN_2.11.1     Matrix_1.7-4     conflicted_1.2.0 zeallot_0.2.0   
[5] ggplot2_4.0.1    assertthat_0.2.1

loaded via a namespace (and not attached):
  [1] RcppAnnoy_0.0.22            splines_4.5.2              
  [3] later_1.4.2                 cellranger_1.1.0           
  [5] tibble_3.3.0                polyclip_1.10-7            
  [7] fastDummies_1.7.5           lifecycle_1.0.4            
  [9] doParallel_1.0.17           globals_0.18.0             
 [11] lattice_0.22-7              MASS_7.3-65                
 [13] ggdist_3.3.3                dendextend_1.19.0          
 [15] magrittr_2.0.4              plotly_4.11.0              
 [17] rmarkdown_2.29              yaml_2.3.10                
 [19] httpuv_1.6.16               Seurat_5.2.1               
 [21] sctransform_0.4.2           spam_2.11-1                
 [23] sp_2.2-0                    spatstat.sparse_3.1-0      
 [25] reticulate_1.42.0           cowplot_1.2.0              
 [27] pbapply_1.7-2               RColorBrewer_1.1-3         
 [29] abind_1.4-8                 Rtsne_0.17                 
 [31] GenomicRanges_1.62.1        purrr_1.2.0                
 [33] BiocGenerics_0.56.0         circlize_0.4.16            
 [35] GenomeInfoDbData_1.2.14     IRanges_2.44.0             
 [37] S4Vectors_0.48.0            ggrepel_0.9.6              
 [39] irlba_2.3.5.1               listenv_0.9.1              
 [41] spatstat.utils_3.1-4        goftest_1.2-3              
 [43] RSpectra_0.16-2             spatstat.random_3.4-1      
 [45] fitdistrplus_1.2-2          parallelly_1.46.0          
 [47] codetools_0.2-20            DelayedArray_0.36.0        
 [49] tidyselect_1.2.1            shape_1.4.6.1              
 [51] UCSC.utils_1.4.0            farver_2.1.2               
 [53] ScaledMatrix_1.16.0         viridis_0.6.5              
 [55] matrixStats_1.5.0           stats4_4.5.2               
 [57] spatstat.explore_3.4-2      Seqinfo_1.0.0              
 [59] jsonlite_2.0.0              GetoptLong_1.0.5           
 [61] progressr_0.15.1            ggridges_0.5.6             
 [63] survival_3.8-3              iterators_1.0.14           
 [65] foreach_1.5.2               tools_4.5.2                
 [67] ica_1.0-3                   Rcpp_1.1.0                 
 [69] glue_1.8.0                  gridExtra_2.3              
 [71] SparseArray_1.10.8          xfun_0.52                  
 [73] MatrixGenerics_1.22.0       distributional_0.5.0       
 [75] ggthemes_5.2.0              GenomeInfoDb_1.44.0        
 [77] dplyr_1.1.4                 withr_3.0.2                
 [79] fastmap_1.2.0               digest_0.6.37              
 [81] rsvd_1.0.5                  parallelDist_0.2.6         
 [83] R6_2.6.1                    mime_0.13                  
 [85] colorspace_2.1-1            scattermore_1.2            
 [87] tensor_1.5                  spatstat.data_3.1-6        
 [89] tidyr_1.3.1                 generics_0.1.3             
 [91] data.table_1.17.0           httr_1.4.7                 
 [93] htmlwidgets_1.6.4           S4Arrays_1.10.1            
 [95] uwot_0.2.3                  pkgconfig_2.0.3            
 [97] gtable_0.3.6                ComplexHeatmap_2.26.0      
 [99] lmtest_0.9-40               S7_0.2.1                   
[101] SingleCellExperiment_1.32.0 XVector_0.50.0             
[103] htmltools_0.5.8.1           dotCall64_1.2              
[105] zigg_0.0.2                  clue_0.3-66                
[107] SeuratObject_5.1.0          scales_1.4.0               
[109] Biobase_2.70.0              png_0.1-8                  
[111] spatstat.univar_3.1-3       knitr_1.50                 
[113] reshape2_1.4.4              rjson_0.2.23               
[115] nlme_3.1-168                proxy_0.4-27               
[117] cachem_1.1.0                zoo_1.8-14                 
[119] GlobalOptions_0.1.2         stringr_1.6.0              
[121] KernSmooth_2.23-26          parallel_4.5.2             
[123] miniUI_0.1.2                pillar_1.10.2              
[125] grid_4.5.2                  vctrs_0.6.5                
[127] RANN_2.6.2                  promises_1.3.2             
[129] BiocSingular_1.26.1         beachmat_2.26.0            
[131] xtable_1.8-4                cluster_2.1.8.1            
[133] evaluate_1.0.3              cli_3.6.5                  
[135] compiler_4.5.2              rlang_1.1.6                
[137] crayon_1.5.3                future.apply_1.20.0        
[139] labeling_0.4.3              plyr_1.8.9                 
[141] stringi_1.8.7               viridisLite_0.4.2          
[143] deldir_2.0-4                BiocParallel_1.44.0        
[145] lazyeval_0.2.2              spatstat.geom_3.4-1        
[147] RcppHNSW_0.6.0              patchwork_1.3.2            
[149] future_1.58.0               shiny_1.11.0               
[151] SummarizedExperiment_1.38.1 ROCR_1.0-11                
[153] Rfast_2.1.5.1               igraph_2.1.4               
[155] memoise_2.0.1               RcppParallel_5.1.10        
[157] readxl_1.4.5