# Util libs
library(assertthat)
library(ggplot2)
library(zeallot)
library(conflicted)
library(Matrix)
# Data processing libs
if (!suppressWarnings(require(COTAN))) {
devtools::load_all("~/dev/COTAN/COTAN/")
}
conflicts_prefer(zeallot::`%->%`, zeallot::`%<-%`)
options(parallelly.fork.enable = TRUE)
setLoggingLevel(2L)Dataset Cleaning
Preamble
List input files
#setwd("d:/COTAN_Datasets_analysis/Data/NewDataRevision/")
inDir <- file.path("MouseBrainOpenProblem")
outDir <- file.path("MouseBrainOpenProblem")
list.files(path = inDir, pattern = "\\.RDS$")[1] "MouseBrainOP_COTAN_raw.RDS" "MouseBrainOPCOTAN-Cleaned.RDS"
globalCondition <- "MouseBrainOP"
fileNameIn <- paste0(globalCondition, "_COTAN_raw.RDS")
setLoggingFile(file.path(outDir, paste0("DatasetCleaning.log")))Load dataset
CotanObj <- readRDS(file = file.path(inDir, fileNameIn))
getAllConditions(CotanObj) [1] "metadata_cells" "age_days" "brain_hemisphere"
[4] "brain_region" "brain_subregion" "class"
[7] "cluster" "driver_lines" "eye_condition"
[10] "facs_container" "facs_date" "facs_sort_criteria"
[13] "genotype" "gfp_cpm" "reporter_lines"
[16] "rna_amplification_set" "sample_id" "sample_type"
[19] "seq_batch" "seq_name" "seq_tube"
[22] "sex"
getClusterizations(CotanObj)[1] "cell_type" "metadata_cells" "age_days" "brain_hemisphere"
[5] "brain_region" "brain_subregion" "class" "cluster"
dim(getRawData(CotanObj))[1] 34617 14249
table(getCondition(CotanObj, condName = "class"))
Endothelial GABAergic Glutamatergic Non-Neuronal
181 6125 7366 577
KEeping only nuronal cells
cellsToDrop <- names(getCondition(CotanObj, condName = "class")[getCondition(CotanObj, condName = "class") %in% c("Endothelial","Non-Neuronal")])
CotanObj <- dropGenesCells(CotanObj, cells = cellsToDrop)Cleaning
clean() using standard thresholds
CotanObj <- clean(CotanObj)Check the initial plots
cellSizePlot(CotanObj, condName = "facs_sort_criteria")
genesSizePlot(CotanObj, condName = "facs_sort_criteria")
scatterPlot(CotanObj, condName = "facs_sort_criteria")
Since the canonical prefix for mitocondiral genes is not found, we will use the genes defined by MitoCarta3.0
mitGenes <- readxl::read_excel("MouseBrainOpenProblem/Mouse.MitoCarta3.0.xls",sheet =2)
mitGenes <- unique(mitGenes[mitGenes$MSMS_NUM_TISSUES == 14,]$Symbol)
c(mitPerPlot, mitPerDf) %<-%
genesPercentagePlot(CotanObj, condName = "facs_sort_criteria",title = "Mit %",
genes = mitGenes)
mitPerPlot
Remove too high mitochondrial percentage cells
Expected percentages are around 6%-10%: too high percentage imply dead cells
sum(mitPerDf[["percentage"]] > 15)[1] 1
sum(mitPerDf[["percentage"]] > 14.0)[1] 4
sum(mitPerDf[["percentage"]] > 13.0)[1] 9
sum(mitPerDf[["percentage"]] > 12.0)[1] 78
sum(mitPerDf[["percentage"]] > 11.0)[1] 310
sum(mitPerDf[["percentage"]] > 10.0)[1] 793
# drop above 14%
mitPerThr <- 12.0
CotanObj <-
addElementToMetaDataset(
CotanObj,
"Mitochondrial percentage threshold", mitPerThr)
cellsToDrop <- getCells(CotanObj)[which(mitPerDf[["percentage"]] > mitPerThr)]
CotanObj <- dropGenesCells(CotanObj, cells = cellsToDrop)Redo the plots
cellSizePlot(CotanObj, condName = "facs_sort_criteria")
genesSizePlot(CotanObj, condName = "facs_sort_criteria")
scatterPlot(CotanObj, condName = "facs_sort_criteria")
c(mitPerPlot, mitPerDf) %<-%
genesPercentagePlot(CotanObj, condName = "facs_sort_criteria",title = "Mit %",
genes = mitGenes)
mitPerPlot
Remove cells with too high size
librarySize <- getCellsSize(CotanObj)
sum(librarySize > 4000000)[1] 32
sum(librarySize > 3500000)[1] 45
sum(librarySize > 3000000)[1] 76
# drop above 11%
cellSizeThr <- 3500000
CotanObj <-
addElementToMetaDataset(
CotanObj,
"Cell size threshold", cellSizeThr)
cellsToDrop <- getCells(CotanObj)[which(librarySize > cellSizeThr)]
CotanObj <- dropGenesCells(CotanObj, cells = cellsToDrop)Redo the plots
cellSizePlot(CotanObj, condName = "facs_sort_criteria")
genesSizePlot(CotanObj, condName = "facs_sort_criteria")
scatterPlot(CotanObj, condName = "facs_sort_criteria")
c(mitPerPlot, mitPerDf) %<-%
genesPercentagePlot(CotanObj, condName = "facs_sort_criteria",title = "Mit %",
genes = mitGenes)
mitPerPlot
Check for spurious clusters
CotanObj <- clean(CotanObj)
c(pcaCells, pcaCellsData, genes, UDE, nu, zoomedNu) %<-%
cleanPlots(CotanObj, includePCA = TRUE)Plot PCA and Nu
plot(pcaCells)
plot(UDE)
plot(pcaCellsData)
plot(genes)
plot(nu)
plot(zoomedNu)
Remove cells below nu elbow point
nu <- getNu(CotanObj)
sum(nu < 0.1)[1] 13
sum(nu < 0.11)[1] 17
sum(nu < 0.12)[1] 19
sum(nu < 0.13)[1] 20
# drop above 11%
lowNuThr <- 0.13
CotanObj <-
addElementToMetaDataset(
CotanObj,
"Low Nu threshold", lowNuThr)
cellsToDrop <- getCells(CotanObj)[which(nu < lowNuThr)]
#cells_to_rem <- rownames(pcaCellsData)[pcaCellsData[["groups"]] == "B"]
CotanObj <- dropGenesCells(CotanObj, cells = cellsToDrop)Redo all the plots
CotanObj <- clean(CotanObj)
c(pcaCells, pcaCellsData, genes, UDE, nu, zoomedNu) %<-%
cleanPlots(CotanObj, includePCA = TRUE)plot(pcaCells)
plot(UDE)
#plot(pcaCellsData)
plot(genes)
plot(nu)
plot(zoomedNu)
# cellSizePlot(CotanObj,conditions = getClusterizationData(CotanObj,clName = "c1")[[1]])
# genesSizePlot(CotanObj,conditions = getClusterizationData(CotanObj,clName = "c1")[[1]])
# scatterPlot(CotanObj,conditions = getClusterizationData(CotanObj,clName = "c1")[[1]])
# cellSizePlot(CotanObj,conditions = getClusterizationData(CotanObj,clName = "c2")[[1]])
# genesSizePlot(CotanObj,conditions = getClusterizationData(CotanObj,clName = "c2")[[1]])
# scatterPlot(CotanObj,conditions = getClusterizationData(CotanObj,clName = "c2")[[1]])
# c(mitPerPlotc1, mitPerDfc1) %<-%
# mitochondrialPercentagePlot(CotanObj, conditions = getClusterizationData(CotanObj,clName = "c1")[[1]],
# genePrefix = "^MT-")
# c(mitPerPlotc2, mitPerDfc2) %<-%
# mitochondrialPercentagePlot(CotanObj, conditions = getClusterizationData(CotanObj,clName = "c2")[[1]],
# genePrefix = "^MT-")
# mitPerPlotc1
# mitPerPlotc2Finalize object and save
CotanObj <- proceedToCoex(CotanObj, calcCoex = FALSE, cores = 5L, saveObj = FALSE)fileNameOut <- paste0(globalCondition, "COTAN-Cleaned", ".RDS")
saveRDS(CotanObj, file = file.path(inDir, fileNameOut))Sys.time()[1] "2026-01-09 18:01:34 CET"
sessionInfo()R version 4.5.2 (2025-10-31)
Platform: x86_64-pc-linux-gnu
Running under: Ubuntu 22.04.5 LTS
Matrix products: default
BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.10.0
LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0 LAPACK version 3.10.0
locale:
[1] LC_CTYPE=C.UTF-8 LC_NUMERIC=C LC_TIME=C.UTF-8
[4] LC_COLLATE=C.UTF-8 LC_MONETARY=C.UTF-8 LC_MESSAGES=C.UTF-8
[7] LC_PAPER=C.UTF-8 LC_NAME=C LC_ADDRESS=C
[10] LC_TELEPHONE=C LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C
time zone: Europe/Rome
tzcode source: system (glibc)
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] COTAN_2.11.1 Matrix_1.7-4 conflicted_1.2.0 zeallot_0.2.0
[5] ggplot2_4.0.1 assertthat_0.2.1
loaded via a namespace (and not attached):
[1] RcppAnnoy_0.0.22 splines_4.5.2
[3] later_1.4.2 cellranger_1.1.0
[5] tibble_3.3.0 polyclip_1.10-7
[7] fastDummies_1.7.5 lifecycle_1.0.4
[9] doParallel_1.0.17 globals_0.18.0
[11] lattice_0.22-7 MASS_7.3-65
[13] ggdist_3.3.3 dendextend_1.19.0
[15] magrittr_2.0.4 plotly_4.11.0
[17] rmarkdown_2.29 yaml_2.3.10
[19] httpuv_1.6.16 Seurat_5.2.1
[21] sctransform_0.4.2 spam_2.11-1
[23] sp_2.2-0 spatstat.sparse_3.1-0
[25] reticulate_1.42.0 cowplot_1.2.0
[27] pbapply_1.7-2 RColorBrewer_1.1-3
[29] abind_1.4-8 Rtsne_0.17
[31] GenomicRanges_1.62.1 purrr_1.2.0
[33] BiocGenerics_0.56.0 circlize_0.4.16
[35] GenomeInfoDbData_1.2.14 IRanges_2.44.0
[37] S4Vectors_0.48.0 ggrepel_0.9.6
[39] irlba_2.3.5.1 listenv_0.9.1
[41] spatstat.utils_3.1-4 goftest_1.2-3
[43] RSpectra_0.16-2 spatstat.random_3.4-1
[45] fitdistrplus_1.2-2 parallelly_1.46.0
[47] codetools_0.2-20 DelayedArray_0.36.0
[49] tidyselect_1.2.1 shape_1.4.6.1
[51] UCSC.utils_1.4.0 farver_2.1.2
[53] ScaledMatrix_1.16.0 viridis_0.6.5
[55] matrixStats_1.5.0 stats4_4.5.2
[57] spatstat.explore_3.4-2 Seqinfo_1.0.0
[59] jsonlite_2.0.0 GetoptLong_1.0.5
[61] progressr_0.15.1 ggridges_0.5.6
[63] survival_3.8-3 iterators_1.0.14
[65] foreach_1.5.2 tools_4.5.2
[67] ica_1.0-3 Rcpp_1.1.0
[69] glue_1.8.0 gridExtra_2.3
[71] SparseArray_1.10.8 xfun_0.52
[73] MatrixGenerics_1.22.0 distributional_0.5.0
[75] ggthemes_5.2.0 GenomeInfoDb_1.44.0
[77] dplyr_1.1.4 withr_3.0.2
[79] fastmap_1.2.0 digest_0.6.37
[81] rsvd_1.0.5 parallelDist_0.2.6
[83] R6_2.6.1 mime_0.13
[85] colorspace_2.1-1 scattermore_1.2
[87] tensor_1.5 spatstat.data_3.1-6
[89] tidyr_1.3.1 generics_0.1.3
[91] data.table_1.17.0 httr_1.4.7
[93] htmlwidgets_1.6.4 S4Arrays_1.10.1
[95] uwot_0.2.3 pkgconfig_2.0.3
[97] gtable_0.3.6 ComplexHeatmap_2.26.0
[99] lmtest_0.9-40 S7_0.2.1
[101] SingleCellExperiment_1.32.0 XVector_0.50.0
[103] htmltools_0.5.8.1 dotCall64_1.2
[105] zigg_0.0.2 clue_0.3-66
[107] SeuratObject_5.1.0 scales_1.4.0
[109] Biobase_2.70.0 png_0.1-8
[111] spatstat.univar_3.1-3 knitr_1.50
[113] reshape2_1.4.4 rjson_0.2.23
[115] nlme_3.1-168 proxy_0.4-27
[117] cachem_1.1.0 zoo_1.8-14
[119] GlobalOptions_0.1.2 stringr_1.6.0
[121] KernSmooth_2.23-26 parallel_4.5.2
[123] miniUI_0.1.2 pillar_1.10.2
[125] grid_4.5.2 vctrs_0.6.5
[127] RANN_2.6.2 promises_1.3.2
[129] BiocSingular_1.26.1 beachmat_2.26.0
[131] xtable_1.8-4 cluster_2.1.8.1
[133] evaluate_1.0.3 cli_3.6.5
[135] compiler_4.5.2 rlang_1.1.6
[137] crayon_1.5.3 future.apply_1.20.0
[139] labeling_0.4.3 plyr_1.8.9
[141] stringi_1.8.7 viridisLite_0.4.2
[143] deldir_2.0-4 BiocParallel_1.44.0
[145] lazyeval_0.2.2 spatstat.geom_3.4-1
[147] RcppHNSW_0.6.0 patchwork_1.3.2
[149] future_1.58.0 shiny_1.11.0
[151] SummarizedExperiment_1.38.1 ROCR_1.0-11
[153] Rfast_2.1.5.1 igraph_2.1.4
[155] memoise_2.0.1 RcppParallel_5.1.10
[157] readxl_1.4.5