library(rlang)
library(testthat)
library(ggplot2)
library(tibble)
library(zeallot)
library(Seurat)
library(aricode)
library(funtimes)
library(monocle3)
library(cluster)
library(COTAN)
= c('default', 'antibody', 'celltypist')
tunings = c('monocle', 'scanpy', 'seurat', 'scvi-tools', 'COTAN')
tools
<- function(probMatrixDF,
averageClustersDistance
clustersList, numDim) {
<- function(subMatrix) {
squareDist <- scale(subMatrix, scale = FALSE, center = TRUE)
subMatrix return(sum(rowSums(subMatrix^2)))
}
<- PCAtools::pca(mat = probMatrixDF, rank = numDim, transposed = TRUE)
pca
= 100 - sum(pca[["variance"]])
residual_variance
<- scale(x = pca[["rotated"]], center = TRUE, scale = TRUE)
normMatrix rownames(normMatrix) <- rownames(probMatrixDF)
= dim(normMatrix)[2]
final_dimensions
<- 0.0
sumDist for (cl in clustersList) {
<- sumDist + squareDist(normMatrix[cl, , drop = FALSE])
sumDist
}= sumDist / ncol(normMatrix) / (nrow(normMatrix) - length(clustersList))
score return(list("residual_variance" = residual_variance, "final_dimensions" = final_dimensions, "score" = score))
}
Probability score computation
PBMC1
= 'PBMC1'
dataset = paste('Data/', dataset, '/celltypist/Immune_All_Low_probability_matrix.csv', sep='')
celltypistPath = paste('Data/', dataset, '/probabilistic_score.csv', sep='') outPath
Load CellTypist clusterization
<- read.csv(celltypistPath, header = TRUE)
probMatrixDF <- column_to_rownames(probMatrixDF, var = "X")
probMatrixDF rownames(probMatrixDF) <- gsub("[.]", ":", rownames(probMatrixDF))
rownames(probMatrixDF) <- gsub("X10X", "10X", rownames(probMatrixDF))
rownames(probMatrixDF) <- substr(rownames(probMatrixDF), 1, nchar(rownames(probMatrixDF)) - 2)
Estimate number of dimensions
<- PCAtools::pca(mat = probMatrixDF, rank = 50, transposed = TRUE)
prc plot(prc$sdev)
<- 20 nDims
Calculate the projection to spherical distribution [Malahanobis distance]: the sum of square distances from center of sub-cluster in pca-projected space
= data.frame()#"tool" = NA, "dataset" = dataset, "tuning" = NA, "score" = NA, "residual_variance" = NA, "final_dimensions" = NA)
scores for (tool in tools) {
for (tuning in tunings) {
= paste('Data/', dataset, '/', tool, '/', tuning, '/clustering_labels.csv', sep='')
labelsPath <- read.csv(labelsPath, header = TRUE)
labels <- split(as.character(labels$cell), labels$cluster)
clustersList= averageClustersDistance(probMatrixDF, clustersList,numDim = nDims)
score_data = rbind(
scores
scores,data.frame(
"tool" = tool,
"dataset" = dataset,
"tuning" = tuning,
"score" = score_data$score,
"residual_variance" = score_data$residual_variance,
"final_dimensions" = score_data$final_dimensions
)
)
}
}write.csv(scores, outPath, row.names = FALSE)
scores
tool dataset tuning score residual_variance final_dimensions
1 monocle PBMC1 default 0.9064319 0.6614691 20
2 monocle PBMC1 antibody 0.7098190 0.6614691 20
3 monocle PBMC1 celltypist 0.6698957 0.6614691 20
4 scanpy PBMC1 default 0.5717224 0.6614691 20
5 scanpy PBMC1 antibody 0.6405931 0.6614691 20
6 scanpy PBMC1 celltypist 0.5765256 0.6614691 20
7 seurat PBMC1 default 0.6290241 0.6614691 20
8 seurat PBMC1 antibody 0.6275275 0.6614691 20
9 seurat PBMC1 celltypist 0.5283759 0.6614691 20
10 scvi-tools PBMC1 default 0.6027325 0.6614691 20
11 scvi-tools PBMC1 antibody 0.6605860 0.6614691 20
12 scvi-tools PBMC1 celltypist 0.5735559 0.6614691 20
13 COTAN PBMC1 default 0.5945573 0.6614691 20
14 COTAN PBMC1 antibody 0.7028721 0.6614691 20
15 COTAN PBMC1 celltypist 0.5945573 0.6614691 20
set.seed(42)
<- rownames(probMatrixDF)
shuffledCells <- function(n) {
rndDist <- set_names(rep(1:n, ceiling(9000/n))[1:nrow(probMatrixDF)], shuffledCells)
clusters <- split(names(clusters), clusters)
cluster_list <- averageClustersDistance(probMatrixDF, cluster_list,nDims)
d return(d$score)
}<- c(1:15, seq(20, 50, by = 5), seq(60, 100, by = 10),
sizes seq(150, 1000, by = 50), seq(1500, 9000, by = 500))
<- vapply(sizes,rndDist, numeric(1))
sizesToDists names(sizesToDists) <- sizes
pdf(paste('Data/', dataset, '/probability_score_dim_dependency.pdf', sep=''))
plot(names(sizesToDists), sizesToDists)
dev.off()
png
2
plot(names(sizesToDists), sizesToDists)
PBMC2
= 'PBMC2'
dataset = paste('Data/', dataset, '/celltypist/Immune_All_Low_probability_matrix.csv', sep='')
celltypistPath = paste('Data/', dataset, '/probabilistic_score.csv', sep='') outPath
Load CellTypist clusterization
<- read.csv(celltypistPath, header = TRUE)
probMatrixDF <- column_to_rownames(probMatrixDF, var = "X")
probMatrixDF rownames(probMatrixDF) <- gsub("[.]", ":", rownames(probMatrixDF))
rownames(probMatrixDF) <- gsub("X10X", "10X", rownames(probMatrixDF))
rownames(probMatrixDF) <- substr(rownames(probMatrixDF), 1, nchar(rownames(probMatrixDF)) - 2)
Estimate number of dimensions
<- PCAtools::pca(mat = probMatrixDF, rank = 50, transposed = TRUE)
prc plot(prc$sdev)
<- 20 nDims
Calculate the projection to spherical distribution [Malahanobis distance]: the sum of square distances from center of sub-cluster in pca-projected space
= data.frame()#"tool" = NA, "dataset" = dataset, "tuning" = NA, "score" = NA, "residual_variance" = NA, "final_dimensions" = NA)
scores for (tool in tools) {
for (tuning in tunings) {
= paste('Data/', dataset, '/', tool, '/', tuning, '/clustering_labels.csv', sep='')
labelsPath <- read.csv(labelsPath, header = TRUE)
labels <- split(as.character(labels$cell), labels$cluster)
clustersList= averageClustersDistance(probMatrixDF, clustersList,numDim = nDims)
score_data = rbind(
scores
scores,data.frame(
"tool" = tool,
"dataset" = dataset,
"tuning" = tuning,
"score" = score_data$score,
"residual_variance" = score_data$residual_variance,
"final_dimensions" = score_data$final_dimensions
)
)
}
}write.csv(scores, outPath, row.names = FALSE)
scores
tool dataset tuning score residual_variance final_dimensions
1 monocle PBMC2 default 0.9547651 0.632864 20
2 monocle PBMC2 antibody 0.7238461 0.632864 20
3 monocle PBMC2 celltypist 0.6804478 0.632864 20
4 scanpy PBMC2 default 0.5757901 0.632864 20
5 scanpy PBMC2 antibody 0.4720212 0.632864 20
6 scanpy PBMC2 celltypist 0.5721244 0.632864 20
7 seurat PBMC2 default 0.5494186 0.632864 20
8 seurat PBMC2 antibody 0.3949425 0.632864 20
9 seurat PBMC2 celltypist 0.4951102 0.632864 20
10 scvi-tools PBMC2 default 0.5851914 0.632864 20
11 scvi-tools PBMC2 antibody 0.4727416 0.632864 20
12 scvi-tools PBMC2 celltypist 0.5576107 0.632864 20
13 COTAN PBMC2 default 0.5599573 0.632864 20
14 COTAN PBMC2 antibody 0.6612736 0.632864 20
15 COTAN PBMC2 celltypist 0.5599573 0.632864 20
set.seed(42)
<- rownames(probMatrixDF)
shuffledCells <- function(n) {
rndDist <- set_names(rep(1:n, ceiling(9000/n))[1:nrow(probMatrixDF)], shuffledCells)
clusters <- split(names(clusters), clusters)
cluster_list <- averageClustersDistance(probMatrixDF, cluster_list,nDims)
d return(d$score)
}<- c(1:15, seq(20, 50, by = 5), seq(60, 100, by = 10),
sizes seq(150, 1000, by = 50), seq(1500, 9000, by = 500))
<- vapply(sizes, rndDist, numeric(1))
sizesToDists names(sizesToDists) <- sizes
pdf(paste('Data/', dataset, '/probability_score_dim_dependency.pdf', sep=''))
plot(names(sizesToDists), sizesToDists)
dev.off()
png
2
plot(names(sizesToDists), sizesToDists)
PBMC3
= 'PBMC3'
dataset = paste('Data/', dataset, '/celltypist/Immune_All_Low_probability_matrix.csv', sep='')
celltypistPath = paste('Data/', dataset, '/probabilistic_score.csv', sep='') outPath
Load CellTypist clusterization
<- read.csv(celltypistPath, header = TRUE)
probMatrixDF <- column_to_rownames(probMatrixDF, var = "X")
probMatrixDF rownames(probMatrixDF) <- gsub("[.]", ":", rownames(probMatrixDF))
rownames(probMatrixDF) <- gsub("X10X", "10X", rownames(probMatrixDF))
rownames(probMatrixDF) <- substr(rownames(probMatrixDF), 1, nchar(rownames(probMatrixDF)) - 2)
Estimate number of dimensions
<- PCAtools::pca(mat = probMatrixDF, rank = 50, transposed = TRUE)
prc plot(prc$sdev)
<- 20 nDims
Calculate the projection to spherical distribution [Malahanobis distance]: the sum of square distances from center of sub-cluster in pca-projected space
= data.frame()#"tool" = NA, "dataset" = dataset, "tuning" = NA, "score" = NA, "residual_variance" = NA, "final_dimensions" = NA)
scores for (tool in tools) {
for (tuning in tunings) {
= paste('Data/', dataset, '/', tool, '/', tuning, '/clustering_labels.csv', sep='')
labelsPath <- read.csv(labelsPath, header = TRUE)
labels <- split(as.character(labels$cell), labels$cluster)
clustersList= averageClustersDistance(probMatrixDF, clustersList,numDim = nDims)
score_data = rbind(
scores
scores,data.frame(
"tool" = tool,
"dataset" = dataset,
"tuning" = tuning,
"score" = score_data$score,
"residual_variance" = score_data$residual_variance,
"final_dimensions" = score_data$final_dimensions
)
)
}
}write.csv(scores, outPath, row.names = FALSE)
scores
tool dataset tuning score residual_variance final_dimensions
1 monocle PBMC3 default 0.9116157 0.666637 20
2 monocle PBMC3 antibody 0.7342920 0.666637 20
3 monocle PBMC3 celltypist 0.7235588 0.666637 20
4 scanpy PBMC3 default 0.5771388 0.666637 20
5 scanpy PBMC3 antibody 0.4865492 0.666637 20
6 scanpy PBMC3 celltypist 0.5750278 0.666637 20
7 seurat PBMC3 default 0.5237283 0.666637 20
8 seurat PBMC3 antibody 0.4024118 0.666637 20
9 seurat PBMC3 celltypist 0.5238608 0.666637 20
10 scvi-tools PBMC3 default 0.5606163 0.666637 20
11 scvi-tools PBMC3 antibody 0.4673312 0.666637 20
12 scvi-tools PBMC3 celltypist 0.5082920 0.666637 20
13 COTAN PBMC3 default 0.5034928 0.666637 20
14 COTAN PBMC3 antibody 0.6774677 0.666637 20
15 COTAN PBMC3 celltypist 0.6318363 0.666637 20
set.seed(42)
<- rownames(probMatrixDF)
shuffledCells <- function(n) {
rndDist <- set_names(rep(1:n, ceiling(9000/n))[1:nrow(probMatrixDF)], shuffledCells)
clusters <- split(names(clusters), clusters)
cluster_list <- averageClustersDistance(probMatrixDF, cluster_list,nDims)
d return(d$score)
}<- c(1:15, seq(20, 50, by = 5), seq(60, 100, by = 10),
sizes seq(150, 1000, by = 50), seq(1500, 9000, by = 500))
<- vapply(sizes, rndDist, numeric(1))
sizesToDists names(sizesToDists) <- sizes
pdf(paste('Data/', dataset, '/probability_score_dim_dependency.pdf', sep=''))
plot(names(sizesToDists), sizesToDists)
dev.off()
png
2
plot(names(sizesToDists), sizesToDists)
PBMC4
= 'PBMC4'
dataset = paste('Data/', dataset, '/celltypist/Immune_All_Low_probability_matrix.csv', sep='')
celltypistPath = paste('Data/', dataset, '/probabilistic_score.csv', sep='') outPath
Load CellTypist clusterization
<- read.csv(celltypistPath, header = TRUE)
probMatrixDF <- column_to_rownames(probMatrixDF, var = "X")
probMatrixDF rownames(probMatrixDF) <- gsub("[.]", ":", rownames(probMatrixDF))
rownames(probMatrixDF) <- gsub("X10X", "10X", rownames(probMatrixDF))
rownames(probMatrixDF) <- substr(rownames(probMatrixDF), 1, nchar(rownames(probMatrixDF)) - 2)
Estimate number of dimensions
<- PCAtools::pca(mat = probMatrixDF, rank = 50, transposed = TRUE)
prc plot(prc$sdev)
<- 20 nDims
Calculate the projection to spherical distribution [Malahanobis distance]: the sum of square distances from center of sub-cluster in pca-projected space
= data.frame()#"tool" = NA, "dataset" = dataset, "tuning" = NA, "score" = NA, "residual_variance" = NA, "final_dimensions" = NA)
scores for (tool in tools) {
for (tuning in tunings) {
= paste('Data/', dataset, '/', tool, '/', tuning, '/clustering_labels.csv', sep='')
labelsPath <- read.csv(labelsPath, header = TRUE)
labels <- split(as.character(labels$cell), labels$cluster)
clustersList= averageClustersDistance(probMatrixDF, clustersList,numDim = nDims)
score_data = rbind(
scores
scores,data.frame(
"tool" = tool,
"dataset" = dataset,
"tuning" = tuning,
"score" = score_data$score,
"residual_variance" = score_data$residual_variance,
"final_dimensions" = score_data$final_dimensions
)
)
}
}write.csv(scores, outPath, row.names = FALSE)
scores
tool dataset tuning score residual_variance final_dimensions
1 monocle PBMC4 default 0.9079955 0.7881445 20
2 monocle PBMC4 antibody 0.6896226 0.7881445 20
3 monocle PBMC4 celltypist 0.6513185 0.7881445 20
4 scanpy PBMC4 default 0.5667508 0.7881445 20
5 scanpy PBMC4 antibody 0.5639221 0.7881445 20
6 scanpy PBMC4 celltypist 0.5916285 0.7881445 20
7 seurat PBMC4 default 0.5272270 0.7881445 20
8 seurat PBMC4 antibody 0.4877846 0.7881445 20
9 seurat PBMC4 celltypist 0.5265706 0.7881445 20
10 scvi-tools PBMC4 default 0.6212800 0.7881445 20
11 scvi-tools PBMC4 antibody 0.5174726 0.7881445 20
12 scvi-tools PBMC4 celltypist 0.5219027 0.7881445 20
13 COTAN PBMC4 default 0.5755159 0.7881445 20
14 COTAN PBMC4 antibody 0.6613133 0.7881445 20
15 COTAN PBMC4 celltypist 0.5825237 0.7881445 20
set.seed(42)
<- rownames(probMatrixDF)
shuffledCells <- function(n) {
rndDist <- set_names(rep(1:n, ceiling(9000/n))[1:nrow(probMatrixDF)], shuffledCells)
clusters <- split(names(clusters), clusters)
cluster_list <- averageClustersDistance(probMatrixDF, cluster_list,nDims)
d return(d$score)
}<- c(1:15, seq(20, 50, by = 5), seq(60, 100, by = 10),
sizes seq(150, 1000, by = 50), seq(1500, 9000, by = 500))
<- vapply(sizes, rndDist, numeric(1))
sizesToDists names(sizesToDists) <- sizes
pdf(paste('Data/', dataset, '/probability_score_dim_dependency.pdf', sep=''))
plot(names(sizesToDists), sizesToDists)
dev.off()
png
2
plot(names(sizesToDists), sizesToDists)
Sys.time()
[1] "2024-05-10 17:40:27 CEST"
sessionInfo()
R version 4.3.2 (2023-10-31)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 20.04.6 LTS
Matrix products: default
BLAS: /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/liblapack.so.3; LAPACK version 3.9.0
locale:
[1] LC_CTYPE=C.UTF-8 LC_NUMERIC=C LC_TIME=C.UTF-8
[4] LC_COLLATE=C.UTF-8 LC_MONETARY=C.UTF-8 LC_MESSAGES=C.UTF-8
[7] LC_PAPER=C.UTF-8 LC_NAME=C LC_ADDRESS=C
[10] LC_TELEPHONE=C LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C
time zone: Europe/Rome
tzcode source: system (glibc)
attached base packages:
[1] stats4 stats graphics grDevices utils datasets methods
[8] base
other attached packages:
[1] COTAN_2.5.0 cluster_2.1.6
[3] monocle3_1.3.4 SingleCellExperiment_1.22.0
[5] SummarizedExperiment_1.30.2 GenomicRanges_1.52.0
[7] GenomeInfoDb_1.36.1 IRanges_2.34.1
[9] S4Vectors_0.38.1 MatrixGenerics_1.12.3
[11] matrixStats_1.2.0 Biobase_2.60.0
[13] BiocGenerics_0.46.0 funtimes_9.1
[15] aricode_1.0.3 Seurat_5.0.0
[17] SeuratObject_5.0.0 sp_2.1-1
[19] zeallot_0.1.0 tibble_3.2.1
[21] ggplot2_3.5.0 testthat_3.2.0
[23] rlang_1.1.1
loaded via a namespace (and not attached):
[1] spatstat.sparse_3.0-2 bitops_1.0-7
[3] httr_1.4.6 RColorBrewer_1.1-3
[5] doParallel_1.0.17 tools_4.3.2
[7] sctransform_0.4.1 backports_1.4.1
[9] utf8_1.2.3 R6_2.5.1
[11] lazyeval_0.2.2 uwot_0.1.16
[13] GetoptLong_1.0.5 withr_3.0.0
[15] gridExtra_2.3 parallelDist_0.2.6
[17] fdrtool_1.2.17 progressr_0.14.0
[19] qgraph_1.9.8 cli_3.6.1
[21] spatstat.explore_3.2-1 fastDummies_1.7.3
[23] mvtnorm_1.2-2 spatstat.data_3.0-1
[25] askpass_1.2.0 ggridges_0.5.4
[27] pbapply_1.7-2 pbivnorm_0.6.0
[29] foreign_0.8-86 dbscan_1.1-11
[31] parallelly_1.37.1 rstudioapi_0.15.0
[33] generics_0.1.3 shape_1.4.6
[35] gtools_3.9.4 ica_1.0-3
[37] spatstat.random_3.2-1 dendextend_1.17.1
[39] graphicalVAR_0.3.3 dplyr_1.1.2
[41] Matrix_1.6-3 fansi_1.0.4
[43] abind_1.4-5 PCAtools_2.14.0
[45] terra_1.7-39 lifecycle_1.0.3
[47] yaml_2.3.7 clusterGeneration_1.3.8
[49] Rtsne_0.17 grid_4.3.2
[51] lavaan_0.6-16 mlVAR_0.5.1
[53] dqrng_0.3.0 promises_1.2.0.1
[55] crayon_1.5.2 miniUI_0.1.1.1
[57] lattice_0.22-5 beachmat_2.16.0
[59] cowplot_1.1.1 pillar_1.9.0
[61] knitr_1.43 ComplexHeatmap_2.16.0
[63] rjson_0.2.21 boot_1.3-28
[65] corpcor_1.6.10 future.apply_1.11.0
[67] codetools_0.2-19 leiden_0.4.3
[69] glue_1.7.0 data.table_1.15.0
[71] vctrs_0.6.3 png_0.1-8
[73] spam_2.10-0 Rdpack_2.6
[75] gtable_0.3.3 assertthat_0.2.1
[77] gsubfn_0.7 xfun_0.39
[79] rbibutils_2.2.15 S4Arrays_1.2.0
[81] mime_0.12 Rfast_2.1.0
[83] coda_0.19-4 survival_3.5-8
[85] Kendall_2.2.1 iterators_1.0.14
[87] MplusAutomation_1.1.0 ellipsis_0.3.2
[89] fitdistrplus_1.1-11 ROCR_1.0-11
[91] nlme_3.1-163 RcppAnnoy_0.0.21
[93] irlba_2.3.5.1 KernSmooth_2.23-22
[95] rpart_4.1.23 colorspace_2.1-0
[97] Hmisc_5.1-0 nnet_7.3-19
[99] mnormt_2.1.1 tidyselect_1.2.0
[101] compiler_4.3.2 glmnet_4.1-8
[103] htmlTable_2.4.1 DelayedArray_0.26.7
[105] plotly_4.10.2 checkmate_2.3.0
[107] scales_1.3.0 psych_2.4.3
[109] lmtest_0.9-40 quadprog_1.5-8
[111] stringr_1.5.0 digest_0.6.33
[113] goftest_1.2-3 spatstat.utils_3.0-3
[115] minqa_1.2.5 rmarkdown_2.24
[117] XVector_0.40.0 htmltools_0.5.8
[119] pkgconfig_2.0.3 jpeg_0.1-10
[121] base64enc_0.1-3 umap_0.2.10.0
[123] lme4_1.1-34 sparseMatrixStats_1.12.2
[125] fastmap_1.1.1 ggthemes_5.1.0
[127] GlobalOptions_0.1.2 htmlwidgets_1.6.2
[129] DelayedMatrixStats_1.22.5 shiny_1.8.0
[131] zoo_1.8-12 jsonlite_1.8.7
[133] BiocParallel_1.34.2 BiocSingular_1.16.0
[135] RCurl_1.98-1.12 magrittr_2.0.3
[137] Formula_1.2-5 GenomeInfoDbData_1.2.10
[139] dotCall64_1.1-0 patchwork_1.2.0
[141] texreg_1.39.3 munsell_0.5.0
[143] Rcpp_1.0.11 viridis_0.6.4
[145] proto_1.0.0 reticulate_1.36.1
[147] RcppZiggurat_0.1.6 stringi_1.8.1
[149] brio_1.1.3 zlibbioc_1.46.0
[151] MASS_7.3-60 plyr_1.8.8
[153] parallel_4.3.2 listenv_0.9.0
[155] ggrepel_0.9.5 deldir_2.0-2
[157] splines_4.3.2 pander_0.6.5
[159] tensor_1.5 circlize_0.4.15
[161] igraph_2.0.3 spatstat.geom_3.2-4
[163] RcppHNSW_0.6.0 reshape2_1.4.4
[165] ScaledMatrix_1.8.1 evaluate_0.21
[167] RcppParallel_5.1.7 nloptr_2.0.3
[169] foreach_1.5.2 httpuv_1.6.11
[171] openssl_2.1.0 RANN_2.6.1
[173] tidyr_1.3.0 purrr_1.0.1
[175] polyclip_1.10-4 clue_0.3-64
[177] future_1.33.0 scattermore_1.2
[179] rsvd_1.0.5 xtable_1.8-4
[181] RSpectra_0.16-1 later_1.3.1
[183] glasso_1.11 viridisLite_0.4.2
[185] arm_1.13-1 globals_0.16.2