Mixing Uniform Clusters To Estimate GDI Sensitivity using Run 40

Published

February 2, 2026

Preamble

library(assertthat)
library(rlang)
library(stringr)
library(scales)
library(ggplot2)
library(zeallot)
library(conflicted)
library(data.table)
library(tibble)
library(tidyr)

# Data processing libs
if (!suppressWarnings(require(COTAN))) {
  devtools::load_all("~/dev/COTAN/COTAN/")
}
library(parallelDist)

conflicts_prefer(zeallot::`%->%`, zeallot::`%<-%`)

options(parallelly.fork.enable = TRUE)

inDir <- file.path("Data/Brown_PBMC_Datasets/")

outDir <- file.path("Results/GDI_Sensitivity/PBMCBrownRun40/")

setLoggingLevel(2)
setLoggingFile(file.path(inDir, "MixingClustersForGDIDelta.log"))

Loading all COTAN Objects

list.files(path = inDir, pattern = "\\.RDS$")

 [1] "capillary_blood_samples_pbmcs-Run_28.RDS"        
 [2] "capillary_blood_samples_pbmcs-Run_40-Cleaned.RDS"
 [3] "capillary_blood_samples_pbmcs-Run_40.RDS"        
 [4] "capillary_blood_samples_pbmcs-Run_41-Cleaned.RDS"
 [5] "capillary_blood_samples_pbmcs-Run_41.RDS"        
 [6] "capillary_blood_samples_pbmcs-Run_62.RDS"        
 [7] "capillary_blood_samples_pbmcs-Run_77-Cleaned.RDS"
 [8] "capillary_blood_samples_pbmcs-Run_77.RDS"        
 [9] "capillary_blood_samples_pbmcs.RDS"               
[10] "CD4Tcells-Run_40-Cleaned.RDS"                    
[11] "Run_40Cl_0CD4TcellsRawData.RDS"                  
[12] "Run_40Cl_1CD4TcellsRawData.RDS"                  
[13] "Run_40Cl_5CD4TcellsRawData.RDS"                  
[14] "Run_40Cl_7CD4TcellsRawData.RDS"                  
[15] "Run_40Cl_8CD4TcellsRawData.RDS"                  
[16] "Run_40Cl_9CD4TcellsRawData.RDS"

globalCondition <- "capillary_blood_samples_pbmcs"

thisRun <- "40"
fileNameIn <- paste0(globalCondition, "-Run_", thisRun, "-Cleaned.RDS")

Load COTAN object

aRunObj <- readRDS(file = file.path(inDir, fileNameIn))

sampleCondition <- getMetadataElement(aRunObj, datasetTags()[["cond"]])[[1L]]

getClusterizations(aRunObj)

 [1] "cell_type_level_1" "cell_type_level_2" "cell_type_level_3"
 [4] "cell_type_level_4" "c1"                "c2"               
 [7] "c3"                "c4"                "Sample_C3"        
[10] "SelUnifCl"

relevantClusters <- getClusters(aRunObj, clName = "SelUnifCl")

###Relevant clusters

relevantClustersList <- toClustersList(relevantClusters)
relevantClustersList <- relevantClustersList[names(relevantClustersList) != "Other"]

Collect `size` and `GDI` for all selected clusters (baseline data)

fileNameRelData <- file.path(outDir, "relevantClChecksRes.csv")

relevantResData <- read.csv(file = fileNameRelData,
                            header = TRUE, row.names = 1L)
relevantResData <- relevantResData[rownames(relevantResData) != "Other", ]

baselineGDI <- relevantResData[, c("clusterSize", "check.quantileAtRatio")]

colnames(baselineGDI) <- c("size", "GDI")

write.csv(baselineGDI, file = file.path(outDir, "BaselineGDI.csv"))

Load baseline data

baselineGDI <- read.csv(file = file.path(outDir, "BaselineGDI.csv"),
                        header = TRUE, row.names = 1L)

Calculate the GDI of the mixtures of clusters

Define all mixing fractions

allMixingFractions <- c(0.05, 0.10, 0.20, 0.40, 0.80)
names(allMixingFractions) <-
  str_pad(scales::label_percent()(allMixingFractions), 3, pad = "0")

This is to be run once per wanted mixing-fraction

# small run
# 
set.seed(137)

for (mixingStr in names(allMixingFractions)) {
  mixingFraction <- allMixingFractions[[mixingStr]]

  resFileOut <-
    file.path(outDir, paste0("GDI_with_", mixingStr, "_Mixing.RDS"))
  
  results <- data.frame()

  for (mainName in rownames(baselineGDI)) {
    mainSize <- baselineGDI[mainName, "size"]
    mainGDI  <- baselineGDI[mainName, "GDI"]
    
    mainCluster <- relevantClustersList[[mainName]]

    for (clName in rownames(baselineGDI)) {
      if (clName == mainName) next

      logThis(paste("Mixing", mainName, "with extra",
                    mixingStr, "cells from", clName), logLevel = 1)

      clSize <- baselineGDI[clName, "size"]
      actuallyMixedCells <- min(ceiling(mixingFraction * mainSize), clSize)
      actualFraction <- actuallyMixedCells / mainSize
      
      sampledCluster <- sample(relevantClustersList[[clName]], actuallyMixedCells)

      cellsToDrop <- !(getCells(aRunObj) %in% c(mainCluster, sampledCluster))
      mixedObj <- dropGenesCells(aRunObj, cells = getCells(aRunObj)[cellsToDrop])

      # Calculate the merged COEX (1 core to avoid multi-process issues)
      mixedObj <- proceedToCoex(mixedObj, calcCoex = TRUE, cores = 1L)

      # Extract the GDI quantile
      mixedGDIData <- calculateGDI(mixedObj)
      rm(mixedObj)

      gdi <- getColumnFromDF(mixedGDIData, colName = "GDI")
      gdi <- sort(gdi, decreasing = TRUE)
      lastPercentile <- quantile(gdi, probs = 0.99)
      rm(gdi)

      GDIIncr <- lastPercentile - mainGDI
      
      results <-
        rbind(
          results,
          data.frame(
            "MainCluster"    = mainName,
            "OtherCluster"   = clName,
            "MixingFraction" = actualFraction,
            "GDI"            = lastPercentile,
            "GDIIncrement"   = GDIIncr
          )
        )

      logThis(
        paste("Mixing", mainName, "with", mixingStr, "of", clName,
              "accomplished with GDI", lastPercentile, "and increment", GDIIncr),
        logLevel = 1)
    }
    
    # save at each mainCuster so not to loose effort in case of errors
    rownames(results) <- NULL
  
    saveRDS(results, resFileOut)
  }

}

Load calculated data for analysis

allResList <-
  lapply(
    names(allMixingFractions),
    function(mixStr) {
      readRDS(file.path(outDir, paste0("GDI_with_", mixStr, "_Mixing.RDS")))
    })
names(allResList) <- names(allMixingFractions)

assert_that(unique(sapply(allResList, ncol)) == 5L)

[1] TRUE

Merge all results and calculate the fitting regression for each cluster pair

allRes <- cbind(allResList[[1L]][, 1:2],
                lapply(allResList, function(df) { df[, 3:5] }))

colnames(allRes) <-
  c(colnames(allResList[[1L]])[1:2],
    paste0(
      rep(colnames(allResList[[1L]])[3:5], 5),
      "_",
      rep(names(allMixingFractions), each = 3)
    )
  )

Recall cluster distance and add it to the results

zeroOneAvg <- readRDS(file.path(outDir,  "allZeroOne.RDS"))
# zeroOneAvg <- readRDS(file.path(outDir, "distanceZeroOne.RDS"))

distZeroOne <-
  as.matrix(parDist(t(zeroOneAvg), method = "hellinger",
                    diag = TRUE, upper = TRUE))^2

distZeroOneLong <-
  rownames_to_column(as.data.frame(distZeroOne), var = "MainCluster")
distZeroOneLong <-
  pivot_longer(distZeroOneLong,
               cols = !MainCluster,
               names_to = "OtherCluster", 
               values_to = "Distance")

distZeroOneLong <-
  as.data.frame(distZeroOneLong[distZeroOneLong[["Distance"]] != 0.0, ])

assert_that(identical(distZeroOneLong[, 1:2], allRes[, 1:2]))

[1] TRUE

perm <- order(distZeroOneLong[["Distance"]])

Scatter plot of ΔGDI vs Distance

Merge all data and plot it using a-priory (squared) distance as discriminant

allResPerm <- do.call(rbind,lapply(allResList, function(df) df[perm, ]))
rownames(allResPerm) <- NULL

allResPerm <-
  cbind(allResPerm,
        "ClusterPair" = rep.int(seq_along(perm), 5),
        "Distance"    = rep(distZeroOneLong[["Distance"]][perm], 5))

head(allResPerm)

  MainCluster OtherCluster MixingFraction      GDI  GDIIncrement ClusterPair
1           0            5     0.05000000 1.443775  0.0222651676           1
2           5            0     0.05122951 1.384208  0.0173951251           2
3           0            1     0.05000000 1.438330  0.0168202884           3
4           1            0     0.05112782 1.379750  0.0286431493           4
5           0            9     0.05000000 1.426839  0.0053299285           5
6           9            0     0.05333333 1.361838 -0.0005519038           6
    Distance
1 0.01489277
2 0.01489277
3 0.02026159
4 0.02026159
5 0.02928972
6 0.02928972

mg <- function(mixing) { round(log2(mixing * 40)) }

IScPlot <- ggplot(allResPerm,
                  aes(x = paste0(mg(MixingFraction)),
                      y = GDIIncrement,
                      group = mg(MixingFraction))) +
  geom_boxplot(varwidth = TRUE) +
  scale_x_discrete(
    breaks = paste0(mg(allMixingFractions)),
    labels = names(allMixingFractions)
  ) +
  labs(x = "Mixing Fraction") +
  theme(plot.title = element_blank())

plot(IScPlot)

Sys.time()

[1] "2026-02-02 16:06:22 CET"

#Sys.info()
sessionInfo()

R version 4.5.2 (2025-10-31)
Platform: x86_64-pc-linux-gnu
Running under: Ubuntu 22.04.5 LTS

Matrix products: default
BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.10.0 
LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0  LAPACK version 3.10.0

locale:
 [1] LC_CTYPE=C.UTF-8       LC_NUMERIC=C           LC_TIME=C.UTF-8       
 [4] LC_COLLATE=C.UTF-8     LC_MONETARY=C.UTF-8    LC_MESSAGES=C.UTF-8   
 [7] LC_PAPER=C.UTF-8       LC_NAME=C              LC_ADDRESS=C          
[10] LC_TELEPHONE=C         LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C   

time zone: Europe/Rome
tzcode source: system (glibc)

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] parallelDist_0.2.6 COTAN_2.11.1       tidyr_1.3.1        tibble_3.3.0      
 [5] data.table_1.18.0  conflicted_1.2.0   zeallot_0.2.0      ggplot2_4.0.1     
 [9] scales_1.4.0       stringr_1.6.0      rlang_1.1.7        assertthat_0.2.1  

loaded via a namespace (and not attached):
  [1] RcppAnnoy_0.0.22            splines_4.5.2              
  [3] later_1.4.2                 polyclip_1.10-7            
  [5] fastDummies_1.7.5           lifecycle_1.0.4            
  [7] doParallel_1.0.17           globals_0.18.0             
  [9] lattice_0.22-7              MASS_7.3-65                
 [11] ggdist_3.3.3                dendextend_1.19.0          
 [13] magrittr_2.0.4              plotly_4.12.0              
 [15] rmarkdown_2.29              yaml_2.3.12                
 [17] httpuv_1.6.16               otel_0.2.0                 
 [19] Seurat_5.4.0                sctransform_0.4.2          
 [21] spam_2.11-1                 sp_2.2-0                   
 [23] spatstat.sparse_3.1-0       reticulate_1.44.1          
 [25] cowplot_1.2.0               pbapply_1.7-2              
 [27] RColorBrewer_1.1-3          abind_1.4-8                
 [29] Rtsne_0.17                  GenomicRanges_1.62.1       
 [31] purrr_1.2.0                 BiocGenerics_0.56.0        
 [33] circlize_0.4.16             GenomeInfoDbData_1.2.14    
 [35] IRanges_2.44.0              S4Vectors_0.48.0           
 [37] ggrepel_0.9.6               irlba_2.3.5.1              
 [39] listenv_0.10.0              spatstat.utils_3.2-1       
 [41] goftest_1.2-3               RSpectra_0.16-2            
 [43] spatstat.random_3.4-3       fitdistrplus_1.2-2         
 [45] parallelly_1.46.0           codetools_0.2-20           
 [47] DelayedArray_0.36.0         tidyselect_1.2.1           
 [49] shape_1.4.6.1               UCSC.utils_1.4.0           
 [51] farver_2.1.2                ScaledMatrix_1.16.0        
 [53] viridis_0.6.5               matrixStats_1.5.0          
 [55] stats4_4.5.2                spatstat.explore_3.7-0     
 [57] Seqinfo_1.0.0               jsonlite_2.0.0             
 [59] GetoptLong_1.1.0            progressr_0.18.0           
 [61] ggridges_0.5.6              survival_3.8-3             
 [63] iterators_1.0.14            foreach_1.5.2              
 [65] tools_4.5.2                 ica_1.0-3                  
 [67] Rcpp_1.1.0                  glue_1.8.0                 
 [69] gridExtra_2.3               SparseArray_1.10.8         
 [71] xfun_0.52                   distributional_0.6.0       
 [73] MatrixGenerics_1.22.0       ggthemes_5.2.0             
 [75] GenomeInfoDb_1.44.0         dplyr_1.1.4                
 [77] withr_3.0.2                 fastmap_1.2.0              
 [79] digest_0.6.37               rsvd_1.0.5                 
 [81] R6_2.6.1                    mime_0.13                  
 [83] colorspace_2.1-1            scattermore_1.2            
 [85] tensor_1.5                  spatstat.data_3.1-9        
 [87] generics_0.1.3              httr_1.4.7                 
 [89] htmlwidgets_1.6.4           S4Arrays_1.10.1            
 [91] uwot_0.2.3                  pkgconfig_2.0.3            
 [93] gtable_0.3.6                ComplexHeatmap_2.26.0      
 [95] lmtest_0.9-40               S7_0.2.1                   
 [97] SingleCellExperiment_1.32.0 XVector_0.50.0             
 [99] htmltools_0.5.8.1           dotCall64_1.2              
[101] zigg_0.0.2                  clue_0.3-66                
[103] SeuratObject_5.3.0          Biobase_2.70.0             
[105] png_0.1-8                   spatstat.univar_3.1-6      
[107] knitr_1.50                  reshape2_1.4.4             
[109] rjson_0.2.23                nlme_3.1-168               
[111] proxy_0.4-29                cachem_1.1.0               
[113] zoo_1.8-14                  GlobalOptions_0.1.2        
[115] KernSmooth_2.23-26          parallel_4.5.2             
[117] miniUI_0.1.2                pillar_1.11.1              
[119] grid_4.5.2                  vctrs_0.7.0                
[121] RANN_2.6.2                  promises_1.5.0             
[123] BiocSingular_1.26.1         beachmat_2.26.0            
[125] xtable_1.8-4                cluster_2.1.8.1            
[127] evaluate_1.0.5              cli_3.6.5                  
[129] compiler_4.5.2              crayon_1.5.3               
[131] future.apply_1.20.0         labeling_0.4.3             
[133] plyr_1.8.9                  stringi_1.8.7              
[135] viridisLite_0.4.2           deldir_2.0-4               
[137] BiocParallel_1.44.0         lazyeval_0.2.2             
[139] spatstat.geom_3.7-0         Matrix_1.7-4               
[141] RcppHNSW_0.6.0              patchwork_1.3.2            
[143] future_1.69.0               shiny_1.12.1               
[145] SummarizedExperiment_1.38.1 ROCR_1.0-11                
[147] Rfast_2.1.5.1               igraph_2.2.1               
[149] memoise_2.0.1               RcppParallel_5.1.10

Preamble

Loading all COTAN Objects

Load COTAN object

Collect size and GDI for all selected clusters (baseline data)

Load baseline data

Calculate the GDI of the mixtures of clusters

Define all mixing fractions

This is to be run once per wanted mixing-fraction

Load calculated data for analysis

Merge all results and calculate the fitting regression for each cluster pair

Recall cluster distance and add it to the results

Scatter plot of ΔGDI vs Distance

Merge all data and plot it using a-priory (squared) distance as discriminant

Collect `size` and `GDI` for all selected clusters (baseline data)