Mixing Uniform Clusters To Estimate GDI Sensitivity using Run 40

Published

February 2, 2026

Preamble

library(assertthat)
library(rlang)
library(stringr)
library(scales)
library(ggplot2)
library(zeallot)
library(conflicted)
library(data.table)
library(tibble)
library(tidyr)

# Data processing libs
if (!suppressWarnings(require(COTAN))) {
  devtools::load_all("~/dev/COTAN/COTAN/")
}
library(parallelDist)

conflicts_prefer(zeallot::`%->%`, zeallot::`%<-%`)

options(parallelly.fork.enable = TRUE)

inDir <- file.path("Data/Brown_PBMC_Datasets/")

outDir <- file.path("Results/GDI_Sensitivity/PBMCBrownRun40/")

setLoggingLevel(2)
setLoggingFile(file.path(inDir, "MixingClustersForGDIDelta.log"))

Loading all COTAN Objects

list.files(path = inDir, pattern = "\\.RDS$")
 [1] "capillary_blood_samples_pbmcs-Run_28.RDS"        
 [2] "capillary_blood_samples_pbmcs-Run_40-Cleaned.RDS"
 [3] "capillary_blood_samples_pbmcs-Run_40.RDS"        
 [4] "capillary_blood_samples_pbmcs-Run_41-Cleaned.RDS"
 [5] "capillary_blood_samples_pbmcs-Run_41.RDS"        
 [6] "capillary_blood_samples_pbmcs-Run_62.RDS"        
 [7] "capillary_blood_samples_pbmcs-Run_77-Cleaned.RDS"
 [8] "capillary_blood_samples_pbmcs-Run_77.RDS"        
 [9] "capillary_blood_samples_pbmcs.RDS"               
[10] "CD4Tcells-Run_40-Cleaned.RDS"                    
[11] "Run_40Cl_0CD4TcellsRawData.RDS"                  
[12] "Run_40Cl_1CD4TcellsRawData.RDS"                  
[13] "Run_40Cl_5CD4TcellsRawData.RDS"                  
[14] "Run_40Cl_7CD4TcellsRawData.RDS"                  
[15] "Run_40Cl_8CD4TcellsRawData.RDS"                  
[16] "Run_40Cl_9CD4TcellsRawData.RDS"                  
globalCondition <- "capillary_blood_samples_pbmcs"

thisRun <- "40"
fileNameIn <- paste0(globalCondition, "-Run_", thisRun, "-Cleaned.RDS")

Load COTAN object

aRunObj <- readRDS(file = file.path(inDir, fileNameIn))

sampleCondition <- getMetadataElement(aRunObj, datasetTags()[["cond"]])[[1L]]

getClusterizations(aRunObj)
 [1] "cell_type_level_1" "cell_type_level_2" "cell_type_level_3"
 [4] "cell_type_level_4" "c1"                "c2"               
 [7] "c3"                "c4"                "Sample_C3"        
[10] "SelUnifCl"        
relevantClusters <- getClusters(aRunObj, clName = "SelUnifCl")

###Relevant clusters

relevantClustersList <- toClustersList(relevantClusters)
relevantClustersList <- relevantClustersList[names(relevantClustersList) != "Other"]

Collect size and GDI for all selected clusters (baseline data)

fileNameRelData <- file.path(outDir, "relevantClChecksRes.csv")

relevantResData <- read.csv(file = fileNameRelData,
                            header = TRUE, row.names = 1L)
relevantResData <- relevantResData[rownames(relevantResData) != "Other", ]

baselineGDI <- relevantResData[, c("clusterSize", "check.quantileAtRatio")]

colnames(baselineGDI) <- c("size", "GDI")

write.csv(baselineGDI, file = file.path(outDir, "BaselineGDI.csv"))

Load baseline data

baselineGDI <- read.csv(file = file.path(outDir, "BaselineGDI.csv"),
                        header = TRUE, row.names = 1L)

Calculate the GDI of the mixtures of clusters

Define all mixing fractions

allMixingFractions <- c(0.05, 0.10, 0.20, 0.40, 0.80)
names(allMixingFractions) <-
  str_pad(scales::label_percent()(allMixingFractions), 3, pad = "0")

This is to be run once per wanted mixing-fraction

# small run
# 
set.seed(137)

for (mixingStr in names(allMixingFractions)) {
  mixingFraction <- allMixingFractions[[mixingStr]]

  resFileOut <-
    file.path(outDir, paste0("GDI_with_", mixingStr, "_Mixing.RDS"))
  
  results <- data.frame()

  for (mainName in rownames(baselineGDI)) {
    mainSize <- baselineGDI[mainName, "size"]
    mainGDI  <- baselineGDI[mainName, "GDI"]
    
    mainCluster <- relevantClustersList[[mainName]]

    for (clName in rownames(baselineGDI)) {
      if (clName == mainName) next

      logThis(paste("Mixing", mainName, "with extra",
                    mixingStr, "cells from", clName), logLevel = 1)

      clSize <- baselineGDI[clName, "size"]
      actuallyMixedCells <- min(ceiling(mixingFraction * mainSize), clSize)
      actualFraction <- actuallyMixedCells / mainSize
      
      sampledCluster <- sample(relevantClustersList[[clName]], actuallyMixedCells)

      cellsToDrop <- !(getCells(aRunObj) %in% c(mainCluster, sampledCluster))
      mixedObj <- dropGenesCells(aRunObj, cells = getCells(aRunObj)[cellsToDrop])

      # Calculate the merged COEX (1 core to avoid multi-process issues)
      mixedObj <- proceedToCoex(mixedObj, calcCoex = TRUE, cores = 1L)

      # Extract the GDI quantile
      mixedGDIData <- calculateGDI(mixedObj)
      rm(mixedObj)

      gdi <- getColumnFromDF(mixedGDIData, colName = "GDI")
      gdi <- sort(gdi, decreasing = TRUE)
      lastPercentile <- quantile(gdi, probs = 0.99)
      rm(gdi)

      GDIIncr <- lastPercentile - mainGDI
      
      results <-
        rbind(
          results,
          data.frame(
            "MainCluster"    = mainName,
            "OtherCluster"   = clName,
            "MixingFraction" = actualFraction,
            "GDI"            = lastPercentile,
            "GDIIncrement"   = GDIIncr
          )
        )

      logThis(
        paste("Mixing", mainName, "with", mixingStr, "of", clName,
              "accomplished with GDI", lastPercentile, "and increment", GDIIncr),
        logLevel = 1)
    }
    
    # save at each mainCuster so not to loose effort in case of errors
    rownames(results) <- NULL
  
    saveRDS(results, resFileOut)
  }

}

Load calculated data for analysis

allResList <-
  lapply(
    names(allMixingFractions),
    function(mixStr) {
      readRDS(file.path(outDir, paste0("GDI_with_", mixStr, "_Mixing.RDS")))
    })
names(allResList) <- names(allMixingFractions)

assert_that(unique(sapply(allResList, ncol)) == 5L)
[1] TRUE

Merge all results and calculate the fitting regression for each cluster pair

allRes <- cbind(allResList[[1L]][, 1:2],
                lapply(allResList, function(df) { df[, 3:5] }))

colnames(allRes) <-
  c(colnames(allResList[[1L]])[1:2],
    paste0(
      rep(colnames(allResList[[1L]])[3:5], 5),
      "_",
      rep(names(allMixingFractions), each = 3)
    )
  )

Recall cluster distance and add it to the results

zeroOneAvg <- readRDS(file.path(outDir,  "allZeroOne.RDS"))
# zeroOneAvg <- readRDS(file.path(outDir, "distanceZeroOne.RDS"))

distZeroOne <-
  as.matrix(parDist(t(zeroOneAvg), method = "hellinger",
                    diag = TRUE, upper = TRUE))^2

distZeroOneLong <-
  rownames_to_column(as.data.frame(distZeroOne), var = "MainCluster")
distZeroOneLong <-
  pivot_longer(distZeroOneLong,
               cols = !MainCluster,
               names_to = "OtherCluster", 
               values_to = "Distance")

distZeroOneLong <-
  as.data.frame(distZeroOneLong[distZeroOneLong[["Distance"]] != 0.0, ])

assert_that(identical(distZeroOneLong[, 1:2], allRes[, 1:2]))
[1] TRUE
perm <- order(distZeroOneLong[["Distance"]])

Scatter plot of ΔGDI vs Distance

Merge all data and plot it using a-priory (squared) distance as discriminant

allResPerm <- do.call(rbind,lapply(allResList, function(df) df[perm, ]))
rownames(allResPerm) <- NULL

allResPerm <-
  cbind(allResPerm,
        "ClusterPair" = rep.int(seq_along(perm), 5),
        "Distance"    = rep(distZeroOneLong[["Distance"]][perm], 5))

head(allResPerm)
  MainCluster OtherCluster MixingFraction      GDI  GDIIncrement ClusterPair
1           0            5     0.05000000 1.443775  0.0222651676           1
2           5            0     0.05122951 1.384208  0.0173951251           2
3           0            1     0.05000000 1.438330  0.0168202884           3
4           1            0     0.05112782 1.379750  0.0286431493           4
5           0            9     0.05000000 1.426839  0.0053299285           5
6           9            0     0.05333333 1.361838 -0.0005519038           6
    Distance
1 0.01489277
2 0.01489277
3 0.02026159
4 0.02026159
5 0.02928972
6 0.02928972
mg <- function(mixing) { round(log2(mixing * 40)) }

IScPlot <- ggplot(allResPerm,
                  aes(x = paste0(mg(MixingFraction)),
                      y = GDIIncrement,
                      group = mg(MixingFraction))) +
  geom_boxplot(varwidth = TRUE) +
  scale_x_discrete(
    breaks = paste0(mg(allMixingFractions)),
    labels = names(allMixingFractions)
  ) +
  labs(x = "Mixing Fraction") +
  theme(plot.title = element_blank())

plot(IScPlot)

Sys.time()
[1] "2026-02-02 16:06:22 CET"
#Sys.info()
sessionInfo()
R version 4.5.2 (2025-10-31)
Platform: x86_64-pc-linux-gnu
Running under: Ubuntu 22.04.5 LTS

Matrix products: default
BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.10.0 
LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0  LAPACK version 3.10.0

locale:
 [1] LC_CTYPE=C.UTF-8       LC_NUMERIC=C           LC_TIME=C.UTF-8       
 [4] LC_COLLATE=C.UTF-8     LC_MONETARY=C.UTF-8    LC_MESSAGES=C.UTF-8   
 [7] LC_PAPER=C.UTF-8       LC_NAME=C              LC_ADDRESS=C          
[10] LC_TELEPHONE=C         LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C   

time zone: Europe/Rome
tzcode source: system (glibc)

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] parallelDist_0.2.6 COTAN_2.11.1       tidyr_1.3.1        tibble_3.3.0      
 [5] data.table_1.18.0  conflicted_1.2.0   zeallot_0.2.0      ggplot2_4.0.1     
 [9] scales_1.4.0       stringr_1.6.0      rlang_1.1.7        assertthat_0.2.1  

loaded via a namespace (and not attached):
  [1] RcppAnnoy_0.0.22            splines_4.5.2              
  [3] later_1.4.2                 polyclip_1.10-7            
  [5] fastDummies_1.7.5           lifecycle_1.0.4            
  [7] doParallel_1.0.17           globals_0.18.0             
  [9] lattice_0.22-7              MASS_7.3-65                
 [11] ggdist_3.3.3                dendextend_1.19.0          
 [13] magrittr_2.0.4              plotly_4.12.0              
 [15] rmarkdown_2.29              yaml_2.3.12                
 [17] httpuv_1.6.16               otel_0.2.0                 
 [19] Seurat_5.4.0                sctransform_0.4.2          
 [21] spam_2.11-1                 sp_2.2-0                   
 [23] spatstat.sparse_3.1-0       reticulate_1.44.1          
 [25] cowplot_1.2.0               pbapply_1.7-2              
 [27] RColorBrewer_1.1-3          abind_1.4-8                
 [29] Rtsne_0.17                  GenomicRanges_1.62.1       
 [31] purrr_1.2.0                 BiocGenerics_0.56.0        
 [33] circlize_0.4.16             GenomeInfoDbData_1.2.14    
 [35] IRanges_2.44.0              S4Vectors_0.48.0           
 [37] ggrepel_0.9.6               irlba_2.3.5.1              
 [39] listenv_0.10.0              spatstat.utils_3.2-1       
 [41] goftest_1.2-3               RSpectra_0.16-2            
 [43] spatstat.random_3.4-3       fitdistrplus_1.2-2         
 [45] parallelly_1.46.0           codetools_0.2-20           
 [47] DelayedArray_0.36.0         tidyselect_1.2.1           
 [49] shape_1.4.6.1               UCSC.utils_1.4.0           
 [51] farver_2.1.2                ScaledMatrix_1.16.0        
 [53] viridis_0.6.5               matrixStats_1.5.0          
 [55] stats4_4.5.2                spatstat.explore_3.7-0     
 [57] Seqinfo_1.0.0               jsonlite_2.0.0             
 [59] GetoptLong_1.1.0            progressr_0.18.0           
 [61] ggridges_0.5.6              survival_3.8-3             
 [63] iterators_1.0.14            foreach_1.5.2              
 [65] tools_4.5.2                 ica_1.0-3                  
 [67] Rcpp_1.1.0                  glue_1.8.0                 
 [69] gridExtra_2.3               SparseArray_1.10.8         
 [71] xfun_0.52                   distributional_0.6.0       
 [73] MatrixGenerics_1.22.0       ggthemes_5.2.0             
 [75] GenomeInfoDb_1.44.0         dplyr_1.1.4                
 [77] withr_3.0.2                 fastmap_1.2.0              
 [79] digest_0.6.37               rsvd_1.0.5                 
 [81] R6_2.6.1                    mime_0.13                  
 [83] colorspace_2.1-1            scattermore_1.2            
 [85] tensor_1.5                  spatstat.data_3.1-9        
 [87] generics_0.1.3              httr_1.4.7                 
 [89] htmlwidgets_1.6.4           S4Arrays_1.10.1            
 [91] uwot_0.2.3                  pkgconfig_2.0.3            
 [93] gtable_0.3.6                ComplexHeatmap_2.26.0      
 [95] lmtest_0.9-40               S7_0.2.1                   
 [97] SingleCellExperiment_1.32.0 XVector_0.50.0             
 [99] htmltools_0.5.8.1           dotCall64_1.2              
[101] zigg_0.0.2                  clue_0.3-66                
[103] SeuratObject_5.3.0          Biobase_2.70.0             
[105] png_0.1-8                   spatstat.univar_3.1-6      
[107] knitr_1.50                  reshape2_1.4.4             
[109] rjson_0.2.23                nlme_3.1-168               
[111] proxy_0.4-29                cachem_1.1.0               
[113] zoo_1.8-14                  GlobalOptions_0.1.2        
[115] KernSmooth_2.23-26          parallel_4.5.2             
[117] miniUI_0.1.2                pillar_1.11.1              
[119] grid_4.5.2                  vctrs_0.7.0                
[121] RANN_2.6.2                  promises_1.5.0             
[123] BiocSingular_1.26.1         beachmat_2.26.0            
[125] xtable_1.8-4                cluster_2.1.8.1            
[127] evaluate_1.0.5              cli_3.6.5                  
[129] compiler_4.5.2              crayon_1.5.3               
[131] future.apply_1.20.0         labeling_0.4.3             
[133] plyr_1.8.9                  stringi_1.8.7              
[135] viridisLite_0.4.2           deldir_2.0-4               
[137] BiocParallel_1.44.0         lazyeval_0.2.2             
[139] spatstat.geom_3.7-0         Matrix_1.7-4               
[141] RcppHNSW_0.6.0              patchwork_1.3.2            
[143] future_1.69.0               shiny_1.12.1               
[145] SummarizedExperiment_1.38.1 ROCR_1.0-11                
[147] Rfast_2.1.5.1               igraph_2.2.1               
[149] memoise_2.0.1               RcppParallel_5.1.10