GDI Increment From Mixing

Author

Marco Fantozzi

Published

February 2, 2026

Preamble

library(assertthat)
library(rlang)
library(scales)
library(ggplot2)
library(zeallot)
library(data.table)
library(parallelDist)
library(tidyr)
library(tidyverse)
library(ggh4x)
library(gridExtra)

#options(parallelly.fork.enable = TRUE)

inDir <- file.path("Results")

#setLoggingLevel(2)
#setLoggingFile(file.path(inDir, "MixingClustersGDI_ForebrainDorsal.log"))

outDir <- file.path(inDir, "GDI_Sensitivity")
if (!file.exists(outDir)) {
  dir.create(outDir)
}

Re-Load calculated data for analysis

Define all mixing fractions

allMixingFractions <- c(0.05, 0.10, 0.20, 0.40, 0.80)
names(allMixingFractions) <-
  str_pad(scales::label_percent()(allMixingFractions), 3, pad = "0")

Actually load the data

allResList <-
  lapply(
    names(allMixingFractions),
    function(mixStr) {
      readRDS(file.path(outDir, paste0("GDI_with_", mixStr, "_Mixing.RDS")))
    })
names(allResList) <- names(allMixingFractions)

assert_that(unique(sapply(allResList, ncol)) == 5L)
[1] TRUE

Add baseline (“00%” mixing)

baseline <- allResList[[1L]]

baseline[["MixingFraction"]] <- 0.0
baseline[["GDI"]] <- baseline[["GDI"]] - baseline[["GDIIncrement"]]
baseline[["GDIIncrement"]] <- 0.0

allResListWithBase <- append(list("00%" = baseline), allResList)

rm(baseline)

Merge all results and calculate the fitting regression for each cluster pair

allRes <-
  cbind(allResList[[1L]][, 1:2],
        lapply(allResList, function(df) { df[, 3:5] }))

colnames(allRes) <-
  c(colnames(allResList[[1L]])[1:2],
    paste0(
      rep(colnames(allResList[[1L]])[3:5], 5),
      "_",
      rep(names(allMixingFractions), each = 3)
    )
  )

allResWithBase <-
  cbind(allResListWithBase[[1L]][, 1:2],
        lapply(allResListWithBase, function(df) { df[, 3:5] }))

colnames(allResWithBase) <-
  c(colnames(allResListWithBase[[1L]])[1:2],
    paste0(
      rep(colnames(allResListWithBase[[1L]])[3:5], 5),
      "_",
      rep(c("00%", names(allMixingFractions)), each = 3)
    )
  )

assert_that(
  identical(
    allRes, allResWithBase[, str_detect(colnames(allResWithBase),
                                        "00%", negate = TRUE)]
  )
)
[1] TRUE

Recall cluster distance and add it to the results

zeroOneAvg <- readRDS(file.path(outDir,  "allZeroOne.RDS"))
# zeroOneAvg <- readRDS(file.path(outDir, "distanceZeroOne.RDS"))

distZeroOne <-
  as.matrix(parDist(t(zeroOneAvg), method = "hellinger",
                    diag = TRUE, upper = TRUE))^2

distZeroOneLong <-
  rownames_to_column(as.data.frame(distZeroOne), var = "MainCluster")
distZeroOneLong <-
  pivot_longer(distZeroOneLong,
               cols = !MainCluster,
               names_to = "OtherCluster", 
               values_to = "Distance")

distZeroOneLong <-
  as.data.frame(distZeroOneLong[distZeroOneLong[["Distance"]] != 0.0, ])

assert_that(identical(distZeroOneLong[, 1:2], allRes[, 1:2]))
[1] TRUE
perm <- order(distZeroOneLong[["Distance"]])

Scatter plot of ΔGDI vs Distance

distDF <- cbind(distZeroOneLong[, "Distance", drop = FALSE],
                sqrt(distZeroOneLong[, "Distance", drop = FALSE]))
colnames(distDF) <- c("Distance", "DistanceSqrt")

D2IPlot <- ggplot(cbind(allResList[["20%"]], distDF),
                  aes(x = Distance, y = GDIIncrement)) +
             geom_point() +
             geom_smooth(method = lm, formula = y ~ x)
           # +  xlim(0, 1.5) + ylim(0, 1.5) + coord_fixed()

plot(D2IPlot)

Recall cluster distance and add it to the results

zeroOneAvg <- readRDS(file.path(outDir,  "allZeroOne.RDS"))
# zeroOneAvg <- readRDS(file.path(outDir, "distanceZeroOne.RDS"))

distZeroOne <-
  as.matrix(parDist(t(zeroOneAvg), method = "hellinger",
                    diag = TRUE, upper = TRUE))^2

distZeroOneLong <-
  rownames_to_column(as.data.frame(distZeroOne), var = "MainCluster")
distZeroOneLong <-
  pivot_longer(distZeroOneLong,
               cols = !MainCluster,
               names_to = "OtherCluster", 
               values_to = "Distance")

distZeroOneLong <-
  as.data.frame(distZeroOneLong[distZeroOneLong[["Distance"]] != 0.0, ])

assert_that(identical(distZeroOneLong[, 1:2], allRes[, 1:2]))
[1] TRUE
perm <- order(distZeroOneLong[["Distance"]])

Scatter plot of ΔGDI vs Distance

distDF <- cbind(distZeroOneLong[, "Distance", drop = FALSE],
                sqrt(distZeroOneLong[, "Distance", drop = FALSE]))
colnames(distDF) <- c("Distance", "DistanceSqrt")

D2IPlot <- ggplot(cbind(allResList[["20%"]], distDF),
                  aes(x = Distance, y = GDIIncrement)) +
             geom_point() +
             geom_smooth(method = lm, formula = y ~ x)
           # +  xlim(0, 1.5) + ylim(0, 1.5) + coord_fixed()

plot(D2IPlot)

Merge all data and plot it using a-priory (squared) distance as discriminant

allResPerm <-
  do.call(rbind, lapply(allResList, function(df) df[perm, ]))

allResPerm <-
  cbind(allResPerm,
        "ClusterPair" = rep.int(seq_along(perm),
                                length(allResList)),
        "Distance"    = rep(distZeroOneLong[["Distance"]][perm],
                            length(allResList)))
rownames(allResPerm) <- NULL

dim(allResPerm)
[1] 1050    7
allResWithBasePerm <-
  do.call(rbind,lapply(allResListWithBase, function(df) df[perm, ]))

allResWithBasePerm <-
  cbind(allResWithBasePerm,
        "ClusterPair" = rep.int(seq_along(perm),
                                length(allResListWithBase)),
        "Distance"    = rep(distZeroOneLong[["Distance"]][perm],
                            length(allResListWithBase)))
rownames(allResWithBasePerm) <- NULL

dim(allResWithBasePerm)
[1] 1260    7
assert_that(
  all.equal(
    allResPerm,
    allResWithBasePerm[(length(perm) + 1):nrow(allResWithBasePerm), ],
    check.attributes = FALSE
  ),
  msg = "Trouble with meshing all res permutated data"
)
[1] TRUE

Plot all data using the mixing fraction as abscissas and the distance as color

IScPlot <-
  ggplot(allResPerm,
         aes(x = MixingFraction, y = GDIIncrement, color = Distance)
  ) +
  geom_point() +
  scale_color_continuous(type = "viridis") +
  scale_x_log10() +
  labs(
    x = "Ratio between sizes of second and first cluster",
    y = "Top 1% GDI score increment of genes"
  )

plot(IScPlot)

GScPlot <-
  ggplot(allResWithBasePerm,
         aes(x = MixingFraction, y = GDI, color = Distance)
  ) +
  geom_point() +
  scale_color_continuous(type = "viridis") +
  labs(
    x = "Ratio between sizes of second and first cluster",
    y = "Top 1% GDI score of genes"
  )

plot(GScPlot)

Reorder by cluster pair

reOrder <- function(df, numBlocks) {
  blockLength <- nrow(df) / numBlocks
  permut <- rep(1:blockLength, each = numBlocks) +
              rep(seq(0, nrow(df) - 1, by = blockLength), times = numBlocks)
  return(df[permut, ])
}

allResPerm2 <- reOrder(allResPerm, 5)
allResWithBasePerm2 <- reOrder(allResWithBasePerm, 6)

head(allResPerm[["ClusterPair"]], 20)
 [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20
head(allResPerm2[["ClusterPair"]], 20)
 [1] 1 1 1 1 1 2 2 2 2 2 3 3 3 3 3 4 4 4 4 4

Plot by lines

numBuckets <- 6L
bucketLength <- length(perm) %/% numBuckets

ILinesPLot <-
  ggplot(
    allResPerm2,
    aes(x = MixingFraction, y = GDIIncrement,
        color = (ClusterPair - 1) %/% bucketLength + 0.5)
  ) + 
  geom_path(aes(group = ClusterPair)) +
  theme(legend.position = "none") +
  scale_colour_stepsn(
    colours = rev(hcl.colors(numBuckets, palette = "Dark 2"))
  ) +
  labs(
    x = "Ratio between sizes of second and first cluster",
    y = "Top 1% GDI score increment of genes"
  )

plot(ILinesPLot)

GLinesPLot <-
  ggplot(
    allResWithBasePerm2,
    aes(x = MixingFraction, y = GDI,
        color = (ClusterPair - 1) %/% bucketLength + 0.5)
  ) + 
  geom_path(aes(group = ClusterPair)) +
  theme(legend.position = "none") +
  scale_colour_stepsn(
    colours = rev(hcl.colors(numBuckets, palette = "Dark 2"))
  ) +
  labs(
    x = "Ratio between sizes of second and first cluster",
    y = "Top 1% GDI score of genes"
  )

plot(GLinesPLot)

ΔGDI main plot

numBuckets <- 6L
bucketLength <- length(perm) %/% numBuckets

mg <- function(mixings) {
  res <- mixings
  actOn <- res != 0.0
  res[actOn] <- round(log2(res[actOn] * 40))
  return(res)
}

group <- factor((allResPerm[["ClusterPair"]] - 1) %/% bucketLength + 1)
levels(group) <- paste0("Distance bin ", 1:numBuckets)

discreteMixing <- factor(mg(allResPerm[["MixingFraction"]]))
levels(discreteMixing) <- names(allMixingFractions)

allResPermBuck <-
  cbind(allResPerm, "Group" = group, "DiscreteMixing" = discreteMixing)

first.part  <- c("Distance bin 1", "Distance bin 2", "Distance bin 3")
second.part <- c("Distance bin 4", "Distance bin 5", "Distance bin 6")

IBoxPlotA <-
  ggplot(
    allResPermBuck[allResPermBuck[["Group"]] %in% first.part,],
    aes(x = DiscreteMixing, y = GDIIncrement,
        fill = Group, group = DiscreteMixing)
  ) +
  geom_boxplot() +
  geom_jitter(width = 0.25, alpha = 0.5) +
  scale_fill_manual(values = c("#0B5345", "#117A65", "#16A085")) +
  facet_grid2(cols = vars(Group)) +
  scale_y_continuous(
    limits = c(-0.05, 0.95),
    breaks = scales::breaks_width(0.5)
  ) +
  theme_light() +
  theme(legend.position = "none", axis.title.x = element_blank()) +
  labs(
    y = ""
  )

IBoxPlotB <-
  ggplot(
    allResPermBuck[allResPermBuck[["Group"]] %in% second.part, ],
    aes(x = DiscreteMixing, y = GDIIncrement,
        fill = Group, group = DiscreteMixing)
  ) +
  geom_boxplot() +
  geom_jitter(width = 0.25, alpha = 0.5) +
  scale_fill_manual(values = c("#1ABC9C", "#48C9B0", "#A3E4D7")) +
  facet_grid2(cols = vars(Group)) +
  scale_y_continuous(
    limits = c(-0.05, 2.2),
    breaks = scales::breaks_width(0.5)
  ) +
  theme_light() +
  theme(legend.position = "none") +
  labs(
    x = "Ratio between sizes of second and first cluster",
    y = "Variation of the 99th percentile of GDI scores"
  )

finalIPlot <-
  gridExtra::grid.arrange(
    IBoxPlotA, IBoxPlotB, nrow = 3, ncol = 1,
    layout_matrix = cbind(c(1, rep(2, 2)))
  )

grDevices::pdf(file.path(outDir, "GDI-Increment_LaManno_ForebrainDorsal.pdf"),
               width = 7, height = 7)
grid::grid.newpage()
grid::grid.draw(finalIPlot)
grDevices::dev.off()
png 
  2 

GDI main plot

numBuckets <- 6L
bucketLength <- length(perm) %/% numBuckets

mg <- function(mixings) {
  res <- mixings
  actOn <- res != 0.0
  res[actOn] <- round(log2(res[actOn] * 40))
  return(res)
}

group <- factor((allResWithBasePerm[["ClusterPair"]] - 1) %/% bucketLength + 1)
levels(group) <- paste0("Distance bin ", 1:numBuckets)

discreteMixing <- factor(mg(allResWithBasePerm[["MixingFraction"]]))
levels(discreteMixing) <- c("00%", names(allMixingFractions))

allResWithBasePermBuck <-
  cbind(allResWithBasePerm, "Group" = group, "DiscreteMixing" = discreteMixing)

first.part  <- c("Distance bin 1", "Distance bin 2", "Distance bin 3")
second.part <- c("Distance bin 4", "Distance bin 5", "Distance bin 6")


GBoxPlotA <-
  ggplot(
    allResWithBasePermBuck[allResWithBasePermBuck[["Group"]] %in% first.part,],
    aes(x = DiscreteMixing, y = GDI,
        fill = Group, group = DiscreteMixing)
  ) +
  geom_boxplot() +
  geom_jitter(width = 0.25, alpha = 0.5) +
  scale_fill_manual(values = c("#0B5345", "#117A65", "#16A085")) +
  facet_grid2(cols = vars(Group)) +
  scale_y_continuous(
    limits = c(1.30, 2.35),
    breaks = scales::breaks_width(0.5)
  ) +
  theme_light() +
  theme(legend.position = "none", axis.title.x = element_blank()) +
  labs(
    y = "Top 1% GDI score of genes"
  )

GBoxPlotB <-
  ggplot(
    allResWithBasePermBuck[allResWithBasePermBuck[["Group"]] %in% second.part,],
    aes(x = DiscreteMixing, y = GDI,
        fill = Group, group = DiscreteMixing)
  ) +
  geom_boxplot() +
  geom_jitter(width = 0.25, alpha = 0.5) +
  scale_fill_manual(values = c("#1ABC9C", "#48C9B0", "#A3E4D7")) +
  facet_grid2(cols = vars(Group)) +
  scale_y_continuous(
    limits = c(1.30, 3.60),
    breaks = scales::breaks_width(0.5)
  ) +
  theme_light() +
  theme(legend.position = "none") +
  labs(
    x = "Ratio between sizes of second and first cluster",
    y = "Top 1% GDI score of genes"
  )

finalGPlot <-
  gridExtra::grid.arrange(
    GBoxPlotA, GBoxPlotB, nrow = 3, ncol = 1,
    layout_matrix = cbind(c(1, rep(2, 2)))
  )

grDevices::pdf(file.path(outDir, "GDI_LaManno_ForebrainDorsal.pdf"),
               width = 7, height = 7)
grid::grid.newpage()
grid::grid.draw(finalGPlot)
grDevices::dev.off()
png 
  2 

Load calculated data about multi clusters cases for analysis

Compare estimated vs real GDI increment

# Scatter plot of the effective increment [Y] against estimated increment [X]
pg <- ggplot(resMix20_2, aes(x = PredictedGDIIncrement, y = GDIIncrement)) +
  geom_point() +
  geom_smooth(method=lm, formula = y ~ x + 0) +
  coord_fixed() +
  xlim(0, 1.5) + ylim(0, 1.5) + 
  labs(
    x = "Predicted - variation of the 99th percentile of GDI scores",
    y = "Observed - variation of the 99th percentile of GDI scores"
  )

plot(pg)

pdf(file.path(outDir, "Estimated-vs-Real_GDI-Increment.pdf"),
               width = 7, height = 7)
plot(pg)
dev.off()
png 
  2 

The plot shows that having the 20% extraneous cells in the mixture coming from multiple clusters does not affect significantly the sensitivity of the GDI to score cluster uniformity.

Sys.time()
[1] "2026-02-02 09:47:35 CET"
#Sys.info()
sessionInfo()
R version 4.5.2 (2025-10-31 ucrt)
Platform: x86_64-w64-mingw32/x64
Running under: Windows 11 x64 (build 26200)

Matrix products: default
  LAPACK version 3.12.1

locale:
[1] LC_COLLATE=English_United States.utf8 
[2] LC_CTYPE=English_United States.utf8   
[3] LC_MONETARY=English_United States.utf8
[4] LC_NUMERIC=C                          
[5] LC_TIME=English_United States.utf8    

time zone: Europe/Rome
tzcode source: internal

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] gridExtra_2.3      ggh4x_0.3.1        lubridate_1.9.4    forcats_1.0.1     
 [5] stringr_1.6.0      dplyr_1.1.4        purrr_1.2.0        readr_2.1.6       
 [9] tibble_3.3.0       tidyverse_2.0.0    tidyr_1.3.1        parallelDist_0.2.7
[13] data.table_1.18.0  zeallot_0.2.0      ggplot2_4.0.1      scales_1.4.0      
[17] rlang_1.1.6        assertthat_0.2.1  

loaded via a namespace (and not attached):
 [1] generics_0.1.4        lattice_0.22-7        stringi_1.8.7        
 [4] hms_1.1.4             digest_0.6.39         magrittr_2.0.4       
 [7] evaluate_1.0.5        grid_4.5.2            timechange_0.3.0     
[10] RColorBrewer_1.1-3    fastmap_1.2.0         Matrix_1.7-4         
[13] jsonlite_2.0.0        mgcv_1.9-3            viridisLite_0.4.2    
[16] codetools_0.2-20      cli_3.6.5             splines_4.5.2        
[19] withr_3.0.2           yaml_2.3.12           otel_0.2.0           
[22] tools_4.5.2           tzdb_0.5.0            vctrs_0.6.5          
[25] R6_2.6.1              lifecycle_1.0.4       htmlwidgets_1.6.4    
[28] pkgconfig_2.0.3       RcppParallel_5.1.11-1 pillar_1.11.1        
[31] gtable_0.3.6          glue_1.8.0            Rcpp_1.1.0           
[34] xfun_0.54             tidyselect_1.2.1      knitr_1.51           
[37] farver_2.1.2          nlme_3.1-168          htmltools_0.5.9      
[40] labeling_0.4.3        rmarkdown_2.30        compiler_4.5.2       
[43] S7_0.2.1