# FDR analysis - define DE genes - Memento
#
# Author
#
# Silvia Giulia Galfrè

import os
import sys
import numpy as np
import pandas as pd
from scipy import sparse
from pathlib import Path

# Current working directory. All relative paths below ("src", "Results/...",
# "Data/...") are resolved against it. The value is not stored; as a bare
# expression it is only shown when run interactively.
os.getcwd()

# Path to the folder containing mementoTypeIError.py
# It is put first on sys.path so the import below can find the module there.
src_path = Path.cwd() / "src"
sys.path.insert(0, str(src_path))

import mementoTypeIError

# dirOut        : passed to mementoTypeIError.mementoDEA as its `dirOut` argument.
# dataSetDir    : folder holding "Cells_Usage_DataFrame.csv" (the dataset index).
# datasetOutDir : folder the per-dataset *_RawData / *_Genes / *_barcodes /
#                 *_clusters CSV files are read from in the loop below.
dirOut = "Results/FDR/"
dataSetDir = "Data/MouseCortexFromLoom/FDR/MergedClusters_For_FDR/"
datasetOutDir = "Results/FDR_dataset/"

# Dataset index table; the first CSV column becomes the DataFrame index.
# The loop below reads the 'Group' and 'Collection' columns of each row.
datasets_csv = pd.read_csv(
    os.path.join(dataSetDir, "Cells_Usage_DataFrame.csv"),
    index_col=0
)

# Number of columns in the index table (bare expression, not stored).
datasets_csv.shape[1]

# Run mementoTypeIError.mementoDEA once per row of `datasets_csv`.
# Each row identifies one dataset through its 'Group' and 'Collection' columns;
# the matching CSV files are loaded from `datasetOutDir`.
for _, row in datasets_csv.iterrows():
    file_code = f"{row['Group']}_{row['Collection']}"
    print(file_code)

    # `datasetOutDir` already ends with "/", so build the paths with
    # os.path.join rather than string concatenation to avoid a doubled
    # separator ("Results/FDR_dataset//...").
    raw_path = os.path.join(datasetOutDir, f"{file_code}_RawData.csv")
    genes_path = os.path.join(datasetOutDir, f"{file_code}_Genes.csv")
    barcodes_path = os.path.join(datasetOutDir, f"{file_code}_barcodes.csv")
    clusters_path = os.path.join(datasetOutDir, f"{file_code}_clusters.csv")

    # Raw counts matrix (likely genes x cells), with rownames/colnames saved by R
    raw_df = pd.read_csv(raw_path, index_col=0)  # keeps rownames as index
    # Convert to sparse (optional, but recommended)
    dataRaw = sparse.csc_matrix(raw_df.values)

    # Genes/cells saved by write.csv are usually 1-column with an index column;
    # the first remaining column is taken as the list of names.
    genes_df = pd.read_csv(genes_path, index_col=0)
    genes = genes_df.iloc[:, 0].astype(str).to_list()

    barcodes_df = pd.read_csv(barcodes_path, index_col=0)
    barcodes = barcodes_df.iloc[:, 0].astype(str).to_list()

    # Clusters: has fake_cluster and barcode columns (plus an index column from write.csv)
    clusters_df = pd.read_csv(clusters_path, index_col=0)
    # ensure columns are strings
    clusters_df["barcode"] = clusters_df["barcode"].astype(str)
    clusters_df["fake_cluster"] = clusters_df["fake_cluster"].astype(str)

    # `df` is overwritten on every iteration, so after the loop it holds only
    # the return value for the last dataset.
    # NOTE(review): results are presumably also written under `dirOut` by
    # mementoDEA - confirm against mementoTypeIError.py.
    df = mementoTypeIError.mementoDEA(
        dataRaw=dataRaw,
        genes=genes,
        barcodes=barcodes,
        code=file_code,
        labels_df=clusters_df,
        dirOut=dirOut,
        num_cpus=10,
        num_boot=5000,
        capture_rate=0.05,
        obs_key="fake_cluster"
    )

# Timestamp taken when execution reaches this point. With no tz argument,
# datetime.now() returns a naive local-time value; the result is not stored.
# NOTE(review): `timezone` is imported but not used in the visible code.
from datetime import datetime, timezone
datetime.now()

# Output: datetime.datetime(2026, 1, 17, 17, 59, 59, 283548)

import importlib.metadata as md

def session_packages_with_versions():
    """List the installed distributions behind the modules loaded so far.

    The top-level name of every entry in ``sys.modules`` is collected,
    skipping empty names and names that start with an underscore. Each name
    is then looked up with ``importlib.metadata.version``; names that do not
    resolve to an installed distribution (standard-library modules, or
    packages whose distribution name differs from the import name) are
    left out.

    Returns:
        A list of ``(name, version)`` tuples ordered by name.
    """
    # Snapshot the names first so the metadata lookups below cannot change
    # the collection being iterated.
    top_level = set()
    for module_name in sys.modules:
        if module_name and not module_name.startswith("_"):
            top_level.add(module_name.split(".")[0])

    found = []
    for pkg in sorted(top_level):
        try:
            version = md.version(pkg)
        except md.PackageNotFoundError:
            # stdlib or not a distribution name
            continue
        if version is not None:
            found.append((pkg, version))
    return found

# Print one pip-style "name==version" line for every distribution that
# session_packages_with_versions() reports for the current session.
for name, ver in session_packages_with_versions():
    print(f"{name}=={ver}")

# Output:
# IPython==8.38.0
# anndata==0.11.4
# asttokens==3.0.1
# colorama==0.4.4
# comm==0.2.3
# cycler==0.12.1
# debugpy==1.8.19
# decorator==5.2.1
# exceptiongroup==1.3.1
# executing==2.2.1
# h5py==3.15.1
# ipykernel==7.1.0
# jedi==0.19.2
# joblib==1.5.3
# jupyter_client==8.8.0
# jupyter_core==5.9.1
# kiwisolver==1.4.9
# legacy_api_wrap==1.5
# llvmlite==0.46.0
# matplotlib==3.10.8
# matplotlib_inline==0.2.1
# natsort==8.4.0
# numba==0.63.1
# numpy==2.2.6
# packaging==25.0
# pandas==2.3.3
# parso==0.8.5
# patsy==1.0.2
# platformdirs==4.5.1
# prompt_toolkit==3.0.52
# psutil==7.2.1
# pure_eval==0.2.3
# pygments==2.19.2
# pyparsing==3.3.1
# pytz==2022.1
# scanpy==1.11.5
# scipy==1.15.3
# six==1.16.0
# stack_data==0.6.3
# statsmodels==0.14.6
# threadpoolctl==3.6.0
# tornado==6.5.4
# traitlets==5.14.3
# typing_extensions==4.15.0
# tzdata==2025.3
# wcwidth==0.2.14