2 Cleaning & Normalization

Code

#pkg
library(Seurat)
library(SoupX)
library(scDblFinder)
library(Matrix)
library(rtracklayer)
library(GenomeInfoDb)
library(tidyverse)
library(forcats)
library(ggplot2)
library(here)
library(tidyverse)
library(readxl)
library(gt)
library(paletteer)
library(scales)
library(DT)
library(htmlwidgets)
library(rtracklayer)
library(GenomeInfoDb)
library(fs)
library(glmGamPoi)
library(furrr)
library(future)

if (requireNamespace("glmGamPoi", quietly = TRUE)) message("glmGamPoi available: SCT will be faster.")
if (requireNamespace("presto", quietly = TRUE)) message("presto available: marker finding will be faster.")

# src
source(here::here("src/function/stat_function/stat_analysis_main.R")) # for make plot 
source(here::here("src/function/fig_export.R")) # This function saves a given plot (plot_x) as both a PDF and a high-resolution PNG file at specified dimensions.

runs_base <- "/mnt/ibioldata/LBMC_data/cmaslard/data/ScBdLR_init_2025_private/cellranger/"
#runs_base <- "/Volumes/IBIOLdata_storage/LBMC_data/cmaslard/data/ScBdLR_init_2025_private/cellranger/"

A sample is considered ‘good’ if:

Estimated Number of Cells: 627–14k. Anything <~1–2k is low yield for droplet-based sc/snRNA-seq.
Mean Reads per Cell: ~28k–120k. High depth (>80k) only helps if it translates into more genes/UMIs.
Median Genes per Cell / Median UMI per Cell: ~1.4k–6.7k genes and ~1.9k–6.7k UMIs. ≥2–3k genes/cell is a good floor in plants; 1.4–2.0k is weak.
Valid Barcodes / Valid UMIs: ~92–98% / ~100% — excellent across the board.
Sequencing Saturation: 52–82%. >70–80% means diminishing returns from more depth; ~50–65% still has room if you need more molecules.
Q30 (Barcode/RNA/UMI): ~92–95% — very good.
Reads Mapped (Genome / Confidently to Genome): ~93–96% / ~49–86% — generally solid.
Confidently Intergenic/Intronic/Exonic: Exonic varies a lot (34–72%). Exonic <~50% suggests more non-transcriptome signal (ambient RNA, pre-mRNA bias, annotation mismatch).
Confidently to Transcriptome: ~37–73%. Good runs are ≥60–70%. <50–55% is a red flag for downstream expression analysis.
Reads Antisense to Gene: ~3–6% — fine.
Fraction Reads in Cells: ~63–80%. ≥70% is healthy; low 60s suggests ambient RNA / many empty droplets.
Total Genes Detected: ~31–37k — consistent.
High mapped read rate (>70–80% of expected genome/mRNA).
Median number of genes detected per cell/nucleus is correct:
Whole cells: ~1,500–5,000 genes
Nuclei: often lower (500–2,500), but consistent between samples.
Low proportion of organelle UMIs (chloroplast + mitochondrion) → typically <5–10%, otherwise a lot of debris/ambient RNA.
Low proportion of ribosomal reads (depending on capture).
Doublets/minor artefacts ≤5%.

2.1 Summary of the various datasets (see also the previous page)

Code

runs <- list.dirs(runs_base, full.names = TRUE, recursive = FALSE)
runs_org  <- runs[str_ends(basename(runs), "_org")]
file_name <- "metrics_summary.csv"

all_tbl <- tibble(path = paste0(runs_org,"/",file_name)) %>% 
  mutate(filename = path_file(path),
         dir      = path_dir(path),
         sample = path_file(runs_org))  %>% 
  dplyr::mutate(data = map(path, ~ readr::read_csv(.x, show_col_types = FALSE))) |>
  dplyr::select(sample, data)  %>% 
  unnest(data)

out_dir <- here::here("data/output/verification")
dir.create(out_dir, showWarnings = FALSE, recursive = TRUE)
write.csv(all_tbl,
          file.path(out_dir, "all_sample_metrics_summary.csv"),
          row.names = FALSE)

Code

# ---- Palettes & options ----
c_col <- c("#ffffff", paletteer::paletteer_dynamic("cartography::green.pal", 20) |> as.character())
c_col_light_blue <- c("#edf2fb", "#e2eafc", "#d7e3fc", "#ccdbfd", "#c1d3fe")
c_table_width <- gt::px(900)
c_save <- FALSE

# ---- Lecture ----
out_dir  <- here::here("data/output/verification")
csv_path <- file.path(out_dir, "all_sample_metrics_summary.csv")
tbl <- readr::read_csv(csv_path, show_col_types = FALSE)

stopifnot("sample" %in% names(tbl))

# ---- 1) Convertir les colonnes pourcentage "xx.x%" -> proportion numérique (0..1) ----
pct_cols <- names(tbl)[map_lgl(tbl, ~ is.character(.x) && any(str_detect(.x, "%"), na.rm = TRUE))]
tbl <- tbl |>
  mutate(across(all_of(pct_cols),
                ~ readr::parse_number(.x, locale = readr::locale(grouping_mark = ",")) / 100))

# (optionnel) convertir d'autres colonnes "numériques en texte" (sans %)
# num_like_cols <- names(tbl)[map_lgl(tbl, ~ is.character(.x) && all(str_detect(.x, "^[0-9,\\.]+$") | is.na(.x)))]
# tbl <- tbl |> mutate(across(all_of(num_like_cols),
#                             ~ readr::parse_number(.x, locale = readr::locale(grouping_mark=","))))

# ---- 2) Construire le tableau gt : chaque colonne numérique a sa propre échelle de couleur ----
num_cols <- setdiff(names(select(tbl, where(is.numeric))), "sample")
view_tbl <- select(tbl, sample, all_of(num_cols))

gt_table <- gt(view_tbl, rowname_col = "sample") |>
  tab_options(table.width = c_table_width)

# Heuristique: colonnes de "compte" (grands nombres) -> bleu; % -> format percent; autres -> vert
count_like_pat <- "(^n_|count|cells|reads|num$|number)"
gt_table <- reduce(num_cols, .init = gt_table, .f = function(g, col) {
  dom <- range(view_tbl[[col]], na.rm = TRUE)
  if (!is.finite(dom[1]) || !is.finite(dom[2])) dom <- c(0, 0)  # garde-fou

  is_percent <- col %in% pct_cols
  is_count   <- stringr::str_detect(col, count_like_pat) && !is_percent

  g1 <- if (is_percent) {
    fmt_percent(g, columns = dplyr::all_of(col), decimals = 1, scale_values = TRUE)
  } else {
    fmt_number(g, columns = dplyr::all_of(col), decimals = if (is_count) 0 else 1)
  }

  data_color(
    g1,
    columns = dplyr::all_of(col),
    colors  = scales::col_numeric(
      palette = if (is_count) c_col_light_blue else c_col,
      domain  = dom
    )
  )
})

if (isTRUE(c_save)) {
  gtsave(gt_table, filename = file.path(out_dir, "all_sample_metrics_summary_gt.html"))
}

gt_table

	Estimated Number of Cells	Mean Reads per Cell	Median Genes per Cell	Number of Reads	Valid Barcodes	Valid UMIs	Sequencing Saturation	Q30 Bases in Barcode	Q30 Bases in RNA Read	Q30 Bases in UMI	Reads Mapped to Genome	Reads Mapped Confidently to Genome	Reads Mapped Confidently to Intergenic Regions	Reads Mapped Confidently to Intronic Regions	Reads Mapped Confidently to Exonic Regions	Reads Mapped Confidently to Transcriptome	Reads Mapped Antisense to Gene	Fraction Reads in Cells	Total Genes Detected	Median UMI Counts per Cell
Brachy_Bd_4h_B_org	627.0	120,780.0	3,569.0	75,729,176.0	95.2%	99.9%	75.4%	95.6%	93.0%	94.9%	95.4%	58.5%	7.7%	8.2%	42.7%	46.5%	4.4%	72.7%	30,995.0	6,303.0
Brachy_Brachy_0h_org	14,030.0	27,813.0	2,331.0	390,210,890.0	96.8%	100.0%	52.2%	95.1%	92.8%	94.7%	95.5%	83.6%	7.5%	5.8%	70.2%	72.4%	3.6%	68.1%	35,990.0	4,005.0
Brachy_Brachy_12_Ah_org	1,604.0	87,191.0	4,048.0	139,854,898.0	95.9%	100.0%	76.2%	95.3%	93.0%	94.9%	94.3%	75.2%	8.7%	7.8%	58.8%	61.8%	4.7%	70.3%	32,937.0	6,726.0
Brachy_Brachy_12Bh_org	9,947.0	36,869.0	2,544.0	366,737,926.0	96.5%	100.0%	65.5%	95.2%	93.0%	94.8%	94.0%	78.8%	9.2%	8.6%	61.0%	63.4%	6.2%	65.6%	37,017.0	3,849.0
Brachy_Brachy_4h_A_org	4,655.0	65,958.0	2,808.0	307,034,042.0	94.1%	100.0%	70.4%	95.3%	92.4%	94.5%	94.7%	60.8%	8.0%	8.5%	44.4%	48.3%	4.6%	70.4%	35,845.0	4,592.0
Brachy_Brachy_4h_org	7,007.0	31,408.0	1,930.0	220,074,138.0	98.1%	100.0%	82.3%	95.2%	93.2%	94.8%	94.5%	85.7%	9.2%	4.6%	71.8%	72.6%	3.8%	80.0%	32,732.0	2,437.0
Brachy_Brachy_6h_org	9,673.0	35,593.0	1,371.0	344,291,058.0	93.6%	99.9%	74.9%	95.2%	93.8%	94.8%	93.3%	63.0%	9.9%	8.5%	44.6%	47.9%	5.1%	62.9%	34,482.0	1,887.0
Brachy_Brachy_8h_org	2,484.0	120,346.0	2,329.0	298,939,576.0	91.8%	100.0%	80.0%	95.2%	94.3%	94.8%	95.5%	48.5%	7.8%	6.3%	34.5%	36.9%	3.8%	64.9%	32,953.0	3,627.0

2.2 Calculate the % of chloroplasts and mitochondria

Chloroplast data came from NC_011032
Mitochondria: accept common patterns (works for Brachy or Rice fallback) NC_011033.1 and NC_011033

Code

# ---- 0) Pick a GTF (prefer organelle ref, fallback to nuclear) ----
gtf_candidates <- c(
  here::here("data/refs/BdistachyonBd21_3_10X_Genome_organelle/genes/genes.gtf.gz"),
  here::here("data/refs/BdistachyonBd21_3_10X_Genome/genes/genes.gtf.gz")
)
gtf_file <- gtf_candidates[file.exists(gtf_candidates)][1]
message("Using GTF: ", gtf_file)
gtf <- rtracklayer::import(gtf_file)

# ---- 1) Define organelle gene feature sets from the GTF ----
seqs <- as.character(GenomeInfoDb::seqnames(gtf))

# Chloroplast contig was normalized during build to "NC_011032"
is_cp <- seqs == "NC_011032"

# Mitochondria: accept common patterns (works for Brachy or Rice fallback)
is_mt <- seqs %in% c("NC_011033.1", "NC_011033")

cp_names <- unique(gtf$gene_name[is_cp & !is.na(gtf$gene_name)])
cp_ids   <- unique(gtf$gene_id  [is_cp & !is.na(gtf$gene_id)])
mt_names <- unique(gtf$gene_name[is_mt & !is.na(gtf$gene_name)])
mt_ids   <- unique(gtf$gene_id  [is_mt & !is.na(gtf$gene_id)])

cp_features_ref <- unique(c(cp_names, cp_ids))
mt_features_ref <- unique(c(mt_names, mt_ids))

# Seurat often converts "_" -> "-" in rownames; prepare both variants
dash <- function(x) gsub("_", "-", x, fixed = TRUE)
cp_features_ref_dash <- dash(cp_features_ref)
mt_features_ref_dash <- dash(mt_features_ref)

# ---- Helpers ----
# robust counts extractor (Seurat v4/v5)
get_counts <- function(obj, assay = DefaultAssay(obj)) {
  out <- tryCatch(GetAssayData(obj, assay = assay, layer = "counts"),
                  error = function(e) NULL)
  if (is.null(out)) out <- GetAssayData(obj, assay = assay, slot = "counts")
  out
}

# find matrix in a run folder (works with or without "outs/")
find_matrix <- function(run_dir) {
  cand <- c(file.path(run_dir, "filtered_feature_bc_matrix"),
            file.path(run_dir, "outs", "filtered_feature_bc_matrix"))
  d <- cand[dir.exists(cand)][1]
  if (is.na(d)) {
    # also try HDF5
    h5 <- c(file.path(run_dir, "filtered_feature_bc_matrix.h5"),
            file.path(run_dir, "outs", "filtered_feature_bc_matrix.h5"))
    h5 <- h5[file.exists(h5)][1]
    if (!is.na(h5)) return(h5)
    return(NULL)
  }
  d
}

# compute % chloro / % mito for one run
percent_organelle_one <- function(run_dir) {
  mat_path <- find_matrix(run_dir)
  if (is.null(mat_path)) return(NULL)

  if (dir.exists(mat_path)) {
    mat <- Read10X(mat_path)
  } else if (grepl("\\.h5$", mat_path)) {
    mat <- Read10X_h5(mat_path)
  } else {
    return(NULL)
  }

  s <- CreateSeuratObject(mat, project = basename(run_dir))
  counts <- get_counts(s)
  rn <- rownames(counts)

  # Match features (support "_"->"-")
  feats_cp <- intersect(rn, unique(c(cp_features_ref, cp_features_ref_dash)))
  feats_mt <- intersect(rn, unique(c(mt_features_ref, mt_features_ref_dash)))

  tot <- pmax(Matrix::colSums(counts), 1)
  pct_cp <- if (length(feats_cp)) Matrix::colSums(counts[feats_cp, , drop = FALSE]) / tot * 100 else rep(0, ncol(counts))
  pct_mt <- if (length(feats_mt)) Matrix::colSums(counts[feats_mt, , drop = FALSE]) / tot * 100 else rep(0, ncol(counts))

  tibble(
    sample = basename(run_dir),
    cell   = colnames(counts),
    pct_chloroplast = as.numeric(pct_cp),
    pct_mito        = as.numeric(pct_mt),
    pct_organelle   = as.numeric(pct_cp + pct_mt)
  )
}

# ---- 2) Apply to all local runs that end with "_org" ----
runs_all  <- list.dirs(runs_base, full.names = TRUE, recursive = FALSE)
runs_org  <- runs_all[str_ends(basename(runs_all), "_org")]

df <- purrr::map_dfr(runs_org, percent_organelle_one)
stopifnot(nrow(df) > 0)

# tidy & ordering
df <- df %>% filter(!is.na(pct_organelle))
df <- df %>% mutate(sample = factor(sample),
                    sample_f = forcats::fct_reorder(sample, pct_organelle, .fun = median))

# ---- 3) Violin plots WITH points ----
set.seed(123)
max_points <- 5000
df_points  <- if (nrow(df) > max_points) dplyr::sample_n(df, max_points) else df

# A) Faceted by run (chloroplast vs mito vs total)
df_long <- df %>%
  pivot_longer(c(pct_chloroplast, pct_mito, pct_organelle),
               names_to = "metric", values_to = "percent")

p_facets <- ggplot(df_long, aes(x = "cells", y = percent)) +
  geom_violin(scale = "width", trim = TRUE) +
  geom_boxplot(width = 0.12, outlier.shape = NA, alpha = 0.5) +
  geom_point(
    data = df_points %>% pivot_longer(c(pct_chloroplast, pct_mito, pct_organelle),
                                      names_to = "metric", values_to = "percent"),
    aes(x = "cells", y = percent),
    position = position_jitter(width = 0.08, height = 0),
    size = 0.4, alpha = 0.35
  ) +
  facet_wrap(~ sample_f + metric, scales = "free_y") +
  labs(
    title = "Organelle RNA content per run",
    x = "",
    y = "Percent of UMI counts per nucleus (%)"
  ) +
  theme_bw() +
  theme(strip.text = element_text(face = "bold"),
        axis.text.x = element_blank(),
        panel.grid.minor = element_blank())
print(p_facets)

# B) All runs side-by-side (one panel per metric)
p_side <- ggplot(df_long %>% filter(metric != "pct_organelle"), aes(x = sample, y = percent)) +
  geom_violin(scale = "width", trim = TRUE) +
  geom_boxplot(width = 0.12, outlier.shape = NA, alpha = 0.5) +
  geom_point(
    data = df_points %>%
      pivot_longer(c(pct_chloroplast, pct_mito, pct_organelle),
                   names_to = "metric", values_to = "percent") %>%
      filter(metric != "pct_organelle") %>% 
      mutate(sample_f = forcats::fct_reorder(sample, percent, .fun = median)),
    aes(x = sample_f, y = percent),
    position = position_jitter(width = 0.15, height = 0),
    size = 0.4, alpha = 0.35
  ) +
  facet_wrap(~ metric, scales = "free_y") +
  labs(
    title = "Organelle RNA content across runs",
    x = "Sample (run)",
    y = "Percent of UMI counts per nucleus (%)"
  ) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        panel.grid.minor = element_blank())
print(p_side)

fig_export(path = "report/snrnaseq/plot/cleaning/n_organelle", plot_x = p_side, height_i = 6, width_i = 16, res = 600)

# ---- 4) Numeric summary per run ----
summary_tbl <- df %>%
  group_by(sample) %>%
  summarise(
    n_cells           = n(),
    median_pct_chloro = median(pct_chloroplast),
    mean_pct_chloro   = mean(pct_chloroplast),
    max_pct_chloro    = max(pct_chloroplast),
    median_pct_mito   = median(pct_mito),
    mean_pct_mito     = mean(pct_mito),
    max_pct_mito      = max(pct_mito),
    median_pct_org    = median(pct_organelle),
    mean_pct_org      = mean(pct_organelle),
    max_pct_org       = max(pct_organelle)
  ) %>% arrange(median_pct_org)

summary_tbl

# Optional: save summary
out_dir <- here::here("data/output/verification")
dir.create(out_dir, showWarnings = FALSE, recursive = TRUE)
write.csv(summary_tbl,
          file.path(out_dir, "percent_organelle_summary.csv"),
          row.names = FALSE)

Organelle quality control: start with ≤5 to 10% of total organelles; inspect distributions per sample. For all samples, sela meets expectations.

Code

# ---- Constantes (palette & tailles) ----
c_col <- c("#ffffff", paletteer::paletteer_dynamic("cartography::green.pal", 20) |> as.character())
c_col_light_blue <- c("#edf2fb", "#e2eafc", "#d7e3fc", "#ccdbfd", "#c1d3fe")
c_table_width <- gt::px(800)
c_save <- FALSE

# ---- Lecture : uniquement le CSV déjà créé ----
out_dir  <- here::here("data/output/verification")
csv_path <- file.path(out_dir, "percent_organelle_summary.csv")

summary_tbl <- readr::read_csv(csv_path, show_col_types = FALSE) |>
  arrange(median_pct_org)

# Domaines pour couleurs
max_org   <- max(summary_tbl$mean_pct_org, na.rm = TRUE)
max_cells <- max(summary_tbl$n_cells,     na.rm = TRUE)

# ---- 1) Tableau statique "gt" (look & feel proche de ton code) ----
gt_table <- summary_tbl |>
  dplyr::select(sample, n_cells, median_pct_org, mean_pct_org, max_pct_org, median_pct_chloro, mean_pct_chloro, max_pct_chloro, median_pct_mito, mean_pct_mito, max_pct_mito) %>% 
  gt(rowname_col = "sample") |>
  fmt_number(
    columns = c(median_pct_org,    mean_pct_org ,
                median_pct_chloro, mean_pct_chloro, 
                median_pct_mito,   mean_pct_mito
                 ),
    decimals = 1
  ) |>
  fmt_number(columns = n_cells, decimals = 0) |>
  data_color(
    columns = c(median_pct_org, mean_pct_org),
    colors  = scales::col_numeric(palette = c_col, domain = c(max_org, 0)) # dégradé inversé (fort -> foncé)
  ) |>
  data_color(
    columns = n_cells,
    colors  = scales::col_numeric(palette = c_col_light_blue, domain = c(0, max_cells))
  ) |>
  tab_options(
    table.width = c_table_width,
    heading.subtitle.font.size = 10,
    column_labels.border.bottom.width = gt::px(3)
  )

if (isTRUE(c_save)) {
  gtsave(gt_table, filename = file.path(out_dir, "percent_organelle_summary_gt.html"))
}

gt_table

	n_cells	median_pct_org	mean_pct_org	max_pct_org	max_pct_chloro	median_pct_mito	mean_pct_mito	max_pct_mito
Brachy_Brachy_4h_org	7,007	0.1	0.1	5.46697	0.5344418	0.1	0.1	5.46697
Brachy_Brachy_0h_org	14,030	0.2	0.4	43.27323	1.8480493	0.2	0.4	43.27323
Brachy_Brachy_12Bh_org	9,947	0.2	0.4	25.76000	3.1674208	0.2	0.4	23.92000
Brachy_Brachy_6h_org	9,673	0.2	0.4	62.91990	1.1194030	0.2	0.4	62.91990
Brachy_Brachy_4h_A_org	4,655	0.3	0.6	59.40803	1.4883347	0.3	0.6	59.30233
Brachy_Brachy_12_Ah_org	1,604	0.3	0.6	44.09308	3.7328094	0.3	0.6	42.09427
Brachy_Bd_4h_B_org	627	0.4	1.1	57.49588	4.2640990	0.3	1.0	57.49588
Brachy_Brachy_8h_org	2,484	0.7	1.8	59.79239	7.4576271	0.7	1.7	59.58478

Code

# ---- 2) Version interactive avec flèches de tri + filtres (DT) ----
dt_table <- DT::datatable(
  summary_tbl,
  rownames = FALSE,
  filter = "top",   # boîtes de filtre en tête de colonnes
  options = list(
    pageLength = 30,
    autoWidth  = TRUE,
    order = list(list(which(names(summary_tbl) == "median_pct_org") - 1, "asc"))
  ),
  class = "stripe hover order-column display"
) |>
  # petit barplot de fond pour "median_pct_org" (optionnel)
  DT::formatStyle(
    "median_pct_org",
    background = styleColorBar(c(0, max_org), c_col[length(c_col)]),
    backgroundSize = "100% 90%",
    backgroundRepeat = "no-repeat",
    backgroundPosition = "center"
  )

if (isTRUE(c_save)) {
  htmlwidgets::saveWidget(
    dt_table,
    file = file.path(out_dir, "percent_organelle_summary_interactive.html"),
    selfcontained = TRUE
  )
}

This matrix folder is then loaded into R/Seurat.

2.3 Cleaning and select good sample

Removes chloroplast and mitochondrial genes from the count matrices (using your GTF; with a fallback),
Applies ambient RNA correction (SoupX) + doublet removal (scDblFinder),
Runs per-cell QC and per-sample QC,
Decides, for each timepoint (0h, 4h, 6h, 8h, 12h), whether to:
Keep the best replicate only, or
Merge replicates when they are acceptable but individually small,
And guarantees you end up with one object per timepoint (by merging if needed).

I’ve kept it modular so you can tweak thresholds quickly.

What the script does

Discover runs (*_org) and parse a timepoint label from each sample name (0h, 4h, 6h, 8h, 12h).
Identify organelle features from the GTF (chloroplast and mitochondria contigs). Those rows are removed from the count matrices before QC.
For each run:

load 10x matrices (Read10X/Read10X_h5),
SoupX: estimate and correct ambient RNA,
build a Seurat object,
label %cp / %mt (only to measure; counts already have cp/mt removed),
scDblFinder: remove doublets,
apply cell-level QC (min features/UMIs; optional %organelle threshold if you want).
Summarize sample-level QC (cells kept, median genes/UMIs, fraction reads in cells if you have it, etc.).
Decision logic per timepoint:
Mark each replicate PASS if it meets minimal sample-level criteria (e.g., ≥2k cells retained, median genes/cell ≥2k, transcriptome/confident exonic proxy if present).
If multiple PASS → merge.
If none PASS but one is “near-pass” → optionally merge the near-pass replicates to salvage one timepoint. If only one PASS → keep best replicate.

Output:

a named list of timepoint-level Seurat objects (by_timepoint[[ “0h” ]], …),
a QC table with PASS/CHECK status,
RDS files.

Code

## ---- user settings / thresholds ----
min_cells_per_sample   <- 1000      # retained singlets after QC
min_genes_per_cell     <- 2000
min_umis_per_cell      <- 1000      # optional; set lower if needed
max_genes_per_cell     <- 13000     # defensive high-end cutoff
max_umis_per_cell      <- 300000    # defensive high-end cutoff
max_pct_organelle_cell <- 5         # per-cell %org (if you want an additional guard)
timepoints_required    <- c("0h","4h","6h","8h","12h")

## ---- find organelle features from GTF ----
gtf_candidates <- c(
  here::here("data/refs/BdistachyonBd21_3_10X_Genome_organelle/genes/genes.gtf.gz"),
  here::here("data/refs/BdistachyonBd21_3_10X_Genome/genes/genes.gtf.gz")
)
gtf_file <- gtf_candidates[file.exists(gtf_candidates)][1]
message("[GTF] Using: ", gtf_file)
gtf <- rtracklayer::import(gtf_file)
seqs <- as.character(GenomeInfoDb::seqnames(gtf))

## These contig IDs can be adapted; keep your earlier mapping if different
is_cp <- seqs %in% c("NC_011032","NC_011032.1")
is_mt <- seqs %in% c("NC_011033","NC_011033.1")

cp_names <- unique(gtf$gene_name[is_cp & !is.na(gtf$gene_name)])
cp_ids   <- unique(gtf$gene_id  [is_cp & !is.na(gtf$gene_id)])
mt_names <- unique(gtf$gene_name[is_mt & !is.na(gtf$gene_name)])
mt_ids   <- unique(gtf$gene_id  [is_mt & !is.na(gtf$gene_id)])

org_refs <- unique(c(cp_names, cp_ids, mt_names, mt_ids))
org_refs_dash <- gsub("_","-", org_refs, fixed = TRUE)

is_org_feature <- function(genes) {
  genes %in% org_refs | genes %in% org_refs_dash
}

## ---- helpers: find matrix inside a run folder ----
find_matrix <- function(run_dir) {
  # typical 10x structure
  cand <- c(
    file.path(run_dir, "filtered_feature_bc_matrix"),
    file.path(run_dir, "outs", "filtered_feature_bc_matrix")
  )
  d <- cand[dir.exists(cand)][1]
  if (!is.na(d)) return(d)
  # h5 fallback
  h5 <- c(
    file.path(run_dir, "filtered_feature_bc_matrix.h5"),
    file.path(run_dir, "outs", "filtered_feature_bc_matrix.h5")
  )
  h5 <- h5[file.exists(h5)][1]
  if (!is.na(h5)) return(h5)
  return(NA_character_)
}

## ---- parse timepoint from sample name ----
# expects tokens like "..._0h_org", "..._4h_A_org", etc.
# vector-safe: returns one timepoint per input element
parse_timepoint <- function(x) {
  # grab the number that is followed by optional letters/underscores and an 'h'
  tp_num <- stringr::str_extract(x, "\\d+(?=[A-Za-z_]*h)")
  ifelse(is.na(tp_num), NA_character_, paste0(tp_num, "h"))
}

## tolerant SoupX wrapper
safe_soupx_adjust <- function(run_dir, mat_fallback) {
  # try a sequence of more permissive settings
  tries <- list(
    list(tfidfMin = 0.1, soupQuantile = 0.9, nSoupGene = 100),
    list(tfidfMin = 0.0, soupQuantile = 0.95, nSoupGene = 200),
    list(tfidfMin = 0.0, soupQuantile = 1.0,  nSoupGene = 300)
  )
  for (par in tries) {
    msg <- paste0("[SoupX] Trying tfidfMin=", par$tfidfMin,
                  ", soupQuantile=", par$soupQuantile,
                  ", nSoupGene=", par$nSoupGene, " on ", basename(run_dir))
    message(msg)
    adj <- try({
      sc <- load10X(run_dir)                              # raw + filtered if present
      auto <- autoEstCont(sc,
                          tfidfMin     = par$tfidfMin,
                          soupQuantile = par$soupQuantile,
                          nSoupGene    = par$nSoupGene)
      adjustCounts(auto)
    }, silent = TRUE)
    if (!inherits(adj, "try-error")) return(adj)
  }
  message("[SoupX] Gave up on ", basename(run_dir), " → using uncorrected counts")
  return(mat_fallback)  # fallback to uncorrected counts
}

## ---- load & clean one run (returns a Seurat object of QCed singlets) ----
process_one_run <- function(run_dir) {
  mat_path <- find_matrix(run_dir)
  if (is.na(mat_path)) return(NULL)

  # 1) read counts from your folder layout (dir or .h5)
  mat <- if (dir.exists(mat_path)) Read10X(mat_path) else Read10X_h5(mat_path)

  # 2) SoupX with tolerance + fallback
  adj <- safe_soupx_adjust(run_dir, mat_fallback = mat)

  # 3) remove organelle features
  keep <- !is_org_feature(rownames(adj))
  adj_no_org <- adj[keep, , drop = FALSE]

  # 4) Seurat
  obj <- CreateSeuratObject(counts = adj_no_org, project = basename(run_dir), min.cells = 1, min.features = 1)

  # 5) Doublets (robuste)
  sce <- as.SingleCellExperiment(obj)
  sce <- tryCatch(scDblFinder(sce),
                  error = function(e) { message("[scDblFinder] ", e$message, " -> all singlets"); sce })
  if (!"scDblFinder.class" %in% colnames(SummarizedExperiment::colData(sce))) {
    obj$doublet_class <- "singlet"; obj$doublet_score <- NA_real_
  } else {
    obj$doublet_class <- sce$scDblFinder.class; obj$doublet_score <- sce$scDblFinder.score
  }

  # 6) per-cell QC via meta.data
  md <- obj@meta.data
  keep_idx <- which(
    md$nFeature_RNA >= min_genes_per_cell &
    md$nCount_RNA   >= min_umis_per_cell &
    md$nFeature_RNA <= max_genes_per_cell &
    md$nCount_RNA   <= max_umis_per_cell &
    md$doublet_class == "singlet"
  )
  if (!length(keep_idx)) { message("[QC] No cells kept in ", basename(run_dir)); return(NULL) }
  obj <- subset(obj, cells = colnames(obj)[keep_idx])

  # 7) light normalization
  obj <- NormalizeData(obj)
  obj <- FindVariableFeatures(obj)

  # annotate once (scalar) then broadcast
  sample_id_scalar <- basename(run_dir)
  obj$sample_id <- sample_id_scalar
  obj$timepoint <- parse_timepoint(sample_id_scalar)[1]

  obj
}

## ---- discover runs and process all ----
runs_all <- list.dirs(runs_base, full.names = TRUE, recursive = FALSE)
runs_org <- runs_all[str_ends(basename(runs_all), "_org")]

options(future.globals.maxSize = 100 * 1024^3)  # 8 Go

plan(multisession, workers = 20)  # marche partout, sûr et sans prise de tête

objs <- future_map(
  runs_org,
  process_one_run,
  .options  = furrr_options(seed = TRUE, scheduling = 1),
  .progress = TRUE
)

# objs <- map(runs_org, process_one_run) # for sequential analysis
names(objs) <- basename(runs_org)
objs <- discard(objs, is.null)

stopifnot(length(objs) > 0)

## ---- sample-level QC summary ----
sample_qc <- map_dfr(objs, function(o) {
  tibble(
    sample    = unique(o$sample_id),
    timepoint = unique(o$timepoint),
    n_cells   = ncol(o),
    med_genes = median(o$nFeature_RNA),
    med_umis  = median(o$nCount_RNA)
  )
})

## If you have per-sample “fraction reads in cells” or “mapped to transcriptome” from metrics CSV,
## you can join them here by sample name and use them in the pass criteria too.
## For now, we gate only on n_cells and med_genes.
sample_qc <- sample_qc %>%
  mutate(
    pass_cells = n_cells >= min_cells_per_sample,
    pass_genes = med_genes >= min_genes_per_cell,
    pass       = pass_cells & pass_genes
  )

print(sample_qc)

## ---- decide per timepoint: merge vs best replicate ----
choose_and_merge_by_timepoint <- function(objs, qc_table, required_tp) {
  stopifnot(is.list(objs), !is.null(names(objs)), all(nzchar(names(objs))))
  tps <- union(required_tp, unique(qc_table$timepoint))

  out <- list()
  decisions <- list()

  for (tp in tps) {
    cand_names <- qc_table %>% dplyr::filter(timepoint == tp) %>% dplyr::pull(sample)
    cand_objs  <- objs[cand_names]
    cand_qc    <- qc_table %>% dplyr::filter(timepoint == tp)

    if (length(cand_objs) == 0) {
      decisions[[tp]] <- tibble(timepoint = tp, decision = "MISSING", used = NA_character_)
      next
    }

    pass_idx <- which(cand_qc$pass)

    if (length(pass_idx) >= 2) {
      use <- cand_names[pass_idx]
      merged <- Reduce(
        function(a, b) merge(a, y = b, add.cell.ids = c(a = unique(a$sample_id)[1], b = unique(b$sample_id)[1])),
        cand_objs[use]
      )
      merged$timepoint <- tp
      out[[tp]] <- merged
      decisions[[tp]] <- tibble(timepoint = tp, decision = "MERGE_PASS", used = paste(use, collapse = ","))
      next
    }

    if (length(pass_idx) == 1) {
      use  <- cand_names[pass_idx]
      keep <- cand_objs[[use]]
      keep$timepoint <- tp
      out[[tp]] <- keep
      decisions[[tp]] <- tibble(timepoint = tp, decision = "KEEP_BEST_PASS", used = use)
      next
    }

    cand_qc <- cand_qc %>% dplyr::arrange(dplyr::desc(n_cells))
    # use nrow(.) instead of n()
    near <- cand_qc %>% dplyr::slice(seq_len(min(2, nrow(.))))

    if (nrow(near) >= 2) {
      use <- near$sample
      merged <- Reduce(
        function(a, b) merge(a, y = b, add.cell.ids = c(a = unique(a$sample_id)[1], b = unique(b$sample_id)[1])),
        cand_objs[use]
      )
      merged$timepoint <- tp
      out[[tp]] <- merged
      decisions[[tp]] <- tibble(timepoint = tp, decision = "MERGE_TOP2_NEAR_PASS", used = paste(use, collapse = ","))
    } else {
      use  <- cand_names[1]
      keep <- cand_objs[[use]]
      keep$timepoint <- tp
      out[[tp]] <- keep
      decisions[[tp]] <- tibble(timepoint = tp, decision = "KEEP_ONLY_AVAILABLE_FAILS", used = use)
    }
  }

  list(objects_by_tp = out, decisions = dplyr::bind_rows(decisions))
}

merged <- choose_and_merge_by_timepoint(objs, sample_qc, timepoints_required)

by_timepoint <- merged$objects   # named list: "0h","4h","6h","8h","12h"
tp_decisions <- merged$decisions
print(tp_decisions)

## ---- optional: save results ----
out_dir <- here::here("data/output/clean_objects")
dir.create(out_dir, showWarnings = FALSE, recursive = TRUE)

# Save QC decisions
readr::write_csv(sample_qc, file.path(out_dir, "sample_qc_summary.csv"))
readr::write_csv(tp_decisions, file.path(out_dir, "timepoint_merge_decisions.csv"))

# Save one RDS per timepoint
iwalk(by_timepoint, ~ saveRDS(.x, file = file.path(out_dir, paste0("Seurat_", .y, ".rds"))))
message("Saved cleaned timepoint objects to: ", out_dir)

remove ambient RNA (optional if SoupX not needed)
detect & drop doublets
re-QC / filter (nFeature, nCount, %organelle)
normalize + find variable features (LogNorm or SCT)
integrate across timepoints
dimensionality reduction (PCA/UMAP), neighbors, clustering (several resolutions)
markers per cluster (optionally with presto for speed)
first-pass annotation (marker enrichment / module scores; transfer from references if available)
export objects, metrics, plots, and marker tables

2.3.1 Normalization

Regress organelle % during normalization to reduce technical variance.
Batch correction only if you really see batch structure (UMAP colored by sample is your friend).
Doublets: 3–8% typical; filter if they cluster or confound DE.
Keep raw objects before filtering (e.g., combined_raw <- combined) when exploring thresholds.

PCA was performed using a sparse SVD decomposition (RSpectra::svds), which is mathematically equivalent to classical PCA but computationally stable for large SCT-normalized matrices. This approach is recommended for reciprocal PCA integration in Seurat and does not affect downstream clustering or biological interpretation.

Code

## ---- packages ----
library(Seurat)
library(tidyverse)
library(Matrix)
library(here)
library(rtracklayer)
library(GenomeInfoDb)
library(SingleCellExperiment)
library(scDblFinder)
## optional speed-ups (if installed)
if (requireNamespace("glmGamPoi", quietly = TRUE)) message("glmGamPoi available: SCT will be faster.")
if (requireNamespace("presto", quietly = TRUE)) message("presto available: marker finding will be faster.")

## optional ambient RNA (SoupX)
use_soupx <- TRUE
if (use_soupx) {
  stopifnot(requireNamespace("SoupX", quietly = TRUE))
}

## ---- paths ----
clean_dir <- here("data/output/clean_objects")  # where you saved Seurat_0h.rds etc.
out_dir   <- here("data/output/postmerge"); dir.create(out_dir, showWarnings = FALSE, recursive = TRUE)

## ---- load timepoint objects ----
tps <- c("0h","4h","6h","8h","12h")
by_tp <- setNames(vector("list", length(tps)), tps)
for (tp in tps) {
  fp <- file.path(clean_dir, paste0("Seurat_", tp, ".rds"))
  if (file.exists(fp)) by_tp[[tp]] <- readRDS(fp)
}
by_tp <- by_tp[!vapply(by_tp, is.null, logical(1))]
stopifnot(length(by_tp) > 0)

## ---- organelle feature sets from your organelle GTF (same logic as before) ----
gtf_candidates <- c(
  here("data/refs/BdistachyonBd21_3_10X_Genome_organelle/genes/genes.gtf.gz"),
  here("data/refs/BdistachyonBd21_3_10X_Genome/genes/genes.gtf.gz")
)
gtf_file <- gtf_candidates[file.exists(gtf_candidates)][1]
message("Using GTF for organelle features: ", gtf_file)
gtf <- rtracklayer::import(gtf_file)
seqs <- as.character(GenomeInfoDb::seqnames(gtf))
is_cp <- seqs == "NC_011032"
is_mt <- seqs %in% c("NC_011033.1", "NC_011033")
cp_features_ref <- unique(c(na.omit(gtf$gene_name[is_cp]), na.omit(gtf$gene_id[is_cp])))
mt_features_ref <- unique(c(na.omit(gtf$gene_name[is_mt]), na.omit(gtf$gene_id[is_mt])))
dash <- function(x) gsub("_","-", x, fixed = TRUE)
cp_all <- unique(c(cp_features_ref, dash(cp_features_ref)))
mt_all <- unique(c(mt_features_ref, dash(mt_features_ref)))

# v5-safe counts accessor
get_counts <- function(s, assay = DefaultAssay(s)) {
  # prefer LayerData (v5)
  out <- tryCatch(SeuratObject::LayerData(s, assay = assay, layer = "counts"),
                  error = function(e) NULL)
  if (is.null(out)) {
    out <- tryCatch(GetAssayData(s, assay = assay, layer = "counts"),
                    error = function(e) NULL)
  }
  if (is.null(out)) out <- GetAssayData(s, assay = assay, slot = "counts")  # v4 fallback
  out
}

# if multiple counts layers exist, join them into a single layer named "counts"
ensure_single_counts <- function(s, assay = "RNA") {
  if (inherits(s[[assay]], "Assay5")) {
    lyr <- SeuratObject::Layers(s[[assay]])
    has_counts <- grep("^counts(\\.|$)", lyr, value = TRUE)
    if (length(has_counts) > 1) {
      s <- JoinLayers(s, assay = assay)  # collapses sample-specific layers
    } else if (length(has_counts) == 1 && has_counts != "counts") {
      s[[assay]] <- SeuratObject::RenameLayers(s[[assay]],
                                               new.layers = setNames("counts", has_counts))
    }
  }
  s
}

add_organelle_percents <- function(s, cp_genes, mt_genes) {
  DefaultAssay(s) <- "RNA"
  s <- ensure_single_counts(s, "RNA")
  counts <- get_counts(s, "RNA")
  stopifnot(!is.null(counts))
  rn <- rownames(counts)
  cp <- intersect(rn, cp_genes)
  mt <- intersect(rn, mt_genes)
  tot   <- pmax(Matrix::colSums(counts), 1)
  pct_cp <- if (length(cp)) Matrix::colSums(counts[cp, , drop = FALSE]) / tot * 100 else rep(0, ncol(counts))
  pct_mt <- if (length(mt)) Matrix::colSums(counts[mt, , drop = FALSE]) / tot * 100 else rep(0, ncol(counts))
  s$pct_cp  <- as.numeric(pct_cp)
  s$pct_mt  <- as.numeric(pct_mt)
  s$pct_org <- s$pct_cp + s$pct_mt
  s
}

## ---- (optional) SoupX per timepoint (requires raw droplets) ----
## If you have the raw droplet matrices (not only filtered), set 'use_soupx <- TRUE'
## and point 'raw_path_fun' to each run folder that produced the timepoint object.
#use_soupx= F
#if (use_soupx) {
#  raw_path_fun <- function(tp_id) {
#    ## TODO: return path to the raw (unfiltered) matrix or 'outs' for that timepoint
#    ## e.g.: here("data/cellranger", paste0("Brachy_", tp_id, "_org"), "outs")
#    NA_character_
#  }
#  for (nm in names(by_tp)) {
#    rp <- raw_path_fun(nm)
#    if (!is.na(rp) && dir.exists(rp)) {
#      sx <- SoupX::load10X(rp)
#      sx <- SoupX::autoEstCont(sx)
#      adj <- SoupX::adjustCounts(sx)
#      by_tp[[nm]] <- CreateSeuratObject(adj, project = nm)
#    }
#  }
#}

## ---- doublets + re-QC per timepoint ----
min_genes   <- 1000
max_genes   <- 13000
min_counts  <- 1000
max_counts  <- 300000
max_pct_cp  <- 30
max_pct_mt  <- 10

# to add into article "Doublets were identified using scDblFinder. For one low-quality/low-cell sample, the doublet detection algorithm was unstable; for this sample, cells were retained as singlets and potential doublets were inspected at the clustering/QC stage." 

for (nm in names(by_tp)) {
  s <- by_tp[[nm]]

  # 1) unifier les layers de counts
  s <- ensure_single_counts(s, "RNA")

  # 2) pourcent cp/mt
  s <- add_organelle_percents(s, cp_all, mt_all)

  # 3) nettoyer les colonnes / lignes tout-zéro AVANT scDblFinder
  counts <- get_counts(s, "RNA")
  keep_cells <- Matrix::colSums(counts) > 0
  keep_genes <- Matrix::rowSums(counts) > 0
  s <- s[keep_genes, keep_cells]

  if (ncol(s) < 200) {
    message("[scDblFinder] ", nm, ": trop peu de cellules (", ncol(s), ") -> on marque tout en singlets.")
    s$doublet_class <- "singlet"
    s$doublet_score <- NA_real_
  } else {
    sce <- as.SingleCellExperiment(s)
    k_auto   <- max(5, floor(0.01 * ncol(sce)))
    dims_use <- min(30, max(10, floor(ncol(sce) / 50)))

    sce <- tryCatch(
      scDblFinder(
        sce,
        dims    = dims_use,
        k       = k_auto,
        BPPARAM = BiocParallel::SerialParam()
      ),
      error = function(e) {
        message("[scDblFinder] ERREUR sur ", nm, ": ",
                e$message, " -> on tague tout en singlets.")
        sce$scDblFinder.class <- factor("singlet", levels = c("singlet","doublet"))
        sce$scDblFinder.score <- NA_real_
        sce
      }
    )

    s$doublet_class <- sce$scDblFinder.class
    s$doublet_score <- sce$scDblFinder.score
  }

  # 4) filtres QC
  s <- subset(
    s,
    subset =
      nFeature_RNA >= min_genes & nFeature_RNA <= max_genes &
      nCount_RNA   >= min_counts & nCount_RNA   <= max_counts &
      pct_cp <= max_pct_cp & pct_mt <= max_pct_mt &
      doublet_class == "singlet"
  )

  by_tp[[nm]] <- s
}

# to tcheck 
sapply(by_tp, ncol)

## 1) Ajouter le timepoint comme meta dans chaque objet
for (nm in names(by_tp)) {
  by_tp[[nm]]$timepoint <- nm
}

## 2) Fusionner tous les timepoints dans un seul objet
# important: on merge sur l'assay RNA (par défaut)
obj_all <- Reduce(function(x, y) merge(x, y), by_tp)

## 3) Normalisation globale avec SCT sur TOUTES les cellules
DefaultAssay(obj_all) <- "RNA"
obj_all <- SCTransform(obj_all, verbose = FALSE)

if (!requireNamespace("RSpectra", quietly = TRUE)) {
  stop("Package 'RSpectra' is required for safe_pca_RS(); install it with install.packages('RSpectra').")
}

safe_pca_RS <- function(obj, assay = "SCT", npcs = 50) {
  DefaultAssay(obj) <- assay
  
  # 1) récupérer la matrice SCT "data"
  M <- SeuratObject::GetAssayData(obj, assay = assay, layer = "data")
  
  # 2) garder uniquement les gènes/colonnes valides tout en restant en sparse
  good_rows <- Matrix::rowSums(!is.finite(M)) == 0
  if (!all(good_rows)) {
    M <- M[good_rows, , drop = FALSE]
  }
  
  good_cols <- Matrix::colSums(M != 0) > 0
  if (!all(good_cols)) {
    M <- M[, good_cols, drop = FALSE]
  }
  
  # synchroniser l'objet Seurat avec cette matrice nettoyée
  obj <- obj[rownames(M), colnames(M)]
  
  # 3) PCA avec RSpectra directement sur la matrice clairsemée
  X <- Matrix::t(M)  # cells x genes (toujours sparse)
  k <- min(npcs, ncol(X) - 1, nrow(X))
  if (k < 2) stop("Pas assez de dimensions pour la PCA (k = ", k, ").")
  
  sv <- RSpectra::svds(X, k = k)
  
  # embeddings = U * D  (cells x k)
  embeddings <- sv$u %*% diag(sv$d)
  rownames(embeddings) <- rownames(X)              # cell names
  colnames(embeddings) <- paste0("PC_", seq_len(k))
  
  # loadings = V (genes x k)
  loadings <- sv$v
  rownames(loadings) <- colnames(X)                # gene names
  colnames(loadings) <- paste0("PC_", seq_len(k))
  
  obj[["pca"]] <- CreateDimReducObject(
    embeddings = embeddings,
    loadings   = loadings,
    stdev      = sv$d,
    key        = "PC_",
    assay      = assay
  )
  
  obj
}

obj_all <- safe_pca_RS(obj_all, assay = "SCT", npcs = 50)

"pca" %in% Reductions(obj_all)
# doit renvoyer TRUE

Embeddings(obj_all, "pca")[1:5, 1:3]  # juste pour voir

## 4) PCA / UMAP / clustering

obj_all <- RunUMAP(obj_all, dims = 1:30, verbose = FALSE)
obj_all <- FindNeighbors(obj_all, dims = 1:30, verbose = FALSE)
obj_all <- FindClusters(obj_all, resolution = 0.6)  # par ex.

# Ensure a consistent timepoint order (0h, 4h, 6h, 8h, 12h) for all UMAP plots
tp_levels <- c("0h", "4h", "6h", "8h", "12h")
if ("timepoint" %in% colnames(obj_all@meta.data)) {
  obj_all$timepoint <- factor(obj_all$timepoint, levels = tp_levels)
}

# Vue globale
DimPlot(obj_all, reduction = "umap", group.by = "seurat_clusters", label = TRUE)

# Voir les timepoints
DimPlot(obj_all, reduction = "umap", group.by = "timepoint")

# Suivre les clusters au cours du temps
DimPlot(
  obj_all,
  reduction = "umap",
  group.by = "seurat_clusters",
  split.by = "timepoint",
  ncol = 3
)

out_dir <- here::here("report/snrnaseq/plot")
dir.create(out_dir, showWarnings = FALSE, recursive = TRUE)

# 1) UMAP avec clusters
p1 <- DimPlot(obj_all, reduction = "umap",
              group.by = "seurat_clusters", label = TRUE)

# 2) UMAP coloré par timepoint
p2 <- DimPlot(obj_all, reduction = "umap",
              group.by = "timepoint")

# 3) UMAP split par timepoint pour suivre les clusters dans le temps
p3 <- DimPlot(
  obj_all,
  reduction = "umap",
  group.by = "seurat_clusters",
  split.by = "timepoint",
  ncol = 3
)

ggsave(file.path(out_dir, "umap_clusters.png"), plot = p1, width = 8, height = 6, dpi = 300)
ggsave(file.path(out_dir, "umap_timepoint.png"), plot = p2, width = 8, height = 6, dpi = 300)
ggsave(file.path(out_dir, "umap_clusters_by_timepoint.png"), plot = p3, width = 12, height = 8, dpi = 300)

## ---- markers per cluster (one resolution; change as you like) ----
## choisir la résolution qu’on utilise (ici 0.6)
Idents(obj_all) <- obj_all$seurat_clusters
# 0) s'assurer qu'on travaille bien sur SCT pour les marqueurs
DefaultAssay(obj_all) <- "SCT"

# 1) préparer les modèles SCT pour les tests
obj_all <- PrepSCTFindMarkers(obj_all)

# 2) puis lancer FindAllMarkers
if (requireNamespace("presto", quietly = TRUE)) {
  markers <- presto::wilcoxauc(
    obj_all,
    seurat_assay = "SCT",
    seurat_slot  = "data"
  ) %>%
    as_tibble() %>%
    group_by(group) %>%
    arrange(desc(logFC))

  write.csv(
    markers,
    file.path(out_dir, "markers_presto_res0.6.csv"),
    row.names = FALSE
  )
} else {
  markers <- FindAllMarkers(
    obj_all,
    assay          = "SCT",
    slot           = "data",
    only.pos       = TRUE,
    logfc.threshold = 0.25,
    min.pct         = 0.1
  )

  write.csv(
    markers,
    file.path(out_dir, "markers_res0.6.csv"),
    row.names = FALSE
  )
}

## ---- simple marker-based annotation scaffold (optional) ----
## Provide a named list of marker sets per putative cell type;
## these can be rice/arab markers mapped to Bd IDs. Example placeholders:
"BdiBd21-3.2G0092900.v1.2" %in% rownames(obj_all)
marker_sets <- list(
  Pericycle = c("BdiBd21-3.2G0092900.v1.2", "BdiBd21-3.1G0109300.v1.2"),
  Endodermis = c("BdiBd21-3.3G0792100.v1.2"),
  Cortex     = c("BdiBd21-3.4G0368600.v1.2")
)

DefaultAssay(obj_all) <- "RNA"
ms_col_map <- character()
for (nm in names(marker_sets)) {
  genes_ok <- intersect(marker_sets[[nm]], rownames(obj_all))
  if (length(genes_ok) < 5) {
    message("⚠️ Trop peu de gènes trouvés pour ", nm, " (", length(genes_ok), "), je saute.")
  } else {
    prev_cols <- colnames(obj_all@meta.data)
    obj_all <- AddModuleScore(
      obj_all,
      features = list(genes_ok),
      name     = paste0("MS_", nm),
      assay    = "RNA"
    )
    new_cols <- setdiff(colnames(obj_all@meta.data), prev_cols)
    if (length(new_cols)) {
      ms_col_map[new_cols] <- nm
    }
  }
}

## pick label by max module score (very rough first pass)
ms_cols <- names(ms_col_map)
if (length(ms_cols)) {
  score_mat <- as.matrix(obj_all@meta.data[, ms_cols, drop = FALSE])
  score_mat[is.na(score_mat)] <- -Inf
  best_col <- ms_cols[max.col(score_mat, ties.method = "first")]
  obj_all$celltype_guess <- unname(ms_col_map[best_col])
} else {
  obj_all$celltype_guess <- NA_character_
}

pdf(file.path(out_dir, "umap_res0.6_celltype_guess.pdf"), width = 8, height = 6)
print(DimPlot(obj_all, group.by = "celltype_guess", label = TRUE))
dev.off()

DefaultAssay(obj_all) <- "RNA"  # pour rester simple

for (cl in names(marker_sets)) {
  obj_all <- AddModuleScore(
    obj_all,
    features = list(marker_sets[[cl]]),
    name     = paste0("MS_cluster", cl),
    assay    = "RNA"
  )
}

## ---- save objects ----
## ---- save heavy objects (final) ----

# Dossier pour les objets R (séparé des plots)
obj_dir <- here::here("data/output/snrnaseq_objects")
dir.create(obj_dir, showWarnings = FALSE, recursive = TRUE)

## 1) Sauvegarder la liste complète des objets par timepoint (après QC + SCT)
saveRDS(
  by_tp,
  file = file.path(obj_dir, "by_tp_QC_SCT_list.rds")
)

## 1b) Sauvegarder aussi chaque timepoint séparément (pratique pour recharger un seul)
lapply(names(by_tp), function(nm) {
  saveRDS(
    by_tp[[nm]],
    file = file.path(obj_dir, paste0("Seurat_", nm, "_QC_SCT.rds"))
  )
})

## 2) Sauvegarder l’objet fusionné avec PCA/UMAP/clusters
# obj_all contient : timepoints fusionnés, SCT global, PCA (RSpectra), UMAP, clusters
saveRDS(
  obj_all,
  file = file.path(obj_dir, "obj_all_SCT_PCA_UMAP_clusters.rds")
)

## 3) Sauvegarder les marqueurs si l’objet `markers` existe
if (exists("markers")) {
  write.csv(
    markers,
    file = file.path(obj_dir, "markers_res0.6.csv"),
    row.names = FALSE
  )
}

## 4) (optionnel) Exporter les coordonnées UMAP en CSV
if ("umap" %in% Seurat::Reductions(obj_all)) {
  umap_df <- Embeddings(obj_all, "umap") |>
    as.data.frame() |>
    tibble::rownames_to_column("cell_id")
  
  write.csv(
    umap_df,
    file = file.path(obj_dir, "umap_coordinates_res0.6.csv"),
    row.names = FALSE
  )
}

saveRDS(integrated, file = file.path(out_dir, "integrated_SCT_pca_umap_neighbors.rds"))
lapply(names(clustered), function(nm) saveRDS(clustered[[nm]], file = file.path(out_dir, paste0("integrated_", nm, ".rds"))))

2.4 Post-cleaning sanity checks

Code

library(Seurat)
library(tidyverse)
library(future)

# 1) pas de parallélisation pour cette étape
future::plan("sequential")

# 2) augmenter la limite (par ex. 100 Go)
options(future.globals.maxSize = 20 * 1024^3)

obj_dir <- here::here("data/output/snrnaseq_objects/")  # ou le chemin que tu as utilisé

# Reload the merged object to explore results without rerunning the full pipeline
obj_all <- readRDS(file.path(obj_dir, "obj_all_SCT_PCA_UMAP_clusters.rds"))

# Enforce a consistent timepoint order for UMAP panels (0h, 4h, 6h, 8h, 12h)
tp_levels <- c("0h", "4h", "6h", "8h", "12h")
if ("timepoint" %in% colnames(obj_all@meta.data)) {
  obj_all$timepoint <- factor(obj_all$timepoint, levels = tp_levels)
}

# UMAP by clusters
DimPlot(obj_all, reduction = "umap", group.by = "seurat_clusters", label = TRUE)

# UMAP by timepoint
DimPlot(obj_all, reduction = "umap", group.by = "timepoint")

# UMAP split by timepoint (facet per timepoint, fixed order)
DimPlot(
  obj_all,
  reduction = "umap",
  group.by = "seurat_clusters",
  split.by = "timepoint",
  ncol = 3
)

# Pull QC exports for quick inspection / downstream plotting
markers <- read.csv(file.path(obj_dir, "markers_res0.6.csv"))
head(markers)

umap_df <- read.csv(file.path(obj_dir, "umap_coordinates_res0.6.csv"))

head(umap_df)
# cell_id, UMAP_1, UMAP_2

UMAP split by timepoint (umap_clusters_by_timepoint_diag.png)
Shows the same UMAP layout colored by clusters, facetted for 0h, 4h, 6h, 8h and 12h. The goal is to check that each timepoint reuses the same global structures, rather than forming isolated “islands” that would suggest strong batch effects.

2.5 Cluster coherence diagnostics

Code

# Enforce a consistent timepoint order for all plots
tp_levels <- c("0h", "4h", "6h", "8h", "12h")
if ("timepoint" %in% colnames(obj_all@meta.data)) {
  obj_all$timepoint <- factor(obj_all$timepoint, levels = tp_levels)
}

# 1) UMAP facetted by timepoint (fixed order) to check that structures are reused
p_umap_tp <- DimPlot(
  obj_all,
  reduction = "umap",
  group.by  = "seurat_clusters",
  split.by  = "timepoint",
  ncol      = 3,
  order     = TRUE
)
fig_export(
  path    = "report/snrnaseq/plot/cleaning/umap_clusters_by_timepoint_diag",
  plot_x  = p_umap_tp,
  height_i = 6,
  width_i  = 12,
  res_i    = 300,
  format   = "png"
)

# 2) Cluster composition per timepoint (barplot of proportions)
cl_tp <- obj_all@meta.data |>
  tibble::as_tibble(rownames = "cell_id") |>
  dplyr::count(timepoint, seurat_clusters) |>
  dplyr::group_by(timepoint) |>
  dplyr::mutate(prop = n / sum(n))

p_comp <- ggplot(cl_tp, aes(x = timepoint, y = prop, fill = seurat_clusters)) +
  geom_col(width = 0.8, color = "white", size = 0.1) +
  scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
  labs(
    x = "Timepoint",
    y = "Cluster proportion",
    title = "Cluster composition per timepoint"
  ) +
  theme_bw() +
  theme(legend.position = "right")
fig_export(
  path    = "report/snrnaseq/plot/cleaning/cluster_proportion",
  plot_x  = p_comp,
  height_i = 6,
  width_i  = 8,
  res_i    = 300,
  format   = "png"
)

# 3) QC gradients on UMAP to ensure no technical drift drives the embedding
qc_features <- intersect(c("nCount_RNA", "nFeature_RNA", "pct_cp", "pct_mt"), colnames(obj_all@meta.data))
# Keep only QC features that actually vary; constants (e.g. pct_cp = 0 for all cells) are not informative
qc_features <- qc_features[
  vapply(
    qc_features,
    function(f) stats::var(obj_all@meta.data[[f]], na.rm = TRUE) > 0,
    logical(1)
  )
]
if (length(qc_features)) {
  p_qc_umap <- FeaturePlot(
    obj_all,
    features  = qc_features,
    reduction = "umap",
    order     = TRUE,
    ncol      = 2
  ) +
    theme_bw()
  fig_export(
    path    = "report/snrnaseq/plot/cleaning/qc_umap",
    plot_x  = p_qc_umap,
    height_i = 6,
    width_i  = 8,
    res_i    = 300,
    format   = "png"
  )
}

# 4) Cluster-level QC summary (median library size / genes / organelle %)
qc_summary <- obj_all@meta.data |>
  tibble::as_tibble(rownames = "cell_id") |>
  dplyr::group_by(seurat_clusters) |>
  dplyr::summarise(
    med_nCount = median(nCount_RNA, na.rm = TRUE),
    med_nFeat  = median(nFeature_RNA, na.rm = TRUE),
    med_pct_cp = median(pct_cp, na.rm = TRUE),
    med_pct_mt = median(pct_mt, na.rm = TRUE)
  ) |>
  tidyr::pivot_longer(-seurat_clusters, names_to = "metric", values_to = "median")

p_qc_summary <- ggplot(qc_summary, aes(x = seurat_clusters, y = median, fill = metric)) +
  geom_col(position = "dodge") +
  labs(
    x = "Cluster",
    y = "Median QC value",
    title = "Per-cluster QC medians"
  ) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
fig_export(
  path    = "report/snrnaseq/plot/cleaning/qc_cluster_medians",
  plot_x  = p_qc_summary,
  height_i = 6,
  width_i  = 8,
  res_i    = 300,
  format   = "png"
)

Cluster composition per timepoint (cluster_proportion.png)
Stacked barplot of cluster proportions in each timepoint. This helps verify that major clusters are contributed by several timepoints and that no cluster is driven by a single timepoint only (which could indicate a technical artefact or an over‑specific cluster).

QC gradients on UMAP (qc_umap.png)
UMAP colored by nCount_RNA and nFeature_RNA. The absence of large continuous gradients across the embedding indicates that depth and gene detection are not the main drivers of the UMAP structure.

Per-cluster QC medians (qc_cluster_medians.png)
For each cluster, shows median library size, number of detected genes, and organelle percentages. This is used to flag clusters with extreme QC values (very low complexity, very high counts, or high organelle content) that might correspond to low-quality cells, doublets, or debris.