---
output: html_document
editor_options:
chunk_output_type: console
---
## Cleaning & Normalization
```{r}
# ---- packages ----
library(Seurat)
library(SoupX)
library(scDblFinder)
library(Matrix)
library(rtracklayer)
library(GenomeInfoDb)
library(tidyverse) # also attaches ggplot2, forcats, readr, purrr
library(here)
library(readxl)
library(gt)
library(paletteer)
library(scales)
library(DT)
library(htmlwidgets)
library(fs)
library(glmGamPoi)
library(furrr)
library(future)
if (requireNamespace("glmGamPoi", quietly = TRUE)) message("glmGamPoi available: SCT will be faster.")
if (requireNamespace("presto", quietly = TRUE)) message("presto available: marker finding will be faster.")
# src
source(here::here("src/function/stat_function/stat_analysis_main.R")) # for make plot
source(here::here("src/function/fig_export.R")) # This function saves a given plot (plot_x) as both a PDF and a high-resolution PNG file at specified dimensions.
runs_base <- "/mnt/ibioldata/LBMC_data/cmaslard/data/ScBdLR_init_2025_private/cellranger/"
#runs_base <- "/Volumes/IBIOLdata_storage/LBMC_data/cmaslard/data/ScBdLR_init_2025_private/cellranger/"
```
A sample is considered ‘good’ if:
- Estimated Number of Cells: 627–14k. Anything \<\~1–2k is low yield for droplet-based sc/snRNA-seq.
- Mean Reads per Cell: \~28k–120k. High depth (\>80k) only helps if it translates into more genes/UMIs.
- Median Genes per Cell / Median UMI per Cell: \~1.4k–6.7k genes and \~1.9k–6.7k UMIs. ≥2–3k genes/cell is a good floor in plants; 1.4–2.0k is weak.
- Valid Barcodes / Valid UMIs: \~92–98% / \~100% — excellent across the board.
- Sequencing Saturation: 52–82%. \>70–80% means diminishing returns from more depth; \~50–65% still has room if you need more molecules.
- Q30 (Barcode/RNA/UMI): \~92–95% — very good.
- Reads Mapped (Genome / Confidently to Genome): \~93–96% / \~49–86% — generally solid.
- Confidently Intergenic/Intronic/Exonic: Exonic varies a lot (34–72%). Exonic \<\~50% suggests more non-transcriptome signal (ambient RNA, pre-mRNA bias, annotation mismatch).
- Confidently to Transcriptome: \~37–73%. Good runs are ≥60–70%. \<50–55% is a red flag for downstream expression analysis.
- Reads Antisense to Gene: \~3–6% — fine.
- Fraction Reads in Cells: \~63–80%. ≥70% is healthy; low 60s suggests ambient RNA / many empty droplets.
- Total Genes Detected: \~31–37k — consistent.
- High mapped read rate (\>70–80% of expected genome/mRNA).
- Median number of genes detected per cell/nucleus is within the expected range:
- Whole cells: \~1,500–5,000 genes
- Nuclei: often lower (500–2,500), but consistent between samples.
- Low proportion of organelle UMIs (chloroplast + mitochondrion) → typically \<5–10%, otherwise a lot of debris/ambient RNA.
- Low proportion of ribosomal reads (depending on capture).
- Doublets/minor artefacts ≤5%.
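These gates can be encoded directly against the metrics table assembled in the next chunk. A minimal sketch, assuming Cell Ranger's `metrics_summary.csv` column names and that percentage columns have already been parsed to numeric proportions (as done further down); the thresholds mirror the list above and should be tuned per experiment:
```{r, eval=FALSE}
# Hedged sketch: flag runs against the sample-level gates listed above.
# Assumes percentage columns were parsed to proportions (0..1) and count
# columns to numerics; column names may differ between Cell Ranger versions.
flag_sample_qc <- function(metrics) {
  metrics |>
    dplyr::mutate(
      pass_cells    = `Estimated Number of Cells` >= 1000,
      pass_genes    = `Median Genes per Cell` >= 2000,
      pass_txome    = `Reads Mapped Confidently to Transcriptome` >= 0.55,
      pass_in_cells = `Fraction Reads in Cells` >= 0.70,
      qc_status     = dplyr::if_else(
        pass_cells & pass_genes & pass_txome & pass_in_cells, "PASS", "CHECK"
      )
    )
}
# usage: flag_sample_qc(all_tbl) |> dplyr::count(qc_status)
```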
## Summary of the various datasets (see also the previous page)
```{r, eval=F}
runs <- list.dirs(runs_base, full.names = TRUE, recursive = FALSE)
runs_org <- runs[str_ends(basename(runs), "_org")]
file_name <- "metrics_summary.csv"
all_tbl <- tibble(path = paste0(runs_org,"/",file_name)) %>%
mutate(filename = path_file(path),
dir = path_dir(path),
sample = path_file(runs_org)) %>%
dplyr::mutate(data = map(path, ~ readr::read_csv(.x, show_col_types = FALSE))) |>
dplyr::select(sample, data) %>%
unnest(data)
out_dir <- here::here("data/output/verification")
dir.create(out_dir, showWarnings = FALSE, recursive = TRUE)
write.csv(all_tbl,
file.path(out_dir, "all_sample_metrics_summary.csv"),
row.names = FALSE)
```
```{r}
# ---- Palettes & options ----
c_col <- c("#ffffff", paletteer::paletteer_dynamic("cartography::green.pal", 20) |> as.character())
c_col_light_blue <- c("#edf2fb", "#e2eafc", "#d7e3fc", "#ccdbfd", "#c1d3fe")
c_table_width <- gt::px(900)
c_save <- FALSE
# ---- Read data ----
out_dir <- here::here("data/output/verification")
csv_path <- file.path(out_dir, "all_sample_metrics_summary.csv")
tbl <- readr::read_csv(csv_path, show_col_types = FALSE)
stopifnot("sample" %in% names(tbl))
# ---- 1) Convert percentage columns "xx.x%" -> numeric proportions (0..1) ----
pct_cols <- names(tbl)[map_lgl(tbl, ~ is.character(.x) && any(str_detect(.x, "%"), na.rm = TRUE))]
tbl <- tbl |>
mutate(across(all_of(pct_cols),
~ readr::parse_number(.x, locale = readr::locale(grouping_mark = ",")) / 100))
# (optional) convert other numeric-as-text columns (without %)
# num_like_cols <- names(tbl)[map_lgl(tbl, ~ is.character(.x) && all(str_detect(.x, "^[0-9,\\.]+$") | is.na(.x)))]
# tbl <- tbl |> mutate(across(all_of(num_like_cols),
# ~ readr::parse_number(.x, locale = readr::locale(grouping_mark=","))))
# ---- 2) Build the gt table: each numeric column gets its own color scale ----
num_cols <- setdiff(names(select(tbl, where(is.numeric))), "sample")
view_tbl <- select(tbl, sample, all_of(num_cols))
gt_table <- gt(view_tbl, rowname_col = "sample") |>
tab_options(table.width = c_table_width)
# Heuristic: "count" columns (large numbers) -> blue; % -> percent format; others -> green
count_like_pat <- "(^n_|count|cells|reads|num$|number)"
gt_table <- reduce(num_cols, .init = gt_table, .f = function(g, col) {
dom <- range(view_tbl[[col]], na.rm = TRUE)
if (!is.finite(dom[1]) || !is.finite(dom[2])) dom <- c(0, 0) # safety guard
is_percent <- col %in% pct_cols
is_count <- stringr::str_detect(col, count_like_pat) && !is_percent
g1 <- if (is_percent) {
fmt_percent(g, columns = dplyr::all_of(col), decimals = 1, scale_values = TRUE)
} else {
fmt_number(g, columns = dplyr::all_of(col), decimals = if (is_count) 0 else 1)
}
data_color(
g1,
columns = dplyr::all_of(col),
colors = scales::col_numeric(
palette = if (is_count) c_col_light_blue else c_col,
domain = dom
)
)
})
if (isTRUE(c_save)) {
gtsave(gt_table, filename = file.path(out_dir, "all_sample_metrics_summary_gt.html"))
}
gt_table
```
## Calculate the % of chloroplasts and mitochondria
- Chloroplast: contig NC_011032.
- Mitochondria: accept the common contig patterns NC_011033.1 and NC_011033 (works for the Brachypodium or rice fallback reference).
```{r, eval = F}
# ---- 0) Pick a GTF (prefer organelle ref, fallback to nuclear) ----
gtf_candidates <- c(
here::here("data/refs/BdistachyonBd21_3_10X_Genome_organelle/genes/genes.gtf.gz"),
here::here("data/refs/BdistachyonBd21_3_10X_Genome/genes/genes.gtf.gz")
)
gtf_file <- gtf_candidates[file.exists(gtf_candidates)][1]
message("Using GTF: ", gtf_file)
gtf <- rtracklayer::import(gtf_file)
# ---- 1) Define organelle gene feature sets from the GTF ----
seqs <- as.character(GenomeInfoDb::seqnames(gtf))
# Chloroplast contig was normalized during build to "NC_011032"
is_cp <- seqs == "NC_011032"
# Mitochondria: accept common patterns (works for Brachy or Rice fallback)
is_mt <- seqs %in% c("NC_011033.1", "NC_011033")
cp_names <- unique(gtf$gene_name[is_cp & !is.na(gtf$gene_name)])
cp_ids <- unique(gtf$gene_id [is_cp & !is.na(gtf$gene_id)])
mt_names <- unique(gtf$gene_name[is_mt & !is.na(gtf$gene_name)])
mt_ids <- unique(gtf$gene_id [is_mt & !is.na(gtf$gene_id)])
cp_features_ref <- unique(c(cp_names, cp_ids))
mt_features_ref <- unique(c(mt_names, mt_ids))
# Seurat often converts "_" -> "-" in rownames; prepare both variants
dash <- function(x) gsub("_", "-", x, fixed = TRUE)
cp_features_ref_dash <- dash(cp_features_ref)
mt_features_ref_dash <- dash(mt_features_ref)
# ---- Helpers ----
# robust counts extractor (Seurat v4/v5)
get_counts <- function(obj, assay = DefaultAssay(obj)) {
out <- tryCatch(GetAssayData(obj, assay = assay, layer = "counts"),
error = function(e) NULL)
if (is.null(out)) out <- GetAssayData(obj, assay = assay, slot = "counts")
out
}
# find matrix in a run folder (works with or without "outs/")
find_matrix <- function(run_dir) {
cand <- c(file.path(run_dir, "filtered_feature_bc_matrix"),
file.path(run_dir, "outs", "filtered_feature_bc_matrix"))
d <- cand[dir.exists(cand)][1]
if (is.na(d)) {
# also try HDF5
h5 <- c(file.path(run_dir, "filtered_feature_bc_matrix.h5"),
file.path(run_dir, "outs", "filtered_feature_bc_matrix.h5"))
h5 <- h5[file.exists(h5)][1]
if (!is.na(h5)) return(h5)
return(NULL)
}
d
}
# compute % chloro / % mito for one run
percent_organelle_one <- function(run_dir) {
mat_path <- find_matrix(run_dir)
if (is.null(mat_path)) return(NULL)
if (dir.exists(mat_path)) {
mat <- Read10X(mat_path)
} else if (grepl("\\.h5$", mat_path)) {
mat <- Read10X_h5(mat_path)
} else {
return(NULL)
}
s <- CreateSeuratObject(mat, project = basename(run_dir))
counts <- get_counts(s)
rn <- rownames(counts)
# Match features (support "_"->"-")
feats_cp <- intersect(rn, unique(c(cp_features_ref, cp_features_ref_dash)))
feats_mt <- intersect(rn, unique(c(mt_features_ref, mt_features_ref_dash)))
tot <- pmax(Matrix::colSums(counts), 1)
pct_cp <- if (length(feats_cp)) Matrix::colSums(counts[feats_cp, , drop = FALSE]) / tot * 100 else rep(0, ncol(counts))
pct_mt <- if (length(feats_mt)) Matrix::colSums(counts[feats_mt, , drop = FALSE]) / tot * 100 else rep(0, ncol(counts))
tibble(
sample = basename(run_dir),
cell = colnames(counts),
pct_chloroplast = as.numeric(pct_cp),
pct_mito = as.numeric(pct_mt),
pct_organelle = as.numeric(pct_cp + pct_mt)
)
}
# ---- 2) Apply to all local runs that end with "_org" ----
runs_all <- list.dirs(runs_base, full.names = TRUE, recursive = FALSE)
runs_org <- runs_all[str_ends(basename(runs_all), "_org")]
df <- purrr::map_dfr(runs_org, percent_organelle_one)
stopifnot(nrow(df) > 0)
# tidy & ordering
df <- df %>% filter(!is.na(pct_organelle))
df <- df %>% mutate(sample = factor(sample),
sample_f = forcats::fct_reorder(sample, pct_organelle, .fun = median))
# ---- 3) Violin plots WITH points ----
set.seed(123)
max_points <- 5000
df_points <- if (nrow(df) > max_points) dplyr::sample_n(df, max_points) else df
# A) Faceted by run (chloroplast vs mito vs total)
df_long <- df %>%
pivot_longer(c(pct_chloroplast, pct_mito, pct_organelle),
names_to = "metric", values_to = "percent")
p_facets <- ggplot(df_long, aes(x = "cells", y = percent)) +
geom_violin(scale = "width", trim = TRUE) +
geom_boxplot(width = 0.12, outlier.shape = NA, alpha = 0.5) +
geom_point(
data = df_points %>% pivot_longer(c(pct_chloroplast, pct_mito, pct_organelle),
names_to = "metric", values_to = "percent"),
aes(x = "cells", y = percent),
position = position_jitter(width = 0.08, height = 0),
size = 0.4, alpha = 0.35
) +
facet_wrap(~ sample_f + metric, scales = "free_y") +
labs(
title = "Organelle RNA content per run",
x = "",
y = "Percent of UMI counts per nucleus (%)"
) +
theme_bw() +
theme(strip.text = element_text(face = "bold"),
axis.text.x = element_blank(),
panel.grid.minor = element_blank())
print(p_facets)
# B) All runs side-by-side (one panel per metric)
p_side <- ggplot(df_long %>% filter(metric != "pct_organelle"), aes(x = sample, y = percent)) +
geom_violin(scale = "width", trim = TRUE) +
geom_boxplot(width = 0.12, outlier.shape = NA, alpha = 0.5) +
geom_point(
data = df_points %>%
pivot_longer(c(pct_chloroplast, pct_mito, pct_organelle),
names_to = "metric", values_to = "percent") %>%
filter(metric != "pct_organelle") %>%
mutate(sample_f = forcats::fct_reorder(sample, percent, .fun = median)),
aes(x = sample_f, y = percent),
position = position_jitter(width = 0.15, height = 0),
size = 0.4, alpha = 0.35
) +
facet_wrap(~ metric, scales = "free_y") +
labs(
title = "Organelle RNA content across runs",
x = "Sample (run)",
y = "Percent of UMI counts per nucleus (%)"
) +
theme_bw() +
theme(axis.text.x = element_text(angle = 45, hjust = 1),
panel.grid.minor = element_blank())
print(p_side)
fig_export(path = "report/snrnaseq/plot/cleaning/n_organelle", plot_x = p_side, height_i = 6, width_i = 16, res = 600)
# ---- 4) Numeric summary per run ----
summary_tbl <- df %>%
group_by(sample) %>%
summarise(
n_cells = n(),
median_pct_chloro = median(pct_chloroplast),
mean_pct_chloro = mean(pct_chloroplast),
max_pct_chloro = max(pct_chloroplast),
median_pct_mito = median(pct_mito),
mean_pct_mito = mean(pct_mito),
max_pct_mito = max(pct_mito),
median_pct_org = median(pct_organelle),
mean_pct_org = mean(pct_organelle),
max_pct_org = max(pct_organelle)
) %>% arrange(median_pct_org)
summary_tbl
# Optional: save summary
out_dir <- here::here("data/output/verification")
dir.create(out_dir, showWarnings = FALSE, recursive = TRUE)
write.csv(summary_tbl,
file.path(out_dir, "percent_organelle_summary.csv"),
row.names = FALSE)
```
{fig-align="center"}
Organelle quality control: as a starting point, aim for ≤5–10% organelle UMIs per nucleus and inspect the distributions per sample. All samples meet this expectation.
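If a hard per-nucleus cut is needed later, the guideline maps to a single subset call; a minimal sketch, assuming the `pct_org` metadata column added further down by `add_organelle_percents()`:
```{r, eval=FALSE}
# Hedged sketch: enforce the 5-10% organelle guideline per nucleus.
# Assumes pct_org is in meta.data (added by add_organelle_percents() below).
obj <- subset(obj, subset = pct_org <= 10)
```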
```{r, RootTipsNumber}
# ---- Constants (palette & sizes) ----
c_col <- c("#ffffff", paletteer::paletteer_dynamic("cartography::green.pal", 20) |> as.character())
c_col_light_blue <- c("#edf2fb", "#e2eafc", "#d7e3fc", "#ccdbfd", "#c1d3fe")
c_table_width <- gt::px(800)
c_save <- FALSE
# ---- Read: only the CSV created earlier ----
out_dir <- here::here("data/output/verification")
csv_path <- file.path(out_dir, "percent_organelle_summary.csv")
summary_tbl <- readr::read_csv(csv_path, show_col_types = FALSE) |>
arrange(median_pct_org)
# Domains for the color scales
max_org <- max(summary_tbl$mean_pct_org, na.rm = TRUE)
max_cells <- max(summary_tbl$n_cells, na.rm = TRUE)
# ---- 1) Static "gt" table (same look & feel as above) ----
gt_table <- summary_tbl |>
dplyr::select(sample, n_cells, median_pct_org, mean_pct_org, max_pct_org, median_pct_chloro, mean_pct_chloro, max_pct_chloro, median_pct_mito, mean_pct_mito, max_pct_mito) %>%
gt(rowname_col = "sample") |>
fmt_number(
columns = c(median_pct_org, mean_pct_org ,
median_pct_chloro, mean_pct_chloro,
median_pct_mito, mean_pct_mito
),
decimals = 1
) |>
fmt_number(columns = n_cells, decimals = 0) |>
data_color(
columns = c(median_pct_org, mean_pct_org),
colors = scales::col_numeric(palette = c_col, domain = c(max_org, 0)) # reversed gradient (high -> dark)
) |>
data_color(
columns = n_cells,
colors = scales::col_numeric(palette = c_col_light_blue, domain = c(0, max_cells))
) |>
tab_options(
table.width = c_table_width,
heading.subtitle.font.size = 10,
column_labels.border.bottom.width = gt::px(3)
)
if (isTRUE(c_save)) {
gtsave(gt_table, filename = file.path(out_dir, "percent_organelle_summary_gt.html"))
}
gt_table
# ---- 2) Interactive version with sort arrows + filters (DT) ----
dt_table <- DT::datatable(
summary_tbl,
rownames = FALSE,
filter = "top", # boîtes de filtre en tête de colonnes
options = list(
pageLength = 30,
autoWidth = TRUE,
order = list(list(which(names(summary_tbl) == "median_pct_org") - 1, "asc"))
),
class = "stripe hover order-column display"
) |>
# small background bar for "median_pct_org" (optional)
DT::formatStyle(
"median_pct_org",
background = styleColorBar(c(0, max_org), c_col[length(c_col)]),
backgroundSize = "100% 90%",
backgroundRepeat = "no-repeat",
backgroundPosition = "center"
)
if (isTRUE(c_save)) {
htmlwidgets::saveWidget(
dt_table,
file = file.path(out_dir, "percent_organelle_summary_interactive.html"),
selfcontained = TRUE
)
}
```
This matrix folder is then loaded into R/Seurat.
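A minimal sketch of that loading step, with an illustrative run-folder name (the full pipeline below wraps this logic in `find_matrix()` and `process_one_run()`):
```{r, eval=FALSE}
# Minimal sketch: load one filtered matrix folder and build a Seurat object.
# "example_run_org" is illustrative; use one of the *_org run folders.
mat <- Seurat::Read10X(file.path(runs_base, "example_run_org", "filtered_feature_bc_matrix"))
obj <- Seurat::CreateSeuratObject(mat, project = "example_run_org")
```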
## Cleaning and selecting good samples
- Removes chloroplast and mitochondrial genes from the count matrices (using your GTF; with a fallback),
- Applies ambient RNA correction (SoupX) + doublet removal (scDblFinder),
- Runs per-cell QC and per-sample QC,
- Decides, for each timepoint (0h, 4h, 6h, 8h, 12h), whether to:
- Keep the best replicate only, or
- Merge replicates when they are acceptable but individually small,
- And guarantees you end up with one object per timepoint (by merging if needed).
I’ve kept it modular so you can tweak thresholds quickly.
**What the script does**
1. Discover runs (\*\_org) and parse a timepoint label from each sample name (0h, 4h, 6h, 8h, 12h).
2. Identify organelle features from the GTF (chloroplast and mitochondria contigs). Those rows are removed from the count matrices before QC.
3. For each run:
    - load 10x matrices (Read10X/Read10X_h5),
    - SoupX: estimate and correct ambient RNA,
    - build a Seurat object,
    - label %cp / %mt (only to measure; counts already have cp/mt removed),
    - scDblFinder: remove doublets,
    - apply cell-level QC (min features/UMIs; optional %organelle threshold if you want).
4. Summarize sample-level QC (cells kept, median genes/UMIs, fraction reads in cells if available, etc.).
5. Decide per timepoint:
    - Mark each replicate PASS if it meets minimal sample-level criteria (e.g., ≥2k cells retained, median genes/cell ≥2k, transcriptome/confident exonic proxy if present).
    - If multiple replicates PASS → merge them.
    - If exactly one PASSes → keep that best replicate.
    - If none PASS but some are "near-pass" → optionally merge the top near-pass replicates to salvage the timepoint.
6. Output:
    - a named list of timepoint-level Seurat objects (by_timepoint\[\[ "0h" \]\], …),
    - a QC table with PASS/CHECK status,
    - RDS files.
```{r, eval =F}
## ---- user settings / thresholds ----
min_cells_per_sample <- 1000 # retained singlets after QC
min_genes_per_cell <- 2000
min_umis_per_cell <- 1000 # optional; set lower if needed
max_genes_per_cell <- 13000 # defensive high-end cutoff
max_umis_per_cell <- 300000 # defensive high-end cutoff
max_pct_organelle_cell <- 5 # per-cell %org (if you want an additional guard)
timepoints_required <- c("0h","4h","6h","8h","12h")
## ---- find organelle features from GTF ----
gtf_candidates <- c(
here::here("data/refs/BdistachyonBd21_3_10X_Genome_organelle/genes/genes.gtf.gz"),
here::here("data/refs/BdistachyonBd21_3_10X_Genome/genes/genes.gtf.gz")
)
gtf_file <- gtf_candidates[file.exists(gtf_candidates)][1]
message("[GTF] Using: ", gtf_file)
gtf <- rtracklayer::import(gtf_file)
seqs <- as.character(GenomeInfoDb::seqnames(gtf))
## These contig IDs can be adapted; keep your earlier mapping if different
is_cp <- seqs %in% c("NC_011032","NC_011032.1")
is_mt <- seqs %in% c("NC_011033","NC_011033.1")
cp_names <- unique(gtf$gene_name[is_cp & !is.na(gtf$gene_name)])
cp_ids <- unique(gtf$gene_id [is_cp & !is.na(gtf$gene_id)])
mt_names <- unique(gtf$gene_name[is_mt & !is.na(gtf$gene_name)])
mt_ids <- unique(gtf$gene_id [is_mt & !is.na(gtf$gene_id)])
org_refs <- unique(c(cp_names, cp_ids, mt_names, mt_ids))
org_refs_dash <- gsub("_","-", org_refs, fixed = TRUE)
is_org_feature <- function(genes) {
genes %in% org_refs | genes %in% org_refs_dash
}
## ---- helpers: find matrix inside a run folder ----
find_matrix <- function(run_dir) {
# typical 10x structure
cand <- c(
file.path(run_dir, "filtered_feature_bc_matrix"),
file.path(run_dir, "outs", "filtered_feature_bc_matrix")
)
d <- cand[dir.exists(cand)][1]
if (!is.na(d)) return(d)
# h5 fallback
h5 <- c(
file.path(run_dir, "filtered_feature_bc_matrix.h5"),
file.path(run_dir, "outs", "filtered_feature_bc_matrix.h5")
)
h5 <- h5[file.exists(h5)][1]
if (!is.na(h5)) return(h5)
return(NA_character_)
}
## ---- parse timepoint from sample name ----
# expects tokens like "..._0h_org", "..._4h_A_org", etc.
# vector-safe: returns one timepoint per input element
parse_timepoint <- function(x) {
# grab the number that is followed by optional letters/underscores and an 'h'
tp_num <- stringr::str_extract(x, "\\d+(?=[A-Za-z_]*h)")
ifelse(is.na(tp_num), NA_character_, paste0(tp_num, "h"))
}
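# e.g. parse_timepoint(c("Brachy_0h_org", "Brachy_4h_A_org", "Brachy_12h_org"))
# returns c("0h", "4h", "12h") (sample names here are illustrative)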
## tolerant SoupX wrapper
safe_soupx_adjust <- function(run_dir, mat_fallback) {
# try a sequence of more permissive settings
tries <- list(
list(tfidfMin = 0.1, soupQuantile = 0.9, nSoupGene = 100),
list(tfidfMin = 0.0, soupQuantile = 0.95, nSoupGene = 200),
list(tfidfMin = 0.0, soupQuantile = 1.0, nSoupGene = 300)
)
for (par in tries) {
msg <- paste0("[SoupX] Trying tfidfMin=", par$tfidfMin,
", soupQuantile=", par$soupQuantile,
", nSoupGene=", par$nSoupGene, " on ", basename(run_dir))
message(msg)
adj <- try({
sc <- load10X(run_dir) # raw + filtered if present
auto <- autoEstCont(sc,
tfidfMin = par$tfidfMin,
soupQuantile = par$soupQuantile,
nSoupGene = par$nSoupGene)
adjustCounts(auto)
}, silent = TRUE)
if (!inherits(adj, "try-error")) return(adj)
}
message("[SoupX] Gave up on ", basename(run_dir), " → using uncorrected counts")
return(mat_fallback) # fallback to uncorrected counts
}
## ---- load & clean one run (returns a Seurat object of QCed singlets) ----
process_one_run <- function(run_dir) {
mat_path <- find_matrix(run_dir)
if (is.na(mat_path)) return(NULL)
# 1) read counts from your folder layout (dir or .h5)
mat <- if (dir.exists(mat_path)) Read10X(mat_path) else Read10X_h5(mat_path)
# 2) SoupX with tolerance + fallback
adj <- safe_soupx_adjust(run_dir, mat_fallback = mat)
# 3) remove organelle features
keep <- !is_org_feature(rownames(adj))
adj_no_org <- adj[keep, , drop = FALSE]
# 4) Seurat
obj <- CreateSeuratObject(counts = adj_no_org, project = basename(run_dir), min.cells = 1, min.features = 1)
# 5) Doublet detection (robust to failures)
sce <- as.SingleCellExperiment(obj)
sce <- tryCatch(scDblFinder(sce),
error = function(e) { message("[scDblFinder] ", e$message, " -> all singlets"); sce })
if (!"scDblFinder.class" %in% colnames(SummarizedExperiment::colData(sce))) {
obj$doublet_class <- "singlet"; obj$doublet_score <- NA_real_
} else {
obj$doublet_class <- sce$scDblFinder.class; obj$doublet_score <- sce$scDblFinder.score
}
# 6) per-cell QC via meta.data
md <- obj@meta.data
keep_idx <- which(
md$nFeature_RNA >= min_genes_per_cell &
md$nCount_RNA >= min_umis_per_cell &
md$nFeature_RNA <= max_genes_per_cell &
md$nCount_RNA <= max_umis_per_cell &
md$doublet_class == "singlet"
)
if (!length(keep_idx)) { message("[QC] No cells kept in ", basename(run_dir)); return(NULL) }
obj <- subset(obj, cells = colnames(obj)[keep_idx])
# 7) light normalization
obj <- NormalizeData(obj)
obj <- FindVariableFeatures(obj)
# annotate once (scalar) then broadcast
sample_id_scalar <- basename(run_dir)
obj$sample_id <- sample_id_scalar
obj$timepoint <- parse_timepoint(sample_id_scalar)[1]
obj
}
## ---- discover runs and process all ----
runs_all <- list.dirs(runs_base, full.names = TRUE, recursive = FALSE)
runs_org <- runs_all[str_ends(basename(runs_all), "_org")]
options(future.globals.maxSize = 100 * 1024^3) # 100 GB
plan(multisession, workers = 20) # works everywhere, safe and hassle-free
objs <- future_map(
runs_org,
process_one_run,
.options = furrr_options(seed = TRUE, scheduling = 1),
.progress = TRUE
)
# objs <- map(runs_org, process_one_run) # for sequential analysis
names(objs) <- basename(runs_org)
objs <- discard(objs, is.null)
stopifnot(length(objs) > 0)
## ---- sample-level QC summary ----
sample_qc <- map_dfr(objs, function(o) {
tibble(
sample = unique(o$sample_id),
timepoint = unique(o$timepoint),
n_cells = ncol(o),
med_genes = median(o$nFeature_RNA),
med_umis = median(o$nCount_RNA)
)
})
## If you have per-sample “fraction reads in cells” or “mapped to transcriptome” from metrics CSV,
## you can join them here by sample name and use them in the pass criteria too.
## For now, we gate only on n_cells and med_genes.
sample_qc <- sample_qc %>%
mutate(
pass_cells = n_cells >= min_cells_per_sample,
pass_genes = med_genes >= min_genes_per_cell,
pass = pass_cells & pass_genes
)
print(sample_qc)
## ---- decide per timepoint: merge vs best replicate ----
choose_and_merge_by_timepoint <- function(objs, qc_table, required_tp) {
stopifnot(is.list(objs), !is.null(names(objs)), all(nzchar(names(objs))))
tps <- union(required_tp, unique(qc_table$timepoint))
out <- list()
decisions <- list()
for (tp in tps) {
cand_names <- qc_table %>% dplyr::filter(timepoint == tp) %>% dplyr::pull(sample)
cand_objs <- objs[cand_names]
cand_qc <- qc_table %>% dplyr::filter(timepoint == tp)
if (length(cand_objs) == 0) {
decisions[[tp]] <- tibble(timepoint = tp, decision = "MISSING", used = NA_character_)
next
}
pass_idx <- which(cand_qc$pass)
if (length(pass_idx) >= 2) {
use <- cand_names[pass_idx]
merged <- Reduce(
function(a, b) merge(a, y = b, add.cell.ids = c(a = unique(a$sample_id)[1], b = unique(b$sample_id)[1])),
cand_objs[use]
)
merged$timepoint <- tp
out[[tp]] <- merged
decisions[[tp]] <- tibble(timepoint = tp, decision = "MERGE_PASS", used = paste(use, collapse = ","))
next
}
if (length(pass_idx) == 1) {
use <- cand_names[pass_idx]
keep <- cand_objs[[use]]
keep$timepoint <- tp
out[[tp]] <- keep
decisions[[tp]] <- tibble(timepoint = tp, decision = "KEEP_BEST_PASS", used = use)
next
}
cand_qc <- cand_qc %>% dplyr::arrange(dplyr::desc(n_cells))
# use nrow(.) instead of n()
near <- cand_qc %>% dplyr::slice(seq_len(min(2, nrow(.))))
if (nrow(near) >= 2) {
use <- near$sample
merged <- Reduce(
function(a, b) merge(a, y = b, add.cell.ids = c(a = unique(a$sample_id)[1], b = unique(b$sample_id)[1])),
cand_objs[use]
)
merged$timepoint <- tp
out[[tp]] <- merged
decisions[[tp]] <- tibble(timepoint = tp, decision = "MERGE_TOP2_NEAR_PASS", used = paste(use, collapse = ","))
} else {
use <- cand_qc$sample[1] # cand_qc is sorted by n_cells, so this is the largest replicate
keep <- cand_objs[[use]]
keep$timepoint <- tp
out[[tp]] <- keep
decisions[[tp]] <- tibble(timepoint = tp, decision = "KEEP_ONLY_AVAILABLE_FAILS", used = use)
}
}
list(objects_by_tp = out, decisions = dplyr::bind_rows(decisions))
}
merged <- choose_and_merge_by_timepoint(objs, sample_qc, timepoints_required)
by_timepoint <- merged$objects_by_tp # named list: "0h","4h","6h","8h","12h"
tp_decisions <- merged$decisions
print(tp_decisions)
## ---- optional: save results ----
out_dir <- here::here("data/output/clean_objects")
dir.create(out_dir, showWarnings = FALSE, recursive = TRUE)
# Save QC decisions
readr::write_csv(sample_qc, file.path(out_dir, "sample_qc_summary.csv"))
readr::write_csv(tp_decisions, file.path(out_dir, "timepoint_merge_decisions.csv"))
# Save one RDS per timepoint
iwalk(by_timepoint, ~ saveRDS(.x, file = file.path(out_dir, paste0("Seurat_", .y, ".rds"))))
message("Saved cleaned timepoint objects to: ", out_dir)
```
1. remove ambient RNA (optional if SoupX not needed)
2. detect & drop doublets
3. re-QC / filter (nFeature, nCount, %organelle)
4. normalize + find variable features (LogNorm or SCT)
5. integrate across timepoints
6. dimensionality reduction (PCA/UMAP), neighbors, clustering (several resolutions)
7. markers per cluster (optionally with presto for speed)
8. first-pass annotation (marker enrichment / module scores; transfer from references if available)
9. export objects, metrics, plots, and marker tables
### Normalization
- Regress organelle % during normalization to reduce technical variance (see the sketch after this list).
- Batch correction only if you really see batch structure (UMAP colored by sample is your friend).
- Doublets: 3–8% typical; filter if they cluster or confound DE.
- Keep raw objects before filtering (e.g., combined_raw \<- combined) when exploring thresholds.
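A minimal variant of the SCT call used below for the organelle regression mentioned in the first bullet, assuming `pct_cp`/`pct_mt` are present in the metadata (added by `add_organelle_percents()`):
```{r, eval=FALSE}
# Hedged variant: regress organelle percentages during SCT normalization.
# pct_cp / pct_mt must exist in meta.data before this call.
obj_all <- SCTransform(obj_all, vars.to.regress = c("pct_cp", "pct_mt"), verbose = FALSE)
```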
PCA was performed using a sparse SVD decomposition (RSpectra::svds), which is mathematically equivalent to classical PCA but computationally stable for large SCT-normalized matrices. This approach is recommended for reciprocal PCA integration in Seurat and does not affect downstream clustering or biological interpretation.
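Concretely, with $X$ the cells × genes SCT matrix, the rank-$k$ truncated SVD

$$
X \approx U_k \Sigma_k V_k^\top
$$

yields the cell embeddings $U_k \Sigma_k$ and the gene loadings $V_k$ used by `safe_pca_RS()` below; classical PCA additionally centers each gene, so the two coincide up to that centering step.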
```{r, eval = F}
## ---- packages ----
library(Seurat)
library(tidyverse)
library(Matrix)
library(here)
library(rtracklayer)
library(GenomeInfoDb)
library(SingleCellExperiment)
library(scDblFinder)
## optional speed-ups (if installed)
if (requireNamespace("glmGamPoi", quietly = TRUE)) message("glmGamPoi available: SCT will be faster.")
if (requireNamespace("presto", quietly = TRUE)) message("presto available: marker finding will be faster.")
## optional ambient RNA (SoupX)
use_soupx <- TRUE
if (use_soupx) {
stopifnot(requireNamespace("SoupX", quietly = TRUE))
}
## ---- paths ----
clean_dir <- here("data/output/clean_objects") # where you saved Seurat_0h.rds etc.
out_dir <- here("data/output/postmerge"); dir.create(out_dir, showWarnings = FALSE, recursive = TRUE)
## ---- load timepoint objects ----
tps <- c("0h","4h","6h","8h","12h")
by_tp <- setNames(vector("list", length(tps)), tps)
for (tp in tps) {
fp <- file.path(clean_dir, paste0("Seurat_", tp, ".rds"))
if (file.exists(fp)) by_tp[[tp]] <- readRDS(fp)
}
by_tp <- by_tp[!vapply(by_tp, is.null, logical(1))]
stopifnot(length(by_tp) > 0)
## ---- organelle feature sets from your organelle GTF (same logic as before) ----
gtf_candidates <- c(
here("data/refs/BdistachyonBd21_3_10X_Genome_organelle/genes/genes.gtf.gz"),
here("data/refs/BdistachyonBd21_3_10X_Genome/genes/genes.gtf.gz")
)
gtf_file <- gtf_candidates[file.exists(gtf_candidates)][1]
message("Using GTF for organelle features: ", gtf_file)
gtf <- rtracklayer::import(gtf_file)
seqs <- as.character(GenomeInfoDb::seqnames(gtf))
is_cp <- seqs == "NC_011032"
is_mt <- seqs %in% c("NC_011033.1", "NC_011033")
cp_features_ref <- unique(c(na.omit(gtf$gene_name[is_cp]), na.omit(gtf$gene_id[is_cp])))
mt_features_ref <- unique(c(na.omit(gtf$gene_name[is_mt]), na.omit(gtf$gene_id[is_mt])))
dash <- function(x) gsub("_","-", x, fixed = TRUE)
cp_all <- unique(c(cp_features_ref, dash(cp_features_ref)))
mt_all <- unique(c(mt_features_ref, dash(mt_features_ref)))
# v5-safe counts accessor
get_counts <- function(s, assay = DefaultAssay(s)) {
# prefer LayerData (v5)
out <- tryCatch(SeuratObject::LayerData(s, assay = assay, layer = "counts"),
error = function(e) NULL)
if (is.null(out)) {
out <- tryCatch(GetAssayData(s, assay = assay, layer = "counts"),
error = function(e) NULL)
}
if (is.null(out)) out <- GetAssayData(s, assay = assay, slot = "counts") # v4 fallback
out
}
# if multiple counts layers exist, join them into a single layer named "counts"
ensure_single_counts <- function(s, assay = "RNA") {
if (inherits(s[[assay]], "Assay5")) {
lyr <- SeuratObject::Layers(s[[assay]])
has_counts <- grep("^counts(\\.|$)", lyr, value = TRUE)
if (length(has_counts) > 1) {
s <- JoinLayers(s, assay = assay) # collapses sample-specific layers
} else if (length(has_counts) == 1 && has_counts != "counts") {
s[[assay]] <- SeuratObject::RenameLayers(s[[assay]],
new.layers = setNames("counts", has_counts))
}
}
s
}
add_organelle_percents <- function(s, cp_genes, mt_genes) {
DefaultAssay(s) <- "RNA"
s <- ensure_single_counts(s, "RNA")
counts <- get_counts(s, "RNA")
stopifnot(!is.null(counts))
rn <- rownames(counts)
cp <- intersect(rn, cp_genes)
mt <- intersect(rn, mt_genes)
tot <- pmax(Matrix::colSums(counts), 1)
pct_cp <- if (length(cp)) Matrix::colSums(counts[cp, , drop = FALSE]) / tot * 100 else rep(0, ncol(counts))
pct_mt <- if (length(mt)) Matrix::colSums(counts[mt, , drop = FALSE]) / tot * 100 else rep(0, ncol(counts))
s$pct_cp <- as.numeric(pct_cp)
s$pct_mt <- as.numeric(pct_mt)
s$pct_org <- s$pct_cp + s$pct_mt
s
}
## ---- (optional) SoupX per timepoint (requires raw droplets) ----
## If you have the raw droplet matrices (not only filtered), set 'use_soupx <- TRUE'
## and point 'raw_path_fun' to each run folder that produced the timepoint object.
#use_soupx= F
#if (use_soupx) {
# raw_path_fun <- function(tp_id) {
# ## TODO: return path to the raw (unfiltered) matrix or 'outs' for that timepoint
# ## e.g.: here("data/cellranger", paste0("Brachy_", tp_id, "_org"), "outs")
# NA_character_
# }
# for (nm in names(by_tp)) {
# rp <- raw_path_fun(nm)
# if (!is.na(rp) && dir.exists(rp)) {
# sx <- SoupX::load10X(rp)
# sx <- SoupX::autoEstCont(sx)
# adj <- SoupX::adjustCounts(sx)
# by_tp[[nm]] <- CreateSeuratObject(adj, project = nm)
# }
# }
#}
## ---- doublets + re-QC per timepoint ----
min_genes <- 1000
max_genes <- 13000
min_counts <- 1000
max_counts <- 300000
max_pct_cp <- 30
max_pct_mt <- 10
# to add into article "Doublets were identified using scDblFinder. For one low-quality/low-cell sample, the doublet detection algorithm was unstable; for this sample, cells were retained as singlets and potential doublets were inspected at the clustering/QC stage."
for (nm in names(by_tp)) {
s <- by_tp[[nm]]
# 1) unify the counts layers
s <- ensure_single_counts(s, "RNA")
# 2) cp/mt percentages
s <- add_organelle_percents(s, cp_all, mt_all)
# 3) drop all-zero rows/columns BEFORE scDblFinder
counts <- get_counts(s, "RNA")
keep_cells <- Matrix::colSums(counts) > 0
keep_genes <- Matrix::rowSums(counts) > 0
s <- s[keep_genes, keep_cells]
if (ncol(s) < 200) {
message("[scDblFinder] ", nm, ": trop peu de cellules (", ncol(s), ") -> on marque tout en singlets.")
s$doublet_class <- "singlet"
s$doublet_score <- NA_real_
} else {
sce <- as.SingleCellExperiment(s)
k_auto <- max(5, floor(0.01 * ncol(sce)))
dims_use <- min(30, max(10, floor(ncol(sce) / 50)))
sce <- tryCatch(
scDblFinder(
sce,
dims = dims_use,
k = k_auto,
BPPARAM = BiocParallel::SerialParam()
),
error = function(e) {
message("[scDblFinder] ERREUR sur ", nm, ": ",
e$message, " -> on tague tout en singlets.")
sce$scDblFinder.class <- factor("singlet", levels = c("singlet","doublet"))
sce$scDblFinder.score <- NA_real_
sce
}
)
s$doublet_class <- sce$scDblFinder.class
s$doublet_score <- sce$scDblFinder.score
}
# 4) QC filters
s <- subset(
s,
subset =
nFeature_RNA >= min_genes & nFeature_RNA <= max_genes &
nCount_RNA >= min_counts & nCount_RNA <= max_counts &
pct_cp <= max_pct_cp & pct_mt <= max_pct_mt &
doublet_class == "singlet"
)
by_tp[[nm]] <- s
}
# quick check of retained cells per timepoint
sapply(by_tp, ncol)
## 1) Add the timepoint as metadata in each object
for (nm in names(by_tp)) {
by_tp[[nm]]$timepoint <- nm
}
## 2) Merge all timepoints into a single object
# important: merge on the RNA assay (default)
obj_all <- Reduce(function(x, y) merge(x, y), by_tp)
## 3) Global SCT normalization across ALL cells
DefaultAssay(obj_all) <- "RNA"
obj_all <- SCTransform(obj_all, verbose = FALSE)
if (!requireNamespace("RSpectra", quietly = TRUE)) {
stop("Package 'RSpectra' is required for safe_pca_RS(); install it with install.packages('RSpectra').")
}
safe_pca_RS <- function(obj, assay = "SCT", npcs = 50) {
DefaultAssay(obj) <- assay
# 1) récupérer la matrice SCT "data"
M <- SeuratObject::GetAssayData(obj, assay = assay, layer = "data")
# 2) keep only valid genes/cells while staying sparse
good_rows <- Matrix::rowSums(!is.finite(M)) == 0
if (!all(good_rows)) {
M <- M[good_rows, , drop = FALSE]
}
good_cols <- Matrix::colSums(M != 0) > 0
if (!all(good_cols)) {
M <- M[, good_cols, drop = FALSE]
}
# sync the Seurat object with the cleaned matrix
obj <- obj[rownames(M), colnames(M)]
# 3) PCA with RSpectra directly on the sparse matrix
X <- Matrix::t(M) # cells x genes (still sparse)
k <- min(npcs, ncol(X) - 1, nrow(X))
if (k < 2) stop("Pas assez de dimensions pour la PCA (k = ", k, ").")
sv <- RSpectra::svds(X, k = k)
# embeddings = U * D (cells x k)
embeddings <- sv$u %*% diag(sv$d)
rownames(embeddings) <- rownames(X) # cell names
colnames(embeddings) <- paste0("PC_", seq_len(k))
# loadings = V (genes x k)
loadings <- sv$v
rownames(loadings) <- colnames(X) # gene names
colnames(loadings) <- paste0("PC_", seq_len(k))
obj[["pca"]] <- CreateDimReducObject(
embeddings = embeddings,
loadings = loadings,
stdev = sv$d,
key = "PC_",
assay = assay
)
obj
}
obj_all <- safe_pca_RS(obj_all, assay = "SCT", npcs = 50)
"pca" %in% Reductions(obj_all)
# doit renvoyer TRUE
Embeddings(obj_all, "pca")[1:5, 1:3] # juste pour voir
## 4) UMAP / neighbors / clustering (PCA computed above)
obj_all <- RunUMAP(obj_all, dims = 1:30, verbose = FALSE)
obj_all <- FindNeighbors(obj_all, dims = 1:30, verbose = FALSE)
obj_all <- FindClusters(obj_all, resolution = 0.6) # e.g.
# Ensure a consistent timepoint order (0h, 4h, 6h, 8h, 12h) for all UMAP plots
tp_levels <- c("0h", "4h", "6h", "8h", "12h")
if ("timepoint" %in% colnames(obj_all@meta.data)) {
obj_all$timepoint <- factor(obj_all$timepoint, levels = tp_levels)
}
# Global view
DimPlot(obj_all, reduction = "umap", group.by = "seurat_clusters", label = TRUE)
# Color by timepoint
DimPlot(obj_all, reduction = "umap", group.by = "timepoint")
# Track clusters over time
DimPlot(
obj_all,
reduction = "umap",
group.by = "seurat_clusters",
split.by = "timepoint",
ncol = 3
)
out_dir <- here::here("report/snrnaseq/plot")
dir.create(out_dir, showWarnings = FALSE, recursive = TRUE)
# 1) UMAP with clusters
p1 <- DimPlot(obj_all, reduction = "umap",
group.by = "seurat_clusters", label = TRUE)
# 2) UMAP colored by timepoint
p2 <- DimPlot(obj_all, reduction = "umap",
group.by = "timepoint")
# 3) UMAP split by timepoint to track clusters over time
p3 <- DimPlot(
obj_all,
reduction = "umap",
group.by = "seurat_clusters",
split.by = "timepoint",
ncol = 3
)
ggsave(file.path(out_dir, "umap_clusters.png"), plot = p1, width = 8, height = 6, dpi = 300)
ggsave(file.path(out_dir, "umap_timepoint.png"), plot = p2, width = 8, height = 6, dpi = 300)
ggsave(file.path(out_dir, "umap_clusters_by_timepoint.png"), plot = p3, width = 12, height = 8, dpi = 300)
## ---- markers per cluster (one resolution; change as you like) ----
## choose the resolution to use (here 0.6)
Idents(obj_all) <- obj_all$seurat_clusters
# 0) make sure markers are computed on the SCT assay
DefaultAssay(obj_all) <- "SCT"
# 1) prepare the SCT models for the tests
obj_all <- PrepSCTFindMarkers(obj_all)
# 2) then run marker detection
if (requireNamespace("presto", quietly = TRUE)) {
markers <- presto::wilcoxauc(
obj_all,
seurat_assay = "SCT",
seurat_slot = "data"
) %>%
as_tibble() %>%
group_by(group) %>%
arrange(desc(logFC))
write.csv(
markers,
file.path(out_dir, "markers_presto_res0.6.csv"),
row.names = FALSE
)
} else {
markers <- FindAllMarkers(
obj_all,
assay = "SCT",
slot = "data",
only.pos = TRUE,
logfc.threshold = 0.25,
min.pct = 0.1
)
write.csv(
markers,
file.path(out_dir, "markers_res0.6.csv"),
row.names = FALSE
)
}
## ---- simple marker-based annotation scaffold (optional) ----
## Provide a named list of marker sets per putative cell type;
## these can be rice/arab markers mapped to Bd IDs. Example placeholders:
"BdiBd21-3.2G0092900.v1.2" %in% rownames(obj_all)
marker_sets <- list(
Pericycle = c("BdiBd21-3.2G0092900.v1.2", "BdiBd21-3.1G0109300.v1.2"),
Endodermis = c("BdiBd21-3.3G0792100.v1.2"),
Cortex = c("BdiBd21-3.4G0368600.v1.2")
)
DefaultAssay(obj_all) <- "RNA"
ms_col_map <- character()
for (nm in names(marker_sets)) {
genes_ok <- intersect(marker_sets[[nm]], rownames(obj_all))
if (length(genes_ok) < 5) {
message("⚠️ Trop peu de gènes trouvés pour ", nm, " (", length(genes_ok), "), je saute.")
} else {
prev_cols <- colnames(obj_all@meta.data)
obj_all <- AddModuleScore(
obj_all,
features = list(genes_ok),
name = paste0("MS_", nm),
assay = "RNA"
)
new_cols <- setdiff(colnames(obj_all@meta.data), prev_cols)
if (length(new_cols)) {
ms_col_map[new_cols] <- nm
}
}
}
## pick label by max module score (very rough first pass)
ms_cols <- names(ms_col_map)
if (length(ms_cols)) {
score_mat <- as.matrix(obj_all@meta.data[, ms_cols, drop = FALSE])
score_mat[is.na(score_mat)] <- -Inf
best_col <- ms_cols[max.col(score_mat, ties.method = "first")]
obj_all$celltype_guess <- unname(ms_col_map[best_col])
} else {
obj_all$celltype_guess <- NA_character_
}
pdf(file.path(out_dir, "umap_res0.6_celltype_guess.pdf"), width = 8, height = 6)
print(DimPlot(obj_all, group.by = "celltype_guess", label = TRUE))
dev.off()
DefaultAssay(obj_all) <- "RNA" # pour rester simple
for (cl in names(marker_sets)) {
obj_all <- AddModuleScore(
obj_all,
features = list(marker_sets[[cl]]),
name = paste0("MS_cluster", cl),
assay = "RNA"
)
}
## ---- save heavy objects (final) ----
# Folder for R objects (kept separate from plots)
obj_dir <- here::here("data/output/snrnaseq_objects")
dir.create(obj_dir, showWarnings = FALSE, recursive = TRUE)
## 1) Save the full list of per-timepoint objects (after QC + SCT)
saveRDS(
by_tp,
file = file.path(obj_dir, "by_tp_QC_SCT_list.rds")
)
## 1b) Also save each timepoint separately (handy for reloading just one)
lapply(names(by_tp), function(nm) {
saveRDS(
by_tp[[nm]],
file = file.path(obj_dir, paste0("Seurat_", nm, "_QC_SCT.rds"))
)
})
## 2) Save the merged object with PCA/UMAP/clusters
# obj_all contains: merged timepoints, global SCT, PCA (RSpectra), UMAP, clusters
saveRDS(
obj_all,
file = file.path(obj_dir, "obj_all_SCT_PCA_UMAP_clusters.rds")
)
## 3) Save the markers if the `markers` object exists
if (exists("markers")) {
write.csv(
markers,
file = file.path(obj_dir, "markers_res0.6.csv"),
row.names = FALSE
)
}
## 4) (optional) Export the UMAP coordinates to CSV
if ("umap" %in% Seurat::Reductions(obj_all)) {
umap_df <- Embeddings(obj_all, "umap") |>
as.data.frame() |>
tibble::rownames_to_column("cell_id")
write.csv(
umap_df,
file = file.path(obj_dir, "umap_coordinates_res0.6.csv"),
row.names = FALSE
)
}
```
## Post-cleaning sanity checks
```{r, eval=F}
library(Seurat)
library(tidyverse)
library(future)
# 1) no parallelization for this step
future::plan("sequential")
# 2) raise the future globals limit (here 20 GB)
options(future.globals.maxSize = 20 * 1024^3)
obj_dir <- here::here("data/output/snrnaseq_objects/") # ou le chemin que tu as utilisé
# Reload the merged object to explore results without rerunning the full pipeline
obj_all <- readRDS(file.path(obj_dir, "obj_all_SCT_PCA_UMAP_clusters.rds"))
# Enforce a consistent timepoint order for UMAP panels (0h, 4h, 6h, 8h, 12h)
tp_levels <- c("0h", "4h", "6h", "8h", "12h")
if ("timepoint" %in% colnames(obj_all@meta.data)) {
obj_all$timepoint <- factor(obj_all$timepoint, levels = tp_levels)
}
# UMAP by clusters
DimPlot(obj_all, reduction = "umap", group.by = "seurat_clusters", label = TRUE)
# UMAP by timepoint
DimPlot(obj_all, reduction = "umap", group.by = "timepoint")
# UMAP split by timepoint (facet per timepoint, fixed order)
DimPlot(
obj_all,
reduction = "umap",
group.by = "seurat_clusters",
split.by = "timepoint",
ncol = 3
)
# Pull QC exports for quick inspection / downstream plotting
markers <- read.csv(file.path(obj_dir, "markers_res0.6.csv"))
head(markers)
umap_df <- read.csv(file.path(obj_dir, "umap_coordinates_res0.6.csv"))
head(umap_df)
# cell_id, UMAP_1, UMAP_2
```
{fig-align="center"}
{fig-align="center"}
- **UMAP split by timepoint (`umap_clusters_by_timepoint_diag.png`)**
Shows the same UMAP layout colored by clusters, facetted for 0h, 4h, 6h, 8h and 12h. The goal is to check that each timepoint reuses the same global structures, rather than forming isolated “islands” that would suggest strong batch effects.
{fig-align="center"}
## Cluster coherence diagnostics
```{r, eval=F}
# Enforce a consistent timepoint order for all plots
tp_levels <- c("0h", "4h", "6h", "8h", "12h")
if ("timepoint" %in% colnames(obj_all@meta.data)) {
obj_all$timepoint <- factor(obj_all$timepoint, levels = tp_levels)
}
# 1) UMAP facetted by timepoint (fixed order) to check that structures are reused
p_umap_tp <- DimPlot(
obj_all,
reduction = "umap",
group.by = "seurat_clusters",
split.by = "timepoint",
ncol = 3,
order = TRUE
)
fig_export(
path = "report/snrnaseq/plot/cleaning/umap_clusters_by_timepoint_diag",
plot_x = p_umap_tp,
height_i = 6,
width_i = 12,
res_i = 300,
format = "png"
)
# 2) Cluster composition per timepoint (barplot of proportions)
cl_tp <- obj_all@meta.data |>
tibble::as_tibble(rownames = "cell_id") |>
dplyr::count(timepoint, seurat_clusters) |>
dplyr::group_by(timepoint) |>
dplyr::mutate(prop = n / sum(n))
p_comp <- ggplot(cl_tp, aes(x = timepoint, y = prop, fill = seurat_clusters)) +
geom_col(width = 0.8, color = "white", size = 0.1) +
scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
labs(
x = "Timepoint",
y = "Cluster proportion",
title = "Cluster composition per timepoint"
) +
theme_bw() +
theme(legend.position = "right")
fig_export(
path = "report/snrnaseq/plot/cleaning/cluster_proportion",
plot_x = p_comp,
height_i = 6,
width_i = 8,
res_i = 300,
format = "png"
)
# 3) QC gradients on UMAP to ensure no technical drift drives the embedding
qc_features <- intersect(c("nCount_RNA", "nFeature_RNA", "pct_cp", "pct_mt"), colnames(obj_all@meta.data))
# Keep only QC features that actually vary; constants (e.g. pct_cp = 0 for all cells) are not informative
qc_features <- qc_features[
vapply(
qc_features,
function(f) stats::var(obj_all@meta.data[[f]], na.rm = TRUE) > 0,
logical(1)
)
]
if (length(qc_features)) {
p_qc_umap <- FeaturePlot(
obj_all,
features = qc_features,
reduction = "umap",
order = TRUE,
ncol = 2
) +
theme_bw()
fig_export(
path = "report/snrnaseq/plot/cleaning/qc_umap",
plot_x = p_qc_umap,
height_i = 6,
width_i = 8,
res_i = 300,
format = "png"
)
}
# 4) Cluster-level QC summary (median library size / genes / organelle %)
qc_summary <- obj_all@meta.data |>
tibble::as_tibble(rownames = "cell_id") |>
dplyr::group_by(seurat_clusters) |>
dplyr::summarise(
med_nCount = median(nCount_RNA, na.rm = TRUE),
med_nFeat = median(nFeature_RNA, na.rm = TRUE),
med_pct_cp = median(pct_cp, na.rm = TRUE),
med_pct_mt = median(pct_mt, na.rm = TRUE)
) |>
tidyr::pivot_longer(-seurat_clusters, names_to = "metric", values_to = "median")
p_qc_summary <- ggplot(qc_summary, aes(x = seurat_clusters, y = median, fill = metric)) +
geom_col(position = "dodge") +
labs(
x = "Cluster",
y = "Median QC value",
title = "Per-cluster QC medians"
) +
theme_bw() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
fig_export(
path = "report/snrnaseq/plot/cleaning/qc_cluster_medians",
plot_x = p_qc_summary,
height_i = 6,
width_i = 8,
res_i = 300,
format = "png"
)
```
- **Cluster composition per timepoint (`cluster_proportion.png`)**
Stacked barplot of cluster proportions in each timepoint. This helps verify that major clusters are contributed by several timepoints and that no cluster is driven by a single timepoint only (which could indicate a technical artefact or an over‑specific cluster).
{fig-align="center"}
- **QC gradients on UMAP (`qc_umap.png`)**
UMAP colored by `nCount_RNA` and `nFeature_RNA`. The absence of large continuous gradients across the embedding indicates that depth and gene detection are not the main drivers of the UMAP structure.
{fig-align="center"}
- **Per-cluster QC medians (`qc_cluster_medians.png`)**
For each cluster, shows median library size, number of detected genes, and organelle percentages. This is used to flag clusters with extreme QC values (very low complexity, very high counts, or high organelle content) that might correspond to low-quality cells, doublets, or debris.
{fig-align="center"}