Code
#pkg
library(Seurat)
library(SoupX)
library(scDblFinder)
library(Matrix)
library(rtracklayer)
library(GenomeInfoDb)
library(tidyverse)
library(forcats)
library(ggplot2)
library(here)
library(tidyverse)
library(readxl)
library(gt)
library(paletteer)
library(scales)
library(DT)
library(htmlwidgets)
library(rtracklayer)
library(GenomeInfoDb)
library(fs)
library(glmGamPoi)
library(furrr)
library(future)

if (requireNamespace("glmGamPoi", quietly = TRUE)) message("glmGamPoi available: SCT will be faster.")
if (requireNamespace("presto", quietly = TRUE)) message("presto available: marker finding will be faster.")

# src
source(here::here("src/function/stat_function/stat_analysis_main.R")) # for make plot 
source(here::here("src/function/fig_export.R")) # This function saves a given plot (plot_x) as both a PDF and a high-resolution PNG file at specified dimensions.

runs_base <- "/mnt/ibioldata/LBMC_data/cmaslard/data/ScBdLR_init_2025_private/cellranger/"
#runs_base <- "/Volumes/IBIOLdata_storage/LBMC_data/cmaslard/data/ScBdLR_init_2025_private/cellranger/"
Code
obj_dir <- here("data/output/snrnaseq_objects")

# Load merged object with SCT, PCA, UMAP, clusters, module scores
obj_all <- readRDS(file.path(obj_dir, "obj_all_SCT_PCA_UMAP_clusters.rds"))

DefaultAssay(obj_all) <- "SCT"
Idents(obj_all)       <- obj_all$seurat_clusters

markers <- readr::read_csv(file.path(obj_dir, "markers_res0.6.csv"))

# Recreate module-score based cell type guess if it is missing
if (is.null(obj_all$celltype_guess) || all(is.na(obj_all$celltype_guess))) {
  marker_sets <- list(
    Pericycle = c("BdiBd21-3.2G0092900.v1.2", "BdiBd21-3.1G0109300.v1.2"),
    Endodermis = c("BdiBd21-3.3G0792100.v1.2"),
    Cortex     = c("BdiBd21-3.4G0368600.v1.2")
  )

  DefaultAssay(obj_all) <- "RNA"
  ms_col_map <- character()
  for (nm in names(marker_sets)) {
    genes_ok <- intersect(marker_sets[[nm]], rownames(obj_all))
    if (length(genes_ok) < 5) next
    prev_cols <- colnames(obj_all@meta.data)
    obj_all <- AddModuleScore(
      obj_all,
      features = list(genes_ok),
      name     = paste0("MS_", nm),
      assay    = "RNA"
    )
    new_cols <- setdiff(colnames(obj_all@meta.data), prev_cols)
    if (length(new_cols)) ms_col_map[new_cols] <- nm
  }

  ms_cols <- names(ms_col_map)
  if (length(ms_cols)) {
    score_mat <- as.matrix(obj_all@meta.data[, ms_cols, drop = FALSE])
    score_mat[is.na(score_mat)] <- -Inf
    best_col <- ms_cols[max.col(score_mat, ties.method = "first")]
    obj_all$celltype_guess <- unname(ms_col_map[best_col])
  } else {
    obj_all$celltype_guess <- NA_character_
  }

  DefaultAssay(obj_all) <- "SCT"
}

3.0.1 Marker filtering and top genes per cluster (project paths)

Code
# Output directory for annotation artefacts
annot_dir <- here::here("data/output/annotation")
fs::dir_create(annot_dir)

# Load merged object (already SCT-normalized, PCA/UMAP/clusters done)
obj_dir <- here::here("data/output/snrnaseq_objects")
obj_all <- readRDS(file.path(obj_dir, "obj_all_SCT_PCA_UMAP_clusters.rds"))
DefaultAssay(obj_all) <- "SCT"
Idents(obj_all)       <- obj_all$seurat_clusters

# Load cluster markers (computed in cleaning/normalization)
markers <- readr::read_csv(file.path(obj_dir, "markers_res0.6.csv"))

# Optional metadata paths (place the CSV/TXT files under data/inputs_ref/metadata)
meta_dir <- here::here("data/inputs_ref/metadata")
meta_rice_path   <- file.path(meta_dir, "PCMDB_Download_20240515011841.csv")
meta_brachy_path <- file.path(meta_dir, "BdistachyonBd21_3_537_v1.2.annotation_info.txt")
meta_arabi_path  <- file.path(meta_dir, "SRP285817_Lateralrootprimordia_detail.csv")
meta_og_path     <- file.path(meta_dir, "Monocot5Pangenome_proteins.txt")  # fallback orthogroups

# Helper to safely read metadata (only if files are present locally)
read_meta_tables <- function() {
  if (!file.exists(meta_rice_path)) {
    message("⚠️ Rice marker file manquant: place `PCMDB_Download_20240515011841.csv` dans ", meta_dir)
    return(NULL)
  }
  meta_rice <- readr::read_csv(meta_rice_path, show_col_types = FALSE)

  ## Deux stratégies de mapping vers Brachy :
  ## 1) via fichier annotation_info si présent (best_rice_gene)
  ## 2) fallback via orthogroup si annotation_info absent mais Monocot5Pangenome_proteins.txt présent
  mapping <- NULL
  if (file.exists(meta_brachy_path)) {
    meta_brachy <- readr::read_tsv(meta_brachy_path, show_col_types = FALSE)
    mapping <- meta_brachy |>
      dplyr::transmute(
        locus_clean = gsub("\\.v[0-9.]+$", "", locusName),
        Gene_id     = best_rice_gene
      ) |>
      dplyr::filter(!is.na(Gene_id), Gene_id != "")
  } else if (file.exists(meta_og_path)) {
    message("ℹ️ `annotation_info` absent, fallback via orthogroups.")
    og <- readr::read_tsv(meta_og_path, show_col_types = FALSE)
    og <- og |>
      dplyr::mutate(
        gene_clean = gsub("\\.v[0-9.]+$", "", protein),   # retire suffixe de version
        gene_clean = gsub("_[^_]+$", "", gene_clean),      # retire suffixe type "_1"
        gene_clean = gsub("\\.[0-9]+$", "", gene_clean)    # retire isoforme finale
      )
    og_bra <- og |> dplyr::filter(species == "brachy") |>
      dplyr::transmute(orthogroup, locus_clean = gene_clean)
    og_rice <- og |> dplyr::filter(species == "rice") |>
      dplyr::transmute(orthogroup, Gene_id = gene_clean)
    mapping <- og_bra |>
      dplyr::inner_join(og_rice, by = "orthogroup") |>
      dplyr::select(locus_clean, Gene_id) |>
      dplyr::distinct()
  }

  if (is.null(mapping) || nrow(mapping) == 0) {
    message("⚠️ Aucun mapping Brachy↔Riz trouvé (ni annotation_info ni orthogroups utilisables).")
    return(NULL)
  }

  meta_combined <- mapping |>
    dplyr::left_join(meta_rice, by = "Gene_id")

  if (file.exists(meta_arabi_path)) {
    meta_arabi <- readr::read_delim(meta_arabi_path, delim = ";", show_col_types = FALSE)
    meta_combined <- dplyr::left_join(
      meta_combined,
      meta_arabi,
      by = c("best_arabi_gene" = "GeneID")
    )
  }

  meta_combined |>
    dplyr::mutate(locus_clean = gsub("\\.v[0-9.]+$", "", locusName))
}

meta_tables <- read_meta_tables()

# Filter markers with reasonable specificity thresholds
markers_filt <- markers |>
  dplyr::filter(
    p_val_adj < 0.05,   # statistically significant
    pct.1     > 0.8,    # expressed in most cells of the cluster
    pct.2     < 0.25    # rare outside the cluster
  )

# Top 10 markers per cluster by log2FC (ties dropped to keep exactly 10)
top10 <- markers_filt |>
  dplyr::group_by(cluster) |>
  dplyr::slice_max(avg_log2FC, n = 10, with_ties = FALSE) |>
  dplyr::ungroup() |>
  dplyr::arrange(cluster, dplyr::desc(avg_log2FC))

readr::write_csv(top10, file.path(annot_dir, "top10_markers_res0.6.csv"))
readr::write_csv(markers_filt, file.path(annot_dir, "markers_filtered_res0.6.csv"))

# Join with rice/Brachy metadata if available
if (!is.null(meta_tables)) {
  annotated_markers <- markers_filt |>
    dplyr::mutate(locus_clean = gsub("\\.v[0-9.]+$", "", gene)) |>
    dplyr::left_join(meta_tables, by = "locus_clean") |>
    dplyr::select(-locus_clean)
  readr::write_csv(annotated_markers, file.path(annot_dir, "markers_filtered_res0.6_with_rice_meta.csv"))

  top10_annot <- top10 |>
    dplyr::mutate(locus_clean = gsub("\\.v[0-9.]+$", "", gene)) |>
    dplyr::left_join(meta_tables, by = "locus_clean") |>
    dplyr::select(-locus_clean)
  readr::write_csv(top10_annot, file.path(annot_dir, "top10_markers_res0.6_with_rice_meta.csv"))

  # Assign a rice-based cell type per cluster (most frequent Cell_type in filtered markers)
  cluster_ct <- annotated_markers |>
    dplyr::filter(!is.na(Cell_type) & Cell_type != "") |>
    dplyr::count(cluster, Cell_type, sort = TRUE) |>
    dplyr::group_by(cluster) |>
    dplyr::slice_max(n, with_ties = FALSE) |>
    dplyr::ungroup() |>
    dplyr::rename(celltype_rice = Cell_type)

  # Map to obj_all metadata
  if (nrow(cluster_ct) > 0) {
    map_vec <- cluster_ct$celltype_rice
    names(map_vec) <- cluster_ct$cluster
    obj_all$celltype_rice <- map_vec[as.character(obj_all$seurat_clusters)]
    readr::write_csv(cluster_ct, file.path(annot_dir, "cluster_celltype_rice_mapping.csv"))
  }
}

# Simple summary: number of filtered markers per cluster
marker_counts <- markers_filt |>
  dplyr::count(cluster, name = "n_markers") |>
  dplyr::arrange(cluster)
readr::write_csv(marker_counts, file.path(annot_dir, "marker_counts_per_cluster.csv"))

p_marker_counts <- ggplot(marker_counts, aes(x = factor(cluster), y = n_markers, fill = n_markers)) +
  geom_col() +
  scale_fill_gradient(low = "grey90", high = "brown") +
  theme_bw() +
  labs(
    x = "Cluster",
    y = "# filtered markers",
    title = "Number of filtered markers per cluster"
  )

fig_export(
  path     = "report/snrnaseq/plot/annotation/marker_counts_per_cluster",
  plot_x   = p_marker_counts,
  height_i = 4,
  width_i  = 7,
  res_i    = 300,
  format   = "png"
)

# Optional: join with your own gene metadata (e.g., tissues/cell types) if available.
# Example placeholder: if you have a CSV mapping 'gene' -> 'Cell_type' or 'Tissus_type'
# meta_map <- readr::read_csv(here::here("data/refs/gene_metadata.csv"))
# top10 <- dplyr::left_join(top10, meta_map, by = c("gene" = "gene_id"))
# readr::write_csv(top10, file.path(annot_dir, "top10_markers_res0.6_with_meta.csv"))

# Optional: quick UMAP view with current cluster IDs and the rough module-score guess
p_clusters <- DimPlot(obj_all, reduction = "umap", group.by = "seurat_clusters", label = TRUE)
fig_export(
  path     = "report/snrnaseq/plot/annotation/umap_clusters_res0.6",
  plot_x   = p_clusters,
  height_i = 6,
  width_i  = 8,
  res_i    = 300,
  format   = "png"
)

if ("celltype_guess" %in% colnames(obj_all@meta.data) && !all(is.na(obj_all$celltype_guess))) {
  p_guess <- DimPlot(obj_all, reduction = "umap", group.by = "celltype_guess", label = TRUE)
  fig_export(
    path     = "report/snrnaseq/plot/annotation/umap_celltype_guess_res0.6",
    plot_x   = p_guess,
    height_i = 6,
    width_i  = 8,
    res_i    = 300,
    format   = "png"
  )
}

if ("celltype_rice" %in% colnames(obj_all@meta.data) && !all(is.na(obj_all$celltype_rice))) {
  p_rice <- DimPlot(obj_all, reduction = "umap", group.by = "celltype_rice", label = TRUE)
  fig_export(
    path     = "report/snrnaseq/plot/annotation/umap_celltype_rice_res0.6",
    plot_x   = p_rice,
    height_i = 6,
    width_i  = 8,
    res_i    = 300,
    format   = "png"
  )
}

Use top10_markers_res0.6.csv as your starting table for manual annotation: cross-check with known root/lateral root markers, then assign cell type labels per cluster. The UMAP exports (umap_clusters_res0.6, umap_celltype_guess_res0.6) help visualize your decisions. If you have external marker sets (e.g., rice/Arabidopsis), plug them into the optional metadata join to enrich the top10 table.