---
output: html_document
editor_options:
chunk_output_type: console
---
## Annotation
```{r}
#pkg
library(Seurat)
library(SoupX)
library(scDblFinder)
library(Matrix)
library(rtracklayer)
library(GenomeInfoDb)
library(tidyverse)
library(forcats)
library(ggplot2)
library(here)
library(tidyverse)
library(readxl)
library(gt)
library(paletteer)
library(scales)
library(DT)
library(htmlwidgets)
library(rtracklayer)
library(GenomeInfoDb)
library(fs)
library(glmGamPoi)
library(furrr)
library(future)
if (requireNamespace("glmGamPoi", quietly = TRUE)) message("glmGamPoi available: SCT will be faster.")
if (requireNamespace("presto", quietly = TRUE)) message("presto available: marker finding will be faster.")
# src
source(here::here("src/function/stat_function/stat_analysis_main.R")) # for make plot
source(here::here("src/function/fig_export.R")) # This function saves a given plot (plot_x) as both a PDF and a high-resolution PNG file at specified dimensions.
runs_base <- "/mnt/ibioldata/LBMC_data/cmaslard/data/ScBdLR_init_2025_private/cellranger/"
#runs_base <- "/Volumes/IBIOLdata_storage/LBMC_data/cmaslard/data/ScBdLR_init_2025_private/cellranger/"
```
```{r, eval = FALSE}
obj_dir <- here("data/output/snrnaseq_objects")
# Load merged object with SCT, PCA, UMAP, clusters, module scores
obj_all <- readRDS(file.path(obj_dir, "obj_all_SCT_PCA_UMAP_clusters.rds"))
DefaultAssay(obj_all) <- "SCT"
Idents(obj_all) <- obj_all$seurat_clusters
markers <- readr::read_csv(file.path(obj_dir, "markers_res0.6.csv"))
# Recreate module-score based cell type guess if it is missing
if (is.null(obj_all$celltype_guess) || all(is.na(obj_all$celltype_guess))) {
marker_sets <- list(
Pericycle = c("BdiBd21-3.2G0092900.v1.2", "BdiBd21-3.1G0109300.v1.2"),
Endodermis = c("BdiBd21-3.3G0792100.v1.2"),
Cortex = c("BdiBd21-3.4G0368600.v1.2")
)
DefaultAssay(obj_all) <- "RNA"
ms_col_map <- character()
for (nm in names(marker_sets)) {
genes_ok <- intersect(marker_sets[[nm]], rownames(obj_all))
if (length(genes_ok) < 5) next
prev_cols <- colnames(obj_all@meta.data)
obj_all <- AddModuleScore(
obj_all,
features = list(genes_ok),
name = paste0("MS_", nm),
assay = "RNA"
)
new_cols <- setdiff(colnames(obj_all@meta.data), prev_cols)
if (length(new_cols)) ms_col_map[new_cols] <- nm
}
ms_cols <- names(ms_col_map)
if (length(ms_cols)) {
score_mat <- as.matrix(obj_all@meta.data[, ms_cols, drop = FALSE])
score_mat[is.na(score_mat)] <- -Inf
best_col <- ms_cols[max.col(score_mat, ties.method = "first")]
obj_all$celltype_guess <- unname(ms_col_map[best_col])
} else {
obj_all$celltype_guess <- NA_character_
}
DefaultAssay(obj_all) <- "SCT"
}
```
### Marker filtering and top genes per cluster (project paths)
```{r, eval=FALSE}
# Output directory for annotation artefacts
annot_dir <- here::here("data/output/annotation")
fs::dir_create(annot_dir)
# Load merged object (already SCT-normalized, PCA/UMAP/clusters done)
obj_dir <- here::here("data/output/snrnaseq_objects")
obj_all <- readRDS(file.path(obj_dir, "obj_all_SCT_PCA_UMAP_clusters.rds"))
DefaultAssay(obj_all) <- "SCT"
Idents(obj_all) <- obj_all$seurat_clusters
# Load cluster markers (computed in cleaning/normalization)
markers <- readr::read_csv(file.path(obj_dir, "markers_res0.6.csv"))
# Optional metadata paths (place the CSV/TXT files under data/inputs_ref/metadata)
meta_dir <- here::here("data/inputs_ref/metadata")
meta_rice_path <- file.path(meta_dir, "PCMDB_Download_20240515011841.csv")
meta_brachy_path <- file.path(meta_dir, "BdistachyonBd21_3_537_v1.2.annotation_info.txt")
meta_arabi_path <- file.path(meta_dir, "SRP285817_Lateralrootprimordia_detail.csv")
meta_og_path <- file.path(meta_dir, "Monocot5Pangenome_proteins.txt") # fallback orthogroups
# Helper to safely read metadata (only if files are present locally)
read_meta_tables <- function() {
if (!file.exists(meta_rice_path)) {
message("⚠️ Rice marker file manquant: place `PCMDB_Download_20240515011841.csv` dans ", meta_dir)
return(NULL)
}
meta_rice <- readr::read_csv(meta_rice_path, show_col_types = FALSE)
## Deux stratégies de mapping vers Brachy :
## 1) via fichier annotation_info si présent (best_rice_gene)
## 2) fallback via orthogroup si annotation_info absent mais Monocot5Pangenome_proteins.txt présent
mapping <- NULL
if (file.exists(meta_brachy_path)) {
meta_brachy <- readr::read_tsv(meta_brachy_path, show_col_types = FALSE)
mapping <- meta_brachy |>
dplyr::transmute(
locus_clean = gsub("\\.v[0-9.]+$", "", locusName),
Gene_id = best_rice_gene
) |>
dplyr::filter(!is.na(Gene_id), Gene_id != "")
} else if (file.exists(meta_og_path)) {
message("ℹ️ `annotation_info` absent, fallback via orthogroups.")
og <- readr::read_tsv(meta_og_path, show_col_types = FALSE)
og <- og |>
dplyr::mutate(
gene_clean = gsub("\\.v[0-9.]+$", "", protein), # retire suffixe de version
gene_clean = gsub("_[^_]+$", "", gene_clean), # retire suffixe type "_1"
gene_clean = gsub("\\.[0-9]+$", "", gene_clean) # retire isoforme finale
)
og_bra <- og |> dplyr::filter(species == "brachy") |>
dplyr::transmute(orthogroup, locus_clean = gene_clean)
og_rice <- og |> dplyr::filter(species == "rice") |>
dplyr::transmute(orthogroup, Gene_id = gene_clean)
mapping <- og_bra |>
dplyr::inner_join(og_rice, by = "orthogroup") |>
dplyr::select(locus_clean, Gene_id) |>
dplyr::distinct()
}
if (is.null(mapping) || nrow(mapping) == 0) {
message("⚠️ Aucun mapping Brachy↔Riz trouvé (ni annotation_info ni orthogroups utilisables).")
return(NULL)
}
meta_combined <- mapping |>
dplyr::left_join(meta_rice, by = "Gene_id")
if (file.exists(meta_arabi_path)) {
meta_arabi <- readr::read_delim(meta_arabi_path, delim = ";", show_col_types = FALSE)
meta_combined <- dplyr::left_join(
meta_combined,
meta_arabi,
by = c("best_arabi_gene" = "GeneID")
)
}
meta_combined |>
dplyr::mutate(locus_clean = gsub("\\.v[0-9.]+$", "", locusName))
}
meta_tables <- read_meta_tables()
# Filter markers with reasonable specificity thresholds
markers_filt <- markers |>
dplyr::filter(
p_val_adj < 0.05, # statistically significant
pct.1 > 0.8, # expressed in most cells of the cluster
pct.2 < 0.25 # rare outside the cluster
)
# Top 10 markers per cluster by log2FC (ties dropped to keep exactly 10)
top10 <- markers_filt |>
dplyr::group_by(cluster) |>
dplyr::slice_max(avg_log2FC, n = 10, with_ties = FALSE) |>
dplyr::ungroup() |>
dplyr::arrange(cluster, dplyr::desc(avg_log2FC))
readr::write_csv(top10, file.path(annot_dir, "top10_markers_res0.6.csv"))
readr::write_csv(markers_filt, file.path(annot_dir, "markers_filtered_res0.6.csv"))
# Join with rice/Brachy metadata if available
if (!is.null(meta_tables)) {
annotated_markers <- markers_filt |>
dplyr::mutate(locus_clean = gsub("\\.v[0-9.]+$", "", gene)) |>
dplyr::left_join(meta_tables, by = "locus_clean") |>
dplyr::select(-locus_clean)
readr::write_csv(annotated_markers, file.path(annot_dir, "markers_filtered_res0.6_with_rice_meta.csv"))
top10_annot <- top10 |>
dplyr::mutate(locus_clean = gsub("\\.v[0-9.]+$", "", gene)) |>
dplyr::left_join(meta_tables, by = "locus_clean") |>
dplyr::select(-locus_clean)
readr::write_csv(top10_annot, file.path(annot_dir, "top10_markers_res0.6_with_rice_meta.csv"))
# Assign a rice-based cell type per cluster (most frequent Cell_type in filtered markers)
cluster_ct <- annotated_markers |>
dplyr::filter(!is.na(Cell_type) & Cell_type != "") |>
dplyr::count(cluster, Cell_type, sort = TRUE) |>
dplyr::group_by(cluster) |>
dplyr::slice_max(n, with_ties = FALSE) |>
dplyr::ungroup() |>
dplyr::rename(celltype_rice = Cell_type)
# Map to obj_all metadata
if (nrow(cluster_ct) > 0) {
map_vec <- cluster_ct$celltype_rice
names(map_vec) <- cluster_ct$cluster
obj_all$celltype_rice <- map_vec[as.character(obj_all$seurat_clusters)]
readr::write_csv(cluster_ct, file.path(annot_dir, "cluster_celltype_rice_mapping.csv"))
}
}
# Simple summary: number of filtered markers per cluster
marker_counts <- markers_filt |>
dplyr::count(cluster, name = "n_markers") |>
dplyr::arrange(cluster)
readr::write_csv(marker_counts, file.path(annot_dir, "marker_counts_per_cluster.csv"))
p_marker_counts <- ggplot(marker_counts, aes(x = factor(cluster), y = n_markers, fill = n_markers)) +
geom_col() +
scale_fill_gradient(low = "grey90", high = "brown") +
theme_bw() +
labs(
x = "Cluster",
y = "# filtered markers",
title = "Number of filtered markers per cluster"
)
fig_export(
path = "report/snrnaseq/plot/annotation/marker_counts_per_cluster",
plot_x = p_marker_counts,
height_i = 4,
width_i = 7,
res_i = 300,
format = "png"
)
# Optional: join with your own gene metadata (e.g., tissues/cell types) if available.
# Example placeholder: if you have a CSV mapping 'gene' -> 'Cell_type' or 'Tissus_type'
# meta_map <- readr::read_csv(here::here("data/refs/gene_metadata.csv"))
# top10 <- dplyr::left_join(top10, meta_map, by = c("gene" = "gene_id"))
# readr::write_csv(top10, file.path(annot_dir, "top10_markers_res0.6_with_meta.csv"))
# Optional: quick UMAP view with current cluster IDs and the rough module-score guess
p_clusters <- DimPlot(obj_all, reduction = "umap", group.by = "seurat_clusters", label = TRUE)
fig_export(
path = "report/snrnaseq/plot/annotation/umap_clusters_res0.6",
plot_x = p_clusters,
height_i = 6,
width_i = 8,
res_i = 300,
format = "png"
)
if ("celltype_guess" %in% colnames(obj_all@meta.data) && !all(is.na(obj_all$celltype_guess))) {
p_guess <- DimPlot(obj_all, reduction = "umap", group.by = "celltype_guess", label = TRUE)
fig_export(
path = "report/snrnaseq/plot/annotation/umap_celltype_guess_res0.6",
plot_x = p_guess,
height_i = 6,
width_i = 8,
res_i = 300,
format = "png"
)
}
if ("celltype_rice" %in% colnames(obj_all@meta.data) && !all(is.na(obj_all$celltype_rice))) {
p_rice <- DimPlot(obj_all, reduction = "umap", group.by = "celltype_rice", label = TRUE)
fig_export(
path = "report/snrnaseq/plot/annotation/umap_celltype_rice_res0.6",
plot_x = p_rice,
height_i = 6,
width_i = 8,
res_i = 300,
format = "png"
)
}
```
Use `top10_markers_res0.6.csv` as your starting table for manual annotation: cross-check with known root/lateral root markers, then assign cell type labels per cluster. The UMAP exports (`umap_clusters_res0.6`, `umap_celltype_guess_res0.6`) help visualize your decisions. If you have external marker sets (e.g., rice/Arabidopsis), plug them into the optional metadata join to enrich the `top10` table.