---
output: html_document
editor_options:
chunk_output_type: console
---
## Markers analyse
```{r}
#pkg
library(Seurat)
library(SoupX)
library(scDblFinder)
library(Matrix)
library(rtracklayer)
library(GenomeInfoDb)
library(tidyverse)
library(forcats)
library(ggplot2)
library(here)
library(tidyverse)
library(readxl)
library(gt)
library(paletteer)
library(scales)
library(DT)
library(htmlwidgets)
library(rtracklayer)
library(fs)
library(glmGamPoi)
library(furrr)
library(future)
library(googlesheets4)
library(DT)
library(janitor)
library(biomaRt)
library(readr)
library(stringr)
library(tidyr)
library(GenomicRanges)
library(GO.db)
library(AnnotationDbi)
library(patchwork)
if (requireNamespace("glmGamPoi", quietly = TRUE)) message("glmGamPoi available: SCT will be faster.")
if (requireNamespace("presto", quietly = TRUE)) message("presto available: marker finding will be faster.")
# src
source(here::here("src/function/stat_function/stat_analysis_main.R")) # for make plot
source(here::here("src/function/fig_export.R")) # This function saves a given plot (plot_x) as both a PDF and a high-resolution PNG file at specified dimensions.
runs_base <- "/mnt/ibioldata/LBMC_data/cmaslard/data/ScBdLR_init_2025_private/cellranger/"
#runs_base <- "/Volumes/IBIOLdata_storage/LBMC_data/cmaslard/data/ScBdLR_init_2025_private/cellranger/"
# ---- 0) Pick a GTF (prefer organelle ref, fallback to nuclear) ----
gtf_candidates <- c(
here::here("data/refs/BdistachyonBd21_3_10X_Genome_organelle/genes/genes.gtf.gz"),
here::here("data/refs/BdistachyonBd21_3_10X_Genome/genes/genes.gtf.gz")
)
gtf_file <- gtf_candidates[file.exists(gtf_candidates)][1]
message("Using GTF: ", gtf_file)
gtf <- rtracklayer::import(gtf_file)
obj_dir <- here("data/output/snrnaseq_objects")
obj_all <- readRDS(file.path(obj_dir, "obj_all_SCT_PCA_UMAP_clusters.rds"))
```
## Find the top ones for some clusers
```{r, eval =F}
DefaultAssay(obj_all) <- "SCT"
Idents(obj_all) <- obj_all$seurat_clusters
# Prepare SCT models for marker testing (needed once per object)
obj_all <- PrepSCTFindMarkers(obj_all)
# Example: markers for clusters 6 and 2 against all other cells
markers_cl6 <- FindMarkers(
obj_all,
ident.1 = "6",
assay = "SCT",
slot = "data",
logfc.threshold = 0.25,
min.pct = 0.1
)
markers_cl2 <- FindMarkers(
obj_all,
ident.1 = "2",
assay = "SCT",
slot = "data",
logfc.threshold = 0.25,
min.pct = 0.1
)
markers_cl9 <- FindMarkers(
obj_all,
ident.1 = "9",
assay = "SCT",
slot = "data",
logfc.threshold = 0.25,
min.pct = 0.1
)
markers_cl20 <- FindMarkers(
obj_all,
ident.1 = "20",
assay = "SCT",
slot = "data",
logfc.threshold = 0.25,
min.pct = 0.1
)
markers_cl16 <- FindMarkers(
obj_all,
ident.1 = "16",
assay = "SCT",
slot = "data",
logfc.threshold = 0.25,
min.pct = 0.1
)
# Inspect the top 20 markers for each cluster
head(markers_cl6[order(-markers_cl6$avg_log2FC), ], 20)
head(markers_cl2[order(-markers_cl2$avg_log2FC), ], 20)
head(markers_cl9[order(-markers_cl9$avg_log2FC), ], 20)
head(markers_cl20[order(-markers_cl20$avg_log2FC), ], 20)
```
## Importing Google Calc to verify certain candidates found in the literature
```{r, eval = T}
sheet_url <- "https://docs.google.com/spreadsheets/d/1pL6r-ZwITYZoVLR7xS-OkTkGWVkothUbsY0klSni45c/edit?gid=399921653#gid=399921653"
gs4_deauth()
candidates <- read_sheet(
ss = sheet_url,
sheet = "Candidates"
) %>%
janitor::clean_names()
# Display as an interactive, filterable table
datatable(
candidates,
filter = "top", # column filters
rownames = FALSE,
extensions = c("Buttons"),
options = list(
pageLength = 25,
autoWidth = TRUE,
dom = "Bfrtip",
buttons = c("copy", "csv", "excel")
)
)
write_csv(candidates, file = here::here("data/output/candidates/candidates.csv"), na = "")
```
## Markers for certain clusters and tissues
```{r, eval=F}
# ----------------------------
# Helpers
# ----------------------------
strip_version <- function(x) sub("\\.v[0-9.]+$", "", x)
safe_dir_create <- function(path) {
dir.create(path, recursive = TRUE, showWarnings = FALSE)
stopifnot(dir.exists(path))
invisible(path)
}
anno_row_kv <- function(annotation_row) {
if (is.null(annotation_row) || nrow(annotation_row) == 0) {
return(tibble::tibble(field = character(), value = character()))
}
annotation_row <- dplyr::slice(annotation_row, 1) # au cas où plusieurs lignes
tibble::tibble(
field = names(annotation_row),
value = vapply(annotation_row, function(x) {
# rend lisible même si c'est une liste / plusieurs valeurs
if (is.list(x)) paste(unlist(x), collapse = "; ") else paste(x, collapse = "; ")
}, FUN.VALUE = character(1))
)
}
go_long_kv <- function(anno_go_long) {
if (is.null(anno_go_long) || nrow(anno_go_long) == 0) {
return(tibble::tibble(field = character(), value = character()))
}
tibble::tibble(field = names(anno_go_long)) |>
dplyr::mutate(
value = vapply(field, function(f) {
x <- anno_go_long[[f]]
x <- unique(as.character(x))
x <- x[!is.na(x) & x != ""]
paste(x, collapse = "; ")
}, FUN.VALUE = character(1))
)
}
get_jgi_annotation_for_gene <- function(gene_id, jgi_anno) {
gene_id_clean <- strip_version(gene_id)
jgi_anno2 <- jgi_anno %>%
mutate(
locus_clean = strip_version(locusName),
transcript_clean = strip_version(transcriptName)
)
anno_gene <- jgi_anno2 %>%
filter(locus_clean == gene_id_clean | transcript_clean == gene_id_clean)
# Extract ALL GO IDs robustly (regex)
go_list <- stringr::str_extract_all(anno_gene$GO %||% "", "GO:\\d{7}")
go_vec <- unique(unlist(go_list))
go_vec <- go_vec[!is.na(go_vec) & go_vec != ""]
# Long format
anno_go_long <- tibble::tibble(
locusName = anno_gene$locusName,
transcriptName = anno_gene$transcriptName,
peptideName = anno_gene$peptideName,
go_id_list = go_list
) %>%
tidyr::unnest(go_id_list) %>%
dplyr::rename(go_id = go_id_list) %>%
dplyr::filter(!is.na(go_id), go_id != "") %>%
dplyr::distinct()
# Try GO.db annotations (TERM / ONTOLOGY / DEFINITION)
go_annot <- NULL
if (length(go_vec) > 0 &&
requireNamespace("GO.db", quietly = TRUE) &&
requireNamespace("AnnotationDbi", quietly = TRUE)) {
go_annot <- tryCatch({
res <- AnnotationDbi::select(
GO.db::GO.db,
keys = go_vec,
columns = c("TERM", "ONTOLOGY", "DEFINITION"),
keytype = "GOID"
)
dplyr::rename(res, go_id = GOID)
}, error = function(e) {
# fallback without DEFINITION
tryCatch({
res <- AnnotationDbi::select(
GO.db::GO.db,
keys = go_vec,
columns = c("TERM", "ONTOLOGY"),
keytype = "GOID"
)
dplyr::rename(res, go_id = GOID)
}, error = function(e2) NULL)
})
if (!is.null(go_annot)) {
anno_go_long <- anno_go_long %>%
dplyr::left_join(go_annot, by = "go_id")
}
}
list(
anno_gene = anno_gene,
anno_go_long = anno_go_long,
go_terms = go_vec,
go_annot = go_annot
)
}
`%||%` <- function(a, b) if (is.null(a)) b else a
get_gtf_features_for_gene <- function(gene_id, gtf) {
info_gene <- gtf[gtf$gene_id == gene_id]
if (length(info_gene) == 0) {
stop("No GTF entries found for gene_id = ", gene_id)
}
gene_range <- range(info_gene)
gene_all <- data.frame(
gene_id = info_gene$gene_id,
gene_name = info_gene$gene_name,
transcript_id = info_gene$transcript_id,
type = as.character(info_gene$type),
source = as.character(info_gene$source),
score = info_gene$score,
phase = info_gene$phase,
seqname = as.character(seqnames(info_gene)),
start = start(info_gene),
end = end(info_gene),
strand = as.character(strand(info_gene)),
width = width(info_gene),
stringsAsFactors = FALSE
) %>%
distinct() %>%
arrange(transcript_id, type, start, end)
by_type <- split(gene_all, gene_all$type)
per_transcript <- gene_all %>%
filter(!is.na(transcript_id)) %>%
group_by(transcript_id, type) %>%
summarise(
n_features = n(),
total_bp = sum(width, na.rm = TRUE),
.groups = "drop"
) %>%
arrange(transcript_id, desc(total_bp))
overview <- list(
gene_id = unique(gene_all$gene_id),
gene_name = unique(gene_all$gene_name),
seqname = unique(gene_all$seqname),
strand = unique(gene_all$strand),
start = start(gene_range),
end = end(gene_range),
gene_length_bp = width(gene_range),
n_transcripts = length(unique(na.omit(gene_all$transcript_id))),
feature_counts = sort(table(gene_all$type), decreasing = TRUE)
)
list(
info_gene = info_gene,
overview = overview,
tables = list(
all = gene_all,
by_type = by_type,
exons = if ("exon" %in% names(by_type)) by_type[["exon"]] else NULL,
cds = if ("CDS" %in% names(by_type)) by_type[["CDS"]] else NULL,
transcripts = if ("transcript" %in% names(by_type)) by_type[["transcript"]] else NULL,
per_transcript = per_transcript
)
)
}
get_candidates_info <- function(gene_id, df_candidates) {
# tries multiple common column names; returns NULL if none match
if (is.null(df_candidates) || nrow(df_candidates) == 0) return(NULL)
gene_clean <- strip_version(gene_id)
possible_cols <- c("gene_id", "gene", "geneid", "id", "locusName", "locus", "feature_id")
present_cols <- intersect(possible_cols, names(df_candidates))
if (length(present_cols) == 0) return(NULL)
# try matching in any of these columns
out <- NULL
for (col in present_cols) {
x <- df_candidates[[col]]
x_clean <- strip_version(as.character(x))
idx <- which(x_clean == gene_clean)
if (length(idx) > 0) {
out <- df_candidates[idx, , drop = FALSE]
break
}
}
out
}
# ----------------------------
# Main: build and save summary
# ----------------------------
build_and_save_gene_summary <- function(
gene_id,
gtf,
jgi_anno_file,
candidates_csv = NULL,
out_base = here("report/snrnaseq/plot/marker")
) {
# load JGI annotation
jgi_anno <- read_tsv(jgi_anno_file, show_col_types = FALSE)
# load candidates if provided
df_candidates <- NULL
if (!is.null(candidates_csv) && file.exists(candidates_csv)) {
df_candidates <- read_csv(candidates_csv, show_col_types = FALSE)
}
# output directory
path_gene <- safe_dir_create(file.path(out_base, gene_id))
# GTF part
gtf_res <- get_gtf_features_for_gene(gene_id, gtf)
# JGI part (may be empty if not found)
jgi_res <- get_jgi_annotation_for_gene(gene_id, jgi_anno)
jgi_go_kv <- go_long_kv(jgi_res$anno_go_long)
# candidates part (optional)
cand_res <- get_candidates_info(gene_id, df_candidates)
jgi_row_kv <- anno_row_kv(jgi_res$anno_gene)
# assemble all-in-one object
gene_summary <- list(
input = list(
gene_id = gene_id,
gene_id_clean = strip_version(gene_id),
jgi_anno_file = jgi_anno_file,
candidates_csv = candidates_csv,
out_dir = path_gene
),
overview = gtf_res$overview,
# <--- THIS is what you want:
tables = gtf_res$tables,
jgi = list(
annotation_row = jgi_row_kv,
go_terms = jgi_res$go_terms,
go_long = jgi_res$anno_go_long, # tu peux garder la table longue
go_kv = jgi_go_kv, # <- version “en colonne”
go_annot = jgi_res$go_annot
),
candidates = cand_res
)
# save RDS (main)
saveRDS(gene_summary, file.path(path_gene, "gene_summary.rds"))
# helpful exports (optional but nice)
write.csv(gtf_res$tables$all, file.path(path_gene, "gtf_all_features.csv"), row.names = FALSE)
if (!is.null(jgi_res$anno_gene) && nrow(jgi_res$anno_gene) > 0) {
write.csv(jgi_res$anno_gene, file.path(path_gene, "jgi_annotation.csv"), row.names = FALSE)
}
if (!is.null(jgi_res$anno_go_long) && nrow(jgi_res$anno_go_long) > 0) {
write.csv(jgi_res$anno_go_long, file.path(path_gene, "jgi_go_terms_long.csv"), row.names = FALSE)
}
if (!is.null(cand_res) && nrow(cand_res) > 0) {
write.csv(cand_res, file.path(path_gene, "candidates_row.csv"), row.names = FALSE)
}
return(gene_summary)
}
# ----------------------------
# Data
# ----------------------------
df_candidates<- read_csv(here::here("data/output/candidates/candidates.csv"))
path_output_marker<-here::here("report/snrnaseq/plot/marker")
DefaultAssay(obj_all) <- "SCT"
Idents(obj_all) <- obj_all$seurat_clusters
# Prepare SCT models for marker testing (needed once per object)
obj_all <- PrepSCTFindMarkers(obj_all)
# ----------------------------
# Example usage
# ----------------------------
gene_id <- "BdiBd21-3.4G0260700.v1.2" # example top marker for a pericycle-related cluster
gene_base2 <- "BdiBd21-3.5G0204500"
gene_id <- "BdiBd21-3.4G0260700.v1.2"
gene_id <- "BdiBd21-3.5G0204500.v1.2"
gene_id <- "BdiBd21-3.2G0308600.v1.2" # Best from 9 with GO
# LOAD GJI info
# anno_file <- here::here("data/inputs_ref/JGI/BdistachyonBd21_3_537_v1.2.P14.annotation_info.txt.gz")
# jgi_anno <- read_tsv(anno_file, show_col_types = FALSE)
#
# gene_id_clean <- strip_version(gene_id)
#
# jgi_anno2 <- jgi_anno %>%
# mutate(
# locus_clean = strip_version(locusName),
# transcript_clean = strip_version(transcriptName)
# )
# Try matching by locusName first, otherwise transcriptName
# anno_gene <- jgi_anno2 %>%
# filter(locus_clean == gene_id_clean | transcript_clean == gene_id_clean)
# function to create all plot but also function that regroup all info.
export_gene_marker_plots_and_summary<-function(gene_id){
print(gene_id)
path_gene <- file.path(path_output_marker, gene_id)
gene_summary <- build_and_save_gene_summary(
gene_id = gene_id,
gtf = gtf,
jgi_anno_file = here("data/inputs_ref/JGI/BdistachyonBd21_3_537_v1.2.P14.annotation_info.txt.gz"),
candidates_csv = here("data/output/candidates/candidates.csv"),
out_base = here("report/snrnaseq/plot/marker")
)
# gene_summary$overview
# gene_summary$jgi$go_terms
# gene_summary$tables$all
#
# gene_summary$jgi$go_annot
# gene_summary$jgi$annotation_row
# gene_summary$jgi$go_long
#### Cluster-wise average expression for the example gene ----
get_gene_cluster_means <- function(obj, gene_id,
assay = "SCT",
layer = "counts",
ident_col = "seurat_clusters") {
DefaultAssay(obj) <- assay
Idents(obj) <- obj[[ident_col]][, 1]
mat <- GetAssayData(obj, assay = assay, layer = layer)
if (!gene_id %in% rownames(mat)) {
stop("Gene ", gene_id, " is not present in layer '", layer, "' of assay ", assay, ".")
}
expr_vec <- as.numeric(mat[gene_id, ])
cl <- as.factor(Idents(obj))
mean_by_cl <- tapply(expr_vec, cl, mean, na.rm = TRUE)
tibble(
cluster = names(mean_by_cl),
mean_expr = as.numeric(mean_by_cl)
)
}
p_mean <- get_gene_cluster_means(
obj_all,
gene_id = gene_id,
assay = "SCT",
layer = "counts" # or "data" if you prefer SCT residuals
) %>% ggplot(., aes(x = cluster, y = mean_expr)) +
geom_col() +
theme_bw() +
labs(
x = "Cluster",
y = paste("Mean", gene_id, "expression (", DefaultAssay(obj_all), "/", "counts", ")", sep = ""),
title = paste("Average expression of", gene_id, "per cluster")
)
fig_export(
path = paste0(path_gene,"/","mean_per_cluster"),
plot_x = p_mean,
height_i = 4.5,
width_i = 6,
res_i = 300,
format = "png"
)
# Mean expression by cluster and timepoint (heatmap)
if ("timepoint" %in% colnames(obj_all@meta.data)) {
DefaultAssay(obj_all) <- "SCT"
mat_ct <- GetAssayData(obj_all, assay = "SCT", layer = "counts")
stopifnot(gene_id %in% rownames(mat_ct))
expr_vec_ct <- as.numeric(mat_ct[gene_id, ])
meta_ct <- obj_all@meta.data
# Force timepoint order: 0, 4, 6, 8, 12 (with or without 'h')
tp_vals <- as.character(meta_ct$timepoint)
tp_vals <- gsub("\\s+", "", tp_vals)
has_h <- any(grepl("h$", tp_vals, ignore.case = TRUE))
desired_tp <- if (has_h) c("0h", "4h", "6h", "8h", "12h") else c("0", "4", "6", "8", "12")
# keep only levels that exist
desired_tp <- desired_tp[desired_tp %in% unique(tp_vals)]
meta_ct$timepoint <- if (length(desired_tp) >= 2) factor(tp_vals, levels = desired_tp) else factor(tp_vals)
df_ct <- tibble::tibble(
cluster = as.factor(meta_ct$seurat_clusters),
timepoint = meta_ct$timepoint,
expr = expr_vec_ct
) |>
dplyr::group_by(cluster, timepoint) |>
dplyr::summarise(mean_expr = mean(expr, na.rm = TRUE), .groups = "drop")
p_mean_ct <- ggplot(df_ct, aes(x = timepoint, y = cluster, fill = mean_expr)) +
geom_tile() +
scale_fill_gradient(low = "grey90", high = "purple") +
theme_bw() +
labs(
x = "Timepoint",
y = "Cluster",
fill = "Mean expression",
title = paste("Mean", gene_id, "expression by cluster and timepoint")
)+ scale_y_discrete(limits = rev)
fig_export(
path = paste0(path_gene, "/", "mean_cluster_timepoint"),
plot_x = p_mean_ct,
height_i = 4,
width_i = 7,
res_i = 300,
format = "png"
)
}
# Violin and UMAP views for the example marker ----
# if you want to combine plots later
plot_gene_violin_umap <- function(
obj,
gene_id,
assay = "SCT",
cluster_col = "seurat_clusters",
tp_col = "timepoint",
reduction = "umap",
expr_layer = "data", # "data" or "counts"
pt_size = 0.1,
tp_order = c("0", "4", "6", "8", "12") # desired order (without 'h')
) {
# packages
# Seurat, ggplot2, patchwork (for plot_layout/plot_annotation)
# Assay & identities
DefaultAssay(obj) <- assay
if (!cluster_col %in% colnames(obj@meta.data)) {
stop("Column '", cluster_col, "' not found in obj@meta.data.")
}
Idents(obj) <- as.factor(obj@meta.data[[cluster_col]])
# Check gene existence in the requested layer
mat <- tryCatch(
GetAssayData(obj, assay = assay, layer = expr_layer),
error = function(e) GetAssayData(obj, assay = assay, slot = expr_layer) # Seurat v4 fallback
)
if (!gene_id %in% rownames(mat)) {
stop("Gene ", gene_id, " is not present in assay ", assay, " / layer ", expr_layer, ".")
}
## 1) Violin plot by cluster
p_violin <- tryCatch(
VlnPlot(
obj,
features = gene_id,
group.by = cluster_col,
assay = assay,
layer = expr_layer,
pt.size = pt_size
),
error = function(e) {
# Seurat v4 fallback
VlnPlot(
obj,
features = gene_id,
group.by = cluster_col,
assay = assay,
slot = expr_layer,
pt.size = pt_size
)
}
) +
theme_bw() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(face = "bold")
) +
labs(
x = "Cluster",
y = paste0(gene_id, " expression (", assay, "/", expr_layer, ")"),
title = paste("Expression of", gene_id, "by cluster")
)
## 2) UMAP feature plot (all timepoints combined)
p_umap <- FeaturePlot(
obj,
features = gene_id,
reduction = reduction,
order = TRUE
) +
theme_bw() +
theme(
axis.text = element_blank(),
axis.title = element_blank(),
axis.ticks = element_blank(),
plot.title = element_text(face = "bold")
) +
labs(title = paste("UMAP -", gene_id))
## 3) Optional UMAP split by timepoint (ordered panels)
p_umap_split <- NULL
if (!is.null(tp_col) && tp_col %in% colnames(obj@meta.data)) {
# Force timepoint order by setting factor levels
tp_vals <- as.character(obj@meta.data[[tp_col]])
tp_vals <- gsub("\\s+", "", tp_vals)
# Detect whether your timepoints include 'h' suffix
has_h <- any(grepl("h$", tp_vals, ignore.case = TRUE))
if (has_h) {
desired_levels <- paste0(tp_order, "h")
} else {
desired_levels <- tp_order
}
# Apply factor with desired order; keep only levels that exist in data
desired_levels <- desired_levels[desired_levels %in% unique(tp_vals)]
if (length(desired_levels) >= 2) {
obj@meta.data[[tp_col]] <- factor(tp_vals, levels = desired_levels)
} else {
# fallback: keep existing order if we can't match
obj@meta.data[[tp_col]] <- factor(tp_vals)
}
p_umap_split <- FeaturePlot(
obj,
features = gene_id,
reduction = reduction,
split.by = tp_col,
ncol = min(4, length(levels(obj@meta.data[[tp_col]]))),
order = TRUE,
keep.scale = "all"
) &
theme_bw() &
theme(
axis.text = element_blank(),
axis.title = element_blank(),
axis.ticks = element_blank(),
plot.title = element_text(face = "bold")
)
p_umap_split <- p_umap_split +
patchwork::plot_layout(guides = "collect") +
patchwork::plot_annotation(title = paste("UMAP split by", tp_col, "-", gene_id)) &
theme(legend.position = "right")
}
list(
violin = p_violin,
umap = p_umap,
umap_split = p_umap_split
)
}
# execute ----
plots_gene <- plot_gene_violin_umap(
obj = obj_all,
gene_id = gene_id,
assay = "SCT",
cluster_col = "seurat_clusters",
tp_col = "timepoint",
reduction = "umap"
)
# Export example marker plots
fig_export(
path = paste0(path_gene,"/","violin_SCT_cluster"),
plot_x = plots_gene$violin,
height_i = 14,
width_i = 17,
res_i = 300,
format = "png"
)
fig_export(
path = paste0(path_gene,"/","umap"),
plot_x = plots_gene$umap,
height_i = 14,
width_i = 16,
res_i = 300,
format = "png"
)
if (!is.null(plots_gene$umap_split)) {
fig_export(
path = paste0(path_gene,"/","umap_split"),
plot_x = plots_gene$umap_split,
height_i = 10,
width_i = 24,
res_i = 300,
format = "png"
)
}
}
export_gene_marker_plots_and_summary("BdiBd21-3.1G0921900.v1.2")
gene_ids <- unique(paste0(candidates$brachypodium_gene_id, ".v1.2"))
gene_ids <-markers_cl20 %>% tibble::rownames_to_column("gene") %>% dplyr::arrange(dplyr::desc(avg_log2FC)) %>% dplyr::slice_head(n = 10) %>% dplyr::pull(gene) # 10 first from cluster 20
gene_ids <-markers_cl16 %>% tibble::rownames_to_column("gene") %>% dplyr::arrange(dplyr::desc(avg_log2FC)) %>% dplyr::slice_head(n = 10) %>% dplyr::pull(gene) # 10 first from cluster 16
to_remove <- c(
"BdiBd21-3.1G0589500.v1.2"
#"BdiBd21-3.2G0000000.v1.2"
)
gene_ids <- setdiff(gene_ids, to_remove)
# warning some are not in our genome
purrr::walk(gene_ids, export_gene_marker_plots_and_summary)
```
```{r, echo=FALSE, results="asis", message=FALSE, warning=FALSE}
root_dir <- "plot/marker"
gene_dirs <- list.dirs(root_dir, full.names = TRUE, recursive = FALSE)
gene_names <- basename(gene_dirs)
umap_path <- file.path(gene_dirs, "umap.png")
umap_split_path <- file.path(gene_dirs, "umap_split.png")
mean_per_cluster_path <- file.path(gene_dirs, "mean_per_cluster.png")
violin_SCT_cluster_path <- file.path(gene_dirs, "violin_SCT_cluster.png")
mean_cluster_timepoint_path <- file.path(gene_dirs, "mean_cluster_timepoint.png")
rds_path <- file.path(gene_dirs, "gene_summary.rds")
keep <- file.exists(umap_path) | file.exists(umap_split_path) | file.exists(rds_path)
gene_names <- gene_names[keep]
gene_dirs <- gene_dirs[keep]
umap_path <- umap_path[keep]
umap_split_path <- umap_split_path[keep]
rds_path <- rds_path[keep]
render_overview <- function(ov) {
if (is.null(ov)) return("_overview absent_\n\n")
esc <- function(x) {
x <- as.character(x)
x <- gsub("\\|", "\\\\|", x)
x <- gsub("\n", "<br>", x)
x
}
if (is.list(ov) && !is.null(names(ov))) {
key <- names(ov)
val <- vapply(ov, function(v) {
if (is.null(v)) return("")
if (length(v) == 1 && (is.atomic(v) || is.factor(v))) {
return(ifelse(is.na(v), "NA", esc(v)))
}
if (is.atomic(v) && !is.null(names(v))) {
parts <- paste0(names(v), ": ", as.character(v))
return(esc(paste(parts, collapse = "; ")))
}
out <- paste(capture.output(print(v)), collapse = " ")
out <- gsub("\\s+", " ", out)
esc(out)
}, character(1))
paste0(
"| key | value |\n|---|---|\n",
paste0("| **", esc(key), "** | ", val, " |\n", collapse = ""),
"\n"
)
} else {
out <- paste(capture.output(print(ov)), collapse = "\n")
paste0("```text\n", out, "\n```\n\n")
}
}
# pkg : htmltools
# install.packages("htmltools")
print_table_simple <- function(tab, id, max_rows = 500, max_height_px = 520) {
if (is.null(tab)) { cat("_table absent_\n\n"); return(invisible(NULL)) }
if (!is.data.frame(tab) || nrow(tab) == 0) { cat("_table vide/incorrecte_\n\n"); return(invisible(NULL)) }
flatten_col <- function(x) {
if (is.factor(x)) return(as.character(x))
if (inherits(x, "POSIXt") || inherits(x, "Date")) return(as.character(x))
if (is.list(x)) {
return(vapply(x, function(el) {
if (is.null(el) || length(el) == 0) return("")
if (is.atomic(el) || is.factor(el)) return(paste(as.character(el), collapse = "; "))
out <- paste(capture.output(print(el)), collapse = " ")
gsub("\\s+", " ", out)
}, character(1)))
}
if (!is.atomic(x)) {
out <- vapply(seq_along(x), function(i) {
z <- paste(capture.output(print(x[i])), collapse = " ")
gsub("\\s+", " ", z)
}, character(1))
return(out)
}
x
}
tab2 <- as.data.frame(lapply(tab, flatten_col), stringsAsFactors = FALSE)
if (is.finite(max_rows) && nrow(tab2) > max_rows) tab2 <- tab2[seq_len(max_rows), , drop = FALSE]
esc <- function(v) {
v <- as.character(v)
v[is.na(v)] <- "NA"
htmltools::htmlEscape(v)
}
tab3 <- as.data.frame(lapply(tab2, esc), stringsAsFactors = FALSE)
thead <- paste0("<tr>", paste0("<th>", names(tab3), "</th>", collapse = ""), "</tr>")
tbody <- apply(tab3, 1, function(r) paste0("<tr>", paste0("<td>", r, "</td>", collapse = ""), "</tr>"))
html <- paste0(
'<div id="', id, '" style="overflow:auto; max-height:', max_height_px, 'px; border:1px solid #e5e5e5; border-radius:8px; padding:8px;">',
'<table class="table table-sm table-striped" style="width:max-content; margin:0;">',
"<thead>", thead, "</thead>",
"<tbody>", paste0(tbody, collapse = ""), "</tbody>",
"</table></div>\n\n"
)
cat(html)
invisible(tab2)
}
if (length(gene_names) == 0) {
cat("Aucune ressource trouvée (attendu: `plot/marker/<GENE>/umap.png`, `umap_split.png` et/ou `gene_summary.rds`).")
} else {
o <- order(tolower(gene_names))
gene_names <- gene_names[o]
rds_path <- rds_path[o]
umap_path <- umap_path[o]
umap_split_path <- umap_split_path[o]
mean_per_cluster_path <- mean_per_cluster_path[o]
violin_SCT_cluster_path <- violin_SCT_cluster_path[o]
mean_cluster_timepoint_path <- mean_cluster_timepoint_path[o]
cat(":::: {.panel-tabset}\n\n")
for (i in seq_along(gene_names)) {
cat("## ", gene_names[i], "\n\n", sep = "")
cat("::: {.panel-tabset}\n\n")
gene_summary <- NULL
if (file.exists(rds_path[i])) {
gene_summary <- tryCatch(readRDS(rds_path[i]), error = function(e) NULL)
}
cat("### Overview\n\n")
if (!is.null(gene_summary)) cat(render_overview(gene_summary$overview)) else cat("_gene_summary.rds absent/illisible_\n\n")
cat("### All\n\n")
if (!is.null(gene_summary) && !is.null(gene_summary$tables) && "all" %in% names(gene_summary$tables)) {
print_table_simple(gene_summary$tables$all, id = paste0("tbl_all_", i), max_rows = 500)
} else {
cat("_gene_summary$tables$all absent_\n\n")
}
cat("### JGI\n\n")
if (!is.null(gene_summary) &&
!is.null(gene_summary$jgi) &&
!is.null(gene_summary$jgi$annotation_row)) {
print_table_simple(
gene_summary$jgi$annotation_row,
id = paste0("tbl_jgi_", i),
max_rows = 50 # c’est 1 ligne chez toi, mais au cas où…
)
} else {
cat("_gene_summary$jgi$annotation_row absent_\n\n")
}
cat("### GO\n\n")
if (!is.null(gene_summary) &&
!is.null(gene_summary$jgi) &&
!is.null(gene_summary$jgi$go_kv)) {
print_table_simple(
gene_summary$jgi$go_kv,
id = paste0("tbl_jgi_", i),
max_rows = 50 # c’est 1 ligne chez toi, mais au cas où…
)
} else {
cat("_gene_summary$jgi$go_kv_\n\n")
}
cat("### Umap\n\n")
if (file.exists(umap_path[i])) cat("{width=100%}\n\n", sep="") else cat("_umap.png absent_\n\n")
cat("### Umap split\n\n")
if (file.exists(umap_split_path[i])) cat("{width=100%}\n\n", sep="") else cat("_umap_split.png absent_\n\n")
cat("### Mean per cluster\n\n")
if (file.exists(mean_per_cluster_path[i])) cat("{width=100%}\n\n", sep="") else cat("_mean_per_cluster_path.png absent_\n\n")
cat("### Violin per cluster\n\n")
if (file.exists(violin_SCT_cluster_path[i])) cat("{width=100%}\n\n", sep="") else cat("_violin_SCT_cluster_path.png absent_\n\n")
cat("### Mean cluster timepoint\n\n")
if (file.exists(mean_cluster_timepoint_path[i])) cat("{width=100%}\n\n", sep="") else cat("_mean_cluster_timepoint_path.png absent_\n\n")
cat(":::\n\n") # fin tabset interne
}
cat("::::\n") # fin tabset externe
}
```
These plots show in which clusters the candidate pericycle marker BdiBd21-3.4G0260700 is most strongly expressed (barplot), how its expression is distributed within each cluster (violin), and how its average expression varies jointly with cluster identity and timepoint (heatmap).
The mean-expression barplot highlights the cluster where BdiBd21-3.5G0204500 is most enriched, the violin plot shows how specific this enrichment is compared to other clusters, and the UMAP views reveal where these expressing cells sit in the global embedding and across timepoints. The associated `markers_gene2` table (especially the `p_val_adj` column) indicates whether this enrichment is statistically significant.