Commit 4a435285 authored by Leonard Herault's avatar Leonard Herault
Browse files

initial commit R_src

parent 5566366a
# Adapted from https://github.com/satijalab/seurat/issues/2201
DoMultiBarHeatmap <- function (object,
features = NULL,
cells = NULL,
group.by = "ident",
additional.group.by = NULL,
group.bar = TRUE,
disp.min = -2.5,
disp.max = NULL,
slot = "scale.data",
assay = NULL,
label = TRUE,
size = 5.5,
hjust = 0,
angle = 45,
raster = TRUE,
draw.lines = TRUE,
lines.width = NULL,
group.bar.height = 0.02,
combine = TRUE,
group.col = NULL
)
{
cells <- cells %||% colnames(x = object)
if (is.numeric(x = cells)) {
cells <- colnames(x = object)[cells]
}
assay <- assay %||% DefaultAssay(object = object)
DefaultAssay(object = object) <- assay
features <- features %||% VariableFeatures(object = object)
## Why reverse???
features <- rev(x = unique(x = features))
disp.max <- disp.max %||% ifelse(test = slot == "scale.data",
yes = 2.5, no = 6)
possible.features <- rownames(x = GetAssayData(object = object,
slot = slot))
if (any(!features %in% possible.features)) {
bad.features <- features[!features %in% possible.features]
features <- features[features %in% possible.features]
if (length(x = features) == 0) {
stop("No requested features found in the ", slot,
" slot for the ", assay, " assay.")
}
warning("The following features were omitted as they were not found in the ",
slot, " slot for the ", assay, " assay: ", paste(bad.features,
collapse = ", "))
}
data <- as.data.frame(x = as.matrix(x = t(x = GetAssayData(object = object,
slot = slot)[features, cells, drop = FALSE])))
object <- suppressMessages(expr = StashIdent(object = object,
save.name = "ident"))
group.by <- group.by %||% "ident"
groups.use <- object[[c(group.by, additional.group.by)]][cells, , drop = FALSE]
plots <- list()
for (i in group.by) {
data.group <- data
group.use <- groups.use[, c(i, additional.group.by), drop = FALSE]
for(colname in colnames(group.use)){
if (!is.factor(x = group.use[[colname]])) {
group.use[[colname]] <- factor(x = group.use[[colname]])
}
}
if (draw.lines) {
lines.width <- lines.width %||% ceiling(x = nrow(x = data.group) *
0.0025)
placeholder.cells <- sapply(X = 1:(length(x = levels(x = group.use[[i]])) *
lines.width), FUN = function(x) {
return(Seurat:::RandomName(length = 20))
})
placeholder.groups <- data.frame(foo=rep(x = levels(x = group.use[[i]]), times = lines.width))
placeholder.groups[additional.group.by] = NA
colnames(placeholder.groups) <- colnames(group.use)
rownames(placeholder.groups) <- placeholder.cells
group.levels <- levels(x = group.use[[i]])
group.use <- sapply(group.use, as.vector)
rownames(x = group.use) <- cells
group.use <- rbind(group.use, placeholder.groups)
na.data.group <- matrix(data = NA, nrow = length(x = placeholder.cells),
ncol = ncol(x = data.group), dimnames = list(placeholder.cells,
colnames(x = data.group)))
data.group <- rbind(data.group, na.data.group)
}
group.use <- group.use[with(group.use, eval(parse(text=paste('order(', paste(c(i, additional.group.by), collapse=', '), ')', sep='')))), , drop=F]
plot <- Seurat:::SingleRasterMap(data = data.group, raster = raster,
disp.min = disp.min, disp.max = disp.max, feature.order = features,
cell.order = rownames(x = group.use), group.by = group.use[[i]])
if (group.bar) {
if(is.null(group.col)) {
colPal <- scales::hue_pal()
} else {
colPal <- function(n) {
return(group.col[c(1:n)])
}
}
pbuild <- ggplot_build(plot = plot)
group.use2 <- group.use
cols <- list()
na.group <- Seurat:::RandomName(length = 20)
for (colname in rev(x = colnames(group.use2))){
if (colname == group.by){
colid = colname
colPal2 <- colPal
} else {
colid = colname
colPal2 <- function(n) {
return(rev(scales::hue_pal()(n)))
}
}
if (draw.lines) {
levels(x = group.use2[[colname]]) <- c(levels(x = group.use2[[colname]]), na.group)
group.use2[placeholder.cells, colname] <- na.group
cols[[colname]] <- c(colPal2(length(x = levels(x = group.use[[colname]]))), "#FFFFFF")
} else {
cols[[colname]] <- c(colPal2(length(x = levels(x = group.use[[colname]]))))
}
names(x = cols[[colname]]) <- levels(x = group.use2[[colname]])
y.range <- diff(x = pbuild$layout$panel_params[[1]]$y.range)
y.pos <- max(pbuild$layout$panel_params[[1]]$y.range) + y.range * 0.015
y.max <- y.pos + group.bar.height * y.range
pbuild$layout$panel_params[[1]]$y.range <- c(pbuild$layout$panel_params[[1]]$y.range[1], y.max)
#print(cols[[colname]][group.use2[[colname]]])
plot <- suppressMessages(plot +
annotation_raster(raster = t(x = cols[[colname]][group.use2[[colname]]]), xmin = -Inf, xmax = Inf, ymin = y.pos, ymax = y.max) +
annotation_custom(grob = grid::textGrob(label = colid, hjust = 0, gp = grid::gpar(cex = 0.75)), ymin = mean(c(y.pos, y.max)), ymax = mean(c(y.pos, y.max)), xmin = Inf, xmax = Inf) +
coord_cartesian(ylim = c(0, y.max), clip = "off"))
#temp <- as.data.frame(cols[[colname]][levels(group.use[[colname]])])
#colnames(temp) <- 'color'
#temp$x <- temp$y <- 1
#temp[['name']] <- as.factor(rownames(temp))
#temp <- ggplot(temp, aes(x=x, y=y, fill=name)) + geom_point(shape=21, size=5) + labs(fill=colname) + theme(legend.position = "bottom")
#legend <- get_legend(temp)
#multiplot(plot, legend, heights=3,1)
if ((colname == group.by) && label) {
x.max <- max(pbuild$layout$panel_params[[1]]$x.range)
x.divs <- pbuild$layout$panel_params[[1]]$x.major
group.use$x <- x.divs
label.x.pos <- tapply(X = group.use$x, INDEX = group.use[[colname]],
FUN = median) * x.max
label.x.pos <- data.frame(group = names(x = label.x.pos),
label.x.pos)
label.x.pos$group <- str_split_fixed(label.x.pos$group,"_",2)[,2]
plot <- plot + geom_text(stat = "identity",
data = label.x.pos, aes_string(label = "group",
x = "label.x.pos"), y = y.max + y.max *
0.03 * 0.5, angle = angle, hjust = hjust,
size = size)
plot <- suppressMessages(plot + coord_cartesian(ylim = c(0,
y.max + y.max * 0.002 * max(nchar(x = levels(x = group.use[[colname]]))) *
size), clip = "off"))
}
}
}
plot <- plot + theme(line = element_blank())
plots[[i]] <- plot
}
if (combine) {
plots <- CombinePlots(plots = plots)
}
return(plots)
}
# function to make gene enrichment analyzis with gProfileR
## function to perform gprofler enrcihment analysis on each gene cluster of
## a differentially expressed gene result (eg data frame of DE genes with a column cluster)
getGeneClustGprofile <- function(deg_clust,
background,
organism = "mmusculus",
hier_filtering = "none",
ordered_query = F) {
resEnrichTest <- lapply(split(as.vector(deg_clust$Gene),f= deg_clust$Cluster),gprofiler,
organism = "mmusculus",
custom_bg = background,
ordered_query = ordered_query,
hier_filtering = hier_filtering)
return(resEnrichTest)
}
## function to plot gprofiler results
plotWriteResEnrich <- function(resEnrichTest,
sources=c("BP","keg"),
outdir = "./gprofiler",
clusterLabel =names(resEnrichTest),
colors = brewer.pal(length(resEnrichTest)+1, "Set1")) {
dir.create(outdir,recursive = T,showWarnings = F)
colors <- colors
for (cluster in clusterLabel) {
write.table(file= paste(outdir,"/gprofiler_table_clust_",cluster,".tsv", sep = ""),
resEnrichTest[[cluster]],quote = F,sep = '\t',row.names = F)
for (s in sources) {
results <- resEnrichTest[[cluster]][which(resEnrichTest[[cluster]][,"domain"] == s),]
results <- results[order(results$p.value),]
results <- as.data.frame(results[,c("term.id","term.name","p.value")])
results$logPval <- -log(base = 10,x = results$p.value)
if(s == "tf") {
results$term.name <- paste(results$term.name,results$term.id)
}
## TO DO : if to much term plot only the first n
if (length(results$term.name) > 55) {
png(paste(outdir,"/gprofiler_top30_",cluster,"_",s,".png",sep =""),width = 600,height = 600)
gp <- ggplot(results[c(1:30),], aes(x=reorder(term.name, logPval), y=logPval)) +
geom_bar(stat='identity',fill = colors[which(names(resEnrichTest)==cluster)] ) +
coord_flip() +
xlab("term name") +
ylab("-log10(p value)") +
theme(text = element_text(size = 20))
print(gp)
dev.off()
png(paste(outdir,"/gprofiler_",cluster,"_",s,".png",sep =""),width = 1400,height = 1400)
} else {
png(paste(outdir,"/gprofiler_",cluster,"_",s,".png",sep =""),width = 600,height = 600)
}
gp <- ggplot(results, aes(x=reorder(term.name, logPval), y=logPval)) +
geom_bar(stat='identity',fill = colors[which(names(resEnrichTest)==cluster)] ) +
coord_flip() +
xlab("term name") +
ylab("-log10(p value)") +
theme(text = element_text(size = 20))
print(gp)
dev.off()
}
}
}
## function to composate
gProfileAnalysis <- function(deg_clust,
background,
organism = "mmusculus",
hier_filtering = "moderate",
ordered_query = F,
sources=c("BP","keg","CC",'MF','tf'),
outdir = "/gprofiler",
clusterLabel =names(resEnrichTest),
colors = brewer.pal(length(resEnrichTest)+1, "Set1")) {
resEnrichTest <- getGeneClustGprofile(deg_clust = deg_clust,
background = background,
organism = organism,
hier_filtering = hier_filtering,
ordered_query = ordered_query)
plotWriteResEnrich(resEnrichTest,
sources=sources,
outdir = outdir,
clusterLabel =clusterLabel,
colors = colors)
return(resEnrichTest)
}
# function for hypergeometric test
# hypergeometric test https://dputhier.github.io/ASG/practicals/go_statistics_td/go_statistics_td_2015.html
#DEPRECATED Seurat 2
# testHyper <- function(gene_set, signature, background,seurat) {
# signature <- signature[which(is.element(signature,set = rownames(seurat@data)))]
# genesMarked <- gene_set[which(is.element(gene_set,set = signature))]
# p.value <- phyper(q=length(genesMarked) -1,
# m=length(signature),
# n=length(background) - length(signature), k= length(gene_set), lower.tail=FALSE)
# return(p.value)
# }
#For Seurat 3
testHyper3 <- function(gene_set, signature, background,seurat) {
signature <- signature[which(is.element(signature,set = rownames(seurat)))]
genesMarked <- gene_set[which(is.element(gene_set,set = signature))]
p.value <- phyper(q=length(genesMarked) -1,
m=length(signature),
n=length(background) - length(signature), k= length(gene_set), lower.tail=FALSE)
return(p.value)
}
testHyperCells <- function(cells_pull, cells_setTarget, allCells) {
cellsMarked <- gene_set[which(is.element(gene_set,set = cells_set))]
p.value <- phyper(q=length(cellsMarked) -1,
m=length(cells_setTarget),
n=length(allCells) - length(cells_setTarget), k= length(cells_pull), lower.tail=FALSE)
return(p.value)
}
## DEPRECATED Seurat2
# testHyperSig <- function(signature,seurat,markers,clust) {
# result <- testHyper(gene_set=markers[which(markers$Cluster == clust),"Gene"],
# signature=signature,seurat=seurat,background=rownames(seurat@data))
# return(result)
# }
#For Seurat 3
testHyperSig3 <- function(signature,seurat,markers,clust) {
result <- testHyper3(gene_set=markers[which(markers$Cluster == clust),"Gene"],
signature=signature,seurat=seurat,background=rownames(seurat))
return(result)
}
##------------------------------------------------------------------------------
## L. Herault
##------------------------------------------------------------------------------
## -----------------------------------------------------------------------------
## Libraries
## -----------------------------------------------------------------------------
suppressMessages(library(monocle))
suppressMessages(library(Seurat))
suppressMessages(library(BiocParallel))
suppressMessages(library(getopt))
suppressMessages(library(gridExtra))
suppressMessages(library(gProfileR))
suppressMessages(library(RColorBrewer))
suppressMessages(library(readxl))
suppressMessages(library(stringr))
suppressMessages(library(scales))
suppressMessages(library(grid))
source("R_src/getSigPerCells.R")
source("R_src/Enrichment.R")
source("R_src/funForSeurat.R")
# Seurat 3 analysis of an individual sample required for using CaSTLe script.
## -----------------------------------------------------------------------------
## Command line args
## -----------------------------------------------------------------------------
spec = matrix(c(
'help', 'h', 0, "logical", "Help about the program",
'inputRDS', 'i', 1, "character", "REQUIRED : 10X data prepared as monocle or seurat object (.RDS generated by prepare_data.R).",
'outdir', 'o',1, "character", 'Outdir path (default ./)',
"num_dim", 'n',1,"numeric", "Number of dimension to use for ordering (eg n first pc of PCA on input data) 10 by default",
"correction", "b",1,"character", "Covariable to use as blocking factor (eg one or several columns of pData: betch, cell cycle phases... separated by +)",
"minPropCellExp", "p",1,"numeric", "minimal proportion of cells that expressed the genes kept for the analaysis 0.001 by default",
"norm_method", "z",1, "character", "normalisation method, logNorm seurat (by default) or sctransform",
"resolution", "r", 1,"numeric", "resolution for Seurat clustering 0.9 by default",
"signaturesFile", "s", 1, "character", "path to folder with signature list store as rds object",
"identRemoved", "d", 1, "character", "Optionnal cluster to remove only work if input is a seurat object",
"nonExpressedGenesRemoved", "e", 0,"logical", "non expressed gene already removed default to FALSE",
"gprofiler", "g", 0, "logical", "if true doing gprofiler default to true",
"logfc_threshold", "l", 1, "numeric", "logfc threshold for finding cluster markers (1 cluster vs all deg) 0.25 by default",
"rodriguezSig", "k", 1, "character", "path for xls file of Rodriguez result"
), byrow=TRUE, ncol=5);
opt = getopt(spec)
# if help was asked, print a friendly message
# and exit with a non-zero error code
args <- commandArgs()
if ( !is.null(opt$help) | is.null(opt$inputRDS)) {
cat("For an individual sample, perform PCA, tSNE, UMAP, Louvain clustering, diff expression between clusters with Seurat3 package with filtered poor quality cells data (gbm_cds monocle object)")
cat(getopt(spec, usage=TRUE))
q(status=1)
}
#set default arguments values
if (is.null(opt$num_dim)) {
opt$num_dim <- 10
}
if (is.null(opt$resolution)) {
opt$resolution <- 0.8
}
if (is.null(opt$minPropCellExp)) {
opt$minPropCellExp <- 0.001
print(opt$minPropCellExp)
}
if (is.null(opt$logfc_threshold)) {
opt$logfc_threshold = 0.25
}
print(paste("logfc threshold:", opt$logfc_threshold) )
if (is.null(opt$outdir)) {
opt$outdir = "./"
}
if (is.null(opt$nonExpressedGenesRemoved)) {
opt$nonExpressedGenesRemoved = F
} else {
opt$nonExpressedGenesRemoved = T
}
if (is.null(opt$gprofiler)) {
opt$gprofiler <- T
} else {
opt$gprofiler <- F
}
print(opt$gprofiler)
print(opt$nonExpressedGenesRemoved)
# get correction vector
if (!is.null(opt$correction)) {
corrections <- strsplit(x = opt$correction,split = "\\+")[[1]]
} else {
corrections <- NULL
}
# create blank for the plots
blank <- grid.rect(gp=gpar(col="white"))
dir.create(opt$outdir,recursive = T,showWarnings = F)
object <- readRDS(opt$inputRDS)
# input is a monocle object in the workflow
# Code to use this script with seurat input (eg to re analyse)
if (is(object)[1]== "CellDataSet") {
print("input monocle")
gbm_cds <- readRDS(opt$inputRDS)
} else {
if(is(object) == "Seurat") {
#It is easier to pass by a monocle object and then reget the seurat object with the genes filtered
print("input seurat object")
seurat <- object
if (!is.null(opt$identRemoved)) {
print(unique(seurat@active.ident))
print(paste("removing identities",opt$identRemoved))
seurat <- SubsetData(seurat,ident.remove = strsplit(x = opt$identRemoved,split = ",")[[1]],subset.raw = T) # may not work with seurat3
}
pd <- new("AnnotatedDataFrame", data = gbm_cds@meta.data)
fd <- new("AnnotatedDataFrame", data = data.frame(gene_short_name = rownames(gbm_cds)))
rownames(fd) <- fd$gene_short_name
gbm_cds <- newCellDataSet(GetAssayData(gbm_cds,slot = "counts"),
phenoData = pd,
featureData = fd,
lowerDetectionLimit = 0.1,
expressionFamily = negbinomial.size())
gbm_cds <- detectGenes(gbm_cds, min_expr = 0.1)
}
}
if(opt$nonExpressedGenesRemoved == F) {
print("remove non expressed genes (non expressed in at least X% of the cells X user option in monocle dp feature 5% in seurat tutorial 0,1%)")
fData(gbm_cds)$use_for_seurat <- fData(gbm_cds)$num_cells_expressed > opt$minPropCellExp * ncol(gbm_cds)
gbm_to_seurat <- gbm_cds[fData(gbm_cds)$use_for_seurat==T,]
} else {
gbm_to_seurat <- gbm_cds
}
if (is.element("Cluster",colnames(pData(gbm_to_seurat)))) {
colnames(pData(gbm_to_seurat))[which(colnames(pData(gbm_to_seurat))=="Cluster")] <- "Cluster_monocle"
}
##Convert to seurat3
#check for dup genes
# Only needed if ensemble id (it is the case in the workflow)
dupGeneNames <- fData(gbm_to_seurat)[which((duplicated(featureData(gbm_to_seurat)$gene_short_name))),"gene_short_name"]
if(length(dupGeneNames) == 0) {
rownames(gbm_to_seurat) <- fData(gbm_to_seurat)$gene_short_name
} else {
write.csv(fData(gbm_to_seurat)[which(is.element(fData(gbm_to_seurat)$gene_short_name,dupGeneNames)),],
paste(opt$outdir,"/dupGenesName.csv",sep = "")
)
print("Dup gene short names existing, making them unique...")
rownames(gbm_to_seurat) <- make.unique(fData(gbm_to_seurat)$gene_short_name, sep = "--")
}
seurat <- CreateSeuratObject(counts = exprs(gbm_to_seurat), meta.data = pData(gbm_to_seurat))
#################################################################################################
####################################### Seurat Workflow #########################################
#################################################################################################
if (opt$norm_method == "sctransform") {
seurat <- SCTransform(object = seurat, vars.to.regress = c("G2M_score","S_score","G1_score"))
} else {
seurat <- NormalizeData(object = seurat)
seurat <- FindVariableFeatures(object = seurat,selection.method = "vst", nfeatures = 2000, verbose = T)
seurat <- ScaleData(object = seurat,vars.to.regress = corrections)
}
seurat <- RunPCA(object = seurat)
png(paste(opt$outdir,"/ElbowPlot.png",sep =""))
ElbowPlot(object = seurat,ndims = 30)
dev.off()
print("Clustering...")
seurat <- FindNeighbors(object = seurat,dims = c(1:opt$num_dim),k.param = 20)
seurat <- FindClusters(object = seurat,resolution = c(0.5,0.6,0.7,0.8,0.9,1,1.2))
print("Running TSNE...")
seurat <- RunTSNE(seurat,dims = c(1:opt$num_dim))
print("Running UMAP...")
seurat <- RunUMAP(seurat,dims = c(1:opt$num_dim))
print("UMAP ok")
print(colnames(seurat@meta.data))
colPrefix <- "RNA_snn_res."
if(opt$norm_method == "sctransform") {
colPrefix <- "SCT_snn_res."
}
umapListRes <- list()
tsneListRes <- list()
for (r in c(0.6,0.8,1,1.2)) {
umapListRes[[as.character(r)]] <- DimPlot(seurat,
reduction = "umap",
label = T,
group.by = paste(colPrefix,r,sep="")) +
NoLegend() +
ggtitle(paste("res",r))
tsneListRes[[as.character(r)]] <- DimPlot(seurat,
reduction = "tsne",
label = T,
group.by = paste(colPrefix,r,sep="")) +
NoLegend() +
ggtitle(paste("res",r))
}
png(paste(opt$outdir,"/tsne_different_res.png",sep = ""),height = 800,width = 800)
grid.arrange(tsneListRes[[1]],tsneListRes[[2]],tsneListRes[[3]],tsneListRes[[4]])
dev.off()
png(paste(opt$outdir,"/umap_different_res.png",sep = ""),height = 800,width = 800)
grid.arrange(umapListRes[[1]],umapListRes[[2]],umapListRes[[3]],umapListRes[[4]])
dev.off()
Idents(seurat) <- paste(colPrefix,opt$resolution,sep = "")
seurat@meta.data$numclust <- seurat@meta.data[,paste(colPrefix,opt$resolution,sep = "")]
#Check for unwanted source of variation
pPhases <- DimPlot(seurat,group.by = "phases")
if (!is.null(seurat@meta.data$predicted)) {
pPred <- DimPlot(seurat,group.by = "predicted")
} else {
pPred <- blank
}
pUMI <- FeaturePlot(seurat, "Total_mRNAs")
pMito <- FeaturePlot(seurat, "percentMito")
png(paste(opt$outdir,"/umap_factors.png",sep = ""),height = 800,width = 800)
grid.arrange(pPhases,pPred,pUMI,pMito)
dev.off()
#Check for unwanted source of variation
pPhases <- DimPlot(seurat,group.by = "phases",reduction = "tsne")
if (!is.null(seurat@meta.data$predicted)) {
pPred <- DimPlot(seurat,group.by = "predicted",reduction = "tsne")
} else {
pPred <- blank
}
pUMI <- FeaturePlot(seurat, "Total_mRNAs",reduction = "tsne")
pMito <- FeaturePlot(seurat, "percentMito",reduction = "tsne")
png(paste(opt$outdir,"/tsne_factors.png",sep = ""),height = 800,width = 800)
grid.arrange(pPhases,pPred,pUMI,pMito)
dev.off()
#Check for genes
gene_list<- c("Pdzk1ip1","Mllt3",
"Ctla2a","Cd27","Cd34",
"Lig1","Hells","Tyms",
"Notch2","Lst1",
"Irf7","Stat1",
"Pf4","Itga2b",
"Klf1","Gata1",
"Mpo","Cd48",
"Fcer1a","Hdc",
"Il7r","Thy1", # Ccr9 not found
"Cdc20","Ccnb1","Racgap1",
"Mzb1","Ly6d", "Trp53inp1",
"Jun","Fos","Nr4a1")
gene_list <- gene_list[which(is.element(set=rownames(seurat),el = gene_list))]
dir.create(paste(opt$outdir,"/genes/umap/",sep =""),recursive = T)
dir.create(paste(opt$outdir,"/genes/tsne/",sep =""),recursive = T)
for (g in gene_list) {
png(paste(opt$outdir,"/genes/umap/",g,".png",sep = ""))
plot(FeaturePlot(seurat,features = g))
dev.off()
}
for (g in gene_list) {
png(paste(opt$outdir,"/genes/tsne/",g,".png",sep = ""))
plot(FeaturePlot(seurat,features = g),reduction = "tsne")
dev.off()
}
markers <- FindAllMarkers(seurat,only.pos = T,logfc.threshold= opt$logfc_threshold)
markers <- markers[which(markers$p_val_adj < 0.05),]
write.table(x = markers,paste(opt$outdir,"/markers.tsv", sep =""),sep = "\t",quote = F,row.names = F,col.names = T)
dir.create(paste(opt$outdir,"/markers/",sep = ""))
ylab <- "LogNormalized UMI counts"
if (opt$norm_method == "sctransform") {
ylab <- "Expression level"
}
for (numClust in unique(markers$cluster)) {
print(head(markers[which(markers$cluster == numClust),],n=9))
png(paste(opt$outdir,"/markers/Cluster_",numClust,"_topGenesVlnPlot.png",sep =""),width = 1000, height = 1000)
plot(VlnPlot(object = seurat, features = head(markers[which(markers$cluster == numClust),"gene"],n=9),pt.size = 0.5) +
labs(x = "Clusters",y= ylab,colour = "black") +
theme(axis.text = element_text(size=20),
plot.title = element_text(size=25)) )
dev.off()
}
#################################################################################################
########## Enrichment ##########
#################################################################################################
## Add signatures
dir.create(paste(opt$outdir,"/cellSignatures/",sep = ""))
signatures <- readRDS(opt$signaturesFile)
for (sig in c(1:length(signatures))) {
sigName <- names(signatures)[sig]
signature <- signatures[[sig]]
seurat <- scoreCells3(seurat,signature,outdir= paste(opt$outdir,"/cellSignatures/",sep=""),sigName)
}
## Enrichment
# Modify colnames of markers to use gProfileAnalysis function
firstup <- function(x) {
substr(x, 1, 1) <- toupper(substr(x, 1, 1))
x
}
colnames(markers) <- firstup(colnames(markers))
#be careful background
if(opt$gprofiler) {
gprofiler_result <- gProfileAnalysis(deg_clust = markers,
outdir = paste(opt$outdir,"/gProfileR", sep =""),
background = rownames(seurat),
colors = hue_pal()(length(unique(markers$Cluster))))
saveRDS(gprofiler_result,file=paste(opt$outdir,"/gProfileR/gprofiler_results.rds",sep =""))
}
# Test Rodriguez cluster sig
if(!is.null(opt$rodriguezSig)) {
clusterNames <- c("C1","C2","C3","Mk","Er","Ba","Neu","Mo1","Mo2", "preDC","preB","preT")
RodriguezClustersSig <- lapply(X= c(1:length(clusterNames)),FUN = read_xlsx,path = opt$rodriguezSig)
names(RodriguezClustersSig) <- clusterNames
getOnlyPos <- function(clustersSig) {
clusterSig <- clustersSig[which(clustersSig$log.effect > 0),]
return(clusterSig)
}
RodriguezClustersSigPos <- lapply(X= RodriguezClustersSig, getOnlyPos)
signaturesRodriguez <- lapply(RodriguezClustersSigPos,"[[",1 )
firstup <- function(x) {
substr(x, 1, 1) <- toupper(substr(x, 1, 1))
x
}
colnames(markers) <- firstup(colnames(markers))
getClustEnrichForRodriguez <- function(clust,signatures,seurat,markers) {
clustSig <- lapply(signatures,testHyperSig3,seurat,markers,clust)
if (!is.null(seurat@meta.data$predicted)) {
propCellTypesLearned <- table(seurat@meta.data[which(seurat@meta.data$numclust==clust),"predicted"])/length(seurat@meta.data[which(seurat@meta.data$numclust==clust),"predicted"])
} else{
propCellTypesLearned <- NULL
}
clustInfo <- c(clustSig,propCellTypesLearned)
return(clustInfo)
}
clust_list <- lapply(unique(markers$Cluster),getClustEnrichForRodriguez,signature=signaturesRodriguez,seurat =seurat,markers =markers) ##markers arg forgotten in testHyper sig
names(clust_list) <- paste("cluster_",unique(markers$Cluster),sep="")
clust_table <- as.data.frame(matrix(unlist(clust_list), nrow=length(unlist(clust_list[1]))))
colnames(clust_table) <- names(clust_list)
rownames(clust_table) <- names(clust_list[[1]])
clust_df <- as.data.frame(t(clust_table))
write.csv(clust_df,file = paste(opt$outdir,"/clustInfoRodriguez.csv",sep =""),quote = F)
}
#Clusters table summary
clust_table <- data.frame()
print("Creating cluster table summary")
getClustInfo <- function(clust,signatures,seurat,markers) {
clustInfo <- list()
clustInfo$num_cells <- dim(seurat@meta.data[which(seurat@active.ident==clust),])[1]
clustInfo$percent_cells <- clustInfo$num_cells/dim(seurat@meta.data)[1]
percentPhases <- table(seurat@meta.data[which(seurat@active.ident==clust),"phases"])/length(seurat@meta.data[which(seurat@active.ident==clust),"phases"]) #In fact this is fraction not percentage
if(!is.null(seurat@meta.data$predicted)) {
percentPredicted <- table(seurat@meta.data[which(seurat@active.ident==clust),"predicted"])/length(seurat@meta.data[which(seurat@active.ident==clust),"predicted"]) #In fact this is fraction not percentage
for (p in unique(seurat@meta.data$predicted)) {
print(p)
if (is.element(p,names(percentPhases))) {
clustInfo[[p]] <- percentPhases[p]
} else {
clustInfo[[p]] <- 0
}
}
}
for (p in c("G1_G0","S","G2_M")) {
if (is.element(p,names(percentPhases))) {
clustInfo[[p]] <- percentPhases[p]
} else {
clustInfo[[p]] <- 0
}
}
clustInfo$median_genes_expressed <- median(seurat@meta.data[which(seurat@active.ident==clust),"numGenesPerCells"])
clustInfo$median_nUMI <- median(seurat@meta.data[which(seurat@active.ident==clust),"Total_mRNAs"])
clustInfo$median_percentMitochGenes <- median(seurat@meta.data[which(seurat@active.ident==clust),"percentMito"])
clustSig <- lapply(signatures,testHyperSig3,seurat,markers,clust)
clustInfo <- c(clustInfo,clustSig)
}
allSignatures <- c(signatures,signaturesRodriguez)
clust_list <- lapply(levels(unique(seurat@active.ident)),getClustInfo,allSignatures,seurat,markers)
names(clust_list) <- paste("cluster_",levels(unique(seurat@active.ident)),sep="")
saveRDS(clust_list,paste(opt$outdir,"/clust_list_save.rds",sep =""))
clust_table <- as.data.frame(matrix(unlist(clust_list), nrow=length(unlist(clust_list[1]))))
colnames(clust_table) <- names(clust_list)
rownames(clust_table) <- names(clust_list[[1]])
clust_df <- as.data.frame(t(clust_table))
write.table(x = clust_df,file = paste(opt$outdir,"/clusters_table.tsv",sep =""),sep="\t",quote=F,col.names = NA)
saveRDS(seurat,file = paste(opt$outdir,"/seurat.rds",sep = ""))
##------------------------------------------------------------------------------
## L. Herault
##------------------------------------------------------------------------------
## -----------------------------------------------------------------------------
## Libraries
## -----------------------------------------------------------------------------
suppressMessages(library(monocle))
suppressMessages(library(Seurat))
suppressMessages(library(BiocParallel))
suppressMessages(library(getopt))
suppressMessages(library(gridExtra))
suppressMessages(library(gProfileR))
suppressMessages(library(RColorBrewer))
suppressMessages(library(readxl))
suppressMessages(library(stringr))
suppressMessages(library(scales))
suppressMessages(library(cowplot))
suppressMessages(library(sctransform))
suppressMessages(library(stringr))
suppressMessages(library(plyr))
suppressMessages(library(grid))
source("R_src/getSigPerCells.R")
source("R_src/Enrichment.R")
source("R_src/funForSeurat.R")
# Seurat 3 integration workflow
## -----------------------------------------------------------------------------
## Command line args
## -----------------------------------------------------------------------------
spec = matrix(c(
'help', 'h', 0, "logical", "Help about the program",
'inputFiles', 'i', 1, "character", "REQUIRED: 10X dataset paths prepared as hspc.combined object (.RDS generated by prepare_data.R) separated by +",
'signaturesFile', 's',1, "character", "REQUIRED: signatures rds file",
'outdir', 'o',1, "character", 'Outdir path (default ./)',
"num_dim_CCA", 'q',1,"numeric", "First n dimensions of CCA to use for FindIntegrationAnchors Seurat function (15 by default)",
"num_dim_weight", 'w',1,"numeric", "First n dimensions to use for IntegrateData Seurat function (15 by default)",
"num_dim", 'n',1,"numeric", "First n dimensions of PCA to use for clustering, UMAP and TSNE (15 by default)",
"num_dim_integrated",'N',1,"numeric", "Number of PCA dimension computed to analyse integrated data (40 by default)",
"cores", 'c',1, "numeric", "Number of cores to use for ordering (for differencially expressed gene between clusters test)",
"resolution", 'r',1, "numeric", "resolution for hspc.combined clustering",
"correction", "b",1,"character", "Covariable to use as blocking factor (eg one or several columns of pData: betch, cell cycle phases... separated by +)",
"logfc_threshold", "l", 1, "numeric", "logfc threshold for finding cluster markers (1 cluster vs all deg) 0.25 by default",
"rodriguezSig", "k", 1, "character", "path for xls file of Rodriguez result",
"norm_method", "z",1, "character", "normalization method, logNorm (by default) or sctransform",
"reusePca", "p", 0, "character","re use pca calculated before when caculating anchor weights for each dataset default to FALSE (permit to correct for cell cycle before integration)"
), byrow=TRUE, ncol=5);
opt = getopt(spec)
# if help was asked, print a friendly message
# and exit with a non-zero error code
args <- commandArgs()
if ( !is.null(opt$help) | is.null(opt$inputFiles)) {
cat("Perform Seurat 3 integration workflow, then cluster the cell with seurat 3 at different resolutions")
cat(getopt(spec, usage=TRUE))
q(status=1)
}
#set default arguments values
if (is.null(opt$resolution)) {
opt$resolution <- 0.6
}
if (is.null(opt$logfc_threshold)) {
opt$logfc_threshold <- 0.25
}
if (is.null(opt$num_dim_CCA)) {
opt$num_dim_CCA <- 15
}
if (is.null(opt$num_dim_weight)) {
opt$num_dim_weight <- 15
}
if (is.null(opt$num_dim)) {
opt$num_dim <- 15
}
if (is.null(opt$num_dim_integrated)) {
opt$num_dim_integrated <- 40
}
if (is.null(opt$norm_method)) {
opt$norm_method <- "logNorm"
}
if (is.null(opt$reusePca)) {
opt$reusePca <- FALSE
}
blank <- grid.rect(gp=gpar(col="white"))
print(opt$reusePca)
dir.create(opt$outdir,recursive = T,showWarnings = F)
corrections <- strsplit(x = opt$correction,split = "\\+")[[1]]
dir.create(opt$outdir,recursive = T,showWarnings = F)
hspc.listFile <- strsplit(opt$inputFiles,split = "\\+")[[1]]
hspc.list <- list()
#load dataset
for (i in 1:length(x = hspc.listFile)) {
#For testing, in final workflow sampleName will be incorporated in metadata with the loading of cell ranger matrix
sampleName <- strsplit(hspc.listFile[i],split = "/")[[1]][2]
hspc.list[[i]] <- readRDS(hspc.listFile[i])
hspc.list[[i]]@meta.data$sampleName <- sampleName
hspc.list[[i]] <- RenameCells(hspc.list[[i]],add.cell.id = sampleName)
if(opt$norm_method != "sctransform") {
hspc.list[[i]] <- NormalizeData(object = hspc.list[[i]], verbose = FALSE)
hspc.list[[i]] <- FindVariableFeatures(object = hspc.list[[i]], selection.method = "vst",
nfeatures = 2000, verbose = FALSE)
} else {
hspc.list[[i]] <- SCTransform(hspc.list[[i]],vars.to.regress = corrections,verbose = T)
}
}
hspc.anchors <- FindIntegrationAnchors(object.list = hspc.list, dims = 1:opt$num_dim_CCA)
if(opt$reusePca) {
print("Re use pca")
hspc.combined <- IntegrateData(anchorset = hspc.anchors, weight.reduction = "pca", dims = 1:opt$num_dim_weight)
}
hspc.combined <- IntegrateData(anchorset = hspc.anchors, dims = 1:opt$num_dim_weight)
DefaultAssay(object = hspc.combined) <- "integrated"
# Run the standard workflow for visualization and clustering
#Here certainly need to reuse SCTransform on combined data if norm_method = sct
hspc.combined <- ScaleData(object = hspc.combined, verbose = T,vars.to.regress = corrections)
hspc.combined <- RunPCA(object = hspc.combined, npcs = opt$num_dim_integrated, verbose = FALSE)
png(paste(opt$outdir,"/ElbowPlot.png",sep =""))
ElbowPlot(object = hspc.combined,ndims = opt$num_dim_integrated)
dev.off()
# t-SNE UMAP and Clustering
hspc.combined <- RunUMAP(object = hspc.combined, reduction = "pca", dims = 1:opt$num_dim)
hspc.combined <- RunTSNE(object = hspc.combined, reduction = "pca", dims = 1:opt$num_dim)
hspc.combined <- FindNeighbors(object = hspc.combined, reduction = "pca", dims = 1:opt$num_dim)
hspc.combined <- FindClusters(hspc.combined, resolution = c(0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1,1.2))
colPrefix <- "integrated_snn_res."
Idents(hspc.combined) <- paste(colPrefix,opt$resolution,sep = "")
hspc.combined@meta.data$AGE <- "Old"
hspc.combined@meta.data$AGE[which(hspc.combined@meta.data$age == "2_months")] <- "Young"
# Visualization of the samples
p1 <- DimPlot(object = hspc.combined, reduction = "umap", group.by = "sampleName")
p2 <- DimPlot(object = hspc.combined, reduction = "umap", label = TRUE)
p3 <- DimPlot(object = hspc.combined, reduction = "umap", group.by = "AGE")
p4 <- DimPlot(object = hspc.combined, reduction = "umap", group.by = "runDate")
p3tsne <- DimPlot(object = hspc.combined, reduction = "tsne", group.by = "AGE")
p4tsne <- DimPlot(object = hspc.combined, reduction = "tsne", group.by = "runDate")
png(paste(opt$outdir,"/umap_samples.png",sep =""),height = 800,width=1200)
grid.arrange(p1, p2,ncol = 2)
dev.off()
png(paste(opt$outdir,"/AGE.png",sep =""),height = 800,width=1200)
grid.arrange(p3, p3tsne,ncol = 2)
dev.off()
png(paste(opt$outdir,"/runDate.png",sep =""),height = 800,width=1200)
grid.arrange(p4, p4tsne,ncol = 2)
dev.off()
png(paste(opt$outdir,"/UmapClusters.png",sep =""),height = 800,width=1200)
DimPlot(object = hspc.combined, reduction = "umap", label = TRUE)
dev.off()
p1tsne <- DimPlot(object = hspc.combined, reduction = "tsne", group.by = "sampleName")
p2tsne <- DimPlot(object = hspc.combined, reduction = "tsne", label = TRUE)
png(paste(opt$outdir,"/tsne_samples.png",sep =""),height = 800,width=1000)
grid.arrange(p1tsne, p2tsne,ncol = 2)
dev.off()
#Visualistaoin one sample at a time
sampleNames <- unique(hspc.combined@meta.data$sampleName)
dir.create(paste(opt$outdir,"/samples/umap/",sep = ""),recursive = T)
dir.create(paste(opt$outdir,"/samples/tsne/",sep = ""),recursive = T)
#samplePlotList <- list()
for (s in sampleNames) {
png(paste(opt$outdir,"/samples/umap/",s,".png",sep = ""))
plot(DimPlot(object = hspc.combined, cells= grep(s, hspc.combined@meta.data$sampleName), reduction = "umap") + ggtitle(s))
dev.off()
png(paste(opt$outdir,"/samples/tsne/",s,".png",sep = ""))
plot(DimPlot(object = hspc.combined, cells= grep(s, hspc.combined@meta.data$sampleName), reduction = "tsne") + ggtitle(s))
dev.off()
}
umapListRes <- list()
tsneListRes <- list()
for (r in c(0.6,0.8,1,1.2)) {
umapListRes[[as.character(r)]] <- DimPlot(hspc.combined,
reduction = "umap",
label = T,
group.by = paste(colPrefix,r,sep="")) +
NoLegend() +
ggtitle(paste("res",r))
tsneListRes[[as.character(r)]] <- DimPlot(hspc.combined,
reduction = "tsne",
label = T,
group.by = paste(colPrefix,r,sep="")) +
NoLegend() +
ggtitle(paste("res",r))
}
png(paste(opt$outdir,"/tsne_different_res.png",sep = ""),height = 800,width = 800)
grid.arrange(tsneListRes[[1]],tsneListRes[[2]],tsneListRes[[3]],tsneListRes[[4]])
dev.off()
png(paste(opt$outdir,"/umap_different_res.png",sep = ""),height = 800,width = 800)
grid.arrange(umapListRes[[1]],umapListRes[[2]],umapListRes[[3]],umapListRes[[4]])
dev.off()
hspc.combined@meta.data$numclust <- hspc.combined@meta.data[,paste(colPrefix,opt$resolution,sep = "")]
#Check for unwanted source of variation UMAP
pPhases <- DimPlot(hspc.combined,group.by = "phases")
if (!is.null(hspc.combined@meta.data$predicted)) {
pPred <- DimPlot(hspc.combined,group.by = "predicted")
} else {
pPred <- blank
}
pUMI <- FeaturePlot(hspc.combined, "Total_mRNAs")
pMito <- FeaturePlot(hspc.combined, "percentMito")
png(paste(opt$outdir,"/umap_factors.png",sep = ""),height = 800,width = 800)
grid.arrange(pPhases,pPred,pUMI,pMito)
dev.off()
#Check for unwanted source of variation tSNE
pPhases <- DimPlot(hspc.combined,group.by = "phases",reduction = "tsne")
if (!is.null(hspc.combined@meta.data$predicted)) {
pPred <- DimPlot(hspc.combined,group.by = "predicted",reduction = "tsne")
} else {
pPred <- blank
}
pUMI <- FeaturePlot(hspc.combined, "Total_mRNAs",reduction = "tsne")
pMito <- FeaturePlot(hspc.combined, "percentMito",reduction = "tsne")
png(paste(opt$outdir,"/tsne_factors.png",sep = ""),height = 800,width = 800)
grid.arrange(pPhases,pPred,pUMI,pMito)
dev.off()
#Check for genes
gene_list<- c("Pdzk1ip1","Mllt3",
"Ctla2a","Cd27","Cd34",
"Lig1","Hells","Tyms",
"Notch2","Lst1",
"Irf7","Stat1",
"Pf4","Itga2b",
"Klf1","Gata1",
"Mpo","Cd48",
"Fcer1a","Hdc",
"Il7r","Thy1", # Ccr9 not found
"Cdc20","Ccnb1","Racgap1",
"Mzb1","Ly6d", "Trp53inp1",
"Jun","Fos","Nr4a1","Jund")
gene_list <- gene_list[which(is.element(set=rownames(hspc.combined),el = gene_list))]
dir.create(paste(opt$outdir,"/genes/umap/",sep =""),recursive = T)
dir.create(paste(opt$outdir,"/genes/tsne/",sep =""),recursive = T)
for (g in gene_list) {
png(paste(opt$outdir,"/genes/umap/",g,".png",sep = ""))
plot(FeaturePlot(hspc.combined,features = g))
dev.off()
}
for (g in gene_list) {
png(paste(opt$outdir,"/genes/tsne/",g,".png",sep = ""))
plot(FeaturePlot(hspc.combined,features = g,reduction = "tsne"))
dev.off()
}
#AGE prop for each cluster
age_clust <- getAGEPropPerClustBarplot(hspc.combined)
#sample prop for each cluster
sample_clust <- getSamplePropPerClustBarplot(hspc.combined)
#runDate prop for each cluster
runDate_clust <- getRunDatePropPerClustBarplot(hspc.combined)
png(paste(opt$outdir,"/propPerClust.png",sep = ""))
grid.arrange(age_clust,sample_clust,runDate_clust,p2tsne)
dev.off()
## saving results
saveRDS(hspc.combined,paste(opt$outdir,"/combined.rds",sep =""))
##------------------------------------------------------------------------------
## L. Herault
##------------------------------------------------------------------------------
## -----------------------------------------------------------------------------
## Libraries
## -----------------------------------------------------------------------------
suppressMessages(library(monocle))
suppressMessages(library(Seurat))
suppressMessages(library(BiocParallel))
suppressMessages(library(getopt))
suppressMessages(library(gridExtra))
suppressMessages(library(gProfileR))
suppressMessages(library(RColorBrewer))
suppressMessages(library(readxl))
suppressMessages(library(stringr))
suppressMessages(library(plyr))
suppressMessages(library(biomaRt))
suppressMessages(library(scales))
source("R_src/getSigPerCells.R")
source("R_src/Enrichment.R")
source("R_src/funForSeurat.R")
# Analysis of seurat 3 integration and clusternig workflow
## -----------------------------------------------------------------------------
## Command line args
## -----------------------------------------------------------------------------
spec = matrix(c(
'help', 'h', 0, "logical", "Help about the program",
'inputRDS', 'i', 1, "character", "REQUIRED : 10X data prepared qs seurat object with CCA and first clustering (.RDS generated by prepare_data.R).",
'outdir', 'o',1, "character", 'Outdir path (default ./)',
"mart", 'm',1, "character", "mart data rds format if biomaRt server is down and you have a save ",
"resolution", 'r',1, "numeric", "resolution of the clustering analysed default to 0.8",
"num_dim", 'n', 1, "numeric", "first n dimension of the CCA to use for the reclustering",
"minPropCellExp", "p",1,"character", "proportion minimal of cells that expressed the genes kept for the analaysis 0.001 by default",
"signaturesFile", "s", 1, "character", "path to folder with signature list store as rds object",
"logfc_threshold", "l", 1, "numeric", "logfc threshold for finding cluster markers (1 cluster vs all deg) 0.25 by default",
"rodriguezSig", "k", 1, "character", "path for xls file of Rodriguez result",
"gprofiler", "g", 0, "logical", "Put to avoid gprofiler analysis",
"norm_method", "z",1, "character", "normalisation method, logNorm seurat (by default) or sctransform"
), byrow=TRUE, ncol=5);
opt = getopt(spec)
# if help was asked, print a friendly message
# and exit with a non-zero error code
args <- commandArgs()
if ( !is.null(opt$help) | is.null(opt$inputRDS)) {
cat("Perform diff expression between clusters previously obtained with Seurat3, make a summary table of cluster metrics and signature enrichments")
cat(getopt(spec, usage=TRUE))
q(status=1)
}
if (is.null(opt$logfc_threshold)) {
opt$logfc_threshold = 0.25
}
if (is.null(opt$norm_method)) {
opt$norm_method = "logNorm"
}
print(paste("logfc threshold:", opt$logfc_threshold) )
if (is.null(opt$outdir)) {
opt$outdir = "./"
}
if(is.null(opt$doMarkers)) {
opt$doMarkers = T
}
if (is.null(opt$gprofiler)) {
opt$gprofiler <- T
} else {
opt$gprofiler <- F
}
if (is.null(opt$mart)){
mart <- useDataset("mmusculus_gene_ensembl", useMart("ensembl"))
} else {
mart <- readRDS(opt$mart)
}
dir.create(opt$outdir,recursive = T,showWarnings = F)
hspc.combined <- readRDS(opt$inputRDS)
signatures <- readRDS(opt$signaturesFile)
# Set active idents/slot
Idents(hspc.combined) <- paste("integrated_snn_res.",opt$resolution,sep ="")
hspc.combined@meta.data$numclust <- Idents(hspc.combined)
DefaultAssay(object = hspc.combined) <- "RNA"
if (opt$norm_method == "sctransform") {
DefaultAssay(object = hspc.combined) <- "SCT"
}
## Plotting
png(paste(opt$outdir,"/seurat_res",opt$resolution,".png",sep = ""),height = 800,width = 800)
DimPlot(hspc.combined,label = TRUE,pt.size = 1.5) + ggtitle(paste("seurat_res",opt$resolution,sep = "")) + NoLegend()
dev.off()
#age <- getAGEPropPerClustBarplot(hspc.combined)
print(paste("integrated_snn_res.",opt$resolution,sep =""))
print(colnames(hspc.combined@meta.data))
#ageEnrich <- getEnrichAge(hspc.combined,clustCol =paste("integrated_snn_res.",opt$resolution,sep =""),metaCol = "AGE")
# ageEnrich <- as.data.frame(t(ageEnrich))
# ageEnrich$color <- "black"
# ageEnrich[which(as.numeric(as.vector(ageEnrich$phyper)) < 0.05),"color"] <- "red"
# age <- age + theme(axis.text.y = element_text(colour = ageEnrich[,'color']))
# png(paste(opt$outdir,"/bp_AGE.png",sep = ""),height = 800,width = 800)
# plot(age)
# dev.off()
png(paste(opt$outdir,"/bp_sample.png",sep = ""),height = 800,width = 800)
getSamplePropPerClustBarplot(hspc.combined)
dev.off()
png(paste(opt$outdir,"/bp_runDate.png",sep = ""),height = 800,width = 800)
getRunDatePropPerClustBarplot(hspc.combined)
dev.off()
png(paste(opt$outdir,"/bp_phases.png",sep = ""),height = 800,width = 800)
getPhasePropPerClustBarplot(hspc.combined)
dev.off()
if (!is.null(hspc.combined@meta.data$predicted)) {
png(paste(opt$outdir,"/bp_predicted.png",sep = ""),height = 800,width = 800)
getPredictedPropPerClustBarplot(hspc.combined)
dev.off()
}
## Perform differential expression analysis between clusters
dirMarkers <- paste(opt$outdir,"/markers_res",opt$resolution,sep ="")
dir.create(dirMarkers)
markers <- FindAllMarkers(hspc.combined,only.pos = T,logfc.threshold= opt$logfc_threshold)
markers <- markers[which(markers$p_val_adj < 0.05),]
ylab <- "LogNormalized UMI counts"
if (opt$norm_method == "sctransform") {
ylab <- "Expression level"
}
for (numClust in unique(markers$cluster)) {
print(head(markers[which(markers$cluster == numClust),],n=9))
png(paste(dirMarkers,"/Cluster_",numClust,"_topGenesVlnPlot.png",sep =""),width = 1000, height = 1000)
print(plot(VlnPlot(object = hspc.combined, features = head(markers[which(markers$cluster == numClust),"gene"],n=9),pt.size = 0.5) +
labs(x = "Clusters",y= ylab,colour = "black") +
theme(axis.text = element_text(size=20),
plot.title = element_text(size=25)) ))
dev.off()
}
write.table(x = markers,paste(dirMarkers,"/markers.tsv", sep =""),sep = "\t",quote = F,row.names = F,col.names = T)
print("getting TF markers")
TF_markers <- getBM(attributes=c("external_gene_name"),
filters=c("external_gene_name","go"),
values=list(markers$gene,
"GO:0003700"),mart=mart)
#Add tf from bonzanni plus Zbtb16 and Gfi1b
TF_bonzani <- data.frame(external_gene_name = c("Spi1","Tal1","Zfpm1","Cbfa2t3","Erg","Fli1","Gata1","Gata2","Hhex","Runx1","Smad6","Zbtb16","Gfi1b"))
TF_markers <- rbind(TF_markers,TF_bonzani)
TF_markers <- unique(TF_markers)
write.table(TF_markers,paste(dirMarkers,"/markers_and_bonzanni_TF.tsv",sep =""),
sep = "\t",quote = F,row.names = F, col.names = F)
# Modify colnames of markers to use gProfileAnalysis function
firstup <- function(x) {
substr(x, 1, 1) <- toupper(substr(x, 1, 1))
x
}
colnames(markers) <- firstup(colnames(markers))
if(opt$gprofiler) {
gprofiler_result <- gProfileAnalysis(deg_clust = markers,
outdir = paste(dirMarkers,"/gProfileR", sep =""),
background = rownames(hspc.combined),
colors = hue_pal()(length(unique(markers$Cluster))))
saveRDS(gprofiler_result,file=paste(dirMarkers,"/gProfileR/gprofiler_results.rds",sep =""))
}
getClustTable(opt$rodriguezSig,markers=markers,signatures = signatures,seurat = hspc.combined,outdir = dirMarkers)
##------------------------------------------------------------------------------
## L. Herault
##------------------------------------------------------------------------------
## -----------------------------------------------------------------------------
## Libraries
## -----------------------------------------------------------------------------
suppressMessages(library(Seurat))
suppressMessages(library(BiocParallel))
suppressMessages(library(getopt))
suppressMessages(library(gridExtra))
#suppressMessages(library(gProfileR))
suppressMessages(library(RColorBrewer))
suppressMessages(library(readxl))
suppressMessages(library(stringr))
suppressMessages(library(scales))
suppressMessages(library(grid))
suppressMessages(library(ggplot2))
source("R_src/getSigPerCells.R")
source("R_src/Enrichment.R")
source("R_src/funForSeurat.R")
# Seurat 3 analysis of an individual sample required for using CaSTLe script.
## -----------------------------------------------------------------------------
## Command line args
## -----------------------------------------------------------------------------
spec = matrix(c(
'help', 'h', 0, "logical", "Help about the program",
'inputRDS', 'i', 1, "character", "REQUIRED : 10X data prepared as monocle or seurat object (.RDS generated by prepare_data.R).",
'outdir', 'o',1, "character", 'Outdir path (default ./)',
"num_dim", 'n',1,"numeric", "Number of dimension to use for ordering (eg n first pc of PCA on input data) 10 by default",
"correction", "b",1,"character", "Covariable to use as blocking factor (eg one or several columns of pData: betch, cell cycle phases... separated by +)",
"minPropCellExp", "p",1,"numeric", "minimal proportion of cells that expressed the genes kept for the analaysis 0.001 by default",
"norm_method", "z",1, "character", "normalisation method, logNorm seurat (by default) or sctransform",
"resolution", "r", 1,"numeric", "resolution for Seurat clustering 0.9 by default",
"signaturesFile", "s", 1, "character", "path to folder with signature list store as rds object",
"identRemoved", "d", 1, "character", "Optionnal cluster to remove only work if input is a seurat object",
"nonExpressedGenesRemoved", "e", 0,"logical", "non expressed gene already removed default to FALSE",
"gprofiler", "g", 0, "logical", "if true doing gprofiler default to true",
"logfc_threshold", "l", 1, "numeric", "logfc threshold for finding cluster markers (1 cluster vs all deg) 0.25 by default"
), byrow=TRUE, ncol=5);
opt = getopt(spec)
## For testing
#
# opt <- list()
# opt$outdir <- "output/testSeurat4/"
# opt$correction <- "G2M_score+S_score+G1_score"
# opt$inputRDS <- "seurat_treated.rds"
# opt$signatureFile <- "output/signatures/publicSignatures.rds"
# if help was asked, print a friendly message
# and exit with a non-zero error code
args <- commandArgs()
if ( !is.null(opt$help) | is.null(opt$inputRDS)) {
cat("For an individual sample, perform PCA, tSNE, UMAP, Louvain clustering, diff expression between clusters with Seurat3 package with filtered poor quality cells data (gbm_cds monocle object)")
cat(getopt(spec, usage=TRUE))
q(status=1)
}
#set default arguments values
if (is.null(opt$num_dim)) {
opt$num_dim <- 10
}
if (is.null(opt$resolution)) {
opt$resolution <- 0.8
}
if (is.null(opt$minPropCellExp)) {
opt$minPropCellExp <- 0.001
print(opt$minPropCellExp)
}
if (is.null(opt$logfc_threshold)) {
opt$logfc_threshold = 0.25
}
print(paste("logfc threshold:", opt$logfc_threshold) )
if (is.null(opt$outdir)) {
opt$outdir = "./"
}
if (is.null(opt$norm_method)) {
opt$norm_method = "logNorm"
}
if (is.null(opt$nonExpressedGenesRemoved)) {
opt$nonExpressedGenesRemoved = F
} else {
opt$nonExpressedGenesRemoved = T
}
# if (is.null(opt$gprofiler)) {
# opt$gprofiler <- T
# } else {
# opt$gprofiler <- F
# }
# get correction vector
if (!is.null(opt$correction)) {
corrections <- strsplit(x = opt$correction,split = "\\+")[[1]]
} else {
corrections <- NULL
}
dir.create(opt$outdir,recursive = T,showWarnings = F)
print(opt)
# Loading
seurat <- readRDS(opt$inputRDS)
# Remove non expressed genes if not done
if(opt$nonExpressedGenesRemoved == F) {
genesFiltered <- list(genes_before = dim(seurat)[1])
print("remove non expressed genes (non expressed in at least X% of the cells X user option in monocle dp feature 5% in seurat tutorial 0,1%)")
threshold <- opt$minPropCellExp * ncol(seurat)
seurat <- CreateSeuratObject(counts = as.matrix(GetAssayData(seurat,slot = 'counts',assay = "RNA")),
assay = "RNA",
meta.data = seurat@meta.data,
min.cells = threshold)
genesFiltered$genes_after <- dim(seurat)[1]
png(paste(opt$outdir,"/genesFiltering.png",sep = ""))
barplot(unlist(genesFiltered),main = "Genes filtering")
dev.off()
}
#################################################################################################
####################################### Seurat Workflow #########################################
#################################################################################################
classicSeuratWorkflow <- function(seurat, corrections,outdir) {
dir.create(outdir,recursive = T,showWarnings = F)
if (opt$norm_method == "sctransform") {
seurat <- SCTransform(object = seurat, vars.to.regress = corrections)
} else {
seurat <- NormalizeData(object = seurat)
seurat <- FindVariableFeatures(object = seurat,selection.method = "vst", nfeatures = 2000, verbose = T)
seurat <- ScaleData(object = seurat,vars.to.regress = corrections)
}
seurat <- RunPCA(object = seurat)
png(paste(outdir,"/ElbowPlot.png",sep =""))
ElbowPlot(object = seurat,ndims = 30)
dev.off()
print("Clustering...")
seurat <- FindNeighbors(object = seurat,dims = c(1:opt$num_dim),k.param = 20)
seurat <- FindClusters(object = seurat,resolution = c(0.5,0.6,0.7,0.8,0.9,1,1.2))
print("Running TSNE...")
seurat <- RunTSNE(seurat,dims = c(1:opt$num_dim))
print("Running UMAP...")
seurat <- RunUMAP(seurat,dims = c(1:opt$num_dim))
print("UMAP ok")
print(colnames(seurat@meta.data))
colPrefix <- "RNA_snn_res."
if(opt$norm_method == "sctransform") {
colPrefix <- "SCT_snn_res."
}
umapListRes <- list()
tsneListRes <- list()
for (r in c(0.6,0.8,1,1.2)) {
umapListRes[[as.character(r)]] <- DimPlot(seurat,
reduction = "umap",
label = T,
group.by = paste(colPrefix,r,sep="")) +
NoLegend() +
ggplot2::ggtitle(paste("res",r))
tsneListRes[[as.character(r)]] <- DimPlot(seurat,
reduction = "tsne",
label = T,
group.by = paste(colPrefix,r,sep="")) +
NoLegend() +
ggplot2::ggtitle(paste("res",r))
}
png(paste(outdir,"/tsne_different_res.png",sep = ""),height = 800,width = 800)
grid.arrange(tsneListRes[[1]],tsneListRes[[2]],tsneListRes[[3]],tsneListRes[[4]])
dev.off()
png(paste(outdir,"/umap_different_res.png",sep = ""),height = 800,width = 800)
grid.arrange(umapListRes[[1]],umapListRes[[2]],umapListRes[[3]],umapListRes[[4]])
dev.off()
Idents(seurat) <- paste(colPrefix,opt$resolution,sep = "")
seurat@meta.data$numclust <- seurat@meta.data[,paste(colPrefix,opt$resolution,sep = "")]
#Check for unwanted source of variation
pPhases <- DimPlot(seurat,group.by = "phases")
if (!is.null(seurat@meta.data$predicted)) {
pPred <- DimPlot(seurat,group.by = "predicted")
} else {
pPred <- DimPlot(seurat)
}
pUMI <- FeaturePlot(seurat, "nCount_RNA")
pMito <- FeaturePlot(seurat, "percentMito")
png(paste(outdir,"/umap_factors.png",sep = ""),height = 800,width = 800)
grid.arrange(pPhases,pPred,pUMI,pMito)
dev.off()
#Check for unwanted source of variation
pPhases <- DimPlot(seurat,group.by = "phases",reduction = "tsne")
if (!is.null(seurat@meta.data$predicted)) {
pPred <- DimPlot(seurat,group.by = "predicted",reduction = "tsne")
} else {
pPred <- DimPlot(seurat,reduction = "tsne")
}
pUMI <- FeaturePlot(seurat, "nCount_RNA",reduction = "tsne")
pMito <- FeaturePlot(seurat, "percentMito",reduction = "tsne")
png(paste(outdir,"/tsne_factors.png",sep = ""),height = 800,width = 800)
grid.arrange(pPhases,pPred,pUMI,pMito)
dev.off()
return(seurat)
}
## First without integration without correction
unintegrated <- seurat
DefaultAssay(unintegrated) <- "RNA"
classicSeuratWorkflow(unintegrated,correction = NULL,outdir = paste0(opt$outdir,"/SeuratWoIntegrationWoCr/"))
## without integration without correction
classicSeuratWorkflow(unintegrated,correction = corrections,outdir = paste0(opt$outdir,"/SeuratWoIntegrationWithCr/"))
## With integration without correction
if (!is.null(corrections)) {
print("Seurat workflow without correction")
classicSeuratWorkflow(seurat,correction = NULL,outdir = paste0(opt$outdir,"/SeuratWithoutCr/"))
}
## With integration with correction
seurat <- classicSeuratWorkflow(seurat,corrections = corrections,outdir =opt$outdir)
#################################################################################################
####################################### Markers analysis ########################################
#################################################################################################
markers <- FindAllMarkers(seurat,only.pos = T,logfc.threshold= opt$logfc_threshold)
markers <- markers[which(markers$p_val_adj < 0.05),]
write.table(x = markers,paste(opt$outdir,"/markers.tsv", sep =""),sep = "\t",quote = F,row.names = F,col.names = T)
dir.create(paste(opt$outdir,"/markers/",sep = ""))
ylab <- "LogNormalized UMI counts"
if (opt$norm_method == "sctransform") {
ylab <- "Expression level"
}
for (numClust in unique(markers$cluster)) {
print(head(markers[which(markers$cluster == numClust),],n=9))
png(paste(opt$outdir,"/markers/Cluster_",numClust,"_topGenesVlnPlot.png",sep =""),width = 1000, height = 1000)
plot(VlnPlot(object = seurat, features = head(markers[which(markers$cluster == numClust),"gene"],n=9),pt.size = 0.5) +
labs(x = "Clusters",y= ylab,colour = "black") +
theme(axis.text = element_text(size=20),
plot.title = element_text(size=25)) )
dev.off()
}
## Add signatures scores
dir.create(paste(opt$outdir,"/cellSignatures/",sep = ""))
signatures <- readRDS(opt$signaturesFile)
names(signatures) <- paste0(names(signatures),"_",seurat$sampleName[1])
for (sig in c(1:length(signatures))) {
sigName <- names(signatures)[sig]
signature <- signatures[[sig]]
seurat <- scoreCells3(seurat,signature,outdir= paste(opt$outdir,"/cellSignatures/",sep=""),sigName)
}
#################################################################################################
########## Enrichment ##########
#################################################################################################
# ## Enrichment
#
# # Modify colnames of markers to use gProfileAnalysis function
# firstup <- function(x) {
# substr(x, 1, 1) <- toupper(substr(x, 1, 1))
# x
# }
#
# colnames(markers) <- firstup(colnames(markers))
#
#
# #be careful background
# if(opt$gprofiler) {
# gprofiler_result <- gProfileAnalysis(deg_clust = markers,
# outdir = paste(opt$outdir,"/gProfileR", sep =""),
# background = rownames(seurat),
# colors = hue_pal()(length(unique(markers$Cluster))))
#
# saveRDS(gprofiler_result,file=paste(opt$outdir,"/gProfileR/gprofiler_results.rds",sep =""))
# }
#
#
#
# # Test Rodriguez cluster sig
# if(!is.null(opt$rodriguezSig)) {
#
# clusterNames <- c("C1","C2","C3","Mk","Er","Ba","Neu","Mo1","Mo2", "preDC","preB","preT")
#
# RodriguezClustersSig <- lapply(X= c(1:length(clusterNames)),FUN = read_xlsx,path = opt$rodriguezSig)
#
# names(RodriguezClustersSig) <- clusterNames
#
# getOnlyPos <- function(clustersSig) {
# clusterSig <- clustersSig[which(clustersSig$log.effect > 0),]
# return(clusterSig)
# }
#
# RodriguezClustersSigPos <- lapply(X= RodriguezClustersSig, getOnlyPos)
#
#
# signaturesRodriguez <- lapply(RodriguezClustersSigPos,"[[",1 )
#
#
# firstup <- function(x) {
# substr(x, 1, 1) <- toupper(substr(x, 1, 1))
# x
# }
#
# colnames(markers) <- firstup(colnames(markers))
#
# getClustEnrichForRodriguez <- function(clust,signatures,seurat,markers) {
# clustSig <- lapply(signatures,testHyperSig3,seurat,markers,clust)
# if (!is.null(seurat@meta.data$predicted)) {
# propCellTypesLearned <- table(seurat@meta.data[which(seurat@meta.data$numclust==clust),"predicted"])/length(seurat@meta.data[which(seurat@meta.data$numclust==clust),"predicted"])
# } else{
# propCellTypesLearned <- NULL
# }
# clustInfo <- c(clustSig,propCellTypesLearned)
# return(clustInfo)
# }
#
# clust_list <- lapply(unique(markers$Cluster),getClustEnrichForRodriguez,signature=signaturesRodriguez,seurat =seurat,markers =markers) ##markers arg forgotten in testHyper sig
#
# names(clust_list) <- paste("cluster_",unique(markers$Cluster),sep="")
#
# clust_table <- as.data.frame(matrix(unlist(clust_list), nrow=length(unlist(clust_list[1]))))
# colnames(clust_table) <- names(clust_list)
# rownames(clust_table) <- names(clust_list[[1]])
#
# clust_df <- as.data.frame(t(clust_table))
#
# write.csv(clust_df,file = paste(opt$outdir,"/clustInfoRodriguez.csv",sep =""),quote = F)
#
# }
#################################################################################################
########## Cluster summary table #########
#################################################################################################
clust_table <- data.frame()
print("Creating cluster summary table ")
firstup <- function(x) {
substr(x, 1, 1) <- toupper(substr(x, 1, 1))
x
}
colnames(markers) <- firstup(colnames(markers))
getClustInfo <- function(clust,signatures,seurat,markers) {
clustInfo <- list()
clustInfo$num_cells <- dim(seurat@meta.data[which(seurat@active.ident==clust),])[1]
clustInfo$percent_cells <- clustInfo$num_cells/dim(seurat@meta.data)[1]
percentPhases <- table(seurat@meta.data[which(seurat@active.ident==clust),"phases"])/length(seurat@meta.data[which(seurat@active.ident==clust),"phases"]) #In fact this is fraction not percentage
if(!is.null(seurat@meta.data$predicted)) {
percentPredicted <- table(seurat@meta.data[which(seurat@active.ident==clust),"predicted"])/length(seurat@meta.data[which(seurat@active.ident==clust),"predicted"]) #In fact this is fraction not percentage
for (p in unique(seurat@meta.data$predicted)) {
print(p)
if (is.element(p,names(percentPhases))) {
clustInfo[[p]] <- percentPhases[p]
} else {
clustInfo[[p]] <- 0
}
}
}
for (p in c("G1_G0","S","G2_M")) {
if (is.element(p,names(percentPhases))) {
clustInfo[[p]] <- percentPhases[p]
} else {
clustInfo[[p]] <- 0
}
}
clustInfo$median_genes_expressed <- median(seurat@meta.data[which(seurat@active.ident==clust),"nFeature_RNA"])
clustInfo$median_nUMI <- median(seurat@meta.data[which(seurat@active.ident==clust),"nCount_RNA"])
clustInfo$median_percentMitochGenes <- median(seurat@meta.data[which(seurat@active.ident==clust),"percentMito"])
clustSig <- lapply(signatures,testHyperSig3,seurat,markers,clust)
clustInfo <- c(clustInfo,clustSig)
}
#allSignatures <- c(signatures,signaturesRodriguez)
clust_list <- lapply(levels(unique(seurat@active.ident)),getClustInfo,signatures,seurat,markers)
names(clust_list) <- paste("cluster_",levels(unique(seurat@active.ident)),sep="")
saveRDS(clust_list,paste(opt$outdir,"/clust_list_save.rds",sep =""))
clust_table <- as.data.frame(matrix(unlist(clust_list), nrow=length(unlist(clust_list[1]))))
colnames(clust_table) <- names(clust_list)
rownames(clust_table) <- names(clust_list[[1]])
clust_df <- as.data.frame(t(clust_table))
write.table(x = clust_df,file = paste(opt$outdir,"/clusters_table.tsv",sep =""),sep="\t",quote=F,col.names = NA)
saveRDS(seurat,file = paste(opt$outdir,"/seurat.rds",sep = ""))
##------------------------------------------------------------------------------
## L. Herault
##------------------------------------------------------------------------------
## -----------------------------------------------------------------------------
## Libraries
## -----------------------------------------------------------------------------
#suppressMessages(library(monocle))
suppressMessages(library(Seurat))
suppressMessages(library(BiocParallel))
suppressMessages(library(getopt))
suppressMessages(library(gridExtra))
#suppressMessages(library(gProfileR))
suppressMessages(library(RColorBrewer))
suppressMessages(library(readxl))
suppressMessages(library(stringr))
suppressMessages(library(scales))
suppressMessages(library(cowplot))
suppressMessages(library(sctransform))
suppressMessages(library(stringr))
suppressMessages(library(plyr))
suppressMessages(library(grid))
suppressMessages(library(ggplot2))
source("R_src/getSigPerCells.R")
source("R_src/Enrichment.R")
source("R_src/funForSeurat.R")
# Seurat 3 integration workflow
## -----------------------------------------------------------------------------
## Command line args
## -----------------------------------------------------------------------------
spec = matrix(c(
'help', 'h', 0, "logical", "Help about the program",
'inputFiles', 'i', 1, "character", "REQUIRED: 10X dataset paths prepared as seurat object (.RDS generated by prepare_data.R) separated by +",
'signaturesFile', 's',1, "character", "REQUIRED: signatures rds file",
'outdir', 'o',1, "character", 'Outdir path (default ./)',
"num_dim_CCA", 'q',1,"numeric", "First n dimensions of CCA to use for FindIntegrationAnchors Seurat function (15 by default)",
"num_dim_weight", 'w',1,"numeric", "First n dimensions to use for IntegrateData Seurat function (15 by default)",
"num_dim", 'n',1,"numeric", "First n dimensions of PCA to use for clustering, UMAP and TSNE (15 by default)",
"num_dim_integrated",'N',1,"numeric", "Number of PCA dimension computed to analyse integrated data (40 by default)",
"cores", 'c',1, "numeric", "Number of cores to use for ordering (for differencially expressed gene between clusters test)",
"resolution", 'r',1, "numeric", "resolution for smp.combined clustering",
"correction", "b",1,"character", "Covariable to use as blocking factor (eg one or several columns of pData: betch, cell cycle phases... separated by +)",
"logfc_threshold", "l", 1, "numeric", "logfc threshold for finding cluster markers (1 cluster vs all deg) 0.25 by default",
"rodriguezSig", "k", 1, "character", "path for xls file of Rodriguez result",
"norm_method", "z",1, "character", "normalization method, logNorm (by default) or sctransform",
"reusePca", "p", 0, "character","re use pca calculated before when caculating anchor weights for each dataset default to FALSE (permit to correct for cell cycle before integration)"
), byrow=TRUE, ncol=5);
opt = getopt(spec)
# if help was asked, print a friendly message
# and exit with a non-zero error code
args <- commandArgs()
if ( !is.null(opt$help) | is.null(opt$inputFiles)) {
cat("Perform Seurat 3 integration workflow, then cluster the cell with seurat 3 at different resolutions")
cat(getopt(spec, usage=TRUE))
q(status=1)
}
#set default arguments values
if (is.null(opt$resolution)) {
opt$resolution <- 0.6
}
if (is.null(opt$logfc_threshold)) {
opt$logfc_threshold <- 0.25
}
if (is.null(opt$num_dim_CCA)) {
opt$num_dim_CCA <- 15
}
if (is.null(opt$num_dim_weight)) {
opt$num_dim_weight <- 15
}
if (is.null(opt$num_dim)) {
opt$num_dim <- 15
}
if (is.null(opt$num_dim_integrated)) {
opt$num_dim_integrated <- 40
}
if (is.null(opt$norm_method)) {
opt$norm_method <- "logNorm"
}
if (is.null(opt$reusePca)) {
opt$reusePca <- FALSE
}
print(opt$reusePca)
dir.create(opt$outdir,recursive = T,showWarnings = F)
corrections <- strsplit(x = opt$correction,split = "\\+")[[1]]
dir.create(opt$outdir,recursive = T,showWarnings = F)
smp.listFile <- strsplit(opt$inputFiles,split = "\\+")[[1]]
smp.list <- list()
#load dataset
for (i in 1:length(x = smp.listFile)) {
#For testing, in final workflow sampleName will be incorporated in metadata with the loading of cell ranger matrix
smp.list[[i]] <- readRDS(smp.listFile[i])
sampleName <- unique(smp.list[[i]]@meta.data$sampleName)
smp.list[[i]] <- RenameCells(smp.list[[i]],add.cell.id = sampleName)
if(opt$norm_method != "sctransform") {
smp.list[[i]] <- NormalizeData(object = smp.list[[i]], verbose = FALSE)
smp.list[[i]] <- FindVariableFeatures(object = smp.list[[i]], selection.method = "vst",
nfeatures = 2000, verbose = FALSE)
} else {
smp.list[[i]] <- SCTransform(smp.list[[i]],vars.to.regress = corrections,verbose = T)
}
}
smp.anchors <- FindIntegrationAnchors(object.list = smp.list, dims = 1:opt$num_dim_CCA)
if(opt$reusePca) {
print("Re use pca")
smp.combined <- IntegrateData(anchorset = smp.anchors, weight.reduction = "pca", dims = 1:opt$num_dim_weight)
}
smp.combined <- IntegrateData(anchorset = smp.anchors, dims = 1:opt$num_dim_weight)
# Run the standard workflow for visualization and clustering
#Here certainly need to reuse SCTransform on combined data if norm_method = sct
classicSeuratWorkflow <- function(smp.combined, corrections,outdir) {
dir.create(outdir,recursive = T)
smp.combined <- ScaleData(object = smp.combined, verbose = T,vars.to.regress = corrections)
smp.combined <- RunPCA(object = smp.combined, npcs = opt$num_dim_integrated, verbose = FALSE)
png(paste(outdir,"/ElbowPlot.png",sep =""))
ElbowPlot(object = smp.combined,ndims = opt$num_dim_integrated)
dev.off()
# t-SNE UMAP and Clustering
smp.combined <- RunUMAP(object = smp.combined, reduction = "pca", dims = 1:opt$num_dim)
smp.combined <- RunTSNE(object = smp.combined, reduction = "pca", dims = 1:opt$num_dim)
smp.combined <- FindNeighbors(object = smp.combined, reduction = "pca", dims = 1:opt$num_dim)
smp.combined <- FindClusters(smp.combined, resolution = c(0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1,1.2))
colPrefix <- paste0(DefaultAssay(smp.combined),"_snn_res.")
Idents(smp.combined) <- paste(colPrefix,opt$resolution,sep = "")
smp.combined@meta.data$AGE <- "Old"
smp.combined@meta.data$AGE[which(smp.combined@meta.data$age == "2_months")] <- "Young"
# Visualization of the samples
p1 <- DimPlot(object = smp.combined, reduction = "umap", group.by = "sampleName")
p2 <- DimPlot(object = smp.combined, reduction = "umap", label = TRUE)
p3 <- DimPlot(object = smp.combined, reduction = "umap", group.by = "AGE")
p4 <- DimPlot(object = smp.combined, reduction = "umap", group.by = "runDate")
p3tsne <- DimPlot(object = smp.combined, reduction = "tsne", group.by = "AGE")
p4tsne <- DimPlot(object = smp.combined, reduction = "tsne", group.by = "runDate")
png(paste(outdir,"/umap_samples.png",sep =""),height = 800,width=1200)
grid.arrange(p1, p2,ncol = 2)
dev.off()
png(paste(outdir,"/AGE.png",sep =""),height = 800,width=1200)
grid.arrange(p3, p3tsne,ncol = 2)
dev.off()
png(paste(outdir,"/runDate.png",sep =""),height = 800,width=1200)
grid.arrange(p4, p4tsne,ncol = 2)
dev.off()
png(paste(outdir,"/UmapClusters.png",sep =""),height = 800,width=1200)
DimPlot(object = smp.combined, reduction = "umap", label = TRUE)
dev.off()
p1tsne <- DimPlot(object = smp.combined, reduction = "tsne", group.by = "sampleName")
p2tsne <- DimPlot(object = smp.combined, reduction = "tsne", label = TRUE)
png(paste(outdir,"/tsne_samples.png",sep =""),height = 800,width=1000)
grid.arrange(p1tsne, p2tsne,ncol = 2)
dev.off()
#Visualistaoin one sample at a time
sampleNames <- unique(smp.combined@meta.data$sampleName)
dir.create(paste(outdir,"/samples/umap/",sep = ""),recursive = T)
dir.create(paste(outdir,"/samples/tsne/",sep = ""),recursive = T)
#samplePlotList <- list()
for (s in sampleNames) {
png(paste(outdir,"/samples/umap/",s,".png",sep = ""))
plot(DimPlot(object = smp.combined, cells= grep(s, smp.combined@meta.data$sampleName), reduction = "umap") + ggtitle(s))
dev.off()
png(paste(outdir,"/samples/tsne/",s,".png",sep = ""))
plot(DimPlot(object = smp.combined, cells= grep(s, smp.combined@meta.data$sampleName), reduction = "tsne") + ggtitle(s))
dev.off()
}
umapListRes <- list()
tsneListRes <- list()
for (r in c(0.6,0.8,1,1.2)) {
umapListRes[[as.character(r)]] <- DimPlot(smp.combined,
reduction = "umap",
label = T,
group.by = paste(colPrefix,r,sep="")) +
NoLegend() +
ggtitle(paste("res",r))
tsneListRes[[as.character(r)]] <- DimPlot(smp.combined,
reduction = "tsne",
label = T,
group.by = paste(colPrefix,r,sep="")) +
NoLegend() +
ggtitle(paste("res",r))
}
png(paste(outdir,"/tsne_different_res.png",sep = ""),height = 800,width = 800)
grid.arrange(tsneListRes[[1]],tsneListRes[[2]],tsneListRes[[3]],tsneListRes[[4]])
dev.off()
png(paste(outdir,"/umap_different_res.png",sep = ""),height = 800,width = 800)
grid.arrange(umapListRes[[1]],umapListRes[[2]],umapListRes[[3]],umapListRes[[4]])
dev.off()
smp.combined@meta.data$numclust <- smp.combined@meta.data[,paste(colPrefix,opt$resolution,sep = "")]
#Check for unwanted source of variation UMAP
pPhases <- DimPlot(smp.combined,group.by = "phases")
if (!is.null(smp.combined@meta.data$predicted)) {
pPred <- DimPlot(smp.combined,group.by = "predicted")
} else {
pPred <- DimPlot(smp.combined)
}
pUMI <- FeaturePlot(smp.combined, "nCount_RNA")
pMito <- FeaturePlot(smp.combined, "percentMito")
png(paste(outdir,"/umap_factors.png",sep = ""),height = 800,width = 800)
grid.arrange(pPhases,pPred,pUMI,pMito)
dev.off()
#Check for unwanted source of variation tSNE
pPhases <- DimPlot(smp.combined,group.by = "phases",reduction = "tsne")
if (!is.null(smp.combined@meta.data$predicted)) {
pPred <- DimPlot(smp.combined,group.by = "predicted",reduction = "tsne")
} else {
pPred <- DimPlot(smp.combined,reduction = "tsne")
}
pUMI <- FeaturePlot(smp.combined, "nCount_RNA",reduction = "tsne")
pMito <- FeaturePlot(smp.combined, "percentMito",reduction = "tsne")
png(paste(outdir,"/tsne_factors.png",sep = ""),height = 800,width = 800)
grid.arrange(pPhases,pPred,pUMI,pMito)
dev.off()
return(smp.combined)
}
print("Seurat workflow without integration")
DefaultAssay(object = smp.combined) <- "RNA"
smp.combined <- NormalizeData(object = smp.combined)
smp.combined <- FindVariableFeatures(object = smp.combined,selection.method = "vst", nfeatures = 2000, verbose = T)
if (!is.null(corrections)) {
print("Seurat workflow without correction")
classicSeuratWorkflow(smp.combined,correction = NULL,outdir = paste0(opt$outdir,"/SeuratMergingWoIntegrationWoCr/"))
}
classicSeuratWorkflow(smp.combined,corrections = corrections,outdir = paste0(opt$outdir,"/SeuratMergingWoIntegration/"))
print("Seurat workflow with integration")
DefaultAssay(object = smp.combined) <- "integrated"
if (!is.null(corrections)) {
print("Seurat workflow without correction")
classicSeuratWorkflow(smp.combined,correction = NULL,outdir = paste0(opt$outdir,"/SeuratWithoutCr/"))
}
smp.combined <- classicSeuratWorkflow(smp.combined,corrections = corrections,outdir =opt$outdir)
#AGE prop for each cluster
age_clust <- getAGEPropPerClustBarplot(smp.combined)
#sample prop for each cluster
sample_clust <- getSamplePropPerClustBarplot(smp.combined)
#runDate prop for each cluster
runDate_clust <- getRunDatePropPerClustBarplot(smp.combined)
png(paste(opt$outdir,"/propPerClust.png",sep = ""))
grid.arrange(age_clust,sample_clust,runDate_clust,
DimPlot(object = smp.combined, reduction = "tsne", label = TRUE))
dev.off()
## saving results
saveRDS(smp.combined,paste(opt$outdir,"/combined.rds",sep =""))
#-----------------------------------------------------#
# Signac Motif analysis tutorial
#-----------------------------------------------------#
suppressMessages(library(getopt))
suppressMessages(library(Signac))
suppressMessages(library(Seurat))
suppressMessages(library(JASPAR2020))
suppressMessages(library(TFBSTools))
suppressMessages(library(BSgenome.Mmusculus.UCSC.mm10))
suppressMessages(library(ggplot2))
# library(patchwork)
##-------------------------------------------------------------------------##
## Option list
##-------------------------------------------------------------------------##
spec = matrix(c(
'help', 'h', 0, "logical", "Help about the program",
'inputRDS', 'i', 1, "character", "REQUIRED: 10X data prepared as seurat/signac object.",
'outdir', 'o',1, "character", 'Outdir path (default ./)',
'species', 's', 1, "numeric", "Species ID number for the motif DB, human = 9606, mouse = 10090 (default human)",
'assay', 'a', 1, "character", "Name of the assay that contain the atac data (default 'peaks')",
'cluster', 'c', 1, "character", "Name of the cluster identity for the markers",
'IdentName', 'n', 1, "character", "Idents new name '+' separated",
'IdentLevel', 'l', 1, "character", "Levels of the new Ident '+' separated",
'integrated', 'I', 1, "logical", "Is the RDS an integrated dataset or not"
), byrow=TRUE, ncol=5);
opt = getopt(spec)
# if help was asked, print a friendly message
# and exit with a non-zero error code
args <- commandArgs()
if ( !is.null(opt$help) | is.null(opt$inputRDS)) {
cat("Help message")
cat(getopt(spec, usage=TRUE))
q(status=1)
}
if (is.null(opt$outdir)) {
opt$outdir = "./"
}
if (is.null(opt$species)){
opt$species <- 9606
}
if(is.null(opt$assay)){
opt$assay <- "peaks"
}
# opt$inputRDS <- "/shared/projects/scRNA_HSPC_Aging/GMP_PRARA/output/TransferRnaAtac/coembedAll.rds"
# opt$IdentName <- "Neu1+Neu3+Neu2+NeuRA1+Rep+NeuRA2"
# opt$IdentLevel <- "Rep+Neu1+Neu2+Neu3+NeuRA1+NeuRA2"
# opt$species <- 9606
# opt$cluster <- "numclust"
defAssay <- opt$assay
outdir <- opt$outdir
dir.create(outdir, recursive = T)
#------------------------------------#
# Load the motif dataBase
#------------------------------------#
print("Load JASPAR DB")
# Get a list of motif position frequency matrices from the JASPAR database
# species = 9606 is human species id while the tutorial is using mice dataset
pfm <- getMatrixSet(
x = JASPAR2020,
opts = list(species = opt$species, all_versions = FALSE)
)
#------------------------------------#
# Read seurat and prepare the data
#------------------------------------#
seurat <- readRDS(opt$inputRDS)
DefaultAssay(seurat) <- defAssay
Idents(seurat) <- opt$cluster
print("Entering analysis loop")
print(paste("opt$integrated == ", opt$integrated, sep = ""))
if(opt$integrated == TRUE){
# Prepare data
# Rename the cluster with real name for an easier analysis
new.ident.name <- strsplit(opt$IdentName, split = "\\+")[[1]]
names(new.ident.name) <- levels(seurat)
seurat <- RenameIdents(seurat, new.ident.name)
seurat@meta.data$FinalCluster <- Idents(seurat)
ident_levels <- strsplit(opt$IdentLevel, split = "\\+")[[1]]
seurat@meta.data$FinalCluster <- factor(seurat@meta.data$FinalCluster, levels = ident_levels)
Idents(seurat) <- "FinalCluster"
# Subset only ATAC data
Idents(seurat) <- "condType"
seurat_atac <- subset(seurat, idents = grep(pattern = "ATAC", x = unique(Idents(seurat)), value = T))
seurat_atac[[defAssay]] <- as.ChromatinAssay(seurat_atac[[defAssay]], seqinfo = "mm10")
Idents(seurat_atac) <- "FinalCluster"
# Adding motif information
seurat_atac <- AddMotifs(object = seurat_atac, genome = BSgenome.Mmusculus.UCSC.mm10, pfm = pfm)
motif_correl <- seurat_atac@assays$peaks@motifs@motif.names
motif_correl.df <- data.frame(do.call("rbind", motif_correl))
names(motif_correl.df)[1] <- "Motif_Name"
motif_correl.df$Motif_ID <- rownames(motif_correl.df)
write.table(x = motif_correl.df, file = paste0(outdir, "/motif_name_table.tsv"), sep = "\t", row.names = F, col.names = T, quote = F)
#----------------------------#
# Method 1 : Find DA region
#----------------------------#
# dir.create(paste0(outdir, "/Overrep_Motif"), recursive = T)
# print("Starting method 1")
# ## Treatment impact
# dir.create(paste0(outdir, "/Overrep_Motif/Treatment"), recursive = T)
# # FindMarkers using Logistic regression
# Idents(seurat_atac) <- "condition"
#
# da_peaks_treatment <- FindMarkers(seurat_atac, ident.1 = "Treated", ident.2 = "Control", only.pos = T, test.use = "LR", latent.vars = "nCount_peaks")
# da_peaks_treatment.sig <- rownames(da_peaks_treatment[da_peaks_treatment$p_val < 0.005, ])
#
# # Calculate a motif score for each of the most DA region
# enriched_motifs_treatment <- FindMotifs(object = seurat_atac, features = da_peaks_treatment.sig)
# motif_plot_treatment <- MotifPlot(seurat_atac, head(enriched_motifs_treatment$motif, 6))
#
# ggsave(paste0(outdir, "/Overrep_Motif/Treatment", "/Top_motif_treatment", '.png'), plot = motif_plot_treatment, device = 'png', path = NULL,
# scale = 1, width = 20, height = 15, units = 'cm', dpi = 300)
#
# write.table(da_peaks_treatment, file = paste0(outdir, "/Overrep_Motif/Treatment", "/DA_genomic_regions_treatment", ".tsv"),
# sep = "\t", col.names = T, row.names = T, quote = F)
# write.table(enriched_motifs_treatment, file = paste0(outdir, "/Overrep_Motif/Treatment", "/Motif_enrichment_treatment", ".tsv"),
# sep = "\t", col.names = T, row.names = T, quote = F)
# print("Treatment analysis done")
#
# ## Cluster impact
# dir.create(paste0(outdir, "/Overrep_Motif/Cluster"), recursive = T)
#
# Idents(seurat_atac) <- "FinalCluster"
#
# print(unique(seurat_atac@meta.data$FinalCluster))
# # BiocParallel::register(BiocParallel::SerialParam())
# # da_peaks_cluster <- FindAllMarkers(seurat_atac, only.pos = T, test.use = "LR", latent.vars = "nCount_peaks")
# da_peaks_cluster.list <- list()
# for(cluster in unique(seurat_atac$FinalCluster)){
# print(paste("Find markers cluster", cluster, sep = " "))
# if(nrow(seurat_atac@meta.data[seurat_atac$FinalCluster %in% cluster,]) >3){
# da_peaks_cluster.list[[cluster]] <- FindMarkers(seurat_atac, ident.1 = cluster, only.pos = T, test.use = "LR", latent.vars = "nCount_peaks")
# da_peaks_cluster.list[[cluster]]$Cluster <- cluster
# da_peaks_cluster.list[[cluster]]$peaks <- rownames(da_peaks_cluster.list[[cluster]])
# } else{
# print("Not enough cells")
# }
# }
#
# da_peaks_cluster.db <- do.call("rbind", da_peaks_cluster.list)
# print("FindMarkers cluster done")
#
# enriched_motifs_cluster.list <- list()
# for(cluster in unique(da_peaks_cluster.db$Cluster)){
# print(paste(cluster, "FindMotif", sep = " "))
# da_peaks_cluster.sig <- da_peaks_cluster.db[which(da_peaks_cluster.db$p_val < 0.005 & da_peaks_cluster.db$Cluster == cluster),]
# da_peaks_cluster.sig.name <- da_peaks_cluster.sig$peaks
# enriched_motifs_cluster.list[[paste0(cluster, "_motifs")]] <- FindMotifs(seurat_atac, features = da_peaks_cluster.sig.name)
# enriched_motifs_cluster.list[[paste0(cluster, "_motifs")]]$Cluster <- cluster
#
# # motif_plot <- MotifPlot(atac_small, motifs = head(enriched_motifs_cluster.list[[paste0(cluster, "_motifs")]]$motif, 6))
# # ggsave(paste0(outdir, "/Overrep_Motif/Cluster", "/Top_motif_cluster_", cluster, '.png'), plot = motif_plot, device = 'png', path = NULL,
# # scale = 1, width = 20, height = 15, units = 'cm', dpi = 300)
# }
# enriched_motifs_cluster.df <- do.call("rbind", enriched_motifs_cluster.list)
#
# write.table(da_peaks_cluster.db, file = paste0(outdir, "/Overrep_Motif/Cluster", "/Motif_enrichment_cluster", ".tsv"),
# sep = "\t", col.names = T, row.names = T, quote = F)
# write.table(enriched_motifs_cluster.df, file = paste0(outdir, "/Overrep_Motif/Cluster", "/Motif_enrichment_cluster", ".tsv"),
# sep = "\t", col.names = T, row.names = T, quote = F)
# print("cluster impact done")
#---------------------------------------#
# Method 2 : Computing motif activity
#---------------------------------------#
dir.create(paste0(outdir, "/Motif_activity"), recursive = T)
print("Starting method 2")
# Computing motif per cell motif activity score
# Try this for error with multicore
BiocParallel::register(BiocParallel::SerialParam())
seurat_atac <- RunChromVAR(object = seurat_atac, genome = BSgenome.Mmusculus.UCSC.mm10)
DefaultAssay(seurat_atac) <- "chromvar"
print("End chromvar")
# Find differential activity in motif per treatment
dir.create(paste0(outdir, "/Motif_activity/Treatment"), recursive = T)
print("Starting treatment analysis")
Idents(seurat_atac) <- "condition"
diff_act_treatment <- FindMarkers(object = seurat_atac, ident.1 = "Treated", ident.2 = "Control", test.use = "LR", latent.vars = "nCount_peaks")
diff_act_treatment$Motif_Name <- motif_correl.df[rownames(diff_act_treatment), "Motif_Name"]
top_motif_treatment <- MotifPlot(object = seurat_atac, motifs = head(rownames(diff_act_treatment), 6), assay = "peaks")
write.table(diff_act_treatment, file = paste0(outdir, "/Motif_activity/Treatment", "/Motif_enrich_treatment", ".tsv"),
sep = "\t", row.names = T, col.names = T, quote = F)
ggsave(paste0(outdir, "/Motif_activity/Treatment", "/Top_motif_treatment", '.png'), plot = top_motif_treatment, device = 'png', path = NULL,
scale = 1, width = 20, height = 15, units = 'cm', dpi = 300)
print("End treatment analysis")
# Find differential activity in motif per cluster
dir.create(paste0(outdir, "/Motif_activity/Cluster"), recursive = T)
print("Starting cluster analysis")
Idents(seurat_atac) <- "FinalCluster"
diff_act_cluster <- FindAllMarkers(object = seurat_atac, only.pos = T, test.use = "LR", latent.vars = "nCount_peaks")
for(cluster in unique(diff_act_cluster$cluster)){
top_motif <- diff_act_cluster[diff_act_cluster$cluster %in% cluster,]
top_motif.name <- head(top_motif$gene, 6)
top_motif.plot <- MotifPlot(object = seurat_atac, motifs = top_motif.name, assay = "peaks")
ggsave(paste0(outdir, "/Motif_activity/Cluster", "/Top_motif_cluster_", cluster, '.png'), plot = top_motif.plot, device = 'png', path = NULL,
scale = 1, width = 20, height = 15, units = 'cm', dpi = 300)
}
# rownames(diff_act_cluster) <- diff_act_cluster$gene
print("Adding Motif_name to table output")
diff_act_cluster.list <- list()
for(cluster in unique(diff_act_cluster$cluster)){
diff_act_cluster.temp <- diff_act_cluster[diff_act_cluster$cluster %in% cluster,]
rownames(diff_act_cluster.temp) <- diff_act_cluster.temp$gene
diff_act_cluster.temp$Motif_Name <- motif_correl.df[rownames(diff_act_cluster.temp), "Motif_Name"]
diff_act_cluster.list[[cluster]] <- diff_act_cluster.temp
}
diff_act_cluster.final <- do.call("rbind", diff_act_cluster.list)
write.table(diff_act_cluster.final, file = paste0(outdir, "/Motif_activity/Cluster", "/Motif_enrich_cluster", ".tsv"),
sep = "\t", row.names = T, col.names = T, quote = F)
print("End cluster analysis")
# Analysing motif markers of difference between pop IN a cluster (ex : Clus1_Treated vs Clust1_Control)
print("Starting the treatment per cluster analysis")
dir.create(paste0(outdir, "/Motif_activity/Treatment_per_clust"), recursive = T)
seurat_atac$cluster_cond <- paste(seurat_atac$condition, seurat_atac$FinalCluster, sep = "_")
Idents(seurat_atac) <- "cluster_cond"
diff_act_cluster_cond.list <- list()
for(cluster in unique(seurat_atac$FinalCluster)){
print(paste("Treated_vs_Control", cluster, sep = "_"))
if(nrow(seurat_atac@meta.data[seurat_atac$cluster_cond %in% paste("Control", cluster, sep = "_"),]) >3 &&
nrow(seurat_atac@meta.data[seurat_atac$cluster_cond %in% paste("Treated", cluster, sep = "_"),])){
print("Enough cells for analysis")
diff_act_cluster_cond.list[[paste("Treated_vs_Control", cluster, sep = "_")]] <- FindMarkers(object = seurat_atac, ident.1 = paste("Treated", cluster, sep = "_"),
ident.2 = paste("Control", cluster, sep = "_"),
test.use = 'LR', latent.vars = 'nCount_peaks')
diff_act_cluster_cond.list[[paste("Treated_vs_Control", cluster, sep = "_")]]$Comparison <- "Treated_vs_Control"
diff_act_cluster_cond.list[[paste("Treated_vs_Control", cluster, sep = "_")]]$Cluster <- cluster
diff_act_cluster_cond.list[[paste("Treated_vs_Control", cluster, sep = "_")]]$Motif_ID <- rownames(diff_act_cluster_cond.list[[paste("Treated_vs_Control", cluster, sep = "_")]])
diff_act_cluster_cond.list[[paste("Treated_vs_Control", cluster, sep = "_")]]$Motif_Name <- motif_correl.df[rownames(diff_act_cluster_cond.list[[paste("Treated_vs_Control", cluster, sep = "_")]]), "Motif_Name"]
}else{
print("Not enough cells")
}
}
diff_act_cluster_cond.df <- do.call("rbind", diff_act_cluster_cond.list)
print("Plotting top motif for treated and control per cluster")
for(cluster in unique(diff_act_cluster_cond.df$Cluster)){
top_motif <- diff_act_cluster_cond.df[diff_act_cluster_cond.df$Cluster %in% cluster,]
top_motif.name <- head(top_motif$Motif_ID, 6)
bot_motif.name <- tail(top_motif$Motif_ID, 6)
top_motif.plot <- MotifPlot(object = seurat_atac, motifs = top_motif.name, assay = "peaks")
bot_motif.plot <- MotifPlot(object = seurat_atac, motifs = bot_motif.name, assay = "peaks")
ggsave(paste0(outdir, "/Motif_activity/Treatment_per_clust", "/Top_motif_treated_cluster_", cluster, '.png'), plot = top_motif.plot, device = 'png', path = NULL,
scale = 1, width = 20, height = 15, units = 'cm', dpi = 300)
ggsave(paste0(outdir, "/Motif_activity/Treatment_per_clust", "/Top_motif_control_cluster_", cluster, '.png'), plot = bot_motif.plot, device = 'png', path = NULL,
scale = 1, width = 20, height = 15, units = 'cm', dpi = 300)
}
write.table(diff_act_cluster_cond.df, file = paste0(outdir, "/Motif_activity/Treatment_per_clust", "/Motif_enrich_cluster", ".tsv"),
sep = "\t", row.names = T, col.names = T, quote = F)
print("End of the treatment per cluster analysis")
saveRDS(seurat_atac, file = paste0(outdir, "/atac_motif", '.rds'))
} else if(opt$integrated == FALSE){
print("Only computing motif activity score")
# Adding motif information
seurat <- AddMotifs(object = seurat, genome = BSgenome.Mmusculus.UCSC.mm10, pfm = pfm)
motif_correl <- seurat@assays$peaks@motifs@motif.names
motif_correl.df <- data.frame(do.call("rbind", motif_correl))
names(motif_correl.df)[1] <- "Motif_Name"
motif_correl.df$Motif_ID <- rownames(motif_correl.df)
write.table(x = motif_correl.df, file = paste0(outdir, "/motif_name_table.tsv"), sep = "\t", row.names = F, col.names = T, quote = F)
# Calculate motif score
BiocParallel::register(BiocParallel::SerialParam())
seurat <- RunChromVAR(object = seurat, genome = BSgenome.Mmusculus.UCSC.mm10)
saveRDS(seurat, file = paste0(outdir, "/atac_motif", ".rds"))
}
#-----------------------------------------------------#
# Signac Motif analysis tutorial
#-----------------------------------------------------#
suppressMessages(library(getopt))
suppressMessages(library(Signac))
suppressMessages(library(Seurat))
suppressMessages(library(JASPAR2020))
suppressMessages(library(TFBSTools))
suppressMessages(library(BSgenome.Mmusculus.UCSC.mm10))
suppressMessages(library(ggplot2))
# library(patchwork)
##-------------------------------------------------------------------------##
## Option list
##-------------------------------------------------------------------------##
spec = matrix(c(
'help', 'h', 0, "logical", "Help about the program",
'inputRDS', 'i', 1, "character", "REQUIRED: 10X data prepared as seurat/signac object.",
'outdir', 'o',1, "character", 'Outdir path (default ./)',
'species', 's', 1, "numeric", "Species ID number for the motif DB, human = 9606, mouse = 10090 (default human)",
'assay', 'a', 1, "character", "Name of the assay that contain the atac data (default 'peaks')",
'cluster', 'c', 1, "character", "Name of the cluster identity for the markers",
'IdentName', 'n', 1, "character", "Idents new name '+' separated",
'IdentLevel', 'l', 1, "character", "Levels of the new Ident '+' separated",
'integrated', 'I', 1, "logical", "Is the RDS an integrated dataset or not"
), byrow=TRUE, ncol=5);
opt = getopt(spec)
# if help was asked, print a friendly message
# and exit with a non-zero error code
args <- commandArgs()
if ( !is.null(opt$help) | is.null(opt$inputRDS)) {
cat("Help message")
cat(getopt(spec, usage=TRUE))
q(status=1)
}
if (is.null(opt$outdir)) {
opt$outdir = "./"
}
if (is.null(opt$species)){
opt$species <- 9606
}
if(is.null(opt$assay)){
opt$assay <- "peaks"
}
# opt$inputRDS <- "/shared/projects/scRNA_HSPC_Aging/GMP_PRARA/output/TransferRnaAtac/coembedAll.rds"
# opt$IdentName <- "Neu2+Rep+Neu3+NeuRA2+NeuRA1+Neu1"
# opt$IdentLevel <- "Rep+Neu1+Neu2+Neu3+NeuRA1+NeuRA2"
# opt$species <- 9606
# opt$cluster <- "predicted.id"
defAssay <- opt$assay
outdir <- opt$outdir
dir.create(outdir, recursive = T)
#------------------------------------#
# Load the motif dataBase
#------------------------------------#
print("Load JASPAR DB")
# Get a list of motif position frequency matrices from the JASPAR database
# species = 9606 is human species id while the tutorial is using mice dataset
pfm <- getMatrixSet(
x = JASPAR2020,
opts = list(species = opt$species, all_versions = FALSE)
)
#------------------------------------#
# Read seurat and prepare the data
#------------------------------------#
seurat <- readRDS(opt$inputRDS)
DefaultAssay(seurat) <- defAssay
Idents(seurat) <- opt$cluster
print("Entering analysis loop")
print(paste("opt$integrated == ", opt$integrated, sep = ""))
if(opt$integrated == TRUE){
# Prepare data
# Rename the cluster with real name for an easier analysis
new.ident.name <- strsplit(opt$IdentName, split = "\\+")[[1]]
names(new.ident.name) <- levels(seurat)
seurat <- RenameIdents(seurat, new.ident.name)
seurat@meta.data$FinalCluster <- Idents(seurat)
ident_levels <- strsplit(opt$IdentLevel, split = "\\+")[[1]]
seurat@meta.data$FinalCluster <- factor(seurat@meta.data$FinalCluster, levels = ident_levels)
Idents(seurat) <- "FinalCluster"
seurat_atac <- seurat
# Adding motif information
seurat_atac <- AddMotifs(object = seurat_atac, genome = BSgenome.Mmusculus.UCSC.mm10, pfm = pfm)
motif_correl <- seurat_atac@assays$peaks@motifs@motif.names
motif_correl.df <- data.frame(do.call("rbind", motif_correl))
names(motif_correl.df)[1] <- "Motif_Name"
motif_correl.df$Motif_ID <- rownames(motif_correl.df)
write.table(x = motif_correl.df, file = paste0(outdir, "/motif_name_table.tsv"), sep = "\t", row.names = F, col.names = T, quote = F)
#---------------------------------------#
# Method 2 : Computing motif activity
#---------------------------------------#
dir.create(paste0(outdir, "/Motif_activity"), recursive = T)
print("Starting method 2")
# Computing motif per cell motif activity score
# Try this for error with multicore
BiocParallel::register(BiocParallel::SerialParam())
seurat_atac <- RunChromVAR(object = seurat_atac, genome = BSgenome.Mmusculus.UCSC.mm10)
DefaultAssay(seurat_atac) <- "chromvar"
print("End chromvar")
# Find differential activity in motif per treatment
dir.create(paste0(outdir, "/Motif_activity/Treatment"), recursive = T)
print("Starting treatment analysis")
Idents(seurat_atac) <- "condition"
diff_act_treatment <- FindMarkers(object = seurat_atac, ident.1 = "Treated", ident.2 = "Control", test.use = "LR", latent.vars = "nCount_peaks")
diff_act_treatment$Motif_Name <- motif_correl.df[rownames(diff_act_treatment), "Motif_Name"]
top_motif_treatment <- MotifPlot(object = seurat_atac, motifs = head(rownames(diff_act_treatment), 6), assay = "peaks")
write.table(diff_act_treatment, file = paste0(outdir, "/Motif_activity/Treatment", "/Motif_enrich_treatment", ".tsv"),
sep = "\t", row.names = T, col.names = T, quote = F)
ggsave(paste0(outdir, "/Motif_activity/Treatment", "/Top_motif_treatment", '.png'), plot = top_motif_treatment, device = 'png', path = NULL,
scale = 1, width = 20, height = 15, units = 'cm', dpi = 300)
print("End treatment analysis")
# Find differential activity in motif per cluster
dir.create(paste0(outdir, "/Motif_activity/Cluster"), recursive = T)
print("Starting cluster analysis")
Idents(seurat_atac) <- "FinalCluster"
diff_act_cluster <- FindAllMarkers(object = seurat_atac, only.pos = T, test.use = "LR", latent.vars = "nCount_peaks")
for(cluster in unique(diff_act_cluster$cluster)){
top_motif <- diff_act_cluster[diff_act_cluster$cluster %in% cluster,]
top_motif.name <- head(top_motif$gene, 6)
top_motif.plot <- MotifPlot(object = seurat_atac, motifs = top_motif.name, assay = "peaks")
ggsave(paste0(outdir, "/Motif_activity/Cluster", "/Top_motif_cluster_", cluster, '.png'), plot = top_motif.plot, device = 'png', path = NULL,
scale = 1, width = 20, height = 15, units = 'cm', dpi = 300)
}
# rownames(diff_act_cluster) <- diff_act_cluster$gene
print("Adding Motif_name to table output")
diff_act_cluster.list <- list()
for(cluster in unique(diff_act_cluster$cluster)){
diff_act_cluster.temp <- diff_act_cluster[diff_act_cluster$cluster %in% cluster,]
rownames(diff_act_cluster.temp) <- diff_act_cluster.temp$gene
diff_act_cluster.temp$Motif_Name <- motif_correl.df[rownames(diff_act_cluster.temp), "Motif_Name"]
diff_act_cluster.list[[cluster]] <- diff_act_cluster.temp
}
diff_act_cluster.final <- do.call("rbind", diff_act_cluster.list)
write.table(diff_act_cluster.final, file = paste0(outdir, "/Motif_activity/Cluster", "/Motif_enrich_cluster", ".tsv"),
sep = "\t", row.names = T, col.names = T, quote = F)
print("End cluster analysis")
# Analysing motif markers of difference between pop IN a cluster (ex : Clus1_Treated vs Clust1_Control)
print("Starting the treatment per cluster analysis")
dir.create(paste0(outdir, "/Motif_activity/Treatment_per_clust"), recursive = T)
seurat_atac$cluster_cond <- paste(seurat_atac$condition, seurat_atac$FinalCluster, sep = "_")
Idents(seurat_atac) <- "cluster_cond"
diff_act_cluster_cond.list <- list()
for(cluster in unique(seurat_atac$FinalCluster)){
print(paste("Treated_vs_Control", cluster, sep = "_"))
if(nrow(seurat_atac@meta.data[seurat_atac$cluster_cond %in% paste("Control", cluster, sep = "_"),]) >3 &&
nrow(seurat_atac@meta.data[seurat_atac$cluster_cond %in% paste("Treated", cluster, sep = "_"),])){
print("Enough cells for analysis")
diff_act_cluster_cond.list[[paste("Treated_vs_Control", cluster, sep = "_")]] <- FindMarkers(object = seurat_atac, ident.1 = paste("Treated", cluster, sep = "_"),
ident.2 = paste("Control", cluster, sep = "_"),
test.use = 'LR', latent.vars = 'nCount_peaks')
diff_act_cluster_cond.list[[paste("Treated_vs_Control", cluster, sep = "_")]]$Comparison <- "Treated_vs_Control"
diff_act_cluster_cond.list[[paste("Treated_vs_Control", cluster, sep = "_")]]$Cluster <- cluster
diff_act_cluster_cond.list[[paste("Treated_vs_Control", cluster, sep = "_")]]$Motif_ID <- rownames(diff_act_cluster_cond.list[[paste("Treated_vs_Control", cluster, sep = "_")]])
diff_act_cluster_cond.list[[paste("Treated_vs_Control", cluster, sep = "_")]]$Motif_Name <- motif_correl.df[rownames(diff_act_cluster_cond.list[[paste("Treated_vs_Control", cluster, sep = "_")]]), "Motif_Name"]
}else{
print("Not enough cells")
}
}
diff_act_cluster_cond.df <- do.call("rbind", diff_act_cluster_cond.list)
print("Plotting top motif for treated and control per cluster")
for(cluster in unique(diff_act_cluster_cond.df$Cluster)){
top_motif <- diff_act_cluster_cond.df[diff_act_cluster_cond.df$Cluster %in% cluster,]
top_motif.name <- head(top_motif$Motif_ID, 6)
bot_motif.name <- tail(top_motif$Motif_ID, 6)
top_motif.plot <- MotifPlot(object = seurat_atac, motifs = top_motif.name, assay = "peaks")
bot_motif.plot <- MotifPlot(object = seurat_atac, motifs = bot_motif.name, assay = "peaks")
ggsave(paste0(outdir, "/Motif_activity/Treatment_per_clust", "/Top_motif_treated_cluster_", cluster, '.png'), plot = top_motif.plot, device = 'png', path = NULL,
scale = 1, width = 20, height = 15, units = 'cm', dpi = 300)
ggsave(paste0(outdir, "/Motif_activity/Treatment_per_clust", "/Top_motif_control_cluster_", cluster, '.png'), plot = bot_motif.plot, device = 'png', path = NULL,
scale = 1, width = 20, height = 15, units = 'cm', dpi = 300)
}
write.table(diff_act_cluster_cond.df, file = paste0(outdir, "/Motif_activity/Treatment_per_clust", "/Motif_enrich_cluster", ".tsv"),
sep = "\t", row.names = T, col.names = T, quote = F)
print("End of the treatment per cluster analysis")
saveRDS(seurat_atac, file = paste0(outdir, "/atac_motif", '.rds'))
} else if(opt$integrated == FALSE){
print("Only computing motif activity score")
# Adding motif information
seurat <- AddMotifs(object = seurat, genome = BSgenome.Mmusculus.UCSC.mm10, pfm = pfm)
motif_correl <- seurat@assays$peaks@motifs@motif.names
motif_correl.df <- data.frame(do.call("rbind", motif_correl))
names(motif_correl.df)[1] <- "Motif_Name"
motif_correl.df$Motif_ID <- rownames(motif_correl.df)
write.table(x = motif_correl.df, file = paste0(outdir, "/motif_name_table.tsv"), sep = "\t", row.names = F, col.names = T, quote = F)
# Calculate motif score
BiocParallel::register(BiocParallel::SerialParam())
seurat <- RunChromVAR(object = seurat, genome = BSgenome.Mmusculus.UCSC.mm10)
saveRDS(seurat, file = paste0(outdir, "/atac_motif", ".rds"))
}
##------------------------------------------------------------------------------
## L. Herault
##------------------------------------------------------------------------------
## -----------------------------------------------------------------------------
## Libraries
## -----------------------------------------------------------------------------
suppressMessages(library(Signac))
suppressMessages(library(Seurat))
suppressMessages(library(GenomeInfoDb))
suppressMessages(library(ggplot2))
suppressMessages(library(patchwork))
set.seed(1234)
# Analysis of seurat 3 integration and clusternig workflow
## -----------------------------------------------------------------------------
## Command line args
## -----------------------------------------------------------------------------
spec = matrix(c(
'help', 'h', 0, "logical", "Help about the program",
'filename', 'i', 1, 'character', 'input filtered_peak_bc_matrix.h5',
'metafile', 'm', 1, 'character', 'input metadata file eg. singlecell.csv',
'fragments', 'f', 1, 'character', 'fragments.tsv.gz file',
'genome', 'g', 1, 'character', 'genome, corresponding R libraries have to be installed (eg EnsDb.Mmusculus.v79) for mm10',
'minCell', 'c', 1, 'numeric', 'min cell number by feature (default 10)',
'minFeature', 'e', 1, 'numeric', "min feature number per cells (default 200)",
'outdir', 'o',1, "character", 'Outdir path (default ./)',
"tfTested", 't',1, "character", "TF tested list (eg TF with a motif in the scenic database",
"regulonJson", 'r',1, "character", "Regulon main Json file",
"regulonJsonSupp", "s", 1, "character", "Regulon supp Json files (separated by +)",
"subConditionName", "n", 1, "character", "correspondong names (separated by +)"
), byrow=TRUE, ncol=5);
## default settings
if (is.null(opt$outdir)) {
opt$outdir = "./"
}
if (is.null(opt$minCell)) {
opt$minCell = 10
}
if (is.null(opt$minFeature)) {
opt$minFeature = 200
}
# Printing option before running
for (o in names(opt)) {
print(paste(o,":", opt[[o]]))
}
## Load data
counts <- Read10X_h5(filename = opt$filename)
metadata <- read.csv(
file = opt$metafile,
header = TRUE,
row.names = 1
)
chrom_assay <- CreateChromatinAssay(
counts = counts,
sep = c(":", "-"),
genome = opt$genome,
fragments = opt$fragments,
min.cells = opt$minCell,
min.features = opt$minFeature
)
signac <- CreateSeuratObject(
counts = chrom_assay,
assay = "peaks",
meta.data = metadata
)
if(opt$genome == "mm10") {
suppressMessages(library(EnsDb.Mmusculus.v79))
ensdb = EnsDb.Mmusculus.v79
}
# extract gene annotations from EnsDb
annotations <- GetGRangesFromEnsDb(ensdb = ensdb)
# change to UCSC style since the data was mapped to hg19
seqlevelsStyle(annotations) <- 'UCSC'
genome(annotations) <- opt$genome
# add the gene information to the object
Annotation(signac) <- annotations
signac <- NucleosomeSignal(object = signac)
# Quality control and cells filtering
# compute TSS enrichment score per cell
signac <- TSSEnrichment(object = signac, fast = FALSE)
# add blacklist ratio and fraction of reads in peaks
signac$pct_reads_in_peaks <- signac$peak_region_fragments / signac$passed_filters * 100
signac$blacklist_ratio <- signac$blacklist_region_fragments / signac$peak_region_fragments
# Plot TSS enrichment, separating cells in two groups
signac$high.tss <- ifelse(signac$TSS.enrichment > 2, 'High', 'Low')
png(pasteO(opt$oudir,'/TSS_enrichment.png'),width = 800,height = 800)
TSSPlot(signac, group.by = 'high.tss') + NoLegend()
dev.off()
# Plot FragmentHistogram on chr1, separating cells in two groups depending on nucleosome signal
signac$nucleosome_group <- ifelse(signac$nucleosome_signal > 2, 'NS > 2', 'NS < 2')
png(pasteO(opt$oudir,'/fragmentHistogram.png'),width = 800,height = 800)
FragmentHistogram(object = signac,region = "chr1-1-100000000",group.by = 'nucleosome_group')
dev.off()
# VlnPlot of control metrics
png(pasteO(opt$oudir,'/vlncontrolMetrics.png'),width = 1400,height = 500)
VlnPlot(
object = signac,
features = c('pct_reads_in_peaks', 'peak_region_fragments',
'TSS.enrichment', 'blacklist_ratio', 'nucleosome_signal'),
pt.size = 0.001,
ncol = 5
)
dev.off()
# Filter outliers
signac <- subset(
x = signac,
subset = peak_region_fragments > 3000 &
peak_region_fragments < 20000 &
pct_reads_in_peaks > 15 &
blacklist_ratio < 0.05 &
nucleosome_signal < 4 &
TSS.enrichment > 2
)
signac
## -----------------------------------------------------------------------------
## Libraries
## -----------------------------------------------------------------------------
print("Loading library")
library(Seurat)
library(getopt)
library(scales)
library(RColorBrewer)
library(plyr)
library(ggplot2)
library(gridExtra)
library(cowplot)
library(reshape2)
library(cowplot)
library(gridExtra)
source("R_src/do_consensus.R")
source("R_src/funForAnalysisSeurat.R")
theme_set(theme_classic())
spec = matrix(c(
'help', 'h', 0, "logical", "Help about the program",
'inputRDS', 'i', 1, "character", "REQUIRED: 10X data ordered by monocle (.RDS generated by orderCL.R).",
'actinnPathGMP', 'g', 1, "character", "Path to actinn multiple run directory",
'actinnPathProg', 'p', 1, "character", "Path to actinn multiple run directory",
'signatures', 's', 1, "character", "Signature files from nestorowa analysis",
'outdir', 'o',1, "character", 'Outdir path (default ./)'
), byrow=TRUE, ncol=5);
opt = getopt(spec)
args <- commandArgs()
if ( !is.null(opt$help) | is.null(opt$inputRDS)) {
cat("Gene filtering of a gbm cds ordered. For scenic use, two filters")
cat(getopt(spec, usage=TRUE))
q(status=1)
}
if (is.null(opt$outdir)) {
opt$outdir = "./"
}
outdir <- opt$outdir
# Data for test
# opt$inputRDS <- "/shared/projects/scRNA_HSPC_Aging/GMP_PRARA/output/RNA/streamAnalysisSeurat/seuratPseudotime.rds"
# opt$actinnPathGMP <- "/shared/projects/scRNA_HSPC_Aging/GMP_PRARA/output/RNA/multiple_Actinn_Nestorowa_gmp/"
# opt$actinnPathProg <- "/shared/projects/scRNA_HSPC_Aging/GMP_PRARA/output/RNA/multiple_Actinn_Nestorowa_prog/"
# opt$signatures <- "/shared/projects/scRNA_HSPC_Aging/GMP_PRARA/output/RNA/Nestorowa/progenitors_signature.csv"
dir.create(outdir, recursive = T)
print("Loading files")
# Read Seurat file
seurat <- readRDS(opt$inputRDS)
# Read and do actinn consensus
actinn_consensus_gmp <- do_consensus(opt$actinnPathGMP)
actinn_consensus_prog <- do_consensus(opt$actinnPathProg)
# Read signatures list
signatures <- read.table(opt$signatures, sep = ",", header = T)
## Color palette
colorTreatment <- c("#664CFF", "#FF8000")
colorCluster <- c("#E69F00", "#CC79A7", "#0072B2", "#009E73", "#D55E00", "#56B4E9")
# Add actinn result as meta data
seurat$cellType_Actinn_GMP <- actinn_consensus_gmp[rownames(seurat@meta.data),"Consensus"]
seurat$cellType_Actinn_GMP <- factor(seurat$cellType_Actinn_GMP, levels = c("GMP_broad", "CMP_broad", "MEP_broad"))
seurat$cellType_Actinn_Prog <- actinn_consensus_prog[rownames(seurat@meta.data),"Consensus"]
seurat$cellType_Actinn_Prog <- factor(seurat$cellType_Actinn_Prog, levels = c("Prog", "HSPC"))
# Calculate signature score
for(sig in 1:ncol(signatures)){
print(names(signatures[sig]))
seurat <- AddModuleScore(seurat, features = list(signatures[,sig]), name = names(signatures[sig]))
colnames(seurat@meta.data)[which(colnames(seurat@meta.data) == paste0(names(signatures[sig]), 1))] <- names(signatures[sig])
}
seurat[["Signature_nesto"]] <- CreateAssayObject(data = t(as.matrix(seurat@meta.data[,names(signatures)])))
## Plotting results of Actinn transfer
# Cell size UMAP 0.4
## Prog transfer and signature ##
## We only look at Prog signature, the transfer result is not convincing
# UMAP
# UMAP_actinn_prog <- DimPlot(seurat, group.by = "cellType_Actinn_prog") + ggtitle("")
# Violin plot
Prog_all_melt <- melt(seurat@meta.data[,c("Prog_signature_up", "Prog_signature_down")])
Prog_Ctrl_melt <- melt(seurat@meta.data[seurat$condition %in% "Control", c("Prog_signature_up", "Prog_signature_down")])
Prog_RA_melt <- melt(seurat@meta.data[seurat$condition %in% "Treated", c("Prog_signature_up", "Prog_signature_down")])
violinplot_prog <- ggplot(Prog_all_melt, aes(x = variable, y = value)) +
geom_violin(fill = "grey") + theme(legend.position = "none", axis.title.x = element_blank()) +
ylab("Signature score") + scale_x_discrete(labels = c("Prog_signature_up" = "Prog", "Prog_signature_down" = "HSPC"))
violinplot_Ctrl_prog <- ggplot(Prog_Ctrl_melt, aes(x = variable, y = value)) +
geom_violin(fill = "grey") + theme(legend.position = "none", axis.title.x = element_blank()) +
ylab("Signature score") + scale_x_discrete(labels = c("Prog_signature_up" = "Prog", "Prog_signature_down" = "HSPC"))
violinplot_RA_prog <- ggplot(Prog_RA_melt, aes(x = variable, y = value)) +
geom_violin(fill = "grey") + theme(legend.position = "none", axis.title.x = element_blank()) +
ylab("Signature score") + scale_x_discrete(labels = c("Prog_signature_up" = "Prog", "Prog_signature_down" = "HSPC"))
ggsave(paste0(outdir, 'Vln_Prog_HSPC', '.png'), plot = violinplot_prog, device = 'png', path = NULL,
scale = 1, width = 15, height = 10, units = 'cm', dpi = 300)
ggsave(paste0(outdir, 'Vln_Ctrl_Prog_HSPC', '.png'), plot = violinplot_Ctrl_prog, device = 'png', path = NULL,
scale = 1, width = 15, height = 10, units = 'cm', dpi = 300)
ggsave(paste0(outdir, 'Vln_RA_Prog_HSPC', '.png'), plot = violinplot_RA_prog, device = 'png', path = NULL,
scale = 1, width = 15, height = 10, units = 'cm', dpi = 300)
## GMP transfer and signature visualisation ##
# Violin Plot
GMP_all_melt <- melt(seurat@meta.data[,c("GMP_signature_up", "CMP_signature_up", "MEP_signature_up")])
GMP_Ctrl_melt <- melt(seurat@meta.data[seurat$condition %in% "Control", c("GMP_signature_up", "CMP_signature_up", "MEP_signature_up")])
GMP_RA_melt <- melt(seurat@meta.data[seurat$condition %in% "Treated", c("GMP_signature_up", "CMP_signature_up", "MEP_signature_up")])
violinplot_gmp <- ggplot(GMP_all_melt, aes(x = variable, y = value)) +
geom_violin(fill = "grey") + theme(legend.position = "none", axis.title.x = element_blank()) +
ylab("Signature score") + scale_x_discrete(labels = c("GMP_signature_up" = "GMP", "CMP_signature_up" = "CMP", "MEP_signature_up" = "MEP"))
violinplot_Ctrl_gmp <- ggplot(GMP_Ctrl_melt, aes(x = variable, y = value)) +
geom_violin(fill = "grey") + theme(legend.position = "none", axis.title.x = element_blank()) +
ylab("Signature score") + scale_x_discrete(labels = c("GMP_signature_up" = "GMP", "CMP_signature_up" = "CMP", "MEP_signature_up" = "MEP"))
violinplot_RA_gmp <- ggplot(GMP_RA_melt, aes(x = variable, y = value)) +
geom_violin(fill = "grey") + theme(legend.position = "none", axis.title.x = element_blank()) +
ylab("Signature score") + scale_x_discrete(labels = c("GMP_signature_up" = "GMP", "CMP_signature_up" = "CMP", "MEP_signature_up" = "MEP"))
ggsave(paste0(outdir, 'Vln_GMP_CMP_MEP', '.png'), plot = violinplot_gmp, device = 'png', path = NULL,
scale = 1, width = 15, height = 10, units = 'cm', dpi = 300)
ggsave(paste0(outdir, 'Vln_Ctrl_GMP_CMP_MEP', '.png'), plot = violinplot_Ctrl_gmp, device = 'png', path = NULL,
scale = 1, width = 15, height = 10, units = 'cm', dpi = 300)
ggsave(paste0(outdir, 'Vln_RA_GMP_CMP_MEP', '.png'), plot = violinplot_RA_gmp, device = 'png', path = NULL,
scale = 1, width = 15, height = 10, units = 'cm', dpi = 300)
# UMAP
# Can't plot after subseting on MEP_broad because there is only one cell
Idents(seurat) <- "cellType_Actinn_GMP"
seurat_gmp <- subset(seurat, idents = "GMP_broad")
seurat_cmp <- subset(seurat, idents = "CMP_broad")
# seurat_mep <- subset(seurat, idents = "MEP_broad")
Idents(seurat) <- "FinalCluster"
UMAP_actinn_GMP <- DimPlot(seurat_gmp, group.by = "cellType_Actinn_GMP", pt.size = 0.4) +
theme(axis.title = element_blank(), axis.text = element_blank(), legend.position = "none") +
scale_color_manual(values = "grey")
UMAP_actinn_CMP <- DimPlot(seurat_cmp, group.by = "cellType_Actinn_GMP", pt.size = 0.4) +
theme(axis.title = element_blank(), axis.text = element_blank(), legend.position = "none") +
scale_color_manual(values = "grey")
# UMAP_actinn_MEP <- DimPlot(seurat_mep, group.by = "cellType_Actinn_GMP")
UMAP_actinn_gmp_split <- DimPlot(seurat, group.by = "cellType_Actinn_GMP", split.by = "cellType_Actinn_GMP", pt.size = 0.4) +
theme(axis.title = element_blank(), axis.text = element_blank(), legend.position = "none") + ggtitle("") +
scale_color_manual(values = c("grey", "grey", "grey"))
ggsave(paste0(outdir, 'UMAP_GMP_split', '.png'), plot = UMAP_actinn_gmp_split, device = 'png', path = NULL,
scale = 1, width = 25, height = 15, units = 'cm', dpi = 300)
ggsave(paste0(outdir, 'UMAP_GMP', '.png'), plot = UMAP_actinn_GMP, device = 'png', path = NULL,
scale = 1, width = 15, height = 15, units = 'cm', dpi = 300)
ggsave(paste0(outdir, 'UMAP_CMP', '.png'), plot = UMAP_actinn_CMP, device = 'png', path = NULL,
scale = 1, width = 15, height = 15, units = 'cm', dpi = 300)
saveRDS(seurat, paste0(outdir, "seurat_actinn", ".rds"))
\ No newline at end of file
## -----------------------------------------------------------------------------
## Libraries
## -----------------------------------------------------------------------------
library(Seurat)
library(monocle)
library(plyr)
library(ggplot2)
library(getopt)
## -----------------------------------------------------------------------------
## Command line args
## -----------------------------------------------------------------------------
spec = matrix(c(
'help', 'h', 0, "logical", "Help about the program",
"inputSeurat", "i",1, "character", "input seurat object with cluster column names numclust",
'outdir', 'o',1, "character", 'Outdir path (default ./)',
"surfaceMarkers", "s", 1, "character", "List of surface marker split by \\+",
"namePop", "n", 1, "character", "Name of the double pos population"
), byrow=TRUE, ncol=5);
opt = getopt(spec)
# if help was asked, print a friendly message
# and exit with a non-zero error code
args <- commandArgs()
if ( !is.null(opt$help) | is.null(opt$inputSeurat)) {
cat("Create influence graph from regulon table")
cat(getopt(spec, usage=TRUE))
q(status=1)
}
if(is.null(opt$outdir)){
opt$outdir <- "./"
}
theme_set(theme_classic())
# Testing option
opt$outdir <- "/shared/projects/scRNA_HSPC_Aging/GMP_PRARA/output/RNA/Seurat4_integration/Analysis/"
opt$inputSeurat <- readRDS("/shared/projects/scRNA_HSPC_Aging/GMP_PRARA/output/RNA/regulonAnalysis/seuratAUC_2.rds")
opt$surfaceMarkers <- "Ly6g+Itgam"
opt$namePop <- "Diff"
outdir <- opt$outdir
# Read surface markers
surfaceMarkers <- strsplit(opt$surfaceMarkers, split = "\\+")[[1]]
# Open seurat then save it as CDS object
seurat <- readRDS(opt$inputSeurat)
seurat_cds <- as.CellDataSet(seurat, assay = "RNA")
# Classify cells with surface markers
cth <- newCellTypeHierarchy()
SM_1_id <- row.names(subset(fData(seurat_cds), gene_short_name == surfaceMarkers[1]))
SM_2_id <- row.names(subset(fData(seurat_cds), gene_short_name == surfaceMarkers[1]))
cth <- addCellType(cth, opt$namePop, classify_func =
function(x){ x[SM_1_id,] >=1 & x[SM_2_id,] >=1})
seurat_cds <- classifyCells(seurat_cds, cth, 0.1)
# Add cellType information to the seurat object
seurat@meta.data$CellType <- pData(seurat_cds)[rownames(seurat@meta.data), "CellType"]
# Barplot cellType distribution
summary_diff <- ddply(seurat@meta.data,~FinalCluster + CellType + condition, nrow)
summary_diff$Diff_cell <- factor(summary_diff$CellType, levels = c("Unknown", "Diff"))
cellType_control_bp <- ggplot(data = data.frame(summary_diff[summary_diff$condition %in% "Control",]), aes(x = FinalCluster, y = V1, fill = CellType)) +
geom_bar(stat = "identity", position = position_dodge()) +
ylab(label = "") + xlab(label = "") + coord_flip()
cellType_treated_bp <- ggplot(data = data.frame(summary_diff[summary_diff$condition %in% "Treated",]), aes(x = FinalCluster, y = V1, fill = CellType)) +
geom_bar(stat = "identity", position = position_dodge()) +
ylab(label = "") + xlab(label = "") + coord_flip()
ggsave(paste0(outdir, "BarPlot/", 'BP_CS_Markers_', opt$namePop, '_control', '.png'), plot = cellType_control_bp, device = 'png', path = NULL,
scale = 1, width = 10, height = 10, units = 'cm', dpi = 300)
ggsave(paste0(outdir, "BarPlot/", 'BP_CS_Markers_', opt$namePop, '_treated', '.png'), plot = cellType_treated_bp, device = 'png', path = NULL,
scale = 1, width = 10, height = 10, units = 'cm', dpi = 300)
saveRDS(seurat, paste0(outdir, "seurat_CT", ".rds"))
castle <- function(source_seurat,target_seurat,outdir = "./") {
set.seed(2018)
BREAKS=c(-1, 0, 1, 6, Inf)
nFeatures = 100
dir.create(outdir,showWarnings = F)
# 1. Load datasets in scater format: loaded files expected to contain "Large SingleCellExperiment" object
# 1.1 convert from seurat (be careful seurat 2) to sce object if seurat object as input
source <- Convert(from = source_seurat, to = "sce")
target <- Convert(from = target_seurat, to = "sce")
ds1 = t(exprs(source))
ds2 = t(exprs(target))
sourceCellTypes = as.factor(colData(source)[,"library_id"])
# 2. Unify sets, excluding low expressed genes
source_n_cells_counts = apply(exprs(source), 1, function(x) { sum(x > 0) } )
print("source OK")
target_n_cells_counts = apply(exprs(target), 1, function(x) { sum(x > 0) } )
common_genes = intersect( rownames(source)[source_n_cells_counts>10],
rownames(target)[target_n_cells_counts>10]
)
remove(source_n_cells_counts, target_n_cells_counts)
ds1 = ds1[, colnames(ds1) %in% common_genes]
ds2 = ds2[, colnames(ds2) %in% common_genes]
ds = rbind(ds1[,common_genes], ds2[,common_genes])
isSource = c(rep(TRUE,nrow(ds1)), rep(FALSE,nrow(ds2)))
remove(ds1, ds2)
# 3. Highest mean in both source and target
topFeaturesAvg = colnames(ds)[order(apply(ds, 2, mean), decreasing = T)]
# 4. Highest mutual information in source
topFeaturesMi = names(sort(apply(ds[isSource,],2,function(x) { compare(cut(x,breaks=BREAKS),sourceCellTypes,method = "nmi") }), decreasing = T))
# 5. Top n genes that appear in both mi and avg
selectedFeatures = union(head(topFeaturesAvg, nFeatures) , head(topFeaturesMi, nFeatures) )
# 6. remove correlated features
tmp = cor(as.matrix(ds[,selectedFeatures]), method = "pearson")
tmp[!lower.tri(tmp)] = 0
selectedFeatures = selectedFeatures[apply(tmp,2,function(x) any(x < 0.9))]
remove(tmp)
# 7,8. Convert data from continous to binned dummy vars
# break datasets to bins
dsBins = apply(ds[, selectedFeatures], 2, cut, breaks= BREAKS)
# use only bins with more than one value
nUniq = apply(dsBins, 2, function(x) { length(unique(x)) })
# convert to dummy vars
ds = model.matrix(~ . , as.data.frame(dsBins[,nUniq>1]))
remove(dsBins, nUniq)
# 9. Classify
train = runif(nrow(ds[isSource,]))<0.8
#slightly different setup for multiclass and binary classification
if (length(unique(sourceCellTypes)) > 2) {
xg=xgboost(data=ds[isSource,][train, ] ,
label=as.numeric(sourceCellTypes[train])-1,
objective="multi:softmax", num_class=length(unique(sourceCellTypes)),
eta=0.7 , nthread=5, nround=20, verbose=0,
gamma=0.001, max_depth=5, min_child_weight=10)
} else {
xg=xgboost(data=ds[isSource,][train, ] ,
label=as.numeric(sourceCellTypes[train])-1,
eta=0.7 , nthread=5, nround=20, verbose=0,
gamma=0.001, max_depth=5, min_child_weight=10)
}
# 10. Predict
predictedClasses = predict(xg, ds[!isSource, ])
target_seurat@meta.data$predicted <- predictedClasses
conv <- data.frame(cellType = unique(sourceCellTypes),numCellTypes=unique(as.numeric(sourceCellTypes[train]))-1)
for (num in conv$numCellTypes) {
target_seurat@meta.data$predicted[target_seurat@meta.data$predicted==num] <- as.character(conv[which(conv$numCellType== num),"cellType"])
}
target_seurat@meta.data$predicted <- factor(target_seurat@meta.data$predicted,levels = c("LTHSC","STHSC", "MPP2", "MPP3"))
png(paste(outdir,"/cellTypeLearned_tsne.png", sep = ""),height = 800,width = 800)
TSNEPlot(target_seurat,group.by="predicted")
dev.off()
return(target_seurat)
}
castleSeurat3 <- function(source_seurat,target_seurat,outdir = "./") {
set.seed(2018)
BREAKS=c(-1, 0, 1, 6, Inf)
nFeatures = 100
dir.create(outdir,showWarnings = F)
# 1. Load datasets in scater format: loaded files expected to contain "Large SingleCellExperiment" object
# 1.1 convert from seurat (be careful seurat 3) to sce object if seurat object as input
source <- as.SingleCellExperiment(source_seurat, to = "sce",data = "logcounts")
target <- as.SingleCellExperiment(target_seurat, to = "sce",data = "logcounts")
ds1 = t(exprs(source))
ds2 = t(exprs(target))
sourceCellTypes = as.factor(colData(source)[,"library_id"])
# 2. Unify sets, excluding low expressed genes
source_n_cells_counts = apply(exprs(source), 1, function(x) { sum(x > 0) } )
print("source OK")
target_n_cells_counts = apply(exprs(target), 1, function(x) { sum(x > 0) } )
common_genes = intersect( rownames(source)[source_n_cells_counts>10],
rownames(target)[target_n_cells_counts>10]
)
remove(source_n_cells_counts, target_n_cells_counts)
ds1 = ds1[, colnames(ds1) %in% common_genes]
ds2 = ds2[, colnames(ds2) %in% common_genes]
ds = rbind(ds1[,common_genes], ds2[,common_genes])
isSource = c(rep(TRUE,nrow(ds1)), rep(FALSE,nrow(ds2)))
remove(ds1, ds2)
# 3. Highest mean in both source and target
topFeaturesAvg = colnames(ds)[order(apply(ds, 2, mean), decreasing = T)]
# 4. Highest mutual information in source
topFeaturesMi = names(sort(apply(ds[isSource,],2,function(x) { compare(cut(x,breaks=BREAKS),sourceCellTypes,method = "nmi") }), decreasing = T))
# 5. Top n genes that appear in both mi and avg
selectedFeatures = union(head(topFeaturesAvg, nFeatures) , head(topFeaturesMi, nFeatures) )
# 6. remove correlated features
tmp = cor(as.matrix(ds[,selectedFeatures]), method = "pearson")
tmp[!lower.tri(tmp)] = 0
selectedFeatures = selectedFeatures[apply(tmp,2,function(x) any(x < 0.9))]
remove(tmp)
# 7,8. Convert data from continous to binned dummy vars
# break datasets to bins
dsBins = apply(ds[, selectedFeatures], 2, cut, breaks= BREAKS)
# use only bins with more than one value
nUniq = apply(dsBins, 2, function(x) { length(unique(x)) })
# convert to dummy vars
ds = model.matrix(~ . , as.data.frame(dsBins[,nUniq>1]))
remove(dsBins, nUniq)
# 9. Classify
train = runif(nrow(ds[isSource,]))<0.8
#slightly different setup for multiclass and binary classification
if (length(unique(sourceCellTypes)) > 2) {
xg=xgboost(data=ds[isSource,][train, ] ,
label=as.numeric(sourceCellTypes[train])-1,
objective="multi:softmax", num_class=length(unique(sourceCellTypes)),
eta=0.7 , nthread=5, nround=20, verbose=0,
gamma=0.001, max_depth=5, min_child_weight=10)
} else {
xg=xgboost(data=ds[isSource,][train, ] ,
label=as.numeric(sourceCellTypes[train])-1,
eta=0.7 , nthread=5, nround=20, verbose=0,
gamma=0.001, max_depth=5, min_child_weight=10)
}
# 10. Predict
predictedClasses = predict(xg, ds[!isSource, ])
target_seurat@meta.data$predicted <- predictedClasses
conv <- data.frame(cellType = unique(sourceCellTypes),numCellTypes=unique(as.numeric(sourceCellTypes[train]))-1)
for (num in conv$numCellTypes) {
target_seurat@meta.data$predicted[target_seurat@meta.data$predicted==num] <- as.character(conv[which(conv$numCellType== num),"cellType"])
}
target_seurat@meta.data$predicted <- factor(target_seurat@meta.data$predicted,levels = c("LTHSC","STHSC", "MPP2", "MPP3"))
png(paste(outdir,"/cellTypeLearned_tsne.png", sep = ""),height = 800,width = 800)
DimPlot(target_seurat,group.by ="predicted")
dev.off()
return(target_seurat)
}
transfertToMonocle <- function(target_seurat,target_monocle,outdir = "./") {
pData(target_monocle)$predicted <- target_seurat@meta.data$predicted
png(paste(outdir,"/cellTypeLearned_ddrtree.png", sep = ""),height = 800,width = 800)
print(plot_cell_trajectory(target_monocle,color_by = "predicted"))
dev.off()
return(target_monocle)
}
suppressMessages(library(gProfileR))
suppressMessages(library(RColorBrewer))
require(scales)
suppressMessages(library(monocle))
suppressMessages(library(Seurat))
suppressMessages(library(scater))
suppressMessages(library(xgboost))
suppressMessages(library(igraph))
suppressMessages(library(getopt))
source("R_src/getSigPerCells.R")
source("R_src/Enrichment.R")
source("R_src/castle.R")
spec = matrix(c(
'help', 'h', 0, "logical", "Help about the program",
'sourceRDS', 'i', 1, "character", "REQUIRED: 10X data to trained classifier prepared as Seurat object (.RDS generated by RodriguezAnalysis.R).",
'targetRDS', 't', 1, "character", "REQUIRED: 10X data targeted prepared as seurat object (.RDS generated by SeuratCL.R (filtered, normalized)).",
'targetMonocleRDS', 'm', 1, "character", "REQUIRED: 10X data targeted prepared as monocle object (.RDS generated by seuratMonocleOrdering.R).",
'outdir', 'o',1, "character", 'Outdir path (default ./)'
), byrow=TRUE, ncol=5);
opt = getopt(spec)
if ( !is.null(opt$help) | is.null(opt$sourceRDS)) {
cat("Perform perform cell type assignation using a source dataset and castle XGradientBoost tool")
cat(getopt(spec, usage=TRUE))
q(status=1)
}
source_seurat <- readRDS(opt$sourceRDS)
target_seurat <- readRDS(opt$targetRDS)
seurat <- castleSeurat3(source_seurat,target_seurat,outdir = opt$outdir)
if (!is.null(opt$targetMonocleRDS)) {
target_monocle <- readRDS(opt$targetMonocleRDS)
monocle <- transfertToMonocle(seurat,target_monocle,outdir = opt$outdir)
saveRDS(monocle,paste(opt$outdir,"/monocleTrainedCt.rds",sep =""))
}
png(paste(opt$outdir,"/CellTypeLearnedPropGG.png",sep = ""))
pie <- ggplot(seurat@meta.data,
aes(x = factor(1), fill = factor(predicted))) + geom_bar(width = 1)
pie <- pie + coord_polar(theta = "y") +
theme(axis.title.x = element_blank(), axis.title.y = element_blank())
print(pie)
dev.off()
png(paste(opt$outdir,"/CellTypeLearnedProp.png",sep = ""))
pie(table(seurat@meta.data$predicted),labels = names(table(seurat@meta.data$predicted)),col = hue_pal()(4),main = paste("predicted prop"))
dev.off()
saveRDS(seurat,paste(opt$outdir,"/seuratTrainedCt.rds",sep = ""))
#Functions to compute difference of feature in seurat object (because Seurat compute logFC)
getExtAvgDiff <- function(row) {
res <- NA
if(row[1] > 0 & row[2] > 0) {
res <- min(c(row))
}
if(row[1] < 0 & row[2] < 0) {
res <- max(c(row))
}
return(res)
}
featureDiff <- function(seurat,cells.1,cells.2,feature) {
data <- GetAssayData(seurat,slot = "data")
total.diff <- mean(data[feature,cells.1]) - mean(data[feature,cells.2])
return(total.diff)
}
getTrueDiff <- function (seurat,table,colIdent = "numclust",suffix = "",colTest="AGE") {
if(!is.null(levels(seurat[[colTest]]))) {
factors <- levels(seurat@meta.data[,colTest])
} else {
factors <- unique(seurat@meta.data[,colTest])
}
factors <- as.vector(factors)
table[,paste("avg_diff",suffix,sep = "")] <- NA
for (rm in rownames(table)) {
feature <- table[rm,"Gene"]
cells.1 <- colnames(seurat)[which(seurat[[colIdent]] == table[rm,colIdent] & seurat[[colTest]] == factors[1])]
cells.2 <- colnames(seurat)[which(seurat[[colIdent]] == table[rm,colIdent] & seurat[[colTest]] == factors[2])]
table[rm,paste("avg_diff",suffix,sep = "")] <- featureDiff(seurat,cells.1,cells.2,feature)
}
return(table)
}
## Test functions
FindMarkerPerClustGroupVar <- function(cluster,
hspc.combined,
condition ="AGE", ## only var with two factors allowed
grouping.var = "platform",
max.cells.per.ident = Inf,
filterOnPadj = T,
logfc.threshold = 0.25,
min.pct = 0.1,
keepDiverging = F,
test.use = "wilcox",
identCol = "numclust",
pseudocount.use = 1,
computeTrueDiff= F) {
clusterCondition <- paste0("cluster.",condition)
if(!is.null(levels(seurat@meta.data[,condition]))) {
factors <- levels(seurat@meta.data[,condition])
} else {
factors <- unique(seurat@meta.data[,condition])
}
hspc.combined@meta.data[,clusterCondition] <- paste(hspc.combined@meta.data[,identCol],
hspc.combined@meta.data[,condition], sep = "_")
test <- table(hspc.combined@meta.data[,clusterCondition], hspc.combined@meta.data[,grouping.var])
groups = unique(hspc.combined@meta.data[,grouping.var])
ident1 = paste0(cluster,"_",factors[1])
ident2 = paste0(cluster,"_",factors[2])
if( (test[ident1,groups[1]]>3 & test[ident1,groups[2]]>3) &
(test[ident2,groups[1]]>3&test[ident2,groups[2]]>3)) {
Idents(object = hspc.combined) <- clusterCondition
markers <- FindConservedMarkers(hspc.combined,
assay = DefaultAssay(hspc.combined),
grouping.var = grouping.var,
test.use=test.use,
ident.1 = ident1,
ident.2 = ident2,
pseudocount.use = pseudocount.use,
min.pct = min.pct,
logfc.threshold = logfc.threshold,
max.cells.per.ident = max.cells.per.ident)
print(dim(markers))
#Add a column the min abs(logFC)
markers$min_avg_logFC <- NA
if(!keepDiverging) {
#print("keep only markers with conserved logfc sign")
markers <- markers[which(markers[,paste0(groups[1],"_avg_logFC")]/markers[,paste0(group[2],"_avg_logFC")] > 0),]
} else {
#print("combined p_val of diverging markers between two batch are set to 1 and their min_avg_logfc to 0")
markers[which(markers[,paste0(groups[1],"_avg_logFC")]/markers[,paste0(group[2],"_avg_logFC")] > 0),"min_avg_logFC"] <- 0
markers[which(markers[,paste0(groups[1],"_avg_logFC")]/markers[,paste0(group[2],"_avg_logFC")] > 0),"minimump_p_val"] <- 1
}
for (g in rownames(markers)) {
if(markers[g,paste0(groups[1],"_avg_logFC")] < 0) {
markers[g,"min_avg_logFC"] <- max(markers[g,c(paste0(groups[1],"_avg_logFC"),paste0(groups[2],"_avg_logFC"))])
} else {
markers[g,"min_avg_logFC"] <- min(markers[g,c(paste0(groups[1],"_avg_logFC"),paste0(groups[2],"_avg_logFC"))])
}
}
markers$Cluster <- paste0(ident1,"_up")
markers$Cluster[which(markers$min_avg_logFC < 0)] <- paste0(ident1,"_down")
markers$Gene <- rownames(markers)
markers <- markers[order(markers$min_avg_logFC),]
if(computeTrueDiff) {
markers$numclust <- str_split_fixed(markers$Cluster,pattern = "_",n=3)[,1]
markers <- getTrueDiff(seurat[,which(seurat[[grouping.var]] == group[1])],
markers,
colIdent = "numclust",
colTest = condition,
suffix = paste0("_",group[1]))
markers <- getTrueDiff(seurat[,which(seurat[[grouping.var]] == group[2])],
markers,
colIdent = "numclust",
colTest = condition,
suffix = paste0("_",group[2]))
markers$min_avg_diff <- apply(markers[,c(paste0("avg_diff_",group[1]),paste0("avg_diff_",group[2]))],
1,
FUN=getExtAvgDiff)
}
if (filterOnPadj) {
markers <- markers[which(markers$A_p_val_adj < 0.05 & markers$B_p_val_adj < 0.05),]
}
}else {
markers <- NULL
}
return(markers)
}
FindMarkerPerClust <- function(cluster,
hspc.combined,
condition ="AGE", ## only var with two factors allowed
max.cells.per.ident = Inf,
logfc.threshold = 0.25,
min.pct = 0.1,
test.use = "wilcox",
identCol = "numclust",
pseudocount.use = 1,
computeTrueDiff = F) {
clusterCondition <- paste0("cluster.",condition)
if(!is.null(levels(seurat@meta.data[,condition]))) {
factors <- levels(seurat@meta.data[,condition])
} else {
factors <- unique(seurat@meta.data[,condition])
}
hspc.combined@meta.data[,clusterCondition] <- paste(hspc.combined@meta.data[,identCol],
hspc.combined@meta.data[,condition], sep = "_")
test <- table(hspc.combined@meta.data[,clusterCondition])
ident1 = paste0(cluster,"_",factors[1])
ident2 = paste0(cluster,"_",factors[2])
if( test[ident1]>3 & test[ident2]>3) {
Idents(object = hspc.combined) <- clusterCondition
markers <- FindMarkers(hspc.combined,
assay = DefaultAssay(hspc.combined),
test.use=test.use,
ident.1 = ident1,
ident.2 = ident2,
pseudocount.use = pseudocount.use,
min.pct = min.pct,
logfc.threshold = logfc.threshold,
max.cells.per.ident = max.cells.per.ident)
markers$Cluster <- paste0(ident1,"_up")
markers$Cluster[which(markers$avg_logFC < 0)] <- paste0(ident1,"_down")
markers$Gene <- rownames(markers)
markers <- markers[order(markers$avg_logFC),]
if(computeTrueDiff) {
markers$numclust <- str_split_fixed(markers$Cluster,pattern = "_",n=3)[,1]
markers <- getTrueDiff(seurat,
markers,
colIdent = "numclust",
colTest = condition
)
}
} else {
markers <- NULL
}
return(markers)
}
# R function to prepare single cell data for monocle & Seurat
getCellCyclePhases <- function(gbm_cds,outdir = "./") {
dir.create(path = outdir,recursive = T,showWarnings = F)
gene_count_matrix <- as.matrix(exprs(gbm_cds))
set.seed(100)
mm.pairs <- readRDS(system.file("exdata", "mouse_cycle_markers.rds", package="scran"))
assignments <- cyclone(gene_count_matrix, mm.pairs, gene.names=rownames(gene_count_matrix))
## add cell cycle phase to pData
pData(gbm_cds)$phases <- assignments$phases
pData(gbm_cds)$G1_score <- assignments$scores$G1
pData(gbm_cds)$G2M_score <- assignments$scores$G2M
pData(gbm_cds)$S_score <- assignments$scores$S
pData(gbm_cds)[which(pData(gbm_cds)$phases=="G1"),"phases"] <- "G1_G0"
pData(gbm_cds)[which(pData(gbm_cds)$phases=="G2M"),"phases"] <- "G2_M"
png(paste(outdir,"/cell_cycle_phases_assignment_on_umi.png",sep = ""))
plot(assignments$score$G1, assignments$score$G2M,
xlab="G1 score", ylab="G2/M score", pch=16)
dev.off()
print("Cell cycle phases added..")
return(gbm_cds)
}
getCellCyclePhasesSeurat <- function(seurat,outdir = "./") {
dir.create(path = outdir,recursive = T,showWarnings = F)
gene_count_matrix <- as.matrix(GetAssayData(seurat,slot = 'counts',assay = "RNA"))
set.seed(100)
mm.pairs <- readRDS(system.file("exdata", "mouse_cycle_markers.rds", package="scran"))
assignments <- cyclone(gene_count_matrix, mm.pairs, gene.names=rownames(gene_count_matrix))
## add cell cycle phase to pData
seurat@meta.data$phases <- assignments$phases
seurat@meta.data$G1_score <- assignments$scores$G1
seurat@meta.data$G2M_score <- assignments$scores$G2M
seurat@meta.data$S_score <- assignments$scores$S
seurat@meta.data[which(seurat@meta.data$phases=="G1"),"phases"] <- "G1_G0"
seurat@meta.data[which(seurat@meta.data$phases=="G2M"),"phases"] <- "G2_M"
png(paste(outdir,"/cell_cycle_phases_assignment_on_umi.png",sep = ""))
plot(assignments$score$G1, assignments$score$G2M,
xlab="G1 score", ylab="G2/M score", pch=16)
dev.off()
print("Cell cycle phases added..")
return(seurat)
}
addPercentMitoch <- function(gbm_cds) { #in fact this is fraction of mitochondrial transcripts
mitoGenes <- grep(pattern = "mt-",x=fData(gbm_cds)$gene_short_name,value = T)
mitoGenesEnsembl <- rownames(gbm_cds)[which(is.element(el = fData(gbm_cds)$gene_short_name,set = mitoGenes))]
percentMito <- Matrix::colSums(as.matrix(exprs(gbm_cds))[mitoGenesEnsembl, ])/Matrix::colSums(as.matrix(exprs(gbm_cds)))
pData(gbm_cds)$percentMito <- percentMito
return(gbm_cds)
}
#Filtering low-quality cells
filterCells <- function(gbm_cds,outdir="./",num_cells_expressed=10,min_expr=0.1,propMitochFilter=NULL) {
dir.create(path = outdir,recursive = T,showWarnings = F)
#Set detection threshold
gbm_cds <- detectGenes(gbm_cds, min_expr = min_expr)
#Define gene expressed as gene detected in more than n cell (only use for plotting distribution)
expressed_genes <- row.names(subset(fData(gbm_cds),
num_cells_expressed >= num_cells_expressed))
#filtering cells as explained in monocle doc, cutting distribution tails
#eg. remove cells with no RNA or too much RNA
pData(gbm_cds)$Total_mRNAs <- Matrix::colSums(exprs(gbm_cds))
png(paste(outdir,"/densityTotalmRNA_raw.png",sep =""))
qplot(Total_mRNAs, data = pData(gbm_cds), geom ="density")
dev.off()
upper_bound <- 10^(mean(log10(pData(gbm_cds)$Total_mRNAs)) +
2*sd(log10(pData(gbm_cds)$Total_mRNAs)))
lower_bound <- 10^(mean(log10(pData(gbm_cds)$Total_mRNAs)) -
2*sd(log10(pData(gbm_cds)$Total_mRNAs)))
png(paste(outdir,"/densityWithFirstFilter.png",sep =""))
print(qplot(Total_mRNAs, data = pData(gbm_cds), geom ="density") +
geom_vline(xintercept = lower_bound) +
geom_vline(xintercept = upper_bound))
dev.off()
gbm_cds <- gbm_cds[,pData(gbm_cds)$Total_mRNAs > lower_bound &
pData(gbm_cds)$Total_mRNAs < upper_bound]
# redefining expressed gene after cells filtering
gbm_cds <- detectGenes(gbm_cds, min_expr = min_expr)
expressed_genes <- row.names(subset(fData(gbm_cds),num_cells_expressed >= num_cells_expressed)) #genes expressed in at least 10 cells of the data set.
# Log-transform each value in the expression matrix.
L <- log(exprs(gbm_cds[expressed_genes,]))
# Standardize each gene, so that they are all on the same scale,
# Then melt the data with plyr so we can plot it easily
melted_dens_df <- reshape2::melt(Matrix::t(scale(Matrix::t(L))))
# Plot the distribution of the standardized gene expression values.
png(paste(outdir,"/standardized_distribution.png",sep =""))
print(qplot(value, geom = "density", data = melted_dens_df) +
stat_function(fun = dnorm, size = 0.5, color = 'red') +
xlab("Standardized log(UMIcounts)") +
ylab("Density"))
dev.off()
## Visualize percentage of mitochondiral genes
gbm_cds <- addPercentMitoch(gbm_cds)
png(paste(outdir,"/percent_mito.png",sep =""))
plot(hist(gbm_cds$percentMito*100,breaks = 100),main = "percentage of mitochondrial transcripts")
abline(v=propMitochFilter*100)
dev.off()
return(gbm_cds)
}
do_consensus <- function(path){
# Opening the predicted files
predicted_files <- list.files(path)
predicted_names <- strsplit(predicted_files, split = ".txt")
predicted_list <- list()
for(i in 1:length(predicted_files)){
name <- predicted_names[[i]]
print(name)
predicted_list[[name]] <- assign(predicted_names[[i]], read.table(paste0(path, predicted_files[i]), row.names = 1, header = T))
}
# adding all the columns to a single table
predicted_table <- do.call("cbind", predicted_list)
# convert to dataframe
predicted_data <- data.frame(predicted_table)
# transpose
predicted_data <- t(predicted_data)
# creation "Consensus"
predicted_consensus=data.frame(Consensus = 1:ncol(predicted_data))
for (i in 1:ncol(predicted_data)){
table_compteur <- table(predicted_data[,i])
predicted_consensus[i,1] <- names(which.max(table_compteur))
rownames(predicted_consensus)[i] <- colnames(predicted_data)[i]
}
return(predicted_consensus)
}
# Script to make analyzis of pseudotime with a gbm_cds ordered
# command line interface
## -----------------------------------------------------------------------------
## Libraries
## -----------------------------------------------------------------------------
suppressMessages(library(monocle))
suppressMessages(library(getopt))
suppressMessages(library(biomaRt))
source("R_src/Enrichment.R")
spec = matrix(c(
'help', 'h', 0, "logical", "Help about the program",
'inputData', 'i', 1, "character", "REQUIRED: matrix data (.tsv generated by the report) from 10X data ordered by monocle.",
'outdir', 'o',1, "character", 'Outdir path (default ./)',
'firstFilter', 'f',1,"numeric", 'Keep only the genes with at least N = f(X) 2 mean imputed counts across all samples (X=2 by default) \n(e.g. the total number the gene would have, if it was expressed with a value of X in 1% of the cells). \nAdjust X value according to the dataset (it will depend on the dataset units, e.g. UMI, TPMs???).',
'secondFilter', 's',1, "numeric", 'Keep the genes that are detected in at least X% of the cells (X=1% by default)'
), byrow=TRUE, ncol=5);
opt = getopt(spec)
# if help was asked, print a friendly message
# and exit with a non-zero error code
args <- commandArgs()
if ( !is.null(opt$help) | is.null(opt$inputData)) {
cat("Gene filtering of a gbm cds ordered. For scenic use, two filters")
cat(getopt(spec, usage=TRUE))
q(status=1)
}
#set default arguments values
if (is.null(opt$outdir)) {
opt$outdir = "./"
}
if (is.null(opt$firstFilter)) {
opt$firstFilter = 2
}
if (is.null(opt$secondFilter)) {
opt$secondFilter = 0.01
}
data <- read.table(opt$inputData,sep = "\t",header = T,row.names = 1)
genes <- list()
genes$names <- rownames(data)
nCellsPerGene <- apply(data, 1, function(x) sum(x>0))
nCountsPerGene <- apply(data, 1, sum)
summary(nCellsPerGene)
sum(data>0) / sum(data==0)
#first filter
minReads <- opt$firstFilter*.01*ncol(data)
genesLeftMinReads <- names(nCountsPerGene)[which(nCountsPerGene > minReads)]
length(genesLeftMinReads)
#second filter
minSamples <- ncol(data)*opt$secondFilter
nCellsPerGene2 <- nCellsPerGene[genesLeftMinReads]
genesLeftMinCells <- names(nCellsPerGene2)[which(nCellsPerGene2 > minSamples)]
length(genesLeftMinCells)
t_data <- t(data)
print(head(genesLeftMinCells))
write.table(t_data[,c(genesLeftMinCells)],file = paste(opt$outdir,"/expressionRawCountFilteredForScenic.tsv",sep=""),sep = '\t',quote = F)
# Script to make analyzis of pseudotime with a gbm_cds ordered
# command line interface
## -----------------------------------------------------------------------------
## Libraries
## -----------------------------------------------------------------------------
# http://stackoverflow.com/questions/4090169/elegant-way-to-check-for-missing-packages-and-install-them
suppressMessages(library(monocle))
suppressMessages(library(getopt))
spec = matrix(c(
'help', 'h', 0, "logical", "Help about the program",
'inputRDS', 'i', 1, "character", "REQUIRED: 10X data ordered by monocle (.RDS generated by orderCL.R).",
'outdir', 'o',1, "character", 'Outdir path (default ./)',
'firstFilter', 'f',1,"numeric", 'Keep only the genes with at least N = f(X) UMI counts across all samples (X=2 by default) \n(e.g. the total number the gene would have, if it was expressed with a value of X in 1% of the cells). \nAdjust X value according to the dataset (it will depend on the dataset units, e.g. UMI, TPMs???).',
'secondFilter', 's',1, "numeric", 'Keep the genes that are detected in at least X% of the cells (X=1% by default)',
'markerTable', 'm',1, "character", "Markers table for the identification of transcription factors",
'mouseTF','t', 1, 'character', 'Mouse transcription factors list to identify TF in markers table'
), byrow=TRUE, ncol=5);
opt = getopt(spec)
# if help was asked, print a friendly message
# and exit with a non-zero error code
args <- commandArgs()
if ( !is.null(opt$help) | is.null(opt$inputRDS)) {
cat("Gene filtering of a gbm cds ordered. For scenic use, two filters")
cat(getopt(spec, usage=TRUE))
q(status=1)
}
#set default arguments values
if (is.null(opt$outdir)) {
opt$outdir = "./"
}
if (is.null(opt$firstFilter)) {
opt$firstFilter = 2
}
if (is.null(opt$secondFilter)) {
opt$secondFilter = 0.01
}
if(!is.null(opt$markerTable)) {
print("find TF in marker table...")
markerTable <- read.table(opt$markerTable,sep = "\t",header = T)
TFlist <- read.table(opt$mouseTF,header = F)
TF_markers <- data.frame(external_gene_name = markerTable$gene[which(markerTable$gene %in% TFlist$V1)])
#Add tf from bonzanni plus Gfi1b
TF_bonzani <- data.frame(external_gene_name = c("Spi1","Tal1","Zfpm1","Cbfa2t3","Erg","Fli1","Gata1","Gata2","Hhex","Runx1","Smad6","Zbtb16","Gfi1b"))
TF_markers <- rbind(TF_markers,TF_bonzani)
TF_markers <- unique(TF_markers$external_gene_name)
write.table(TF_markers,paste(opt$outdir,"/markers_and_bonzanni_TF.tsv",sep =""),
sep = "\t",quote = F,row.names = F, col.names = F)
}
gbm_cds <- readRDS(opt$inputRDS)
# Convert to monocle object if necessary (filtering script initially made for monocle)
if(is(gbm_cds) == "Seurat") {
#It is easier to pass by a monocle object and then reget the seurat object with the genes filtered
print("input seurat object converting it to monocle")
DefaultAssay(object = gbm_cds) <- "RNA"
pd <- new("AnnotatedDataFrame", data = gbm_cds@meta.data)
fd <- new("AnnotatedDataFrame", data = data.frame(gene_short_name = rownames(gbm_cds)))
rownames(fd) <- fd$gene_short_name
gbm_cds <- newCellDataSet(GetAssayData(gbm_cds,slot = "counts"),
phenoData = pd,
featureData = fd,
lowerDetectionLimit = 0.1,
expressionFamily = negbinomial.size())
gbm_cds <- detectGenes(gbm_cds, min_expr = 0.1)
}
exp <- as.matrix(exprs(gbm_cds))
geneShortName<-make.unique(as.character(fData(gbm_cds)$gene_short_name)) # to see between make.unique, remove dup..
rownames(exp) <- geneShortName
nCellsPerGene <- apply(exp, 1, function(x) sum(x>0))
nCountsPerGene <- apply(exp, 1, sum)
summary(nCellsPerGene)
sum(exp>0) / sum(exp==0)
#first filter
minReads <- opt$firstFilter*.01*ncol(exp)
genesLeftMinReads <- names(nCountsPerGene)[which(nCountsPerGene > minReads)]
length(genesLeftMinReads)
#second filter
minSamples <- ncol(exp)*opt$secondFilter
nCellsPerGene2 <- nCellsPerGene[genesLeftMinReads]
genesLeftMinCells <- names(nCellsPerGene2)[which(nCellsPerGene2 > minSamples)]
length(genesLeftMinCells)
t_exp <- t(exp)
print(head(genesLeftMinCells))
write.table(t_exp[,genesLeftMinCells],file = paste(opt$outdir,"/expressionRawCountFilteredForScenic.tsv",sep=""),sep = '\t',quote = F)
# Script to make analyzis of pseudotime with a gbm_cds ordered
# command line interface
## -----------------------------------------------------------------------------
## Libraries
## -----------------------------------------------------------------------------
# http://stackoverflow.com/questions/4090169/elegant-way-to-check-for-missing-packages-and-install-them
suppressMessages(library(Seurat))
suppressMessages(library(getopt))
spec = matrix(c(
'help', 'h', 0, "logical", "Help about the program",
'inputRDS', 'i', 1, "character", "REQUIRED: 10X data ordered by monocle (.RDS generated by orderCL.R).",
'outdir', 'o',1, "character", 'Outdir path (default ./)',
'firstFilter', 'f',1,"numeric", 'Keep only the genes with at least N = f(X) UMI counts across all samples (X=2 by default) \n(e.g. the total number the gene would have, if it was expressed with a value of X in 1% of the cells). \nAdjust X value according to the dataset (it will depend on the dataset units, e.g. UMI, TPMs???).',
'secondFilter', 's',1, "numeric", 'Keep the genes that are detected in at least X% of the cells (X=1% by default)',
'mouseTF','t', 1, 'character', 'Mouse transcription factors list to identify TF in markers table'
), byrow=TRUE, ncol=5);
opt = getopt(spec)
# if help was asked, print a friendly message
# and exit with a non-zero error code
args <- commandArgs()
if ( !is.null(opt$help) | is.null(opt$inputRDS)) {
cat("Gene filtering of a gbm cds ordered. For scenic use, two filters")
cat(getopt(spec, usage=TRUE))
q(status=1)
}
#set default arguments values
if (is.null(opt$outdir)) {
opt$outdir = "./"
}
if (is.null(opt$firstFilter)) {
opt$firstFilter = 2
}
if (is.null(opt$secondFilter)) {
opt$secondFilter = 0.01
}
seurat <- readRDS(opt$inputRDS)
DefaultAssay(object = seurat) <- "RNA"
exp <- as.matrix(GetAssayData(seurat,slot = "counts"))
nCellsPerGene <- apply(exp, 1, function(x) sum(x>0))
nCountsPerGene <- apply(exp, 1, sum)
summary(nCellsPerGene)
sum(exp>0) / sum(exp==0)
#first filter
minReads <- opt$firstFilter*.01*ncol(exp)
genesLeftMinReads <- names(nCountsPerGene)[which(nCountsPerGene > minReads)]
length(genesLeftMinReads)
#second filter
minSamples <- ncol(exp)*opt$secondFilter
nCellsPerGene2 <- nCellsPerGene[genesLeftMinReads]
genesLeftMinCells <- names(nCellsPerGene2)[which(nCellsPerGene2 > minSamples)]
length(genesLeftMinCells)
t_exp <- t(exp)
print(head(genesLeftMinCells))
t_exp <- t(exp)
print(head(genesLeftMinCells))
write.table(t_exp[,genesLeftMinCells],file = paste(opt$outdir,"/expressionRawCountFilteredForScenic.tsv",sep=""),sep = '\t',quote = F)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment