diff --git a/R_src/DoMultipleBarHeatmap.R b/R_src/DoMultipleBarHeatmap.R new file mode 100644 index 0000000000000000000000000000000000000000..7a1fd93a301e0404fac2a3bcdaa726ff36d54b4a --- /dev/null +++ b/R_src/DoMultipleBarHeatmap.R @@ -0,0 +1,179 @@ +# Adapted from https://github.com/satijalab/seurat/issues/2201 +DoMultiBarHeatmap <- function (object, + features = NULL, + cells = NULL, + group.by = "ident", + additional.group.by = NULL, + group.bar = TRUE, + disp.min = -2.5, + disp.max = NULL, + slot = "scale.data", + assay = NULL, + label = TRUE, + size = 5.5, + hjust = 0, + angle = 45, + raster = TRUE, + draw.lines = TRUE, + lines.width = NULL, + group.bar.height = 0.02, + combine = TRUE, + group.col = NULL + ) +{ + cells <- cells %||% colnames(x = object) + if (is.numeric(x = cells)) { + cells <- colnames(x = object)[cells] + } + assay <- assay %||% DefaultAssay(object = object) + DefaultAssay(object = object) <- assay + features <- features %||% VariableFeatures(object = object) + ## Why reverse??? + features <- rev(x = unique(x = features)) + disp.max <- disp.max %||% ifelse(test = slot == "scale.data", + yes = 2.5, no = 6) + possible.features <- rownames(x = GetAssayData(object = object, + slot = slot)) + if (any(!features %in% possible.features)) { + bad.features <- features[!features %in% possible.features] + features <- features[features %in% possible.features] + if (length(x = features) == 0) { + stop("No requested features found in the ", slot, + " slot for the ", assay, " assay.") + } + warning("The following features were omitted as they were not found in the ", + slot, " slot for the ", assay, " assay: ", paste(bad.features, + collapse = ", ")) + } + data <- as.data.frame(x = as.matrix(x = t(x = GetAssayData(object = object, + slot = slot)[features, cells, drop = FALSE]))) + + object <- suppressMessages(expr = StashIdent(object = object, + save.name = "ident")) + + group.by <- group.by %||% "ident" + groups.use <- object[[c(group.by, additional.group.by)]][cells, , drop = FALSE] + plots <- list() + for (i in group.by) { + data.group <- data + group.use <- groups.use[, c(i, additional.group.by), drop = FALSE] + + for(colname in colnames(group.use)){ + if (!is.factor(x = group.use[[colname]])) { + group.use[[colname]] <- factor(x = group.use[[colname]]) + } + } + + if (draw.lines) { + lines.width <- lines.width %||% ceiling(x = nrow(x = data.group) * + 0.0025) + placeholder.cells <- sapply(X = 1:(length(x = levels(x = group.use[[i]])) * + lines.width), FUN = function(x) { + return(Seurat:::RandomName(length = 20)) + }) + placeholder.groups <- data.frame(foo=rep(x = levels(x = group.use[[i]]), times = lines.width)) + placeholder.groups[additional.group.by] = NA + colnames(placeholder.groups) <- colnames(group.use) + rownames(placeholder.groups) <- placeholder.cells + + group.levels <- levels(x = group.use[[i]]) + + group.use <- sapply(group.use, as.vector) + rownames(x = group.use) <- cells + + group.use <- rbind(group.use, placeholder.groups) + + na.data.group <- matrix(data = NA, nrow = length(x = placeholder.cells), + ncol = ncol(x = data.group), dimnames = list(placeholder.cells, + colnames(x = data.group))) + data.group <- rbind(data.group, na.data.group) + } + + + + group.use <- group.use[with(group.use, eval(parse(text=paste('order(', paste(c(i, additional.group.by), collapse=', '), ')', sep='')))), , drop=F] + + + plot <- Seurat:::SingleRasterMap(data = data.group, raster = raster, + disp.min = disp.min, disp.max = disp.max, feature.order = features, + cell.order = rownames(x = group.use), group.by = group.use[[i]]) + + if (group.bar) { + + if(is.null(group.col)) { + colPal <- scales::hue_pal() + } else { + colPal <- function(n) { + return(group.col[c(1:n)]) + } + } + pbuild <- ggplot_build(plot = plot) + group.use2 <- group.use + cols <- list() + na.group <- Seurat:::RandomName(length = 20) + for (colname in rev(x = colnames(group.use2))){ + if (colname == group.by){ + colid = colname + colPal2 <- colPal + } else { + colid = colname + colPal2 <- function(n) { + return(rev(scales::hue_pal()(n))) + } + } + + if (draw.lines) { + levels(x = group.use2[[colname]]) <- c(levels(x = group.use2[[colname]]), na.group) + group.use2[placeholder.cells, colname] <- na.group + cols[[colname]] <- c(colPal2(length(x = levels(x = group.use[[colname]]))), "#FFFFFF") + } else { + cols[[colname]] <- c(colPal2(length(x = levels(x = group.use[[colname]])))) + } + names(x = cols[[colname]]) <- levels(x = group.use2[[colname]]) + y.range <- diff(x = pbuild$layout$panel_params[[1]]$y.range) + y.pos <- max(pbuild$layout$panel_params[[1]]$y.range) + y.range * 0.015 + y.max <- y.pos + group.bar.height * y.range + pbuild$layout$panel_params[[1]]$y.range <- c(pbuild$layout$panel_params[[1]]$y.range[1], y.max) + #print(cols[[colname]][group.use2[[colname]]]) + plot <- suppressMessages(plot + + annotation_raster(raster = t(x = cols[[colname]][group.use2[[colname]]]), xmin = -Inf, xmax = Inf, ymin = y.pos, ymax = y.max) + + annotation_custom(grob = grid::textGrob(label = colid, hjust = 0, gp = grid::gpar(cex = 0.75)), ymin = mean(c(y.pos, y.max)), ymax = mean(c(y.pos, y.max)), xmin = Inf, xmax = Inf) + + coord_cartesian(ylim = c(0, y.max), clip = "off")) + + #temp <- as.data.frame(cols[[colname]][levels(group.use[[colname]])]) + #colnames(temp) <- 'color' + #temp$x <- temp$y <- 1 + #temp[['name']] <- as.factor(rownames(temp)) + + #temp <- ggplot(temp, aes(x=x, y=y, fill=name)) + geom_point(shape=21, size=5) + labs(fill=colname) + theme(legend.position = "bottom") + #legend <- get_legend(temp) + #multiplot(plot, legend, heights=3,1) + + if ((colname == group.by) && label) { + x.max <- max(pbuild$layout$panel_params[[1]]$x.range) + x.divs <- pbuild$layout$panel_params[[1]]$x.major + group.use$x <- x.divs + label.x.pos <- tapply(X = group.use$x, INDEX = group.use[[colname]], + FUN = median) * x.max + label.x.pos <- data.frame(group = names(x = label.x.pos), + label.x.pos) + label.x.pos$group <- str_split_fixed(label.x.pos$group,"_",2)[,2] + plot <- plot + geom_text(stat = "identity", + data = label.x.pos, aes_string(label = "group", + x = "label.x.pos"), y = y.max + y.max * + 0.03 * 0.5, angle = angle, hjust = hjust, + size = size) + plot <- suppressMessages(plot + coord_cartesian(ylim = c(0, + y.max + y.max * 0.002 * max(nchar(x = levels(x = group.use[[colname]]))) * + size), clip = "off")) + } + } + } + plot <- plot + theme(line = element_blank()) + plots[[i]] <- plot + } + if (combine) { + plots <- CombinePlots(plots = plots) + } + return(plots) +} diff --git a/R_src/Enrichment.R b/R_src/Enrichment.R new file mode 100644 index 0000000000000000000000000000000000000000..b412636836161261ffa8e4d9ec08bc49fac8edf9 --- /dev/null +++ b/R_src/Enrichment.R @@ -0,0 +1,156 @@ +# function to make gene enrichment analyzis with gProfileR + +## function to perform gprofler enrcihment analysis on each gene cluster of +## a differentially expressed gene result (eg data frame of DE genes with a column cluster) +getGeneClustGprofile <- function(deg_clust, + background, + organism = "mmusculus", + hier_filtering = "none", + ordered_query = F) { + + resEnrichTest <- lapply(split(as.vector(deg_clust$Gene),f= deg_clust$Cluster),gprofiler, + organism = "mmusculus", + custom_bg = background, + ordered_query = ordered_query, + hier_filtering = hier_filtering) + return(resEnrichTest) +} + + +## function to plot gprofiler results +plotWriteResEnrich <- function(resEnrichTest, + sources=c("BP","keg"), + outdir = "./gprofiler", + clusterLabel =names(resEnrichTest), + colors = brewer.pal(length(resEnrichTest)+1, "Set1")) { + + dir.create(outdir,recursive = T,showWarnings = F) + + colors <- colors + + for (cluster in clusterLabel) { + write.table(file= paste(outdir,"/gprofiler_table_clust_",cluster,".tsv", sep = ""), + resEnrichTest[[cluster]],quote = F,sep = '\t',row.names = F) + for (s in sources) { + results <- resEnrichTest[[cluster]][which(resEnrichTest[[cluster]][,"domain"] == s),] + results <- results[order(results$p.value),] + results <- as.data.frame(results[,c("term.id","term.name","p.value")]) + results$logPval <- -log(base = 10,x = results$p.value) + if(s == "tf") { + results$term.name <- paste(results$term.name,results$term.id) + } + + ## TO DO : if to much term plot only the first n + + if (length(results$term.name) > 55) { + png(paste(outdir,"/gprofiler_top30_",cluster,"_",s,".png",sep =""),width = 600,height = 600) + gp <- ggplot(results[c(1:30),], aes(x=reorder(term.name, logPval), y=logPval)) + + geom_bar(stat='identity',fill = colors[which(names(resEnrichTest)==cluster)] ) + + coord_flip() + + xlab("term name") + + ylab("-log10(p value)") + + theme(text = element_text(size = 20)) + print(gp) + + dev.off() + + png(paste(outdir,"/gprofiler_",cluster,"_",s,".png",sep =""),width = 1400,height = 1400) + + } else { + png(paste(outdir,"/gprofiler_",cluster,"_",s,".png",sep =""),width = 600,height = 600) + } + gp <- ggplot(results, aes(x=reorder(term.name, logPval), y=logPval)) + + geom_bar(stat='identity',fill = colors[which(names(resEnrichTest)==cluster)] ) + + coord_flip() + + xlab("term name") + + ylab("-log10(p value)") + + theme(text = element_text(size = 20)) + print(gp) + + dev.off() + } + + } +} + +## function to composate +gProfileAnalysis <- function(deg_clust, + background, + organism = "mmusculus", + hier_filtering = "moderate", + ordered_query = F, + sources=c("BP","keg","CC",'MF','tf'), + outdir = "/gprofiler", + clusterLabel =names(resEnrichTest), + colors = brewer.pal(length(resEnrichTest)+1, "Set1")) { + + resEnrichTest <- getGeneClustGprofile(deg_clust = deg_clust, + background = background, + organism = organism, + hier_filtering = hier_filtering, + ordered_query = ordered_query) + + plotWriteResEnrich(resEnrichTest, + sources=sources, + outdir = outdir, + clusterLabel =clusterLabel, + colors = colors) + + return(resEnrichTest) + +} + +# function for hypergeometric test + +# hypergeometric test https://dputhier.github.io/ASG/practicals/go_statistics_td/go_statistics_td_2015.html + +#DEPRECATED Seurat 2 +# testHyper <- function(gene_set, signature, background,seurat) { +# signature <- signature[which(is.element(signature,set = rownames(seurat@data)))] +# genesMarked <- gene_set[which(is.element(gene_set,set = signature))] +# p.value <- phyper(q=length(genesMarked) -1, +# m=length(signature), +# n=length(background) - length(signature), k= length(gene_set), lower.tail=FALSE) +# return(p.value) +# } + +#For Seurat 3 +testHyper3 <- function(gene_set, signature, background,seurat) { + signature <- signature[which(is.element(signature,set = rownames(seurat)))] + genesMarked <- gene_set[which(is.element(gene_set,set = signature))] + p.value <- phyper(q=length(genesMarked) -1, + m=length(signature), + n=length(background) - length(signature), k= length(gene_set), lower.tail=FALSE) + return(p.value) +} + + + +testHyperCells <- function(cells_pull, cells_setTarget, allCells) { + cellsMarked <- gene_set[which(is.element(gene_set,set = cells_set))] + p.value <- phyper(q=length(cellsMarked) -1, + m=length(cells_setTarget), + n=length(allCells) - length(cells_setTarget), k= length(cells_pull), lower.tail=FALSE) + return(p.value) +} + +## DEPRECATED Seurat2 +# testHyperSig <- function(signature,seurat,markers,clust) { +# result <- testHyper(gene_set=markers[which(markers$Cluster == clust),"Gene"], +# signature=signature,seurat=seurat,background=rownames(seurat@data)) +# return(result) +# } + +#For Seurat 3 +testHyperSig3 <- function(signature,seurat,markers,clust) { + result <- testHyper3(gene_set=markers[which(markers$Cluster == clust),"Gene"], + signature=signature,seurat=seurat,background=rownames(seurat)) + return(result) +} + + + + + + + diff --git a/R_src/Seurat3CL.R b/R_src/Seurat3CL.R new file mode 100644 index 0000000000000000000000000000000000000000..d4237602e4685ba99958404d14ba687ad316cedb --- /dev/null +++ b/R_src/Seurat3CL.R @@ -0,0 +1,519 @@ +##------------------------------------------------------------------------------ +## L. Herault +##------------------------------------------------------------------------------ + +## ----------------------------------------------------------------------------- +## Libraries +## ----------------------------------------------------------------------------- + +suppressMessages(library(monocle)) +suppressMessages(library(Seurat)) +suppressMessages(library(BiocParallel)) +suppressMessages(library(getopt)) +suppressMessages(library(gridExtra)) +suppressMessages(library(gProfileR)) +suppressMessages(library(RColorBrewer)) +suppressMessages(library(readxl)) +suppressMessages(library(stringr)) +suppressMessages(library(scales)) +suppressMessages(library(grid)) + + + +source("R_src/getSigPerCells.R") +source("R_src/Enrichment.R") +source("R_src/funForSeurat.R") + +# Seurat 3 analysis of an individual sample required for using CaSTLe script. + +## ----------------------------------------------------------------------------- +## Command line args +## ----------------------------------------------------------------------------- + +spec = matrix(c( + 'help', 'h', 0, "logical", "Help about the program", + 'inputRDS', 'i', 1, "character", "REQUIRED : 10X data prepared as monocle or seurat object (.RDS generated by prepare_data.R).", + 'outdir', 'o',1, "character", 'Outdir path (default ./)', + "num_dim", 'n',1,"numeric", "Number of dimension to use for ordering (eg n first pc of PCA on input data) 10 by default", + "correction", "b",1,"character", "Covariable to use as blocking factor (eg one or several columns of pData: betch, cell cycle phases... separated by +)", + "minPropCellExp", "p",1,"numeric", "minimal proportion of cells that expressed the genes kept for the analaysis 0.001 by default", + "norm_method", "z",1, "character", "normalisation method, logNorm seurat (by default) or sctransform", + "resolution", "r", 1,"numeric", "resolution for Seurat clustering 0.9 by default", + "signaturesFile", "s", 1, "character", "path to folder with signature list store as rds object", + "identRemoved", "d", 1, "character", "Optionnal cluster to remove only work if input is a seurat object", + "nonExpressedGenesRemoved", "e", 0,"logical", "non expressed gene already removed default to FALSE", + "gprofiler", "g", 0, "logical", "if true doing gprofiler default to true", + "logfc_threshold", "l", 1, "numeric", "logfc threshold for finding cluster markers (1 cluster vs all deg) 0.25 by default", + "rodriguezSig", "k", 1, "character", "path for xls file of Rodriguez result" + +), byrow=TRUE, ncol=5); + +opt = getopt(spec) + + +# if help was asked, print a friendly message +# and exit with a non-zero error code + +args <- commandArgs() +if ( !is.null(opt$help) | is.null(opt$inputRDS)) { + cat("For an individual sample, perform PCA, tSNE, UMAP, Louvain clustering, diff expression between clusters with Seurat3 package with filtered poor quality cells data (gbm_cds monocle object)") + cat(getopt(spec, usage=TRUE)) + q(status=1) +} + + +#set default arguments values + +if (is.null(opt$num_dim)) { + opt$num_dim <- 10 +} + +if (is.null(opt$resolution)) { + opt$resolution <- 0.8 +} + +if (is.null(opt$minPropCellExp)) { + opt$minPropCellExp <- 0.001 + print(opt$minPropCellExp) +} + + +if (is.null(opt$logfc_threshold)) { + opt$logfc_threshold = 0.25 +} + +print(paste("logfc threshold:", opt$logfc_threshold) ) + +if (is.null(opt$outdir)) { + opt$outdir = "./" +} + + +if (is.null(opt$nonExpressedGenesRemoved)) { + opt$nonExpressedGenesRemoved = F +} else { + opt$nonExpressedGenesRemoved = T +} + +if (is.null(opt$gprofiler)) { + opt$gprofiler <- T +} else { + opt$gprofiler <- F +} + +print(opt$gprofiler) +print(opt$nonExpressedGenesRemoved) + + +# get correction vector +if (!is.null(opt$correction)) { + corrections <- strsplit(x = opt$correction,split = "\\+")[[1]] +} else { + corrections <- NULL +} + +# create blank for the plots +blank <- grid.rect(gp=gpar(col="white")) + +dir.create(opt$outdir,recursive = T,showWarnings = F) + +object <- readRDS(opt$inputRDS) + +# input is a monocle object in the workflow +# Code to use this script with seurat input (eg to re analyse) +if (is(object)[1]== "CellDataSet") { + + print("input monocle") + gbm_cds <- readRDS(opt$inputRDS) + +} else { + if(is(object) == "Seurat") { + #It is easier to pass by a monocle object and then reget the seurat object with the genes filtered + print("input seurat object") + seurat <- object + + if (!is.null(opt$identRemoved)) { + print(unique(seurat@active.ident)) + print(paste("removing identities",opt$identRemoved)) + seurat <- SubsetData(seurat,ident.remove = strsplit(x = opt$identRemoved,split = ",")[[1]],subset.raw = T) # may not work with seurat3 + } + + + pd <- new("AnnotatedDataFrame", data = gbm_cds@meta.data) + fd <- new("AnnotatedDataFrame", data = data.frame(gene_short_name = rownames(gbm_cds))) + rownames(fd) <- fd$gene_short_name + + gbm_cds <- newCellDataSet(GetAssayData(gbm_cds,slot = "counts"), + phenoData = pd, + featureData = fd, + lowerDetectionLimit = 0.1, + expressionFamily = negbinomial.size()) + + gbm_cds <- detectGenes(gbm_cds, min_expr = 0.1) + + + + } +} + +if(opt$nonExpressedGenesRemoved == F) { + print("remove non expressed genes (non expressed in at least X% of the cells X user option in monocle dp feature 5% in seurat tutorial 0,1%)") + fData(gbm_cds)$use_for_seurat <- fData(gbm_cds)$num_cells_expressed > opt$minPropCellExp * ncol(gbm_cds) + + gbm_to_seurat <- gbm_cds[fData(gbm_cds)$use_for_seurat==T,] +} else { + gbm_to_seurat <- gbm_cds +} + +if (is.element("Cluster",colnames(pData(gbm_to_seurat)))) { + colnames(pData(gbm_to_seurat))[which(colnames(pData(gbm_to_seurat))=="Cluster")] <- "Cluster_monocle" +} + +##Convert to seurat3 +#check for dup genes + +# Only needed if ensemble id (it is the case in the workflow) + +dupGeneNames <- fData(gbm_to_seurat)[which((duplicated(featureData(gbm_to_seurat)$gene_short_name))),"gene_short_name"] + +if(length(dupGeneNames) == 0) { + rownames(gbm_to_seurat) <- fData(gbm_to_seurat)$gene_short_name + +} else { + write.csv(fData(gbm_to_seurat)[which(is.element(fData(gbm_to_seurat)$gene_short_name,dupGeneNames)),], + paste(opt$outdir,"/dupGenesName.csv",sep = "") + ) + print("Dup gene short names existing, making them unique...") + rownames(gbm_to_seurat) <- make.unique(fData(gbm_to_seurat)$gene_short_name, sep = "--") + +} + + +seurat <- CreateSeuratObject(counts = exprs(gbm_to_seurat), meta.data = pData(gbm_to_seurat)) + + + +################################################################################################# +####################################### Seurat Workflow ######################################### +################################################################################################# + +if (opt$norm_method == "sctransform") { + + seurat <- SCTransform(object = seurat, vars.to.regress = c("G2M_score","S_score","G1_score")) + +} else { + + seurat <- NormalizeData(object = seurat) + seurat <- FindVariableFeatures(object = seurat,selection.method = "vst", nfeatures = 2000, verbose = T) + seurat <- ScaleData(object = seurat,vars.to.regress = corrections) + +} + +seurat <- RunPCA(object = seurat) + + +png(paste(opt$outdir,"/ElbowPlot.png",sep ="")) +ElbowPlot(object = seurat,ndims = 30) +dev.off() + + +print("Clustering...") + +seurat <- FindNeighbors(object = seurat,dims = c(1:opt$num_dim),k.param = 20) +seurat <- FindClusters(object = seurat,resolution = c(0.5,0.6,0.7,0.8,0.9,1,1.2)) + +print("Running TSNE...") + +seurat <- RunTSNE(seurat,dims = c(1:opt$num_dim)) + +print("Running UMAP...") + +seurat <- RunUMAP(seurat,dims = c(1:opt$num_dim)) + +print("UMAP ok") +print(colnames(seurat@meta.data)) + +colPrefix <- "RNA_snn_res." +if(opt$norm_method == "sctransform") { + colPrefix <- "SCT_snn_res." +} + +umapListRes <- list() +tsneListRes <- list() +for (r in c(0.6,0.8,1,1.2)) { + umapListRes[[as.character(r)]] <- DimPlot(seurat, + reduction = "umap", + label = T, + group.by = paste(colPrefix,r,sep="")) + + NoLegend() + + ggtitle(paste("res",r)) + + tsneListRes[[as.character(r)]] <- DimPlot(seurat, + reduction = "tsne", + label = T, + group.by = paste(colPrefix,r,sep="")) + + NoLegend() + + ggtitle(paste("res",r)) +} + +png(paste(opt$outdir,"/tsne_different_res.png",sep = ""),height = 800,width = 800) +grid.arrange(tsneListRes[[1]],tsneListRes[[2]],tsneListRes[[3]],tsneListRes[[4]]) +dev.off() + +png(paste(opt$outdir,"/umap_different_res.png",sep = ""),height = 800,width = 800) +grid.arrange(umapListRes[[1]],umapListRes[[2]],umapListRes[[3]],umapListRes[[4]]) +dev.off() + + + + +Idents(seurat) <- paste(colPrefix,opt$resolution,sep = "") +seurat@meta.data$numclust <- seurat@meta.data[,paste(colPrefix,opt$resolution,sep = "")] + + +#Check for unwanted source of variation +pPhases <- DimPlot(seurat,group.by = "phases") +if (!is.null(seurat@meta.data$predicted)) { + pPred <- DimPlot(seurat,group.by = "predicted") +} else { + pPred <- blank +} + +pUMI <- FeaturePlot(seurat, "Total_mRNAs") +pMito <- FeaturePlot(seurat, "percentMito") + +png(paste(opt$outdir,"/umap_factors.png",sep = ""),height = 800,width = 800) +grid.arrange(pPhases,pPred,pUMI,pMito) +dev.off() + +#Check for unwanted source of variation +pPhases <- DimPlot(seurat,group.by = "phases",reduction = "tsne") +if (!is.null(seurat@meta.data$predicted)) { + pPred <- DimPlot(seurat,group.by = "predicted",reduction = "tsne") +} else { + pPred <- blank +} +pUMI <- FeaturePlot(seurat, "Total_mRNAs",reduction = "tsne") +pMito <- FeaturePlot(seurat, "percentMito",reduction = "tsne") + +png(paste(opt$outdir,"/tsne_factors.png",sep = ""),height = 800,width = 800) +grid.arrange(pPhases,pPred,pUMI,pMito) +dev.off() + + +#Check for genes + +gene_list<- c("Pdzk1ip1","Mllt3", + "Ctla2a","Cd27","Cd34", + "Lig1","Hells","Tyms", + "Notch2","Lst1", + "Irf7","Stat1", + "Pf4","Itga2b", + "Klf1","Gata1", + "Mpo","Cd48", + "Fcer1a","Hdc", + "Il7r","Thy1", # Ccr9 not found + "Cdc20","Ccnb1","Racgap1", + "Mzb1","Ly6d", "Trp53inp1", + "Jun","Fos","Nr4a1") + +gene_list <- gene_list[which(is.element(set=rownames(seurat),el = gene_list))] + +dir.create(paste(opt$outdir,"/genes/umap/",sep =""),recursive = T) +dir.create(paste(opt$outdir,"/genes/tsne/",sep =""),recursive = T) +for (g in gene_list) { + png(paste(opt$outdir,"/genes/umap/",g,".png",sep = "")) + plot(FeaturePlot(seurat,features = g)) + dev.off() +} + +for (g in gene_list) { + png(paste(opt$outdir,"/genes/tsne/",g,".png",sep = "")) + plot(FeaturePlot(seurat,features = g),reduction = "tsne") + dev.off() +} + +markers <- FindAllMarkers(seurat,only.pos = T,logfc.threshold= opt$logfc_threshold) +markers <- markers[which(markers$p_val_adj < 0.05),] + +write.table(x = markers,paste(opt$outdir,"/markers.tsv", sep =""),sep = "\t",quote = F,row.names = F,col.names = T) + + +dir.create(paste(opt$outdir,"/markers/",sep = "")) + +ylab <- "LogNormalized UMI counts" +if (opt$norm_method == "sctransform") { + ylab <- "Expression level" +} + +for (numClust in unique(markers$cluster)) { + print(head(markers[which(markers$cluster == numClust),],n=9)) + png(paste(opt$outdir,"/markers/Cluster_",numClust,"_topGenesVlnPlot.png",sep =""),width = 1000, height = 1000) + plot(VlnPlot(object = seurat, features = head(markers[which(markers$cluster == numClust),"gene"],n=9),pt.size = 0.5) + + labs(x = "Clusters",y= ylab,colour = "black") + + theme(axis.text = element_text(size=20), + plot.title = element_text(size=25)) ) + dev.off() + +} + +################################################################################################# +########## Enrichment ########## +################################################################################################# + +## Add signatures +dir.create(paste(opt$outdir,"/cellSignatures/",sep = "")) +signatures <- readRDS(opt$signaturesFile) + + +for (sig in c(1:length(signatures))) { + sigName <- names(signatures)[sig] + signature <- signatures[[sig]] + seurat <- scoreCells3(seurat,signature,outdir= paste(opt$outdir,"/cellSignatures/",sep=""),sigName) +} + +## Enrichment + +# Modify colnames of markers to use gProfileAnalysis function +firstup <- function(x) { + substr(x, 1, 1) <- toupper(substr(x, 1, 1)) + x +} + +colnames(markers) <- firstup(colnames(markers)) + + +#be careful background +if(opt$gprofiler) { +gprofiler_result <- gProfileAnalysis(deg_clust = markers, + outdir = paste(opt$outdir,"/gProfileR", sep =""), + background = rownames(seurat), + colors = hue_pal()(length(unique(markers$Cluster)))) + +saveRDS(gprofiler_result,file=paste(opt$outdir,"/gProfileR/gprofiler_results.rds",sep ="")) +} + + + +# Test Rodriguez cluster sig +if(!is.null(opt$rodriguezSig)) { + + clusterNames <- c("C1","C2","C3","Mk","Er","Ba","Neu","Mo1","Mo2", "preDC","preB","preT") + + RodriguezClustersSig <- lapply(X= c(1:length(clusterNames)),FUN = read_xlsx,path = opt$rodriguezSig) + + names(RodriguezClustersSig) <- clusterNames + + getOnlyPos <- function(clustersSig) { + clusterSig <- clustersSig[which(clustersSig$log.effect > 0),] + return(clusterSig) + } + + RodriguezClustersSigPos <- lapply(X= RodriguezClustersSig, getOnlyPos) + + + signaturesRodriguez <- lapply(RodriguezClustersSigPos,"[[",1 ) + + + firstup <- function(x) { + substr(x, 1, 1) <- toupper(substr(x, 1, 1)) + x + } + + colnames(markers) <- firstup(colnames(markers)) + + getClustEnrichForRodriguez <- function(clust,signatures,seurat,markers) { + clustSig <- lapply(signatures,testHyperSig3,seurat,markers,clust) + if (!is.null(seurat@meta.data$predicted)) { + propCellTypesLearned <- table(seurat@meta.data[which(seurat@meta.data$numclust==clust),"predicted"])/length(seurat@meta.data[which(seurat@meta.data$numclust==clust),"predicted"]) + } else{ + propCellTypesLearned <- NULL + } + clustInfo <- c(clustSig,propCellTypesLearned) + return(clustInfo) + } + + clust_list <- lapply(unique(markers$Cluster),getClustEnrichForRodriguez,signature=signaturesRodriguez,seurat =seurat,markers =markers) ##markers arg forgotten in testHyper sig + + names(clust_list) <- paste("cluster_",unique(markers$Cluster),sep="") + + clust_table <- as.data.frame(matrix(unlist(clust_list), nrow=length(unlist(clust_list[1])))) + colnames(clust_table) <- names(clust_list) + rownames(clust_table) <- names(clust_list[[1]]) + + clust_df <- as.data.frame(t(clust_table)) + + write.csv(clust_df,file = paste(opt$outdir,"/clustInfoRodriguez.csv",sep =""),quote = F) + +} + + +#Clusters table summary + +clust_table <- data.frame() + +print("Creating cluster table summary") + + +getClustInfo <- function(clust,signatures,seurat,markers) { + + clustInfo <- list() + clustInfo$num_cells <- dim(seurat@meta.data[which(seurat@active.ident==clust),])[1] + clustInfo$percent_cells <- clustInfo$num_cells/dim(seurat@meta.data)[1] + percentPhases <- table(seurat@meta.data[which(seurat@active.ident==clust),"phases"])/length(seurat@meta.data[which(seurat@active.ident==clust),"phases"]) #In fact this is fraction not percentage + + if(!is.null(seurat@meta.data$predicted)) { + percentPredicted <- table(seurat@meta.data[which(seurat@active.ident==clust),"predicted"])/length(seurat@meta.data[which(seurat@active.ident==clust),"predicted"]) #In fact this is fraction not percentage + + for (p in unique(seurat@meta.data$predicted)) { + print(p) + if (is.element(p,names(percentPhases))) { + clustInfo[[p]] <- percentPhases[p] + } else { + clustInfo[[p]] <- 0 + } + } + + } + + for (p in c("G1_G0","S","G2_M")) { + if (is.element(p,names(percentPhases))) { + clustInfo[[p]] <- percentPhases[p] + } else { + clustInfo[[p]] <- 0 + } + } + + + clustInfo$median_genes_expressed <- median(seurat@meta.data[which(seurat@active.ident==clust),"numGenesPerCells"]) + clustInfo$median_nUMI <- median(seurat@meta.data[which(seurat@active.ident==clust),"Total_mRNAs"]) + clustInfo$median_percentMitochGenes <- median(seurat@meta.data[which(seurat@active.ident==clust),"percentMito"]) + + + clustSig <- lapply(signatures,testHyperSig3,seurat,markers,clust) + + clustInfo <- c(clustInfo,clustSig) + +} + +allSignatures <- c(signatures,signaturesRodriguez) + +clust_list <- lapply(levels(unique(seurat@active.ident)),getClustInfo,allSignatures,seurat,markers) + +names(clust_list) <- paste("cluster_",levels(unique(seurat@active.ident)),sep="") + +saveRDS(clust_list,paste(opt$outdir,"/clust_list_save.rds",sep ="")) + +clust_table <- as.data.frame(matrix(unlist(clust_list), nrow=length(unlist(clust_list[1])))) +colnames(clust_table) <- names(clust_list) +rownames(clust_table) <- names(clust_list[[1]]) + + +clust_df <- as.data.frame(t(clust_table)) + +write.table(x = clust_df,file = paste(opt$outdir,"/clusters_table.tsv",sep =""),sep="\t",quote=F,col.names = NA) + +saveRDS(seurat,file = paste(opt$outdir,"/seurat.rds",sep = "")) + + + diff --git a/R_src/Seurat3_integration.R b/R_src/Seurat3_integration.R new file mode 100644 index 0000000000000000000000000000000000000000..444b07cb9f3b12ee886056fdeaf3291b3ff1c4aa --- /dev/null +++ b/R_src/Seurat3_integration.R @@ -0,0 +1,336 @@ +##------------------------------------------------------------------------------ +## L. Herault +##------------------------------------------------------------------------------ + +## ----------------------------------------------------------------------------- +## Libraries +## ----------------------------------------------------------------------------- + +suppressMessages(library(monocle)) +suppressMessages(library(Seurat)) +suppressMessages(library(BiocParallel)) +suppressMessages(library(getopt)) +suppressMessages(library(gridExtra)) +suppressMessages(library(gProfileR)) +suppressMessages(library(RColorBrewer)) +suppressMessages(library(readxl)) +suppressMessages(library(stringr)) +suppressMessages(library(scales)) +suppressMessages(library(cowplot)) +suppressMessages(library(sctransform)) +suppressMessages(library(stringr)) +suppressMessages(library(plyr)) +suppressMessages(library(grid)) + + +source("R_src/getSigPerCells.R") +source("R_src/Enrichment.R") +source("R_src/funForSeurat.R") + + +# Seurat 3 integration workflow + +## ----------------------------------------------------------------------------- +## Command line args +## ----------------------------------------------------------------------------- + +spec = matrix(c( + 'help', 'h', 0, "logical", "Help about the program", + 'inputFiles', 'i', 1, "character", "REQUIRED: 10X dataset paths prepared as hspc.combined object (.RDS generated by prepare_data.R) separated by +", + 'signaturesFile', 's',1, "character", "REQUIRED: signatures rds file", + 'outdir', 'o',1, "character", 'Outdir path (default ./)', + "num_dim_CCA", 'q',1,"numeric", "First n dimensions of CCA to use for FindIntegrationAnchors Seurat function (15 by default)", + "num_dim_weight", 'w',1,"numeric", "First n dimensions to use for IntegrateData Seurat function (15 by default)", + "num_dim", 'n',1,"numeric", "First n dimensions of PCA to use for clustering, UMAP and TSNE (15 by default)", + "num_dim_integrated",'N',1,"numeric", "Number of PCA dimension computed to analyse integrated data (40 by default)", + "cores", 'c',1, "numeric", "Number of cores to use for ordering (for differencially expressed gene between clusters test)", + "resolution", 'r',1, "numeric", "resolution for hspc.combined clustering", + "correction", "b",1,"character", "Covariable to use as blocking factor (eg one or several columns of pData: betch, cell cycle phases... separated by +)", + "logfc_threshold", "l", 1, "numeric", "logfc threshold for finding cluster markers (1 cluster vs all deg) 0.25 by default", + "rodriguezSig", "k", 1, "character", "path for xls file of Rodriguez result", + "norm_method", "z",1, "character", "normalization method, logNorm (by default) or sctransform", + "reusePca", "p", 0, "character","re use pca calculated before when caculating anchor weights for each dataset default to FALSE (permit to correct for cell cycle before integration)" +), byrow=TRUE, ncol=5); + +opt = getopt(spec) + +# if help was asked, print a friendly message +# and exit with a non-zero error code + +args <- commandArgs() +if ( !is.null(opt$help) | is.null(opt$inputFiles)) { + cat("Perform Seurat 3 integration workflow, then cluster the cell with seurat 3 at different resolutions") + cat(getopt(spec, usage=TRUE)) + q(status=1) +} + +#set default arguments values + +if (is.null(opt$resolution)) { + opt$resolution <- 0.6 +} + +if (is.null(opt$logfc_threshold)) { + opt$logfc_threshold <- 0.25 +} + +if (is.null(opt$num_dim_CCA)) { + opt$num_dim_CCA <- 15 +} + +if (is.null(opt$num_dim_weight)) { + opt$num_dim_weight <- 15 +} + +if (is.null(opt$num_dim)) { + opt$num_dim <- 15 +} + +if (is.null(opt$num_dim_integrated)) { + opt$num_dim_integrated <- 40 +} + +if (is.null(opt$norm_method)) { + opt$norm_method <- "logNorm" +} + +if (is.null(opt$reusePca)) { + opt$reusePca <- FALSE +} + +blank <- grid.rect(gp=gpar(col="white")) + +print(opt$reusePca) + +dir.create(opt$outdir,recursive = T,showWarnings = F) + + +corrections <- strsplit(x = opt$correction,split = "\\+")[[1]] + + +dir.create(opt$outdir,recursive = T,showWarnings = F) + + +hspc.listFile <- strsplit(opt$inputFiles,split = "\\+")[[1]] + +hspc.list <- list() +#load dataset + +for (i in 1:length(x = hspc.listFile)) { + #For testing, in final workflow sampleName will be incorporated in metadata with the loading of cell ranger matrix + sampleName <- strsplit(hspc.listFile[i],split = "/")[[1]][2] + hspc.list[[i]] <- readRDS(hspc.listFile[i]) + hspc.list[[i]]@meta.data$sampleName <- sampleName + hspc.list[[i]] <- RenameCells(hspc.list[[i]],add.cell.id = sampleName) + + if(opt$norm_method != "sctransform") { + hspc.list[[i]] <- NormalizeData(object = hspc.list[[i]], verbose = FALSE) + hspc.list[[i]] <- FindVariableFeatures(object = hspc.list[[i]], selection.method = "vst", + nfeatures = 2000, verbose = FALSE) + } else { + hspc.list[[i]] <- SCTransform(hspc.list[[i]],vars.to.regress = corrections,verbose = T) + } +} + + +hspc.anchors <- FindIntegrationAnchors(object.list = hspc.list, dims = 1:opt$num_dim_CCA) + +if(opt$reusePca) { + print("Re use pca") + hspc.combined <- IntegrateData(anchorset = hspc.anchors, weight.reduction = "pca", dims = 1:opt$num_dim_weight) +} + +hspc.combined <- IntegrateData(anchorset = hspc.anchors, dims = 1:opt$num_dim_weight) + +DefaultAssay(object = hspc.combined) <- "integrated" + +# Run the standard workflow for visualization and clustering +#Here certainly need to reuse SCTransform on combined data if norm_method = sct + +hspc.combined <- ScaleData(object = hspc.combined, verbose = T,vars.to.regress = corrections) +hspc.combined <- RunPCA(object = hspc.combined, npcs = opt$num_dim_integrated, verbose = FALSE) + + +png(paste(opt$outdir,"/ElbowPlot.png",sep ="")) +ElbowPlot(object = hspc.combined,ndims = opt$num_dim_integrated) +dev.off() + + +# t-SNE UMAP and Clustering +hspc.combined <- RunUMAP(object = hspc.combined, reduction = "pca", dims = 1:opt$num_dim) +hspc.combined <- RunTSNE(object = hspc.combined, reduction = "pca", dims = 1:opt$num_dim) +hspc.combined <- FindNeighbors(object = hspc.combined, reduction = "pca", dims = 1:opt$num_dim) +hspc.combined <- FindClusters(hspc.combined, resolution = c(0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1,1.2)) + +colPrefix <- "integrated_snn_res." + +Idents(hspc.combined) <- paste(colPrefix,opt$resolution,sep = "") + +hspc.combined@meta.data$AGE <- "Old" +hspc.combined@meta.data$AGE[which(hspc.combined@meta.data$age == "2_months")] <- "Young" + + + +# Visualization of the samples +p1 <- DimPlot(object = hspc.combined, reduction = "umap", group.by = "sampleName") +p2 <- DimPlot(object = hspc.combined, reduction = "umap", label = TRUE) +p3 <- DimPlot(object = hspc.combined, reduction = "umap", group.by = "AGE") +p4 <- DimPlot(object = hspc.combined, reduction = "umap", group.by = "runDate") +p3tsne <- DimPlot(object = hspc.combined, reduction = "tsne", group.by = "AGE") +p4tsne <- DimPlot(object = hspc.combined, reduction = "tsne", group.by = "runDate") + + + +png(paste(opt$outdir,"/umap_samples.png",sep =""),height = 800,width=1200) +grid.arrange(p1, p2,ncol = 2) +dev.off() + +png(paste(opt$outdir,"/AGE.png",sep =""),height = 800,width=1200) +grid.arrange(p3, p3tsne,ncol = 2) +dev.off() + +png(paste(opt$outdir,"/runDate.png",sep =""),height = 800,width=1200) +grid.arrange(p4, p4tsne,ncol = 2) +dev.off() + +png(paste(opt$outdir,"/UmapClusters.png",sep =""),height = 800,width=1200) +DimPlot(object = hspc.combined, reduction = "umap", label = TRUE) +dev.off() + + +p1tsne <- DimPlot(object = hspc.combined, reduction = "tsne", group.by = "sampleName") +p2tsne <- DimPlot(object = hspc.combined, reduction = "tsne", label = TRUE) + +png(paste(opt$outdir,"/tsne_samples.png",sep =""),height = 800,width=1000) +grid.arrange(p1tsne, p2tsne,ncol = 2) +dev.off() + +#Visualistaoin one sample at a time +sampleNames <- unique(hspc.combined@meta.data$sampleName) +dir.create(paste(opt$outdir,"/samples/umap/",sep = ""),recursive = T) +dir.create(paste(opt$outdir,"/samples/tsne/",sep = ""),recursive = T) + +#samplePlotList <- list() + for (s in sampleNames) { + png(paste(opt$outdir,"/samples/umap/",s,".png",sep = "")) + plot(DimPlot(object = hspc.combined, cells= grep(s, hspc.combined@meta.data$sampleName), reduction = "umap") + ggtitle(s)) + dev.off() + png(paste(opt$outdir,"/samples/tsne/",s,".png",sep = "")) + plot(DimPlot(object = hspc.combined, cells= grep(s, hspc.combined@meta.data$sampleName), reduction = "tsne") + ggtitle(s)) + dev.off() + } + + +umapListRes <- list() +tsneListRes <- list() +for (r in c(0.6,0.8,1,1.2)) { + umapListRes[[as.character(r)]] <- DimPlot(hspc.combined, + reduction = "umap", + label = T, + group.by = paste(colPrefix,r,sep="")) + + NoLegend() + + ggtitle(paste("res",r)) + + tsneListRes[[as.character(r)]] <- DimPlot(hspc.combined, + reduction = "tsne", + label = T, + group.by = paste(colPrefix,r,sep="")) + + NoLegend() + + ggtitle(paste("res",r)) +} + +png(paste(opt$outdir,"/tsne_different_res.png",sep = ""),height = 800,width = 800) +grid.arrange(tsneListRes[[1]],tsneListRes[[2]],tsneListRes[[3]],tsneListRes[[4]]) +dev.off() + +png(paste(opt$outdir,"/umap_different_res.png",sep = ""),height = 800,width = 800) +grid.arrange(umapListRes[[1]],umapListRes[[2]],umapListRes[[3]],umapListRes[[4]]) +dev.off() + + + +hspc.combined@meta.data$numclust <- hspc.combined@meta.data[,paste(colPrefix,opt$resolution,sep = "")] + + +#Check for unwanted source of variation UMAP +pPhases <- DimPlot(hspc.combined,group.by = "phases") +if (!is.null(hspc.combined@meta.data$predicted)) { + pPred <- DimPlot(hspc.combined,group.by = "predicted") +} else { + pPred <- blank +} +pUMI <- FeaturePlot(hspc.combined, "Total_mRNAs") +pMito <- FeaturePlot(hspc.combined, "percentMito") + +png(paste(opt$outdir,"/umap_factors.png",sep = ""),height = 800,width = 800) +grid.arrange(pPhases,pPred,pUMI,pMito) +dev.off() + +#Check for unwanted source of variation tSNE +pPhases <- DimPlot(hspc.combined,group.by = "phases",reduction = "tsne") +if (!is.null(hspc.combined@meta.data$predicted)) { + pPred <- DimPlot(hspc.combined,group.by = "predicted",reduction = "tsne") +} else { + pPred <- blank +} +pUMI <- FeaturePlot(hspc.combined, "Total_mRNAs",reduction = "tsne") +pMito <- FeaturePlot(hspc.combined, "percentMito",reduction = "tsne") + +png(paste(opt$outdir,"/tsne_factors.png",sep = ""),height = 800,width = 800) +grid.arrange(pPhases,pPred,pUMI,pMito) +dev.off() + + +#Check for genes + +gene_list<- c("Pdzk1ip1","Mllt3", + "Ctla2a","Cd27","Cd34", + "Lig1","Hells","Tyms", + "Notch2","Lst1", + "Irf7","Stat1", + "Pf4","Itga2b", + "Klf1","Gata1", + "Mpo","Cd48", + "Fcer1a","Hdc", + "Il7r","Thy1", # Ccr9 not found + "Cdc20","Ccnb1","Racgap1", + "Mzb1","Ly6d", "Trp53inp1", + "Jun","Fos","Nr4a1","Jund") + +gene_list <- gene_list[which(is.element(set=rownames(hspc.combined),el = gene_list))] + +dir.create(paste(opt$outdir,"/genes/umap/",sep =""),recursive = T) +dir.create(paste(opt$outdir,"/genes/tsne/",sep =""),recursive = T) +for (g in gene_list) { + png(paste(opt$outdir,"/genes/umap/",g,".png",sep = "")) + plot(FeaturePlot(hspc.combined,features = g)) + dev.off() +} + +for (g in gene_list) { + png(paste(opt$outdir,"/genes/tsne/",g,".png",sep = "")) + plot(FeaturePlot(hspc.combined,features = g,reduction = "tsne")) + dev.off() +} + +#AGE prop for each cluster +age_clust <- getAGEPropPerClustBarplot(hspc.combined) +#sample prop for each cluster +sample_clust <- getSamplePropPerClustBarplot(hspc.combined) +#runDate prop for each cluster +runDate_clust <- getRunDatePropPerClustBarplot(hspc.combined) + +png(paste(opt$outdir,"/propPerClust.png",sep = "")) +grid.arrange(age_clust,sample_clust,runDate_clust,p2tsne) +dev.off() + + +## saving results + +saveRDS(hspc.combined,paste(opt$outdir,"/combined.rds",sep ="")) + + + + + + diff --git a/R_src/Seurat3_integration_biology.R b/R_src/Seurat3_integration_biology.R new file mode 100644 index 0000000000000000000000000000000000000000..327783be28510d2a40e6022a77d3c552a9c16658 --- /dev/null +++ b/R_src/Seurat3_integration_biology.R @@ -0,0 +1,219 @@ +##------------------------------------------------------------------------------ +## L. Herault +##------------------------------------------------------------------------------ + +## ----------------------------------------------------------------------------- +## Libraries +## ----------------------------------------------------------------------------- + +suppressMessages(library(monocle)) +suppressMessages(library(Seurat)) +suppressMessages(library(BiocParallel)) +suppressMessages(library(getopt)) +suppressMessages(library(gridExtra)) +suppressMessages(library(gProfileR)) +suppressMessages(library(RColorBrewer)) +suppressMessages(library(readxl)) +suppressMessages(library(stringr)) +suppressMessages(library(plyr)) +suppressMessages(library(biomaRt)) + + + +suppressMessages(library(scales)) + + +source("R_src/getSigPerCells.R") +source("R_src/Enrichment.R") +source("R_src/funForSeurat.R") + +# Analysis of seurat 3 integration and clusternig workflow + +## ----------------------------------------------------------------------------- +## Command line args +## ----------------------------------------------------------------------------- + +spec = matrix(c( + 'help', 'h', 0, "logical", "Help about the program", + 'inputRDS', 'i', 1, "character", "REQUIRED : 10X data prepared qs seurat object with CCA and first clustering (.RDS generated by prepare_data.R).", + 'outdir', 'o',1, "character", 'Outdir path (default ./)', + "mart", 'm',1, "character", "mart data rds format if biomaRt server is down and you have a save ", + "resolution", 'r',1, "numeric", "resolution of the clustering analysed default to 0.8", + "num_dim", 'n', 1, "numeric", "first n dimension of the CCA to use for the reclustering", + "minPropCellExp", "p",1,"character", "proportion minimal of cells that expressed the genes kept for the analaysis 0.001 by default", + "signaturesFile", "s", 1, "character", "path to folder with signature list store as rds object", + "logfc_threshold", "l", 1, "numeric", "logfc threshold for finding cluster markers (1 cluster vs all deg) 0.25 by default", + "rodriguezSig", "k", 1, "character", "path for xls file of Rodriguez result", + "gprofiler", "g", 0, "logical", "Put to avoid gprofiler analysis", + "norm_method", "z",1, "character", "normalisation method, logNorm seurat (by default) or sctransform" +), byrow=TRUE, ncol=5); + + + +opt = getopt(spec) + + +# if help was asked, print a friendly message +# and exit with a non-zero error code + +args <- commandArgs() +if ( !is.null(opt$help) | is.null(opt$inputRDS)) { + cat("Perform diff expression between clusters previously obtained with Seurat3, make a summary table of cluster metrics and signature enrichments") + cat(getopt(spec, usage=TRUE)) + q(status=1) +} + + +if (is.null(opt$logfc_threshold)) { + opt$logfc_threshold = 0.25 +} +if (is.null(opt$norm_method)) { + opt$norm_method = "logNorm" +} + +print(paste("logfc threshold:", opt$logfc_threshold) ) + +if (is.null(opt$outdir)) { + opt$outdir = "./" +} + + +if(is.null(opt$doMarkers)) { + opt$doMarkers = T +} + +if (is.null(opt$gprofiler)) { + opt$gprofiler <- T +} else { + opt$gprofiler <- F +} + + +if (is.null(opt$mart)){ + mart <- useDataset("mmusculus_gene_ensembl", useMart("ensembl")) +} else { + mart <- readRDS(opt$mart) +} + + +dir.create(opt$outdir,recursive = T,showWarnings = F) + +hspc.combined <- readRDS(opt$inputRDS) +signatures <- readRDS(opt$signaturesFile) + +# Set active idents/slot +Idents(hspc.combined) <- paste("integrated_snn_res.",opt$resolution,sep ="") + +hspc.combined@meta.data$numclust <- Idents(hspc.combined) + +DefaultAssay(object = hspc.combined) <- "RNA" +if (opt$norm_method == "sctransform") { + DefaultAssay(object = hspc.combined) <- "SCT" +} + +## Plotting + +png(paste(opt$outdir,"/seurat_res",opt$resolution,".png",sep = ""),height = 800,width = 800) +DimPlot(hspc.combined,label = TRUE,pt.size = 1.5) + ggtitle(paste("seurat_res",opt$resolution,sep = "")) + NoLegend() +dev.off() + +#age <- getAGEPropPerClustBarplot(hspc.combined) + +print(paste("integrated_snn_res.",opt$resolution,sep ="")) +print(colnames(hspc.combined@meta.data)) + +#ageEnrich <- getEnrichAge(hspc.combined,clustCol =paste("integrated_snn_res.",opt$resolution,sep =""),metaCol = "AGE") + +# ageEnrich <- as.data.frame(t(ageEnrich)) +# ageEnrich$color <- "black" +# ageEnrich[which(as.numeric(as.vector(ageEnrich$phyper)) < 0.05),"color"] <- "red" +# age <- age + theme(axis.text.y = element_text(colour = ageEnrich[,'color'])) + + +# png(paste(opt$outdir,"/bp_AGE.png",sep = ""),height = 800,width = 800) +# plot(age) +# dev.off() + +png(paste(opt$outdir,"/bp_sample.png",sep = ""),height = 800,width = 800) +getSamplePropPerClustBarplot(hspc.combined) +dev.off() + +png(paste(opt$outdir,"/bp_runDate.png",sep = ""),height = 800,width = 800) +getRunDatePropPerClustBarplot(hspc.combined) +dev.off() + +png(paste(opt$outdir,"/bp_phases.png",sep = ""),height = 800,width = 800) +getPhasePropPerClustBarplot(hspc.combined) +dev.off() + +if (!is.null(hspc.combined@meta.data$predicted)) { +png(paste(opt$outdir,"/bp_predicted.png",sep = ""),height = 800,width = 800) +getPredictedPropPerClustBarplot(hspc.combined) +dev.off() +} + + +## Perform differential expression analysis between clusters + + +dirMarkers <- paste(opt$outdir,"/markers_res",opt$resolution,sep ="") +dir.create(dirMarkers) + +markers <- FindAllMarkers(hspc.combined,only.pos = T,logfc.threshold= opt$logfc_threshold) +markers <- markers[which(markers$p_val_adj < 0.05),] + +ylab <- "LogNormalized UMI counts" +if (opt$norm_method == "sctransform") { + ylab <- "Expression level" +} + +for (numClust in unique(markers$cluster)) { + print(head(markers[which(markers$cluster == numClust),],n=9)) + png(paste(dirMarkers,"/Cluster_",numClust,"_topGenesVlnPlot.png",sep =""),width = 1000, height = 1000) + print(plot(VlnPlot(object = hspc.combined, features = head(markers[which(markers$cluster == numClust),"gene"],n=9),pt.size = 0.5) + + labs(x = "Clusters",y= ylab,colour = "black") + + theme(axis.text = element_text(size=20), + plot.title = element_text(size=25)) )) + dev.off() +} + +write.table(x = markers,paste(dirMarkers,"/markers.tsv", sep =""),sep = "\t",quote = F,row.names = F,col.names = T) + +print("getting TF markers") +TF_markers <- getBM(attributes=c("external_gene_name"), + filters=c("external_gene_name","go"), + values=list(markers$gene, + "GO:0003700"),mart=mart) + +#Add tf from bonzanni plus Zbtb16 and Gfi1b +TF_bonzani <- data.frame(external_gene_name = c("Spi1","Tal1","Zfpm1","Cbfa2t3","Erg","Fli1","Gata1","Gata2","Hhex","Runx1","Smad6","Zbtb16","Gfi1b")) +TF_markers <- rbind(TF_markers,TF_bonzani) +TF_markers <- unique(TF_markers) + +write.table(TF_markers,paste(dirMarkers,"/markers_and_bonzanni_TF.tsv",sep =""), + sep = "\t",quote = F,row.names = F, col.names = F) + +# Modify colnames of markers to use gProfileAnalysis function +firstup <- function(x) { + substr(x, 1, 1) <- toupper(substr(x, 1, 1)) + x +} + +colnames(markers) <- firstup(colnames(markers)) + + +if(opt$gprofiler) { + gprofiler_result <- gProfileAnalysis(deg_clust = markers, + outdir = paste(dirMarkers,"/gProfileR", sep =""), + background = rownames(hspc.combined), + colors = hue_pal()(length(unique(markers$Cluster)))) + + saveRDS(gprofiler_result,file=paste(dirMarkers,"/gProfileR/gprofiler_results.rds",sep ="")) +} + +getClustTable(opt$rodriguezSig,markers=markers,signatures = signatures,seurat = hspc.combined,outdir = dirMarkers) + + + + + diff --git a/R_src/Seurat4CL.R b/R_src/Seurat4CL.R new file mode 100644 index 0000000000000000000000000000000000000000..72860cbada218263263d0bb81f1447080e791191 --- /dev/null +++ b/R_src/Seurat4CL.R @@ -0,0 +1,490 @@ +##------------------------------------------------------------------------------ +## L. Herault +##------------------------------------------------------------------------------ + +## ----------------------------------------------------------------------------- +## Libraries +## ----------------------------------------------------------------------------- + +suppressMessages(library(Seurat)) +suppressMessages(library(BiocParallel)) +suppressMessages(library(getopt)) +suppressMessages(library(gridExtra)) +#suppressMessages(library(gProfileR)) +suppressMessages(library(RColorBrewer)) +suppressMessages(library(readxl)) +suppressMessages(library(stringr)) +suppressMessages(library(scales)) +suppressMessages(library(grid)) +suppressMessages(library(ggplot2)) + + + + +source("R_src/getSigPerCells.R") +source("R_src/Enrichment.R") +source("R_src/funForSeurat.R") + +# Seurat 3 analysis of an individual sample required for using CaSTLe script. + +## ----------------------------------------------------------------------------- +## Command line args +## ----------------------------------------------------------------------------- + +spec = matrix(c( + 'help', 'h', 0, "logical", "Help about the program", + 'inputRDS', 'i', 1, "character", "REQUIRED : 10X data prepared as monocle or seurat object (.RDS generated by prepare_data.R).", + 'outdir', 'o',1, "character", 'Outdir path (default ./)', + "num_dim", 'n',1,"numeric", "Number of dimension to use for ordering (eg n first pc of PCA on input data) 10 by default", + "correction", "b",1,"character", "Covariable to use as blocking factor (eg one or several columns of pData: betch, cell cycle phases... separated by +)", + "minPropCellExp", "p",1,"numeric", "minimal proportion of cells that expressed the genes kept for the analaysis 0.001 by default", + "norm_method", "z",1, "character", "normalisation method, logNorm seurat (by default) or sctransform", + "resolution", "r", 1,"numeric", "resolution for Seurat clustering 0.9 by default", + "signaturesFile", "s", 1, "character", "path to folder with signature list store as rds object", + "identRemoved", "d", 1, "character", "Optionnal cluster to remove only work if input is a seurat object", + "nonExpressedGenesRemoved", "e", 0,"logical", "non expressed gene already removed default to FALSE", + "gprofiler", "g", 0, "logical", "if true doing gprofiler default to true", + "logfc_threshold", "l", 1, "numeric", "logfc threshold for finding cluster markers (1 cluster vs all deg) 0.25 by default" + +), byrow=TRUE, ncol=5); + +opt = getopt(spec) + +## For testing +# +# opt <- list() +# opt$outdir <- "output/testSeurat4/" +# opt$correction <- "G2M_score+S_score+G1_score" +# opt$inputRDS <- "seurat_treated.rds" +# opt$signatureFile <- "output/signatures/publicSignatures.rds" + + + +# if help was asked, print a friendly message +# and exit with a non-zero error code + +args <- commandArgs() +if ( !is.null(opt$help) | is.null(opt$inputRDS)) { + cat("For an individual sample, perform PCA, tSNE, UMAP, Louvain clustering, diff expression between clusters with Seurat3 package with filtered poor quality cells data (gbm_cds monocle object)") + cat(getopt(spec, usage=TRUE)) + q(status=1) +} + + +#set default arguments values + +if (is.null(opt$num_dim)) { + opt$num_dim <- 10 +} + +if (is.null(opt$resolution)) { + opt$resolution <- 0.8 +} + +if (is.null(opt$minPropCellExp)) { + opt$minPropCellExp <- 0.001 + print(opt$minPropCellExp) +} + + +if (is.null(opt$logfc_threshold)) { + opt$logfc_threshold = 0.25 +} + +print(paste("logfc threshold:", opt$logfc_threshold) ) + +if (is.null(opt$outdir)) { + opt$outdir = "./" +} + +if (is.null(opt$norm_method)) { + opt$norm_method = "logNorm" +} + + + +if (is.null(opt$nonExpressedGenesRemoved)) { + opt$nonExpressedGenesRemoved = F +} else { + opt$nonExpressedGenesRemoved = T +} + +# if (is.null(opt$gprofiler)) { +# opt$gprofiler <- T +# } else { +# opt$gprofiler <- F +# } + + +# get correction vector +if (!is.null(opt$correction)) { + corrections <- strsplit(x = opt$correction,split = "\\+")[[1]] +} else { + corrections <- NULL +} + + + +dir.create(opt$outdir,recursive = T,showWarnings = F) + +print(opt) + +# Loading +seurat <- readRDS(opt$inputRDS) + +# Remove non expressed genes if not done +if(opt$nonExpressedGenesRemoved == F) { + genesFiltered <- list(genes_before = dim(seurat)[1]) + + print("remove non expressed genes (non expressed in at least X% of the cells X user option in monocle dp feature 5% in seurat tutorial 0,1%)") + threshold <- opt$minPropCellExp * ncol(seurat) + + seurat <- CreateSeuratObject(counts = as.matrix(GetAssayData(seurat,slot = 'counts',assay = "RNA")), + assay = "RNA", + meta.data = seurat@meta.data, + min.cells = threshold) + genesFiltered$genes_after <- dim(seurat)[1] + + png(paste(opt$outdir,"/genesFiltering.png",sep = "")) + barplot(unlist(genesFiltered),main = "Genes filtering") + dev.off() + +} + + + +################################################################################################# +####################################### Seurat Workflow ######################################### +################################################################################################# + + +classicSeuratWorkflow <- function(seurat, corrections,outdir) { + dir.create(outdir,recursive = T,showWarnings = F) + + if (opt$norm_method == "sctransform") { + + seurat <- SCTransform(object = seurat, vars.to.regress = corrections) + + } else { + + seurat <- NormalizeData(object = seurat) + seurat <- FindVariableFeatures(object = seurat,selection.method = "vst", nfeatures = 2000, verbose = T) + seurat <- ScaleData(object = seurat,vars.to.regress = corrections) + + } + + seurat <- RunPCA(object = seurat) + + + png(paste(outdir,"/ElbowPlot.png",sep ="")) + ElbowPlot(object = seurat,ndims = 30) + dev.off() + + + print("Clustering...") + + seurat <- FindNeighbors(object = seurat,dims = c(1:opt$num_dim),k.param = 20) + seurat <- FindClusters(object = seurat,resolution = c(0.5,0.6,0.7,0.8,0.9,1,1.2)) + + print("Running TSNE...") + + seurat <- RunTSNE(seurat,dims = c(1:opt$num_dim)) + + print("Running UMAP...") + + seurat <- RunUMAP(seurat,dims = c(1:opt$num_dim)) + + print("UMAP ok") + print(colnames(seurat@meta.data)) + + colPrefix <- "RNA_snn_res." + if(opt$norm_method == "sctransform") { + colPrefix <- "SCT_snn_res." + } + + umapListRes <- list() + tsneListRes <- list() + for (r in c(0.6,0.8,1,1.2)) { + umapListRes[[as.character(r)]] <- DimPlot(seurat, + reduction = "umap", + label = T, + group.by = paste(colPrefix,r,sep="")) + + NoLegend() + + ggplot2::ggtitle(paste("res",r)) + + tsneListRes[[as.character(r)]] <- DimPlot(seurat, + reduction = "tsne", + label = T, + group.by = paste(colPrefix,r,sep="")) + + NoLegend() + + ggplot2::ggtitle(paste("res",r)) + } + + png(paste(outdir,"/tsne_different_res.png",sep = ""),height = 800,width = 800) + grid.arrange(tsneListRes[[1]],tsneListRes[[2]],tsneListRes[[3]],tsneListRes[[4]]) + dev.off() + + png(paste(outdir,"/umap_different_res.png",sep = ""),height = 800,width = 800) + grid.arrange(umapListRes[[1]],umapListRes[[2]],umapListRes[[3]],umapListRes[[4]]) + dev.off() + + + + + Idents(seurat) <- paste(colPrefix,opt$resolution,sep = "") + seurat@meta.data$numclust <- seurat@meta.data[,paste(colPrefix,opt$resolution,sep = "")] + + + #Check for unwanted source of variation + pPhases <- DimPlot(seurat,group.by = "phases") + if (!is.null(seurat@meta.data$predicted)) { + pPred <- DimPlot(seurat,group.by = "predicted") + } else { + pPred <- DimPlot(seurat) + } + + pUMI <- FeaturePlot(seurat, "nCount_RNA") + pMito <- FeaturePlot(seurat, "percentMito") + + png(paste(outdir,"/umap_factors.png",sep = ""),height = 800,width = 800) + grid.arrange(pPhases,pPred,pUMI,pMito) + dev.off() + + #Check for unwanted source of variation + pPhases <- DimPlot(seurat,group.by = "phases",reduction = "tsne") + if (!is.null(seurat@meta.data$predicted)) { + pPred <- DimPlot(seurat,group.by = "predicted",reduction = "tsne") + } else { + pPred <- DimPlot(seurat,reduction = "tsne") + } + pUMI <- FeaturePlot(seurat, "nCount_RNA",reduction = "tsne") + pMito <- FeaturePlot(seurat, "percentMito",reduction = "tsne") + + png(paste(outdir,"/tsne_factors.png",sep = ""),height = 800,width = 800) + grid.arrange(pPhases,pPred,pUMI,pMito) + dev.off() + + return(seurat) + +} + + +## First without integration without correction +unintegrated <- seurat +DefaultAssay(unintegrated) <- "RNA" +classicSeuratWorkflow(unintegrated,correction = NULL,outdir = paste0(opt$outdir,"/SeuratWoIntegrationWoCr/")) + +## without integration without correction + +classicSeuratWorkflow(unintegrated,correction = corrections,outdir = paste0(opt$outdir,"/SeuratWoIntegrationWithCr/")) + + +## With integration without correction +if (!is.null(corrections)) { + print("Seurat workflow without correction") + classicSeuratWorkflow(seurat,correction = NULL,outdir = paste0(opt$outdir,"/SeuratWithoutCr/")) +} + +## With integration with correction + +seurat <- classicSeuratWorkflow(seurat,corrections = corrections,outdir =opt$outdir) + +################################################################################################# +####################################### Markers analysis ######################################## +################################################################################################# + +markers <- FindAllMarkers(seurat,only.pos = T,logfc.threshold= opt$logfc_threshold) +markers <- markers[which(markers$p_val_adj < 0.05),] + +write.table(x = markers,paste(opt$outdir,"/markers.tsv", sep =""),sep = "\t",quote = F,row.names = F,col.names = T) + + +dir.create(paste(opt$outdir,"/markers/",sep = "")) + +ylab <- "LogNormalized UMI counts" +if (opt$norm_method == "sctransform") { + ylab <- "Expression level" +} + +for (numClust in unique(markers$cluster)) { + print(head(markers[which(markers$cluster == numClust),],n=9)) + png(paste(opt$outdir,"/markers/Cluster_",numClust,"_topGenesVlnPlot.png",sep =""),width = 1000, height = 1000) + plot(VlnPlot(object = seurat, features = head(markers[which(markers$cluster == numClust),"gene"],n=9),pt.size = 0.5) + + labs(x = "Clusters",y= ylab,colour = "black") + + theme(axis.text = element_text(size=20), + plot.title = element_text(size=25)) ) + dev.off() + +} + + + +## Add signatures scores +dir.create(paste(opt$outdir,"/cellSignatures/",sep = "")) +signatures <- readRDS(opt$signaturesFile) +names(signatures) <- paste0(names(signatures),"_",seurat$sampleName[1]) + +for (sig in c(1:length(signatures))) { + sigName <- names(signatures)[sig] + signature <- signatures[[sig]] + seurat <- scoreCells3(seurat,signature,outdir= paste(opt$outdir,"/cellSignatures/",sep=""),sigName) +} + +################################################################################################# +########## Enrichment ########## +################################################################################################# + + +# ## Enrichment +# +# # Modify colnames of markers to use gProfileAnalysis function +# firstup <- function(x) { +# substr(x, 1, 1) <- toupper(substr(x, 1, 1)) +# x +# } +# +# colnames(markers) <- firstup(colnames(markers)) +# +# +# #be careful background +# if(opt$gprofiler) { +# gprofiler_result <- gProfileAnalysis(deg_clust = markers, +# outdir = paste(opt$outdir,"/gProfileR", sep =""), +# background = rownames(seurat), +# colors = hue_pal()(length(unique(markers$Cluster)))) +# +# saveRDS(gprofiler_result,file=paste(opt$outdir,"/gProfileR/gprofiler_results.rds",sep ="")) +# } +# +# +# +# # Test Rodriguez cluster sig +# if(!is.null(opt$rodriguezSig)) { +# +# clusterNames <- c("C1","C2","C3","Mk","Er","Ba","Neu","Mo1","Mo2", "preDC","preB","preT") +# +# RodriguezClustersSig <- lapply(X= c(1:length(clusterNames)),FUN = read_xlsx,path = opt$rodriguezSig) +# +# names(RodriguezClustersSig) <- clusterNames +# +# getOnlyPos <- function(clustersSig) { +# clusterSig <- clustersSig[which(clustersSig$log.effect > 0),] +# return(clusterSig) +# } +# +# RodriguezClustersSigPos <- lapply(X= RodriguezClustersSig, getOnlyPos) +# +# +# signaturesRodriguez <- lapply(RodriguezClustersSigPos,"[[",1 ) +# +# +# firstup <- function(x) { +# substr(x, 1, 1) <- toupper(substr(x, 1, 1)) +# x +# } +# +# colnames(markers) <- firstup(colnames(markers)) +# +# getClustEnrichForRodriguez <- function(clust,signatures,seurat,markers) { +# clustSig <- lapply(signatures,testHyperSig3,seurat,markers,clust) +# if (!is.null(seurat@meta.data$predicted)) { +# propCellTypesLearned <- table(seurat@meta.data[which(seurat@meta.data$numclust==clust),"predicted"])/length(seurat@meta.data[which(seurat@meta.data$numclust==clust),"predicted"]) +# } else{ +# propCellTypesLearned <- NULL +# } +# clustInfo <- c(clustSig,propCellTypesLearned) +# return(clustInfo) +# } +# +# clust_list <- lapply(unique(markers$Cluster),getClustEnrichForRodriguez,signature=signaturesRodriguez,seurat =seurat,markers =markers) ##markers arg forgotten in testHyper sig +# +# names(clust_list) <- paste("cluster_",unique(markers$Cluster),sep="") +# +# clust_table <- as.data.frame(matrix(unlist(clust_list), nrow=length(unlist(clust_list[1])))) +# colnames(clust_table) <- names(clust_list) +# rownames(clust_table) <- names(clust_list[[1]]) +# +# clust_df <- as.data.frame(t(clust_table)) +# +# write.csv(clust_df,file = paste(opt$outdir,"/clustInfoRodriguez.csv",sep =""),quote = F) +# +# } + + +################################################################################################# +########## Cluster summary table ######### +################################################################################################# + +clust_table <- data.frame() + +print("Creating cluster summary table ") + + firstup <- function(x) { + substr(x, 1, 1) <- toupper(substr(x, 1, 1)) + x + } + + colnames(markers) <- firstup(colnames(markers)) + +getClustInfo <- function(clust,signatures,seurat,markers) { + + clustInfo <- list() + clustInfo$num_cells <- dim(seurat@meta.data[which(seurat@active.ident==clust),])[1] + clustInfo$percent_cells <- clustInfo$num_cells/dim(seurat@meta.data)[1] + percentPhases <- table(seurat@meta.data[which(seurat@active.ident==clust),"phases"])/length(seurat@meta.data[which(seurat@active.ident==clust),"phases"]) #In fact this is fraction not percentage + + if(!is.null(seurat@meta.data$predicted)) { + percentPredicted <- table(seurat@meta.data[which(seurat@active.ident==clust),"predicted"])/length(seurat@meta.data[which(seurat@active.ident==clust),"predicted"]) #In fact this is fraction not percentage + + for (p in unique(seurat@meta.data$predicted)) { + print(p) + if (is.element(p,names(percentPhases))) { + clustInfo[[p]] <- percentPhases[p] + } else { + clustInfo[[p]] <- 0 + } + } + + } + + for (p in c("G1_G0","S","G2_M")) { + if (is.element(p,names(percentPhases))) { + clustInfo[[p]] <- percentPhases[p] + } else { + clustInfo[[p]] <- 0 + } + } + + + clustInfo$median_genes_expressed <- median(seurat@meta.data[which(seurat@active.ident==clust),"nFeature_RNA"]) + clustInfo$median_nUMI <- median(seurat@meta.data[which(seurat@active.ident==clust),"nCount_RNA"]) + clustInfo$median_percentMitochGenes <- median(seurat@meta.data[which(seurat@active.ident==clust),"percentMito"]) + + + clustSig <- lapply(signatures,testHyperSig3,seurat,markers,clust) + + clustInfo <- c(clustInfo,clustSig) + +} + +#allSignatures <- c(signatures,signaturesRodriguez) + +clust_list <- lapply(levels(unique(seurat@active.ident)),getClustInfo,signatures,seurat,markers) + +names(clust_list) <- paste("cluster_",levels(unique(seurat@active.ident)),sep="") + +saveRDS(clust_list,paste(opt$outdir,"/clust_list_save.rds",sep ="")) + +clust_table <- as.data.frame(matrix(unlist(clust_list), nrow=length(unlist(clust_list[1])))) +colnames(clust_table) <- names(clust_list) +rownames(clust_table) <- names(clust_list[[1]]) + + +clust_df <- as.data.frame(t(clust_table)) + +write.table(x = clust_df,file = paste(opt$outdir,"/clusters_table.tsv",sep =""),sep="\t",quote=F,col.names = NA) + +saveRDS(seurat,file = paste(opt$outdir,"/seurat.rds",sep = "")) + + + diff --git a/R_src/Seurat4_integration.R b/R_src/Seurat4_integration.R new file mode 100644 index 0000000000000000000000000000000000000000..3b679bac5b62a0600afa126135bc236a19bada4c --- /dev/null +++ b/R_src/Seurat4_integration.R @@ -0,0 +1,335 @@ +##------------------------------------------------------------------------------ +## L. Herault +##------------------------------------------------------------------------------ + +## ----------------------------------------------------------------------------- +## Libraries +## ----------------------------------------------------------------------------- + +#suppressMessages(library(monocle)) +suppressMessages(library(Seurat)) +suppressMessages(library(BiocParallel)) +suppressMessages(library(getopt)) +suppressMessages(library(gridExtra)) +#suppressMessages(library(gProfileR)) +suppressMessages(library(RColorBrewer)) +suppressMessages(library(readxl)) +suppressMessages(library(stringr)) +suppressMessages(library(scales)) +suppressMessages(library(cowplot)) +suppressMessages(library(sctransform)) +suppressMessages(library(stringr)) +suppressMessages(library(plyr)) +suppressMessages(library(grid)) +suppressMessages(library(ggplot2)) + + + +source("R_src/getSigPerCells.R") +source("R_src/Enrichment.R") +source("R_src/funForSeurat.R") + + +# Seurat 3 integration workflow + +## ----------------------------------------------------------------------------- +## Command line args +## ----------------------------------------------------------------------------- + +spec = matrix(c( + 'help', 'h', 0, "logical", "Help about the program", + 'inputFiles', 'i', 1, "character", "REQUIRED: 10X dataset paths prepared as seurat object (.RDS generated by prepare_data.R) separated by +", + 'signaturesFile', 's',1, "character", "REQUIRED: signatures rds file", + 'outdir', 'o',1, "character", 'Outdir path (default ./)', + "num_dim_CCA", 'q',1,"numeric", "First n dimensions of CCA to use for FindIntegrationAnchors Seurat function (15 by default)", + "num_dim_weight", 'w',1,"numeric", "First n dimensions to use for IntegrateData Seurat function (15 by default)", + "num_dim", 'n',1,"numeric", "First n dimensions of PCA to use for clustering, UMAP and TSNE (15 by default)", + "num_dim_integrated",'N',1,"numeric", "Number of PCA dimension computed to analyse integrated data (40 by default)", + "cores", 'c',1, "numeric", "Number of cores to use for ordering (for differencially expressed gene between clusters test)", + "resolution", 'r',1, "numeric", "resolution for smp.combined clustering", + "correction", "b",1,"character", "Covariable to use as blocking factor (eg one or several columns of pData: betch, cell cycle phases... separated by +)", + "logfc_threshold", "l", 1, "numeric", "logfc threshold for finding cluster markers (1 cluster vs all deg) 0.25 by default", + "rodriguezSig", "k", 1, "character", "path for xls file of Rodriguez result", + "norm_method", "z",1, "character", "normalization method, logNorm (by default) or sctransform", + "reusePca", "p", 0, "character","re use pca calculated before when caculating anchor weights for each dataset default to FALSE (permit to correct for cell cycle before integration)" +), byrow=TRUE, ncol=5); + +opt = getopt(spec) + +# if help was asked, print a friendly message +# and exit with a non-zero error code + +args <- commandArgs() +if ( !is.null(opt$help) | is.null(opt$inputFiles)) { + cat("Perform Seurat 3 integration workflow, then cluster the cell with seurat 3 at different resolutions") + cat(getopt(spec, usage=TRUE)) + q(status=1) +} + +#set default arguments values + +if (is.null(opt$resolution)) { + opt$resolution <- 0.6 +} + +if (is.null(opt$logfc_threshold)) { + opt$logfc_threshold <- 0.25 +} + +if (is.null(opt$num_dim_CCA)) { + opt$num_dim_CCA <- 15 +} + +if (is.null(opt$num_dim_weight)) { + opt$num_dim_weight <- 15 +} + +if (is.null(opt$num_dim)) { + opt$num_dim <- 15 +} + +if (is.null(opt$num_dim_integrated)) { + opt$num_dim_integrated <- 40 +} + +if (is.null(opt$norm_method)) { + opt$norm_method <- "logNorm" +} + +if (is.null(opt$reusePca)) { + opt$reusePca <- FALSE +} + +print(opt$reusePca) + +dir.create(opt$outdir,recursive = T,showWarnings = F) + + +corrections <- strsplit(x = opt$correction,split = "\\+")[[1]] + + +dir.create(opt$outdir,recursive = T,showWarnings = F) + + +smp.listFile <- strsplit(opt$inputFiles,split = "\\+")[[1]] + +smp.list <- list() +#load dataset + +for (i in 1:length(x = smp.listFile)) { + #For testing, in final workflow sampleName will be incorporated in metadata with the loading of cell ranger matrix + smp.list[[i]] <- readRDS(smp.listFile[i]) + sampleName <- unique(smp.list[[i]]@meta.data$sampleName) + smp.list[[i]] <- RenameCells(smp.list[[i]],add.cell.id = sampleName) + + if(opt$norm_method != "sctransform") { + smp.list[[i]] <- NormalizeData(object = smp.list[[i]], verbose = FALSE) + smp.list[[i]] <- FindVariableFeatures(object = smp.list[[i]], selection.method = "vst", + nfeatures = 2000, verbose = FALSE) + } else { + smp.list[[i]] <- SCTransform(smp.list[[i]],vars.to.regress = corrections,verbose = T) + } +} + + +smp.anchors <- FindIntegrationAnchors(object.list = smp.list, dims = 1:opt$num_dim_CCA) + +if(opt$reusePca) { + print("Re use pca") + smp.combined <- IntegrateData(anchorset = smp.anchors, weight.reduction = "pca", dims = 1:opt$num_dim_weight) +} + +smp.combined <- IntegrateData(anchorset = smp.anchors, dims = 1:opt$num_dim_weight) + + +# Run the standard workflow for visualization and clustering +#Here certainly need to reuse SCTransform on combined data if norm_method = sct + + +classicSeuratWorkflow <- function(smp.combined, corrections,outdir) { + dir.create(outdir,recursive = T) + smp.combined <- ScaleData(object = smp.combined, verbose = T,vars.to.regress = corrections) + smp.combined <- RunPCA(object = smp.combined, npcs = opt$num_dim_integrated, verbose = FALSE) + + + png(paste(outdir,"/ElbowPlot.png",sep ="")) + ElbowPlot(object = smp.combined,ndims = opt$num_dim_integrated) + dev.off() + + + # t-SNE UMAP and Clustering + smp.combined <- RunUMAP(object = smp.combined, reduction = "pca", dims = 1:opt$num_dim) + smp.combined <- RunTSNE(object = smp.combined, reduction = "pca", dims = 1:opt$num_dim) + smp.combined <- FindNeighbors(object = smp.combined, reduction = "pca", dims = 1:opt$num_dim) + smp.combined <- FindClusters(smp.combined, resolution = c(0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1,1.2)) + + colPrefix <- paste0(DefaultAssay(smp.combined),"_snn_res.") + + Idents(smp.combined) <- paste(colPrefix,opt$resolution,sep = "") + + smp.combined@meta.data$AGE <- "Old" + smp.combined@meta.data$AGE[which(smp.combined@meta.data$age == "2_months")] <- "Young" + + + + # Visualization of the samples + p1 <- DimPlot(object = smp.combined, reduction = "umap", group.by = "sampleName") + p2 <- DimPlot(object = smp.combined, reduction = "umap", label = TRUE) + p3 <- DimPlot(object = smp.combined, reduction = "umap", group.by = "AGE") + p4 <- DimPlot(object = smp.combined, reduction = "umap", group.by = "runDate") + p3tsne <- DimPlot(object = smp.combined, reduction = "tsne", group.by = "AGE") + p4tsne <- DimPlot(object = smp.combined, reduction = "tsne", group.by = "runDate") + + + + png(paste(outdir,"/umap_samples.png",sep =""),height = 800,width=1200) + grid.arrange(p1, p2,ncol = 2) + dev.off() + + png(paste(outdir,"/AGE.png",sep =""),height = 800,width=1200) + grid.arrange(p3, p3tsne,ncol = 2) + dev.off() + + png(paste(outdir,"/runDate.png",sep =""),height = 800,width=1200) + grid.arrange(p4, p4tsne,ncol = 2) + dev.off() + + png(paste(outdir,"/UmapClusters.png",sep =""),height = 800,width=1200) + DimPlot(object = smp.combined, reduction = "umap", label = TRUE) + dev.off() + + + p1tsne <- DimPlot(object = smp.combined, reduction = "tsne", group.by = "sampleName") + p2tsne <- DimPlot(object = smp.combined, reduction = "tsne", label = TRUE) + + png(paste(outdir,"/tsne_samples.png",sep =""),height = 800,width=1000) + grid.arrange(p1tsne, p2tsne,ncol = 2) + dev.off() + + #Visualistaoin one sample at a time + sampleNames <- unique(smp.combined@meta.data$sampleName) + dir.create(paste(outdir,"/samples/umap/",sep = ""),recursive = T) + dir.create(paste(outdir,"/samples/tsne/",sep = ""),recursive = T) + + #samplePlotList <- list() + for (s in sampleNames) { + png(paste(outdir,"/samples/umap/",s,".png",sep = "")) + plot(DimPlot(object = smp.combined, cells= grep(s, smp.combined@meta.data$sampleName), reduction = "umap") + ggtitle(s)) + dev.off() + png(paste(outdir,"/samples/tsne/",s,".png",sep = "")) + plot(DimPlot(object = smp.combined, cells= grep(s, smp.combined@meta.data$sampleName), reduction = "tsne") + ggtitle(s)) + dev.off() + } + + + umapListRes <- list() + tsneListRes <- list() + for (r in c(0.6,0.8,1,1.2)) { + umapListRes[[as.character(r)]] <- DimPlot(smp.combined, + reduction = "umap", + label = T, + group.by = paste(colPrefix,r,sep="")) + + NoLegend() + + ggtitle(paste("res",r)) + + tsneListRes[[as.character(r)]] <- DimPlot(smp.combined, + reduction = "tsne", + label = T, + group.by = paste(colPrefix,r,sep="")) + + NoLegend() + + ggtitle(paste("res",r)) + } + + png(paste(outdir,"/tsne_different_res.png",sep = ""),height = 800,width = 800) + grid.arrange(tsneListRes[[1]],tsneListRes[[2]],tsneListRes[[3]],tsneListRes[[4]]) + dev.off() + + png(paste(outdir,"/umap_different_res.png",sep = ""),height = 800,width = 800) + grid.arrange(umapListRes[[1]],umapListRes[[2]],umapListRes[[3]],umapListRes[[4]]) + dev.off() + + + + smp.combined@meta.data$numclust <- smp.combined@meta.data[,paste(colPrefix,opt$resolution,sep = "")] + + + #Check for unwanted source of variation UMAP + pPhases <- DimPlot(smp.combined,group.by = "phases") + if (!is.null(smp.combined@meta.data$predicted)) { + pPred <- DimPlot(smp.combined,group.by = "predicted") + } else { + pPred <- DimPlot(smp.combined) + } + pUMI <- FeaturePlot(smp.combined, "nCount_RNA") + pMito <- FeaturePlot(smp.combined, "percentMito") + + png(paste(outdir,"/umap_factors.png",sep = ""),height = 800,width = 800) + grid.arrange(pPhases,pPred,pUMI,pMito) + dev.off() + + #Check for unwanted source of variation tSNE + pPhases <- DimPlot(smp.combined,group.by = "phases",reduction = "tsne") + if (!is.null(smp.combined@meta.data$predicted)) { + pPred <- DimPlot(smp.combined,group.by = "predicted",reduction = "tsne") + } else { + pPred <- DimPlot(smp.combined,reduction = "tsne") + } + pUMI <- FeaturePlot(smp.combined, "nCount_RNA",reduction = "tsne") + pMito <- FeaturePlot(smp.combined, "percentMito",reduction = "tsne") + + png(paste(outdir,"/tsne_factors.png",sep = ""),height = 800,width = 800) + grid.arrange(pPhases,pPred,pUMI,pMito) + dev.off() + return(smp.combined) +} + +print("Seurat workflow without integration") +DefaultAssay(object = smp.combined) <- "RNA" +smp.combined <- NormalizeData(object = smp.combined) +smp.combined <- FindVariableFeatures(object = smp.combined,selection.method = "vst", nfeatures = 2000, verbose = T) + +if (!is.null(corrections)) { + print("Seurat workflow without correction") + classicSeuratWorkflow(smp.combined,correction = NULL,outdir = paste0(opt$outdir,"/SeuratMergingWoIntegrationWoCr/")) +} + +classicSeuratWorkflow(smp.combined,corrections = corrections,outdir = paste0(opt$outdir,"/SeuratMergingWoIntegration/")) + + + +print("Seurat workflow with integration") + +DefaultAssay(object = smp.combined) <- "integrated" + + +if (!is.null(corrections)) { + print("Seurat workflow without correction") + classicSeuratWorkflow(smp.combined,correction = NULL,outdir = paste0(opt$outdir,"/SeuratWithoutCr/")) +} + +smp.combined <- classicSeuratWorkflow(smp.combined,corrections = corrections,outdir =opt$outdir) + + + +#AGE prop for each cluster +age_clust <- getAGEPropPerClustBarplot(smp.combined) +#sample prop for each cluster +sample_clust <- getSamplePropPerClustBarplot(smp.combined) +#runDate prop for each cluster +runDate_clust <- getRunDatePropPerClustBarplot(smp.combined) + +png(paste(opt$outdir,"/propPerClust.png",sep = "")) +grid.arrange(age_clust,sample_clust,runDate_clust, + DimPlot(object = smp.combined, reduction = "tsne", label = TRUE)) +dev.off() + + +## saving results + +saveRDS(smp.combined,paste(opt$outdir,"/combined.rds",sep ="")) + + + + + + diff --git a/R_src/SignacMotifAnnotCL.R b/R_src/SignacMotifAnnotCL.R new file mode 100644 index 0000000000000000000000000000000000000000..6fb5896188d8e40fc1fdf03cc66f805ae576465c --- /dev/null +++ b/R_src/SignacMotifAnnotCL.R @@ -0,0 +1,324 @@ +#-----------------------------------------------------# +# Signac Motif analysis tutorial +#-----------------------------------------------------# + +suppressMessages(library(getopt)) +suppressMessages(library(Signac)) +suppressMessages(library(Seurat)) +suppressMessages(library(JASPAR2020)) +suppressMessages(library(TFBSTools)) +suppressMessages(library(BSgenome.Mmusculus.UCSC.mm10)) +suppressMessages(library(ggplot2)) +# library(patchwork) + + +##-------------------------------------------------------------------------## +## Option list +##-------------------------------------------------------------------------## + +spec = matrix(c( + 'help', 'h', 0, "logical", "Help about the program", + 'inputRDS', 'i', 1, "character", "REQUIRED: 10X data prepared as seurat/signac object.", + 'outdir', 'o',1, "character", 'Outdir path (default ./)', + 'species', 's', 1, "numeric", "Species ID number for the motif DB, human = 9606, mouse = 10090 (default human)", + 'assay', 'a', 1, "character", "Name of the assay that contain the atac data (default 'peaks')", + 'cluster', 'c', 1, "character", "Name of the cluster identity for the markers", + 'IdentName', 'n', 1, "character", "Idents new name '+' separated", + 'IdentLevel', 'l', 1, "character", "Levels of the new Ident '+' separated", + 'integrated', 'I', 1, "logical", "Is the RDS an integrated dataset or not" +), byrow=TRUE, ncol=5); + +opt = getopt(spec) + +# if help was asked, print a friendly message +# and exit with a non-zero error code + +args <- commandArgs() +if ( !is.null(opt$help) | is.null(opt$inputRDS)) { + cat("Help message") + cat(getopt(spec, usage=TRUE)) + q(status=1) +} + +if (is.null(opt$outdir)) { + opt$outdir = "./" +} + +if (is.null(opt$species)){ + opt$species <- 9606 +} + +if(is.null(opt$assay)){ + opt$assay <- "peaks" +} + +# opt$inputRDS <- "/shared/projects/scRNA_HSPC_Aging/GMP_PRARA/output/TransferRnaAtac/coembedAll.rds" +# opt$IdentName <- "Neu1+Neu3+Neu2+NeuRA1+Rep+NeuRA2" +# opt$IdentLevel <- "Rep+Neu1+Neu2+Neu3+NeuRA1+NeuRA2" +# opt$species <- 9606 +# opt$cluster <- "numclust" + + +defAssay <- opt$assay + +outdir <- opt$outdir + +dir.create(outdir, recursive = T) + +#------------------------------------# +# Load the motif dataBase +#------------------------------------# +print("Load JASPAR DB") +# Get a list of motif position frequency matrices from the JASPAR database +# species = 9606 is human species id while the tutorial is using mice dataset +pfm <- getMatrixSet( + x = JASPAR2020, + opts = list(species = opt$species, all_versions = FALSE) +) + +#------------------------------------# +# Read seurat and prepare the data +#------------------------------------# + +seurat <- readRDS(opt$inputRDS) + +DefaultAssay(seurat) <- defAssay + +Idents(seurat) <- opt$cluster + +print("Entering analysis loop") +print(paste("opt$integrated == ", opt$integrated, sep = "")) +if(opt$integrated == TRUE){ + # Prepare data + # Rename the cluster with real name for an easier analysis + new.ident.name <- strsplit(opt$IdentName, split = "\\+")[[1]] + + names(new.ident.name) <- levels(seurat) + + seurat <- RenameIdents(seurat, new.ident.name) + seurat@meta.data$FinalCluster <- Idents(seurat) + + ident_levels <- strsplit(opt$IdentLevel, split = "\\+")[[1]] + + seurat@meta.data$FinalCluster <- factor(seurat@meta.data$FinalCluster, levels = ident_levels) + + Idents(seurat) <- "FinalCluster" + + # Subset only ATAC data + Idents(seurat) <- "condType" + seurat_atac <- subset(seurat, idents = grep(pattern = "ATAC", x = unique(Idents(seurat)), value = T)) + seurat_atac[[defAssay]] <- as.ChromatinAssay(seurat_atac[[defAssay]], seqinfo = "mm10") + + Idents(seurat_atac) <- "FinalCluster" + + # Adding motif information + seurat_atac <- AddMotifs(object = seurat_atac, genome = BSgenome.Mmusculus.UCSC.mm10, pfm = pfm) + + motif_correl <- seurat_atac@assays$peaks@motifs@motif.names + motif_correl.df <- data.frame(do.call("rbind", motif_correl)) + names(motif_correl.df)[1] <- "Motif_Name" + motif_correl.df$Motif_ID <- rownames(motif_correl.df) + + write.table(x = motif_correl.df, file = paste0(outdir, "/motif_name_table.tsv"), sep = "\t", row.names = F, col.names = T, quote = F) + + #----------------------------# + # Method 1 : Find DA region + #----------------------------# + # dir.create(paste0(outdir, "/Overrep_Motif"), recursive = T) + # print("Starting method 1") + # ## Treatment impact + # dir.create(paste0(outdir, "/Overrep_Motif/Treatment"), recursive = T) + # # FindMarkers using Logistic regression + # Idents(seurat_atac) <- "condition" + # + # da_peaks_treatment <- FindMarkers(seurat_atac, ident.1 = "Treated", ident.2 = "Control", only.pos = T, test.use = "LR", latent.vars = "nCount_peaks") + # da_peaks_treatment.sig <- rownames(da_peaks_treatment[da_peaks_treatment$p_val < 0.005, ]) + # + # # Calculate a motif score for each of the most DA region + # enriched_motifs_treatment <- FindMotifs(object = seurat_atac, features = da_peaks_treatment.sig) + # motif_plot_treatment <- MotifPlot(seurat_atac, head(enriched_motifs_treatment$motif, 6)) + # + # ggsave(paste0(outdir, "/Overrep_Motif/Treatment", "/Top_motif_treatment", '.png'), plot = motif_plot_treatment, device = 'png', path = NULL, + # scale = 1, width = 20, height = 15, units = 'cm', dpi = 300) + # + # write.table(da_peaks_treatment, file = paste0(outdir, "/Overrep_Motif/Treatment", "/DA_genomic_regions_treatment", ".tsv"), + # sep = "\t", col.names = T, row.names = T, quote = F) + # write.table(enriched_motifs_treatment, file = paste0(outdir, "/Overrep_Motif/Treatment", "/Motif_enrichment_treatment", ".tsv"), + # sep = "\t", col.names = T, row.names = T, quote = F) + # print("Treatment analysis done") + # + # ## Cluster impact + # dir.create(paste0(outdir, "/Overrep_Motif/Cluster"), recursive = T) + # + # Idents(seurat_atac) <- "FinalCluster" + # + # print(unique(seurat_atac@meta.data$FinalCluster)) + # # BiocParallel::register(BiocParallel::SerialParam()) + # # da_peaks_cluster <- FindAllMarkers(seurat_atac, only.pos = T, test.use = "LR", latent.vars = "nCount_peaks") + # da_peaks_cluster.list <- list() + # for(cluster in unique(seurat_atac$FinalCluster)){ + # print(paste("Find markers cluster", cluster, sep = " ")) + # if(nrow(seurat_atac@meta.data[seurat_atac$FinalCluster %in% cluster,]) >3){ + # da_peaks_cluster.list[[cluster]] <- FindMarkers(seurat_atac, ident.1 = cluster, only.pos = T, test.use = "LR", latent.vars = "nCount_peaks") + # da_peaks_cluster.list[[cluster]]$Cluster <- cluster + # da_peaks_cluster.list[[cluster]]$peaks <- rownames(da_peaks_cluster.list[[cluster]]) + # } else{ + # print("Not enough cells") + # } + # } + # + # da_peaks_cluster.db <- do.call("rbind", da_peaks_cluster.list) + # print("FindMarkers cluster done") + # + # enriched_motifs_cluster.list <- list() + # for(cluster in unique(da_peaks_cluster.db$Cluster)){ + # print(paste(cluster, "FindMotif", sep = " ")) + # da_peaks_cluster.sig <- da_peaks_cluster.db[which(da_peaks_cluster.db$p_val < 0.005 & da_peaks_cluster.db$Cluster == cluster),] + # da_peaks_cluster.sig.name <- da_peaks_cluster.sig$peaks + # enriched_motifs_cluster.list[[paste0(cluster, "_motifs")]] <- FindMotifs(seurat_atac, features = da_peaks_cluster.sig.name) + # enriched_motifs_cluster.list[[paste0(cluster, "_motifs")]]$Cluster <- cluster + # + # # motif_plot <- MotifPlot(atac_small, motifs = head(enriched_motifs_cluster.list[[paste0(cluster, "_motifs")]]$motif, 6)) + # # ggsave(paste0(outdir, "/Overrep_Motif/Cluster", "/Top_motif_cluster_", cluster, '.png'), plot = motif_plot, device = 'png', path = NULL, + # # scale = 1, width = 20, height = 15, units = 'cm', dpi = 300) + # } + # enriched_motifs_cluster.df <- do.call("rbind", enriched_motifs_cluster.list) + # + # write.table(da_peaks_cluster.db, file = paste0(outdir, "/Overrep_Motif/Cluster", "/Motif_enrichment_cluster", ".tsv"), + # sep = "\t", col.names = T, row.names = T, quote = F) + # write.table(enriched_motifs_cluster.df, file = paste0(outdir, "/Overrep_Motif/Cluster", "/Motif_enrichment_cluster", ".tsv"), + # sep = "\t", col.names = T, row.names = T, quote = F) + # print("cluster impact done") + #---------------------------------------# + # Method 2 : Computing motif activity + #---------------------------------------# + dir.create(paste0(outdir, "/Motif_activity"), recursive = T) + + print("Starting method 2") + # Computing motif per cell motif activity score + # Try this for error with multicore + BiocParallel::register(BiocParallel::SerialParam()) + seurat_atac <- RunChromVAR(object = seurat_atac, genome = BSgenome.Mmusculus.UCSC.mm10) + + DefaultAssay(seurat_atac) <- "chromvar" + print("End chromvar") + + # Find differential activity in motif per treatment + dir.create(paste0(outdir, "/Motif_activity/Treatment"), recursive = T) + print("Starting treatment analysis") + + Idents(seurat_atac) <- "condition" + diff_act_treatment <- FindMarkers(object = seurat_atac, ident.1 = "Treated", ident.2 = "Control", test.use = "LR", latent.vars = "nCount_peaks") + diff_act_treatment$Motif_Name <- motif_correl.df[rownames(diff_act_treatment), "Motif_Name"] + + top_motif_treatment <- MotifPlot(object = seurat_atac, motifs = head(rownames(diff_act_treatment), 6), assay = "peaks") + + write.table(diff_act_treatment, file = paste0(outdir, "/Motif_activity/Treatment", "/Motif_enrich_treatment", ".tsv"), + sep = "\t", row.names = T, col.names = T, quote = F) + + ggsave(paste0(outdir, "/Motif_activity/Treatment", "/Top_motif_treatment", '.png'), plot = top_motif_treatment, device = 'png', path = NULL, + scale = 1, width = 20, height = 15, units = 'cm', dpi = 300) + print("End treatment analysis") + + # Find differential activity in motif per cluster + dir.create(paste0(outdir, "/Motif_activity/Cluster"), recursive = T) + + print("Starting cluster analysis") + Idents(seurat_atac) <- "FinalCluster" + diff_act_cluster <- FindAllMarkers(object = seurat_atac, only.pos = T, test.use = "LR", latent.vars = "nCount_peaks") + + for(cluster in unique(diff_act_cluster$cluster)){ + top_motif <- diff_act_cluster[diff_act_cluster$cluster %in% cluster,] + top_motif.name <- head(top_motif$gene, 6) + + top_motif.plot <- MotifPlot(object = seurat_atac, motifs = top_motif.name, assay = "peaks") + ggsave(paste0(outdir, "/Motif_activity/Cluster", "/Top_motif_cluster_", cluster, '.png'), plot = top_motif.plot, device = 'png', path = NULL, + scale = 1, width = 20, height = 15, units = 'cm', dpi = 300) + } + + + + # rownames(diff_act_cluster) <- diff_act_cluster$gene + print("Adding Motif_name to table output") + diff_act_cluster.list <- list() + for(cluster in unique(diff_act_cluster$cluster)){ + diff_act_cluster.temp <- diff_act_cluster[diff_act_cluster$cluster %in% cluster,] + rownames(diff_act_cluster.temp) <- diff_act_cluster.temp$gene + diff_act_cluster.temp$Motif_Name <- motif_correl.df[rownames(diff_act_cluster.temp), "Motif_Name"] + diff_act_cluster.list[[cluster]] <- diff_act_cluster.temp + } + diff_act_cluster.final <- do.call("rbind", diff_act_cluster.list) + + write.table(diff_act_cluster.final, file = paste0(outdir, "/Motif_activity/Cluster", "/Motif_enrich_cluster", ".tsv"), + sep = "\t", row.names = T, col.names = T, quote = F) + print("End cluster analysis") + + # Analysing motif markers of difference between pop IN a cluster (ex : Clus1_Treated vs Clust1_Control) + print("Starting the treatment per cluster analysis") + dir.create(paste0(outdir, "/Motif_activity/Treatment_per_clust"), recursive = T) + + seurat_atac$cluster_cond <- paste(seurat_atac$condition, seurat_atac$FinalCluster, sep = "_") + Idents(seurat_atac) <- "cluster_cond" + + diff_act_cluster_cond.list <- list() + for(cluster in unique(seurat_atac$FinalCluster)){ + print(paste("Treated_vs_Control", cluster, sep = "_")) + + if(nrow(seurat_atac@meta.data[seurat_atac$cluster_cond %in% paste("Control", cluster, sep = "_"),]) >3 && + nrow(seurat_atac@meta.data[seurat_atac$cluster_cond %in% paste("Treated", cluster, sep = "_"),])){ + print("Enough cells for analysis") + diff_act_cluster_cond.list[[paste("Treated_vs_Control", cluster, sep = "_")]] <- FindMarkers(object = seurat_atac, ident.1 = paste("Treated", cluster, sep = "_"), + ident.2 = paste("Control", cluster, sep = "_"), + test.use = 'LR', latent.vars = 'nCount_peaks') + + diff_act_cluster_cond.list[[paste("Treated_vs_Control", cluster, sep = "_")]]$Comparison <- "Treated_vs_Control" + diff_act_cluster_cond.list[[paste("Treated_vs_Control", cluster, sep = "_")]]$Cluster <- cluster + diff_act_cluster_cond.list[[paste("Treated_vs_Control", cluster, sep = "_")]]$Motif_ID <- rownames(diff_act_cluster_cond.list[[paste("Treated_vs_Control", cluster, sep = "_")]]) + diff_act_cluster_cond.list[[paste("Treated_vs_Control", cluster, sep = "_")]]$Motif_Name <- motif_correl.df[rownames(diff_act_cluster_cond.list[[paste("Treated_vs_Control", cluster, sep = "_")]]), "Motif_Name"] + }else{ + print("Not enough cells") + } + } + diff_act_cluster_cond.df <- do.call("rbind", diff_act_cluster_cond.list) + + print("Plotting top motif for treated and control per cluster") + for(cluster in unique(diff_act_cluster_cond.df$Cluster)){ + top_motif <- diff_act_cluster_cond.df[diff_act_cluster_cond.df$Cluster %in% cluster,] + top_motif.name <- head(top_motif$Motif_ID, 6) + bot_motif.name <- tail(top_motif$Motif_ID, 6) + + top_motif.plot <- MotifPlot(object = seurat_atac, motifs = top_motif.name, assay = "peaks") + bot_motif.plot <- MotifPlot(object = seurat_atac, motifs = bot_motif.name, assay = "peaks") + ggsave(paste0(outdir, "/Motif_activity/Treatment_per_clust", "/Top_motif_treated_cluster_", cluster, '.png'), plot = top_motif.plot, device = 'png', path = NULL, + scale = 1, width = 20, height = 15, units = 'cm', dpi = 300) + ggsave(paste0(outdir, "/Motif_activity/Treatment_per_clust", "/Top_motif_control_cluster_", cluster, '.png'), plot = bot_motif.plot, device = 'png', path = NULL, + scale = 1, width = 20, height = 15, units = 'cm', dpi = 300) + } + + write.table(diff_act_cluster_cond.df, file = paste0(outdir, "/Motif_activity/Treatment_per_clust", "/Motif_enrich_cluster", ".tsv"), + sep = "\t", row.names = T, col.names = T, quote = F) + print("End of the treatment per cluster analysis") + + saveRDS(seurat_atac, file = paste0(outdir, "/atac_motif", '.rds')) + +} else if(opt$integrated == FALSE){ + print("Only computing motif activity score") + + # Adding motif information + seurat <- AddMotifs(object = seurat, genome = BSgenome.Mmusculus.UCSC.mm10, pfm = pfm) + + motif_correl <- seurat@assays$peaks@motifs@motif.names + motif_correl.df <- data.frame(do.call("rbind", motif_correl)) + names(motif_correl.df)[1] <- "Motif_Name" + motif_correl.df$Motif_ID <- rownames(motif_correl.df) + + write.table(x = motif_correl.df, file = paste0(outdir, "/motif_name_table.tsv"), sep = "\t", row.names = F, col.names = T, quote = F) + + # Calculate motif score + BiocParallel::register(BiocParallel::SerialParam()) + seurat <- RunChromVAR(object = seurat, genome = BSgenome.Mmusculus.UCSC.mm10) + + saveRDS(seurat, file = paste0(outdir, "/atac_motif", ".rds")) +} + + diff --git a/R_src/SignacMotifAnnotCL_v2.R b/R_src/SignacMotifAnnotCL_v2.R new file mode 100644 index 0000000000000000000000000000000000000000..328896f7c90d3b5b3f1162748ed24a7a55ed7525 --- /dev/null +++ b/R_src/SignacMotifAnnotCL_v2.R @@ -0,0 +1,250 @@ +#-----------------------------------------------------# +# Signac Motif analysis tutorial +#-----------------------------------------------------# + +suppressMessages(library(getopt)) +suppressMessages(library(Signac)) +suppressMessages(library(Seurat)) +suppressMessages(library(JASPAR2020)) +suppressMessages(library(TFBSTools)) +suppressMessages(library(BSgenome.Mmusculus.UCSC.mm10)) +suppressMessages(library(ggplot2)) +# library(patchwork) + + +##-------------------------------------------------------------------------## +## Option list +##-------------------------------------------------------------------------## + +spec = matrix(c( + 'help', 'h', 0, "logical", "Help about the program", + 'inputRDS', 'i', 1, "character", "REQUIRED: 10X data prepared as seurat/signac object.", + 'outdir', 'o',1, "character", 'Outdir path (default ./)', + 'species', 's', 1, "numeric", "Species ID number for the motif DB, human = 9606, mouse = 10090 (default human)", + 'assay', 'a', 1, "character", "Name of the assay that contain the atac data (default 'peaks')", + 'cluster', 'c', 1, "character", "Name of the cluster identity for the markers", + 'IdentName', 'n', 1, "character", "Idents new name '+' separated", + 'IdentLevel', 'l', 1, "character", "Levels of the new Ident '+' separated", + 'integrated', 'I', 1, "logical", "Is the RDS an integrated dataset or not" +), byrow=TRUE, ncol=5); + +opt = getopt(spec) + +# if help was asked, print a friendly message +# and exit with a non-zero error code + +args <- commandArgs() +if ( !is.null(opt$help) | is.null(opt$inputRDS)) { + cat("Help message") + cat(getopt(spec, usage=TRUE)) + q(status=1) +} + +if (is.null(opt$outdir)) { + opt$outdir = "./" +} + +if (is.null(opt$species)){ + opt$species <- 9606 +} + +if(is.null(opt$assay)){ + opt$assay <- "peaks" +} + +# opt$inputRDS <- "/shared/projects/scRNA_HSPC_Aging/GMP_PRARA/output/TransferRnaAtac/coembedAll.rds" +# opt$IdentName <- "Neu2+Rep+Neu3+NeuRA2+NeuRA1+Neu1" +# opt$IdentLevel <- "Rep+Neu1+Neu2+Neu3+NeuRA1+NeuRA2" +# opt$species <- 9606 +# opt$cluster <- "predicted.id" + + +defAssay <- opt$assay + +outdir <- opt$outdir + +dir.create(outdir, recursive = T) + +#------------------------------------# +# Load the motif dataBase +#------------------------------------# +print("Load JASPAR DB") +# Get a list of motif position frequency matrices from the JASPAR database +# species = 9606 is human species id while the tutorial is using mice dataset +pfm <- getMatrixSet( + x = JASPAR2020, + opts = list(species = opt$species, all_versions = FALSE) +) + +#------------------------------------# +# Read seurat and prepare the data +#------------------------------------# + +seurat <- readRDS(opt$inputRDS) + +DefaultAssay(seurat) <- defAssay + +Idents(seurat) <- opt$cluster + +print("Entering analysis loop") +print(paste("opt$integrated == ", opt$integrated, sep = "")) +if(opt$integrated == TRUE){ + # Prepare data + # Rename the cluster with real name for an easier analysis + new.ident.name <- strsplit(opt$IdentName, split = "\\+")[[1]] + + names(new.ident.name) <- levels(seurat) + + seurat <- RenameIdents(seurat, new.ident.name) + seurat@meta.data$FinalCluster <- Idents(seurat) + + ident_levels <- strsplit(opt$IdentLevel, split = "\\+")[[1]] + + seurat@meta.data$FinalCluster <- factor(seurat@meta.data$FinalCluster, levels = ident_levels) + + Idents(seurat) <- "FinalCluster" + + seurat_atac <- seurat + # Adding motif information + seurat_atac <- AddMotifs(object = seurat_atac, genome = BSgenome.Mmusculus.UCSC.mm10, pfm = pfm) + + motif_correl <- seurat_atac@assays$peaks@motifs@motif.names + motif_correl.df <- data.frame(do.call("rbind", motif_correl)) + names(motif_correl.df)[1] <- "Motif_Name" + motif_correl.df$Motif_ID <- rownames(motif_correl.df) + + write.table(x = motif_correl.df, file = paste0(outdir, "/motif_name_table.tsv"), sep = "\t", row.names = F, col.names = T, quote = F) + + #---------------------------------------# + # Method 2 : Computing motif activity + #---------------------------------------# + dir.create(paste0(outdir, "/Motif_activity"), recursive = T) + + print("Starting method 2") + # Computing motif per cell motif activity score + # Try this for error with multicore + BiocParallel::register(BiocParallel::SerialParam()) + seurat_atac <- RunChromVAR(object = seurat_atac, genome = BSgenome.Mmusculus.UCSC.mm10) + + DefaultAssay(seurat_atac) <- "chromvar" + print("End chromvar") + + # Find differential activity in motif per treatment + dir.create(paste0(outdir, "/Motif_activity/Treatment"), recursive = T) + print("Starting treatment analysis") + + Idents(seurat_atac) <- "condition" + diff_act_treatment <- FindMarkers(object = seurat_atac, ident.1 = "Treated", ident.2 = "Control", test.use = "LR", latent.vars = "nCount_peaks") + diff_act_treatment$Motif_Name <- motif_correl.df[rownames(diff_act_treatment), "Motif_Name"] + + top_motif_treatment <- MotifPlot(object = seurat_atac, motifs = head(rownames(diff_act_treatment), 6), assay = "peaks") + + write.table(diff_act_treatment, file = paste0(outdir, "/Motif_activity/Treatment", "/Motif_enrich_treatment", ".tsv"), + sep = "\t", row.names = T, col.names = T, quote = F) + + ggsave(paste0(outdir, "/Motif_activity/Treatment", "/Top_motif_treatment", '.png'), plot = top_motif_treatment, device = 'png', path = NULL, + scale = 1, width = 20, height = 15, units = 'cm', dpi = 300) + print("End treatment analysis") + + # Find differential activity in motif per cluster + dir.create(paste0(outdir, "/Motif_activity/Cluster"), recursive = T) + + print("Starting cluster analysis") + Idents(seurat_atac) <- "FinalCluster" + diff_act_cluster <- FindAllMarkers(object = seurat_atac, only.pos = T, test.use = "LR", latent.vars = "nCount_peaks") + + for(cluster in unique(diff_act_cluster$cluster)){ + top_motif <- diff_act_cluster[diff_act_cluster$cluster %in% cluster,] + top_motif.name <- head(top_motif$gene, 6) + + top_motif.plot <- MotifPlot(object = seurat_atac, motifs = top_motif.name, assay = "peaks") + ggsave(paste0(outdir, "/Motif_activity/Cluster", "/Top_motif_cluster_", cluster, '.png'), plot = top_motif.plot, device = 'png', path = NULL, + scale = 1, width = 20, height = 15, units = 'cm', dpi = 300) + } + + + + # rownames(diff_act_cluster) <- diff_act_cluster$gene + print("Adding Motif_name to table output") + diff_act_cluster.list <- list() + for(cluster in unique(diff_act_cluster$cluster)){ + diff_act_cluster.temp <- diff_act_cluster[diff_act_cluster$cluster %in% cluster,] + rownames(diff_act_cluster.temp) <- diff_act_cluster.temp$gene + diff_act_cluster.temp$Motif_Name <- motif_correl.df[rownames(diff_act_cluster.temp), "Motif_Name"] + diff_act_cluster.list[[cluster]] <- diff_act_cluster.temp + } + diff_act_cluster.final <- do.call("rbind", diff_act_cluster.list) + + write.table(diff_act_cluster.final, file = paste0(outdir, "/Motif_activity/Cluster", "/Motif_enrich_cluster", ".tsv"), + sep = "\t", row.names = T, col.names = T, quote = F) + print("End cluster analysis") + + # Analysing motif markers of difference between pop IN a cluster (ex : Clus1_Treated vs Clust1_Control) + print("Starting the treatment per cluster analysis") + dir.create(paste0(outdir, "/Motif_activity/Treatment_per_clust"), recursive = T) + + seurat_atac$cluster_cond <- paste(seurat_atac$condition, seurat_atac$FinalCluster, sep = "_") + Idents(seurat_atac) <- "cluster_cond" + + diff_act_cluster_cond.list <- list() + for(cluster in unique(seurat_atac$FinalCluster)){ + print(paste("Treated_vs_Control", cluster, sep = "_")) + + if(nrow(seurat_atac@meta.data[seurat_atac$cluster_cond %in% paste("Control", cluster, sep = "_"),]) >3 && + nrow(seurat_atac@meta.data[seurat_atac$cluster_cond %in% paste("Treated", cluster, sep = "_"),])){ + print("Enough cells for analysis") + diff_act_cluster_cond.list[[paste("Treated_vs_Control", cluster, sep = "_")]] <- FindMarkers(object = seurat_atac, ident.1 = paste("Treated", cluster, sep = "_"), + ident.2 = paste("Control", cluster, sep = "_"), + test.use = 'LR', latent.vars = 'nCount_peaks') + + diff_act_cluster_cond.list[[paste("Treated_vs_Control", cluster, sep = "_")]]$Comparison <- "Treated_vs_Control" + diff_act_cluster_cond.list[[paste("Treated_vs_Control", cluster, sep = "_")]]$Cluster <- cluster + diff_act_cluster_cond.list[[paste("Treated_vs_Control", cluster, sep = "_")]]$Motif_ID <- rownames(diff_act_cluster_cond.list[[paste("Treated_vs_Control", cluster, sep = "_")]]) + diff_act_cluster_cond.list[[paste("Treated_vs_Control", cluster, sep = "_")]]$Motif_Name <- motif_correl.df[rownames(diff_act_cluster_cond.list[[paste("Treated_vs_Control", cluster, sep = "_")]]), "Motif_Name"] + }else{ + print("Not enough cells") + } + } + diff_act_cluster_cond.df <- do.call("rbind", diff_act_cluster_cond.list) + + print("Plotting top motif for treated and control per cluster") + for(cluster in unique(diff_act_cluster_cond.df$Cluster)){ + top_motif <- diff_act_cluster_cond.df[diff_act_cluster_cond.df$Cluster %in% cluster,] + top_motif.name <- head(top_motif$Motif_ID, 6) + bot_motif.name <- tail(top_motif$Motif_ID, 6) + + top_motif.plot <- MotifPlot(object = seurat_atac, motifs = top_motif.name, assay = "peaks") + bot_motif.plot <- MotifPlot(object = seurat_atac, motifs = bot_motif.name, assay = "peaks") + ggsave(paste0(outdir, "/Motif_activity/Treatment_per_clust", "/Top_motif_treated_cluster_", cluster, '.png'), plot = top_motif.plot, device = 'png', path = NULL, + scale = 1, width = 20, height = 15, units = 'cm', dpi = 300) + ggsave(paste0(outdir, "/Motif_activity/Treatment_per_clust", "/Top_motif_control_cluster_", cluster, '.png'), plot = bot_motif.plot, device = 'png', path = NULL, + scale = 1, width = 20, height = 15, units = 'cm', dpi = 300) + } + + write.table(diff_act_cluster_cond.df, file = paste0(outdir, "/Motif_activity/Treatment_per_clust", "/Motif_enrich_cluster", ".tsv"), + sep = "\t", row.names = T, col.names = T, quote = F) + print("End of the treatment per cluster analysis") + + saveRDS(seurat_atac, file = paste0(outdir, "/atac_motif", '.rds')) + +} else if(opt$integrated == FALSE){ + print("Only computing motif activity score") + + # Adding motif information + seurat <- AddMotifs(object = seurat, genome = BSgenome.Mmusculus.UCSC.mm10, pfm = pfm) + + motif_correl <- seurat@assays$peaks@motifs@motif.names + motif_correl.df <- data.frame(do.call("rbind", motif_correl)) + names(motif_correl.df)[1] <- "Motif_Name" + motif_correl.df$Motif_ID <- rownames(motif_correl.df) + + write.table(x = motif_correl.df, file = paste0(outdir, "/motif_name_table.tsv"), sep = "\t", row.names = F, col.names = T, quote = F) + + # Calculate motif score + BiocParallel::register(BiocParallel::SerialParam()) + seurat <- RunChromVAR(object = seurat, genome = BSgenome.Mmusculus.UCSC.mm10) + + saveRDS(seurat, file = paste0(outdir, "/atac_motif", ".rds")) +} + + diff --git a/R_src/SignacOneSampleCL.R b/R_src/SignacOneSampleCL.R new file mode 100644 index 0000000000000000000000000000000000000000..0aef397096d7969629a4f67f1f72753ec4952bb1 --- /dev/null +++ b/R_src/SignacOneSampleCL.R @@ -0,0 +1,151 @@ +##------------------------------------------------------------------------------ +## L. Herault +##------------------------------------------------------------------------------ + +## ----------------------------------------------------------------------------- +## Libraries +## ----------------------------------------------------------------------------- + +suppressMessages(library(Signac)) +suppressMessages(library(Seurat)) +suppressMessages(library(GenomeInfoDb)) +suppressMessages(library(ggplot2)) +suppressMessages(library(patchwork)) +set.seed(1234) + + +# Analysis of seurat 3 integration and clusternig workflow + +## ----------------------------------------------------------------------------- +## Command line args +## ----------------------------------------------------------------------------- + +spec = matrix(c( + 'help', 'h', 0, "logical", "Help about the program", + 'filename', 'i', 1, 'character', 'input filtered_peak_bc_matrix.h5', + 'metafile', 'm', 1, 'character', 'input metadata file eg. singlecell.csv', + 'fragments', 'f', 1, 'character', 'fragments.tsv.gz file', + 'genome', 'g', 1, 'character', 'genome, corresponding R libraries have to be installed (eg EnsDb.Mmusculus.v79) for mm10', + 'minCell', 'c', 1, 'numeric', 'min cell number by feature (default 10)', + 'minFeature', 'e', 1, 'numeric', "min feature number per cells (default 200)", + 'outdir', 'o',1, "character", 'Outdir path (default ./)', + "tfTested", 't',1, "character", "TF tested list (eg TF with a motif in the scenic database", + "regulonJson", 'r',1, "character", "Regulon main Json file", + "regulonJsonSupp", "s", 1, "character", "Regulon supp Json files (separated by +)", + "subConditionName", "n", 1, "character", "correspondong names (separated by +)" + +), byrow=TRUE, ncol=5); + +## default settings + +if (is.null(opt$outdir)) { + opt$outdir = "./" +} + +if (is.null(opt$minCell)) { + opt$minCell = 10 +} + +if (is.null(opt$minFeature)) { + opt$minFeature = 200 +} + +# Printing option before running + +for (o in names(opt)) { + print(paste(o,":", opt[[o]])) +} + +## Load data +counts <- Read10X_h5(filename = opt$filename) + +metadata <- read.csv( + file = opt$metafile, + header = TRUE, + row.names = 1 +) + +chrom_assay <- CreateChromatinAssay( + counts = counts, + sep = c(":", "-"), + genome = opt$genome, + fragments = opt$fragments, + min.cells = opt$minCell, + min.features = opt$minFeature +) + +signac <- CreateSeuratObject( + counts = chrom_assay, + assay = "peaks", + meta.data = metadata +) + +if(opt$genome == "mm10") { + suppressMessages(library(EnsDb.Mmusculus.v79)) + ensdb = EnsDb.Mmusculus.v79 +} + +# extract gene annotations from EnsDb +annotations <- GetGRangesFromEnsDb(ensdb = ensdb) + +# change to UCSC style since the data was mapped to hg19 +seqlevelsStyle(annotations) <- 'UCSC' +genome(annotations) <- opt$genome + +# add the gene information to the object +Annotation(signac) <- annotations + +signac <- NucleosomeSignal(object = signac) + +# Quality control and cells filtering + +# compute TSS enrichment score per cell +signac <- TSSEnrichment(object = signac, fast = FALSE) + +# add blacklist ratio and fraction of reads in peaks +signac$pct_reads_in_peaks <- signac$peak_region_fragments / signac$passed_filters * 100 +signac$blacklist_ratio <- signac$blacklist_region_fragments / signac$peak_region_fragments + +# Plot TSS enrichment, separating cells in two groups +signac$high.tss <- ifelse(signac$TSS.enrichment > 2, 'High', 'Low') + +png(pasteO(opt$oudir,'/TSS_enrichment.png'),width = 800,height = 800) +TSSPlot(signac, group.by = 'high.tss') + NoLegend() +dev.off() + +# Plot FragmentHistogram on chr1, separating cells in two groups depending on nucleosome signal + +signac$nucleosome_group <- ifelse(signac$nucleosome_signal > 2, 'NS > 2', 'NS < 2') + +png(pasteO(opt$oudir,'/fragmentHistogram.png'),width = 800,height = 800) +FragmentHistogram(object = signac,region = "chr1-1-100000000",group.by = 'nucleosome_group') +dev.off() + +# VlnPlot of control metrics +png(pasteO(opt$oudir,'/vlncontrolMetrics.png'),width = 1400,height = 500) + +VlnPlot( + object = signac, + features = c('pct_reads_in_peaks', 'peak_region_fragments', + 'TSS.enrichment', 'blacklist_ratio', 'nucleosome_signal'), + pt.size = 0.001, + ncol = 5 +) +dev.off() + +# Filter outliers +signac <- subset( + x = signac, + subset = peak_region_fragments > 3000 & + peak_region_fragments < 20000 & + pct_reads_in_peaks > 15 & + blacklist_ratio < 0.05 & + nucleosome_signal < 4 & + TSS.enrichment > 2 +) + +signac + + + + diff --git a/R_src/actinnSeuratCL.R b/R_src/actinnSeuratCL.R new file mode 100644 index 0000000000000000000000000000000000000000..b9e1d0b3468bd8c7595261c42d125cfbd11face9 --- /dev/null +++ b/R_src/actinnSeuratCL.R @@ -0,0 +1,188 @@ +## ----------------------------------------------------------------------------- +## Libraries +## ----------------------------------------------------------------------------- +print("Loading library") + +library(Seurat) +library(getopt) +library(scales) +library(RColorBrewer) +library(plyr) +library(ggplot2) +library(gridExtra) +library(cowplot) +library(reshape2) +library(cowplot) +library(gridExtra) + +source("R_src/do_consensus.R") +source("R_src/funForAnalysisSeurat.R") + +theme_set(theme_classic()) + + + + +spec = matrix(c( + 'help', 'h', 0, "logical", "Help about the program", + 'inputRDS', 'i', 1, "character", "REQUIRED: 10X data ordered by monocle (.RDS generated by orderCL.R).", + 'actinnPathGMP', 'g', 1, "character", "Path to actinn multiple run directory", + 'actinnPathProg', 'p', 1, "character", "Path to actinn multiple run directory", + 'signatures', 's', 1, "character", "Signature files from nestorowa analysis", + 'outdir', 'o',1, "character", 'Outdir path (default ./)' +), byrow=TRUE, ncol=5); + +opt = getopt(spec) + + +args <- commandArgs() +if ( !is.null(opt$help) | is.null(opt$inputRDS)) { + cat("Gene filtering of a gbm cds ordered. For scenic use, two filters") + cat(getopt(spec, usage=TRUE)) + q(status=1) +} + +if (is.null(opt$outdir)) { + opt$outdir = "./" +} +outdir <- opt$outdir + +# Data for test +# opt$inputRDS <- "/shared/projects/scRNA_HSPC_Aging/GMP_PRARA/output/RNA/streamAnalysisSeurat/seuratPseudotime.rds" +# opt$actinnPathGMP <- "/shared/projects/scRNA_HSPC_Aging/GMP_PRARA/output/RNA/multiple_Actinn_Nestorowa_gmp/" +# opt$actinnPathProg <- "/shared/projects/scRNA_HSPC_Aging/GMP_PRARA/output/RNA/multiple_Actinn_Nestorowa_prog/" +# opt$signatures <- "/shared/projects/scRNA_HSPC_Aging/GMP_PRARA/output/RNA/Nestorowa/progenitors_signature.csv" + +dir.create(outdir, recursive = T) + +print("Loading files") +# Read Seurat file +seurat <- readRDS(opt$inputRDS) + +# Read and do actinn consensus +actinn_consensus_gmp <- do_consensus(opt$actinnPathGMP) +actinn_consensus_prog <- do_consensus(opt$actinnPathProg) + +# Read signatures list +signatures <- read.table(opt$signatures, sep = ",", header = T) + +## Color palette + +colorTreatment <- c("#664CFF", "#FF8000") +colorCluster <- c("#E69F00", "#CC79A7", "#0072B2", "#009E73", "#D55E00", "#56B4E9") + + +# Add actinn result as meta data + +seurat$cellType_Actinn_GMP <- actinn_consensus_gmp[rownames(seurat@meta.data),"Consensus"] +seurat$cellType_Actinn_GMP <- factor(seurat$cellType_Actinn_GMP, levels = c("GMP_broad", "CMP_broad", "MEP_broad")) + +seurat$cellType_Actinn_Prog <- actinn_consensus_prog[rownames(seurat@meta.data),"Consensus"] +seurat$cellType_Actinn_Prog <- factor(seurat$cellType_Actinn_Prog, levels = c("Prog", "HSPC")) + +# Calculate signature score + +for(sig in 1:ncol(signatures)){ + print(names(signatures[sig])) + seurat <- AddModuleScore(seurat, features = list(signatures[,sig]), name = names(signatures[sig])) + colnames(seurat@meta.data)[which(colnames(seurat@meta.data) == paste0(names(signatures[sig]), 1))] <- names(signatures[sig]) +} + +seurat[["Signature_nesto"]] <- CreateAssayObject(data = t(as.matrix(seurat@meta.data[,names(signatures)]))) + + +## Plotting results of Actinn transfer +# Cell size UMAP 0.4 + +## Prog transfer and signature ## +## We only look at Prog signature, the transfer result is not convincing + +# UMAP +# UMAP_actinn_prog <- DimPlot(seurat, group.by = "cellType_Actinn_prog") + ggtitle("") + + +# Violin plot +Prog_all_melt <- melt(seurat@meta.data[,c("Prog_signature_up", "Prog_signature_down")]) +Prog_Ctrl_melt <- melt(seurat@meta.data[seurat$condition %in% "Control", c("Prog_signature_up", "Prog_signature_down")]) +Prog_RA_melt <- melt(seurat@meta.data[seurat$condition %in% "Treated", c("Prog_signature_up", "Prog_signature_down")]) + + + +violinplot_prog <- ggplot(Prog_all_melt, aes(x = variable, y = value)) + + geom_violin(fill = "grey") + theme(legend.position = "none", axis.title.x = element_blank()) + + ylab("Signature score") + scale_x_discrete(labels = c("Prog_signature_up" = "Prog", "Prog_signature_down" = "HSPC")) + +violinplot_Ctrl_prog <- ggplot(Prog_Ctrl_melt, aes(x = variable, y = value)) + + geom_violin(fill = "grey") + theme(legend.position = "none", axis.title.x = element_blank()) + + ylab("Signature score") + scale_x_discrete(labels = c("Prog_signature_up" = "Prog", "Prog_signature_down" = "HSPC")) + +violinplot_RA_prog <- ggplot(Prog_RA_melt, aes(x = variable, y = value)) + + geom_violin(fill = "grey") + theme(legend.position = "none", axis.title.x = element_blank()) + + ylab("Signature score") + scale_x_discrete(labels = c("Prog_signature_up" = "Prog", "Prog_signature_down" = "HSPC")) + + +ggsave(paste0(outdir, 'Vln_Prog_HSPC', '.png'), plot = violinplot_prog, device = 'png', path = NULL, + scale = 1, width = 15, height = 10, units = 'cm', dpi = 300) +ggsave(paste0(outdir, 'Vln_Ctrl_Prog_HSPC', '.png'), plot = violinplot_Ctrl_prog, device = 'png', path = NULL, + scale = 1, width = 15, height = 10, units = 'cm', dpi = 300) +ggsave(paste0(outdir, 'Vln_RA_Prog_HSPC', '.png'), plot = violinplot_RA_prog, device = 'png', path = NULL, + scale = 1, width = 15, height = 10, units = 'cm', dpi = 300) +## GMP transfer and signature visualisation ## + +# Violin Plot +GMP_all_melt <- melt(seurat@meta.data[,c("GMP_signature_up", "CMP_signature_up", "MEP_signature_up")]) +GMP_Ctrl_melt <- melt(seurat@meta.data[seurat$condition %in% "Control", c("GMP_signature_up", "CMP_signature_up", "MEP_signature_up")]) +GMP_RA_melt <- melt(seurat@meta.data[seurat$condition %in% "Treated", c("GMP_signature_up", "CMP_signature_up", "MEP_signature_up")]) + +violinplot_gmp <- ggplot(GMP_all_melt, aes(x = variable, y = value)) + + geom_violin(fill = "grey") + theme(legend.position = "none", axis.title.x = element_blank()) + + ylab("Signature score") + scale_x_discrete(labels = c("GMP_signature_up" = "GMP", "CMP_signature_up" = "CMP", "MEP_signature_up" = "MEP")) +violinplot_Ctrl_gmp <- ggplot(GMP_Ctrl_melt, aes(x = variable, y = value)) + + geom_violin(fill = "grey") + theme(legend.position = "none", axis.title.x = element_blank()) + + ylab("Signature score") + scale_x_discrete(labels = c("GMP_signature_up" = "GMP", "CMP_signature_up" = "CMP", "MEP_signature_up" = "MEP")) +violinplot_RA_gmp <- ggplot(GMP_RA_melt, aes(x = variable, y = value)) + + geom_violin(fill = "grey") + theme(legend.position = "none", axis.title.x = element_blank()) + + ylab("Signature score") + scale_x_discrete(labels = c("GMP_signature_up" = "GMP", "CMP_signature_up" = "CMP", "MEP_signature_up" = "MEP")) + + +ggsave(paste0(outdir, 'Vln_GMP_CMP_MEP', '.png'), plot = violinplot_gmp, device = 'png', path = NULL, + scale = 1, width = 15, height = 10, units = 'cm', dpi = 300) +ggsave(paste0(outdir, 'Vln_Ctrl_GMP_CMP_MEP', '.png'), plot = violinplot_Ctrl_gmp, device = 'png', path = NULL, + scale = 1, width = 15, height = 10, units = 'cm', dpi = 300) +ggsave(paste0(outdir, 'Vln_RA_GMP_CMP_MEP', '.png'), plot = violinplot_RA_gmp, device = 'png', path = NULL, + scale = 1, width = 15, height = 10, units = 'cm', dpi = 300) +# UMAP + +# Can't plot after subseting on MEP_broad because there is only one cell +Idents(seurat) <- "cellType_Actinn_GMP" + +seurat_gmp <- subset(seurat, idents = "GMP_broad") +seurat_cmp <- subset(seurat, idents = "CMP_broad") +# seurat_mep <- subset(seurat, idents = "MEP_broad") + +Idents(seurat) <- "FinalCluster" + +UMAP_actinn_GMP <- DimPlot(seurat_gmp, group.by = "cellType_Actinn_GMP", pt.size = 0.4) + + theme(axis.title = element_blank(), axis.text = element_blank(), legend.position = "none") + + scale_color_manual(values = "grey") + +UMAP_actinn_CMP <- DimPlot(seurat_cmp, group.by = "cellType_Actinn_GMP", pt.size = 0.4) + + theme(axis.title = element_blank(), axis.text = element_blank(), legend.position = "none") + + scale_color_manual(values = "grey") + +# UMAP_actinn_MEP <- DimPlot(seurat_mep, group.by = "cellType_Actinn_GMP") + +UMAP_actinn_gmp_split <- DimPlot(seurat, group.by = "cellType_Actinn_GMP", split.by = "cellType_Actinn_GMP", pt.size = 0.4) + + theme(axis.title = element_blank(), axis.text = element_blank(), legend.position = "none") + ggtitle("") + + scale_color_manual(values = c("grey", "grey", "grey")) + + +ggsave(paste0(outdir, 'UMAP_GMP_split', '.png'), plot = UMAP_actinn_gmp_split, device = 'png', path = NULL, + scale = 1, width = 25, height = 15, units = 'cm', dpi = 300) +ggsave(paste0(outdir, 'UMAP_GMP', '.png'), plot = UMAP_actinn_GMP, device = 'png', path = NULL, + scale = 1, width = 15, height = 15, units = 'cm', dpi = 300) +ggsave(paste0(outdir, 'UMAP_CMP', '.png'), plot = UMAP_actinn_CMP, device = 'png', path = NULL, + scale = 1, width = 15, height = 15, units = 'cm', dpi = 300) + + +saveRDS(seurat, paste0(outdir, "seurat_actinn", ".rds")) \ No newline at end of file diff --git a/R_src/assignSurfaceMarkerSeuratCL.R b/R_src/assignSurfaceMarkerSeuratCL.R new file mode 100644 index 0000000000000000000000000000000000000000..8160f87ecf34a7fce840af44a7cff7d5f57ef922 --- /dev/null +++ b/R_src/assignSurfaceMarkerSeuratCL.R @@ -0,0 +1,89 @@ +## ----------------------------------------------------------------------------- +## Libraries +## ----------------------------------------------------------------------------- + +library(Seurat) +library(monocle) +library(plyr) +library(ggplot2) +library(getopt) + +## ----------------------------------------------------------------------------- +## Command line args +## ----------------------------------------------------------------------------- + +spec = matrix(c( + 'help', 'h', 0, "logical", "Help about the program", + "inputSeurat", "i",1, "character", "input seurat object with cluster column names numclust", + 'outdir', 'o',1, "character", 'Outdir path (default ./)', + "surfaceMarkers", "s", 1, "character", "List of surface marker split by \\+", + "namePop", "n", 1, "character", "Name of the double pos population" +), byrow=TRUE, ncol=5); + + +opt = getopt(spec) + +# if help was asked, print a friendly message +# and exit with a non-zero error code + +args <- commandArgs() +if ( !is.null(opt$help) | is.null(opt$inputSeurat)) { + cat("Create influence graph from regulon table") + cat(getopt(spec, usage=TRUE)) + q(status=1) +} + +if(is.null(opt$outdir)){ + opt$outdir <- "./" +} + +theme_set(theme_classic()) + +# Testing option +opt$outdir <- "/shared/projects/scRNA_HSPC_Aging/GMP_PRARA/output/RNA/Seurat4_integration/Analysis/" +opt$inputSeurat <- readRDS("/shared/projects/scRNA_HSPC_Aging/GMP_PRARA/output/RNA/regulonAnalysis/seuratAUC_2.rds") +opt$surfaceMarkers <- "Ly6g+Itgam" +opt$namePop <- "Diff" + +outdir <- opt$outdir + +# Read surface markers +surfaceMarkers <- strsplit(opt$surfaceMarkers, split = "\\+")[[1]] + +# Open seurat then save it as CDS object +seurat <- readRDS(opt$inputSeurat) +seurat_cds <- as.CellDataSet(seurat, assay = "RNA") + + +# Classify cells with surface markers +cth <- newCellTypeHierarchy() + +SM_1_id <- row.names(subset(fData(seurat_cds), gene_short_name == surfaceMarkers[1])) +SM_2_id <- row.names(subset(fData(seurat_cds), gene_short_name == surfaceMarkers[1])) + +cth <- addCellType(cth, opt$namePop, classify_func = + function(x){ x[SM_1_id,] >=1 & x[SM_2_id,] >=1}) + +seurat_cds <- classifyCells(seurat_cds, cth, 0.1) + +# Add cellType information to the seurat object +seurat@meta.data$CellType <- pData(seurat_cds)[rownames(seurat@meta.data), "CellType"] + +# Barplot cellType distribution +summary_diff <- ddply(seurat@meta.data,~FinalCluster + CellType + condition, nrow) +summary_diff$Diff_cell <- factor(summary_diff$CellType, levels = c("Unknown", "Diff")) + +cellType_control_bp <- ggplot(data = data.frame(summary_diff[summary_diff$condition %in% "Control",]), aes(x = FinalCluster, y = V1, fill = CellType)) + + geom_bar(stat = "identity", position = position_dodge()) + + ylab(label = "") + xlab(label = "") + coord_flip() + +cellType_treated_bp <- ggplot(data = data.frame(summary_diff[summary_diff$condition %in% "Treated",]), aes(x = FinalCluster, y = V1, fill = CellType)) + + geom_bar(stat = "identity", position = position_dodge()) + + ylab(label = "") + xlab(label = "") + coord_flip() + +ggsave(paste0(outdir, "BarPlot/", 'BP_CS_Markers_', opt$namePop, '_control', '.png'), plot = cellType_control_bp, device = 'png', path = NULL, + scale = 1, width = 10, height = 10, units = 'cm', dpi = 300) +ggsave(paste0(outdir, "BarPlot/", 'BP_CS_Markers_', opt$namePop, '_treated', '.png'), plot = cellType_treated_bp, device = 'png', path = NULL, + scale = 1, width = 10, height = 10, units = 'cm', dpi = 300) + +saveRDS(seurat, paste0(outdir, "seurat_CT", ".rds")) diff --git a/R_src/castle.R b/R_src/castle.R new file mode 100644 index 0000000000000000000000000000000000000000..cca3dcaf06ed85830930cca142969934c50bcfad --- /dev/null +++ b/R_src/castle.R @@ -0,0 +1,215 @@ +castle <- function(source_seurat,target_seurat,outdir = "./") { + set.seed(2018) + BREAKS=c(-1, 0, 1, 6, Inf) + nFeatures = 100 + + dir.create(outdir,showWarnings = F) + + # 1. Load datasets in scater format: loaded files expected to contain "Large SingleCellExperiment" object + + + # 1.1 convert from seurat (be careful seurat 2) to sce object if seurat object as input + source <- Convert(from = source_seurat, to = "sce") + target <- Convert(from = target_seurat, to = "sce") + + + + + ds1 = t(exprs(source)) + ds2 = t(exprs(target)) + sourceCellTypes = as.factor(colData(source)[,"library_id"]) + + # 2. Unify sets, excluding low expressed genes + source_n_cells_counts = apply(exprs(source), 1, function(x) { sum(x > 0) } ) + print("source OK") + target_n_cells_counts = apply(exprs(target), 1, function(x) { sum(x > 0) } ) + common_genes = intersect( rownames(source)[source_n_cells_counts>10], + rownames(target)[target_n_cells_counts>10] + ) + remove(source_n_cells_counts, target_n_cells_counts) + ds1 = ds1[, colnames(ds1) %in% common_genes] + ds2 = ds2[, colnames(ds2) %in% common_genes] + ds = rbind(ds1[,common_genes], ds2[,common_genes]) + isSource = c(rep(TRUE,nrow(ds1)), rep(FALSE,nrow(ds2))) + remove(ds1, ds2) + + # 3. Highest mean in both source and target + topFeaturesAvg = colnames(ds)[order(apply(ds, 2, mean), decreasing = T)] + + # 4. Highest mutual information in source + topFeaturesMi = names(sort(apply(ds[isSource,],2,function(x) { compare(cut(x,breaks=BREAKS),sourceCellTypes,method = "nmi") }), decreasing = T)) + + # 5. Top n genes that appear in both mi and avg + selectedFeatures = union(head(topFeaturesAvg, nFeatures) , head(topFeaturesMi, nFeatures) ) + + # 6. remove correlated features + tmp = cor(as.matrix(ds[,selectedFeatures]), method = "pearson") + tmp[!lower.tri(tmp)] = 0 + selectedFeatures = selectedFeatures[apply(tmp,2,function(x) any(x < 0.9))] + remove(tmp) + + # 7,8. Convert data from continous to binned dummy vars + # break datasets to bins + dsBins = apply(ds[, selectedFeatures], 2, cut, breaks= BREAKS) + # use only bins with more than one value + nUniq = apply(dsBins, 2, function(x) { length(unique(x)) }) + # convert to dummy vars + ds = model.matrix(~ . , as.data.frame(dsBins[,nUniq>1])) + remove(dsBins, nUniq) + + + # 9. Classify + train = runif(nrow(ds[isSource,]))<0.8 + #slightly different setup for multiclass and binary classification + if (length(unique(sourceCellTypes)) > 2) { + xg=xgboost(data=ds[isSource,][train, ] , + label=as.numeric(sourceCellTypes[train])-1, + objective="multi:softmax", num_class=length(unique(sourceCellTypes)), + eta=0.7 , nthread=5, nround=20, verbose=0, + gamma=0.001, max_depth=5, min_child_weight=10) + } else { + xg=xgboost(data=ds[isSource,][train, ] , + label=as.numeric(sourceCellTypes[train])-1, + eta=0.7 , nthread=5, nround=20, verbose=0, + gamma=0.001, max_depth=5, min_child_weight=10) + } + + + + # 10. Predict + predictedClasses = predict(xg, ds[!isSource, ]) + + target_seurat@meta.data$predicted <- predictedClasses + + + conv <- data.frame(cellType = unique(sourceCellTypes),numCellTypes=unique(as.numeric(sourceCellTypes[train]))-1) + + + for (num in conv$numCellTypes) { + target_seurat@meta.data$predicted[target_seurat@meta.data$predicted==num] <- as.character(conv[which(conv$numCellType== num),"cellType"]) + } + + + + target_seurat@meta.data$predicted <- factor(target_seurat@meta.data$predicted,levels = c("LTHSC","STHSC", "MPP2", "MPP3")) + png(paste(outdir,"/cellTypeLearned_tsne.png", sep = ""),height = 800,width = 800) + TSNEPlot(target_seurat,group.by="predicted") + dev.off() + + + + return(target_seurat) +} + +castleSeurat3 <- function(source_seurat,target_seurat,outdir = "./") { + set.seed(2018) + BREAKS=c(-1, 0, 1, 6, Inf) + nFeatures = 100 + + dir.create(outdir,showWarnings = F) + + # 1. Load datasets in scater format: loaded files expected to contain "Large SingleCellExperiment" object + + + # 1.1 convert from seurat (be careful seurat 3) to sce object if seurat object as input + source <- as.SingleCellExperiment(source_seurat, to = "sce",data = "logcounts") + target <- as.SingleCellExperiment(target_seurat, to = "sce",data = "logcounts") + + + + + ds1 = t(exprs(source)) + ds2 = t(exprs(target)) + sourceCellTypes = as.factor(colData(source)[,"library_id"]) + + # 2. Unify sets, excluding low expressed genes + source_n_cells_counts = apply(exprs(source), 1, function(x) { sum(x > 0) } ) + print("source OK") + target_n_cells_counts = apply(exprs(target), 1, function(x) { sum(x > 0) } ) + common_genes = intersect( rownames(source)[source_n_cells_counts>10], + rownames(target)[target_n_cells_counts>10] + ) + remove(source_n_cells_counts, target_n_cells_counts) + ds1 = ds1[, colnames(ds1) %in% common_genes] + ds2 = ds2[, colnames(ds2) %in% common_genes] + ds = rbind(ds1[,common_genes], ds2[,common_genes]) + isSource = c(rep(TRUE,nrow(ds1)), rep(FALSE,nrow(ds2))) + remove(ds1, ds2) + + # 3. Highest mean in both source and target + topFeaturesAvg = colnames(ds)[order(apply(ds, 2, mean), decreasing = T)] + + # 4. Highest mutual information in source + topFeaturesMi = names(sort(apply(ds[isSource,],2,function(x) { compare(cut(x,breaks=BREAKS),sourceCellTypes,method = "nmi") }), decreasing = T)) + + # 5. Top n genes that appear in both mi and avg + selectedFeatures = union(head(topFeaturesAvg, nFeatures) , head(topFeaturesMi, nFeatures) ) + + # 6. remove correlated features + tmp = cor(as.matrix(ds[,selectedFeatures]), method = "pearson") + tmp[!lower.tri(tmp)] = 0 + selectedFeatures = selectedFeatures[apply(tmp,2,function(x) any(x < 0.9))] + remove(tmp) + + # 7,8. Convert data from continous to binned dummy vars + # break datasets to bins + dsBins = apply(ds[, selectedFeatures], 2, cut, breaks= BREAKS) + # use only bins with more than one value + nUniq = apply(dsBins, 2, function(x) { length(unique(x)) }) + # convert to dummy vars + ds = model.matrix(~ . , as.data.frame(dsBins[,nUniq>1])) + remove(dsBins, nUniq) + + + # 9. Classify + train = runif(nrow(ds[isSource,]))<0.8 + #slightly different setup for multiclass and binary classification + if (length(unique(sourceCellTypes)) > 2) { + xg=xgboost(data=ds[isSource,][train, ] , + label=as.numeric(sourceCellTypes[train])-1, + objective="multi:softmax", num_class=length(unique(sourceCellTypes)), + eta=0.7 , nthread=5, nround=20, verbose=0, + gamma=0.001, max_depth=5, min_child_weight=10) + } else { + xg=xgboost(data=ds[isSource,][train, ] , + label=as.numeric(sourceCellTypes[train])-1, + eta=0.7 , nthread=5, nround=20, verbose=0, + gamma=0.001, max_depth=5, min_child_weight=10) + } + + + + # 10. Predict + predictedClasses = predict(xg, ds[!isSource, ]) + + target_seurat@meta.data$predicted <- predictedClasses + + + conv <- data.frame(cellType = unique(sourceCellTypes),numCellTypes=unique(as.numeric(sourceCellTypes[train]))-1) + + + for (num in conv$numCellTypes) { + target_seurat@meta.data$predicted[target_seurat@meta.data$predicted==num] <- as.character(conv[which(conv$numCellType== num),"cellType"]) + } + + + + target_seurat@meta.data$predicted <- factor(target_seurat@meta.data$predicted,levels = c("LTHSC","STHSC", "MPP2", "MPP3")) + png(paste(outdir,"/cellTypeLearned_tsne.png", sep = ""),height = 800,width = 800) + DimPlot(target_seurat,group.by ="predicted") + dev.off() + + + + return(target_seurat) +} + + +transfertToMonocle <- function(target_seurat,target_monocle,outdir = "./") { + pData(target_monocle)$predicted <- target_seurat@meta.data$predicted + png(paste(outdir,"/cellTypeLearned_ddrtree.png", sep = ""),height = 800,width = 800) + print(plot_cell_trajectory(target_monocle,color_by = "predicted")) + dev.off() + return(target_monocle) +} + diff --git a/R_src/castleSeurat3CL.R b/R_src/castleSeurat3CL.R new file mode 100644 index 0000000000000000000000000000000000000000..e2d8cc61e7d82e99afa53a47e5719bf70ae13e1e --- /dev/null +++ b/R_src/castleSeurat3CL.R @@ -0,0 +1,59 @@ +suppressMessages(library(gProfileR)) +suppressMessages(library(RColorBrewer)) +require(scales) +suppressMessages(library(monocle)) +suppressMessages(library(Seurat)) +suppressMessages(library(scater)) +suppressMessages(library(xgboost)) +suppressMessages(library(igraph)) +suppressMessages(library(getopt)) + +source("R_src/getSigPerCells.R") +source("R_src/Enrichment.R") +source("R_src/castle.R") + +spec = matrix(c( + 'help', 'h', 0, "logical", "Help about the program", + 'sourceRDS', 'i', 1, "character", "REQUIRED: 10X data to trained classifier prepared as Seurat object (.RDS generated by RodriguezAnalysis.R).", + 'targetRDS', 't', 1, "character", "REQUIRED: 10X data targeted prepared as seurat object (.RDS generated by SeuratCL.R (filtered, normalized)).", + 'targetMonocleRDS', 'm', 1, "character", "REQUIRED: 10X data targeted prepared as monocle object (.RDS generated by seuratMonocleOrdering.R).", + 'outdir', 'o',1, "character", 'Outdir path (default ./)' +), byrow=TRUE, ncol=5); + +opt = getopt(spec) + +if ( !is.null(opt$help) | is.null(opt$sourceRDS)) { + cat("Perform perform cell type assignation using a source dataset and castle XGradientBoost tool") + cat(getopt(spec, usage=TRUE)) + q(status=1) +} + + +source_seurat <- readRDS(opt$sourceRDS) +target_seurat <- readRDS(opt$targetRDS) + + + + seurat <- castleSeurat3(source_seurat,target_seurat,outdir = opt$outdir) + + if (!is.null(opt$targetMonocleRDS)) { + target_monocle <- readRDS(opt$targetMonocleRDS) + monocle <- transfertToMonocle(seurat,target_monocle,outdir = opt$outdir) + saveRDS(monocle,paste(opt$outdir,"/monocleTrainedCt.rds",sep ="")) + + } + + png(paste(opt$outdir,"/CellTypeLearnedPropGG.png",sep = "")) + pie <- ggplot(seurat@meta.data, + aes(x = factor(1), fill = factor(predicted))) + geom_bar(width = 1) + pie <- pie + coord_polar(theta = "y") + + theme(axis.title.x = element_blank(), axis.title.y = element_blank()) + print(pie) + dev.off() + + png(paste(opt$outdir,"/CellTypeLearnedProp.png",sep = "")) + pie(table(seurat@meta.data$predicted),labels = names(table(seurat@meta.data$predicted)),col = hue_pal()(4),main = paste("predicted prop")) + dev.off() + + saveRDS(seurat,paste(opt$outdir,"/seuratTrainedCt.rds",sep = "")) + diff --git a/R_src/computeDiffFun.R b/R_src/computeDiffFun.R new file mode 100644 index 0000000000000000000000000000000000000000..e0c2cc69611dcbe98c298ed4451a121510fa1d84 --- /dev/null +++ b/R_src/computeDiffFun.R @@ -0,0 +1,218 @@ +#Functions to compute difference of feature in seurat object (because Seurat compute logFC) + +getExtAvgDiff <- function(row) { + res <- NA + if(row[1] > 0 & row[2] > 0) { + res <- min(c(row)) + } + if(row[1] < 0 & row[2] < 0) { + res <- max(c(row)) + } + return(res) +} + +featureDiff <- function(seurat,cells.1,cells.2,feature) { + data <- GetAssayData(seurat,slot = "data") + total.diff <- mean(data[feature,cells.1]) - mean(data[feature,cells.2]) + return(total.diff) +} + + +getTrueDiff <- function (seurat,table,colIdent = "numclust",suffix = "",colTest="AGE") { + + if(!is.null(levels(seurat[[colTest]]))) { + factors <- levels(seurat@meta.data[,colTest]) + } else { + factors <- unique(seurat@meta.data[,colTest]) + } + factors <- as.vector(factors) + + table[,paste("avg_diff",suffix,sep = "")] <- NA + + for (rm in rownames(table)) { + feature <- table[rm,"Gene"] + cells.1 <- colnames(seurat)[which(seurat[[colIdent]] == table[rm,colIdent] & seurat[[colTest]] == factors[1])] + cells.2 <- colnames(seurat)[which(seurat[[colIdent]] == table[rm,colIdent] & seurat[[colTest]] == factors[2])] + table[rm,paste("avg_diff",suffix,sep = "")] <- featureDiff(seurat,cells.1,cells.2,feature) + } + + return(table) +} + + +## Test functions + +FindMarkerPerClustGroupVar <- function(cluster, + hspc.combined, + condition ="AGE", ## only var with two factors allowed + grouping.var = "platform", + max.cells.per.ident = Inf, + filterOnPadj = T, + logfc.threshold = 0.25, + min.pct = 0.1, + keepDiverging = F, + test.use = "wilcox", + identCol = "numclust", + pseudocount.use = 1, + computeTrueDiff= F) { + + clusterCondition <- paste0("cluster.",condition) + if(!is.null(levels(seurat@meta.data[,condition]))) { + factors <- levels(seurat@meta.data[,condition]) + } else { + factors <- unique(seurat@meta.data[,condition]) + } + + hspc.combined@meta.data[,clusterCondition] <- paste(hspc.combined@meta.data[,identCol], + hspc.combined@meta.data[,condition], sep = "_") + + test <- table(hspc.combined@meta.data[,clusterCondition], hspc.combined@meta.data[,grouping.var]) + groups = unique(hspc.combined@meta.data[,grouping.var]) + + ident1 = paste0(cluster,"_",factors[1]) + ident2 = paste0(cluster,"_",factors[2]) + + + if( (test[ident1,groups[1]]>3 & test[ident1,groups[2]]>3) & + (test[ident2,groups[1]]>3&test[ident2,groups[2]]>3)) { + + Idents(object = hspc.combined) <- clusterCondition + + + + + markers <- FindConservedMarkers(hspc.combined, + assay = DefaultAssay(hspc.combined), + grouping.var = grouping.var, + test.use=test.use, + ident.1 = ident1, + ident.2 = ident2, + pseudocount.use = pseudocount.use, + min.pct = min.pct, + logfc.threshold = logfc.threshold, + max.cells.per.ident = max.cells.per.ident) + + print(dim(markers)) + #Add a column the min abs(logFC) + markers$min_avg_logFC <- NA + + if(!keepDiverging) { + #print("keep only markers with conserved logfc sign") + markers <- markers[which(markers[,paste0(groups[1],"_avg_logFC")]/markers[,paste0(group[2],"_avg_logFC")] > 0),] + } else { + #print("combined p_val of diverging markers between two batch are set to 1 and their min_avg_logfc to 0") + markers[which(markers[,paste0(groups[1],"_avg_logFC")]/markers[,paste0(group[2],"_avg_logFC")] > 0),"min_avg_logFC"] <- 0 + markers[which(markers[,paste0(groups[1],"_avg_logFC")]/markers[,paste0(group[2],"_avg_logFC")] > 0),"minimump_p_val"] <- 1 + } + + for (g in rownames(markers)) { + if(markers[g,paste0(groups[1],"_avg_logFC")] < 0) { + markers[g,"min_avg_logFC"] <- max(markers[g,c(paste0(groups[1],"_avg_logFC"),paste0(groups[2],"_avg_logFC"))]) + } else { + markers[g,"min_avg_logFC"] <- min(markers[g,c(paste0(groups[1],"_avg_logFC"),paste0(groups[2],"_avg_logFC"))]) + } + } + + markers$Cluster <- paste0(ident1,"_up") + markers$Cluster[which(markers$min_avg_logFC < 0)] <- paste0(ident1,"_down") + markers$Gene <- rownames(markers) + markers <- markers[order(markers$min_avg_logFC),] + + if(computeTrueDiff) { + markers$numclust <- str_split_fixed(markers$Cluster,pattern = "_",n=3)[,1] + + + markers <- getTrueDiff(seurat[,which(seurat[[grouping.var]] == group[1])], + markers, + colIdent = "numclust", + colTest = condition, + suffix = paste0("_",group[1])) + + markers <- getTrueDiff(seurat[,which(seurat[[grouping.var]] == group[2])], + markers, + colIdent = "numclust", + colTest = condition, + suffix = paste0("_",group[2])) + + + markers$min_avg_diff <- apply(markers[,c(paste0("avg_diff_",group[1]),paste0("avg_diff_",group[2]))], + 1, + FUN=getExtAvgDiff) + } + + + if (filterOnPadj) { + markers <- markers[which(markers$A_p_val_adj < 0.05 & markers$B_p_val_adj < 0.05),] + } + }else { + markers <- NULL + } + return(markers) + +} + +FindMarkerPerClust <- function(cluster, + hspc.combined, + condition ="AGE", ## only var with two factors allowed + max.cells.per.ident = Inf, + logfc.threshold = 0.25, + min.pct = 0.1, + test.use = "wilcox", + identCol = "numclust", + pseudocount.use = 1, + computeTrueDiff = F) { + + clusterCondition <- paste0("cluster.",condition) + if(!is.null(levels(seurat@meta.data[,condition]))) { + factors <- levels(seurat@meta.data[,condition]) + } else { + factors <- unique(seurat@meta.data[,condition]) + } + + hspc.combined@meta.data[,clusterCondition] <- paste(hspc.combined@meta.data[,identCol], + hspc.combined@meta.data[,condition], sep = "_") + + test <- table(hspc.combined@meta.data[,clusterCondition]) + + ident1 = paste0(cluster,"_",factors[1]) + ident2 = paste0(cluster,"_",factors[2]) + + + if( test[ident1]>3 & test[ident2]>3) { + + Idents(object = hspc.combined) <- clusterCondition + + markers <- FindMarkers(hspc.combined, + assay = DefaultAssay(hspc.combined), + test.use=test.use, + ident.1 = ident1, + ident.2 = ident2, + pseudocount.use = pseudocount.use, + min.pct = min.pct, + logfc.threshold = logfc.threshold, + max.cells.per.ident = max.cells.per.ident) + + markers$Cluster <- paste0(ident1,"_up") + markers$Cluster[which(markers$avg_logFC < 0)] <- paste0(ident1,"_down") + markers$Gene <- rownames(markers) + markers <- markers[order(markers$avg_logFC),] + + if(computeTrueDiff) { + markers$numclust <- str_split_fixed(markers$Cluster,pattern = "_",n=3)[,1] + + + markers <- getTrueDiff(seurat, + markers, + colIdent = "numclust", + colTest = condition + ) + + } + + } else { + markers <- NULL + } + return(markers) + +} + diff --git a/R_src/data_preparation.R b/R_src/data_preparation.R new file mode 100644 index 0000000000000000000000000000000000000000..70f735b86e0481e39574772038e25a770b8f6afe --- /dev/null +++ b/R_src/data_preparation.R @@ -0,0 +1,133 @@ +# R function to prepare single cell data for monocle & Seurat + +getCellCyclePhases <- function(gbm_cds,outdir = "./") { + + dir.create(path = outdir,recursive = T,showWarnings = F) + gene_count_matrix <- as.matrix(exprs(gbm_cds)) + set.seed(100) + mm.pairs <- readRDS(system.file("exdata", "mouse_cycle_markers.rds", package="scran")) + assignments <- cyclone(gene_count_matrix, mm.pairs, gene.names=rownames(gene_count_matrix)) + + ## add cell cycle phase to pData + pData(gbm_cds)$phases <- assignments$phases + pData(gbm_cds)$G1_score <- assignments$scores$G1 + pData(gbm_cds)$G2M_score <- assignments$scores$G2M + pData(gbm_cds)$S_score <- assignments$scores$S + pData(gbm_cds)[which(pData(gbm_cds)$phases=="G1"),"phases"] <- "G1_G0" + pData(gbm_cds)[which(pData(gbm_cds)$phases=="G2M"),"phases"] <- "G2_M" + + + png(paste(outdir,"/cell_cycle_phases_assignment_on_umi.png",sep = "")) + plot(assignments$score$G1, assignments$score$G2M, + xlab="G1 score", ylab="G2/M score", pch=16) + dev.off() + + print("Cell cycle phases added..") + return(gbm_cds) +} + +getCellCyclePhasesSeurat <- function(seurat,outdir = "./") { + + dir.create(path = outdir,recursive = T,showWarnings = F) + gene_count_matrix <- as.matrix(GetAssayData(seurat,slot = 'counts',assay = "RNA")) + set.seed(100) + mm.pairs <- readRDS(system.file("exdata", "mouse_cycle_markers.rds", package="scran")) + assignments <- cyclone(gene_count_matrix, mm.pairs, gene.names=rownames(gene_count_matrix)) + + ## add cell cycle phase to pData + seurat@meta.data$phases <- assignments$phases + seurat@meta.data$G1_score <- assignments$scores$G1 + seurat@meta.data$G2M_score <- assignments$scores$G2M + seurat@meta.data$S_score <- assignments$scores$S + seurat@meta.data[which(seurat@meta.data$phases=="G1"),"phases"] <- "G1_G0" + seurat@meta.data[which(seurat@meta.data$phases=="G2M"),"phases"] <- "G2_M" + + + png(paste(outdir,"/cell_cycle_phases_assignment_on_umi.png",sep = "")) + plot(assignments$score$G1, assignments$score$G2M, + xlab="G1 score", ylab="G2/M score", pch=16) + dev.off() + + print("Cell cycle phases added..") + return(seurat) +} + + + +addPercentMitoch <- function(gbm_cds) { #in fact this is fraction of mitochondrial transcripts + mitoGenes <- grep(pattern = "mt-",x=fData(gbm_cds)$gene_short_name,value = T) + mitoGenesEnsembl <- rownames(gbm_cds)[which(is.element(el = fData(gbm_cds)$gene_short_name,set = mitoGenes))] + percentMito <- Matrix::colSums(as.matrix(exprs(gbm_cds))[mitoGenesEnsembl, ])/Matrix::colSums(as.matrix(exprs(gbm_cds))) + pData(gbm_cds)$percentMito <- percentMito + return(gbm_cds) +} + + +#Filtering low-quality cells + +filterCells <- function(gbm_cds,outdir="./",num_cells_expressed=10,min_expr=0.1,propMitochFilter=NULL) { + + dir.create(path = outdir,recursive = T,showWarnings = F) + + #Set detection threshold + gbm_cds <- detectGenes(gbm_cds, min_expr = min_expr) + + #Define gene expressed as gene detected in more than n cell (only use for plotting distribution) + expressed_genes <- row.names(subset(fData(gbm_cds), + num_cells_expressed >= num_cells_expressed)) + + #filtering cells as explained in monocle doc, cutting distribution tails + #eg. remove cells with no RNA or too much RNA + + pData(gbm_cds)$Total_mRNAs <- Matrix::colSums(exprs(gbm_cds)) + + png(paste(outdir,"/densityTotalmRNA_raw.png",sep ="")) + qplot(Total_mRNAs, data = pData(gbm_cds), geom ="density") + dev.off() + + upper_bound <- 10^(mean(log10(pData(gbm_cds)$Total_mRNAs)) + + 2*sd(log10(pData(gbm_cds)$Total_mRNAs))) + lower_bound <- 10^(mean(log10(pData(gbm_cds)$Total_mRNAs)) - + 2*sd(log10(pData(gbm_cds)$Total_mRNAs))) + + png(paste(outdir,"/densityWithFirstFilter.png",sep ="")) + print(qplot(Total_mRNAs, data = pData(gbm_cds), geom ="density") + + geom_vline(xintercept = lower_bound) + + geom_vline(xintercept = upper_bound)) + dev.off() + + gbm_cds <- gbm_cds[,pData(gbm_cds)$Total_mRNAs > lower_bound & + pData(gbm_cds)$Total_mRNAs < upper_bound] + + # redefining expressed gene after cells filtering + gbm_cds <- detectGenes(gbm_cds, min_expr = min_expr) + expressed_genes <- row.names(subset(fData(gbm_cds),num_cells_expressed >= num_cells_expressed)) #genes expressed in at least 10 cells of the data set. + + # Log-transform each value in the expression matrix. + L <- log(exprs(gbm_cds[expressed_genes,])) + + # Standardize each gene, so that they are all on the same scale, + # Then melt the data with plyr so we can plot it easily + melted_dens_df <- reshape2::melt(Matrix::t(scale(Matrix::t(L)))) + + # Plot the distribution of the standardized gene expression values. + png(paste(outdir,"/standardized_distribution.png",sep ="")) + print(qplot(value, geom = "density", data = melted_dens_df) + + stat_function(fun = dnorm, size = 0.5, color = 'red') + + xlab("Standardized log(UMIcounts)") + + ylab("Density")) + dev.off() + + ## Visualize percentage of mitochondiral genes + gbm_cds <- addPercentMitoch(gbm_cds) + + png(paste(outdir,"/percent_mito.png",sep ="")) + plot(hist(gbm_cds$percentMito*100,breaks = 100),main = "percentage of mitochondrial transcripts") + abline(v=propMitochFilter*100) + dev.off() + + return(gbm_cds) + +} + + diff --git a/R_src/do_consensus.R b/R_src/do_consensus.R new file mode 100644 index 0000000000000000000000000000000000000000..9e50b986eddb951f96d941f855591655baabe0bf --- /dev/null +++ b/R_src/do_consensus.R @@ -0,0 +1,26 @@ +do_consensus <- function(path){ + +# Opening the predicted files +predicted_files <- list.files(path) +predicted_names <- strsplit(predicted_files, split = ".txt") +predicted_list <- list() +for(i in 1:length(predicted_files)){ + name <- predicted_names[[i]] + print(name) + predicted_list[[name]] <- assign(predicted_names[[i]], read.table(paste0(path, predicted_files[i]), row.names = 1, header = T)) +} +# adding all the columns to a single table +predicted_table <- do.call("cbind", predicted_list) +# convert to dataframe +predicted_data <- data.frame(predicted_table) +# transpose +predicted_data <- t(predicted_data) +# creation "Consensus" +predicted_consensus=data.frame(Consensus = 1:ncol(predicted_data)) +for (i in 1:ncol(predicted_data)){ + table_compteur <- table(predicted_data[,i]) + predicted_consensus[i,1] <- names(which.max(table_compteur)) + rownames(predicted_consensus)[i] <- colnames(predicted_data)[i] +} +return(predicted_consensus) +} diff --git a/R_src/filterForDcaScenicCL.R b/R_src/filterForDcaScenicCL.R new file mode 100644 index 0000000000000000000000000000000000000000..47b93ef58c87a29a433534890d91511a91036615 --- /dev/null +++ b/R_src/filterForDcaScenicCL.R @@ -0,0 +1,85 @@ +# Script to make analyzis of pseudotime with a gbm_cds ordered +# command line interface + +## ----------------------------------------------------------------------------- +## Libraries +## ----------------------------------------------------------------------------- + + +suppressMessages(library(monocle)) +suppressMessages(library(getopt)) +suppressMessages(library(biomaRt)) + +source("R_src/Enrichment.R") + + + +spec = matrix(c( + 'help', 'h', 0, "logical", "Help about the program", + 'inputData', 'i', 1, "character", "REQUIRED: matrix data (.tsv generated by the report) from 10X data ordered by monocle.", + 'outdir', 'o',1, "character", 'Outdir path (default ./)', + 'firstFilter', 'f',1,"numeric", 'Keep only the genes with at least N = f(X) 2 mean imputed counts across all samples (X=2 by default) \n(e.g. the total number the gene would have, if it was expressed with a value of X in 1% of the cells). \nAdjust X value according to the dataset (it will depend on the dataset units, e.g. UMI, TPMs???).', + 'secondFilter', 's',1, "numeric", 'Keep the genes that are detected in at least X% of the cells (X=1% by default)' +), byrow=TRUE, ncol=5); + +opt = getopt(spec) + + + +# if help was asked, print a friendly message +# and exit with a non-zero error code + +args <- commandArgs() +if ( !is.null(opt$help) | is.null(opt$inputData)) { + cat("Gene filtering of a gbm cds ordered. For scenic use, two filters") + cat(getopt(spec, usage=TRUE)) + q(status=1) +} + +#set default arguments values + +if (is.null(opt$outdir)) { + opt$outdir = "./" +} +if (is.null(opt$firstFilter)) { + opt$firstFilter = 2 +} +if (is.null(opt$secondFilter)) { + opt$secondFilter = 0.01 +} + +data <- read.table(opt$inputData,sep = "\t",header = T,row.names = 1) + +genes <- list() +genes$names <- rownames(data) + + +nCellsPerGene <- apply(data, 1, function(x) sum(x>0)) +nCountsPerGene <- apply(data, 1, sum) +summary(nCellsPerGene) +sum(data>0) / sum(data==0) + +#first filter +minReads <- opt$firstFilter*.01*ncol(data) +genesLeftMinReads <- names(nCountsPerGene)[which(nCountsPerGene > minReads)] +length(genesLeftMinReads) + +#second filter +minSamples <- ncol(data)*opt$secondFilter +nCellsPerGene2 <- nCellsPerGene[genesLeftMinReads] +genesLeftMinCells <- names(nCellsPerGene2)[which(nCellsPerGene2 > minSamples)] +length(genesLeftMinCells) + + + +t_data <- t(data) + +print(head(genesLeftMinCells)) + +write.table(t_data[,c(genesLeftMinCells)],file = paste(opt$outdir,"/expressionRawCountFilteredForScenic.tsv",sep=""),sep = '\t',quote = F) + + + + + + diff --git a/R_src/filterForScenicCL.R b/R_src/filterForScenicCL.R new file mode 100644 index 0000000000000000000000000000000000000000..b1175633ca29a5b0836a1783648623e92ca2a831 --- /dev/null +++ b/R_src/filterForScenicCL.R @@ -0,0 +1,120 @@ +# Script to make analyzis of pseudotime with a gbm_cds ordered +# command line interface + +## ----------------------------------------------------------------------------- +## Libraries +## ----------------------------------------------------------------------------- +# http://stackoverflow.com/questions/4090169/elegant-way-to-check-for-missing-packages-and-install-them + + +suppressMessages(library(monocle)) +suppressMessages(library(getopt)) + + +spec = matrix(c( + 'help', 'h', 0, "logical", "Help about the program", + 'inputRDS', 'i', 1, "character", "REQUIRED: 10X data ordered by monocle (.RDS generated by orderCL.R).", + 'outdir', 'o',1, "character", 'Outdir path (default ./)', + 'firstFilter', 'f',1,"numeric", 'Keep only the genes with at least N = f(X) UMI counts across all samples (X=2 by default) \n(e.g. the total number the gene would have, if it was expressed with a value of X in 1% of the cells). \nAdjust X value according to the dataset (it will depend on the dataset units, e.g. UMI, TPMs???).', + 'secondFilter', 's',1, "numeric", 'Keep the genes that are detected in at least X% of the cells (X=1% by default)', + 'markerTable', 'm',1, "character", "Markers table for the identification of transcription factors", + 'mouseTF','t', 1, 'character', 'Mouse transcription factors list to identify TF in markers table' +), byrow=TRUE, ncol=5); + +opt = getopt(spec) + + + +# if help was asked, print a friendly message +# and exit with a non-zero error code + +args <- commandArgs() +if ( !is.null(opt$help) | is.null(opt$inputRDS)) { + cat("Gene filtering of a gbm cds ordered. For scenic use, two filters") + cat(getopt(spec, usage=TRUE)) + q(status=1) +} + +#set default arguments values + +if (is.null(opt$outdir)) { + opt$outdir = "./" +} +if (is.null(opt$firstFilter)) { + opt$firstFilter = 2 +} +if (is.null(opt$secondFilter)) { + opt$secondFilter = 0.01 +} + +if(!is.null(opt$markerTable)) { + print("find TF in marker table...") + markerTable <- read.table(opt$markerTable,sep = "\t",header = T) + TFlist <- read.table(opt$mouseTF,header = F) + TF_markers <- data.frame(external_gene_name = markerTable$gene[which(markerTable$gene %in% TFlist$V1)]) + + #Add tf from bonzanni plus Gfi1b + TF_bonzani <- data.frame(external_gene_name = c("Spi1","Tal1","Zfpm1","Cbfa2t3","Erg","Fli1","Gata1","Gata2","Hhex","Runx1","Smad6","Zbtb16","Gfi1b")) + TF_markers <- rbind(TF_markers,TF_bonzani) + TF_markers <- unique(TF_markers$external_gene_name) + + write.table(TF_markers,paste(opt$outdir,"/markers_and_bonzanni_TF.tsv",sep =""), + sep = "\t",quote = F,row.names = F, col.names = F) +} + +gbm_cds <- readRDS(opt$inputRDS) + +# Convert to monocle object if necessary (filtering script initially made for monocle) + +if(is(gbm_cds) == "Seurat") { + #It is easier to pass by a monocle object and then reget the seurat object with the genes filtered + print("input seurat object converting it to monocle") + DefaultAssay(object = gbm_cds) <- "RNA" + + pd <- new("AnnotatedDataFrame", data = gbm_cds@meta.data) + fd <- new("AnnotatedDataFrame", data = data.frame(gene_short_name = rownames(gbm_cds))) + rownames(fd) <- fd$gene_short_name + + + gbm_cds <- newCellDataSet(GetAssayData(gbm_cds,slot = "counts"), + phenoData = pd, + featureData = fd, + lowerDetectionLimit = 0.1, + expressionFamily = negbinomial.size()) + + gbm_cds <- detectGenes(gbm_cds, min_expr = 0.1) + + + +} + +exp <- as.matrix(exprs(gbm_cds)) +geneShortName<-make.unique(as.character(fData(gbm_cds)$gene_short_name)) # to see between make.unique, remove dup.. +rownames(exp) <- geneShortName + +nCellsPerGene <- apply(exp, 1, function(x) sum(x>0)) +nCountsPerGene <- apply(exp, 1, sum) +summary(nCellsPerGene) +sum(exp>0) / sum(exp==0) + +#first filter +minReads <- opt$firstFilter*.01*ncol(exp) +genesLeftMinReads <- names(nCountsPerGene)[which(nCountsPerGene > minReads)] +length(genesLeftMinReads) + +#second filter +minSamples <- ncol(exp)*opt$secondFilter +nCellsPerGene2 <- nCellsPerGene[genesLeftMinReads] +genesLeftMinCells <- names(nCellsPerGene2)[which(nCellsPerGene2 > minSamples)] +length(genesLeftMinCells) + +t_exp <- t(exp) + +print(head(genesLeftMinCells)) + +write.table(t_exp[,genesLeftMinCells],file = paste(opt$outdir,"/expressionRawCountFilteredForScenic.tsv",sep=""),sep = '\t',quote = F) + + + + + diff --git a/R_src/filterSeuratForScenicCL.R b/R_src/filterSeuratForScenicCL.R new file mode 100644 index 0000000000000000000000000000000000000000..7042db13561308c400b3e349bd749955b61bb715 --- /dev/null +++ b/R_src/filterSeuratForScenicCL.R @@ -0,0 +1,88 @@ +# Script to make analyzis of pseudotime with a gbm_cds ordered +# command line interface + +## ----------------------------------------------------------------------------- +## Libraries +## ----------------------------------------------------------------------------- +# http://stackoverflow.com/questions/4090169/elegant-way-to-check-for-missing-packages-and-install-them + + +suppressMessages(library(Seurat)) +suppressMessages(library(getopt)) + + +spec = matrix(c( + 'help', 'h', 0, "logical", "Help about the program", + 'inputRDS', 'i', 1, "character", "REQUIRED: 10X data ordered by monocle (.RDS generated by orderCL.R).", + 'outdir', 'o',1, "character", 'Outdir path (default ./)', + 'firstFilter', 'f',1,"numeric", 'Keep only the genes with at least N = f(X) UMI counts across all samples (X=2 by default) \n(e.g. the total number the gene would have, if it was expressed with a value of X in 1% of the cells). \nAdjust X value according to the dataset (it will depend on the dataset units, e.g. UMI, TPMs???).', + 'secondFilter', 's',1, "numeric", 'Keep the genes that are detected in at least X% of the cells (X=1% by default)', + 'mouseTF','t', 1, 'character', 'Mouse transcription factors list to identify TF in markers table' +), byrow=TRUE, ncol=5); + +opt = getopt(spec) + + + +# if help was asked, print a friendly message +# and exit with a non-zero error code + +args <- commandArgs() +if ( !is.null(opt$help) | is.null(opt$inputRDS)) { + cat("Gene filtering of a gbm cds ordered. For scenic use, two filters") + cat(getopt(spec, usage=TRUE)) + q(status=1) +} + +#set default arguments values + +if (is.null(opt$outdir)) { + opt$outdir = "./" +} +if (is.null(opt$firstFilter)) { + opt$firstFilter = 2 +} +if (is.null(opt$secondFilter)) { + opt$secondFilter = 0.01 +} + +seurat <- readRDS(opt$inputRDS) + +DefaultAssay(object = seurat) <- "RNA" + +exp <- as.matrix(GetAssayData(seurat,slot = "counts")) + + +nCellsPerGene <- apply(exp, 1, function(x) sum(x>0)) +nCountsPerGene <- apply(exp, 1, sum) +summary(nCellsPerGene) +sum(exp>0) / sum(exp==0) + +#first filter +minReads <- opt$firstFilter*.01*ncol(exp) +genesLeftMinReads <- names(nCountsPerGene)[which(nCountsPerGene > minReads)] +length(genesLeftMinReads) + +#second filter +minSamples <- ncol(exp)*opt$secondFilter +nCellsPerGene2 <- nCellsPerGene[genesLeftMinReads] +genesLeftMinCells <- names(nCellsPerGene2)[which(nCellsPerGene2 > minSamples)] +length(genesLeftMinCells) + +t_exp <- t(exp) + +print(head(genesLeftMinCells)) + + + + +t_exp <- t(exp) + +print(head(genesLeftMinCells)) + +write.table(t_exp[,genesLeftMinCells],file = paste(opt$outdir,"/expressionRawCountFilteredForScenic.tsv",sep=""),sep = '\t',quote = F) + + + + + diff --git a/R_src/funForAnalysisSeurat.R b/R_src/funForAnalysisSeurat.R new file mode 100755 index 0000000000000000000000000000000000000000..a24ae690504605424c3ceb5d797f8c69d46919d8 --- /dev/null +++ b/R_src/funForAnalysisSeurat.R @@ -0,0 +1,388 @@ +#Visualize position of a cluster on the UMAP +clusterDistrib_seurat <- function(seurat = seurat, numclust = numclust, metaData = "numclust", flip = FALSE){ + print(numclust) + seurat@meta.data$clusterDis <- FALSE + seurat@meta.data[which(seurat@meta.data[,metaData] == numclust),"clusterDis"] <- TRUE + print(nrow(seurat@meta.data[which(seurat@meta.data$clusterDis == TRUE),])) + seurat@meta.data$clusterDis <- factor(seurat@meta.data$clusterDis, levels = c(TRUE, FALSE)) + if(flip == TRUE){ + plot <- DimPlot(seurat, group.by = "clusterDis", pt.size = 0.01) + labs(title = numclust) + NoLegend() + + theme(axis.title = element_text(size = 6), axis.text = element_text(size = 6), title = element_text(size = 7)) + + scale_colour_manual(values = c("red", "grey")) + + scale_alpha_manual(values = c(1,0)) + coord_flip() + scale_y_reverse() + }else{ + plot <- DimPlot(seurat, group.by = "clusterDis", pt.size = 0.01) + labs(title = numclust) + NoLegend() + + theme(axis.title = element_text(size = 6), axis.text = element_text(size = 6), title = element_text(size = 7)) + + scale_colour_manual(values = c("red", "grey")) + + scale_alpha_manual(values = c(1,0)) + } + return(plot) +} + +clusterDistrib_seurat_cond <- function(seurat = seurat, numclust = numclust, metaData = "numclust", flip = FALSE, color){ + print(numclust) + seurat@meta.data$clusterDis <- "Other" + seurat@meta.data[which(seurat@meta.data[,metaData] == numclust),"clusterDis"] <- numclust + seurat@meta.data$clusterCondDis <- paste(seurat$clusterDis, seurat$condition, sep = "_") + # colorClusterCond <- c(colorGenotype, alpha(colorGenotype, 0.4), rep("grey", 4)) + colorClusterCond <- c(color, rep("grey", 4)) + # print(nrow(seurat@meta.data[which(seurat@meta.data$clusterDis == TRUE),])) + seurat@meta.data$clusterCondDis <- factor(seurat@meta.data$clusterCondDis, + levels = c(paste(numclust, "Young_Wild", sep = "_"), paste(numclust, "Young_Mutant", sep = "_"), + paste(numclust, "Aged_Wild", sep = "_"), paste(numclust, "Aged_Mutant", sep = "_"), + "Other_Young_Wild", "Other_Young_Mutant", "Other_Aged_Wild", "Other_Aged_Mutant")) + names(colorClusterCond) <- levels(seurat$clusterCondDis) + if(flip == TRUE){ + plot <- DimPlot(seurat, group.by = "clusterCondDis", pt.size = 0.1) + labs(title = numclust) + + theme(axis.title = element_text(size = 6), axis.text = element_text(size = 6), title = element_text(size = 7)) + + scale_colour_manual(values = colorClusterCond) + + coord_flip() + scale_y_reverse() + }else{ + plot <- DimPlot(seurat, group.by = "clusterCondDis", pt.size = 0.1) + labs(title = numclust) + + theme(axis.title = element_text(size = 6), axis.text = element_text(size = 6), title = element_text(size = 7)) + + scale_colour_manual(values = colorClusterCond) + } + return(plot) +} + +#Barplot to show distribution of samples along the cluster +getSamplePropPerClustBarplot <- function(hspc.combined) { + clustersampleName <- ddply(hspc.combined@meta.data,~FinalCluster + sampleName,nrow) + nCellSample <- ddply(hspc.combined@meta.data,~sampleName, nrow) + names(nCellSample)[2] <- "TotalCell" + test <- join(x = clustersampleName, y = nCellSample, by = "sampleName") + + test$propSample <- test$V1/test$TotalCell + + propExpect <- table(hspc.combined@meta.data$sampleName)/length(hspc.combined@meta.data$sampleName)[] + propYoungExp <- propExpect[[unique(hspc.combined@meta.data$sampleName)[1]]] + + sampleName <- ggplot(data.frame(test), aes(fill = sampleName,y = propSample, x=FinalCluster)) + + geom_bar( stat="identity", position="fill")+ + scale_fill_manual( values= rev(hue_pal()(length(unique(hspc.combined@meta.data$sampleName)))))+ + scale_y_continuous(name = "Sample (%)", labels = c(0,25,50,75,100))+ + ylab(label = "")+xlab(label = "") + coord_flip()+ + theme(legend.title=element_blank(), axis.text.y = element_text(size = 16)) + return(sampleName) +} + +getGenotypePropPerClustBarplot <- function(hspc.combined) { + clusterGenotype <- ddply(hspc.combined@meta.data,~FinalCluster + Genotype,nrow) + + propExpect <- table(hspc.combined@meta.data$Genotype)/length(hspc.combined@meta.data$Genotype)[] + # propWildExp <- propExpect[[unique(hspc.combined@meta.data$Genotype)[2]]] + propWildExp <- propExpect["Wild"][[1]] + #clusterAGE$numclust <- factor(v$clusterNature , levels = c("")) + clusterGenotype$Genotype <- factor(clusterGenotype$Genotype , levels = c("Mutant","Wild")) + + + Genotype <- ggplot(data.frame(clusterGenotype), aes(fill = Genotype,y = V1, x=FinalCluster)) + + geom_bar( stat="identity", position="fill")+ + scale_fill_manual( values= rev(hue_pal()(length(unique(hspc.combined@meta.data$Genotype)))))+ + scale_y_continuous(name = "Genotype (%)", labels = c(0,25,50,75,100))+ + ylab(label = "")+xlab(label = "") + coord_flip() + geom_hline(yintercept = propWildExp)+ + theme(legend.title=element_blank(), axis.text.y = element_text(size = 16)) + return(Genotype) + +} + +getAGEPropPerClustBarplot_2 <- function(hspc.combined) { + clusterAge <- ddply(hspc.combined@meta.data,~FinalCluster + AGE,nrow) + + propExpect <- table(hspc.combined@meta.data$AGE)/length(hspc.combined@meta.data$AGE)[] + propYoungExp <- propExpect[[unique(hspc.combined@meta.data$AGE)[1]]] + + #clusterAGE$numclust <- factor(v$clusterNature , levels = c("")) + clusterAge$AGE <- factor(clusterAge$AGE , levels = c("Aged","Young")) + + + AGE <- ggplot(data.frame(clusterAge), aes(fill = AGE,y = V1, x=FinalCluster,levels = "Young","Aged")) + + geom_bar( stat="identity", position="fill")+ + scale_fill_manual( values= rev(hue_pal()(length(unique(hspc.combined@meta.data$AGE)))))+ + scale_y_continuous(name = "Age (%)", labels = c(0,25,50,75,100))+ + ylab(label = "")+xlab(label = "") + coord_flip() + geom_hline(yintercept = 1-propYoungExp)+ + theme(legend.title=element_blank(), axis.text.y = element_text(size = 16)) + return(AGE) + +} + +#Bind sampleName and barcode (needed to annotate CR2 data) +tagCell <- function(cds){ + pData(cds)$taggedBarcode <- paste(pData(cds)$sampleName, pData(cds)$barcode, sep = "_") + return(cds) +} + +#Add information if the cell is found with CellRanger2 +getCR2diff <- function(seurat = seurat, allCellCR2){ + uniqueCell <- setdiff(rownames(seurat@meta.data), allCellCR2) + + seurat@meta.data$CellRanger2 <- "Both" + seurat@meta.data[uniqueCell,"CellRanger2"] <- "CellRanger3" + seurat@meta.data[which(is.element(seurat@meta.data$sampleName, set = c("young_Zbtb_B", "old_Zbtb_B"))),"CellRanger2"] <- "NewData" + + clusterCellRanger2 <- ddply(seurat@meta.data,~FinalCluster + CellRanger2,nrow) + + originName <- ggplot(data.frame(clusterCellRanger2), aes(fill = CellRanger2,y = V1, x=FinalCluster)) + + geom_bar( stat="identity", position="fill")+ + scale_fill_manual( values= rev(hue_pal()(length(unique(seurat@meta.data$CellRanger2)))))+ + scale_y_continuous(name = "Sample (%)", labels = c(0,25,50,75,100))+ + ylab(label = "")+xlab(label = "") + coord_flip()+ + theme(legend.title=element_blank()) + return(originName) +} + +#Hypergeometric test and plotting the result with a bp function, use a funtion from funForSeurat.R +getEnrichPopClust <- function(hspc.combined, Xname, Yname, colorX, colorY, metaCol = "AGE"){ + conditionEnrich <- getEnrichAge(hspc.combined = hspc.combined,clustCol = "numclust", metaCol = metaCol) + conditionEnrich <- as.data.frame(t(conditionEnrich)) + conditionEnrich$color <- "black" + conditionEnrich[which(as.numeric(as.vector(conditionEnrich$phyper)) < 0.05 & conditionEnrich$enrich == Xname),"color"] <- colorX + conditionEnrich[which(as.numeric(as.vector(conditionEnrich$phyper)) < 0.05 & conditionEnrich$enrich == Yname),"color"] <- colorY + return(conditionEnrich) +} + +# Allow you to find the markers between 2 conditions for each +MarkersAnalysis <- function(seurat = seurat, analysis){ + seurat@meta.data$Analysis <- paste(seurat@meta.data[,analysis], seurat@meta.data$FinalCluster, sep = "_") + condAnalysis <- unique(seurat@meta.data[,analysis]) + + Idents(seurat) <- "Analysis" + + allMarkers <- data.frame() + for(clustName in unique(seurat@meta.data$FinalCluster)){ + print(clustName) + markersClust <- FindMarkers(object = seurat, ident.1 = paste(condAnalysis[1], clustName, sep = "_"), + ident.2 = paste(condAnalysis[2], clustName, sep = "_"), logfc.threshold = 0.25) + + markersClust[,paste(condAnalysis[1], condAnalysis[2], sep = "_vs_")] <- paste(condAnalysis[1], clustName, "up", sep ="_") + markersClust[which(markersClust$avg_logFC < 0),paste(condAnalysis[1], condAnalysis[2], sep = "_vs_")] <- paste(condAnalysis[1], clustName, "down",sep ="_") + markersClust$Gene <- rownames(markersClust) + markersClust <- markersClust[order(markersClust$avg_logFC),] + markersClust <- markersClust[which(markersClust$p_val_adj < 0.05),] + allMarkers <- rbind(allMarkers, markersClust) + } + return(allMarkers) +} + +getPredictedPerSample <- function(seurat){ + propPredicted <- ddply(seurat@meta.data,~sampleName + predicted, nrow) + predicted <- ggplot(data.frame(propPredicted), aes(fill = predicted,y = V1, x=sampleName)) + + geom_bar( stat="identity", position="fill")+ + scale_fill_manual( values= hue_pal()(length(unique(seurat@meta.data$predicted))))+ + scale_y_continuous(name = "Cell type (%)", labels = c(0,25,50,75,100))+ + ylab(label = "")+xlab(label = "") + + theme(legend.title=element_blank(), axis.text.x = element_text(angle = 75, hjust = 1, size = 16), axis.text.y = element_text(size = 12)) + + return(predicted) +} + +getCellTypePerSample <- function(seurat){ + propPredicted <- ddply(seurat@meta.data,~sampleName + cellType_Actinn, nrow) + predicted <- ggplot(data.frame(propPredicted), aes(fill = cellType_Actinn,y = V1, x=sampleName)) + + geom_bar( stat="identity", position="fill")+ + scale_fill_manual( values= hue_pal()(length(unique(seurat@meta.data$cellType_Actinn))))+ + scale_y_continuous(name = "Cell type (%)", labels = c(0,25,50,75,100))+ + ylab(label = "")+xlab(label = "") + + theme(legend.title=element_blank(), axis.text.x = element_text(angle = 75, hjust = 1, size = 16), axis.text.y = element_text(size = 12)) + + return(predicted) +} + +getPhasesPerSample <- function(seurat){ + propPhases <- ddply(seurat@meta.data,~sampleName + phases, nrow) + phases <- ggplot(data.frame(propPhases), aes(fill = phases,y = V1, x=sampleName)) + + geom_bar( stat="identity", position="fill")+ + scale_fill_manual( values= hue_pal()(length(unique(seurat@meta.data$phases))))+ + scale_y_continuous(name = "Cell type (%)", labels = c(0,25,50,75,100))+ + ylab(label = "")+xlab(label = "") + + theme(legend.title=element_blank(), axis.text.x = element_text(angle = 75, hjust = 1, size = 16), axis.text.y = element_text(size = 12)) + + return(phases) +} + +# getPhasePropPerClustBarplotSplit <- function(hspc.combined, color = hue_pal()(length(unique(hspc.combined@meta.data$phases)))) { +# clusterpredicted <- ddply(hspc.combined@meta.data,~numclust + phases + condition,nrow) +# +# split_plot <- list() +# for(i in unique(clusterpredicted$condition)){ +# print(i) +# predicted <- ggplot(data.frame(clusterpredicted[which(clusterpredicted$condition == i),]), aes(fill = phases,y = V1, x=numclust)) + +# geom_bar( stat="identity", position="fill")+ +# scale_fill_manual( values= color)+ +# scale_y_continuous(name = "Cell type (%)", labels = c(0,25,50,75,100))+ +# ylab(label = "")+xlab(label = "") + coord_flip()+ +# labs(title = i) + theme(plot.title = element_text(size = 10), axis.text.x = element_text(size = 12)) +# split_plot[[i]] <- predicted +# } +# return(split_plot) +# } +getPhasePropPerClustBarplotSplit <- function(hspc.combined, color = hue_pal()(length(unique(hspc.combined@meta.data$phases))), + condition = "Young_Wild", sig_val = "black") { + clusterpredicted <- ddply(hspc.combined@meta.data,~FinalCluster + phases + condition,nrow) + + split_plot <- list() + for(i in condition){ + print(i) + predicted <- ggplot(data.frame(clusterpredicted[which(clusterpredicted$condition == i),]), aes(fill = phases,y = V1, x=FinalCluster)) + + geom_bar( stat="identity", position="fill")+ + scale_fill_manual( values= color)+ + scale_y_continuous(name = "Cell type (%)", labels = c(0,25,50,75,100))+ + ylab(label = "")+xlab(label = "") + coord_flip()+ + labs(title = i) + theme(plot.title = element_text(size = 10), axis.text.x = element_text(size = 12), + axis.text.y = element_text(colour = sig_val, size = 16)) + split_plot[[i]] <- predicted + } + return(split_plot) +} + +Fisher_for_cellCycle <- function(seurat, condition){ + summary_data <- ddply(seurat@meta.data,~condition + FinalCluster + phases, nrow) + + results_list <- list() + for(i in levels(seurat$FinalCluster)){ + print(i) + summ_clust <- summary_data[which(summary_data$FinalCluster == i & summary_data$condition %in% condition),] + contingence <- data.frame(dcast(summ_clust, phases ~ condition, fill = 0, value.var = "V1"), row.names = 1) + if(nrow(contingence)>1){ + fisher <- fisher.test(contingence) + results_list[i] <- fisher$p.value + } else{ + results_list[i] <- 1 + } + } + result <- as.data.frame(do.call("rbind",results_list)) + result$color <- "black" + result[which(result$V1 < 0.05),"color"] <- "#DD3497" + + return(result) +} + +Fisher_for_cellType <- function(seurat, condition){ + summary_data <- ddply(seurat@meta.data,~condition + FinalCluster + cellType_Actinn, nrow) + + results_list <- list() + for(i in levels(seurat$FinalCluster)){ + print(i) + summ_clust <- summary_data[which(summary_data$FinalCluster == i & summary_data$condition %in% condition),] + contingence <- data.frame(dcast(summ_clust, cellType_Actinn ~ condition, fill = 0, value.var = "V1"), row.names = 1) + if(nrow(contingence)>1){ + fisher <- fisher.test(contingence) + results_list[i] <- fisher$p.value + } else{ + results_list[i] <- 1 + } + } + result <- as.data.frame(do.call("rbind",results_list)) + result$color <- "black" + result[which(result$V1 < 0.05),"color"] <- "red" + + return(result) +} + +Fisher_for_predicted <- function(seurat, condition){ + summary_data <- ddply(seurat@meta.data,~condition + FinalCluster + predicted, nrow) + + results_list <- list() + for(i in levels(seurat$FinalCluster)){ + print(i) + summ_clust <- summary_data[which(summary_data$FinalCluster == i & summary_data$condition %in% condition),] + contingence <- data.frame(dcast(summ_clust, predicted ~ condition, fill = 0, value.var = "V1"), row.names = 1) + if(nrow(contingence)>1){ + fisher <- fisher.test(contingence) + results_list[i] <- fisher$p.value + } else{ + results_list[i] <- 1 + } + } + result <- as.data.frame(do.call("rbind",results_list)) + result$color <- "black" + result[which(result$V1 < 0.05),"color"] <- "red" + + return(result) +} + + +getPredictedPropPerClustBarplotSplit <- function(hspc.combined, colors = hue_pal()(length(unique(hspc.combined@meta.data$predicted))), + condition = "Young_Wild", sig_val = "black") { + clusterpredicted <- ddply(hspc.combined@meta.data,~FinalCluster + predicted + condition,nrow) + + split_plot <- list() + for(i in condition){ + print(i) + predicted <- ggplot(data.frame(clusterpredicted[which(clusterpredicted$condition == i),]), aes(fill = predicted,y = V1, x=FinalCluster)) + + geom_bar( stat="identity", position="fill")+ + scale_fill_manual( values= colors)+ + scale_y_continuous(name = "Cell type (%)", labels = c(0,25,50,75,100))+ + ylab(label = "")+xlab(label = "") + coord_flip()+ + labs(title = i) + theme(plot.title = element_text(size = 10), axis.text.x = element_text(size = 12), + axis.text.y = element_text(colour = sig_val, size = 16)) + split_plot[[i]] <- predicted + } + return(split_plot) +} + +getCellTypePropPerClustBarplotSplit <- function(hspc.combined, colors = hue_pal()(length(unique(hspc.combined@meta.data$predicted))), + condition = "Young_Wild", sig_val = "black") { + clusterpredicted <- ddply(hspc.combined@meta.data,~FinalCluster + cellType_Actinn + condition,nrow) + + split_plot <- list() + for(i in condition){ + print(i) + cellType <- ggplot(data.frame(clusterpredicted[which(clusterpredicted$condition == i),]), aes(fill = cellType_Actinn,y = V1, x=FinalCluster)) + + geom_bar( stat="identity", position="fill")+ + scale_fill_manual( values= colors)+ + scale_y_continuous(name = "Cell type (%)", labels = c(0,25,50,75,100))+ + ylab(label = "")+xlab(label = "") + coord_flip()+ + labs(title = i) + theme(plot.title = element_text(size = 10), + axis.text.y = element_text(colour = sig_val, size = 16)) + split_plot[[i]] <- cellType + } + return(split_plot) +} + + +getPhasePropPerPredictedBarplotSplit <- function(hspc.combined, colors = hue_pal()(length(unique(hspc.combined@meta.data$phases))), + condition = "Young_Wild") { + clusterpredicted <- ddply(hspc.combined@meta.data,~phases + predicted + condition,nrow) + + split_plot <- list() + for(i in condition){ + print(i) + predicted <- ggplot(data.frame(clusterpredicted[which(clusterpredicted$condition == i),]), aes(fill = phases,y = V1, x=predicted)) + + geom_bar( stat="identity", position="fill")+ + scale_fill_manual( values= colors)+ + scale_y_continuous(name = "Cell type (%)", labels = c(0,25,50,75,100))+ + ylab(label = "")+xlab(label = "") + + labs(title = i) + theme(plot.title = element_text(size = 10)) + split_plot[[i]] <- predicted + } + return(split_plot) +} + +# getCellTypePropPerClustBarplotSplit <- function(hspc.combined, colors = hue_pal()(length(unique(hspc.combined@meta.data$cellType_Actinn)))) { +# clusterpredicted <- ddply(hspc.combined@meta.data,~FinalCluster + cellType_Actinn + condition,nrow) +# +# split_plot <- list() +# for(i in unique(clusterpredicted$condition)){ +# print(i) +# predicted <- ggplot(data.frame(clusterpredicted[which(clusterpredicted$condition == i),]), aes(fill = cellType_Actinn,y = V1, x=FinalCluster)) + +# geom_bar( stat="identity", position="fill")+ +# scale_fill_manual( values= colors)+ +# scale_y_continuous(name = "Cell type (%)", labels = c(0,25,50,75,100))+ +# ylab(label = "")+xlab(label = "") + coord_flip()+ +# labs(title = i) + theme(plot.title = element_text(size = 10)) +# split_plot[[i]] <- predicted +# } +# return(split_plot) +# } + +getCellTypePerCondition <- function(seurat, colors = hue_pal()(length(unique(seurat$cellType_Actinn)))){ + cellType <- ddply(seurat@meta.data,~condition + cellType_Actinn, nrow) + barplot <- ggplot(data = data.frame(cellType), aes(fill=cellType_Actinn, y=V1, x=condition)) + + geom_bar(stat = "identity", position = "fill") + + scale_fill_manual(values = colors) + + scale_y_continuous(name = "Cell type (%)", labels = c(0,25,50,75,100)) + + ggtitle("Actinn") + theme(axis.text.x = element_text(size = 16, angle = -15), axis.title.x = element_blank(), plot.title = element_text(size = 25)) + + return(barplot) +} diff --git a/R_src/funForLoading.R b/R_src/funForLoading.R new file mode 100644 index 0000000000000000000000000000000000000000..839f78a4fcd58ee474173055824cc978bc054925 --- /dev/null +++ b/R_src/funForLoading.R @@ -0,0 +1,55 @@ +loadCellRangerMatrix <- function(matrix_dir,sample_name = NULL) { + if (!is.null(sample_name)) { + sample_name <- paste0(sample_name,"_") + } + barcode.path <- paste0(matrix_dir,"/",sample_name,"barcodes.tsv") + features.path <- paste0(matrix_dir,"/",sample_name,"genes.tsv") + matrix.path <- paste0(matrix_dir,"/",sample_name,"matrix.mtx") + mat <- readMM(file = matrix.path) + feature.names = read.delim(features.path, + header = FALSE, + stringsAsFactors = FALSE) + barcode.names = read.delim(barcode.path, + header = FALSE, + stringsAsFactors = FALSE) + colnames(mat) = barcode.names$V1 + rownames(mat) = feature.names$V1 + colnames(feature.names) <- c("id", "symbol") + rownames(feature.names) <- feature.names$id + colnames(barcode.names) <- c("barcode") + rownames(barcode.names) <- barcode.names$barcode + results <- list() + results$exprs <- mat + results$fd <- feature.names + results$pd <- barcode.names + + return(results) +} + +loadCellRangerMatrix_cellranger3 <- function(matrix_dir,sample_name =NULL) { + if (!is.null(sample_name)) { + sample_name <- paste0(sample_name,"_") + } + barcode.path <- paste0(matrix_dir,"/",sample_name,"barcodes.tsv") + features.path <- paste0(matrix_dir,"/",sample_name,"features.tsv") + matrix.path <- paste0(matrix_dir,"/",sample_name,"matrix.mtx") + mat <- readMM(file = matrix.path) + feature.names = read.delim(features.path, + header = FALSE, + stringsAsFactors = FALSE) + barcode.names = read.delim(barcode.path, + header = FALSE, + stringsAsFactors = FALSE) + colnames(mat) = barcode.names$V1 + rownames(mat) = feature.names$V1 + colnames(feature.names) <- c("id", "symbol") + rownames(feature.names) <- feature.names$id + colnames(barcode.names) <- c("barcode") + rownames(barcode.names) <- barcode.names$barcode + results <- list() + results$exprs <- mat + results$fd <- feature.names + results$pd <- barcode.names + + return(results) +} diff --git a/R_src/funForPlot.R b/R_src/funForPlot.R new file mode 100644 index 0000000000000000000000000000000000000000..ae6058bffef12cc436c5980d8c8a2e31d64a37a6 --- /dev/null +++ b/R_src/funForPlot.R @@ -0,0 +1,38 @@ +g_legend<-function(a.gplot,direction = "horizontal"){ + a.gplot <- a.gplot +theme(legend.direction=direction) + tmp <- ggplot_gtable(ggplot_build(a.gplot)) + leg <- which(sapply(tmp$grobs, function(x) x$name) == "guide-box") + legend <- tmp$grobs[[leg]] + return(legend)} + + +plotScore <- function(monocle, + traj, + sig, + shortName, + se = T, + ptBranching1 = NULL, + ptBranching2 = NULL) { + + + plot <- ggplot(pData(monocle)[which(pData(monocle)$State %in% traj),] ,aes(color = AGE,y = get(sig), x=Pseudotime)) + + geom_smooth(se = se) + + ylab("Score")+ + scale_color_manual(values = hue_pal()(2)[c(1,2)]) + + ggtitle(shortName) + + theme(legend.text = element_text(size = 18), + axis.text = element_text(size = 16), + axis.title = element_text(size = 18), + plot.title = element_text(size = 20))+ + guides(colour = guide_legend(override.aes = list(size=8))) + + if (!is.null(ptBranching1)) { + plot <- plot + geom_vline(aes(xintercept=ptBranching1), color="red", linetype="dashed", size=1) + } + + if (!is.null(ptBranching2)) { + plot <- plot + geom_vline(aes(xintercept=ptBranching2), color="black", linetype="dashed", size=1) + } + + return(plot) +} diff --git a/R_src/funForSeurat.R b/R_src/funForSeurat.R new file mode 100644 index 0000000000000000000000000000000000000000..970b280e64021ce828326e99d88d189cc8976bb0 --- /dev/null +++ b/R_src/funForSeurat.R @@ -0,0 +1,509 @@ +firstup <- function(x) { + substr(x, 1, 1) <- toupper(substr(x, 1, 1)) + x +} + + +# DEPRECATED Seurat2 +# getClustInfo <- function(clust,signatures,testHyperSig,seurat,groupingName =NULL,markers) { +# clustInfo <- list() +# clustInfo$num_cells <- dim(seurat@meta.data[which(seurat@ident==clust),])[1] +# clustInfo$percent_cells <- clustInfo$num_cells/dim(seurat@meta.data)[1] +# +# # if(!is.null(groupingName)) { +# # for (sample_name in unique(seurat@meta.data$sampleName)) { +# # clustInfo[[paste(sample_name,"percentCells",sep ="")]] <- dim(seurat@meta.data[which(seurat@ident==clust & seurat@meta.data$sampleName==sample_name),])[1]/clustInfo$num_cells +# # } +# # } +# +# percentPhases <- table(seurat@meta.data[which(seurat@ident==clust),"phases"])/length(seurat@meta.data[which(seurat@ident==clust),"phases"]) #In fact this is fraction not percentage +# +# +# # clustInfo$percent_G1_G0 <-percentPhases["G1_G0"] +# # clustInfo$percent_S <- percentPhases["S"] +# # clustInfo$percent_G2_M <- percentPhases["G2_M"] +# +# for (p in c("G1_G0","S","G2_M")) { +# if (is.element(p,names(percentPhases))) { +# clustInfo[[p]] <- percentPhases[p] +# } else { +# clustInfo[[p]] <- 0 +# } +# } +# +# # cell types +# percentCellType <- table(seurat@meta.data[which(seurat@ident==clust),"CellType"])/length(seurat@meta.data[which(seurat@ident==clust),"CellType"]) +# +# for (t in unique(seurat@meta.data$CellType)) { +# if (is.element(t,names(percentCellType))) { +# clustInfo[[t]] <- percentCellType[t] +# } else { +# clustInfo[[t]] <- 0 +# } +# } +# +# if(!is.null(seurat@meta.data$age)) { +# +# # cell types +# percentAge <- table(seurat@meta.data[which(seurat@ident==clust),"age"])/length(seurat@meta.data[which(seurat@ident==clust),"age"]) +# +# EnrichAge <- (table(seurat@meta.data[which(seurat@ident==clust),"age"]) / ((table(seurat@meta.data[,"age"])/sum(table(seurat@meta.data[,"age"])))*clustInfo$num_cells)) - 1 +# +# for (a in unique(seurat@meta.data$age)) { +# if (is.element(a,names(percentAge))) { +# clustInfo[[a]] <- percentAge[a] +# clustInfo[[paste("enrichAge_",a,sep ="")]] <- EnrichAge[a] +# } else { +# clustInfo[[a]] <- 0 +# } +# } +# +# } +# +# if(!is.null(seurat@meta.data$percentCellTypeAge)) { +# +# # cell types by age +# percentCellTypeAge <- table(seurat@meta.data[which(seurat@ident==clust),"CellTypeAge"])/length(seurat@meta.data[which(seurat@ident==clust),"CellTypeAge"]) +# +# for (at in unique(seurat@meta.data$CellTypeAge)) { +# if (is.element(a,names(percentCellTypeAge))) { +# clustInfo[[at]] <- percentCellTypeAge[at] +# } else { +# clustInfo[[at]] <- 0 +# } +# } +# +# } +# +# +# +# clustInfo$median_genes_expressed <- median(seurat@meta.data[which(seurat@ident==clust),"numGenesPerCells"]) +# clustInfo$median_nUMI <- median(seurat@meta.data[which(seurat@ident==clust),"Total_mRNAs"]) +# clustInfo$median_percentMitochGenes <- median(seurat@meta.data[which(seurat@ident==clust),"percentMito"]) +# +# clustSig <- lapply(signatures,testHyperSig,seurat,clust,markers = markers) #forgot markers arg +# +# clustInfo <- c(clustInfo,clustSig) +# +# return(clustInfo) +# +# } + + +FindAgingMarkers3 <- function(cluster,hspc.combined) { + hspc.combined$cluster.AGE <- paste(Idents(object = hspc.combined), hspc.combined$AGE, sep = "_") + Idents(object = hspc.combined) <- "cluster.AGE" + agingMarkers <- FindMarkers(object = hspc.combined, + ident.1 = paste(cluster,"_Old",sep = ""), + ident.2 = paste(cluster,"_Young",sep = ""), + verbose = FALSE) + agingMarkers$Cluster <- paste(cluster,"_Old_up",sep ="") + agingMarkers$Cluster[which(agingMarkers$avg_logFC < 0)] <- paste(cluster,"_Old_down",sep ="") + agingMarkers$Gene <- rownames(agingMarkers) + agingMarkers <- agingMarkers[order(agingMarkers$avg_logFC),] + agingMarkers <- agingMarkers[which(agingMarkers$p_val_adj < 0.05),] + return(agingMarkers) +} + +# DEPRECATED Seurat2 +# FindAgingMarkers <- function(cluster,seurat.combined,outdir = "./",logfc.threshold = 0.25) { +# age <- unique(seurat.combined@meta.data$age) +# ident.1 <- paste0(cluster,"_age_",age[1]) +# ident.2 <- paste0(cluster,"_age_",age[2]) +# age_effect <- FindMarkers(seurat.combined, ident.1 = ident.1, ident.2 = ident.2, +# print.bar = FALSE,logfc.threshold = logfc.threshold) +# dir.create(paste(outdir,"/aging_test_on_cluster_",cluster,sep =""),showWarnings = F) +# +# print(cluster) +# +# age_effect <- age_effect[which(age_effect$p_val_adj < 0.05),] +# +# if(dim(age_effect)[1]!=0) { +# age_effect <- age_effect[order(age_effect$avg_logFC),] +# age_effect$Gene <- rownames(age_effect) +# age_effect$Cluster <- NA +# age_effect[which(age_effect$avg_logFC > 0),"Cluster"] <- "Old_down" +# age_effect[which(age_effect$avg_logFC < 0),"Cluster"] <- "Old_up" +# age_effect$Cluster <- paste(age_effect$Cluster,"_cluster_",cluster,sep="") +# +# resDir <- paste(outdir,"/aging_test_on_cluster_",cluster,sep ="") +# +# gprofileClustResult <- gProfileAnalysis(deg_clust = age_effect, +# outdir = paste(resDir,"/gProfiler", sep =""), +# background = row.names(seurat.combined@data)) +# +# # with a specific bg +# +# +# colClustRes <- "numclust" +# cellInClust <- row.names(seurat.combined@meta.data[which(seurat.combined@meta.data[,colClustRes]==cluster),]) +# +# +# subSeurat <- SubsetData(seurat.combined,cells.use = cellInClust) +# print(is(subSeurat)) +# +# #get expressed genes in this cluster +# num.cells <- rowSums(as.matrix(subSeurat@data) > 0) +# genes.use <- names(num.cells[which(num.cells >= 1)]) +# +# gprofileClustResult <- gProfileAnalysis(deg_clust = age_effect, +# outdir = paste(resDir,"/gProfilerSpecificBg", sep =""), +# background = genes.use) +# } +# +# ############################################################################################################# +# write.table(age_effect,paste(outdir,"/aging_test_on_cluster_",cluster,"/AgingMarkers_of_clust",cluster,".tsv", sep =""),sep = "\t",quote = F,col.names = NA) +# return(age_effect) +# } + + +# DEPRECATED Seurat2 +# removeNonExpressedGenes <- function(seurat,minPropCellExp) { +# pd <- new("AnnotatedDataFrame", data = seurat@meta.data) +# fd <- new("AnnotatedDataFrame", data = data.frame(gene_short_name = rownames(seurat@data))) +# rownames(fd) <- fd$gene_short_name +# +# gbm_cds <- newCellDataSet(seurat@raw.data, +# phenoData = pd, +# featureData = fd, +# lowerDetectionLimit = 0.1, +# expressionFamily = negbinomial.size()) +# +# gbm_cds <- detectGenes(gbm_cds, min_expr = 0.1) +# +# +# +# print("remove non expressed genes in the subset (non expressed in at least X% of the cells X user option in monocle dp feature 5% in seurat tutorial 0,1%)") +# fData(gbm_cds)$use_for_seurat <- fData(gbm_cds)$num_cells_expressed > minPropCellExp * ncol(gbm_cds) +# +# gbm_to_seurat <- gbm_cds[fData(gbm_cds)$use_for_seurat==T,] +# gbm_to_seurat <- gbm_cds +# +# # Only needed if ensemble id to lazy to code the test for the moment +# #rownames(gbm_to_seurat) <- make.unique(fData(gbm_to_seurat)$gene_short_name,sep = "_") #be careful in diff exp results for gene test enrichment +# +# if (is.element("Cluster",colnames(pData(gbm_to_seurat)))) { +# colnames(pData(gbm_to_seurat))[which(colnames(pData(gbm_to_seurat))=="Cluster")] <- "Cluster_monocle" +# } +# ##Convert to seurat +# seurat <- exportCDS(gbm_to_seurat,"Seurat") +# +# return(seurat) +# +# } + + +## DEPRECATED Seurat 2 Rename ident + +# renameIdent <- function(seurat, old.ident.name,new.ident.name) { +# +# seurat@ident <- plyr::mapvalues(x = seurat@ident, from = old.ident.name, to = new.ident.name) +# seurat@ident <- factor(x = seurat@ident, levels = new.ident.name) +# +# return(seurat) +# } + + + + +getAGEPropPerClustBarplot <- function(hspc.combined) { + clusterAge <- ddply(hspc.combined@meta.data,~numclust + AGE,nrow) + + propExpect <- table(hspc.combined@meta.data$AGE)/length(hspc.combined@meta.data$AGE)[] + propYoungExp <- propExpect[[unique(hspc.combined@meta.data$AGE)[1]]] + + #clusterAGE$numclust <- factor(v$clusterNature , levels = c("")) + clusterAge$AGE <- factor(clusterAge$AGE , levels = c("Old","Young")) + + + AGE <- ggplot(data.frame(clusterAge), aes(fill = AGE,y = V1, x=numclust,levels = "Young","Old")) + + geom_bar( stat="identity", position="fill")+ + scale_fill_manual( values= rev(hue_pal()(length(unique(hspc.combined@meta.data$AGE)))))+ + scale_y_continuous(name = "Age (%)", labels = c(0,25,50,75,100))+ + ylab(label = "")+xlab(label = "") + coord_flip() + geom_hline(yintercept = propYoungExp)+ + theme(legend.title=element_blank()) + return(AGE) + +} + + +getSamplePropPerClustBarplot <- function(hspc.combined) { + clustersampleName <- ddply(hspc.combined@meta.data,~numclust + sampleName,nrow) + + propExpect <- table(hspc.combined@meta.data$sampleName)/length(hspc.combined@meta.data$sampleName)[] + propYoungExp <- propExpect[[unique(hspc.combined@meta.data$sampleName)[1]]] + + #clustersampleName$numclust <- factor(v$clusterNature , levels = c("")) + #clustersampleName$sampleName <- factor(clustersampleName$predicted , levels = c("")) + + + sampleName <- ggplot(data.frame(clustersampleName), aes(fill = sampleName,y = V1, x=numclust)) + + geom_bar( stat="identity", position="fill")+ + scale_fill_manual( values= rev(hue_pal()(length(unique(hspc.combined@meta.data$sampleName)))))+ + scale_y_continuous(name = "Sample (%)", labels = c(0,25,50,75,100))+ + ylab(label = "")+xlab(label = "") + coord_flip()+ + theme(legend.title=element_blank()) + return(sampleName) + +} + +getRunDatePropPerClustBarplot <- function(hspc.combined) { + clusterrunDate <- ddply(hspc.combined@meta.data,~numclust + runDate,nrow) + + propExpect <- table(hspc.combined@meta.data$runDate)/length(hspc.combined@meta.data$runDate)[] + propYoungExp <- propExpect[[unique(hspc.combined@meta.data$runDate)[1]]] + + #clusterrunDate$numclust <- factor(v$clusterNature , levels = c("")) + #clusterrunDate$runDate <- factor(clusterrunDate$predicted , levels = c("")) + + + runDate <- ggplot(data.frame(clusterrunDate), aes(fill = runDate,y = V1, x=numclust)) + + geom_bar( stat="identity", position="fill")+ + scale_fill_manual( values= rev(hue_pal()(length(unique(hspc.combined@meta.data$runDate)))))+ + scale_y_continuous(name = "Run date (%)", labels = c(0,25,50,75,100))+ + ylab(label = "")+xlab(label = "") + coord_flip()+ + theme(legend.title=element_blank()) + return(runDate) + +} + +getPhasePropPerClustBarplot <- function(hspc.combined) { + clusterphases <- ddply(hspc.combined@meta.data,~numclust + phases,nrow) + + propExpect <- table(hspc.combined@meta.data$phases)/length(hspc.combined@meta.data$phases)[] + propYoungExp <- propExpect[[unique(hspc.combined@meta.data$phases)[1]]] + + #clusterphases$numclust <- factor(v$clusterNature , levels = c("")) + #clusterphases$phases <- factor(clusterphases$predicted , levels = c("")) + + + phases <- ggplot(data.frame(clusterphases), aes(fill = phases,y = V1, x=numclust)) + + geom_bar( stat="identity", position="fill")+ + scale_fill_manual( values= rev(hue_pal()(length(unique(hspc.combined@meta.data$phases)))))+ + scale_y_continuous(name = "Cell cycle phase (%)", labels = c(0,25,50,75,100))+ + ylab(label = "")+xlab(label = "") + coord_flip() + + theme(legend.title=element_blank()) + return(phases) + +} + + +getPredictedPropPerClustBarplot <- function(hspc.combined) { + clusterpredicted <- ddply(hspc.combined@meta.data,~numclust + predicted,nrow) + + propExpect <- table(hspc.combined@meta.data$predicted)/length(hspc.combined@meta.data$predicted)[] + propYoungExp <- propExpect[[unique(hspc.combined@meta.data$predicted)[1]]] + + #clusterpredicted$numclust <- factor(v$clusterNature , levels = c("")) + #clusterpredicted$predicted <- factor(clusterpredicted$predicted , levels = c("")) + + + predicted <- ggplot(data.frame(clusterpredicted), aes(fill = predicted,y = V1, x=numclust)) + + geom_bar( stat="identity", position="fill")+ + scale_fill_manual( values= hue_pal()(length(unique(hspc.combined@meta.data$predicted))))+ + scale_y_continuous(name = "Cell type (%)", labels = c(0,25,50,75,100))+ + ylab(label = "")+xlab(label = "") + coord_flip()+ + theme(legend.title=element_blank()) + return(predicted) + +} + + + + +getEnrichAge <- function(hspc.combined,clustCol ='clusterName',metaCol = "age") { + + table <- table(hspc.combined@meta.data[,metaCol],hspc.combined@meta.data[,clustCol]) + + #Remove null column in case of reclustering has been made + + table <- table[,as.vector(which(colSums(table)>0))] + + tablePercent <- prop.table(table,2) + + propExpect <- table(hspc.combined@meta.data[,metaCol])/length(hspc.combined@meta.data[,metaCol]) + propExpectAge_1<- propExpect[[unique(hspc.combined@meta.data[,metaCol])[1]]] + propExpectAge_2<- propExpect[[unique(hspc.combined@meta.data[,metaCol])[2]]] + phyper <- rep(NA,length(colnames(table))) + enrich <- rep(NA,length(colnames(table))) + tablePercent <- rbind(tablePercent,enrich,phyper) + + + for (age in unique(hspc.combined@meta.data[,metaCol])) { + for (cluster in colnames(table)) { + if(tablePercent[age,cluster] > propExpect[[age]]) { + cells_pull_marked <- table[age,as.character(cluster)] + cells_pull <- as.numeric(colSums(table)[as.character(cluster)]) + cells_marked_all <- rowSums(table)[age] + all_cells <- length(hspc.combined@meta.data[,metaCol]) + + + + p.value <- phyper(q=cells_pull_marked -1, + m=cells_marked_all, + n=all_cells - cells_marked_all, k= cells_pull, lower.tail=FALSE) + + tablePercent["enrich",cluster] <- age + + tablePercent["phyper",cluster] <- p.value + + } + } + } + return(tablePercent) + +} + + + + + + +# Make a summary teble of cluster metrics and signature enrichments + +getClustTable <- function(rodriguezSig,markers,signatures,seurat,outdir) { + clusterNames <- c("C1","C2","C3","Mk","Er","Ba","Neu","Mo1","Mo2", "preDC","preB","preT") + + RodriguezClustersSig <- lapply(X= c(1:length(clusterNames)),FUN = read_xlsx,path = rodriguezSig) + + names(RodriguezClustersSig) <- clusterNames + + getOnlyPos <- function(clustersSig) { + clusterSig <- clustersSig[which(clustersSig$log.effect > 0),] + return(clusterSig) + } + + RodriguezClustersSigPos <- lapply(X= RodriguezClustersSig, getOnlyPos) + + + signaturesRodriguez <- lapply(RodriguezClustersSigPos,"[[",1 ) + + + firstup <- function(x) { + substr(x, 1, 1) <- toupper(substr(x, 1, 1)) + x + } + + colnames(markers) <- firstup(colnames(markers)) + + getClustEnrichForRodriguez <- function(clust,signatures,seurat,markers) { + clustSig <- lapply(signatures,testHyperSig3,seurat,markers,clust) + propCellTypesLearned <- table(seurat@meta.data[which(seurat@meta.data$numclust==clust),"predicted"])/length(seurat@meta.data[which(seurat@meta.data$numclust==clust),"predicted"]) + clustInfo <- c(clustSig,propCellTypesLearned) + return(clustInfo) + } + + clust_list <- lapply(unique(markers$Cluster),getClustEnrichForRodriguez,signature=signaturesRodriguez,seurat =seurat,markers =markers) + + names(clust_list) <- paste("cluster_",unique(markers$Cluster),sep="") + + clust_table <- as.data.frame(matrix(unlist(clust_list), nrow=length(unlist(clust_list[1])))) + colnames(clust_table) <- names(clust_list) + rownames(clust_table) <- names(clust_list[[1]]) + + clust_df <- as.data.frame(t(clust_table)) + + write.csv(clust_df,file = paste(outdir,"/clustInfoRodriguez.csv",sep =""),quote = F) + + + + #Clusters table + + clust_table <- data.frame() + + print("begin cluster table") + + getClustInfo <- function(clust,signatures,seurat,markers) { + + clustInfo <- list() + clustInfo$num_cells <- dim(seurat@meta.data[which(seurat@active.ident==clust),])[1] + clustInfo$percent_cells <- clustInfo$num_cells/dim(seurat@meta.data)[1] + percentPhases <- table(seurat@meta.data[which(seurat@active.ident==clust),"phases"])/length(seurat@meta.data[which(seurat@active.ident==clust),"phases"]) #In fact this is fraction not percentage + + if(!is.null(seurat@meta.data$predicted)) { + percentPredicted <- table(seurat@meta.data[which(seurat@active.ident==clust),"predicted"])/length(seurat@meta.data[which(seurat@active.ident==clust),"predicted"]) #In fact this is fraction not percentage + + for (p in unique(seurat@meta.data$predicted)) { + print(p) + if (is.element(p,names(percentPredicted))) { + clustInfo[[p]] <- percentPredicted[p] + } else { + clustInfo[[p]] <- 0 + } + } + + } + + if(!is.null(seurat@meta.data$AGE)) { + percentAGE <- table(seurat@meta.data[which(seurat@active.ident==clust),"AGE"])/length(seurat@meta.data[which(seurat@active.ident==clust),"AGE"]) #In fact this is fraction not percentage + + for (p in unique(seurat@meta.data$AGE)) { + print(p) + if (is.element(p,names(percentAGE))) { + clustInfo[[p]] <- percentAGE[p] + } else { + clustInfo[[p]] <- 0 + } + } + + } + + if(!is.null(seurat@meta.data$sampleName)) { + percentsampleName <- table(seurat@meta.data[which(seurat@active.ident==clust),"sampleName"])/length(seurat@meta.data[which(seurat@active.ident==clust),"sampleName"]) #In fact this is fraction not percentage + + for (p in unique(seurat@meta.data$sampleName)) { + print(p) + if (is.element(p,names(percentsampleName))) { + clustInfo[[p]] <- percentsampleName[p] + } else { + clustInfo[[p]] <- 0 + } + } + + } + + + for (p in c("G1_G0","S","G2_M")) { + if (is.element(p,names(percentPhases))) { + clustInfo[[p]] <- percentPhases[p] + } else { + clustInfo[[p]] <- 0 + } + } + + + clustInfo$median_genes_expressed <- median(seurat@meta.data[which(seurat@active.ident==clust),"numGenesPerCells"]) + clustInfo$median_nUMI <- median(seurat@meta.data[which(seurat@active.ident==clust),"Total_mRNAs"]) + clustInfo$median_percentMitochGenes <- median(seurat@meta.data[which(seurat@active.ident==clust),"percentMito"]) + + + clustSig <- lapply(signatures,testHyperSig3,seurat,markers,clust) + + clustInfo <- c(clustInfo,clustSig) + + + } + + allSignatures <- c(signatures,signaturesRodriguez) + + clust_list <- lapply(levels(unique(seurat@active.ident)),getClustInfo,allSignatures,seurat,markers) + + names(clust_list) <- paste("cluster_",levels(unique(seurat@active.ident)),sep="") + + print("clust_list ok") + saveRDS(clust_list,paste(outdir,"/clust_list_save.rds",sep ="")) + + clust_table <- as.data.frame(matrix(unlist(clust_list), nrow=length(unlist(clust_list[1])))) + colnames(clust_table) <- names(clust_list) + rownames(clust_table) <- names(clust_list[[1]]) + + print("clust_table ok") + + clust_df <- as.data.frame(t(clust_table)) + + write.table(x = clust_df,file = paste(outdir,"/clusters_table.tsv",sep =""),sep="\t",quote=F,col.names = NA) + + return(clust_df) +} + + diff --git a/R_src/funForSeuratAnalysis.R b/R_src/funForSeuratAnalysis.R new file mode 100644 index 0000000000000000000000000000000000000000..e41c567dbfcdd97bad397d064f06fa798a527b8e --- /dev/null +++ b/R_src/funForSeuratAnalysis.R @@ -0,0 +1,71 @@ +# Do hypergeometric test of to population to find if ine proportion is significantly increased +getEnrichAge <- function(hspc.combined,clustCol ='clusterName',metaCol = "age") { + + table <- table(hspc.combined@meta.data[,metaCol],hspc.combined@meta.data[,clustCol]) + + #Remove null column in case of reclustering has been made + + table <- table[,as.vector(which(colSums(table)>0))] + + tablePercent <- prop.table(table,2) + + propExpect <- table(hspc.combined@meta.data[,metaCol])/length(hspc.combined@meta.data[,metaCol]) + propExpectAge_1<- propExpect[[unique(hspc.combined@meta.data[,metaCol])[1]]] + propExpectAge_2<- propExpect[[unique(hspc.combined@meta.data[,metaCol])[2]]] + phyper <- rep(NA,length(colnames(table))) + enrich <- rep(NA,length(colnames(table))) + tablePercent <- rbind(tablePercent,enrich,phyper) + + + for (age in unique(hspc.combined@meta.data[,metaCol])) { + for (cluster in colnames(table)) { + if(tablePercent[age,cluster] > propExpect[[age]]) { + cells_pull_marked <- table[age,as.character(cluster)] + cells_pull <- as.numeric(colSums(table)[as.character(cluster)]) + cells_marked_all <- rowSums(table)[age] + all_cells <- length(hspc.combined@meta.data[,metaCol]) + + + + p.value <- phyper(q=cells_pull_marked -1, + m=cells_marked_all, + n=all_cells - cells_marked_all, k= cells_pull, lower.tail=FALSE) + + tablePercent["enrich",cluster] <- age + + tablePercent["phyper",cluster] <- p.value + + } + } + } + return(tablePercent) + +} + +# Hypergeometric test and plotting the result with a bp function, use a getEnrichAge +getEnrichPopClust <- function(hspc.combined, Xname, Yname, colorX, colorY, metaCol = "AGE", clustCol = "numclust"){ + conditionEnrich <- getEnrichAge(hspc.combined = hspc.combined,clustCol = clustCol, metaCol = metaCol) + conditionEnrich <- as.data.frame(t(conditionEnrich)) + conditionEnrich$color <- "black" + conditionEnrich[which(as.numeric(as.vector(conditionEnrich$phyper)) < 0.01 & conditionEnrich$enrich == Xname),"color"] <- colorX + conditionEnrich[which(as.numeric(as.vector(conditionEnrich$phyper)) < 0.01 & conditionEnrich$enrich == Yname),"color"] <- colorY + return(conditionEnrich) +} + +# Function to calculate the score diff of a signature between 2 cells pop +featureDiff <- function(seurat,cells.1,cells.2,feature) { + data <- GetAssayData(seurat,slot = "data") + total.diff <- mean(data[feature,cells.1]) - mean(data[feature,cells.2]) + return(total.diff) +} + +convertHumanGeneList <- function(x){ + require("biomaRt") + human = useMart("ensembl", dataset = "hsapiens_gene_ensembl") + mouse = useMart("ensembl", dataset = "mmusculus_gene_ensembl") + genesV2 = getLDS(attributes = c("hgnc_symbol"), filters = "hgnc_symbol", values = x , mart = human, attributesL = c("mgi_symbol"), martL = mouse, uniqueRows=T) + humanx <- unique(genesV2[, 2]) + # Print the first 6 genes found to the screen + print(head(humanx)) + return(humanx) +} diff --git a/R_src/getSigPerCells.R b/R_src/getSigPerCells.R new file mode 100644 index 0000000000000000000000000000000000000000..af7e0d968794ce4190a05ab3da542223662fa500 --- /dev/null +++ b/R_src/getSigPerCells.R @@ -0,0 +1,193 @@ +getSignatures <- function(m,pheno,cellType,data,id_type = "gene_short_name",outdir="./",padj = 0.05) { + phenoTest <- pheno + phenoTest[which(phenoTest!=cellType)] <- "othersHSPC" + cat("Calling differentially expressed genes (DESeq2\n") + des <- DESeqDataSetFromMatrix(m, as.data.frame(phenoTest), design=formula(~phenoTest)) + dds <- DESeq(des,parallel=T) + res <- results(dds,contrast=c("phenoTest",cellType,"othersHSPC")) + resOrdered <- res[order(res$padj),] + #save.image("image.Rdata") + out <- as.data.frame(resOrdered) + #resMLE<- results(dds, addMLE=TRUE) + res <- results(dds) + resOrdered <- res[order(res$padj),] + #save.image("image.Rdata") + out <- as.data.frame(resOrdered) + log2.counts <- log2(counts(dds, normalized=TRUE) + 1) + colnames(log2.counts) <- colnames(m) + out <- data.frame(out, log2.counts[rownames(out),],data[rownames(out),"Gene.Symbol"]) + colnames(out)[length(colnames(out))] = "gene_short_name" + out_save <- out[which(out$padj < 0.05 & out$log2FoldChange<0),] + out_save <- out_save[order(out_save$padj),] + write.table(out_save,paste(outdir,"/",paste(cellType,"vs",paste(unique(pheno[which(pheno!=cellType)]),collapse = ""),sep = "_"),".tsv",sep =""),sep="\t",quote=F,col.names = NA) + + gene_sig <- as.vector(out_save[which(out_save$padj<padj),id_type]) + return(list(genes = gene_sig, table = out)) +} + + + +AddSigScoreMonocle <- function(gbm_cds,ntop,sigRes,cellTypeScore) { + + seurat <- exportCDS(gbm_cds,"Seurat") + + sigRes <- sigRes[which(sigRes$padj < 0.05),] + + print(head(sigRes)) + + gene_sig <- rownames(sigRes[which(sigRes$log2FoldChange<0),])[c(1:50)] + + print(head(gene_sig)) + + seurat <- AddModuleScore(seurat, genes.list = list(gene_sig), genes.pool = NULL, n.bin = 25, + seed.use = 1, ctrl.size = 100, use.k = FALSE, + random.seed = 1) + print("calcul ok") + print(head(seurat@meta.data)) + pData(gbm_cds)$newScore <- seurat@meta.data[,length(colnames(seurat@meta.data))] + print("ok)") + colnames(pData(gbm_cds))[length(colnames(pData(gbm_cds)))] <- cellTypeScore + print("ok2") + print(plot_cell_trajectory(gbm_cds,color_by=cellTypeScore) + scale_color_gradient( low="grey",high="red")) + return(gbm_cds) +} + + + +AddSigScore <- function(seurat,ntop,sigRes,scoreName) { + + sigRes <- sigRes[which(sigRes$padj < 0.05),] + + print(head(sigRes)) + + gene_sig <- sigRes[which(sigRes$log2FoldChange<0),"gene_short_name"][c(1:50)] + + print(head(gene_sig)) + + seurat <- AddModuleScore(seurat, genes.list = list(gene_sig), genes.pool = NULL, n.bin = 25, + seed.use = 1, ctrl.size = 100, use.k = FALSE, enrich.name= scoreName, + random.seed = 1) + print("calcul ok") + colnames(seurat@meta.data)[length(colnames(seurat@meta.data))] <- scoreName + print(head(seurat@meta.data)) + return(seurat) +} + +getKeggSig <- function(queryIndex) { + fullName <- queryIndex$NAME + name <- strsplit(fullName,split = " ")[[1]][1] + name <- gsub(name,pattern = "-",replacement = "_") + name <- paste(name,"_Kegg",sep ="") + print(name) + result <- data.frame(strsplit(queryIndex$GENE,";")) + result_t <- t(result) + result_vector <- as.vector(result_t[,1]) + result_genes <- result_vector[!grepl("^[0-9]{1,}$", result_vector)] + + return(list(name = name,genes = result_genes)) +} + +getKeggSigList <- function(query) { + results <- lapply(query,getKeggSig) + names(results) <- unlist(lapply(results,'[[',1)) + return(results) +} + + +getMicroArraySig <- function(file) { + arraySigName <- paste(strsplit(file,split="_")[[1]][2],"_Chambers",sep="") + arraySig <- as.vector(na.omit(read.table(paste(opt$input_microArraySigDir,file,sep="/"),sep =";",header =T,quote = "\"")$Gene.Symbol)) + return(list(name = arraySigName,genes = arraySig)) +} + +getMicroArraySigList <- function(fileList) { + results <- lapply(fileList,getMicroArraySig) + names(results) <- unlist(lapply(results,'[[',1)) + return(results) +} + + +read_excel_allsheets <- function(filename, tibble = FALSE) { + sheets <- readxl::excel_sheets(filename) + x <- lapply(sheets, function(X) readxl::read_excel(filename, sheet = X)) + if(!tibble) x <- lapply(x, as.data.frame) + names(x) <- sheets + x +} + + +getGeneNameCol <- function(sheet) { + result <- sheet[,"Gene Symbol"] + result <- result[which(result != "NA")] + return(result) +} + + +getMicroArraySigListXls <- function(allsheets) { + allsheets <- allsheets[-1] + results <- lapply(allsheets,getGeneNameCol) + names(results) <-paste(gsub("\\.txt","",names(allsheets)),"_Chambers",sep="") + return(results) +} + + + + + +getBM_vector <- function(goTerm,idType = "external_gene_name",mart) { + result <- getBM(attributes=c("ensembl_gene_id","external_gene_name","name_1006"), + filters=c("go"), + values=list(goTerm),mart=mart) + head(result) + out <- unique(result[,idType]) + name <- gsub(Term(goTerm),pattern=" |-", replacement= "_") + name <- paste(name,"_GO",sep="") + print(name) + return(list(name = name,genes = out)) +} + +#Function to score cell in seurat with a gene signature + +# DEPRECATED Seurat 2 +# scoreCells <- function(seurat,signature,outdir,sigName) { +# #remove old slot +# if (!is.null(seurat@meta.data[[sigName]])) { +# seurat@meta.data[[sigName]] <- NULL +# } +# seurat <- AddModuleScore(seurat, genes.list = list(signature), genes.pool = NULL, n.bin = 25, +# seed.use = 1, ctrl.size = 100, use.k = FALSE, enrich.name= sigName, +# random.seed = 1) +# colnames(seurat@meta.data)[length(seurat@meta.data)] <- sigName +# #print(head(seurat@meta.data)) +# if(!is.null(outdir)) { +# #print("plot") +# png(paste(outdir,"/",sigName,".png",sep ="")) +# FeaturePlot(seurat,features.plot=sigName) +# dev.off() +# } +# return(seurat) +# +# } + +#For seurat3 + +scoreCells3 <- function(seurat,signature,outdir,sigName) { + #remove old slot + if (!is.null(seurat@meta.data[[sigName]])) { + seurat@meta.data[[sigName]] <- NULL + } + seurat <- AddModuleScore(seurat, features = list(signature), pool = NULL, nbin = 25, + seed = 1, ctrl = 100, k = FALSE, name= sigName) + colnames(seurat@meta.data)[length(seurat@meta.data)] <- sigName + #print(head(seurat@meta.data)) + if(!is.null(outdir)) { + #print("plot") + png(paste(outdir,"/",sigName,".png",sep ="")) + plot(FeaturePlot(seurat,features=sigName)) + dev.off() + } + return(seurat) + +} +## Test cell signature + diff --git a/R_src/getSignaturesCL.R b/R_src/getSignaturesCL.R new file mode 100644 index 0000000000000000000000000000000000000000..dec219d714d48a770cc221bc38854e7099657d2c --- /dev/null +++ b/R_src/getSignaturesCL.R @@ -0,0 +1,101 @@ +##------------------------------------------------------------------------------ +## L. Herault +##------------------------------------------------------------------------------ + +## ----------------------------------------------------------------------------- +## Libraries +## ----------------------------------------------------------------------------- + +suppressMessages(library(getopt)) +suppressMessages(library(readxl)) + + +source("R_src/getSigPerCells.R") + + + +# Store previously published signature in an R object + +## ----------------------------------------------------------------------------- +## Command line args +## ----------------------------------------------------------------------------- + +spec = matrix(c( + 'help', 'h', 0, "logical", "Help about the program", + 'input_tabfile', 't', 1, "character","tab file with gene lists per column (Geiger signatures tab file)", + 'input_microArraySig',"m", 1, "character", "directory with micro array cell type signatures (Chambers) or directly xlsx file from cell paper", + 'outdir', 'o', 1, "character", "Output directory. Default to current working directory."), + byrow=TRUE, ncol=5) + +opt = getopt(spec) + +# if help was asked, print a friendly message +# and exit with a non-zero error code +args <- commandArgs() +if ( !is.null(opt$help) | is.null(opt$input_tabfile) | is.null(opt$input_microArraySig)) { + cat("get signatures with microo array data and/or tab files from previous studies\n") + cat(getopt(spec, usage=TRUE)) + q(status=1) +} + + + +sessionInfo() + +# output +if(is.null(opt$outdir)) + opt$outdir <- getwd() + +dir.create(opt$outdir, showWarnings=FALSE) + +#Miccro array sig + +if (dir.exists(opt$input_microArraySig)) { + arraySigFiles <- list.files(opt$input_microArraySig,full.names = F) + print(arraySigFiles) + arraySig <- getMicroArraySigList(arraySigFiles) + + + + allSignatures <- c(arraySig) + + allSignatures <- lapply(allSignatures,'[[',2) +} else { + + # Miccro array sig directly from xlsx file + allsheets <- read_excel_allsheets(opt$input_microArraySig) + + allSignatures <- getMicroArraySigListXls(allsheets) + +} + +#Tab file signatures + +if(endsWith(opt$input_tabfile,suffix = ".xls")) { #if downloaded file + tab_sig <- read_xls(opt$input_tabfile) + tab_sig <- as.data.frame(tab_sig) + tab_sig <- tab_sig[-2,] + colnames(tab_sig) <- tab_sig[1,] + tab_sig <- tab_sig[-1,] +} else { + tab_sig <- read.table(opt$input_tabfile,sep = "\t",header=T) + +} + + +nonEmptyString <- function(stringVector){ + result <- which(stringVector != "") + return(stringVector[result]) +} + +list_sig <- as.list(tab_sig) + +list_sig <- lapply(list_sig,nonEmptyString) + + +allSignatures <- c(allSignatures,list_sig) + + + +saveRDS(allSignatures,paste(opt$outdir,"publicSignatures.rds",sep = "")) + diff --git a/R_src/installMotifsEnv.R b/R_src/installMotifsEnv.R new file mode 100644 index 0000000000000000000000000000000000000000..5a491a311d4e757bb2adb925c8caa3c6c353afba --- /dev/null +++ b/R_src/installMotifsEnv.R @@ -0,0 +1,47 @@ +if (!requireNamespace("BiocManager", quietly = TRUE)) + install.packages("BiocManager") +print("Install devtools") +install.packages("devtools",repos = "http://cran.us.r-project.org") + +print("Install last signac version") +#devtools::install_github("timoast/signac", ref = "develop") + +BiocManager::install("chromVAR") + +BiocManager::install("motifmatchr") + +# BiocManager::install("BSgenome.Mmusculus.UCSC.mm10") + +# BiocManager::install("TFBSTools") + + +# To automatically install Bioconductor dependencies +setRepositories(ind=1:2) + +# BiocManager::install("GenomeInfoDb") + +# BiocManager::install("EnsDb.Mmusculus.v79") + +BiocManager::install("ggbio") + +#BiocManager::install("EnsDb.Mmusculus.v79") + + +install.packages("Signac",repos='http://cran.us.r-project.org') +#install.packages("devtools") +#devtools::install_github("timoast/signac", ref = "develop") +#packageVersion("Signac") + +# remotes::install_github("da-bar/JASPAR2020") + +# remotes::install_github("satijalab/seurat", ref = "release/4.0.0") + +remotes::install_github("jlmelville/uwot") + +# library(BSgenome.Mmusculus.UCSC.mm10) +# library(TFBSTools) +# library(GenomeInfoDb) +# library(EnsDb.Mmusculus.v79) +# library(ggbio) +# library(Signac) + diff --git a/R_src/installSignacSeurat4.R b/R_src/installSignacSeurat4.R new file mode 100644 index 0000000000000000000000000000000000000000..7a320e64f519168461d953f349a7983b3bf1eb6a --- /dev/null +++ b/R_src/installSignacSeurat4.R @@ -0,0 +1,30 @@ +# Install bioconductor + +if (!requireNamespace("BiocManager", quietly = TRUE)) + install.packages("BiocManager",repos='http://cran.us.r-project.org') + +# To automatically install Bioconductor dependencies +setRepositories(ind=1:2) + +# BiocManager::install("GenomeInfoDb") + +# BiocManager::install("EnsDb.Mmusculus.v79") + +BiocManager::install("ggbio") + +#BiocManager::install("EnsDb.Mmusculus.v79") + + +install.packages("Signac",repos='http://cran.us.r-project.org') + + +# remotes::install_github("satijalab/seurat", ref = "release/4.0.0") +# remotes::install_version("Seurat", version = "4.0.3") + +remotes::install_github("jlmelville/uwot") + + + + + + diff --git a/R_src/load_10X_Seurat.R b/R_src/load_10X_Seurat.R new file mode 100644 index 0000000000000000000000000000000000000000..cc7facd1f4f3840a04580e2d1c6a01a84d79c306 --- /dev/null +++ b/R_src/load_10X_Seurat.R @@ -0,0 +1,124 @@ +##------------------------------------------------------------------------------ +## L. Herault +##------------------------------------------------------------------------------ + +## ----------------------------------------------------------------------------- +## Libraries +## ----------------------------------------------------------------------------- + + +suppressMessages(library("getopt")) +suppressMessages(library("Matrix")) +suppressMessages(library("Seurat")) + +source("R_src/funForLoading.R") + + +## ----------------------------------------------------------------------------- +## Command line args +## ----------------------------------------------------------------------------- + +spec = matrix(c( + 'help', 'h', 0, "logical", "Help about the program", + 'inputMatrixDir', 'b', 1, "character", "REQUIRED:10X expression data (matrix files directory path)", + 'outfile', 'f',1, "character", 'Output file path (default .10X_data.rds)', + 'cellranger', 'c',1, "character", "cellranger version used to generate matrix files (default 2)", + "subSample", "p",1, "character", "proportion of cell to subsample (by default all the cells are used (e.g. 0.5))", + "sampleInfo", "s", 1, "character", "sample information that will be added to pData with the following format: age=2_months,runDate=10_12_2017..", + "sampleName", "n", 1, "character", "if matrix file have the sampl name as prefix (name_matrix.mtx...) provide it here" + +), byrow=TRUE, ncol=5); + +opt = getopt(spec) + +# if help was asked, print a friendly message +# and exit with a non-zero error code +# For test +# opt <- list() +# opt$inputMatrixDir <- "input/RAW/PLZF_RARA_CT/" +# opt$cellranger <- 4 +# opt$sampleInfo <- "age=Adult,runDate=11/2020,sampleName=RNA_Ctrl" +# opt$outfile <- "input/10X_data_RNA_Ctrl.rds" + +args <- commandArgs() +if ( !is.null(opt$help) | is.null(opt$inputMatrixDir)) { + cat("loading 10X genomics single cells data and save it as a seurat R object.") + cat(getopt(spec, usage=TRUE)) + q(status=1) +} + +## ----------------------------------------------------------------------------- +## Processing data +## ----------------------------------------------------------------------------- + +# output + +if(is.null(opt$outfile)) { + opt$outfile <- "10X_data.rds" + outdir = './' +} else { + outdir <- paste(strsplit(opt$outfile,"/")[[1]][-length(strsplit(opt$outfile,"/")[[1]])],collapse="/") + dir.create(outdir, showWarnings=FALSE) +} + + +if(is.null(opt$cellranger)) { + opt$cellranger <- "2" +} + + +if (opt$cellranger == "3" | opt$cellranger == "4") { + gbm <- loadCellRangerMatrix_cellranger3(opt$inputMatrixDir,sample_name=opt$sampleName) +} else +{ + gbm <- loadCellRangerMatrix(opt$inputMatrixDir,sample_name=opt$sampleName) +} + + +fd <- gbm$fd +pd <- gbm$pd + +# Column 'symbol' is the one (from cellRanger workflow) that corresponds to featureData's gene short names. + +colnames(fd)[which(colnames(fd)=="symbol")]<- "gene_short_name" + +for (c in c(1:length(colnames(fd)))){ + if (is.na(colnames(fd)[c])) { + colnames(fd)[c] <- paste0("col_",c) + } +} + + +seurat <- CreateSeuratObject(counts = gbm$exprs, + assay = "RNA", + meta.data = gbm$pd) + +# ## Subsample ## DEPRECATED +# if(is.null(opt$subSample)==F) { +# opt$subSample <- as.numeric(opt$subSample) +# print(opt$subSample) +# +# cellSubset <- sample(rownames(pData(gbm_cds)),size = opt$subSample*length(rownames(pData(gbm_cds)))) +# gbm_cds <- gbm_cds[,cellSubset] +# } + + +#Add sample infos + +print(opt$sampleInfo) +sampleInfos <- strsplit(opt$sampleInfo,split=",")[[1]] +for (i in sampleInfos) { + print(i) + info <- strsplit(i,split = "=")[[1]][1] + print(info) + value <- strsplit(i,split = "=")[[1]][2] + print(value) + seurat@meta.data[,info] <- value +} + + +output <- list(seurat = seurat,featureData = fd) + +saveRDS(output,file = opt$outfile) + + diff --git a/R_src/load_10X_data.R b/R_src/load_10X_data.R new file mode 100644 index 0000000000000000000000000000000000000000..f32a6c19c47905bef393f7f536a8b3e7c4754fa0 --- /dev/null +++ b/R_src/load_10X_data.R @@ -0,0 +1,112 @@ +##------------------------------------------------------------------------------ +## L. Herault +##------------------------------------------------------------------------------ + +## ----------------------------------------------------------------------------- +## Libraries +## ----------------------------------------------------------------------------- + + +suppressMessages(library("getopt")) +suppressMessages(library("Matrix")) +suppressMessages(library("monocle")) + +source("R_src/funForLoading.R") + + +## ----------------------------------------------------------------------------- +## Command line args +## ----------------------------------------------------------------------------- + +spec = matrix(c( + 'help', 'h', 0, "logical", "Help about the program", + 'inputMatrixDir', 'b', 1, "character", "REQUIRED:10X expression data (matrix files directory path)", + 'outfile', 'f',1, "character", 'Output file path (default .10X_data.rds)', + 'cellranger', 'c',1, "character", "cellranger version used to generate matrix files (default 2)", + "subSample", "p",1, "character", "proportion of cell to subsample (by default all the cells are used (e.g. 0.5))", + "sampleInfo", "s", 1, "character", "sample information that will be added to pData with the following format: age=2_months,runDate=10_12_2017..", + "sampleName", "n", 1, "character", "if matrix file have the sampl name as prefix (name_matrix.mtx...) provide it here" + +), byrow=TRUE, ncol=5); + +opt = getopt(spec) + +# if help was asked, print a friendly message +# and exit with a non-zero error code + +args <- commandArgs() +if ( !is.null(opt$help) | is.null(opt$inputMatrixDir)) { + cat("loading 10X genomics single cells data and save it as a monocle R object.") + cat(getopt(spec, usage=TRUE)) + q(status=1) +} + +## ----------------------------------------------------------------------------- +## Processing data +## ----------------------------------------------------------------------------- + +# output +if(is.null(opt$outfile)) { + opt$outfile <- "10X_data.rds" +} else { + outdir <- paste(strsplit(opt$outfile,"/")[[1]][-length(strsplit(opt$outfile,"/")[[1]])],collapse="/") + dir.create(outdir, showWarnings=FALSE) +} + + +if(is.null(opt$cellranger)) { + opt$cellranger <- "2" +} + + +if (opt$cellranger == "3" | opt$cellranger == "4") { + gbm <- loadCellRangerMatrix_cellranger3(opt$inputMatrixDir,sample_name=opt$sampleName) +} else +{ + gbm <- loadCellRangerMatrix(opt$inputMatrixDir,sample_name=opt$sampleName) +} + + +fd <- gbm$fd +pd <- gbm$pd + +# Column 'symbol' is the one (from cellRanger workflow) that corresponds to featureData's gene short names. + +colnames(fd)[which(colnames(fd)=="symbol")]<- "gene_short_name" + +for (c in c(1:length(colnames(fd)))){ + if (is.na(colnames(fd)[c])) { + colnames(fd)[c] <- paste0("col_",c) + } +} + +gbm_cds <- newCellDataSet(gbm$exprs, + phenoData = new("AnnotatedDataFrame", data = pd), + featureData = new("AnnotatedDataFrame", data = fd), + expressionFamily = negbinomial.size()) + +## Subsample +if(is.null(opt$subSample)==F) { + opt$subSample <- as.numeric(opt$subSample) + print(opt$subSample) + + cellSubset <- sample(rownames(pData(gbm_cds)),size = opt$subSample*length(rownames(pData(gbm_cds)))) + gbm_cds <- gbm_cds[,cellSubset] +} + + +#Add sample infos + +print(opt$sampleInfo) +sampleInfos <- strsplit(opt$sampleInfo,split=",")[[1]] +for (i in sampleInfos) { + print(i) + info <- strsplit(i,split = "=")[[1]][1] + print(info) + value <- strsplit(i,split = "=")[[1]][2] + print(value) + pData(gbm_cds)[[info]] <- value +} + +saveRDS(gbm_cds,file = opt$outfile) + diff --git a/R_src/makeBranchedHeatmap.R b/R_src/makeBranchedHeatmap.R new file mode 100644 index 0000000000000000000000000000000000000000..b1cb8db38130e1514fe0523ff1920105a0bee829 --- /dev/null +++ b/R_src/makeBranchedHeatmap.R @@ -0,0 +1,267 @@ +regulons_heatmap2 <- function(monocle, + regulons, + first_traj_states=c(1,2), + second_traj_states=c(1,3), + cluster=3, + legend=TRUE, + add_annotation_row = F, + showTree = F, + clusterRow =T, + gaps_row = NULL, + legend_breaks = NA, + clusterFirstData = T, + colorStates = NULL, + breaksAdapt = FALSE, + clustering_method = "complete"){ + + + rownames(regulons) <- regulons[,1] + regulons <- regulons[,-1] + regulons_pos <- regulons[,which(grepl(x = colnames(regulons),pattern = "\\(n\\)|(\\+)"))] + + metaData_monocle <- pData(monocle) + breaklist <- seq(-5,5, length.out=100) + + + getMerge <- function(metaData_monocle,breaklist) { + + first_traj <- metaData_monocle[which(metaData_monocle$State %in% first_traj_states),] + + second_traj <- metaData_monocle[which(metaData_monocle$State %in% second_traj_states),] + + regulons_fit_first <- regulons_pos[rownames(first_traj),] + regulons_fit_second <- regulons_pos[rownames(second_traj),] + + for(i in colnames(regulons_pos)){ + if(!unique(is.na(second_traj[,i]))) { + set.seed(1) + gam_second <-gam(formula = second_traj[,i] ~ s(Pseudotime, bs ="cr"), data= second_traj) + regulons_fit_second[,i] <- gam_second$fitted.values + gam_first <-gam(formula = first_traj[,i] ~ s(Pseudotime, bs ="cr"), data= first_traj) + + # gam_second <-gam(formula = second_traj[,i] ~ s(Pseudotime,bs = "cs"), data= second_traj) + # regulons_fit_second[,i] <- gam_second$fitted.values + # gam_first <-gam(formula = first_traj[,i] ~ s(Pseudotime,bs = "cs"), data= first_traj) + + regulons_fit_first[,i] <- gam_first$fitted.values + } else { + regulons_fit_second[,i] <- rep(NA,dim(second_traj)[1]) + regulons_fit_first[,i] <- rep(NA,dim(first_traj)[1]) + } + + } + regulons_fit_second <- data.frame(regulons_fit_second, check.names = FALSE) + + regulons_fit_first <- data.frame(regulons_fit_first, check.names = FALSE) + + + max_pseudotime <- max(c(max(second_traj$Pseudotime), max(first_traj$Pseudotime))) + metaData_monocle$percent_Pseudotime <- 100*metaData_monocle$Pseudotime/max_pseudotime + second_traj$percent_Pseudotime <- 100*second_traj$Pseudotime/max_pseudotime + first_traj$percent_Pseudotime <- 100*first_traj$Pseudotime/max_pseudotime + + percent_state_second <- c() + + for (s in second_traj_states){ + percent_state <- floor((max(second_traj[which(second_traj$State == s), "Pseudotime"])*100)/max_pseudotime) + percent_state_second <- c(percent_state_second,percent_state) + } + + percent_state_second[length(percent_state_second)] <- 100 #set max at 100 it will be cut if necessary in the loop + + newdata_second <- data.frame(Pseudotime = seq(1, 100, by = 1), + Branch = as.factor(rep(c(second_traj_states),c(diff(c(0,percent_state_second)))))) + + + percent_state_first <- c() + for (s in first_traj_states){ + percent_state <- floor((max(first_traj[which(first_traj$State == s), "Pseudotime"])*100)/max_pseudotime) + percent_state_first <- c(percent_state_first,percent_state) + } + + percent_state_first[length(percent_state_first)] <- 100 #set max at 100 it will be cut if necessary in the loop + + + newdata_first <- data.frame(Pseudotime = seq(1, 100, by = 1), + Branch = as.factor(rep(c(first_traj_states),c(diff(c(0,percent_state_first)))))) + + + #Setting up the object for the score + cell_names_second <- list() # not used apparantly + regulons_score_percent_pseudotime_second <- data.frame(matrix(NA, nrow = 100, ncol=ncol(regulons_fit_second))) + colnames(regulons_score_percent_pseudotime_second) <- colnames(regulons_fit_second) + + cell_names_first <- list() + regulons_score_percent_pseudotime_first <- data.frame(matrix(NA, nrow = 100, ncol=ncol(regulons_fit_first))) + colnames(regulons_score_percent_pseudotime_first) <- colnames(regulons_fit_first) + + #mean score on the % pseudotime for each 1% (the mean of each regulons score is done on the cell having a pseudotime between i and i+1) + for(i in 2:nrow(newdata_second)){ + cell_names_second[[1]] <- rownames(second_traj[0<=second_traj$percent_Pseudotime & second_traj$percent_Pseudotime<=1,]) + cell_names_second[[i]] <- rownames(subset(second_traj, percent_Pseudotime>newdata_second[(i-1),1] & percent_Pseudotime<=newdata_second[i,1])) + regulons_score_percent_pseudotime_second[1,] <- apply(regulons_fit_second[rownames(second_traj[0<=second_traj$percent_Pseudotime & second_traj$percent_Pseudotime<=1,]),],2,mean) # should use cell_names + regulons_score_percent_pseudotime_second[i,] <- apply(regulons_fit_second[rownames(subset(second_traj, percent_Pseudotime>newdata_second[(i-1),1] & percent_Pseudotime<=newdata_second[i,1])),], 2, mean) + + cell_names_first[[1]] <- rownames(first_traj[0<=first_traj$percent_Pseudotime & first_traj$percent_Pseudotime<=1,]) + cell_names_first[[i]] <- rownames(subset(first_traj, percent_Pseudotime>newdata_first[(i-1),1] & percent_Pseudotime<=newdata_first[i,1])) + regulons_score_percent_pseudotime_first[1,] <- apply(regulons_fit_first[rownames(first_traj[0<=first_traj$percent_Pseudotime & first_traj$percent_Pseudotime<=1,]),],2,mean) + regulons_score_percent_pseudotime_first[i,] <- apply(regulons_fit_first[rownames(subset(first_traj, percent_Pseudotime>newdata_first[(i-1),1] & percent_Pseudotime<=newdata_first[i,1])),], 2, mean) + } + regulons_score_percent_pseudotime_second <- regulons_score_percent_pseudotime_second[which(rowSums(is.na(regulons_score_percent_pseudotime_second))!= ncol(regulons_score_percent_pseudotime_second)), ] + regulons_score_percent_pseudotime_first <- regulons_score_percent_pseudotime_first[which(rowSums(is.na(regulons_score_percent_pseudotime_first))!= ncol(regulons_score_percent_pseudotime_first)), ] + + regulons_score_percent_pseudotime_merge <- as.data.frame(rbind(regulons_score_percent_pseudotime_first[nrow(regulons_score_percent_pseudotime_first):1,], + regulons_score_percent_pseudotime_second), check.names =FALSE) + + + + + + + annot_list_merge <- data.frame(c(as.character(newdata_first[nrow(regulons_score_percent_pseudotime_first):1,2]), as.character(newdata_second[1:nrow(regulons_score_percent_pseudotime_second),2]))) + colnames(annot_list_merge) <- "State" + rownames(annot_list_merge) <- rownames(regulons_score_percent_pseudotime_merge) + + + #Setting color panel for the heatmap + breaklist <- seq(-5,5, length.out=100) + + colnames(regulons_score_percent_pseudotime_merge) <- str_split_fixed(string = colnames(regulons_score_percent_pseudotime_merge),pattern = "\\(",n=2)[,1] + res <- list("score" = regulons_score_percent_pseudotime_merge, + "gap" = regulons_score_percent_pseudotime_first, + "annot" = annot_list_merge) + + return(res) + } + + regulons_score_percent_pseudotime_merge_young <- getMerge(metaData_monocle[which(metaData_monocle$AGE == "Young"),],breaklist) + rownames(regulons_score_percent_pseudotime_merge_young$score) <- paste0("young_",c(1:length(rownames(regulons_score_percent_pseudotime_merge_young$score)))) + rownames(regulons_score_percent_pseudotime_merge_young$annot) <- paste0("young_",c(1:length(rownames(regulons_score_percent_pseudotime_merge_young$score)))) + + + + regulons_score_percent_pseudotime_merge_old <- getMerge(metaData_monocle[which(metaData_monocle$AGE == "Old"),],breaklist) + rownames(regulons_score_percent_pseudotime_merge_old$score) <- paste0("old_",c(1:length(rownames(regulons_score_percent_pseudotime_merge_old$score)))) + rownames(regulons_score_percent_pseudotime_merge_old$annot) <- paste0("old_",c(1:length(rownames(regulons_score_percent_pseudotime_merge_old$annot)))) + + + regulons_score_percent_pseudotime_merge <- as.data.frame(rbind(regulons_score_percent_pseudotime_merge_young$score, + regulons_score_percent_pseudotime_merge_old$score), check.names =FALSE) + + scale_regulons_score_percent_pseudotime_merge <- data.frame(scale(regulons_score_percent_pseudotime_merge), check.names=FALSE) + + annot_list_merge <- rbind(regulons_score_percent_pseudotime_merge_young$annot, + regulons_score_percent_pseudotime_merge_old$annot) + + + #Setting color legend for the different States + annot_color <- list() + + states <- unique(c(first_traj_states,second_traj_states)) + + if (is.null(colorStates)) { + colorStates <- hue_pal()(length(levels(monocle$State)))[states] + } + c <- 1 + + for (s in states) { + annot_color$State[[as.character(s)]] <- colorStates[c] + c <- c+1 + } + + + gaps_col = c(nrow(regulons_score_percent_pseudotime_merge_young$gap),nrow(regulons_score_percent_pseudotime_merge_young$score),nrow(regulons_score_percent_pseudotime_merge_young$score) + nrow(regulons_score_percent_pseudotime_merge_old$gap)) + + + + + if(clusterFirstData) { + youngDataOnly <- t(scale_regulons_score_percent_pseudotime_merge[which(startsWith(rownames(scale_regulons_score_percent_pseudotime_merge),"young")),]) + + + ## hclust + + hclustRes <- hclust( dist(youngDataOnly, method = "euclidian"),method = clustering_method) + + + gapsLabelHclust <- cutree(hclustRes,k =cluster) + + gapsLabel <- gapsLabelHclust + regulonsYoungOrder <- rownames(youngDataOnly)[hclustRes$order] + + gaps <- gapsLabel[regulonsYoungOrder] + + gaps_row <- c() + + for (g in c(1:(length(gaps)-1))) { + if (gaps[g] != gaps[g+1]) { + gaps_row <- c(gaps_row,g) + } + } + + scale_regulons_score_percent_pseudotime_merge <- scale_regulons_score_percent_pseudotime_merge[,regulonsYoungOrder] + + + cutTreeRow = NA + + if (breaksAdapt) { + breaklist_2 <- breaklist + breaklist_2[length(breaklist)] <- max(max(scale_regulons_score_percent_pseudotime_merge),max(breaklist)) + breaklist_2[1] <- min(min(scale_regulons_score_percent_pseudotime_merge), min(breaklist)) + } else { + breaklist_2 <- NA + } + heatmap <- pheatmap(t(scale_regulons_score_percent_pseudotime_merge), + show_colnames = FALSE, + cluster_cols = FALSE, + gaps_col = gaps_col, + color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdYlBu")))(length(breaklist)), + annotation_col = annot_list_merge, + annotation_colors = annot_color, + legend=legend, + annotation_legend=legend, + cluster_rows = FALSE, + cutree_rows = cutTreeRow, + gaps_row = gaps_row, + breaks = breaklist_2, + na_col = "black", + legend_breaks = legend_breaks) + + + + } else { + + cutTreeRow = cluster + + if (breaksAdapt) { + breaklist_2 <- breaklist + breaklist_2[length(breaklist)] <- max(max(scale_regulons_score_percent_pseudotime_merge),max(breaklist)) + breaklist_2[1] <- min(min(scale_regulons_score_percent_pseudotime_merge), min(breaklist)) + } else { + breaklist_2 <- NA + } + + heatmap <- pheatmap(t(scale_regulons_score_percent_pseudotime_merge), + show_colnames = FALSE, + cluster_cols = FALSE, + gaps_col = gaps_col, + color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdYlBu")))(length(breaklist)), + annotation_col = annot_list_merge, + annotation_colors = annot_color, + legend=legend, + annotation_legend=legend, + cluster_rows = clusterRow, + cutree_rows = cutTreeRow, + clustering_method = clustering_method, + breaks = breaklist_2, + na_col = "black", + legend_breaks = legend_breaks) + + + + } + return(heatmap) +} + + + diff --git a/R_src/makeRegulonTableCL.R b/R_src/makeRegulonTableCL.R new file mode 100644 index 0000000000000000000000000000000000000000..083bd934fbd9b84c1eb3ddbc8dc59fe9885f7eca --- /dev/null +++ b/R_src/makeRegulonTableCL.R @@ -0,0 +1,156 @@ +##------------------------------------------------------------------------------ +## L. Herault +##------------------------------------------------------------------------------ + +## ----------------------------------------------------------------------------- +## Libraries +## ----------------------------------------------------------------------------- + +suppressMessages(library(getopt)) +suppressMessages(library(stringr)) +suppressMessages(library(plyr)) + + +# Analysis of seurat 3 integration and clusternig workflow + +## ----------------------------------------------------------------------------- +## Command line args +## ----------------------------------------------------------------------------- + +spec = matrix(c( + 'help', 'h', 0, "logical", "Help about the program", + 'outdir', 'o',1, "character", 'Outdir path (default ./)', + "tfTested", 't',1, "character", "TF tested list (eg TF with a motif in the scenic database", + "regulonJson", 'r',1, "character", "Regulon main Json file", + "regulonJsonSupp", "s", 1, "character", "Regulon supp Json files (separated by +)", + "subConditionName", "n", 1, "character", "correspondong names (separated by +)" + +), byrow=TRUE, ncol=5); + + + +opt = getopt(spec) + + +# if help was asked, print a friendly message +# and exit with a non-zero error code + +args <- commandArgs() +if ( !is.null(opt$help) | is.null(opt$regulonJson)) { + cat("Create regulons tabme") + cat(getopt(spec, usage=TRUE)) + q(status=1) +} + + +if (is.null(opt$logfc_threshold)) { + opt$logfc_threshold = 0.25 +} +if (is.null(opt$norm_method)) { + opt$norm_method = "logNorm" +} + +# Printing option before running + +for (o in names(opt)) { + print(paste(o,":", opt[[o]])) +} + +# For testing +# setwd("/shared/projects/scRNA_HSPC_Aging/scRNA_infer/") +# +# opt <- list() +# opt$tfTested <- "output/publicData/mm_mgi_tfs.txt" +# +# opt$regulonJson <- "output/ScenicRNA_multipleRuns/cis_target_maskDropouts/aggregatedRegulonsMetaTest.json" +# +# opt$regulonJsonSupp <- "output/ScenicRNA_multipleRuns_young/cis_target_maskDropouts/aggregatedRegulonsMeta.json+output/ScenicRNA_multipleRuns_old/cis_target_maskDropouts/aggregatedRegulonsMeta.json" +# opt$subConditionName <- "young+old" +# opt$outdir <- 'output/regulonAnalysis/' + +dir.create(opt$outdir,showWarnings = F,recursive = T) + +jsonSupp <- strsplit(opt$regulonJsonSupp,split = "\\+")[[1]] +subConditionName <- strsplit(opt$subConditionName,split = "\\+")[[1]] + +suppData <- cbind(jsonSupp,subConditionName) + +### + + +tf_mouse <- read.table(opt$tfTested) +length(tf_mouse$V1) + +tf_tested <- tf_mouse$V1 + + +# Interaction table regulons and their target genes + + +makeTable <- function(regulonJsonAndName) { + + regulonJsonAndName <- unlist(regulonJsonAndName) + regulonJson <- regulonJsonAndName[1] + if(!is.na(regulonJsonAndName[2])) { + suffixCol <- paste0("_",regulonJsonAndName[2]) + } else { + suffixCol <- "" + } + print(suffixCol) + + regulons <- RJSONIO::fromJSON(regulonJson) + + regulonTable<- matrix(ncol = 6) + colnames(regulonTable) <- c("regulon","gene","mor", + paste0("recoveredTimes",suffixCol), + paste0("importanceMean",suffixCol), + paste0("importanceSd",suffixCol)) + + + for (i in names(regulons)) { + mor = 1 + if (endsWith(i,"(-)")) { + mor <- -1 + } + for (t in names(regulons[[i]])) { + importanceMean <- NA + importanceSd <- NA + tf <- str_split_fixed(i,"\\(",n=2)[,1] + if(tf != t) { + importanceMean <- regulons[[i]][[t]][[2]] + importanceSd <- regulons[[i]][[t]][[3]] + } + add <- unlist(c(tf,t,mor,regulons[[i]][[t]][[1]][1],importanceMean,importanceSd)) + regulonTable<- rbind(regulonTable,add) + } + } + regulonTable <- data.frame(regulonTable[-1,]) ## first line is NA + regulonTable$interaction <- paste(regulonTable$regulon,regulonTable$gene,regulonTable$mor,sep = "_") + rownames(regulonTable) <- regulonTable$interaction + return(regulonTable) +} + + +mainRegulonTable <- makeTable(regulonJsonAndName = c(opt$regulonJson, NULL)) + +if (!is.null(opt$subConditionName)) { + conditions <- mapply(list, jsonSupp, subConditionName, SIMPLIFY=F) + suppRegulonTables <- lapply(conditions, makeTable) + ## write the supp tables + for (c in (1:length(conditions))) { + write.table(suppRegulonTables[[c]],paste0(opt$outdir,"/",conditions[[c]][[2]],"RegulonTable.tsv"),sep = '\t') + } +} + +#Add interaction recovered in one or two condition in the main table + +for (table in suppRegulonTables) { + mainRegulonTable <- cbind(mainRegulonTable,table[rownames(mainRegulonTable),c(4:6)]) +} + +## Write the tables +write.table(mainRegulonTable,paste0(opt$outdir,"/mainRegulonTable.tsv"),sep = '\t',quote = F,row.names = F) + + + + diff --git a/R_src/newMergingATAC_temp.R b/R_src/newMergingATAC_temp.R new file mode 100644 index 0000000000000000000000000000000000000000..299336fd7f0ccef107fca5ed5655243dd5747fc7 --- /dev/null +++ b/R_src/newMergingATAC_temp.R @@ -0,0 +1,156 @@ +suppressMessages(library(Signac)) +suppressMessages(library(Seurat)) +suppressMessages(library(getopt)) +suppressMessages(library(gridExtra)) +suppressMessages(library(scales)) +suppressMessages(library(grid)) +suppressMessages(library(S4Vectors)) +suppressMessages(library(patchwork)) +suppressMessages(library(ggplot2)) + +# set.seed(2021) +# dir.create("/shared/projects/scRNA_HSPC_Aging/GMP_PRARA/output/ATAC/integration_v2/", recursive = T) +outdir <- "/shared/projects/scRNA_HSPC_Aging/GMP_PRARA/output/ATAC/integration_v2/" + +atac_CT <- readRDS("/shared/projects/scRNA_HSPC_Aging/GMP_PRARA/output/ATAC/smp/Ctrl/atac.rds") +atac_RA <- readRDS("/shared/projects/scRNA_HSPC_Aging/GMP_PRARA/output/ATAC/smp/RA/atac.rds") + +granges_CT <- granges(atac_CT) +granges_RA <- granges(atac_RA) + +combined.peaks <- reduce(x = c(granges_CT, granges_RA)) + +fragment_CT <- Fragments(atac_CT) +fragment_RA <- Fragments(atac_RA) + +counts_CT <- FeatureMatrix( + fragments = fragment_CT, + features = combined.peaks, + cells = rownames(atac_CT@meta.data) +) + +counts_RA <- FeatureMatrix( + fragments = fragment_RA, + features = combined.peaks, + cells = rownames(atac_RA@meta.data) +) + + +# Create the objects +## Chromatin then seurat +CT_assay <- CreateChromatinAssay(counts = counts_CT, fragments = fragment_CT) +RA_assay <- CreateChromatinAssay(counts = counts_RA, fragments = fragment_RA) + +CT_seurat <- CreateSeuratObject(CT_assay, assay = "peaks", meta.data = atac_CT@meta.data) +RA_seurat <- CreateSeuratObject(RA_assay, assay = "peaks", meta.data = atac_RA@meta.data) + +CT_seurat$condition <- "Control" +RA_seurat$condition <- "Treated" + + +intersecting.regions <- findOverlaps(query = CT_seurat, subject = RA_seurat) +intersections.2 <- unique(queryHits(intersecting.regions)) +# choose a subset of intersecting peaks +peaks.use <- sort(granges(CT_seurat)[sample(intersections.2, size = 10000, replace = FALSE)]) + +peaks <- FeatureMatrix( + fragments = Fragments(RA_seurat), + features = peaks.use, + cells = colnames(RA_seurat) +) + +# create a new assay and add it to the second sample dataset +RA_seurat[['CTSamplePeaks']] <- CreateChromatinAssay( + counts = peaks, + min.cells = 1, + ranges = peaks.use, + genome = 'mm10' +) + +DefaultAssay(RA_seurat) <- 'CTSamplePeaks' +RA_seurat <- RunTFIDF(RA_seurat) ##normalisation + +### Integration +# create a new assay in the first sample-ATAC-seq dataset containing the common peaks +peaknames <- GRangesToString(grange = peaks.use) + +CT_seurat[['CTSamplePeaks']] <- CreateChromatinAssay( + counts <- GetAssayData(CT_seurat, assay = "peaks", slot = "counts")[peaknames, ], + ranges = peaks.use, + genome = "mm10" +) + +# run TF-IDF for the new assay +DefaultAssay(CT_seurat) <- "CTSamplePeaks" +CT_seurat <- RunTFIDF(CT_seurat) + +# CT_seurat <- RenameCells(CT_seurat[["peaks"]], +# new.names = paste0(colnames(CT_seurat[["peaks"]], "_CT"))) +# CT_seurat <- RunTFIDF(CT_seurat) +# RA_seurat <- RunTFIDF(RA_seurat) +print("Find Anchors") +anchors <- FindIntegrationAnchors( + object.list = list(CT_seurat, RA_seurat), + anchor.features = rownames(CT_seurat), + assay = c('CTSamplePeaks', 'CTSamplePeaks'), + k.filter = NA +) + +# integrate data and create a new merged object +print("Integrate data") +integrated <- IntegrateData( + anchorset = anchors, + dims = 2:30, + preserve.order = TRUE +) + +print("RunSVD") +integrated <- RunSVD( + object = integrated, + n = 30, + reduction.name = 'integratedLSI' +) + +print("RunUMAP") +integrated <- RunUMAP( + object = integrated, + dims = 2:30, + reduction = 'integratedLSI' +) + +print("Save RDS") +saveRDS(integrated, paste0(outdir, "atac_merged_integrated.rds")) + +UMAP_newInt <- DimPlot(integrated, group.by = "condition") + + +png(paste0(outdir,"UMAP_merge_integrated_condition.png"),units = "cm",height = 15,width=15,res = 300) +UMAP_newInt +dev.off() + +# combined_seurat <- merge( +# x = CT_seurat, +# y = RA_seurat, +# add.cell.ids = c("CT", "RA") +# ) +# +# combined_seurat <- RunTFIDF(combined_seurat) +# combined_seurat <- FindTopFeatures(combined_seurat, min.cutoff = 'q75') +# combined_seurat <- RunSVD(combined_seurat, n = 40) +# +# png(paste0(outdir,"DepthCor.png"),units = "in",height = 6,width=6,res = 300) +# DepthCor(combined_seurat) +# dev.off() +# +# combined_seurat <- RunUMAP(combined_seurat, dims = 1:30, reduction = "lsi") +# +# saveRDS(combined_seurat, paste0(outdir, "atac_integrated.rds")) + +# atac_int <- RunUMAP(atac_int, dims = 2:30, reduction = "lsi") +# +# UMAP_newInt <- DimPlot(atac_int, group.by = "condition") +# +# +# png(paste0(outdir,"UMAP_condition.png"),units = "cm",height = 15,width=15,res = 300) +# UMAP_newInt +# dev.off() diff --git a/R_src/orderingSeurat3IntegratedMonocleCL.R b/R_src/orderingSeurat3IntegratedMonocleCL.R new file mode 100644 index 0000000000000000000000000000000000000000..b44f6eecc0b5da6c45b392cdbbe6ddf2f3c4dc55 --- /dev/null +++ b/R_src/orderingSeurat3IntegratedMonocleCL.R @@ -0,0 +1,314 @@ +##------------------------------------------------------------------------------ +## L. Herault +##------------------------------------------------------------------------------ + +## ----------------------------------------------------------------------------- +## Libraries +## ----------------------------------------------------------------------------- + +suppressMessages(library(Seurat)) +suppressMessages(library(monocle)) +suppressMessages(library(gridExtra)) +suppressMessages(library(RColorBrewer)) +suppressMessages(library(getopt)) + +# Monocle 2 pseudotime ordering with seurat intgrated and scaled data + +## ----------------------------------------------------------------------------- +## Command line args +## ----------------------------------------------------------------------------- + +spec = matrix(c( + 'help', 'h', 0, "logical", "Help about the program", + 'inputSeurat', 'i', 1, "character", "REQUIRED: 10X data prepared as seurat object with clustering results (.RDS generated by prepare_data.R).", + 'outdir', 'o',1, "character", 'Outdir path (default ./)', + "num_comp", 'n',1,"numeric", "Number of dimension in the trajectory space (default 2)", + "signatureFile", 's',1, "character", "Signature rds file path", + "clustCol", "k", 1, "character", "cluster column in the seurat meta.data to plot them in the trajectory.\n Also used to identify the cluster to remove if provided see below", + "cluster", "r", 1, "character", "exclude a cluster from the pseudotime ordering.\n Cluster column in the seurat meta.data need to be provided see below", + 'removeClusterWithSignature', "d", 1, "character", "Name of a signature. Exclude the cluster with the highest enrichment for the given signature. \nCluster signature enrichment table need to be provided see below", + 'signatureEnrichmentTable', 't', 1, "character", "Path to the table with the signature enrichment per cluster for -d option", + "startingClust", "c", 1, "character", "Starting clust for ordering cells", + "startingCt", "e", 1, "character", "Starting cell type for ordering cells"), +byrow=TRUE, ncol=5) + + +opt = getopt(spec) + +# if help was asked, print a friendly message +# and exit with a non-zero error code + +args <- commandArgs() +if ( !is.null(opt$help) | is.null(opt$inputSeurat)) { + cat("Monocle 2 pseudotime ordering with seurat intgrated and scaled data") + cat(getopt(spec, usage=TRUE)) + q(status=1) +} + +#set default arguments values + +if (is.null(opt$num_comp)) { + opt$num_comp <- 2 +} + +if (is.null(opt$outdir)) { + opt$outdir = "./" +} + +dir.create(opt$outdir,recursive = T,showWarnings = F) + +seurat <- readRDS(opt$inputSeurat) + +print(opt) + +seurat$numclust <- seurat@meta.data[,opt$clustCol] + +if (!is.null(opt$cluster)) { + print(paste("removing cluster",opt$cluster,"in seurat metadata column",opt$clusCol,"before ordering...")) + Idents(seurat) <- opt$clustCol + seurat = subset(seurat, idents = opt$cluster,invert = TRUE) + +} + +if (!is.null(opt$removeClusterWithSignature)) { + if (!is.null(opt$signatureEnrichmentTable)) { + sigClustTable <- read.table(opt$signatureEnrichmentTable) + print(head(sigClustTable)) + clustToRemove <- rownames(sigClustTable)[which.min(sigClustTable[,opt$removeClusterWithSignature])] + clustToRemove <- strsplit(clustToRemove,split="_")[[1]][2] + + print(paste("removing cluster",clustToRemove,"in seurat metadata column",opt$clusCol,"before ordering")) + Idents(seurat) <- opt$clustCol + seurat = subset(seurat, idents = clustToRemove,invert = TRUE) + + } else { + print("You need to provide a table with the selected signature score enrichment per cluster") + q(status=1) + } +} + +DefaultAssay(seurat) <- "integrated" +pd <- new("AnnotatedDataFrame", data = seurat@meta.data) +fd <- new("AnnotatedDataFrame", data = data.frame(gene_short_name = rownames(seurat))) +rownames(fd) <- fd$gene_short_name + +monocle <- newCellDataSet(GetAssayData(object = seurat, slot = "scale.data")[,rownames(seurat@meta.data)], + phenoData = pd, + featureData = fd, + lowerDetectionLimit = 0.1, + expressionFamily = uninormal()) + +fData(monocle)$use_for_ordering <- TRUE + + +print("Reducing dimension by DDRTree...") + +monocle <- reduceDimension(monocle, + max_components = opt$num_comp, + reduction_method = 'DDRTree', + norm_method = "none", + pseudo_expr = 0, + verbose = T ) + +print("Ordering cells...") + +monocle <- orderCells(monocle) +saveRDS(monocle,paste(opt$outdir,"/monocleRamdomStart.rds",sep ="")) + + +GM_state_predicted <- function(cds,cellType){ + if (length(unique(pData(cds)$State)) > 1){ + + #Preselect terminal states + tip_leaves <- names(which(igraph::degree(minSpanningTree(cds)) == 1)) + tip_leaves <- sub(x=tip_leaves,pattern="Y_",replacement="") + print(tip_leaves) + pData(cds)$potentialRootCells <- cds@auxOrderingData$DDRTree$pr_graph_cell_proj_closest_vertex %in% tip_leaves + terminal_states <- unique(pData(cds)$State[which(pData(cds)$potentialRootCells)]) + LT_counts <- table(pData(cds)$State, pData(cds)$predicted)[,cellType] + print(LT_counts) + LT_prop <- LT_counts/rowSums(table(pData(cds)$State, pData(cds)$predicted)) + print(LT_prop) + LT_propF <- LT_prop[which(labels(LT_prop) %in% terminal_states)] + print("LT proportion for terminal states") + print(LT_propF) + print("Initial state:") + #print(which(LT_propF == max(LT_propF,na.rm = T))) + Istate <- as.numeric(names(LT_propF)[which(LT_propF == max(LT_propF,na.rm = T))]) + print(Istate) + return(Istate) + } else { + return (1) + } +} + + + +GM_state_numclust <- function(cds,num){ + if (length(unique(pData(cds)$State)) > 1){ + + #Preselect terminal states + tip_leaves <- names(which(igraph::degree(minSpanningTree(cds)) == 1)) + tip_leaves <- sub(x=tip_leaves,pattern="Y_",replacement="") + print(tip_leaves) + pData(cds)$potentialRootCells <- cds@auxOrderingData$DDRTree$pr_graph_cell_proj_closest_vertex %in% tip_leaves + terminal_states <- unique(pData(cds)$State[which(pData(cds)$potentialRootCells)]) + S_counts <- table(pData(cds)$State, pData(cds)$numclust)[,num] + print(S_counts) + S_prop <- S_counts/rowSums(table(pData(cds)$State, pData(cds)$numclust)) + print(S_prop) + S_propF <- S_prop[which(labels(S_prop) %in% terminal_states)] + print("S proportion for terminal states") + print(S_propF) + print("Initial state:") + #print(which(S_propF == max(S_propF,na.rm = T))) + Istate <- as.numeric(names(S_propF)[which(S_propF == max(S_propF,na.rm = T))]) + print(Istate) + return(Istate) + } else { + return (1) + } +} + + + +if (!is.null(pData(monocle)$predicted) & !is.null(opt$startingCt)) { + monocle <- orderCells(monocle,root_state = GM_state_predicted(monocle,cellType = opt$cellType)) +} else { + print(paste0("ordering cells with cluster ", opt$startingClust, " proportion")) + monocle <- orderCells(monocle,root_state = GM_state_numclust(monocle,num = opt$startingClust)) +} + + + + +png(paste(opt$outdir,"/cellTrajectoryStates.png",sep ="")) +plot_cell_trajectory(monocle) +dev.off() + +png(paste(opt$outdir,"/cellTrajectoryPhases.png",sep ="")) +plot_cell_trajectory(monocle,color_by = "phases") +dev.off() + + +png(paste(opt$outdir,"/sampleName.png",sep = "")) +plot_cell_trajectory(monocle,color_by = "sampleName") +dev.off() + +png(paste(opt$outdir,"/age.png",sep = "")) +plot_cell_trajectory(monocle,color_by = "AGE") +dev.off() + + +if (!is.null(pData(monocle)$predicted)) { +png(paste(opt$outdir,"/cellTrajectoryCellTypePredicted.png",sep ="")) +plot_cell_trajectory(monocle,color_by = "predicted") +dev.off() +} + +png(paste(opt$outdir,"/cellTrajectorySeuratClusters.png",sep = "")) +plot_cell_trajectory(monocle,color_by=opt$clustCol) +dev.off() + + + +signatures <- readRDS(opt$signatureFile) + +dir.create(paste(opt$outdir,"/localizeSeuratCluster",sep ="")) +dir.create(paste(opt$outdir,"/cellSignatures",sep ="")) + +sigName <- names(signatures) + +for (sig in sigName) { + print(sig) + png(paste(opt$outdir,"/cellSignatures/",sig,".png",sep = "")) + print(plot_cell_trajectory(monocle,color_by = sig) + scale_color_gradient(low = "yellow",high = "red")) + dev.off() +} + +for (c in unique(pData(monocle)[,opt$clustCol])) { + pData(monocle)$clustOfInterest <- FALSE + pData(monocle)[which(pData(monocle)[,opt$clustCol] == c),"clustOfInterest"] <- TRUE + png(paste(opt$outdir,"/localizeSeuratCluster/cluster",c,".png",sep="")) + print(plot_cell_trajectory(monocle,color_by="clustOfInterest")) + dev.off() +} + + +#Save monocle object +saveRDS(monocle,paste(opt$outdir,"/monocle.rds",sep ="")) + +#add state monocle info to seurat metadata +if (unique(rownames(pData(monocle)) == rownames(seurat@meta.data))) { + seurat@meta.data$State_monocle <- pData(monocle)$State +} + +png(paste(opt$outdir,"/tSNE_DDRTree_State.png",sep =""),width = 1600,height = 800) +grid.arrange(plot_cell_trajectory(monocle),DimPlot(object = seurat,group.by = "State_monocle"), ncol =2) +dev.off() + +png(paste(opt$outdir,"/tSNE_DDRTree_samples.png",sep =""),width = 1600,height = 800) +grid.arrange(plot_cell_trajectory(monocle,group_by = "sampleName"),DimPlot(object = seurat,group.by = "sampleName"), ncol =2) +dev.off() + +p1 <- plot_cell_trajectory(monocle,color_by= opt$clustCol) +p2 <- DimPlot(object = seurat,group.by = opt$clustCol) + theme(legend.position="none") + +png(paste(opt$outdir,"/tSNE_DDRTree_Clusters.png",sep =""),width = 1600,height = 800) +grid.arrange(p1,p2, ncol =2) +dev.off() + +#Some interesting genes + +# png(paste(opt$outdir,"/geneOfInterest.png",sep ="")) +# plot_genes_in_pseudotime(monocle[c("Cd34","Procr","Ly6a","Cd48")]) +# dev.off() + + +p3 <- plot_cell_trajectory(monocle) +p4 <- DimPlot(object = seurat,group.by = "State_monocle") + + +png(paste(opt$outdir,"/tSNE_DDRTree_State.png",sep =""),width = 1600,height = 800) +grid.arrange(p3,p4, ncol =2) +dev.off() + + + +p11 <- plot_cell_trajectory(monocle,color_by= opt$clustCol) + theme(legend.position="none") +p22 <- DimPlot(object = seurat,group.by = opt$clustCol) + theme(legend.position="none") +p33 <- FeaturePlot(object = seurat,features = "Total_mRNAs") +p44 <- plot_cell_trajectory(monocle,color_by= "phases") + + +png(paste(opt$outdir,"/tSNE_DDRTree_All_fig.png",sep =""),width = 1600,height = 1600) +grid.arrange(p33,p22,p44,p11, ncol =2) +dev.off() + +saveRDS(seurat,paste(opt$outdir,"/seurat_with_monocleStates.rds",sep ="")) + + +monocleRaw <- monocle + +DefaultAssay(seurat) <- "RNA" + +monocleRaw@assayData$exprs <- GetAssayData(seurat,slot = "counts") +fData(monocleRaw) <- data.frame(gene_short_name = rownames(seurat)) +rownames(fData(monocleRaw)) <- fData(monocleRaw)$gene_short_name +monocleRaw@expressionFamily <- negbinomial.size() + +monocleRaw <- estimateSizeFactors(monocleRaw) +monocleRaw <- estimateDispersions(monocleRaw) + +#png(paste(opt$outdir,"/geneOfInterestwithCounts.png",sep ="")) +#plot_genes_in_pseudotime(monocleRaw[c("Cd34","E2f1","Cd48")]) +#dev.off() + + +#png(paste(opt$outdir,"/geneOfInterestBranchedwithCounts.png",sep ="")) +#plot_genes_branched_pseudotime(monocleRaw[c("Cd34","E2f1","Cd48")]) +#dev.off() + +saveRDS(monocleRaw,paste(opt$outdir,"/monocleWithCounts.rds",sep ="")) + + diff --git a/R_src/prepareAUCseuratEnhancePromoseuratCL.R b/R_src/prepareAUCseuratEnhancePromoseuratCL.R new file mode 100644 index 0000000000000000000000000000000000000000..b6bf5dac9233e3d55ffb578478b0889f100510c7 --- /dev/null +++ b/R_src/prepareAUCseuratEnhancePromoseuratCL.R @@ -0,0 +1,131 @@ +#-----------------------------------------------------# +# Add promoter and enhancer atac + Regulon assay on RNA +#-----------------------------------------------------# + +suppressMessages(library(getopt)) +suppressMessages(library(Signac)) +suppressMessages(library(Seurat)) + +# library(patchwork) + + +featureDiff <- function(seurat,cells.1,cells.2,feature) { + data <- GetAssayData(seurat,slot = "data") + total.diff <- mean(data[feature,cells.1]) - mean(data[feature,cells.2]) + return(total.diff) +} + + +##-------------------------------------------------------------------------## +## Option list +##-------------------------------------------------------------------------## + +spec = matrix(c( + 'help', 'h', 0, "logical", "Help about the program", + 'inputAtacRDS', 'a', 1, "character", "REQUIRED: 10X data prepared as seurat/signac object.", + 'inputRNARDS', 'i', 1, "character", "REQUIRED: 10X data prepared as seurat object.", + 'outdir', 'o',1, "character", 'Outdir path (default ./)', + 'regulonsAUC', 'r',1, "character", "Regulons AUC file" +), byrow=TRUE, ncol=5); + +opt = getopt(spec) + +args <- commandArgs() +if ( !is.null(opt$help) | is.null(opt$inputAtacRDS)) { + cat("Create influence graph from regulon table") + cat(getopt(spec, usage=TRUE)) + q(status=1) +} + +seurat_ATAC <- readRDS(opt$inputAtacRDS) +DefaultAssay(seurat_ATAC) <- "peaks" + +seurat_RNA <- readRDS(opt$inputRNARDS) + +regulons.AUC <- read.table(opt$regulonsAUC, header = T, sep = ",", check.names = F, row.names = 1) +regulons.AUC <- regulons.AUC[,colnames(regulons.AUC)[which(endsWith(colnames(regulons.AUC),suffix = "+)"))]] + +outdir <- opt$outdir +# dir.create(outdir, recursive = T) + +#-----------------------------------------------------# +# Create promoter and enhancer assay +#-----------------------------------------------------# +print("Starting promoter activity") +# Promoter +-3kbp from TSS +gene.act.promotor <- GeneActivity(seurat_ATAC, extend.upstream = 3000, extend.downstream = 3000) + +seurat_ATAC[["promoter"]] <- CreateAssayObject(counts = gene.act.promotor) + +seurat_ATAC <- NormalizeData( + object = seurat_ATAC, + assay = "promoter", + normalization.method = "LogNormalize", + scale.factor = median(seurat_ATAC$nCount_promoter) +) + + +# Enhancer +-50kb minus +-3kb from TSS +print("Starting enhancer activity") +gene.act.enhancer <- GeneActivity(seurat_ATAC, extend.upstream = 50000, extend.downstream = 50000) + +gene.act.enhancer.final <- gene.act.enhancer - gene.act.promotor + +seurat_ATAC[["enhancer"]] <- CreateAssayObject(counts = gene.act.enhancer.final) + +seurat_ATAC <- NormalizeData( + object = seurat_ATAC, + assay = "enhancer", + normalization.method = "LogNormalize", + scale.factor = median(seurat_ATAC$nCount_enhancer) +) +print("Savin seurat file annotated") +saveRDS(seurat_ATAC, file = paste0(outdir, "/TransferRnaAtac_04/SignacMotif_2/", "ATAC_int_enhancerPromoter.rds")) + +#-----------------------------------------------------# +# Add AUCell assay to our seurat object +#-----------------------------------------------------# + +print("Start adding AUCell Assay") +auCell_data <- t(regulons.AUC) + +seurat_RNA[["AUCell"]] <- CreateAssayObject(data=auCell_data) + + +#find cluster regulon markers +Idents(seurat_RNA) <- "FinalCluster" + +#Set AUCell slot +DefaultAssay(seurat_RNA) <- "AUCell" + + +clusterRegulon <- FindAllMarkers(seurat_RNA, + only.pos = T, + logfc.threshold= 0, + pseudocount.use = 1, + min.pct = 0.1) + +# Compute true mean difference in score because Seurat comput only logFC +# The featureDiff functions is loaded from ../R_src/computeDiffFun.R file + +clusterRegulon$avg_diff <- NA + +for (rm in rownames(clusterRegulon)) { + feature <- clusterRegulon[rm,"gene"] + cells.1 <- colnames(seurat_RNA)[which(seurat_RNA$FinalCluster == clusterRegulon[rm,"cluster"])] + cells.2 <- colnames(seurat_RNA)[which(seurat_RNA$FinalCluster != clusterRegulon[rm,"cluster"])] + clusterRegulon[rm,"avg_diff"] <- featureDiff(seurat_RNA,cells.1,cells.2,feature) +} + +# cut off on p adjusted value +clusterRegulon <- clusterRegulon[which(clusterRegulon$p_val_adj < 0.05 & clusterRegulon$avg_diff > 0.002),] + + +# rename some columns +colnames(clusterRegulon) <- c("p_val","avg_logFC","pct.1","pct.2","p_val_adj","cluster","regulon","avg_diff") + +# order columns +clusterRegulon <- clusterRegulon[,c("p_val","pct.1","pct.2","p_val_adj","cluster","regulon","avg_diff")] + +saveRDS(seurat_RNA, file = paste0(outdir, "RNA/regulonAnalysis/", "seuratAUC.rds")) +write.table(clusterRegulon, paste0(outdir, "RNA/regulonAnalysis/", "regulonMarkersCluster.tsv"), sep = "\t", quote = F, row.names = T, col.names = T) diff --git a/R_src/prepareCL.R b/R_src/prepareCL.R new file mode 100644 index 0000000000000000000000000000000000000000..9af18a9ec67e42d1f30f0a280c6e97497b611b5f --- /dev/null +++ b/R_src/prepareCL.R @@ -0,0 +1,121 @@ +##------------------------------------------------------------------------------ +## L. Herault +##------------------------------------------------------------------------------ + +## ----------------------------------------------------------------------------- +## Libraries +## ----------------------------------------------------------------------------- + +#Load required packages +suppressMessages(library(monocle)) +suppressMessages(library(scran)) +suppressMessages(library(getopt)) + + + +#load homemade function +source("R_src/data_preparation.R") + + + +# Quality control and preparation of 10X data for analysis + +## ----------------------------------------------------------------------------- +## Command line args +## ----------------------------------------------------------------------------- + +spec = matrix(c( + 'help', 'h', 0, "logical", "Help about the program", + 'inputRDS', 'i', 1, "character", "REQUIRED: 10X data prepared as monocle object (.RDS generated by prepare_data.R).", + "mitochFilter", "m",1, "numeric", "Ribosomal protein gene RNA proportion filtering (eg 0.1 to remove cells with more than 10% of Rbp transcripts", + 'outdir', 'o',1, "character", 'Outdir path (default ./)' +), byrow=TRUE, ncol=5); + +opt = getopt(spec) + +# if help was asked, print a friendly message +# and exit with a non-zero error code + +args <- commandArgs() +if ( !is.null(opt$help) | is.null(opt$inputRDS)) { + cat("Add cell cycle phases to each cells with scran, filter cells of low quality, control percentage mitochondrial transcripts") + cat(getopt(spec, usage=TRUE)) + q(status=1) +} + +if (is.null(opt$outdir)) { + opt$outdir = "./" +} + + +gbm_cds <- readRDS(opt$inputRDS) + + +# print some stuff +# head of the expression matrix +exprs(gbm_cds[c(1:5),c(1:5)]) + +# cells information +head(pData(gbm_cds)) + +# gene information +head(fData(gbm_cds)) + +# Estimating size factor and dispersion not needed anymore +#gbm_cds <- estimateSizeFactors(gbm_cds) +#gbm_cds <- estimateDispersions(gbm_cds) + + +# First getting cell cycle phases with cyclone function from package scrann (see data_preparation.R)) +gbm_cds <- getCellCyclePhases(gbm_cds,outdir = paste(opt$outdir,"/cellCycle",sep ="")) + +# Filtering cells on UMI counts as explained in monocle doc +# print num cells before filtering + +print("Matrix dim before cell filtering :") +dim(pData(gbm_cds)) + +gbm_cds <- filterCells(gbm_cds,paste(opt$outdir,"/cellsFiltering",sep = ""),propMitochFilter = opt$mitochFilter) + +# Filter on mitochondrial RNA proportion if specified +if (!is.null(opt$mitochFilter)) { + valid_cells_Mito <- row.names(subset(pData(gbm_cds), + percentMito < opt$mitochFilter)) + + nonValidCellMito <- length(row.names(pData(gbm_cds)))-length(valid_cells_Mito) + + +} + +# Filter cells with to few expressed genes to assign a cell cycle phases with cyclone +# not the the case with cellranger2 workflow + +valid_cells <- row.names(subset(pData(gbm_cds), + is.na(G2M_score) == FALSE & + is.na(S_score) == FALSE & + is.na(G1_score) == FALSE & + is.na(phases) == FALSE )) + +nonValidCellNum <- length(row.names(pData(gbm_cds)))-length(valid_cells) + +print(paste(nonValidCellNum,"cells were filtered out because they express to few genes to assign a cell cycle phase with cyclone to them. ")) + +# Define final valid cells +if (!is.null(opt$mitochFilter)) { + print(paste(nonValidCellMito,"cells were filtered out because more than", opt$mitochFilter, "of their transcripts are ribosomal protein gene RNA")) + + valid_cells <- valid_cells[valid_cells %in% valid_cells_Mito] + } + + +#cells are row in pData(monocle) but column in monocle object +gbm_cds <- gbm_cds[,valid_cells] + + +#After + +print("Matrix dim after cell filtering :") +dim(pData(gbm_cds)) + + +saveRDS(object = gbm_cds,file = paste(opt$outdir,"/gbm_cds_treated.rds",sep ="")) diff --git a/R_src/prepareDataActinn.R b/R_src/prepareDataActinn.R new file mode 100644 index 0000000000000000000000000000000000000000..3bf79cedf9e0bdeba3a9ef942ca131ec87a38fd5 --- /dev/null +++ b/R_src/prepareDataActinn.R @@ -0,0 +1,64 @@ +## ----------------------------------------------------------------------------- +## Libraries +## ----------------------------------------------------------------------------- +library(Seurat) +library(getopt) + +spec = matrix(c( + 'help', 'h', 0, "logical", "Help about the program", + 'inputRDS', 'i', 1, "character", "REQUIRED: 10X data ordered by monocle (.RDS generated by orderCL.R).", + 'subsetData', 's', 1, "character", "Population to keep, sep by +", + 'inputTrain', 't', 1, "character", "Require .rds object of the training dataset", + 'labelName', 'l', 1, "character", "Name of the label column", + 'dataSlot', 'd', 1, "character", "Name of the data slot to recover : either counts, data or scale.data", + 'outdir', 'o',1, "character", 'Outdir path (default ./)' +), byrow=TRUE, ncol=5); + +opt = getopt(spec) + + +args <- commandArgs() +if ( !is.null(opt$help) | is.null(opt$inputRDS)) { + cat("Gene filtering of a gbm cds ordered. For scenic use, two filters") + cat(getopt(spec, usage=TRUE)) + q(status=1) +} + +#set default arguments values +if(is.null(opt$dataSlot)){ + opt$dataSlot <- "counts" +} + + +if (is.null(opt$outdir)) { + opt$outdir = "./" +} + +# opt$inputRDS <- "/shared/projects/scRNA_HSPC_Aging/scRNA_all_paper_pipeline/output/Seurat3_integration_3_upper_mad_filter_ref_wt/Analysis_new_cluster/annotated_update.rds" +# opt$inputTrain <- "/shared/projects/scRNA_HSPC_Aging/scRNA_all_paper_pipeline/output/RodriguezPreparation/seuratForCastle.rds" +# opt$labelName <- "library_id" + +dir.create(opt$outdir, recursive = T) + +seurat_target <- readRDS(opt$inputRDS) +seurat_training <- readRDS(opt$inputTrain) + +if(!is.null(opt$subsetData)){ + pop_keep <- strsplit(opt$subsetData, split = "\\+")[[1]] + + Idents(seurat_training) <- opt$labelName + seurat_training <- subset(seurat_training, idents = pop_keep) +} + +seurat_training$cellName <- rownames(seurat_training@meta.data) + +# Extract data from the seurat object +data_target <- as.data.frame(GetAssayData(seurat_target, slot = opt$dataSlot)) +data_training <- as.data.frame(GetAssayData(seurat_training, slot = opt$dataSlot)) + +write.table(x = seurat_training@meta.data[,c("cellName", opt$labelName)], file = paste0(opt$outdir, opt$labelName, "_training_label", ".txt"), + sep = "\t", col.names = F, row.names = F, quote = F) + +write.table(x = data_target, file = paste0(opt$outdir, "seurat_data_target", ".csv"), quote = F, sep = ",") +write.table(x = data_training, file = paste0(opt$outdir, opt$labelName, "_seurat_data_training", ".csv"), quote = F, sep = ",") + diff --git a/R_src/prepareNestorowaCL.R b/R_src/prepareNestorowaCL.R new file mode 100644 index 0000000000000000000000000000000000000000..d49078d3a319cef5cd0593d941725f730ed14965 --- /dev/null +++ b/R_src/prepareNestorowaCL.R @@ -0,0 +1,294 @@ +##------------------------------------------------------------------------------ +## L. Herault +##------------------------------------------------------------------------------ + +## ----------------------------------------------------------------------------- +## Libraries +## ----------------------------------------------------------------------------- + +suppressMessages(library(monocle)) +suppressMessages(library(Seurat)) +suppressMessages(library(scran)) +suppressMessages(library(getopt)) +suppressMessages(library(biomaRt)) +suppressMessages(library(gprofiler2)) +suppressMessages(library(stringr)) +suppressMessages(library(scales)) +suppressMessages(library(grid)) + + + + + +#load homemade function +source("R_src/data_preparation.R") + +# R data preparation scripts for Rodiguez-Fraticelli data + +## ----------------------------------------------------------------------------- +## Command line args +## ----------------------------------------------------------------------------- + +spec = matrix(c( + 'help', 'h', 0, "logical", "Help about the program", + 'inputData', 'i', 1, "character", "REQUIRED: downloaded Nestorowa raw data (.RDS generated by prepare_data.R).", + "inputMetaData", "m", 1, "character", "REQUIRED : downloaded Nestorowa meta data", + 'outdir', 'o',1, "character", 'Outdir path (default ./)', + 'minPropCellExp', "p",1,"numeric", "Genes expressed in less than this proportion of cells are discarrded frome the analysis (0.001 by default)" + ), byrow=TRUE, ncol=5); + +opt = getopt(spec) + +# if help was asked, print a friendly message +# and exit with a non-zero error code + +args <- commandArgs() +if ( !is.null(opt$help) | is.null(opt$inputData)) { + cat("Prepare Nestorowa data as source for CaSTLe") + cat(getopt(spec, usage=TRUE)) + q(status=1) +} + +if (is.null(opt$outdir)) { + opt$outdir = "./" +} + +dir.create(opt$outdir,recursive = T,showWarnings = F) + + +if (is.null(opt$minPropCellExp)) { + opt$minPropCellExp <- 0.001 + print(opt$minPropCellExp) +} + +#setwd(opt$outdir) + +#get Seurat3 Nestorowa +rawCountsPath <- opt$inputData + + +rawTable <- read.table(rawCountsPath,sep = "\t",header = T,row.names = 1) +phenoData0 <- read.table(opt$inputMetaData) + +# We kee broad gating discarding MPP_broad column that is sudivided in STHSC, MPP1, MPP2, MPP3 +phenoData <- phenoData0[,which(endsWith(colnames(phenoData0), "broad"))] +phenoData <- phenoData[,which(!startsWith(colnames(phenoData), "MPP_"))] + +# We discard 136 cells with no labels + +phenoData <- phenoData[which(rowSums(phenoData)>0),] + +# We renamed LT- and LT. in phenoData and rawTable rownames to LTHSC +unique(str_split_fixed(colnames(rawTable),"_",2)[,1]) +unique(str_split_fixed(rownames(phenoData),"_",2)[,1]) + +rownames(phenoData)[which(startsWith(rownames(phenoData),"LT-"))] <-gsub("LT-HSC",replacement = "LTHSC",x = rownames(phenoData)[which(startsWith(rownames(phenoData),"LT-"))]) +colnames(rawTable)[which(startsWith(colnames(rawTable),"LT."))] <-gsub("LT.HSC",replacement = "LTHSC",x = colnames(rawTable)[which(startsWith(colnames(rawTable),"LT."))]) + +rawTable <- rawTable[which(startsWith(rownames(rawTable), "ENSMUS")),] + +rawTable <- rawTable[,rownames(phenoData)] + + + + + + +# We revover gene short name from ensembl ID with features table from seurat +conversionTable <- gconvert(rownames(rawTable),organism = "mmusculus",target = "MGI",mthreshold = 1,filter_na = T) + +rawTable_filter <- rawTable[rownames(rawTable) %in% conversionTable$input,] + +# Only needed if ensemble id (it is the case in the workflow) + +dupGeneNames <- conversionTable$target[which(duplicated(conversionTable$target))] + +write.csv(conversionTable[which(duplicated(conversionTable$target)),], + paste(opt$outdir,"/dupGenesName.csv",sep = "")) +print("Dup gene short names existing, making them unique...") +# rownames(rawTable) <- make.unique(conversionTable$target) + +rownames(rawTable_filter) <- make.unique(conversionTable$target) + + + + + + +seurat <- CreateSeuratObject(counts = rawTable_filter,meta.data = phenoData) + + #Use of Rodriguez QC to filter poor qualisty cells To pass quality control, + #cells were required to have at least 200.000 reads mapping to nuclear genes, at least 4000 genes detected, less than 10% of mapped reads mapping to mitochondrial genes, + #and less than 50% of mapped reads mapping to the External RNA Controls Consortium (ERCC) spike-ins (#4456740, Life Technologies) +min(seurat@meta.data$nFeature_RNA,na.rm = T) +grep(x = rownames(seurat),pattern = "^Mt-") +#Filtering seems to be already done + +# seurat[["percentMito"]] <- PercentageFeatureSet(seurat, pattern = "^Mt-") +# seurat <- subset(x = seurat, subset = nFeature_RNA >= 4000 & ) +# seurat <- + +getLab <- function(cell,seurat) { + res = which.max(seurat@meta.data[cell,endsWith(colnames(seurat@meta.data),"_broad")]) + return(names(res)) +} + +seurat@meta.data$library_id <- sapply(X = rownames(seurat@meta.data), FUN = getLab,seurat = seurat,simplify = T) + + +## Switch to Monocle object to more easily discard unexpressed genes + +pd <- new("AnnotatedDataFrame", data = seurat@meta.data) +fd <- new("AnnotatedDataFrame", data = data.frame(gene_short_name = rownames(seurat))) +rownames(fd) <- fd$gene_short_name + + + +monocleAll <- newCellDataSet(GetAssayData(object = seurat, slot = "counts"), + phenoData = pd, + featureData = fd, + lowerDetectionLimit = 0.1, + expressionFamily = negbinomial.size()) + + + +pData(monocleAll)$Total_mRNAs <- Matrix::colSums(exprs(monocleAll)) +monocleAll <- detectGenes(monocleAll, min_expr = 0.1) + +# Filter out genes expressed in too few cells +fData(monocleAll)$use_for_castle <- fData(monocleAll)$num_cells_expressed > opt$minPropCellExp * ncol(monocleAll) + +gbm_to_seurat <- monocleAll[fData(monocleAll)$use_for_castle==T,] + +seurat <- CreateSeuratObject(counts = exprs(gbm_to_seurat), meta.data = pData(gbm_to_seurat)) + +# Need normalized data for castle +seurat <- NormalizeData(object = seurat) + +# Further analysis to highlight some signature +seurat$generalCellType <- "HSPC" +seurat@meta.data[seurat@meta.data$library_id %in% c("MEP_broad", "GMP_broad", "CMP_broad"), "generalCellType"] <- "Prog" + +# Signature for Prog cellType +Idents(seurat) <- "generalCellType" +markers_ProgVSHSPC <- FindMarkers(seurat, ident.1 = "Prog", ident.2 = "HSPC", logfc.threshold = 0.25) +markers_ProgVSHSPC_sig <- markers_ProgVSHSPC[markers_ProgVSHSPC$p_val_adj < 0.05,] +markers_ProgVSHSPC_sig <- markers_ProgVSHSPC_sig[order(markers_ProgVSHSPC_sig$avg_log2FC, decreasing = T),] + +top_markers_ProgVSHSPC_sig <- markers_ProgVSHSPC_sig[rownames(markers_ProgVSHSPC_sig) %in% head(rownames(markers_ProgVSHSPC_sig), 100),] +bot_markers_ProgVSHSPC_sig <- markers_ProgVSHSPC_sig[rownames(markers_ProgVSHSPC_sig) %in% tail(rownames(markers_ProgVSHSPC_sig), 100),] + +# Signature spé for GMP vs other prog +Idents(seurat) <- "library_id" +markers_GMPvsOtherProg <- FindMarkers(seurat, ident.1 = "GMP_broad", ident.2 = c("CMP_broad", "MEP_broad"), logfc.threshold = 0.25) +markers_GMPvsOtherProg_sig <- markers_GMPvsOtherProg[markers_GMPvsOtherProg$p_val_adj < 0.05,] +markers_GMPvsOtherProg_sig <- markers_GMPvsOtherProg_sig[order(markers_GMPvsOtherProg_sig$avg_log2FC, decreasing = T),] + +top_markers_GMPvsOtherProg_sig <- markers_GMPvsOtherProg_sig[rownames(markers_GMPvsOtherProg_sig) %in% head(rownames(markers_GMPvsOtherProg_sig), 100),] +bot_markers_GMPvsOtherProg_sig <- markers_GMPvsOtherProg_sig[rownames(markers_GMPvsOtherProg_sig) %in% tail(rownames(markers_GMPvsOtherProg_sig), 100),] + +# CMP vs others prog +markers_CMPvsOtherProg <- FindMarkers(seurat, ident.1 = "CMP_broad", ident.2 = c("GMP_broad", "MEP_broad"), logfc.threshold = 0.25) +markers_CMPvsOtherProg_sig <- markers_CMPvsOtherProg[markers_CMPvsOtherProg$p_val_adj < 0.05,] +markers_CMPvsOtherProg_sig <- markers_CMPvsOtherProg_sig[order(markers_CMPvsOtherProg_sig$avg_log2FC, decreasing = T),] + +top_markers_CMPvsOtherProg_sig <- markers_CMPvsOtherProg_sig[rownames(markers_CMPvsOtherProg_sig) %in% head(rownames(markers_CMPvsOtherProg_sig), 100),] +bot_markers_CMPvsOtherProg_sig <- markers_CMPvsOtherProg_sig[rownames(markers_CMPvsOtherProg_sig) %in% tail(rownames(markers_CMPvsOtherProg_sig), 100),] + +# MEP vs others prog +markers_MEPvsOtherProg <- FindMarkers(seurat, ident.1 = "MEP_broad", ident.2 = c("GMP_broad", "CMP_broad"), logfc.threshold = 0.25) +markers_MEPvsOtherProg_sig <- markers_MEPvsOtherProg[markers_MEPvsOtherProg$p_val_adj < 0.05,] +markers_MEPvsOtherProg_sig <- markers_MEPvsOtherProg_sig[order(markers_MEPvsOtherProg_sig$avg_log2FC, decreasing = T),] + +top_markers_MEPvsOtherProg_sig <- markers_MEPvsOtherProg_sig[rownames(markers_MEPvsOtherProg_sig) %in% head(rownames(markers_MEPvsOtherProg_sig), 100),] +bot_markers_MEPvsOtherProg_sig <- markers_MEPvsOtherProg_sig[rownames(markers_MEPvsOtherProg_sig) %in% tail(rownames(markers_MEPvsOtherProg_sig), 100),] + +sig_markers <- data.frame(Prog_signature_up = rownames(top_markers_ProgVSHSPC_sig), + Prog_signature_down = rownames(bot_markers_ProgVSHSPC_sig), + GMP_signature_up = rownames(top_markers_GMPvsOtherProg_sig), + GMP_signature_down = rownames(bot_markers_GMPvsOtherProg_sig), + CMP_signature_up = rownames(top_markers_CMPvsOtherProg_sig), + CMP_signature_down = rownames(bot_markers_CMPvsOtherProg_sig), + MEP_signature_up = rownames(top_markers_MEPvsOtherProg_sig), + MEP_signature_down = rownames(bot_markers_MEPvsOtherProg_sig)) + + +write.table(sig_markers, file = paste0(opt$outdir, "/progenitors_signature", ".csv"), + sep = ",", quote = F, row.names = T, col.names = T) + +# # Short seurat analysis without any correction +# seurat <- FindVariableFeatures(object = seurat,selection.method = "vst", nfeatures = 2000, verbose = T) +# seurat <- ScaleData(object = seurat) +# +# seurat <- RunPCA(object = seurat) +# +# +# png(paste(opt$outdir,"/ElbowPlot.png",sep ="")) +# ElbowPlot(object = seurat,ndims = 30) +# dev.off() +# +# +# print("Clustering...") +# +# seurat <- FindNeighbors(object = seurat,dims = c(1:15),k.param = 20) +# seurat <- FindClusters(object = seurat,resolution = c(0.5,0.6,0.7,0.8,0.9,1,1.2)) +# +# print("Running TSNE...") +# +# seurat <- RunTSNE(seurat,dims = c(1:15)) +# +# print("Running UMAP...") +# +# seurat <- RunUMAP(seurat,dims = c(1:15)) +# +# print("UMAP ok") +# print(colnames(seurat@meta.data)) +# +# +# colPrefix <- "RNA_snn_res." +# +# +# umapListRes <- list() +# tsneListRes <- list() +# for (r in c(0.6,0.8,1,1.2)) { +# umapListRes[[as.character(r)]] <- DimPlot(seurat, +# reduction = "umap", +# label = T, +# group.by = paste(colPrefix,r,sep="")) + +# NoLegend() + +# ggtitle(paste("res",r)) +# +# tsneListRes[[as.character(r)]] <- DimPlot(seurat, +# reduction = "tsne", +# label = T, +# group.by = paste(colPrefix,r,sep="")) + +# NoLegend() + +# ggtitle(paste("res",r)) +# } +# +# png(paste(opt$outdir,"/tsne_different_res.png",sep = ""),height = 800,width = 800) +# grid.arrange(tsneListRes[[1]],tsneListRes[[2]],tsneListRes[[3]],tsneListRes[[4]]) +# dev.off() +# +# png(paste(opt$outdir,"/umap_different_res.png",sep = ""),height = 800,width = 800) +# grid.arrange(umapListRes[[1]],umapListRes[[2]],umapListRes[[3]],umapListRes[[4]]) +# dev.off() +# +# +# #Check for unwanted source of variation +# pPhases <- grid.rect(gp=gpar(col="white")) +# +# +# pPred <- DimPlot(seurat,group.by = "library_id") +# +# pUMI <- FeaturePlot(seurat, "Total_mRNAs") +# pMito <- grid.rect(gp=gpar(col="white")) +# +# png(paste(opt$outdir,"/umap_factors.png",sep = ""),height = 800,width = 800) +# grid.arrange(pPhases,pPred,pUMI,pMito) +# dev.off() +# + + + +saveRDS(object = seurat,file = paste(opt$outdir,"/seuratForActinn.rds",sep ="")) diff --git a/R_src/prepareRodriguezCL.R b/R_src/prepareRodriguezCL.R new file mode 100644 index 0000000000000000000000000000000000000000..2b0e7878ef802d043900aa34d29cc7abd7039332 --- /dev/null +++ b/R_src/prepareRodriguezCL.R @@ -0,0 +1,121 @@ +##------------------------------------------------------------------------------ +## L. Herault +##------------------------------------------------------------------------------ + +## ----------------------------------------------------------------------------- +## Libraries +## ----------------------------------------------------------------------------- + +suppressMessages(library(monocle)) +suppressMessages(library(Seurat)) +suppressMessages(library(scran)) +suppressMessages(library(getopt)) + + + +#load homemade function +source("R_src/data_preparation.R") + +# R data preparation scripts for Rodiguez-Fraticelli data + +## ----------------------------------------------------------------------------- +## Command line args +## ----------------------------------------------------------------------------- + +spec = matrix(c( + 'help', 'h', 0, "logical", "Help about the program", + 'inputData', 'i', 1, "character", "REQUIRED: downloaded Rodriguez raw data (.RDS generated by prepare_data.R).", + 'outdir', 'o',1, "character", 'Outdir path (default ./)', + 'minPropCellExp', "p",1,"numeric", "Genes expressed in less than this proportion of cells are discarrded frome the analysis (0.001 by default)" + ), byrow=TRUE, ncol=5); + +opt = getopt(spec) + +# if help was asked, print a friendly message +# and exit with a non-zero error code + +args <- commandArgs() +if ( !is.null(opt$help) | is.null(opt$inputData)) { + cat("Prepare Rodriguez - Fraticelli data as source for CaSTLe") + cat(getopt(spec, usage=TRUE)) + q(status=1) +} + +if (is.null(opt$outdir)) { + opt$outdir = "./" +} + +dir.create(opt$outdir,recursive = T,showWarnings = F) + + +if (is.null(opt$minPropCellExp)) { + opt$minPropCellExp <- 0.001 + print(opt$minPropCellExp) +} + +#setwd(opt$outdir) + +#get Seurat3 Rodriguez +rawCountsPaths <-list.files(opt$inputData,full.names =T) + +#make sure we gate the correct file (no MPP4) +rawCountsPaths <- rawCountsPaths[which(grepl(pattern = 'LTHSC|MPP2|MPP3|STHSC',rawCountsPaths))] + + +getSeuratRodriguez <- function(rawCountsPath) { + + rawTable <- read.csv(rawCountsPath) + + phenoData <- rawTable[,c(1:5)] + rownames(phenoData) <- rawTable$cell_id + data <- rawTable[,c(6:ncol(rawTable))] + rownames(data) <- rawTable$cell_id + seurat <- CreateSeuratObject(counts = t(data),meta.data = phenoData) + + #Use of Rodriguez QC to filter poor qualisty cells + seurat <- subset(x = seurat, subset = pass_filter > 0.1) + print(unique(seurat@meta.data$seq_run_id)) + return(seurat) + +} + +seuratObjects <- lapply(rawCountsPaths,getSeuratRodriguez) + +# We do not take the MPP4 +seuratAll <- merge(x = merge(x = seuratObjects[[1]], y = seuratObjects[[2]]), + y= merge(x = seuratObjects[[3]],y = seuratObjects[[4]])) + +seuratAll@meta.data$library_id <- factor(seuratAll@meta.data$library_id,levels = c("LTHSC","STHSC", "MPP2", "MPP3")) + + +## Switch to Monocle object to more easily discard unexpressed genes + +pd <- new("AnnotatedDataFrame", data = seuratAll@meta.data) +fd <- new("AnnotatedDataFrame", data = data.frame(gene_short_name = rownames(seuratAll))) +rownames(fd) <- fd$gene_short_name + + + +monocleAll <- newCellDataSet(GetAssayData(object = seuratAll, slot = "counts"), + phenoData = pd, + featureData = fd, + lowerDetectionLimit = 0.1, + expressionFamily = negbinomial.size()) + + + +pData(monocleAll)$Total_mRNAs <- Matrix::colSums(exprs(monocleAll)) +monocleAll <- detectGenes(monocleAll, min_expr = 0.1) + +# Filter out genes expressed in too few cells +fData(monocleAll)$use_for_castle <- fData(monocleAll)$num_cells_expressed > opt$minPropCellExp * ncol(monocleAll) + +gbm_to_seurat <- monocleAll[fData(monocleAll)$use_for_castle==T,] + +seurat <- CreateSeuratObject(counts = exprs(gbm_to_seurat), meta.data = pData(gbm_to_seurat)) + +# Need normalized data for castle +seurat <- NormalizeData(object = seurat) + + +saveRDS(object = seurat,file = paste(opt$outdir,"/seuratForCastle.rds",sep ="")) diff --git a/R_src/prepareSTREAManalysisCL.R b/R_src/prepareSTREAManalysisCL.R new file mode 100644 index 0000000000000000000000000000000000000000..5864b180b830abe3a41b51100215d683be436e06 --- /dev/null +++ b/R_src/prepareSTREAManalysisCL.R @@ -0,0 +1,105 @@ +## ----------------------------------------------------------------------------- +## Libraries +## ----------------------------------------------------------------------------- + +suppressMessages(library(Seurat)) +suppressMessages(library(RColorBrewer)) +suppressMessages(library(readxl)) +suppressMessages(library(stringr)) +suppressMessages(library(getopt)) +suppressMessages(library(scales)) + +## ----------------------------------------------------------------------------- +## Command line args +## ----------------------------------------------------------------------------- + +spec = matrix(c( + 'help', 'h', 0, "logical", "Help about the program", + 'inputRDS', 'i', 1, "character", "REQUIRED : 10X data prepared as monocle or seurat object.", + 'outdir', 'o',1, "character", 'Outdir path (default ./)', + "prefix", "p", 1, "character", "Prefix for the output file", + "downSample", "d", 1,"numeric", "subsample the dataset with the given number of cells", + "removeCluster", "r", 1, "character", "remove cell for the given cluster (need allData)", + "subsetTreatment", "s", 1, "character", "subset cell for the given treatment (need allData)" +), byrow=TRUE, ncol=5) + +opt = getopt(spec) + +args <- commandArgs() +if ( !is.null(opt$help) | is.null(opt$inputRDS)) { + cat("Prepare data for trajectory inference with stream") + cat(getopt(spec, usage=TRUE)) + q(status=1) +} + +if (is.null(opt$outdir)) { + opt$outdir = "./" +} + + +dir.create(opt$outdir, recursive = T) + +prefix <- opt$prefix + +print(prefix) + +## --------------------------------------------------------------------- +## Color scheme +## --------------------------------------------------------------------- + +colorTreatment <- c("#664CFF", "#FF8000") +colorCluster <- c("#E69F00", "#CC79A7", "#0072B2", "#009E73", "#D55E00", "#56B4E9") +colorPhases <- c(brewer.pal(9,"RdPu"))[c(3,6,9)] +# colorBranches <- c("#332288", "#117733", "#DDCC77", "#AA4499", "#88CCEE") + +seurat <- readRDS(opt$inputRDS) +seurat$cellName <- rownames(seurat@meta.data) +#Set integrated slot as active assay +DefaultAssay(seurat) <- "integrated" + +colorClusters <- cbind(levels(seurat$FinalCluster), + colorCluster) + +if (!is.null(opt$removeCluster)) { + seurat = subset(seurat, idents = opt$removeCluster,invert = TRUE) + colorClusters = colorClusters[!colorClusters[,1] %in% opt$removeCluster,] +} + + +conditionColors <- cbind(c("Ctrl","RA"),colorTreatment) + +phasesColors <- cbind(levels(seurat$phases), colorPhases) + +if (!is.null(opt$subsetTreatment)) { + Idents(seurat) <- "condition" + + seurat = subset(seurat, idents = opt$subsetTreatment) + conditionColors = conditionColors[which(conditionColors[,1]!=opt$subsetTreatment),] + Idents(seurat) <- "FinalCluster" + +} + +if (!is.null(opt$downSample)) { + seurat = subset(seurat, cells = sample(Cells(seurat), opt$downSample),random.seed = 2020) +} + +# Extract integrated matrix +data <- GetAssayData(object = seurat, slot = "scale.data") +data <- data[,rownames(seurat@meta.data)] + +write.table(data,paste(opt$outdir,"/",prefix,"_scaleDataForStream.tsv",sep = ""),quote = FALSE, sep = "\t",col.names = T) + +# Write cluster annotation for each cells +write.table(seurat@meta.data[,c("cellName", "FinalCluster")],paste(opt$outdir,"/",prefix,"_seuratClusters.tsv",sep = ""),quote = FALSE,sep = "\t",col.names = FALSE,row.names = F) +# Write color associated to each cluster +write.table(colorClusters,paste(opt$outdir,"/",prefix,"_colorClusters.tsv",sep =""),quote = FALSE,sep = "\t",col.names = FALSE,row.names = F) + +# Write condition annot +write.table(seurat$condition,paste(opt$outdir,"/",prefix,"_treatment.tsv",sep =""),quote = FALSE,sep = "\t",col.names = FALSE) +# Write color associated to the condition +write.table(conditionColors,paste(opt$outdir,"/",prefix,"_colorTreatment.tsv",sep =""),quote = FALSE,sep = "\t",col.names = FALSE,row.names = F) + +# Write phases annot +write.table(seurat$phases,paste(opt$outdir,"/",prefix,"_cellCyclePhases.tsv",sep =""),quote = FALSE,sep = "\t",col.names = FALSE) +# Write color assosciated to phases +write.table(phasesColors,paste(opt$outdir,"/",prefix,"_colorCellCyclePhases.tsv",sep =""),quote = FALSE,sep = "\t",col.names = FALSE,row.names = F) diff --git a/R_src/prepareSeuratCL.R b/R_src/prepareSeuratCL.R new file mode 100644 index 0000000000000000000000000000000000000000..f17cc4d8cbd9f2efa1072d1e0fb6d62d4873c017 --- /dev/null +++ b/R_src/prepareSeuratCL.R @@ -0,0 +1,188 @@ +##------------------------------------------------------------------------------ +## L. Herault +##------------------------------------------------------------------------------ + +## ----------------------------------------------------------------------------- +## Libraries +## ----------------------------------------------------------------------------- + +#Load required packages +suppressMessages(library(Seurat)) +suppressMessages(library(scran)) +suppressMessages(library(getopt)) +suppressMessages(library(grid)) +suppressMessages(library(gridExtra)) +suppressMessages(library(scales)) +suppressMessages(library(ggplot2)) + + + +#load homemade function +source("R_src/data_preparation.R") + + + +# Quality control and preparation of 10X data for analysis + +## ----------------------------------------------------------------------------- +## Command line args +## ----------------------------------------------------------------------------- + +spec = matrix(c( + 'help', 'h', 0, "logical", "Help about the program", + 'inputRDS', 'i', 1, "character", "REQUIRED: 10X data prepared as a list of seurat object and feature data table (.RDS generated by prepare_data.R).", + "mitochFilter", "m",1, "numeric", "mitochondrial protein gene RNA percentage filtering (eg 10 to remove cells with more than 10% of mitoch protein transcripts, default none)", + 'outdir', 'o',1, "character", 'Outdir path (default ./)', + 'upperMad', 'u', 1, "numeric", "cells with more than u mads log counts will be discard (default 2)", + "lowerMad", 'l', 1, "numeric", "cells with less than l mads log counts will be discard (default 2)" +), byrow=TRUE, ncol=5); + +opt = getopt(spec) + +# if help was asked, print a friendly message +# and exit with a non-zero error code + +args <- commandArgs() +if ( !is.null(opt$help) | is.null(opt$inputRDS)) { + cat("Add cell cycle phases to each cells with scran, filter cells of low quality, control percentage mitochondrial transcripts, ribosomal protein transcripts") + cat(getopt(spec, usage=TRUE)) + q(status=1) +} + +if (is.null(opt$outdir)) { + opt$outdir = "./" +} + +if (is.null(opt$mitochFilter)) { + opt$mitochFilter <- 100 +} + +if (is.null(opt$mitochFilter)) { + opt$upperMad <- 2 +} + +if (is.null(opt$mitochFilter)) { + opt$lowerMad <- 2 +} + + +input <- readRDS(opt$inputRDS) +seurat <- input$seurat +fd <- input$featureData +# print some stuff + +# Estimating size factor and dispersion not needed anymore +#gbm_cds <- estimateSizeFactors(gbm_cds) +#gbm_cds <- estimateDispersions(gbm_cds) + + +# First getting cell cycle phases with cyclone function from package scran (see data_preparation.R)) +seurat <- getCellCyclePhasesSeurat(seurat,outdir = paste(opt$outdir,"/cellCycle",sep ="")) + +# Change ensembl gene names in the seurat object ot gene short names to be able to use PercentageFeatureSet Seurat function for mt and rb transcript proportion +#check for dup genes + +# Only needed if ensemble id (it is the case in the workflow) + +dupGeneNames <- fd[which((duplicated(fd$gene_short_name))),"gene_short_name"] + +if(length(dupGeneNames) == 0) { + rownames(seurat) <- fData(gbm_to_seurat)$gene_short_name + +} else { + #write.csv(fd[which(is.element(fd$gene_short_name,dupGeneNames)),], + # paste(opt$outdir,"/dupGenesName.csv",sep = "") + #) + print("Dup gene short names existing, making them unique...") + newSeuratCount <- as.matrix(GetAssayData(seurat,slot = 'counts',assay = "RNA")) + rownames(newSeuratCount) <- make.unique(fd$gene_short_name, sep = "--") + seurat <- CreateSeuratObject(counts = newSeuratCount, + assay = "RNA", + meta.data = seurat@meta.data + ) + +} + +# Compute percent Rb and percent mt + +seurat$percentMito <- PercentageFeatureSet( + seurat, + pattern = 'mt-', + assay = 'RNA' +) + + +seurat$percentRibo <- PercentageFeatureSet( + seurat, + pattern = 'Rps|Rpl|Mrp', #it might be better to have the exact gene set through GO... + assay = 'RNA' +) + +# Filtering cells outside two MADs from median +# print num cells before filtering + +print("Matrix dim before cell filtering :") +dim(seurat) + +# plot qc metrics before filtering +png(paste(opt$outdir,"/qcMetricsRaw.png",sep = "")) +VlnPlot(seurat, features = c("nFeature_RNA", "nCount_RNA", "percentMito","percentRibo"), ncol = 3,pt.size = 0.001) +dev.off() + +# plot some correlation between qc metrics +plot1 <- FeatureScatter(seurat, feature1 = "nCount_RNA", feature2 = "percentMito") +plot2 <- FeatureScatter(seurat, feature1 = "nCount_RNA", feature2 = "nFeature_RNA") + +png(paste(opt$outdir,"/qcMetricsCr.png",sep = "")) +grid.arrange(plot1,plot2) +dev.off() + + + + +minCountThreshold <- exp(median(log(seurat@meta.data$nCount_RNA)) - opt$lowerMad*mad(log(seurat@meta.data$nCount_RNA))) +maxCountThreshold <- exp(median(log(seurat@meta.data$nCount_RNA)) + opt$upperMad*mad(log(seurat@meta.data$nCount_RNA))) + + +png(paste(opt$outdir,"/cutOffLogCount.png",sep = "")) +VlnPlot(seurat, features = c( "nCount_RNA"),pt.size = 0.001) + + geom_hline(yintercept = minCountThreshold) + + geom_hline(yintercept = maxCountThreshold) +dev.off() + + + +seurat <- subset(seurat, subset = nCount_RNA > minCountThreshold & + nCount_RNA < maxCountThreshold & + percentMito < opt$mitochFilter) + +#Check if some cells don't have an assinged cell cycles phases +table(!is.na(seurat@meta.data$phases)) +png(paste(opt$outdir,"/assignedPhases.png",sep = "")) +barplot(table(!is.na(seurat@meta.data$phases)),main="Phases assigned") +dev.off() + +if("FALSE" %in% names(table(!is.na(seurat@meta.data$phases)))) { + print("Discard cells with no cell cycle phases assinged (too few genes") + print(table(!is.na(seurat@meta.data$phases))[1]) + seurat@meta.data$phases[is.na(seurat@meta.data$phases)] <- "noAssigned" + Idents(seurat) <- "phases" + seurat <- subset(seurat, idents = "noAssigned",invert = T) + Idents(seurat) <- "orig.ident" + +} + +png(paste(opt$outdir,"/ccPhases.png",sep = "")) +barplot(table(seurat@meta.data$phases),main="Phases") +dev.off() + +# plot qc metrics before filtering +png(paste(opt$outdir,"/qcMetricsFiltered.png",sep = "")) +VlnPlot(seurat, features = c("nFeature_RNA", "nCount_RNA", "percentMito","percentRibo"), ncol = 3,pt.size = 0.001) +dev.off() + +print("Matrix dim after cell filtering :") +print(dim(seurat)) + + +saveRDS(object = seurat,file = paste(opt$outdir,"/seurat_treated.rds",sep ="")) diff --git a/R_src/regulonMarkerCL.R b/R_src/regulonMarkerCL.R new file mode 100644 index 0000000000000000000000000000000000000000..e42cac61936c2535000cdead69bd0010fe25443a --- /dev/null +++ b/R_src/regulonMarkerCL.R @@ -0,0 +1,379 @@ +##------------------------------------------------------------------------------ +## L. Herault +##------------------------------------------------------------------------------ + +## ----------------------------------------------------------------------------- +## Libraries +## ----------------------------------------------------------------------------- + +suppressMessages(library(getopt)) +suppressMessages(library(stringr)) +suppressMessages(library(plyr)) +suppressMessages(library(reshape2)) +suppressMessages(library(Seurat)) +suppressMessages(library(ggplot2)) + + + +source("R_src/computeDiffFun.R") + + +# Analysis of regulon markers of Seurat cluster + +## ----------------------------------------------------------------------------- +## Command line args +## ----------------------------------------------------------------------------- + +spec = matrix(c( + 'help', 'h', 0, "logical", "Help about the program", + "inputSeurat", "i",1, "character", "input seurat object with cluster column names numclust", + "inputMonocle", "m",1, "character", "input monocle object with state column named State", + "regulonScore", "r",1, "character", "input regulon score matrix (AUCell)", + 'outdir', 'o',1, "character", 'Outdir path (default ./)', + "tfNode", 't',1, "character", "path to TF.txt to analyse in details", + "cellCycle", 'c', 0, "logical", "Add cell cycle genes in the heatmap RNA analysis", + "scoreDiff", "s", 1, "numeric", "Score differences threshold to select the markers", + "conditions", "a", 1, "character", "name of meta cols, separated by +, (eg: AGE+Genotype) for a condition diff test per clusters", + "groupingVar", "g", 1, "character", "grouping variable (eg sequencing platform) for condition test to discard batch effect" +), byrow=TRUE, ncol=5); + + + +opt = getopt(spec) + + +# if help was asked, print a friendly message +# and exit with a non-zero error code + +args <- commandArgs() +if ( !is.null(opt$help) | is.null(opt$inputSeurat)) { + cat("Create influence graph from regulon table") + cat(getopt(spec, usage=TRUE)) + q(status=1) +} + + +# For testing +# setwd("/shared/projects/scRNA_HSPC_Aging/scRNA_infer/") +# selectedTF <- c("Junb","Stat1","Irf1","Irf9","Myc","Gata2","Gata1","Spi1","Bclaf1","Cebpa","Tal1","Ikzf1","Fli1","Klf1","Zfpm1") +# write.table(selectedTF,"input/selectedTF.txt",sep = "\t",row.names = F,col.names = F) +# +# opt <- list() +# opt$outdir <- "output/regulonAnalysis/" +# opt$inputSeurat <- "../herault_et_al/scHSC_herault/report/seurat_report.rds" +# opt$inputMonocle <- "../herault_et_al/scHSC_herault/report/monocle_report.rds" +# opt$tfNode <- "input/selectedTF.txt" +# opt$regulonScore <- "output/ScenicRNA_multipleRuns/AUCell_maskDropouts_save/regulons_enrichment.csv" +# opt$scoreDiff <- 0.002 +# opt$cellCycle <- T + +# set deafault arg +if (is.null(opt$scoreDiff)) { + opt$scoreDiff <- 0.002 +} + +if(is.null(opt$outdir)) { + opt$outdir <- "./" +} + +if(is.null(opt$cellCycle)) { + opt$cellCycle <- F +} + +selectedTF <- as.character(read.table(opt$tfNode)$V1) + + +# Printing option before running + +for (o in names(opt)) { + print(paste(o,":", opt[[o]])) +} + +# loading R objects + +seurat <- readRDS(opt$inputSeurat) +#monocle <- readRDS(opt$inputMonocle) + +# Loading regulon score table +regulonTable <- read.csv(opt$regulonScore,row.names = 1,check.names = F) + +# For the moment only analyse postive regulons + +regulonTable <- regulonTable[,colnames(regulonTable)[which(endsWith(colnames(regulonTable),suffix = "+)"))]] + +# Adding regulon scores to metadata + +# store the data in a new assay in the seurat object of ordered cells +auCell_data <- t(regulonTable) + +seurat[["AUCell"]] <- CreateAssayObject(data=auCell_data) + + +#find cluster regulon markers +Idents(seurat) <- "numclust" + +#Set AUCell slot +DefaultAssay(seurat) <- "AUCell" + + +clusterRegulon <- FindAllMarkers(seurat, + only.pos = T, + logfc.threshold= 0, + pseudocount.use = 1, + min.pct = 0.1) + +# Compute true mean difference in score because Seurat comput only logFC +# The featureDiff functions is loaded from ../R_src/computeDiffFun.R file + +clusterRegulon$avg_diff <- NA + +for (rm in rownames(clusterRegulon)) { + feature <- clusterRegulon[rm,"gene"] + cells.1 <- colnames(seurat)[which(seurat$numclust == clusterRegulon[rm,"cluster"])] + cells.2 <- colnames(seurat)[which(seurat$numclust != clusterRegulon[rm,"cluster"])] + clusterRegulon[rm,"avg_diff"] <- featureDiff(seurat,cells.1,cells.2,feature) +} + +# cut off on p adjusted value +clusterRegulon <- clusterRegulon[which(clusterRegulon$p_val_adj < 0.05 & abs(clusterRegulon$avg_diff) > opt$scoreDiff),] + + +# rename some columns +colnames(clusterRegulon) <- c("p_val","avg_logFC","pct.1","pct.2","p_val_adj","cluster","regulon","avg_diff") + +# order columns +clusterRegulon <- clusterRegulon[,c("p_val","pct.1","pct.2","p_val_adj","cluster","regulon","avg_diff")] + +# Average heatmap and binarization per clust + + +DefaultAssay(seurat) <- "AUCell" + +selectedTF <- selectedTF[paste0(c(selectedTF),"(+)")%in% rownames(seurat)] + + +avgClustAuc <- AverageExpression( + seurat, + #slot = "data", + assays = "AUCell", + features = paste0(c(selectedTF),"(+)"), + return.seurat = T +) + +png(paste(opt$outdir,"/heatmapAvgAucell.png",sep =""),height = 800,width = 800) +DoHeatmap(avgClustAuc,features = paste0(selectedTF,"(+)")) +dev.off() + +binarizeScaledMatrix <- function(seuratAvg, assay = "RNA") { + selectedTFMatrix <- GetAssayData(seuratAvg,assay = assay,slot = "scale.data") + selectedTFMatrixBin <- selectedTFMatrix + selectedTFMatrixBin[] <- NA + selectedTFMatrixBin[selectedTFMatrix < -1] <- 0 + selectedTFMatrixBin[selectedTFMatrix > 1] <- 1 + return(selectedTFMatrixBin) +} + +selectedTFMatrixBin <- binarizeScaledMatrix(avgClustAuc,assay = "AUCell") + + +write.table(selectedTFMatrixBin,paste(opt$outdir,"/BinAvgClustAucell.tsv",sep =""),sep = "\t") + +# same with RNA level +DefaultAssay(seurat) <- "RNA" + +selectedTF <- as.character(read.table(opt$tfNode)$V1) + +selectedTF <- selectedTF[selectedTF %in% rownames(seurat)] + + +CDK46CycDGenes <- c("Cdk4","Cdk6","Ccnd1","Ccnd2","Ccnd3") +CIPKIPGenes <- c("Cdkn1b","Cdkn1a","Cdkn1c") +INK4Genes <-c("Cdkn2a","Cdkn2b","Cdkn2c","Cdkn2d") + +cellCycleGenes <- c(CDK46CycDGenes ,CIPKIPGenes,INK4Genes) + +cellCycleGenes <- cellCycleGenes[cellCycleGenes%in% rownames(seurat)] + +if (opt$cellCycle) { + avgClustRNA <- AverageExpression( + seurat, + #slot = "data", + assays = "RNA", + features = c(selectedTF,cellCycleGenes), + return.seurat = T + ) + + avgClustRNACC <- data.frame(GetAssayData(avgClustRNA,assay = "RNA",slot = "data")) + CIPKIP <- colSums(avgClustRNACC[CIPKIPGenes[CIPKIPGenes %in% rownames(avgClustRNACC)],]) + INK4 <- colSums(avgClustRNACC[INK4Genes[INK4Genes %in% rownames(avgClustRNACC)],]) + CDK46CycD <- colSums(avgClustRNACC[CDK46CycDGenes[CDK46CycDGenes %in% rownames(avgClustRNACC)],]) + avgClustRNACC <- rbind(avgClustRNACC,CIPKIP=CIPKIP) + avgClustRNACC <- rbind(avgClustRNACC,INK4=INK4) + avgClustRNACC <- rbind(avgClustRNACC,CDK46CycD=CDK46CycD) + avgClustRNACC <- avgClustRNACC[c(selectedTF,c("CIPKIP","INK4","CDK46CycD")),] + + avgClustRNACC_seurat <- CreateSeuratObject(counts = avgClustRNACC) + avgClustRNACC_seurat <- ScaleData(avgClustRNACC_seurat) + Idents(avgClustRNACC_seurat) <- colnames(avgClustRNACC_seurat) + + + png(paste(opt$outdir,"/heatmapAvgRNA.png",sep =""),height = 800,width = 800) + DoHeatmap(avgClustRNACC_seurat,features = c(selectedTF,c("CIPKIP","INK4","CDK46CycD"))) + dev.off() + + selectedTFMatrixBinRNA <- binarizeScaledMatrix(avgClustRNACC_seurat,assay = "RNA") + write.table(selectedTFMatrixBinRNA,paste(opt$outdir,"/BinAvgClustRNA.tsv",sep =""),sep = "\t") + + + +} else { + png(paste(opt$outdir,"/heatmapAvgRNA.png",sep =""),height = 800,width = 800) + DoHeatmap(avgClustRNA,features = selectedTF) + dev.off() + + selectedTFMatrixBinRNA <- binarizeScaledMatrix(avgClustRNA,assay = "RNA") + write.table(selectedTFMatrixBinRNA,paste(opt$outdir,"/BinAvgClustRNA.tsv",sep =""),sep = "\t") + +} + +rownames(selectedTFMatrixBin) <- selectedTF[paste0(selectedTF,"(+)") %in% rownames(selectedTFMatrixBin)] +selectedTFMatrixBin <- rbind(selectedTFMatrixBin,selectedTFMatrixBinRNA[c(selectedTF[!selectedTF %in% rownames(selectedTFMatrixBin)],"CIPKIP","INK4","CDK46CycD"),]) +consensusMatrixBin <- selectedTFMatrixBin +consensusMatrixBin[] <- NA +consensusMatrixBin[which(selectedTFMatrixBin == selectedTFMatrixBinRNA)] <- selectedTFMatrixBin[which(selectedTFMatrixBin == selectedTFMatrixBinRNA)] + +consensusMatrixBinHm <- melt(consensusMatrixBin) + +write.table(consensusMatrixBin,paste(opt$outdir,"/binMatConsensus.tsv",sep =""),sep = "\t") + +png(paste(opt$outdir,"/heatmapConsensus.png",sep =""),height = 800,width = 800) +ggplot(consensusMatrixBinHm,aes(x=Var2, y=Var1, fill=value)) + scale_fill_gradient(low="blue",high = "red") + + geom_tile() +dev.off() + +########################################################################################################################### + +if (!is.null(opt$conditions)) { + conditions <- strsplit(opt$conditions,split = "\\+")[[1]] + for (c in conditions) { + ## only implemented for a meta col AGE for the moment + if (!is.null(opt$groupingVar)) { + DefaultAssay(seurat) <- "AUCell" + + + + regulonDiffPerClust <- lapply(levels(seurat$numclust), + FindMarkerPerClustGroupVar, + seurat, + condition =c, + grouping.var = "platform", + identCol = "numclust", + test.use = "wilcox", + keepDiverging= T, + logfc.threshold = 0, + pseudocount.use = 1, + min.pct = 0.1, + filterOnPadj = F, + computeTrueDiff = T) + + + names(regulonDiffPerClust) <- levels(seurat$numclust) + + regulonDiffPerClustTable <- regulonDiffPerClust[[1]] + for (s in levels(seurat$numclust)[-1]) { + regulonDiffPerClustTable <- rbind(regulonDiffPerClustTable,regulonDiffPerClust[[s]]) + } + + dim(regulonDiffPerClustTable) + + table(regulonDiffPerClustTable$Cluster) + + #Exclude AUCell score differences with a combined pval < 0.05 + regulonDiffPerClustTable <- regulonDiffPerClustTable[which(regulonDiffPerClustTable$minimump_p_val < 0.05),] + + dim(regulonDiffPerClustTable) + + + + + dim(regulonDiffPerClustTable) + + #Exclude AUCell score differences with an opposite sign of variation (pval set to NA by FindAgingMarkers) + regulonDiffPerClustTable <- na.exclude(regulonDiffPerClustTable) + + dim(regulonDiffPerClustTable) + + + + colnames(regulonDiffPerClustTable)[which(colnames(regulonDiffPerClustTable) == "Cluster")] <- "group" + colnames(regulonDiffPerClustTable)[which(colnames(regulonDiffPerClustTable) == "Gene")] <- "regulon" + + + + regulonDiffPerClustTable <- regulonDiffPerClustTable[,c("regulon","numclust","group","minimump_p_val","max_pval","min_avg_diff","A_p_val","avg_diff_A","A_pct.1","A_pct.2","B_p_val","avg_diff_B","B_pct.1","B_pct.2")] + + + + + # Filter on average differences before writing final table + regulonDiffPerClustTable <- regulonDiffPerClustTable[which(abs(regulonDiffPerClustTable$min_avg_diff) > opt$scoreDiff),] + + write.table(regulonDiffPerClustTable,paste(opt$outdir,"/",c,"DiffRegulonTable.tsv",sep =""),sep = "\t") + } else { + + DefaultAssay(seurat) <- "AUCell" + + regulonDiffPerClust <- lapply(levels(seurat$numclust), + FindMarkerPerClust, + seurat, + condition = c, + identCol = "numclust", + test.use = "wilcox", + logfc.threshold = 0, + pseudocount.use = 1, + min.pct = 0.1, + computeTrueDiff = T) + + names(regulonDiffPerClust) <- levels(seurat$numclust) + + regulonDiffPerClustTable <- regulonDiffPerClust[[1]] + for (s in levels(seurat$numclust)[-1]) { + regulonDiffPerClustTable <- rbind(regulonDiffPerClustTable,regulonDiffPerClust[[s]]) + } + + dim(regulonDiffPerClustTable) + + table(regulonDiffPerClustTable$Cluster) + + #Exclude AUCell score differences with an adjusted pval < 0.05 + regulonDiffPerClustTable <- regulonDiffPerClustTable[which(regulonDiffPerClustTable$p_val_adj < 0.05),] + + dim(regulonDiffPerClustTable) + + colnames(regulonDiffPerClustTable)[which(colnames(regulonDiffPerClustTable) == "Cluster")] <- "group" + colnames(regulonDiffPerClustTable)[which(colnames(regulonDiffPerClustTable) == "Gene")] <- "regulon" + + + + regulonDiffPerClustTable <- regulonDiffPerClustTable[,c("regulon","numclust","group","p_val_adj","p_val","avg_diff","avg_logFC","pct.1","pct.2")] + + + + + # Filter on average differences before writing final table + regulonDiffPerClustTable <- regulonDiffPerClustTable[which(abs(regulonDiffPerClustTable$avg_diff) > opt$scoreDiff),] + + + + } + write.table(regulonDiffPerClustTable,paste(opt$outdir,"/",c,"DiffRegulonTable.tsv",sep =""),sep = "\t",row.names = F) + } +} + + +saveRDS(seurat,paste0(opt$outdir,"/seuratAUC.rds")) + +write.table(clusterRegulon,paste(opt$outdir,"/clusterMarkerRegulonTable.tsv",sep =""),sep = "\t",row.names = F) + + + diff --git a/R_src/seurat4AtacRnaCL.R b/R_src/seurat4AtacRnaCL.R new file mode 100644 index 0000000000000000000000000000000000000000..67bc200c25ba7ae136541d8f5590d0bff07bd83b --- /dev/null +++ b/R_src/seurat4AtacRnaCL.R @@ -0,0 +1,84 @@ + +CorPCPlot <- function(object, assay = NULL,reduction = "lsi", n = 10, metaCol, ...) +{ + #assay <- SetIfNull(x = assay, y = DefaultAssay(object = object)) + dr <- object[[reduction]] + embed <- Embeddings(object = dr) + counts <- object[[metaCol]] + embed <- embed[rownames(x = counts), ] + #n <- SetIfNull(x = n, y = ncol(x = embed)) + embed <- embed[, seq_len(length.out = n)] + depth.cor <- as.data.frame(cor(x = embed, y = counts)) + depth.cor$counts <- depth.cor[, 1] + depth.cor$Component <- seq_len(length.out = nrow(x = depth.cor)) + p <- ggplot(depth.cor, aes(Component, counts)) + geom_point() + + scale_x_continuous(n.breaks = n, limits = c(1, n)) + + ylab("Correlation") + ylim(c(-1, 1)) + theme_light() + + ggtitle("Correlation between depth and reduced dimension components", + subtitle = paste0("Assay: ", assay, "\t", "Reduction: ", + reduction)) + return(p) +} + + +rna <- readRDS("output/RNA/PLZF_RARA_CT/Seurat4/seurat.rds") +atac <- readRDS("output/ATAC/smp/Ctrl/atac.rds") +DimPlot(atac) + + +##Data preprocessing + +## Here, we process the gene activity matrix in order to find anchors between cells in the scATAC-seq dataset and the scRNA-seq dataset. + +DefaultAssay(atac) <- "activities" +atac <- FindVariableFeatures(atac) +atac <- NormalizeData(atac) +atac <- ScaleData(atac) + +transfer.anchors <- FindTransferAnchors(reference = rna, query = atac, features = VariableFeatures(object = rna), + reference.assay = "RNA", query.assay = "activities", reduction = "cca") + +## To improve discard first integratedLSI ? +celltype.predictions <- TransferData(anchorset = transfer.anchors, refdata = rna$numclust, dims = 1:30, + weight.reduction = atac[["lsi"]]) +atac <- AddMetaData(atac, metadata = celltype.predictions) + +hist(atac$prediction.score.max) +abline(v = 0.5, col = "red") + +atac.filtered <- subset(atac, subset = prediction.score.max > 0.5) +atac.filtered$predicted.id <- factor(atac.filtered$predicted.id, levels = levels(rna)) # to make the colors match +p1 <- DimPlot(atac.filtered, group.by = "predicted.id", label = TRUE, repel = TRUE) + ggtitle("scATAC-seq cells") + + NoLegend() + scale_colour_hue(drop = FALSE) +p2 <- DimPlot(rna, group.by = "numclust", label = TRUE, repel = TRUE) + ggtitle("scRNA-seq cells") + + NoLegend() +p1 + p2 + + +# note that we restrict the imputation to variable genes from scRNA-seq, but could impute the +# full transcriptome if we wanted to +genes.use <- VariableFeatures(rna) +refdata <- GetAssayData(rna, assay = "RNA", slot = "data")[genes.use, ] + +# refdata (input) contains a scRNA-seq expression matrix for the scRNA-seq cells. imputation +# (output) will contain an imputed scRNA-seq matrix for each of the ATAC cells +## To improve discard first integratedLSI ? +imputation <- TransferData(anchorset = transfer.anchors, refdata = refdata, weight.reduction = atac[["lsi"]]) + +# this line adds the imputed data matrix to the atac object +atac[["RNA"]] <- imputation +coembed <- merge(x = rna, y = atac) + +# Finally, we run PCA and UMAP on this combined object, to visualize the co-embedding of both +# datasets +coembed <- ScaleData(coembed, features = genes.use, do.scale = FALSE) +coembed <- RunPCA(coembed, features = genes.use, verbose = FALSE) +coembed <- RunUMAP(coembed, dims = 1:30) +coembed$celltype <- ifelse(!is.na(coembed$numclust), coembed$numclust, coembed$predicted.id) + + +p1 <- DimPlot(coembed, group.by = "sampleName") +p2 <- DimPlot(coembed, group.by = "celltype", label = TRUE, repel = TRUE) +p1 + p2 + + diff --git a/R_src/seuratAnalysisCL.R b/R_src/seuratAnalysisCL.R new file mode 100644 index 0000000000000000000000000000000000000000..0e7645c45f877f73dcb80611dbfeb13e35ce5761 --- /dev/null +++ b/R_src/seuratAnalysisCL.R @@ -0,0 +1,388 @@ +### Supplemental analysis and cluster annotation + + +## ----------------------------------------------------------------------------- +## Libraries +## ----------------------------------------------------------------------------- + +suppressMessages(library(Seurat)) +suppressMessages(library(ggplot2)) +library(plyr) +library(cowplot) +library(ggplot2) +library(RColorBrewer) +library(scales) +library(gProfileR) +library(biomaRt) +library(getopt) + +## ----------------------------------------------------------------------------- +## Load function +## ----------------------------------------------------------------------------- + +source("R_src/funForSeuratAnalysis.R") + +## ----------------------------------------------------------------------------- +## Command line args +## ----------------------------------------------------------------------------- + +spec = matrix(c( + 'help', 'h', 0, "logical", "Help about the program", + "inputSeurat", "i",1, "character", "input seurat object with cluster column names numclust", + 'outdir', 'o',1, "character", 'Outdir path (default ./)', + "resolution", "r", 1, "character", "clustering resolution to use", + "SigList", "s", 1, "character", "Path to the directory with all the signatures (.txt)", + "clustName", "c", 1, "character", "List of cluster name sep by +", + "logfc_threshold", "f", 1, "numeric", "Set a logFC threshold", + "pval", "p", 1, "numeric", "Set a pval adj threshold", + "assay", "a", 1, "character", "Chose default assay" +), byrow=TRUE, ncol=5); + + + +opt = getopt(spec) + +# if help was asked, print a friendly message +# and exit with a non-zero error code + +args <- commandArgs() +if ( !is.null(opt$help) | is.null(opt$inputSeurat)) { + cat("Create influence graph from regulon table") + cat(getopt(spec, usage=TRUE)) + q(status=1) +} + +## --------------------------------------------------------------------- +## Read inputs +## --------------------------------------------------------------------- + +# # Testing opt +# opt$inputSeurat <- "/shared/projects/scRNA_HSPC_Aging/GMP_PRARA/output/RNA/Seurat4_integration/combined.rds" +# opt$outdir <- "/shared/projects/scRNA_HSPC_Aging/GMP_PRARA/output/RNA/Seurat4_integration/Analysis/" +# opt$resolution <- "integrated_snn_res.0.3" +# opt$assay <- "RNA" +# opt$SigList <- "/shared/projects/scRNA_HSPC_Aging/GMP_PRARA/database/Signature/" +# opt$clustName <- "Neu2+NeuRA1+Rep+NeuRA2+Neu1+Neu3" + +# Read seurat object +seurat <- readRDS(opt$inputSeurat) + +outdir <- opt$outdir + +# Set logFC threshold +if (is.null(opt$logfc_threshold)) { + opt$logfc_threshold <- 0.25 +} + +# Set logFC threshold +if (is.null(opt$pval)) { + opt$pval <- 0.05 +} + +# Define ref clustering +if(is.null(opt$resolution)){ + opt$resolution <- "integrated_snn_res.0.3" +} +targetRes <- opt$resolution + +# Define default assay +if(is.null(opt$assay)){ + opt$assay <- "RNA" +} + +DefAssay <- opt$assay + +# Prepare cluster name +clustName <- strsplit(opt$clustName, split = "\\+")[[1]] + +# Read signature list +signature_files <- list.files(opt$SigList) + +signature_names <- strsplit(signature_files, split = ".txt") + +signature_list <- list() +for(i in 1:length(signature_files)){ + name <- signature_names[[i]] + print(name) + signature_list[[name]] <- assign(signature_names[[i]], + read.table(paste0(opt$SigList, signature_files[i]))[,1]) +} + +signature_list_mouse <- lapply(signature_list, convertHumanGeneList) + +theme_set(theme_classic()) +dir.create(outdir, recursive = T) + +## --------------------------------------------------------------------- +## Color scheme +## --------------------------------------------------------------------- + +colorTreatment <- c("#664CFF", "#FF8000") +colorCluster <- c("#E69F00", "#CC79A7", "#0072B2", "#009E73", "#D55E00", "#56B4E9") +colorPhases <- c(brewer.pal(9,"RdPu"))[c(3,6,9)] +colorBranches <- c("#332288", "#117733", "#DDCC77", "#AA4499", "#88CCEE") + +##-------------------------------------------------# +## Data preparation : Clust name etc +##-------------------------------------------------# + +DefaultAssay(seurat) <- DefAssay + +Idents(seurat) <- targetRes + +names(clustName) <- levels(seurat) + +seurat <- RenameIdents(seurat, clustName) +seurat@meta.data$FinalCluster <- Idents(seurat) + +seurat@meta.data$FinalCluster <- factor(seurat@meta.data$FinalCluster, levels = c("Rep", "Neu1", "Neu2", "Neu3", "NeuRA1", "NeuRA2")) + +Idents(seurat) <- "FinalCluster" + +seurat$Treatment_clust <- paste(seurat$FinalCluster, seurat$condition, sep = "_") +seurat@meta.data$phases <- factor(seurat@meta.data$phases, levels = c("G1_G0", "S", "G2_M")) + +##--------------------------------------------------# +## Looking for cluster markers with new names +##--------------------------------------------------# + +Idents(seurat) <- "FinalCluster" +markers_cluster <- FindAllMarkers(seurat, logfc.threshold = opt$logfc_threshold) +markers_cluster_sig <- markers_cluster[markers_cluster$p_val_adj < opt$pval,] + +write.table(x = markers_cluster_sig, file = paste0(outdir, "Markers_table.tsv"), quote = F, col.names = T, row.names = F, sep = "\t") + +##-----------------------------------------------------# +## Looking for geneSet enrichment on cluster markers +##-----------------------------------------------------# + +enrich_clust <- list() +for(cluster in unique(markers_cluster$cluster)){ + print(cluster) + enrich_clust[[cluster]] <- gProfileR::gprofiler(markers_cluster[markers_cluster$cluster %in% cluster, "gene"], + organism = "mmusculus", + custom_bg = rownames(seurat), + ordered_query = "none") + enrich_clust[[cluster]]$cluster <- cluster +} +enrich_clust.df <- do.call("rbind", enrich_clust) + +write.table(enrich_clust.df, file = paste0(outdir, "enrichmentTable.tsv"), sep = "\t", row.names = F, col.names = T, quote = F) + + +#-------------------------------------------------# +# Basic UMAP plotting +#-------------------------------------------------# +dir.create(paste0(outdir, "UMAP/"), recursive = T) + +UMAP_Clust <- DimPlot(seurat, pt.size = 0.4, cols = colorCluster, group.by = "FinalCluster") + theme(axis.title = element_blank(), axis.text = element_blank()) +UMAP_treatment <- DimPlot(seurat, split.by = "condition", group.by = "condition", pt.size = 0.4, cols = colorTreatment) + theme(axis.title = element_blank(), axis.text = element_blank()) + +# UMAP treatment split +Idents(seurat) <- "condition" +seurat_control <- subset(seurat, idents = "Control") +seurat_treated <- subset(seurat, idents = "Treated") +Idents(seurat) <- "FinalCluster" + +UMAP_control <- DimPlot(seurat_control, split.by = "condition", group.by = "condition", pt.size = 0.4, cols = colorTreatment[1]) + theme(axis.title = element_blank(), axis.text = element_blank(), legend.position = "none") +UMAP_treated <- DimPlot(seurat_treated, split.by = "condition", group.by = "condition", pt.size = 0.4, cols = colorTreatment[2]) + theme(axis.title = element_blank(), axis.text = element_blank(), legend.position = "none") + +UMAP_CellCycle <- DimPlot(seurat, pt.size = 0.4, cols = colorPhases, group.by = "phases") + theme(axis.title = element_blank(), axis.text = element_blank()) +UMAP_CellCycle_split <- DimPlot(seurat, split.by = "condition", group.by = "phases", pt.size = 0.4, cols = colorPhases) + theme(axis.title = element_blank(), axis.text = element_blank()) + +ggsave(paste0(outdir,"UMAP/", 'UMAP_CellCycle', '.png'), plot = UMAP_CellCycle, device = 'png', path = NULL, + scale = 1, width = 15, height = 15, units = 'cm', dpi = 300) + +ggsave(paste0(outdir,"UMAP/", 'UMAP_split_CellCycle', '.png'), plot = UMAP_CellCycle_split, device = 'png', path = NULL, + scale = 1, width = 20, height = 15, units = 'cm', dpi = 300) +ggsave(paste0(outdir,"UMAP/", 'UMAP_cluster', '.png'), plot = UMAP_Clust, device = 'png', path = NULL, + scale = 1, width = 15, height = 15, units = 'cm', dpi = 300) + +ggsave(paste0(outdir, "UMAP/", 'UMAP_split_treatment', '.png'), plot = UMAP_treatment, device = 'png', path = NULL, + scale = 1, width = 20, height = 15, units = 'cm', dpi = 300) +ggsave(paste0(outdir, "UMAP/", 'UMAP_treatment_Control', '.png'), plot = UMAP_control, device = 'png', path = NULL, + scale = 1, width = 15, height = 15, units = 'cm', dpi = 300) +ggsave(paste0(outdir, "UMAP/", 'UMAP_treatment_Treated', '.png'), plot = UMAP_treated, device = 'png', path = NULL, + scale = 1, width = 15, height = 15, units = 'cm', dpi = 300) + +#-------------------------------------------------# +# Generate the barplot for cell distrib in cluster +#-------------------------------------------------# +dir.create(paste0(outdir, "BarPlot/"), recursive = T) +#BP of cell cycle distrib per cluster +summary_phases <- ddply(seurat@meta.data,~FinalCluster + phases + condition, nrow) + +bp_phasesClust_control <- ggplot(data.frame(summary_phases[summary_phases$condition %in% "Control",]), aes(fill = phases,y = V1, x=FinalCluster)) + + geom_bar( stat="identity", position="fill")+ + scale_fill_manual( values= colorPhases)+ + scale_y_continuous(name = "Sample (%)", labels = c(0,25,50,75,100))+ + ylab(label = "")+xlab(label = "") + coord_flip()+ + theme(legend.title=element_blank()) + ggtitle("Control") + +bp_phasesClust_treated <- ggplot(data.frame(summary_phases[summary_phases$condition %in% "Treated",]), aes(fill = phases,y = V1, x=FinalCluster)) + + geom_bar( stat="identity", position="fill")+ + scale_fill_manual( values= colorPhases)+ + scale_y_continuous(name = "Sample (%)", labels = c(0,25,50,75,100))+ + ylab(label = "")+xlab(label = "") + coord_flip()+ + theme(legend.title=element_blank()) + ggtitle("Treated") + +cellCycle_legend <- get_legend(bp_phasesClust_treated) + +bp_grid_cellCycle <- plot_grid(bp_phasesClust_control + theme(legend.position = "none"), bp_phasesClust_treated + theme(legend.position = "none")) +bp_cellCycle_leg_grid <- plot_grid(bp_grid_cellCycle, cellCycle_legend, rel_widths = c(3,0.4)) + +ggsave(paste0(outdir, "BarPlot/", 'BP_cellCycle_split', '.png'), plot = bp_cellCycle_leg_grid, device = 'png', path = NULL, + scale = 1, width = 20, height = 20, units = 'cm', dpi = 300) + +CellPerClust <- ddply(seurat@meta.data,~FinalCluster + condition, nrow) + +bp_rawCellCount_control <- ggplot(data.frame(CellPerClust[CellPerClust$condition %in% "Control",]), aes(y = V1, x=FinalCluster)) + + geom_bar( stat="identity") + coord_flip() + ylab("nCells") + expand_limits(y = c(NULL, 3000)) +bp_rawCellCount_treated <- ggplot(data.frame(CellPerClust[CellPerClust$condition %in% "Treated",]), aes(y = V1, x=FinalCluster)) + + geom_bar( stat="identity") + coord_flip() + ylab("nCells") + expand_limits(y = c(NULL, 3000)) + +ggsave(paste0(outdir, "BarPlot/", 'BP_raw_cellCount_control', '.png'), plot = bp_rawCellCount_control, device = 'png', path = NULL, + scale = 1, width = 15, height = 20, units = 'cm', dpi = 300) +ggsave(paste0(outdir, "BarPlot/", 'BP_raw_cellCount_treated', '.png'), plot = bp_rawCellCount_treated, device = 'png', path = NULL, + scale = 1, width = 15, height = 20, units = 'cm', dpi = 300) + +bp_control_raw_cycle <- plot_grid(bp_phasesClust_control + theme(legend.position = "none"), bp_rawCellCount_control + theme(axis.title.y = element_blank(), axis.text.y = element_blank()) + ggtitle(" "), rel_widths = c(2,1)) +bp_treated_raw_cycle <- plot_grid(bp_phasesClust_treated + theme(legend.position = "none"), bp_rawCellCount_treated + theme(axis.title.y = element_blank(), axis.text.y = element_blank()) + ggtitle(" "), rel_widths = c(2,1)) + +bp_all_grid_noleg <- plot_grid(bp_control_raw_cycle, bp_treated_raw_cycle, nrow = 2) +bp_all_grid_leg <- plot_grid(bp_all_grid_noleg, cellCycle_legend, rel_widths = c(3,0.4)) + +ggsave(paste0(outdir, "BarPlot/", 'BP_grid_cellCycle_cellCount', '.png'), plot = bp_treated_raw_cycle, device = 'png', path = NULL, + scale = 1, width = 20, height = 20, units = 'cm', dpi = 300) + +# BP proportion treatment per cluster +clustersampleName <- ddply(seurat@meta.data,~FinalCluster + condition,nrow) +propExpect <- table(seurat@meta.data$condition)/length(seurat@meta.data$condition)[] + +enrich_clust <- getEnrichPopClust(hspc.combined = seurat, Xname = "Control", Yname = "Treated", colorX = colorTreatment[1], colorY = colorTreatment[2], metaCol = "condition", clustCol = "FinalCluster") + +clustersampleName$condition <- factor(clustersampleName$condition, levels = c("Treated", "Control")) + +bp_treatmentClust <- ggplot(data.frame(clustersampleName), aes(fill = condition,y = V1, x=FinalCluster)) + + geom_bar( stat="identity", position="fill")+ + scale_fill_manual( values= rev(colorTreatment))+ + scale_y_continuous(name = "Sample (%)", labels = c(0,25,50,75,100))+ + ylab(label = "")+xlab(label = "") + coord_flip()+ + theme(legend.title=element_blank(), axis.text.y = element_text(colour = enrich_clust[,"color"])) + + geom_hline(yintercept = propExpect[1]) + +ggsave(paste0(outdir, "BarPlot/",'BP_treatment', '.png'), plot = bp_treatmentClust, device = 'png', path = NULL, + scale = 1, width = 15, height = 20, units = 'cm', dpi = 300) + +# BP cell number distribution per cluster +CellPerClust <- ddply(seurat@meta.data,~FinalCluster, nrow) + +bp_rawTreatmentClust <- ggplot(data.frame(CellPerClust), aes(y = V1, x=FinalCluster)) + + geom_bar( stat="identity") + coord_flip() + ylab("nCells") + +ggsave(paste0(outdir, "BarPlot/",'BP_raw_treatment', '.png'), plot = bp_rawTreatmentClust, device = 'png', path = NULL, + scale = 1, width = 15, height = 20, units = 'cm', dpi = 300) + +# Arranging BP grid +treatment_legend <- get_legend(bp_treatmentClust) + +bp_grid <- plot_grid(bp_treatmentClust + theme(legend.position = "none"), + bp_rawTreatmentClust + theme(axis.text.y = element_blank(), axis.title.y = element_blank()), + treatment_legend, rel_widths = c(2,1,0.5), nrow = 1) + +ggsave(paste0(outdir, "BarPlot/",'BP_grid_treatment', '.png'), plot = bp_grid, device = 'png', path = NULL, + scale = 1, width = 15, height = 20, units = 'cm', dpi = 300) + + +#-------------------------------------------------# +# Create signature assay +#-------------------------------------------------# + +for(sig in signature_names){ + print(sig) + print(head(signature_list_mouse[[sig]])) + seurat <- AddModuleScore(seurat, features = list(signature_list_mouse[[sig]]), name = sig) + colnames(seurat@meta.data)[which(colnames(seurat@meta.data) == paste0(sig, 1))] <- sig +} + +dir.create(paste0(outdir, "Signature/"), recursive = T) + +seurat[["Signature"]] <- CreateAssayObject(data = t(as.matrix(seurat@meta.data[,do.call("rbind", signature_names)[,1]]))) + +DefaultAssay(seurat) <- "Signature" + +##------------------------------------------------ +## Signature analysis +##------------------------------------------------ + + + +for(sig in signature_names){ + print(sig) + Idents(seurat) <- "FinalCluster" + sig_assay <- gsub(sig, pattern = "_", replacement = "-") + print(sig_assay) + + markers_sig <- FindAllMarkers(seurat, logfc.threshold = 0, min.pct = 0, features = sig_assay) + + score_clust <- list() + for(cluster in levels(seurat$FinalCluster)){ + cells_clust <- rownames(seurat@meta.data[seurat@meta.data$FinalCluster %in% cluster,]) + cells_other <- rownames(seurat@meta.data[!seurat@meta.data$FinalCluster %in% cluster,]) + score_clust[[cluster]] <- featureDiff(seurat = seurat, cells.1 = cells_clust, cells.2 = cells_other, feature = sig_assay) + } + summary_score_clust <- do.call("rbind", score_clust) + + rownames(markers_sig) <- markers_sig$cluster + markers_sig$Diff <- summary_score_clust[rownames(markers_sig),1] + + # Find population in each cluster were the signature is overexpressed + Idents(seurat) <- "Treatment_clust" + # print() + summary_marker_split <- list() + for(cluster in levels(seurat$FinalCluster)){ + print(paste0("Starting treatment effect on ", cluster)) + summary_marker_split[[paste(cluster, "Treated_vs_Control", sep = "_")]] <- FindMarkers(object = seurat, + ident.1 = paste(cluster, "Treated", sep = "_"), + ident.2 = paste(cluster, "Control", sep = "_"), + logfc.threshold = 0, min.pct = 0, features = sig_assay) + + summary_marker_split[[paste(cluster, "Treated_vs_Control", sep = "_")]]$Cluster <- cluster + summary_marker_split[[paste(cluster, "Treated_vs_Control", sep = "_")]]$Comparison <- "Treated_vs_Control" + } + + summary_marker_split.df <- do.call("rbind", summary_marker_split) + + # Find population in each cluster were the signature is overexpressed + score_clustTreat <- list() + for(cluster in levels(seurat$FinalCluster)){ + cells_clust_treated <- rownames(seurat@meta.data[seurat@meta.data$Treatment_clust %in% paste(cluster, "Treated", sep = "_"),]) + cells_clust_control <- rownames(seurat@meta.data[seurat@meta.data$Treatment_clust %in% paste(cluster, "Control", sep = "_"),]) + score_clustTreat[[paste(cluster, "Treated_vs_Control", sep = "_")]] <- featureDiff(seurat = seurat, cells.1 = cells_clust_treated, cells.2 = cells_clust_control, feature = sig_assay) + } + summary_score_clustTreat <- do.call("rbind", score_clustTreat) + + summary_marker_split.df$Diff <- summary_score_clustTreat[rownames(summary_marker_split.df),1] + + # Violin plot of the signature + vln_sig <- VlnPlot(seurat, features = sig_assay, pt.size = 0, cols = colorCluster, group.by = "FinalCluster") + + vln_sig_split <- VlnPlot(seurat, features = sig_assay, group.by = "FinalCluster", split.by = "condition", cols = colorTreatment, pt.size = 0) + + #Saving data + write.table(markers_sig, file = paste0(outdir, "Signature/", "Table_", sig, ".tsv"), quote = F, row.names = F, col.names = T, sep = "\t") + + write.table(summary_marker_split.df, file = paste0(outdir, "Signature/", "Table_", sig, "_Treatment", ".tsv"), quote = F, row.names = F, col.names = T, sep = "\t") + + ggsave(paste0(outdir, "Signature/", 'Vln_', sig, '.png'), plot = vln_sig, device = 'png', path = NULL, + scale = 1, width = 15, height = 10, units = 'cm', dpi = 300) + + ggsave(paste0(outdir, "Signature/", 'Vln_', sig, '_Treatment', '.png'), plot = vln_sig_split, device = 'png', path = NULL, + scale = 1, width = 15, height = 10, units = 'cm', dpi = 300) + +} + +Idents(seurat) <- "FinalCluster" +DefaultAssay(seurat) <- DefAssay + +saveRDS(seurat, paste0(outdir, "seurat_annotated", ".rds")) diff --git a/R_src/signacSmpCL.R b/R_src/signacSmpCL.R new file mode 100644 index 0000000000000000000000000000000000000000..7fa4d978d9c7654481fdb4f1a091f005f1045295 --- /dev/null +++ b/R_src/signacSmpCL.R @@ -0,0 +1,433 @@ +##------------------------------------------------------------------------------ +## L. Herault +##------------------------------------------------------------------------------ + +## ----------------------------------------------------------------------------- +## Libraries +## ----------------------------------------------------------------------------- +library(Signac) +library(Seurat) +library(GenomeInfoDb) +library(EnsDb.Mmusculus.v79) +library(ggplot2) +library(patchwork) +library(getopt) + +set.seed(1234) + +## ----------------------------------------------------------------------------- +## Command line args +## ----------------------------------------------------------------------------- + +spec = matrix(c( + 'help', 'h', 0, "logical", "Help about the program", + 'inputDir', 'i', 1, "character", "REQUIRED : Celranger data raw dir with filtered_peak_bc_matrix.h5,singlecell.csv and fragments.tsv.gz files", + 'outdir', 'o',1, "character", 'Outdir path (default ./)', + "num_dim", 'n', 1, "numeric", "first n dimension of the SVD to use for the reclustering", + "firstDimLSI", "f",0, "logical", "Use first LSI dim, default is do not use as it is often strongly correlated to sequencing depht", + "logfc_threshold", "d", 1, "numeric", "logfc threshold for finding cluster markers (1 cluster vs all deg) 0.25 by default", + "sampleInfo", "s", 1, "character", "sample information that will be added to pData with the following format: age=2_months,runDate=10_12_2017..", + "peakRegionFragmentsMadUp", "u", 1, "numeric", "cell with peaks in region fragment upper than u mad will be discarded (default n = 2)", + "peakRegionFragmentsMadLow", "l", 1, "numeric", "cell with peaks in region fragment lower than l mad will be discarded (default n = 2)", + "pctReadsInPeaksLow", "r", 1, "numeric", "cells with percentage reads in peaks lower than r % will be discarded (default = 15)", + "blackListRatio", "b", 1, "numeric", "cells with a blacklist ratio higher than b will be discarded (default b = 0.05)", + "nucleosomeSignal", "c", 1 , "numeric", "cells with a nucleosome signal higher tban n will be discarded (default n = 4)", + "tssEnrichment", "t", 1, "numeric","cells with a tssEnrichment lower than t will be discarded (default t = 2)" + +), byrow=TRUE, ncol=5); + + + +opt = getopt(spec) + + +# if help was asked, print a friendly message +# and exit with a non-zero error code + +args <- commandArgs() +if ( !is.null(opt$help) | is.null(opt$inputDir)) { + cat("Perform signac sample processing, quality control, genes activity, peak calling, DE peaks") + cat(getopt(spec, usage=TRUE)) + q(status=1) +} + + +if (is.null(opt$logfc_threshold)) { + opt$logfc_threshold = 0.25 +} + +if (is.null(opt$num_dim)) { + opt$num_dim = 40 +} + +if (is.null(opt$firstDimLSI)) { + opt$firstDimLSI <- FALSE +} + +if (is.null(opt$peakRegionFragmentsMadUp)) { + opt$peakRegionFragmentsMadUp <- 2 +} + +if (is.null(opt$peakRegionFragmentsMadLow)) { + opt$peakRegionFragmentsMadLow <- 2 +} + +if (is.null(opt$pctReadsInPeaksLow)) { + opt$pctReadsInPeaksLow <- 15 +} + +if (is.null(opt$blackListRatio)) { + opt$blackListRatio <- 0.05 +} + +if (is.null(opt$nucleosomeSignal)) { + opt$nucleosomeSignal <- 4 +} + +if (is.null(opt$tssEnrichment)) { + opt$tssEnrichment <- 2 +} + + +print(opt) + + + +## ----load------------------------------------------------------------------------------------------------------ +counts <- Read10X_h5(filename =paste0(opt$inputDir,"/filtered_peak_bc_matrix.h5")) +metadata <- read.csv( + file = paste0(opt$inputDir,"/singlecell.csv"), + header = TRUE, + row.names = 1 +) + +chrom_assay <- CreateChromatinAssay( + counts = counts, + sep = c(":", "-"), + genome = 'mm10', + fragments = paste0(opt$inputDir,"/fragments.tsv.gz"), + min.cells = 10, + min.features = 200 +) + +prom <- CreateSeuratObject( + counts = chrom_assay, + assay = "peaks", + meta.data = metadata +) + +#Add sample infos + +print(opt$sampleInfo) +sampleInfos <- strsplit(opt$sampleInfo,split=",")[[1]] +for (i in sampleInfos) { + print(i) + info <- strsplit(i,split = "=")[[1]][1] + print(info) + value <- strsplit(i,split = "=")[[1]][2] + print(value) + prom@meta.data[,info] <- value +} + +## -------------------------------------------------------------------------------------------------------------- +prom + + +## -------------------------------------------------------------------------------------------------------------- +prom[["peaks"]] + + +## -------------------------------------------------------------------------------------------------------------- +gr <- granges(prom) + +df <- data.frame(seqnames=seqnames(gr), + starts=start(gr)-1, + ends=end(gr), + names=c(rep(".", length(gr))), + scores=c(rep(".", length(gr))), + strands=strand(gr)) + +write.table(df, file=paste0(opt$outdir,"/10X_peaks.bed"), quote=F, sep="\t", row.names=F, col.names=F) +## -------------------------------------------------------------------------------------------------------------- +# extract gene annotations from EnsDb +annotations <- GetGRangesFromEnsDb(ensdb = EnsDb.Mmusculus.v79) + +# change to UCSC style since the data was mapped to hg19 +seqlevelsStyle(annotations) <- 'UCSC' +genome(annotations) <- "mm10" + +# add the gene information to the object +Annotation(prom) <- annotations + + +## -------------------------------------------------------------------------------------------------------------- +# compute nucleosome signal score per cell +prom <- NucleosomeSignal(object = prom) + +# compute TSS enrichment score per cell +prom <- TSSEnrichment(object = prom, fast = FALSE) + +# add blacklist ratio and fraction of reads in peaks +prom$pct_reads_in_peaks <- prom$peak_region_fragments / prom$passed_filters * 100 +prom$blacklist_ratio <- prom$blacklist_region_fragments / prom$peak_region_fragments + + +## -------------------------------------------------------------------------------------------------------------- +prom$high.tss <- ifelse(prom$TSS.enrichment > 2, 'High', 'Low') +png(paste0(opt$outdir,"/TSSplot.png"),units = "in",height = 6,width=5,res = 300) +TSSPlot(prom, group.by = 'high.tss') + NoLegend() +dev.off() + +## -------------------------------------------------------------------------------------------------------------- +prom$nucleosome_group <- ifelse(prom$nucleosome_signal > 2, 'NS > 2', 'NS < 2') +FragmentHistogram(object = prom,region = "chr1-1-100000000",group.by = 'nucleosome_group') + + +## ----fig.height=5,fig.width=14--------------------------------------------------------------------------------- +png(paste0(opt$outdir,"/qcMetrics.png"),units = "in",height = 5,width=14,res = 300) +VlnPlot( + object = prom, + features = c('pct_reads_in_peaks', 'peak_region_fragments', + 'TSS.enrichment', 'blacklist_ratio', 'nucleosome_signal'), + pt.size = 0.001, + ncol = 5 +) +dev.off() + +## ------------------------------------------------------------------------------------- +minCountThreshold <- median(prom$peak_region_fragments) -opt$peakRegionFragmentsMadLow*mad(prom$peak_region_fragments) +maxCountThreshold <- median(prom$peak_region_fragments) +opt$peakRegionFragmentsMadUp*mad(prom$peak_region_fragments) + +png(paste0(opt$outdir,"/peak_region_fragments_raw.png"),units = "in",height = 5,width=6,res = 300) +VlnPlot( + object = prom, + features = c('peak_region_fragments'), + pt.size = 0.001, +) + geom_hline(yintercept = minCountThreshold) + + geom_hline(yintercept = maxCountThreshold) +dev.off() + + +## -------------------------------------------------------------------------------------------------------------- +prom <- subset( + x = prom, + subset = peak_region_fragments > median(prom$peak_region_fragments) -opt$peakRegionFragmentsMadLow*mad(prom$peak_region_fragments) & + peak_region_fragments < median(prom$peak_region_fragments) +opt$peakRegionFragmentsMadUp*mad(prom$peak_region_fragments) & + pct_reads_in_peaks > opt$pctReadsInPeaksLow & + blacklist_ratio < opt$blackListRatio & + nucleosome_signal < opt$nucleosomeSignal & + TSS.enrichment > opt$tssEnrichment +) +prom + +## ------------------------------------------------------------------------------------- + +png(paste0(opt$outdir,"/peak_region_fragments_filtered.png"),units = "in",height = 5,width=6,res = 300) +VlnPlot( + object = prom, + features = c('peak_region_fragments'), + pt.size = 0.001, +) +dev.off() + + + +## ----fig.height=5,fig.width=14--------------------------------------------------------------------------------- +png(paste0(opt$outdir,"/qcMetricsFiltered.png"),units = "in",height = 5,width=14,res = 300) +VlnPlot( + object = prom, + features = c('pct_reads_in_peaks', 'peak_region_fragments', + 'TSS.enrichment', 'blacklist_ratio', 'nucleosome_signal'), + pt.size = 0.001, + ncol = 5 +) +dev.off() + + +## -------------------------------------------------------------------------------------------------------------- +prom <- RunTFIDF(prom) +prom <- FindTopFeatures(prom, min.cutoff = 'q75') +prom <- RunSVD(prom, n =opt$num_dim) + + +## -------------------------------------------------------------------------------------------------------------- +png(paste0(opt$outdir,"/DepthCor.png"),units = "in",height = 6,width=6,res = 300) +DepthCor(prom) +dev.off() + +## -------------------------------------------------------------------------------------------------------------- +if (opt$firstDimLSI) { + dims = c(1:30) +} else { + dims = c(2:30) +} + +prom <- RunUMAP(object = prom, reduction = 'lsi', dims = dims) +prom <- FindNeighbors(object = prom, reduction = 'lsi', dims = dims) +prom <- FindClusters(object = prom, verbose = FALSE, algorithm = 3) +png(paste0(opt$outdir,"/UMAP_clust.png"),units = "in",height = 6,width=6,res = 300) +DimPlot(object = prom, label = TRUE) + NoLegend() +dev.off() + + +## -------------------------------------------------------------------------------------------------------------- +gene.activities <- GeneActivity(prom) + + +## -------------------------------------------------------------------------------------------------------------- +# add the gene activity matrix to the Seurat object as a new assay and normalize it +prom[['activities']] <- CreateAssayObject(counts = gene.activities) +prom <- NormalizeData( + object = prom, + assay = 'activities', + normalization.method = 'LogNormalize', + scale.factor = median(prom$nCount_activities) +) + +## -------------------------------------------------------------------------------------------------------------- +DefaultAssay(prom) <- 'activities' +markers <- FindAllMarkers(object = prom,only.pos = T) +markers <- markers[order(markers$p_val_adj),] +head(markers) +markers <- markers[markers$p_val_adj<0.05,] + +write.table(markers,paste0(opt$outdir,"/geneActMarkers.tsv"),quote = F,sep = '\t',row.names = F) + +## ----fig.height=8,fig.width=6---------------------------------------------------------------------------------- +png(paste0(opt$outdir,"/FeaturePlot.png"),units = "in",height = 8,width=6,res = 300) +FeaturePlot( + object = prom, + features = head(markers$gene), + pt.size = 0.1, + max.cutoff = 'q95', +) +dev.off() + + +## -------------------------------------------------------------------------------------------------------------- +DefaultAssay(prom) <- 'peaks' + +da_peaks <- FindAllMarkers( + object = prom, + + min.pct = 0.2, + test.use = 'LR', + latent.vars = 'peak_region_fragments' +) +da_peaks <- da_peaks[da_peaks$p_val_adj < 0.05,] +head(da_peaks) +write.table(da_peaks,paste0(opt$outdir,"/peakMarkers.tsv"),quote = F,sep = '\t',row.names = F) + + +## -------------------------------------------------------------------------------------------------------------- + +#da_peaks <- da_peaks[da_peaks$pct.1>0.4,] + +closest_genes_clusterPeaks <- ClosestFeature(prom, regions = rownames(da_peaks)) + + +head(closest_genes_clusterPeaks) + + +## ----fig.height=8,fig.width=8---------------------------------------------------------------------------------- + +plot1 <- VlnPlot( + object = prom, + features = rownames(da_peaks)[c(1,10)], + pt.size = 0.1,ncol = 1, +) +plot2 <- FeaturePlot( + object = prom, + features =rownames(da_peaks)[c(1,10)], + pt.size = 0.1,ncol = 1, +) +png(paste0(opt$outdir,"/peaksPlot.png"),units = "in",height = 8,width=8,res = 300) +plot1 | plot2 +dev.off() + + + +## -------------------------------------------------------------------------------------------------------------- +#closest_genes_clusterPeaks[closest_genes_clusterPeaks$query_region %in% rownames(da_peaks[order(da_peaks$avg_logFC,decreasing = T),])[c(1,10)],"gene_name"] +#To do add to peak markers table + + +## -------------------------------------------------------------------------------------------------------------- + + +## -------------------------------------------------------------------------------------------------------------- +# set plotting order + +png(paste0(opt$outdir,"/CoveragePlot.png"),units = "in",height = 8,width=8,res = 300) +CoveragePlot( + object = prom, + region = rownames(da_peaks)[1], + extend.upstream = 40000, + extend.downstream = 20000 +) +dev.off() + + +# # Trying macs2 call peaks +# promMacs <- prom +# +# peaksMacs <- CallPeaks( +# object = prom, +# macs2.path = "/shared/home/lherault/bin/miniconda3/envs/signac_env/bin/macs2" #TO DO update env and âs macc2 path through command line +# ) +# +# +# gr <- peaksMacs +# df <- data.frame(seqnames=seqnames(gr), +# starts=start(gr)-1, +# ends=end(gr), +# names=c(rep(".", length(gr))), +# scores=c(rep(".", length(gr))), +# strands=strand(gr)) +# +# write.table(df, file=paste0(opt$outdir,"/macs2_peaks.bed"), quote=F, sep="\t", row.names=F, col.names=F) +# +# +# +# # call peaks for each cell type using MACS2 +# DefaultAssay(pbmc) <- "cellranger" +# peaks <- CallPeaks( +# object = pbmc, +# group.by = "celltype", +# additional.args = "--max-gap 50" +# ) +# +# peaks <- keepStandardChromosomes(peaks, pruning.mode = "coarse") +# +# # remove peaks in blacklist regions +# peaks <- subsetByOverlaps(x = peaks, ranges = blacklist_hg38_unified, invert = TRUE) +# +# # quantify peaks +# peakcounts <- FeatureMatrix( +# fragments = Fragments(pbmc), +# features = peaks, +# cells = colnames(pbmc) +# ) +# +# pbmc[["ATAC"]] <- CreateChromatinAssay( +# counts = peakcounts, +# min.cells = 5, +# genome = "hg38", +# fragments = fragments, +# annotation = annotations +# ) +# +# DefaultAssay(pbmc) <- "ATAC" +# +# pbmc <- FindTopFeatures(pbmc, min.cutoff = 10) +# pbmc <- RunTFIDF(pbmc) +# pbmc <- RunSVD(pbmc) +# pbmc <- RunUMAP(pbmc, reduction = "lsi", dims = 2:40, reduction.name = "umap.atac") +# pbmc <- FindNeighbors(pbmc, reduction = "lsi", dims = 2:40) +# pbmc <- FindClusters(pbmc, algorithm = 3) + + + +## -------------------------------------------------------------------------------------------------------------- +saveRDS(prom,paste0(opt$outdir,"/atac.rds")) + diff --git a/R_src/signac_integration.R b/R_src/signac_integration.R new file mode 100644 index 0000000000000000000000000000000000000000..092514fc870360454655914bd8abd84da69c3aac --- /dev/null +++ b/R_src/signac_integration.R @@ -0,0 +1,272 @@ +##------------------------------------------------------------------------------ +## L. Herault +##------------------------------------------------------------------------------ + +## ----------------------------------------------------------------------------- +## Libraries +## ----------------------------------------------------------------------------- + +suppressMessages(library(Signac)) +suppressMessages(library(Seurat)) +suppressMessages(library(getopt)) +suppressMessages(library(gridExtra)) +# suppressMessages(library(gProfileR)) +# suppressMessages(library(RColorBrewer)) +# suppressMessages(library(readxl)) +# suppressMessages(library(stringr)) +suppressMessages(library(scales)) +# suppressMessages(library(cowplot)) +# suppressMessages(library(sctransform)) +# suppressMessages(library(stringr)) +# suppressMessages(library(plyr)) +suppressMessages(library(grid)) + +suppressMessages(library(S4Vectors)) +suppressMessages(library(patchwork)) +set.seed(2021) + +source("R_src/getSigPerCells.R") +source("R_src/Enrichment.R") +source("R_src/funForSeurat.R") + + +# Seurat 3 integration workflow + +## ----------------------------------------------------------------------------- +## Command line args +## ----------------------------------------------------------------------------- + +spec = matrix(c( + 'help', 'h', 0, "logical", "Help about the program", + 'inputFiles', 'i', 1, "character", "REQUIRED: 10X dataset paths prepared as atac.rds object (.RDS generated by signacSmpCL.R) separated by +", + 'minCutOff', 'm', 1, "character" , "minimum Cut Off on count for peaks (default 50)", + 'subsetInterPeaks', 's',1, "character", "REQUIRED:sizie of subset intersection peaks used for integration (default 10000)", + 'outdir', 'o',1, "character", 'Outdir path (default ./)', + "num_dim_CCA", 'q',1,"numeric", "First n dimensions of CCA to use for FindIntegrationAnchors Seurat function (15 by default)", + "num_dim_weight", 'w',1,"numeric", "First n dimensions to use for IntegrateData Seurat function (15 by default)", + "num_dim", 'n',1,"numeric", "First n dimensions of PCA to use for clustering, UMAP and TSNE (15 by default)", + "num_dim_integrated",'N',1,"numeric", "Number of PCA dimension computed to analyse integrated data (40 by default)", + "cores", 'c',1, "numeric", "Number of cores to use for ordering (for differencially expressed gene between clusters test)", + "resolution", 'r',1, "numeric", "resolution for hspc.combined clustering", + "correction", "b",1,"character", "Covariable to use as blocking factor (eg one or several columns of pData: betch, cell cycle phases... separated by +)", + "logfc_threshold", "l", 1, "numeric", "logfc threshold for finding cluster markers (1 cluster vs all deg) 0.25 by default", + "rodriguezSig", "k", 1, "character", "path for xls file of Rodriguez result", + "norm_method", "z",1, "character", "normalization method, logNorm (by default) or sctransform", + "reusePca", "p", 0, "character","re use pca calculated before when caculating anchor weights for each dataset default to FALSE (permit to correct for cell cycle before integration)" +), byrow=TRUE, ncol=5); + +opt = getopt(spec) + +## For test +setwd("/shared/projects/scRNA_HSPC_Aging/GMP_PRARA/") +opt <- list() +opt$inputFiles <- "output/ATAC/smp/RA/atac.rds+output/ATAC/smp/Ctrl/atac.rds" +opt$subsetInterPeaks <- 10000 +opt$minCutOff <- 50 +opt$outdir <- "output/ATAC/integration/" +# if help was asked, print a friendly message +# and exit with a non-zero error code + +args <- commandArgs() +if ( !is.null(opt$help) | is.null(opt$inputFiles)) { + cat("Perform Seurat 3 integration workflow, then cluster the cell with seurat 3 at different resolutions") + cat(getopt(spec, usage=TRUE)) + q(status=1) +} + +#set default arguments values + +if (is.null(opt$resolution)) { + opt$resolution <- 0.6 +} + +if (is.null(opt$logfc_threshold)) { + opt$logfc_threshold <- 0.25 +} + +if (is.null(opt$num_dim_CCA)) { + opt$num_dim_CCA <- 15 +} + +if (is.null(opt$num_dim_weight)) { + opt$num_dim_weight <- 15 +} + +if (is.null(opt$num_dim)) { + opt$num_dim <- 15 +} + +if (is.null(opt$num_dim_integrated)) { + opt$num_dim_integrated <- 40 +} + +if (is.null(opt$norm_method)) { + opt$norm_method <- "logNorm" +} + +if (is.null(opt$reusePca)) { + opt$reusePca <- FALSE +} + +#blank <- grid.rect(gp=gpar(col="white")) +dir.create(opt$outdir,recursive = T,showWarnings = F) + +print(opt) + + +############################################################################################################################################ +## QC preprocessing already done in the workflow +## Get common features across the datasets + +# load object + +listFile <- strsplit(opt$inputFiles,split = "\\+")[[1]] + +smpList <- list() +#load dataset + +for (i in 1:length(x = listFile)) { + #For testing, in final workflow sampleName will be incorporated in metadata with the loading of cell ranger matrix + sampleName <- strsplit(listFile[i],split = "/")[[1]][4] + smpList[[i]] <- readRDS(listFile[i]) + #smpList[[i]]@meta.data$sampleName <- sampleName + smpList[[i]] <- RenameCells(smpList[[i]],add.cell.id = sampleName) + +} + + +# find peaks that intersect in both datasets +intersecting.regions <- findOverlaps(query = smpList[[1]], subject = smpList[[2]]) + +# find the coordinates of peaks in the first sample that intersect peaks in the second sample +intersections.2 <- unique(queryHits(intersecting.regions)) + +# choose a subset of intersecting peaks +## TO DO +peaks.use <- sort(granges(smpList[[1]])[sample(intersections.2, size = opt$subsetInterPeaks, replace = FALSE)]) + +# count fragments per cell overlapping the set of peaks in the secon sample data +peaks <- FeatureMatrix( + fragments = Fragments(smpList[[2]]), + features = peaks.use, + cells = colnames(smpList[[2]]) +) + +# create a new assay and add it to the second sample dataset +smpList[[2]][['firstSamplePeaks']] <- CreateChromatinAssay( + counts = peaks, + min.cells = 1, + ranges = peaks.use, + genome = 'mm10' +) + +DefaultAssay(smpList[[2]]) <- 'firstSamplePeaks' +smpList[[2]] <- RunTFIDF(smpList[[2]]) ##normalisation + + +############################################################################################################################################# +### Integration +## Look at the data without integration first +# create a new assay in the first sample-ATAC-seq dataset containing the common peaks +peaknames <- GRangesToString(grange = peaks.use) + +smpList[[1]][['firstSamplePeaks']] <- CreateChromatinAssay( + counts <- GetAssayData(smpList[[1]], assay = "peaks", slot = "counts")[peaknames, ], + ranges = peaks.use, + genome = "mm10" +) + +# run TF-IDF for the new assay +DefaultAssay(smpList[[1]]) <- "firstSamplePeaks" +smpList[[1]] <- RunTFIDF(smpList[[1]]) + +unintegrated <- merge(smpList[[1]], smpList[[2]]) +DefaultAssay(unintegrated) <- "firstSamplePeaks" +unintegrated <- RunTFIDF(unintegrated) +unintegrated <- FindTopFeatures(unintegrated, min.cutoff = opt$minCutOff) +unintegrated <- RunSVD(unintegrated) + +png(paste0(opt$outdir,"/unintegratedDepthCor.png"),units = "in",height = 8,width=4,res = 300) +DepthCor(unintegrated) +dev.off() +# first lsi dim not used ? +unintegrated <- RunUMAP(unintegrated, reduction = 'lsi', dims = 2:30) + +p1 <- DimPlot(unintegrated, group.by = 'sampleName', pt.size = 0.1) + ggplot2::ggtitle("Unintegrated") + +## Integration with anchors +# find integration anchors between 10x and sci-ATAC +anchors <- FindIntegrationAnchors( + object.list = list(smpList[[2]], smpList[[1]]), + anchor.features = rownames(smpList[[2]]), + assay = c('firstSamplePeaks', 'firstSamplePeaks'), + k.filter = NA +) + +# integrate data and create a new merged object +integrated <- IntegrateData( + anchorset = anchors, + weight.reduction = list(smpList[[2]][['lsi']],smpList[[1]][['lsi']]), + dims = 2:30, # first dim discarded + preserve.order = TRUE # the larger object has been put first sample and will be the reference +) + +# we now have a "corrected" TF-IDF matrix, and can run LSI again on this corrected matrix +integrated <- RunSVD( + object = integrated, + n = 30, + reduction.name = 'integratedLSI' +) + +png(paste0(opt$outdir,"/DepthCor.png"),units = "in",height = 6,width=6,res = 300) +DepthCor(integrated,reduction = "integratedLSI",assay = "firstSamplePeaks") +dev.off() + +integrated <- RunUMAP( + object = integrated, + dims = 2:30, + reduction = 'integratedLSI' +) + +png(paste0(opt$outdir,"/UmapUnintegratedVSIntegrated.png"),units = "in",height = 8,width=4,res = 300) +p2 <- DimPlot(integrated, group.by = 'sampleName', pt.size = 0.1) + ggplot2::ggtitle("Integrated") +grid.arrange(p1,p2) +dev.off() + +############################################################################################################################################ +### Clustering on lsi integrated space +integrated <- FindNeighbors(object = integrated, reduction = 'integratedLSI', dims = 2:30) +integrated <- FindClusters(object = integrated, verbose = FALSE, algorithm = 3) + +png(paste0(opt$outdir,"/UmapIntegratedClust.png"),units = "in",height = 6,width=6,res = 300) +DimPlot(object = integrated, label = TRUE) + NoLegend() +dev.off() + + + +##################################################################################################################### +### trying harmony + +# library(harmony) +# +# hm.integrated <- RunHarmony( +# object = unintegrated, +# group.by.vars = 'condition', +# reduction = 'lsi', +# assay.use = 'firstSamplePeaks', +# project.dim = FALSE +# ) +# +# # re-compute the UMAP using corrected LSI embeddings +# DepthCor(hm.integrated) +# hm.integrated <- RunUMAP(hm.integrated, dims = 2:30, reduction = 'harmony') +# p5 <- DimPlot(hm.integrated, group.by = 'condition', pt.size = 0.1) + ggplot2::ggtitle("Harmony integration") +# p1 + p5 + +saveRDS(integrated,paste(opt$outdir,"/atacCombined.rds",sep ="")) + + + + + + diff --git a/R_src/streamSeuratCL.R b/R_src/streamSeuratCL.R new file mode 100644 index 0000000000000000000000000000000000000000..0739876c0f7213d526f070b1cefa7ecccb97acbb --- /dev/null +++ b/R_src/streamSeuratCL.R @@ -0,0 +1,226 @@ +## ----------------------------------------------------------------------------- +## Libraries +## ----------------------------------------------------------------------------- + +suppressMessages(library(Seurat)) +suppressMessages(library(ggplot2)) +suppressMessages(library(scales)) +suppressMessages(library(getopt)) +suppressMessages(library(plyr)) +suppressMessages(library(RColorBrewer)) + +# Do hypergeometric test of to population to find if ine proportion is significantly increased +getEnrichAge <- function(hspc.combined,clustCol ='clusterName',metaCol = "age") { + + table <- table(hspc.combined@meta.data[,metaCol],hspc.combined@meta.data[,clustCol]) + + #Remove null column in case of reclustering has been made + + table <- table[,as.vector(which(colSums(table)>0))] + + tablePercent <- prop.table(table,2) + + propExpect <- table(hspc.combined@meta.data[,metaCol])/length(hspc.combined@meta.data[,metaCol]) + propExpectAge_1<- propExpect[[unique(hspc.combined@meta.data[,metaCol])[1]]] + propExpectAge_2<- propExpect[[unique(hspc.combined@meta.data[,metaCol])[2]]] + phyper <- rep(NA,length(colnames(table))) + enrich <- rep(NA,length(colnames(table))) + tablePercent <- rbind(tablePercent,enrich,phyper) + + + for (age in unique(hspc.combined@meta.data[,metaCol])) { + for (cluster in colnames(table)) { + if(tablePercent[age,cluster] > propExpect[[age]]) { + cells_pull_marked <- table[age,as.character(cluster)] + cells_pull <- as.numeric(colSums(table)[as.character(cluster)]) + cells_marked_all <- rowSums(table)[age] + all_cells <- length(hspc.combined@meta.data[,metaCol]) + + + + p.value <- phyper(q=cells_pull_marked -1, + m=cells_marked_all, + n=all_cells - cells_marked_all, k= cells_pull, lower.tail=FALSE) + + tablePercent["enrich",cluster] <- age + + tablePercent["phyper",cluster] <- p.value + + } + } + } + return(tablePercent) + +} + +# Hypergeometric test and plotting the result with a bp function, use a getEnrichAge +getEnrichPopClust <- function(hspc.combined, Xname, Yname, colorX, colorY, metaCol = "AGE", clustCol = "numclust"){ + conditionEnrich <- getEnrichAge(hspc.combined = hspc.combined,clustCol = clustCol, metaCol = metaCol) + conditionEnrich <- as.data.frame(t(conditionEnrich)) + conditionEnrich$color <- "black" + conditionEnrich[which(as.numeric(as.vector(conditionEnrich$phyper)) < 0.01 & conditionEnrich$enrich == Xname),"color"] <- colorX + conditionEnrich[which(as.numeric(as.vector(conditionEnrich$phyper)) < 0.01 & conditionEnrich$enrich == Yname),"color"] <- colorY + return(conditionEnrich) +} + + +## ----------------------------------------------------------------------------- +## Command line args +## ----------------------------------------------------------------------------- + +spec = matrix(c( + 'help', 'h', 0, "logical", "Help about the program", + "inputSeurat", "i",1, "character", "input seurat object with cluster column names numclust", + 'outdir', 'o',1, "character", 'Outdir path (default ./)', + "inputStream", "s", 1, "character", "input stream output in csv format", + "colMetaData", "m", 1, "character", "List all the column from inputStream that you want to transfer (sep by +)", + "refPseudotime", "r", 1, "character", "Name of the reference pseudotime", + "treeShape", "t", 1, "character", "Level of the pseudotime tree, from the root to the leafs (sep by +)" +), byrow=TRUE, ncol=5); + + + +opt = getopt(spec) + +# if help was asked, print a friendly message +# and exit with a non-zero error code + +args <- commandArgs() +if ( !is.null(opt$help) | is.null(opt$inputSeurat)) { + cat("Create influence graph from regulon table") + cat(getopt(spec, usage=TRUE)) + q(status=1) +} + +## --------------------------------------------------------------------- +## Read inputs +## --------------------------------------------------------------------- + +# opt$inputSeurat <- "/shared/projects/scRNA_HSPC_Aging/GMP_PRARA/output/RNA/Seurat4_integration/Analysis/seurat_annotated.rds" +# opt$inputStream <- "/shared/projects/scRNA_HSPC_Aging/GMP_PRARA/output/RNA/streamAnalysis/stream_metadata.csv" +# opt$outdir <- "/shared/projects/scRNA_HSPC_Aging/GMP_PRARA/output/RNA/streamAnalysisSeurat/" +# opt$colMetaData <- "branch_id_alias+S0_pseudotime+S1_pseudotime+S2_pseudotime+S3_pseudotime+S4_pseudotime+S5_pseudotime" +# opt$refPseudotime <- "S1_pseudotime" +# opt$treeShape <- "(S1, S0)+(S2, S0)+(S3, S0)+(S4, S3)+(S5, S3)" + +print(opt$colMetaData) +print(opt$refPseudotime) +print(opt$treeShape) + +# Read seurat object +seurat <- readRDS(opt$inputSeurat) + +stream <- read.table(opt$inputStream, header = T, sep = ",", row.names = 1) + +pseudotimeMetadata <- strsplit(opt$colMetaData, split = "\\+")[[1]] + +treeShape <- strsplit(opt$treeShape, split = "\\+")[[1]] + +outdir <- opt$outdir + +print(treeShape) + +# Define colors +colorTreatment <- c("#664CFF", "#FF8000") +colorCluster <- c("#E69F00", "#CC79A7", "#0072B2", "#009E73", "#D55E00", "#56B4E9") +colorPhases <- c(brewer.pal(9,"RdPu"))[c(3,6,9)] +colorBranches <- c("#332288", "#117733", "#DDCC77", "#AA4499", "#88CCEE") + +#----------------------------------------------------# +# Prepare the seurat object for pseudotime analysis +#----------------------------------------------------# + +dir.create(outdir, recursive = T) + +DefaultAssay(seurat) <- "RNA" + +# Add all the metadata from stream output to seurat +seurat@meta.data[,pseudotimeMetadata] <- stream[rownames(seurat@meta.data), pseudotimeMetadata] + +# Setup default pseudotime and make it 100 bigger +seurat$Pseudotime <- seurat@meta.data[,opt$refPseudotime] +seurat$Pseudotime_100 <- seurat@meta.data$Pseudotime*100 + +# Rename branch to easier name +seurat@meta.data$branch_id_alias <- gsub(pattern = "'", replacement = "", x = seurat@meta.data$branch_id_alias) +seurat@meta.data$branch_id_alias <- factor(seurat@meta.data$branch_id_alias, levels = treeShape) + +print("Rename trajectory name") +Idents(seurat) <- "branch_id_alias" +new.traj.name <- LETTERS[1:length(treeShape)] + +names(new.traj.name) <- levels(seurat) + +seurat <- RenameIdents(seurat, new.traj.name) +seurat@meta.data$BranchName <- Idents(seurat) + +print("Renaming done") +Idents(seurat) <- "FinalCluster" + +# seurat@meta.data$FinalBranch <- FALSE +# seurat@meta.data[which(seurat@meta.data$BranchName %in% c("D", "E", "B")), "FinalBranch"] <- TRUE + +#----------------------------------------------------# +# Pseudotime ploting +#----------------------------------------------------# + + +rescale_pseudotime <- rescale(seurat@meta.data[order(seurat$Pseudotime_100, decreasing = F), "Pseudotime_100"], to = c(0,1)) +UMAP_pseudotime_rescale <- FeaturePlot(seurat, features = "Pseudotime_100", pt.size = 0.4) + scale_color_viridis_c(option = "C", values = rescale_pseudotime) + +UMAP_branchName <- DimPlot(seurat, group.by = "BranchName", cols = colorBranches, pt.size = 0.4) + +ggsave(paste0(outdir, 'UMAP_Branches', '.png'), plot = UMAP_branchName, device = 'png', path = NULL, + scale = 1, width = 20, height = 20, units = 'cm', dpi = 300) + +ggsave(paste0(outdir, 'UMAP_Pseudotime_rescale', '.png'), plot = UMAP_pseudotime_rescale, device = 'png', path = NULL, + scale = 1, width = 20, height = 20, units = 'cm', dpi = 300) + +summary_branch_clust <- ddply(seurat@meta.data,~FinalCluster + BranchName + condition + phases, nrow) +summary_branch_clust$condition <- factor(summary_branch_clust$condition, levels = c("Treated", "Control")) + +propExpect <- table(seurat@meta.data$condition)/length(seurat@meta.data$condition)[] + +enrich_branch <- getEnrichPopClust(hspc.combined = seurat, Xname = "Control", Yname = "Treated", colorX = colorTreatment[1], + colorY = colorTreatment[2], metaCol = "condition", clustCol = "BranchName") + +bp_treatmentBranch <- ggplot(data.frame(summary_branch_clust), aes(fill = condition,y = V1, x=BranchName)) + + geom_bar( stat="identity", position="fill")+ + scale_fill_manual( values= rev(colorTreatment))+ + scale_y_continuous(name = "Sample (%)", labels = c(0,25,50,75,100))+ + ylab(label = "")+xlab(label = "") + coord_flip()+ + theme(legend.title=element_blank(), axis.text.y = element_text(colour = enrich_branch[,"color"])) + + geom_hline(yintercept = propExpect["Control"]) + + +bp_branchState <- ggplot(data.frame(summary_branch_clust), aes(fill = BranchName,y = V1, x=FinalCluster)) + + geom_bar( stat="identity", position="fill")+ + scale_fill_manual( values= colorBranches)+ + scale_y_continuous(name = "Sample (%)", labels = c(0,25,50,75,100))+ + ylab(label = "")+xlab(label = "") + coord_flip()+ + theme(legend.title=element_blank()) + +bp_stateBranch <- ggplot(data.frame(summary_branch_clust), aes(fill = FinalCluster,y = V1, x=BranchName)) + + geom_bar( stat="identity", position="fill")+ + scale_fill_manual( values= colorCluster)+ + scale_y_continuous(name = "Sample (%)", labels = c(0,25,50,75,100))+ + ylab(label = "")+xlab(label = "") + coord_flip()+ + theme(legend.title=element_blank()) + +bp_cellCycleBranch <- ggplot(data.frame(summary_branch_clust), aes(fill = phases,y = V1, x=BranchName)) + + geom_bar( stat="identity", position="fill")+ + scale_fill_manual( values= colorPhases)+ + scale_y_continuous(name = "Sample (%)", labels = c(0,25,50,75,100))+ + ylab(label = "")+xlab(label = "") + coord_flip()+ + theme(legend.title=element_blank()) + +ggsave(paste0(outdir, 'BP_treatmentPerBranch', '.png'), plot = bp_treatmentBranch, device = 'png', path = NULL, + scale = 1, width = 10, height = 10, units = 'cm', dpi = 300) +ggsave(paste0(outdir, 'BP_branchPerCluster', '.png'), plot = bp_branchState, device = 'png', path = NULL, + scale = 1, width = 10, height = 10, units = 'cm', dpi = 300) +ggsave(paste0(outdir, 'BP_clusterPerBranch', '.png'), plot = bp_stateBranch, device = 'png', path = NULL, + scale = 1, width = 10, height = 10, units = 'cm', dpi = 300) +ggsave(paste0(outdir, 'BP_cellCyclePerBranch', '.png'), plot = bp_cellCycleBranch, device = 'png', path = NULL, + scale = 1, width = 10, height = 10, units = 'cm', dpi = 300) + +saveRDS(seurat, paste0(outdir, "seuratPseudotime.rds")) \ No newline at end of file diff --git a/R_src/testGetopt.R b/R_src/testGetopt.R new file mode 100644 index 0000000000000000000000000000000000000000..0b399be9a810289547e623b2cb4f393722cb89d6 --- /dev/null +++ b/R_src/testGetopt.R @@ -0,0 +1,103 @@ +##------------------------------------------------------------------------------ +## L. Herault +##------------------------------------------------------------------------------ + +## ----------------------------------------------------------------------------- +## Libraries +## ----------------------------------------------------------------------------- + +suppressMessages(library(monocle)) +suppressMessages(library(Seurat)) +suppressMessages(library(BiocParallel)) +suppressMessages(library(getopt)) +suppressMessages(library(gridExtra)) +suppressMessages(library(gProfileR)) +suppressMessages(library(RColorBrewer)) +suppressMessages(library(readxl)) +suppressMessages(library(stringr)) +suppressMessages(library(scales)) +suppressMessages(library(cowplot)) +suppressMessages(library(sctransform)) +suppressMessages(library(stringr)) +suppressMessages(library(plyr)) + + +source("R_src/getSigPerCells.R") +source("R_src/Enrichment.R") +source("R_src/funForSeurat.R") + + +# Seurat 3 integration workflow + +## ----------------------------------------------------------------------------- +## Command line args +## ----------------------------------------------------------------------------- + +spec = matrix(c( + 'help', 'h', 0, "logical", "Help about the program", + 'inputFiles', 'i', 1, "character", "REQUIRED: 10X dataset paths prepared as hspc.combined object (.RDS generated by prepare_data.R) separated by +", + 'signaturesFile', 's',1, "character", "REQUIRED: signatures rds file", + 'outdir', 'o',1, "character", 'Outdir path (default ./)', + "num_dim_CCA", 'q',1,"numeric", "First n dimensions of CCA to use for FindIntegrationAnchors Seurat function (15 by default)", + "num_dim_weight", 'w',1,"numeric", "First n dimensions to use for IntegrateData Seurat function (15 by default)", + "num_dim", 'n',1,"numeric", "First n dimensions of PCA to use for clustering, UMAP and TSNE (15 by default)", + "num_dim_integrated",'N',1,"numeric", "Number of PCA dimension computed to analyse integrated data (40 by default)", + "cores", 'c',1, "numeric", "Number of cores to use for ordering (for differencially expressed gene between clusters test)", + "resolution", 'r',1, "numeric", "resolution for hspc.combined clustering", + "correction", "b",1,"character", "Covariable to use as blocking factor (eg one or several columns of pData: betch, cell cycle phases... separated by +)", + "logfc_threshold", "l", 1, "numeric", "logfc threshold for finding cluster markers (1 cluster vs all deg) 0.25 by default", + "rodriguezSig", "k", 1, "character", "path for xls file of Rodriguez result", + "norm_method", "z",1, "character", "normalization method, logNorm (by default) or sctransform", + "reusePca", "p", 0, "character","re use pca calculated before when caculating anchor weights for each dataset default to FALSE (permit to correct for cell cycle before integration)" +), byrow=TRUE, ncol=5); + +opt = getopt(spec) + +# if help was asked, print a friendly message +# and exit with a non-zero error code + +args <- commandArgs() +if ( !is.null(opt$help) | is.null(opt$inputFiles)) { + cat("Perform Seurat 3 integration workflow, then cluster the cell with seurat 3 at different resolutions") + cat(getopt(spec, usage=TRUE)) + q(status=1) +} + +print(opt$num_dim) +print(opt$num_dim_integrated) +print(opt$num_dim_CCA) +print(opt$num_dim_weigth) +#set default arguments values + +if (is.null(opt$resolution)) { + opt$resolution <- 0.6 +} + +if (is.null(opt$logfc_threshold)) { + opt$logfc_threshold <- 0.25 +} + +if (is.null(opt$num_dim_CCA)) { + opt$num_dim_CCA <- 15 +} + +if (is.null(opt$num_dim_weight)) { + opt$num_dim_weight <- 15 +} + +if (is.null(opt$num_dim)) { + opt$num_dim <- 15 +} + +if (is.null(opt$num_dim_integrated)) { + opt$num_dim_integrated <- 40 +} + +if (is.null(opt$norm_method)) { + opt$norm_method <- "logNorm" +} + +if (is.null(opt$reusePca)) { + opt$reusePca <- FALSE +} + diff --git a/R_src/transferFuns.R b/R_src/transferFuns.R new file mode 100644 index 0000000000000000000000000000000000000000..19dd974977814810236a41c930bcb9c59c49bc2a --- /dev/null +++ b/R_src/transferFuns.R @@ -0,0 +1,76 @@ + +integrateByConditionOneAtacRnaAll <- function(atac, + rna0, + clustCol ="numclust", + refAssayLabel = "RNA", + refAssayTransfer = "RNA", + condCol, + condition, + dimsLSI = c(1:30), + dimsCCA = c(1:30), + outdir = "./") { + + + Idents(rna0) <- condCol + rna <- subset(rna0,idents = condition) + + ##Data preprocessing + + ## Here, we process the gene activity matrix in order to find anchors between cells in the scATAC-seq dataset and the scRNA-seq dataset. + rna$numclust <- rna@meta.data[,clustCol] + DefaultAssay(atac) <- "activities" + atac <- FindVariableFeatures(atac) + atac <- NormalizeData(atac) + atac <- ScaleData(atac) + + transfer.anchors <- FindTransferAnchors(reference = rna, query = atac, features = VariableFeatures(object = rna), + reference.assay = refAssayLabel, query.assay = "activities", reduction = "cca") + + ## To improve discard first integratedLSI ? + celltype.predictions <- TransferData(anchorset = transfer.anchors, refdata = rna$numclust, dims = dimsLSI, + weight.reduction = atac[["lsi"]]) + atac <- AddMetaData(atac, metadata = celltype.predictions) + + png(paste0(outdir,"prediction.score.max.png")) + hist(atac$prediction.score.max) + abline(v = 0.5, col = "red") + dev.off() + + ## We filter atac cells with a weak prediction score for rna cluster transfer + atac.filtered <- subset(atac, subset = prediction.score.max > 0.4) + atac.filtered$predicted.id <- factor(atac.filtered$predicted.id, levels = levels(rna$numclust)) # to make the colors match + + p1 <- DimPlot(atac.filtered, group.by = "predicted.id", label = TRUE, repel = TRUE) + ggtitle("scATAC-seq cells") + + NoLegend() + scale_colour_hue(drop = FALSE) + p2 <- DimPlot(rna, group.by = "numclust", label = TRUE, repel = TRUE) + ggtitle("scRNA-seq cells") + + NoLegend() + + png(paste0(outdir,"predictedATACvsRNAclust.png"),units = "in",res = 300,height=6,width =10) + grid.arrange(p1,p2,nrow = 1) + dev.off() + + ### Input with all rna data as ref + # note that we restrict the imputation to variable genes from scRNA-seq + genes.use <- VariableFeatures(rna0) # all genes of the slot if integrated + refdata <- GetAssayData(rna0, assay = refAssayTransfer, slot = "data")[genes.use, ] + + # refdata (input) contains a scRNA-seq expression matrix for the scRNA-seq cells. imputation + # (output) will contain an imputed scRNA-seq matrix for each of the ATAC cells + transfer.anchors.2 <- FindTransferAnchors(reference = rna0, + query = atac.filtered, + features = VariableFeatures(object = rna0), + reference.assay = refAssayTransfer, + query.assay = "activities", + reduction = "cca", + dims = dimsCCA) + + imputation <- TransferData(anchorset = transfer.anchors.2, + refdata = refdata, + weight.reduction = atac.filtered[["lsi"]], + dims = dimsLSI) + + # this line adds the imputed data matrix to the atac object + atac.filtered[[refAssayTransfer]] <- imputation + + return(atac.filtered) +} diff --git a/R_src/transferRnaAtacCL.R b/R_src/transferRnaAtacCL.R new file mode 100644 index 0000000000000000000000000000000000000000..48a8f602e44608f33decaa72b30375a28c455162 --- /dev/null +++ b/R_src/transferRnaAtacCL.R @@ -0,0 +1,170 @@ + +suppressMessages(library(Signac)) +suppressMessages(library(Seurat)) +suppressMessages(library(getopt)) +suppressMessages(library(gridExtra)) +suppressMessages(library(scales)) +suppressMessages(library(grid)) +suppressMessages(library(S4Vectors)) +suppressMessages(library(patchwork)) +suppressMessages(library(ggplot2)) + +set.seed(2021) + +source("R_src/transferFuns.R") + + +spec = matrix(c( + 'help', 'h', 0, "logical", "Help about the program", + 'inputRNAintegrated', 'i', 1, "character", "REQUIRED: scRNA integrated dataset path prepared as seurat object (.RDS generated by seurat4_integration.R)", + 'inputAtac1', 'a',1, "character", "REQUIRED: scATAC dataset path prepared as seurat object (.RDS generated by SignacSmpCL.R)", + 'inputAtac2', 'b',1, "character", "REQUIRED: scATAC dataset path prepared as seurat object (.RDS generated by SignacSmpCL.R)", + "num_dim_CCA", 'q',1,"numeric", "First n dimensions of CCA to use for FindIntegrationAnchors and TransferData Seurat functions (30 by default)", + "num_dim_LSI", 'l',1,"numeric", "First n dimensions of LSI to use for the atac query in TransferData Seurat functions (30 by default)", + "discardFirstLsiDim", 'd',0,"logical", "discard first dim lsi for the query in TransferData function", + "num_dim", 'n',1,"numeric", "First n dimensions of PCA to use for clustering, UMAP and TSNE (15 by default)", + "num_dim_integrated",'N',1,"numeric", "Number of PCA dimension computed to analyse integrated data (40 by default)", + "clustCol", 'r',1, "character", "cluster column name to transfer to atac data", + "condClust", 'c',1, "character", "condition column name present in all inputs", + "outdir", "o",1,"character", "outdir path"), + +byrow=TRUE, ncol=5); + +opt = getopt(spec) + +# if help was asked, print a friendly message +# and exit with a non-zero error code + +args <- commandArgs() +if ( !is.null(opt$help)) { + cat("Perform Seurat 3 integration workflow, then cluster the cell with seurat 3 at different resolutions") + cat(getopt(spec, usage=TRUE)) + q(status=1) +} + +#set default arguments values + +if (is.null(opt$clustCol)) { + opt$clustCol <- "numclust" +} + +if (is.null(opt$condCol)) { + opt$condCol <- "condition" +} + + +if (is.null(opt$num_dim_CCA)) { + opt$num_dim_CCA <- 30 +} + +dimsCCA <- c(1:opt$num_dim_CCA) + +if (is.null(opt$num_dim_LSI)) { + opt$num_dim_LSI <- 30 +} + +if (is.null(opt$discardFirstLsiDim)) { + dimsLSI <- c(1:opt$num_dim_LSI) +} else { + dimsLSI <- c(2:opt$num_dim_LSI) +} + +if (is.null(opt$num_dim_integrated)) { + opt$num_dim_integrated <- 40 +} + +if (is.null(opt$num_dim)) { + opt$num_dim <- 15 +} + + +print(opt) + +rna0 <- readRDS(opt$inputRNAintegrated) +atac1 <- readRDS(opt$inputAtac1) +atac2 <- readRDS(opt$inputAtac2) + +print("add metadata...") + + +rna0$orig.type <- "RNA" +atac1$orig.type <- "ATAC" +atac2$orig.type <- "ATAC" + +print("set idents...") + +Idents(atac1) <- opt$condCol +Idents(atac2) <- opt$condCol + +Idents(rna0) <- opt$condCol + +print("set numclust...") + +rna0$numclust <- rna0@meta.data[,opt$clustCol] +print("retrieve conditions...") +cond <- unique(rna0@meta.data[,opt$condCol]) +print(cond) +print("cluster transfer to first ATAC smp...") + + +atac1 <- integrateByConditionOneAtacRnaAll(atac = atac1, + rna = rna0, + clustCol = opt$clustCol, + refAssayLabel = "RNA", + refAssayTransfer = "integrated", + condCol = opt$condCol, + condition = cond[1], + dimsLSI = dimsLSI, + dimsCCA = dimsCCA, + outdir = opt$outdir) + + +atac2 <- integrateByConditionOneAtacRnaAll(atac = atac2, + rna = rna0, + clustCol = opt$clustCol, + refAssayLabel = "RNA", + refAssayTransfer = "integrated", + condCol = opt$condCol, + condition = cond[2], + dimsLSI = dimsLSI, + dimsCCA = dimsCCA, + outdir = opt$outdir) + + +## merge all the objects +atacAll <- merge(x = atac1, y = atac2) +coembed <- merge(x= rna0,y=atacAll) + +## Reduce dim on integrated (imputed) slot +genes.use <- VariableFeatures(rna0) # Will be all genes if integrated + +coembed <- ScaleData(coembed, features = genes.use, do.scale = FALSE) +coembed <- RunPCA(coembed, features = genes.use, verbose = FALSE) + +png(paste0(opt$outdir,"elbowAll.png")) +ElbowPlot(object = coembed,ndims = opt$num_dim_integrated) +dev.off() + +coembed <- RunUMAP(coembed, dims = 1:opt$num_dim) +coembed$numclust <- ifelse(coembed$orig.type == "RNA", coembed$numclust, coembed$predicted.id) + +coembed$condType <- paste(coembed$condition,coembed$orig.type) + +p1 <- DimPlot(coembed, group.by = "numclust") +p2 <- DimPlot(coembed, split.by = "condType", label = TRUE, repel = TRUE,ncol = 2) + +png(paste0(opt$outdir,"/elbowAll.png"),height = 800,width=1600) +grid.arrange(p1,p2,ncol = 2) +dev.off() + +saveRDS(atacAll, paste0(opt$outdir, "/atac_integrated.rds")) +saveRDS(coembed,paste0(opt$outdir,"/coembedAll.rds")) + + + + + + + + +