initial commit R_src

4a435285 · Leonard Herault · 5566366a · 4a435285 · 4a435285 · 4a435285
Commit 4a435285 authored 3 years ago by Leonard Herault
20 changed files
--- a/R_src/DoMultipleBarHeatmap.R
+++ b/R_src/DoMultipleBarHeatmap.R
+# Adapted from https://github.com/satijalab/seurat/issues/2201
+DoMultiBarHeatmap <- function (object, 
+                               features = NULL, 
+                               cells = NULL, 
+                               group.by = "ident", 
+                               additional.group.by = NULL, 
+                               group.bar = TRUE, 
+                               disp.min = -2.5, 
+                               disp.max = NULL, 
+                               slot = "scale.data", 
+                               assay = NULL, 
+                               label = TRUE, 
+                               size = 5.5, 
+                               hjust = 0, 
+                               angle = 45, 
+                               raster = TRUE, 
+                               draw.lines = TRUE, 
+                               lines.width = NULL, 
+                               group.bar.height = 0.02, 
+                               combine = TRUE,
+                               group.col = NULL
+                               ) 
+{
+  cells <- cells %||% colnames(x = object)
+  if (is.numeric(x = cells)) {
+    cells <- colnames(x = object)[cells]
+  }
+  assay <- assay %||% DefaultAssay(object = object)
+  DefaultAssay(object = object) <- assay
+  features <- features %||% VariableFeatures(object = object)
+  ## Why reverse???
+  features <- rev(x = unique(x = features))
+  disp.max <- disp.max %||% ifelse(test = slot == "scale.data", 
+                                   yes = 2.5, no = 6)
+  possible.features <- rownames(x = GetAssayData(object = object, 
+                                                 slot = slot))
+  if (any(!features %in% possible.features)) {
+    bad.features <- features[!features %in% possible.features]
+    features <- features[features %in% possible.features]
+    if (length(x = features) == 0) {
+      stop("No requested features found in the ", slot, 
+           " slot for the ", assay, " assay.")
+    }
+    warning("The following features were omitted as they were not found in the ", 
+            slot, " slot for the ", assay, " assay: ", paste(bad.features, 
+                                                             collapse = ", "))
+  }
+  data <- as.data.frame(x = as.matrix(x = t(x = GetAssayData(object = object, 
+                                                             slot = slot)[features, cells, drop = FALSE])))
+  
+  object <- suppressMessages(expr = StashIdent(object = object, 
+                                               save.name = "ident"))
+    
+  group.by <- group.by %||% "ident"
+  groups.use <- object[[c(group.by, additional.group.by)]][cells, , drop = FALSE]
+  plots <- list()
+  for (i in group.by) {
+    data.group <- data
+    group.use <- groups.use[, c(i, additional.group.by), drop = FALSE]
+    
+    for(colname in colnames(group.use)){
+      if (!is.factor(x = group.use[[colname]])) {
+        group.use[[colname]] <- factor(x = group.use[[colname]])
+      }  
+    }
+    
+    if (draw.lines) {
+      lines.width <- lines.width %||% ceiling(x = nrow(x = data.group) * 
+                                                0.0025)
+      placeholder.cells <- sapply(X = 1:(length(x = levels(x = group.use[[i]])) * 
+                                           lines.width), FUN = function(x) {
+                                             return(Seurat:::RandomName(length = 20))
+                                           })
+      placeholder.groups <- data.frame(foo=rep(x = levels(x = group.use[[i]]), times = lines.width))
+      placeholder.groups[additional.group.by] = NA
+      colnames(placeholder.groups) <- colnames(group.use)
+      rownames(placeholder.groups) <- placeholder.cells
+      
+      group.levels <- levels(x = group.use[[i]])
+      
+      group.use <- sapply(group.use, as.vector)
+      rownames(x = group.use) <- cells
+      
+      group.use <- rbind(group.use, placeholder.groups)
+      
+      na.data.group <- matrix(data = NA, nrow = length(x = placeholder.cells), 
+                              ncol = ncol(x = data.group), dimnames = list(placeholder.cells, 
+                                                                           colnames(x = data.group)))
+      data.group <- rbind(data.group, na.data.group)
+    }
+    
+    
+    
+    group.use <- group.use[with(group.use, eval(parse(text=paste('order(', paste(c(i, additional.group.by), collapse=', '), ')', sep='')))), , drop=F]
+
+    
+    plot <- Seurat:::SingleRasterMap(data = data.group, raster = raster, 
+                                     disp.min = disp.min, disp.max = disp.max, feature.order = features, 
+                                     cell.order = rownames(x = group.use), group.by = group.use[[i]])
+    
+    if (group.bar) {
+      
+      if(is.null(group.col)) {
+        colPal <- scales::hue_pal()
+      } else {
+        colPal <- function(n) {
+          return(group.col[c(1:n)])
+        }
+      }
+      pbuild <- ggplot_build(plot = plot)
+      group.use2 <- group.use
+      cols <- list()
+      na.group <- Seurat:::RandomName(length = 20)
+      for (colname in rev(x = colnames(group.use2))){
+        if (colname == group.by){
+          colid = colname
+          colPal2 <- colPal
+        } else {
+          colid = colname
+          colPal2 <- function(n) {
+            return(rev(scales::hue_pal()(n)))
+          }
+        }
+        
+        if (draw.lines) {
+          levels(x = group.use2[[colname]]) <- c(levels(x = group.use2[[colname]]), na.group)  
+          group.use2[placeholder.cells, colname] <- na.group
+          cols[[colname]] <- c(colPal2(length(x = levels(x = group.use[[colname]]))), "#FFFFFF")
+        } else {
+          cols[[colname]] <- c(colPal2(length(x = levels(x = group.use[[colname]]))))
+        }
+        names(x = cols[[colname]]) <- levels(x = group.use2[[colname]])
+        y.range <- diff(x = pbuild$layout$panel_params[[1]]$y.range)
+        y.pos <- max(pbuild$layout$panel_params[[1]]$y.range) + y.range * 0.015
+        y.max <- y.pos + group.bar.height * y.range
+        pbuild$layout$panel_params[[1]]$y.range <- c(pbuild$layout$panel_params[[1]]$y.range[1], y.max)
+        #print(cols[[colname]][group.use2[[colname]]])
+        plot <- suppressMessages(plot + 
+                                   annotation_raster(raster = t(x = cols[[colname]][group.use2[[colname]]]),  xmin = -Inf, xmax = Inf, ymin = y.pos, ymax = y.max) + 
+                                   annotation_custom(grob = grid::textGrob(label = colid, hjust = 0, gp = grid::gpar(cex = 0.75)), ymin = mean(c(y.pos, y.max)), ymax = mean(c(y.pos, y.max)), xmin = Inf, xmax = Inf) +
+                                   coord_cartesian(ylim = c(0, y.max), clip = "off")) 
+        
+        #temp <- as.data.frame(cols[[colname]][levels(group.use[[colname]])])
+        #colnames(temp) <- 'color'
+        #temp$x <- temp$y <- 1
+        #temp[['name']] <- as.factor(rownames(temp))
+        
+        #temp <- ggplot(temp, aes(x=x, y=y, fill=name)) + geom_point(shape=21, size=5) + labs(fill=colname) + theme(legend.position = "bottom")
+        #legend <- get_legend(temp)
+        #multiplot(plot, legend, heights=3,1)
+        
+        if ((colname == group.by) && label) {
+          x.max <- max(pbuild$layout$panel_params[[1]]$x.range)
+          x.divs <- pbuild$layout$panel_params[[1]]$x.major
+          group.use$x <- x.divs
+          label.x.pos <- tapply(X = group.use$x, INDEX = group.use[[colname]],
+                                FUN = median) * x.max
+          label.x.pos <- data.frame(group = names(x = label.x.pos), 
+                                    label.x.pos)
+          label.x.pos$group <- str_split_fixed(label.x.pos$group,"_",2)[,2]
+          plot <- plot + geom_text(stat = "identity", 
+                                   data = label.x.pos, aes_string(label = "group", 
+                                                                  x = "label.x.pos"), y = y.max + y.max * 
+                                     0.03 * 0.5, angle = angle, hjust = hjust, 
+                                   size = size)
+          plot <- suppressMessages(plot + coord_cartesian(ylim = c(0, 
+                                                                   y.max + y.max * 0.002 * max(nchar(x = levels(x = group.use[[colname]]))) * 
+                                                                     size), clip = "off"))
+        }
+      }
+    }
+    plot <- plot + theme(line = element_blank())
+    plots[[i]] <- plot
+  }
+  if (combine) {
+    plots <- CombinePlots(plots = plots)
+  }
+  return(plots)
+}
--- a/R_src/Enrichment.R
+++ b/R_src/Enrichment.R
+# function to make gene enrichment analyzis with gProfileR
+
+## function to perform gprofler enrcihment analysis on each gene cluster of 
+## a differentially expressed gene result (eg data frame of DE genes with a column cluster)
+getGeneClustGprofile <- function(deg_clust,
+                                 background,
+                                 organism = "mmusculus",
+                                 hier_filtering = "none",
+                                 ordered_query = F) {
+  
+  resEnrichTest <- lapply(split(as.vector(deg_clust$Gene),f= deg_clust$Cluster),gprofiler,
+                    organism = "mmusculus",
+                    custom_bg = background,
+                    ordered_query = ordered_query,
+                    hier_filtering = hier_filtering)
+  return(resEnrichTest)
+}
+
+
+## function to plot gprofiler results
+plotWriteResEnrich <- function(resEnrichTest,
+                          sources=c("BP","keg"),
+                          outdir = "./gprofiler",
+                          clusterLabel =names(resEnrichTest),
+                          colors = brewer.pal(length(resEnrichTest)+1, "Set1")) {
+  
+  dir.create(outdir,recursive = T,showWarnings = F)
+  
+  colors <- colors
+
+  for (cluster in clusterLabel) {
+    write.table(file= paste(outdir,"/gprofiler_table_clust_",cluster,".tsv", sep = ""),
+                            resEnrichTest[[cluster]],quote = F,sep = '\t',row.names = F)
+    for (s in sources) {
+      results <- resEnrichTest[[cluster]][which(resEnrichTest[[cluster]][,"domain"] == s),]
+      results <- results[order(results$p.value),]
+      results <- as.data.frame(results[,c("term.id","term.name","p.value")])
+      results$logPval <- -log(base = 10,x = results$p.value)
+      if(s == "tf") {
+        results$term.name <- paste(results$term.name,results$term.id)
+      }
+      
+      ## TO DO : if to much term plot only the first n 
+      
+      if (length(results$term.name) > 55) {
+        png(paste(outdir,"/gprofiler_top30_",cluster,"_",s,".png",sep =""),width = 600,height = 600)
+        gp <- ggplot(results[c(1:30),], aes(x=reorder(term.name, logPval), y=logPval)) +
+          geom_bar(stat='identity',fill = colors[which(names(resEnrichTest)==cluster)] ) +
+          coord_flip() +
+          xlab("term name") +
+          ylab("-log10(p value)") +
+          theme(text = element_text(size = 20))
+        print(gp)
+        
+        dev.off()
+        
+        png(paste(outdir,"/gprofiler_",cluster,"_",s,".png",sep =""),width = 1400,height = 1400)
+        
+      } else {
+        png(paste(outdir,"/gprofiler_",cluster,"_",s,".png",sep =""),width = 600,height = 600)
+      }
+      gp <- ggplot(results, aes(x=reorder(term.name, logPval), y=logPval)) +
+        geom_bar(stat='identity',fill = colors[which(names(resEnrichTest)==cluster)] ) +
+        coord_flip() +
+        xlab("term name") +
+        ylab("-log10(p value)") +
+        theme(text = element_text(size = 20))
+      print(gp)
+      
+      dev.off()
+    }
+    
+  }
+}
+
+## function to composate
+gProfileAnalysis <- function(deg_clust,
+                             background,
+                             organism = "mmusculus",
+                             hier_filtering = "moderate",
+                             ordered_query = F,
+                             sources=c("BP","keg","CC",'MF','tf'),
+                             outdir = "/gprofiler",
+                             clusterLabel =names(resEnrichTest),
+                             colors = brewer.pal(length(resEnrichTest)+1, "Set1")) {
+  
+  resEnrichTest <- getGeneClustGprofile(deg_clust = deg_clust,
+                                        background = background,
+                                        organism = organism,
+                                        hier_filtering = hier_filtering,
+                                        ordered_query = ordered_query)
+  
+  plotWriteResEnrich(resEnrichTest,
+                     sources=sources,
+                     outdir = outdir,
+                     clusterLabel =clusterLabel,
+                     colors = colors)
+  
+  return(resEnrichTest)
+  
+}
+
+# function for hypergeometric test
+
+# hypergeometric test https://dputhier.github.io/ASG/practicals/go_statistics_td/go_statistics_td_2015.html
+
+#DEPRECATED Seurat 2
+# testHyper <- function(gene_set, signature, background,seurat) {
+#     signature <- signature[which(is.element(signature,set = rownames(seurat@data)))]
+#     genesMarked <- gene_set[which(is.element(gene_set,set = signature))]
+#     p.value <-  phyper(q=length(genesMarked) -1, 
+#                        m=length(signature),                  
+#                        n=length(background) - length(signature), k= length(gene_set), lower.tail=FALSE)
+#     return(p.value)
+# }
+
+#For Seurat 3
+testHyper3 <- function(gene_set, signature, background,seurat) {
+  signature <- signature[which(is.element(signature,set = rownames(seurat)))]
+  genesMarked <- gene_set[which(is.element(gene_set,set = signature))]
+  p.value <-  phyper(q=length(genesMarked) -1, 
+                     m=length(signature),                 
+                     n=length(background) - length(signature), k= length(gene_set), lower.tail=FALSE)
+  return(p.value)
+}
+
+
+
+testHyperCells <- function(cells_pull, cells_setTarget, allCells) {
+  cellsMarked <- gene_set[which(is.element(gene_set,set = cells_set))]
+  p.value <-  phyper(q=length(cellsMarked) -1, 
+                     m=length(cells_setTarget),
+                     n=length(allCells) - length(cells_setTarget), k= length(cells_pull), lower.tail=FALSE)
+  return(p.value)
+}
+
+## DEPRECATED Seurat2
+# testHyperSig <- function(signature,seurat,markers,clust) {  
+#   result <- testHyper(gene_set=markers[which(markers$Cluster == clust),"Gene"],
+#                       signature=signature,seurat=seurat,background=rownames(seurat@data))
+#   return(result)
+# }
+
+#For Seurat 3
+testHyperSig3 <- function(signature,seurat,markers,clust) { 
+  result <- testHyper3(gene_set=markers[which(markers$Cluster == clust),"Gene"],
+                       signature=signature,seurat=seurat,background=rownames(seurat))
+  return(result)
+}
+
+
+
+
+  
+
+
--- a/R_src/Seurat3CL.R
+++ b/R_src/Seurat3CL.R
+##------------------------------------------------------------------------------
+## L. Herault
+##------------------------------------------------------------------------------
+
+## -----------------------------------------------------------------------------
+## Libraries
+## -----------------------------------------------------------------------------
+
+suppressMessages(library(monocle))
+suppressMessages(library(Seurat))
+suppressMessages(library(BiocParallel))
+suppressMessages(library(getopt))
+suppressMessages(library(gridExtra))
+suppressMessages(library(gProfileR))
+suppressMessages(library(RColorBrewer))
+suppressMessages(library(readxl))
+suppressMessages(library(stringr))
+suppressMessages(library(scales))
+suppressMessages(library(grid))
+
+
+
+source("R_src/getSigPerCells.R")
+source("R_src/Enrichment.R")
+source("R_src/funForSeurat.R")
+
+# Seurat 3 analysis of an individual sample required for using CaSTLe script.
+
+## -----------------------------------------------------------------------------
+## Command line args
+## -----------------------------------------------------------------------------
+
+spec = matrix(c(
+  'help',        'h', 0, "logical",   "Help about the program",
+  'inputRDS',  'i', 1, "character", "REQUIRED : 10X data prepared as monocle or seurat object (.RDS generated by prepare_data.R).",
+  'outdir',     'o',1, "character", 'Outdir path (default ./)',
+  "num_dim",      'n',1,"numeric", "Number of dimension to use for ordering (eg n first pc of PCA on input data) 10 by default",
+  "correction",      "b",1,"character", "Covariable to use as blocking factor (eg one or several columns of pData: betch, cell cycle phases... separated by +)",
+  "minPropCellExp", "p",1,"numeric", "minimal proportion of cells that expressed the genes kept for the analaysis 0.001 by default",
+  "norm_method",    "z",1, "character", "normalisation method, logNorm seurat (by default) or sctransform",
+  "resolution", "r", 1,"numeric", "resolution for Seurat clustering 0.9 by default",
+  "signaturesFile", "s", 1, "character", "path to folder with signature list store as rds object",
+  "identRemoved",    "d", 1, "character", "Optionnal cluster to remove only work if input is a seurat object",
+  "nonExpressedGenesRemoved", "e", 0,"logical", "non expressed gene already removed default to FALSE",
+  "gprofiler", "g", 0, "logical", "if true doing gprofiler default to true",
+  "logfc_threshold", "l", 1, "numeric", "logfc threshold for finding cluster markers (1 cluster vs all deg) 0.25 by default",
+  "rodriguezSig", "k", 1, "character", "path for xls file of Rodriguez result"
+  
+), byrow=TRUE, ncol=5);
+
+opt = getopt(spec)
+
+
+# if help was asked, print a friendly message
+# and exit with a non-zero error code
+
+args <- commandArgs()
+if ( !is.null(opt$help) | is.null(opt$inputRDS)) {
+  cat("For an individual sample, perform PCA, tSNE, UMAP, Louvain clustering, diff expression between clusters with Seurat3 package with filtered poor quality cells data (gbm_cds monocle object)")
+  cat(getopt(spec, usage=TRUE))
+  q(status=1)
+}
+
+
+#set default arguments values
+
+if (is.null(opt$num_dim)) {
+  opt$num_dim <- 10
+}
+
+if (is.null(opt$resolution)) {
+  opt$resolution <- 0.8
+}
+
+if (is.null(opt$minPropCellExp)) {
+  opt$minPropCellExp <- 0.001
+  print(opt$minPropCellExp)
+}
+
+
+if (is.null(opt$logfc_threshold)) {
+  opt$logfc_threshold = 0.25
+} 
+
+print(paste("logfc threshold:", opt$logfc_threshold) )
+
+if (is.null(opt$outdir)) {
+  opt$outdir = "./"
+}
+
+
+if (is.null(opt$nonExpressedGenesRemoved)) {
+  opt$nonExpressedGenesRemoved = F
+} else {
+  opt$nonExpressedGenesRemoved = T
+}
+
+if (is.null(opt$gprofiler)) {
+  opt$gprofiler <- T
+} else {
+  opt$gprofiler <- F
+}
+
+print(opt$gprofiler)
+print(opt$nonExpressedGenesRemoved)
+
+
+# get correction vector
+if (!is.null(opt$correction)) {
+  corrections <- strsplit(x = opt$correction,split = "\\+")[[1]]
+} else {
+  corrections <- NULL
+}
+
+# create blank for the plots
+blank <- grid.rect(gp=gpar(col="white"))
+
+dir.create(opt$outdir,recursive = T,showWarnings = F)
+
+object <- readRDS(opt$inputRDS)
+
+# input is a monocle object in the workflow
+# Code to use this script with seurat input (eg to re analyse)
+if (is(object)[1]== "CellDataSet") {
+  
+  print("input monocle")
+  gbm_cds <- readRDS(opt$inputRDS)
+  
+} else {
+  if(is(object) == "Seurat") {
+    #It is easier to pass by a monocle object and then reget the seurat object with the genes filtered
+    print("input seurat object")
+    seurat <- object
+    
+    if (!is.null(opt$identRemoved)) {
+      print(unique(seurat@active.ident))
+      print(paste("removing identities",opt$identRemoved))
+      seurat <- SubsetData(seurat,ident.remove = strsplit(x = opt$identRemoved,split = ",")[[1]],subset.raw = T) # may not work with seurat3
+    }
+    
+    
+    pd <- new("AnnotatedDataFrame", data = gbm_cds@meta.data)
+    fd <- new("AnnotatedDataFrame", data = data.frame(gene_short_name = rownames(gbm_cds))) 
+    rownames(fd) <- fd$gene_short_name
+    
+    gbm_cds <- newCellDataSet(GetAssayData(gbm_cds,slot = "counts"),
+                              phenoData = pd,
+                              featureData = fd,
+                              lowerDetectionLimit = 0.1,
+                              expressionFamily = negbinomial.size())
+    
+    gbm_cds <- detectGenes(gbm_cds, min_expr = 0.1)
+    
+    
+    
+  }
+}
+
+if(opt$nonExpressedGenesRemoved == F) {
+  print("remove non expressed genes (non expressed in at least X% of the cells X user option in monocle dp feature 5% in seurat tutorial 0,1%)")
+  fData(gbm_cds)$use_for_seurat <- fData(gbm_cds)$num_cells_expressed > opt$minPropCellExp * ncol(gbm_cds)
+  
+  gbm_to_seurat <- gbm_cds[fData(gbm_cds)$use_for_seurat==T,]
+} else {
+  gbm_to_seurat <- gbm_cds
+}
+
+if (is.element("Cluster",colnames(pData(gbm_to_seurat)))) {
+  colnames(pData(gbm_to_seurat))[which(colnames(pData(gbm_to_seurat))=="Cluster")] <- "Cluster_monocle"
+}
+
+##Convert to seurat3
+#check for dup genes
+
+# Only needed if ensemble id (it is the case in the workflow)
+
+dupGeneNames <- fData(gbm_to_seurat)[which((duplicated(featureData(gbm_to_seurat)$gene_short_name))),"gene_short_name"]
+
+if(length(dupGeneNames) == 0) {
+  rownames(gbm_to_seurat) <- fData(gbm_to_seurat)$gene_short_name
+  
+} else {
+  write.csv(fData(gbm_to_seurat)[which(is.element(fData(gbm_to_seurat)$gene_short_name,dupGeneNames)),],
+            paste(opt$outdir,"/dupGenesName.csv",sep = "")
+  )
+  print("Dup gene short names existing, making them unique...")
+  rownames(gbm_to_seurat) <- make.unique(fData(gbm_to_seurat)$gene_short_name, sep = "--")
+  
+}
+
+
+seurat <- CreateSeuratObject(counts = exprs(gbm_to_seurat), meta.data = pData(gbm_to_seurat))
+
+
+
+#################################################################################################
+####################################### Seurat Workflow #########################################
+#################################################################################################
+
+if (opt$norm_method == "sctransform") {
+  
+  seurat <- SCTransform(object = seurat, vars.to.regress = c("G2M_score","S_score","G1_score"))
+  
+} else {
+  
+  seurat <- NormalizeData(object = seurat)
+  seurat <- FindVariableFeatures(object = seurat,selection.method = "vst", nfeatures = 2000, verbose = T)
+  seurat <- ScaleData(object = seurat,vars.to.regress = corrections)
+  
+}
+
+seurat <- RunPCA(object = seurat)
+
+
+png(paste(opt$outdir,"/ElbowPlot.png",sep =""))
+ElbowPlot(object = seurat,ndims = 30)
+dev.off()
+
+
+print("Clustering...")
+
+seurat <- FindNeighbors(object = seurat,dims = c(1:opt$num_dim),k.param = 20)
+seurat <- FindClusters(object = seurat,resolution = c(0.5,0.6,0.7,0.8,0.9,1,1.2))
+
+print("Running TSNE...")
+
+seurat <- RunTSNE(seurat,dims = c(1:opt$num_dim))
+
+print("Running UMAP...")
+
+seurat <- RunUMAP(seurat,dims = c(1:opt$num_dim))
+
+print("UMAP ok")
+print(colnames(seurat@meta.data))
+
+colPrefix <- "RNA_snn_res."
+if(opt$norm_method == "sctransform") {
+  colPrefix <- "SCT_snn_res."
+}
+
+umapListRes <- list()
+tsneListRes <- list()
+for (r in c(0.6,0.8,1,1.2)) {
+    umapListRes[[as.character(r)]] <- DimPlot(seurat,
+                              reduction = "umap",
+                              label = T,
+                              group.by = paste(colPrefix,r,sep="")) + 
+      NoLegend() +
+      ggtitle(paste("res",r))
+    
+    tsneListRes[[as.character(r)]] <- DimPlot(seurat,
+                                reduction = "tsne",
+                                label = T,
+                                group.by = paste(colPrefix,r,sep="")) + 
+      NoLegend() +
+      ggtitle(paste("res",r))
+}
+
+png(paste(opt$outdir,"/tsne_different_res.png",sep = ""),height = 800,width = 800)
+grid.arrange(tsneListRes[[1]],tsneListRes[[2]],tsneListRes[[3]],tsneListRes[[4]])
+dev.off()
+
+png(paste(opt$outdir,"/umap_different_res.png",sep = ""),height = 800,width = 800)
+grid.arrange(umapListRes[[1]],umapListRes[[2]],umapListRes[[3]],umapListRes[[4]])
+dev.off()
+
+
+
+
+Idents(seurat) <- paste(colPrefix,opt$resolution,sep = "")
+seurat@meta.data$numclust <- seurat@meta.data[,paste(colPrefix,opt$resolution,sep = "")]
+
+
+#Check for unwanted source of variation
+pPhases <- DimPlot(seurat,group.by = "phases")
+if (!is.null(seurat@meta.data$predicted)) {
+  pPred <- DimPlot(seurat,group.by = "predicted")
+} else {
+  pPred <- blank
+}
+
+pUMI <- FeaturePlot(seurat, "Total_mRNAs")
+pMito <- FeaturePlot(seurat, "percentMito")
+
+png(paste(opt$outdir,"/umap_factors.png",sep = ""),height = 800,width = 800)
+grid.arrange(pPhases,pPred,pUMI,pMito)
+dev.off()
+
+#Check for unwanted source of variation
+pPhases <- DimPlot(seurat,group.by = "phases",reduction = "tsne")
+if (!is.null(seurat@meta.data$predicted)) {
+  pPred <- DimPlot(seurat,group.by = "predicted",reduction = "tsne")
+} else {
+  pPred <- blank
+}
+pUMI <- FeaturePlot(seurat, "Total_mRNAs",reduction = "tsne")
+pMito <- FeaturePlot(seurat, "percentMito",reduction = "tsne")
+
+png(paste(opt$outdir,"/tsne_factors.png",sep = ""),height = 800,width = 800)
+grid.arrange(pPhases,pPred,pUMI,pMito)
+dev.off()
+
+
+#Check for genes
+
+gene_list<- c("Pdzk1ip1","Mllt3",
+                     "Ctla2a","Cd27","Cd34",
+                     "Lig1","Hells","Tyms",
+                     "Notch2","Lst1",
+                     "Irf7","Stat1",
+                     "Pf4","Itga2b",
+                     "Klf1","Gata1",
+                     "Mpo","Cd48",
+                     "Fcer1a","Hdc",
+                     "Il7r","Thy1", # Ccr9 not found
+                     "Cdc20","Ccnb1","Racgap1",
+                     "Mzb1","Ly6d", "Trp53inp1",
+                     "Jun","Fos","Nr4a1")
+
+gene_list <- gene_list[which(is.element(set=rownames(seurat),el = gene_list))] 
+
+dir.create(paste(opt$outdir,"/genes/umap/",sep =""),recursive = T)
+dir.create(paste(opt$outdir,"/genes/tsne/",sep =""),recursive = T)
+for (g in gene_list) {
+  png(paste(opt$outdir,"/genes/umap/",g,".png",sep = ""))
+  plot(FeaturePlot(seurat,features = g))
+  dev.off()
+}
+
+for (g in gene_list) {
+  png(paste(opt$outdir,"/genes/tsne/",g,".png",sep = ""))
+  plot(FeaturePlot(seurat,features = g),reduction = "tsne")
+  dev.off()
+}
+
+markers <- FindAllMarkers(seurat,only.pos = T,logfc.threshold= opt$logfc_threshold)
+markers <- markers[which(markers$p_val_adj < 0.05),]
+
+write.table(x = markers,paste(opt$outdir,"/markers.tsv", sep =""),sep = "\t",quote = F,row.names = F,col.names = T)
+
+
+dir.create(paste(opt$outdir,"/markers/",sep = ""))
+
+ylab <- "LogNormalized UMI counts"
+if (opt$norm_method == "sctransform") {
+  ylab <- "Expression level"
+}
+
+for (numClust in unique(markers$cluster)) {
+  print(head(markers[which(markers$cluster == numClust),],n=9))
+  png(paste(opt$outdir,"/markers/Cluster_",numClust,"_topGenesVlnPlot.png",sep =""),width = 1000, height = 1000)
+  plot(VlnPlot(object = seurat, features = head(markers[which(markers$cluster == numClust),"gene"],n=9),pt.size = 0.5) + 
+         labs(x = "Clusters",y= ylab,colour = "black") +
+         theme(axis.text = element_text(size=20),
+               plot.title = element_text(size=25)) )
+  dev.off()
+  
+}
+
+#################################################################################################
+##########                                Enrichment                                   ##########
+#################################################################################################
+
+## Add signatures
+dir.create(paste(opt$outdir,"/cellSignatures/",sep = ""))
+signatures <- readRDS(opt$signaturesFile)
+
+
+for (sig in c(1:length(signatures))) {
+  sigName <- names(signatures)[sig]
+  signature <- signatures[[sig]]
+  seurat <- scoreCells3(seurat,signature,outdir= paste(opt$outdir,"/cellSignatures/",sep=""),sigName)
+}
+
+## Enrichment
+
+# Modify colnames of markers to use gProfileAnalysis function
+firstup <- function(x) {
+  substr(x, 1, 1) <- toupper(substr(x, 1, 1))
+  x
+}
+
+colnames(markers) <- firstup(colnames(markers))
+
+
+#be careful background
+if(opt$gprofiler) {
+gprofiler_result <- gProfileAnalysis(deg_clust = markers,
+                                     outdir = paste(opt$outdir,"/gProfileR", sep =""),
+                                     background = rownames(seurat),
+                                     colors = hue_pal()(length(unique(markers$Cluster))))
+
+saveRDS(gprofiler_result,file=paste(opt$outdir,"/gProfileR/gprofiler_results.rds",sep =""))
+}
+
+
+
+# Test Rodriguez cluster sig
+if(!is.null(opt$rodriguezSig)) {
+  
+  clusterNames <- c("C1","C2","C3","Mk","Er","Ba","Neu","Mo1","Mo2", "preDC","preB","preT")
+  
+  RodriguezClustersSig <- lapply(X= c(1:length(clusterNames)),FUN = read_xlsx,path = opt$rodriguezSig)
+  
+  names(RodriguezClustersSig) <- clusterNames 
+  
+  getOnlyPos <- function(clustersSig) {
+    clusterSig <- clustersSig[which(clustersSig$log.effect > 0),]
+    return(clusterSig)
+  }
+  
+  RodriguezClustersSigPos <- lapply(X= RodriguezClustersSig, getOnlyPos)
+  
+  
+  signaturesRodriguez <- lapply(RodriguezClustersSigPos,"[[",1 )
+  
+  
+  firstup <- function(x) {
+    substr(x, 1, 1) <- toupper(substr(x, 1, 1))
+    x
+  }
+  
+  colnames(markers) <- firstup(colnames(markers))
+  
+  getClustEnrichForRodriguez <- function(clust,signatures,seurat,markers) {
+    clustSig <- lapply(signatures,testHyperSig3,seurat,markers,clust)
+    if (!is.null(seurat@meta.data$predicted)) {
+    propCellTypesLearned <- table(seurat@meta.data[which(seurat@meta.data$numclust==clust),"predicted"])/length(seurat@meta.data[which(seurat@meta.data$numclust==clust),"predicted"])
+    } else{
+      propCellTypesLearned <- NULL
+    }
+    clustInfo <- c(clustSig,propCellTypesLearned)
+    return(clustInfo)
+  }
+  
+  clust_list <- lapply(unique(markers$Cluster),getClustEnrichForRodriguez,signature=signaturesRodriguez,seurat =seurat,markers =markers) ##markers arg forgotten in testHyper sig
+
+  names(clust_list) <- paste("cluster_",unique(markers$Cluster),sep="")
+  
+  clust_table <- as.data.frame(matrix(unlist(clust_list), nrow=length(unlist(clust_list[1]))))
+  colnames(clust_table) <- names(clust_list)
+  rownames(clust_table) <- names(clust_list[[1]])
+  
+  clust_df <- as.data.frame(t(clust_table))
+  
+  write.csv(clust_df,file = paste(opt$outdir,"/clustInfoRodriguez.csv",sep =""),quote = F)
+  
+}
+
+
+#Clusters table summary
+
+clust_table <- data.frame()
+
+print("Creating cluster table summary")
+
+
+getClustInfo <- function(clust,signatures,seurat,markers) {
+  
+  clustInfo <- list()
+  clustInfo$num_cells <- dim(seurat@meta.data[which(seurat@active.ident==clust),])[1]
+  clustInfo$percent_cells <- clustInfo$num_cells/dim(seurat@meta.data)[1]
+  percentPhases <- table(seurat@meta.data[which(seurat@active.ident==clust),"phases"])/length(seurat@meta.data[which(seurat@active.ident==clust),"phases"]) #In fact this is fraction not percentage
+  
+  if(!is.null(seurat@meta.data$predicted)) {
+    percentPredicted <- table(seurat@meta.data[which(seurat@active.ident==clust),"predicted"])/length(seurat@meta.data[which(seurat@active.ident==clust),"predicted"]) #In fact this is fraction not percentage
+    
+    for (p in unique(seurat@meta.data$predicted)) {
+      print(p)
+      if (is.element(p,names(percentPhases))) {
+        clustInfo[[p]] <- percentPhases[p] 
+      } else {
+        clustInfo[[p]] <- 0
+      }
+    }
+    
+  }
+  
+  for (p in c("G1_G0","S","G2_M")) {
+    if (is.element(p,names(percentPhases))) {
+      clustInfo[[p]] <- percentPhases[p] 
+    } else {
+      clustInfo[[p]] <- 0
+    }
+  }
+  
+  
+  clustInfo$median_genes_expressed <- median(seurat@meta.data[which(seurat@active.ident==clust),"numGenesPerCells"])
+  clustInfo$median_nUMI <- median(seurat@meta.data[which(seurat@active.ident==clust),"Total_mRNAs"])
+  clustInfo$median_percentMitochGenes <- median(seurat@meta.data[which(seurat@active.ident==clust),"percentMito"])
+  
+  
+  clustSig <- lapply(signatures,testHyperSig3,seurat,markers,clust) 
+  
+  clustInfo <- c(clustInfo,clustSig)
+  
+}
+
+allSignatures <- c(signatures,signaturesRodriguez)
+
+clust_list <- lapply(levels(unique(seurat@active.ident)),getClustInfo,allSignatures,seurat,markers)
+
+names(clust_list) <- paste("cluster_",levels(unique(seurat@active.ident)),sep="")
+
+saveRDS(clust_list,paste(opt$outdir,"/clust_list_save.rds",sep =""))
+
+clust_table <- as.data.frame(matrix(unlist(clust_list), nrow=length(unlist(clust_list[1]))))
+colnames(clust_table) <- names(clust_list)
+rownames(clust_table) <- names(clust_list[[1]])
+
+
+clust_df <- as.data.frame(t(clust_table))
+
+write.table(x = clust_df,file = paste(opt$outdir,"/clusters_table.tsv",sep =""),sep="\t",quote=F,col.names = NA)
+
+saveRDS(seurat,file = paste(opt$outdir,"/seurat.rds",sep = ""))
+
+
+
--- a/R_src/Seurat3_integration.R
+++ b/R_src/Seurat3_integration.R
+##------------------------------------------------------------------------------
+## L. Herault
+##------------------------------------------------------------------------------
+
+## -----------------------------------------------------------------------------
+## Libraries
+## -----------------------------------------------------------------------------
+
+suppressMessages(library(monocle))
+suppressMessages(library(Seurat))
+suppressMessages(library(BiocParallel))
+suppressMessages(library(getopt))
+suppressMessages(library(gridExtra))
+suppressMessages(library(gProfileR))
+suppressMessages(library(RColorBrewer))
+suppressMessages(library(readxl))
+suppressMessages(library(stringr))
+suppressMessages(library(scales))
+suppressMessages(library(cowplot))
+suppressMessages(library(sctransform))
+suppressMessages(library(stringr))
+suppressMessages(library(plyr))
+suppressMessages(library(grid))
+
+
+source("R_src/getSigPerCells.R")
+source("R_src/Enrichment.R")
+source("R_src/funForSeurat.R")
+
+
+# Seurat 3 integration workflow
+
+## -----------------------------------------------------------------------------
+## Command line args
+## -----------------------------------------------------------------------------
+
+spec = matrix(c(
+  'help',        'h', 0, "logical",   "Help about the program",
+  'inputFiles',  'i', 1, "character", "REQUIRED: 10X dataset paths prepared as hspc.combined object (.RDS generated by prepare_data.R) separated by +",
+  'signaturesFile', 's',1, "character", "REQUIRED: signatures rds file",
+  'outdir',     'o',1, "character", 'Outdir path (default ./)',
+  "num_dim_CCA", 'q',1,"numeric", "First n dimensions of CCA to use for FindIntegrationAnchors Seurat function (15 by default)",
+  "num_dim_weight", 'w',1,"numeric", "First n dimensions to use for IntegrateData Seurat function (15 by default)",
+  "num_dim",      'n',1,"numeric", "First n dimensions of PCA to use for clustering, UMAP and TSNE (15 by default)",
+  "num_dim_integrated",'N',1,"numeric", "Number of PCA dimension computed to analyse integrated data (40 by default)",
+  "cores",        'c',1, "numeric", "Number of cores to use for ordering (for differencially expressed gene between clusters test)",
+  "resolution",    'r',1, "numeric", "resolution for hspc.combined clustering",
+  "correction",      "b",1,"character", "Covariable to use as blocking factor (eg one or several columns of pData: betch, cell cycle phases... separated by +)",
+  "logfc_threshold", "l", 1, "numeric", "logfc threshold for finding cluster markers (1 cluster vs all deg) 0.25 by default",
+  "rodriguezSig", "k", 1, "character", "path for xls file of Rodriguez result",
+  "norm_method", "z",1, "character", "normalization method, logNorm (by default) or sctransform",
+  "reusePca", "p", 0, "character","re use pca calculated before when caculating anchor weights for each dataset default to FALSE (permit to correct for cell cycle before integration)"
+), byrow=TRUE, ncol=5);
+
+opt = getopt(spec)
+
+# if help was asked, print a friendly message
+# and exit with a non-zero error code
+
+args <- commandArgs()
+if ( !is.null(opt$help) | is.null(opt$inputFiles)) {
+  cat("Perform Seurat 3 integration workflow, then cluster the cell with seurat 3 at different resolutions")
+  cat(getopt(spec, usage=TRUE))
+  q(status=1)
+}
+
+#set default arguments values
+
+if (is.null(opt$resolution)) {
+  opt$resolution <- 0.6
+}
+
+if (is.null(opt$logfc_threshold)) {
+  opt$logfc_threshold <- 0.25
+}
+
+if (is.null(opt$num_dim_CCA)) {
+  opt$num_dim_CCA <- 15
+}
+
+if (is.null(opt$num_dim_weight)) {
+  opt$num_dim_weight <- 15
+}
+
+if (is.null(opt$num_dim)) {
+  opt$num_dim <- 15
+}
+
+if (is.null(opt$num_dim_integrated)) {
+  opt$num_dim_integrated <- 40
+}
+
+if (is.null(opt$norm_method)) {
+  opt$norm_method <- "logNorm"
+}
+
+if (is.null(opt$reusePca)) {
+  opt$reusePca <- FALSE
+}
+
+blank <- grid.rect(gp=gpar(col="white"))
+
+print(opt$reusePca)
+
+dir.create(opt$outdir,recursive = T,showWarnings = F)
+
+
+corrections <- strsplit(x = opt$correction,split = "\\+")[[1]]
+
+
+dir.create(opt$outdir,recursive = T,showWarnings = F)
+
+
+hspc.listFile <- strsplit(opt$inputFiles,split = "\\+")[[1]]
+
+hspc.list <- list()
+#load dataset 
+
+for (i in 1:length(x = hspc.listFile)) {
+  #For testing, in final workflow sampleName will be incorporated in metadata with the loading of cell ranger matrix
+  sampleName <- strsplit(hspc.listFile[i],split = "/")[[1]][2]
+  hspc.list[[i]] <- readRDS(hspc.listFile[i])
+  hspc.list[[i]]@meta.data$sampleName <- sampleName
+  hspc.list[[i]] <- RenameCells(hspc.list[[i]],add.cell.id = sampleName)
+  
+  if(opt$norm_method != "sctransform") {
+    hspc.list[[i]] <- NormalizeData(object = hspc.list[[i]], verbose = FALSE)
+    hspc.list[[i]] <- FindVariableFeatures(object = hspc.list[[i]], selection.method = "vst", 
+                                           nfeatures = 2000, verbose = FALSE)
+  } else {
+    hspc.list[[i]] <- SCTransform(hspc.list[[i]],vars.to.regress = corrections,verbose = T)
+  }
+}
+
+
+hspc.anchors <- FindIntegrationAnchors(object.list = hspc.list, dims = 1:opt$num_dim_CCA)
+
+if(opt$reusePca) {
+  print("Re use pca")
+  hspc.combined <- IntegrateData(anchorset = hspc.anchors, weight.reduction = "pca", dims = 1:opt$num_dim_weight)
+}
+
+hspc.combined <- IntegrateData(anchorset = hspc.anchors, dims = 1:opt$num_dim_weight)
+
+DefaultAssay(object = hspc.combined) <- "integrated"
+
+# Run the standard workflow for visualization and clustering
+#Here certainly need to reuse SCTransform on combined data if norm_method = sct
+
+hspc.combined <- ScaleData(object = hspc.combined, verbose = T,vars.to.regress = corrections)
+hspc.combined <- RunPCA(object = hspc.combined, npcs = opt$num_dim_integrated, verbose = FALSE)
+
+
+png(paste(opt$outdir,"/ElbowPlot.png",sep =""))
+ElbowPlot(object = hspc.combined,ndims = opt$num_dim_integrated)
+dev.off()
+
+
+# t-SNE UMAP and Clustering
+hspc.combined <- RunUMAP(object = hspc.combined, reduction = "pca", dims = 1:opt$num_dim)
+hspc.combined <- RunTSNE(object = hspc.combined, reduction = "pca", dims = 1:opt$num_dim)
+hspc.combined <- FindNeighbors(object = hspc.combined, reduction = "pca", dims = 1:opt$num_dim)
+hspc.combined <- FindClusters(hspc.combined, resolution = c(0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1,1.2))
+
+colPrefix <- "integrated_snn_res."
+
+Idents(hspc.combined) <- paste(colPrefix,opt$resolution,sep = "")
+
+hspc.combined@meta.data$AGE <- "Old"
+hspc.combined@meta.data$AGE[which(hspc.combined@meta.data$age == "2_months")] <- "Young"
+
+
+
+# Visualization of the samples
+p1 <- DimPlot(object = hspc.combined, reduction = "umap", group.by = "sampleName")
+p2 <- DimPlot(object = hspc.combined, reduction = "umap", label = TRUE)
+p3 <- DimPlot(object = hspc.combined, reduction = "umap", group.by = "AGE")
+p4 <- DimPlot(object = hspc.combined, reduction = "umap", group.by = "runDate")
+p3tsne <- DimPlot(object = hspc.combined, reduction = "tsne", group.by = "AGE")
+p4tsne <- DimPlot(object = hspc.combined, reduction = "tsne", group.by = "runDate")
+
+
+
+png(paste(opt$outdir,"/umap_samples.png",sep =""),height = 800,width=1200)
+grid.arrange(p1, p2,ncol = 2)
+dev.off()
+
+png(paste(opt$outdir,"/AGE.png",sep =""),height = 800,width=1200)
+grid.arrange(p3, p3tsne,ncol = 2)
+dev.off()
+
+png(paste(opt$outdir,"/runDate.png",sep =""),height = 800,width=1200)
+grid.arrange(p4, p4tsne,ncol = 2)
+dev.off()
+
+png(paste(opt$outdir,"/UmapClusters.png",sep =""),height = 800,width=1200)
+DimPlot(object = hspc.combined, reduction = "umap", label = TRUE)
+dev.off()
+
+
+p1tsne <- DimPlot(object = hspc.combined, reduction = "tsne", group.by = "sampleName")
+p2tsne <- DimPlot(object = hspc.combined, reduction = "tsne", label = TRUE)
+
+png(paste(opt$outdir,"/tsne_samples.png",sep =""),height = 800,width=1000)
+grid.arrange(p1tsne, p2tsne,ncol = 2)
+dev.off()
+
+#Visualistaoin one sample at a time
+sampleNames <- unique(hspc.combined@meta.data$sampleName) 
+dir.create(paste(opt$outdir,"/samples/umap/",sep = ""),recursive = T)
+dir.create(paste(opt$outdir,"/samples/tsne/",sep = ""),recursive = T)
+
+#samplePlotList <- list()
+  for (s in sampleNames) {
+    png(paste(opt$outdir,"/samples/umap/",s,".png",sep = ""))
+    plot(DimPlot(object = hspc.combined, cells= grep(s, hspc.combined@meta.data$sampleName), reduction = "umap") + ggtitle(s))
+    dev.off()
+    png(paste(opt$outdir,"/samples/tsne/",s,".png",sep = ""))
+    plot(DimPlot(object = hspc.combined, cells= grep(s, hspc.combined@meta.data$sampleName), reduction = "tsne") + ggtitle(s))
+    dev.off()
+  }
+
+
+umapListRes <- list()
+tsneListRes <- list()
+for (r in c(0.6,0.8,1,1.2)) {
+  umapListRes[[as.character(r)]] <- DimPlot(hspc.combined,
+                                            reduction = "umap",
+                                            label = T,
+                                            group.by = paste(colPrefix,r,sep="")) + 
+    NoLegend() +
+    ggtitle(paste("res",r))
+  
+  tsneListRes[[as.character(r)]] <- DimPlot(hspc.combined,
+                                            reduction = "tsne",
+                                            label = T,
+                                            group.by = paste(colPrefix,r,sep="")) + 
+    NoLegend() +
+    ggtitle(paste("res",r))
+}
+
+png(paste(opt$outdir,"/tsne_different_res.png",sep = ""),height = 800,width = 800)
+grid.arrange(tsneListRes[[1]],tsneListRes[[2]],tsneListRes[[3]],tsneListRes[[4]])
+dev.off()
+
+png(paste(opt$outdir,"/umap_different_res.png",sep = ""),height = 800,width = 800)
+grid.arrange(umapListRes[[1]],umapListRes[[2]],umapListRes[[3]],umapListRes[[4]])
+dev.off()
+
+
+
+hspc.combined@meta.data$numclust <- hspc.combined@meta.data[,paste(colPrefix,opt$resolution,sep = "")]
+
+
+#Check for unwanted source of variation UMAP
+pPhases <- DimPlot(hspc.combined,group.by = "phases")
+if (!is.null(hspc.combined@meta.data$predicted)) {
+  pPred <- DimPlot(hspc.combined,group.by = "predicted")
+} else {
+  pPred <- blank
+}
+pUMI <- FeaturePlot(hspc.combined, "Total_mRNAs")
+pMito <- FeaturePlot(hspc.combined, "percentMito")
+
+png(paste(opt$outdir,"/umap_factors.png",sep = ""),height = 800,width = 800)
+grid.arrange(pPhases,pPred,pUMI,pMito)
+dev.off()
+
+#Check for unwanted source of variation tSNE
+pPhases <- DimPlot(hspc.combined,group.by = "phases",reduction = "tsne")
+if (!is.null(hspc.combined@meta.data$predicted)) {
+  pPred <- DimPlot(hspc.combined,group.by = "predicted",reduction = "tsne")
+} else {
+  pPred <- blank
+}
+pUMI <- FeaturePlot(hspc.combined, "Total_mRNAs",reduction = "tsne")
+pMito <- FeaturePlot(hspc.combined, "percentMito",reduction = "tsne")
+
+png(paste(opt$outdir,"/tsne_factors.png",sep = ""),height = 800,width = 800)
+grid.arrange(pPhases,pPred,pUMI,pMito)
+dev.off()
+
+
+#Check for genes
+
+gene_list<- c("Pdzk1ip1","Mllt3",
+              "Ctla2a","Cd27","Cd34",
+              "Lig1","Hells","Tyms",
+              "Notch2","Lst1",
+              "Irf7","Stat1",
+              "Pf4","Itga2b",
+              "Klf1","Gata1",
+              "Mpo","Cd48",
+              "Fcer1a","Hdc",
+              "Il7r","Thy1", # Ccr9 not found
+              "Cdc20","Ccnb1","Racgap1",
+              "Mzb1","Ly6d", "Trp53inp1",
+              "Jun","Fos","Nr4a1","Jund")
+
+gene_list <- gene_list[which(is.element(set=rownames(hspc.combined),el = gene_list))] 
+
+dir.create(paste(opt$outdir,"/genes/umap/",sep =""),recursive = T)
+dir.create(paste(opt$outdir,"/genes/tsne/",sep =""),recursive = T)
+for (g in gene_list) {
+  png(paste(opt$outdir,"/genes/umap/",g,".png",sep = ""))
+  plot(FeaturePlot(hspc.combined,features = g))
+  dev.off()
+}
+
+for (g in gene_list) {
+  png(paste(opt$outdir,"/genes/tsne/",g,".png",sep = ""))
+  plot(FeaturePlot(hspc.combined,features = g,reduction = "tsne"))
+  dev.off()
+}
+
+#AGE prop for each cluster 
+age_clust <- getAGEPropPerClustBarplot(hspc.combined)
+#sample prop for each cluster
+sample_clust <- getSamplePropPerClustBarplot(hspc.combined)
+#runDate prop for each cluster
+runDate_clust <- getRunDatePropPerClustBarplot(hspc.combined)
+
+png(paste(opt$outdir,"/propPerClust.png",sep = ""))
+grid.arrange(age_clust,sample_clust,runDate_clust,p2tsne)
+dev.off()
+
+
+## saving results
+
+saveRDS(hspc.combined,paste(opt$outdir,"/combined.rds",sep =""))
+
+
+
+
+
+
--- a/R_src/Seurat3_integration_biology.R
+++ b/R_src/Seurat3_integration_biology.R
+##------------------------------------------------------------------------------
+## L. Herault
+##------------------------------------------------------------------------------
+
+## -----------------------------------------------------------------------------
+## Libraries
+## -----------------------------------------------------------------------------
+
+suppressMessages(library(monocle))
+suppressMessages(library(Seurat))
+suppressMessages(library(BiocParallel))
+suppressMessages(library(getopt))
+suppressMessages(library(gridExtra))
+suppressMessages(library(gProfileR))
+suppressMessages(library(RColorBrewer))
+suppressMessages(library(readxl))
+suppressMessages(library(stringr))
+suppressMessages(library(plyr))
+suppressMessages(library(biomaRt))
+
+
+
+suppressMessages(library(scales))
+
+
+source("R_src/getSigPerCells.R")
+source("R_src/Enrichment.R")
+source("R_src/funForSeurat.R")
+
+# Analysis of seurat 3 integration and clusternig workflow
+
+## -----------------------------------------------------------------------------
+## Command line args
+## -----------------------------------------------------------------------------
+
+spec = matrix(c(
+  'help',        'h', 0, "logical",   "Help about the program",
+  'inputRDS',  'i', 1, "character", "REQUIRED : 10X data prepared qs seurat object with CCA and first clustering (.RDS generated by prepare_data.R).",
+  'outdir',     'o',1, "character", 'Outdir path (default ./)',
+  "mart",         'm',1, "character", "mart data rds format if biomaRt server is down and you have a save ",
+  "resolution",       'r',1, "numeric", "resolution of the clustering analysed default to 0.8",
+  "num_dim",          'n', 1, "numeric", "first n dimension of the CCA to use for the reclustering",
+  "minPropCellExp", "p",1,"character", "proportion minimal of cells that expressed the genes kept for the analaysis 0.001 by default",
+  "signaturesFile", "s", 1, "character", "path to folder with signature list store as rds object",
+  "logfc_threshold", "l", 1, "numeric", "logfc threshold for finding cluster markers (1 cluster vs all deg) 0.25 by default",
+  "rodriguezSig", "k", 1, "character", "path for xls file of Rodriguez result",
+  "gprofiler", "g", 0, "logical", "Put to avoid gprofiler analysis",
+  "norm_method",    "z",1, "character", "normalisation method, logNorm seurat (by default) or sctransform"
+), byrow=TRUE, ncol=5);
+
+
+
+opt = getopt(spec)
+
+
+# if help was asked, print a friendly message
+# and exit with a non-zero error code
+
+args <- commandArgs()
+if ( !is.null(opt$help) | is.null(opt$inputRDS)) {
+  cat("Perform diff expression between clusters previously obtained with Seurat3, make a summary table of cluster metrics and signature enrichments")
+  cat(getopt(spec, usage=TRUE))
+  q(status=1)
+}
+
+
+if (is.null(opt$logfc_threshold)) {
+  opt$logfc_threshold = 0.25
+} 
+if (is.null(opt$norm_method)) {
+  opt$norm_method = "logNorm"
+} 
+
+print(paste("logfc threshold:", opt$logfc_threshold) )
+
+if (is.null(opt$outdir)) {
+  opt$outdir = "./"
+}
+
+
+if(is.null(opt$doMarkers)) {
+  opt$doMarkers = T
+}
+
+if (is.null(opt$gprofiler)) {
+  opt$gprofiler <- T
+} else {
+  opt$gprofiler <- F
+}
+
+
+if (is.null(opt$mart)){
+  mart <- useDataset("mmusculus_gene_ensembl", useMart("ensembl"))
+} else {
+  mart <- readRDS(opt$mart)
+}
+
+
+dir.create(opt$outdir,recursive = T,showWarnings = F)
+
+hspc.combined <- readRDS(opt$inputRDS)
+signatures <- readRDS(opt$signaturesFile)
+
+# Set active idents/slot
+Idents(hspc.combined) <- paste("integrated_snn_res.",opt$resolution,sep ="")
+
+hspc.combined@meta.data$numclust <- Idents(hspc.combined)
+
+DefaultAssay(object = hspc.combined) <- "RNA"
+if (opt$norm_method == "sctransform") {
+  DefaultAssay(object = hspc.combined) <- "SCT"
+}
+
+## Plotting
+
+png(paste(opt$outdir,"/seurat_res",opt$resolution,".png",sep = ""),height = 800,width = 800)
+DimPlot(hspc.combined,label = TRUE,pt.size = 1.5) + ggtitle(paste("seurat_res",opt$resolution,sep = "")) + NoLegend()
+dev.off()
+
+#age <- getAGEPropPerClustBarplot(hspc.combined)
+
+print(paste("integrated_snn_res.",opt$resolution,sep =""))
+print(colnames(hspc.combined@meta.data))
+
+#ageEnrich <- getEnrichAge(hspc.combined,clustCol =paste("integrated_snn_res.",opt$resolution,sep =""),metaCol = "AGE")
+
+# ageEnrich <- as.data.frame(t(ageEnrich))
+# ageEnrich$color <- "black"
+# ageEnrich[which(as.numeric(as.vector(ageEnrich$phyper)) < 0.05),"color"] <- "red"
+# age <- age + theme(axis.text.y = element_text(colour = ageEnrich[,'color']))
+
+
+# png(paste(opt$outdir,"/bp_AGE.png",sep = ""),height = 800,width = 800)
+# plot(age)
+# dev.off()
+
+png(paste(opt$outdir,"/bp_sample.png",sep = ""),height = 800,width = 800)
+getSamplePropPerClustBarplot(hspc.combined) 
+dev.off()
+
+png(paste(opt$outdir,"/bp_runDate.png",sep = ""),height = 800,width = 800)
+getRunDatePropPerClustBarplot(hspc.combined) 
+dev.off()
+
+png(paste(opt$outdir,"/bp_phases.png",sep = ""),height = 800,width = 800)
+getPhasePropPerClustBarplot(hspc.combined) 
+dev.off()
+
+if (!is.null(hspc.combined@meta.data$predicted)) {
+png(paste(opt$outdir,"/bp_predicted.png",sep = ""),height = 800,width = 800)
+getPredictedPropPerClustBarplot(hspc.combined) 
+dev.off()
+}
+
+
+## Perform differential expression analysis between clusters
+
+
+dirMarkers <- paste(opt$outdir,"/markers_res",opt$resolution,sep ="")
+dir.create(dirMarkers)
+
+markers <- FindAllMarkers(hspc.combined,only.pos = T,logfc.threshold= opt$logfc_threshold)
+markers <- markers[which(markers$p_val_adj < 0.05),]
+
+ylab <- "LogNormalized UMI counts"
+if (opt$norm_method == "sctransform") {
+  ylab <- "Expression level"
+}
+
+for (numClust in unique(markers$cluster)) {
+  print(head(markers[which(markers$cluster == numClust),],n=9))
+  png(paste(dirMarkers,"/Cluster_",numClust,"_topGenesVlnPlot.png",sep =""),width = 1000, height = 1000)
+  print(plot(VlnPlot(object = hspc.combined, features = head(markers[which(markers$cluster == numClust),"gene"],n=9),pt.size = 0.5) + 
+               labs(x = "Clusters",y= ylab,colour = "black") +
+               theme(axis.text = element_text(size=20),
+                     plot.title = element_text(size=25)) ))
+  dev.off()
+}
+
+write.table(x = markers,paste(dirMarkers,"/markers.tsv", sep =""),sep = "\t",quote = F,row.names = F,col.names = T)
+
+print("getting TF markers")
+TF_markers <- getBM(attributes=c("external_gene_name"),
+                    filters=c("external_gene_name","go"),
+                    values=list(markers$gene,
+                                "GO:0003700"),mart=mart)
+
+#Add tf from bonzanni plus Zbtb16 and Gfi1b
+TF_bonzani <- data.frame(external_gene_name = c("Spi1","Tal1","Zfpm1","Cbfa2t3","Erg","Fli1","Gata1","Gata2","Hhex","Runx1","Smad6","Zbtb16","Gfi1b"))
+TF_markers <- rbind(TF_markers,TF_bonzani)
+TF_markers <- unique(TF_markers)
+
+write.table(TF_markers,paste(dirMarkers,"/markers_and_bonzanni_TF.tsv",sep =""),
+            sep = "\t",quote = F,row.names = F, col.names = F)
+
+# Modify colnames of markers to use gProfileAnalysis function
+firstup <- function(x) {
+  substr(x, 1, 1) <- toupper(substr(x, 1, 1))
+  x
+}
+
+colnames(markers) <- firstup(colnames(markers))
+
+
+if(opt$gprofiler) {
+  gprofiler_result <- gProfileAnalysis(deg_clust = markers,
+                                       outdir = paste(dirMarkers,"/gProfileR", sep =""),
+                                       background = rownames(hspc.combined),
+                                       colors = hue_pal()(length(unique(markers$Cluster))))
+  
+  saveRDS(gprofiler_result,file=paste(dirMarkers,"/gProfileR/gprofiler_results.rds",sep =""))
+}
+
+getClustTable(opt$rodriguezSig,markers=markers,signatures = signatures,seurat = hspc.combined,outdir = dirMarkers)
+
+
+
+
+
--- a/R_src/Seurat4CL.R
+++ b/R_src/Seurat4CL.R
+##------------------------------------------------------------------------------
+## L. Herault
+##------------------------------------------------------------------------------
+
+## -----------------------------------------------------------------------------
+## Libraries
+## -----------------------------------------------------------------------------
+
+suppressMessages(library(Seurat))
+suppressMessages(library(BiocParallel))
+suppressMessages(library(getopt))
+suppressMessages(library(gridExtra))
+#suppressMessages(library(gProfileR))
+suppressMessages(library(RColorBrewer))
+suppressMessages(library(readxl))
+suppressMessages(library(stringr))
+suppressMessages(library(scales))
+suppressMessages(library(grid))
+suppressMessages(library(ggplot2))
+
+
+
+
+source("R_src/getSigPerCells.R")
+source("R_src/Enrichment.R")
+source("R_src/funForSeurat.R")
+
+# Seurat 3 analysis of an individual sample required for using CaSTLe script.
+
+## -----------------------------------------------------------------------------
+## Command line args
+## -----------------------------------------------------------------------------
+
+spec = matrix(c(
+  'help',        'h', 0, "logical",   "Help about the program",
+  'inputRDS',  'i', 1, "character", "REQUIRED : 10X data prepared as monocle or seurat object (.RDS generated by prepare_data.R).",
+  'outdir',     'o',1, "character", 'Outdir path (default ./)',
+  "num_dim",      'n',1,"numeric", "Number of dimension to use for ordering (eg n first pc of PCA on input data) 10 by default",
+  "correction",      "b",1,"character", "Covariable to use as blocking factor (eg one or several columns of pData: betch, cell cycle phases... separated by +)",
+  "minPropCellExp", "p",1,"numeric", "minimal proportion of cells that expressed the genes kept for the analaysis 0.001 by default",
+  "norm_method",    "z",1, "character", "normalisation method, logNorm seurat (by default) or sctransform",
+  "resolution", "r", 1,"numeric", "resolution for Seurat clustering 0.9 by default",
+  "signaturesFile", "s", 1, "character", "path to folder with signature list store as rds object",
+  "identRemoved",    "d", 1, "character", "Optionnal cluster to remove only work if input is a seurat object",
+  "nonExpressedGenesRemoved", "e", 0,"logical", "non expressed gene already removed default to FALSE",
+  "gprofiler", "g", 0, "logical", "if true doing gprofiler default to true",
+  "logfc_threshold", "l", 1, "numeric", "logfc threshold for finding cluster markers (1 cluster vs all deg) 0.25 by default"
+  
+), byrow=TRUE, ncol=5);
+
+opt = getopt(spec)
+
+## For testing
+# 
+# opt <- list()
+# opt$outdir <- "output/testSeurat4/"
+# opt$correction <- "G2M_score+S_score+G1_score"
+# opt$inputRDS <- "seurat_treated.rds"
+# opt$signatureFile <- "output/signatures/publicSignatures.rds"
+
+
+
+# if help was asked, print a friendly message
+# and exit with a non-zero error code
+
+args <- commandArgs()
+if ( !is.null(opt$help) | is.null(opt$inputRDS)) {
+  cat("For an individual sample, perform PCA, tSNE, UMAP, Louvain clustering, diff expression between clusters with Seurat3 package with filtered poor quality cells data (gbm_cds monocle object)")
+  cat(getopt(spec, usage=TRUE))
+  q(status=1)
+}
+
+
+#set default arguments values
+
+if (is.null(opt$num_dim)) {
+  opt$num_dim <- 10
+}
+
+if (is.null(opt$resolution)) {
+  opt$resolution <- 0.8
+}
+
+if (is.null(opt$minPropCellExp)) {
+  opt$minPropCellExp <- 0.001
+  print(opt$minPropCellExp)
+}
+
+
+if (is.null(opt$logfc_threshold)) {
+  opt$logfc_threshold = 0.25
+} 
+
+print(paste("logfc threshold:", opt$logfc_threshold) )
+
+if (is.null(opt$outdir)) {
+  opt$outdir = "./"
+}
+
+if (is.null(opt$norm_method)) {
+  opt$norm_method = "logNorm"
+}
+
+
+
+if (is.null(opt$nonExpressedGenesRemoved)) {
+  opt$nonExpressedGenesRemoved = F
+} else {
+  opt$nonExpressedGenesRemoved = T
+}
+
+# if (is.null(opt$gprofiler)) {
+#   opt$gprofiler <- T
+# } else {
+#   opt$gprofiler <- F
+# }
+
+
+# get correction vector
+if (!is.null(opt$correction)) {
+  corrections <- strsplit(x = opt$correction,split = "\\+")[[1]]
+} else {
+  corrections <- NULL
+}
+
+
+
+dir.create(opt$outdir,recursive = T,showWarnings = F)
+
+print(opt)
+
+# Loading
+seurat <- readRDS(opt$inputRDS)
+
+# Remove non expressed genes if not done
+if(opt$nonExpressedGenesRemoved == F) {
+  genesFiltered <- list(genes_before = dim(seurat)[1])
+  
+  print("remove non expressed genes (non expressed in at least X% of the cells X user option in monocle dp feature 5% in seurat tutorial 0,1%)")
+  threshold <- opt$minPropCellExp * ncol(seurat)
+
+  seurat <- CreateSeuratObject(counts = as.matrix(GetAssayData(seurat,slot = 'counts',assay = "RNA")),
+                               assay = "RNA",
+                               meta.data = seurat@meta.data,
+                               min.cells = threshold)
+  genesFiltered$genes_after <- dim(seurat)[1]
+  
+  png(paste(opt$outdir,"/genesFiltering.png",sep = ""))
+  barplot(unlist(genesFiltered),main = "Genes filtering")
+  dev.off()
+  
+}
+
+
+
+#################################################################################################
+####################################### Seurat Workflow #########################################
+#################################################################################################
+
+
+classicSeuratWorkflow <- function(seurat, corrections,outdir) {
+  dir.create(outdir,recursive = T,showWarnings = F)
+  
+  if (opt$norm_method == "sctransform") {
+    
+    seurat <- SCTransform(object = seurat, vars.to.regress = corrections)
+    
+  } else {
+    
+    seurat <- NormalizeData(object = seurat)
+    seurat <- FindVariableFeatures(object = seurat,selection.method = "vst", nfeatures = 2000, verbose = T)
+    seurat <- ScaleData(object = seurat,vars.to.regress = corrections)
+    
+  }
+  
+  seurat <- RunPCA(object = seurat)
+  
+  
+  png(paste(outdir,"/ElbowPlot.png",sep =""))
+  ElbowPlot(object = seurat,ndims = 30)
+  dev.off()
+  
+  
+  print("Clustering...")
+  
+  seurat <- FindNeighbors(object = seurat,dims = c(1:opt$num_dim),k.param = 20)
+  seurat <- FindClusters(object = seurat,resolution = c(0.5,0.6,0.7,0.8,0.9,1,1.2))
+  
+  print("Running TSNE...")
+  
+  seurat <- RunTSNE(seurat,dims = c(1:opt$num_dim))
+  
+  print("Running UMAP...")
+  
+  seurat <- RunUMAP(seurat,dims = c(1:opt$num_dim))
+  
+  print("UMAP ok")
+  print(colnames(seurat@meta.data))
+  
+  colPrefix <- "RNA_snn_res."
+  if(opt$norm_method == "sctransform") {
+    colPrefix <- "SCT_snn_res."
+  }
+  
+  umapListRes <- list()
+  tsneListRes <- list()
+  for (r in c(0.6,0.8,1,1.2)) {
+    umapListRes[[as.character(r)]] <- DimPlot(seurat,
+                                              reduction = "umap",
+                                              label = T,
+                                              group.by = paste(colPrefix,r,sep="")) + 
+      NoLegend() +
+      ggplot2::ggtitle(paste("res",r))
+    
+    tsneListRes[[as.character(r)]] <- DimPlot(seurat,
+                                              reduction = "tsne",
+                                              label = T,
+                                              group.by = paste(colPrefix,r,sep="")) + 
+      NoLegend() +
+      ggplot2::ggtitle(paste("res",r))
+  }
+  
+  png(paste(outdir,"/tsne_different_res.png",sep = ""),height = 800,width = 800)
+  grid.arrange(tsneListRes[[1]],tsneListRes[[2]],tsneListRes[[3]],tsneListRes[[4]])
+  dev.off()
+  
+  png(paste(outdir,"/umap_different_res.png",sep = ""),height = 800,width = 800)
+  grid.arrange(umapListRes[[1]],umapListRes[[2]],umapListRes[[3]],umapListRes[[4]])
+  dev.off()
+  
+  
+  
+  
+  Idents(seurat) <- paste(colPrefix,opt$resolution,sep = "")
+  seurat@meta.data$numclust <- seurat@meta.data[,paste(colPrefix,opt$resolution,sep = "")]
+  
+  
+  #Check for unwanted source of variation
+  pPhases <- DimPlot(seurat,group.by = "phases")
+  if (!is.null(seurat@meta.data$predicted)) {
+    pPred <- DimPlot(seurat,group.by = "predicted")
+  } else {
+    pPred <- DimPlot(seurat)
+  }
+  
+  pUMI <- FeaturePlot(seurat, "nCount_RNA")
+  pMito <- FeaturePlot(seurat, "percentMito")
+  
+  png(paste(outdir,"/umap_factors.png",sep = ""),height = 800,width = 800)
+  grid.arrange(pPhases,pPred,pUMI,pMito)
+  dev.off()
+  
+  #Check for unwanted source of variation
+  pPhases <- DimPlot(seurat,group.by = "phases",reduction = "tsne")
+  if (!is.null(seurat@meta.data$predicted)) {
+    pPred <- DimPlot(seurat,group.by = "predicted",reduction = "tsne")
+  } else {
+    pPred <- DimPlot(seurat,reduction = "tsne")
+  }
+  pUMI <- FeaturePlot(seurat, "nCount_RNA",reduction = "tsne")
+  pMito <- FeaturePlot(seurat, "percentMito",reduction = "tsne")
+  
+  png(paste(outdir,"/tsne_factors.png",sep = ""),height = 800,width = 800)
+  grid.arrange(pPhases,pPred,pUMI,pMito)
+  dev.off()
+  
+  return(seurat)
+  
+}
+
+
+## First without integration without correction
+unintegrated <- seurat
+DefaultAssay(unintegrated) <- "RNA"
+classicSeuratWorkflow(unintegrated,correction = NULL,outdir = paste0(opt$outdir,"/SeuratWoIntegrationWoCr/"))
+
+## without integration without correction
+
+classicSeuratWorkflow(unintegrated,correction = corrections,outdir = paste0(opt$outdir,"/SeuratWoIntegrationWithCr/"))
+
+
+## With integration without correction
+if (!is.null(corrections)) {
+  print("Seurat workflow without correction")
+  classicSeuratWorkflow(seurat,correction = NULL,outdir = paste0(opt$outdir,"/SeuratWithoutCr/"))
+}
+
+## With integration with correction
+
+seurat <- classicSeuratWorkflow(seurat,corrections = corrections,outdir =opt$outdir)
+
+#################################################################################################
+####################################### Markers analysis ########################################
+#################################################################################################
+
+markers <- FindAllMarkers(seurat,only.pos = T,logfc.threshold= opt$logfc_threshold)
+markers <- markers[which(markers$p_val_adj < 0.05),]
+
+write.table(x = markers,paste(opt$outdir,"/markers.tsv", sep =""),sep = "\t",quote = F,row.names = F,col.names = T)
+
+
+dir.create(paste(opt$outdir,"/markers/",sep = ""))
+
+ylab <- "LogNormalized UMI counts"
+if (opt$norm_method == "sctransform") {
+  ylab <- "Expression level"
+}
+
+for (numClust in unique(markers$cluster)) {
+  print(head(markers[which(markers$cluster == numClust),],n=9))
+  png(paste(opt$outdir,"/markers/Cluster_",numClust,"_topGenesVlnPlot.png",sep =""),width = 1000, height = 1000)
+  plot(VlnPlot(object = seurat, features = head(markers[which(markers$cluster == numClust),"gene"],n=9),pt.size = 0.5) + 
+         labs(x = "Clusters",y= ylab,colour = "black") +
+         theme(axis.text = element_text(size=20),
+               plot.title = element_text(size=25)) )
+  dev.off()
+  
+}
+
+
+
+## Add signatures scores
+dir.create(paste(opt$outdir,"/cellSignatures/",sep = ""))
+signatures <- readRDS(opt$signaturesFile)
+names(signatures) <- paste0(names(signatures),"_",seurat$sampleName[1])
+
+for (sig in c(1:length(signatures))) {
+  sigName <- names(signatures)[sig]
+  signature <- signatures[[sig]]
+  seurat <- scoreCells3(seurat,signature,outdir= paste(opt$outdir,"/cellSignatures/",sep=""),sigName)
+}
+
+#################################################################################################
+##########                                Enrichment                                   ##########
+#################################################################################################
+
+
+# ## Enrichment
+# 
+# # Modify colnames of markers to use gProfileAnalysis function
+# firstup <- function(x) {
+#   substr(x, 1, 1) <- toupper(substr(x, 1, 1))
+#   x
+# }
+# 
+# colnames(markers) <- firstup(colnames(markers))
+# 
+# 
+# #be careful background
+# if(opt$gprofiler) {
+# gprofiler_result <- gProfileAnalysis(deg_clust = markers,
+#                                      outdir = paste(opt$outdir,"/gProfileR", sep =""),
+#                                      background = rownames(seurat),
+#                                      colors = hue_pal()(length(unique(markers$Cluster))))
+# 
+# saveRDS(gprofiler_result,file=paste(opt$outdir,"/gProfileR/gprofiler_results.rds",sep =""))
+# }
+# 
+# 
+# 
+# # Test Rodriguez cluster sig
+# if(!is.null(opt$rodriguezSig)) {
+#   
+#   clusterNames <- c("C1","C2","C3","Mk","Er","Ba","Neu","Mo1","Mo2", "preDC","preB","preT")
+#   
+#   RodriguezClustersSig <- lapply(X= c(1:length(clusterNames)),FUN = read_xlsx,path = opt$rodriguezSig)
+#   
+#   names(RodriguezClustersSig) <- clusterNames 
+#   
+#   getOnlyPos <- function(clustersSig) {
+#     clusterSig <- clustersSig[which(clustersSig$log.effect > 0),]
+#     return(clusterSig)
+#   }
+#   
+#   RodriguezClustersSigPos <- lapply(X= RodriguezClustersSig, getOnlyPos)
+#   
+#   
+#   signaturesRodriguez <- lapply(RodriguezClustersSigPos,"[[",1 )
+#   
+#   
+#   firstup <- function(x) {
+#     substr(x, 1, 1) <- toupper(substr(x, 1, 1))
+#     x
+#   }
+#   
+#   colnames(markers) <- firstup(colnames(markers))
+#   
+#   getClustEnrichForRodriguez <- function(clust,signatures,seurat,markers) {
+#     clustSig <- lapply(signatures,testHyperSig3,seurat,markers,clust)
+#     if (!is.null(seurat@meta.data$predicted)) {
+#     propCellTypesLearned <- table(seurat@meta.data[which(seurat@meta.data$numclust==clust),"predicted"])/length(seurat@meta.data[which(seurat@meta.data$numclust==clust),"predicted"])
+#     } else{
+#       propCellTypesLearned <- NULL
+#     }
+#     clustInfo <- c(clustSig,propCellTypesLearned)
+#     return(clustInfo)
+#   }
+#   
+#   clust_list <- lapply(unique(markers$Cluster),getClustEnrichForRodriguez,signature=signaturesRodriguez,seurat =seurat,markers =markers) ##markers arg forgotten in testHyper sig
+# 
+#   names(clust_list) <- paste("cluster_",unique(markers$Cluster),sep="")
+#   
+#   clust_table <- as.data.frame(matrix(unlist(clust_list), nrow=length(unlist(clust_list[1]))))
+#   colnames(clust_table) <- names(clust_list)
+#   rownames(clust_table) <- names(clust_list[[1]])
+#   
+#   clust_df <- as.data.frame(t(clust_table))
+#   
+#   write.csv(clust_df,file = paste(opt$outdir,"/clustInfoRodriguez.csv",sep =""),quote = F)
+#   
+# }
+
+
+#################################################################################################
+##########                     Cluster summary table                                    #########
+#################################################################################################
+
+clust_table <- data.frame()
+
+print("Creating cluster summary table ")
+
+  firstup <- function(x) {
+    substr(x, 1, 1) <- toupper(substr(x, 1, 1))
+    x
+  }
+
+  colnames(markers) <- firstup(colnames(markers))
+
+getClustInfo <- function(clust,signatures,seurat,markers) {
+  
+  clustInfo <- list()
+  clustInfo$num_cells <- dim(seurat@meta.data[which(seurat@active.ident==clust),])[1]
+  clustInfo$percent_cells <- clustInfo$num_cells/dim(seurat@meta.data)[1]
+  percentPhases <- table(seurat@meta.data[which(seurat@active.ident==clust),"phases"])/length(seurat@meta.data[which(seurat@active.ident==clust),"phases"]) #In fact this is fraction not percentage
+  
+  if(!is.null(seurat@meta.data$predicted)) {
+    percentPredicted <- table(seurat@meta.data[which(seurat@active.ident==clust),"predicted"])/length(seurat@meta.data[which(seurat@active.ident==clust),"predicted"]) #In fact this is fraction not percentage
+    
+    for (p in unique(seurat@meta.data$predicted)) {
+      print(p)
+      if (is.element(p,names(percentPhases))) {
+        clustInfo[[p]] <- percentPhases[p] 
+      } else {
+        clustInfo[[p]] <- 0
+      }
+    }
+    
+  }
+  
+  for (p in c("G1_G0","S","G2_M")) {
+    if (is.element(p,names(percentPhases))) {
+      clustInfo[[p]] <- percentPhases[p] 
+    } else {
+      clustInfo[[p]] <- 0
+    }
+  }
+  
+  
+  clustInfo$median_genes_expressed <- median(seurat@meta.data[which(seurat@active.ident==clust),"nFeature_RNA"])
+  clustInfo$median_nUMI <- median(seurat@meta.data[which(seurat@active.ident==clust),"nCount_RNA"])
+  clustInfo$median_percentMitochGenes <- median(seurat@meta.data[which(seurat@active.ident==clust),"percentMito"])
+  
+  
+  clustSig <- lapply(signatures,testHyperSig3,seurat,markers,clust) 
+  
+  clustInfo <- c(clustInfo,clustSig)
+  
+}
+
+#allSignatures <- c(signatures,signaturesRodriguez)
+
+clust_list <- lapply(levels(unique(seurat@active.ident)),getClustInfo,signatures,seurat,markers)
+
+names(clust_list) <- paste("cluster_",levels(unique(seurat@active.ident)),sep="")
+
+saveRDS(clust_list,paste(opt$outdir,"/clust_list_save.rds",sep =""))
+
+clust_table <- as.data.frame(matrix(unlist(clust_list), nrow=length(unlist(clust_list[1]))))
+colnames(clust_table) <- names(clust_list)
+rownames(clust_table) <- names(clust_list[[1]])
+
+
+clust_df <- as.data.frame(t(clust_table))
+
+write.table(x = clust_df,file = paste(opt$outdir,"/clusters_table.tsv",sep =""),sep="\t",quote=F,col.names = NA)
+
+saveRDS(seurat,file = paste(opt$outdir,"/seurat.rds",sep = ""))
+
+
+
--- a/R_src/Seurat4_integration.R
+++ b/R_src/Seurat4_integration.R
+##------------------------------------------------------------------------------
+## L. Herault
+##------------------------------------------------------------------------------
+
+## -----------------------------------------------------------------------------
+## Libraries
+## -----------------------------------------------------------------------------
+
+#suppressMessages(library(monocle))
+suppressMessages(library(Seurat))
+suppressMessages(library(BiocParallel))
+suppressMessages(library(getopt))
+suppressMessages(library(gridExtra))
+#suppressMessages(library(gProfileR))
+suppressMessages(library(RColorBrewer))
+suppressMessages(library(readxl))
+suppressMessages(library(stringr))
+suppressMessages(library(scales))
+suppressMessages(library(cowplot))
+suppressMessages(library(sctransform))
+suppressMessages(library(stringr))
+suppressMessages(library(plyr))
+suppressMessages(library(grid))
+suppressMessages(library(ggplot2))
+
+
+
+source("R_src/getSigPerCells.R")
+source("R_src/Enrichment.R")
+source("R_src/funForSeurat.R")
+
+
+# Seurat 3 integration workflow
+
+## -----------------------------------------------------------------------------
+## Command line args
+## -----------------------------------------------------------------------------
+
+spec = matrix(c(
+  'help',        'h', 0, "logical",   "Help about the program",
+  'inputFiles',  'i', 1, "character", "REQUIRED: 10X dataset paths prepared as seurat object (.RDS generated by prepare_data.R) separated by +",
+  'signaturesFile', 's',1, "character", "REQUIRED: signatures rds file",
+  'outdir',     'o',1, "character", 'Outdir path (default ./)',
+  "num_dim_CCA", 'q',1,"numeric", "First n dimensions of CCA to use for FindIntegrationAnchors Seurat function (15 by default)",
+  "num_dim_weight", 'w',1,"numeric", "First n dimensions to use for IntegrateData Seurat function (15 by default)",
+  "num_dim",      'n',1,"numeric", "First n dimensions of PCA to use for clustering, UMAP and TSNE (15 by default)",
+  "num_dim_integrated",'N',1,"numeric", "Number of PCA dimension computed to analyse integrated data (40 by default)",
+  "cores",        'c',1, "numeric", "Number of cores to use for ordering (for differencially expressed gene between clusters test)",
+  "resolution",    'r',1, "numeric", "resolution for smp.combined clustering",
+  "correction",      "b",1,"character", "Covariable to use as blocking factor (eg one or several columns of pData: betch, cell cycle phases... separated by +)",
+  "logfc_threshold", "l", 1, "numeric", "logfc threshold for finding cluster markers (1 cluster vs all deg) 0.25 by default",
+  "rodriguezSig", "k", 1, "character", "path for xls file of Rodriguez result",
+  "norm_method", "z",1, "character", "normalization method, logNorm (by default) or sctransform",
+  "reusePca", "p", 0, "character","re use pca calculated before when caculating anchor weights for each dataset default to FALSE (permit to correct for cell cycle before integration)"
+), byrow=TRUE, ncol=5);
+
+opt = getopt(spec)
+
+# if help was asked, print a friendly message
+# and exit with a non-zero error code
+
+args <- commandArgs()
+if ( !is.null(opt$help) | is.null(opt$inputFiles)) {
+  cat("Perform Seurat 3 integration workflow, then cluster the cell with seurat 3 at different resolutions")
+  cat(getopt(spec, usage=TRUE))
+  q(status=1)
+}
+
+#set default arguments values
+
+if (is.null(opt$resolution)) {
+  opt$resolution <- 0.6
+}
+
+if (is.null(opt$logfc_threshold)) {
+  opt$logfc_threshold <- 0.25
+}
+
+if (is.null(opt$num_dim_CCA)) {
+  opt$num_dim_CCA <- 15
+}
+
+if (is.null(opt$num_dim_weight)) {
+  opt$num_dim_weight <- 15
+}
+
+if (is.null(opt$num_dim)) {
+  opt$num_dim <- 15
+}
+
+if (is.null(opt$num_dim_integrated)) {
+  opt$num_dim_integrated <- 40
+}
+
+if (is.null(opt$norm_method)) {
+  opt$norm_method <- "logNorm"
+}
+
+if (is.null(opt$reusePca)) {
+  opt$reusePca <- FALSE
+}
+
+print(opt$reusePca)
+
+dir.create(opt$outdir,recursive = T,showWarnings = F)
+
+
+corrections <- strsplit(x = opt$correction,split = "\\+")[[1]]
+
+
+dir.create(opt$outdir,recursive = T,showWarnings = F)
+
+
+smp.listFile <- strsplit(opt$inputFiles,split = "\\+")[[1]]
+
+smp.list <- list()
+#load dataset 
+
+for (i in 1:length(x = smp.listFile)) {
+  #For testing, in final workflow sampleName will be incorporated in metadata with the loading of cell ranger matrix
+  smp.list[[i]] <- readRDS(smp.listFile[i])
+  sampleName <- unique(smp.list[[i]]@meta.data$sampleName)
+  smp.list[[i]] <- RenameCells(smp.list[[i]],add.cell.id = sampleName)
+  
+  if(opt$norm_method != "sctransform") {
+    smp.list[[i]] <- NormalizeData(object = smp.list[[i]], verbose = FALSE)
+    smp.list[[i]] <- FindVariableFeatures(object = smp.list[[i]], selection.method = "vst", 
+                                           nfeatures = 2000, verbose = FALSE)
+  } else {
+    smp.list[[i]] <- SCTransform(smp.list[[i]],vars.to.regress = corrections,verbose = T)
+  }
+}
+
+
+smp.anchors <- FindIntegrationAnchors(object.list = smp.list, dims = 1:opt$num_dim_CCA)
+
+if(opt$reusePca) {
+  print("Re use pca")
+  smp.combined <- IntegrateData(anchorset = smp.anchors, weight.reduction = "pca", dims = 1:opt$num_dim_weight)
+}
+
+smp.combined <- IntegrateData(anchorset = smp.anchors, dims = 1:opt$num_dim_weight)
+
+
+# Run the standard workflow for visualization and clustering
+#Here certainly need to reuse SCTransform on combined data if norm_method = sct
+
+
+classicSeuratWorkflow <- function(smp.combined, corrections,outdir) {
+  dir.create(outdir,recursive = T)
+  smp.combined <- ScaleData(object = smp.combined, verbose = T,vars.to.regress = corrections)
+  smp.combined <- RunPCA(object = smp.combined, npcs = opt$num_dim_integrated, verbose = FALSE)
+  
+  
+  png(paste(outdir,"/ElbowPlot.png",sep =""))
+  ElbowPlot(object = smp.combined,ndims = opt$num_dim_integrated)
+  dev.off()
+  
+  
+  # t-SNE UMAP and Clustering
+  smp.combined <- RunUMAP(object = smp.combined, reduction = "pca", dims = 1:opt$num_dim)
+  smp.combined <- RunTSNE(object = smp.combined, reduction = "pca", dims = 1:opt$num_dim)
+  smp.combined <- FindNeighbors(object = smp.combined, reduction = "pca", dims = 1:opt$num_dim)
+  smp.combined <- FindClusters(smp.combined, resolution = c(0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1,1.2))
+  
+  colPrefix <- paste0(DefaultAssay(smp.combined),"_snn_res.")
+  
+  Idents(smp.combined) <- paste(colPrefix,opt$resolution,sep = "")
+  
+  smp.combined@meta.data$AGE <- "Old"
+  smp.combined@meta.data$AGE[which(smp.combined@meta.data$age == "2_months")] <- "Young"
+  
+  
+  
+  # Visualization of the samples
+  p1 <- DimPlot(object = smp.combined, reduction = "umap", group.by = "sampleName")
+  p2 <- DimPlot(object = smp.combined, reduction = "umap", label = TRUE)
+  p3 <- DimPlot(object = smp.combined, reduction = "umap", group.by = "AGE")
+  p4 <- DimPlot(object = smp.combined, reduction = "umap", group.by = "runDate")
+  p3tsne <- DimPlot(object = smp.combined, reduction = "tsne", group.by = "AGE")
+  p4tsne <- DimPlot(object = smp.combined, reduction = "tsne", group.by = "runDate")
+  
+  
+  
+  png(paste(outdir,"/umap_samples.png",sep =""),height = 800,width=1200)
+  grid.arrange(p1, p2,ncol = 2)
+  dev.off()
+  
+  png(paste(outdir,"/AGE.png",sep =""),height = 800,width=1200)
+  grid.arrange(p3, p3tsne,ncol = 2)
+  dev.off()
+  
+  png(paste(outdir,"/runDate.png",sep =""),height = 800,width=1200)
+  grid.arrange(p4, p4tsne,ncol = 2)
+  dev.off()
+  
+  png(paste(outdir,"/UmapClusters.png",sep =""),height = 800,width=1200)
+  DimPlot(object = smp.combined, reduction = "umap", label = TRUE)
+  dev.off()
+  
+  
+  p1tsne <- DimPlot(object = smp.combined, reduction = "tsne", group.by = "sampleName")
+  p2tsne <- DimPlot(object = smp.combined, reduction = "tsne", label = TRUE)
+  
+  png(paste(outdir,"/tsne_samples.png",sep =""),height = 800,width=1000)
+  grid.arrange(p1tsne, p2tsne,ncol = 2)
+  dev.off()
+  
+  #Visualistaoin one sample at a time
+  sampleNames <- unique(smp.combined@meta.data$sampleName) 
+  dir.create(paste(outdir,"/samples/umap/",sep = ""),recursive = T)
+  dir.create(paste(outdir,"/samples/tsne/",sep = ""),recursive = T)
+  
+  #samplePlotList <- list()
+  for (s in sampleNames) {
+    png(paste(outdir,"/samples/umap/",s,".png",sep = ""))
+    plot(DimPlot(object = smp.combined, cells= grep(s, smp.combined@meta.data$sampleName), reduction = "umap") + ggtitle(s))
+    dev.off()
+    png(paste(outdir,"/samples/tsne/",s,".png",sep = ""))
+    plot(DimPlot(object = smp.combined, cells= grep(s, smp.combined@meta.data$sampleName), reduction = "tsne") + ggtitle(s))
+    dev.off()
+  }
+  
+  
+  umapListRes <- list()
+  tsneListRes <- list()
+  for (r in c(0.6,0.8,1,1.2)) {
+    umapListRes[[as.character(r)]] <- DimPlot(smp.combined,
+                                              reduction = "umap",
+                                              label = T,
+                                              group.by = paste(colPrefix,r,sep="")) + 
+      NoLegend() +
+      ggtitle(paste("res",r))
+    
+    tsneListRes[[as.character(r)]] <- DimPlot(smp.combined,
+                                              reduction = "tsne",
+                                              label = T,
+                                              group.by = paste(colPrefix,r,sep="")) + 
+      NoLegend() +
+      ggtitle(paste("res",r))
+  }
+  
+  png(paste(outdir,"/tsne_different_res.png",sep = ""),height = 800,width = 800)
+  grid.arrange(tsneListRes[[1]],tsneListRes[[2]],tsneListRes[[3]],tsneListRes[[4]])
+  dev.off()
+  
+  png(paste(outdir,"/umap_different_res.png",sep = ""),height = 800,width = 800)
+  grid.arrange(umapListRes[[1]],umapListRes[[2]],umapListRes[[3]],umapListRes[[4]])
+  dev.off()
+  
+  
+  
+  smp.combined@meta.data$numclust <- smp.combined@meta.data[,paste(colPrefix,opt$resolution,sep = "")]
+  
+  
+  #Check for unwanted source of variation UMAP
+  pPhases <- DimPlot(smp.combined,group.by = "phases")
+  if (!is.null(smp.combined@meta.data$predicted)) {
+    pPred <- DimPlot(smp.combined,group.by = "predicted")
+  } else {
+    pPred <-  DimPlot(smp.combined)
+  }
+  pUMI <- FeaturePlot(smp.combined, "nCount_RNA")
+  pMito <- FeaturePlot(smp.combined, "percentMito")
+  
+  png(paste(outdir,"/umap_factors.png",sep = ""),height = 800,width = 800)
+  grid.arrange(pPhases,pPred,pUMI,pMito)
+  dev.off()
+  
+  #Check for unwanted source of variation tSNE
+  pPhases <- DimPlot(smp.combined,group.by = "phases",reduction = "tsne")
+  if (!is.null(smp.combined@meta.data$predicted)) {
+    pPred <- DimPlot(smp.combined,group.by = "predicted",reduction = "tsne")
+  } else {
+    pPred <-  DimPlot(smp.combined,reduction = "tsne")
+  }
+  pUMI <- FeaturePlot(smp.combined, "nCount_RNA",reduction = "tsne")
+  pMito <- FeaturePlot(smp.combined, "percentMito",reduction = "tsne")
+  
+  png(paste(outdir,"/tsne_factors.png",sep = ""),height = 800,width = 800)
+  grid.arrange(pPhases,pPred,pUMI,pMito)
+  dev.off()
+  return(smp.combined)
+}
+
+print("Seurat workflow without integration")
+DefaultAssay(object = smp.combined) <- "RNA"
+smp.combined <- NormalizeData(object = smp.combined)
+smp.combined <- FindVariableFeatures(object = smp.combined,selection.method = "vst", nfeatures = 2000, verbose = T)
+
+if (!is.null(corrections)) {
+  print("Seurat workflow without correction")
+  classicSeuratWorkflow(smp.combined,correction = NULL,outdir = paste0(opt$outdir,"/SeuratMergingWoIntegrationWoCr/"))
+}
+
+classicSeuratWorkflow(smp.combined,corrections = corrections,outdir = paste0(opt$outdir,"/SeuratMergingWoIntegration/"))
+
+
+
+print("Seurat workflow with integration")
+
+DefaultAssay(object = smp.combined) <- "integrated"
+
+
+if (!is.null(corrections)) {
+  print("Seurat workflow without correction")
+  classicSeuratWorkflow(smp.combined,correction = NULL,outdir = paste0(opt$outdir,"/SeuratWithoutCr/"))
+}
+
+smp.combined <- classicSeuratWorkflow(smp.combined,corrections = corrections,outdir =opt$outdir)
+
+
+
+#AGE prop for each cluster 
+age_clust <- getAGEPropPerClustBarplot(smp.combined)
+#sample prop for each cluster
+sample_clust <- getSamplePropPerClustBarplot(smp.combined)
+#runDate prop for each cluster
+runDate_clust <- getRunDatePropPerClustBarplot(smp.combined)
+
+png(paste(opt$outdir,"/propPerClust.png",sep = ""))
+grid.arrange(age_clust,sample_clust,runDate_clust,
+             DimPlot(object = smp.combined, reduction = "tsne", label = TRUE))
+dev.off()
+
+
+## saving results
+
+saveRDS(smp.combined,paste(opt$outdir,"/combined.rds",sep =""))
+
+
+
+
+
+
--- a/R_src/SignacMotifAnnotCL.R
+++ b/R_src/SignacMotifAnnotCL.R
+#-----------------------------------------------------#
+#           Signac Motif analysis tutorial
+#-----------------------------------------------------#
+
+suppressMessages(library(getopt))
+suppressMessages(library(Signac))
+suppressMessages(library(Seurat))
+suppressMessages(library(JASPAR2020))
+suppressMessages(library(TFBSTools))
+suppressMessages(library(BSgenome.Mmusculus.UCSC.mm10))
+suppressMessages(library(ggplot2))
+# library(patchwork)
+
+
+##-------------------------------------------------------------------------##
+## Option list
+##-------------------------------------------------------------------------##
+
+spec = matrix(c(
+  'help',        'h', 0, "logical",   "Help about the program",
+  'inputRDS',  'i', 1, "character", "REQUIRED: 10X data prepared as seurat/signac object.",
+  'outdir',     'o',1, "character", 'Outdir path (default ./)',
+  'species', 's', 1, "numeric", "Species ID number for the motif DB, human = 9606, mouse = 10090 (default human)",
+  'assay', 'a', 1, "character", "Name of the assay that contain the atac data (default 'peaks')",
+  'cluster', 'c', 1, "character", "Name of the cluster identity for the markers",
+  'IdentName', 'n', 1, "character", "Idents new name '+' separated",
+  'IdentLevel', 'l', 1, "character", "Levels of the new Ident '+' separated",
+  'integrated', 'I', 1, "logical", "Is the RDS an integrated dataset or not"
+), byrow=TRUE, ncol=5);
+
+opt = getopt(spec)
+
+# if help was asked, print a friendly message
+# and exit with a non-zero error code
+
+args <- commandArgs()
+if ( !is.null(opt$help) | is.null(opt$inputRDS)) {
+  cat("Help message")
+  cat(getopt(spec, usage=TRUE))
+  q(status=1)
+}
+
+if (is.null(opt$outdir)) {
+  opt$outdir = "./"
+}
+
+if (is.null(opt$species)){
+  opt$species <- 9606
+}
+
+if(is.null(opt$assay)){
+  opt$assay <- "peaks"
+}
+
+# opt$inputRDS <- "/shared/projects/scRNA_HSPC_Aging/GMP_PRARA/output/TransferRnaAtac/coembedAll.rds"
+# opt$IdentName <- "Neu1+Neu3+Neu2+NeuRA1+Rep+NeuRA2"
+# opt$IdentLevel <- "Rep+Neu1+Neu2+Neu3+NeuRA1+NeuRA2"
+# opt$species <- 9606
+# opt$cluster <- "numclust"
+
+
+defAssay <- opt$assay
+
+outdir <- opt$outdir
+
+dir.create(outdir, recursive = T)
+
+#------------------------------------#
+#      Load the motif dataBase
+#------------------------------------#
+print("Load JASPAR DB")
+# Get a list of motif position frequency matrices from the JASPAR database
+# species = 9606 is human species id while the tutorial is using mice dataset
+pfm <- getMatrixSet(
+  x = JASPAR2020,
+  opts = list(species = opt$species, all_versions = FALSE)
+)
+
+#------------------------------------#
+#  Read seurat and prepare the data
+#------------------------------------#
+
+seurat <- readRDS(opt$inputRDS)
+
+DefaultAssay(seurat) <- defAssay
+
+Idents(seurat) <- opt$cluster
+
+print("Entering analysis loop")
+print(paste("opt$integrated == ", opt$integrated, sep = ""))
+if(opt$integrated == TRUE){
+  # Prepare data 
+  # Rename the cluster with real name for an easier analysis
+  new.ident.name <- strsplit(opt$IdentName, split = "\\+")[[1]]
+  
+  names(new.ident.name) <- levels(seurat)
+  
+  seurat <- RenameIdents(seurat, new.ident.name)
+  seurat@meta.data$FinalCluster <- Idents(seurat)
+  
+  ident_levels <- strsplit(opt$IdentLevel, split = "\\+")[[1]]
+  
+  seurat@meta.data$FinalCluster <- factor(seurat@meta.data$FinalCluster, levels = ident_levels)
+  
+  Idents(seurat) <- "FinalCluster"
+  
+  # Subset only ATAC data
+  Idents(seurat) <- "condType"
+  seurat_atac <- subset(seurat, idents = grep(pattern = "ATAC", x = unique(Idents(seurat)), value = T))
+  seurat_atac[[defAssay]] <-  as.ChromatinAssay(seurat_atac[[defAssay]], seqinfo = "mm10")
+  
+  Idents(seurat_atac) <- "FinalCluster"
+
+  # Adding motif information
+  seurat_atac <- AddMotifs(object = seurat_atac, genome = BSgenome.Mmusculus.UCSC.mm10, pfm = pfm)
+  
+  motif_correl <- seurat_atac@assays$peaks@motifs@motif.names 
+  motif_correl.df <- data.frame(do.call("rbind", motif_correl))
+  names(motif_correl.df)[1] <- "Motif_Name"
+  motif_correl.df$Motif_ID <- rownames(motif_correl.df)
+  
+  write.table(x = motif_correl.df, file = paste0(outdir, "/motif_name_table.tsv"), sep = "\t", row.names = F, col.names = T, quote = F)
+  
+  #----------------------------#
+  #  Method 1 : Find DA region
+  #----------------------------#
+  # dir.create(paste0(outdir, "/Overrep_Motif"), recursive = T)
+  # print("Starting method 1")
+  # ## Treatment impact
+  # dir.create(paste0(outdir, "/Overrep_Motif/Treatment"), recursive = T)
+  # # FindMarkers using Logistic regression
+  # Idents(seurat_atac) <- "condition"
+  # 
+  # da_peaks_treatment <- FindMarkers(seurat_atac, ident.1 = "Treated", ident.2 = "Control", only.pos = T, test.use = "LR", latent.vars = "nCount_peaks")
+  # da_peaks_treatment.sig <- rownames(da_peaks_treatment[da_peaks_treatment$p_val < 0.005, ])
+  # 
+  # # Calculate a motif score for each of the most DA region
+  # enriched_motifs_treatment <- FindMotifs(object = seurat_atac, features = da_peaks_treatment.sig)
+  # motif_plot_treatment <- MotifPlot(seurat_atac, head(enriched_motifs_treatment$motif, 6))
+  # 
+  # ggsave(paste0(outdir, "/Overrep_Motif/Treatment", "/Top_motif_treatment", '.png'), plot = motif_plot_treatment, device = 'png', path = NULL,
+  #        scale = 1, width = 20, height = 15, units = 'cm', dpi = 300)
+  # 
+  # write.table(da_peaks_treatment, file = paste0(outdir, "/Overrep_Motif/Treatment", "/DA_genomic_regions_treatment", ".tsv"), 
+  #             sep = "\t", col.names = T, row.names = T, quote = F)
+  # write.table(enriched_motifs_treatment, file = paste0(outdir, "/Overrep_Motif/Treatment", "/Motif_enrichment_treatment", ".tsv"), 
+  #             sep = "\t", col.names = T, row.names = T, quote = F)
+  # print("Treatment analysis done")
+  # 
+  # ## Cluster impact
+  # dir.create(paste0(outdir, "/Overrep_Motif/Cluster"), recursive = T)
+  # 
+  # Idents(seurat_atac) <- "FinalCluster"
+  # 
+  # print(unique(seurat_atac@meta.data$FinalCluster))
+  # # BiocParallel::register(BiocParallel::SerialParam())
+  # # da_peaks_cluster <- FindAllMarkers(seurat_atac, only.pos = T, test.use = "LR", latent.vars = "nCount_peaks")
+  # da_peaks_cluster.list <- list()
+  # for(cluster in unique(seurat_atac$FinalCluster)){
+  #   print(paste("Find markers cluster", cluster, sep = " "))
+  #   if(nrow(seurat_atac@meta.data[seurat_atac$FinalCluster %in% cluster,]) >3){
+  #     da_peaks_cluster.list[[cluster]] <- FindMarkers(seurat_atac, ident.1 = cluster, only.pos = T, test.use = "LR", latent.vars = "nCount_peaks")
+  #     da_peaks_cluster.list[[cluster]]$Cluster <- cluster
+  #     da_peaks_cluster.list[[cluster]]$peaks <- rownames(da_peaks_cluster.list[[cluster]])
+  #   } else{
+  #     print("Not enough cells")
+  #   }
+  # }
+  # 
+  # da_peaks_cluster.db <- do.call("rbind", da_peaks_cluster.list)
+  # print("FindMarkers cluster done")
+  # 
+  # enriched_motifs_cluster.list <- list()
+  # for(cluster in unique(da_peaks_cluster.db$Cluster)){
+  #   print(paste(cluster, "FindMotif", sep = " "))
+  #   da_peaks_cluster.sig <- da_peaks_cluster.db[which(da_peaks_cluster.db$p_val < 0.005 & da_peaks_cluster.db$Cluster == cluster),]
+  #   da_peaks_cluster.sig.name <- da_peaks_cluster.sig$peaks
+  #   enriched_motifs_cluster.list[[paste0(cluster, "_motifs")]] <- FindMotifs(seurat_atac, features = da_peaks_cluster.sig.name)
+  #   enriched_motifs_cluster.list[[paste0(cluster, "_motifs")]]$Cluster <- cluster
+  #   
+  #   # motif_plot <- MotifPlot(atac_small, motifs = head(enriched_motifs_cluster.list[[paste0(cluster, "_motifs")]]$motif, 6))
+  #   # ggsave(paste0(outdir, "/Overrep_Motif/Cluster", "/Top_motif_cluster_", cluster, '.png'), plot = motif_plot, device = 'png', path = NULL,
+  #   #        scale = 1, width = 20, height = 15, units = 'cm', dpi = 300)
+  # }
+  # enriched_motifs_cluster.df <- do.call("rbind", enriched_motifs_cluster.list)
+  # 
+  # write.table(da_peaks_cluster.db, file = paste0(outdir, "/Overrep_Motif/Cluster", "/Motif_enrichment_cluster", ".tsv"), 
+  #             sep = "\t", col.names = T, row.names = T, quote = F)
+  # write.table(enriched_motifs_cluster.df, file = paste0(outdir, "/Overrep_Motif/Cluster", "/Motif_enrichment_cluster", ".tsv"), 
+  #             sep = "\t", col.names = T, row.names = T, quote = F)
+  # print("cluster impact done")
+  #---------------------------------------#
+  #  Method 2 : Computing motif activity
+  #---------------------------------------#
+  dir.create(paste0(outdir, "/Motif_activity"), recursive = T)
+  
+  print("Starting method 2")
+  # Computing motif per cell motif activity score
+  # Try this for error with multicore
+  BiocParallel::register(BiocParallel::SerialParam())
+  seurat_atac <- RunChromVAR(object = seurat_atac, genome = BSgenome.Mmusculus.UCSC.mm10)
+  
+  DefaultAssay(seurat_atac) <- "chromvar"
+  print("End chromvar")
+  
+  # Find differential activity in motif per treatment
+  dir.create(paste0(outdir, "/Motif_activity/Treatment"), recursive = T)
+  print("Starting treatment analysis")
+  
+  Idents(seurat_atac) <- "condition"
+  diff_act_treatment <- FindMarkers(object = seurat_atac, ident.1 = "Treated", ident.2 = "Control", test.use = "LR", latent.vars = "nCount_peaks")
+  diff_act_treatment$Motif_Name <- motif_correl.df[rownames(diff_act_treatment), "Motif_Name"]
+  
+  top_motif_treatment <- MotifPlot(object = seurat_atac, motifs = head(rownames(diff_act_treatment), 6), assay = "peaks")
+  
+  write.table(diff_act_treatment, file = paste0(outdir, "/Motif_activity/Treatment", "/Motif_enrich_treatment", ".tsv"), 
+              sep = "\t", row.names = T, col.names = T, quote = F)
+  
+  ggsave(paste0(outdir, "/Motif_activity/Treatment", "/Top_motif_treatment", '.png'), plot = top_motif_treatment, device = 'png', path = NULL,
+         scale = 1, width = 20, height = 15, units = 'cm', dpi = 300)
+  print("End treatment analysis")
+  
+  # Find differential activity in motif per cluster
+  dir.create(paste0(outdir, "/Motif_activity/Cluster"), recursive = T)
+  
+  print("Starting cluster analysis")
+  Idents(seurat_atac) <- "FinalCluster"
+  diff_act_cluster <- FindAllMarkers(object = seurat_atac, only.pos = T, test.use = "LR", latent.vars = "nCount_peaks")
+  
+  for(cluster in unique(diff_act_cluster$cluster)){
+    top_motif <- diff_act_cluster[diff_act_cluster$cluster %in% cluster,]
+    top_motif.name <- head(top_motif$gene, 6)
+    
+    top_motif.plot <- MotifPlot(object = seurat_atac, motifs = top_motif.name, assay = "peaks")
+    ggsave(paste0(outdir, "/Motif_activity/Cluster", "/Top_motif_cluster_", cluster, '.png'), plot = top_motif.plot, device = 'png', path = NULL,
+           scale = 1, width = 20, height = 15, units = 'cm', dpi = 300)
+  }
+  
+
+  
+  # rownames(diff_act_cluster) <- diff_act_cluster$gene
+  print("Adding Motif_name to table output")
+  diff_act_cluster.list <- list()
+  for(cluster in unique(diff_act_cluster$cluster)){
+    diff_act_cluster.temp <- diff_act_cluster[diff_act_cluster$cluster %in% cluster,]
+    rownames(diff_act_cluster.temp) <- diff_act_cluster.temp$gene
+    diff_act_cluster.temp$Motif_Name <- motif_correl.df[rownames(diff_act_cluster.temp), "Motif_Name"]
+    diff_act_cluster.list[[cluster]] <- diff_act_cluster.temp
+  }
+  diff_act_cluster.final <- do.call("rbind", diff_act_cluster.list)
+  
+  write.table(diff_act_cluster.final, file = paste0(outdir, "/Motif_activity/Cluster", "/Motif_enrich_cluster", ".tsv"), 
+              sep = "\t", row.names = T, col.names = T, quote = F)
+  print("End cluster analysis")
+  
+  # Analysing motif markers of difference between pop IN a cluster (ex : Clus1_Treated vs Clust1_Control)
+  print("Starting the treatment per cluster analysis")
+  dir.create(paste0(outdir, "/Motif_activity/Treatment_per_clust"), recursive = T)
+  
+  seurat_atac$cluster_cond <- paste(seurat_atac$condition, seurat_atac$FinalCluster, sep = "_")
+  Idents(seurat_atac) <- "cluster_cond"
+
+  diff_act_cluster_cond.list <- list()
+  for(cluster in unique(seurat_atac$FinalCluster)){
+    print(paste("Treated_vs_Control", cluster, sep = "_"))
+    
+    if(nrow(seurat_atac@meta.data[seurat_atac$cluster_cond %in% paste("Control", cluster, sep = "_"),]) >3 && 
+       nrow(seurat_atac@meta.data[seurat_atac$cluster_cond %in% paste("Treated", cluster, sep = "_"),])){
+      print("Enough cells for analysis")
+      diff_act_cluster_cond.list[[paste("Treated_vs_Control", cluster, sep = "_")]] <- FindMarkers(object = seurat_atac, ident.1 = paste("Treated", cluster, sep = "_"),
+                                                                                                   ident.2 = paste("Control", cluster, sep = "_"),
+                                                                                                   test.use = 'LR', latent.vars = 'nCount_peaks')
+      
+      diff_act_cluster_cond.list[[paste("Treated_vs_Control", cluster, sep = "_")]]$Comparison <- "Treated_vs_Control"
+      diff_act_cluster_cond.list[[paste("Treated_vs_Control", cluster, sep = "_")]]$Cluster <- cluster
+      diff_act_cluster_cond.list[[paste("Treated_vs_Control", cluster, sep = "_")]]$Motif_ID <- rownames(diff_act_cluster_cond.list[[paste("Treated_vs_Control", cluster, sep = "_")]])
+      diff_act_cluster_cond.list[[paste("Treated_vs_Control", cluster, sep = "_")]]$Motif_Name <- motif_correl.df[rownames(diff_act_cluster_cond.list[[paste("Treated_vs_Control", cluster, sep = "_")]]), "Motif_Name"]
+    }else{
+      print("Not enough cells")
+    }
+  }
+  diff_act_cluster_cond.df <- do.call("rbind", diff_act_cluster_cond.list)
+  
+  print("Plotting top motif for treated and control per cluster")
+  for(cluster in unique(diff_act_cluster_cond.df$Cluster)){
+    top_motif <- diff_act_cluster_cond.df[diff_act_cluster_cond.df$Cluster %in% cluster,]
+    top_motif.name <- head(top_motif$Motif_ID, 6)
+    bot_motif.name <- tail(top_motif$Motif_ID, 6)
+    
+    top_motif.plot <- MotifPlot(object = seurat_atac, motifs = top_motif.name, assay = "peaks")
+    bot_motif.plot <- MotifPlot(object = seurat_atac, motifs = bot_motif.name, assay = "peaks")
+    ggsave(paste0(outdir, "/Motif_activity/Treatment_per_clust", "/Top_motif_treated_cluster_", cluster, '.png'), plot = top_motif.plot, device = 'png', path = NULL,
+           scale = 1, width = 20, height = 15, units = 'cm', dpi = 300)
+    ggsave(paste0(outdir, "/Motif_activity/Treatment_per_clust", "/Top_motif_control_cluster_", cluster, '.png'), plot = bot_motif.plot, device = 'png', path = NULL,
+           scale = 1, width = 20, height = 15, units = 'cm', dpi = 300)
+  }
+  
+  write.table(diff_act_cluster_cond.df, file = paste0(outdir, "/Motif_activity/Treatment_per_clust", "/Motif_enrich_cluster", ".tsv"), 
+              sep = "\t", row.names = T, col.names = T, quote = F)
+  print("End of the treatment per cluster analysis")
+  
+  saveRDS(seurat_atac, file = paste0(outdir, "/atac_motif", '.rds'))
+  
+} else if(opt$integrated == FALSE){
+  print("Only computing motif activity score")
+  
+  # Adding motif information
+  seurat <- AddMotifs(object = seurat, genome = BSgenome.Mmusculus.UCSC.mm10, pfm = pfm)
+  
+  motif_correl <- seurat@assays$peaks@motifs@motif.names 
+  motif_correl.df <- data.frame(do.call("rbind", motif_correl))
+  names(motif_correl.df)[1] <- "Motif_Name"
+  motif_correl.df$Motif_ID <- rownames(motif_correl.df)
+  
+  write.table(x = motif_correl.df, file = paste0(outdir, "/motif_name_table.tsv"), sep = "\t", row.names = F, col.names = T, quote = F)
+  
+  # Calculate motif score
+  BiocParallel::register(BiocParallel::SerialParam())
+  seurat <- RunChromVAR(object = seurat, genome = BSgenome.Mmusculus.UCSC.mm10)
+  
+  saveRDS(seurat, file = paste0(outdir, "/atac_motif", ".rds"))
+}
+
+
--- a/R_src/SignacMotifAnnotCL_v2.R
+++ b/R_src/SignacMotifAnnotCL_v2.R
+#-----------------------------------------------------#
+#           Signac Motif analysis tutorial
+#-----------------------------------------------------#
+
+suppressMessages(library(getopt))
+suppressMessages(library(Signac))
+suppressMessages(library(Seurat))
+suppressMessages(library(JASPAR2020))
+suppressMessages(library(TFBSTools))
+suppressMessages(library(BSgenome.Mmusculus.UCSC.mm10))
+suppressMessages(library(ggplot2))
+# library(patchwork)
+
+
+##-------------------------------------------------------------------------##
+## Option list
+##-------------------------------------------------------------------------##
+
+spec = matrix(c(
+  'help',        'h', 0, "logical",   "Help about the program",
+  'inputRDS',  'i', 1, "character", "REQUIRED: 10X data prepared as seurat/signac object.",
+  'outdir',     'o',1, "character", 'Outdir path (default ./)',
+  'species', 's', 1, "numeric", "Species ID number for the motif DB, human = 9606, mouse = 10090 (default human)",
+  'assay', 'a', 1, "character", "Name of the assay that contain the atac data (default 'peaks')",
+  'cluster', 'c', 1, "character", "Name of the cluster identity for the markers",
+  'IdentName', 'n', 1, "character", "Idents new name '+' separated",
+  'IdentLevel', 'l', 1, "character", "Levels of the new Ident '+' separated",
+  'integrated', 'I', 1, "logical", "Is the RDS an integrated dataset or not"
+), byrow=TRUE, ncol=5);
+
+opt = getopt(spec)
+
+# if help was asked, print a friendly message
+# and exit with a non-zero error code
+
+args <- commandArgs()
+if ( !is.null(opt$help) | is.null(opt$inputRDS)) {
+  cat("Help message")
+  cat(getopt(spec, usage=TRUE))
+  q(status=1)
+}
+
+if (is.null(opt$outdir)) {
+  opt$outdir = "./"
+}
+
+if (is.null(opt$species)){
+  opt$species <- 9606
+}
+
+if(is.null(opt$assay)){
+  opt$assay <- "peaks"
+}
+
+# opt$inputRDS <- "/shared/projects/scRNA_HSPC_Aging/GMP_PRARA/output/TransferRnaAtac/coembedAll.rds"
+# opt$IdentName <- "Neu2+Rep+Neu3+NeuRA2+NeuRA1+Neu1"
+# opt$IdentLevel <- "Rep+Neu1+Neu2+Neu3+NeuRA1+NeuRA2"
+# opt$species <- 9606
+# opt$cluster <- "predicted.id"
+
+
+defAssay <- opt$assay
+
+outdir <- opt$outdir
+
+dir.create(outdir, recursive = T)
+
+#------------------------------------#
+#      Load the motif dataBase
+#------------------------------------#
+print("Load JASPAR DB")
+# Get a list of motif position frequency matrices from the JASPAR database
+# species = 9606 is human species id while the tutorial is using mice dataset
+pfm <- getMatrixSet(
+  x = JASPAR2020,
+  opts = list(species = opt$species, all_versions = FALSE)
+)
+
+#------------------------------------#
+#  Read seurat and prepare the data
+#------------------------------------#
+
+seurat <- readRDS(opt$inputRDS)
+
+DefaultAssay(seurat) <- defAssay
+
+Idents(seurat) <- opt$cluster
+
+print("Entering analysis loop")
+print(paste("opt$integrated == ", opt$integrated, sep = ""))
+if(opt$integrated == TRUE){
+  # Prepare data 
+  # Rename the cluster with real name for an easier analysis
+  new.ident.name <- strsplit(opt$IdentName, split = "\\+")[[1]]
+  
+  names(new.ident.name) <- levels(seurat)
+  
+  seurat <- RenameIdents(seurat, new.ident.name)
+  seurat@meta.data$FinalCluster <- Idents(seurat)
+  
+  ident_levels <- strsplit(opt$IdentLevel, split = "\\+")[[1]]
+  
+  seurat@meta.data$FinalCluster <- factor(seurat@meta.data$FinalCluster, levels = ident_levels)
+  
+  Idents(seurat) <- "FinalCluster"
+  
+  seurat_atac <- seurat
+  # Adding motif information
+  seurat_atac <- AddMotifs(object = seurat_atac, genome = BSgenome.Mmusculus.UCSC.mm10, pfm = pfm)
+  
+  motif_correl <- seurat_atac@assays$peaks@motifs@motif.names 
+  motif_correl.df <- data.frame(do.call("rbind", motif_correl))
+  names(motif_correl.df)[1] <- "Motif_Name"
+  motif_correl.df$Motif_ID <- rownames(motif_correl.df)
+  
+  write.table(x = motif_correl.df, file = paste0(outdir, "/motif_name_table.tsv"), sep = "\t", row.names = F, col.names = T, quote = F)
+  
+  #---------------------------------------#
+  #  Method 2 : Computing motif activity
+  #---------------------------------------#
+  dir.create(paste0(outdir, "/Motif_activity"), recursive = T)
+  
+  print("Starting method 2")
+  # Computing motif per cell motif activity score
+  # Try this for error with multicore
+  BiocParallel::register(BiocParallel::SerialParam())
+  seurat_atac <- RunChromVAR(object = seurat_atac, genome = BSgenome.Mmusculus.UCSC.mm10)
+  
+  DefaultAssay(seurat_atac) <- "chromvar"
+  print("End chromvar")
+  
+  # Find differential activity in motif per treatment
+  dir.create(paste0(outdir, "/Motif_activity/Treatment"), recursive = T)
+  print("Starting treatment analysis")
+  
+  Idents(seurat_atac) <- "condition"
+  diff_act_treatment <- FindMarkers(object = seurat_atac, ident.1 = "Treated", ident.2 = "Control", test.use = "LR", latent.vars = "nCount_peaks")
+  diff_act_treatment$Motif_Name <- motif_correl.df[rownames(diff_act_treatment), "Motif_Name"]
+  
+  top_motif_treatment <- MotifPlot(object = seurat_atac, motifs = head(rownames(diff_act_treatment), 6), assay = "peaks")
+  
+  write.table(diff_act_treatment, file = paste0(outdir, "/Motif_activity/Treatment", "/Motif_enrich_treatment", ".tsv"), 
+              sep = "\t", row.names = T, col.names = T, quote = F)
+  
+  ggsave(paste0(outdir, "/Motif_activity/Treatment", "/Top_motif_treatment", '.png'), plot = top_motif_treatment, device = 'png', path = NULL,
+         scale = 1, width = 20, height = 15, units = 'cm', dpi = 300)
+  print("End treatment analysis")
+  
+  # Find differential activity in motif per cluster
+  dir.create(paste0(outdir, "/Motif_activity/Cluster"), recursive = T)
+  
+  print("Starting cluster analysis")
+  Idents(seurat_atac) <- "FinalCluster"
+  diff_act_cluster <- FindAllMarkers(object = seurat_atac, only.pos = T, test.use = "LR", latent.vars = "nCount_peaks")
+  
+  for(cluster in unique(diff_act_cluster$cluster)){
+    top_motif <- diff_act_cluster[diff_act_cluster$cluster %in% cluster,]
+    top_motif.name <- head(top_motif$gene, 6)
+    
+    top_motif.plot <- MotifPlot(object = seurat_atac, motifs = top_motif.name, assay = "peaks")
+    ggsave(paste0(outdir, "/Motif_activity/Cluster", "/Top_motif_cluster_", cluster, '.png'), plot = top_motif.plot, device = 'png', path = NULL,
+           scale = 1, width = 20, height = 15, units = 'cm', dpi = 300)
+  }
+  
+  
+  
+  # rownames(diff_act_cluster) <- diff_act_cluster$gene
+  print("Adding Motif_name to table output")
+  diff_act_cluster.list <- list()
+  for(cluster in unique(diff_act_cluster$cluster)){
+    diff_act_cluster.temp <- diff_act_cluster[diff_act_cluster$cluster %in% cluster,]
+    rownames(diff_act_cluster.temp) <- diff_act_cluster.temp$gene
+    diff_act_cluster.temp$Motif_Name <- motif_correl.df[rownames(diff_act_cluster.temp), "Motif_Name"]
+    diff_act_cluster.list[[cluster]] <- diff_act_cluster.temp
+  }
+  diff_act_cluster.final <- do.call("rbind", diff_act_cluster.list)
+  
+  write.table(diff_act_cluster.final, file = paste0(outdir, "/Motif_activity/Cluster", "/Motif_enrich_cluster", ".tsv"), 
+              sep = "\t", row.names = T, col.names = T, quote = F)
+  print("End cluster analysis")
+  
+  # Analysing motif markers of difference between pop IN a cluster (ex : Clus1_Treated vs Clust1_Control)
+  print("Starting the treatment per cluster analysis")
+  dir.create(paste0(outdir, "/Motif_activity/Treatment_per_clust"), recursive = T)
+  
+  seurat_atac$cluster_cond <- paste(seurat_atac$condition, seurat_atac$FinalCluster, sep = "_")
+  Idents(seurat_atac) <- "cluster_cond"
+  
+  diff_act_cluster_cond.list <- list()
+  for(cluster in unique(seurat_atac$FinalCluster)){
+    print(paste("Treated_vs_Control", cluster, sep = "_"))
+    
+    if(nrow(seurat_atac@meta.data[seurat_atac$cluster_cond %in% paste("Control", cluster, sep = "_"),]) >3 && 
+       nrow(seurat_atac@meta.data[seurat_atac$cluster_cond %in% paste("Treated", cluster, sep = "_"),])){
+      print("Enough cells for analysis")
+      diff_act_cluster_cond.list[[paste("Treated_vs_Control", cluster, sep = "_")]] <- FindMarkers(object = seurat_atac, ident.1 = paste("Treated", cluster, sep = "_"),
+                                                                                                   ident.2 = paste("Control", cluster, sep = "_"),
+                                                                                                   test.use = 'LR', latent.vars = 'nCount_peaks')
+      
+      diff_act_cluster_cond.list[[paste("Treated_vs_Control", cluster, sep = "_")]]$Comparison <- "Treated_vs_Control"
+      diff_act_cluster_cond.list[[paste("Treated_vs_Control", cluster, sep = "_")]]$Cluster <- cluster
+      diff_act_cluster_cond.list[[paste("Treated_vs_Control", cluster, sep = "_")]]$Motif_ID <- rownames(diff_act_cluster_cond.list[[paste("Treated_vs_Control", cluster, sep = "_")]])
+      diff_act_cluster_cond.list[[paste("Treated_vs_Control", cluster, sep = "_")]]$Motif_Name <- motif_correl.df[rownames(diff_act_cluster_cond.list[[paste("Treated_vs_Control", cluster, sep = "_")]]), "Motif_Name"]
+    }else{
+      print("Not enough cells")
+    }
+  }
+  diff_act_cluster_cond.df <- do.call("rbind", diff_act_cluster_cond.list)
+  
+  print("Plotting top motif for treated and control per cluster")
+  for(cluster in unique(diff_act_cluster_cond.df$Cluster)){
+    top_motif <- diff_act_cluster_cond.df[diff_act_cluster_cond.df$Cluster %in% cluster,]
+    top_motif.name <- head(top_motif$Motif_ID, 6)
+    bot_motif.name <- tail(top_motif$Motif_ID, 6)
+    
+    top_motif.plot <- MotifPlot(object = seurat_atac, motifs = top_motif.name, assay = "peaks")
+    bot_motif.plot <- MotifPlot(object = seurat_atac, motifs = bot_motif.name, assay = "peaks")
+    ggsave(paste0(outdir, "/Motif_activity/Treatment_per_clust", "/Top_motif_treated_cluster_", cluster, '.png'), plot = top_motif.plot, device = 'png', path = NULL,
+           scale = 1, width = 20, height = 15, units = 'cm', dpi = 300)
+    ggsave(paste0(outdir, "/Motif_activity/Treatment_per_clust", "/Top_motif_control_cluster_", cluster, '.png'), plot = bot_motif.plot, device = 'png', path = NULL,
+           scale = 1, width = 20, height = 15, units = 'cm', dpi = 300)
+  }
+  
+  write.table(diff_act_cluster_cond.df, file = paste0(outdir, "/Motif_activity/Treatment_per_clust", "/Motif_enrich_cluster", ".tsv"), 
+              sep = "\t", row.names = T, col.names = T, quote = F)
+  print("End of the treatment per cluster analysis")
+  
+  saveRDS(seurat_atac, file = paste0(outdir, "/atac_motif", '.rds'))
+  
+} else if(opt$integrated == FALSE){
+  print("Only computing motif activity score")
+  
+  # Adding motif information
+  seurat <- AddMotifs(object = seurat, genome = BSgenome.Mmusculus.UCSC.mm10, pfm = pfm)
+  
+  motif_correl <- seurat@assays$peaks@motifs@motif.names 
+  motif_correl.df <- data.frame(do.call("rbind", motif_correl))
+  names(motif_correl.df)[1] <- "Motif_Name"
+  motif_correl.df$Motif_ID <- rownames(motif_correl.df)
+  
+  write.table(x = motif_correl.df, file = paste0(outdir, "/motif_name_table.tsv"), sep = "\t", row.names = F, col.names = T, quote = F)
+  
+  # Calculate motif score
+  BiocParallel::register(BiocParallel::SerialParam())
+  seurat <- RunChromVAR(object = seurat, genome = BSgenome.Mmusculus.UCSC.mm10)
+  
+  saveRDS(seurat, file = paste0(outdir, "/atac_motif", ".rds"))
+}
+
+
--- a/R_src/SignacOneSampleCL.R
+++ b/R_src/SignacOneSampleCL.R
+##------------------------------------------------------------------------------
+## L. Herault
+##------------------------------------------------------------------------------
+
+## -----------------------------------------------------------------------------
+## Libraries
+## -----------------------------------------------------------------------------
+
+suppressMessages(library(Signac))
+suppressMessages(library(Seurat))
+suppressMessages(library(GenomeInfoDb))
+suppressMessages(library(ggplot2))
+suppressMessages(library(patchwork))
+set.seed(1234)
+
+
+# Analysis of seurat 3 integration and clusternig workflow
+
+## -----------------------------------------------------------------------------
+## Command line args
+## -----------------------------------------------------------------------------
+
+spec = matrix(c(
+  'help',        'h', 0, "logical",   "Help about the program",
+  'filename',    'i', 1, 'character', 'input filtered_peak_bc_matrix.h5',
+  'metafile',    'm', 1, 'character', 'input metadata file eg. singlecell.csv',
+  'fragments',    'f', 1, 'character', 'fragments.tsv.gz file',
+  'genome',    'g',  1, 'character', 'genome,  corresponding R libraries have to be installed (eg EnsDb.Mmusculus.v79) for mm10',
+  'minCell',     'c', 1, 'numeric', 'min cell number by feature (default 10)',
+  'minFeature',  'e', 1, 'numeric', "min feature number per cells (default 200)",
+  'outdir',     'o',1, "character", 'Outdir path (default ./)',
+  "tfTested",         't',1, "character", "TF tested list (eg TF with a motif in the scenic database",
+  "regulonJson",       'r',1, "character", "Regulon main Json file",
+  "regulonJsonSupp", "s", 1, "character", "Regulon supp Json files (separated by +)",
+  "subConditionName", "n", 1, "character", "correspondong names (separated by +)"
+  
+), byrow=TRUE, ncol=5);
+
+## default settings
+
+if (is.null(opt$outdir)) {
+  opt$outdir = "./"
+} 
+
+if (is.null(opt$minCell)) {
+  opt$minCell = 10
+} 
+
+if (is.null(opt$minFeature)) {
+  opt$minFeature = 200
+} 
+
+# Printing option before running
+
+for (o in names(opt)) {
+  print(paste(o,":", opt[[o]]))
+}
+
+## Load data
+counts <- Read10X_h5(filename = opt$filename)
+
+metadata <- read.csv(
+  file = opt$metafile,
+  header = TRUE,
+  row.names = 1
+)
+
+chrom_assay <- CreateChromatinAssay(
+  counts = counts,
+  sep = c(":", "-"),
+  genome = opt$genome,
+  fragments = opt$fragments,
+  min.cells = opt$minCell,
+  min.features = opt$minFeature
+)
+
+signac <- CreateSeuratObject(
+  counts = chrom_assay,
+  assay = "peaks",
+  meta.data = metadata
+)
+
+if(opt$genome == "mm10") {
+  suppressMessages(library(EnsDb.Mmusculus.v79))
+  ensdb = EnsDb.Mmusculus.v79
+}
+
+# extract gene annotations from EnsDb
+annotations <- GetGRangesFromEnsDb(ensdb = ensdb)
+
+# change to UCSC style since the data was mapped to hg19
+seqlevelsStyle(annotations) <- 'UCSC'
+genome(annotations) <- opt$genome
+
+# add the gene information to the object
+Annotation(signac) <- annotations
+
+signac <- NucleosomeSignal(object = signac)
+
+# Quality control and cells filtering
+
+# compute TSS enrichment score per cell
+signac <- TSSEnrichment(object = signac, fast = FALSE)
+
+# add blacklist ratio and fraction of reads in peaks
+signac$pct_reads_in_peaks <- signac$peak_region_fragments / signac$passed_filters * 100
+signac$blacklist_ratio <- signac$blacklist_region_fragments / signac$peak_region_fragments
+
+# Plot TSS enrichment, separating cells in two groups
+signac$high.tss <- ifelse(signac$TSS.enrichment > 2, 'High', 'Low')
+
+png(pasteO(opt$oudir,'/TSS_enrichment.png'),width = 800,height = 800)
+TSSPlot(signac, group.by = 'high.tss') + NoLegend()
+dev.off()
+
+# Plot FragmentHistogram on chr1, separating cells in two groups depending on nucleosome signal
+
+signac$nucleosome_group <- ifelse(signac$nucleosome_signal > 2, 'NS > 2', 'NS < 2')
+
+png(pasteO(opt$oudir,'/fragmentHistogram.png'),width = 800,height = 800)
+FragmentHistogram(object = signac,region = "chr1-1-100000000",group.by = 'nucleosome_group')
+dev.off()
+
+# VlnPlot of control metrics
+png(pasteO(opt$oudir,'/vlncontrolMetrics.png'),width = 1400,height = 500)
+
+VlnPlot(
+  object = signac,
+  features = c('pct_reads_in_peaks', 'peak_region_fragments',
+               'TSS.enrichment', 'blacklist_ratio', 'nucleosome_signal'),
+  pt.size = 0.001,
+  ncol = 5
+)
+dev.off()
+
+# Filter outliers
+signac <- subset(
+  x = signac,
+  subset = peak_region_fragments > 3000 &
+    peak_region_fragments < 20000 &
+    pct_reads_in_peaks > 15 &
+    blacklist_ratio < 0.05 &
+    nucleosome_signal < 4 &
+    TSS.enrichment > 2
+)
+
+signac
+
+
+
+
--- a/R_src/actinnSeuratCL.R
+++ b/R_src/actinnSeuratCL.R
+## -----------------------------------------------------------------------------
+## Libraries
+## -----------------------------------------------------------------------------
+print("Loading library")
+
+library(Seurat)
+library(getopt)
+library(scales)
+library(RColorBrewer)
+library(plyr)
+library(ggplot2)
+library(gridExtra)
+library(cowplot)
+library(reshape2)
+library(cowplot)
+library(gridExtra)
+
+source("R_src/do_consensus.R")
+source("R_src/funForAnalysisSeurat.R")
+
+theme_set(theme_classic())
+
+
+
+
+spec = matrix(c(
+  'help',        'h', 0, "logical",   "Help about the program",
+  'inputRDS',  'i', 1, "character", "REQUIRED: 10X data ordered by monocle (.RDS generated by orderCL.R).",
+  'actinnPathGMP', 'g', 1, "character", "Path to actinn multiple run directory",
+  'actinnPathProg', 'p', 1, "character", "Path to actinn multiple run directory",
+  'signatures', 's', 1, "character", "Signature files from nestorowa analysis",
+  'outdir',     'o',1, "character", 'Outdir path (default ./)'
+), byrow=TRUE, ncol=5);
+
+opt = getopt(spec)
+
+
+args <- commandArgs()
+if ( !is.null(opt$help) | is.null(opt$inputRDS)) {
+  cat("Gene filtering of a gbm cds ordered. For scenic use, two filters")
+  cat(getopt(spec, usage=TRUE))
+  q(status=1)
+}
+
+if (is.null(opt$outdir)) {
+  opt$outdir = "./"
+}
+outdir <- opt$outdir
+
+# Data for test
+# opt$inputRDS <- "/shared/projects/scRNA_HSPC_Aging/GMP_PRARA/output/RNA/streamAnalysisSeurat/seuratPseudotime.rds"
+# opt$actinnPathGMP <- "/shared/projects/scRNA_HSPC_Aging/GMP_PRARA/output/RNA/multiple_Actinn_Nestorowa_gmp/"
+# opt$actinnPathProg <- "/shared/projects/scRNA_HSPC_Aging/GMP_PRARA/output/RNA/multiple_Actinn_Nestorowa_prog/"
+# opt$signatures <- "/shared/projects/scRNA_HSPC_Aging/GMP_PRARA/output/RNA/Nestorowa/progenitors_signature.csv"
+
+dir.create(outdir, recursive = T)
+
+print("Loading files")
+# Read Seurat file
+seurat <- readRDS(opt$inputRDS)
+
+# Read and do actinn consensus
+actinn_consensus_gmp <- do_consensus(opt$actinnPathGMP)
+actinn_consensus_prog <- do_consensus(opt$actinnPathProg)
+
+# Read signatures list
+signatures <- read.table(opt$signatures, sep = ",", header = T)
+
+## Color palette
+
+colorTreatment <- c("#664CFF", "#FF8000")
+colorCluster <- c("#E69F00", "#CC79A7", "#0072B2", "#009E73", "#D55E00", "#56B4E9")
+
+
+# Add actinn result as meta data 
+
+seurat$cellType_Actinn_GMP <- actinn_consensus_gmp[rownames(seurat@meta.data),"Consensus"] 
+seurat$cellType_Actinn_GMP <- factor(seurat$cellType_Actinn_GMP, levels = c("GMP_broad", "CMP_broad", "MEP_broad"))
+
+seurat$cellType_Actinn_Prog <- actinn_consensus_prog[rownames(seurat@meta.data),"Consensus"] 
+seurat$cellType_Actinn_Prog <- factor(seurat$cellType_Actinn_Prog, levels = c("Prog", "HSPC"))
+
+# Calculate signature score
+
+for(sig in 1:ncol(signatures)){
+  print(names(signatures[sig]))
+  seurat <- AddModuleScore(seurat, features = list(signatures[,sig]), name = names(signatures[sig]))
+  colnames(seurat@meta.data)[which(colnames(seurat@meta.data) == paste0(names(signatures[sig]), 1))] <- names(signatures[sig])
+}
+
+seurat[["Signature_nesto"]] <- CreateAssayObject(data = t(as.matrix(seurat@meta.data[,names(signatures)])))
+
+
+## Plotting results of Actinn transfer
+# Cell size UMAP 0.4
+
+## Prog transfer and signature ##
+## We only look at Prog signature, the transfer result is not convincing
+
+# UMAP 
+# UMAP_actinn_prog <- DimPlot(seurat, group.by = "cellType_Actinn_prog") + ggtitle("")
+
+
+# Violin plot
+Prog_all_melt <- melt(seurat@meta.data[,c("Prog_signature_up", "Prog_signature_down")])
+Prog_Ctrl_melt <- melt(seurat@meta.data[seurat$condition %in% "Control", c("Prog_signature_up", "Prog_signature_down")])
+Prog_RA_melt <- melt(seurat@meta.data[seurat$condition %in% "Treated", c("Prog_signature_up", "Prog_signature_down")])
+
+
+
+violinplot_prog <- ggplot(Prog_all_melt, aes(x = variable, y = value)) +
+  geom_violin(fill = "grey") + theme(legend.position = "none", axis.title.x = element_blank()) +
+  ylab("Signature score") + scale_x_discrete(labels = c("Prog_signature_up" = "Prog", "Prog_signature_down" = "HSPC"))
+
+violinplot_Ctrl_prog <- ggplot(Prog_Ctrl_melt, aes(x = variable, y = value)) +
+  geom_violin(fill = "grey") + theme(legend.position = "none", axis.title.x = element_blank()) +
+  ylab("Signature score") + scale_x_discrete(labels = c("Prog_signature_up" = "Prog", "Prog_signature_down" = "HSPC"))
+
+violinplot_RA_prog <- ggplot(Prog_RA_melt, aes(x = variable, y = value)) +
+  geom_violin(fill = "grey") + theme(legend.position = "none", axis.title.x = element_blank()) +
+  ylab("Signature score") + scale_x_discrete(labels = c("Prog_signature_up" = "Prog", "Prog_signature_down" = "HSPC"))
+
+
+ggsave(paste0(outdir, 'Vln_Prog_HSPC', '.png'), plot = violinplot_prog, device = 'png', path = NULL,
+       scale = 1, width = 15, height = 10, units = 'cm', dpi = 300)
+ggsave(paste0(outdir, 'Vln_Ctrl_Prog_HSPC', '.png'), plot = violinplot_Ctrl_prog, device = 'png', path = NULL,
+       scale = 1, width = 15, height = 10, units = 'cm', dpi = 300)
+ggsave(paste0(outdir, 'Vln_RA_Prog_HSPC', '.png'), plot = violinplot_RA_prog, device = 'png', path = NULL,
+       scale = 1, width = 15, height = 10, units = 'cm', dpi = 300)
+## GMP transfer and signature visualisation ##
+
+# Violin Plot
+GMP_all_melt <- melt(seurat@meta.data[,c("GMP_signature_up", "CMP_signature_up", "MEP_signature_up")])
+GMP_Ctrl_melt <- melt(seurat@meta.data[seurat$condition %in% "Control", c("GMP_signature_up", "CMP_signature_up", "MEP_signature_up")])
+GMP_RA_melt <- melt(seurat@meta.data[seurat$condition %in% "Treated", c("GMP_signature_up", "CMP_signature_up", "MEP_signature_up")])
+
+violinplot_gmp <- ggplot(GMP_all_melt, aes(x = variable, y = value)) +
+  geom_violin(fill = "grey") + theme(legend.position = "none", axis.title.x = element_blank()) +
+  ylab("Signature score") + scale_x_discrete(labels = c("GMP_signature_up" = "GMP", "CMP_signature_up" = "CMP", "MEP_signature_up" = "MEP"))
+violinplot_Ctrl_gmp <- ggplot(GMP_Ctrl_melt, aes(x = variable, y = value)) +
+  geom_violin(fill = "grey") + theme(legend.position = "none", axis.title.x = element_blank()) +
+  ylab("Signature score") + scale_x_discrete(labels = c("GMP_signature_up" = "GMP", "CMP_signature_up" = "CMP", "MEP_signature_up" = "MEP"))
+violinplot_RA_gmp <- ggplot(GMP_RA_melt, aes(x = variable, y = value)) +
+  geom_violin(fill = "grey") + theme(legend.position = "none", axis.title.x = element_blank()) +
+  ylab("Signature score") + scale_x_discrete(labels = c("GMP_signature_up" = "GMP", "CMP_signature_up" = "CMP", "MEP_signature_up" = "MEP"))
+
+
+ggsave(paste0(outdir, 'Vln_GMP_CMP_MEP', '.png'), plot = violinplot_gmp, device = 'png', path = NULL,
+       scale = 1, width = 15, height = 10, units = 'cm', dpi = 300)
+ggsave(paste0(outdir, 'Vln_Ctrl_GMP_CMP_MEP', '.png'), plot = violinplot_Ctrl_gmp, device = 'png', path = NULL,
+       scale = 1, width = 15, height = 10, units = 'cm', dpi = 300)
+ggsave(paste0(outdir, 'Vln_RA_GMP_CMP_MEP', '.png'), plot = violinplot_RA_gmp, device = 'png', path = NULL,
+       scale = 1, width = 15, height = 10, units = 'cm', dpi = 300)
+# UMAP
+
+# Can't plot after subseting on MEP_broad because there is only one cell
+Idents(seurat) <- "cellType_Actinn_GMP"
+
+seurat_gmp <- subset(seurat, idents = "GMP_broad")
+seurat_cmp <- subset(seurat, idents = "CMP_broad")
+# seurat_mep <- subset(seurat, idents = "MEP_broad")
+
+Idents(seurat) <- "FinalCluster"
+
+UMAP_actinn_GMP <- DimPlot(seurat_gmp, group.by = "cellType_Actinn_GMP", pt.size = 0.4) +
+  theme(axis.title = element_blank(), axis.text = element_blank(), legend.position = "none") +
+  scale_color_manual(values = "grey")
+
+UMAP_actinn_CMP <- DimPlot(seurat_cmp, group.by = "cellType_Actinn_GMP", pt.size = 0.4) +
+  theme(axis.title = element_blank(), axis.text = element_blank(), legend.position = "none") +
+  scale_color_manual(values = "grey")
+
+# UMAP_actinn_MEP <- DimPlot(seurat_mep, group.by = "cellType_Actinn_GMP")
+
+UMAP_actinn_gmp_split <- DimPlot(seurat, group.by = "cellType_Actinn_GMP", split.by = "cellType_Actinn_GMP", pt.size = 0.4) +
+  theme(axis.title = element_blank(), axis.text = element_blank(), legend.position = "none") + ggtitle("") +
+  scale_color_manual(values = c("grey", "grey", "grey"))
+
+
+ggsave(paste0(outdir, 'UMAP_GMP_split', '.png'), plot = UMAP_actinn_gmp_split, device = 'png', path = NULL,
+       scale = 1, width = 25, height = 15, units = 'cm', dpi = 300)
+ggsave(paste0(outdir, 'UMAP_GMP', '.png'), plot = UMAP_actinn_GMP, device = 'png', path = NULL,
+       scale = 1, width = 15, height = 15, units = 'cm', dpi = 300)
+ggsave(paste0(outdir, 'UMAP_CMP', '.png'), plot = UMAP_actinn_CMP, device = 'png', path = NULL,
+       scale = 1, width = 15, height = 15, units = 'cm', dpi = 300)
+
+
+saveRDS(seurat, paste0(outdir, "seurat_actinn", ".rds"))
\ No newline at end of file
--- a/R_src/assignSurfaceMarkerSeuratCL.R
+++ b/R_src/assignSurfaceMarkerSeuratCL.R
+## -----------------------------------------------------------------------------
+## Libraries
+## -----------------------------------------------------------------------------
+
+library(Seurat)
+library(monocle)
+library(plyr)
+library(ggplot2)
+library(getopt)
+
+## -----------------------------------------------------------------------------
+## Command line args
+## -----------------------------------------------------------------------------
+
+spec = matrix(c(
+  'help',        'h', 0, "logical",   "Help about the program",
+  "inputSeurat",  "i",1, "character", "input seurat object with cluster column names numclust",
+  'outdir',     'o',1, "character", 'Outdir path (default ./)',
+  "surfaceMarkers", "s", 1, "character", "List of surface marker split by \\+",
+  "namePop", "n", 1, "character", "Name of the double pos population"
+), byrow=TRUE, ncol=5);
+
+
+opt = getopt(spec)
+
+# if help was asked, print a friendly message
+# and exit with a non-zero error code
+
+args <- commandArgs()
+if ( !is.null(opt$help) | is.null(opt$inputSeurat)) {
+  cat("Create influence graph from regulon table")
+  cat(getopt(spec, usage=TRUE))
+  q(status=1)
+}
+
+if(is.null(opt$outdir)){
+  opt$outdir <- "./"
+}
+
+theme_set(theme_classic())
+
+# Testing option
+opt$outdir <- "/shared/projects/scRNA_HSPC_Aging/GMP_PRARA/output/RNA/Seurat4_integration/Analysis/"
+opt$inputSeurat <- readRDS("/shared/projects/scRNA_HSPC_Aging/GMP_PRARA/output/RNA/regulonAnalysis/seuratAUC_2.rds")
+opt$surfaceMarkers <- "Ly6g+Itgam"
+opt$namePop <- "Diff"
+
+outdir <- opt$outdir
+
+# Read surface markers 
+surfaceMarkers <- strsplit(opt$surfaceMarkers, split = "\\+")[[1]]
+
+# Open seurat then save it as CDS object
+seurat <- readRDS(opt$inputSeurat)
+seurat_cds <- as.CellDataSet(seurat, assay = "RNA")
+
+
+# Classify cells with surface markers
+cth <- newCellTypeHierarchy()
+
+SM_1_id <- row.names(subset(fData(seurat_cds), gene_short_name == surfaceMarkers[1]))
+SM_2_id <- row.names(subset(fData(seurat_cds), gene_short_name == surfaceMarkers[1]))
+
+cth <- addCellType(cth, opt$namePop, classify_func = 
+                     function(x){ x[SM_1_id,] >=1 & x[SM_2_id,] >=1})
+
+seurat_cds <- classifyCells(seurat_cds, cth, 0.1)
+
+# Add cellType information to the seurat object
+seurat@meta.data$CellType <- pData(seurat_cds)[rownames(seurat@meta.data), "CellType"]
+
+# Barplot cellType distribution
+summary_diff <- ddply(seurat@meta.data,~FinalCluster + CellType + condition, nrow)
+summary_diff$Diff_cell <- factor(summary_diff$CellType, levels = c("Unknown", "Diff"))
+
+cellType_control_bp <- ggplot(data = data.frame(summary_diff[summary_diff$condition %in% "Control",]), aes(x = FinalCluster, y = V1, fill = CellType)) + 
+  geom_bar(stat = "identity", position = position_dodge()) +
+  ylab(label = "") + xlab(label = "") + coord_flip()
+
+cellType_treated_bp <- ggplot(data = data.frame(summary_diff[summary_diff$condition %in% "Treated",]), aes(x = FinalCluster, y = V1, fill = CellType)) + 
+  geom_bar(stat = "identity", position = position_dodge()) +
+  ylab(label = "") + xlab(label = "") + coord_flip()
+
+ggsave(paste0(outdir, "BarPlot/", 'BP_CS_Markers_', opt$namePop, '_control', '.png'), plot = cellType_control_bp, device = 'png', path = NULL, 
+       scale = 1, width = 10, height = 10, units = 'cm', dpi = 300)
+ggsave(paste0(outdir, "BarPlot/", 'BP_CS_Markers_', opt$namePop, '_treated', '.png'), plot = cellType_treated_bp, device = 'png', path = NULL, 
+       scale = 1, width = 10, height = 10, units = 'cm', dpi = 300)
+
+saveRDS(seurat, paste0(outdir, "seurat_CT", ".rds"))
--- a/R_src/castle.R
+++ b/R_src/castle.R
+castle <- function(source_seurat,target_seurat,outdir = "./") {
+  set.seed(2018)
+  BREAKS=c(-1, 0, 1, 6, Inf)
+  nFeatures = 100
+  
+  dir.create(outdir,showWarnings = F)
+
+  # 1. Load datasets in scater format: loaded files expected to contain "Large SingleCellExperiment" object
+  
+  
+  # 1.1 convert from seurat (be careful seurat 2) to sce object if seurat object as input
+  source <- Convert(from = source_seurat, to = "sce")
+  target <- Convert(from = target_seurat, to = "sce")
+
+
+  
+  
+  ds1 = t(exprs(source)) 
+  ds2 = t(exprs(target)) 
+  sourceCellTypes = as.factor(colData(source)[,"library_id"])
+
+  # 2. Unify sets, excluding low expressed genes
+  source_n_cells_counts = apply(exprs(source), 1, function(x) { sum(x > 0) } )
+  print("source OK")
+  target_n_cells_counts = apply(exprs(target), 1, function(x) { sum(x > 0) } )
+  common_genes = intersect( rownames(source)[source_n_cells_counts>10], 
+                            rownames(target)[target_n_cells_counts>10]
+  )
+  remove(source_n_cells_counts, target_n_cells_counts)
+  ds1 = ds1[, colnames(ds1) %in% common_genes]
+  ds2 = ds2[, colnames(ds2) %in% common_genes]
+  ds = rbind(ds1[,common_genes], ds2[,common_genes])
+  isSource = c(rep(TRUE,nrow(ds1)), rep(FALSE,nrow(ds2)))
+  remove(ds1, ds2)
+  
+  # 3. Highest mean in both source and target
+  topFeaturesAvg = colnames(ds)[order(apply(ds, 2, mean), decreasing = T)]
+  
+  # 4. Highest mutual information in source
+  topFeaturesMi = names(sort(apply(ds[isSource,],2,function(x) { compare(cut(x,breaks=BREAKS),sourceCellTypes,method = "nmi") }), decreasing = T))
+  
+  # 5. Top n genes that appear in both mi and avg
+  selectedFeatures = union(head(topFeaturesAvg, nFeatures) , head(topFeaturesMi, nFeatures) )
+  
+  # 6. remove correlated features
+  tmp = cor(as.matrix(ds[,selectedFeatures]), method = "pearson")
+  tmp[!lower.tri(tmp)] = 0
+  selectedFeatures = selectedFeatures[apply(tmp,2,function(x) any(x < 0.9))]
+  remove(tmp)
+  
+  # 7,8. Convert data from continous to binned dummy vars
+  # break datasets to bins
+  dsBins = apply(ds[, selectedFeatures], 2, cut, breaks= BREAKS)
+  # use only bins with more than one value
+  nUniq = apply(dsBins, 2, function(x) { length(unique(x)) })
+  # convert to dummy vars
+  ds = model.matrix(~ . , as.data.frame(dsBins[,nUniq>1]))
+  remove(dsBins, nUniq)
+  
+  
+  # 9. Classify
+  train = runif(nrow(ds[isSource,]))<0.8
+  #slightly different setup for multiclass and binary classification
+  if (length(unique(sourceCellTypes)) > 2) {
+    xg=xgboost(data=ds[isSource,][train, ] ,
+               label=as.numeric(sourceCellTypes[train])-1,
+               objective="multi:softmax", num_class=length(unique(sourceCellTypes)),
+               eta=0.7 , nthread=5, nround=20, verbose=0,
+               gamma=0.001, max_depth=5, min_child_weight=10)
+  } else {
+    xg=xgboost(data=ds[isSource,][train, ] ,
+               label=as.numeric(sourceCellTypes[train])-1,
+               eta=0.7 , nthread=5, nround=20, verbose=0,
+               gamma=0.001, max_depth=5, min_child_weight=10)
+  }
+  
+  
+  
+  # 10. Predict
+  predictedClasses = predict(xg, ds[!isSource, ])
+  
+  target_seurat@meta.data$predicted <- predictedClasses
+ 
+  
+  conv <- data.frame(cellType = unique(sourceCellTypes),numCellTypes=unique(as.numeric(sourceCellTypes[train]))-1)
+  
+  
+  for (num in conv$numCellTypes) {
+    target_seurat@meta.data$predicted[target_seurat@meta.data$predicted==num] <- as.character(conv[which(conv$numCellType== num),"cellType"])
+  }    
+  
+  
+  
+  target_seurat@meta.data$predicted <- factor(target_seurat@meta.data$predicted,levels = c("LTHSC","STHSC", "MPP2", "MPP3"))
+  png(paste(outdir,"/cellTypeLearned_tsne.png", sep = ""),height = 800,width = 800)
+  TSNEPlot(target_seurat,group.by="predicted")
+  dev.off()
+  
+  
+  
+  return(target_seurat)
+}
+
+castleSeurat3 <- function(source_seurat,target_seurat,outdir = "./") {
+  set.seed(2018)
+  BREAKS=c(-1, 0, 1, 6, Inf)
+  nFeatures = 100
+  
+  dir.create(outdir,showWarnings = F)
+  
+  # 1. Load datasets in scater format: loaded files expected to contain "Large SingleCellExperiment" object
+  
+  
+  # 1.1 convert from seurat (be careful seurat 3) to sce object if seurat object as input
+  source <- as.SingleCellExperiment(source_seurat, to = "sce",data = "logcounts")
+  target <- as.SingleCellExperiment(target_seurat, to = "sce",data = "logcounts")
+  
+  
+  
+  
+  ds1 = t(exprs(source)) 
+  ds2 = t(exprs(target)) 
+  sourceCellTypes = as.factor(colData(source)[,"library_id"])
+  
+  # 2. Unify sets, excluding low expressed genes
+  source_n_cells_counts = apply(exprs(source), 1, function(x) { sum(x > 0) } )
+  print("source OK")
+  target_n_cells_counts = apply(exprs(target), 1, function(x) { sum(x > 0) } )
+  common_genes = intersect( rownames(source)[source_n_cells_counts>10], 
+                            rownames(target)[target_n_cells_counts>10]
+  )
+  remove(source_n_cells_counts, target_n_cells_counts)
+  ds1 = ds1[, colnames(ds1) %in% common_genes]
+  ds2 = ds2[, colnames(ds2) %in% common_genes]
+  ds = rbind(ds1[,common_genes], ds2[,common_genes])
+  isSource = c(rep(TRUE,nrow(ds1)), rep(FALSE,nrow(ds2)))
+  remove(ds1, ds2)
+  
+  # 3. Highest mean in both source and target
+  topFeaturesAvg = colnames(ds)[order(apply(ds, 2, mean), decreasing = T)]
+  
+  # 4. Highest mutual information in source
+  topFeaturesMi = names(sort(apply(ds[isSource,],2,function(x) { compare(cut(x,breaks=BREAKS),sourceCellTypes,method = "nmi") }), decreasing = T))
+  
+  # 5. Top n genes that appear in both mi and avg
+  selectedFeatures = union(head(topFeaturesAvg, nFeatures) , head(topFeaturesMi, nFeatures) )
+  
+  # 6. remove correlated features
+  tmp = cor(as.matrix(ds[,selectedFeatures]), method = "pearson")
+  tmp[!lower.tri(tmp)] = 0
+  selectedFeatures = selectedFeatures[apply(tmp,2,function(x) any(x < 0.9))]
+  remove(tmp)
+  
+  # 7,8. Convert data from continous to binned dummy vars
+  # break datasets to bins
+  dsBins = apply(ds[, selectedFeatures], 2, cut, breaks= BREAKS)
+  # use only bins with more than one value
+  nUniq = apply(dsBins, 2, function(x) { length(unique(x)) })
+  # convert to dummy vars
+  ds = model.matrix(~ . , as.data.frame(dsBins[,nUniq>1]))
+  remove(dsBins, nUniq)
+  
+  
+  # 9. Classify
+  train = runif(nrow(ds[isSource,]))<0.8
+  #slightly different setup for multiclass and binary classification
+  if (length(unique(sourceCellTypes)) > 2) {
+    xg=xgboost(data=ds[isSource,][train, ] ,
+               label=as.numeric(sourceCellTypes[train])-1,
+               objective="multi:softmax", num_class=length(unique(sourceCellTypes)),
+               eta=0.7 , nthread=5, nround=20, verbose=0,
+               gamma=0.001, max_depth=5, min_child_weight=10)
+  } else {
+    xg=xgboost(data=ds[isSource,][train, ] ,
+               label=as.numeric(sourceCellTypes[train])-1,
+               eta=0.7 , nthread=5, nround=20, verbose=0,
+               gamma=0.001, max_depth=5, min_child_weight=10)
+  }
+  
+  
+  
+  # 10. Predict
+  predictedClasses = predict(xg, ds[!isSource, ])
+  
+  target_seurat@meta.data$predicted <- predictedClasses
+ 
+  
+  conv <- data.frame(cellType = unique(sourceCellTypes),numCellTypes=unique(as.numeric(sourceCellTypes[train]))-1)
+  
+  
+  for (num in conv$numCellTypes) {
+    target_seurat@meta.data$predicted[target_seurat@meta.data$predicted==num] <- as.character(conv[which(conv$numCellType== num),"cellType"])
+  }    
+  
+  
+  
+  target_seurat@meta.data$predicted <- factor(target_seurat@meta.data$predicted,levels = c("LTHSC","STHSC", "MPP2", "MPP3"))
+  png(paste(outdir,"/cellTypeLearned_tsne.png", sep = ""),height = 800,width = 800)
+  DimPlot(target_seurat,group.by ="predicted")
+  dev.off()
+  
+  
+  
+  return(target_seurat)
+}
+
+
+transfertToMonocle <- function(target_seurat,target_monocle,outdir = "./") {
+  pData(target_monocle)$predicted <- target_seurat@meta.data$predicted
+  png(paste(outdir,"/cellTypeLearned_ddrtree.png", sep = ""),height = 800,width = 800)
+  print(plot_cell_trajectory(target_monocle,color_by = "predicted"))
+  dev.off()
+  return(target_monocle)
+}
+
--- a/R_src/castleSeurat3CL.R
+++ b/R_src/castleSeurat3CL.R
+suppressMessages(library(gProfileR))
+suppressMessages(library(RColorBrewer))
+require(scales)
+suppressMessages(library(monocle))
+suppressMessages(library(Seurat))
+suppressMessages(library(scater))
+suppressMessages(library(xgboost))
+suppressMessages(library(igraph))
+suppressMessages(library(getopt))
+
+source("R_src/getSigPerCells.R")
+source("R_src/Enrichment.R")
+source("R_src/castle.R")
+
+spec = matrix(c(
+  'help',        'h', 0, "logical",   "Help about the program",
+  'sourceRDS',  'i', 1, "character", "REQUIRED: 10X data to trained classifier prepared as Seurat object (.RDS generated by RodriguezAnalysis.R).",
+  'targetRDS',  't', 1, "character", "REQUIRED: 10X data targeted prepared as seurat object (.RDS generated by SeuratCL.R (filtered, normalized)).",
+  'targetMonocleRDS', 'm', 1, "character", "REQUIRED: 10X data targeted prepared as monocle object (.RDS generated by seuratMonocleOrdering.R).",
+  'outdir',     'o',1, "character", 'Outdir path (default ./)'
+), byrow=TRUE, ncol=5);
+
+opt = getopt(spec)
+
+if ( !is.null(opt$help) | is.null(opt$sourceRDS)) {
+  cat("Perform perform cell type assignation using a source dataset and castle XGradientBoost tool")
+  cat(getopt(spec, usage=TRUE))
+  q(status=1)
+}
+
+
+source_seurat <- readRDS(opt$sourceRDS)
+target_seurat <- readRDS(opt$targetRDS)
+
+
+
+  seurat <- castleSeurat3(source_seurat,target_seurat,outdir = opt$outdir)
+  
+  if (!is.null(opt$targetMonocleRDS)) {
+    target_monocle <- readRDS(opt$targetMonocleRDS)
+    monocle <- transfertToMonocle(seurat,target_monocle,outdir = opt$outdir)
+    saveRDS(monocle,paste(opt$outdir,"/monocleTrainedCt.rds",sep =""))
+    
+  }
+  
+  png(paste(opt$outdir,"/CellTypeLearnedPropGG.png",sep = ""))
+  pie <- ggplot(seurat@meta.data,
+                aes(x = factor(1), fill = factor(predicted))) + geom_bar(width = 1)
+  pie <- pie + coord_polar(theta = "y") +
+    theme(axis.title.x = element_blank(), axis.title.y = element_blank())
+  print(pie)
+  dev.off()
+  
+  png(paste(opt$outdir,"/CellTypeLearnedProp.png",sep = ""))
+  pie(table(seurat@meta.data$predicted),labels = names(table(seurat@meta.data$predicted)),col = hue_pal()(4),main = paste("predicted prop"))
+  dev.off()
+  
+  saveRDS(seurat,paste(opt$outdir,"/seuratTrainedCt.rds",sep = ""))
+
--- a/R_src/computeDiffFun.R
+++ b/R_src/computeDiffFun.R
+#Functions to compute difference of feature in seurat object (because Seurat compute logFC)
+
+getExtAvgDiff <- function(row) {
+  res <- NA
+  if(row[1] > 0 & row[2] > 0) {
+    res <- min(c(row))
+  }
+  if(row[1] < 0 & row[2] < 0) {
+    res <- max(c(row))
+  }
+  return(res)
+}
+
+featureDiff <- function(seurat,cells.1,cells.2,feature) {
+  data <- GetAssayData(seurat,slot = "data")
+  total.diff <- mean(data[feature,cells.1]) - mean(data[feature,cells.2])
+  return(total.diff)
+} 
+
+
+getTrueDiff <- function (seurat,table,colIdent = "numclust",suffix = "",colTest="AGE") {
+  
+  if(!is.null(levels(seurat[[colTest]]))) {
+    factors <- levels(seurat@meta.data[,colTest])
+  } else {
+    factors <- unique(seurat@meta.data[,colTest])
+  }
+  factors <- as.vector(factors)
+
+  table[,paste("avg_diff",suffix,sep = "")] <- NA
+  
+  for (rm in rownames(table)) {
+    feature <- table[rm,"Gene"]
+    cells.1 <- colnames(seurat)[which(seurat[[colIdent]] == table[rm,colIdent] & seurat[[colTest]] == factors[1])]
+    cells.2 <- colnames(seurat)[which(seurat[[colIdent]] == table[rm,colIdent] & seurat[[colTest]] == factors[2])]
+    table[rm,paste("avg_diff",suffix,sep = "")] <- featureDiff(seurat,cells.1,cells.2,feature)
+  }
+  
+  return(table)
+}
+
+
+## Test functions
+
+FindMarkerPerClustGroupVar <- function(cluster,
+                                       hspc.combined,
+                                       condition ="AGE",        ## only var with two factors allowed
+                                       grouping.var = "platform",
+                                       max.cells.per.ident = Inf,
+                                       filterOnPadj = T,
+                                       logfc.threshold = 0.25,
+                                       min.pct = 0.1,
+                                       keepDiverging = F,
+                                       test.use = "wilcox",
+                                       identCol = "numclust",
+                                       pseudocount.use = 1,
+                                       computeTrueDiff= F) {
+  
+  clusterCondition <- paste0("cluster.",condition)
+  if(!is.null(levels(seurat@meta.data[,condition]))) {
+    factors <- levels(seurat@meta.data[,condition])
+  } else {
+    factors <- unique(seurat@meta.data[,condition])
+  }
+  
+  hspc.combined@meta.data[,clusterCondition] <- paste(hspc.combined@meta.data[,identCol],
+                                                      hspc.combined@meta.data[,condition], sep = "_")
+  
+  test <- table(hspc.combined@meta.data[,clusterCondition], hspc.combined@meta.data[,grouping.var])
+  groups = unique(hspc.combined@meta.data[,grouping.var])
+  
+  ident1 = paste0(cluster,"_",factors[1])
+  ident2 = paste0(cluster,"_",factors[2])
+  
+  
+  if( (test[ident1,groups[1]]>3 & test[ident1,groups[2]]>3) &
+      (test[ident2,groups[1]]>3&test[ident2,groups[2]]>3)) {
+    
+    Idents(object = hspc.combined) <- clusterCondition
+    
+    
+    
+    
+    markers <- FindConservedMarkers(hspc.combined,
+                                    assay = DefaultAssay(hspc.combined),
+                                    grouping.var = grouping.var,
+                                    test.use=test.use,
+                                    ident.1 = ident1,
+                                    ident.2 = ident2,
+                                    pseudocount.use = pseudocount.use,
+                                    min.pct = min.pct,
+                                    logfc.threshold = logfc.threshold,
+                                    max.cells.per.ident = max.cells.per.ident)
+    
+    print(dim(markers))
+    #Add a column the min abs(logFC) 
+    markers$min_avg_logFC <- NA
+    
+    if(!keepDiverging) {
+      #print("keep only markers with conserved logfc sign")
+      markers <- markers[which(markers[,paste0(groups[1],"_avg_logFC")]/markers[,paste0(group[2],"_avg_logFC")] > 0),]
+    } else {
+      #print("combined p_val of diverging markers between two batch are set to 1 and their min_avg_logfc to 0")
+      markers[which(markers[,paste0(groups[1],"_avg_logFC")]/markers[,paste0(group[2],"_avg_logFC")] > 0),"min_avg_logFC"] <- 0
+      markers[which(markers[,paste0(groups[1],"_avg_logFC")]/markers[,paste0(group[2],"_avg_logFC")] > 0),"minimump_p_val"] <- 1
+    }
+    
+    for (g in rownames(markers)) {
+      if(markers[g,paste0(groups[1],"_avg_logFC")] < 0) {
+        markers[g,"min_avg_logFC"] <- max(markers[g,c(paste0(groups[1],"_avg_logFC"),paste0(groups[2],"_avg_logFC"))])
+      } else {
+        markers[g,"min_avg_logFC"] <- min(markers[g,c(paste0(groups[1],"_avg_logFC"),paste0(groups[2],"_avg_logFC"))])
+      }
+    }
+    
+    markers$Cluster <- paste0(ident1,"_up")
+    markers$Cluster[which(markers$min_avg_logFC < 0)] <-  paste0(ident1,"_down")
+    markers$Gene <- rownames(markers)
+    markers <- markers[order(markers$min_avg_logFC),]
+    
+    if(computeTrueDiff) {
+      markers$numclust <- str_split_fixed(markers$Cluster,pattern = "_",n=3)[,1]
+      
+      
+      markers <- getTrueDiff(seurat[,which(seurat[[grouping.var]] == group[1])],
+                                  markers,
+                                  colIdent = "numclust",
+                                  colTest = condition,
+                                  suffix = paste0("_",group[1]))
+      
+      markers <- getTrueDiff(seurat[,which(seurat[[grouping.var]] == group[2])],
+                                  markers,
+                                  colIdent = "numclust",
+                                  colTest = condition,
+                                  suffix = paste0("_",group[2]))
+      
+      
+      markers$min_avg_diff <- apply(markers[,c(paste0("avg_diff_",group[1]),paste0("avg_diff_",group[2]))],
+                                    1,
+                                    FUN=getExtAvgDiff) 
+    }
+    
+    
+    if (filterOnPadj) {
+      markers <- markers[which(markers$A_p_val_adj < 0.05 & markers$B_p_val_adj < 0.05),]
+    }
+  }else {
+    markers <- NULL 
+  }
+  return(markers)
+  
+}
+
+FindMarkerPerClust <- function(cluster,
+                               hspc.combined,
+                               condition ="AGE",        ## only var with two factors allowed
+                               max.cells.per.ident = Inf,
+                               logfc.threshold = 0.25,
+                               min.pct = 0.1,
+                               test.use = "wilcox",
+                               identCol = "numclust",
+                               pseudocount.use = 1,
+                               computeTrueDiff = F) {
+  
+  clusterCondition <- paste0("cluster.",condition)
+  if(!is.null(levels(seurat@meta.data[,condition]))) {
+    factors <- levels(seurat@meta.data[,condition])
+  } else {
+    factors <- unique(seurat@meta.data[,condition])
+  }
+  
+  hspc.combined@meta.data[,clusterCondition] <- paste(hspc.combined@meta.data[,identCol],
+                                                      hspc.combined@meta.data[,condition], sep = "_")
+  
+  test <- table(hspc.combined@meta.data[,clusterCondition])
+  
+  ident1 = paste0(cluster,"_",factors[1])
+  ident2 = paste0(cluster,"_",factors[2])
+  
+  
+  if( test[ident1]>3 & test[ident2]>3) {
+    
+    Idents(object = hspc.combined) <- clusterCondition
+    
+    markers <- FindMarkers(hspc.combined,
+                           assay = DefaultAssay(hspc.combined),
+                           test.use=test.use,
+                           ident.1 = ident1,
+                           ident.2 = ident2,
+                           pseudocount.use = pseudocount.use,
+                           min.pct = min.pct,
+                           logfc.threshold = logfc.threshold,
+                           max.cells.per.ident = max.cells.per.ident)
+    
+    markers$Cluster <- paste0(ident1,"_up")
+    markers$Cluster[which(markers$avg_logFC < 0)] <-  paste0(ident1,"_down")
+    markers$Gene <- rownames(markers)
+    markers <- markers[order(markers$avg_logFC),]
+    
+    if(computeTrueDiff) {
+      markers$numclust <- str_split_fixed(markers$Cluster,pattern = "_",n=3)[,1]
+      
+      
+      markers <- getTrueDiff(seurat,
+                             markers,
+                             colIdent = "numclust",
+                             colTest = condition
+                             )
+      
+     }
+    
+  } else {
+    markers <- NULL 
+  }
+  return(markers)
+  
+}
+
--- a/R_src/data_preparation.R
+++ b/R_src/data_preparation.R
+# R function to prepare single cell data for monocle & Seurat
+
+getCellCyclePhases <- function(gbm_cds,outdir = "./") {
+  
+  dir.create(path = outdir,recursive = T,showWarnings = F)
+  gene_count_matrix <- as.matrix(exprs(gbm_cds))
+  set.seed(100)
+  mm.pairs <- readRDS(system.file("exdata", "mouse_cycle_markers.rds", package="scran"))
+  assignments <- cyclone(gene_count_matrix, mm.pairs, gene.names=rownames(gene_count_matrix))
+
+  ## add cell cycle phase to pData
+  pData(gbm_cds)$phases <- assignments$phases
+  pData(gbm_cds)$G1_score <- assignments$scores$G1
+  pData(gbm_cds)$G2M_score <- assignments$scores$G2M
+  pData(gbm_cds)$S_score <- assignments$scores$S
+  pData(gbm_cds)[which(pData(gbm_cds)$phases=="G1"),"phases"] <- "G1_G0"
+  pData(gbm_cds)[which(pData(gbm_cds)$phases=="G2M"),"phases"] <- "G2_M"
+  
+  
+  png(paste(outdir,"/cell_cycle_phases_assignment_on_umi.png",sep = ""))
+  plot(assignments$score$G1, assignments$score$G2M, 
+       xlab="G1 score", ylab="G2/M score", pch=16)
+  dev.off()
+  
+  print("Cell cycle phases added..")
+  return(gbm_cds)
+}
+
+getCellCyclePhasesSeurat <- function(seurat,outdir = "./") {
+  
+  dir.create(path = outdir,recursive = T,showWarnings = F)
+  gene_count_matrix <- as.matrix(GetAssayData(seurat,slot = 'counts',assay = "RNA"))
+  set.seed(100)
+  mm.pairs <- readRDS(system.file("exdata", "mouse_cycle_markers.rds", package="scran"))
+  assignments <- cyclone(gene_count_matrix, mm.pairs, gene.names=rownames(gene_count_matrix))
+  
+  ## add cell cycle phase to pData
+  seurat@meta.data$phases <- assignments$phases
+  seurat@meta.data$G1_score <- assignments$scores$G1
+  seurat@meta.data$G2M_score <- assignments$scores$G2M
+  seurat@meta.data$S_score <- assignments$scores$S
+  seurat@meta.data[which(seurat@meta.data$phases=="G1"),"phases"] <- "G1_G0"
+  seurat@meta.data[which(seurat@meta.data$phases=="G2M"),"phases"] <- "G2_M"
+  
+  
+  png(paste(outdir,"/cell_cycle_phases_assignment_on_umi.png",sep = ""))
+  plot(assignments$score$G1, assignments$score$G2M, 
+       xlab="G1 score", ylab="G2/M score", pch=16)
+  dev.off()
+  
+  print("Cell cycle phases added..")
+  return(seurat)
+}
+
+
+
+addPercentMitoch <- function(gbm_cds) {      #in fact this is fraction of mitochondrial transcripts
+  mitoGenes <- grep(pattern = "mt-",x=fData(gbm_cds)$gene_short_name,value = T)
+  mitoGenesEnsembl <- rownames(gbm_cds)[which(is.element(el = fData(gbm_cds)$gene_short_name,set = mitoGenes))]
+  percentMito <- Matrix::colSums(as.matrix(exprs(gbm_cds))[mitoGenesEnsembl, ])/Matrix::colSums(as.matrix(exprs(gbm_cds)))
+  pData(gbm_cds)$percentMito <- percentMito
+  return(gbm_cds)
+}
+
+
+#Filtering low-quality cells
+
+filterCells <- function(gbm_cds,outdir="./",num_cells_expressed=10,min_expr=0.1,propMitochFilter=NULL) {
+  
+  dir.create(path = outdir,recursive = T,showWarnings = F)
+  
+  #Set detection threshold
+  gbm_cds <- detectGenes(gbm_cds, min_expr = min_expr)
+  
+  #Define gene expressed as gene detected in more than n cell (only use for plotting distribution)
+  expressed_genes <- row.names(subset(fData(gbm_cds),            
+                                    num_cells_expressed >= num_cells_expressed))
+
+  #filtering cells as explained in monocle doc, cutting distribution tails
+  #eg. remove cells with no RNA or too much RNA
+
+  pData(gbm_cds)$Total_mRNAs <- Matrix::colSums(exprs(gbm_cds))
+  
+  png(paste(outdir,"/densityTotalmRNA_raw.png",sep =""))
+  qplot(Total_mRNAs, data = pData(gbm_cds), geom ="density")
+  dev.off()
+  
+  upper_bound <- 10^(mean(log10(pData(gbm_cds)$Total_mRNAs)) +
+                     2*sd(log10(pData(gbm_cds)$Total_mRNAs)))
+  lower_bound <- 10^(mean(log10(pData(gbm_cds)$Total_mRNAs)) -
+                     2*sd(log10(pData(gbm_cds)$Total_mRNAs)))
+
+  png(paste(outdir,"/densityWithFirstFilter.png",sep =""))
+  print(qplot(Total_mRNAs, data = pData(gbm_cds), geom ="density") + 
+          geom_vline(xintercept = lower_bound) + 
+          geom_vline(xintercept = upper_bound))
+  dev.off()
+  
+  gbm_cds <- gbm_cds[,pData(gbm_cds)$Total_mRNAs > lower_bound &
+                       pData(gbm_cds)$Total_mRNAs < upper_bound]
+
+  # redefining expressed gene after cells filtering
+  gbm_cds <- detectGenes(gbm_cds, min_expr = min_expr)
+  expressed_genes <- row.names(subset(fData(gbm_cds),num_cells_expressed >= num_cells_expressed)) #genes expressed in at least 10 cells of the data set.
+  
+  # Log-transform each value in the expression matrix.
+  L <- log(exprs(gbm_cds[expressed_genes,]))
+
+  # Standardize each gene, so that they are all on the same scale,
+  # Then melt the data with plyr so we can plot it easily
+  melted_dens_df <- reshape2::melt(Matrix::t(scale(Matrix::t(L))))
+
+  # Plot the distribution of the standardized gene expression values.
+  png(paste(outdir,"/standardized_distribution.png",sep =""))
+  print(qplot(value, geom = "density", data = melted_dens_df) +
+        stat_function(fun = dnorm, size = 0.5, color = 'red') +
+        xlab("Standardized log(UMIcounts)") +
+        ylab("Density"))
+  dev.off()
+  
+  ## Visualize percentage of mitochondiral genes
+  gbm_cds <- addPercentMitoch(gbm_cds)
+  
+  png(paste(outdir,"/percent_mito.png",sep =""))
+  plot(hist(gbm_cds$percentMito*100,breaks = 100),main = "percentage of mitochondrial transcripts")
+  abline(v=propMitochFilter*100)
+  dev.off()
+  
+  return(gbm_cds)
+
+}
+
+
--- a/R_src/do_consensus.R
+++ b/R_src/do_consensus.R
+do_consensus <- function(path){
+
+# Opening the predicted files
+predicted_files <- list.files(path)
+predicted_names <- strsplit(predicted_files, split = ".txt")
+predicted_list <- list()
+for(i in 1:length(predicted_files)){
+  name <- predicted_names[[i]]
+  print(name)
+  predicted_list[[name]] <- assign(predicted_names[[i]], read.table(paste0(path, predicted_files[i]), row.names = 1, header = T))
+}
+# adding all the columns to a single table 
+predicted_table <- do.call("cbind", predicted_list)
+# convert to dataframe
+predicted_data <- data.frame(predicted_table)
+# transpose
+predicted_data <- t(predicted_data)
+# creation "Consensus"
+predicted_consensus=data.frame(Consensus = 1:ncol(predicted_data))
+for (i in 1:ncol(predicted_data)){
+  table_compteur <- table(predicted_data[,i])
+  predicted_consensus[i,1] <- names(which.max(table_compteur))
+  rownames(predicted_consensus)[i] <- colnames(predicted_data)[i]
+}
+return(predicted_consensus)
+}
--- a/R_src/filterForDcaScenicCL.R
+++ b/R_src/filterForDcaScenicCL.R
+# Script to make analyzis of pseudotime with a gbm_cds ordered 
+# command line interface
+
+## -----------------------------------------------------------------------------
+## Libraries
+## -----------------------------------------------------------------------------
+
+
+suppressMessages(library(monocle))
+suppressMessages(library(getopt))
+suppressMessages(library(biomaRt))
+
+source("R_src/Enrichment.R")
+
+
+
+spec = matrix(c(
+  'help',        'h', 0, "logical",   "Help about the program",
+  'inputData',  'i', 1, "character", "REQUIRED: matrix data (.tsv generated by the report) from 10X data ordered by monocle.",
+  'outdir',     'o',1, "character", 'Outdir path (default ./)',
+  'firstFilter',   'f',1,"numeric", 'Keep only the genes with at least N = f(X) 2 mean imputed counts across all samples (X=2 by default) \n(e.g. the total number the gene would have, if it was expressed with a value of X in 1% of the cells). \nAdjust X value according to the dataset (it will depend on the dataset units, e.g. UMI, TPMs???).',
+  'secondFilter',  's',1, "numeric", 'Keep the genes that are detected in at least X% of the cells (X=1% by default)'
+), byrow=TRUE, ncol=5);
+
+opt = getopt(spec)
+
+
+
+# if help was asked, print a friendly message
+# and exit with a non-zero error code
+
+args <- commandArgs()
+if ( !is.null(opt$help) | is.null(opt$inputData)) {
+  cat("Gene filtering of a gbm cds ordered. For scenic use, two filters")
+  cat(getopt(spec, usage=TRUE))
+  q(status=1)
+}
+
+#set default arguments values
+
+if (is.null(opt$outdir)) {
+  opt$outdir = "./"
+}
+if (is.null(opt$firstFilter)) {
+  opt$firstFilter = 2
+}
+if (is.null(opt$secondFilter)) {
+  opt$secondFilter = 0.01
+}
+
+data <- read.table(opt$inputData,sep = "\t",header = T,row.names = 1)
+
+genes <- list()
+genes$names <- rownames(data)
+
+
+nCellsPerGene <- apply(data, 1, function(x) sum(x>0))
+nCountsPerGene <- apply(data, 1, sum)
+summary(nCellsPerGene)
+sum(data>0) / sum(data==0)
+
+#first filter
+minReads <- opt$firstFilter*.01*ncol(data)
+genesLeftMinReads <- names(nCountsPerGene)[which(nCountsPerGene > minReads)]
+length(genesLeftMinReads)
+
+#second filter
+minSamples <- ncol(data)*opt$secondFilter
+nCellsPerGene2 <- nCellsPerGene[genesLeftMinReads]
+genesLeftMinCells <- names(nCellsPerGene2)[which(nCellsPerGene2 > minSamples)]
+length(genesLeftMinCells)
+
+
+
+t_data <- t(data)
+
+print(head(genesLeftMinCells))
+
+write.table(t_data[,c(genesLeftMinCells)],file = paste(opt$outdir,"/expressionRawCountFilteredForScenic.tsv",sep=""),sep = '\t',quote = F)
+
+
+
+
+
+
--- a/R_src/filterForScenicCL.R
+++ b/R_src/filterForScenicCL.R
+# Script to make analyzis of pseudotime with a gbm_cds ordered 
+# command line interface
+
+## -----------------------------------------------------------------------------
+## Libraries
+## -----------------------------------------------------------------------------
+# http://stackoverflow.com/questions/4090169/elegant-way-to-check-for-missing-packages-and-install-them
+
+
+suppressMessages(library(monocle))
+suppressMessages(library(getopt))
+
+
+spec = matrix(c(
+  'help',        'h', 0, "logical",   "Help about the program",
+  'inputRDS',  'i', 1, "character", "REQUIRED: 10X data ordered by monocle (.RDS generated by orderCL.R).",
+  'outdir',     'o',1, "character", 'Outdir path (default ./)',
+  'firstFilter',   'f',1,"numeric", 'Keep only the genes with at least N = f(X) UMI counts across all samples (X=2 by default) \n(e.g. the total number the gene would have, if it was expressed with a value of X in 1% of the cells). \nAdjust X value according to the dataset (it will depend on the dataset units, e.g. UMI, TPMs???).',
+  'secondFilter',  's',1, "numeric", 'Keep the genes that are detected in at least X% of the cells (X=1% by default)',
+  'markerTable', 'm',1, "character", "Markers table for the identification of transcription factors",
+  'mouseTF','t', 1, 'character', 'Mouse transcription factors list to identify TF in markers table'
+), byrow=TRUE, ncol=5);
+
+opt = getopt(spec)
+
+
+
+# if help was asked, print a friendly message
+# and exit with a non-zero error code
+
+args <- commandArgs()
+if ( !is.null(opt$help) | is.null(opt$inputRDS)) {
+  cat("Gene filtering of a gbm cds ordered. For scenic use, two filters")
+  cat(getopt(spec, usage=TRUE))
+  q(status=1)
+}
+
+#set default arguments values
+
+if (is.null(opt$outdir)) {
+  opt$outdir = "./"
+}
+if (is.null(opt$firstFilter)) {
+  opt$firstFilter = 2
+}
+if (is.null(opt$secondFilter)) {
+  opt$secondFilter = 0.01
+}
+
+if(!is.null(opt$markerTable)) {
+  print("find TF in marker table...")
+  markerTable <- read.table(opt$markerTable,sep = "\t",header = T)
+  TFlist <- read.table(opt$mouseTF,header = F)
+  TF_markers <- data.frame(external_gene_name = markerTable$gene[which(markerTable$gene %in% TFlist$V1)])
+  
+  #Add tf from bonzanni plus Gfi1b
+  TF_bonzani <- data.frame(external_gene_name = c("Spi1","Tal1","Zfpm1","Cbfa2t3","Erg","Fli1","Gata1","Gata2","Hhex","Runx1","Smad6","Zbtb16","Gfi1b"))
+  TF_markers <- rbind(TF_markers,TF_bonzani)
+  TF_markers <- unique(TF_markers$external_gene_name)
+  
+  write.table(TF_markers,paste(opt$outdir,"/markers_and_bonzanni_TF.tsv",sep =""),
+              sep = "\t",quote = F,row.names = F, col.names = F)
+}
+
+gbm_cds <- readRDS(opt$inputRDS)
+
+# Convert to monocle object if necessary (filtering script initially made for monocle)
+
+if(is(gbm_cds) == "Seurat") {
+  #It is easier to pass by a monocle object and then reget the seurat object with the genes filtered
+  print("input seurat object converting it to monocle")
+  DefaultAssay(object = gbm_cds) <- "RNA"
+  
+  pd <- new("AnnotatedDataFrame", data = gbm_cds@meta.data)
+  fd <- new("AnnotatedDataFrame", data = data.frame(gene_short_name = rownames(gbm_cds))) 
+  rownames(fd) <- fd$gene_short_name
+  
+  
+  gbm_cds <- newCellDataSet(GetAssayData(gbm_cds,slot = "counts"),
+                            phenoData = pd,
+                            featureData = fd,
+                            lowerDetectionLimit = 0.1,
+                            expressionFamily = negbinomial.size())
+  
+  gbm_cds <- detectGenes(gbm_cds, min_expr = 0.1)
+  
+  
+  
+}
+
+exp <- as.matrix(exprs(gbm_cds))
+geneShortName<-make.unique(as.character(fData(gbm_cds)$gene_short_name)) # to see between make.unique, remove dup..
+rownames(exp) <- geneShortName
+
+nCellsPerGene <- apply(exp, 1, function(x) sum(x>0))
+nCountsPerGene <- apply(exp, 1, sum)
+summary(nCellsPerGene)
+sum(exp>0) / sum(exp==0)
+
+#first filter
+minReads <- opt$firstFilter*.01*ncol(exp)
+genesLeftMinReads <- names(nCountsPerGene)[which(nCountsPerGene > minReads)]
+length(genesLeftMinReads)
+
+#second filter
+minSamples <- ncol(exp)*opt$secondFilter
+nCellsPerGene2 <- nCellsPerGene[genesLeftMinReads]
+genesLeftMinCells <- names(nCellsPerGene2)[which(nCellsPerGene2 > minSamples)]
+length(genesLeftMinCells)
+
+t_exp <- t(exp)
+
+print(head(genesLeftMinCells))
+
+write.table(t_exp[,genesLeftMinCells],file = paste(opt$outdir,"/expressionRawCountFilteredForScenic.tsv",sep=""),sep = '\t',quote = F)
+
+
+
+
+
--- a/R_src/filterSeuratForScenicCL.R
+++ b/R_src/filterSeuratForScenicCL.R
+# Script to make analyzis of pseudotime with a gbm_cds ordered 
+# command line interface
+
+## -----------------------------------------------------------------------------
+## Libraries
+## -----------------------------------------------------------------------------
+# http://stackoverflow.com/questions/4090169/elegant-way-to-check-for-missing-packages-and-install-them
+
+
+suppressMessages(library(Seurat))
+suppressMessages(library(getopt))
+
+
+spec = matrix(c(
+  'help',        'h', 0, "logical",   "Help about the program",
+  'inputRDS',  'i', 1, "character", "REQUIRED: 10X data ordered by monocle (.RDS generated by orderCL.R).",
+  'outdir',     'o',1, "character", 'Outdir path (default ./)',
+  'firstFilter',   'f',1,"numeric", 'Keep only the genes with at least N = f(X) UMI counts across all samples (X=2 by default) \n(e.g. the total number the gene would have, if it was expressed with a value of X in 1% of the cells). \nAdjust X value according to the dataset (it will depend on the dataset units, e.g. UMI, TPMs???).',
+  'secondFilter',  's',1, "numeric", 'Keep the genes that are detected in at least X% of the cells (X=1% by default)',
+  'mouseTF','t', 1, 'character', 'Mouse transcription factors list to identify TF in markers table'
+), byrow=TRUE, ncol=5);
+
+opt = getopt(spec)
+
+
+
+# if help was asked, print a friendly message
+# and exit with a non-zero error code
+
+args <- commandArgs()
+if ( !is.null(opt$help) | is.null(opt$inputRDS)) {
+  cat("Gene filtering of a gbm cds ordered. For scenic use, two filters")
+  cat(getopt(spec, usage=TRUE))
+  q(status=1)
+}
+
+#set default arguments values
+
+if (is.null(opt$outdir)) {
+  opt$outdir = "./"
+}
+if (is.null(opt$firstFilter)) {
+  opt$firstFilter = 2
+}
+if (is.null(opt$secondFilter)) {
+  opt$secondFilter = 0.01
+}
+
+seurat <- readRDS(opt$inputRDS)
+
+DefaultAssay(object = seurat) <- "RNA"
+
+exp <- as.matrix(GetAssayData(seurat,slot = "counts"))
+
+
+nCellsPerGene <- apply(exp, 1, function(x) sum(x>0))
+nCountsPerGene <- apply(exp, 1, sum)
+summary(nCellsPerGene)
+sum(exp>0) / sum(exp==0)
+
+#first filter
+minReads <- opt$firstFilter*.01*ncol(exp)
+genesLeftMinReads <- names(nCountsPerGene)[which(nCountsPerGene > minReads)]
+length(genesLeftMinReads)
+
+#second filter
+minSamples <- ncol(exp)*opt$secondFilter
+nCellsPerGene2 <- nCellsPerGene[genesLeftMinReads]
+genesLeftMinCells <- names(nCellsPerGene2)[which(nCellsPerGene2 > minSamples)]
+length(genesLeftMinCells)
+
+t_exp <- t(exp)
+
+print(head(genesLeftMinCells))
+
+
+
+
+t_exp <- t(exp)
+
+print(head(genesLeftMinCells))
+
+write.table(t_exp[,genesLeftMinCells],file = paste(opt$outdir,"/expressionRawCountFilteredForScenic.tsv",sep=""),sep = '\t',quote = F)
+
+
+
+
+