1. 02 Dec, 2021 1 commit
  2. 29 Nov, 2021 7 commits
    • Adrien Mazuel's avatar
      Update REAME · 0804e525
      Adrien Mazuel authored
      0804e525
    • Adrien Mazuel's avatar
      Remove old conda environment · d6e0b33f
      Adrien Mazuel authored
      d6e0b33f
    • Adrien Mazuel's avatar
      Rm non used python script · b61dad0c
      Adrien Mazuel authored
      b61dad0c
    • Adrien Mazuel's avatar
      Remove file not used in the pipeline · 5a31acd6
      Adrien Mazuel authored
      5a31acd6
    • Adrien Mazuel's avatar
      60868c88
    • Adrien Mazuel's avatar
      Change the file name · 46393697
      Adrien Mazuel authored
      46393697
    • Adrien Mazuel's avatar
      import os · afbb2f54
      Adrien Mazuel authored
      import re
      
      # Get samples and ages from config file
      samples = config["samples"]
      samples = samples.split(",")
      print(samples)
      
      SMPATAC = config["samplesATAC"]
      SMPATAC = SMPATAC.split(",")
      print(SMPATAC)
      
      ages = config["ages"]
      ages = ages.split(",")
      print(ages)
      
      # Get conda env file from config file
      seurat_monocle_env = config["seurat_monocle_env"]
      pyscenic_env = config["pyscenic_env"]
      signacSeurat4_env = config["signacSeurat4_env"]
      streamAnalysis_env = config["streamAnalysis_env"]
      
      feather_database = config["scenic"]["cis_target_url"].split("/")[-1]
      print(feather_database)
      
      # Define base directory
      baseDir = config["dir"]["base"]
      os.chdir(baseDir)
      
      print(samples)
      
      scenicRuns = list(range(50))
      
      rule all:
          input: #"report/final_report.html",
                 "output/RNA/Seurat4_integration/combined.rds",
                 #"output/RNA/MonocleIntegration/monocleWithCounts.rds",
                 #"output/RNA/regulonAnalysis/mainRegulonTable.tsv",
                 #"output/RNA/regulonAnalysis/clusterMarkerRegulonTable.tsv",
                 "output/ATAC/integration/atacCombined.rds",
                 #"output/TransferRnaAtac/coembedAll.rds",
                 #"output/motifsEnv_installed",
                 #"output/TransferRnaAtac/SignacMotif/atac_motif.rds",
                 #expand("output/ATAC/smp/{smpAtac}/SignacMotif/atac_motif.rds", smpAtac = SMPATAC),
                 "output/RNA/regulonAnalysis/mainRegulonTable.tsv",
                 #"output/RNA/regulonAnalysis/clusterMarkerRegulonTable.tsv",
                 # "output/Nestorowa/seuratForCastle.rds"
                 #"output/signacSeurat4_installed_v2",
                 #"output/TransferRnaAtac/coembedAll.rds",
                 #"output/motifsEnv_installed_v2",
                 #"output/TransferRnaAtac/SignacMotif/atac_motif.rds",
                 #"output/TransferRnaAtac/SignacMotif_2/atac_motif.rds"
                 #"output/RNA/regulonAnalysis/seuratAUC.rds",
                 "output/ATAC/integration_v2/atac_merged_integrated.rds"
      
      rule untar:
          input: "input/RAW/{smp}/barcodes.tsv.gz",
                 "input/RAW/{smp}/features.tsv.gz",
                 "input/RAW/{smp}/matrix.mtx.gz",
          output:"input/RAW/{smp}/barcodes.tsv",
                 "input/RAW/{smp}/features.tsv",
                 "input/RAW/{smp}/matrix.mtx",
          shell: "cd input/RAW/{wildcards.smp};gunzip *.gz"
      
      rule install_seurat4:
        conda: signacSeurat4_env
        output: "output/signacSeurat4_installed"
        shell: "Rscript R_src/installSignacSeurat4.R;touch {output}"
      
      rule load_10X_data:
          input: "input/RAW/{smp}/barcodes.tsv",
                 "input/RAW/{smp}/features.tsv",
                 "input/RAW/{smp}/matrix.mtx",
                 "output/signacSeurat4_installed"
          output: "input/10X_data_{smp}.rds"
          params: age = lambda wildcards : config[wildcards.smp]["age"],
                  runDate = lambda wildcards : config[wildcards.smp]["date"],
                  cellranger = config["cellrangerVersion"],
                  condition = lambda wildcards : config[wildcards.smp]["condition"]
          threads: 1
          conda: signacSeurat4_env
          shell: "Rscript R_src/load_10X_Seurat.R -c {params.cellranger} -b input/RAW/{wildcards.smp} -f {output} \
          -s age={params.age},runDate={params.runDate},sampleName={wildcards.smp},condition={params.condition}"
      
      rule prepare_data_for_seurat:
          input: data = "input/10X_data_{smp}.rds"
          output: "output/RNA/{smp}/dataPreparation/seurat_treated.rds"
          threads: 1
          conda: seurat_monocle_env
          shell: "Rscript R_src/prepareSeuratCL.R -i {input.data} -o output/RNA/{wildcards.smp}/dataPreparation -m 0.1 -u 2 -l 3 -m 5"
      
      rule download_signatures:
          params: geiger = config["publicSignatures"]["geiger_url"],
                  chambers = config["publicSignatures"]["chambers_url"],
          output: geiger = "publicData/geiger_signatures.xls",
                  chambers = "publicData/chambers_signatures.xls",
          shell: """wget -cO - {params.geiger} > {output.geiger};
          wget -cO - {params.chambers} > {output.chambers}"""
      
      rule get_signatures:
          input: geiger = "publicData/geiger_signatures.xls",
                 chambers = "publicData/chambers_signatures.xls",
          output: "output/signatures/publicSignatures.rds"
          threads: 12
          conda: signacSeurat4_env
          shell : "Rscript R_src/getSignaturesCL.R -t {input.geiger} -m {input.chambers} -o output/signatures/"
      
      rule seurat4:
          input: dataPrepared =  "output/RNA/{smp}/dataPreparation/seurat_treated.rds",
                 sig = "output/signatures/publicSignatures.rds",
          output: "output/RNA/{smp}/Seurat4/seurat.rds"
          conda: signacSeurat4_env
          params: num_dim = config["seurat3"]["dim"],
                  cr = config["seurat3"]["correction"],
                  minPropCell = config["seurat3"]["minPropCellExp"],
                  res = config["seurat3"]["resolution"]
          threads: 1
          shell: "Rscript R_src/Seurat4CL.R -i {input.dataPrepared} \
          -o output/RNA/{wildcards.smp}/Seurat4 -n {params.num_dim} \
          -b {params.cr} -s {input.sig}\
          -r {params.res} -z logNorm"
      
      rule seurat4_integration:
          input: expand("output/RNA/{smp}/Seurat4/seurat.rds",smp=samples)
          output: "output/RNA/Seurat4_integration/combined.rds"
          conda: signacSeurat4_env
          params: cr = config["seurat_integration"]["correction"],
                  num_dim_smp = config["seurat_integration"]["dim"],
                  num_dim_int = config["seurat_integration"]["num_dim_int"],
                  smpList = config["seurat_integration"]["smp_list"],
                  res = config["seurat_integration"]["resolution"]
          threads: 1
          shell: "Rscript R_src/Seurat4_integration.R -i {params.smpList} -n {params.num_dim_smp} -N {params.num_dim_int} -o output/RNA/Seurat4_integration -r {params.res} -b {params.cr} -z logNorm"
      
      rule seurat4_analysis:
          input: seurat = "output/RNA/Seurat4_integration/combined.rds"
          output: "output/RNA/Seurat4_integration/Analysis/seurat_annotated.rds"
          conda: signacSeurat4_env
          params: res = config["seurat_analysis"]["resolution"],
                  logFC = config["seurat_analysis"]["logfc_threshold"],
                  padj = config["seurat_analysis"]["pval_adj"],
                  assay = config["seurat_analysis"]["assay"],
                  clusters = config["seurat_analysis"]["clusters"],
                  signatures = config["seurat_analysis"]["signatures"]
          threads: 1
          shell: "Rscript R_src/seuratAnalysisCL.R -i {input.seurat} -o output/RNA/Seurat4_integration/Analysis/ \
          -r {params.res} -s {params.signatures} -c {params.clusters} \
          -f {params.logFC} -p {params.padj} -a {params.assay}"
      
      rule prepare_STREAM_analysis:
          input: seurat = "output/RNA/Seurat4_integration/Analysis/seurat_annotated.rds"
          output: "output/RNA/prepareSTREAM/All_colorCellCyclePhases.tsv"
          conda: signacSeurat4_env
          params: prefix = config["prepare_STREAM"]["prefix"]
          threads: 1
          shell: "Rscript R_src/prepareSTREAManalysisCL.R -i {input.seurat} -o output/RNA/prepareSTREAM/ -p {params.prefix}"
      
      rule run_STREAM_analysis:
          input: matrix = "output/RNA/streamFiltering/All_cells_scaleDataForStream.tsv"
          output: "output/RNA/streamAnalysis/stream_metadata.csv"
          conda: streamAnalysis_env
          params: cells = config["STREAM_analysis"]["cells_clusters"],
                  colors = config["STREAM_analysis"]["colors_clusters"],
                  epg_alpha = config["STREAM_analysis"]["epg_alpha"],
                  epg_mu = config["STREAM_analysis"]["epg_mu"],
                  epg_lambda = config["STREAM_analysis"]["epg_lambda"]
          threads: 1
          shell: "python py_src/streamPseudotime.py -i {input.matrix} -o output/RNA/streamAnalysis/ \
          -c {params.cells} -v {params.colors} -a {params.epg_alpha} -m {params.epg_mu} -l {params.epg_lambda}"
      
      rule seurat_STREAM:
          input: stream = "output/RNA/streamAnalysis/stream_metadata.csv",
                 seurat = "output/RNA/Seurat4_integration/Analysis/seurat_annotated.rds"
          output: "output/RNA/streamAnalysisSeurat/seuratPseudotime.rds"
          conda: signacSeurat4_env
          params: metaData = config["seurat_STREAM"]["metaData"],
                  refPseudotime = config["seurat_STREAM"]["refPseudotime"],
                  treeShape = config["seurat_STREAM"]["treeShape"]
          threads: 1
          shell: "Rscript R_src/streamSeuratCL.R -i {input.seurat} -s {input.stream} -o output/RNA/streamAnalysisSeurat/ \
          -m {params.metaData} -p {params.refPseudotime} -t {params.treeShape}"
      
      rule prepare_nestorowa_data:
        input: "input/Nestorowa/GSE81682_HTSeq_counts.txt","input/Nestorowa/all_cell_types.txt"
        output: "output/Nestorowa/seuratForActinn.rds"
        conda: signacSeurat4_env
        shell: "Rscript R_src/prepareNestorowaCL.R -i {input[0]} -m {input[1]} -o output/Nestorowa"
      ####################### Scenic part of the workflow ####################################
      
      rule get_databases_for_scenic:
          output: feather="publicData/database/mm9-tss-centered-10kb-7species.mc9nr.feather",
                  tbl="publicData/database/motifs-v9-nr.mgi-m0.001-o0.0.tbl"
          params: feather=config["scenic"]["cis_target_url"],
                  tbl=config["scenic"]["motif_url"],
                  sums=config["scenic"]["sums"],
                  feather_name=feather_database
          shell: """cd publicData/database; \
          wget {params.feather}; \
          wget {params.tbl}; \
          wget {params.sums}; \
          awk -v feather_database={params.feather_name} '$2 == feather_database' sha256sum.txt | sha256sum -c -"""
      
      rule prepare_tf_list:
           input: "publicData/database/motifs-v9-nr.mgi-m0.001-o0.0.tbl"
           output: "output/publicData/mm_mgi_tfs.txt"
           conda: pyscenic_env
           shell: "python py_src/getTfList.py -i {input} -o {output}"
      
      rule filter_genes_for_scenic_all:
          input: "output/RNA/Seurat4_integration/combined.rds",
                 "output/publicData/mm_mgi_tfs.txt"
          output: "output/RNA/ScenicAll/expressionRawCountFilteredForScenic.tsv",
          conda: signacSeurat4_env
          shell: "Rscript R_src/filterSeuratForScenicCL.R -i {input[0]} -t {input[1]} -o output/RNA/ScenicAll/"
      
      rule split_by_condition:
          input: "output/RNA/ScenicAll/expressionRawCountFilteredForScenic.tsv",
          output: "output/RNA/ScenicRNA_multipleRuns_{smp}/expressionRawCountFilteredForScenic.tsv"
          conda: pyscenic_env
          shell:
              "head -1 {input} > {output}; cat {input} | grep '^{wildcards.smp}' >> {output}"
      
      rule multiple_grnboost_rna:
          input: "output/RNA/ScenicAll/expressionRawCountFilteredForScenic.tsv",
                  "output/publicData/mm_mgi_tfs.txt"
          output: "output/RNA/ScenicRNA_multipleRuns/GRNboost/run{run}_GRNboost.tsv"
          threads: 20
          conda: pyscenic_env
          shell:
              "pyscenic grn --seed {wildcards.run} --num_workers {threads} {input[0]} {input[1]} -o {output}"
      
      rule multiple_grnboost_rna_by_condition:
          input: "output/RNA/ScenicRNA_multipleRuns_{smp}/expressionRawCountFilteredForScenic.tsv",
                 "output/publicData/mm_mgi_tfs.txt"
          output: "output/RNA/ScenicRNA_multipleRuns_{smp}/GRNboost/run{run}_GRNboost.tsv"
          threads: 20
          conda: pyscenic_env
          shell:
              "pyscenic grn --seed {wildcards.run} --num_workers {threads} {input[0]} {input[1]} -o {output}"
      
      rule multiple_prune_modules_rna_maskDropouts_with_cis_target_csv:
          input:
            exp="output/RNA/ScenicAll/expressionRawCountFilteredForScenic.tsv",
            GRN_boost= "output/RNA/ScenicRNA_multipleRuns/GRNboost/run{run}_GRNboost.tsv",
            TF_motif="publicData/database/motifs-v9-nr.mgi-m0.001-o0.0.tbl",
            cis_motif="publicData/database/mm9-tss-centered-10kb-7species.mc9nr.feather"
          output:"output/RNA/ScenicRNA_multipleRuns/cis_target_maskDropouts/run{run}_regulons.csv"
          threads:10
          conda:pyscenic_env
          shell:
              "pyscenic ctx --annotations_fname {input.TF_motif} -a --mask_dropouts\
       --num_workers {threads} \
       --expression_mtx_fname {input.exp} -o {output} \
       {input.GRN_boost} {input.cis_motif}"
      
      rule multiple_prune_modules_rna_maskDropouts_with_cis_target_csv_by_condition:
          input:
            exp="output/RNA/ScenicRNA_multipleRuns_{smp}/expressionRawCountFilteredForScenic.tsv",
            GRN_boost="output/RNA/ScenicRNA_multipleRuns_{smp}/GRNboost/run{run}_GRNboost.tsv",
            TF_motif="publicData/database/motifs-v9-nr.mgi-m0.001-o0.0.tbl",
            cis_motif="publicData/database/mm9-tss-centered-10kb-7species.mc9nr.feather"
          output:"output/RNA/ScenicRNA_multipleRuns_{smp}/cis_target_maskDropouts/run{run}_regulons.csv"
          threads:12
          conda:pyscenic_env
          shell:
              "pyscenic ctx --annotations_fname {input.TF_motif} -a --mask_dropouts\
       --num_workers {threads} \
       --expression_mtx_fname {input.exp} -o {output} \
       {input.GRN_boost} {input.cis_motif}"
      
      rule multiple_motifEnriched2regulons_rna_maskDropouts:
          input: "output/RNA/ScenicRNA_multipleRuns/cis_target_maskDropouts/run{run}_regulons.csv"
          output: "output/RNA/ScenicRNA_multipleRuns/cis_target_maskDropouts/run{run}_regulons.json"
          conda:pyscenic_env
          shell: "python py_src/motif2regulon.py -i {input} -o {output}"
      
      rule multiple_motifEnriched2regulons_rna_maskDropouts_by_condition:
          input: "output/RNA/ScenicRNA_multipleRuns_{smp}/cis_target_maskDropouts/run{run}_regulons.csv"
          output: "output/RNA/ScenicRNA_multipleRuns_{smp}/cis_target_maskDropouts/run{run}_regulons.json"
          conda:pyscenic_env
          shell: "python py_src/motif2regulon.py -i {input} -o {output}"
      
      rule merging_multiple_runs:
          input: files = expand("output/RNA/ScenicRNA_multipleRuns/cis_target_maskDropouts/run{run}_regulons.json",run = scenicRuns),
                 adjacencies =  expand("output/RNA/ScenicRNA_multipleRuns/GRNboost/run{run}_GRNboost.tsv",run = scenicRuns)
          output: "output/RNA/ScenicRNA_multipleRuns/cis_target_maskDropouts/aggregatedRegulons.json",
                  "output/RNA/ScenicRNA_multipleRuns/cis_target_maskDropouts/aggregatedRegulonsMeta.json",
                  "output/RNA/ScenicRNA_multipleRuns/cis_target_maskDropouts/adja.sqlite"
          shell: "python py_src/aggregateMultiRun.py -r output/RNA/ScenicRNA_multipleRuns/cis_target_maskDropouts/ -c 0.8 -a output/RNA/ScenicRNA_multipleRuns/GRNboost/ -o output/RNA/ScenicRNA_multipleRuns/cis_target_maskDropouts/"
      
      rule merging_multiple_runs_by_condition:
          input: files = expand("output/RNA/ScenicRNA_multipleRuns_{smp}/cis_target_maskDropouts/run{run}_regulons.json",run = scenicRuns,smp =samples),
                 adjacencies =  expand("output/RNA/ScenicRNA_multipleRuns_{smp}/GRNboost/run{run}_GRNboost.tsv",run = scenicRuns,smp =samples),
          output: "output/RNA/ScenicRNA_multipleRuns_{smp}/cis_target_maskDropouts/aggregatedRegulons.json",
                  "output/RNA/ScenicRNA_multipleRuns_{smp}/cis_target_maskDropouts/aggregatedRegulonsMeta.json",
                  "output/RNA/ScenicRNA_multipleRuns_{smp}/cis_target_maskDropouts/adja.sqlite"
          shell: "python py_src/aggregateMultiRun.py -r output/RNA/ScenicRNA_multipleRuns_{wildcards.smp}/cis_target_maskDropouts/\
          -a output/RNA/ScenicRNA_multipleRuns_{wildcards.smp}/GRNboost/ -c 0.8 \
          -o output/RNA/ScenicRNA_multipleRuns_{wildcards.smp}/cis_target_maskDropouts/"
      
      rule multiple_aucell_TF_seurat_regulon_rna_maskDropouts:
          input:"output/RNA/ScenicAll/expressionRawCountFilteredForScenic.tsv",
                "output/RNA/ScenicRNA_multipleRuns/cis_target_maskDropouts/aggregatedRegulons.json"
          output:"output/RNA/ScenicRNA_multipleRuns/AUCell_maskDropouts/regulons_enrichment.csv"
      
          threads:12
          conda:pyscenic_env
          shell:
              "python3.6 py_src/aucellJson.py -c {threads} -s 2020 -e {input[0]} -r {input[1]} -o output/RNA/ScenicRNA_multipleRuns/AUCell_maskDropouts/"
      
      rule creating_regulonTable:
          input: main = "output/RNA/ScenicRNA_multipleRuns/cis_target_maskDropouts/aggregatedRegulonsMeta.json",
                 supp = expand("output/RNA/ScenicRNA_multipleRuns_{smp}/cis_target_maskDropouts/aggregatedRegulonsMeta.json",smp=samples)
          output: "output/RNA/regulonAnalysis/mainRegulonTable.tsv"
          params: condName = "RA+CT",
                  tf_database = "output/publicData/mm_mgi_tfs.txt",
                  supp = "+".join(expand("output/RNA/ScenicRNA_multipleRuns_{smp}/cis_target_maskDropouts/aggregatedRegulonsMeta.json",smp=samples))
          conda: seurat_monocle_env
          shell:
              "Rscript R_src/makeRegulonTableCL.R -o output/RNA/regulonAnalysis/ -t {params.tf_database} -r {input.main} -s {params.supp} -n {params.condName}"
      
      rule analyseRegulonScore:
          input: seurat =  "report/seurat_promyelocytes.rds",
                 tfList = "input/selectedTF.txt",
                 regulonScore =  "output/RNA/ScenicRNA_multipleRuns/AUCell_maskDropouts/regulons_enrichment.csv"
          output: "output/RNA/regulonAnalysis/clusterMarkerRegulonTable.tsv"
          params: scoreDiff = config["regulonAnalysis"]["scoreDiff"]
          conda: seurat_monocle_env
          shell:
            "Rscript R_src/regulonMarkerCL.R -i {input.seurat} -t {input.tfList} \
            -r {input.regulonScore} -o output/RNA/regulonAnalysis/ -c -a condition"
      
      ####################### ATAC seq part of the workflow ####################################
      
      rule process_atac_smp:
        input:"input/RAW_atac/{smpAtac}/filtered_peak_bc_matrix.h5",
              "input/RAW_atac/{smpAtac}/singlecell.csv",
              "input/RAW_atac/{smpAtac}/fragments.tsv.gz",
              "output/signacSeurat4_installed"
        output:"output/ATAC/smp/{smpAtac}/atac.rds"
        params: age = lambda wildcards : config[wildcards.smpAtac]["age"],
                runDate = lambda wildcards : config[wildcards.smpAtac]["date"],
                condition = lambda wildcards : config[wildcards.smpAtac]["condition"]
        threads:12
        conda: signacSeurat4_env
        shell: "Rscript R_src/signacSmpCL.R -i input/RAW_atac/{wildcards.smpAtac}/ -o output/ATAC/smp/{wildcards.smpAtac}/ \
        -u 2 -l 2 -t 2 -c 2 -r 30 -b 0.05 \
        -s age={params.age},runDate={params.runDate},sampleName={wildcards.smpAtac},condition={params.condition}"
      
      rule integrate_atac:
        input: expand("output/ATAC/smp/{smpAtac}/atac.rds",smpAtac = SMPATAC),
               "output/signacSeurat4_installed"
        output:"output/ATAC/integration/atacCombined.rds"
        params: smpList = config["signac_integration"]["smp_list"],
                minCutOff = config["signac_integration"]["minCutOff"],
                subsetInterPeaks = config["signac_integration"]["subsetInterPeaks"]
        conda: signacSeurat4_env
        shell: "Rscript R_src/signac_integration.R -i {params.smpList} -o output/ATAC/integration/ -m {params.minCutOff} -s {params.subsetInterPeaks}"
      
      rule transfer_rna_atac:
        input: "output/RNA/Seurat4_integration/combined.rds",
               expand("output/ATAC/smp/{smpAtac}/atac.rds",smpAtac = SMPATAC),
               "output/signacSeurat4_installed_v2"
        output:"output/TransferRnaAtac/coembedAll.rds"
        conda: signacSeurat4_env
        shell: "Rscript R_src/transferRnaAtacCL.R -i {input[0]} -a {input[1]} -b {input[2]} -d -N 50 -r integrated_snn_res.0.3 -o output/TransferRnaAtac/"
      
      rule install_signac_motif_packages:
        conda: signacSeurat4_env
        output: "output/motifsEnv_installed"
        shell: "Rscript R_src/installMotifsEnv.R;touch {output}"
      
      rule signac_motif_integrated:
        input: "output/TransferRnaAtac/atac_integrated.rds"
        output: "output/TransferRnaAtac/SignacMotif/atac_motif.rds"
        params: clustName = config["signac_motif_integrated"]["cluster_name"],
                clustLevel = config["signac_motif_integrated"]["cluster_level"],
                clustCol = config["signac_motif_integrated"]["cluster_col"]
        conda: signacSeurat4_env
        shell: "Rscript R_src/SignacMotifAnnotCL.R -i {input} -o output/TransferRnaAtac/SignacMotif -s 9606 -a peaks -c {params.clustCol} -n {params.clustName} -l {params.clustLevel} -I TRUE"
      
      rule seurat_signac_annotAUC_enhancer_promoter:
        input: "output/TransferRnaAtac/SignacMotif_2/atac_motif.rds",
               "output/RNA/Seurat4_integration/Report_20210412/annotated_update.rds",
               "output/RNA/ScenicRNA_multipleRuns/AUCell_maskDropouts/regulons_enrichment.csv"
        output: "output/RNA/regulonAnalysis/seuratAUC.rds"
        conda: signacSeurat4_env
        shell: "Rscript R_src/prepareAUCseuratEnhancePromoseuratCL.R -i {input[1]} -a {input[0]} -r {input[2]} -o output/"
      afbb2f54
  3. 25 Nov, 2021 7 commits