• Adrien Mazuel's avatar
    import os · afbb2f54
    Adrien Mazuel authored
    import re
    
    # Get samples and ages from config file
    samples = config["samples"]
    samples = samples.split(",")
    print(samples)
    
    SMPATAC = config["samplesATAC"]
    SMPATAC = SMPATAC.split(",")
    print(SMPATAC)
    
    ages = config["ages"]
    ages = ages.split(",")
    print(ages)
    
    # Get conda env file from config file
    seurat_monocle_env = config["seurat_monocle_env"]
    pyscenic_env = config["pyscenic_env"]
    signacSeurat4_env = config["signacSeurat4_env"]
    streamAnalysis_env = config["streamAnalysis_env"]
    
    feather_database = config["scenic"]["cis_target_url"].split("/")[-1]
    print(feather_database)
    
    # Define base directory
    baseDir = config["dir"]["base"]
    os.chdir(baseDir)
    
    print(samples)
    
    scenicRuns = list(range(50))
    
    rule all:
        input: #"report/final_report.html",
               "output/RNA/Seurat4_integration/combined.rds",
               #"output/RNA/MonocleIntegration/monocleWithCounts.rds",
               #"output/RNA/regulonAnalysis/mainRegulonTable.tsv",
               #"output/RNA/regulonAnalysis/clusterMarkerRegulonTable.tsv",
               "output/ATAC/integration/atacCombined.rds",
               #"output/TransferRnaAtac/coembedAll.rds",
               #"output/motifsEnv_installed",
               #"output/TransferRnaAtac/SignacMotif/atac_motif.rds",
               #expand("output/ATAC/smp/{smpAtac}/SignacMotif/atac_motif.rds", smpAtac = SMPATAC),
               "output/RNA/regulonAnalysis/mainRegulonTable.tsv",
               #"output/RNA/regulonAnalysis/clusterMarkerRegulonTable.tsv",
               # "output/Nestorowa/seuratForCastle.rds"
               #"output/signacSeurat4_installed_v2",
               #"output/TransferRnaAtac/coembedAll.rds",
               #"output/motifsEnv_installed_v2",
               #"output/TransferRnaAtac/SignacMotif/atac_motif.rds",
               #"output/TransferRnaAtac/SignacMotif_2/atac_motif.rds"
               #"output/RNA/regulonAnalysis/seuratAUC.rds",
               "output/ATAC/integration_v2/atac_merged_integrated.rds"
    
    rule untar:
        input: "input/RAW/{smp}/barcodes.tsv.gz",
               "input/RAW/{smp}/features.tsv.gz",
               "input/RAW/{smp}/matrix.mtx.gz",
        output:"input/RAW/{smp}/barcodes.tsv",
               "input/RAW/{smp}/features.tsv",
               "input/RAW/{smp}/matrix.mtx",
        shell: "cd input/RAW/{wildcards.smp};gunzip *.gz"
    
    rule install_seurat4:
      conda: signacSeurat4_env
      output: "output/signacSeurat4_installed"
      shell: "Rscript R_src/installSignacSeurat4.R;touch {output}"
    
    rule load_10X_data:
        input: "input/RAW/{smp}/barcodes.tsv",
               "input/RAW/{smp}/features.tsv",
               "input/RAW/{smp}/matrix.mtx",
               "output/signacSeurat4_installed"
        output: "input/10X_data_{smp}.rds"
        params: age = lambda wildcards : config[wildcards.smp]["age"],
                runDate = lambda wildcards : config[wildcards.smp]["date"],
                cellranger = config["cellrangerVersion"],
                condition = lambda wildcards : config[wildcards.smp]["condition"]
        threads: 1
        conda: signacSeurat4_env
        shell: "Rscript R_src/load_10X_Seurat.R -c {params.cellranger} -b input/RAW/{wildcards.smp} -f {output} \
        -s age={params.age},runDate={params.runDate},sampleName={wildcards.smp},condition={params.condition}"
    
    rule prepare_data_for_seurat:
        input: data = "input/10X_data_{smp}.rds"
        output: "output/RNA/{smp}/dataPreparation/seurat_treated.rds"
        threads: 1
        conda: seurat_monocle_env
        shell: "Rscript R_src/prepareSeuratCL.R -i {input.data} -o output/RNA/{wildcards.smp}/dataPreparation -m 0.1 -u 2 -l 3 -m 5"
    
    rule download_signatures:
        params: geiger = config["publicSignatures"]["geiger_url"],
                chambers = config["publicSignatures"]["chambers_url"],
        output: geiger = "publicData/geiger_signatures.xls",
                chambers = "publicData/chambers_signatures.xls",
        shell: """wget -cO - {params.geiger} > {output.geiger};
        wget -cO - {params.chambers} > {output.chambers}"""
    
    rule get_signatures:
        input: geiger = "publicData/geiger_signatures.xls",
               chambers = "publicData/chambers_signatures.xls",
        output: "output/signatures/publicSignatures.rds"
        threads: 12
        conda: signacSeurat4_env
        shell : "Rscript R_src/getSignaturesCL.R -t {input.geiger} -m {input.chambers} -o output/signatures/"
    
    rule seurat4:
        input: dataPrepared =  "output/RNA/{smp}/dataPreparation/seurat_treated.rds",
               sig = "output/signatures/publicSignatures.rds",
        output: "output/RNA/{smp}/Seurat4/seurat.rds"
        conda: signacSeurat4_env
        params: num_dim = config["seurat3"]["dim"],
                cr = config["seurat3"]["correction"],
                minPropCell = config["seurat3"]["minPropCellExp"],
                res = config["seurat3"]["resolution"]
        threads: 1
        shell: "Rscript R_src/Seurat4CL.R -i {input.dataPrepared} \
        -o output/RNA/{wildcards.smp}/Seurat4 -n {params.num_dim} \
        -b {params.cr} -s {input.sig}\
        -r {params.res} -z logNorm"
    
    rule seurat4_integration:
        input: expand("output/RNA/{smp}/Seurat4/seurat.rds",smp=samples)
        output: "output/RNA/Seurat4_integration/combined.rds"
        conda: signacSeurat4_env
        params: cr = config["seurat_integration"]["correction"],
                num_dim_smp = config["seurat_integration"]["dim"],
                num_dim_int = config["seurat_integration"]["num_dim_int"],
                smpList = config["seurat_integration"]["smp_list"],
                res = config["seurat_integration"]["resolution"]
        threads: 1
        shell: "Rscript R_src/Seurat4_integration.R -i {params.smpList} -n {params.num_dim_smp} -N {params.num_dim_int} -o output/RNA/Seurat4_integration -r {params.res} -b {params.cr} -z logNorm"
    
    rule seurat4_analysis:
        input: seurat = "output/RNA/Seurat4_integration/combined.rds"
        output: "output/RNA/Seurat4_integration/Analysis/seurat_annotated.rds"
        conda: signacSeurat4_env
        params: res = config["seurat_analysis"]["resolution"],
                logFC = config["seurat_analysis"]["logfc_threshold"],
                padj = config["seurat_analysis"]["pval_adj"],
                assay = config["seurat_analysis"]["assay"],
                clusters = config["seurat_analysis"]["clusters"],
                signatures = config["seurat_analysis"]["signatures"]
        threads: 1
        shell: "Rscript R_src/seuratAnalysisCL.R -i {input.seurat} -o output/RNA/Seurat4_integration/Analysis/ \
        -r {params.res} -s {params.signatures} -c {params.clusters} \
        -f {params.logFC} -p {params.padj} -a {params.assay}"
    
    rule prepare_STREAM_analysis:
        input: seurat = "output/RNA/Seurat4_integration/Analysis/seurat_annotated.rds"
        output: "output/RNA/prepareSTREAM/All_colorCellCyclePhases.tsv"
        conda: signacSeurat4_env
        params: prefix = config["prepare_STREAM"]["prefix"]
        threads: 1
        shell: "Rscript R_src/prepareSTREAManalysisCL.R -i {input.seurat} -o output/RNA/prepareSTREAM/ -p {params.prefix}"
    
    rule run_STREAM_analysis:
        input: matrix = "output/RNA/streamFiltering/All_cells_scaleDataForStream.tsv"
        output: "output/RNA/streamAnalysis/stream_metadata.csv"
        conda: streamAnalysis_env
        params: cells = config["STREAM_analysis"]["cells_clusters"],
                colors = config["STREAM_analysis"]["colors_clusters"],
                epg_alpha = config["STREAM_analysis"]["epg_alpha"],
                epg_mu = config["STREAM_analysis"]["epg_mu"],
                epg_lambda = config["STREAM_analysis"]["epg_lambda"]
        threads: 1
        shell: "python py_src/streamPseudotime.py -i {input.matrix} -o output/RNA/streamAnalysis/ \
        -c {params.cells} -v {params.colors} -a {params.epg_alpha} -m {params.epg_mu} -l {params.epg_lambda}"
    
    rule seurat_STREAM:
        input: stream = "output/RNA/streamAnalysis/stream_metadata.csv",
               seurat = "output/RNA/Seurat4_integration/Analysis/seurat_annotated.rds"
        output: "output/RNA/streamAnalysisSeurat/seuratPseudotime.rds"
        conda: signacSeurat4_env
        params: metaData = config["seurat_STREAM"]["metaData"],
                refPseudotime = config["seurat_STREAM"]["refPseudotime"],
                treeShape = config["seurat_STREAM"]["treeShape"]
        threads: 1
        shell: "Rscript R_src/streamSeuratCL.R -i {input.seurat} -s {input.stream} -o output/RNA/streamAnalysisSeurat/ \
        -m {params.metaData} -p {params.refPseudotime} -t {params.treeShape}"
    
    rule prepare_nestorowa_data:
      input: "input/Nestorowa/GSE81682_HTSeq_counts.txt","input/Nestorowa/all_cell_types.txt"
      output: "output/Nestorowa/seuratForActinn.rds"
      conda: signacSeurat4_env
      shell: "Rscript R_src/prepareNestorowaCL.R -i {input[0]} -m {input[1]} -o output/Nestorowa"
    ####################### Scenic part of the workflow ####################################
    
    rule get_databases_for_scenic:
        output: feather="publicData/database/mm9-tss-centered-10kb-7species.mc9nr.feather",
                tbl="publicData/database/motifs-v9-nr.mgi-m0.001-o0.0.tbl"
        params: feather=config["scenic"]["cis_target_url"],
                tbl=config["scenic"]["motif_url"],
                sums=config["scenic"]["sums"],
                feather_name=feather_database
        shell: """cd publicData/database; \
        wget {params.feather}; \
        wget {params.tbl}; \
        wget {params.sums}; \
        awk -v feather_database={params.feather_name} '$2 == feather_database' sha256sum.txt | sha256sum -c -"""
    
    rule prepare_tf_list:
         input: "publicData/database/motifs-v9-nr.mgi-m0.001-o0.0.tbl"
         output: "output/publicData/mm_mgi_tfs.txt"
         conda: pyscenic_env
         shell: "python py_src/getTfList.py -i {input} -o {output}"
    
    rule filter_genes_for_scenic_all:
        input: "output/RNA/Seurat4_integration/combined.rds",
               "output/publicData/mm_mgi_tfs.txt"
        output: "output/RNA/ScenicAll/expressionRawCountFilteredForScenic.tsv",
        conda: signacSeurat4_env
        shell: "Rscript R_src/filterSeuratForScenicCL.R -i {input[0]} -t {input[1]} -o output/RNA/ScenicAll/"
    
    rule split_by_condition:
        input: "output/RNA/ScenicAll/expressionRawCountFilteredForScenic.tsv",
        output: "output/RNA/ScenicRNA_multipleRuns_{smp}/expressionRawCountFilteredForScenic.tsv"
        conda: pyscenic_env
        shell:
            "head -1 {input} > {output}; cat {input} | grep '^{wildcards.smp}' >> {output}"
    
    rule multiple_grnboost_rna:
        input: "output/RNA/ScenicAll/expressionRawCountFilteredForScenic.tsv",
                "output/publicData/mm_mgi_tfs.txt"
        output: "output/RNA/ScenicRNA_multipleRuns/GRNboost/run{run}_GRNboost.tsv"
        threads: 20
        conda: pyscenic_env
        shell:
            "pyscenic grn --seed {wildcards.run} --num_workers {threads} {input[0]} {input[1]} -o {output}"
    
    rule multiple_grnboost_rna_by_condition:
        input: "output/RNA/ScenicRNA_multipleRuns_{smp}/expressionRawCountFilteredForScenic.tsv",
               "output/publicData/mm_mgi_tfs.txt"
        output: "output/RNA/ScenicRNA_multipleRuns_{smp}/GRNboost/run{run}_GRNboost.tsv"
        threads: 20
        conda: pyscenic_env
        shell:
            "pyscenic grn --seed {wildcards.run} --num_workers {threads} {input[0]} {input[1]} -o {output}"
    
    rule multiple_prune_modules_rna_maskDropouts_with_cis_target_csv:
        input:
          exp="output/RNA/ScenicAll/expressionRawCountFilteredForScenic.tsv",
          GRN_boost= "output/RNA/ScenicRNA_multipleRuns/GRNboost/run{run}_GRNboost.tsv",
          TF_motif="publicData/database/motifs-v9-nr.mgi-m0.001-o0.0.tbl",
          cis_motif="publicData/database/mm9-tss-centered-10kb-7species.mc9nr.feather"
        output:"output/RNA/ScenicRNA_multipleRuns/cis_target_maskDropouts/run{run}_regulons.csv"
        threads:10
        conda:pyscenic_env
        shell:
            "pyscenic ctx --annotations_fname {input.TF_motif} -a --mask_dropouts\
     --num_workers {threads} \
     --expression_mtx_fname {input.exp} -o {output} \
     {input.GRN_boost} {input.cis_motif}"
    
    rule multiple_prune_modules_rna_maskDropouts_with_cis_target_csv_by_condition:
        input:
          exp="output/RNA/ScenicRNA_multipleRuns_{smp}/expressionRawCountFilteredForScenic.tsv",
          GRN_boost="output/RNA/ScenicRNA_multipleRuns_{smp}/GRNboost/run{run}_GRNboost.tsv",
          TF_motif="publicData/database/motifs-v9-nr.mgi-m0.001-o0.0.tbl",
          cis_motif="publicData/database/mm9-tss-centered-10kb-7species.mc9nr.feather"
        output:"output/RNA/ScenicRNA_multipleRuns_{smp}/cis_target_maskDropouts/run{run}_regulons.csv"
        threads:12
        conda:pyscenic_env
        shell:
            "pyscenic ctx --annotations_fname {input.TF_motif} -a --mask_dropouts\
     --num_workers {threads} \
     --expression_mtx_fname {input.exp} -o {output} \
     {input.GRN_boost} {input.cis_motif}"
    
    rule multiple_motifEnriched2regulons_rna_maskDropouts:
        input: "output/RNA/ScenicRNA_multipleRuns/cis_target_maskDropouts/run{run}_regulons.csv"
        output: "output/RNA/ScenicRNA_multipleRuns/cis_target_maskDropouts/run{run}_regulons.json"
        conda:pyscenic_env
        shell: "python py_src/motif2regulon.py -i {input} -o {output}"
    
    rule multiple_motifEnriched2regulons_rna_maskDropouts_by_condition:
        input: "output/RNA/ScenicRNA_multipleRuns_{smp}/cis_target_maskDropouts/run{run}_regulons.csv"
        output: "output/RNA/ScenicRNA_multipleRuns_{smp}/cis_target_maskDropouts/run{run}_regulons.json"
        conda:pyscenic_env
        shell: "python py_src/motif2regulon.py -i {input} -o {output}"
    
    rule merging_multiple_runs:
        input: files = expand("output/RNA/ScenicRNA_multipleRuns/cis_target_maskDropouts/run{run}_regulons.json",run = scenicRuns),
               adjacencies =  expand("output/RNA/ScenicRNA_multipleRuns/GRNboost/run{run}_GRNboost.tsv",run = scenicRuns)
        output: "output/RNA/ScenicRNA_multipleRuns/cis_target_maskDropouts/aggregatedRegulons.json",
                "output/RNA/ScenicRNA_multipleRuns/cis_target_maskDropouts/aggregatedRegulonsMeta.json",
                "output/RNA/ScenicRNA_multipleRuns/cis_target_maskDropouts/adja.sqlite"
        shell: "python py_src/aggregateMultiRun.py -r output/RNA/ScenicRNA_multipleRuns/cis_target_maskDropouts/ -c 0.8 -a output/RNA/ScenicRNA_multipleRuns/GRNboost/ -o output/RNA/ScenicRNA_multipleRuns/cis_target_maskDropouts/"
    
    rule merging_multiple_runs_by_condition:
        input: files = expand("output/RNA/ScenicRNA_multipleRuns_{smp}/cis_target_maskDropouts/run{run}_regulons.json",run = scenicRuns,smp =samples),
               adjacencies =  expand("output/RNA/ScenicRNA_multipleRuns_{smp}/GRNboost/run{run}_GRNboost.tsv",run = scenicRuns,smp =samples),
        output: "output/RNA/ScenicRNA_multipleRuns_{smp}/cis_target_maskDropouts/aggregatedRegulons.json",
                "output/RNA/ScenicRNA_multipleRuns_{smp}/cis_target_maskDropouts/aggregatedRegulonsMeta.json",
                "output/RNA/ScenicRNA_multipleRuns_{smp}/cis_target_maskDropouts/adja.sqlite"
        shell: "python py_src/aggregateMultiRun.py -r output/RNA/ScenicRNA_multipleRuns_{wildcards.smp}/cis_target_maskDropouts/\
        -a output/RNA/ScenicRNA_multipleRuns_{wildcards.smp}/GRNboost/ -c 0.8 \
        -o output/RNA/ScenicRNA_multipleRuns_{wildcards.smp}/cis_target_maskDropouts/"
    
    rule multiple_aucell_TF_seurat_regulon_rna_maskDropouts:
        input:"output/RNA/ScenicAll/expressionRawCountFilteredForScenic.tsv",
              "output/RNA/ScenicRNA_multipleRuns/cis_target_maskDropouts/aggregatedRegulons.json"
        output:"output/RNA/ScenicRNA_multipleRuns/AUCell_maskDropouts/regulons_enrichment.csv"
    
        threads:12
        conda:pyscenic_env
        shell:
            "python3.6 py_src/aucellJson.py -c {threads} -s 2020 -e {input[0]} -r {input[1]} -o output/RNA/ScenicRNA_multipleRuns/AUCell_maskDropouts/"
    
    rule creating_regulonTable:
        input: main = "output/RNA/ScenicRNA_multipleRuns/cis_target_maskDropouts/aggregatedRegulonsMeta.json",
               supp = expand("output/RNA/ScenicRNA_multipleRuns_{smp}/cis_target_maskDropouts/aggregatedRegulonsMeta.json",smp=samples)
        output: "output/RNA/regulonAnalysis/mainRegulonTable.tsv"
        params: condName = "RA+CT",
                tf_database = "output/publicData/mm_mgi_tfs.txt",
                supp = "+".join(expand("output/RNA/ScenicRNA_multipleRuns_{smp}/cis_target_maskDropouts/aggregatedRegulonsMeta.json",smp=samples))
        conda: seurat_monocle_env
        shell:
            "Rscript R_src/makeRegulonTableCL.R -o output/RNA/regulonAnalysis/ -t {params.tf_database} -r {input.main} -s {params.supp} -n {params.condName}"
    
    rule analyseRegulonScore:
        input: seurat =  "report/seurat_promyelocytes.rds",
               tfList = "input/selectedTF.txt",
               regulonScore =  "output/RNA/ScenicRNA_multipleRuns/AUCell_maskDropouts/regulons_enrichment.csv"
        output: "output/RNA/regulonAnalysis/clusterMarkerRegulonTable.tsv"
        params: scoreDiff = config["regulonAnalysis"]["scoreDiff"]
        conda: seurat_monocle_env
        shell:
          "Rscript R_src/regulonMarkerCL.R -i {input.seurat} -t {input.tfList} \
          -r {input.regulonScore} -o output/RNA/regulonAnalysis/ -c -a condition"
    
    ####################### ATAC seq part of the workflow ####################################
    
    rule process_atac_smp:
      input:"input/RAW_atac/{smpAtac}/filtered_peak_bc_matrix.h5",
            "input/RAW_atac/{smpAtac}/singlecell.csv",
            "input/RAW_atac/{smpAtac}/fragments.tsv.gz",
            "output/signacSeurat4_installed"
      output:"output/ATAC/smp/{smpAtac}/atac.rds"
      params: age = lambda wildcards : config[wildcards.smpAtac]["age"],
              runDate = lambda wildcards : config[wildcards.smpAtac]["date"],
              condition = lambda wildcards : config[wildcards.smpAtac]["condition"]
      threads:12
      conda: signacSeurat4_env
      shell: "Rscript R_src/signacSmpCL.R -i input/RAW_atac/{wildcards.smpAtac}/ -o output/ATAC/smp/{wildcards.smpAtac}/ \
      -u 2 -l 2 -t 2 -c 2 -r 30 -b 0.05 \
      -s age={params.age},runDate={params.runDate},sampleName={wildcards.smpAtac},condition={params.condition}"
    
    rule integrate_atac:
      input: expand("output/ATAC/smp/{smpAtac}/atac.rds",smpAtac = SMPATAC),
             "output/signacSeurat4_installed"
      output:"output/ATAC/integration/atacCombined.rds"
      params: smpList = config["signac_integration"]["smp_list"],
              minCutOff = config["signac_integration"]["minCutOff"],
              subsetInterPeaks = config["signac_integration"]["subsetInterPeaks"]
      conda: signacSeurat4_env
      shell: "Rscript R_src/signac_integration.R -i {params.smpList} -o output/ATAC/integration/ -m {params.minCutOff} -s {params.subsetInterPeaks}"
    
    rule transfer_rna_atac:
      input: "output/RNA/Seurat4_integration/combined.rds",
             expand("output/ATAC/smp/{smpAtac}/atac.rds",smpAtac = SMPATAC),
             "output/signacSeurat4_installed_v2"
      output:"output/TransferRnaAtac/coembedAll.rds"
      conda: signacSeurat4_env
      shell: "Rscript R_src/transferRnaAtacCL.R -i {input[0]} -a {input[1]} -b {input[2]} -d -N 50 -r integrated_snn_res.0.3 -o output/TransferRnaAtac/"
    
    rule install_signac_motif_packages:
      conda: signacSeurat4_env
      output: "output/motifsEnv_installed"
      shell: "Rscript R_src/installMotifsEnv.R;touch {output}"
    
    rule signac_motif_integrated:
      input: "output/TransferRnaAtac/atac_integrated.rds"
      output: "output/TransferRnaAtac/SignacMotif/atac_motif.rds"
      params: clustName = config["signac_motif_integrated"]["cluster_name"],
              clustLevel = config["signac_motif_integrated"]["cluster_level"],
              clustCol = config["signac_motif_integrated"]["cluster_col"]
      conda: signacSeurat4_env
      shell: "Rscript R_src/SignacMotifAnnotCL.R -i {input} -o output/TransferRnaAtac/SignacMotif -s 9606 -a peaks -c {params.clustCol} -n {params.clustName} -l {params.clustLevel} -I TRUE"
    
    rule seurat_signac_annotAUC_enhancer_promoter:
      input: "output/TransferRnaAtac/SignacMotif_2/atac_motif.rds",
             "output/RNA/Seurat4_integration/Report_20210412/annotated_update.rds",
             "output/RNA/ScenicRNA_multipleRuns/AUCell_maskDropouts/regulons_enrichment.csv"
      output: "output/RNA/regulonAnalysis/seuratAUC.rds"
      conda: signacSeurat4_env
      shell: "Rscript R_src/prepareAUCseuratEnhancePromoseuratCL.R -i {input[1]} -a {input[0]} -r {input[2]} -o output/"
    afbb2f54
Snakefile_for_paper.py 19.3 KB