;; PiGx RNAseq Pipeline.
;; 
;; Copyright © 2017, 2018 Bora Uyar <bora.uyar@mdc-berlin.de>
;; Copyright © 2017, 2018 Jonathan Ronen <yablee@gmail.com>
;; Copyright © 2017-2021 Ricardo Wurmus <ricardo.wurmus@mdc-berlin.de>
;; 
;; This file is part of the PiGx RNAseq Pipeline.
;; 
;; This program is free software: you can redistribute it and/or modify
;; it under the terms of the GNU General Public License as published by
;; the Free Software Foundation, either version 3 of the License, or
;; (at your option) any later version.
;; 
;; This program is distributed in the hope that it will be useful,
;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;; GNU General Public License for more details.
;; 
;; You should have received a copy of the GNU General Public License
;; along with this program.  If not, see <http://www.gnu.org/licenses/>.

;; TODO: validate configuration file!
;; include: string-append (config['locations']['pkglibexecdir'], 'scripts/validate_input.py')
;; validate_config(config)


;;; Locations

;; Annotation (GTF) file; used to build the STAR index and for counting.
define GTF_FILE
  get config "locations" "gtf-file"

;; Sample sheet describing all samples (parsed below into SAMPLE_SHEET).
define SAMPLE_SHEET_FILE
  get config "locations" "sample-sheet"

;; Genome sequence; input to the STAR index.
define GENOME_FASTA
  get config "locations" "genome-fasta"

;; Transcriptome (cDNA) sequence; input to the Salmon index.
define CDNA_FASTA
  get config "locations" "cdna-fasta"

;; Directory holding the raw sequencing reads.
define READS_DIR
  get config "locations" "reads-dir"

;; Directory into which all pipeline results are written.
define OUTPUT_DIR
  get config "locations" "output-dir"

;; Path of the PiGx logo passed to the DESeq report generators.  When
;; PIGX_UNINSTALLED is set (running from the source tree) the logo
;; lives in an "images" sub-directory of pkgdatadir; when installed it
;; sits directly in pkgdatadir.
define LOGO
  if : getenv "PIGX_UNINSTALLED"
    file
      get config "locations" "pkgdatadir"
      . / "images" / "Logo_PiGx.png"
    file
      get config "locations" "pkgdatadir"
      . / "Logo_PiGx.png"

;; Directory containing the pipeline's helper R scripts.
define SCRIPTS_DIR
  file
    get config "locations" "pkglibexecdir"
    . / "scripts"

;; Sub-directories of OUTPUT_DIR for the various result categories.
define TRIMMED_READS_DIR
  file OUTPUT_DIR / "trimmed_reads"

define LOG_DIR
  file OUTPUT_DIR / "logs"

define FASTQC_DIR
  file OUTPUT_DIR / "fastqc"

define MULTIQC_DIR
  file OUTPUT_DIR / "multiqc"

define MAPPED_READS_DIR
  file OUTPUT_DIR / "mapped_reads"

define BIGWIG_DIR
  file OUTPUT_DIR / "bigwig_files"

define COUNTS_DIR
  file OUTPUT_DIR / "feature_counts"

define SALMON_DIR
  file OUTPUT_DIR / "salmon_output"


;;; Tools

;; Extra command line arguments configured for tool NAME, or the empty
;; string when tools/NAME/args is absent (false-if-exception swallows
;; the lookup error in that case).
define : toolArgs name
  or
    false-if-exception
      get config "tools" name "args"
    ""

;; Full command line for tool NAME: the configured executable followed
;; by its configured extra arguments (a trailing space when there are
;; no arguments; harmless in the shell snippets below).
define : tool name
  string-append
    get config "tools" name "executable"
    " "
    toolArgs name

;; Command lines (executable plus configured arguments) for all
;; external tools invoked by the shell snippets of the processes
;; below.
define FASTQC_EXEC
  tool "fastqc"
define MULTIQC_EXEC
  tool "multiqc"
define STAR_EXEC_MAP
  tool "star_map"
define STAR_EXEC_INDEX
  tool "star_index"
define SALMON_INDEX_EXEC
  tool "salmon_index"
define SALMON_QUANT_EXEC
  tool "salmon_quant"
define TRIM_GALORE_EXEC
  tool "trim-galore"
define SAMTOOLS_EXEC
  tool "samtools"
define HTSEQ_COUNT_EXEC
  tool "htseq-count"
define GUNZIP_EXEC
  tool "gunzip"
define RSCRIPT_EXEC
  tool "Rscript"
define SED_EXEC
  tool "sed"


;;; Configurations

;; Per-rule thread counts from the "execution" section of the
;; configuration; forwarded to the respective tools below.
define STAR_INDEX_THREADS
  get config "execution" "rules" "star_index" "threads"

define SALMON_INDEX_THREADS
  get config "execution" "rules" "salmon_index" "threads"

define STAR_MAP_THREADS
  get config "execution" "rules" "star_map" "threads"

define SALMON_QUANT_THREADS
  get config "execution" "rules" "salmon_quant" "threads"

;; Organism name; forwarded to the DESeq report generators.
define ORGANISM
  get config "organism"

;; Differential-expression analyses requested in the configuration;
;; the empty list when the "DEanalyses" key is absent.
define DE_ANALYSIS_LIST
  or
    false-if-exception
      get config "DEanalyses"
    list

define : parse-sample-sheet file
  "Parse FILE as a sample sheet and return a list (rows) of
lists (columns)."
  import : dsv
  ;; Translate low-level parser errors into one comprehensible error
  ;; message for the user.
  catch 'dsv-parser-error
    lambda _
      with-input-from-file file
        lambda _
          dsv->scm format: 'rfc4180
    lambda _
      error "Failed to parse the sample sheet."

;; This is a list of alists.  Each list item corresponds to a row.  A
;; row is an alist of column names to column values.
define SAMPLE_SHEET
  ;; All rows of the sheet, header included.
  define rows
    parse-sample-sheet SAMPLE_SHEET_FILE
  define header
    first rows
  ;; The data rows without the header line.  (Renamed from a second
  ;; "define rows", which created a duplicate definition in the same
  ;; body.)
  define data-rows
    drop rows 1
  map
    lambda : row
      map cons header row
    . data-rows

;; The names of all samples, taken from the "name" column of the
;; sample sheet.
define SAMPLES
  on SAMPLE_SHEET map
    lambda : row
      get row "name"

define : lookup column predicate . fields
  "Convenience function to access fields of sample sheet columns that
match the predicate.  The predicate may be a string."
  define predicate*
    if : procedure? predicate
      . predicate
      ;; The standard equality predicate is "equal?"; the previous
      ;; "equals?" is unbound and would error on first use.
      cut equal? predicate <>
  ;; Rows of the sample sheet whose COLUMN satisfies the predicate.
  ;; Note: assoc-ref takes the alist first, then the key (cf. the
  ;; correct usage further below).
  define records
    filter
      lambda : line
        predicate*
          assoc-ref line column
      . SAMPLE_SHEET
  ;; Return only the selected fields
  map
    lambda : record
      map
        lambda : column-name
          assoc-ref record column-name
        . fields
    . records

define : reads-for-sample sample
  "Return the read file name(s) for SAMPLE with empty entries removed."
  ;; NOTE(review): "lookup" returns a list of field lists (one per
  ;; matching row); if a flat list of strings is expected here the
  ;; result may need flattening -- verify against GWL semantics.
  remove string-null?
    lookup "name" sample "reads" "reads2"

define : single-end? sample
  "Return #true if the SAMPLE library is single ended."
  equal? 1
    length : reads-for-sample sample

define : trimmed-reads-for-sample sample
  "Return the trimmed read file(s) for SAMPLE: \"_R\" for single-end
libraries, \"_R1\"/\"_R2\" for paired-end libraries."
  if : single-end? sample
    files TRIMMED_READS_DIR / sample "_R.fastq.gz"
    files TRIMMED_READS_DIR / sample "_R" (list "1" "2") ".fastq.gz"

define : trim-galore-input sample
  "Return the raw read file(s) in READS_DIR for SAMPLE."
  files READS_DIR /
    reads-for-sample sample


;;; Processes

;; Convert the sample sheet into the colData.tsv table consumed by the
;; counting and report processes.
process translate_sample_sheet_for_report
  inputs
    . sample-sheet: SAMPLE_SHEET_FILE
    . script:
    file SCRIPTS_DIR / "translate_sample_sheet_for_report.R"
  outputs
    ;; The R script writes colData.tsv into the current working
    ;; directory.  The file name must not carry a leading slash: the
    ;; "/" separator already joins it to (cwd).
    file (cwd) / "colData.tsv"
  packages
    . "r-minimal"
  # {
    {{RSCRIPT_EXEC}} {{inputs:script}} {{inputs:sample-sheet}}
  }

;; Adapter/quality trimming for paired-end samples with Trim Galore.
process trim_galore_pe (with sample)
  inputs
    trim-galore-input sample
  outputs
    . r1:
    file TRIMMED_READS_DIR / sample "_R1.fastq.gz"
    . r2:
    file TRIMMED_READS_DIR / sample "_R2.fastq.gz"
    . log:
    file LOG_DIR / "trim_galore_" sample ".log"
  packages
    . "trim-galore"
  procedure
    ;; Original file names of the two mates as listed in the sample
    ;; sheet; Trim Galore derives its "_val_{1,2}.fq.gz" output names
    ;; from these, which we then rename to the declared outputs.
    define read1
      first
        lookup "name" sample "reads"
    define read2
      first
        lookup "name" sample "reads2"
    ;; TODO: use basename instead?  Or is the extension not always the same?
    ;; For example ".fastq.gz" and ".fq.gz"?
    ;; Drop the last two dot-separated components (e.g. ".fastq.gz").
    define : without-extension file-name
      string-join
        drop-right
          string-split file-name #\.
          . 2
        . "."
    define tmp1
      file TRIMMED_READS_DIR /
        without-extension read1
        . "_val_1.fq.gz"
    define tmp2
      file TRIMMED_READS_DIR /
        without-extension read2
        . "_val_2.fq.gz"
    # {
      {{TRIM_GALORE_EXEC}} -o {{TRIMMED_READS_DIR}} \
        --paired {{inputs}} >> {{outputs:log}} 2>&1 && \
        sleep 10 && \
        mv {{tmp1}} {{outputs:r1}} && \
        mv {{tmp2}} {{outputs:r2}}
    }

;; Adapter/quality trimming for single-end samples with Trim Galore.
process trim_galore_se (with sample)
  inputs
    trim-galore-input sample
  outputs
    . trimmed:
    file TRIMMED_READS_DIR / sample "_R.fastq.gz"
    . log:
    file LOG_DIR / "trim_galore_" sample ".log"
  packages
    . "trim-galore"
  procedure
    define input1
      pick first inputs
    ;; Name of the raw read file as listed in the sample sheet.
    ;; (Previously bound to "file", which shadowed the "file"
    ;; constructor needed for "tmp" below.)
    define read1
      first
        lookup "name" sample "reads"
    ;; TODO: use basename instead?  Or is the extension not always the same?
    ;; For example ".fastq.gz" and ".fq.gz"?
    ;; Drop the last two dot-separated components (e.g. ".fastq.gz").
    ;; The continuation lines need the "." prefix, as in the
    ;; paired-end process above, or wisp parses them as calls.
    define without-extension
      string-join
        drop-right
          string-split read1 #\.
          . 2
        . "."
    ;; Name Trim Galore gives the trimmed file before we rename it to
    ;; the declared output.
    define tmp
      file TRIMMED_READS_DIR / without-extension "_trimmed.fq.gz"
    # {
      {{TRIM_GALORE_EXEC}} -o {{TRIMMED_READS_DIR}} \
        {{input1}} >> {{outputs:log}} 2>&1 && \
        sleep 10 && \
        mv {{tmp}} {{outputs:trimmed}}
    }

;; Build the STAR genome index from the genome FASTA and the GTF
;; annotation.
process star_index
  inputs
    . gtf: GTF_FILE
    . genome: GENOME_FASTA
  outputs
    ;; SAindex is the sentinel file inside the index directory.
    . star_index_file:
    file OUTPUT_DIR / "star_index" / "SAindex"
    . star_index_dir:
    file OUTPUT_DIR / "star_index"
    . log:
    file LOG_DIR / "star_index.log"
  packages
    . "star"
  # {
    {{STAR_EXEC_INDEX}} \
      --runMode genomeGenerate \
      --runThreadN {{STAR_INDEX_THREADS}} \
      --genomeDir {{outputs:star_index_dir}} \
      --genomeFastaFiles {{inputs:genome}} \
      --sjdbGTFfile {{inputs:gtf}} >> {{outputs:log}} 2>&1
  }

;; Map the trimmed reads of SAMPLE against the STAR index, producing
;; an unsorted BAM file.
process star_map (with sample)
  inputs
    ;; This process really depends on the whole directory (see
    ;; params.index_dir), but we can't register it as an input/output
    ;; in its own right since Snakemake 5.
    . index_file:
    pick star_index_file:
      process-outputs star_index
    . index_dir:
    pick star_index_dir:
      process-outputs star_index
    . reads:
    trimmed-reads-for-sample sample
  outputs
    file MAPPED_READS_DIR / sample "_Aligned.out.bam"
    . log:
    file LOG_DIR / "star_map_" sample ".log"
  packages
    . "star"
    . "gzip"
  procedure
    ;; STAR derives all of its output file names from this prefix.
    define output_prefix
      file MAPPED_READS_DIR / sample "_"
    ;; Fixed template typo: "{{input::reads}}" -> "{{inputs:reads}}".
    # {
      {{STAR_EXEC_MAP}}                         \
        --runThreadN {{STAR_MAP_THREADS}}       \
        --genomeDir {{inputs:index_dir}}        \
        --readFilesIn {{inputs:reads}}          \
        --readFilesCommand '{{GUNZIP_EXEC}} -c' \
        --outSAMtype BAM Unsorted               \
        --outFileNamePrefix {{output_prefix}} >> {{outputs:log}} 2>&1
    }

;; Coordinate-sort the STAR alignment of SAMPLE with "samtools sort".
process sort_bam (with sample)
  inputs
    file MAPPED_READS_DIR / sample "_Aligned.out.bam"
  outputs
    . bam:
    file MAPPED_READS_DIR / sample "_Aligned.sortedByCoord.out.bam"
    . log:
    file LOG_DIR / "samtools_sort_" sample ".log"
  packages
    . "samtools"
  # {
    {{SAMTOOLS_EXEC}} sort -o {{outputs:bam}} {{inputs}} >> {{outputs:log}} 2>&1
  }

;; Create the BAM index (.bai) for the sorted alignment of SAMPLE.
process index_bam (with sample)
  inputs
    file MAPPED_READS_DIR / sample "_Aligned.sortedByCoord.out.bam"
  outputs
    . bai:
    file MAPPED_READS_DIR / sample "_Aligned.sortedByCoord.out.bam.bai"
    . log:
    file LOG_DIR / "samtools_index_" sample ".log"
  packages
    . "samtools"
  # {
    {{SAMTOOLS_EXEC}} index {{inputs}} {{outputs:bai}} >> {{outputs:log}} 2>&1
  }

;; Post-mapping quality control of the sorted BAM with FastQC.
process fastqc (with sample)
  inputs
    file MAPPED_READS_DIR / sample "_Aligned.sortedByCoord.out.bam"
  outputs
    file FASTQC_DIR / sample "_Aligned.sortedByCoord.out_fastqc.zip"
    . log:
    file LOG_DIR / "fastqc_" sample ".log"
  packages
    . "fastqc"
  # {
    {{FASTQC_EXEC}} -o {{FASTQC_DIR}} -f bam {{inputs}} >> {{outputs:log}} 2>&1
  }

;; Build the Salmon transcriptome index from the cDNA FASTA.
process salmon_index
  inputs
    . CDNA_FASTA
  outputs
    . salmon_index_dir:
    file OUTPUT_DIR / "salmon_index"
    ;; sa.bin is the sentinel file inside the index directory.
    . salmon_index_file:
    file OUTPUT_DIR / "salmon_index" / "sa.bin"
    . log:
    file LOG_DIR / "salmon_index.log"
  packages
    . "salmon"
  # {
    {{SALMON_INDEX_EXEC}} -t {{inputs}} \
      -i {{outputs:salmon_index_dir}}   \
      -p {{SALMON_INDEX_THREADS}} >> {{outputs:log}} 2>&1
  }

;; This process really depends on the whole directory, not just the index_file (see
;; params.index_dir), but we can't register it as an input/output
;; in its own right since Snakemake 5.
;;
;; Quantify transcript and gene abundances for SAMPLE with Salmon.
process salmon_quant (with sample)
  inputs
    . gtf: GTF_FILE
    . index_file:
    ;; Fixed: "process-output" -> "process-outputs" (as used
    ;; everywhere else in this file).
    pick salmon_index_file:
      process-outputs salmon_index
    . index_dir:
    pick salmon_index_dir:
      process-outputs salmon_index ; TODO: this is actually "params" instead of "outputs"
    . reads:
    trimmed-reads-for-sample sample
  outputs
    file SALMON_DIR / sample / "quant.sf"
    file SALMON_DIR / sample / "quant.genes.sf"
    . outfolder:
    file SALMON_DIR / sample
    . log:
    file LOG_DIR / "salmon_quant_" sample ".log"
  packages
    . "salmon"
  procedure
    define reads
      pick * reads: inputs
    ;; One trimmed read file means a single-end library.
    define single?
      equal? 1
        length reads
    ;; Salmon takes "-r" for single-end reads, "-1 ... -2 ..." for
    ;; paired-end reads.
    define arguments
      if single?
        string-append "-r " (first reads) " "
        string-append "-1 " (first reads) " -2 " (second reads) " "
    # {
      {{SALMON_QUANT_EXEC}}              \
        -i {{inputs:index_dir}}          \
        -l A -p {{SALMON_QUANT_THREADS}} \
        {{arguments}}                    \
        -o {{outputs:outfolder}}         \
        --seqBias --gcBias               \
        -g {{inputs:gtf}} >> {{outputs:log}} 2>&1
    }

;; Collate the per-sample Salmon quantifications into raw and
;; TPM-normalized count matrices (transcript and gene level).
process counts_from_SALMON
  inputs
    . quantFiles:
    files SALMON_DIR / SAMPLES / "quant.sf"
    . quantGenesFiles:
    files SALMON_DIR / SAMPLES / "quant.genes.sf"
    ;; The two tags below were missing their trailing colon, so the
    ;; "{{inputs:colDataFile}}" reference in the snippet could not
    ;; resolve.
    . colDataFile:
    pick first
      process-outputs translate_sample_sheet_for_report
    . script:
    file SCRIPTS_DIR / "counts_matrix_from_SALMON.R"
  outputs
    define types
      list "transcripts" "genes"
    files COUNTS_DIR / "raw_counts" / "counts_from_SALMON." types ".tsv"
    files COUNTS_DIR / "normalized" / "TPM_counts_from_SALMON." types ".tsv"
    . log:
    file LOG_DIR / "salmon_import_counts.log"
  packages
    . "r-minimal"
  # {
    {{RSCRIPT_EXEC}} {{inputs:script}} \
      {{SALMON_DIR}} {{COUNTS_DIR}} {{inputs:colDataFile}} >> {{outputs:log}} 2>&1
  }

;; Export size-factor-normalized genome coverage tracks (forward and
;; reverse strand bigwig files) for SAMPLE.
process genomeCoverage (with sample)
  inputs
    . size_factors_file:
    file COUNTS_DIR / "normalized" / "deseq_size_factors.txt"
    . bam:
    file MAPPED_READS_DIR / sample "_Aligned.sortedByCoord.out.bam"
    . bai:
    file MAPPED_READS_DIR / sample "_Aligned.sortedByCoord.out.bam.bai"
    . script:
    file SCRIPTS_DIR / "export_bigwig.R"
  outputs
    files BIGWIG_DIR / sample / (list ".forward" ".reverse") ".bigwig"
    . log:
    file LOG_DIR / "genomeCoverage_" sample ".log"
  packages
    . "r-minimal"
  ;; Reference the tagged input explicitly; a bare "{{script}}" has no
  ;; binding in this scope.
  # {
    {{RSCRIPT_EXEC}} {{inputs:script}} \
      {{inputs:bam}} {{sample}} {{inputs:size_factors_file}} {{BIGWIG_DIR}} >> {{outputs:log}} 2>&1
  }

;; Aggregate the STAR, Salmon and FastQC outputs into one MultiQC
;; report by scanning the whole output directory.
process multiqc
  inputs
    . work-directory:
    file OUTPUT_DIR
    ;; The remaining inputs only establish ordering: MultiQC itself
    ;; scans the work directory.
    . salmon_output:
    files SALMON_DIR / SAMPLES / "quant.sf"
    . star_output:
    files MAPPED_READS_DIR / SAMPLES "_Aligned.sortedByCoord.out.bam"
    . fastqc_output:
    files FASTQC_DIR / SAMPLES "_Aligned.sortedByCoord.out_fastqc.zip"
  outputs
    file MULTIQC_DIR / "multiqc_report.html"
    . log:
    file LOG_DIR / "multiqc.log"
  packages
    . "multiqc"
  ;; TODO: what is OUTPUT_DIR?  Why isn't this using the declared inputs?
  # {
    {{MULTIQC_EXEC}} -o {{MULTIQC_DIR}} {{inputs:work-directory}} >> {{outputs:log}} 2>&1
  }
    
;; Count reads per feature for SAMPLE from the sorted BAM, driven by
;; the "counting" section of the configuration.
process count_reads (with sample)
  inputs
    . gtf: GTF_FILE
    . bam:
    file MAPPED_READS_DIR / sample "_Aligned.sortedByCoord.out.bam"
    . bai:
    file MAPPED_READS_DIR / sample "_Aligned.sortedByCoord.out.bam.bai"
    . script:
    file SCRIPTS_DIR / "count_reads.R"
  outputs
    file MAPPED_READS_DIR / sample ".read_counts.csv"
    . log:
    file LOG_DIR / sample ".count_reads.log"
  ;; Added the package declaration, consistent with the other
  ;; Rscript-based processes.
  packages
    . "r-minimal"
  ;; "params:" was a leftover from the Snakemake original; local
  ;; definitions plus the code snippet belong in "procedure", as in
  ;; every other process in this file.
  procedure
    define single_end
      single-end? sample
    define mode
      get config "counting" "counting_mode"
    define nonunique
      get config "counting" "count_nonunique"
    define strandedness
      get config "counting" "strandedness"
    define feature
      get config "counting" "feature"
    define group_by
      get config "counting" "group_feature_by"
    define yield_size
      get config "counting" "yield_size"
    # {
      {{RSCRIPT_EXEC}} {{inputs:script}} \
        {{sample}} {{inputs:bam}} {{inputs:gtf}} \
        {{single_end}} {{mode}} {{nonunique}} {{strandedness}} \
        {{feature}} {{group_by}} {{yield_size}} >> {{outputs:log}} 2>&1
    }

;; Merge the per-sample read count CSVs into one counts matrix.
process collate_read_counts
  inputs
    . counts:
    files MAPPED_READS_DIR / SAMPLES ".read_counts.csv"
    . script:
    file SCRIPTS_DIR / "collate_read_counts.R"
  outputs
    ;; Tagged so the snippet can refer to the table alone; a bare
    ;; "{{outputs}}" would also expand to the log file.
    . table:
    file COUNTS_DIR / "raw_counts" / "counts_from_star.tsv"
    . log:
    file LOG_DIR / "collate_read_counts.log"
  packages
    . "r-minimal"
  # {
    {{RSCRIPT_EXEC}} {{inputs:script}} {{MAPPED_READS_DIR}} {{outputs:table}} >> {{outputs:log}} 2>&1
  }

;; Produce a counts table over all samples with htseq-count, splitting
;; off the trailing summary statistics into a separate file.
process htseq_count
  inputs
    . gtf: GTF_FILE
    . bams:
    files MAPPED_READS_DIR / SAMPLES "_Aligned.sortedByCoord.out.bam"
  outputs
    . stats_file:
    file COUNTS_DIR / "raw_counts" / "htseq_stats.txt"
    . counts_file:
    file COUNTS_DIR / "raw_counts" / "counts_from_star_htseq-count.txt"
    . log:
    file LOG_DIR / "htseq-count.log"
  packages
    . "sed"
    . "htseq" ; htseq-count
    . "coreutils" ; head, tail, rm
  procedure
    ;; Intermediate file holding the raw htseq-count output including
    ;; the five trailing summary lines.
    define tmp_file
      file COUNTS_DIR / "raw_counts" / "htseq_out.txt"
    ;; Fixed the broken template reference "{inputs::bams}}".
    # {
        echo {{SAMPLES}} | sed 's/ /\t/g' > {{tmp_file}}
        htseq-count {{inputs:bams}} {{inputs:gtf}} 1>> {{tmp_file}} 2>> {{outputs:log}}

        # move feature count stats (e.g. __no_feature etc) to another file
        echo {{SAMPLES}} > {{outputs:stats_file}}
        tail -n 5 {{tmp_file}} >> {{outputs:stats_file}}

        # only keep feature counts in the counts table (remove stats)
        head -n -5 {{tmp_file}} > {{outputs:counts_file}}

        # remove temp file
        rm {{tmp_file}}
    }

;; Create a normalized counts table including all samples using the
;; median-of-ratios normalization procedure of deseq2.
process norm_counts_deseq
  inputs
    . counts_file:
    file COUNTS_DIR / "raw_counts" / "counts_from_star.tsv"
    . colDataFile:
    pick first
      process-outputs translate_sample_sheet_for_report
    . script:
    file SCRIPTS_DIR / "norm_counts_deseq.R"
  outputs
    . outdir:
    file COUNTS_DIR / "normalized"
    ;; The size factors are also consumed by genomeCoverage.
    . size_factors:
    file COUNTS_DIR / "normalized" / "deseq_size_factors.txt"
    . norm_counts:
    file COUNTS_DIR / "normalized" / "deseq_normalized_counts.tsv"
    . log:
    file LOG_DIR / "norm_counts_deseq.log"
  packages
    . "r-minimal"
  # {
    {{RSCRIPT_EXEC}} {{script}} \
      {{inputs:counts_file}} \
      {{inputs:colDataFile}} \
      {{outputs:outdir}} >> {{outputs:log}} 2>&1
  }

;; TODO: merge with report2 and report3
;; DESeq report per analysis, based on the STAR counts matrix.
process report1 (with analysis)
  inputs
    . logo: LOGO
    . gtf: GTF_FILE
    . counts:
    file COUNTS_DIR / "raw_counts" / "counts_from_star.tsv"
    . coldata:
    pick first
      process-outputs translate_sample_sheet_for_report
    . reportR:
    file SCRIPTS_DIR / "runDeseqReport.R"
    . reportRmd:
    file SCRIPTS_DIR / "deseqReport.Rmd"
  outputs
    . outdir:
    file OUTPUT_DIR / "report"
    file OUTPUT_DIR / "report" / analysis ".star.deseq.report.html"
    . log:
    file LOG_DIR / analysis ".report.star.log"
  packages
    . "r-minimal"
  procedure
    ;; Case/control sample groups and covariates of this analysis, as
    ;; declared in the "DEanalyses" section of the configuration.
    define case
      get DE_ANALYSIS_LIST analysis "case_sample_groups"
    define control
      get DE_ANALYSIS_LIST analysis "control_sample_groups"
    define covariates
      get DE_ANALYSIS_LIST analysis "covariates"
    # {
      {{RSCRIPT_EXEC}} {{inputs:reportR}} \
        --logo={{inputs:logo}} \
        --prefix='{{analysis}}.star' \
        --reportFile={{inputs:reportRmd}} \
        --countDataFile={{inputs:counts}} \
        --colDataFile={{inputs:coldata}} \
        --gtfFile={{inputs:gtf}} \
        --caseSampleGroups='{{case}}' \
        --controlSampleGroups='{{control}}' \
        --covariates='{{covariates}}' \
        --workdir={{outputs:outdir}} \
        --organism='{{ORGANISM}}' >> {{outputs:log}} 2>&1
    }

;; DESeq report per analysis, based on the Salmon transcript counts.
process report2 (with analysis)
  inputs
    . logo: LOGO
    . gtf: GTF_FILE
    . counts:
    file COUNTS_DIR / "raw_counts" / "counts_from_SALMON.transcripts.tsv"
    . coldata:
    pick first
      process-outputs translate_sample_sheet_for_report
    . reportR:
    file SCRIPTS_DIR / "runDeseqReport.R"
    . reportRmd:
    file SCRIPTS_DIR / "deseqReport.Rmd"
  outputs
    . outdir:
    file OUTPUT_DIR / "report"
    file OUTPUT_DIR / "report" / analysis ".salmon.transcripts.deseq.report.html"
    . log:
    file LOG_DIR / analysis ".report.salmon.transcripts.log"
  packages
    . "r-minimal"
  procedure
    define case
      get DE_ANALYSIS_LIST analysis "case_sample_groups"
    define control
      get DE_ANALYSIS_LIST analysis "control_sample_groups"
    define covariates
      get DE_ANALYSIS_LIST analysis "covariates"
    ;; --workdir now references the tagged output, as in report1 and
    ;; report3; the previous bare "{{outdir}}" had no binding here.
    # {
      {{RSCRIPT_EXEC}} {{inputs:reportR}} \
        --logo={{inputs:logo}} \
        --prefix='{{analysis}}.salmon.transcripts' \
        --reportFile={{inputs:reportRmd}} \
        --countDataFile={{inputs:counts}} \
        --colDataFile={{inputs:coldata}} \
        --gtfFile={{inputs:gtf}} \
        --caseSampleGroups='{{case}}' \
        --controlSampleGroups='{{control}}' \
        --covariates='{{covariates}}' \
        --workdir={{outputs:outdir}} \
        --organism='{{ORGANISM}}' >> {{outputs:log}} 2>&1
    }

;; DESeq report per analysis, based on the Salmon gene counts.
process report3 (with analysis)
  inputs
    . logo: LOGO
    . gtf: GTF_FILE
    . counts:
    file COUNTS_DIR / "raw_counts" / "counts_from_SALMON.genes.tsv"
    . coldata:
    pick first
      process-outputs translate_sample_sheet_for_report
    . reportR:
    file SCRIPTS_DIR / "runDeseqReport.R"
    . reportRmd:
    file SCRIPTS_DIR / "deseqReport.Rmd"
  outputs
    . outdir:
    file OUTPUT_DIR / "report"
    file OUTPUT_DIR / "report" / analysis ".salmon.genes.deseq.report.html"
    . log:
    file LOG_DIR / analysis ".report.salmon.genes.log"
  packages
    . "r-minimal"
  procedure
    define case
      get DE_ANALYSIS_LIST analysis "case_sample_groups"
    define control
      get DE_ANALYSIS_LIST analysis "control_sample_groups"
    define covariates
      get DE_ANALYSIS_LIST analysis "covariates"
    ;; Fixed the --countDataFile template, which was missing a closing
    ;; brace ("{{inputs:counts}").
    # {
      {{RSCRIPT_EXEC}} {{inputs:reportR}} \
        --logo={{inputs:logo}} \
        --prefix='{{analysis}}.salmon.genes' \
        --reportFile={{inputs:reportRmd}} \
        --countDataFile={{inputs:counts}} \
        --colDataFile={{inputs:coldata}} \
        --gtfFile={{inputs:gtf}} \
        --caseSampleGroups='{{case}}' \
        --controlSampleGroups='{{control}}' \
        --covariates='{{covariates}}' \
        --workdir={{outputs:outdir}} \
        --organism='{{ORGANISM}}' >> {{outputs:log}} 2>&1
    }


;; The names of all requested differential-expression analyses (the
;; keys of the "DEanalyses" configuration section).
define ANALYSES
  map first DE_ANALYSIS_LIST
  
;; Top-level target: depends on every final artifact of the pipeline.
process final_report
  synopsis "Produce a comprehensive report.  This is the default target."
  inputs
    file OUTPUT_DIR / "star_index" / "SAindex"
    file OUTPUT_DIR / "salmon_index" / "sa.bin"
    file MULTIQC_DIR / "multiqc_report.html"
    file COUNTS_DIR / "raw_counts" / "counts_from_SALMON.transcripts.tsv"
    file COUNTS_DIR / "raw_counts" / "counts_from_SALMON.genes.tsv"
    file COUNTS_DIR / "normalized" / "TPM_counts_from_SALMON.transcripts.tsv"
    file COUNTS_DIR / "normalized" / "TPM_counts_from_SALMON.genes.tsv"
    file COUNTS_DIR / "raw_counts" / "counts_from_star.tsv"
    file COUNTS_DIR / "normalized" / "deseq_normalized_counts.tsv"
    file COUNTS_DIR / "normalized" / "deseq_size_factors.txt"

    files BIGWIG_DIR / SAMPLES ".forward.bigwig"
    files BIGWIG_DIR / SAMPLES ".reverse.bigwig"
    files OUTPUT_DIR / "report" / ANALYSES
      list ".star.deseq.report.html"
         . ".salmon.transcripts.deseq.report.html"
         . ".salmon.genes.deseq.report.html"
  procedure
    ' display "Final reports have been generated."

;; Convenience targets: each merely depends on one family of report
;; files, so requesting the target builds exactly those reports.
process deseq_report_star
  synopsis "Produce one HTML report for each analysis based on STAR results."
  inputs
    files OUTPUT_DIR / "report" / ANALYSES ".star.deseq.report.html"
  procedure
    ' display "STAR reports have been generated."

process deseq_report_salmon_transcripts
  synopsis "Produce one HTML report for each analysis based on SALMON results at transcript level."
  inputs
    files OUTPUT_DIR / "report" / ANALYSES ".salmon.transcripts.deseq.report.html"
  procedure
    ' display "SALMON transcript reports have been generated."

process deseq_report_salmon_genes
  synopsis "Produce one HTML report for each analysis based on SALMON results at gene level."
  inputs
    files OUTPUT_DIR / "report" / ANALYSES ".salmon.genes.deseq.report.html"
  procedure
    ' display "SALMON genes reports have been generated."

;; More convenience targets, one per intermediate result family.
;;
;; NOTE(review): several of the following names (star_map, fastqc,
;; salmon_index, salmon_quant) collide with the process templates
;; defined earlier in this file; the later definition shadows the
;; earlier one, which would break the "feed ... to" references in the
;; workflow below -- verify that GWL tolerates this, or rename these
;; targets.
process star_map
  synopsis "Produce a STAR mapping results in BAM file format."
  inputs
    files MAPPED_READS_DIR / SAMPLES "_Aligned.sortedByCoord.out.bam"
  procedure
    ' display "STAR mappings have been generated."

process star_counts
  synopsis "Get count matrix from STAR mapping results using summarizeOverlaps."
  inputs
    file COUNTS_DIR / "raw_counts" / "counts_from_star.tsv"
  procedure
    ' display "STAR count matrix has been generated."

process genome_coverage
  synopsis "Compute genome coverage values from BAM files - save in bigwig format"
  inputs
    files BIGWIG_DIR / SAMPLES (list ".forward" ".reverse") ".bigwig"
  procedure
    ' display "Genome coverage has been computed."

process fastqc
  synopsis "post-mapping quality control by FASTQC."
  inputs
    files FASTQC_DIR / SAMPLES "_Aligned.sortedByCoord.out_fastqc.zip"
  procedure
    ' display "FastQC quality control completed."

process salmon_index
  synopsis "Create SALMON index file."
  inputs
    file OUTPUT_DIR / "salmon_index" / "sa.bin"
  procedure
    ' display "SALMON index has been generated."

process salmon_quant
  synopsis "Calculate read counts per transcript using SALMON."
  inputs
    files SALMON_DIR / SAMPLES / (list "quant.sf" "quant.genes.sf")
  procedure
    ' display "SALMON read counts has been computed."

process salmon_counts
  synopsis "Get count matrix from SALMON quant."
  inputs
    file COUNTS_DIR / "raw_counts" / "counts_from_SALMON.transcripts.tsv"
    file COUNTS_DIR / "raw_counts" / "counts_from_SALMON.genes.tsv"
    file COUNTS_DIR / "normalized" / "TPM_counts_from_SALMON.transcripts.tsv"
    file COUNTS_DIR / "normalized" / "TPM_counts_from_SALMON.genes.tsv"
  procedure
    ' display "SALMON count matrix has been computed."

process multiqc-generated
  synopsis "Get multiQC report based on STAR alignments and fastQC reports."
  inputs
    file MULTIQC_DIR / "multiqc_report.html"
  procedure
    ' display "multiQC report completed."

;; TODO: allow selection of targets
;; # Selected output files from the above set.
;; selected_targets = config['execution']['target'] or ['final-report']

;; Assemble the full workflow: instantiate the per-sample and
;; per-analysis process templates, then let auto-connect wire
;; processes together through their matching inputs/outputs.
workflow
  packages
    . "guile"
    . "guile-dsv" ; for CSV file parsing
  processes
    apply auto-connect
      append
        ;; process templates using samples
        ;; (Fixed: the trim galore processes are defined with
        ;; underscores, not hyphens.)
        feed SAMPLES to
          . trim_galore_pe
          . trim_galore_se
          . star_map
          . sort_bam
          . index_bam
          . fastqc
          . salmon_quant
          . genomeCoverage
          . count_reads
        ;; report process templates using analyses
        feed ANALYSES to
          . report1
          . report2
          . report3
        ;; simple processes
        list
          . translate_sample_sheet_for_report
          . star_index
          . salmon_index
          . counts_from_SALMON
          . multiqc
          . collate_read_counts
          . htseq_count
          . norm_counts_deseq
        ;; public targets
        ;; NOTE(review): star_map, fastqc, salmon_index and
        ;; salmon_quant here refer to the *target* processes, which
        ;; shadow the identically named templates fed above -- verify.
        list
          . final_report
          . deseq_report_star
          . deseq_report_salmon_transcripts
          . deseq_report_salmon_genes
          . star_map
          . star_counts
          . genome_coverage
          . fastqc
          . salmon_index
          . salmon_quant
          . salmon_counts
          . multiqc-generated

;; Generated by Ricardo Wurmus using scpaste at Sun Jan 17 12:12:11 2021. CET. (original)