diff --git a/MelArray/MelArray.snake b/MelArray/MelArray.snake index 6f8ff2b578787b3a587aae957d1274cf3b7e60b6..f2beeb451438db0eb3ab858219a25d9a3bf3e96d 100644 --- a/MelArray/MelArray.snake +++ b/MelArray/MelArray.snake @@ -10,61 +10,150 @@ # INIT # ######## + +# import entire packages import os, logging +# import some specific packages from timeit import default_timer as t from datetime import datetime +# import the load_config function from the scripts folder +from scripts.load_config import load_config + + # set up logging logging.basicConfig(level = logging.INFO, format = '#%(asctime)s | %(levelname)-7s | %(filename)-20s: %(message)s') -logging.info('Starting snakemake pipeline ...') +# store the original "config" dictionary that is created from the input parameters +input_params = config -# load the configuration file +# load the configuration file from the "config.yaml" in the current working directory config_file_path = os.getcwd() + "/config.yaml" config = load_config(config_file_path) -# get the list of files to process -IDs, = glob_wildcards(IN_FOLDER + "{id}.fastq") +# store the paths as a separate dictionary +paths = config['path'] + +# get the list of files to process using the regexp: [0-9]*.A-\([A-Za-z0-9-]*\)_[A-Za-z0-9_]*_R1.fastq.gz +# this creates 4 lists, one for each part of the regexp +dates, lanes, IDs, others = glob_wildcards(paths['fastq'] + + "/{date,[0-9]*}.{lanes,[AB]}-{ID,[A-Za-z0-9-]*}_{other,[A-Za-z0-9_]*}_R1.fastq.gz") + +# check if the IDs were not also specified in the input parameters, in which case we should use those IDs +IDs = input_params.get('IDs', IDs) +# transform the comma-separated values string into a list of strings +if type(IDs) == str: IDs = IDs.split(',') + +# get the mapping between the initial fastq file path and their ID +FASTQ_R1_FILES, FASTQ_R2_FILES, i_ID = {}, {}, 0 +# loops through all the IDs +for ID in IDs: + # create the full path for the R1 read and store it in the dictionary under the [ID] key + FASTQ_R1_FILES[ID] = paths['fastq'] + '/' + dates[i_ID] + '.' + lanes[i_ID] + '-' + ID + '_' + others[i_ID] + '_R1.fastq.gz' + # create the full path for the R2 read and store it in the dictionary under the [ID] key + FASTQ_R2_FILES[ID] = paths['fastq'] + '/' + dates[i_ID] + '.' + lanes[i_ID] + '-' + ID + '_' + others[i_ID] + '_R2.fastq.gz' + # increment the counter + i_ID += 1 + + +# piece of code to be executed before the pipeline starts +onstart: -# check if IDs were not specified in config -IDs = config.get('ids', IDs) -if type(IDs) == str: - IDs = IDs.split(',') + logging.info('Starting MelArray Snakemake pipeline ...') + logging.info("IDs:") + for ID in IDs: + logging.info(" " + ID) ######### # RULES # ######### +# final step that will gather all the outputs rule all: input: - expand(OUT_FOLDER + '{id}.bam', id = IDs) + expand(paths['dedup'] + '/{id}_sorted_dedup.bam', id = IDs), + expand(paths['dedup'] + '/{id}_metrics.txt', id = IDs) + + run: logging.info('Pipeline completed.') -rule step1: +# trimming step +rule skewer: input: - IN_FOLDER + '{id}.fastq' + # inputs are generated by looking up the path corresponding to the current ID + fastq_R1 = lambda wildcards: FASTQ_R1_FILES[wildcards.id], + fastq_R2 = lambda wildcards: FASTQ_R2_FILES[wildcards.id], output: - TMP_FOLDER + '{id}.sam' + trimmed_R1 = paths['trimmed'] + '/{id}-trimmed-pair1.fastq.gz', + trimmed_R2 = paths['trimmed'] + '/{id}-trimmed-pair2.fastq.gz' + log: + logfile = paths['trimmed'] + '/{id}_skewer_output.log' shell: - 'cat {input} > {output}' - - -rule step2: + # create the command line using the parameters contained in config['skewer'] + 'skewer -t {n_threads} -m {trimming_mode} -q {quality_threshold} -z --quiet '.format(**config['skewer']) + + # add the output to be in the 'trimmed' folder and to have the [ID] prefix + '-o {OUT_FOLDER_PATH}/{{wildcards.id}} '.format(OUT_FOLDER_PATH = paths['trimmed']) + + # finish the command by using the input paths that Snakemake will automatically replace + '{input.fastq_R2} {input.fastq_R1} ' + + # dump all standard and error output to the log file + '&> {log.logfile}' + + +# alignment and sorting step +rule bwa_bam: input: - TMP_FOLDER + '{id}.sam' + # use the outputs of the previous step as inputs for the current step + trimmed_R1 = rules.skewer.output.trimmed_R1, + trimmed_R2 = rules.skewer.output.trimmed_R2 output: - OUT_FOLDER + '{id}.bam' + sorted_bam = paths['sorted'] + '/{id}_sorted.bam' + log: + logfile = paths['sorted'] + '/{id}_bwa_bam_output.log' shell: - 'cat {input} > {output}' + # create the command line using the parameters contained in config['bwa'] + 'bwa mem -t {n_threads} -M -R "@RG\tID:{ID}\tLB:{LB}\tSM:{{wildcards.id}}\tPL:{PL}\tPU:{PU}" '.format(**config['bwa']) + + # complete the bwa command with the reference genome and the two inputs + '{ref_genome} {{input.trimmed_R1}} {{input.trimmed_R2}} '.format(**paths) + + # dump all standard and error output to the log file + '&> {log.logfile} ' + + # pipe the output into the picard tools with the amount of memory specified in the config + '| java -Xmx{memory}G '.format(**config['bwa']) + + # specify the picard JAR path and the output path + '-jar {picard} SortSam I=/dev/stdin O={{output.sorted_bam}} '.format(**paths) + + # complete the picard command with the static parameters + 'CREATE_INDEX=true VALIDATION_STRINGENCY=SILENT SO=coordinate ' + + # dump all standard and error output to the log file + '&> {log.logfile}' + +# marking duplicates step +rule mark_duplicates: + input: + # use the output of the previous step as input for the current step + sorted_bam = rules.bwa_bam.output.sorted_bam + output: + dedup_bam = paths['dedup'] + '/{id}_sorted_dedup.bam', + metrics = paths['dedup'] + '/{id}_metrics.txt' + log: + logfile = paths['dedup'] + '/{id}_sorted_dedup_output.log' + shell: + # create the command for the picard tools with the amount of memory specified in the config + 'java -Xmx{memory}G '.format(**config['bwa']) + + # specify the picard JAR path and the output path + '-jar {picard} MarkDuplicates I={{input.sorted_bam}} O={{output.dedup_bam}} '.format(**paths) + + # complete the command with the output of the metrics file and dump all standard and error output to the log file + 'M={output.metrics} &> {log.logfile}' + +# piece of code to be executed after the pipeline ends *without* error onsuccess: logging.info('Pipeline completed successfully.') +# piece of code to be executed after the pipeline ends *with* an error onerror: logging.info('Pipeline failed.') diff --git a/MelArray/config.yaml b/MelArray/config.yaml index 2443b4af707b0b091aedd947ed6f3ffc8959d5df..cd3b6c22004db01e2be14ab87e1fb16d84f9f9c7 100644 --- a/MelArray/config.yaml +++ b/MelArray/config.yaml @@ -9,11 +9,77 @@ # @date: 2017 +# TODO: make it flexible for single end and paired-end + + # file and folder paths path: - # root path for the code/scripts - root: "/vagrant/MelArray" + # root path for the pipeline / project + project_root: "/vagrant/MelArray" + + + # root path for the data folders + data_root: "__PROJECT_ROOT__/data" + + # path to the folder where the original fastq files are stored + fastq: "__DATA_ROOT__/fastq" + + # folder where the trimmed fastq files are stored + trimmed: "__DATA_ROOT__/trimmed" + + # folder where the sorted bam files are stored + sorted: "__DATA_ROOT__/sorted" + + # folder where the sorted & deduplicated bam files are stored + dedup: "__DATA_ROOT__/sorted_dedup" + + + # root path for the reference files + ref_root: "__PROJECT_ROOT__/ref" + + # path where the reference fasta genome is stored +# ref_genome: "/data/Phil/ref_phil/GATK_resource/hg19/ucsc.hg19.fasta" + ref_genome: "__REF_ROOT__/hg19/ucsc.hg19.fasta" + + + # root path for the software & tools + soft_root: "__PROJECT_ROOT__/software" + + # path where the picard tools JAR file is located +# picard: "/data/Phil/software/picard-tools-2.4.1/picard.jar" + picard: "__SOFT_ROOT__/picard-tools-2.9.0/picard.jar" + + + + +# parameters for skewer +skewer: + + # number of threads (-t) + n_threads: "2" + + # quality-based trimming or filtering (-q) + quality_threshold: "20" + + # trimming mode (-m, values: 'head', 'tail', 'any', 'pe', 'mp', 'ap') + trimming_mode: "pe" + + +# parameters for bwa +bwa: + + # number of threads (-t) + n_threads: "2" + + # amount of RAM for Java (-Xmx..G) + memory: "14" + + # header read group columns + ID: "Melanoma" + LB: "Melarray" + PL: "ILLUMINA" + PU: "HiSEQ4000" - # folder where the original fastq data is - fastq: "__ROOT__/fastq" + # trimming mode (-m, values: 'head', 'tail', 'any', 'pe', 'mp', 'ap') + trimming_mode: "pe" diff --git a/MelArray/other/create_dag.sh b/MelArray/other/create_dag.sh new file mode 100644 index 0000000000000000000000000000000000000000..212c4ba239ecdbd15c70c05b9336c32175dc8c5c --- /dev/null +++ b/MelArray/other/create_dag.sh @@ -0,0 +1 @@ +#!/usr/bin/env bash \ No newline at end of file diff --git a/MelArray/other/dag_v01.dag b/MelArray/other/dag_v01.dag new file mode 100644 index 0000000000000000000000000000000000000000..a4f544c6e8b55464550af1de6470730fcbc6f0d2 --- /dev/null +++ b/MelArray/other/dag_v01.dag @@ -0,0 +1,14 @@ +digraph snakemake_dag { + graph[bgcolor=white, margin=0]; + node[shape=box, style=rounded, fontname=sans, fontsize=10, penwidth=2]; + edge[penwidth=2, color=grey]; + 0[label = "skewer\nid: PBMCPatient1", color = "0.33 0.6 0.85", style="rounded,dashed"]; + 1[label = "all", color = "0.00 0.6 0.85", style="rounded"]; + 2[label = "skewer\nid: MTBPatient2", color = "0.33 0.6 0.85", style="rounded,dashed"]; + 3[label = "skewer\nid: MTBPatient1", color = "0.33 0.6 0.85", style="rounded,dashed"]; + 4[label = "skewer\nid: PBMCPatient2", color = "0.33 0.6 0.85", style="rounded,dashed"]; + 0 -> 1 + 2 -> 1 + 3 -> 1 + 4 -> 1 +} diff --git a/MelArray/other/dag_v01.png b/MelArray/other/dag_v01.png new file mode 100644 index 0000000000000000000000000000000000000000..c7508bab5aa5b4ae52ab14fb8b01d8028131a074 Binary files /dev/null and b/MelArray/other/dag_v01.png differ diff --git a/MelArray/other/dag_v01_hori.png b/MelArray/other/dag_v01_hori.png new file mode 100644 index 0000000000000000000000000000000000000000..cb706a4d7e4251d356687b96af34c4d8dd5386de Binary files /dev/null and b/MelArray/other/dag_v01_hori.png differ diff --git a/MelArray/other/dag_v01_vert.png b/MelArray/other/dag_v01_vert.png new file mode 100644 index 0000000000000000000000000000000000000000..7a4bf95b700017e2158877dc0986c671dd46faf0 Binary files /dev/null and b/MelArray/other/dag_v01_vert.png differ diff --git a/MelArray/other/dag_v02.dag b/MelArray/other/dag_v02.dag new file mode 100644 index 0000000000000000000000000000000000000000..611be4624290830d495d0181f01fc82a6967bf07 --- /dev/null +++ b/MelArray/other/dag_v02.dag @@ -0,0 +1,30 @@ +digraph snakemake_dag { + graph[bgcolor=white, margin=0]; + node[shape=box, style=rounded, fontname=sans, fontsize=10, penwidth=2]; + edge[penwidth=2, color=grey]; + 0[label = "bwa_bam", color = "0.50 0.6 0.85", style="rounded,dashed"]; + 1[label = "mark_duplicates", color = "0.33 0.6 0.85", style="rounded"]; + 2[label = "bwa_bam", color = "0.50 0.6 0.85", style="rounded,dashed"]; + 3[label = "skewer\nid: PBMCpatient1", color = "0.17 0.6 0.85", style="rounded,dashed"]; + 4[label = "bwa_bam", color = "0.50 0.6 0.85", style="rounded,dashed"]; + 5[label = "mark_duplicates", color = "0.33 0.6 0.85", style="rounded"]; + 6[label = "bwa_bam", color = "0.50 0.6 0.85", style="rounded,dashed"]; + 7[label = "skewer\nid: MTBPatient2", color = "0.17 0.6 0.85", style="rounded,dashed"]; + 8[label = "mark_duplicates", color = "0.33 0.6 0.85", style="rounded"]; + 9[label = "mark_duplicates", color = "0.33 0.6 0.85", style="rounded"]; + 10[label = "skewer\nid: MTBPatient1", color = "0.17 0.6 0.85", style="rounded,dashed"]; + 11[label = "all", color = "0.00 0.6 0.85", style="rounded"]; + 12[label = "skewer\nid: PBMCpatient2", color = "0.17 0.6 0.85", style="rounded,dashed"]; + 10 -> 0 + 0 -> 1 + 7 -> 2 + 12 -> 4 + 2 -> 5 + 3 -> 6 + 6 -> 8 + 4 -> 9 + 5 -> 11 + 1 -> 11 + 8 -> 11 + 9 -> 11 +} diff --git a/MelArray/other/dag_v02_hori.png b/MelArray/other/dag_v02_hori.png new file mode 100644 index 0000000000000000000000000000000000000000..a6092350b8ba4045361758eb2cd6b7f97afe0ca0 Binary files /dev/null and b/MelArray/other/dag_v02_hori.png differ diff --git a/MelArray/other/dag_v02_vert.png b/MelArray/other/dag_v02_vert.png new file mode 100644 index 0000000000000000000000000000000000000000..0a401e639617b597dbc127148cc7b4075e95ebe6 Binary files /dev/null and b/MelArray/other/dag_v02_vert.png differ diff --git a/MelArray/original_bash_scripts/Mel_1_skewer_170228.sh b/MelArray/other/original_bash_scripts/Mel_1_skewer_170228.sh similarity index 100% rename from MelArray/original_bash_scripts/Mel_1_skewer_170228.sh rename to MelArray/other/original_bash_scripts/Mel_1_skewer_170228.sh diff --git a/MelArray/original_bash_scripts/Mel_2_bwa_GATK_170228.sh b/MelArray/other/original_bash_scripts/Mel_2_bwa_GATK_170228.sh similarity index 100% rename from MelArray/original_bash_scripts/Mel_2_bwa_GATK_170228.sh rename to MelArray/other/original_bash_scripts/Mel_2_bwa_GATK_170228.sh diff --git a/MelArray/other/rule_graph_v01.dag b/MelArray/other/rule_graph_v01.dag new file mode 100644 index 0000000000000000000000000000000000000000..40c5e0a4a4ffe3410d35e32ec6be43a8b0445f27 --- /dev/null +++ b/MelArray/other/rule_graph_v01.dag @@ -0,0 +1,8 @@ +digraph snakemake_dag { + graph[bgcolor=white, margin=0]; + node[shape=box, style=rounded, fontname=sans, fontsize=10, penwidth=2]; + edge[penwidth=2, color=grey]; + 0[label = "all", color = "0.00 0.6 0.85", style="rounded"]; + 1[label = "skewer", color = "0.33 0.6 0.85", style="rounded"]; + 1 -> 0 +} diff --git a/MelArray/other/rule_graph_v01.png b/MelArray/other/rule_graph_v01.png new file mode 100644 index 0000000000000000000000000000000000000000..2e3adb3b98dd1e773858a75de89ece46a29aadac Binary files /dev/null and b/MelArray/other/rule_graph_v01.png differ diff --git a/MelArray/other/rule_graph_v01_hori.png b/MelArray/other/rule_graph_v01_hori.png new file mode 100644 index 0000000000000000000000000000000000000000..5acf11c10e6a68fa99a14a6d483a3d9e5d47bbd5 Binary files /dev/null and b/MelArray/other/rule_graph_v01_hori.png differ diff --git a/MelArray/other/rule_graph_v01_vert.png b/MelArray/other/rule_graph_v01_vert.png new file mode 100644 index 0000000000000000000000000000000000000000..45e0ade85db7a112e4ddf2435d98f07afb751ecd Binary files /dev/null and b/MelArray/other/rule_graph_v01_vert.png differ diff --git a/MelArray/other/rule_graph_v02.dag b/MelArray/other/rule_graph_v02.dag new file mode 100644 index 0000000000000000000000000000000000000000..85b31a4b3c2d65ceeb5f3cf1b2a1abae84d60253 --- /dev/null +++ b/MelArray/other/rule_graph_v02.dag @@ -0,0 +1,12 @@ +digraph snakemake_dag { + graph[bgcolor=white, margin=0]; + node[shape=box, style=rounded, fontname=sans, fontsize=10, penwidth=2]; + edge[penwidth=2, color=grey]; + 0[label = "bwa_bam", color = "0.50 0.6 0.85", style="rounded"]; + 1[label = "all", color = "0.17 0.6 0.85", style="rounded"]; + 2[label = "skewer", color = "0.33 0.6 0.85", style="rounded"]; + 3[label = "mark_duplicates", color = "0.00 0.6 0.85", style="rounded"]; + 2 -> 0 + 3 -> 1 + 0 -> 3 +} diff --git a/MelArray/other/rule_graph_v02_hori.png b/MelArray/other/rule_graph_v02_hori.png new file mode 100644 index 0000000000000000000000000000000000000000..4b09653ac52321a4de09ce5c1e83bdd22698f460 Binary files /dev/null and b/MelArray/other/rule_graph_v02_hori.png differ diff --git a/MelArray/other/rule_graph_v02_vert.png b/MelArray/other/rule_graph_v02_vert.png new file mode 100644 index 0000000000000000000000000000000000000000..e2bb7e1be4eecbe45a689c519c6f2d133177d6ef Binary files /dev/null and b/MelArray/other/rule_graph_v02_vert.png differ diff --git a/MelArray/scripts/load_config.py b/MelArray/scripts/load_config.py index 4906c0275770a5c52cfffff9886eb2b60c851ad8..bc839b5b5a142d60fc4104633c0a00185383d29f 100644 --- a/MelArray/scripts/load_config.py +++ b/MelArray/scripts/load_config.py @@ -1,6 +1,7 @@ import logging import yaml +import re def load_config(config_file_path): @@ -11,11 +12,34 @@ def load_config(config_file_path): @date: 2017 """ - logging.debug("Loading config file from %s", config_file_path) + logging.debug("Loading config file from %s ...", config_file_path) with open(config_file_path, 'r') as config_file: config = yaml.load(config_file) - # initialize the path(s) by replacing the '__ROOT__' tag with the actual root path - config["path"]["fastq"] = config["path"]["fastq"].replace("__ROOT__", config["path"]['root']) + # exract the paths as a separate dictionary + paths = config['path'] + + # do the "root" parts replacing in the paths + logging.debug("Replacing '__XXXX_ROOT__' tags ...", config_file_path) + is_match = True + # repeat until no more roots have been replaced + while is_match: + is_match = False + # loop through each key/value pair in the "path" dictionary + for key, value in paths.items(): + # replace the "__XXX_ROOT__" tags in the path with their corresponding path entry + # for example: "__DATA_ROOT__" should be replaced with paths['data_root'] + if re.match('__\w+_ROOT__', value): + # a match was found, make sure to loop again + is_match = True + # extract the different parts of the path + re_match = re.match('(.*)__(\w+)_ROOT__(.*)', value) + # get the root's name, like "DATA" or "REF" + root_name = re_match.groups()[1] + # get the lower case version ("data" or "ref") + root_name_lower = root_name.lower() + # replace the path: for example "__DATA_ROOT__" with paths['data_root'] + paths[key] = value.replace("__" + root_name + "_ROOT__", paths[root_name_lower + '_root']) + return config \ No newline at end of file diff --git a/Vagrantfile b/Vagrantfile index 1a55195885faf232a8239365cc6c5090028c0619..a75540887be3c24a22a91df5e32733991e8607e5 100644 --- a/Vagrantfile +++ b/Vagrantfile @@ -37,20 +37,23 @@ Vagrant.configure("2") do |config| # the path on the host to the actual folder. The second argument is # the path on the guest to mount the folder. And the optional third # argument is a set of non-required options. - # config.vm.synced_folder "src", "target" + config.vm.synced_folder "/data/USZ/data", "/vagrant/MelArray/data" + config.vm.synced_folder "/data/USZ/ref", "/vagrant/MelArray/ref" + config.vm.synced_folder "/data/USZ/software", "/vagrant/MelArray/software" # Provider-specific configuration so you can fine-tune various # backing providers for Vagrant. These expose provider-specific options. # Example for VirtualBox: - # - # config.vm.provider "virtualbox" do |vb| - # # Display the VirtualBox GUI when booting the machine - # vb.gui = true - # - # # Customize the amount of memory on the VM: - # vb.memory = "1024" - # end - # + + config.vm.provider "virtualbox" do |vb| + # Display the VirtualBox GUI when booting the machine + vb.gui = false + # Customize the amount of memory on the VM: + vb.memory = "14336" + # Customize the amount of CPUs on the VM: + vb.cpus = "2" + end + # View the documentation for the provider you are using for more # information on available options. diff --git a/pipeline1/dag.png b/pipeline1/dag.png new file mode 100644 index 0000000000000000000000000000000000000000..d46adae2e744f4778a03804a9ea85ee57f112e85 Binary files /dev/null and b/pipeline1/dag.png differ diff --git a/pipeline1/dag_partial.png b/pipeline1/dag_partial.png new file mode 100644 index 0000000000000000000000000000000000000000..5d4d50d5a8bfc53d9e59807ef18272f406ee6fac Binary files /dev/null and b/pipeline1/dag_partial.png differ diff --git a/pipeline1/rule_graph.png b/pipeline1/rule_graph.png new file mode 100644 index 0000000000000000000000000000000000000000..3e9b3beee7df54662265b3faf73770a254add54b Binary files /dev/null and b/pipeline1/rule_graph.png differ