diff --git a/MelArray/Dockerfile b/MelArray/Dockerfile index 6d0ff6c42caf005ffd94cdb7c8c8debbbcc783d2..ecdc3acbf01e9aa08baeaa454e8a15d574ff611a 100644 --- a/MelArray/Dockerfile +++ b/MelArray/Dockerfile @@ -29,7 +29,7 @@ ENV BWA_VERSION "0.7.13" ENV PICARD_VERSION "2.9.2" # GATK is a toolkit to do variant discovery and genotyping. Because downloading GATK requires a # login, the GATK tar file has to be manually downloaded and put in the $GATK_PATH folder -ENV GATK_VERSION "3.6" +ENV GATK_VERSION "3.7" ENV GATK_PATH "./software" # R is needed for generating plots with GATK ENV R_VERSION "3.3.3" @@ -107,8 +107,6 @@ RUN cd /usr/local/src && tar xf "bwa-$BWA_VERSION.tar.bz2" RUN cd /usr/local/src/bwa-$BWA_VERSION && make RUN cp /usr/local/src/bwa-$BWA_VERSION/bwa /usr/local/bin -ENV PICARD_VERSION "2.4.1" -# TODO: clean this up, this should go top RUN echo " - Installing picard tools $PICARD_VERSION ..." RUN wget -O "/usr/local/bin/picard.jar"\ "https://github.com/broadinstitute/picard/releases/download/$PICARD_VERSION/picard.jar" diff --git a/MelArray/MelArray.snake b/MelArray/MelArray.snake index 8ff8d3ba7500d2c0327ce2471e71784e2ea54b90..96f132f0db409b64d6428726b2993cf5395cad5c 100644 --- a/MelArray/MelArray.snake +++ b/MelArray/MelArray.snake @@ -11,7 +11,7 @@ ########## # import entire packages -import os, logging, platform, re +import os, logging, platform, re, hashlib # import some specific packages from timeit import default_timer as t @@ -142,6 +142,9 @@ indexes = [i for i in indexes if i >= 0 and i < len(IDs)] if len(indexes) == 0: indexes = list(range(len(IDs))) # get the subset of IDs based on the unique list indexes ("set" makes a list unique) IDs = [IDs[i] for i in set(indexes)] +# calculate the MD5 hash of the sorted list of IDs +IDs_MD5 = hashlib.md5(repr(IDs.sort()).encode('UTF-8')).hexdigest() + logging.debug("IDs to process: ({} ID(s))", len(IDs)) logging.debug(IDs) @@ -169,8 +172,6 @@ for i_ID in range(len(original_IDs)): .replace(paths['fastq'], paths['fastq_renamed_root']) -# TODO: add a check here to make sure all files exist - # show the content of the mapping dictionaries logging.debug("FASTQ_R1_FILES:" + str(FASTQ_R1_FILES)) logging.debug("FASTQ_R2_FILES:" + str(FASTQ_R2_FILES)) @@ -254,16 +255,20 @@ rule all: # put here all the files that are required to be present at the end of the pipeline: # "expand" creates a list of paths where "{id}" is replaced by the values in the "IDs" list recal_plots = expand(paths['recal_plots'], id = IDs), + # multi_qc is a unique path multi_qc = paths['multi_qc'], - hc_vcf = expand(paths['hc_vcf'], id = IDs), + # expand '{tumor}' with elements from TUMOR_LIST and '{normal}' with elements from NORMAL_LIST MuTect2 = [ paths['MuTect2_vcf'] .replace('{tumor}', TUMOR_LIST[i_list]) .replace('{normal}', NORMAL_LIST[i_list]) for i_list in range(len(TUMOR_LIST)) ], + # expand '{tumor}' with elements from TUMOR_LIST and '{normal}' with elements from NORMAL_LIST small_seqz = [ paths['small_seqz'] .replace('{tumor}', TUMOR_LIST[i_list]) .replace('{normal}', NORMAL_LIST[i_list]) - for i_list in range(len(TUMOR_LIST)) ] + for i_list in range(len(TUMOR_LIST)) ], + # expand '{md5}' with the MD5 hash (string) stored in IDs_MD5 + recal_indel_vcf = expand(paths['recal_indel_vcf'], md5 = IDs_MD5) run: logging.info('Pipeline completed.') @@ -431,7 +436,7 @@ rule base_recal: # specify the GATK JAR path, the GATK command and the reference genome '-jar {gatk} -T BaseRecalibrator -R {ref_genome} '.format(**rule_paths) + # specify the reference files - '-knownSites {dbsnip} -knownSites {Mills} -knownSites {1000G} -L {bed} '.format(**rule_paths) + + '-knownSites {dbsnp} -knownSites {mills} -knownSites {1000G} -L {bed} '.format(**rule_paths) + # specify the input and output path '-I {fixmate_bam} -o {base_recal_table} '.format(**rule_paths) + # specify the interval padding @@ -445,7 +450,7 @@ rule base_recal: # post recalibration step rule post_recal: input: - # use the output of the previous steps as input for the current step + # use the output of the previous step(s) as input for the current step fixmate_bam = rules.fix_mate.output.fixmate_bam, base_recal_table = rules.base_recal.output.base_recal_table output: @@ -462,7 +467,7 @@ rule post_recal: # specify the GATK JAR path, the GATK command and the reference genome '-jar {gatk} -T BaseRecalibrator -R {ref_genome} '.format(**rule_paths) + # specify the reference files - '-knownSites {dbsnip} -knownSites {Mills} -knownSites {1000G} -L {bed} '.format(**rule_paths) + + '-knownSites {dbsnp} -knownSites {mills} -knownSites {1000G} -L {bed} '.format(**rule_paths) + # specify the inputs and output path '-I {fixmate_bam} -BQSR {base_recal_table} -o {post_recal_table} '.format(**rule_paths) + # specify the interval padding @@ -476,7 +481,7 @@ rule post_recal: # analyze covariates step rule analyze_covariates: input: - # use the output of the previous steps as input for the current step + # use the output of the previous step(s) as input for the current step base_recal_table = rules.base_recal.output.base_recal_table, post_recal_table = rules.post_recal.output.post_recal_table output: @@ -503,7 +508,7 @@ rule analyze_covariates: # print reads step rule print_reads: input: - # use the output of the previous steps as input for the current step + # use the output of the previous step(s) as input for the current step fixmate_bam = rules.fix_mate.output.fixmate_bam, base_recal_table = rules.base_recal.output.base_recal_table output: @@ -530,7 +535,7 @@ rule print_reads: # create interval list file from bed file rule bed_to_interval_list: input: - # use the output of the previous steps as input for the current step + # use the output of the previous step(s) as input for the current step bed = paths['bed'] output: interval_list = paths['interval_list'] @@ -557,7 +562,7 @@ rule bed_to_interval_list: # hybrid selection quality control step rule hybrid_sel_QC: input: - # use the output of the previous steps as input for the current step + # use the output of the previous step(s) as input for the current step bqsr_bam = rules.print_reads.output.bqsr_bam, interval_list = rules.bed_to_interval_list.output.interval_list output: @@ -586,7 +591,7 @@ rule hybrid_sel_QC: # alignment summary quality control step rule align_QC: input: - # use the output of the previous steps as input for the current step + # use the output of the previous step(s) as input for the current step bqsr_bam = rules.print_reads.output.bqsr_bam output: aln_qc_metrics = paths['aln_qc_metrics'] @@ -612,7 +617,7 @@ rule align_QC: # pull together the quality controls using multiQC rule multi_qc: input: - # use the output of the previous steps as input for the current step + # use the output of the previous step(s) as input for the current step trimmed_log = expand(rules.skewer.output.trimmed_log, id = IDs), dedup_metrics = expand(rules.mark_duplicates.output.dedup_metrics, id = IDs), hs_qc_metrics = expand(rules.hybrid_sel_QC.output.hs_qc_metrics, id = IDs), @@ -637,7 +642,7 @@ rule multi_qc: # haplotype caller step rule haplotype_caller: input: - # use the output of the previous steps as input for the current step + # use the output of the previous step(s) as input for the current step bqsr_bam = rules.print_reads.output.bqsr_bam output: hc_vcf = paths['hc_vcf'], @@ -650,8 +655,8 @@ rule haplotype_caller: SINGULARITY_CMD + # create the command for GATK with the amount of memory specified in the config 'java -Xmx{memory}G '.format(**soft['gatk']) + - # specify the GATK JAR path, the GATK command and the reference genome - '-jar {gatk} -T HaplotypeCaller -R {ref_genome} --dbsnp {dbsnip} '.format(**rule_paths) + + # specify the GATK JAR path, the GATK command, the reference genome and some more references + '-jar {gatk} -T HaplotypeCaller -R {ref_genome} --dbsnp {dbsnp} '.format(**rule_paths) + # specify the inputs and outputs '-I {bqsr_bam} -o {hc_vcf} '.format(**rule_paths) + # add some parameters from the configuration that are specific for this step @@ -669,7 +674,7 @@ rule haplotype_caller: # MuTect2 step rule MuTect2: input: - # use the output of the previous steps as input for the current step + # use the output of the previous step(s) as input for the current step bqsr_bam_normal = paths['bqsr_bam'].replace('{id}', '{normal}'), bqsr_bam_tumor = paths['bqsr_bam'].replace('{id}', '{tumor}') output: @@ -684,7 +689,7 @@ rule MuTect2: # create the command for GATK with the amount of memory specified in the config 'java -Xmx{memory}G '.format(**soft['gatk']) + # specify the GATK JAR path, the GATK command and the reference files - '-jar {gatk} -T MuTect2 -R {ref_genome} --dbsnp {dbsnip} --cosmic {cosmic} -L {bed} '.format(**rule_paths) + + '-jar {gatk} -T MuTect2 -R {ref_genome} --dbsnp {dbsnp} --cosmic {cosmic} -L {bed} '.format(**rule_paths) + # specify the normal input '-I:normal {bqsr_bam} '.format(**rule_paths).replace('{id}', '{wildcards.normal}') + # specify the tumor input @@ -700,10 +705,11 @@ rule MuTect2: # replace all instances of '{normal}' with '{wildcards.normal}' .replace('{normal}', '{wildcards.normal}') + # pileup step rule pileup: input: - # use the output of the previous steps as input for the current step + # use the output of the previous step(s) as input for the current step bqsr_bam = rules.print_reads.output.bqsr_bam output: pileup = paths['pileup'] @@ -726,17 +732,16 @@ rule pileup: '2>{pileup_out} '.format(**paths) + # pipe the standard output (stdout) to the next software call ... '| ' + - # append again the Singularity container call (if needed) - #SINGULARITY_CMD + # pipe the standard output (stdout) into gzip - 'gzip > {pileup} '.format(**paths) + 'gzip > {pileup}'.format(**paths) # replace all instances of '{id}' with '{wildcards.id}' ).replace('{id}', '{wildcards.id}') + # create 50bp GC binning file if it doesn't exist rule GCbinning: input: - # use the output of the previous steps as input for the current step + # use the output of the previous step(s) as input for the current step ref = paths['ref_genome'] output: gcbin = paths['50bp'] @@ -755,15 +760,14 @@ rule GCbinning: '2>{50bp_out} '.format(**paths) + # pipe the standard output (stdout) to the next software call ... '| ' + - # append again the Singularity container call (if needed) - #SINGULARITY_CMD + # pipe the standard output (stdout) into gzip - 'gzip > {50bp} '.format(**paths) + 'gzip > {50bp}'.format(**paths) + # pileup2seqz step rule pileup2seqz: input: - # use the output of the previous steps as input for the current step + # use the output of the previous step(s) as input for the current step pileup_normal = paths['pileup'].replace('{id}', '{normal}'), pileup_tumor = paths['pileup'].replace('{id}', '{tumor}'), gcbin = paths['50bp'] @@ -788,19 +792,18 @@ rule pileup2seqz: '2>{seqz_out} '.format(**paths) + # pipe the standard output (stdout) to the next software call ... '| ' + - # append again the Singularity container call (if needed) - # SINGULARITY_CMD + # pipe the standard output (stdout) into gzip - 'gzip > {seqz} '.format(**paths) + 'gzip > {seqz}'.format(**paths) # replace all instances of '{tumor}' with '{wildcards.tumor}' ).replace('{tumor}', '{wildcards.tumor}') # replace all instances of '{normal}' with '{wildcards.normal}' .replace('{normal}', '{wildcards.normal}') -#small_seqz + +# small_seqz step rule small_seqz: input: - # use the output of the previous steps as input for the current step + # use the output of the previous step(s) as input for the current step seqz = rules.pileup2seqz.output.seqz output: small_seqz = paths['small_seqz'], @@ -821,15 +824,210 @@ rule small_seqz: '2>{small_seqz_out} '.format(**paths) + # pipe the standard output (stdout) to the next software call ... '| ' + - # append again the Singularity container call (if needed) - #SINGULARITY_CMD + # pipe the standard output (stdout) into gzip - 'gzip > {small_seqz} '.format(**paths) + 'gzip > {small_seqz}'.format(**paths) # replace all instances of '{tumor}' with '{wildcards.tumor}' ).replace('{tumor}', '{wildcards.tumor}') # replace all instances of '{normal}' with '{wildcards.normal}' .replace('{normal}', '{wildcards.normal}') + +# create the *.g.vcf file list based on the IDs +rule create_gvcf_list: + input: + # use the output of all the haplotype caller steps + hc_vcf = expand(paths['hc_vcf'], id = IDs) + output: + # use the MD5-hash of the sorted list of IDs as name for the file, to make it unique. This + # way, every time a new ID is added, this list gets re-calculated + vcf_list = paths['vcf_list'] + shell: + # wrap the whole command in one string + ( + # append the Singularity container call (if needed) + SINGULARITY_CMD + + # get the list of all VCF files names + 'find {hc_vcf_root} -name "*{hc_vcf_pattern}" '.format(**rule_paths) + + # filter for .g.vcf files and dump all the file names in the list file + '> ' + paths['vcf_list'] + # replace all instances of '{md5}' with '{wildcards.md5}' + ).replace('{md5}', '{wildcards.md5}') + + +# genotype gvcfs step +rule genotype_gvcf: + input: + # use the output of the previous step(s) as input for the current step + vcf_list = rules.create_gvcf_list.output.vcf_list + output: + hc_geno_vcf = paths['hc_geno_vcf'], + log: + hc_geno_vcf_out = paths['hc_geno_vcf_out'] + shell: + # wrap the whole command in one string + ( + # append the Singularity container call (if needed) + SINGULARITY_CMD + + # create the command for GATK with the amount of memory specified in the config + 'java -Xmx{memory}G '.format(**soft['gatk']) + + # specify the GATK JAR path, the GATK command, the reference genome and some more references + '-jar {gatk} -T GenotypeGVCFs -R {ref_genome} --dbsnp {dbsnp} '.format(**rule_paths) + + # specify the inputs and outputs + '-V {vcf_list} -o {hc_geno_vcf} '.format(**rule_paths) + + # add some parameters from the configuration that are specific for this step + '-nt {num_threads} '.format(**soft['hc_genotyping']) + + # dump all standard and error output to the log file + '&> {hc_geno_vcf_out}'.format(**paths) + # replace all instances of '{md5}' with '{wildcards.md5}' + ).replace('{md5}', '{wildcards.md5}') + + +# SNV variant recalibrator step +rule var_snv_recal: + input: + # use the output of the previous step(s) as input for the current step + hc_geno_vcf = rules.genotype_gvcf.output.hc_geno_vcf + output: + var_snv_recal = paths['var_snv_recal'], + var_snv_recal_tranches = paths['var_snv_recal_tranches'] + log: + var_snv_recal_out = paths['var_snv_recal_out'] + shell: + # wrap the whole command in one string + ( + # append the Singularity container call (if needed) + SINGULARITY_CMD + + # create the command for GATK with the amount of memory specified in the config + 'java -Xmx{memory}G '.format(**soft['gatk']) + + # specify the GATK JAR path, the GATK command and the reference genome + '-jar {gatk} -T VariantRecalibrator -R {ref_genome} '.format(**rule_paths) + + # specify the input + '-input {hc_geno_vcf} '.format(**rule_paths) + + # specify the outputs + '-recalFile {var_snv_recal} -tranchesFile {var_snv_recal_tranches} '.format(**rule_paths) + + # specify all resources / references + '-resource:{hapmap} '.format(**soft['var_snv_recal']) + # parameter string + '{hapmap} '.format(**rule_paths) + # path of the reference file + '-resource:{omni} '.format(**soft['var_snv_recal']) + # parameter string + '{omni} '.format(**rule_paths) + # path of the reference file + '-resource:{1000G} '.format(**soft['var_snv_recal']) + # parameter string + '{1000G} '.format(**rule_paths) + # path of the reference file + '-resource:{dbsnp} '.format(**soft['var_snv_recal']) + # parameter string + '{dbsnp} '.format(**rule_paths) + # path of the reference file + # add some parameters from the configuration that are specific for this step + '-nt {num_threads} -mode {mode} '.format(**soft['var_snv_recal']) + + # transform the list of annotations in a '-an XXXX -an XXXX -an XXXX' list + '-an {} '.format(' -an '.join(soft['var_snv_recal']['an_list'])) + + # dump all standard and error output to the log file + '&> {var_snv_recal_out}'.format(**paths) + # replace all instances of '{md5}' with '{wildcards.md5}' + ).replace('{md5}', '{wildcards.md5}') + + +# apply SNV variant recalibration step +rule apply_snv_recal: + input: + # use the output of the previous step(s) as input for the current step + var_snv_recal = rules.var_snv_recal.output.var_snv_recal, + var_snv_recal_tranches = rules.var_snv_recal.output.var_snv_recal_tranches, + hc_geno_vcf = rules.genotype_gvcf.output.hc_geno_vcf + output: + recal_snv_vcf = paths['recal_snv_vcf'] + log: + recal_snv_vcf_out = paths['recal_snv_vcf_out'] + shell: + # wrap the whole command in one string + ( + # append the Singularity container call (if needed) + SINGULARITY_CMD + + # create the command for GATK with the amount of memory specified in the config + 'java -Xmx{memory}G '.format(**soft['gatk']) + + # specify the GATK JAR path, the GATK command and the reference genome + '-jar {gatk} -T ApplyRecalibration -R {ref_genome} '.format(**rule_paths) + + # specify the inputs + '-input {hc_geno_vcf} -tranchesFile {var_snv_recal_tranches} '.format(**rule_paths) + + '-recalFile {var_snv_recal} '.format(**rule_paths) + + # specify the output + '-o {recal_snv_vcf} '.format(**rule_paths) + + # add some parameters from the configuration that are specific for this step + '--ts_filter_level {ts_filter_level} -mode {mode} '.format(**soft['apply_snv_recal']) + + # dump all standard and error output to the log file + '&> {recal_snv_vcf_out}'.format(**paths) + # replace all instances of '{md5}' with '{wildcards.md5}' + ).replace('{md5}', '{wildcards.md5}') + + +# variant indel recalibrator step +rule var_indel_recal: + input: + # use the output of the previous step(s) as input for the current step + recal_snv_vcf = rules.apply_snv_recal.output.recal_snv_vcf + output: + var_indel_recal = paths['var_indel_recal'], + var_indel_recal_tranches = paths['var_indel_recal_tranches'] + log: + var_indel_recal_out = paths['var_indel_recal_out'] + shell: + # wrap the whole command in one string + ( + # append the Singularity container call (if needed) + SINGULARITY_CMD + + # create the command for GATK with the amount of memory specified in the config + 'java -Xmx{memory}G '.format(**soft['gatk']) + + # specify the GATK JAR path, the GATK command and the reference genome + '-jar {gatk} -T VariantRecalibrator -R {ref_genome} '.format(**rule_paths) + + # specify the input + '-input {recal_snv_vcf} '.format(**rule_paths) + + # specify the outputs + '-recalFile {var_indel_recal} -tranchesFile {var_indel_recal_tranches} '.format(**rule_paths) + + # specify all resources / references + '-resource:{mills} '.format(**soft['var_indel_recal']) + # parameter string + '{mills} '.format(**rule_paths) + # path of the reference file + '-resource:{dbsnp} '.format(**soft['var_indel_recal']) + # parameter string + '{dbsnp} '.format(**rule_paths) + # path of the reference file + # add some parameters from the configuration that are specific for this step + '--maxGaussians {maxGaussians} -nt {num_threads} -mode {mode} '.format(**soft['var_indel_recal']) + + # transform the list of annotations in a '-an XXXX -an XXXX -an XXXX' list + '-an {} '.format(' -an '.join(soft['var_indel_recal']['an_list'])) + + # dump all standard and error output to the log file + '&> {var_indel_recal_out}'.format(**paths) + # replace all instances of '{md5}' with '{wildcards.md5}' + ).replace('{md5}', '{wildcards.md5}') + + +# apply indel variant recalibration step +rule apply_indel_recal: + input: + # use the output of the previous step(s) as input for the current step + var_indel_recal = rules.var_indel_recal.output.var_indel_recal, + var_indel_recal_tranches = rules.var_indel_recal.output.var_indel_recal_tranches, + hc_geno_vcf = rules.genotype_gvcf.output.hc_geno_vcf + output: + recal_indel_vcf = paths['recal_indel_vcf'] + log: + recal_indel_vcf_out = paths['recal_indel_vcf_out'] + shell: + # wrap the whole command in one string + ( + # append the Singularity container call (if needed) + SINGULARITY_CMD + + # create the command for GATK with the amount of memory specified in the config + 'java -Xmx{memory}G '.format(**soft['gatk']) + + # specify the GATK JAR path, the GATK command and the reference genome + '-jar {gatk} -T ApplyRecalibration -R {ref_genome} '.format(**rule_paths) + + # specify the inputs + '-input {hc_geno_vcf} -tranchesFile {var_indel_recal_tranches} '.format(**rule_paths) + + '-recalFile {var_indel_recal} '.format(**rule_paths) + + # specify the output + '-o {recal_indel_vcf} '.format(**rule_paths) + + # add some parameters from the configuration that are specific for this step + '--ts_filter_level {ts_filter_level} -mode {mode} '.format(**soft['apply_indel_recal']) + + # dump all standard and error output to the log file + '&> {recal_indel_vcf_out}'.format(**paths) + # replace all instances of '{md5}' with '{wildcards.md5}' + ).replace('{md5}', '{wildcards.md5}') + + # piece of code to be executed after the pipeline ends *without* error onsuccess: logging.info('Pipeline completed successfully.') diff --git a/MelArray/cluster.json b/MelArray/cluster.json index 7c16d0004346d7ba8bc09735357e1cc084120594..60aab93aa389764bbce3d113fca4f2c553b66595 100644 --- a/MelArray/cluster.json +++ b/MelArray/cluster.json @@ -25,6 +25,30 @@ "name": "{rule}", "n_cores": "1" }, + "create_g_vcf_list": { + "name": "{rule}", + "n_cores": "1" + }, + "genotype_gvcf": { + "name": "{rule}", + "n_cores": "1" + }, + "var_snv_recal": { + "name": "{rule}", + "n_cores": "1" + }, + "apply_snv_recal": { + "name": "{rule}", + "n_cores": "1" + }, + "var_indel_recal": { + "name": "{rule}", + "n_cores": "1" + }, + "apply_indel_recal": { + "name": "{rule}", + "n_cores": "1" + }, "MuTect2": { "name": "{rule}_{wildcards.tumor}_{wildcards.normal}", diff --git a/MelArray/config_derm_server.yaml b/MelArray/config_derm_server.yaml index 925934725d706ac425ee239d89bff6037c4d4fd7..fb5e87919d32deafb3ffa5aa1a3abd655160823b 100644 --- a/MelArray/config_derm_server.yaml +++ b/MelArray/config_derm_server.yaml @@ -148,7 +148,7 @@ software: rename: # copy input files for renaming # rename_cmd: "cp" - # movie input files for renaming + # move input files for renaming rename_cmd: "mv" # parameters for picard diff --git a/MelArray/config_local.yaml b/MelArray/config_local.yaml index 1f812f3ff3384ee4c87c17f0c4b85bdb12e72bae..0a683bfaa931a61f135ceed882180e23095efe54 100644 --- a/MelArray/config_local.yaml +++ b/MelArray/config_local.yaml @@ -8,15 +8,12 @@ # @date: 2017 -# TODO: make it flexible for single end and paired-end -# TODO: implement the different run modes - # general run configuration main: # defines how the pipeline should be run: - # 'host': the pipeline runs on the host, whithout using any containers. This requires that the - # software dependencies are installed on host. + # 'host': the pipeline runs on the host, without using any containers. This requires that the + # software dependencies are installed on the host. # 'run_outside_single': the pipeline runs on the host, but using a single container that # contains all the software dependencies. A container path has to be specified. # 'run_inside_single': the pipeline runs inside a single container, using the software dependencies @@ -25,7 +22,6 @@ main: # one per software dependency. A container path has to be specified for each dependency. # 'run_inside_multi': the pipeline runs in a "main" container, but using several "sub-containers", typically # one per software dependency. A container path has to be specified for each dependency. -# run_mode: "run_inside_single" run_mode: "run_outside_single" @@ -33,7 +29,6 @@ main: path: # root path for the pipeline / project -# project_root: "/vagrant/MelArray" project_root: "/pipeline" # path to the Singularity container @@ -90,54 +85,86 @@ path: aln_qc_metrics: "__DATA_ROOT__/QC_reports/{id}_aln_metrics.txt" aln_qc_out: "__DATA_ROOT__/QC_reports/{id}_aln_metrics.out" # file path for the "bed to interval list" step's log - bed_to_interval_list_out: "__DATA_ROOT__/7_QC_reports/bed_to_interval_list.out" + bed_to_interval_list_out: "__DATA_ROOT__/bed_to_interval_list.out" # folder path for the MultiQC quality control metric multi_qc: "__DATA_ROOT__/multiqc" multi_qc_out: "__DATA_ROOT__/multiqc/multiqc.out" - # file paths for the haplotype caller files & log - hc_vcf: "__DATA_ROOT__/hc/{id}_raw.g.vcf" - hc_vcf_out: "__DATA_ROOT__/hc/{id}.out" + # file paths for the haplotype caller files & log, including file patterns and root folders + hc_vcf_root: "__DATA_ROOT__/hc" + hc_vcf_pattern: "_raw.g.vcf" + hc_vcf: "__HC_VCF_ROOT__/{id}_raw.g.vcf" # this has to use the pattern specified in hc_vcf_pattern + hc_vcf_out: "__HC_VCF_ROOT__/{id}.out" # file paths for the MuTect2 files & log MuTect2_vcf: "__DATA_ROOT__/MuTect2/{tumor}_{normal}/{tumor}_mutect2.vcf" MuTect2_out: "__DATA_ROOT__/MuTect2/{tumor}_{normal}/{tumor}.out" + # file path for pileup + pileup: "__DATA_ROOT__/pileup/{id}.pileup.gz" + pileup_out: "__DATA_ROOT__/pileup/{id}.out" + # file path for 50bp log + 50bp_out: "__DATA_ROOT__/50bp_GC_bin.out" + # file paths for the seqz files & log + seqz: "__DATA_ROOT__/sequenza/{tumor}_{normal}.seqz.gz" + seqz_out: "__DATA_ROOT__/sequenza/{tumor}_{normal}.out" + # file paths for the small seqz files & log + small_seqz: "__DATA_ROOT__/sequenza_small/{tumor}_{normal}/small_out.seqz.gz" + small_seqz_out: "__DATA_ROOT__/sequenza_small/{tumor}_{normal}/{tumor}_{normal}_small.out" + # file paths for vcf list + vcf_list: "__DATA_ROOT__/hc_genotype/vcf_{md5}.list" + # file paths for the haplotype genotyping gvcf files & log + hc_geno_vcf: "__DATA_ROOT__/hc_genotype/output_{md5}.vcf" + hc_geno_vcf_out: "__DATA_ROOT__/hc_genotype/hc_genotype_{md5}.out" + # file paths for the snv variant recalibration files & log + var_snv_recal: "__DATA_ROOT__/hc_genotype/melarray_snv_{md5}.recal" + var_snv_recal_tranches: "__DATA_ROOT__/hc_genotype/melarray_snv_{md5}.tranches" + var_snv_recal_out: "__DATA_ROOT__/hc_genotype/var_recal_snv_{md5}.out" + # file paths for the snv variant recalibration apply step's files & log + recal_snv_vcf: "__DATA_ROOT__/hc_genotype/melarray_snv_recal_{md5}.vcf" + recal_snv_vcf_out: "__DATA_ROOT__/hc_genotype/apply_snv_recal_{md5}.out" + # file paths for the indel variant recalibration files & log + var_indel_recal: "__DATA_ROOT__/hc_genotype/melarray_indel_{md5}.recal" + var_indel_recal_tranches: "__DATA_ROOT__/hc_genotype/melarray_indel_{md5}.tranches" + var_indel_recal_out: "__DATA_ROOT__/hc_genotype/var_recal_indel_{md5}.out" + # file paths for the indel variant recalibration apply step's files & log + recal_indel_vcf: "__DATA_ROOT__/hc_genotype/melarray_snv_indel_recal_{md5}.vcf" + recal_indel_vcf_out: "__DATA_ROOT__/hc_genotype/apply_indel_recal_{md5}.out" # root path for the reference files ref_root: "/ref" -# ref_root: "__PROJECT_ROOT__/ref/hg19" + # path where the reference fasta genome is stored -# ref_genome: "__REF_ROOT__/ucsc.hg19.fasta" - ref_genome: "__REF_ROOT__/ucsc.hg19.chr1.fa" + ref_genome: "__REF_ROOT__/ucsc.hg19.chr1.fasta" # path where the reference genome's dictionary is stored -# ref_dict: "__REF_ROOT__/ucsc.hg19.dict" ref_dict: "__REF_ROOT__/ucsc.hg19.chr1.dict" # path where the dbsnip reference file is stored - dbsnip: "__REF_ROOT__/dbsnp_138.hg19.vcf.gz" + dbsnp: "__REF_ROOT__/dbsnp_138.hg19.vcf.gz" # path where the Mills reference file is stored - Mills: "__REF_ROOT__/Mills_and_1000G_gold_standard.indels.hg19.sites.vcf.gz" + mills: "__REF_ROOT__/Mills_and_1000G_gold_standard.indels.hg19.sites.vcf.gz" # path where the 1000 genomes reference file is stored 1000G: "__REF_ROOT__/1000G_phase1.snps.high_confidence.hg19.sites.vcf.gz" # path where the 1000 genomes reference file is stored cosmic: "__REF_ROOT__/CosmicCodingMuts_chr_M_sorted.vcf.gz" + # path where the 50bp GC binning reference for sequenza is stored + 50bp: "__REF_ROOT__/ucsc.hg19.gc50Base.txt.gz" + # path where the omni reference is stored + omni: "__REF_ROOT__/1000G_omni2.5.hg19.sites.vcf.gz" + # path where the hapmap reference is stored + hapmap: "__REF_ROOT__/hapmap_3.3.hg19.sites.vcf.gz" + # path where the "bed" reference file is stored -# bed: "__REF_ROOT__/151127_out.bed" bed: "__REF_ROOT__/151127_out_chr1.bed" # path where the "interval list" reference file is stored -# interval_list: "__REF_ROOT__/melarray.hg19.interval_list" interval_list: "__REF_ROOT__/melarray.chr1.interval_list" # path where the tumor <-> normal samples matching file is stored tumor_normal_match: "__REF_ROOT__/Melarray_mutect2_tumor_normal.txt" - # root path for the software & tools -# soft_root: "__PROJECT_ROOT__/software" soft_root: "/usr/local/bin" # path where the picard tools JAR file is located -# picard: "/data/Phil/software/picard-tools-2.4.1/picard.jar" picard: "__SOFT_ROOT__/picard.jar" # path where the GATK JAR file is located -# gatk: "/data/Phil/software/GATK_3.6/GenomeAnalysisTK.jar" gatk: "__SOFT_ROOT__/GenomeAnalysisTK.jar" - + # path to sequenza binning python script + seqz_utils: "/usr/lib64/R/library/sequenza/exec/sequenza-utils.py" # specific run-time parameters for each software / tool software: @@ -146,18 +173,18 @@ software: rename: # copy input files for renaming rename_cmd: "cp" - # movie input files for renaming + # move input files for renaming # rename_cmd: "mv" # parameters for picard picard: # amount of RAM for Java (-Xmx..G) - memory: "14" + memory: "16" # parameters for GATK gatk: # amount of RAM for Java (-Xmx..G) - memory: "14" + memory: "16" # parameters for skewer skewer: @@ -171,7 +198,7 @@ software: # parameters for bwa bwa: # number of threads (-t) - n_threads: "2" + n_threads: "1" # header read group columns ID: "Melanoma" LB: "Melarray" @@ -197,3 +224,78 @@ software: # parameter to pass to the VCF/BCF IndexCreator variant_index_parameter: "128000" + # parameters for samtools mpileup + samtools: + # quality filter + qual: "20" + + # parameters for seqz utils window binning + seqz_utils: + # window binning + gc_window: "50" + b_window: "50" + + # parameters for the haplotype caller genotyping + hc_genotyping: + # number of threads + num_threads: "1" + + # parameters for the snv variant recal + var_snv_recal: + # specify the parameters string for the hapmap resource + hapmap: "hapmap,known=false,training=true,truth=true,prior=15.0" + # specify the parameters string for the omni resource + omni: "omni,known=false,training=true,truth=true,prior=12.0" + # specify the parameters string for the 1000G resource + 1000G: "1000G,known=false,training=true,truth=false,prior=10.0" + # specify the parameters string for the dbsnp resource + dbsnp: "dbsnp,known=true,training=false,truth=false,prior=2.0" + # specify the recalibration mode to employ (SNP|INDEL|BOTH) + mode: "SNP" + # number of threads + num_threads: "1" + # list of the names of the annotations which should be used for calculations + an_list: + - "QD" + - "MQ" + - "MQRankSum" + - "ReadPosRankSum" + - "FS" + - "SOR" + - "DP" + + # parameters for the snv variant recal apply step + apply_snv_recal: + # specify the recalibration mode to employ (SNP|INDEL|BOTH) + mode: "SNP" + # specify the truth sensitivity level at which to start filtering + ts_filter_level: "99.5" + + # parameters for the indel variant recal + var_indel_recal: + # specify the parameters string for the hapmap resource + mills: "mills,known=false,training=true,truth=true,prior=12.0" + # specify the parameters string for the dbsnp resource + dbsnp: "dbsnp,known=true,training=false,truth=false,prior=2.0" + # specify the maximum gaussian parameter + maxGaussians: "4" + # specify the recalibration mode to employ (SNP|INDEL|BOTH) + mode: "INDEL" + # number of threads + num_threads: "1" + # list of the names of the annotations which should be used for calculations + an_list: + - "QD" + - "MQ" + - "MQRankSum" + - "ReadPosRankSum" + - "FS" + - "SOR" + - "DP" + + # parameters for the indel variant recal apply step + apply_indel_recal: + # specify the recalibration mode to employ (SNP|INDEL|BOTH) + mode: "INDEL" + # specify the truth sensitivity level at which to start filtering + ts_filter_level: "90.0" \ No newline at end of file diff --git a/MelArray/other/create_dag.sh b/MelArray/other/create_dag.sh index 231577d854760f4e91794d3b22a4c0cbe68a6ddf..a355dabae616e996d4564f91ecc2e3c76c4108fd 100755 --- a/MelArray/other/create_dag.sh +++ b/MelArray/other/create_dag.sh @@ -1,5 +1,5 @@ #!/usr/bin/env bash -v=06 +v=08 snakemake -s MelArray.snake --rulegraph > other/dag/rule_graph_v$v.dag snakemake -s MelArray.snake --dag > other/dag/dag_v$v.dag diff --git a/MelArray/other/create_dag_with_container.sh b/MelArray/other/create_dag_with_container.sh new file mode 100755 index 0000000000000000000000000000000000000000..b7e67a3f2cbd0d3f092bf7205800aa03c0e95c43 --- /dev/null +++ b/MelArray/other/create_dag_with_container.sh @@ -0,0 +1,12 @@ +#!/usr/bin/env bash +v=08 +snakemake -s MelArray.snake --rulegraph > other/dag/rule_graph_v$v.dag +snakemake -s MelArray.snake --dag > other/dag/dag_v$v.dag + +cat other/dag/dag_v$v.dag | singularity exec melarray.img dot -Tpng > other/dag_png/dag_v${v}_vert.png +cat other/dag/rule_graph_v$v.dag | singularity exec melarray.img dot -Tpng > other/rule_graph_png/rule_graph_v${v}_vert.png + +cat other/dag/dag_v$v.dag | sed 's/graph\[/graph\[rankdir=LR,/g' \ + | singularity exec melarray.img dot -Tpng > other/dag_png/dag_v${v}_hori.png +cat other/dag/rule_graph_v$v.dag | sed 's/graph\[/graph\[rankdir=LR,/g' \ + | singularity exec melarray.img dot -Tpng > other/rule_graph_png/rule_graph_v${v}_hori.png diff --git a/MelArray/other/dag/dag_v07.dag b/MelArray/other/dag/dag_v07.dag new file mode 100644 index 0000000000000000000000000000000000000000..cc24dc016913bc752374f62d610c417e70753a40 --- /dev/null +++ b/MelArray/other/dag/dag_v07.dag @@ -0,0 +1,181 @@ +digraph snakemake_dag { + graph[bgcolor=white, margin=0]; + node[shape=box, style=rounded, fontname=sans, fontsize=10, penwidth=2]; + edge[penwidth=2, color=grey]; + 0[label = "all", color = "0.03 0.6 0.85", style="rounded"]; + 1[label = "analyze_covariates", color = "0.61 0.6 0.85", style="rounded,dashed"]; + 2[label = "MuTect2\nnormal: PBMCpatient1\ntumor: MTBPatient1", color = "0.42 0.6 0.85", style="rounded,dashed"]; + 3[label = "analyze_covariates", color = "0.61 0.6 0.85", style="rounded,dashed"]; + 4[label = "apply_recal", color = "0.14 0.6 0.85", style="rounded"]; + 5[label = "analyze_covariates", color = "0.61 0.6 0.85", style="rounded,dashed"]; + 6[label = "MuTect2\nnormal: PBMCpatient2\ntumor: MTBPatient2", color = "0.42 0.6 0.85", style="rounded,dashed"]; + 7[label = "small_seqz", color = "0.19 0.6 0.85", style="rounded,dashed"]; + 8[label = "analyze_covariates", color = "0.61 0.6 0.85", style="rounded,dashed"]; + 9[label = "small_seqz", color = "0.19 0.6 0.85", style="rounded,dashed"]; + 10[label = "multi_qc", color = "0.47 0.6 0.85", style="rounded,dashed"]; + 11[label = "base_recal", color = "0.00 0.6 0.85", style="rounded,dashed"]; + 12[label = "post_recal", color = "0.33 0.6 0.85", style="rounded,dashed"]; + 13[label = "print_reads", color = "0.39 0.6 0.85", style="rounded,dashed"]; + 14[label = "print_reads", color = "0.39 0.6 0.85", style="rounded,dashed"]; + 15[label = "post_recal", color = "0.33 0.6 0.85", style="rounded,dashed"]; + 16[label = "base_recal", color = "0.00 0.6 0.85", style="rounded,dashed"]; + 17[label = "genotype_gvcf", color = "0.50 0.6 0.85", style="rounded,dashed"]; + 18[label = "var_recal", color = "0.17 0.6 0.85", style="rounded,dashed"]; + 19[label = "base_recal", color = "0.00 0.6 0.85", style="rounded,dashed"]; + 20[label = "post_recal", color = "0.33 0.6 0.85", style="rounded,dashed"]; + 21[label = "print_reads", color = "0.39 0.6 0.85", style="rounded,dashed"]; + 22[label = "print_reads", color = "0.39 0.6 0.85", style="rounded,dashed"]; + 23[label = "pileup2seqz\nnormal: PBMCpatient1\ntumor: MTBPatient1", color = "0.11 0.6 0.85", style="rounded,dashed"]; + 24[label = "base_recal", color = "0.00 0.6 0.85", style="rounded,dashed"]; + 25[label = "post_recal", color = "0.33 0.6 0.85", style="rounded,dashed"]; + 26[label = "pileup2seqz\nnormal: PBMCpatient2\ntumor: MTBPatient2", color = "0.11 0.6 0.85", style="rounded,dashed"]; + 27[label = "skewer", color = "0.53 0.6 0.85", style="rounded,dashed"]; + 28[label = "skewer", color = "0.53 0.6 0.85", style="rounded,dashed"]; + 29[label = "align_QC", color = "0.28 0.6 0.85", style="rounded,dashed"]; + 30[label = "hybrid_sel_QC", color = "0.56 0.6 0.85", style="rounded,dashed"]; + 31[label = "mark_duplicates", color = "0.22 0.6 0.85", style="rounded,dashed"]; + 32[label = "hybrid_sel_QC", color = "0.56 0.6 0.85", style="rounded,dashed"]; + 33[label = "align_QC", color = "0.28 0.6 0.85", style="rounded,dashed"]; + 34[label = "skewer", color = "0.53 0.6 0.85", style="rounded,dashed"]; + 35[label = "mark_duplicates", color = "0.22 0.6 0.85", style="rounded,dashed"]; + 36[label = "mark_duplicates", color = "0.22 0.6 0.85", style="rounded,dashed"]; + 37[label = "skewer", color = "0.53 0.6 0.85", style="rounded,dashed"]; + 38[label = "align_QC", color = "0.28 0.6 0.85", style="rounded,dashed"]; + 39[label = "align_QC", color = "0.28 0.6 0.85", style="rounded,dashed"]; + 40[label = "hybrid_sel_QC", color = "0.56 0.6 0.85", style="rounded,dashed"]; + 41[label = "hybrid_sel_QC", color = "0.56 0.6 0.85", style="rounded,dashed"]; + 42[label = "mark_duplicates", color = "0.22 0.6 0.85", style="rounded,dashed"]; + 43[label = "fix_mate", color = "0.44 0.6 0.85", style="rounded,dashed"]; + 44[label = "fix_mate", color = "0.44 0.6 0.85", style="rounded,dashed"]; + 45[label = "fix_mate", color = "0.44 0.6 0.85", style="rounded,dashed"]; + 46[label = "create_gvcf_list\nmd5: 6adf97f83acf6453d4a6a4b1070f3754", color = "0.08 0.6 0.85", style="rounded,dashed"]; + 47[label = "fix_mate", color = "0.44 0.6 0.85", style="rounded,dashed"]; + 48[label = "pileup", color = "0.25 0.6 0.85", style="rounded,dashed"]; + 49[label = "GCbinning", color = "0.31 0.6 0.85", style="rounded,dashed"]; + 50[label = "pileup", color = "0.25 0.6 0.85", style="rounded,dashed"]; + 51[label = "pileup", color = "0.25 0.6 0.85", style="rounded,dashed"]; + 52[label = "pileup", color = "0.25 0.6 0.85", style="rounded,dashed"]; + 53[label = "rename\nid: MTBPatient2", color = "0.58 0.6 0.85", style="rounded,dashed"]; + 54[label = "rename\nid: PBMCpatient2", color = "0.58 0.6 0.85", style="rounded,dashed"]; + 55[label = "bed_to_interval_list", color = "0.64 0.6 0.85", style="rounded,dashed"]; + 56[label = "bwa_bam", color = "0.06 0.6 0.85", style="rounded,dashed"]; + 57[label = "rename\nid: PBMCpatient1", color = "0.58 0.6 0.85", style="rounded,dashed"]; + 58[label = "bwa_bam", color = "0.06 0.6 0.85", style="rounded,dashed"]; + 59[label = "bwa_bam", color = "0.06 0.6 0.85", style="rounded,dashed"]; + 60[label = "rename\nid: MTBPatient1", color = "0.58 0.6 0.85", style="rounded,dashed"]; + 61[label = "bwa_bam", color = "0.06 0.6 0.85", style="rounded,dashed"]; + 62[label = "haplotype_caller", color = "0.36 0.6 0.85", style="rounded,dashed"]; + 63[label = "haplotype_caller", color = "0.36 0.6 0.85", style="rounded,dashed"]; + 64[label = "haplotype_caller", color = "0.36 0.6 0.85", style="rounded,dashed"]; + 65[label = "haplotype_caller", color = "0.36 0.6 0.85", style="rounded,dashed"]; + 1 -> 0 + 2 -> 0 + 3 -> 0 + 4 -> 0 + 5 -> 0 + 6 -> 0 + 7 -> 0 + 8 -> 0 + 9 -> 0 + 10 -> 0 + 11 -> 1 + 12 -> 1 + 13 -> 2 + 14 -> 2 + 15 -> 3 + 16 -> 3 + 17 -> 4 + 18 -> 4 + 19 -> 5 + 20 -> 5 + 21 -> 6 + 22 -> 6 + 23 -> 7 + 24 -> 8 + 25 -> 8 + 26 -> 9 + 27 -> 10 + 28 -> 10 + 29 -> 10 + 30 -> 10 + 31 -> 10 + 32 -> 10 + 33 -> 10 + 34 -> 10 + 35 -> 10 + 36 -> 10 + 37 -> 10 + 38 -> 10 + 39 -> 10 + 40 -> 10 + 41 -> 10 + 42 -> 10 + 43 -> 11 + 43 -> 12 + 11 -> 12 + 43 -> 13 + 11 -> 13 + 44 -> 14 + 24 -> 14 + 16 -> 15 + 45 -> 15 + 45 -> 16 + 46 -> 17 + 17 -> 18 + 47 -> 19 + 47 -> 20 + 19 -> 20 + 16 -> 21 + 45 -> 21 + 47 -> 22 + 19 -> 22 + 48 -> 23 + 49 -> 23 + 50 -> 23 + 44 -> 24 + 44 -> 25 + 24 -> 25 + 49 -> 26 + 51 -> 26 + 52 -> 26 + 53 -> 27 + 54 -> 28 + 21 -> 29 + 14 -> 30 + 55 -> 30 + 56 -> 31 + 22 -> 32 + 55 -> 32 + 14 -> 33 + 57 -> 34 + 58 -> 35 + 59 -> 36 + 60 -> 37 + 13 -> 38 + 22 -> 39 + 21 -> 40 + 55 -> 40 + 13 -> 41 + 55 -> 41 + 61 -> 42 + 42 -> 43 + 35 -> 44 + 36 -> 45 + 62 -> 46 + 63 -> 46 + 64 -> 46 + 65 -> 46 + 31 -> 47 + 13 -> 48 + 14 -> 50 + 22 -> 51 + 21 -> 52 + 28 -> 56 + 34 -> 58 + 27 -> 59 + 37 -> 61 + 22 -> 62 + 21 -> 63 + 13 -> 64 + 14 -> 65 +} diff --git a/MelArray/other/dag/dag_v08.dag b/MelArray/other/dag/dag_v08.dag new file mode 100644 index 0000000000000000000000000000000000000000..b51f0fe48f7a0a6b57a51acdaba336f9d69fe588 --- /dev/null +++ b/MelArray/other/dag/dag_v08.dag @@ -0,0 +1,186 @@ +digraph snakemake_dag { + graph[bgcolor=white, margin=0]; + node[shape=box, style=rounded, fontname=sans, fontsize=10, penwidth=2]; + edge[penwidth=2, color=grey]; + 0[label = "all", color = "0.62 0.6 0.85", style="rounded"]; + 1[label = "analyze_covariates", color = "0.18 0.6 0.85", style="rounded,dashed"]; + 2[label = "apply_indel_recal", color = "0.28 0.6 0.85", style="rounded"]; + 3[label = "multi_qc", color = "0.00 0.6 0.85", style="rounded,dashed"]; + 4[label = "analyze_covariates", color = "0.18 0.6 0.85", style="rounded,dashed"]; + 5[label = "analyze_covariates", color = "0.18 0.6 0.85", style="rounded,dashed"]; + 6[label = "analyze_covariates", color = "0.18 0.6 0.85", style="rounded,dashed"]; + 7[label = "small_seqz", color = "0.13 0.6 0.85", style="rounded,dashed"]; + 8[label = "small_seqz", color = "0.13 0.6 0.85", style="rounded,dashed"]; + 9[label = "MuTect2\nnormal: PBMCpatient2\ntumor: MTBPatient2", color = "0.56 0.6 0.85", style="rounded,dashed"]; + 10[label = "MuTect2\nnormal: PBMCpatient1\ntumor: MTBPatient1", color = "0.56 0.6 0.85", style="rounded,dashed"]; + 11[label = "post_recal", color = "0.51 0.6 0.85", style="rounded,dashed"]; + 12[label = "base_recal", color = "0.59 0.6 0.85", style="rounded,dashed"]; + 13[label = "genotype_gvcf", color = "0.49 0.6 0.85", style="rounded,dashed"]; + 14[label = "var_indel_recal", color = "0.05 0.6 0.85", style="rounded"]; + 15[label = "skewer", color = "0.41 0.6 0.85", style="rounded,dashed"]; + 16[label = "mark_duplicates", color = "0.38 0.6 0.85", style="rounded,dashed"]; + 17[label = "skewer", color = "0.41 0.6 0.85", style="rounded,dashed"]; + 18[label = "skewer", color = "0.41 0.6 0.85", style="rounded,dashed"]; + 19[label = "mark_duplicates", color = "0.38 0.6 0.85", style="rounded,dashed"]; + 20[label = "hybrid_sel_QC", color = "0.26 0.6 0.85", style="rounded,dashed"]; + 21[label = "align_QC", color = "0.54 0.6 0.85", style="rounded,dashed"]; + 22[label = "hybrid_sel_QC", color = "0.26 0.6 0.85", style="rounded,dashed"]; + 23[label = "hybrid_sel_QC", color = "0.26 0.6 0.85", style="rounded,dashed"]; + 24[label = "hybrid_sel_QC", color = "0.26 0.6 0.85", style="rounded,dashed"]; + 25[label = "mark_duplicates", color = "0.38 0.6 0.85", style="rounded,dashed"]; + 26[label = "align_QC", color = "0.54 0.6 0.85", style="rounded,dashed"]; + 27[label = "mark_duplicates", color = "0.38 0.6 0.85", style="rounded,dashed"]; + 28[label = "align_QC", color = "0.54 0.6 0.85", style="rounded,dashed"]; + 29[label = "skewer", color = "0.41 0.6 0.85", style="rounded,dashed"]; + 30[label = "align_QC", color = "0.54 0.6 0.85", style="rounded,dashed"]; + 31[label = "post_recal", color = "0.51 0.6 0.85", style="rounded,dashed"]; + 32[label = "base_recal", color = "0.59 0.6 0.85", style="rounded,dashed"]; + 33[label = "base_recal", color = "0.59 0.6 0.85", style="rounded,dashed"]; + 34[label = "post_recal", color = "0.51 0.6 0.85", style="rounded,dashed"]; + 35[label = "base_recal", color = "0.59 0.6 0.85", style="rounded,dashed"]; + 36[label = "post_recal", color = "0.51 0.6 0.85", style="rounded,dashed"]; + 37[label = "pileup2seqz\nnormal: PBMCpatient2\ntumor: MTBPatient2", color = "0.46 0.6 0.85", style="rounded,dashed"]; + 38[label = "pileup2seqz\nnormal: PBMCpatient1\ntumor: MTBPatient1", color = "0.46 0.6 0.85", style="rounded,dashed"]; + 39[label = "print_reads", color = "0.03 0.6 0.85", style="rounded,dashed"]; + 40[label = "print_reads", color = "0.03 0.6 0.85", style="rounded,dashed"]; + 41[label = "print_reads", color = "0.03 0.6 0.85", style="rounded,dashed"]; + 42[label = "print_reads", color = "0.03 0.6 0.85", style="rounded,dashed"]; + 43[label = "fix_mate", color = "0.64 0.6 0.85", style="rounded,dashed"]; + 44[label = "create_gvcf_list\nmd5: 6adf97f83acf6453d4a6a4b1070f3754", color = "0.21 0.6 0.85", style="rounded,dashed"]; + 45[label = "apply_snv_recal", color = "0.31 0.6 0.85", style="rounded"]; + 46[label = "rename\nid: MTBPatient2", color = "0.08 0.6 0.85", style="rounded,dashed"]; + 47[label = "bwa_bam", color = "0.10 0.6 0.85", style="rounded,dashed"]; + 48[label = "rename\nid: MTBPatient1", color = "0.08 0.6 0.85", style="rounded,dashed"]; + 49[label = "rename\nid: PBMCpatient1", color = "0.08 0.6 0.85", style="rounded,dashed"]; + 50[label = "bwa_bam", color = "0.10 0.6 0.85", style="rounded,dashed"]; + 51[label = "bed_to_interval_list", color = "0.44 0.6 0.85", style="rounded,dashed"]; + 52[label = "bwa_bam", color = "0.10 0.6 0.85", style="rounded,dashed"]; + 53[label = "bwa_bam", color = "0.10 0.6 0.85", style="rounded,dashed"]; + 54[label = "rename\nid: PBMCpatient2", color = "0.08 0.6 0.85", style="rounded,dashed"]; + 55[label = "fix_mate", color = "0.64 0.6 0.85", style="rounded,dashed"]; + 56[label = "fix_mate", color = "0.64 0.6 0.85", style="rounded,dashed"]; + 57[label = "fix_mate", color = "0.64 0.6 0.85", style="rounded,dashed"]; + 58[label = "pileup", color = "0.33 0.6 0.85", style="rounded,dashed"]; + 59[label = "pileup", color = "0.33 0.6 0.85", style="rounded,dashed"]; + 60[label = "GCbinning", color = "0.15 0.6 0.85", style="rounded,dashed"]; + 61[label = "pileup", color = "0.33 0.6 0.85", style="rounded,dashed"]; + 62[label = "pileup", color = "0.33 0.6 0.85", style="rounded,dashed"]; + 63[label = "haplotype_caller", color = "0.23 0.6 0.85", style="rounded,dashed"]; + 64[label = "haplotype_caller", color = "0.23 0.6 0.85", style="rounded,dashed"]; + 65[label = "haplotype_caller", color = "0.23 0.6 0.85", style="rounded,dashed"]; + 66[label = "haplotype_caller", color = "0.23 0.6 0.85", style="rounded,dashed"]; + 67[label = "var_snv_recal", color = "0.36 0.6 0.85", style="rounded,dashed"]; + 1 -> 0 + 2 -> 0 + 3 -> 0 + 4 -> 0 + 5 -> 0 + 6 -> 0 + 7 -> 0 + 8 -> 0 + 9 -> 0 + 10 -> 0 + 11 -> 1 + 12 -> 1 + 13 -> 2 + 14 -> 2 + 15 -> 3 + 16 -> 3 + 17 -> 3 + 18 -> 3 + 19 -> 3 + 20 -> 3 + 21 -> 3 + 22 -> 3 + 23 -> 3 + 24 -> 3 + 25 -> 3 + 26 -> 3 + 27 -> 3 + 28 -> 3 + 29 -> 3 + 30 -> 3 + 31 -> 4 + 32 -> 4 + 33 -> 5 + 34 -> 5 + 35 -> 6 + 36 -> 6 + 37 -> 7 + 38 -> 8 + 39 -> 9 + 40 -> 9 + 41 -> 10 + 42 -> 10 + 43 -> 11 + 12 -> 11 + 43 -> 12 + 44 -> 13 + 45 -> 14 + 46 -> 15 + 47 -> 16 + 48 -> 17 + 49 -> 18 + 50 -> 19 + 51 -> 20 + 42 -> 20 + 42 -> 21 + 39 -> 22 + 51 -> 22 + 51 -> 23 + 40 -> 23 + 51 -> 24 + 41 -> 24 + 52 -> 25 + 41 -> 26 + 53 -> 27 + 39 -> 28 + 54 -> 29 + 40 -> 30 + 55 -> 31 + 32 -> 31 + 55 -> 32 + 56 -> 33 + 33 -> 34 + 56 -> 34 + 57 -> 35 + 35 -> 36 + 57 -> 36 + 58 -> 37 + 59 -> 37 + 60 -> 37 + 60 -> 38 + 61 -> 38 + 62 -> 38 + 55 -> 39 + 32 -> 39 + 33 -> 40 + 56 -> 40 + 43 -> 41 + 12 -> 41 + 35 -> 42 + 57 -> 42 + 19 -> 43 + 63 -> 44 + 64 -> 44 + 65 -> 44 + 66 -> 44 + 13 -> 45 + 67 -> 45 + 15 -> 47 + 17 -> 50 + 29 -> 52 + 18 -> 53 + 25 -> 55 + 16 -> 56 + 27 -> 57 + 39 -> 58 + 40 -> 59 + 41 -> 61 + 42 -> 62 + 40 -> 63 + 42 -> 64 + 39 -> 65 + 41 -> 66 + 13 -> 67 +} diff --git a/MelArray/other/dag/rule_graph_v07.dag b/MelArray/other/dag/rule_graph_v07.dag new file mode 100644 index 0000000000000000000000000000000000000000..78115a12f3afe0ad3effb059fec3b1bf94984396 --- /dev/null +++ b/MelArray/other/dag/rule_graph_v07.dag @@ -0,0 +1,63 @@ +digraph snakemake_dag { + graph[bgcolor=white, margin=0]; + node[shape=box, style=rounded, fontname=sans, fontsize=10, penwidth=2]; + edge[penwidth=2, color=grey]; + 0[label = "all", color = "0.53 0.6 0.85", style="rounded"]; + 1[label = "MuTect2", color = "0.39 0.6 0.85", style="rounded"]; + 2[label = "analyze_covariates", color = "0.19 0.6 0.85", style="rounded"]; + 3[label = "apply_recal", color = "0.28 0.6 0.85", style="rounded"]; + 4[label = "multi_qc", color = "0.44 0.6 0.85", style="rounded"]; + 5[label = "small_seqz", color = "0.17 0.6 0.85", style="rounded"]; + 6[label = "print_reads", color = "0.06 0.6 0.85", style="rounded"]; + 7[label = "base_recal", color = "0.42 0.6 0.85", style="rounded"]; + 8[label = "post_recal", color = "0.47 0.6 0.85", style="rounded"]; + 9[label = "var_recal", color = "0.50 0.6 0.85", style="rounded"]; + 10[label = "genotype_gvcf", color = "0.64 0.6 0.85", style="rounded"]; + 11[label = "mark_duplicates", color = "0.31 0.6 0.85", style="rounded"]; + 12[label = "skewer", color = "0.33 0.6 0.85", style="rounded"]; + 13[label = "hybrid_sel_QC", color = "0.61 0.6 0.85", style="rounded"]; + 14[label = "align_QC", color = "0.22 0.6 0.85", style="rounded"]; + 15[label = "pileup2seqz", color = "0.11 0.6 0.85", style="rounded"]; + 16[label = "fix_mate", color = "0.25 0.6 0.85", style="rounded"]; + 17[label = "create_gvcf_list", color = "0.14 0.6 0.85", style="rounded"]; + 18[label = "bwa_bam", color = "0.56 0.6 0.85", style="rounded"]; + 19[label = "rename", color = "0.00 0.6 0.85", style="rounded"]; + 20[label = "bed_to_interval_list", color = "0.58 0.6 0.85", style="rounded"]; + 21[label = "GCbinning", color = "0.08 0.6 0.85", style="rounded"]; + 22[label = "pileup", color = "0.03 0.6 0.85", style="rounded"]; + 23[label = "haplotype_caller", color = "0.36 0.6 0.85", style="rounded"]; + 3 -> 0 + 1 -> 0 + 4 -> 0 + 5 -> 0 + 2 -> 0 + 6 -> 1 + 8 -> 2 + 7 -> 2 + 9 -> 3 + 10 -> 3 + 12 -> 4 + 13 -> 4 + 11 -> 4 + 14 -> 4 + 15 -> 5 + 16 -> 6 + 7 -> 6 + 16 -> 7 + 16 -> 8 + 7 -> 8 + 10 -> 9 + 17 -> 10 + 18 -> 11 + 19 -> 12 + 20 -> 13 + 6 -> 13 + 6 -> 14 + 22 -> 15 + 21 -> 15 + 11 -> 16 + 23 -> 17 + 12 -> 18 + 6 -> 22 + 6 -> 23 +} diff --git a/MelArray/other/dag/rule_graph_v08.dag b/MelArray/other/dag/rule_graph_v08.dag new file mode 100644 index 0000000000000000000000000000000000000000..c7420b18de59e01ab7c188ea5a4ac7d755463571 --- /dev/null +++ b/MelArray/other/dag/rule_graph_v08.dag @@ -0,0 +1,68 @@ +digraph snakemake_dag { + graph[bgcolor=white, margin=0]; + node[shape=box, style=rounded, fontname=sans, fontsize=10, penwidth=2]; + edge[penwidth=2, color=grey]; + 0[label = "all", color = "0.56 0.6 0.85", style="rounded"]; + 1[label = "analyze_covariates", color = "0.46 0.6 0.85", style="rounded"]; + 2[label = "MuTect2", color = "0.59 0.6 0.85", style="rounded"]; + 3[label = "small_seqz", color = "0.54 0.6 0.85", style="rounded"]; + 4[label = "multi_qc", color = "0.15 0.6 0.85", style="rounded"]; + 5[label = "apply_indel_recal", color = "0.36 0.6 0.85", style="rounded"]; + 6[label = "base_recal", color = "0.23 0.6 0.85", style="rounded"]; + 7[label = "post_recal", color = "0.26 0.6 0.85", style="rounded"]; + 8[label = "print_reads", color = "0.18 0.6 0.85", style="rounded"]; + 9[label = "pileup2seqz", color = "0.62 0.6 0.85", style="rounded"]; + 10[label = "mark_duplicates", color = "0.33 0.6 0.85", style="rounded"]; + 11[label = "align_QC", color = "0.28 0.6 0.85", style="rounded"]; + 12[label = "skewer", color = "0.05 0.6 0.85", style="rounded"]; + 13[label = "hybrid_sel_QC", color = "0.41 0.6 0.85", style="rounded"]; + 14[label = "var_indel_recal", color = "0.44 0.6 0.85", style="rounded"]; + 15[label = "genotype_gvcf", color = "0.31 0.6 0.85", style="rounded"]; + 16[label = "fix_mate", color = "0.10 0.6 0.85", style="rounded"]; + 17[label = "pileup", color = "0.08 0.6 0.85", style="rounded"]; + 18[label = "GCbinning", color = "0.00 0.6 0.85", style="rounded"]; + 19[label = "bwa_bam", color = "0.51 0.6 0.85", style="rounded"]; + 20[label = "rename", color = "0.21 0.6 0.85", style="rounded"]; + 21[label = "bed_to_interval_list", color = "0.64 0.6 0.85", style="rounded"]; + 22[label = "apply_snv_recal", color = "0.49 0.6 0.85", style="rounded"]; + 23[label = "create_gvcf_list", color = "0.03 0.6 0.85", style="rounded"]; + 24[label = "var_snv_recal", color = "0.13 0.6 0.85", style="rounded"]; + 25[label = "haplotype_caller", color = "0.38 0.6 0.85", style="rounded"]; + 3 -> 0 + 5 -> 0 + 2 -> 0 + 4 -> 0 + 1 -> 0 + 6 -> 1 + 7 -> 1 + 8 -> 2 + 9 -> 3 + 13 -> 4 + 10 -> 4 + 11 -> 4 + 12 -> 4 + 14 -> 5 + 15 -> 5 + 16 -> 6 + 16 -> 7 + 6 -> 7 + 16 -> 8 + 6 -> 8 + 18 -> 9 + 17 -> 9 + 19 -> 10 + 8 -> 11 + 20 -> 12 + 8 -> 13 + 21 -> 13 + 22 -> 14 + 23 -> 15 + 10 -> 16 + 8 -> 17 + 12 -> 19 + 24 -> 22 + 15 -> 22 + 25 -> 23 + 15 -> 24 + 8 -> 25 +} diff --git a/MelArray/other/dag_png/dag_v07_hori.png b/MelArray/other/dag_png/dag_v07_hori.png new file mode 100644 index 0000000000000000000000000000000000000000..31e7770122c6708d809b78226703a9d1e69b788e Binary files /dev/null and b/MelArray/other/dag_png/dag_v07_hori.png differ diff --git a/MelArray/other/dag_png/dag_v07_vert.png b/MelArray/other/dag_png/dag_v07_vert.png new file mode 100644 index 0000000000000000000000000000000000000000..f800c6f4a44342fb50a160af4fd369f2533064cf Binary files /dev/null and b/MelArray/other/dag_png/dag_v07_vert.png differ diff --git a/MelArray/other/dag_png/dag_v08_hori.png b/MelArray/other/dag_png/dag_v08_hori.png new file mode 100644 index 0000000000000000000000000000000000000000..6ddff65a8ee022f67c1c4a223927ef677665c177 Binary files /dev/null and b/MelArray/other/dag_png/dag_v08_hori.png differ diff --git a/MelArray/other/dag_png/dag_v08_vert.png b/MelArray/other/dag_png/dag_v08_vert.png new file mode 100644 index 0000000000000000000000000000000000000000..cd566aabbeb1d5d261992a25724c6b849371d773 Binary files /dev/null and b/MelArray/other/dag_png/dag_v08_vert.png differ diff --git a/MelArray/other/rule_graph_png/rule_graph_v07_hori.png b/MelArray/other/rule_graph_png/rule_graph_v07_hori.png new file mode 100644 index 0000000000000000000000000000000000000000..f6f788164a0f46080058841f2cd70b7e9b9d9028 Binary files /dev/null and b/MelArray/other/rule_graph_png/rule_graph_v07_hori.png differ diff --git a/MelArray/other/rule_graph_png/rule_graph_v07_vert.png b/MelArray/other/rule_graph_png/rule_graph_v07_vert.png new file mode 100644 index 0000000000000000000000000000000000000000..bbadc929ae20ac23ac6166eecab23775c16606a4 Binary files /dev/null and b/MelArray/other/rule_graph_png/rule_graph_v07_vert.png differ diff --git a/MelArray/other/rule_graph_png/rule_graph_v08_hori.png b/MelArray/other/rule_graph_png/rule_graph_v08_hori.png new file mode 100644 index 0000000000000000000000000000000000000000..bcabf8037358ae230fe843636dd8a268ce8a2f8e Binary files /dev/null and b/MelArray/other/rule_graph_png/rule_graph_v08_hori.png differ diff --git a/MelArray/other/rule_graph_png/rule_graph_v08_vert.png b/MelArray/other/rule_graph_png/rule_graph_v08_vert.png new file mode 100644 index 0000000000000000000000000000000000000000..06d863a91ad5baa48c1899c32669964e793e3b2b Binary files /dev/null and b/MelArray/other/rule_graph_png/rule_graph_v08_vert.png differ diff --git a/MelArray/other/run_times/20170524_015400_rules_timing.png b/MelArray/other/run_times/20170524_015400_rules_timing.png index 9b95100896bd40b038072a6996ce8cfa0632b7c0..0fe8baef07f63e323e1ec8b3417c303926564b27 100644 Binary files a/MelArray/other/run_times/20170524_015400_rules_timing.png and b/MelArray/other/run_times/20170524_015400_rules_timing.png differ diff --git a/MelArray/other/run_times/20170524_015400_samples_timing.png b/MelArray/other/run_times/20170524_015400_samples_timing.png index 3693740fb69eba57704836411bf4aa3cd1528f5a..718db2796466f4af449fc56b498e3c3958494d04 100644 Binary files a/MelArray/other/run_times/20170524_015400_samples_timing.png and b/MelArray/other/run_times/20170524_015400_samples_timing.png differ diff --git a/MelArray/scripts/create_single_container.sh b/MelArray/scripts/create_single_container.sh index ad9d35502203ab781951361cc816a4b22134d6a8..bb2c14290c6a6df6a88d077da1a65d8c319c32d4 100755 --- a/MelArray/scripts/create_single_container.sh +++ b/MelArray/scripts/create_single_container.sh @@ -28,3 +28,4 @@ singularity exec melarray.img bwa 2>&1 | grep -P 'Version' | sed 's/Version:/BWA singularity exec melarray.img java -jar /usr/local/bin/picard.jar SortSam --version 2>&1 | sed 's/\([0-9\.]\+\).\+/Picard \1/' singularity exec melarray.img java -jar /usr/local/bin/GenomeAnalysisTK.jar --version | sed 's/\([0-9\.]\+\).\+/GATK \1/' singularity exec melarray.img R --version | grep "R version" | sed 's/R version \([0-9\.]\+\).\+/R \1/' +singularity exec melarray.img samtools --version | grep "samtools" diff --git a/MelArray/scripts/parse_qsubs_times.py b/MelArray/scripts/parse_qsubs_times.py index 621a166ce6fadda7c3b77ed15d4c81378879bc3c..ebb6c067bc3d66f61fb572b4278efab25a570bf0 100644 --- a/MelArray/scripts/parse_qsubs_times.py +++ b/MelArray/scripts/parse_qsubs_times.py @@ -18,16 +18,22 @@ with open(qsubs_file_path) as qsubs_file: n = 0 for line in qsubs_file: line = line[:-1] - if re.match('^-- rename', line): + if re.match('^-- rename', line) or re.match('^-- multi_qc', line): qsubs_file.readline() qsubs_file.readline() continue # print(str(i) + ': "' + line + '"') if i == 0: - m = re.match('^-- (\w+)_(\w+) +--$', line) - rules.append(m.groups()[0]) - samples.append(m.groups()[1]) + if re.match('^-- (\w+)_([A-Z]\w+)_([A-Z]\w+) +--$', line) and not re.match('^-- (\w+)_(QC)_([A-Z]\w+) +--$', line): + m = re.match('^-- (\w+)_([A-Z]\w+)_([A-Z]\w+) +--$', line) + rules.append(m.groups()[0]) + samples.append(m.groups()[1]) + + else: + m = re.match('^-- (\w+)_(\w+) +--$', line) + rules.append(m.groups()[0]) + samples.append(m.groups()[1]) n += 1 start_date = None @@ -95,7 +101,7 @@ plt.figure(2) ax = plt.gca() ax.scatter(rule_inds, durations, 40, edgecolors = [colors[i] for i in sample_inds], facecolors = 'none') ax.set_yscale("log") -plt.xticks(range(len(rule_labels)), rule_labels, rotation = 19) +plt.xticks(range(len(rule_labels)), rule_labels, rotation = 25) plt.ylabel('duration [s]', fontsize = 18) plt.xlabel('steps', fontsize = 18) plt.ylim(yLimsSec)