diff --git a/MelArray/MelArray.snake b/MelArray/MelArray.snake new file mode 100644 index 0000000000000000000000000000000000000000..6f8ff2b578787b3a587aae957d1274cf3b7e60b6 --- /dev/null +++ b/MelArray/MelArray.snake @@ -0,0 +1,70 @@ +############################## +## USZ Snakemake pipeline 1 ## +############################## +# +# original bash pipeline from Phil Cheng +# @author: Balazs Laurenczy (for Snakemake pipeline) +# @date: 2017 + +######## +# INIT # +######## + +import os, logging + +from timeit import default_timer as t +from datetime import datetime + +# set up logging +logging.basicConfig(level = logging.INFO, format = '#%(asctime)s | %(levelname)-7s | %(filename)-20s: %(message)s') + +logging.info('Starting snakemake pipeline ...') + +# load the configuration file +config_file_path = os.getcwd() + "/config.yaml" +config = load_config(config_file_path) + +# get the list of files to process +IDs, = glob_wildcards(IN_FOLDER + "{id}.fastq") + +# check if IDs were not specified in config +IDs = config.get('ids', IDs) +if type(IDs) == str: + IDs = IDs.split(',') + + +######### +# RULES # +######### + +rule all: + input: + expand(OUT_FOLDER + '{id}.bam', id = IDs) + run: + logging.info('Pipeline completed.') + + +rule step1: + input: + IN_FOLDER + '{id}.fastq' + output: + TMP_FOLDER + '{id}.sam' + shell: + 'cat {input} > {output}' + + +rule step2: + input: + TMP_FOLDER + '{id}.sam' + output: + OUT_FOLDER + '{id}.bam' + shell: + 'cat {input} > {output}' + + +onsuccess: + logging.info('Pipeline completed successfully.') + +onerror: + logging.info('Pipeline failed.') + diff --git a/MelArray/config.yaml b/MelArray/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2443b4af707b0b091aedd947ed6f3ffc8959d5df --- /dev/null +++ b/MelArray/config.yaml @@ -0,0 +1,19 @@ +############################# +## Main configuration file ## +############################# +# +# This file controls all the parameters for running the pipeline. +# This file should be available on the machine on the machine running the pipeline. +# +# @author: Balazs Laurenczy +# @date: 2017 + + +# file and folder paths +path: + + # root path for the code/scripts + root: "/vagrant/MelArray" + + # folder where the original fastq data is + fastq: "__ROOT__/fastq" diff --git a/MelArray/original_bash_scripts/Mel_1_skewer_170228.sh b/MelArray/original_bash_scripts/Mel_1_skewer_170228.sh new file mode 100644 index 0000000000000000000000000000000000000000..6a1383f7d7d3c390b7d0169916590aea5911650b --- /dev/null +++ b/MelArray/original_bash_scripts/Mel_1_skewer_170228.sh @@ -0,0 +1,23 @@ +#!/bin/bash +rg_dir=`echo 2_skewer` +mkdir -p "$rg_dir" + +foo () { +local i=$1 + OUTPUT_DIR=`echo ./2_skewer` + SAMPLE=`echo $i | sed 's/\(.*\)_R1.fastq.gz/\1/'`; # sample is the full name like "YYYYMMDD.A-...-_NAME_R1 + NAME=`echo $i | sed 's/[0-9]*.A-\([A-Za-z0-9-]*\)_[A-Za-z0-9_]*_R1.fastq.gz/\1/'`; # The ID + #echo ${i} + #echo ${SAMPLE}_R2.fastq.gz + #echo ${NAME}_R2.fastq.gz + # rule name skewer + skewer -t 32 -m pe -q 20 -z ${i} ${SAMPLE}_R2.fastq.gz -o ${NAME} + # 2 inputs: "R[12].fastq.gz" + # 2 outputs: "-trimmed-pair[12].fastq.gz" +} + +for i in *R1.fastq.gz +do foo "$i" +done + +mv *trimmed* ./2_skewer diff --git a/MelArray/original_bash_scripts/Mel_2_bwa_GATK_170228.sh b/MelArray/original_bash_scripts/Mel_2_bwa_GATK_170228.sh new file mode 100644 index 0000000000000000000000000000000000000000..247dcb26e6fe24b3c8a6b7d8dfe9febd2cde5cc3 --- /dev/null +++ b/MelArray/original_bash_scripts/Mel_2_bwa_GATK_170228.sh @@ -0,0 +1,256 @@ +#!/bin/bash +rg_dir=`echo 3_bwamem` +mkdir -p "$rg_dir" + + +# PMBC is blood (normal) +# MTB is cancer sample +#2 patients +# machine version Centos 7.2.1511 + + # rule name create_interval_list + # input is "config: reference bed 151127_out.bed" + # output is "melarray.interval_list" + # config: path to reference dictionary "ucsc.hg19.dict" + java -Xmx16G -jar /data/Phil/software/picard-tools-2.4.1/picard.jar BedToIntervalList I=.bed O=melarray.interval_list SD=ucsc.hg19.dict + + +# TODO: create another sub-pipeline to do the pre-processing steps for reference genome (.tbi, .dict) + + +foo () { +local i=$1 + OUTPUT_DIR=`echo ./3_bwamem` + #OUTPUT_DIR2=`echo ./GATK_RNA_dedup/` + + SAMPLE=`echo $i | sed 's/\([A-Za-z0-9]*\)-trimmed-pair1.fastq.gz/\1/'`; # output name + # INPUT: "-trimmed-pair[12].fastq.gz" + # output is "[ID]_sorted.bam" + # rule name bwa_bam + # config: header read group, separate into 4 groups: ID, LB, PL, PU "@RG\tID:Melanoma\tLB:Melarray\tSM:${SAMPLE}\tPL:ILLUMINA\tPU:HiSEQ4000" + # config: reference fasta path /data/Phil/ref_phil/GATK_resource/hg19/ucsc.hg19.fasta + # config: path to picard jar /data/Phil/software/picard-tools-2.4.1/picard.jar + # config: testing parameter: align it only against chromosome 1 + bwa mem -t 16 -M -R "@RG\tID:Melanoma\tLB:Melarray\tSM:${SAMPLE}\tPL:ILLUMINA\tPU:HiSEQ4000" /data/Phil/ref_phil/GATK_resource/hg19/ucsc.hg19.fasta ./${i} ./${SAMPLE}-trimmed-pair2.fastq.gz | java -Xmx16G -jar /data/Phil/software/picard-tools-2.4.1/picard.jar SortSam I=/dev/stdin O=${SAMPLE}_sorted.bam CREATE_INDEX=true VALIDATION_STRINGENCY=SILENT SO=coordinate +} +export -f foo + +for i in *-trimmed-pair1.fastq.gz +do +sem -j 1 foo "$i" +done +sem --wait + +rg_dir2=`echo 4_dedup` +mkdir -p "$rg_dir2" + +foo2 () { +local i=$1 + #OUTPUT_DIR=`echo 4_dedup/` + #OUTPUT_DIR2=`echo ./GATK_RNA_dedup/` + NAME=`echo $i | sed 's/\([A-Za-z0-9-]*\)_sorted.bam/\1/'`; + echo ${NAME} + # separate metrics log file + # rule name mark_duplicates + # input is "sorted/[ID]_sorted.bam" + # output is "sorted_dedup/[ID]_sorted_dedup.bam" + # config: path to picard jar /data/Phil/software/picard-tools-2.4.1/picard.jar + java -Xmx16G -jar /data/Phil/software/picard-tools-2.4.1/picard.jar MarkDuplicates I=${i} O=${NAME}_sorted_dedup.bam M=${NAME}_metrics.txt +} +export -f foo2 + +for i in *sorted.bam +do +sem -j 16 --id dedup foo2 "$i" +done +sem --wait --id dedup + +rg_dir3=`echo 5_fixmate` +mkdir -p "$rg_dir3" + +foo3 () { +local i=$1 + OUTPUT_DIR=`echo ./5_fixmate/` + #OUTPUT_DIR2=`echo ./GATK_RNA_dedup/` + NAME=`echo $i | sed 's/\([A-Za-z0-9-]*\)_sorted_dedup.bam/\1/'`; + echo ${NAME} + # rule name fix_mate + # input is "sorted_dedup/[ID]_sorted_dedup.bam" + # output is "sorted_dedup_fixmate/[ID]_sorted_dedup_fixmate.bam" + # config: path to picard jar /data/Phil/software/picard-tools-2.4.1/picard.jar + java -Xmx16G -jar /data/Phil/software/picard-tools-2.4.1/picard.jar FixMateInformation VALIDATION_STRINGENCY=SILENT CREATE_INDEX=true SORT_ORDER=coordinate I=${i} O=${NAME}_sorted_dedup_fixmate.bam +} +export -f foo3 +for i in *sorted_dedup.bam +do +sem -j 16 --id fixmate foo3 "$i" +done +sem --wait --id fixmate + +rg_dir4=`echo 6_bqsr` +mkdir -p "$rg_dir4" + +foo4 () { +local i=$1 + NAME=`echo $i | sed 's/\([A-Za-z0-9-]*\)_sorted_dedup_fixmate.bam/\1/'`; + echo ${NAME} + # rule name base_recal + # input is "sorted_dedup_fixmate/[ID]_sorted_dedup_fixmate.bam" + # output is "recal_table/[ID]_recal_data.table" + # config: path to gatk jar /data/Phil/software/GATK_3.6/GenomeAnalysisTK.jar + # config: reference fasta path /data/Phil/ref_phil/GATK_resource/hg19/ucsc.hg19.fasta + # config: reference dbsnip /data/Phil/ref_phil/GATK_resource/hg19/dbsnp_138.hg19.vcf.gz + # config: reference Mills /data/Phil/ref_phil/GATK_resource/hg19/Mills_and_1000G_gold_standard.indels.hg19.sites.vcf.gz + # config: reference 1000G /data/Phil/ref_phil/GATK_resource/hg19/1000G_phase1.snps.high_confidence.hg19.sites.vcf.gz + # config: reference bed 151127_out.bed + # config: interval_padding 100 + java -Xmx16G -jar /data/Phil/software/GATK_3.6/GenomeAnalysisTK.jar -T BaseRecalibrator -R /data/Phil/ref_phil/GATK_resource/hg19/ucsc.hg19.fasta -I ${i} -knownSites /data/Phil/ref_phil/GATK_resource/hg19/dbsnp_138.hg19.vcf.gz -knownSites /data/Phil/ref_phil/GATK_resource/hg19/Mills_and_1000G_gold_standard.indels.hg19.sites.vcf.gz -knownSites /data/Phil/ref_phil/GATK_resource/hg19/1000G_phase1.snps.high_confidence.hg19.sites.vcf.gz -L 151127_out.bed --interval_padding 100 -o ${NAME}_recal_data.table +} +export -f foo4 + +for i in *sorted_dedup_fixmate.bam +do +sem -j 16 foo4 "$i" +done +sem --wait + +foo5 () { +local i=$1 + NAME=`echo $i | sed 's/\([A-Za-z0-9-]*\)_sorted_dedup_fixmate.bam/\1/'`; + + # rule name post_recal + # input is "sorted_dedup_fixmate/[ID]_sorted_dedup_fixmate.bam" + # 2nd input is "recal_table/[ID]_recal_data.table" + # output is "post_recal_table/[ID]_post_recal_data.table" + # config: path to gatk jar /data/Phil/software/GATK_3.6/GenomeAnalysisTK.jar + # config: reference fasta path /data/Phil/ref_phil/GATK_resource/hg19/ucsc.hg19.fasta + # config: reference dbsnip /data/Phil/ref_phil/GATK_resource/hg19/dbsnp_138.hg19.vcf.gz + # config: reference Mills /data/Phil/ref_phil/GATK_resource/hg19/Mills_and_1000G_gold_standard.indels.hg19.sites.vcf.gz + # config: reference 1000G /data/Phil/ref_phil/GATK_resource/hg19/1000G_phase1.snps.high_confidence.hg19.sites.vcf.gz + # config: reference bed 151127_out.bed + # config: interval_padding 100 + java -Xmx16G -jar /data/Phil/software/GATK_3.6/GenomeAnalysisTK.jar -T BaseRecalibrator -R /data/Phil/ref_phil/GATK_resource/hg19/ucsc.hg19.fasta -I ${i} -knownSites /data/Phil/ref_phil/GATK_resource/hg19/dbsnp_138.hg19.vcf.gz -knownSites /data/Phil/ref_phil/GATK_resource/hg19/Mills_and_1000G_gold_standard.indels.hg19.sites.vcf.gz -knownSites /data/Phil/ref_phil/GATK_resource/hg19/1000G_phase1.snps.high_confidence.hg19.sites.vcf.gz -L 151127_out.bed --interval_padding 100 -BQSR ${NAME}_recal_data.table -o ${NAME}_post_recal_data.table + +} +export -f foo5 +for i in *sorted_dedup_fixmate.bam +do +sem -j 16 foo5 "$i" +done +sem --wait + +foo6 () { +local i=$1 + NAME=`echo $i | sed 's/\([A-Za-z0-9-]*\)_sorted_dedup_fixmate.bam/\1/'`; + # rule name analyze_covariates + # input is "sorted_dedup_fixmate/[ID]_sorted_dedup_fixmate.bam" + # 2nd input is "recal_table/[ID]_recal_data.table" + # 3nd input is "post_recal_table/[ID]_post_recal_data.table" + # output is "recal_plots/[ID]_recalibration_plots.pdf" + # config: path to gatk jar /data/Phil/software/GATK_3.6/GenomeAnalysisTK.jar + # config: reference fasta path /data/Phil/ref_phil/GATK_resource/hg19/ucsc.hg19.fasta + java -Xmx16G -jar /data/Phil/software/GATK_3.6/GenomeAnalysisTK.jar -T AnalyzeCovariates -R /data/Phil/ref_phil/GATK_resource/hg19/ucsc.hg19.fasta -before ${NAME}_recal_data.table -after ${NAME}_post_recal_data.table -plots ${NAME}_recalibration_plots.pdf +} +export -f foo6 +for i in *sorted_dedup_fixmate.bam +do +sem -j 16 foo6 "$i" +done +sem --wait + +foo7 () { +local i=$1 + OUTPUT_DIR=`echo GATK_BQSR/` + NAME=`echo $i | sed 's/\([A-Za-z0-9-]*\)_sorted_dedup_fixmate.bam/\1/'`; + # rule name print_reads + # input is "sorted_dedup_fixmate/[ID]_sorted_dedup_fixmate.bam" + # 2nd input is "_recal_data.table" + # output is "sorted_dedup_fixmate_bqsr/[ID]_sorted_dedup_fixmate_bqsr.bam" + # config: path to gatk jar /data/Phil/software/GATK_3.6/GenomeAnalysisTK.jar + # config: reference fasta path /data/Phil/ref_phil/GATK_resource/hg19/ucsc.hg19.fasta + java -Xmx16G -jar /data/Phil/software/GATK_3.6/GenomeAnalysisTK.jar -T PrintReads -R /data/Phil/ref_phil/GATK_resource/hg19/ucsc.hg19.fasta -I ${i} -BQSR ${NAME}_recal_data.table -o ${NAME}_sorted_dedup_fixmate_bqsr.bam +} +export -f foo7 +for i in *sorted_dedup_fixmate.bam +do +sem -j 16 foo7 "$i" +done +sem --wait + +rg_dir5=`echo QC_reports` +mkdir -p "$rg_dir5" + +foo8 () { +local i=$1 +NAME=`echo $i | sed 's/\([A-Za-z0-9-]*\)_sorted_dedup_fixmate_bqsr.bam/\1/'`; + # rule name qc_bam + # input is "sorted_dedup_fixmate/[ID]_sorted_dedup_fixmate.bam" + # 2nd input is "melarray.interval_list" + # output is "_hs_metrics.txt" + # config: path to picard jar /data/Phil/software/picard-tools-2.4.1/picard.jar + # config: reference fasta path /data/Phil/ref_phil/GATK_resource/hg19/ucsc.hg19.fasta + java -Xmx16G -jar /data/Phil/software/picard-tools-2.4.1/picard.jar CollectHsMetrics I=${i} O=${NAME}_hs_metrics.txt R=/data/Phil/ref_phil/GATK_resource/hg19/ucsc.hg19.fasta BAIT_INTERVALS=melarray.interval_list TARGET_INTERVALS=melarray.interval_list + + # rule name collect_alignment_summary_metrics + # input is "sorted_dedup_fixmate/[ID]_sorted_dedup_fixmate.bam" + # output is "_aln_metrics.txt" + # config: path to picard jar /data/Phil/software/picard-tools-2.4.1/picard.jar + # config: reference fasta path /data/Phil/ref_phil/GATK_resource/hg19/ucsc.hg19.fasta + java -Xmx16G -jar /data/Phil/software/picard-tools-2.4.1/picard.jar CollectAlignmentSummaryMetrics I=${i} O=${NAME}_aln_metrics.txt R=/data/Phil/ref_phil/GATK_resource/hg19/ucsc.hg19.fasta + } + +export -f foo8 + +for i in *sorted_dedup_fixmate_bqsr.bam +do +sem -j 32 foo8 "$i" +done +sem --wait + + +# rule name multi_qc +# input is all the folders +# output html file and multiqc_data folder +multiqc {folder1} {folder2} + +# rule name haplotype_caller +# input is "sorted_dedup_fixmate_bqsr/[ID]_sorted_dedup_fixmate_bqsr.bam" +# output is "hc/${NAME}_raw.g.vcf" +# config: path to gatk jar /data/Phil/software/GATK_3.6/GenomeAnalysisTK.jar +# config: reference fasta path /data/Phil/ref_phil/GATK_resource/hg19/ucsc.hg19.fasta +# config: reference dbsnp /data/Phil/ref_phil/GATK_resource/hg19/dbsnp_138.hg19.vcf.gz +# config: emitRefConfidence GVCF +# config: variant_index_type LINEAR +# config: variant_index_parameter 128000 +java -Xmx16G -jar /data/Phil/software/GATK_3.6/GenomeAnalysisTK.jar -T HaplotypeCaller -R /data/Phil/ref_phil/GATK_resource/hg19/ucsc.hg19.fasta -I ${i} --emitRefConfidence GVCF --variant_index_type LINEAR --variant_index_parameter 128000 -o ${NAME}_raw.g.vcf --dbsnp /data/Phil/ref_phil/GATK_resource/hg19/dbsnp_138.hg19.vcf.gz + + +# rule name MuTect2 +# read in _tumor_normal file at the beginning and store IDs in memory +# both inputs are "sorted_dedup_fixmate_bqsr/[ID]_sorted_dedup_fixmate_bqsr.bam" but with different IDs, tumor is always present +# 3rd input reference bed from config 151127_out.bed +# output MuTect2/${t1}_${n1}/${t1}_mutect2.vcf +# config: path to gatk jar /data/Phil/software/GATK_3.6/GenomeAnalysisTK.jar +# config: reference fasta path /data/Phil/ref_phil/GATK_resource/hg19/ucsc.hg19.fasta +# config: reference dbsnp /data/Phil/ref_phil/GATK_resource/hg19/dbsnp_138.hg19.vcf.gz +# config: reference cosmic /data/Phil/ref_phil/GATK_resource/hg19/CosmicCodingMuts_chr_M_sorted.vcf.gz +java -Xmx16G -jar /data/Phil/software/GATK_3.6/GenomeAnalysisTK.jar -T MuTect2 -R /data/Phil/ref_phil/GATK_resource/hg19/ucsc.hg19.fasta -I:tumor ${t1}_sorted_dedup_fixmate_bqsr.bam -I:normal ${n1}_sorted_dedup_fixmate_bqsr.bam -o MuTect2/${t1}_${n1}/${t1}_mutect2.vcf --dbsnp /data/Phil/ref_phil/GATK_resource/hg19/dbsnp_138.hg19.vcf.gz --cosmic /data/Phil/ref_phil/GATK_resource/hg19/CosmicCodingMuts_chr_M_sorted.vcf.gz -L 151127_out.bed + + +mv *sorted.bam 3_bwamem +mv *sorted.bai 3_bwamem + +mv *metrics.txt 4_dedup +mv *sorted_dedup.bam 4_dedup + +mv *sorted_dedup_fixmate.bam 5_fixmate +mv *sorted_dedup_fixmate.bai 5_fixmate + +mv *.table 6_bqsr +mv *plots.pdf 6_bqsr +mv *sorted_dedup_fixmate_bqsr.bam 6_bqsr +mv *sorted_dedup_fixmate_bqsr.bai 6_bqsr + +mv *pcr_metrics.txt QC_reports +mv *aln_metrics.txt QC_reports +mv *hs_metrics.txt QC_reports diff --git a/MelArray/scripts/load_config.py b/MelArray/scripts/load_config.py new file mode 100644 index 0000000000000000000000000000000000000000..4906c0275770a5c52cfffff9886eb2b60c851ad8 --- /dev/null +++ b/MelArray/scripts/load_config.py @@ -0,0 +1,21 @@ + +import logging +import yaml + + +def load_config(config_file_path): + """ + Load the YAML configuration file and return a dictionary with all the configuration parameters. + + @author: Balazs Laurenczy + @date: 2017 + """ + + logging.debug("Loading config file from %s", config_file_path) + with open(config_file_path, 'r') as config_file: + config = yaml.load(config_file) + + # initialize the path(s) by replacing the '__ROOT__' tag with the actual root path + config["path"]["fastq"] = config["path"]["fastq"].replace("__ROOT__", config["path"]['root']) + + return config \ No newline at end of file diff --git a/Vagrantfile b/Vagrantfile new file mode 100644 index 0000000000000000000000000000000000000000..1a55195885faf232a8239365cc6c5090028c0619 --- /dev/null +++ b/Vagrantfile @@ -0,0 +1,71 @@ +# -*- mode: ruby -*- +# vi: set ft=ruby : + +# All Vagrant configuration is done below. The "2" in Vagrant.configure +# configures the configuration version (we support older styles for +# backwards compatibility). Please don't change it unless you know what +# you're doing. +Vagrant.configure("2") do |config| + # The most common configuration options are documented and commented below. + # For a complete reference, please see the online documentation at + # https://docs.vagrantup.com. + + # Every Vagrant development environment requires a box. You can search for + # boxes at https://atlas.hashicorp.com/search. + config.vm.box = "bento/centos-7.2" + + # Disable automatic box update checking. If you disable this, then + # boxes will only be checked for updates when the user runs + # `vagrant box outdated`. This is not recommended. + # config.vm.box_check_update = false + + # Create a forwarded port mapping which allows access to a specific port + # within the machine from a port on the host machine. In the example below, + # accessing "localhost:8080" will access port 80 on the guest machine. + # config.vm.network "forwarded_port", guest: 80, host: 8080 + + # Create a private network, which allows host-only access to the machine + # using a specific IP. + # config.vm.network "private_network", ip: "192.168.33.10" + + # Create a public network, which generally matched to bridged network. + # Bridged networks make the machine appear as another physical device on + # your network. + # config.vm.network "public_network" + + # Share an additional folder to the guest VM. The first argument is + # the path on the host to the actual folder. The second argument is + # the path on the guest to mount the folder. And the optional third + # argument is a set of non-required options. + # config.vm.synced_folder "src", "target" + + # Provider-specific configuration so you can fine-tune various + # backing providers for Vagrant. These expose provider-specific options. + # Example for VirtualBox: + # + # config.vm.provider "virtualbox" do |vb| + # # Display the VirtualBox GUI when booting the machine + # vb.gui = true + # + # # Customize the amount of memory on the VM: + # vb.memory = "1024" + # end + # + # View the documentation for the provider you are using for more + # information on available options. + + # Define a Vagrant Push strategy for pushing to Atlas. Other push strategies + # such as FTP and Heroku are also available. See the documentation at + # https://docs.vagrantup.com/v2/push/atlas.html for more information. + # config.push.define "atlas" do |push| + # push.app = "YOUR_ATLAS_USERNAME/YOUR_APPLICATION_NAME" + # end + + # Enable provisioning with a shell script. Additional provisioners such as + # Puppet, Chef, Ansible, Salt, and Docker are also available. Please see the + # documentation for more information about their specific syntax and use. + # config.vm.provision "shell", inline: <<-SHELL + # apt-get update + # apt-get install -y apache2 + # SHELL +end diff --git a/pipeline1/Mel_1_skewer_170228.txt b/pipeline1/Mel_1_skewer_170228.txt new file mode 100644 index 0000000000000000000000000000000000000000..b830a9b447a7f83375d2ae2048fc3e37b9cd1c2a --- /dev/null +++ b/pipeline1/Mel_1_skewer_170228.txt @@ -0,0 +1,20 @@ +#!/bin/bash +rg_dir=`echo 2_skewer` +mkdir -p "$rg_dir" + +foo () { +local i=$1 + OUTPUT_DIR=`echo ./2_skewer` + SAMPLE=`echo $i | sed 's/\(.*\)_R1.fastq.gz/\1/'`; + NAME=`echo $i | sed 's/[0-9]*.A-\([A-Za-z0-9-]*\)_[A-Za-z0-9_]*_R1.fastq.gz/\1/'`; + #echo ${i} + #echo ${SAMPLE}_R2.fastq.gz + #echo ${NAME}_R2.fastq.gz + skewer -t 32 -m pe -q 20 -z ${i} ${SAMPLE}_R2.fastq.gz -o ${NAME} +} + +for i in *R1.fastq.gz +do foo "$i" +done + +mv *trimmed* ./2_skewer diff --git a/pipeline1/Mel_2_bwa_GATK_170228.txt b/pipeline1/Mel_2_bwa_GATK_170228.txt new file mode 100644 index 0000000000000000000000000000000000000000..3bb858345fe0898337256dd17a04db4c8991bf51 --- /dev/null +++ b/pipeline1/Mel_2_bwa_GATK_170228.txt @@ -0,0 +1,153 @@ +#!/bin/bash +rg_dir=`echo 3_bwamem` +mkdir -p "$rg_dir" + +foo () { +local i=$1 + OUTPUT_DIR=`echo ./3_bwamem` + #OUTPUT_DIR2=`echo ./GATK_RNA_dedup/` + + SAMPLE=`echo $i | sed 's/\([A-Za-z0-9]*\)-trimmed-pair1.fastq.gz/\1/'`; + #NAME=`echo $i | sed 's/[0-9]*.[A-Z]-\([A-Za-z0-9-]*\)_R1.fastq-trimmed-pair1.fastq.gz/\1/'`; + #dir = `echo pwd` + #echo ${SAMPLE} + #echo $NAME + #echo ${i} + #echo ./${SAMPLE}_R1.fastq-trimmed-pair2.fastq.gz + bwa mem -t 16 -M -R "@RG\tID:Melanoma\tLB:Melarray\tSM:${SAMPLE}\tPL:ILLUMINA\tPU:HiSEQ4000" /data/Phil/ref_phil/GATK_resource/hg19/ucsc.hg19.fasta ./${i} ./${SAMPLE}-trimmed-pair2.fastq.gz | java -Xmx16G -jar /data/Phil/software/picard-tools-2.4.1/picard.jar SortSam I=/dev/stdin O=${SAMPLE}_sorted.bam CREATE_INDEX=true VALIDATION_STRINGENCY=SILENT SO=coordinate +} +export -f foo + +for i in *-trimmed-pair1.fastq.gz +do +sem -j 1 foo "$i" +done +sem --wait + +rg_dir2=`echo 4_dedup` +mkdir -p "$rg_dir2" + +foo2 () { +local i=$1 + #OUTPUT_DIR=`echo 4_dedup/` + #OUTPUT_DIR2=`echo ./GATK_RNA_dedup/` + NAME=`echo $i | sed 's/\([A-Za-z0-9-]*\)_sorted.bam/\1/'`; + echo ${NAME} + java -Xmx16G -jar /data/Phil/software/picard-tools-2.4.1/picard.jar MarkDuplicates I=${i} O=${NAME}_sorted_dedup.bam M=${NAME}_metrics.txt +} +export -f foo2 + +for i in *sorted.bam +do +sem -j 16 --id dedup foo2 "$i" +done +sem --wait --id dedup + +rg_dir3=`echo 5_fixmate` +mkdir -p "$rg_dir3" + +foo3 () { +local i=$1 + OUTPUT_DIR=`echo ./5_fixmate/` + #OUTPUT_DIR2=`echo ./GATK_RNA_dedup/` + NAME=`echo $i | sed 's/\([A-Za-z0-9-]*\)_sorted_dedup.bam/\1/'`; + echo ${NAME} + java -Xmx16G -jar /data/Phil/software/picard-tools-2.4.1/picard.jar FixMateInformation VALIDATION_STRINGENCY=SILENT CREATE_INDEX=true SORT_ORDER=coordinate I=${i} O=${NAME}_sorted_dedup_fixmate.bam +} +export -f foo3 +for i in *sorted_dedup.bam +do +sem -j 16 --id fixmate foo3 "$i" +done +sem --wait --id fixmate + +rg_dir4=`echo 6_bqsr` +mkdir -p "$rg_dir4" + +foo4 () { +local i=$1 + NAME=`echo $i | sed 's/\([A-Za-z0-9-]*\)_sorted_dedup_fixmate.bam/\1/'`; + echo ${NAME} + java -Xmx16G -jar /data/Phil/software/GATK_3.6/GenomeAnalysisTK.jar -T BaseRecalibrator -R /data/Phil/ref_phil/GATK_resource/hg19/ucsc.hg19.fasta -I ${i} -knownSites /data/Phil/ref_phil/GATK_resource/hg19/dbsnp_138.hg19.vcf.gz -knownSites /data/Phil/ref_phil/GATK_resource/hg19/Mills_and_1000G_gold_standard.indels.hg19.sites.vcf.gz -knownSites /data/Phil/ref_phil/GATK_resource/hg19/1000G_phase1.snps.high_confidence.hg19.sites.vcf.gz -L 151127_out.bed --interval_padding 100 -o ${NAME}_recal_data.table +} +export -f foo4 + +for i in *sorted_dedup_fixmate.bam +do +sem -j 16 foo4 "$i" +done +sem --wait + +foo5 () { +local i=$1 + NAME=`echo $i | sed 's/\([A-Za-z0-9-]*\)_sorted_dedup_fixmate.bam/\1/'`; + java -Xmx16G -jar /data/Phil/software/GATK_3.6/GenomeAnalysisTK.jar -T BaseRecalibrator -R /data/Phil/ref_phil/GATK_resource/hg19/ucsc.hg19.fasta -I ${i} -knownSites /data/Phil/ref_phil/GATK_resource/hg19/dbsnp_138.hg19.vcf.gz -knownSites /data/Phil/ref_phil/GATK_resource/hg19/Mills_and_1000G_gold_standard.indels.hg19.sites.vcf.gz -knownSites /data/Phil/ref_phil/GATK_resource/hg19/1000G_phase1.snps.high_confidence.hg19.sites.vcf.gz -L 151127_out.bed --interval_padding 100 -BQSR ${NAME}_recal_data.table -o ${NAME}_post_recal_data.table + +} +export -f foo5 +for i in *sorted_dedup_fixmate.bam +do +sem -j 16 foo5 "$i" +done +sem --wait + +foo6 () { +local i=$1 + NAME=`echo $i | sed 's/\([A-Za-z0-9-]*\)_sorted_dedup_fixmate.bam/\1/'`; + java -Xmx16G -jar /data/Phil/software/GATK_3.6/GenomeAnalysisTK.jar -T AnalyzeCovariates -R /data/Phil/ref_phil/GATK_resource/hg19/ucsc.hg19.fasta -before ${NAME}_recal_data.table -after ${NAME}_post_recal_data.table -plots ${NAME}_recalibration_plots.pdf +} +export -f foo6 +for i in *sorted_dedup_fixmate.bam +do +sem -j 16 foo6 "$i" +done +sem --wait + +foo7 () { +local i=$1 + OUTPUT_DIR=`echo GATK_BQSR/` + NAME=`echo $i | sed 's/\([A-Za-z0-9-]*\)_sorted_dedup_fixmate.bam/\1/'`; + java -Xmx16G -jar /data/Phil/software/GATK_3.6/GenomeAnalysisTK.jar -T PrintReads -R /data/Phil/ref_phil/GATK_resource/hg19/ucsc.hg19.fasta -I ${i} -BQSR ${NAME}_recal_data.table -o ${NAME}_sorted_dedup_fixmate_bqsr.bam +} +export -f foo7 +for i in *sorted_dedup_fixmate.bam +do +sem -j 16 foo7 "$i" +done +sem --wait + +rg_dir5=`echo QC_reports` +mkdir -p "$rg_dir5" + +foo8 () { +local i=$1 +NAME=`echo $i | sed 's/\([A-Za-z0-9-]*\)_sorted_dedup_fixmate_bqsr.bam/\1/'`; + java -Xmx16G -jar /data/Phil/software/picard-tools-2.4.1/picard.jar CollectHsMetrics I=${i} O=${NAME}_hs_metrics.txt R=/data/Phil/ref_phil/GATK_resource/hg19/ucsc.hg19.fasta BAIT_INTERVALS=melarray.interval_list TARGET_INTERVALS=melarray.interval_list + java -Xmx16G -jar /data/Phil/software/picard-tools-2.4.1/picard.jar CollectAlignmentSummaryMetrics I=${i} O=${NAME}_aln_metrics.txt R=/data/Phil/ref_phil/GATK_resource/hg19/ucsc.hg19.fasta + } + +export -f foo8 + +for i in *sorted_dedup_fixmate_bqsr.bam +do +sem -j 32 foo8 "$i" +done +sem --wait + +mv *sorted.bam 3_bwamem +mv *sorted.bai 3_bwamem + +mv *metrics.txt 4_dedup +mv *sorted_dedup.bam 4_dedup + +mv *sorted_dedup_fixmate.bam 5_fixmate +mv *sorted_dedup_fixmate.bai 5_fixmate + +mv *.table 6_bqsr +mv *plots.pdf 6_bqsr +mv *sorted_dedup_fixmate_bqsr.bam 6_bqsr +mv *sorted_dedup_fixmate_bqsr.bai 6_bqsr + +mv *pcr_metrics.txt QC_reports +mv *aln_metrics.txt QC_reports +mv *hs_metrics.txt QC_reports diff --git a/pipeline1/pipeline1.snake b/pipeline1/pipeline1.snake index b6eb04e561434f74b2840f8dc2bdae30ea84ccad..f5cf2c5fc69aa4e199eebd4d76ae76f3f8b334b6 100644 --- a/pipeline1/pipeline1.snake +++ b/pipeline1/pipeline1.snake @@ -1,6 +1,6 @@ -############################## -## USZ Snakemake pipeline 1 ## -############################## +##################################### +## USZ MelArray Snakemake pipeline ## +##################################### # # # @author: Balazs Laurenczy (for Snakemake pipeline, original bash pipeline from Phil Cheng) @@ -20,33 +20,18 @@ from datetime import datetime # set up logging logging.basicConfig(level = logging.INFO, format = '#%(asctime)s | %(levelname)-7s | %(filename)-20s: %(message)s') -logging.info('Starting snakemake pipeline ...') +logging.info('Starting MelArray Snakemake pipeline ...') # get config parameters -N_FILES = int(config.get('n', 4)) - -IN_FOLDER = config.get('input_folder', 'input') -if IN_FOLDER[:-1] != '/': IN_FOLDER += '/' # add ending slash to folder path -# check if input folder exists -if not os.path.exists(IN_FOLDER): - exit('Cannot find input folder at path "{}". Aborting.'.format(IN_FOLDER)) - -OUT_FOLDER = config.get('output_folder', 'output') -if OUT_FOLDER[:-1] != '/': OUT_FOLDER += '/' # add ending slash to folder path -# make sure folder exists -shell('mkdir -p {OUT_FOLDER}') - -TMP_FOLDER = config.get('temp_folder', 'tmp') -if TMP_FOLDER[:-1] != '/': TMP_FOLDER += '/' # add ending slash to folder path -# make sure folder exists -shell('mkdir -p {TMP_FOLDER}') - -REF_FOLDER = config.get('ref_folder', 'ref_output') -if REF_FOLDER[:-1] != '/': REF_FOLDER += '/' # add ending slash to folder path # get the list of files to process IDs, = glob_wildcards(IN_FOLDER + "{id}.fastq") +# check if IDs were not specified in config +IDs = config.get('ids', IDs) +if type(IDs) == str: + IDs = IDs.split(',') + ######### # RULES # @@ -61,18 +46,18 @@ rule all: rule step1: input: - TMP_FOLDER + '{id}.sam' + IN_FOLDER + '{id}.fastq' output: - OUT_FOLDER + '{id}.bam' + TMP_FOLDER + '{id}.sam' shell: 'cat {input} > {output}' rule step2: input: - IN_FOLDER + '{id}.fastq' - output: TMP_FOLDER + '{id}.sam' + output: + OUT_FOLDER + '{id}.bam' shell: 'cat {input} > {output}'