Skip to content
Snippets Groups Projects
Commit 1efe67a8 authored by Balazs Laurenczy's avatar Balazs Laurenczy
Browse files

Adding MelArray pipeline

parent dd4a841e
No related branches found
No related tags found
No related merge requests found
##############################
## USZ Snakemake pipeline 1 ##
##############################
#
# original bash pipeline from Phil Cheng
# @author: Balazs Laurenczy (for Snakemake pipeline)
# @date: 2017
########
# INIT #
########
import os, logging
from timeit import default_timer as t
from datetime import datetime
# set up logging
logging.basicConfig(level = logging.INFO, format = '#%(asctime)s | %(levelname)-7s | %(filename)-20s: %(message)s')
logging.info('Starting snakemake pipeline ...')
# load the configuration file
config_file_path = os.getcwd() + "/config.yaml"
config = load_config(config_file_path)
# get the list of files to process
IDs, = glob_wildcards(IN_FOLDER + "{id}.fastq")
# check if IDs were not specified in config
IDs = config.get('ids', IDs)
if type(IDs) == str:
IDs = IDs.split(',')
#########
# RULES #
#########
rule all:
input:
expand(OUT_FOLDER + '{id}.bam', id = IDs)
run:
logging.info('Pipeline completed.')
rule step1:
input:
IN_FOLDER + '{id}.fastq'
output:
TMP_FOLDER + '{id}.sam'
shell:
'cat {input} > {output}'
rule step2:
input:
TMP_FOLDER + '{id}.sam'
output:
OUT_FOLDER + '{id}.bam'
shell:
'cat {input} > {output}'
onsuccess:
logging.info('Pipeline completed successfully.')
onerror:
logging.info('Pipeline failed.')
#############################
## Main configuration file ##
#############################
#
# This file controls all the parameters for running the pipeline.
# This file should be available on the machine on the machine running the pipeline.
#
# @author: Balazs Laurenczy
# @date: 2017
# file and folder paths
path:
# root path for the code/scripts
root: "/vagrant/MelArray"
# folder where the original fastq data is
fastq: "__ROOT__/fastq"
#!/bin/bash
rg_dir=`echo 2_skewer`
mkdir -p "$rg_dir"
foo () {
local i=$1
OUTPUT_DIR=`echo ./2_skewer`
SAMPLE=`echo $i | sed 's/\(.*\)_R1.fastq.gz/\1/'`; # sample is the full name like "YYYYMMDD.A-...-_NAME_R1
NAME=`echo $i | sed 's/[0-9]*.A-\([A-Za-z0-9-]*\)_[A-Za-z0-9_]*_R1.fastq.gz/\1/'`; # The ID
#echo ${i}
#echo ${SAMPLE}_R2.fastq.gz
#echo ${NAME}_R2.fastq.gz
# rule name skewer
skewer -t 32 -m pe -q 20 -z ${i} ${SAMPLE}_R2.fastq.gz -o ${NAME}
# 2 inputs: "R[12].fastq.gz"
# 2 outputs: "-trimmed-pair[12].fastq.gz"
}
for i in *R1.fastq.gz
do foo "$i"
done
mv *trimmed* ./2_skewer
#!/bin/bash
rg_dir=`echo 3_bwamem`
mkdir -p "$rg_dir"
# PMBC is blood (normal)
# MTB is cancer sample
#2 patients
# machine version Centos 7.2.1511
# rule name create_interval_list
# input is "config: reference bed 151127_out.bed"
# output is "melarray.interval_list"
# config: path to reference dictionary "ucsc.hg19.dict"
java -Xmx16G -jar /data/Phil/software/picard-tools-2.4.1/picard.jar BedToIntervalList I=.bed O=melarray.interval_list SD=ucsc.hg19.dict
# TODO: create another sub-pipeline to do the pre-processing steps for reference genome (.tbi, .dict)
foo () {
local i=$1
OUTPUT_DIR=`echo ./3_bwamem`
#OUTPUT_DIR2=`echo ./GATK_RNA_dedup/`
SAMPLE=`echo $i | sed 's/\([A-Za-z0-9]*\)-trimmed-pair1.fastq.gz/\1/'`; # output name
# INPUT: "-trimmed-pair[12].fastq.gz"
# output is "[ID]_sorted.bam"
# rule name bwa_bam
# config: header read group, separate into 4 groups: ID, LB, PL, PU "@RG\tID:Melanoma\tLB:Melarray\tSM:${SAMPLE}\tPL:ILLUMINA\tPU:HiSEQ4000"
# config: reference fasta path /data/Phil/ref_phil/GATK_resource/hg19/ucsc.hg19.fasta
# config: path to picard jar /data/Phil/software/picard-tools-2.4.1/picard.jar
# config: testing parameter: align it only against chromosome 1
bwa mem -t 16 -M -R "@RG\tID:Melanoma\tLB:Melarray\tSM:${SAMPLE}\tPL:ILLUMINA\tPU:HiSEQ4000" /data/Phil/ref_phil/GATK_resource/hg19/ucsc.hg19.fasta ./${i} ./${SAMPLE}-trimmed-pair2.fastq.gz | java -Xmx16G -jar /data/Phil/software/picard-tools-2.4.1/picard.jar SortSam I=/dev/stdin O=${SAMPLE}_sorted.bam CREATE_INDEX=true VALIDATION_STRINGENCY=SILENT SO=coordinate
}
export -f foo
for i in *-trimmed-pair1.fastq.gz
do
sem -j 1 foo "$i"
done
sem --wait
rg_dir2=`echo 4_dedup`
mkdir -p "$rg_dir2"
foo2 () {
local i=$1
#OUTPUT_DIR=`echo 4_dedup/`
#OUTPUT_DIR2=`echo ./GATK_RNA_dedup/`
NAME=`echo $i | sed 's/\([A-Za-z0-9-]*\)_sorted.bam/\1/'`;
echo ${NAME}
# separate metrics log file
# rule name mark_duplicates
# input is "sorted/[ID]_sorted.bam"
# output is "sorted_dedup/[ID]_sorted_dedup.bam"
# config: path to picard jar /data/Phil/software/picard-tools-2.4.1/picard.jar
java -Xmx16G -jar /data/Phil/software/picard-tools-2.4.1/picard.jar MarkDuplicates I=${i} O=${NAME}_sorted_dedup.bam M=${NAME}_metrics.txt
}
export -f foo2
for i in *sorted.bam
do
sem -j 16 --id dedup foo2 "$i"
done
sem --wait --id dedup
rg_dir3=`echo 5_fixmate`
mkdir -p "$rg_dir3"
foo3 () {
local i=$1
OUTPUT_DIR=`echo ./5_fixmate/`
#OUTPUT_DIR2=`echo ./GATK_RNA_dedup/`
NAME=`echo $i | sed 's/\([A-Za-z0-9-]*\)_sorted_dedup.bam/\1/'`;
echo ${NAME}
# rule name fix_mate
# input is "sorted_dedup/[ID]_sorted_dedup.bam"
# output is "sorted_dedup_fixmate/[ID]_sorted_dedup_fixmate.bam"
# config: path to picard jar /data/Phil/software/picard-tools-2.4.1/picard.jar
java -Xmx16G -jar /data/Phil/software/picard-tools-2.4.1/picard.jar FixMateInformation VALIDATION_STRINGENCY=SILENT CREATE_INDEX=true SORT_ORDER=coordinate I=${i} O=${NAME}_sorted_dedup_fixmate.bam
}
export -f foo3
for i in *sorted_dedup.bam
do
sem -j 16 --id fixmate foo3 "$i"
done
sem --wait --id fixmate
rg_dir4=`echo 6_bqsr`
mkdir -p "$rg_dir4"
foo4 () {
local i=$1
NAME=`echo $i | sed 's/\([A-Za-z0-9-]*\)_sorted_dedup_fixmate.bam/\1/'`;
echo ${NAME}
# rule name base_recal
# input is "sorted_dedup_fixmate/[ID]_sorted_dedup_fixmate.bam"
# output is "recal_table/[ID]_recal_data.table"
# config: path to gatk jar /data/Phil/software/GATK_3.6/GenomeAnalysisTK.jar
# config: reference fasta path /data/Phil/ref_phil/GATK_resource/hg19/ucsc.hg19.fasta
# config: reference dbsnip /data/Phil/ref_phil/GATK_resource/hg19/dbsnp_138.hg19.vcf.gz
# config: reference Mills /data/Phil/ref_phil/GATK_resource/hg19/Mills_and_1000G_gold_standard.indels.hg19.sites.vcf.gz
# config: reference 1000G /data/Phil/ref_phil/GATK_resource/hg19/1000G_phase1.snps.high_confidence.hg19.sites.vcf.gz
# config: reference bed 151127_out.bed
# config: interval_padding 100
java -Xmx16G -jar /data/Phil/software/GATK_3.6/GenomeAnalysisTK.jar -T BaseRecalibrator -R /data/Phil/ref_phil/GATK_resource/hg19/ucsc.hg19.fasta -I ${i} -knownSites /data/Phil/ref_phil/GATK_resource/hg19/dbsnp_138.hg19.vcf.gz -knownSites /data/Phil/ref_phil/GATK_resource/hg19/Mills_and_1000G_gold_standard.indels.hg19.sites.vcf.gz -knownSites /data/Phil/ref_phil/GATK_resource/hg19/1000G_phase1.snps.high_confidence.hg19.sites.vcf.gz -L 151127_out.bed --interval_padding 100 -o ${NAME}_recal_data.table
}
export -f foo4
for i in *sorted_dedup_fixmate.bam
do
sem -j 16 foo4 "$i"
done
sem --wait
foo5 () {
local i=$1
NAME=`echo $i | sed 's/\([A-Za-z0-9-]*\)_sorted_dedup_fixmate.bam/\1/'`;
# rule name post_recal
# input is "sorted_dedup_fixmate/[ID]_sorted_dedup_fixmate.bam"
# 2nd input is "recal_table/[ID]_recal_data.table"
# output is "post_recal_table/[ID]_post_recal_data.table"
# config: path to gatk jar /data/Phil/software/GATK_3.6/GenomeAnalysisTK.jar
# config: reference fasta path /data/Phil/ref_phil/GATK_resource/hg19/ucsc.hg19.fasta
# config: reference dbsnip /data/Phil/ref_phil/GATK_resource/hg19/dbsnp_138.hg19.vcf.gz
# config: reference Mills /data/Phil/ref_phil/GATK_resource/hg19/Mills_and_1000G_gold_standard.indels.hg19.sites.vcf.gz
# config: reference 1000G /data/Phil/ref_phil/GATK_resource/hg19/1000G_phase1.snps.high_confidence.hg19.sites.vcf.gz
# config: reference bed 151127_out.bed
# config: interval_padding 100
java -Xmx16G -jar /data/Phil/software/GATK_3.6/GenomeAnalysisTK.jar -T BaseRecalibrator -R /data/Phil/ref_phil/GATK_resource/hg19/ucsc.hg19.fasta -I ${i} -knownSites /data/Phil/ref_phil/GATK_resource/hg19/dbsnp_138.hg19.vcf.gz -knownSites /data/Phil/ref_phil/GATK_resource/hg19/Mills_and_1000G_gold_standard.indels.hg19.sites.vcf.gz -knownSites /data/Phil/ref_phil/GATK_resource/hg19/1000G_phase1.snps.high_confidence.hg19.sites.vcf.gz -L 151127_out.bed --interval_padding 100 -BQSR ${NAME}_recal_data.table -o ${NAME}_post_recal_data.table
}
export -f foo5
for i in *sorted_dedup_fixmate.bam
do
sem -j 16 foo5 "$i"
done
sem --wait
foo6 () {
local i=$1
NAME=`echo $i | sed 's/\([A-Za-z0-9-]*\)_sorted_dedup_fixmate.bam/\1/'`;
# rule name analyze_covariates
# input is "sorted_dedup_fixmate/[ID]_sorted_dedup_fixmate.bam"
# 2nd input is "recal_table/[ID]_recal_data.table"
# 3nd input is "post_recal_table/[ID]_post_recal_data.table"
# output is "recal_plots/[ID]_recalibration_plots.pdf"
# config: path to gatk jar /data/Phil/software/GATK_3.6/GenomeAnalysisTK.jar
# config: reference fasta path /data/Phil/ref_phil/GATK_resource/hg19/ucsc.hg19.fasta
java -Xmx16G -jar /data/Phil/software/GATK_3.6/GenomeAnalysisTK.jar -T AnalyzeCovariates -R /data/Phil/ref_phil/GATK_resource/hg19/ucsc.hg19.fasta -before ${NAME}_recal_data.table -after ${NAME}_post_recal_data.table -plots ${NAME}_recalibration_plots.pdf
}
export -f foo6
for i in *sorted_dedup_fixmate.bam
do
sem -j 16 foo6 "$i"
done
sem --wait
foo7 () {
local i=$1
OUTPUT_DIR=`echo GATK_BQSR/`
NAME=`echo $i | sed 's/\([A-Za-z0-9-]*\)_sorted_dedup_fixmate.bam/\1/'`;
# rule name print_reads
# input is "sorted_dedup_fixmate/[ID]_sorted_dedup_fixmate.bam"
# 2nd input is "_recal_data.table"
# output is "sorted_dedup_fixmate_bqsr/[ID]_sorted_dedup_fixmate_bqsr.bam"
# config: path to gatk jar /data/Phil/software/GATK_3.6/GenomeAnalysisTK.jar
# config: reference fasta path /data/Phil/ref_phil/GATK_resource/hg19/ucsc.hg19.fasta
java -Xmx16G -jar /data/Phil/software/GATK_3.6/GenomeAnalysisTK.jar -T PrintReads -R /data/Phil/ref_phil/GATK_resource/hg19/ucsc.hg19.fasta -I ${i} -BQSR ${NAME}_recal_data.table -o ${NAME}_sorted_dedup_fixmate_bqsr.bam
}
export -f foo7
for i in *sorted_dedup_fixmate.bam
do
sem -j 16 foo7 "$i"
done
sem --wait
rg_dir5=`echo QC_reports`
mkdir -p "$rg_dir5"
foo8 () {
local i=$1
NAME=`echo $i | sed 's/\([A-Za-z0-9-]*\)_sorted_dedup_fixmate_bqsr.bam/\1/'`;
# rule name qc_bam
# input is "sorted_dedup_fixmate/[ID]_sorted_dedup_fixmate.bam"
# 2nd input is "melarray.interval_list"
# output is "_hs_metrics.txt"
# config: path to picard jar /data/Phil/software/picard-tools-2.4.1/picard.jar
# config: reference fasta path /data/Phil/ref_phil/GATK_resource/hg19/ucsc.hg19.fasta
java -Xmx16G -jar /data/Phil/software/picard-tools-2.4.1/picard.jar CollectHsMetrics I=${i} O=${NAME}_hs_metrics.txt R=/data/Phil/ref_phil/GATK_resource/hg19/ucsc.hg19.fasta BAIT_INTERVALS=melarray.interval_list TARGET_INTERVALS=melarray.interval_list
# rule name collect_alignment_summary_metrics
# input is "sorted_dedup_fixmate/[ID]_sorted_dedup_fixmate.bam"
# output is "_aln_metrics.txt"
# config: path to picard jar /data/Phil/software/picard-tools-2.4.1/picard.jar
# config: reference fasta path /data/Phil/ref_phil/GATK_resource/hg19/ucsc.hg19.fasta
java -Xmx16G -jar /data/Phil/software/picard-tools-2.4.1/picard.jar CollectAlignmentSummaryMetrics I=${i} O=${NAME}_aln_metrics.txt R=/data/Phil/ref_phil/GATK_resource/hg19/ucsc.hg19.fasta
}
export -f foo8
for i in *sorted_dedup_fixmate_bqsr.bam
do
sem -j 32 foo8 "$i"
done
sem --wait
# rule name multi_qc
# input is all the folders
# output html file and multiqc_data folder
multiqc {folder1} {folder2}
# rule name haplotype_caller
# input is "sorted_dedup_fixmate_bqsr/[ID]_sorted_dedup_fixmate_bqsr.bam"
# output is "hc/${NAME}_raw.g.vcf"
# config: path to gatk jar /data/Phil/software/GATK_3.6/GenomeAnalysisTK.jar
# config: reference fasta path /data/Phil/ref_phil/GATK_resource/hg19/ucsc.hg19.fasta
# config: reference dbsnp /data/Phil/ref_phil/GATK_resource/hg19/dbsnp_138.hg19.vcf.gz
# config: emitRefConfidence GVCF
# config: variant_index_type LINEAR
# config: variant_index_parameter 128000
java -Xmx16G -jar /data/Phil/software/GATK_3.6/GenomeAnalysisTK.jar -T HaplotypeCaller -R /data/Phil/ref_phil/GATK_resource/hg19/ucsc.hg19.fasta -I ${i} --emitRefConfidence GVCF --variant_index_type LINEAR --variant_index_parameter 128000 -o ${NAME}_raw.g.vcf --dbsnp /data/Phil/ref_phil/GATK_resource/hg19/dbsnp_138.hg19.vcf.gz
# rule name MuTect2
# read in _tumor_normal file at the beginning and store IDs in memory
# both inputs are "sorted_dedup_fixmate_bqsr/[ID]_sorted_dedup_fixmate_bqsr.bam" but with different IDs, tumor is always present
# 3rd input reference bed from config 151127_out.bed
# output MuTect2/${t1}_${n1}/${t1}_mutect2.vcf
# config: path to gatk jar /data/Phil/software/GATK_3.6/GenomeAnalysisTK.jar
# config: reference fasta path /data/Phil/ref_phil/GATK_resource/hg19/ucsc.hg19.fasta
# config: reference dbsnp /data/Phil/ref_phil/GATK_resource/hg19/dbsnp_138.hg19.vcf.gz
# config: reference cosmic /data/Phil/ref_phil/GATK_resource/hg19/CosmicCodingMuts_chr_M_sorted.vcf.gz
java -Xmx16G -jar /data/Phil/software/GATK_3.6/GenomeAnalysisTK.jar -T MuTect2 -R /data/Phil/ref_phil/GATK_resource/hg19/ucsc.hg19.fasta -I:tumor ${t1}_sorted_dedup_fixmate_bqsr.bam -I:normal ${n1}_sorted_dedup_fixmate_bqsr.bam -o MuTect2/${t1}_${n1}/${t1}_mutect2.vcf --dbsnp /data/Phil/ref_phil/GATK_resource/hg19/dbsnp_138.hg19.vcf.gz --cosmic /data/Phil/ref_phil/GATK_resource/hg19/CosmicCodingMuts_chr_M_sorted.vcf.gz -L 151127_out.bed
mv *sorted.bam 3_bwamem
mv *sorted.bai 3_bwamem
mv *metrics.txt 4_dedup
mv *sorted_dedup.bam 4_dedup
mv *sorted_dedup_fixmate.bam 5_fixmate
mv *sorted_dedup_fixmate.bai 5_fixmate
mv *.table 6_bqsr
mv *plots.pdf 6_bqsr
mv *sorted_dedup_fixmate_bqsr.bam 6_bqsr
mv *sorted_dedup_fixmate_bqsr.bai 6_bqsr
mv *pcr_metrics.txt QC_reports
mv *aln_metrics.txt QC_reports
mv *hs_metrics.txt QC_reports
import logging
import yaml
def load_config(config_file_path):
"""
Load the YAML configuration file and return a dictionary with all the configuration parameters.
@author: Balazs Laurenczy
@date: 2017
"""
logging.debug("Loading config file from %s", config_file_path)
with open(config_file_path, 'r') as config_file:
config = yaml.load(config_file)
# initialize the path(s) by replacing the '__ROOT__' tag with the actual root path
config["path"]["fastq"] = config["path"]["fastq"].replace("__ROOT__", config["path"]['root'])
return config
\ No newline at end of file
# -*- mode: ruby -*-
# vi: set ft=ruby :
# All Vagrant configuration is done below. The "2" in Vagrant.configure
# configures the configuration version (we support older styles for
# backwards compatibility). Please don't change it unless you know what
# you're doing.
Vagrant.configure("2") do |config|
# The most common configuration options are documented and commented below.
# For a complete reference, please see the online documentation at
# https://docs.vagrantup.com.
# Every Vagrant development environment requires a box. You can search for
# boxes at https://atlas.hashicorp.com/search.
config.vm.box = "bento/centos-7.2"
# Disable automatic box update checking. If you disable this, then
# boxes will only be checked for updates when the user runs
# `vagrant box outdated`. This is not recommended.
# config.vm.box_check_update = false
# Create a forwarded port mapping which allows access to a specific port
# within the machine from a port on the host machine. In the example below,
# accessing "localhost:8080" will access port 80 on the guest machine.
# config.vm.network "forwarded_port", guest: 80, host: 8080
# Create a private network, which allows host-only access to the machine
# using a specific IP.
# config.vm.network "private_network", ip: "192.168.33.10"
# Create a public network, which generally matched to bridged network.
# Bridged networks make the machine appear as another physical device on
# your network.
# config.vm.network "public_network"
# Share an additional folder to the guest VM. The first argument is
# the path on the host to the actual folder. The second argument is
# the path on the guest to mount the folder. And the optional third
# argument is a set of non-required options.
# config.vm.synced_folder "src", "target"
# Provider-specific configuration so you can fine-tune various
# backing providers for Vagrant. These expose provider-specific options.
# Example for VirtualBox:
#
# config.vm.provider "virtualbox" do |vb|
# # Display the VirtualBox GUI when booting the machine
# vb.gui = true
#
# # Customize the amount of memory on the VM:
# vb.memory = "1024"
# end
#
# View the documentation for the provider you are using for more
# information on available options.
# Define a Vagrant Push strategy for pushing to Atlas. Other push strategies
# such as FTP and Heroku are also available. See the documentation at
# https://docs.vagrantup.com/v2/push/atlas.html for more information.
# config.push.define "atlas" do |push|
# push.app = "YOUR_ATLAS_USERNAME/YOUR_APPLICATION_NAME"
# end
# Enable provisioning with a shell script. Additional provisioners such as
# Puppet, Chef, Ansible, Salt, and Docker are also available. Please see the
# documentation for more information about their specific syntax and use.
# config.vm.provision "shell", inline: <<-SHELL
# apt-get update
# apt-get install -y apache2
# SHELL
end
#!/bin/bash
rg_dir=`echo 2_skewer`
mkdir -p "$rg_dir"
foo () {
local i=$1
OUTPUT_DIR=`echo ./2_skewer`
SAMPLE=`echo $i | sed 's/\(.*\)_R1.fastq.gz/\1/'`;
NAME=`echo $i | sed 's/[0-9]*.A-\([A-Za-z0-9-]*\)_[A-Za-z0-9_]*_R1.fastq.gz/\1/'`;
#echo ${i}
#echo ${SAMPLE}_R2.fastq.gz
#echo ${NAME}_R2.fastq.gz
skewer -t 32 -m pe -q 20 -z ${i} ${SAMPLE}_R2.fastq.gz -o ${NAME}
}
for i in *R1.fastq.gz
do foo "$i"
done
mv *trimmed* ./2_skewer
#!/bin/bash
rg_dir=`echo 3_bwamem`
mkdir -p "$rg_dir"
foo () {
local i=$1
OUTPUT_DIR=`echo ./3_bwamem`
#OUTPUT_DIR2=`echo ./GATK_RNA_dedup/`
SAMPLE=`echo $i | sed 's/\([A-Za-z0-9]*\)-trimmed-pair1.fastq.gz/\1/'`;
#NAME=`echo $i | sed 's/[0-9]*.[A-Z]-\([A-Za-z0-9-]*\)_R1.fastq-trimmed-pair1.fastq.gz/\1/'`;
#dir = `echo pwd`
#echo ${SAMPLE}
#echo $NAME
#echo ${i}
#echo ./${SAMPLE}_R1.fastq-trimmed-pair2.fastq.gz
bwa mem -t 16 -M -R "@RG\tID:Melanoma\tLB:Melarray\tSM:${SAMPLE}\tPL:ILLUMINA\tPU:HiSEQ4000" /data/Phil/ref_phil/GATK_resource/hg19/ucsc.hg19.fasta ./${i} ./${SAMPLE}-trimmed-pair2.fastq.gz | java -Xmx16G -jar /data/Phil/software/picard-tools-2.4.1/picard.jar SortSam I=/dev/stdin O=${SAMPLE}_sorted.bam CREATE_INDEX=true VALIDATION_STRINGENCY=SILENT SO=coordinate
}
export -f foo
for i in *-trimmed-pair1.fastq.gz
do
sem -j 1 foo "$i"
done
sem --wait
rg_dir2=`echo 4_dedup`
mkdir -p "$rg_dir2"
foo2 () {
local i=$1
#OUTPUT_DIR=`echo 4_dedup/`
#OUTPUT_DIR2=`echo ./GATK_RNA_dedup/`
NAME=`echo $i | sed 's/\([A-Za-z0-9-]*\)_sorted.bam/\1/'`;
echo ${NAME}
java -Xmx16G -jar /data/Phil/software/picard-tools-2.4.1/picard.jar MarkDuplicates I=${i} O=${NAME}_sorted_dedup.bam M=${NAME}_metrics.txt
}
export -f foo2
for i in *sorted.bam
do
sem -j 16 --id dedup foo2 "$i"
done
sem --wait --id dedup
rg_dir3=`echo 5_fixmate`
mkdir -p "$rg_dir3"
foo3 () {
local i=$1
OUTPUT_DIR=`echo ./5_fixmate/`
#OUTPUT_DIR2=`echo ./GATK_RNA_dedup/`
NAME=`echo $i | sed 's/\([A-Za-z0-9-]*\)_sorted_dedup.bam/\1/'`;
echo ${NAME}
java -Xmx16G -jar /data/Phil/software/picard-tools-2.4.1/picard.jar FixMateInformation VALIDATION_STRINGENCY=SILENT CREATE_INDEX=true SORT_ORDER=coordinate I=${i} O=${NAME}_sorted_dedup_fixmate.bam
}
export -f foo3
for i in *sorted_dedup.bam
do
sem -j 16 --id fixmate foo3 "$i"
done
sem --wait --id fixmate
rg_dir4=`echo 6_bqsr`
mkdir -p "$rg_dir4"
foo4 () {
local i=$1
NAME=`echo $i | sed 's/\([A-Za-z0-9-]*\)_sorted_dedup_fixmate.bam/\1/'`;
echo ${NAME}
java -Xmx16G -jar /data/Phil/software/GATK_3.6/GenomeAnalysisTK.jar -T BaseRecalibrator -R /data/Phil/ref_phil/GATK_resource/hg19/ucsc.hg19.fasta -I ${i} -knownSites /data/Phil/ref_phil/GATK_resource/hg19/dbsnp_138.hg19.vcf.gz -knownSites /data/Phil/ref_phil/GATK_resource/hg19/Mills_and_1000G_gold_standard.indels.hg19.sites.vcf.gz -knownSites /data/Phil/ref_phil/GATK_resource/hg19/1000G_phase1.snps.high_confidence.hg19.sites.vcf.gz -L 151127_out.bed --interval_padding 100 -o ${NAME}_recal_data.table
}
export -f foo4
for i in *sorted_dedup_fixmate.bam
do
sem -j 16 foo4 "$i"
done
sem --wait
foo5 () {
local i=$1
NAME=`echo $i | sed 's/\([A-Za-z0-9-]*\)_sorted_dedup_fixmate.bam/\1/'`;
java -Xmx16G -jar /data/Phil/software/GATK_3.6/GenomeAnalysisTK.jar -T BaseRecalibrator -R /data/Phil/ref_phil/GATK_resource/hg19/ucsc.hg19.fasta -I ${i} -knownSites /data/Phil/ref_phil/GATK_resource/hg19/dbsnp_138.hg19.vcf.gz -knownSites /data/Phil/ref_phil/GATK_resource/hg19/Mills_and_1000G_gold_standard.indels.hg19.sites.vcf.gz -knownSites /data/Phil/ref_phil/GATK_resource/hg19/1000G_phase1.snps.high_confidence.hg19.sites.vcf.gz -L 151127_out.bed --interval_padding 100 -BQSR ${NAME}_recal_data.table -o ${NAME}_post_recal_data.table
}
export -f foo5
for i in *sorted_dedup_fixmate.bam
do
sem -j 16 foo5 "$i"
done
sem --wait
foo6 () {
local i=$1
NAME=`echo $i | sed 's/\([A-Za-z0-9-]*\)_sorted_dedup_fixmate.bam/\1/'`;
java -Xmx16G -jar /data/Phil/software/GATK_3.6/GenomeAnalysisTK.jar -T AnalyzeCovariates -R /data/Phil/ref_phil/GATK_resource/hg19/ucsc.hg19.fasta -before ${NAME}_recal_data.table -after ${NAME}_post_recal_data.table -plots ${NAME}_recalibration_plots.pdf
}
export -f foo6
for i in *sorted_dedup_fixmate.bam
do
sem -j 16 foo6 "$i"
done
sem --wait
foo7 () {
local i=$1
OUTPUT_DIR=`echo GATK_BQSR/`
NAME=`echo $i | sed 's/\([A-Za-z0-9-]*\)_sorted_dedup_fixmate.bam/\1/'`;
java -Xmx16G -jar /data/Phil/software/GATK_3.6/GenomeAnalysisTK.jar -T PrintReads -R /data/Phil/ref_phil/GATK_resource/hg19/ucsc.hg19.fasta -I ${i} -BQSR ${NAME}_recal_data.table -o ${NAME}_sorted_dedup_fixmate_bqsr.bam
}
export -f foo7
for i in *sorted_dedup_fixmate.bam
do
sem -j 16 foo7 "$i"
done
sem --wait
rg_dir5=`echo QC_reports`
mkdir -p "$rg_dir5"
foo8 () {
local i=$1
NAME=`echo $i | sed 's/\([A-Za-z0-9-]*\)_sorted_dedup_fixmate_bqsr.bam/\1/'`;
java -Xmx16G -jar /data/Phil/software/picard-tools-2.4.1/picard.jar CollectHsMetrics I=${i} O=${NAME}_hs_metrics.txt R=/data/Phil/ref_phil/GATK_resource/hg19/ucsc.hg19.fasta BAIT_INTERVALS=melarray.interval_list TARGET_INTERVALS=melarray.interval_list
java -Xmx16G -jar /data/Phil/software/picard-tools-2.4.1/picard.jar CollectAlignmentSummaryMetrics I=${i} O=${NAME}_aln_metrics.txt R=/data/Phil/ref_phil/GATK_resource/hg19/ucsc.hg19.fasta
}
export -f foo8
for i in *sorted_dedup_fixmate_bqsr.bam
do
sem -j 32 foo8 "$i"
done
sem --wait
mv *sorted.bam 3_bwamem
mv *sorted.bai 3_bwamem
mv *metrics.txt 4_dedup
mv *sorted_dedup.bam 4_dedup
mv *sorted_dedup_fixmate.bam 5_fixmate
mv *sorted_dedup_fixmate.bai 5_fixmate
mv *.table 6_bqsr
mv *plots.pdf 6_bqsr
mv *sorted_dedup_fixmate_bqsr.bam 6_bqsr
mv *sorted_dedup_fixmate_bqsr.bai 6_bqsr
mv *pcr_metrics.txt QC_reports
mv *aln_metrics.txt QC_reports
mv *hs_metrics.txt QC_reports
############################## #####################################
## USZ Snakemake pipeline 1 ## ## USZ MelArray Snakemake pipeline ##
############################## #####################################
# #
# #
# @author: Balazs Laurenczy (for Snakemake pipeline, original bash pipeline from Phil Cheng) # @author: Balazs Laurenczy (for Snakemake pipeline, original bash pipeline from Phil Cheng)
...@@ -20,33 +20,18 @@ from datetime import datetime ...@@ -20,33 +20,18 @@ from datetime import datetime
# set up logging # set up logging
logging.basicConfig(level = logging.INFO, format = '#%(asctime)s | %(levelname)-7s | %(filename)-20s: %(message)s') logging.basicConfig(level = logging.INFO, format = '#%(asctime)s | %(levelname)-7s | %(filename)-20s: %(message)s')
logging.info('Starting snakemake pipeline ...') logging.info('Starting MelArray Snakemake pipeline ...')
# get config parameters # get config parameters
N_FILES = int(config.get('n', 4))
IN_FOLDER = config.get('input_folder', 'input')
if IN_FOLDER[:-1] != '/': IN_FOLDER += '/' # add ending slash to folder path
# check if input folder exists
if not os.path.exists(IN_FOLDER):
exit('Cannot find input folder at path "{}". Aborting.'.format(IN_FOLDER))
OUT_FOLDER = config.get('output_folder', 'output')
if OUT_FOLDER[:-1] != '/': OUT_FOLDER += '/' # add ending slash to folder path
# make sure folder exists
shell('mkdir -p {OUT_FOLDER}')
TMP_FOLDER = config.get('temp_folder', 'tmp')
if TMP_FOLDER[:-1] != '/': TMP_FOLDER += '/' # add ending slash to folder path
# make sure folder exists
shell('mkdir -p {TMP_FOLDER}')
REF_FOLDER = config.get('ref_folder', 'ref_output')
if REF_FOLDER[:-1] != '/': REF_FOLDER += '/' # add ending slash to folder path
# get the list of files to process # get the list of files to process
IDs, = glob_wildcards(IN_FOLDER + "{id}.fastq") IDs, = glob_wildcards(IN_FOLDER + "{id}.fastq")
# check if IDs were not specified in config
IDs = config.get('ids', IDs)
if type(IDs) == str:
IDs = IDs.split(',')
######### #########
# RULES # # RULES #
...@@ -61,18 +46,18 @@ rule all: ...@@ -61,18 +46,18 @@ rule all:
rule step1: rule step1:
input: input:
TMP_FOLDER + '{id}.sam' IN_FOLDER + '{id}.fastq'
output: output:
OUT_FOLDER + '{id}.bam' TMP_FOLDER + '{id}.sam'
shell: shell:
'cat {input} > {output}' 'cat {input} > {output}'
rule step2: rule step2:
input: input:
IN_FOLDER + '{id}.fastq'
output:
TMP_FOLDER + '{id}.sam' TMP_FOLDER + '{id}.sam'
output:
OUT_FOLDER + '{id}.bam'
shell: shell:
'cat {input} > {output}' 'cat {input} > {output}'
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment