diff --git a/deep_sequencing_unit/source/Python/fastq_quality.py b/deep_sequencing_unit/source/Python/fastq_quality.py new file mode 100755 index 0000000000000000000000000000000000000000..80064b84e5d2f1d895ca7f045a12a85203e84eb4 --- /dev/null +++ b/deep_sequencing_unit/source/Python/fastq_quality.py @@ -0,0 +1,42 @@ +#!/links/application/dsu/Python-3.2/python + +''' +Docu +''' + +import subprocess +import shlex +import os +import fnmatch +import concurrent.futures + +rscriptPath = '/links/application/dsu/R-scripts' +pwd = os.getcwd() +pattern = '*.fastq.gz' +rscript = '/links/application/dsu/R-2.13.2/bin/Rscript' +concatenationScript='/links/application/dsu/bin/concatenate_pdfs.py' +maxConcurrentJobs=5 + +def run_me(fastqFile): + (path, file) = os.path.split(fastqFile) + os.chdir(path) + args = rscript + ' --vanilla ' + rscriptPath + '/' + 'fastq_quality.R ' + file + #print(args) + SplitArgs = shlex.split(args) + p = subprocess.Popen(SplitArgs) + p.wait() + #subprocess.Popen(concatenationScript) + +def findFiles (pattern): + matches = [] + for root, dirnames, filenames in os.walk(pwd): + for filename in fnmatch.filter(filenames, pattern): + matches.append(os.path.join(root, filename)) + return matches + +def callR(): + matchingFiles = findFiles(pattern) + with concurrent.futures.ThreadPoolExecutor(max_workers=maxConcurrentJobs) as executor: + out = [executor.submit(run_me, lane) + for lane in matchingFiles] +callR() diff --git a/deep_sequencing_unit/source/R/fastq_quality.R b/deep_sequencing_unit/source/R/fastq_quality.R index a0fb93ce6dac4dcda5b7fff605640b7a23961efb..456c8a31c8eadaa9b7d0c2e59abecc763dd8cbe5 100644 --- a/deep_sequencing_unit/source/R/fastq_quality.R +++ b/deep_sequencing_unit/source/R/fastq_quality.R @@ -1,88 +1,101 @@ -require(multicore) +# Needs ShortRead_1.11.42 or higher +# +# @author Manuel Kohler +# ETHZ Zurich, 2011 + require(ShortRead) require(RColorBrewer) -require(RSvgDevice) +#require(RSvgDevice) -# ShortRead Quality assessment ################################################ args <- commandArgs(TRUE) -fastq_file <- args[1] - -# Ugly but only way to simulate a rsplit -p="" -tokens<-unlist(strsplit(fastq_file, "/")) -path_tokens<-tokens[2:(length(tokens)-1)] -fastq_only<-tokens[length(tokens)] -for (x in 1:length(path_tokens)) p<-paste(p,path_tokens[x],sep="/") - -qa <- qa(p,fastq_only, type="fastq") -qa_report <- report (qa, dest=paste(fastq_file, "ShortRead_qa", sep="_"), -type="html") - -# Boxplot for per cycle Phred quality -reads <- readFastq(fastq_file) -# if more than a million reads, sample randomly -if (length(reads) > 10000000) { - reads <- sample(reads,10000000) + +loadFile <- function(args) { + file <- args[1] } -qual <- FastqQuality(quality(quality(reads))) # get quality scores -readM <- as(qual, "matrix") # convert scores to matrix -devSVG(file = paste(fastq_file,"boxplot.svg", sep="_"), width = 10, -height = 8, bg = "white", fg = "black", onefile=TRUE, xmlHeader=TRUE) -boxplot(readM, outline = FALSE, main="Per Cycle Read Quality", sub=fastq_file, -xlab="Cycle", ylab="Phred Quality", col=brewer.pal(11, "Spectral")[1]) -dev.off() +splitPathAndFileName <- function(fullPath) { + # Splits a full path into path and file name and returns it as a list -# Save box plot as boxplot.pdf in current folder -pdf(file=paste(fastq_file,"boxplot.pdf", sep="_")) -boxplot(readM, outline = FALSE, main="Per Cycle Read Quality", sub=fastq_file, -xlab="Cycle", ylab="Phred Quality", col=brewer.pal(11, "Spectral")[1]) -dev.off() + fastqFileOnly <- basename(fullPath) + pathOnly <- dirname(fullPath) + return (c(pathOnly, fastqFileOnly)) +} -# Nucleotides per Cycle ###################################################### -a <-alphabetByCycle(sread(reads)) -cycles <- dim(a)[2] -total_number_bases_first_cycle <- a[,1][[1]]+ a[,1][[2]]+a[,1][[3]]+ a[,1][[4]] -+ a[,1][[5]] -n <- c(names(a[1,1]),names(a[2,1]), names(a[3,1]), names(a[4,1]), -names(a[15,1])) - -# Save box plot in current folder -pdf(file=paste(fastq_only,"nuc_per_cycle.pdf", sep="_")) -par(xpd=T, mar=par()$mar+c(0,0,0,4)) -barplot(a, main="Numbers of nucleotides per cycle", sub=fastq_file, -ylab="Absolute number", col=heat.colors(5), names.arg=c(1:cycles)) -legend(cycles+10, total_number_bases_first_cycle, n, fill=heat.colors(5)) -dev.off() -devSVG(file = paste(fastq_only,"nuc_per_cycle.svg", sep="_"), width = 10, -height = 8, bg = "white", fg = "black", onefile=TRUE, xmlHeader=TRUE) -par(xpd=T, mar=par()$mar+c(0,0,0,4)) -barplot(a, main="Numbers of nucleotides per cycle", sub=fastq_file, -ylab="Absolute number", col=heat.colors(5), names.arg=c(1:cycles)) -legend(cycles+10, total_number_bases_first_cycle, n, fill=heat.colors(5)) -dev.off() +subSampledReads <- function(fastqFile, samples=1e7, blocksize=2e9) { + # needed ShortRead_1.11.42 + fastqFile + fqSub <- FastqSampler(fastqFile, n=samples, readerBlockSize=blocksize, verbose=FALSE) + subReads <- yield(fqSub) + return (subReads) +} + +boxPlotPerCycle <- function() { + # Boxplot for per cycle Phred quality + qual <- FastqQuality(quality(quality(subReads))) # get quality scores + readM <- as(qual, "matrix") # convert scores to matrix + boxplot(readM, outline = FALSE, main="Per Cycle Read Quality", sub=fastq_only, + xlab="Cycle", ylab="Phred Quality", col=brewer.pal(11, "Spectral")[1]) + return (boxplot) +} +nucleotidesPerCyclePlot <- function() { + # Nucleotide distribution per cycle + a <-alphabetByCycle(sread(subReads)) + cycles <- dim(a)[2] + total_number_bases_first_cycle <- a[,1][[1]]+ a[,1][[2]]+a[,1][[3]]+ a[,1][[4]] + + a[,1][[5]] + n <- c(names(a[1,1]),names(a[2,1]), names(a[3,1]), names(a[4,1]), + names(a[15,1])) + par(xpd=T, mar=par()$mar+c(0,0,0,4)) + barplot(a, main="Numbers of nucleotides per cycle", sub=fastq_only, + ylab="Absolute number", col=heat.colors(5), names.arg=c(1:cycles)) + legend(cycles+10, total_number_bases_first_cycle, n, fill=heat.colors(5)) + return (barplot) +} + +cumOccurencesPlot <- function() { + # Taken from + # http://www.bioconductor.org/help/workflows/high-throughput-sequencing/ + # Calculate and plot cumulative reads vs. occurrences + + #seq <- readFastq(subReads) + tbl <- tables(subReads)[[2]] + xy <- xyplot(cumsum(nReads * nOccurrences) ~ nOccurrences, tbl, + scales=list(x=list(log=TRUE)), main=fastq_only, type="b", pch=20, + xlab="Number of Occurrences", + ylab="Cumulative Number of Reads") + return(xy) +} + + +plotPdf <- function (plotObject) { + pdf(file=paste(fastq_file, "quality.pdf", sep="_")) + plotObject + dev.off() +} + +plotSvg <- function (plotObject) { + devSVG(file = paste(fastq_file,"quality.svg", sep="_"), width = 10, + height = 8, bg = "white", fg = "black", onefile=TRUE, xmlHeader=TRUE) + plotObject + dev.off() +} -# Taken from -# http://www.bioconductor.org/help/workflows/high-throughput-sequencing/ +# MAIN ######################################################################## -seq <- readFastq(fastq_file) -tbl <- tables(seq)[[2]] +fastq_file <- loadFile(args) +subReads <- subSampledReads(fastq_file) -pdf(file=paste(fastq_only,"NumberOfOccurrences.pdf", sep="_")) +fastqFilePathVector <- splitPathAndFileName(fastq_file) +fastq_only <- fastqFilePathVector[2] -## Calculate and plot cumulative reads vs. occurrences -xyplot(cumsum(nReads * nOccurrences) ~ nOccurrences, tbl, -scales=list(x=list(log=TRUE)), main=fastq_only, type="b", pch=20, -xlab="Number of Occurrences", -ylab="Cumulative Number of Reads") +pdf(file=paste(fastq_file,"quality.pdf", sep="_"), paper="a4") + box <- boxPlotPerCycle() + nuc <- nucleotidesPerCyclePlot() + cumOccurencesPlot() dev.off() -devSVG(file = paste(fastq_only,"NumberOfOccurrences.svg", sep="_"), width = 10, -height = 8, bg = "white", fg = "black", onefile=TRUE, xmlHeader=TRUE) -xyplot(cumsum(nReads * nOccurrences) ~ nOccurrences, tbl, -scales=list(x=list(log=TRUE)), main=fastq_only, type="b", pch=20, -xlab="Number of Occurrences", -ylab="Cumulative Number of Reads") -dev.off() \ No newline at end of file +#plotPdf(box) +#plotPdf(nuc) +#plotPdf(cumOccurencesPlot())