Newer
Older
kohleman
committed
# Needs ShortRead_1.11.42 or higher
#
# @author Manuel Kohler
# ETHZ Zurich, 2011
require(ShortRead)
require(RColorBrewer)
kohleman
committed
#require(RSvgDevice)
args <- commandArgs(TRUE)
kohleman
committed
loadFile <- function(args) {
file <- args[1]
kohleman
committed
splitPathAndFileName <- function(fullPath) {
# Splits a full path into path and file name and returns it as a list
kohleman
committed
fastqFileOnly <- basename(fullPath)
pathOnly <- dirname(fullPath)
return (c(pathOnly, fastqFileOnly))
}
kohleman
committed
subSampledReads <- function(fastqFile, samples=1e7, blocksize=2e9) {
# needed ShortRead_1.11.42
fastqFile
fqSub <- FastqSampler(fastqFile, n=samples, readerBlockSize=blocksize, verbose=FALSE)
subReads <- yield(fqSub)
return (subReads)
}
boxPlotPerCycle <- function() {
# Boxplot for per cycle Phred quality
qual <- FastqQuality(quality(quality(subReads))) # get quality scores
readM <- as(qual, "matrix") # convert scores to matrix
boxplot(readM, outline = FALSE, main="Per Cycle Read Quality", sub=fastq_only,
xlab="Cycle", ylab="Phred Quality", col=brewer.pal(11, "Spectral")[1])
return (boxplot)
}
kohleman
committed
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
nucleotidesPerCyclePlot <- function() {
# Nucleotide distribution per cycle
a <-alphabetByCycle(sread(subReads))
cycles <- dim(a)[2]
total_number_bases_first_cycle <- a[,1][[1]]+ a[,1][[2]]+a[,1][[3]]+ a[,1][[4]]
+ a[,1][[5]]
n <- c(names(a[1,1]),names(a[2,1]), names(a[3,1]), names(a[4,1]),
names(a[15,1]))
par(xpd=T, mar=par()$mar+c(0,0,0,4))
barplot(a, main="Numbers of nucleotides per cycle", sub=fastq_only,
ylab="Absolute number", col=heat.colors(5), names.arg=c(1:cycles))
legend(cycles+10, total_number_bases_first_cycle, n, fill=heat.colors(5))
return (barplot)
}
cumOccurencesPlot <- function() {
# Taken from
# http://www.bioconductor.org/help/workflows/high-throughput-sequencing/
# Calculate and plot cumulative reads vs. occurrences
#seq <- readFastq(subReads)
tbl <- tables(subReads)[[2]]
xy <- xyplot(cumsum(nReads * nOccurrences) ~ nOccurrences, tbl,
scales=list(x=list(log=TRUE)), main=fastq_only, type="b", pch=20,
xlab="Number of Occurrences",
ylab="Cumulative Number of Reads")
return(xy)
}
plotPdf <- function (plotObject) {
pdf(file=paste(fastq_file, "quality.pdf", sep="_"))
plotObject
dev.off()
}
plotSvg <- function (plotObject) {
devSVG(file = paste(fastq_file,"quality.svg", sep="_"), width = 10,
height = 8, bg = "white", fg = "black", onefile=TRUE, xmlHeader=TRUE)
plotObject
dev.off()
}
kohleman
committed
# MAIN ########################################################################
kohleman
committed
fastq_file <- loadFile(args)
subReads <- subSampledReads(fastq_file)
kohleman
committed
fastqFilePathVector <- splitPathAndFileName(fastq_file)
fastq_only <- fastqFilePathVector[2]
kohleman
committed
pdf(file=paste(fastq_file,"quality.pdf", sep="_"), paper="a4")
box <- boxPlotPerCycle()
nuc <- nucleotidesPerCyclePlot()
dev.off()
kohleman
committed
#plotPdf(box)
#plotPdf(nuc)
#plotPdf(cumOccurencesPlot())