initial check in, needs to be re-factored but does the job for now

SVN: 21841

initial check in, needs to be re-factored but does the job for now
50aaf9eb · kohleman · be9fe9c2 · 50aaf9eb
Commit 50aaf9eb authored 13 years ago by kohleman
--- a/deep_sequencing_unit/source/R/fastq_quality.R
+++ b/deep_sequencing_unit/source/R/fastq_quality.R
+require(multicore)
+require(ShortRead)
+require(RColorBrewer)
+require(RSvgDevice)
+
+# ShortRead Quality assessment ################################################
+args <- commandArgs(TRUE)
+fastq_file <- args[1]
+
+# Ugly but only way to simulate a rsplit
+p=""
+tokens<-unlist(strsplit(fastq_file, "/"))
+path_tokens<-tokens[2:(length(tokens)-1)]
+fastq_only<-tokens[length(tokens)]
+for (x in 1:length(path_tokens)) p<-paste(p,path_tokens[x],sep="/")
+
+qa <- qa(p,fastq_only, type="fastq")
+qa_report <- report (qa, dest=paste(fastq_file, "ShortRead_qa", sep="_"),
+type="html")
+
+# Boxplot for per cycle Phred quality
+reads <- readFastq(fastq_file)
+# if more than a million reads, sample randomly
+if (length(reads) > 10000000) {
+	reads <- sample(reads,10000000)
+}
+qual <- FastqQuality(quality(quality(reads))) # get quality scores
+readM <- as(qual, "matrix") # convert scores to matrix
+
+devSVG(file = paste(fastq_file,"boxplot.svg", sep="_"), width = 10,
+height = 8, bg = "white", fg = "black", onefile=TRUE, xmlHeader=TRUE)
+boxplot(readM, outline = FALSE, main="Per Cycle Read Quality", sub=fastq_file,
+xlab="Cycle", ylab="Phred Quality", col=brewer.pal(11, "Spectral")[1])
+dev.off()
+
+# Save box plot as boxplot.pdf in current folder
+pdf(file=paste(fastq_file,"boxplot.pdf", sep="_"))
+boxplot(readM, outline = FALSE, main="Per Cycle Read Quality", sub=fastq_file,
+xlab="Cycle", ylab="Phred Quality", col=brewer.pal(11, "Spectral")[1])
+dev.off()
+
+# Nucleotides per Cycle  ######################################################
+a <-alphabetByCycle(sread(reads))
+cycles <- dim(a)[2]
+total_number_bases_first_cycle <- a[,1][[1]]+ a[,1][[2]]+a[,1][[3]]+ a[,1][[4]]
+ a[,1][[5]]
+n <- c(names(a[1,1]),names(a[2,1]), names(a[3,1]), names(a[4,1]),
+names(a[15,1]))
+
+# Save box plot in current folder
+pdf(file=paste(fastq_only,"nuc_per_cycle.pdf", sep="_"))
+par(xpd=T, mar=par()$mar+c(0,0,0,4))
+barplot(a, main="Numbers of nucleotides per cycle", sub=fastq_file,
+ylab="Absolute number",  col=heat.colors(5), names.arg=c(1:cycles))
+legend(cycles+10, total_number_bases_first_cycle, n, fill=heat.colors(5))
+dev.off()
+
+devSVG(file = paste(fastq_only,"nuc_per_cycle.svg", sep="_"), width = 10,
+height = 8, bg = "white", fg = "black", onefile=TRUE, xmlHeader=TRUE)
+par(xpd=T, mar=par()$mar+c(0,0,0,4))
+barplot(a, main="Numbers of nucleotides per cycle", sub=fastq_file,
+ylab="Absolute number",  col=heat.colors(5), names.arg=c(1:cycles))
+legend(cycles+10, total_number_bases_first_cycle, n, fill=heat.colors(5))
+dev.off()
+
+
+# Taken from
+# http://www.bioconductor.org/help/workflows/high-throughput-sequencing/
+
+seq <- readFastq(fastq_file)
+tbl <- tables(seq)[[2]]
+
+pdf(file=paste(fastq_only,"NumberOfOccurrences.pdf", sep="_"))
+
+## Calculate and plot cumulative reads vs. occurrences
+xyplot(cumsum(nReads * nOccurrences) ~ nOccurrences, tbl, 
+scales=list(x=list(log=TRUE)), main=fastq_only, type="b", pch=20,
+xlab="Number of Occurrences", 
+ylab="Cumulative Number of Reads")
+dev.off()
+
+devSVG(file = paste(fastq_only,"NumberOfOccurrences.svg", sep="_"), width = 10,
+height = 8, bg = "white", fg = "black", onefile=TRUE, xmlHeader=TRUE)
+xyplot(cumsum(nReads * nOccurrences) ~ nOccurrences, tbl,
+scales=list(x=list(log=TRUE)), main=fastq_only, type="b", pch=20,
+xlab="Number of Occurrences",
+ylab="Cumulative Number of Reads")
+dev.off()
\ No newline at end of file