diff --git a/deep_sequencing_unit/source/R/fastq_quality.R b/deep_sequencing_unit/source/R/fastq_quality.R new file mode 100644 index 0000000000000000000000000000000000000000..a0fb93ce6dac4dcda5b7fff605640b7a23961efb --- /dev/null +++ b/deep_sequencing_unit/source/R/fastq_quality.R @@ -0,0 +1,88 @@ +require(multicore) +require(ShortRead) +require(RColorBrewer) +require(RSvgDevice) + +# ShortRead Quality assessment ################################################ +args <- commandArgs(TRUE) +fastq_file <- args[1] + +# Ugly but only way to simulate a rsplit +p="" +tokens<-unlist(strsplit(fastq_file, "/")) +path_tokens<-tokens[2:(length(tokens)-1)] +fastq_only<-tokens[length(tokens)] +for (x in 1:length(path_tokens)) p<-paste(p,path_tokens[x],sep="/") + +qa <- qa(p,fastq_only, type="fastq") +qa_report <- report (qa, dest=paste(fastq_file, "ShortRead_qa", sep="_"), +type="html") + +# Boxplot for per cycle Phred quality +reads <- readFastq(fastq_file) +# if more than a million reads, sample randomly +if (length(reads) > 10000000) { + reads <- sample(reads,10000000) +} +qual <- FastqQuality(quality(quality(reads))) # get quality scores +readM <- as(qual, "matrix") # convert scores to matrix + +devSVG(file = paste(fastq_file,"boxplot.svg", sep="_"), width = 10, +height = 8, bg = "white", fg = "black", onefile=TRUE, xmlHeader=TRUE) +boxplot(readM, outline = FALSE, main="Per Cycle Read Quality", sub=fastq_file, +xlab="Cycle", ylab="Phred Quality", col=brewer.pal(11, "Spectral")[1]) +dev.off() + +# Save box plot as boxplot.pdf in current folder +pdf(file=paste(fastq_file,"boxplot.pdf", sep="_")) +boxplot(readM, outline = FALSE, main="Per Cycle Read Quality", sub=fastq_file, +xlab="Cycle", ylab="Phred Quality", col=brewer.pal(11, "Spectral")[1]) +dev.off() + +# Nucleotides per Cycle ###################################################### +a <-alphabetByCycle(sread(reads)) +cycles <- dim(a)[2] +total_number_bases_first_cycle <- a[,1][[1]]+ a[,1][[2]]+a[,1][[3]]+ a[,1][[4]] ++ a[,1][[5]] +n <- c(names(a[1,1]),names(a[2,1]), names(a[3,1]), names(a[4,1]), +names(a[15,1])) + +# Save box plot in current folder +pdf(file=paste(fastq_only,"nuc_per_cycle.pdf", sep="_")) +par(xpd=T, mar=par()$mar+c(0,0,0,4)) +barplot(a, main="Numbers of nucleotides per cycle", sub=fastq_file, +ylab="Absolute number", col=heat.colors(5), names.arg=c(1:cycles)) +legend(cycles+10, total_number_bases_first_cycle, n, fill=heat.colors(5)) +dev.off() + +devSVG(file = paste(fastq_only,"nuc_per_cycle.svg", sep="_"), width = 10, +height = 8, bg = "white", fg = "black", onefile=TRUE, xmlHeader=TRUE) +par(xpd=T, mar=par()$mar+c(0,0,0,4)) +barplot(a, main="Numbers of nucleotides per cycle", sub=fastq_file, +ylab="Absolute number", col=heat.colors(5), names.arg=c(1:cycles)) +legend(cycles+10, total_number_bases_first_cycle, n, fill=heat.colors(5)) +dev.off() + + +# Taken from +# http://www.bioconductor.org/help/workflows/high-throughput-sequencing/ + +seq <- readFastq(fastq_file) +tbl <- tables(seq)[[2]] + +pdf(file=paste(fastq_only,"NumberOfOccurrences.pdf", sep="_")) + +## Calculate and plot cumulative reads vs. occurrences +xyplot(cumsum(nReads * nOccurrences) ~ nOccurrences, tbl, +scales=list(x=list(log=TRUE)), main=fastq_only, type="b", pch=20, +xlab="Number of Occurrences", +ylab="Cumulative Number of Reads") +dev.off() + +devSVG(file = paste(fastq_only,"NumberOfOccurrences.svg", sep="_"), width = 10, +height = 8, bg = "white", fg = "black", onefile=TRUE, xmlHeader=TRUE) +xyplot(cumsum(nReads * nOccurrences) ~ nOccurrences, tbl, +scales=list(x=list(log=TRUE)), main=fastq_only, type="b", pch=20, +xlab="Number of Occurrences", +ylab="Cumulative Number of Reads") +dev.off() \ No newline at end of file