Skip to content
Snippets Groups Projects
fastq_quality.R 2.92 KiB
Newer Older
  • Learn to ignore specific revisions
  • # Needs ShortRead_1.11.42 or higher
    #
    # @author Manuel Kohler
    # ETHZ Zurich, 2011
    
    
    require(ShortRead)
    require(RColorBrewer)
    
    splitPathAndFileName <- function(fullPath) {
      # Splits a full path into path and file name and returns it as a list
    
      fastqFileOnly <- basename(fullPath)
      pathOnly <- dirname(fullPath)
      return (c(pathOnly, fastqFileOnly)) 
    }
    
    subSampledReads <- function(fastqFile, samples=1e7, blocksize=2e9) {
      # needed ShortRead_1.11.42
      fastqFile
      fqSub <- FastqSampler(fastqFile, n=samples, readerBlockSize=blocksize, verbose=FALSE)
      subReads <- yield(fqSub)
      return (subReads)
    }
    
    boxPlotPerCycle <- function() {
      # Boxplot for per cycle Phred quality
      qual <- FastqQuality(quality(quality(subReads))) # get quality scores
      readM <- as(qual, "matrix") # convert scores to matrix
      boxplot(readM, outline = FALSE, main="Per Cycle Read Quality", sub=fastq_only,
              xlab="Cycle", ylab="Phred Quality", col=brewer.pal(11, "Spectral")[1])
      return (boxplot)
    }
    
    nucleotidesPerCyclePlot <- function() {
      # Nucleotide distribution per cycle
      a <-alphabetByCycle(sread(subReads))
      cycles <- dim(a)[2]
      total_number_bases_first_cycle <- a[,1][[1]]+ a[,1][[2]]+a[,1][[3]]+ a[,1][[4]]
                                    + a[,1][[5]]
      n <- c(names(a[1,1]),names(a[2,1]), names(a[3,1]), names(a[4,1]),
             names(a[15,1]))
      par(xpd=T, mar=par()$mar+c(0,0,0,4))
      barplot(a, main="Numbers of nucleotides per cycle", sub=fastq_only,
             ylab="Absolute number",  col=heat.colors(5), names.arg=c(1:cycles))
      legend(cycles+10, total_number_bases_first_cycle, n, fill=heat.colors(5))
      return (barplot)
    }
    
    cumOccurencesPlot <- function() {
      # Taken from
      # http://www.bioconductor.org/help/workflows/high-throughput-sequencing/
      # Calculate and plot cumulative reads vs. occurrences
    
      #seq <- readFastq(subReads)
      tbl <- tables(subReads)[[2]]
      xy <- xyplot(cumsum(nReads * nOccurrences) ~ nOccurrences, tbl,
               scales=list(x=list(log=TRUE)), main=fastq_only, type="b", pch=20,
               xlab="Number of Occurrences",
               ylab="Cumulative Number of Reads")
      return(xy)
    }
    
    
    plotPdf <- function (plotObject) {
      pdf(file=paste(fastq_file, "quality.pdf", sep="_"))
      plotObject
      dev.off() 
    }
    
    plotSvg <- function (plotObject) {
      devSVG(file = paste(fastq_file,"quality.svg", sep="_"), width = 10,
           height = 8, bg = "white", fg = "black", onefile=TRUE, xmlHeader=TRUE)
      plotObject
      dev.off()
    }
    
    # MAIN ########################################################################
    
    fastq_file <- loadFile(args)
    subReads <- subSampledReads(fastq_file)
    
    fastqFilePathVector <- splitPathAndFileName(fastq_file)
    fastq_only <- fastqFilePathVector[2]
    
    pdf(file=paste(fastq_file,"quality.pdf", sep="_"), paper="a4")
     box <- boxPlotPerCycle()
     nuc <- nucleotidesPerCyclePlot()
    
    #plotPdf(box)
    #plotPdf(nuc)
    #plotPdf(cumOccurencesPlot())