From c638f1737d724d71d3ee3f3bf930b447695d42ef Mon Sep 17 00:00:00 2001 From: kohleman <kohleman> Date: Fri, 22 Jun 2012 16:12:34 +0000 Subject: [PATCH] initial checkin: - parses Unaligned_no_mismatch/Basecall_Stats_<FC>/Flowcell_demux_summary.xml calculates several statistical values and sets the properties of the matching data sets to those values SVN: 25838 --- .../etc/data-set-handler-demultiplex-stats.py | 428 ++++++++++++++++++ 1 file changed, 428 insertions(+) create mode 100755 deep_sequencing_unit/dist/etc/data-set-handler-demultiplex-stats.py diff --git a/deep_sequencing_unit/dist/etc/data-set-handler-demultiplex-stats.py b/deep_sequencing_unit/dist/etc/data-set-handler-demultiplex-stats.py new file mode 100755 index 00000000000..0781051f88e --- /dev/null +++ b/deep_sequencing_unit/dist/etc/data-set-handler-demultiplex-stats.py @@ -0,0 +1,428 @@ +''' + @copyright: 2012 ETH Zuerich, CISD + + @license: + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + + +@author: Manuel Kohler + +XML Structur which is processed: + +<?xml version="1.0"?> +<Summary> + <Lane index="8"> + <Sample index="lane8"> + <Barcode index="Undetermined"> + <Tile index="1101"> + <Read index="1"> + <Raw> + <Yield>1921250</Yield> + <YieldQ30>949680</YieldQ30> + <ClusterCount>38425</ClusterCount> + <ClusterCount0MismatchBarcode>0</ClusterCount0MismatchBarcode> + <ClusterCount1MismatchBarcode>0</ClusterCount1MismatchBarcode> + <QualityScoreSum>40995660</QualityScoreSum> + </Raw> + <Pf> + <Yield>945450</Yield> + <YieldQ30>854815</YieldQ30> + <ClusterCount>18909</ClusterCount> + <ClusterCount0MismatchBarcode>0</ClusterCount0MismatchBarcode> + <ClusterCount1MismatchBarcode>0</ClusterCount1MismatchBarcode> + <QualityScoreSum>33815505</QualityScoreSum> + </Pf> + </Read> + </Tile> + [...] + +@note: +print statements go to: <openBIS_HOME>/datastore_server/log/startup_log.txt +''' + +import time +import os +import fnmatch +import xml.etree.ElementTree as etree +from ch.systemsx.cisd.openbis.generic.shared.api.v1.dto import SearchCriteria +from ch.systemsx.cisd.openbis.generic.shared.api.v1.dto import SearchSubCriteria + + +class parseXmlFile: + + def __init__(self, xmlFile): + self.xmlFile = xmlFile + self.tree = etree.parse(self.xmlFile) + self.root = self.tree.getroot() + +# ----------------------------------------------------------------------------- + +class qcValues(object): + def __init__(self, Yield = 0, YieldQ30 = 0, ClusterCount = 0, + ClusterCount0MismatchBarcode = 0, ClusterCount1MismatchBarcode = 0, + QualityScoreSum = 0, *args, **kwargs): + self.Yield = Yield + self.YieldQ30 = YieldQ30 + self.ClusterCount = ClusterCount + self.ClusterCount0MismatchBarcode = ClusterCount0MismatchBarcode + self.ClusterCount1MismatchBarcode = ClusterCount1MismatchBarcode + self.QualityScoreSum = QualityScoreSum + + def __str__(self): + return "Yield: %s, YieldQ30: %s, ClusterCount: %s, ClusterCount0MismatchBarcode: %s," \ + " CusterCount1MismatchBarcode: %s, QualityScoreSum: %s" \ + % (self.Yield, self.YieldQ30, self.ClusterCount, self.ClusterCount0MismatchBarcode, + self.ClusterCount1MismatchBarcode, self.QualityScoreSum) + +class sample: + def __init__(self, Lane = 0, Sample = '', Barcode = '', Tile = '', Read = '', rawqc = qcValues([]), + pfqc = qcValues([]), *args, **kwargs): + self.Lane = Lane + self.Sample = Sample + self.Barcode = Barcode + self.Tile = Tile + self.Read = Read + self.rawqc = rawqc + self.pfqc = pfqc + + def __str__(self): + return "Lane: %s, Sample: %s, Barcode: %s, Tile: %s, Read: %s, rawqc: %s, pfqc: %s" \ + % (self.Lane, self.Sample, self.Barcode, self.Tile, self.Read, self.rawqc, self.pfqc) + +# ----------------------------------------------------------------------------- + +class Statistics: + def __init__(self, lane = 0, sampleName = "", index1 = "NoIndex", index2 = "NoIndex", pfYieldSum = 0, + rawYieldSum = 0, pfPercentage = 0.0, rawReadsSum = 0, pfReadsSum = 0, + pfYieldQ30Sum = 0, qualityScoreSum = 0, rawPercentageReadsPerLane = 0.0, + pfYieldQ30Percentage = 0.0, pfsumQualityScore = 0, pfmeanQualityScore = 0.0): + self.lane = lane + self.sampleName = sampleName + self.index1 = index1 + self.index2 = index2 + self.pfYieldSum = pfYieldSum + self.rawYieldSum = rawYieldSum + self.pfPercentage = pfPercentage + self.rawReadsSum = rawReadsSum + self.pfReadsSum = pfReadsSum + self.pfYieldQ30Sum = pfYieldQ30Sum + self.qualityScoreSum = qualityScoreSum + self.rawPercentageReadsPerLane = rawPercentageReadsPerLane + self.pfYieldQ30Percentage = pfYieldQ30Percentage + self.pfsumQualityScore = pfsumQualityScore + self.pfmeanQualityScore = pfmeanQualityScore + + def __str__(self): + return "lane: %s, sampleName: %s, index1: %s, index2: %s, pfYieldSum: %s, pfPercentage: %s," \ + " rawReadsSum: %s, pfReadsSum: %s," \ + " rawPercentageReadsPerLane: %s, pfYieldQ30Percentage: %s," \ + " pfmeanQualityScore: %s" \ + % (self.lane, self.sampleName, self.index1, self.index2, self.pfYieldSum, self.pfPercentage, + self.rawReadsSum, self.pfReadsSum, + self.rawPercentageReadsPerLane, self.pfYieldQ30Percentage, self.pfmeanQualityScore) + + def calculatePercentagePF (self, rawYield = 0, pfYield = 1): + try: + return round(float(pfYield) / float(rawYield) * 100, 2) + except: + return 0.0 + + def calulateMeanQualityScore (self, pfqualityScoreSum = 0, pfYield = 1): + try: + return round (float(pfqualityScoreSum) / float(pfYield), 2) + except: + return 0.0 + + def calculateYieldQ30Percentage (self, pfYieldQ30 = 0, pfYield = 1): + try: + return round (float(pfYieldQ30) / float(pfYield) * 100, 2) + except: + return 0.0 + +# ----------------------------------------------------------------------------- + +def xml2Memory(DEMULTIPLEX_XML): + ''' + Parse the XML file and put all values in a memory structure: + List of: + lane, sample, barcode, tile, read, qcRawList, qcPfList + ''' + + RAW_TAG = "Raw" + PF_TAG = "Pf" + + sampleList = [] + + xml = parseXmlFile(DEMULTIPLEX_XML) + r = xml.tree.getroot() + + for lane in r.getchildren(): + for mysample in lane: + for barcode in mysample: + for tile in barcode: + for read in tile: + + qcRaw = qcValues() + qcPf = qcValues() + qcRawList = [] + qcPfList = [] + + # Read out the Raw fields + raw = read.find(RAW_TAG) + for child in raw.getchildren(): + # equivalent to a Java reflection + setattr(qcRaw, child.tag, int(child.text)) + + # Read out the Pf fields + pf = read.find(PF_TAG) + for child in pf.getchildren(): + # equivalent to a Java reflection + setattr(qcPf, child.tag, int(child.text)) + + qcRawList.append(qcRaw) + qcPfList.append(qcPf) + + singleElement = sample () + + setattr(singleElement, lane.tag, lane.attrib) + setattr(singleElement, mysample.tag, mysample.attrib) + setattr(singleElement, barcode.tag, barcode.attrib) + setattr(singleElement, tile.tag, tile.attrib) + setattr(singleElement, read.tag, read.attrib) + singleElement.rawqc = qcRawList + singleElement.pfqc = qcPfList + + sampleList.append(singleElement) + return sampleList + +# ----------------------------------------------------------------------------- + +def calculateStatistics(listofSamples): + ''' + Structure of 'listofSamples' + Lane: {'index': '6'}, Sample: {'index': 'BSSE-QGF-3524_C0NKPACXX'}, Barcode: {'index': 'TGACCA'}, + Tile: {'index': '2307'}, Read: {'index': '1'}, rawqc:<mem>, pfqc:<mem> + ''' + + numberOfTiles = len(listofSamples) + + tile = sample() + raw = qcValues () + pf = qcValues () + stats = Statistics() + + for tile in listofSamples: + raw = tile.rawqc[0] + pf = tile.pfqc[0] + + stats.pfYieldSum += pf.Yield + stats.rawYieldSum += raw.Yield + stats.rawReadsSum += raw.ClusterCount + stats.pfReadsSum += pf.ClusterCount + stats.pfYieldQ30Sum += pf.YieldQ30 + stats.qualityScoreSum += pf.QualityScoreSum + + # Can not be set here, needs to be calculated later + #stats.rawPercentageReadsPerLane = rawPercentageReadsPerLane + stats.pfPercentage = stats.calculatePercentagePF(stats.rawYieldSum, stats.pfYieldSum) + stats.pfYieldQ30Percentage = stats.calculateYieldQ30Percentage(stats.pfYieldQ30Sum, stats.pfYieldSum) + stats.pfmeanQualityScore = stats.calulateMeanQualityScore(stats.qualityScoreSum, stats.pfYieldSum) + stats.lane = listofSamples[0].Lane.values()[0] + stats.sampleName = listofSamples[0].Sample.values()[0] + index = listofSamples[0].Barcode.values()[0] + try: + stats.index1, stats.index2 = index.split("-") + except: + stats.index1 = index + return stats + +# ----------------------------------------------------------------------------- + + +def rawReadSumPerSamples(stat): + ''' + Creates a dictionary with the lanes as keys + The values are a list where the elements are a dictionary again. + This dictionary has the sample names as key and the RawReadSum as value. + + Example: + {4': [{'BSSE-QGF-3434_C0NKPACXX': 248999502}], '7': [{'lane7': 123921974}, + {'BSSE-QGF-3527_C0NKPACXX': 38587703}, {'BSSE-QGF-3529_C0NKPACXX': 30130893}, + {'BSSE-QGF-3528_C0NKPACXX': 34519296}, {'BSSE-QGF-3526_C0NKPACXX': 34980179}]} + ''' + + laneDict = {} + for e in stat: + if e.lane not in laneDict: + laneDict[e.lane] = [{e.sampleName:e.rawReadsSum}] + else: + laneDict[e.lane].append({e.sampleName:e.rawReadsSum}) + return laneDict + +# ----------------------------------------------------------------------------- + +def createSumRawReadsPerLane(laneDict): + ''' + Creates a dictionary with lane as key and sum of Raw Reads as value: + {'1': 183180877, '3': 244968562, '2': 191496395, '5': 193466239, '4': 248999502, + '7': 262140045, '6': 257136830, '8': 209948449} + ''' + sumRawReadsDict = {} + for lane in laneDict: + sumRawReads = 0 + for sampleNameDict in laneDict[lane]: + sumRawReads += sampleNameDict.values()[0] + + sumRawReadsDict[lane] = sumRawReads + return sumRawReadsDict + +# ----------------------------------------------------------------------------- + +def createPercentagePerLane(laneDict, sumRawReadsDict): + ''' + Creates a dictionary with the sample Name as key and the percentage of raw reads related to + all reads in the same lane + {'lane7': 47.27, 'BSSE-QGF-3433_C0NKPACXX': 100.0, 'BSSE-QGF-3666_C0NKPACXX': 54.12} + ''' + + relRawReadsDict = {} + for lane in laneDict: + for sampleName in laneDict[lane]: + relRawReadsDict[sampleName.keys()[0]] = round(float(sampleName.values()[0]) / + float(sumRawReadsDict[lane]) * 100, 2) + return relRawReadsDict + +# ----------------------------------------------------------------------------- + +def locate(pattern, root): + '''Locate all files matching supplied filename pattern in and below + supplied root directory.''' + for path, dirs, files in os.walk(os.path.abspath(root)): + for filename in fnmatch.filter(files, pattern): + yield os.path.join(path, filename) + +# ----------------------------------------------------------------------------- +# ----------------------------------------------------------------------------- + +FASTQ_DATA_SET_TYPE='FASTQ_GZ' +DEMUX_FILE='Flowcell_demux_summary.xml' +NO_INDEX='NOINDEX' +UNDETERMINED='UNDETERMINED' + +print('\n'+time.ctime()) +incomingPath = incoming.getAbsolutePath() +# Create a "transaction" -- a way of grouping operations together so they all +# happen or none of them do. +transaction = service.transaction() +# Get the incoming name +incomingName = incoming.getName() +# Get the search service +search_service = transaction.getSearchService() + +FileGenerator= locate(DEMUX_FILE, incomingPath) +DEMULTIPLEX_XML = FileGenerator.next() + +sampleList = xml2Memory(DEMULTIPLEX_XML) + +sa = sample() +sampleDict = {} + +# key = sample name, value = sample() +for element in range(0, len(sampleList)): + sa = sampleList[element] + # Check if new sample + if (sa.Sample is not sampleList[element - 1].Sample): + sampleName = sa.Sample.values()[0] + sampleDict[sampleName] = [sa] + else: + sampleDict[sampleName].append(sa) + +stat = [calculateStatistics(sampleDict[mysample]) for mysample in sampleDict] + +# calculate the relative amount of reads per index +laneDict = rawReadSumPerSamples(stat) +sumRawReadsDict = createSumRawReadsPerLane(laneDict) +relRawReadsDict = createPercentagePerLane(laneDict, sumRawReadsDict) + +# set the values in the object +for mye in stat: + mye.rawPercentageReadsPerLane = relRawReadsDict[mye.sampleName] + +#for m in stat: +# print m + +def sampleSearch(Code=''): + sc = SearchCriteria() + numberOfLanes = 0 + sc.addMatchClause(SearchCriteria.MatchClause.createAttributeMatch(SearchCriteria.MatchClauseAttribute.CODE, Code)); + search_service = transaction.getSearchService() + foundSample = search_service.searchForSamples(sc) + if foundSample.size() > 0: + # Search for contained samples + sampleSc = SearchCriteria() + sampleSc.addSubCriteria(SearchSubCriteria.createSampleContainerCriteria(sc)) + foundContainedSamples = search_service.searchForSamples(sampleSc) + numberOfLanes = foundContainedSamples.size() + return foundSample, foundContainedSamples, numberOfLanes + +def searchDataSetsofSample(sample, index1, index2, DATA_SET_TYPE): + sc = SearchCriteria() + sc.addMatchClause(SearchCriteria.MatchClause.createAttributeMatch(SearchCriteria.MatchClauseAttribute.CODE, sample)); + search_service = transaction.getSearchService() + foundSample = search_service.searchForSamples(sc) + + dataSetSc = SearchCriteria() + dataSetSc.addMatchClause(SearchCriteria.MatchClause.createAttributeMatch(SearchCriteria.MatchClauseAttribute.TYPE, DATA_SET_TYPE)) + if index1 not in (NO_INDEX): + if index1 in (UNDETERMINED): + index1 = NO_INDEX + else: + # Ugly workaround, the Index vocabulary has 7 nucleotides of which the last one is always an 'A' + index1 = index1 + "A" + dataSetSc.addMatchClause(SearchCriteria.MatchClause.createPropertyMatch("BARCODE", index1)) + if index2 not in (NO_INDEX): + if index2 in (UNDETERMINED): + index2 = NO_INDEX + dataSetSc.addMatchClause(SearchCriteria.MatchClause.createPropertyMatch("INDEX2", index2)) + dataSetSc.addSubCriteria(SearchSubCriteria.createSampleCriteria(sc)) + foundDataSets = search_service.searchForDataSets(dataSetSc) + + return foundDataSets + +flowcell, lanes, numberOfLanes = sampleSearch(incomingName) + +for mystat in stat: + laneCode = flowcell[0].getCode() + ":" + mystat.lane + searchIndex1 = mystat.index1.upper() + searchIndex2 = mystat.index2.upper() + + # Search for a data set with those two indices + DataSet = searchDataSetsofSample(laneCode, searchIndex1, searchIndex2, FASTQ_DATA_SET_TYPE) + try: + assert DataSet.size() == 1 + except AssertionError: + print (str(DataSet.size()) + ' data sets found which match the criterias: '+ + str(laneCode), searchIndex1, searchIndex2) + break + + sa = transaction.getDataSetForUpdate(DataSet[0].getDataSetCode()) + sa.setPropertyValue('YIELD_MBASES', str(mystat.pfYieldSum)) + sa.setPropertyValue('RAW_YIELD_MBASES', str(mystat.rawYieldSum)) + sa.setPropertyValue('PERCENTAGE_PASSED_FILTERING',str(mystat.pfPercentage)) + sa.setPropertyValue('PF_READS_SUM',str(mystat.pfReadsSum)) + sa.setPropertyValue('RAW_READS_SUM',str(mystat.rawReadsSum)) + sa.setPropertyValue('PERCENTAGE_RAW_CLUSTERS_PER_LANE', str(mystat.rawPercentageReadsPerLane)) + sa.setPropertyValue('PFYIELDQ30PERCENTAGE', str(mystat.pfYieldQ30Percentage)) + sa.setPropertyValue('PFMEANQUALITYSCORE', str(mystat.pfmeanQualityScore)) + + print "Modified data sets properties of: " + DataSet[0].getDataSetCode() -- GitLab