From c638f1737d724d71d3ee3f3bf930b447695d42ef Mon Sep 17 00:00:00 2001
From: kohleman <kohleman>
Date: Fri, 22 Jun 2012 16:12:34 +0000
Subject: [PATCH] initial checkin: - parses
 Unaligned_no_mismatch/Basecall_Stats_<FC>/Flowcell_demux_summary.xml
 calculates several statistical values and sets the properties of the matching
 data sets to those values

SVN: 25838
---
 .../etc/data-set-handler-demultiplex-stats.py | 428 ++++++++++++++++++
 1 file changed, 428 insertions(+)
 create mode 100755 deep_sequencing_unit/dist/etc/data-set-handler-demultiplex-stats.py

diff --git a/deep_sequencing_unit/dist/etc/data-set-handler-demultiplex-stats.py b/deep_sequencing_unit/dist/etc/data-set-handler-demultiplex-stats.py
new file mode 100755
index 00000000000..0781051f88e
--- /dev/null
+++ b/deep_sequencing_unit/dist/etc/data-set-handler-demultiplex-stats.py
@@ -0,0 +1,428 @@
+'''
+  @copyright: 2012 ETH Zuerich, CISD
+  
+  @license: 
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+ 
+       http://www.apache.org/licenses/LICENSE-2.0
+ 
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+
+
+@author: Manuel Kohler
+
+XML Structur which is processed:
+
+<?xml version="1.0"?>
+<Summary>
+  <Lane index="8">
+    <Sample index="lane8">
+      <Barcode index="Undetermined">
+        <Tile index="1101">
+          <Read index="1">
+            <Raw>
+              <Yield>1921250</Yield>
+              <YieldQ30>949680</YieldQ30>
+              <ClusterCount>38425</ClusterCount>
+              <ClusterCount0MismatchBarcode>0</ClusterCount0MismatchBarcode>
+              <ClusterCount1MismatchBarcode>0</ClusterCount1MismatchBarcode>
+              <QualityScoreSum>40995660</QualityScoreSum>
+            </Raw>
+            <Pf>
+              <Yield>945450</Yield>
+              <YieldQ30>854815</YieldQ30>
+              <ClusterCount>18909</ClusterCount>
+              <ClusterCount0MismatchBarcode>0</ClusterCount0MismatchBarcode>
+              <ClusterCount1MismatchBarcode>0</ClusterCount1MismatchBarcode>
+              <QualityScoreSum>33815505</QualityScoreSum>
+            </Pf>
+          </Read>
+        </Tile>
+        [...]
+
+@note: 
+print statements go to: <openBIS_HOME>/datastore_server/log/startup_log.txt
+'''
+
+import time
+import os
+import fnmatch
+import xml.etree.ElementTree as etree
+from ch.systemsx.cisd.openbis.generic.shared.api.v1.dto import SearchCriteria
+from ch.systemsx.cisd.openbis.generic.shared.api.v1.dto import SearchSubCriteria
+
+
+class parseXmlFile:
+
+  def __init__(self, xmlFile):
+    self.xmlFile = xmlFile
+    self.tree = etree.parse(self.xmlFile)
+    self.root = self.tree.getroot()
+
+# -----------------------------------------------------------------------------
+
+class qcValues(object):
+  def __init__(self, Yield = 0, YieldQ30 = 0, ClusterCount = 0,
+               ClusterCount0MismatchBarcode = 0, ClusterCount1MismatchBarcode = 0,
+               QualityScoreSum = 0, *args, **kwargs):
+    self.Yield = Yield
+    self.YieldQ30 = YieldQ30
+    self.ClusterCount = ClusterCount 
+    self.ClusterCount0MismatchBarcode = ClusterCount0MismatchBarcode
+    self.ClusterCount1MismatchBarcode = ClusterCount1MismatchBarcode
+    self.QualityScoreSum = QualityScoreSum
+    
+  def __str__(self):
+        return "Yield: %s, YieldQ30: %s, ClusterCount: %s, ClusterCount0MismatchBarcode: %s," \
+          " CusterCount1MismatchBarcode: %s, QualityScoreSum: %s" \
+          % (self.Yield, self.YieldQ30, self.ClusterCount, self.ClusterCount0MismatchBarcode,
+          self.ClusterCount1MismatchBarcode, self.QualityScoreSum)    
+
+class sample:
+  def __init__(self, Lane = 0, Sample = '', Barcode = '', Tile = '', Read = '', rawqc = qcValues([]),
+                pfqc = qcValues([]), *args, **kwargs):
+    self.Lane = Lane
+    self.Sample = Sample
+    self.Barcode = Barcode
+    self.Tile = Tile
+    self.Read = Read
+    self.rawqc = rawqc
+    self.pfqc = pfqc
+    
+  def __str__(self):
+    return "Lane: %s, Sample: %s, Barcode: %s, Tile: %s, Read: %s, rawqc: %s, pfqc: %s" \
+      % (self.Lane, self.Sample, self.Barcode, self.Tile, self.Read, self.rawqc, self.pfqc)
+
+# -----------------------------------------------------------------------------
+
+class Statistics:
+  def __init__(self, lane = 0, sampleName = "", index1 = "NoIndex", index2 = "NoIndex", pfYieldSum = 0,
+                 rawYieldSum = 0, pfPercentage = 0.0, rawReadsSum = 0, pfReadsSum = 0,
+                 pfYieldQ30Sum = 0, qualityScoreSum = 0, rawPercentageReadsPerLane = 0.0,
+                 pfYieldQ30Percentage = 0.0, pfsumQualityScore = 0, pfmeanQualityScore = 0.0):
+      self.lane = lane
+      self.sampleName = sampleName
+      self.index1 = index1
+      self.index2 = index2
+      self.pfYieldSum = pfYieldSum
+      self.rawYieldSum = rawYieldSum
+      self.pfPercentage = pfPercentage
+      self.rawReadsSum = rawReadsSum
+      self.pfReadsSum = pfReadsSum
+      self.pfYieldQ30Sum = pfYieldQ30Sum
+      self.qualityScoreSum = qualityScoreSum
+      self.rawPercentageReadsPerLane = rawPercentageReadsPerLane
+      self.pfYieldQ30Percentage = pfYieldQ30Percentage
+      self.pfsumQualityScore = pfsumQualityScore
+      self.pfmeanQualityScore = pfmeanQualityScore
+      
+  def __str__(self):
+    return "lane: %s, sampleName: %s, index1: %s, index2: %s, pfYieldSum: %s, pfPercentage: %s," \
+           " rawReadsSum: %s, pfReadsSum: %s," \
+           " rawPercentageReadsPerLane: %s, pfYieldQ30Percentage: %s," \
+           " pfmeanQualityScore: %s" \
+        % (self.lane, self.sampleName, self.index1, self.index2, self.pfYieldSum, self.pfPercentage,
+           self.rawReadsSum, self.pfReadsSum,
+           self.rawPercentageReadsPerLane, self.pfYieldQ30Percentage, self.pfmeanQualityScore)
+         
+  def calculatePercentagePF (self, rawYield = 0, pfYield = 1):
+    try:
+      return round(float(pfYield) / float(rawYield) * 100, 2)
+    except:
+      return 0.0
+  
+  def calulateMeanQualityScore (self, pfqualityScoreSum = 0, pfYield = 1):
+    try:
+      return round (float(pfqualityScoreSum) / float(pfYield), 2)
+    except:
+      return 0.0
+    
+  def calculateYieldQ30Percentage (self, pfYieldQ30 = 0, pfYield = 1):
+    try:
+      return round (float(pfYieldQ30) / float(pfYield) * 100, 2)
+    except:
+      return 0.0
+
+# -----------------------------------------------------------------------------
+
+def xml2Memory(DEMULTIPLEX_XML):
+  '''
+  Parse the XML file and put all values in a memory structure:
+  List of: 
+  lane, sample, barcode, tile, read, qcRawList, qcPfList
+  '''
+
+  RAW_TAG = "Raw"
+  PF_TAG = "Pf"
+
+  sampleList = []
+  
+  xml = parseXmlFile(DEMULTIPLEX_XML)
+  r = xml.tree.getroot()
+
+  for lane in r.getchildren():
+    for mysample in lane:  
+      for barcode in mysample:       
+        for tile in barcode:
+          for read in tile:
+            
+            qcRaw = qcValues()
+            qcPf = qcValues()  
+            qcRawList = []
+            qcPfList = []
+
+            # Read out the Raw fields
+            raw = read.find(RAW_TAG) 
+            for child in raw.getchildren():
+              # equivalent to a Java reflection
+              setattr(qcRaw, child.tag, int(child.text))
+              
+            # Read out the Pf fields
+            pf = read.find(PF_TAG)
+            for child in pf.getchildren():
+              # equivalent to a Java reflection
+              setattr(qcPf, child.tag, int(child.text))
+
+            qcRawList.append(qcRaw)
+            qcPfList.append(qcPf)
+            
+            singleElement = sample ()
+            
+            setattr(singleElement, lane.tag, lane.attrib)
+            setattr(singleElement, mysample.tag, mysample.attrib)
+            setattr(singleElement, barcode.tag, barcode.attrib)
+            setattr(singleElement, tile.tag, tile.attrib)
+            setattr(singleElement, read.tag, read.attrib)
+            singleElement.rawqc = qcRawList
+            singleElement.pfqc = qcPfList
+            
+            sampleList.append(singleElement)
+  return sampleList
+
+# -----------------------------------------------------------------------------
+
+def calculateStatistics(listofSamples):
+  '''
+  Structure of 'listofSamples'
+  Lane: {'index': '6'}, Sample: {'index': 'BSSE-QGF-3524_C0NKPACXX'}, Barcode: {'index': 'TGACCA'},
+  Tile: {'index': '2307'}, Read: {'index': '1'}, rawqc:<mem>, pfqc:<mem> 
+  '''
+
+  numberOfTiles = len(listofSamples)
+  
+  tile = sample()
+  raw = qcValues ()
+  pf = qcValues ()
+  stats = Statistics()
+ 
+  for tile in listofSamples:
+    raw = tile.rawqc[0]
+    pf = tile.pfqc[0]
+    
+    stats.pfYieldSum += pf.Yield
+    stats.rawYieldSum += raw.Yield
+    stats.rawReadsSum += raw.ClusterCount 
+    stats.pfReadsSum += pf.ClusterCount
+    stats.pfYieldQ30Sum += pf.YieldQ30
+    stats.qualityScoreSum += pf.QualityScoreSum
+     
+  # Can not be set here, needs to be calculated later   
+  #stats.rawPercentageReadsPerLane = rawPercentageReadsPerLane
+  stats.pfPercentage = stats.calculatePercentagePF(stats.rawYieldSum, stats.pfYieldSum)
+  stats.pfYieldQ30Percentage = stats.calculateYieldQ30Percentage(stats.pfYieldQ30Sum, stats.pfYieldSum)
+  stats.pfmeanQualityScore = stats.calulateMeanQualityScore(stats.qualityScoreSum, stats.pfYieldSum)
+  stats.lane = listofSamples[0].Lane.values()[0]
+  stats.sampleName = listofSamples[0].Sample.values()[0]
+  index = listofSamples[0].Barcode.values()[0]
+  try:
+    stats.index1, stats.index2 = index.split("-")
+  except:
+    stats.index1 = index
+  return stats
+
+# -----------------------------------------------------------------------------
+
+
+def rawReadSumPerSamples(stat):
+  '''
+  Creates a dictionary with the lanes as keys
+  The values are a list where the elements are a dictionary again.
+  This dictionary has the sample names as key and the RawReadSum as value.  
+  
+  Example:
+  {4': [{'BSSE-QGF-3434_C0NKPACXX': 248999502}], '7': [{'lane7': 123921974}, 
+  {'BSSE-QGF-3527_C0NKPACXX': 38587703}, {'BSSE-QGF-3529_C0NKPACXX': 30130893},
+  {'BSSE-QGF-3528_C0NKPACXX': 34519296}, {'BSSE-QGF-3526_C0NKPACXX': 34980179}]}
+  '''
+  
+  laneDict = {}
+  for e in stat:
+    if e.lane not in laneDict:
+      laneDict[e.lane] = [{e.sampleName:e.rawReadsSum}]
+    else:
+      laneDict[e.lane].append({e.sampleName:e.rawReadsSum})
+  return laneDict
+
+# -----------------------------------------------------------------------------
+
+def createSumRawReadsPerLane(laneDict):
+  '''
+  Creates a dictionary with lane as key and sum of Raw Reads as value:
+  {'1': 183180877, '3': 244968562, '2': 191496395, '5': 193466239, '4': 248999502, 
+  '7': 262140045, '6': 257136830, '8': 209948449}
+  '''
+  sumRawReadsDict = {}
+  for lane in laneDict:
+    sumRawReads = 0
+    for sampleNameDict in laneDict[lane]:
+      sumRawReads += sampleNameDict.values()[0]
+    
+    sumRawReadsDict[lane] = sumRawReads 
+  return sumRawReadsDict
+
+# -----------------------------------------------------------------------------
+
+def createPercentagePerLane(laneDict, sumRawReadsDict):
+  '''
+  Creates a dictionary with the sample Name as key and the percentage of raw reads related to 
+  all reads in the same lane
+  {'lane7': 47.27, 'BSSE-QGF-3433_C0NKPACXX': 100.0, 'BSSE-QGF-3666_C0NKPACXX': 54.12}
+  '''
+  
+  relRawReadsDict = {}
+  for lane in laneDict:
+    for sampleName in laneDict[lane]:
+      relRawReadsDict[sampleName.keys()[0]] = round(float(sampleName.values()[0]) / 
+                                                    float(sumRawReadsDict[lane]) * 100, 2)
+  return relRawReadsDict
+
+# -----------------------------------------------------------------------------
+
+def locate(pattern, root):
+    '''Locate all files matching supplied filename pattern in and below
+    supplied root directory.'''
+    for path, dirs, files in os.walk(os.path.abspath(root)):
+        for filename in fnmatch.filter(files, pattern):
+            yield os.path.join(path, filename)
+
+# -----------------------------------------------------------------------------
+# -----------------------------------------------------------------------------
+
+FASTQ_DATA_SET_TYPE='FASTQ_GZ'
+DEMUX_FILE='Flowcell_demux_summary.xml'
+NO_INDEX='NOINDEX'
+UNDETERMINED='UNDETERMINED'
+
+print('\n'+time.ctime())
+incomingPath = incoming.getAbsolutePath()
+# Create a "transaction" -- a way of grouping operations together so they all
+# happen or none of them do.
+transaction = service.transaction()
+# Get the incoming name 
+incomingName = incoming.getName()
+# Get the search service
+search_service = transaction.getSearchService()
+
+FileGenerator= locate(DEMUX_FILE, incomingPath)
+DEMULTIPLEX_XML = FileGenerator.next()
+
+sampleList = xml2Memory(DEMULTIPLEX_XML)    
+
+sa = sample()
+sampleDict = {}
+
+# key = sample name, value = sample()  
+for element in range(0, len(sampleList)):
+  sa = sampleList[element]
+  # Check if new sample 
+  if (sa.Sample is not sampleList[element - 1].Sample):
+    sampleName = sa.Sample.values()[0]
+    sampleDict[sampleName] = [sa]
+  else:
+    sampleDict[sampleName].append(sa)
+    
+stat = [calculateStatistics(sampleDict[mysample]) for mysample in sampleDict]
+
+# calculate the relative amount of reads per index
+laneDict = rawReadSumPerSamples(stat)
+sumRawReadsDict = createSumRawReadsPerLane(laneDict)
+relRawReadsDict = createPercentagePerLane(laneDict, sumRawReadsDict)
+
+# set the values in the object
+for mye in stat:
+  mye.rawPercentageReadsPerLane = relRawReadsDict[mye.sampleName]
+ 
+#for m in stat:
+#  print m
+
+def sampleSearch(Code=''):
+  sc = SearchCriteria()
+  numberOfLanes = 0
+  sc.addMatchClause(SearchCriteria.MatchClause.createAttributeMatch(SearchCriteria.MatchClauseAttribute.CODE, Code));
+  search_service = transaction.getSearchService()
+  foundSample = search_service.searchForSamples(sc)
+  if foundSample.size() > 0:
+    # Search for contained samples
+    sampleSc = SearchCriteria()
+    sampleSc.addSubCriteria(SearchSubCriteria.createSampleContainerCriteria(sc))
+    foundContainedSamples = search_service.searchForSamples(sampleSc)
+    numberOfLanes = foundContainedSamples.size()
+  return foundSample, foundContainedSamples, numberOfLanes
+
+def searchDataSetsofSample(sample, index1, index2, DATA_SET_TYPE):
+  sc = SearchCriteria()
+  sc.addMatchClause(SearchCriteria.MatchClause.createAttributeMatch(SearchCriteria.MatchClauseAttribute.CODE, sample));
+  search_service = transaction.getSearchService()
+  foundSample = search_service.searchForSamples(sc)
+
+  dataSetSc = SearchCriteria()
+  dataSetSc.addMatchClause(SearchCriteria.MatchClause.createAttributeMatch(SearchCriteria.MatchClauseAttribute.TYPE, DATA_SET_TYPE))
+  if index1 not in (NO_INDEX):
+    if index1 in (UNDETERMINED):
+      index1 = NO_INDEX
+    else:
+      # Ugly workaround, the Index vocabulary has 7 nucleotides of which the last one is always an 'A'
+      index1 = index1 + "A"
+    dataSetSc.addMatchClause(SearchCriteria.MatchClause.createPropertyMatch("BARCODE", index1))
+  if index2 not in (NO_INDEX):
+    if index2 in (UNDETERMINED):
+      index2 = NO_INDEX
+    dataSetSc.addMatchClause(SearchCriteria.MatchClause.createPropertyMatch("INDEX2", index2))
+  dataSetSc.addSubCriteria(SearchSubCriteria.createSampleCriteria(sc))
+  foundDataSets = search_service.searchForDataSets(dataSetSc)
+
+  return foundDataSets
+
+flowcell, lanes, numberOfLanes  = sampleSearch(incomingName)
+
+for mystat in stat:
+  laneCode = flowcell[0].getCode() + ":" + mystat.lane
+  searchIndex1 = mystat.index1.upper()
+  searchIndex2 = mystat.index2.upper()
+
+  # Search for a data set with those two indices
+  DataSet = searchDataSetsofSample(laneCode, searchIndex1, searchIndex2, FASTQ_DATA_SET_TYPE)
+  try:
+    assert DataSet.size() == 1
+  except AssertionError:
+    print (str(DataSet.size()) + ' data sets found which match the criterias: '+ 
+            str(laneCode), searchIndex1, searchIndex2)
+    break
+
+  sa = transaction.getDataSetForUpdate(DataSet[0].getDataSetCode())
+  sa.setPropertyValue('YIELD_MBASES', str(mystat.pfYieldSum))
+  sa.setPropertyValue('RAW_YIELD_MBASES', str(mystat.rawYieldSum))
+  sa.setPropertyValue('PERCENTAGE_PASSED_FILTERING',str(mystat.pfPercentage))
+  sa.setPropertyValue('PF_READS_SUM',str(mystat.pfReadsSum))
+  sa.setPropertyValue('RAW_READS_SUM',str(mystat.rawReadsSum))
+  sa.setPropertyValue('PERCENTAGE_RAW_CLUSTERS_PER_LANE', str(mystat.rawPercentageReadsPerLane))
+  sa.setPropertyValue('PFYIELDQ30PERCENTAGE', str(mystat.pfYieldQ30Percentage))
+  sa.setPropertyValue('PFMEANQUALITYSCORE', str(mystat.pfmeanQualityScore))
+
+  print "Modified data sets properties of: " + DataSet[0].getDataSetCode()
-- 
GitLab