From 2540d7c343d541481ad05ddef063f7ad4b757865 Mon Sep 17 00:00:00 2001
From: kohleman <kohleman>
Date: Fri, 3 May 2013 16:24:32 +0000
Subject: [PATCH] dirty commit: work in progress

SVN: 28999
---
 .../Jython/createSampleSheet_Illumina_NGS.py  | 536 ++++++++++++++++++
 1 file changed, 536 insertions(+)
 create mode 100644 deep_sequencing_unit/source/Jython/createSampleSheet_Illumina_NGS.py

diff --git a/deep_sequencing_unit/source/Jython/createSampleSheet_Illumina_NGS.py b/deep_sequencing_unit/source/Jython/createSampleSheet_Illumina_NGS.py
new file mode 100644
index 00000000000..4bf2b513ca4
--- /dev/null
+++ b/deep_sequencing_unit/source/Jython/createSampleSheet_Illumina_NGS.py
@@ -0,0 +1,536 @@
+'''
+@copyright:
+Copyright 2013 ETH Zuerich, CISD
+ 
+@license:
+Licensed under the Apache License, Version 2.0 (the 'License');
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+ 
+http://www.apache.org/licenses/LICENSE-2.0
+ 
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an 'AS IS' BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+@author:
+Manuel Kohler
+
+@description:
+Creates the SampleSheet.csv out of values from openBIS for Demultiplexing 
+used in the Illumina pipeline (configureBclToFastq.pl) 
+
+@attention:
+Runs under Jython
+
+@note:
+Takes into account to replace special characters with an underscore so that the Illumina script
+does not fail
+
+HiSeq Header Description
+========================
+Column Header  Description
+FCID  Flow cell ID
+Lane  Positive integer, indicating the lane number (1-8)
+SampleID  ID of the sample
+SampleRef  The reference used for alignment for the sample
+Index  Index sequences. Multiple index reads are separated by a hyphen (for example, ACCAGTAA-GGACATGA).
+Description  Description of the sample
+Control  Y indicates this lane is a control lane, N means sample
+Recipe Recipe used during sequencing
+Operator Name or ID of the operator
+SampleProject  The project the sample belongs to
+'''
+
+from __future__ import with_statement
+import os
+import logging
+import re
+import sys
+import string
+import smtplib
+import operator
+from ConfigParser import SafeConfigParser
+from optparse import OptionParser
+from datetime import *
+
+from ch.systemsx.cisd.openbis.dss.client.api.v1 import OpenbisServiceFacadeFactory
+from ch.systemsx.cisd.openbis.generic.shared.api.v1.dto import SearchCriteria
+from ch.systemsx.cisd.openbis.generic.shared.api.v1.dto import SearchSubCriteria
+from java.util import EnumSet
+from ch.systemsx.cisd.openbis.generic.shared.api.v1.dto import SampleFetchOption
+
+lineending = {'win32':'\r\n', 'linux':'\n', 'mac':'\r'}
+COMMA = ','
+
+def getLaneAncestors(service, lane, logger):
+  fetchOptions = EnumSet.of(SampleFetchOption.ANCESTORS, SampleFetchOption.PROPERTIES)
+
+  sc = SearchCriteria();
+  sc.addMatchClause(SearchCriteria.MatchClause.createAttributeMatch(SearchCriteria.MatchClauseAttribute.CODE, lane));
+  dList = service.searchForSamples(sc, fetchOptions)
+
+
+  def recursiveSamples(list, codeList, propertyDict):
+       
+    for element in list:
+      elementCode = element.getCode()
+      codeList.append(elementCode)
+      elementProperties = element.getProperties()
+      
+      tmpDict = {}
+      
+      for eP in elementProperties:
+        tmpDict[eP] = elementProperties[eP]
+        
+      tmpDict['SAMPLETYPE'] = element.getSampleTypeCode()
+      propertyDict[elementCode] = tmpDict
+
+      elementParents = element.getParents()
+      
+      if (elementParents.size() > 0):
+        recursiveSamples(elementParents, codeList, propertyDict)
+ 
+  codeList = []
+  propertyDict = {}
+    
+  recursiveSamples(dList, codeList, propertyDict)
+  
+  logger.info(codeList)
+  logger.info(propertyDict)
+  
+  return codeList, propertyDict
+
+
+def login(configMap, logger):
+  logger.info('Logging into ' + configMap['openbisServer'])
+  try:
+    service = OpenbisServiceFacadeFactory.tryCreate(configMap['openbisUserName'],
+                                                  configMap['openbisPassword'],
+                                                  configMap['openbisServer'],
+                                                  configMap['connectionTimeout'])
+  except BaseException:
+    print ('Could not connect to ' + configMap['openbisServer'] + '. Please check if the server ' +
+    'address is OK, the firewall is not blocking the communication, or openBIS is down.')
+
+  return service
+
+def logout (service, logger):
+  service.logout()
+  logger.info('Logged out')
+
+def setUpLogger(logPath, logLevel=logging.INFO):
+  logFileName = 'createSampleSheet'
+  d = datetime.now()
+  logFileName = logFileName + '_' + d.strftime('%Y-%m-%d_%H_%M_%S') + '.log'
+  logging.basicConfig(filename=logPath + logFileName,
+                      format='%(asctime)s [%(levelname)s] %(message)s', level=logLevel)
+  logger = logging.getLogger(logFileName)
+  return logger
+
+def parseOptions(logger):
+  logger.info('Parsing command line parameters')
+  parser = OptionParser(version='%prog 1.0')
+  parser.add_option('-f', '--flowcell',
+                  dest='flowcell',
+                  help='The flowcell which is used to create the SampleSheet.csv',
+                  metavar='<flowcell>')
+  parser.add_option('-l', '--lineending',
+                  dest='lineending',
+                  type='choice',
+                  action='store',
+                  choices=['win32', 'linux', 'mac'],
+                  default='linux',
+                  help='Specify end of line separator: win32, linux, mac. Default: linux' ,
+                  metavar='<lineending>')
+  parser.add_option('-o', '--outdir',
+                  dest='outdir',
+                  default='./',
+                  help='Specify the ouput directory. Default: ./' ,
+                  metavar='<outdir>')
+  parser.add_option('-d', '--debug',
+                  dest='debug',
+                  default=False,
+                  action='store_true',
+                  help='Verbose debug logging. Default: False')
+  parser.add_option('-v', '--verbose',
+                  dest='verbose',
+                  default=False,
+                  action='store_true',
+                  help='Write Sample Sheet to stout. Default: False')
+
+  (options, args) = parser.parse_args()
+
+  if options.outdir[-1] <> '/':
+    options.outdir = options.outdir + '/'
+
+  if options.flowcell is None:
+    parser.print_help()
+    exit(-1)
+  return options
+
+def parseConfigurationFile(logger, propertyFile='etc/createSampleSheet_Illumina_NGS.properties'):
+  '''
+  Parses the given config files and returns the values
+  '''
+  logger.info('Reading config file ' + propertyFile)
+  config = SafeConfigParser()
+  config.read(propertyFile)
+  config.sections()
+  return config
+
+def readConfig(logger):
+
+  GENERAL = 'GENERAL'
+  OPENBIS = 'OPENBIS'
+  ILLUMINA = 'ILLUMINA'
+
+  configMap = {}
+
+  configParameters = parseConfigurationFile(logger)
+
+  configMap['facilityName'] = configParameters.get(GENERAL, 'facilityName')
+  configMap['facilityNameShort'] = configParameters.get(GENERAL, 'facilityNameShort')
+  configMap['facilityInstitution'] = configParameters.get(GENERAL, 'facilityInstitution')
+  configMap['sampleSheetFileName'] = configParameters.get(GENERAL, 'sampleSheetFileName')
+  configMap['separator'] = configParameters.get(GENERAL, 'separator')
+  configMap['indexSeparator'] = configParameters.get(GENERAL, 'indexSeparator')
+
+  configMap['openbisUserName'] = configParameters.get(OPENBIS, 'openbisUserName')
+  configMap['openbisPassword'] = configParameters.get(OPENBIS, 'openbisPassword', raw=True)
+  configMap['openbisServer'] = configParameters.get(OPENBIS, 'openbisServer')
+  configMap['connectionTimeout'] = configParameters.getint(OPENBIS, 'connectionTimeout')
+  configMap['illuminaFlowCellTypeName'] = configParameters.get(OPENBIS, 'illuminaFlowCellTypeName')
+  configMap['index1Name'] = configParameters.get(OPENBIS, 'index1Name')
+  configMap['index2Name'] = configParameters.get(OPENBIS, 'index2Name')
+  configMap['species'] = configParameters.get(OPENBIS, 'species')
+  configMap['sampleName'] = configParameters.get(OPENBIS, 'sampleName')
+  configMap['operator'] = configParameters.get(OPENBIS, 'operator')
+  configMap['endType'] = configParameters.get(OPENBIS, 'endType')
+  configMap['readLength'] = configParameters.get(OPENBIS, 'readLength')
+  configMap['lengthIndex1'] = configParameters.get(OPENBIS, 'lengthIndex1')
+  configMap['lengthIndex2'] = configParameters.get(OPENBIS, 'lengthIndex2')
+  configMap['gaNumber'] = configParameters.get(OPENBIS, 'gaNumber')
+
+  configMap['hiSeqNames'] = configParameters.get(ILLUMINA, 'hiSeqNames')
+  configMap['hiSeqHeader'] = configParameters.get(ILLUMINA, 'hiSeqHeader')
+
+  return configMap
+
+def sanitizeString(myString):
+  return re.sub('[^A-Za-z0-9]+', '_', myString)
+
+def getVocabulary(vocabularyCode):
+  ''' Returns the vocabulary terms and vocabulary labels of a vocabulary in a dictionary
+      specified by the parameter vocabularyCode
+      '''
+  terms = []
+  vocabularies = service.listVocabularies()
+  vocabularyDict = {}
+  for vocabulary in vocabularies:
+    if (vocabulary.getCode() == vocabularyCode):
+      terms = vocabulary.getTerms()
+  if terms:
+    for term in terms:
+      vocabularyDict[term.getCode()] = term.getLabel()
+  else:
+    print ('No vocabulary found for ' + vocabularyCode)
+  return vocabularyDict
+
+def getFlowCell (illuminaFlowCellTypeName, flowCellName, service, logger):
+  '''
+  Getting the the matching FlowCell
+  '''
+  sc = SearchCriteria();
+  sc.addMatchClause(SearchCriteria.MatchClause.createAttributeMatch(SearchCriteria.MatchClauseAttribute.TYPE, illuminaFlowCellTypeName));
+  sc.addMatchClause(SearchCriteria.MatchClause.createAttributeMatch(SearchCriteria.MatchClauseAttribute.CODE, flowCellName));
+  foundSample = service.searchForSamples(sc)
+  try:
+    assert foundSample.size() == 1
+  except AssertionError:
+    print (str(foundSample.size()) + ' flow cells found which match.')
+    exit(1)
+
+  logger.info('Found ' + foundSample[0].getCode() + ' in openBIS')
+  # Search for contained samples
+  sampleSc = SearchCriteria()
+  sampleSc.addSubCriteria(SearchSubCriteria.createSampleContainerCriteria(sc))
+  foundContainedSamples = service.searchForSamples(sampleSc)
+  
+  return foundSample[0], foundContainedSamples
+
+
+def getParents(sampleName, service):
+  '''
+  Returns a list of parents of a sample 
+  '''
+  sc = SearchCriteria();
+  sc.addMatchClause(SearchCriteria.MatchClause.createAttributeMatch(SearchCriteria.MatchClauseAttribute.CODE, sampleName));
+  foundSample = service.searchForSamples(sc)
+
+  # set the criteria for getting the parents when providing the child name
+  sampleSc = SearchCriteria()
+  sampleSc.addSubCriteria(SearchSubCriteria.createSampleChildCriteria(sc))
+  foundParentSamples = service.searchForSamples(sampleSc)
+
+  return foundParentSamples
+
+def getContainedSampleProperties(logger, containedSamples, service):
+  '''
+  Takes a java.util.ArrayList of contained samples, retrieves the parents and their properties and returns it
+  as a dictionary. The key is the sample name, the value is a dictionary of the properties
+  '''
+  laneParentDict = {}
+
+  for lane in containedSamples:
+    parents = getParents (lane.getCode(), service)
+
+    for parent in parents:
+      parentCode = parent.getCode()
+      parentProperties = parent.getProperties()
+
+      propertyDict = {}
+      for property in parentProperties:
+        propertyDict[property] = parentProperties.get(property)
+
+      propertyDict['LANE'] = lane.getCode()
+      propertyDict['SAMPLE_TYPE'] = parent.getSampleTypeCode()
+      myKey = sanitizeString(lane.getCode() + '_' + parentCode)
+      laneParentDict[myKey] = propertyDict
+
+  logger.info('Found ' + str(len(laneParentDict)) + ' samples on the flow cell.')
+  return laneParentDict
+
+
+def convertSampleToDict(foundFlowCell, configMap):
+  '''
+  converts <type 'ch.systemsx.cisd.openbis.generic.shared.api.v1.dto.Sample'> to a python dict
+  '''
+  flowCellDict = {}
+  fcProperties = foundFlowCell.getProperties()
+  for property in fcProperties:
+    flowCellDict[property] = fcProperties.get(property)
+  flowCellDict['Name'] = foundFlowCell.getIdentifier().split('/')[-1]
+  flowCellDict['CODE'] = foundFlowCell.getCode()
+  return flowCellDict
+
+def getIndex(indx1, indx2, index1ReadLength, indexRead2Length, properties, configMap):
+  if indx1 in properties and index1ReadLength > 0:
+    index = properties[indx1][0:index1ReadLength]
+  else:
+    index = ''
+  if indx2 in properties and indexRead2Length > 0:
+    index = index + configMap['indexSeparator'] + properties[indx2][0:index1ReadLength]
+  return index
+
+
+def getSampleProperties(parentsKey, service, logger):
+  for sample in parentsKey:
+    sampleProperties = sample.getProperties()
+    logger.debug(sample.getSampleTypeCode() + ' ' + sample.getCode())
+    parentSamples = getParents(sample.getCode(), service)
+    for parentSample in parentSamples:
+      logger.debug(parentSample.getSampleTypeCode() + ' ' + parentSample.getCode())
+      parentSampleProperties = parentSample.getProperties()
+
+  return parentSample, parentSampleProperties
+
+
+def createSampleSheetDict(configMap, control, sampleSheetDict, flowCellName, flowCellOperator,
+                           end_type, cycles, lane, gaNumber, index, sample, sampleProperties, library):
+  sampleSheetDict[lane + '_' + sample.getCode()] = [
+    flowCellName + COMMA + lane + COMMA + library.getCode() + COMMA +
+    sampleProperties[configMap['species']] + COMMA + index + COMMA +
+    sanitizeString(sampleProperties[configMap['sampleName']]) + COMMA + control + COMMA +
+    end_type + '_' + cycles + COMMA + flowCellOperator + COMMA + gaNumber]
+
+
+def createHiseqSampleSheet(codeList, propertyDict, laneParentDict, flowCellDict, configMap, service, logger, myoptions):
+  '''
+    Builds up a dictionary with all entries in the Sample Sheet
+  '''
+  control = 'N'
+  # the illlumina pipeline uses always one base less than the sequencer is sequencing
+  demultiplexIndexLengthPenalty = 0
+
+  sampleSheetDict = {}
+  # Making sure this is on the top of the Sample Sheet
+  sampleSheetDict[u'!'] = ([configMap['hiSeqHeader']])
+
+  indx1 = configMap['index1Name']
+  indx2 = configMap['index2Name']
+
+  flowCellName = flowCellDict['CODE']
+  flowCellOperator = flowCellDict[configMap['operator']]
+  end_type = flowCellDict[configMap['endType']]
+  cycles = flowCellDict[configMap['readLength']]
+  index1ReadLength = int(flowCellDict[configMap['lengthIndex1']]) + demultiplexIndexLengthPenalty
+  indexRead2Length = int(flowCellDict[configMap['lengthIndex2']]) + demultiplexIndexLengthPenalty
+
+  for key in laneParentDict.keys():
+    lane = laneParentDict[key]['LANE'][-1:]
+    properties = laneParentDict[key]
+    sampleName = laneParentDict[key]['LIBRARYID']
+
+    # already Library with index
+    if indx1 in properties:
+      gaNumber = laneParentDict[key][configMap['gaNumber']]
+      index = getIndex(indx1, indx2, index1ReadLength, indexRead2Length, properties, configMap)
+      sample, sampleProperties = getSampleProperties(getParents(sampleName, service), service, logger)
+
+      createSampleSheetDict(configMap, control, sampleSheetDict, flowCellName, flowCellOperator,
+                          end_type, cycles, lane, gaNumber, index, sample, sampleProperties)
+    else:
+      for library in getParents(sampleName, service):
+        libraryProperties = library.getProperties()
+        gaNumber = libraryProperties[configMap['gaNumber']]
+        if not gaNumber:
+          logger.warning('No GA number found for ' + library.getCode())
+        index = getIndex(indx1, indx2, index1ReadLength, indexRead2Length, libraryProperties, configMap)
+        if not index:
+          logger.warning('No index found for ' + library.getCode())
+        sample, sampleProperties = getSampleProperties(getParents(library.getCode(), service), service, logger)
+
+        createSampleSheetDict(configMap, control, sampleSheetDict, flowCellName, flowCellOperator,
+                          end_type, cycles, lane, gaNumber, index, sample, sampleProperties, library)
+
+  logger.debug(sampleSheetDict)
+  sortedSampleSheetList = sampleSheetDict.keys()
+  sortedSampleSheetList.sort()
+  writeSampleSheet(flowCellName, sampleSheetDict, sortedSampleSheetList, myoptions, logger, fileName=myoptions.outdir +
+                      configMap['sampleSheetFileName'])
+
+def writeSampleSheet(flowCellName, sampleSheetDict, sortedSampleSheetList, myoptions, logger, fileName):
+  '''
+  Write the given dictionary to a csv file
+  '''
+  newline = lineending[myoptions.lineending]
+  myFile = fileName + '_' + flowCellName + '.csv'
+  try:
+    with open(myFile, 'w') as sampleSheetFile:
+      for listElement in sortedSampleSheetList:
+        if myoptions.verbose:
+          print sampleSheetDict[listElement][0]
+        sampleSheetFile.write(sampleSheetDict[listElement][0] + newline)
+
+      logger.info('Writing file ' + myFile)
+
+  except IOError:
+    logger.error('File error: ' + str(err))
+    print ('File error: ' + str(err))
+
+
+class sampleSheetLine:
+  def __init__(self, fcId='', lane=0, sampleId='', sampleRef='', index='', description='',
+                 control='', recipe='', operator='', sampleProject=''):
+    self.fcId = fcId
+    self.lane = lane
+    self.sampleId = sampleId
+    self.sampleRef = sampleRef
+    self.index = index
+    self.description = description
+    self.control = control
+    self.recipe = recipe
+    self.operator = operator
+    self.sampleProject = sampleProject
+
+  def __repr__(self):
+    return self.__str__()
+      
+  def __str__(self):
+    return str(self.fcId) + COMMA + str(self.lane) + COMMA + str(self.sampleId) + COMMA +  \
+      str(self.sampleRef) + COMMA + str(self.index) + COMMA + str(self.description) + COMMA + \
+      str(self.control) + COMMA + str(self.recipe) + COMMA + str(self.operator) + COMMA + \
+      str(self.sampleProject)
+
+def main():
+  '''
+  Main script
+  '''
+  print datetime.now()
+
+  logger = setUpLogger('log/')
+  logger.info('Started Creation of Sample Sheet...')
+
+  myoptions = parseOptions(logger)
+  if myoptions.debug:
+    logger.setLevel(logging.DEBUG)
+
+  flowCellName = myoptions.flowcell
+  configDict = readConfig(logger)
+  service = login(configDict, logger)
+
+  foundFlowCell, containedSamples = getFlowCell(configDict['illuminaFlowCellTypeName'], flowCellName, service, logger)
+  flowCellName = foundFlowCell.getCode()
+  flowCellDict = convertSampleToDict(foundFlowCell, configDict)
+
+  laneParentDict = getContainedSampleProperties(logger, containedSamples, service)
+  
+  sortedParentList = laneParentDict.keys()
+  sortedParentList.sort()
+  
+  for laneKey in sortedParentList:
+    lane = laneParentDict[laneKey]['LANE']
+    print lane
+    logger.info('Processing Lane '+ lane)
+    codeList, propertyDict = getLaneAncestors(service, lane, logger)
+    #print codeList
+    #print propertyDict
+    
+  #print codeList
+  
+    ssLine = sampleSheetLine (operator=configDict['facilityInstitution'])
+    ssLineList = []
+    
+    for code in codeList:
+      sampleProperties = propertyDict[code]
+      #print sampleProperties
+      
+      
+      # Precedence of the EXTERNAL_SAMPLE_NAME property is:
+      # RAW_SAMPLE -> LIBRARY -> LIBRARY_POOL 
+      try:
+        ssLine.description = sampleProperties['EXTERNAL_SAMPLE_NAME']
+      except:
+        pass
+      
+      if sampleProperties['SAMPLETYPE'] == 'LIBRARY':
+          
+        index = ''
+          
+        if 'INDEX1' in sampleProperties:
+          index = sampleProperties['INDEX1'][:-1]
+        if 'INDEX2' in sampleProperties:
+          index = index + sampleProperties['INDEX2']
+          
+        ssLine.fcId = flowCellName
+        ssLine.lane = lane[-1]
+        ssLine.sampleId = code + '_' + flowCellDict['RUN_FOLDER_NAME'] + '_' + ssLine.lane
+        ssLine.index = index
+        
+        
+      if sampleProperties['SAMPLETYPE'] == 'MASTER_SAMPLE':
+        ssLine.sampleRef = sampleProperties['NCBI_ORGANISM_TAXONOMY'] 
+        
+        # Check if PhiX Control
+        if int(ssLine.sampleRef) == 10847:
+          ssLine.control = 'Y'
+        else:
+          ssLine.control = 'N'
+        
+        ssLine.recipe = flowCellDict['END_TYPE'] + '_' + flowCellDict['CYCLES']
+        ssLine.sampleProject = flowCellDict['RUN_FOLDER_NAME'] + '_' + ssLine.lane
+  
+        ssLineList.append(ssLine)
+        ssLine = sampleSheetLine (operator=configDict['facilityInstitution'])
+  
+    for l in ssLineList: 
+      print l
+
+  logout(service, logger)
+  print datetime.now()
+
+if __name__ == "__main__":
+    main()
-- 
GitLab