From 1f1bdeedd381eb16e03d493e2741a48d6869f5c7 Mon Sep 17 00:00:00 2001
From: kohleman <kohleman>
Date: Tue, 16 Oct 2012 09:44:54 +0000
Subject: [PATCH] CISDTT-30: Work Package 3: Sample Sheet Creation for Novartis
 NGS Master Data Schema.

SVN: 27199
---
 .../source/Jython/createSampleSheet_nov.py    | 387 ++++++++++++++++++
 1 file changed, 387 insertions(+)
 create mode 100644 deep_sequencing_unit/source/Jython/createSampleSheet_nov.py

diff --git a/deep_sequencing_unit/source/Jython/createSampleSheet_nov.py b/deep_sequencing_unit/source/Jython/createSampleSheet_nov.py
new file mode 100644
index 00000000000..30c8487db4f
--- /dev/null
+++ b/deep_sequencing_unit/source/Jython/createSampleSheet_nov.py
@@ -0,0 +1,387 @@
+'''
+@copyright:
+Copyright 2012 ETH Zuerich, CISD
+ 
+@license:
+Licensed under the Apache License, Version 2.0 (the 'License');
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+ 
+http://www.apache.org/licenses/LICENSE-2.0
+ 
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an 'AS IS' BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+@author:
+Manuel Kohler
+
+@description:
+Creates the SampleSheet.csv out of values from openBIS for Demultiplexing 
+used in the Illumina pipeline (configureBclToFastq.pl) 
+
+@attention:
+Runs under Jython
+
+@note:
+Special characters are replaced with an underscore so that the Illumina script
+does not fail
+
+HiSeq Header Description
+========================
+Column Header  Description
+FCID  Flow cell ID
+Lane  Positive integer, indicating the lane number (1-8)
+SampleID  ID of the sample
+SampleRef  The reference used for alignment for the sample
+Index  Index sequences. Multiple index reads are separated by a hyphen (for example, ACCAGTAA-GGACATGA).
+Description  Description of the sample
+Control  Y indicates this lane is a control lane, N means sample
+Recipe Recipe used during sequencing
+Operator Name or ID of the operator
+SampleProject  The project the sample belongs to
+'''
+
+
+from __future__ import with_statement
+import os
+import logging
+import re
+import sys
+import string
+import smtplib
+from ConfigParser import SafeConfigParser
+from optparse import OptionParser
+from datetime import *
+
+from ch.systemsx.cisd.openbis.dss.client.api.v1 import OpenbisServiceFacadeFactory
+from ch.systemsx.cisd.openbis.generic.shared.api.v1.dto import SearchCriteria
+from ch.systemsx.cisd.openbis.generic.shared.api.v1.dto import SearchSubCriteria
+from java.util import EnumSet
+
+lineending = {'win32':'\r\n', 'linux':'\n', 'mac':'\r'}  # line ending per value of the --lineending option
+COMMA = ','  # column separator used when a sample sheet line is put together
+
+def login(configMap, logger):
+  '''Creates the openBIS service facade from the credentials in configMap and returns it.'''
+  logger.info('Logging into ' + configMap['openbisServer'])
+  try:
+    service = OpenbisServiceFacadeFactory.tryCreate(configMap['openbisUserName'], configMap['openbisPassword'],
+                                                  configMap['openbisServer'], configMap['connectionTimeout'])
+  except:  # bare on purpose: Java exceptions thrown by the facade are no python Exception subclasses
+    # a plain string is not a valid exception object, so the message is wrapped into an Exception
+    raise Exception('Could not connect to ' +  configMap['openbisServer'] + '. Please check if the server ' +
+    'address is OK, the firewall is not blocking the communication or openBIS is down.')
+  
+  return service
+
+def logout (service, logger):  # calls logout() on the given service and logs that this was done
+  service.logout()
+  logger.info('Logged out')
+
+def setUpLogger(logPath, logLevel = logging.INFO):
+  '''Configures the root logging to a time-stamped file below logPath and returns a logger named after it.'''
+  timeStamp = datetime.now().strftime('%Y-%m-%d_%H_%M_%S')
+  logFileName = 'createSampleSheet' + '_' + timeStamp + '.log'
+  # logPath is concatenated as it is, so it has to end with a path separator
+  logging.basicConfig(filename=logPath + logFileName, format='%(asctime)s %(message)s', level=logLevel)
+  return logging.getLogger(logFileName)
+
+def parseConfigurationFile(logger, propertyFile = 'etc/createSampleSheet_nov.properties'):
+  '''
+  Reads the given properties file with a SafeConfigParser and returns the parser.
+  A file which can not be opened is skipped silently by read(), the parser is empty then.
+  '''
+  logger.info('Reading config file ' + propertyFile)
+  parser = SafeConfigParser()
+  parser.read(propertyFile)
+  return parser
+
+def parseOptions(logger):
+  '''Parses the command line and returns the options. Prints the help and exits if no flowcell is given.'''
+  logger.info('Parsing command line parameters')
+  parser = OptionParser(version='%prog 1.0')
+  parser.add_option('-f', '--flowcell',
+                  dest = 'flowcell',
+                  help = 'The flowcell which is used to create the SampleSheet.csv',
+                  metavar = '<flowcell>')
+  parser.add_option('-l', '--lineending',
+                  dest = 'lineending',
+                  type='choice',
+                  action='store',
+                  choices=['win32', 'linux', 'mac'],
+                  default='linux',
+                  help = 'Specify end of line separator: win32, linux, mac. Default: linux' ,
+                  metavar = '<lineending>')
+  parser.add_option('-o', '--outdir',
+                  dest = 'outdir',
+                  default='./',
+                  help = 'Specify the output directory. Default: ./' ,
+                  metavar = '<outdir>')
+  parser.add_option('-d', '--debug',
+                  dest = 'debug',
+                  default=False,
+                  action='store_true',
+                  help = 'Verbose debug logging. Default: False')
+
+  (options, args) = parser.parse_args()
+  if options.flowcell is None:
+    parser.print_help()
+    sys.exit(-1)
+  # an empty --outdir is kept as it is (current directory); indexing it with [-1] raised an IndexError
+  if options.outdir and not options.outdir.endswith('/'):
+    options.outdir = options.outdir + '/'
+  return options
+
+def readConfig(logger):
+  '''
+  Collects every setting of the properties file which is used by this script in one dictionary.
+  The password is fetched with raw=True and the connection timeout is read as an int.
+  '''
+  cfg = parseConfigurationFile(logger)
+  return {
+    'facilityName': cfg.get('GENERAL', 'facilityName'),
+    'facilityNameShort': cfg.get('GENERAL', 'facilityNameShort'),
+    'facilityInstitution': cfg.get('GENERAL', 'facilityInstitution'),
+    'sampleSheetFileName': cfg.get('GENERAL', 'sampleSheetFileName'),
+    'lanePrefix': cfg.get('GENERAL', 'lanePrefix'),
+    'separator': cfg.get('GENERAL', 'separator'),
+    'indexSeparator': cfg.get('GENERAL', 'indexSeparator'),
+    'openbisUserName': cfg.get('OPENBIS', 'openbisUserName'),
+    'openbisPassword': cfg.get('OPENBIS', 'openbisPassword', raw=True),
+    'openbisServer': cfg.get('OPENBIS', 'openbisServer'),
+    'connectionTimeout': cfg.getint('OPENBIS', 'connectionTimeout'),
+    'illuminaFlowCellTypeName': cfg.get('OPENBIS', 'illuminaFlowCellTypeName'),
+    'index1Name': cfg.get('OPENBIS', 'index1Name'),
+    'index2Name': cfg.get('OPENBIS', 'index2Name'),
+    'hiSeqNames': cfg.get('ILLUMINA', 'hiSeqNames'),
+    'hiSeqHeader': cfg.get('ILLUMINA', 'hiSeqHeader'),
+  }
+
+def sanitizeString(myString):
+  return re.compile('[^A-Za-z0-9]+').sub('_', myString)  # every run of other characters becomes one '_'
+
+def getVocabulary(vocabularyCode, service = None):
+  ''' Returns a dictionary mapping the term codes of the vocabulary vocabularyCode to their labels.
+      Pass the service facade in; without it a module level name 'service' is looked up as before. '''
+  if service is None:
+    service = globals()['service']  # legacy lookup, fails with a KeyError if no such global exists
+  terms = []
+  vocabularyDict = {}
+  for vocabulary in service.listVocabularies():
+    if (vocabulary.getCode() == vocabularyCode):
+      terms = vocabulary.getTerms()
+  if terms:
+    for term in terms:
+      vocabularyDict[term.getCode()] = term.getLabel()
+  else:
+    print ('No vocabulary found for ' + vocabularyCode)
+  return vocabularyDict
+
+def getFlowCell (illuminaFlowCellTypeName, flowCellName, service, logger):
+  '''
+  Searches openBIS for the sample of the type illuminaFlowCellTypeName with the code flowCellName.
+  Returns the flow cell sample and the result of the search which uses it as container criteria.
+  Prints a message and exits with status 1 unless exactly one flow cell matches.
+  '''
+  sc = SearchCriteria()
+  sc.addMatchClause(SearchCriteria.MatchClause.createAttributeMatch(SearchCriteria.MatchClauseAttribute.TYPE, illuminaFlowCellTypeName))
+  sc.addMatchClause(SearchCriteria.MatchClause.createAttributeMatch(SearchCriteria.MatchClauseAttribute.CODE, flowCellName))
+  foundSample = service.searchForSamples(sc)
+  # explicit check instead of an assert, which is skipped when assertions are switched off
+  if foundSample.size() != 1:
+    print (str(foundSample.size()) + ' flow cells found which match.')
+    sys.exit(1)
+
+  logger.info('Found ' + foundSample[0].getCode() + ' in openBIS')
+  # Search for contained samples
+  sampleSc = SearchCriteria()
+  sampleSc.addSubCriteria(SearchSubCriteria.createSampleContainerCriteria(sc))
+  foundContainedSamples = service.searchForSamples(sampleSc)
+  return foundSample[0], foundContainedSamples
+
+
+def getParents(sampleName, service):
+  '''
+  Returns the samples found in openBIS which have a child with the code sampleName.
+
+  The child is only described by search criteria and not fetched itself: the result of such
+  a search was never used and only cost a second round trip to the server per call.
+  '''
+  childCriteria = SearchCriteria()
+  childCriteria.addMatchClause(SearchCriteria.MatchClause.createAttributeMatch(SearchCriteria.MatchClauseAttribute.CODE, sampleName))
+
+  # set the criteria for getting the parents when providing the child name
+  parentCriteria = SearchCriteria()
+  parentCriteria.addSubCriteria(SearchSubCriteria.createSampleChildCriteria(childCriteria))
+  return service.searchForSamples(parentCriteria)
+
+def getContainedSampleProperties(containedSamples, service):
+  '''
+  Builds a dictionary with one entry per parent of each contained sample (lane).
+  The key is the sanitized '<parent code>_<lane code>', the value is a dictionary holding the
+  properties of the parent plus the lane code ('LANE') and the parent type ('SAMPLE_TYPE').
+  '''
+  laneParentDict = {}
+
+  for lane in containedSamples:
+    laneCode = lane.getCode()
+
+    for parent in getParents (laneCode, service):
+      parentProperties = parent.getProperties()
+
+      # copy the property map of the parent into a python dictionary
+      propertyDict = {}
+      for propertyName in parentProperties:
+        propertyDict[propertyName] = parentProperties.get(propertyName)
+
+      propertyDict['LANE'] = laneCode
+      propertyDict['SAMPLE_TYPE'] = parent.getSampleTypeCode()
+      laneParentDict[sanitizeString(parent.getCode() + '_' + laneCode)] = propertyDict
+  return laneParentDict
+
+
+def convertSampleToDict(foundFlowCell, configMap):
+  '''
+  Copies the properties of the given sample into a python dict and adds the entries 'Name'
+  (last part of the sample identifier) and 'CODE' (sample code). configMap is not used.
+  '''
+  sampleProperties = foundFlowCell.getProperties()
+  flowCellDict = {}
+  for propertyName in sampleProperties:
+    flowCellDict[propertyName] = sampleProperties.get(propertyName)
+  flowCellDict.update({'Name': foundFlowCell.getIdentifier().split('/')[-1], 'CODE': foundFlowCell.getCode()})
+  return flowCellDict
+
+def getIndex(indx1, indx2, index1ReadLength, indexRead2Length, properties, configMap):
+  '''Returns index 1 cut to index1ReadLength, followed by the separator and index 2 cut to indexRead2Length.'''
+  index = ''  # an index is left out if its property is missing or its read length is not positive
+  if indx1 in properties and index1ReadLength > 0:
+    index = properties[indx1][0:index1ReadLength]
+  if indx2 in properties and indexRead2Length > 0:  # cut with the read length of the second index
+    index = index + configMap['indexSeparator'] + properties[indx2][0:indexRead2Length]
+  return index
+
+
+def getSampleProperties(parentsKey, service, logger):
+  '''Returns the last parent found for the samples in parentsKey together with its properties.'''
+  parentSample = None
+  for sample in parentsKey:
+    logger.debug(sample.getSampleTypeCode() + ' ' + sample.getCode())
+    for parentSample in getParents(sample.getCode(), service):
+      logger.debug(parentSample.getSampleTypeCode() + ' ' + parentSample.getCode())
+  if parentSample is None:  # nothing was found; the returned names were simply unbound in this case
+    raise ValueError('No parent sample found, can not look up the sample properties')
+  return parentSample, parentSample.getProperties()
+
+
+def createSampleSheetDict(configMap, control, sampleSheetDict, flowCellName, flowCellOperator,
+                           end_type, cycles, lane, gaNumber, index, sample, sampleProperties):
+  # one csv line per lane and sample; the columns are in the order of the header description in the module docstring
+  columns = [flowCellName, configMap['lanePrefix'] + lane, sample.getCode(), sampleProperties['SPECIES'], index,
+             sanitizeString(sampleProperties['SAMPLE_NAME']), control, end_type + '_' + cycles, flowCellOperator, gaNumber]
+  sampleSheetDict[lane + '_' + sample.getCode()] = [COMMA.join(columns)]
+
+def createHiseqSampleSheet(laneParentDict, flowCellDict, configMap, service, logger, myoptions):
+  '''
+    Builds a dictionary with the header and one line per lane and sample, then hands it with its sorted keys to writeSampleSheet
+  '''
+  control = 'N'  # written into the Control column of every line
+  # the illumina pipeline uses always one base less than the sequencer is sequencing 
+  demultiplexIndexLengthPenalty = -1
+  
+  sampleSheetDict = {}
+  # '!' sorts before digits and letters, so the header stays on top after the keys are sorted
+  sampleSheetDict[u'!'] = ([configMap['hiSeqHeader']])
+
+  indx1 = configMap['index1Name']
+  indx2 = configMap['index2Name']
+  
+  flowCellName = flowCellDict['CODE']
+  flowCellOperator = flowCellDict['OPERATOR']
+  end_type = flowCellDict['END_TYPE']
+  cycles = flowCellDict['READ_LEN']
+  index1ReadLength = int(flowCellDict['LENGTH_OF_INDEX1']) + demultiplexIndexLengthPenalty
+  indexRead2Length = int(flowCellDict['LENGTH_OF_INDEX2']) + demultiplexIndexLengthPenalty
+
+  for key in laneParentDict.keys():
+    lane = laneParentDict[key]['LANE'][-1:]  # last character of the lane code
+    properties = laneParentDict[key]
+    # key.rsplit('_',2)[0] below drops the last two '_' separated parts of the key
+    # the entry itself already carries the first index (library with index)
+    if indx1 in properties:
+      gaNumber = laneParentDict[key]['GA_NUMBER']
+      index = getIndex(indx1, indx2, index1ReadLength, indexRead2Length, properties, configMap)
+      sample, sampleProperties = getSampleProperties(getParents(key.rsplit('_',2)[0], service), service, logger)
+      
+      createSampleSheetDict(configMap, control, sampleSheetDict, flowCellName, flowCellOperator, 
+                          end_type, cycles, lane, gaNumber, index, sample, sampleProperties)
+    else:
+      # no index on the entry: the index and GA_NUMBER are read from each parent (library) instead
+      for library in getParents(key.rsplit('_',2)[0], service):
+        libraryProperties = library.getProperties()
+        gaNumber = libraryProperties['GA_NUMBER']
+        logger.debug(library.getSampleTypeCode())
+        index = getIndex(indx1, indx2, index1ReadLength, indexRead2Length, libraryProperties, configMap)
+        sample, sampleProperties = getSampleProperties(getParents(library.getCode(), service), service, logger)
+    
+        createSampleSheetDict(configMap, control, sampleSheetDict, flowCellName, flowCellOperator, 
+                          end_type, cycles, lane, gaNumber, index, sample, sampleProperties)
+
+  logger.debug(sampleSheetDict)
+  sortedSampleSheetList = sampleSheetDict.keys()
+  sortedSampleSheetList.sort()
+  writeSampleSheet(flowCellName, sampleSheetDict, sortedSampleSheetList, myoptions, logger, fileName = myoptions.outdir + 
+                      configMap['sampleSheetFileName'])
+
+def writeSampleSheet(flowCellName, sampleSheetDict, sortedSampleSheetList, myoptions, logger, fileName):
+  '''
+  Writes the first list element of each sampleSheetDict entry, ordered by sortedSampleSheetList, to '<fileName>_<flowCellName>.csv'
+  '''
+  newline = lineending[myoptions.lineending]
+  myFile = fileName + '_' + flowCellName + '.csv'
+  try:
+    with open(myFile, 'wb') as sampleSheetFile:  # binary mode, so the chosen line ending is written unchanged
+      for listElement in sortedSampleSheetList:
+        sampleSheetFile.write(sampleSheetDict[listElement][0] + newline)
+      logger.info('Writing file ' + myFile)
+      print ('Written ' + myFile)
+  except IOError:
+    err = sys.exc_info()[1]  # 'err' was never bound, which turned every IOError into a NameError
+    logger.error('File error: ' + str(err))
+    print ('File error: ' + str(err))
+
+
+def main():
+  '''
+  Main script: reads the options and the configuration, looks up the flow cell in openBIS and
+  writes the sample sheet. The logout is done even if the creation of the sheet fails.
+  '''
+  logger = setUpLogger('log/')
+  logger.info('Started Creation of Sample Sheet...')
+
+  myoptions = parseOptions(logger)
+  if myoptions.debug:
+    logger.setLevel(logging.DEBUG)
+
+  flowCellName = myoptions.flowcell
+  configMap = readConfig(logger)
+  service = login(configMap, logger)
+
+  try:
+    foundFlowCell, containedSamples = getFlowCell(configMap['illuminaFlowCellTypeName'], flowCellName, service, logger)
+    flowCellName = foundFlowCell.getCode()
+    flowCellDict = convertSampleToDict(foundFlowCell, configMap)
+    laneParentDict = getContainedSampleProperties(containedSamples, service)
+
+    logger.info('Found ' + str(len(laneParentDict)) + ' samples on the flow cell ' + flowCellName)
+    createHiseqSampleSheet(laneParentDict,flowCellDict, configMap, service, logger, myoptions)
+  finally:
+    # always log out again, also when a lookup raises or getFlowCell exits the script
+    logout(service, logger)
+  print('DONE')
+
+
+if __name__ == "__main__":  # run main() only when the file is executed as a script
+    main()
-- 
GitLab