Skip to content
Snippets Groups Projects
data-set-handler-alignment.py 4.4 KiB
Newer Older
  • Learn to ignore specific revisions
  • '''
    This is handling bowtie-BAM files and extracts some properties from the BAM header and
    the samtools flagstat command. The results are formatted and attached  as a property
    to the openBIS DataSet.
    Prerequisites are the DataSetType: ALIGNMENT and
    the following properties assigned to the DataSetType mentioned above:
    ALIGNMENT_SOFTWARE, ISSUED_COMMAND, SAMTOOLS_FLAGSTAT,
    TOTAL_READS, MAPPED_READS
    
    Obviously you need a working samtools binary
    
    
    kohleman's avatar
    kohleman committed
    Uses 'flagstat' and 'view -H'
    
    
    Note: 
    print statements go to: ~openbis/sprint/datastore_server/log/startup_log.txt
    '''
    
    import os
    
    kohleman's avatar
    kohleman committed
    import fnmatch
    import re
    
    from ch.systemsx.cisd.openbis.generic.shared.api.v1.dto import SearchCriteria
    
    kohleman's avatar
    kohleman committed
    from ch.systemsx.cisd.openbis.generic.shared.api.v1.dto import SearchSubCriteria
    
    kohleman's avatar
    kohleman committed
    FOLDER='/links/shared/dsu-dss/dss/incoming-jython-alignment/'
    
    SAMTOOLS='/usr/local/dsu/samtools/samtools'
    
    kohleman's avatar
    kohleman committed
    BAM_PATTERN='*.bam'
    
    matches = []
    searchStrings = ['@PG']
    programList = []
    
    
    # Create a "transaction" -- a way of grouping operations together so they all
    # happen or none of them do.
    transaction = service.transaction()
     
    # Create a data set and set type
    dataSet = transaction.createNewDataSet("ALIGNMENT")
    dataSet.setMeasuredData(False)
    
    incomingPath = incoming.getAbsolutePath()
    
    # Get the incoming name 
    
    kohleman's avatar
    kohleman committed
    # expected: 
    # Project_110907_SN792_0059_AC012FACXX_3/Sample_BSSE-DSU-1662/BSSE-DSU-1662_CGATGTA_L003_R1_001_sorted.bam
    
    name = incoming.getName()
    
    kohleman's avatar
    kohleman committed
    split=name.split('_')
    if (len(split) == 6): 
      incoming_sample=split[1]+ '_'+ split[2] + '_' + split[3] + '_' + split[4]+ ':' + split[-1]
    if (len(split) ==4):
      incoming_sample=split[1]+ '_'+ split[2] + ':' + split[-1]
    
    
    # Looking for BAMS:
    for root, dirnames, filenames in os.walk(FOLDER + name):
      for filename in fnmatch.filter(filenames, BAM_PATTERN):
          matches.append(os.path.join(root, filename))
    
    # -----------------------------------------------------------------------------
    
    def listSearch (myList, searchString):
      '''
      Searches for a given String in a list.
      Only lines matching  the start of a line a considerd as a match
      '''
      matches = []
      for i in range (0, len(myList)):
        if(re.match(searchString, myList[i])):
          matches.append(myList[i])
      return (matches)
    
    # -----------------------------------------------------------------------------
    
    def programParameters (programList):
      '''
      Extracts the aligner datils from the bam header
      '''
      elements = {}
      for program in range(0, len(programList)):
        line = programList[program].split('\t')
     
      for element in range (1, len(line)):
        key, value = line[element].split(":")
        elements[key] = value
        
      return elements  
    
    
    
    # Extract values from a samtools view and set the results as DataSet properties 
    # Command: samtools view -H ETHZ_BSSE_110429_63558AAXX_1_sorted.bam
    
    kohleman's avatar
    kohleman committed
    
    arguments = SAMTOOLS + ' view -H ' + matches[0]
    print('Arguments: '+ arguments)
    
    cmdResult=os.popen(arguments).read()
    
    
    kohleman's avatar
    kohleman committed
    properties=cmdResult.split("\n")
    for s in range (0, len(searchStrings)):
      programList = listSearch (properties, searchStrings[s])
    print(programList)  
    
    e = programParameters (programList)  
    
    dataSet.setPropertyValue("ALIGNMENT_SOFTWARE", e['ID'])
    dataSet.setPropertyValue("VERSION", e['VN'])
    dataSet.setPropertyValue("ISSUED_COMMAND", e['CL'])
    
    
    arguments = SAMTOOLS + ' flagstat ' + matches[0]
    
    
    cmdResult=os.popen(arguments).read()
    totalReads=cmdResult.split('\n')[0].split(' ')[0]
    mappedReads=cmdResult.split('\n')[2].split(' ')[0]
    
    dataSet.setPropertyValue("SAMTOOLS_FLAGSTAT", cmdResult)
    dataSet.setPropertyValue("TOTAL_READS", totalReads)
    dataSet.setPropertyValue("MAPPED_READS", mappedReads)
     
    # Add the incoming file into the data set
    transaction.moveFile(incomingPath, dataSet)
    
    # Get the search service
    search_service = transaction.getSearchService()
    
    # Search for the sample
    sc = SearchCriteria()
    
    kohleman's avatar
    kohleman committed
    sc.addMatchClause(SearchCriteria.MatchClause.createAttributeMatch(SearchCriteria.MatchClauseAttribute.CODE, incoming_sample));
    
    foundSamples = search_service.searchForSamples(sc)
    
    if foundSamples.size() > 0:
      dataSet.setSample(foundSamples[0])
    
    kohleman's avatar
    kohleman committed
      
      # Search for parent data set of the same sample
      dataSetSc = SearchCriteria()
      dataSetSc.addMatchClause(SearchCriteria.MatchClause.createAttributeMatch(SearchCriteria.MatchClauseAttribute.TYPE, 'FASTQ_GZ'))
      dataSetSc.addSubCriteria(SearchSubCriteria.createSampleCriteria(sc))
      foundDataSets = search_service.searchForDataSets(dataSetSc)
      if foundDataSets.size() > 0:
        dataSet.setParentDatasets([ds.getDataSetCode() for ds in foundDataSets])