From cec4f52e31d22d3b27e8c2d34cc2103f5b3d2036 Mon Sep 17 00:00:00 2001 From: kohleman <kohleman> Date: Wed, 14 Mar 2012 10:58:44 +0000 Subject: [PATCH] cleaner code SVN: 24724 --- .../dist/etc/data-set-handler-alignment.py | 93 ++++++++++++++++--- 1 file changed, 80 insertions(+), 13 deletions(-) diff --git a/deep_sequencing_unit/dist/etc/data-set-handler-alignment.py b/deep_sequencing_unit/dist/etc/data-set-handler-alignment.py index 97fad7cf8a9..52530a1fef8 100755 --- a/deep_sequencing_unit/dist/etc/data-set-handler-alignment.py +++ b/deep_sequencing_unit/dist/etc/data-set-handler-alignment.py @@ -9,15 +9,25 @@ TOTAL_READS, MAPPED_READS Obviously you need a working samtools binary +Uses 'flagstat' and 'view -H' + Note: print statements go to: ~openbis/sprint/datastore_server/log/startup_log.txt ''' import os +import fnmatch +import re from ch.systemsx.cisd.openbis.generic.shared.api.v1.dto import SearchCriteria +from ch.systemsx.cisd.openbis.generic.shared.api.v1.dto import SearchSubCriteria -FOLDER='/net/bs-dsu-data/array0/dsu/dss/incoming-jython-alignment/' +FOLDER='/links/shared/dsu-dss/dss/incoming-jython-alignment/' SAMTOOLS='/usr/local/dsu/samtools/samtools' +BAM_PATTERN='*.bam' + +matches = [] +searchStrings = ['@PG'] +programList = [] # Create a "transaction" -- a way of grouping operations together so they all # happen or none of them do. 
@@ -30,27 +40,76 @@ dataSet.setMeasuredData(False) incomingPath = incoming.getAbsolutePath() # Get the incoming name +# expected: +# Project_110907_SN792_0059_AC012FACXX_3/Sample_BSSE-DSU-1662/BSSE-DSU-1662_CGATGTA_L003_R1_001_sorted.bam name = incoming.getName() -# expected incoming Name, e.g.:ETHZ_BSSE_110429_63558AAXX_1_sorted.bam -split=name.split("_") -sample=split[2]+ '_'+ split[3] + ':' + split[4] +split=name.split('_') +if (len(split) == 6): + incoming_sample=split[1]+ '_'+ split[2] + '_' + split[3] + '_' + split[4]+ ':' + split[-1] +if (len(split) ==4): + incoming_sample=split[1]+ '_'+ split[2] + ':' + split[-1] + + +# Looking for BAMS: +for root, dirnames, filenames in os.walk(FOLDER + name): + for filename in fnmatch.filter(filenames, BAM_PATTERN): + matches.append(os.path.join(root, filename)) + +# ----------------------------------------------------------------------------- + +def listSearch (myList, searchString): + ''' + Searches for a given String in a list. + Only lines matching the start of a line are considered as a match + ''' + matches = [] + for i in range (0, len(myList)): + if(re.match(searchString, myList[i])): + matches.append(myList[i]) + return (matches) + +# ----------------------------------------------------------------------------- + +def programParameters (programList): + ''' + Extracts the aligner details from the bam header + ''' + elements = {} + for program in range(0, len(programList)): + line = programList[program].split('\t') + + for element in range (1, len(line)): + key, value = line[element].split(":") + elements[key] = value + + return elements + # Extract values from a samtools view and set the results as DataSet properties # Command: samtools view -H ETHZ_BSSE_110429_63558AAXX_1_sorted.bam -arguments = SAMTOOLS + ' view -H ' + FOLDER + name -#print('Arguments: '+ arguments) + +arguments = SAMTOOLS + ' view -H ' + matches[0] +print('Arguments: '+ arguments) cmdResult=os.popen(arguments).read() 
-properties=cmdResult.split("\n")[-2].split('\t') -aligner=(properties[1].split(':')[1].upper() + '_' + properties[2].split(':')[1]) -command=properties[3] -arguments = SAMTOOLS + ' flagstat ' + FOLDER + name +properties=cmdResult.split("\n") +for s in range (0, len(searchStrings)): + programList = listSearch (properties, searchStrings[s]) +print(programList) + +e = programParameters (programList) + +dataSet.setPropertyValue("ALIGNMENT_SOFTWARE", e['ID']) +dataSet.setPropertyValue("VERSION", e['VN']) +dataSet.setPropertyValue("ISSUED_COMMAND", e['CL']) + + +arguments = SAMTOOLS + ' flagstat ' + matches[0] + cmdResult=os.popen(arguments).read() totalReads=cmdResult.split('\n')[0].split(' ')[0] mappedReads=cmdResult.split('\n')[2].split(' ')[0] -dataSet.setPropertyValue("ALIGNMENT_SOFTWARE", aligner) -dataSet.setPropertyValue("ISSUED_COMMAND", command) dataSet.setPropertyValue("SAMTOOLS_FLAGSTAT", cmdResult) dataSet.setPropertyValue("TOTAL_READS", totalReads) dataSet.setPropertyValue("MAPPED_READS", mappedReads) @@ -63,8 +122,16 @@ search_service = transaction.getSearchService() # Search for the sample sc = SearchCriteria() -sc.addMatchClause(SearchCriteria.MatchClause.createAttributeMatch(SearchCriteria.MatchClauseAttribute.CODE, sample)); +sc.addMatchClause(SearchCriteria.MatchClause.createAttributeMatch(SearchCriteria.MatchClauseAttribute.CODE, incoming_sample)); foundSamples = search_service.searchForSamples(sc) if foundSamples.size() > 0: dataSet.setSample(foundSamples[0]) + + # Search for parent data set of the same sample + dataSetSc = SearchCriteria() + dataSetSc.addMatchClause(SearchCriteria.MatchClause.createAttributeMatch(SearchCriteria.MatchClauseAttribute.TYPE, 'FASTQ_GZ')) + dataSetSc.addSubCriteria(SearchSubCriteria.createSampleCriteria(sc)) + foundDataSets = search_service.searchForDataSets(dataSetSc) + if foundDataSets.size() > 0: + dataSet.setParentDatasets([ds.getDataSetCode() for ds in foundDataSets]) -- GitLab