'''
Registers a flow cell sample and one flow lane sample per lane in openBIS,
based on the XML files found in an incoming sequencer run folder.

expected incoming Name for HiSeq2000 runs: 110715_SN792_0054_BC035RACXX
expected incoming Name for GAII runs: 110812_6353WAAXX

Note:
print statements go to: ~openbis/sprint/datastore_server/log/startup_log.txt
'''

import os
from time import *
from datetime import *
import xml.etree.ElementTree as etree
from ch.systemsx.cisd.openbis.generic.shared.api.v1.dto import SearchCriteria

IS_HISEQ_RUN=False
RUNPARAMETERS = 'runParameters.xml'
RUNINFO = 'RunInfo.xml'
FLOWCELL_SPACE='/BSSE_FLOWCELLS/'
FLOWCELL_PROJECT='FLOWCELLS/'
EXPERIMENT_TYPE_CODE='HT_SEQUENCING'

# Mapping between XML file naming and used in here
RUNPARAMETERS_XML = {'FLOWCELL':'Flowcell', 'RTAVERSION':'RTAVersion',
  'CONTROLLANE':'ControlLane', 'SBS':'Sbs', 'INDEX':'Index',
  'CYCLES_REQUESTED_BY_CUSTOMER':'Read1', 'PE':'Pe'}
RUNINFO_XML = {'LANECOUNT':'LaneCount', 'SURFACECOUNT':'SurfaceCount',
  'SWATHCOUNT':'SwathCount', 'TILECOUNT':'TileCount'}
INSTRUMENT = {'SN792':'RUA', 'BS-DSU-ELLAC':'ELLAC'}

class parseXmlFile:
  '''
  Thin wrapper around an ElementTree parsed from the given XML file.
  '''

  def __init__(self, xmlFile):
    self.xmlFile = xmlFile
    self.tree = etree.parse(self.xmlFile)
    self.root = self.tree.getroot()

  def getXmlElement (self, elementName):
    '''
    Returns the text value of a given XML element.

    Each direct child of the root is searched in document order for a direct
    sub-element called elementName; the text of the first match is returned.
    If no child contains such an element the string 'None' is returned.
    '''
    # Iterate the element directly: getchildren() is deprecated and no longer
    # available in newer ElementTree versions.
    for child in self.root:
      element = child.find(elementName)
      if element is not None:
        return element.text
    return 'None'

  def getAllchildren (self, elementName):
    '''
    Finds all elements with the given name anywhere below the root and
    returns them as list (empty if there is no match).
    '''
    # './/' means look recursively for all descendants, not only direct ones.
    # The absolute form '//' is deprecated in ElementTree.findall.
    return self.tree.findall('.//' + elementName)

# -----------------------------------------------------------------------------

def create_openbis_timestamp ():
  '''
  Creates a timestamp of the current local time in the form
  'YYYY-MM-DD HH:MM:SS GMT+HH:MM' to be stored in openBIS.
  '''
  # Local imports: the star imports at the top of the file make the names
  # 'time' and (on newer Python versions) 'timezone' ambiguous.
  import calendar
  import time as _time

  # Derive local time and UTC from the same instant and compare the complete
  # time tuples. Subtracting only the hour fields gives a wrong offset
  # whenever local time and UTC fall on different calendar days.
  now = _time.time()
  offsetSeconds = calendar.timegm(_time.localtime(now)) - calendar.timegm(_time.gmtime(now))
  if offsetSeconds < 0:
    sign = '-'
  else:
    sign = '+'
  hours, remainder = divmod(abs(offsetSeconds), 3600)
  minutes = remainder // 60
  localStamp = datetime.fromtimestamp(now).strftime("%Y-%m-%d %H:%M:%S")
  return localStamp + " GMT%s%02d:%02d" % (sign, hours, minutes)

# -----------------------------------------------------------------------------

# Create a "transaction" -- a way of grouping operations together so they all
# happen or none of them do.
# NOTE: 'service' and 'incoming' are not defined in this file; presumably they
# are injected by the openBIS dropbox framework which runs this script.
transaction = service.transaction()

incomingPath = incoming.getAbsolutePath()

# Get the incoming name
name = incoming.getName()

# HiSeq run folders consist of four '_' separated parts, GAII folders of two
# (see the examples in the module docstring).
nameParts = name.split("_")
IS_HISEQ_RUN = (len(nameParts) == 4)

# Search for the sample and check if there is already sample with this name
search_service = transaction.getSearchService()
sc = SearchCriteria()
sc.addMatchClause(SearchCriteria.MatchClause.createAttributeMatch(SearchCriteria.MatchClauseAttribute.CODE, name))
foundSamples = search_service.searchForSamples(sc)
if foundSamples.size() > 0:
  raise NameError('Already found a Flow Cell with the following name: '+ name)

# Parse the RunInfo.xml file
runInfo = parseXmlFile(incomingPath + '/' + RUNINFO)

# Create a new Flow Cell and set the experiment
newFlowCell = transaction.createNewSample(FLOWCELL_SPACE + name, "ILLUMINA_FLOW_CELL")
# Build the identifier only once so that lookup and creation can never refer
# to two different months.
experimentId = FLOWCELL_SPACE + FLOWCELL_PROJECT + datetime.now().strftime("%Y.%m")
exp = transaction.getExperiment(experimentId)
if exp is None:
  exp = transaction.createNewExperiment(experimentId, EXPERIMENT_TYPE_CODE)
newFlowCell.setExperiment(exp)

if IS_HISEQ_RUN:
  run = runInfo.getAllchildren('Run')[0].attrib
  if (run['Id'] != name):
    raise NameError('Flowcell names do not match between directory name '+ name +
          ' and ' + RUNINFO + ' property file: ' + run['Id'])

  # The HiSeq is providing more infos, which we will parse here:
  runParameters = parseXmlFile(incomingPath + '/' + RUNPARAMETERS)

  newFlowCell.setPropertyValue("ILLUMINA_PIPELINE_VERSION", runParameters.getXmlElement(RUNPARAMETERS_XML['RTAVERSION']))
  newFlowCell.setPropertyValue("FLOWCELLTYPE", runParameters.getXmlElement(RUNPARAMETERS_XML['FLOWCELL']))
  newFlowCell.setPropertyValue("CONTROL_LANE", runParameters.getXmlElement(RUNPARAMETERS_XML['CONTROLLANE']))
  newFlowCell.setPropertyValue("SBS_KIT", runParameters.getXmlElement(RUNPARAMETERS_XML['SBS']))

  read1 = runParameters.getAllchildren('Read1')
  newFlowCell.setPropertyValue("CYCLES_REQUESTED_BY_CUSTOMER", read1[0].text)

  # A 'Read2' value of 0 is treated as a single read run, anything else as
  # paired end.
  read2 = runParameters.getAllchildren('Read2')
  if (str(read2[0].text) == '0'):
    newFlowCell.setPropertyValue("END_TYPE", "SINGLE_READ")
  else:
    newFlowCell.setPropertyValue("END_TYPE", "PAIRED_END")
    newFlowCell.setPropertyValue("PAIRED_END_KIT", runParameters.getXmlElement(RUNPARAMETERS_XML['PE']))

  indexRead = runParameters.getAllchildren('IndexRead')
  newFlowCell.setPropertyValue("INDEXREAD", indexRead[0].text)

  def setFcProperty(searchId, propertyMap):
    '''
    Copies attributes of the first searchId element found in RunInfo.xml to
    the flow cell. propertyMap maps the property code to the name of the
    XML attribute holding its value.
    '''
    attributes = runInfo.getAllchildren(searchId)[0].attrib
    for propertyCode in propertyMap:
      attributeName = propertyMap[propertyCode]
      if (propertyCode != '') and (attributeName != ''):
        newFlowCell.setPropertyValue(propertyCode, attributes[attributeName])

  setFcProperty('FlowcellLayout', RUNINFO_XML)


# The instrument name from RunInfo.xml must be a key of INSTRUMENT,
# otherwise a KeyError aborts the registration.
sequencer = runInfo.getAllchildren('Instrument')
newFlowCell.setPropertyValue("SEQUENCER", INSTRUMENT[sequencer[0].text])

newFlowCell.setPropertyValue("FLOW_CELL_SEQUENCED_ON", create_openbis_timestamp())
if IS_HISEQ_RUN:
  maxLanes = runInfo.getAllchildren('FlowcellLayout')[0].attrib[RUNINFO_XML['LANECOUNT']]
else:
  # Without a FlowcellLayout the number of direct children of the first
  # 'Tiles' element is used as the lane count.
  maxLanes = len(runInfo.getAllchildren('Tiles')[0])

# -----------------------------------------------------------------------------

def registerFlowLane(a_lane):
  '''
  Registers a new Flow lane sample named '<flow cell name>:<a_lane>' and
  sets the new flow cell as its container.
  '''
  newFlowLane = transaction.createNewSample(FLOWCELL_SPACE + name + ':' + str(a_lane), "ILLUMINA_FLOW_LANE")
  newFlowLane.setContainer(newFlowCell)

# Lanes are numbered starting with 1.
for lane in range(1, int(maxLanes) + 1):
  registerFlowLane(lane)