diff --git a/deep_sequencing_unit/source/core-plugins/illumina-ngs/1/dss/drop-boxes/create-flowcell-hiseq/create-flowcell-hiseq.py b/deep_sequencing_unit/source/core-plugins/illumina-ngs/1/dss/drop-boxes/create-flowcell-hiseq/create-flowcell-hiseq.py new file mode 100644 index 0000000000000000000000000000000000000000..a63f60f88c06e93d7f9acaea991ec988d2940467 --- /dev/null +++ b/deep_sequencing_unit/source/core-plugins/illumina-ngs/1/dss/drop-boxes/create-flowcell-hiseq/create-flowcell-hiseq.py @@ -0,0 +1,194 @@ +''' +@copyright: +2012 ETH Zuerich, CISD + +@license: +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +@description: +Parses the two Illumina provided files 'runParameters.xml' and 'RunInfo.xml' +and creates one Sample of type 'ILLUMINA_FLOW_CELL' and sets Sample properties +from those two XML files. Additionally the number of lanes are read out and +are created as contained samples of type 'ILLUMINA_FLOW_LANE'. 
+ +@note: +print statements go to: <openBIS_HOME>/datastore_server/log/startup_log.txt +expected incoming Name for HiSeq runs: 110715_SN792_0054_BC035RACXX +expected incoming Name for GAII runs: 110812_6353WAAXX + +@author: +Manuel Kohler +''' + +import os +import shutil +from time import * +from datetime import * +import xml.etree.ElementTree as etree +from ch.systemsx.cisd.openbis.generic.shared.api.v1.dto import SearchCriteria + +IS_HISEQ_RUN=False +RUNPARAMETERS = 'runParameters.xml' +RUNINFO = 'RunInfo.xml' +FLOWCELL_SPACE='BSSE_FLOWCELLS' +FLOWCELL_PROJECT='FLOWCELLS' +EXPERIMENT_TYPE_CODE='HT_SEQUENCING' + +FLOWCELL_PROJECT_ID = "/%(FLOWCELL_SPACE)s/%(FLOWCELL_PROJECT)s" % vars() + +# Mapping between XML file naming and used in here +RUNPARAMETERS_XML = {'FLOWCELL':'Flowcell', 'RTAVERSION':'RTAVersion', + 'CONTROLLANE':'ControlLane', 'SBS':'Sbs', 'INDEX':'Index', + 'CYCLES_REQUESTED_BY_CUSTOMER':'Read1', 'PE':'Pe'} +RUNINFO_XML = {'LANECOUNT':'LaneCount', 'SURFACECOUNT':'SurfaceCount', + 'SWATHCOUNT':'SwathCount', 'TILECOUNT':'TileCount'} + +class parseXmlFile: + + def __init__(self, xmlFile): + self.xmlFile = xmlFile + self.tree = etree.parse(self.xmlFile) + self.root = self.tree.getroot() + + def getXmlElement (self, elementName): + ''' + Returns the text value of a given XML element + ''' + for e in self.root.getchildren(): + element = e.find(elementName) + if element is None: + return 'None' + else: + return element.text + + def getAllchildren (self, elementName): + ''' + finds all children of a given XML Element and returns them as list + ''' + for e in self.root.getchildren(): + # the '//' means look recursively for all children not only direct ones + childList = self.tree.findall('//' + elementName) + return childList + +# ----------------------------------------------------------------------------- + +def create_openbis_timestamp (): + ''' + Create an openBIS conform timestamp + ''' + tz=localtime()[3]-gmtime()[3] + d=datetime.now() + return 
d.strftime("%Y-%m-%d %H:%M:%S GMT"+"%+.2d" % tz+":00") + +# ----------------------------------------------------------------------------- + +def registerFlowLane(transaction, a_lane, name, newFlowCell): + ''' + Registers a new Flow lane + ''' + newFlowLane = transaction.createNewSample('/' + FLOWCELL_SPACE + '/' + name + ':' + str(a_lane), "ILLUMINA_FLOW_LANE") + newFlowLane.setContainer(newFlowCell) + +# ----------------------------------------------------------------------------- + +def process(transaction): + + incoming = transaction.getIncoming() + incomingPath = incoming.getAbsolutePath() + + # Get the incoming name + name = incoming.getName() + + split=name.split("_") + if (len(split) == 4): + IS_HISEQ_RUN=True + if (len(split) == 2): + pass + + # Search for the sample and check if there is already sample with this name + search_service = transaction.getSearchService() + sc = SearchCriteria() + sc.addMatchClause(SearchCriteria.MatchClause.createAttributeMatch(SearchCriteria.MatchClauseAttribute.CODE, name)); + foundSamples = search_service.searchForSamples(sc) + if foundSamples.size() > 0: + raise NameError('Already found a Flow Cell with the following name: '+ name) + + # Parse the RunInfo.xml file + runInfo = parseXmlFile(incomingPath + '/' + RUNINFO) + + # Create a new Flow Cell and set the experiment + project = transaction.getProject(FLOWCELL_PROJECT_ID) + if project == None: + space = transaction.getSpace(FLOWCELL_SPACE) + if space == None: + space = transaction.createNewSpace(FLOWCELL_SPACE, None) + space.setDescription("A test space") + project = transaction.createNewProject(FLOWCELL_PROJECT_ID) + project.setDescription("A demo project") + expID = FLOWCELL_PROJECT_ID + '/' + datetime.now().strftime("%Y.%m") + exp = transaction.getExperiment(expID) + if exp == None: + exp = transaction.createNewExperiment(expID, EXPERIMENT_TYPE_CODE) + newFlowCell = transaction.createNewSample('/' + FLOWCELL_SPACE + '/' + name, "ILLUMINA_FLOW_CELL") + 
newFlowCell.setExperiment(exp) + + if IS_HISEQ_RUN: + run = runInfo.getAllchildren('Run')[0].attrib + if (run['Id'] != name): + raise NameError('Flowcell names do not match between directory name '+ name + + ' and ' + RUNINFO + 'property file: ' + run['Id']) + + # The HiSeq is providing more infos, which we will parse here: + runParameters = parseXmlFile(incomingPath + '/' + RUNPARAMETERS) + + newFlowCell.setPropertyValue("ILLUMINA_PIPELINE_VERSION", runParameters.getXmlElement(RUNPARAMETERS_XML['RTAVERSION'])) + newFlowCell.setPropertyValue("FLOWCELLTYPE", runParameters.getXmlElement(RUNPARAMETERS_XML['FLOWCELL'])) + newFlowCell.setPropertyValue("CONTROL_LANE", runParameters.getXmlElement(RUNPARAMETERS_XML['CONTROLLANE'])) + newFlowCell.setPropertyValue("SBS_KIT", runParameters.getXmlElement(RUNPARAMETERS_XML['SBS'])) + + read1 = runParameters.getAllchildren('Read1') + newFlowCell.setPropertyValue("CYCLES_REQUESTED_BY_CUSTOMER", read1[0].text) + + read2 = runParameters.getAllchildren('Read2') + if (str(read2[0].text) == '0'): + newFlowCell.setPropertyValue("END_TYPE", "SINGLE_READ") + else: + newFlowCell.setPropertyValue("END_TYPE", "PAIRED_END") + newFlowCell.setPropertyValue("PAIRED_END_KIT", runParameters.getXmlElement(RUNPARAMETERS_XML['PE'])) + + indexRead1 = runParameters.getAllchildren('IndexRead1') + newFlowCell.setPropertyValue("INDEXREAD", indexRead1[0].text) + + indexRead2 = runParameters.getAllchildren('IndexRead2') + newFlowCell.setPropertyValue("INDEXREAD2", indexRead2[0].text) + + def setFcProperty(searchId, dict): + children = runInfo.getAllchildren(searchId) + for element in (dict): + if (element <> '') and (dict[element] <> ''): + newFlowCell.setPropertyValue(element, children[0].attrib[dict[element]]) + + setFcProperty('FlowcellLayout', RUNINFO_XML) + + sequencer = runInfo.getAllchildren('Instrument') + newFlowCell.setPropertyValue("SEQUENCER", sequencer[0].text) + + newFlowCell.setPropertyValue("FLOW_CELL_SEQUENCED_ON", 
create_openbis_timestamp()) + if IS_HISEQ_RUN: + maxLanes = runInfo.getAllchildren('FlowcellLayout')[0].attrib[RUNINFO_XML['LANECOUNT']] + else: + maxLanes = len(runInfo.getAllchildren('Tiles')[0]) + + [registerFlowLane(transaction, lane, name, newFlowCell) for lane in range(1,int(maxLanes)+1)] + + shutil.rmtree(incomingPath) diff --git a/deep_sequencing_unit/source/core-plugins/illumina-ngs/1/dss/drop-boxes/create-flowcell-hiseq/plugin.properties b/deep_sequencing_unit/source/core-plugins/illumina-ngs/1/dss/drop-boxes/create-flowcell-hiseq/plugin.properties new file mode 100644 index 0000000000000000000000000000000000000000..5f63b57d7e1e47f69260433be02e2072af4e59e1 --- /dev/null +++ b/deep_sequencing_unit/source/core-plugins/illumina-ngs/1/dss/drop-boxes/create-flowcell-hiseq/plugin.properties @@ -0,0 +1,12 @@ +# +# Drop box for creating a flow cell based on runParameters.xml and RunInfo.xml created by an +# Illumina HiSeq +# +# Variables: +# incoming-root-dir +# Path to the directory which contains incoming directories for drop boxes. 
+incoming-dir = /links/shared/dsu/dss/create-flowcell-hiseq +incoming-data-completeness-condition = marker-file +top-level-data-set-handler = ch.systemsx.cisd.etlserver.registrator.api.v2.JythonTopLevelDataSetHandlerV2 +script-path = create-flowcell-hiseq.py +storage-processor = ch.systemsx.cisd.etlserver.DefaultStorageProcessor diff --git a/deep_sequencing_unit/source/core-plugins/illumina-ngs/1/dss/drop-boxes/create-flowcell-miseq/create-flow-cell-miseq.py b/deep_sequencing_unit/source/core-plugins/illumina-ngs/1/dss/drop-boxes/create-flowcell-miseq/create-flow-cell-miseq.py new file mode 100755 index 0000000000000000000000000000000000000000..6188431519df407b9b7749b2a0a4c8d4f58a26ed --- /dev/null +++ b/deep_sequencing_unit/source/core-plugins/illumina-ngs/1/dss/drop-boxes/create-flowcell-miseq/create-flow-cell-miseq.py @@ -0,0 +1,199 @@ +''' +@copyright: +2012 ETH Zuerich, CISD + +@license: +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +@description: +Parses the two Illumina provided files 'RunParameters.xml' and 'RunInfo.xml' +and creates one Sample of type 'ILLUMINA_FLOW_CELL' and sets Sample properties +from those two XML files. Additionally the number of lanes are read out and +are created as contained samples of type 'ILLUMINA_FLOW_LANE'. 
+ +@note: +print statements go to: <openBIS_HOME>/datastore_server/log/startup_log.txt +expected incoming Name for MiSeq runs: 120726_M00721_0011_A000000000-A1FVF + +@author: +Manuel Kohler +''' + +import os +import shutil +from time import * +from datetime import * +import xml.etree.ElementTree as etree +from ch.systemsx.cisd.openbis.generic.shared.api.v1.dto import SearchCriteria + +RUNPARAMETERS = 'RunParameters.xml' +#RUNPARAMETERS = 'runParameters.xml' +RUNINFO = 'RunInfo.xml' +FLOWCELL_SPACE='/BSSE_FLOWCELLS/' +FLOWCELL_PROJECT='FLOWCELLS/' +EXPERIMENT_TYPE_CODE='HT_SEQUENCING' + +# Mapping between XML file naming and openBIS properties + +RUNINFO_XML = {'LANECOUNT':'LaneCount', 'SURFACECOUNT':'SurfaceCount', + 'SWATHCOUNT':'SwathCount', 'TILECOUNT':'TileCount'} + +#------------------------------------------------------------------------------ + +class parseXmlFile: + + def __init__(self, xmlFile): + self.xmlFile = xmlFile + self.tree = etree.parse(self.xmlFile) + self.root = self.tree.getroot() + + def getXmlElement (self, elementName): + ''' + Returns the text value of a given XML element + ''' + for e in self.root.getchildren(): + element = e.find(elementName) + if element is None: + return 'None' + else: + return element.text + + def getAllchildren (self, elementName): + ''' + finds all children of a given XML Element and returns them as list + ''' + for e in self.root.getchildren(): + # the '//' means look recursively for all children not only direct ones + childList = self.tree.findall('//' + elementName) + return childList + +# ----------------------------------------------------------------------------- + +def create_openbis_timestamp (): + ''' + Create an openBIS conform timestamp + ''' + tz=localtime()[3]-gmtime()[3] + d=datetime.now() + return d.strftime("%Y-%m-%d %H:%M:%S GMT"+"%+.2d" % tz+":00") + +# ----------------------------------------------------------------------------- + +def registerFlowLane(a_lane, transaction, name, newFlowCell): + ''' 
+ Registers a new Flow lane + ''' + newFlowLane = transaction.createNewSample(FLOWCELL_SPACE + name + ':' + str(a_lane), "ILLUMINA_FLOW_LANE") + newFlowLane.setContainer(newFlowCell) + +# ----------------------------------------------------------------------------- +def process(transaction): + + incoming = transaction.getIncoming() + incomingPath = incoming.getAbsolutePath() + + name = incoming.getName() + # ['120726', 'M00721', '0011', 'A000000000-A1FVF'] + runDate, MiseqID, runningNumber, trayAndFcId = name.split("_") + tray = trayAndFcId[0] + fcId = trayAndFcId[1:] + + # ----------------------------------------------------------------------------- + + # Search for the sample and check if there is already sample with this name + search_service = transaction.getSearchService() + sc = SearchCriteria() + sc.addMatchClause(SearchCriteria.MatchClause.createAttributeMatch(SearchCriteria.MatchClauseAttribute.CODE, name)); + foundSamples = search_service.searchForSamples(sc) + if foundSamples.size() > 0: + raise NameError('Already found a Flow Cell with the following name: '+ name) + + # Parse the RunInfo.xml file + runInfo = parseXmlFile(incomingPath + '/' + RUNINFO) + + # Create a new Flow Cell and set the experiment + newFlowCell = transaction.createNewSample(FLOWCELL_SPACE + name, "ILLUMINA_FLOW_CELL") + exp = transaction.getExperiment(FLOWCELL_SPACE + FLOWCELL_PROJECT + datetime.now().strftime("%Y.%m")) + if exp == None: + exp = transaction.createNewExperiment(FLOWCELL_SPACE + FLOWCELL_PROJECT + datetime.now().strftime("%Y.%m"), + EXPERIMENT_TYPE_CODE) + newFlowCell.setExperiment(exp) + + run = runInfo.getAllchildren('Run')[0].attrib + if (run['Id'] != name): + raise NameError('Flowcell names do not match between directory name '+ name + + ' and ' + RUNINFO + 'property file: ' + run['Id']) + + runParameters = parseXmlFile(incomingPath + '/' + RUNPARAMETERS) + RTAversion = (runParameters.getAllchildren('RTAVersion'))[0].text + print RTAversion + 
newFlowCell.setPropertyValue("ILLUMINA_PIPELINE_VERSION", RTAversion) + + def setFcProperty(searchId, dict): + children = runInfo.getAllchildren(searchId) + for element in (dict): + if (element <> '') and (dict[element] <> ''): + newFlowCell.setPropertyValue(element, children[0].attrib[dict[element]]) + + # Reading out <FlowcellLayout LaneCount="1" SurfaceCount="1" SwathCount="1" TileCount="12" /> + setFcProperty('FlowcellLayout', RUNINFO_XML) + + sequencer = runInfo.getAllchildren('Instrument') + newFlowCell.setPropertyValue("SEQUENCER", sequencer[0].text) + + readMap = {} + reads = runInfo.getAllchildren('Reads') + read = reads[0].findall('Read') + + for r in read: + cycles = r.get('NumCycles', 'str') + number = r.get('Number', 'str') + isIndexed = r.get('IsIndexedRead', 'str') + readMap[number] = [cycles, isIndexed] + + # example of readMap: {'1': ['151', 'N'], '2': ['8', 'Y'], '3': ['8', 'Y'], '4': ['151', 'N']} + + newFlowCell.setPropertyValue("CYCLES_REQUESTED_BY_CUSTOMER", readMap['1'][0]) + + indexCount = 0 + readCount = 0 + + for e in readMap: + if readMap[e][1] == 'Y': + indexCount += 1 + else: + readCount += 1 + + if indexCount == 2: + newFlowCell.setPropertyValue("END_TYPE", "PAIRED_END") + else: + newFlowCell.setPropertyValue("END_TYPE", "SINGLE_READ") + + try: + newFlowCell.setPropertyValue("INDEXREAD", readMap['2'][0]) + except: + newFlowCell.setPropertyValue("INDEXREAD", '0') + + try: + newFlowCell.setPropertyValue("INDEXREAD2", readMap['3'][0]) + except: + newFlowCell.setPropertyValue("INDEXREAD2", '0') + + newFlowCell.setPropertyValue("FLOW_CELL_SEQUENCED_ON", create_openbis_timestamp()) + + # get the number of lanes + maxLanes = runInfo.getAllchildren('FlowcellLayout')[0].attrib[RUNINFO_XML['LANECOUNT']] + + [registerFlowLane(lane, transaction, name, newFlowCell) for lane in range(1,int(maxLanes)+1)] + + shutil.rmtree(incomingPath) diff --git 
a/deep_sequencing_unit/source/core-plugins/illumina-ngs/1/dss/drop-boxes/create-flowcell-miseq/plugin.properties b/deep_sequencing_unit/source/core-plugins/illumina-ngs/1/dss/drop-boxes/create-flowcell-miseq/plugin.properties new file mode 100644 index 0000000000000000000000000000000000000000..07d7e308bc835999095ab10a409c3a82d7fd41e0 --- /dev/null +++ b/deep_sequencing_unit/source/core-plugins/illumina-ngs/1/dss/drop-boxes/create-flowcell-miseq/plugin.properties @@ -0,0 +1,12 @@ +# +# Drop box for creating a flow cell +# +# Variables: +# incoming-root-dir +# Path to the directory which contains incoming directories for drop boxes. +incoming-dir = ${root}/dss/create-flowcell-miseq +incoming-data-completeness-condition = marker-file +top-level-data-set-handler = ch.systemsx.cisd.etlserver.registrator.api.v2.JythonTopLevelDataSetHandlerV2 +script-path = create-flow-cell-miseq.py +storage-processor = ch.systemsx.cisd.etlserver.DefaultStorageProcessor + diff --git a/deep_sequencing_unit/source/core-plugins/illumina-ngs/1/dss/drop-boxes/read-demultiplex-stats/plugin.properties b/deep_sequencing_unit/source/core-plugins/illumina-ngs/1/dss/drop-boxes/read-demultiplex-stats/plugin.properties new file mode 100644 index 0000000000000000000000000000000000000000..78ce865e67d0ffe1625cd424b9ce9658cb2dbb6e --- /dev/null +++ b/deep_sequencing_unit/source/core-plugins/illumina-ngs/1/dss/drop-boxes/read-demultiplex-stats/plugin.properties @@ -0,0 +1,10 @@ +# +# Variables: +# incoming-root-dir +# Path to the directory which contains incoming directories for drop boxes. 
+incoming-dir = ${root}/dss/read-demultiplex-stats +incoming-data-completeness-condition = marker-file +top-level-data-set-handler = ch.systemsx.cisd.etlserver.registrator.api.v2.JythonTopLevelDataSetHandlerV2 +script-path = read-demultiplex-stats.py +storage-processor = ch.systemsx.cisd.etlserver.DefaultStorageProcessor + diff --git a/deep_sequencing_unit/source/core-plugins/illumina-ngs/1/dss/drop-boxes/read-demultiplex-stats/read-demultiplex-stats.py b/deep_sequencing_unit/source/core-plugins/illumina-ngs/1/dss/drop-boxes/read-demultiplex-stats/read-demultiplex-stats.py new file mode 100755 index 0000000000000000000000000000000000000000..df7c525e37c4de61929203b8c7b7cea8d7649f77 --- /dev/null +++ b/deep_sequencing_unit/source/core-plugins/illumina-ngs/1/dss/drop-boxes/read-demultiplex-stats/read-demultiplex-stats.py @@ -0,0 +1,538 @@ +''' + @copyright: 2012 ETH Zuerich, CISD + + @license: + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+ + +@author: Manuel Kohler + +XML Structur which is processed: + +<?xml version="1.0"?> +<Summary> + <Lane index="8"> + <Sample index="lane8"> + <Barcode index="Undetermined"> + <Tile index="1101"> + <Read index="1"> + <Raw> + <Yield>1921250</Yield> + <YieldQ30>949680</YieldQ30> + <ClusterCount>38425</ClusterCount> + <ClusterCount0MismatchBarcode>0</ClusterCount0MismatchBarcode> + <ClusterCount1MismatchBarcode>0</ClusterCount1MismatchBarcode> + <QualityScoreSum>40995660</QualityScoreSum> + </Raw> + <Pf> + <Yield>945450</Yield> + <YieldQ30>854815</YieldQ30> + <ClusterCount>18909</ClusterCount> + <ClusterCount0MismatchBarcode>0</ClusterCount0MismatchBarcode> + <ClusterCount1MismatchBarcode>0</ClusterCount1MismatchBarcode> + <QualityScoreSum>33815505</QualityScoreSum> + </Pf> + </Read> + </Tile> + [...] + +@note: +print statements go to: <openBIS_HOME>/datastore_server/log/startup_log.txt +''' + +import time +import os +import fnmatch +import xml.etree.ElementTree as etree +from ch.systemsx.cisd.openbis.generic.shared.api.v1.dto import SearchCriteria +from ch.systemsx.cisd.openbis.generic.shared.api.v1.dto import SearchSubCriteria + + +class parseXmlFile: + + def __init__(self, xmlFile): + self.xmlFile = xmlFile + self.tree = etree.parse(self.xmlFile) + self.root = self.tree.getroot() + +# ----------------------------------------------------------------------------- + +class qcValues(object): + def __init__(self, Yield = 0, YieldQ30 = 0, ClusterCount = 0, + ClusterCount0MismatchBarcode = 0, ClusterCount1MismatchBarcode = 0, + QualityScoreSum = 0, *args, **kwargs): + self.Yield = Yield + self.YieldQ30 = YieldQ30 + self.ClusterCount = ClusterCount + self.ClusterCount0MismatchBarcode = ClusterCount0MismatchBarcode + self.ClusterCount1MismatchBarcode = ClusterCount1MismatchBarcode + self.QualityScoreSum = QualityScoreSum + + def __str__(self): + return "Yield: %s, YieldQ30: %s, ClusterCount: %s, ClusterCount0MismatchBarcode: %s," \ + " CusterCount1MismatchBarcode: %s, 
QualityScoreSum: %s" \ + % (self.Yield, self.YieldQ30, self.ClusterCount, self.ClusterCount0MismatchBarcode, + self.ClusterCount1MismatchBarcode, self.QualityScoreSum) + +class sample: + def __init__(self, Lane = 0, Sample = '', Barcode = '', Tile = '', Read = '', rawqc = qcValues([]), + pfqc = qcValues([]), *args, **kwargs): + self.Lane = Lane + self.Sample = Sample + self.Barcode = Barcode + self.Tile = Tile + self.Read = Read + self.rawqc = rawqc + self.pfqc = pfqc + + def __str__(self): + return "Lane: %s, Sample: %s, Barcode: %s, Tile: %s, Read: %s, rawqc: %s, pfqc: %s" \ + % (self.Lane, self.Sample, self.Barcode, self.Tile, self.Read, self.rawqc, self.pfqc) + +# ----------------------------------------------------------------------------- + +class Statistics: + def __init__(self, lane = 0, sampleName = "", index1 = "NoIndex", index2 = "NoIndex", pfYieldSum = 0, + rawYieldSum = 0, pfPercentage = 0.0, rawReadsSum = 0, pfReadsSum = 0, + pfYieldQ30Sum = 0, qualityScoreSum = 0, rawPercentageReadsPerLane = 0.0, + pfYieldQ30Percentage = 0.0, pfsumQualityScore = 0, pfmeanQualityScore = 0.0): + self.lane = lane + self.sampleName = sampleName + self.index1 = index1 + self.index2 = index2 + self.pfYieldSum = pfYieldSum + self.rawYieldSum = rawYieldSum + self.pfPercentage = pfPercentage + self.rawReadsSum = rawReadsSum + self.pfReadsSum = pfReadsSum + self.pfYieldQ30Sum = pfYieldQ30Sum + self.qualityScoreSum = qualityScoreSum + self.rawPercentageReadsPerLane = rawPercentageReadsPerLane + self.pfYieldQ30Percentage = pfYieldQ30Percentage + self.pfsumQualityScore = pfsumQualityScore + self.pfmeanQualityScore = pfmeanQualityScore + + def __str__(self): + return "lane: %s, sampleName: %s, index1: %s, index2: %s, pfYieldSum: %s, pfPercentage: %s," \ + " rawReadsSum: %s, pfReadsSum: %s," \ + " rawPercentageReadsPerLane: %s, pfYieldQ30Percentage: %s," \ + " pfmeanQualityScore: %s" \ + % (self.lane, self.sampleName, self.index1, self.index2, self.pfYieldSum, self.pfPercentage, + 
self.rawReadsSum, self.pfReadsSum, + self.rawPercentageReadsPerLane, self.pfYieldQ30Percentage, self.pfmeanQualityScore) + + def calculatePercentagePF (self, rawYield = 0, pfYield = 1): + try: + return round(float(pfYield) / float(rawYield) * 100, 2) + except: + return 0.0 + + def calulateMeanQualityScore (self, pfqualityScoreSum = 0, pfYield = 1): + try: + return round (float(pfqualityScoreSum) / float(pfYield), 2) + except: + return 0.0 + + def calculateYieldQ30Percentage (self, pfYieldQ30 = 0, pfYield = 1): + try: + return round (float(pfYieldQ30) / float(pfYield) * 100, 2) + except: + return 0.0 + +# ----------------------------------------------------------------------------- + +def xml2Memory(DEMULTIPLEX_XML): + ''' + Parse the XML file and put all values in a memory structure: + List of: + lane, sample, barcode, tile, read, qcRawList, qcPfList + ''' + + RAW_TAG = "Raw" + PF_TAG = "Pf" + + sampleList = [] + + xml = parseXmlFile(DEMULTIPLEX_XML) + r = xml.tree.getroot() + + for lane in r.getchildren(): + for mysample in lane: + for barcode in mysample: + for tile in barcode: + for read in tile: + + qcRaw = qcValues() + qcPf = qcValues() + qcRawList = [] + qcPfList = [] + + # Read out the Raw fields + raw = read.find(RAW_TAG) + for child in raw.getchildren(): + # equivalent to a Java reflection + setattr(qcRaw, child.tag, int(child.text)) + + # Read out the Pf fields + pf = read.find(PF_TAG) + for child in pf.getchildren(): + # equivalent to a Java reflection + setattr(qcPf, child.tag, int(child.text)) + + qcRawList.append(qcRaw) + qcPfList.append(qcPf) + + singleElement = sample () + + setattr(singleElement, lane.tag, lane.attrib) + setattr(singleElement, mysample.tag, mysample.attrib) + setattr(singleElement, barcode.tag, barcode.attrib) + setattr(singleElement, tile.tag, tile.attrib) + setattr(singleElement, read.tag, read.attrib) + singleElement.rawqc = qcRawList + singleElement.pfqc = qcPfList + + sampleList.append(singleElement) + return sampleList + +# 
----------------------------------------------------------------------------- + +def calculateStatistics(listofSamples): + ''' + Structure of 'listofSamples' + Lane: {'index': '6'}, Sample: {'index': 'BSSE-QGF-3524_C0NKPACXX'}, Barcode: {'index': 'TGACCA'}, + Tile: {'index': '2307'}, Read: {'index': '1'}, rawqc:<mem>, pfqc:<mem> + ''' + + numberOfTiles = len(listofSamples) + + tile = sample() + raw = qcValues () + pf = qcValues () + stats = Statistics() + + for tile in listofSamples: + raw = tile.rawqc[0] + pf = tile.pfqc[0] + + stats.pfYieldSum += pf.Yield + stats.rawYieldSum += raw.Yield + stats.rawReadsSum += raw.ClusterCount + stats.pfReadsSum += pf.ClusterCount + stats.pfYieldQ30Sum += pf.YieldQ30 + stats.qualityScoreSum += pf.QualityScoreSum + + # Can not be set here, needs to be calculated later + #stats.rawPercentageReadsPerLane = rawPercentageReadsPerLane + stats.pfPercentage = stats.calculatePercentagePF(stats.rawYieldSum, stats.pfYieldSum) + stats.pfYieldQ30Percentage = stats.calculateYieldQ30Percentage(stats.pfYieldQ30Sum, stats.pfYieldSum) + stats.pfmeanQualityScore = stats.calulateMeanQualityScore(stats.qualityScoreSum, stats.pfYieldSum) + stats.lane = listofSamples[0].Lane.values()[0] + stats.sampleName = listofSamples[0].Sample.values()[0] + index = listofSamples[0].Barcode.values()[0] + try: + stats.index1, stats.index2 = index.split("-") + except: + stats.index1 = index + return stats + +# ----------------------------------------------------------------------------- + + +def rawReadSumPerSamples(stat): + ''' + Creates a dictionary with the lanes as keys + The values are a list where the elements are a dictionary again. + This dictionary has the sample names as key and the RawReadSum as value. 
+ + Example: + {4': [{'BSSE-QGF-3434_C0NKPACXX': 248999502}], '7': [{'lane7': 123921974}, + {'BSSE-QGF-3527_C0NKPACXX': 38587703}, {'BSSE-QGF-3529_C0NKPACXX': 30130893}, + {'BSSE-QGF-3528_C0NKPACXX': 34519296}, {'BSSE-QGF-3526_C0NKPACXX': 34980179}]} + ''' + + laneDict = {} + for e in stat: + if e.lane not in laneDict: + laneDict[e.lane] = [{e.sampleName:e.rawReadsSum}] + else: + laneDict[e.lane].append({e.sampleName:e.rawReadsSum}) + return laneDict + +# ----------------------------------------------------------------------------- + +def createSumRawReadsPerLane(laneDict): + ''' + Creates a dictionary with lane as key and sum of Raw Reads as value: + {'1': 183180877, '3': 244968562, '2': 191496395, '5': 193466239, '4': 248999502, + '7': 262140045, '6': 257136830, '8': 209948449} + ''' + sumRawReadsDict = {} + for lane in laneDict: + sumRawReads = 0 + for sampleNameDict in laneDict[lane]: + sumRawReads += sampleNameDict.values()[0] + + sumRawReadsDict[lane] = sumRawReads + return sumRawReadsDict + +# ----------------------------------------------------------------------------- + +def createPercentagePerLane(laneDict, sumRawReadsDict): + ''' + Creates a dictionary with the sample Name as key and the percentage of raw reads related to + all reads in the same lane + {'lane7': 47.27, 'BSSE-QGF-3433_C0NKPACXX': 100.0, 'BSSE-QGF-3666_C0NKPACXX': 54.12} + ''' + + relRawReadsDict = {} + for lane in laneDict: + for sampleName in laneDict[lane]: + relRawReadsDict[sampleName.keys()[0]] = round(float(sampleName.values()[0]) / + float(sumRawReadsDict[lane]) * 100, 2) + return relRawReadsDict + +# ----------------------------------------------------------------------------- + +def locate(pattern, root): + '''Locate all files matching supplied filename pattern in and below + supplied root directory.''' + for path, dirs, files in os.walk(os.path.abspath(root)): + for filename in fnmatch.filter(files, pattern): + yield os.path.join(path, filename) + +# 
-----------------------------------------------------------------------------

def getVocabulary(transaction, vocabularyCode):
    # Returns the sorted list of term codes of the given openBIS vocabulary.
    # If the vocabulary does not exist, an empty (still sorted) list is returned.

    vocabularyTermList = []
    vocabulary = transaction.getSearchService().searchForVocabulary(vocabularyCode)
    if (vocabulary is None):
        print 'VOCABULARY %s does not exist' % (vocabularyCode)
    else:
        print "Getting VOCABULARY: " + vocabulary.getCode()
        for term in vocabulary.getTerms():
            vocabularyTermList.append(term.getCode())
    vocabularyTermList.sort()
    return vocabularyTermList

# -----------------------------------------------------------------------------

def getFlowCellMetaData (transaction,flowCellId):
    # Looks up the flow cell sample whose code matches 'flowCellId' and returns
    # a tuple (fcPropertiesDict, fcPropertyTypes):
    #   fcPropertiesDict  - property simple code -> property value
    #   fcPropertyTypes   - sorted list of the property simple codes

    # NOTE(review): this inner helper is never called in this function — looks
    # like leftover code; confirm before removing.
    def sortedDictValues(adict):
        keys = adict.keys()
        keys.sort()
        return map(adict.get, keys)

    search = transaction.getSearchService()
    sc = SearchCriteria()
    print('Searching FlowCell: '+ str(flowCellId))
    sc.addMatchClause(SearchCriteria.MatchClause.createAttributeMatch(SearchCriteria.MatchClauseAttribute.CODE, flowCellId));
    foundFlowCells = search.searchForSamples(sc)

    # Deliberately only warns (does not abort) when the match is not unique;
    # the first hit is used below either way. NOTE(review): zero hits would
    # raise an IndexError at foundFlowCells[0] — confirm this is acceptable.
    try:
        assert foundFlowCells.size() == 1
    except AssertionError:
        print (str(foundFlowCells.size()) + \
              ' flow cells found which match the criterias: '+ flowCellId)

    fcPropertiesDict = {}
    fcPropertyTypes = []

    fcProperties = foundFlowCells[0].getSample().getProperties()
    for property in fcProperties:
        code = property.getPropertyType().getSimpleCode()
        fcPropertyTypes.append(code)
        fcPropertiesDict[code] = property.getValue()

    fcPropertyTypes.sort()
    return fcPropertiesDict, fcPropertyTypes


# -----------------------------------------------------------------------------

def process(transaction):
    '''
    Main
    '''

    # Dropbox entry point: matches the demultiplex statistics from
    # Flowcell_demux_summary.xml against the registered FASTQ_GZ data sets
    # of the flow cell named like the incoming directory.
    FASTQ_DATA_SET_TYPE='FASTQ_GZ'
    DEMUX_FILE='Flowcell_demux_summary.xml'
    NO_INDEX='NOINDEX'
    UNDETERMINED='UNDETERMINED'

    incomingPath = transaction.getIncoming().getPath()
    name = transaction.getIncoming().getName()

    # Timestamp goes to the datastore server startup log
    print('\n'+time.ctime())

    fcPropertiesDict, fcPropertyTypes = 
getFlowCellMetaData(transaction, name) + print fcPropertiesDict + print fcPropertyTypes + + search_service = transaction.getSearchService() + + FileGenerator= locate(DEMUX_FILE, incomingPath) + DEMULTIPLEX_XML = FileGenerator.next() + + sampleList = xml2Memory(DEMULTIPLEX_XML) + + sa = sample() + sampleDict = {} + + # key = sample name, value = sample() + for element in range(0, len(sampleList)): + sa = sampleList[element] + # Check if new sample + if (sa.Sample is not sampleList[element - 1].Sample): + sampleName = sa.Sample.values()[0] + sampleDict[sampleName] = [sa] + else: + sampleDict[sampleName].append(sa) + + stat = [calculateStatistics(sampleDict[mysample]) for mysample in sampleDict] + + # calculate the relative amount of reads per index + laneDict = rawReadSumPerSamples(stat) + sumRawReadsDict = createSumRawReadsPerLane(laneDict) + relRawReadsDict = createPercentagePerLane(laneDict, sumRawReadsDict) + + # set the values in the object + for mye in stat: + mye.rawPercentageReadsPerLane = relRawReadsDict[mye.sampleName] + + def sampleSearch(Code=''): + sc = SearchCriteria() + numberOfLanes = 0 + sc.addMatchClause(SearchCriteria.MatchClause.createAttributeMatch(SearchCriteria.MatchClauseAttribute.CODE, Code)); + search_service = transaction.getSearchService() + foundSample = search_service.searchForSamples(sc) + if foundSample.size() > 0: + # Search for contained samples + sampleSc = SearchCriteria() + sampleSc.addSubCriteria(SearchSubCriteria.createSampleContainerCriteria(sc)) + foundContainedSamples = search_service.searchForSamples(sampleSc) + numberOfLanes = foundContainedSamples.size() + return foundSample, foundContainedSamples, numberOfLanes + +#-------------------------------------------------------------------------------------------------------------------------------------- + + def searchDataSetsofSample(sample, index1, index2, DATA_SET_TYPE): + sc = SearchCriteria() + 
sc.addMatchClause(SearchCriteria.MatchClause.createAttributeMatch(SearchCriteria.MatchClauseAttribute.CODE, sample)); + search_service = transaction.getSearchService() + foundSample = search_service.searchForSamples(sc) + + dataSetSc = SearchCriteria() + # set the Search Criteria to an OR condition, default is AND + #dataSetSc.setOperator(SearchCriteria.SearchOperator.MATCH_ANY_CLAUSES) + dataSetSc.addMatchClause(SearchCriteria.MatchClause.createAttributeMatch(SearchCriteria.MatchClauseAttribute.TYPE, DATA_SET_TYPE)) + dataSetSc.addMatchClause(SearchCriteria.MatchClause.createPropertyMatch("BARCODE", index1 )) + dataSetSc.addMatchClause(SearchCriteria.MatchClause.createPropertyMatch("INDEX2", index2)) + dataSetSc.addSubCriteria(SearchSubCriteria.createSampleCriteria(sc)) + foundDataSets = search_service.searchForDataSets(dataSetSc) + print "foundDataSets.size() "+ str(foundDataSets.size()) + for ds in foundDataSets: + print "Index1 for found Data Set" + ds.getDataSetCode() + " " + ds.getPropertyValue('BARCODE') + print "Index2 for found Data Set" + ds.getDataSetCode() + " " + ds.getPropertyValue('INDEX2') + + return foundDataSets + +#-------------------------------------------------------------------------------------------------------------------------------------- + + def getIndexesofDataSetsofSample(sample, DATA_SET_TYPE): + + index1List = [] + index2List = [] + sc = SearchCriteria() + sc.addMatchClause(SearchCriteria.MatchClause.createAttributeMatch(SearchCriteria.MatchClauseAttribute.CODE, sample)); + search_service = transaction.getSearchService() + foundSample = search_service.searchForSamples(sc) + + dataSetSc = SearchCriteria() + dataSetSc.addMatchClause(SearchCriteria.MatchClause.createAttributeMatch(SearchCriteria.MatchClauseAttribute.TYPE, DATA_SET_TYPE)) + dataSetSc.addSubCriteria(SearchSubCriteria.createSampleCriteria(sc)) + foundDataSets = search_service.searchForDataSets(dataSetSc) + for ds in foundDataSets: + 
index1List.append(ds.getPropertyValue('BARCODE')) + index2List.append(ds.getPropertyValue('INDEX2')) + return index1List, index2List + + + flowcell, lanes, numberOfLanes = sampleSearch(name) + + index1Length = fcPropertiesDict['INDEXREAD'] + index2Length = fcPropertiesDict['INDEXREAD2'] + + for mystat in stat: + laneCode = flowcell[0].getCode() + ":" + mystat.lane + searchIndex1 = mystat.index1.upper() + searchIndex2 = mystat.index2.upper() + print '\n' + print mystat + + index1List, index2List = getIndexesofDataSetsofSample(laneCode, FASTQ_DATA_SET_TYPE) + print "Searching for "+ searchIndex1 + " in " + str(index1List) + print "Searching for "+ searchIndex2 + " in " + str(index2List) + + if searchIndex1 not in (NO_INDEX): + if searchIndex1 not in (UNDETERMINED): + if index1Length > 7: + searchIndex1 = [ index1 for index1 in index1List if searchIndex1 in index1] + else: + searchIndex1 = [ index1 for index1 in index1List if searchIndex1 in index1[:-1]] + try: + searchIndex1 = searchIndex1[0] + except: + searchIndex1 = 'MISSING' + else: + searchIndex1 = NO_INDEX + if searchIndex2 not in (NO_INDEX): + if searchIndex2 not in (UNDETERMINED): + if index2Length > 7: + searchIndex2 = [ index2 for index2 in index2List if searchIndex2 in index2] + else: + searchIndex2 = [ index2 for index2 in index2List if searchIndex2 in index2[:-1]] + try: + searchIndex2 = searchIndex2[0] + except: + searchIndex1 = 'MISSING' + else: + searchIndex2 = NO_INDEX + + print "searchIndex1 " + str(searchIndex1) + print "searchIndex2 " + str(searchIndex2) + + # Search for a data set with those two indices + DataSet = searchDataSetsofSample(laneCode, searchIndex1, searchIndex2, FASTQ_DATA_SET_TYPE) + try: + assert DataSet.size() == 1 + except AssertionError: + print (str(DataSet.size()) + ' data sets found which match the criterias: '+ + str(laneCode), searchIndex1, searchIndex2) + continue + + sa = transaction.getDataSetForUpdate(DataSet[0].getDataSetCode()) + sa.setPropertyValue('YIELD_MBASES', 
str(mystat.pfYieldSum)) + sa.setPropertyValue('RAW_YIELD_MBASES', str(mystat.rawYieldSum)) + sa.setPropertyValue('PERCENTAGE_PASSED_FILTERING',str(mystat.pfPercentage)) + sa.setPropertyValue('PF_READS_SUM',str(mystat.pfReadsSum)) + sa.setPropertyValue('RAW_READS_SUM',str(mystat.rawReadsSum)) + sa.setPropertyValue('PERCENTAGE_RAW_CLUSTERS_PER_LANE', str(mystat.rawPercentageReadsPerLane)) + sa.setPropertyValue('PFYIELDQ30PERCENTAGE', str(mystat.pfYieldQ30Percentage)) + sa.setPropertyValue('PFMEANQUALITYSCORE', str(mystat.pfmeanQualityScore)) + + print "Modified data sets properties of: " + DataSet[0].getDataSetCode() + + print "DONE" diff --git a/deep_sequencing_unit/source/core-plugins/illumina-ngs/1/dss/drop-boxes/read-rta-timestamp/plugin.properties b/deep_sequencing_unit/source/core-plugins/illumina-ngs/1/dss/drop-boxes/read-rta-timestamp/plugin.properties new file mode 100644 index 0000000000000000000000000000000000000000..6074eab9e6cb9275cf225adb27ffed65ab068a01 --- /dev/null +++ b/deep_sequencing_unit/source/core-plugins/illumina-ngs/1/dss/drop-boxes/read-rta-timestamp/plugin.properties @@ -0,0 +1,11 @@ +# +# Drop box for registering a flow cell output as a data set +# +# Variables: +# incoming-root-dir +# Path to the directory which contains incoming directories for drop boxes. 
MarkerGAComplete = 'RTAComplete.txt'
MarkerHiSeqComplete = 'RTAComplete.txt'

def createOpenbisTimeStamp(file):
    '''
    Returns the modification time of `file` as an openBIS-compatible
    time stamp string, e.g. "2012-07-26 13:05:00 GMT+02:00".
    '''
    mtime = os.path.getmtime(file)
    lt = localtime(mtime)
    # BUGFIX: the old calculation (localtime().tm_hour - gmtime().tm_hour)
    # wraps around midnight (e.g. local 00h vs UTC 23h gave -23 instead of
    # +1).  Derive the UTC offset from the time module's zone constants,
    # honouring DST for the file's own time stamp.
    if daylight and lt.tm_isdst:
        offsetSeconds = -altzone
    else:
        offsetSeconds = -timezone
    tz = offsetSeconds // 3600
    return strftime("%Y-%m-%d %H:%M:%S GMT" + "%+.2d" % tz + ":00", lt)

# -----------------------------------------------------------------------------

def process(transaction):
    '''
    Reads the RTAComplete.txt marker of a finished run, stamps the matching
    flow cell sample's SEQUENCER_FINISHED property with the marker's
    modification time and removes the incoming folder.
    '''
    incomingPath = transaction.getIncoming().getAbsolutePath()
    name = transaction.getIncoming().getName()

    split = name.split("_")
    # BUGFIX: default to None -- an unexpected folder name previously left
    # Markerfile undefined and raised a NameError below.
    Markerfile = None
    if len(split) == 4:
        # HiSeq run, e.g. 110715_SN792_0054_BC035RACXX
        Markerfile = incomingPath + "/" + MarkerHiSeqComplete
    if len(split) == 2:
        # GAII run, e.g. 110812_6353WAAXX
        Markerfile = incomingPath + "/" + MarkerGAComplete

    # Search for the sample and check if there is already a sample with this name.
    search_service = transaction.getSearchService()
    sc = SearchCriteria()
    sc.addMatchClause(SearchCriteria.MatchClause.createAttributeMatch(
        SearchCriteria.MatchClauseAttribute.CODE, name))
    foundSamples = search_service.searchForSamples(sc)

    if foundSamples.size() > 0 and Markerfile is not None:
        sa = transaction.getSampleForUpdate(foundSamples[0].getSampleIdentifier())
        sa.setPropertyValue("SEQUENCER_FINISHED", createOpenbisTimeStamp(Markerfile))

    shutil.rmtree(incomingPath)
def process(transaction):
    '''
    Registers the incoming base-call statistics folder as a BASECALL_STATS
    data set and attaches it to the flow cell sample whose code equals the
    incoming folder name (e.g. 110715_SN792_0054_BC035RACXX).
    '''
    incomingPath = transaction.getIncoming().getPath()
    incomingFolder = transaction.getIncoming().getName()

    # Create a data set and set type.
    dataSet = transaction.createNewDataSet("BASECALL_STATS")
    dataSet.setMeasuredData(False)

    dataSet.setPropertyValue("MISMATCH_IN_INDEX", "NONE")

    # Get the search service.
    search_service = transaction.getSearchService()

    # Add the incoming file into the data set.
    transaction.moveFile(incomingPath, dataSet)

    # BUGFIX: search by the folder NAME (which is the sample code), not by
    # the full incoming PATH -- matching on the path can never find the
    # flow cell sample, so the data set was always left unattached.
    sc = SearchCriteria()
    sc.addMatchClause(SearchCriteria.MatchClause.createAttributeMatch(
        SearchCriteria.MatchClauseAttribute.CODE, incomingFolder))
    foundSamples = search_service.searchForSamples(sc)

    if foundSamples.size() > 0:
        dataSet.setSample(foundSamples[0])
FOLDER = '/links/shared/dsu/dss/register-bigwig/'
BIGWIGINFO = '/links/application/dsu/bigWig/bigWigInfo '
BW_PATTERN = '*.bw'

# -----------------------------------------------------------------------------

def listSearch(myList, searchString):
    '''
    Returns the elements of `myList` whose beginning matches `searchString`
    (re.match anchors the pattern at the start of each string).
    '''
    return [element for element in myList if re.match(searchString, element)]

# -----------------------------------------------------------------------------

def translateBoolean(value):
    '''Maps 'yes'/'y' (any case) to True, everything else to False.'''
    return value.lower() in ('yes', 'y')

def sanitizeInt(intNumber):
    '''Strips thousands separators from a bigWigInfo number string.'''
    return intNumber.replace(',', '')

# -----------------------------------------------------------------------------

def convertListToDict(prop):
    '''
    Converts "key: value" lines into a dict.  The last list element is
    skipped on purpose: splitting the bigWigInfo output on newlines leaves
    a trailing empty string.
    '''
    d = {}
    for line in prop[:-1]:
        # BUGFIX: split on the first ':' only so values that themselves
        # contain colons are kept intact.
        key, value = line.split(':', 1)
        d[key.strip()] = value.strip()
    return d

# -----------------------------------------------------------------------------

def process(transaction):
    '''
    Registers an incoming bigWig folder as a BIGWIGGLE data set, extracts
    bigWigInfo metadata as properties and links the data set to the flow
    lane sample derived from the folder name.
    '''
    incomingPath = transaction.getIncoming().getPath()
    name = transaction.getIncoming().getName()

    # Create a data set and set type.
    dataSet = transaction.createNewDataSet("BIGWIGGLE")
    dataSet.setMeasuredData(False)

    # expected:
    # Project_110907_SN792_0059_AC012FACXX_3/BSSE-DSU-1662_CGATGTA_L003_R1_001_sorted.bw
    split = name.split('_')
    incoming_sample = None
    if len(split) == 6:
        incoming_sample = split[1] + '_' + split[2] + '_' + split[3] + '_' + split[4] + ':' + split[-1]
    if len(split) == 4:
        incoming_sample = split[1] + '_' + split[2] + ':' + split[-1]

    # BUGFIX: collect matches in a local list -- the old module-level list
    # grew across drop-box invocations, so matches[0] could point at a file
    # from a previous run.
    bwFiles = []
    for root, dirnames, filenames in os.walk(FOLDER + name):
        for filename in fnmatch.filter(filenames, BW_PATTERN):
            bwFiles.append(os.path.join(root, filename))

    # Run bigWigInfo on the first bigWig file and parse its output.
    cmdResult = os.popen(BIGWIGINFO + bwFiles[0]).read()
    dictProp = convertListToDict(cmdResult.split("\n"))

    dataSet.setPropertyValue("VERSION", dictProp['version'])
    dataSet.setPropertyValue("ISCOMPRESSED", str(translateBoolean(dictProp['isCompressed'])))
    dataSet.setPropertyValue("ISSWAPPED", dictProp['isSwapped'])
    dataSet.setPropertyValue("PRIMARYDATASIZE", sanitizeInt(dictProp['primaryDataSize']))
    dataSet.setPropertyValue("PRIMARYINDEXSIZE", sanitizeInt(dictProp['primaryIndexSize']))
    dataSet.setPropertyValue("ZOOMLEVELS", dictProp['zoomLevels'])
    dataSet.setPropertyValue("CHROMCOUNT", dictProp['chromCount'])
    dataSet.setPropertyValue("BASESCOVERED", sanitizeInt(dictProp['basesCovered']))
    dataSet.setPropertyValue("MEAN", dictProp['mean'])
    dataSet.setPropertyValue("MIN", dictProp['min'])
    dataSet.setPropertyValue("MAX", dictProp['max'])
    dataSet.setPropertyValue("STD", dictProp['std'])

    # Add the incoming file into the data set.
    transaction.moveFile(incomingPath, dataSet)

    # Search for the sample derived from the folder name.
    search_service = transaction.getSearchService()
    sc = SearchCriteria()
    sc.addMatchClause(SearchCriteria.MatchClause.createAttributeMatch(
        SearchCriteria.MatchClauseAttribute.CODE, incoming_sample))
    foundSamples = search_service.searchForSamples(sc)

    if foundSamples.size() > 0:
        dataSet.setSample(foundSamples[0])

        # Search for parent data sets (FASTQ_GZ) of the same sample.
        dataSetSc = SearchCriteria()
        dataSetSc.addMatchClause(SearchCriteria.MatchClause.createAttributeMatch(
            SearchCriteria.MatchClauseAttribute.TYPE, 'FASTQ_GZ'))
        dataSetSc.addSubCriteria(SearchSubCriteria.createSampleCriteria(sc))
        foundDataSets = search_service.searchForDataSets(dataSetSc)
        if foundDataSets.size() > 0:
            dataSet.setParentDatasets([ds.getDataSetCode() for ds in foundDataSets])
FOLDER = '/links/shared/dsu/dss/register-bowtie/'
SAMTOOLS = '/links/application/dsu/samtools/samtools'
BAM_PATTERN = '*.bam'

# BAM header record types whose lines we extract.
searchStrings = ['@PG']

# -----------------------------------------------------------------------------

def listSearch(myList, searchString):
    '''
    Returns the elements of `myList` whose beginning matches `searchString`
    (re.match anchors the pattern at the start of each string).
    '''
    return [element for element in myList if re.match(searchString, element)]

# -----------------------------------------------------------------------------

def programParameters(programList):
    '''
    Extracts the aligner details (ID, VN, CL, ...) from the @PG header
    lines of a BAM header.  Each line is tab-separated; the first field is
    the '@PG' tag itself, the rest are KEY:value pairs.
    '''
    elements = {}
    for line in programList:
        fields = line.split('\t')
        for field in fields[1:]:
            # BUGFIX: split on the first ':' only -- the CL (command line)
            # value regularly contains colons itself, which made the old
            # two-value unpacking raise a ValueError.
            key, value = field.split(':', 1)
            elements[key] = value
    return elements

# -----------------------------------------------------------------------------

def process(transaction):
    '''
    Registers an incoming bowtie BAM folder as an ALIGNMENT data set,
    extracts aligner metadata from the BAM header and samtools flagstat,
    and links the data set to the matching flow lane sample.
    '''
    incomingPath = transaction.getIncoming().getName()

    dataSet = transaction.createNewDataSet("ALIGNMENT")
    dataSet.setMeasuredData(False)

    # expected:
    # Project_110907_SN792_0059_AC012FACXX_3/Sample_BSSE-DSU-1662/..._sorted.bam
    split = incomingPath.split('_')
    incoming_sample = None
    if len(split) == 6:
        incoming_sample = split[1] + '_' + split[2] + '_' + split[3] + '_' + split[4] + ':' + split[-1]
    if len(split) == 4:
        incoming_sample = split[1] + '_' + split[2] + ':' + split[-1]

    # BUGFIX: collect BAM paths in a local list -- the old module-level list
    # grew across drop-box invocations.
    bamFiles = []
    for root, dirnames, filenames in os.walk(FOLDER + incomingPath):
        for filename in fnmatch.filter(filenames, BAM_PATTERN):
            bamFiles.append(os.path.join(root, filename))

    # Extract values from the BAM header.
    # Command: samtools view -H <first bam>
    arguments = SAMTOOLS + ' view -H ' + bamFiles[0]
    print('Arguments: ' + arguments)
    cmdResult = os.popen(arguments).read()

    properties = cmdResult.split("\n")
    programList = []
    for searchString in searchStrings:
        programList = listSearch(properties, searchString)
    print(programList)

    e = programParameters(programList)

    dataSet.setPropertyValue("ALIGNMENT_SOFTWARE", e['ID'])
    dataSet.setPropertyValue("VERSION", e['VN'])
    dataSet.setPropertyValue("ISSUED_COMMAND", e['CL'])

    # Read counts from samtools flagstat (line 0: total, line 2: mapped).
    arguments = SAMTOOLS + ' flagstat ' + bamFiles[0]
    cmdResult = os.popen(arguments).read()
    flagstatLines = cmdResult.split('\n')
    totalReads = flagstatLines[0].split(' ')[0]
    mappedReads = flagstatLines[2].split(' ')[0]

    dataSet.setPropertyValue("SAMTOOLS_FLAGSTAT", cmdResult)
    dataSet.setPropertyValue("TOTAL_READS", totalReads)
    dataSet.setPropertyValue("MAPPED_READS", mappedReads)

    # Add the incoming file into the data set.
    transaction.moveFile(incomingPath, dataSet)

    # Search for the sample derived from the folder name.
    search_service = transaction.getSearchService()
    sc = SearchCriteria()
    sc.addMatchClause(SearchCriteria.MatchClause.createAttributeMatch(
        SearchCriteria.MatchClauseAttribute.CODE, incoming_sample))
    foundSamples = search_service.searchForSamples(sc)

    if foundSamples.size() > 0:
        dataSet.setSample(foundSamples[0])

    # Search for parent data sets (FASTQ_GZ) of the same sample.
    dataSetSc = SearchCriteria()
    dataSetSc.addMatchClause(SearchCriteria.MatchClause.createAttributeMatch(
        SearchCriteria.MatchClauseAttribute.TYPE, 'FASTQ_GZ'))
    dataSetSc.addSubCriteria(SearchSubCriteria.createSampleCriteria(sc))
    foundDataSets = search_service.searchForDataSets(dataSetSc)
    if foundDataSets.size() > 0:
        dataSet.setParentDatasets([ds.getDataSetCode() for ds in foundDataSets])
def renameFiles(dir, flowcellName):
    '''
    Prefixes every file found below `dir` with "<flowcellName>_".
    '''
    print(dir)
    print(flowcellName)
    for root, _subdirs, names in os.walk(dir):
        for oldName in names:
            print(root + oldName)
            sourcePath = root + '/' + oldName
            targetPath = root + "/" + flowcellName + "_" + oldName
            os.rename(sourcePath, targetPath)


def process(transaction):
    '''
    Attaches the fastqc sub-folder of the incoming directory as a FASTQC
    data set to the flow lane sample matching the incoming folder name
    (expected: Project_120427_SN792_0110_AD0YCGACXX_1).
    '''
    incomingPath = transaction.getIncoming().getPath()
    incomingFolder = transaction.getIncoming().getName()

    # Strip the leading "Project" prefix, then split off the trailing lane number.
    fcAndLane = incomingFolder.split("_", 1)[-1]
    flowCell, flowLane = fcAndLane.rsplit("_", 1)
    print(flowLane)

    searchService = transaction.getSearchService()

    # Search for the flow cell sample.
    flowCellCriteria = SearchCriteria()
    flowCellCriteria.addMatchClause(SearchCriteria.MatchClause.createAttributeMatch(
        SearchCriteria.MatchClauseAttribute.CODE, flowCell))
    foundSamples = searchService.searchForSamples(flowCellCriteria)

    if foundSamples.size() > 0:
        # Fetch the contained lane samples of the flow cell.
        containedCriteria = SearchCriteria()
        containedCriteria.addSubCriteria(
            SearchSubCriteria.createSampleContainerCriteria(flowCellCriteria))
        foundContainedSamples = searchService.searchForSamples(containedCriteria)
        if foundContainedSamples.size() > 0:
            dataSet = transaction.createNewDataSet("FASTQC")
            dataSet.setMeasuredData(False)
            for laneSample in foundContainedSamples:
                lane = laneSample.getCode().split(':')[-1]
                print(lane)
                if flowLane == lane:
                    dataSet.setSample(laneSample)
                    # Add the incoming file into the data set.
                    transaction.moveFile(incomingPath + "/fastqc/", dataSet)
                    break
RUNINFO_FOLDER = '/links/shared/dsu/dss/register-runstatistics/'
REGEX_RUNINFO_SAMPLE = '/Data/Status*'
REGEX_RUNINFO_REPORTS = '/Data/reports'
MARKER_STRING = '.MARKER_is_finished_'

def touch_markerfile(filename):
    '''
    Creates an empty marker file (best effort -- failures are only logged,
    never raised, so a failed touch does not abort the registration).
    '''
    try:
        open(filename, 'w').close()
    except IOError:
        # BUGFIX: catch only I/O errors instead of a bare except that would
        # also swallow e.g. KeyboardInterrupt.
        print('Could not touch ' + filename)


def process(transaction):
    '''
    Registers an incoming HiSeq/GAII run folder as a data set, forwards the
    run-info files into the run-statistics drop box and attaches the data
    set to the flow cell sample matching the folder name.
    Expected names: 110715_SN792_0054_BC035RACXX (HiSeq),
    110812_6353WAAXX (GAII).
    '''
    incomingPath = transaction.getIncoming().getPath()
    name = transaction.getIncoming().getName()

    # The number of '_'-separated tokens distinguishes HiSeq from GAII runs.
    split = name.split("_")
    dataSet = None
    if len(split) == 4:
        dataSet = transaction.createNewDataSet("ILLUMINA_HISEQ_OUTPUT")
    if len(split) == 2:
        dataSet = transaction.createNewDataSet("ILLUMINA_GA_OUTPUT")
    # BUGFIX: fail fast with a clear message -- previously an unexpected
    # folder name left dataSet undefined and raised a NameError below.
    if dataSet is None:
        raise RuntimeError('Unexpected incoming folder name: ' + name)

    # Move RunInfo into a different drop box.
    runInfoSample = glob.glob(incomingPath + REGEX_RUNINFO_SAMPLE)
    runInfoReport = glob.glob(incomingPath + REGEX_RUNINFO_REPORTS)
    runInfoList = runInfoSample + runInfoReport
    os.makedirs(RUNINFO_FOLDER + name + '/Data/')
    for runInfo in runInfoList:
        try:
            if os.path.isdir(runInfo):
                shutil.copytree(runInfo, RUNINFO_FOLDER + name + '/Data/' + os.path.basename(runInfo))
            else:
                shutil.copy2(runInfo, RUNINFO_FOLDER + name + '/Data/')
        except (IOError, os.error) as why:
            # Best effort: log the failed copy and keep going.
            print(runInfo, RUNINFO_FOLDER + name, str(why))

    touch_markerfile(RUNINFO_FOLDER + MARKER_STRING + name)

    dataSet.setMeasuredData(False)

    # Get the search service.
    search_service = transaction.getSearchService()

    # Search for the flow cell sample.
    sc = SearchCriteria()
    sc.addMatchClause(SearchCriteria.MatchClause.createAttributeMatch(
        SearchCriteria.MatchClauseAttribute.CODE, name))
    foundSamples = search_service.searchForSamples(sc)

    if foundSamples.size() > 0:
        # Add the incoming file into the data set.
        transaction.moveFile(incomingPath, dataSet)
        dataSet.setSample(foundSamples[0])
+incoming-dir = ${root}/dss/register-flowcell-miseq +incoming-data-completeness-condition = marker-file +top-level-data-set-handler = ch.systemsx.cisd.etlserver.registrator.api.v2.JythonTopLevelDataSetHandlerV2 +script-path = register-flowcell-miseq.py +storage-processor = ch.systemsx.cisd.etlserver.DefaultStorageProcessor + diff --git a/deep_sequencing_unit/source/core-plugins/illumina-ngs/1/dss/drop-boxes/register-flowcell-miseq/register-flowcell-miseq.py b/deep_sequencing_unit/source/core-plugins/illumina-ngs/1/dss/drop-boxes/register-flowcell-miseq/register-flowcell-miseq.py new file mode 100644 index 0000000000000000000000000000000000000000..c9321e6c87eb162eeee01ea5a05a815d8563bf18 --- /dev/null +++ b/deep_sequencing_unit/source/core-plugins/illumina-ngs/1/dss/drop-boxes/register-flowcell-miseq/register-flowcell-miseq.py @@ -0,0 +1,60 @@ +''' +@copyright: +2012 ETH Zuerich, CISD + +@license: +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +@description: +Registers an incoming directory as a data set in openBIS. The name of the directory is used to +search for the matching sample. 
def process(transaction):
    '''
    Registers the incoming MiSeq run folder (e.g.
    120726_M00721_0011_A000000000-A1FVF) as an ILLUMINA_MISEQ_OUTPUT data set
    and attaches it to the flow-cell sample whose code equals the folder name.
    '''
    incomingPath = transaction.getIncoming().getAbsolutePath()
    flowCellId = transaction.getIncoming().getName()
    dataSet = transaction.createNewDataSet(DATASET_TYPE_MISEQ)

    # Create a data set and set type
    dataSet.setMeasuredData(False)

    # Get the search service
    search_service = transaction.getSearchService()

    # Search for the flow cell sample by code
    sc = SearchCriteria()
    sc.addMatchClause(SearchCriteria.MatchClause.createAttributeMatch(SearchCriteria.MatchClauseAttribute.CODE, flowCellId))
    foundFlowCells = search_service.searchForSamples(sc)

    # BUGFIX: the original only printed a warning when the search did not
    # return exactly one flow cell (and the assert vanishes under -O); with
    # zero hits it then crashed with an opaque IndexError below.
    if foundFlowCells.size() == 0:
        raise Exception('No flow cell found which matches the criteria: ' + flowCellId)
    if foundFlowCells.size() > 1:
        # keep the original behaviour for duplicates: warn and use the first
        print (str(foundFlowCells.size()) + ' flow cells found which match the criterias: '+ flowCellId)

    print (foundFlowCells)
    # Add the incoming file into the data set
    transaction.moveFile(incomingPath, dataSet)
    dataSet.setSample(foundFlowCells[0])
+incoming-dir = ${root}/dss/register-lane-hiseq +incoming-data-completeness-condition = marker-file +top-level-data-set-handler = ch.systemsx.cisd.etlserver.registrator.api.v2.JythonTopLevelDataSetHandlerV2 +script-path = register-lane-hiseq.py +storage-processor = ch.systemsx.cisd.etlserver.DefaultStorageProcessor + diff --git a/deep_sequencing_unit/source/core-plugins/illumina-ngs/1/dss/drop-boxes/register-lane-hiseq/register-lane-hiseq.py b/deep_sequencing_unit/source/core-plugins/illumina-ngs/1/dss/drop-boxes/register-lane-hiseq/register-lane-hiseq.py new file mode 100755 index 0000000000000000000000000000000000000000..8279053de36ebaeab370e68f70a0352d3dfed7de --- /dev/null +++ b/deep_sequencing_unit/source/core-plugins/illumina-ngs/1/dss/drop-boxes/register-lane-hiseq/register-lane-hiseq.py @@ -0,0 +1,279 @@ +''' +Processes each flow lane of a Sequencing run + +Expects as incoming folder: +Project_<Flow Cell>_<Lane> +e.g.Project_110715_SN792_0054_BC035RACXX_1 or Project_110816_6354LAAXX_1 + +Note: +print statements go to: ~openbis/sprint/datastore_server/log/startup_log.txt +''' + +import os +import fnmatch +import time +import shutil +from time import * +from datetime import * +from ch.systemsx.cisd.openbis.generic.shared.api.v1.dto import SearchCriteria + +FASTQ_GZ_PATTERN = "*.fastq.gz" +METADATA_FILE_SUFFIX = "_metadata.tsv" +AFFILIATION= {'FMI': '/links/shared/dsu/dss/customers/fmi/drop-box/','BIOCENTER_BASEL': '/links/shared/dsu/dss/customers/biozentrum/drop-box/'} +AFFILIATION_PROPERTY_NAME='AFFILIATION' +INDEX1='BARCODE' +INDEX2='INDEX2' +EXTERNAL_SAMPLE_NAME='EXTERNAL_SAMPLE_NAME' + +DEFAULT_INDEX='NoIndex' + +# ------------------------------------------------------------------------------- + +def getFileNames(path): + ''' + Gets all files matching a PATTERN in a path recursively + and returns the result as a list + ''' + matches = [] + for root, dirnames, filenames in os.walk(path): + for filename in fnmatch.filter(filenames, FASTQ_GZ_PATTERN): + 
def writeMetadataFile (fileName, parentPropertyTypes, parentPropertiesMap, fcMetaDataDict, fcMetaDataList):
    '''
    Writes a tab-separated metadata file describing one sample.

    fileName -- path of the file to create
    parentPropertyTypes -- property-type codes of the parent sample, in output order
    parentPropertiesMap -- maps each code to a property object exposing tryGetAsString()
    fcMetaDataDict / fcMetaDataList -- same pair for the flow-cell properties

    BUGFIX: the original closed 'metaDataFile' in a finally block; when
    open() itself failed, the name was never bound and the finally raised a
    NameError that masked the real IOError. The file is now only closed when
    it was actually opened. codecs.open handles the UTF-8 encoding that the
    original applied per write via .encode('utf-8').
    '''
    import codecs  # local import keeps the fix self-contained in this drop-box script
    try:
        metaDataFile = codecs.open(fileName, 'w', encoding='utf-8')
    except IOError:
        print ('File error, could not write '+ fileName)
        return
    try:
        for propertyType in parentPropertyTypes:
            metaDataFile.write(propertyType + "\t" +
                parentPropertiesMap[propertyType].tryGetAsString() + "\n")

        metaDataFile.write("\nFLOWCELL PROPERTIES\n")

        for fcMetaData in fcMetaDataList:
            metaDataFile.write(fcMetaData + "\t" +
                fcMetaDataDict[fcMetaData].tryGetAsString() + "\n")
    finally:
        metaDataFile.close()
def process(transaction):
    '''
    Registers each sample folder of one HiSeq/GAII flow lane as a FASTQ_GZ
    data set. Expected incoming folder name:
    Project_<FlowCell>_<Lane>, e.g. Project_110715_SN792_0054_BC035RACXX_1
    (6 parts, HiSeq) or Project_120112_63537AAXX_1 (4 parts, GAII).
    For the folder matching a parent sample, a metadata file is written into
    the data set and an extra copy is sent to the affiliation's drop box.
    '''

    # useful for debugging:
    print(datetime.now())

    incomingPath = transaction.getIncoming().getAbsolutePath()
    name = transaction.getIncoming().getName()

    folders=[]
    folders=os.listdir(incomingPath)

    # expected incoming Name, e.g.: Project_110715_SN792_0054_BC035RACXX_1
    split=name.split("_")
    if (len(split) == 6):
        runningDate = split[1]
        sequencerId = split[2]
        sequentialNumber = split[3]
        hiseqTray = split[4][0]
        flowCellId = split[4][1:]
        flowLane = split[-1]
        # flow-lane sample code, e.g. 110715_SN792_0054_BC035RACXX:1
        incoming_sample=runningDate+ '_'+ sequencerId + '_' + sequentialNumber + '_' + hiseqTray + flowCellId + ':' + flowLane
    # expected Project_120112_63537AAXX_1
    if (len(split) ==4):
        runningDate = split[1]
        flowCellId = split[2]
        flowLane = split[-1]
        incoming_sample=runningDate+ '_'+ flowCellId + ':' + flowLane
    # NOTE(review): if the name has neither 6 nor 4 parts, 'incoming_sample'
    # stays unbound and the print below raises NameError — TODO confirm all
    # incoming names match one of the two shapes.

    # Get the search service
    search_service = transaction.getSearchService()

    # Search for the incoming_sample which is a Flow Lane
    sc = SearchCriteria()
    print('Processing sample: '+ str(incoming_sample))
    sc.addMatchClause(SearchCriteria.MatchClause.createAttributeMatch(SearchCriteria.MatchClauseAttribute.CODE, incoming_sample));
    foundSamples = search_service.searchForSamples(sc)

    # there should be only one sample because it is unique within one Flow Cell
    if (len(foundSamples) > 1):
        raise Exception("More than one sample found! No unique code: " + incoming_sample)
    elif (len(foundSamples) == 0):
        raise Exception("No matching sample found for: " + incoming_sample)
    else :
        sample = foundSamples[0].getSample()
        parents = sample.getParents()

    foundParents = searchParents(search_service, parents)

    # -------------------------------------------------------------------------------

    # loop over each Sample folder within a lane
    for f in range(0,len(folders)):
        # Create a data set and set type
        dataSet = transaction.createNewDataSet("FASTQ_GZ")
        dataSet.setMeasuredData(False)
        dataSet.setPropertyValue(INDEX1, DEFAULT_INDEX)
        dataSet.setPropertyValue(INDEX2, DEFAULT_INDEX)
        # NOTE(review): 'dirName' is never used afterwards; the call is kept
        # for its side effect of creating the directory inside the data set.
        dirName = transaction.createNewDirectory(dataSet,folders[f])

        # if multiplexed samples then there is more than one folder
        pathPerLane = incomingPath + '/' + folders[f]
        print ("pathPerLane: " + pathPerLane)

        # get all properties of the parent samples
        for foundParent in foundParents:
            parent = foundParent.getSample()
            # ArrayList
            parentProperties = parent.getProperties()
            # just get the current code
            parentCode = parent.getCode()
            #print("Found parent code: "+ parentCode)

            # reformat Java ArrayList and Sort
            parentPropertyTypes = []
            parentPropertiesMap = {}
            for property in parentProperties:
                code = property.getPropertyType().getSimpleCode()
                parentPropertyTypes.append(code)
                parentPropertiesMap[code] = property
            try:
                barcode = parentPropertiesMap[INDEX1].tryGetAsString()
                if barcode == "NOINDEX":
                    barcode = DEFAULT_INDEX
                else:
                    # NOTE(review): this expression's result is discarded —
                    # presumably it was meant to be assigned back to
                    # 'barcode'; verify the intended trimming behaviour.
                    barcode.split()[-1][:-1]
            except:
                # NOTE(review): bare except — any missing/odd property falls
                # back to the default index silently.
                barcode = DEFAULT_INDEX

            try:
                index2 = parentPropertiesMap[INDEX2].tryGetAsString()
                if index2 == "NOINDEX":
                    index2 = DEFAULT_INDEX
                else:
                    # NOTE(review): result discarded, same as for 'barcode'.
                    index2.split()[-1][:-1]
            except:
                index2 = DEFAULT_INDEX

            # just use the first six nucleotides for the naming
            completeBarcode=barcode + "-" + index2

            parentPropertyTypes.sort()
            # BSSE-1754_C0364ACXX_CTTGTAA-AACC_L007_R1_001.fastq.gz
            # BSSE_QGF_10002_121130_SN792_0189_AD1FBTACXX_1_NoIndex_L001_R1_001.fastq.gz
            nameOfFile = parentCode.replace('-','_') + "_" + incoming_sample.replace(':', '_') + "_" + completeBarcode + "_L00" + flowLane +METADATA_FILE_SUFFIX

            print folders[f]
            # folder names look like <prefix>_<a>_<b>_<c>_...; parts 1..3
            # joined with '-' are compared against the parent sample code
            folderSplit = '-'.join(folders[f].split('_')[1:4])
            print "Folder split: " + folderSplit
            print str(parentCode)

            if (parentCode == folderSplit):
                dataSet.setPropertyValue(INDEX1, barcode)
                dataSet.setPropertyValue(INDEX2, index2)
                print parentPropertiesMap[EXTERNAL_SAMPLE_NAME].tryGetAsString()
                dataSet.setPropertyValue(EXTERNAL_SAMPLE_NAME, parentPropertiesMap[EXTERNAL_SAMPLE_NAME].tryGetAsString())
                print("Creating metadata file:" + nameOfFile)
                # get a file from the IDataSetRegistrationTransaction so it is automatically part of the data set
                pathToFile = transaction.createNewFile(dataSet, folders[f], nameOfFile)

                fcMetaDataDict, fcMetaDataList = getFlowCellMetaData(transaction, incoming_sample.split(":")[0])
                writeMetadataFile(pathToFile, parentPropertyTypes, parentPropertiesMap, fcMetaDataDict, fcMetaDataList)

                affiliation_name = parentPropertiesMap[AFFILIATION_PROPERTY_NAME].tryGetAsString()
                extraCopy (affiliation_name, pathToFile)

        # get all fastqs in this dataSet
        fastqFileList=getFileNames(pathPerLane)

        # put the files into the dataSet
        # NOTE(review): 'parentPropertiesMap' here is whatever the last loop
        # iteration left behind — TODO confirm the last parent is the right
        # source for the affiliation of this folder.
        affiliation_name = parentPropertiesMap[AFFILIATION_PROPERTY_NAME].tryGetAsString()
        for file in fastqFileList:
            extraCopy (affiliation_name, file)
            # finally add the files to the data set
            transaction.moveFile(file , dataSet, folders[f])

    # stamp the transfer time on the flow-lane sample
    if foundSamples.size() > 0:
        sa = transaction.getSampleForUpdate(foundSamples[0].getSampleIdentifier())
        sa.setPropertyValue("DATA_TRANSFERRED", create_openbis_timestamp())
        dataSet.setSample(foundSamples[0])

    shutil.rmtree(incomingPath)
+incoming-dir = ${root}/dss/register-lane-miseq +incoming-data-completeness-condition = marker-file +top-level-data-set-handler = ch.systemsx.cisd.etlserver.registrator.api.v2.JythonTopLevelDataSetHandlerV2 +script-path = register-lane-miseq.py +storage-processor = ch.systemsx.cisd.etlserver.DefaultStorageProcessor + diff --git a/deep_sequencing_unit/source/core-plugins/illumina-ngs/1/dss/drop-boxes/register-lane-miseq/register-lane-miseq.py b/deep_sequencing_unit/source/core-plugins/illumina-ngs/1/dss/drop-boxes/register-lane-miseq/register-lane-miseq.py new file mode 100755 index 0000000000000000000000000000000000000000..5174af1295cd0755705b5ade12fa7e9629416b9d --- /dev/null +++ b/deep_sequencing_unit/source/core-plugins/illumina-ngs/1/dss/drop-boxes/register-lane-miseq/register-lane-miseq.py @@ -0,0 +1,203 @@ +''' +Processes each flow lane of a Sequencing run + +Note: +print statements go to: ~openbis/sprint/datastore_server/log/startup_log.txt +''' + +import os +import fnmatch +import time +import shutil +from time import * +from datetime import * +from itertools import islice, chain +from ch.systemsx.cisd.openbis.generic.shared.api.v1.dto import SearchCriteria + +FASTQ_GZ_PATTERN = "*.fastq.gz" +METADATA_FILE_SUFFIX = "_metadata.tsv" + +# ------------------------------------------------------------------------------- + +def getFileNames(path): + ''' + Gets all files matching a PATTERN in a path recursively + and returns the result as a list + ''' + matches = [] + for root, dirnames, filenames in os.walk(path): + for filename in fnmatch.filter(filenames, FASTQ_GZ_PATTERN): + matches.append(os.path.join(root, filename)) + matches.sort() + return(matches) + +def writeMetadataFile (fileName, parentPropertyTypes, parentPropertiesMap, fcMetaDataDict, fcMetaDataList): + ''' + Writes a file of meta date related to one sample + ''' + try: + metaDataFile = open(fileName,'w') + for propertyType in parentPropertyTypes: + metaDataFile.write(propertyType.encode('utf-8') + 
def extraCopy (affiliationName, path):
    '''
    Handles the extra copies of the data for transfer with datamover via the
    bc2 network to the FMI and BIOCENTER.
    For the BIOCENTER a dated folder is created which collects all data.

    BUGFIX: the body referred to 'affiliation_name' while the parameter is
    named 'affiliationName', so every call raised NameError. All uses now
    read the actual parameter.

    NOTE(review): 'AFFILIATION' (code -> drop-box path mapping) is not
    defined in this file — TODO confirm it is provided at module scope, as
    in the register-lane-hiseq drop box.
    '''
    if (affiliationName in AFFILIATION):
        if (affiliationName == 'BIOCENTER_BASEL'):
            # one dated folder per day collects all BIOCENTER data
            dirname = AFFILIATION[affiliationName] + datetime.now().strftime("%Y-%m-%d")
            if not os.path.exists(dirname):
                os.mkdir(dirname)
            shutil.copy(path, dirname)
        else:
            shutil.copy(path, AFFILIATION[affiliationName])
def process(transaction):
    '''
    Registers the fastq.gz files of one MiSeq run as FASTQ_GZ data sets of
    flow lane <incoming name>:1. Files that share the same prefix up to the
    last two '_'-separated fields (e.g. paired-end R1/R2) are moved into a
    common 'fastq' subfolder and registered as a single data set.
    '''

    # useful for debugging:
    print(datetime.now())

    incomingPath = transaction.getIncoming().getAbsolutePath()

    # Get the incoming name
    name = transaction.getIncoming().getName()
    # a MiSeq flow cell always has exactly one lane
    miSeqLane = name + ':1'

    # Get the search service
    search_service = transaction.getSearchService()

    lane = searchForLane(miSeqLane, search_service)

    # get all fastqs in this dataSet
    fastqFileList=getFileNames(incomingPath)
    fastqFileList.sort()

    # consume the sorted file list from the front until it is empty
    while fastqFileList:

        # Create a data set and set type
        dataSet = transaction.createNewDataSet("FASTQ_GZ")
        dataSet.setMeasuredData(False)
        if len(fastqFileList) == 1:
            transaction.moveFile(fastqFileList.pop(0) , dataSet)
        else:
            try:
                currentFullPath , currentFilename = os.path.split(fastqFileList[0])
                nextFullPath , nextFilename = os.path.split(fastqFileList[1])

                newpath = currentFullPath + "/fastq"
                if not os.path.exists(newpath):
                    os.makedirs(newpath)

                # same prefix up to the last two '_' fields -> treat the two
                # files as one pair and register them together
                if currentFilename.rsplit('_',2)[0] == nextFilename.rsplit('_',2)[0]:
                    shutil.move(fastqFileList.pop(0), newpath)
                    shutil.move(fastqFileList.pop(0), newpath)
                    transaction.moveFile(newpath , dataSet)
                else:
                    transaction.moveFile(fastqFileList.pop(0) , dataSet)
            except:
                # NOTE(review): bare 'except: pass' — if the try block fails
                # before any pop(0), the list never shrinks and this while
                # loop spins forever; it also hides real move errors. TODO
                # narrow the exception and ensure progress on failure.
                pass
        dataSet.setSample(lane)

    # stamp the transfer time on the flow-lane sample
    sa = transaction.getSampleForUpdate(lane.getSampleIdentifier())
    sa.setPropertyValue("DATA_TRANSFERRED", create_openbis_timestamp())
def process(transaction):
    '''
    Registers an incoming MACS peak-calling result folder as a MACS_OUTPUT
    data set and attaches it to the flow-lane sample derived from the
    incoming folder name (<...>_<FlowCell parts>_<Lane>).
    '''
    # Create a data set and set type
    dataSet = transaction.createNewDataSet("MACS_OUTPUT")
    dataSet.setPropertyValue("MACS_VERSION", "1.4.0RC2")
    dataSet.setMeasuredData(False)

    # BUGFIX: the original read the undefined global name 'incoming'
    # (NameError on every invocation); in the V2 drop-box API the incoming
    # item is obtained from the transaction.
    incomingPath = transaction.getIncoming().getAbsolutePath()

    # Add the incoming file into the data set
    transaction.moveFile(incomingPath, dataSet)

    # rsplit not in Python 2.2
    #sampleName = ":".join(incomingPath.split("/")[-1].rsplit("_",1))
    # folder name fields 2..-2 form the flow cell code, the last field the lane
    flowCell = "_".join((incomingPath.split("/")[-1].split("_")[2:-1]))
    lane = (incomingPath.split("/")[-1]).split("_")[-1]
    sampleName = flowCell + ":" + lane

    # Set the owner of the data set -- the specified sample
    sample = transaction.getSample("/BSSE_BEISEL/" + sampleName)

    dataSet.setSample(sample)
b/deep_sequencing_unit/source/core-plugins/illumina-ngs/1/dss/drop-boxes/register-runstatistics/plugin.properties @@ -0,0 +1,11 @@ +# +# Drop box for registering a flow cell output as a data set +# +# Variables: +# incoming-root-dir +# Path to the directory which contains incoming directories for drop boxes. +incoming-dir = ${root}/dss/register-runstatistics +incoming-data-completeness-condition = marker-file +top-level-data-set-handler = ch.systemsx.cisd.etlserver.registrator.api.v2.JythonTopLevelDataSetHandlerV2 +script-path = register-runstatistics.py +storage-processor = ch.systemsx.cisd.etlserver.DefaultStorageProcessor diff --git a/deep_sequencing_unit/source/core-plugins/illumina-ngs/1/dss/drop-boxes/register-runstatistics/register-runstatistics.py b/deep_sequencing_unit/source/core-plugins/illumina-ngs/1/dss/drop-boxes/register-runstatistics/register-runstatistics.py new file mode 100644 index 0000000000000000000000000000000000000000..d7b8b8400d33aa61b42ecf6cee6b7fa3c23d5165 --- /dev/null +++ b/deep_sequencing_unit/source/core-plugins/illumina-ngs/1/dss/drop-boxes/register-runstatistics/register-runstatistics.py @@ -0,0 +1,73 @@ +''' +@copyright: +2012 ETH Zuerich, CISD + +@license: +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +@description: +Registers an incoming directory as a data set in openBIS. The name of the directory is used to +search for the matching sample. 
def process(transaction):
    '''
    Registers the incoming run-statistics folder as one RUNINFO data set per
    flow lane: the flow cell sample is looked up by the incoming folder name,
    its contained (lane) samples are searched, and a copy of the incoming
    folder is attached to each of them.
    '''

    incomingPath = transaction.getIncoming().getPath()
    incomingFolder = transaction.getIncoming().getName()

    folders=[]
    folders=os.listdir(incomingPath)

    # 4 name parts -> HiSeq run, 2 parts -> GAII run
    # NOTE(review): these IS_HISEQ_RUN assignments are local and never read
    # afterwards — dead code kept as-is; TODO confirm it can be removed.
    split=incomingFolder.split("_")
    if (len(split) == 4):
        IS_HISEQ_RUN=True
    if (len(split) == 2):
        IS_HISEQ_RUN=False

    # Get the search service
    search_service = transaction.getSearchService()

    # Search for the flow cell sample matching the incoming folder name
    sc = SearchCriteria()
    sc.addMatchClause(SearchCriteria.MatchClause.createAttributeMatch(SearchCriteria.MatchClauseAttribute.CODE, incomingFolder));
    foundSamples = search_service.searchForSamples(sc)

    if foundSamples.size() > 0:

        # Search for the samples contained in the flow cell (the flow lanes)
        sampleSc = SearchCriteria()
        sampleSc.addSubCriteria(SearchSubCriteria.createSampleContainerCriteria(sc))
        foundContainedSamples = search_service.searchForSamples(sampleSc)
        if foundContainedSamples.size() > 0:
            for fcs in range(0,foundContainedSamples.size()):
                dataSet = transaction.createNewDataSet("RUNINFO")
                dataSet.setMeasuredData(False)
                dataSet.setSample(foundContainedSamples[fcs])
                # each lane gets its own physical copy of the incoming
                # folder, because moveFile consumes the path it is given
                shutil.copytree(incomingPath, incomingPath + "_" + str(fcs+1))
                transaction.moveFile(incomingPath + "_" + str(fcs+1), dataSet)

    # the original incoming folder was only the template for the copies
    shutil.rmtree(incomingPath)
b/deep_sequencing_unit/source/core-plugins/illumina-ngs/1/dss/drop-boxes/register-thumbnails/plugin.properties new file mode 100644 index 0000000000000000000000000000000000000000..5d976b5624c020dfe94621ef297aa1db7a883910 --- /dev/null +++ b/deep_sequencing_unit/source/core-plugins/illumina-ngs/1/dss/drop-boxes/register-thumbnails/plugin.properties @@ -0,0 +1,12 @@ +# +# Drop box for registering a flow cell output as a data set +# +# Variables: +# incoming-root-dir +# Path to the directory which contains incoming directories for drop boxes. +incoming-dir = ${root}/dss/register-thumbnails +#incoming-dir = /links/sonas/cisd/store/incoming-flowCell +incoming-data-completeness-condition = marker-file +top-level-data-set-handler = ch.systemsx.cisd.etlserver.registrator.api.v2.JythonTopLevelDataSetHandlerV2 +script-path = register-thumbnails.py +storage-processor = ch.systemsx.cisd.etlserver.DefaultStorageProcessor diff --git a/deep_sequencing_unit/source/core-plugins/illumina-ngs/1/dss/drop-boxes/register-thumbnails/register-thumbnails.py b/deep_sequencing_unit/source/core-plugins/illumina-ngs/1/dss/drop-boxes/register-thumbnails/register-thumbnails.py new file mode 100644 index 0000000000000000000000000000000000000000..60937ed4520f013555638718adc54297f27880eb --- /dev/null +++ b/deep_sequencing_unit/source/core-plugins/illumina-ngs/1/dss/drop-boxes/register-thumbnails/register-thumbnails.py @@ -0,0 +1,55 @@ +''' +@copyright: +2012 ETH Zuerich, CISD + +@license: +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
def process(transaction):
    '''
    Attaches the incoming thumbnail folder as a THUMBNAILS data set to the
    sample whose code equals the incoming folder name. When no matching
    sample exists, the data set is not stored and the folder stays put.
    '''
    incomingItem = transaction.getIncoming()
    sourcePath = incomingItem.getPath()
    sampleCode = incomingItem.getName()

    # new data set of type THUMBNAILS, derived (not measured) data
    thumbnailDataSet = transaction.createNewDataSet("THUMBNAILS")
    thumbnailDataSet.setMeasuredData(False)

    # look up the owning sample by its code
    searchService = transaction.getSearchService()
    criteria = SearchCriteria()
    criteria.addMatchClause(
        SearchCriteria.MatchClause.createAttributeMatch(
            SearchCriteria.MatchClauseAttribute.CODE, sampleCode))
    matches = searchService.searchForSamples(criteria)

    if matches.size() > 0:
        # hand the folder over to the data set and link it to the sample
        transaction.moveFile(sourcePath, thumbnailDataSet)
        thumbnailDataSet.setSample(matches[0])
+incoming-dir = ${root}/dss/register-unaligned +incoming-data-completeness-condition = marker-file +top-level-data-set-handler = ch.systemsx.cisd.etlserver.registrator.api.v2.JythonTopLevelDataSetHandlerV2 +script-path = register-unaligned.py +storage-processor = ch.systemsx.cisd.etlserver.DefaultStorageProcessor diff --git a/deep_sequencing_unit/source/core-plugins/illumina-ngs/1/dss/drop-boxes/register-unaligned/register-unaligned.py b/deep_sequencing_unit/source/core-plugins/illumina-ngs/1/dss/drop-boxes/register-unaligned/register-unaligned.py new file mode 100755 index 0000000000000000000000000000000000000000..8b82056105601367e3902ecd1510884b1c2e53a1 --- /dev/null +++ b/deep_sequencing_unit/source/core-plugins/illumina-ngs/1/dss/drop-boxes/register-unaligned/register-unaligned.py @@ -0,0 +1,179 @@ +''' +@copyright: +2012 ETH Zuerich, CISD + +@license: +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+ +@description: +Expects as incoming folder: <FlowCell>/Unaligned_no_mismatch + +@note: + +@author: +Manuel Kohler +''' + +import os +import glob +import shutil +import time +from ch.systemsx.cisd.openbis.generic.shared.api.v1.dto import SearchCriteria +from ch.systemsx.cisd.openbis.generic.shared.api.v1.dto import SearchSubCriteria + +BASECALL_STATS_FOLDER = 'Basecall_Stats_' +REGEX_FILES = '*.*' +REGEX_MAKEFILE = 'Make*' +REGEX_LANES='/P*' +REGEX_UNDETERMINED = '/U*/Sample*' +UNALIGNED_FOLDER='Unaligned_no_mismatch' +LANE_FOLDER='/links/shared/dsu/dss/register-lane-hiseq/' +MARKER_STRING='.MARKER_is_finished_' + +def touch_markerfile(filename): + try: + # do a touch + open(filename, 'w').close() + except: + print('Could not touch ' + filename) + +def searchForLaneParents(transaction, sampleCode): + + search_service = transaction.getSearchService() + sc = SearchCriteria() + sc.addMatchClause(SearchCriteria.MatchClause.createAttributeMatch(SearchCriteria.MatchClauseAttribute.CODE, sampleCode)) + foundSamples = search_service.searchForSamples(sc) + # there should be only one sample because it is unique within one Flow Cell + if (len(foundSamples) > 1): + raise Exception("More than one sample found! 
No unique code: " + sampleCode) + elif (len(foundSamples) == 0): + raise Exception("No matching sample found for: " + sampleCode) + else : + sample = foundSamples[0].getSample() + parents = sample.getParents() + + # search for the parents + sc = SearchCriteria() + # set the Search Criteria to an OR condition, default is AND + sc.setOperator(SearchCriteria.SearchOperator.MATCH_ANY_CLAUSES) + # Get the codes for all parents + for parent in parents: + parentSubCode = parent.getSubCode() + sc.addMatchClause(SearchCriteria.MatchClause.createAttributeMatch(SearchCriteria.MatchClauseAttribute.CODE, parentSubCode)); + # all parents of the flow lane + foundParents = search_service.searchForSamples(sc) + + parentCodeList = [] + + for foundParent in foundParents: + parent = foundParent.getSample() + # ArrayList + parentProperties = parent.getProperties() + # just get the current code + parentCode = parent.getCode() + parentCodeList.append(parentCode) + #print("Found parent code: "+ parentCode) + + return parentCodeList + + +def renameFiles(transaction, dir, flowcellName): + # Limit of Samples when the Sample Code is still used in the renaming. If Number of Samples is + # bigger than this number, we just write the number in the file name. 
+ + MAX_SAMPLES = 20 + FACILITY_CODE = "BSSE_QGF_" + + for root, dirs, files in os.walk(dir): + for file in files: + lane = file.split("_")[0][-1] + if lane.isdigit(): + sampleCode = flowcellName + ":" + lane + print sampleCode + parentCodeList = searchForLaneParents(transaction, sampleCode) + print parentCodeList + length = len(parentCodeList) + if length > MAX_SAMPLES: + # BSSE_QGF_96_samples_C1A8YACXX_Undetermined_L008_R1_001.fastq.gz + os.rename(root + '/' + file, root + "/" + FACILITY_CODE + str(length) + "_samples_" + flowcellName + "_" + file) + else: + # BSSE_QGF_10001_10002_10003_10004_C1A8YACXX_Undetermined_L008_R1_001.fastq.gz + SampleCodeString = "_".join([(e.split("-")[-1]) for e in parentCodeList]) + os.rename(root + '/' + file, root + "/" + FACILITY_CODE + SampleCodeString + "_" + flowcellName + "_" + file) + +def process(transaction): + + incomingPath = transaction.getIncoming().getPath() + name = transaction.getIncoming().getName() + + split=name.split("_") + if (len(split) == 4): + DSTYPE='ILLUMINA_HISEQ_OUTPUT' + flowcell=name.split("_")[-1][1:] + if (len(split) ==2): + DSTYPE='ILLUMINA_GA_OUTPUT' + flowcell=name.split("_")[-1] + + #move Lanes into a different drop box + laneList=glob.glob(incomingPath + '/'+ UNALIGNED_FOLDER + REGEX_LANES) + laneList.sort() + + undeterminedList=glob.glob(incomingPath + '/'+ UNALIGNED_FOLDER + REGEX_UNDETERMINED) + undeterminedList.sort() + + # add the Flow Cell Name to the Undetermined FASTQ files + [renameFiles(transaction, dir, name) for dir in undeterminedList] + + # Multiplexing: + # First move the Undetermined reads to the other ones + [shutil.move(undeterminedLane, laneList[int(undeterminedLane.split('/')[-1][-1])-1] +'/' + undeterminedLane.split('/')[-1]) for undeterminedLane in undeterminedList] + + [shutil.move(lane, LANE_FOLDER+lane.split('/')[-1]) for lane in laneList] + markerFileList = [touch_markerfile(LANE_FOLDER+MARKER_STRING+lane.split('/')[-1]) for lane in laneList] + + # Create a data set and 
set type + dataSet = transaction.createNewDataSet("BASECALL_STATS") + dataSet.setMeasuredData(False) + + # Build up a list of file which are part of the data set + fileList=glob.glob(incomingPath + '/'+ UNALIGNED_FOLDER + '/' + REGEX_FILES) + [fileList.append(i) for i in glob.glob(incomingPath + '/' + UNALIGNED_FOLDER +'/' +REGEX_MAKEFILE)] + + # Add the incoming file into the data set + transaction.createNewDirectory(dataSet, UNALIGNED_FOLDER) + # move all files is data set + [transaction.moveFile(file, dataSet, UNALIGNED_FOLDER) for file in fileList] + # move base call stat dir into data set + print("flowcell: "+flowcell) + transaction.moveFile(incomingPath + '/' + UNALIGNED_FOLDER + '/' + BASECALL_STATS_FOLDER + flowcell, dataSet, UNALIGNED_FOLDER + '/') + + # Get the search service + search_service = transaction.getSearchService() + + # Search for the sample + sc = SearchCriteria() + sc.addMatchClause(SearchCriteria.MatchClause.createAttributeMatch(SearchCriteria.MatchClauseAttribute.CODE, name)) + foundSamples = search_service.searchForSamples(sc) + + if foundSamples.size() > 0: + dataSet.setSample(foundSamples[0]) + + # Search for another data set of the same sample and make it a child of it + dataSetSc = SearchCriteria() + dataSetSc.addMatchClause(SearchCriteria.MatchClause.createAttributeMatch(SearchCriteria.MatchClauseAttribute.TYPE, DSTYPE)) + dataSetSc.addSubCriteria(SearchSubCriteria.createSampleCriteria(sc)) + foundDataSets = search_service.searchForDataSets(dataSetSc) + if foundDataSets.size() > 0: + dataSet.setParentDatasets([ds.getDataSetCode() for ds in foundDataSets]) + + shutil.rmtree(incomingPath)