diff --git a/deep_sequencing_unit/sourceTest/core-plugins/illumina-qgf/1/dss/drop-boxes/create-flowcell-hiseq/create-flowcell-hiseq.py b/deep_sequencing_unit/sourceTest/core-plugins/illumina-qgf/1/dss/drop-boxes/create-flowcell-hiseq/create-flowcell-hiseq.py index 304ca8cd4112c3ce026426ab8016ed31a178b67e..7a6945be12eb5cb25413bb1720fdd17239683d1f 100644 --- a/deep_sequencing_unit/sourceTest/core-plugins/illumina-qgf/1/dss/drop-boxes/create-flowcell-hiseq/create-flowcell-hiseq.py +++ b/deep_sequencing_unit/sourceTest/core-plugins/illumina-qgf/1/dss/drop-boxes/create-flowcell-hiseq/create-flowcell-hiseq.py @@ -37,7 +37,6 @@ from time import * from datetime import * import xml.etree.ElementTree as etree from ch.systemsx.cisd.openbis.generic.shared.api.v1.dto import SearchCriteria -from os.path import isfile IS_HISEQ_RUN=False RUNPARAMETERS = 'runParameters.xml' @@ -144,12 +143,6 @@ def setFcProperty(searchId, dict, newFlowCell, runInfo): # ----------------------------------------------------------------------------- -def checkIfFileExists (filePath): - if not os.path.isfile(filePath): - return False - else: - return True - def process(transaction): incoming = transaction.getIncoming() incomingPath = incoming.getAbsolutePath() @@ -159,16 +152,8 @@ def process(transaction): name = incoming.getName() split=name.split("_") - runInfoFile = os.path.join(incomingPath, RUNINFO) - runParametersFile = os.path.join(incomingPath, RUNPARAMETERS) - - if not (checkIfFileExists(runInfoFile)): - raise IOError ("File not found: " + RUNINFO) - if not (checkIfFileExists(runParametersFile)): - raise IOError ("File not found: " + RUNPARAMETERS) - # Parse the RunInfo.xml file - runInfo = parseXmlFile(runInfoFile) + runInfo = parseXmlFile(incomingPath + '/' + RUNINFO) maxLanes = runInfo.getAllchildren('FlowcellLayout')[0].attrib[RUNINFO_XML['LANECOUNT']] # Search for the sample and check if there is already sample with this name @@ -194,10 +179,10 @@ def process(transaction): run = 
runInfo.getAllchildren('Run')[0].attrib if (run['Id'] != name): raise NameError('Flowcell names do not match between directory name '+ name + - ' and ' + RUNINFO + ' property file: ' + run['Id']) + ' and ' + RUNINFO + 'property file: ' + run['Id']) # The HiSeq is providing more infos, which we will parse here: - runParameters = parseXmlFile(runParametersFile) + runParameters = parseXmlFile(incomingPath + '/' + RUNPARAMETERS) addVocabularyTerm(transaction, "PIPELINE_VERSION", runParameters.getXmlElement(RUNPARAMETERS_XML['RTAVERSION'])) newFlowCell.setPropertyValue("ILLUMINA_PIPELINE_VERSION", runParameters.getXmlElement(RUNPARAMETERS_XML['RTAVERSION'])) diff --git a/deep_sequencing_unit/sourceTest/core-plugins/illumina-qgf/1/dss/drop-boxes/export-meta-data/export-meta-data.py b/deep_sequencing_unit/sourceTest/core-plugins/illumina-qgf/1/dss/drop-boxes/export-meta-data/export-meta-data.py new file mode 100644 index 0000000000000000000000000000000000000000..f5253bc08749ffdd20d0faad672fc9a3765af137 --- /dev/null +++ b/deep_sequencing_unit/sourceTest/core-plugins/illumina-qgf/1/dss/drop-boxes/export-meta-data/export-meta-data.py @@ -0,0 +1,464 @@ +''' +Processes each flow lane of a Sequencing run + +Expects as incoming folder: +BSSE_QGF_22266_H0W8YBGXX_1 +or +Undetermined_H0W8YBGXX + +Note: +print statements go to: ~openbis/sprint/datastore_server/log/startup_log.txt +''' + +import os +import fnmatch +import time +import shutil +import re +import subprocess +from time import * +from datetime import * +from ch.systemsx.cisd.openbis.generic.shared.api.v1.dto import SearchCriteria +from ch.systemsx.cisd.openbis.dss.generic.shared import DataSourceQueryService + +FASTQ_GZ_PATTERN = "*.fastq.gz" +METADATA_FILE_SUFFIX = "_metadata.tsv" +AFFILIATION= {'FMI': '/links/shared/dsu/dss/customers/fmi/drop-box/', + 'BIOCENTER_BASEL': '/links/shared/dsu/dss/customers/biozentrum/drop-box/', + 'NEUROSTEMX': '/links/shared/dsu/dss/customers/biozentrum/drop-box/', + 'SWISS_TPH' : 
'/links/shared/dsu/dss/customers/biozentrum/drop-box/'} +AFFILIATION_PROPERTY_NAME='AFFILIATION' +INDEX1='BARCODE' +INDEX2='INDEX2' +EXTERNAL_SAMPLE_NAME='EXTERNAL_SAMPLE_NAME' +INDEXREAD1='INDEXREAD' +INDEXREAD2='INDEXREAD2' +SAMPLE_TYPE = 'SAMPLE_TYPE' +SAMPLE_CODE = 'SAMPLE_CODE' +NCBI_ORGANISM_TAXONOMY='NCBI_ORGANISM_TAXONOMY' +PHIX_TAXONOMY_ID='10847' +DEFAULT_INDEX='NoIndex' +CRC32_PATH='lib/crc32' + +# ------------------------------------------------------------------------------- + +def getThreadProperties(transaction): + threadPropertyDict = {} + threadProperties = transaction.getGlobalState().getThreadParameters().getThreadProperties() + for key in threadProperties: + try: + threadPropertyDict[key] = threadProperties.getProperty(key) + except: + pass + return threadPropertyDict + +def CRC32_from_file(filename, transaction): + threadPropertyDict = getThreadProperties(transaction) + absolutePath = os.path.dirname(os.path.realpath(threadPropertyDict['script-path'])) + fullPathCrc32 = (os.path.join(absolutePath, CRC32_PATH)) + if os.path.exists(fullPathCrc32): + args = [fullPathCrc32, filename] + p = subprocess.Popen(args, stdout=subprocess.PIPE) + cksum = (p.communicate()[0]) + print("Calculated crc32 checksum for: "+ os.path.basename(filename) + " " + cksum) + else: + cksum = 0 & 0xFFFFFFFF + return cksum + +def getFileNames(path): + ''' + Gets all files matching a PATTERN in a path recursively + and returns the result as a list + ''' + matches = [] + for root, dirnames, filenames in os.walk(path): + for filename in fnmatch.filter(filenames, FASTQ_GZ_PATTERN): + matches.append(os.path.join(root, filename)) + matches.sort() + return(matches) + +def writeMetadataFile(transaction, folder_name, meta_data_file_name, sequencing_sample_properties_dict, + fcMetaDataDict, experiment, affiliation_name, fastqFileList, flowLane): + ''' + Writes a file of meta data related to one sample + ''' + + sequencing_sample_properties_list = 
sequencing_sample_properties_dict.keys() + sequencing_sample_properties_list.sort() + + expId = experiment.getExperimentIdentifier() + try: + meta_data_file = open(meta_data_file_name,'w') + for propertyType in sequencing_sample_properties_list: + if (propertyType in [u'FLOW_CELL_PROPERTIES']): + continue + if propertyType in [SAMPLE_TYPE] or propertyType in [SAMPLE_CODE]: + meta_data_file.write(propertyType.encode('utf-8') + "\t" + + str(sequencing_sample_properties_dict[propertyType])+ "\n") + else: + meta_data_file.write(propertyType.encode('utf-8') + "\t" + + sequencing_sample_properties_dict[propertyType].encode('utf-8').replace('\n',',') + "\n") + + meta_data_file.write("EXPERIMENT\t" + expId + "\n".encode('utf-8')) + meta_data_file.write("\nFLOWCELL PROPERTIES\n".encode('utf-8')) + fcMetaDataDict["LANE_NUMBER"] = flowLane + keys = fcMetaDataDict.keys() + keys.sort() + + sequencer_vocabulary_description = get_vocabulary_descriptions(transaction, 'SEQUENCER') + meta_data_file.write('SEQUENCER_MODEL' + "\t" + sequencer_vocabulary_description[fcMetaDataDict['SEQUENCER']].encode('utf-8') + "\n") + + for k in keys: + meta_data_file.write(k.encode('utf-8') + "\t" + fcMetaDataDict[k].encode('utf-8') + "\n") + + meta_data_file.write("\nFASTQ_FILES\n".encode('utf-8')) + for file in fastqFileList: + meta_data_file.write(os.path.basename(file) + "\t" + str(CRC32_from_file(file, transaction)) + "\n") + + except IOError: + print ('File error, could not write '+ file) + finally: + meta_data_file.close() + + destinationFolder = folder_name + #extraCopy (affiliation_name, meta_data_file_name) + #extraCopySciCore (affiliation_name, meta_data_file_name, destinationFolder) + +def create_openbis_timestamp (): + ''' + Create an openBIS conform timestamp + ''' + tz=localtime()[3]-gmtime()[3] + d=datetime.now() + return d.strftime("%Y-%m-%d %H:%M:%S GMT"+"%+.2d" % tz+":00") + +# ------------------------------------------------------------------------------- + + def 
sortedDictValues(adict): + ''' + Given a dictionary it returns the values of this dict sorted + by the keys + d = {2:1, 4:1, 1:1, 100:3, 3:5} + sortedDictValues(d) + [1, 1, 5, 1, 3] + ''' + keys = adict.keys() + keys.sort() + return map(adict.get, keys) + +# ------------------------------------------------------------------------------- + +def extraCopy (affiliation_name, path): + ''' + @deprecated: replaced with extraCopySciCore + Handles the extra copies of the data for transfer with datamover via the + bc2 network to the FMI and BIOCENTER + For the BIOCENTER there is a folder created in which all data gets into + ''' + if (affiliation_name in AFFILIATION): + if (affiliation_name == 'BIOCENTER_BASEL' or affiliation_name == 'NEUROSTEMX' ): + dirname = AFFILIATION[affiliation_name] + datetime.now().strftime("%Y-%m-%d") + if not os.path.exists(dirname): + os.mkdir(dirname) + shutil.copy(path, dirname) + else: + shutil.copy(path, AFFILIATION[affiliation_name]) + +# ------------------------------------------------------------------------------- + +def extraCopySciCore (affiliation_name, filePath, destinationFolder=""): + ''' + Handles the extra copies of the data for transfer with datamover for SCICORE + ''' + + #dropBoxFolder = '/tmp/scicore' + dropBoxFolder = '/links/shared/dsu/dss/customers/biozentrum_scicore/drop-box' + basename = os.path.basename(filePath) + + print("extraCopySciCore") + print basename + print affiliation_name + + if (affiliation_name in ['BIOCENTER_BASEL', 'NEUROSTEMX', 'SWISS_TPH']): + dirname = os.path.join(dropBoxFolder, destinationFolder) + if not os.path.exists(dirname): + os.mkdir(dirname) + print("COPYING " + filePath + " TO " + dirname) + shutil.copy(filePath, dirname) + +# ------------------------------------------------------------------------------- + +def get_sample_properties (transaction, sample): + + sample_properties_dict = {} + # returns Map<String, String> + sample_properties = sample.getSample().getProperties() + 
sequencing_sample_type = sample.getSampleType() + sequencing_sample_code = sample.getCode() + sample_properties_dict[SAMPLE_TYPE] = sequencing_sample_type + sample_properties_dict[SAMPLE_CODE] = sequencing_sample_code + + for property in sample_properties: + code = property.getPropertyType().getSimpleCode() + sample_properties_dict[code] = property.tryGetAsString() + + return sample_properties_dict + +# ------------------------------------------------------------------------------- + +def searchParents (search_service, parents): + + # search for the parents + sc = SearchCriteria() + # set the Search Criteria to an OR condition, default is AND + sc.setOperator(SearchCriteria.SearchOperator.MATCH_ANY_CLAUSES) + # Get the codes for all parents + for parent in parents: + parentSubCode = parent.getSubCode() + sc.addMatchClause(SearchCriteria.MatchClause.createAttributeMatch(SearchCriteria.MatchClauseAttribute.CODE, parentSubCode)); + # all parents of the flow lane + foundParents = search_service.searchForSamples(sc) + + return foundParents + +# ------------------------------------------------------------------------------- + +def sanitizeString(myString): + return re.sub('[^A-Za-z0-9]+', '_', myString) + +# ------------------------------------------------------------------------------- + +def searchSample (sample_code, search_service): + sc = SearchCriteria() + print('Searching sample: '+ str(sample_code)) + sc.addMatchClause(SearchCriteria.MatchClause.createAttributeMatch(SearchCriteria.MatchClauseAttribute.CODE, sample_code)); + foundSamples = search_service.searchForSamples(sc) + return foundSamples + + +def renameFiles (fastq_files, undetermined, flow_cell_id): + + newFastqFileList = [] + for file in fastq_files: + if undetermined: + folder = os.path.dirname(file) + fileName = os.path.basename(file) + filepart, suffix = fileName.split('.',1) + new_file = folder + "/" + flow_cell_id + '_' + filepart + "." 
+ suffix + print ("Renaming file " + file + " to " + new_file) + os.rename(file, new_file) + else: + new_file = file + newFastqFileList.append(new_file) + return newFastqFileList + +# ------------------------------------------------------------------------------- + +def put_files_to_dataset (transaction, dataSet, fastq_files, folder_name, flow_cell_id, affiliation_name, undetermined): + + for file in fastq_files: + extraCopySciCore (affiliation_name, file, folder_name) + transaction.moveFile(file, dataSet, folder_name) + +# ------------------------------------------------------------------------------- + +def split_incoming_folder_name (name): + split=name.split("_") + + # expected incoming Name, e.g.: BSSE_QGF_22266_H0W8YBGXX_1 + if (len(split) == 5): + sample_code = '-'.join([split[0], split[1], split[2]]) + flowCellId = split[3] + flowLane = split[-1] + undetermined = False + + # expected Undetermined_H0W8YBGXX + if (len(split) == 2): + sample_code = '' + flowCellId = split[-1] + flowLane = "1" + undetermined = True + + incoming_sample = flowCellId + ':' + flowLane + return sample_code, flowCellId, flowLane, incoming_sample, undetermined + +# ------------------------------------------------------------------------------- + +def get_vocabulary_descriptions (transaction, vocabulary_name): + vocabulary_descriptions_dict = {} + vocabulary = transaction.getVocabulary(vocabulary_name) + vocabulary_terms = vocabulary.getTerms() + for term in vocabulary_terms: + vocabulary_descriptions_dict[term.getCode()] = term.getDescription() + return vocabulary_descriptions_dict + +# ------------------------------------------------------------------------------- + +def process(transaction): + + undetermined = False + + print("\n" + str(datetime.now())) + + incomingPath = transaction.getIncoming().getAbsolutePath() + name = transaction.getIncoming().getName() + # Get the search service + search_service = transaction.getSearchService() + + def searchSampleInSpace (type, 
search_service): + + spaceCode = "UNI_BASEL_GAGNEUX" + sc = SearchCriteria() +# print('Searching sample: '+ str(code)) + sc.addMatchClause(SearchCriteria.MatchClause.createAttributeMatch(SearchCriteria.MatchClauseAttribute.SPACE, spaceCode)); + sc.addMatchClause(SearchCriteria.MatchClause.createAttributeMatch(SearchCriteria.MatchClauseAttribute.TYPE, type)); + foundSamples = search_service.searchForSamples(sc) + return foundSamples + + samples = searchSampleInSpace("ILLUMINA_SEQUENCING", search_service) + print(samples.size()) + + + + + for sample in samples[0:5]: + + sequencing_sample_properties_dict = get_sample_properties (transaction, sample) + meta_data_file_name = sequencing_sample_properties_dict["SAMPLE_CODE"] + METADATA_FILE_SUFFIX + + sampleName = sequencing_sample_properties_dict["SAMPLE_CODE"] + + print(meta_data_file_name) + print(sequencing_sample_properties_dict) + + flowCellId = "150109_D00535_0042_AHB2K0ADXX" + + flowcell_sample_immutable = searchSample (flowCellId, search_service) + fcMetaDataDict = get_sample_properties(transaction, flowcell_sample_immutable[0]) + + experiment = sample.getExperiment() + affiliation_name = "dummy" + flowLane = "1" + newFastqFiles = ["/tmp/hello.txt", "/tmp/bio.txt"] + + meta_data_file_path = '/tmp/metadata_'+ sampleName + + DATA_SOURCE = "pathinfo_dev" + QUERY = """ + SELECT ds.code as "data_set_code", dsf.* + FROM data_sets ds, data_set_files dsf + WHERE ds.code = ?{1} AND dsf.dase_id = ds.id + """ + + dsqs = DataSourceQueryService() + + dsqs.select(DATA_SOURCE, QUERY) + #results = queryService.select(DATA_SOURCE, QUERY, ['20150108225105804-60446532']) + print(results) + + + writeMetadataFile(transaction, sampleName, meta_data_file_path, sequencing_sample_properties_dict, + fcMetaDataDict, experiment, affiliation_name, newFastqFiles, flowLane) + +# sample_code, flowCellId, flowLane, incoming_sample, undetermined = split_incoming_folder_name (name) +# +# # get all fastqs +# fastq_files=getFileNames(incomingPath) 
+# +# # BSSE-QGF-22266-H0W8YBGXX-1-654-BC3-TTAGGC_S1_L001_R1_001.fastq.gz +# # BSSE-QGF-22051-H0T25AGXX-1-1-1-TAAGGCGA-CTCTCTAT_S46_L001_R1_001.fastq.gz +# first_fastq_file = os.path.basename(fastq_files[0]) +# +# flowcell_sample_immutable = searchSample (flowCellId, search_service) +# fcMetaDataDict = get_sample_properties(transaction, flowcell_sample_immutable[0]) +# foundLane = searchSample (incoming_sample, search_service) +# +# # there should be only one sample because it is unique within one Flow Cell +# if (len(foundLane) > 1): +# raise Exception("More than one sample found! No unique code: " + incoming_sample) +# elif (len(foundLane) == 0): +# raise Exception("No matching sample found for: " + incoming_sample) +# else : +# sample = foundLane[0].getSample() +# parents = sample.getParents() +# +# # Create a data set and set type +# dataSet = transaction.createNewDataSet("FASTQ_GZ") +# dataSet.setMeasuredData(False) +# dataSet.setPropertyValue(INDEX1, DEFAULT_INDEX) +# dataSet.setPropertyValue(INDEX2, DEFAULT_INDEX) +# dirName = transaction.createNewDirectory(dataSet,name) +# +# if not undetermined: +# newFastqFiles = fastq_files +# foundSample = searchSample (sample_code, search_service) +# sequencing_sample = foundSample[0].getSample() +# experiment = sequencing_sample.getExperiment() +# sequencing_sample_code = sequencing_sample.getCode() +# print("sequencing_sample_code: "+ sequencing_sample_code) +# +# sequencing_sample_properties_dict = get_sample_properties (transaction, foundSample[0]) +# +# if (INDEX1 in sequencing_sample_properties_dict) and (fcMetaDataDict[INDEXREAD1] > 0): +# #print(sequencing_sample_properties_dict[INDEX1]) +# dataSet.setPropertyValue(INDEX1, sequencing_sample_properties_dict[INDEX1]) +# if (INDEX2 in sequencing_sample_properties_dict) and (fcMetaDataDict[INDEXREAD2] > 0): +# dataSet.setPropertyValue(INDEX2, sequencing_sample_properties_dict[INDEX2]) +# dataSet.setPropertyValue(EXTERNAL_SAMPLE_NAME, 
sequencing_sample_properties_dict[EXTERNAL_SAMPLE_NAME]) +# +# if (AFFILIATION_PROPERTY_NAME in sequencing_sample_properties_dict): +# affiliation_name = sequencing_sample_properties_dict[AFFILIATION_PROPERTY_NAME] +# +# filepart, suffix = first_fastq_file.split('.',1) +# meta_data_file_name = filepart.rsplit('_',2)[0] + METADATA_FILE_SUFFIX +# # get a file from the IDataSetRegistrationTransaction so it is automatically part of the data set +# meta_data_file_path = transaction.createNewFile(dataSet, name, meta_data_file_name) +# writeMetadataFile(transaction, name, meta_data_file_path, sequencing_sample_properties_dict, +# fcMetaDataDict, experiment, affiliation_name, fastq_files, flowLane) +# +# # Undetermined Files +# else: +# affiliation_name = "" +# affiliation_for_Undetermined = "" +# newFastqFiles = [] +# lane_parents = searchParents (search_service, parents) +# newFastqFiles = renameFiles(fastq_files, undetermined, flowCellId) +# for parent in lane_parents: +# sequencing_sample_properties_dict = get_sample_properties (transaction, parent) +# parent_sample = parent.getSample() +# sample_code = parent_sample.getCode() +# experiment = parent_sample.getExperiment() +# if (AFFILIATION_PROPERTY_NAME in sequencing_sample_properties_dict): +# affiliation_name = sequencing_sample_properties_dict[AFFILIATION_PROPERTY_NAME] +# +# # Special Sample Types without index (e.g. ILLUMINA_SEQUENCING_NEUROSTEMX_SINGLECELL) are caught here. 
+# # as those samples do not have a NCBI ORGANISM TAXONOMY +# if NCBI_ORGANISM_TAXONOMY not in sequencing_sample_properties_dict: +# print(sample_code + ": Processing Sample without NCBI ORGANISM TAXONOMY: ILLUMINA_SEQUENCING_NEUROSTEMX_SINGLECELL") +# meta_data_file_path = transaction.createNewFile(dataSet, name, sample_code + '_' + flowCellId + '_' + first_fastq_file.split('.')[0] + METADATA_FILE_SUFFIX) +# writeMetadataFile(transaction, name, meta_data_file_path, sequencing_sample_properties_dict, +# fcMetaDataDict, experiment, affiliation_name, newFastqFiles, flowLane) +# affiliation_for_Undetermined = affiliation_name +# +# elif (INDEX1 not in sequencing_sample_properties_dict) and (INDEX2 not in sequencing_sample_properties_dict) and \ +# (sequencing_sample_properties_dict[NCBI_ORGANISM_TAXONOMY] != PHIX_TAXONOMY_ID): +# print('NONINDEXED sample and Taxonomy id is NOT' + PHIX_TAXONOMY_ID +', probably a pool: ' + sample_code) +# meta_data_file_path = transaction.createNewFile(dataSet, name, sample_code + '_' + flowCellId + '_' + first_fastq_file.split('.')[0] + METADATA_FILE_SUFFIX) +# writeMetadataFile(transaction, name, meta_data_file_path, sequencing_sample_properties_dict, +# fcMetaDataDict, experiment, affiliation_name, newFastqFiles, flowLane) +# affiliation_for_Undetermined = affiliation_name +# # PARENTS: +# else: +# # Create Parent Meta data +# print(sample_code + ": Create parent meta data file") +# meta_data_file_path = transaction.createNewFile(dataSet, name, 'PARENT_' + sample_code + '_' + flowCellId + METADATA_FILE_SUFFIX) +# writeMetadataFile(transaction, name, meta_data_file_path, sequencing_sample_properties_dict, +# fcMetaDataDict, experiment, affiliation_name, [], flowLane) +# continue +# +# put_files_to_dataset (transaction, dataSet, newFastqFiles, name, flowCellId, affiliation_for_Undetermined, undetermined) +# +# if foundLane.size() > 0: +# sa = transaction.getSampleForUpdate(foundLane[0].getSampleIdentifier()) +# 
sa.setPropertyValue("DATA_TRANSFERRED", create_openbis_timestamp()) +# dataSet.setSample(foundLane[0]) diff --git a/deep_sequencing_unit/sourceTest/core-plugins/illumina-qgf/1/dss/drop-boxes/export-meta-data/lib/crc32 b/deep_sequencing_unit/sourceTest/core-plugins/illumina-qgf/1/dss/drop-boxes/export-meta-data/lib/crc32 new file mode 100755 index 0000000000000000000000000000000000000000..ffaf277227c6dfb0dae3fbe751d9db158bd5ae29 Binary files /dev/null and b/deep_sequencing_unit/sourceTest/core-plugins/illumina-qgf/1/dss/drop-boxes/export-meta-data/lib/crc32 differ diff --git a/deep_sequencing_unit/sourceTest/core-plugins/illumina-qgf/1/dss/drop-boxes/export-meta-data/plugin.properties b/deep_sequencing_unit/sourceTest/core-plugins/illumina-qgf/1/dss/drop-boxes/export-meta-data/plugin.properties new file mode 100644 index 0000000000000000000000000000000000000000..6e3d93e5e91dd176dec76e66af13591e5b639f5e --- /dev/null +++ b/deep_sequencing_unit/sourceTest/core-plugins/illumina-qgf/1/dss/drop-boxes/export-meta-data/plugin.properties @@ -0,0 +1,12 @@ +# +# Drop box for registering a flow cell output as a data set +# +# Variables: +# incoming-root-dir +# Path to the directory which contains incoming directories for drop boxes. 
+incoming-dir = ${incoming-root-dir}/export-meta-data +incoming-data-completeness-condition = marker-file +top-level-data-set-handler = ch.systemsx.cisd.etlserver.registrator.api.v2.JythonTopLevelDataSetHandlerV2 +script-path = export-meta-data.py +storage-processor = ch.systemsx.cisd.etlserver.DefaultStorageProcessor + diff --git a/deep_sequencing_unit/sourceTest/core-plugins/illumina-qgf/1/dss/drop-boxes/read-demultiplex-stats-nextseq/plugin.properties b/deep_sequencing_unit/sourceTest/core-plugins/illumina-qgf/1/dss/drop-boxes/read-demultiplex-stats-nextseq/plugin.properties index af28b8443a706ecb416030d519ec5e6ff5220498..102a5fe3f83698dc29055fd828dd2fd1518d6bf7 100644 --- a/deep_sequencing_unit/sourceTest/core-plugins/illumina-qgf/1/dss/drop-boxes/read-demultiplex-stats-nextseq/plugin.properties +++ b/deep_sequencing_unit/sourceTest/core-plugins/illumina-qgf/1/dss/drop-boxes/read-demultiplex-stats-nextseq/plugin.properties @@ -2,7 +2,7 @@ # Variables: # incoming-root-dir # Path to the directory which contains incoming directories for drop boxes. 
-incoming-dir = ${root}/dss/read-demultiplex-stats-nextseq +incoming-dir = ${incoming-root-dir}/read-demultiplex-stats-nextseq incoming-data-completeness-condition = marker-file top-level-data-set-handler = ch.systemsx.cisd.etlserver.registrator.api.v2.JythonTopLevelDataSetHandlerV2 script-path = read-demultiplex-stats-nextseq.py diff --git a/deep_sequencing_unit/sourceTest/core-plugins/illumina-qgf/1/dss/drop-boxes/read-demultiplex-stats-nextseq/read-demultiplex-stats-nextseq.py b/deep_sequencing_unit/sourceTest/core-plugins/illumina-qgf/1/dss/drop-boxes/read-demultiplex-stats-nextseq/read-demultiplex-stats-nextseq.py index 4be28cc09e8250e64b558ac5bd3b1334cc1033a1..50bdbc493a01428075aeaa0667c2318dc44e493d 100755 --- a/deep_sequencing_unit/sourceTest/core-plugins/illumina-qgf/1/dss/drop-boxes/read-demultiplex-stats-nextseq/read-demultiplex-stats-nextseq.py +++ b/deep_sequencing_unit/sourceTest/core-plugins/illumina-qgf/1/dss/drop-boxes/read-demultiplex-stats-nextseq/read-demultiplex-stats-nextseq.py @@ -1,545 +1,328 @@ ''' - @copyright: 2012 ETH Zuerich, CISD - - @license: - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
- - -@author: Manuel Kohler - -XML Structur which is processed: - -<?xml version="1.0"?> -<Summary> - <Lane index="8"> - <Sample index="lane8"> - <Barcode index="Undetermined"> - <Tile index="1101"> - <Read index="1"> - <Raw> - <Yield>1921250</Yield> - <YieldQ30>949680</YieldQ30> - <ClusterCount>38425</ClusterCount> - <ClusterCount0MismatchBarcode>0</ClusterCount0MismatchBarcode> - <ClusterCount1MismatchBarcode>0</ClusterCount1MismatchBarcode> - <QualityScoreSum>40995660</QualityScoreSum> - </Raw> - <Pf> - <Yield>945450</Yield> - <YieldQ30>854815</YieldQ30> - <ClusterCount>18909</ClusterCount> - <ClusterCount0MismatchBarcode>0</ClusterCount0MismatchBarcode> - <ClusterCount1MismatchBarcode>0</ClusterCount1MismatchBarcode> - <QualityScoreSum>33815505</QualityScoreSum> - </Pf> - </Read> - </Tile> - [...] - -@note: -print statements go to: <openBIS_HOME>/datastore_server/log/startup_log.txt -''' - -import time -import os -import fnmatch -import xml.etree.ElementTree as etree -from ch.systemsx.cisd.openbis.generic.shared.api.v1.dto import SearchCriteria -from ch.systemsx.cisd.openbis.generic.shared.api.v1.dto import SearchSubCriteria - - -class parseXmlFile: - - def __init__(self, xmlFile): - self.xmlFile = xmlFile - self.tree = etree.parse(self.xmlFile) - self.root = self.tree.getroot() - -# ----------------------------------------------------------------------------- - -class qcValues(object): - def __init__(self, Yield = 0, YieldQ30 = 0, ClusterCount = 0, - ClusterCount0MismatchBarcode = 0, ClusterCount1MismatchBarcode = 0, - QualityScoreSum = 0, *args, **kwargs): - self.Yield = Yield - self.YieldQ30 = YieldQ30 - self.ClusterCount = ClusterCount - self.ClusterCount0MismatchBarcode = ClusterCount0MismatchBarcode - self.ClusterCount1MismatchBarcode = ClusterCount1MismatchBarcode - self.QualityScoreSum = QualityScoreSum - - def __str__(self): - return "Yield: %s, YieldQ30: %s, ClusterCount: %s, ClusterCount0MismatchBarcode: %s," \ - " CusterCount1MismatchBarcode: %s, 
QualityScoreSum: %s" \ - % (self.Yield, self.YieldQ30, self.ClusterCount, self.ClusterCount0MismatchBarcode, - self.ClusterCount1MismatchBarcode, self.QualityScoreSum) - -class sample: - def __init__(self, Lane = 0, Sample = '', Barcode = '', Tile = '', Read = '', rawqc = qcValues([]), - pfqc = qcValues([]), *args, **kwargs): - self.Lane = Lane - self.Sample = Sample - self.Barcode = Barcode - self.Tile = Tile - self.Read = Read - self.rawqc = rawqc - self.pfqc = pfqc - - def __str__(self): - return "Lane: %s, Sample: %s, Barcode: %s, Tile: %s, Read: %s, rawqc: %s, pfqc: %s" \ - % (self.Lane, self.Sample, self.Barcode, self.Tile, self.Read, self.rawqc, self.pfqc) - -# ----------------------------------------------------------------------------- - -class Statistics: - def __init__(self, lane = 0, sampleName = "", index1 = "NoIndex", index2 = "NoIndex", pfYieldSum = 0, - rawYieldSum = 0, pfPercentage = 0.0, rawReadsSum = 0, pfReadsSum = 0, - pfYieldQ30Sum = 0, qualityScoreSum = 0, rawPercentageReadsPerLane = 0.0, - pfYieldQ30Percentage = 0.0, pfsumQualityScore = 0, pfmeanQualityScore = 0.0): - self.lane = lane - self.sampleName = sampleName - self.index1 = index1 - self.index2 = index2 - self.pfYieldSum = pfYieldSum - self.rawYieldSum = rawYieldSum - self.pfPercentage = pfPercentage - self.rawReadsSum = rawReadsSum - self.pfReadsSum = pfReadsSum - self.pfYieldQ30Sum = pfYieldQ30Sum - self.qualityScoreSum = qualityScoreSum - self.rawPercentageReadsPerLane = rawPercentageReadsPerLane - self.pfYieldQ30Percentage = pfYieldQ30Percentage - self.pfsumQualityScore = pfsumQualityScore - self.pfmeanQualityScore = pfmeanQualityScore - - def __str__(self): - return "lane: %s, sampleName: %s, index1: %s, index2: %s, pfYieldSum: %s, pfPercentage: %s," \ - " rawReadsSum: %s, pfReadsSum: %s," \ - " rawPercentageReadsPerLane: %s, pfYieldQ30Percentage: %s," \ - " pfmeanQualityScore: %s" \ - % (self.lane, self.sampleName, self.index1, self.index2, self.pfYieldSum, self.pfPercentage, - 
self.rawReadsSum, self.pfReadsSum, - self.rawPercentageReadsPerLane, self.pfYieldQ30Percentage, self.pfmeanQualityScore) - - def calculatePercentagePF (self, rawYield = 0, pfYield = 1): - try: - return round(float(pfYield) / float(rawYield) * 100, 2) - except: - return 0.0 - - def calulateMeanQualityScore (self, pfqualityScoreSum = 0, pfYield = 1): - try: - return round (float(pfqualityScoreSum) / float(pfYield), 2) - except: - return 0.0 - - def calculateYieldQ30Percentage (self, pfYieldQ30 = 0, pfYield = 1): - try: - return round (float(pfYieldQ30) / float(pfYield) * 100, 2) - except: - return 0.0 - -# ----------------------------------------------------------------------------- - -def xml2Memory(DEMULTIPLEX_XML): - ''' - Parse the XML file and put all values in a memory structure: - List of: - lane, sample, barcode, tile, read, qcRawList, qcPfList - ''' - - RAW_TAG = "Raw" - PF_TAG = "Pf" - - sampleList = [] - - xml = parseXmlFile(DEMULTIPLEX_XML) - r = xml.tree.getroot() - - for lane in r.getchildren(): - for mysample in lane: - for barcode in mysample: - for tile in barcode: - for read in tile: - - qcRaw = qcValues() - qcPf = qcValues() - qcRawList = [] - qcPfList = [] - - # Read out the Raw fields - raw = read.find(RAW_TAG) - for child in raw.getchildren(): - # equivalent to a Java reflection - setattr(qcRaw, child.tag, int(child.text)) - - # Read out the Pf fields - pf = read.find(PF_TAG) - for child in pf.getchildren(): - # equivalent to a Java reflection - setattr(qcPf, child.tag, int(child.text)) - - qcRawList.append(qcRaw) - qcPfList.append(qcPf) - - singleElement = sample () - - setattr(singleElement, lane.tag, lane.attrib) - setattr(singleElement, mysample.tag, mysample.attrib) - setattr(singleElement, barcode.tag, barcode.attrib) - setattr(singleElement, tile.tag, tile.attrib) - setattr(singleElement, read.tag, read.attrib) - singleElement.rawqc = qcRawList - singleElement.pfqc = qcPfList - - sampleList.append(singleElement) - return sampleList - -# 
----------------------------------------------------------------------------- - -def calculateStatistics(listofSamples): - ''' - Structure of 'listofSamples' - Lane: {'index': '6'}, Sample: {'index': 'BSSE-QGF-3524_C0NKPACXX'}, Barcode: {'index': 'TGACCA'}, - Tile: {'index': '2307'}, Read: {'index': '1'}, rawqc:<mem>, pfqc:<mem> - ''' - - numberOfTiles = len(listofSamples) - - tile = sample() - raw = qcValues () - pf = qcValues () - stats = Statistics() - - for tile in listofSamples: - raw = tile.rawqc[0] - pf = tile.pfqc[0] - - stats.pfYieldSum += pf.Yield - stats.rawYieldSum += raw.Yield - stats.rawReadsSum += raw.ClusterCount - stats.pfReadsSum += pf.ClusterCount - stats.pfYieldQ30Sum += pf.YieldQ30 - stats.qualityScoreSum += pf.QualityScoreSum - - # Can not be set here, needs to be calculated later - #stats.rawPercentageReadsPerLane = rawPercentageReadsPerLane - stats.pfPercentage = stats.calculatePercentagePF(stats.rawYieldSum, stats.pfYieldSum) - stats.pfYieldQ30Percentage = stats.calculateYieldQ30Percentage(stats.pfYieldQ30Sum, stats.pfYieldSum) - stats.pfmeanQualityScore = stats.calulateMeanQualityScore(stats.qualityScoreSum, stats.pfYieldSum) - stats.lane = listofSamples[0].Lane.values()[0] - stats.sampleName = listofSamples[0].Sample.values()[0] - index = listofSamples[0].Barcode.values()[0] - try: - stats.index1, stats.index2 = index.split("-") - except: - stats.index1 = index - return stats - -# ----------------------------------------------------------------------------- - - -def rawReadSumPerSamples(stat): - ''' - Creates a dictionary with the lanes as keys - The values are a list where the elements are a dictionary again. - This dictionary has the sample names as key and the RawReadSum as value. 
- - Example: - {4': [{'BSSE-QGF-3434_C0NKPACXX': 248999502}], '7': [{'lane7': 123921974}, - {'BSSE-QGF-3527_C0NKPACXX': 38587703}, {'BSSE-QGF-3529_C0NKPACXX': 30130893}, - {'BSSE-QGF-3528_C0NKPACXX': 34519296}, {'BSSE-QGF-3526_C0NKPACXX': 34980179}]} - ''' - - laneDict = {} - for e in stat: - if e.lane not in laneDict: - laneDict[e.lane] = [{e.sampleName:e.rawReadsSum}] - else: - laneDict[e.lane].append({e.sampleName:e.rawReadsSum}) - return laneDict - -# ----------------------------------------------------------------------------- - -def createSumRawReadsPerLane(laneDict): - ''' - Creates a dictionary with lane as key and sum of Raw Reads as value: - {'1': 183180877, '3': 244968562, '2': 191496395, '5': 193466239, '4': 248999502, - '7': 262140045, '6': 257136830, '8': 209948449} - ''' - sumRawReadsDict = {} - for lane in laneDict: - sumRawReads = 0 - for sampleNameDict in laneDict[lane]: - sumRawReads += sampleNameDict.values()[0] - - sumRawReadsDict[lane] = sumRawReads - return sumRawReadsDict - -# ----------------------------------------------------------------------------- - -def createPercentagePerLane(laneDict, sumRawReadsDict): - ''' - Creates a dictionary with the sample Name as key and the percentage of raw reads related to - all reads in the same lane - {'lane7': 47.27, 'BSSE-QGF-3433_C0NKPACXX': 100.0, 'BSSE-QGF-3666_C0NKPACXX': 54.12} - ''' - - relRawReadsDict = {} - for lane in laneDict: - for sampleName in laneDict[lane]: - relRawReadsDict[sampleName.keys()[0]] = round(float(sampleName.values()[0]) / - float(sumRawReadsDict[lane]) * 100, 2) - return relRawReadsDict - -# ----------------------------------------------------------------------------- - -def locate(pattern, root): - '''Locate all files matching supplied filename pattern in and below - supplied root directory.''' - for path, dirs, files in os.walk(os.path.abspath(root)): - for filename in fnmatch.filter(files, pattern): - yield os.path.join(path, filename) - -# 
----------------------------------------------------------------------------- - -def getVocabulary(transaction, vocabularyCode): - - vocabularyTermList = [] - vocabulary = transaction.getSearchService().searchForVocabulary(vocabularyCode) - if (vocabulary is None): - print 'VOCABULARY %s does not exist' % (vocabularyCode) - else: - print "Getting VOCABULARY: " + vocabulary.getCode() - for term in vocabulary.getTerms(): - vocabularyTermList.append(term.getCode()) - vocabularyTermList.sort() - return vocabularyTermList +@copyright: 2012 ETH Zuerich, CISD -# ----------------------------------------------------------------------------- +@license: +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at -def getFlowCellMetaData (transaction,flowCellId): + http://www.apache.org/licenses/LICENSE-2.0 - def sortedDictValues(adict): - keys = adict.keys() - keys.sort() - return map(adict.get, keys) +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
- search = transaction.getSearchService() - sc = SearchCriteria() - print('Searching FlowCell: '+ str(flowCellId)) - sc.addMatchClause(SearchCriteria.MatchClause.createAttributeMatch(SearchCriteria.MatchClauseAttribute.CODE, flowCellId)); - foundFlowCells = search.searchForSamples(sc) +@author: Fabian Gemperle - try: - assert foundFlowCells.size() == 1 - except AssertionError: - print (str(foundFlowCells.size()) + \ - ' flow cells found which match the criterias: '+ flowCellId) +@note: print statements go to ~/openbis/servers/datastore_server/log/startup_log.txt - fcPropertiesDict = {} - fcPropertyTypes = [] - - fcProperties = foundFlowCells[0].getSample().getProperties() - for property in fcProperties: - code = property.getPropertyType().getSimpleCode() - fcPropertyTypes.append(code) - fcPropertiesDict[code] = property.getValue() - - fcPropertyTypes.sort() - return fcPropertiesDict, fcPropertyTypes +''' +import time +import math +import os +import fnmatch +# Load Java-Library to import XML data: +import read_demultiplex_stats +# Load openBIS-Libraries: +from ch.systemsx.cisd.openbis.generic.shared.api.v1.dto import SearchCriteria +from ch.systemsx.cisd.openbis.generic.shared.api.v1.dto import SearchSubCriteria -# ----------------------------------------------------------------------------- def process(transaction): ''' - Main + Main method in corresponding openBIS dropbox ''' - FASTQ_DATA_SET_TYPE='FASTQ_GZ' - DEMUX_FILE='Flowcell_demux_summary.xml' - NO_INDEX='NOINDEX' - UNDETERMINED='UNDETERMINED' - - incomingPath = transaction.getIncoming().getPath() - name = transaction.getIncoming().getName() - - print('\n'+time.ctime()) - - fcPropertiesDict, fcPropertyTypes = getFlowCellMetaData(transaction, name) - print fcPropertiesDict - print fcPropertyTypes - - search_service = transaction.getSearchService() - - FileGenerator= locate(DEMUX_FILE, incomingPath) - DEMULTIPLEX_XML = FileGenerator.next() - - sampleList = xml2Memory(DEMULTIPLEX_XML) - - sa = sample() - 
sampleDict = {} - - # key = sample name, value = sample() - for element in range(0, len(sampleList)): - sa = sampleList[element] - # Check if new sample - if (sa.Sample is not sampleList[element - 1].Sample): - sampleName = sa.Sample.values()[0] - sampleDict[sampleName] = [sa] + # Constants: + XML_FILENAME = 'ConversionStats.xml' + TYPE_DATASET = 'FASTQ_GZ' + INDEX_NO = 'NOINDEX' + INDEX_UNKNOWN = 'UNKNOWN' + INDEX_EMPTY = '' + CODE_INDEX1 = 'BARCODE' + CODE_INDEX2 = 'INDEX2' + CODE_INDEX1LENGTH = 'INDEXREAD' + CODE_INDEX2LENGTH = 'INDEXREAD2' + + ########################################################## + def locate(pattern, root): + '''Locate all files matching supplied filename pattern in and below supplied root directory.''' + for path, dirs, files in os.walk(os.path.abspath(root)): + for filename in fnmatch.filter(files, pattern): + yield os.path.join(path, filename) + + ########################################################## + def getInfoVocabularyTerms(vocabularyCode): + ''' + Get information about Terms of certain Vocabulary in openBIS. 
+ Input: + - vocabularyCode: code of Vocabulary to be investigated + Output: + - vocabularyTerms: list of Terms in Vocabulary + ''' + vocabulary = transaction.getSearchService().searchForVocabulary(vocabularyCode) + + vocabularyTerms = [] + if (vocabulary is None): + print '\nOCCURRED EXCEPTION: Vocabulary %s does not exist' % (vocabularyCode) else: - sampleDict[sampleName].append(sa) - - stat = [calculateStatistics(sampleDict[mysample]) for mysample in sampleDict] - - # calculate the relative amount of reads per index - laneDict = rawReadSumPerSamples(stat) - sumRawReadsDict = createSumRawReadsPerLane(laneDict) - relRawReadsDict = createPercentagePerLane(laneDict, sumRawReadsDict) - - # set the values in the object - for mye in stat: - mye.rawPercentageReadsPerLane = relRawReadsDict[mye.sampleName] - - def sampleSearch(Code=''): - sc = SearchCriteria() - numberOfLanes = 0 - sc.addMatchClause(SearchCriteria.MatchClause.createAttributeMatch(SearchCriteria.MatchClauseAttribute.CODE, Code)); - search_service = transaction.getSearchService() - foundSample = search_service.searchForSamples(sc) - if foundSample.size() > 0: - # Search for contained samples - sampleSc = SearchCriteria() - sampleSc.addSubCriteria(SearchSubCriteria.createSampleContainerCriteria(sc)) - foundContainedSamples = search_service.searchForSamples(sampleSc) - numberOfLanes = foundContainedSamples.size() - return foundSample, foundContainedSamples, numberOfLanes - -#-------------------------------------------------------------------------------------------------------------------------------------- - - def searchDataSetsofSample(sample, index1, index2, DATA_SET_TYPE): - sc = SearchCriteria() - sc.addMatchClause(SearchCriteria.MatchClause.createAttributeMatch(SearchCriteria.MatchClauseAttribute.CODE, sample)); - search_service = transaction.getSearchService() - foundSample = search_service.searchForSamples(sc) - - dataSetSc = SearchCriteria() - # set the Search Criteria to an OR condition, default is 
AND - #dataSetSc.setOperator(SearchCriteria.SearchOperator.MATCH_ANY_CLAUSES) - dataSetSc.addMatchClause(SearchCriteria.MatchClause.createAttributeMatch(SearchCriteria.MatchClauseAttribute.TYPE, DATA_SET_TYPE)) - dataSetSc.addMatchClause(SearchCriteria.MatchClause.createPropertyMatch("BARCODE", index1 )) - dataSetSc.addMatchClause(SearchCriteria.MatchClause.createPropertyMatch("INDEX2", index2)) - dataSetSc.addSubCriteria(SearchSubCriteria.createSampleCriteria(sc)) - foundDataSets = search_service.searchForDataSets(dataSetSc) - print "foundDataSets.size() "+ str(foundDataSets.size()) + for term in vocabulary.getTerms(): + vocabularyTerms.append(term.getCode()) + vocabularyTerms.sort() + + return vocabularyTerms + + ########################################################## + def getInfoSampleProperties(sampleCode): + ''' + Get information about Properties of certain Sample in openBIS. + Input: + - sampleCode: code of Sample to be investigated + Outputs: + - propertiesCode: list of Properties' codes + - propertiesCodeValue: dictionary of Properties' codes and values + ''' + ss = transaction.getSearchService() + + scSample = SearchCriteria() + scSample.addMatchClause(SearchCriteria.MatchClause.createAttributeMatch(SearchCriteria.MatchClauseAttribute.CODE, sampleCode)); + foundSamples = ss.searchForSamples(scSample) + + propertiesCode = [] + propertiesCodeValue = {} + try: + assert foundSamples.size() == 1 + properties = foundSamples[0].getSample().getProperties() + for p in properties: + codeProperty = p.getPropertyType().getSimpleCode() + propertiesCode.append(codeProperty) + propertiesCodeValue[codeProperty] = p.getValue() + propertiesCode.sort() + except AssertionError: + print ('\nOCCURRED EXCEPTION: ' + str(foundSamples.size()) + ' Samples found which match the criteria code \"' + sampleCode + '\".') + + return propertiesCode, propertiesCodeValue + + ########################################################## + def getInfoDataSetPropertiesOfSample(sampleCode): + 
''' + Get information about Properties of some DataSet of certain Sample in openBIS. + Input: + - sampleCode: code of DataSet's Sample to be investigated + Outputs: + - propertiesCode: list of Properties' codes + - propertiesCodeValue: dictionary of Properties' codes and values + ''' + ss = transaction.getSearchService() + + scSample = SearchCriteria() + scSample.addMatchClause(SearchCriteria.MatchClause.createAttributeMatch(SearchCriteria.MatchClauseAttribute.CODE, sampleCode)); + #foundSamples = ss.searchForSamples(scSample) + + scDataSet = SearchCriteria() + scDataSet.addMatchClause(SearchCriteria.MatchClause.createAttributeMatch(SearchCriteria.MatchClauseAttribute.TYPE, TYPE_DATASET)) + scDataSet.addSubCriteria(SearchSubCriteria.createSampleCriteria(scSample)) + foundDataSets = ss.searchForDataSets(scDataSet) + + propertiesCode = [] + propertiesCodeValue = {} + try: + assert foundDataSets.size() > 0 + codeProperties = foundDataSets[0].getAllPropertyCodes() + for cp in codeProperties: + propertiesCode.append(cp) + propertiesCodeValue[cp] = foundDataSets[0].getPropertyValue(cp) + propertiesCode.sort() + except AssertionError: + print ('\nOCCURRED EXCEPTION: ' + str(foundDataSets.size()) + ' DataSets found which Sample match the criteria code \"' + sampleCode + '\" and type \"' + TYPE_DATASET + '\".') + + return propertiesCode, propertiesCodeValue + + ########################################################## + def getIndexesOfDataSetsOfSample(sampleFlowLaneCode): + ''' + Get both indexes (parts of barcode) of all DataSets of certain FlowLane-Sample in openBIS. 
+ Inputs: + - sampleFlowLaneCode: code of DataSet's Sample + Outputs: + - indexes1: list of first index of DataSets + - indexes2: list of second index of DataSets + ''' + ss = transaction.getSearchService() + + scSample = SearchCriteria() + scSample.addMatchClause(SearchCriteria.MatchClause.createAttributeMatch(SearchCriteria.MatchClauseAttribute.CODE, sampleFlowLaneCode)); + #foundSamples = ss.searchForSamples(scSample) + + scDataSet = SearchCriteria() + scDataSet.addMatchClause(SearchCriteria.MatchClause.createAttributeMatch(SearchCriteria.MatchClauseAttribute.TYPE, TYPE_DATASET)) + scDataSet.addSubCriteria(SearchSubCriteria.createSampleCriteria(scSample)) + foundDataSets = ss.searchForDataSets(scDataSet) + + indexes1 = [] + indexes2 = [] + try: + assert foundDataSets.size() > 0 + except AssertionError: + print ('\nOCCURRED EXCEPTION: ' + str(foundDataSets.size()) + ' DataSets found which Sample match the criteria code \"' + sampleFlowLaneCode + '\" and type \"' + TYPE_DATASET + '\".') for ds in foundDataSets: - print "Index1 for found Data Set" + ds.getDataSetCode() + " " + ds.getPropertyValue('BARCODE') - print "Index2 for found Data Set" + ds.getDataSetCode() + " " + ds.getPropertyValue('INDEX2') - - return foundDataSets - -#-------------------------------------------------------------------------------------------------------------------------------------- - - def getIndexesofDataSetsofSample(sample, DATA_SET_TYPE): - - index1List = [] - index2List = [] - sc = SearchCriteria() - sc.addMatchClause(SearchCriteria.MatchClause.createAttributeMatch(SearchCriteria.MatchClauseAttribute.CODE, sample)); - search_service = transaction.getSearchService() - foundSample = search_service.searchForSamples(sc) - - dataSetSc = SearchCriteria() - dataSetSc.addMatchClause(SearchCriteria.MatchClause.createAttributeMatch(SearchCriteria.MatchClauseAttribute.TYPE, DATA_SET_TYPE)) - dataSetSc.addSubCriteria(SearchSubCriteria.createSampleCriteria(sc)) - foundDataSets = 
search_service.searchForDataSets(dataSetSc) + indexes1.append(ds.getPropertyValue(CODE_INDEX1)) + indexes2.append(ds.getPropertyValue(CODE_INDEX2)) + + return indexes1, indexes2 + + ########################################################## + def searchDataSetsOfSample(sampleFlowLaneCode, index1, index2): + ''' + Search DataSets by corresponding indexes (parts of barcode) of certain FlowLane-Sample in openBIS. + Inputs: + - sampleFlowLaneCode: code of DataSet's Sample + - index1: first index of DataSet + - index2: second index of DataSet + Output: + - foundDataSets: DataSets corresponding to inputs and constant TYPE_DATASET + ''' + ss = transaction.getSearchService() + + scSample = SearchCriteria() + scSample.addMatchClause(SearchCriteria.MatchClause.createAttributeMatch(SearchCriteria.MatchClauseAttribute.CODE, sampleFlowLaneCode)); + #foundSample = ss.searchForSamples(scSample) + + scDataSet = SearchCriteria() + scDataSet.addMatchClause(SearchCriteria.MatchClause.createAttributeMatch(SearchCriteria.MatchClauseAttribute.TYPE, TYPE_DATASET)) + scDataSet.addMatchClause(SearchCriteria.MatchClause.createPropertyMatch(CODE_INDEX1, index1)) + scDataSet.addMatchClause(SearchCriteria.MatchClause.createPropertyMatch(CODE_INDEX2, index2)) + scDataSet.addSubCriteria(SearchSubCriteria.createSampleCriteria(scSample)) + foundDataSets = ss.searchForDataSets(scDataSet) for ds in foundDataSets: - index1List.append(ds.getPropertyValue('BARCODE')) - index2List.append(ds.getPropertyValue('INDEX2')) - return index1List, index2List - - - flowcell, lanes, numberOfLanes = sampleSearch(name) + print "Index1 of found DataSet " + ds.getDataSetCode() + ": " + ds.getPropertyValue(CODE_INDEX1) + print "Index2 of found DataSet " + ds.getDataSetCode() + ": " + ds.getPropertyValue(CODE_INDEX2) - index1Length = fcPropertiesDict['INDEXREAD'] - index2Length = fcPropertiesDict['INDEXREAD2'] - - for mystat in stat: - laneCode = flowcell[0].getCode() + ":" + mystat.lane - searchIndex1 = 
mystat.index1.upper() - searchIndex2 = mystat.index2.upper() - print '\n' - print mystat - - index1List, index2List = getIndexesofDataSetsofSample(laneCode, FASTQ_DATA_SET_TYPE) - print "Searching for "+ searchIndex1 + " in " + str(index1List) - print "Searching for "+ searchIndex2 + " in " + str(index2List) - - if searchIndex1 not in (NO_INDEX): - if searchIndex1 not in (UNDETERMINED): - if index1Length > 7: - searchIndex1List = [ index1 for index1 in index1List if searchIndex1 in index1] - else: - searchIndex1List = [ index1 for index1 in index1List if searchIndex1 in index1[:-2]] - try: - if len(searchIndex1List) > 1: - if searchIndex1List[0].startswith(searchIndex1): - searchIndex1 = searchIndex1List[0] - else: - searchIndex1 = searchIndex1List[1] - else: - searchIndex1 = searchIndex1List[0] - print searchIndex1 - except: - searchIndex1 = 'MISSING' - else: - searchIndex1 = NO_INDEX - if searchIndex2 not in (NO_INDEX): - if searchIndex2 not in (UNDETERMINED): - if index2Length > 7: - searchIndex2 = [ index2 for index2 in index2List if searchIndex2 in index2] - else: - searchIndex2 = [ index2 for index2 in index2List if searchIndex2 in index2[:-2]] - try: - searchIndex2 = searchIndex2[0] - except: - searchIndex1 = 'MISSING' - else: - searchIndex2 = NO_INDEX - - print "searchIndex1 " + str(searchIndex1) - print "searchIndex2 " + str(searchIndex2) + return foundDataSets - # Search for a data set with those two indices - DataSet = searchDataSetsofSample(laneCode, searchIndex1, searchIndex2, FASTQ_DATA_SET_TYPE) + ########################################################## + def reversecomplement(sequence): + ''' + Reverse sequence and replace each nucleotide by its complement. 
+ Input: + - sequence: sequence of nucleotides + Output: + - reverse_complement_sequence: reversed and complemented sequence + ''' + lookup_table = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G'} + reverse_complement_sequence = '' + for nucleotide in reversed(sequence): + reverse_complement_sequence += lookup_table[nucleotide] + return reverse_complement_sequence + + ########################################################## + + print('\nPROCESS RUNNING '+time.ctime()) + incomingPath = transaction.getIncoming().getPath() + FileGenerator= locate(XML_FILENAME, incomingPath) + xmlfile = FileGenerator.next() + print "File: " + xmlfile + + # Import data of XML file (independent of openBIS data): + JavaClassToProcessXML = read_demultiplex_stats() # this function is implemented as Class in Java: + samplestatisticslist = JavaClassToProcessXML.importXMLdata_and_calculateStatistics(xmlfile) + if len(samplestatisticslist) == 0: + print "\nNo Projects/Samples/Barcodes are contained in XML-file " + xmlfile + "!" + return + + # Prepare links between XML and openBIS w.r.t. 
Samples: + codeSampleFlowCell = samplestatisticslist[0].Flowcell # expect just one equal FlowCell of all imported datasets + codeSampleFlowLane = samplestatisticslist[0].Flowcell + ":1" # expect just one equal FlowLane of all imported datasets + + # Just get information of openBIS about Properties in Sample of FlowCell and corresponding FlowLane: + #propertiesCode, propertiesCodeValue = getInfoSampleProperties(codeSampleFlowCell) + #print "\nCode of Properties in FlowCell Sample "+codeSampleFlowCell+":\n", propertiesCode, "\n", propertiesCodeValue + #propertiesCode, propertiesCodeValue = getInfoSampleProperties(codeSampleFlowLane) + #print "\nCode of Properties in FlowLane Sample "+codeSampleFlowLane+":\n", propertiesCode, "\n", propertiesCodeValue + + # Just get information of openBIS about Properties in one of the DataSets of FlowLane: + #propertiesCode, propertiesCodeValue = getInfoDataSetPropertiesOfSample(codeSampleFlowLane) + #print "\nCode of Properties in one of DataSets in Sample "+codeSampleFlowLane+" having Type "+TYPE_DATASET+":\n", propertiesCode, "\n", propertiesCodeValue + + # Just get information of openBIS about Terms in some Vocabulary: + #vocabularyCode = CODE_INDEX1 # CODE_INDEX1 CODE_INDEX2 + #vocabularyTerms = getInfoVocabularyTerms(vocabularyCode) + #print "\nTerms in Vocabulary "+vocabularyCode+":\n", vocabularyTerms + + # Prepare links between XML and openBIS w.r.t to indexes in DataSet (openBIS): + index1list, index2list = getIndexesOfDataSetsOfSample(codeSampleFlowLane) + propertiesCode, propertiesCodeValue = getInfoSampleProperties(codeSampleFlowCell) + index1length = int(propertiesCodeValue[CODE_INDEX1LENGTH]) + index2length = int(propertiesCodeValue[CODE_INDEX2LENGTH]) + + nprocessedDataSets = 0 + for s in samplestatisticslist: + print "\nContent in XML file:\n", s + print "Connection to openBIS:" + + # Prepare link between XML and openBIS w.r.t to indexes in Barcode (XML): + indexes = s.Barcode.split("-") + if len(indexes) == 1: # 
only first part in Barcode + index1search = indexes[0].upper() + index2search = INDEX_EMPTY + elif len(indexes) == 2: # both parts in Barcode + index1search = indexes[0].upper() + index2search = indexes[1].upper() + else: + index1search = INDEX_EMPTY + index2search = INDEX_EMPTY + + # Set link between XML and openBIS w.r.t to indexes in DataSet (openBIS): + if index1search == INDEX_EMPTY or index1search == INDEX_UNKNOWN: + index1 = INDEX_NO + else: # Hint: just two cases were known about index1length, that is 8 or 6 + if index1length > 7: + index1 = [ index1 for index1 in index1list if index1search == index1 ] + else: # for smaller indexlength, the index is by 1 shorter in XML-file than in openBIS + index1 = [ index1 for index1 in index1list if index1search == index1[:index1length] ] + try: + index1 = index1[0] + except: + print '\nOCCURRED EXCEPTION: First index \"' + index1search + '\" of Barcode in XML file has no corresponding DataSet in openBIS!' + index1 = 'MISSING' + if index2search == INDEX_EMPTY or index2search == INDEX_UNKNOWN: + index2 = INDEX_NO + else: # Hint: just one case was known about index2length, that is 8 + if index2length > 7: # second and larger index must be reversed and complemented in contrast to first or smaller index + index2 = [ index2 for index2 in index2list if reversecomplement(index2search) == index2 ] + else: # second and smaller index is unknown how to handle + index2 = [ index2 for index2 in index2list if reversecomplement(index2search) == index2 ] + try: + index2 = index2[0] + except: + print '\nOCCURRED EXCEPTION: Second index \"' + index2search + '\" of Barcode in XML file has no corresponding DataSet in openBIS!' 
+ index2 = 'MISSING' + + # Get DataSet of openBIS corresponding to Project/Sample/Barcode of XML file: + correspondingDataSet = searchDataSetsOfSample(codeSampleFlowLane, index1, index2) try: - assert DataSet.size() == 1 - except AssertionError: - print (str(DataSet.size()) + ' data sets found which match the criterias: '+ - str(laneCode), searchIndex1, searchIndex2) - continue - - sa = transaction.getDataSetForUpdate(DataSet[0].getDataSetCode()) - sa.setPropertyValue('YIELD_MBASES', str(mystat.pfYieldSum)) - sa.setPropertyValue('RAW_YIELD_MBASES', str(mystat.rawYieldSum)) - sa.setPropertyValue('PERCENTAGE_PASSED_FILTERING',str(mystat.pfPercentage)) - sa.setPropertyValue('PF_READS_SUM',str(mystat.pfReadsSum)) - sa.setPropertyValue('RAW_READS_SUM',str(mystat.rawReadsSum)) - sa.setPropertyValue('PERCENTAGE_RAW_CLUSTERS_PER_LANE', str(mystat.rawPercentageReadsPerLane)) - sa.setPropertyValue('PFYIELDQ30PERCENTAGE', str(mystat.pfYieldQ30Percentage)) - sa.setPropertyValue('PFMEANQUALITYSCORE', str(mystat.pfmeanQualityScore)) - - print "Modified data sets properties of: " + DataSet[0].getDataSetCode() - - print "DONE" + assert correspondingDataSet.size() == 1 + except AssertionError: + print ('\nOCCURRED EXCEPTION: ' + str(correspondingDataSet.size()) + ' DataSets found which Sample match the criteria index1 \"' + str(index1) + '\" and index2 \"' + str(index2) + '\" and code \"' + codeSampleFlowLane + '\" and type \"' + TYPE_DATASET + '\".') + continue + + # Modify Properties of corresponding DataSet: + # (method setPropertyValue requires Strings as Input, but Number format must fit to Properties already defined in openBIS) + ds = transaction.getDataSetForUpdate(correspondingDataSet[0].getDataSetCode()) + ds.setPropertyValue('YIELD_MBASES', str(int(s.Mega_PfYield))) + ds.setPropertyValue('RAW_YIELD_MBASES', str(int(s.Mega_RawYield))) + ds.setPropertyValue('PERCENTAGE_PASSED_FILTERING',str(s.Percentage_PfClusterCount_RawClusterCount)) + 
ds.setPropertyValue('PF_READS_SUM',str(int(s.Sum_PfClusterCount))) # convert first to Integer, then to String + ds.setPropertyValue('RAW_READS_SUM',str(int(s.Sum_RawClusterCount))) # convert first to Integer, then to String + ds.setPropertyValue('PERCENTAGE_RAW_CLUSTERS_PER_LANE', str(s.Percentage_RawClusterCount_AllRawClusterCounts)) + ds.setPropertyValue('PFYIELDQ30PERCENTAGE', str(s.Percentage_PfYieldQ30_PfYield)) + ds.setPropertyValue('PFMEANQUALITYSCORE', str(s.Fraction_PfQualityScoreSum_PfYield)) + print "Properties in DataSet \"" + correspondingDataSet[0].getDataSetCode() + "\" are modified." + nprocessedDataSets += 1 + + print "\n", nprocessedDataSets, " openBIS-DataSets were processed." + print len(samplestatisticslist), " XML-Projects/-Samples/-Barcodes were processed." + print("PROCESS DONE "+time.ctime()) diff --git a/deep_sequencing_unit/sourceTest/core-plugins/illumina-qgf/1/dss/drop-boxes/read-demultiplex-stats-nextseq/read_demultiplex_stats.java b/deep_sequencing_unit/sourceTest/core-plugins/illumina-qgf/1/dss/drop-boxes/read-demultiplex-stats-nextseq/read_demultiplex_stats.java new file mode 100644 index 0000000000000000000000000000000000000000..818ca97b5112064a3d590f91af05025c7adfd2e9 --- /dev/null +++ b/deep_sequencing_unit/sourceTest/core-plugins/illumina-qgf/1/dss/drop-boxes/read-demultiplex-stats-nextseq/read_demultiplex_stats.java @@ -0,0 +1,526 @@ +/* +@copyright: 2012 ETH Zuerich, CISD + +@license: +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+ +@author: Fabian Gemperle + +@note: This Class is in Jython importable as Java-Library after compiling it. + In compilation 4 Class files arise: + - read_demultiplex_stats.class + - read_demultiplex_stats$SampleItem.class + - read_demultiplex_stats$Sample.class + - read_demultiplex_stats$Statistics.class +*/ + +import java.util.List; +import java.util.ArrayList; +import javax.xml.stream.XMLInputFactory; +import javax.xml.stream.XMLStreamConstants; +import javax.xml.stream.XMLStreamException; +import javax.xml.stream.XMLStreamReader; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.InputStream; + +//########################################################## + +public final class read_demultiplex_stats { + + public List<Statistics> samplestatisticslist = new ArrayList<Statistics>(); + + public read_demultiplex_stats() { + // This Class-Constructor is unnecessary??? + } + + public List<Statistics> importXMLdata_and_calculateStatistics(String XMLfile) { + /*Parse corresponding XML file, put all values into a memory structure, calculate statistics overall samples. 
+ Output: samplestatisticslist (list of Sample-Statistics-Objects) + Input: XMLfile having structure of file ConversionStats.xml (Example of 1.10.2014): + <?xml version="1.0" encoding="utf-8"?> => Assumptions about XML structure: + <Stats> => Element is singlechild + <Flowcell flowcell-id="H0YVKBGXX"> => Element is singlechild + <Project name="BSSE_QGF_23096_H0YVKBGXX_1"> => Element is one of many children, additionally there is summary-element with attribute name="all" + <Sample name="BSSE_QGF_23096_H0YVKBGXX_1_PZ27_PZ33_CelSEQ_"> => Element is singlechild except second summary-element with attribute name="all" + <Barcode name="unknown"> => Element is singlechild except second summary-element with attribute name="all" + <Lane number="1"> => Element is one of several children + <Tile number="11101"> => Element is one of many children + <Raw> => Element is singlechild + <ClusterCount>328653</ClusterCount> => Element is singlechild + <Read number="1"> => Element is one of several children + <Yield>24977628</Yield> => Element is singlechild + <YieldQ30>16162292</YieldQ30> => Element is singlechild + <QualityScoreSum>703070796</QualityScoreSum> => Element is singlechild + </Read> + <Read number="2"> => Element is one of several children + <Yield>24977628</Yield> => Element is singlechild + <YieldQ30>16233173</YieldQ30> => Element is singlechild + <QualityScoreSum>699507245</QualityScoreSum> => Element is singlechild + </Read> + </Raw> + <Pf> => Element is singlechild + <ClusterCount>302121</ClusterCount> => Element is singlechild + <Read number="1"> => Element is one of several children + <Yield>22961196</Yield> => Element is singlechild + <YieldQ30>15842531</YieldQ30> => Element is singlechild + <QualityScoreSum>686898532</QualityScoreSum> => Element is singlechild + </Read> + <Read number="2"> => Element is one of several children + <Yield>22961196</Yield> => Element is singlechild + <YieldQ30>16233173</YieldQ30> => Element is singlechild + 
<QualityScoreSum>699507245</QualityScoreSum> => Element is singlechild + </Read> + </Pf> + </Tile> + <Tile number="11102"> + [...] + */ + + try { + + // temporary variables (frequently changing during XML-read-in): + String errormessage = ""; + int event = 0; + int skip = 0; + String curflowcellname = ""; + String curprojectname = ""; + String cursamplename = ""; + String curbarcodename = ""; + double curlanenumber = Double.NaN; // obvious double, but could be turned into int + double curtilenumber = Double.NaN; // obvious double, but could be turned into int + Sample cursample = null; + Statistics curstatistics = null; + SampleItem currawitem = null; + SampleItem curpfitem = null; + + InputStream xmlfile = new FileInputStream(XMLfile); + XMLInputFactory xmlfactory = XMLInputFactory.newInstance(); + XMLStreamReader xmlparser = xmlfactory.createXMLStreamReader(xmlfile); + + // Start-Tag "Stats": + event = xmlparser.nextTag(); // Assumption: just white space or comments are aside explicit start-tag + if (event != XMLStreamConstants.START_ELEMENT || !xmlparser.getLocalName().equals("Stats")) { + errormessage = "STRANGE ERROR IN METHOD importXMLdata_and_calculateStatistics WHEN READING IN XMLFILE. 
=> CHECK CODE AND XMLFILE-STRUCTURE!"; + //System.out.println(errormessage); + throw new Exception(errormessage); + } + //System.out.println("\nStart-Element with tag " + xmlparser.getLocalName() + " with " + xmlparser.getAttributeCount() + " attributes with first attribute: " + xmlparser.getAttributeLocalName(0) +" = " + xmlparser.getAttributeValue(0)); + + // Loop over potential Tags "Flowcell": + event = xmlparser.nextTag(); // Assumption: just white spaces or comments are aside start- or end-tag + List<Statistics> samplestatistics = new ArrayList<Statistics>(); + boolean doimport = true; + while (doimport) { + + // concerning tag Flowcell: + if (event == XMLStreamConstants.START_ELEMENT && xmlparser.getLocalName().equals("Flowcell")) { + //System.out.println("\nStart-Element with tag " + xmlparser.getLocalName() + " with " + xmlparser.getAttributeCount() + " attributes with first attribute: " + xmlparser.getAttributeLocalName(0) +" = " + xmlparser.getAttributeValue(0)); + curflowcellname = xmlparser.getAttributeValue(0); // Assumption: Flowcell-attribute flowcell-id is just string + event = xmlparser.nextTag(); + } else if (event == XMLStreamConstants.END_ELEMENT && xmlparser.getLocalName().equals("Flowcell")) { + curflowcellname = ""; + event = xmlparser.nextTag(); + + // concerning tag Project: + } else if (event == XMLStreamConstants.START_ELEMENT && xmlparser.getLocalName().equals("Project")) { + //System.out.println("\nStart-Element with tag " + xmlparser.getLocalName() + " with " + xmlparser.getAttributeCount() + " attributes with first attribute: " + xmlparser.getAttributeLocalName(0) +" = " + xmlparser.getAttributeValue(0)); + if (xmlparser.getAttributeValue(0).equals("all")) { + // skip the current XML element and all of its following subelements: + skip = 1; + while (skip > 0) { + event = xmlparser.next(); + switch (event) { + case XMLStreamConstants.END_ELEMENT: skip -= 1; + break; // break-command after each case is necessary in switch-statement + 
case XMLStreamConstants.START_ELEMENT: skip += 1; + break; // break-command after each case is necessary in switch-statement + default: skip += 0; // text elements, spaces, ... + break; // break-command after each case is necessary in switch-statement + } + } + } else { + curprojectname = xmlparser.getAttributeValue(0); // Assumption: Project-attribute name is just string + } + event = xmlparser.nextTag(); + } else if (event == XMLStreamConstants.END_ELEMENT && xmlparser.getLocalName().equals("Project")) { + curprojectname = ""; + event = xmlparser.nextTag(); + + // concerning tag Sample: + } else if (event == XMLStreamConstants.START_ELEMENT && xmlparser.getLocalName().equals("Sample")) { + //System.out.println("\nStart-Element with tag " + xmlparser.getLocalName() + " with " + xmlparser.getAttributeCount() + " attributes with first attribute: " + xmlparser.getAttributeLocalName(0) +" = " + xmlparser.getAttributeValue(0)); + if (xmlparser.getAttributeValue(0).equals("all")) { + // skip the current XML element and all of its following subelements: + skip = 1; + while (skip > 0) { + event = xmlparser.next(); + switch (event) { + case XMLStreamConstants.END_ELEMENT: skip -= 1; + break; // break-command after each case is necessary in switch-statement + case XMLStreamConstants.START_ELEMENT: skip += 1; + break; // break-command after each case is necessary in switch-statement + default: skip += 0; // text elements, spaces, ... 
+ break; // break-command after each case is necessary in switch-statement + } + } + } else { + cursamplename = xmlparser.getAttributeValue(0); // Assumption: Sample-attribute name is just string + } + event = xmlparser.nextTag(); + } else if (event == XMLStreamConstants.END_ELEMENT && xmlparser.getLocalName().equals("Sample")) { + cursamplename = ""; + event = xmlparser.nextTag(); + + // concerning tag Barcode (which is as well the start/end of Project-/Sample-Entry): + } else if (event == XMLStreamConstants.START_ELEMENT && xmlparser.getLocalName().equals("Barcode")) { + //System.out.println("\nStart-Element with tag " + xmlparser.getLocalName() + " with " + xmlparser.getAttributeCount() + " attributes with first attribute: " + xmlparser.getAttributeLocalName(0) +" = " + xmlparser.getAttributeValue(0)); + if (xmlparser.getAttributeValue(0).equals("all")) { + // skip the current XML element and all of its following subelements: + skip = 1; + while (skip > 0) { + event = xmlparser.next(); + switch (event) { + case XMLStreamConstants.END_ELEMENT: skip -= 1; + break; // break-command after each case is necessary in switch-statement + case XMLStreamConstants.START_ELEMENT: skip += 1; + break; // break-command after each case is necessary in switch-statement + default: skip += 0; // text elements, spaces, ... 
+ break; // break-command after each case is necessary in switch-statement + } + } + } else { + curbarcodename = xmlparser.getAttributeValue(0); // Assumption: Barcode-attribute name is just string + cursample = new Sample(); + cursample.Flowcell = curflowcellname; + cursample.Project = curprojectname; + cursample.Sample = cursamplename; + cursample.Barcode = curbarcodename; + } + event = xmlparser.nextTag(); + } else if (event == XMLStreamConstants.END_ELEMENT && xmlparser.getLocalName().equals("Barcode")) { + // Statistics 1st step: calculate individual statistics per sample: + curstatistics = new Statistics(cursample); + samplestatisticslist.add(curstatistics); + cursample = null; + curstatistics = null; + curbarcodename = ""; + event = xmlparser.nextTag(); + + // concerning Lane: + } else if (event == XMLStreamConstants.START_ELEMENT && xmlparser.getLocalName().equals("Lane")) { + //System.out.println("\nStart-Element with tag " + xmlparser.getLocalName() + " with " + xmlparser.getAttributeCount() + " attributes with first attribute: " + xmlparser.getAttributeLocalName(0) +" = " + xmlparser.getAttributeValue(0)); + curlanenumber = Double.parseDouble(xmlparser.getAttributeValue(0)); // Assumption:Lane-attribute number is always numeric + event = xmlparser.nextTag(); + } else if (event == XMLStreamConstants.END_ELEMENT && xmlparser.getLocalName().equals("Lane")) { + curlanenumber = Double.NaN; + event = xmlparser.nextTag(); + + // concerning Tile with all its sub-elements: + } else if (event == XMLStreamConstants.START_ELEMENT && xmlparser.getLocalName().equals("Tile")) { + //System.out.println("\nStart-Element with tag " + xmlparser.getLocalName() + " with " + xmlparser.getAttributeCount() + " attributes with first attribute: " + xmlparser.getAttributeLocalName(0) +" = " + xmlparser.getAttributeValue(0)); + curtilenumber = Double.parseDouble(xmlparser.getAttributeValue(0)); // Assumption: Tile-attribute number is always numeric + // concerning Raw with 
Assumption: Raw-element is singlechild: + xmlparser.nextTag(); + //System.out.println("\nStart-Element with tag " + xmlparser.getLocalName()); + currawitem = new SampleItem(); + currawitem.Type = "Raw"; + currawitem.Lane = curlanenumber; + currawitem.Tile = curtilenumber; + xmlparser.nextTag(); + currawitem.ClusterCount = Double.parseDouble(xmlparser.getElementText()); // Assumption: ClusterCount-element is numeric singlechild + //System.out.println("\nValue: ClusterCount=" + currawitem.ClusterCount); + xmlparser.nextTag(); + while (event == XMLStreamConstants.START_ELEMENT && xmlparser.getLocalName().equals("Read")) { // Assumption: at least or more than 1 Read-element + //System.out.println("\nStart-Element with tag " + xmlparser.getLocalName() + " with " + xmlparser.getAttributeCount() + " attributes with first attribute: " + xmlparser.getAttributeLocalName(0) +" = " + xmlparser.getAttributeValue(0)); + xmlparser.nextTag(); + currawitem.YieldList.add(Double.parseDouble(xmlparser.getElementText())); // Assumption: Yield-element is numeric singlechild + xmlparser.nextTag(); + currawitem.YieldQ30List.add(Double.parseDouble(xmlparser.getElementText())); // Assumption: YieldQ30List-element is numeric singlechild + xmlparser.nextTag(); + currawitem.QualityScoreSumList.add(Double.parseDouble(xmlparser.getElementText())); // Assumption: QualityScoreSumList-element is numeric singlechild + xmlparser.nextTag(); + xmlparser.nextTag(); + //System.out.println("Values in Read: Yield=" + currawitem.YieldList.get(currawitem.YieldList.size()-1) + ", YieldQ30=" + currawitem.YieldQ30List.get(currawitem.YieldQ30List.size()-1) + ", QualityScoreSum=" + currawitem.QualityScoreSumList.get(currawitem.QualityScoreSumList.size()-1)); + } + //System.out.println("\nRaw-SampleItem " + currawitem); + cursample.RawList.add(currawitem); + currawitem = null; + // concerning Pf with Assumption that entire Pf-element is structured same as Raw: + xmlparser.nextTag(); + 
//System.out.println("\nStart-Element with tag " + xmlparser.getLocalName()); + curpfitem = new SampleItem(); + curpfitem.Type = "Pf"; + curpfitem.Lane = curlanenumber; + curpfitem.Tile = curtilenumber; + xmlparser.nextTag(); + curpfitem.ClusterCount = Double.parseDouble(xmlparser.getElementText()); + //System.out.println("\nValue: ClusterCount=" + curpfitem.ClusterCount); + xmlparser.nextTag(); + while (event == XMLStreamConstants.START_ELEMENT && xmlparser.getLocalName().equals("Read")) { + //System.out.println("\nStart-Element with tag " + xmlparser.getLocalName() + " with " + xmlparser.getAttributeCount() + " attributes with first attribute: " + xmlparser.getAttributeLocalName(0) +" = " + xmlparser.getAttributeValue(0)); + xmlparser.nextTag(); + curpfitem.YieldList.add(Double.parseDouble(xmlparser.getElementText())); + xmlparser.nextTag(); + curpfitem.YieldQ30List.add(Double.parseDouble(xmlparser.getElementText())); + xmlparser.nextTag(); + curpfitem.QualityScoreSumList.add(Double.parseDouble(xmlparser.getElementText())); + xmlparser.nextTag(); + xmlparser.nextTag(); + //System.out.println("Values in Read: Yield=" + curpfitem.YieldList.get(curpfitem.YieldList.size()-1) + ", YieldQ30=" + curpfitem.YieldQ30List.get(curpfitem.YieldQ30List.size()-1) + ", QualityScoreSum=" + curpfitem.QualityScoreSumList.get(curpfitem.QualityScoreSumList.size()-1)); + } + //System.out.println("\nPf-SampleItem " + curpfitem); + cursample.PfList.add(curpfitem); + curpfitem = null; + // attain end of current Tile and afterwards continue in next Tile/Lane/Barcode/Sample/Project: + event = xmlparser.nextTag(); + } else if (event == XMLStreamConstants.END_ELEMENT && xmlparser.getLocalName().equals("Tile")) { + curtilenumber = Double.NaN; + event = xmlparser.nextTag(); + + // concerning finish of reading in XML or hit upon error due XML content: + } else if (event == XMLStreamConstants.END_ELEMENT && xmlparser.getLocalName().equals("Stats")) { + // this final part of while loop is just for 
analyzing potential errors, but + // could be removed and changed in: while xmlparser.getLocalName() != 'Stats' + doimport = false; + } else { + doimport = false; + errormessage = "STRANGE ERROR IN METHOD importXMLdata_and_calculateStatistics WHEN READING IN XMLFILE. => CHECK CODE AND XMLFILE-STRUCTURE!"; + //System.out.println(errormessage); + throw new Exception(errormessage); + } + + } + xmlparser.close(); + + } catch (FileNotFoundException | XMLStreamException e) { + System.out.println("OCCURRED EXCEPTION " + e.toString()); + e.printStackTrace(); + } catch (IllegalArgumentException e) { + System.out.println("OCCURRED EXCEPTION " + e.toString()); + e.printStackTrace(); + } catch (Exception e) { // catch any other exception + System.out.println("OCCURRED EXCEPTION " + e.toString()); + e.printStackTrace(); + } finally { // anyway adapt statistics and return output (It would be empty list due to constructor of this object) + // Statistics 2nd step: adapt single sample's statistics including entire list of samples + for (Statistics s : samplestatisticslist) { + s.adaptStatisticsWithRespectToAllSamples(samplestatisticslist); + } + return samplestatisticslist; + } + + } + +//########################################################## + + public class SampleItem { + /* + Object of an item in sample including + - the corresponding type Raw or Pf + - the index of corresponding Lane, Tile + - measured value of ClusterCount and values in Lists (w.r.t. Read) of Yield, YieldQ30, QualityScoreSum + */ + + public String Type = ""; + // Define unknown numerical value. Type double could be exchanged with int, but double is necessary to initialize NaN-value. + public double Lane = Double.NaN; + public double Tile = Double.NaN; + public double ClusterCount = Double.NaN; + // Define unknown ArrayList of numerical values with changeable List size. Type Double(object) instead of double(primitive) is necessary in Lists. 
+ public List<Double> YieldList = new ArrayList<Double>(); + public List<Double> YieldQ30List = new ArrayList<Double>(); + public List<Double> QualityScoreSumList = new ArrayList<Double>(); + + public String toString() { + return "Type: " + this.Type + ", Lane: " + (long)this.Lane + ", Tile: " + (long)this.Tile + + ", ClusterCount: " + (long)this.ClusterCount + ", YieldList: " + this.YieldList + + ", YieldQ30List: " + this.YieldQ30List + ", QualityScoreSumList: " + this.QualityScoreSumList; + } + + } + +//########################################################## + + public class Sample { + /* + Object of an entire sample including + - the name of Flowcell, Project, Sample, Barcode + - the list of Raw and Pf SampleItem-Objects + */ + + public String Flowcell = ""; + public String Project = ""; + public String Sample = ""; + public String Barcode = ""; + // Define unknown ArrayList of SampleItems: + public List<SampleItem> RawList = new ArrayList<SampleItem>(); + public List<SampleItem> PfList = new ArrayList<SampleItem>(); + + public String toString() { + return "Flowcell: " + this.Flowcell + ", Project: " + this.Project + ", Sample: " + this.Sample + ", Barcode: " + this.Barcode + + ", RawList: " + this.RawList + ", PfList: " + this.PfList; + } + + } + +//########################################################## + + public class Statistics extends Sample { + /* + Object of Statistics within one single sample inherited from Sample-Object + */ + + public double Sum_RawClusterCount = Double.NaN; // obvious double, but could be turned into int + public double Sum_PfClusterCount = Double.NaN; // obvious double, but could be turned into int + public double Sum_RawYield = Double.NaN; + public double Sum_PfYield = Double.NaN; + public double Sum_RawYieldQ30 = Double.NaN; + public double Sum_PfYieldQ30 = Double.NaN; + public double Sum_RawQualityScoreSum = Double.NaN; + public double Sum_PfQualityScoreSum = Double.NaN; + public double Mega_RawYield = Double.NaN; // 
obvious double, but could be turned into int + public double Mega_PfYield = Double.NaN; // obvious double, but could be turned into int + public double Percentage_PfYield_RawYield = Double.NaN; + public double Percentage_PfYieldQ30_PfYield = Double.NaN; + public double Fraction_PfQualityScoreSum_PfYield = Double.NaN; + public double Percentage_PfClusterCount_RawClusterCount = Double.NaN; + public double Percentage_RawClusterCount_AllRawClusterCounts = Double.NaN; + + public Statistics(Sample sample) { + /* + Constructor of derived class + Initialization: Already initialized Sample-Object is necessary argument. + */ + + super(); + Flowcell = sample.Flowcell; + Project = sample.Project; + Sample = sample.Sample; + Barcode = sample.Barcode; + RawList = sample.RawList; + PfList = sample.PfList; + + if (RawList.size()>0) Sum_RawClusterCount = 0; + for (SampleItem s : RawList) Sum_RawClusterCount += s.ClusterCount; + if (PfList.size()>0) Sum_PfClusterCount = 0; + for (SampleItem s : PfList) Sum_PfClusterCount += s.ClusterCount; + if (RawList.size()>0) Sum_RawYield = 0; + for (SampleItem s : RawList) for (double d : s.YieldList) Sum_RawYield += d; + if (PfList.size()>0) Sum_PfYield = 0; + for (SampleItem s : PfList) for (double d : s.YieldList) Sum_PfYield += d; + if (RawList.size()>0) Sum_RawYieldQ30 = 0; + for (SampleItem s : RawList) for (double d : s.YieldQ30List) Sum_RawYieldQ30 += d; + if (PfList.size()>0) Sum_PfYieldQ30 = 0; + for (SampleItem s : PfList) for (double d : s.YieldQ30List) Sum_PfYieldQ30 += d; + if (RawList.size()>0) Sum_RawQualityScoreSum = 0; + for (SampleItem s : RawList) for (double d : s.QualityScoreSumList) Sum_RawQualityScoreSum += d; + if (PfList.size()>0) Sum_PfQualityScoreSum = 0; + for (SampleItem s : PfList) for (double d : s.QualityScoreSumList) Sum_PfQualityScoreSum += d; +// Mega_RawYield = calculate_MegaUnit(Sum_RawYield); +// Mega_PfYield = calculate_MegaUnit(Sum_PfYield); + + Mega_RawYield = Sum_RawYield; + Mega_PfYield = Sum_PfYield; 
+ + Percentage_PfYield_RawYield = calculate_Percentage(Sum_PfYield,Sum_RawYield); + Percentage_PfYieldQ30_PfYield = calculate_Percentage(Sum_PfYieldQ30,Sum_PfYield); + Fraction_PfQualityScoreSum_PfYield = calculate_Fraction(Sum_PfQualityScoreSum,Sum_PfYield); + Percentage_PfClusterCount_RawClusterCount = calculate_Percentage(Sum_PfClusterCount,Sum_RawClusterCount); + // Calculation of attribute "Percentage_RawClusterCount_AllRawClusterCounts" needs statistics of all other included samples. => After initializing this object, apply method: adaptStatisticsWithRespectToAllSamples(statisticslist) + } + + public String toString() { + return "Flowcell: " + this.Flowcell + ", Project: " + this.Project + ", Sample: " + this.Sample + ", Barcode: " + this.Barcode + + ", Raw Clusters: " + (long)this.Sum_RawClusterCount + ", Mbases Raw Yield: " + this.Mega_RawYield + + ", % Raw Clusters overall: " + this.Percentage_RawClusterCount_AllRawClusterCounts + + ", Pf Clusters: " + (long)this.Sum_PfClusterCount + ", Mbases Pf Yield: " + this.Mega_PfYield + + ", % PfYield/RawYield: " + this.Percentage_PfYield_RawYield + + ", % PfYieldQ30/PfYield: " + this.Percentage_PfYieldQ30_PfYield + + ", Mean Pf Quality Score: " + this.Fraction_PfQualityScoreSum_PfYield + + ", % Passes Filtering: " + this.Percentage_PfClusterCount_RawClusterCount; + } + + public void adaptStatisticsWithRespectToAllSamples(List<Statistics> statisticslist) { + /* + This Statistics-Object (corresponding to one sample) is adapted by employing a list of + Statistics-Objects corresponding to all samples influencing the statistics of single sample. + Input: statisticslist contains all Statistics-Objects + */ + if (statisticslist.size()==0) { // here it additionally should be checked, if calling object is included ... 
+ Percentage_RawClusterCount_AllRawClusterCounts = Double.NaN; + String errormessage = "INPUT ARGUMENT statisticslist MUST BE LIST OF Statistics OBJECTS INCLUDING THE CALLING OBJECT IN METHOD adaptStatisticsWithRespectToAllSamples!"; + //System.out.println(errormessage); + throw new IllegalArgumentException(errormessage); // Exception reasonable since otherwise wrong results. + } else { + double sum_allrawclustercounts = 0; + for (Statistics s : statisticslist) sum_allrawclustercounts += s.Sum_RawClusterCount; + Percentage_RawClusterCount_AllRawClusterCounts = calculate_Percentage(Sum_RawClusterCount, sum_allrawclustercounts); + } + } + + public double calculate_MegaUnit(double x) { + double z; + if (x == Double.POSITIVE_INFINITY || x == Double.NEGATIVE_INFINITY || Double.isNaN(x)) { + z = Double.NaN; + String errormessage = "INPUT ARGUMENT WAS UNREASONABLE IN METHOD calculate_MegaUnit! x = " + x + " (Values were in Sample " + Sample + ")"; + //System.out.println(errormessage); + //throw new IllegalArgumentException(errormessage); + } else { + z = x/1000000; + } + return z; + } + + public double calculate_Percentage(double x, double y) { + double z = x/y; + if (z == Double.POSITIVE_INFINITY || z == Double.NEGATIVE_INFINITY || Double.isNaN(z)) { + z = Double.NaN; + String errormessage = "INPUT ARGUMENT WAS UNREASONABLE IN METHOD calculate_Percentage! x = " + x + ", y = " + y + " (Values were in Sample " + Sample + ")"; + //System.out.println(errormessage); + //throw new IllegalArgumentException(errormessage); + } else { + z = 100*z; + } + return z; + } + + public double calculate_Fraction(double x, double y) { + double z = x/y; + if (z == Double.POSITIVE_INFINITY || z == Double.NEGATIVE_INFINITY || Double.isNaN(z)) { + z = Double.NaN; + String errormessage = "INPUT ARGUMENT WAS UNREASONABLE IN METHOD calculate_Fraction! 
x = " + x + ", y = " + y + " (Values were in Sample " + Sample + ")"; + //System.out.println(errormessage); + //throw new IllegalArgumentException(errormessage); + } + return z; + } + + public double roundspecific(double x, int places) { + double z; + if (places < 0 || places > 13 || x == Double.POSITIVE_INFINITY || x == Double.NEGATIVE_INFINITY || Double.isNaN(x)) { + z = Double.NaN; + String errormessage = "INPUT ARGUMENT WAS UNREASONABLE IN METHOD roundspecific! x = " + x + ", places = " + places + " (Values were in Sample " + Sample + ")"; + //System.out.println(errormessage); + //throw new IllegalArgumentException(errormessage); + } else { + double factor = Math.pow(10, places); + z = Math.round(x * factor); + z = z / factor; + } + return z; + } + + } + +} \ No newline at end of file diff --git a/deep_sequencing_unit/sourceTest/core-plugins/illumina-qgf/1/dss/drop-boxes/read-demultiplex-stats/read-demultiplex-stats.py b/deep_sequencing_unit/sourceTest/core-plugins/illumina-qgf/1/dss/drop-boxes/read-demultiplex-stats/read-demultiplex-stats.py index b6c0cc3a544373d46611b170ef965187596ca02a..3e86dfcf7a4d2d4d16a3cb805be00f2b075dac2f 100755 --- a/deep_sequencing_unit/sourceTest/core-plugins/illumina-qgf/1/dss/drop-boxes/read-demultiplex-stats/read-demultiplex-stats.py +++ b/deep_sequencing_unit/sourceTest/core-plugins/illumina-qgf/1/dss/drop-boxes/read-demultiplex-stats/read-demultiplex-stats.py @@ -499,7 +499,16 @@ def process(transaction): else: searchIndex1 = [ index1 for index1 in index1List if searchIndex1 in index1[:-1]] try: - searchIndex1 = searchIndex1[0] + if len(searchIndex1) > 1: + print("AMBIGIOUS INDEX FOUND!") + print(searchIndex1) + if searchIndex1[0].startswith(mystat.index1.upper()): + searchIndex1 = searchIndex1[0] + else: + searchIndex1 = searchIndex1[1] + else: + searchIndex1 = searchIndex1[0] + except: searchIndex1 = 'MISSING' else: diff --git 
a/deep_sequencing_unit/sourceTest/core-plugins/illumina-qgf/1/dss/drop-boxes/register-cluster-alignment-java/plugin.properties b/deep_sequencing_unit/sourceTest/core-plugins/illumina-qgf/1/dss/drop-boxes/register-cluster-alignment-java/plugin.properties index 35b2b5253d07f4bbba318804faa7f1534f691fca..f3d4bd413ad00de1ccf2b09a23f71282dcab5d7e 100644 --- a/deep_sequencing_unit/sourceTest/core-plugins/illumina-qgf/1/dss/drop-boxes/register-cluster-alignment-java/plugin.properties +++ b/deep_sequencing_unit/sourceTest/core-plugins/illumina-qgf/1/dss/drop-boxes/register-cluster-alignment-java/plugin.properties @@ -3,5 +3,5 @@ top-level-data-set-handler = ch.systemsx.cisd.etlserver.registrator.api.v2.JavaT program-class = ch.systemsx.cisd.etlserver.registrator.api.v2.AlignmentJavaDataSetRegistrationDropboxV2 #program-class = AlignmentJavaDataSetRegistrationDropboxV2 storage-processor = ch.systemsx.cisd.etlserver.DefaultStorageProcessor -incoming-dir-create = true +incoming-dir-create = false incoming-data-completeness-condition = auto-detection diff --git a/deep_sequencing_unit/sourceTest/core-plugins/illumina-qgf/1/dss/drop-boxes/register-dummy/plugin.properties b/deep_sequencing_unit/sourceTest/core-plugins/illumina-qgf/1/dss/drop-boxes/register-dummy/plugin.properties new file mode 100644 index 0000000000000000000000000000000000000000..ad9cc236b385ad8c1a9d8d8119d276f3d428bdae --- /dev/null +++ b/deep_sequencing_unit/sourceTest/core-plugins/illumina-qgf/1/dss/drop-boxes/register-dummy/plugin.properties @@ -0,0 +1,11 @@ +# +# Drop box for registering a flow cell output as a data set +# +# Variables: +# incoming-root-dir +# Path to the directory which contains incoming directories for drop boxes. 
+incoming-dir = ${incoming-root-dir}/register-dummy +incoming-data-completeness-condition = marker-file +top-level-data-set-handler = ch.systemsx.cisd.etlserver.registrator.api.v2.JythonTopLevelDataSetHandlerV2 +script-path = register-dummy.py +storage-processor = ch.systemsx.cisd.etlserver.DefaultStorageProcessor diff --git a/deep_sequencing_unit/sourceTest/core-plugins/illumina-qgf/1/dss/drop-boxes/register-dummy/register-dummy.py b/deep_sequencing_unit/sourceTest/core-plugins/illumina-qgf/1/dss/drop-boxes/register-dummy/register-dummy.py new file mode 100644 index 0000000000000000000000000000000000000000..01a2eec1e4fe3fd271f352be60b1b43211f84d29 --- /dev/null +++ b/deep_sequencing_unit/sourceTest/core-plugins/illumina-qgf/1/dss/drop-boxes/register-dummy/register-dummy.py @@ -0,0 +1,56 @@ +''' +@copyright: +2014 ETH Zuerich, CISD + +@license: +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+ +@author: +Manuel Kohler +''' + +import os +import shutil +from ch.systemsx.cisd.openbis.generic.shared.api.v1.dto import SearchCriteria +from ch.systemsx.cisd.openbis.generic.shared.api.v1.dto import SearchSubCriteria + + +def splitFolderName (folderName): + runFolder, lane = folderName.rsplit("_", 1) + return runFolder + ":" + lane + +def searchSample (transaction, sampleCode): + # Get the search service + search_service = transaction.getSearchService() + + print("Searching for " + sampleCode) + # Search for the sample + sc = SearchCriteria() + sc.addMatchClause(SearchCriteria.MatchClause.createAttributeMatch(SearchCriteria.MatchClauseAttribute.CODE, sampleCode)) + foundSamples = search_service.searchForSamples(sc) + assert(len(foundSamples), 1) + return foundSamples[0] + +def process(transaction): + + # expected incoming folder names: + # 120917_SN792_0158_AC125KACXX_4 + + incomingFolder = transaction.getIncoming().getName() + flowLaneName = splitFolderName(incomingFolder) + flowLane = searchSample(transaction, flowLaneName) + + dataSet = transaction.createNewDataSet("ALIGNMENT") + dataSet.setMeasuredData(False) + dataSet.setSample(flowLane) + transaction.moveFile(transaction.getIncoming().getPath(), dataSet) diff --git a/deep_sequencing_unit/sourceTest/core-plugins/illumina-qgf/1/dss/drop-boxes/register-lane-hiseq/lib/crc32 b/deep_sequencing_unit/sourceTest/core-plugins/illumina-qgf/1/dss/drop-boxes/register-lane-hiseq/lib/crc32 index ffaf277227c6dfb0dae3fbe751d9db158bd5ae29..7959ca94a1fa61e47c2e4df815bb35fceb7bd809 100755 Binary files a/deep_sequencing_unit/sourceTest/core-plugins/illumina-qgf/1/dss/drop-boxes/register-lane-hiseq/lib/crc32 and b/deep_sequencing_unit/sourceTest/core-plugins/illumina-qgf/1/dss/drop-boxes/register-lane-hiseq/lib/crc32 differ diff --git a/deep_sequencing_unit/sourceTest/core-plugins/illumina-qgf/1/dss/drop-boxes/register-lane-miseq/lib/crc32 
b/deep_sequencing_unit/sourceTest/core-plugins/illumina-qgf/1/dss/drop-boxes/register-lane-miseq/lib/crc32 index ffaf277227c6dfb0dae3fbe751d9db158bd5ae29..7959ca94a1fa61e47c2e4df815bb35fceb7bd809 100755 Binary files a/deep_sequencing_unit/sourceTest/core-plugins/illumina-qgf/1/dss/drop-boxes/register-lane-miseq/lib/crc32 and b/deep_sequencing_unit/sourceTest/core-plugins/illumina-qgf/1/dss/drop-boxes/register-lane-miseq/lib/crc32 differ diff --git a/deep_sequencing_unit/sourceTest/core-plugins/illumina-qgf/1/dss/drop-boxes/register-lane-miseq/register-lane-miseq.py b/deep_sequencing_unit/sourceTest/core-plugins/illumina-qgf/1/dss/drop-boxes/register-lane-miseq/register-lane-miseq.py index 56d96799ca62d3b247acc8771633d2a394beed6e..985022f203b2e273160a28c76275b317056c477c 100644 --- a/deep_sequencing_unit/sourceTest/core-plugins/illumina-qgf/1/dss/drop-boxes/register-lane-miseq/register-lane-miseq.py +++ b/deep_sequencing_unit/sourceTest/core-plugins/illumina-qgf/1/dss/drop-boxes/register-lane-miseq/register-lane-miseq.py @@ -25,8 +25,8 @@ AFFILIATION_PROPERTY_NAME='AFFILIATION' INDEX1='BARCODE' INDEX2='INDEX2' EXTERNAL_SAMPLE_NAME='EXTERNAL_SAMPLE_NAME' + DEFAULT_INDEX='NoIndex' -CRC32_PATH='lib/crc32' # ------------------------------------------------------------------------------- diff --git a/deep_sequencing_unit/sourceTest/core-plugins/illumina-qgf/1/dss/drop-boxes/register-lane-nextseq/lib/crc32 b/deep_sequencing_unit/sourceTest/core-plugins/illumina-qgf/1/dss/drop-boxes/register-lane-nextseq/lib/crc32 index ffaf277227c6dfb0dae3fbe751d9db158bd5ae29..7959ca94a1fa61e47c2e4df815bb35fceb7bd809 100755 Binary files a/deep_sequencing_unit/sourceTest/core-plugins/illumina-qgf/1/dss/drop-boxes/register-lane-nextseq/lib/crc32 and b/deep_sequencing_unit/sourceTest/core-plugins/illumina-qgf/1/dss/drop-boxes/register-lane-nextseq/lib/crc32 differ diff --git a/deep_sequencing_unit/sourceTest/core-plugins/illumina-qgf/1/dss/drop-boxes/register-lane-nextseq/lib/crc32_v2.c 
b/deep_sequencing_unit/sourceTest/core-plugins/illumina-qgf/1/dss/drop-boxes/register-lane-nextseq/lib/crc32_v2.c new file mode 100644 index 0000000000000000000000000000000000000000..b38d22d40f641254661994d4cc01492bb7c8d35a --- /dev/null +++ b/deep_sequencing_unit/sourceTest/core-plugins/illumina-qgf/1/dss/drop-boxes/register-lane-nextseq/lib/crc32_v2.c @@ -0,0 +1,206 @@ +/*----------------------------------------------------------------------------*\ + * CRC-32 version 2.0.0 by Craig Bruce, 2006-04-29. + * + * This program generates the CRC-32 values for the files named in the + * command-line arguments. These are the same CRC-32 values used by GZIP, + * PKZIP, and ZMODEM. The Crc32_ComputeBuf() can also be detached and + * used independently. + * + * THIS PROGRAM IS PUBLIC-DOMAIN SOFTWARE. + * + * Based on the byte-oriented implementation "File Verification Using CRC" + * by Mark R. Nelson in Dr. Dobb's Journal, May 1992, pp. 64-67. + * + * v1.0.0: original release. + * v1.0.1: fixed printf formats. + * v1.0.2: fixed something else. + * v1.0.3: replaced CRC constant table by generator function. + * v1.0.4: reformatted code, made ANSI C. 1994-12-05. + * v2.0.0: rewrote to use memory buffer & static table, 2006-04-29. +\*----------------------------------------------------------------------------*/ + +#include <stdio.h> +#include <stdlib.h> + +/*----------------------------------------------------------------------------*\ + * Local functions +\*----------------------------------------------------------------------------*/ + +static int Crc32_ComputeFile( FILE *file, unsigned long *outCrc32 ); + +static unsigned long Crc32_ComputeBuf( unsigned long inCrc32, const void *buf, + size_t bufLen ); + +/*----------------------------------------------------------------------------*\ + * NAME: + * main() - main function for CRC-32 generation + * DESCRIPTION: + * Computes the CRC-32 value for the set of files named in the command- + * line arguments. 
+ * ARGUMENTS: + * argc - command-line-argument count + * argv - command-line-argument strings + * RETURNS: + * err - 0 on success or executes exit(1) on error + * ERRORS: + * - file errors +\*----------------------------------------------------------------------------*/ + +int main( int argc, const char *argv[] ) +{ + FILE *file = NULL; + const char *filename; + unsigned long argIdx; + unsigned long crc32; + int err; + + /** compute crcs **/ + if (argc < 2) { + /** read from 'stdin' if no arguments given **/ + err = Crc32_ComputeFile( stdin, &crc32 ); + if (err == -1) goto ERR_EXIT; + printf("crc32 = 0x%08lX for (stdin)\n", crc32 ); + } else { + /** report named files in sequence **/ + for (argIdx=1; argIdx < argc; argIdx++) { + filename = argv[argIdx]; + file = fopen( filename, "rb" ); + if (file == NULL) { + fprintf( stderr, "error opening file \"%s\"!\n", filename ); + goto ERR_EXIT; + } + err = Crc32_ComputeFile( file, &crc32 ); + if (err == -1) goto ERR_EXIT; + /*printf("crc32 = 0x%08lX for \"%s\"\n", crc32, filename );*/ + printf("%08lX", crc32); + err = fclose( file ); + file = NULL; + if (err == EOF) { + fprintf( stderr, "error closing file \"%s\"!\n", filename ); + goto ERR_EXIT; + } + } + } + return( 0 ); + + /** error exit **/ +ERR_EXIT: + if (file != NULL) fclose( file ); + exit( 1 ); +} + +/*----------------------------------------------------------------------------*\ + * NAME: + * Crc32_ComputeFile() - compute CRC-32 value for a file + * DESCRIPTION: + * Computes the CRC-32 value for an opened file. 
/*----------------------------------------------------------------------------*\
 *  NAME:
 *     Crc32_ComputeFile() - computes the CRC-32 value of a file
 *  DESCRIPTION:
 *     Reads 'file' from its current position in chunks of CRC_BUFFER_SIZE
 *     bytes and folds every chunk into a running CRC-32 value by calling
 *     Crc32_ComputeBuf().  Reading stops at the first short read, which
 *     fread() only produces at end-of-file or on a read error.
 *  ARGUMENTS:
 *     file     - file pointer
 *     outCrc32 - (out) result CRC-32 value
 *  RETURNS:
 *     err - 0 on success or -1 on error
 *  ERRORS:
 *     - file errors (a message is written to stderr)
 *     - 'file' or 'outCrc32' is NULL (nothing is read or written)
\*----------------------------------------------------------------------------*/

/* Crc32_ComputeBuf() is defined after its first use below; repeating a
 * compatible declaration is harmless if one already exists earlier. */
static unsigned long Crc32_ComputeBuf( unsigned long inCrc32, const void *buf,
                                       size_t bufLen );

static int Crc32_ComputeFile( FILE *file, unsigned long *outCrc32 )
{
#   define CRC_BUFFER_SIZE 8192
    unsigned char buf[CRC_BUFFER_SIZE];
    size_t bufLen;

    /** reject missing arguments instead of dereferencing them **/
    if (file == NULL || outCrc32 == NULL) {
        return( -1 );
    }

    /** accumulate crc32 from file **/
    *outCrc32 = 0;
    do {
        bufLen = fread( buf, 1, CRC_BUFFER_SIZE, file );
        /* a zero-length chunk leaves the accumulated value unchanged */
        *outCrc32 = Crc32_ComputeBuf( *outCrc32, buf, bufLen );
    } while (bufLen == CRC_BUFFER_SIZE);

    /** a short read is either end-of-file or an error; tell them apart **/
    if (ferror(file)) {
        fprintf( stderr, "error reading file\n" );
        return( -1 );
    }
    return( 0 );
}

/*----------------------------------------------------------------------------*\
 *  NAME:
 *     Crc32_ComputeBuf() - computes the CRC-32 value of a memory buffer
 *  DESCRIPTION:
 *     Computes or accumulates the CRC-32 value for a memory buffer.
 *     The 'inCrc32' gives a previously accumulated CRC-32 value to allow
 *     a CRC to be generated for multiple sequential buffer-fuls of data.
 *     The 'inCrc32' for the first buffer must be zero.
 *     Only the low 32 bits of 'inCrc32' are used, so the result is the same
 *     whether 'unsigned long' is 32 bits wide or wider.
 *  ARGUMENTS:
 *     inCrc32 - accumulated CRC-32 value, must be 0 on first call
 *     buf     - buffer to compute CRC-32 value for
 *     bufLen  - number of bytes in buffer
 *  RETURNS:
 *     crc32 - computed CRC-32 value
 *  ERRORS:
 *     (no errors are possible)
\*----------------------------------------------------------------------------*/

static unsigned long Crc32_ComputeBuf( unsigned long inCrc32, const void *buf,
                                       size_t bufLen )
{
    static const unsigned long crcTable[256] = {
   0x00000000,0x77073096,0xEE0E612C,0x990951BA,0x076DC419,0x706AF48F,0xE963A535,
   0x9E6495A3,0x0EDB8832,0x79DCB8A4,0xE0D5E91E,0x97D2D988,0x09B64C2B,0x7EB17CBD,
   0xE7B82D07,0x90BF1D91,0x1DB71064,0x6AB020F2,0xF3B97148,0x84BE41DE,0x1ADAD47D,
   0x6DDDE4EB,0xF4D4B551,0x83D385C7,0x136C9856,0x646BA8C0,0xFD62F97A,0x8A65C9EC,
   0x14015C4F,0x63066CD9,0xFA0F3D63,0x8D080DF5,0x3B6E20C8,0x4C69105E,0xD56041E4,
   0xA2677172,0x3C03E4D1,0x4B04D447,0xD20D85FD,0xA50AB56B,0x35B5A8FA,0x42B2986C,
   0xDBBBC9D6,0xACBCF940,0x32D86CE3,0x45DF5C75,0xDCD60DCF,0xABD13D59,0x26D930AC,
   0x51DE003A,0xC8D75180,0xBFD06116,0x21B4F4B5,0x56B3C423,0xCFBA9599,0xB8BDA50F,
   0x2802B89E,0x5F058808,0xC60CD9B2,0xB10BE924,0x2F6F7C87,0x58684C11,0xC1611DAB,
   0xB6662D3D,0x76DC4190,0x01DB7106,0x98D220BC,0xEFD5102A,0x71B18589,0x06B6B51F,
   0x9FBFE4A5,0xE8B8D433,0x7807C9A2,0x0F00F934,0x9609A88E,0xE10E9818,0x7F6A0DBB,
   0x086D3D2D,0x91646C97,0xE6635C01,0x6B6B51F4,0x1C6C6162,0x856530D8,0xF262004E,
   0x6C0695ED,0x1B01A57B,0x8208F4C1,0xF50FC457,0x65B0D9C6,0x12B7E950,0x8BBEB8EA,
   0xFCB9887C,0x62DD1DDF,0x15DA2D49,0x8CD37CF3,0xFBD44C65,0x4DB26158,0x3AB551CE,
   0xA3BC0074,0xD4BB30E2,0x4ADFA541,0x3DD895D7,0xA4D1C46D,0xD3D6F4FB,0x4369E96A,
   0x346ED9FC,0xAD678846,0xDA60B8D0,0x44042D73,0x33031DE5,0xAA0A4C5F,0xDD0D7CC9,
   0x5005713C,0x270241AA,0xBE0B1010,0xC90C2086,0x5768B525,0x206F85B3,0xB966D409,
   0xCE61E49F,0x5EDEF90E,0x29D9C998,0xB0D09822,0xC7D7A8B4,0x59B33D17,0x2EB40D81,
   0xB7BD5C3B,0xC0BA6CAD,0xEDB88320,0x9ABFB3B6,0x03B6E20C,0x74B1D29A,0xEAD54739,
   0x9DD277AF,0x04DB2615,0x73DC1683,0xE3630B12,0x94643B84,0x0D6D6A3E,0x7A6A5AA8,
   0xE40ECF0B,0x9309FF9D,0x0A00AE27,0x7D079EB1,0xF00F9344,0x8708A3D2,0x1E01F268,
   0x6906C2FE,0xF762575D,0x806567CB,0x196C3671,0x6E6B06E7,0xFED41B76,0x89D32BE0,
   0x10DA7A5A,0x67DD4ACC,0xF9B9DF6F,0x8EBEEFF9,0x17B7BE43,0x60B08ED5,0xD6D6A3E8,
   0xA1D1937E,0x38D8C2C4,0x4FDFF252,0xD1BB67F1,0xA6BC5767,0x3FB506DD,0x48B2364B,
   0xD80D2BDA,0xAF0A1B4C,0x36034AF6,0x41047A60,0xDF60EFC3,0xA867DF55,0x316E8EEF,
   0x4669BE79,0xCB61B38C,0xBC66831A,0x256FD2A0,0x5268E236,0xCC0C7795,0xBB0B4703,
   0x220216B9,0x5505262F,0xC5BA3BBE,0xB2BD0B28,0x2BB45A92,0x5CB36A04,0xC2D7FFA7,
   0xB5D0CF31,0x2CD99E8B,0x5BDEAE1D,0x9B64C2B0,0xEC63F226,0x756AA39C,0x026D930A,
   0x9C0906A9,0xEB0E363F,0x72076785,0x05005713,0x95BF4A82,0xE2B87A14,0x7BB12BAE,
   0x0CB61B38,0x92D28E9B,0xE5D5BE0D,0x7CDCEFB7,0x0BDBDF21,0x86D3D2D4,0xF1D4E242,
   0x68DDB3F8,0x1FDA836E,0x81BE16CD,0xF6B9265B,0x6FB077E1,0x18B74777,0x88085AE6,
   0xFF0F6A70,0x66063BCA,0x11010B5C,0x8F659EFF,0xF862AE69,0x616BFFD3,0x166CCF45,
   0xA00AE278,0xD70DD2EE,0x4E048354,0x3903B3C2,0xA7672661,0xD06016F7,0x4969474D,
   0x3E6E77DB,0xAED16A4A,0xD9D65ADC,0x40DF0B66,0x37D83BF0,0xA9BCAE53,0xDEBB9EC5,
   0x47B2CF7F,0x30B5FFE9,0xBDBDF21C,0xCABAC28A,0x53B39330,0x24B4A3A6,0xBAD03605,
   0xCDD70693,0x54DE5729,0x23D967BF,0xB3667A2E,0xC4614AB8,0x5D681B02,0x2A6F2B94,
   0xB40BBE37,0xC30C8EA1,0x5A05DF1B,0x2D02EF8D };
    /* keep the caller's const qualifier; the buffer is only read */
    const unsigned char *byteBuf = (const unsigned char *) buf;
    unsigned long crc32;
    size_t i;

    /** accumulate crc32 for buffer **/
    /* Mask to 32 bits: where 'unsigned long' is wider, stray high bits in
     * 'inCrc32' would otherwise be shifted down into the running state. */
    crc32 = (inCrc32 ^ 0xFFFFFFFFUL) & 0xFFFFFFFFUL;
    for (i = 0; i < bufLen; i++) {
        /* one table lookup per input byte, indexed by state XOR byte */
        crc32 = (crc32 >> 8) ^ crcTable[ (crc32 ^ byteBuf[i]) & 0xFF ];
    }
    /* the state never exceeds 32 bits here, so neither does the result */
    return( crc32 ^ 0xFFFFFFFFUL );
}

/*----------------------------------------------------------------------------*\
 *  END OF MODULE: crc32.c
\*----------------------------------------------------------------------------*/
a/deep_sequencing_unit/sourceTest/core-plugins/illumina-qgf/1/dss/drop-boxes/register-lane-nextseq/register-lane-nextseq.py b/deep_sequencing_unit/sourceTest/core-plugins/illumina-qgf/1/dss/drop-boxes/register-lane-nextseq/register-lane-nextseq.py index d357a6a02f1f3674ca607ef70afe0dcff76d6632..4b6a2ce34560667f14c2bdceb36082d1ea97252e 100644 --- a/deep_sequencing_unit/sourceTest/core-plugins/illumina-qgf/1/dss/drop-boxes/register-lane-nextseq/register-lane-nextseq.py +++ b/deep_sequencing_unit/sourceTest/core-plugins/illumina-qgf/1/dss/drop-boxes/register-lane-nextseq/register-lane-nextseq.py @@ -360,6 +360,7 @@ def process(transaction): meta_data_file_path = transaction.createNewFile(dataSet, name, meta_data_file_name) writeMetadataFile(transaction, name, meta_data_file_path, sequencing_sample_properties_dict, fcMetaDataDict, experiment, affiliation_name, fastq_files, flowLane) + affiliation_for_Undetermined = affiliation_name # Undetermined Files else: