From 98af93886d666b0d35d99737a4c36336d8742168 Mon Sep 17 00:00:00 2001 From: cramakri <cramakri> Date: Mon, 11 Jul 2011 12:37:40 +0000 Subject: [PATCH] LMS-2391 Split TSV files where necessary. Switched tab to spaces in all python code. SVN: 22070 --- eu_basynthec/dist/etc/data-set-validator.py | 4 +- .../etc/growth-profiles/data-set-handler.py | 143 +++++---- .../etc/growth-profiles/data-set-validator.py | 102 +++---- .../dist/etc/metabolomics/data-set-handler.py | 88 +++--- .../etc/metabolomics/data-set-validator.py | 116 +++---- .../dist/etc/proteomics/data-set-handler.py | 88 +++--- .../dist/etc/proteomics/data-set-validator.py | 124 ++++---- .../dist/etc/shared/shared-classes.py | 282 +++++++++--------- .../etc/transcriptomics/data-set-handler.py | 231 +++++++++----- .../etc/transcriptomics/data-set-validator.py | 128 ++++---- .../sourceTest/examples/OD600-Example.xlsx | Bin 9914 -> 10077 bytes ...stractBaSynthecDataSetRegistratorTest.java | 19 +- .../cisd/dss/TimeSeriesDataExcelTest.java | 3 +- .../OD600DataSetRegistratorTest.java | 30 +- ...TranscriptomicsDataSetRegistratorTest.java | 30 +- 15 files changed, 782 insertions(+), 606 deletions(-) diff --git a/eu_basynthec/dist/etc/data-set-validator.py b/eu_basynthec/dist/etc/data-set-validator.py index 6b1f13c649b..8af662eac42 100644 --- a/eu_basynthec/dist/etc/data-set-validator.py +++ b/eu_basynthec/dist/etc/data-set-validator.py @@ -2,5 +2,5 @@ import os import re def validate_data_set_file(file): - errors = [] - return errors + errors = [] + return errors diff --git a/eu_basynthec/dist/etc/growth-profiles/data-set-handler.py b/eu_basynthec/dist/etc/growth-profiles/data-set-handler.py index 4f4df43954f..bc76e78a48d 100644 --- a/eu_basynthec/dist/etc/growth-profiles/data-set-handler.py +++ b/eu_basynthec/dist/etc/growth-profiles/data-set-handler.py @@ -2,58 +2,92 @@ from datetime import datetime from eu.basynthec.cisd.dss import TimeSeriesDataExcel def retrieve_experiment(tr, exp_id): - """Get the specified experiment form the server. Return the experiment.""" - if exp_id is None: - exp = None - else: - exp = tr.getExperiment(exp_id) - return exp + """Get the specified experiment form the server. 
Return the experiment.""" + if exp_id is None: + exp = None + else: + exp = tr.getExperiment(exp_id) + return exp def extract_strains(): - """Extract the strains from the data sheet""" - strains = [] - lines = timeSeriesData.getRawDataLines() - for i in range(1, len(lines)): - line = lines[i] - strains.append(line[0]) - return ",".join(strains) + """Extract the strains from the data sheet""" + strains = [] + lines = timeSeriesData.getRawDataLines() + for i in range(1, len(lines)): + line = lines[i] + strains.append(line[0]) + return ",".join(strains) def assign_properties(dataset, metadata): - """Assign properties to the data set from information in the data.""" - propertyNameMap = { - "STRAIN_NAMES": "STRAIN_NAMES", - "TIMEPOINT TYPE": "TIMEPOINT_TYPE", - "CELL LOCATION": "CELL_LOCATION", - "VALUE TYPE": "VALUE_TYPE", - "VALUE UNIT": "VALUE_UNIT", - "SCALE": "SCALE" - } - - for prop in metadata.keySet(): - key = propertyNameMap.get(prop) - if key is not None: - value = metadata.get(prop) - if (key == "STRAIN"): - value = value + " (STRAIN)" - dataset.setPropertyValue(key, value.upper()) - + """Assign properties to the data set from information in the data.""" + propertyNameMap = { + "STRAIN_NAMES": "STRAIN_NAMES", + "TIMEPOINT TYPE": "TIMEPOINT_TYPE", + "CELL LOCATION": "CELL_LOCATION", + "VALUE TYPE": "VALUE_TYPE", + "VALUE UNIT": "VALUE_UNIT", + "SCALE": "SCALE" + } + + for prop in metadata.keySet(): + key = propertyNameMap.get(prop) + if key is not None: + value = metadata.get(prop) + if (key == "STRAIN"): + value = value + " (STRAIN)" + dataset.setPropertyValue(key, value.upper()) + def convert_data_to_tsv(tr, dataset, location): - """Create a tsv file containing the data and add it to the data set.""" - tr.createNewDirectory(dataset, location) - tsvFileName = tr.createNewFile(dataset, location, incoming.getName() + ".tsv") - tsv = open(tsvFileName, 'w') - for line in timeSeriesData.getRawDataLines(): - for i in range(0, len(line) - 1): - tsv.write(line[i]) - tsv.write("\t") - tsv.write(line[len(line) - 1]) - tsv.write("\n") - tsv.close() - + """Create a tsv file containing the data and add it to the data set.""" + tr.createNewDirectory(dataset, location) + tsvFileName = tr.createNewFile(dataset, location, incoming.getName() + ".tsv") + tsv = open(tsvFileName, 'w') + for line in timeSeriesData.getRawDataLines(): + for i in range(0, len(line) - 1): + tsv.write(line[i]) + tsv.write("\t") + tsv.write(line[len(line) - 1]) + tsv.write("\n") + tsv.close() + +def convert_data_to_split_tsv(tr, dataset, location): + """Create one tsv file per strain in the original data.""" + raw_data_lines = timeSeriesData.getRawDataLines() + + # Extract the header -- this is shared by all files + header_line = raw_data_lines[0] + # In the header we don't need the strain, but we start with a run number + header = 'RunNumber\t' + '\t'.join(header_line[1:len(header_line)]) + + tr.createNewDirectory(dataset, location) + + # Keep track of the strains, since a strain can be measured multiple times + data_per_strain = {} + + lines_len = len(raw_data_lines) + for i in range(1, len(raw_data_lines)): + line = raw_data_lines[i] + strain_name = line[0] + strain_data = data_per_strain.setdefault(strain_name, []) + # Append the line -- this is run number + the data + strain_data.append(str(len(strain_data)) + '\t' + '\t'.join(line[1:len(line)])) + + # Create the files + for strain in data_per_strain.iterkeys(): + tsvFileName = tr.createNewFile(dataset, location, incoming.getName() + "_" + strain + ".tsv") + tsv = 
open(tsvFileName, 'w') + tsv.write(header) + + strain_data = data_per_strain[strain] + for line in strain_data: + tsv.write("\n") + tsv.write(line) + tsv.close() + def store_original_data(tr, dataset, location): - """Put the original data into the data set.""" - tr.createNewDirectory(dataset, location) - tr.moveFile(incoming.getAbsolutePath(), dataset, location + "/" + incoming.getName()) + """Put the original data into the data set.""" + tr.createNewDirectory(dataset, location) + tr.moveFile(incoming.getAbsolutePath(), dataset, location + "/" + incoming.getName()) tr = service.transaction(incoming) @@ -73,6 +107,9 @@ store_original_data(tr, original_dataset, "xls") tsv_dataset = tr.createNewDataSet("TSV_MULTISTRAIN_EXPORT") convert_data_to_tsv(tr, tsv_dataset, "tsv-multi") +tsv_split_dataset = tr.createNewDataSet("TSV_EXPORT") +convert_data_to_split_tsv(tr, tsv_split_dataset, "tsv") + # Make the original contain these contained_codes = [original_dataset.getDataSetCode(), tsv_dataset.getDataSetCode()] dataset.setContainedDataSetCodes(contained_codes) @@ -80,10 +117,10 @@ dataset.setContainedDataSetCodes(contained_codes) # If no experiment has been set, then get the experiment from the excel file if dataset.getExperiment() is None: - exp_id = metadata.get("EXPERIMENT") - exp = retrieve_experiment(tr, exp_id) - if exp is not None: - dataset.setExperiment(exp) - original_dataset.setExperiment(exp) - tsv_dataset.setExperiment(exp) - + exp_id = metadata.get("EXPERIMENT") + exp = retrieve_experiment(tr, exp_id) + if exp is not None: + dataset.setExperiment(exp) + original_dataset.setExperiment(exp) + tsv_dataset.setExperiment(exp) + tsv_split_dataset.setExperiment(exp) diff --git a/eu_basynthec/dist/etc/growth-profiles/data-set-validator.py b/eu_basynthec/dist/etc/growth-profiles/data-set-validator.py index b2f2c0bd96c..3bdb793b652 100644 --- a/eu_basynthec/dist/etc/growth-profiles/data-set-validator.py +++ b/eu_basynthec/dist/etc/growth-profiles/data-set-validator.py @@ -1,60 +1,60 @@ def validate_data(timeSeriesData, errors): - dataLines = timeSeriesData.getRawDataLines() - lineCount = 0 - for line in dataLines: - # The header needs to be Abs - if lineCount is 0: - if line[0] != "Strain": - errors.append(createFileValidationError("The first data column must be 'Strain'")) - break - lineCount = lineCount + 1 - continue + dataLines = timeSeriesData.getRawDataLines() + lineCount = 0 + for line in dataLines: + # The header needs to be Abs + if lineCount is 0: + if line[0] != "Strain": + errors.append(createFileValidationError("The first data column must be 'Strain'")) + break + lineCount = lineCount + 1 + continue - # The compound id should be one of these forms - strain = line[0] - if not isStrainIdValid(strain): - errors.append(createFileValidationError("Line " + str(lineCount + 1) + ", column 1 must be MGP[0-999] (instead of " + strain + ").")) - lineCount = lineCount + 1 + # The compound id should be one of these forms + strain = line[0] + if not isStrainIdValid(strain): + errors.append(createFileValidationError("Line " + str(lineCount + 1) + ", column 1 must be MGP[0-999] (instead of " + strain + ").")) + lineCount = lineCount + 1 def validate_metadata(time_series_data, errors): - metadata = time_series_data.getMetadataMap() - validationHelper = ValidationHelper(metadata, errors) - - # validate the header format - validationHelper.validateDefaultHeaderFormat() - - # validate the timepoint type - validationHelper.validateControlledVocabularyProperty("TIMEPOINT TYPE", - "time point type", 
['EX', 'IN', 'SI'], "'EX', 'IN', 'SI'") + metadata = time_series_data.getMetadataMap() + validationHelper = ValidationHelper(metadata, errors) + + # validate the header format + validationHelper.validateDefaultHeaderFormat() + + # validate the timepoint type + validationHelper.validateControlledVocabularyProperty("TIMEPOINT TYPE", + "time point type", ['EX', 'IN', 'SI'], "'EX', 'IN', 'SI'") - # validate the cell location - validationHelper.validateControlledVocabularyProperty("CELL LOCATION", - "cell location", ['CE', 'ES', 'ME', 'CY', 'NC'], "'CE', 'ES', 'ME', 'CY', 'NC'") + # validate the cell location + validationHelper.validateControlledVocabularyProperty("CELL LOCATION", + "cell location", ['CE', 'ES', 'ME', 'CY', 'NC'], "'CE', 'ES', 'ME', 'CY', 'NC'") - # validate the value type - validationHelper.validateControlledVocabularyProperty("VALUE TYPE", - "value type", ['VALUE', 'MEAN', 'MEDIAN', 'STD', 'VAR', 'ERROR', 'IQR'], - "'Value', 'Mean', 'Median', 'Std', 'Var', 'Error', 'Iqr'") + # validate the value type + validationHelper.validateControlledVocabularyProperty("VALUE TYPE", + "value type", ['VALUE', 'MEAN', 'MEDIAN', 'STD', 'VAR', 'ERROR', 'IQR'], + "'Value', 'Mean', 'Median', 'Std', 'Var', 'Error', 'Iqr'") - # validate the value unit - validationHelper.validateControlledVocabularyProperty("VALUE UNIT", - "value unit", ['DIMENSIONLESS'], "'DIMENSIONLESS'") - - # validate the scale - validationHelper.validateControlledVocabularyProperty("SCALE", "scale", - ['LIN', 'LOG2', 'LOG10', 'LN'], "'lin', 'log2', 'log10', 'ln'") + # validate the value unit + validationHelper.validateControlledVocabularyProperty("VALUE UNIT", + "value unit", ['DIMENSIONLESS'], "'DIMENSIONLESS'") + + # validate the scale + validationHelper.validateControlledVocabularyProperty("SCALE", "scale", + ['LIN', 'LOG2', 'LOG10', 'LN'], "'lin', 'log2', 'log10', 'ln'") def validate_data_set_file(file): - errors = [] - time_series_data = create_time_series_excel(file.getAbsolutePath()) - if time_series_data is None: - errors.append(createFileValidationError(file.getName() + " is not an Excel file.")) - return errors - - # validate the metadata - validate_metadata(time_series_data, errors) - - # validate the data - validate_data(time_series_data, errors) - - return errors + errors = [] + time_series_data = create_time_series_excel(file.getAbsolutePath()) + if time_series_data is None: + errors.append(createFileValidationError(file.getName() + " is not an Excel file.")) + return errors + + # validate the metadata + validate_metadata(time_series_data, errors) + + # validate the data + validate_data(time_series_data, errors) + + return errors diff --git a/eu_basynthec/dist/etc/metabolomics/data-set-handler.py b/eu_basynthec/dist/etc/metabolomics/data-set-handler.py index 9cdbf69004f..56737bf7a8d 100644 --- a/eu_basynthec/dist/etc/metabolomics/data-set-handler.py +++ b/eu_basynthec/dist/etc/metabolomics/data-set-handler.py @@ -2,47 +2,47 @@ from datetime import datetime from eu.basynthec.cisd.dss import TimeSeriesDataExcel def retrieve_experiment(tr, exp_id): - """Get the specified experiment form the server. Return the experiment.""" - if exp_id is None: - exp = None - else: - exp = tr.getExperiment(exp_id) - return exp + """Get the specified experiment form the server. 
Return the experiment.""" + if exp_id is None: + exp = None + else: + exp = tr.getExperiment(exp_id) + return exp def assign_properties(dataset, metadata): - """Assign properties to the data set from information in the data.""" - propertyNameMap = { - "STRAIN":"STRAIN_NAMES", - "TIMEPOINT TYPE": "TIMEPOINT_TYPE", - "CELL LOCATION": "CELL_LOCATION", - "VALUE TYPE": "VALUE_TYPE", - "VALUE UNIT": "VALUE_UNIT", - "SCALE": "SCALE" - } - - for prop in metadata.keySet(): - key = propertyNameMap.get(prop) - if key is not None: - value = metadata.get(prop) - dataset.setPropertyValue(key, value.upper()) - + """Assign properties to the data set from information in the data.""" + propertyNameMap = { + "STRAIN":"STRAIN_NAMES", + "TIMEPOINT TYPE": "TIMEPOINT_TYPE", + "CELL LOCATION": "CELL_LOCATION", + "VALUE TYPE": "VALUE_TYPE", + "VALUE UNIT": "VALUE_UNIT", + "SCALE": "SCALE" + } + + for prop in metadata.keySet(): + key = propertyNameMap.get(prop) + if key is not None: + value = metadata.get(prop) + dataset.setPropertyValue(key, value.upper()) + def convert_data_to_tsv(tr, dataset, location): - """Create a tsv file containing the data and add it to the data set.""" - tr.createNewDirectory(dataset, location) - tsvFileName = tr.createNewFile(dataset, location, incoming.getName() + ".tsv") - tsv = open(tsvFileName, 'w') - for line in timeSeriesData.getRawDataLines(): - for i in range(0, len(line) - 1): - tsv.write(line[i]) - tsv.write("\t") - tsv.write(line[len(line) - 1]) - tsv.write("\n") - tsv.close() - + """Create a tsv file containing the data and add it to the data set.""" + tr.createNewDirectory(dataset, location) + tsvFileName = tr.createNewFile(dataset, location, incoming.getName() + ".tsv") + tsv = open(tsvFileName, 'w') + for line in timeSeriesData.getRawDataLines(): + for i in range(0, len(line) - 1): + tsv.write(line[i]) + tsv.write("\t") + tsv.write(line[len(line) - 1]) + tsv.write("\n") + tsv.close() + def store_original_data(tr, dataset, location): - """Put the original data into the data set.""" - tr.createNewDirectory(dataset, location) - tr.moveFile(incoming.getAbsolutePath(), dataset, location + "/" + incoming.getName()) + """Put the original data into the data set.""" + tr.createNewDirectory(dataset, location) + tr.moveFile(incoming.getAbsolutePath(), dataset, location + "/" + incoming.getName()) tr = service.transaction(incoming) @@ -52,7 +52,7 @@ timeSeriesData = TimeSeriesDataExcel.createTimeSeriesDataExcel(incoming.getAbsol dataset = tr.createNewDataSet("METABOLITE_INTENSITIES") metadata = timeSeriesData.getMetadataMap() assign_properties(dataset, metadata) - + # Store the original and tsv data in data sets original_dataset = tr.createNewDataSet("EXCEL_ORIGINAL") store_original_data(tr, original_dataset, "xls") @@ -67,11 +67,11 @@ dataset.setContainedDataSetCodes(contained_codes) # If no experiment has been set, then get the experiment from the excel file if dataset.getExperiment() is None: - exp_id = metadata.get("EXPERIMENT") - exp = retrieve_experiment(tr, exp_id) - if exp is not None: - dataset.setExperiment(exp) - original_dataset.setExperiment(exp) - tsv_dataset.setExperiment(exp) + exp_id = metadata.get("EXPERIMENT") + exp = retrieve_experiment(tr, exp_id) + if exp is not None: + dataset.setExperiment(exp) + original_dataset.setExperiment(exp) + tsv_dataset.setExperiment(exp) diff --git a/eu_basynthec/dist/etc/metabolomics/data-set-validator.py b/eu_basynthec/dist/etc/metabolomics/data-set-validator.py index 79d9b3d462a..0e77b3af1eb 100644 --- 
a/eu_basynthec/dist/etc/metabolomics/data-set-validator.py +++ b/eu_basynthec/dist/etc/metabolomics/data-set-validator.py @@ -1,67 +1,67 @@ def validate_data(time_series_data, errors): - chebiRegex = re.compile("^CHEBI:[0-9]+") - bsbmeRegex = re.compile("^BSBME:[0-9]+") - dataLines = time_series_data.getRawDataLines() - lineCount = 0 - for line in dataLines: - # The header needs to be CompoundID - if lineCount is 0: - if line[0] != "CompoundID": - errors.append(createFileValidationError("The first data column must be 'CompoundID'")) - break - lineCount = lineCount + 1 - continue + chebiRegex = re.compile("^CHEBI:[0-9]+") + bsbmeRegex = re.compile("^BSBME:[0-9]+") + dataLines = time_series_data.getRawDataLines() + lineCount = 0 + for line in dataLines: + # The header needs to be CompoundID + if lineCount is 0: + if line[0] != "CompoundID": + errors.append(createFileValidationError("The first data column must be 'CompoundID'")) + break + lineCount = lineCount + 1 + continue - # The compound id should be one of these forms - compoundId = line[0] - if not chebiRegex.match(compoundId): - if not bsbmeRegex.match(compoundId): - errors.append(createFileValidationError("Line " + str(lineCount + 1) + ", column 1 must be of the format 'CHEBI:#' or 'BSBME:#' (instead of " + compoundId + ").")) - lineCount = lineCount + 1 - + # The compound id should be one of these forms + compoundId = line[0] + if not chebiRegex.match(compoundId): + if not bsbmeRegex.match(compoundId): + errors.append(createFileValidationError("Line " + str(lineCount + 1) + ", column 1 must be of the format 'CHEBI:#' or 'BSBME:#' (instead of " + compoundId + ").")) + lineCount = lineCount + 1 + def validate_metadata(time_series_data, errors): - metadata = time_series_data.getMetadataMap() - validationHelper = ValidationHelper(metadata, errors) - - # validate the strain - validationHelper.validateStrain() - - # validate the header format - validationHelper.validateDefaultHeaderFormat() - - # validate the timepoint type - validationHelper.validateControlledVocabularyProperty("TIMEPOINT TYPE", - "time point type", ['EX', 'IN', 'SI'], "'EX', 'IN', 'SI'") + metadata = time_series_data.getMetadataMap() + validationHelper = ValidationHelper(metadata, errors) + + # validate the strain + validationHelper.validateStrain() + + # validate the header format + validationHelper.validateDefaultHeaderFormat() + + # validate the timepoint type + validationHelper.validateControlledVocabularyProperty("TIMEPOINT TYPE", + "time point type", ['EX', 'IN', 'SI'], "'EX', 'IN', 'SI'") - # validate the cell location - validationHelper.validateControlledVocabularyProperty("CELL LOCATION", - "cell location", ['CE', 'ES', 'ME', 'CY', 'NC'], "'CE', 'ES', 'ME', 'CY', 'NC'") + # validate the cell location + validationHelper.validateControlledVocabularyProperty("CELL LOCATION", + "cell location", ['CE', 'ES', 'ME', 'CY', 'NC'], "'CE', 'ES', 'ME', 'CY', 'NC'") - # validate the value type - validationHelper.validateControlledVocabularyProperty("VALUE TYPE", - "value type", ['VALUE', 'MEAN', 'MEDIAN', 'STD', 'VAR', 'ERROR', 'IQR'], - "'Value', 'Mean', 'Median', 'Std', 'Var', 'Error', 'Iqr'") + # validate the value type + validationHelper.validateControlledVocabularyProperty("VALUE TYPE", + "value type", ['VALUE', 'MEAN', 'MEDIAN', 'STD', 'VAR', 'ERROR', 'IQR'], + "'Value', 'Mean', 'Median', 'Std', 'Var', 'Error', 'Iqr'") - # validate the value unit - validationHelper.validateControlledVocabularyProperty("VALUE UNIT", - "value unit", ['MM', 'UM', 'RATIOT1', 'RATIOCS'], 
"'mM', 'uM', 'RatioT1', 'RatioCs'") - - # validate the scale - validationHelper.validateControlledVocabularyProperty("SCALE", "scale", - ['LIN', 'LOG2', 'LOG10', 'LN'], "'lin', 'log2', 'log10', 'ln'") + # validate the value unit + validationHelper.validateControlledVocabularyProperty("VALUE UNIT", + "value unit", ['MM', 'UM', 'RATIOT1', 'RATIOCS'], "'mM', 'uM', 'RatioT1', 'RatioCs'") + + # validate the scale + validationHelper.validateControlledVocabularyProperty("SCALE", "scale", + ['LIN', 'LOG2', 'LOG10', 'LN'], "'lin', 'log2', 'log10', 'ln'") def validate_data_set_file(file): - errors = [] - time_series_data = create_time_series_excel(file.getAbsolutePath()) - if time_series_data is None: - errors.append(createFileValidationError(file.getName() + " is not an Excel file.")) - return errors - - # validate the metadata - validate_metadata(time_series_data, errors) - - # validate the data - validate_data(time_series_data, errors) - - return errors + errors = [] + time_series_data = create_time_series_excel(file.getAbsolutePath()) + if time_series_data is None: + errors.append(createFileValidationError(file.getName() + " is not an Excel file.")) + return errors + + # validate the metadata + validate_metadata(time_series_data, errors) + + # validate the data + validate_data(time_series_data, errors) + + return errors diff --git a/eu_basynthec/dist/etc/proteomics/data-set-handler.py b/eu_basynthec/dist/etc/proteomics/data-set-handler.py index 4ab87d54b49..369cf170b58 100644 --- a/eu_basynthec/dist/etc/proteomics/data-set-handler.py +++ b/eu_basynthec/dist/etc/proteomics/data-set-handler.py @@ -2,48 +2,48 @@ from datetime import datetime from eu.basynthec.cisd.dss import TimeSeriesDataExcel def retrieve_experiment(tr, exp_id): - """Get the specified experiment form the server. Return the experiment.""" - if exp_id is None: - exp = None - else: - exp = tr.getExperiment(exp_id) - return exp + """Get the specified experiment form the server. 
Return the experiment.""" + if exp_id is None: + exp = None + else: + exp = tr.getExperiment(exp_id) + return exp def assign_properties(dataset, metadata): - """Assign properties to the data set from information in the data.""" - propertyNameMap = { - "STRAIN":"STRAIN_NAMES", - "TIMEPOINT TYPE": "TIMEPOINT_TYPE", - "CELL LOCATION": "CELL_LOCATION", - "VALUE UNIT": "VALUE_UNIT", - "SCALE": "SCALE" - } - - for prop in metadata.keySet(): - key = propertyNameMap.get(prop) - if key is not None: - value = metadata.get(prop) - if (key == "STRAIN"): - value = value + " (STRAIN)" - dataset.setPropertyValue(key, value.upper()) - + """Assign properties to the data set from information in the data.""" + propertyNameMap = { + "STRAIN":"STRAIN_NAMES", + "TIMEPOINT TYPE": "TIMEPOINT_TYPE", + "CELL LOCATION": "CELL_LOCATION", + "VALUE UNIT": "VALUE_UNIT", + "SCALE": "SCALE" + } + + for prop in metadata.keySet(): + key = propertyNameMap.get(prop) + if key is not None: + value = metadata.get(prop) + if (key == "STRAIN"): + value = value + " (STRAIN)" + dataset.setPropertyValue(key, value.upper()) + def convert_data_to_tsv(tr, dataset, location): - """Create a tsv file containing the data and add it to the data set.""" - tr.createNewDirectory(dataset, location) - tsvFileName = tr.createNewFile(dataset, location, incoming.getName() + ".tsv") - tsv = open(tsvFileName, 'w') - for line in timeSeriesData.getRawDataLines(): - for i in range(0, len(line) - 1): - tsv.write(line[i]) - tsv.write("\t") - tsv.write(line[len(line) - 1]) - tsv.write("\n") - tsv.close() - + """Create a tsv file containing the data and add it to the data set.""" + tr.createNewDirectory(dataset, location) + tsvFileName = tr.createNewFile(dataset, location, incoming.getName() + ".tsv") + tsv = open(tsvFileName, 'w') + for line in timeSeriesData.getRawDataLines(): + for i in range(0, len(line) - 1): + tsv.write(line[i]) + tsv.write("\t") + tsv.write(line[len(line) - 1]) + tsv.write("\n") + tsv.close() + def store_original_data(tr, dataset, location): - """Put the original data into the data set.""" - tr.createNewDirectory(dataset, location) - tr.moveFile(incoming.getAbsolutePath(), dataset, location + "/" + incoming.getName()) + """Put the original data into the data set.""" + tr.createNewDirectory(dataset, location) + tr.moveFile(incoming.getAbsolutePath(), dataset, location + "/" + incoming.getName()) tr = service.transaction(incoming) @@ -68,11 +68,11 @@ dataset.setContainedDataSetCodes(contained_codes) # If no experiment has been set, then get the experiment from the excel file if dataset.getExperiment() is None: - exp_id = metadata.get("EXPERIMENT") - exp = retrieve_experiment(tr, exp_id) - if exp is not None: - dataset.setExperiment(exp) - original_dataset.setExperiment(exp) - tsv_dataset.setExperiment(exp) + exp_id = metadata.get("EXPERIMENT") + exp = retrieve_experiment(tr, exp_id) + if exp is not None: + dataset.setExperiment(exp) + original_dataset.setExperiment(exp) + tsv_dataset.setExperiment(exp) diff --git a/eu_basynthec/dist/etc/proteomics/data-set-validator.py b/eu_basynthec/dist/etc/proteomics/data-set-validator.py index 63c80b68f35..c794234bf4b 100644 --- a/eu_basynthec/dist/etc/proteomics/data-set-validator.py +++ b/eu_basynthec/dist/etc/proteomics/data-set-validator.py @@ -1,69 +1,69 @@ def validate_data(time_series_data, errors): - gene_locus_regex = re.compile("^BSU[0-9]+|^BSU_misc_RNA_[0-9]+|^VMG_[0-9]+_[0-9]+(_c)?") - column_header_regex = re.compile("(\+|-)?[0-9]+::(value|mean|median|std|var|error|iqr)") - dataLines = 
time_series_data.getRawDataLines() - lineCount = 0 - for line in dataLines: - # The header needs to be GeneLocus - if lineCount is 0: - if line[0] != "GeneLocus": - errors.append(createFileValidationError("The first data column must be 'GeneLocus'")) - break - lineCount = lineCount + 1 - has_human_readable = line[1] == "HumanReadable" - - if has_human_readable: - range_start = 2 - else: - range_start = 1 - for i in range(range_start, len(line)): - if not column_header_regex.match(line[i].lower()): - errors.append(createFileValidationError("Column " + str(i) + " header must be of the format Timepoint::(value|mean|median|std|var|error|iqr), (instead of " + line[i] + ").")) - continue + gene_locus_regex = re.compile("^BSU[0-9]+|^BSU_misc_RNA_[0-9]+|^VMG_[0-9]+_[0-9]+(_c)?") + column_header_regex = re.compile("(\+|-)?[0-9]+::(value|mean|median|std|var|error|iqr)") + dataLines = time_series_data.getRawDataLines() + lineCount = 0 + for line in dataLines: + # The header needs to be GeneLocus + if lineCount is 0: + if line[0] != "GeneLocus": + errors.append(createFileValidationError("The first data column must be 'GeneLocus'")) + break + lineCount = lineCount + 1 + has_human_readable = line[1] == "HumanReadable" + + if has_human_readable: + range_start = 2 + else: + range_start = 1 + for i in range(range_start, len(line)): + if not column_header_regex.match(line[i].lower()): + errors.append(createFileValidationError("Column " + str(i) + " header must be of the format Timepoint::(value|mean|median|std|var|error|iqr), (instead of " + line[i] + ").")) + continue - # The compound id should be one of these forms - gene_locus = line[0] - if not gene_locus_regex.match(gene_locus): - errors.append(createFileValidationError("Line " + str(lineCount + 1) + ", column 1 must be of the format 'BSU#', 'BSU_misc_RNA_#', 'VMG_#_#', or 'VMG_#_#_c' (instead of " + gene_locus + ").")) - lineCount = lineCount + 1 - + # The compound id should be one of these forms + gene_locus = line[0] + if not gene_locus_regex.match(gene_locus): + errors.append(createFileValidationError("Line " + str(lineCount + 1) + ", column 1 must be of the format 'BSU#', 'BSU_misc_RNA_#', 'VMG_#_#', or 'VMG_#_#_c' (instead of " + gene_locus + ").")) + lineCount = lineCount + 1 + def validate_metadata(time_series_data, errors): - metadata = time_series_data.getMetadataMap() - validationHelper = ValidationHelper(metadata, errors) - - # validate the strain - validationHelper.validateStrain() - - # validate the header format - validationHelper.validateExplicitHeaderFormat("TIME::VALUE_TYPE") - - # validate the timepoint type - validationHelper.validateControlledVocabularyProperty("TIMEPOINT TYPE", - "time point type", ['EX', 'IN', 'SI'], "'EX', 'IN', 'SI'") + metadata = time_series_data.getMetadataMap() + validationHelper = ValidationHelper(metadata, errors) + + # validate the strain + validationHelper.validateStrain() + + # validate the header format + validationHelper.validateExplicitHeaderFormat("TIME::VALUE_TYPE") + + # validate the timepoint type + validationHelper.validateControlledVocabularyProperty("TIMEPOINT TYPE", + "time point type", ['EX', 'IN', 'SI'], "'EX', 'IN', 'SI'") - # validate the cell location - validationHelper.validateControlledVocabularyProperty("CELL LOCATION", - "cell location", ['CE', 'ES', 'ME', 'CY', 'NC'], "'CE', 'ES', 'ME', 'CY', 'NC'") + # validate the cell location + validationHelper.validateControlledVocabularyProperty("CELL LOCATION", + "cell location", ['CE', 'ES', 'ME', 'CY', 'NC'], "'CE', 'ES', 'ME', 'CY', 'NC'") 
- # validate the value unit - validationHelper.validateControlledVocabularyProperty("VALUE UNIT", - "value unit", ['MM', 'UM', 'PERCENT', 'RATIOT1', 'RATIOCS', 'AU', 'DIMENSIONLESS'], "'mM', 'uM', 'Percent', 'RatioT1', 'RatioCs', 'AU', 'Dimensionless'") - - # validate the scale - validationHelper.validateControlledVocabularyProperty("SCALE", "scale", - ['LIN', 'LOG2', 'LOG10', 'LN'], "'lin', 'log2', 'log10', 'ln'") + # validate the value unit + validationHelper.validateControlledVocabularyProperty("VALUE UNIT", + "value unit", ['MM', 'UM', 'PERCENT', 'RATIOT1', 'RATIOCS', 'AU', 'DIMENSIONLESS'], "'mM', 'uM', 'Percent', 'RatioT1', 'RatioCs', 'AU', 'Dimensionless'") + + # validate the scale + validationHelper.validateControlledVocabularyProperty("SCALE", "scale", + ['LIN', 'LOG2', 'LOG10', 'LN'], "'lin', 'log2', 'log10', 'ln'") def validate_data_set_file(file): - errors = [] - time_series_data = create_time_series_excel(file.getAbsolutePath()) - if time_series_data is None: - errors.append(createFileValidationError(file.getName() + " is not an Excel file.")) - return errors - - # validate the metadata - validate_metadata(time_series_data, errors) - - # validate the data - validate_data(time_series_data, errors) - - return errors + errors = [] + time_series_data = create_time_series_excel(file.getAbsolutePath()) + if time_series_data is None: + errors.append(createFileValidationError(file.getName() + " is not an Excel file.")) + return errors + + # validate the metadata + validate_metadata(time_series_data, errors) + + # validate the data + validate_data(time_series_data, errors) + + return errors diff --git a/eu_basynthec/dist/etc/shared/shared-classes.py b/eu_basynthec/dist/etc/shared/shared-classes.py index 6d474765692..72e6fd1ae05 100644 --- a/eu_basynthec/dist/etc/shared/shared-classes.py +++ b/eu_basynthec/dist/etc/shared/shared-classes.py @@ -13,156 +13,156 @@ OPENBIS_METADATA_SHEET_NAME = "openbis-metadata" OPENBIS_DATA_SHEET_NAME = "openbis-data" class TimeSeriesDataExcel: - """ - An abstraction for accessing time series data following the BaSynthec conventions - from an Excel file. This class ported from Java, thus the camelCase naming. - """ - def __init__(self, file, fileReader): - self.file = file - self.fileReader = fileReader - - def getRawMetadataLines(self): - """Get the raw lines of the metadata sheet.""" - try: - return self.fileReader.readLines(OPENBIS_METADATA_SHEET_NAME); - except IOException, ex: - operationLog.error("Could not read data from [file: " + self.file.getPath() + ", sheet: " - + OPENBIS_METADATA_SHEET_NAME + "]", ex) - return [] + """ + An abstraction for accessing time series data following the BaSynthec conventions + from an Excel file. This class ported from Java, thus the camelCase naming. 
+ """ + def __init__(self, file, fileReader): + self.file = file + self.fileReader = fileReader + + def getRawMetadataLines(self): + """Get the raw lines of the metadata sheet.""" + try: + return self.fileReader.readLines(OPENBIS_METADATA_SHEET_NAME); + except IOException, ex: + operationLog.error("Could not read data from [file: " + self.file.getPath() + ", sheet: " + + OPENBIS_METADATA_SHEET_NAME + "]", ex) + return [] - def getRawDataLines(self): - """Get the raw lines of the data sheet.""" - try: - return self.fileReader.readLines(OPENBIS_DATA_SHEET_NAME) - except IOException, ex: - operationLog.error("Could not read data from [file: " + file.getPath() + ", sheet: " - + OPENBIS_DATA_SHEET_NAME + "]", ex) - return [] + def getRawDataLines(self): + """Get the raw lines of the data sheet.""" + try: + return self.fileReader.readLines(OPENBIS_DATA_SHEET_NAME) + except IOException, ex: + operationLog.error("Could not read data from [file: " + file.getPath() + ", sheet: " + + OPENBIS_DATA_SHEET_NAME + "]", ex) + return [] - def getMetadataMap(self): - """ - Return the metadata has a hashmap, with all keys uppercased. - - Assumes the metadata sheet corresponds to the following format: [Property] [Value] [... stuff - that can be ignored], that is the property name is in column 1 and property value is in - column 2, and everything else can be ignored. - """ - metadataMap = {} - metadataLines = self.getRawMetadataLines() - - # Skip the first line, this is just the header - for i in range(1, metadataLines.size()): - line = metadataLines.get(i) - value = line[1]; - if "BLANK" == value: - value = None - metadataMap[line[0].upper()] = value - return metadataMap - + def getMetadataMap(self): + """ + Return the metadata has a hashmap, with all keys uppercased. + + Assumes the metadata sheet corresponds to the following format: [Property] [Value] [... stuff + that can be ignored], that is the property name is in column 1 and property value is in + column 2, and everything else can be ignored. + """ + metadataMap = {} + metadataLines = self.getRawMetadataLines() + + # Skip the first line, this is just the header + for i in range(1, metadataLines.size()): + line = metadataLines.get(i) + value = line[1]; + if "BLANK" == value: + value = None + metadataMap[line[0].upper()] = value + return metadataMap + def create_time_series_excel(fileName): - """Factory method for the TimeSeriesData object. Returns None if it cannot be created.""" - file = java.io.File(fileName) - try: - workbook = ExcelFileReader.getExcelWorkbook(file) - fileReader = ExcelFileReader(workbook, True) - return TimeSeriesDataExcel(file, fileReader) - except IllegalArgumentException, ex: - operationLog.error("Could not open file [" + fileName + "] as Excel data.", ex) - except IOException, ex: - operationLog.error("Could not open file [" + fileName + "] as Excel data.", ex) - return None + """Factory method for the TimeSeriesData object. Returns None if it cannot be created.""" + file = java.io.File(fileName) + try: + workbook = ExcelFileReader.getExcelWorkbook(file) + fileReader = ExcelFileReader(workbook, True) + return TimeSeriesDataExcel(file, fileReader) + except IllegalArgumentException, ex: + operationLog.error("Could not open file [" + fileName + "] as Excel data.", ex) + except IOException, ex: + operationLog.error("Could not open file [" + fileName + "] as Excel data.", ex) + return None - + class ValidationHelper: - """ - Methods for simplifying validation in BaSynthec. - This class is ported from Java, thus the camelCase naming. 
- """ - def __init__(self, metadataMap, errors): - self.metadataMap = metadataMap - self.errors = errors + """ + Methods for simplifying validation in BaSynthec. + This class is ported from Java, thus the camelCase naming. + """ + def __init__(self, metadataMap, errors): + self.metadataMap = metadataMap + self.errors = errors - def checkIsSpecified(self, property, displayName): - """Verify that a property is specified; if not, add a validation error to the list.""" - if self.metadataMap.get(property) is None: - self.errors.append(ValidationError.createFileValidationError("A " + displayName - + " must be specified.")) - return False - return True - - def validateStrain(self): - """Verify that the strain is specified and of the correct format""" - if not self.checkIsSpecified("STRAIN", "strain"): - return - strain = self.metadataMap.get("STRAIN") - if not isStrainIdValid(strain): - self.errors.append(createFileValidationError("Strain must be MGP[0-999] (instead of " + strain + ").")) - - def validateDefaultHeaderFormat(self): - """Validate that header format is either not specified or matches default (TIME)""" - if self.metadataMap.get("HEADER FORMAT") is None: - return - format = self.metadataMap.get("HEADER FORMAT") - expected_format = "TIME" - if expected_format != format: - self.errors.append(createFileValidationError("Header format must be " + expected_format + " (not " + format + ").")) - - def validateExplicitHeaderFormat(self, expected_format): - """Validate that header format is specified and matches the expected_format argument""" - if not self.checkIsSpecified("HEADER FORMAT", "header format"): - return - format = self.metadataMap.get("HEADER FORMAT") - if expected_format != format: - self.errors.append(createFileValidationError("Header format must be " + expected_format + " (not " + format + ").")) - - def validateControlledVocabularyProperty(self, property, displayName, allowedValues, allowedValuesDisplay): - """Validate that the property is specified and in the list of allowed values""" - if not self.checkIsSpecified(property, displayName): - return - value = self.metadataMap.get(property).upper() - if value not in allowedValues: - if len(allowedValues) > 1: - self.errors.append(createFileValidationError("The " + displayName + " must be one of " + allowedValuesDisplay + " (not " + value + ").")) - else: - self.errors.append(createFileValidationError("The " + displayName + " must be " + allowedValuesDisplay + " (not " + value + ").")) - - def validateStartDataRowCol(self): - if self.checkIsSpecified("START DATA ROW", "Start Data Row"): - value = self.metadataMap.get("START DATA ROW") - match = re.match("[0-9]+", value) - if match is None: - self.errors.append(createFileValidationError("The Start Data Row must be a number (not " + value + ").")) - if self.checkIsSpecified("START DATA COL", "Start Data Col"): - value = self.metadataMap.get("START DATA COL") - match = re.match("[A-Z]", value) - if match is None: - self.errors.append(createFileValidationError("The Start Data Col must be a letter between A and Z (not " + value + ").")) - + def checkIsSpecified(self, property, displayName): + """Verify that a property is specified; if not, add a validation error to the list.""" + if self.metadataMap.get(property) is None: + self.errors.append(ValidationError.createFileValidationError("A " + displayName + + " must be specified.")) + return False + return True + + def validateStrain(self): + """Verify that the strain is specified and of the correct format""" + if not 
self.checkIsSpecified("STRAIN", "strain"): + return + strain = self.metadataMap.get("STRAIN") + if not isStrainIdValid(strain): + self.errors.append(createFileValidationError("Strain must be MGP[0-999] (instead of " + strain + ").")) + + def validateDefaultHeaderFormat(self): + """Validate that header format is either not specified or matches default (TIME)""" + if self.metadataMap.get("HEADER FORMAT") is None: + return + format = self.metadataMap.get("HEADER FORMAT") + expected_format = "TIME" + if expected_format != format: + self.errors.append(createFileValidationError("Header format must be " + expected_format + " (not " + format + ").")) + + def validateExplicitHeaderFormat(self, expected_format): + """Validate that header format is specified and matches the expected_format argument""" + if not self.checkIsSpecified("HEADER FORMAT", "header format"): + return + format = self.metadataMap.get("HEADER FORMAT") + if expected_format != format: + self.errors.append(createFileValidationError("Header format must be " + expected_format + " (not " + format + ").")) + + def validateControlledVocabularyProperty(self, property, displayName, allowedValues, allowedValuesDisplay): + """Validate that the property is specified and in the list of allowed values""" + if not self.checkIsSpecified(property, displayName): + return + value = self.metadataMap.get(property).upper() + if value not in allowedValues: + if len(allowedValues) > 1: + self.errors.append(createFileValidationError("The " + displayName + " must be one of " + allowedValuesDisplay + " (not " + value + ").")) + else: + self.errors.append(createFileValidationError("The " + displayName + " must be " + allowedValuesDisplay + " (not " + value + ").")) + + def validateStartDataRowCol(self): + if self.checkIsSpecified("START DATA ROW", "Start Data Row"): + value = self.metadataMap.get("START DATA ROW") + match = re.match("[0-9]+", value) + if match is None: + self.errors.append(createFileValidationError("The Start Data Row must be a number (not " + value + ").")) + if self.checkIsSpecified("START DATA COL", "Start Data Col"): + value = self.metadataMap.get("START DATA COL") + match = re.match("[A-Z]", value) + if match is None: + self.errors.append(createFileValidationError("The Start Data Col must be a letter between A and Z (not " + value + ").")) + strainIdRegex = re.compile("^MGP[0-9]{1,3}") def isStrainIdValid(strainId): - """Return true if the strain id passes validation (has the form MGP[:digit:]{1,3})""" - match = strainIdRegex.match(strainId) - if match is None: - return False - return match.end() == len(strainId) - + """Return true if the strain id passes validation (has the form MGP[:digit:]{1,3})""" + match = strainIdRegex.match(strainId) + if match is None: + return False + return match.end() == len(strainId) + def getInitialDataRowAndCol(metadata): - """Extract the initial row and column as specified in the metadata. Returns an array with [row, col].""" - # get the raw value from the map - first_data_row = metadata.get("START DATA ROW") - first_data_col = metadata.get("START DATA COL") + """Extract the initial row and column as specified in the metadata. 
Returns an array with [row, col].""" + # get the raw value from the map + first_data_row = metadata.get("START DATA ROW") + first_data_col = metadata.get("START DATA COL") - # convert the row numeric string to an int - if first_data_row is None: - first_data_row = 0 - else: - first_data_row = int(float(first_data_row)) - 1 + # convert the row numeric string to an int + if first_data_row is None: + first_data_row = 0 + else: + first_data_row = int(float(first_data_row)) - 1 - # convert the column spreadsheet value to an int - if first_data_col is None: - first_data_col = 0 - else: - # columns start at A - first_data_col = ord(first_data_col) - ord('A') - return [first_data_row, first_data_col] + # convert the column spreadsheet value to an int + if first_data_col is None: + first_data_col = 0 + else: + # columns start at A + first_data_col = ord(first_data_col) - ord('A') + return [first_data_row, first_data_col] diff --git a/eu_basynthec/dist/etc/transcriptomics/data-set-handler.py b/eu_basynthec/dist/etc/transcriptomics/data-set-handler.py index d7d875f0585..81078ecaa8a 100644 --- a/eu_basynthec/dist/etc/transcriptomics/data-set-handler.py +++ b/eu_basynthec/dist/etc/transcriptomics/data-set-handler.py @@ -3,87 +3,159 @@ from eu.basynthec.cisd.dss import TimeSeriesDataExcel import re def getInitialDataRowAndCol(metadata): - """Extract the initial row and column as specified in the metadata. Returns an array with [row, col].""" - # get the raw value from the map - first_data_row = metadata.get("START DATA ROW") - first_data_col = metadata.get("START DATA COL") - - # convert the row numeric string to an int - if first_data_row is None: - first_data_row = 0 - else: - first_data_row = int(float(first_data_row)) - 1 - - # convert the column spreadsheet value to an int - if first_data_col is None: - first_data_col = 0 - else: - # columns start at A - first_data_col = ord(first_data_col) - ord('A') - return [first_data_row, first_data_col] + """Extract the initial row and column as specified in the metadata. Returns an array with [row, col].""" + # get the raw value from the map + first_data_row = metadata.get("START DATA ROW") + first_data_col = metadata.get("START DATA COL") + + # convert the row numeric string to an int + if first_data_row is None: + first_data_row = 0 + else: + first_data_row = int(float(first_data_row)) - 1 + + # convert the column spreadsheet value to an int + if first_data_col is None: + first_data_col = 0 + else: + # columns start at A + first_data_col = ord(first_data_col) - ord('A') + return [first_data_row, first_data_col] def retrieve_experiment(tr, exp_id): - """Get the specified experiment form the server. Return the experiment.""" - if exp_id is None: - exp = None - else: - exp = tr.getExperiment(exp_id) - return exp + """Get the specified experiment form the server. 
Return the experiment.""" + if exp_id is None: + exp = None + else: + exp = tr.getExperiment(exp_id) + return exp def assign_properties(dataset, metadata): - """Assign properties to the data set from information in the data.""" - propertyNameMap = { - "STRAIN_NAMES": "STRAIN_NAMES", - "TIMEPOINT TYPE": "TIMEPOINT_TYPE", - "CELL LOCATION": "CELL_LOCATION", - "VALUE TYPE": "VALUE_TYPE", - "VALUE UNIT": "VALUE_UNIT", - "SCALE": "SCALE" - } - - for prop in metadata.keySet(): - key = propertyNameMap.get(prop) - if key is not None: - value = metadata.get(prop) - if (key == "STRAIN"): - value = value + " (STRAIN)" - dataset.setPropertyValue(key, value.upper()) - + """Assign properties to the data set from information in the data.""" + propertyNameMap = { + "STRAIN_NAMES": "STRAIN_NAMES", + "TIMEPOINT TYPE": "TIMEPOINT_TYPE", + "CELL LOCATION": "CELL_LOCATION", + "VALUE TYPE": "VALUE_TYPE", + "VALUE UNIT": "VALUE_UNIT", + "SCALE": "SCALE" + } + + for prop in metadata.keySet(): + key = propertyNameMap.get(prop) + if key is not None: + value = metadata.get(prop) + if (key == "STRAIN"): + value = value + " (STRAIN)" + dataset.setPropertyValue(key, value.upper()) + def convert_data_to_tsv(tr, start_row, start_col, dataset, location): - """Create a tsv file containing the data and add it to the data set.""" - tr.createNewDirectory(dataset, location) - tsvFileName = tr.createNewFile(dataset, location, incoming.getName() + ".tsv") - tsv = open(tsvFileName, 'w') - raw_data = timeSeriesData.getRawDataLines() - for i in range(start_row, len(raw_data)): - line = raw_data[i] - # write the metabolite id - tsv.write(line[0]) - tsv.write("\t") - for j in range(start_col, len(line) - 1): - tsv.write(line[j]) - tsv.write("\t") - tsv.write(line[len(line) - 1]) - tsv.write("\n") - tsv.close() - + """Create a tsv file containing the data and add it to the data set.""" + tr.createNewDirectory(dataset, location) + tsvFileName = tr.createNewFile(dataset, location, incoming.getName() + ".tsv") + tsv = open(tsvFileName, 'w') + raw_data = timeSeriesData.getRawDataLines() + for i in range(start_row, len(raw_data)): + line = raw_data[i] + # write the metabolite id + tsv.write(line[0]) + tsv.write("\t") + for j in range(start_col, len(line) - 1): + tsv.write(line[j]) + tsv.write("\t") + tsv.write(line[len(line) - 1]) + tsv.write("\n") + tsv.close() + +class SplitColumnInfo: + """ + A class that stores, for each column in the file, the column number, the strain name, + the biological replicate, the hybridization number, and the column offset in the resulting file + """ + def __init__(self, column, strain_name, bio_replicate, hybrid_number, output_col): + self.column = column + self.strain_name = strain_name + self.bio_replicate = bio_replicate + self.hybrid_number = hybrid_number + self.output_col = output_col + + tsv = None + + +def convert_data_to_split_tsv(tr, start_row, start_col, dataset, location): + """Create one tsv file per strain in the original data.""" + raw_data = timeSeriesData.getRawDataLines() + + # Keep track of the mapping from columns to strains and strains to columns + column_infos = [] + strain_column_info = {} + + # Extract the column / strain mapping + header_line = raw_data[start_row] + header_regex = re.compile("^(MGP[0-9]{1,3})-([0-9]) ([0-9]+)") + for i in range(start_col, len(header_line)): + match = header_regex.match(header_line[i]) + strain_name = match.group(1) + strain_cols = strain_column_info.setdefault(strain_name, []) + column_info = SplitColumnInfo(i, strain_name, match.group(2), 
match.group(3), len(strain_cols)) + strain_cols.append(column_info) + column_infos.append(column_info) + + # create the files + tr.createNewDirectory(dataset, location) + for strain in strain_column_info.iterkeys(): + tsvFileName = tr.createNewFile(dataset, location, incoming.getName() + "_" + strain + ".tsv") + tsv = open(tsvFileName, 'w') + for column_info in strain_column_info[strain]: + column_info.tsv = tsv + + # Write the header + line = raw_data[start_row] + tag = line[0] + # write the first column to each file + for strain in strain_column_info.iterkeys(): + strain_column_info[strain][0].tsv.write(tag) + + for column_info in column_infos: + column_info.tsv.write('\t') + column_info.tsv.write(column_info.bio_replicate) + column_info.tsv.write(' ') + column_info.tsv.write(column_info.hybrid_number) + + # Write the data to the files + for i in range(start_row + 1, len(raw_data)): + line = raw_data[i] + tag = line[0] + for strain in strain_column_info.iterkeys(): + strain_column_info[strain][0].tsv.write('\n') + # write the first column to each file + strain_column_info[strain][0].tsv.write(tag) + # Write the remaining data to each file + for column_info in column_infos: + column_info.tsv.write('\t') + column_info.tsv.write(line[column_info.column]) + + # Close each file + for strain in strain_column_info.iterkeys(): + strain_column_info[strain][0].tsv.close() + def store_original_data(tr, dataset, location): - """Put the original data into the data set.""" - tr.createNewDirectory(dataset, location) - tr.moveFile(incoming.getAbsolutePath(), dataset, location + "/" + incoming.getName()) + """Put the original data into the data set.""" + tr.createNewDirectory(dataset, location) + tr.moveFile(incoming.getAbsolutePath(), dataset, location + "/" + incoming.getName()) def extract_strains(start_row, start_col): - """Extract the strain names from the header.""" - strains = [] - line = timeSeriesData.getRawDataLines()[start_row] - header_regex = re.compile("^(MGP[0-9]{1,3})-([0-9]) ([0-9]+)") - for i in range(start_col, len(line)): - match = header_regex.match(line[i]) - strains.append(match.group(1)) - return ",".join(strains) + """Extract the strain names from the header.""" + strains = [] + line = timeSeriesData.getRawDataLines()[start_row] + header_regex = re.compile("^(MGP[0-9]{1,3})-([0-9]) ([0-9]+)") + for i in range(start_col, len(line)): + match = header_regex.match(line[i]) + strains.append(match.group(1)) + return ",".join(strains) - + tr = service.transaction(incoming) timeSeriesData = TimeSeriesDataExcel.createTimeSeriesDataExcel(incoming.getAbsolutePath()) @@ -102,6 +174,9 @@ store_original_data(tr, original_dataset, "xls") tsv_dataset = tr.createNewDataSet("TSV_MULTISTRAIN_EXPORT") convert_data_to_tsv(tr, dataStart[0], dataStart[1], tsv_dataset, "tsv-multi") +tsv_split_dataset = tr.createNewDataSet("TSV_EXPORT") +convert_data_to_split_tsv(tr, dataStart[0], dataStart[1], tsv_split_dataset, "tsv") + # Make the original contain these contained_codes = [original_dataset.getDataSetCode(), tsv_dataset.getDataSetCode()] dataset.setContainedDataSetCodes(contained_codes) @@ -109,11 +184,11 @@ dataset.setContainedDataSetCodes(contained_codes) # If no experiment has been set, then get the experiment from the excel file if dataset.getExperiment() is None: - exp_id = metadata.get("EXPERIMENT") - exp = retrieve_experiment(tr, exp_id) - if exp is not None: - dataset.setExperiment(exp) - original_dataset.setExperiment(exp) - tsv_dataset.setExperiment(exp) - + exp_id = metadata.get("EXPERIMENT") 
+ exp = retrieve_experiment(tr, exp_id) + if exp is not None: + dataset.setExperiment(exp) + original_dataset.setExperiment(exp) + tsv_dataset.setExperiment(exp) + tsv_split_dataset.setExperiment(exp) diff --git a/eu_basynthec/dist/etc/transcriptomics/data-set-validator.py b/eu_basynthec/dist/etc/transcriptomics/data-set-validator.py index ff0208622f0..7eece315c6a 100644 --- a/eu_basynthec/dist/etc/transcriptomics/data-set-validator.py +++ b/eu_basynthec/dist/etc/transcriptomics/data-set-validator.py @@ -1,75 +1,75 @@ def validate_header(line, first_data_col, errors): - """Validate the header, returning False if there is no point in continuing validation""" - if line[0] != "Locustag": - errors.append(createFileValidationError("The first data column must be 'Locustag' (not " + line[0] + ").")) - return False - header_regex = re.compile("^MGP[0-9]{1,3}-[0-9] [0-9]+") - for i in range(first_data_col, len(line)): - match = header_regex.match(line[i]) - if match is None: - errors.append(createFileValidationError("The column header + " + str(i) + " must be of the form [STRAIN]-[BIOLOGICAL REPLICATE] [HYBRIDIZATION NUMBER]")) + """Validate the header, returning False if there is no point in continuing validation""" + if line[0] != "Locustag": + errors.append(createFileValidationError("The first data column must be 'Locustag' (not " + line[0] + ").")) + return False + header_regex = re.compile("^MGP[0-9]{1,3}-[0-9] [0-9]+") + for i in range(first_data_col, len(line)): + match = header_regex.match(line[i]) + if match is None: + errors.append(createFileValidationError("The column header + " + str(i) + " must be of the form [STRAIN]-[BIOLOGICAL REPLICATE] [HYBRIDIZATION NUMBER]")) def validate_data(time_series_data, first_data_row, first_data_col, errors): - gene_locus_regex = re.compile("^BSU[0-9]+|^BSU_misc_RNA_[0-9]+|^VMG_[0-9]+_[0-9]+(_c)?") - dataLines = time_series_data.getRawDataLines() - for i in range(first_data_row, len(dataLines)): - line = dataLines[i] - # The header needs to be CompoundID - if i is first_data_row: - if not validate_header(line, first_data_col, errors): - break - continue + gene_locus_regex = re.compile("^BSU[0-9]+|^BSU_misc_RNA_[0-9]+|^VMG_[0-9]+_[0-9]+(_c)?") + dataLines = time_series_data.getRawDataLines() + for i in range(first_data_row, len(dataLines)): + line = dataLines[i] + # The header needs to be CompoundID + if i is first_data_row: + if not validate_header(line, first_data_col, errors): + break + continue - # The compound id should be one of these forms - gene_locus = line[0] - if not gene_locus_regex.match(gene_locus): - errors.append(createFileValidationError("Line " + str(i + 1) + ", column 1 must be of the format 'BSU#', 'BSU_misc_RNA_#', 'VMG_#_#', or 'VMG_#_#_c' (instead of " + gene_locus + ").")) - + # The compound id should be one of these forms + gene_locus = line[0] + if not gene_locus_regex.match(gene_locus): + errors.append(createFileValidationError("Line " + str(i + 1) + ", column 1 must be of the format 'BSU#', 'BSU_misc_RNA_#', 'VMG_#_#', or 'VMG_#_#_c' (instead of " + gene_locus + ").")) + def validate_metadata(time_series_data, errors): - metadata = time_series_data.getMetadataMap() - validationHelper = ValidationHelper(metadata, errors) - - # validate the header format - validationHelper.validateExplicitHeaderFormat("STRAIN-BIOREP HYBRID") - - # validate the timepoint type - validationHelper.validateControlledVocabularyProperty("TIMEPOINT TYPE", - "time point type", ['EX', 'IN', 'SI'], "'EX', 'IN', 'SI'") + metadata = 
time_series_data.getMetadataMap() + validationHelper = ValidationHelper(metadata, errors) + + # validate the header format + validationHelper.validateExplicitHeaderFormat("STRAIN-BIOREP HYBRID") + + # validate the timepoint type + validationHelper.validateControlledVocabularyProperty("TIMEPOINT TYPE", + "time point type", ['EX', 'IN', 'SI'], "'EX', 'IN', 'SI'") - # validate the cell location - validationHelper.validateControlledVocabularyProperty("CELL LOCATION", - "cell location", ['CE', 'ES', 'ME', 'CY', 'NC'], "'CE', 'ES', 'ME', 'CY', 'NC'") + # validate the cell location + validationHelper.validateControlledVocabularyProperty("CELL LOCATION", + "cell location", ['CE', 'ES', 'ME', 'CY', 'NC'], "'CE', 'ES', 'ME', 'CY', 'NC'") - # validate the value type - validationHelper.validateControlledVocabularyProperty("VALUE TYPE", - "value type", ['VALUE', 'MEAN', 'MEDIAN', 'STD', 'VAR', 'ERROR', 'IQR'], - "'Value', 'Mean', 'Median', 'Std', 'Var', 'Error', 'Iqr'") + # validate the value type + validationHelper.validateControlledVocabularyProperty("VALUE TYPE", + "value type", ['VALUE', 'MEAN', 'MEDIAN', 'STD', 'VAR', 'ERROR', 'IQR'], + "'Value', 'Mean', 'Median', 'Std', 'Var', 'Error', 'Iqr'") - # validate the value unit - validationHelper.validateControlledVocabularyProperty("VALUE UNIT", - "value unit", ['MM', 'UM', 'PERCENT', 'RATIOT1', 'RATIOCS', 'AU', 'DIMENSIONLESS'], "'mM', 'uM', 'Percent', 'RatioT1', 'RatioCs', 'AU', 'Dimensionless'") - - # validate the scale - validationHelper.validateControlledVocabularyProperty("SCALE", "scale", - ['LIN', 'LOG2', 'LOG10', 'LN'], "'lin', 'log2', 'log10', 'ln'") - - # validate the data position specification - validationHelper.validateStartDataRowCol() + # validate the value unit + validationHelper.validateControlledVocabularyProperty("VALUE UNIT", + "value unit", ['MM', 'UM', 'PERCENT', 'RATIOT1', 'RATIOCS', 'AU', 'DIMENSIONLESS'], "'mM', 'uM', 'Percent', 'RatioT1', 'RatioCs', 'AU', 'Dimensionless'") + + # validate the scale + validationHelper.validateControlledVocabularyProperty("SCALE", "scale", + ['LIN', 'LOG2', 'LOG10', 'LN'], "'lin', 'log2', 'log10', 'ln'") + + # validate the data position specification + validationHelper.validateStartDataRowCol() def validate_data_set_file(file): - errors = [] - time_series_data = create_time_series_excel(file.getAbsolutePath()) - if time_series_data is None: - errors.append(createFileValidationError(file.getName() + " is not an Excel file.")) - return errors - - # validate the metadata - validate_metadata(time_series_data, errors) - - data_start = getInitialDataRowAndCol(time_series_data.getMetadataMap()) - - # validate the data - validate_data(time_series_data, data_start[0], data_start[1], errors) - - return errors + errors = [] + time_series_data = create_time_series_excel(file.getAbsolutePath()) + if time_series_data is None: + errors.append(createFileValidationError(file.getName() + " is not an Excel file.")) + return errors + + # validate the metadata + validate_metadata(time_series_data, errors) + + data_start = getInitialDataRowAndCol(time_series_data.getMetadataMap()) + + # validate the data + validate_data(time_series_data, data_start[0], data_start[1], errors) + + return errors diff --git a/eu_basynthec/sourceTest/examples/OD600-Example.xlsx b/eu_basynthec/sourceTest/examples/OD600-Example.xlsx index 4f83f9ce28149ce85353f1e0456bfaddba7aa093..cee4d575d2575a349bfb7c3a456c3dfae9f8b5f8 100644 GIT binary patch delta 3751 zcmY*c2{aU38y?IsG{%zM*bU0QHr9x-FJ(*iHAHsd6Jr-j%rL}+EFqJ9-=#?u#yX6w 
zsmLxZ2*0oIobNmTJ?FXSp7%cIdCz<Ax$n78yUDz1Egp<2#3KX<6aauW6#&2h006>$ zrNjJ#d|dtgeWb$g`r^#{{N`jJT?{i<!R?|!>gDTkGsu=|n-~|dq-u3>Ii3r3!rH|? zKX>#7ho}8iA*u$#gn@zbmM)y}0B;}7mha#wmebQYyQ)b&>-X{e@Jd`m%=mPDFq`2i zFxJW1AGPSK_vLZqm-L4KU$Ka^8UF<E3y@I%_;=^ZafH9H`5J@Z0k1ocEkNPv+lEbh zj!MZwCZ$s+8T$z@5XqFDT4IEsS0zQ~YGXS?Ct0aI{y0KN<oL8o;dSI!#D;sXS(9#s zk>m9>-SF&P?K2mm1HQVRm3?Xy?yp`2e>HoINAYLsXHGv-jqI$ScFA)HVm5`YX{c!X zBn4ElLPJtOl&OaEJ{<kh?>Xw%wdTl=;7+T+-Gj?q7VBE18LXQ0^d13s3^^hS$|^gt zXX&(^vE9nLpN-_A`!4ihFsjtBfsnRr5G9ytdK;s#SM@_h0#Z~e>0$r))Jso=_#(Q* zd3_};^wp8wqQClKT7lZ|ReGy>i&A$r=3gE7x6^wd<6m%RmBL}?=T|>CH||TdQs)eD z%5uj-N`c07=t8N26NT`Tv-zoM<9zWN^1Ncl`f6-+z(Slp8Y}u@W%cYUO+yS_5G5#I zab9r*^AjS2%6@TM(2~oA+Z?SMZ#`FbL9~x~W>U%G+fG=~n1)~Ey?D@?TIbGTt--eA z5j48^8#%AZ#-2b)oNONZ+{nf-^@X8@^8WSbH=ZiCvUwh?NJX7V?P&lATs>@e!Toxd zqUK#|erNL#iTNd=U34KErsE7ytB-cCiD5yU16S&AqR8NiEw5L$_D&Z#9!2xYMn=_k zvU;zC<|P~~@XrYOHl;ZGn7un}e;`x~Z5GKCFiFBTf*kl@u$Qz*ZBt;vH_f<5!}vB1 z7CDnuos(qTC#F>opsH;HpL|$hU@MK@rGVx4F1|#PDMRegt3J#2fxbHzi;8^%^mF7^ z7lp)rpn}muasBUVH!2F6R&0N({201l^26M0zN){+5Z7}ZE=D5~(5UgkPv;)>J6i`} zcb!ueRwOoQ!{P?zsS<U&K;%)ySfHBYS5>>RTbjs^&HcJ&KAd@A-s}?Pz>)E5B{$U? zG(@X9@u&8Pp)_?-6=~(M;Px3nxJ=$uQGO+%Tv@VHduKKX0U$qDUhc?D&lXKMyS)}2 zd~pDV^auzs4+M8eCxgtWn6MHTF~H}EQ+K{A#g$%7JYdJf(6M6fz!@7T7<g#t?7y2| z(FOqkXBRQYT%0JmKW{a6F}PTTC1!Ew{Rtlj?~d_dS&Yh8)21!Iu(_$1_kMN@>Uz*6 za2MbGS+htbP$!I#X<=af_7sOp@ShEOQ@DV(Q58VzA)c-^yR35=YIi;wKhZ%sE;`VM zNS150)4Sq|Q%=)^P9_d^dfwFJcCOMSR8Vt{-!JA9Lk;Ubf09g-1#F9z&!U`lK?7O3 z(OpkGfuOk{bLRYXkBHmMI+Y`G5A}0k;Vo&`LgM?~;>?H8j4U)PUq=~kV88hq%)lL( zGjiPSv91Wmv*e4VuSGtwELAo+5Hir4YOY3$2_ptnZQpakwGb-sx@!3HT6Cc+v#%?n zbk#B)B|$sHr2Nds(4bM-99W)((>+_8*i(wzPm?Es$tDLrkq4XWpF`|<M@GS2pZTg4 zxqPj~bOn%ht$OX&xD)D+IfA>qNfK{!b10B+_)EuVhi|~NT^alqQNHy42Rrj7ZK;>A zPtr$8_Xj)nv63mxv_T33do^B047E}fb;u6X632w-?;{ZMm@0A1@y5LeZn^Y5O~RKI z;<jk~@soG<i5#tdi`>7es}l!sD+%Q<KQ}xvRlKO-RQ05Kd2YIvBv*ISB*9E$?8;I4 z27_;TpaEvezTmZqX8d~Y(oK3;!pb)Zv4_oi>GY!9Y$YTa$lUPqvG>=5H=k+KrSMCe zD4~N5U&m5}NzLZ7Pd1?;{h2#&*RVn@Ukn<y5nm(r@uRR=D$c0ox8_5wHAKzI6(8h% zkCB$tgXH~d46K1!%)H{R;<Mesf}(aZOYc4!Dfx9^Ll9Rv)*|*R)nx&6hVs^sa4cjR z(n}tH=t-S4P2(J=_KH8FWssjkgL`!fWtBTsWRV<PZuwSDq6O;5vHMVCa({WCy9;gp zp<aJ+H|aYcd_NrteW;~Qv=F@dt=`ut=roN#U1u8<Sk`r)^>x^WIO*r_M+3?yD&b2- zq9+^oSBs=$cPJqnm29xq=q{G^YRL$kXe=KLauO}sU(`*!nAWgtU_~QNQ93@2f*_LM zv7Xd_B?X&WH82-EG_iiQJ8QVE@2MC*VyD6MrOU`#jz}<MoBaol*gIwXso8O2)^y{p z@W&STRn0my6~U0ARJC)dXxNaq+-6WKKU<=@?>sa<4N<Te?DY^o9{xwwK(<P2cIXr? 
z&bj6aPun~P>U@$mYBFae7;3SIgS_j|1DC9WifjgBUCRLDFP#F2xBp~0mu4e5mjTSZ zeW<zc?o0LAyX<Zg4a`ZS)`n*d!*{MXk9WDlsT}a`ng3wbqtcqL*yhp$*D`)d>wN^l z5XvI1d>zlC+5Y_kl#E{uOwEQh-a-woaGl34-yPPB&xY7H*N5l$gv-Y+H)N0V2{v$L zy*!Y*HK@rhe#`wIMBIAur}%Sa{j_W~(?7+ZD}yrpPU7N~#3OL7aGJZ;^{v$BAwxTP zqBER|-nxE-A~I|QO5`cN0d|PzG4&qH5?e@FlwV6im4>HJydYS1ZhH3ds#Cl3m9?>z z@U;4H66VxhctAfASMbJleJp>0K7FFd(*rvaB&p)UI4n-fhT5(j<a*!A5U5X$?tg_G zpu~WWbUY%xiam3$GwWEApfac2bo49v5_xM;o-=%Zb;~D^WGYyciYaYCU1P~OY@yZY z<ya}B<jG&mJc3jzIwBi4wF<=*#_S#pHIs_&MDox|oS;0-1_tN64tFggU)#-y`-Wx~ z!(n$LPJBSBV$#oiZ8%e<qI0J5k;fIAk;Z3#_gPvY>+)pZ^B<8P=7>uW<)1IZ5>?oE z788z}1Q_JIKe*mxId%D^ZPVj>HR*25;xo6K1=f2Oo9KaFs#cM-0m?sr{R)WkN52yi z@66Ljf}9S=gb&nT)y-V*b~x;bUMP@^GV^|YrGgQM|Ls})``y9E<W>Gv$1U&}uS`JG zewb}eAfs5P5Tr6ij5hg78!7|h9iBC(nT;-bWa5gHq3@6AW&G6b_TW<_LZ`sO*bi9E zu}sd?H|M86no7sM@NoO39s(JRvMO-tGc2;1i2LekL>`+I??J2iHHW?t!(L0V&&TBo zz)1LA^`ws-lI+YcFUX`!Ded#82-b_-v^8q4R-nkS;X&<BxNB>5qxL>vgq;p=-oQ!z z<~PABKhA<#?kvbV)+Em4vxntp^HIhNg8FWgc;k#r<jh$#72-#~4+M^^EVf*c@8=3> zJUPCXVt1UvTiT+V>RudP%-t1l8)H6`0kcXxm4Z5f=eHGfa*`O+DC?I=Zj)K`zU(|( z--*JbqU>n_Y$npis6#bw3qXQ+8OZ05TToB-uGkXUTtZ|l0<j~DJHt@i^F#JFx#UW> z69j$*?e*U6&kK%MFuZsP%9|yiZv~e_@(?n-f2b=KxP99n>U1~^UW~7e^LlEEDWdia zjBmXMw&~K=DNN3IueB5qnsF-`tf^ZLf~<RcE9O>A&6Y`{Izp=U$pXEnP^PEWwSUuo z0#j75v0#~HRwfY@mcWHFRVJt`qimczbeuhKEEA&8OF|*zQ=p3PJ3`9)1gN=8#SucV z(wb#@K4Zl~x{?zP5tT`qBQgmpOO!!BQGTo0Ux}yI)%?^Y_(~^Pu$b_cmWH!}!^-3{ ze-!Hk>Iz=QbEAw^M8%vb-4dbiL<j-KT6@_XT{Z=KCT@6b6S1nf#(A!5&TUI%1$~6W zQCq_Eexyja{DN`Mr7`RWp|&#n2+-iO-Tpz!Mt#!!hj_NKGMfwZTSE)m&H+q_-^;+> zHpcd-fZ<&(>N&><8%vm&OK4JfA|3Y#**VV}|7oFwpW)8J_jkMZBV4dFtp@Thx&A`C z8C?vTbjoff-PV!wsxMKVqc5d}l=eQ-<ETRL>ltrmIVf2)qYus+CK7l)Cf7V(=$eVC zk>TzR+z0rPkIVTup78RKC1d<%^s6-1PA`?BC@`^fETGjE)1NvGCA=aOC?}(S85Y7K zs7HG+GT;3S_H0Yj!wd=(P4$;;+ekIxLVLCN`;+~e<oX6a1aI#YL>jcKR9rry+!(U; zd>{fu79k~#9NgC;I?{8*f(m>Ft8BHC(wB&rd%6u1U5cA`y?m&qbC!xV$J1#s&$@ml zxop1XWeGC~SmWQmQ`O3VGP85;C(uX(%)qytK?O$J%6_qRoDbF#vRpOu%<Mt|mt-u9 zG$WI}ukAf1#anHUCYf?7TdhA9*$hcly=^p>7Js}n)HkDGGH%?#A#jyYlb>AfH`O}1 z^%7X5mK{fX;R8u2XRjkTW93Grx)b5)VuI0*VMfb9UuY%Jn!SrgBktifj%mHC;?ffx z$6H*VE?)CC{=Apvw~iYI=SwV*`+AP%0;05@zBx1MO!4MtRx0;xhB1xCs1ext1kb4d zKFtPhfov+;e@z~yP1uwo0|SQh!~R%;bK~a)1^`&jjo*J`3xcL+!`Q+vQ*J>qXHZs5 zKAaKu--i<b;P{VwzT#rToN=*YKEPRF|F+F@&HpE1xX(#k7*P>M*q{47PwYSm05JWZ Xk|H0*LPVXS7LzF=L>(*q_ul^ilL*ze delta 3573 zcma)<cTf}Fvd5Fqk={w@5SmEuy%SmlgwRCkH6hXjq(ndvL|Q_TB7*dyQUsMEMT!xX zCSpJcMI?aqDpD@*_j~U*@BVRT?wQ%y-8tXc*_kuD^Eor{Hp_;3YK(`~f_(=N08k<W z02l!PKm<}I9333sjz$MaN1%{JHkRlwvP_-A+x$|*@W=FoKu44fyIH>(wH@_T^JG4< ze&59ND(rZZTg=5bj%>j*V{rFpr%H1>_?a6~&~V)aZ$I=<;ns)ADPHrl^|kc{Y;*G1 z#8?Cs6m#$Wi`q%di`xl|RSAb;#sKNu4x?YK`i2wnIYBELx3c}B8(1XhOqkmNP3czv zPQp7<?Mw*&L7#i5BThP^qEk<reJ90z1sFg$i!_J`&$4efhP-~lE=LG^QURSUoB#Me zncqLnDzBqtgmG&@aBrslMq###Pk?mo6QNB|9kh29g~-I1{(!CB@6>S7SPois36_T+ zkcd&{@*W-Iv{7y4<UnZEh7#R2r#CI(sKjOcK2|0e6cNo#e{EH7@>{N^jPlCh$pQ0T zqmg@XqXdb0Y2Mu3dM8XdjCFX*GWOc^if5p_F#U4LVlT*S42lyd_)wgUd)+V*O7rJV z&ZF~Z8?WpOP8=<Z{rle3V2!iFR~~f}J&jsTbzJ+dGFR>To!mb0I78IUm7#Cdbg=J| z{JgU7TKBxv%exBsLUnpRG4VNldaY#Yl)hbQ2c}SH$7DJdTq*e(a`Ut~g*F&OQBnCF z{RvAii^0`af+W)#=p{_c^y=2f5^=6HNdLl`--jCIquPN{w-YExnjM?J8%#HDB=N;Q z-JL41m3r<D^aW4*hgkF*oc?~72H?;3vqRHYF5J8E-D{Hwouw*qSiClxC_8JmQpiBH z^x{`?(kxFs#J0paJUceD+KDKbRGK%W`ZhOV3}dY8A;xO69VF-~e;nC+vfr~~?3ps@ z$kG&*;L6xADjJ`rB|SW8#1j|RAeTbtD1W>B3dwtO&ec-fIao-b$e0C*4E)qzlIbYq zv#Rr>Be|qn=-XJqxoSwb+Wq$MY!D}$X8;_txW`p3#bfb|iszX!(EkJF{ZI*mZLRp= za17kWE;vH%Kx?Ae`g-ckD!qL4mm8}!n6M-JlvK`I@}+L_^sgopSKohjov2RjRyhcm z85J416(f1Mn>e8R%JV9}qI&wEdCV}vkHc#8HL7u9*C|ms-ek!aUex?@gp@N}Ka6jy 
zUe)1D-w7T}k(Pw-sktBlnp~g!NA|xj?5c6hmK`G1cmSyJqOEnq!4Isx7=_TTx9J|~ zPbF2=YgF&8=bgq^#ZjB*rZ|Ged;MGPK=j6*=pS5aUS`mvVvr)|jeblH0BkV<09*iE zgD5)=BT9iO%5w>MYR05XL{FSiJs5mOdy}@hA`mJ{@KkY}>HDQEoNh+JCH=7A<j=J1 z9+x-F)P*1PamDXWl5pi9aTe>Kk=c>#t3HZ)H>QM53gxYsPC0Ap=;R_f^~8tr_9;SH zPam@#zgo$|444#jacJm2qU0WPf4a#vWL)WZPXmh~Pt;Suf(Rp;G+ftzIyb)$W~r|; zHASlDF6EmtRj3ckerW$uQfSGB4FGSjV&#iP#Dcm@&fGb@D7?*krB|>5m>C~VK}&g9 z<Aa6dPJ2_Mf_H|*<(29iUyWE2)vl0|azczvnbH|d>jO+x+lZBlj0icTO>ng_hnfT| zRsfUtaB0+knEIDZFjve()nXX=QeM>aB~OJ>nf1}sy2A0G7B!4eu#|RDz|xHu;xSix zPVYQ*&MXy>qZzUb0P-++8ZW%I_>!=^kAD2pzEaWXbH?(c^VU1ssn?w;DDp>-&@a6; zn1GrNILn3%0;|?+mT$X<tP_;ZkFI(CMXn*HLZRX7+%t8)HN1hxTw4r!uDy+8<B~C} zg)@hUxB9+?>Vn1O7AoMIh63|fhZ&Z1)lrq>^fg9g)qbw%i#Jm#Yj-KegWwrpYN~!t zN2}F<3?4|Ul9Y*z8&|G#_rBZe-oQ^_t91SRgb7K7P;U|R=*BmLp~eCdIBWAgi82c_ zy<6`1D)~;a=9-33lt7t=VXn<UOd6#byN>_D<&fB~<vQ8uJu6ldiTOs+R_r<_iyEPb z>?YsMt)S_a@=w1%=?&w;*Hg%>tO~Y2#`)k~@xbrS1>3znBgX)XOcO5homOu0cVY=0 zm9+NsB*poUHEd0EL#X%w2F)_YmgpnQHx*8s2N1p$$ZW%JYaw*xk8UnSGA-ziMy{7L zb~ij;mE{d*U(VMZ112JB_RX80Pq>!%3g>`&SC4ON3z5v{@&NC=Yt=xL=v*PK7K(vL zmZLOR(jkPK@)JaBTNN^EYq=$L*LDg^HrODt)vGsr3tj_8<aU9sY$|bSJ4#?sD|TeX zI|y{2<EFx8HIh3t<JaGoTj3;;>|P=8ASb@4k!6wXGP5{B$f}-A_fBsJ`Eq@qYQNcq z`w?^nK`W5c`lF=Zj6wTV6EP4<_oP01md&1C+?jCx(Tt(Z0WaMCm!){sAZVXD{8iQG z?*b1nIn->IDKhy^d3w|?%fTC2E?ngG)Ep43;)<GYBQ|09*}ZeF>De6F=q8(EuXcZ3 zg4yAJ+vwhZYSgLz$0>NzrioV(Gb2S?sxv*3ozr=f5Yyo^;#nDLJl+-}GhHk74Lc}6 zH1;d)_axhJGA5WTb%ZZ+KJ>PG!0t7YV^^g?!qX^f<pO&kX52S);X4C-v<TicQLQ-m z)zv*CbEK|0a)*)^{AWZ2Em(7WqBpa=xl1}9E>6e28eN#rIDkzyMEmMnElW6dKdsyC z8g{d4okdLS=+BHay6t-tbnH8cdpZ|1aeht)urenjX9G|Is4sFs0ORALTj3Y15qpvT zwaUPlB7{pup(<O<&6;!Iwu2u?M8Wg{^#onnD8j#`Z2e$yB1k%g9eHg!^J5n=ppdC# z+^$6iC_T8$$A5Ff^5E=aT?s`+K3kz|e!m+a{sS-TAIqbqCsl=ET)qZ<>9#iAqR~L! zx5LDftjFc*t7La+GWAH75wex=GT1oTt*1ViCdxRpS&gdPQT0LAbN0-~&V6;he1k*D ztfKnEnYG0EOfS5+rrFdjrU9IIb+X>+akqbr+u~k0N!5C=@@*-064vO&t{{{CvdT}z zt;}!=wU(q%Tb^6d7+?eUo!XJ4Q?=Cjm1yhfTx!}8mgPeMw!L&XAl{0iD>3h|Bqm`Z zTRp^yclgA`Rpw)`9CflE6nsA0Q;KQ5W~3v;Cec>M@fKTiIc&bN;k&X$mgMgFg~64Z zMyWm#R7;z}84;}$oo;k|x)S8pY#388e+pf~X*Bt;BU6&}5yWqwd1PgB2N=x8Y=5F7 zk6FK+xyrV+-m^u#24l!RBe~}f@C1zcI){u;cwI!+$4#VKrvr}35P}lv+6p%;cAvGL z+y7!+?~O-w@W_5Y6C>OBp3)sK*l>LxFLxBO^B(;!e-}oFSAbpx=0eRTlp2kS%U_Re zL<MS99V~UmwA19d-E~VFRes`Qo%(r<VSA4GQKyN>=0Ikz!IsCGU@9MT=|jwoOJMnt z;8E8yxKCH~j7+#Bzb0sXlNqm+kif<l-(?RQYkp{0Lh~ZIVYD%MI`$3cJhLz}D{NlS zfH_<X4<jo+y9`E#*1Hj84#Ar4<?{tSkc&I|uB{%#Zi}q+eYY|Xdfa&IR&rKS%Ew*< zWAJk?C3(>0O?GBwb&jlhj3_+fJ*>anpO%rk_R)DS?H|-BGBFKq+sDezCvPkFM7Kf7 zDYx{&<zDA`je>LEH8Qy5JMr5MHnbiPkyw|X6$6#Ro`Oay8_2F2JfAjmi;q4Pd$z;T zHZsvEdo%E?=lzs5vZCkqQXr%1dw|YaMADp%SxDs8r!64Wj5U=lMj0d_ry=%KFy|6y zo@+5`Le5p3F<|y^=;(Hg&$#e+>E>tPD6G$rf@)#L)aw8hc{ODM{`)%B7dk52AA`_z zM{rMmd|}&K;HvV23nq*l7+%>Oe9<HJ#^7d<@(FZ~dnWXdsC~^nD*9LJ-L@cM&VB3i zVZn3G|C>(c|399-MAKN~W*EOH5MyorPL?f^NO5j}k1DL3`ALnUl8g!#aIlU}#m^v+ zW0{8Sr%~g#>vCwce-?*yMQuO{s{(6tmmWR?=Loa(iv#*0)-^8a#Y17Q4dcQrXb>v2 z!J*`-i2i-M0i~t1IFoT-+2<fg1hy%Ajc5JwIGe}Np+J6Ghe2e}G#9=IgDK$s`ViB= zJ13IP=@9P`ua>0YT5TevE=a+jbv*t>OZOI?63pteI}7v4Vwh|BX>+N)DBST)*@4F6 z1vnv|ae<`$&0nz<1!?i)f43}l6t=td<pmf~+s;m9>luG7@a~%dHlgTO9P*kHVas*- zVGzs;ielkFwN09Rc(sgCSC=V{(cnJGX9?QbE;CsTbJy>U;HxYb2~kh=Yl>K-JYU`H zrKgAlpUVU>iw?=ZEnRt7slvLzZRL5iV8BKMG}j8&kkS0}zrtOI!h!d3bx?u7u^cx7 zWe5Ji{e<cP7jfdEqCEc&c^Av_pTorQzsSeV^Y82PAH_wm2)+<WQvCIRIHD*!D3cF& SAgTqV$0>>lQ_c$ikM>`?U7F?q diff --git a/eu_basynthec/sourceTest/java/eu/basynthec/cisd/dss/AbstractBaSynthecDataSetRegistratorTest.java b/eu_basynthec/sourceTest/java/eu/basynthec/cisd/dss/AbstractBaSynthecDataSetRegistratorTest.java index 3a05bdbb6ca..4cf7b8ae70c 100644 
--- a/eu_basynthec/sourceTest/java/eu/basynthec/cisd/dss/AbstractBaSynthecDataSetRegistratorTest.java
+++ b/eu_basynthec/sourceTest/java/eu/basynthec/cisd/dss/AbstractBaSynthecDataSetRegistratorTest.java
@@ -82,6 +82,15 @@ public abstract class AbstractBaSynthecDataSetRegistratorTest extends
                     one(openBisService).createDataSetCode();
                     will(returnValue(excelDataSetCode));
 
+                    // If there is a multistrain data set type, it needs to be taken care of in
+                    // addition to the normal one
+                    if (tsvDataSetType == TSV_MULTISTRAIN_EXPORT_DATA_SET_TYPE)
+                    {
+                        String tsvMultistrain = DATA_SET_CODE + "-TSV-MULTISTRAIN";
+                        one(openBisService).createDataSetCode();
+                        will(returnValue(tsvMultistrain));
+                    }
+
                     String tsvDataSetCode = DATA_SET_CODE + "-TSV";
                     one(openBisService).createDataSetCode();
                     will(returnValue(tsvDataSetCode));
@@ -103,13 +112,13 @@ public abstract class AbstractBaSynthecDataSetRegistratorTest extends
                     {
                         one(dataSetValidator).assertValidDataSet(
                                 TSV_MULTISTRAIN_EXPORT_DATA_SET_TYPE,
-                                new File(new File(stagingDirectory, tsvDataSetCode), "tsv-multi"));
-                    } else
-                    {
-                        one(dataSetValidator).assertValidDataSet(TSV_DATA_SET_TYPE,
-                                new File(new File(stagingDirectory, tsvDataSetCode), "tsv"));
+                                new File(new File(stagingDirectory, DATA_SET_CODE
+                                        + "-TSV-MULTISTRAIN"), "tsv-multi"));
                     }
+
+                    one(dataSetValidator).assertValidDataSet(TSV_DATA_SET_TYPE,
+                            new File(new File(stagingDirectory, tsvDataSetCode), "tsv"));
+
                     one(openBisService).performEntityOperations(with(atomicatOperationDetails));
                     will(returnValue(new AtomicEntityOperationResult()));
                 }
diff --git a/eu_basynthec/sourceTest/java/eu/basynthec/cisd/dss/TimeSeriesDataExcelTest.java b/eu_basynthec/sourceTest/java/eu/basynthec/cisd/dss/TimeSeriesDataExcelTest.java
index 0d566507204..d2be9831e7f 100644
--- a/eu_basynthec/sourceTest/java/eu/basynthec/cisd/dss/TimeSeriesDataExcelTest.java
+++ b/eu_basynthec/sourceTest/java/eu/basynthec/cisd/dss/TimeSeriesDataExcelTest.java
@@ -105,7 +105,8 @@ public class TimeSeriesDataExcelTest extends AssertJUnit
                 { "MGP1", "OD600", "0.05", "0.064" },
                 { "MGP100", "OD600", "0.05", "0.064" },
                 { "MGP20", "OD600", "0.05", "0.064" },
-                { "MGP999", "OD600", "0.05", "0.064" } };
+                { "MGP999", "OD600", "0.05", "0.064" },
+                { "MGP1", "OD600", "0.05", "0.064" } };
         assertLinesAreEqual(dataLines, expectedData);
     }
 
diff --git a/eu_basynthec/sourceTest/java/eu/basynthec/cisd/dss/growthprofiles/OD600DataSetRegistratorTest.java b/eu_basynthec/sourceTest/java/eu/basynthec/cisd/dss/growthprofiles/OD600DataSetRegistratorTest.java
index a8a036d19b6..8205496cd8a 100644
--- a/eu_basynthec/sourceTest/java/eu/basynthec/cisd/dss/growthprofiles/OD600DataSetRegistratorTest.java
+++ b/eu_basynthec/sourceTest/java/eu/basynthec/cisd/dss/growthprofiles/OD600DataSetRegistratorTest.java
@@ -16,10 +16,13 @@
 
 package eu.basynthec.cisd.dss.growthprofiles;
 
+import java.io.File;
 import java.io.IOException;
+import java.util.Arrays;
 import java.util.HashMap;
 import java.util.Properties;
 
+import org.apache.commons.io.FileUtils;
 import org.testng.annotations.Test;
 
 import ch.systemsx.cisd.common.test.RecordingMatcher;
@@ -50,7 +53,7 @@ public class OD600DataSetRegistratorTest extends AbstractBaSynthecDataSetRegistr
 
         handler.handle(markerFile);
 
-        assertEquals(3, atomicOperationDetails.recordedObject().getDataSetRegistrations().size());
+        assertEquals(4, atomicOperationDetails.recordedObject().getDataSetRegistrations().size());
 
         NewExternalData dataSet =
                 atomicOperationDetails.recordedObject().getDataSetRegistrations().get(0);
@@ -64,7 +67,20 @@ public class OD600DataSetRegistratorTest extends AbstractBaSynthecDataSetRegistr
         assertNotNull(strainProperty);
         assert null != strainProperty;
-        assertEquals("MGP1,MGP100,MGP20,MGP999", strainProperty.getValue());
+
+        NewExternalData tsvSplitDataSet =
+                atomicOperationDetails.recordedObject().getDataSetRegistrations().get(3);
+        String location = tsvSplitDataSet.getLocation();
+        File tsvSplitFolder = new File(workingDirectory, "/1/" + location);
+        String[] contents = tsvSplitFolder.list();
+        Arrays.sort(contents);
+        String[] expectedContents =
+                { "OD600-Example.xlsx_MGP1.tsv", "OD600-Example.xlsx_MGP100.tsv",
+                        "OD600-Example.xlsx_MGP20.tsv", "OD600-Example.xlsx_MGP999.tsv" };
+        assertEquals(Arrays.asList(expectedContents), Arrays.asList(contents));
+        File tsvSplitFile = new File(tsvSplitFolder, "OD600-Example.xlsx_MGP1.tsv");
+        checkTsvSplitContent(tsvSplitFile);
+
         context.assertIsSatisfied();
     }
 
@@ -73,4 +89,14 @@ public class OD600DataSetRegistratorTest extends AbstractBaSynthecDataSetRegistr
     {
         return "dist/etc/growth-profiles/";
     }
+
+    private void checkTsvSplitContent(File tsvFile) throws IOException
+    {
+        String content = FileUtils.readFileToString(tsvFile);
+        assertEquals(
+                "RunNumber\tHumanReadable\t-19020.0\t-17220.0\t-15360.0\t-13620.0\t-11820.0\t-10020.0\t-8220.0\t-7020.0\t-4920.0\t-2820.0\t-1020.0\t-120.0\t720.0\t1500.0\t3660.0\t5460.0\t6060.0\t7200.0\t9000.0\n"
+                        + "0\tOD600\t0.05\t0.064\t0.077\t0.089\t0.107\t0.127\t0.155\t0.176\t0.24\t0.33\t0.43\t0.49\t0.58\t0.66\t0.975\t1.42\t1.49\t2.09\t3.22\n"
+                        + "1\tOD600\t0.05\t0.064\t0.077\t0.089\t0.107\t0.127\t0.155\t0.176\t0.24\t0.33\t0.43\t0.49\t0.58\t0.66\t0.975\t1.42\t1.49\t2.09\t3.22",
+                content);
+    }
 }
diff --git a/eu_basynthec/sourceTest/java/eu/basynthec/cisd/dss/transcriptomics/TranscriptomicsDataSetRegistratorTest.java b/eu_basynthec/sourceTest/java/eu/basynthec/cisd/dss/transcriptomics/TranscriptomicsDataSetRegistratorTest.java
index d8da395faf3..1131553c802 100644
--- a/eu_basynthec/sourceTest/java/eu/basynthec/cisd/dss/transcriptomics/TranscriptomicsDataSetRegistratorTest.java
+++ b/eu_basynthec/sourceTest/java/eu/basynthec/cisd/dss/transcriptomics/TranscriptomicsDataSetRegistratorTest.java
@@ -18,6 +18,7 @@ package eu.basynthec.cisd.dss.transcriptomics;
 
 import java.io.File;
 import java.io.IOException;
+import java.util.Arrays;
 import java.util.HashMap;
 import java.util.Properties;
 
@@ -52,7 +53,7 @@ public class TranscriptomicsDataSetRegistratorTest extends AbstractBaSynthecData
 
         handler.handle(markerFile);
 
-        assertEquals(3, atomicOperationDetails.recordedObject().getDataSetRegistrations().size());
+        assertEquals(4, atomicOperationDetails.recordedObject().getDataSetRegistrations().size());
 
         NewExternalData dataSet =
                 atomicOperationDetails.recordedObject().getDataSetRegistrations().get(0);
@@ -75,6 +76,19 @@ public class TranscriptomicsDataSetRegistratorTest extends AbstractBaSynthecData
                 new File(new File(workingDirectory, "/1/" + location),
                         "Transcriptomics-Example.xlsx.tsv");
         checkTsvContent(tsvFile);
+
+        NewExternalData tsvSplitDataSet =
+                atomicOperationDetails.recordedObject().getDataSetRegistrations().get(3);
+        location = tsvSplitDataSet.getLocation();
+        File tsvSplitFolder = new File(workingDirectory, "/1/" + location);
+        String[] contents = tsvSplitFolder.list();
+        Arrays.sort(contents);
+        String[] expectedContents =
+                { "Transcriptomics-Example.xlsx_MGP253.tsv",
+                        "Transcriptomics-Example.xlsx_MGP776.tsv" };
+        assertEquals(Arrays.asList(expectedContents), Arrays.asList(contents));
+        File tsvSplitFile = new File(tsvSplitFolder, "Transcriptomics-Example.xlsx_MGP253.tsv");
"Transcriptomics-Example.xlsx_MGP253.tsv"); + checkSplitTsvContent(tsvSplitFile); context.assertIsSatisfied(); } @@ -102,4 +116,18 @@ public class TranscriptomicsDataSetRegistratorTest extends AbstractBaSynthecData + "BSU00260\t11.7669\t11.4658\n" + "BSU00270\t12.2675\t11.8745\n" + "BSU00280\t12.5574\t12.1608\n", content); } + + private void checkSplitTsvContent(File tsvFile) throws IOException + { + String content = FileUtils.readFileToString(tsvFile); + assertEquals("Locustag\t1 66687802\n" + "BSU00010\t13.7953\n" + "BSU00020\t13.5907\n" + + "BSU00030\t13.8489\n" + "BSU00040\t14.3564\n" + "BSU00050\t14.5239\n" + + "BSU00060\t14.3293\n" + "BSU00070\t14.481\n" + "BSU00090\t15.474\n" + + "BSU00100\t14.4332\n" + "BSU00110\t15.2669\n" + "BSU00120\t15.3344\n" + + "BSU_misc_RNA_1\t15.4497\n" + "BSU00130\t13.6604\n" + "BSU00180\t9.8208\n" + + "BSU_misc_RNA_2\t13.6614\n" + "BSU00190\t13.464\n" + "BSU00200\t14.6102\n" + + "BSU00210\t13.5285\n" + "BSU00220\t13.1007\n" + "BSU00230\t11.8547\n" + + "BSU00240\t10.8623\n" + "BSU00250\t11.6694\n" + "BSU00260\t11.7669\n" + + "BSU00270\t12.2675\n" + "BSU00280\t12.5574", content); + } } -- GitLab