diff --git a/eu_basynthec/dist/etc/data-set-validator.py b/eu_basynthec/dist/etc/data-set-validator.py index 6b1f13c649b0994381177bcf8d5bb76dc40f3155..8af662eac42eb6a9e624b607e88acef2ad3d84f7 100644 --- a/eu_basynthec/dist/etc/data-set-validator.py +++ b/eu_basynthec/dist/etc/data-set-validator.py @@ -2,5 +2,5 @@ import os import re def validate_data_set_file(file): - errors = [] - return errors + errors = [] + return errors diff --git a/eu_basynthec/dist/etc/growth-profiles/data-set-handler.py b/eu_basynthec/dist/etc/growth-profiles/data-set-handler.py index 4f4df43954f3d02971a0957700dd7cdde6c9974c..bc76e78a48d3ca0504b14bb9470e6dedd36dad79 100644 --- a/eu_basynthec/dist/etc/growth-profiles/data-set-handler.py +++ b/eu_basynthec/dist/etc/growth-profiles/data-set-handler.py @@ -2,58 +2,92 @@ from datetime import datetime from eu.basynthec.cisd.dss import TimeSeriesDataExcel def retrieve_experiment(tr, exp_id): - """Get the specified experiment form the server. Return the experiment.""" - if exp_id is None: - exp = None - else: - exp = tr.getExperiment(exp_id) - return exp + """Get the specified experiment from the server. Return the experiment.""" + if exp_id is None: + exp = None + else: + exp = tr.getExperiment(exp_id) + return exp def extract_strains(): - """Extract the strains from the data sheet""" - strains = [] - lines = timeSeriesData.getRawDataLines() - for i in range(1, len(lines)): - line = lines[i] - strains.append(line[0]) - return ",".join(strains) + """Extract the strains from the data sheet""" + strains = [] + lines = timeSeriesData.getRawDataLines() + for i in range(1, len(lines)): + line = lines[i] + strains.append(line[0]) + return ",".join(strains) def assign_properties(dataset, metadata): - """Assign properties to the data set from information in the data.""" - propertyNameMap = { - "STRAIN_NAMES": "STRAIN_NAMES", - "TIMEPOINT TYPE": "TIMEPOINT_TYPE", - "CELL LOCATION": "CELL_LOCATION", - "VALUE TYPE": "VALUE_TYPE", - "VALUE UNIT": "VALUE_UNIT", - "SCALE": "SCALE" - } - - for prop in metadata.keySet(): - key = propertyNameMap.get(prop) - if key is not None: - value = metadata.get(prop) - if (key == "STRAIN"): - value = value + " (STRAIN)" - dataset.setPropertyValue(key, value.upper()) - + """Assign properties to the data set from information in the data.""" + propertyNameMap = { + "STRAIN_NAMES": "STRAIN_NAMES", + "TIMEPOINT TYPE": "TIMEPOINT_TYPE", + "CELL LOCATION": "CELL_LOCATION", + "VALUE TYPE": "VALUE_TYPE", + "VALUE UNIT": "VALUE_UNIT", + "SCALE": "SCALE" + } + + for prop in metadata.keySet(): + key = propertyNameMap.get(prop) + if key is not None: + value = metadata.get(prop) + if (key == "STRAIN"): + value = value + " (STRAIN)" + dataset.setPropertyValue(key, value.upper()) + def convert_data_to_tsv(tr, dataset, location): - """Create a tsv file containing the data and add it to the data set.""" - tr.createNewDirectory(dataset, location) - tsvFileName = tr.createNewFile(dataset, location, incoming.getName() + ".tsv") - tsv = open(tsvFileName, 'w') - for line in timeSeriesData.getRawDataLines(): - for i in range(0, len(line) - 1): - tsv.write(line[i]) - tsv.write("\t") - tsv.write(line[len(line) - 1]) - tsv.write("\n") - tsv.close() - + """Create a tsv file containing the data and add it to the data set.""" + tr.createNewDirectory(dataset, location) + tsvFileName = tr.createNewFile(dataset, location, incoming.getName() + ".tsv") + tsv = open(tsvFileName, 'w') + for line in timeSeriesData.getRawDataLines(): + for i in range(0, len(line) - 1): + tsv.write(line[i])
+ tsv.write("\t") + tsv.write(line[len(line) - 1]) + tsv.write("\n") + tsv.close() + +def convert_data_to_split_tsv(tr, dataset, location): + """Create one tsv file per strain in the original data.""" + raw_data_lines = timeSeriesData.getRawDataLines() + + # Extract the header -- this is shared by all files + header_line = raw_data_lines[0] + # In the header we don't need the strain, but we start with a run number + header = 'RunNumber\t' + '\t'.join(header_line[1:len(header_line)]) + + tr.createNewDirectory(dataset, location) + + # Keep track of the strains, since a strain can be measured multiple times + data_per_strain = {} + + lines_len = len(raw_data_lines) + for i in range(1, len(raw_data_lines)): + line = raw_data_lines[i] + strain_name = line[0] + strain_data = data_per_strain.setdefault(strain_name, []) + # Append the line -- this is run number + the data + strain_data.append(str(len(strain_data)) + '\t' + '\t'.join(line[1:len(line)])) + + # Create the files + for strain in data_per_strain.iterkeys(): + tsvFileName = tr.createNewFile(dataset, location, incoming.getName() + "_" + strain + ".tsv") + tsv = open(tsvFileName, 'w') + tsv.write(header) + + strain_data = data_per_strain[strain] + for line in strain_data: + tsv.write("\n") + tsv.write(line) + tsv.close() + def store_original_data(tr, dataset, location): - """Put the original data into the data set.""" - tr.createNewDirectory(dataset, location) - tr.moveFile(incoming.getAbsolutePath(), dataset, location + "/" + incoming.getName()) + """Put the original data into the data set.""" + tr.createNewDirectory(dataset, location) + tr.moveFile(incoming.getAbsolutePath(), dataset, location + "/" + incoming.getName()) tr = service.transaction(incoming) @@ -73,6 +107,9 @@ store_original_data(tr, original_dataset, "xls") tsv_dataset = tr.createNewDataSet("TSV_MULTISTRAIN_EXPORT") convert_data_to_tsv(tr, tsv_dataset, "tsv-multi") +tsv_split_dataset = tr.createNewDataSet("TSV_EXPORT") +convert_data_to_split_tsv(tr, tsv_split_dataset, "tsv") + # Make the original contain these contained_codes = [original_dataset.getDataSetCode(), tsv_dataset.getDataSetCode()] dataset.setContainedDataSetCodes(contained_codes) @@ -80,10 +117,10 @@ dataset.setContainedDataSetCodes(contained_codes) # If no experiment has been set, then get the experiment from the excel file if dataset.getExperiment() is None: - exp_id = metadata.get("EXPERIMENT") - exp = retrieve_experiment(tr, exp_id) - if exp is not None: - dataset.setExperiment(exp) - original_dataset.setExperiment(exp) - tsv_dataset.setExperiment(exp) - + exp_id = metadata.get("EXPERIMENT") + exp = retrieve_experiment(tr, exp_id) + if exp is not None: + dataset.setExperiment(exp) + original_dataset.setExperiment(exp) + tsv_dataset.setExperiment(exp) + tsv_split_dataset.setExperiment(exp) diff --git a/eu_basynthec/dist/etc/growth-profiles/data-set-validator.py b/eu_basynthec/dist/etc/growth-profiles/data-set-validator.py index b2f2c0bd96c771fa775a77043cee990abab5543f..3bdb793b652e0b95a06eb65889060650152b66be 100644 --- a/eu_basynthec/dist/etc/growth-profiles/data-set-validator.py +++ b/eu_basynthec/dist/etc/growth-profiles/data-set-validator.py @@ -1,60 +1,60 @@ def validate_data(timeSeriesData, errors): - dataLines = timeSeriesData.getRawDataLines() - lineCount = 0 - for line in dataLines: - # The header needs to be Abs - if lineCount is 0: - if line[0] != "Strain": - errors.append(createFileValidationError("The first data column must be 'Strain'")) - break - lineCount = lineCount + 1 - continue + 
dataLines = timeSeriesData.getRawDataLines() + lineCount = 0 + for line in dataLines: + # The header needs to be 'Strain' + if lineCount == 0: + if line[0] != "Strain": + errors.append(createFileValidationError("The first data column must be 'Strain'")) + break + lineCount = lineCount + 1 + continue - # The compound id should be one of these forms - strain = line[0] - if not isStrainIdValid(strain): - errors.append(createFileValidationError("Line " + str(lineCount + 1) + ", column 1 must be MGP[0-999] (instead of " + strain + ").")) - lineCount = lineCount + 1 + # The strain id should be of the form MGP[0-999] + strain = line[0] + if not isStrainIdValid(strain): + errors.append(createFileValidationError("Line " + str(lineCount + 1) + ", column 1 must be MGP[0-999] (instead of " + strain + ").")) + lineCount = lineCount + 1 def validate_metadata(time_series_data, errors): - metadata = time_series_data.getMetadataMap() - validationHelper = ValidationHelper(metadata, errors) - - # validate the header format - validationHelper.validateDefaultHeaderFormat() - - # validate the timepoint type - validationHelper.validateControlledVocabularyProperty("TIMEPOINT TYPE", - "time point type", ['EX', 'IN', 'SI'], "'EX', 'IN', 'SI'") + metadata = time_series_data.getMetadataMap() + validationHelper = ValidationHelper(metadata, errors) + + # validate the header format + validationHelper.validateDefaultHeaderFormat() + + # validate the timepoint type + validationHelper.validateControlledVocabularyProperty("TIMEPOINT TYPE", + "time point type", ['EX', 'IN', 'SI'], "'EX', 'IN', 'SI'") - # validate the cell location - validationHelper.validateControlledVocabularyProperty("CELL LOCATION", - "cell location", ['CE', 'ES', 'ME', 'CY', 'NC'], "'CE', 'ES', 'ME', 'CY', 'NC'") + # validate the cell location + validationHelper.validateControlledVocabularyProperty("CELL LOCATION", + "cell location", ['CE', 'ES', 'ME', 'CY', 'NC'], "'CE', 'ES', 'ME', 'CY', 'NC'") - # validate the value type - validationHelper.validateControlledVocabularyProperty("VALUE TYPE", - "value type", ['VALUE', 'MEAN', 'MEDIAN', 'STD', 'VAR', 'ERROR', 'IQR'], - "'Value', 'Mean', 'Median', 'Std', 'Var', 'Error', 'Iqr'") + # validate the value type + validationHelper.validateControlledVocabularyProperty("VALUE TYPE", + "value type", ['VALUE', 'MEAN', 'MEDIAN', 'STD', 'VAR', 'ERROR', 'IQR'], + "'Value', 'Mean', 'Median', 'Std', 'Var', 'Error', 'Iqr'") - # validate the value unit - validationHelper.validateControlledVocabularyProperty("VALUE UNIT", - "value unit", ['DIMENSIONLESS'], "'DIMENSIONLESS'") - - # validate the scale - validationHelper.validateControlledVocabularyProperty("SCALE", "scale", - ['LIN', 'LOG2', 'LOG10', 'LN'], "'lin', 'log2', 'log10', 'ln'") + # validate the value unit + validationHelper.validateControlledVocabularyProperty("VALUE UNIT", + "value unit", ['DIMENSIONLESS'], "'DIMENSIONLESS'") + + # validate the scale + validationHelper.validateControlledVocabularyProperty("SCALE", "scale", + ['LIN', 'LOG2', 'LOG10', 'LN'], "'lin', 'log2', 'log10', 'ln'") def validate_data_set_file(file): - errors = [] - time_series_data = create_time_series_excel(file.getAbsolutePath()) - if time_series_data is None: - errors.append(createFileValidationError(file.getName() + " is not an Excel file.")) - return errors - - # validate the metadata - validate_metadata(time_series_data, errors) - - # validate the data - validate_data(time_series_data, errors) - - return errors + errors = [] + time_series_data = create_time_series_excel(file.getAbsolutePath()) 
+ if time_series_data is None: + errors.append(createFileValidationError(file.getName() + " is not an Excel file.")) + return errors + + # validate the metadata + validate_metadata(time_series_data, errors) + + # validate the data + validate_data(time_series_data, errors) + + return errors diff --git a/eu_basynthec/dist/etc/metabolomics/data-set-handler.py b/eu_basynthec/dist/etc/metabolomics/data-set-handler.py index 9cdbf69004f4efaa291af3b51d81335adacebc50..56737bf7a8dc2b5897a402cbc9c9c36daf32056c 100644 --- a/eu_basynthec/dist/etc/metabolomics/data-set-handler.py +++ b/eu_basynthec/dist/etc/metabolomics/data-set-handler.py @@ -2,47 +2,47 @@ from datetime import datetime from eu.basynthec.cisd.dss import TimeSeriesDataExcel def retrieve_experiment(tr, exp_id): - """Get the specified experiment form the server. Return the experiment.""" - if exp_id is None: - exp = None - else: - exp = tr.getExperiment(exp_id) - return exp + """Get the specified experiment from the server. Return the experiment.""" + if exp_id is None: + exp = None + else: + exp = tr.getExperiment(exp_id) + return exp def assign_properties(dataset, metadata): - """Assign properties to the data set from information in the data.""" - propertyNameMap = { - "STRAIN":"STRAIN_NAMES", - "TIMEPOINT TYPE": "TIMEPOINT_TYPE", - "CELL LOCATION": "CELL_LOCATION", - "VALUE TYPE": "VALUE_TYPE", - "VALUE UNIT": "VALUE_UNIT", - "SCALE": "SCALE" - } - - for prop in metadata.keySet(): - key = propertyNameMap.get(prop) - if key is not None: - value = metadata.get(prop) - dataset.setPropertyValue(key, value.upper()) - + """Assign properties to the data set from information in the data.""" + propertyNameMap = { + "STRAIN":"STRAIN_NAMES", + "TIMEPOINT TYPE": "TIMEPOINT_TYPE", + "CELL LOCATION": "CELL_LOCATION", + "VALUE TYPE": "VALUE_TYPE", + "VALUE UNIT": "VALUE_UNIT", + "SCALE": "SCALE" + } + + for prop in metadata.keySet(): + key = propertyNameMap.get(prop) + if key is not None: + value = metadata.get(prop) + dataset.setPropertyValue(key, value.upper()) + def convert_data_to_tsv(tr, dataset, location): - """Create a tsv file containing the data and add it to the data set.""" - tr.createNewDirectory(dataset, location) - tsvFileName = tr.createNewFile(dataset, location, incoming.getName() + ".tsv") - tsv = open(tsvFileName, 'w') - for line in timeSeriesData.getRawDataLines(): - for i in range(0, len(line) - 1): - tsv.write(line[i]) - tsv.write("\t") - tsv.write(line[len(line) - 1]) - tsv.write("\n") - tsv.close() - + """Create a tsv file containing the data and add it to the data set.""" + tr.createNewDirectory(dataset, location) + tsvFileName = tr.createNewFile(dataset, location, incoming.getName() + ".tsv") + tsv = open(tsvFileName, 'w') + for line in timeSeriesData.getRawDataLines(): + for i in range(0, len(line) - 1): + tsv.write(line[i]) + tsv.write("\t") + tsv.write(line[len(line) - 1]) + tsv.write("\n") + tsv.close() + def store_original_data(tr, dataset, location): - """Put the original data into the data set.""" - tr.createNewDirectory(dataset, location) - tr.moveFile(incoming.getAbsolutePath(), dataset, location + "/" + incoming.getName()) + """Put the original data into the data set.""" + tr.createNewDirectory(dataset, location) + tr.moveFile(incoming.getAbsolutePath(), dataset, location + "/" + incoming.getName()) tr = service.transaction(incoming) @@ -52,7 +52,7 @@ timeSeriesData = TimeSeriesDataExcel.createTimeSeriesDataExcel(incoming.getAbsol dataset = tr.createNewDataSet("METABOLITE_INTENSITIES") metadata = 
timeSeriesData.getMetadataMap() assign_properties(dataset, metadata) - + # Store the original and tsv data in data sets original_dataset = tr.createNewDataSet("EXCEL_ORIGINAL") store_original_data(tr, original_dataset, "xls") @@ -67,11 +67,11 @@ dataset.setContainedDataSetCodes(contained_codes) # If no experiment has been set, then get the experiment from the excel file if dataset.getExperiment() is None: - exp_id = metadata.get("EXPERIMENT") - exp = retrieve_experiment(tr, exp_id) - if exp is not None: - dataset.setExperiment(exp) - original_dataset.setExperiment(exp) - tsv_dataset.setExperiment(exp) + exp_id = metadata.get("EXPERIMENT") + exp = retrieve_experiment(tr, exp_id) + if exp is not None: + dataset.setExperiment(exp) + original_dataset.setExperiment(exp) + tsv_dataset.setExperiment(exp) diff --git a/eu_basynthec/dist/etc/metabolomics/data-set-validator.py b/eu_basynthec/dist/etc/metabolomics/data-set-validator.py index 79d9b3d462a839c036e486bb622d313d95d2301a..0e77b3af1eb1f26b40069c452d377943446f6517 100644 --- a/eu_basynthec/dist/etc/metabolomics/data-set-validator.py +++ b/eu_basynthec/dist/etc/metabolomics/data-set-validator.py @@ -1,67 +1,67 @@ def validate_data(time_series_data, errors): - chebiRegex = re.compile("^CHEBI:[0-9]+") - bsbmeRegex = re.compile("^BSBME:[0-9]+") - dataLines = time_series_data.getRawDataLines() - lineCount = 0 - for line in dataLines: - # The header needs to be CompoundID - if lineCount is 0: - if line[0] != "CompoundID": - errors.append(createFileValidationError("The first data column must be 'CompoundID'")) - break - lineCount = lineCount + 1 - continue + chebiRegex = re.compile("^CHEBI:[0-9]+") + bsbmeRegex = re.compile("^BSBME:[0-9]+") + dataLines = time_series_data.getRawDataLines() + lineCount = 0 + for line in dataLines: + # The header needs to be CompoundID + if lineCount == 0: + if line[0] != "CompoundID": + errors.append(createFileValidationError("The first data column must be 'CompoundID'")) + break + lineCount = lineCount + 1 + continue - # The compound id should be one of these forms - compoundId = line[0] - if not chebiRegex.match(compoundId): - if not bsbmeRegex.match(compoundId): - errors.append(createFileValidationError("Line " + str(lineCount + 1) + ", column 1 must be of the format 'CHEBI:#' or 'BSBME:#' (instead of " + compoundId + ").")) - lineCount = lineCount + 1 - + # The compound id should be one of these forms + compoundId = line[0] + if not chebiRegex.match(compoundId): + if not bsbmeRegex.match(compoundId): + errors.append(createFileValidationError("Line " + str(lineCount + 1) + ", column 1 must be of the format 'CHEBI:#' or 'BSBME:#' (instead of " + compoundId + ").")) + lineCount = lineCount + 1 + def validate_metadata(time_series_data, errors): - metadata = time_series_data.getMetadataMap() - validationHelper = ValidationHelper(metadata, errors) - - # validate the strain - validationHelper.validateStrain() - - # validate the header format - validationHelper.validateDefaultHeaderFormat() - - # validate the timepoint type - validationHelper.validateControlledVocabularyProperty("TIMEPOINT TYPE", - "time point type", ['EX', 'IN', 'SI'], "'EX', 'IN', 'SI'") + metadata = time_series_data.getMetadataMap() + validationHelper = ValidationHelper(metadata, errors) + + # validate the strain + validationHelper.validateStrain() + + # validate the header format + validationHelper.validateDefaultHeaderFormat() + + # validate the timepoint type + validationHelper.validateControlledVocabularyProperty("TIMEPOINT TYPE", + "time point 
type", ['EX', 'IN', 'SI'], "'EX', 'IN', 'SI'") - # validate the cell location - validationHelper.validateControlledVocabularyProperty("CELL LOCATION", - "cell location", ['CE', 'ES', 'ME', 'CY', 'NC'], "'CE', 'ES', 'ME', 'CY', 'NC'") + # validate the cell location + validationHelper.validateControlledVocabularyProperty("CELL LOCATION", + "cell location", ['CE', 'ES', 'ME', 'CY', 'NC'], "'CE', 'ES', 'ME', 'CY', 'NC'") - # validate the value type - validationHelper.validateControlledVocabularyProperty("VALUE TYPE", - "value type", ['VALUE', 'MEAN', 'MEDIAN', 'STD', 'VAR', 'ERROR', 'IQR'], - "'Value', 'Mean', 'Median', 'Std', 'Var', 'Error', 'Iqr'") + # validate the value type + validationHelper.validateControlledVocabularyProperty("VALUE TYPE", + "value type", ['VALUE', 'MEAN', 'MEDIAN', 'STD', 'VAR', 'ERROR', 'IQR'], + "'Value', 'Mean', 'Median', 'Std', 'Var', 'Error', 'Iqr'") - # validate the value unit - validationHelper.validateControlledVocabularyProperty("VALUE UNIT", - "value unit", ['MM', 'UM', 'RATIOT1', 'RATIOCS'], "'mM', 'uM', 'RatioT1', 'RatioCs'") - - # validate the scale - validationHelper.validateControlledVocabularyProperty("SCALE", "scale", - ['LIN', 'LOG2', 'LOG10', 'LN'], "'lin', 'log2', 'log10', 'ln'") + # validate the value unit + validationHelper.validateControlledVocabularyProperty("VALUE UNIT", + "value unit", ['MM', 'UM', 'RATIOT1', 'RATIOCS'], "'mM', 'uM', 'RatioT1', 'RatioCs'") + + # validate the scale + validationHelper.validateControlledVocabularyProperty("SCALE", "scale", + ['LIN', 'LOG2', 'LOG10', 'LN'], "'lin', 'log2', 'log10', 'ln'") def validate_data_set_file(file): - errors = [] - time_series_data = create_time_series_excel(file.getAbsolutePath()) - if time_series_data is None: - errors.append(createFileValidationError(file.getName() + " is not an Excel file.")) - return errors - - # validate the metadata - validate_metadata(time_series_data, errors) - - # validate the data - validate_data(time_series_data, errors) - - return errors + errors = [] + time_series_data = create_time_series_excel(file.getAbsolutePath()) + if time_series_data is None: + errors.append(createFileValidationError(file.getName() + " is not an Excel file.")) + return errors + + # validate the metadata + validate_metadata(time_series_data, errors) + + # validate the data + validate_data(time_series_data, errors) + + return errors diff --git a/eu_basynthec/dist/etc/proteomics/data-set-handler.py b/eu_basynthec/dist/etc/proteomics/data-set-handler.py index 4ab87d54b496d78b196f88b4945206769bd321ed..369cf170b5870c353a48827190526ffa623b8332 100644 --- a/eu_basynthec/dist/etc/proteomics/data-set-handler.py +++ b/eu_basynthec/dist/etc/proteomics/data-set-handler.py @@ -2,48 +2,48 @@ from datetime import datetime from eu.basynthec.cisd.dss import TimeSeriesDataExcel def retrieve_experiment(tr, exp_id): - """Get the specified experiment form the server. Return the experiment.""" - if exp_id is None: - exp = None - else: - exp = tr.getExperiment(exp_id) - return exp + """Get the specified experiment form the server. 
Return the experiment.""" + if exp_id is None: + exp = None + else: + exp = tr.getExperiment(exp_id) + return exp def assign_properties(dataset, metadata): - """Assign properties to the data set from information in the data.""" - propertyNameMap = { - "STRAIN":"STRAIN_NAMES", - "TIMEPOINT TYPE": "TIMEPOINT_TYPE", - "CELL LOCATION": "CELL_LOCATION", - "VALUE UNIT": "VALUE_UNIT", - "SCALE": "SCALE" - } - - for prop in metadata.keySet(): - key = propertyNameMap.get(prop) - if key is not None: - value = metadata.get(prop) - if (key == "STRAIN"): - value = value + " (STRAIN)" - dataset.setPropertyValue(key, value.upper()) - + """Assign properties to the data set from information in the data.""" + propertyNameMap = { + "STRAIN":"STRAIN_NAMES", + "TIMEPOINT TYPE": "TIMEPOINT_TYPE", + "CELL LOCATION": "CELL_LOCATION", + "VALUE UNIT": "VALUE_UNIT", + "SCALE": "SCALE" + } + + for prop in metadata.keySet(): + key = propertyNameMap.get(prop) + if key is not None: + value = metadata.get(prop) + if (key == "STRAIN"): + value = value + " (STRAIN)" + dataset.setPropertyValue(key, value.upper()) + def convert_data_to_tsv(tr, dataset, location): - """Create a tsv file containing the data and add it to the data set.""" - tr.createNewDirectory(dataset, location) - tsvFileName = tr.createNewFile(dataset, location, incoming.getName() + ".tsv") - tsv = open(tsvFileName, 'w') - for line in timeSeriesData.getRawDataLines(): - for i in range(0, len(line) - 1): - tsv.write(line[i]) - tsv.write("\t") - tsv.write(line[len(line) - 1]) - tsv.write("\n") - tsv.close() - + """Create a tsv file containing the data and add it to the data set.""" + tr.createNewDirectory(dataset, location) + tsvFileName = tr.createNewFile(dataset, location, incoming.getName() + ".tsv") + tsv = open(tsvFileName, 'w') + for line in timeSeriesData.getRawDataLines(): + for i in range(0, len(line) - 1): + tsv.write(line[i]) + tsv.write("\t") + tsv.write(line[len(line) - 1]) + tsv.write("\n") + tsv.close() + def store_original_data(tr, dataset, location): - """Put the original data into the data set.""" - tr.createNewDirectory(dataset, location) - tr.moveFile(incoming.getAbsolutePath(), dataset, location + "/" + incoming.getName()) + """Put the original data into the data set.""" + tr.createNewDirectory(dataset, location) + tr.moveFile(incoming.getAbsolutePath(), dataset, location + "/" + incoming.getName()) tr = service.transaction(incoming) @@ -68,11 +68,11 @@ dataset.setContainedDataSetCodes(contained_codes) # If no experiment has been set, then get the experiment from the excel file if dataset.getExperiment() is None: - exp_id = metadata.get("EXPERIMENT") - exp = retrieve_experiment(tr, exp_id) - if exp is not None: - dataset.setExperiment(exp) - original_dataset.setExperiment(exp) - tsv_dataset.setExperiment(exp) + exp_id = metadata.get("EXPERIMENT") + exp = retrieve_experiment(tr, exp_id) + if exp is not None: + dataset.setExperiment(exp) + original_dataset.setExperiment(exp) + tsv_dataset.setExperiment(exp) diff --git a/eu_basynthec/dist/etc/proteomics/data-set-validator.py b/eu_basynthec/dist/etc/proteomics/data-set-validator.py index 63c80b68f355d2ea9860840d5b2e94598d4313eb..c794234bf4bacd6721d74a9f4a2ec36f33dedbc5 100644 --- a/eu_basynthec/dist/etc/proteomics/data-set-validator.py +++ b/eu_basynthec/dist/etc/proteomics/data-set-validator.py @@ -1,69 +1,69 @@ def validate_data(time_series_data, errors): - gene_locus_regex = re.compile("^BSU[0-9]+|^BSU_misc_RNA_[0-9]+|^VMG_[0-9]+_[0-9]+(_c)?") - column_header_regex = 
re.compile("(\+|-)?[0-9]+::(value|mean|median|std|var|error|iqr)") - dataLines = time_series_data.getRawDataLines() - lineCount = 0 - for line in dataLines: - # The header needs to be GeneLocus - if lineCount is 0: - if line[0] != "GeneLocus": - errors.append(createFileValidationError("The first data column must be 'GeneLocus'")) - break - lineCount = lineCount + 1 - has_human_readable = line[1] == "HumanReadable" - - if has_human_readable: - range_start = 2 - else: - range_start = 1 - for i in range(range_start, len(line)): - if not column_header_regex.match(line[i].lower()): - errors.append(createFileValidationError("Column " + str(i) + " header must be of the format Timepoint::(value|mean|median|std|var|error|iqr), (instead of " + line[i] + ").")) - continue + gene_locus_regex = re.compile("^BSU[0-9]+|^BSU_misc_RNA_[0-9]+|^VMG_[0-9]+_[0-9]+(_c)?") + column_header_regex = re.compile("(\+|-)?[0-9]+::(value|mean|median|std|var|error|iqr)") + dataLines = time_series_data.getRawDataLines() + lineCount = 0 + for line in dataLines: + # The header needs to be GeneLocus + if lineCount is 0: + if line[0] != "GeneLocus": + errors.append(createFileValidationError("The first data column must be 'GeneLocus'")) + break + lineCount = lineCount + 1 + has_human_readable = line[1] == "HumanReadable" + + if has_human_readable: + range_start = 2 + else: + range_start = 1 + for i in range(range_start, len(line)): + if not column_header_regex.match(line[i].lower()): + errors.append(createFileValidationError("Column " + str(i) + " header must be of the format Timepoint::(value|mean|median|std|var|error|iqr), (instead of " + line[i] + ").")) + continue - # The compound id should be one of these forms - gene_locus = line[0] - if not gene_locus_regex.match(gene_locus): - errors.append(createFileValidationError("Line " + str(lineCount + 1) + ", column 1 must be of the format 'BSU#', 'BSU_misc_RNA_#', 'VMG_#_#', or 'VMG_#_#_c' (instead of " + gene_locus + ").")) - lineCount = lineCount + 1 - + # The compound id should be one of these forms + gene_locus = line[0] + if not gene_locus_regex.match(gene_locus): + errors.append(createFileValidationError("Line " + str(lineCount + 1) + ", column 1 must be of the format 'BSU#', 'BSU_misc_RNA_#', 'VMG_#_#', or 'VMG_#_#_c' (instead of " + gene_locus + ").")) + lineCount = lineCount + 1 + def validate_metadata(time_series_data, errors): - metadata = time_series_data.getMetadataMap() - validationHelper = ValidationHelper(metadata, errors) - - # validate the strain - validationHelper.validateStrain() - - # validate the header format - validationHelper.validateExplicitHeaderFormat("TIME::VALUE_TYPE") - - # validate the timepoint type - validationHelper.validateControlledVocabularyProperty("TIMEPOINT TYPE", - "time point type", ['EX', 'IN', 'SI'], "'EX', 'IN', 'SI'") + metadata = time_series_data.getMetadataMap() + validationHelper = ValidationHelper(metadata, errors) + + # validate the strain + validationHelper.validateStrain() + + # validate the header format + validationHelper.validateExplicitHeaderFormat("TIME::VALUE_TYPE") + + # validate the timepoint type + validationHelper.validateControlledVocabularyProperty("TIMEPOINT TYPE", + "time point type", ['EX', 'IN', 'SI'], "'EX', 'IN', 'SI'") - # validate the cell location - validationHelper.validateControlledVocabularyProperty("CELL LOCATION", - "cell location", ['CE', 'ES', 'ME', 'CY', 'NC'], "'CE', 'ES', 'ME', 'CY', 'NC'") + # validate the cell location + validationHelper.validateControlledVocabularyProperty("CELL LOCATION", + 
"cell location", ['CE', 'ES', 'ME', 'CY', 'NC'], "'CE', 'ES', 'ME', 'CY', 'NC'") - # validate the value unit - validationHelper.validateControlledVocabularyProperty("VALUE UNIT", - "value unit", ['MM', 'UM', 'PERCENT', 'RATIOT1', 'RATIOCS', 'AU', 'DIMENSIONLESS'], "'mM', 'uM', 'Percent', 'RatioT1', 'RatioCs', 'AU', 'Dimensionless'") - - # validate the scale - validationHelper.validateControlledVocabularyProperty("SCALE", "scale", - ['LIN', 'LOG2', 'LOG10', 'LN'], "'lin', 'log2', 'log10', 'ln'") + # validate the value unit + validationHelper.validateControlledVocabularyProperty("VALUE UNIT", + "value unit", ['MM', 'UM', 'PERCENT', 'RATIOT1', 'RATIOCS', 'AU', 'DIMENSIONLESS'], "'mM', 'uM', 'Percent', 'RatioT1', 'RatioCs', 'AU', 'Dimensionless'") + + # validate the scale + validationHelper.validateControlledVocabularyProperty("SCALE", "scale", + ['LIN', 'LOG2', 'LOG10', 'LN'], "'lin', 'log2', 'log10', 'ln'") def validate_data_set_file(file): - errors = [] - time_series_data = create_time_series_excel(file.getAbsolutePath()) - if time_series_data is None: - errors.append(createFileValidationError(file.getName() + " is not an Excel file.")) - return errors - - # validate the metadata - validate_metadata(time_series_data, errors) - - # validate the data - validate_data(time_series_data, errors) - - return errors + errors = [] + time_series_data = create_time_series_excel(file.getAbsolutePath()) + if time_series_data is None: + errors.append(createFileValidationError(file.getName() + " is not an Excel file.")) + return errors + + # validate the metadata + validate_metadata(time_series_data, errors) + + # validate the data + validate_data(time_series_data, errors) + + return errors diff --git a/eu_basynthec/dist/etc/shared/shared-classes.py b/eu_basynthec/dist/etc/shared/shared-classes.py index 6d4747656921cb2dba245074720223cf45cca2b6..72e6fd1ae055d722a81e87c9f2db06c635d0ef00 100644 --- a/eu_basynthec/dist/etc/shared/shared-classes.py +++ b/eu_basynthec/dist/etc/shared/shared-classes.py @@ -13,156 +13,156 @@ OPENBIS_METADATA_SHEET_NAME = "openbis-metadata" OPENBIS_DATA_SHEET_NAME = "openbis-data" class TimeSeriesDataExcel: - """ - An abstraction for accessing time series data following the BaSynthec conventions - from an Excel file. This class ported from Java, thus the camelCase naming. - """ - def __init__(self, file, fileReader): - self.file = file - self.fileReader = fileReader - - def getRawMetadataLines(self): - """Get the raw lines of the metadata sheet.""" - try: - return self.fileReader.readLines(OPENBIS_METADATA_SHEET_NAME); - except IOException, ex: - operationLog.error("Could not read data from [file: " + self.file.getPath() + ", sheet: " - + OPENBIS_METADATA_SHEET_NAME + "]", ex) - return [] + """ + An abstraction for accessing time series data following the BaSynthec conventions + from an Excel file. This class ported from Java, thus the camelCase naming. 
+ """ + def __init__(self, file, fileReader): + self.file = file + self.fileReader = fileReader + + def getRawMetadataLines(self): + """Get the raw lines of the metadata sheet.""" + try: + return self.fileReader.readLines(OPENBIS_METADATA_SHEET_NAME); + except IOException, ex: + operationLog.error("Could not read data from [file: " + self.file.getPath() + ", sheet: " + + OPENBIS_METADATA_SHEET_NAME + "]", ex) + return [] - def getRawDataLines(self): - """Get the raw lines of the data sheet.""" - try: - return self.fileReader.readLines(OPENBIS_DATA_SHEET_NAME) - except IOException, ex: - operationLog.error("Could not read data from [file: " + file.getPath() + ", sheet: " - + OPENBIS_DATA_SHEET_NAME + "]", ex) - return [] + def getRawDataLines(self): + """Get the raw lines of the data sheet.""" + try: + return self.fileReader.readLines(OPENBIS_DATA_SHEET_NAME) + except IOException, ex: + operationLog.error("Could not read data from [file: " + file.getPath() + ", sheet: " + + OPENBIS_DATA_SHEET_NAME + "]", ex) + return [] - def getMetadataMap(self): - """ - Return the metadata has a hashmap, with all keys uppercased. - - Assumes the metadata sheet corresponds to the following format: [Property] [Value] [... stuff - that can be ignored], that is the property name is in column 1 and property value is in - column 2, and everything else can be ignored. - """ - metadataMap = {} - metadataLines = self.getRawMetadataLines() - - # Skip the first line, this is just the header - for i in range(1, metadataLines.size()): - line = metadataLines.get(i) - value = line[1]; - if "BLANK" == value: - value = None - metadataMap[line[0].upper()] = value - return metadataMap - + def getMetadataMap(self): + """ + Return the metadata has a hashmap, with all keys uppercased. + + Assumes the metadata sheet corresponds to the following format: [Property] [Value] [... stuff + that can be ignored], that is the property name is in column 1 and property value is in + column 2, and everything else can be ignored. + """ + metadataMap = {} + metadataLines = self.getRawMetadataLines() + + # Skip the first line, this is just the header + for i in range(1, metadataLines.size()): + line = metadataLines.get(i) + value = line[1]; + if "BLANK" == value: + value = None + metadataMap[line[0].upper()] = value + return metadataMap + def create_time_series_excel(fileName): - """Factory method for the TimeSeriesData object. Returns None if it cannot be created.""" - file = java.io.File(fileName) - try: - workbook = ExcelFileReader.getExcelWorkbook(file) - fileReader = ExcelFileReader(workbook, True) - return TimeSeriesDataExcel(file, fileReader) - except IllegalArgumentException, ex: - operationLog.error("Could not open file [" + fileName + "] as Excel data.", ex) - except IOException, ex: - operationLog.error("Could not open file [" + fileName + "] as Excel data.", ex) - return None + """Factory method for the TimeSeriesData object. Returns None if it cannot be created.""" + file = java.io.File(fileName) + try: + workbook = ExcelFileReader.getExcelWorkbook(file) + fileReader = ExcelFileReader(workbook, True) + return TimeSeriesDataExcel(file, fileReader) + except IllegalArgumentException, ex: + operationLog.error("Could not open file [" + fileName + "] as Excel data.", ex) + except IOException, ex: + operationLog.error("Could not open file [" + fileName + "] as Excel data.", ex) + return None - + class ValidationHelper: - """ - Methods for simplifying validation in BaSynthec. - This class is ported from Java, thus the camelCase naming. 
- """ - def __init__(self, metadataMap, errors): - self.metadataMap = metadataMap - self.errors = errors + """ + Methods for simplifying validation in BaSynthec. + This class is ported from Java, thus the camelCase naming. + """ + def __init__(self, metadataMap, errors): + self.metadataMap = metadataMap + self.errors = errors - def checkIsSpecified(self, property, displayName): - """Verify that a property is specified; if not, add a validation error to the list.""" - if self.metadataMap.get(property) is None: - self.errors.append(ValidationError.createFileValidationError("A " + displayName - + " must be specified.")) - return False - return True - - def validateStrain(self): - """Verify that the strain is specified and of the correct format""" - if not self.checkIsSpecified("STRAIN", "strain"): - return - strain = self.metadataMap.get("STRAIN") - if not isStrainIdValid(strain): - self.errors.append(createFileValidationError("Strain must be MGP[0-999] (instead of " + strain + ").")) - - def validateDefaultHeaderFormat(self): - """Validate that header format is either not specified or matches default (TIME)""" - if self.metadataMap.get("HEADER FORMAT") is None: - return - format = self.metadataMap.get("HEADER FORMAT") - expected_format = "TIME" - if expected_format != format: - self.errors.append(createFileValidationError("Header format must be " + expected_format + " (not " + format + ").")) - - def validateExplicitHeaderFormat(self, expected_format): - """Validate that header format is specified and matches the expected_format argument""" - if not self.checkIsSpecified("HEADER FORMAT", "header format"): - return - format = self.metadataMap.get("HEADER FORMAT") - if expected_format != format: - self.errors.append(createFileValidationError("Header format must be " + expected_format + " (not " + format + ").")) - - def validateControlledVocabularyProperty(self, property, displayName, allowedValues, allowedValuesDisplay): - """Validate that the property is specified and in the list of allowed values""" - if not self.checkIsSpecified(property, displayName): - return - value = self.metadataMap.get(property).upper() - if value not in allowedValues: - if len(allowedValues) > 1: - self.errors.append(createFileValidationError("The " + displayName + " must be one of " + allowedValuesDisplay + " (not " + value + ").")) - else: - self.errors.append(createFileValidationError("The " + displayName + " must be " + allowedValuesDisplay + " (not " + value + ").")) - - def validateStartDataRowCol(self): - if self.checkIsSpecified("START DATA ROW", "Start Data Row"): - value = self.metadataMap.get("START DATA ROW") - match = re.match("[0-9]+", value) - if match is None: - self.errors.append(createFileValidationError("The Start Data Row must be a number (not " + value + ").")) - if self.checkIsSpecified("START DATA COL", "Start Data Col"): - value = self.metadataMap.get("START DATA COL") - match = re.match("[A-Z]", value) - if match is None: - self.errors.append(createFileValidationError("The Start Data Col must be a letter between A and Z (not " + value + ").")) - + def checkIsSpecified(self, property, displayName): + """Verify that a property is specified; if not, add a validation error to the list.""" + if self.metadataMap.get(property) is None: + self.errors.append(ValidationError.createFileValidationError("A " + displayName + + " must be specified.")) + return False + return True + + def validateStrain(self): + """Verify that the strain is specified and of the correct format""" + if not 
self.checkIsSpecified("STRAIN", "strain"): + return + strain = self.metadataMap.get("STRAIN") + if not isStrainIdValid(strain): + self.errors.append(createFileValidationError("Strain must be MGP[0-999] (instead of " + strain + ").")) + + def validateDefaultHeaderFormat(self): + """Validate that header format is either not specified or matches default (TIME)""" + if self.metadataMap.get("HEADER FORMAT") is None: + return + format = self.metadataMap.get("HEADER FORMAT") + expected_format = "TIME" + if expected_format != format: + self.errors.append(createFileValidationError("Header format must be " + expected_format + " (not " + format + ").")) + + def validateExplicitHeaderFormat(self, expected_format): + """Validate that header format is specified and matches the expected_format argument""" + if not self.checkIsSpecified("HEADER FORMAT", "header format"): + return + format = self.metadataMap.get("HEADER FORMAT") + if expected_format != format: + self.errors.append(createFileValidationError("Header format must be " + expected_format + " (not " + format + ").")) + + def validateControlledVocabularyProperty(self, property, displayName, allowedValues, allowedValuesDisplay): + """Validate that the property is specified and in the list of allowed values""" + if not self.checkIsSpecified(property, displayName): + return + value = self.metadataMap.get(property).upper() + if value not in allowedValues: + if len(allowedValues) > 1: + self.errors.append(createFileValidationError("The " + displayName + " must be one of " + allowedValuesDisplay + " (not " + value + ").")) + else: + self.errors.append(createFileValidationError("The " + displayName + " must be " + allowedValuesDisplay + " (not " + value + ").")) + + def validateStartDataRowCol(self): + if self.checkIsSpecified("START DATA ROW", "Start Data Row"): + value = self.metadataMap.get("START DATA ROW") + match = re.match("[0-9]+", value) + if match is None: + self.errors.append(createFileValidationError("The Start Data Row must be a number (not " + value + ").")) + if self.checkIsSpecified("START DATA COL", "Start Data Col"): + value = self.metadataMap.get("START DATA COL") + match = re.match("[A-Z]", value) + if match is None: + self.errors.append(createFileValidationError("The Start Data Col must be a letter between A and Z (not " + value + ").")) + strainIdRegex = re.compile("^MGP[0-9]{1,3}") def isStrainIdValid(strainId): - """Return true if the strain id passes validation (has the form MGP[:digit:]{1,3})""" - match = strainIdRegex.match(strainId) - if match is None: - return False - return match.end() == len(strainId) - + """Return true if the strain id passes validation (has the form MGP[:digit:]{1,3})""" + match = strainIdRegex.match(strainId) + if match is None: + return False + return match.end() == len(strainId) + def getInitialDataRowAndCol(metadata): - """Extract the initial row and column as specified in the metadata. Returns an array with [row, col].""" - # get the raw value from the map - first_data_row = metadata.get("START DATA ROW") - first_data_col = metadata.get("START DATA COL") + """Extract the initial row and column as specified in the metadata. 
Returns an array with [row, col].""" + # get the raw value from the map + first_data_row = metadata.get("START DATA ROW") + first_data_col = metadata.get("START DATA COL") - # convert the row numeric string to an int - if first_data_row is None: - first_data_row = 0 - else: - first_data_row = int(float(first_data_row)) - 1 + # convert the row numeric string to an int + if first_data_row is None: + first_data_row = 0 + else: + first_data_row = int(float(first_data_row)) - 1 - # convert the column spreadsheet value to an int - if first_data_col is None: - first_data_col = 0 - else: - # columns start at A - first_data_col = ord(first_data_col) - ord('A') - return [first_data_row, first_data_col] + # convert the column spreadsheet value to an int + if first_data_col is None: + first_data_col = 0 + else: + # columns start at A + first_data_col = ord(first_data_col) - ord('A') + return [first_data_row, first_data_col] diff --git a/eu_basynthec/dist/etc/transcriptomics/data-set-handler.py b/eu_basynthec/dist/etc/transcriptomics/data-set-handler.py index d7d875f0585b23cf9e227339c72955aa05212a25..81078ecaa8a2dc7718229ae3caf0777084b62f3f 100644 --- a/eu_basynthec/dist/etc/transcriptomics/data-set-handler.py +++ b/eu_basynthec/dist/etc/transcriptomics/data-set-handler.py @@ -3,87 +3,159 @@ from eu.basynthec.cisd.dss import TimeSeriesDataExcel import re def getInitialDataRowAndCol(metadata): - """Extract the initial row and column as specified in the metadata. Returns an array with [row, col].""" - # get the raw value from the map - first_data_row = metadata.get("START DATA ROW") - first_data_col = metadata.get("START DATA COL") - - # convert the row numeric string to an int - if first_data_row is None: - first_data_row = 0 - else: - first_data_row = int(float(first_data_row)) - 1 - - # convert the column spreadsheet value to an int - if first_data_col is None: - first_data_col = 0 - else: - # columns start at A - first_data_col = ord(first_data_col) - ord('A') - return [first_data_row, first_data_col] + """Extract the initial row and column as specified in the metadata. Returns an array with [row, col].""" + # get the raw value from the map + first_data_row = metadata.get("START DATA ROW") + first_data_col = metadata.get("START DATA COL") + + # convert the row numeric string to an int + if first_data_row is None: + first_data_row = 0 + else: + first_data_row = int(float(first_data_row)) - 1 + + # convert the column spreadsheet value to an int + if first_data_col is None: + first_data_col = 0 + else: + # columns start at A + first_data_col = ord(first_data_col) - ord('A') + return [first_data_row, first_data_col] def retrieve_experiment(tr, exp_id): - """Get the specified experiment form the server. Return the experiment.""" - if exp_id is None: - exp = None - else: - exp = tr.getExperiment(exp_id) - return exp + """Get the specified experiment from the server. 
Return the experiment.""" + if exp_id is None: + exp = None + else: + exp = tr.getExperiment(exp_id) + return exp def assign_properties(dataset, metadata): - """Assign properties to the data set from information in the data.""" - propertyNameMap = { - "STRAIN_NAMES": "STRAIN_NAMES", - "TIMEPOINT TYPE": "TIMEPOINT_TYPE", - "CELL LOCATION": "CELL_LOCATION", - "VALUE TYPE": "VALUE_TYPE", - "VALUE UNIT": "VALUE_UNIT", - "SCALE": "SCALE" - } - - for prop in metadata.keySet(): - key = propertyNameMap.get(prop) - if key is not None: - value = metadata.get(prop) - if (key == "STRAIN"): - value = value + " (STRAIN)" - dataset.setPropertyValue(key, value.upper()) - + """Assign properties to the data set from information in the data.""" + propertyNameMap = { + "STRAIN_NAMES": "STRAIN_NAMES", + "TIMEPOINT TYPE": "TIMEPOINT_TYPE", + "CELL LOCATION": "CELL_LOCATION", + "VALUE TYPE": "VALUE_TYPE", + "VALUE UNIT": "VALUE_UNIT", + "SCALE": "SCALE" + } + + for prop in metadata.keySet(): + key = propertyNameMap.get(prop) + if key is not None: + value = metadata.get(prop) + if (key == "STRAIN"): + value = value + " (STRAIN)" + dataset.setPropertyValue(key, value.upper()) + def convert_data_to_tsv(tr, start_row, start_col, dataset, location): - """Create a tsv file containing the data and add it to the data set.""" - tr.createNewDirectory(dataset, location) - tsvFileName = tr.createNewFile(dataset, location, incoming.getName() + ".tsv") - tsv = open(tsvFileName, 'w') - raw_data = timeSeriesData.getRawDataLines() - for i in range(start_row, len(raw_data)): - line = raw_data[i] - # write the metabolite id - tsv.write(line[0]) - tsv.write("\t") - for j in range(start_col, len(line) - 1): - tsv.write(line[j]) - tsv.write("\t") - tsv.write(line[len(line) - 1]) - tsv.write("\n") - tsv.close() - + """Create a tsv file containing the data and add it to the data set.""" + tr.createNewDirectory(dataset, location) + tsvFileName = tr.createNewFile(dataset, location, incoming.getName() + ".tsv") + tsv = open(tsvFileName, 'w') + raw_data = timeSeriesData.getRawDataLines() + for i in range(start_row, len(raw_data)): + line = raw_data[i] + # write the metabolite id + tsv.write(line[0]) + tsv.write("\t") + for j in range(start_col, len(line) - 1): + tsv.write(line[j]) + tsv.write("\t") + tsv.write(line[len(line) - 1]) + tsv.write("\n") + tsv.close() + +class SplitColumnInfo: + """ + A class that stores, for each column in the file, the column number, the strain name, + the biological replicate, the hybridization number, and the column offset in the resulting file + """ + def __init__(self, column, strain_name, bio_replicate, hybrid_number, output_col): + self.column = column + self.strain_name = strain_name + self.bio_replicate = bio_replicate + self.hybrid_number = hybrid_number + self.output_col = output_col + + tsv = None + + +def convert_data_to_split_tsv(tr, start_row, start_col, dataset, location): + """Create one tsv file per strain in the original data.""" + raw_data = timeSeriesData.getRawDataLines() + + # Keep track of the mapping from columns to strains and strains to columns + column_infos = [] + strain_column_info = {} + + # Extract the column / strain mapping + header_line = raw_data[start_row] + header_regex = re.compile("^(MGP[0-9]{1,3})-([0-9]) ([0-9]+)") + for i in range(start_col, len(header_line)): + match = header_regex.match(header_line[i]) + strain_name = match.group(1) + strain_cols = strain_column_info.setdefault(strain_name, []) + column_info = SplitColumnInfo(i, strain_name, match.group(2), 
match.group(3), len(strain_cols)) + strain_cols.append(column_info) + column_infos.append(column_info) + + # create the files + tr.createNewDirectory(dataset, location) + for strain in strain_column_info.iterkeys(): + tsvFileName = tr.createNewFile(dataset, location, incoming.getName() + "_" + strain + ".tsv") + tsv = open(tsvFileName, 'w') + for column_info in strain_column_info[strain]: + column_info.tsv = tsv + + # Write the header + line = raw_data[start_row] + tag = line[0] + # write the first column to each file + for strain in strain_column_info.iterkeys(): + strain_column_info[strain][0].tsv.write(tag) + + for column_info in column_infos: + column_info.tsv.write('\t') + column_info.tsv.write(column_info.bio_replicate) + column_info.tsv.write(' ') + column_info.tsv.write(column_info.hybrid_number) + + # Write the data to the files + for i in range(start_row + 1, len(raw_data)): + line = raw_data[i] + tag = line[0] + for strain in strain_column_info.iterkeys(): + strain_column_info[strain][0].tsv.write('\n') + # write the first column to each file + strain_column_info[strain][0].tsv.write(tag) + # Write the remaining data to each file + for column_info in column_infos: + column_info.tsv.write('\t') + column_info.tsv.write(line[column_info.column]) + + # Close each file + for strain in strain_column_info.iterkeys(): + strain_column_info[strain][0].tsv.close() + def store_original_data(tr, dataset, location): - """Put the original data into the data set.""" - tr.createNewDirectory(dataset, location) - tr.moveFile(incoming.getAbsolutePath(), dataset, location + "/" + incoming.getName()) + """Put the original data into the data set.""" + tr.createNewDirectory(dataset, location) + tr.moveFile(incoming.getAbsolutePath(), dataset, location + "/" + incoming.getName()) def extract_strains(start_row, start_col): - """Extract the strain names from the header.""" - strains = [] - line = timeSeriesData.getRawDataLines()[start_row] - header_regex = re.compile("^(MGP[0-9]{1,3})-([0-9]) ([0-9]+)") - for i in range(start_col, len(line)): - match = header_regex.match(line[i]) - strains.append(match.group(1)) - return ",".join(strains) + """Extract the strain names from the header.""" + strains = [] + line = timeSeriesData.getRawDataLines()[start_row] + header_regex = re.compile("^(MGP[0-9]{1,3})-([0-9]) ([0-9]+)") + for i in range(start_col, len(line)): + match = header_regex.match(line[i]) + strains.append(match.group(1)) + return ",".join(strains) - + tr = service.transaction(incoming) timeSeriesData = TimeSeriesDataExcel.createTimeSeriesDataExcel(incoming.getAbsolutePath()) @@ -102,6 +174,9 @@ store_original_data(tr, original_dataset, "xls") tsv_dataset = tr.createNewDataSet("TSV_MULTISTRAIN_EXPORT") convert_data_to_tsv(tr, dataStart[0], dataStart[1], tsv_dataset, "tsv-multi") +tsv_split_dataset = tr.createNewDataSet("TSV_EXPORT") +convert_data_to_split_tsv(tr, dataStart[0], dataStart[1], tsv_split_dataset, "tsv") + # Make the original contain these contained_codes = [original_dataset.getDataSetCode(), tsv_dataset.getDataSetCode()] dataset.setContainedDataSetCodes(contained_codes) @@ -109,11 +184,11 @@ dataset.setContainedDataSetCodes(contained_codes) # If no experiment has been set, then get the experiment from the excel file if dataset.getExperiment() is None: - exp_id = metadata.get("EXPERIMENT") - exp = retrieve_experiment(tr, exp_id) - if exp is not None: - dataset.setExperiment(exp) - original_dataset.setExperiment(exp) - tsv_dataset.setExperiment(exp) - + exp_id = metadata.get("EXPERIMENT") 
- exp = retrieve_experiment(tr, exp_id) - if exp is not None: - dataset.setExperiment(exp) - original_dataset.setExperiment(exp) - tsv_dataset.setExperiment(exp) - + exp_id = metadata.get("EXPERIMENT") + exp = retrieve_experiment(tr, exp_id) + if exp is not None: + dataset.setExperiment(exp) + original_dataset.setExperiment(exp) + tsv_dataset.setExperiment(exp) + tsv_split_dataset.setExperiment(exp) diff --git a/eu_basynthec/dist/etc/transcriptomics/data-set-validator.py b/eu_basynthec/dist/etc/transcriptomics/data-set-validator.py index ff0208622f086e3cb2c051650af93cc08d04484c..7eece315c6a1b5477aeb480eb1decf87d8b2352e 100644 --- a/eu_basynthec/dist/etc/transcriptomics/data-set-validator.py +++ b/eu_basynthec/dist/etc/transcriptomics/data-set-validator.py @@ -1,75 +1,75 @@ def validate_header(line, first_data_col, errors): - """Validate the header, returning False if there is no point in continuing validation""" - if line[0] != "Locustag": - errors.append(createFileValidationError("The first data column must be 'Locustag' (not " + line[0] + ").")) - return False - header_regex = re.compile("^MGP[0-9]{1,3}-[0-9] [0-9]+") - for i in range(first_data_col, len(line)): - match = header_regex.match(line[i]) - if match is None: - errors.append(createFileValidationError("The column header + " + str(i) + " must be of the form [STRAIN]-[BIOLOGICAL REPLICATE] [HYBRIDIZATION NUMBER]")) + """Validate the header, returning False if there is no point in continuing validation""" + if line[0] != "Locustag": + errors.append(createFileValidationError("The first data column must be 'Locustag' (not " + line[0] + ").")) + return False + header_regex = re.compile("^MGP[0-9]{1,3}-[0-9] [0-9]+") + for i in range(first_data_col, len(line)): + match = header_regex.match(line[i]) + if match is None: + errors.append(createFileValidationError("The column header " + str(i) + " must be of the form [STRAIN]-[BIOLOGICAL REPLICATE] [HYBRIDIZATION NUMBER]")) def validate_data(time_series_data, first_data_row, first_data_col, errors): - gene_locus_regex = re.compile("^BSU[0-9]+|^BSU_misc_RNA_[0-9]+|^VMG_[0-9]+_[0-9]+(_c)?") - dataLines = time_series_data.getRawDataLines() - for i in range(first_data_row, len(dataLines)): - line = dataLines[i] - # The header needs to be CompoundID - if i is first_data_row: - if not validate_header(line, first_data_col, errors): - break - continue + gene_locus_regex = re.compile("^BSU[0-9]+|^BSU_misc_RNA_[0-9]+|^VMG_[0-9]+_[0-9]+(_c)?") + dataLines = time_series_data.getRawDataLines() + for i in range(first_data_row, len(dataLines)): + line = dataLines[i] + # The header needs to be Locustag + if i == first_data_row: + if not validate_header(line, first_data_col, errors): + break + continue - # The compound id should be one of these forms - gene_locus = line[0] - if not gene_locus_regex.match(gene_locus): - errors.append(createFileValidationError("Line " + str(i + 1) + ", column 1 must be of the format 'BSU#', 'BSU_misc_RNA_#', 'VMG_#_#', or 'VMG_#_#_c' (instead of " + gene_locus + ").")) - + # The gene locus should be one of these forms + gene_locus = line[0] + if not gene_locus_regex.match(gene_locus): + errors.append(createFileValidationError("Line " + str(i + 1) + ", column 1 must be of the format 'BSU#', 'BSU_misc_RNA_#', 'VMG_#_#', or 'VMG_#_#_c' (instead of " + gene_locus + ").")) + def validate_metadata(time_series_data, errors): - metadata = time_series_data.getMetadataMap() - validationHelper = ValidationHelper(metadata, errors) - - # validate the header format - validationHelper.validateExplicitHeaderFormat("STRAIN-BIOREP HYBRID") - - # validate the timepoint type - validationHelper.validateControlledVocabularyProperty("TIMEPOINT TYPE", - "time point type", ['EX', 'IN', 'SI'], 
"'EX', 'IN', 'SI'") + metadata = time_series_data.getMetadataMap() + validationHelper = ValidationHelper(metadata, errors) + + # validate the header format + validationHelper.validateExplicitHeaderFormat("STRAIN-BIOREP HYBRID") + + # validate the timepoint type + validationHelper.validateControlledVocabularyProperty("TIMEPOINT TYPE", + "time point type", ['EX', 'IN', 'SI'], "'EX', 'IN', 'SI'") - # validate the cell location - validationHelper.validateControlledVocabularyProperty("CELL LOCATION", - "cell location", ['CE', 'ES', 'ME', 'CY', 'NC'], "'CE', 'ES', 'ME', 'CY', 'NC'") + # validate the cell location + validationHelper.validateControlledVocabularyProperty("CELL LOCATION", + "cell location", ['CE', 'ES', 'ME', 'CY', 'NC'], "'CE', 'ES', 'ME', 'CY', 'NC'") - # validate the value type - validationHelper.validateControlledVocabularyProperty("VALUE TYPE", - "value type", ['VALUE', 'MEAN', 'MEDIAN', 'STD', 'VAR', 'ERROR', 'IQR'], - "'Value', 'Mean', 'Median', 'Std', 'Var', 'Error', 'Iqr'") + # validate the value type + validationHelper.validateControlledVocabularyProperty("VALUE TYPE", + "value type", ['VALUE', 'MEAN', 'MEDIAN', 'STD', 'VAR', 'ERROR', 'IQR'], + "'Value', 'Mean', 'Median', 'Std', 'Var', 'Error', 'Iqr'") - # validate the value unit - validationHelper.validateControlledVocabularyProperty("VALUE UNIT", - "value unit", ['MM', 'UM', 'PERCENT', 'RATIOT1', 'RATIOCS', 'AU', 'DIMENSIONLESS'], "'mM', 'uM', 'Percent', 'RatioT1', 'RatioCs', 'AU', 'Dimensionless'") - - # validate the scale - validationHelper.validateControlledVocabularyProperty("SCALE", "scale", - ['LIN', 'LOG2', 'LOG10', 'LN'], "'lin', 'log2', 'log10', 'ln'") - - # validate the data position specification - validationHelper.validateStartDataRowCol() + # validate the value unit + validationHelper.validateControlledVocabularyProperty("VALUE UNIT", + "value unit", ['MM', 'UM', 'PERCENT', 'RATIOT1', 'RATIOCS', 'AU', 'DIMENSIONLESS'], "'mM', 'uM', 'Percent', 'RatioT1', 'RatioCs', 'AU', 'Dimensionless'") + + # validate the scale + validationHelper.validateControlledVocabularyProperty("SCALE", "scale", + ['LIN', 'LOG2', 'LOG10', 'LN'], "'lin', 'log2', 'log10', 'ln'") + + # validate the data position specification + validationHelper.validateStartDataRowCol() def validate_data_set_file(file): - errors = [] - time_series_data = create_time_series_excel(file.getAbsolutePath()) - if time_series_data is None: - errors.append(createFileValidationError(file.getName() + " is not an Excel file.")) - return errors - - # validate the metadata - validate_metadata(time_series_data, errors) - - data_start = getInitialDataRowAndCol(time_series_data.getMetadataMap()) - - # validate the data - validate_data(time_series_data, data_start[0], data_start[1], errors) - - return errors + errors = [] + time_series_data = create_time_series_excel(file.getAbsolutePath()) + if time_series_data is None: + errors.append(createFileValidationError(file.getName() + " is not an Excel file.")) + return errors + + # validate the metadata + validate_metadata(time_series_data, errors) + + data_start = getInitialDataRowAndCol(time_series_data.getMetadataMap()) + + # validate the data + validate_data(time_series_data, data_start[0], data_start[1], errors) + + return errors diff --git a/eu_basynthec/sourceTest/examples/OD600-Example.xlsx b/eu_basynthec/sourceTest/examples/OD600-Example.xlsx index 4f83f9ce28149ce85353f1e0456bfaddba7aa093..cee4d575d2575a349bfb7c3a456c3dfae9f8b5f8 100644 Binary files a/eu_basynthec/sourceTest/examples/OD600-Example.xlsx and 
diff --git a/eu_basynthec/sourceTest/examples/OD600-Example.xlsx b/eu_basynthec/sourceTest/examples/OD600-Example.xlsx
index 4f83f9ce28149ce85353f1e0456bfaddba7aa093..cee4d575d2575a349bfb7c3a456c3dfae9f8b5f8 100644
Binary files a/eu_basynthec/sourceTest/examples/OD600-Example.xlsx and b/eu_basynthec/sourceTest/examples/OD600-Example.xlsx differ
diff --git a/eu_basynthec/sourceTest/java/eu/basynthec/cisd/dss/AbstractBaSynthecDataSetRegistratorTest.java b/eu_basynthec/sourceTest/java/eu/basynthec/cisd/dss/AbstractBaSynthecDataSetRegistratorTest.java
index 3a05bdbb6ca8516434bf2490712314846056633f..4cf7b8ae70c3b46b8309975e47330f42c1a60b37 100644
--- a/eu_basynthec/sourceTest/java/eu/basynthec/cisd/dss/AbstractBaSynthecDataSetRegistratorTest.java
+++ b/eu_basynthec/sourceTest/java/eu/basynthec/cisd/dss/AbstractBaSynthecDataSetRegistratorTest.java
@@ -82,6 +82,15 @@ public abstract class AbstractBaSynthecDataSetRegistratorTest extends
                 one(openBisService).createDataSetCode();
                 will(returnValue(excelDataSetCode));
 
+                // If there is a multistrain data set type, it needs to be taken care of in
+                // addition to the normal one
+                if (tsvDataSetType == TSV_MULTISTRAIN_EXPORT_DATA_SET_TYPE)
+                {
+                    String tsvMultistrain = DATA_SET_CODE + "-TSV-MULTISTRAIN";
+                    one(openBisService).createDataSetCode();
+                    will(returnValue(tsvMultistrain));
+                }
+
                 String tsvDataSetCode = DATA_SET_CODE + "-TSV";
                 one(openBisService).createDataSetCode();
                 will(returnValue(tsvDataSetCode));
@@ -103,13 +112,13 @@ public abstract class AbstractBaSynthecDataSetRegistratorTest extends
            {
                one(dataSetValidator).assertValidDataSet(
                        TSV_MULTISTRAIN_EXPORT_DATA_SET_TYPE,
-                        new File(new File(stagingDirectory, tsvDataSetCode), "tsv-multi"));
-            } else
-            {
-                one(dataSetValidator).assertValidDataSet(TSV_DATA_SET_TYPE,
-                        new File(new File(stagingDirectory, tsvDataSetCode), "tsv"));
+                        new File(new File(stagingDirectory, DATA_SET_CODE
+                                + "-TSV-MULTISTRAIN"), "tsv-multi"));
            }
+
+            one(dataSetValidator).assertValidDataSet(TSV_DATA_SET_TYPE,
+                    new File(new File(stagingDirectory, tsvDataSetCode), "tsv"));
+
            one(openBisService).performEntityOperations(with(atomicatOperationDetails));
            will(returnValue(new AtomicEntityOperationResult()));
        }
diff --git a/eu_basynthec/sourceTest/java/eu/basynthec/cisd/dss/TimeSeriesDataExcelTest.java b/eu_basynthec/sourceTest/java/eu/basynthec/cisd/dss/TimeSeriesDataExcelTest.java
index 0d566507204e211d0449e0931293c6c792d3f84f..d2be9831e7f5a636bfccf1777674070087fe8c39 100644
--- a/eu_basynthec/sourceTest/java/eu/basynthec/cisd/dss/TimeSeriesDataExcelTest.java
+++ b/eu_basynthec/sourceTest/java/eu/basynthec/cisd/dss/TimeSeriesDataExcelTest.java
@@ -105,7 +105,8 @@ public class TimeSeriesDataExcelTest extends AssertJUnit
                 { "MGP1", "OD600", "0.05", "0.064" },
                 { "MGP100", "OD600", "0.05", "0.064" },
                 { "MGP20", "OD600", "0.05", "0.064" },
-                { "MGP999", "OD600", "0.05", "0.064" } };
+                { "MGP999", "OD600", "0.05", "0.064" },
+                { "MGP1", "OD600", "0.05", "0.064" } };
 
         assertLinesAreEqual(dataLines, expectedData);
     }
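The reworked mock set-up above fixes the order in which data set codes are handed out: the Excel data set first, then (only for multistrain types) the multistrain TSV export, then the plain TSV export, whose staged copy is now validated in every case instead of only in the non-multistrain branch. A sketch of the resulting code sequence (an illustrative helper, not part of the test; "DS1" is a placeholder for the test's DATA_SET_CODE constant):

    def expected_data_set_codes(data_set_code, multistrain):
        # mirrors the order of the createDataSetCode() expectations above
        codes = [data_set_code]  # the Excel data set (excelDataSetCode in the test)
        if multistrain:
            codes.append(data_set_code + "-TSV-MULTISTRAIN")
        codes.append(data_set_code + "-TSV")
        return codes

    assert expected_data_set_codes("DS1", True) == ["DS1", "DS1-TSV-MULTISTRAIN", "DS1-TSV"]
    assert expected_data_set_codes("DS1", False) == ["DS1", "DS1-TSV"]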
diff --git a/eu_basynthec/sourceTest/java/eu/basynthec/cisd/dss/growthprofiles/OD600DataSetRegistratorTest.java b/eu_basynthec/sourceTest/java/eu/basynthec/cisd/dss/growthprofiles/OD600DataSetRegistratorTest.java
index a8a036d19b62128999be6273ce01bb1dd72a2b75..8205496cd8ad2dbc1d16644a00b771fd5fccc4bf 100644
--- a/eu_basynthec/sourceTest/java/eu/basynthec/cisd/dss/growthprofiles/OD600DataSetRegistratorTest.java
+++ b/eu_basynthec/sourceTest/java/eu/basynthec/cisd/dss/growthprofiles/OD600DataSetRegistratorTest.java
@@ -16,10 +16,13 @@
 
 package eu.basynthec.cisd.dss.growthprofiles;
 
+import java.io.File;
 import java.io.IOException;
+import java.util.Arrays;
 import java.util.HashMap;
 import java.util.Properties;
 
+import org.apache.commons.io.FileUtils;
 import org.testng.annotations.Test;
 
 import ch.systemsx.cisd.common.test.RecordingMatcher;
@@ -50,7 +53,7 @@ public class OD600DataSetRegistratorTest extends AbstractBaSynthecDataSetRegistr
 
         handler.handle(markerFile);
 
-        assertEquals(3, atomicOperationDetails.recordedObject().getDataSetRegistrations().size());
+        assertEquals(4, atomicOperationDetails.recordedObject().getDataSetRegistrations().size());
 
         NewExternalData dataSet =
                 atomicOperationDetails.recordedObject().getDataSetRegistrations().get(0);
@@ -64,7 +67,20 @@ public class OD600DataSetRegistratorTest extends AbstractBaSynthecDataSetRegistr
         assertNotNull(strainProperty);
         assert null != strainProperty;
-        assertEquals("MGP1,MGP100,MGP20,MGP999", strainProperty.getValue());
+
+        NewExternalData tsvSplitDataSet =
+                atomicOperationDetails.recordedObject().getDataSetRegistrations().get(3);
+        String location = tsvSplitDataSet.getLocation();
+        File tsvSplitFolder = new File(workingDirectory, "/1/" + location);
+        String[] contents = tsvSplitFolder.list();
+        Arrays.sort(contents);
+        String[] expectedContents =
+            { "OD600-Example.xlsx_MGP1.tsv", "OD600-Example.xlsx_MGP100.tsv",
+                "OD600-Example.xlsx_MGP20.tsv", "OD600-Example.xlsx_MGP999.tsv" };
+        assertEquals(Arrays.asList(expectedContents), Arrays.asList(contents));
+        File tsvSplitFile = new File(tsvSplitFolder, "OD600-Example.xlsx_MGP1.tsv");
+        checkTsvSplitContent(tsvSplitFile);
+
         context.assertIsSatisfied();
     }
 
@@ -73,4 +89,14 @@ public class OD600DataSetRegistratorTest extends AbstractBaSynthecDataSetRegistr
     {
         return "dist/etc/growth-profiles/";
     }
+
+    private void checkTsvSplitContent(File tsvFile) throws IOException
+    {
+        String content = FileUtils.readFileToString(tsvFile);
+        assertEquals(
+                "RunNumber\tHumanReadable\t-19020.0\t-17220.0\t-15360.0\t-13620.0\t-11820.0\t-10020.0\t-8220.0\t-7020.0\t-4920.0\t-2820.0\t-1020.0\t-120.0\t720.0\t1500.0\t3660.0\t5460.0\t6060.0\t7200.0\t9000.0\n"
+                        + "0\tOD600\t0.05\t0.064\t0.077\t0.089\t0.107\t0.127\t0.155\t0.176\t0.24\t0.33\t0.43\t0.49\t0.58\t0.66\t0.975\t1.42\t1.49\t2.09\t3.22\n"
+                        + "1\tOD600\t0.05\t0.064\t0.077\t0.089\t0.107\t0.127\t0.155\t0.176\t0.24\t0.33\t0.43\t0.49\t0.58\t0.66\t0.975\t1.42\t1.49\t2.09\t3.22",
+                content);
+    }
 }
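The new assertions encode the split-TSV convention for growth profiles: one file per distinct strain, named "<excel file name>_<strain>.tsv". Since the updated OD600-Example.xlsx now lists MGP1 twice (see the TimeSeriesDataExcelTest change above), the MGP1 split file evidently carries two runs, which is why checkTsvSplitContent expects identical rows for RunNumber 0 and 1. A sketch of the naming rule (illustrative only, not the handler's code):

    def split_file_names(excel_name, strains):
        # one TSV per distinct strain: "<excel file name>_<strain>.tsv"
        return sorted(set(excel_name + "_" + strain + ".tsv" for strain in strains))

    names = split_file_names("OD600-Example.xlsx",
                             ["MGP1", "MGP100", "MGP20", "MGP999", "MGP1"])
    assert names == ["OD600-Example.xlsx_MGP1.tsv", "OD600-Example.xlsx_MGP100.tsv",
                     "OD600-Example.xlsx_MGP20.tsv", "OD600-Example.xlsx_MGP999.tsv"]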
diff --git a/eu_basynthec/sourceTest/java/eu/basynthec/cisd/dss/transcriptomics/TranscriptomicsDataSetRegistratorTest.java b/eu_basynthec/sourceTest/java/eu/basynthec/cisd/dss/transcriptomics/TranscriptomicsDataSetRegistratorTest.java
index d8da395faf3557e028848856698835209330dd3d..1131553c8023c279f94a6193c0a5c2866871c672 100644
--- a/eu_basynthec/sourceTest/java/eu/basynthec/cisd/dss/transcriptomics/TranscriptomicsDataSetRegistratorTest.java
+++ b/eu_basynthec/sourceTest/java/eu/basynthec/cisd/dss/transcriptomics/TranscriptomicsDataSetRegistratorTest.java
@@ -18,6 +18,7 @@ package eu.basynthec.cisd.dss.transcriptomics;
 
 import java.io.File;
 import java.io.IOException;
+import java.util.Arrays;
 import java.util.HashMap;
 import java.util.Properties;
 
@@ -52,7 +53,7 @@ public class TranscriptomicsDataSetRegistratorTest extends AbstractBaSynthecData
 
         handler.handle(markerFile);
 
-        assertEquals(3, atomicOperationDetails.recordedObject().getDataSetRegistrations().size());
+        assertEquals(4, atomicOperationDetails.recordedObject().getDataSetRegistrations().size());
 
         NewExternalData dataSet =
                 atomicOperationDetails.recordedObject().getDataSetRegistrations().get(0);
@@ -75,6 +76,19 @@ public class TranscriptomicsDataSetRegistratorTest extends AbstractBaSynthecData
                 new File(new File(workingDirectory, "/1/" + location),
                         "Transcriptomics-Example.xlsx.tsv");
         checkTsvContent(tsvFile);
+
+        NewExternalData tsvSplitDataSet =
+                atomicOperationDetails.recordedObject().getDataSetRegistrations().get(3);
+        location = tsvSplitDataSet.getLocation();
+        File tsvSplitFolder = new File(workingDirectory, "/1/" + location);
+        String[] contents = tsvSplitFolder.list();
+        Arrays.sort(contents);
+        String[] expectedContents =
+            { "Transcriptomics-Example.xlsx_MGP253.tsv",
+                "Transcriptomics-Example.xlsx_MGP776.tsv" };
+        assertEquals(Arrays.asList(expectedContents), Arrays.asList(contents));
+        File tsvSplitFile = new File(tsvSplitFolder, "Transcriptomics-Example.xlsx_MGP253.tsv");
+        checkSplitTsvContent(tsvSplitFile);
 
         context.assertIsSatisfied();
     }
@@ -102,4 +116,18 @@ public class TranscriptomicsDataSetRegistratorTest extends AbstractBaSynthecData
                 + "BSU00260\t11.7669\t11.4658\n" + "BSU00270\t12.2675\t11.8745\n"
                 + "BSU00280\t12.5574\t12.1608\n", content);
     }
+
+    private void checkSplitTsvContent(File tsvFile) throws IOException
+    {
+        String content = FileUtils.readFileToString(tsvFile);
+        assertEquals("Locustag\t1 66687802\n" + "BSU00010\t13.7953\n" + "BSU00020\t13.5907\n"
+                + "BSU00030\t13.8489\n" + "BSU00040\t14.3564\n" + "BSU00050\t14.5239\n"
+                + "BSU00060\t14.3293\n" + "BSU00070\t14.481\n" + "BSU00090\t15.474\n"
+                + "BSU00100\t14.4332\n" + "BSU00110\t15.2669\n" + "BSU00120\t15.3344\n"
+                + "BSU_misc_RNA_1\t15.4497\n" + "BSU00130\t13.6604\n" + "BSU00180\t9.8208\n"
+                + "BSU_misc_RNA_2\t13.6614\n" + "BSU00190\t13.464\n" + "BSU00200\t14.6102\n"
+                + "BSU00210\t13.5285\n" + "BSU00220\t13.1007\n" + "BSU00230\t11.8547\n"
+                + "BSU00240\t10.8623\n" + "BSU00250\t11.6694\n" + "BSU00260\t11.7669\n"
+                + "BSU00270\t12.2675\n" + "BSU00280\t12.5574", content);
+    }
 }
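A closing note on checkSplitTsvContent: each transcriptomics split file keeps the Locustag column plus exactly one strain's measurement column, and the expected header "1 66687802" suggests the strain prefix is stripped from the original "MGP253-1 66687802" column header. A rough sketch of that projection (a pure-Python illustration with partly hypothetical input, not the production splitter):

    def project_strain(lines, col, strain):
        # keep the Locustag column and one strain's column from tab-separated rows
        out = []
        for i, line in enumerate(lines):
            fields = line.split("\t")
            value = fields[col]
            if i == 0:
                # header row: strip the strain prefix, e.g. "MGP253-1 66687802" -> "1 66687802"
                value = value[len(strain) + 1:]
            out.append(fields[0] + "\t" + value)
        return "\n".join(out)

    rows = ["Locustag\tMGP253-1 66687802\tMGP776-1 11111111",  # MGP776 header hypothetical
            "BSU00010\t13.7953\t13.0"]                         # MGP776 value hypothetical
    print(project_strain(rows, 1, "MGP253"))
    # Locustag	1 66687802
    # BSU00010	13.7953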