diff --git a/eu_basynthec/dist/etc/metabolomics2/data-set-handler.py b/eu_basynthec/dist/etc/metabolomics2/data-set-handler.py
new file mode 100644
index 0000000000000000000000000000000000000000..eee034223fce52437b99fe1c46bb00d9528060ca
--- /dev/null
+++ b/eu_basynthec/dist/etc/metabolomics2/data-set-handler.py
@@ -0,0 +1,82 @@
+from datetime import datetime
+from eu.basynthec.cisd.dss import TimeSeriesDataExcel
+
+def set_data_type(data_set):
+    data_set.setPropertyValue("DATA_TYPE", "METABOLITE_INTENSITIES")
+
+def retrieve_experiment(tr, exp_id):
+    """Get the specified experiment from the server. Return the experiment."""
+    if exp_id is None:
+        exp = None
+    else:
+        exp = tr.getExperiment(exp_id)
+    return exp
+
+def assign_properties(dataset, metadata):
+    """Assign properties to the data set from information in the data."""
+    propertyNameMap = {
+        "STRAIN": "STRAIN_NAMES",
+        "TIMEPOINT TYPE": "TIMEPOINT_TYPE",
+        "CELL LOCATION": "CELL_LOCATION",
+        "VALUE TYPE": "VALUE_TYPE",
+        "VALUE UNIT": "VALUE_UNIT",
+        "SCALE": "SCALE"
+    }
+
+    for prop in metadata.keySet():
+        key = propertyNameMap.get(prop)
+        if key is not None:
+            value = metadata.get(prop)
+            dataset.setPropertyValue(key, value.upper())
+
+def convert_data_to_tsv(tr, dataset, location):
+    """Create a tsv file containing the data and add it to the data set."""
+    tr.createNewDirectory(dataset, location)
+    tsvFileName = tr.createNewFile(dataset, location, incoming.getName() + ".tsv")
+    tsv = open(tsvFileName, 'w')
+    for line in timeSeriesData.getRawDataLines():
+        for i in range(0, len(line) - 1):
+            tsv.write(line[i])
+            tsv.write("\t")
+        tsv.write(line[len(line) - 1])
+        tsv.write("\n")
+    tsv.close()
+
+def store_original_data(tr, dataset, location):
+    """Put the original data into the data set."""
+    tr.createNewDirectory(dataset, location)
+    tr.moveFile(incoming.getAbsolutePath(), dataset, location + "/" + incoming.getName())
+
+
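+# 'service' and 'incoming' are globals provided to this script by the openBIS DSS dropbox framework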
+tr = service.transaction(incoming)
+timeSeriesData = TimeSeriesDataExcel.createTimeSeriesDataExcel(incoming.getAbsolutePath())
+
+# create the data set and assign the metadata from the file
+dataset = tr.createNewDataSet("METABOLITE_INTENSITIES")
+metadata = timeSeriesData.getMetadataMap()
+assign_properties(dataset, metadata)
+
+# Store the original and tsv data in data sets
+original_dataset = tr.createNewDataSet("EXCEL_ORIGINAL")
+set_data_type(original_dataset)
+store_original_data(tr, original_dataset, "xls")
+
+tsv_dataset = tr.createNewDataSet("TSV_EXPORT")
+set_data_type(tsv_dataset)
+convert_data_to_tsv(tr, tsv_dataset, "tsv")
+
+# Make the container data set contain the original and tsv data sets
+contained_codes = [original_dataset.getDataSetCode(), tsv_dataset.getDataSetCode()]
+dataset.setContainedDataSetCodes(contained_codes)
+
+
+# If no experiment has been set, then get the experiment from the excel file
+if dataset.getExperiment() is None:
+    exp_id = metadata.get("EXPERIMENT")
+    exp = retrieve_experiment(tr, exp_id)
+    if exp is not None:
+        dataset.setExperiment(exp)
+        original_dataset.setExperiment(exp)
+        tsv_dataset.setExperiment(exp)
+
+
diff --git a/eu_basynthec/dist/etc/metabolomics2/data-set-validator.py b/eu_basynthec/dist/etc/metabolomics2/data-set-validator.py
new file mode 100644
index 0000000000000000000000000000000000000000..ca28902b9868cd01e0c7cc8f8178744986529c01
--- /dev/null
+++ b/eu_basynthec/dist/etc/metabolomics2/data-set-validator.py
@@ -0,0 +1,91 @@
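+# The helpers used below (createFileValidationError, isStrainIdValid, isControlledVocabularyPropertyValid,
+# ValidationHelper, getInitialDataRowAndCol, re) come from dist/etc/shared/shared-classes.py, which is loaded together with this script.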
+# validate the header -- row 1 contains a strain id, row 2 a value type, row 3 a value unit
+def validate_header_line(row, first_data_col, line, errors):
+    # validate the strain
+    if row == 0:
+        for i in range(first_data_col, len(line)):
+            strain = line[i]
+            if not isStrainIdValid(strain):
+                errors.append(createFileValidationError("Strain in col " + str(i + 1) + " " + strainValidationErrorMessageFragment(strain)))
+
+    # validate the value type
+    elif row == 1:
+        for i in range(first_data_col, len(line)):
+            isControlledVocabularyPropertyValid(line[i],
+                "value type", ['VALUE', 'MEAN', 'MEDIAN', 'STD', 'VAR', 'ERROR', 'IQR'],
+                "'Value', 'Mean', 'Median', 'Std', 'Var', 'Error', 'Iqr'",
+                errors)
+
+    # validate the value unit
+    else:
+        for i in range(first_data_col, len(line)):
+            isControlledVocabularyPropertyValid(line[i],
+                "value unit", ['MM', 'UM', 'RATIOT1', 'RATIOCS'], "'mM', 'uM', 'RatioT1', 'RatioCs'",
+                errors)
+
+
+def validate_data(time_series_data, first_data_row, first_data_col, errors):
+    chebiRegex = re.compile("^CHEBI:[0-9]+")
+    bsbmeRegex = re.compile("^BSBME:[0-9]+")
+    dataLines = time_series_data.getRawDataLines()
+    lineCount = 0
+    for line in dataLines:
+        # Dispatch to another function to validate the header
+        if lineCount < first_data_row:
+            validate_header_line(lineCount, first_data_col, line, errors)
+            lineCount = lineCount + 1
+            continue
+
+        # The header needs to be CompoundID
+        if lineCount == first_data_row:
+            if line[0] != "CompoundID":
+                errors.append(createFileValidationError("The first data column must be 'CompoundID'"))
+                break
+            lineCount = lineCount + 1
+            continue
+
+        # The compound id should be one of these forms
+        compoundId = line[0]
+        if not chebiRegex.match(compoundId):
+            if not bsbmeRegex.match(compoundId):
+                errors.append(createFileValidationError("Line " + str(lineCount + 1) + ", column 1 must be of the format 'CHEBI:#' or 'BSBME:#' (instead of " + compoundId + ")."))
+        lineCount = lineCount + 1
+
+def validate_metadata(time_series_data, errors):
+    metadata = time_series_data.getMetadataMap()
+    validationHelper = ValidationHelper(metadata, errors)
+
+    # validate the header format
+    validationHelper.validateExplicitHeaderFormat("METABOL HYBRID")
+
+    # validate the timepoint type
+    validationHelper.validateControlledVocabularyProperty("TIMEPOINT TYPE",
+        "time point type", ['EX', 'IN', 'SI'], "'EX', 'IN', 'SI'")
+
+    # validate the cell location
+    validationHelper.validateControlledVocabularyProperty("CELL LOCATION",
+        "cell location", ['CE', 'ES', 'ME', 'CY', 'NC'], "'CE', 'ES', 'ME', 'CY', 'NC'")
+
+    # validate the scale
+    validationHelper.validateControlledVocabularyProperty("SCALE", "scale",
+        ['LIN', 'LOG2', 'LOG10', 'LN'], "'lin', 'log2', 'log10', 'ln'")
+
+    # validate the data position specification
+    validationHelper.validateStartDataRowCol()
+
+
+def validate_data_set_file(file):
+    errors = []
+    time_series_data = create_time_series_excel(file.getAbsolutePath())
+    if time_series_data is None:
+        errors.append(createFileValidationError(file.getName() + " is not an Excel file."))
+        return errors
+
+    # validate the metadata
+    validate_metadata(time_series_data, errors)
+
+    data_start = getInitialDataRowAndCol(time_series_data.getMetadataMap())
+
+    # validate the data
+    validate_data(time_series_data, data_start[0], data_start[1], errors)
+
+    return errors
diff --git a/eu_basynthec/dist/etc/shared/shared-classes.py b/eu_basynthec/dist/etc/shared/shared-classes.py
index 5ff2e494996528c75431e2310e8423cdb7d0852f..85911070ab2460befe13e521846122e1b7abd846 100644
--- a/eu_basynthec/dist/etc/shared/shared-classes.py
+++ b/eu_basynthec/dist/etc/shared/shared-classes.py
@@ -61,7 +61,8 @@ class TimeSeriesDataExcel:
             value = line[1];
             if "BLANK" == value:
                 value = None
-            metadataMap[line[0].upper()] = value
+            if line[0] is not None:
+                metadataMap[line[0].upper()] = value
         return metadataMap
 
 def create_time_series_excel(fileName):
@@ -123,14 +124,8 @@ class ValidationHelper:
 
     def validateControlledVocabularyProperty(self, property, displayName, allowedValues, allowedValuesDisplay):
         """Validate that the property is specified and in the list of allowed values"""
-        if not self.checkIsSpecified(property, displayName):
-            return
-        value = self.metadataMap.get(property).upper()
-        if value not in allowedValues:
-            if len(allowedValues) > 1:
-                self.errors.append(createFileValidationError("The " + displayName + " must be one of " + allowedValuesDisplay + " (not " + value + ")."))
-            else:
-                self.errors.append(createFileValidationError("The " + displayName + " must be " + allowedValuesDisplay + " (not " + value + ")."))
+        value = self.metadataMap.get(property)
+        isControlledVocabularyPropertyValid(value, displayName, allowedValues, allowedValuesDisplay, self.errors)
 
     def validateStartDataRowCol(self):
         if self.checkIsSpecified("START DATA ROW", "Start Data Row"):
@@ -147,6 +142,7 @@ class ValidationHelper:
 strainIdRegex = re.compile("^JJS-MGP[0-9]{1,3}|^JJS-DIN[0-9]{1,3}|^MS|CHASSIS\s*[1-3]|WT 168 TRP\+")
 def isStrainIdValid(strainId):
     """Return true if the strain id passes validation (has the form sepecified in the regex)"""
+    strainId = strainId.strip().upper()
     match = strainIdRegex.match(strainId)
     if match is None:
         return False
@@ -155,6 +151,21 @@ def isStrainIdValid(strainId):
 def strainValidationErrorMessageFragment(strain):
     """Return a sentence fragment describing the strain validation error."""
     return "must be either JJS-MGP[0-999], JJS-DIN[0-999], MS, CHASSIS [1-3], or WT 168 TRP+ (instead of " + strain + ")."
+
+def isControlledVocabularyPropertyValid(value, displayName, allowedValues, allowedValuesDisplay, errors):
+    """Validate that the property is specified and in the list of allowed values"""
+    if value is None:
+        errors.append(ValidationError.createFileValidationError("A " + displayName + " must be specified."))
+        return False
+    value = value.upper()
+    if value not in allowedValues:
+        if len(allowedValues) > 1:
+            errors.append(createFileValidationError("The " + displayName + " must be one of " + allowedValuesDisplay + " (not " + value + ")."))
+            return False
+        else:
+            errors.append(createFileValidationError("The " + displayName + " must be " + allowedValuesDisplay + " (not " + value + ")."))
+            return False
+    return True
 
 def getInitialDataRowAndCol(metadata):
     """Extract the initial row and column as specified in the metadata. Returns an array with [row, col]."""
@@ -166,12 +177,18 @@ def getInitialDataRowAndCol(metadata):
     if first_data_row is None:
         first_data_row = 0
     else:
-        first_data_row = int(float(first_data_row)) - 1
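+        # fall back to the first row if the metadata value is not numeric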
+        try:
+            first_data_row = int(float(first_data_row)) - 1
+        except:
+            first_data_row = 0
     # convert the column spreadsheet value to an int
     if first_data_col is None:
         first_data_col = 0
     else:
         # columns start at A
-        first_data_col = ord(first_data_col) - ord('A')
+        try:
+            first_data_col = ord(first_data_col) - ord('A')
+        except:
+            first_data_col = 0
     return [first_data_row, first_data_col]
 
diff --git a/eu_basynthec/sourceTest/examples/Metabolomics2-BadData.xlsx b/eu_basynthec/sourceTest/examples/Metabolomics2-BadData.xlsx
new file mode 100644
index 0000000000000000000000000000000000000000..59033160ac866fc6f6e0a7dec0af4041462a9a34
Binary files /dev/null and b/eu_basynthec/sourceTest/examples/Metabolomics2-BadData.xlsx differ
diff --git a/eu_basynthec/sourceTest/examples/Metabolomics2-Example.xlsx b/eu_basynthec/sourceTest/examples/Metabolomics2-Example.xlsx
new file mode 100644
index 0000000000000000000000000000000000000000..c088af65b2b61a70398ce1071dba9c3254fa3bd0
Binary files /dev/null and b/eu_basynthec/sourceTest/examples/Metabolomics2-Example.xlsx differ
diff --git a/eu_basynthec/sourceTest/examples/Metabolomics2-Template.xlsx b/eu_basynthec/sourceTest/examples/Metabolomics2-Template.xlsx
new file mode 100644
index 0000000000000000000000000000000000000000..bfe543475cb730817d5da050ea6235897d4c5de9
Binary files /dev/null and b/eu_basynthec/sourceTest/examples/Metabolomics2-Template.xlsx differ
diff --git a/eu_basynthec/sourceTest/examples/~$Metabolomics2-Example.xlsx b/eu_basynthec/sourceTest/examples/~$Metabolomics2-Example.xlsx
new file mode 100644
index 0000000000000000000000000000000000000000..24f29bb860948c7c5f82a89ff6bfd4ff17ce621a
Binary files /dev/null and b/eu_basynthec/sourceTest/examples/~$Metabolomics2-Example.xlsx differ
diff --git a/eu_basynthec/sourceTest/java/eu/basynthec/cisd/dss/metabolomics/MetabolomicsDataSetRegistrator2Test.java b/eu_basynthec/sourceTest/java/eu/basynthec/cisd/dss/metabolomics/MetabolomicsDataSetRegistrator2Test.java
new file mode 100644
index 0000000000000000000000000000000000000000..abefb863e1301ff65e96c04adacc6dd6425fdbd6
--- /dev/null
+++ b/eu_basynthec/sourceTest/java/eu/basynthec/cisd/dss/metabolomics/MetabolomicsDataSetRegistrator2Test.java
@@ -0,0 +1,80 @@
+/*
+ * Copyright 2011 ETH Zuerich, CISD
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package eu.basynthec.cisd.dss.metabolomics;
+
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Properties;
+
+import org.testng.annotations.Test;
+
+import ch.systemsx.cisd.common.test.RecordingMatcher;
+import ch.systemsx.cisd.openbis.generic.shared.basic.dto.DataSetType;
+import ch.systemsx.cisd.openbis.generic.shared.dto.NewExternalData;
+import ch.systemsx.cisd.openbis.generic.shared.dto.NewProperty;
+
+import eu.basynthec.cisd.dss.AbstractBaSynthecDataSetRegistratorTest;
+
+/**
+ * @author Chandrasekhar Ramakrishnan
+ */
+public class MetabolomicsDataSetRegistrator2Test extends AbstractBaSynthecDataSetRegistratorTest
+{
+    private static final DataSetType DATA_SET_TYPE = new DataSetType("METABOLITE_INTENSITIES");
+
+    @Test
+    public void testSimpleTransaction() throws IOException
+    {
+        setUpHomeDataBaseExpectations();
+        Properties properties = createThreadProperties();
+        createHandler(properties, false, true);
+        createData("Metabolomics2-Example.xlsx");
+
+        final RecordingMatcher<ch.systemsx.cisd.openbis.generic.shared.dto.AtomicEntityOperationDetails> atomicOperationDetails =
+                setUpDataSetRegistrationExpectations(DATA_SET_TYPE, TSV_DATA_SET_TYPE);
+
+        handler.handle(markerFile);
+
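+        // the handler registers three data sets: the container plus the contained original Excel and TSV export data sets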
+        assertEquals(3, atomicOperationDetails.recordedObject().getDataSetRegistrations().size());
+
+        checkDataTypeProperty(atomicOperationDetails.recordedObject().getDataSetRegistrations()
+                .get(1), "METABOLITE_INTENSITIES");
+        checkDataTypeProperty(atomicOperationDetails.recordedObject().getDataSetRegistrations()
+                .get(2), "METABOLITE_INTENSITIES");
+
+        NewExternalData dataSet =
+                atomicOperationDetails.recordedObject().getDataSetRegistrations().get(0);
+
+        assertEquals(DATA_SET_CODE, dataSet.getCode());
+        assertEquals(DATA_SET_TYPE, dataSet.getDataSetType());
+
+        HashMap<String, NewProperty> propertyMap =
+                getDataSetPropertiesMap(dataSet.getDataSetProperties());
+        NewProperty strainProperty = propertyMap.get(STRAIN_NAMES_PROP);
+
+        assertNotNull(strainProperty);
+        assert null != strainProperty;
+        assertEquals("CHASSIS 1", strainProperty.getValue());
+        context.assertIsSatisfied();
+    }
+
+    @Override
+    protected String getRegistrationScriptsFolderPath()
+    {
+        return "dist/etc/metabolomics2/";
+    }
+}
diff --git a/eu_basynthec/sourceTest/java/eu/basynthec/cisd/dss/metabolomics/MetabolomicsValidator2Test.java b/eu_basynthec/sourceTest/java/eu/basynthec/cisd/dss/metabolomics/MetabolomicsValidator2Test.java
new file mode 100644
index 0000000000000000000000000000000000000000..270c82379de8d178926897cb971fd4ba7c0b13ff
--- /dev/null
+++ b/eu_basynthec/sourceTest/java/eu/basynthec/cisd/dss/metabolomics/MetabolomicsValidator2Test.java
@@ -0,0 +1,66 @@
+/*
+ * Copyright 2011 ETH Zuerich, CISD
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package eu.basynthec.cisd.dss.metabolomics;
+
+import java.io.File;
+import java.util.List;
+
+import org.testng.AssertJUnit;
+import org.testng.annotations.Test;
+
+import ch.systemsx.cisd.openbis.dss.generic.shared.api.v1.validation.ValidationError;
+import ch.systemsx.cisd.openbis.dss.generic.shared.api.v1.validation.ValidationScriptRunner;
+
+/**
+ * @author Chandrasekhar Ramakrishnan
+ */
+public class MetabolomicsValidator2Test extends AssertJUnit
+{
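+    // shared-classes.py is listed first so that its helper functions are available to the validator script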
+    private static final String[] VALIDATION_SCRIPT_PATH = new String[]
+        { "dist/etc/shared/shared-classes.py", "dist/etc/metabolomics2/data-set-validator.py" };
+
+    @Test
+    public void testGoodData()
+    {
+        ValidationScriptRunner scriptRunner =
+                ValidationScriptRunner.createValidatorFromScriptPaths(VALIDATION_SCRIPT_PATH);
+        List<ValidationError> errors =
+                scriptRunner.validate(new File("sourceTest/examples/Metabolomics2-Example.xlsx"));
+        assertTrue("The example should have no errors", errors.isEmpty());
+    }
+
+    @Test
+    public void testTemplate()
+    {
+        ValidationScriptRunner scriptRunner =
+                ValidationScriptRunner.createValidatorFromScriptPaths(VALIDATION_SCRIPT_PATH);
+        List<ValidationError> errors =
+                scriptRunner.validate(new File("sourceTest/examples/Metabolomics2-Template.xlsx"));
+        System.out.println(errors);
+        assertEquals("The template should have seven errors", 7, errors.size());
+    }
+
+    @Test
+    public void testBadData()
+    {
+        ValidationScriptRunner scriptRunner =
+                ValidationScriptRunner.createValidatorFromScriptPaths(VALIDATION_SCRIPT_PATH);
+        List<ValidationError> errors =
+                scriptRunner.validate(new File("sourceTest/examples/Metabolomics2-BadData.xlsx"));
+        assertEquals("The bad data should have seven errors", 7, errors.size());
+    }
+}