From b881ac4019a5c48faa3d2cc85402fd13e17a1cdb Mon Sep 17 00:00:00 2001 From: cramakri <cramakri> Date: Mon, 21 Nov 2011 09:54:49 +0000 Subject: [PATCH] LMS-2631 Fixed importer. SVN: 23737 --- .../etc/metabolomics2/data-set-handler.py | 116 +++++++++++++++++- .../cisd/dss/TimeSeriesDataExcel.java | 5 + .../examples/~$Metabolomics2-Example.xlsx | Bin 171 -> 0 bytes .../MetabolomicsDataSetRegistrator2Test.java | 9 +- 4 files changed, 121 insertions(+), 9 deletions(-) delete mode 100644 eu_basynthec/sourceTest/examples/~$Metabolomics2-Example.xlsx diff --git a/eu_basynthec/dist/etc/metabolomics2/data-set-handler.py b/eu_basynthec/dist/etc/metabolomics2/data-set-handler.py index eee034223fc..a3e3150cb30 100644 --- a/eu_basynthec/dist/etc/metabolomics2/data-set-handler.py +++ b/eu_basynthec/dist/etc/metabolomics2/data-set-handler.py @@ -3,6 +3,26 @@ from eu.basynthec.cisd.dss import TimeSeriesDataExcel def set_data_type(data_set): data_set.setPropertyValue("DATA_TYPE", "METABOLITE_INTENSITIES") + +def getInitialDataRowAndCol(metadata): + """Extract the initial row and column as specified in the metadata. Returns an array with [row, col].""" + # get the raw value from the map + first_data_row = metadata.get("START DATA ROW") + first_data_col = metadata.get("START DATA COL") + + # convert the row numeric string to an int + if first_data_row is None: + first_data_row = 0 + else: + first_data_row = int(float(first_data_row)) - 1 + + # convert the column spreadsheet value to an int + if first_data_col is None: + first_data_col = 0 + else: + # columns start at A + first_data_col = ord(first_data_col) - ord('A') + return [first_data_row, first_data_col] def retrieve_experiment(tr, exp_id): """Get the specified experiment form the server. Return the experiment.""" @@ -15,11 +35,11 @@ def retrieve_experiment(tr, exp_id): def assign_properties(dataset, metadata): """Assign properties to the data set from information in the data.""" propertyNameMap = { - "STRAIN":"STRAIN_NAMES", + "STRAIN_NAMES":"STRAIN_NAMES", "TIMEPOINT TYPE": "TIMEPOINT_TYPE", "CELL LOCATION": "CELL_LOCATION", - "VALUE TYPE": "VALUE_TYPE", - "VALUE UNIT": "VALUE_UNIT", + "VALUE TYPE": "VALUE_TYPES", + "VALUE UNIT": "VALUE_UNITS", "SCALE": "SCALE" } @@ -36,24 +56,110 @@ def convert_data_to_tsv(tr, dataset, location): tsv = open(tsvFileName, 'w') for line in timeSeriesData.getRawDataLines(): for i in range(0, len(line) - 1): - tsv.write(line[i]) + field = line[i] + if field is None: + field = "" + tsv.write(field) tsv.write("\t") tsv.write(line[len(line) - 1]) tsv.write("\n") tsv.close() +class SplitColumnInfo: + """ + A class that stores, for each column in the file, the column number, the strain name, + the biological replicate, the hybridization number, and the column offset in the resulting file + """ + def __init__(self, column, strain_name, value_type, value_unit, output_col): + self.column = column + self.strain_name = strain_name + self.value_type = value_type + self.value_unit = value_unit + self.output_col = output_col + + tsv = None + +def convert_data_to_split_tsv(tr, start_row, start_col, dataset, location): + """Create one tsv file per strain in the original data.""" + raw_data = timeSeriesData.getRawDataLines() + + # Keep track of the mapping from columns to strains and strains to columns + column_infos = [] + strain_column_info = {} + + # Extract the column / strain mapping + header_line = raw_data[start_row] + header_regex = re.compile("^(MGP[0-9]{1,3})-([0-9]) ([0-9]+)") + for i in range(start_col, len(header_line)): + match = header_regex.match(header_line[i]) + strain_name = match.group(1) + strain_cols = strain_column_info.setdefault(strain_name, []) + column_info = SplitColumnInfo(i, strain_name, match.group(2), match.group(3), len(strain_cols)) + strain_cols.append(column_info) + column_infos.append(column_info) + + # create the files + tr.createNewDirectory(dataset, location) + for strain in strain_column_info.iterkeys(): + tsvFileName = tr.createNewFile(dataset, location, incoming.getName() + "_" + strain + ".tsv") + tsv = open(tsvFileName, 'w') + for column_info in strain_column_info[strain]: + column_info.tsv = tsv + + # Write the header + line = raw_data[start_row] + tag = line[0] + # write the first column to each file + for strain in strain_column_info.iterkeys(): + strain_column_info[strain][0].tsv.write(tag) + + for column_info in column_infos: + column_info.tsv.write('\t') + column_info.tsv.write(column_info.bio_replicate) + column_info.tsv.write(' ') + column_info.tsv.write(column_info.hybrid_number) + + # Write the data to the files + for i in range(start_row + 1, len(raw_data)): + line = raw_data[i] + tag = line[0] + for strain in strain_column_info.iterkeys(): + strain_column_info[strain][0].tsv.write('\n') + # write the first column to each file + strain_column_info[strain][0].tsv.write(tag) + # Write the remaining data to each file + for column_info in column_infos: + column_info.tsv.write('\t') + column_info.tsv.write(line[column_info.column]) + + # Close each file + for strain in strain_column_info.iterkeys(): + strain_column_info[strain][0].tsv.close() + def store_original_data(tr, dataset, location): """Put the original data into the data set.""" tr.createNewDirectory(dataset, location) tr.moveFile(incoming.getAbsolutePath(), dataset, location + "/" + incoming.getName()) + +def extract_strains(start_row, start_col): + """Extract the strain names from the header.""" + strains = [] + line = timeSeriesData.getRawDataLines()[0] + for i in range(start_col, len(line)): + strain = line[i] + if (strain not in strains): + strains.append(strain) + return ",".join(strains) tr = service.transaction(incoming) timeSeriesData = TimeSeriesDataExcel.createTimeSeriesDataExcel(incoming.getAbsolutePath()) +dataStart = getInitialDataRowAndCol(timeSeriesData.getMetadataMap()) # create the data set and assign the metadata from the file -dataset = tr.createNewDataSet("METABOLITE_INTENSITIES") +dataset = tr.createNewDataSet("METABOLITE_INTENSITIES_GROUPED") metadata = timeSeriesData.getMetadataMap() +metadata["STRAIN_NAMES"] = extract_strains(dataStart[0], dataStart[1]) assign_properties(dataset, metadata) # Store the original and tsv data in data sets diff --git a/eu_basynthec/source/java/eu/basynthec/cisd/dss/TimeSeriesDataExcel.java b/eu_basynthec/source/java/eu/basynthec/cisd/dss/TimeSeriesDataExcel.java index 8f23b0401ad..811e27e8493 100644 --- a/eu_basynthec/source/java/eu/basynthec/cisd/dss/TimeSeriesDataExcel.java +++ b/eu_basynthec/source/java/eu/basynthec/cisd/dss/TimeSeriesDataExcel.java @@ -133,6 +133,11 @@ public class TimeSeriesDataExcel for (int i = 1; i < metadataLines.size(); ++i) { String[] line = metadataLines.get(i); + String key = line[0]; + if (key == null) + { + continue; + } String value = line[1]; if ("BLANK".equals(value)) { diff --git a/eu_basynthec/sourceTest/examples/~$Metabolomics2-Example.xlsx b/eu_basynthec/sourceTest/examples/~$Metabolomics2-Example.xlsx deleted file mode 100644 index 24f29bb860948c7c5f82a89ff6bfd4ff17ce621a..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 171 mcmZQe(M>8YPE{ZgurZ`C=rSZR6a!%@g944Az^uDIin#y}(h}+b diff --git a/eu_basynthec/sourceTest/java/eu/basynthec/cisd/dss/metabolomics/MetabolomicsDataSetRegistrator2Test.java b/eu_basynthec/sourceTest/java/eu/basynthec/cisd/dss/metabolomics/MetabolomicsDataSetRegistrator2Test.java index abefb863e13..7ebd84303e6 100644 --- a/eu_basynthec/sourceTest/java/eu/basynthec/cisd/dss/metabolomics/MetabolomicsDataSetRegistrator2Test.java +++ b/eu_basynthec/sourceTest/java/eu/basynthec/cisd/dss/metabolomics/MetabolomicsDataSetRegistrator2Test.java @@ -34,7 +34,8 @@ import eu.basynthec.cisd.dss.AbstractBaSynthecDataSetRegistratorTest; */ public class MetabolomicsDataSetRegistrator2Test extends AbstractBaSynthecDataSetRegistratorTest { - private static final DataSetType DATA_SET_TYPE = new DataSetType("METABOLITE_INTENSITIES"); + private static final DataSetType DATA_SET_TYPE = new DataSetType( + "METABOLITE_INTENSITIES_GROUPED"); @Test public void testSimpleTransaction() throws IOException @@ -42,7 +43,7 @@ public class MetabolomicsDataSetRegistrator2Test extends AbstractBaSynthecDataSe setUpHomeDataBaseExpectations(); Properties properties = createThreadProperties(); createHandler(properties, false, true); - createData("Metabolomics-Example.xlsx"); + createData("Metabolomics2-Example.xlsx"); final RecordingMatcher<ch.systemsx.cisd.openbis.generic.shared.dto.AtomicEntityOperationDetails> atomicOperationDetails = setUpDataSetRegistrationExpectations(DATA_SET_TYPE, TSV_DATA_SET_TYPE); @@ -68,13 +69,13 @@ public class MetabolomicsDataSetRegistrator2Test extends AbstractBaSynthecDataSe assertNotNull(strainProperty); assert null != strainProperty; - assertEquals("CHASSIS 1", strainProperty.getValue()); + assertEquals("CHASSIS 1,JJS-MGP192", strainProperty.getValue()); context.assertIsSatisfied(); } @Override protected String getRegistrationScriptsFolderPath() { - return "dist/etc/metabolomics/"; + return "dist/etc/metabolomics2/"; } } -- GitLab