From b881ac4019a5c48faa3d2cc85402fd13e17a1cdb Mon Sep 17 00:00:00 2001
From: cramakri <cramakri>
Date: Mon, 21 Nov 2011 09:54:49 +0000
Subject: [PATCH] LMS-2631 Fixed importer.

SVN: 23737
---
 .../etc/metabolomics2/data-set-handler.py     | 116 +++++++++++++++++-
 .../cisd/dss/TimeSeriesDataExcel.java         |   5 +
 .../examples/~$Metabolomics2-Example.xlsx     | Bin 171 -> 0 bytes
 .../MetabolomicsDataSetRegistrator2Test.java  |   9 +-
 4 files changed, 121 insertions(+), 9 deletions(-)
 delete mode 100644 eu_basynthec/sourceTest/examples/~$Metabolomics2-Example.xlsx

diff --git a/eu_basynthec/dist/etc/metabolomics2/data-set-handler.py b/eu_basynthec/dist/etc/metabolomics2/data-set-handler.py
index eee034223fc..a3e3150cb30 100644
--- a/eu_basynthec/dist/etc/metabolomics2/data-set-handler.py
+++ b/eu_basynthec/dist/etc/metabolomics2/data-set-handler.py
@@ -3,6 +3,26 @@ from eu.basynthec.cisd.dss import TimeSeriesDataExcel
 
 def set_data_type(data_set):
   data_set.setPropertyValue("DATA_TYPE", "METABOLITE_INTENSITIES")
+  
+def getInitialDataRowAndCol(metadata):
+  """Extract the initial row and column as specified in the metadata. Returns an array with [row, col]."""
+  # get the raw value from the map
+  first_data_row = metadata.get("START DATA ROW")
+  first_data_col = metadata.get("START DATA COL")
+
+  # convert the row numeric string to an int
+  if first_data_row is None:
+    first_data_row = 0
+  else:
+    first_data_row = int(float(first_data_row)) - 1
+
+  # convert the column spreadsheet value to an int
+  if first_data_col is None:
+    first_data_col = 0
+  else:
+    # columns start at A
+    first_data_col = ord(first_data_col) - ord('A')
+  return [first_data_row, first_data_col]
 
 def retrieve_experiment(tr, exp_id):
   """Get the specified experiment form the server. Return the experiment."""
@@ -15,11 +35,11 @@ def retrieve_experiment(tr, exp_id):
 def assign_properties(dataset, metadata):
   """Assign properties to the data set from information in the data."""
   propertyNameMap = {
-    "STRAIN":"STRAIN_NAMES", 
+    "STRAIN_NAMES":"STRAIN_NAMES", 
     "TIMEPOINT TYPE": "TIMEPOINT_TYPE", 
     "CELL LOCATION": "CELL_LOCATION", 
-    "VALUE TYPE": "VALUE_TYPE", 
-    "VALUE UNIT": "VALUE_UNIT", 
+    "VALUE TYPE": "VALUE_TYPES", 
+    "VALUE UNIT": "VALUE_UNITS", 
     "SCALE": "SCALE"
     }
     
@@ -36,24 +56,110 @@ def convert_data_to_tsv(tr, dataset, location):
   tsv = open(tsvFileName, 'w')
   for line in timeSeriesData.getRawDataLines():
     for i in range(0, len(line) - 1):
-      tsv.write(line[i])
+      field = line[i]
+      if field is None:
+        field = ""
+      tsv.write(field)
       tsv.write("\t")
     tsv.write(line[len(line) - 1])
     tsv.write("\n")
   tsv.close()
   
+class SplitColumnInfo:
+  """
+    A class that stores, for each column in the file, the column number, the strain name,
+    the biological replicate, the hybridization number, and the column offset in the resulting file
+  """
+  def __init__(self, column, strain_name, bio_replicate, hybrid_number, output_col):
+    self.column = column
+    self.strain_name = strain_name
+    self.bio_replicate = bio_replicate
+    self.hybrid_number = hybrid_number
+    self.output_col = output_col
+    
+  tsv = None
+  
+def convert_data_to_split_tsv(tr, start_row, start_col, dataset, location):
+  """Create one tsv file per strain in the original data."""
+  raw_data = timeSeriesData.getRawDataLines()
+  
+  # Keep track of the mapping from columns to strains and strains to columns
+  column_infos = []
+  strain_column_info = {}
+  
+  # Extract the column / strain mapping
+  header_line = raw_data[start_row]
+  header_regex = re.compile("^(MGP[0-9]{1,3})-([0-9]) ([0-9]+)")
+  for i in range(start_col, len(header_line)):
+    match = header_regex.match(header_line[i])
+    strain_name = match.group(1)
+    strain_cols = strain_column_info.setdefault(strain_name, [])
+    column_info = SplitColumnInfo(i, strain_name, match.group(2), match.group(3), len(strain_cols))
+    strain_cols.append(column_info)
+    column_infos.append(column_info)
+    
+  # create the files
+  tr.createNewDirectory(dataset, location)
+  for strain in strain_column_info.iterkeys():
+    tsvFileName = tr.createNewFile(dataset, location, incoming.getName() + "_" + strain + ".tsv")
+    tsv = open(tsvFileName, 'w')
+    for column_info in strain_column_info[strain]:
+      column_info.tsv = tsv
+      
+  # Write the header
+  line = raw_data[start_row]
+  tag = line[0]
+  # write the first column to each file
+  for strain in strain_column_info.iterkeys():
+    strain_column_info[strain][0].tsv.write(tag)
+    
+  for column_info in column_infos:
+    column_info.tsv.write('\t')
+    column_info.tsv.write(column_info.bio_replicate)
+    column_info.tsv.write(' ')
+    column_info.tsv.write(column_info.hybrid_number)
+
+  # Write the data to the files
+  for i in range(start_row + 1, len(raw_data)):
+    line = raw_data[i]
+    tag = line[0]
+    for strain in strain_column_info.iterkeys():
+      strain_column_info[strain][0].tsv.write('\n')
+      # write the first column to each file
+      strain_column_info[strain][0].tsv.write(tag)
+    # Write the remaining data to each file
+    for column_info in column_infos:
+      column_info.tsv.write('\t')
+      column_info.tsv.write(line[column_info.column] or "")
+
+  # Close each file
+  for strain in strain_column_info.iterkeys():
+    strain_column_info[strain][0].tsv.close()
+  
 def store_original_data(tr, dataset, location):
   """Put the original data into the data set."""
   tr.createNewDirectory(dataset, location)
   tr.moveFile(incoming.getAbsolutePath(), dataset, location + "/" + incoming.getName())
+  
+def extract_strains(start_row, start_col):
+  """Extract the strain names from the header."""
+  strains = []
+  line = timeSeriesData.getRawDataLines()[start_row]
+  for i in range(start_col, len(line)):
+    strain = line[i]
+    if (strain not in strains):
+      strains.append(strain)
+  return ",".join(strains)
 
 
 tr = service.transaction(incoming)
 timeSeriesData = TimeSeriesDataExcel.createTimeSeriesDataExcel(incoming.getAbsolutePath())
+dataStart = getInitialDataRowAndCol(timeSeriesData.getMetadataMap())
 
 # create the data set and assign the metadata from the file
-dataset = tr.createNewDataSet("METABOLITE_INTENSITIES")
+dataset = tr.createNewDataSet("METABOLITE_INTENSITIES_GROUPED")
 metadata = timeSeriesData.getMetadataMap()
+metadata["STRAIN_NAMES"] = extract_strains(dataStart[0], dataStart[1])
 assign_properties(dataset, metadata)
     
 # Store the original and tsv data in data sets                                                                                                                    
diff --git a/eu_basynthec/source/java/eu/basynthec/cisd/dss/TimeSeriesDataExcel.java b/eu_basynthec/source/java/eu/basynthec/cisd/dss/TimeSeriesDataExcel.java
index 8f23b0401ad..811e27e8493 100644
--- a/eu_basynthec/source/java/eu/basynthec/cisd/dss/TimeSeriesDataExcel.java
+++ b/eu_basynthec/source/java/eu/basynthec/cisd/dss/TimeSeriesDataExcel.java
@@ -133,6 +133,11 @@ public class TimeSeriesDataExcel
         for (int i = 1; i < metadataLines.size(); ++i)
         {
             String[] line = metadataLines.get(i);
+            String key = line[0];
+            if (key == null)
+            {
+                continue;
+            }
             String value = line[1];
             if ("BLANK".equals(value))
             {
diff --git a/eu_basynthec/sourceTest/examples/~$Metabolomics2-Example.xlsx b/eu_basynthec/sourceTest/examples/~$Metabolomics2-Example.xlsx
deleted file mode 100644
index 24f29bb860948c7c5f82a89ff6bfd4ff17ce621a..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 171
mcmZQe(M>8YPE{ZgurZ`C=rSZR6a!%@g944Az^uDIin#y}(h}+b

diff --git a/eu_basynthec/sourceTest/java/eu/basynthec/cisd/dss/metabolomics/MetabolomicsDataSetRegistrator2Test.java b/eu_basynthec/sourceTest/java/eu/basynthec/cisd/dss/metabolomics/MetabolomicsDataSetRegistrator2Test.java
index abefb863e13..7ebd84303e6 100644
--- a/eu_basynthec/sourceTest/java/eu/basynthec/cisd/dss/metabolomics/MetabolomicsDataSetRegistrator2Test.java
+++ b/eu_basynthec/sourceTest/java/eu/basynthec/cisd/dss/metabolomics/MetabolomicsDataSetRegistrator2Test.java
@@ -34,7 +34,8 @@ import eu.basynthec.cisd.dss.AbstractBaSynthecDataSetRegistratorTest;
  */
 public class MetabolomicsDataSetRegistrator2Test extends AbstractBaSynthecDataSetRegistratorTest
 {
-    private static final DataSetType DATA_SET_TYPE = new DataSetType("METABOLITE_INTENSITIES");
+    private static final DataSetType DATA_SET_TYPE = new DataSetType(
+            "METABOLITE_INTENSITIES_GROUPED");
 
     @Test
     public void testSimpleTransaction() throws IOException
@@ -42,7 +43,7 @@ public class MetabolomicsDataSetRegistrator2Test extends AbstractBaSynthecDataSe
         setUpHomeDataBaseExpectations();
         Properties properties = createThreadProperties();
         createHandler(properties, false, true);
-        createData("Metabolomics-Example.xlsx");
+        createData("Metabolomics2-Example.xlsx");
 
         final RecordingMatcher<ch.systemsx.cisd.openbis.generic.shared.dto.AtomicEntityOperationDetails> atomicOperationDetails =
                 setUpDataSetRegistrationExpectations(DATA_SET_TYPE, TSV_DATA_SET_TYPE);
@@ -68,13 +69,13 @@ public class MetabolomicsDataSetRegistrator2Test extends AbstractBaSynthecDataSe
 
         assertNotNull(strainProperty);
         assert null != strainProperty;
-        assertEquals("CHASSIS 1", strainProperty.getValue());
+        assertEquals("CHASSIS 1,JJS-MGP192", strainProperty.getValue());
         context.assertIsSatisfied();
     }
 
     @Override
     protected String getRegistrationScriptsFolderPath()
     {
-        return "dist/etc/metabolomics/";
+        return "dist/etc/metabolomics2/";
     }
 }
-- 
GitLab