From e4e156f7ca7d7358ee1565a1119e5fdca4ec29c3 Mon Sep 17 00:00:00 2001
From: cramakri <cramakri>
Date: Wed, 7 Dec 2011 09:54:20 +0000
Subject: [PATCH] SE-391 Improvements to transcriptomics handler and validator

SVN: 23903
---
 .../dist/etc/transcriptomics/data-set-handler.py       |  7 ++++---
 .../dist/etc/transcriptomics/data-set-validator.py     | 10 ++++++++--
 2 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/eu_basynthec/dist/etc/transcriptomics/data-set-handler.py b/eu_basynthec/dist/etc/transcriptomics/data-set-handler.py
index c3a30de5479..df05c0e01b4 100644
--- a/eu_basynthec/dist/etc/transcriptomics/data-set-handler.py
+++ b/eu_basynthec/dist/etc/transcriptomics/data-set-handler.py
@@ -2,6 +2,9 @@ from datetime import datetime
 from eu.basynthec.cisd.dss import TimeSeriesDataExcel
 import re
 
+# A Regex for matching the column headers
+header_regex = re.compile("^(.+)-([0-9]) ([0-9]+)")
+
 def set_data_type(data_set):
   data_set.setPropertyValue("DATA_TYPE", "TRANSCRIPTOMICS")
 
@@ -96,7 +99,6 @@ def convert_data_to_split_tsv(tr, start_row, start_col, dataset, location):
   
   # Extract the column / strain mapping
   header_line = raw_data[start_row]
-  header_regex = re.compile("^(MGP[0-9]{1,3})-([0-9]) ([0-9]+)")
   for i in range(start_col, len(header_line)):
     match = header_regex.match(header_line[i])
     strain_name = match.group(1)
@@ -149,10 +151,9 @@ def store_original_data(tr, dataset, location):
   tr.moveFile(incoming.getAbsolutePath(), dataset, location + "/" + incoming.getName())
 
 def extract_strains(start_row, start_col):
-  """Extract the strain names from the header."""
+  """Extract the strain names from the header. These have already been validated by the validator."""
   strains = []
   line = timeSeriesData.getRawDataLines()[start_row]
-  header_regex = re.compile("^(MGP[0-9]{1,3})-([0-9]) ([0-9]+)")
   for i in range(start_col, len(line)):
     match = header_regex.match(line[i])
     strains.append(match.group(1))
diff --git a/eu_basynthec/dist/etc/transcriptomics/data-set-validator.py b/eu_basynthec/dist/etc/transcriptomics/data-set-validator.py
index 0b7095673b6..ad62da99623 100644
--- a/eu_basynthec/dist/etc/transcriptomics/data-set-validator.py
+++ b/eu_basynthec/dist/etc/transcriptomics/data-set-validator.py
@@ -3,11 +3,17 @@ def validate_header(line, first_data_col, errors):
   if line[0] != "Locustag":
     errors.append(createFileValidationError("The first data column must be 'Locustag' (not " + line[0] + ")."))
     return False
-  header_regex = re.compile("^.+-[0-9] [0-9]+")
+  header_regex = re.compile("^(.+)-([0-9]) ([0-9]+)")
   for i in range(first_data_col, len(line)):
     match = header_regex.match(line[i])
     if match is None:
-      errors.append(createFileValidationError("The column header + " + str(i) + " must be of the form [STRAIN]-[BIOLOGICAL REPLICATE] [HYBRIDIZATION NUMBER]"))
+      errors.append(createFileValidationError("The column header + " + str(i) + " must be of the form [STRAIN]-[BIOLOGICAL REPLICATE] [HYBRIDIZATION NUMBER]. " + line[i] + " is not."))
+      continue
+    strainName = match.group(1)
+    if isStrainIdValid(strainName) is False:
+      errors.append(createFileValidationError("The column header + " + str(i) + " must be of the form [STRAIN]-[BIOLOGICAL REPLICATE] [HYBRIDIZATION NUMBER]. " + strainName + " is not a recognized strain."))
+      continue      
+    
 
 
 def validate_data(time_series_data, first_data_row, first_data_col, errors):
-- 
GitLab