SE-391 Improvements to transcriptomics handler and validator

SVN: 23903

SE-391 Improvements to transcriptomics handler and validator
e4e156f7 · cramakri · a32b965b · e4e156f7 · e4e156f7
Commit e4e156f7 authored 13 years ago by cramakri
--- a/eu_basynthec/dist/etc/transcriptomics/data-set-handler.py
+++ b/eu_basynthec/dist/etc/transcriptomics/data-set-handler.py
@@ -2,6 +2,9 @@ from datetime import datetime
 from eu.basynthec.cisd.dss import TimeSeriesDataExcel
 import re
+# A Regex for matching the column headers
+header_regex = re.compile("^(.+)-([0-9]) ([0-9]+)")
 def set_data_type(data_set):
  data_set.setPropertyValue("DATA_TYPE", "TRANSCRIPTOMICS")
@@ -96,7 +99,6 @@ def convert_data_to_split_tsv(tr, start_row, start_col, dataset, location):
  # Extract the column / strain mapping
  header_line = raw_data[start_row]
-  header_regex = re.compile("^(MGP[0-9]{1,3})-([0-9]) ([0-9]+)")
  for i in range(start_col, len(header_line)):
    match = header_regex.match(header_line[i])
    strain_name = match.group(1)
@@ -149,10 +151,9 @@ def store_original_data(tr, dataset, location):
  tr.moveFile(incoming.getAbsolutePath(), dataset, location + "/" + incoming.getName())
 def extract_strains(start_row, start_col):
-  """Extract the strain names from the header."""
+  """Extract the strain names from the header. These have already been validated by the validator."""
  strains = []
  line = timeSeriesData.getRawDataLines()[start_row]
-  header_regex = re.compile("^(MGP[0-9]{1,3})-([0-9]) ([0-9]+)")
  for i in range(start_col, len(line)):
    match = header_regex.match(line[i])
    strains.append(match.group(1))

--- a/eu_basynthec/dist/etc/transcriptomics/data-set-validator.py
+++ b/eu_basynthec/dist/etc/transcriptomics/data-set-validator.py
@@ -3,11 +3,17 @@ def validate_header(line, first_data_col, errors):
  if line[0] != "Locustag":
    errors.append(createFileValidationError("The first data column must be 'Locustag' (not " + line[0] + ")."))
    return False
-  header_regex = re.compile("^.+-[0-9] [0-9]+")
+  header_regex = re.compile("^(.+)-([0-9]) ([0-9]+)")
  for i in range(first_data_col, len(line)):
    match = header_regex.match(line[i])
    if match is None:
-      errors.append(createFileValidationError("The column header + " + str(i) + " must be of the form [STRAIN]-[BIOLOGICAL REPLICATE] [HYBRIDIZATION NUMBER]"))
+      errors.append(createFileValidationError("The column header + " + str(i) + " must be of the form [STRAIN]-[BIOLOGICAL REPLICATE] [HYBRIDIZATION NUMBER]. " + line[i] + " is not."))
+      continue
+    strainName = match.group(1)
+    if isStrainIdValid(strainName) is False:
+      errors.append(createFileValidationError("The column header + " + str(i) + " must be of the form [STRAIN]-[BIOLOGICAL REPLICATE] [HYBRIDIZATION NUMBER]. " + strainName + " is not a recognized strain."))
+      continue      
 def validate_data(time_series_data, first_data_row, first_data_col, errors):