From e4e156f7ca7d7358ee1565a1119e5fdca4ec29c3 Mon Sep 17 00:00:00 2001 From: cramakri <cramakri> Date: Wed, 7 Dec 2011 09:54:20 +0000 Subject: [PATCH] SE-391 Improvements to transcriptomics handler and validator SVN: 23903 --- .../dist/etc/transcriptomics/data-set-handler.py | 7 ++++--- .../dist/etc/transcriptomics/data-set-validator.py | 10 ++++++++-- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/eu_basynthec/dist/etc/transcriptomics/data-set-handler.py b/eu_basynthec/dist/etc/transcriptomics/data-set-handler.py index c3a30de5479..df05c0e01b4 100644 --- a/eu_basynthec/dist/etc/transcriptomics/data-set-handler.py +++ b/eu_basynthec/dist/etc/transcriptomics/data-set-handler.py @@ -2,6 +2,9 @@ from datetime import datetime from eu.basynthec.cisd.dss import TimeSeriesDataExcel import re +# A Regex for matching the column headers +header_regex = re.compile("^(.+)-([0-9]) ([0-9]+)") + def set_data_type(data_set): data_set.setPropertyValue("DATA_TYPE", "TRANSCRIPTOMICS") @@ -96,7 +99,6 @@ def convert_data_to_split_tsv(tr, start_row, start_col, dataset, location): # Extract the column / strain mapping header_line = raw_data[start_row] - header_regex = re.compile("^(MGP[0-9]{1,3})-([0-9]) ([0-9]+)") for i in range(start_col, len(header_line)): match = header_regex.match(header_line[i]) strain_name = match.group(1) @@ -149,10 +151,9 @@ def store_original_data(tr, dataset, location): tr.moveFile(incoming.getAbsolutePath(), dataset, location + "/" + incoming.getName()) def extract_strains(start_row, start_col): - """Extract the strain names from the header.""" + """Extract the strain names from the header. 
These have already been validated by the validator.""" strains = [] line = timeSeriesData.getRawDataLines()[start_row] - header_regex = re.compile("^(MGP[0-9]{1,3})-([0-9]) ([0-9]+)") for i in range(start_col, len(line)): match = header_regex.match(line[i]) strains.append(match.group(1)) diff --git a/eu_basynthec/dist/etc/transcriptomics/data-set-validator.py b/eu_basynthec/dist/etc/transcriptomics/data-set-validator.py index 0b7095673b6..ad62da99623 100644 --- a/eu_basynthec/dist/etc/transcriptomics/data-set-validator.py +++ b/eu_basynthec/dist/etc/transcriptomics/data-set-validator.py @@ -3,11 +3,17 @@ def validate_header(line, first_data_col, errors): if line[0] != "Locustag": errors.append(createFileValidationError("The first data column must be 'Locustag' (not " + line[0] + ").")) return False - header_regex = re.compile("^.+-[0-9] [0-9]+") + header_regex = re.compile("^(.+)-([0-9]) ([0-9]+)") for i in range(first_data_col, len(line)): match = header_regex.match(line[i]) if match is None: - errors.append(createFileValidationError("The column header + " + str(i) + " must be of the form [STRAIN]-[BIOLOGICAL REPLICATE] [HYBRIDIZATION NUMBER]")) + errors.append(createFileValidationError("The column header + " + str(i) + " must be of the form [STRAIN]-[BIOLOGICAL REPLICATE] [HYBRIDIZATION NUMBER]. " + line[i] + " is not.")) + continue + strainName = match.group(1) + if isStrainIdValid(strainName) is False: + errors.append(createFileValidationError("The column header + " + str(i) + " must be of the form [STRAIN]-[BIOLOGICAL REPLICATE] [HYBRIDIZATION NUMBER]. " + strainName + " is not a recognized strain.")) + continue + def validate_data(time_series_data, first_data_row, first_data_col, errors): -- GitLab