diff --git a/eu_basynthec/dist/etc/transcriptomics/data-set-handler.py b/eu_basynthec/dist/etc/transcriptomics/data-set-handler.py index c3a30de547991465af77051e43cdf979bc753284..df05c0e01b490a91de6ddc5e4ebd0b59e2a19a22 100644 --- a/eu_basynthec/dist/etc/transcriptomics/data-set-handler.py +++ b/eu_basynthec/dist/etc/transcriptomics/data-set-handler.py @@ -2,6 +2,9 @@ from datetime import datetime from eu.basynthec.cisd.dss import TimeSeriesDataExcel import re +# A Regex for matching the column headers +header_regex = re.compile("^(.+)-([0-9]) ([0-9]+)") + def set_data_type(data_set): data_set.setPropertyValue("DATA_TYPE", "TRANSCRIPTOMICS") @@ -96,7 +99,6 @@ def convert_data_to_split_tsv(tr, start_row, start_col, dataset, location): # Extract the column / strain mapping header_line = raw_data[start_row] - header_regex = re.compile("^(MGP[0-9]{1,3})-([0-9]) ([0-9]+)") for i in range(start_col, len(header_line)): match = header_regex.match(header_line[i]) strain_name = match.group(1) @@ -149,10 +151,9 @@ def store_original_data(tr, dataset, location): tr.moveFile(incoming.getAbsolutePath(), dataset, location + "/" + incoming.getName()) def extract_strains(start_row, start_col): - """Extract the strain names from the header.""" + """Extract the strain names from the header. These have already been validated by the validator.""" strains = [] line = timeSeriesData.getRawDataLines()[start_row] - header_regex = re.compile("^(MGP[0-9]{1,3})-([0-9]) ([0-9]+)") for i in range(start_col, len(line)): match = header_regex.match(line[i]) strains.append(match.group(1)) diff --git a/eu_basynthec/dist/etc/transcriptomics/data-set-validator.py b/eu_basynthec/dist/etc/transcriptomics/data-set-validator.py index 0b7095673b65d9d2a6508ea7c8a558a771662018..ad62da996232b75a1e8323011bc24a7c3c8eca0b 100644 --- a/eu_basynthec/dist/etc/transcriptomics/data-set-validator.py +++ b/eu_basynthec/dist/etc/transcriptomics/data-set-validator.py @@ -3,11 +3,17 @@ def validate_header(line, first_data_col, errors): if line[0] != "Locustag": errors.append(createFileValidationError("The first data column must be 'Locustag' (not " + line[0] + ").")) return False - header_regex = re.compile("^.+-[0-9] [0-9]+") + header_regex = re.compile("^(.+)-([0-9]) ([0-9]+)") for i in range(first_data_col, len(line)): match = header_regex.match(line[i]) if match is None: - errors.append(createFileValidationError("The column header + " + str(i) + " must be of the form [STRAIN]-[BIOLOGICAL REPLICATE] [HYBRIDIZATION NUMBER]")) + errors.append(createFileValidationError("The column header + " + str(i) + " must be of the form [STRAIN]-[BIOLOGICAL REPLICATE] [HYBRIDIZATION NUMBER]. " + line[i] + " is not.")) + continue + strainName = match.group(1) + if isStrainIdValid(strainName) is False: + errors.append(createFileValidationError("The column header + " + str(i) + " must be of the form [STRAIN]-[BIOLOGICAL REPLICATE] [HYBRIDIZATION NUMBER]. " + strainName + " is not a recognized strain.")) + continue + def validate_data(time_series_data, first_data_row, first_data_col, errors):