Skip to content
Snippets Groups Projects
Commit e4e156f7 authored by cramakri's avatar cramakri
Browse files

SE-391 Improvements to transcriptomics handler and validator

SVN: 23903
parent a32b965b
No related branches found
No related tags found
No related merge requests found
...@@ -2,6 +2,9 @@ from datetime import datetime ...@@ -2,6 +2,9 @@ from datetime import datetime
from eu.basynthec.cisd.dss import TimeSeriesDataExcel from eu.basynthec.cisd.dss import TimeSeriesDataExcel
import re import re
# A Regex for matching the column headers
header_regex = re.compile("^(.+)-([0-9]) ([0-9]+)")
def set_data_type(data_set): def set_data_type(data_set):
data_set.setPropertyValue("DATA_TYPE", "TRANSCRIPTOMICS") data_set.setPropertyValue("DATA_TYPE", "TRANSCRIPTOMICS")
...@@ -96,7 +99,6 @@ def convert_data_to_split_tsv(tr, start_row, start_col, dataset, location): ...@@ -96,7 +99,6 @@ def convert_data_to_split_tsv(tr, start_row, start_col, dataset, location):
# Extract the column / strain mapping # Extract the column / strain mapping
header_line = raw_data[start_row] header_line = raw_data[start_row]
header_regex = re.compile("^(MGP[0-9]{1,3})-([0-9]) ([0-9]+)")
for i in range(start_col, len(header_line)): for i in range(start_col, len(header_line)):
match = header_regex.match(header_line[i]) match = header_regex.match(header_line[i])
strain_name = match.group(1) strain_name = match.group(1)
...@@ -149,10 +151,9 @@ def store_original_data(tr, dataset, location): ...@@ -149,10 +151,9 @@ def store_original_data(tr, dataset, location):
tr.moveFile(incoming.getAbsolutePath(), dataset, location + "/" + incoming.getName()) tr.moveFile(incoming.getAbsolutePath(), dataset, location + "/" + incoming.getName())
def extract_strains(start_row, start_col): def extract_strains(start_row, start_col):
"""Extract the strain names from the header.""" """Extract the strain names from the header. These have already been validated by the validator."""
strains = [] strains = []
line = timeSeriesData.getRawDataLines()[start_row] line = timeSeriesData.getRawDataLines()[start_row]
header_regex = re.compile("^(MGP[0-9]{1,3})-([0-9]) ([0-9]+)")
for i in range(start_col, len(line)): for i in range(start_col, len(line)):
match = header_regex.match(line[i]) match = header_regex.match(line[i])
strains.append(match.group(1)) strains.append(match.group(1))
......
...@@ -3,11 +3,17 @@ def validate_header(line, first_data_col, errors): ...@@ -3,11 +3,17 @@ def validate_header(line, first_data_col, errors):
if line[0] != "Locustag": if line[0] != "Locustag":
errors.append(createFileValidationError("The first data column must be 'Locustag' (not " + line[0] + ").")) errors.append(createFileValidationError("The first data column must be 'Locustag' (not " + line[0] + ")."))
return False return False
header_regex = re.compile("^.+-[0-9] [0-9]+") header_regex = re.compile("^(.+)-([0-9]) ([0-9]+)")
for i in range(first_data_col, len(line)): for i in range(first_data_col, len(line)):
match = header_regex.match(line[i]) match = header_regex.match(line[i])
if match is None: if match is None:
errors.append(createFileValidationError("The column header + " + str(i) + " must be of the form [STRAIN]-[BIOLOGICAL REPLICATE] [HYBRIDIZATION NUMBER]")) errors.append(createFileValidationError("The column header + " + str(i) + " must be of the form [STRAIN]-[BIOLOGICAL REPLICATE] [HYBRIDIZATION NUMBER]. " + line[i] + " is not."))
continue
strainName = match.group(1)
if isStrainIdValid(strainName) is False:
errors.append(createFileValidationError("The column header + " + str(i) + " must be of the form [STRAIN]-[BIOLOGICAL REPLICATE] [HYBRIDIZATION NUMBER]. " + strainName + " is not a recognized strain."))
continue
def validate_data(time_series_data, first_data_row, first_data_col, errors): def validate_data(time_series_data, first_data_row, first_data_col, errors):
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment