diff --git a/screening/etc/data-set-handler.py b/screening/etc/data-set-handler.py index a08bd028e6fa1382b20d49676f66777fa239f076..77255fd2416bee93e9259a58fe39ea897721d503 100755 --- a/screening/etc/data-set-handler.py +++ b/screening/etc/data-set-handler.py @@ -2,54 +2,138 @@ # This is an example Jython dropbox for importing HCS image datasets import os -import ch.systemsx.cisd.openbis.generic.shared.dto.identifier as identifier +import random + +from ch.systemsx.cisd.openbis.generic.shared.dto.identifier import SampleIdentifier import ch.systemsx.cisd.openbis.generic.shared.basic.dto as dto +from ch.systemsx.cisd.openbis.generic.shared.basic.dto import FileFormatType, DataSetType, SampleType, NewSample from ch.systemsx.cisd.openbis.dss.etl.dto.api.v1 import ImageFileInfo, ImageDataSetInformation, Channel +from ch.systemsx.cisd.openbis.dss.etl.custom.geexplorer import GEExplorerImageAnalysisResultParser +from java.io import File # ------------ -# You may want to modify this part. +# Dropbox specific image dataset registration. You may want to modify this part. # ------------ -# type of the new dataset -DATASET_TYPE = "HCS_IMAGE" -# file format of files in a new dataset -FILE_FORMAT = "TIFF" +# type of the new image dataset +IMAGE_DATASET_TYPE = "HCS_IMAGE" +# file format of files in a new image dataset +IMAGE_FILE_FORMAT = "TIFF" +# type of the new analysis dataset +ANALYSIS_DATASET_TYPE = "HCS_IMAGE_ANALYSIS_DATA" +# file format of the analysis dataset file +ANALYSIS_FILE_FORMAT = "CSV" + # space where the plate for which the dataset has been acquired exist -SAMPLE_SPACE = "DEMO" -# number of rile rows and columns on a well -TILE_ROWS_NUMBER = 2 -TILE_COLUMNS_NUMBER = 2 -# available channels, for each channel code and label have to be specified -CHANNELS = [ Channel("DAPI", "DAPI"), Channel("FITC", "FITC"), Channel("CY5", "Cy5") ] +PLATE_SPACE = "DEMO" RECOGNIZED_IMAGES_EXTENSIONS = ["tiff", "tif", "png", "gif", "jpg", "jpeg"] +# --------- +# sample type code of the plate, needed if a new sample is registered automatically +PLATE_TYPE_CODE = "PLATE" +# project and experiment where new plates will be registered +DEFAULT_PROJECT_CODE = "TEST" +DEFAULT_EXPERIMENT_CODE = "SANOFI" +PLATE_GEOMETRY_PROPERTY_CODE = "$PLATE_GEOMETRY" +PLATE_GEOMETRY = "384_WELLS_16X24" + # extracts code of the sample from the directory name -def extract_sample_code(file_basename): - return file_basename.split(".")[0] +def extract_sample_code(incoming_name): + file_basename = extract_file_basename(incoming_name) + #return file_basename.split(".")[0] + code = file_basename[file_basename.find("plates_")+7 : file_basename.rfind("_") ] + if code == "": + code = file_basename + return code -# Creates ImageFileInfo for a given path to an image +def get_tile_coords(tile_num, tile_geometry): + columns = tile_geometry[1] + row = ((tile_num-1) / columns) + 1 + col = ((tile_num-1) % columns) + 1 + return (row, col) + +def get_max_tile_number(image_tokens_list): + max_tile = 0 + for image_tokens in image_tokens_list: + max_tile = max(max_tile, image_tokens.tile) + #print "max_tile", max_tile + return max_tile + +# returns (rows, columns) +def get_tile_geometry(image_tokens_list): + max_tile = get_max_tile_number(image_tokens_list) + if max_tile % 4 == 0 and max_tile != 4: + return (max_tile / 4, 4) + elif max_tile % 3 == 0: + return (max_tile / 3, 3) + elif max_tile % 2 == 0: + return (max_tile / 2, 2) + else: + return (max_tile, 1) + +class ImageTokens: + channel = None + tile = -1 + path = "" + well = "" + +# Creates ImageTokens for a given path to an image # Example file name: A - 1(fld 1 wv Cy5 - Cy5).tif -def create_image_info(path): +def create_image_tokens(path): + image_tokens = ImageTokens() + image_tokens.path = path + basename = os.path.splitext(path)[0] wellText = basename[0:find(basename, "(")] # A - 1 - well = wellText.replace(" - ", "") + image_tokens.well = wellText.replace(" - ", "") + fieldText = basename[find(basename, "fld ")+4 : find(basename, " wv")] try: - fieldNum = int(fieldText) - 1 + image_tokens.tile = int(fieldText) + #print "image_tokens.tile", image_tokens.tile except ValueError: raise Exception("Cannot parse field number from '"+fieldText+"' in '"+basename+"' file name.") - channel = basename[rfind(basename, " - ") + 3 : -1] + + image_tokens.channel = basename[rfind(basename, " - ") + 3 : -1] + return image_tokens - tileRow = (fieldNum / TILE_COLUMNS_NUMBER) + 1 - tileCol = (fieldNum % TILE_COLUMNS_NUMBER) + 1 - img = ImageFileInfo(channel, tileRow, tileCol, path) - img.setWell(well) +# Creates ImageFileInfo for a given path to an image +# Example file name: A - 1(fld 1 wv Cy5 - Cy5).tif +def create_image_info(image_tokens, tile_geometry): + fieldNum = image_tokens.tile - 1 + tileRow = (fieldNum / tile_geometry[1]) + 1 + tileCol = (fieldNum % tile_geometry[1]) + 1 + + img = ImageFileInfo(image_tokens.channel, tileRow, tileCol, image_tokens.path) + img.setWell(image_tokens.well) return img +def parse_image_tokens(dir): + image_tokens_list = [] + dir_path = dir.getPath() + for file in os.listdir(dir_path): + ext = get_file_ext(file) + try: + extIx = RECOGNIZED_IMAGES_EXTENSIONS.index(ext) + # not reached if extension not found + image_tokens = create_image_tokens(file) + image_tokens_list.append(image_tokens) + except ValueError: + pass # extension not recognized + return image_tokens_list + + +def create_image_infos(image_tokens_list, tile_geometry): + images = [] + for image_tokens in image_tokens_list: + image = create_image_info(image_tokens, tile_geometry) + images.append(image) + return images + # ------------ -# You probably do not want to modify this part +# Generic utility # ------------ def rfind(text, pattern): @@ -66,68 +150,177 @@ def ensurePatternFound(ix, file, pattern): if ix == -1: raise Exception("Cannot find '"+pattern+"' pattern in file name '"+file+"'") -def create_dataset(): - basename = extract_file_basename(incoming.getName()) - dataset = Dataset() - - dataset.dataset_type = DATASET_TYPE - dataset.file_format = FILE_FORMAT - dataset.sample_space = SAMPLE_SPACE - dataset.sample_code = extract_sample_code(basename) - - dataset.tile_rows_number = TILE_ROWS_NUMBER - dataset.tile_columns_number = TILE_COLUMNS_NUMBER - - return dataset +def extract_file_basename(filename): + lastDot = filename.rfind(".") + if lastDot != -1: + return filename[0:lastDot] + else: + return filename + +def get_file_ext(file): + return os.path.splitext(file)[1][1:].lower() + +def find_file(incoming_file, expected_ext): + if not incoming_file.isDirectory(): + return None + incoming_path = incoming_file.getPath() + for file in os.listdir(incoming_path): + ext = get_file_ext(file) + if ext.upper() == expected_ext.upper(): + return File(incoming_path, file) + return None -def get_available_channels(): - return CHANNELS +def get_random_string(): + return str(int(random.random()*1000000000)) + +def get_tmp_dir(incoming, label): + dropbox_parent_dir = incoming.getParentFile().getParent() + tmp_dir = File(dropbox_parent_dir, label+".tmp."+get_random_string()) + os.mkdir(tmp_dir.getPath()) + return tmp_dir + +# ------------ +# Generic dataset registration +# ------------ class Dataset: - sample_code = None - sample_space = None - dataset_type = None - file_format = None - # -- imaging specific + sample_code = "" + sample_space = "" + dataset_type = "" + file_format = "" + is_measured = True + parent_dataset_code = "" + +def set_dataset_details(dataset, registration_details): + data_set_info = registration_details.getDataSetInformation() + data_set_info.setSpaceCode(dataset.sample_space) + data_set_info.setSampleCode(dataset.sample_code) + if (dataset.parent_dataset_code): + data_set_info.setParentDataSetCodes([dataset.parent_dataset_code]) + registration_details.setFileFormatType(FileFormatType(dataset.file_format)); + registration_details.setDataSetType(DataSetType(dataset.dataset_type)); + registration_details.setMeasuredData(dataset.is_measured); + return registration_details + +# ------------ +# Image dataset registration +# ------------ + +class ImageDataset: # extends Dataset + tile_num = -1 tile_rows_number = -1 tile_columns_number = -1 + images = None + channels = None -def create_image_infos(dir): - images = [] - dir_path = dir.getPath() - for file in os.listdir(dir_path): - ext = os.path.splitext(file)[1][1:].lower() - try: - extIx = RECOGNIZED_IMAGES_EXTENSIONS.index(ext) - # not reached if extension not found - image = create_image_info(file) - images.append(image) - except ValueError: - pass # extension not recognized - return images +def get_available_channels(images): + channel_codes = {} + for image in images: + channel_codes[image.getChannelCode()] = 1 + channels = [] + for channelCode in channel_codes.keys(): + channels.append(Channel(channelCode, channelCode)) + return channels + +def create_image_dataset(incoming): + dataset = ImageDataset() + + dataset.dataset_type = IMAGE_DATASET_TYPE + dataset.file_format = IMAGE_FILE_FORMAT + dataset.sample_space = PLATE_SPACE + dataset.sample_code = extract_sample_code(incoming.getName()) + dataset.is_measured = True + dataset.parent_dataset_code = None + + image_tokens_list = parse_image_tokens(incoming) + tile_geometry = get_tile_geometry(image_tokens_list) + dataset.images = create_image_infos(image_tokens_list, tile_geometry) + dataset.channels = get_available_channels(dataset.images) + + dataset.tile_rows_number = tile_geometry[0] + dataset.tile_columns_number = tile_geometry[1] + + return dataset + +def create_image_dataset_details(incoming): + image_dataset = create_image_dataset(incoming) + registration_details = factory.createImageRegistrationDetails() + + set_dataset_details(image_dataset, registration_details) -def set_dataset_info(registration_details, dataset): data_set_info = registration_details.getDataSetInformation() - data_set_info.setSpaceCode(dataset.sample_space) - data_set_info.setSampleCode(dataset.sample_code) - registration_details.setFileFormatType(dto.FileFormatType(dataset.file_format)); - registration_details.setDataSetType(dto.DataSetType(dataset.dataset_type)); - registration_details.setMeasuredData(True); - # -- imaging specific - data_set_info.setTileGeometry(dataset.tile_rows_number, dataset.tile_columns_number) - images = create_image_infos(incoming) - data_set_info.setImages(images) - channels = get_available_channels() - data_set_info.setChannels(channels) + data_set_info.setTileGeometry(image_dataset.tile_rows_number, image_dataset.tile_columns_number) + data_set_info.setImages(image_dataset.images) + data_set_info.setChannels(image_dataset.channels) + return registration_details + +# --------------------- + +def create_analysis_dataset(sample_space, sample_code, parent_dataset_code): + dataset = Dataset() -def extract_file_basename(filename): - lastDot = filename.rfind(".") - if lastDot != -1: - return filename[0:lastDot] - else: - return filename - -dataset = create_dataset() -registration_details = factory.createRegistrationDetails() -set_dataset_info(registration_details, dataset) -service.queueDataSetRegistration(incoming, registration_details) + dataset.dataset_type = ANALYSIS_DATASET_TYPE + dataset.file_format = ANALYSIS_FILE_FORMAT + dataset.sample_space = sample_space + dataset.sample_code = sample_code + dataset.is_measured = False + dataset.parent_dataset_code = parent_dataset_code + return dataset + +def create_analysis_dataset_details(sample_space, sample_code, parent_dataset_code): + analysis_registration_details = factory.createRegistrationDetails() + analysis_dataset = create_analysis_dataset(sample_space, sample_code, parent_dataset_code) + set_dataset_details(analysis_dataset, analysis_registration_details) + return analysis_registration_details + +def register_sample_if_necessary(space_code, project_code, experiment_code, sample_code): + openbis = state.getOpenBisService() + sampleIdentifier = SampleIdentifier.create(space_code, sample_code) + if (openbis.tryGetSampleWithExperiment(sampleIdentifier) == None): + sample = NewSample() + sampleType = SampleType() + sampleType.setCode(PLATE_TYPE_CODE) + sample.setSampleType(sampleType) + sample.setIdentifier(sampleIdentifier.toString()) + + property = dto.VocabularyTermEntityProperty(); + vocabularyTerm = dto.VocabularyTerm(); + vocabularyTerm.setCode(PLATE_GEOMETRY); + property.setVocabularyTerm(vocabularyTerm); + propertyType = dto.PropertyType(); + dataType = dto.DataType(); + dataType.setCode(dto.DataTypeCode.CONTROLLEDVOCABULARY); + propertyType.setDataType(dataType); + propertyType.setCode(PLATE_GEOMETRY_PROPERTY_CODE); + property.setPropertyType(propertyType); + sample.setProperties([ property ]) + + sample.setExperimentIdentifier("/"+space_code+"/"+project_code+"/"+experiment_code) + openbis.registerSample(sample, None) + +# --------------------- + +def register_images_and_analysis(incoming): + if not incoming.isDirectory(): + return + analysis_file=find_file(incoming, "xml") + + image_dataset_details = create_image_dataset_details(incoming) + plate_code = image_dataset_details.getDataSetInformation().getSampleCode() + space_code = image_dataset_details.getDataSetInformation().getSpaceCode() + + if analysis_file != None: + dropbox_parent_dir = incoming.getParentFile().getParent() + tmp_analysis_dir = get_tmp_dir(incoming, "image-analysis") + tmp_analysis_file = File(tmp_analysis_dir, analysis_file.getName()) + GEExplorerImageAnalysisResultParser(analysis_file.getPath()).writeCSV(tmp_analysis_file) + + register_sample_if_necessary(space_code, DEFAULT_PROJECT_CODE, DEFAULT_EXPERIMENT_CODE, plate_code) + img_dataset_code = service.queueDataSetRegistration(incoming, image_dataset_details).getCode() + + if analysis_file != None: + analysis_registration_details = create_analysis_dataset_details(space_code, plate_code, img_dataset_code) + service.queueDataSetRegistration(tmp_analysis_file, analysis_registration_details) + + +register_images_and_analysis(incoming)