diff --git a/screening/etc/sanofi-data-set-handler.py b/screening/etc/sanofi-data-set-handler.py
new file mode 100755
index 0000000000000000000000000000000000000000..94a444df3d15ba1ec59673c17f117236dd3391cc
--- /dev/null
+++ b/screening/etc/sanofi-data-set-handler.py
@@ -0,0 +1,547 @@
+#! /usr/bin/env python
+# This is an example Jython dropbox for importing HCS image datasets
+
+import os
+import shutil
+import random
+
+import ch.systemsx.cisd.openbis.generic.shared.basic.dto as dto
+from ch.systemsx.cisd.openbis.generic.shared.basic.dto import SampleType, NewSample
+from ch.systemsx.cisd.openbis.generic.shared.dto.identifier import SampleIdentifier
+from ch.systemsx.cisd.openbis.dss.etl.dto.api.v1 import *
+from ch.systemsx.cisd.openbis.dss.etl.custom.geexplorer import GEExplorerImageAnalysisResultParser
+from java.io import File
+
+from ch.systemsx.cisd.common.fileconverter import FileConverter, Tiff2PngConversionStrategy
+from ch.systemsx.cisd.common.mail import From
+
+from ch.systemsx.cisd.openbis.generic.shared.dto import NewProperty
+
+# ------------
+# Dropbox specific image dataset registration. You may want to modify this part.
+# ------------
+
+""" type of the new image dataset """
+IMAGE_DATASET_TYPE = "HCS_IMAGE"
+""" file format code of files in a new image dataset """
+IMAGE_FILE_FORMAT = "TIFF"
+
+""" type of the new analysis dataset """
+ANALYSIS_DATASET_TYPE = "HCS_IMAGE_ANALYSIS_DATA"
+""" file format of the analysis dataset """
+ANALYSIS_FILE_FORMAT = "CSV"
+
+""" type of the new image overlay dataset """
+OVERLAY_IMAGE_DATASET_TYPE = "HCS_IMAGE_SEGMENTATION_OVERLAY"
+""" file format of the image overlay dataset """
+OVERLAY_IMAGE_FILE_FORMAT = "PNG"
+
+""" space where the plate for which the dataset has been acquired exists """
+PLATE_SPACE = "DEMO"
+
+""" only files with these extensions will be recognized as images """
+RECOGNIZED_IMAGES_EXTENSIONS = ["tiff", "tif", "png", "gif", "jpg", "jpeg"]
+
+""" should thumbnails be generated? """
+GENERATE_THUMBNAILS = True
+""" the maximal width and height of the generated thumbnails """
+MAX_THUMNAIL_WIDTH_AND_HEIGHT = 256
+"""
+the number of threads used for thumbnail generation will be equal to:
+    this constant * number of processor cores
+"""
+ALLOWED_MACHINE_LOAD_DURING_THUMBNAIL_GENERATION = 1.0
+
+""" should all datasets in one experiment use the same channels? """
+STORE_CHANNELS_ON_EXPERIMENT_LEVEL = False
+""" should the original data be stored in the original form or should we pack them into one container? """
+ORIGINAL_DATA_STORAGE_FORMAT = OriginalDataStorageFormat.UNCHANGED
+
+# ---------
+
+""" name of the color which should be treated as transparent in overlays """
+OVERLAYS_TRANSPARENT_COLOR = "black"
+
+""" sample type code of the plate, needed if a new sample is registered automatically """
+PLATE_TYPE_CODE = "PLATE"
+""" project and experiment where new plates will be registered """
+DEFAULT_PROJECT_CODE = "TEST"
+DEFAULT_EXPERIMENT_CODE = "SANOFI"
+PLATE_GEOMETRY_PROPERTY_CODE = "$PLATE_GEOMETRY"
+PLATE_GEOMETRY = "384_WELLS_16X24"
+
+ANALYSIS_RUN_PROPERTY_CODE = "ANALYSIS_RUN"
+
+# ---------
+
+""" extracts the code of the sample from the directory name """
+def extract_sample_code(incoming_name):
+    file_basename = extract_file_basename(incoming_name)
+    code = file_basename.split("_")[1]
+    if code == "":
+        code = file_basename
+    return code
+
+"""
+For a given tile number and tile geometry returns a (row, column) tuple which describes where the tile
+is located on the well.
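+Illustrative example (added for clarity, the numbers are made up): with a (2, 3) tile
+geometry the tiles are numbered row by row, so get_tile_coords(5, (2, 3)) returns (2, 2).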
+"""
+def get_tile_coords(tile_num, tile_geometry):
+    columns = tile_geometry[1]
+    row = ((tile_num - 1) / columns) + 1
+    col = ((tile_num - 1) % columns) + 1
+    return (row, col)
+
+"""
+Parameters:
+    image_tokens_list - list of ImageTokens
+Returns: (rows, columns) tuple describing the matrix of tiles (aka fields or sides) in the well
+"""
+def get_tile_geometry(image_tokens_list):
+    max_tile = get_max_tile_number(image_tokens_list)
+    if max_tile % 4 == 0 and max_tile != 4:
+        return (max_tile / 4, 4)
+    elif max_tile % 3 == 0:
+        return (max_tile / 3, 3)
+    elif max_tile % 2 == 0:
+        return (max_tile / 2, 2)
+    else:
+        return (max_tile, 1)
+
+"""
+Creates ImageTokens for a given image file path by parsing the file name.
+Example file name: A - 1(fld 1 wv Cy5 - Cy5).tif
+Returns:
+    ImageTokens
+"""
+def create_image_tokens(path):
+    image_tokens = ImageTokens()
+    image_tokens.path = path
+
+    basename = os.path.splitext(path)[0]
+
+    wellText = basename[0:find(basename, "(")] # A - 1
+    image_tokens.well = wellText.replace(" - ", "")
+
+    if " wv " in basename:
+        fieldText = basename[find(basename, "fld ") + 4 : find(basename, " wv")]
+        image_tokens.channel = basename[rfind(basename, " - ") + 3 :-1]
+    else:
+        fieldText = basename[find(basename, "fld ") + 4 : find(basename, ")")]
+        image_tokens.channel = "DEFAULT"
+
+    try:
+        image_tokens.tile = int(fieldText)
+        #print "image_tokens.tile", image_tokens.tile
+    except ValueError:
+        raise Exception("Cannot parse field number from '" + fieldText + "' in '" + basename + "' file name.")
+
+    return image_tokens
+
+# ------------
+# END of the part which you will probably need to modify
+# ------------
+
+# ------------
+# Generic utility
+# ------------
+
+"""
+Finds the first occurrence of the pattern from the right.
+Throws an exception if the pattern cannot be found.
+"""
+def rfind(text, pattern):
+    ix = text.rfind(pattern)
+    ensurePatternFound(ix, text, pattern)
+    return ix
+
+"""
+Finds the first occurrence of the pattern from the left.
+Throws an exception if the pattern cannot be found.
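+Illustrative example: find("A - 1(fld 1 wv Cy5 - Cy5)", "fld ") returns 6, the index
+of the field token in the file names parsed by this dropbox.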
+"""
+def find(text, pattern):
+    ix = text.find(pattern)
+    ensurePatternFound(ix, text, pattern)
+    return ix
+
+def ensurePatternFound(ix, file, pattern):
+    if ix == -1:
+        raise Exception("Cannot find '" + pattern + "' pattern in file name '" + file + "'")
+
+""" Returns: name of the file without the extension """
+def extract_file_basename(filename):
+    base_with_ext = os.path.split(filename)[1]
+    if os.path.isfile(base_with_ext):
+        return os.path.splitext(base_with_ext)[0]
+    else:
+        return base_with_ext
+
+""" Returns: extension of the file """
+def get_file_ext(file):
+    return os.path.splitext(file)[1][1:].lower()
+
+""" Returns: java.io.File - first file with the specified extension or None if no file matches """
+def find_file_by_ext(incoming_file, expected_ext):
+    if not incoming_file.isDirectory():
+        return None
+    incoming_path = incoming_file.getPath()
+    for file in os.listdir(incoming_path):
+        ext = get_file_ext(file)
+        if ext.upper() == expected_ext.upper():
+            return File(incoming_path, file)
+    return None
+
+""" Returns: java.io.File - subdirectory which contains the specified marker in the name """
+def find_dir(incoming_file, dir_name_marker):
+    if not incoming_file.isDirectory():
+        return None
+    incoming_path = incoming_file.getPath()
+    for file in os.listdir(incoming_path):
+        if dir_name_marker.upper() in file.upper():
+            return File(incoming_path, file)
+    return None
+
+# ------------
+# Image dataset registration
+# ------------
+
+"""
+Auxiliary function to extract all channel codes used by the specified images.
+The channel label will be equal to the channel code.
+Parameters:
+    images - list of ImageFileInfo
+Returns:
+    list of Channel
+"""
+def get_available_channels(images):
+    channel_codes = {}
+    for image in images:
+        channel_codes[image.getChannelCode()] = 1
+    channels = []
+    for channelCode in channel_codes.keys():
+        channels.append(Channel(channelCode, channelCode))
+    return channels
+
+"""
+Parameters:
+    dataset - BasicDataSetInformation
+    registration_details - DataSetRegistrationDetails
+"""
+def set_dataset_details(dataset, registration_details):
+    registration_details.setDataSetInformation(dataset)
+    registration_details.setFileFormatType(dataset.getFileFormatTypeCode())
+    registration_details.setDataSetType(dataset.getDataSetType())
+    registration_details.setMeasuredData(dataset.isMeasured())
+
+def set_image_dataset_storage_config(image_dataset):
+    config = ImageStorageConfiguraton.createDefault()
+    config.setStoreChannelsOnExperimentLevel(STORE_CHANNELS_ON_EXPERIMENT_LEVEL)
+    config.setOriginalDataStorageFormat(ORIGINAL_DATA_STORAGE_FORMAT)
+    if GENERATE_THUMBNAILS:
+        thumbnailsStorageFormat = ThumbnailsStorageFormat()
+        thumbnailsStorageFormat.setAllowedMachineLoadDuringGeneration(ALLOWED_MACHINE_LOAD_DURING_THUMBNAIL_GENERATION)
+        thumbnailsStorageFormat.setMaxWidth(MAX_THUMNAIL_WIDTH_AND_HEIGHT)
+        thumbnailsStorageFormat.setMaxHeight(MAX_THUMNAIL_WIDTH_AND_HEIGHT)
+        config.setThumbnailsStorageFormat(thumbnailsStorageFormat)
+    image_dataset.setImageStorageConfiguraton(config)
+
+"""
+Parameters:
+    incoming - java.io.File, folder with the images
+Returns:
+    DataSetRegistrationDetails
+"""
+def create_image_dataset_details(incoming):
+    registration_details = factory.createImageRegistrationDetails()
+    image_dataset = registration_details.getDataSetInformation()
+    set_image_dataset(incoming, image_dataset)
+
+    set_image_dataset_storage_config(image_dataset)
+    set_dataset_details(image_dataset, registration_details)
+    return registration_details
+
+
+""" Returns: integer - maximal tile number """
+def get_max_tile_number(image_tokens_list):
+    max_tile = 0
+    for image_tokens in image_tokens_list:
+        max_tile = max(max_tile, image_tokens.tile)
+    return max_tile
+
+""" Auxiliary structure to store tokens of the image file name. """
+class ImageTokens:
+    # channel code
+    channel = None
+    # tile number
+    tile = -1
+    # path to the image
+    path = ""
+    # well code, e.g. A1
+    well = ""
+
+"""
+Creates ImageFileInfo for a given ImageTokens.
+Converts the tile number to coordinates on the 'well matrix'.
+Example of the accepted file name: A - 1(fld 1 wv Cy5 - Cy5).tif
+Returns:
+    ImageFileInfo
+"""
+def create_image_info(image_tokens, tile_geometry):
+    tileCoords = get_tile_coords(image_tokens.tile, tile_geometry)
+    img = ImageFileInfo(image_tokens.channel, tileCoords[0], tileCoords[1], image_tokens.path)
+    img.setWell(image_tokens.well)
+    return img
+
+"""
+Tokenizes file names of all images in the directory.
+Returns:
+    list of ImageTokens
+"""
+def parse_image_tokens(dir, recognized_image_extensions):
+    image_tokens_list = []
+    dir_path = dir.getPath()
+    for file in os.listdir(dir_path):
+        ext = get_file_ext(file)
+        try:
+            extIx = recognized_image_extensions.index(ext)
+            # not reached if extension not found
+            image_tokens = create_image_tokens(file)
+            #print "tile", image_tokens.tile, "path", image_tokens.path, "well", image_tokens.well
+            image_tokens_list.append(image_tokens)
+        except ValueError:
+            pass # extension not recognized
+    return image_tokens_list
+
+"""
+Parameters:
+- image_tokens_list - list of ImageTokens for each image
+- tile_geometry - (rows, columns) tuple describing the matrix of tiles (aka fields or sides) in the well
+Returns:
+    list of ImageFileInfo
+"""
+def create_image_infos(image_tokens_list, tile_geometry):
+    images = []
+    for image_tokens in image_tokens_list:
+        image = create_image_info(image_tokens, tile_geometry)
+        images.append(image)
+    return images
+
+# ---------------------
+
+"""
+Extracts all images from the incoming directory.
+Parameters:
+    incoming - java.io.File, folder with images
+    dataset - ImageDataSetInformation where the result will be stored
+"""
+def set_image_dataset(incoming, dataset):
+    dataset.setDatasetTypeCode(IMAGE_DATASET_TYPE)
+    dataset.setFileFormatCode(IMAGE_FILE_FORMAT)
+
+    sample_code = extract_sample_code(incoming.getName())
+    dataset.setSample(PLATE_SPACE, sample_code)
+    dataset.setMeasured(True)
+
+    image_tokens_list = parse_image_tokens(incoming, RECOGNIZED_IMAGES_EXTENSIONS)
+    tile_geometry = get_tile_geometry(image_tokens_list)
+    images = create_image_infos(image_tokens_list, tile_geometry)
+    channels = get_available_channels(images)
+
+    dataset.setImages(images)
+    dataset.setChannels(channels)
+    dataset.setTileGeometry(tile_geometry[0], tile_geometry[1])
+
+    return dataset
+
+"""
+Extracts all overlay images from the overlays_dir directory.
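+Overlay file names are expected to follow the same naming convention as the raw images,
+e.g. (illustrative name) "A - 1(fld 1 wv Cy5 - Cy5).png" after the TIFF to PNG conversion.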
+Parameters:
+    overlays_dir - java.io.File, folder with the overlay images
+    image_dataset - ImageDataSetInformation, image dataset to which the overlay dataset belongs
+    img_dataset_code - string, code of the image dataset to which the overlay dataset belongs
+    overlay_dataset - ImageDataSetInformation where the result will be stored
+    extension - accepted image file extension, or None to accept all recognized extensions
+"""
+def set_overlay_dataset(overlays_dir, image_dataset, img_dataset_code, overlay_dataset, extension):
+    overlay_dataset.setDatasetTypeCode(OVERLAY_IMAGE_DATASET_TYPE)
+    overlay_dataset.setFileFormatCode(OVERLAY_IMAGE_FILE_FORMAT)
+
+    overlay_dataset.setSample(image_dataset.getSpaceCode(), image_dataset.getSampleCode())
+    overlay_dataset.setMeasured(False)
+    overlay_dataset.setParentDatasetCode(img_dataset_code)
+
+    if extension == None:
+        recognized_image_exts = RECOGNIZED_IMAGES_EXTENSIONS
+    else:
+        recognized_image_exts = [ extension ]
+    image_tokens_list = parse_image_tokens(overlays_dir, recognized_image_exts)
+    tile_geometry = (image_dataset.getTileRowsNumber(), image_dataset.getTileColumnsNumber())
+    images = create_image_infos(image_tokens_list, tile_geometry)
+    channels = get_available_channels(images)
+
+    overlay_dataset.setImages(images)
+    overlay_dataset.setChannels(channels)
+    overlay_dataset.setTileGeometry(tile_geometry[0], tile_geometry[1])
+
+"""
+Creates registration details of the image overlays dataset.
+Parameters:
+    overlays_dir - java.io.File, folder with the overlay images
+    image_dataset - ImageDataset, image dataset to which the overlay dataset belongs
+    img_dataset_code - string, code of the image dataset to which the overlay dataset belongs
+    extension - accepted image file extension, or None to accept all recognized extensions
+Returns:
+    DataSetRegistrationDetails
+"""
+def create_overlay_dataset_details(overlays_dir, image_dataset, img_dataset_code, extension):
+    overlay_dataset_details = factory.createImageRegistrationDetails()
+    overlay_dataset = overlay_dataset_details.getDataSetInformation()
+    set_overlay_dataset(overlays_dir, image_dataset, img_dataset_code, overlay_dataset, extension)
+    set_dataset_details(overlay_dataset, overlay_dataset_details)
+    set_image_dataset_storage_config(overlay_dataset)
+
+    config = overlay_dataset.getImageStorageConfiguraton()
+    # channels will be connected to the dataset
+    config.setStoreChannelsOnExperimentLevel(False)
+    if GENERATE_THUMBNAILS:
+        # overlay thumbnails should be generated with higher quality
+        thumbnailsStorageFormat = config.getThumbnailsStorageFormat()
+        thumbnailsStorageFormat.setHighQuality(True)
+        config.setThumbnailsStorageFormat(thumbnailsStorageFormat)
+
+    overlay_dataset.setImageStorageConfiguraton(config)
+    return overlay_dataset_details
+
+# ---------------------
+
+"""
+Creates the analysis dataset description.
+The dataset will be connected to the specified sample and parent dataset.
+Parameters:
+    dataset - BasicDataSetInformation where the result will be stored
+"""
+def set_analysis_dataset(sample_space, sample_code, parent_dataset_code, dataset):
+    dataset.setDatasetTypeCode(ANALYSIS_DATASET_TYPE)
+    dataset.setFileFormatCode(ANALYSIS_FILE_FORMAT)
+    dataset.setSample(sample_space, sample_code)
+    dataset.setMeasured(False)
+    dataset.setParentDatasetCode(parent_dataset_code)
+
+"""
+Creates registration details of the analysis dataset.
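+The analysis_run argument is stored in the ANALYSIS_RUN property of the new dataset;
+in this dropbox it is derived from the name of the incoming analysis .xml file
+(see register_images_with_overlays_and_analysis below).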
+Returns:
+    DataSetRegistrationDetails
+"""
+def create_analysis_dataset_details(sample_space, sample_code, parent_dataset_code, analysis_run):
+    registration_details = factory.createBasicRegistrationDetails()
+    dataset = registration_details.getDataSetInformation()
+    set_analysis_dataset(sample_space, sample_code, parent_dataset_code, dataset)
+
+    analysis_run_property = NewProperty(ANALYSIS_RUN_PROPERTY_CODE, analysis_run)
+    dataset.setDataSetProperties([ analysis_run_property ])
+
+    set_dataset_details(dataset, registration_details)
+    return registration_details
+
+""" registers the sample if it does not already exist """
+def register_sample_if_necessary(space_code, project_code, experiment_code, sample_code):
+    openbis = state.getOpenBisService()
+    sampleIdentifier = SampleIdentifier.create(space_code, sample_code)
+    if (openbis.tryGetSampleWithExperiment(sampleIdentifier) == None):
+        sample = NewSample()
+        sampleType = SampleType()
+        sampleType.setCode(PLATE_TYPE_CODE)
+        sample.setSampleType(sampleType)
+        sample.setIdentifier(sampleIdentifier.toString())
+
+        property = dto.VocabularyTermEntityProperty();
+        vocabularyTerm = dto.VocabularyTerm();
+        vocabularyTerm.setCode(PLATE_GEOMETRY);
+        property.setVocabularyTerm(vocabularyTerm);
+        propertyType = dto.PropertyType();
+        dataType = dto.DataType();
+        dataType.setCode(dto.DataTypeCode.CONTROLLEDVOCABULARY);
+        propertyType.setDataType(dataType);
+        propertyType.setCode(PLATE_GEOMETRY_PROPERTY_CODE);
+        property.setPropertyType(propertyType);
+        sample.setProperties([ property ])
+
+        sample.setExperimentIdentifier("/" + space_code + "/" + project_code + "/" + experiment_code)
+        openbis.registerSample(sample, None)
+
+# ---------------------
+
+def debug(*msg):
+    print "".join(msg)
+
+def convert_to_png(dir, transparent_color):
+    delete_original_files = True
+    strategy = Tiff2PngConversionStrategy(transparent_color, 0, delete_original_files)
+    # Uses #cores * machineLoad threads for the conversion, but not more than maxThreads
+    machineLoad = ALLOWED_MACHINE_LOAD_DURING_THUMBNAIL_GENERATION
+    maxThreads = 100
+    errorMsg = FileConverter.performConversion(File(dir), strategy, machineLoad, maxThreads)
+    if errorMsg != None:
+        raise Exception("Error", errorMsg)
+
+def notify(plate_code):
+    content = "Dear Mr./Mrs.\n"
+    hostname = "http://bwdl27.bw.f2.enterprise:8080/openbis"
+    plate_link = hostname+"?viewMode=simple#entity=SAMPLE&action=SEARCH&code="+plate_code+"&sample_type=PLATE"
+    content += "Data for plate : " + plate_code + " has been registered : \n" + plate_link + "\n"
+    content += "\n"
+    content += "Have a nice day!\n"
+    content += "   openBIS\n"
+    replyAddress = "Matthew.Smicker@sanofi-aventis.com"
+    fromAddress = From("openbis@sanofi-aventis.com")
+    recipients = [ "Matthew.Smicker@sanofi-aventis.com" ]
+    state.mailClient.sendMessage("openBIS: registration finished - " + plate_code, content, replyAddress, fromAddress, recipients)
+
+"""
+Marker used to recognize that a subdirectory of the incoming dataset directory contains overlay images.
+This text has to appear in the subdirectory name.
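+For example, a subdirectory named "PLATE1_ROITiff" (an illustrative name) inside the
+incoming plate folder would be picked up as the overlay images directory by find_dir.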
+"""
+OVERLAYS_DIR_PATTERN = "_ROITiff"
+
+def register_images_with_overlays_and_analysis(incoming):
+    if not incoming.isDirectory():
+        return
+
+    tr = service.transaction(incoming, factory)
+
+    image_dataset_details = create_image_dataset_details(incoming)
+    plate_code = image_dataset_details.getDataSetInformation().getSampleCode()
+    space_code = image_dataset_details.getDataSetInformation().getSpaceCode()
+    register_sample_if_necessary(space_code, DEFAULT_PROJECT_CODE, DEFAULT_EXPERIMENT_CODE, plate_code)
+
+    # create the image data set and put everything in it initially
+    image_data_set = tr.createNewDataSet(image_dataset_details)
+    image_data_set_folder = tr.moveFile(incoming.getPath(), image_data_set)
+    img_dataset_code = image_data_set.getDataSetCode()
+
+    # move overlays folder
+    overlays_dir = find_dir(File(image_data_set_folder), OVERLAYS_DIR_PATTERN)
+    if overlays_dir != None:
+        tr_overlays = service.transaction(overlays_dir, factory)
+        convert_to_png(overlays_dir.getPath(), OVERLAYS_TRANSPARENT_COLOR)
+        overlay_dataset_details = create_overlay_dataset_details(overlays_dir,
+                image_dataset_details.getDataSetInformation(), img_dataset_code, "png")
+        overlays_data_set = tr_overlays.createNewDataSet(overlay_dataset_details)
+        tr_overlays.moveFile(overlays_dir.getPath(), overlays_data_set, "overlays")
+        tr_overlays.commit()
+
+    # transform and move analysis file
+    analysis_file = find_file_by_ext(File(image_data_set_folder), "xml")
+    if analysis_file != None:
+        tr_analysis = service.transaction(analysis_file, factory)
+        analysis_run = extract_file_basename(analysis_file.getName())
+        analysis_registration_details = create_analysis_dataset_details(
+                space_code, plate_code, img_dataset_code, analysis_run)
+        analysis_data_set = tr_analysis.createNewDataSet(analysis_registration_details)
+        analysis_data_set_file = tr_analysis.createNewFile(analysis_data_set, analysis_file.getName())
+        GEExplorerImageAnalysisResultParser(analysis_file.getPath()).writeCSV(File(analysis_data_set_file))
+        tr_analysis.commit()
+
+    service.commit()
+    notify(plate_code)
+
+register_images_with_overlays_and_analysis(incoming)
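+
+# ---------------------
+# Illustrative sketch of the expected incoming structure (the concrete folder and
+# file names below are made up; only the naming patterns matter):
+#
+#   incoming/20110203_PLATE1/                  <- plate code "PLATE1" (second "_" token)
+#     A - 1(fld 1 wv Cy5 - Cy5).tif            <- raw image: well A1, tile 1, channel Cy5
+#     A - 1(fld 2 wv Cy5 - Cy5).tif
+#     PLATE1_ROITiff/                          <- segmentation overlays, converted to PNG
+#       A - 1(fld 1 wv Cy5 - Cy5).tif
+#     imageAnalysisResults.xml                 <- GE Explorer results, stored as a CSV dataset
+#
+# Such a folder yields one HCS_IMAGE dataset, one HCS_IMAGE_SEGMENTATION_OVERLAY dataset
+# and one HCS_IMAGE_ANALYSIS_DATA dataset, all attached to plate PLATE1 in space DEMO.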