Skip to content
Snippets Groups Projects
data-set-handler.py 20.5 KiB
Newer Older
  • Learn to ignore specific revisions
  • #! /usr/bin/env python
    # This is an example Jython dropbox for importing HCS image datasets
    
    import os
    
    import shutil
    
    import ch.systemsx.cisd.openbis.generic.shared.basic.dto as dto
    
    from ch.systemsx.cisd.openbis.generic.shared.basic.dto import SampleType, NewSample
    
    from ch.systemsx.cisd.openbis.generic.shared.dto.identifier import SampleIdentifier
    from ch.systemsx.cisd.openbis.dss.etl.dto.api.v1 import *
    
    from ch.systemsx.cisd.openbis.dss.etl.custom.geexplorer import GEExplorerImageAnalysisResultParser
    from java.io import File
    
    from ch.systemsx.cisd.common.fileconverter import FileConverter, Tiff2PngConversionStrategy
    from ch.systemsx.cisd.common.mail import From
    
    
    from ch.systemsx.cisd.openbis.generic.shared.dto import NewProperty
    
    
    # ------------
    # Dropbox specific image dataset registration. You may want to modify this part.
    # ------------
    
    """ type of the new image dataset """
    IMAGE_DATASET_TYPE = "HCS_IMAGE"
    """ file format code of files in a new image dataset """
    IMAGE_FILE_FORMAT = "TIFF"
    
    """ type of the new analysis dataset """
    ANALYSIS_DATASET_TYPE = "HCS_IMAGE_ANALYSIS_DATA"
    """ file format of the analysis dataset """
    ANALYSIS_FILE_FORMAT = "CSV"
    
    """ type of the new image overlay dataset """
    OVERLAY_IMAGE_DATASET_TYPE = "HCS_IMAGE_SEGMENTATION_OVERLAY"
    """ file format of the image overlay dataset """
    OVERLAY_IMAGE_FILE_FORMAT = "PNG"
    
    """ space where the plate for which the dataset has been acquired exist """
    PLATE_SPACE = "DEMO"
    
    """ only files with these extensions will be recognized as images """
    RECOGNIZED_IMAGES_EXTENSIONS = ["tiff", "tif", "png", "gif", "jpg", "jpeg"]
    
    
    tpylak's avatar
    tpylak committed
    """ should thumbnails be generated? """
    GENERATE_THUMBNAILS = True
    
    """ the maximal width and height of the generated thumbnails """
    MAX_THUMNAIL_WIDTH_AND_HEIGHT = 256
    """ 
    number of threads that are used for thumbnail generation will be equal to:
       this constant * number of processor cores 
    """
    ALLOWED_MACHINE_LOAD_DURING_THUMBNAIL_GENERATION = 1.0
    
    
    tpylak's avatar
    tpylak committed
    """ should all dataset in one experiment use the same channels? """
    
    STORE_CHANNELS_ON_EXPERIMENT_LEVEL = False
    
    tpylak's avatar
    tpylak committed
    """ should the original data be stored in the original form or should we pack them into one container? """
    ORIGINAL_DATA_STORAGE_FORMAT = OriginalDataStorageFormat.UNCHANGED
    
    
    """ name of the color which should be treated as transparent in overlays """ 
    OVERLAYS_TRANSPARENT_COLOR = "black"
    
    
    """ sample type code of the plate, needed if a new sample is registered automatically """
    PLATE_TYPE_CODE = "PLATE"
    """ project and experiment where new plates will be registered """
    DEFAULT_PROJECT_CODE = "TEST"
    DEFAULT_EXPERIMENT_CODE = "SANOFI"
    PLATE_GEOMETRY_PROPERTY_CODE = "$PLATE_GEOMETRY"
    PLATE_GEOMETRY = "384_WELLS_16X24"
    
    
    ANALYSIS_RUN_PROPERTY_CODE = "ANALYSIS_RUN"
    
    
    # ---------
    
    """ extracts code of the sample from the directory name """
    def extract_sample_code(incoming_name):
        file_basename = extract_file_basename(incoming_name)
    
        code = file_basename.split("_")[1]
    
        if code == "":
            code = file_basename
        return code
    
    """ 
    For a given tile number and tiles geometry returns a (x,y) tuple which describes where the tile
    is located on the well.
    """
    def get_tile_coords(tile_num, tile_geometry):
        columns = tile_geometry[1]
        row = ((tile_num - 1) / columns) + 1
        col = ((tile_num - 1) % columns) + 1
        return (row, col)
    
    """ 
    Parameters:
        image_tokens_list - list of ImageTokens
    Returns:  (rows, columns) tuple describing the matrix of tiles (aka fields or sides) in the well  
    """
    def get_tile_geometry(image_tokens_list):
        max_tile = get_max_tile_number(image_tokens_list)
        if max_tile % 4 == 0 and max_tile != 4:
            return (max_tile / 4, 4)
        elif max_tile % 3 == 0:
            return (max_tile / 3, 3)
        elif max_tile % 2 == 0:
            return (max_tile / 2, 2)
        else:
            return (max_tile, 1)
    
    """
    Creates ImageFileInfo for a given ImageTokens.
    Converts tile number to coordinates on the 'well matrix'.
    Example file name: A - 1(fld 1 wv Cy5 - Cy5).tif
    Returns:
        ImageTokens
    """
    def create_image_tokens(path):
        image_tokens = ImageTokens()
        image_tokens.path = path
    
        basename = os.path.splitext(path)[0]
    
        wellText = basename[0:find(basename, "(")] # A - 1
        image_tokens.well = wellText.replace(" - ", "")
        
    
    tpylak's avatar
    tpylak committed
        if " wv " in basename:
            fieldText = basename[find(basename, "fld ") + 4 : find(basename, " wv")]
            image_tokens.channel = basename[rfind(basename, " - ") + 3 :-1]
        else:
            fieldText = basename[find(basename, "fld ") + 4 : find(basename, ")")]
            image_tokens.channel = "DEFAULT"
        
    
        try:
            image_tokens.tile = int(fieldText)
            #print "image_tokens.tile", image_tokens.tile
        except ValueError:
            raise Exception("Cannot parse field number from '" + fieldText + "' in '" + basename + "' file name.")
    
        return image_tokens
    
    # ------------
    # END of the part which you will probably need to modify
    # ------------
    
    
    # ------------
    # Generic utility
    # ------------
    
    """ 
    Finds first occurence of the patter from the right.
    Throws exception if the pattern cannot be found.
    """
    def rfind(text, pattern):
        ix = text.rfind(pattern)
        ensurePatternFound(ix, text, pattern)
        return ix
    
    """ 
    Finds first occurence of the patter from the left. 
    Throws exception if the pattern cannot be found.
    """
    def find(text, pattern):
        ix = text.find(pattern)
        ensurePatternFound(ix, text, pattern)
        return ix
    
    def ensurePatternFound(ix, file, pattern):
        if ix == -1:
            raise Exception("Cannot find '" + pattern + "' pattern in file name '" + file + "'")    
    
    """ Returns: name of the file without the extension """
    def extract_file_basename(filename):
    
        base_with_ext = os.path.split(filename)[1]
        if os.path.isfile(base_with_ext) :
            return os.path.splitext(base_with_ext)[0]
    
            return base_with_ext
    
    
    """ Returns: extension of the file """
    def get_file_ext(file):
        return os.path.splitext(file)[1][1:].lower()
    
    """ Returns: java.io.File - first file with the specified extension or None if no file matches """
    def find_file_by_ext(incoming_file, expected_ext):
        if not incoming_file.isDirectory():
            return None
        incoming_path = incoming_file.getPath()
        for file in os.listdir(incoming_path):
            ext = get_file_ext(file)
            if ext.upper() == expected_ext.upper():
                return File(incoming_path, file)
        return None
    
    """ Returns: java.io.File - subdirectory which contains the specified marker in the name """
    def find_dir(incoming_file, dir_name_marker):
        if not incoming_file.isDirectory():
            return None
        incoming_path = incoming_file.getPath()
        for file in os.listdir(incoming_path):
            if dir_name_marker.upper() in file.upper():
                return File(incoming_path, file)
        return None
    
    # ------------
    # Image dataset registration
    # ------------
    
    """
    Auxiliary function to extract all channel codes used by specified images.
    The channel label will be equal to channel code.
    Parameters:
        images - list of ImageFileInfo
    Returns: 
        list of Channel
    """
    def get_available_channels(images):
        channel_codes = {}
        for image in images:
            channel_codes[image.getChannelCode()] = 1
        channels = []
        for channelCode in channel_codes.keys():
            channels.append(Channel(channelCode, channelCode))
        return channels
    
    """
    Parameters:
    
        dataset - BasicDataSetInformation
        registration_details - DataSetRegistrationDetails
    """
    def set_dataset_details(dataset, registration_details):
        registration_details.setDataSetInformation(dataset)
        registration_details.setFileFormatType(dataset.getFileFormatTypeCode())
        registration_details.setDataSetType(dataset.getDataSetType())
        registration_details.setMeasuredData(dataset.isMeasured())
    
    
    tpylak's avatar
    tpylak committed
    def set_image_dataset_storage_config(image_dataset):
        config = ImageStorageConfiguraton.createDefault()
        config.setStoreChannelsOnExperimentLevel(STORE_CHANNELS_ON_EXPERIMENT_LEVEL)
        config.setOriginalDataStorageFormat(ORIGINAL_DATA_STORAGE_FORMAT)
        if GENERATE_THUMBNAILS:
    
            thumbnailsStorageFormat = ThumbnailsStorageFormat()
            thumbnailsStorageFormat.setAllowedMachineLoadDuringGeneration(ALLOWED_MACHINE_LOAD_DURING_THUMBNAIL_GENERATION)
            thumbnailsStorageFormat.setMaxWidth(MAX_THUMNAIL_WIDTH_AND_HEIGHT)
            thumbnailsStorageFormat.setMaxHeight(MAX_THUMNAIL_WIDTH_AND_HEIGHT)
            config.setThumbnailsStorageFormat(thumbnailsStorageFormat)
    
    tpylak's avatar
    tpylak committed
        image_dataset.setImageStorageConfiguraton(config)
        
    
    """
    Parameters:
        dataset - BasicDataSetInformation
    
    Returns: 
        DataSetRegistrationDetails
    """
    
    def create_image_dataset_details(incoming):
    
        registration_details = factory.createImageRegistrationDetails()
    
        image_dataset = registration_details.getDataSetInformation()
        set_image_dataset(incoming, image_dataset)
    
    tpylak's avatar
    tpylak committed
        
        set_image_dataset_storage_config(image_dataset)
    
        set_dataset_details(image_dataset, registration_details)
    
        return registration_details
    
    """ Returns: integer - maximal tile number """
    
    def get_max_tile_number(image_tokens_list):
        max_tile = 0
        for image_tokens in image_tokens_list:
            max_tile = max(max_tile, image_tokens.tile)
        return max_tile
    
    
    """ Auxiliary structure to store tokens of the image file name.  """
    
        # channel code
    
        # tile number
    
        # path to the image
    
        # well code, e.g. A1
    
    """ 
    Creates ImageFileInfo for a given path to an image
    
    Example of the accepted file name: A - 1(fld 1 wv Cy5 - Cy5).tif
    Returns:
       ImageFileInfo 
    
    def create_image_info(image_tokens, tile_geometry):
    
        tileCoords = get_tile_coords(image_tokens.tile, tile_geometry)
        img = ImageFileInfo(image_tokens.channel, tileCoords[0], tileCoords[1], image_tokens.path)
    
        img.setWell(image_tokens.well)
    
    """
    Tokenizes file names of all images in the directory.
    Returns: 
      list of ImageTokens
    """
    
    def parse_image_tokens(dir, recognized_image_extensions):
    
        image_tokens_list = []
        dir_path = dir.getPath()
        for file in os.listdir(dir_path):
            ext = get_file_ext(file)
            try:
    
                extIx = recognized_image_extensions.index(ext)
    
                # not reached if extension not found
                image_tokens = create_image_tokens(file)
    
                #print "tile", image_tokens.tile, "path", image_tokens.path, "well", image_tokens.well
    
                image_tokens_list.append(image_tokens)    
            except ValueError:
                pass # extension not recognized    
        return image_tokens_list
    
    
    """
    Parameters:
    - image_tokens_list - list of ImageTokens for each image
    - tile_geometry - (rows, columns) tuple describing the matrix of tiles (aka fields or sides) in the well  
    Returns: 
      list of ImageFileInfo
    """    
    
    def create_image_infos(image_tokens_list, tile_geometry):
        images = []
        for image_tokens in image_tokens_list:
            image = create_image_info(image_tokens, tile_geometry)
            images.append(image)    
        return images
    
    
    # ---------------------
    
    """
    Extracts all images from the incoming directory.
    Parameters:
        incoming - java.io.File, folder with images
    
        dataset - ImageDataSetInformation where the result will be stored
    
    def set_image_dataset(incoming, dataset):
        dataset.setDatasetTypeCode(IMAGE_DATASET_TYPE)
        dataset.setFileFormatCode(IMAGE_FILE_FORMAT)
    
        sample_code = extract_sample_code(incoming.getName())
        dataset.setSample(PLATE_SPACE, sample_code)
        dataset.setMeasured(True)
    
        image_tokens_list = parse_image_tokens(incoming, RECOGNIZED_IMAGES_EXTENSIONS)
    
        tile_geometry = get_tile_geometry(image_tokens_list)
    
        images = create_image_infos(image_tokens_list, tile_geometry)
        channels = get_available_channels(images)
    
        dataset.setImages(images)
        dataset.setChannels(channels)
        dataset.setTileGeometry(tile_geometry[0], tile_geometry[1])
    
    """
    Extracts all overlay images from the overlays_dir directory.
    Parameters:
        overlays_dir - java.io.File, folder with 
    
        image_dataset - ImageDataSetInformation, image dataset to which the overlay dataset belongs
    
        img_dataset_code - string, code of the  image dataset to which the overlay dataset belongs
    
        overlay_dataset - ImageDataSetInformation where the result will be stored
    
        extension - accepted image file extensions
    
    def set_overlay_dataset(overlays_dir, image_dataset, img_dataset_code, overlay_dataset, extension):
    
        overlay_dataset.setDatasetTypeCode(OVERLAY_IMAGE_DATASET_TYPE)
        overlay_dataset.setFileFormatCode(OVERLAY_IMAGE_FILE_FORMAT)
    
        overlay_dataset.setSample(image_dataset.getSpaceCode(), image_dataset.getSampleCode())
        overlay_dataset.setMeasured(False)
        overlay_dataset.setParentDatasetCode(img_dataset_code)
    
        if extension == None:
            recognized_image_exts = RECOGNIZED_IMAGES_EXTENSIONS
        else:
            recognized_image_exts = [ extension ]
        image_tokens_list = parse_image_tokens(overlays_dir, recognized_image_exts)
    
        tile_geometry = (image_dataset.getTileRowsNumber(), image_dataset.getTileColumnsNumber())
        images = create_image_infos(image_tokens_list, tile_geometry)
        channels = get_available_channels(images)
    
        overlay_dataset.setImages(images)
        overlay_dataset.setChannels(channels)
        overlay_dataset.setTileGeometry(tile_geometry[0], tile_geometry[1])
    
    """
    Creates registration details of the image overlays dataset.
    Parameters:
        overlays_dir - java.io.File, folder with 
        image_dataset - ImageDataset, image dataset to which the overlay dataset belongs
        img_dataset_code - string, code of the  image dataset to which the overlay dataset belongs
    Returns:
        DataSetRegistrationDetails
    """
    
    def create_overlay_dataset_details(overlays_dir, image_dataset, img_dataset_code, extension):
    
        overlay_dataset_details = factory.createImageRegistrationDetails()
        overlay_dataset = overlay_dataset_details.getDataSetInformation()
    
        set_overlay_dataset(overlays_dir, image_dataset, img_dataset_code, overlay_dataset, extension)
    
        set_dataset_details(overlay_dataset, overlay_dataset_details)
    
        set_image_dataset_storage_config(overlay_dataset)
    
        config = overlay_dataset.getImageStorageConfiguraton()
    
        # channels will be connected to the dataset
    
        config.setStoreChannelsOnExperimentLevel(False)
    
        if GENERATE_THUMBNAILS:
            # overlay thumbnails should be generated with higher quality
            thumbnailsStorageFormat = config.getThumbnailsStorageFormat()
            thumbnailsStorageFormat.setHighQuality(True);
            config.setThumbnailsStorageFormat(thumbnailsStorageFormat)
        
    
        overlay_dataset.setImageStorageConfiguraton(config)
    
        return overlay_dataset_details
    
    """
    Creates the analysis dataset description. 
    The dataset will be connected to the specified sample and parent dataset.
    
    Parameters:
        dataset - BasicDataSetInformation where the result will be stored
    
    def set_analysis_dataset(sample_space, sample_code, parent_dataset_code, dataset):
        dataset.setDatasetTypeCode(ANALYSIS_DATASET_TYPE)
        dataset.setFileFormatCode(ANALYSIS_FILE_FORMAT)
        dataset.setSample(sample_space, sample_code)
        dataset.setMeasured(False)
        dataset.setParentDatasetCode(parent_dataset_code)
    
    """
    Creates registration details of the analysis dataset.
    Returns:
        DataSetRegistrationDetails
    """
    
    def create_analysis_dataset_details(sample_space, sample_code, parent_dataset_code, analysis_run):
    
        registration_details = factory.createBasicRegistrationDetails()
        dataset = registration_details.getDataSetInformation()
        set_analysis_dataset(sample_space, sample_code, parent_dataset_code, dataset)
    
        
        analysis_run_property = NewProperty(ANALYSIS_RUN_PROPERTY_CODE, analysis_run)
        dataset.setDataSetProperties([ analysis_run_property ])
        
    
        set_dataset_details(dataset, registration_details)
        return registration_details
    
    """ registers sample if it does not exist already """
    
    def register_sample_if_necessary(space_code, project_code, experiment_code, sample_code):   
        openbis = state.getOpenBisService()
        sampleIdentifier = SampleIdentifier.create(space_code, sample_code)
        if (openbis.tryGetSampleWithExperiment(sampleIdentifier) == None):
            sample = NewSample()
            sampleType = SampleType()
            sampleType.setCode(PLATE_TYPE_CODE)
            sample.setSampleType(sampleType)
            sample.setIdentifier(sampleIdentifier.toString())
            
            property = dto.VocabularyTermEntityProperty();
            vocabularyTerm = dto.VocabularyTerm();
            vocabularyTerm.setCode(PLATE_GEOMETRY);
            property.setVocabularyTerm(vocabularyTerm);
            propertyType = dto.PropertyType();
            dataType = dto.DataType();
            dataType.setCode(dto.DataTypeCode.CONTROLLEDVOCABULARY);
            propertyType.setDataType(dataType);
            propertyType.setCode(PLATE_GEOMETRY_PROPERTY_CODE);
            property.setPropertyType(propertyType);
            sample.setProperties([ property ])
            
    
            sample.setExperimentIdentifier("/" + space_code + "/" + project_code + "/" + experiment_code)
    
            openbis.registerSample(sample, None)
    
    # ---------------------
    
    
    def debug(*msg):
        print "".join(msg)
        
    def convert_to_png(dir, transparent_color):
    
        delete_original_files = True
        strategy = Tiff2PngConversionStrategy(transparent_color, 0, delete_original_files)
    
        # Uses #cores * machineLoad threads for the conversion, but not more than maxThreads
    
        machineLoad = ALLOWED_MACHINE_LOAD_DURING_THUMBNAIL_GENERATION
    
    tpylak's avatar
    tpylak committed
        maxThreads = 100
        errorMsg = FileConverter.performConversion(File(dir), strategy, machineLoad, maxThreads)
    
        if errorMsg != None:
            raise Exception("Error", errorMsg)
    
    def notify(plate_code):
        content  = "Dear Mr./Mrs.\n"
    
        hostname = "http://bwdl27.bw.f2.enterprise:8080/openbis"
        plate_link = hostname+"?viewMode=simple#entity=SAMPLE&action=SEARCH&code="+plate_code+"&sample_type=PLATE"
        content += "Data for plate : " + plate_code + " has been registered : \n" + plate_link + "\n"
    
        content += "\n"
        content += "Have a nice day!\n"
        content += "   openBIS\n"
        replyAddress = "Matthew.Smicker@sanofi-aventis.com"
        fromAddress = From("openbis@sanofi-aventis.com")
    
        recipients = [ "Matthew.Smicker@sanofi-aventis.com" ]
        state.mailClient.sendMessage("openBIS: registration finished - " + plate_code, content, replyAddress, fromAddress, recipients)
    
           
    """
    Allows to recognize that the subdirectory of the incoming dataset directory contains overlay images.
    This text has to appear in the subdirectory name. 
    """
    
    OVERLAYS_DIR_PATTERN = "_ROITiff"
    
    
    def register_images_with_overlays_and_analysis(incoming):
    
        if not incoming.isDirectory():
            return
    
        tr = service.transaction(incoming, factory)
            
        image_dataset_details = create_image_dataset_details(incoming)
        plate_code = image_dataset_details.getDataSetInformation().getSampleCode()
        space_code = image_dataset_details.getDataSetInformation().getSpaceCode()
        register_sample_if_necessary(space_code, DEFAULT_PROJECT_CODE, DEFAULT_EXPERIMENT_CODE, plate_code)
    
        # create the image data set and put everything in it initially
        image_data_set = tr.createNewDataSet(image_dataset_details)
        image_data_set_folder = tr.moveFile(incoming.getPath(), image_data_set)
        img_dataset_code = image_data_set.getDataSetCode()
              
        # move overlays folder
        overlays_dir = find_dir(File(image_data_set_folder), OVERLAYS_DIR_PATTERN)
        if overlays_dir != None:
    
            tr_overlays = service.transaction(overlays_dir, factory)
            convert_to_png(overlays_dir.getPath(), OVERLAYS_TRANSPARENT_COLOR)
    
            overlay_dataset_details = create_overlay_dataset_details(overlays_dir, 
    
                                         image_dataset_details.getDataSetInformation(), img_dataset_code, "png")
    
            overlays_data_set = tr_overlays.createNewDataSet(overlay_dataset_details)
            tr_overlays.moveFile(overlays_dir.getPath(), overlays_data_set, "overlays")
            tr_overlays.commit()
    
    
        # transform and move analysis file
        analysis_file = find_file_by_ext(File(image_data_set_folder), "xml")
        if analysis_file != None:
    
            tr_analysis = service.transaction(analysis_file, factory)
    
            analysis_run = extract_file_basename(analysis_file.getName())
            analysis_registration_details = create_analysis_dataset_details(
                                                space_code, plate_code, img_dataset_code, analysis_run)
    
            analysis_data_set = tr_analysis.createNewDataSet(analysis_registration_details)
            analysis_data_set_file = tr_analysis.createNewFile(analysis_data_set, analysis_file.getName())
    
            GEExplorerImageAnalysisResultParser(analysis_file.getPath()).writeCSV(File(analysis_data_set_file))
    
        notify(plate_code)
    
    register_images_with_overlays_and_analysis(incoming)