From 8be7453e6aac75eb5b83c1faeb744c542bd65a86 Mon Sep 17 00:00:00 2001
From: tpylak <tpylak>
Date: Fri, 28 Jan 2011 12:12:41 +0000
Subject: [PATCH] LMS-1998 one jython dropbox for images and analysis results

SVN: 19637
---
 screening/etc/data-set-handler.py | 353 +++++++++++++++++++++++-------
 1 file changed, 273 insertions(+), 80 deletions(-)

diff --git a/screening/etc/data-set-handler.py b/screening/etc/data-set-handler.py
index a08bd028e6f..77255fd2416 100755
--- a/screening/etc/data-set-handler.py
+++ b/screening/etc/data-set-handler.py
@@ -2,54 +2,138 @@
 # This is an example Jython dropbox for importing HCS image datasets
 
 import os
-import ch.systemsx.cisd.openbis.generic.shared.dto.identifier as identifier
+import random
+
+from ch.systemsx.cisd.openbis.generic.shared.dto.identifier import SampleIdentifier
 import ch.systemsx.cisd.openbis.generic.shared.basic.dto as dto
+from ch.systemsx.cisd.openbis.generic.shared.basic.dto import FileFormatType, DataSetType, SampleType, NewSample
 from ch.systemsx.cisd.openbis.dss.etl.dto.api.v1 import ImageFileInfo, ImageDataSetInformation, Channel
+from ch.systemsx.cisd.openbis.dss.etl.custom.geexplorer import GEExplorerImageAnalysisResultParser
+from java.io import File
 
 # ------------
-# You may want to modify this part.
+# Dropbox-specific image dataset registration. You may want to modify this part.
 # ------------
 
-# type of the new dataset
-DATASET_TYPE = "HCS_IMAGE"
-# file format of files in a new dataset
-FILE_FORMAT = "TIFF"
+# type of the new image dataset
+IMAGE_DATASET_TYPE = "HCS_IMAGE"
+# file format of files in a new image dataset
+IMAGE_FILE_FORMAT = "TIFF"
+# type of the new analysis dataset
+ANALYSIS_DATASET_TYPE = "HCS_IMAGE_ANALYSIS_DATA"
+# file format of the analysis dataset file
+ANALYSIS_FILE_FORMAT = "CSV"
+
 # space where the plate for which the dataset has been acquired exists
-SAMPLE_SPACE = "DEMO"
-# number of rile rows and columns on a well 
-TILE_ROWS_NUMBER = 2
-TILE_COLUMNS_NUMBER = 2
-# available channels, for each channel code and label have to be specified
-CHANNELS = [ Channel("DAPI", "DAPI"), Channel("FITC", "FITC"), Channel("CY5", "Cy5") ]
+PLATE_SPACE = "DEMO"
 
 RECOGNIZED_IMAGES_EXTENSIONS = ["tiff", "tif", "png", "gif", "jpg", "jpeg"]
 
+# ---------
+# sample type code of the plate, needed if a new sample is registered automatically
+PLATE_TYPE_CODE = "PLATE"
+# project and experiment where new plates will be registered 
+DEFAULT_PROJECT_CODE = "TEST"
+DEFAULT_EXPERIMENT_CODE = "SANOFI"
+PLATE_GEOMETRY_PROPERTY_CODE = "$PLATE_GEOMETRY"
+PLATE_GEOMETRY = "384_WELLS_16X24"
+
 # extracts code of the sample from the directory name
-def extract_sample_code(file_basename):
-    return file_basename.split(".")[0]
+def extract_sample_code(incoming_name):
+    file_basename = extract_file_basename(incoming_name)
+    #return file_basename.split(".")[0]
+    code = file_basename[file_basename.find("plates_")+7 : file_basename.rfind("_") ]
+    if code == "":
+        code = file_basename
+    return code
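+# For illustration only (the directory name below is hypothetical): for an incoming
+# directory named "plates_MYPLATE_2011-01-28", the basename is the whole name, the
+# text between "plates_" and the last "_" is "MYPLATE", so "MYPLATE" becomes the
+# plate code; if that pattern is absent, the whole basename is used as the code.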
 
-# Creates ImageFileInfo for a given path to an image
+def get_tile_coords(tile_num, tile_geometry):
+    columns = tile_geometry[1]
+    row = ((tile_num-1) / columns) + 1
+    col = ((tile_num-1) % columns) + 1
+    return (row, col)
+
+def get_max_tile_number(image_tokens_list):
+    max_tile = 0
+    for image_tokens in image_tokens_list:
+        max_tile = max(max_tile, image_tokens.tile)
+    #print "max_tile", max_tile
+    return max_tile
+
+# returns (rows, columns)
+def get_tile_geometry(image_tokens_list):
+    max_tile = get_max_tile_number(image_tokens_list)
+    if max_tile % 4 == 0 and max_tile != 4:
+        return (max_tile / 4, 4)
+    elif max_tile % 3 == 0:
+        return (max_tile / 3, 3)
+    elif max_tile % 2 == 0:
+        return (max_tile / 2, 2)
+    else:
+        return (max_tile, 1)
+
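+# Worked examples of the heuristic above, for illustration: a maximum tile number
+# of 12 gives (3, 4), 6 gives (2, 3), 4 gives (2, 2) (the "max_tile != 4" guard
+# avoids a 1x4 layout) and a prime count such as 5 falls back to (5, 1). With a
+# (2, 3) geometry, get_tile_coords(5, (2, 3)) then maps tile 5 to row 2, column 2.
+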
+class ImageTokens:
+    channel = None
+    tile = -1
+    path = ""
+    well = ""
+
+# Creates ImageTokens for a given path to an image
 # Example file name: A - 1(fld 1 wv Cy5 - Cy5).tif
-def create_image_info(path):
+def create_image_tokens(path):
+    image_tokens = ImageTokens()
+    image_tokens.path = path
+
     basename = os.path.splitext(path)[0]
 
     wellText = basename[0:find(basename, "(")] # A - 1
-    well = wellText.replace(" - ", "")
+    image_tokens.well = wellText.replace(" - ", "")
+    
     fieldText = basename[find(basename, "fld ")+4 : find(basename, " wv")]
     try:
-        fieldNum = int(fieldText) - 1
+        image_tokens.tile = int(fieldText)
+        #print "image_tokens.tile", image_tokens.tile
     except ValueError:
         raise Exception("Cannot parse field number from '"+fieldText+"' in '"+basename+"' file name.")
-    channel = basename[rfind(basename, " - ") + 3 : -1]
+
+    image_tokens.channel = basename[rfind(basename, " - ") + 3 : -1]
+    return image_tokens
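+
+# Worked example, for illustration, using the file name mentioned above:
+#   tokens = create_image_tokens("A - 1(fld 1 wv Cy5 - Cy5).tif")
+# yields tokens.well == "A1", tokens.tile == 1 and tokens.channel == "Cy5".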
     
-    tileRow = (fieldNum / TILE_COLUMNS_NUMBER) + 1
-    tileCol = (fieldNum % TILE_COLUMNS_NUMBER) + 1
-    img = ImageFileInfo(channel, tileRow, tileCol, path)
-    img.setWell(well)
+# Creates ImageFileInfo for the given parsed image tokens and a tile geometry
+# given as (rows, columns)
+def create_image_info(image_tokens, tile_geometry):
+    fieldNum = image_tokens.tile - 1
+    tileRow = (fieldNum / tile_geometry[1]) + 1
+    tileCol = (fieldNum % tile_geometry[1]) + 1
+
+    img = ImageFileInfo(image_tokens.channel, tileRow, tileCol, image_tokens.path)
+    img.setWell(image_tokens.well)
     return img
 
+def parse_image_tokens(dir):
+    image_tokens_list = []
+    dir_path = dir.getPath()
+    for file in os.listdir(dir_path):
+        ext = get_file_ext(file)
+        try:
+            extIx = RECOGNIZED_IMAGES_EXTENSIONS.index(ext)
+            # not reached if extension not found
+            image_tokens = create_image_tokens(file)
+            image_tokens_list.append(image_tokens)    
+        except ValueError:
+            pass # extension not recognized    
+    return image_tokens_list
+
+    
+def create_image_infos(image_tokens_list, tile_geometry):
+    images = []
+    for image_tokens in image_tokens_list:
+        image = create_image_info(image_tokens, tile_geometry)
+        images.append(image)    
+    return images
+
 # ------------
-# You probably do not want to modify this part
+# Generic utility
 # ------------
 
 def rfind(text, pattern):
@@ -66,68 +150,177 @@ def ensurePatternFound(ix, file, pattern):
     if ix == -1:
         raise Exception("Cannot find '"+pattern+"' pattern in file name '"+file+"'")    
 
-def create_dataset():
-    basename = extract_file_basename(incoming.getName())
-    dataset = Dataset()
-    
-    dataset.dataset_type = DATASET_TYPE
-    dataset.file_format = FILE_FORMAT
-    dataset.sample_space = SAMPLE_SPACE
-    dataset.sample_code = extract_sample_code(basename)
-    
-    dataset.tile_rows_number = TILE_ROWS_NUMBER
-    dataset.tile_columns_number = TILE_COLUMNS_NUMBER
-    
-    return dataset
+def extract_file_basename(filename):
+    lastDot = filename.rfind(".")
+    if lastDot != -1:
+        return filename[0:lastDot]
+    else:
+        return filename
+
+def get_file_ext(file):
+    return os.path.splitext(file)[1][1:].lower()
+
+def find_file(incoming_file, expected_ext):
+    if not incoming_file.isDirectory():
+        return None
+    incoming_path = incoming_file.getPath()
+    for file in os.listdir(incoming_path):
+        ext = get_file_ext(file)
+        if ext.upper() == expected_ext.upper():
+            return File(incoming_path, file)
+    return None
 
-def get_available_channels():
-    return CHANNELS
+def get_random_string():
+    return str(int(random.random()*1000000000))
+
+def get_tmp_dir(incoming, label):
+    dropbox_parent_dir = incoming.getParentFile().getParent()
+    tmp_dir = File(dropbox_parent_dir, label+".tmp."+get_random_string())
+    os.mkdir(tmp_dir.getPath())
+    return tmp_dir
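+
+# For illustration (paths are hypothetical): if the incoming item is
+# /data/dropbox-incoming/plates_MYPLATE_1, the temporary directory is created one
+# level above the dropbox directory, e.g. /data/image-analysis.tmp.348219647,
+# where the numeric suffix comes from get_random_string().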
+
+# ------------
+# Generic dataset registration
+# ------------
 
 class Dataset:
-    sample_code = None
-    sample_space = None
-    dataset_type = None
-    file_format = None
-    # -- imaging specific
+    sample_code = ""
+    sample_space = ""
+    dataset_type = ""
+    file_format = ""
+    is_measured = True
+    parent_dataset_code = ""
+
+def set_dataset_details(dataset, registration_details):
+    data_set_info = registration_details.getDataSetInformation()
+    data_set_info.setSpaceCode(dataset.sample_space)
+    data_set_info.setSampleCode(dataset.sample_code)
+    if (dataset.parent_dataset_code):
+        data_set_info.setParentDataSetCodes([dataset.parent_dataset_code])
+    registration_details.setFileFormatType(FileFormatType(dataset.file_format));
+    registration_details.setDataSetType(DataSetType(dataset.dataset_type));
+    registration_details.setMeasuredData(dataset.is_measured);
+    return registration_details
+
+# ------------
+# Image dataset registration
+# ------------
+
+class ImageDataset(Dataset): # adds imaging-specific fields to Dataset
+    tile_num = -1
     tile_rows_number = -1
     tile_columns_number = -1
+    images = None
+    channels = None
 
-def create_image_infos(dir):
-    images = []
-    dir_path = dir.getPath()
-    for file in os.listdir(dir_path):
-    	ext = os.path.splitext(file)[1][1:].lower()
-	try:
-		extIx = RECOGNIZED_IMAGES_EXTENSIONS.index(ext)
-                # not reached if extension not found
-		image = create_image_info(file)
-                images.append(image)	
-	except ValueError:
-		pass # extension not recognized	
-    return images
+def get_available_channels(images):
+    channel_codes = {}
+    for image in images:
+        channel_codes[image.getChannelCode()] = 1
+    channels = []
+    for channelCode in channel_codes.keys():
+        channels.append(Channel(channelCode, channelCode))
+    return channels
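+
+# For illustration: if the ImageFileInfo objects carry two distinct channel codes
+# (possibly repeated across images), the result is exactly two Channel objects,
+# each using its code also as its label.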
+
+def create_image_dataset(incoming):
+    dataset = ImageDataset()
+    
+    dataset.dataset_type = IMAGE_DATASET_TYPE
+    dataset.file_format = IMAGE_FILE_FORMAT
+    dataset.sample_space = PLATE_SPACE
+    dataset.sample_code = extract_sample_code(incoming.getName())
+    dataset.is_measured = True
+    dataset.parent_dataset_code = None
+
+    image_tokens_list = parse_image_tokens(incoming)
+    tile_geometry = get_tile_geometry(image_tokens_list)
+    dataset.images = create_image_infos(image_tokens_list, tile_geometry)
+    dataset.channels = get_available_channels(dataset.images)
+    
+    dataset.tile_rows_number = tile_geometry[0]
+    dataset.tile_columns_number = tile_geometry[1]
+
+    return dataset
+    
+def create_image_dataset_details(incoming):
+    image_dataset = create_image_dataset(incoming)
+    registration_details = factory.createImageRegistrationDetails()
+    
+    set_dataset_details(image_dataset, registration_details)
 
-def set_dataset_info(registration_details, dataset):
     data_set_info = registration_details.getDataSetInformation()
-    data_set_info.setSpaceCode(dataset.sample_space)
-    data_set_info.setSampleCode(dataset.sample_code)
-    registration_details.setFileFormatType(dto.FileFormatType(dataset.file_format));
-    registration_details.setDataSetType(dto.DataSetType(dataset.dataset_type));
-    registration_details.setMeasuredData(True);
-    # -- imaging specific
-    data_set_info.setTileGeometry(dataset.tile_rows_number, dataset.tile_columns_number)
-    images = create_image_infos(incoming)
-    data_set_info.setImages(images)
-    channels = get_available_channels()
-    data_set_info.setChannels(channels)
+    data_set_info.setTileGeometry(image_dataset.tile_rows_number, image_dataset.tile_columns_number)
+    data_set_info.setImages(image_dataset.images)
+    data_set_info.setChannels(image_dataset.channels)
+    return registration_details
+
+# ---------------------
+
+def create_analysis_dataset(sample_space, sample_code, parent_dataset_code):
+    dataset = Dataset()
     
-def extract_file_basename(filename):
-    lastDot = filename.rfind(".")
-    if lastDot != -1:
-        return filename[0:lastDot]
-    else:
-        return filename
-     
-dataset = create_dataset()
-registration_details = factory.createRegistrationDetails()
-set_dataset_info(registration_details, dataset)
-service.queueDataSetRegistration(incoming, registration_details)
+    dataset.dataset_type = ANALYSIS_DATASET_TYPE
+    dataset.file_format = ANALYSIS_FILE_FORMAT
+    dataset.sample_space = sample_space
+    dataset.sample_code = sample_code
+    dataset.is_measured = False
+    dataset.parent_dataset_code = parent_dataset_code
+    return dataset
+
+def create_analysis_dataset_details(sample_space, sample_code, parent_dataset_code):
+    analysis_registration_details = factory.createRegistrationDetails()
+    analysis_dataset = create_analysis_dataset(sample_space, sample_code, parent_dataset_code)
+    set_dataset_details(analysis_dataset, analysis_registration_details)
+    return analysis_registration_details
+
+def register_sample_if_necessary(space_code, project_code, experiment_code, sample_code):   
+    openbis = state.getOpenBisService()
+    sampleIdentifier = SampleIdentifier.create(space_code, sample_code)
+    if (openbis.tryGetSampleWithExperiment(sampleIdentifier) == None):
+        sample = NewSample()
+        sampleType = SampleType()
+        sampleType.setCode(PLATE_TYPE_CODE)
+        sample.setSampleType(sampleType)
+        sample.setIdentifier(sampleIdentifier.toString())
+        
+        property = dto.VocabularyTermEntityProperty();
+        vocabularyTerm = dto.VocabularyTerm();
+        vocabularyTerm.setCode(PLATE_GEOMETRY);
+        property.setVocabularyTerm(vocabularyTerm);
+        propertyType = dto.PropertyType();
+        dataType = dto.DataType();
+        dataType.setCode(dto.DataTypeCode.CONTROLLEDVOCABULARY);
+        propertyType.setDataType(dataType);
+        propertyType.setCode(PLATE_GEOMETRY_PROPERTY_CODE);
+        property.setPropertyType(propertyType);
+        sample.setProperties([ property ])
+        
+        sample.setExperimentIdentifier("/"+space_code+"/"+project_code+"/"+experiment_code)
+        openbis.registerSample(sample, None)
+
+# ---------------------
+
+def register_images_and_analysis(incoming):
+    if not incoming.isDirectory():
+        return
+    analysis_file = find_file(incoming, "xml")
+        
+    image_dataset_details = create_image_dataset_details(incoming)
+    plate_code = image_dataset_details.getDataSetInformation().getSampleCode()
+    space_code = image_dataset_details.getDataSetInformation().getSpaceCode()
+    
+    if analysis_file != None:
+        tmp_analysis_dir = get_tmp_dir(incoming, "image-analysis")
+        tmp_analysis_file = File(tmp_analysis_dir, analysis_file.getName())
+        GEExplorerImageAnalysisResultParser(analysis_file.getPath()).writeCSV(tmp_analysis_file)
+    
+    register_sample_if_necessary(space_code, DEFAULT_PROJECT_CODE, DEFAULT_EXPERIMENT_CODE, plate_code)
+    img_dataset_code = service.queueDataSetRegistration(incoming, image_dataset_details).getCode()
+    
+    if analysis_file != None:
+        analysis_registration_details = create_analysis_dataset_details(space_code, plate_code, img_dataset_code)  
+        service.queueDataSetRegistration(tmp_analysis_file, analysis_registration_details)
+
+
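+# Overall flow, sketched for illustration (the directory name and its content are
+# hypothetical): an incoming directory such as "plates_MYPLATE_1" containing tile
+# images like "A - 1(fld 1 wv Cy5 - Cy5).tif" and, optionally, one GE Explorer
+# .xml analysis file leads to (1) the plate sample /DEMO/MYPLATE being registered
+# under /DEMO/TEST/SANOFI if it does not exist yet, (2) an HCS_IMAGE dataset being
+# queued for the images, and (3) if the .xml file was found, an
+# HCS_IMAGE_ANALYSIS_DATA dataset (converted to CSV) being queued with the image
+# dataset as its parent.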
+register_images_and_analysis(incoming)
-- 
GitLab