diff --git a/screening/sinergia/create_global_siRNAGene_csv.py b/screening/sinergia/create_global_siRNAGene_csv.py new file mode 100644 index 0000000000000000000000000000000000000000..988d9a86804997e7bc89eceb297f300b071544d9 --- /dev/null +++ b/screening/sinergia/create_global_siRNAGene_csv.py @@ -0,0 +1,222 @@ +import os, glob, re, csv, time, shutil, sys +from time import * +from datetime import * + +incoming =sys.argv[1] + + + +#The siRNA csv files are renamed to use the the same well names as in openBIS, instead of w1, w2, etc + +def get_sirna_wells(incoming): + for csvfile in glob.glob(os.path.join(incoming, 'w*_sirna.csv')): + (dirName, fileName) = os.path.split(csvfile) + (basename, extension) = os.path.splitext(fileName) + token_list = re.split(r"[_]",basename) + c_well = token_list[0] + other_well = token_list[1] + control_well='' + well='' + if (c_well == "w1"): + control_well = "A1" + if (c_well == "w2"): + control_well = "A2" + if (other_well == "w3"): + well = "A3" + if (other_well == "w4"): + well = "A4" + if (other_well == "w5"): + well = "A5" + if (other_well == "w6"): + well = "A6" + if (other_well == "w7"): + well = "B1" + if (other_well == "w8"): + well = "B2" + if (other_well == "w9"): + well = "B3" + if (other_well == "w10"): + well = "B4" + if (other_well == "w11"): + well = "B5" + if (other_well == "w12"): + well = "B6" + if (other_well == "w13"): + well = "C1" + if (other_well == "w14"): + well = "C2" + if (other_well == "w15"): + well = "C3" + if (other_well == "w16"): + well = "C4" + if (other_well == "w17"): + well = "C5" + if (other_well == "w18"): + well = "C6" + if (other_well == "w19"): + well = "D1" + if (other_well == "w20"): + well = "D2" + if (other_well == "w21"): + well = "D3" + if (other_well == "w22"): + well = "D4" + if (other_well == "w23"): + well = "D5" + if (other_well == "w24"): + well = "D6" + + new_csv = incoming + "/" + control_well + "_" + well + "_sirna.csv" + + if not os.path.exists(new_csv): + shutil.move(csvfile, 
new_csv) + +get_sirna_wells(incoming) + + +#The gene csv files are renamed to use the same well names as in openBIS, instead of w1, w2, etc + +def get_gene_wells(incoming): + for csvfile in glob.glob(os.path.join(incoming, 'w*_gene.csv')): + (dirName, fileName) = os.path.split(csvfile) + (basename, extension) = os.path.splitext(fileName) + token_list = re.split(r"[_]",basename) + c_well = token_list[0] + other_well = token_list[1] + control_well='' + gene_well='' + if (c_well == "w1"): + control_well = "A1" + if (c_well == "w2"): + control_well = "A2" + if (other_well == "w4-w5-w6"): + gene_well = "A4-A5-A6" + if (other_well == "w7-w8-w9"): + gene_well = "B1-B2-B3" + if (other_well == "w10-w11-w12"): + gene_well = "B4-B5-B6" + if (other_well == "w13-w14-w15"): + gene_well = "C1-C2-C3" + if (other_well == "w16-w17-w18"): + gene_well = "C4-C5-C6" + if (other_well == "w19-w20-w21"): + gene_well = "D1-D2-D3" + if (other_well == "w22-w23-w24"): + gene_well = "D4-D5-D6" + + gene_csv = incoming + "/" + control_well + "_" + gene_well + "_gene.csv" + + if not os.path.exists(gene_csv): + shutil.move(csvfile, gene_csv) + +get_gene_wells(incoming) + + +# The plate code is extracted from the file OriginalDataDirectory.txt. This is the plate that contains the images produced by Ludovico and the matlab files given by Fethallah +def extractPlateCode(incoming): + plateCode = '' + for textfile in glob.glob(os.path.join(incoming, 'OriginalDataDirectory.txt')): + text = open(textfile, "r") + lineIndex =0 + for line in text: + lineIndex=lineIndex+1 + if re.match('PLATE',line): + token_list = re.split(r"[\t]",line) + partialCode = token_list[0] + plateCode = partialCode[0:9] + + return plateCode + +extractPlateCode(incoming) + +#The file Info_plates_sirna_genes.txt contains info on what genes and siRNA are contained in each well of each plate. 
If the plate code extracted above is the same as one of the plate codes in the file, the info regarding that plate is extracted +def extractInfoPlates(incoming): + well_list=[] + plate_list=[] + sirna_list=[] + gene_list=[] + for textfile in glob.glob(os.path.join(incoming, 'Info_plates_sirna_genes.txt')): + text = open(textfile, "r") + lineIndex =0 + for line in text: + lineIndex=lineIndex+1 + token_list = re.split(r"[\t]",line) + token_list = [ item.strip() for item in token_list ] + token_list = filter(lambda x: len(x) > 0, token_list) + well = token_list[0] + plate = token_list[1] + sirna = token_list[2] + gene = token_list[3] + + if (plate == extractPlateCode(incoming).strip()): + well_list.append(well) + plate_list.append(plate) + sirna_list.append(sirna) + gene_list.append(gene) + + return well_list, plate_list, sirna_list, gene_list + + +extractInfoPlates(incoming) + +#The single sirna csv files are combined into one global_siRNA.csv which contains also info on sirna and genes contained in each well +#The single gene csv files are combined into one global_gene.csv which contains also info on genes contained in each well + +def parse_csv(incoming): + global_sirna_csv = incoming+"/global_siRNA.csv" + global_gene_csv = incoming+"/global_gene.csv" + f = open(global_sirna_csv, "a") + g = open(global_gene_csv, "a") + + for csv_file in glob.glob(os.path.join(incoming, 'A*.csv')): + (dirName2, fileName2) = os.path.split(csv_file) + (basename2, extension2) = os.path.splitext(fileName2) + well_list = re.split(r"[_]",basename2) + control = well_list[0] + measure = well_list[1] + meas = measure[0:2] + csvfile = open(csv_file, "rb") + test = csv.reader(csvfile, delimiter=',', quotechar='"') + + + if (measure == "A3"): + for x, row in enumerate(test): + #if (x==0 and control == "A2"): + # s = "siRNA Well,"+ "Control Well," + "Gene," + "siRNA," + ','.join(row) +'\n' + # f.write(s) + if x !=0: + t = measure + "," + control + "," + "control gene," + "control siRNA" + "," + 
",".join(row) +"\n" + f.write(t) + + + + + for i,j,k in zip(extractInfoPlates(incoming)[0],extractInfoPlates(incoming)[2],extractInfoPlates(incoming)[3]): + if (measure == i): + for x, row in enumerate(test): + if (x==0 and measure == 'A6' and control == 'A2'): + s = "siRNA Well,"+ "Control Well," + "Gene," + "siRNA," + ','.join(row) +'\n' + f.write(s) + if x !=0: + t = measure + "," + control + "," + k + "," + j + "," + ",".join(row) + "\n" + f.write(t) + f.close + + + for l,m in zip(extractInfoPlates(incoming)[0],extractInfoPlates(incoming)[3]): + if (meas == l): + for y, row in enumerate(test): + print "y, meas, l ", y, meas, control + if (y==0 and meas == "C1" and control == "A1"): + s = "siRNA Well,"+ "Control Well," + "Gene," + ','.join(row) +'\n' + # print s + g.write(s) + if y !=0: + t = measure + "," + control + "," + m + "," + ",".join(row) + "\n" + g.write(t) + g.close + + + +parse_csv(incoming) + diff --git a/screening/sinergia/import-analysis-datasets-withParentDataSet.py b/screening/sinergia/import-analysis-datasets-withParentDataSet.py new file mode 100644 index 0000000000000000000000000000000000000000..2e03b3bb2df8826833b902c1203056ff9da4702a --- /dev/null +++ b/screening/sinergia/import-analysis-datasets-withParentDataSet.py @@ -0,0 +1,154 @@ +#! /usr/bin/env python +""" + Import analysis data in two datasets: one dataset for videos and one dataset for matlab files. 
+ +""" + +from ch.systemsx.cisd.openbis.generic.shared.api.v1.dto import SearchCriteria +from ch.systemsx.cisd.openbis.generic.shared.api.v1.dto import SearchSubCriteria + +import os +import glob +import re +import time +import shutil, sys +from time import * +from datetime import * + + + +print '###################################' +tz=localtime()[3]-gmtime()[3] +d=datetime.now() +print d.strftime("%Y-%m-%d %H:%M:%S GMT"+"%+.2d" % tz+":00") + + + + + +def copyTextFile(incomingPath): + for textfile in glob.glob(os.path.join(incomingPath, 'OriginalDataDirectory.txt')): + rawDataFile = incomingPath + '/RawDataDirectory.txt' + shutil.copyfile(textfile, rawDataFile) + +copyTextFile(incoming.getPath()) + +def extractSpaceCode(incomingPath): + spaceCode = "SINERGIA" + return spaceCode + + +def extractPlateCode(incomingPath): + for textfile in glob.glob(os.path.join(incomingPath, 'RawDataDirectory.txt')): + text = open(textfile, "r") + lineIndex =0 + for line in text: + lineIndex=lineIndex+1 + if re.match('PLATE', line): + token_list = re.split(r"[ ]",line) + token_list = [ item.strip() for item in token_list ] + token_list = filter(lambda x: len(x) > 0, token_list) + plateCode = token_list[0] + return plateCode + +extractPlateCode(incoming.getPath()) + + +def extractDataSetCode(incomingPath): + dataSetCode = '' + for textfile in glob.glob(os.path.join(incomingPath, 'RawDataDirectory.txt')): + text = open(textfile, "r") + lineIndex =0 + for line in text: + lineIndex=lineIndex+1 + # if re.match('/raid', line): + if re.match('/Users', line): + token_list = re.split(r"[/]",line) + token_list = [ item.strip() for item in token_list ] + token_list = filter(lambda x: len(x) > 0, token_list) + # dataSetCode = token_list[8] # right position for /raid + dataSetCode = token_list[10] #right position for local use + #plateCode = line + return dataSetCode + +extractDataSetCode(incoming.getPath()) + + + + +def get_videos(incomingPath): + directory = incomingPath + '/videos' + if not 
os.path.exists(directory): + os.makedirs(directory) + for mp4 in glob.glob(os.path.join(incomingPath, '*.mp4')): + (incomingPath, file) = os.path.split(mp4) + (filename, extension) = os.path.splitext(file) + stage= filename + shutil.move(incomingPath +'/'+file, directory) + for webm in glob.glob(os.path.join(incomingPath, '*.webm' )): + (incomingPath, file) = os.path.split(webm) + (filename, extension) = os.path.splitext(file) + stage= filename + shutil.move(incomingPath +'/'+file, directory) + for jpg in glob.glob(os.path.join(incomingPath, '*.jpg')): + (incomingPath, file) = os.path.split(jpg) + (filename, extension) = os.path.splitext(file) + stage= filename + shutil.move(incomingPath +'/'+file, directory) + for html in glob.glob(os.path.join(incomingPath, '*.html')): + (incomingPath, file) = os.path.split(html) + (filename, extension) = os.path.splitext(file) + stage= filename + shutil.move(incomingPath +'/'+file, directory) + +get_videos(incoming.getPath()) + + +def get_matfiles(incomingPath): + matDir = incomingPath + '/matfiles' + if not os.path.exists(matDir): + os.makedirs(matDir) + for mat in glob.glob(os.path.join(incomingPath, '*.mat')): + (incomingPath, file) = os.path.split(mat) + (filename, extension) = os.path.splitext(file) + stage = filename[:3] + shutil.move(incomingPath +'/'+file, matDir) + for txt in glob.glob(os.path.join(incomingPath, 'OriginalDataDirectory.txt')): + (incomingPath, file) = os.path.split(txt) + (filename, extension) = os.path.splitext(file) + stage = filename[:3] + shutil.move(incomingPath +'/'+file, matDir) + + +get_matfiles(incoming.getPath()) + +tr = service.transaction(incoming, factory) +incoming = tr.getIncoming() + +data_set = tr.createNewDataSet() +data_set.setDataSetType("HCS_IMAGE_SEGMENTATION_TRACKING_FEATURES") + +data_set2 = tr.createNewDataSet() +data_set2.setDataSetType("HCS_ANALYSIS_SEGMENTATION_AND_FEATURES") + + +sampleIdentifier = 
"/"+extractSpaceCode(incoming.getPath())+"/"+extractPlateCode(incoming.getPath()) +print sampleIdentifier +plate = tr.getSample(sampleIdentifier) +data_set.setSample(plate) +data_set2.setSample(plate) +# Get the search service +search_service = tr.getSearchService() + +sc = SearchCriteria() +sc.addMatchClause(SearchCriteria.MatchClause.createAttributeMatch(SearchCriteria.MatchClauseAttribute.CODE, extractDataSetCode(incoming.getPath()) )); +foundDataSets = search_service.searchForDataSets(sc) +if foundDataSets.size() > 0: + data_set.setParentDatasets([ds.getDataSetCode() for ds in foundDataSets]) + data_set2.setParentDatasets([ds.getDataSetCode() for ds in foundDataSets]) + +videoPath = incoming.getPath() + '/videos' +tr.moveFile(videoPath, data_set) + +matPath = incoming.getPath() + '/matfiles' +tr.moveFile(matPath, data_set2) \ No newline at end of file diff --git a/screening/sinergia/import-feature-vectors-with-lists-childOfSegmentationDS-concatcsvGenes.py b/screening/sinergia/import-feature-vectors-with-lists-childOfSegmentationDS-concatcsvGenes.py new file mode 100644 index 0000000000000000000000000000000000000000..5b1e9bcc569c44dc389c6a6c0046cda4508fd682 --- /dev/null +++ b/screening/sinergia/import-feature-vectors-with-lists-childOfSegmentationDS-concatcsvGenes.py @@ -0,0 +1,506 @@ + +import os, glob, re, csv, time, shutil +from time import * +from datetime import * + + +#from ch.systemsx.cisd.openbis.dss.etl.dto.api.v2 import * +from ch.systemsx.cisd.openbis.dss.etl.dto.api.v2 import SimpleFeatureVectorDataConfig +#from java.util import Properties +from ch.systemsx.cisd.openbis.generic.shared.api.v1.dto import SearchCriteria +from ch.systemsx.cisd.openbis.generic.shared.api.v1.dto import SearchSubCriteria +from ch.systemsx.cisd.openbis.dss.etl.dto.api.v2 import FeatureListDataConfig + + +''' +Dropbox for importing a feature vector dataset and for creating feature lists datasets from there. 
+ +This dataset is set to be a child of the segmentation dataset produced by Fethallah. + +''' +print '###################################' +tz=localtime()[3]-gmtime()[3] +d=datetime.now() +print d.strftime("%Y-%m-%d %H:%M:%S GMT"+"%+.2d" % tz+":00") + +accuracyA1_sirna_list = [] +accuracyA2_sirna_list =[] +KStestA2_sirna_list = [] +KStestA1_sirna_list = [] +KSdeltaA2_sirna_list = [] +KSdeltaA1_sirna_list = [] +KSpvalueA2_sirna_list = [] +KSpvalueA1_sirna_list = [] +feature_directionA2_sirna_list = [] +feature_directionA1_sirna_list = [] +accuracyA1_gene_list = [] +accuracyA2_gene_list =[] +KStestA2_gene_list = [] +KStestA1_gene_list = [] +KSdeltaA2_gene_list = [] +KSdeltaA1_gene_list = [] +KSpvalueA2_gene_list = [] +KSpvalueA1_gene_list = [] +feature_directionA2_gene_list = [] +feature_directionA1_gene_list = [] + + +def process(transaction): + + incoming = transaction.getIncoming() + + +# def copyTextFile(incoming): +# for textfile in glob.glob(os.path.join(incoming, 'OriginalDataDirectory.txt')): +# rawDataFile = incoming + '/RawDataDirectory.txt' +# shutil.copyfile(textfile, rawDataFile) +# +# copyTextFile(incoming.getPath()) + + +#extract dataset code and plate of original image files from file OriginalDataDirectory.txt + def extractImageDataSetCode(incoming): + dataSetCode = '' + plateCode = '' + for textfile in glob.glob(os.path.join(incoming, 'OriginalDataDirectory.txt')): + text = open(textfile, "r") + lineIndex =0 + for line in text: + lineIndex=lineIndex+1 + if re.match('/raid', line): + # if re.match('/Users', line): + token_list = re.split(r"[/]",line) + token_list = [ item.strip() for item in token_list ] + token_list = filter(lambda x: len(x) > 0, token_list) + dataSetCode = token_list[8] #right position for raid is 8, for local use is 10 + if re.match('PLATE',line): + plateCode = line + + return dataSetCode, plateCode + + extractImageDataSetCode(incoming.getPath()) + +# check if plate code extracted above is the same as one of those in file 
AnalysisFethallaExample_location.txt. If so, get the dataset code associated with that plate. This is the dataset +# that contains the analysis matlab files produced by Fethallah, which have been used by Riwal to perform his analysis, so the new dataset registered should be a child of Fethallah's dataset. + def extractSegmentationDataSetCode(incoming): + segmentationDataSetCode = '' + segmentationPlateCode = '' + for textfile in glob.glob(os.path.join(incoming, 'FethallahAnalysisOBLocation.txt')): + text = open(textfile, "r") + lineIndex =0 + for line in text: + lineIndex=lineIndex+1 + token_list = re.split(r"[\t]",line) + token_list = [ item.strip() for item in token_list ] + token_list = filter(lambda x: len(x) > 0, token_list) + segmentationPlateCode = token_list[1] + if (segmentationPlateCode == extractImageDataSetCode(incoming)[1].strip()): + segmentationDataSetCode = token_list[0] + + return segmentationDataSetCode + + extractSegmentationDataSetCode(incoming.getPath()) + + def parse_gene_csv(incoming): + for csv_file in glob.glob(os.path.join(incoming, 'A*gene.csv')): + (dirName2, fileName2) = os.path.split(csv_file) + (basename2, extension2) = os.path.splitext(fileName2) + well_list = re.split(r"[_]",basename2) + control = well_list[0] + measure = well_list[1] + csvfile = open(csv_file, "rb") + test = csv.reader(csvfile, delimiter=',', quotechar='"') + for i, row in enumerate(test): + if i !=0: + fnv = row[0] + accuracy_value = row[1] + KStest_value = row[2] + KSdelta_value = row[3] + KSpvalue_value = row[4] + feature_direction_value = row[5] + + accuracyA2 = (fnv +"_G_ac_A2").upper() + KStestA2 = (fnv+"_G_KSt_A2").upper() + KSdeltaA2 = (fnv+"_G_KSd_A2").upper() + KSpvalueA2 = (fnv+"_G_KSp_A2").upper() + feature_directionA2 = (fnv+"_G_dir_A2").upper() + + accuracyA1 = (fnv +"_G_ac_A1").upper() + KStestA1 = (fnv+"_G_KSt_A1").upper() + KSdeltaA1 = (fnv+"_G_KSd_A1").upper() + KSpvalueA1 = (fnv+"_G_KSp_A1").upper() + feature_directionA1 = 
(fnv+"_G_dir_A1").upper() + + + accuracyA2_gene_list.append(accuracyA2) + accuracyA1_gene_list.append(accuracyA1) + KStestA2_gene_list.append(KStestA2) + KStestA1_gene_list.append(KStestA1) + KSdeltaA2_gene_list.append(KSdeltaA2) + KSdeltaA1_gene_list.append(KSdeltaA1) + KSpvalueA2_gene_list.append(KSpvalueA2) + KSpvalueA1_gene_list.append(KSpvalueA1) + feature_directionA2_gene_list.append(feature_directionA2) + feature_directionA1_gene_list.append(feature_directionA1) + + return accuracyA2_gene_list, accuracyA1_gene_list, KStestA2_gene_list, KStestA1_gene_list, KSdeltaA2_gene_list, KSdeltaA1_gene_list, KSpvalueA2_gene_list, KSpvalueA1_gene_list, feature_directionA2_gene_list, feature_directionA1_gene_list + + parse_gene_csv(incoming.getPath()) + + + + def parse_sirna_csv(incoming): + + for csv_file in glob.glob(os.path.join(incoming, 'A*sirna.csv')): + (dirName2, fileName2) = os.path.split(csv_file) + (basename2, extension2) = os.path.splitext(fileName2) + if re.search("-", basename2): + continue + else: + well_list = re.split(r"[_]",basename2) + control = well_list[0] + measure = well_list[1] + csvfile = open(csv_file, "rb") + test = csv.reader(csvfile, delimiter=',', quotechar='"') + for i, row in enumerate(test): + if i !=0: + fnv = row[0] + accuracy_value = row[1] + KStest_value = row[2] + KSdelta_value = row[3] + KSpvalue_value = row[4] + feature_direction_value = row[5] + + accuracyA2 = (fnv +"_S_ac_A2").upper() + KStestA2 = (fnv+"_S_KSt_A2").upper() + KSdeltaA2 = (fnv+"_S_KSd_A2").upper() + KSpvalueA2 = (fnv+"_S_KSp_A2").upper() + feature_directionA2 = (fnv+"_S_dir_A2").upper() + + accuracyA1 = (fnv +"_S_ac_A1").upper() + KStestA1 = (fnv+"_S_KSt_A1").upper() + KSdeltaA1 = (fnv+"_S_KSd_A1").upper() + KSpvalueA1 = (fnv+"_S_KSp_A1").upper() + feature_directionA1 = (fnv+"_S_dir_A1").upper() + + + accuracyA2_sirna_list.append(accuracyA2) + accuracyA1_sirna_list.append(accuracyA1) + KStestA2_sirna_list.append(KStestA2) + KStestA1_sirna_list.append(KStestA1) + 
KSdeltaA2_sirna_list.append(KSdeltaA2) + KSdeltaA1_sirna_list.append(KSdeltaA1) + KSpvalueA2_sirna_list.append(KSpvalueA2) + KSpvalueA1_sirna_list.append(KSpvalueA1) + feature_directionA2_sirna_list.append(feature_directionA2) + feature_directionA1_sirna_list.append(feature_directionA1) + + return accuracyA2_sirna_list, accuracyA1_sirna_list, KStestA2_sirna_list, KStestA1_sirna_list, KSdeltaA2_sirna_list, KSdeltaA1_sirna_list, KSpvalueA2_sirna_list, KSpvalueA1_sirna_list, feature_directionA2_sirna_list, feature_directionA1_sirna_list + + parse_sirna_csv(incoming.getPath()) + + + + + def defineGeneFeatures(featuresBuilder, incoming): + for csv_file in glob.glob(os.path.join(incoming, 'global_gene.csv')): + csvf = open(csv_file,'r') + globcsv = csv.reader(csvf, delimiter=',') + globcsv.next() + result_accuracy = {} # accuracy_label => {measure_well => accuracy_value} + result_kstest ={} # kstest => {measure_well => kstest_value} + result_ksdelta = {} # ksdelta => {measure_well => ksdelta_value} + result_kspvalue ={} # kspvalue => {measure_well => kspvalue_value} + result_feature_direction ={}# feature_direction => {measure_well => feature_direction_value} + + for row in globcsv: + measure_well = row[0] + group_well = re.split(r"[-]",measure_well) + group_well1 = group_well[0] + control_well = row[1] + feature_name = row[3] + accuracy_label = feature_name + "_G_ac" + accuracy_value = row[4] + kstest = feature_name + "_G_KSt" + kstest_value = row[5] + ksdelta = feature_name + "_G_KSd" + ksdelta_value = row[6] + kspvalue = feature_name + "_G_KSp" + kspvalue_value = row[7] + feature_direction = feature_name + "_G_dir" + feature_direction_value = row[8] + + + accuracy_key = "%s:%s" %(accuracy_label, control_well) + kstest_key = "%s:%s" %(kstest, control_well) + ksdelta_key = "%s:%s" %(ksdelta, control_well) + kspvalue_key ="%s:%s" %(kspvalue, control_well) + feature_direction_key = "%s:%s" %(feature_direction, control_well) + + + if not accuracy_key in result_accuracy: + 
result_accuracy[accuracy_key] = {} + + result_accuracy[accuracy_key][group_well1] = accuracy_value +# if not kstest_key in result_kstest: +# result_kstest[kstest_key] = {} +# +# result_kstest[kstest_key][measure_well] = kstest_value +# +# +# if not ksdelta_key in result_ksdelta: +# result_ksdelta[ksdelta_key] = {} +# +# result_ksdelta[ksdelta_key][measure_well] = ksdelta_value +# +# if not kspvalue_key in result_kspvalue: +# result_kspvalue[kspvalue_key] = {} +# +# result_kspvalue[kspvalue_key][measure_well] = kspvalue_value +# +# +# if not feature_direction_key in result_feature_direction: +# result_feature_direction[feature_direction_key] = {} +# +# result_feature_direction[feature_direction_key][measure_well] = feature_direction_value + + + + for feature in result_accuracy: + feature_accuracy = featuresBuilder.defineFeature(feature) + for well in result_accuracy[feature]: + value = result_accuracy[feature][well] + feature_accuracy.addValue(well, value) + +# for feature_kst in result_kstest: +# feature_kstest = featuresBuilder.defineFeature(feature_kst) +# for well2 in result_kstest[feature_kst]: +# value2 = result_kstest[feature_kst][well2] +# feature_kstest.addValue(well2, value2) +# +# +# for feature_ksd in result_ksdelta: +# feature_ksdelta = featuresBuilder.defineFeature(feature_ksd) +# for well1 in result_ksdelta[feature_ksd]: +# value1 = result_ksdelta[feature_ksd][well1] +# feature_ksdelta.addValue(well1, value1) +# +# for feature_ksp in result_kspvalue: +# feature_kspvalue = featuresBuilder.defineFeature(feature_ksp) +# for well3 in result_kspvalue[feature_ksp]: +# value3 = result_kspvalue[feature_ksp][well3] +# feature_kspvalue.addValue(well3, value3) +# +# for feature_fd in result_feature_direction: +# feature_feature_direction= featuresBuilder.defineFeature(feature_fd) +# for well4 in result_feature_direction[feature_fd]: +# value4 = result_feature_direction[feature_fd][well4] +# feature_feature_direction.addValue(well4, value4) + + + + for csv_file2 
in glob.glob(os.path.join(incoming, 'global_siRNA.csv')): + csvf2 = open(csv_file2,'r') + globcsv2 = csv.reader(csvf2, delimiter=',') + globcsv2.next() + result_accuracy_sirna = {} # accuracy_label => {measure_well => accuracy_value} + result_kstest_sirna ={} # kstest => {measure_well => kstest_value} + result_ksdelta_sirna = {} # ksdelta => {measure_well => ksdelta_value} + result_kspvalue_sirna ={} # kspvalue => {measure_well => kspvalue_value} + result_feature_direction_sirna ={}# feature_direction => {measure_well => feature_direction_value} + + for row in globcsv2: + measure_well_sirna = row[0] + control_well_sirna = row[1] + feature_name_sirna = row[4] + accuracy_label_sirna = feature_name_sirna + "_S_ac" + accuracy_value_sirna = row[5] + kstest_sirna = feature_name_sirna + "_S_KSt" + kstest_value_sirna = row[6] + ksdelta_sirna = feature_name_sirna + "_S_KSd" + ksdelta_value_sirna = row[7] + kspvalue_sirna = feature_name_sirna + "_S_KSp" + kspvalue_value_sirna = row[8] + feature_direction_sirna = feature_name_sirna + "_S_dir" + feature_direction_value_sirna = row[9] + + + accuracy_key_sirna = "%s:%s" %(accuracy_label_sirna, control_well_sirna) + kstest_key_sirna = "%s:%s" %(kstest_sirna, control_well_sirna) + ksdelta_key_sirna = "%s:%s" %(ksdelta_sirna, control_well_sirna) + kspvalue_key_sirna ="%s:%s" %(kspvalue_sirna, control_well_sirna) + feature_direction_key_sirna = "%s:%s" %(feature_direction_sirna, control_well_sirna) + + if not accuracy_key_sirna in result_accuracy_sirna: + result_accuracy_sirna[accuracy_key_sirna] = {} + result_accuracy_sirna[accuracy_key_sirna][measure_well_sirna] = accuracy_value_sirna + + + + +# if not kstest_key in result_kstest: +# result_kstest[kstest_key] = {} +# +# result_kstest[kstest_key][measure_well] = kstest_value +# +# +# if not ksdelta_key in result_ksdelta: +# result_ksdelta[ksdelta_key] = {} +# +# result_ksdelta[ksdelta_key][measure_well] = ksdelta_value +# +# if not kspvalue_key in result_kspvalue: +# 
result_kspvalue[kspvalue_key] = {} +# +# result_kspvalue[kspvalue_key][measure_well] = kspvalue_value +# +# +# if not feature_direction_key in result_feature_direction: +# result_feature_direction[feature_direction_key] = {} +# +# result_feature_direction[feature_direction_key][measure_well] = feature_direction_value + + + + for feature_sirna in result_accuracy_sirna: + feature_accuracy_sirna = featuresBuilder.defineFeature(feature_sirna) + for well_sirna in result_accuracy_sirna[feature_sirna]: + value_sirna = result_accuracy_sirna[feature_sirna][well_sirna] + feature_accuracy_sirna.addValue(well_sirna, value_sirna) + +# for feature_kst in result_kstest: +# feature_kstest = featuresBuilder.defineFeature(feature_kst) +# for well2 in result_kstest[feature_kst]: +# value2 = result_kstest[feature_kst][well2] +# feature_kstest.addValue(well2, value2) +# +# +# for feature_ksd in result_ksdelta: +# feature_ksdelta = featuresBuilder.defineFeature(feature_ksd) +# for well1 in result_ksdelta[feature_ksd]: +# value1 = result_ksdelta[feature_ksd][well1] +# feature_ksdelta.addValue(well1, value1) +# +# for feature_ksp in result_kspvalue: +# feature_kspvalue = featuresBuilder.defineFeature(feature_ksp) +# for well3 in result_kspvalue[feature_ksp]: +# value3 = result_kspvalue[feature_ksp][well3] +# feature_kspvalue.addValue(well3, value3) +# +# for feature_fd in result_feature_direction: +# feature_feature_direction= featuresBuilder.defineFeature(feature_fd) +# for well4 in result_feature_direction[feature_fd]: +# value4 = result_feature_direction[feature_fd][well4] +# feature_feature_direction.addValue(well4, value4) + + + config = SimpleFeatureVectorDataConfig() + featuresBuilder = config.featuresBuilder + defineGeneFeatures(featuresBuilder, incoming.getPath()) + analysisDataset = transaction.createNewFeatureVectorDataSet(config, incoming) + + + rawImagesDataSetSample1 = transaction.getDataSet(extractSegmentationDataSetCode(incoming.getPath())).getSample() + 
rawImagesDataSetSample = transaction.getSample('/SINERGIA/' + rawImagesDataSetSample1.getCode()) + +# plateIdentifier = "/SINERGIA/PLATE1-G1-10X" +# test = transaction.getSample("/SINERGIA/PLATE1-G1-10X") +# analysisDataset.setSample(test) + + analysisDataset.setSample(rawImagesDataSetSample) + + search_service = transaction.getSearchService() + + sc = SearchCriteria() + sc.addMatchClause(SearchCriteria.MatchClause.createAttributeMatch(SearchCriteria.MatchClauseAttribute.CODE, extractSegmentationDataSetCode(incoming.getPath()) )); + foundDataSets = search_service.searchForDataSets(sc) + if foundDataSets.size() > 0: + analysisDataset.setParentDatasets([ds.getDataSetCode() for ds in foundDataSets]) + + # store the original file in the dataset. + transaction.moveFile(incoming.getPath(), analysisDataset) + + + + + +######################## Create Feature lists Datasets ########################################### + + config_accA2 = FeatureListDataConfig() + config_accA2.setName("siRNA-based accuracy (reference well: A2)"); + config_accA2.setFeatureList(accuracyA2_sirna_list) + config_accA2.setContainerDataSet(analysisDataset) + transaction.createNewFeatureListDataSet(config_accA2) + +# config_accA1 = FeatureListDataConfig() +# config_accA1.setName("siRNA-based accuracy (reference well: A1)"); +# config_accA1.setFeatureList(accuracyA1_sirna_list) +# config_accA1.setContainerDataSet(analysisDataset) +# transaction.createNewFeatureListDataSet(config_accA1) +# +# config_gene_accA2 = FeatureListDataConfig() +# config_gene_accA2.setName("gene-based accuracy (reference well: A2)"); +# config_gene_accA2.setFeatureList(accuracyA2_gene_list) +# config_gene_accA2.setContainerDataSet(analysisDataset) +# transaction.createNewFeatureListDataSet(config_gene_accA2) +# +# config_gene_accA1 = FeatureListDataConfig() +# config_gene_accA1.setName("gene-based accuracy (reference well: A1)"); +# config_gene_accA1.setFeatureList(accuracyA1_gene_list) +# 
config_gene_accA1.setContainerDataSet(analysisDataset) +# transaction.createNewFeatureListDataSet(config_gene_accA1) +# +# +# +# config_KStestA2 = FeatureListDataConfig() +# config_KStestA2.setName("KStest (reference well: A2)"); +# config_KStestA2.setFeatureList(KStestA2_sirna_list) +# config_KStestA2.setContainerDataSet(analysisDataset) +# transaction.createNewFeatureListDataSet(config_KStestA2) +# +# config_KStestA1 = FeatureListDataConfig() +# config_KStestA1.setName("KStest (reference well: A1)"); +# config_KStestA1.setFeatureList(KStestA1_sirna_list) +# config_KStestA1.setContainerDataSet(analysisDataset) +# transaction.createNewFeatureListDataSet(config_KStestA1) +# +# +# config_KSdeltaA2 = FeatureListDataConfig() +# config_KSdeltaA2.setName("KSdelta (reference well: A2)"); +# config_KSdeltaA2.setFeatureList(KSdeltaA2_sirna_list) +# config_KSdeltaA2.setContainerDataSet(analysisDataset) +# transaction.createNewFeatureListDataSet(config_KSdeltaA2) +# +# config_KSdeltaA1 = FeatureListDataConfig() +# config_KSdeltaA1.setName("KSdelta (reference well: A1)"); +# config_KSdeltaA1.setFeatureList(accuracyA1_sirna_list) +# config_KSdeltaA1.setContainerDataSet(analysisDataset) +# transaction.createNewFeatureListDataSet(config_KSdeltaA1) +# +# +# config_KSpvalueA2 = FeatureListDataConfig() +# config_KSpvalueA2.setName("KSpvalue (reference well: A2)"); +# config_KSpvalueA2.setFeatureList(KSpvalueA2_sirna_list) +# config_KSpvalueA2.setContainerDataSet(analysisDataset) +# transaction.createNewFeatureListDataSet(config_KSpvalueA2) +# +# config_KSpvalueA1 = FeatureListDataConfig() +# config_KSpvalueA1.setName("KSpvalue (reference well: A1)"); +# config_KSpvalueA1.setFeatureList(KSpvalueA1_sirna_list) +# config_KSpvalueA1.setContainerDataSet(analysisDataset) +# transaction.createNewFeatureListDataSet(config_KSpvalueA1) +# +# config_feature_directionA2 = FeatureListDataConfig() +# config_feature_directionA2.setName("Direction (reference well: A2)"); +# 
config_feature_directionA2.setFeatureList(feature_directionA2_sirna_list) +# config_feature_directionA2.setContainerDataSet(analysisDataset) +# transaction.createNewFeatureListDataSet(config_feature_directionA2) +# +# config_feature_directionA1 = FeatureListDataConfig() +# config_feature_directionA1.setName("Direction (reference well: A1)"); +# config_feature_directionA1.setFeatureList(feature_directionA1_sirna_list) +# config_feature_directionA1.setContainerDataSet(analysisDataset) +# transaction.createNewFeatureListDataSet(config_feature_directionA1) + + + diff --git a/screening/sinergia/sinergia-dropbox.py b/screening/sinergia/sinergia-dropbox.py new file mode 100644 index 0000000000000000000000000000000000000000..0f42928f9b1da3544317d897919ead4b3ee55c6e --- /dev/null +++ b/screening/sinergia/sinergia-dropbox.py @@ -0,0 +1,276 @@ +#! /usr/bin/env python +""" + The implementation of the Sinergia dropbox. + + Sinergia data is uploaded in a format where many files are provided in a single folder containing images and a metadata file (with the ".nd" extension). The dropbox implementation takes this format, extracts metadata and converts the file structure to a different one that is more manageable. The resulting file structure contains diretories for each well, containing directories for each channel. The images are located inside the chanel directory. 
+""" + +import os +import glob +import re +import time +import shutil + +from ch.systemsx.cisd.openbis.dss.etl.dto.api.v1 import SimpleImageDataConfig +from ch.systemsx.cisd.openbis.dss.etl.dto.api.v1 import ImageMetadata +from ch.systemsx.cisd.hdf5.h5ar import HDF5ArchiverFactory +from java.io import File + + +ORIGINAL_THUMBNAILS_FOLDER="thumbnails-original" +Uncomplete_Experiments = "Uncomplete_Experiments" + +class SinergiaPlate: + def __init__(self): + self.stages = [] + +class SinergiaStage: + def __init__(self): + self.channels = [] + self.well = "" + # dummy value generated underneath + self.tileNumber = 0 + self.siteString = 0 + +class SinergiaChannel: + def __init__(self): + self.name = [] + self.timepoints = [] + +def move_file_to_dir(srcFile, destDir): + destFile = destDir + "/" + os.path.basename(srcFile) + if srcFile == destFile: + return + if os.path.exists(destFile): + raise Exception("Cowardly refusing to override existing file %(destFile)s with source file %(srcFile)s." % vars()) + os.rename(srcFile, destFile) + + +def process_stage(plate, files): + """Process all the files for one stage of a plate, returning the stage""" + stage = SinergiaStage() + + red = SinergiaChannel() + red.name = "red" + redfiles = filter(lambda x: re.match('.*red_.*', x), files) #find all files for the red channel + red.timepoints = redfiles + stage.channels.append(red) + + green = SinergiaChannel() + green.name = "green" + greenfiles = filter(lambda x: re.match('.*green_.*', x), files) #find all files for the green channel + green.timepoints = greenfiles + stage.channels.append(green) + + return stage + +def parse_plate_metadata(incomingPath, pattern_start, sinergia_plate): + dummyTileCounter = 0; + for ndFileName in glob.glob( os.path.join(incomingPath, pattern_start + '.nd')): + ndfile = open(ndFileName, "r") + lineIndex = 0 + for line in ndfile: + lineIndex = lineIndex + 1 + if re.match('"Stage', line): + token_list = re.split(r"[\"\:\\\n\,]",line) + token_list = [ 
item.strip() for item in token_list ] + token_list = filter(lambda x: len(x) > 0, token_list) + + stageString = token_list[0] + + + + try: + stageIdx = int( stageString[len("Stage"):] ) - 1 + except ValueError: + raise Exception("Cannot parse stage number from '%(stageString)s: %(ndFileName)s, line %(lineIndex)i'" % vars()) + + try: + stage = sinergia_plate.stages[stageIdx] + except IndexError: + raise Exception("Invalid stage number '%(stageString)s: %(ndFileName)s, line %(lineIndex)i'. No corresponding TIF file was found." % vars()) + + stage.well = token_list[1] + stage.siteString = token_list[2] + stage.tileNumber = (dummyTileCounter % 10) + 1 + dummyTileCounter = dummyTileCounter + 1 + + + ndfile.close() + +def move_to_original_thumbnail_folder(incomingPath, thumbFiles): + thumbFolder = incomingPath + "/" + ORIGINAL_THUMBNAILS_FOLDER + if not os.path.exists(thumbFolder): + os.makedirs(thumbFolder) + for thumbFile in thumbFiles: + move_file_to_dir(thumbFile, thumbFolder) + + + +def create_glob_pattern_start(incomingPath): + """ + Return the Experiment{*} part of a pattern used by functions that need + to process only the valid experiment + """ + for ndfile in glob.glob(os.path.join(incomingPath, '*.nd')): + (incomingPath, file) = os.path.split(ndfile) + (filename, extension) = os.path.splitext(file) + pattern = filename + '*.TIF' + match_count = len(glob.glob1(incomingPath, pattern)) + # if match_count < 29000: + if match_count < 46560: + unwanted = incomingPath+ "/../../" + Uncomplete_Experiments + if not os.path.exists(unwanted): + os.makedirs(unwanted) + shutil.move(ndfile, unwanted) + #continue + else: + ret = filename + return ret + + +def is_thumbnail(fileName): + return "_thumb_" in fileName + +def remove_uncomplete_experiments(incomingPath): + pattern_start = create_glob_pattern_start(incomingPath) + for tif in glob.glob(os.path.join(incomingPath, '*.TIF')): + (incomingPath, file) = os.path.split(tif) + (filename, extension) = os.path.splitext(file) + 
token_list = re.split('_', file) + for token in token_list: + if re.match('Exp', token): + Experiment = token + if Experiment != pattern_start: + unwanted = incomingPath+"/../../" + Uncomplete_Experiments + if not os.path.exists(unwanted): + os.makedirs(unwanted) + shutil.move(tif, unwanted) + shutil.rmtree(unwanted) + +remove_uncomplete_experiments(incoming.getPath()) + + + +def process_plate(incomingPath): + """Look at all the files in the incoming path and group them into plates""" + sinergia_plate = SinergiaPlate() + remove_uncomplete_experiments(incomingPath) + + for stage_number in range(1, 241): + pattern_start = create_glob_pattern_start(incomingPath) + sinergia_plate.pattern_start = pattern_start + pattern = pattern_start + "*_s" + str(stage_number) +'_*.TIF' + files = glob.glob(os.path.join(incomingPath, pattern)) + thumbFiles = filter(is_thumbnail, files) + imageFiles = filter(lambda x: not is_thumbnail(x), files) + stage = process_stage(sinergia_plate, imageFiles) + move_to_original_thumbnail_folder(incomingPath, thumbFiles) + sinergia_plate.stages.append(stage) + + parse_plate_metadata(incomingPath, pattern_start, sinergia_plate) + + + return sinergia_plate + +def get_directory_for_image_file(stageIdx, channelName): + fullPath = incoming.getPath() + "/" + str(stageIdx) + "/" + channelName + if not os.path.exists(fullPath): + os.makedirs(fullPath) + return fullPath + +def transform_plate_file_structure(plate): + for idx, stage in enumerate(plate.stages): + for channel in stage.channels: + for imageFile in channel.timepoints: + directory = get_directory_for_image_file(idx + 1, channel.name) + move_file_to_dir(imageFile, directory) + +class SinergiaImageDataSetConfig(SimpleImageDataConfig): + def __init__(self, sinergia_plate): + self.sinergia_plate = sinergia_plate + + def extractImageMetadata(self, imagePath): + (dirName, filename) = os.path.split(imagePath) + (basename, extension) = os.path.splitext(filename) + + if is_thumbnail(basename): + return 
None + if not basename.startswith(self.sinergia_plate.pattern_start): + return None + + image_tokens = ImageMetadata() + + token_dict = {} + for token in basename.split("_"): + token_dict[token[:1]] = token[1:] + + channelName = token_dict["w"] + if "1LED green" == channelName: + channelCode = "LIFE ACT-GFP" + elif "2LED red" == channelName: + channelCode = "NLS-mCHERRY" + else: + channelCode = channelName + image_tokens.channelCode = channelCode + + stageIdx = int(token_dict["s"]) - 1 + image_tokens.well = sinergia_plate.stages[stageIdx].well + image_tokens.tileNumber = sinergia_plate.stages[stageIdx].tileNumber + image_tokens.timepoint = int(token_dict["t"]) + + + return image_tokens + +def get_or_create_bis_plate(tr, plateName,incomingPath): + spaceCode = "SINERGIA" + #plateIdentifier = "/" + spaceCode + "/" + plateName + "-REP" + plateIdentifier = "/" + spaceCode + "/" + plateName + "-10x" + plate = tr.getSample(plateIdentifier) + (pathName, dirName) = os.path.split(incomingPath) + + if not plate: + token_list = re.split('-', dirName) + for token in token_list: + if re.match('G', token): + groupNum=token[1:] + groupIdentifier = "/" + spaceCode + "/SIRNA_TIMELAPSES_10X/GROUP-" + groupNum + plate = tr.createNewSample(plateIdentifier, 'PLATE') + plate.setPropertyValue("$PLATE_GEOMETRY", "24_WELLS_4X6") + exp = tr.getExperiment(groupIdentifier) + if not exp: + exp = tr.createNewExperiment(groupIdentifier, 'SIRNA_HCS') + exp.setPropertyValue("DESCRIPTION", "siRNA screening: timelapses") + #exp = tr.getExperiment("/SINERGIA/SIRNA_MOVIES/GROUP-1") + plate.setExperiment(exp) + return plate; + + +def archive_thumbnails(incomingPath): + thumbnailDir = os.path.join(incomingPath, "thumbnails-original") + thumbnailContainer = os.path.join(incomingPath, "thumbnails-original.h5ar") + if os.path.isdir(thumbnailDir): + archiver = HDF5ArchiverFactory.open(thumbnailContainer) + archiver.archiveFromFilesystem(File(thumbnailDir)) + archiver.close() + shutil.rmtree(thumbnailDir) + + 
# ---------------------------------------------------------------------------
# Dropbox entry point.  'incoming', 'service' and 'factory' are injected by
# the openBIS dropbox framework.
# ---------------------------------------------------------------------------

# Reorganise the incoming folder into per-stage/per-channel directories and
# collect the plate metadata.  'sinergia_plate' must stay module-global: the
# metadata-extraction code reads it.
sinergia_plate = process_plate(incoming.getPath())
transform_plate_file_structure(sinergia_plate)


tr = service.transaction(incoming, factory)

if incoming.isDirectory():
    # Archive the original thumbnails before registering the data set.
    archive_thumbnails(incoming.getPath())

    image_config = SinergiaImageDataSetConfig(sinergia_plate)
    image_config.setImageLibrary("IJ")
    image_config.setRawImageDatasetType()
    image_config.setGenerateImageRepresentationsUsingImageResolutions(["64x52"])
    registration_details = factory.createImageRegistrationDetails(image_config, incoming)

    # Attach the new image data set to its (possibly freshly created) plate
    # and hand the files over to openBIS.
    bis_plate = get_or_create_bis_plate(tr, incoming.getName().upper(), incoming.getPath())
    image_dataset = tr.createNewDataSet(registration_details)
    image_dataset.setSample(bis_plate)
    tr.moveFile(incoming.getPath(), image_dataset)