From 17576eeae67306aba75d881e59575c46eff5f4e6 Mon Sep 17 00:00:00 2001 From: tpylak <tpylak> Date: Wed, 16 Mar 2011 09:40:18 +0000 Subject: [PATCH] LMS-2069 iBrain2 dropboxes minor refactoring: allow python dropboxes to read image analysis result CSV files using the existing Java code SVN: 20359 --- .../migration/MigrationStepFrom003To004.java | 284 +---------------- .../CsvFeatureVectorMigrator.java | 24 +- .../featurevector/CsvFeatureVectorParser.java | 292 ++++++++++++++++++ .../CsvToCanonicalFeatureVector.java | 258 +--------------- .../FeatureVectorStorageProcessor.java | 42 +-- .../etl/jython/JythonPlateDataSetHandler.java | 40 ++- .../CsvToCanonicalFeatureVectorTest.java | 35 ++- 7 files changed, 384 insertions(+), 591 deletions(-) create mode 100644 screening/source/java/ch/systemsx/cisd/openbis/dss/etl/featurevector/CsvFeatureVectorParser.java diff --git a/screening/source/java/ch/systemsx/cisd/openbis/dss/etl/dataaccess/migration/MigrationStepFrom003To004.java b/screening/source/java/ch/systemsx/cisd/openbis/dss/etl/dataaccess/migration/MigrationStepFrom003To004.java index 0e5e35f54f6..581be0964f8 100644 --- a/screening/source/java/ch/systemsx/cisd/openbis/dss/etl/dataaccess/migration/MigrationStepFrom003To004.java +++ b/screening/source/java/ch/systemsx/cisd/openbis/dss/etl/dataaccess/migration/MigrationStepFrom003To004.java @@ -16,305 +16,27 @@ package ch.systemsx.cisd.openbis.dss.etl.dataaccess.migration; -import java.io.File; -import java.io.IOException; -import java.sql.ResultSet; -import java.sql.SQLException; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.Map.Entry; - import javax.sql.DataSource; -import net.lemnik.eodsql.QueryTool; - -import org.apache.log4j.Logger; import org.springframework.dao.DataAccessException; -import org.springframework.jdbc.core.simple.ParameterizedRowMapper; import org.springframework.jdbc.core.simple.SimpleJdbcTemplate; -import 
ch.systemsx.cisd.common.exceptions.EnvironmentFailureException; -import ch.systemsx.cisd.common.logging.LogCategory; -import ch.systemsx.cisd.common.logging.LogFactory; -import ch.systemsx.cisd.common.utilities.AbstractHashable; -import ch.systemsx.cisd.common.utilities.ExtendedProperties; import ch.systemsx.cisd.dbmigration.java.IMigrationStep; -import ch.systemsx.cisd.etlserver.plugins.ChainedDataSetMigrationTask; -import ch.systemsx.cisd.openbis.dss.etl.dataaccess.IImagingQueryDAO; -import ch.systemsx.cisd.openbis.dss.etl.featurevector.CanonicalFeatureVector; -import ch.systemsx.cisd.openbis.dss.etl.featurevector.CsvToCanonicalFeatureVector; -import ch.systemsx.cisd.openbis.dss.etl.featurevector.FeatureVectorUploader; -import ch.systemsx.cisd.openbis.dss.etl.featurevector.CsvToCanonicalFeatureVector.CsvToCanonicalFeatureVectorConfiguration; -import ch.systemsx.cisd.openbis.dss.generic.server.plugins.tasks.DatasetFileLines; -import ch.systemsx.cisd.openbis.dss.generic.shared.utils.DatasetLocationUtil; -import ch.systemsx.cisd.openbis.dss.generic.shared.utils.DssPropertyParametersUtil; -import ch.systemsx.cisd.openbis.plugin.screening.shared.basic.dto.ScreeningConstants; -import ch.systemsx.cisd.utils.CsvFileReaderHelper; -import ch.systemsx.cisd.utils.CsvFileReaderHelper.DefaultCsvFileReaderConfiguration; -import ch.systemsx.cisd.utils.CsvFileReaderHelper.ICsvFileReaderConfiguration; /** + * Since S102 it is no longer possible to migrate from database older than version 4 to a newest + * version directly. One would have to migrate to a version S101 first in such a case. + * <p> * Reads all feature vector files and reuploads them to the imaging database. 
* * @author Tomasz Pylak */ public class MigrationStepFrom003To004 implements IMigrationStep { - private static final Logger operationLog = - LogFactory.getLogger(LogCategory.OPERATION, MigrationStepFrom003To004.class); - - // name of the columns which contain row and column info - private static final String ROW_COLNAME = "row"; - - private static final String COLUMN_COLNAME = "col"; - - public MigrationStepFrom003To004() - { - } - - private static File getStoreRootDir() - { - ExtendedProperties properties = DssPropertyParametersUtil.loadServiceProperties(); - File storeRootDir = DssPropertyParametersUtil.getStoreRootDir(properties); - return storeRootDir; - } - - private static CsvToCanonicalFeatureVectorConfiguration createCsvConfig() - { - return new CsvToCanonicalFeatureVectorConfiguration(ROW_COLNAME, COLUMN_COLNAME); - } - - private final static ParameterizedRowMapper<MigrationDatasetRef> DATASET_ROW_MAPPER = - new ParameterizedRowMapper<MigrationDatasetRef>() - { - public final MigrationDatasetRef mapRow(final ResultSet rs, final int rowNum) - throws SQLException - { - long id = rs.getLong("id"); - String permId = rs.getString("perm_id"); - int plateWidth = rs.getInt("plate_width"); - int plateHeight = rs.getInt("plate_height"); - return new MigrationDatasetRef(id, permId, plateWidth, plateHeight); - - } - }; - - private static class MigrationDatasetRef extends AbstractHashable - { - final private long id; - - final private String permId; - - final private int plateWidth; - - final private int plateHeight; - - public MigrationDatasetRef(long id, String permId, int plateWidth, int plateHeight) - { - this.id = id; - this.permId = permId; - this.plateWidth = plateWidth; - this.plateHeight = plateHeight; - } - - public long getId() - { - return id; - } - - public String getPermId() - { - return permId; - } - - public int getPlateWidth() - { - return plateWidth; - } - - public int getPlateHeight() - { - return plateHeight; - } - } public void 
performPostMigration(SimpleJdbcTemplate jdbc, DataSource dataSource) throws DataAccessException { - IImagingQueryDAO dao = QueryTool.getQuery(dataSource, IImagingQueryDAO.class); - File storeRootDir = getStoreRootDir(); - String dbUUID = tryGetDatabaseInstanceUUID(storeRootDir); - if (dbUUID == null) - { - operationLog.warn("Store is empty - there is nothing to migrate."); - return; - } - List<MigrationDatasetRef> datasets = fetchImagingDatasets(jdbc); - Map<MigrationDatasetRef, DatasetFileLines> fileMap = - createFileMap(datasets, storeRootDir, dbUUID); - boolean ok = migrateDatasets(fileMap, jdbc, dao); - dao.commit(); - dao.close(); - if (ok == false) - { - operationLog.warn("There were some error during feature vector migration!"); - } - } - - private boolean migrateDatasets(Map<MigrationDatasetRef, DatasetFileLines> fileMap, - SimpleJdbcTemplate jdbc, IImagingQueryDAO dao) - { - boolean wholeMigrationOk = true; - for (Entry<MigrationDatasetRef, DatasetFileLines> entry : fileMap.entrySet()) - { - MigrationDatasetRef datasetRef = entry.getKey(); - String permId = datasetRef.getPermId(); - DatasetFileLines featureVectorLines = entry.getValue(); - try - { - operationLog.info("Migrating dataset: " + permId); - migrateDataset(jdbc, dao, datasetRef, featureVectorLines); - } catch (Exception ex) - { - operationLog.error("Cannot migrate dataset " + permId + ": " + ex.getMessage()); - if (ex instanceof IllegalArgumentException == false) - { - ex.printStackTrace(); - } - wholeMigrationOk = false; - } - } - return wholeMigrationOk; - } - - private void migrateDataset(SimpleJdbcTemplate jdbc, IImagingQueryDAO dao, - MigrationDatasetRef datasetRef, DatasetFileLines featureVectorLines) - { - long datasetId = datasetRef.getId(); - List<CanonicalFeatureVector> fvecs = extractFeatureVectors(featureVectorLines, datasetRef); - int deleted = deleteFeatureVectors(datasetId, jdbc); - if (deleted != fvecs.size()) - { - operationLog.error(String.format( - "Dataset techId(%d) had %d 
features, but now it has %d.", datasetId, deleted, - fvecs.size())); - } - uploadFeatureVectors(datasetId, fvecs, dao); - } - - private void uploadFeatureVectors(long datasetId, List<CanonicalFeatureVector> fvecs, - IImagingQueryDAO dao) - { - FeatureVectorUploader.uploadFeatureVectors(dao, fvecs, datasetId); - } - - private List<CanonicalFeatureVector> extractFeatureVectors(DatasetFileLines featureVectorLines, - MigrationDatasetRef datasetRef) - - { - CsvToCanonicalFeatureVectorConfiguration convertorConfig = createCsvConfig(); - return new CsvToCanonicalFeatureVector(featureVectorLines, convertorConfig, datasetRef - .getPlateHeight(), datasetRef.getPlateWidth()).convert(); - } - - private static DatasetFileLines getDatasetFileLines(File file, final char separator) - throws IOException - { - ICsvFileReaderConfiguration configuration = new DefaultCsvFileReaderConfiguration() - { - @Override - public char getColumnDelimiter() - { - return separator; - } - }; - return CsvFileReaderHelper.getDatasetFileLines(file, configuration); - } - - private int deleteFeatureVectors(long datasetId, SimpleJdbcTemplate jdbc) - { - return jdbc.update("delete from feature_defs defs where defs.ds_id = ?", datasetId); - } - - private List<MigrationDatasetRef> fetchImagingDatasets(SimpleJdbcTemplate simpleJdbcTemplate) - { - return simpleJdbcTemplate.query( - "select distinct d.id, d.perm_id, c.spots_width plate_width, c.spots_height plate_height " - + " from feature_defs defs, data_sets d, containers c " - + " where d.id = defs.ds_id and c.id = d.cont_id;", DATASET_ROW_MAPPER); - } - - private String tryGetDatabaseInstanceUUID(File storeRootDir) - { - File dbInstanceDir = ChainedDataSetMigrationTask.tryGetDatabaseInstanceDir(storeRootDir); - if (dbInstanceDir == null) - { - return null; - } else - { - return dbInstanceDir.getName(); - } - } - - private static Map<MigrationDatasetRef, DatasetFileLines/* lines with feature vectors */> createFileMap( - List<MigrationDatasetRef> datasets, 
File storeRootDir, String dbUUID) - { - Map<MigrationDatasetRef, DatasetFileLines> fileMap = - new HashMap<MigrationDatasetRef, DatasetFileLines>(); - for (MigrationDatasetRef dataset : datasets) - { - String permId = dataset.getPermId(); - File datasetDir = - DatasetLocationUtil.getDatasetLocationPath(storeRootDir, permId, "1", dbUUID); - DatasetFileLines featureVectorLines = tryFindFeatureVectorsFile(datasetDir); - if (featureVectorLines != null) - { - fileMap.put(dataset, featureVectorLines); - } - } - return fileMap; - } - - private static DatasetFileLines tryFindFeatureVectorsFile(File datasetDir) - { - File origDir = new File(datasetDir, ScreeningConstants.ORIGINAL_DATA_DIR); - File[] datasetFiles = origDir.listFiles(); - if (datasetFiles == null || datasetFiles.length == 0) - { - operationLog.warn("Empty dataset dir: " + datasetDir); - return null; - } - - for (File datasetFile : datasetFiles) - { - DatasetFileLines fileLines = tryReadFeatureVectors(datasetFile, ','); - if (fileLines == null || fileLines.getHeaderLabels().length <= 2) - { - fileLines = tryReadFeatureVectors(datasetFile, ';'); - } - if (fileLines != null && fileLines.getHeaderLabels().length > 2) - { - return fileLines; - } - } - throw new EnvironmentFailureException( - "Cannot find the file with feature vectors for the dataset. " - + "Delete this dataset from openBIS and restart the server to perform migration again. Dataset: " - + datasetDir.getName() + ". 
Directory: " + datasetDir); - } - - private static DatasetFileLines tryReadFeatureVectors(File datasetFile, char separator) - { - try - { - return getDatasetFileLines(datasetFile, separator); - } catch (Exception ex) - { - operationLog.warn("Cannot read the file or file has the wrong format: " + datasetFile - + ": " + ex.getMessage()); - return null; - } } public void performPreMigration(SimpleJdbcTemplate simpleJdbcTemplate, DataSource dataSource) diff --git a/screening/source/java/ch/systemsx/cisd/openbis/dss/etl/featurevector/CsvFeatureVectorMigrator.java b/screening/source/java/ch/systemsx/cisd/openbis/dss/etl/featurevector/CsvFeatureVectorMigrator.java index 06484e4911a..94b201d5aff 100644 --- a/screening/source/java/ch/systemsx/cisd/openbis/dss/etl/featurevector/CsvFeatureVectorMigrator.java +++ b/screening/source/java/ch/systemsx/cisd/openbis/dss/etl/featurevector/CsvFeatureVectorMigrator.java @@ -18,15 +18,15 @@ package ch.systemsx.cisd.openbis.dss.etl.featurevector; import java.io.File; import java.io.IOException; -import java.util.ArrayList; import java.util.HashMap; +import java.util.List; import java.util.Properties; import ch.systemsx.cisd.base.exceptions.IOExceptionUnchecked; import ch.systemsx.cisd.etlserver.DefaultStorageProcessor; import ch.systemsx.cisd.openbis.dss.etl.HCSContainerDatasetInfo; import ch.systemsx.cisd.openbis.dss.etl.dataaccess.IImagingQueryDAO; -import ch.systemsx.cisd.openbis.dss.etl.featurevector.CsvToCanonicalFeatureVector.CsvToCanonicalFeatureVectorConfiguration; +import ch.systemsx.cisd.openbis.dss.etl.featurevector.CsvFeatureVectorParser.CsvFeatureVectorParserConfiguration; import ch.systemsx.cisd.openbis.dss.generic.server.plugins.tasks.DatasetFileLines; import ch.systemsx.cisd.openbis.generic.shared.dto.SimpleDataSetInformationDTO; import ch.systemsx.cisd.utils.CsvFileReaderHelper; @@ -38,7 +38,7 @@ public class CsvFeatureVectorMigrator extends AbstractFeatureVectorMigrator { protected final 
FeatureVectorStorageProcessorConfiguration configuration; - protected final CsvToCanonicalFeatureVectorConfiguration convertorConfig; + protected final CsvFeatureVectorParserConfiguration convertorConfig; /** * @param properties @@ -48,8 +48,7 @@ public class CsvFeatureVectorMigrator extends AbstractFeatureVectorMigrator super(properties); this.configuration = new FeatureVectorStorageProcessorConfiguration(properties); - convertorConfig = - new CsvToCanonicalFeatureVectorConfiguration(configuration); + convertorConfig = new CsvFeatureVectorParserConfiguration(configuration); } @Override @@ -117,12 +116,11 @@ public class CsvFeatureVectorMigrator extends AbstractFeatureVectorMigrator { private final FeatureVectorStorageProcessorConfiguration configuration; - private final CsvToCanonicalFeatureVectorConfiguration convertorConfig; + private final CsvFeatureVectorParserConfiguration convertorConfig; - protected ImporterCsv(IImagingQueryDAO dao, - HCSContainerDatasetInfo screeningDataSetInfo, File fileToMigrate, - FeatureVectorStorageProcessorConfiguration configuration, - CsvToCanonicalFeatureVectorConfiguration convertorConfig) + protected ImporterCsv(IImagingQueryDAO dao, HCSContainerDatasetInfo screeningDataSetInfo, + File fileToMigrate, FeatureVectorStorageProcessorConfiguration configuration, + CsvFeatureVectorParserConfiguration convertorConfig) { super(dao, screeningDataSetInfo, fileToMigrate); this.configuration = configuration; @@ -138,9 +136,9 @@ public class CsvFeatureVectorMigrator extends AbstractFeatureVectorMigrator fileLines = getDatasetFileLines(fileToMigrate); CsvToCanonicalFeatureVector convertor = new CsvToCanonicalFeatureVector(fileLines, convertorConfig, - screeningDataSetInfo.getContainerRows(), screeningDataSetInfo - .getContainerColumns()); - ArrayList<CanonicalFeatureVector> fvecs = convertor.convert(); + screeningDataSetInfo.getContainerRows(), + screeningDataSetInfo.getContainerColumns()); + List<CanonicalFeatureVector> fvecs = 
convertor.convert(); FeatureVectorUploader uploader = new FeatureVectorUploader(dao, screeningDataSetInfo); diff --git a/screening/source/java/ch/systemsx/cisd/openbis/dss/etl/featurevector/CsvFeatureVectorParser.java b/screening/source/java/ch/systemsx/cisd/openbis/dss/etl/featurevector/CsvFeatureVectorParser.java new file mode 100644 index 00000000000..44611286369 --- /dev/null +++ b/screening/source/java/ch/systemsx/cisd/openbis/dss/etl/featurevector/CsvFeatureVectorParser.java @@ -0,0 +1,292 @@ +/* + * Copyright 2010 ETH Zuerich, CISD + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package ch.systemsx.cisd.openbis.dss.etl.featurevector; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.Properties; +import java.util.Set; + +import ch.systemsx.cisd.common.shared.basic.utils.StringUtils; +import ch.systemsx.cisd.common.utilities.Counters; +import ch.systemsx.cisd.openbis.dss.etl.dto.api.impl.FeatureDefinition; +import ch.systemsx.cisd.openbis.dss.generic.server.plugins.tasks.DatasetFileLines; +import ch.systemsx.cisd.openbis.dss.generic.shared.utils.CodeAndLabelUtil; +import ch.systemsx.cisd.openbis.generic.shared.basic.dto.CodeAndLabel; +import ch.systemsx.cisd.openbis.plugin.screening.shared.basic.dto.WellLocation; +import ch.systemsx.cisd.openbis.plugin.screening.shared.imaging.dataaccess.ImgFeatureDefDTO; +import ch.systemsx.cisd.utils.CsvFileReaderHelper; + +/** + * Converts feature vectors from CSV files to {@link FeatureDefinition} objects. + * + * @author Chandrasekhar Ramakrishnan + * @author Tomasz Pylak + */ +public class CsvFeatureVectorParser +{ + public static List<FeatureDefinition> parse(File dataSet, Properties properties) + throws IOException + { + FeatureVectorStorageProcessorConfiguration storageProcessorConfiguration = + new FeatureVectorStorageProcessorConfiguration(properties); + return parse(dataSet, storageProcessorConfiguration); + } + + public static List<FeatureDefinition> parse(File dataSet, + FeatureVectorStorageProcessorConfiguration configuration) throws IOException + { + CsvFeatureVectorParserConfiguration convertorConfig = + new CsvFeatureVectorParserConfiguration(configuration); + DatasetFileLines datasetFileLines = + CsvFileReaderHelper.getDatasetFileLines(dataSet, configuration); + CsvFeatureVectorParser parser = + new CsvFeatureVectorParser(datasetFileLines, convertorConfig); + return parser.parse(); + } + + public static class CsvFeatureVectorParserConfiguration + { + private final String 
wellRowColumn; + + private final String wellColumnColumn; + + private final boolean isSplit; + + private final Set<String> columnsToBeIgnored; + + public CsvFeatureVectorParserConfiguration(FeatureVectorStorageProcessorConfiguration config) + { + this(config.getWellRow(), config.getWellColumn(), config.getColumnsToBeIgnored()); + } + + public CsvFeatureVectorParserConfiguration(String wellRow, String wellColumn) + { + this(wellRow, wellColumn, Collections.<String> emptySet()); + } + + public CsvFeatureVectorParserConfiguration(String wellRow, String wellColumn, + Set<String> columnsToBeIgnored) + { + this.wellRowColumn = wellRow; + this.wellColumnColumn = wellColumn; + this.columnsToBeIgnored = columnsToBeIgnored; + + isSplit = (false == wellRow.equals(wellColumn)); + } + + public String getWellRowColumn() + { + return wellRowColumn; + } + + public String getWellColumnColumn() + { + return wellColumnColumn; + } + + public boolean isSplit() + { + return isSplit; + } + + public boolean shouldColumnBeIgnored(String column) + { + return columnsToBeIgnored.contains(column); + } + } + + private final CsvFeatureVectorParserConfiguration configuration; + + private final String[] header; + + private final List<String[]> lines; + + private final ArrayList<FeatureColumn> columns = new ArrayList<FeatureColumn>(); + + // Will be initialized during conversion + private int xColumn = -1; + + private int yColumn = -1; + + private int maxRowFound = 0; + + private int maxColFound = 0; + + public CsvFeatureVectorParser(DatasetFileLines fileLines, + CsvFeatureVectorParserConfiguration config) + { + this.configuration = config; + this.header = fileLines.getHeaderLabels(); + this.lines = fileLines.getDataLines(); + } + + public List<FeatureDefinition> parse() + { + initializeColumns(); + readLines(); + + return convertColumnsToFeatureDefinitions(); + } + + private List<FeatureDefinition> convertColumnsToFeatureDefinitions() + { + List<FeatureDefinition> result = new 
ArrayList<FeatureDefinition>(); + Counters<String> counters = new Counters<String>(); + for (FeatureColumn column : columns) + { + if ((true == column.isWellName) || column.isEmpty()) + { + continue; + } + FeatureDefinition featureVector = convertColumnToFeatureDefinition(column, counters); + result.add(featureVector); + } + + return result; + } + + private FeatureDefinition convertColumnToFeatureDefinition(FeatureColumn column, + Counters<String> counters) + { + CodeAndLabel codeAndTitle = CodeAndLabelUtil.create(column.name); + ImgFeatureDefDTO featureDef = new ImgFeatureDefDTO(); + featureDef.setLabel(codeAndTitle.getLabel()); + featureDef.setDescription(codeAndTitle.getLabel()); + String code = codeAndTitle.getCode(); + int count = counters.count(code); + featureDef.setCode(count == 1 ? code : code + count); + + return column.getFeatureDefinition(featureDef); + } + + private void readLines() + { + for (String[] line : lines) + { + readLine(line); + } + } + + private void readLine(String[] line) + { + final WellLocation well = readWellLocationFromLine(line); + for (FeatureColumn column : columns) + { + if (true == column.isWellName) + { + continue; + } + String columnValue = line[column.index]; + if (StringUtils.isBlank(columnValue) == false) + { + column.addValue(well, columnValue); + } + } + + if (well.getRow() > maxRowFound) + { + maxRowFound = well.getRow(); + } + + if (well.getColumn() > maxColFound) + { + maxColFound = well.getColumn(); + } + } + + private WellLocation readWellLocationFromLine(String[] line) + { + if (configuration.isSplit()) + { + String rowString = line[xColumn]; + String colString = line[yColumn]; + return WellLocation.parseLocationStr(rowString, colString); + } else + { + return WellLocation.parseLocationStr(line[xColumn]); + } + } + + private void initializeColumns() + { + for (int i = 0; i < header.length; ++i) + { + String headerName = header[i]; + boolean isWellName = true; + if (configuration.getWellRowColumn().equals(headerName)) 
+ { + xColumn = i; + } else if (configuration.getWellColumnColumn().equals(headerName)) + { + yColumn = i; + } else if (configuration.shouldColumnBeIgnored(headerName) == false) + { + isWellName = false; + } + FeatureColumn featureColumn = new FeatureColumn(i, headerName, isWellName); + columns.add(featureColumn); + } + + if (false == configuration.isSplit()) + { + yColumn = xColumn; + } + + if (xColumn < 0 || yColumn < 0) + { + throw new IllegalArgumentException("Could not parse data set"); + } + } + + private static class FeatureColumn + { + private final int index; + + private final String name; + + private final boolean isWellName; + + private final FeatureValuesMap values; + + private FeatureColumn(int index, String name, boolean isWellName) + { + this.index = index; + this.name = name; + this.isWellName = isWellName; + values = new FeatureValuesMap(0., 0.); + } + + public void addValue(WellLocation well, String columnValue) + { + values.addValue(columnValue, well); + } + + public FeatureDefinition getFeatureDefinition(ImgFeatureDefDTO featureDef) + { + return new FeatureDefinition(featureDef, values); + } + + public boolean isEmpty() + { + return values.isEmpty(); + } + } +} diff --git a/screening/source/java/ch/systemsx/cisd/openbis/dss/etl/featurevector/CsvToCanonicalFeatureVector.java b/screening/source/java/ch/systemsx/cisd/openbis/dss/etl/featurevector/CsvToCanonicalFeatureVector.java index 9aacfc2e6da..2d1265281b6 100644 --- a/screening/source/java/ch/systemsx/cisd/openbis/dss/etl/featurevector/CsvToCanonicalFeatureVector.java +++ b/screening/source/java/ch/systemsx/cisd/openbis/dss/etl/featurevector/CsvToCanonicalFeatureVector.java @@ -17,19 +17,12 @@ package ch.systemsx.cisd.openbis.dss.etl.featurevector; import java.util.ArrayList; -import java.util.Collections; import java.util.List; -import java.util.Set; -import ch.systemsx.cisd.common.shared.basic.utils.StringUtils; -import ch.systemsx.cisd.common.utilities.Counters; import 
ch.systemsx.cisd.openbis.dss.etl.dto.api.impl.FeatureDefinition; +import ch.systemsx.cisd.openbis.dss.etl.featurevector.CsvFeatureVectorParser.CsvFeatureVectorParserConfiguration; import ch.systemsx.cisd.openbis.dss.generic.server.plugins.tasks.DatasetFileLines; -import ch.systemsx.cisd.openbis.dss.generic.shared.utils.CodeAndLabelUtil; -import ch.systemsx.cisd.openbis.generic.shared.basic.dto.CodeAndLabel; import ch.systemsx.cisd.openbis.plugin.screening.shared.api.v1.dto.Geometry; -import ch.systemsx.cisd.openbis.plugin.screening.shared.basic.dto.WellLocation; -import ch.systemsx.cisd.openbis.plugin.screening.shared.imaging.dataaccess.ImgFeatureDefDTO; /** * Converts feature vectors from CSV files to CanonicaFeatureVector objects. @@ -38,259 +31,32 @@ import ch.systemsx.cisd.openbis.plugin.screening.shared.imaging.dataaccess.ImgFe */ public class CsvToCanonicalFeatureVector { - public static class CsvToCanonicalFeatureVectorConfiguration - { - private final String wellRowColumn; - - private final String wellColumnColumn; - - private final boolean isSplit; - - private final Set<String> columnsToBeIgnored; - - public CsvToCanonicalFeatureVectorConfiguration( - FeatureVectorStorageProcessorConfiguration config) - { - this(config.getWellRow(), config.getWellColumn(), config.getColumnsToBeIgnored()); - } - - public CsvToCanonicalFeatureVectorConfiguration(String wellRow, String wellColumn) - { - this(wellRow, wellColumn, Collections.<String> emptySet()); - } - - public CsvToCanonicalFeatureVectorConfiguration(String wellRow, String wellColumn, - Set<String> columnsToBeIgnored) - { - this.wellRowColumn = wellRow; - this.wellColumnColumn = wellColumn; - this.columnsToBeIgnored = columnsToBeIgnored; - - isSplit = (false == wellRow.equals(wellColumn)); - } - - public String getWellRowColumn() - { - return wellRowColumn; - } - - public String getWellColumnColumn() - { - return wellColumnColumn; - } - - public boolean isSplit() - { - return isSplit; - } - - public boolean 
shouldColumnBeIgnored(String column) - { - return columnsToBeIgnored.contains(column); - } - } - - private final CsvToCanonicalFeatureVectorConfiguration configuration; - - private final String[] header; - - private final List<String[]> lines; - - private final ArrayList<FeatureColumn> columns = new ArrayList<FeatureColumn>(); - - // Will be initialized during conversion - private int xColumn = -1; + private final CsvFeatureVectorParser parser; - private int yColumn = -1; - - private int maxRowFound = 0; - - private int maxColFound = 0; - - private final int maxPlateGeometryRow; - - private final int maxPlateGeometryCol; + private final Geometry plateGeometry; public CsvToCanonicalFeatureVector(DatasetFileLines fileLines, - CsvToCanonicalFeatureVectorConfiguration config, Geometry plateGeometry) + CsvFeatureVectorParserConfiguration config, Geometry plateGeometry) { this(fileLines, config, plateGeometry.getNumberOfRows(), plateGeometry.getNumberOfColumns()); } public CsvToCanonicalFeatureVector(DatasetFileLines fileLines, - CsvToCanonicalFeatureVectorConfiguration config, int maxRow, int maxCol) + CsvFeatureVectorParserConfiguration config, int maxRow, int maxCol) { - this.configuration = config; - this.header = fileLines.getHeaderLabels(); - this.lines = fileLines.getDataLines(); - this.maxPlateGeometryRow = maxRow; - this.maxPlateGeometryCol = maxCol; + this.parser = new CsvFeatureVectorParser(fileLines, config); + this.plateGeometry = Geometry.createFromRowColDimensions(maxRow, maxCol); } - public ArrayList<CanonicalFeatureVector> convert() + public List<CanonicalFeatureVector> convert() { - initializeColumns(); - readLines(); - - return convertColumnsToFeatureVectors(); - } - - private ArrayList<CanonicalFeatureVector> convertColumnsToFeatureVectors() - { - final Geometry geometry = - Geometry.createFromRowColDimensions(maxPlateGeometryRow, maxPlateGeometryCol); - - ArrayList<CanonicalFeatureVector> result = new ArrayList<CanonicalFeatureVector>(); - 
Counters<String> counters = new Counters<String>(); - for (FeatureColumn column : columns) + List<FeatureDefinition> featureDefinitions = parser.parse(); + List<CanonicalFeatureVector> result = new ArrayList<CanonicalFeatureVector>(); + for (FeatureDefinition featureDefinition : featureDefinitions) { - if ((true == column.isWellName) || column.isEmpty()) - { - continue; - } - CanonicalFeatureVector featureVector = - convertColumnToFeatureVector(geometry, column, counters); - result.add(featureVector); + result.add(featureDefinition.getCanonicalFeatureVector(plateGeometry)); } - return result; } - private CanonicalFeatureVector convertColumnToFeatureVector(Geometry geometry, - FeatureColumn column, Counters<String> counters) - { - CodeAndLabel codeAndTitle = CodeAndLabelUtil.create(column.name); - ImgFeatureDefDTO featureDef = new ImgFeatureDefDTO(); - featureDef.setLabel(codeAndTitle.getLabel()); - featureDef.setDescription(codeAndTitle.getLabel()); - String code = codeAndTitle.getCode(); - int count = counters.count(code); - featureDef.setCode(count == 1 ? code : code + count); - - return column.createCanonicalFeatureVector(featureDef, geometry); - } - - private void readLines() - { - for (String[] line : lines) - { - readLine(line); - } - if (maxColFound > maxPlateGeometryCol || maxRowFound > maxPlateGeometryRow) - { - throw new IllegalStateException(String.format( - "Feature vector has values outside the plate geometry. 
" - + "Plate geometry: (%d, %d), well: (%d, %d).", maxPlateGeometryRow, - maxPlateGeometryCol, maxRowFound, maxColFound)); - } - } - - private void readLine(String[] line) - { - final WellLocation well = readWellLocationFromLine(line); - for (FeatureColumn column : columns) - { - if (true == column.isWellName) - { - continue; - } - String columnValue = line[column.index]; - if (StringUtils.isBlank(columnValue) == false) - { - column.addValue(well, columnValue); - } - } - - if (well.getRow() > maxRowFound) - { - maxRowFound = well.getRow(); - } - - if (well.getColumn() > maxColFound) - { - maxColFound = well.getColumn(); - } - } - - private WellLocation readWellLocationFromLine(String[] line) - { - if (configuration.isSplit()) - { - String rowString = line[xColumn]; - String colString = line[yColumn]; - return WellLocation.parseLocationStr(rowString, colString); - } else - { - return WellLocation.parseLocationStr(line[xColumn]); - } - } - - private void initializeColumns() - { - for (int i = 0; i < header.length; ++i) - { - String headerName = header[i]; - boolean isWellName = true; - if (configuration.getWellRowColumn().equals(headerName)) - { - xColumn = i; - } else if (configuration.getWellColumnColumn().equals(headerName)) - { - yColumn = i; - } else if (configuration.shouldColumnBeIgnored(headerName) == false) - { - isWellName = false; - } - FeatureColumn featureColumn = new FeatureColumn(i, headerName, isWellName); - columns.add(featureColumn); - } - - if (false == configuration.isSplit()) - { - yColumn = xColumn; - } - - if (xColumn < 0 || yColumn < 0) - { - throw new IllegalArgumentException("Could not parse data set"); - } - } - - private static class FeatureColumn - { - private final int index; - - private final String name; - - private final boolean isWellName; - - private final FeatureValuesMap values; - - private FeatureColumn(int index, String name, boolean isWellName) - { - this.index = index; - this.name = name; - this.isWellName = isWellName; - 
values = new FeatureValuesMap(0., 0.); - } - - public void addValue(WellLocation well, String columnValue) - { - values.addValue(columnValue, well); - } - - public CanonicalFeatureVector createCanonicalFeatureVector(ImgFeatureDefDTO featureDef, - Geometry geometry) - { - FeatureDefinition featureDefinitionValues = - new FeatureDefinition(featureDef, values); - - return featureDefinitionValues.getCanonicalFeatureVector(geometry); - } - - public boolean isEmpty() - { - return values.isEmpty(); - } - } } diff --git a/screening/source/java/ch/systemsx/cisd/openbis/dss/etl/featurevector/FeatureVectorStorageProcessor.java b/screening/source/java/ch/systemsx/cisd/openbis/dss/etl/featurevector/FeatureVectorStorageProcessor.java index 34c6bdcdf33..a93d75ac5c6 100644 --- a/screening/source/java/ch/systemsx/cisd/openbis/dss/etl/featurevector/FeatureVectorStorageProcessor.java +++ b/screening/source/java/ch/systemsx/cisd/openbis/dss/etl/featurevector/FeatureVectorStorageProcessor.java @@ -38,8 +38,6 @@ import ch.systemsx.cisd.openbis.dss.etl.dataaccess.IImagingQueryDAO; import ch.systemsx.cisd.openbis.dss.etl.dto.api.impl.FeatureDefinition; import ch.systemsx.cisd.openbis.dss.etl.dto.api.impl.FeatureVectorDataSetInformation; import ch.systemsx.cisd.openbis.dss.etl.dto.api.v1.ImageDataSetInformation; -import ch.systemsx.cisd.openbis.dss.etl.featurevector.CsvToCanonicalFeatureVector.CsvToCanonicalFeatureVectorConfiguration; -import ch.systemsx.cisd.openbis.dss.generic.server.plugins.tasks.DatasetFileLines; import ch.systemsx.cisd.openbis.dss.generic.shared.IEncapsulatedOpenBISService; import ch.systemsx.cisd.openbis.dss.generic.shared.ServiceProvider; import ch.systemsx.cisd.openbis.dss.generic.shared.dto.DataSetInformation; @@ -47,7 +45,6 @@ import ch.systemsx.cisd.openbis.generic.shared.basic.dto.ExternalData; import ch.systemsx.cisd.openbis.generic.shared.basic.dto.Sample; import ch.systemsx.cisd.openbis.plugin.screening.shared.api.v1.dto.Geometry; import 
ch.systemsx.cisd.openbis.plugin.screening.shared.basic.dto.ScreeningConstants; -import ch.systemsx.cisd.utils.CsvFileReaderHelper; /** * Extract features from the file and store them in the database. @@ -62,8 +59,6 @@ public class FeatureVectorStorageProcessor extends AbstractDelegatingStorageProc private final FeatureVectorStorageProcessorConfiguration configuration; - private final CsvToCanonicalFeatureVectorConfiguration convertorConfig; - private final DataSource dataSource; private final IEncapsulatedOpenBISService openBisService; @@ -72,7 +67,6 @@ public class FeatureVectorStorageProcessor extends AbstractDelegatingStorageProc { super(properties); this.configuration = new FeatureVectorStorageProcessorConfiguration(properties); - convertorConfig = new CsvToCanonicalFeatureVectorConfiguration(configuration); this.dataSource = ServiceProvider.getDataSourceProvider().getDataSource(properties); this.openBisService = ServiceProvider.getOpenBISService(); } @@ -161,8 +155,7 @@ public class FeatureVectorStorageProcessor extends AbstractDelegatingStorageProc } private void loadDataSetIntoDatabase(IImagingQueryDAO dataAccessObject, File dataSet, - DataSetInformation dataSetInformation) - throws IOException + DataSetInformation dataSetInformation) throws IOException { HCSContainerDatasetInfo datasetInfo = createScreeningDatasetInfo(dataSetInformation); @@ -177,25 +170,24 @@ public class FeatureVectorStorageProcessor extends AbstractDelegatingStorageProc private List<CanonicalFeatureVector> extractCanonicalFeatureVectors(File dataSet, DataSetInformation dataSetInformation, Geometry plateGeometry) throws IOException { + List<FeatureDefinition> featureDefinitions; if (dataSetInformation instanceof FeatureVectorDataSetInformation) { - return extractCanonicalFeatureVectors( - (FeatureVectorDataSetInformation) dataSetInformation, plateGeometry); + featureDefinitions = + ((FeatureVectorDataSetInformation) dataSetInformation).getFeatures(); } else { - return 
extractCanonicalFeatureVectorsFromFile(dataSet, plateGeometry); + featureDefinitions = CsvFeatureVectorParser.parse(dataSet, configuration); } + return extractCanonicalFeatureVectors(featureDefinitions, plateGeometry); } private static List<CanonicalFeatureVector> extractCanonicalFeatureVectors( - FeatureVectorDataSetInformation dataSetInformation, Geometry plateGeometry) + List<FeatureDefinition> featuresDefinitions, Geometry plateGeometry) { - List<FeatureDefinition> featuresDefinitionValuesList = - dataSetInformation.getFeatures(); - List<CanonicalFeatureVector> canonicalFeatureVectors = new ArrayList<CanonicalFeatureVector>(); - for (FeatureDefinition featureDefinitionValues : featuresDefinitionValuesList) + for (FeatureDefinition featureDefinitionValues : featuresDefinitions) { CanonicalFeatureVector canonicalFeatureVector = featureDefinitionValues.getCanonicalFeatureVector(plateGeometry); @@ -204,16 +196,6 @@ public class FeatureVectorStorageProcessor extends AbstractDelegatingStorageProc return canonicalFeatureVectors; } - private List<CanonicalFeatureVector> extractCanonicalFeatureVectorsFromFile(File dataSet, - Geometry plateGeometry) throws IOException - { - DatasetFileLines fileLines = getDatasetFileLines(dataSet); - CsvToCanonicalFeatureVector convertor = - new CsvToCanonicalFeatureVector(fileLines, convertorConfig, plateGeometry); - List<CanonicalFeatureVector> fvecs = convertor.convert(); - return fvecs; - } - private HCSContainerDatasetInfo createScreeningDatasetInfo(DataSetInformation dataSetInformation) { Sample sampleOrNull = tryFindSampleForDataSet(dataSetInformation); @@ -251,12 +233,4 @@ public class FeatureVectorStorageProcessor extends AbstractDelegatingStorageProc return QueryTool.getQuery(dataSource, IImagingQueryDAO.class); } - /** - * Return the tabular data as a DatasetFileLines. 
- */ - private DatasetFileLines getDatasetFileLines(File file) throws IOException - { - return CsvFileReaderHelper.getDatasetFileLines(file, configuration); - } - } diff --git a/screening/source/java/ch/systemsx/cisd/openbis/dss/etl/jython/JythonPlateDataSetHandler.java b/screening/source/java/ch/systemsx/cisd/openbis/dss/etl/jython/JythonPlateDataSetHandler.java index 9caf65bc9e3..cafe13cc88a 100644 --- a/screening/source/java/ch/systemsx/cisd/openbis/dss/etl/jython/JythonPlateDataSetHandler.java +++ b/screening/source/java/ch/systemsx/cisd/openbis/dss/etl/jython/JythonPlateDataSetHandler.java @@ -1,6 +1,9 @@ package ch.systemsx.cisd.openbis.dss.etl.jython; import java.io.File; +import java.io.IOException; +import java.util.List; +import java.util.Properties; import org.python.util.PythonInterpreter; @@ -11,12 +14,14 @@ import ch.systemsx.cisd.etlserver.registrator.IDataSetRegistrationDetailsFactory import ch.systemsx.cisd.etlserver.registrator.JythonTopLevelDataSetHandler; import ch.systemsx.cisd.etlserver.registrator.api.v1.IDataSet; import ch.systemsx.cisd.etlserver.registrator.api.v1.impl.DataSetRegistrationTransaction; +import ch.systemsx.cisd.openbis.dss.etl.dto.api.impl.FeatureDefinition; import ch.systemsx.cisd.openbis.dss.etl.dto.api.impl.FeatureVectorDataSetInformation; import ch.systemsx.cisd.openbis.dss.etl.dto.api.impl.FeaturesBuilder; import ch.systemsx.cisd.openbis.dss.etl.dto.api.v1.BasicDataSetInformation; import ch.systemsx.cisd.openbis.dss.etl.dto.api.v1.IFeaturesBuilder; import ch.systemsx.cisd.openbis.dss.etl.dto.api.v1.ImageDataSetInformation; import ch.systemsx.cisd.openbis.dss.etl.dto.api.v1.SimpleImageDataConfig; +import ch.systemsx.cisd.openbis.dss.etl.featurevector.CsvFeatureVectorParser; import ch.systemsx.cisd.openbis.dss.generic.shared.dto.DataSetInformation; import ch.systemsx.cisd.openbis.plugin.screening.shared.basic.dto.ScreeningConstants; @@ -110,11 +115,44 @@ public class JythonPlateDataSetHandler extends 
JythonTopLevelDataSetHandler<Data IFeaturesBuilder featureBuilder, File incomingDatasetFolder) { FeaturesBuilder myFeatureBuilder = (FeaturesBuilder) featureBuilder; + List<FeatureDefinition> featureDefinitions = + myFeatureBuilder.getFeatureDefinitionValuesList(); + return createFeatureVectorRegistrationDetails(featureDefinitions); + } + + /** + * Parses the feature vectors from the specified CSV file. CSV format can be configured with + * the following properties: + * + * <pre> + * # Separator character between headers and row cells. + * separator = , + * ignore-comments = true + * # Header of the column denoting the row of a well. + * well-name-row = row + * # Header of the column denoting the column of a well. + * well-name-col = col + * well-name-col-is-alphanum = true + * </pre> + * + * @throws IOException if file cannot be parsed + */ + public DataSetRegistrationDetails<FeatureVectorDataSetInformation> createFeatureVectorRegistrationDetails( + String dataSetPath, Properties properties) throws IOException + { + List<FeatureDefinition> featureDefinitions = + CsvFeatureVectorParser.parse(new File(dataSetPath), properties); + return createFeatureVectorRegistrationDetails(featureDefinitions); + } + + private DataSetRegistrationDetails<FeatureVectorDataSetInformation> createFeatureVectorRegistrationDetails( + List<FeatureDefinition> featureDefinitions) + { DataSetRegistrationDetails<FeatureVectorDataSetInformation> registrationDetails = featureVectorDatasetFactory.createDataSetRegistrationDetails(); FeatureVectorDataSetInformation featureVectorDataSet = registrationDetails.getDataSetInformation(); - featureVectorDataSet.setFeatures(myFeatureBuilder.getFeatureDefinitionValuesList()); + featureVectorDataSet.setFeatures(featureDefinitions); registrationDetails .setDataSetType(ScreeningConstants.DEFAULT_ANALYSIS_WELL_DATASET_TYPE); registrationDetails.setMeasuredData(false); diff --git 
a/screening/sourceTest/java/ch/systemsx/cisd/openbis/dss/etl/featurevector/CsvToCanonicalFeatureVectorTest.java b/screening/sourceTest/java/ch/systemsx/cisd/openbis/dss/etl/featurevector/CsvToCanonicalFeatureVectorTest.java index 535bf5162b1..9e3299f78a5 100644 --- a/screening/sourceTest/java/ch/systemsx/cisd/openbis/dss/etl/featurevector/CsvToCanonicalFeatureVectorTest.java +++ b/screening/sourceTest/java/ch/systemsx/cisd/openbis/dss/etl/featurevector/CsvToCanonicalFeatureVectorTest.java @@ -33,7 +33,7 @@ import com.csvreader.CsvReader; import ch.rinn.restrictions.Friend; import ch.systemsx.cisd.common.exceptions.UserFailureException; -import ch.systemsx.cisd.openbis.dss.etl.featurevector.CsvToCanonicalFeatureVector.CsvToCanonicalFeatureVectorConfiguration; +import ch.systemsx.cisd.openbis.dss.etl.featurevector.CsvFeatureVectorParser.CsvFeatureVectorParserConfiguration; import ch.systemsx.cisd.openbis.dss.generic.server.plugins.tasks.DatasetFileLines; import ch.systemsx.cisd.openbis.plugin.screening.shared.api.v1.dto.Geometry; import ch.systemsx.cisd.openbis.plugin.screening.shared.dto.PlateFeatureValues; @@ -43,17 +43,17 @@ import ch.systemsx.cisd.openbis.plugin.screening.shared.imaging.dataaccess.ImgFe /** * @author Chandrasekhar Ramakrishnan */ -@Friend(toClasses=FeatureVectorStorageProcessorConfiguration.class) +@Friend(toClasses = FeatureVectorStorageProcessorConfiguration.class) public class CsvToCanonicalFeatureVectorTest extends AssertJUnit { @Test public void testConversion() throws IOException { - CsvToCanonicalFeatureVectorConfiguration config = - new CsvToCanonicalFeatureVectorConfiguration("WellName", "WellName"); + CsvFeatureVectorParserConfiguration config = + new CsvFeatureVectorParserConfiguration("WellName", "WellName"); CsvToCanonicalFeatureVector converter = new CsvToCanonicalFeatureVector(getDatasetFileLines(), config, 16, 24); - ArrayList<CanonicalFeatureVector> fvs = converter.convert(); + List<CanonicalFeatureVector> fvs = 
converter.convert(); // Not all the columns are not empty assertEquals(18, fvs.size()); // Check total cells feature @@ -86,12 +86,14 @@ public class CsvToCanonicalFeatureVectorTest extends AssertJUnit public void testIgnoringColumns() throws IOException { Properties properties = new Properties(); - properties.setProperty(COLUMNS_TO_BE_IGNORED_KEY, "RelativeInfectionIndex, Log2RelativeInfectionIndex,ZScore"); - CsvToCanonicalFeatureVectorConfiguration config = - new CsvToCanonicalFeatureVectorConfiguration(new FeatureVectorStorageProcessorConfiguration(properties)); + properties.setProperty(COLUMNS_TO_BE_IGNORED_KEY, + "RelativeInfectionIndex, Log2RelativeInfectionIndex,ZScore"); + CsvFeatureVectorParserConfiguration config = + new CsvFeatureVectorParserConfiguration( + new FeatureVectorStorageProcessorConfiguration(properties)); CsvToCanonicalFeatureVector converter = - new CsvToCanonicalFeatureVector(getDatasetFileLines(), config, 16, 24); - ArrayList<CanonicalFeatureVector> fvs = converter.convert(); + new CsvToCanonicalFeatureVector(getDatasetFileLines(), config, 16, 24); + List<CanonicalFeatureVector> fvs = converter.convert(); // Not all the columns are not empty assertEquals(15, fvs.size()); // Check total cells feature @@ -119,20 +121,21 @@ public class CsvToCanonicalFeatureVectorTest extends AssertJUnit assertEquals(0.037157f, darr.getForWellLocation(1, 2)); assertEquals(0.001052f, darr.getForWellLocation(2, 1)); } - + @Test public void testNoIgnoringColumns() throws IOException { Properties properties = new Properties(); properties.setProperty(COLUMNS_TO_BE_IGNORED_KEY, ""); - CsvToCanonicalFeatureVectorConfiguration config = - new CsvToCanonicalFeatureVectorConfiguration(new FeatureVectorStorageProcessorConfiguration(properties)); + CsvFeatureVectorParserConfiguration config = + new CsvFeatureVectorParserConfiguration( + new FeatureVectorStorageProcessorConfiguration(properties)); CsvToCanonicalFeatureVector converter = - new 
CsvToCanonicalFeatureVector(getDatasetFileLines(), config, 16, 24); - ArrayList<CanonicalFeatureVector> fvs = converter.convert(); + new CsvToCanonicalFeatureVector(getDatasetFileLines(), config, 16, 24); + List<CanonicalFeatureVector> fvs = converter.convert(); assertEquals(18, fvs.size()); } - + /** * Return the tabular data as a DatasetFileLines. */ -- GitLab