From f835535fc77bc54c74db8a9a38442209a6fadd4b Mon Sep 17 00:00:00 2001 From: cramakri <cramakri> Date: Tue, 22 Jun 2010 09:26:34 +0000 Subject: [PATCH] LMS-1584 Convert Genedata feature vectors to the canonical form. SVN: 16652 --- ...enedataFormatToCanonicalFeatureVector.java | 257 ++++++++++++++++++ 1 file changed, 257 insertions(+) create mode 100644 screening/source/java/ch/systemsx/cisd/openbis/dss/etl/featurevector/GenedataFormatToCanonicalFeatureVector.java diff --git a/screening/source/java/ch/systemsx/cisd/openbis/dss/etl/featurevector/GenedataFormatToCanonicalFeatureVector.java b/screening/source/java/ch/systemsx/cisd/openbis/dss/etl/featurevector/GenedataFormatToCanonicalFeatureVector.java new file mode 100644 index 00000000000..4db30d31dec --- /dev/null +++ b/screening/source/java/ch/systemsx/cisd/openbis/dss/etl/featurevector/GenedataFormatToCanonicalFeatureVector.java @@ -0,0 +1,257 @@ +/* + * Copyright 2010 ETH Zuerich, CISD + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package ch.systemsx.cisd.openbis.dss.etl.featurevector; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.StringTokenizer; + +import org.apache.commons.lang.StringUtils; + +import ch.systemsx.cisd.base.mdarray.MDDoubleArray; +import ch.systemsx.cisd.common.exceptions.UserFailureException; +import ch.systemsx.cisd.common.geometry.Point; +import ch.systemsx.cisd.openbis.dss.etl.dataaccess.ImgFeatureDefDTO; +import ch.systemsx.cisd.openbis.dss.etl.dataaccess.ImgFeatureValuesDTO; + +/** + * Converts currentFeature vectors from the Genedata currentFeature vector file format to + * CanonicaFeatureVector objects. + * + * @author Chandrasekhar Ramakrishnan + */ +public class GenedataFormatToCanonicalFeatureVector +{ + private final String layerPrefix; + + private final List<String> lines; + + private final ArrayList<FeatureParser> features = new ArrayList<FeatureParser>(); + + public GenedataFormatToCanonicalFeatureVector(List<String> lines, String layerPrefix) + { + this.layerPrefix = layerPrefix; + this.lines = lines; + } + + public ArrayList<CanonicalFeatureVector> convert() + { + readLines(); + + return convertFeaturesToFeatureFectors(); + } + + private ArrayList<CanonicalFeatureVector> convertFeaturesToFeatureFectors() + { + ArrayList<CanonicalFeatureVector> result = new ArrayList<CanonicalFeatureVector>(); + for (FeatureParser feature : features) + { + CanonicalFeatureVector featureVector = convertFeatureToFeatureVector(feature); + result.add(featureVector); + } + + return result; + } + + private CanonicalFeatureVector convertFeatureToFeatureVector(FeatureParser feature) + { + int[] dims = + { feature.numberOfRows, feature.numberOfColumns }; + + CanonicalFeatureVector featureVector = new CanonicalFeatureVector(); + featureVector.setFeatureDef(new ImgFeatureDefDTO(feature.name, feature.name, 0)); + MDDoubleArray valuesValues = convertColumnToByteArray(dims, feature); + ImgFeatureValuesDTO values = new ImgFeatureValuesDTO(0., 0., valuesValues, 0); + featureVector.setValues(Collections.singletonList(values)); + return featureVector; + } + + private MDDoubleArray convertColumnToByteArray(int[] dims, FeatureParser feature) + { + MDDoubleArray doubleArray = new MDDoubleArray(dims); + for (Point loc : feature.values.keySet()) + { + Double value = feature.values.get(loc); + doubleArray.set(value, loc.getX(), loc.getY()); + } + + return doubleArray; + } + + private void readLines() + { + String featureName = null; + ArrayList<String> featureLines = new ArrayList<String>(); + + // Don't need to do anything with the barcode, just make sure it is there + extractBarCode(lines.get(0).trim()); + + for (int i = 1; i < lines.size(); i++) + { + String line = lines.get(i).trim(); + + if (StringUtils.isEmpty(line)) + { + continue; + } + // If the line starts with the layer prefix, this is a new feature + if (line.startsWith(getLayerPrefix())) + { + // End the old feature + if (false == featureLines.isEmpty()) + { + createFeature(featureName, featureLines); + } + + // begin the new feature + featureName = extractLayer(line, i); + featureLines = new ArrayList<String>(); + } else + { + featureLines.add(line); + } + } + // End the last feature + createFeature(featureName, featureLines); + } + + private FeatureParser createFeature(String name, ArrayList<String> featureLines) + { + FeatureParser feature = new FeatureParser(name, featureLines); + feature.parse(); + features.add(feature); + return feature; + } + + private String extractBarCode(String firstLine) + { + int indexOfEqual = firstLine.indexOf('='); + if (indexOfEqual < 0) + { + throw error(0, firstLine, "Missing '='"); + } + return firstLine.substring(indexOfEqual + 1).trim(); + } + + private String extractLayer(String line, int lineIndex) + { + String layer = line.substring(getLayerPrefix().length()); + if (layer.endsWith(">") == false) + { + throw error(lineIndex, line, "Missing '>' at the end"); + } + return layer.substring(0, layer.length() - 1); + } + + private String getLayerPrefix() + { + return layerPrefix; + } + + private UserFailureException error(int lineIndex, String line, String reason) + { + return new UserFailureException("Error in line " + lineIndex + 1 + ": " + reason + ": " + + line); + } + + /** + * Class for parsing features from the Genedata format. + * + * @author Chandrasekhar Ramakrishnan + */ + private static class FeatureParser + { + private final String name; + + private final ArrayList<String> lines; + + private final ArrayList<String> rowLetters; + + private final int numberOfColumns; + + private final HashMap<Point, Double> values; + + // this is not known until we complete processing + private int numberOfRows = 0; + + private FeatureParser(String name, ArrayList<String> lines) + { + this.name = name; + this.lines = lines; + this.numberOfColumns = computeNumberOfColumns(lines); + values = new HashMap<Point, Double>(); + rowLetters = new ArrayList<String>(); + } + + /** + * Parse the header to get the number of columns + */ + private int computeNumberOfColumns(List<String> aList) + { + StringTokenizer tokenizer = new StringTokenizer(aList.get(0)); + return tokenizer.countTokens(); + } + + public void parse() + { + // skip the first line, the header, since the only information it contains is the number + // of columns + for (int i = 1; i < lines.size(); ++i) + { + pasrseLine(lines.get(i), i); + } + numberOfRows = rowLetters.size(); + } + + public void pasrseLine(String line, int lineIndex) + { + StringTokenizer tokenizer = new StringTokenizer(line); + int countTokens = tokenizer.countTokens(); + if (countTokens != numberOfColumns + 1) + { + throw error(lineIndex, line, "Inconsistent number of features: Expected " + + numberOfColumns + " but was " + (countTokens - 1)); + } + + String rowLetter = tokenizer.nextToken(); + if (rowLetters.contains(rowLetter) == false) + { + rowLetters.add(rowLetter); + } + + for (int i = 0; tokenizer.hasMoreTokens(); ++i) + { + String token = tokenizer.nextToken(); + Point point = new Point(rowLetters.size() - 1, i); + try + { + values.put(point, Double.parseDouble(token)); + } catch (NumberFormatException ex) + { + } + } + } + + private UserFailureException error(int lineIndex, String line, String reason) + { + return new UserFailureException("Error in line " + lineIndex + 1 + ": " + reason + ": " + + line); + } + } +} -- GitLab