Skip to content
Snippets Groups Projects
Commit 41bbc1ce authored by tpylak's avatar tpylak
Browse files

SE-218 LMC: merge image analysis data with gene information

SVN: 15030
parent b1e9cf60
No related branches found
No related tags found
No related merge requests found
/*
* Copyright 2010 ETH Zuerich, CISD
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package ch.systemsx.cisd.openbis.metadata;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import ch.systemsx.cisd.common.collections.CollectionUtils;
import ch.systemsx.cisd.common.exceptions.UserFailureException;
/**
* Provides structured information from one row of the csv file.
*
* @author Tomasz Pylak
*/
public class AbstractColumnExtractor
{
    /**
     * Position in the header row of every expected column, keyed by the expected column name
     * exactly as it was passed to the constructor.
     */
    private final Map<String/* column name */, Integer/* index in the header table */> columnIndices;

    /**
     * Position in the header row of every header which matched none of the expected columns,
     * keyed by the header text.
     */
    private final Map<String/* column name */, Integer/* index in the header table */> unknownColumnIndices;

    /**
     * Builds the column index from the header row.
     *
     * @param headerTokens tokens of the header row
     * @param expectedColumnNames names of columns which have to be present in the header
     *            (matched case-insensitively)
     * @throws UserFailureException if one of the expected columns is missing in the header
     */
    public AbstractColumnExtractor(String[] headerTokens, String[] expectedColumnNames)
    {
        this.columnIndices = createColumnIndex(headerTokens, expectedColumnNames);
        this.unknownColumnIndices = getOmittedIndices(columnIndices, headerTokens);
    }

    /** @return a new list with the header names which are not among the expected columns */
    public List<String> getUnknownColumnNames()
    {
        return new ArrayList<String>(unknownColumnIndices.keySet());
    }

    // ------------

    /** Collects all headers whose position is not referenced by the specified column index. */
    private static Map<String, Integer> getOmittedIndices(Map<String, Integer> columnIndex,
            String[] headers)
    {
        Map<String, Integer> omittedIndices = new HashMap<String, Integer>();
        Set<Integer> knownIndices = new HashSet<Integer>(columnIndex.values());
        for (int i = 0; i < headers.length; i++)
        {
            if (knownIndices.contains(i) == false)
            {
                omittedIndices.put(headers[i], i);
            }
        }
        return omittedIndices;
    }

    /** Maps each expected column name to its position in the header row. */
    private static Map<String, Integer> createColumnIndex(String[] headers,
            String[] expectedColumnNames)
    {
        Map<String, Integer> map = new HashMap<String, Integer>();
        for (String columnName : expectedColumnNames)
        {
            map.put(columnName, findIndexOrDie(headers, columnName));
        }
        return map;
    }

    /**
     * @return position of the first header equal (ignoring case) to the specified column name
     * @throws UserFailureException if there is no such header
     */
    private static int findIndexOrDie(String[] headers, String columnName)
    {
        for (int i = 0; i < headers.length; i++)
        {
            // a null header token can never match, it must not break the search
            if (headers[i] != null && headers[i].equalsIgnoreCase(columnName))
            {
                return i;
            }
        }
        throw new UserFailureException("Column " + columnName + " does not exist in "
                + CollectionUtils.abbreviate(headers, -1));
    }

    /**
     * Looks up the position of a column, failing with a descriptive message instead of a
     * NullPointerException caused by unboxing a missing map entry.
     */
    private static int getIndexOrDie(Map<String, Integer> indices, String columnName)
    {
        Integer ix = indices.get(columnName);
        if (ix == null)
        {
            throw new IllegalArgumentException("Column " + columnName
                    + " is not one of the indexed columns: " + indices.keySet());
        }
        return ix.intValue();
    }

    /**
     * @param columnName one of the expected column names, spelled exactly as it was passed to the
     *            constructor (this lookup is case-sensitive)
     * @return value of the column in the specified row or an empty string if the row is too short
     * @throws IllegalArgumentException if the column is not one of the expected columns
     */
    protected String getValue(String[] row, String columnName)
    {
        return valueAt(row, getIndexOrDie(columnIndices, columnName));
    }

    /** @return the token at the specified position or an empty string if the row is too short */
    private static String valueAt(String[] row, int ix)
    {
        return ix >= row.length ? "" : row[ix];
    }

    /** Replaces every character which is not valid in a code with an underscore. */
    private static String asCode(String value)
    {
        StringBuilder code = new StringBuilder(value.length());
        for (int i = 0; i < value.length(); i++)
        {
            char ch = value.charAt(i);
            code.append(isValidCodeCharacter(ch) ? ch : '_');
        }
        return code.toString();
    }

    /** Letters, digits, dots, hyphens and underscores are kept in codes. */
    private static boolean isValidCodeCharacter(char ch)
    {
        return Character.isLetterOrDigit(ch) || ch == '.' || ch == '-' || ch == '_';
    }

    /**
     * @return value of the column in the specified row where all characters other than letters,
     *         digits, '.', '-' and '_' are replaced by '_'
     * @throws IllegalArgumentException if the column is not one of the expected columns
     */
    protected String getCodeValue(String[] row, String columnName)
    {
        return asCode(getValue(row, columnName));
    }

    /**
     * @param columnNames names of unknown columns, as returned by {@link #getUnknownColumnNames()}
     * @return values of the specified columns in the specified row, in the order of the column
     *         names; an empty string stands for each column beyond the end of the row
     * @throws IllegalArgumentException if one of the names is not an unknown column
     */
    public List<String> getUnknownColumnValues(String[] row, List<String> columnNames)
    {
        List<String> values = new ArrayList<String>(columnNames.size());
        for (String columnName : columnNames)
        {
            values.add(valueAt(row, getIndexOrDie(unknownColumnIndices, columnName)));
        }
        return values;
    }
}
/*
* Copyright 2010 ETH Zuerich, CISD
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package ch.systemsx.cisd.openbis.metadata;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.charset.Charset;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
import org.apache.commons.io.IOUtils;
import com.csvreader.CsvReader;
import ch.systemsx.cisd.openbis.metadata.QiagenScreeningLibraryColumnExtractor.GeneDetails;
/**
* Merges image analysis results with gene information from library.
*
* @author Tomasz Pylak
*/
public class ImageAnalysisGeneMerger
{
    private static final char SEPARATOR = ',';

    // names of the two columns appended to each analysis file
    private static final String GENE_SYMBOL = "gene";

    private static final String GENE_DESCRIPTION = "description";

    /**
     * Command line entry point. Expects three arguments: the path of the library file, the folder
     * with the image analysis files and the folder where the merged files will be written.
     * Terminates the JVM with exit code 1 if the arguments are invalid.
     */
    public static void main(String[] args) throws Exception
    {
        if (args.length != 3)
        {
            throw error("Invalid parameters. Expected: "
                    + "<library-file-path> <image-analysis-folder> <output-folder>");
        }
        // should contain one file per plate analysis results
        File analysisFolder = new File(args[1]);
        // folder where results will be saved
        File outputFolder = new File(args[2]);

        Map<WellLocation, GeneDetails> geneMap;
        CsvReader libraryReader = readFile(new File(args[0]));
        try
        {
            geneMap = readGeneMap(libraryReader);
        } finally
        {
            // release the library file even if it turns out to be malformed
            libraryReader.close();
        }
        mergeAnalysisDirWithGenes(analysisFolder, outputFolder, geneMap);
    }

    /**
     * Merges every entry of the analysis folder with the gene information and writes the result
     * to a file of the same name in the output folder (which is created if necessary).
     */
    private static void mergeAnalysisDirWithGenes(File analysisFolder, File outputFolder,
            Map<WellLocation, GeneDetails> geneMap) throws Exception
    {
        // listFiles() returns null instead of throwing if the folder cannot be listed
        File[] plateAnalysisFiles = analysisFolder.listFiles();
        if (plateAnalysisFiles == null)
        {
            throw error(analysisFolder + " does not exist or is not a directory.");
        }
        outputFolder.mkdirs();
        for (File plateAnalysisFile : plateAnalysisFiles)
        {
            File outFile = new File(outputFolder, plateAnalysisFile.getName());
            mergeAnalysisFileWithGenes(plateAnalysisFile, outFile, geneMap);
        }
    }

    /**
     * Copies one analysis file to the output file, appending the gene symbol and the gene
     * description to the header and to each row. Rows with a well location which is not in the
     * gene map get two empty values.
     */
    private static void mergeAnalysisFileWithGenes(File plateAnalysisFile, File outFile,
            Map<WellLocation, GeneDetails> geneMap) throws Exception
    {
        // open file to read
        CsvReader reader = readFile(plateAnalysisFile);
        try
        {
            boolean headerPresent = reader.readRecord();
            if (headerPresent == false)
            {
                throw error("header not found");
            }
            String orgHeaders = reader.getRawRecord();
            PlateImageAnalysisColumnExtractor extractor =
                    new PlateImageAnalysisColumnExtractor(reader.getValues());

            // open file to write results
            OutputStream out = new FileOutputStream(outFile);
            try
            {
                writeLine(createHeader(orgHeaders), out);
                while (reader.readRecord())
                {
                    String[] row = reader.getValues();
                    WellLocation loc = extractor.getWellLocation(row);
                    GeneDetails gene = geneMap.get(loc);
                    writeLine(createLine(reader.getRawRecord(), gene), out);
                }
            } finally
            {
                out.close();
            }
        } finally
        {
            reader.close();
        }
    }

    /** Writes the line followed by a line feed to the stream. */
    private static void writeLine(String line, OutputStream out) throws IOException
    {
        IOUtils.writeLines(Arrays.asList(line), "\n", out);
    }

    /** Appends the names of the two gene columns to the original header line. */
    private static String createHeader(String originalLine)
    {
        return originalLine + SEPARATOR + GENE_SYMBOL + SEPARATOR + GENE_DESCRIPTION;
    }

    /**
     * Appends the gene symbol (unquoted) and the quoted gene description to the original line.
     * Both values are empty if there is no gene.
     */
    private static String createLine(String originalLine, GeneDetails geneOrNull)
    {
        String symbol = "";
        String description = "";
        if (geneOrNull != null)
        {
            symbol = geneOrNull.getSymbol();
            description = quote(geneOrNull.getDescription());
        }
        return originalLine + SEPARATOR + symbol + SEPARATOR + description;
    }

    /**
     * Wraps the value in double quotes. Embedded double quotes are doubled, otherwise a
     * description containing a quote would produce a malformed csv record. A missing value is
     * written as an empty quoted string.
     */
    private static String quote(String value)
    {
        String escaped = value == null ? "" : value.replace("\"", "\"\"");
        return "\"" + escaped + "\"";
    }

    /**
     * Reads the whole library and maps the location of each well to the details of its gene. If a
     * location occurs more than once, the last row wins.
     */
    private static Map<WellLocation, GeneDetails> readGeneMap(CsvReader libraryReader)
            throws Exception
    {
        Map<WellLocation, GeneDetails> map = new HashMap<WellLocation, GeneDetails>();
        boolean headerPresent = libraryReader.readRecord();
        if (headerPresent == false)
        {
            throw error("header not found");
        }
        String[] headers = libraryReader.getValues();
        QiagenScreeningLibraryColumnExtractor extractor =
                new QiagenScreeningLibraryColumnExtractor(headers);
        while (libraryReader.readRecord())
        {
            String[] row = libraryReader.getValues();
            WellLocation loc = extractor.getWellLocation(row);
            GeneDetails gene = extractor.getGeneDetails(row);
            map.put(loc, gene);
        }
        return map;
    }

    /**
     * Opens the file as a csv reader using the platform default charset. The caller is
     * responsible for closing the returned reader. Terminates the JVM if the path does not point
     * to an existing file.
     */
    static CsvReader readFile(File file) throws FileNotFoundException, IOException
    {
        if (file.isFile() == false)
        {
            throw error(file + " does not exist or is not a file.");
        }
        FileInputStream fileInputStream = new FileInputStream(file);
        CsvReader csvReader = new CsvReader(fileInputStream, Charset.defaultCharset());
        csvReader.setDelimiter(SEPARATOR);
        // NOTE(review): presumably lifts the reader's limits on record size - confirm against
        // the CsvReader documentation
        csvReader.setSafetySwitch(false);
        return csvReader;
    }

    /**
     * Prints the message to the standard error stream and terminates the JVM with exit code 1.
     * The returned exception only allows callers to write <code>throw error(...)</code>, so that
     * the compiler knows that the control flow ends there.
     */
    private static RuntimeException error(String msg)
    {
        System.err.println(msg);
        System.exit(1);
        return new IllegalStateException(msg);
    }
}
......@@ -54,7 +54,7 @@ public class LibraryEntityRegistrator
private final PlateRegistrator plateRegistrator;
public LibraryEntityRegistrator(IScreeningLibraryColumnExtractor extractor,
public LibraryEntityRegistrator(QiagenScreeningLibraryColumnExtractor extractor,
String experimentIdentifier, String plateGeometry, String groupCode) throws IOException
{
this.geneRegistrator = new GeneRegistrator(new File(GENES_FILE_NAME));
......@@ -66,7 +66,7 @@ public class LibraryEntityRegistrator
plateGeometry, groupCode);
}
public void register(IScreeningLibraryColumnExtractor extractor, String[] row)
public void register(QiagenScreeningLibraryColumnExtractor extractor, String[] row)
throws IOException
{
String geneId = geneRegistrator.register(extractor, row);
......@@ -153,7 +153,7 @@ public class LibraryEntityRegistrator
}
/** @return sampleIdentifier */
public String registerPlate(IScreeningLibraryColumnExtractor extractor, String[] row)
public String registerPlate(QiagenScreeningLibraryColumnExtractor extractor, String[] row)
throws IOException
{
String plateCode = extractor.getPlateCode(row);
......@@ -171,7 +171,7 @@ public class LibraryEntityRegistrator
return "/" + groupCode + "/" + plateCode;
}
public void registerWell(IScreeningLibraryColumnExtractor extractor, String[] row,
public void registerWell(QiagenScreeningLibraryColumnExtractor extractor, String[] row,
String plateId, String oligoId) throws IOException
{
String wellCode = extractor.getWellCode(row);
......@@ -210,7 +210,7 @@ public class LibraryEntityRegistrator
}
// / returns gene id
public String register(IScreeningLibraryColumnExtractor extractor, String[] row)
public String register(QiagenScreeningLibraryColumnExtractor extractor, String[] row)
throws IOException
{
String geneSymbol = extractor.getGeneCode(row);
......@@ -253,7 +253,7 @@ public class LibraryEntityRegistrator
}
// / returns openbis id
public String register(IScreeningLibraryColumnExtractor extractor, String[] row,
public String register(QiagenScreeningLibraryColumnExtractor extractor, String[] row,
String inhibitedGeneCode) throws IOException
{
String geneSymbol = extractor.getGeneCode(row);
......
/*
* Copyright 2010 ETH Zuerich, CISD
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package ch.systemsx.cisd.openbis.metadata;
/**
* Provides structured information from one row of the plate images analysis results.
*
* @author Tomasz Pylak
*/
public class PlateImageAnalysisColumnExtractor extends AbstractColumnExtractor
{
    // ----- names of the columns which have to be present in the header

    private static final String PLATE_NAME = "barcode";

    private static final String WELL_ROW = "row";

    private static final String WELL_COL = "col";

    private static final String[] EXPECTED_COLUMNS = { PLATE_NAME, WELL_ROW, WELL_COL };

    // -------------

    /**
     * @param headerTokens tokens of the header row of the analysis file; the plate, row and
     *            column headers are required by the superclass constructor
     */
    public PlateImageAnalysisColumnExtractor(String[] headerTokens)
    {
        super(headerTokens, EXPECTED_COLUMNS);
    }

    /**
     * @return location of the well described by the specified row, built from the plate code,
     *         the well row and the well column
     */
    public WellLocation getWellLocation(String[] row)
    {
        final String plateCode = getPlateCode(row);
        final String wellRow = getWellRow(row);
        final String wellCol = getWellCol(row);
        return new WellLocation(plateCode, wellRow, wellCol);
    }

    /** The barcode is read as a code value, the row and column below are taken verbatim. */
    private String getPlateCode(String[] row)
    {
        return getCodeValue(row, PLATE_NAME);
    }

    private String getWellRow(String[] row)
    {
        return getValue(row, WELL_ROW);
    }

    private String getWellCol(String[] row)
    {
        return getValue(row, WELL_COL);
    }
}
......@@ -16,22 +16,14 @@
package ch.systemsx.cisd.openbis.metadata;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import ch.systemsx.cisd.common.collections.CollectionUtils;
import ch.systemsx.cisd.common.exceptions.UserFailureException;
/**
* Provides structured information from one row of the QIAGEN library.
*
* @author Tomasz Pylak
*/
public class QiagenScreeningLibraryColumnExtractor implements IScreeningLibraryColumnExtractor
public class QiagenScreeningLibraryColumnExtractor extends AbstractColumnExtractor
{
// ----- column names
......@@ -55,162 +47,113 @@ public class QiagenScreeningLibraryColumnExtractor implements IScreeningLibraryC
private final static String OLIGO_ID = "productId";
private final static String[] ALL_COLUMNS = new String[]
private final static String[] EXPECTED_COLUMNS = new String[]
{ PLATE_NAME, WELL_ROW, WELL_COL, RNA_SEQUENCE, GENE_ID, GENE_SYMBOL, GENE_DESC, OLIGO_ID };
// -------------
private final Map<String/* column name */, Integer/* index in the header table */> columnIndices;
private final Map<String/* column name */, Integer/* index in the header table */> unknownColumnIndices;
public QiagenScreeningLibraryColumnExtractor(String[] headerTokens)
{
this.columnIndices = createColumnIndex(headerTokens);
this.unknownColumnIndices = getOmittedIndices(columnIndices, headerTokens);
super(headerTokens, EXPECTED_COLUMNS);
}
public List<String> getAdditionalOligoPropertyNames()
{
return new ArrayList<String>(unknownColumnIndices.keySet());
return getUnknownColumnNames();
}
// ------------
private static Map<String, Integer> getOmittedIndices(Map<String, Integer> columnIndex,
String[] headers)
public String getPlateCode(String[] row)
{
Map<String, Integer> omittedIndices = new HashMap<String, Integer>();
Set<Integer> knownIndices = new HashSet<Integer>(columnIndex.values());
for (int i = 0; i < headers.length; i++)
{
if (knownIndices.contains(i) == false)
{
omittedIndices.put(headers[i], i);
}
}
return omittedIndices;
return getCodeValue(row, PLATE_NAME);
}
private static Map<String, Integer> createColumnIndex(String[] headers)
public String getWellCode(String[] row)
{
Map<String, Integer> map = new HashMap<String, Integer>();
for (String columnName : ALL_COLUMNS)
{
findAndPut(map, headers, columnName);
}
return map;
String wellRow = getWellRow(row);
String wellCol = getWellCol(row);
return wellRow + wellCol;
}
private static void findAndPut(Map<String, Integer> map, String[] headers, String columnName)
private String getWellCol(String[] row)
{
int ix = findIndexOrDie(headers, columnName);
map.put(columnName, ix);
return getValue(row, WELL_COL);
}
private static int findIndexOrDie(String[] headers, String columnName)
private String getWellRow(String[] row)
{
for (int i = 0; i < headers.length; i++)
{
if (headers[i].equalsIgnoreCase(columnName))
{
return i;
}
}
throw new UserFailureException("Column " + columnName + " does not exist in "
+ CollectionUtils.abbreviate(headers, -1));
return getValue(row, WELL_ROW);
}
private String getValue(String[] row, String columnName)
public String getRNASequence(String[] row)
{
Integer ix = columnIndices.get(columnName);
return valueAt(row, ix);
return getValue(row, RNA_SEQUENCE);
}
private static String valueAt(String[] row, Integer ix)
public String getOligoId(String[] row)
{
if (ix >= row.length)
{
return "";
} else
{
return row[ix];
}
return getValue(row, OLIGO_ID);
}
private String asCode(String value)
public String getGeneId(String[] row)
{
String code = "";
for (int i = 0; i < value.length(); i++)
{
char ch = value.charAt(i);
if (isValidCodeCharacter(ch) == false)
{
ch = '_';
}
code += ch;
}
return code;
return getValue(row, GENE_ID);
}
private boolean isValidCodeCharacter(char ch)
public String getGeneCode(String[] row)
{
return Character.isLetterOrDigit(ch) || ch == '.' || ch == '-' || ch == '_';
return getCodeValue(row, GENE_SYMBOL);
}
private String getCodeValue(String[] row, String columnName)
public String getGeneDescription(String[] row)
{
return asCode(getValue(row, columnName));
return getValue(row, GENE_DESC);
}
// ------------
public String getPlateCode(String[] row)
public List<String> getAdditionalOligoPropertyValues(String[] row, List<String> columnNames)
{
return getCodeValue(row, PLATE_NAME);
return getUnknownColumnValues(row, columnNames);
}
public String getWellCode(String[] row)
public WellLocation getWellLocation(String[] row)
{
String wellRow = getValue(row, WELL_ROW);
String wellCol = getValue(row, WELL_COL);
return wellRow + wellCol;
return new WellLocation(getPlateCode(row), getWellRow(row), getWellCol(row));
}
public String getRNASequence(String[] row)
public GeneDetails getGeneDetails(String[] row)
{
return getValue(row, RNA_SEQUENCE);
return new GeneDetails(getGeneCode(row), getGeneDescription(row));
}
public String getOligoId(String[] row)
public static class GeneDetails
{
return getValue(row, OLIGO_ID);
}
private String symbol, description;
public String getGeneId(String[] row)
{
return getValue(row, GENE_ID);
}
public GeneDetails(String symbol, String description)
{
this.symbol = symbol;
this.description = description;
}
public String getGeneCode(String[] row)
{
return getCodeValue(row, GENE_SYMBOL);
}
public String getSymbol()
{
return symbol;
}
public String getGeneDescription(String[] row)
{
return getValue(row, GENE_DESC);
}
public void setSymbol(String symbol)
{
this.symbol = symbol;
}
public List<String> getAdditionalOligoPropertyValues(String[] row, List<String> columnNames)
{
List<String> values = new ArrayList<String>();
for (String columnName : columnNames)
public String getDescription()
{
return description;
}
public void setDescription(String description)
{
Integer ix = unknownColumnIndices.get(columnName);
String value = valueAt(row, ix);
values.add(value);
this.description = description;
}
return values;
}
}
......@@ -60,7 +60,7 @@ public class ScreeningLibraryTransformer
return;
}
String[] headers = csvReader.getValues();
IScreeningLibraryColumnExtractor extractor =
QiagenScreeningLibraryColumnExtractor extractor =
new QiagenScreeningLibraryColumnExtractor(headers);
LibraryEntityRegistrator registrator =
new LibraryEntityRegistrator(extractor, experimentIdentifier, plateGeometry,
......
......@@ -16,31 +16,48 @@
package ch.systemsx.cisd.openbis.metadata;
import java.util.List;
/**
* Provides structured information from one row of the library.
* Describes well location.
*
* @author Tomasz Pylak
*/
public interface IScreeningLibraryColumnExtractor
public class WellLocation
{
public String getPlateCode(String[] row);
public String getWellCode(String[] row);
public String getRNASequence(String[] row);
public String getOligoId(String[] row);
public String getGeneId(String[] row);
public String getGeneCode(String[] row);
public String getGeneDescription(String[] row);
public List<String> getAdditionalOligoPropertyNames();
public List<String> getAdditionalOligoPropertyValues(String[] row, List<String> columnNames);
private final String barcode, row, col;
/**
 * Creates a location from its three components, which are stored as given.
 *
 * @param barcode barcode of the plate
 * @param row row of the well on the plate
 * @param col column of the well on the plate
 */
public WellLocation(String barcode, String row, String col)
{
this.barcode = barcode;
this.row = row;
this.col = col;
}
/** @return the barcode passed to the constructor */
public String getBarcode()
{
return barcode;
}
/** @return the well row passed to the constructor */
public String getRow()
{
return row;
}
/** @return the well column passed to the constructor */
public String getCol()
{
return col;
}
/**
 * Two locations are equal if their barcode, row and column are equal.
 *
 * @return <code>false</code> for <code>null</code> and for objects which are not well
 *         locations, instead of failing with an exception
 */
@Override
public boolean equals(Object o)
{
    if (this == o)
    {
        return true;
    }
    // instanceof is also false for null, so the cast below is always safe
    if (o instanceof WellLocation == false)
    {
        return false;
    }
    WellLocation loc = (WellLocation) o;
    return barcode.equals(loc.barcode) && row.equals(loc.row) && col.equals(loc.col);
}
/**
 * Combines the hash codes of barcode, row and column. The combination depends on the order of
 * the components, so that locations with swapped row and column get different hash codes
 * (a plain XOR would make them collide).
 */
@Override
public int hashCode()
{
    int result = barcode.hashCode();
    result = 31 * result + row.hashCode();
    result = 31 * result + col.hashCode();
    return result;
}
}
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment