diff --git a/datastore_server/source/java/ch/systemsx/cisd/etlserver/utils/TabSeparatedValueTable.java b/datastore_server/source/java/ch/systemsx/cisd/etlserver/utils/TabSeparatedValueTable.java index 70487ab10381242c9ac39493fad3b94514362185..e5f5a259af795639de2ec2a810ab73c33843cfae 100644 --- a/datastore_server/source/java/ch/systemsx/cisd/etlserver/utils/TabSeparatedValueTable.java +++ b/datastore_server/source/java/ch/systemsx/cisd/etlserver/utils/TabSeparatedValueTable.java @@ -26,6 +26,8 @@ import org.apache.commons.io.IOUtils; import org.apache.commons.io.LineIterator; import org.apache.commons.lang.StringUtils; +import ch.systemsx.cisd.common.exceptions.UserFailureException; + /** * Helper class for get table data out of a TSV file. * @@ -40,6 +42,7 @@ public class TabSeparatedValueTable private final boolean ignoreHashedLines; private String currentLine; + private int currentLineNumber; RowLineIterator(LineIterator lineIterator, boolean ignoreEmptyLines, boolean ignoreHashedLines) { @@ -72,6 +75,11 @@ public class TabSeparatedValueTable } } + public final int getCurrentLineNumber() + { + return currentLineNumber; + } + public void remove() { throw new UnsupportedOperationException(); @@ -86,9 +94,10 @@ public class TabSeparatedValueTable return null; } String line = lineIterator.nextLine(); + currentLineNumber++; if ((ignoreEmptyLines == false || line == null || StringUtils.isNotBlank(line)) && (ignoreHashedLines == false || line == null || line.startsWith("#") == false)) - { + { return line; } } @@ -97,6 +106,7 @@ public class TabSeparatedValueTable private final RowLineIterator rowLineIterator; private final List<String> headers; + private final boolean strictRowSize; /** * Creates a new instance. Short cut for @@ -105,7 +115,7 @@ public class TabSeparatedValueTable public TabSeparatedValueTable(Reader reader, String nameOfReadingSource, boolean ignoreEmptyLines) { - this(reader, nameOfReadingSource, ignoreEmptyLines, false); + this(reader, nameOfReadingSource, ignoreEmptyLines, false, false); } /** @@ -116,11 +126,13 @@ public class TabSeparatedValueTable * @param nameOfReadingSource Source (usually file name) from which the table will be read. This * is used for error messages only. * @param ignoreEmptyLines If <code>true</code> lines with only white spaces will be ignored. + * @param strictRowSize If <code>true</code> the number of row cells have to be equal the number of headers. * @param ignoreHashedLines If <code>true</code> lines starting with '#' will be ignored. */ public TabSeparatedValueTable(Reader reader, String nameOfReadingSource, - boolean ignoreEmptyLines, boolean ignoreHashedLines) + boolean ignoreEmptyLines, boolean strictRowSize, boolean ignoreHashedLines) { + this.strictRowSize = strictRowSize; rowLineIterator = new RowLineIterator(IOUtils.lineIterator(reader), ignoreEmptyLines, ignoreHashedLines); if (rowLineIterator.hasNext() == false) { @@ -175,6 +187,11 @@ public class TabSeparatedValueTable return null; } List<String> row = getRowCells(line); + if (strictRowSize && row.size() != headers.size()) + { + throw new UserFailureException(rowLineIterator.getCurrentLineNumber() - 1 + + ". row has " + row.size() + " instead of " + headers.size() + " cells."); + } for (int i = row.size(); i < headers.size(); i++) { row.add(""); diff --git a/datastore_server/source/java/ch/systemsx/cisd/etlserver/validation/DataSetValidatorForTSV.java b/datastore_server/source/java/ch/systemsx/cisd/etlserver/validation/DataSetValidatorForTSV.java index 208f6f8dc0c3fb50e173c2e7e9346b7e4adef74e..d9423374fe3f704416b4c7bc78251511bee7bf35 100644 --- a/datastore_server/source/java/ch/systemsx/cisd/etlserver/validation/DataSetValidatorForTSV.java +++ b/datastore_server/source/java/ch/systemsx/cisd/etlserver/validation/DataSetValidatorForTSV.java @@ -60,6 +60,8 @@ class DataSetValidatorForTSV implements IDataSetValidator static final String EXCLUDE_PATH_PATTERNS_KEY = "exclude-path-patterns"; static final String IGNORE_EMPTY_LINES_KEY = "ignore-empty-lines"; + + static final String STRICT_ROW_SIZE_KEY = "strict-row-size"; static final String COLUMNS_KEY = "columns"; @@ -73,10 +75,13 @@ class DataSetValidatorForTSV implements IDataSetValidator private final boolean ignoreEmptyLines; + private final boolean strictRowSize; + DataSetValidatorForTSV(Properties properties) { fileScanners = new ArrayList<FileScanner>(); ignoreEmptyLines = PropertyUtils.getBoolean(properties, IGNORE_EMPTY_LINES_KEY, true); + strictRowSize = PropertyUtils.getBoolean(properties, STRICT_ROW_SIZE_KEY, true); String pathPatterns = properties.getProperty(PATH_PATTERNS_KEY, "*"); StringTokenizer tokenizer = new StringTokenizer(pathPatterns, ","); while (tokenizer.hasMoreTokens()) @@ -160,7 +165,7 @@ class DataSetValidatorForTSV implements IDataSetValidator { reader = new FileReader(file); TabSeparatedValueTable table = - new TabSeparatedValueTable(reader, file.toString(), ignoreEmptyLines); + new TabSeparatedValueTable(reader, file.toString(), ignoreEmptyLines, strictRowSize, false); List<String> headers = table.getHeaders(); assertUniqueHeaders(headers); ColumnDefinition[] definitions = findColumnDefinitions(headers); diff --git a/datastore_server/sourceTest/java/ch/systemsx/cisd/etlserver/utils/TabSeparatedValueTableTest.java b/datastore_server/sourceTest/java/ch/systemsx/cisd/etlserver/utils/TabSeparatedValueTableTest.java index b3fd9fd17c16c473359a81b6913a0eea79b0af8a..2b2efbbc9092e5267d810ec97a324e5368a3bf94 100644 --- a/datastore_server/sourceTest/java/ch/systemsx/cisd/etlserver/utils/TabSeparatedValueTableTest.java +++ b/datastore_server/sourceTest/java/ch/systemsx/cisd/etlserver/utils/TabSeparatedValueTableTest.java @@ -22,6 +22,8 @@ import java.util.List; import org.testng.AssertJUnit; import org.testng.annotations.Test; +import ch.systemsx.cisd.common.exceptions.UserFailureException; + /** * * @@ -121,8 +123,14 @@ public class TabSeparatedValueTableTest extends AssertJUnit @Test public void testParsingIgnoringEmptyLinesAndHashedLines() { - StringReader source = new StringReader("alpha\tbeta\n\n# hello world\n11\t12\n\t22\n31\n\n"); - TabSeparatedValueTable table = new TabSeparatedValueTable(source, "", true, true); + StringReader source = new StringReader("alpha\tbeta\n" + + "\n" + + "# hello world\n" + + "11\t12\n" + + "\t22\n" + + "31\n" + + "\n"); + TabSeparatedValueTable table = new TabSeparatedValueTable(source, "", true, false, true); List<Column> columns = table.getColumns(); assertEquals(2, columns.size()); @@ -134,6 +142,22 @@ public class TabSeparatedValueTableTest extends AssertJUnit assertEquals(null, table.tryToGetNextRow()); } + @Test + public void testParsingStrictRowSize() + { + StringReader source = new StringReader("alpha\tbeta\n1\t2\t\n"); + TabSeparatedValueTable table = new TabSeparatedValueTable(source, "", true, true, true); + + try + { + table.getColumns(); + fail("UserFailureException expected"); + } catch (UserFailureException ex) + { + assertEquals("1. row has 3 instead of 2 cells.", ex.getMessage()); + } + } + @Test public void testGetColumnsCombinedWithIterator() {