diff --git a/common/source/java/ch/systemsx/cisd/common/fasta/FastaUtilities.java b/common/source/java/ch/systemsx/cisd/common/fasta/FastaUtilities.java index 830f04fea218a3cfff8a4c13596c7ad791139e61..4cada4b2655ae107759a322434b949898952cc02 100644 --- a/common/source/java/ch/systemsx/cisd/common/fasta/FastaUtilities.java +++ b/common/source/java/ch/systemsx/cisd/common/fasta/FastaUtilities.java @@ -35,17 +35,68 @@ public class FastaUtilities */ public static final List<Character> NUCLEIC_ACID_CODES = Arrays.asList('A', 'C', 'G', 'T', 'U', 'R', 'Y', 'K', 'M', 'S', 'W', 'B', 'D', 'H', 'V', 'N', 'X', '-'); - + + /** Unambiguous nucleotide codes (subset of {@link #NUCLEIC_ACID_CODES}). */ public static final List<Character> STRICT_NUCLEIC_ACID_CODES = Arrays.asList('A', 'T', 'U', 'C', 'G'); + /** * Amino acid codes as used in FASTA files (see https://en.wikipedia.org/wiki/FASTA_format). */ public static final List<Character> AMINO_ACID_CODES = Arrays.asList('A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'Y', 'Z', 'X', '*', '-'); + + /** Codes of the 20 standard amino acids (subset of {@link #AMINO_ACID_CODES}). */ public static final List<Character> STRICT_AMINO_ACID_CODES = Arrays.asList('A', 'R', 'N', 'D', 'C', 'E', + 'Q', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V'); private static final Set<Character> NUCLEIC_ACID_CODES_SET = new HashSet<Character>(NUCLEIC_ACID_CODES); + private static final Set<Character> STRICT_NUCLEIC_ACID_CODES_SET = new HashSet<Character>(STRICT_NUCLEIC_ACID_CODES); + private static final Set<Character> AMINO_ACID_CODES_SET = new HashSet<Character>(AMINO_ACID_CODES); + private static final Set<Character> STRICT_AMINO_ACID_CODES_SET = new HashSet<Character>(STRICT_AMINO_ACID_CODES); + + /** + * Returns the sequence type of the specified string or <code>null</code> if undetermined. + * @param line sequence to classify; letter case is ignored; must not be <code>null</code> + * @return {@link SequenceType#PROT} if all characters are from the set STRICT_AMINO_ACID_CODES + * and at least one character is not in the set STRICT_NUCLEIC_ACID_CODES. 
Otherwise + * {@link SequenceType#NUCL} is returned if all characters are from the set STRICT_NUCLEIC_ACID_CODES. + * If none of these two cases is fulfilled (e.g. for an empty string) <code>null</code> is returned. + */ + public static SequenceType determineSequenceTypeOrNull(String line) + { + boolean isAminoAcidSequence = false; + int nuclCounter = 0; + int aminoCounter = 0; + for (char c : line.toCharArray()) /* upper-cased per character below: locale independent and one-to-one, so the counters stay comparable with line.length() */ + { + boolean isNucleicAcidCode = STRICT_NUCLEIC_ACID_CODES_SET.contains(Character.toUpperCase(c)); + boolean isAminoAcidCode = STRICT_AMINO_ACID_CODES_SET.contains(Character.toUpperCase(c)); + if (isNucleicAcidCode == false && isAminoAcidCode == false) + { + return null; + } + if (isNucleicAcidCode) + { + nuclCounter++; + } + if (isAminoAcidCode) + { + aminoCounter++; + if (isNucleicAcidCode == false) + { + isAminoAcidSequence = true; + } + } + } + if (aminoCounter == line.length() && isAminoAcidSequence) + { + return SequenceType.PROT; + } + return nuclCounter > 0 && nuclCounter == line.length() ? SequenceType.NUCL : null; /* nuclCounter > 0 keeps the empty string undetermined */ + } + + /** * Determines the sequence type from the specified line of a FASTA file. 
* diff --git a/common/sourceTest/java/ch/systemsx/cisd/common/fasta/FastaUtilitiesTest.java b/common/sourceTest/java/ch/systemsx/cisd/common/fasta/FastaUtilitiesTest.java index 45d67be46a6742d99d5380aa3860d7e965eeca04..ce6d260fdbe02979d529defc2c0d8b0f4ab31d6b 100644 --- a/common/sourceTest/java/ch/systemsx/cisd/common/fasta/FastaUtilitiesTest.java +++ b/common/sourceTest/java/ch/systemsx/cisd/common/fasta/FastaUtilitiesTest.java @@ -78,5 +78,35 @@ public class FastaUtilitiesTest extends AssertJUnit { assertEquals(SequenceType.PROT, FastaUtilities.determineSequenceType("TLI IGGBC")); } + + @Test + public void testDetermineSequenceTypeOrNullForPureU() + { + /* 'U' is a strict nucleic acid code but not a strict amino acid code */ assertEquals(SequenceType.NUCL, FastaUtilities.determineSequenceTypeOrNull("UUU")); + } + + @Test + public void testDetermineSequenceTypeOrNullForPureNuclSequence() + { + /* every letter is also a strict amino acid code; NUCL is expected because no letter is amino-only */ assertEquals(SequenceType.NUCL, FastaUtilities.determineSequenceTypeOrNull("GATTACA")); + } + + @Test + public void testDetermineSequenceTypeOrNullForProtSequence() + { + /* 'E' is a strict amino acid code only, which makes the sequence a protein */ assertEquals(SequenceType.PROT, FastaUtilities.determineSequenceTypeOrNull("ACE")); + } + + @Test + public void testDetermineSequenceTypeOrNullForMixOfPureNuclCharactersAndPureAminoCharacters() + { + /* 'U' is nucleic-only and 'V' is amino-only, so neither type covers all characters */ assertEquals(null, FastaUtilities.determineSequenceTypeOrNull("UV")); + } + + @Test + public void testDetermineSequenceTypeOrNullForUnknownCharacter() + { + /* 'B' and 'Z' are in neither strict code set */ assertEquals(null, FastaUtilities.determineSequenceTypeOrNull("ABCZ")); + } } diff --git a/datastore_server/source/java/ch/systemsx/cisd/etlserver/plugins/BlastDatabaseCreationMaintenanceTask.java b/datastore_server/source/java/ch/systemsx/cisd/etlserver/plugins/BlastDatabaseCreationMaintenanceTask.java index f299a4c263c681ea1d0dbd8f03cfd9c4d4e9341d..9d8b97ac8d61b14b70f3eb5874a65dc4d78ec7be 100644 --- a/datastore_server/source/java/ch/systemsx/cisd/etlserver/plugins/BlastDatabaseCreationMaintenanceTask.java +++ b/datastore_server/source/java/ch/systemsx/cisd/etlserver/plugins/BlastDatabaseCreationMaintenanceTask.java @@ -39,7 
+39,6 @@ import java.util.Set; import java.util.StringTokenizer; import java.util.TreeSet; import java.util.regex.Pattern; -import java.util.regex.PatternSyntaxException; import org.apache.commons.io.FilenameUtils; import org.apache.commons.io.IOUtils; @@ -566,17 +565,21 @@ public class BlastDatabaseCreationMaintenanceTask implements IMaintenanceTask { if (property.getPropertyType().getCode().equals(propertyType)) { - String sequence = property.tryGetAsString(); + /* whitespace stripped and upper-cased; a null value stays null */ String sequence = normalize(property.tryGetAsString()); if (sequence != null) { - Sequence seq = new Sequence(entity, propertyType, sequence); - Sequences sequences = map.get(seq.getSequenceType()); - if (sequences == null) + SequenceType sequenceType = FastaUtilities.determineSequenceTypeOrNull(sequence); + /* a sequence whose type cannot be determined is not added to any group */ if (sequenceType != null) { - sequences = new Sequences(); - map.put(seq.getSequenceType(), sequences); + Sequence seq = new Sequence(entity, propertyType, sequenceType, sequence); + Sequences sequences = map.get(seq.getSequenceType()); + if (sequences == null) + { + sequences = new Sequences(); + map.put(seq.getSequenceType(), sequences); + } + sequences.addSequence(seq); } - sequences.addSequence(seq); } break; } @@ -611,6 +614,24 @@ public class BlastDatabaseCreationMaintenanceTask implements IMaintenanceTask return latestModificationDate; } } + + /** Returns the given sequence without whitespace characters and converted to upper case, or null if it is null. */ private static String normalize(String sequence) + { + if (sequence == null) + { + return null; + } + StringBuilder builder = new StringBuilder(); + for (int i = 0; i < sequence.length(); i++) + { + char c = sequence.charAt(i); + if (Character.isWhitespace(c) == false) + { + builder.append(Character.toUpperCase(c)); + } + } + return builder.toString(); + } private static final class Sequence { @@ -625,10 +646,12 @@ public class BlastDatabaseCreationMaintenanceTask implements IMaintenanceTask private final SequenceType sequenceType; @SuppressWarnings("rawtypes") - Sequence(IEntityInformationHolderWithProperties entity, String propertyType, String sequence) 
+ Sequence(IEntityInformationHolderWithProperties entity, String propertyType, + SequenceType sequenceType, String sequence) { this.propertyType = propertyType; - this.sequence = removeWhiteSpaces(sequence); + this.sequenceType = sequenceType; + /* stored unchanged; whitespace is no longer removed in the constructor */ this.sequence = sequence; permId = entity.getPermId(); Date date = null; if (entity instanceof CodeWithRegistrationAndModificationDate) @@ -636,22 +659,6 @@ public class BlastDatabaseCreationMaintenanceTask implements IMaintenanceTask date = ((CodeWithRegistrationAndModificationDate) entity).getModificationDate(); } modificationDate = date == null ? new Date() : date; - sequenceType = FastaUtilities.determineSequenceType(this.sequence); - } - - private static String removeWhiteSpaces(String sequence) - { - StringBuilder builder = new StringBuilder(); - for (int i = 0; i < sequence.length(); i++) - { - char c = sequence.charAt(i); - if (Character.isWhitespace(c) == false) - { - builder.append(c); - } - } - String string = builder.toString(); - return string; } String getPermId()