From d890c591e53b481d7c697f45ab2093a83120437d Mon Sep 17 00:00:00 2001 From: felmer <felmer> Date: Wed, 30 Apr 2014 10:49:02 +0000 Subject: [PATCH] SSDM-85: FastaUtilities with a method to determine whether a string is a sequence of nucleic acid codes or amino acid codes. SVN: 31436 --- .../cisd/common/fasta/FastaUtilities.java | 88 +++++++++++++++++++ .../cisd/common/fasta/SequenceType.java | 34 +++++++ .../cisd/common/fasta/FastaUtilitiesTest.java | 73 +++++++++++++++ 3 files changed, 195 insertions(+) create mode 100644 common/source/java/ch/systemsx/cisd/common/fasta/FastaUtilities.java create mode 100644 common/source/java/ch/systemsx/cisd/common/fasta/SequenceType.java create mode 100644 common/sourceTest/java/ch/systemsx/cisd/common/fasta/FastaUtilitiesTest.java diff --git a/common/source/java/ch/systemsx/cisd/common/fasta/FastaUtilities.java b/common/source/java/ch/systemsx/cisd/common/fasta/FastaUtilities.java new file mode 100644 index 00000000000..6d79e381033 --- /dev/null +++ b/common/source/java/ch/systemsx/cisd/common/fasta/FastaUtilities.java @@ -0,0 +1,88 @@ +/* + * Copyright 2014 ETH Zuerich, SIS + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package ch.systemsx.cisd.common.fasta; + +import java.util.Arrays; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +import ch.systemsx.cisd.common.utilities.Counters; + +/** + * Utility methods for FASTA files. 
+ *
+ * @author Franz-Josef Elmer
+ */
+public class FastaUtilities
+{
+    /**
+     * Nucleic acid codes as used in FASTA files (see https://en.wikipedia.org/wiki/FASTA_format).
+     */
+    public static final List<Character> NUCLEIC_ACID_CODES = Arrays.asList('A', 'C', 'G', 'T', 'U', 'R', 'Y',
+            'K', 'M', 'S', 'W', 'B', 'D', 'H', 'V', 'N', 'X', '-');
+
+    /**
+     * Amino acid codes as used in FASTA files (see https://en.wikipedia.org/wiki/FASTA_format).
+     */
+    public static final List<Character> AMINO_ACID_CODES = Arrays.asList('A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
+            'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'Y', 'Z', 'X', '*', '-');
+
+    private static final Set<Character> NUCLEIC_ACID_CODES_SET = new HashSet<Character>(NUCLEIC_ACID_CODES);
+    private static final Set<Character> AMINO_ACID_CODES_SET = new HashSet<Character>(AMINO_ACID_CODES);
+
+    /**
+     * Determines the sequence type from the specified line of a FASTA file.
+     *
+     * @param line Line from a FASTA file. Can be in lowercase.
+     * @return NUCL if the line holds only the codes A, C, G, T, U, N and not both T and U; otherwise PROT.
+     * @throws IllegalArgumentException if the line contains a character, at any position, which is
+     *             neither from NUCLEIC_ACID_CODES nor AMINO_ACID_CODES.
+     */
+    public static SequenceType determineSequenceType(String line)
+    {
+        Counters<Character> counters = new Counters<Character>();
+        boolean onlyNucleicAcidCodes = true;
+        for (char symbol : line.toCharArray())
+        {
+            char c = Character.toUpperCase(symbol); // locale-independent, unlike String.toUpperCase()
+            boolean isNucleicAcidCode = NUCLEIC_ACID_CODES_SET.contains(c);
+            if (isNucleicAcidCode == false && AMINO_ACID_CODES_SET.contains(c) == false)
+            {
+                throw new IllegalArgumentException("Invalid symbol '" + c + "' in line '" + line + "'.");
+            }
+            onlyNucleicAcidCodes &= isNucleicAcidCode;
+            counters.count(c);
+        }
+        if (onlyNucleicAcidCodes == false || counters.getNumberOfDifferentObjectsCounted() > 5
+                || containsUAndT(counters))
+        {
+            return SequenceType.PROT;
+        }
+        int nonCommonNucleicAcidCodeSites = line.length();
+        for (Character c : "ACGTUN".toCharArray())
+        {
+            nonCommonNucleicAcidCodeSites -= counters.getCountOf(c);
+        }
+        return nonCommonNucleicAcidCodeSites == 0 ? SequenceType.NUCL : SequenceType.PROT;
+    }
+
+    private static boolean containsUAndT(Counters<Character> counters)
+    {
+        return counters.getCountOf('T') > 0 && counters.getCountOf('U') > 0;
+    }
+}
diff --git a/common/source/java/ch/systemsx/cisd/common/fasta/SequenceType.java b/common/source/java/ch/systemsx/cisd/common/fasta/SequenceType.java
new file mode 100644
index 00000000000..279b61d5fec
--- /dev/null
+++ b/common/source/java/ch/systemsx/cisd/common/fasta/SequenceType.java
@@ -0,0 +1,34 @@
+/*
+ * Copyright 2014 ETH Zuerich, SIS
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package ch.systemsx.cisd.common.fasta;
+
+/**
+ * Type of a sequence: made of nucleic acid codes or of amino acid codes.
+ *
+ * @author Franz-Josef Elmer
+ */
+public enum SequenceType
+{
+    /**
+     * Sequence of nucleic acid codes.
+     */
+    NUCL,
+    /**
+     * Sequence of amino acid codes.
+     */
+    PROT
+}
diff --git a/common/sourceTest/java/ch/systemsx/cisd/common/fasta/FastaUtilitiesTest.java b/common/sourceTest/java/ch/systemsx/cisd/common/fasta/FastaUtilitiesTest.java
new file mode 100644
index 00000000000..e9b611ce039
--- /dev/null
+++ b/common/sourceTest/java/ch/systemsx/cisd/common/fasta/FastaUtilitiesTest.java
@@ -0,0 +1,73 @@
+/*
+ * Copyright 2014 ETH Zuerich, SIS
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package ch.systemsx.cisd.common.fasta;
+
+import org.testng.AssertJUnit;
+import org.testng.annotations.Test;
+
+/**
+ * Tests for {@link FastaUtilities#determineSequenceType(String)}.
+ *
+ * @author Franz-Josef Elmer
+ */
+public class FastaUtilitiesTest extends AssertJUnit
+{
+
+    @Test
+    public void testLineWithInvalidCharacter()
+    {
+        try
+        {
+            FastaUtilities.determineSequenceType("ABC3DEF");
+            fail("IllegalArgumentException expected for invalid symbol '3'");
+        } catch (IllegalArgumentException ex)
+        {
+            assertEquals("Invalid symbol '3' in line 'ABC3DEF'.", ex.getMessage());
+        }
+    }
+
+    @Test
+    public void testLineWithExclusiveAminoAcidCodes()
+    {
+        assertEquals(SequenceType.PROT, FastaUtilities.determineSequenceType("abij"));
+    }
+
+    @Test
+    public void testLineWithMoreThanFiveDifferentCodes()
+    {
+        assertEquals(SequenceType.PROT, FastaUtilities.determineSequenceType("ABCDEFG"));
+    }
+
+    @Test
+    public void testLineWithMoreUAndT()
+    {
+        assertEquals(SequenceType.PROT, FastaUtilities.determineSequenceType("UT"));
+    }
+
+    @Test
+    public void testLineWithOnlyNuclCodesWithT()
+    {
+        assertEquals(SequenceType.NUCL, FastaUtilities.determineSequenceType("AAGGCCTTN"));
+    }
+
+    @Test
+    public void testLineWithOnlyNuclCodesWithU()
+    {
+        assertEquals(SequenceType.NUCL, FastaUtilities.determineSequenceType("aagcunnau"));
+    }
+
+}
-- 
GitLab