From d890c591e53b481d7c697f45ab2093a83120437d Mon Sep 17 00:00:00 2001
From: felmer <felmer>
Date: Wed, 30 Apr 2014 10:49:02 +0000
Subject: [PATCH] SSDM-85: FastaUtilities with a method to determine whether a
 string is a sequence of nucleic acid codes or amino acid codes.

SVN: 31436
---
 .../cisd/common/fasta/FastaUtilities.java     | 88 +++++++++++++++++++
 .../cisd/common/fasta/SequenceType.java       | 34 +++++++
 .../cisd/common/fasta/FastaUtilitiesTest.java | 73 +++++++++++++++
 3 files changed, 195 insertions(+)
 create mode 100644 common/source/java/ch/systemsx/cisd/common/fasta/FastaUtilities.java
 create mode 100644 common/source/java/ch/systemsx/cisd/common/fasta/SequenceType.java
 create mode 100644 common/sourceTest/java/ch/systemsx/cisd/common/fasta/FastaUtilitiesTest.java

diff --git a/common/source/java/ch/systemsx/cisd/common/fasta/FastaUtilities.java b/common/source/java/ch/systemsx/cisd/common/fasta/FastaUtilities.java
new file mode 100644
index 00000000000..6d79e381033
--- /dev/null
+++ b/common/source/java/ch/systemsx/cisd/common/fasta/FastaUtilities.java
@@ -0,0 +1,88 @@
+/*
+ * Copyright 2014 ETH Zuerich, SIS
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package ch.systemsx.cisd.common.fasta;
+
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+import ch.systemsx.cisd.common.utilities.Counters;
+
+/**
+ * Utility methods for FASTA files.
+ *
+ * @author Franz-Josef Elmer
+ */
+public class FastaUtilities
+{
+    /** Nucleic acid codes as used in FASTA files (see https://en.wikipedia.org/wiki/FASTA_format). */
+    public static final List<Character> NUCLEIC_ACID_CODES = Arrays.asList('A', 'C', 'G', 'T', 'U', 'R', 'Y',
+            'K', 'M', 'S', 'W', 'B', 'D', 'H', 'V', 'N', 'X', '-');
+
+    /** Amino acid codes as used in FASTA files (see https://en.wikipedia.org/wiki/FASTA_format). */
+    public static final List<Character> AMINO_ACID_CODES = Arrays.asList('A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
+            'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'Y', 'Z', 'X', '*', '-');
+
+    private static final Set<Character> NUCLEIC_ACID_CODES_SET = new HashSet<Character>(NUCLEIC_ACID_CODES);
+    private static final Set<Character> AMINO_ACID_CODES_SET = new HashSet<Character>(AMINO_ACID_CODES);
+
+    /**
+     * Determines the sequence type from the specified line of a FASTA file. The result is
+     * {@link SequenceType#NUCL} only if every symbol is one of A, C, G, T, U or N and the line does not
+     * contain both T and U; otherwise it is {@link SequenceType#PROT}. An empty line yields NUCL.
+     *
+     * @param line Line from a FASTA file. Can be in lowercase. Must not be <code>null</code>.
+     * @return the detected sequence type
+     * @throws IllegalArgumentException if a character is found which is neither from NUCLEIC_ACID_CODES
+     *      nor from AMINO_ACID_CODES. Scanning stops at the first code which is only an amino acid code,
+     *      so invalid characters after such a code are not detected.
+     */
+    public static SequenceType determineSequenceType(String line)
+    {
+        Counters<Character> counters = new Counters<Character>();
+        for (char symbol : line.toCharArray())
+        {
+            // Upper-case per character: String.toUpperCase() depends on the default locale (Turkish 'i')
+            // and can change the length of the string, which would break the site count below.
+            char c = Character.toUpperCase(symbol);
+            boolean isNucleicAcidCode = NUCLEIC_ACID_CODES_SET.contains(c);
+            if (isNucleicAcidCode == false && AMINO_ACID_CODES_SET.contains(c) == false)
+            {
+                throw new IllegalArgumentException("Invalid symbol '" + c + "' in line '" + line + "'.");
+            }
+            if (isNucleicAcidCode == false)
+            {
+                return SequenceType.PROT;
+            }
+            counters.count(c);
+        }
+        // More than five different codes, or T together with U: classified as amino acid sequence.
+        if (counters.getNumberOfDifferentObjectsCounted() > 5
+                || (counters.getCountOf('T') > 0 && counters.getCountOf('U') > 0))
+        {
+            return SequenceType.PROT;
+        }
+        // Only nucleic acid codes were counted; subtract the common ones to see whether any other remains.
+        int nonCommonNucleicAcidCodeSites = line.length();
+        for (Character c : "ACGTUN".toCharArray())
+        {
+            nonCommonNucleicAcidCodeSites -= counters.getCountOf(c);
+        }
+        return nonCommonNucleicAcidCodeSites == 0 ? SequenceType.NUCL : SequenceType.PROT;
+    }
+}
diff --git a/common/source/java/ch/systemsx/cisd/common/fasta/SequenceType.java b/common/source/java/ch/systemsx/cisd/common/fasta/SequenceType.java
new file mode 100644
index 00000000000..279b61d5fec
--- /dev/null
+++ b/common/source/java/ch/systemsx/cisd/common/fasta/SequenceType.java
@@ -0,0 +1,34 @@
+/*
+ * Copyright 2014 ETH Zuerich, SIS
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package ch.systemsx.cisd.common.fasta;
+
+/**
+ * Enum of sequence types. 
+ *
+ * @author Franz-Josef Elmer
+ */
+public enum SequenceType
+{
+    /**
+     * Nucleic acid sequence, i.e. a sequence of nucleic acid codes.
+     */
+    NUCL,
+    /**
+     * Amino acid sequence, i.e. a sequence of amino acid codes.
+     */
+    PROT;
+}
diff --git a/common/sourceTest/java/ch/systemsx/cisd/common/fasta/FastaUtilitiesTest.java b/common/sourceTest/java/ch/systemsx/cisd/common/fasta/FastaUtilitiesTest.java
new file mode 100644
index 00000000000..e9b611ce039
--- /dev/null
+++ b/common/sourceTest/java/ch/systemsx/cisd/common/fasta/FastaUtilitiesTest.java
@@ -0,0 +1,73 @@
+/*
+ * Copyright 2014 ETH Zuerich, SIS
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package ch.systemsx.cisd.common.fasta;
+
+
+import org.testng.AssertJUnit;
+import org.testng.annotations.Test;
+
+/**
+ * Tests for {@link FastaUtilities#determineSequenceType(String)}.
+ *
+ * @author Franz-Josef Elmer
+ */
+public class FastaUtilitiesTest extends AssertJUnit
+{
+    @Test
+    public void testLineWithInvalidCharacter()
+    {
+        try
+        {
+            FastaUtilities.determineSequenceType("ABC3DEF");
+            // Reaching this line means that no exception has been thrown.
+            fail("IllegalArgumentException expected");
+        } catch (IllegalArgumentException ex)
+        {
+            assertEquals("Invalid symbol '3' in line 'ABC3DEF'.", ex.getMessage());
+        }
+    }
+
+    @Test
+    public void testLineWithExclusiveAminoAcidCodes()
+    {
+        assertEquals(SequenceType.PROT, FastaUtilities.determineSequenceType("abij"));
+    }
+
+    @Test
+    public void testLineWithMoreThanFiveDifferentCodes()
+    {
+        assertEquals(SequenceType.PROT, FastaUtilities.determineSequenceType("ABCDEFG"));
+    }
+
+    @Test
+    public void testLineWithMoreUAndT()
+    {
+        assertEquals(SequenceType.PROT, FastaUtilities.determineSequenceType("UT"));
+    }
+
+    @Test
+    public void testLineWithOnlyNuclCodesWithT()
+    {
+        assertEquals(SequenceType.NUCL, FastaUtilities.determineSequenceType("AAGGCCTTN"));
+    }
+
+    @Test
+    public void testLineWithOnlyNuclCodesWithU()
+    {
+        assertEquals(SequenceType.NUCL, FastaUtilities.determineSequenceType("aagcunnau"));
+    }
+}
-- 
GitLab