From 44bbdc34f7191a1cf3afe37bfc5fac9f22ada81f Mon Sep 17 00:00:00 2001
From: izabel <izabel>
Date: Wed, 6 Oct 2010 07:45:34 +0000
Subject: [PATCH] [LMS-1827] allow to search for A#B in non-wildcard mode

SVN: 18161
---
 .../dataaccess/db/search/CharacterHelper.java | 68 +++++++++++++++++++
 .../db/search/LuceneQueryBuilder.java         | 51 ++++++++++++--
 .../dataaccess/db/search/SearchAnalyzer.java  | 29 +-------
 .../db/search/LuceneQueryBuilderTest.java     |  8 ++-
 4 files changed, 123 insertions(+), 33 deletions(-)
 create mode 100644 openbis/source/java/ch/systemsx/cisd/openbis/generic/server/dataaccess/db/search/CharacterHelper.java

diff --git a/openbis/source/java/ch/systemsx/cisd/openbis/generic/server/dataaccess/db/search/CharacterHelper.java b/openbis/source/java/ch/systemsx/cisd/openbis/generic/server/dataaccess/db/search/CharacterHelper.java
new file mode 100644
index 00000000000..0c8d82d702a
--- /dev/null
+++ b/openbis/source/java/ch/systemsx/cisd/openbis/generic/server/dataaccess/db/search/CharacterHelper.java
@@ -0,0 +1,68 @@
+/*
+ * Copyright 2010 ETH Zuerich, CISD
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package ch.systemsx.cisd.openbis.generic.server.dataaccess.db.search;
+
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.HashSet;
+import java.util.Set;
+
+/**
+ * Gathers characters that need special treatment and methods useful during indexing.
+ * 
+ * @author Izabela Adamczyk
+ */
+public class CharacterHelper
+{
+    private static final char ESCAPE_CHARACTER = '\\';
+
+    /** special chars allowed in codes ('.', ':', '-', '_') or in words ('\''); read-only */
+    public final static Set<Character> SPECIAL_CHARACTERS = java.util.Collections
+            .unmodifiableSet(new HashSet<Character>(Arrays.asList('.', ':', '-', '_', '\'')));
+
+    /** those of special chars that should be trimmed ('-' and '_' may matter in identifiers) */
+    private final static Set<Character> TRIMMED_CHARACTERS = java.util.Collections
+            .unmodifiableSet(new HashSet<Character>(Arrays.asList('.', ':', '\'')));
+
+    /** @return <code>true</code> for letters, digits and {@link #SPECIAL_CHARACTERS} */
+    public static boolean isTokenCharacter(char c)
+    {
+        return Character.isLetterOrDigit(c) || SPECIAL_CHARACTERS.contains(c);
+    }
+
+    /**
+     * @return a new collection of the characters 32-255 that are neither token nor escape chars
+     */
+    public static Collection<Character> getTokenSeparators()
+    {
+        Set<Character> separators = new HashSet<Character>();
+        for (char ch = 32; ch < 256; ch++)
+        {
+            if (isTokenCharacter(ch) == false && ch != ESCAPE_CHARACTER)
+            {
+                separators.add(ch);
+            }
+        }
+        return separators;
+    }
+
+    /** @return the read-only set of special characters that should be trimmed */
+    public static Set<Character> getTrimmedSpecialCharacters()
+    {
+        return TRIMMED_CHARACTERS;
+    }
+}
\ No newline at end of file
diff --git a/openbis/source/java/ch/systemsx/cisd/openbis/generic/server/dataaccess/db/search/LuceneQueryBuilder.java b/openbis/source/java/ch/systemsx/cisd/openbis/generic/server/dataaccess/db/search/LuceneQueryBuilder.java
index 6eb42addbeb..8dc1ded3a50 100644
--- a/openbis/source/java/ch/systemsx/cisd/openbis/generic/server/dataaccess/db/search/LuceneQueryBuilder.java
+++ b/openbis/source/java/ch/systemsx/cisd/openbis/generic/server/dataaccess/db/search/LuceneQueryBuilder.java
@@ -16,14 +16,17 @@
 
 package ch.systemsx.cisd.openbis.generic.server.dataaccess.db.search;
 
+import java.util.ArrayList;
+import java.util.Collection;
 import java.util.List;
 
+import org.apache.commons.lang.StringUtils;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.queryParser.ParseException;
 import org.apache.lucene.queryParser.QueryParser;
+import org.apache.lucene.search.BooleanClause.Occur;
 import org.apache.lucene.search.BooleanQuery;
 import org.apache.lucene.search.Query;
-import org.apache.lucene.search.BooleanClause.Occur;
 
 import ch.systemsx.cisd.common.exceptions.UserFailureException;
 import ch.systemsx.cisd.openbis.generic.server.dataaccess.db.search.detailed.DetailedQueryBuilder;
@@ -36,12 +39,22 @@ import ch.systemsx.cisd.openbis.generic.shared.translator.DtoConverters;
  */
 public class LuceneQueryBuilder
 {
+    // operators of the query syntax (kept verbatim when wildcards are added to a query)
+    private static final String NOT = "NOT";
+    private static final String OR = "OR";
+    private static final String AND = "AND";
+
+    // wildcard wrapped around the search terms in basic search mode
+    private static final char STAR = '*';
+    // separator used to split a query into tokens and to join the tokens again
+    private static final char SPACE = ' ';
+
     /** @throws UserFailureException when some search patterns are incorrect */
     public static Query createDetailedSearchQuery(DetailedSearchCriteria searchCriteria,
             EntityKind entityKind)
     {
-        return DetailedQueryBuilder.createQuery(searchCriteria, DtoConverters
-                .convertEntityKind(entityKind));
+        return DetailedQueryBuilder.createQuery(searchCriteria,
+                DtoConverters.convertEntityKind(entityKind));
     }
 
     private static final char FIELD_SEPARATOR = ':';
@@ -68,11 +81,41 @@ public class LuceneQueryBuilder
         // add '*' wildcard at the beginning and at the end of the query in basic search mode
         if (useWildcardSearchMode == false && isQuoted(result) == false)
         {
-            result = '*' + result + '*';
+            result = addWildcards(result);
         }
         return result;
     }
 
+    /** Adds wildcards to each term of a basic-mode query; AND, OR and NOT stay untouched. */
+    private static String addWildcards(String result)
+    {
+        List<String> transformedTokens = new ArrayList<String>();
+        for (String qt : StringUtils.split(result, SPACE))
+        {
+            boolean operator = qt.equals(AND) || qt.equals(OR) || qt.equals(NOT);
+            String transformed = operator ? qt : addWildcardsToToken(qt);
+            if (transformed.length() > 0)
+            {
+                transformedTokens.add(transformed);
+            }
+        }
+        return StringUtils.join(transformedTokens, SPACE);
+    }
+
+    /** Maps "a#b" to "(*a* AND *b*)" and a separators-only token to "" ("()" cannot be parsed). */
+    private static String addWildcardsToToken(String token)
+    {
+        Collection<Character> tokenSeparators = CharacterHelper.getTokenSeparators();
+        String[] miniTokens = StringUtils.split(token, StringUtils.join(tokenSeparators, ""));
+        List<String> transformedMiniTokens = new ArrayList<String>();
+        for (String qt : miniTokens)
+        {
+            transformedMiniTokens.add(STAR + qt + STAR);
+        }
+        String joined = StringUtils.join(transformedMiniTokens, SPACE + AND + SPACE);
+        return joined.length() == 0 ? "" : '(' + joined + ')';
+    }
+
     private static boolean isQuoted(String result)
     {
         return result.startsWith("\"") && result.endsWith("\"");
diff --git a/openbis/source/java/ch/systemsx/cisd/openbis/generic/server/dataaccess/db/search/SearchAnalyzer.java b/openbis/source/java/ch/systemsx/cisd/openbis/generic/server/dataaccess/db/search/SearchAnalyzer.java
index 927394d919d..0890266995b 100644
--- a/openbis/source/java/ch/systemsx/cisd/openbis/generic/server/dataaccess/db/search/SearchAnalyzer.java
+++ b/openbis/source/java/ch/systemsx/cisd/openbis/generic/server/dataaccess/db/search/SearchAnalyzer.java
@@ -17,8 +17,6 @@
 package ch.systemsx.cisd.openbis.generic.server.dataaccess.db.search;
 
 import java.io.Reader;
-import java.util.Arrays;
-import java.util.HashSet;
 import java.util.Set;
 
 import org.apache.lucene.analysis.Analyzer;
@@ -47,26 +45,13 @@ public class SearchAnalyzer extends Analyzer
 
     /**
      * A tokenizer that divides text at chars different than letters, digits and special chars
-     * allowed in codes ('.', ':', '-', '_') or words (like apostrophe).
+     * defined in {@link CharacterHelper}.
      * <p>
      * Additionally it normalizes token text to lower case (with a performance gain compared to
      * using LowerCaseFilter after tokenization).
      */
     private static class WordAndCodeTokenizer extends CharTokenizer
     {
-        /** special characters allowed in codes */
-        private final static Character[] SPECIAL_CODE_CHARS =
-            { '.', ':', '-', '_' };
-
-        /** special characters allowed in words (separated from code chars for clarity) */
-        private final static Character[] SPECIAL_WORD_CHARS =
-            { '\'' };
-
-        private final static Set<Character> specialCharacters = new HashSet<Character>();
-        {
-            specialCharacters.addAll(Arrays.asList(SPECIAL_CODE_CHARS));
-            specialCharacters.addAll(Arrays.asList(SPECIAL_WORD_CHARS));
-        }
 
         public WordAndCodeTokenizer(Reader input)
         {
@@ -76,7 +61,7 @@ public class SearchAnalyzer extends Analyzer
         @Override
         protected boolean isTokenChar(char c)
         {
-            return Character.isLetterOrDigit(c) || specialCharacters.contains(c);
+            return CharacterHelper.isTokenCharacter(c);
         }
 
         @Override
@@ -91,15 +76,6 @@ public class SearchAnalyzer extends Analyzer
      */
     private static final class TrimSpecialCharsFilter extends TokenFilter
     {
-        // (don't trim '-' or '_' because they may have special meaning in identifiers)
-        /** those of special chars that should be trimmed */
-        private final static Character[] TRIMMED_SPECIAL_CHARS =
-            { '.', ':', '\'' };
-
-        private final static Set<Character> trimmedCharacters = new HashSet<Character>();
-        {
-            trimmedCharacters.addAll(Arrays.asList(TRIMMED_SPECIAL_CHARS));
-        }
 
         public TrimSpecialCharsFilter(TokenStream input)
         {
@@ -119,6 +95,7 @@ public class SearchAnalyzer extends Analyzer
             final int bufferLength = nextToken.termLength();
 
             int startCounter = 0; // counts chars to trim from the beginning
+            Set<Character> trimmedCharacters = CharacterHelper.getTrimmedSpecialCharacters();
             for (int i = 0; i < bufferLength; i++)
             {
                 if (trimmedCharacters.contains(buffer[i]))
diff --git a/openbis/sourceTest/java/ch/systemsx/cisd/openbis/generic/server/dataaccess/db/search/LuceneQueryBuilderTest.java b/openbis/sourceTest/java/ch/systemsx/cisd/openbis/generic/server/dataaccess/db/search/LuceneQueryBuilderTest.java
index ad218283384..38539257373 100644
--- a/openbis/sourceTest/java/ch/systemsx/cisd/openbis/generic/server/dataaccess/db/search/LuceneQueryBuilderTest.java
+++ b/openbis/sourceTest/java/ch/systemsx/cisd/openbis/generic/server/dataaccess/db/search/LuceneQueryBuilderTest.java
@@ -33,8 +33,9 @@ public class LuceneQueryBuilderTest extends AssertJUnit
     {
         return new Object[][]
             {
-                { "abc", "*abc*" },
-                { "code:CP registrator:Joe", "*code\\:CP registrator\\:Joe*" } };
+                { "abc", "(*abc*)" },
+                { "code:CP registrator:Joe", "(*code\\:CP*) (*registrator\\:Joe*)" },
+                { "ab#c OR d", "(*ab* AND *c*) OR (*d*)" } };
     }
 
     @DataProvider(name = "wildcardModeQueries")
@@ -43,7 +44,8 @@ public class LuceneQueryBuilderTest extends AssertJUnit
         return new Object[][]
             {
                 { "abc", "abc" },
-                { "code:CP registrator:Joe", "code\\:CP registrator\\:Joe" } };
+                { "code:CP registrator:Joe", "code\\:CP registrator\\:Joe" },
+                { "*ab#c OR d", "*ab#c OR d" } };
     }
 
     @Test(dataProvider = "basicModeQueries")
-- 
GitLab