Skip to content
Snippets Groups Projects
Commit 96f99e10 authored by buczekp's avatar buczekp
Browse files

[LMS-1258] removed word separators from the search

SVN: 13245
parent c8d86f17
No related branches found
No related tags found
No related merge requests found
...@@ -25,7 +25,6 @@ import org.apache.lucene.search.BooleanQuery; ...@@ -25,7 +25,6 @@ import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Query; import org.apache.lucene.search.Query;
import org.apache.lucene.search.BooleanClause.Occur; import org.apache.lucene.search.BooleanClause.Occur;
import ch.rinn.restrictions.Private;
import ch.systemsx.cisd.common.exceptions.UserFailureException; import ch.systemsx.cisd.common.exceptions.UserFailureException;
import ch.systemsx.cisd.openbis.generic.server.dataaccess.db.search.detailed.DetailedQueryBuilder; import ch.systemsx.cisd.openbis.generic.server.dataaccess.db.search.detailed.DetailedQueryBuilder;
import ch.systemsx.cisd.openbis.generic.shared.basic.dto.DetailedSearchCriteria; import ch.systemsx.cisd.openbis.generic.shared.basic.dto.DetailedSearchCriteria;
...@@ -48,73 +47,9 @@ public class LuceneQueryBuilder ...@@ -48,73 +47,9 @@ public class LuceneQueryBuilder
public static String adaptQuery(String userQuery) public static String adaptQuery(String userQuery)
{ {
String result = disableFieldQuery(userQuery); String result = disableFieldQuery(userQuery);
result = replaceWordSeparators(result, SeparatorSplitterTokenFilter.WORD_SEPARATORS);
return result; return result;
} }
@Private
static String replaceWordSeparators(String query, char[] wordSeparators)
{
    // Numeric-looking queries (digit at both ends) may legitimately contain
    // separator-like characters (e.g. "3.14") — leave them untouched.
    if (looksLikeNumber(query))
    {
        return query;
    }
    String trimmed = removeSurroundingWordSeparators(query, wordSeparators);
    String anySeparatorRegexp = createAnyWordSeparatorRegexp(wordSeparators);
    String joined = trimmed.replaceAll(anySeparatorRegexp, " AND ");
    // Parenthesize only when a replacement actually happened, so that the
    // injected AND operators bind together as one sub-expression.
    return joined.equals(trimmed) ? trimmed : "(" + joined + ")";
}
/** Heuristic: a query is treated as a number when its first and last characters are digits. */
private static boolean looksLikeNumber(String query)
{
    if (query.length() == 0)
    {
        return false;
    }
    char firstChar = query.charAt(0);
    char lastChar = query.charAt(query.length() - 1);
    return Character.isDigit(firstChar) && Character.isDigit(lastChar);
}
/**
 * Builds a regexp character class matching any of the given separators,
 * e.g. for {'.', ','} the result is "[\.\,]".
 * <p>
 * Each character is backslash-escaped so that regex metacharacters
 * (e.g. '-', which denotes a range inside a character class) are matched literally.
 */
private static String createAnyWordSeparatorRegexp(char[] wordSeparators)
{
    // StringBuilder instead of repeated String concatenation in a loop.
    StringBuilder regexp = new StringBuilder("[");
    for (char separator : wordSeparators)
    {
        regexp.append('\\').append(separator);
    }
    return regexp.append(']').toString();
}
/**
 * Strips leading and trailing separator characters from <var>query</var>.
 * <p>
 * Returns the empty string when the query consists solely of separators.
 */
private static String removeSurroundingWordSeparators(String query, char[] wordSeparators)
{
    int startIx = 0;
    while (startIx < query.length() && isSeparator(query.charAt(startIx), wordSeparators))
    {
        startIx++;
    }
    int endIx = query.length();
    // BUGFIX: the lower bound must be startIx, not 0. With 'endIx > 0' an
    // all-separator query (e.g. "...") ended with startIx == length() and
    // endIx == 0, and substring(startIx, endIx) threw
    // StringIndexOutOfBoundsException.
    while (endIx > startIx && isSeparator(query.charAt(endIx - 1), wordSeparators))
    {
        endIx--;
    }
    return query.substring(startIx, endIx);
}
/** Returns <code>true</code> if <var>ch</var> occurs in <var>wordSeparators</var>. */
private static boolean isSeparator(char ch, char[] wordSeparators)
{
    for (char separator : wordSeparators)
    {
        if (separator == ch)
        {
            return true;
        }
    }
    return false;
}
// disables field query by escaping all field separator characters. // disables field query by escaping all field separator characters.
public static String disableFieldQuery(String userQuery) public static String disableFieldQuery(String userQuery)
{ {
......
...@@ -17,21 +17,67 @@ ...@@ -17,21 +17,67 @@
package ch.systemsx.cisd.openbis.generic.server.dataaccess.db.search; package ch.systemsx.cisd.openbis.generic.server.dataaccess.db.search;
import java.io.Reader; import java.io.Reader;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharTokenizer;
import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
/** /**
* Extends {@link StandardAnalyzer} by applying additional {@link SeparatorSplitterTokenFilter}. * Extends {@link Analyzer} splitting text on characters not allowed in codes or words.
* *
* @author Tomasz Pylak * @author Piotr Buczek
*/ */
public class SearchAnalyzer extends StandardAnalyzer public class SearchAnalyzer extends Analyzer
{ {
@Override @Override
public TokenStream tokenStream(String fieldName, Reader reader) public TokenStream tokenStream(String fieldName, Reader reader)
{ {
TokenStream original = super.tokenStream(fieldName, reader); return new WordAndCodeTokenizer(reader);
return new SeparatorSplitterTokenFilter(original); }
/**
* A tokenizer that divides text at chars other than letters, digits and special chars
* allowed in codes ('.', ':', '-', '_') or words (like apostrophe).
* <p>
* Additionally it normalizes token text to lower case (with a performance gain compared to
* using LowerCaseFilter after tokenization).
*/
private static class WordAndCodeTokenizer extends CharTokenizer
{
/** special characters allowed in codes */
private final static Character[] SPECIAL_CODE_CHARS =
{ '.', ':', '-', '_' };
/** special characters allowed in words (separated from code chars for clarity) */
private final static Character[] SPECIAL_WORD_CHARS =
{ '\'' };
/** All characters (code + word) that are kept inside a single token. */
private final static Set<Character> specialCharacters = new HashSet<Character>();

// BUGFIX: this was an *instance* initializer block populating a static field,
// so the set was re-filled on every tokenizer construction. A static
// initializer runs exactly once, at class load time.
static
{
    specialCharacters.addAll(Arrays.asList(SPECIAL_CODE_CHARS));
    specialCharacters.addAll(Arrays.asList(SPECIAL_WORD_CHARS));
}
/**
 * Creates a tokenizer reading characters from <var>input</var>; all tokenization
 * logic is inherited from CharTokenizer and driven by {@link #isTokenChar(char)}.
 */
public WordAndCodeTokenizer(Reader input)
{
    super(input);
}
/** A character belongs to a token when it is a letter, a digit, or an allowed special char. */
@Override
protected boolean isTokenChar(char c)
{
    if (Character.isLetterOrDigit(c))
    {
        return true;
    }
    return specialCharacters.contains(c);
}
@Override
protected char normalize(char c)
{
    // Lower-case during tokenization — cheaper than a separate LowerCaseFilter pass.
    return Character.toLowerCase(c);
}
} }
} }
/*
* Copyright 2008 ETH Zuerich, CISD
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package ch.systemsx.cisd.openbis.generic.server.dataaccess.db.search;
import java.io.IOException;
import java.util.LinkedList;
import java.util.List;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardTokenizer;
/**
* Used to split tokens further down after standard tokenizer. We need this, because "." which is
 * not followed by a space is not treated as a token separator by default.
*
* @author Tomasz Pylak
*/
public class SeparatorSplitterTokenFilter extends TokenFilter
{
    /** Characters at which tokens produced by the standard tokenizer are split further. */
    static final char[] WORD_SEPARATORS = new char[]
        { '.', ',', '-', '_' };

    private static final String ALPHANUM_TOKEN_TYPE =
            StandardTokenizer.TOKEN_TYPES[StandardTokenizer.ALPHANUM];

    private static final String HOST_TOKEN_TYPE =
            StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HOST];

    private static final String NUM_TOKEN_TYPE =
            StandardTokenizer.TOKEN_TYPES[StandardTokenizer.NUM];

    /** Sub-tokens produced by splitting; drained before the next input token is read. */
    private List<Token> tokens = new LinkedList<Token>();

    protected SeparatorSplitterTokenFilter(TokenStream input)
    {
        super(input);
    }

    /**
     * Returns tokens from standard analysis, split additionally at specified separator
     * characters. Tokens without separators are passed through unchanged.
     */
    @Override
    public final Token next(final Token reusableToken) throws IOException
    {
        if (tokens.size() > 0)
        {
            return extractFirstToken();
        }
        // BUGFIX: loop until some input token yields at least one sub-token (or the
        // input is exhausted). Previously a token consisting solely of separators
        // left the buffer empty and extractFirstToken() failed its assertion.
        while (tokens.size() == 0)
        {
            Token token = input.next(reusableToken);
            // avoid splitting special tokens like e-mails
            if (token == null || isSplittableToken(token) == false)
            {
                return token;
            }
            char[] termText = token.termBuffer();
            int endPos = token.termLength(); // exclusive
            int curPos = 0;
            do
            {
                int nextPos = getSeparatorIndex(termText, curPos, endPos);
                if (nextPos == endPos && tokens.size() == 0)
                {
                    return token; // optimization, no split has occurred
                }
                addToken(token, curPos, nextPos);
                curPos = nextPos + 1;
            } while (curPos < endPos);
        }
        return extractFirstToken();
    }

    /**
     * Returns <code>true</code> for token types that are safe to split at separators.
     * Special types (e-mail, acronym, ...) are passed through untouched.
     */
    private static boolean isSplittableToken(Token token)
    {
        String type = token.type();
        if (type.equals(ALPHANUM_TOKEN_TYPE) || type.equals(HOST_TOKEN_TYPE))
        {
            return true;
        }
        if (type.equals(NUM_TOKEN_TYPE))
        {
            // sometimes the original tokenizer lies to us and reports terms like
            // 'version_3' to be numbers. This is a heuristic to correct those lies:
            // a real number never starts with a letter.
            return Character.isLetter(token.term().charAt(0));
        }
        return false;
    }

    // Returns the position of the first separator character at or after startIndex,
    // or endIndex when none is found.
    private static int getSeparatorIndex(char[] termText, int startIndex, int endIndex)
    {
        for (int i = startIndex; i < endIndex; i++)
        {
            if (isSeparator(termText[i]))
            {
                return i;
            }
        }
        return endIndex;
    }

    /** Returns <code>true</code> if <var>ch</var> is one of {@link #WORD_SEPARATORS}. */
    private static boolean isSeparator(char ch)
    {
        for (int i = 0; i < WORD_SEPARATORS.length; i++)
        {
            if (WORD_SEPARATORS[i] == ch)
            {
                return true;
            }
        }
        return false;
    }

    /** Removes and returns the first buffered sub-token. The buffer must be non-empty. */
    private Token extractFirstToken()
    {
        assert tokens.size() > 0 : "no more tokens";
        return tokens.remove(0);
    }

    // Buffers a sub-token covering [startPos, endPos) of the given token's term text;
    // empty ranges (two adjacent separators) are silently skipped.
    // startPos is inclusive position of the new token start
    // endPos is exclusive position of the new token end
    private void addToken(Token token, int startPos, int endPos)
    {
        if (startPos < endPos)
        {
            // offsets are relative to the original character stream
            int startOffset = token.startOffset() + startPos;
            int endOffset = token.startOffset() + endPos;
            Token newToken =
                    new Token(token.termBuffer(), startPos, endPos - startPos, startOffset,
                            endOffset);
            tokens.add(newToken);
        }
    }
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment