From 96f99e105e1701bcfdb11f859644c161b2c9a3b2 Mon Sep 17 00:00:00 2001 From: buczekp <buczekp> Date: Fri, 6 Nov 2009 12:34:02 +0000 Subject: [PATCH] [LMS-1258] removed word separators from the search SVN: 13245 --- .../db/search/LuceneQueryBuilder.java | 65 -------- .../dataaccess/db/search/SearchAnalyzer.java | 58 ++++++- .../search/SeparatorSplitterTokenFilter.java | 150 ------------------ 3 files changed, 52 insertions(+), 221 deletions(-) delete mode 100644 openbis/source/java/ch/systemsx/cisd/openbis/generic/server/dataaccess/db/search/SeparatorSplitterTokenFilter.java diff --git a/openbis/source/java/ch/systemsx/cisd/openbis/generic/server/dataaccess/db/search/LuceneQueryBuilder.java b/openbis/source/java/ch/systemsx/cisd/openbis/generic/server/dataaccess/db/search/LuceneQueryBuilder.java index 671e7fd357a..7628af1db7a 100644 --- a/openbis/source/java/ch/systemsx/cisd/openbis/generic/server/dataaccess/db/search/LuceneQueryBuilder.java +++ b/openbis/source/java/ch/systemsx/cisd/openbis/generic/server/dataaccess/db/search/LuceneQueryBuilder.java @@ -25,7 +25,6 @@ import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.Query; import org.apache.lucene.search.BooleanClause.Occur; -import ch.rinn.restrictions.Private; import ch.systemsx.cisd.common.exceptions.UserFailureException; import ch.systemsx.cisd.openbis.generic.server.dataaccess.db.search.detailed.DetailedQueryBuilder; import ch.systemsx.cisd.openbis.generic.shared.basic.dto.DetailedSearchCriteria; @@ -48,73 +47,9 @@ public class LuceneQueryBuilder public static String adaptQuery(String userQuery) { String result = disableFieldQuery(userQuery); - result = replaceWordSeparators(result, SeparatorSplitterTokenFilter.WORD_SEPARATORS); return result; } - @Private - static String replaceWordSeparators(String query, char[] wordSeparators) - { - if (looksLikeNumber(query)) - { - return query; - } - String queryTrimmed = removeSurroundingWordSeparators(query, wordSeparators); - String 
charsRegexp = createAnyWordSeparatorRegexp(wordSeparators); - String queryWithoutSeparators = queryTrimmed.replaceAll(charsRegexp, " AND "); - if (queryWithoutSeparators.equals(queryTrimmed)) - { - return queryTrimmed; - } else - { - return "(" + queryWithoutSeparators + ")"; - } - } - - private static boolean looksLikeNumber(String query) - { - return query.length() > 0 && Character.isDigit(query.charAt(0)) - && Character.isDigit(query.charAt(query.length() - 1)); - } - - private static String createAnyWordSeparatorRegexp(char[] wordSeparators) - { - String charsRegexp = "["; - for (int i = 0; i < wordSeparators.length; i++) - { - charsRegexp += "\\" + wordSeparators[i]; - } - charsRegexp += "]"; - return charsRegexp; - } - - private static String removeSurroundingWordSeparators(String query, char[] wordSeparators) - { - int startIx = 0; - while (startIx < query.length() && isSeparator(query.charAt(startIx), wordSeparators)) - { - startIx++; - } - int endIx = query.length(); - while (endIx > 0 && isSeparator(query.charAt(endIx - 1), wordSeparators)) - { - endIx--; - } - return query.substring(startIx, endIx); - } - - private static boolean isSeparator(char ch, char[] wordSeparators) - { - for (int i = 0; i < wordSeparators.length; i++) - { - if (ch == wordSeparators[i]) - { - return true; - } - } - return false; - } - // disables field query by escaping all field separator characters. 
public static String disableFieldQuery(String userQuery) { diff --git a/openbis/source/java/ch/systemsx/cisd/openbis/generic/server/dataaccess/db/search/SearchAnalyzer.java b/openbis/source/java/ch/systemsx/cisd/openbis/generic/server/dataaccess/db/search/SearchAnalyzer.java index ffda9408199..b0ca028a641 100644 --- a/openbis/source/java/ch/systemsx/cisd/openbis/generic/server/dataaccess/db/search/SearchAnalyzer.java +++ b/openbis/source/java/ch/systemsx/cisd/openbis/generic/server/dataaccess/db/search/SearchAnalyzer.java @@ -17,21 +17,67 @@ package ch.systemsx.cisd.openbis.generic.server.dataaccess.db.search; import java.io.Reader; +import java.util.Arrays; +import java.util.HashSet; +import java.util.Set; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.CharTokenizer; import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.standard.StandardAnalyzer; /** - * Extends {@link StandardAnalyzer} by applying additional {@link SeparatorSplitterTokenFilter}. + * Extends {@link Analyzer} splitting text on characters not allowed in codes or words. * - * @author Tomasz Pylak + * @author Piotr Buczek */ -public class SearchAnalyzer extends StandardAnalyzer +public class SearchAnalyzer extends Analyzer { + @Override public TokenStream tokenStream(String fieldName, Reader reader) { - TokenStream original = super.tokenStream(fieldName, reader); - return new SeparatorSplitterTokenFilter(original); + return new WordAndCodeTokenizer(reader); + } + + /** + * A tokenizer that divides text at chars different than letters, digits and special chars + * allowed in codes ('.', ':', '-', '_') or words (like apostrophe). + * <p> + * Additionally it normalizes token text to lower case (with a performance gain compared to + * using LowerCaseFilter after tokenization). 
+ */ + private static class WordAndCodeTokenizer extends CharTokenizer + { + /** special characters allowed in codes */ + private final static Character[] SPECIAL_CODE_CHARS = + { '.', ':', '-', '_' }; + + /** special characters allowed in words (separated from code chars for clarity) */ + private final static Character[] SPECIAL_WORD_CHARS = + { '\'' }; + + private final static Set<Character> specialCharacters = new HashSet<Character>(); + static { /* populate the shared set once at class load, not on every instantiation */ + specialCharacters.addAll(Arrays.asList(SPECIAL_CODE_CHARS)); + specialCharacters.addAll(Arrays.asList(SPECIAL_WORD_CHARS)); + } + + public WordAndCodeTokenizer(Reader input) + { + super(input); + } + + @Override + protected boolean isTokenChar(char c) + { + return Character.isLetterOrDigit(c) || specialCharacters.contains(c); + } + + @Override + protected char normalize(char c) + { + return Character.toLowerCase(c); + } } + } diff --git a/openbis/source/java/ch/systemsx/cisd/openbis/generic/server/dataaccess/db/search/SeparatorSplitterTokenFilter.java b/openbis/source/java/ch/systemsx/cisd/openbis/generic/server/dataaccess/db/search/SeparatorSplitterTokenFilter.java deleted file mode 100644 index 8177c5ecfbf..00000000000 --- a/openbis/source/java/ch/systemsx/cisd/openbis/generic/server/dataaccess/db/search/SeparatorSplitterTokenFilter.java +++ /dev/null @@ -1,150 +0,0 @@ -/* - * Copyright 2008 ETH Zuerich, CISD - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package ch.systemsx.cisd.openbis.generic.server.dataaccess.db.search; - -import java.io.IOException; -import java.util.LinkedList; -import java.util.List; - -import org.apache.lucene.analysis.Token; -import org.apache.lucene.analysis.TokenFilter; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.standard.StandardTokenizer; - -/** - * Used to split tokens further down after standard tokenizer. We need this, because "." which is - * not followed be a space is not treated as a token separator by default. - * - * @author Tomasz Pylak - */ -public class SeparatorSplitterTokenFilter extends TokenFilter -{ - static final char[] WORD_SEPARATORS = new char[] - { '.', ',', '-', '_' }; - - private static final String ALPHANUM_TOKEN_TYPE = - StandardTokenizer.TOKEN_TYPES[StandardTokenizer.ALPHANUM]; - - private static final String HOST_TOKEN_TYPE = - StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HOST]; - - private static final String NUM_TOKEN_TYPE = - StandardTokenizer.TOKEN_TYPES[StandardTokenizer.NUM]; - - private List<Token> tokens = new LinkedList<Token>(); - - protected SeparatorSplitterTokenFilter(TokenStream input) - { - super(input); - } - - /** - * Returns tokens from standard analysis, split additionally at specified separator characters. 
- */ - @Override - public final Token next(final Token reusableToken) throws IOException - { - if (tokens.size() > 0) - { - return extractFirstToken(); - } - Token token = input.next(reusableToken); - // avoid splitting special tokens like e-mails - if (token == null || isSplittableToken(token) == false) - { - return token; - } - char[] termText = token.termBuffer(); - int endPos = token.termLength(); // exclusive - int curPos = 0; - do - { - int nextPos = getSeparatorIndex(termText, curPos, endPos); - if (nextPos == endPos && tokens.size() == 0) - { - return token; // optimalisation, no split has occurred - } - addToken(token, curPos, nextPos); - curPos = nextPos + 1; - } while (curPos < endPos); - return extractFirstToken(); - } - - private static boolean isSplittableToken(Token token) - { - String type = token.type(); - if (type.equals(ALPHANUM_TOKEN_TYPE) || type.equals(HOST_TOKEN_TYPE)) - { - return true; - } - if (type.equals(NUM_TOKEN_TYPE)) - { - // sometimes the original tokenizer lies to us and reports terms like 'version_3' to be - // numbers. This is a heuristic to correct those lies. - return Character.isLetter(token.term().charAt(0)); - } - return false; - } - - // returns the position of the first separator character. Starts browsing at curPos. 
- private static int getSeparatorIndex(char[] termText, int startIndex, int endIndex) - { - for (int i = startIndex; i < endIndex; i++) - { - if (isSeparator(termText[i])) - { - return i; - } - } - return endIndex; - } - - private static boolean isSeparator(char ch) - { - for (int i = 0; i < WORD_SEPARATORS.length; i++) - { - if (WORD_SEPARATORS[i] == ch) - { - return true; - } - } - return false; - } - - private Token extractFirstToken() - { - assert tokens.size() > 0 : "no more tokens"; - Token t = tokens.get(0); - tokens.remove(0); - return t; - } - - // startPos is inclusive position of the new token start - // endPos is exclusive position of the new token end - private void addToken(Token token, int startPos, int endPos) - { - if (startPos < endPos) - { - int startOffset = token.startOffset() + startPos; - int endOffset = token.startOffset() + endPos; - Token newToken = - new Token(token.termBuffer(), startPos, endPos - startPos, startOffset, - endOffset); - tokens.add(newToken); - } - } -} -- GitLab