Skip to content
Snippets Groups Projects
Commit 96f99e10 authored by buczekp's avatar buczekp
Browse files

[LMS-1258] removed word separators from the search

SVN: 13245
parent c8d86f17
No related branches found
No related tags found
No related merge requests found
...@@ -25,7 +25,6 @@ import org.apache.lucene.search.BooleanQuery; ...@@ -25,7 +25,6 @@ import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Query; import org.apache.lucene.search.Query;
import org.apache.lucene.search.BooleanClause.Occur; import org.apache.lucene.search.BooleanClause.Occur;
import ch.rinn.restrictions.Private;
import ch.systemsx.cisd.common.exceptions.UserFailureException; import ch.systemsx.cisd.common.exceptions.UserFailureException;
import ch.systemsx.cisd.openbis.generic.server.dataaccess.db.search.detailed.DetailedQueryBuilder; import ch.systemsx.cisd.openbis.generic.server.dataaccess.db.search.detailed.DetailedQueryBuilder;
import ch.systemsx.cisd.openbis.generic.shared.basic.dto.DetailedSearchCriteria; import ch.systemsx.cisd.openbis.generic.shared.basic.dto.DetailedSearchCriteria;
...@@ -48,73 +47,9 @@ public class LuceneQueryBuilder ...@@ -48,73 +47,9 @@ public class LuceneQueryBuilder
public static String adaptQuery(String userQuery) public static String adaptQuery(String userQuery)
{ {
String result = disableFieldQuery(userQuery); String result = disableFieldQuery(userQuery);
result = replaceWordSeparators(result, SeparatorSplitterTokenFilter.WORD_SEPARATORS);
return result; return result;
} }
@Private
static String replaceWordSeparators(String query, char[] wordSeparators)
{
    // Numeric-looking queries (digit at both ends) may legitimately contain
    // separator-like characters (e.g. "3.14") — leave them untouched.
    if (looksLikeNumber(query))
    {
        return query;
    }
    String trimmed = removeSurroundingWordSeparators(query, wordSeparators);
    String anySeparatorRegexp = createAnyWordSeparatorRegexp(wordSeparators);
    String joined = trimmed.replaceAll(anySeparatorRegexp, " AND ");
    // Parenthesize only when a replacement actually happened, so that the
    // injected AND operators bind together as one sub-expression.
    return joined.equals(trimmed) ? trimmed : "(" + joined + ")";
}
/** Heuristic: a query is treated as a number when its first and last characters are digits. */
private static boolean looksLikeNumber(String query)
{
    if (query.length() == 0)
    {
        return false;
    }
    char firstChar = query.charAt(0);
    char lastChar = query.charAt(query.length() - 1);
    return Character.isDigit(firstChar) && Character.isDigit(lastChar);
}
/**
 * Builds a regexp character class matching any of the given separators,
 * e.g. for {'.', ','} the result is "[\.\,]".
 * <p>
 * Each character is backslash-escaped so that regex metacharacters
 * (e.g. '-', which denotes a range inside a character class) are matched literally.
 */
private static String createAnyWordSeparatorRegexp(char[] wordSeparators)
{
    // StringBuilder instead of repeated String concatenation in a loop.
    StringBuilder regexp = new StringBuilder("[");
    for (char separator : wordSeparators)
    {
        regexp.append('\\').append(separator);
    }
    return regexp.append(']').toString();
}
/**
 * Strips leading and trailing separator characters from <var>query</var>.
 * <p>
 * Returns the empty string when the query consists solely of separators.
 */
private static String removeSurroundingWordSeparators(String query, char[] wordSeparators)
{
    int startIx = 0;
    while (startIx < query.length() && isSeparator(query.charAt(startIx), wordSeparators))
    {
        startIx++;
    }
    int endIx = query.length();
    // BUGFIX: the lower bound must be startIx, not 0. With 'endIx > 0' an
    // all-separator query (e.g. "...") ended with startIx == length() and
    // endIx == 0, and substring(startIx, endIx) threw
    // StringIndexOutOfBoundsException.
    while (endIx > startIx && isSeparator(query.charAt(endIx - 1), wordSeparators))
    {
        endIx--;
    }
    return query.substring(startIx, endIx);
}
/** Returns <code>true</code> if <var>ch</var> occurs in <var>wordSeparators</var>. */
private static boolean isSeparator(char ch, char[] wordSeparators)
{
    for (char separator : wordSeparators)
    {
        if (separator == ch)
        {
            return true;
        }
    }
    return false;
}
// disables field query by escaping all field separator characters. // disables field query by escaping all field separator characters.
public static String disableFieldQuery(String userQuery) public static String disableFieldQuery(String userQuery)
{ {
......
...@@ -17,21 +17,67 @@ ...@@ -17,21 +17,67 @@
package ch.systemsx.cisd.openbis.generic.server.dataaccess.db.search; package ch.systemsx.cisd.openbis.generic.server.dataaccess.db.search;
import java.io.Reader; import java.io.Reader;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharTokenizer;
import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
/** /**
* Extends {@link StandardAnalyzer} by applying additional {@link SeparatorSplitterTokenFilter}. * Extends {@link Analyzer} splitting text on characters not allowed in codes or words.
* *
* @author Tomasz Pylak * @author Piotr Buczek
*/ */
public class SearchAnalyzer extends StandardAnalyzer public class SearchAnalyzer extends Analyzer
{ {
@Override @Override
public TokenStream tokenStream(String fieldName, Reader reader) public TokenStream tokenStream(String fieldName, Reader reader)
{ {
TokenStream original = super.tokenStream(fieldName, reader); return new WordAndCodeTokenizer(reader);
return new SeparatorSplitterTokenFilter(original); }
/**
* A tokenizer that divides text at chars other than letters, digits and special chars
* allowed in codes ('.', ':', '-', '_') or words (like apostrophe).
* <p>
* Additionally it normalizes token text to lower case (with a performance gain compared to
* using LowerCaseFilter after tokenization).
*/
private static class WordAndCodeTokenizer extends CharTokenizer
{
/** special characters allowed in codes */
private final static Character[] SPECIAL_CODE_CHARS =
{ '.', ':', '-', '_' };
/** special characters allowed in words (separated from code chars for clarity) */
private final static Character[] SPECIAL_WORD_CHARS =
{ '\'' };
/** All characters (code + word) that are kept inside a single token. */
private final static Set<Character> specialCharacters = new HashSet<Character>();

// BUGFIX: this was an *instance* initializer block populating a static field,
// so the set was re-filled on every tokenizer construction. A static
// initializer runs exactly once, at class load time.
static
{
    specialCharacters.addAll(Arrays.asList(SPECIAL_CODE_CHARS));
    specialCharacters.addAll(Arrays.asList(SPECIAL_WORD_CHARS));
}
/**
 * Creates a tokenizer reading characters from <var>input</var>; all tokenization
 * logic is inherited from CharTokenizer and driven by {@link #isTokenChar(char)}.
 */
public WordAndCodeTokenizer(Reader input)
{
    super(input);
}
/** A character belongs to a token when it is a letter, a digit, or an allowed special char. */
@Override
protected boolean isTokenChar(char c)
{
    if (Character.isLetterOrDigit(c))
    {
        return true;
    }
    return specialCharacters.contains(c);
}
@Override
protected char normalize(char c)
{
    // Lower-case during tokenization — cheaper than a separate LowerCaseFilter pass.
    return Character.toLowerCase(c);
}
} }
} }
/*
* Copyright 2008 ETH Zuerich, CISD
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package ch.systemsx.cisd.openbis.generic.server.dataaccess.db.search;
import java.io.IOException;
import java.util.LinkedList;
import java.util.List;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardTokenizer;
/**
* Used to split tokens further down after standard tokenizer. We need this, because "." which is
 * not followed by a space is not treated as a token separator by default.
*
* @author Tomasz Pylak
*/
public class SeparatorSplitterTokenFilter extends TokenFilter
{
    /** Characters at which tokens produced by the standard tokenizer are split further. */
    static final char[] WORD_SEPARATORS = new char[]
        { '.', ',', '-', '_' };

    private static final String ALPHANUM_TOKEN_TYPE =
            StandardTokenizer.TOKEN_TYPES[StandardTokenizer.ALPHANUM];

    private static final String HOST_TOKEN_TYPE =
            StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HOST];

    private static final String NUM_TOKEN_TYPE =
            StandardTokenizer.TOKEN_TYPES[StandardTokenizer.NUM];

    /** Sub-tokens produced by splitting; drained before the next input token is read. */
    private List<Token> tokens = new LinkedList<Token>();

    protected SeparatorSplitterTokenFilter(TokenStream input)
    {
        super(input);
    }

    /**
     * Returns tokens from standard analysis, split additionally at specified separator
     * characters. Tokens without separators are passed through unchanged.
     */
    @Override
    public final Token next(final Token reusableToken) throws IOException
    {
        if (tokens.size() > 0)
        {
            return extractFirstToken();
        }
        // BUGFIX: loop until some input token yields at least one sub-token (or the
        // input is exhausted). Previously a token consisting solely of separators
        // left the buffer empty and extractFirstToken() failed its assertion.
        while (tokens.size() == 0)
        {
            Token token = input.next(reusableToken);
            // avoid splitting special tokens like e-mails
            if (token == null || isSplittableToken(token) == false)
            {
                return token;
            }
            char[] termText = token.termBuffer();
            int endPos = token.termLength(); // exclusive
            int curPos = 0;
            do
            {
                int nextPos = getSeparatorIndex(termText, curPos, endPos);
                if (nextPos == endPos && tokens.size() == 0)
                {
                    return token; // optimization, no split has occurred
                }
                addToken(token, curPos, nextPos);
                curPos = nextPos + 1;
            } while (curPos < endPos);
        }
        return extractFirstToken();
    }

    /**
     * Returns <code>true</code> for token types that are safe to split at separators.
     * Special types (e-mail, acronym, ...) are passed through untouched.
     */
    private static boolean isSplittableToken(Token token)
    {
        String type = token.type();
        if (type.equals(ALPHANUM_TOKEN_TYPE) || type.equals(HOST_TOKEN_TYPE))
        {
            return true;
        }
        if (type.equals(NUM_TOKEN_TYPE))
        {
            // sometimes the original tokenizer lies to us and reports terms like
            // 'version_3' to be numbers. This is a heuristic to correct those lies:
            // a real number never starts with a letter.
            return Character.isLetter(token.term().charAt(0));
        }
        return false;
    }

    // Returns the position of the first separator character at or after startIndex,
    // or endIndex when none is found.
    private static int getSeparatorIndex(char[] termText, int startIndex, int endIndex)
    {
        for (int i = startIndex; i < endIndex; i++)
        {
            if (isSeparator(termText[i]))
            {
                return i;
            }
        }
        return endIndex;
    }

    /** Returns <code>true</code> if <var>ch</var> is one of {@link #WORD_SEPARATORS}. */
    private static boolean isSeparator(char ch)
    {
        for (int i = 0; i < WORD_SEPARATORS.length; i++)
        {
            if (WORD_SEPARATORS[i] == ch)
            {
                return true;
            }
        }
        return false;
    }

    /** Removes and returns the first buffered sub-token. The buffer must be non-empty. */
    private Token extractFirstToken()
    {
        assert tokens.size() > 0 : "no more tokens";
        return tokens.remove(0);
    }

    // Buffers a sub-token covering [startPos, endPos) of the given token's term text;
    // empty ranges (two adjacent separators) are silently skipped.
    // startPos is inclusive position of the new token start
    // endPos is exclusive position of the new token end
    private void addToken(Token token, int startPos, int endPos)
    {
        if (startPos < endPos)
        {
            // offsets are relative to the original character stream
            int startOffset = token.startOffset() + startPos;
            int endOffset = token.startOffset() + endPos;
            Token newToken =
                    new Token(token.termBuffer(), startPos, endPos - startPos, startOffset,
                            endOffset);
            tokens.add(newToken);
        }
    }
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment