Skip to content
Snippets Groups Projects
Commit 44bbdc34 authored by izabel's avatar izabel
Browse files

[LMS-1827] allow to search for A#B in non-wildcard mode

SVN: 18161
parent 289410eb
No related branches found
No related tags found
No related merge requests found
/*
* Copyright 2010 ETH Zuerich, CISD
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package ch.systemsx.cisd.openbis.generic.server.dataaccess.db.search;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;
/**
 * Gathers characters that need special treatment and methods useful during indexing.
 * <p>
 * The character sets exposed by this class are shared constants and are read-only: attempts to
 * modify them throw {@link UnsupportedOperationException}.
 *
 * @author Izabela Adamczyk
 */
public class CharacterHelper
{
    private static final char ESCAPE_CHARACTER = '\\';

    /** First character (space) of the range scanned by {@link #getTokenSeparators()}. */
    private static final char FIRST_SCANNED_CHARACTER = 32;

    /** Exclusive upper bound of the range scanned by {@link #getTokenSeparators()}. */
    private static final char SCANNED_CHARACTERS_END = 256;

    /** Characters that are treated as part of a token although they are not letters or digits. */
    public final static Set<Character> SPECIAL_CHARACTERS =
            Collections.unmodifiableSet(new HashSet<Character>(Arrays.asList(
                    // Special code characters
                    '.', ':', '-', '_',
                    // Special word characters
                    '\'')));

    // (don't trim '-' or '_' because they may have special meaning in identifiers)
    /** those of special chars that should be trimmed */
    private final static Set<Character> TRIMMED_CHARACTERS =
            Collections.unmodifiableSet(new HashSet<Character>(Arrays.asList('.', ':', '\'')));

    /**
     * @return <code>true</code> if the character is a letter, a digit or one of
     *         {@link #SPECIAL_CHARACTERS}
     */
    public static boolean isTokenCharacter(char c)
    {
        return Character.isLetterOrDigit(c) || SPECIAL_CHARACTERS.contains(c);
    }

    /**
     * Collects the characters with codes 32..255 that are neither token characters nor the
     * escape character (backslash).
     *
     * @return a new, modifiable collection on every call, so callers may change it freely
     */
    public static Collection<Character> getTokenSeparators()
    {
        Set<Character> separators = new HashSet<Character>();
        for (char ch = FIRST_SCANNED_CHARACTER; ch < SCANNED_CHARACTERS_END; ch++)
        {
            if (isTokenCharacter(ch) == false && ch != ESCAPE_CHARACTER)
            {
                separators.add(ch);
            }
        }
        return separators;
    }

    /**
     * @return read-only set of the special characters that should be trimmed
     */
    public static Set<Character> getTrimmedSpecialCharacters()
    {
        return TRIMMED_CHARACTERS;
    }
}
\ No newline at end of file
......@@ -16,14 +16,17 @@
package ch.systemsx.cisd.openbis.generic.server.dataaccess.db.search;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import org.apache.commons.lang.StringUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.BooleanClause.Occur;
import ch.systemsx.cisd.common.exceptions.UserFailureException;
import ch.systemsx.cisd.openbis.generic.server.dataaccess.db.search.detailed.DetailedQueryBuilder;
......@@ -36,12 +39,22 @@ import ch.systemsx.cisd.openbis.generic.shared.translator.DtoConverters;
*/
public class LuceneQueryBuilder
{
private static final String NOT = "NOT";
private static final String OR = "OR";
private static final String AND = "AND";
private static final char STAR = '*';
private static final char SPACE = ' ';
/**
 * Creates a query for the given detailed search criteria by delegating to
 * {@link DetailedQueryBuilder}, after converting the entity kind with {@link DtoConverters}.
 *
 * @param searchCriteria criteria passed unchanged to the detailed query builder
 * @param entityKind kind of entity to search for; converted before delegation
 * @return the query produced by {@link DetailedQueryBuilder}
 * @throws UserFailureException when some search patterns are incorrect
 */
public static Query createDetailedSearchQuery(DetailedSearchCriteria searchCriteria,
        EntityKind entityKind)
{
    // Exactly one return: a second, unreachable return statement would not compile.
    return DetailedQueryBuilder.createQuery(searchCriteria,
            DtoConverters.convertEntityKind(entityKind));
}
private static final char FIELD_SEPARATOR = ':';
......@@ -68,11 +81,41 @@ public class LuceneQueryBuilder
// add '*' wildcard at the beginning and at the end of the query in basic search mode
if (useWildcardSearchMode == false && isQuoted(result) == false)
{
result = '*' + result + '*';
result = addWildcards(result);
}
return result;
}
/**
 * Rewrites a basic-mode query so that every space-separated word gets wildcards, while the
 * boolean operators (AND, OR, NOT) are passed through unchanged.
 */
private static String addWildcards(String result)
{
    final String[] words = StringUtils.split(result, SPACE);
    final List<String> rewrittenWords = new ArrayList<String>();
    for (final String word : words)
    {
        // operators are matched case-sensitively and must not be turned into search terms
        final boolean isOperator = AND.equals(word) || OR.equals(word) || NOT.equals(word);
        rewrittenWords.add(isOperator ? word : addWildcartdsToToken(word));
    }
    return StringUtils.join(rewrittenWords, SPACE);
}
/**
 * Splits a single query word at the separators reported by
 * {@link CharacterHelper#getTokenSeparators()}, surrounds every resulting part with '*' and
 * combines the parts with AND inside one pair of parentheses, e.g. <code>ab#c</code> becomes
 * <code>(*ab* AND *c*)</code>.
 *
 * @param token a single space-free word of the query (not a boolean operator)
 * @return the parenthesized wildcard expression for the word
 */
private static String addWildcartdsToToken(String token)
{
    Collection<Character> tokenSeparators = CharacterHelper.getTokenSeparators();
    // all separator characters concatenated; StringUtils.split treats each one as a delimiter
    String separatorChars = StringUtils.join(tokenSeparators, "");
    String[] miniTokens = StringUtils.split(token, separatorChars);
    // NOTE(review): a word consisting only of separators leaves no parts, so the result is
    // "()" - confirm that the query parser / caller copes with that.
    List<String> transformedMiniTokens = new ArrayList<String>();
    for (String miniToken : miniTokens)
    {
        transformedMiniTokens.add(STAR + miniToken + STAR);
    }
    return '(' + StringUtils.join(transformedMiniTokens, SPACE + AND + SPACE) + ')';
}
private static boolean isQuoted(String result)
{
return result.startsWith("\"") && result.endsWith("\"");
......
......@@ -17,8 +17,6 @@
package ch.systemsx.cisd.openbis.generic.server.dataaccess.db.search;
import java.io.Reader;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
......@@ -47,26 +45,13 @@ public class SearchAnalyzer extends Analyzer
/**
* A tokenizer that divides text at chars different than letters, digits and special chars
* allowed in codes ('.', ':', '-', '_') or words (like apostrophe), as defined in
* {@link CharacterHelper}.
* <p>
* Additionally it normalizes token text to lower case (with a performance gain compared to
* using LowerCaseFilter after tokenization).
*/
private static class WordAndCodeTokenizer extends CharTokenizer
{
/** special characters allowed in codes */
private final static Character[] SPECIAL_CODE_CHARS =
{ '.', ':', '-', '_' };
/** special characters allowed in words (separated from code chars for clarity) */
private final static Character[] SPECIAL_WORD_CHARS =
{ '\'' };
private final static Set<Character> specialCharacters = new HashSet<Character>();
{
specialCharacters.addAll(Arrays.asList(SPECIAL_CODE_CHARS));
specialCharacters.addAll(Arrays.asList(SPECIAL_WORD_CHARS));
}
public WordAndCodeTokenizer(Reader input)
{
......@@ -76,7 +61,7 @@ public class SearchAnalyzer extends Analyzer
/**
 * Decides whether the character belongs to a token; the rule is delegated to
 * {@link CharacterHelper#isTokenCharacter(char)}.
 */
@Override
protected boolean isTokenChar(char c)
{
    // Exactly one return: a second, unreachable return statement would not compile.
    return CharacterHelper.isTokenCharacter(c);
}
@Override
......@@ -91,15 +76,6 @@ public class SearchAnalyzer extends Analyzer
*/
private static final class TrimSpecialCharsFilter extends TokenFilter
{
// (don't trim '-' or '_' because they may have special meaning in identifiers)
/** those of special chars that should be trimmed */
private final static Character[] TRIMMED_SPECIAL_CHARS =
{ '.', ':', '\'' };
private final static Set<Character> trimmedCharacters = new HashSet<Character>();
{
trimmedCharacters.addAll(Arrays.asList(TRIMMED_SPECIAL_CHARS));
}
public TrimSpecialCharsFilter(TokenStream input)
{
......@@ -119,6 +95,7 @@ public class SearchAnalyzer extends Analyzer
final int bufferLength = nextToken.termLength();
int startCounter = 0; // counts chars to trim from the beginning
Set<Character> trimmedCharacters = CharacterHelper.getTrimmedSpecialCharacters();
for (int i = 0; i < bufferLength; i++)
{
if (trimmedCharacters.contains(buffer[i]))
......
......@@ -33,8 +33,9 @@ public class LuceneQueryBuilderTest extends AssertJUnit
{
return new Object[][]
{
{ "abc", "*abc*" },
{ "code:CP registrator:Joe", "*code\\:CP registrator\\:Joe*" } };
{ "abc", "(*abc*)" },
{ "code:CP registrator:Joe", "(*code\\:CP*) (*registrator\\:Joe*)" },
{ "ab#c OR d", "(*ab* AND *c*) OR (*d*)" } };
}
@DataProvider(name = "wildcardModeQueries")
......@@ -43,7 +44,8 @@ public class LuceneQueryBuilderTest extends AssertJUnit
return new Object[][]
{
{ "abc", "abc" },
{ "code:CP registrator:Joe", "code\\:CP registrator\\:Joe" } };
{ "code:CP registrator:Joe", "code\\:CP registrator\\:Joe" },
{ "*ab#c OR d", "*ab#c OR d" } };
}
@Test(dataProvider = "basicModeQueries")
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment