Skip to content
Snippets Groups Projects
Commit 44bbdc34 authored by izabel's avatar izabel
Browse files

[LMS-1827] allow to search for A#B in non-wildcard mode

SVN: 18161
parent 289410eb
No related branches found
No related tags found
No related merge requests found
/*
* Copyright 2010 ETH Zuerich, CISD
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package ch.systemsx.cisd.openbis.generic.server.dataaccess.db.search;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashSet;
import java.util.Set;
/**
* Gathers characters that need special treatment and methods useful during indexing.
*
* @author Izabela Adamczyk
*/
public class CharacterHelper
{
    /** The backslash; it is deliberately left out of the token separators. */
    private static final char ESCAPE_CHARACTER = '\\';

    /**
     * Non-alphanumeric characters that are still treated as part of a token.
     * <p>
     * The set is read-only: it is shared by every user of this class, so it must not be possible
     * to change the tokenization rules by mutating it.
     */
    public final static Set<Character> SPECIAL_CHARACTERS =
            java.util.Collections.unmodifiableSet(new HashSet<Character>(Arrays.asList(
                    // Special code characters
                    '.', ':', '-', '_',
                    // Special word characters
                    '\'')));

    // (don't trim '-' or '_' because they may have special meaning in identifiers)
    /** those of special chars that should be trimmed (read-only for the same reason as above) */
    private final static Set<Character> TRIMMED_CHARACTERS =
            java.util.Collections.unmodifiableSet(new HashSet<Character>(Arrays.asList(
                    '.', ':', '\'')));

    /**
     * @return <code>true</code> if the character is a letter, a digit or one of
     *         {@link #SPECIAL_CHARACTERS}
     */
    public static boolean isTokenCharacter(char c)
    {
        return Character.isLetterOrDigit(c) || SPECIAL_CHARACTERS.contains(c);
    }

    /**
     * Collects all characters in the range 32..255 that are neither token characters nor the
     * escape character.
     *
     * @return a new, modifiable collection on every call, so callers may alter it freely
     */
    public static Collection<Character> getTokenSeparators()
    {
        Set<Character> separators = new HashSet<Character>();
        for (char ch = 32; ch < 256; ch++)
        {
            if (ch == ESCAPE_CHARACTER || isTokenCharacter(ch))
            {
                continue;
            }
            separators.add(ch);
        }
        return separators;
    }

    /**
     * @return a read-only view of the special characters that should be trimmed; attempts to
     *         modify it throw {@link UnsupportedOperationException}
     */
    public static Set<Character> getTrimmedSpecialCharacters()
    {
        return TRIMMED_CHARACTERS;
    }
}
\ No newline at end of file
...@@ -16,14 +16,17 @@ ...@@ -16,14 +16,17 @@
package ch.systemsx.cisd.openbis.generic.server.dataaccess.db.search; package ch.systemsx.cisd.openbis.generic.server.dataaccess.db.search;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List; import java.util.List;
import org.apache.commons.lang.StringUtils;
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.queryParser.ParseException; import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser; import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Query; import org.apache.lucene.search.Query;
import org.apache.lucene.search.BooleanClause.Occur;
import ch.systemsx.cisd.common.exceptions.UserFailureException; import ch.systemsx.cisd.common.exceptions.UserFailureException;
import ch.systemsx.cisd.openbis.generic.server.dataaccess.db.search.detailed.DetailedQueryBuilder; import ch.systemsx.cisd.openbis.generic.server.dataaccess.db.search.detailed.DetailedQueryBuilder;
...@@ -36,12 +39,22 @@ import ch.systemsx.cisd.openbis.generic.shared.translator.DtoConverters; ...@@ -36,12 +39,22 @@ import ch.systemsx.cisd.openbis.generic.shared.translator.DtoConverters;
*/ */
public class LuceneQueryBuilder public class LuceneQueryBuilder
{ {
private static final String NOT = "NOT";
private static final String OR = "OR";
private static final String AND = "AND";
private static final char STAR = '*';
private static final char SPACE = ' ';
/** @throws UserFailureException when some search patterns are incorrect */ /** @throws UserFailureException when some search patterns are incorrect */
public static Query createDetailedSearchQuery(DetailedSearchCriteria searchCriteria, public static Query createDetailedSearchQuery(DetailedSearchCriteria searchCriteria,
EntityKind entityKind) EntityKind entityKind)
{ {
return DetailedQueryBuilder.createQuery(searchCriteria, DtoConverters return DetailedQueryBuilder.createQuery(searchCriteria,
.convertEntityKind(entityKind)); DtoConverters.convertEntityKind(entityKind));
} }
private static final char FIELD_SEPARATOR = ':'; private static final char FIELD_SEPARATOR = ':';
...@@ -68,11 +81,41 @@ public class LuceneQueryBuilder ...@@ -68,11 +81,41 @@ public class LuceneQueryBuilder
// add '*' wildcard at the beginning and at the end of the query in basic search mode // add '*' wildcard at the beginning and at the end of the query in basic search mode
if (useWildcardSearchMode == false && isQuoted(result) == false) if (useWildcardSearchMode == false && isQuoted(result) == false)
{ {
result = '*' + result + '*'; result = addWildcards(result);
} }
return result; return result;
} }
/**
 * Adds wildcards to every term of a basic-mode query.
 * <p>
 * The query is split at spaces. The boolean operators <code>AND</code>, <code>OR</code> and
 * <code>NOT</code> are kept untouched; every other token is rewritten by
 * {@link #addWildcardsToToken(String, String)}. The rewritten tokens are joined with single
 * spaces again.
 *
 * @param result the query text to transform
 * @return the query with wildcards added around every term
 */
private static String addWildcards(String result)
{
    // The separators do not depend on the token, so they are computed once per query
    // instead of once per token.
    String separatorChars = StringUtils.join(CharacterHelper.getTokenSeparators(), "");
    String[] queryTokens = StringUtils.split(result, SPACE);
    List<String> transformedTokens = new ArrayList<String>();
    for (String queryToken : queryTokens)
    {
        boolean isOperator =
                queryToken.equals(AND) || queryToken.equals(OR) || queryToken.equals(NOT);
        transformedTokens.add(isOperator ? queryToken : addWildcardsToToken(queryToken,
                separatorChars));
    }
    return StringUtils.join(transformedTokens, SPACE);
}

/**
 * Splits a single query token at token separator characters, surrounds each part with
 * <code>*</code> and combines the parts with <code>AND</code> inside parentheses, e.g.
 * <code>ab#c</code> becomes <code>(*ab* AND *c*)</code>.
 *
 * @param token a single (space free) query token
 * @param separatorChars all characters at which the token should be split
 */
private static String addWildcardsToToken(String token, String separatorChars)
{
    String[] miniTokens = StringUtils.split(token, separatorChars);
    List<String> transformedMiniTokens = new ArrayList<String>();
    for (String miniToken : miniTokens)
    {
        transformedMiniTokens.add(STAR + miniToken + STAR);
    }
    // NOTE(review): a token made only of separator characters yields "()" here (unchanged
    // behavior) - presumably the query parser rejects it; confirm this is the desired outcome.
    return '(' + StringUtils.join(transformedMiniTokens, SPACE + AND + SPACE) + ')';
}
private static boolean isQuoted(String result) private static boolean isQuoted(String result)
{ {
return result.startsWith("\"") && result.endsWith("\""); return result.startsWith("\"") && result.endsWith("\"");
......
...@@ -17,8 +17,6 @@ ...@@ -17,8 +17,6 @@
package ch.systemsx.cisd.openbis.generic.server.dataaccess.db.search; package ch.systemsx.cisd.openbis.generic.server.dataaccess.db.search;
import java.io.Reader; import java.io.Reader;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set; import java.util.Set;
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Analyzer;
...@@ -47,26 +45,13 @@ public class SearchAnalyzer extends Analyzer ...@@ -47,26 +45,13 @@ public class SearchAnalyzer extends Analyzer
/** /**
* A tokenizer that divides text at chars different than letters, digits and special chars * A tokenizer that divides text at chars different than letters, digits and special chars
* allowed in codes ('.', ':', '-', '_') or words (like apostrophe). * defined in {@link CharacterHelper}.
* <p> * <p>
* Additionally it normalizes token text to lower case (with a performance gain compared to * Additionally it normalizes token text to lower case (with a performance gain compared to
* using LowerCaseFilter after tokenization). * using LowerCaseFilter after tokenization).
*/ */
private static class WordAndCodeTokenizer extends CharTokenizer private static class WordAndCodeTokenizer extends CharTokenizer
{ {
/** special characters allowed in codes */
private final static Character[] SPECIAL_CODE_CHARS =
{ '.', ':', '-', '_' };
/** special characters allowed in words (separated from code chars for clarity) */
private final static Character[] SPECIAL_WORD_CHARS =
{ '\'' };
private final static Set<Character> specialCharacters = new HashSet<Character>();
{
specialCharacters.addAll(Arrays.asList(SPECIAL_CODE_CHARS));
specialCharacters.addAll(Arrays.asList(SPECIAL_WORD_CHARS));
}
public WordAndCodeTokenizer(Reader input) public WordAndCodeTokenizer(Reader input)
{ {
...@@ -76,7 +61,7 @@ public class SearchAnalyzer extends Analyzer ...@@ -76,7 +61,7 @@ public class SearchAnalyzer extends Analyzer
@Override @Override
protected boolean isTokenChar(char c) protected boolean isTokenChar(char c)
{ {
return Character.isLetterOrDigit(c) || specialCharacters.contains(c); return CharacterHelper.isTokenCharacter(c);
} }
@Override @Override
...@@ -91,15 +76,6 @@ public class SearchAnalyzer extends Analyzer ...@@ -91,15 +76,6 @@ public class SearchAnalyzer extends Analyzer
*/ */
private static final class TrimSpecialCharsFilter extends TokenFilter private static final class TrimSpecialCharsFilter extends TokenFilter
{ {
// (don't trim '-' or '_' because they may have special meaning in identifiers)
/** those of special chars that should be trimmed */
private final static Character[] TRIMMED_SPECIAL_CHARS =
{ '.', ':', '\'' };
private final static Set<Character> trimmedCharacters = new HashSet<Character>();
{
trimmedCharacters.addAll(Arrays.asList(TRIMMED_SPECIAL_CHARS));
}
public TrimSpecialCharsFilter(TokenStream input) public TrimSpecialCharsFilter(TokenStream input)
{ {
...@@ -119,6 +95,7 @@ public class SearchAnalyzer extends Analyzer ...@@ -119,6 +95,7 @@ public class SearchAnalyzer extends Analyzer
final int bufferLength = nextToken.termLength(); final int bufferLength = nextToken.termLength();
int startCounter = 0; // counts chars to trim from the beginning int startCounter = 0; // counts chars to trim from the beginning
Set<Character> trimmedCharacters = CharacterHelper.getTrimmedSpecialCharacters();
for (int i = 0; i < bufferLength; i++) for (int i = 0; i < bufferLength; i++)
{ {
if (trimmedCharacters.contains(buffer[i])) if (trimmedCharacters.contains(buffer[i]))
......
...@@ -33,8 +33,9 @@ public class LuceneQueryBuilderTest extends AssertJUnit ...@@ -33,8 +33,9 @@ public class LuceneQueryBuilderTest extends AssertJUnit
{ {
return new Object[][] return new Object[][]
{ {
{ "abc", "*abc*" }, { "abc", "(*abc*)" },
{ "code:CP registrator:Joe", "*code\\:CP registrator\\:Joe*" } }; { "code:CP registrator:Joe", "(*code\\:CP*) (*registrator\\:Joe*)" },
{ "ab#c OR d", "(*ab* AND *c*) OR (*d*)" } };
} }
@DataProvider(name = "wildcardModeQueries") @DataProvider(name = "wildcardModeQueries")
...@@ -43,7 +44,8 @@ public class LuceneQueryBuilderTest extends AssertJUnit ...@@ -43,7 +44,8 @@ public class LuceneQueryBuilderTest extends AssertJUnit
return new Object[][] return new Object[][]
{ {
{ "abc", "abc" }, { "abc", "abc" },
{ "code:CP registrator:Joe", "code\\:CP registrator\\:Joe" } }; { "code:CP registrator:Joe", "code\\:CP registrator\\:Joe" },
{ "*ab#c OR d", "*ab#c OR d" } };
} }
@Test(dataProvider = "basicModeQueries") @Test(dataProvider = "basicModeQueries")
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment