From 44bbdc34f7191a1cf3afe37bfc5fac9f22ada81f Mon Sep 17 00:00:00 2001
From: izabel <izabel>
Date: Wed, 6 Oct 2010 07:45:34 +0000
Subject: [PATCH] [LMS-1827] allow to search for A#B in non-wildcard mode

SVN: 18161
---
Notes (review):
    * This copy of the patch had lost its line breaks. The line structure
      below was rebuilt from the hunk headers and the diffstat, and the
      per-hunk line counts match them. Leading whitespace is inferred, not
      original, so apply with `git am --ignore-whitespace` (or
      `git apply --ignore-whitespace`).
    * LuceneQueryBuilder: the new private method is spelled
      `addWildcartdsToToken` (sic); definition and its single call agree.
    * LuceneQueryBuilder: `tokenSeparators.removeAll(new ArrayList<String>())`
      passes an empty list and therefore removes nothing.

 .../dataaccess/db/search/CharacterHelper.java | 68 +++++++++++++++++++
 .../db/search/LuceneQueryBuilder.java         | 51 ++++++++++++--
 .../dataaccess/db/search/SearchAnalyzer.java  | 29 +-------
 .../db/search/LuceneQueryBuilderTest.java     |  8 ++-
 4 files changed, 123 insertions(+), 33 deletions(-)
 create mode 100644 openbis/source/java/ch/systemsx/cisd/openbis/generic/server/dataaccess/db/search/CharacterHelper.java

diff --git a/openbis/source/java/ch/systemsx/cisd/openbis/generic/server/dataaccess/db/search/CharacterHelper.java b/openbis/source/java/ch/systemsx/cisd/openbis/generic/server/dataaccess/db/search/CharacterHelper.java
new file mode 100644
index 00000000000..0c8d82d702a
--- /dev/null
+++ b/openbis/source/java/ch/systemsx/cisd/openbis/generic/server/dataaccess/db/search/CharacterHelper.java
@@ -0,0 +1,68 @@
+/*
+ * Copyright 2010 ETH Zuerich, CISD
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package ch.systemsx.cisd.openbis.generic.server.dataaccess.db.search;
+
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.HashSet;
+import java.util.Set;
+
+/**
+ * Gathers characters that need special treatment and methods useful during indexing.
+ *
+ * @author Izabela Adamczyk
+ */
+public class CharacterHelper
+{
+    private static final char ESCAPE_CHARACTER = '\\';
+
+    public final static Set<Character> SPECIAL_CHARACTERS = new HashSet<Character>(Arrays.asList(
+            // Special code characters
+            '.', ':', '-', '_',
+
+            // Special word characters
+            '\''));
+
+    // (don't trim '-' or '_' because they may have special meaning in identifiers)
+    /** those of special chars that should be trimmed */
+    private final static Set<Character> TRIMMED_CHARACTERS = new HashSet<Character>(Arrays.asList(
+            '.', ':', '\''));
+
+    public static boolean isTokenCharacter(char c)
+    {
+        return Character.isLetterOrDigit(c) || SPECIAL_CHARACTERS.contains(c);
+    }
+
+    public static Collection<Character> getTokenSeparators()
+    {
+        Set<Character> separators = new HashSet<Character>();
+        for (char ch = 32; ch < 256; ch++)
+        {
+            if (isTokenCharacter(ch) == false && ch != ESCAPE_CHARACTER)
+            {
+                separators.add(ch);
+            }
+        }
+        return separators;
+    }
+
+    public static Set<Character> getTrimmedSpecialCharacters()
+    {
+        return TRIMMED_CHARACTERS;
+    }
+
+}
\ No newline at end of file
diff --git a/openbis/source/java/ch/systemsx/cisd/openbis/generic/server/dataaccess/db/search/LuceneQueryBuilder.java b/openbis/source/java/ch/systemsx/cisd/openbis/generic/server/dataaccess/db/search/LuceneQueryBuilder.java
index 6eb42addbeb..8dc1ded3a50 100644
--- a/openbis/source/java/ch/systemsx/cisd/openbis/generic/server/dataaccess/db/search/LuceneQueryBuilder.java
+++ b/openbis/source/java/ch/systemsx/cisd/openbis/generic/server/dataaccess/db/search/LuceneQueryBuilder.java
@@ -16,14 +16,17 @@
 
 package ch.systemsx.cisd.openbis.generic.server.dataaccess.db.search;
 
+import java.util.ArrayList;
+import java.util.Collection;
 import java.util.List;
 
+import org.apache.commons.lang.StringUtils;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.queryParser.ParseException;
 import org.apache.lucene.queryParser.QueryParser;
+import org.apache.lucene.search.BooleanClause.Occur;
 import org.apache.lucene.search.BooleanQuery;
 import org.apache.lucene.search.Query;
-import org.apache.lucene.search.BooleanClause.Occur;
 
 import ch.systemsx.cisd.common.exceptions.UserFailureException;
 import ch.systemsx.cisd.openbis.generic.server.dataaccess.db.search.detailed.DetailedQueryBuilder;
@@ -36,12 +39,22 @@ import ch.systemsx.cisd.openbis.generic.shared.translator.DtoConverters;
  */
 public class LuceneQueryBuilder
 {
+    private static final String NOT = "NOT";
+
+    private static final String OR = "OR";
+
+    private static final String AND = "AND";
+
+    private static final char STAR = '*';
+
+    private static final char SPACE = ' ';
+
     /** @throws UserFailureException when some search patterns are incorrect */
     public static Query createDetailedSearchQuery(DetailedSearchCriteria searchCriteria,
             EntityKind entityKind)
     {
-        return DetailedQueryBuilder.createQuery(searchCriteria, DtoConverters
-                .convertEntityKind(entityKind));
+        return DetailedQueryBuilder.createQuery(searchCriteria,
+                DtoConverters.convertEntityKind(entityKind));
     }
 
     private static final char FIELD_SEPARATOR = ':';
@@ -68,11 +81,41 @@ public class LuceneQueryBuilder
         // add '*' wildcard at the beginning and at the end of the query in basic search mode
         if (useWildcardSearchMode == false && isQuoted(result) == false)
         {
-            result = '*' + result + '*';
+            result = addWildcards(result);
         }
         return result;
     }
 
+    private static String addWildcards(String result)
+    {
+        String[] queryTokens = StringUtils.split(result, SPACE);
+        List<String> transformedTokens = new ArrayList<String>();
+        for (String qt : queryTokens)
+        {
+            if (qt.equals(AND) || qt.equals(OR) || qt.equals(NOT))
+            {
+                transformedTokens.add(qt);
+            } else
+            {
+                transformedTokens.add(addWildcartdsToToken(qt));
+            }
+        }
+        return StringUtils.join(transformedTokens, SPACE);
+    }
+
+    private static String addWildcartdsToToken(String token)
+    {
+        Collection<Character> tokenSeparators = CharacterHelper.getTokenSeparators();
+        tokenSeparators.removeAll(new ArrayList<String>());
+        String[] miniTokens = StringUtils.split(token, StringUtils.join(tokenSeparators, ""));
+        List<String> transformedMiniTokens = new ArrayList<String>();
+        for (String qt : miniTokens)
+        {
+            transformedMiniTokens.add(STAR + qt + STAR);
+        }
+        return '(' + StringUtils.join(transformedMiniTokens, SPACE + AND + SPACE) + ')';
+    }
+
     private static boolean isQuoted(String result)
     {
         return result.startsWith("\"") && result.endsWith("\"");
diff --git a/openbis/source/java/ch/systemsx/cisd/openbis/generic/server/dataaccess/db/search/SearchAnalyzer.java b/openbis/source/java/ch/systemsx/cisd/openbis/generic/server/dataaccess/db/search/SearchAnalyzer.java
index 927394d919d..0890266995b 100644
--- a/openbis/source/java/ch/systemsx/cisd/openbis/generic/server/dataaccess/db/search/SearchAnalyzer.java
+++ b/openbis/source/java/ch/systemsx/cisd/openbis/generic/server/dataaccess/db/search/SearchAnalyzer.java
@@ -17,8 +17,6 @@
 package ch.systemsx.cisd.openbis.generic.server.dataaccess.db.search;
 
 import java.io.Reader;
-import java.util.Arrays;
-import java.util.HashSet;
 import java.util.Set;
 
 import org.apache.lucene.analysis.Analyzer;
@@ -47,26 +45,13 @@ public class SearchAnalyzer extends Analyzer
 
     /**
      * A tokenizer that divides text at chars different than letters, digits and special chars
-     * allowed in codes ('.', ':', '-', '_') or words (like apostrophe).
+     * defined in {@link CharacterHelper}.
      * <p>
      * Additionally it normalizes token text to lower case (with a performance gain compared to
      * using LowerCaseFilter after tokenization).
      */
     private static class WordAndCodeTokenizer extends CharTokenizer
     {
-        /** special characters allowed in codes */
-        private final static Character[] SPECIAL_CODE_CHARS =
-            { '.', ':', '-', '_' };
-
-        /** special characters allowed in words (separated from code chars for clarity) */
-        private final static Character[] SPECIAL_WORD_CHARS =
-            { '\'' };
-
-        private final static Set<Character> specialCharacters = new HashSet<Character>();
-        {
-            specialCharacters.addAll(Arrays.asList(SPECIAL_CODE_CHARS));
-            specialCharacters.addAll(Arrays.asList(SPECIAL_WORD_CHARS));
-        }
 
         public WordAndCodeTokenizer(Reader input)
         {
@@ -76,7 +61,7 @@ public class SearchAnalyzer extends Analyzer
         @Override
         protected boolean isTokenChar(char c)
         {
-            return Character.isLetterOrDigit(c) || specialCharacters.contains(c);
+            return CharacterHelper.isTokenCharacter(c);
         }
 
         @Override
@@ -91,15 +76,6 @@ public class SearchAnalyzer extends Analyzer
      */
     private static final class TrimSpecialCharsFilter extends TokenFilter
     {
-        // (don't trim '-' or '_' because they may have special meaning in identifiers)
-        /** those of special chars that should be trimmed */
-        private final static Character[] TRIMMED_SPECIAL_CHARS =
-            { '.', ':', '\'' };
-
-        private final static Set<Character> trimmedCharacters = new HashSet<Character>();
-        {
-            trimmedCharacters.addAll(Arrays.asList(TRIMMED_SPECIAL_CHARS));
-        }
 
         public TrimSpecialCharsFilter(TokenStream input)
         {
@@ -119,6 +95,7 @@ public class SearchAnalyzer extends Analyzer
             final int bufferLength = nextToken.termLength();
 
             int startCounter = 0; // counts chars to trim from the beginning
+            Set<Character> trimmedCharacters = CharacterHelper.getTrimmedSpecialCharacters();
             for (int i = 0; i < bufferLength; i++)
             {
                 if (trimmedCharacters.contains(buffer[i]))
diff --git a/openbis/sourceTest/java/ch/systemsx/cisd/openbis/generic/server/dataaccess/db/search/LuceneQueryBuilderTest.java b/openbis/sourceTest/java/ch/systemsx/cisd/openbis/generic/server/dataaccess/db/search/LuceneQueryBuilderTest.java
index ad218283384..38539257373 100644
--- a/openbis/sourceTest/java/ch/systemsx/cisd/openbis/generic/server/dataaccess/db/search/LuceneQueryBuilderTest.java
+++ b/openbis/sourceTest/java/ch/systemsx/cisd/openbis/generic/server/dataaccess/db/search/LuceneQueryBuilderTest.java
@@ -33,8 +33,9 @@ public class LuceneQueryBuilderTest extends AssertJUnit
     {
         return new Object[][]
             {
-                    { "abc", "*abc*" },
-                    { "code:CP registrator:Joe", "*code\\:CP registrator\\:Joe*" } };
+                    { "abc", "(*abc*)" },
+                    { "code:CP registrator:Joe", "(*code\\:CP*) (*registrator\\:Joe*)" },
+                    { "ab#c OR d", "(*ab* AND *c*) OR (*d*)" } };
     }
 
     @DataProvider(name = "wildcardModeQueries")
@@ -43,7 +44,8 @@ public class LuceneQueryBuilderTest extends AssertJUnit
         return new Object[][]
             {
                     { "abc", "abc" },
-                    { "code:CP registrator:Joe", "code\\:CP registrator\\:Joe" } };
+                    { "code:CP registrator:Joe", "code\\:CP registrator\\:Joe" },
+                    { "*ab#c OR d", "*ab#c OR d" } };
     }
 
     @Test(dataProvider = "basicModeQueries")
-- 
GitLab