From 551b4b1cb0e0a682f6b0ce5a7919ced482810c2d Mon Sep 17 00:00:00 2001 From: tpylak <tpylak> Date: Thu, 30 Jul 2009 08:51:03 +0000 Subject: [PATCH] LMS-1048 Fix the way the '*' works in search SVN: 11941 --- .../dataaccess/db/HibernateSearchDAO.java | 11 ++-- .../db/search/LuceneQueryBuilder.java | 33 ++++++++++- .../search/SeparatorSplitterTokenFilter.java | 27 ++++++++- .../dataaccess/db/HibernateSearchDAOTest.java | 18 ------ .../db/search/LuceneQueryBuilderTest.java | 57 +++++++++++++++++++ 5 files changed, 116 insertions(+), 30 deletions(-) create mode 100644 openbis/sourceTest/java/ch/systemsx/cisd/openbis/generic/server/dataaccess/db/search/LuceneQueryBuilderTest.java diff --git a/openbis/source/java/ch/systemsx/cisd/openbis/generic/server/dataaccess/db/HibernateSearchDAO.java b/openbis/source/java/ch/systemsx/cisd/openbis/generic/server/dataaccess/db/HibernateSearchDAO.java index 03861d701bb..644919df258 100644 --- a/openbis/source/java/ch/systemsx/cisd/openbis/generic/server/dataaccess/db/HibernateSearchDAO.java +++ b/openbis/source/java/ch/systemsx/cisd/openbis/generic/server/dataaccess/db/HibernateSearchDAO.java @@ -29,9 +29,7 @@ import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.document.Document; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexReader.FieldOption; -import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.Query; -import org.apache.lucene.search.BooleanClause.Occur; import org.apache.lucene.search.highlight.Formatter; import org.apache.lucene.search.highlight.Highlighter; import org.apache.lucene.search.highlight.QueryScorer; @@ -126,7 +124,7 @@ final class HibernateSearchDAO extends HibernateDaoSupport implements IHibernate MyIndexReaderProvider<T> indexProvider = new MyIndexReaderProvider<T>(fullTextSession, entityClass); - String searchQuery = LuceneQueryBuilder.disableFieldQuery(userQuery); + String searchQuery = LuceneQueryBuilder.adaptQuery(userQuery); try { @@ -225,8 +223,8 @@ final class HibernateSearchDAO extends HibernateDaoSupport implements IHibernate highlighter.getBestFragment(content, fieldName, documentId); } else { - // we do not store file content in the index - matchingText = "file content"; + // in some cases (e.g. attachments) we do not store content in the index + matchingText = "[content]"; } } catch (IOException ex) { @@ -347,8 +345,7 @@ final class HibernateSearchDAO extends HibernateDaoSupport implements IHibernate private List<ExternalDataPE> searchForDataSets(Session session, DataSetSearchCriteria datasetSearchCriteria) { - BooleanQuery query = new BooleanQuery(); - query.add(LuceneQueryBuilder.createQuery(datasetSearchCriteria), Occur.MUST); + Query query = LuceneQueryBuilder.createQuery(datasetSearchCriteria); final FullTextSession fullTextSession = Search.getFullTextSession(session); final FullTextQuery hibernateQuery = fullTextSession.createFullTextQuery(query, ExternalDataPE.class); diff --git a/openbis/source/java/ch/systemsx/cisd/openbis/generic/server/dataaccess/db/search/LuceneQueryBuilder.java b/openbis/source/java/ch/systemsx/cisd/openbis/generic/server/dataaccess/db/search/LuceneQueryBuilder.java index b3de472b82f..d4902ee5c2e 100644 --- a/openbis/source/java/ch/systemsx/cisd/openbis/generic/server/dataaccess/db/search/LuceneQueryBuilder.java +++ b/openbis/source/java/ch/systemsx/cisd/openbis/generic/server/dataaccess/db/search/LuceneQueryBuilder.java @@ -36,6 +36,7 @@ import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.Query; import org.apache.lucene.search.BooleanClause.Occur; +import ch.rinn.restrictions.Private; import ch.systemsx.cisd.common.exceptions.InternalErr; import ch.systemsx.cisd.common.exceptions.UserFailureException; import ch.systemsx.cisd.common.logging.LogCategory; @@ -63,12 +64,12 @@ public class LuceneQueryBuilder List<DataSetSearchCriterion> criteria = dataSetCriteria.getCriteria(); Occur occureCondition = createOccureCondition(dataSetCriteria.getConnection()); - SearchAnalyzer analyzer = createSearchAnalyzer(); + Analyzer analyzer = createSearchAnalyzer(); BooleanQuery resultQuery = new BooleanQuery(); for (DataSetSearchCriterion criterion : criteria) { List<String> fieldNames = getIndexFieldNames(criterion.getField()); - String searchPattern = LuceneQueryBuilder.disableFieldQuery(criterion.getValue()); + String searchPattern = LuceneQueryBuilder.adaptQuery(criterion.getValue()); Query luceneQuery = parseQuery(fieldNames, searchPattern, analyzer); resultQuery.add(luceneQuery, occureCondition); @@ -225,6 +226,32 @@ public class LuceneQueryBuilder return SearchFieldConstants.PREFIX_PROPERTIES + propertyCode; } + public static String adaptQuery(String userQuery) + { + String result = disableFieldQuery(userQuery); + result = replaceWordSeparators(result, SeparatorSplitterTokenFilter.WORD_SEPARATORS); + return result; + } + + @Private + static String replaceWordSeparators(String query, char[] wordSeparators) + { + String charsRegexp = "["; + for (int i = 0; i < wordSeparators.length; i++) + { + charsRegexp += "\\" + wordSeparators[i]; + } + charsRegexp += "]"; + String queryWithoutSeparators = query.replaceAll(charsRegexp, " AND "); + if (queryWithoutSeparators.equals(query)) + { + return query; + } else + { + return "(" + queryWithoutSeparators + ")"; + } + } + // disables field query by escaping all field separator characters. public static String disableFieldQuery(String userQuery) { @@ -248,7 +275,7 @@ public class LuceneQueryBuilder * All the search query parsers should use this method to get the analyzer, because this is the * one which is used to build the index. */ - public static SearchAnalyzer createSearchAnalyzer() + public static Analyzer createSearchAnalyzer() { return new SearchAnalyzer(); } diff --git a/openbis/source/java/ch/systemsx/cisd/openbis/generic/server/dataaccess/db/search/SeparatorSplitterTokenFilter.java b/openbis/source/java/ch/systemsx/cisd/openbis/generic/server/dataaccess/db/search/SeparatorSplitterTokenFilter.java index 9f96bd703b7..8177c5ecfbf 100644 --- a/openbis/source/java/ch/systemsx/cisd/openbis/generic/server/dataaccess/db/search/SeparatorSplitterTokenFilter.java +++ b/openbis/source/java/ch/systemsx/cisd/openbis/generic/server/dataaccess/db/search/SeparatorSplitterTokenFilter.java @@ -33,12 +33,18 @@ import org.apache.lucene.analysis.standard.StandardTokenizer; */ public class SeparatorSplitterTokenFilter extends TokenFilter { + static final char[] WORD_SEPARATORS = new char[] + { '.', ',', '-', '_' }; + private static final String ALPHANUM_TOKEN_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.ALPHANUM]; private static final String HOST_TOKEN_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HOST]; + private static final String NUM_TOKEN_TYPE = + StandardTokenizer.TOKEN_TYPES[StandardTokenizer.NUM]; + private List<Token> tokens = new LinkedList<Token>(); protected SeparatorSplitterTokenFilter(TokenStream input) @@ -81,7 +87,17 @@ public class SeparatorSplitterTokenFilter extends TokenFilter private static boolean isSplittableToken(Token token) { String type = token.type(); - return type.equals(ALPHANUM_TOKEN_TYPE) || type.equals(HOST_TOKEN_TYPE); + if (type.equals(ALPHANUM_TOKEN_TYPE) || type.equals(HOST_TOKEN_TYPE)) + { + return true; + } + if (type.equals(NUM_TOKEN_TYPE)) + { + // sometimes the original tokenizer lies to us and reports terms like 'version_3' to be + // numbers. This is a heuristic to correct those lies. + return Character.isLetter(token.term().charAt(0)); + } + return false; } // returns the position of the first separator character. Starts browsing at curPos. @@ -99,7 +115,14 @@ public class SeparatorSplitterTokenFilter extends TokenFilter private static boolean isSeparator(char ch) { - return ch == '.' || ch == ',' || ch == '-' || ch == '_'; + for (int i = 0; i < WORD_SEPARATORS.length; i++) + { + if (WORD_SEPARATORS[i] == ch) + { + return true; + } + } + return false; } private Token extractFirstToken() diff --git a/openbis/sourceTest/java/ch/systemsx/cisd/openbis/generic/server/dataaccess/db/HibernateSearchDAOTest.java b/openbis/sourceTest/java/ch/systemsx/cisd/openbis/generic/server/dataaccess/db/HibernateSearchDAOTest.java index b94dbde4bf3..04368471c17 100644 --- a/openbis/sourceTest/java/ch/systemsx/cisd/openbis/generic/server/dataaccess/db/HibernateSearchDAOTest.java +++ b/openbis/sourceTest/java/ch/systemsx/cisd/openbis/generic/server/dataaccess/db/HibernateSearchDAOTest.java @@ -44,7 +44,6 @@ import ch.systemsx.cisd.common.filesystem.FileUtilities; import ch.systemsx.cisd.common.test.AssertionUtil; import ch.systemsx.cisd.openbis.generic.server.dataaccess.IHibernateSearchDAO; import ch.systemsx.cisd.openbis.generic.server.dataaccess.db.search.FullTextIndexerRunnable; -import ch.systemsx.cisd.openbis.generic.server.dataaccess.db.search.LuceneQueryBuilder; import ch.systemsx.cisd.openbis.generic.shared.basic.dto.DataSetSearchCriteria; import ch.systemsx.cisd.openbis.generic.shared.basic.dto.DataSetSearchCriterion; import ch.systemsx.cisd.openbis.generic.shared.basic.dto.DataSetSearchField; @@ -183,23 +182,6 @@ public final class HibernateSearchDAOTest extends AbstractDAOTest } } - @DataProvider(name = "queryEscaping") - protected Object[][] getQueriesToTest() - { - return new Object[][] - { - { "abc", "abc" }, - { "code:CP registrator:Joe", "code\\:CP registrator\\:Joe" }, - { "::", "\\:\\:" } }; - } - - @Test(dataProvider = "queryEscaping") - public final void testDisableAdvancedSearch(String unescapedQuery, String escapedQuery) - { - String query = LuceneQueryBuilder.disableFieldQuery(unescapedQuery); - assertEquals(escapedQuery, query); - } - private static void ensureContains(Set<MaterialPropertyPE> properties, String propertyValue) { boolean ok = false; diff --git a/openbis/sourceTest/java/ch/systemsx/cisd/openbis/generic/server/dataaccess/db/search/LuceneQueryBuilderTest.java b/openbis/sourceTest/java/ch/systemsx/cisd/openbis/generic/server/dataaccess/db/search/LuceneQueryBuilderTest.java new file mode 100644 index 00000000000..0f2507fa91e --- /dev/null +++ b/openbis/sourceTest/java/ch/systemsx/cisd/openbis/generic/server/dataaccess/db/search/LuceneQueryBuilderTest.java @@ -0,0 +1,57 @@ +/* + * Copyright 2009 ETH Zuerich, CISD + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package ch.systemsx.cisd.openbis.generic.server.dataaccess.db.search; + +import org.testng.AssertJUnit; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import ch.rinn.restrictions.Friend; + +/** + * @author Tomasz Pylak + */ +@Friend(toClasses = LuceneQueryBuilder.class) +public class LuceneQueryBuilderTest extends AssertJUnit +{ + @Test + public void testReplaceWordSeparators() + { + char[] wordSeparators = new char[] + { '.', ',', '-', '_' }; + String result = LuceneQueryBuilder.replaceWordSeparators("a.b-c_d,e", wordSeparators); + assertEquals("(a AND b AND c AND d AND e)", result); + } + + @DataProvider(name = "queryEscaping") + protected Object[][] getQueriesToTest() + { + return new Object[][] + { + { "abc", "abc" }, + { "code:CP registrator:Joe", "code\\:CP registrator\\:Joe" }, + { "::", "\\:\\:" } }; + } + + @Test(dataProvider = "queryEscaping") + public final void testDisableAdvancedSearch(String unescapedQuery, String escapedQuery) + { + String query = LuceneQueryBuilder.adaptQuery(unescapedQuery); + assertEquals(escapedQuery, query); + } + +} -- GitLab