Commit 2791fd25 authored by tpylak

LMS-704 treat "." as token separator in free-text search

SVN: 9446
parent 9695f2c5
@@ -26,7 +26,6 @@ import org.apache.commons.lang.StringUtils;
 import org.apache.log4j.Logger;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.IndexReader.FieldOption;
@@ -58,6 +57,7 @@ import ch.rinn.restrictions.Private;
 import ch.systemsx.cisd.common.logging.LogCategory;
 import ch.systemsx.cisd.common.logging.LogFactory;
 import ch.systemsx.cisd.openbis.generic.server.dataaccess.IHibernateSearchDAO;
+import ch.systemsx.cisd.openbis.generic.server.dataaccess.db.search.SearchAnalyzer;
 import ch.systemsx.cisd.openbis.generic.shared.dto.IMatchingEntity;
 import ch.systemsx.cisd.openbis.generic.shared.dto.SearchHit;
@@ -129,7 +129,7 @@ final class HibernateSearchDAO extends HibernateDaoSupport implements IHibernateSearchDAO
             throws DataAccessException, ParseException
     {
         final FullTextSession fullTextSession = Search.getFullTextSession(session);
-        StandardAnalyzer analyzer = new StandardAnalyzer();
+        SearchAnalyzer analyzer = new SearchAnalyzer();
         MyIndexReaderProvider<T> indexProvider =
                 new MyIndexReaderProvider<T>(fullTextSession, entityClass);
@@ -250,6 +250,10 @@ final class HibernateSearchDAO extends HibernateDaoSupport implements IHibernateSearchDAO
                         // same code. The first value will be taken.
                         matchingText =
                                 highlighter.getBestFragment(content, fieldName, documentId);
+                    } else
+                    {
+                        // we do not store file content in the index
+                        matchingText = "file content";
                     }
                 } catch (IOException ex)
                 {
/*
* Copyright 2008 ETH Zuerich, CISD
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package ch.systemsx.cisd.openbis.generic.server.dataaccess.db.search;
import java.io.Reader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
/**
 * Extends {@link StandardAnalyzer} by additionally applying a {@link SeparatorSplitterTokenFilter}.
*
* @author Tomasz Pylak
*/
public class SearchAnalyzer extends StandardAnalyzer
{
@Override
public TokenStream tokenStream(String fieldName, Reader reader)
{
TokenStream original = super.tokenStream(fieldName, reader);
return new SeparatorSplitterTokenFilter(original);
}
}
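For illustration, a minimal usage sketch (not part of this commit) of the new analyzer, assuming the Lucene 2.x TokenStream API used above; the class name SearchAnalyzerExample, the field name and the input string are made up.

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import ch.systemsx.cisd.openbis.generic.server.dataaccess.db.search.SearchAnalyzer;

public class SearchAnalyzerExample
{
    public static void main(String[] args) throws IOException
    {
        // StandardAnalyzer alone keeps "batch.plate.code" as a single <HOST> token;
        // SearchAnalyzer should additionally split it at the dots.
        Analyzer analyzer = new SearchAnalyzer();
        TokenStream stream = analyzer.tokenStream("code", new StringReader("batch.plate.code"));
        Token token = new Token();
        while ((token = stream.next(token)) != null)
        {
            // expected output (lower-cased by the standard analysis chain): batch, plate, code
            System.out.println(new String(token.termBuffer(), 0, token.termLength()));
        }
    }
}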
/*
* Copyright 2008 ETH Zuerich, CISD
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package ch.systemsx.cisd.openbis.generic.server.dataaccess.db.search;
import java.io.IOException;
import java.util.LinkedList;
import java.util.List;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardTokenizer;
/**
 * Splits tokens produced by the standard tokenizer further. We need this because a "." that is
 * not followed by a space is not treated as a token separator by default.
*
* @author Tomasz Pylak
*/
public class SeparatorSplitterTokenFilter extends TokenFilter
{
private static final String ALPHANUM_TOKEN_TYPE =
StandardTokenizer.TOKEN_TYPES[StandardTokenizer.ALPHANUM];
private static final String HOST_TOKEN_TYPE =
StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HOST];
private List<Token> tokens = new LinkedList<Token>();
protected SeparatorSplitterTokenFilter(TokenStream input)
{
super(input);
}
/**
* Returns tokens from standard analysis, split additionally at specified separator characters.
*/
@Override
public final Token next(final Token reusableToken) throws IOException
{
if (tokens.size() > 0)
{
return extractFirstToken();
}
Token token = input.next(reusableToken);
// avoid splitting special tokens like e-mails
if (token == null || isSplittableToken(token) == false)
{
return token;
}
char[] termText = token.termBuffer();
int endPos = token.termLength(); // exclusive
int curPos = 0;
do
{
int nextPos = getSeparatorIndex(termText, curPos, endPos);
if (nextPos == endPos && tokens.size() == 0)
{
return token; // optimization, no split has occurred
}
addToken(token, curPos, nextPos);
curPos = nextPos + 1;
} while (curPos < endPos);
return extractFirstToken();
}
private static boolean isSplittableToken(Token token)
{
String type = token.type();
return type.equals(ALPHANUM_TOKEN_TYPE) || type.equals(HOST_TOKEN_TYPE);
}
// returns the position of the first separator character, scanning from startIndex; returns endIndex if none is found
private static int getSeparatorIndex(char[] termText, int startIndex, int endIndex)
{
for (int i = startIndex; i < endIndex; i++)
{
if (isSeparator(termText[i]))
{
return i;
}
}
return endIndex;
}
private static boolean isSeparator(char ch)
{
return ch == '.' || ch == ',' || ch == '-' || ch == '_';
}
private Token extractFirstToken()
{
assert tokens.size() > 0 : "no more tokens";
Token t = tokens.get(0);
tokens.remove(0);
return t;
}
// startPos is inclusive position of the new token start
// endPos is exclusive position of the new token end
private void addToken(Token token, int startPos, int endPos)
{
if (startPos < endPos)
{
int startOffset = token.startOffset() + startPos;
int endOffset = token.startOffset() + endPos;
Token newToken =
new Token(token.termBuffer(), startPos, endPos - startPos, startOffset,
endOffset);
tokens.add(newToken);
}
}
}
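A second sketch (also not part of this commit) of the filter's selective splitting, assuming the same Lucene 2.x API; it is placed in the filter's package because the constructor is protected, and the input text is made up. Tokens typed <EMAIL> by the standard tokenizer should pass through whole, while <ALPHANUM> and <HOST> tokens are split at '.', ',', '-' and '_'.

package ch.systemsx.cisd.openbis.generic.server.dataaccess.db.search;

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;

public class SeparatorSplitterTokenFilterExample
{
    public static void main(String[] args) throws IOException
    {
        // wrap the standard analysis chain exactly as SearchAnalyzer does
        TokenStream standard =
                new StandardAnalyzer().tokenStream("any", new StringReader(
                        "admin@example.com plate.code"));
        TokenStream filtered = new SeparatorSplitterTokenFilter(standard);
        Token token = new Token();
        while ((token = filtered.next(token)) != null)
        {
            // expected: admin@example.com, plate, code
            // (the e-mail token stays whole, the <HOST> token "plate.code" is split)
            System.out.println(new String(token.termBuffer(), 0, token.termLength()));
        }
    }
}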
@@ -91,6 +91,7 @@
             <prop key="hibernate.search.default.optimizer.operation_limit.max">1000</prop>
             <prop key="hibernate.search.default.optimizer.transaction_limit.max">100</prop>
             <prop key="hibernate.search.worker.batch_size">${hibernate.search.batch-size}</prop>
+            <prop key="hibernate.search.analyzer">ch.systemsx.cisd.openbis.generic.server.dataaccess.db.search.SearchAnalyzer</prop>
             <!-- Disabling Hibernate Search. Possible values are [true, false].
             <prop key="hibernate.search.autoregister_listeners">false</prop>
             -->
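The hibernate.search.analyzer property makes this analyzer the default for all indexed entities. As a side note (not part of this commit), assuming Hibernate Search 3.x annotations, the same analyzer could instead be attached to a single entity with @Analyzer; the entity name below is made up.

import org.hibernate.search.annotations.Analyzer;
import org.hibernate.search.annotations.Indexed;
import ch.systemsx.cisd.openbis.generic.server.dataaccess.db.search.SearchAnalyzer;

@Indexed
@Analyzer(impl = SearchAnalyzer.class)
public class SomeIndexedEntity
{
    // indexed fields as usual
}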