diff --git a/openbis/source/java/ch/systemsx/cisd/openbis/generic/server/dataaccess/db/HibernateSearchDAO.java b/openbis/source/java/ch/systemsx/cisd/openbis/generic/server/dataaccess/db/HibernateSearchDAO.java index 0553145e4c557d5da4fd54458f1342e98d407b3c..71eac360da57930a2c8f33e9532199ff0d9b1634 100644 --- a/openbis/source/java/ch/systemsx/cisd/openbis/generic/server/dataaccess/db/HibernateSearchDAO.java +++ b/openbis/source/java/ch/systemsx/cisd/openbis/generic/server/dataaccess/db/HibernateSearchDAO.java @@ -26,7 +26,6 @@ import org.apache.commons.lang.StringUtils; import org.apache.log4j.Logger; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexReader.FieldOption; @@ -58,6 +57,7 @@ import ch.rinn.restrictions.Private; import ch.systemsx.cisd.common.logging.LogCategory; import ch.systemsx.cisd.common.logging.LogFactory; import ch.systemsx.cisd.openbis.generic.server.dataaccess.IHibernateSearchDAO; +import ch.systemsx.cisd.openbis.generic.server.dataaccess.db.search.SearchAnalyzer; import ch.systemsx.cisd.openbis.generic.shared.dto.IMatchingEntity; import ch.systemsx.cisd.openbis.generic.shared.dto.SearchHit; @@ -129,7 +129,7 @@ final class HibernateSearchDAO extends HibernateDaoSupport implements IHibernate throws DataAccessException, ParseException { final FullTextSession fullTextSession = Search.getFullTextSession(session); - StandardAnalyzer analyzer = new StandardAnalyzer(); + SearchAnalyzer analyzer = new SearchAnalyzer(); MyIndexReaderProvider<T> indexProvider = new MyIndexReaderProvider<T>(fullTextSession, entityClass); @@ -250,6 +250,10 @@ final class HibernateSearchDAO extends HibernateDaoSupport implements IHibernate // same code. The first value will be taken. 
matchingText = highlighter.getBestFragment(content, fieldName, documentId); + } else + { + // we do not store file content in the index + matchingText = "file content"; } } catch (IOException ex) { diff --git a/openbis/source/java/ch/systemsx/cisd/openbis/generic/server/dataaccess/db/search/SearchAnalyzer.java b/openbis/source/java/ch/systemsx/cisd/openbis/generic/server/dataaccess/db/search/SearchAnalyzer.java new file mode 100644 index 0000000000000000000000000000000000000000..ffda9408199c0919c4fcfd1c06cf8b7a5b7a9e54 --- /dev/null +++ b/openbis/source/java/ch/systemsx/cisd/openbis/generic/server/dataaccess/db/search/SearchAnalyzer.java @@ -0,0 +1,37 @@ +/* + * Copyright 2008 ETH Zuerich, CISD + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package ch.systemsx.cisd.openbis.generic.server.dataaccess.db.search; + +import java.io.Reader; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.standard.StandardAnalyzer; + +/** + * Extends {@link StandardAnalyzer} by applying additional {@link SeparatorSplitterTokenFilter}. 
+ * + * @author Tomasz Pylak + */ +public class SearchAnalyzer extends StandardAnalyzer +{ +    @Override +    public TokenStream tokenStream(String fieldName, Reader reader) +    { +        TokenStream original = super.tokenStream(fieldName, reader); +        return new SeparatorSplitterTokenFilter(original); +    } +} diff --git a/openbis/source/java/ch/systemsx/cisd/openbis/generic/server/dataaccess/db/search/SeparatorSplitterTokenFilter.java b/openbis/source/java/ch/systemsx/cisd/openbis/generic/server/dataaccess/db/search/SeparatorSplitterTokenFilter.java new file mode 100644 index 0000000000000000000000000000000000000000..9f96bd703b7c7e898037d7cab1df1a88f5112332 --- /dev/null +++ b/openbis/source/java/ch/systemsx/cisd/openbis/generic/server/dataaccess/db/search/SeparatorSplitterTokenFilter.java @@ -0,0 +1,127 @@ +/* + * Copyright 2008 ETH Zuerich, CISD + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + *      http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package ch.systemsx.cisd.openbis.generic.server.dataaccess.db.search; + +import java.io.IOException; +import java.util.LinkedList; +import java.util.List; + +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.standard.StandardTokenizer; + +/** + * Used to split tokens further down after standard tokenizer. We need this, because "." which is + * not followed by a space is not treated as a token separator by default. 
+ * + * @author Tomasz Pylak + */ +public class SeparatorSplitterTokenFilter extends TokenFilter +{ +    private static final String ALPHANUM_TOKEN_TYPE = +            StandardTokenizer.TOKEN_TYPES[StandardTokenizer.ALPHANUM]; + +    private static final String HOST_TOKEN_TYPE = +            StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HOST]; + +    private List<Token> tokens = new LinkedList<Token>(); + +    protected SeparatorSplitterTokenFilter(TokenStream input) +    { +        super(input); +    } + +    /** +     * Returns tokens from standard analysis, split additionally at specified separator characters. +     */ +    @Override +    public final Token next(final Token reusableToken) throws IOException +    { +        if (tokens.size() > 0) +        { +            return extractFirstToken(); +        } +        Token token = input.next(reusableToken); +        // avoid splitting special tokens like e-mails +        if (token == null || isSplittableToken(token) == false) +        { +            return token; +        } +        char[] termText = token.termBuffer(); +        int endPos = token.termLength(); // exclusive +        int curPos = 0; +        do +        { +            int nextPos = getSeparatorIndex(termText, curPos, endPos); +            if (nextPos == endPos && tokens.size() == 0) +            { +                return token; // optimisation, no split has occurred +            } +            addToken(token, curPos, nextPos); +            curPos = nextPos + 1; +        } while (curPos < endPos); +        return extractFirstToken(); +    } + +    private static boolean isSplittableToken(Token token) +    { +        String type = token.type(); +        return type.equals(ALPHANUM_TOKEN_TYPE) || type.equals(HOST_TOKEN_TYPE); +    } + +    // returns the position of the first separator character. Starts browsing at startIndex. +    private static int getSeparatorIndex(char[] termText, int startIndex, int endIndex) +    { +        for (int i = startIndex; i < endIndex; i++) +        { +            if (isSeparator(termText[i])) +            { +                return i; +            } +        } +        return endIndex; +    } + +    private static boolean isSeparator(char ch) +    { +        return ch == '.' 
|| ch == ',' || ch == '-' || ch == '_'; + } + + private Token extractFirstToken() + { + assert tokens.size() > 0 : "no more tokens"; + Token t = tokens.get(0); + tokens.remove(0); + return t; + } + + // startPos is inclusive position of the new token start + // endPos is exclusive position of the new token end + private void addToken(Token token, int startPos, int endPos) + { + if (startPos < endPos) + { + int startOffset = token.startOffset() + startPos; + int endOffset = token.startOffset() + endPos; + Token newToken = + new Token(token.termBuffer(), startPos, endPos - startPos, startOffset, + endOffset); + tokens.add(newToken); + } + } +} diff --git a/openbis/source/java/hibernateContext.xml b/openbis/source/java/hibernateContext.xml index 72a82ea0c2ca1e6585aa7fd8dab48f3cc5357563..af9ab41ef36e8c7e30911499c9732dc23266536f 100644 --- a/openbis/source/java/hibernateContext.xml +++ b/openbis/source/java/hibernateContext.xml @@ -91,6 +91,7 @@ <prop key="hibernate.search.default.optimizer.operation_limit.max">1000</prop> <prop key="hibernate.search.default.optimizer.transaction_limit.max">100</prop> <prop key="hibernate.search.worker.batch_size">${hibernate.search.batch-size}</prop> + <prop key="hibernate.search.analyzer">ch.systemsx.cisd.openbis.generic.server.dataaccess.db.search.SearchAnalyzer</prop> <!-- Disabling Hibernate Search. Possible values are [true, false]. <prop key="hibernate.search.autoregister_listeners">false</prop> -->