Commit 2791fd25 authored by tpylak

LMS-704 treat "." as token separator in free-text search

SVN: 9446
parent 9695f2c5
@@ -26,7 +26,6 @@ import org.apache.commons.lang.StringUtils;
 import org.apache.log4j.Logger;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.IndexReader.FieldOption;
@@ -58,6 +57,7 @@ import ch.rinn.restrictions.Private;
 import ch.systemsx.cisd.common.logging.LogCategory;
 import ch.systemsx.cisd.common.logging.LogFactory;
 import ch.systemsx.cisd.openbis.generic.server.dataaccess.IHibernateSearchDAO;
+import ch.systemsx.cisd.openbis.generic.server.dataaccess.db.search.SearchAnalyzer;
 import ch.systemsx.cisd.openbis.generic.shared.dto.IMatchingEntity;
 import ch.systemsx.cisd.openbis.generic.shared.dto.SearchHit;
@@ -129,7 +129,7 @@ final class HibernateSearchDAO extends HibernateDaoSupport implements IHibernateSearchDAO
             throws DataAccessException, ParseException
     {
         final FullTextSession fullTextSession = Search.getFullTextSession(session);
-        StandardAnalyzer analyzer = new StandardAnalyzer();
+        SearchAnalyzer analyzer = new SearchAnalyzer();
         MyIndexReaderProvider<T> indexProvider =
                 new MyIndexReaderProvider<T>(fullTextSession, entityClass);
@@ -250,6 +250,10 @@ final class HibernateSearchDAO extends HibernateDaoSupport implements IHibernateSearchDAO
                         // same code. The first value will be taken.
                         matchingText =
                                 highlighter.getBestFragment(content, fieldName, documentId);
+                    } else
+                    {
+                        // we do not store file content in the index
+                        matchingText = "file content";
                     }
                 } catch (IOException ex)
                 {
/*
* Copyright 2008 ETH Zuerich, CISD
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package ch.systemsx.cisd.openbis.generic.server.dataaccess.db.search;
import java.io.Reader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
/**
 * Extends {@link StandardAnalyzer} by additionally applying a {@link SeparatorSplitterTokenFilter}.
*
* @author Tomasz Pylak
*/
public class SearchAnalyzer extends StandardAnalyzer
{
@Override
public TokenStream tokenStream(String fieldName, Reader reader)
{
TokenStream original = super.tokenStream(fieldName, reader);
return new SeparatorSplitterTokenFilter(original);
}
}
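For illustration, a minimal usage sketch (not part of this commit) of the new analyzer, assuming the Lucene 2.x TokenStream API used above; the class name SearchAnalyzerExample, the field name and the input string are made up.

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import ch.systemsx.cisd.openbis.generic.server.dataaccess.db.search.SearchAnalyzer;

public class SearchAnalyzerExample
{
    public static void main(String[] args) throws IOException
    {
        // StandardAnalyzer alone keeps "batch.plate.code" as a single <HOST> token;
        // SearchAnalyzer should additionally split it at the dots.
        Analyzer analyzer = new SearchAnalyzer();
        TokenStream stream = analyzer.tokenStream("code", new StringReader("batch.plate.code"));
        Token token = new Token();
        while ((token = stream.next(token)) != null)
        {
            // expected output (lower-cased by the standard analysis chain): batch, plate, code
            System.out.println(new String(token.termBuffer(), 0, token.termLength()));
        }
    }
}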
/*
* Copyright 2008 ETH Zuerich, CISD
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package ch.systemsx.cisd.openbis.generic.server.dataaccess.db.search;
import java.io.IOException;
import java.util.LinkedList;
import java.util.List;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardTokenizer;
/**
 * Splits tokens produced by the standard tokenizer further. We need this because a "." that is
 * not followed by a space is not treated as a token separator by default.
*
* @author Tomasz Pylak
*/
public class SeparatorSplitterTokenFilter extends TokenFilter
{
private static final String ALPHANUM_TOKEN_TYPE =
StandardTokenizer.TOKEN_TYPES[StandardTokenizer.ALPHANUM];
private static final String HOST_TOKEN_TYPE =
StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HOST];
private List<Token> tokens = new LinkedList<Token>();
protected SeparatorSplitterTokenFilter(TokenStream input)
{
super(input);
}
/**
* Returns tokens from standard analysis, split additionally at specified separator characters.
*/
@Override
public final Token next(final Token reusableToken) throws IOException
{
if (tokens.size() > 0)
{
return extractFirstToken();
}
Token token = input.next(reusableToken);
// avoid splitting special tokens like e-mails
if (token == null || isSplittableToken(token) == false)
{
return token;
}
char[] termText = token.termBuffer();
int endPos = token.termLength(); // exclusive
int curPos = 0;
do
{
int nextPos = getSeparatorIndex(termText, curPos, endPos);
if (nextPos == endPos && tokens.size() == 0)
{
return token; // optimization, no split has occurred
}
addToken(token, curPos, nextPos);
curPos = nextPos + 1;
} while (curPos < endPos);
return extractFirstToken();
}
private static boolean isSplittableToken(Token token)
{
String type = token.type();
return type.equals(ALPHANUM_TOKEN_TYPE) || type.equals(HOST_TOKEN_TYPE);
}
// returns the position of the first separator character, scanning from startIndex; returns endIndex if none is found
private static int getSeparatorIndex(char[] termText, int startIndex, int endIndex)
{
for (int i = startIndex; i < endIndex; i++)
{
if (isSeparator(termText[i]))
{
return i;
}
}
return endIndex;
}
private static boolean isSeparator(char ch)
{
return ch == '.' || ch == ',' || ch == '-' || ch == '_';
}
private Token extractFirstToken()
{
assert tokens.size() > 0 : "no more tokens";
Token t = tokens.get(0);
tokens.remove(0);
return t;
}
// startPos is inclusive position of the new token start
// endPos is exclusive position of the new token end
private void addToken(Token token, int startPos, int endPos)
{
if (startPos < endPos)
{
int startOffset = token.startOffset() + startPos;
int endOffset = token.startOffset() + endPos;
Token newToken =
new Token(token.termBuffer(), startPos, endPos - startPos, startOffset,
endOffset);
tokens.add(newToken);
}
}
}
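A second sketch (also not part of this commit) of the filter's selective splitting, assuming the same Lucene 2.x API; it is placed in the filter's package because the constructor is protected, and the input text is made up. Tokens typed <EMAIL> by the standard tokenizer should pass through whole, while <ALPHANUM> and <HOST> tokens are split at '.', ',', '-' and '_'.

package ch.systemsx.cisd.openbis.generic.server.dataaccess.db.search;

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;

public class SeparatorSplitterTokenFilterExample
{
    public static void main(String[] args) throws IOException
    {
        // wrap the standard analysis chain exactly as SearchAnalyzer does
        TokenStream standard =
                new StandardAnalyzer().tokenStream("any", new StringReader(
                        "admin@example.com plate.code"));
        TokenStream filtered = new SeparatorSplitterTokenFilter(standard);
        Token token = new Token();
        while ((token = filtered.next(token)) != null)
        {
            // expected: admin@example.com, plate, code
            // (the e-mail token stays whole, the <HOST> token "plate.code" is split)
            System.out.println(new String(token.termBuffer(), 0, token.termLength()));
        }
    }
}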
@@ -91,6 +91,7 @@
             <prop key="hibernate.search.default.optimizer.operation_limit.max">1000</prop>
             <prop key="hibernate.search.default.optimizer.transaction_limit.max">100</prop>
             <prop key="hibernate.search.worker.batch_size">${hibernate.search.batch-size}</prop>
+            <prop key="hibernate.search.analyzer">ch.systemsx.cisd.openbis.generic.server.dataaccess.db.search.SearchAnalyzer</prop>
             <!-- Disabling Hibernate Search. Possible values are [true, false].
             <prop key="hibernate.search.autoregister_listeners">false</prop>
             -->
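The hibernate.search.analyzer property makes this analyzer the default for all indexed entities. As a side note (not part of this commit), assuming Hibernate Search 3.x annotations, the same analyzer could instead be attached to a single entity with @Analyzer; the entity name below is made up.

import org.hibernate.search.annotations.Analyzer;
import org.hibernate.search.annotations.Indexed;
import ch.systemsx.cisd.openbis.generic.server.dataaccess.db.search.SearchAnalyzer;

@Indexed
@Analyzer(impl = SearchAnalyzer.class)
public class SomeIndexedEntity
{
    // indexed fields as usual
}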